Source code for csvdiff

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#  __init__.py
#  csvdiff
#

from __future__ import absolute_import, print_function, division

import sys

import click

from . import records, patch, error


# Package metadata: author, contact address and release version string.
__author__ = 'Lars Yencken'
__email__ = 'lars@yencken.org'
__version__ = '0.3.1'


# Bind the name `io` to a module that provides `StringIO` on both major
# Python versions: the legacy `StringIO` module on Python 2, the standard
# `io` module otherwise. Code below only relies on `io.StringIO()`.
if sys.version_info.major == 2:
    import StringIO as io
else:
    import io


# exit codes for the command-line
# EXIT_SAME: passed to sys.exit() by the diff helpers below when
# patch.is_empty() reports that the computed patch is empty.
EXIT_SAME = 0
# EXIT_DIFFERENT: passed to sys.exit() by the same helpers when the patch is
# not empty.
EXIT_DIFFERENT = 1
# EXIT_ERROR: not referenced in the visible part of this module; presumably
# the status used on failures (e.g. by error.abort) -- TODO confirm.
EXIT_ERROR = 2


def diff_files(from_file, to_file, index_columns, sep=',', ignored_columns=None):
    """
    Compute the patch that turns the CSV file at ``from_file`` into the CSV
    file at ``to_file``.

    Both paths are opened for reading and parsed with ``records.load`` using
    ``sep``. ``index_columns`` and ``ignored_columns`` are forwarded to
    ``patch.create`` (the latter as its ``ignore_columns`` argument), whose
    result is returned.
    """
    # Both files stay open until the patch has been fully computed.
    with open(from_file) as from_stream, open(to_file) as to_stream:
        old_rows = records.load(from_stream, sep=sep)
        new_rows = records.load(to_stream, sep=sep)
        result = patch.create(old_rows, new_rows, index_columns,
                              ignore_columns=ignored_columns)
    return result


def diff_records(from_records, to_records, index_columns):
    """
    Compute the patch that turns one sequence of dictionary records into
    another.

    This is a thin wrapper around ``patch.create``; all three arguments are
    passed through unchanged and its result is returned.
    """
    record_patch = patch.create(from_records, to_records, index_columns)
    return record_patch


def patch_file(patch_stream, fromcsv_stream, tocsv_stream, strict=True,
               sep=','):
    """
    Read a patch and a source CSV, apply the patch, and write the resulting
    records to the target stream.

    The patch is read from ``patch_stream`` with ``patch.load``, the source
    rows from ``fromcsv_stream`` with ``records.load`` (using ``sep``), and
    the patched rows are written to ``tocsv_stream`` with ``records.save``.
    ``strict`` is forwarded to ``patch.apply``.
    """
    diff = patch.load(patch_stream)
    source_rows = records.load(fromcsv_stream, sep=sep)
    patched_rows = patch.apply(diff, source_rows, strict=strict)

    # Decide on the output column order.
    if not patched_rows:
        # Nothing to write: keep the header order of the source file.
        fieldnames = source_rows.fieldnames
    else:
        # Index columns (as named by the patch) first, the rest sorted; the
        # column set is taken from the first patched record.
        fieldnames = _nice_fieldnames(patched_rows[0].keys(), diff['_index'])

    records.save(patched_rows, fieldnames, tocsv_stream)


def patch_records(diff, from_records, strict=True):
    """
    Apply ``diff`` to a sequence of records and return the transformed
    records.

    This is a thin wrapper around ``patch.apply``; ``strict`` is forwarded
    as a keyword argument.
    """
    patched = patch.apply(diff, from_records, strict=strict)
    return patched


def _nice_fieldnames(all_columns, index_columns):
    """
    Indexes on the left, other fields in alphabetical order on the right.

    ``all_columns`` and ``index_columns`` may be any iterables of column
    names (lists, tuples, dict key views, ...). A new list is returned and
    neither argument is modified.
    """
    # Copy into a list so that tuples and other non-list sequences can be
    # concatenated with the sorted remainder below.
    leading = list(index_columns)
    non_index_columns = set(all_columns).difference(leading)
    return leading + sorted(non_index_columns)


class CSVType(click.ParamType):
    """
    Click parameter type which turns a comma-separated argument into a list
    of strings.
    """
    name = 'csv'

    def convert(self, value, param, ctx):
        """
        Decode ``value`` if it arrived as bytes, then split it on commas.

        Decoding tries the encoding reported by ``sys.stdin`` first, then
        the filesystem encoding, and finally falls back to UTF-8 with
        replacement characters, so a bytes value is always decoded before
        it is split. ``param`` and ``ctx`` are unused.
        """
        if isinstance(value, bytes):
            candidates = (getattr(sys.stdin, 'encoding', None),
                          sys.getfilesystemencoding())
            for encoding in candidates:
                if encoding is None:
                    # e.g. stdin has no declared encoding; try the next one
                    continue
                try:
                    value = value.decode(encoding)
                    break
                except (UnicodeError, LookupError):
                    # undecodable bytes or unknown codec name: keep trying
                    continue
            else:
                # no candidate worked; never fail, substitute bad bytes
                value = value.decode('utf-8', 'replace')

        return value.split(',')

    def __repr__(self):
        return 'CSV'


@click.command()
@click.argument('index_columns', type=CSVType())
@click.argument('from_csv', type=click.Path(exists=True))
@click.argument('to_csv', type=click.Path(exists=True))
@click.option('--style',
              type=click.Choice(['compact', 'pretty', 'summary']),
              default='compact',
              help=('Instead of the default compact output, pretty-print '
                    'or give a summary instead'))
@click.option('--output', '-o', type=click.Path(),
              help='Output to a file instead of stdout')
@click.option('--quiet', '-q', is_flag=True,
              help="Don't output anything, just use exit codes")
@click.option('--sep', default=',',
              help='Separator to use between fields [default: comma]')
@click.option('--ignore-columns', '-i', type=CSVType(),
              help='a comma separated list of columns to ignore from the comparison')
@click.option('--significance', type=int,
              help='Ignore numeric changes less than this number of significant figures')
def csvdiff_cmd(index_columns, from_csv, to_csv, style=None, output=None,
                sep=',', quiet=False, ignore_columns=None, significance=None):
    """
    Compare two csv files to see what rows differ between them. The files
    are each expected to have a header row, and for each row to be uniquely
    identified by one or more indexing columns.
    """
    # NOTE: the docstring above is the command's --help text (click takes it
    # from the function docstring), so it is kept short and user-facing.

    # An index column identifies a row, so it cannot be excluded from the
    # comparison; report the conflict through error.abort.
    if ignore_columns is not None:
        for column in ignore_columns:
            if column in index_columns:
                error.abort("You can't ignore an index column")

    # Destination for the diff: an explicit file wins over --quiet, which in
    # turn discards output into an in-memory buffer; otherwise use stdout.
    ostream = (open(output, 'w') if output
               else io.StringIO() if quiet
               else sys.stdout)

    try:
        # Both helpers finish by calling sys.exit() with the exit code, so
        # the SystemExit passes through the finally clause below.
        if style == 'summary':
            _diff_and_summarize(from_csv, to_csv, index_columns, ostream,
                                sep=sep, ignored_columns=ignore_columns,
                                significance=significance)
        else:
            compact = (style == 'compact')
            _diff_files_to_stream(from_csv, to_csv, index_columns, ostream,
                                  compact=compact, sep=sep, ignored_columns=ignore_columns,
                                  significance=significance)

    except records.InvalidKeyError as e:
        error.abort(e.args[0])

    finally:
        # Only close streams this command created. sys.stdout belongs to the
        # process (or to a test harness capturing it), so just flush it.
        if ostream is sys.stdout:
            ostream.flush()
        else:
            ostream.close()


def _diff_files_to_stream(from_csv, to_csv, index_columns, ostream,
                          compact=False, sep=',', ignored_columns=None,
                          significance=None):
    """
    Diff two CSV files, write the patch to ``ostream`` and exit the process.

    The patch comes from ``diff_files``; when ``significance`` is given it
    is first passed through ``patch.filter_significance``. The patch is
    written with ``patch.save`` (honouring ``compact``). The function never
    returns normally: it calls ``sys.exit`` with ``EXIT_SAME`` for an empty
    patch and ``EXIT_DIFFERENT`` otherwise.
    """
    diff = diff_files(from_csv, to_csv, index_columns, sep=sep, ignored_columns=ignored_columns)

    if significance is not None:
        diff = patch.filter_significance(diff, significance)

    patch.save(diff, ostream, compact=compact)

    # The exit status is decided on the (possibly filtered) patch.
    if patch.is_empty(diff):
        sys.exit(EXIT_SAME)
    sys.exit(EXIT_DIFFERENT)


def _diff_and_summarize(from_csv, to_csv, index_columns, stream=None,
                        sep=',', ignored_columns=None, significance=None):
    """
    Print a summary of the difference between the two files.

    ``stream`` is where the summary is written; when omitted (``None``) the
    ``sys.stdout`` in effect at call time is used. When ``significance`` is
    given the patch is passed through ``patch.filter_significance`` before
    being summarized. The function never returns normally: it calls
    ``sys.exit`` with ``EXIT_SAME`` for an empty patch and
    ``EXIT_DIFFERENT`` otherwise.
    """
    # Resolve the default here rather than in the signature, so a stdout
    # that was redirected after import is still honoured.
    if stream is None:
        stream = sys.stdout

    # NOTE(review): unlike diff_files, records.load is handed from_csv and
    # to_csv directly rather than opened streams -- presumably it accepts
    # paths as well; confirm against records.load.
    # The "from" records are materialized because their count is needed for
    # the percentages in the summary.
    from_records = list(records.load(from_csv, sep=sep))
    to_records = records.load(to_csv, sep=sep)

    diff = patch.create(from_records, to_records, index_columns, ignored_columns)
    if significance is not None:
        diff = patch.filter_significance(diff, significance)

    _summarize_diff(diff, len(from_records), stream=stream)
    exit_code = (EXIT_SAME
                 if patch.is_empty(diff)
                 else EXIT_DIFFERENT)
    sys.exit(exit_code)


def _summarize_diff(diff, orig_size, stream=None):
    """
    Write a short, human-readable summary of ``diff`` to ``stream``.

    ``diff`` must provide the ``'removed'``, ``'added'`` and ``'changed'``
    entries; only their lengths are used. ``orig_size`` is the number of
    records in the original data and is the denominator of the reported
    percentages. When ``stream`` is omitted (``None``) the ``sys.stdout``
    in effect at call time is used.
    """
    # Resolve the default here rather than in the signature, so a stdout
    # that was redirected after import is still honoured.
    if stream is None:
        stream = sys.stdout

    if orig_size == 0:
        # slightly arbitrary when the original data was empty; this avoids
        # dividing by zero below
        orig_size = 1

    n_removed = len(diff['removed'])
    n_added = len(diff['added'])
    n_changed = len(diff['changed'])

    if n_removed or n_added or n_changed:
        print(u'%d rows removed (%.01f%%)' % (
            n_removed, 100 * n_removed / orig_size
        ), file=stream)
        print(u'%d rows added (%.01f%%)' % (
            n_added, 100 * n_added / orig_size
        ), file=stream)
        print(u'%d rows changed (%.01f%%)' % (
            n_changed, 100 * n_changed / orig_size
        ), file=stream)
    else:
        print(u'files are identical', file=stream)


@click.command()
@click.argument('input_csv', type=click.Path(exists=True))
@click.option('--input', '-i', type=click.Path(exists=True),
              help='Read the JSON patch from the given file.')
@click.option('--output', '-o', type=click.Path(),
              help='Write the transformed CSV to the given file.')
@click.option('--strict/--no-strict', default=True,
              help='Whether or not to tolerate a changed source document '
                   '(default: strict)')
def csvpatch_cmd(input_csv, input=None, output=None, strict=True):
    """
    Apply the changes from a csvdiff patch to an existing CSV file.
    """
    # NOTE: the docstring above is the command's --help text (click takes it
    # from the function docstring), so it is kept short and user-facing.

    # Streams opened by this command; sys.stdin / sys.stdout are never added
    # because they belong to the process and must not be closed here.
    opened = []

    def _open(path, mode='r'):
        stream = open(path, mode)
        opened.append(stream)
        return stream

    try:
        # Open inside the try block so that a failure to open a later file
        # still closes the ones opened before it.
        patch_stream = sys.stdin if input is None else _open(input)
        tocsv_stream = sys.stdout if output is None else _open(output, 'w')
        fromcsv_stream = _open(input_csv)

        patch_file(patch_stream, fromcsv_stream, tocsv_stream, strict=strict)

    except patch.InvalidPatchError as e:
        error.abort('reading patch, {0}'.format(e.args[0]))

    finally:
        for stream in opened:
            stream.close()