#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# __init__.py
# csvdiff
#
from __future__ import absolute_import, print_function, division
import sys
import click
from . import records, patch, error
__author__ = 'Lars Yencken'
__email__ = 'lars@yencken.org'
__version__ = '0.3.1'
if sys.version_info.major == 2:
import StringIO as io
else:
import io
# Exit codes for the command-line tools. In this module EXIT_SAME and
# EXIT_DIFFERENT are passed to sys.exit() once a diff has been computed.
EXIT_SAME = 0  # the computed patch was empty
EXIT_DIFFERENT = 1  # the computed patch contained at least one change
EXIT_ERROR = 2  # NOTE(review): not referenced in this file; presumably used by error.abort -- confirm
def diff_files(from_file, to_file, index_columns, sep=',', ignored_columns=None):
    """
    Compute the patch that turns the CSV file at ``from_file`` into the CSV
    file at ``to_file``.

    Both paths are opened for reading and parsed with ``records.load`` using
    ``sep`` as the field separator. The two record sets are then handed to
    ``patch.create`` together with ``index_columns`` and the optional
    ``ignored_columns``; whatever ``patch.create`` returns is returned here.
    """
    with open(from_file) as lhs_stream, open(to_file) as rhs_stream:
        lhs_records = records.load(lhs_stream, sep=sep)
        rhs_records = records.load(rhs_stream, sep=sep)
        # build the patch while both files are still open
        result = patch.create(lhs_records, rhs_records, index_columns,
                              ignore_columns=ignored_columns)
    return result
def diff_records(from_records, to_records, index_columns):
    """
    Build the patch that transforms ``from_records`` into ``to_records``.

    Both arguments are sequences of dictionary records; ``index_columns``
    names the columns that identify a row. The work is delegated to
    ``patch.create`` and its result is returned unchanged.
    """
    record_patch = patch.create(from_records, to_records, index_columns)
    return record_patch
def patch_file(patch_stream, fromcsv_stream, tocsv_stream, strict=True,
               sep=','):
    """
    Read a patch and a source CSV, apply the patch, and write the patched
    records to the target stream.

    ``patch_stream`` is read with ``patch.load`` and ``fromcsv_stream`` with
    ``records.load`` (using ``sep``). ``strict`` is forwarded to
    ``patch.apply``. The result is written to ``tocsv_stream`` via
    ``records.save``.
    """
    loaded_patch = patch.load(patch_stream)
    source_records = records.load(fromcsv_stream, sep=sep)
    patched_records = patch.apply(loaded_patch, source_records, strict=strict)

    # decide on the column order for the output
    if not patched_records:
        # nothing to write: fall back to the header order of the source
        header = source_records.fieldnames
    else:
        # data available: index columns first, the rest alphabetically
        header = _nice_fieldnames(patched_records[0].keys(),
                                  loaded_patch['_index'])

    records.save(patched_records, header, tocsv_stream)
def patch_records(diff, from_records, strict=True):
    """
    Transform ``from_records`` by applying the patch ``diff`` to them.

    This delegates to ``patch.apply``, forwarding ``strict``, and returns
    the transformed records it produces.
    """
    patched = patch.apply(diff, from_records, strict=strict)
    return patched
def _nice_fieldnames(all_columns, index_columns):
    """
    Indexes on the left, other fields in alphabetical order on the right.

    ``all_columns`` and ``index_columns`` may be any iterables of column
    names (list, tuple, dict keys view, ...). A new list is returned and
    neither argument is modified.
    """
    # Normalise first: list + tuple concatenation would raise TypeError, and
    # a one-shot iterator must not be consumed twice.
    index_columns = list(index_columns)
    non_index_columns = set(all_columns).difference(index_columns)
    return index_columns + sorted(non_index_columns)
class CSVType(click.ParamType):
    """
    Click parameter type for a comma-separated list of values.

    The raw command-line value is turned into a list of strings by
    splitting it on commas.
    """
    name = 'csv'

    def convert(self, value, param, ctx):
        """
        Split ``value`` on commas, decoding it to text first if it arrived
        as bytes. ``param`` and ``ctx`` are accepted but not used.
        """
        if isinstance(value, bytes):
            value = self._decode(value)
        return value.split(',')

    @staticmethod
    def _decode(raw):
        "Decode bytes using stdin's encoding, then the filesystem's, then UTF-8."
        candidates = (
            getattr(sys.stdin, 'encoding', None),
            sys.getfilesystemencoding(),
        )
        for encoding in candidates:
            if encoding is None:
                # No usable encoding here; try the next candidate instead of
                # leaving the value as bytes (bytes.split(',') fails on
                # Python 3).
                continue
            try:
                return raw.decode(encoding)
            except UnicodeError:
                continue
        # last resort: never fail, substitute undecodable bytes
        return raw.decode('utf-8', 'replace')

    def __repr__(self):
        return 'CSV'
@click.command()
@click.argument('index_columns', type=CSVType())
@click.argument('from_csv', type=click.Path(exists=True))
@click.argument('to_csv', type=click.Path(exists=True))
@click.option('--style',
              type=click.Choice(['compact', 'pretty', 'summary']),
              default='compact',
              help=('Instead of the default compact output, pretty-print '
                    'or give a summary instead'))
@click.option('--output', '-o', type=click.Path(),
              help='Output to a file instead of stdout')
@click.option('--quiet', '-q', is_flag=True,
              help="Don't output anything, just use exit codes")
@click.option('--sep', default=',',
              help='Separator to use between fields [default: comma]')
@click.option('--ignore-columns', '-i', type=CSVType(),
              help='a comma seperated list of columns to ignore from the comparison')
@click.option('--significance', type=int,
              help='Ignore numeric changes less than this number of significant figures')
def csvdiff_cmd(index_columns, from_csv, to_csv, style=None, output=None,
                sep=',', quiet=False, ignore_columns=None, significance=None):
    """
    Compare two csv files to see what rows differ between them. The files
    are each expected to have a header row, and for each row to be uniquely
    identified by one or more indexing columns.
    """
    # The docstring above is what click shows for --help, so implementation
    # notes are kept in comments.

    # An index column cannot also be excluded from the comparison.
    if ignore_columns is not None:
        for column in ignore_columns:
            if column in index_columns:
                error.abort("You can't ignore an index column")

    # Pick the destination: an explicit file wins, then --quiet (output is
    # written to an in-memory buffer and discarded), otherwise stdout.
    if output:
        ostream = open(output, 'w')
    elif quiet:
        ostream = io.StringIO()
    else:
        ostream = sys.stdout

    try:
        # Both helpers finish by calling sys.exit(), so control leaves this
        # block through SystemExit and the finally clause.
        if style == 'summary':
            _diff_and_summarize(from_csv, to_csv, index_columns, ostream,
                                sep=sep, ignored_columns=ignore_columns,
                                significance=significance)
        else:
            compact = (style == 'compact')
            _diff_files_to_stream(from_csv, to_csv, index_columns, ostream,
                                  compact=compact, sep=sep,
                                  ignored_columns=ignore_columns,
                                  significance=significance)
    except records.InvalidKeyError as e:
        error.abort(e.args[0])
    finally:
        # Only close what this command created; sys.stdout belongs to the
        # process (or to whoever redirected it) and must stay usable.
        if ostream is not sys.stdout:
            ostream.close()
def _diff_files_to_stream(from_csv, to_csv, index_columns, ostream,
                          compact=False, sep=',', ignored_columns=None,
                          significance=None):
    """
    Diff two CSV files, write the patch to ``ostream`` and exit.

    The patch comes from ``diff_files``; when ``significance`` is given it
    is first passed through ``patch.filter_significance``. After saving, the
    process exits via ``sys.exit`` with ``EXIT_SAME`` if the patch is empty
    and ``EXIT_DIFFERENT`` otherwise, so this function never returns.
    """
    result = diff_files(from_csv, to_csv, index_columns, sep=sep,
                        ignored_columns=ignored_columns)

    if significance is not None:
        result = patch.filter_significance(result, significance)

    patch.save(result, ostream, compact=compact)

    if patch.is_empty(result):
        status = EXIT_SAME
    else:
        status = EXIT_DIFFERENT
    sys.exit(status)
def _diff_and_summarize(from_csv, to_csv, index_columns, stream=None,
                        sep=',', ignored_columns=None, significance=None):
    """
    Print a summary of the difference between the two files.

    The summary is written to ``stream``; when ``stream`` is None the
    current ``sys.stdout`` is looked up at call time, so redirection that
    happened after import is honoured. When ``significance`` is given the
    patch is first passed through ``patch.filter_significance``. The process
    then exits via ``sys.exit`` with ``EXIT_SAME`` or ``EXIT_DIFFERENT``, so
    this function never returns.
    """
    if stream is None:
        stream = sys.stdout

    # NOTE(review): from_csv / to_csv are handed to records.load as-is,
    # whereas diff_files opens the paths itself -- confirm that records.load
    # accepts paths as well as streams.
    # The source records are materialised so they can be counted below.
    from_records = list(records.load(from_csv, sep=sep))
    to_records = records.load(to_csv, sep=sep)

    diff = patch.create(from_records, to_records, index_columns,
                        ignored_columns)
    if significance is not None:
        diff = patch.filter_significance(diff, significance)

    _summarize_diff(diff, len(from_records), stream=stream)

    exit_code = EXIT_SAME if patch.is_empty(diff) else EXIT_DIFFERENT
    sys.exit(exit_code)
def _summarize_diff(diff, orig_size, stream=None):
    """
    Write a short human-readable summary of ``diff`` to ``stream``.

    ``diff`` must provide the keys 'removed', 'added' and 'changed', each
    holding a sized collection. ``orig_size`` is the number of rows in the
    original data and is the denominator for the percentages. When
    ``stream`` is None the current ``sys.stdout`` is looked up at call time,
    so redirection that happened after import is honoured.
    """
    if stream is None:
        stream = sys.stdout

    if orig_size == 0:
        # slightly arbitrary when the original data was empty; avoids a
        # division by zero below
        orig_size = 1

    n_removed = len(diff['removed'])
    n_added = len(diff['added'])
    n_changed = len(diff['changed'])

    if not (n_removed or n_added or n_changed):
        print(u'files are identical', file=stream)
        return

    summary_lines = (
        (u'%d rows removed (%.01f%%)', n_removed),
        (u'%d rows added (%.01f%%)', n_added),
        (u'%d rows changed (%.01f%%)', n_changed),
    )
    for template, count in summary_lines:
        # true division: the module enables it via __future__ on Python 2
        print(template % (count, 100 * count / orig_size), file=stream)
@click.command()
@click.argument('input_csv', type=click.Path(exists=True))
@click.option('--input', '-i', type=click.Path(exists=True),
              help='Read the JSON patch from the given file.')
@click.option('--output', '-o', type=click.Path(),
              help='Write the transformed CSV to the given file.')
@click.option('--strict/--no-strict', default=True,
              help='Whether or not to tolerate a changed source document '
                   '(default: strict)')
def csvpatch_cmd(input_csv, input=None, output=None, strict=True):
    """
    Apply the changes from a csvdiff patch to an existing CSV file.
    """
    # The docstring above is what click shows for --help, so implementation
    # notes are kept in comments. The parameter name `input` shadows the
    # builtin but is fixed by the --input option name.

    # Streams opened here, as opposed to the process-wide stdin/stdout;
    # only these are closed afterwards.
    owned_streams = []
    try:
        if input is None:
            patch_stream = sys.stdin
        else:
            patch_stream = open(input)
            owned_streams.append(patch_stream)

        # Open the source before the target so that a failure here cannot
        # leave behind a freshly truncated output file.
        fromcsv_stream = open(input_csv)
        owned_streams.append(fromcsv_stream)

        if output is None:
            tocsv_stream = sys.stdout
        else:
            tocsv_stream = open(output, 'w')
            owned_streams.append(tocsv_stream)

        patch_file(patch_stream, fromcsv_stream, tocsv_stream, strict=strict)
    except patch.InvalidPatchError as e:
        error.abort('reading patch, {0}'.format(e.args[0]))
    finally:
        # Runs even when an open() above failed part-way through, so no
        # already-opened file is leaked.
        for stream in reversed(owned_streams):
            stream.close()