diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a4ca423 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Kishore Kumar + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..c83f261 --- /dev/null +++ b/README.md @@ -0,0 +1,46 @@ +# comparesv +### CSV Comparison on steroids + +## Usage + +```console +comparesv [-h] [-v] [--enc1 ENCODING] [--enc2 ENCODING] [-i] + [-rm ROW_MATCH] [-cm COLUMN_MATCH] [-sm STRING_MATCH] [-ir] + [-ic] [-is] [-s] + [FILE1] [FILE2] + +CSV files comparison + +positional arguments: + FILE1 the first CSV file + FILE2 the second CSV file + +optional arguments: + -h, --help show this help message and exit + -v, --version show program's version number and exit + --enc1 ENCODING encoding of the first file (default is to autodetect) + --enc2 ENCODING encoding of the second file (default is to autodetect) + -i, --ignore-case ignore case (default is case-sensitive) + -rm ROW_MATCH, --row-match ROW_MATCH + Logic to be used to identify the rows. Possible + options 'order', 'fuzzy', 'deep' (default is order) + -cm COLUMN_MATCH, --column-match COLUMN_MATCH + Logic to be used to identify the columns. Possible + options 'exact','fuzzy' (default is exact) + -sm STRING_MATCH, --string-match STRING_MATCH + Logic to be used to compare string values. 
"""Command-line entry point for comparesv (CSV files comparison).

This file is derived from
https://github.com/maxharlow/csvmatch/blob/master/cli.py

Flags that control the output (see ``arguments`` for every option):

  -ir, --include-addnl-rows     include additional rows from the second file
  -ic, --include-addnl-columns  include additional columns from the second file
  -is, --include-stats          include per-column match statistics
  -s,  --save-output            save values.csv / results.csv in the
                                current directory
"""
import argparse
import csv
import io
import logging
import os
import sys
import warnings
from pprint import pprint

try:
    from version import __version__
except ImportError:  # version.py not on the path (e.g. imported as a library)
    __version__ = 'unknown'


def main():
    """Run the CLI: parse arguments, read both CSV files, compare, report.

    Any ordinary error is reported as a one-line message on stderr with exit
    status 1.  ``SystemExit`` is deliberately NOT intercepted so that
    ``--help`` / ``--version`` keep their normal exit status.
    """
    # Project module; imported here so this file can be imported (and its
    # helpers reused) without the comparison engine's dependencies.
    import comparesv

    logging.captureWarnings(True)
    logging.basicConfig(level=logging.WARN, format='Warning: %(message)s')
    warnings.formatwarning = lambda e, *args: str(e)
    sys.stderr.write('Starting up...\n')
    try:
        file1, file2, args = arguments()
        data1, headers1 = read(*file1)
        data2, headers2 = read(*file2)
        results = comparesv.run(data1, headers1, data2, headers2, ticker=ticker, **args)
        if args.get('save_output'):
            save_file('values.csv', results['headers'], results['values'])
            save_file('results.csv', results['headers'], results['results'])
        # 'stats' is only present when statistics were requested.
        if 'stats' in results:
            pprint(results['stats'])
        sys.stdout.flush()
    except KeyboardInterrupt:
        sys.exit('Interrupted')
    except Exception as e:
        sys.exit(str(e))


def ticker(text, total):
    """Create a tqdm progress bar labelled *text* and return its ``update``.

    Args:
        text: label shown in front of the bar.
        total: number of steps the bar represents.

    Returns:
        The bound ``update`` method of the new progress bar.
    """
    import tqdm

    progress = tqdm.tqdm(bar_format=text + ' |{bar}| {percentage:3.0f}% / {remaining} left', total=total)
    return progress.update


def _detect_encoding(text):
    """Return chardet's guess for the encoding of *text* (bytes), or None."""
    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()
    for line in text.split(b'\n'):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']  # can't always be relied upon


def read(filename, encoding):
    """Read a CSV file (or stdin when *filename* is ``'-'``).

    Args:
        filename: path of the CSV file, or ``'-'`` for standard input.
        encoding: character encoding name; falsy means autodetect.

    Returns:
        ``(rows, headers)`` where *headers* is the first CSV record and
        *rows* is the list of the remaining records.

    Raises:
        Exception: with a ``"<filename>: ..."`` message when the file is
            missing, empty, cannot be decoded, or is not valid CSV.
    """
    if filename == '-':
        # Read raw bytes: the text below is decoded explicitly.
        text = sys.stdin.buffer.read()
    else:
        if not os.path.isfile(filename):
            raise Exception(filename + ': no such file')
        with io.open(filename, 'rb') as file:
            text = file.read()
    if not text:
        raise Exception(filename + ': file is empty')
    if not encoding:
        encoding = _detect_encoding(text)
        if not encoding:
            raise Exception(filename + ': could not read file -- try specifying the encoding')
        sys.stderr.write(filename + ': autodetected character encoding as ' + encoding.upper() + '\n')
    try:
        text_decoded = text.decode(encoding)
        reader = csv.reader(io.StringIO(text_decoded, newline=None))
        headers = next(reader)
        return list(reader), headers
    except (UnicodeDecodeError, LookupError) as e:
        # LookupError: the encoding name itself is unknown to Python.
        raise Exception(filename + ': could not read file -- try specifying the encoding') from e
    except csv.Error as e:
        raise Exception(filename + ': could not read file as a CSV') from e


def arguments():
    """Parse ``sys.argv``.

    Returns:
        ``((file1, enc1), (file2, enc2), options)`` where *options* is a dict
        whose keys match the keyword arguments of ``comparesv.run``.

    Exits with status 1 (after printing the help) when neither input file is
    given.
    """
    parser = argparse.ArgumentParser(description='CSV files comparison')
    # argparse requires the version to be a string.
    parser.add_argument('-v', '--version', action='version', version=str(__version__))
    parser.add_argument('FILE1', nargs='?', default='-', help='the first CSV file')
    parser.add_argument('FILE2', nargs='?', default='-', help='the second CSV file')
    parser.add_argument('--enc1', type=str, metavar='ENCODING', help='encoding of the first file (default is to autodetect)')
    parser.add_argument('--enc2', type=str, metavar='ENCODING', help='encoding of the second file (default is to autodetect)')
    parser.add_argument('-i', '--ignore-case', action='store_true', help='ignore case (default is case-sensitive)')
    parser.add_argument('-rm', '--row-match', default='order', choices=['order', 'fuzzy', 'deep'], metavar='ROW_MATCH',
                        help='Logic to be used to identify the rows. Possible options \'order\', \'fuzzy\', \'deep\' (default is order)')
    parser.add_argument('-cm', '--column-match', default='exact', choices=['exact', 'fuzzy'], metavar='COLUMN_MATCH',
                        help='Logic to be used to identify the columns. Possible options \'exact\',\'fuzzy\' (default is exact)')
    parser.add_argument('-sm', '--string-match', default='exact', choices=['exact', 'fuzzy'], metavar='STRING_MATCH',
                        help='Logic to be used to compare string values. Possible options \'exact\',\'fuzzy\' (default is exact)')
    parser.add_argument('-ir', '--include-addnl-rows', action='store_true', help='Include additional rows from second file (default is false)')
    parser.add_argument('-ic', '--include-addnl-columns', action='store_true', help='Include additional columns from second file (default is false)')
    parser.add_argument('-is', '--include-stats', default=True, action='store_true', help='Include stats (default is true)')
    # The two flags above/below default to true, so provide a way to turn them off.
    parser.add_argument('--no-include-stats', dest='include_stats', action='store_false', help='Do not include stats')
    parser.add_argument('-s', '--save-output', default=True, action='store_true', help='Save output to file. This saves the output in the current directory (default is true)')
    parser.add_argument('--no-save-output', dest='save_output', action='store_false', help='Do not save output to file')

    args = vars(parser.parse_args())
    if args['FILE1'] == '-' and args['FILE2'] == '-':
        parser.print_help(sys.stderr)
        parser.exit(1)
    file1 = args.pop('FILE1')
    file2 = args.pop('FILE2')
    enc1 = args.pop('enc1')
    enc2 = args.pop('enc2')
    return (file1, enc1), (file2, enc2), args


def save_file(file_name, keys, results):
    """Write *results* as CSV to *file_name* in the current directory.

    A leading ``S.No`` column with 1-based row numbers is added.

    Args:
        file_name: name of the file to create (overwritten if present).
        keys: list of column headers.
        results: iterable of rows, each an iterable of cell values.
    """
    updated_keys = ['S.No'] + list(keys)
    updated_results = [[number, *result] for number, result in enumerate(results, start=1)]

    # newline='' lets the csv module control line endings itself.
    with open(os.path.join(os.getcwd(), file_name), 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, lineterminator='\n')  # can't use dictwriter as headers are printed even when there's no results
        writer.writerow(updated_keys)
        writer.writerows(updated_results)


def format(results, keys):
    """Return *keys* and *results* rendered as CSV text.

    The trailing newline is stripped.  (The name shadows the builtin; it is
    kept because it is this module's existing public name.)
    """
    writer_io = io.StringIO()
    writer = csv.writer(writer_io, lineterminator='\n')  # can't use dictwriter as headers are printed even when there's no results
    writer.writerow(keys)
    writer.writerows(results)
    return writer_io.getvalue()[:-1]


if __name__ == '__main__':
    main()
def run(data1,
        headers1,
        data2,
        headers2,
        ignore_case=False,
        row_match='order',
        column_match='exact',
        string_match='exact',
        include_addnl_rows=False,
        include_addnl_columns=False,
        include_stats=True,
        save_output=True,
        ticker=None):
    """Compare two tables and return the cell-by-cell outcome.

    Args:
        data1, data2: lists of rows (each a list of cells) of the first and
            second table, without their header rows.
        headers1, headers2: the header rows of the two tables.
        ignore_case: compare text case-insensitively.
        row_match: how rows are paired ('order', 'fuzzy' or 'deep').
        column_match: how columns are paired ('exact' or 'fuzzy').
        string_match: how text cells are compared ('exact' or 'fuzzy').
        include_addnl_rows: also report rows only present in *data2*.
        include_addnl_columns: also report columns only present in *data2*.
        include_stats: add a per-column match percentage under ``"stats"``.
        save_output, ticker: accepted for CLI compatibility; not used here.

    Returns:
        dict with keys ``"results"`` (rows of booleans), ``"values"`` (rows of
        ``"[cell1]:[cell2]"`` strings), ``"added"``, ``"deleted"``,
        ``"headers"`` and, when *include_stats* is true, ``"stats"``.
    """
    headers1 = cleanup(headers1)
    headers2 = cleanup(headers2)
    matched_headers = prepare_headers(data1, headers1, headers2, column_match)
    comparison_output, added_rows, deleted_rows = compare_data(
        data1, data2, headers1, headers2, matched_headers,
        row_match=row_match,
        string_match=string_match,
        include_addnl_rows=include_addnl_rows,
        include_addnl_columns=include_addnl_columns,
        ignore_case=ignore_case)

    rows_result_list, rows_values_list = populate_output(comparison_output)

    final_headers = headers1
    if include_addnl_columns:
        # Columns that exist only in the second table are appended at the end,
        # in the same order compare_rows appends their cells.
        final_headers = final_headers + populate_headers(matched_headers, headers2)

    final_result = {
        "results": rows_result_list,
        "values": rows_values_list,
        "added": added_rows,
        "deleted": deleted_rows,
        "headers": final_headers,
    }
    if include_stats:
        final_result["stats"] = populate_stats(final_headers, rows_result_list)

    return final_result


def populate_stats(headers, results_list):
    """Return the match percentage of every column.

    Args:
        headers: column names; position *i* corresponds to cell *i* of each row
            in *results_list*.
        results_list: rows of comparison outcomes (``True`` means matched).

    Returns:
        dict mapping each header to its percentage of matched rows formatted
        with two decimals (``"0.00"`` when there are no rows at all).
    """
    total_records = len(results_list)
    stat = {}
    for index, header in enumerate(headers):
        matched_records = [result[index] for result in results_list].count(True)
        # No rows compared: report 0% instead of dividing by zero.
        match_percentage = 100 * matched_records / total_records if total_records else 0.0
        stat[header] = "{:.2f}".format(match_percentage)
    return stat


def populate_output(rows_match):
    """Split compared rows into parallel result and value tables.

    Args:
        rows_match: rows of ``[value_text, result]`` pairs as produced by
            ``compare_rows``.

    Returns:
        ``(rows_result_list, rows_values_list)``: the booleans and the
        ``"[cell1]:[cell2]"`` texts, in the same row/column layout.
    """
    rows_result_list = [[item[1] for item in match] for match in rows_match]
    rows_values_list = [[item[0] for item in match] for match in rows_match]
    return rows_result_list, rows_values_list
from collections import OrderedDict

# Minimum fuzzy score (0-100) for two rows / headers, resp. two cells, to be
# considered the same.
ROW_THRESHOLD = 80
CELL_THRESHOLD = 80


def populate_headers(header_index, headers2):
    """Return the headers of the second table that no first-table column maps to.

    Args:
        header_index: mapping produced by ``prepare_headers``.
        headers2: header row of the second table.
    """
    mapped_headers2 = {info['matched_header'] for info in header_index.values() if info['index'] > -1}
    return [header for header in headers2 if header not in mapped_headers2]


def compare_data(data1, data2, headers1, headers2, matched_headers, **kwargs):
    """Pair every row of *data1* with a row of *data2* and compare them.

    Required keyword options: ``row_match`` ('order', 'fuzzy' or 'deep'),
    ``string_match`` and ``ignore_case``.  Optional: ``include_addnl_rows``
    and ``include_addnl_columns``.

    Returns:
        ``(rows_output, added_rows, deleted_rows)`` where *rows_output* holds
        one ``compare_rows`` result per reported row.
    """
    added_rows = []
    deleted_rows = []
    rows_output = []
    row_match = kwargs['row_match']
    consumed = set()  # indices of data2 rows already paired with a data1 row

    def indices_left():
        return [i for i in range(len(data2)) if i not in consumed]

    # Comparing the data1 rows with available rows in data2
    for index, row1 in enumerate(data1):
        row2 = None
        row2_index = None
        if row_match == 'order':
            if index < len(data2):
                row2, row2_index = data2[index], index
        elif row_match == 'fuzzy':
            row2, row2_index = fuzzy_row_find(row1, data2, headers1, matched_headers, indices_left())
        elif row_match == 'deep':
            row2, row2_index = deep_row_find(row1, data2, headers1, headers2, matched_headers, indices_left(), kwargs)
        if row2 is not None:
            consumed.add(row2_index)

        row_compare_result, mode = compare_rows(row1, row2, matched_headers, headers2, kwargs)
        if mode == 'added':
            added_rows.append(row2)
        elif mode == 'deleted':
            deleted_rows.append(row1)
        rows_output.append(row_compare_result)

    if kwargs.get('include_addnl_rows'):
        # Report the rows of data2 that were never paired.
        for index in indices_left():
            row2 = data2[index]
            row_compare_result, _mode = compare_rows(None, row2, matched_headers, headers2, kwargs)
            added_rows.append(row2)
            rows_output.append(row_compare_result)
    return rows_output, added_rows, deleted_rows


def cleanup(headers):
    """Return *headers* with surrounding whitespace removed from each name."""
    return [header.strip() for header in headers]


def exist_in_list(option, option_list):
    """Look *option* up in *option_list*, ignoring case and outer whitespace.

    Returns:
        ``(exists, index)``; *index* is -1 when the option is not found.
    """
    wanted = str(option).lower().strip()
    cleaned_list = [str(o).lower().strip() for o in option_list]
    if wanted in cleaned_list:
        return True, cleaned_list.index(wanted)
    return False, -1


def prepare_headers(data1, headers1, headers2, column_match):
    """Map every column of the first table to a column of the second one.

    Args:
        data1: rows of the first table (used to guess each column's type).
        headers1, headers2: cleaned header rows.
        column_match: 'exact' (case-insensitive name equality) or 'fuzzy'.

    Returns:
        OrderedDict keyed by first-table header.  Each value has ``'index'``
        (column position in the second table, -1 if unmatched) and, when
        matched, ``'matched_header'`` and ``'type'`` (see
        ``predict_column_type``).
    """
    indices = [-1] * len(headers1)
    if column_match == 'exact':
        for position, header in enumerate(headers1):
            indices[position] = exist_in_list(header, headers2)[1]
    elif column_match == 'fuzzy':
        used = set()
        # Identical names claim their column first so that a fuzzy guess can
        # never steal it; the remaining headers then pick among unused columns.
        for position, header in enumerate(headers1):
            exists, index = exist_in_list(header, headers2)
            if exists:
                indices[position] = index
                used.add(index)
        for position, header in enumerate(headers1):
            if indices[position] != -1:
                continue
            candidates = [i for i in range(len(headers2)) if i not in used]
            choice = fuzzy_column_index(header, [headers2[i] for i in candidates])
            if choice != -1:
                indices[position] = candidates[choice]
                used.add(candidates[choice])

    mapped_headers_index = OrderedDict()
    for position, header in enumerate(headers1):
        index = indices[position]
        column_data = {'index': index}
        if index != -1:
            column_data['matched_header'] = headers2[index]
            # The type is guessed from this column's own cells in data1
            # (position in headers1), skipping rows too short to have it.
            column_data['type'] = predict_column_type(
                [row[position] for row in data1 if position < len(row)])
        mapped_headers_index[header] = column_data

    return mapped_headers_index


def fuzzy_column_index(header, headers_list):
    """Return the index in *headers_list* best matching *header*, or -1.

    An exact (case-insensitive) name wins; otherwise the best fuzzy match is
    used if its score reaches ``ROW_THRESHOLD``.
    """
    exist, index = exist_in_list(header, headers_list)
    if exist:
        return index
    if not headers_list:
        return -1

    from fuzzywuzzy import process

    highest = process.extractOne(header, headers_list)
    if highest is None or highest[1] < ROW_THRESHOLD:
        return -1
    return headers_list.index(highest[0])


def deep_row_find(row, data2, headers1, headers2, matched_headers, data2_indices_left, opts):
    """
    1. Take a row from data1
    2. Compare against all the rows in data2 by column wise data
    3. Get the best matched one

    Only the rows whose index is in *data2_indices_left* are considered.  A
    candidate needs at least one matching cell; ties keep the earliest one.

    Returns:
        ``(row2, index)`` of the best candidate, or ``(None, -1)``.
    """
    count = 0
    selected_index = -1
    selected_row = None
    for index in data2_indices_left:
        row2 = data2[index]
        row_comparison, _mode = compare_rows(row, row2, matched_headers, headers2, opts)
        matches = [cell[1] for cell in row_comparison].count(True)
        if matches > count:
            count = matches
            selected_index = index
            selected_row = row2

    return selected_row, selected_index


def fuzzy_row_find(row, data2, headers1, matched_headers, data2_indices_left):
    """Find the unpaired row of *data2* whose text is closest to *row*.

    Rows are compared as the space-joined text of their cells.

    Returns:
        ``(row2, index)`` with *index* a position in *data2*, or
        ``(None, None)`` when no candidate reaches ``ROW_THRESHOLD``.
    """
    candidates = sorted(i for i in set(data2_indices_left) if isinstance(i, int) and 0 <= i < len(data2))
    if not candidates:
        return None, None

    from fuzzywuzzy import process

    row1 = ' '.join(str(x) for x in row)
    rows_list2 = [' '.join(str(x) for x in data2[i]) for i in candidates]
    highest = process.extractOne(row1, rows_list2)

    if highest is None or highest[1] < ROW_THRESHOLD:
        return None, None

    # Translate the position among the candidates back to a data2 index.
    index = candidates[rows_list2.index(highest[0])]
    return data2[index], index


def _cell_at(row, index):
    """Return ``row[index]``, or "" when the row or the cell does not exist."""
    if not row or index < 0 or index >= len(row):
        return ""
    return row[index]


def compare_rows(row1, row2, header_index, headers2, opts):
    """Compare two rows column by column.

    Args:
        row1: row of the first table, or None/empty for an added row.
        row2: row of the second table, or None/empty for a deleted row.
        header_index: mapping produced by ``prepare_headers``.
        headers2: header row of the second table.
        opts: options; needs ``'string_match'`` and ``'ignore_case'``, may
            have ``'include_addnl_columns'``.

    Returns:
        ``(row_result, mode)`` where *row_result* is a list of
        ``["[cell1]:[cell2]", matched]`` pairs and *mode* is 'existing',
        'added' or 'deleted'.
    """
    mode = "existing"
    if not row1:
        mode = "added"
    if not row2:
        mode = "deleted"

    row_result = []
    for position, column_info in enumerate(header_index.values()):
        cell1 = _cell_at(row1, position)
        cell2 = _cell_at(row2, column_info['index'])

        compare_mode = fetch_compare_mode(column_info.get('type'), opts['string_match'])
        result = compare_cells(cell1, cell2, compare_mode, opts['ignore_case'])
        row_result.append([f"[{cell1}]:[{cell2}]", result])

    if opts.get('include_addnl_columns'):
        mapped_headers2 = {info['matched_header'] for info in header_index.values() if info['index'] > -1}
        for index, header in enumerate(headers2):
            if header in mapped_headers2:
                continue
            cell1 = ""
            cell2 = _cell_at(row2, index)  # row2 is missing for deleted rows

            result = compare_cells(cell1, cell2, "str", opts['ignore_case'])
            row_result.append([f"[{cell1}]:[{cell2}]", result])

    return row_result, mode


def fetch_compare_mode(data_type, string_match):
    """Return the comparison type for a column.

    Text columns use ``"fuzzy_string"`` when fuzzy string matching is
    requested; every other column keeps its own *data_type*.
    """
    if data_type == 'str' and string_match == 'fuzzy':
        return "fuzzy_string"
    return data_type


def _is_blank(value):
    """Return True for a missing cell (None or the empty string)."""
    return value is None or (isinstance(value, str) and value == "")


def compare_cells(cell1, cell2, comparison_type, ignore_case):
    """Return True when the two cells are considered equal.

    Args:
        cell1, cell2: the cell values; a missing cell (None or "") never
            matches anything, while numeric zero is a regular value.
        comparison_type: 'fuzzy_string', 'int', 'float', or anything else for
            a plain text comparison.
        ignore_case: lower-case both texts before a plain text comparison.

    Numeric comparisons fall back to the text comparison when either cell
    cannot be converted.
    """
    if _is_blank(cell1) or _is_blank(cell2):
        return False

    if comparison_type == 'fuzzy_string':
        from fuzzywuzzy import fuzz

        return fuzz.token_set_ratio(str(cell1), str(cell2)) > CELL_THRESHOLD

    try:
        if comparison_type == 'int':
            return int(cell1) == int(cell2)
        if comparison_type == 'float':
            return float(cell1) == float(cell2)
    except (TypeError, ValueError, OverflowError):
        pass  # not numeric after all: compare as text below

    text1, text2 = str(cell1), str(cell2)
    if ignore_case:
        text1, text2 = text1.lower(), text2.lower()
    return text1.strip() == text2.strip()


def predict_column_type(data):
    """
    Predict the data type of the elements present in a list. It will be defaulted to string.

    Args:
        data : array

    Returns:
        type: Column data type name.  A single common type is returned as is;
        for mixed types the widest of str, float, int present wins.
    """
    data_types = {type(item) for item in data}
    if len(data_types) == 1:
        return next(iter(data_types)).__name__
    for candidate in (str, float, int):
        if candidate in data_types:
            return candidate.__name__
    return "str"
url='https://github.com/kishorek', + download_url='https://github.com/user/reponame/archive/v_01.tar.gz', + keywords=['CSV', 'Comparison', 'Compare'], + install_requires=[ + 'chardet==3.0.4', + 'tqdm==4.18.0', + 'unidecode==1.1.1', + 'doublemetaphone==0.1', + 'fuzzywuzzy==0.18.0' + ], + setup_requires=[ + 'chardet==3.0.4', + 'tqdm==4.18.0', + 'unidecode==1.1.1', + 'doublemetaphone==0.1', + 'fuzzywuzzy==0.18.0' + ], + entry_points={ + 'console_scripts': [ + 'comparesv = cli:main' + ] + }, + classifiers=[ + 'Development Status :: 3 - Alpha', # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package + 'Intended Audience :: Developers', + 'Intended Audience :: Developers', + 'Topic :: Software Development :: Build Tools', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 3.6', + 'Natural Language :: English', + 'Topic :: Scientific/Engineering :: Information Analysis', + 'Topic :: Utilities' + ] +) diff --git a/tests.py b/tests.py new file mode 100644 index 0000000..d12fd47 --- /dev/null +++ b/tests.py @@ -0,0 +1,100 @@ +import comparesv + +def test_basic(): + h1 = ["id", "age"] + h2 = ["id", "age"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [["A1", 23], ["A2", 24], ["A3", 34]] + + result = [[True, True], [True, True], [True, True]] + values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']] + output = comparesv.run(d1, h1, d2, h2) + assert result == output['results'] + assert values == output['values'] + +def test_column_order(): + h1 = ["id", "age"] + h2 = ["age", "id"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [[23, "A1"], [24, "A2"], [34, "A3"]] + + result = [[True, True], [True, True], [True, True]] + values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']] + output = comparesv.run(d1, h1, d2, h2) + assert result == output['results'] + assert values == output['values'] + +def 
test_fuzzy_column_order(): + h1 = ["id", "age"] + h2 = ["age of student", "identity"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [[23, "A1"], [24, "A2"], [34, "A3"]] + + result = [[True, True], [True, True], [True, True]] + values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']] + output = comparesv.run(d1, h1, d2, h2, column_match='fuzzy') + assert result == output['results'] + assert values == output['values'] + +def test_row_order_fuzzy(): + h1 = ["id", "age"] + h2 = ["id", "age"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [["A2", 24], ["A1", 23], ["A3", 34]] + + result = [[True, True], [True, True], [True, True]] + values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']] + output = comparesv.run(d1, h1, d2, h2, row_match='fuzzy') + assert result == output['results'] + assert values == output['values'] + +def test_extra_column(): + h1 = ["id", "age", "name"] + h2 = ["id", "age"] + d1 = [["A1", 23, "Alpha"], ["A2", 24, "Beta"], ["A3", 34, "Gamma"]] + d2 = [["A2", 24], ["A1", 23], ["A3", 34]] + + result = [[True, True, False], [True, True, False], [True, True, False]] + values = [['[A1]:[A1]', '[23]:[23]', '[Alpha]:[]'], ['[A2]:[A2]', '[24]:[24]', '[Beta]:[]'], ['[A3]:[A3]', '[34]:[34]', '[Gamma]:[]']] + output = comparesv.run(d1, h1, d2, h2, row_match='fuzzy') + assert result == output['results'] + assert values == output['values'] + +def test_include_extra_rows(): + h1 = ["id", "age"] + h2 = ["id", "age"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [["A1", 23], ["A2", 24], ["A3", 34],["A4", 34]] + + result = [[True, True], [True, True], [True, True], [False, False]] + values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]'],['[]:[A4]', '[]:[34]']] + output = comparesv.run(d1, h1, d2, h2, include_addnl_rows=True) + assert result == output['results'] + assert values == output['values'] + +def test_include_extra_column(): + h1 = 
["id", "age"] + h2 = ["id", "age", "name"] + d1 = [["A2", 24], ["A1", 23], ["A3", 34]] + d2 = [["A1", 23, "Alpha"], ["A2", 24, "Beta"], ["A3", 34, "Gamma"]] + + output = comparesv.run(d1, h1, d2, h2, include_addnl_columns=True) + result = [[False, False, False], [False, False, False], [True, True, False]] + values = [['[A2]:[A1]', '[24]:[23]', '[]:[Alpha]'], + ['[A1]:[A2]', '[23]:[24]', '[]:[Beta]'], + ['[A3]:[A3]', '[34]:[34]', '[]:[Gamma]']] + + assert result == output['results'] + assert values == output['values'] + +def test_basic_case(): + h1 = ["id", "age"] + h2 = ["id", "age"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [["a1", 23], ["a2", 24], ["a3", 34]] + + result = [[True, True], [True, True], [True, True]] + values = [['[A1]:[a1]', '[23]:[23]'], ['[A2]:[a2]', '[24]:[24]'], ['[A3]:[a3]', '[34]:[34]']] + output = comparesv.run(d1, h1, d2, h2, ignore_case=True) + assert result == output['results'] + assert values == output['values'] \ No newline at end of file diff --git a/version.py b/version.py new file mode 100644 index 0000000..76a9af4 --- /dev/null +++ b/version.py @@ -0,0 +1 @@ +__version__ = 0.01 \ No newline at end of file