Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for multi-key and ignoring keys #22

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ Consider two CSV files:
name: Pancakes
age: 2

The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed.
The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed. To use a combination of columns as the key, separate them with a comma, e.g., `--key=id1,id2`.

The `--ignore=col` option means that the `col` column will be ignored during the comparison. To ignore multiple columns, separate them with a comma,
e.g., `--ignore=col1,col2`.

The tool will automatically detect if your files are comma- or tab-separated. You can override this automatic detection and force the tool to use a specific format using `--format=tsv` or `--format=csv`.

Expand Down
16 changes: 11 additions & 5 deletions csv_diff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
from dictdiffer import diff
import json
import hashlib
from operator import itemgetter


def load_csv(fp, key=None, dialect=None):
def load_csv(fp, key=None, dialect=None, ignore=None):
if dialect is None and fp.seekable():
# Peek at first 1MB to sniff the delimiter and other dialect details
peek = fp.read(1024 ** 2)
Expand All @@ -16,24 +17,29 @@ def load_csv(fp, key=None, dialect=None):
pass
fp = csv.reader(fp, dialect=(dialect or "excel"))
headings = next(fp)
rows = [dict(zip(headings, line)) for line in fp]
ignore = set(ignore.split(',')) if ignore else set()
rows = [dict( (k, v) for k,v in zip(headings, line) if k not in ignore) for line in fp]
if key:
keyfn = lambda r: r[key]
keyfn = itemgetter(*key.split(','))
else:
keyfn = lambda r: hashlib.sha1(
json.dumps(r, sort_keys=True).encode("utf8")
).hexdigest()
return {keyfn(r): r for r in rows}


def load_json(fp, key=None):
def load_json(fp, key=None, ignore=None):
raw_list = json.load(fp)
assert isinstance(raw_list, list)
if ignore:
for item in raw_list:
for field in ignore.split(','):
item.pop(field, None)
common_keys = set()
for item in raw_list:
common_keys.update(item.keys())
if key:
keyfn = lambda r: r[key]
keyfn = itemgetter(*key.split(','))
else:
keyfn = lambda r: hashlib.sha1(
json.dumps(r, sort_keys=True).encode("utf8")
Expand Down
17 changes: 13 additions & 4 deletions csv_diff/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,16 @@
type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False),
)
@click.option(
"--key", type=str, default=None, help="Column to use as a unique ID for each row"
"--key",
type=str,
default=None,
help="Column(s) to use as a unique ID for each row. To use multiple keys, separate them with a comma, e.g., key1,key2"
)
@click.option(
"--ignore",
type=str,
default=None,
help="Column(s) to be ignored. To ignore multiple keys, separate them with a comma, e.g., key1,key2"
)
@click.option(
"--format",
Expand Down Expand Up @@ -42,7 +51,7 @@
is_flag=True,
help="Show unchanged fields for rows with at least one change",
)
def cli(previous, current, key, format, json, singular, plural, show_unchanged):
def cli(previous, current, key, ignore, format, json, singular, plural, show_unchanged):
"Diff two CSV or JSON files"
dialect = {
"csv": "excel",
Expand All @@ -51,10 +60,10 @@ def cli(previous, current, key, format, json, singular, plural, show_unchanged):

def load(filename):
if format == "json":
return load_json(open(filename), key=key)
return load_json(open(filename), key=key, ignore=ignore)
else:
return load_csv(
open(filename, newline=""), key=key, dialect=dialect.get(format)
open(filename, newline=""), key=key, dialect=dialect.get(format), ignore=ignore
)

diff = compare(load(previous), load(current), show_unchanged)
Expand Down
68 changes: 68 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,32 @@ def json_files(tmpdir):
return str(one), str(two)


@pytest.fixture
def json_files_two(tmpdir):
    """Write two JSON record lists into ``tmpdir`` and return their paths.

    Both files hold the same four state/county records. The second file
    drops ``extra`` from the CA/Yikes record and changes ``pop`` from 100 to
    300 on the CA/Zoinks record; the two NY records are identical in both.

    Returns a ``(one, two)`` tuple of path strings.
    """
    before = [
        {"state": "CA", "county": "Yikes", "pop": 100, "extra": 1},
        {"state": "NY", "county": "Beep", "pop": 200, "extra": 2},
        {"state": "CA", "county": "Zoinks", "pop": 100},
        {"state": "NY", "county": "Zoinks", "pop": 200},
    ]
    after = [
        {"state": "CA", "county": "Yikes", "pop": 100},
        {"state": "NY", "county": "Beep", "pop": 200, "extra": 2},
        {"state": "CA", "county": "Zoinks", "pop": 300},
        {"state": "NY", "county": "Zoinks", "pop": 200},
    ]

    one = tmpdir / "one.json"
    two = tmpdir / "two.json"
    one.write(json.dumps(before))
    two.write(json.dumps(after))
    return str(one), str(two)

def test_human_cli(tmpdir):
one = tmpdir / "one.csv"
one.write(ONE)
Expand Down Expand Up @@ -234,3 +260,45 @@ def test_semicolon_delimited(tmpdir):
"columns_added": [],
"columns_removed": [],
} == json.loads(result.output.strip())


def test_multikey(json_files_two):
    """``--key state,county`` reports changes keyed by [state, county] pairs."""
    # https://github.com/simonw/csv-diff/issues/7
    previous, current = json_files_two
    args = [previous, current, "--key", "state,county", "--json", "--format", "json"]

    result = CliRunner().invoke(cli.cli, args, catch_exceptions=False)

    assert 0 == result.exit_code
    # Without --ignore, the dropped "extra" field on CA/Yikes shows up as a
    # change alongside the "pop" change on CA/Zoinks.
    expected = {
        "added": [],
        "removed": [],
        "changed": [
            {"key": ["CA", "Yikes"], "changes": {"extra": [1, None]}},
            {"key": ["CA", "Zoinks"], "changes": {"pop": [100, 300]}},
        ],
        "columns_added": [],
        "columns_removed": [],
    }
    assert expected == json.loads(result.output.strip())



def test_ignore(json_files_two):
    """``--ignore extra`` leaves only the ``pop`` change in the CLI output."""
    # https://github.com/simonw/csv-diff/issues/7
    previous, current = json_files_two
    args = [
        previous,
        current,
        "--key",
        "state,county",
        "--ignore",
        "extra",
        "--json",
        "--format",
        "json",
    ]

    result = CliRunner().invoke(cli.cli, args, catch_exceptions=False)

    assert 0 == result.exit_code
    # The "extra" difference on CA/Yikes is ignored, so only CA/Zoinks is
    # expected in "changed".
    expected = {
        "added": [],
        "removed": [],
        "changed": [
            {"key": ["CA", "Zoinks"], "changes": {"pop": [100, 300]}},
        ],
        "columns_added": [],
        "columns_removed": [],
    }
    assert expected == json.loads(result.output.strip())
50 changes: 50 additions & 0 deletions tests/test_csv_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,29 @@
1,Cleo,5
2,Pancakes,3"""

# ELEVEN and TWELVE share the header state,county,pop and the same four rows.
# The county "Zoinks" appears under both CA and NY, so county alone does not
# identify a row. The only difference between the two is the pop value of the
# CA,Zoinks row (100 vs 300). test_multikey loads both with
# key="state,county".
ELEVEN = """state,county,pop
CA,Yikes,100
NY,Beep,200
CA,Zoinks,100
NY,Zoinks,200
"""

TWELVE = """state,county,pop
CA,Yikes,100
NY,Beep,200
CA,Zoinks,300
NY,Zoinks,200
"""

# THIRTEEN and FOURTEEN share the header id,name,age,sex. Row 1 differs in
# sex (male vs female) and row 2 differs in age (4 vs 3).
# test_ignore_columns loads both with key="id", ignore="sex".
THIRTEEN = """id,name,age,sex
1,Cleo,5,male
2,Pancakes,4,female
"""

FOURTEEN = """id,name,age,sex
1,Cleo,5,female
2,Pancakes,3,female
"""

def test_row_changed():
diff = compare(
Expand Down Expand Up @@ -115,3 +138,30 @@ def test_tsv():
"columns_added": [],
"columns_removed": [],
} == diff

def test_multikey():
    """A ``"state,county"`` key yields a tuple key for the changed row."""
    previous = load_csv(io.StringIO(ELEVEN), key="state,county")
    current = load_csv(io.StringIO(TWELVE), key="state,county")

    diff = compare(previous, current)

    # Only the CA/Zoinks row differs between the two inputs (pop 100 -> 300).
    expected = {
        "added": [],
        "removed": [],
        "changed": [{"key": ("CA", "Zoinks"), "changes": {"pop": ["100", "300"]}}],
        "columns_added": [],
        "columns_removed": [],
    }
    assert expected == diff


def test_ignore_columns():
    """With ``ignore="sex"``, only the ``age`` change on row 2 is reported."""
    previous = load_csv(io.StringIO(THIRTEEN), key="id", ignore="sex")
    current = load_csv(io.StringIO(FOURTEEN), key="id", ignore="sex")

    diff = compare(previous, current)

    # Row 1 differs only in the ignored "sex" column, so it is expected to be
    # absent from "changed".
    expected = {
        "added": [],
        "removed": [],
        "changed": [{"key": "2", "changes": {"age": ["4", "3"]}}],
        "columns_added": [],
        "columns_removed": [],
    }
    assert expected == diff