From 3406af37efb7ec3d027d3e4b3afff6b007079e6f Mon Sep 17 00:00:00 2001 From: Douglas Mennella Date: Fri, 29 Apr 2022 19:33:13 +0900 Subject: [PATCH 1/2] Fixes #68. Protect against sniffing failing. Fallback to overrides --- csv_reconcile/initdb.py | 9 +++++++-- tests/conftest.py | 18 ++++++++++++++++++ tests/main/test_csv_reconcile.py | 19 +++++++++++++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/csv_reconcile/initdb.py b/csv_reconcile/initdb.py index 807e435..15d5668 100644 --- a/csv_reconcile/initdb.py +++ b/csv_reconcile/initdb.py @@ -73,9 +73,14 @@ def init_db(db, with db: # Create a table with ids (as PRIMARY ID), words and bigrams with open(csvfilenm, newline='', **enckwarg) as csvfile: - dialect = csv.Sniffer().sniff(csvfile.read(1024)) + dialect = None + try: + dialect = csv.Sniffer().sniff(csvfile.read(1024)) + except: + pass + csvfile.seek(0) - reader = csv.reader(csvfile, dialect, **csvkwargs) + reader = csv.reader(csvfile, dialect=dialect, **csvkwargs) header = next(reader) # Throws if col doesn't exist diff --git a/tests/conftest.py b/tests/conftest.py index 6359a4e..8e6cabb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -34,6 +34,13 @@ def ambiguous_csvcontents(): However, above all, columns matter most '''.strip() +@pytest.fixture +def sniffer_throwing_csvcontents(): + '''Try to throw off csv.Sniffer() to test overrides''' + return ''' +a,b,c\n1,2 +'''.strip() + @pytest.fixture def formContentHeader(): @@ -78,6 +85,17 @@ def getSetup(idnm): return getSetup +@pytest.fixture +def sniffer_throwing_setup(tmp_path, sniffer_throwing_csvcontents): + '''mock csv file with id and name columns indicated''' + + def getSetup(idnm): + p = tmp_path / "snfthrw_csvfile" + p.write_text(sniffer_throwing_csvcontents) + return (p, *idnm) + + return getSetup + @pytest.fixture def cfgContents(): return ''' diff --git a/tests/main/test_csv_reconcile.py b/tests/main/test_csv_reconcile.py index 8d80d8a..0c7b606 100644 --- a/tests/main/test_csv_reconcile.py +++ b/tests/main/test_csv_reconcile.py @@ -305,3 +305,22 @@ def test_csv_sniffer_overrides(app, ambiguous_setup, ambiguous_csvcontents, conf with chk.app_context(): headernms = [name for _,name in getCSVCols()] assert headernms == items(SEP) + +def test_csv_sniffer_throwing(app, sniffer_throwing_setup, sniffer_throwing_csvcontents, config, mkConfig): + + topline = sniffer_throwing_csvcontents.splitlines()[0] + items = lambda sep: [ h.strip() for h in topline.split(sep)] + + # First guess is that the , is a separator + SEP = ',' + chk = app(sniffer_throwing_setup(items(SEP)[:2]), config) + with chk.app_context(): + headernms = [name for _,name in getCSVCols()] + assert headernms == items(SEP) + + # Now parse with override + cfg = mkConfig('CSVKWARGS = {"delimiter": ","}') + chk = app(sniffer_throwing_setup(items(SEP)[:2]), cfg) + with chk.app_context(): + headernms = [name for _,name in getCSVCols()] + assert headernms == items(SEP) From 5112a0cd111c4793ff70eca65fdb335abd5fd0ae Mon Sep 17 00:00:00 2001 From: Douglas Mennella Date: Sat, 30 Apr 2022 14:45:54 +0900 Subject: [PATCH 2/2] Automated csv file encoding detection (defensive) --- csv_reconcile/__init__.py | 2 +- pyproject.toml | 2 +- tests/main/test_csv_reconcile.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/csv_reconcile/__init__.py b/csv_reconcile/__init__.py index 928a955..3575fc2 100644 --- a/csv_reconcile/__init__.py +++ b/csv_reconcile/__init__.py @@ -22,7 +22,7 @@ except: from importlib import metadata -__version__ = '0.3.1' +__version__ = '0.3.2' #------------------------------------------------------------------ # Implement reconciliation API # [[https://reconciliation-api.github.io/specs/latest/]] diff --git a/pyproject.toml b/pyproject.toml index 8db7102..5538297 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "csv-reconcile" -version = "0.3.1" +version = "0.3.2" description = "OpenRefine reconciliation service backed by csv resource" authors = ["Douglas Mennella "] license = "MIT" diff --git a/tests/main/test_csv_reconcile.py b/tests/main/test_csv_reconcile.py index 0c7b606..36cd001 100644 --- a/tests/main/test_csv_reconcile.py +++ b/tests/main/test_csv_reconcile.py @@ -8,7 +8,7 @@ def test_version(): - assert __version__ == '0.3.1' + assert __version__ == '0.3.2' def test_manifest(basicClient):