Merge branch 'release/v0.3.1'

gitonthescene · Apr 29, 2022 · bc723c7 · bc723c7
2 parents 243f6e6 + 0d74297
commit bc723c7
Show file tree

Hide file tree

Showing 7 changed files with 224 additions and 171 deletions.
diff --git a/csv_reconcile/__init__.py b/csv_reconcile/__init__.py
@@ -22,7 +22,7 @@
 except:
     from importlib import metadata
 
-__version__ = '0.3.0'
+__version__ = '0.3.1'
 #------------------------------------------------------------------
 # Implement reconciliation API
 # [[https://reconciliation-api.github.io/specs/latest/]]

diff --git a/csv_reconcile/default_settings.py b/csv_reconcile/default_settings.py
@@ -2,10 +2,6 @@
 
 DATABASE = 'csvreconcile.db'
 
-CSVKWARGS = dict(delimiter='\t')
-
-# CSVENCODING='utf-8-sig'
-
 LIMIT = 10  # At most 10 matches per query
 
 THRESHOLD = 30.0  # At least a 30% match

diff --git a/csv_reconcile/initdb.py b/csv_reconcile/initdb.py
@@ -1,5 +1,7 @@
 from flask import current_app
 import csv
+from chardet.universaldetector import UniversalDetector
+
 from collections import defaultdict
 from itertools import count
 
@@ -39,6 +41,15 @@ def initReconcileTable(db, colnames):
     # create data table with the contents of the csv file
     db.execute(',\n  '.join(create) + '\n)')
 
+def detectEncoding(filenm):
+    '''Return chardet's encoding guess for filenm, or None unless confidence > .95'''
+    detector = UniversalDetector()
+    with open(filenm, 'rb') as rawfile:  # ensure the handle is closed, even on early break
+        for line in rawfile:
+            detector.feed(line)
+            if detector.done: break
+    detector.close()
+    return detector.result['encoding'] if detector.result['confidence'] > .95 else None
 
 def init_db(db,
             csvfilenm,
@@ -49,6 +60,8 @@ def init_db(db,
             csvkwargs=None):
 
     enckwarg = dict()
+    csvencoding = csvencoding or detectEncoding(csvfilenm)
+
     if csvencoding:
         enckwarg['encoding'] = csvencoding
 
@@ -60,7 +73,9 @@ def init_db(db,
     with db:
         # Create a table with ids (as PRIMARY ID), words and bigrams
         with open(csvfilenm, newline='', **enckwarg) as csvfile:
-            reader = csv.reader(csvfile, **csvkwargs)
+            dialect = csv.Sniffer().sniff(csvfile.read(1024))
+            csvfile.seek(0)
+            reader = csv.reader(csvfile, dialect, **csvkwargs)
             header = next(reader)
 
             # Throws if col doesn't exist

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "csv-reconcile"
-version = "0.3.0"
+version = "0.3.1"
 description = "OpenRefine reconciliation service backed by csv resource"
 authors = ["Douglas Mennella <[email protected]>"]
 license = "MIT"
@@ -23,9 +23,10 @@ flask-cors = "^3.0.10"
 cython = "^0.29.21"
 normality = "^2.1.1"
 importlib_metadata = { version = "^4.5.0", python = "<3.10" }
+chardet = "^4.0.0"
 
 [tool.poetry.dev-dependencies]
-pytest = "^6.2"
+pytest = "^7.1"
 
 [tool.poe.tasks]
 dummydoc = { script = "utils:dummydoc" }

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -26,6 +26,14 @@ def csvcontents():
 third	3	and so on
 '''.strip()
 
+@pytest.fixture
+def ambiguous_csvcontents():
+    '''Two-line text where both ',' and ' ' could be the delimiter; meant to throw off csv.Sniffer() to test overrides'''
+    return '''
+These, my friends, are the columns
+However, above all, columns matter most
+'''.strip()
+
 
 @pytest.fixture
 def formContentHeader():
@@ -59,6 +67,16 @@ def setup(tmp_path, csvcontents, idnm):
     p.write_text(csvcontents)
     return (p, *idnm)
 
+@pytest.fixture
+def ambiguous_setup(tmp_path, ambiguous_csvcontents):
+    '''Return a factory getSetup(idnm) that writes the ambiguous csv under tmp_path and returns (path, *idnm)'''

+    def getSetup(idnm):
+        p = tmp_path / "amb_csvfile"
+        p.write_text(ambiguous_csvcontents)
+        return (p, *idnm)

+    return getSetup
 
 @pytest.fixture
 def cfgContents():

diff --git a/tests/main/test_csv_reconcile.py b/tests/main/test_csv_reconcile.py
@@ -1,12 +1,14 @@
 import pytest
 
 from csv_reconcile import __version__, scorer
+from csv_reconcile.db import getCSVCols
+
 import json
 from urllib.parse import urlencode
 
 
 def test_version():
-    assert __version__ == '0.3.0'
+    assert __version__ == '0.3.1'
 
 
 def test_manifest(basicClient):
@@ -283,3 +285,23 @@ def valid(normalizedFields):
 
     # processScoreOptions still called once, getNormalizedFields only called twice
     assert called[:2] == [1, 2]
+
+def test_csv_sniffer_overrides(app, ambiguous_setup, ambiguous_csvcontents, config, mkConfig):
+    '''Header columns split on ',' with the plain config and on ' ' when CSVKWARGS sets the delimiter'''
+    topline = ambiguous_csvcontents.splitlines()[0]  # header row of the mock csv
+    items = lambda sep: [ h.strip() for h in topline.split(sep)]

+    # First guess is that the , is a separator
+    SEP = ','
+    chk = app(ambiguous_setup(items(SEP)[:2]), config)  # first two header names are passed as idnm
+    with chk.app_context():
+        headernms = [name for _,name in getCSVCols()]
+        assert headernms == items(SEP)

+    # Now parse with override
+    SEP = ' '
+    cfg = mkConfig('CSVKWARGS = {"delimiter": " "}')
+    chk = app(ambiguous_setup(items(SEP)[:2]), cfg)
+    with chk.app_context():
+        headernms = [name for _,name in getCSVCols()]
+        assert headernms == items(SEP)