Skip to content

Commit

Permalink
Merge branch 'release/v0.3.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
Douglas Mennella committed Apr 29, 2022
2 parents 243f6e6 + 0d74297 commit bc723c7
Show file tree
Hide file tree
Showing 7 changed files with 224 additions and 171 deletions.
2 changes: 1 addition & 1 deletion csv_reconcile/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
except:
from importlib import metadata

__version__ = '0.3.0'
__version__ = '0.3.1'
#------------------------------------------------------------------
# Implement reconciliation API
# [[https://reconciliation-api.github.io/specs/latest/]]
Expand Down
4 changes: 0 additions & 4 deletions csv_reconcile/default_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@

DATABASE = 'csvreconcile.db'

CSVKWARGS = dict(delimiter='\t')

# CSVENCODING='utf-8-sig'

LIMIT = 10 # At most 10 matches per query

THRESHOLD = 30.0 # At least a 30% match
Expand Down
17 changes: 16 additions & 1 deletion csv_reconcile/initdb.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from flask import current_app
import csv
from chardet.universaldetector import UniversalDetector

from collections import defaultdict
from itertools import count

Expand Down Expand Up @@ -39,6 +41,15 @@ def initReconcileTable(db, colnames):
# create data table with the contents of the csv file
db.execute(',\n '.join(create) + '\n)')

def detectEncoding(filenm):
    """Guess the text encoding of the file at ``filenm``.

    The file is opened in binary mode and fed line by line to a chardet
    ``UniversalDetector`` until the detector reports it is done or the
    file is exhausted.

    Returns the detected encoding name when the detector's confidence is
    above 0.95, otherwise ``None``.
    """
    detector = UniversalDetector()

    # Open through a context manager so the handle is closed
    # deterministically, including when feeding stops early or raises.
    with open(filenm, 'rb') as rawfile:
        for line in rawfile:
            detector.feed(line)
            if detector.done:
                break

    # close() finalises the detector's result.
    detector.close()

    result = detector.result
    if result['confidence'] > .95:
        return result['encoding']
    return None

def init_db(db,
csvfilenm,
Expand All @@ -49,6 +60,8 @@ def init_db(db,
csvkwargs=None):

enckwarg = dict()
csvencoding = csvencoding or detectEncoding(csvfilenm)

if csvencoding:
enckwarg['encoding'] = csvencoding

Expand All @@ -60,7 +73,9 @@ def init_db(db,
with db:
# Create a table with ids (as PRIMARY ID), words and bigrams
with open(csvfilenm, newline='', **enckwarg) as csvfile:
reader = csv.reader(csvfile, **csvkwargs)
dialect = csv.Sniffer().sniff(csvfile.read(1024))
csvfile.seek(0)
reader = csv.reader(csvfile, dialect, **csvkwargs)
header = next(reader)

# Throws if col doesn't exist
Expand Down
325 changes: 163 additions & 162 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "csv-reconcile"
version = "0.3.0"
version = "0.3.1"
description = "OpenRefine reconciliation service backed by csv resource"
authors = ["Douglas Mennella <[email protected]>"]
license = "MIT"
Expand All @@ -23,9 +23,10 @@ flask-cors = "^3.0.10"
cython = "^0.29.21"
normality = "^2.1.1"
importlib_metadata = { version = "^4.5.0", python = "<3.10" }
chardet = "^4.0.0"

[tool.poetry.dev-dependencies]
pytest = "^6.2"
pytest = "^7.1"

[tool.poe.tasks]
dummydoc = { script = "utils:dummydoc" }
Expand Down
18 changes: 18 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ def csvcontents():
third 3 and so on
'''.strip()

@pytest.fixture
def ambiguous_csvcontents():
    """Return CSV text meant to mislead csv.Sniffer() so overrides can be tested."""
    raw_text = '''
These, my friends, are the columns
However, above all, columns matter most
'''
    # Drop the leading/trailing newlines introduced by the literal's layout
    return raw_text.strip()


@pytest.fixture
def formContentHeader():
Expand Down Expand Up @@ -59,6 +67,16 @@ def setup(tmp_path, csvcontents, idnm):
p.write_text(csvcontents)
return (p, *idnm)

@pytest.fixture
def ambiguous_setup(tmp_path, ambiguous_csvcontents):
    """Provide a factory that writes the ambiguous csv file.

    The returned callable takes an iterable ``idnm`` and returns a tuple
    made of the written file's path followed by the items of ``idnm``.
    """
    csv_path = tmp_path / "amb_csvfile"

    def build(idnm):
        # The file is (re)written on every call before its path is handed back
        csv_path.write_text(ambiguous_csvcontents)
        return (csv_path,) + tuple(idnm)

    return build

@pytest.fixture
def cfgContents():
Expand Down
24 changes: 23 additions & 1 deletion tests/main/test_csv_reconcile.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import pytest

from csv_reconcile import __version__, scorer
from csv_reconcile.db import getCSVCols

import json
from urllib.parse import urlencode


def test_version():
assert __version__ == '0.3.0'
assert __version__ == '0.3.1'


def test_manifest(basicClient):
Expand Down Expand Up @@ -283,3 +285,23 @@ def valid(normalizedFields):

# processScoreOptions still called once, getNormalizedFields only called twice
assert called[:2] == [1, 2]

def test_csv_sniffer_overrides(app, ambiguous_setup, ambiguous_csvcontents, config, mkConfig):
    """Check the sniffed separator is used first and that CSVKWARGS overrides it."""
    first_line = ambiguous_csvcontents.splitlines()[0]

    def split_header(sep):
        # Column names obtained when the header line is split on ``sep``
        return [field.strip() for field in first_line.split(sep)]

    def loaded_header_names(flask_app):
        # Read back the column names stored for the app under test
        with flask_app.app_context():
            return [name for _, name in getCSVCols()]

    # First guess is that the , is a separator
    comma_cols = split_header(',')
    chk = app(ambiguous_setup(comma_cols[:2]), config)
    assert loaded_header_names(chk) == comma_cols

    # Now parse with override
    space_cols = split_header(' ')
    cfg = mkConfig('CSVKWARGS = {"delimiter": " "}')
    chk = app(ambiguous_setup(space_cols[:2]), cfg)
    assert loaded_header_names(chk) == space_cols

0 comments on commit bc723c7

Please sign in to comment.