From 1080622c02be75a64ad13dcdf3c2674a2668d194 Mon Sep 17 00:00:00 2001 From: Micha Moskovic Date: Wed, 10 Apr 2019 08:16:58 +0200 Subject: [PATCH 1/4] Enable NYSIIS for Python 3 Signed-off-by: Micha Moskovic --- beard/utils/names.py | 40 +++++++++++++++++++++++---------------- setup.py | 6 ++++-- tests/utils/test_names.py | 15 +++++++++------ 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/beard/utils/names.py b/beard/utils/names.py index b70784d..9ec17d2 100644 --- a/beard/utils/names.py +++ b/beard/utils/names.py @@ -18,6 +18,8 @@ import re import sys +import fuzzy + from .misc import memoize from .strings import asciify @@ -101,8 +103,8 @@ def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"): :param phonetic algorithm: string Which phonetic algorithm will be used. Options: - "double_metaphone" - - "nysiis" (only for Python 2) - - "soundex" (only for Python 2) + - "nysiis" + - "soundex" Returns ------- @@ -112,20 +114,26 @@ def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"): exactly two elements. Only the first results of the double metaphone algorithm are included in tuples. """ - if sys.version[0] == '2': - import fuzzy - dm = fuzzy.DMetaphone() - soundex = fuzzy.Soundex(5) - phonetic_algorithms = { - "double_metaphone": lambda y: dm(y)[0] or '', - "nysiis": lambda y: fuzzy.nysiis(y), - "soundex": lambda y: soundex(y) - } - else: - from ..ext.metaphone import dm - phonetic_algorithms = { - "double_metaphone": lambda y: dm(y)[0] - } + if phonetic_algorithm == "soundex": + error = ( + "The version of the 'fuzzy' package in use has a buggy soundex" + " implementation (see https://github.com/yougov/fuzzy/issues/14 )," + " downgrade the package to 1.1 (compatible with Python 2 only) if" + " you want to use the soundex phonetic encoding." + ) + try: + if fuzzy.Soundex(4)("fuzzy") != "F200": + raise ValueError(error) + except UnicodeDecodeError: + raise ValueError(error) + + dm = fuzzy.DMetaphone() + soundex = fuzzy.Soundex(5) + phonetic_algorithms = { + "double_metaphone": lambda y: (dm(y)[0] or b'').decode(), + "nysiis": lambda y: fuzzy.nysiis(y), + "soundex": lambda y: soundex(y) + } tokens = tokenize_name(name) # Use double metaphone diff --git a/setup.py b/setup.py index 4a88177..f060f02 100644 --- a/setup.py +++ b/setup.py @@ -92,9 +92,11 @@ def run_tests(self): ] if sys.version[0] == '2': - # fuzzy package is not available on Python 3 - # version 1.1 due to Soundex bug in 1.2 + # use version 1.1 due to Soundex bug in 1.2 _install_requires.append("fuzzy==1.1") +else: + # need to use version 1.2 with buggy Soundex for Python 3 compatibility + _install_requires.append("fuzzy~=1.0,>=1.2") _tests_require = [ "coverage", diff --git a/tests/utils/test_names.py b/tests/utils/test_names.py index 363581e..fdfce34 100644 --- a/tests/utils/test_names.py +++ b/tests/utils/test_names.py @@ -17,6 +17,8 @@ import pytest import sys +import fuzzy + from beard.ext.metaphone import dm from beard.utils.names import phonetic_tokenize_name @@ -87,14 +89,15 @@ def test_phonetic_tokenize_name_simple(): phonetic_tokenize_name("Dupont, Jean") -@pytest.mark.skipif(sys.version[0] == '3', - reason="fuzzy package doesn't work with Python 3") -def test_phonetic_tokenize_name_python2(): - """Test checking if custom phonetic algorithms from fuzzy packages work.""" - import fuzzy - soundex = fuzzy.Soundex(5) +def test_phonetic_tokenize_name_nysiis(): assert phonetic_tokenize_name("Dupont, René", "nysiis") == ( ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),))) + + +@pytest.mark.xfail(reason="soundex is broken in fuzzy 1.2.*") +def test_phonetic_tokenize_name_soundex(): + """Test checking if custom phonetic algorithms from fuzzy packages work.""" + soundex = fuzzy.Soundex(5) assert phonetic_tokenize_name("Dupont, René", "soundex") == ( # no direct support for unicode in soundex, thus "Rene" ((soundex(u"Dupont"),), (soundex(u"Rene"),))) From 50efdc09d3e69c279c78c6f0038066fa1545bd95 Mon Sep 17 00:00:00 2001 From: Micha Moskovic Date: Wed, 10 Apr 2019 10:38:38 +0200 Subject: [PATCH 2/4] Pin jellyfish for Python 2 compatibility Signed-off-by: Micha Moskovic --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f060f02..b194a8a 100644 --- a/setup.py +++ b/setup.py @@ -83,7 +83,8 @@ def run_tests(self): ] _install_requires = [ - "jellyfish", + # jellyfish 0.7 is Python 3 only + "jellyfish<=0.7", "numpy>=1.9", "scipy>=0.14", "scikit-learn>=0.15.2", From 611496eb675a016dda9d6b57717e12f28010927c Mon Sep 17 00:00:00 2001 From: Micha Moskovic Date: Wed, 10 Apr 2019 08:17:33 +0200 Subject: [PATCH 3/4] Make package compatible with recent sklearn Signed-off-by: Micha Moskovic --- examples/applications/author-disambiguation/clustering.py | 5 ++++- tests/similarity/test_pairs.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/applications/author-disambiguation/clustering.py b/examples/applications/author-disambiguation/clustering.py index 49d8b05..c71f336 100644 --- a/examples/applications/author-disambiguation/clustering.py +++ b/examples/applications/author-disambiguation/clustering.py @@ -23,7 +23,10 @@ from functools import partial -from sklearn.cross_validation import train_test_split +try: + from sklearn.cross_validation import train_test_split +except ImportError: + from sklearn.model_selection import train_test_split # These imports are used during unpickling. from utils import get_author_full_name diff --git a/tests/similarity/test_pairs.py b/tests/similarity/test_pairs.py index 564d76a..c6f8d85 100644 --- a/tests/similarity/test_pairs.py +++ b/tests/similarity/test_pairs.py @@ -22,7 +22,10 @@ from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import StandardScaler -from sklearn.cross_validation import train_test_split +try: + from sklearn.cross_validation import train_test_split +except ImportError: + from sklearn.model_selection import train_test_split from sklearn.datasets import load_iris from sklearn.svm import LinearSVC From 80bcf6e77c9ec0c9868fcc15220fdd4d7f6d150b Mon Sep 17 00:00:00 2001 From: Micha Moskovic Date: Wed, 10 Apr 2019 08:45:27 +0200 Subject: [PATCH 4/4] release v0.2.1 Signed-off-by: Micha Moskovic --- beard/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beard/__init__.py b/beard/__init__.py index 2a34339..6df3319 100644 --- a/beard/__init__.py +++ b/beard/__init__.py @@ -9,4 +9,4 @@ """Bibliographic Entity Automatic Recognition and Disambiguation.""" -__version__ = "0.2" +__version__ = "0.2.1"