inspirehep · michamos · Apr 10, 2019 · Apr 10, 2019 · Apr 10, 2019 · Apr 10, 2019
diff --git a/beard/__init__.py b/beard/__init__.py
@@ -9,4 +9,4 @@
 
 """Bibliographic Entity Automatic Recognition and Disambiguation."""
 
-__version__ = "0.2"
+__version__ = "0.2.1"
diff --git a/beard/utils/names.py b/beard/utils/names.py
@@ -18,6 +18,8 @@
 import re
 import sys
 
+import fuzzy
+
 from .misc import memoize
 from .strings import asciify
 
@@ -101,8 +103,8 @@ def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
     :param phonetic_algorithm: string
         Which phonetic algorithm will be used. Options:
         -  "double_metaphone"
-        -  "nysiis" (only for Python 2)
-        -  "soundex" (only for Python 2)
+        -  "nysiis"
+        -  "soundex"
 
     Returns
     -------
@@ -112,20 +114,26 @@ def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
         exactly two elements. Only the first results of the double metaphone
         algorithm are included in tuples.
     """
-    if sys.version[0] == '2':
-        import fuzzy
-        dm = fuzzy.DMetaphone()
-        soundex = fuzzy.Soundex(5)
-        phonetic_algorithms = {
-            "double_metaphone": lambda y: dm(y)[0] or '',
-            "nysiis": lambda y: fuzzy.nysiis(y),
-            "soundex": lambda y: soundex(y)
-        }
-    else:
-        from ..ext.metaphone import dm
-        phonetic_algorithms = {
-            "double_metaphone": lambda y: dm(y)[0]
-        }
+    if phonetic_algorithm == "soundex":
+        error = (
+            "The version of the 'fuzzy' package in use has a buggy soundex"
+            " implementation (see https://github.com/yougov/fuzzy/issues/14 ),"
+            " downgrade the package to 1.1 (compatible with Python 2 only) if"
+            " you want to use the soundex phonetic encoding."
+        )
+        try:
+            if fuzzy.Soundex(4)("fuzzy") != "F200":
+                raise ValueError(error)
+        except UnicodeDecodeError:
+            raise ValueError(error)
+
+    dm = fuzzy.DMetaphone()
+    soundex = fuzzy.Soundex(5)
+    phonetic_algorithms = {
+        "double_metaphone": lambda y: (dm(y)[0] or b'').decode(),
+        "nysiis": lambda y: fuzzy.nysiis(y),
+        "soundex": lambda y: soundex(y)
+    }
 
     tokens = tokenize_name(name)
     # Use double metaphone

diff --git a/examples/applications/author-disambiguation/clustering.py b/examples/applications/author-disambiguation/clustering.py
@@ -23,7 +23,10 @@
 
 from functools import partial
 
-from sklearn.cross_validation import train_test_split
+try:
+    from sklearn.cross_validation import train_test_split
+except ImportError:
+    from sklearn.model_selection import train_test_split
 
 # These imports are used during unpickling.
 from utils import get_author_full_name

diff --git a/setup.py b/setup.py
@@ -83,7 +83,8 @@ def run_tests(self):
 ]
 
 _install_requires = [
-    "jellyfish",
+    # jellyfish 0.7 is Python 3 only
+    "jellyfish<=0.7",
     "numpy>=1.9",
     "scipy>=0.14",
     "scikit-learn>=0.15.2",
@@ -92,9 +93,11 @@ def run_tests(self):
 ]
 
 if sys.version[0] == '2':
-    # fuzzy package is not available on Python 3
-    # version 1.1 due to Soundex bug in 1.2
+    # use version 1.1 due to Soundex bug in 1.2
     _install_requires.append("fuzzy==1.1")
+else:
+    # need to use version 1.2 with buggy Soundex for Python 3 compatibility
+    _install_requires.append("fuzzy~=1.0,>=1.2")
 
 _tests_require = [
     "coverage",

diff --git a/tests/similarity/test_pairs.py b/tests/similarity/test_pairs.py
@@ -22,7 +22,10 @@
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import StandardScaler
-from sklearn.cross_validation import train_test_split
+try:
+    from sklearn.cross_validation import train_test_split
+except ImportError:
+    from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_iris
 from sklearn.svm import LinearSVC
 

diff --git a/tests/utils/test_names.py b/tests/utils/test_names.py
@@ -17,6 +17,8 @@
 import pytest
 import sys
 
+import fuzzy
+
 from beard.ext.metaphone import dm
 
 from beard.utils.names import phonetic_tokenize_name
@@ -87,14 +89,15 @@ def test_phonetic_tokenize_name_simple():
         phonetic_tokenize_name("Dupont, Jean")
 
 
-@pytest.mark.skipif(sys.version[0] == '3',
-                    reason="fuzzy package doesn't work with Python 3")
-def test_phonetic_tokenize_name_python2():
-    """Test checking if custom phonetic algorithms from fuzzy packages work."""
-    import fuzzy
-    soundex = fuzzy.Soundex(5)
+def test_phonetic_tokenize_name_nysiis():
     assert phonetic_tokenize_name("Dupont, René", "nysiis") == (
         ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),)))
+
+
+@pytest.mark.xfail(reason="soundex is broken in fuzzy 1.2.*")
+def test_phonetic_tokenize_name_soundex():
+    """Test checking if custom phonetic algorithms from fuzzy packages work."""
+    soundex = fuzzy.Soundex(5)
     assert phonetic_tokenize_name("Dupont, René", "soundex") == (
         # no direct support for unicode in soundex, thus "Rene"
         ((soundex(u"Dupont"),), (soundex(u"Rene"),)))
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,4 +9,4 @@

		"""Bibliographic Entity Automatic Recognition and Disambiguation."""

		-__version__ = "0.2"
		+__version__ = "0.2.1"