Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add nysiis python 3 support #103

Merged
merged 4 commits into from
Apr 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion beard/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@

"""Bibliographic Entity Automatic Recognition and Disambiguation."""

__version__ = "0.2"
__version__ = "0.2.1"
40 changes: 24 additions & 16 deletions beard/utils/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import re
import sys

import fuzzy

from .misc import memoize
from .strings import asciify

Expand Down Expand Up @@ -101,8 +103,8 @@ def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
:param phonetic_algorithm: string
Which phonetic algorithm will be used. Options:
- "double_metaphone"
- "nysiis" (only for Python 2)
- "soundex" (only for Python 2)
- "nysiis"
- "soundex"

Returns
-------
Expand All @@ -112,20 +114,26 @@ def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
exactly two elements. Only the first results of the double metaphone
algorithm are included in tuples.
"""
if sys.version[0] == '2':
import fuzzy
dm = fuzzy.DMetaphone()
soundex = fuzzy.Soundex(5)
phonetic_algorithms = {
"double_metaphone": lambda y: dm(y)[0] or '',
"nysiis": lambda y: fuzzy.nysiis(y),
"soundex": lambda y: soundex(y)
}
else:
from ..ext.metaphone import dm
phonetic_algorithms = {
"double_metaphone": lambda y: dm(y)[0]
}
if phonetic_algorithm == "soundex":
error = (
"The version of the 'fuzzy' package in use has a buggy soundex"
" implementation (see https://github.com/yougov/fuzzy/issues/14 ),"
" downgrade the package to 1.1 (compatible with Python 2 only) if"
" you want to use the soundex phonetic encoding."
)
try:
if fuzzy.Soundex(4)("fuzzy") != "F200":
raise ValueError(error)
except UnicodeDecodeError:
raise ValueError(error)

dm = fuzzy.DMetaphone()
soundex = fuzzy.Soundex(5)
phonetic_algorithms = {
"double_metaphone": lambda y: (dm(y)[0] or b'').decode(),
"nysiis": lambda y: fuzzy.nysiis(y),
"soundex": lambda y: soundex(y)
}

tokens = tokenize_name(name)
# Use double metaphone
Expand Down
5 changes: 4 additions & 1 deletion examples/applications/author-disambiguation/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@

from functools import partial

from sklearn.cross_validation import train_test_split
try:
from sklearn.cross_validation import train_test_split
except ImportError:
from sklearn.model_selection import train_test_split

# These imports are used during unpickling.
from utils import get_author_full_name
Expand Down
9 changes: 6 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def run_tests(self):
]

_install_requires = [
"jellyfish",
# jellyfish 0.7 is Python 3 only
"jellyfish<=0.7",
"numpy>=1.9",
"scipy>=0.14",
"scikit-learn>=0.15.2",
Expand All @@ -92,9 +93,11 @@ def run_tests(self):
]

if sys.version[0] == '2':
# fuzzy package is not available on Python 3
# version 1.1 due to Soundex bug in 1.2
# use version 1.1 due to Soundex bug in 1.2
_install_requires.append("fuzzy==1.1")
else:
# need to use version 1.2 with buggy Soundex for Python 3 compatibility
_install_requires.append("fuzzy~=1.0,>=1.2")

_tests_require = [
"coverage",
Expand Down
5 changes: 4 additions & 1 deletion tests/similarity/test_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
try:
from sklearn.cross_validation import train_test_split
except ImportError:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC

Expand Down
15 changes: 9 additions & 6 deletions tests/utils/test_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import pytest
import sys

import fuzzy

from beard.ext.metaphone import dm

from beard.utils.names import phonetic_tokenize_name
Expand Down Expand Up @@ -87,14 +89,15 @@ def test_phonetic_tokenize_name_simple():
phonetic_tokenize_name("Dupont, Jean")


@pytest.mark.skipif(sys.version[0] == '3',
reason="fuzzy package doesn't work with Python 3")
def test_phonetic_tokenize_name_python2():
"""Test checking if custom phonetic algorithms from fuzzy packages work."""
import fuzzy
soundex = fuzzy.Soundex(5)
def test_phonetic_tokenize_name_nysiis():
assert phonetic_tokenize_name("Dupont, René", "nysiis") == (
((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),)))


@pytest.mark.xfail(reason="soundex is broken in fuzzy 1.2.*")
def test_phonetic_tokenize_name_soundex():
"""Test checking if custom phonetic algorithms from fuzzy packages work."""
soundex = fuzzy.Soundex(5)
assert phonetic_tokenize_name("Dupont, René", "soundex") == (
# no direct support for unicode in soundex, thus "Rene"
((soundex(u"Dupont"),), (soundex(u"Rene"),)))
Expand Down