Skip to content

Commit

Permalink
Re-work phonetic indexer
Browse files Browse the repository at this point in the history
  • Loading branch information
pudo committed Oct 6, 2023
1 parent 12d0afa commit 4824131
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 16 deletions.
2 changes: 2 additions & 0 deletions tests/unit/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,5 @@ def test_phonetic_names():
assert len(shortened) == 2
phonemes = phonetic_names(["Vladimir Peter Putin"])
assert len(phonemes) == 3
phonemes = phonetic_names(["OAO Gazprom"])
assert len(phonemes) == 1
27 changes: 22 additions & 5 deletions yente/data/util.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,27 @@
from pathlib import Path
from normality import WS
from urllib.parse import urlparse
from jellyfish import metaphone
from prefixdate.precision import Precision
from contextlib import asynccontextmanager
from aiohttp import ClientSession, ClientTimeout, TCPConnector
from typing import AsyncGenerator, Dict, List, Union, Iterable, Optional
from followthemoney.types import registry
from fingerprints import remove_types, clean_name_light
from nomenklatura.util import fingerprint_name, levenshtein, phonetic_token
from nomenklatura.util import names_word_list
from normality.cleaning import decompose_nfkd, category_replace
from fingerprints import remove_types, clean_name_light, clean_entity_prefix
from nomenklatura.util import fingerprint_name, levenshtein, names_word_list


def _clean_phonetic(original: str) -> Optional[str]:
    """Normalize a name for phonetic indexing, or return ``None`` if the
    name cannot be cleaned.

    We're not using the nomenklatura normalizer for this because we want to
    be extra picky about which phonemes are put into the search index, so
    that we can reduce the number of false positives.
    """
    # Strip legal-entity prefixes (e.g. "OAO", "The") before cleaning.
    text = clean_entity_prefix(original)
    cleaned = clean_name_light(text)
    if cleaned is None:
        # clean_name_light returns Optional[str]; bail out early rather
        # than feeding None into remove_types, which is not shown to
        # guard against it.
        return None
    cleaned = decompose_nfkd(cleaned)
    cleaned = category_replace(cleaned)
    # Drop company-type designations (Ltd, GmbH, ...) from the name.
    cleaned = remove_types(cleaned)
    return cleaned


def expand_dates(dates: List[str]) -> List[str]:
Expand All @@ -24,8 +37,12 @@ def expand_dates(dates: List[str]) -> List[str]:
def phonetic_names(names: List[str]) -> List[str]:
    """Generate phonetic forms of the given names.

    Each name is tokenized and normalized via ``_clean_phonetic``; purely
    alphabetic tokens are run through the metaphone algorithm, and only
    phonemes longer than one character are kept (single-character phonemes
    are too noisy for indexing).
    """
    tokens = names_word_list(names, normalizer=_clean_phonetic, min_length=2)
    candidates = (metaphone(token) for token in tokens if token.isalpha())
    return [phoneme for phoneme in candidates if len(phoneme) > 1]


Expand Down
17 changes: 6 additions & 11 deletions yente/search/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,18 +72,13 @@ def names_query(entity: EntityProxy, fuzzy: bool = True) -> List[Clause]:
shoulds.append({"match": match})
cleaned = clean_name_light(name)
if cleaned is not None:
keyq = {"term": {NAME_KEY_FIELD: {"value": cleaned, "boost": 4.0}}}
shoulds.append(keyq)
name_parts: Dict[str, int] = {}
for part in index_name_parts(names):
name_parts.setdefault(part, 0)
name_parts[part] += 1
total = float(sum(name_parts.values()))
for token, count in name_parts.items():
boost = 1.1 + (count / total)
shoulds.append({"term": {NAME_PART_FIELD: {"value": token, "boost": boost}}})
term = {NAME_KEY_FIELD: {"value": cleaned, "boost": 4.0}}
shoulds.append({"term": term})
for token in set(index_name_parts(names)):
shoulds.append({"term": {NAME_PART_FIELD: {"value": token}}})
for phoneme in set(phonetic_names(names)):
shoulds.append({"term": {NAME_PHONETIC_FIELD: {"value": phoneme}}})
term = {NAME_PHONETIC_FIELD: {"value": phoneme, "boost": 0.8}}
shoulds.append({"term": term})
return shoulds


Expand Down

0 comments on commit 4824131

Please sign in to comment.