Skip to content

Commit

Permalink
Re-work phonetic indexer
Browse files Browse the repository at this point in the history
  • Loading branch information
pudo committed Oct 6, 2023
1 parent 12d0afa commit 4824131
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 16 deletions.
2 changes: 2 additions & 0 deletions tests/unit/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,5 @@ def test_phonetic_names():
assert len(shortened) == 2
phonemes = phonetic_names(["Vladimir Peter Putin"])
assert len(phonemes) == 3
phonemes = phonetic_names(["OAO Gazprom"])
assert len(phonemes) == 1
27 changes: 22 additions & 5 deletions yente/data/util.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,27 @@
from pathlib import Path
from normality import WS
from urllib.parse import urlparse
from jellyfish import metaphone
from prefixdate.precision import Precision
from contextlib import asynccontextmanager
from aiohttp import ClientSession, ClientTimeout, TCPConnector
from typing import AsyncGenerator, Dict, List, Union, Iterable, Optional
from followthemoney.types import registry
from fingerprints import remove_types, clean_name_light
from nomenklatura.util import fingerprint_name, levenshtein, phonetic_token
from nomenklatura.util import names_word_list
from normality.cleaning import decompose_nfkd, category_replace
from fingerprints import remove_types, clean_name_light, clean_entity_prefix
from nomenklatura.util import fingerprint_name, levenshtein, names_word_list


def _clean_phonetic(original: str) -> Optional[str]:
    """Normalize a name for phonetic indexing, or return ``None`` if the
    name cannot be cleaned.

    We're not using the nomenklatura normalizer for this because we want to
    be extra picky about which phonemes are put into the search index, so
    that we can reduce the number of false positives.
    """
    # Strip legal-entity prefixes (e.g. "OAO", "The") before cleaning.
    text = clean_entity_prefix(original)
    cleaned = clean_name_light(text)
    if cleaned is None:
        # clean_name_light returns Optional[str]; bail out early rather
        # than feeding None into remove_types, which is not shown to
        # guard against it.
        return None
    cleaned = decompose_nfkd(cleaned)
    cleaned = category_replace(cleaned)
    # Drop company-type designations (Ltd, GmbH, ...) from the name.
    cleaned = remove_types(cleaned)
    return cleaned


def expand_dates(dates: List[str]) -> List[str]:
Expand All @@ -24,8 +37,12 @@ def expand_dates(dates: List[str]) -> List[str]:
def phonetic_names(names: List[str]) -> List[str]:
    """Generate phonetic forms of the given names.

    Each name is tokenized and normalized via ``_clean_phonetic``; purely
    alphabetic tokens are run through the metaphone algorithm, and only
    phonemes longer than one character are kept (single-character phonemes
    are too noisy for indexing).
    """
    tokens = names_word_list(names, normalizer=_clean_phonetic, min_length=2)
    candidates = (metaphone(token) for token in tokens if token.isalpha())
    return [phoneme for phoneme in candidates if len(phoneme) > 1]


Expand Down
17 changes: 6 additions & 11 deletions yente/search/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,18 +72,13 @@ def names_query(entity: EntityProxy, fuzzy: bool = True) -> List[Clause]:
shoulds.append({"match": match})
cleaned = clean_name_light(name)
if cleaned is not None:
keyq = {"term": {NAME_KEY_FIELD: {"value": cleaned, "boost": 4.0}}}
shoulds.append(keyq)
name_parts: Dict[str, int] = {}
for part in index_name_parts(names):
name_parts.setdefault(part, 0)
name_parts[part] += 1
total = float(sum(name_parts.values()))
for token, count in name_parts.items():
boost = 1.1 + (count / total)
shoulds.append({"term": {NAME_PART_FIELD: {"value": token, "boost": boost}}})
term = {NAME_KEY_FIELD: {"value": cleaned, "boost": 4.0}}
shoulds.append({"term": term})
for token in set(index_name_parts(names)):
shoulds.append({"term": {NAME_PART_FIELD: {"value": token}}})
for phoneme in set(phonetic_names(names)):
shoulds.append({"term": {NAME_PHONETIC_FIELD: {"value": phoneme}}})
term = {NAME_PHONETIC_FIELD: {"value": phoneme, "boost": 0.8}}
shoulds.append({"term": term})
return shoulds


Expand Down

0 comments on commit 4824131

Please sign in to comment.