diff --git a/tests/unit/test_data.py b/tests/unit/test_data.py
index 52c71c1c..efdc19f4 100644
--- a/tests/unit/test_data.py
+++ b/tests/unit/test_data.py
@@ -49,3 +49,5 @@ def test_phonetic_names():
     assert len(shortened) == 2
     phonemes = phonetic_names(["Vladimir Peter Putin"])
     assert len(phonemes) == 3
+    phonemes = phonetic_names(["OAO Gazprom"])
+    assert len(phonemes) == 1
diff --git a/yente/data/util.py b/yente/data/util.py
index a2d6c1b7..3fcd385d 100644
--- a/yente/data/util.py
+++ b/yente/data/util.py
@@ -1,14 +1,27 @@
 from pathlib import Path
 from normality import WS
 from urllib.parse import urlparse
+from jellyfish import metaphone
 from prefixdate.precision import Precision
 from contextlib import asynccontextmanager
 from aiohttp import ClientSession, ClientTimeout, TCPConnector
 from typing import AsyncGenerator, Dict, List, Union, Iterable, Optional
 from followthemoney.types import registry
-from fingerprints import remove_types, clean_name_light
-from nomenklatura.util import fingerprint_name, levenshtein, phonetic_token
-from nomenklatura.util import names_word_list
+from normality.cleaning import decompose_nfkd, category_replace
+from fingerprints import remove_types, clean_name_light, clean_entity_prefix
+from nomenklatura.util import fingerprint_name, levenshtein, names_word_list
+
+
+def _clean_phonetic(original: str) -> Optional[str]:
+    # We're not using the nomenklatura function for this because we want to
+    # be extra picky what phonemes are put into the search index, so that
+    # we can reduce the number of false positives.
+    text = clean_entity_prefix(original)
+    cleaned = clean_name_light(text)
+    cleaned = decompose_nfkd(cleaned)
+    cleaned = category_replace(cleaned)
+    cleaned = remove_types(cleaned)
+    return cleaned
 
 
 def expand_dates(dates: List[str]) -> List[str]:
@@ -24,8 +37,12 @@ def expand_dates(dates: List[str]) -> List[str]:
 def phonetic_names(names: List[str]) -> List[str]:
     """Generate phonetic forms of the given names."""
     phonemes: List[str] = []
-    for word in names_word_list(names, min_length=2):
-        phonemes.append(phonetic_token(word))
+    for word in names_word_list(names, normalizer=_clean_phonetic, min_length=2):
+        if not word.isalpha():
+            continue
+        token = metaphone(word)
+        if len(token) > 1:
+            phonemes.append(token)
     return phonemes
 
 
diff --git a/yente/search/queries.py b/yente/search/queries.py
index 6edcec51..d4653262 100644
--- a/yente/search/queries.py
+++ b/yente/search/queries.py
@@ -72,18 +72,13 @@ def names_query(entity: EntityProxy, fuzzy: bool = True) -> List[Clause]:
         shoulds.append({"match": match})
         cleaned = clean_name_light(name)
         if cleaned is not None:
-            keyq = {"term": {NAME_KEY_FIELD: {"value": cleaned, "boost": 4.0}}}
-            shoulds.append(keyq)
-    name_parts: Dict[str, int] = {}
-    for part in index_name_parts(names):
-        name_parts.setdefault(part, 0)
-        name_parts[part] += 1
-    total = float(sum(name_parts.values()))
-    for token, count in name_parts.items():
-        boost = 1.1 + (count / total)
-        shoulds.append({"term": {NAME_PART_FIELD: {"value": token, "boost": boost}}})
+            term = {NAME_KEY_FIELD: {"value": cleaned, "boost": 4.0}}
+            shoulds.append({"term": term})
+    for token in set(index_name_parts(names)):
+        shoulds.append({"term": {NAME_PART_FIELD: {"value": token}}})
     for phoneme in set(phonetic_names(names)):
-        shoulds.append({"term": {NAME_PHONETIC_FIELD: {"value": phoneme}}})
+        term = {NAME_PHONETIC_FIELD: {"value": phoneme, "boost": 0.8}}
+        shoulds.append({"term": term})
     return shoulds
 
 
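
Sketch (not part of the patch): a quick illustration of what the reworked phonetic_names() produces, assuming jellyfish's metaphone() and the helpers added above; the exact metaphone strings are indicative only, the token counts are the ones asserted in the test.

    from yente.data.util import phonetic_names

    # "OAO" is a company-type token, so _clean_phonetic() strips it via
    # clean_entity_prefix()/remove_types(); only "Gazprom" yields a phoneme.
    assert len(phonetic_names(["OAO Gazprom"])) == 1

    # All three name parts are alphabetic and produce metaphone tokens
    # longer than one character, so all three are indexed.
    assert len(phonetic_names(["Vladimir Peter Putin"])) == 3

Net effect on the query side: exact name keys keep their 4.0 boost, name parts drop the frequency-derived boost in favour of plain deduplicated terms, and phonetic matches are deliberately down-weighted to 0.8 so they can rank but not dominate.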