From 89813d8c6b74855492e05b08206dc17cf5cda5b0 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Fri, 6 Oct 2023 12:38:20 +0200 Subject: [PATCH] set the name keys --- yente/data/util.py | 8 ++++---- yente/search/indexer.py | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/yente/data/util.py b/yente/data/util.py index 3fcd385d..e8cd7b0e 100644 --- a/yente/data/util.py +++ b/yente/data/util.py @@ -5,7 +5,7 @@ from prefixdate.precision import Precision from contextlib import asynccontextmanager from aiohttp import ClientSession, ClientTimeout, TCPConnector -from typing import AsyncGenerator, Dict, List, Union, Iterable, Optional +from typing import AsyncGenerator, Dict, List, Union, Iterable, Optional, Set from followthemoney.types import registry from normality.cleaning import decompose_nfkd, category_replace from fingerprints import remove_types, clean_name_light, clean_entity_prefix @@ -67,12 +67,12 @@ def index_name_parts(names: List[str]) -> List[str]: def index_name_keys(names: List[str]) -> List[str]: """Generate a indexable name keys from the given names.""" - keys: List[str] = [] + keys: Set[str] = set() for name in names: for key in (fingerprint_name(name), clean_name_light(name)): if key is not None: - keys.append(key) - return keys + keys.add(key) + return list(keys) def pick_names(names: List[str], limit: int = 3) -> List[str]: diff --git a/yente/search/indexer.py b/yente/search/indexer.py index ef981c5c..936f4de6 100644 --- a/yente/search/indexer.py +++ b/yente/search/indexer.py @@ -47,13 +47,16 @@ async def iter_entity_docs( texts = entity.pop("indexText") doc = entity.to_full_dict(matchable=True) - doc["text"] = texts names: List[str] = doc.get(NAMES_FIELD, []) names.extend(entity.get("weakAlias", quiet=True)) - doc[NAME_PART_FIELD] = index_name_parts(names) + name_parts = index_name_parts(names) + texts.extend(name_parts) + doc[NAME_PART_FIELD] = name_parts doc[NAME_KEY_FIELD] = index_name_keys(names) doc[NAME_PHONETIC_FIELD] = phonetic_names(names) doc[DateType.group] = expand_dates(doc.pop(DateType.group, [])) + doc["text"] = texts + entity_id = doc.pop("id") yield {"_index": index, "_id": entity_id, "_source": doc} except FollowTheMoneyException as exc: