From dbd54b2527e6d0032ab503dc5bf5829d17dbd6f8 Mon Sep 17 00:00:00 2001
From: Alex Garel
Date: Fri, 25 Oct 2024 10:27:19 +0200
Subject: [PATCH] feat: enhanced completion

* add the option of a preprocessor for taxonomies
* add a preprocessor for Open Food Facts to handle:
  * the brands taxonomy
  * adding the main language
  * adding xx entries to all languages
* fix: also index entry names as synonyms
* add a score to completion entries so that the shortest ones match first
* enable querying multiple languages at once
* add tests
---
 Makefile                      |  4 ++
 app/_import.py                | 38 ++++++++++++-----
 app/api.py                    |  9 +++--
 app/config.py                 | 20 ++++++++++
 app/indexing.py               | 40 +++++++++++++++--
 app/openfoodfacts.py          | 34 +++++++++++++-
 app/postprocessing.py         | 29 +++++++-----
 app/query.py                  | 34 ++++++++-------
 app/taxonomy.py               | 46 +++++++++++++-------
 app/taxonomy_es.py            |  6 +--
 data/config/openfoodfacts.yml |  1 +
 tests/int/data/test_off.yml   |  1 +
 tests/int/helpers.py          |  9 +++++
 tests/int/test_completion.py  | 73 +++++++++++++++++++++++++++++++++++
 14 files changed, 284 insertions(+), 60 deletions(-)
 create mode 100644 tests/int/test_completion.py

diff --git a/Makefile b/Makefile
index 670e422f..f0f0c7be 100644
--- a/Makefile
+++ b/Makefile
@@ -176,6 +176,10 @@ build-translations:
 	@echo "🔎 Building translations …"
 	${DOCKER_COMPOSE} run --rm search_nodejs npm run translations:build
 
+cleanup-indexes:
+	@echo "🔎 Cleaning indexes …"
+	${DOCKER_COMPOSE} run --rm api python3 -m app cleanup-indexes ${args}
+
 generate-openapi: _ensure_network
 	@echo "🔎 Generating OpenAPI spec …"
 	${DOCKER_COMPOSE} run --rm api python3 -m app export-openapi /opt/search/data/searchalicious-openapi.yml
diff --git a/app/_import.py b/app/_import.py
index 7789af50..51fc49fd 100644
--- a/app/_import.py
+++ b/app/_import.py
@@ -12,8 +12,9 @@
 from redis import Redis
 
 from app._types import FetcherResult, FetcherStatus, JSONType
-from app.config import Config, IndexConfig, TaxonomyConfig, settings
+from app.config import Config, IndexConfig, settings
 from app.indexing import (
+    BaseTaxonomyPreprocessor,
     DocumentProcessor,
     generate_index_object,
     generate_taxonomy_index_object,
@@ -252,7 +253,7 @@ def gen_documents(
 
 
 def gen_taxonomy_documents(
-    taxonomy_config: TaxonomyConfig, next_index: str, supported_langs: set[str]
+    config: IndexConfig, next_index: str, supported_langs: set[str]
 ):
     """Generator for taxonomy documents in Elasticsearch.
 
@@ -261,26 +262,45 @@ def gen_taxonomy_documents(
     :param supported_langs: a set of supported languages
     :yield: a dict with the document to index, compatible with ES bulk API
     """
-    for taxonomy_name, taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
+    taxonomy_config = config.taxonomy
+    preprocessor: BaseTaxonomyPreprocessor | None = None
+    if taxonomy_config.preprocessor:
+        preprocessor_cls = load_class_object_from_string(taxonomy_config.preprocessor)
+        preprocessor = preprocessor_cls(config)
+    for taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
         for node in taxonomy.iter_nodes():
+            if preprocessor:
+                result = preprocessor.preprocess(taxonomy, node)
+                if result.status != FetcherStatus.FOUND or result.node is None:
+                    continue  # skip this entry
+                node = result.node
             names = {
                 lang: lang_names
                 for lang, lang_names in node.names.items()
                 if lang in supported_langs
             }
-            synonyms = {
-                lang: lang_names
-                for lang, lang_names in node.synonyms.items()
+            synonyms: dict[str, set[str]] = {
+                lang: set(node.synonyms.get(lang) or [])
+                for lang in node.synonyms
                 if lang in supported_langs
             }
+            for lang, lang_names in names.items():
+                if lang_names:
+                    synonyms.setdefault(lang, set()).add(lang_names)
             yield {
                 "_index": next_index,
                 "_source": {
                     "id": node.id,
-                    "taxonomy_name": taxonomy_name,
+                    "taxonomy_name": taxonomy.name,
                     "name": names,
-                    "synonyms": synonyms,
+                    "synonyms": {
+                        lang: {
+                            "input": list(lang_synonyms),
+                            "weight": max(100 - len(node.id), 0),
+                        }
+                        for lang, lang_synonyms in synonyms.items()
+                    },
                 },
             }
 
@@ -370,7 +394,7 @@ def import_taxonomies(config: IndexConfig, next_index: str):
     success, errors = bulk(
         es,
         gen_taxonomy_documents(
-            config.taxonomy, next_index, supported_langs=set(config.supported_langs)
+            config, next_index, supported_langs=set(config.supported_langs)
         ),
         raise_on_error=False,
     )
diff --git a/app/api.py b/app/api.py
index 6f6fcc47..ed09c7ef 100644
--- a/app/api.py
+++ b/app/api.py
@@ -149,8 +149,11 @@ def taxonomy_autocomplete(
             description="Name(s) of the taxonomy to search in, as a comma-separated value."
         ),
     ],
-    lang: Annotated[
-        str, Query(description="Language to search in, defaults to 'en'.")
+    langs: Annotated[
+        str,
+        Query(
+            description="Languages to search in (as a comma-separated list), defaults to 'en'."
+        ),
     ] = "en",
     size: Annotated[int, Query(description="Number of results to return.")] = 10,
     fuzziness: Annotated[
@@ -167,7 +170,7 @@ def taxonomy_autocomplete(
     query = build_completion_query(
         q=q,
         taxonomy_names=taxonomy_names_list,
-        lang=lang,
+        langs=langs.split(","),
        size=size,
         config=index_config,
         fuzziness=fuzziness,
diff --git a/app/config.py b/app/config.py
index f820099b..c381d87b 100644
--- a/app/config.py
+++ b/app/config.py
@@ -510,6 +510,26 @@ class TaxonomyConfig(BaseModel):
         TaxonomyIndexConfig,
         Field(description=TaxonomyIndexConfig.__doc__),
     ]
+    preprocessor: (
+        Annotated[
+            str,
+            Field(
+                description=cd_(
+                    """The fully qualified reference to the preprocessor
+                    to use before taxonomy entry import.
+
+                    This class must inherit `app.indexing.BaseTaxonomyPreprocessor`
+                    and specialize the `preprocess` method.
+
+                    This is used, for example, to adapt the taxonomy schema
+                    or to add specific fields.
+                    """
+                ),
+                examples=["app.openfoodfacts.TaxonomyPreprocessor"],
+            ),
+        ]
+        | None
+    ) = None
 
 
 class ScriptConfig(BaseModel):
diff --git a/app/indexing.py b/app/indexing.py
index 9d09f776..6a3050fd 100644
--- a/app/indexing.py
+++ b/app/indexing.py
@@ -9,12 +9,12 @@
 from app._types import FetcherResult, FetcherStatus, JSONType
 from app.config import (
     ANALYZER_LANG_MAPPING,
-    Config,
     FieldConfig,
     FieldType,
     IndexConfig,
     TaxonomyConfig,
 )
+from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
 from app.utils import load_class_object_from_string
 from app.utils.analyzers import (
     get_autocomplete_analyzer,
@@ -104,8 +104,41 @@ def preprocess_field_value(
     return input_value
 
 
+class BaseTaxonomyPreprocessor(abc.ABC):
+    """Base class for taxonomy entry preprocessors.
+
+    Classes referenced in the index configuration `preprocessor` field
+    must be derived from it.
+    """
+
+    def __init__(self, config: IndexConfig) -> None:
+        self.config = config
+
+    @abc.abstractmethod
+    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
+        """Preprocess the taxonomy entry before ingestion in Elasticsearch,
+        and before synonyms generation.
+
+        This can be used to make the document schema compatible with the project
+        schema or to add custom fields.
+
+        :return: a TaxonomyNodeResult object:
+
+        * the status can be used to decide whether
+          to index the entry or not (or even delete it)
+        * the node is the transformed entry
+        """
+        pass
+
+
 class BaseDocumentPreprocessor(abc.ABC):
-    def __init__(self, config: Config) -> None:
+    """Base class for document preprocessors.
+
+    Classes referenced in the index configuration `preprocess` field
+    must be derived from it.
+    """
+
+    def __init__(self, config: IndexConfig) -> None:
         self.config = config
 
     @abc.abstractmethod
@@ -119,7 +152,7 @@ def preprocess(self, document: JSONType) -> FetcherResult:
 
         * the status can be used to pilot wether
           to index or not the document (even delete it)
-        * the document is the document transformed document
+        * the document is the transformed document
         """
         pass
 
@@ -379,6 +412,7 @@ def generate_taxonomy_mapping_object(config: IndexConfig) -> Mapping:
                             "type": "category",
                         }
                     ],
+                    preserve_separators=False,  # helps match plurals
                 )
                 for lang in supported_langs
             },
diff --git a/app/openfoodfacts.py b/app/openfoodfacts.py
index 1a4adc6e..8b3e1fd1 100644
--- a/app/openfoodfacts.py
+++ b/app/openfoodfacts.py
@@ -7,8 +7,9 @@
 
 from app._import import BaseDocumentFetcher
 from app._types import FetcherResult, FetcherStatus, JSONType
-from app.indexing import BaseDocumentPreprocessor
+from app.indexing import BaseDocumentPreprocessor, BaseTaxonomyPreprocessor
 from app.postprocessing import BaseResultProcessor
+from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
 from app.utils.download import http_session
 from app.utils.log import get_logger
 
@@ -87,6 +88,37 @@ def generate_image_url(code: str, image_id: str) -> str:
 OFF_API_URL = os.environ.get("OFF_API_URL", "https://world.openfoodfacts.org")
 
 
+class TaxonomyPreprocessor(BaseTaxonomyPreprocessor):
+    """Preprocessor for Open Food Facts taxonomies."""
+
+    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
+        """Preprocess a taxonomy node.
+
+        We add the main language, with specific handling for some taxonomies.
+        """
+        if taxonomy.name == "brands":
+            # brands are English-only, put them in the "main" language
+            node.names.update(main=node.names["en"])
+            if node.synonyms and (synonyms_en := list(node.synonyms.get("en", []))):
+                node.synonyms.update(main=synonyms_en)
+        else:
+            # the main language is the entry id prefix, plus any xx entries
+            id_lang = node.id.split(":")[0]
+            if node_names := node.names.get(id_lang):
+                node.names.update(main=node_names)
+            node.synonyms.update(main=list(node.synonyms.get(id_lang, [])))
+            # add any xx entries as synonyms to all languages
+            xx_name = node.names.get("xx")
+            xx_names = [xx_name] if xx_name else []
+            xx_names += node.synonyms.get("xx", [])
+            if xx_names:
+                for lang in self.config.supported_langs:
+                    node.names.setdefault(lang, xx_names[0])
+                    lang_synonyms = node.synonyms.setdefault(lang, [])
+                    lang_synonyms += xx_names
+        return TaxonomyNodeResult(status=FetcherStatus.FOUND, node=node)
+
+
 class DocumentFetcher(BaseDocumentFetcher):
     def fetch_document(self, stream_name: str, item: JSONType) -> FetcherResult:
         if item.get("action") == "deleted":
diff --git a/app/postprocessing.py b/app/postprocessing.py
index 40d0f87a..35c00ff1 100644
--- a/app/postprocessing.py
+++ b/app/postprocessing.py
@@ -65,13 +65,24 @@ def load_result_processor(config: IndexConfig) -> BaseResultProcessor | None:
 def process_taxonomy_completion_response(response: Response) -> JSONType:
     output = {"took": response.took, "timed_out": response.timed_out}
     options = []
-    suggestion = response.suggest["taxonomy_suggest"][0]
-    for option in suggestion.options:
-        result = {
-            "id": option._source["id"],
-            "text": option.text,
-            "taxonomy_name": option._source["taxonomy_name"],
-        }
-        options.append(result)
-    output["options"] = options
+    ids = set()
+    for suggestion_id in dir(response.suggest):
+        if not suggestion_id.startswith("taxonomy_suggest_"):
+            continue
+        for suggestion in getattr(response.suggest, suggestion_id):
+            for option in suggestion.options:
+                if option._source["id"] in ids:
+                    continue
+                ids.add(option._source["id"])
+                result = {
+                    "id": option._source["id"],
+                    "text": option.text,
+                    "score": option._score,
+                    "taxonomy_name": option._source["taxonomy_name"],
+                }
+                options.append(result)
+    # highest score first
+    output["options"] = sorted(
+        options, key=lambda option: option["score"], reverse=True
+    )
     return output
diff --git a/app/query.py b/app/query.py
index b5299902..d4555d2f 100644
--- a/app/query.py
+++ b/app/query.py
@@ -322,7 +322,7 @@ def build_es_query(
 def build_completion_query(
     q: str,
     taxonomy_names: list[str],
-    lang: str,
+    langs: list[str],
     size: int,
     config: IndexConfig,
     fuzziness: int | None = 2,
@@ -331,28 +331,28 @@
 
     :param q: the user autocomplete query
     :param taxonomy_names: a list of taxonomies we want to search in
-    :param lang: the language we want search in
+    :param langs: the languages we want to search in
     :param size: number of results to return
     :param config: the index configuration to use
     :param fuzziness: fuzziness parameter for completion query
     :return: the built Query
     """
-
-    completion_clause = {
-        "field": f"synonyms.{lang}",
-        "size": size,
-        "contexts": {"taxonomy_name": taxonomy_names},
-    }
-
-    if fuzziness is not None:
-        completion_clause["fuzzy"] = {"fuzziness": fuzziness}
-
     query = Search(index=config.taxonomy.index.name)
-    query = query.suggest(
-        "taxonomy_suggest",
-        q,
-        completion=completion_clause,
-    )
+    for lang in langs:
+        completion_clause = {
+            "field": f"synonyms.{lang}",
+            "size": size,
+            "contexts": {"taxonomy_name": taxonomy_names},
+            "skip_duplicates": True,
+        }
+        if fuzziness is not None:
+            completion_clause["fuzzy"] = {"fuzziness": fuzziness}
+
+        query = query.suggest(
f"taxonomy_suggest_{lang}", + q, + completion=completion_clause, + ) + # limit returned fields + # query.source(fields=["id", "taxonomy_name"]) return query diff --git a/app/taxonomy.py b/app/taxonomy.py index e1face38..46f7222b 100644 --- a/app/taxonomy.py +++ b/app/taxonomy.py @@ -9,8 +9,9 @@ import cachetools import requests +from pydantic import BaseModel, ConfigDict -from app._types import JSONType +from app._types import FetcherStatus, JSONType from app.config import TaxonomyConfig, settings from app.utils import get_logger from app.utils.download import download_file, http_session, should_download_file @@ -157,8 +158,9 @@ class Taxonomy: node identifier to a `TaxonomyNode`. """ - def __init__(self) -> None: + def __init__(self, name: str) -> None: self.nodes: Dict[str, TaxonomyNode] = {} + self.name = name def add(self, key: str, node: TaxonomyNode) -> None: """Add a node to the taxonomy under the id `key`. @@ -263,13 +265,13 @@ def to_dict(self) -> JSONType: return export @classmethod - def from_dict(cls, data: JSONType) -> "Taxonomy": + def from_dict(cls, name: str, data: JSONType) -> "Taxonomy": """Create a Taxonomy from `data`. :param data: the taxonomy as a dict :return: a Taxonomy """ - taxonomy = Taxonomy() + taxonomy = Taxonomy(name) for key, key_data in data.items(): if key not in taxonomy: @@ -293,17 +295,21 @@ def from_dict(cls, data: JSONType) -> "Taxonomy": return taxonomy @classmethod - def from_path(cls, file_path: Union[str, Path]) -> "Taxonomy": + def from_path(cls, name: str, file_path: Union[str, Path]) -> "Taxonomy": """Create a Taxonomy from a JSON file. :param file_path: a JSON file, gzipped (.json.gz) files are supported :return: a Taxonomy """ - return cls.from_dict(load_json(file_path)) # type: ignore + return cls.from_dict(name, load_json(file_path)) # type: ignore @classmethod def from_url( - cls, url: str, session: Optional[requests.Session] = None, timeout: int = 120 + cls, + name: str, + url: str, + session: Optional[requests.Session] = None, + timeout: int = 120, ) -> "Taxonomy": """Create a Taxonomy from a taxonomy file hosted at `url`. 
@@ -315,7 +321,7 @@ def from_url(
         session = http_session if session is None else session
         r = session.get(url, timeout=timeout)
         data = r.json()
-        return cls.from_dict(data)
+        return cls.from_dict(name, data)
 
 
 @cachetools.cached(cachetools.TTLCache(maxsize=100, ttl=3600))
@@ -345,7 +351,7 @@ def get_taxonomy(
         fpath = taxonomy_url[len("file://") :]
         if not fpath.startswith("/"):
             raise RuntimeError("Relative path (not yet) supported for taxonomy url")
-        return Taxonomy.from_path(fpath.rstrip("/"))
+        return Taxonomy.from_path(taxonomy_name, fpath.rstrip("/"))
 
     filename = f"{taxonomy_name}.json"
     cache_dir = DEFAULT_CACHE_DIR if cache_dir is None else cache_dir
@@ -354,16 +360,26 @@ def get_taxonomy(
     if not should_download_file(
         taxonomy_url, taxonomy_path, force_download, download_newer
     ):
-        return Taxonomy.from_path(taxonomy_path)
+        return Taxonomy.from_path(taxonomy_name, taxonomy_path)
 
     cache_dir.mkdir(parents=True, exist_ok=True)
     logger.info("Downloading taxonomy, saving it in %s", taxonomy_path)
     download_file(taxonomy_url, taxonomy_path)
-    return Taxonomy.from_path(taxonomy_path)
+    return Taxonomy.from_path(taxonomy_name, taxonomy_path)
 
 
-def iter_taxonomies(taxonomy_config: TaxonomyConfig) -> Iterator[tuple[str, Taxonomy]]:
+def iter_taxonomies(taxonomy_config: TaxonomyConfig) -> Iterator[Taxonomy]:
     for taxonomy_source_config in taxonomy_config.sources:
-        yield taxonomy_source_config.name, get_taxonomy(
-            taxonomy_source_config.name, str(taxonomy_source_config.url)
-        )
+        yield get_taxonomy(taxonomy_source_config.name, str(taxonomy_source_config.url))
+
+
+class TaxonomyNodeResult(BaseModel):
+    """Result of a taxonomy node transformation.
+
+    This makes it possible to skip an entry after preprocessing.
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    status: FetcherStatus
+    node: TaxonomyNode | None
diff --git a/app/taxonomy_es.py b/app/taxonomy_es.py
index a8f713e4..89e293a5 100644
--- a/app/taxonomy_es.py
+++ b/app/taxonomy_es.py
@@ -94,10 +94,10 @@ def create_synonyms_files(taxonomy: Taxonomy, langs: list[str], target_dir: Path
 
 
 def create_synonyms(index_config: IndexConfig, target_dir: Path):
-    for name, taxonomy in iter_taxonomies(index_config.taxonomy):
-        target = target_dir / name
+    for taxonomy in iter_taxonomies(index_config.taxonomy):
+        target = target_dir / taxonomy.name
         # a temporary directory, we move at the end
-        target_tmp = target_dir / f"{name}.tmp"
+        target_tmp = target_dir / f"{taxonomy.name}.tmp"
         shutil.rmtree(target_tmp, ignore_errors=True)
         # ensure directory
         os.makedirs(target_tmp, mode=0o775, exist_ok=True)
diff --git a/data/config/openfoodfacts.yml b/data/config/openfoodfacts.yml
index a12d3b73..13d39535 100644
--- a/data/config/openfoodfacts.yml
+++ b/data/config/openfoodfacts.yml
@@ -160,6 +160,7 @@ indices:
       primary_color: "#341100"
       accent_color: "#ff8714"
     taxonomy:
+      preprocessor: app.openfoodfacts.TaxonomyPreprocessor
       sources:
         - name: categories
           url: https://static.openfoodfacts.org/data/taxonomies/categories.full.json
diff --git a/tests/int/data/test_off.yml b/tests/int/data/test_off.yml
index 3a17f8ef..9800e470 100644
--- a/tests/int/data/test_off.yml
+++ b/tests/int/data/test_off.yml
@@ -50,6 +50,7 @@ indices:
       primary_color: "#341100"
       accent_color: "#ff8714"
     taxonomy:
+      preprocessor: tests.int.helpers.TestTaxonomyPreprocessor
       sources:
         - name: categories
           url: file:///opt/search/tests/int/data/test_categories.full.json
diff --git a/tests/int/helpers.py b/tests/int/helpers.py
index 983811c6..35573e48 100644
--- a/tests/int/helpers.py
+++ b/tests/int/helpers.py
@@ -5,7 +5,9 @@
 from app._import import BaseDocumentFetcher
 from app._types import FetcherResult, FetcherStatus, JSONType
 from app.indexing import BaseDocumentPreprocessor
+from app.openfoodfacts import TaxonomyPreprocessor
 from app.postprocessing import BaseResultProcessor
+from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
 
 
 class CallRegistration:
@@ -41,6 +43,13 @@ def get_calls(cls):
         return calls
 
 
+class TestTaxonomyPreprocessor(TaxonomyPreprocessor, CallRegistration):
+
+    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
+        self.register_call((taxonomy.name, node.id))
+        return super().preprocess(taxonomy, node)
+
+
 class TestDocumentFetcher(BaseDocumentFetcher, CallRegistration):
 
     def fetch_document(self, stream_name: str, item: JSONType) -> FetcherResult:
diff --git a/tests/int/test_completion.py b/tests/int/test_completion.py
new file mode 100644
index 00000000..75eb022e
--- /dev/null
+++ b/tests/int/test_completion.py
@@ -0,0 +1,73 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "q,taxonomies,langs,results",
+    [
+        # simple
+        ("organ", "labels", "en", [("en:organic", "Organic", 90)]),
+        # case-insensitive match
+        ("ORGAN", "labels", "en", [("en:organic", "Organic", 90)]),
+        # french
+        ("biol", "labels", "fr", [("en:organic", "biologique", 90)]),
+        # multiple languages
+        ("biol", "labels", "en,fr", [("en:organic", "biologique", 90)]),
+        # xx entry added to French
+        ("Max H", "labels", "fr", [("en:max-havelaar", "Max Havelaar", 85)]),
+        # main language used for an entry without French
+        (
+            "Fairtrade/Max H",
+            "labels",
+            "fr,main",
+            [("en:max-havelaar", "Fairtrade/Max Havelaar", 85)],
+        ),
+        # multiple taxonomies
+        (
+            "fr",
+            "labels,categories",
+            "en",
+            [
+                ("en:organic", "From Organic Agriculture", 90),
+                ("en:fr-bio-01", "FR-BIO-01", 88),
+                ("en:no-artificial-flavors", "free of artificial flavor", 76),
+                (
+                    "en:fruits-and-vegetables-based-foods",
+                    "Fruits and vegetables based foods",
+                    64,
+                ),
+            ],
+        ),
+        # different answers for different prefixes
+        (
+            "b",
+            "categories",
+            "en",
+            [
+                ("en:biscuits", "biscuit", 89),
+                ("en:beverages", "Beverages", 88),
+                ("en:chocolate-biscuits", "Biscuit with chocolate", 79),
+                ("en:biscuits-and-cakes", "Biscuits and cakes", 79),
+                ("en:sweetened-beverages", "Beverages with added sugar", 78),
+            ],
+        ),
+    ],
+)
+def test_completion(q, taxonomies, langs, results, test_client, synonyms_created):
+    response = test_client.get(
+        f"/autocomplete?q={q}&langs={langs}&taxonomy_names={taxonomies}&size=5"
+    )
+    assert response.status_code == 200
+    options = response.json()["options"]
+    assert len(options) == len(results)
+    # only requested taxonomies
+    result_taxonomies = set([option["taxonomy_name"] for option in options])
+    assert result_taxonomies <= set(taxonomies.split(","))
+    # well sorted
+    assert sorted([option["score"] for option in options], reverse=True) == [
+        option["score"] for option in options
+    ]
+    # expected results
+    completions = [
+        (option["id"], option["text"], int(option["score"])) for option in options
+    ]
+    assert completions == results
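
With the new `taxonomy.preprocessor` setting, any project configuration can plug its own preprocessing step by pointing at a class derived from `app.indexing.BaseTaxonomyPreprocessor`. Below is a minimal sketch of such a class; the class name, the nameless-entry check and the id-derived synonym are illustrative assumptions, not part of this patch, and `FetcherStatus.SKIP` is assumed to exist as in the document fetcher API — per `gen_taxonomy_documents`, any status other than `FOUND` (or a `None` node) causes the entry to be dropped.

    from app._types import FetcherStatus
    from app.indexing import BaseTaxonomyPreprocessor
    from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult


    class MyTaxonomyPreprocessor(BaseTaxonomyPreprocessor):
        """Hypothetical preprocessor: skip nameless entries, add id-based synonyms."""

        def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
            if not node.names:
                # any status other than FOUND (or node=None) skips the entry
                # (SKIP is assumed to be a FetcherStatus member)
                return TaxonomyNodeResult(status=FetcherStatus.SKIP, node=None)
            # illustrative transform: expose the bare id (without its language
            # prefix) as an extra synonym in every supported language
            bare_id = node.id.split(":", 1)[-1].replace("-", " ")
            for lang in self.config.supported_langs:
                lang_synonyms = node.synonyms.setdefault(lang, [])
                if bare_id not in lang_synonyms:
                    lang_synonyms.append(bare_id)
            return TaxonomyNodeResult(status=FetcherStatus.FOUND, node=node)

It would then be referenced from the index configuration, mirroring the yml change above: `preprocessor: my_package.MyTaxonomyPreprocessor` (hypothetical module path).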
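
The completion `weight` computed in `gen_taxonomy_documents` is `max(100 - len(node.id), 0)`: shorter ids get a higher weight, so the most generic entries surface first. A quick sanity check of the formula against the scores expected in `test_completion.py`:

    def completion_weight(node_id: str) -> int:
        # same formula as gen_taxonomy_documents uses for the ES completion field
        return max(100 - len(node_id), 0)


    assert completion_weight("en:organic") == 90        # "organ" -> Organic
    assert completion_weight("en:biscuits") == 89       # "b" -> biscuit
    assert completion_weight("en:max-havelaar") == 85   # "Max H" -> Max Havelaar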
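
On the API side, the autocomplete route now takes a comma-separated `langs` parameter in place of the former single `lang`. A sketch of a client call, assuming a local deployment reachable at http://localhost:8000 (the base URL is an assumption; the query parameters and the `options` fields are the ones exercised by the tests above):

    import requests

    resp = requests.get(
        "http://localhost:8000/autocomplete",  # hypothetical local instance
        params={
            "q": "biol",
            "langs": "en,fr",  # comma-separated list, new in this patch
            "taxonomy_names": "labels",
            "size": 5,
        },
        timeout=10,
    )
    resp.raise_for_status()
    for option in resp.json()["options"]:
        # options are deduplicated across languages and sorted by descending score
        print(option["score"], option["id"], option["text"], option["taxonomy_name"])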