feat: enhanced completion

* add the option of a preprocessor for taxonomies
* add a preprocessor for Open Food Facts to handle:
  * brands taxonomy
  * add main language
  * add xx entries to all languages
* fix to have names in synonyms
* add a score to completion to first match shortest entries
* enable querying multiple languages at once
* added tests
openfoodfacts · Oct 25, 2024 · dbd54b2 · dbd54b2
1 parent df65d3a
commit dbd54b2
Show file tree

Hide file tree

Showing 14 changed files with 291 additions and 60 deletions.
diff --git a/Makefile b/Makefile
@@ -176,6 +176,10 @@ build-translations:
 	@echo "🔎 Building translations …"
 	${DOCKER_COMPOSE} run --rm search_nodejs npm run translations:build
 
+cleanup-indexes:
+	@echo "🔎 Cleaning indexes …"
+	${DOCKER_COMPOSE} run --rm api python3 -m app cleanup-indexes ${args}
+
 generate-openapi: _ensure_network
 	@echo "🔎 Generating OpenAPI spec …"
 	${DOCKER_COMPOSE} run --rm api python3 -m app export-openapi /opt/search/data/searchalicious-openapi.yml

diff --git a/app/_import.py b/app/_import.py
@@ -12,8 +12,9 @@
 from redis import Redis
 
 from app._types import FetcherResult, FetcherStatus, JSONType
-from app.config import Config, IndexConfig, TaxonomyConfig, settings
+from app.config import Config, IndexConfig, settings
 from app.indexing import (
+    BaseTaxonomyPreprocessor,
     DocumentProcessor,
     generate_index_object,
     generate_taxonomy_index_object,
@@ -252,7 +253,7 @@ def gen_documents(
 
 
 def gen_taxonomy_documents(
-    taxonomy_config: TaxonomyConfig, next_index: str, supported_langs: set[str]
+    config: IndexConfig, next_index: str, supported_langs: set[str]
 ):
     """Generator for taxonomy documents in Elasticsearch.
 
@@ -261,26 +262,49 @@ def gen_taxonomy_documents(
     :param supported_langs: a set of supported languages
     :yield: a dict with the document to index, compatible with ES bulk API
     """
-    for taxonomy_name, taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
+    taxonomy_config = config.taxonomy
+    preprocessor: BaseTaxonomyPreprocessor | None = None
+    if taxonomy_config.preprocessor:
+        preprocessor_cls = load_class_object_from_string(taxonomy_config.preprocessor)
+        preprocessor = preprocessor_cls(config)
+    for taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
         for node in taxonomy.iter_nodes():
+            if preprocessor:
+                result = preprocessor.preprocess(taxonomy, node)
+                if result.status != FetcherStatus.FOUND or result.node is None:
+                    continue  # skip this entry
+                node = result.node
             names = {
                 lang: lang_names
                 for lang, lang_names in node.names.items()
                 if lang in supported_langs
             }
-            synonyms = {
-                lang: lang_names
-                for lang, lang_names in node.synonyms.items()
+            synonyms: dict[str, set[str]] = {
+                lang: set(node.synonyms.get(lang) or [])
+                for lang in node.synonyms
                 if lang in supported_langs
             }
+            for lang, lang_names in names.items():
+                if lang_names:
+                    if not isinstance(lang_names, str):
+                        import pdb
+
+                        pdb.set_trace()
+                    synonyms.setdefault(lang, set()).add(lang_names)
 
             yield {
                 "_index": next_index,
                 "_source": {
                     "id": node.id,
-                    "taxonomy_name": taxonomy_name,
+                    "taxonomy_name": taxonomy.name,
                     "name": names,
-                    "synonyms": synonyms,
+                    "synonyms": {
+                        lang: {
+                            "input": list(lang_synonyms),
+                            "weight": max(100 - len(node.id), 0),
+                        }
+                        for lang, lang_synonyms in synonyms.items()
+                    },
                 },
             }
 
@@ -370,7 +394,7 @@ def import_taxonomies(config: IndexConfig, next_index: str):
     success, errors = bulk(
         es,
         gen_taxonomy_documents(
-            config.taxonomy, next_index, supported_langs=set(config.supported_langs)
+            config, next_index, supported_langs=set(config.supported_langs)
         ),
         raise_on_error=False,
     )

diff --git a/app/api.py b/app/api.py
@@ -149,8 +149,11 @@ def taxonomy_autocomplete(
             description="Name(s) of the taxonomy to search in, as a comma-separated value."
         ),
     ],
-    lang: Annotated[
-        str, Query(description="Language to search in, defaults to 'en'.")
+    langs: Annotated[
+        str,
+        Query(
+            description="Languages to search in (as a comma separated list), defaults to 'en'."
+        ),
     ] = "en",
     size: Annotated[int, Query(description="Number of results to return.")] = 10,
     fuzziness: Annotated[
@@ -167,7 +170,7 @@ def taxonomy_autocomplete(
     query = build_completion_query(
         q=q,
         taxonomy_names=taxonomy_names_list,
-        lang=lang,
+        langs=langs.split(","),
         size=size,
         config=index_config,
         fuzziness=fuzziness,

diff --git a/app/config.py b/app/config.py
@@ -510,6 +510,26 @@ class TaxonomyConfig(BaseModel):
         TaxonomyIndexConfig,
         Field(description=TaxonomyIndexConfig.__doc__),
     ]
+    preprocessor: (
+        Annotated[
+            str,
+            Field(
+                description=cd_(
+                    """The full qualified reference to the preprocessor
+                    to use before taxonomy entry import.
+
+                    This class must inherit `app.indexing.BaseTaxonomyPreprocessor`
+                    and specialize the `preprocess` method.
+
+                    This is used to adapt the taxonomy schema
+                    or to add specific fields for example.
+                    """
+                ),
+                examples=["app.openfoodfacts.TaxonomyPreprocessor"],
+            ),
+        ]
+        | None
+    ) = None
 
 
 class ScriptConfig(BaseModel):

diff --git a/app/indexing.py b/app/indexing.py
@@ -9,12 +9,12 @@
 from app._types import FetcherResult, FetcherStatus, JSONType
 from app.config import (
     ANALYZER_LANG_MAPPING,
-    Config,
     FieldConfig,
     FieldType,
     IndexConfig,
     TaxonomyConfig,
 )
+from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
 from app.utils import load_class_object_from_string
 from app.utils.analyzers import (
     get_autocomplete_analyzer,
@@ -104,8 +104,41 @@ def preprocess_field_value(
     return input_value
 
 
+class BaseTaxonomyPreprocessor(abc.ABC):
+    """Base class for taxonomy entries preprocessors.
+
+    Classes referenced in the taxonomy configuration `preprocessor` field
+    must derive from it.
+    """
+
+    def __init__(self, config: IndexConfig) -> None:
+        self.config = config
+
+    @abc.abstractmethod
+    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
+        """Preprocess the taxonomy entry before ingestion in Elasticsearch,
+        and before synonyms generation
+
+        This can be used to make document schema compatible with the project
+        schema or to add custom fields.
+
+        :return: a TaxonomyNodeResult object:
+
+        * the status can be used to control whether
+          to index the entry or not (even delete it)
+        * the node is the transformed entry
+        """
+        pass
+
+
 class BaseDocumentPreprocessor(abc.ABC):
-    def __init__(self, config: Config) -> None:
+    """Base class for document preprocessors.
+
+    Classes referenced in the index configuration `preprocess` field
+    must derive from it.
+    """
+
+    def __init__(self, config: IndexConfig) -> None:
         self.config = config
 
     @abc.abstractmethod
@@ -119,7 +152,7 @@ def preprocess(self, document: JSONType) -> FetcherResult:
 
         * the status can be used to pilot wether
           to index or not the document (even delete it)
-        * the document is the document transformed document
+        * the document is the transformed document
 
         """
         pass
@@ -379,6 +412,7 @@ def generate_taxonomy_mapping_object(config: IndexConfig) -> Mapping:
                             "type": "category",
                         }
                     ],
+                    preserve_separators=False,  # help match plurals
                 )
                 for lang in supported_langs
             },

diff --git a/app/openfoodfacts.py b/app/openfoodfacts.py
@@ -7,8 +7,9 @@
 
 from app._import import BaseDocumentFetcher
 from app._types import FetcherResult, FetcherStatus, JSONType
-from app.indexing import BaseDocumentPreprocessor
+from app.indexing import BaseDocumentPreprocessor, BaseTaxonomyPreprocessor
 from app.postprocessing import BaseResultProcessor
+from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
 from app.utils.download import http_session
 from app.utils.log import get_logger
 
@@ -87,6 +88,37 @@ def generate_image_url(code: str, image_id: str) -> str:
 OFF_API_URL = os.environ.get("OFF_API_URL", "https://world.openfoodfacts.org")
 
 
+class TaxonomyPreprocessor(BaseTaxonomyPreprocessor):
+    """Preprocessor for Open Food Facts taxonomies."""
+
+    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
+        """Preprocess a taxonomy node,
+
+        We add the main language, and we also have specificities for some taxonomies
+        """
+        if taxonomy.name == "brands":
+            # brands are english only, put them in "main lang"
+            node.names.update(main=node.names["en"])
+            if node.synonyms and (synonyms_en := list(node.synonyms.get("en", []))):
+                node.synonyms.update(main=synonyms_en)
+        else:
+            # main language is entry id prefix + possible xx entries
+            id_lang = node.id.split(":")[0]
+            if node_names := node.names.get(id_lang):
+                node.names.update(main=node_names)
+            node.synonyms.update(main=list(node.synonyms.get(id_lang, [])))
+            # add xx entries, if any, as synonyms to all languages
+            xx_name = node.names.get("xx")
+            xx_names = [xx_name] if xx_name else []
+            xx_names += node.synonyms.get("xx", [])
+            if xx_names:
+                for lang in self.config.supported_langs:
+                    node.names.setdefault(lang, xx_names[0])
+                    lang_synonyms = node.synonyms.setdefault(lang, [])
+                    lang_synonyms += xx_names
+        return TaxonomyNodeResult(status=FetcherStatus.FOUND, node=node)
+
+
 class DocumentFetcher(BaseDocumentFetcher):
     def fetch_document(self, stream_name: str, item: JSONType) -> FetcherResult:
         if item.get("action") == "deleted":

diff --git a/app/postprocessing.py b/app/postprocessing.py
@@ -65,13 +65,24 @@ def load_result_processor(config: IndexConfig) -> BaseResultProcessor | None:
 def process_taxonomy_completion_response(response: Response) -> JSONType:
     output = {"took": response.took, "timed_out": response.timed_out}
     options = []
-    suggestion = response.suggest["taxonomy_suggest"][0]
-    for option in suggestion.options:
-        result = {
-            "id": option._source["id"],
-            "text": option.text,
-            "taxonomy_name": option._source["taxonomy_name"],
-        }
-        options.append(result)
-    output["options"] = options
+    ids = set()
+    for suggestion_id in dir(response.suggest):
+        if not suggestion_id.startswith("taxonomy_suggest_"):
+            continue
+        for suggestion in getattr(response.suggest, suggestion_id):
+            for option in suggestion.options:
+                if option._source["id"] in ids:
+                    continue
+                ids.add(option._source["id"])
+                result = {
+                    "id": option._source["id"],
+                    "text": option.text,
+                    "score": option._score,
+                    "taxonomy_name": option._source["taxonomy_name"],
+                }
+                options.append(result)
+    # highest score first
+    output["options"] = sorted(
+        options, key=lambda option: option["score"], reverse=True
+    )
     return output
diff --git a/app/query.py b/app/query.py
@@ -322,7 +322,7 @@ def build_es_query(
 def build_completion_query(
     q: str,
     taxonomy_names: list[str],
-    lang: str,
+    langs: list[str],
     size: int,
     config: IndexConfig,
     fuzziness: int | None = 2,
@@ -331,28 +331,31 @@ def build_completion_query(
 
     :param q: the user autocomplete query
     :param taxonomy_names: a list of taxonomies we want to search in
-    :param lang: the language we want search in
+    :param langs: the languages we want to search in
     :param size: number of results to return
     :param config: the index configuration to use
     :param fuzziness: fuzziness parameter for completion query
     :return: the built Query
     """
-
-    completion_clause = {
-        "field": f"synonyms.{lang}",
-        "size": size,
-        "contexts": {"taxonomy_name": taxonomy_names},
-    }
-
-    if fuzziness is not None:
-        completion_clause["fuzzy"] = {"fuzziness": fuzziness}
-
     query = Search(index=config.taxonomy.index.name)
-    query = query.suggest(
-        "taxonomy_suggest",
-        q,
-        completion=completion_clause,
-    )
+    # import pdb;pdb.set_trace();
+    for lang in langs:
+        completion_clause = {
+            "field": f"synonyms.{lang}",
+            "size": size,
+            "contexts": {"taxonomy_name": taxonomy_names},
+            "skip_duplicates": True,
+        }
+        if fuzziness is not None:
+            completion_clause["fuzzy"] = {"fuzziness": fuzziness}
+
+        query = query.suggest(
+            f"taxonomy_suggest_{lang}",
+            q,
+            completion=completion_clause,
+        )
+    # limit returned fields
+    # query.source(fields=["id", "taxonomy_name"])
     return query