From dbd54b2527e6d0032ab503dc5bf5829d17dbd6f8 Mon Sep 17 00:00:00 2001
From: Alex Garel
Date: Fri, 25 Oct 2024 10:27:19 +0200
Subject: [PATCH] feat: enhanced completion

* add the option of a preprocessor for taxonomies
* add a preprocessor for Open Food Facts to handle:
  * the brands taxonomy
  * adding the main language
  * adding xx entries to all languages
* fix: also index entry names as synonyms
* add a score to completion entries so that the shortest ones match first
* enable querying multiple languages at once
* add tests
---
 Makefile                      |  4 ++
 app/_import.py                | 38 ++++++++++++-----
 app/api.py                    |  9 +++--
 app/config.py                 | 20 ++++++++++
 app/indexing.py               | 40 +++++++++++++++--
 app/openfoodfacts.py          | 34 +++++++++++++-
 app/postprocessing.py         | 29 +++++++-----
 app/query.py                  | 34 ++++++++-------
 app/taxonomy.py               | 46 +++++++++++++-------
 app/taxonomy_es.py            |  6 +--
 data/config/openfoodfacts.yml |  1 +
 tests/int/data/test_off.yml   |  1 +
 tests/int/helpers.py          |  9 +++++
 tests/int/test_completion.py  | 73 +++++++++++++++++++++++++++++++++++
 14 files changed, 284 insertions(+), 60 deletions(-)
 create mode 100644 tests/int/test_completion.py

diff --git a/Makefile b/Makefile
index 670e422f..f0f0c7be 100644
--- a/Makefile
+++ b/Makefile
@@ -176,6 +176,10 @@ build-translations:
 	@echo "🔎 Building translations …"
 	${DOCKER_COMPOSE} run --rm search_nodejs npm run translations:build
 
+cleanup-indexes:
+	@echo "🔎 Cleaning indexes …"
+	${DOCKER_COMPOSE} run --rm api python3 -m app cleanup-indexes ${args}
+
 generate-openapi: _ensure_network
 	@echo "🔎 Generating OpenAPI spec …"
 	${DOCKER_COMPOSE} run --rm api python3 -m app export-openapi /opt/search/data/searchalicious-openapi.yml
diff --git a/app/_import.py b/app/_import.py
index 7789af50..51fc49fd 100644
--- a/app/_import.py
+++ b/app/_import.py
@@ -12,8 +12,9 @@
 from redis import Redis
 
 from app._types import FetcherResult, FetcherStatus, JSONType
-from app.config import Config, IndexConfig, TaxonomyConfig, settings
+from app.config import Config, IndexConfig, settings
 from app.indexing import (
+    BaseTaxonomyPreprocessor,
     DocumentProcessor,
     generate_index_object,
     generate_taxonomy_index_object,
@@ -252,7 +253,7 @@ def gen_documents(
 
 
 def gen_taxonomy_documents(
-    taxonomy_config: TaxonomyConfig, next_index: str, supported_langs: set[str]
+    config: IndexConfig, next_index: str, supported_langs: set[str]
 ):
     """Generator for taxonomy documents in Elasticsearch.
 
@@ -261,26 +262,45 @@ def gen_taxonomy_documents(
     :param supported_langs: a set of supported languages
     :yield: a dict with the document to index, compatible with ES bulk API
     """
-    for taxonomy_name, taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
+    taxonomy_config = config.taxonomy
+    preprocessor: BaseTaxonomyPreprocessor | None = None
+    if taxonomy_config.preprocessor:
+        preprocessor_cls = load_class_object_from_string(taxonomy_config.preprocessor)
+        preprocessor = preprocessor_cls(config)
+    for taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
         for node in taxonomy.iter_nodes():
+            if preprocessor:
+                result = preprocessor.preprocess(taxonomy, node)
+                if result.status != FetcherStatus.FOUND or result.node is None:
+                    continue  # skip this entry
+                node = result.node
             names = {
                 lang: lang_names
                 for lang, lang_names in node.names.items()
                 if lang in supported_langs
             }
-            synonyms = {
-                lang: lang_names
-                for lang, lang_names in node.synonyms.items()
+            synonyms: dict[str, set[str]] = {
+                lang: set(node.synonyms.get(lang) or [])
+                for lang in node.synonyms
                 if lang in supported_langs
             }
+            for lang, lang_names in names.items():
+                if lang_names:
+                    synonyms.setdefault(lang, set()).add(lang_names)
             yield {
                 "_index": next_index,
                 "_source": {
                     "id": node.id,
-                    "taxonomy_name": taxonomy_name,
+                    "taxonomy_name": taxonomy.name,
                     "name": names,
-                    "synonyms": synonyms,
+                    "synonyms": {
+                        lang: {
+                            "input": list(lang_synonyms),
+                            "weight": max(100 - len(node.id), 0),
+                        }
+                        for lang, lang_synonyms in synonyms.items()
+                    },
                 },
             }
 
@@ -370,7 +394,7 @@ def import_taxonomies(config: IndexConfig, next_index: str):
     success, errors = bulk(
         es,
         gen_taxonomy_documents(
-            config.taxonomy, next_index, supported_langs=set(config.supported_langs)
+            config, next_index, supported_langs=set(config.supported_langs)
         ),
         raise_on_error=False,
     )
diff --git a/app/api.py b/app/api.py
index 6f6fcc47..ed09c7ef 100644
--- a/app/api.py
+++ b/app/api.py
@@ -149,8 +149,11 @@ def taxonomy_autocomplete(
             description="Name(s) of the taxonomy to search in, as a comma-separated value."
         ),
     ],
-    lang: Annotated[
-        str, Query(description="Language to search in, defaults to 'en'.")
+    langs: Annotated[
+        str,
+        Query(
+            description="Languages to search in (as a comma-separated list), defaults to 'en'."
+        ),
     ] = "en",
     size: Annotated[int, Query(description="Number of results to return.")] = 10,
     fuzziness: Annotated[
@@ -167,7 +170,7 @@ def taxonomy_autocomplete(
     query = build_completion_query(
         q=q,
         taxonomy_names=taxonomy_names_list,
-        lang=lang,
+        langs=langs.split(","),
        size=size,
         config=index_config,
         fuzziness=fuzziness,
diff --git a/app/config.py b/app/config.py
index f820099b..c381d87b 100644
--- a/app/config.py
+++ b/app/config.py
@@ -510,6 +510,26 @@ class TaxonomyConfig(BaseModel):
         TaxonomyIndexConfig,
         Field(description=TaxonomyIndexConfig.__doc__),
     ]
+    preprocessor: (
+        Annotated[
+            str,
+            Field(
+                description=cd_(
+                    """The fully qualified reference to the preprocessor
+                    to use before taxonomy entry import.
+
+                    This class must inherit `app.indexing.BaseTaxonomyPreprocessor`
+                    and specialize the `preprocess` method.
+
+                    This is used, for example, to adapt the taxonomy schema
+                    or to add specific fields.
+                    """
+                ),
+                examples=["app.openfoodfacts.TaxonomyPreprocessor"],
+            ),
+        ]
+        | None
+    ) = None
 
 
 class ScriptConfig(BaseModel):
diff --git a/app/indexing.py b/app/indexing.py
index 9d09f776..6a3050fd 100644
--- a/app/indexing.py
+++ b/app/indexing.py
@@ -9,12 +9,12 @@
 from app._types import FetcherResult, FetcherStatus, JSONType
 from app.config import (
     ANALYZER_LANG_MAPPING,
-    Config,
     FieldConfig,
     FieldType,
     IndexConfig,
     TaxonomyConfig,
 )
+from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
 from app.utils import load_class_object_from_string
 from app.utils.analyzers import (
     get_autocomplete_analyzer,
@@ -104,8 +104,41 @@ def preprocess_field_value(
     return input_value
 
 
+class BaseTaxonomyPreprocessor(abc.ABC):
+    """Base class for taxonomy entry preprocessors.
+
+    Classes referenced in the index configuration `preprocessor` field
+    must be derived from it.
+    """
+
+    def __init__(self, config: IndexConfig) -> None:
+        self.config = config
+
+    @abc.abstractmethod
+    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
+        """Preprocess the taxonomy entry before ingestion in Elasticsearch,
+        and before synonyms generation.
+
+        This can be used to make the document schema compatible with the project
+        schema or to add custom fields.
+
+        :return: a TaxonomyNodeResult object:
+
+        * the status can be used to decide whether
+          to index the entry or not (or even delete it)
+        * the node is the transformed entry
+        """
+        pass
+
+
 class BaseDocumentPreprocessor(abc.ABC):
-    def __init__(self, config: Config) -> None:
+    """Base class for document preprocessors.
+
+    Classes referenced in the index configuration `preprocess` field
+    must be derived from it.
+    """
+
+    def __init__(self, config: IndexConfig) -> None:
         self.config = config
 
     @abc.abstractmethod
@@ -119,7 +152,7 @@ def preprocess(self, document: JSONType) -> FetcherResult:
 
         * the status can be used to pilot wether
           to index or not the document (even delete it)
-        * the document is the document transformed document
+        * the document is the transformed document
         """
         pass
 
@@ -379,6 +412,7 @@ def generate_taxonomy_mapping_object(config: IndexConfig) -> Mapping:
                             "type": "category",
                         }
                     ],
+                    preserve_separators=False,  # helps match plurals
                 )
                 for lang in supported_langs
             },
diff --git a/app/openfoodfacts.py b/app/openfoodfacts.py
index 1a4adc6e..8b3e1fd1 100644
--- a/app/openfoodfacts.py
+++ b/app/openfoodfacts.py
@@ -7,8 +7,9 @@
 
 from app._import import BaseDocumentFetcher
 from app._types import FetcherResult, FetcherStatus, JSONType
-from app.indexing import BaseDocumentPreprocessor
+from app.indexing import BaseDocumentPreprocessor, BaseTaxonomyPreprocessor
 from app.postprocessing import BaseResultProcessor
+from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
 from app.utils.download import http_session
 from app.utils.log import get_logger
 
@@ -87,6 +88,37 @@ def generate_image_url(code: str, image_id: str) -> str:
 OFF_API_URL = os.environ.get("OFF_API_URL", "https://world.openfoodfacts.org")
 
 
+class TaxonomyPreprocessor(BaseTaxonomyPreprocessor):
+    """Preprocessor for Open Food Facts taxonomies."""
+
+    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
+        """Preprocess a taxonomy node.
+
+        We add the main language, with specific handling for some taxonomies.
+        """
+        if taxonomy.name == "brands":
+            # brands are English-only, put them in the "main" language
+            node.names.update(main=node.names["en"])
+            if node.synonyms and (synonyms_en := list(node.synonyms.get("en", []))):
+                node.synonyms.update(main=synonyms_en)
+        else:
+            # the main language is the entry id prefix, plus any xx entries
+            id_lang = node.id.split(":")[0]
+            if node_names := node.names.get(id_lang):
+                node.names.update(main=node_names)
+            node.synonyms.update(main=list(node.synonyms.get(id_lang, [])))
+            # add any xx entries as synonyms to all languages
+            xx_name = node.names.get("xx")
+            xx_names = [xx_name] if xx_name else []
+            xx_names += node.synonyms.get("xx", [])
+            if xx_names:
+                for lang in self.config.supported_langs:
+                    node.names.setdefault(lang, xx_names[0])
+                    lang_synonyms = node.synonyms.setdefault(lang, [])
+                    lang_synonyms += xx_names
+        return TaxonomyNodeResult(status=FetcherStatus.FOUND, node=node)
+
+
 class DocumentFetcher(BaseDocumentFetcher):
     def fetch_document(self, stream_name: str, item: JSONType) -> FetcherResult:
         if item.get("action") == "deleted":
diff --git a/app/postprocessing.py b/app/postprocessing.py
index 40d0f87a..35c00ff1 100644
--- a/app/postprocessing.py
+++ b/app/postprocessing.py
@@ -65,13 +65,24 @@ def load_result_processor(config: IndexConfig) -> BaseResultProcessor | None:
 def process_taxonomy_completion_response(response: Response) -> JSONType:
     output = {"took": response.took, "timed_out": response.timed_out}
     options = []
-    suggestion = response.suggest["taxonomy_suggest"][0]
-    for option in suggestion.options:
-        result = {
-            "id": option._source["id"],
-            "text": option.text,
-            "taxonomy_name": option._source["taxonomy_name"],
-        }
-        options.append(result)
-    output["options"] = options
+    ids = set()
+    for suggestion_id in dir(response.suggest):
+        if not suggestion_id.startswith("taxonomy_suggest_"):
+            continue
+        for suggestion in getattr(response.suggest, suggestion_id):
+            for option in suggestion.options:
+                if option._source["id"] in ids:
+                    continue
+                ids.add(option._source["id"])
+                result = {
+                    "id": option._source["id"],
+                    "text": option.text,
+                    "score": option._score,
+                    "taxonomy_name": option._source["taxonomy_name"],
+                }
+                options.append(result)
+    # highest score first
+    output["options"] = sorted(
+        options, key=lambda option: option["score"], reverse=True
+    )
     return output
diff --git a/app/query.py b/app/query.py
index b5299902..d4555d2f 100644
--- a/app/query.py
+++ b/app/query.py
@@ -322,7 +322,7 @@ def build_es_query(
 def build_completion_query(
     q: str,
     taxonomy_names: list[str],
-    lang: str,
+    langs: list[str],
     size: int,
     config: IndexConfig,
     fuzziness: int | None = 2,
@@ -331,28 +331,28 @@
 
     :param q: the user autocomplete query
     :param taxonomy_names: a list of taxonomies we want to search in
-    :param lang: the language we want search in
+    :param langs: the languages we want to search in
     :param size: number of results to return
     :param config: the index configuration to use
     :param fuzziness: fuzziness parameter for completion query
     :return: the built Query
     """
-
-    completion_clause = {
-        "field": f"synonyms.{lang}",
-        "size": size,
-        "contexts": {"taxonomy_name": taxonomy_names},
-    }
-
-    if fuzziness is not None:
-        completion_clause["fuzzy"] = {"fuzziness": fuzziness}
-
     query = Search(index=config.taxonomy.index.name)
-    query = query.suggest(
-        "taxonomy_suggest",
-        q,
-        completion=completion_clause,
-    )
+    for lang in langs:
+        completion_clause = {
+            "field": f"synonyms.{lang}",
+            "size": size,
+            "contexts": {"taxonomy_name": taxonomy_names},
+            "skip_duplicates": True,
+        }
+        if fuzziness is not None:
+            completion_clause["fuzzy"] = {"fuzziness": fuzziness}
+
+        query = query.suggest(
f"taxonomy_suggest_{lang}", + q, + completion=completion_clause, + ) + # limit returned fields + # query.source(fields=["id", "taxonomy_name"]) return query diff --git a/app/taxonomy.py b/app/taxonomy.py index e1face38..46f7222b 100644 --- a/app/taxonomy.py +++ b/app/taxonomy.py @@ -9,8 +9,9 @@ import cachetools import requests +from pydantic import BaseModel, ConfigDict -from app._types import JSONType +from app._types import FetcherStatus, JSONType from app.config import TaxonomyConfig, settings from app.utils import get_logger from app.utils.download import download_file, http_session, should_download_file @@ -157,8 +158,9 @@ class Taxonomy: node identifier to a `TaxonomyNode`. """ - def __init__(self) -> None: + def __init__(self, name: str) -> None: self.nodes: Dict[str, TaxonomyNode] = {} + self.name = name def add(self, key: str, node: TaxonomyNode) -> None: """Add a node to the taxonomy under the id `key`. @@ -263,13 +265,13 @@ def to_dict(self) -> JSONType: return export @classmethod - def from_dict(cls, data: JSONType) -> "Taxonomy": + def from_dict(cls, name: str, data: JSONType) -> "Taxonomy": """Create a Taxonomy from `data`. :param data: the taxonomy as a dict :return: a Taxonomy """ - taxonomy = Taxonomy() + taxonomy = Taxonomy(name) for key, key_data in data.items(): if key not in taxonomy: @@ -293,17 +295,21 @@ def from_dict(cls, data: JSONType) -> "Taxonomy": return taxonomy @classmethod - def from_path(cls, file_path: Union[str, Path]) -> "Taxonomy": + def from_path(cls, name: str, file_path: Union[str, Path]) -> "Taxonomy": """Create a Taxonomy from a JSON file. :param file_path: a JSON file, gzipped (.json.gz) files are supported :return: a Taxonomy """ - return cls.from_dict(load_json(file_path)) # type: ignore + return cls.from_dict(name, load_json(file_path)) # type: ignore @classmethod def from_url( - cls, url: str, session: Optional[requests.Session] = None, timeout: int = 120 + cls, + name: str, + url: str, + session: Optional[requests.Session] = None, + timeout: int = 120, ) -> "Taxonomy": """Create a Taxonomy from a taxonomy file hosted at `url`. 
@@ -315,7 +321,7 @@ def from_url(
         session = http_session if session is None else session
         r = session.get(url, timeout=timeout)
         data = r.json()
-        return cls.from_dict(data)
+        return cls.from_dict(name, data)
 
 
 @cachetools.cached(cachetools.TTLCache(maxsize=100, ttl=3600))
@@ -345,7 +351,7 @@ def get_taxonomy(
         fpath = taxonomy_url[len("file://") :]
         if not fpath.startswith("/"):
             raise RuntimeError("Relative path (not yet) supported for taxonomy url")
-        return Taxonomy.from_path(fpath.rstrip("/"))
+        return Taxonomy.from_path(taxonomy_name, fpath.rstrip("/"))
 
     filename = f"{taxonomy_name}.json"
     cache_dir = DEFAULT_CACHE_DIR if cache_dir is None else cache_dir
@@ -354,16 +360,26 @@ def get_taxonomy(
     if not should_download_file(
         taxonomy_url, taxonomy_path, force_download, download_newer
     ):
-        return Taxonomy.from_path(taxonomy_path)
+        return Taxonomy.from_path(taxonomy_name, taxonomy_path)
 
     cache_dir.mkdir(parents=True, exist_ok=True)
     logger.info("Downloading taxonomy, saving it in %s", taxonomy_path)
     download_file(taxonomy_url, taxonomy_path)
-    return Taxonomy.from_path(taxonomy_path)
+    return Taxonomy.from_path(taxonomy_name, taxonomy_path)
 
 
-def iter_taxonomies(taxonomy_config: TaxonomyConfig) -> Iterator[tuple[str, Taxonomy]]:
+def iter_taxonomies(taxonomy_config: TaxonomyConfig) -> Iterator[Taxonomy]:
     for taxonomy_source_config in taxonomy_config.sources:
-        yield taxonomy_source_config.name, get_taxonomy(
-            taxonomy_source_config.name, str(taxonomy_source_config.url)
-        )
+        yield get_taxonomy(taxonomy_source_config.name, str(taxonomy_source_config.url))
+
+
+class TaxonomyNodeResult(BaseModel):
+    """Result of a taxonomy node transformation.
+
+    This makes it possible to skip an entry after preprocessing.
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    status: FetcherStatus
+    node: TaxonomyNode | None
diff --git a/app/taxonomy_es.py b/app/taxonomy_es.py
index a8f713e4..89e293a5 100644
--- a/app/taxonomy_es.py
+++ b/app/taxonomy_es.py
@@ -94,10 +94,10 @@ def create_synonyms_files(taxonomy: Taxonomy, langs: list[str], target_dir: Path
 
 
 def create_synonyms(index_config: IndexConfig, target_dir: Path):
-    for name, taxonomy in iter_taxonomies(index_config.taxonomy):
-        target = target_dir / name
+    for taxonomy in iter_taxonomies(index_config.taxonomy):
+        target = target_dir / taxonomy.name
         # a temporary directory, we move at the end
-        target_tmp = target_dir / f"{name}.tmp"
+        target_tmp = target_dir / f"{taxonomy.name}.tmp"
         shutil.rmtree(target_tmp, ignore_errors=True)
         # ensure directory
         os.makedirs(target_tmp, mode=0o775, exist_ok=True)
diff --git a/data/config/openfoodfacts.yml b/data/config/openfoodfacts.yml
index a12d3b73..13d39535 100644
--- a/data/config/openfoodfacts.yml
+++ b/data/config/openfoodfacts.yml
@@ -160,6 +160,7 @@ indices:
       primary_color: "#341100"
       accent_color: "#ff8714"
     taxonomy:
+      preprocessor: app.openfoodfacts.TaxonomyPreprocessor
       sources:
         - name: categories
           url: https://static.openfoodfacts.org/data/taxonomies/categories.full.json
diff --git a/tests/int/data/test_off.yml b/tests/int/data/test_off.yml
index 3a17f8ef..9800e470 100644
--- a/tests/int/data/test_off.yml
+++ b/tests/int/data/test_off.yml
@@ -50,6 +50,7 @@ indices:
       primary_color: "#341100"
       accent_color: "#ff8714"
     taxonomy:
+      preprocessor: tests.int.helpers.TestTaxonomyPreprocessor
       sources:
         - name: categories
           url: file:///opt/search/tests/int/data/test_categories.full.json
diff --git a/tests/int/helpers.py b/tests/int/helpers.py
index 983811c6..35573e48 100644
--- a/tests/int/helpers.py
+++ b/tests/int/helpers.py
@@ -5,7 +5,9 @@
 from app._import import BaseDocumentFetcher
 from app._types import FetcherResult, FetcherStatus, JSONType
 from app.indexing import BaseDocumentPreprocessor
+from app.openfoodfacts import TaxonomyPreprocessor
 from app.postprocessing import BaseResultProcessor
+from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult
 
 
 class CallRegistration:
@@ -41,6 +43,13 @@ def get_calls(cls):
         return calls
 
 
+class TestTaxonomyPreprocessor(TaxonomyPreprocessor, CallRegistration):
+
+    def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
+        self.register_call((taxonomy.name, node.id))
+        return super().preprocess(taxonomy, node)
+
+
 class TestDocumentFetcher(BaseDocumentFetcher, CallRegistration):
 
     def fetch_document(self, stream_name: str, item: JSONType) -> FetcherResult:
diff --git a/tests/int/test_completion.py b/tests/int/test_completion.py
new file mode 100644
index 00000000..75eb022e
--- /dev/null
+++ b/tests/int/test_completion.py
@@ -0,0 +1,73 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "q,taxonomies,langs,results",
+    [
+        # simple
+        ("organ", "labels", "en", [("en:organic", "Organic", 90)]),
+        # case-insensitive match
+        ("ORGAN", "labels", "en", [("en:organic", "Organic", 90)]),
+        # french
+        ("biol", "labels", "fr", [("en:organic", "biologique", 90)]),
+        # multiple languages
+        ("biol", "labels", "en,fr", [("en:organic", "biologique", 90)]),
+        # xx entry added to French
+        ("Max H", "labels", "fr", [("en:max-havelaar", "Max Havelaar", 85)]),
+        # main language used for an entry without French
+        (
+            "Fairtrade/Max H",
+            "labels",
+            "fr,main",
+            [("en:max-havelaar", "Fairtrade/Max Havelaar", 85)],
+        ),
+        # multiple taxonomies
+        (
+            "fr",
+            "labels,categories",
+            "en",
+            [
+                ("en:organic", "From Organic Agriculture", 90),
+                ("en:fr-bio-01", "FR-BIO-01", 88),
+                ("en:no-artificial-flavors", "free of artificial flavor", 76),
+                (
+                    "en:fruits-and-vegetables-based-foods",
+                    "Fruits and vegetables based foods",
+                    64,
+                ),
+            ],
+        ),
+        # different answers for different prefixes
+        (
+            "b",
+            "categories",
+            "en",
+            [
+                ("en:biscuits", "biscuit", 89),
+                ("en:beverages", "Beverages", 88),
+                ("en:chocolate-biscuits", "Biscuit with chocolate", 79),
+                ("en:biscuits-and-cakes", "Biscuits and cakes", 79),
+                ("en:sweetened-beverages", "Beverages with added sugar", 78),
+            ],
+        ),
+    ],
+)
+def test_completion(q, taxonomies, langs, results, test_client, synonyms_created):
+    response = test_client.get(
+        f"/autocomplete?q={q}&langs={langs}&taxonomy_names={taxonomies}&size=5"
+    )
+    assert response.status_code == 200
+    options = response.json()["options"]
+    assert len(options) == len(results)
+    # only requested taxonomies
+    result_taxonomies = set([option["taxonomy_name"] for option in options])
+    assert result_taxonomies <= set(taxonomies.split(","))
+    # well sorted
+    assert sorted([option["score"] for option in options], reverse=True) == [
+        option["score"] for option in options
+    ]
+    # expected results
+    completions = [
+        (option["id"], option["text"], int(option["score"])) for option in options
+    ]
+    assert completions == results
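
With the new `taxonomy.preprocessor` setting, any project configuration can plug its own preprocessing step by pointing at a class derived from `app.indexing.BaseTaxonomyPreprocessor`. Below is a minimal sketch of such a class; the class name, the nameless-entry check and the id-derived synonym are illustrative assumptions, not part of this patch, and `FetcherStatus.SKIP` is assumed to exist as in the document fetcher API — per `gen_taxonomy_documents`, any status other than `FOUND` (or a `None` node) causes the entry to be dropped.

    from app._types import FetcherStatus
    from app.indexing import BaseTaxonomyPreprocessor
    from app.taxonomy import Taxonomy, TaxonomyNode, TaxonomyNodeResult


    class MyTaxonomyPreprocessor(BaseTaxonomyPreprocessor):
        """Hypothetical preprocessor: skip nameless entries, add id-based synonyms."""

        def preprocess(self, taxonomy: Taxonomy, node: TaxonomyNode) -> TaxonomyNodeResult:
            if not node.names:
                # any status other than FOUND (or node=None) skips the entry
                # (SKIP is assumed to be a FetcherStatus member)
                return TaxonomyNodeResult(status=FetcherStatus.SKIP, node=None)
            # illustrative transform: expose the bare id (without its language
            # prefix) as an extra synonym in every supported language
            bare_id = node.id.split(":", 1)[-1].replace("-", " ")
            for lang in self.config.supported_langs:
                lang_synonyms = node.synonyms.setdefault(lang, [])
                if bare_id not in lang_synonyms:
                    lang_synonyms.append(bare_id)
            return TaxonomyNodeResult(status=FetcherStatus.FOUND, node=node)

It would then be referenced from the index configuration, mirroring the yml change above: `preprocessor: my_package.MyTaxonomyPreprocessor` (hypothetical module path).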
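
The completion `weight` computed in `gen_taxonomy_documents` is `max(100 - len(node.id), 0)`: shorter ids get a higher weight, so the most generic entries surface first. A quick sanity check of the formula against the scores expected in `test_completion.py`:

    def completion_weight(node_id: str) -> int:
        # same formula as gen_taxonomy_documents uses for the ES completion field
        return max(100 - len(node_id), 0)


    assert completion_weight("en:organic") == 90        # "organ" -> Organic
    assert completion_weight("en:biscuits") == 89       # "b" -> biscuit
    assert completion_weight("en:max-havelaar") == 85   # "Max H" -> Max Havelaar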
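
On the API side, the autocomplete route now takes a comma-separated `langs` parameter in place of the former single `lang`. A sketch of a client call, assuming a local deployment reachable at http://localhost:8000 (the base URL is an assumption; the query parameters and the `options` fields are the ones exercised by the tests above):

    import requests

    resp = requests.get(
        "http://localhost:8000/autocomplete",  # hypothetical local instance
        params={
            "q": "biol",
            "langs": "en,fr",  # comma-separated list, new in this patch
            "taxonomy_names": "labels",
            "size": 5,
        },
        timeout=10,
    )
    resp.raise_for_status()
    for option in resp.json()["options"]:
        # options are deduplicated across languages and sorted by descending score
        print(option["score"], option["id"], option["text"], option["taxonomy_name"])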