diff --git a/.gitignore b/.gitignore
index 245a3504..c74fef74 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@ data/taxonomies
data/**/*.jsonl
products*.jsonl.gz
data/searchalicious-openapi.yml
+data/searchalicious-config-schema.yml
+data/searchalicious-settings-schema.yml
# Byte-compiled / optimized / DLL files
__pycache__/
diff --git a/Makefile b/Makefile
index 3d832ee2..97d894fc 100644
--- a/Makefile
+++ b/Makefile
@@ -164,6 +164,14 @@ generate-custom-elements: _ensure_network
@echo "🔎 Generating custome-elements.json …"
${DOCKER_COMPOSE} run --rm search_nodejs npm run analyze
+generate-config-schema: _ensure_network
+ @echo "🔎 Generating config-schema.yml …"
+ ${DOCKER_COMPOSE} run --rm api python3 -m app export-config-schema /opt/search/data/searchalicious-config-schema.yml
+
+generate-settings-schema: _ensure_network
+ @echo "🔎 Generating settings-schema.yml …"
+ ${DOCKER_COMPOSE} run --rm api python3 -m app export-settings-schema /opt/search/data/searchalicious-settings-schema.yml
+
#-------#
# Tests #
#-------#
diff --git a/app/cli/main.py b/app/cli/main.py
index ebf79af9..bd62c4f4 100644
--- a/app/cli/main.py
+++ b/app/cli/main.py
@@ -225,7 +225,7 @@ def export_openapi(
exists=None,
file_okay=True,
dir_okay=False,
- help="Path of target_path the YAML or JSON data file",
+ help="Path of the YAML or JSON data file",
)
):
"""Export OpenAPI specification to a file."""
@@ -248,5 +248,51 @@ def export_openapi(
print(f"spec written to {target_path}")
+def export_schema(
+ class_: type["app.config.Config"] | type["app.config.Settings"], target_path: Path
+):
+ """Export schema to a file."""
+ import json
+
+ import yaml
+
+ from app.config import ConfigGenerateJsonSchema
+
+ schema = class_.model_json_schema(schema_generator=ConfigGenerateJsonSchema)
+
+    print("writing schema")
+ with open(target_path, "w") as f:
+ if str(target_path).endswith(".json"):
+ json.dump(schema, f, indent=2)
+ else:
+ yaml.safe_dump(schema, f, sort_keys=False)
+
+ print(f"schema written to {target_path}")
+
+
+schema_target_path = typer.Argument(
+ exists=None,
+ file_okay=True,
+ dir_okay=False,
+ help="Path of the YAML or JSON data file",
+)
+
+
+@cli.command()
+def export_config_schema(target_path: Path = schema_target_path):
+    """Export the Configuration JSON schema to a file."""
+ from app.config import Config
+
+ export_schema(Config, target_path)
+
+
+@cli.command()
+def export_settings_schema(target_path: Path = schema_target_path):
+    """Export the Settings JSON schema to a file."""
+ from app.config import Settings
+
+ export_schema(Settings, target_path)
+
+
def main() -> None:
cli()
diff --git a/app/config.py b/app/config.py
index 3182059e..6ce2adcc 100644
--- a/app/config.py
+++ b/app/config.py
@@ -1,6 +1,6 @@
-import json
import logging
from enum import StrEnum, auto
+from inspect import cleandoc as cd_
from pathlib import Path
from typing import Annotated, Any
@@ -11,8 +11,17 @@
log = logging.getLogger(__name__)
+ES_DOCS_URL = "https://www.elastic.co/guide/en/elasticsearch/reference/current"
+
class LoggingLevel(StrEnum):
+ """Accepted logging levels
+
+    * NOTSET - means no logs
+ * DEBUG / INFO / WARNING / ERROR / CRITICAL
+ - match standard Python logging levels
+ """
+
NOTSET = "NOTSET"
DEBUG = "DEBUG"
INFO = "INFO"
@@ -41,17 +50,77 @@ class ScriptType(StrEnum):
class Settings(BaseSettings):
- # Path of the search-a-licious yaml configuration file
- config_path: Path | None = None
- redis_reader_timeout: int = 5
- elasticsearch_url: str = "http://localhost:9200"
- redis_host: str = "localhost"
- redis_port: int = 6379
- sentry_dns: str | None = None
- log_level: LoggingLevel = LoggingLevel.INFO
- taxonomy_cache_dir: Path = Path("data/taxonomies")
- # User-Agent used when fetching resources (taxonomies) or documents
- user_agent: str = "search-a-licious"
+ """Settings for Search-a-licious
+
+    The most important setting is `config_path`.
+
+    These settings can be overridden through the environment
+    by using the variable name in capital letters.
+    If you use docker compose, a good way to do that
+    is to modify these values in your .env file.
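+
+    For example, setting the environment variable `REDIS_HOST` overrides `redis_host`.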
+ """
+
+ config_path: Annotated[
+ Path | None,
+ Field(
+ description=cd_(
+ """Path to the search-a-licious yaml configuration file.
+
+ See [Explain configuration file](../explain-configuration/) for more information
+ """
+ )
+ ),
+ ] = None
+ elasticsearch_url: Annotated[
+ str,
+ Field(
+ description=cd_(
+ """URL to the ElasticSearch instance
+
+                Bear in mind this is from inside the container.
+ """
+ )
+ ),
+ ] = "http://localhost:9200"
+ redis_host: Annotated[
+ str,
+ Field(
+ description=cd_(
+                """Host for the Redis instance containing the event stream
+
+                Bear in mind this is from inside the container.
+ """
+ )
+ ),
+ ] = "localhost"
+ redis_port: Annotated[
+ int,
+        Field(description="Port for the Redis instance containing the event stream"),
+ ] = 6379
+ redis_reader_timeout: Annotated[
+        int, Field(description="Timeout in seconds when reading the Redis event stream")
+ ] = 5
+ sentry_dns: Annotated[
+ str | None,
+ Field(
+            description="Sentry DSN for incident reporting; if None, no incidents are reported"
+ ),
+ ] = None
+ log_level: Annotated[
+ LoggingLevel, Field(description=f"Log level. {LoggingLevel.__doc__}")
+ ] = LoggingLevel.INFO
+ taxonomy_cache_dir: Annotated[
+ Path,
+ Field(
+            description="Directory in which to store taxonomies before ingestion into ElasticSearch"
+ ),
+ ] = Path("data/taxonomies")
+ user_agent: Annotated[
+ str,
+ Field(
+ description="User-Agent used when fetching resources (taxonomies) or documents"
+ ),
+ ] = "search-a-licious"
settings = Settings()
@@ -105,17 +174,65 @@ def generate(self, schema, mode="validation"):
class TaxonomySourceConfig(BaseModel):
- name: Annotated[str, Field(description="name of the taxonomy")]
+ """Configuration on how to fetch a particular taxonomy."""
+
+ name: Annotated[
+ str,
+ Field(
+ description=cd_(
+ """Name of the taxonomy
+
+ This is the name you will use in the configuration (and the API)
+ to reference this taxonomy
+ """
+ )
+ ),
+ ]
url: Annotated[
HttpUrl,
Field(
- description="URL of the taxonomy, must be in JSON format and follows Open Food Facts "
- "taxonomy format."
+ description=cd_(
+ """URL of the taxonomy.
+
+ The target file must be in JSON format
+                and follow the Open Food Facts JSON taxonomy format.
+
+                This is a dict where each key corresponds to a taxonomy entry id;
+                values are dicts with the following properties:
+
+ * name: contains a dict giving the name (string) for this entry
+ in various languages (keys are language codes)
+ * synonyms: contains a dict giving a list of synonyms by language code
+ * parents: contains a list of direct parent ids (taxonomy is a directed acyclic graph)
+
+                Other keys correspond to properties associated with this entry (e.g. wikidata id).
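+
+                For instance, an entry might look like (illustrative values):
+
+                `{"en:banana": {"name": {"en": "Banana"}, "synonyms": {"en": ["bananas"]}, "parents": []}}`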
+ """
+ )
),
]
class FieldType(StrEnum):
+ """Supported field types in Search-a-Licious are:
+
+ * keyword: string values that won't be interpreted (tokenized).
+ Good for things like tags, serial, property values, etc.
+ * date: Date fields
+ * double, float, half_float, scaled_float:
+ different ways of storing floats with different capacity
+    * short, integer, long, unsigned_long:
+      integers (with different capacities: 16 / 32 / 64 bits)
+ * bool: boolean (true / false) values
+ * text: a text which is tokenized to enable full text search
+ * text_lang: like text, but with different values in different languages.
+      Tokenization will use analyzers specific to each language.
+ * taxonomy: a field akin to keyword but
+ with support for matching using taxonomy synonyms and translations
+    * disabled: a field that is neither stored nor searchable
+ (see [Elasticsearch help])
+ * object: this field contains a dict with sub-fields.
+ """
+
keyword = auto()
date = auto()
half_float = auto()
@@ -136,63 +253,107 @@ class FieldType(StrEnum):
object = auto()
def is_numeric(self):
+        """Return whether this field type can be considered numeric"""
return self in (FieldType.integer, FieldType.float, FieldType.double)
+# add url to FieldType doc
+if FieldType.__doc__:
+ FieldType.__doc__ += f"\n\n[Elasticsearch help]: {ES_DOCS_URL}/enabled.html"
+
+
class FieldConfig(BaseModel):
# name of the field (internal field), it's added here for convenience.
# It's set by the `add_field_name_to_each_field` classmethod.
- name: Annotated[str, Field(description="name of the field, must be unique")] = ""
+    name: Annotated[str, Field(description="name of the field (must be unique)")] = ""
type: Annotated[
FieldType,
- Field(description="type of the field, see `FieldType` for possible values"),
+ Field(description=f"Type of the field\n\n{cd_(FieldType.__doc__)}"),
]
required: Annotated[
bool,
- Field(description="if required=True, the field is required in the input data"),
+ Field(
+ description=cd_(
+ """if required=True, the field is required in the input data
+
+ An entry that does not contains a value for this field will be rejected.
+ """
+ )
+ ),
] = False
input_field: Annotated[
str | None,
- Field(description="name of the input field to use when importing data"),
+ Field(
+ description=cd_(
+ """name of the input field to use when importing data
+
+                By default, Search-a-licious uses the same name as the field name.
+
+ This is useful to index the same field using different types or configurations.
+ """
+ )
+ ),
] = None
- #
split: Annotated[
bool,
Field(
- description="do we split the input field with `split_separator` ?\n\n"
- "This is useful if you have some text fields that contains list of values, "
- "(for example a comma separated list of values, like apple,banana,carrot).\n\n"
- "You must set split_separator to the character that separates the values in the dataset."
+ description=cd_(
+                """do we split the input field with `split_separator`?
+
+                This is useful if you have some text fields that contain lists of values
+                (for example a comma-separated list of values, like apple,banana,carrot).
+
+ You must set split_separator to the character that separates the values in the dataset.
+ """
+ )
),
] = False
full_text_search: Annotated[
bool,
Field(
- description="do we include perform full text search using this field. If "
- "false, the field is only used during search when filters involving this "
- "field are provided."
+ description=cd_(
+                """Whether this field is included in default full text search.
+
+ If `false`, the field is only used during search
+ when filters involving this field are provided
+ (as opposed to full text search expressions without any explicit field).
+ """
+ )
),
] = False
bucket_agg: Annotated[
bool,
Field(
- description="do we add an bucket aggregation to the elasticsearch query for this field. "
- "It is used to return a 'faceted-view' with the number of results for each facet value. "
- "Only valid for keyword or numeric field types."
+ description=cd_(
+                """do we add a bucket aggregation to the elasticsearch query for this field.
+
+ It is used to return a 'faceted-view' with the number of results for each facet value,
+ or to generate bar charts.
+
+ Only valid for keyword or numeric field types.
+ """
+ )
),
] = False
taxonomy_name: Annotated[
str | None,
Field(
- description="the name of the taxonomy associated with this field. "
- "It must only be provided for taxonomy field type."
+ description=cd_(
+ """the name of the taxonomy associated with this field.
+
+                It must only be provided for the `taxonomy` field type.
+ """
+ )
),
] = None
add_taxonomy_synonyms: Annotated[
bool,
Field(
- description="if True, add all synonyms of the taxonomy values to the index. "
- "The flag is ignored if the field type is not `taxonomy`."
+ description=cd_(
+ """if True, add all synonyms of the taxonomy values to the index.
+ The flag is ignored if the field type is not `taxonomy`.
+ """
+ )
),
] = True
@@ -213,81 +374,138 @@ def get_input_field(self):
return self.input_field or self.name
def has_lang_subfield(self) -> bool:
+        """Return whether this field type is supposed to have different values
+        per language"""
return self.type in (FieldType.taxonomy, FieldType.text_lang)
-class ESIndexConfig(BaseModel):
- name: Annotated[str, Field(description="name of the index alias to use")]
- id_field_name: Annotated[
+class BaseESIndexConfig(BaseModel):
+ """Base class for configuring ElasticSearch indexes"""
+
+ name: Annotated[
str,
Field(
- description="name of the field to use for `_id`."
- "it is mandatory to provide one.\n\n "
- "If your dataset does not have an identifier field, "
- "you should use a document preprocessor to compute one."
+ description=cd_(
+ """Name of the index alias to use.
+
+ Search-a-licious will create an index using this name and an import date,
+                but the alias will always point to the latest index.
+
+                The alias must not already exist in your ElasticSearch instance.
+ """
+ )
),
]
- last_modified_field_name: Annotated[
- str,
+ number_of_shards: Annotated[
+ int,
Field(
- description="name of the field containing the date of last modification, "
- "used for incremental updates using Redis queues. "
- "The field value must be an int/float representing the timestamp.\n\n"
+ description=cd_(
+ f"""Number of shards to use for the index.
+
+ Shards are useful to distribute the load on your cluster.
+ (see [index settings]({ES_DOCS_URL}/index-modules.html#_static_index_settings))
+ """
+ )
),
- ]
- number_of_shards: Annotated[
- int, Field(description="number of shards to use for the index")
] = 4
number_of_replicas: Annotated[
- int, Field(description="number of replicas to use for the index")
+ int,
+ Field(
+ description=cd_(
+ f"""Number of replicas to use for the index.
+
+                More replicas mean more resiliency but also more disk space and memory.
+
+ (see [index settings]({ES_DOCS_URL}/index-modules.html#_static_index_settings))
+ """
+ )
+ ),
] = 1
-class TaxonomyIndexConfig(BaseModel):
- """We have an index storing multiple taxonomies
+class ESIndexConfig(BaseESIndexConfig):
+ """This is the configuration for the main index containing the data.
- It enables functions like auto-completion, or field suggestions
- as well as enrichment of requests with synonyms
+ It's used to create the index in ElasticSearch, and configure its mappings
+ (along with the *fields* config)
"""
- name: Annotated[
+ id_field_name: Annotated[
str,
- Field(description="name of the taxonomy index alias to use"),
+ Field(
+ description=cd_(
+ """Name of the field to use for `_id`.
+                It is mandatory to provide one.
+
+ If your dataset does not have an identifier field,
+ you should use a document preprocessor to compute one (see `preprocessor`).
+ """
+ )
+ ),
]
- number_of_shards: Annotated[
- int, Field(description="number of shards to use for the index")
- ] = 4
- number_of_replicas: Annotated[
- int, Field(description="number of replicas to use for the index")
- ] = 1
+ last_modified_field_name: Annotated[
+ str,
+ Field(
+ description=cd_(
+                """Name of the field containing the date of last modification
+                in your indexed objects.
+
+ This is used for incremental updates using Redis queues.
+
+ The field value must be an int/float representing the timestamp.
+ """
+ )
+ ),
+ ]
+
+
+class TaxonomyIndexConfig(BaseESIndexConfig):
+ """This is the configuration of
+ the ElasticSearch index storing the taxonomies.
+
+ All taxonomies are stored within the same index.
+
+ It enables functions like auto-completion, or field suggestions
+ as well as enrichment of requests with synonyms.
+ """
class TaxonomyConfig(BaseModel):
"""Configuration of taxonomies,
- that is collections of entries with synonyms in multiple languages
+ that is collections of entries with synonyms in multiple languages.
Field may be linked to taxonomies.
+
+ It enables enriching search with synonyms,
+ as well as providing suggestions,
+ or informative facets.
"""
sources: Annotated[
list[TaxonomySourceConfig],
- Field(description="configurations of used taxonomies"),
+ Field(description="Configurations of taxonomies that this project will use."),
]
exported_langs: Annotated[
list[str],
Field(
- description="a list of languages for which we want taxonomized fields "
- "to be always exported during indexing. During indexing, we use the taxonomy "
- "to translate every taxonomized field in a language-specific subfield. The list "
- "of language depends on the value defined here and on the optional "
- "`taxonomy_langs` field that can be defined in each document.",
+ description=cd_(
+ """a list of languages for which
+ we want taxonomized fields to be always exported during indexing.
+
+ During indexing, we use the taxonomy to translate every taxonomized field
+ in a language-specific subfield.
+
+                The list of languages depends on the value defined here and on the optional
+ `taxonomy_langs` field that can be defined in each document.
+
+                Beware that providing many languages might inflate the index size.
+ """,
+ )
),
]
index: Annotated[
TaxonomyIndexConfig,
- Field(
- description="configuration of the taxonomy index. There is a single index for all taxonomies."
- ),
+ Field(description=TaxonomyIndexConfig.__doc__),
]
@@ -326,20 +544,39 @@ class ScriptConfig(BaseModel):
# Or some type checking/transformation ?
+INDEX_CONFIG_INDEX_DESCRIPTION = """
+Through these settings, you can tweak some of the index settings.
+"""
+
+
class IndexConfig(BaseModel):
- """Inside the config file we can have several indexes defined.
+ """This object gives configuration for one index.
- This object gives configuration for one index.
+    One index usually corresponds to one dataset.
"""
- index: Annotated[
- ESIndexConfig, Field(description="configuration of the Elasticsearch index")
- ]
+ index: Annotated[ESIndexConfig, Field(description=ESIndexConfig.__doc__)]
fields: Annotated[
dict[str, FieldConfig],
Field(
- description="configuration of all fields in the index, keys are field "
- "names and values contain the field configuration"
+ description=cd_(
+ """Configuration of all fields we need to store in the index.
+
+ Keys are field names,
+ values contain the field configuration.
+
+ This is a very important part of the configuration.
+
+                Most of the ElasticSearch mapping will depend on it.
+                ElasticSearch will also use this configuration
+                to provide the intended behaviour.
+
+ (see also [Explain Configuration](./explain_configuration.md#fields))
+
+ If you change those settings you will have to re-index all the data.
+ (But you can do so in the background).
+ """
+ )
),
]
split_separator: Annotated[
@@ -357,30 +594,36 @@ class IndexConfig(BaseModel):
] = "_"
primary_color: Annotated[
str,
- Field(description="Used for vega charts. Should be html code."),
+        Field(description="Used for Vega charts. Should be a CSS color code."),
] = "#aaa"
accent_color: Annotated[
str,
- Field(
- description="Used for vega. Should be html code."
- 'and the language code, ex: product_name_it if lang_separator="_"'
- ),
+        Field(description="Used for Vega charts. Should be a CSS color code."),
] = "#222"
- taxonomy: Annotated[
- TaxonomyConfig, Field(description="configuration of the taxonomies used")
- ]
+ taxonomy: Annotated[TaxonomyConfig, Field(description=TaxonomyConfig.__doc__)]
supported_langs: Annotated[
list[str],
Field(
- description="A list of all supported languages, it is used to build index mapping"
+            description="A list of all supported languages; it is used to build the index mapping",
+ examples=[["en", "fr", "it"]],
),
]
document_fetcher: Annotated[
str,
Field(
- description="The full qualified reference to the document fetcher, i.e. the class "
- "responsible from fetching the document using the document ID present in the Redis "
- "Stream.",
+ description=cd_(
+                """The fully qualified reference to the document fetcher,
+                i.e. the class responsible for fetching the document
+                using the document ID present in the Redis Stream.
+
+ It should inherit `app._import.BaseDocumentFetcher`
+ and specialize the `fetch_document` method.
+
+                To keep things sleek,
+                you generally have only a few fields in the event stream payload.
+ This class will fetch the full document using your application API.
+ """
+ ),
examples=["app.openfoodfacts.DocumentFetcher"],
),
]
@@ -388,9 +631,18 @@ class IndexConfig(BaseModel):
Annotated[
str,
Field(
- description="The full qualified reference to the preprocessor to use before "
- "data import. This is used to adapt the data schema or to add search-a-licious "
- "specific fields for example.",
+ description=cd_(
+                    """The fully qualified reference to the preprocessor
+ to use before data import.
+
+ This class must inherit `app.indexing.BaseDocumentPreprocessor`
+ and specialize the `preprocess` method.
+
+ This is used to adapt the data schema
+ or to add search-a-licious specific fields
+ for example.
+ """
+ ),
examples=["app.openfoodfacts.DocumentPreprocessor"],
),
]
@@ -400,9 +652,16 @@ class IndexConfig(BaseModel):
Annotated[
str,
Field(
- description="The full qualified reference to the elasticsearch result processor "
- "to use after search query to Elasticsearch. This is used to add custom fields "
- "for example.",
+ description=cd_(
+                    """The fully qualified reference to the elasticsearch result processor
+                    to use after a search query to Elasticsearch.
+
+                    This class must inherit `app.postprocessing.BaseResultProcessor`
+                    and specialize the `process_after` method.
+
+                    This can be used to add custom fields computed from index content.
+ """
+ ),
examples=["app.openfoodfacts.ResultProcessor"],
),
]
@@ -412,23 +671,48 @@ class IndexConfig(BaseModel):
Annotated[
dict[str, ScriptConfig],
Field(
- description="You can add scripts that can be used for sorting results",
+ description=cd_(
+ """You can add scripts that can be used for sorting results.
+
+                    Each key is a script name, with its configuration.
+ """
+ ),
),
]
| None
) = None
match_phrase_boost: Annotated[
- float, Field(description="How much we boost exact matches on individual fields")
+ float,
+ Field(
+ description=cd_(
+ """How much we boost exact matches on individual fields
+
+ This only makes sense when using "best match" order.
+ """
+ )
+ ),
] = 2.0
document_denylist: Annotated[
- set[str], Field(description="list of documents IDs to ignore")
+ set[str],
+ Field(
+ description=cd_(
+                """List of document IDs to ignore.
+
+ Use this to skip some documents at indexing time.
+ """
+ )
+ ),
] = Field(default_factory=set)
redis_stream_name: Annotated[
str | None,
Field(
- description="name of the Redis stream to read from when listening to document updates. "
- "If not provided, document updates won't be listened to for this index."
+ description=cd_(
+ """Name of the Redis stream to read from when listening to document updates.
+
+ If not provided, document updates won't be listened to for this index.
+ """
+ )
),
] = None
@@ -473,6 +757,7 @@ def field_references_must_exist_and_be_valid(self):
@field_validator("fields")
@classmethod
def add_field_name_to_each_field(cls, fields: dict[str, FieldConfig]):
+ """It's handy to have the name of the field in the field definition"""
for field_name, field_item in fields.items():
field_item.name = field_name
return fields
@@ -503,17 +788,31 @@ def get_fields_with_bucket_agg(self):
]
+CONFIG_DESCRIPTION_INDICES = """
+A Search-a-licious instance only has one configuration file,
+but is capable of serving multiple datasets.
+
+It provides a section for each index you want to create (corresponding to a dataset).
+
+The key is the ID of the index that can be referenced at query time.
+One index corresponds to a specific set of documents and can be queried independently.
+
+If you have multiple indexes, one of those indexes must be designated as the default one,
+see `default_index`.
+"""
+
+
class Config(BaseModel):
- """This is the global config object that reflects
- the yaml configuration file.
+ """Search-a-licious server configuration.
+
+    The configuration is loaded from a YAML file
+    that must satisfy this schema.
Validations will be performed while we load it.
"""
indices: dict[str, IndexConfig] = Field(
- description="configuration of indices. "
- "The key is the ID of the index that can be referenced at query time. "
- "One index corresponds to a specific set of documents and can be queried independently."
+ description="configuration of indices.\n\n" + CONFIG_DESCRIPTION_INDICES
)
default_index: Annotated[
str,
@@ -560,16 +859,6 @@ def from_yaml(cls, path: Path) -> "Config":
data = yaml.safe_load(f)
return cls(**data)
- @classmethod
- def export_json_schema(cls):
- """Export JSON schema."""
- (Path(__file__).parent.parent / "config_schema.json").write_text(
- json.dumps(
- cls.model_json_schema(schema_generator=ConfigGenerateJsonSchema),
- indent=4,
- )
- )
-
# CONFIG is a global variable that contains the search-a-licious configuration
# used. It is specified by the envvar CONFIG_PATH.
diff --git a/config_schema.json b/config_schema.json
deleted file mode 100644
index 925684dd..00000000
--- a/config_schema.json
+++ /dev/null
@@ -1,374 +0,0 @@
-{
- "$defs": {
- "ESIndexConfig": {
- "properties": {
- "name": {
- "description": "name of the index alias to use",
- "title": "Name",
- "type": "string"
- },
- "id_field_name": {
- "description": "name of the field to use for `_id`",
- "title": "Id Field Name",
- "type": "string"
- },
- "last_modified_field_name": {
- "description": "name of the field containing the date of last modification, used for incremental updates using Redis queues. The field value must be an int/float representing the timestamp.",
- "title": "Last Modified Field Name",
- "type": "string"
- },
- "number_of_shards": {
- "default": 4,
- "description": "number of shards to use for the index",
- "title": "Number Of Shards",
- "type": "integer"
- },
- "number_of_replicas": {
- "default": 1,
- "description": "number of replicas to use for the index",
- "title": "Number Of Replicas",
- "type": "integer"
- }
- },
- "required": [
- "name",
- "id_field_name",
- "last_modified_field_name"
- ],
- "title": "ESIndexConfig",
- "type": "object"
- },
- "FieldConfig": {
- "properties": {
- "name": {
- "default": "",
- "description": "name of the field, must be unique",
- "title": "Name",
- "type": "string"
- },
- "type": {
- "allOf": [
- {
- "$ref": "#/$defs/FieldType"
- }
- ],
- "description": "type of the field, see `FieldType` for possible values"
- },
- "required": {
- "default": false,
- "description": "if required=True, the field is required in the input data",
- "title": "Required",
- "type": "boolean"
- },
- "input_field": {
- "anyOf": [
- {
- "type": "string"
- },
- {
- "type": "null"
- }
- ],
- "default": null,
- "description": "name of the input field to use when importing data",
- "title": "Input Field"
- },
- "split": {
- "default": false,
- "description": "do we split the input field with `split_separator`",
- "title": "Split",
- "type": "boolean"
- },
- "full_text_search": {
- "default": false,
- "description": "do we include perform full text search using this field. If false, the field is only used during search when filters involving this field are provided.",
- "title": "Full Text Search",
- "type": "boolean"
- },
- "bucket_agg": {
- "default": false,
- "description": "do we add an bucket aggregation to the elasticsearch query for this field. It is used to return a 'faceted-view' with the number of results for each facet value. Only valid for keyword or numeric field types.",
- "title": "Bucket Agg",
- "type": "boolean"
- },
- "taxonomy_name": {
- "anyOf": [
- {
- "type": "string"
- },
- {
- "type": "null"
- }
- ],
- "default": null,
- "description": "the name of the taxonomy associated with this field. It must only be provided for taxonomy field type.",
- "title": "Taxonomy Name"
- },
- "add_taxonomy_synonyms": {
- "default": true,
- "description": "if True, add all synonyms of the taxonomy values to the index. The flag is ignored if the field type is not `taxonomy`.",
- "title": "Add Taxonomy Synonyms",
- "type": "boolean"
- }
- },
- "required": [
- "type"
- ],
- "title": "FieldConfig",
- "type": "object"
- },
- "FieldType": {
- "enum": [
- "keyword",
- "date",
- "half_float",
- "scaled_float",
- "float",
- "double",
- "integer",
- "short",
- "long",
- "unsigned_long",
- "bool",
- "text",
- "text_lang",
- "taxonomy",
- "disabled",
- "object"
- ],
- "title": "FieldType",
- "type": "string"
- },
- "IndexConfig": {
- "properties": {
- "index": {
- "allOf": [
- {
- "$ref": "#/$defs/ESIndexConfig"
- }
- ],
- "description": "configuration of the Elasticsearch index"
- },
- "fields": {
- "additionalProperties": {
- "$ref": "#/$defs/FieldConfig"
- },
- "description": "configuration of all fields in the index, keys are field names and values contain the field configuration",
- "title": "Fields",
- "type": "object"
- },
- "split_separator": {
- "default": ",",
- "description": "separator to use when splitting values, for fields that have split=True",
- "title": "Split Separator",
- "type": "string"
- },
- "lang_separator": {
- "default": "_",
- "description": "for `text_lang` FieldType, the separator between the name of the field and the language code, ex: product_name_it if lang_separator=\"_\"",
- "title": "Lang Separator",
- "type": "string"
- },
- "taxonomy": {
- "allOf": [
- {
- "$ref": "#/$defs/TaxonomyConfig"
- }
- ],
- "description": "configuration of the taxonomies used"
- },
- "supported_langs": {
- "description": "A list of all supported languages, it is used to build index mapping",
- "items": {
- "type": "string"
- },
- "title": "Supported Langs",
- "type": "array"
- },
- "document_fetcher": {
- "description": "The full qualified reference to the document fetcher, i.e. the class responsible from fetching the document using the document ID present in the Redis Stream.",
- "examples": [
- "app.openfoodfacts.DocumentFetcher"
- ],
- "title": "Document Fetcher",
- "type": "string"
- },
- "preprocessor": {
- "anyOf": [
- {
- "description": "The full qualified reference to the preprocessor to use before data import. This is used to adapt the data schema or to add search-a-licious specific fields for example.",
- "examples": [
- "app.openfoodfacts.DocumentPreprocessor"
- ],
- "type": "string"
- },
- {
- "type": "null"
- }
- ],
- "default": null,
- "title": "Preprocessor"
- },
- "result_processor": {
- "anyOf": [
- {
- "description": "The full qualified reference to the elasticsearch result processor to use after search query to Elasticsearch. This is used to add custom fields for example.",
- "examples": [
- "app.openfoodfacts.ResultProcessor"
- ],
- "type": "string"
- },
- {
- "type": "null"
- }
- ],
- "default": null,
- "title": "Result Processor"
- },
- "match_phrase_boost": {
- "default": 2.0,
- "description": "How much we boost exact matches on individual fields",
- "title": "Match Phrase Boost",
- "type": "number"
- },
- "document_denylist": {
- "description": "list of documents IDs to ignore",
- "items": {
- "type": "string"
- },
- "title": "Document Denylist",
- "type": "array",
- "uniqueItems": true
- },
- "redis_stream_name": {
- "anyOf": [
- {
- "type": "string"
- },
- {
- "type": "null"
- }
- ],
- "default": null,
- "description": "name of the Redis stream to read from when listening to document updates. If not provided, document updates won't be listened to for this index.",
- "title": "Redis Stream Name"
- }
- },
- "required": [
- "index",
- "fields",
- "taxonomy",
- "supported_langs",
- "document_fetcher"
- ],
- "title": "IndexConfig",
- "type": "object"
- },
- "TaxonomyConfig": {
- "properties": {
- "sources": {
- "description": "configurations of used taxonomies",
- "items": {
- "$ref": "#/$defs/TaxonomySourceConfig"
- },
- "title": "Sources",
- "type": "array"
- },
- "exported_langs": {
- "description": "a list of languages for which we want taxonomized fields to be always exported during indexing. During indexing, we use the taxonomy to translate every taxonomized field in a language-specific subfield. The list of language depends on the value defined here and on the optional `taxonomy_langs` field that can be defined in each document.",
- "items": {
- "type": "string"
- },
- "title": "Exported Langs",
- "type": "array"
- },
- "index": {
- "allOf": [
- {
- "$ref": "#/$defs/TaxonomyIndexConfig"
- }
- ],
- "description": "configuration of the taxonomy index. There is a single index for all taxonomies."
- }
- },
- "required": [
- "sources",
- "exported_langs",
- "index"
- ],
- "title": "TaxonomyConfig",
- "type": "object"
- },
- "TaxonomyIndexConfig": {
- "properties": {
- "name": {
- "description": "name of the taxonomy index alias to use",
- "title": "Name",
- "type": "string"
- },
- "number_of_shards": {
- "default": 4,
- "description": "number of shards to use for the index",
- "title": "Number Of Shards",
- "type": "integer"
- },
- "number_of_replicas": {
- "default": 1,
- "description": "number of replicas to use for the index",
- "title": "Number Of Replicas",
- "type": "integer"
- }
- },
- "required": [
- "name"
- ],
- "title": "TaxonomyIndexConfig",
- "type": "object"
- },
- "TaxonomySourceConfig": {
- "properties": {
- "name": {
- "description": "name of the taxonomy",
- "title": "Name",
- "type": "string"
- },
- "url": {
- "description": "URL of the taxonomy, must be in JSON format and follows Open Food Facts taxonomy format.",
- "format": "uri",
- "maxLength": 2083,
- "minLength": 1,
- "title": "Url",
- "type": "string"
- }
- },
- "required": [
- "name",
- "url"
- ],
- "title": "TaxonomySourceConfig",
- "type": "object"
- }
- },
- "properties": {
- "indices": {
- "additionalProperties": {
- "$ref": "#/$defs/IndexConfig"
- },
- "description": "configuration of indices. The key is the ID of the index that can be referenced at query time. One index corresponds to a specific set of documents and can be queried independently.",
- "title": "Indices",
- "type": "object"
- },
- "default_index": {
- "description": "the default index to use when no index is specified in the query",
- "title": "Default Index",
- "type": "string"
- }
- },
- "required": [
- "indices",
- "default_index"
- ],
- "title": "JSON schema for search-a-licious configuration file",
- "type": "object",
- "$schema": "https://json-schema.org/draft/2020-12/schema"
-}
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index f5ee159f..4467b325 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -35,7 +35,7 @@ x-api-common: &api-common
image: ghcr.io/openfoodfacts/search-a-licious/search_service_image:${TAG:-dev}
restart: ${RESTART_POLICY:-always}
environment:
- - ELASTICSEARCH_URL=http://es01:9200
+ - ELASTICSEARCH_URL=${ELASTICSEARCH_URL:-http://es01:9200}
- SENTRY_DNS
- LOG_LEVEL
- REDIS_HOST
diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py
index 4c9066ad..4468747d 100644
--- a/docs/sphinx/conf.py
+++ b/docs/sphinx/conf.py
@@ -35,7 +35,7 @@
"github_repo": "search-a-licious",
"github_banner": True,
"extra_nav_links": {
- "Back to main doc": "/search-a-licious",
+ "🢀 Back to main doc": "/search-a-licious",
},
}
diff --git a/docs/users/explain-configuration.md b/docs/users/explain-configuration.md
index 30c16beb..fe5c8a86 100644
--- a/docs/users/explain-configuration.md
+++ b/docs/users/explain-configuration.md
@@ -5,8 +5,72 @@ and all the rest works (at least for main scenarios).
The configuration file is a YAML file.
+## One configuration, multiple datasets
+
+A Search-a-licious instance only has one configuration file,
+but is capable of serving multiple datasets.
+
It provides a section for each index you want to create (corresponding to a dataset).
+If you have more than one dataset, one must be declared the default (see [default_index](../ref-config/searchalicious-config-schema.html#default_index))
+
+## Main sections
+
+For each index, the main sections are (a skeleton follows the list):
+
+* index: some configuration of the Elasticsearch index
+* fields: the fields you want to put in the index, their type and other configurations
+* taxonomy: definitions of taxonomies that are used by this index
+* redis_stream_name and document_fetcher: if you use continuous updates, you will need to define these
+* preprocessor and result_processor: two fields that let you handle the specifics of your dataset
+* scripts: to use sort by script (see [How to use scripts](./how-to-use-scripts.md))
+
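+A minimal sketch of the overall structure (the index id and placeholder values are illustrative):
+
+```yaml
+indices:
+  my-dataset:          # the index id, used at query time
+    index:
+      ...              # ElasticSearch index settings
+    fields:
+      ...              # fields to store and search
+    taxonomy:
+      ...              # taxonomies used by this index
+default_index: my-dataset
+```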
+
+## Index configuration
+
+Search-a-licious is built upon Elasticsearch.
+
+This section provides some important fields to control the way it is used.
+
+`id_field_name` is particularly important as it must contain a field that uniquely identifies each item.
+If you don't have such a field, you might use `preprocessor` to compute one.
+It is important to have such an id to be able to use [continuous updates](FIXME).
+
+`last_modified_field_name` is also important for continuous updates to decide
+where to start the event stream processing.
+
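+An illustrative `index` section (field names are examples; shard and replica values are the defaults):
+
+```yaml
+index:
+  name: my-dataset                    # alias name, must not already exist
+  id_field_name: code                 # field uniquely identifying each item
+  last_modified_field_name: last_modified_t
+  number_of_shards: 4
+  number_of_replicas: 1
+```
+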
+## Fields
+
+This is one of the most important sections.
+
+It specifies what will be stored in your index,
+which fields will be searchable, and how.
+
+You have to plan in advance how you configure this.
+
+Think carefully about:
+* fields you want to search and how you want to search them
+* which information you need to display in search results
+* what you need to sort on
+* which facets you want to display
+* which charts you need to build
+
+Changing this section will probably involve a full re-indexing of all your items.
+
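+An illustrative `fields` section, mixing a full text field, a taxonomy-backed facet and a numeric field:
+
+```yaml
+fields:
+  product_name:
+    type: text_lang         # full text search, with per-language subfields
+    full_text_search: true
+  categories:
+    type: taxonomy
+    taxonomy_name: category
+    bucket_agg: true        # enables a facet on this field
+  price:
+    type: float
+```
+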
+Read more in the [reference documentation](../ref-config/searchalicious-config-schema.html#fields).
+
+## Document fetcher, pre-processors and post-processors
+
+It is not always straightforward to index an item.
+
+Search-a-licious offers a way for you to customize some critical operations using Python code.
+
+* preprocessor adapts your documents before they are indexed
+* whereas result_processor adapts each result returned by a search; keep it lightweight!
+* document_fetcher is only used for continuous updates, to fetch documents using an API
+
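+For example, reusing the class paths given as examples in the configuration schema:
+
+```yaml
+document_fetcher: app.openfoodfacts.DocumentFetcher
+preprocessor: app.openfoodfacts.DocumentPreprocessor
+result_processor: app.openfoodfacts.ResultProcessor
+```
+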
+Read more in the [reference documentation](../ref-config/searchalicious-config-schema.html).
+
+## Scripts
-### Split separator
\ No newline at end of file
+You can also add scripts for sorting documents. See [How to use scripts](./how-to-use-scripts.md).
\ No newline at end of file
diff --git a/docs/users/how-to-install.md b/docs/users/how-to-install.md
index 9d25db03..e9b6fe48 100644
--- a/docs/users/how-to-install.md
+++ b/docs/users/how-to-install.md
@@ -8,5 +8,4 @@ All configuration are passed through environment variables to services through t
The only required change is to set the `CONFIG_PATH` variable to the path of your YAML configuration file. This file is used to configure the search-a-licious indexer and search services.
-
-
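+For example, in your `.env` file you might set something like `CONFIG_PATH=/opt/search/data/config.yml` (illustrative path).
+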
+To learn more about the available settings, see the [Reference for Settings](./ref-settings.md).
\ No newline at end of file
diff --git a/docs/users/how-to-use-scripts.md b/docs/users/how-to-use-scripts.md
index c3670c7d..c7b5c54f 100644
--- a/docs/users/how-to-use-scripts.md
+++ b/docs/users/how-to-use-scripts.md
@@ -59,7 +59,10 @@ Here:
It's mostly a way to declare constants in the script.
(hopefully more convenient than declaring them in the script)
-See [introduction to script in Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-scripting-using.html)
+For more information on the configuration of scripts, see the [configuration reference](../ref-config/searchalicious-config-schema.html#indices_additionalProperties_scripts)
+
+For information on how to write scripts,
+see [introduction to script in Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-scripting-using.html)
## Import the scripts in Elasticsearch
diff --git a/docs/users/ref-config.md b/docs/users/ref-config.md
new file mode 100644
index 00000000..3c7e76ee
--- /dev/null
+++ b/docs/users/ref-config.md
@@ -0,0 +1,8 @@
+# Reference for Configuration file
+
+You can find the [raw JSON schema here](./searchalicious-config-schema.yml).
+
+[See the configuration documentation on its own page](./searchalicious-config-schema.html)
+
\ No newline at end of file
diff --git a/docs/users/ref-settings.md b/docs/users/ref-settings.md
new file mode 100644
index 00000000..a4489189
--- /dev/null
+++ b/docs/users/ref-settings.md
@@ -0,0 +1,8 @@
+# Reference for Settings
+
+You can find the [raw JSON schema here](./searchalicious-settings-schema.yml).
+
+[See the Settings documentation on its own page](./searchalicious-settings-schema.html)
+
\ No newline at end of file
diff --git a/docs/users/searchalicious-config-schema.yml b/docs/users/searchalicious-config-schema.yml
new file mode 100644
index 00000000..f060e0f6
--- /dev/null
+++ b/docs/users/searchalicious-config-schema.yml
@@ -0,0 +1,2 @@
+# keep empty - This file will be replaced by generated schema
+# at documentation generation time
\ No newline at end of file
diff --git a/scripts/Dockerfile.schema b/scripts/Dockerfile.schema
new file mode 100644
index 00000000..5e8fce33
--- /dev/null
+++ b/scripts/Dockerfile.schema
@@ -0,0 +1,14 @@
+FROM python:3-slim
+
+ARG USER_UID=1000
+ARG USER_GID=1000
+USER root
+# add user with right id
+RUN addgroup --gid $USER_GID user && adduser --uid $USER_UID --ingroup user --no-create-home --disabled-password --quiet user
+# create folders
+RUN mkdir -p /docs/in /docs/out && chown user:user /docs
+# install some packages we need
+RUN pip3 install -U pip && pip3 install json-schema-for-humans
+CMD ["generate-schema-doc", "/docs/in/", "/docs/out/"]
+WORKDIR /docs
+USER user
diff --git a/scripts/build_mkdocs.sh b/scripts/build_mkdocs.sh
index f6690fbf..e4444867 100755
--- a/scripts/build_mkdocs.sh
+++ b/scripts/build_mkdocs.sh
@@ -12,4 +12,4 @@ docker build --build-arg "USER_UID=$UID" --build-arg "USER_GID=$GID" --tag 'mkdo
docker run --rm \
-e USER_ID=$UID -e GROUP_ID=$GID \
-v $(pwd):/app -w /app \
- mkdocs-builder build
+ mkdocs-builder build
\ No newline at end of file
diff --git a/scripts/build_schema.sh b/scripts/build_schema.sh
new file mode 100755
index 00000000..87ba91a2
--- /dev/null
+++ b/scripts/build_schema.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# Build config documentation in markdown
+# Use it before using mkdocs
+
+# Parameter is the schema type: config / settings
+SCHEMA=$1
+
+[[ -z $SCHEMA ]] && echo "You must provide a schema type: config / settings" && exit 1
+
+set -e
+
+# get group id to use it in the docker
+GID=$(id -g)
+
+# ensure dest dir
+mkdir -p build/ref-$SCHEMA
+
+# create yaml
+make generate-$SCHEMA-schema
+# create image
+docker build --build-arg "USER_UID=$UID" --build-arg "USER_GID=$GID" --tag 'json-schema-for-humans' -f scripts/Dockerfile.schema .
+
+# use image to generate documentation
+docker run --rm --user user \
+ -v $(pwd)/scripts/schema-config.json:/docs/schema-config.json \
+ -v $(pwd)/data/searchalicious-$SCHEMA-schema.yml:/docs/in/searchalicious-$SCHEMA-schema.yml \
+ -v $(pwd)/build/ref-$SCHEMA:/docs/out \
+ json-schema-for-humans \
+ generate-schema-doc --config-file /docs/schema-config.json /docs/in/ /docs/out/
+
+# move generated docs to the gh_pages ref-$SCHEMA folder
+mv build/ref-$SCHEMA/* gh_pages/users/ref-$SCHEMA/
+# also copy the source schema
+cp data/searchalicious-$SCHEMA-schema.yml gh_pages/users/ref-$SCHEMA/
diff --git a/scripts/generate_doc.sh b/scripts/generate_doc.sh
index 43c4db39..e41f9939 100755
--- a/scripts/generate_doc.sh
+++ b/scripts/generate_doc.sh
@@ -11,7 +11,9 @@ mkdir -p gh_pages
echo "Build documentation with MkDocs"
scripts/build_mkdocs.sh
-# TODO: generating python and documentation with sphinx
+echo "Generate documentation for configuration file and settings"
+scripts/build_schema.sh config
+scripts/build_schema.sh settings
echo "Generate OpenAPI documentation"
make generate-openapi
diff --git a/scripts/schema-config.json b/scripts/schema-config.json
new file mode 100644
index 00000000..fad7e013
--- /dev/null
+++ b/scripts/schema-config.json
@@ -0,0 +1,5 @@
+{
+ "collapse_long_descriptions": false,
+ "examples_as_yaml": true,
+ "expand_buttons": true
+}
\ No newline at end of file