Skip to content

Commit

Permalink
feat(search): adjust search config (#10774)
Browse files Browse the repository at this point in the history
  • Loading branch information
david-leifker authored Jun 25, 2024
1 parent 724907b commit bffcafb
Show file tree
Hide file tree
Showing 9 changed files with 290 additions and 120 deletions.
8 changes: 6 additions & 2 deletions docker/profiles/docker-compose.gms.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ x-datahub-gms-service: &datahub-gms-service
- ${DATAHUB_LOCAL_GMS_ENV:-empty2.env}
environment: &datahub-gms-env
<<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env]
ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED: true
ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: '/etc/datahub/search/search_config.yaml'
ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-search_config.yaml}
healthcheck:
test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health
start_period: 90s
Expand All @@ -119,8 +118,13 @@ x-datahub-gms-service-dev: &datahub-gms-service-dev
ports:
- ${DATAHUB_MAPPED_GMS_DEBUG_PORT:-5001}:5001
- ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080
env_file:
- datahub-gms/env/docker.env
- ${DATAHUB_LOCAL_COMMON_ENV:-empty.env}
- ${DATAHUB_LOCAL_GMS_ENV:-empty2.env}
environment: &datahub-gms-dev-env
<<: [*datahub-dev-telemetry-env, *datahub-gms-env]
ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-/etc/datahub/search/search_config.yaml}
SKIP_ELASTICSEARCH_CHECK: false
JAVA_TOOL_OPTIONS: '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5001'
BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE: false
Expand Down
2 changes: 1 addition & 1 deletion docs/deploy/environment-vars.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ DataHub works.
| `ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED` | `true` | boolean | [`GMS`] | When using structured query, also include exact matches. |
| `ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR` | 0.5 | float | [`GMS`] | Multiply by this number when partial token match on URN. |
| `ELASTICSEARCH_QUERY_PARTIAL_FACTOR` | 0.4 | float | [`GMS`] | Multiply by this number when partial token match on non-URN field. |
| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED` | `false` | boolean | [`GMS`] | Enable search query and ranking customization configuration. |
| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED` | `true` | boolean | [`GMS`] | Enable search query and ranking customization configuration. |
| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE` | `search_config.yaml` | string | [`GMS`] | The location of the search customization configuration. |
| `ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX` | `false` | boolean | [`System Update`] | Enable reindexing on Elasticsearch schema changes. |
| `ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE` | `false` | boolean | [`System Update`] | Enable reindexing to remove hard deleted structured properties. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,9 @@ elasticsearch:
exactMatch:
exclusive: ${ELASTICSEARCH_QUERY_EXACT_MATCH_EXCLUSIVE:false} # if false will only apply weights, if true will exclude non-exact
withPrefix: ${ELASTICSEARCH_QUERY_EXACT_MATCH_WITH_PREFIX:true} # include prefix exact matches
exactFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_FACTOR:10.0} # boost multiplier when exact with case
prefixFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.6} # boost multiplier when exact prefix
caseSensitivityFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.7} # stacked boost multiplier when case mismatch
exactFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_FACTOR:16.0} # boost multiplier when exact with case
prefixFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.1} # boost multiplier when exact prefix
caseSensitivityFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.0} # stacked boost multiplier when case mismatch
enableStructured: ${ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED:true} # enable exact match on structured search
wordGram:
twoGramFactor: ${ELASTICSEARCH_QUERY_TWO_GRAM_FACTOR:1.2} # boost multiplier when match on 2-gram tokens
Expand All @@ -230,8 +230,8 @@ elasticsearch:
urnFactor: ${ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR:0.5} # multiplier on Urn token match, a partial match on Urn > non-Urn is assumed
factor: ${ELASTICSEARCH_QUERY_PARTIAL_FACTOR:0.4} # multiplier on possible non-Urn token match
custom:
enabled: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED:false}
file: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:search_config.yml}
enabled: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED:true}
file: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:search_config.yaml}
graph:
timeoutSeconds: ${ELASTICSEARCH_SEARCH_GRAPH_TIMEOUT_SECONDS:50} # graph dao timeout seconds
batchSize: ${ELASTICSEARCH_SEARCH_GRAPH_BATCH_SIZE:1000} # graph dao batch size
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Notes:
#
# First match wins
#
# queryRegex = Java regex syntax
#
# functionScores - See the following for function score syntax
# https://www.elastic.co/guide/en/elasticsearch/reference/7.17/query-dsl-function-score-query.html

# Ordered list of query rules; each rule pairs a queryRegex with query-type
# flags and a functionScore block.
queryConfigurations:
# Select */explore all
# Attempt to rank active incidents at the top followed by enrichment factors
# The pattern below is an alternation of a literal `*` and the empty string.
# NOTE(review): this only singles out `*`/empty queries if the regex is applied
# as a full match rather than a substring search - confirm against the matcher.
- queryRegex: '[*]|'
# All three query-type flags are disabled for this rule.
simpleQuery: false
prefixMatchQuery: false
exactMatchQuery: false
# Each function pairs a term filter on a boolean field with a weight.
# Weights above 1.0 promote matching documents; weights below 1.0 demote them.
functionScore:
functions:
- filter:
term:
hasActiveIncidents:
value: true
weight: 2.0
- filter:
term:
hasDescription:
value: true
weight: 1.25
- filter:
term:
hasOwners:
value: true
weight: 1.25
- filter:
term:
hasDomain:
value: true
weight: 1.1
- filter:
term:
hasGlossaryTerms:
value: true
weight: 1.1
- filter:
term:
hasTags:
value: true
weight: 1.1
- filter:
term:
hasRowCount:
value: true
weight: 1.05
- filter:
term:
hasColumnCount:
value: true
weight: 1.05
- filter:
term:
deprecated:
value: true
weight: 0.25
# Per the Elasticsearch function score reference: `multiply` combines the
# matching function weights by multiplication, and `replace` uses only that
# combined function score, ignoring the query score.
score_mode: multiply
boost_mode: replace

# Criteria for exact-match only
# Contains quotes, or is a single term with `_`, `.`, or `-` (normally considered for tokenization); then use the exact match query
- queryRegex: >-
^["'].+["']$|^[a-zA-Z0-9]\S+[_.-]\S+[a-zA-Z0-9]$
simpleQuery: false
prefixMatchQuery: false
exactMatchQuery: true
# Only deprecated documents are re-weighted here (0.25 demotes them).
functionScore:
functions:
- filter:
term:
deprecated:
value: true
weight: 0.25
# Unlike the first rule, boost_mode is `multiply`: per the Elasticsearch
# reference the query score is multiplied by the function score.
score_mode: multiply
boost_mode: multiply
# Default rule: `.*` catches queries not matched by an earlier rule; all three
# query-type flags are enabled.
- queryRegex: .*
simpleQuery: true
prefixMatchQuery: true
exactMatchQuery: true
# Same deprecated-only demotion as the exact-match rule.
functionScore:
functions:
- filter:
term:
deprecated:
value: true
weight: 0.25
score_mode: multiply
boost_mode: multiply
71 changes: 0 additions & 71 deletions metadata-service/factories/src/main/resources/search_config.yml

This file was deleted.

1 change: 1 addition & 0 deletions metadata-service/openapi-servlet/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ dependencies {
implementation externalDependency.guava
implementation('io.acryl:json-schema-avro:0.2.3')
implementation externalDependency.jsonSchemaValidator
implementation group: 'io.github.deblockt', name: 'json-diff', version: '1.1.0'

annotationProcessor externalDependency.lombok

Expand Down
Loading

0 comments on commit bffcafb

Please sign in to comment.