From bffcafbdec938576b76d88ecc4be9e6581513dfa Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 25 Jun 2024 18:32:01 -0500 Subject: [PATCH] feat(search): adjust search config (#10774) --- docker/profiles/docker-compose.gms.yml | 8 +- docs/deploy/environment-vars.md | 2 +- .../src/main/resources/application.yaml | 10 +- .../src/main/resources/search_config.yaml | 97 ++++++++ .../src/main/resources/search_config.yml | 71 ------ metadata-service/openapi-servlet/build.gradle | 1 + .../elastic/OperationsController.java | 211 +++++++++++++++--- .../cypress/cypress/e2e/search/search.js | 4 +- .../cypress/cypress/e2e/siblings/siblings.js | 6 +- 9 files changed, 290 insertions(+), 120 deletions(-) create mode 100644 metadata-service/configuration/src/main/resources/search_config.yaml delete mode 100644 metadata-service/factories/src/main/resources/search_config.yml diff --git a/docker/profiles/docker-compose.gms.yml b/docker/profiles/docker-compose.gms.yml index 76bdcacd2dfc95..8cfff2280e2fea 100644 --- a/docker/profiles/docker-compose.gms.yml +++ b/docker/profiles/docker-compose.gms.yml @@ -99,8 +99,7 @@ x-datahub-gms-service: &datahub-gms-service - ${DATAHUB_LOCAL_GMS_ENV:-empty2.env} environment: &datahub-gms-env <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env] - ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED: true - ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: '/etc/datahub/search/search_config.yaml' + ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-search_config.yaml} healthcheck: test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health start_period: 90s @@ -119,8 +118,13 @@ x-datahub-gms-service-dev: &datahub-gms-service-dev ports: - ${DATAHUB_MAPPED_GMS_DEBUG_PORT:-5001}:5001 - ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080 + env_file: + - datahub-gms/env/docker.env + - ${DATAHUB_LOCAL_COMMON_ENV:-empty.env} + - ${DATAHUB_LOCAL_GMS_ENV:-empty2.env} environment: &datahub-gms-dev-env <<: [*datahub-dev-telemetry-env, *datahub-gms-env] + ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-/etc/datahub/search/search_config.yaml} SKIP_ELASTICSEARCH_CHECK: false JAVA_TOOL_OPTIONS: '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5001' BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE: false diff --git a/docs/deploy/environment-vars.md b/docs/deploy/environment-vars.md index 3314d2db1f467f..21ed738e878f88 100644 --- a/docs/deploy/environment-vars.md +++ b/docs/deploy/environment-vars.md @@ -60,7 +60,7 @@ DataHub works. | `ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED` | `true` | boolean | [`GMS`] | When using structured query, also include exact matches. | | `ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR` | 0.5 | float | [`GMS`] | Multiply by this number when partial token match on URN) | | `ELASTICSEARCH_QUERY_PARTIAL_FACTOR` | 0.4 | float | [`GMS`] | Multiply by this number when partial token match on non-URN field. | -| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED` | `false` | boolean | [`GMS`] | Enable search query and ranking customization configuration. | +| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED` | `true` | boolean | [`GMS`] | Enable search query and ranking customization configuration. | | `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE` | `search_config.yml` | string | [`GMS`] | The location of the search customization configuration. | | `ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX` | `false` | boolean | [`System Update`] | Enable reindexing on Elasticsearch schema changes. | | `ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE` | `false` | boolean | [`System Update`] | Enable reindexing to remove hard deleted structured properties. | diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 14de0530f51f05..57dff4918ea950 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -217,9 +217,9 @@ elasticsearch: exactMatch: exclusive: ${ELASTICSEARCH_QUERY_EXACT_MATCH_EXCLUSIVE:false} # if false will only apply weights, if true will exclude non-exact withPrefix: ${ELASTICSEARCH_QUERY_EXACT_MATCH_WITH_PREFIX:true} # include prefix exact matches - exactFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_FACTOR:10.0} # boost multiplier when exact with case - prefixFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.6} # boost multiplier when exact prefix - caseSensitivityFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.7} # stacked boost multiplier when case mismatch + exactFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_FACTOR:16.0} # boost multiplier when exact with case + prefixFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.1} # boost multiplier when exact prefix + caseSensitivityFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.0} # stacked boost multiplier when case mismatch enableStructured: ${ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED:true} # enable exact match on structured search wordGram: twoGramFactor: ${ELASTICSEARCH_QUERY_TWO_GRAM_FACTOR:1.2} # boost multiplier when match on 2-gram tokens @@ -230,8 +230,8 @@ elasticsearch: urnFactor: ${ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR:0.5} # multiplier on Urn token match, a partial match on Urn > non-Urn is assumed factor: ${ELASTICSEARCH_QUERY_PARTIAL_FACTOR:0.4} # multiplier on possible non-Urn token match custom: - enabled: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED:false} - file: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:search_config.yml} + enabled: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED:true} + file: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:search_config.yaml} graph: timeoutSeconds: ${ELASTICSEARCH_SEARCH_GRAPH_TIMEOUT_SECONDS:50} # graph dao timeout seconds batchSize: ${ELASTICSEARCH_SEARCH_GRAPH_BATCH_SIZE:1000} # graph dao batch size diff --git a/metadata-service/configuration/src/main/resources/search_config.yaml b/metadata-service/configuration/src/main/resources/search_config.yaml new file mode 100644 index 00000000000000..2ffe962d393579 --- /dev/null +++ b/metadata-service/configuration/src/main/resources/search_config.yaml @@ -0,0 +1,97 @@ +# Notes: +# +# First match wins +# +# queryRegex = Java regex syntax +# +# functionScores - See the following for function score syntax +# https://www.elastic.co/guide/en/elasticsearch/reference/7.17/query-dsl-function-score-query.html + +queryConfigurations: + # Select */explore all + # Attempt to rank active incidents at the top followed by enrichment factors + - queryRegex: '[*]|' + simpleQuery: false + prefixMatchQuery: false + exactMatchQuery: false + functionScore: + functions: + - filter: + term: + hasActiveIncidents: + value: true + weight: 2.0 + - filter: + term: + hasDescription: + value: true + weight: 1.25 + - filter: + term: + hasOwners: + value: true + weight: 1.25 + - filter: + term: + hasDomain: + value: true + weight: 1.1 + - filter: + term: + hasGlossaryTerms: + value: true + weight: 1.1 + - filter: + term: + hasTags: + value: true + weight: 1.1 + - filter: + term: + hasRowCount: + value: true + weight: 1.05 + - filter: + term: + hasColumnCount: + value: true + weight: 1.05 + - filter: + term: + deprecated: + value: true + weight: 0.25 + score_mode: multiply + boost_mode: replace + + # Criteria for exact-match only + # Contains quotes, is a single term with `_`, `.`, or `-` (normally consider for tokenization) then use exact match query + - queryRegex: >- + ^["'].+["']$|^[a-zA-Z0-9]\S+[_.-]\S+[a-zA-Z0-9]$ + simpleQuery: false + prefixMatchQuery: false + exactMatchQuery: true + functionScore: + functions: + - filter: + term: + deprecated: + value: true + weight: 0.25 + score_mode: multiply + boost_mode: multiply + + # default + - queryRegex: .* + simpleQuery: true + prefixMatchQuery: true + exactMatchQuery: true + functionScore: + functions: + - filter: + term: + deprecated: + value: true + weight: 0.25 + score_mode: multiply + boost_mode: multiply diff --git a/metadata-service/factories/src/main/resources/search_config.yml b/metadata-service/factories/src/main/resources/search_config.yml deleted file mode 100644 index 331e7d8571476b..00000000000000 --- a/metadata-service/factories/src/main/resources/search_config.yml +++ /dev/null @@ -1,71 +0,0 @@ -# Notes: -# -# First match wins -# -# queryRegex = Java regex syntax -# -# functionScores - See the following for function score syntax -# https://www.elastic.co/guide/en/elasticsearch/reference/7.17/query-dsl-function-score-query.html - -queryConfigurations: - # Select * - - queryRegex: '[*]|' - simpleQuery: false - prefixMatchQuery: false - exactMatchQuery: false - boolQuery: - must_not: - term: - deprecated: - value: true - functionScore: - functions: - - filter: - term: - materialized: - value: true - weight: 0.8 - score_mode: multiply - boost_mode: multiply - - # Criteria for exact-match only - # Contains quoted then use exact match query - - queryRegex: >- - ["'].+["'] - simpleQuery: false - prefixMatchQuery: true - exactMatchQuery: true - functionScore: - functions: - - filter: - term: - materialized: - value: true - weight: 0.8 - - filter: - term: - deprecated: - value: true - weight: 0 - score_mode: multiply - boost_mode: multiply - - # default - - queryRegex: .* - simpleQuery: true - prefixMatchQuery: true - exactMatchQuery: true - boolQuery: - must_not: - term: - deprecated: - value: true - functionScore: - functions: - - filter: - term: - materialized: - value: true - weight: 0.8 - score_mode: multiply - boost_mode: multiply diff --git a/metadata-service/openapi-servlet/build.gradle b/metadata-service/openapi-servlet/build.gradle index bfe9d954dfc7d7..7b6a3c7c89e31c 100644 --- a/metadata-service/openapi-servlet/build.gradle +++ b/metadata-service/openapi-servlet/build.gradle @@ -29,6 +29,7 @@ dependencies { implementation externalDependency.guava implementation('io.acryl:json-schema-avro:0.2.3') implementation externalDependency.jsonSchemaValidator + implementation group: 'io.github.deblockt', name: 'json-diff', version: '1.1.0' annotationProcessor externalDependency.lombok diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java index f4437e71ba299c..341f2a45197d18 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/elastic/OperationsController.java @@ -4,6 +4,15 @@ import com.datahub.authentication.AuthenticationContext; import com.datahub.authorization.AuthUtil; import com.datahub.authorization.AuthorizerChain; +import com.deblock.jsondiff.DiffGenerator; +import com.deblock.jsondiff.matcher.CompositeJsonMatcher; +import com.deblock.jsondiff.matcher.LenientJsonArrayPartialMatcher; +import com.deblock.jsondiff.matcher.LenientJsonObjectPartialMatcher; +import com.deblock.jsondiff.matcher.LenientNumberPrimitivePartialMatcher; +import com.deblock.jsondiff.matcher.StrictPrimitivePartialMatcher; +import com.deblock.jsondiff.viewer.PatchDiffViewer; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; import com.linkedin.common.urn.UrnUtils; import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.metadata.entity.EntityService; @@ -11,7 +20,6 @@ import com.linkedin.metadata.entity.restoreindices.RestoreIndicesResult; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Filter; -import com.linkedin.metadata.query.filter.SortCriterion; import com.linkedin.metadata.search.EntitySearchService; import com.linkedin.metadata.systemmetadata.SystemMetadataService; import com.linkedin.metadata.timeseries.TimeseriesAspectService; @@ -24,6 +32,8 @@ import io.swagger.v3.oas.annotations.Parameter; import io.swagger.v3.oas.annotations.tags.Tag; import java.net.URISyntaxException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Optional; import java.util.Set; @@ -60,6 +70,7 @@ public class OperationsController { private final TimeseriesAspectService timeseriesAspectService; private final EntitySearchService searchService; private final EntityService entityService; + private final ObjectMapper objectMapper; public OperationsController( OperationContext systemOperationContext, @@ -67,13 +78,15 @@ public OperationsController( TimeseriesAspectService timeseriesAspectService, EntitySearchService searchService, EntityService entityService, - AuthorizerChain authorizerChain) { + AuthorizerChain authorizerChain, + ObjectMapper objectMapper) { this.systemOperationContext = systemOperationContext; this.authorizerChain = authorizerChain; this.systemMetadataService = systemMetadataService; this.timeseriesAspectService = timeseriesAspectService; this.searchService = searchService; this.entityService = entityService; + this.objectMapper = objectMapper; } @InitBinder @@ -163,7 +176,7 @@ public ResponseEntity explainSearchQuery( required = true, description = "Query to evaluate for specified document, will be applied as an input string to standard search query builder.") - @RequestParam("query") + @RequestParam(value = "query", defaultValue = "*") @Nonnull String query, @Parameter( @@ -177,16 +190,15 @@ public ResponseEntity explainSearchQuery( name = "entityName", required = true, description = "Name of the entity the document belongs to.") - @RequestParam("entityName") + @RequestParam(value = "entityName", defaultValue = "dataset") @Nonnull String entityName, - @Parameter(name = "scrollId", required = false, description = "Scroll ID for pagination.") + @Parameter(name = "scrollId", description = "Scroll ID for pagination.") @RequestParam("scrollId") @Nullable String scrollId, @Parameter( name = "keepAlive", - required = false, description = "Keep alive time for point in time scroll context" + ", only relevant where point in time is supported.") @@ -194,36 +206,116 @@ public ResponseEntity explainSearchQuery( @Nullable String keepAlive, @Parameter(name = "size", required = true, description = "Page size for pagination.") - @RequestParam("size") + @RequestParam(value = "size", required = false, defaultValue = "1") int size, - @Parameter( - name = "filters", - required = false, - description = "Additional filters to apply to query.") - @RequestParam("filters") + @Parameter(name = "filters", description = "Additional filters to apply to query.") + @RequestParam(value = "filters", required = false) @Nullable - Filter filters, - @Parameter( - name = "sortCriterion", - required = false, - description = "Criterion to sort results on.") - @RequestParam("sortCriterion") + String filters, + @Parameter(name = "searchFlags", description = "Optional configuration flags.") + @RequestParam(value = "searchFlags", required = false) @Nullable - SortCriterion sortCriterion, + String searchFlags) + throws JsonProcessingException { + + Authentication authentication = AuthenticationContext.getAuthentication(); + String actorUrnStr = authentication.getActor().toUrnStr(); + + if (!AuthUtil.isAPIAuthorized( + authentication, authorizerChain, PoliciesConfig.ES_EXPLAIN_QUERY_PRIVILEGE)) { + log.error("{} is not authorized to get explain queries", actorUrnStr); + return ResponseEntity.status(HttpStatus.FORBIDDEN).body(null); + } + OperationContext opContext = + systemOperationContext + .asSession( + RequestContext.builder().buildOpenapi("explainSearchQuery", entityName), + authorizerChain, + authentication) + .withSearchFlags( + flags -> { + try { + return searchFlags == null + ? flags + : objectMapper.readValue(searchFlags, SearchFlags.class); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + }); + + ExplainResponse response = + searchService.explain( + opContext, + query, + encodeValue(documentId), + entityName, + filters == null ? null : objectMapper.readValue(filters, Filter.class), + null, + scrollId, + keepAlive, + size, + null); + + return ResponseEntity.ok(response); + } + + @Tag(name = "ElasticSearchOperations") + @GetMapping(path = "/explainSearchQueryDiff", produces = MediaType.TEXT_PLAIN_VALUE) + @Operation(summary = "Explain the differences in scoring for 2 documents") + public ResponseEntity explainSearchQueryDiff( @Parameter( - name = "searchFlags", - required = false, - description = "Optional configuration flags.") - @RequestParam("searchFlags") + name = "query", + required = true, + description = + "Query to evaluate for specified document, will be applied as an input string to standard search query builder.") + @RequestParam(value = "query", defaultValue = "*") + @Nonnull + String query, + @Parameter( + name = "documentIdA", + required = true, + description = "Document 1st ID to apply explain to.") + @RequestParam("documentIdA") + @Nonnull + String documentIdA, + @Parameter( + name = "documentIdB", + required = true, + description = "Document 2nd ID to apply explain to.") + @RequestParam("documentIdB") + @Nonnull + String documentIdB, + @Parameter( + name = "entityName", + required = true, + description = "Name of the entity the document belongs to.") + @RequestParam(value = "entityName", defaultValue = "dataset") + @Nonnull + String entityName, + @Parameter(name = "scrollId", description = "Scroll ID for pagination.") + @RequestParam("scrollId") @Nullable - SearchFlags searchFlags, + String scrollId, @Parameter( - name = "facets", - required = false, - description = "List of facet fields for aggregations.") - @RequestParam("facets") + name = "keepAlive", + description = + "Keep alive time for point in time scroll context" + + ", only relevant where point in time is supported.") + @RequestParam("keepAlive") + @Nullable + String keepAlive, + @Parameter(name = "size", required = true, description = "Page size for pagination.") + @RequestParam(value = "size", required = false, defaultValue = "1") + int size, + @Parameter(name = "filters", description = "Additional filters to apply to query.") + @RequestParam(value = "filters", required = false) + @Nullable + String filters, + @Parameter(name = "searchFlags", description = "Optional configuration flags.") + @RequestParam(value = "searchFlags", required = false) @Nullable - List facets) { + String searchFlags) + throws JsonProcessingException { Authentication authentication = AuthenticationContext.getAuthentication(); String actorUrnStr = authentication.getActor().toUrnStr(); @@ -239,22 +331,69 @@ public ResponseEntity explainSearchQuery( RequestContext.builder().buildOpenapi("explainSearchQuery", entityName), authorizerChain, authentication) - .withSearchFlags(flags -> searchFlags); + .withSearchFlags( + flags -> { + try { + return searchFlags == null + ? flags + : objectMapper.readValue(searchFlags, SearchFlags.class); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + }); - ExplainResponse response = + ExplainResponse responseA = searchService.explain( opContext, query, - documentId, + encodeValue(documentIdA), entityName, - filters, - sortCriterion, + filters == null ? null : objectMapper.readValue(filters, Filter.class), + null, scrollId, keepAlive, size, - facets); + null); - return ResponseEntity.ok(response); + ExplainResponse responseB = + searchService.explain( + opContext, + query, + encodeValue(documentIdB), + entityName, + filters == null ? null : objectMapper.readValue(filters, Filter.class), + null, + scrollId, + keepAlive, + size, + null); + + String a = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(responseA); + String b = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(responseB); + + CompositeJsonMatcher fullLenient = + new CompositeJsonMatcher( + new LenientJsonArrayPartialMatcher(), // comparing array using lenient mode (ignore + // array order and extra items) + new LenientJsonObjectPartialMatcher(), // comparing object using lenient mode (ignoring + // extra properties) + new LenientNumberPrimitivePartialMatcher( + new StrictPrimitivePartialMatcher()) // comparing primitive types and manage numbers + // (100.00 == 100) + ); + + // generate a diff + final var jsondiff = DiffGenerator.diff(a, b, fullLenient); + PatchDiffViewer patch = PatchDiffViewer.from(jsondiff); + + return ResponseEntity.ok(patch.toString()); + } + + private static String encodeValue(String value) { + if (value.startsWith("urn:li:")) { + return URLEncoder.encode(value, StandardCharsets.UTF_8); + } + return value; } @Tag(name = "RestoreIndices") diff --git a/smoke-test/tests/cypress/cypress/e2e/search/search.js b/smoke-test/tests/cypress/cypress/e2e/search/search.js index ea46b2d8d012b5..cff48cbd8be715 100644 --- a/smoke-test/tests/cypress/cypress/e2e/search/search.js +++ b/smoke-test/tests/cypress/cypress/e2e/search/search.js @@ -22,9 +22,9 @@ describe("search", () => { it("can search, find a result, and visit the dataset page", () => { cy.login(); cy.visit( - "/search?filter_entity=DATASET&filter_tags=urn%3Ali%3Atag%3ACypress&page=1&query=users_created", + "/search?filter_entity=DATASET&filter_tags=urn%3Ali%3Atag%3ACypress&page=1&query=users created", ); - cy.contains("of 2 result"); + cy.contains("of 1 result"); cy.contains("Cypress"); diff --git a/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js b/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js index b6b6fea1a7d703..fb772bd7af1e74 100644 --- a/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js +++ b/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js @@ -100,10 +100,10 @@ describe("siblings", () => { cy.login(); cy.visit("/search?page=1&query=raw_orders"); - cy.contains("Showing 1 - 10 of "); + cy.contains("Showing 1 - 2 of "); - cy.get(".test-search-result").should("have.length", 5); - cy.get(".test-search-result-sibling-section").should("have.length", 5); + cy.get(".test-search-result").should("have.length", 1); + cy.get(".test-search-result-sibling-section").should("have.length", 1); cy.get(".test-search-result-sibling-section") .get(".test-mini-preview-class:contains(raw_orders)")