From adf52ac44d7fbbd38677652894f31fac75fb00a2 Mon Sep 17 00:00:00 2001 From: "anjali.agarwal" Date: Thu, 5 Oct 2023 16:23:46 +0530 Subject: [PATCH] feat: enable column level search --- cli/config.go | 3 + cli/server.go | 3 +- cli/worker.go | 8 +- core/asset/discovery.go | 2 + internal/server/v1beta1/search.go | 1 + .../elasticsearch/discovery_repository.go | 16 +- .../discovery_repository_test.go | 23 +- .../discovery_search_repository.go | 112 ++++++- .../discovery_search_repository_test.go | 313 +++++++++++++++++- internal/store/elasticsearch/schema.go | 6 + .../testdata/search-test-fixture.json | 41 +-- 11 files changed, 457 insertions(+), 71 deletions(-) diff --git a/cli/config.go b/cli/config.go index f61aec83..14b9b88e 100644 --- a/cli/config.go +++ b/cli/config.go @@ -103,6 +103,9 @@ type Config struct { // Client Client client.Config `mapstructure:"client"` + + // Column search excluded keyword list + ColSearchExclusionKeywords string `yaml:"col_search_excluded_keywords" mapstructure:"col_search_excluded_keywords"` } func LoadConfig() (*Config, error) { diff --git a/cli/server.go b/cli/server.go index 63890a75..9a7b5a9b 100644 --- a/cli/server.go +++ b/cli/server.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "os" + "strings" "github.com/MakeNowJust/heredoc" "github.com/goto/compass/core/asset" @@ -131,7 +132,7 @@ func runServer(ctx context.Context, cfg *Config) error { if err != nil { return fmt.Errorf("create new asset repository: %w", err) } - discoveryRepository := esStore.NewDiscoveryRepository(esClient, logger, cfg.Elasticsearch.RequestTimeout) + discoveryRepository := esStore.NewDiscoveryRepository(esClient, logger, cfg.Elasticsearch.RequestTimeout, strings.Split(cfg.ColSearchExclusionKeywords, ",")) lineageRepository, err := postgres.NewLineageRepository(pgClient) if err != nil { return fmt.Errorf("create new lineage repository: %w", err) diff --git a/cli/worker.go b/cli/worker.go index c9b75aad..e7d372a1 100644 --- a/cli/worker.go +++ b/cli/worker.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "strings" "github.com/MakeNowJust/heredoc" "github.com/goto/compass/internal/store/elasticsearch" @@ -67,9 +68,10 @@ func runWorker(ctx context.Context, cfg *Config) error { } mgr, err := workermanager.New(ctx, workermanager.Deps{ - Config: cfg.Worker, - DiscoveryRepo: elasticsearch.NewDiscoveryRepository(esClient, logger, cfg.Elasticsearch.RequestTimeout), - Logger: logger, + Config: cfg.Worker, + DiscoveryRepo: elasticsearch.NewDiscoveryRepository(esClient, logger, cfg.Elasticsearch.RequestTimeout, + strings.Split(cfg.ColSearchExclusionKeywords, ",")), + Logger: logger, }) if err != nil { return err diff --git a/core/asset/discovery.go b/core/asset/discovery.go index ba2fd2c0..e60abf13 100644 --- a/core/asset/discovery.go +++ b/core/asset/discovery.go @@ -43,6 +43,8 @@ type SearchFlags struct { // DisableFuzzy disables fuzziness on search DisableFuzzy bool + + IsColumnSearch bool } // SearchConfig represents a search query along diff --git a/internal/server/v1beta1/search.go b/internal/server/v1beta1/search.go index 35b672b8..4717b502 100644 --- a/internal/server/v1beta1/search.go +++ b/internal/server/v1beta1/search.go @@ -140,5 +140,6 @@ func getSearchFlagsFromFlags(inputFlags *compassv1beta1.SearchFlags) asset.Searc return asset.SearchFlags{ EnableHighlight: inputFlags.GetEnableHighlight(), DisableFuzzy: inputFlags.GetDisableFuzzy(), + IsColumnSearch: inputFlags.GetIsColumnSearch(), } } diff --git a/internal/store/elasticsearch/discovery_repository.go b/internal/store/elasticsearch/discovery_repository.go index c193237b..d9c21780 100644 --- a/internal/store/elasticsearch/discovery_repository.go +++ b/internal/store/elasticsearch/discovery_repository.go @@ -18,16 +18,18 @@ import ( // DiscoveryRepository implements discovery.Repository // with elasticsearch as the backing store. type DiscoveryRepository struct { - cli *Client - logger log.Logger - requestTimeout time.Duration + cli *Client + logger log.Logger + requestTimeout time.Duration + columnSearchExclusionList []string } -func NewDiscoveryRepository(cli *Client, logger log.Logger, requestTimeout time.Duration) *DiscoveryRepository { +func NewDiscoveryRepository(cli *Client, logger log.Logger, requestTimeout time.Duration, colSearchExclusionList []string) *DiscoveryRepository { return &DiscoveryRepository{ - cli: cli, - logger: logger, - requestTimeout: requestTimeout, + cli: cli, + logger: logger, + requestTimeout: requestTimeout, + columnSearchExclusionList: colSearchExclusionList, } } diff --git a/internal/store/elasticsearch/discovery_repository_test.go b/internal/store/elasticsearch/discovery_repository_test.go index a3c36136..cb230a2d 100644 --- a/internal/store/elasticsearch/discovery_repository_test.go +++ b/internal/store/elasticsearch/discovery_repository_test.go @@ -30,8 +30,7 @@ func TestDiscoveryRepositoryUpsert(t *testing.T) { store.WithClient(cli), ) require.NoError(t, err) - - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) err = repo.Upsert(ctx, asset.Asset{ ID: "", Type: asset.TypeTable, @@ -50,7 +49,7 @@ func TestDiscoveryRepositoryUpsert(t *testing.T) { ) require.NoError(t, err) - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) err = repo.Upsert(ctx, asset.Asset{ ID: "sample-id", Type: asset.Type("unknown-type"), @@ -68,8 +67,7 @@ func TestDiscoveryRepositoryUpsert(t *testing.T) { store.WithClient(cli), ) require.NoError(t, err) - - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) // upsert with create_time as a object err = repo.Upsert(ctx, asset.Asset{ @@ -129,7 +127,7 @@ func TestDiscoveryRepositoryUpsert(t *testing.T) { ) require.NoError(t, err) - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) err = repo.Upsert(ctx, ast) assert.NoError(t, err) @@ -177,8 +175,7 @@ func TestDiscoveryRepositoryUpsert(t *testing.T) { store.WithClient(cli), ) require.NoError(t, err) - - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) err = repo.Upsert(ctx, existingAsset) assert.NoError(t, err) @@ -219,7 +216,7 @@ func TestDiscoveryRepositoryDeleteByID(t *testing.T) { ) require.NoError(t, err) - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) err = repo.DeleteByID(ctx, "") assert.ErrorIs(t, err, asset.ErrEmptyID) }) @@ -241,7 +238,7 @@ func TestDiscoveryRepositoryDeleteByID(t *testing.T) { ) require.NoError(t, err) - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) err = repo.Upsert(ctx, ast) require.NoError(t, err) @@ -288,7 +285,7 @@ func TestDiscoveryRepositoryDeleteByID(t *testing.T) { ) require.NoError(t, err) - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) err = repo.Upsert(ctx, ast1) require.NoError(t, err) @@ -319,7 +316,7 @@ func TestDiscoveryRepositoryDeleteByURN(t *testing.T) { ) require.NoError(t, err) - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) t.Run("should return error if the given urn is empty", func(t *testing.T) { err = repo.DeleteByURN(ctx, "") @@ -378,7 +375,7 @@ func TestDiscoveryRepositoryDeleteByURN(t *testing.T) { ) require.NoError(t, err) - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) err = repo.Upsert(ctx, ast1) require.NoError(t, err) diff --git a/internal/store/elasticsearch/discovery_search_repository.go b/internal/store/elasticsearch/discovery_search_repository.go index 0ce7e189..e50d8422 100644 --- a/internal/store/elasticsearch/discovery_search_repository.go +++ b/internal/store/elasticsearch/discovery_search_repository.go @@ -53,7 +53,7 @@ func (repo *DiscoveryRepository) Search(ctx context.Context, cfg asset.SearchCon }) }(time.Now()) - query, err := buildQuery(cfg) + query, err := repo.buildQuery(cfg) if err != nil { return nil, asset.DiscoveryError{Op: "Search", Err: fmt.Errorf("build query: %w", err)} } @@ -197,17 +197,109 @@ func (repo *DiscoveryRepository) Suggest(ctx context.Context, config asset.Searc return results, nil } -func buildQuery(cfg asset.SearchConfig) (io.Reader, error) { +func (repo *DiscoveryRepository) buildColumnQuery(query *elastic.BoolQuery, cfg asset.SearchConfig, field string) *elastic.Highlight { + matchString := cfg.Text + for _, exclusionStr := range repo.columnSearchExclusionList { + exclusionStr = strings.TrimSpace(exclusionStr) + if strings.Contains(matchString, exclusionStr) { + matchString = strings.ReplaceAll(matchString, fmt.Sprintf("_%s", exclusionStr), "") + matchString = strings.ReplaceAll(matchString, fmt.Sprintf(" %s", exclusionStr), "") + matchString = strings.ReplaceAll(matchString, fmt.Sprintf("-%s", exclusionStr), "") + } + } + + if matchString == "" { + matchString = cfg.Text + } + + queries := make([]elastic.Query, 0) + termQuery := elastic.NewTermQuery( + fmt.Sprintf("%s.keyword", field), + cfg.Text, + ).Boost(20) + + descriptionTermQuery := elastic.NewTermQuery( + fmt.Sprintf("%s.keyword", "data.columns.description"), + cfg.Text, + ) + phraseQuery := elastic.NewMultiMatchQuery( + cfg.Text, + []string{ + "data.columns.name^10", + "data.columns.description", + }..., + ).Type("phrase") + + matchQuery := elastic.NewMultiMatchQuery( + matchString, + []string{ + "data.columns.name^5", + "data.columns.description", + }..., + ) + + andMatchQuery := elastic.NewMultiMatchQuery( + matchString, + []string{ + "data.columns.name^5", + "data.columns.description", + }..., + ).Operator("and") + + multiMatchQueries := []*elastic.MultiMatchQuery{matchQuery, andMatchQuery} + queries = append(queries, termQuery, descriptionTermQuery, phraseQuery) + query.Should(queries...) + highlightQuery := make([]elastic.Query, 0) + highlightQuery = append(highlightQuery, queries...) + for _, q := range multiMatchQueries { + if !cfg.Flags.DisableFuzzy { + updatedQuery := q.Fuzziness("AUTO") + highlightQuery = append(highlightQuery, updatedQuery) + } + query.Should(q) + } + + if cfg.Flags.EnableHighlight { + return elastic.NewHighlight(). + Order("score"). + Field("data.columns.name"). + Field("data.columns.description"). + HighlightQuery( + elastic.NewBoolQuery(). + Should(highlightQuery...), + ) + } + + return nil +} + +func (repo *DiscoveryRepository) buildQuery(cfg asset.SearchConfig) (io.Reader, error) { boolQuery := elastic.NewBoolQuery() - buildTextQuery(boolQuery, cfg) + var highlightQuery *elastic.Highlight + field := "" + + // if the search text is empty, do a match all query and return results + if strings.TrimSpace(cfg.Text) == "" { + boolQuery.Should(elastic.NewMatchAllQuery()) + highlightQuery = buildHighlightQuery(cfg) + } else { + if cfg.Flags.IsColumnSearch { + field = "data.columns.name" + highlightQuery = repo.buildColumnQuery(boolQuery, cfg, field) + } else { + field = "name" + buildTextQuery(boolQuery, cfg) + highlightQuery = buildHighlightQuery(cfg) + } + } + buildFilterTermQueries(boolQuery, cfg.Filters) buildMustMatchQueries(boolQuery, cfg) - query := buildFunctionScoreQuery(boolQuery, cfg.RankBy, cfg.Text) - highlight := buildHighlightQuery(cfg) + query := buildFunctionScoreQuery(boolQuery, cfg.RankBy, cfg.Text, field) body, err := elastic.NewSearchRequest(). Query(query). - Highlight(highlight). + Highlight(highlightQuery). MinScore(defaultMinScore). Body() if err != nil { @@ -240,10 +332,6 @@ func buildSuggestQuery(cfg asset.SearchConfig) (io.Reader, error) { } func buildTextQuery(q *elastic.BoolQuery, cfg asset.SearchConfig) { - if strings.TrimSpace(cfg.Text) == "" { - q.Should(elastic.NewMatchAllQuery()) - } - boostedFields := []string{"urn^10", "name^5"} q.Should( // Phrase query cannot have `FUZZINESS` @@ -314,12 +402,12 @@ func buildFilterExistsQueries(q *elastic.BoolQuery, fields []string) { } } -func buildFunctionScoreQuery(query elastic.Query, rankBy, text string) elastic.Query { +func buildFunctionScoreQuery(query elastic.Query, rankBy, text, field string) elastic.Query { // Added exact match term query here so that exact match gets higher priority. fsQuery := elastic.NewFunctionScoreQuery() if text != "" { fsQuery.Add( - elastic.NewTermQuery("name.keyword", text), + elastic.NewTermQuery(fmt.Sprintf("%s.keyword", field), text), elastic.NewWeightFactorFunction(2), ) } diff --git a/internal/store/elasticsearch/discovery_search_repository_test.go b/internal/store/elasticsearch/discovery_search_repository_test.go index de0a4896..0fe1b524 100644 --- a/internal/store/elasticsearch/discovery_search_repository_test.go +++ b/internal/store/elasticsearch/discovery_search_repository_test.go @@ -35,9 +35,7 @@ func TestSearcherSearch(t *testing.T) { err = loadTestFixture(cli, esClient, "./testdata/search-test-fixture.json") require.NoError(t, err) - - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) - + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) type expectedRow struct { Type string AssetID string @@ -287,7 +285,7 @@ func TestSearcherSearch(t *testing.T) { Config: asset.SearchConfig{ Text: "tablename", Queries: map[string]string{ - "data.schema.columns.name": "common", + "data.columns.name": "common", }, IncludeFields: []string{"type", "id"}, }, @@ -355,7 +353,304 @@ func TestSearcherSearch(t *testing.T) { }, }, } - for _, test := range tests { + + columnTests := []searchTest{ + { + Description: "should fetch assets with fields mentioned in included fields for column search", + Config: asset.SearchConfig{ + Text: "username", + IncludeFields: []string{"id", "data.company", "type"}, + Flags: asset.SearchFlags{ + IsColumnSearch: true, + }, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "au2-microsoft-invoice", Data: map[string]interface{}{"company": "microsoft"}}, + }, + }, + { + Description: "should fetch assets with default fields if included fields is empty", + Config: asset.SearchConfig{ + Text: "username", + Flags: asset.SearchFlags{ + IsColumnSearch: true, + }, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "au2-microsoft-invoice", Service: "postgres", Data: map[string]interface{}{ + "columns": []interface{}{map[string]interface{}{"name": "id"}, map[string]interface{}{"description": "purchaser username", "name": "username"}, map[string]interface{}{"description": "item identifications", "name": "item_id"}}, "company": "microsoft", "country": "us", "description": "Transaction records for every microsoft purchase", "environment": "integration", "table_id": "au2-microsoft-invoice", "table_name": "microsoft-invoice", "total_rows": float64(100), + }}, + }, + }, + { + Description: "should fetch assets with empty text", + Config: asset.SearchConfig{ + Text: "", + IncludeFields: []string{"id", "type"}, + Filters: map[string][]string{"service": {"bigquery"}}, + Flags: asset.SearchFlags{ + IsColumnSearch: true, + }, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-1"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-common"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-abc-common-test"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-mid"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/abc-tablename-mid"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/test"}, + }, + }, + { + Description: "should fetch assets with empty text and rank by", + Config: asset.SearchConfig{ + Text: "", + RankBy: "data.profile.usage_count", + IncludeFields: []string{"id", "type"}, + Filters: map[string][]string{"service": {"bigquery"}}, + Flags: asset.SearchFlags{ + IsColumnSearch: true, + }, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-common"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-mid"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/test"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-1"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-abc-common-test"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/abc-tablename-mid"}, + }, + }, + { + Description: "should fetch assets which has text in either column name or description", + Config: asset.SearchConfig{ + Text: "purchaser", + IncludeFields: []string{"type", "id"}, + Flags: asset.SearchFlags{ + IsColumnSearch: true, + }, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "us1-apple-invoice"}, + {Type: "table", AssetID: "au2-microsoft-invoice"}, + }, + }, + { + Description: "should enable fuzzy search", + Config: asset.SearchConfig{ + Text: "uername", + IncludeFields: []string{"type", "id"}, + Flags: asset.SearchFlags{IsColumnSearch: true}, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "au2-microsoft-invoice"}, + }, + }, + { + Description: "should disable fuzzy search", + Config: asset.SearchConfig{ + Text: "uername", + Flags: asset.SearchFlags{DisableFuzzy: true, IsColumnSearch: true}, + IncludeFields: []string{"type", "id"}, + }, + Expected: []expectedRow{}, + }, + { + Description: "should put more weight on column name field", + Config: asset.SearchConfig{ + Text: "abc", + IncludeFields: []string{"type", "id"}, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "bigquery::gcpproject/dataset/abc-tablename-mid"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-abc-common-test"}, + }, + }, + { + Description: "should filter by service if given", + Config: asset.SearchConfig{ + Text: "purchaser", + Filters: map[string][]string{ + "service": {"postgres"}, + }, + IncludeFields: []string{"type", "id"}, + Flags: asset.SearchFlags{IsColumnSearch: true}, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "au2-microsoft-invoice"}, + }, + }, + { + Description: "should match documents based on filter criteria", + Config: asset.SearchConfig{ + Text: "purchaser", + Filters: map[string][]string{ + "data.company": {"apple"}, + }, + IncludeFields: []string{"type", "id"}, + Flags: asset.SearchFlags{IsColumnSearch: true}, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "us1-apple-invoice"}, + }, + }, + { + Description: "should return a descendingly sorted based on usage count in search results if rank by usage in the config", + Config: asset.SearchConfig{ + Text: "column3", + RankBy: "data.profile.usage_count", + IncludeFields: []string{"type", "id"}, + Flags: asset.SearchFlags{IsColumnSearch: true}, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-common"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-mid"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/test"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-1"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-abc-common-test"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/abc-tablename-mid"}, + }, + }, + { + Description: "should return 5 records with offset of 0", + Config: asset.SearchConfig{ + Text: "column3", + Offset: 0, + MaxResults: 5, + IncludeFields: []string{"type", "id"}, + RankBy: "data.profile.usage_count", + Flags: asset.SearchFlags{IsColumnSearch: true}, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-common"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-mid"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/test"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-1"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-abc-common-test"}, + }, + }, + { + Description: "should return 4 records with offset of 2", + Config: asset.SearchConfig{ + Text: "column3", + Offset: 2, + MaxResults: 5, + IncludeFields: []string{"type", "id"}, + RankBy: "data.profile.usage_count", + Flags: asset.SearchFlags{IsColumnSearch: true}, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "bigquery::gcpproject/dataset/test"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-1"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-abc-common-test"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/abc-tablename-mid"}, + }, + }, + { + Description: "should return 'bigquery::gcpproject/dataset/test' resource on top if search by column name 'tablename-common-column2", + Config: asset.SearchConfig{ + Text: "test-column3", + IncludeFields: []string{"type", "id"}, + Flags: asset.SearchFlags{IsColumnSearch: true}, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "bigquery::gcpproject/dataset/test"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-abc-common-test"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-1"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-common"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/tablename-mid"}, + {Type: "table", AssetID: "bigquery::gcpproject/dataset/abc-tablename-mid"}, + }, + }, + { + Description: "should return 'au2-microsoft-invoice' as it has column name same as text inspite of being part of excluded col search keywords", + Config: asset.SearchConfig{ + Text: "item_id", + IncludeFields: []string{"type", "id"}, + Flags: asset.SearchFlags{IsColumnSearch: true}, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "us1-apple-invoice"}, + {Type: "table", AssetID: "au2-microsoft-invoice"}, + }, + }, + { + Description: "should return results if text matches column description", + Config: asset.SearchConfig{ + Text: "identifications", + IncludeFields: []string{"type", "id"}, + Flags: asset.SearchFlags{IsColumnSearch: true}, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "us1-apple-invoice"}, + {Type: "table", AssetID: "au2-microsoft-invoice"}, + }, + }, + { + Description: "should return 'us1-apple-invoice' on top as it matches the column description exactly", + Config: asset.SearchConfig{ + Text: "purchaser user idenfitication", + IncludeFields: []string{"type", "id"}, + Flags: asset.SearchFlags{IsColumnSearch: true}, + }, + Expected: []expectedRow{ + {Type: "table", AssetID: "us1-apple-invoice"}, + {Type: "table", AssetID: "au2-microsoft-invoice"}, + }, + }, + { + Description: "should return highlighted text in resource if searched highlight text is enabled.", + Config: asset.SearchConfig{ + Text: "identification", + RankBy: "data.profile.usage_count", + Flags: asset.SearchFlags{ + EnableHighlight: true, + IsColumnSearch: true, + }, + IncludeFields: []string{"type", "id", "data.columns"}, + }, + + Expected: []expectedRow{ + { + Type: "table", + AssetID: "us1-apple-invoice", + Data: map[string]interface{}{ + "_highlight": map[string]interface{}{ + "data.columns.description": []interface{}{ + "item identifications", + "purchaser user idenfitication", + }, + }, + "columns": []interface{}{ + map[string]interface{}{"name": "id"}, + map[string]interface{}{"description": "purchaser user idenfitication", "name": "user_id"}, + map[string]interface{}{"description": "item identifications", "name": "item_id"}, + }, + }, + }, + { + Type: "table", + AssetID: "au2-microsoft-invoice", + Data: map[string]interface{}{ + "_highlight": map[string]interface{}{ + "data.columns.description": []interface{}{ + "item identifications", + }, + }, + "columns": []interface{}{ + map[string]interface{}{"name": "id"}, + map[string]interface{}{"description": "purchaser username", "name": "username"}, + map[string]interface{}{"description": "item identifications", "name": "item_id"}, + }, + }, + }, + }, + }, + } + + allTests := append(tests, columnTests...) + + for _, test := range allTests { t.Run(test.Description, func(t *testing.T) { results, err := repo.Search(ctx, test.Config) require.NoError(t, err) @@ -388,7 +683,7 @@ func TestSearcherSuggest(t *testing.T) { err = loadTestFixture(cli, esClient, "./testdata/suggest-test-fixture.json") require.NoError(t, err) - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) t.Run("fixtures", func(t *testing.T) { testCases := []struct { @@ -425,7 +720,7 @@ func loadTestFixture(cli *elasticsearch.Client, esClient *store.Client, filePath ctx := context.TODO() for _, testdata := range data { - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) for _, ast := range testdata.Assets { if err := repo.Upsert(ctx, ast); err != nil { return err @@ -453,7 +748,7 @@ func TestGroupAssets(t *testing.T) { ) require.NoError(t, err) - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) _, err = repo.GroupAssets(ctx, asset.GroupConfig{ GroupBy: []string{""}, }) @@ -474,7 +769,7 @@ func TestGroupAssets(t *testing.T) { err = loadTestFixture(cli, esClient, "./testdata/search-test-fixture.json") require.NoError(t, err) - repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10) + repo := store.NewDiscoveryRepository(esClient, log.NewNoop(), time.Second*10, []string{"number", "id"}) type groupTest struct { Description string diff --git a/internal/store/elasticsearch/schema.go b/internal/store/elasticsearch/schema.go index 18b7d398..ccb3ea7f 100644 --- a/internal/store/elasticsearch/schema.go +++ b/internal/store/elasticsearch/schema.go @@ -9,6 +9,12 @@ var indexSettingsTemplate = `{ %q: {} }, "settings": { + "similarity": { + "my_bm25_without_length_normalization": { + "type": "BM25", + "b": "0" + } + }, "index.mapping.ignore_malformed": true, "analysis": { "analyzer": { diff --git a/internal/store/elasticsearch/testdata/search-test-fixture.json b/internal/store/elasticsearch/testdata/search-test-fixture.json index b990e928..c7fc662c 100644 --- a/internal/store/elasticsearch/testdata/search-test-fixture.json +++ b/internal/store/elasticsearch/testdata/search-test-fixture.json @@ -114,7 +114,7 @@ "country": "us", "description": "Transaction records for every microsoft purchase", "total_rows": 100, - "schema": [ + "columns": [ { "name": "id" }, @@ -144,7 +144,7 @@ "country": "id", "description": "Transaction records for every Apple purchase", "total_rows": 100, - "schema": [ + "columns": [ { "name": "id" }, @@ -195,8 +195,7 @@ "owner": "user_1" } }, - "schema": { - "columns": [ + "columns": [ { "data_type": "STRING", "is_nullable": true, @@ -227,8 +226,7 @@ } } } - ] - }, + ], "resource": { "name": "tablename-1", "service": "bigquery", @@ -280,8 +278,7 @@ "owner": "user_1" } }, - "schema": { - "columns": [ + "columns": [ { "data_type": "STRING", "is_nullable": true, @@ -312,8 +309,7 @@ } } } - ] - }, + ], "resource": { "name": "tablename-common", "service": "bigquery", @@ -371,8 +367,7 @@ "owner": "user_1" } }, - "schema": { - "columns": [ + "columns": [ { "data_type": "STRING", "is_nullable": true, @@ -403,8 +398,8 @@ } } } - ] - }, + ], + "resource": { "name": "tablename-common-test", "service": "bigquery", @@ -469,8 +464,7 @@ "owner": "user_1" } }, - "schema": { - "columns": [ + "columns": [ { "data_type": "STRING", "is_nullable": true, @@ -501,8 +495,7 @@ } } } - ] - }, + ], "resource": { "name": "tablename-mid", "service": "bigquery", @@ -560,8 +553,7 @@ "owner": "user_1" } }, - "schema": { - "columns": [ + "columns": [ { "data_type": "STRING", "is_nullable": true, @@ -592,8 +584,7 @@ } } } - ] - }, + ], "resource": { "name": "abc-mid-test", "service": "bigquery", @@ -658,8 +649,7 @@ "owner": "user_1" } }, - "schema": { - "columns": [ + "columns": [ { "data_type": "STRING", "is_nullable": true, @@ -690,8 +680,7 @@ } } } - ] - }, + ], "resource": { "name": "tablename-mid", "service": "bigquery",