Skip to content

Commit

Permalink
Implementation for match_only_text field (#11039)
Browse files Browse the repository at this point in the history
* Implementation for match_only_text field

Signed-off-by: Rishabh Maurya <[email protected]>

* Fix build failures

Signed-off-by: Rishabh Maurya <[email protected]>

* Fix bugs

Signed-off-by: Rishabh Maurya <[email protected]>

* Added mapper tests, still failing on prefix and phrase tests

Signed-off-by: Rishabh Maurya <[email protected]>

* Disable index prefix and phrase mapper

Signed-off-by: Rishabh Maurya <[email protected]>

* Added unit tests for phrase and multiphrase query validation

Signed-off-by: Rishabh Maurya <[email protected]>

* Add unit tests for prefix and prefix phrase queries

Signed-off-by: Rishabh Maurya <[email protected]>

* Add a test to cover 3 word with synonym match phrase prefix query

Signed-off-by: Rishabh Maurya <[email protected]>

* Add unit test for SourceFieldMatchQuery

Signed-off-by: Rishabh Maurya <[email protected]>

* Added test for _source disabled case

Signed-off-by: Rishabh Maurya <[email protected]>

* Add unit test for missing field

Signed-off-by: Rishabh Maurya <[email protected]>

* more validation tests and changelog update

Signed-off-by: Rishabh Maurya <[email protected]>

* Added integration tests for match_only_text replicating text field integ tests

Signed-off-by: Rishabh Maurya <[email protected]>

* Added skip section in integ test to fix mixed cluster failures

Signed-off-by: Rishabh Maurya <[email protected]>

* remove unused import

Signed-off-by: Rishabh Maurya <[email protected]>

* Address PR comments

Signed-off-by: Rishabh Maurya <[email protected]>

* fix integ tests

Signed-off-by: Rishabh Maurya <[email protected]>

* Fix flaky test due to random indexwriter

Signed-off-by: Rishabh Maurya <[email protected]>

* pr comment: header modification

Signed-off-by: Rishabh Maurya <[email protected]>

* Address PR comments

Signed-off-by: Rishabh Maurya <[email protected]>

* added change to the right section of CHANGELOG

Signed-off-by: Rishabh Maurya <[email protected]>

* overriding the textFieldType before every test

Signed-off-by: Rishabh Maurya <[email protected]>

* rename @before method

Signed-off-by: Rishabh Maurya <[email protected]>

* update changelog description

Signed-off-by: Rishabh Maurya <[email protected]>

---------

Signed-off-by: Rishabh Maurya <[email protected]>
(cherry picked from commit 7b1c2c7)
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  • Loading branch information
github-actions[bot] committed Jan 3, 2024
1 parent 37f788f commit d381a59
Show file tree
Hide file tree
Showing 36 changed files with 3,959 additions and 169 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Create separate transport action for render search template action ([#11170](https://github.com/opensearch-project/OpenSearch/pull/11170))
- Add additional handling in SearchTemplateRequest when simulate is set to true ([#11591](https://github.com/opensearch-project/OpenSearch/pull/11591))
- Introduce cluster level setting `cluster.index.restrict.replication.type` to prevent replication type setting override during index creations([#11583](https://github.com/opensearch-project/OpenSearch/pull/11583))
- Add match_only_text field that is optimized for storage by trading off positional queries performance ([#11039](https://github.com/opensearch-project/OpenSearch/pull/11039))

### Dependencies
- Bumps jetty version to 9.4.52.v20230823 to fix GMS-2023-1857 ([#9822](https://github.com/opensearch-project/OpenSearch/pull/9822))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# integration tests for queries with specific analysis chains

"match query with stacked stems":
  - skip:
      version: " - 2.99.99"
      reason: "match_only_text was added in 3.0"
  # Tests the match query stemmed tokens are "stacked" on top of the unstemmed
  # versions in the same position.
  - do:
      indices.create:
        index: test
        body:
          settings:
            number_of_shards: 1
            number_of_replicas: 1
            analysis:
              analyzer:
                # Index-time chain: lowercase only, no stemming.
                index:
                  tokenizer: standard
                  filter: [lowercase]
                # Search-time chain: adds keyword_repeat, porter_stem and
                # unique_stem on top of lowercase. Only analyzer settings
                # belong here; `rest_total_hits_as_int` is a search request
                # parameter and is passed on the search calls below instead.
                search:
                  tokenizer: standard
                  filter: [lowercase, keyword_repeat, porter_stem, unique_stem]
              filter:
                unique_stem:
                  type: unique
                  only_on_same_position: true
          mappings:
            properties:
              text:
                type: match_only_text
                analyzer: index
                search_analyzer: search

  - do:
      index:
        index: test
        id: 1
        body: { "text": "the fox runs across the street" }
        refresh: true

  # Document 1 contains both "fox" and "runs", so the AND query matches it.
  - do:
      search:
        rest_total_hits_as_int: true
        body:
          query:
            match:
              text:
                query: fox runs
                operator: AND
  - match: {hits.total: 1}

  - do:
      index:
        index: test
        id: 2
        body: { "text": "run fox run" }
        refresh: true

  # Document 2 only contains "run" (not "runs"); the same query is expected
  # to match it as well, bringing the total to two hits.
  - do:
      search:
        rest_total_hits_as_int: true
        body:
          query:
            match:
              text:
                query: fox runs
                operator: AND
  - match: {hits.total: 2}
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"ngram search":
- skip:
version: " - 2.99.99"
reason: "match_only_text was added in 3.0"
- do:
indices.create:
index: test
body:
settings:
number_of_shards: 1
number_of_replicas: 0
analysis:
analyzer:
my_analyzer:
tokenizer: standard
filter: [my_ngram]
filter:
my_ngram:
type: ngram
min: 2,
max: 2
mappings:
properties:
text:
type: match_only_text
analyzer: my_analyzer

- do:
index:
index: test
id: 1
body: { "text": "foo bar baz" }
refresh: true

- do:
search:
rest_total_hits_as_int: true
body:
query:
match:
text:
query: foa
- match: {hits.total: 1}

---
"testNGramCopyField":
  - skip:
      version: " - 2.99.99"
      reason: "match_only_text was added in 3.0"
  # `origin` is copied into `meta`, which is analyzed with an ngram tokenizer
  # that keeps every character (empty token_chars) and does no lowercasing.
  - do:
      indices.create:
        index: test
        body:
          settings:
            number_of_shards: 1
            number_of_replicas: 0
            # Permits the 1..10 gram span configured on the tokenizer below.
            max_ngram_diff: 9
            analysis:
              analyzer:
                my_ngram_analyzer:
                  tokenizer: my_ngram_tokenizer
              tokenizer:
                my_ngram_tokenizer:
                  type: ngram
                  # The ngram tokenizer reads `min_gram` / `max_gram`. The
                  # former `min: 1,` / `max: 10` keys were not recognised
                  # (and the first carried a stray trailing comma), so the
                  # tokenizer used its default gram sizes and the
                  # max_ngram_diff setting above had no effect.
                  min_gram: 1
                  max_gram: 10
                  token_chars: []
          mappings:
            properties:
              origin:
                type: match_only_text
                copy_to: meta
              meta:
                type: match_only_text
                analyzer: my_ngram_analyzer

  - do:
      index:
        index: test
        id: 1
        body: { "origin": "C.A1234.5678" }
        refresh: true

  # Query strings that look numeric are quoted so YAML keeps them as text
  # rather than typing them as int/float.
  - do:
      search:
        rest_total_hits_as_int: true
        body:
          query:
            match:
              meta:
                query: "1234"
  - match: {hits.total: 1}

  - do:
      search:
        rest_total_hits_as_int: true
        body:
          query:
            match:
              meta:
                query: "1234.56"
  - match: {hits.total: 1}

  - do:
      search:
        rest_total_hits_as_int: true
        body:
          query:
            match:
              meta:
                query: A1234
  - match: {hits.total: 1}

  # A term query is not analyzed; the lowercase "a1234" is expected to find
  # nothing because the indexed grams keep the original upper-case "A".
  - do:
      search:
        rest_total_hits_as_int: true
        body:
          query:
            term:
              meta:
                value: a1234
  - match: {hits.total: 0}

  - do:
      search:
        rest_total_hits_as_int: true
        body:
          query:
            match:
              meta:
                query: A1234
                analyzer: my_ngram_analyzer
  - match: {hits.total: 1}

  # With the ngram analyzer applied at query time, the lowercase query still
  # matches through the grams it shares with the indexed value (the digits).
  - do:
      search:
        rest_total_hits_as_int: true
        body:
          query:
            match:
              meta:
                query: a1234
                analyzer: my_ngram_analyzer
  - match: {hits.total: 1}
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"ngram highlighting":
- skip:
version: " - 2.99.99"
reason: "match_only_text was added in 3.0"
- do:
indices.create:
index: test
body:
settings:
number_of_shards: 1
number_of_replicas: 0
index.max_ngram_diff: 19
analysis:
tokenizer:
my_ngramt:
type: ngram
min_gram: 1
max_gram: 20
token_chars: letter,digit
filter:
my_ngram:
type: ngram
min_gram: 1
max_gram: 20
analyzer:
name2_index_analyzer:
tokenizer: whitespace
filter: [my_ngram]
name_index_analyzer:
tokenizer: my_ngramt
name_search_analyzer:
tokenizer: whitespace
mappings:
properties:
name:
type: match_only_text
term_vector: with_positions_offsets
analyzer: name_index_analyzer
search_analyzer: name_search_analyzer
name2:
type: match_only_text
term_vector: with_positions_offsets
analyzer: name2_index_analyzer
search_analyzer: name_search_analyzer

- do:
index:
index: test
id: 1
refresh: true
body:
name: logicacmg ehemals avinci - the know how company
name2: logicacmg ehemals avinci - the know how company

- do:
search:
rest_total_hits_as_int: true
body:
query:
match:
name:
query: logica m
highlight:
fields:
- name: {}
- match: {hits.total: 1}
- match: {hits.hits.0.highlight.name.0: "<em>logica</em>c<em>m</em>g ehe<em>m</em>als avinci - the know how co<em>m</em>pany"}

- do:
search:
rest_total_hits_as_int: true
body:
query:
match:
name:
query: logica ma
highlight:
fields:
- name: {}
- match: {hits.total: 1}
- match: {hits.hits.0.highlight.name.0: "<em>logica</em>cmg ehe<em>ma</em>ls avinci - the know how company"}

- do:
search:
rest_total_hits_as_int: true
body:
query:
match:
name:
query: logica
highlight:
fields:
- name: {}
- match: {hits.total: 1}
- match: {hits.hits.0.highlight.name.0: "<em>logica</em>cmg ehemals avinci - the know how company"}

- do:
search:
rest_total_hits_as_int: true
body:
query:
match:
name2:
query: logica m
highlight:
fields:
- name2: {}
- match: {hits.total: 1}
- match: {hits.hits.0.highlight.name2.0: "<em>logicacmg</em> <em>ehemals</em> avinci - the know how <em>company</em>"}

- do:
search:
rest_total_hits_as_int: true
body:
query:
match:
name2:
query: logica ma
highlight:
fields:
- name2: {}
- match: {hits.total: 1}
- match: {hits.hits.0.highlight.name2.0: "<em>logicacmg</em> <em>ehemals</em> avinci - the know how company"}

- do:
search:
rest_total_hits_as_int: true
body:
query:
match:
name2:
query: logica
highlight:
fields:
- name2: {}
- match: {hits.total: 1}
- match: {hits.hits.0.highlight.name2.0: "<em>logicacmg</em> ehemals avinci - the know how company"}
Loading

0 comments on commit d381a59

Please sign in to comment.