From f580b30169e4abde28242bd38cf97348d1d6788b Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Thu, 14 Dec 2023 22:49:13 +0000 Subject: [PATCH 01/13] Add documentation for collapse, oversample, truncate_hits processors Signed-off-by: Michael Froh --- .../search-pipelines/collapse-processor.md | 143 +++++ .../search-pipelines/oversample-processor.md | 292 ++++++++++ .../search-pipelines/search-processors.md | 6 +- .../truncate-hits-processor.md | 515 ++++++++++++++++++ 4 files changed, 955 insertions(+), 1 deletion(-) create mode 100644 _search-plugins/search-pipelines/collapse-processor.md create mode 100644 _search-plugins/search-pipelines/oversample-processor.md create mode 100644 _search-plugins/search-pipelines/truncate-hits-processor.md diff --git a/_search-plugins/search-pipelines/collapse-processor.md b/_search-plugins/search-pipelines/collapse-processor.md new file mode 100644 index 0000000000..4b0bad616c --- /dev/null +++ b/_search-plugins/search-pipelines/collapse-processor.md @@ -0,0 +1,143 @@ +--- +layout: default +title: Collapse processor +nav_order: 7 +has_children: false +parent: Search processors +grand_parent: Search pipelines +--- + +# Collapse processor + +The `collapse` response processor discards hits that have the same value for some field as a previous document in the result set. +This is similar to the `collapse` parameter that can be passed in a search request, but the response processor is applied to the +response after fetching from all shards. The `collapse` response processor may be used in conjunction with the `rescore` search +request parameter or may be applied after a reranking response processor. + +Using the `collapse` response processor will likely result in fewer than `size` results being returned, since hits are discarded +from a set whose size is already less than or equal to `size`. 
To increase the likelihood of returning `size` hits, use the +`oversample` request processor and `truncate_hits` response processor, as shown in [this example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/#oversample-collapse-and-truncate-hits). + +## Request fields + +The following table lists all request fields. + +Field | Data type | Description +:--- | :--- | :--- +`field` | String | The field whose value will be read from each returned search hit. Only the first hit for each given field value will be returned in the search response. Required. +`context_prefix` | String | May be used to read the `original_size` variable from a specific scope to avoid collisions. Optional. +`tag` | String | The processor's identifier. Optional. +`description` | String | A description of the processor. Optional. +`ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. + +## Example + +The following example demonstrates using a search pipeline with a `collapse` processor. 
+ +### Setup + +Create many documents with a field that we'll use for collapsing: + +```json +POST /_bulk +{ "create":{"_index":"my_index","_id":1}} +{ "title" : "document 1", "color":"blue" } +{ "create":{"_index":"my_index","_id":2}} +{ "title" : "document 2", "color":"blue" } +{ "create":{"_index":"my_index","_id":3}} +{ "title" : "document 3", "color":"red" } +{ "create":{"_index":"my_index","_id":4}} +{ "title" : "document 4", "color":"red" } +{ "create":{"_index":"my_index","_id":5}} +{ "title" : "document 5", "color":"yellow" } +{ "create":{"_index":"my_index","_id":6}} +{ "title" : "document 6", "color":"yellow" } +{ "create":{"_index":"my_index","_id":7}} +{ "title" : "document 7", "color":"orange" } +{ "create":{"_index":"my_index","_id":8}} +{ "title" : "document 8", "color":"orange" } +{ "create":{"_index":"my_index","_id":9}} +{ "title" : "document 9", "color":"green" } +{ "create":{"_index":"my_index","_id":10}} +{ "title" : "document 10", "color":"green" } +``` +{% include copy-curl.html %} + +Create a pipeline that just collapses on the `color` field: + +```json +PUT /_search/pipeline/collapse_pipeline +{ + "response_processors": [ + { + "collapse" : { + "field": "color" + } + } + ] +} +``` +{% include copy-curl.html %} + +### Using a search pipeline + +In this example, we request the top 3 documents before collapsing on the "color" field. Since the first 2 documents have the same "color", the second one is discarded, +and the request returns the first and third document: + +```json +POST /my_index/_search?search_pipeline=collapse_pipeline +{ + "size": 3 +} +``` +{% include copy-curl.html %} + + +
+ + Response + + {: .text-delta} +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "title" : "document 1", + "color" : "blue" + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "title" : "document 3", + "color" : "red" + } + } + ] + }, + "profile" : { + "shards" : [ ] + } +} +``` +
diff --git a/_search-plugins/search-pipelines/oversample-processor.md b/_search-plugins/search-pipelines/oversample-processor.md
new file mode 100644
index 0000000000..9f36b63570
--- /dev/null
+++ b/_search-plugins/search-pipelines/oversample-processor.md
@@ -0,0 +1,292 @@
+---
+layout: default
+title: Oversample processor
+nav_order: 17
+has_children: false
+parent: Search processors
+grand_parent: Search pipelines
+---
+
+# Oversample processor
+
+The `oversample` request processor multiplies the `size` parameter of the search request by a specified `sample_factor` (>= 1.0), saving the
+original value in the `original_size` pipeline variable. The `oversample` processor is designed to work together with the
+[`truncate_hits` response processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/), but may be used on its own.
+
+## Request fields
+
+The following table lists all request fields.
+
+Field | Data type | Description
+:--- | :--- | :---
+`sample_factor` | Number | The multiplicative factor (>= 1.0) that will be applied to the `size` parameter before processing the search request. Required.
+`context_prefix` | String | May be used to scope the `original_size` variable to avoid collisions. Optional.
+`tag` | String | The processor's identifier. Optional.
+`description` | String | A description of the processor. Optional.
+`ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`.
+
+
+## Example
+
+The following example demonstrates using a search pipeline with an `oversample` processor.
+ +### Setup + +Create an index named `my_index` with many documents: + +```json +POST /_bulk +{ "create":{"_index":"my_index","_id":1}} +{ "doc": { "title" : "document 1" }} +{ "create":{"_index":"my_index","_id":2}} +{ "doc": { "title" : "document 2" }} +{ "create":{"_index":"my_index","_id":3}} +{ "doc": { "title" : "document 3" }} +{ "create":{"_index":"my_index","_id":4}} +{ "doc": { "title" : "document 4" }} +{ "create":{"_index":"my_index","_id":5}} +{ "doc": { "title" : "document 5" }} +{ "create":{"_index":"my_index","_id":6}} +{ "doc": { "title" : "document 6" }} +{ "create":{"_index":"my_index","_id":7}} +{ "doc": { "title" : "document 7" }} +{ "create":{"_index":"my_index","_id":8}} +{ "doc": { "title" : "document 8" }} +{ "create":{"_index":"my_index","_id":9}} +{ "doc": { "title" : "document 9" }} +{ "create":{"_index":"my_index","_id":10}} +{ "doc": { "title" : "document 10" }} +``` +{% include copy-curl.html %} + +### Creating a search pipeline + +The following request creates a search pipeline called `my_pipeline` with a `oversample` request processor that requests 50% more hits than specified in `size`: + +```json +PUT /_search/pipeline/my_pipeline +{ + "request_processors": [ + { + "oversample" : { + "tag" : "oversample_1", + "description" : "This processor will multiply `size` by 1.5.", + "sample_factor" : 1.5 + } + } + ] +} +``` +{% include copy-curl.html %} + +### Using a search pipeline + +Search for documents in `my_index` without a search pipeline: + +```json +POST /my_index/_search +{ + "size": 5 +} +``` +{% include copy-curl.html %} + +The response contains 5 hits: + +
+ + Response + + {: .text-delta} +```json +{ + "took" : 3, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 1" + } + } + }, + { + "_index" : "my_index", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 2" + } + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 3" + } + } + }, + { + "_index" : "my_index", + "_id" : "4", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 4" + } + } + }, + { + "_index" : "my_index", + "_id" : "5", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 5" + } + } + } + ] + } +} +``` +
+ +To search with a pipeline, specify the pipeline name in the `search_pipeline` query parameter: + +```json +POST /my_index/_search?search_pipeline=my_pipeline +{ + "size": 5 +} +``` +{% include copy-curl.html %} + +The response contains 8 documents (5 * 1.5 = 7.5, rounded up to 8): + +
+ + Response + + {: .text-delta} +```json +{ + "took" : 13, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 1" + } + } + }, + { + "_index" : "my_index", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 2" + } + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 3" + } + } + }, + { + "_index" : "my_index", + "_id" : "4", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 4" + } + } + }, + { + "_index" : "my_index", + "_id" : "5", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 5" + } + } + }, + { + "_index" : "my_index", + "_id" : "6", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 6" + } + } + }, + { + "_index" : "my_index", + "_id" : "7", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 7" + } + } + }, + { + "_index" : "my_index", + "_id" : "8", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 8" + } + } + } + ] + } +} +``` +
diff --git a/_search-plugins/search-pipelines/search-processors.md b/_search-plugins/search-pipelines/search-processors.md
index e4ef4e5f8f..73d2fdfd32 100644
--- a/_search-plugins/search-pipelines/search-processors.md
+++ b/_search-plugins/search-pipelines/search-processors.md
@@ -26,6 +26,8 @@ Processor | Description | Earliest available version
 [`filter_query`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor/) | Adds a filtering query that is used to filter requests. | 2.8
 [`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) | Sets a default model for neural search at the index or field level. | 2.11
 [`script`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/script-processor/) | Adds a script that is run on newly indexed documents. | 2.8
+[`oversample`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/oversample-processor/) | Increases the `size` parameter on a search request, storing the original value in pipeline state. | 2.12
+
 
 ## Search response processors
 
@@ -37,12 +39,14 @@ Processor | Description | Earliest available version
 :--- | :--- | :---
 [`personalize_search_ranking`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/personalize-search-ranking/) | Uses [Amazon Personalize](https://aws.amazon.com/personalize/) to rerank search results (requires setting up the Amazon Personalize service). | 2.9
 [`rename_field`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rename-field-processor/)| Renames an existing field. | 2.8
+[`collapse`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/collapse-processor/)| Deduplicates search hits based on a field value, similar to `collapse` in a search request. | 2.12
+[`truncate_hits`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/)| Discards search hits after a specified target count.
Can "undo" the effect of `oversample` request processor. | 2.12 ## Search phase results processors A search phase results processor runs between search phases at the coordinating node level. It intercepts the results retrieved from one search phase and transforms them before passing them to the next search phase. -The following table lists all supported search request processors. +The following table lists all supported search phase results processors. Processor | Description | Earliest available version :--- | :--- | :--- diff --git a/_search-plugins/search-pipelines/truncate-hits-processor.md b/_search-plugins/search-pipelines/truncate-hits-processor.md new file mode 100644 index 0000000000..13400fc064 --- /dev/null +++ b/_search-plugins/search-pipelines/truncate-hits-processor.md @@ -0,0 +1,515 @@ +--- +layout: default +title: Truncate hits processor +nav_order: 35 +has_children: false +parent: Search processors +grand_parent: Search pipelines +--- + +# Truncate hits processor + +The `truncate_hits` response processor discards returned search hits after a given hit count. The `truncate_hits` processor is designed to work together with the +[`oversample` request processor]({{site.url}}{{site.baseurl}}/search-plugins/oversample-processor/), but may be used on its own. + +The `target_size` parameter (to specify where to truncate) is optional. If it is not specified, then we'll use the `original_size` variable set by the +`oversample` processor (if available). + +A common usage pattern is to add the `oversample` processor to a request pipeline to fetch a larger set of results, then in the response pipeline, apply a +reranking processor (which may now promote results from beyond the the originally-requested top N) or the `collapse` processor (which may discard results after +deduplication), then apply the `truncate` processor to return (at most) the originally-requested number of hits. + +## Request fields + +The following table lists all request fields. 
+ +Field | Data type | Description +:--- | :--- | :--- +`target_size` | Integer | The maximum number of search hits to return (>=0). If not specified, the processor will try to read the `original_size` variable, failing if it is not available. Optional. +`context_prefix` | String | May be used to read the `original_size` variable from a specific scope to avoid collisions. Optional. +`tag` | String | The processor's identifier. Optional. +`description` | String | A description of the processor. Optional. +`ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. + +## Example + +The following example demonstrates using a search pipeline with a `truncate` processor. + +### Setup + +Create an index named `my_index` with many documents: + +```json +POST /_bulk +{ "create":{"_index":"my_index","_id":1}} +{ "doc": { "title" : "document 1" }} +{ "create":{"_index":"my_index","_id":2}} +{ "doc": { "title" : "document 2" }} +{ "create":{"_index":"my_index","_id":3}} +{ "doc": { "title" : "document 3" }} +{ "create":{"_index":"my_index","_id":4}} +{ "doc": { "title" : "document 4" }} +{ "create":{"_index":"my_index","_id":5}} +{ "doc": { "title" : "document 5" }} +{ "create":{"_index":"my_index","_id":6}} +{ "doc": { "title" : "document 6" }} +{ "create":{"_index":"my_index","_id":7}} +{ "doc": { "title" : "document 7" }} +{ "create":{"_index":"my_index","_id":8}} +{ "doc": { "title" : "document 8" }} +{ "create":{"_index":"my_index","_id":9}} +{ "doc": { "title" : "document 9" }} +{ "create":{"_index":"my_index","_id":10}} +{ "doc": { "title" : "document 10" }} +``` +{% include copy-curl.html %} + +### Creating a search pipeline + +The following request creates a search pipeline called `my_pipeline` with a `truncate_hits` 
response processor that discards hits after the first 5: + +```json +PUT /_search/pipeline/my_pipeline +{ + "response_processors": [ + { + "truncate_hits" : { + "tag" : "truncate_1", + "description" : "This processor will discard results after the first 5.", + "target_size" : 5 + } + } + ] +} +``` +{% include copy-curl.html %} + +### Using a search pipeline + +Search for documents in `my_index` without a search pipeline: + +```json +POST /my_index/_search +{ + "size": 8 +} +``` +{% include copy-curl.html %} + +The response contains 8 hits: + +
+ + Response + + {: .text-delta} +```json +{ + "took" : 13, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 1" + } + } + }, + { + "_index" : "my_index", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 2" + } + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 3" + } + } + }, + { + "_index" : "my_index", + "_id" : "4", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 4" + } + } + }, + { + "_index" : "my_index", + "_id" : "5", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 5" + } + } + }, + { + "_index" : "my_index", + "_id" : "6", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 6" + } + } + }, + { + "_index" : "my_index", + "_id" : "7", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 7" + } + } + }, + { + "_index" : "my_index", + "_id" : "8", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 8" + } + } + } + ] + } +} +``` +
+ +To search with a pipeline, specify the pipeline name in the `search_pipeline` query parameter: + +```json +POST /my_index/_search?search_pipeline=my_pipeline +{ + "size": 8 +} +``` +{% include copy-curl.html %} + +The response only contains 5 hits, though we requested 8 and 10 were available. + +
+ + Response + + {: .text-delta} +```json +{ + "took" : 3, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 1" + } + } + }, + { + "_index" : "my_index", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 2" + } + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 3" + } + } + }, + { + "_index" : "my_index", + "_id" : "4", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 4" + } + } + }, + { + "_index" : "my_index", + "_id" : "5", + "_score" : 1.0, + "_source" : { + "doc" : { + "title" : "document 5" + } + } + } + ] + } +} +``` +
+ +## Oversample, collapse, and truncate hits + +This shows a more realistic example where you would use `oversample` to request many candidate documents, use `collapse` to remove documents that +duplicate some field (to get more diverse results), then use `truncate` to return the originally-requested document count (avoiding returning a +large result payload from the cluster). + + +### Setup + +Create many documents with a field that we'll use for collapsing: + +```json +POST /_bulk +{ "create":{"_index":"my_index","_id":1}} +{ "title" : "document 1", "color":"blue" } +{ "create":{"_index":"my_index","_id":2}} +{ "title" : "document 2", "color":"blue" } +{ "create":{"_index":"my_index","_id":3}} +{ "title" : "document 3", "color":"red" } +{ "create":{"_index":"my_index","_id":4}} +{ "title" : "document 4", "color":"red" } +{ "create":{"_index":"my_index","_id":5}} +{ "title" : "document 5", "color":"yellow" } +{ "create":{"_index":"my_index","_id":6}} +{ "title" : "document 6", "color":"yellow" } +{ "create":{"_index":"my_index","_id":7}} +{ "title" : "document 7", "color":"orange" } +{ "create":{"_index":"my_index","_id":8}} +{ "title" : "document 8", "color":"orange" } +{ "create":{"_index":"my_index","_id":9}} +{ "title" : "document 9", "color":"green" } +{ "create":{"_index":"my_index","_id":10}} +{ "title" : "document 10", "color":"green" } +``` +{% include copy-curl.html %} + +Create a pipeline that just collapses on the `color` field: + +```json +PUT /_search/pipeline/collapse_pipeline +{ + "response_processors": [ + { + "collapse" : { + "field": "color" + } + } + ] +} +``` +{% include copy-curl.html %} + +Create another pipeline that oversamples, collapses, then truncates results: + +```json +PUT /_search/pipeline/oversampling_collapse_pipeline +{ + "request_processors": [ + { + "oversample": { + "sample_factor": 3 + } + } + ], + "response_processors": [ + { + "collapse" : { + "field": "color" + } + }, + { + "truncate_hits": { + "description": "Truncates 
back to the original size before oversample increased it." + } + } + ] +} +``` +{% include copy-curl.html %} + +### Collapse without oversample + +In this example, we request the top 3 documents before collapsing on the "color" field. Since the first 2 documents have the same "color", the second one is discarded, +and the request returns the first and third document: + +```json +POST /my_index/_search?search_pipeline=collapse_pipeline +{ + "size": 3 +} +``` +{% include copy-curl.html %} + + +
+ + Response + + {: .text-delta} +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "title" : "document 1", + "color" : "blue" + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "title" : "document 3", + "color" : "red" + } + } + ] + }, + "profile" : { + "shards" : [ ] + } +} +``` +
+ + +### Oversample, collapse, and truncate + +Now, we will use the `oversampling_collapse_pipeline` that requests the top 9 documents (multiplying the size by 3), deduplicates by "color", +then returns the top 3 hits: + +```json +POST /my_index/_search?search_pipeline=oversampling_collapse_pipeline +{ + "size": 3 +} +``` +{% include copy-curl.html %} + + +
+ + Response + + {: .text-delta} +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 10, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "my_index", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "title" : "document 1", + "color" : "blue" + } + }, + { + "_index" : "my_index", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "title" : "document 3", + "color" : "red" + } + }, + { + "_index" : "my_index", + "_id" : "5", + "_score" : 1.0, + "_source" : { + "title" : "document 5", + "color" : "yellow" + } + } + ] + }, + "profile" : { + "shards" : [ ] + } +} +``` +
+ + From ae3e287410db67269faadab8794ca46ea8d8aa10 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Fri, 15 Dec 2023 17:16:29 -0500 Subject: [PATCH 02/13] Apply suggestions from code review Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- .../truncate-hits-processor.md | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/_search-plugins/search-pipelines/truncate-hits-processor.md b/_search-plugins/search-pipelines/truncate-hits-processor.md index 13400fc064..d577b355fa 100644 --- a/_search-plugins/search-pipelines/truncate-hits-processor.md +++ b/_search-plugins/search-pipelines/truncate-hits-processor.md @@ -12,12 +12,12 @@ grand_parent: Search pipelines The `truncate_hits` response processor discards returned search hits after a given hit count. The `truncate_hits` processor is designed to work together with the [`oversample` request processor]({{site.url}}{{site.baseurl}}/search-plugins/oversample-processor/), but may be used on its own. -The `target_size` parameter (to specify where to truncate) is optional. If it is not specified, then we'll use the `original_size` variable set by the +The `target_size` parameter (to specify where to truncate) is optional. If it is not specified, then OpenSearch uses the `original_size` variable set by the `oversample` processor (if available). -A common usage pattern is to add the `oversample` processor to a request pipeline to fetch a larger set of results, then in the response pipeline, apply a -reranking processor (which may now promote results from beyond the the originally-requested top N) or the `collapse` processor (which may discard results after -deduplication), then apply the `truncate` processor to return (at most) the originally-requested number of hits. 
+A common usage pattern is to add the `oversample` processor to a request pipeline to fetch a larger set of results, then, in the response pipeline, to apply a
+reranking processor (which may promote results from beyond the originally requested top N) or the `collapse` processor (which may discard results after
+deduplication), then apply the `truncate` processor to return (at most) the originally requested number of hits.
 
 ## Request fields
 
@@ -216,7 +216,7 @@ POST /my_index/_search?search_pipeline=my_pipeline
 ```
 {% include copy-curl.html %}
 
-The response only contains 5 hits, though we requested 8 and 10 were available.
+The response only contains 5 hits, even though 8 were requested and 10 were available:
@@ -298,14 +298,14 @@ The response only contains 5 hits, though we requested 8 and 10 were available. ## Oversample, collapse, and truncate hits -This shows a more realistic example where you would use `oversample` to request many candidate documents, use `collapse` to remove documents that -duplicate some field (to get more diverse results), then use `truncate` to return the originally-requested document count (avoiding returning a +The following is a more realistic example, where you use `oversample` to request many candidate documents, use `collapse` to remove documents that +duplicate some field (to get more diverse results), then use `truncate` to return the originally requested document count (to avoid returning a large result payload from the cluster). ### Setup -Create many documents with a field that we'll use for collapsing: +Create many documents with a field that you'll use for collapsing: ```json POST /_bulk @@ -332,7 +332,7 @@ POST /_bulk ``` {% include copy-curl.html %} -Create a pipeline that just collapses on the `color` field: +Create a pipeline that only collapses on the `color` field: ```json PUT /_search/pipeline/collapse_pipeline @@ -378,8 +378,8 @@ PUT /_search/pipeline/oversampling_collapse_pipeline ### Collapse without oversample -In this example, we request the top 3 documents before collapsing on the "color" field. Since the first 2 documents have the same "color", the second one is discarded, -and the request returns the first and third document: +In this example, you request the top 3 documents before collapsing on the "color" field. 
Because the first two documents have the same `color`, the second one is discarded, +and the request returns the first and third documents: ```json POST /my_index/_search?search_pipeline=collapse_pipeline @@ -442,7 +442,7 @@ POST /my_index/_search?search_pipeline=collapse_pipeline ### Oversample, collapse, and truncate -Now, we will use the `oversampling_collapse_pipeline` that requests the top 9 documents (multiplying the size by 3), deduplicates by "color", +Now, you will use the `oversampling_collapse_pipeline` that requests the top 9 documents (multiplying the size by 3), deduplicates by "color", then returns the top 3 hits: ```json From 218f6d289b0c2f07ae94eb18254717e9796b2291 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 18 Dec 2023 07:45:41 -0500 Subject: [PATCH 03/13] Update _search-plugins/search-pipelines/oversample-processor.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _search-plugins/search-pipelines/oversample-processor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_search-plugins/search-pipelines/oversample-processor.md b/_search-plugins/search-pipelines/oversample-processor.md index 9f36b63570..9afdac726a 100644 --- a/_search-plugins/search-pipelines/oversample-processor.md +++ b/_search-plugins/search-pipelines/oversample-processor.md @@ -19,7 +19,7 @@ The following table lists all request fields. Field | Data type | Description :--- | :--- | :--- -`sample_factor` | Number | The multiplicative factor (>= 1.0) that will be applied to the `size` parameter before processing the search request. Required. +`sample_factor` | Float | The multiplicative factor (>= 1.0) that will be applied to the `size` parameter before processing the search request. Required. `context_prefix` | String | May be used to scope the `original_size` variable to avoid collisions. Optional. `tag` | String | The processor's identifier. Optional. 
`description` | String | A description of the processor. Optional. From dd5b6f94cf949e1a3c1be89d0b1c79ac0767124a Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 18 Dec 2023 07:55:20 -0500 Subject: [PATCH 04/13] Update _search-plugins/search-pipelines/collapse-processor.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _search-plugins/search-pipelines/collapse-processor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_search-plugins/search-pipelines/collapse-processor.md b/_search-plugins/search-pipelines/collapse-processor.md index 4b0bad616c..082b4de468 100644 --- a/_search-plugins/search-pipelines/collapse-processor.md +++ b/_search-plugins/search-pipelines/collapse-processor.md @@ -1,6 +1,6 @@ --- layout: default -title: Collapse processor +title: Collapse nav_order: 7 has_children: false parent: Search processors From fee0d2f15206cf8868345c66751c1577278869b6 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 18 Dec 2023 07:55:40 -0500 Subject: [PATCH 05/13] Update _search-plugins/search-pipelines/oversample-processor.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _search-plugins/search-pipelines/oversample-processor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_search-plugins/search-pipelines/oversample-processor.md b/_search-plugins/search-pipelines/oversample-processor.md index 9afdac726a..9f201b8ca8 100644 --- a/_search-plugins/search-pipelines/oversample-processor.md +++ b/_search-plugins/search-pipelines/oversample-processor.md @@ -1,6 +1,6 @@ --- layout: default -title: Oversample processor +title: Oversample nav_order: 17 has_children: false parent: Search processors From 30d049022255f6e031f12084d9fbb09ad33e9e68 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 18 Dec 2023 07:56:03 
-0500 Subject: [PATCH 06/13] Update _search-plugins/search-pipelines/truncate-hits-processor.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _search-plugins/search-pipelines/truncate-hits-processor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_search-plugins/search-pipelines/truncate-hits-processor.md b/_search-plugins/search-pipelines/truncate-hits-processor.md index d577b355fa..2c9713e367 100644 --- a/_search-plugins/search-pipelines/truncate-hits-processor.md +++ b/_search-plugins/search-pipelines/truncate-hits-processor.md @@ -1,6 +1,6 @@ --- layout: default -title: Truncate hits processor +title: Truncate hits nav_order: 35 has_children: false parent: Search processors From 705aa2892382f522bde6056ffb1e8d7e64cbda48 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 18 Dec 2023 09:28:59 -0500 Subject: [PATCH 07/13] Apply suggestions from code review Co-authored-by: Nathan Bower Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- .../search-pipelines/collapse-processor.md | 12 ++++---- .../search-pipelines/oversample-processor.md | 12 ++++---- .../search-pipelines/search-processors.md | 6 ++-- .../truncate-hits-processor.md | 30 +++++++++---------- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/_search-plugins/search-pipelines/collapse-processor.md b/_search-plugins/search-pipelines/collapse-processor.md index 082b4de468..55ac8db873 100644 --- a/_search-plugins/search-pipelines/collapse-processor.md +++ b/_search-plugins/search-pipelines/collapse-processor.md @@ -14,7 +14,7 @@ This is similar to the `collapse` parameter that can be passed in a search reque response after fetching from all shards. The `collapse` response processor may be used in conjunction with the `rescore` search request parameter or may be applied after a reranking response processor. 
-Using the `collapse` response processor will likely result in fewer than `size` results being returned, since hits are discarded +Using the `collapse` response processor will likely result in fewer than `size` results being returned because hits are discarded from a set whose size is already less than or equal to `size`. To increase the likelihood of returning `size` hits, use the `oversample` request processor and `truncate_hits` response processor, as shown in [this example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/#oversample-collapse-and-truncate-hits). @@ -25,7 +25,7 @@ The following table lists all request fields. Field | Data type | Description :--- | :--- | :--- `field` | String | The field whose value will be read from each returned search hit. Only the first hit for each given field value will be returned in the search response. Required. -`context_prefix` | String | May be used to read the `original_size` variable from a specific scope to avoid collisions. Optional. +`context_prefix` | String | May be used to read the `original_size` variable from a specific scope in order to avoid collisions. Optional. `tag` | String | The processor's identifier. Optional. `description` | String | A description of the processor. Optional. `ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. 
@@ -36,7 +36,7 @@ The following example demonstrates using a search pipeline with a `collapse` pro ### Setup -Create many documents with a field that we'll use for collapsing: +Create many documents containing a field to use for collapsing: ```json POST /_bulk @@ -63,7 +63,7 @@ POST /_bulk ``` {% include copy-curl.html %} -Create a pipeline that just collapses on the `color` field: +Create a pipeline that only collapses on the `color` field: ```json PUT /_search/pipeline/collapse_pipeline @@ -81,8 +81,8 @@ PUT /_search/pipeline/collapse_pipeline ### Using a search pipeline -In this example, we request the top 3 documents before collapsing on the "color" field. Since the first 2 documents have the same "color", the second one is discarded, -and the request returns the first and third document: +In this example, you request the top three documents before collapsing on the `color` field. Because the first two documents have the same `color`, the second one is discarded, +and the request returns the first and third documents: ```json POST /my_index/_search?search_pipeline=collapse_pipeline diff --git a/_search-plugins/search-pipelines/oversample-processor.md b/_search-plugins/search-pipelines/oversample-processor.md index 9f201b8ca8..f14d2100e5 100644 --- a/_search-plugins/search-pipelines/oversample-processor.md +++ b/_search-plugins/search-pipelines/oversample-processor.md @@ -10,8 +10,8 @@ grand_parent: Search pipelines # Oversample processor The `oversample` request processor multiplies the `size` parameter of the search request by a specified `sample_factor` (>= 1.0), saving the -original value in the `original_size` pipeline variable. The `oversample` processor is designed to work together with the -[`truncate_hits` response processor]({{site.url}}{{site.baseurl}}/search-plugins/truncate-hits-processor/), but may be used on its own. +original value in the `original_size` pipeline variable. 
The `oversample` processor is designed to work with the +[`truncate_hits` response processor]({{site.url}}{{site.baseurl}}/search-plugins/truncate-hits-processor/) but may be used on its own. ## Request fields @@ -20,7 +20,7 @@ The following table lists all request fields. Field | Data type | Description :--- | :--- | :--- `sample_factor` | Float | The multiplicative factor (>= 1.0) that will be applied to the `size` parameter before processing the search request. Required. -`context_prefix` | String | May be used to scope the `original_size` variable to avoid collisions. Optional. +`context_prefix` | String | May be used to scope the `original_size` variable in order to avoid collisions. Optional. `tag` | String | The processor's identifier. Optional. `description` | String | A description of the processor. Optional. `ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. @@ -32,7 +32,7 @@ The following example demonstrates using a search pipeline with an `oversample` ### Setup -Create an index named `my_index` with many documents: +Create an index named `my_index` containing many documents: ```json POST /_bulk @@ -61,7 +61,7 @@ POST /_bulk ### Creating a search pipeline -The following request creates a search pipeline called `my_pipeline` with a `oversample` request processor that requests 50% more hits than specified in `size`: +The following request creates a search pipeline named `my_pipeline` with an `oversample` request processor that requests 50% more hits than specified in `size`: ```json PUT /_search/pipeline/my_pipeline @@ -91,7 +91,7 @@ POST /my_index/_search ``` {% include copy-curl.html %} -The response contains 5 hits: +The response contains five hits:
diff --git a/_search-plugins/search-pipelines/search-processors.md b/_search-plugins/search-pipelines/search-processors.md index 73d2fdfd32..aa4934d2b5 100644 --- a/_search-plugins/search-pipelines/search-processors.md +++ b/_search-plugins/search-pipelines/search-processors.md @@ -26,7 +26,7 @@ Processor | Description | Earliest available version [`filter_query`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor/) | Adds a filtering query that is used to filter requests. | 2.8 [`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) | Sets a default model for neural search at the index or field level. | 2.11 [`script`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/script-processor/) | Adds a script that is run on newly indexed documents. | 2.8 -[`oversample`]({site.url}}{{site.baseurl}}/search-plugins/search-pipelines/oversample-processor/) | Increases the `size` parameter on a search request, storing the original value in pipeline state. | 2.12 +[`oversample`]({site.url}}{{site.baseurl}}/search-plugins/search-pipelines/oversample-processor/) | Increases the search request `size` parameter, storing the original value in the pipeline state. | 2.12 ## Search response processors @@ -39,8 +39,8 @@ Processor | Description | Earliest available version :--- | :--- | :--- [`personalize_search_ranking`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/personalize-search-ranking/) | Uses [Amazon Personalize](https://aws.amazon.com/personalize/) to rerank search results (requires setting up the Amazon Personalize service). | 2.9 [`rename_field`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/rename-field-processor/)| Renames an existing field. | 2.8 -[`collapse`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/collapse-processor/)| Deduplicates search hits based on a field value, similar to `collapse` in a search request. 
| 2.12 -[`truncate_hits`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/)| Discards search hits after a specified target count. Can "undo" the effect of `oversample` request processor. | 2.12 +[`collapse`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/collapse-processor/)| Deduplicates search hits based on a field value, similarly to `collapse` in a search request. | 2.12 +[`truncate_hits`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/)| Discards search hits after a specified target count is reached. Can undo the effect of the `oversample` request processor. | 2.12 ## Search phase results processors diff --git a/_search-plugins/search-pipelines/truncate-hits-processor.md b/_search-plugins/search-pipelines/truncate-hits-processor.md index 2c9713e367..d1cb861b5c 100644 --- a/_search-plugins/search-pipelines/truncate-hits-processor.md +++ b/_search-plugins/search-pipelines/truncate-hits-processor.md @@ -9,10 +9,10 @@ grand_parent: Search pipelines # Truncate hits processor -The `truncate_hits` response processor discards returned search hits after a given hit count. The `truncate_hits` processor is designed to work together with the -[`oversample` request processor]({{site.url}}{{site.baseurl}}/search-plugins/oversample-processor/), but may be used on its own. +The `truncate_hits` response processor discards returned search hits after a given hit count is reached. The `truncate_hits` processor is designed to work with the +[`oversample` request processor]({{site.url}}{{site.baseurl}}/search-plugins/oversample-processor/) but may be used on its own. -The `target_size` parameter (to specify where to truncate) is optional. If it is not specified, then OpenSearch uses the `original_size` variable set by the +The `target_size` parameter (which specifies where to truncate) is optional. 
If it is not specified, then OpenSearch uses the `original_size` variable set by the `oversample` processor (if available). A common usage pattern is to add the `oversample` processor to a request pipeline to fetch a larger set of results, then, in the response pipeline, to apply a @@ -25,8 +25,8 @@ The following table lists all request fields. Field | Data type | Description :--- | :--- | :--- -`target_size` | Integer | The maximum number of search hits to return (>=0). If not specified, the processor will try to read the `original_size` variable, failing if it is not available. Optional. -`context_prefix` | String | May be used to read the `original_size` variable from a specific scope to avoid collisions. Optional. +`target_size` | Integer | The maximum number of search hits to return (>=0). If not specified, the processor will try to read the `original_size` variable and will fail if it is not available. Optional. +`context_prefix` | String | May be used to read the `original_size` variable from a specific scope in order to avoid collisions. Optional. `tag` | String | The processor's identifier. Optional. `description` | String | A description of the processor. Optional. `ignore_failure` | Boolean | If `true`, OpenSearch [ignores any failure]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/#ignoring-processor-failures) of this processor and continues to run the remaining processors in the search pipeline. Optional. Default is `false`. 
@@ -37,7 +37,7 @@ The following example demonstrates using a search pipeline with a `truncate` pro ### Setup -Create an index named `my_index` with many documents: +Create an index named `my_index` containing many documents: ```json POST /_bulk @@ -66,7 +66,7 @@ POST /_bulk ### Creating a search pipeline -The following request creates a search pipeline called `my_pipeline` with a `truncate_hits` response processor that discards hits after the first 5: +The following request creates a search pipeline named `my_pipeline` with a `truncate_hits` response processor that discards hits after the first five: ```json PUT /_search/pipeline/my_pipeline @@ -96,7 +96,7 @@ POST /my_index/_search ``` {% include copy-curl.html %} -The response contains 8 hits: +The response contains eight hits:
@@ -216,7 +216,7 @@ POST /my_index/_search?search_pipeline=my_pipeline ``` {% include copy-curl.html %} -The response only contains 5 hits, even though 8 were requested and 10 were available: +The response contains only 5 hits, even though 8 were requested and 10 were available:
@@ -298,14 +298,14 @@ The response only contains 5 hits, even though 8 were requested and 10 were avai ## Oversample, collapse, and truncate hits -The following is a more realistic example, where you use `oversample` to request many candidate documents, use `collapse` to remove documents that -duplicate some field (to get more diverse results), then use `truncate` to return the originally requested document count (to avoid returning a +The following is a more realistic example in which you use `oversample` to request many candidate documents, use `collapse` to remove documents that +duplicate a particular field (to get more diverse results), and then use `truncate` to return the originally requested document count (to avoid returning a large result payload from the cluster). ### Setup -Create many documents with a field that you'll use for collapsing: +Create many documents containing a field that you'll use for collapsing: ```json POST /_bulk @@ -332,7 +332,7 @@ POST /_bulk ``` {% include copy-curl.html %} -Create a pipeline that only collapses on the `color` field: +Create a pipeline that collapses only on the `color` field: ```json PUT /_search/pipeline/collapse_pipeline @@ -348,7 +348,7 @@ PUT /_search/pipeline/collapse_pipeline ``` {% include copy-curl.html %} -Create another pipeline that oversamples, collapses, then truncates results: +Create another pipeline that oversamples, collapses, and then truncates results: ```json PUT /_search/pipeline/oversampling_collapse_pipeline @@ -378,7 +378,7 @@ PUT /_search/pipeline/oversampling_collapse_pipeline ### Collapse without oversample -In this example, you request the top 3 documents before collapsing on the "color" field. Because the first two documents have the same `color`, the second one is discarded, +In this example, you request the top three documents before collapsing on the `color` field. 
Because the first two documents have the same `color`, the second one is discarded, and the request returns the first and third documents: ```json From 1c45c94f35c2541c48f65b088c728b2d1d4380bd Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 18 Dec 2023 09:30:16 -0500 Subject: [PATCH 08/13] Update _search-plugins/search-pipelines/collapse-processor.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _search-plugins/search-pipelines/collapse-processor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_search-plugins/search-pipelines/collapse-processor.md b/_search-plugins/search-pipelines/collapse-processor.md index 55ac8db873..2c987c2471 100644 --- a/_search-plugins/search-pipelines/collapse-processor.md +++ b/_search-plugins/search-pipelines/collapse-processor.md @@ -9,7 +9,7 @@ grand_parent: Search pipelines # Collapse processor -The `collapse` response processor discards hits that have the same value for some field as a previous document in the result set. +The `collapse` response processor discards hits that have the same value for a particular field as a previous document in the result set. This is similar to the `collapse` parameter that can be passed in a search request, but the response processor is applied to the response after fetching from all shards. The `collapse` response processor may be used in conjunction with the `rescore` search request parameter or may be applied after a reranking response processor. 
From a3b2925ad7f3f41ea56fb6406c085e99fc1fade7 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 18 Dec 2023 09:31:03 -0500 Subject: [PATCH 09/13] Update _search-plugins/search-pipelines/collapse-processor.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _search-plugins/search-pipelines/collapse-processor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_search-plugins/search-pipelines/collapse-processor.md b/_search-plugins/search-pipelines/collapse-processor.md index 2c987c2471..7ebaafb209 100644 --- a/_search-plugins/search-pipelines/collapse-processor.md +++ b/_search-plugins/search-pipelines/collapse-processor.md @@ -10,7 +10,7 @@ grand_parent: Search pipelines # Collapse processor The `collapse` response processor discards hits that have the same value for a particular field as a previous document in the result set. -This is similar to the `collapse` parameter that can be passed in a search request, but the response processor is applied to the +This is similar to passing the `collapse` parameter in a search request, but the response processor is applied to the response after fetching from all shards. The `collapse` response processor may be used in conjunction with the `rescore` search request parameter or may be applied after a reranking response processor. 
From a0e19bb830ce99b2f6931b42d60b7d8b00616bad Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 18 Dec 2023 09:32:00 -0500 Subject: [PATCH 10/13] Update _search-plugins/search-pipelines/truncate-hits-processor.md Co-authored-by: Nathan Bower Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _search-plugins/search-pipelines/truncate-hits-processor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_search-plugins/search-pipelines/truncate-hits-processor.md b/_search-plugins/search-pipelines/truncate-hits-processor.md index d1cb861b5c..bf153aa658 100644 --- a/_search-plugins/search-pipelines/truncate-hits-processor.md +++ b/_search-plugins/search-pipelines/truncate-hits-processor.md @@ -442,7 +442,7 @@ POST /my_index/_search?search_pipeline=collapse_pipeline ### Oversample, collapse, and truncate -Now, you will use the `oversampling_collapse_pipeline` that requests the top 9 documents (multiplying the size by 3), deduplicates by "color", +Now you will use the `oversampling_collapse_pipeline`, which requests the top 9 documents (multiplying the size by 3), deduplicates by `color`, and then returns the top 3 hits: ```json From b6644de3591c4d550ef99f939811db13ddf7b843 Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Mon, 18 Dec 2023 12:03:45 -0500 Subject: [PATCH 11/13] More editorial comments and link fixes Signed-off-by: Fanit Kolchina --- .../search-pipelines/oversample-processor.md | 2 +- .../search-pipelines/personalize-search-ranking.md | 2 +- _search-plugins/search-pipelines/search-processors.md | 2 +- .../search-pipelines/truncate-hits-processor.md | 10 ++++++---- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/_search-plugins/search-pipelines/oversample-processor.md b/_search-plugins/search-pipelines/oversample-processor.md index f14d2100e5..4b7d8483b8 100644 --- a/_search-plugins/search-pipelines/oversample-processor.md +++ 
b/_search-plugins/search-pipelines/oversample-processor.md @@ -11,7 +11,7 @@ grand_parent: Search pipelines The `oversample` request processor multiplies the `size` parameter of the search request by a specified `sample_factor` (>= 1.0), saving the original value in the `original_size` pipeline variable. The `oversample` processor is designed to work with the -[`truncate_hits` response processor]({{site.url}}{{site.baseurl}}/search-plugins/truncate-hits-processor/) but may be used on its own. +[`truncate_hits` response processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/) but may be used on its own. ## Request fields diff --git a/_search-plugins/search-pipelines/personalize-search-ranking.md b/_search-plugins/search-pipelines/personalize-search-ranking.md index b73ebb7476..c7a7dd8dde 100644 --- a/_search-plugins/search-pipelines/personalize-search-ranking.md +++ b/_search-plugins/search-pipelines/personalize-search-ranking.md @@ -1,7 +1,7 @@ --- layout: default title: Personalize search ranking -nav_order: 40 +nav_order: 18 has_children: false parent: Search processors grand_parent: Search pipelines diff --git a/_search-plugins/search-pipelines/search-processors.md b/_search-plugins/search-pipelines/search-processors.md index aa4934d2b5..e82dabc661 100644 --- a/_search-plugins/search-pipelines/search-processors.md +++ b/_search-plugins/search-pipelines/search-processors.md @@ -26,7 +26,7 @@ Processor | Description | Earliest available version [`filter_query`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor/) | Adds a filtering query that is used to filter requests. | 2.8 [`neural_query_enricher`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/neural-query-enricher/) | Sets a default model for neural search at the index or field level. 
| 2.11 [`script`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/script-processor/) | Adds a script that is run on newly indexed documents. | 2.8 -[`oversample`]({site.url}}{{site.baseurl}}/search-plugins/search-pipelines/oversample-processor/) | Increases the search request `size` parameter, storing the original value in the pipeline state. | 2.12 +[`oversample`]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/oversample-processor/) | Increases the search request `size` parameter, storing the original value in the pipeline state. | 2.12 ## Search response processors diff --git a/_search-plugins/search-pipelines/truncate-hits-processor.md b/_search-plugins/search-pipelines/truncate-hits-processor.md index bf153aa658..956bcf02b8 100644 --- a/_search-plugins/search-pipelines/truncate-hits-processor.md +++ b/_search-plugins/search-pipelines/truncate-hits-processor.md @@ -10,14 +10,16 @@ grand_parent: Search pipelines # Truncate hits processor The `truncate_hits` response processor discards returned search hits after a given hit count is reached. The `truncate_hits` processor is designed to work with the -[`oversample` request processor]({{site.url}}{{site.baseurl}}/search-plugins/oversample-processor/) but may be used on its own. +[`oversample` request processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/oversample-processor/) but may be used on its own. The `target_size` parameter (which specifies where to truncate) is optional. If it is not specified, then OpenSearch uses the `original_size` variable set by the `oversample` processor (if available). 
-A common usage pattern is to add the `oversample` processor to a request pipeline to fetch a larger set of results, then, in the response pipeline, to apply a -reranking processor (which may promote results from beyond the the originally requested top N) or the `collapse` processor (which may discard results after -deduplication), then apply the `truncate` processor to return (at most) the originally requested number of hits. +The following is a common usage pattern: + +1. Add the `oversample` processor to a request pipeline to fetch a larger set of results. +1. In the response pipeline, to apply a reranking processor (which may promote results from beyond the originally requested top N) or the `collapse` processor (which may discard results after deduplication). +1. Apply the `truncate` processor to return (at most) the originally requested number of hits. ## Request fields From 58d4b739033409c5ad27b3015b0fb28b89828b5d Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Mon, 18 Dec 2023 12:17:14 -0500 Subject: [PATCH 12/13] Add oversample and deduplicate to vale and format files nicely Signed-off-by: Fanit Kolchina --- .../styles/Vocab/OpenSearch/Words/accept.txt | 2 ++ .../search-pipelines/collapse-processor.md | 1 + .../search-pipelines/oversample-processor.md | 6 +++--- .../search-pipelines/truncate-hits-processor.md | 17 ++++++++--------- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt b/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt index 15997e71ef..cdfec52618 100644 --- a/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt +++ b/.github/vale/styles/Vocab/OpenSearch/Words/accept.txt @@ -17,6 +17,7 @@ Boolean [Dd]ashboarding [Dd]atagram [Dd]eallocate +[Dd]eduplicates? [Dd]eduplication [Dd]eserialize [Dd]eserialization @@ -71,6 +72,7 @@ Levenshtein [Mm]ultithreaded [Mm]ultivalued [Nn]amespace +[Oo]versamples? 
pebibyte [Pp]luggable [Pp]reconfigure diff --git a/_search-plugins/search-pipelines/collapse-processor.md b/_search-plugins/search-pipelines/collapse-processor.md index 7ebaafb209..cea0a15396 100644 --- a/_search-plugins/search-pipelines/collapse-processor.md +++ b/_search-plugins/search-pipelines/collapse-processor.md @@ -98,6 +98,7 @@ POST /my_index/_search?search_pipeline=collapse_pipeline Response {: .text-delta} + ```json { "took" : 2, diff --git a/_search-plugins/search-pipelines/oversample-processor.md b/_search-plugins/search-pipelines/oversample-processor.md index 4b7d8483b8..698d9572cf 100644 --- a/_search-plugins/search-pipelines/oversample-processor.md +++ b/_search-plugins/search-pipelines/oversample-processor.md @@ -9,9 +9,7 @@ grand_parent: Search pipelines # Oversample processor -The `oversample` request processor multiplies the `size` parameter of the search request by a specified `sample_factor` (>= 1.0), saving the -original value in the `original_size` pipeline variable. The `oversample` processor is designed to work with the -[`truncate_hits` response processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/) but may be used on its own. +The `oversample` request processor multiplies the `size` parameter of the search request by a specified `sample_factor` (>= 1.0), saving the original value in the `original_size` pipeline variable. The `oversample` processor is designed to work with the [`truncate_hits` response processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/truncate-hits-processor/) but may be used on its own. ## Request fields @@ -98,6 +96,7 @@ The response contains five hits: Response
{: .text-delta} + ```json { "took" : 3, @@ -188,6 +187,7 @@ The response contains 8 documents (5 * 1.5 = 7.5, rounded up to 8): Response
{: .text-delta} + ```json { "took" : 13, diff --git a/_search-plugins/search-pipelines/truncate-hits-processor.md b/_search-plugins/search-pipelines/truncate-hits-processor.md index 956bcf02b8..490abeb2b7 100644 --- a/_search-plugins/search-pipelines/truncate-hits-processor.md +++ b/_search-plugins/search-pipelines/truncate-hits-processor.md @@ -9,8 +9,7 @@ grand_parent: Search pipelines # Truncate hits processor -The `truncate_hits` response processor discards returned search hits after a given hit count is reached. The `truncate_hits` processor is designed to work with the -[`oversample` request processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/oversample-processor/) but may be used on its own. +The `truncate_hits` response processor discards returned search hits after a given hit count is reached. The `truncate_hits` processor is designed to work with the [`oversample` request processor]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/oversample-processor/) but may be used on its own. The `target_size` parameter (which specifies where to truncate) is optional. If it is not specified, then OpenSearch uses the `original_size` variable set by the `oversample` processor (if available). @@ -105,6 +104,7 @@ The response contains eight hits: Response
{: .text-delta} + ```json { "took" : 13, @@ -225,6 +225,7 @@ The response contains only 5 hits, even though 8 were requested and 10 were avai Response {: .text-delta} + ```json { "took" : 3, @@ -300,9 +301,7 @@ The response contains only 5 hits, even though 8 were requested and 10 were avai ## Oversample, collapse, and truncate hits -The following is a more realistic example in which you use `oversample` to request many candidate documents, use `collapse` to remove documents that -duplicate a particular field (to get more diverse results), and then use `truncate` to return the originally requested document count (to avoid returning a -large result payload from the cluster). +The following is a more realistic example in which you use `oversample` to request many candidate documents, use `collapse` to remove documents that duplicate a particular field (to get more diverse results), and then use `truncate` to return the originally requested document count (to avoid returning a large result payload from the cluster). ### Setup @@ -380,8 +379,7 @@ PUT /_search/pipeline/oversampling_collapse_pipeline ### Collapse without oversample -In this example, you request the top three documents before collapsing on the `color` field. Because the first two documents have the same `color`, the second one is discarded, -and the request returns the first and third documents: +In this example, you request the top three documents before collapsing on the `color` field. 
Because the first two documents have the same `color`, the second one is discarded, and the request returns the first and third documents: ```json POST /my_index/_search?search_pipeline=collapse_pipeline @@ -397,6 +395,7 @@ POST /my_index/_search?search_pipeline=collapse_pipeline Response {: .text-delta} + ```json { "took" : 2, @@ -444,8 +443,7 @@ POST /my_index/_search?search_pipeline=collapse_pipeline ### Oversample, collapse, and truncate -Now you will use the `oversampling_collapse_pipeline`, which requests the top 9 documents (multiplying the size by 3), deduplicates by `color`, and -then returns the top 3 hits: +Now you will use the `oversampling_collapse_pipeline`, which requests the top 9 documents (multiplying the size by 3), deduplicates by `color`, and then returns the top 3 hits: ```json POST /my_index/_search?search_pipeline=oversampling_collapse_pipeline @@ -461,6 +459,7 @@ POST /my_index/_search?search_pipeline=oversampling_collapse_pipeline Response {: .text-delta} + ```json { "took" : 2, From 6511d285f055d0ee5a5306a55b3c6691e62ad44c Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 18 Dec 2023 12:18:25 -0500 Subject: [PATCH 13/13] Update _search-plugins/search-pipelines/truncate-hits-processor.md Co-authored-by: Nathan Bower Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _search-plugins/search-pipelines/truncate-hits-processor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_search-plugins/search-pipelines/truncate-hits-processor.md b/_search-plugins/search-pipelines/truncate-hits-processor.md index 490abeb2b7..871879efe3 100644 --- a/_search-plugins/search-pipelines/truncate-hits-processor.md +++ b/_search-plugins/search-pipelines/truncate-hits-processor.md @@ -17,7 +17,7 @@ The `target_size` parameter (which specifies where to truncate) is optional. If The following is a common usage pattern: 1. 
Add the `oversample` processor to a request pipeline to fetch a larger set of results. -1. In the response pipeline, to apply a reranking processor (which may promote results from beyond the originally requested top N) or the `collapse` processor (which may discard results after deduplication). +1. In the response pipeline, apply a reranking processor (which may promote results from beyond the originally requested top N) or the `collapse` processor (which may discard results after deduplication). 1. Apply the `truncate` processor to return (at most) the originally requested number of hits. ## Request fields