From 4c1e7825e8c31378aef8dcefb9f33869db591f3e Mon Sep 17 00:00:00 2001 From: bowenlan-amzn Date: Fri, 13 Sep 2024 05:58:20 -0700 Subject: [PATCH] Terms query can accept encoded terms input as bitmap (#8133) * draft Signed-off-by: bowenlan-amzn * Doc review Signed-off-by: Fanit Kolchina * Update _query-dsl/term/terms.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Nathan Bower Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --------- Signed-off-by: bowenlan-amzn Signed-off-by: Fanit Kolchina Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Fanit Kolchina Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Nathan Bower --- _query-dsl/term/terms.md | 134 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/_query-dsl/term/terms.md b/_query-dsl/term/terms.md index 42c74c0436..7dac6a9619 100644 --- a/_query-dsl/term/terms.md +++ b/_query-dsl/term/terms.md @@ -39,6 +39,7 @@ Parameter | Data type | Description :--- | :--- | :--- `` | String | The field in which to search. A document is returned in the results only if its field value exactly matches at least one term, with the correct spacing and capitalization. `boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. +`value_type` | String | Specifies the types of values used for filtering. Valid values are `default` and `bitmap`. If omitted, the value defaults to `default`. ## Terms lookup @@ -250,3 +251,136 @@ Parameter | Data type | Description `path` | String | The name of the field from which to fetch field values. Specify nested fields using dot path notation. Required. `routing` | String | Custom routing value of the document from which to fetch field values. Optional. Required if a custom routing value was provided when the document was indexed. `boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. + +## Bitmap filtering +**Introduced 2.17** +{: .label .label-purple } + +The `terms` query can filter for multiple terms simultaneously. However, when the number of terms in the input filter increases to a large value (around 10,000), the resulting network and memory overhead can become significant, making the query inefficient. In such cases, consider encoding your large terms filter using a [roaring bitmap](https://github.com/RoaringBitmap/RoaringBitmap) for more efficient filtering. + +The following example assumes that you have two indexes: a `products` index, which contains all the products sold by a company, and a `customers` index, which stores filters representing customers who own specific products. + +First, create a `products` index and map `product_id` as a `keyword`: + +```json +PUT /products +{ + "mappings": { + "properties": { + "product_id": { "type": "keyword" } + } + } +} +``` +{% include copy-curl.html %} + +Next, index three documents that correspond to products: + +```json +PUT students/_doc/1 +{ + "name": "Product 1", + "product_id" : "111" +} +``` +{% include copy-curl.html %} + +```json +PUT students/_doc/2 +{ + "name": "Product 2", + "product_id" : "222" +} +``` +{% include copy-curl.html %} + +```json +PUT students/_doc/3 +{ + "name": "Product 3", + "product_id" : "333" +} +``` +{% include copy-curl.html %} + +To store customer bitmap filters, you'll create a `customer_filter` [binary field](https://opensearch.org/docs/latest/field-types/supported-field-types/binary/) in the `customers` index. Specify `store` as `true` to store the field: + +```json +PUT /customers +{ + "mappings": { + "properties": { + "customer_filter": { + "type": "binary", + "store": true + } + } + } +} +``` +{% include copy-curl.html %} + +For each customer, you need to generate a bitmap that represents the product IDs of the products the customer owns. This bitmap effectively encodes the filter criteria for that customer. In this example, you'll create a `terms` filter for a customer whose ID is `customer123` and who owns products `111`, `222`, and `333`. + +To encode a `terms` filter for the customer, first create a roaring bitmap for the filter. This example creates a bitmap using the [PyRoaringBitMap] library, so first run `pip install pyroaring` to install the library. Then serialize the bitmap and encode it using a [Base64](https://en.wikipedia.org/wiki/Base64) encoding scheme: + +```py +from pyroaring import BitMap +import base64 + +# Create a bitmap, serialize it into a byte string, and encode into Base64 +bm = BitMap([111, 222, 333]) # product ids owned by a customer +encoded = base64.b64encode(BitMap.serialize(bm)) + +# Convert the Base64-encoded bytes to a string for storage or transmission +encoded_bm_str = encoded.decode('utf-8') + +# Print the encoded bitmap +print(f"Encoded Bitmap: {encoded_bm_str}") +``` +{% include copy.html %} + +Next, index the customer filter into the `customers` index. The document ID for the filter is the same as the ID for the corresponding customer (in this example, `customer123`). The `customer_filter` field contains the bitmap you generated for this customer: + +```json +POST customers/_doc/customer123 +{ + "customer_filter": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ==" +} +``` +{% include copy-curl.html %} + +Now you can run a `terms` query on the `products` index to look up a specific customer in the `customers` index. Because you're looking up a stored field instead of `_source`, set `store` to `true`. In the `value_type` field, specify the data type of the `terms` input as `bitmap`: + +```json +POST /products/_search +{ + "query": { + "terms": { + "product_id": { + "index": "customers", + "id": "customer123", + "path": "customer_filter", + "store": true + }, + "value_type": "bitmap" + } + } +} +``` +{% include copy-curl.html %} + +You can also directly pass the bitmap to the `terms` query. In this example, the `product_id` field contains the customer filter bitmap for the customer whose ID is `customer123`: + +```json +POST /products/_search +{ + "query": { + "terms": { + "product_id": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ==", + "value_type": "bitmap" + } + } +} +``` +{% include copy-curl.html %} \ No newline at end of file