add examples for text match

Signed-off-by: aoiasd <[email protected]>
milvus-io · Nov 26, 2024 · 98ec398 · 98ec398
1 parent 73d0394
commit 98ec398
Show file tree

Hide file tree

Showing 2 changed files with 314 additions and 0 deletions.
diff --git a/examples/orm/hello_text_match.py b/examples/orm/hello_text_match.py
@@ -0,0 +1,168 @@
+# hello_text_match.py demonstrates how to insert raw text documents into Milvus and
+# retrieve documents containing specific terms with a TEXT_MATCH expression.
+# 1. connect to Milvus
+# 2. create collection
+# 3. insert data
+# 4. create index
+# 5. query and filtered search on entities with text match
+# 6. delete entities by text match, then 7. drop collection
+import time
+import numpy as np
+
+
+
+from pymilvus import (
+    connections,
+    utility,
+    FieldSchema, CollectionSchema, Function, DataType, FunctionType,
+    Collection,
+)
+
+fmt = "\n=== {:30} ===\n"  # banner printed before each step of the example
+search_latency_fmt = "search latency = {:.4f}s"
+dim = 8  # dimension of the vectors stored in the "embeddings" field
+
+#################################################################################
+# 1. connect to Milvus
+# Register the connection alias `default` for the Milvus server at `localhost:19530`
+print(fmt.format("start connecting to Milvus"))
+connections.connect("default", host="localhost", port="19530")
+
+has = utility.has_collection("hello_text_match")  # reported only; nothing is dropped here
+print(f"Does collection hello_text_match exist in Milvus: {has}")
+
+#################################################################################
+# 2. create collection
+# We're going to create a collection with 3 explicit fields.
+# +-+------------+------------+----------------------+------------------------------+
+# | | field name | field type |   other attributes   |       field description      |
+# +-+------------+------------+----------------------+------------------------------+
+# |1|    "id"    |   INT64    |    is_primary=True   |      "primary field"         |
+# | |            |            |     auto_id=True     |                              |
+# +-+------------+------------+----------------------+------------------------------+
+# |2| "document" | VarChar    | enable_analyzer=True |     "raw text document"      |
+# | |            |            |   enable_match=True  |                              |
+# +-+------------+------------+----------------------+------------------------------+
+# |3|"embeddings"| FloatVector|        dim=8         |  "float vector with dim 8"   |
+# +-+------------+------------+----------------------+------------------------------+
+fields = [
+    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
+    FieldSchema(name="document", dtype=DataType.VARCHAR, max_length=1000, enable_analyzer=True, enable_match=True),
+    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
+]
+
+
+schema = CollectionSchema(fields, "hello_text_match demo")
+
+print(fmt.format("Create collection `hello_text_match`"))
+hello_text_match = Collection("hello_text_match", schema, consistency_level="Strong")
+
+################################################################################
+# 3. insert data
+# We are going to insert 6 rows of data into `hello_text_match`
+# Data to be inserted must be organized in fields (one list per field); `id` is omitted
+# because the schema declares auto_id=True.
+# The insert() method returns:
+# - either automatically generated primary keys by Milvus if auto_id=True in the schema;
+# - or the existing primary key field from the entities if auto_id=False in the schema.
+
+print(fmt.format("Start inserting entities"))
+
+rng = np.random.default_rng(seed=19530)
+num_entities = 6
+keywords = ["milvus", "match", "search", "query", "analyzer", "tokenizer"]
+doc_offset = hello_text_match.num_entities  # read once, not once per generated document
+entities = [
+    [f"This is a test document {i + doc_offset} with keywords: {keywords[i]}" for i in range(num_entities)],
+    rng.random((num_entities, dim), np.float32)
+]
+
+insert_result = hello_text_match.insert(entities)
+ids = insert_result.primary_keys
+
+hello_text_match.flush()
+print(f"Number of entities in Milvus: {hello_text_match.num_entities}")  # check the num_entities
+
+################################################################################
+# 4. create index
+# We are going to create a vector index on the `embeddings` field of the hello_text_match collection
+print(fmt.format("Start Creating index AUTOINDEX"))
+index = {
+    "index_type": "AUTOINDEX",
+    "metric_type": "IP",
+}
+
+hello_text_match.create_index("embeddings", index)
+################################################################################
+# 5. query and scalar filtering search with text match
+# After data were inserted into Milvus and indexed, you can perform:
+# - query with text match expression
+# - search data with text match filter
+
+# Before conducting a search or a query, you need to load the data in `hello_text_match` into memory.
+print(fmt.format("Start loading"))
+hello_text_match.load()
+
+# -----------------------------------------------------------------------------
+# query with a text match expression on a single keyword
+expr = f"TEXT_MATCH(document, '{keywords[0]}')"
+print(fmt.format(f"Start querying with `{expr}`"))
+
+start_time = time.time()
+result = hello_text_match.query(expr=expr, output_fields=["document"])
+end_time = time.time()
+
+print(f"query result:\n-{result[0]}")
+print(search_latency_fmt.format(end_time - start_time))
+
+# query with a text match expression on multiple keywords (a document matches if it contains any of them)
+expr = f"TEXT_MATCH(document, '{keywords[0]} {keywords[1]} {keywords[2]}')"
+print(fmt.format(f"Start querying with `{expr}`"))
+
+start_time = time.time()
+result = hello_text_match.query(expr=expr, output_fields=["document"])
+end_time = time.time()
+
+print(f"query result:\n-{result[0]}")
+print(search_latency_fmt.format(end_time - start_time))
+
+# -----------------------------------------------------------------------------
+# scalar filtering search with text match
+search_params = {
+    "metric_type": "IP",
+    "params": {},
+}
+expr = f"TEXT_MATCH(document, '{keywords[0]} {keywords[1]} {keywords[2]}')"
+print(fmt.format(f"Start filtered searching with `{expr}`"))
+
+vector_to_search = rng.random((1, dim), np.float32)  # built before the timer starts so latency covers only search()
+start_time = time.time()
+result = hello_text_match.search(vector_to_search, "embeddings", search_params, limit=3, expr=expr, output_fields=["document"])
+end_time = time.time()
+
+for hits in result:
+    for hit in hits:
+        print(f"\thit: {hit}, document field: {hit.entity.get('document')}")
+print(search_latency_fmt.format(end_time - start_time))
+
+###############################################################################
+# 6. delete entities by text match
+# You can delete entities with a boolean expression; here a TEXT_MATCH filter selects them.
+
+expr = f"TEXT_MATCH(document, '{keywords[4]}')"
+print(fmt.format(f"Start deleting with expr `{expr}`"))
+
+result = hello_text_match.query(expr=expr, output_fields=["document"])
+print(f"query before delete by expr=`{expr}` -> result: \n- {result[0]}\n")
+
+hello_text_match.delete(expr)
+
+result = hello_text_match.query(expr=expr, output_fields=["document"])
+print(f"query after delete by expr=`{expr}` -> result: {result}\n")
+
+
+###############################################################################
+# 7. drop collection
+# Finally, drop the hello_text_match collection
+print(fmt.format("Drop collection `hello_text_match`"))
+utility.drop_collection("hello_text_match")
diff --git a/examples/text_match.py b/examples/text_match.py
@@ -0,0 +1,146 @@
+# text_match.py demonstrates how to insert raw text documents into Milvus with MilvusClient
+# and retrieve documents containing specific terms with a TEXT_MATCH expression.
+# 1. connect to Milvus
+# 2. create collection
+# 3. insert data
+# 4. query and filtered search on entities with text match
+# 5. delete entities by text match, then 6. drop collection
+import time
+import numpy as np
+
+from pymilvus import (
+    MilvusClient,
+    Function,
+    FunctionType,
+    DataType,
+)
+
+fmt = "\n=== {:30} ===\n"  # banner printed before each step of the example
+collection_name = "text_match"
+dim = 8  # dimension of the vectors stored in the "embeddings" field
+
+#################################################################################
+# 1. connect to Milvus
+# Create a MilvusClient for the Milvus server at `http://localhost:19530`
+print(fmt.format("start connecting to Milvus"))
+milvus_client = MilvusClient("http://localhost:19530")
+
+has_collection = milvus_client.has_collection(collection_name, timeout=5)
+print(f"Does collection {collection_name} exist in Milvus: {has_collection}")
+if has_collection:
+    milvus_client.drop_collection(collection_name)  # start the example from a clean collection
+
+#################################################################################
+# 2. create collection
+# We're going to create a collection with 3 explicit fields.
+# +-+------------+------------+----------------------+------------------------------+
+# | | field name | field type |   other attributes   |       field description      |
+# +-+------------+------------+----------------------+------------------------------+
+# |1|    "id"    |   INT64    |    is_primary=True   |      "primary field"         |
+# | |            |            |     auto_id=False    |                              |
+# +-+------------+------------+----------------------+------------------------------+
+# |2| "document" | VarChar    | enable_analyzer=True |     "raw text document"      |
+# | |            |            |   enable_match=True  |                              |
+# +-+------------+------------+----------------------+------------------------------+
+# |3|"embeddings"| FloatVector|        dim=8         |  "float vector with dim 8"   |
+# +-+------------+------------+----------------------+------------------------------+
+
+schema = milvus_client.create_schema()
+schema.add_field("id", DataType.INT64, is_primary=True, auto_id=False)
+schema.add_field("document", DataType.VARCHAR, max_length=1000, enable_analyzer=True, enable_match=True)
+schema.add_field("embeddings", DataType.FLOAT_VECTOR, dim=dim)
+
+print(fmt.format(f"Create collection `{collection_name}`"))
+
+index_params = milvus_client.prepare_index_params()
+index_params.add_index(
+    "embeddings",
+    index_type="AUTOINDEX",
+    metric_type="IP",
+)
+# Passing index_params makes Milvus build the index and load the collection on creation.
+milvus_client.create_collection(collection_name, schema=schema, index_params=index_params, consistency_level="Strong")
+
+################################################################################
+# 3. insert data
+# We are going to insert 6 rows of data into the `text_match` collection
+# Data is organized row by row: each entity is a dict keyed by field name.
+#
+# `id` is supplied explicitly because the schema declares auto_id=False.
+# The insert() result is indexed like a dict; its `insert_count` entry is reported
+# below as the number of inserted entities.
+
+print(fmt.format("Start inserting entities"))
+
+rng = np.random.default_rng(seed=19530)
+num_entities = 6
+keywords = ["milvus", "match", "search", "query", "analyzer", "tokenizer"]
+embeddings = rng.random((num_entities, dim), np.float32)
+
+entities = [{
+        "id": i,
+        "document":f"This is a test document {i} with keywords: {keywords[i]}",
+        "embeddings": embeddings[i]
+    } for i in range(num_entities)
+]
+
+insert_result = milvus_client.insert(collection_name, entities)
+print(f"Number of insert entities in Milvus: {insert_result['insert_count']}")  # report the inserted row count
+milvus_client.flush(collection_name)
+
+################################################################################
+# 4. query and scalar filtering search with text match
+# After data were inserted into Milvus and indexed, you can perform:
+# - query with text match expression
+# - search data with text match filter
+
+# -----------------------------------------------------------------------------
+# query with a text match filter on a single keyword
+filter_expr = f"TEXT_MATCH(document, '{keywords[0]}')"  # not named `filter`, to avoid shadowing the builtin
+print(fmt.format(f"Start querying with `{filter_expr}`"))
+
+result = milvus_client.query(collection_name, filter_expr, output_fields=["document"])
+print(f"query result:\n-{result}")
+
+# query with a text match filter on multiple keywords (a document matches if it contains any of them)
+filter_expr = f"TEXT_MATCH(document, '{keywords[0]} {keywords[1]} {keywords[2]}')"
+print(fmt.format(f"Start querying with `{filter_expr}`"))
+
+result = milvus_client.query(collection_name, filter_expr, output_fields=["document"])
+print(f"query result:\n-{result}")
+
+# -----------------------------------------------------------------------------
+# scalar filtering search with text match
+search_params = {
+    "metric_type": "IP",
+    "params": {},
+}
+filter_expr = f"TEXT_MATCH(document, '{keywords[0]} {keywords[1]} {keywords[2]}')"
+print(fmt.format(f"Start filtered searching with `{filter_expr}`"))
+
+vector_to_search = rng.random((1, dim), np.float32)
+result = milvus_client.search(collection_name, vector_to_search, filter=filter_expr, anns_field="embeddings", search_params=search_params, limit=3, output_fields=["document"])
+
+print(result)
+
+###############################################################################
+# 5. delete entities by text match filter
+# You can delete entities with a boolean expression; here a TEXT_MATCH filter selects them.
+
+filter_expr = f"TEXT_MATCH(document, '{keywords[4]}')"  # not named `filter`, to avoid shadowing the builtin
+print(fmt.format(f"Start deleting with expr `{filter_expr}`"))
+
+result = milvus_client.query(collection_name, filter_expr, output_fields=["document"])
+print(f"query before delete by expr=`{filter_expr}` -> result: \n- {result}\n")
+
+milvus_client.delete(collection_name, filter=filter_expr)
+
+result = milvus_client.query(collection_name, filter_expr, output_fields=["document"])
+print(f"query after delete by expr=`{filter_expr}` -> result: {result}\n")
+
+
+###############################################################################
+# 6. drop collection
+# Finally, drop the `text_match` collection
+print(fmt.format(f"Drop collection `{collection_name}`"))
+milvus_client.drop_collection(collection_name)