test: add bulk insert case for text match feature (milvus-io#36398)
/kind improvement

---------

Signed-off-by: zhuwenxing <[email protected]>
zhuwenxing authored Sep 27, 2024
1 parent ff4c62e commit 7c2cb8c
Showing 3 changed files with 65 additions and 10 deletions.
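
For context before the diff: the text match feature lets a query filter rows whose text field contains a given token, via a TextMatch expression against a string field created with enable_match=True. A minimal sketch of the query pattern the new tests exercise, assuming pymilvus and a running Milvus instance; the collection and field names are illustrative, not part of this commit:

    from pymilvus import Collection

    collection = Collection("bulk_insert_demo")  # hypothetical collection, loaded elsewhere
    # Every generated text value in these tests embeds the token " milvus ",
    # so matching on 'milvus' should return exactly the non-null rows.
    res = collection.query(
        expr="TextMatch(text_scalar, 'milvus')",
        output_fields=["text_scalar"],
    )
    print(len(res))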
45 changes: 41 additions & 4 deletions tests/python_client/common/bulk_insert_data.py
@@ -34,6 +34,7 @@ class DataField:
     fp16_vec_field = "float16_vec_field"
     int_field = "int_scalar"
     string_field = "string_scalar"
+    text_field = "text_scalar"
     bool_field = "bool_scalar"
     float_field = "float_scalar"
     double_field = "double_scalar"
@@ -403,6 +404,23 @@ def gen_string_in_numpy_file(dir, data_field, rows, start=0, force=False):
     return file_name


+def gen_text_in_numpy_file(dir, data_field, rows, start=0, force=False, nullable=False):
+    file_name = f"{data_field}.npy"
+    file = f"{dir}/{file_name}"
+    if not os.path.exists(file) or force:
+        # non vector columns
+        data = []
+        if rows > 0:
+            data = [fake.text() + " milvus " for i in range(start, rows+start)]
+            if nullable:
+                data = [None if random.random() < 0.5 else fake.text() + " milvus " for _ in range(rows)]
+        arr = np.array(data)
+        # print(f"file_name: {file_name} data type: {arr.dtype}")
+        log.info(f"file_name: {file_name} data type: {arr.dtype} data shape: {arr.shape}")
+        np.save(file, arr)
+    return file_name
+
+
 def gen_dynamic_field_in_numpy_file(dir, rows, start=0, force=False):
     file_name = f"$meta.npy"
     file = f"{dir}/{file_name}"
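
A quick usage sketch for the new generator; the directory and row count are illustrative. With nullable=True the saved array has dtype=object because of the None entries, so reading it back requires allow_pickle=True (np.save pickles object arrays by default):

    file_name = gen_text_in_numpy_file(dir="/tmp/bulk_data", data_field="text_scalar",
                                       rows=100, nullable=True)
    arr = np.load(f"/tmp/bulk_data/{file_name}", allow_pickle=True)
    # roughly half the entries are None; the rest contain the " milvus " token
    print(arr.shape, sum(v is None for v in arr))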
@@ -553,6 +571,11 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
             data = [gen_unique_str(str(i)) for i in range(start, rows + start)]
         else:
             data = [None for _ in range(start, rows + start)]
+    elif data_field == DataField.text_field:
+        if not nullable:
+            data = [fake.text() + " milvus " for i in range(start, rows + start)]
+        else:
+            data = [None if random.random() < 0.5 else fake.text() + " milvus " for _ in range(start, rows + start)]
     elif data_field == DataField.bool_field:
         if not nullable:
             data = [random.choice([True, False]) for i in range(start, rows + start)]
@@ -573,31 +596,33 @@
                  for i in range(start, rows + start)])
         else:
             data = pd.Series(
-                [np.array(None) for i in range(start, rows + start)])
+                [None for i in range(start, rows + start)])
     elif data_field == DataField.array_int_field:
         if not nullable:
             data = pd.Series(
                 [np.array([random.randint(-999999, 9999999) for _ in range(array_length)], dtype=np.dtype("int64"))
                  for i in range(start, rows + start)])
         else:
             data = pd.Series(
-                [np.array(None) for i in range(start, rows + start)])
+                [None for i in range(start, rows + start)])
     elif data_field == DataField.array_float_field:
         if not nullable:
             data = pd.Series(
                 [np.array([random.random() for _ in range(array_length)], dtype=np.dtype("float32"))
                  for i in range(start, rows + start)])
         else:
             data = pd.Series(
-                [np.array(None) for i in range(start, rows + start)])
+                [None for i in range(start, rows + start)])
     elif data_field == DataField.array_string_field:
         if not nullable:
             data = pd.Series(
                 [np.array([gen_unique_str(str(i)) for _ in range(array_length)], dtype=np.dtype("str"))
                  for i in range(start, rows + start)])
         else:
             data = pd.Series(
-                [np.array(None) for i in range(start, rows + start)])
+                [None for i in range(start, rows + start)])
+    else:
+        raise Exception("unsupported field name")
     return data


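The switch from np.array(None) to plain None in the nullable array branches matters for serialization: np.array(None) is a zero-dimensional object ndarray, which pandas does not treat as a missing value, whereas a bare None becomes a proper null in the resulting Series. A small illustration of standard numpy/pandas behavior, not code from this commit:

    import numpy as np
    import pandas as pd

    s_old = pd.Series([np.array(None) for _ in range(3)])
    s_new = pd.Series([None for _ in range(3)])
    print(s_old.isna().sum())  # 0 -- 0-d object arrays are not recognized as nulls
    print(s_new.isna().sum())  # 3 -- plain None is a proper missing value
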
@@ -714,6 +739,14 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
             elif data_field == DataField.string_field:
                 if not nullable:
                     d[data_field] = gen_unique_str(str(r + start))
+            elif data_field == DataField.text_field:
+                if not nullable:
+                    d[data_field] = fake.text() + " milvus "
+                else:
+                    if random.random() < 0.5:
+                        d[data_field] = None
+                    else:
+                        d[data_field] = fake.text() + " milvus "
             elif data_field == DataField.bool_field:
                 if not nullable:
                     d[data_field] = random.choice([True, False])
@@ -746,6 +779,8 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
                     d[data_field] = [gen_unique_str(str(i)) for i in range(array_length)]
                 else:
                     d[data_field] = None
+            else:
+                raise Exception("unsupported field name")
         if enable_dynamic_field:
             d[str(r+start)] = r+start
             d["name"] = fake.name()
@@ -845,6 +880,8 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_size=None, file_num
                                                  vector_type=vector_type, rows=rows, dim=dim, force=force)
         elif data_field == DataField.string_field:  # string field for numpy not supported yet at 2022-10-17
             file_name = gen_string_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
+        elif data_field == DataField.text_field:
+            file_name = gen_text_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force, nullable=nullable)
         elif data_field == DataField.bool_field:
             file_name = gen_bool_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
         elif data_field == DataField.json_field:
26 changes: 22 additions & 4 deletions tests/python_client/testcases/test_bulk_insert.py
@@ -770,6 +770,7 @@ def test_bulk_insert_all_field_with_new_json_format(self, auto_id, dim, entities
             cf.gen_int64_field(name=df.int_field, nullable=nullable),
             cf.gen_float_field(name=df.float_field, nullable=nullable),
             cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
+            cf.gen_string_field(name=df.text_field, enable_match=True, nullable=nullable),
             cf.gen_json_field(name=df.json_field, nullable=nullable),
             cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
             cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
@@ -895,6 +896,11 @@ def test_bulk_insert_all_field_with_new_json_format(self, auto_id, dim, entities
             query_data = [r[expr_field] for r in res][:len(self.collection_wrap.partitions)]
             res, _ = self.collection_wrap.query(expr=f"{expr_field} in {query_data}", output_fields=[expr_field])
             assert len(res) == len(query_data)
+        res, _ = self.collection_wrap.query(expr=f"TextMatch({df.text_field}, 'milvus')", output_fields=[df.text_field])
+        if nullable is False:
+            assert len(res) == entities
+        else:
+            assert 0 < len(res) < entities
         if enable_partition_key:
             assert len(self.collection_wrap.partitions) > 1

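Because the nullable generators draw None with probability 0.5 per row, the expected number of TextMatch hits is about entities / 2, so the nullable branch only asserts the count is strictly between 0 and entities. A standalone sanity check of that expectation, not test code:

    import random

    entities = 3000
    nulls = sum(random.random() < 0.5 for _ in range(entities))
    matched = entities - nulls  # every non-null text value embeds " milvus "
    assert 0 < matched < entities
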
@@ -929,6 +935,7 @@ def test_bulk_insert_all_field_with_numpy(self, auto_id, dim, entities, enable_d
             cf.gen_int64_field(name=df.int_field, nullable=nullable),
             cf.gen_float_field(name=df.float_field),
             cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
+            cf.gen_string_field(name=df.text_field, enable_match=True, nullable=nullable),
             cf.gen_json_field(name=df.json_field),
             cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
             cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),
@@ -1042,6 +1049,11 @@ def test_bulk_insert_all_field_with_numpy(self, auto_id, dim, entities, enable_d
         query_data = [r[df.string_field] for r in res][:len(self.collection_wrap.partitions)]
         res, _ = self.collection_wrap.query(expr=f"{df.string_field} in {query_data}", output_fields=[df.string_field])
         assert len(res) == len(query_data)
+        res, _ = self.collection_wrap.query(expr=f"TextMatch({df.text_field}, 'milvus')", output_fields=[df.text_field])
+        if nullable is False:
+            assert len(res) == entities
+        else:
+            assert 0 < len(res) < entities
         if enable_partition_key:
             assert len(self.collection_wrap.partitions) > 1

@@ -1065,8 +1077,6 @@ def test_bulk_insert_all_field_with_parquet(self, auto_id, dim, entities, enable
         """
         if enable_dynamic_field is False and include_meta is True:
             pytest.skip("include_meta only works with enable_dynamic_field")
-        if nullable is True:
-            pytest.skip("issue #36252")
         if enable_partition_key is True and nullable is True:
             pytest.skip("partition key field not support nullable")
         float_vec_field_dim = dim
@@ -1078,6 +1088,7 @@
             cf.gen_int64_field(name=df.int_field, nullable=nullable),
             cf.gen_float_field(name=df.float_field, nullable=nullable),
             cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
+            cf.gen_string_field(name=df.text_field, enable_match=True, nullable=nullable),
             cf.gen_json_field(name=df.json_field, nullable=nullable),
             cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
             cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
@@ -1191,10 +1202,17 @@ def test_bulk_insert_all_field_with_parquet(self, auto_id, dim, entities, enable
             assert "address" in fields_from_search
         # query data
         res, _ = self.collection_wrap.query(expr=f"{df.string_field} >= '0'", output_fields=[df.string_field])
-        assert len(res) == entities
+        if nullable is False:
+            assert len(res) == entities
         query_data = [r[df.string_field] for r in res][:len(self.collection_wrap.partitions)]
         res, _ = self.collection_wrap.query(expr=f"{df.string_field} in {query_data}", output_fields=[df.string_field])
-        assert len(res) == len(query_data)
+        if nullable is False:
+            assert len(res) == len(query_data)
+        res, _ = self.collection_wrap.query(expr=f"TextMatch({df.text_field}, 'milvus')", output_fields=[df.text_field])
+        if nullable is False:
+            assert len(res) == entities
+        else:
+            assert 0 < len(res) < entities
         if enable_partition_key:
             assert len(self.collection_wrap.partitions) > 1

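Note the relaxed assertions in this hunk: when the string field is nullable, rows holding a null are not matched by the string_field >= '0' filter, so the exact-count checks only hold for non-nullable runs, and the TextMatch count check is widened the same way.
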
4 changes: 2 additions & 2 deletions tests/scripts/ci_e2e_4am.sh
@@ -116,10 +116,10 @@ fi
 if [[ "${MILVUS_HELM_RELEASE_NAME}" != *"msop"* ]]; then
     if [[ -n "${TEST_TIMEOUT:-}" ]]; then

-        timeout "${TEST_TIMEOUT}" pytest testcases --endpoint http://${MILVUS_SERVICE_NAME}:${MILVUS_SERVICE_PORT} --minio_host ${MINIO_SERVICE_NAME} -v -x -m BulkInsert -n 6 --timeout 180\
+        timeout "${TEST_TIMEOUT}" pytest testcases --endpoint http://${MILVUS_SERVICE_NAME}:${MILVUS_SERVICE_PORT} --minio_host ${MINIO_SERVICE_NAME} -v -x -m BulkInsert -n 6 --timeout 240\
         --html=${CI_LOG_PATH}/report_restful.html --self-contained-html
     else
-        pytest testcases --endpoint http://${MILVUS_SERVICE_NAME}:${MILVUS_SERVICE_PORT} --minio_host ${MINIO_SERVICE_NAME} -v -x -m BulkInsert -n 6 --timeout 180\
+        pytest testcases --endpoint http://${MILVUS_SERVICE_NAME}:${MILVUS_SERVICE_PORT} --minio_host ${MINIO_SERVICE_NAME} -v -x -m BulkInsert -n 6 --timeout 240\
         --html=${CI_LOG_PATH}/report_restful.html --self-contained-html
     fi
 fi
