test: Add nullable test cases for bulk writer (#37572)
issue: #36129

Signed-off-by: binbin lv <binbin.lv@zilliz.com>
binbinlv authored Nov 12, 2024
1 parent c1eccce commit 21b6802
Showing 2 changed files with 178 additions and 37 deletions.
80 changes: 43 additions & 37 deletions tests/python_client/testcases/test_bulk_insert.py
@@ -1477,7 +1477,8 @@ def test_bulk_insert_sparse_vector_with_json(self, auto_id, dim, entities, enabl
@pytest.mark.parametrize("entities", [1000]) # 1000
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@pytest.mark.parametrize("sparse_format", ["doc", "coo"])
def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format):
@pytest.mark.parametrize("nullable", [True, False])
def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format, nullable):
"""
collection schema 1: [pk, int64, float64, string, float_vector]
data file: vectors.npy and uid.npy,
@@ -1489,14 +1490,14 @@ def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enab
self._connect()
fields = [
cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
cf.gen_int64_field(name=df.int_field),
cf.gen_float_field(name=df.float_field),
cf.gen_string_field(name=df.string_field),
cf.gen_json_field(name=df.json_field),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field, nullable=nullable),
cf.gen_string_field(name=df.string_field, nullable=nullable),
cf.gen_json_field(name=df.json_field, nullable=nullable),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
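Note: the nullable=nullable arguments above correspond to pymilvus field schemas declared with nullable=True. A minimal standalone sketch, assuming a pymilvus release with nullable-field support; the field names here are illustrative, not the test suite's:

    from pymilvus import DataType, FieldSchema

    # A nullable scalar field: a row may carry None for this column.
    int_field = FieldSchema(name="int_field", dtype=DataType.INT64, nullable=True)

    # Array fields declare an element type and capacity; nullable applies to the
    # whole cell, i.e. the entire array value may be None.
    array_field = FieldSchema(name="array_int_field", dtype=DataType.ARRAY,
                              element_type=DataType.INT64, max_capacity=16,
                              nullable=True)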
@@ -1528,14 +1529,14 @@ def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enab
for i in range(entities):
row = {
df.pk_field: i,
df.int_field: 1,
df.float_field: 1.0,
df.string_field: "string",
df.json_field: json_value[i%len(json_value)],
df.array_int_field: [1, 2],
df.array_float_field: [1.0, 2.0],
df.array_string_field: ["string1", "string2"],
df.array_bool_field: [True, False],
df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
df.float_field: 1.0 if not (nullable and random.random() < 0.5) else None,
df.string_field: "string" if not (nullable and random.random() < 0.5) else None,
df.json_field: json_value[i%len(json_value)] if not (nullable and random.random() < 0.5) else None,
df.array_int_field: [1, 2] if not (nullable and random.random() < 0.5) else None,
df.array_float_field: [1.0, 2.0] if not (nullable and random.random() < 0.5) else None,
df.array_string_field: ["string1", "string2"] if not (nullable and random.random() < 0.5) else None,
df.array_bool_field: [True, False] if not (nullable and random.random() < 0.5) else None,
df.float_vec_field: cf.gen_vectors(1, dim)[0],
df.fp16_vec_field: cf.gen_vectors(1, dim, vector_data_type="FLOAT16_VECTOR")[0],
df.bf16_vec_field: cf.gen_vectors(1, dim, vector_data_type="BFLOAT16_VECTOR")[0],
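The conditionals above all follow one pattern: when the schema is nullable, each value is independently replaced by None with probability 0.5, so the generated import file mixes real values and nulls per field. A minimal sketch of that pattern as a helper; maybe_null is a hypothetical name, not a helper that exists in this test suite:

    import random

    def maybe_null(value, nullable, p=0.5):
        # With a nullable schema, return None with probability p; otherwise the value.
        return None if (nullable and random.random() < p) else value

    # e.g. the row entries above reduce to:
    #   df.int_field: maybe_null(1, nullable)
    #   df.array_int_field: maybe_null([1, 2], nullable)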
@@ -1606,13 +1607,17 @@ def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enab
@pytest.mark.parametrize("dim", [128]) # 128
@pytest.mark.parametrize("entities", [1000]) # 1000
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field):
@pytest.mark.parametrize("nullable", [True, False])
def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, nullable):
"""
"""
if nullable:
    pytest.skip("bulk writer does not support numpy files for scalar fields that contain None values")

self._connect()
fields = [
cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
cf.gen_int64_field(name=df.int_field),
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field),
cf.gen_string_field(name=df.string_field),
cf.gen_json_field(name=df.json_field),
@@ -1646,7 +1651,7 @@ def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, ena
for i in range(entities):
row = {
df.pk_field: i,
df.int_field: 1,
df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
df.float_field: 1.0,
df.string_field: "string",
df.json_field: json_value[i%len(json_value)],
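The skip above reflects a format limitation rather than a test gap: a fixed-width numpy dtype has no representation for a missing value, so an int64 column containing None cannot even be materialized as a .npy file. Standard numpy behavior illustrates this:

    import numpy as np

    np.array([1, 2, 3], dtype=np.int64)     # fine
    np.array([1, None, 3], dtype=np.int64)  # raises TypeError: None cannot be converted to an integer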
@@ -1720,20 +1725,21 @@ def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, ena
@pytest.mark.parametrize("entities", [1000]) # 1000
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@pytest.mark.parametrize("sparse_format", ["doc", "coo"])
def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format):
@pytest.mark.parametrize("nullable", [True, False])
def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format, nullable):
"""
"""
self._connect()
fields = [
cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
cf.gen_int64_field(name=df.int_field),
cf.gen_float_field(name=df.float_field),
cf.gen_string_field(name=df.string_field),
cf.gen_json_field(name=df.json_field),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field, nullable=nullable),
cf.gen_string_field(name=df.string_field, nullable=nullable),
cf.gen_json_field(name=df.json_field, nullable=nullable),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
@@ -1765,14 +1771,14 @@ def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, e
for i in range(entities):
row = {
df.pk_field: i,
df.int_field: 1,
df.float_field: 1.0,
df.string_field: "string",
df.json_field: json_value[i%len(json_value)],
df.array_int_field: [1, 2],
df.array_float_field: [1.0, 2.0],
df.array_string_field: ["string1", "string2"],
df.array_bool_field: [True, False],
df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
df.float_field: 1.0 if not (nullable and random.random() < 0.5) else None,
df.string_field: "string" if not (nullable and random.random() < 0.5) else None,
df.json_field: json_value[i%len(json_value)] if not (nullable and random.random() < 0.5) else None,
df.array_int_field: [1, 2] if not (nullable and random.random() < 0.5) else None,
df.array_float_field: [1.0, 2.0] if not (nullable and random.random() < 0.5) else None,
df.array_string_field: ["string1", "string2"] if not (nullable and random.random() < 0.5) else None,
df.array_bool_field: [True, False] if not (nullable and random.random() < 0.5) else None,
df.float_vec_field: cf.gen_vectors(1, dim)[0],
df.fp16_vec_field: cf.gen_vectors(1, dim, vector_data_type="FLOAT16_VECTOR")[0],
df.bf16_vec_field: cf.gen_vectors(1, dim, vector_data_type="BFLOAT16_VECTOR")[0],
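Unlike the numpy variant, this Parquet variant needs no nullable skip: the Parquet format carries nulls natively in a per-column validity bitmap. A minimal sketch, assuming pyarrow (a common way to produce Parquet bulk-import files):

    import pyarrow as pa
    import pyarrow.parquet as pq

    # None is stored as a null entry in the column's validity bitmap.
    table = pa.table({"int_field": pa.array([1, None, 3], type=pa.int64())})
    pq.write_table(table, "nullable.parquet")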
135 changes: 135 additions & 0 deletions tests/python_client/testcases/test_search.py
@@ -4809,6 +4809,73 @@ def test_binary_indexed_over_max_dim(self, dim):
check_task=CheckTasks.err_res,
check_items={"err_code": 999, "err_msg": f"invalid dimension: {dim}."})

@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="issue #37547")
def test_search_verify_expr_cache(self, is_flush):
"""
target: test that the expr plan cache is not reused across collections with conflicting schemas
method: 1. create collection with a float datatype field
2. search with expr "float == 0"
3. drop this collection
4. create collection with the same collection name and the same field name, but change the type
of the float field to varchar datatype
5. search with expr "float == 0" again
expected: 1. search successfully with limit(topK) for the first collection
2. report a type-mismatch error for the second collection with the same name
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, is_flush=is_flush)[0:5]
collection_name = collection_w.name
# 2. generate search data
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
# 3. search with expr "float == 0"
search_exp = f"{ct.default_float_field_name} == 0"
output_fields = [default_int64_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"output_fields": output_fields})
# 4. drop collection
collection_w.drop()
# 5. create the same collection name with same field name but varchar field type
int64_field = cf.gen_int64_field(is_primary=True)
string_field = cf.gen_string_field(ct.default_float_field_name)
json_field = cf.gen_json_field()
float_vector_field = cf.gen_float_vec_field()
fields = [int64_field, string_field, json_field, float_vector_field]
schema = cf.gen_collection_schema(fields)
collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
int64_values = pd.Series(data=[i for i in range(default_nb)])
string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
json_values = [{"number": i, "string": str(i), "bool": bool(i),
"list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
float_vec_values = cf.gen_vectors(default_nb, default_dim)
df = pd.DataFrame({
ct.default_int64_field_name: int64_values,
ct.default_float_field_name: string_values,
ct.default_json_field_name: json_values,
ct.default_float_vec_field_name: float_vec_values
})
collection_w.insert(df)
collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
collection_w.load()
collection_w.flush()
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.err_res,
check_items={"err_code": 1100,
"err_msg": "failed to create query plan: cannot parse expression: float == 0, "
"error: comparisons between VarChar and Int64 are not supported: "
"invalid parameter"})


class TestSearchBase(TestcaseBase):
@pytest.fixture(
@@ -13279,6 +13346,74 @@ def test_search_none_data_partial_load(self, is_flush, enable_dynamic_field, nul
"limit": default_limit,
"output_fields": output_fields})

@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="issue #37547")
def test_search_none_data_expr_cache(self, is_flush):
"""
target: test the expr plan cache with None data when a collection is recreated with a conflicting schema
method: 1. create collection with a nullable float datatype field
2. search with expr "float == 0"
3. drop this collection
4. create collection with the same collection name and the same field name, but change the type
of the nullable field to varchar datatype
5. search with expr "float == 0" again
expected: 1. search successfully with limit(topK) for the first collection
2. report a type-mismatch error for the second collection with the same name
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, is_flush=is_flush,
nullable_fields={ct.default_float_field_name: 0.5})[0:5]
collection_name = collection_w.name
# 2. generate search data
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
# 3. search with expr "float == 0" on the nullable field
search_exp = f"{ct.default_float_field_name} == 0"
output_fields = [default_int64_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"output_fields": output_fields})
# 4. drop collection
collection_w.drop()
# 5. create the same collection name with same field name but varchar field type
int64_field = cf.gen_int64_field(is_primary=True)
string_field = cf.gen_string_field(ct.default_float_field_name, nullable=True)
json_field = cf.gen_json_field()
float_vector_field = cf.gen_float_vec_field()
fields = [int64_field, string_field, json_field, float_vector_field]
schema = cf.gen_collection_schema(fields)
collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
int64_values = pd.Series(data=[i for i in range(default_nb)])
string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
json_values = [{"number": i, "string": str(i), "bool": bool(i),
"list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
float_vec_values = cf.gen_vectors(default_nb, default_dim)
df = pd.DataFrame({
ct.default_int64_field_name: int64_values,
ct.default_float_field_name: None,
ct.default_json_field_name: json_values,
ct.default_float_vec_field_name: float_vec_values
})
collection_w.insert(df)
collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
collection_w.load()
collection_w.flush()
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.err_res,
check_items={"err_code": 1100,
"err_msg": "failed to create query plan: cannot parse expression: float == 0, "
"error: comparisons between VarChar and Int64 are not supported: "
"invalid parameter"})


class TestSearchWithTextMatchFilter(TestcaseBase):
"""
