test: Add nullable test cases for bulk writer (#37572)
issue: #36129

Signed-off-by: binbin lv <binbin.lv@zilliz.com>
binbinlv authored Nov 12, 2024
1 parent c1eccce commit 21b6802
Showing 2 changed files with 178 additions and 37 deletions.
80 changes: 43 additions & 37 deletions tests/python_client/testcases/test_bulk_insert.py
@@ -1477,7 +1477,8 @@ def test_bulk_insert_sparse_vector_with_json(self, auto_id, dim, entities, enabl
@pytest.mark.parametrize("entities", [1000]) # 1000
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@pytest.mark.parametrize("sparse_format", ["doc", "coo"])
def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format):
@pytest.mark.parametrize("nullable", [True, False])
def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format, nullable):
"""
collection schema 1: [pk, int64, float64, string, float_vector]
data file: vectors.npy and uid.npy,
@@ -1489,14 +1490,14 @@ def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enab
self._connect()
fields = [
cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
cf.gen_int64_field(name=df.int_field),
cf.gen_float_field(name=df.float_field),
cf.gen_string_field(name=df.string_field),
cf.gen_json_field(name=df.json_field),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field, nullable=nullable),
cf.gen_string_field(name=df.string_field, nullable=nullable),
cf.gen_json_field(name=df.json_field, nullable=nullable),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
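Note: the nullable=nullable arguments above correspond to pymilvus field schemas declared with nullable=True. A minimal standalone sketch, assuming a pymilvus release with nullable-field support; the field names here are illustrative, not the test suite's:

    from pymilvus import DataType, FieldSchema

    # A nullable scalar field: a row may carry None for this column.
    int_field = FieldSchema(name="int_field", dtype=DataType.INT64, nullable=True)

    # Array fields declare an element type and capacity; nullable applies to the
    # whole cell, i.e. the entire array value may be None.
    array_field = FieldSchema(name="array_int_field", dtype=DataType.ARRAY,
                              element_type=DataType.INT64, max_capacity=16,
                              nullable=True)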
@@ -1528,14 +1529,14 @@ def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enab
for i in range(entities):
row = {
df.pk_field: i,
df.int_field: 1,
df.float_field: 1.0,
df.string_field: "string",
df.json_field: json_value[i%len(json_value)],
df.array_int_field: [1, 2],
df.array_float_field: [1.0, 2.0],
df.array_string_field: ["string1", "string2"],
df.array_bool_field: [True, False],
df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
df.float_field: 1.0 if not (nullable and random.random() < 0.5) else None,
df.string_field: "string" if not (nullable and random.random() < 0.5) else None,
df.json_field: json_value[i%len(json_value)] if not (nullable and random.random() < 0.5) else None,
df.array_int_field: [1, 2] if not (nullable and random.random() < 0.5) else None,
df.array_float_field: [1.0, 2.0] if not (nullable and random.random() < 0.5) else None,
df.array_string_field: ["string1", "string2"] if not (nullable and random.random() < 0.5) else None,
df.array_bool_field: [True, False] if not (nullable and random.random() < 0.5) else None,
df.float_vec_field: cf.gen_vectors(1, dim)[0],
df.fp16_vec_field: cf.gen_vectors(1, dim, vector_data_type="FLOAT16_VECTOR")[0],
df.bf16_vec_field: cf.gen_vectors(1, dim, vector_data_type="BFLOAT16_VECTOR")[0],
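The conditionals above all follow one pattern: when the schema is nullable, each value is independently replaced by None with probability 0.5, so the generated import file mixes real values and nulls per field. A minimal sketch of that pattern as a helper; maybe_null is a hypothetical name, not a helper that exists in this test suite:

    import random

    def maybe_null(value, nullable, p=0.5):
        # With a nullable schema, return None with probability p; otherwise the value.
        return None if (nullable and random.random() < p) else value

    # e.g. the row entries above reduce to:
    #   df.int_field: maybe_null(1, nullable)
    #   df.array_int_field: maybe_null([1, 2], nullable)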
@@ -1606,13 +1607,17 @@ def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enab
@pytest.mark.parametrize("dim", [128]) # 128
@pytest.mark.parametrize("entities", [1000]) # 1000
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field):
@pytest.mark.parametrize("nullable", [True, False])
def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, nullable):
"""
"""
if nullable:
    pytest.skip("bulk writer does not support numpy files for scalar fields that contain None values")

self._connect()
fields = [
cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
cf.gen_int64_field(name=df.int_field),
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field),
cf.gen_string_field(name=df.string_field),
cf.gen_json_field(name=df.json_field),
@@ -1646,7 +1651,7 @@ def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, ena
for i in range(entities):
row = {
df.pk_field: i,
df.int_field: 1,
df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
df.float_field: 1.0,
df.string_field: "string",
df.json_field: json_value[i%len(json_value)],
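The skip above reflects a format limitation rather than a test gap: a fixed-width numpy dtype has no representation for a missing value, so an int64 column containing None cannot even be materialized as a .npy file. Standard numpy behavior illustrates this:

    import numpy as np

    np.array([1, 2, 3], dtype=np.int64)     # fine
    np.array([1, None, 3], dtype=np.int64)  # raises TypeError: None cannot be converted to an integer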
@@ -1720,20 +1725,21 @@ def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, ena
@pytest.mark.parametrize("entities", [1000]) # 1000
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@pytest.mark.parametrize("sparse_format", ["doc", "coo"])
def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format):
@pytest.mark.parametrize("nullable", [True, False])
def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format, nullable):
"""
"""
self._connect()
fields = [
cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
cf.gen_int64_field(name=df.int_field),
cf.gen_float_field(name=df.float_field),
cf.gen_string_field(name=df.string_field),
cf.gen_json_field(name=df.json_field),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field, nullable=nullable),
cf.gen_string_field(name=df.string_field, nullable=nullable),
cf.gen_json_field(name=df.json_field, nullable=nullable),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
@@ -1765,14 +1771,14 @@ def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, e
for i in range(entities):
row = {
df.pk_field: i,
df.int_field: 1,
df.float_field: 1.0,
df.string_field: "string",
df.json_field: json_value[i%len(json_value)],
df.array_int_field: [1, 2],
df.array_float_field: [1.0, 2.0],
df.array_string_field: ["string1", "string2"],
df.array_bool_field: [True, False],
df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
df.float_field: 1.0 if not (nullable and random.random() < 0.5) else None,
df.string_field: "string" if not (nullable and random.random() < 0.5) else None,
df.json_field: json_value[i%len(json_value)] if not (nullable and random.random() < 0.5) else None,
df.array_int_field: [1, 2] if not (nullable and random.random() < 0.5) else None,
df.array_float_field: [1.0, 2.0] if not (nullable and random.random() < 0.5) else None,
df.array_string_field: ["string1", "string2"] if not (nullable and random.random() < 0.5) else None,
df.array_bool_field: [True, False] if not (nullable and random.random() < 0.5) else None,
df.float_vec_field: cf.gen_vectors(1, dim)[0],
df.fp16_vec_field: cf.gen_vectors(1, dim, vector_data_type="FLOAT16_VECTOR")[0],
df.bf16_vec_field: cf.gen_vectors(1, dim, vector_data_type="BFLOAT16_VECTOR")[0],
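Unlike the numpy variant, this Parquet variant needs no nullable skip: the Parquet format carries nulls natively in a per-column validity bitmap. A minimal sketch, assuming pyarrow (a common way to produce Parquet bulk-import files):

    import pyarrow as pa
    import pyarrow.parquet as pq

    # None is stored as a null entry in the column's validity bitmap.
    table = pa.table({"int_field": pa.array([1, None, 3], type=pa.int64())})
    pq.write_table(table, "nullable.parquet")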
135 changes: 135 additions & 0 deletions tests/python_client/testcases/test_search.py
@@ -4809,6 +4809,73 @@ def test_binary_indexed_over_max_dim(self, dim):
check_task=CheckTasks.err_res,
check_items={"err_code": 999, "err_msg": f"invalid dimension: {dim}."})

@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="issue #37547")
def test_search_verify_expr_cache(self, is_flush):
"""
target: test that the expr plan cache is not reused across collections with conflicting schemas
method: 1. create collection with a float datatype field
2. search with expr "float == 0"
3. drop this collection
4. create collection with the same collection name and the same field name, but change the type
of the float field to varchar datatype
5. search with expr "float == 0" again
expected: 1. search successfully with limit(topK) for the first collection
2. report a type-mismatch error for the second collection with the same name
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, is_flush=is_flush)[0:5]
collection_name = collection_w.name
# 2. generate search data
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
# 3. search with expr "float == 0"
search_exp = f"{ct.default_float_field_name} == 0"
output_fields = [default_int64_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"output_fields": output_fields})
# 4. drop collection
collection_w.drop()
# 5. create the same collection name with same field name but varchar field type
int64_field = cf.gen_int64_field(is_primary=True)
string_field = cf.gen_string_field(ct.default_float_field_name)
json_field = cf.gen_json_field()
float_vector_field = cf.gen_float_vec_field()
fields = [int64_field, string_field, json_field, float_vector_field]
schema = cf.gen_collection_schema(fields)
collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
int64_values = pd.Series(data=[i for i in range(default_nb)])
string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
json_values = [{"number": i, "string": str(i), "bool": bool(i),
"list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
float_vec_values = cf.gen_vectors(default_nb, default_dim)
df = pd.DataFrame({
ct.default_int64_field_name: int64_values,
ct.default_float_field_name: string_values,
ct.default_json_field_name: json_values,
ct.default_float_vec_field_name: float_vec_values
})
collection_w.insert(df)
collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
collection_w.load()
collection_w.flush()
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.err_res,
check_items={"err_code": 1100,
"err_msg": "failed to create query plan: cannot parse expression: float == 0, "
"error: comparisons between VarChar and Int64 are not supported: "
"invalid parameter"})


class TestSearchBase(TestcaseBase):
@pytest.fixture(
@@ -13279,6 +13346,74 @@ def test_search_none_data_partial_load(self, is_flush, enable_dynamic_field, nul
"limit": default_limit,
"output_fields": output_fields})

@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="issue #37547")
def test_search_none_data_expr_cache(self, is_flush):
"""
target: test the expr plan cache with None data when a collection is recreated with a conflicting schema
method: 1. create collection with a nullable float datatype field
2. search with expr "float == 0"
3. drop this collection
4. create collection with the same collection name and the same field name, but change the type
of the nullable field to varchar datatype
5. search with expr "float == 0" again
expected: 1. search successfully with limit(topK) for the first collection
2. report a type-mismatch error for the second collection with the same name
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, is_flush=is_flush,
nullable_fields={ct.default_float_field_name: 0.5})[0:5]
collection_name = collection_w.name
# 2. generate search data
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
# 3. search with expr "float == 0" on the nullable field
search_exp = f"{ct.default_float_field_name} == 0"
output_fields = [default_int64_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": 1,
"output_fields": output_fields})
# 4. drop collection
collection_w.drop()
# 5. create the same collection name with same field name but varchar field type
int64_field = cf.gen_int64_field(is_primary=True)
string_field = cf.gen_string_field(ct.default_float_field_name, nullable=True)
json_field = cf.gen_json_field()
float_vector_field = cf.gen_float_vec_field()
fields = [int64_field, string_field, json_field, float_vector_field]
schema = cf.gen_collection_schema(fields)
collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
int64_values = pd.Series(data=[i for i in range(default_nb)])
string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
json_values = [{"number": i, "string": str(i), "bool": bool(i),
"list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
float_vec_values = cf.gen_vectors(default_nb, default_dim)
df = pd.DataFrame({
ct.default_int64_field_name: int64_values,
ct.default_float_field_name: None,
ct.default_json_field_name: json_values,
ct.default_float_vec_field_name: float_vec_values
})
collection_w.insert(df)
collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
collection_w.load()
collection_w.flush()
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
search_exp,
output_fields=output_fields,
check_task=CheckTasks.err_res,
check_items={"err_code": 1100,
"err_msg": "failed to create query plan: cannot parse expression: float == 0, "
"error: comparisons between VarChar and Int64 are not supported: "
"invalid parameter"})


class TestSearchWithTextMatchFilter(TestcaseBase):
"""
