Skip to content

Commit

Permalink
test: add different language tests and modify some cases (#36465)
Browse files Browse the repository at this point in the history
fix: #36396

Signed-off-by: nico <cheng.yuan@zilliz.com>
  • Loading branch information
NicoYuan1986 authored Sep 26, 2024
1 parent 447e326 commit cfd636e
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 83 deletions.
6 changes: 3 additions & 3 deletions tests/python_client/base/client_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def init_collection_general(self, prefix="test", insert_data=False, nb=ct.defaul
primary_field=ct.default_int64_field_name, is_flush=True, name=None,
enable_dynamic_field=False, with_json=True, random_primary_key=False,
multiple_dim_array=[], is_partition_key=None, vector_data_type="FLOAT_VECTOR",
nullable_fields={}, default_value_fields={}, **kwargs):
nullable_fields={}, default_value_fields={}, language=None, **kwargs):
"""
target: create specified collections
method: 1. create collections (binary/non-binary, default/all data type, auto_id or not)
Expand Down Expand Up @@ -311,7 +311,7 @@ def init_collection_general(self, prefix="test", insert_data=False, nb=ct.defaul
dim=dim, enable_dynamic_field=enable_dynamic_field, with_json=with_json,
random_primary_key=random_primary_key, multiple_dim_array=multiple_dim_array,
primary_field=primary_field, vector_data_type=vector_data_type,
nullable_fields=nullable_fields)
nullable_fields=nullable_fields, language=language)
if is_flush:
assert collection_w.is_empty is False
assert collection_w.num_entities == nb
Expand All @@ -324,7 +324,7 @@ def init_collection_general(self, prefix="test", insert_data=False, nb=ct.defaul
for vector_name in vector_name_list:
collection_w.create_index(vector_name, ct.default_sparse_inverted_index)
else:
if len(multiple_dim_array) == 0 or is_all_data_type == False:
if len(multiple_dim_array) == 0 or is_all_data_type is False:
vector_name_list.append(ct.default_float_vec_field_name)
for vector_name in vector_name_list:
# Unlike dense vectors, sparse vectors cannot create flat index.
Expand Down
64 changes: 49 additions & 15 deletions tests/python_client/common/common_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ def generate_array_dataset(size, array_length, hit_probabilities, target_values)

return dataset


def prepare_array_test_data(data_size, hit_rate=0.005, dim=128):
size = data_size # Number of arrays in the dataset
array_length = 10 # Length of each array
Expand Down Expand Up @@ -421,7 +422,6 @@ def prepare_array_test_data(data_size, hit_rate=0.005, dim=128):
return train_df, query_expr



def gen_unique_str(str_value=None):
    """Generate a unique test identifier string.

    Appends an 8-character random alphanumeric suffix to *str_value*,
    or to the literal prefix ``test`` when *str_value* is None.
    """
    alphabet = string.ascii_letters + string.digits
    suffix = "".join(random.choice(alphabet) for _ in range(8))
    if str_value is None:
        return "test_" + suffix
    return str_value + "_" + suffix
Expand All @@ -433,6 +433,26 @@ def gen_str_by_length(length=8, letters_only=False):
return "".join(random.choice(string.ascii_letters + string.digits) for _ in range(length))


def generate_random_sentence(language):
    """Return a random Faker-generated sentence localized for *language*.

    *language* is a human-readable language name (e.g. ``"Chinese"``);
    any name not in the supported set falls back to the ``en_US`` locale.
    """
    locale_by_language = {
        "English": "en_US",
        "French": "fr_FR",
        "Spanish": "es_ES",
        "German": "de_DE",
        "Italian": "it_IT",
        "Portuguese": "pt_PT",
        "Russian": "ru_RU",
        "Chinese": "zh_CN",
        "Japanese": "ja_JP",
        "Korean": "ko_KR",
        "Arabic": "ar_SA",
        "Hindi": "hi_IN",
    }
    locale = locale_by_language.get(language, "en_US")
    return Faker(locale).sentence()


def gen_digits_by_length(length=8):
    """Return a random string of *length* decimal digits (default 8)."""
    chosen = [random.choice(string.digits) for _ in range(length)]
    return "".join(chosen)

Expand Down Expand Up @@ -957,7 +977,7 @@ def gen_binary_vectors(num, dim):
def gen_default_dataframe_data(nb=ct.default_nb, dim=ct.default_dim, start=0, with_json=True,
random_primary_key=False, multiple_dim_array=[], multiple_vector_field_name=[],
vector_data_type="FLOAT_VECTOR", auto_id=False,
primary_field = ct.default_int64_field_name, nullable_fields={}):
primary_field=ct.default_int64_field_name, nullable_fields={}, language=None):
if not random_primary_key:
int_values = pd.Series(data=[i for i in range(start, start + nb)])
else:
Expand All @@ -973,6 +993,8 @@ def gen_default_dataframe_data(nb=ct.default_nb, dim=ct.default_dim, start=0, wi
float_values = pd.Series(data=float_data, dtype=object)

string_data = [str(i) for i in range(start, start + nb)]
if language:
string_data = [generate_random_sentence(language) for _ in range(nb)]
string_values = pd.Series(data=string_data, dtype="string")
if ct.default_string_field_name in nullable_fields:
null_number = int(nb*nullable_fields[ct.default_string_field_name])
Expand Down Expand Up @@ -1017,7 +1039,7 @@ def gen_default_dataframe_data(nb=ct.default_nb, dim=ct.default_dim, start=0, wi
def gen_default_list_data(nb=ct.default_nb, dim=ct.default_dim, start=0, with_json=True,
random_primary_key=False, multiple_dim_array=[], multiple_vector_field_name=[],
vector_data_type="FLOAT_VECTOR", auto_id=False,
primary_field=ct.default_int64_field_name, nullable_fields={}):
primary_field=ct.default_int64_field_name, nullable_fields={}, language=None):
insert_list = []
if not random_primary_key:
int_values = pd.Series(data=[i for i in range(start, start + nb)])
Expand All @@ -1031,6 +1053,8 @@ def gen_default_list_data(nb=ct.default_nb, dim=ct.default_dim, start=0, with_js
float_data = float_data[:nb - null_number] + null_data
float_values = pd.Series(data=float_data, dtype=object)
string_data = [str(i) for i in range(start, start + nb)]
if language:
string_data = [generate_random_sentence(language) for _ in range(nb)]
string_values = pd.Series(data=string_data, dtype="string")
if ct.default_string_field_name in nullable_fields:
null_number = int(nb * nullable_fields[ct.default_string_field_name])
Expand Down Expand Up @@ -1069,7 +1093,7 @@ def gen_default_list_data(nb=ct.default_nb, dim=ct.default_dim, start=0, with_js

def gen_default_rows_data(nb=ct.default_nb, dim=ct.default_dim, start=0, with_json=True, multiple_dim_array=[],
multiple_vector_field_name=[], vector_data_type="FLOAT_VECTOR", auto_id=False,
primary_field = ct.default_int64_field_name, nullable_fields={}):
primary_field = ct.default_int64_field_name, nullable_fields={}, language=None):
array = []
for i in range(start, start + nb):
dict = {ct.default_int64_field_name: i,
Expand All @@ -1080,6 +1104,8 @@ def gen_default_rows_data(nb=ct.default_nb, dim=ct.default_dim, start=0, with_js
}
if with_json is False:
dict.pop(ct.default_json_field_name, None)
if language:
dict[ct.default_string_field_name] = generate_random_sentence(language)
if auto_id is True:
if primary_field == ct.default_int64_field_name:
dict.pop(ct.default_int64_field_name)
Expand Down Expand Up @@ -1281,7 +1307,7 @@ def gen_dataframe_all_data_type(nb=ct.default_nb, dim=ct.default_dim, start=0, w
def gen_general_list_all_data_type(nb=ct.default_nb, dim=ct.default_dim, start=0, with_json=True,
auto_id=False, random_primary_key=False, multiple_dim_array=[],
multiple_vector_field_name=[], primary_field=ct.default_int64_field_name,
nullable_fields={}):
nullable_fields={}, language=None):
if not random_primary_key:
int64_values = pd.Series(data=[i for i in range(start, start + nb)])
else:
Expand Down Expand Up @@ -1335,6 +1361,8 @@ def gen_general_list_all_data_type(nb=ct.default_nb, dim=ct.default_dim, start=0
double_values = pd.Series(data=double_data, dtype=object)

string_data = [str(i) for i in range(start, start + nb)]
if language:
string_data = [generate_random_sentence(language) for _ in range(nb)]
string_values = pd.Series(data=string_data, dtype="string")
if ct.default_string_field_name in nullable_fields:
null_number = int(nb * nullable_fields[ct.default_string_field_name])
Expand Down Expand Up @@ -1375,7 +1403,7 @@ def gen_general_list_all_data_type(nb=ct.default_nb, dim=ct.default_dim, start=0

def gen_default_rows_data_all_data_type(nb=ct.default_nb, dim=ct.default_dim, start=0, with_json=True,
multiple_dim_array=[], multiple_vector_field_name=[], partition_id=0,
auto_id=False, primary_field=ct.default_int64_field_name):
auto_id=False, primary_field=ct.default_int64_field_name, language=None):
array = []
for i in range(start, start + nb):
dict = {ct.default_int64_field_name: i,
Expand All @@ -1391,6 +1419,8 @@ def gen_default_rows_data_all_data_type(nb=ct.default_nb, dim=ct.default_dim, st
}
if with_json is False:
dict.pop(ct.default_json_field_name, None)
if language:
dict[ct.default_string_field_name] = generate_random_sentence(language)
if auto_id is True:
if primary_field == ct.default_int64_field_name:
dict.pop(ct.default_int64_field_name, None)
Expand All @@ -1412,7 +1442,7 @@ def gen_default_rows_data_all_data_type(nb=ct.default_nb, dim=ct.default_dim, st


def gen_default_binary_dataframe_data(nb=ct.default_nb, dim=ct.default_dim, start=0, auto_id=False,
primary_field=ct.default_int64_field_name, nullable_fields={}):
primary_field=ct.default_int64_field_name, nullable_fields={}, language=None):
int_data = [i for i in range(start, start + nb)]
int_values = pd.Series(data=int_data)
if ct.default_int64_field_name in nullable_fields:
Expand All @@ -1430,6 +1460,8 @@ def gen_default_binary_dataframe_data(nb=ct.default_nb, dim=ct.default_dim, star
float_values = pd.Series(data=float_data, dtype=object)

string_data = [str(i) for i in range(start, start + nb)]
if language:
string_data = [generate_random_sentence(language) for _ in range(nb)]
string_values = pd.Series(data=string_data, dtype="string")
if ct.default_string_field_name in nullable_fields:
null_number = int(nb * nullable_fields[ct.default_string_field_name])
Expand Down Expand Up @@ -2525,7 +2557,7 @@ def gen_partitions(collection_w, partition_num=1):
def insert_data(collection_w, nb=ct.default_nb, is_binary=False, is_all_data_type=False,
auto_id=False, dim=ct.default_dim, insert_offset=0, enable_dynamic_field=False, with_json=True,
random_primary_key=False, multiple_dim_array=[], primary_field=ct.default_int64_field_name,
vector_data_type="FLOAT_VECTOR", nullable_fields={}):
vector_data_type="FLOAT_VECTOR", nullable_fields={}, language=None):
"""
target: insert non-binary/binary data
method: insert non-binary/binary data into partitions if any
Expand Down Expand Up @@ -2553,23 +2585,23 @@ def insert_data(collection_w, nb=ct.default_nb, is_binary=False, is_all_data_typ
multiple_vector_field_name=vector_name_list,
vector_data_type=vector_data_type,
auto_id=auto_id, primary_field=primary_field,
nullable_fields=nullable_fields)
nullable_fields=nullable_fields, language=language)
elif vector_data_type in ct.append_vector_type:
default_data = gen_default_list_data(nb // num, dim=dim, start=start, with_json=with_json,
random_primary_key=random_primary_key,
multiple_dim_array=multiple_dim_array,
multiple_vector_field_name=vector_name_list,
vector_data_type=vector_data_type,
auto_id=auto_id, primary_field=primary_field,
nullable_fields=nullable_fields)
nullable_fields=nullable_fields, language=language)

else:
default_data = gen_default_rows_data(nb // num, dim=dim, start=start, with_json=with_json,
multiple_dim_array=multiple_dim_array,
multiple_vector_field_name=vector_name_list,
vector_data_type=vector_data_type,
auto_id=auto_id, primary_field=primary_field,
nullable_fields=nullable_fields)
nullable_fields=nullable_fields, language=language)

else:
if not enable_dynamic_field:
Expand All @@ -2579,14 +2611,14 @@ def insert_data(collection_w, nb=ct.default_nb, is_binary=False, is_all_data_typ
multiple_dim_array=multiple_dim_array,
multiple_vector_field_name=vector_name_list,
auto_id=auto_id, primary_field=primary_field,
nullable_fields=nullable_fields)
nullable_fields=nullable_fields, language=language)
elif vector_data_type == "FLOAT16_VECTOR" or "BFLOAT16_VECTOR":
default_data = gen_general_list_all_data_type(nb // num, dim=dim, start=start, with_json=with_json,
random_primary_key=random_primary_key,
multiple_dim_array=multiple_dim_array,
multiple_vector_field_name=vector_name_list,
auto_id=auto_id, primary_field=primary_field,
nullable_fields=nullable_fields)
nullable_fields=nullable_fields, language=language)
else:
if os.path.exists(ct.rows_all_data_type_file_path + f'_{i}' + f'_dim{dim}.txt'):
with open(ct.rows_all_data_type_file_path + f'_{i}' + f'_dim{dim}.txt', 'rb') as f:
Expand All @@ -2597,12 +2629,14 @@ def insert_data(collection_w, nb=ct.default_nb, is_binary=False, is_all_data_typ
multiple_dim_array=multiple_dim_array,
multiple_vector_field_name=vector_name_list,
partition_id=i, auto_id=auto_id,
primary_field=primary_field)
primary_field=primary_field,
language=language)
else:
default_data, binary_raw_data = gen_default_binary_dataframe_data(nb // num, dim=dim, start=start,
auto_id=auto_id,
primary_field=primary_field,
nullable_fields=nullable_fields)
nullable_fields=nullable_fields,
language=language)
binary_raw_vectors.extend(binary_raw_data)
insert_res = collection_w.insert(default_data, par[i].name)[0]
log.info(f"inserted {nb // num} data into collection {collection_w.name}")
Expand Down
6 changes: 3 additions & 3 deletions tests/python_client/testcases/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,11 +387,11 @@ def test_insert_inconsistent_data(self):
"""
c_name = cf.gen_unique_str(prefix)
collection_w = self.init_collection_wrap(name=c_name)
data = cf.gen_default_list_data(nb=100)
data[0][1] = 1.0
data = cf.gen_default_rows_data(nb=100)
data[0][ct.default_int64_field_name] = 1.0
error = {ct.err_code: 999,
ct.err_msg: "The Input data type is inconsistent with defined schema, {%s} field should be a int64, "
"but got a {<class 'int'>} instead." % ct.default_int64_field_name}
"but got a {<class 'float'>} instead." % ct.default_int64_field_name}
collection_w.insert(data, check_task=CheckTasks.err_res, check_items=error)


Expand Down
Loading

0 comments on commit cfd636e

Please sign in to comment.