diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py
index a0f41378fb47e..2ec849bedf585 100644
--- a/tests/python_client/testcases/test_query.py
+++ b/tests/python_client/testcases/test_query.py
@@ -4201,7 +4201,7 @@ def test_query_text_match_normal(
             FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
         ]
         schema = CollectionSchema(fields=fields, description="test collection")
-        data_size = 5000
+        data_size = 3000
         collection_w = self.init_collection_wrap(
             name=cf.gen_unique_str(prefix), schema=schema
         )
@@ -4232,7 +4232,6 @@ def test_query_text_match_normal(
                 if i + batch_size < len(df)
                 else data[i : len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
@@ -4252,7 +4251,7 @@ def test_query_text_match_normal(
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
             assert len(res) > 0
-            log.info(f"res len {len(res)} res {res}")
+            log.info(f"res len {len(res)}")
             for r in res:
                 assert token in r[field]

@@ -4275,7 +4274,7 @@ def test_query_text_match_normal(
             expr = f"TextMatch({field}, '{string_of_top_10_words}')"
             log.info(f"expr {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
-            log.info(f"res len {len(res)} res {res}")
+            log.info(f"res len {len(res)}")
             for r in res:
                 assert any([token in r[field] for token in top_10_tokens])

@@ -4386,7 +4385,7 @@ def test_query_text_match_custom_analyzer(self):
             expr = f"TextMatch({field}, '{token}')"
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
-            log.info(f"res len {len(res)} res {res}")
+            log.info(f"res len {len(res)}")
             for r in res:
                 assert token in r[field]

@@ -4400,7 +4399,7 @@ def test_query_text_match_custom_analyzer(self):
             expr = f"TextMatch({field}, '{string_of_top_10_words}')"
             log.info(f"expr {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
-            log.info(f"res len {len(res)} res {res}")
+            log.info(f"res len {len(res)}")
             for r in res:
                 assert any([token in r[field] for token in top_10_tokens])

@@ -4609,7 +4608,7 @@ def test_query_text_match_with_combined_expression_for_multi_field(self):
         log.info(f"expr: {text_match_expr}")
         res, _ = collection_w.query(expr=text_match_expr, output_fields=text_fields)
         onetime_res = res
-        log.info(f"res len {len(res)} res {res}")
+        log.info(f"res len {len(res)}")
         step_by_step_results = []
         for expr in query:
             if isinstance(expr, dict):
@@ -4626,7 +4625,7 @@ def test_query_text_match_with_combined_expression_for_multi_field(self):
                     log.info(
                         f"text match res {len(text_match_df)}\n{text_match_df[key]}"
                     )
-                    log.info(f"tmp expr {tmp_expr} {len(res)}, {res}")
+                    log.info(f"tmp expr {tmp_expr} {len(res)}")
                     tmp_idx = [r["id"] for r in res]
                     step_by_step_results.append(tmp_idx)
                     pandas_filter_res = cf.generate_pandas_text_match_result(
@@ -4645,7 +4644,6 @@ def test_query_text_match_with_combined_expression_for_multi_field(self):
                 )
             if isinstance(expr, str):
                 step_by_step_results.append(expr)
-        log.info(f"step by step results {step_by_step_results}")
         final_res = cf.evaluate_expression(step_by_step_results)
         log.info(f"one time res {len(onetime_res)}, final res {len(final_res)}")
         if len(onetime_res) != len(final_res):
@@ -4774,6 +4772,8 @@ def test_query_text_match_with_multi_lang(self):
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
             log.info(f"res len {len(res)}")
             assert len(res) > 0
+            for r in res:
+                assert token in r[field]

         # query single field for multi-word
         for field in text_fields:
@@ -4786,6 +4786,9 @@ def test_query_text_match_with_multi_lang(self):
             log.info(f"expr {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
             log.info(f"res len {len(res)}")
+            assert len(res) > 0
+            for r in res:
+                assert any([token in r[field] for token in multi_words])

     @pytest.mark.tags(CaseLabel.L1)
     def test_query_text_match_with_addition_inverted_index(self):
@@ -4847,14 +4850,13 @@ def test_query_text_match_with_addition_inverted_index(self):
         for i in range(data_size):
             d = {
                 "id": i,
-                "word": fake_en.word(),
-                "sentence": fake_en.sentence(),
-                "paragraph": fake_en.paragraph(),
-                "text": fake_en.text(),
+                "word": fake_en.word().lower(),
+                "sentence": fake_en.sentence().lower(),
+                "paragraph": fake_en.paragraph().lower(),
+                "text": fake_en.text().lower(),
                 "emb": cf.gen_vectors(1, dim)[0],
             }
             data.append(d)
-        log.info(f"data\n{data[:10]}")
         batch_size = 5000
         for i in range(0, data_size, batch_size):
             collection_w.insert(
@@ -4862,7 +4864,6 @@ def test_query_text_match_with_addition_inverted_index(self):
                 if i + batch_size < data_size
                 else data[i:data_size]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
@@ -4870,6 +4871,7 @@ def test_query_text_match_with_addition_inverted_index(self):
         collection_w.create_index("word", {"index_type": "INVERTED"})
         collection_w.load()
         df = pd.DataFrame(data)
+        log.info(f"dataframe\n{df}")
         text_fields = ["word", "sentence", "paragraph", "text"]
         wf_map = {}
         for field in text_fields:
@@ -4880,14 +4882,17 @@ def test_query_text_match_with_addition_inverted_index(self):
             expr = f"TextMatch({field}, '{token}')"
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
-            log.info(f"res len {len(res)} res {res}")
+            log.info(f"res len {len(res)}")
             assert len(res) > 0
+            for r in res:
+                assert token in r[field]
             if field == "word":
                 assert len(res) == wf_map[field].most_common()[-1][1]
                 expr = f"{field} == '{token}'"
                 log.info(f"expr: {expr}")
                 res, _ = collection_w.query(expr=expr, output_fields=["id", field])
-                log.info(f"res len {len(res)} res {res}")
+                log.info(f"res len {len(res)}")
+                assert len(res) == wf_map[field].most_common()[-1][1]

     @pytest.mark.tags(CaseLabel.L1)
     def test_query_text_match_with_some_empty_string(self):
@@ -4991,9 +4996,9 @@ def test_query_text_match_with_some_empty_string(self):
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i : i + batch_size]
+                data[i: i + batch_size]
                 if i + batch_size < len(df)
-                else data[i : len(df)]
+                else data[i: len(df)]
             )
         collection_w.flush()
         collection_w.create_index(
@@ -5009,7 +5014,8 @@ def test_query_text_match_with_some_empty_string(self):
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
             log.info(f"res len {len(res)}")
             assert len(res) > 0
-
+            for r in res:
+                assert token in r[field]
         # query single field for multi-word
         for field in text_fields:
             # match top 3 most common words
@@ -5021,6 +5027,9 @@ def test_query_text_match_with_some_empty_string(self):
             log.info(f"expr {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
             log.info(f"res len {len(res)}")
+            assert len(res) > 0
+            for r in res:
+                assert any([token in r[field] for token in multi_words])


 class TestQueryTextMatchNegative(TestcaseBase):