Skip to content

Commit

Permalink
Refactor create index interface (infiniflow#1634)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Refactor the create-index interface so that it takes a single `IndexInfo` structure instead of a list of `IndexInfo` entries.

### Type of change

- [x] Refactoring
- [x] Test cases
- [x] Python SDK impacted; the PyPI package needs to be updated

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
  • Loading branch information
JinHai-CN authored Aug 10, 2024
1 parent 1f8b540 commit 3ea3a16
Show file tree
Hide file tree
Showing 54 changed files with 3,215 additions and 3,485 deletions.
6 changes: 2 additions & 4 deletions benchmark/local_infinity/fulltext/fulltext_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,22 +192,20 @@ void BenchmarkCreateIndex(SharedPtr<Infinity> infinity,
const String &index_name) {
BaseProfiler profiler;
profiler.Begin();
auto index_info_list = new Vector<IndexInfo *>();
auto index_info = new IndexInfo();
index_info->index_type_ = IndexType::kFullText;
index_info->column_name_ = "text";
index_info->index_param_list_ = new Vector<InitParameter *>();
index_info_list->push_back(index_info);

auto r = infinity->CreateIndex(db_name, table_name, index_name, index_info_list, CreateIndexOptions());
auto r = infinity->CreateIndex(db_name, table_name, index_name, index_info, CreateIndexOptions());
if (r.IsOk()) {
r = infinity->Flush();
} else {
LOG_ERROR(fmt::format("Fail to create index {}", r.ToString()));
return;
}

// NOTE: ~CreateStatement() has already deleated or freed index_info_list, index_info, index_info->index_param_list_.
// NOTE: ~CreateStatement() has already deleted or freed index_info, index_info->index_param_list_.
LOG_INFO(fmt::format("Create index cost: {}", profiler.ElapsedToString()));
profiler.End();
}
Expand Down
28 changes: 12 additions & 16 deletions benchmark/local_infinity/infinity_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -382,25 +382,21 @@ int main() {
import_options.copy_file_type_ = CopyFileType::kFVECS;
auto r3 = infinity->Import(db_name, table_name, sift_base_path, import_options);

auto index_info_list = new std::vector<IndexInfo *>();
{
auto index_info = new IndexInfo();
index_info->index_type_ = IndexType::kHnsw;
index_info->column_name_ = col1_name;
auto index_info = new IndexInfo();
index_info->index_type_ = IndexType::kHnsw;
index_info->column_name_ = col1_name;

{
auto index_param_list = new std::vector<InitParameter *>();
index_param_list->emplace_back(new InitParameter("M", std::to_string(16)));
index_param_list->emplace_back(new InitParameter("ef_construction", std::to_string(200)));
index_param_list->emplace_back(new InitParameter("ef", std::to_string(200)));
index_param_list->emplace_back(new InitParameter("metric", "l2"));
index_param_list->emplace_back(new InitParameter("encode", "lvq"));
index_info->index_param_list_ = index_param_list;
}
index_info_list->emplace_back(index_info);
{
auto index_param_list = new std::vector<InitParameter *>();
index_param_list->emplace_back(new InitParameter("M", std::to_string(16)));
index_param_list->emplace_back(new InitParameter("ef_construction", std::to_string(200)));
index_param_list->emplace_back(new InitParameter("ef", std::to_string(200)));
index_param_list->emplace_back(new InitParameter("metric", "l2"));
index_param_list->emplace_back(new InitParameter("encode", "lvq"));
index_info->index_param_list_ = index_param_list;
}

infinity->CreateIndex(db_name, table_name, index_name, index_info_list, CreateIndexOptions());
infinity->CreateIndex(db_name, table_name, index_name, index_info, CreateIndexOptions());
} while (false);

// {
Expand Down
28 changes: 12 additions & 16 deletions benchmark/local_infinity/knn/knn_import_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,25 +127,21 @@ int main(int argc, char *argv[]) {
QueryResult query_result = infinity->Import(db_name, table_name, base_path, import_options);
std::cout << "Import data cost: " << profiler.ElapsedToString() << std::endl;

auto index_info_list = new std::vector<IndexInfo *>();
auto index_info = new IndexInfo();
index_info->index_type_ = IndexType::kHnsw;
index_info->column_name_ = col1_name;

{
auto index_info = new IndexInfo();
index_info->index_type_ = IndexType::kHnsw;
index_info->column_name_ = col1_name;

{
auto index_param_list = new std::vector<InitParameter *>();
index_param_list->emplace_back(new InitParameter("M", std::to_string(16)));
index_param_list->emplace_back(new InitParameter("ef_construction", std::to_string(200)));
index_param_list->emplace_back(new InitParameter("ef", std::to_string(200)));
index_param_list->emplace_back(new InitParameter("metric", "l2"));
index_param_list->emplace_back(new InitParameter("encode", "lvq"));
index_info->index_param_list_ = index_param_list;
}
index_info_list->emplace_back(index_info);
auto index_param_list = new std::vector<InitParameter *>();
index_param_list->emplace_back(new InitParameter("M", std::to_string(16)));
index_param_list->emplace_back(new InitParameter("ef_construction", std::to_string(200)));
index_param_list->emplace_back(new InitParameter("ef", std::to_string(200)));
index_param_list->emplace_back(new InitParameter("metric", "l2"));
index_param_list->emplace_back(new InitParameter("encode", "lvq"));
index_info->index_param_list_ = index_param_list;
}

query_result = infinity->CreateIndex(db_name, table_name, index_name, index_info_list, CreateIndexOptions());
query_result = infinity->CreateIndex(db_name, table_name, index_name, index_info, CreateIndexOptions());

if (query_result.IsOk()) {
std::cout << "Create Index cost: " << profiler.ElapsedToString() << std::endl;
Expand Down
6 changes: 3 additions & 3 deletions example/ColBERT_reranker_example/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,12 @@ def create_test_env(self, schema: dict):
self.colbert_test_table = self.colbert_test_db.create_table(self.test_table_name, schema, ConflictType.Error)
# NOTICE: the following statement is for english text
self.colbert_test_table.create_index("test_ft_index",
[index.IndexInfo(self.inner_col_txt, index.IndexType.FullText, [])],
index.IndexInfo(self.inner_col_txt, index.IndexType.FullText, []),
ConflictType.Error)
# please enable the following statement to use chinese text
# self.colbert_test_table.create_index("test_ft_index",
# [index.IndexInfo(self.inner_col_txt, index.IndexType.FullText,
# [infinity.index.InitParameter("ANALYZER", "chinese")])],
# index.IndexInfo(self.inner_col_txt, index.IndexType.FullText,
# [infinity.index.InitParameter("ANALYZER", "chinese")]),
# ConflictType.Error)

# clear the test environment for ColBERT
Expand Down
4 changes: 1 addition & 3 deletions example/fulltext_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,7 @@
# Create index on varchar column for full-text search and multiple way fusion.
res = table_instance.create_index(
"my_index",
[
infinity.index.IndexInfo("body", infinity.index.IndexType.FullText, []),
],
infinity.index.IndexInfo("body", infinity.index.IndexType.FullText, []),
infinity.common.ConflictType.Error,
)

Expand Down
6 changes: 2 additions & 4 deletions example/fulltext_search_zh.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,8 @@
# Create index on varchar column for full-text search and multiple way fusion.
res = table_instance.create_index(
"my_index",
[
infinity.index.IndexInfo("body", infinity.index.IndexType.FullText,
[infinity.index.InitParameter("ANALYZER", "chinese")])
],
infinity.index.IndexInfo("body", infinity.index.IndexType.FullText,
infinity.index.InitParameter("ANALYZER", "chinese")),
infinity.common.ConflictType.Error,
)

Expand Down
2 changes: 1 addition & 1 deletion python/benchmark/clients/infinity_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def upload(self):
# create index
indexs = self._parse_index_schema(self.data["index"])
for i, idx in enumerate(indexs):
table_obj.create_index(f"index{i}", [idx])
table_obj.create_index(f"index{i}", idx)

inf_http_client = InfinityHttpClient("default_db", self.table_name)

Expand Down
4 changes: 2 additions & 2 deletions python/benchmark/fulltext_import_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def insert_data(db_obj, data):
table_obj = db_obj.create_table("insert_benchmark",
{"id": {"type": "varchar"}, "title": {"type": "varchar"},
"text": {"type": "varchar"}}, ConflictType.Error)
res = table_obj.create_index("text_index", [index.IndexInfo("text", index.IndexType.FullText, [])])
res = table_obj.create_index("text_index", index.IndexInfo("text", index.IndexType.FullText, []))
assert res.error_code == ErrorCode.OK
inserted_records_num = 0
while inserted_records_num < len(data):
Expand All @@ -50,7 +50,7 @@ def import_file(db_obj, path):
{"id": {"type": "varchar"}, "title": {"type": "varchar"},
"text": {"type": "varchar"}}, ConflictType.Error)
assert table_obj
res = table_obj.create_index("text_index", [index.IndexInfo("text", index.IndexType.FullText, [])])
res = table_obj.create_index("text_index", index.IndexInfo("text", index.IndexType.FullText, []))
assert res.error_code == ErrorCode.OK
table_obj.import_data(path, {'file_type': 'jsonl'})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def insert_data(db_obj, data):
table_obj = db_obj.create_table("insert_benchmark",
{"id": {"type": "varchar"}, "title": {"type": "varchar"},
"text": {"type": "varchar"}}, ConflictType.Error)
res = table_obj.create_index("text_index", [index.IndexInfo("text", index.IndexType.FullText, [])])
res = table_obj.create_index("text_index", index.IndexInfo("text", index.IndexType.FullText, []))
assert res.error_code == ErrorCode.OK
inserted_records_num = 0
while inserted_records_num < len(data):
Expand All @@ -50,7 +50,7 @@ def import_file(db_obj, path):
{"id": {"type": "varchar"}, "title": {"type": "varchar"},
"text": {"type": "varchar"}}, ConflictType.Error)
assert table_obj
res = table_obj.create_index("text_index", [index.IndexInfo("text", index.IndexType.FullText, [])])
res = table_obj.create_index("text_index", index.IndexInfo("text", index.IndexType.FullText, []))
assert res.error_code == ErrorCode.OK
table_obj.import_data(path, {'file_type': 'jsonl'})

Expand Down
24 changes: 11 additions & 13 deletions python/benchmark/legacy_benchmark/remote_benchmark_knn_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,19 +103,17 @@ def import_gist_1m(path, m: int, ef_construction: int, remote: bool):
def create_index(table_obj, m: int, ef_construction: int, remote: bool):
res = table_obj.create_index(
"hnsw_index",
[
index.IndexInfo(
"col1",
index.IndexType.Hnsw,
[
index.InitParameter("M", str(m)),
index.InitParameter("ef_construction", str(ef_construction)),
index.InitParameter("ef", str(ef_construction)),
index.InitParameter("metric", "l2"),
index.InitParameter("encode", "lvq"),
],
)
],
index.IndexInfo(
"col1",
index.IndexType.Hnsw,
[
index.InitParameter("M", str(m)),
index.InitParameter("ef_construction", str(ef_construction)),
index.InitParameter("ef", str(ef_construction)),
index.InitParameter("metric", "l2"),
index.InitParameter("encode", "lvq"),
],
)
)

assert res.error_code == ErrorCode.OK
Expand Down
18 changes: 9 additions & 9 deletions python/benchmark/legacy_benchmark/remote_benchmark_knn_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,15 +123,15 @@ def create_index(table_name):
conn = ThriftInfinityClient(LOCAL_HOST)
table = RemoteTable(conn, "default_db", table_name)
res = table.create_index("hnsw_index",
[index.IndexInfo("col1",
index.IndexType.Hnsw,
[
index.InitParameter("M", "16"),
index.InitParameter("ef_construction", "200"),
index.InitParameter("ef", "200"),
index.InitParameter("metric", "l2"),
index.InitParameter("encode", "lvq")
])])
index.IndexInfo("col1",
index.IndexType.Hnsw,
[
index.InitParameter("M", "16"),
index.InitParameter("ef_construction", "200"),
index.InitParameter("ef", "200"),
index.InitParameter("metric", "l2"),
index.InitParameter("encode", "lvq")
]))

assert res.error_code == ErrorCode.OK

Expand Down
34 changes: 16 additions & 18 deletions python/benchmark/legacy_benchmark/remote_benchmark_sparse_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@


def import_data(
data_set: str,
block_size: int,
compress: bool,
opt_topk: int,
bp_reorder: bool,
remote: bool,
data_set: str,
block_size: int,
compress: bool,
opt_topk: int,
bp_reorder: bool,
remote: bool,
):
current_path = os.getcwd()
data_dir = current_path + "/test/data/benchmark/splade"
Expand Down Expand Up @@ -70,18 +70,16 @@ def import_data(
start = time.time()
res = table_obj.create_index(
"splade_index",
[
index.IndexInfo(
"col1",
index.IndexType.BMP,
[
index.InitParameter("block_size", str(block_size)),
index.InitParameter(
"compress_type", "compress" if compress else "raww"
),
],
)
],
index.IndexInfo(
"col1",
index.IndexType.BMP,
[
index.InitParameter("block_size", str(block_size)),
index.InitParameter(
"compress_type", "compress" if compress else "raww"
),
],
)
)
assert res.error_code == ErrorCode.OK
end = time.time()
Expand Down
24 changes: 12 additions & 12 deletions python/benchmark/mldr_benchmark/insert_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,26 +91,26 @@ def main(self):
if lang == "zh":
ft_params.append(index.InitParameter("analyzer", "chinese"))
res = self.infinity_table.create_index("ft_index",
[index.IndexInfo("fulltext_col", index.IndexType.FullText, ft_params)],
index.IndexInfo("fulltext_col", index.IndexType.FullText, ft_params),
ConflictType.Error)
assert res.error_code == ErrorCode.OK
print("Finish creating fulltext index.")
print("Start creating Hnsw index...")
res = self.infinity_table.create_index("hnsw_index", [index.IndexInfo("dense_col", index.IndexType.Hnsw,
[index.InitParameter("M", "16"),
index.InitParameter("ef_construction",
"200"),
index.InitParameter("ef", "200"),
index.InitParameter("metric", "ip"),
index.InitParameter("encode", "lvq")])],
res = self.infinity_table.create_index("hnsw_index", index.IndexInfo("dense_col", index.IndexType.Hnsw,
[index.InitParameter("M", "16"),
index.InitParameter("ef_construction",
"200"),
index.InitParameter("ef", "200"),
index.InitParameter("metric", "ip"),
index.InitParameter("encode", "lvq")]),
ConflictType.Error)
assert res.error_code == ErrorCode.OK
print("Finish creating Hnsw index.")
print("Start creating BMP index...")
res = self.infinity_table.create_index("bmp_index", [index.IndexInfo("sparse_col", index.IndexType.BMP,
[index.InitParameter("block_size", "8"),
index.InitParameter("compress_type",
"compress")])],
res = self.infinity_table.create_index("bmp_index", index.IndexInfo("sparse_col", index.IndexType.BMP,
[index.InitParameter("block_size", "8"),
index.InitParameter("compress_type",
"compress")]),
ConflictType.Error)
assert res.error_code == ErrorCode.OK
self.infinity_table.optimize("bmp_index", {"topk": "1000", "bp_reorder": ""})
Expand Down
Loading

0 comments on commit 3ea3a16

Please sign in to comment.