Skip to content

Commit

Permalink
[Feature](inverted index) add inverted index tool (apache#22207)
Browse files Browse the repository at this point in the history
  • Loading branch information
airborne12 authored Jul 27, 2023
1 parent 461c4df commit 8371171
Show file tree
Hide file tree
Showing 4 changed files with 308 additions and 0 deletions.
4 changes: 4 additions & 0 deletions be/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1067,6 +1067,10 @@ if (BUILD_META_TOOL AND BUILD_META_TOOL STREQUAL "ON")
add_subdirectory(${SRC_DIR}/tools)
endif()

if (BUILD_INDEX_TOOL AND BUILD_INDEX_TOOL STREQUAL "ON")
add_subdirectory(${SRC_DIR}/index-tools)
endif()

add_subdirectory(${SRC_DIR}/util)
add_subdirectory(${SRC_DIR}/vec)
add_subdirectory(${SRC_DIR}/pipeline)
Expand Down
47 changes: 47 additions & 0 deletions be/src/index-tools/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# where to put generated libraries
set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/index-tools")

# where to put generated binaries
set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/index-tools")

add_executable(index_tool
index_tool.cpp
)

pch_reuse(index_tool)

# This permits libraries loaded by dlopen to link to the symbols in the program.
set_target_properties(index_tool PROPERTIES ENABLE_EXPORTS 1)


target_link_libraries(index_tool
${DORIS_LINK_LIBS}
)

install(DIRECTORY DESTINATION ${OUTPUT_DIR}/lib/)
install(TARGETS index_tool DESTINATION ${OUTPUT_DIR}/lib/)
if (NOT OS_MACOSX)
# strip debug info to save space
add_custom_command(TARGET index_tool POST_BUILD
COMMAND ${CMAKE_OBJCOPY} --only-keep-debug $<TARGET_FILE:index_tool> $<TARGET_FILE:index_tool>.dbg
COMMAND ${CMAKE_STRIP} --strip-debug --strip-unneeded $<TARGET_FILE:index_tool>
COMMAND ${CMAKE_OBJCOPY} --add-gnu-debuglink=$<TARGET_FILE:index_tool>.dbg $<TARGET_FILE:index_tool>
)
endif()
241 changes: 241 additions & 0 deletions be/src/index-tools/index_tool.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <CLucene.h>
#include <CLucene/config/repl_wchar.h>
#include <gflags/gflags.h>

#include <filesystem>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "io/fs/local_file_system.h"
#include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
#include "olap/rowset/segment_v2/inverted_index_compound_reader.h"

using doris::segment_v2::DorisCompoundReader;
using doris::segment_v2::DorisCompoundDirectory;
using doris::io::FileInfo;
using namespace lucene::analysis;
using namespace lucene::index;
using namespace lucene::util;
using namespace lucene::search;

DEFINE_string(operation, "", "valid operation: show_nested_files,check_terms,term_query");

DEFINE_string(directory, "./", "inverted index file directory");
DEFINE_string(idx_file_name, "", "inverted index file name");
DEFINE_string(idx_file_path, "", "inverted index file path");
DEFINE_string(term, "", "inverted index term to query");
DEFINE_string(column_name, "", "inverted index column_name to query");
DEFINE_string(pred_type, "", "inverted index term query predicate, eq/lt/gt/le/ge/match etc.");
DEFINE_bool(print_row_id, false, "print row id when query terms");

std::string get_usage(const std::string& progname) {
std::stringstream ss;
ss << progname << " is the Doris inverted index file tool.\n";
ss << "Stop BE first before use this tool.\n";
ss << "Usage:\n";
ss << "./index_tool --operation=show_nested_files --idx_file_path=path/to/file\n";
ss << "./index_tool --operation=check_terms_stats --idx_file_path=path/to/file\n";
ss << "./index_tool --operation=term_query --directory=directory "
"--idx_file_name=file --print_row_id --term=term --column_name=column_name "
"--pred_type=eq/lt/gt/le/ge/match etc\n";
return ss.str();
}

void search(lucene::store::Directory* dir, std::string& field, std::string& token,
std::string& pred) {
IndexReader* reader = IndexReader::open(dir);

IndexReader* newreader = reader->reopen();
if (newreader != reader) {
_CLLDELETE(reader);
reader = newreader;
}
IndexSearcher s(reader);
std::unique_ptr<lucene::search::Query> query;

std::wstring field_ws(field.begin(), field.end());
std::wstring token_ws(token.begin(), token.end());
lucene::index::Term* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str());
if (pred == "eq" || pred == "match") {
query.reset(new lucene::search::TermQuery(term));
} else if (pred == "lt") {
query.reset(new lucene::search::RangeQuery(nullptr, term, false));
} else if (pred == "gt") {
query.reset(new lucene::search::RangeQuery(term, nullptr, false));
} else if (pred == "le") {
query.reset(new lucene::search::RangeQuery(nullptr, term, true));
} else if (pred == "ge") {
query.reset(new lucene::search::RangeQuery(term, nullptr, true));
} else {
std::cout << "invalid predicate type:" << pred << std::endl;
exit(-1);
}
_CLDECDELETE(term);

std::vector<uint32_t> result;
int total = 0;

s._search(query.get(), [&result, &total](const int32_t docid, const float_t /*score*/) {
// docid equal to rowid in segment
result.push_back(docid);
if (FLAGS_print_row_id) {
printf("RowID is %d\n", docid);
}
total += 1;
});
std::cout << "Term queried count:" << total << std::endl;

s.close();
reader->close();
_CLLDELETE(reader);
}

void check_terms_stats(lucene::store::Directory* dir) {
IndexReader* r = IndexReader::open(dir);

printf("Max Docs: %d\n", r->maxDoc());
printf("Num Docs: %d\n", r->numDocs());

int64_t ver = r->getCurrentVersion(dir);
printf("Current Version: %f\n", (float_t)ver);

TermEnum* te = r->terms();
int32_t nterms;
for (nterms = 0; te->next(); nterms++) {
/* empty */
std::string token = lucene_wcstoutf8string(te->term()->text(), te->term()->textLength());

printf("Term: %s ", token.c_str());
printf("Freq: %d\n", te->docFreq());
}
printf("Term count: %d\n\n", nterms);
_CLLDELETE(te);

r->close();
_CLLDELETE(r);
}

int main(int argc, char** argv) {
std::string usage = get_usage(argv[0]);
gflags::SetUsageMessage(usage);
google::ParseCommandLineFlags(&argc, &argv, true);

if (FLAGS_operation == "show_nested_files") {
if (FLAGS_idx_file_path == "") {
std::cout << "no file flag for show " << std::endl;
return -1;
}
std::filesystem::path p(FLAGS_idx_file_path);
std::string dir_str = p.parent_path().string();
std::string file_str = p.filename().string();
auto fs = doris::io::global_local_filesystem();
try {
lucene::store::Directory* dir =
DorisCompoundDirectory::getDirectory(fs, dir_str.c_str());
auto reader = new DorisCompoundReader(dir, file_str.c_str(), 4096);
std::vector<std::string> files;
std::cout << "Nested files for " << file_str << std::endl;
std::cout << "==================================" << std::endl;
reader->list(&files);
for (auto& file : files) {
std::cout << file << std::endl;
}
} catch (CLuceneError& err) {
std::cerr << "error occurred when show files: " << err.what() << std::endl;
}
} else if (FLAGS_operation == "check_terms_stats") {
if (FLAGS_idx_file_path == "") {
std::cout << "no file flag for check " << std::endl;
return -1;
}
std::filesystem::path p(FLAGS_idx_file_path);
std::string dir_str = p.parent_path().string();
std::string file_str = p.filename().string();
auto fs = doris::io::global_local_filesystem();
try {
lucene::store::Directory* dir =
DorisCompoundDirectory::getDirectory(fs, dir_str.c_str());
auto reader = new DorisCompoundReader(dir, file_str.c_str(), 4096);
std::cout << "Term statistics for " << file_str << std::endl;
std::cout << "==================================" << std::endl;
check_terms_stats(reader);
} catch (CLuceneError& err) {
std::cerr << "error occurred when check_terms_stats: " << err.what() << std::endl;
}
} else if (FLAGS_operation == "term_query") {
if (FLAGS_directory == "" || FLAGS_term == "" || FLAGS_column_name == "" ||
FLAGS_pred_type == "") {
std::cout << "invalid params for term_query " << std::endl;
return -1;
}
auto fs = doris::io::global_local_filesystem();
try {
lucene::store::Directory* dir =
DorisCompoundDirectory::getDirectory(fs, FLAGS_directory.c_str());
if (FLAGS_idx_file_name == "") {
//try to search from directory's all files
std::vector<FileInfo> files;
bool exists = false;
std::filesystem::path root_dir(FLAGS_directory);
fs->list(root_dir, true, &files, &exists);
if (!exists) {
std::cout << FLAGS_directory << " is not exists" << std::endl;
return -1;
}
for (auto& f : files) {
try {
auto file_str = f.file_name;
if (!file_str.ends_with(".idx")) {
continue;
}
auto reader = new DorisCompoundReader(dir, file_str.c_str(), 4096);
std::cout << "Search " << FLAGS_column_name << ":" << FLAGS_term << " from "
<< file_str << std::endl;
std::cout << "==================================" << std::endl;
search(reader, FLAGS_column_name, FLAGS_term, FLAGS_pred_type);
} catch (CLuceneError& err) {
std::cerr << "error occurred when search file: " << f.file_name
<< ", error:" << err.what() << std::endl;
}
}
} else {
auto reader = new DorisCompoundReader(dir, FLAGS_idx_file_name.c_str(), 4096);
std::cout << "Search " << FLAGS_column_name << ":" << FLAGS_term << " from "
<< FLAGS_idx_file_name << std::endl;
std::cout << "==================================" << std::endl;
try {
search(reader, FLAGS_column_name, FLAGS_term, FLAGS_pred_type);
} catch (CLuceneError& err) {
std::cerr << "error occurred when search file: " << FLAGS_idx_file_name
<< ", error:" << err.what() << std::endl;
}
}
} catch (CLuceneError& err) {
std::cerr << "error occurred when check_terms_stats: " << err.what() << std::endl;
}
} else {
std::cout << "invalid operation: " << FLAGS_operation << "\n" << usage << std::endl;
return -1;
}
gflags::ShutDownCommandLineFlags();
return 0;
}
16 changes: 16 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ Usage: $0 <options>
--fe build Frontend and Spark DPP application. Default ON.
--be build Backend. Default ON.
--meta-tool build Backend meta tool. Default OFF.
--index-tool build Backend inverted index tool. Default OFF.
--broker build Broker. Default ON.
--audit build audit loader. Default ON.
--spark-dpp build Spark DPP application. Default ON.
Expand All @@ -60,6 +61,7 @@ Usage: $0 <options>
$0 build all
$0 --be build Backend
$0 --meta-tool build Backend meta tool
$0 --index-tool build Backend inverted index tool
$0 --fe --clean clean and build Frontend and Spark Dpp application
$0 --fe --be --clean clean and build Frontend, Spark Dpp application and Backend
$0 --spark-dpp build Spark DPP application alone
Expand Down Expand Up @@ -117,6 +119,7 @@ if ! OPTS="$(getopt \
-l 'broker' \
-l 'audit' \
-l 'meta-tool' \
-l 'index-tool' \
-l 'spark-dpp' \
-l 'hive-udf' \
-l 'be-java-extensions' \
Expand All @@ -137,6 +140,7 @@ BUILD_BE=0
BUILD_BROKER=0
BUILD_AUDIT=0
BUILD_META_TOOL='OFF'
BUILD_INDEX_TOOL='OFF'
BUILD_SPARK_DPP=0
BUILD_BE_JAVA_EXTENSIONS=0
BUILD_HIVE_UDF=0
Expand All @@ -152,6 +156,7 @@ if [[ "$#" == 1 ]]; then
BUILD_BROKER=1
BUILD_AUDIT=1
BUILD_META_TOOL='OFF'
BUILD_INDEX_TOOL='OFF'
BUILD_SPARK_DPP=1
BUILD_HIVE_UDF=1
BUILD_BE_JAVA_EXTENSIONS=1
Expand Down Expand Up @@ -183,6 +188,10 @@ else
BUILD_META_TOOL='ON'
shift
;;
--index-tool)
BUILD_INDEX_TOOL='ON'
shift
;;
--spark-dpp)
BUILD_SPARK_DPP=1
shift
Expand Down Expand Up @@ -237,6 +246,7 @@ else
BUILD_BROKER=1
BUILD_AUDIT=1
BUILD_META_TOOL='ON'
BUILD_INDEX_TOOL='ON'
BUILD_SPARK_DPP=1
BUILD_HIVE_UDF=1
BUILD_BE_JAVA_EXTENSIONS=1
Expand Down Expand Up @@ -392,6 +402,7 @@ echo "Get params:
BUILD_BROKER -- ${BUILD_BROKER}
BUILD_AUDIT -- ${BUILD_AUDIT}
BUILD_META_TOOL -- ${BUILD_META_TOOL}
BUILD_INDEX_TOOL -- ${BUILD_INDEX_TOOL}
BUILD_SPARK_DPP -- ${BUILD_SPARK_DPP}
BUILD_BE_JAVA_EXTENSIONS -- ${BUILD_BE_JAVA_EXTENSIONS}
BUILD_HIVE_UDF -- ${BUILD_HIVE_UDF}
Expand Down Expand Up @@ -487,6 +498,7 @@ if [[ "${BUILD_BE}" -eq 1 ]]; then
-DWITH_LZO="${WITH_LZO}" \
-DUSE_LIBCPP="${USE_LIBCPP}" \
-DBUILD_META_TOOL="${BUILD_META_TOOL}" \
-DBUILD_INDEX_TOOL="${BUILD_INDEX_TOOL}" \
-DSTRIP_DEBUG_INFO="${STRIP_DEBUG_INFO}" \
-DUSE_DWARF="${USE_DWARF}" \
-DUSE_UNWIND="${USE_UNWIND}" \
Expand Down Expand Up @@ -642,6 +654,10 @@ EOF
cp -r -p "${DORIS_HOME}/be/output/lib/meta_tool" "${DORIS_OUTPUT}/be/lib"/
fi

if [[ "${BUILD_INDEX_TOOL}" = "ON" ]]; then
cp -r -p "${DORIS_HOME}/be/output/lib/index_tool" "${DORIS_OUTPUT}/be/lib"/
fi

cp -r -p "${DORIS_HOME}/webroot/be"/* "${DORIS_OUTPUT}/be/www"/
if [[ "${STRIP_DEBUG_INFO}" = "ON" ]]; then
cp -r -p "${DORIS_HOME}/be/output/lib/debug_info" "${DORIS_OUTPUT}/be/lib"/
Expand Down

0 comments on commit 8371171

Please sign in to comment.