enhance: rename tokenizer to analyzer and check analyzer params (#37478)
Related: #35853

---------

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
aoiasd authored Nov 10, 2024
1 parent ff00a12 commit 12951f0
Showing 40 changed files with 514 additions and 488 deletions.
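The rename runs through every layer shown here: the per-field flag `enable_tokenizer` becomes `enable_analyzer`, the type-param key `tokenizer_params` becomes `analyzer_params`, and every C++ and Rust signature that forwards the params is renamed to match (the registered tokenizer name, "milvus_tokenizer", is unchanged). As a sketch of what a text field's type params look like after this commit; the key names come from this diff, while the JSON body is an assumption modeled on jieba_demo.cpp below:

    // Hedged sketch: type params for a match-enabled string field.
    std::map<std::string, std::string> type_params = {
        {"enable_match", "true"},
        {"enable_analyzer", "true"},                       // was "enable_tokenizer"
        {"analyzer_params", R"({"tokenizer": "jieba"})"},  // was "tokenizer_params"
    };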
14 changes: 7 additions & 7 deletions internal/core/src/common/FieldMeta.cpp
@@ -20,7 +20,7 @@
 namespace milvus {
 TokenizerParams
 ParseTokenizerParams(const TypeParams& params) {
-    auto iter = params.find("tokenizer_params");
+    auto iter = params.find("analyzer_params");
     if (iter == params.end()) {
         return "{}";
     }
@@ -39,19 +39,19 @@ FieldMeta::enable_match() const {
 }
 
 bool
-FieldMeta::enable_tokenizer() const {
+FieldMeta::enable_analyzer() const {
     if (!IsStringDataType(type_)) {
         return false;
     }
     if (!string_info_.has_value()) {
         return false;
     }
-    return string_info_->enable_tokenizer;
+    return string_info_->enable_analyzer;
 }
 
 TokenizerParams
-FieldMeta::get_tokenizer_params() const {
-    Assert(enable_tokenizer());
+FieldMeta::get_analyzer_params() const {
+    Assert(enable_analyzer());
     auto params = string_info_->params;
     return ParseTokenizerParams(params);
 }
@@ -109,7 +109,7 @@ FieldMeta::ParseFrom(const milvus::proto::schema::FieldSchema& schema_proto) {
         return b;
     };
 
-    bool enable_tokenizer = get_bool_value("enable_tokenizer");
+    bool enable_analyzer = get_bool_value("enable_analyzer");
     bool enable_match = get_bool_value("enable_match");
 
     return FieldMeta{name,
@@ -118,7 +118,7 @@ FieldMeta::ParseFrom(const milvus::proto::schema::FieldSchema& schema_proto) {
                      max_len,
                      nullable,
                      enable_match,
-                     enable_tokenizer,
+                     enable_analyzer,
                      type_map};
 }
 
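The helper keeps its old name but now reads the renamed key, falling back to "{}" (an empty JSON object, i.e. the default analyzer) when a field carries no analyzer params. A self-contained sketch of that lookup; `TypeParams` and `TokenizerParams` are spelled out as plain strings here, matching how FieldMeta.cpp uses them, and the final return is an assumption since the hunk above cuts off:

    #include <map>
    #include <string>

    // Stand-ins for the Milvus typedefs used above.
    using TypeParams = std::map<std::string, std::string>;
    using TokenizerParams = std::string;

    TokenizerParams
    ParseAnalyzerParams(const TypeParams& params) {
        auto iter = params.find("analyzer_params");
        if (iter == params.end()) {
            return "{}";  // no params configured: default analyzer
        }
        return iter->second;  // assumed tail, not visible in the hunk
    }

Note that `get_analyzer_params()` asserts `enable_analyzer()` first, so the parse is only reached for fields that opted in.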
10 changes: 5 additions & 5 deletions internal/core/src/common/FieldMeta.h
@@ -64,13 +64,13 @@ class FieldMeta {
               int64_t max_length,
               bool nullable,
               bool enable_match,
-              bool enable_tokenizer,
+              bool enable_analyzer,
               std::map<std::string, std::string>& params)
         : name_(name),
           id_(id),
           type_(type),
           string_info_(StringInfo{
-              max_length, enable_match, enable_tokenizer, std::move(params)}),
+              max_length, enable_match, enable_analyzer, std::move(params)}),
           nullable_(nullable) {
         Assert(IsStringDataType(type_));
     }
@@ -125,10 +125,10 @@ class FieldMeta {
     enable_match() const;
 
     bool
-    enable_tokenizer() const;
+    enable_analyzer() const;
 
     TokenizerParams
-    get_tokenizer_params() const;
+    get_analyzer_params() const;
 
     std::optional<knowhere::MetricType>
     get_metric_type() const {
@@ -203,7 +203,7 @@ class FieldMeta {
     struct StringInfo {
         int64_t max_length;
         bool enable_match;
-        bool enable_tokenizer;
+        bool enable_analyzer;
         std::map<std::string, std::string> params;
     };
     FieldName name_;
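Taken together, the constructor and `StringInfo` show the full set of per-string-field knobs. A hypothetical construction with the renamed flag; the field name, id, and literal values are made up, and `DataType::VARCHAR` is assumed to be a string type that satisfies `IsStringDataType`:

    std::map<std::string, std::string> params{
        {"analyzer_params", R"({"tokenizer": "jieba"})"}};
    FieldMeta meta(FieldName("text"),
                   FieldId(101),
                   DataType::VARCHAR,
                   /*max_length=*/256,
                   /*nullable=*/false,
                   /*enable_match=*/true,
                   /*enable_analyzer=*/true,
                   params);
    // meta.enable_analyzer() is now true and meta.get_analyzer_params()
    // returns the JSON string above.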
4 changes: 2 additions & 2 deletions internal/core/src/common/Schema.h
@@ -121,15 +121,15 @@ class Schema {
              int64_t max_length,
              bool nullable,
              bool enable_match,
-             bool enable_tokenizer,
+             bool enable_analyzer,
              std::map<std::string, std::string>& params) {
         auto field_meta = FieldMeta(name,
                                     id,
                                     data_type,
                                     max_length,
                                     nullable,
                                     enable_match,
-                                    enable_tokenizer,
+                                    enable_analyzer,
                                     params);
         this->AddField(std::move(field_meta));
     }
16 changes: 8 additions & 8 deletions internal/core/src/index/TextMatchIndex.cpp
@@ -21,18 +21,18 @@ constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/";
 
 TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms,
                                const char* tokenizer_name,
-                               const char* tokenizer_params)
+                               const char* analyzer_params)
     : commit_interval_in_ms_(commit_interval_in_ms),
       last_commit_time_(stdclock::now()) {
     d_type_ = TantivyDataType::Text;
     std::string field_name = "tmp_text_index";
     wrapper_ = std::make_shared<TantivyIndexWrapper>(
-        field_name.c_str(), true, "", tokenizer_name, tokenizer_params);
+        field_name.c_str(), true, "", tokenizer_name, analyzer_params);
 }
 
 TextMatchIndex::TextMatchIndex(const std::string& path,
                                const char* tokenizer_name,
-                               const char* tokenizer_params)
+                               const char* analyzer_params)
     : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
       last_commit_time_(stdclock::now()) {
     path_ = path;
@@ -42,12 +42,12 @@ TextMatchIndex::TextMatchIndex(const std::string& path,
         false,
         path_.c_str(),
         tokenizer_name,
-        tokenizer_params);
+        analyzer_params);
 }
 
 TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
                                const char* tokenizer_name,
-                               const char* tokenizer_params)
+                               const char* analyzer_params)
     : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
       last_commit_time_(stdclock::now()) {
     schema_ = ctx.fieldDataMeta.field_schema;
@@ -65,7 +65,7 @@ TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
         false,
         path_.c_str(),
         tokenizer_name,
-        tokenizer_params);
+        analyzer_params);
 }
 
 TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx)
@@ -172,8 +172,8 @@ TextMatchIndex::CreateReader() {
 
 void
 TextMatchIndex::RegisterTokenizer(const char* tokenizer_name,
-                                  const char* tokenizer_params) {
-    wrapper_->register_tokenizer(tokenizer_name, tokenizer_params);
+                                  const char* analyzer_params) {
+    wrapper_->register_tokenizer(tokenizer_name, analyzer_params);
 }
 
 TargetBitmap
8 changes: 4 additions & 4 deletions internal/core/src/index/TextMatchIndex.h
@@ -24,15 +24,15 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
     // for growing segment.
     explicit TextMatchIndex(int64_t commit_interval_in_ms,
                             const char* tokenizer_name,
-                            const char* tokenizer_params);
+                            const char* analyzer_params);
     // for sealed segment.
     explicit TextMatchIndex(const std::string& path,
                             const char* tokenizer_name,
-                            const char* tokenizer_params);
+                            const char* analyzer_params);
     // for building index.
     explicit TextMatchIndex(const storage::FileManagerContext& ctx,
                             const char* tokenizer_name,
-                            const char* tokenizer_params);
+                            const char* analyzer_params);
     // for loading index
     explicit TextMatchIndex(const storage::FileManagerContext& ctx);
 
@@ -64,7 +64,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
     CreateReader();
 
     void
-    RegisterTokenizer(const char* tokenizer_name, const char* tokenizer_params);
+    RegisterTokenizer(const char* tokenizer_name, const char* analyzer_params);
 
     TargetBitmap
     MatchQuery(const std::string& query);
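The three analyzer-taking constructors correspond to the comments above: the growing-segment variant commits on an interval, while the sealed and build variants use an effectively infinite interval and commit explicitly. The growing path in SegmentGrowingImpl.cpp further down drives the API like this (a mirror of that hunk rather than a new example; the 200 ms interval and the fixed "milvus_tokenizer" name come from that file):

    auto index = std::make_unique<index::TextMatchIndex>(
        /*commit_interval_in_ms=*/200,
        "milvus_tokenizer",
        field_meta.get_analyzer_params().c_str());
    index->Commit();
    index->CreateReader();
    index->RegisterTokenizer("milvus_tokenizer",
                             field_meta.get_analyzer_params().c_str());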
2 changes: 1 addition & 1 deletion internal/core/src/indexbuilder/index_c.cpp
@@ -284,7 +284,7 @@ BuildTextIndex(CBinarySet* c_binary_set,
     auto index = std::make_unique<index::TextMatchIndex>(
         fileManagerContext,
         "milvus_tokenizer",
-        field_schema.get_tokenizer_params().c_str());
+        field_schema.get_analyzer_params().c_str());
     index->Build(config);
     auto binary =
         std::make_unique<knowhere::BinarySet>(index->Upload(config));
8 changes: 4 additions & 4 deletions internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp
@@ -1516,13 +1516,13 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
         index = std::make_unique<index::TextMatchIndex>(
             std::numeric_limits<int64_t>::max(),
             "milvus_tokenizer",
-            field_meta.get_tokenizer_params().c_str());
+            field_meta.get_analyzer_params().c_str());
     } else {
         // build text index using mmap.
         index = std::make_unique<index::TextMatchIndex>(
             cfg.GetMmapPath(),
             "milvus_tokenizer",
-            field_meta.get_tokenizer_params().c_str());
+            field_meta.get_analyzer_params().c_str());
     }
 
     {
@@ -1572,7 +1572,7 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
     index->Reload();
 
     index->RegisterTokenizer("milvus_tokenizer",
-                             field_meta.get_tokenizer_params().c_str());
+                             field_meta.get_analyzer_params().c_str());
 
     text_indexes_[field_id] = std::move(index);
 }
@@ -1583,7 +1583,7 @@ ChunkedSegmentSealedImpl::LoadTextIndex(
     std::unique_lock lck(mutex_);
     const auto& field_meta = schema_->operator[](field_id);
     index->RegisterTokenizer("milvus_tokenizer",
-                             field_meta.get_tokenizer_params().c_str());
+                             field_meta.get_analyzer_params().c_str());
     text_indexes_[field_id] = std::move(index);
 }
 
4 changes: 2 additions & 2 deletions internal/core/src/segcore/SegmentGrowingImpl.cpp
@@ -859,11 +859,11 @@ SegmentGrowingImpl::CreateTextIndex(FieldId field_id) {
                "cannot create text index on non-string type");
     // todo: make this(200) configurable.
     auto index = std::make_unique<index::TextMatchIndex>(
-        200, "milvus_tokenizer", field_meta.get_tokenizer_params().c_str());
+        200, "milvus_tokenizer", field_meta.get_analyzer_params().c_str());
     index->Commit();
     index->CreateReader();
     index->RegisterTokenizer("milvus_tokenizer",
-                             field_meta.get_tokenizer_params().c_str());
+                             field_meta.get_analyzer_params().c_str());
     text_indexes_[field_id] = std::move(index);
 }
 
8 changes: 4 additions & 4 deletions internal/core/src/segcore/SegmentSealedImpl.cpp
@@ -2014,13 +2014,13 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
         index = std::make_unique<index::TextMatchIndex>(
             std::numeric_limits<int64_t>::max(),
             "milvus_tokenizer",
-            field_meta.get_tokenizer_params().c_str());
+            field_meta.get_analyzer_params().c_str());
     } else {
         // build text index using mmap.
         index = std::make_unique<index::TextMatchIndex>(
             cfg.GetMmapPath(),
             "milvus_tokenizer",
-            field_meta.get_tokenizer_params().c_str());
+            field_meta.get_analyzer_params().c_str());
     }
 
     {
@@ -2069,7 +2069,7 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
     index->Reload();
 
     index->RegisterTokenizer("milvus_tokenizer",
-                             field_meta.get_tokenizer_params().c_str());
+                             field_meta.get_analyzer_params().c_str());
 
     text_indexes_[field_id] = std::move(index);
 }
@@ -2080,7 +2080,7 @@ SegmentSealedImpl::LoadTextIndex(FieldId field_id,
     std::unique_lock lck(mutex_);
     const auto& field_meta = schema_->operator[](field_id);
     index->RegisterTokenizer("milvus_tokenizer",
-                             field_meta.get_tokenizer_params().c_str());
+                             field_meta.get_analyzer_params().c_str());
     text_indexes_[field_id] = std::move(index);
 }
 
8 changes: 4 additions & 4 deletions internal/core/thirdparty/tantivy/jieba_demo.cpp
@@ -16,11 +16,11 @@ to_set(const RustArrayWrapper& w) {
 int
 main(int argc, char* argv[]) {
     std::string tokenizer_name = "jieba";
-    std::map<std::string, std::string> tokenizer_params;
-    tokenizer_params["tokenizer"] = tokenizer_name;
+    std::map<std::string, std::string> analyzer_params;
+    analyzer_params["tokenizer"] = tokenizer_name;
 
     auto text_index = TantivyIndexWrapper(
-        "text_demo", true, "", tokenizer_name.c_str(), tokenizer_params);
+        "text_demo", true, "", tokenizer_name.c_str(), analyzer_params);
     auto write_single_text = [&text_index](const std::string& s,
                                            int64_t offset) {
         text_index.add_data(&s, 1, offset);
@@ -38,7 +38,7 @@ main(int argc, char* argv[]) {
     }
 
     text_index.create_reader();
-    text_index.register_tokenizer(tokenizer_name.c_str(), tokenizer_params);
+    text_index.register_tokenizer(tokenizer_name.c_str(), analyzer_params);
 
     {
         auto result = to_set(text_index.match_query("北京"));
@@ -88,9 +88,7 @@ RustArray tantivy_regex_query(void *ptr, const char *pattern);
 
 RustArray tantivy_match_query(void *ptr, const char *query);
 
-void tantivy_register_tokenizer(void *ptr,
-                                const char *tokenizer_name,
-                                const char *tokenizer_params);
+void tantivy_register_tokenizer(void *ptr, const char *tokenizer_name, const char *analyzer_params);
 
 void *tantivy_create_index(const char *field_name,
                            TantivyDataType data_type,
@@ -144,7 +142,7 @@ void tantivy_index_add_multi_keywords(void *ptr,
 void *tantivy_create_text_writer(const char *field_name,
                                  const char *path,
                                  const char *tokenizer_name,
-                                 const char *tokenizer_params,
+                                 const char *analyzer_params,
                                  uintptr_t num_threads,
                                  uintptr_t overall_memory_budget_in_bytes,
                                  bool in_ram);
@@ -159,7 +157,7 @@ bool tantivy_token_stream_advance(void *token_stream);
 
 const char *tantivy_token_stream_get_token(void *token_stream);
 
-void *tantivy_create_tokenizer(const char *tokenizer_params);
+void *tantivy_create_tokenizer(const char *analyzer_params);
 
 void *tantivy_clone_tokenizer(void *ptr);
 
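Only the parameter name changes at the C boundary; analyzer params are still passed as a JSON string. A hedged sketch of creating a standalone analyzer through this API; the JSON body mirrors jieba_demo.cpp above, and the null-return-on-bad-params behavior is an assumption, since the Err arms of the Rust shims are cut off below:

    const char* analyzer_params = "{\"tokenizer\": \"jieba\"}";
    void* analyzer = tantivy_create_tokenizer(analyzer_params);
    if (analyzer == nullptr) {
        // assumed failure path: malformed analyzer_params rejected
        // at creation time rather than at first use
    }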
@@ -24,12 +24,12 @@ pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) ->
 pub extern "C" fn tantivy_register_tokenizer(
     ptr: *mut c_void,
     tokenizer_name: *const c_char,
-    tokenizer_params: *const c_char,
+    analyzer_params: *const c_char,
 ) {
     init_log();
     let real = ptr as *mut IndexReaderWrapper;
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
-    let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
+    let params = unsafe{c_str_to_str(analyzer_params).to_string()};
     let analyzer = create_tokenizer(&params);
     match analyzer {
         Ok(text_analyzer) => unsafe {
@@ -13,7 +13,7 @@ pub extern "C" fn tantivy_create_text_writer(
     field_name: *const c_char,
     path: *const c_char,
     tokenizer_name: *const c_char,
-    tokenizer_params: *const c_char,
+    analyzer_params: *const c_char,
     num_threads: usize,
     overall_memory_budget_in_bytes: usize,
     in_ram: bool,
@@ -22,7 +22,7 @@ pub extern "C" fn tantivy_create_text_writer(
     let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() };
     let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
     let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
-    let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
+    let params = unsafe{c_str_to_str(analyzer_params).to_string()};
     let analyzer = create_tokenizer(&params);
     match analyzer {
         Ok(text_analyzer) => {
@@ -9,9 +9,9 @@ use crate::{
 };
 
 #[no_mangle]
-pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void {
+pub extern "C" fn tantivy_create_tokenizer(analyzer_params: *const c_char) -> *mut c_void {
     init_log();
-    let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
+    let params = unsafe{c_str_to_str(analyzer_params).to_string()};
     let analyzer = create_tokenizer(&params);
     match analyzer {
         Ok(text_analyzer) => create_binding(text_analyzer),
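All three shims funnel the renamed parameter into the same `create_tokenizer(&params)` call and match on its `Result`, so the reader, writer, and standalone-tokenizer paths share a single point where malformed analyzer params are rejected; that shared check is the "check analyzer params" half of this change as it is visible in these hunks.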