From a3627170e102e90aa9425d15fcdbe8200da51678 Mon Sep 17 00:00:00 2001 From: Zhichang Yu Date: Fri, 21 Jun 2024 11:44:13 +0800 Subject: [PATCH] Fix phrase bug (#1362) Fix phrase query crash, added fusion doc, added fulltext example. - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Test cases Issue:#1352 --- docs/getstarted/build_from_source.md | 2 +- docs/getstarted/quickstart.md | 86 ++++++++++++++++--- docs/references/pysdk_api_reference.md | 17 +++- python/hello_infinity.py | 65 +++++++------- .../format/position_list_encoder.cpp | 2 +- .../invertedindex/format/skiplist_reader.cppm | 5 +- .../invertedindex/posting_iterator.cpp | 9 +- .../invertedindex/posting_iterator.cppm | 9 ++ .../search/phrase_doc_iterator.cpp | 4 +- .../invertedindex/search/search_driver.cpp | 23 +++-- .../common/analyzer/chinese_analyzer.cpp | 22 ++--- .../common/analyzer/standard_analyzer.cpp | 21 +++++ .../storage/invertedindex/memory_indexer.cpp | 60 +++++++++++-- .../invertedindex/search/query_match.cpp | 62 ++++++++----- 14 files changed, 283 insertions(+), 104 deletions(-) diff --git a/docs/getstarted/build_from_source.md b/docs/getstarted/build_from_source.md index 0ac501b7eb..771fbaf474 100644 --- a/docs/getstarted/build_from_source.md +++ b/docs/getstarted/build_from_source.md @@ -47,7 +47,7 @@ git clone https://github.com/infiniflow/infinity.git ```shell cd infinity && mkdir cmake-build-debug TZ=$(readlink -f /etc/localtime | awk -F '/zoneinfo/' '{print $2}') -docker run -d --name infinity_build --network=host -e TZ=$TZ -v $PWD:/infinity infiniflow/infinity_builder:centos7 +docker run -d --privileged --name infinity_build -e TZ=$TZ -v $PWD:/infinity -v /boot:/boot infiniflow/infinity_builder:centos7 docker exec infinity_build bash -c "cd /infinity/cmake-build-debug && cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_VERBOSE_MAKEFILE=ON .. && cmake --build ." 
``` diff --git a/docs/getstarted/quickstart.md b/docs/getstarted/quickstart.md index fad2826963..c29ee82378 100644 --- a/docs/getstarted/quickstart.md +++ b/docs/getstarted/quickstart.md @@ -61,36 +61,98 @@ from infinity.common import ConflictType ## Connect to the remote server ```python -infinity_obj = infinity.connect(REMOTE_HOST) + infinity_obj = infinity.connect(REMOTE_HOST) ``` ## Get a database ```python -db = infinity_obj.get_database("default_db") + db = infinity_obj.get_database("default_db") ``` ## Create a table ```python -# Drop my_table if it already exists -db.drop_table("my_table", ConflictType.Ignore) -# Create a table named "my_table" -table = db.create_table("my_table", {"num": {"type": "integer"}, "body": {"type": "varchar"}, "vec": {"type": "vector, 4, float"}}) + # Drop my_table if it already exists + db.drop_table("my_table", ConflictType.Ignore) + # Create a table named "my_table" + table = db.create_table( + "my_table", + { + "num": {"type": "integer"}, + "body": {"type": "varchar"}, + "vec": {"type": "vector, 4, float"}, + }, + ) ``` -## Insert two records +## Insert some records ```python -table.insert([{"num": 1, "body": "unnecessary and harmful", "vec": [1.0, 1.2, 0.8, 0.9]}]) -table.insert([{"num": 2, "body": "Office for Harmful Blooms", "vec": [4.0, 4.2, 4.3, 4.5]}]) + table.insert( + [ + { + "num": 1, + "body": r"unnecessary and harmful", + "vec": [1.0, 1.2, 0.8, 0.9], + }, + { + "num": 2, + "body": r"Office for Harmful Blooms", + "vec": [4.0, 4.2, 4.3, 4.5], + }, + { + "num": 2, + "body": r"A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.", + "vec": [4.0, 4.2, 4.3, 4.5], + }, + ] + ) ``` ## Execute a vector search ```python -res = table.output(["*"]).knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2).to_pl() -print(res) + res = ( + table.output(["num", "body"]) + .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2) 
+ .to_pl() + ) + print(res) ``` -> 💡 For more information about the Python API, see the [Python API Reference](../references/pysdk_api_reference.md). \ No newline at end of file +## Execute a fulltext search + +```python + questions = [ + r"blooms", # single term + r"Bloom filter", # OR multiple terms + r'"Bloom filter"', # phrase is surrounded by double-quotes + r'space\-efficient', # escape reserved characters, equivalent to: `space efficient` + r'"space\-efficient"', # phrase and escape reserved character, equivalent to: `"space efficient"` + ] + for question in questions: + qb_result = ( + table.output(["num", "body", "_score"]) + .match("body", question, "topn=10") + .to_pl() + ) + print(f"question: {question}") + print(qb_result) +``` + +## Execute a fusion search + +```python + qb_result2 = ( + table.output(["num", "body"]) + .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 3) + .match("body", "blooms", "topn=1") + .fusion("rrf") + .to_pl() + ) + print(qb_result2) +``` + +> 💡 For more information about the Python API, see the [Python API Reference](../references/pysdk_api_reference.md). +> 💡 For a complete example, see the [hello_infinity.py](../../python/hello_infinity.py). diff --git a/docs/references/pysdk_api_reference.md b/docs/references/pysdk_api_reference.md index a3cee513f2..72d61f6643 100644 --- a/docs/references/pysdk_api_reference.md +++ b/docs/references/pysdk_api_reference.md @@ -717,7 +717,20 @@ Build a fusion expression. ### Parameters - **method : str** -- **method : options_text** + The supported methods are: rrf, weighted_sum, match_tensor +- **options_text : str** + + Common options: + + - 'topn=10': Retrieve the 10 most relevant records. The default value is `100`. + + Dedicated options of rrf: + + - 'rank_constant=30': The default value is `60`. + + Dedicated options of weighted_sum: + + - 'weights=1,2,0.5': The weights of children scorers. The default value of each weight is `1.0`. 
### Returns @@ -728,6 +741,8 @@ Build a fusion expression. ```python table_obj.fusion('rrf') +table_obj.fusion('rrf', 'topn=10') +table_obj.fusion('weighted_sum', 'weights=1,2,0.5') ``` ### Details diff --git a/python/hello_infinity.py b/python/hello_infinity.py index 623fb5392f..a3103fdd01 100644 --- a/python/hello_infinity.py +++ b/python/hello_infinity.py @@ -17,7 +17,7 @@ # remove local path, use the installed infinity sdk current_path = os.path.abspath(os.path.dirname(__file__)) -local_infinity_path = os.path.join(current_path, 'infinity') +local_infinity_path = os.path.join(current_path, "infinity") if local_infinity_path in sys.path: sys.path.remove(local_infinity_path) if current_path in sys.path: @@ -48,17 +48,17 @@ def test_english(): [ { "num": 1, - "body": "unnecessary and harmful", + "body": r"unnecessary and harmful", "vec": [1.0, 1.2, 0.8, 0.9], }, { "num": 2, - "body": "Office for Harmful Blooms", + "body": r"Office for Harmful Blooms", "vec": [4.0, 4.2, 4.3, 4.5], }, { - "num": 2, - "body": "A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.", + "num": 3, + "body": r"A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.", "vec": [4.0, 4.2, 4.3, 4.5], }, ] @@ -96,9 +96,13 @@ def test_english(): print("------fulltext-------") questions = [ - "blooms", - "Bloom filter", - '"Bloom filter"', + r"blooms", # single term + r"Bloom filter", # OR multiple terms + r'"Bloom filter"', # phrase: adjacent multiple terms + r"space efficient", # OR multiple terms + # r"space-efficient", # Error 3013: Invalid query statement: OrQueryNode should not have both not child and non-not child + r"space\-efficient", # Escape reserved character '-', equivalent to: `space efficient` + r'"space\-efficient"', # phrase and escape reserved 
character, equivalent to: `"space efficient"` ] for question in questions: qb_result = ( @@ -147,42 +151,42 @@ def test_chinese(): [ { "num": 1, - "body": "据Wccftech报道,苹果正在开发一种定制芯片,旨在为人工智能(AI)服务器提供动力。暂时还不清楚这款芯片的具体规格,以及具体的实现目标。传闻苹果已选择台积电(TSMC)的3nm制程节点来制造这款芯片,预计2025年下半年量产。如果按照量产时间和台积电的半导体工艺进度,那么对应的很可能是N3E工艺。", + "body": r"据Wccftech报道,苹果正在开发一种定制芯片,旨在为人工智能(AI)服务器提供动力。暂时还不清楚这款芯片的具体规格,以及具体的实现目标。传闻苹果已选择台积电(TSMC)的3nm制程节点来制造这款芯片,预计2025年下半年量产。如果按照量产时间和台积电的半导体工艺进度,那么对应的很可能是N3E工艺。", "vec": [1.0, 1.2, 0.8, 0.9], }, { "num": 2, - "body": "两个月前有报道称,苹果已正式放弃了努力超过十年、投下海量资金的“泰坦计划(Project Titan)”电动车项目。苹果随后解散了大约2000人的开发团队,各人会被分配到其他地方,其中一个很重要的去处就是人工智能部门。有传言称,苹果已经将注意力转向生成式AI,希望能够为业务找到新的增长动力。", + "body": r"两个月前有报道称,苹果已正式放弃了努力超过十年、投下海量资金的“泰坦计划(Project Titan)”电动车项目。苹果随后解散了大约2000人的开发团队,各人会被分配到其他地方,其中一个很重要的去处就是人工智能部门。有传言称,苹果已经将注意力转向生成式AI,希望能够为业务找到新的增长动力。", "vec": [1.2, 0.8, 0.9, 1.0], }, { "num": 3, - "body": "爬行动物是一类包括蛇、蜥蜴、龟鳖和鳄鱼等的脊椎动物。它们的特点是体表覆盖着角质鳞片,而不是羽毛。羽毛是鸟类的特征,鸟类是从爬行动物中的某一类演化而来的,但它们已经发展出了独特的特征,包括羽毛。因此,爬行动物没有羽毛。", + "body": r"爬行动物是一类包括蛇、蜥蜴、龟鳖和鳄鱼等的脊椎动物。它们的特点是体表覆盖着角质鳞片,而不是羽毛。羽毛是鸟类的特征,鸟类是从爬行动物中的某一类演化而来的,但它们已经发展出了独特的特征,包括羽毛。因此,爬行动物没有羽毛。", "vec": [0.8, 0.9, 1.0, 1.2], }, { "num": 4, - "body": "会徽整体形似运动中的羽毛球,球头绑带部分演化为“城墙”的图形元素,极具南京的地域特征,凸显出举办地的历史底蕴和人文气息。尾羽部分图形则巧妙融入了举办年份“2018”和南京的首字母“NJ”,结合中国传统书法笔触的表现形式,传递出羽毛球运动的速度感。会徽红黑配色鲜艳明快,契合了体育运动的活力与朝气[3]2018年世界羽毛球锦标赛吉祥物南京羽毛球世锦赛吉祥物2018年道达尔羽毛球世锦赛吉祥物在南京发布。造型简洁、形态生动、富有亲和力的“羽宝”拔得头筹,成为2018年世界羽毛球锦标赛吉祥物。比赛将于7月30日在宁举行,赛程7天,预计近340名顶尖运动员参赛。吉祥物“羽宝”头部由羽毛球外形变化而来,手持球拍,拟人化的设计再现了羽毛球运动员比赛时的接击球动作,胸前佩戴的梅花造型的金牌,代表着在南京举办的世锦赛将向世界献上精彩的羽毛球盛宴。同时黄蓝两色为主色调,在视觉冲击中体现了羽毛球运动动静转换的速度感和竞技魅力[6]2018年世界羽毛球锦标赛抽签结果7月17日,2018年南京羽毛球世锦赛抽签出炉。男单中国获得满额席位,石宇奇、谌龙、林丹和黄宇翔全部被分到了上半区。", + "body": 
r"会徽整体形似运动中的羽毛球,球头绑带部分演化为“城墙”的图形元素,极具南京的地域特征,凸显出举办地的历史底蕴和人文气息。尾羽部分图形则巧妙融入了举办年份“2018”和南京的首字母“NJ”,结合中国传统书法笔触的表现形式,传递出羽毛球运动的速度感。会徽红黑配色鲜艳明快,契合了体育运动的活力与朝气[3]2018年世界羽毛球锦标赛吉祥物南京羽毛球世锦赛吉祥物2018年道达尔羽毛球世锦赛吉祥物在南京发布。造型简洁、形态生动、富有亲和力的“羽宝”拔得头筹,成为2018年世界羽毛球锦标赛吉祥物。比赛将于7月30日在宁举行,赛程7天,预计近340名顶尖运动员参赛。吉祥物“羽宝”头部由羽毛球外形变化而来,手持球拍,拟人化的设计再现了羽毛球运动员比赛时的接击球动作,胸前佩戴的梅花造型的金牌,代表着在南京举办的世锦赛将向世界献上精彩的羽毛球盛宴。同时黄蓝两色为主色调,在视觉冲击中体现了羽毛球运动动静转换的速度感和竞技魅力[6]2018年世界羽毛球锦标赛抽签结果7月17日,2018年南京羽毛球世锦赛抽签出炉。男单中国获得满额席位,石宇奇、谌龙、林丹和黄宇翔全部被分到了上半区。", "vec": [0.9, 1.0, 1.2, 0.8], }, { "num": 5, - "body": "周末我和朋友一起去“电子城”,想挑选一些新的“电脑配件”。那里有各种各样的“hardware”,如“motherboard”、“graphics card”等。我们还看到了一些很“awesome”的“peripheral devices”,像“keyboard”和“mouse”。我朋友说他需要一个新的“power supply”,而我则对那些“high-tech”的“storage devices”比较感兴趣。逛了一会儿后,我们都买到了自己心仪的东西,然后就“happily”回家了。", + "body": r"周末我和朋友一起去“电子城”,想挑选一些新的“电脑配件”。那里有各种各样的“hardware”,如“motherboard”、“graphics card”等。我们还看到了一些很“awesome”的“peripheral devices”,像“keyboard”和“mouse”。我朋友说他需要一个新的“power supply”,而我则对那些“high-tech”的“storage devices”比较感兴趣。逛了一会儿后,我们都买到了自己心仪的东西,然后就“happily”回家了。", "vec": [1.0, 0.9, 0.8, 0.9], }, { "num": 6, - "body": "便携式计算机 Model Name型号:ThinkBook 16 G5+ ARP 输入电压/电流:20V=5A CMIT ID:2023AP123456 MO: DS-K3AJ303/Dm140", + "body": r"便携式计算机 Model Name型号:ThinkBook 16 G5+ ARP 输入电压/电流:20V=5A CMIT ID:2023AP123456 MO: DS-K3AJ303/Dm140", "vec": [0.9, 0.8, 0.9, 1.0], }, { "num": 7, - "body": "Office for Harmful Blooms", + "body": r"Office for Harmful Blooms", "vec": [4.0, 4.2, 4.3, 4.5], }, { "num": 8, - "body": "A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.", + "body": r"A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.", "vec": [4.0, 4.2, 4.3, 4.5], }, ] @@ -224,19 +228,20 @@ def test_chinese(): print("------fulltext-------") questions = [ - "芯片", - "苹果", - "羽毛", - 
"羽毛球", - '"羽毛球锦标赛"', - "2018年世界羽毛球锦标赛在哪个城市举办?", - "hi\-tech", - '"hi-tech"', - "graphics card", - '"graphics card"', - '"DS-K3AJ303/Dm140"', - "Bloom filter", - '"Bloom filter"', + r"芯片", # single term + r"苹果", # single term + r"羽毛", # single term + r"羽毛球", # single term + r'"羽毛球锦标赛"', # phrase: adjacent multiple terms + r"2018年世界羽毛球锦标赛在哪个城市举办?", # OR multiple terms + r"high\-tech", # Escape reserved character '-' + r'"high tech"', # phrase: adjacent multiple terms + r'"high-tech"', # phrase: adjacent multiple terms + r"graphics card", # OR multiple terms + r'"graphics card"', # phrase: adjacent multiple terms + r'"DS-K3AJ303/Dm140"', # phrase: adjacent multiple terms + r"Bloom filter", # OR multiple terms + r'"Bloom filter"', # phrase: adjacent multiple terms ] for question in questions: qb_result = ( @@ -264,5 +269,5 @@ def test_chinese(): if __name__ == "__main__": - test_english() + # test_english() test_chinese() diff --git a/src/storage/invertedindex/format/position_list_encoder.cpp b/src/storage/invertedindex/format/position_list_encoder.cpp index b3dd618fd7..8b87d51ab1 100644 --- a/src/storage/invertedindex/format/position_list_encoder.cpp +++ b/src/storage/invertedindex/format/position_list_encoder.cpp @@ -124,7 +124,7 @@ InMemPositionListDecoder *PositionListEncoder::GetInMemPositionListDecoder() con SkipListReaderPostingByteSlice *in_mem_skiplist_reader = nullptr; if (pos_skiplist_writer_.get()) { // not support tf bitmap in realtime segment - in_mem_skiplist_reader = new SkipListReaderPostingByteSlice(format_option_.GetDocListFormatOption()); + in_mem_skiplist_reader = new SkipListReaderPostingByteSlice(format_option_.GetPosListFormatOption()); in_mem_skiplist_reader->Load(pos_skiplist_writer_.get()); } PostingByteSlice *posting_buffer = new PostingByteSlice(); diff --git a/src/storage/invertedindex/format/skiplist_reader.cppm b/src/storage/invertedindex/format/skiplist_reader.cppm index 463521bbf2..3d5eed2f54 100644 --- 
a/src/storage/invertedindex/format/skiplist_reader.cppm +++ b/src/storage/invertedindex/format/skiplist_reader.cppm @@ -120,8 +120,9 @@ protected: export class SkipListReaderPostingByteSlice final : public SkipListReader { public: - SkipListReaderPostingByteSlice(const DocListFormatOption &doc_list_format_option) - : SkipListReader(doc_list_format_option) {} + explicit SkipListReaderPostingByteSlice(const DocListFormatOption &doc_list_format_option) : SkipListReader(doc_list_format_option) {} + + explicit SkipListReaderPostingByteSlice(const PositionListFormatOption &pos_list_format_option) : SkipListReader(pos_list_format_option) {} ~SkipListReaderPostingByteSlice() override; diff --git a/src/storage/invertedindex/posting_iterator.cpp b/src/storage/invertedindex/posting_iterator.cpp index 541bb0ce58..0acdc3efd8 100644 --- a/src/storage/invertedindex/posting_iterator.cpp +++ b/src/storage/invertedindex/posting_iterator.cpp @@ -64,11 +64,12 @@ bool PostingIterator::SkipTo(RowID doc_id) { Pair PostingIterator::GetBlockMaxInfo() const { return posting_decoder_->GetBlockMaxInfo(); } RowID PostingIterator::SeekDoc(RowID row_id) { - RowID current_row_id = finish_decode_docid_ ? current_row_id_ : INVALID_ROWID; - if (row_id == current_row_id) [[unlikely]] { - return current_row_id; + if (segment_postings_.get() == nullptr || segment_postings_->empty()) [[unlikely]] { + current_row_id_ = INVALID_ROWID; + return INVALID_ROWID; } - if (current_row_id != INVALID_ROWID and row_id < current_row_id) { + RowID current_row_id = finish_decode_docid_ ? 
current_row_id_ : INVALID_ROWID; + if (current_row_id != INVALID_ROWID and row_id <= current_row_id) [[unlikely]] { return current_row_id; } assert(row_id > current_row_id or current_row_id == INVALID_ROWID); diff --git a/src/storage/invertedindex/posting_iterator.cppm b/src/storage/invertedindex/posting_iterator.cppm index 64400e5714..7ede9bff24 100644 --- a/src/storage/invertedindex/posting_iterator.cppm +++ b/src/storage/invertedindex/posting_iterator.cppm @@ -50,6 +50,9 @@ public: void SeekPosition(pos_t pos, pos_t &result); docpayload_t GetCurrentDocPayload() { + if (current_row_id_ == INVALID_ROWID) [[unlikely]] { + return 0; + } if (posting_option_.HasDocPayload()) { DecodeTFBuffer(); DecodeDocPayloadBuffer(); @@ -59,6 +62,9 @@ public: } tf_t GetCurrentTF() { + if (current_row_id_ == INVALID_ROWID) [[unlikely]] { + return 0; + } if (posting_option_.HasTfList()) { DecodeTFBuffer(); return tf_buffer_[GetDocOffsetInBuffer()]; @@ -67,6 +73,9 @@ public: } ttf_t GetCurrentTTF() { + if (current_row_id_ == INVALID_ROWID) [[unlikely]] { + return 0; + } if (posting_option_.HasTfList()) { DecodeTFBuffer(); i32 offset = GetDocOffsetInBuffer(); diff --git a/src/storage/invertedindex/search/phrase_doc_iterator.cpp b/src/storage/invertedindex/search/phrase_doc_iterator.cpp index da878e479b..5e5241b570 100644 --- a/src/storage/invertedindex/search/phrase_doc_iterator.cpp +++ b/src/storage/invertedindex/search/phrase_doc_iterator.cpp @@ -47,9 +47,9 @@ namespace infinity { os << "PhraseDocIterator"; os << " (weight: " << weight_ << ")"; os << " (column: " << *column_name_ptr_ << ")"; - os << " (phrase: "; + os << " (phrase:"; for (auto term : *terms_ptr_) { - os << term << " "; + os << " " << term; } os << ")"; os << " (doc_freq: " << GetDF() << ")"; diff --git a/src/storage/invertedindex/search/search_driver.cpp b/src/storage/invertedindex/search/search_driver.cpp index 3b8676f0b5..98390f2839 100644 --- a/src/storage/invertedindex/search/search_driver.cpp +++ 
b/src/storage/invertedindex/search/search_driver.cpp @@ -199,14 +199,23 @@ std::unique_ptr SearchDriver::AnalyzeAndBuildQueryNode(const std::str result->column_ = field; return result; } else { - auto result = std::make_unique(); - for (auto &term : terms) { - auto subquery = std::make_unique(); - subquery->term_ = std::move(term.text_); - subquery->column_ = field; - result->Add(std::move(subquery)); + if (from_quoted) { + auto result = std::make_unique(); + for (auto term : terms) { + result->AddTerm(term.Text()); + } + result->column_ = field; + return result; + } else { + auto result = std::make_unique(); + for (auto &term : terms) { + auto subquery = std::make_unique(); + subquery->term_ = std::move(term.text_); + subquery->column_ = field; + result->Add(std::move(subquery)); + } + return result; } - return result; } } diff --git a/src/unit_test/common/analyzer/chinese_analyzer.cpp b/src/unit_test/common/analyzer/chinese_analyzer.cpp index 3df76f45c1..7e01a5db8d 100644 --- a/src/unit_test/common/analyzer/chinese_analyzer.cpp +++ b/src/unit_test/common/analyzer/chinese_analyzer.cpp @@ -47,22 +47,12 @@ TEST_F(ChineseAnalyzerTest, test1) { ChineseAnalyzer analyzer(ROOT_PATH.string()); analyzer.Load(); Vector queries = { - "graphic card", - "graphics card", - "南京市长江大桥", - "小明硕士毕业于中国科学院计算所,后在日本京都大学深造", - "会徽整体形似运动中的羽毛球,球头绑带部分演化为“城墙”的图形元素,极具南京的地域特征,凸显出举办地的历史底蕴和人文气息。尾羽部分图形则巧妙融入" - "了举办年份“2018”和南京的首字母“NJ”,结合中国传统书法笔触的表现形式,传递出羽毛球运动的速度感。会徽红黑配色鲜艳明快,契合了体育运动的活力与朝" - "气[3]" - "2018年世界羽毛球锦标赛吉祥物南京羽毛球世锦赛吉祥物2018年道达尔羽毛球世锦赛吉祥物在南京发布。造型简洁、形态生动、富有亲和力的“羽宝”拔得头筹," - "成为2018年世界羽毛球锦标赛吉祥物。比赛将于7月30日在宁举行,赛程7天,预计近340名顶尖运动员参赛。吉祥物“羽宝”头部由羽毛球外形变化而来,手持球" - "拍,拟人化的设计再现了羽毛球运动员比赛时的接击球动作,胸前佩戴的梅花造型的金牌,代表着在南京举办的世锦赛将向世界献上精彩的羽毛球盛宴。同时黄" - "蓝两色为主色调,在视觉冲击中体现了羽毛球运动动静转换的速度感和竞技魅力[6]" - "2018年世界羽毛球锦标赛抽签结果7月17日,2018年南京羽毛球世锦赛抽签出炉。男单中国获得满额席位,石宇奇、谌龙、林丹和黄宇翔全部被分到了上半区" - "。", - "周末我和朋友一起去“电子城”,想挑选一些新的“电脑配件”。那里有各种各样的“hardware”,如“motherboard”、“graphics " - "card”等。我们还看到了一些很“awesome”的“peripheral 
devices”,像“keyboard”和“mouse”。我朋友说他需要一个新的“power " - "supply”,而我则对那些“high-tech”的“storage devices”比较感兴趣。逛了一会儿后,我们都买到了自己心仪的东西,然后就“happily”回家了。"}; + R"#(graphic card)#", + R"#(graphics card)#", + R"#(南京市长江大桥)#", + R"#(小明硕士毕业于中国科学院计算所,后在日本京都大学深造)#", + R"#(会徽整体形似运动中的羽毛球,球头绑带部分演化为“城墙”的图形元素,极具南京的地域特征,凸显出举办地的历史底蕴和人文气息。尾羽部分图形则巧妙融入了举办年份“2018”和南京的首字母“NJ”,结合中国传统书法笔触的表现形式,传递出羽毛球运动的速度感。会徽红黑配色鲜艳明快,契合了体育运动的活力与朝气[3]2018年世界羽毛球锦标赛吉祥物南京羽毛球世锦赛吉祥物2018年道达尔羽毛球世锦赛吉祥物在南京发布。造型简洁、形态生动、富有亲和力的“羽宝”拔得头筹,成为2018年世界羽毛球锦标赛吉祥物。比赛将于7月30日在宁举行,赛程7天,预计近340名顶尖运动员参赛。吉祥物“羽宝”头部由羽毛球外形变化而来,手持球拍,拟人化的设计再现了羽毛球运动员比赛时的接击球动作,胸前佩戴的梅花造型的金牌,代表着在南京举办的世锦赛将向世界献上精彩的羽毛球盛宴。同时黄蓝两色为主色调,在视觉冲击中体现了羽毛球运动动静转换的速度感和竞技魅力[6]2018年世界羽毛球锦标赛抽签结果7月17日,2018年南京羽毛球世锦赛抽签出炉。男单中国获得满额席位,石宇奇、谌龙、林丹和黄宇翔全部被分到了上半区。)#", + R"#(周末我和朋友一起去“电子城”,想挑选一些新的“电脑配件”。那里有各种各样的“hardware”,如“motherboard”、“graphics card”等。我们还看到了一些很“awesome”的“peripheral devices”,像“keyboard”和“mouse”。我朋友说他需要一个新的“power supply”,而我则对那些“high-tech”的“storage devices”比较感兴趣。逛了一会儿后,我们都买到了自己心仪的东西,然后就“happily”回家了。)#"}; ChineseAnalyzer analyzer2(analyzer); analyzer2.SetCutGrain(CutGrain::kFine); diff --git a/src/unit_test/common/analyzer/standard_analyzer.cpp b/src/unit_test/common/analyzer/standard_analyzer.cpp index 9de9079d55..6cfd5489e2 100644 --- a/src/unit_test/common/analyzer/standard_analyzer.cpp +++ b/src/unit_test/common/analyzer/standard_analyzer.cpp @@ -117,3 +117,24 @@ TEST_F(StandardAnalyzerTest, test5) { // ASSERT_EQ(term_list[3].text_, PLACE_HOLDER); // ASSERT_EQ(term_list[3].word_offset_, 3U); } + +TEST_F(StandardAnalyzerTest, test6) { + StandardAnalyzer analyzer; + TermList term_list; + Vector queries = { + R"#({{Redirect|Anarchist|the fictional character|Anarchist (comics)}} {{Redirect|Anarchists}} {{Anarchism sidebar}} {{Libertarianism sidebar}} '''Anarchism''' is generally defined as the [[political philosophy]] which holds the [[state (polity)|state]] to be undesirable, unnecessary, and harmful, {{Cite journal|last=Malatesta|first=Errico|title=Towards 
Anarchism|journal=MAN!|publisher=International Group of San Francisco|location=Los Angeles|oclc=3930443|url=http://www.marxists.org/archive/malatesta/1930s/xx/toanarchy.htm|authorlink=Errico Malatesta}} {{Cite journal|url=http://www.theglobeandmail.com/servlet/story/RTGAM.20070514.wxlanarchist14/BNStory/lifeWork/home/ |title=Working for The Man |journal=[[The Globe and Mail]] |accessdate=2008-04-14 |last=Agrell |first=Siri |date=2007-05-14}} {{cite web|url=http://www.britannica.com/eb/article-9117285|title=Anarchism|year=2006|work=Encyclopædia Britannica|publisher=Encyclopædia Britannica Premium Service|accessdate=2006-08-29| archiveurl=)#", + R"#(http://web.archive.org/web/20061214085638/http://www.britannica.com/eb/article-9117285| archivedate= 14 December 2006}} {{Cite journal|year=2005|title=Anarchism|journal=The Shorter [[Routledge Encyclopedia of Philosophy]]|page=14|quote=Anarchism is the view that a society without the state, or government, is both possible and desirable.}} The following sources cite anarchism as a political philosophy: {{Cite book| last = Mclaughlin | first = Paul | title = Anarchism and Authority | publisher = Ashgate | location = Aldershot | year = 2007 | isbn = 0-7546-6196-2 |page=59}} {{Cite book| last = Johnston | first = R. | title = The Dictionary of Human Geography | publisher = Blackwell Publishers | location = Cambridge | year = 2000 | isbn = 0-631-20561-6 |page=24}}Slevin, Carl. "Anarchism." ''The Concise Oxford Dictionary of Politics''. Ed. Iain McLean and Alistair McMillan. Oxford University Press, 2003. 
or alternatively as opposing [[authority]] and)#", + R"#([[hierarchical organization]] in the conduct of human relations."The [[International of Anarchist Federations|IAF - IFA]] fights for : the abolition of all forms of authority whether economical, political, social, religious, cultural or sexual."[http://www.iaf-ifa.org/principles/english.html "Principles of The [[International of Anarchist Federations]]"]"Anarchism, then, really stands for the liberation of the human mind from the dominion of religion; the liberation of the human body from the dominion of property; liberation from the shackles and restraint of government. Anarchism stands for a social order based on the free grouping of individuals for the purpose of producing real social wealth; an order that will guarantee to every human being free access to the earth and full enjoyment of the necessities of life, according to individual desires, tastes, and inclinations." [[Emma Goldman]]. "What it Really Stands for Anarchy" in ''[[Anarchism and Other)#", + R"#(Essays]]''.Individualist anarchist Benjamin Tucker defined anarchism as opposition to authority as follows "They found that they must turn either to the right or to the left, — follow either the path of Authority or the path of Liberty. Marx went one way; Warren and Proudhon the other. Thus were born State Socialism and Anarchism...Authority, takes many shapes, but, broadly speaking, her enemies divide themselves into three classes: first, those who abhor her both as a means and as an end of progress, opposing her openly, avowedly, sincerely, consistently, universally; second, those who profess to believe in her as a means of progress, but who accept her only so far as they think she will subserve their own selfish interests, denying her and her blessings to the rest of the world; third, those who distrust her as a means of progress, believing in her only as an end to be obtained by first trampling upon, violating, and outraging her. 
These three phases of opposition to Liberty are met in almost)#", + R"#(every sphere of thought and human activity. Good representatives of the first are seen in the Catholic Church and the Russian autocracy; of the second, in the Protestant Church and the Manchester school of politics and political economy; of the third, in the atheism of Gambetta and the socialism of the socialism off Karl Marg." [[Benjamin Tucker]]. [http://www.theanarchistlibrary.org/HTML/Benjamin_Tucker__Individual_Liberty.html ''Individual Liberty.'']{{cite web|url=http://www.panarchy.org/ward/organization.1966.html|last=Ward|first=Colin|year=1966|title=Anarchism as a Theory of Organization|accessdate=1 March 2010| archiveurl= http://web.archive.org/web/20100325081119/http://www.panarchy.org/ward/organization.1966.html| archivedate= 25 March 2010}}Anarchist historian [[George Woodcock]] report of [[Mikhail Bakunin]]'s anti-authoritarianism and shows opposition to both state and non-state forms of authority as follows: "All anarchists deny)#"}; + + for (auto &query : queries) { + TermList term_list; + analyzer.Analyze(query, term_list); + std::cout << "Text #" << query << "# parsed as:" << std::endl; + for (unsigned i = 0; i < term_list.size(); ++i) { + std::cout << "\t" << i << "#" << term_list[i].text_ << "@" << term_list[i].word_offset_ << "#"; + } + std::cout << std::endl; + } +} diff --git a/src/unit_test/storage/invertedindex/memory_indexer.cpp b/src/unit_test/storage/invertedindex/memory_indexer.cpp index ce00eee145..96c428df0d 100644 --- a/src/unit_test/storage/invertedindex/memory_indexer.cpp +++ b/src/unit_test/storage/invertedindex/memory_indexer.cpp @@ -153,17 +153,15 @@ TEST_F(MemoryIndexerTest, test2) { TEST_F(MemoryIndexerTest, SpillLoadTest) { auto fake_segment_index_entry_1 = SegmentIndexEntry::CreateFakeEntry(GetTmpDir()); auto indexer1 = MakeUnique(GetTmpDir(), "chunk1", RowID(0U, 0U), flag_, "standard"); - bool offline = false; - bool spill = true; - 
indexer1->Insert(column_, 0, 2, offline); - indexer1->Insert(column_, 2, 2, offline); - indexer1->Insert(column_, 4, 1, offline); + indexer1->Insert(column_, 0, 2); + indexer1->Insert(column_, 2, 2); + indexer1->Insert(column_, 4, 1); while (indexer1->GetInflightTasks() > 0) { sleep(1); indexer1->CommitSync(); } - indexer1->Dump(offline, spill); + indexer1->Dump(false, true); UniquePtr loaded_indexer = MakeUnique(GetTmpDir(), "chunk1", RowID(0U, 0U), flag_, "standard"); loaded_indexer->Load(); @@ -191,3 +189,53 @@ TEST_F(MemoryIndexerTest, SpillLoadTest) { } } } + +TEST_F(MemoryIndexerTest, SeekPosition) { + // "A B C" repeats 7 times + String paragraph(R"#(A B C A B C A B C A B C A B C A B C A B C)#"); + auto column = ColumnVector::Make(MakeShared(LogicalType::kVarchar)); + column->Initialize(); + Value v = Value::MakeVarchar(paragraph); + for (SizeT i = 0; i < 8192; i++) { + column->AppendValue(v); + } + + auto fake_segment_index_entry_1 = SegmentIndexEntry::CreateFakeEntry(GetTmpDir()); + MemoryIndexer indexer1(GetTmpDir(), "chunk1", RowID(0U, 0U), flag_, "standard"); + indexer1.Insert(column, 0, 8192); + while (indexer1.GetInflightTasks() > 0) { + sleep(1); + indexer1.CommitSync(); + } + + SharedPtr segment_reader = MakeShared(&indexer1); + const String term("a"); + SegmentPosting seg_posting; + SharedPtr> seg_postings = MakeShared>(); + auto ret = segment_reader->GetSegmentPosting(term, seg_posting); + if (ret) { + seg_postings->push_back(seg_posting); + } + + auto posting_iter = MakeUnique(flag_); + u32 state_pool_size = 0; + posting_iter->Init(seg_postings, state_pool_size); + RowID doc_id = INVALID_ROWID; + Vector doc_ids = {0, 1, 2, 5, 127, 128, 512, 1024, 2048, 4096, 8191}; + for (SizeT i = 0; i < doc_ids.size(); ++i) { + doc_id = RowID::FromUint64(doc_ids[i]); + doc_id = posting_iter->SeekDoc(doc_id); + ASSERT_EQ(doc_id, doc_ids[i]); + u32 tf = posting_iter->GetCurrentTF(); + ASSERT_EQ(tf, 7); + pos_t target_pos = 0; + pos_t act_pos = 0; + for (SizeT j = 
0; j < 7; ++j) { + posting_iter->SeekPosition(target_pos, act_pos); + ASSERT_EQ(act_pos, 3 * j); + target_pos = act_pos + 1; + } + posting_iter->SeekPosition(act_pos + 1, act_pos); + ASSERT_EQ(act_pos, INVALID_POSITION); + } +} diff --git a/src/unit_test/storage/invertedindex/search/query_match.cpp b/src/unit_test/storage/invertedindex/search/query_match.cpp index 120ecc0296..41444c8a6b 100644 --- a/src/unit_test/storage/invertedindex/search/query_match.cpp +++ b/src/unit_test/storage/invertedindex/search/query_match.cpp @@ -65,7 +65,7 @@ class QueryMatchTest : public BaseTest { void CreateDBAndTable(const String& db_name, const String& table_name); - void CreateIndex(const String& db_name, const String& table_name, const String& index_name); + void CreateIndex(const String &db_name, const String &table_name, const String &index_name, const String &analyzer); void InsertData(const String& db_name, const String& table_name); @@ -93,33 +93,45 @@ class QueryMatchTest : public BaseTest { void QueryMatchTest::InitData() { datas_ = { - {"1", "Animalia (book)", "Animalia is an illustrated children's book by anarchism Graeme Base. It was originally published in 1986, followed by a tenth anniversary edition in 1996, and a 25th anniversary edition in 2012. Over three million copies have been sold. A special numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket."}, - {"2", "Academy Award for Best Production Design", "harmful chemical The Academy Awards are the oldest awards ceremony for achievements in motion pictures. one of The add test Academy Award for Best Production Design recognizes achievement in art direction on a film. The category's original name was Best Art Direction, but was changed to its current name in 2012 for the 85th Academy Awards. 
This change resulted from the Art Director's branch of the Academy being renamed the Designer's branch."}, - {"3", "Animation", "The American Football Conference (AFC) harm chemical anarchism add test is one of harm chemical the two conferences of the National Football League (NFL). This add test conference and its counterpart, the National Football Conference (NFC), currently contain 16 teams each, making up the 32 teams of the NFL. The current AFC title holder is the New England Patriots."}, + {"1", + "Animalia (book)", + R"#(Animalia is an illustrated children's book by anarchism Graeme Base. It was originally published in 1986, followed by a tenth anniversary edition in 1996, and a 25th anniversary edition in 2012. Over three million copies have been sold. A special numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket.)#"}, + {"2", + "Academy Award for Best Production Design", + R"#(harmful chemical The Academy Awards are the oldest awards ceremony for achievements in motion pictures. one of The add test Academy Award for Best Production Design recognizes achievement in art direction on a film. The category's original name was Best Art Direction, but was changed to its current name in 2012 for the 85th Academy Awards. This change resulted from the Art Director's branch of the Academy being renamed the Designer's branch.)#"}, + {"3", + "Animation", + R"#(The American Football Conference (AFC) harm chemical anarchism add test is one of harm chemical the two conferences of the National Football League (NFL). This add test conference and its counterpart, the National Football Conference (NFC), currently contain 16 teams each, making up the 32 teams of the NFL. 
The current AFC title holder is the New England Patriots.)#"}, + {"4", + "Foobar", + R"#(周末我和朋友一起去“电子城”,想挑选一些新的“电脑配件”。那里有各种各样的“hardware”,如“motherboard”、“graphics card”等。我们还看到了一些很“awesome”的“peripheral devices”,像“keyboard”和“mouse”。我朋友说他需要一个新的“power supply”,而我则对那些“high-tech”的“storage devices”比较感兴趣。逛了一会儿后,我们都买到了自己心仪的东西,然后就“happily”回家了。)#"}, }; } -TEST_F(QueryMatchTest, DISABLED_basic_phrase) { +TEST_F(QueryMatchTest, basic_phrase) { CreateDBAndTable(db_name_, table_name_); - CreateIndex(db_name_, table_name_, index_name_); - InsertData(db_name_, table_name_); - String fields = "text"; - Vector phrases = {"\"Animalia is an\"", "\"one of\"", "\"are book\"", "\"add test\"", "\"harmful chemical\""}; - Vector expected_doc_freq = {1, 2, 0, 2, 2}; - Vector expected_phrase_freq = {1, 2, 0, 3, 3}; - EXPECT_EQ(phrases.size(), expected_doc_freq.size()); - EXPECT_EQ(phrases.size(), expected_phrase_freq.size()); - for (SizeT i = 0; i < phrases.size(); ++i) { - auto phrase = phrases[i]; - auto doc_freq = expected_doc_freq[i]; - auto phrase_freq = expected_phrase_freq[i]; - QueryMatch(db_name_, table_name_, index_name_, fields, phrase, doc_freq, phrase_freq, DocIteratorType::kPhraseIterator); + Vector analyzers = {String("standard")}; + for (auto &analyzer : analyzers) { + CreateIndex(db_name_, table_name_, index_name_, analyzer); + InsertData(db_name_, table_name_); + String fields = "text"; + Vector phrases = {"\"Animalia is an\"", "\"one of\"", "\"are book\"", "\"add test\"", "\"harmful chemical\""}; + Vector expected_doc_freq = {1, 2, 0, 2, 2}; + Vector expected_phrase_freq = {1, 2, 0, 3, 3}; + EXPECT_EQ(phrases.size(), expected_doc_freq.size()); + EXPECT_EQ(phrases.size(), expected_phrase_freq.size()); + for (SizeT i = 0; i < phrases.size(); ++i) { + auto phrase = phrases[i]; + auto doc_freq = expected_doc_freq[i]; + auto phrase_freq = expected_phrase_freq[i]; + QueryMatch(db_name_, table_name_, index_name_, fields, phrase, doc_freq, phrase_freq, 
DocIteratorType::kPhraseIterator); + } } } TEST_F(QueryMatchTest, basic_term) { CreateDBAndTable(db_name_, table_name_); - CreateIndex(db_name_, table_name_, index_name_); + CreateIndex(db_name_, table_name_, index_name_, "standard"); InsertData(db_name_, table_name_); String fields = "text"; Vector terms = {"the", "harmful", "chemical", "anarchism"}; @@ -158,6 +170,11 @@ void QueryMatchTest::CreateDBAndTable(const String& db_name, const String& table auto table_def = TableDef::Make(MakeShared(db_name), MakeShared(table_name), std::move(column_defs)); Storage *storage = InfinityContext::instance().storage(); TxnManager *txn_mgr = storage->txn_manager(); + { + auto *txn = txn_mgr->BeginTxn(MakeUnique("drop table")); + txn->DropTableCollectionByName(db_name, table_name, ConflictType::kIgnore); + last_commit_ts_ = txn_mgr->CommitTxn(txn); + } { auto *txn = txn_mgr->BeginTxn(MakeUnique("create table")); txn->CreateTable(db_name, table_def, ConflictType::kError); @@ -167,19 +184,20 @@ void QueryMatchTest::CreateDBAndTable(const String& db_name, const String& table last_commit_ts_ = txn_mgr->CommitTxn(txn); } - } -void QueryMatchTest::CreateIndex(const String& db_name, const String& table_name, const String& index_name) { +void QueryMatchTest::CreateIndex(const String &db_name, const String &table_name, const String &index_name, const String &analyzer) { Storage *storage = InfinityContext::instance().storage(); TxnManager *txn_mgr = storage->txn_manager(); - String analyzer{"standard"}; Vector col_name_list{"text"}; String index_file_name = index_name + ".json"; { auto *txn_idx = txn_mgr->BeginTxn(MakeUnique("create index")); + auto status0 = txn_idx->DropIndexByName(db_name, table_name, index_name, ConflictType::kIgnore); + // EXPECT_TRUE(status0.ok()); + auto [table_entry, status1] = txn_idx->GetTableByName(db_name, table_name); EXPECT_TRUE(status1.ok());