Fix phrase bug (infiniflow#1362)

Fix phrase query crash, added fusion doc, added fulltext example. - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Test cases Issue:infiniflow#1352
maiqxuanfeng · Jun 21, 2024 · a362717 · a362717
1 parent ed5add3
commit a362717
Show file tree

Hide file tree

Showing 14 changed files with 283 additions and 104 deletions.
diff --git a/docs/getstarted/build_from_source.md b/docs/getstarted/build_from_source.md
@@ -47,7 +47,7 @@ git clone https://github.com/infiniflow/infinity.git
 ```shell
 cd infinity && mkdir cmake-build-debug
 TZ=$(readlink -f /etc/localtime | awk -F '/zoneinfo/' '{print $2}')
-docker run -d --name infinity_build --network=host -e TZ=$TZ -v $PWD:/infinity infiniflow/infinity_builder:centos7
+docker run -d --privileged --name infinity_build -e TZ=$TZ -v $PWD:/infinity -v /boot:/boot infiniflow/infinity_builder:centos7
 docker exec infinity_build bash -c "cd /infinity/cmake-build-debug && cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_VERBOSE_MAKEFILE=ON .. && cmake --build ."
 ```
 

diff --git a/docs/getstarted/quickstart.md b/docs/getstarted/quickstart.md
@@ -61,36 +61,98 @@ from infinity.common import ConflictType
 ## Connect to the remote server
 
 ```python
-infinity_obj = infinity.connect(REMOTE_HOST)
+        infinity_obj = infinity.connect(REMOTE_HOST)
 ```
 
 ## Get a database
 
 ```python
-db = infinity_obj.get_database("default_db")
+        db = infinity_obj.get_database("default_db")
 ```
 
 ## Create a table
 
 ```python
-# Drop my_table if it already exists
-db.drop_table("my_table", ConflictType.Ignore)
-# Create a table named "my_table"
-table = db.create_table("my_table", {"num": {"type": "integer"}, "body": {"type": "varchar"}, "vec": {"type": "vector, 4, float"}})
+        # Drop my_table if it already exists
+        db.drop_table("my_table", ConflictType.Ignore)
+        # Create a table named "my_table"
+        table = db.create_table(
+            "my_table",
+            {
+                "num": {"type": "integer"},
+                "body": {"type": "varchar"},
+                "vec": {"type": "vector, 4, float"},
+            },
+        )
 ```
 
-## Insert two records 
+## Insert some records 
 
 ```python
-table.insert([{"num": 1, "body": "unnecessary and harmful", "vec": [1.0, 1.2, 0.8, 0.9]}])
-table.insert([{"num": 2, "body": "Office for Harmful Blooms", "vec": [4.0, 4.2, 4.3, 4.5]}])
+        table.insert(
+            [
+                {
+                    "num": 1,
+                    "body": r"unnecessary and harmful",
+                    "vec": [1.0, 1.2, 0.8, 0.9],
+                },
+                {
+                    "num": 2,
+                    "body": r"Office for Harmful Blooms",
+                    "vec": [4.0, 4.2, 4.3, 4.5],
+                },
+                {
+                    "num": 2,
+                    "body": r"A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.",
+                    "vec": [4.0, 4.2, 4.3, 4.5],
+                },
+            ]
+        )
 ```
 
 ## Execute a vector search
 
 ```python
-res = table.output(["*"]).knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2).to_pl()
-print(res)
+        res = (
+            table.output(["num", "body"])
+            .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2)
+            .to_pl()
+        )
+        print(res)
 ```
 
-> 💡 For more information about the Python API, see the [Python API Reference](../references/pysdk_api_reference.md).
+## Execute a full-text search
+
+```python
+        questions = [
+            r"blooms",               # single term
+            r"Bloom filter",         # OR multiple terms
+            r'"Bloom filter"',       # phrase is surrounded by double-quotes
+            r'space\-efficient',     # escape reserved characters, equivalent to: `space efficient`
+            r'"space\-efficient"',   # phrase and escape reserved character, equivalent to: `"space efficient"`
+        ]
+        for question in questions:
+            qb_result = (
+                table_obj.output(["num", "body", "_score"])
+                .match("body", question, "topn=10")
+                .to_pl()
+            )
+            print(f"question: {question}")
+            print(qb_result)
+```
+
+## Execute a fusion search
+
+```python
+        qb_result2 = (
+            table_obj.output(["num", "body"])
+            .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 3)
+            .match("body", "blooms", "topn=1")
+            .fusion("rrf")
+            .to_pl()
+        )
+        print(qb_result2)
+```
+
+> 💡 For more information about the Python API, see the [Python API Reference](../references/pysdk_api_reference.md).
+> 💡 For a complete example, see the [hello_infinity.py](../../python/hello_infinity.py).
diff --git a/docs/references/pysdk_api_reference.md b/docs/references/pysdk_api_reference.md
@@ -717,7 +717,20 @@ Build a fusion expression.
 ### Parameters
 
 - **method : str**
-- **method : options_text**
+    The supported methods are: rrf, weighted_sum, match_tensor
+- **options_text : str**
+
+    Common options:
+
+    - 'topn=10': Retrieve the 10 most relevant records. The default value is `100`.
+
+    Dedicated options of rrf:
+
+    - 'rank_constant=30': The default value is `60`.
+
+    Dedicated options of weighted_sum:
+
+    - 'weights=1,2,0.5': The weights of the child scorers. The default weight of each scorer is `1.0`.
 
 ### Returns
 
@@ -728,6 +741,8 @@ Build a fusion expression.
 
 ```python
 table_obj.fusion('rrf')
+table_obj.fusion('rrf', 'topn=10')
+table_obj.fusion('weighted_sum', 'weights=1,2,0.5')
 ```
 
 ### Details

diff --git a/python/hello_infinity.py b/python/hello_infinity.py
@@ -17,7 +17,7 @@
 
 # remove local path, use the installed infinity sdk
 current_path = os.path.abspath(os.path.dirname(__file__))
-local_infinity_path = os.path.join(current_path, 'infinity')
+local_infinity_path = os.path.join(current_path, "infinity")
 if local_infinity_path in sys.path:
     sys.path.remove(local_infinity_path)
 if current_path in sys.path:
@@ -48,17 +48,17 @@ def test_english():
             [
                 {
                     "num": 1,
-                    "body": "unnecessary and harmful",
+                    "body": r"unnecessary and harmful",
                     "vec": [1.0, 1.2, 0.8, 0.9],
                 },
                 {
                     "num": 2,
-                    "body": "Office for Harmful Blooms",
+                    "body": r"Office for Harmful Blooms",
                     "vec": [4.0, 4.2, 4.3, 4.5],
                 },
                 {
-                    "num": 2,
-                    "body": "A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.",
+                    "num": 3,
+                    "body": r"A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.",
                     "vec": [4.0, 4.2, 4.3, 4.5],
                 },
             ]
@@ -96,9 +96,13 @@ def test_english():
 
         print("------fulltext-------")
         questions = [
-            "blooms",
-            "Bloom filter",
-            '"Bloom filter"',
+            r"blooms",  # single term
+            r"Bloom filter",  # OR multiple terms
+            r'"Bloom filter"',  # phrase: adjacent multiple terms
+            r"space efficient",  # OR multiple terms
+            # r"space-efficient",      # Error 3013: Invalid query statement: OrQueryNode should not have both not child and non-not child
+            r"space\-efficient",  # Escape reserved character '-', equivalent to: `space efficient`
+            r'"space\-efficient"',  # phrase and escape reserved character, equivalent to: `"space efficient"`
         ]
         for question in questions:
             qb_result = (
@@ -147,42 +151,42 @@ def test_chinese():
             [
                 {
                     "num": 1,
-                    "body": "据Wccftech报道，苹果正在开发一种定制芯片，旨在为人工智能（AI）服务器提供动力。暂时还不清楚这款芯片的具体规格，以及具体的实现目标。传闻苹果已选择台积电（TSMC）的3nm制程节点来制造这款芯片，预计2025年下半年量产。如果按照量产时间和台积电的半导体工艺进度，那么对应的很可能是N3E工艺。",
+                    "body": r"据Wccftech报道，苹果正在开发一种定制芯片，旨在为人工智能（AI）服务器提供动力。暂时还不清楚这款芯片的具体规格，以及具体的实现目标。传闻苹果已选择台积电（TSMC）的3nm制程节点来制造这款芯片，预计2025年下半年量产。如果按照量产时间和台积电的半导体工艺进度，那么对应的很可能是N3E工艺。",
                     "vec": [1.0, 1.2, 0.8, 0.9],
                 },
                 {
                     "num": 2,
-                    "body": "两个月前有报道称，苹果已正式放弃了努力超过十年、投下海量资金的“泰坦计划（Project Titan）”电动车项目。苹果随后解散了大约2000人的开发团队，各人会被分配到其他地方，其中一个很重要的去处就是人工智能部门。有传言称，苹果已经将注意力转向生成式AI，希望能够为业务找到新的增长动力。",
+                    "body": r"两个月前有报道称，苹果已正式放弃了努力超过十年、投下海量资金的“泰坦计划（Project Titan）”电动车项目。苹果随后解散了大约2000人的开发团队，各人会被分配到其他地方，其中一个很重要的去处就是人工智能部门。有传言称，苹果已经将注意力转向生成式AI，希望能够为业务找到新的增长动力。",
                     "vec": [1.2, 0.8, 0.9, 1.0],
                 },
                 {
                     "num": 3,
-                    "body": "爬行动物是一类包括蛇、蜥蜴、龟鳖和鳄鱼等的脊椎动物。它们的特点是体表覆盖着角质鳞片，而不是羽毛。羽毛是鸟类的特征，鸟类是从爬行动物中的某一类演化而来的，但它们已经发展出了独特的特征，包括羽毛。因此，爬行动物没有羽毛。",
+                    "body": r"爬行动物是一类包括蛇、蜥蜴、龟鳖和鳄鱼等的脊椎动物。它们的特点是体表覆盖着角质鳞片，而不是羽毛。羽毛是鸟类的特征，鸟类是从爬行动物中的某一类演化而来的，但它们已经发展出了独特的特征，包括羽毛。因此，爬行动物没有羽毛。",
                     "vec": [0.8, 0.9, 1.0, 1.2],
                 },
                 {
                     "num": 4,
-                    "body": "会徽整体形似运动中的羽毛球，球头绑带部分演化为“城墙”的图形元素，极具南京的地域特征，凸显出举办地的历史底蕴和人文气息。尾羽部分图形则巧妙融入了举办年份“2018”和南京的首字母“NJ”，结合中国传统书法笔触的表现形式，传递出羽毛球运动的速度感。会徽红黑配色鲜艳明快，契合了体育运动的活力与朝气[3]2018年世界羽毛球锦标赛吉祥物南京羽毛球世锦赛吉祥物2018年道达尔羽毛球世锦赛吉祥物在南京发布。造型简洁、形态生动、富有亲和力的“羽宝”拔得头筹，成为2018年世界羽毛球锦标赛吉祥物。比赛将于7月30日在宁举行，赛程7天，预计近340名顶尖运动员参赛。吉祥物“羽宝”头部由羽毛球外形变化而来，手持球拍，拟人化的设计再现了羽毛球运动员比赛时的接击球动作，胸前佩戴的梅花造型的金牌，代表着在南京举办的世锦赛将向世界献上精彩的羽毛球盛宴。同时黄蓝两色为主色调，在视觉冲击中体现了羽毛球运动动静转换的速度感和竞技魅力[6]2018年世界羽毛球锦标赛抽签结果7月17日，2018年南京羽毛球世锦赛抽签出炉。男单中国获得满额席位，石宇奇、谌龙、林丹和黄宇翔全部被分到了上半区。",
+                    "body": r"会徽整体形似运动中的羽毛球，球头绑带部分演化为“城墙”的图形元素，极具南京的地域特征，凸显出举办地的历史底蕴和人文气息。尾羽部分图形则巧妙融入了举办年份“2018”和南京的首字母“NJ”，结合中国传统书法笔触的表现形式，传递出羽毛球运动的速度感。会徽红黑配色鲜艳明快，契合了体育运动的活力与朝气[3]2018年世界羽毛球锦标赛吉祥物南京羽毛球世锦赛吉祥物2018年道达尔羽毛球世锦赛吉祥物在南京发布。造型简洁、形态生动、富有亲和力的“羽宝”拔得头筹，成为2018年世界羽毛球锦标赛吉祥物。比赛将于7月30日在宁举行，赛程7天，预计近340名顶尖运动员参赛。吉祥物“羽宝”头部由羽毛球外形变化而来，手持球拍，拟人化的设计再现了羽毛球运动员比赛时的接击球动作，胸前佩戴的梅花造型的金牌，代表着在南京举办的世锦赛将向世界献上精彩的羽毛球盛宴。同时黄蓝两色为主色调，在视觉冲击中体现了羽毛球运动动静转换的速度感和竞技魅力[6]2018年世界羽毛球锦标赛抽签结果7月17日，2018年南京羽毛球世锦赛抽签出炉。男单中国获得满额席位，石宇奇、谌龙、林丹和黄宇翔全部被分到了上半区。",
                     "vec": [0.9, 1.0, 1.2, 0.8],
                 },
                 {
                     "num": 5,
-                    "body": "周末我和朋友一起去“电子城”，想挑选一些新的“电脑配件”。那里有各种各样的“hardware”，如“motherboard”、“graphics card”等。我们还看到了一些很“awesome”的“peripheral devices”，像“keyboard”和“mouse”。我朋友说他需要一个新的“power supply”，而我则对那些“high-tech”的“storage devices”比较感兴趣。逛了一会儿后，我们都买到了自己心仪的东西，然后就“happily”回家了。",
+                    "body": r"周末我和朋友一起去“电子城”，想挑选一些新的“电脑配件”。那里有各种各样的“hardware”，如“motherboard”、“graphics card”等。我们还看到了一些很“awesome”的“peripheral devices”，像“keyboard”和“mouse”。我朋友说他需要一个新的“power supply”，而我则对那些“high-tech”的“storage devices”比较感兴趣。逛了一会儿后，我们都买到了自己心仪的东西，然后就“happily”回家了。",
                     "vec": [1.0, 0.9, 0.8, 0.9],
                 },
                 {
                     "num": 6,
-                    "body": "便携式计算机  Model Name型号：ThinkBook 16 G5+ ARP  输入电压/电流：20V=5A  CMIT ID：2023AP123456  MO: DS-K3AJ303/Dm140",
+                    "body": r"便携式计算机  Model Name型号：ThinkBook 16 G5+ ARP  输入电压/电流：20V=5A  CMIT ID：2023AP123456  MO: DS-K3AJ303/Dm140",
                     "vec": [0.9, 0.8, 0.9, 1.0],
                 },
                 {
                     "num": 7,
-                    "body": "Office for Harmful Blooms",
+                    "body": r"Office for Harmful Blooms",
                     "vec": [4.0, 4.2, 4.3, 4.5],
                 },
                 {
                     "num": 8,
-                    "body": "A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.",
+                    "body": r"A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.",
                     "vec": [4.0, 4.2, 4.3, 4.5],
                 },
             ]
@@ -224,19 +228,20 @@ def test_chinese():
 
         print("------fulltext-------")
         questions = [
-            "芯片",
-            "苹果",
-            "羽毛",
-            "羽毛球",
-            '"羽毛球锦标赛"',
-            "2018年世界羽毛球锦标赛在哪个城市举办？",
-            "hi\-tech",
-            '"hi-tech"',
-            "graphics card",
-            '"graphics card"',
-            '"DS-K3AJ303/Dm140"',
-            "Bloom filter",
-            '"Bloom filter"',
+            r"芯片",  # single term
+            r"苹果",  # single term
+            r"羽毛",  # single term
+            r"羽毛球",  # single term
+            r'"羽毛球锦标赛"',  # phrase: adjacent multiple terms
+            r"2018年世界羽毛球锦标赛在哪个城市举办？",  # OR multiple terms
+            r"high\-tech",  # Escape reserved character '-'
+            r'"high tech"',  # phrase: adjacent multiple terms
+            r'"high-tech"',  # phrase: adjacent multiple terms
+            r"graphics card",  # OR multiple terms
+            r'"graphics card"',  # phrase: adjacent multiple terms
+            r'"DS-K3AJ303/Dm140"',  # phrase: adjacent multiple terms
+            r"Bloom filter",  # OR multiple terms
+            r'"Bloom filter"',  # phrase: adjacent multiple terms
         ]
         for question in questions:
             qb_result = (
@@ -264,5 +269,5 @@ def test_chinese():
 
 
 if __name__ == "__main__":
-    test_english()
+    # test_english()
     test_chinese()
diff --git a/src/storage/invertedindex/format/position_list_encoder.cpp b/src/storage/invertedindex/format/position_list_encoder.cpp
@@ -124,7 +124,7 @@ InMemPositionListDecoder *PositionListEncoder::GetInMemPositionListDecoder() con
     SkipListReaderPostingByteSlice *in_mem_skiplist_reader = nullptr;
     if (pos_skiplist_writer_.get()) {
         // not support tf bitmap in realtime segment
-        in_mem_skiplist_reader = new SkipListReaderPostingByteSlice(format_option_.GetDocListFormatOption());
+        in_mem_skiplist_reader = new SkipListReaderPostingByteSlice(format_option_.GetPosListFormatOption());
         in_mem_skiplist_reader->Load(pos_skiplist_writer_.get());
     }
     PostingByteSlice *posting_buffer = new PostingByteSlice();

diff --git a/src/storage/invertedindex/format/skiplist_reader.cppm b/src/storage/invertedindex/format/skiplist_reader.cppm
@@ -120,8 +120,9 @@ protected:
 
 export class SkipListReaderPostingByteSlice final : public SkipListReader {
 public:
-    SkipListReaderPostingByteSlice(const DocListFormatOption &doc_list_format_option)
-        : SkipListReader(doc_list_format_option) {}
+    explicit SkipListReaderPostingByteSlice(const DocListFormatOption &doc_list_format_option) : SkipListReader(doc_list_format_option) {}
+
+    explicit SkipListReaderPostingByteSlice(const PositionListFormatOption &pos_list_format_option) : SkipListReader(pos_list_format_option) {}
 
     ~SkipListReaderPostingByteSlice() override;
 

diff --git a/src/storage/invertedindex/posting_iterator.cpp b/src/storage/invertedindex/posting_iterator.cpp
@@ -64,11 +64,12 @@ bool PostingIterator::SkipTo(RowID doc_id) {
 Pair<u32, u16> PostingIterator::GetBlockMaxInfo() const { return posting_decoder_->GetBlockMaxInfo(); }
 
 RowID PostingIterator::SeekDoc(RowID row_id) {
-    RowID current_row_id = finish_decode_docid_ ? current_row_id_ : INVALID_ROWID;
-    if (row_id == current_row_id) [[unlikely]] {
-        return current_row_id;
+    if (segment_postings_.get() == nullptr || segment_postings_->empty()) [[unlikely]] {
+        current_row_id_ = INVALID_ROWID;
+        return INVALID_ROWID;
     }
-    if (current_row_id != INVALID_ROWID and row_id < current_row_id) {
+    RowID current_row_id = finish_decode_docid_ ? current_row_id_ : INVALID_ROWID;
+    if (current_row_id != INVALID_ROWID and row_id <= current_row_id) [[unlikely]] {
         return current_row_id;
     }
     assert(row_id > current_row_id or current_row_id == INVALID_ROWID);

diff --git a/src/storage/invertedindex/posting_iterator.cppm b/src/storage/invertedindex/posting_iterator.cppm
@@ -50,6 +50,9 @@ public:
     void SeekPosition(pos_t pos, pos_t &result);
 
     docpayload_t GetCurrentDocPayload() {
+        if (current_row_id_ == INVALID_ROWID) [[unlikely]] {
+            return 0;
+        }
         if (posting_option_.HasDocPayload()) {
             DecodeTFBuffer();
             DecodeDocPayloadBuffer();
@@ -59,6 +62,9 @@ public:
     }
 
     tf_t GetCurrentTF() {
+        if (current_row_id_ == INVALID_ROWID) [[unlikely]] {
+            return 0;
+        }
         if (posting_option_.HasTfList()) {
             DecodeTFBuffer();
             return tf_buffer_[GetDocOffsetInBuffer()];
@@ -67,6 +73,9 @@ public:
     }
 
     ttf_t GetCurrentTTF() {
+        if (current_row_id_ == INVALID_ROWID) [[unlikely]] {
+            return 0;
+        }
         if (posting_option_.HasTfList()) {
             DecodeTFBuffer();
             i32 offset = GetDocOffsetInBuffer();