Skip to content

Commit

Permalink
Fix phrase bug (infiniflow#1362)
Browse files Browse the repository at this point in the history
Fix phrase query crash, added fusion doc, added fulltext example.

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Test cases

Issue:infiniflow#1352
  • Loading branch information
yuzhichang authored Jun 21, 2024
1 parent ed5add3 commit a362717
Show file tree
Hide file tree
Showing 14 changed files with 283 additions and 104 deletions.
2 changes: 1 addition & 1 deletion docs/getstarted/build_from_source.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ git clone https://github.com/infiniflow/infinity.git
```shell
cd infinity && mkdir cmake-build-debug
TZ=$(readlink -f /etc/localtime | awk -F '/zoneinfo/' '{print $2}')
docker run -d --name infinity_build --network=host -e TZ=$TZ -v $PWD:/infinity infiniflow/infinity_builder:centos7
docker run -d --privileged --name infinity_build -e TZ=$TZ -v $PWD:/infinity -v /boot:/boot infiniflow/infinity_builder:centos7
docker exec infinity_build bash -c "cd /infinity/cmake-build-debug && cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_VERBOSE_MAKEFILE=ON .. && cmake --build ."
```

Expand Down
86 changes: 74 additions & 12 deletions docs/getstarted/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,36 +61,98 @@ from infinity.common import ConflictType
## Connect to the remote server

```python
infinity_obj = infinity.connect(REMOTE_HOST)
infinity_obj = infinity.connect(REMOTE_HOST)
```

## Get a database

```python
db = infinity_obj.get_database("default_db")
db = infinity_obj.get_database("default_db")
```

## Create a table

```python
# Drop my_table if it already exists
db.drop_table("my_table", ConflictType.Ignore)
# Create a table named "my_table"
table = db.create_table("my_table", {"num": {"type": "integer"}, "body": {"type": "varchar"}, "vec": {"type": "vector, 4, float"}})
# Drop my_table if it already exists
db.drop_table("my_table", ConflictType.Ignore)
# Create a table named "my_table"
table = db.create_table(
"my_table",
{
"num": {"type": "integer"},
"body": {"type": "varchar"},
"vec": {"type": "vector, 4, float"},
},
)
```

## Insert two records
## Insert some records

```python
table.insert([{"num": 1, "body": "unnecessary and harmful", "vec": [1.0, 1.2, 0.8, 0.9]}])
table.insert([{"num": 2, "body": "Office for Harmful Blooms", "vec": [4.0, 4.2, 4.3, 4.5]}])
table.insert(
[
{
"num": 1,
"body": r"unnecessary and harmful",
"vec": [1.0, 1.2, 0.8, 0.9],
},
{
"num": 2,
"body": r"Office for Harmful Blooms",
"vec": [4.0, 4.2, 4.3, 4.5],
},
{
"num": 2,
"body": r"A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.",
"vec": [4.0, 4.2, 4.3, 4.5],
},
]
)
```

## Execute a vector search

```python
res = table.output(["*"]).knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2).to_pl()
print(res)
res = (
table.output(["num", "body"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2)
.to_pl()
)
print(res)
```

> 💡 For more information about the Python API, see the [Python API Reference](../references/pysdk_api_reference.md).
## Execute some full-text searches

```python
questions = [
r"blooms", # single term
r"Bloom filter", # OR multiple terms
r'"Bloom filter"', # phrase is surrounded by double-quotes
r'space\-efficient', # escape reserved characters, equivalent to: `space efficient`
r'"space\-efficient"', # phrase and escape reserved character, equivalent to: `"space efficient"`
]
for question in questions:
qb_result = (
table.output(["num", "body", "_score"])
.match("body", question, "topn=10")
.to_pl()
)
print(f"question: {question}")
print(qb_result)
```

## Execute a fusion search

```python
qb_result2 = (
table.output(["num", "body"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 3)
.match("body", "blooms", "topn=1")
.fusion("rrf")
.to_pl()
)
print(qb_result2)
```

> 💡 For more information about the Python API, see the [Python API Reference](../references/pysdk_api_reference.md).
> 💡 For a complete example, see the [hello_infinity.py](../../python/hello_infinity.py).
17 changes: 16 additions & 1 deletion docs/references/pysdk_api_reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -717,7 +717,20 @@ Build a fusion expression.
### Parameters

- **method : str**
- **method : options_text**
The supported methods are: rrf, weighted_sum, match_tensor
- **options_text : str**

Common options:

- 'topn=10': Retrieve the 10 most relevant records. The default value is `100`.

Dedicated options of rrf:

- 'rank_constant=30': The default value is `60`.

Dedicated options of weighted_sum:

- 'weights=1,2,0.5': The weights of the child scorers. The default weight of each scorer is `1.0`.

### Returns

Expand All @@ -728,6 +741,8 @@ Build a fusion expression.

```python
table_obj.fusion('rrf')
table_obj.fusion('rrf', 'topn=10')
table_obj.fusion('weighted_sum', 'weights=1,2,0.5')
```

### Details
Expand Down
65 changes: 35 additions & 30 deletions python/hello_infinity.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

# remove local path, use the installed infinity sdk
current_path = os.path.abspath(os.path.dirname(__file__))
local_infinity_path = os.path.join(current_path, 'infinity')
local_infinity_path = os.path.join(current_path, "infinity")
if local_infinity_path in sys.path:
sys.path.remove(local_infinity_path)
if current_path in sys.path:
Expand Down Expand Up @@ -48,17 +48,17 @@ def test_english():
[
{
"num": 1,
"body": "unnecessary and harmful",
"body": r"unnecessary and harmful",
"vec": [1.0, 1.2, 0.8, 0.9],
},
{
"num": 2,
"body": "Office for Harmful Blooms",
"body": r"Office for Harmful Blooms",
"vec": [4.0, 4.2, 4.3, 4.5],
},
{
"num": 2,
"body": "A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.",
"num": 3,
"body": r"A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.",
"vec": [4.0, 4.2, 4.3, 4.5],
},
]
Expand Down Expand Up @@ -96,9 +96,13 @@ def test_english():

print("------fulltext-------")
questions = [
"blooms",
"Bloom filter",
'"Bloom filter"',
r"blooms", # single term
r"Bloom filter", # OR multiple terms
r'"Bloom filter"', # phrase: adjacent multiple terms
r"space efficient", # OR multiple terms
# r"space-efficient", # Error 3013: Invalid query statement: OrQueryNode should not have both not child and non-not child
r"space\-efficient", # Escape reserved character '-', equivalent to: `space efficient`
r'"space\-efficient"', # phrase and escape reserved character, equivalent to: `"space efficient"`
]
for question in questions:
qb_result = (
Expand Down Expand Up @@ -147,42 +151,42 @@ def test_chinese():
[
{
"num": 1,
"body": "据Wccftech报道,苹果正在开发一种定制芯片,旨在为人工智能(AI)服务器提供动力。暂时还不清楚这款芯片的具体规格,以及具体的实现目标。传闻苹果已选择台积电(TSMC)的3nm制程节点来制造这款芯片,预计2025年下半年量产。如果按照量产时间和台积电的半导体工艺进度,那么对应的很可能是N3E工艺。",
"body": r"据Wccftech报道,苹果正在开发一种定制芯片,旨在为人工智能(AI)服务器提供动力。暂时还不清楚这款芯片的具体规格,以及具体的实现目标。传闻苹果已选择台积电(TSMC)的3nm制程节点来制造这款芯片,预计2025年下半年量产。如果按照量产时间和台积电的半导体工艺进度,那么对应的很可能是N3E工艺。",
"vec": [1.0, 1.2, 0.8, 0.9],
},
{
"num": 2,
"body": "两个月前有报道称,苹果已正式放弃了努力超过十年、投下海量资金的“泰坦计划(Project Titan)”电动车项目。苹果随后解散了大约2000人的开发团队,各人会被分配到其他地方,其中一个很重要的去处就是人工智能部门。有传言称,苹果已经将注意力转向生成式AI,希望能够为业务找到新的增长动力。",
"body": r"两个月前有报道称,苹果已正式放弃了努力超过十年、投下海量资金的“泰坦计划(Project Titan)”电动车项目。苹果随后解散了大约2000人的开发团队,各人会被分配到其他地方,其中一个很重要的去处就是人工智能部门。有传言称,苹果已经将注意力转向生成式AI,希望能够为业务找到新的增长动力。",
"vec": [1.2, 0.8, 0.9, 1.0],
},
{
"num": 3,
"body": "爬行动物是一类包括蛇、蜥蜴、龟鳖和鳄鱼等的脊椎动物。它们的特点是体表覆盖着角质鳞片,而不是羽毛。羽毛是鸟类的特征,鸟类是从爬行动物中的某一类演化而来的,但它们已经发展出了独特的特征,包括羽毛。因此,爬行动物没有羽毛。",
"body": r"爬行动物是一类包括蛇、蜥蜴、龟鳖和鳄鱼等的脊椎动物。它们的特点是体表覆盖着角质鳞片,而不是羽毛。羽毛是鸟类的特征,鸟类是从爬行动物中的某一类演化而来的,但它们已经发展出了独特的特征,包括羽毛。因此,爬行动物没有羽毛。",
"vec": [0.8, 0.9, 1.0, 1.2],
},
{
"num": 4,
"body": "会徽整体形似运动中的羽毛球,球头绑带部分演化为“城墙”的图形元素,极具南京的地域特征,凸显出举办地的历史底蕴和人文气息。尾羽部分图形则巧妙融入了举办年份“2018”和南京的首字母“NJ”,结合中国传统书法笔触的表现形式,传递出羽毛球运动的速度感。会徽红黑配色鲜艳明快,契合了体育运动的活力与朝气[3]2018年世界羽毛球锦标赛吉祥物南京羽毛球世锦赛吉祥物2018年道达尔羽毛球世锦赛吉祥物在南京发布。造型简洁、形态生动、富有亲和力的“羽宝”拔得头筹,成为2018年世界羽毛球锦标赛吉祥物。比赛将于7月30日在宁举行,赛程7天,预计近340名顶尖运动员参赛。吉祥物“羽宝”头部由羽毛球外形变化而来,手持球拍,拟人化的设计再现了羽毛球运动员比赛时的接击球动作,胸前佩戴的梅花造型的金牌,代表着在南京举办的世锦赛将向世界献上精彩的羽毛球盛宴。同时黄蓝两色为主色调,在视觉冲击中体现了羽毛球运动动静转换的速度感和竞技魅力[6]2018年世界羽毛球锦标赛抽签结果7月17日,2018年南京羽毛球世锦赛抽签出炉。男单中国获得满额席位,石宇奇、谌龙、林丹和黄宇翔全部被分到了上半区。",
"body": r"会徽整体形似运动中的羽毛球,球头绑带部分演化为“城墙”的图形元素,极具南京的地域特征,凸显出举办地的历史底蕴和人文气息。尾羽部分图形则巧妙融入了举办年份“2018”和南京的首字母“NJ”,结合中国传统书法笔触的表现形式,传递出羽毛球运动的速度感。会徽红黑配色鲜艳明快,契合了体育运动的活力与朝气[3]2018年世界羽毛球锦标赛吉祥物南京羽毛球世锦赛吉祥物2018年道达尔羽毛球世锦赛吉祥物在南京发布。造型简洁、形态生动、富有亲和力的“羽宝”拔得头筹,成为2018年世界羽毛球锦标赛吉祥物。比赛将于7月30日在宁举行,赛程7天,预计近340名顶尖运动员参赛。吉祥物“羽宝”头部由羽毛球外形变化而来,手持球拍,拟人化的设计再现了羽毛球运动员比赛时的接击球动作,胸前佩戴的梅花造型的金牌,代表着在南京举办的世锦赛将向世界献上精彩的羽毛球盛宴。同时黄蓝两色为主色调,在视觉冲击中体现了羽毛球运动动静转换的速度感和竞技魅力[6]2018年世界羽毛球锦标赛抽签结果7月17日,2018年南京羽毛球世锦赛抽签出炉。男单中国获得满额席位,石宇奇、谌龙、林丹和黄宇翔全部被分到了上半区。",
"vec": [0.9, 1.0, 1.2, 0.8],
},
{
"num": 5,
"body": "周末我和朋友一起去“电子城”,想挑选一些新的“电脑配件”。那里有各种各样的“hardware”,如“motherboard”、“graphics card”等。我们还看到了一些很“awesome”的“peripheral devices”,像“keyboard”和“mouse”。我朋友说他需要一个新的“power supply”,而我则对那些“high-tech”的“storage devices”比较感兴趣。逛了一会儿后,我们都买到了自己心仪的东西,然后就“happily”回家了。",
"body": r"周末我和朋友一起去“电子城”,想挑选一些新的“电脑配件”。那里有各种各样的“hardware”,如“motherboard”、“graphics card”等。我们还看到了一些很“awesome”的“peripheral devices”,像“keyboard”和“mouse”。我朋友说他需要一个新的“power supply”,而我则对那些“high-tech”的“storage devices”比较感兴趣。逛了一会儿后,我们都买到了自己心仪的东西,然后就“happily”回家了。",
"vec": [1.0, 0.9, 0.8, 0.9],
},
{
"num": 6,
"body": "便携式计算机 Model Name型号:ThinkBook 16 G5+ ARP 输入电压/电流:20V=5A CMIT ID:2023AP123456 MO: DS-K3AJ303/Dm140",
"body": r"便携式计算机 Model Name型号:ThinkBook 16 G5+ ARP 输入电压/电流:20V=5A CMIT ID:2023AP123456 MO: DS-K3AJ303/Dm140",
"vec": [0.9, 0.8, 0.9, 1.0],
},
{
"num": 7,
"body": "Office for Harmful Blooms",
"body": r"Office for Harmful Blooms",
"vec": [4.0, 4.2, 4.3, 4.5],
},
{
"num": 8,
"body": "A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.",
"body": r"A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.",
"vec": [4.0, 4.2, 4.3, 4.5],
},
]
Expand Down Expand Up @@ -224,19 +228,20 @@ def test_chinese():

print("------fulltext-------")
questions = [
"芯片",
"苹果",
"羽毛",
"羽毛球",
'"羽毛球锦标赛"',
"2018年世界羽毛球锦标赛在哪个城市举办?",
"hi\-tech",
'"hi-tech"',
"graphics card",
'"graphics card"',
'"DS-K3AJ303/Dm140"',
"Bloom filter",
'"Bloom filter"',
r"芯片", # single term
r"苹果", # single term
r"羽毛", # single term
r"羽毛球", # single term
r'"羽毛球锦标赛"', # phrase: adjacent multiple terms
r"2018年世界羽毛球锦标赛在哪个城市举办?", # OR multiple terms
r"high\-tech", # Escape reserved character '-'
r'"high tech"', # phrase: adjacent multiple terms
r'"high-tech"', # phrase: adjacent multiple terms
r"graphics card", # OR multiple terms
r'"graphics card"', # phrase: adjacent multiple terms
r'"DS-K3AJ303/Dm140"', # phrase: adjacent multiple terms
r"Bloom filter", # OR multiple terms
r'"Bloom filter"', # phrase: adjacent multiple terms
]
for question in questions:
qb_result = (
Expand Down Expand Up @@ -264,5 +269,5 @@ def test_chinese():


if __name__ == "__main__":
test_english()
# test_english()
test_chinese()
2 changes: 1 addition & 1 deletion src/storage/invertedindex/format/position_list_encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ InMemPositionListDecoder *PositionListEncoder::GetInMemPositionListDecoder() con
SkipListReaderPostingByteSlice *in_mem_skiplist_reader = nullptr;
if (pos_skiplist_writer_.get()) {
// not support tf bitmap in realtime segment
in_mem_skiplist_reader = new SkipListReaderPostingByteSlice(format_option_.GetDocListFormatOption());
in_mem_skiplist_reader = new SkipListReaderPostingByteSlice(format_option_.GetPosListFormatOption());
in_mem_skiplist_reader->Load(pos_skiplist_writer_.get());
}
PostingByteSlice *posting_buffer = new PostingByteSlice();
Expand Down
5 changes: 3 additions & 2 deletions src/storage/invertedindex/format/skiplist_reader.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,9 @@ protected:

export class SkipListReaderPostingByteSlice final : public SkipListReader {
public:
SkipListReaderPostingByteSlice(const DocListFormatOption &doc_list_format_option)
: SkipListReader(doc_list_format_option) {}
explicit SkipListReaderPostingByteSlice(const DocListFormatOption &doc_list_format_option) : SkipListReader(doc_list_format_option) {}

explicit SkipListReaderPostingByteSlice(const PositionListFormatOption &pos_list_format_option) : SkipListReader(pos_list_format_option) {}

~SkipListReaderPostingByteSlice() override;

Expand Down
9 changes: 5 additions & 4 deletions src/storage/invertedindex/posting_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,12 @@ bool PostingIterator::SkipTo(RowID doc_id) {
Pair<u32, u16> PostingIterator::GetBlockMaxInfo() const { return posting_decoder_->GetBlockMaxInfo(); }

RowID PostingIterator::SeekDoc(RowID row_id) {
RowID current_row_id = finish_decode_docid_ ? current_row_id_ : INVALID_ROWID;
if (row_id == current_row_id) [[unlikely]] {
return current_row_id;
if (segment_postings_.get() == nullptr || segment_postings_->empty()) [[unlikely]] {
current_row_id_ = INVALID_ROWID;
return INVALID_ROWID;
}
if (current_row_id != INVALID_ROWID and row_id < current_row_id) {
RowID current_row_id = finish_decode_docid_ ? current_row_id_ : INVALID_ROWID;
if (current_row_id != INVALID_ROWID and row_id <= current_row_id) [[unlikely]] {
return current_row_id;
}
assert(row_id > current_row_id or current_row_id == INVALID_ROWID);
Expand Down
9 changes: 9 additions & 0 deletions src/storage/invertedindex/posting_iterator.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ public:
void SeekPosition(pos_t pos, pos_t &result);

docpayload_t GetCurrentDocPayload() {
if (current_row_id_ == INVALID_ROWID) [[unlikely]] {
return 0;
}
if (posting_option_.HasDocPayload()) {
DecodeTFBuffer();
DecodeDocPayloadBuffer();
Expand All @@ -59,6 +62,9 @@ public:
}

tf_t GetCurrentTF() {
if (current_row_id_ == INVALID_ROWID) [[unlikely]] {
return 0;
}
if (posting_option_.HasTfList()) {
DecodeTFBuffer();
return tf_buffer_[GetDocOffsetInBuffer()];
Expand All @@ -67,6 +73,9 @@ public:
}

ttf_t GetCurrentTTF() {
if (current_row_id_ == INVALID_ROWID) [[unlikely]] {
return 0;
}
if (posting_option_.HasTfList()) {
DecodeTFBuffer();
i32 offset = GetDocOffsetInBuffer();
Expand Down
Loading

0 comments on commit a362717

Please sign in to comment.