@@ -106,7 +106,54 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
106
106
NgramRange : Clone + RangeBounds < & ' me str > + Send + Sync ;
107
107
108
108
// Return the documents containing the literals. The search space is restricted to the documents in the mask if specified
109
- // The literal slice must not be shorter than N, else `[...].split_at(N)` will panic
109
+ //
110
+ // The literal slice should not be shorter than N, or an empty set will be returned to indicate that no document contains an
111
+ // ngram sequence that matches the literal sequence
112
+ //
113
+ // The high level algorithm can be separated into the following phases:
114
+ // - Calculate all ngrams that could be present in the match
115
+ // - Prefetch all relevant blocks for these ngrams
116
+ // - For each sliding window of size N in the literal sequence:
117
+ // - Fetch all (ngram, doc, pos) tuples from the index where the ngram can match the window of N literals
118
+ // - Track the sliding window with the minimum number of candidate (ngram, doc, pos) tuples
119
+ // - Reorganize the ngrams by prefix and suffix into a lookup table
120
+ // - Taking the sliding window with the minimum number of candidate (ngram, doc, pos) tuples as the pivot:
121
+ // - Group the (ngram, doc, pos) tuples by document
122
+ // - For each document, iterate over the candidate (ngram, pos) tuples:
123
+ // - Repeatedly use the suffix of the ngram and the prefix lookup table to see if there exists a sequence of ngrams
124
+ // and positions that aligns all the way to the last sliding window
125
+ // - Repeatedly use the prefix of the ngram and the suffix lookup table to see if there exists a sequence of ngrams
126
+ // and positions that aligns all the way to the first sliding window
127
+ // - If there is such an alignment from the start to the end, add the document to the result and skip to the next document
128
+ //
129
+ // An illustrative example (N=3) for one successful iteration of the final step is presented below (irrelevant info is hidden):
130
+ // ┌─────┐ ┌─────┐
131
+ // │ ijk │ │ jkl │
132
+ // │ │ │ │ ┌─────┐
133
+ // │ 42──┼────────┼►43 │ │ klm │
134
+ // ┌─────┐ │ │ │ │ │ │
135
+ // │ hij │ │ 54──┼────────┼►55──┼────────┼►56 │
136
+ // │ │ │ │ └─────┘ └─────┘
137
+ // │ 71◄─┼────────┼─72──┼────┐
138
+ // │ │ │ │ │ ┌─────┐ ┌─────┐
139
+ // │ 107 │ │ 108 │ │ │ jkL │ │ kLm │
140
+ // └─────┘ └─────┘ │ │ │ │ │
141
+ // pivot └───┼►73──┼────────┼►74 │
142
+ // │ │ │ │
143
+ // │ 109 │ │ 110 │
144
+ // └─────┘ └─────┘
145
+ // In this iteration, we inspect a document that contains the ngrams at the positions specified above. Starting at the pivot:
146
+ // - We check if position `42` could be part of a match. We check the window to the right, which contains `jkl` and `jkL` as potential
147
+ // candidates. Position `43` is present in ngram `jkl` and aligns with `42`, so we proceed to check further to the right. The
148
+ // next window contains `klm` and `kLm` as potential candidates but there is no aligned position in either. Thus `42` cannot be
149
+ // part of a match.
150
+ // - We then check if position `54` could be part of a match. `jkl` contains position `55` and `klm` contains position `56`, thus
151
+ // we successfully find an aligned sequence of ngrams to the last sliding window. However, there is no match to the left of the
152
+ // pivot, thus `54` cannot be part of a match.
153
+ // - We finally check position `72`, and successfully find alignments to both the last and the first sliding windows. Thus position `72`
154
+ // is part of a match, indicating this document matches the literal sequence. We proceed to the next document, even if there
155
+ // could be another match at position `108`.
156
+
110
157
async fn match_literal_with_mask (
111
158
& self ,
112
159
literals : & [ Literal ] ,
0 commit comments