Skip to content

[ENH] Optimize regex algorithm #4624

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions rust/index/src/fulltext/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,15 @@ impl<'reader> NgramLiteralProvider<FullTextIndexError> for FullTextIndexReader<'
6
}

/// Warm the posting-list index for the given ngrams before they are looked up.
///
/// Delegates to the blockfile reader's prefix-based block loader, which loads
/// the blocks covering each ngram prefix; nothing is returned and no result is
/// produced — the call exists purely so that the subsequent
/// `lookup_ngram_range` calls find their blocks already loaded.
/// NOTE(review): assumes `load_blocks_for_prefixes` is idempotent and safe to
/// call with duplicate ngrams — confirm against the blockfile reader.
async fn prefetch_ngrams<'me, Ngrams>(&'me self, ngrams: Ngrams)
where
Ngrams: IntoIterator<Item = &'me str> + Send + Sync,
{
self.posting_lists_blockfile_reader
.load_blocks_for_prefixes(ngrams)
.await
}

async fn lookup_ngram_range<'me, NgramRange>(
&'me self,
ngram_range: NgramRange,
Expand Down
330 changes: 242 additions & 88 deletions rust/types/src/regex/literal_expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,32 @@ impl From<ChromaHir> for LiteralExpr {
}
}

/// Per-window index that groups ngrams by their shared edges: `prefix` maps an
/// ngram's leading (all-but-last-char) slice, and `suffix` its trailing
/// (all-but-first-char) slice, to the indices of the matching posting lists.
/// This lets the matcher chain a window's ngrams to the neighbouring windows
/// when tracing a candidate match left and right of the pivot window.
#[derive(Debug, Default)]
struct PrefixSuffixLookupTable<'me> {
    prefix: HashMap<&'me str, Vec<usize>>,
    suffix: HashMap<&'me str, Vec<usize>>,
}

impl<'me> PrefixSuffixLookupTable<'me> {
    /// Create an empty table with both maps pre-sized for `capacity` entries,
    /// so inserting one entry per ngram in the window does not rehash.
    fn with_capacity(capacity: usize) -> Self {
        let prefix = HashMap::with_capacity(capacity);
        let suffix = HashMap::with_capacity(capacity);
        Self { prefix, suffix }
    }
}

#[async_trait::async_trait]
pub trait NgramLiteralProvider<E, const N: usize = 3> {
// Return the max branching factor during the search
fn maximum_branching_factor(&self) -> usize;

/// Hint that the given ngrams are about to be looked up, allowing the provider
/// to prefetch whatever backing data it needs. The default implementation is a
/// deliberate no-op so that providers with no prefetching concept need not
/// override it; storage-backed providers (e.g. the full-text index reader)
/// override this to load their posting-list blocks up front.
async fn prefetch_ngrams<'me, Ngrams>(&'me self, _ngrams: Ngrams)
where
Ngrams: IntoIterator<Item = &'me str> + Send + Sync,
{
}

// Return the (ngram, doc_id, positions) for a range of ngrams
async fn lookup_ngram_range<'me, NgramRange>(
&'me self,
Expand All @@ -85,7 +106,54 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
NgramRange: Clone + RangeBounds<&'me str> + Send + Sync;

// Return the documents containing the literals. The search space is restricted to the documents in the mask if specified
// The literal slice must not be shorter than N, else `[...].split_at(N)` will panic
//
// The literal slice should not be shorter than N; otherwise an empty set is returned to indicate that no document contains an
// ngram sequence matching the literal sequence
//
// The high level algorithm can be separated into the following phases:
// - Calculate all ngrams that could be present in the match
// - Prefetch all relevant blocks for these ngrams
// - For each sliding window of size N in the literal sequence:
// - Fetch all (ngram, doc, pos) tuples from the index where the ngram can match the window of N literals
// - Track the sliding window with minimum number of candidate (ngram, doc, pos) tuples
// - Reorganize the ngrams by prefix and suffix into a lookup table
// - Taking the sliding window with minimum number of candidate (ngram, doc, pos) tuples as the pivot:
// - Group the (ngram, doc, pos) tuples by document
// - For each document, iterate over the candidate (ngram, pos) tuples:
// - Repeatedly use the suffix of the ngram and the prefix lookup table to see if there exists a sequence of ngrams
// and positions that aligns all the way to the last sliding window
// - Repeatedly use the prefix of the ngram and the suffix lookup table to see if there exists a sequence of ngrams
// and positions that aligns all the way to the first sliding window
// - If there is such an alignment from the start to the end, add the document to the result and skip to the next document
//
// An illustrative example (N=3) for one successful iteration of the final step is presented below (irrelevant info is hidden):
// ┌─────┐ ┌─────┐
// │ ijk │ │ jkl │
// │ │ │ │ ┌─────┐
// │ 42──┼────────┼►43 │ │ klm │
// ┌─────┐ │ │ │ │ │ │
// │ hij │ │ 54──┼────────┼►55──┼────────┼►56 │
// │ │ │ │ └─────┘ └─────┘
// │ 71◄─┼────────┼─72──┼────┐
// │ │ │ │ │ ┌─────┐ ┌─────┐
// │ 107 │ │ 108 │ │ │ jkL │ │ kLm │
// └─────┘ └─────┘ │ │ │ │ │
// pivot └───┼►73──┼────────┼►74 │
// │ │ │ │
// │ 109 │ │ 110 │
// └─────┘ └─────┘
// In this iteration, we inspect a document that contains the ngrams at the positions specified above. Starting at the pivot:
// - We check if position `42` could be part of a match. We check the window at right, which contains `jkl` and `jkL` as potential
// candidates. Position `43` is present in ngram `jkl` and aligns with `42`, so we proceed to check further to the right. The
// next window contains `klm` and `kLm` as potential candidates but there is no aligned position in either. Thus `42` cannot be
// part of a match.
// - We then check if position `54` could be part of a match. `jkl` contains position `55` and `klm` contains position `56`, thus
// we successfully find an aligned sequence of ngrams to the last sliding window. However there is no match to the left of the
// pivot, thus `54` cannot be part of a match.
// - We finally check position `72`, and successfully find an alignment to the last and first sliding window. Thus position `72`
// is part of a match, indicating this document matches the literal sequence. We proceed to the next document, even if there
// could be another match at position `108`.

async fn match_literal_with_mask(
&self,
literals: &[Literal],
Expand All @@ -95,17 +163,19 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
return Ok(HashSet::new());
}

let (initial_literals, remaining_literals) = literals.split_at(N);
let initial_ngrams =
initial_literals
.iter()
.fold(vec![Vec::with_capacity(N)], |mut acc, lit| match lit {
Literal::Char(c) => {
acc.iter_mut().for_each(|s| s.push(*c));
acc
}
Literal::Class(class_unicode) => {
acc.into_iter()
// Derive the full set of ngrams
let ngram_vec = literals
.windows(N)
.map(|ngram_literals| {
ngram_literals
.iter()
.fold(vec![String::with_capacity(N)], |mut acc, lit| match lit {
Literal::Char(c) => {
acc.iter_mut().for_each(|s| s.push(*c));
acc
}
Literal::Class(class_unicode) => acc
.into_iter()
.flat_map(|s| {
class_unicode.iter().flat_map(|r| r.start()..=r.end()).map(
move |c| {
Expand All @@ -115,95 +185,179 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
},
)
})
.collect()
}
});
.collect(),
})
})
.collect::<Vec<_>>();

// ngram suffix -> doc_id -> position
let mut suffix_doc_pos: HashMap<Vec<char>, HashMap<u32, HashSet<u32>>> = HashMap::new();
for ngram in initial_ngrams {
let ngram_string = ngram.iter().collect::<String>();
let ngram_doc_pos = self
.lookup_ngram_range(ngram_string.as_str()..=ngram_string.as_str())
.await?;

if ngram_doc_pos.is_empty() {
continue;
}
if ngram_vec.is_empty() {
return Ok(HashSet::new());
}

let suffix = ngram[1..].to_vec();
for (_, doc_id, pos) in ngram_doc_pos {
if pos.is_empty() {
self.prefetch_ngrams(
ngram_vec
.iter()
.flat_map(|ngrams| ngrams.iter().map(|ngram| ngram.as_str())),
)
.await;

// Retrieve all ngram posting lists
let mut ngram_doc_pos_vec = Vec::with_capacity(ngram_vec.iter().map(Vec::len).sum());
let mut lookup_table_vec = Vec::<PrefixSuffixLookupTable>::with_capacity(ngram_vec.len());
let mut min_lookup_table_size = usize::MAX;
let mut min_lookup_table_index = 0;
for ngrams in &ngram_vec {
let mut lookup_table = PrefixSuffixLookupTable::with_capacity(ngrams.len());
let mut lookup_table_size = 0;
for ngram in ngrams {
let ngram_doc_pos = self
.lookup_ngram_range(ngram.as_str()..=ngram.as_str())
.await?;

if ngram_doc_pos.is_empty() {
continue;
}
if mask.is_none() || mask.is_some_and(|m| m.contains(&doc_id)) {
suffix_doc_pos
.entry(suffix.clone())
.or_default()
.entry(doc_id)
.or_default()
.extend(pos);
}

let ngram_doc_pos_index = ngram_doc_pos_vec.len();
lookup_table_size += ngram_doc_pos.len();
ngram_doc_pos_vec.push(ngram_doc_pos);

let prefix = &ngram[..ngram.char_indices().next_back().unwrap_or_default().0];
let suffix = &ngram[ngram.char_indices().nth(1).unwrap_or_default().0..];
lookup_table
.prefix
.entry(prefix)
.or_insert_with(|| Vec::with_capacity(ngrams.len()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[PerformanceOptimization]

Consider using entry().or_default() instead of entry().or_insert_with(|| Vec::with_capacity(ngrams.len())) for simpler code. If the performance difference is important, you could benchmark both approaches.

.push(ngram_doc_pos_index);
lookup_table
.suffix
.entry(suffix)
.or_insert_with(|| Vec::with_capacity(ngrams.len()))
.push(ngram_doc_pos_index);
}
let lookup_table_index = lookup_table_vec.len();
lookup_table_vec.push(lookup_table);
if lookup_table_size < min_lookup_table_size {
min_lookup_table_size = lookup_table_size;
min_lookup_table_index = lookup_table_index;
}
}

for literal in remaining_literals {
if suffix_doc_pos.is_empty() {
break;
}
let mut new_suffix_doc_pos: HashMap<Vec<char>, HashMap<u32, HashSet<u32>>> =
HashMap::new();
for (mut suffix, doc_pos) in suffix_doc_pos {
let ngram_ranges = match literal {
Literal::Char(literal_char) => {
suffix.push(*literal_char);
vec![(suffix.clone(), suffix)]
// Gather candidate documents
let min_lookup_table = &lookup_table_vec[min_lookup_table_index];
let min_ngram_doc_pos_iter = min_lookup_table
.prefix
.values()
.flat_map(|idxs| idxs.iter().map(|idx| &ngram_doc_pos_vec[*idx]));
let mut candidates =
HashMap::<_, Vec<_>>::with_capacity(min_ngram_doc_pos_iter.clone().map(Vec::len).sum());
for (ngram, doc, pos) in min_ngram_doc_pos_iter
.flatten()
.filter(|(_, d, _)| mask.is_none() || mask.is_some_and(|m| m.contains(d)))
{
candidates
.entry(*doc)
.or_insert_with(|| Vec::with_capacity(min_lookup_table.prefix.len()))
.push((*ngram, *pos));
}

// Find a valid trace across lookup tables
let mut result = HashSet::with_capacity(candidates.len());
for (doc, pivot_ngram_pos) in candidates {
for (ngram, pos) in pivot_ngram_pos
.into_iter()
.flat_map(|(n, ps)| ps.iter().map(move |p| (n, *p)))
{
// Trace to the right of pivot
let mut suffix_pos_idx =
Vec::with_capacity(lookup_table_vec.len() - min_lookup_table_index);
let suffix_offset = ngram.char_indices().nth(1).unwrap_or_default().0;
suffix_pos_idx.push((&ngram[suffix_offset..], pos + suffix_offset as u32, 0));
while let Some((suffix, match_pos, ngram_index)) = suffix_pos_idx.pop() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

worth adding comments here. this is an iterative DFS right? Maybe a recursive impl is more readable

let focus_lookup_table = match lookup_table_vec
.get(min_lookup_table_index + suffix_pos_idx.len() + 1)
{
Some(table) => table,
None => {
Copy link
Contributor

@sanketkedia sanketkedia May 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add comments explaining this case?

suffix_pos_idx.push((suffix, match_pos, ngram_index));
break;
}
};
let focus_ngram_doc_pos = match focus_lookup_table
.prefix
.get(suffix)
.and_then(|idxs| idxs.get(ngram_index))
{
Some(idx) => &ngram_doc_pos_vec[*idx],
None => continue,
};
suffix_pos_idx.push((suffix, match_pos, ngram_index + 1));
let (focus_ngram, _, pos) =
match focus_ngram_doc_pos.binary_search_by_key(&doc, |(_, d, _)| *d) {
Ok(idx) => focus_ngram_doc_pos[idx],
Err(_) => continue,
};
if pos.binary_search(&match_pos).is_ok() {
let suffix_offset = focus_ngram.char_indices().nth(1).unwrap_or_default().0;
suffix_pos_idx.push((
&focus_ngram[suffix_offset..],
match_pos + suffix_offset as u32,
0,
));
}
Literal::Class(class_unicode) => class_unicode
.iter()
.map(|r| {
let mut min_ngram = suffix.clone();
min_ngram.push(r.start());
let mut max_ngram = suffix.clone();
max_ngram.push(r.end());
(min_ngram, max_ngram)
})
.collect(),
};

for (min_ngram, max_ngram) in ngram_ranges {
let min_ngram_string = min_ngram.iter().collect::<String>();
let max_ngram_string = max_ngram.iter().collect::<String>();
let ngram_doc_pos = self
.lookup_ngram_range(min_ngram_string.as_str()..=max_ngram_string.as_str())
.await?;
for (ngram, doc_id, next_pos) in ngram_doc_pos {
if let Some(pos) = doc_pos.get(&doc_id) {
let next_pos_set: HashSet<&u32> = HashSet::from_iter(next_pos);
let mut valid_next_pos = pos
.iter()
.filter_map(|p| next_pos_set.contains(&(p + 1)).then_some(p + 1))
.peekable();
if valid_next_pos.peek().is_some() {
let new_suffix = ngram.chars().skip(1).collect();
new_suffix_doc_pos
.entry(new_suffix)
.or_default()
.entry(doc_id)
.or_default()
.extend(valid_next_pos);
}
}
if suffix_pos_idx.is_empty() {
continue;
}

// Trace to the left of pivot
let mut prefix_pos_idx = Vec::with_capacity(min_lookup_table_index + 1);
let prefix_offset = ngram.char_indices().next_back().unwrap_or_default().0;
prefix_pos_idx.push((&ngram[..prefix_offset], pos, 0));
while let Some((prefix, match_pos_with_offset, ngram_index)) = prefix_pos_idx.pop()
{
let focus_lookup_table = match min_lookup_table_index
.checked_sub(prefix_pos_idx.len() + 1)
.and_then(|lookup_index| lookup_table_vec.get(lookup_index))
{
Some(table) => table,
None => {
prefix_pos_idx.push((prefix, match_pos_with_offset, ngram_index));
break;
}
};
let focus_ngram_doc_pos = match focus_lookup_table
.suffix
.get(prefix)
.and_then(|idxs| idxs.get(ngram_index))
{
Some(idx) => &ngram_doc_pos_vec[*idx],
None => continue,
};
prefix_pos_idx.push((prefix, match_pos_with_offset, ngram_index + 1));
let (focus_ngram, _, pos) =
match focus_ngram_doc_pos.binary_search_by_key(&doc, |(_, d, _)| *d) {
Ok(idx) => focus_ngram_doc_pos[idx],
Err(_) => continue,
};
let match_pos = match match_pos_with_offset
.checked_sub(focus_ngram.char_indices().nth(1).unwrap_or_default().0 as u32)
{
Some(pos) => pos,
None => continue,
};
if pos.binary_search(&match_pos).is_ok() {
let prefix_offset =
focus_ngram.char_indices().next_back().unwrap_or_default().0;
prefix_pos_idx.push((&focus_ngram[..prefix_offset], match_pos, 0));
}
}
if !prefix_pos_idx.is_empty() {
result.insert(doc);
break;
}
}
suffix_doc_pos = new_suffix_doc_pos;
}

let result = suffix_doc_pos
.into_values()
.flat_map(|doc_pos| doc_pos.into_keys())
.collect();
Ok(result)
}

Expand Down
Loading
Loading