Skip to content
This repository was archived by the owner on Jan 2, 2025. It is now read-only.

Commit bac30ab

Browse files
authored
fix case-sensitivity of fuzzy search (#538)
Trigrams generated for Jaccard similarity were case-sensitive, but the regex filter was case-insensitive. This meant that certain documents were filtered out if none of the case-sensitive trigrams matched.
1 parent f1cf345 commit bac30ab

File tree

2 files changed

+5
-17
lines changed

2 files changed

+5
-17
lines changed

server/bleep/src/indexes/file.rs

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ use super::{
3838
use crate::{
3939
background::SyncPipes,
4040
intelligence::TreeSitterFile,
41+
query::compiler::{case_permutations, trigrams},
4142
repo::{iterator::*, FileCache, RepoMetadata, RepoRef, RepoRemote, Repository},
4243
semantic::Semantic,
4344
symbol::SymbolLocations,
@@ -295,20 +296,6 @@ impl Indexer<File> {
295296
limit: usize,
296297
) -> impl Iterator<Item = FileDocument> + '_ {
297298
// lifted from query::compiler
298-
fn trigrams(s: &str) -> impl Iterator<Item = String> {
299-
let mut chars = s.chars().collect::<Vec<_>>();
300-
301-
std::iter::from_fn(move || match chars.len() {
302-
0 => None,
303-
1 | 2 | 3 => Some(std::mem::take(&mut chars).into_iter().collect()),
304-
_ => {
305-
let out = chars.iter().take(3).collect();
306-
chars.remove(0);
307-
Some(out)
308-
}
309-
})
310-
}
311-
312299
let reader = self.reader.read().await;
313300
let searcher = reader.searcher();
314301
let collector = TopDocs::with_limit(100);
@@ -318,7 +305,8 @@ impl Indexer<File> {
318305
// matched the query
319306
let repo_ref_term = Term::from_field_text(self.source.repo_ref, &repo_ref.to_string());
320307
let mut hits = trigrams(query_str)
321-
.map(|token| Term::from_field_text(self.source.relative_path, &token))
308+
.flat_map(|s| case_permutations(s.as_str()))
309+
.map(|token| Term::from_field_text(self.source.relative_path, token.as_str()))
322310
.map(|term| {
323311
BooleanQuery::intersection(vec![
324312
Box::new(TermQuery::new(term, IndexRecordOption::Basic)),

server/bleep/src/query/compiler.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ fn str_to_query(field: Field, s: &str) -> DynQuery {
199199

200200
/// Split a string into trigrams, returning a bigram or unigram if the string is shorter than 3
201201
/// characters.
202-
fn trigrams(s: &str) -> impl Iterator<Item = CompactString> {
202+
pub fn trigrams(s: &str) -> impl Iterator<Item = CompactString> {
203203
let mut chars = s.chars().collect::<SmallVec<[char; 6]>>();
204204

205205
std::iter::from_fn(move || match chars.len() {
@@ -217,7 +217,7 @@ fn trigrams(s: &str) -> impl Iterator<Item = CompactString> {
217217
///
218218
/// This permutes each character by ASCII lowercase and uppercase variants. Characters which do not
219219
/// have case variants remain unchanged.
220-
fn case_permutations(s: &str) -> impl Iterator<Item = CompactString> {
220+
pub fn case_permutations(s: &str) -> impl Iterator<Item = CompactString> {
221221
// This implements a bitmask-based algorithm. The purpose is not speed; rather, a bitmask is
222222
// a simple way to get all combinations of a set of flags without allocating, sorting, or doing
223223
// anything else that is fancy.

0 commit comments

Comments
 (0)