Skip to content

Commit 01591ed

Browse files
author
sicheng
committed
Fix unicode support
1 parent: a195b2b · commit: 01591ed

File tree

2 files changed: 19 additions and 13 deletions (+19 -13 lines changed)

2 files changed: 19 additions and 13 deletions (+19 -13 lines changed)

rust/types/src/regex/literal_expr.rs

Lines changed: 18 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -175,8 +175,8 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
175175
lookup_table_size += ngram_doc_pos.len();
176176
ngram_doc_pos_vec.push(ngram_doc_pos);
177177

178-
let prefix = &ngram[..N - 1];
179-
let suffix = &ngram[1..];
178+
let prefix = &ngram[..ngram.char_indices().next_back().unwrap_or_default().0];
179+
let suffix = &ngram[ngram.char_indices().nth(1).unwrap_or_default().0..];
180180
lookup_table
181181
.prefix
182182
.entry(prefix)
@@ -224,7 +224,8 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
224224
// Trace to the right of pivot
225225
let mut suffix_pos_idx =
226226
Vec::with_capacity(lookup_table_vec.len() - min_lookup_table_index);
227-
suffix_pos_idx.push((&ngram[1..], pos + ngram[..1].len() as u32, 0));
227+
let suffix_offset = ngram.char_indices().nth(1).unwrap_or_default().0;
228+
suffix_pos_idx.push((&ngram[suffix_offset..], pos + suffix_offset as u32, 0));
228229
while let Some((suffix, match_pos, ngram_index)) = suffix_pos_idx.pop() {
229230
let focus_lookup_table = match lookup_table_vec
230231
.get(min_lookup_table_index + suffix_pos_idx.len() + 1)
@@ -250,9 +251,10 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
250251
Err(_) => continue,
251252
};
252253
if pos.binary_search(&match_pos).is_ok() {
254+
let suffix_offset = focus_ngram.char_indices().nth(1).unwrap_or_default().0;
253255
suffix_pos_idx.push((
254-
&focus_ngram[1..],
255-
match_pos + focus_ngram[..1].len() as u32,
256+
&focus_ngram[suffix_offset..],
257+
match_pos + suffix_offset as u32,
256258
0,
257259
));
258260
}
@@ -263,7 +265,8 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
263265

264266
// Trace to the left of pivot
265267
let mut prefix_pos_idx = Vec::with_capacity(min_lookup_table_index + 1);
266-
prefix_pos_idx.push((&ngram[..N - 1], pos, 0));
268+
let prefix_offset = ngram.char_indices().next_back().unwrap_or_default().0;
269+
prefix_pos_idx.push((&ngram[..prefix_offset], pos, 0));
267270
while let Some((prefix, match_pos_with_offset, ngram_index)) = prefix_pos_idx.pop()
268271
{
269272
let focus_lookup_table = match min_lookup_table_index
@@ -290,13 +293,16 @@ pub trait NgramLiteralProvider<E, const N: usize = 3> {
290293
Ok(idx) => focus_ngram_doc_pos[idx],
291294
Err(_) => continue,
292295
};
293-
let match_pos =
294-
match match_pos_with_offset.checked_sub(focus_ngram[..1].len() as u32) {
295-
Some(pos) => pos,
296-
None => continue,
297-
};
296+
let match_pos = match match_pos_with_offset
297+
.checked_sub(focus_ngram.char_indices().nth(1).unwrap_or_default().0 as u32)
298+
{
299+
Some(pos) => pos,
300+
None => continue,
301+
};
298302
if pos.binary_search(&match_pos).is_ok() {
299-
prefix_pos_idx.push((&focus_ngram[..N - 1], match_pos, 0));
303+
let prefix_offset =
304+
focus_ngram.char_indices().next_back().unwrap_or_default().0;
305+
prefix_pos_idx.push((&focus_ngram[..prefix_offset], match_pos, 0));
300306
}
301307
}
302308
if !prefix_pos_idx.is_empty() {

rust/types/src/strategies.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -433,7 +433,7 @@ impl Arbitrary for ChromaHir {
433433
type Strategy = BoxedStrategy<Self>;
434434

435435
fn arbitrary_with(args: Self::Parameters) -> Self::Strategy {
436-
let literal = r"[a-zA-Z0-9_]{3,}".prop_map(Self::Literal);
436+
let literal = r"\w{3,}".prop_map(Self::Literal);
437437
let char_class = prop_oneof![
438438
2 => Just(Self::Class(ClassUnicode::new([
439439
ClassUnicodeRange::new('a', 'z'),

0 commit comments

Comments
 (0)