Skip to content

Commit ccdab18

Browse files
Multimodcrafter authored and shilangyu committed
Restore compilation behaviour for regexes without lookarounds
The machinery necessary to perform the parallel lookbehind checking should only be compiled in when there is actually a lookbehind expression in the regex. This restores compilation to the expected outputs for regexes without lookbehind expressions.
1 parent f97aa92 commit ccdab18

File tree

2 files changed

+46
-8
lines changed

2 files changed

+46
-8
lines changed

regex-automata/src/nfa/thompson/compiler.rs

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -979,17 +979,29 @@ impl Compiler {
979979

980980
let compiled = self.c_alt_iter(exprs.iter().map(|e| {
981981
let _ = self.start_pattern()?;
982-
let lookaround_prefix =
983-
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
984-
let lookaround_alt = self.add_union_reverse()?;
985-
self.patch(lookaround_prefix.end, lookaround_alt)?;
986-
let top_level_alt = self.add_union()?;
987-
self.patch(top_level_alt, lookaround_prefix.start)?;
988-
self.lookaround_alt.borrow_mut().replace(lookaround_alt);
982+
let has_lookarounds =
983+
(e.borrow() as &Hir).properties().contains_lookaround_expr();
984+
let mut top_level_alt = if has_lookarounds {
985+
self.add_union()?
986+
} else {
987+
StateID::ZERO
988+
};
989+
if has_lookarounds {
990+
let lookaround_prefix =
991+
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
992+
let lookaround_alt = self.add_union_reverse()?;
993+
self.patch(lookaround_prefix.end, lookaround_alt)?;
994+
self.patch(top_level_alt, lookaround_prefix.start)?;
995+
self.lookaround_alt.borrow_mut().replace(lookaround_alt);
996+
}
989997
let one = self.c_cap(0, None, e.borrow())?;
990998
let match_state_id = self.add_match()?;
991999
self.patch(one.end, match_state_id)?;
992-
self.patch(top_level_alt, one.start)?;
1000+
if has_lookarounds {
1001+
self.patch(top_level_alt, one.start)?;
1002+
} else {
1003+
top_level_alt = one.start;
1004+
}
9931005
let _ = self.finish_pattern(top_level_alt)?;
9941006
self.lookaround_alt.borrow_mut().take();
9951007
Ok(ThompsonRef { start: top_level_alt, end: match_state_id })

regex-syntax/src/hir/mod.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2041,6 +2041,7 @@ struct PropertiesI {
20412041
look_set_suffix: LookSet,
20422042
look_set_prefix_any: LookSet,
20432043
look_set_suffix_any: LookSet,
2044+
contains_lookaround_expr: bool,
20442045
utf8: bool,
20452046
explicit_captures_len: usize,
20462047
static_explicit_captures_len: Option<usize>,
@@ -2134,6 +2135,17 @@ impl Properties {
21342135
self.0.look_set_suffix_any
21352136
}
21362137

2138+
/// Returns whether there are any look-around expressions in this HIR value.
2139+
///
2140+
/// Only returns true for [`HirKind::LookAround`] and not for
2141+
/// [`HirKind::Look`], which can be queried by [`Properties::look_set`] instead.
2142+
/// Currently, only lookbehind assertions without capture groups are
2143+
/// supported.
2144+
#[inline]
2145+
pub fn contains_lookaround_expr(&self) -> bool {
2146+
self.0.contains_lookaround_expr
2147+
}
2148+
21372149
/// Return true if and only if the corresponding HIR will always match
21382150
/// valid UTF-8.
21392151
///
@@ -2403,6 +2415,7 @@ impl Properties {
24032415
look_set_suffix: fix,
24042416
look_set_prefix_any: LookSet::empty(),
24052417
look_set_suffix_any: LookSet::empty(),
2418+
contains_lookaround_expr: false,
24062419
utf8: true,
24072420
explicit_captures_len: 0,
24082421
static_explicit_captures_len,
@@ -2418,6 +2431,8 @@ impl Properties {
24182431
props.look_set_suffix.set_intersect(p.look_set_suffix());
24192432
props.look_set_prefix_any.set_union(p.look_set_prefix_any());
24202433
props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2434+
props.contains_lookaround_expr =
2435+
props.contains_lookaround_expr || p.contains_lookaround_expr();
24212436
props.utf8 = props.utf8 && p.is_utf8();
24222437
props.explicit_captures_len = props
24232438
.explicit_captures_len
@@ -2465,6 +2480,7 @@ impl Properties {
24652480
look_set_suffix: LookSet::empty(),
24662481
look_set_prefix_any: LookSet::empty(),
24672482
look_set_suffix_any: LookSet::empty(),
2483+
contains_lookaround_expr: false,
24682484
// It is debatable whether an empty regex always matches at valid
24692485
// UTF-8 boundaries. Strictly speaking, at a byte oriented view,
24702486
// it is clearly false. There are, for example, many empty strings
@@ -2501,6 +2517,7 @@ impl Properties {
25012517
look_set_suffix: LookSet::empty(),
25022518
look_set_prefix_any: LookSet::empty(),
25032519
look_set_suffix_any: LookSet::empty(),
2520+
contains_lookaround_expr: false,
25042521
utf8: core::str::from_utf8(&lit.0).is_ok(),
25052522
explicit_captures_len: 0,
25062523
static_explicit_captures_len: Some(0),
@@ -2520,6 +2537,7 @@ impl Properties {
25202537
look_set_suffix: LookSet::empty(),
25212538
look_set_prefix_any: LookSet::empty(),
25222539
look_set_suffix_any: LookSet::empty(),
2540+
contains_lookaround_expr: false,
25232541
utf8: class.is_utf8(),
25242542
explicit_captures_len: 0,
25252543
static_explicit_captures_len: Some(0),
@@ -2539,6 +2557,9 @@ impl Properties {
25392557
look_set_suffix: LookSet::singleton(look),
25402558
look_set_prefix_any: LookSet::singleton(look),
25412559
look_set_suffix_any: LookSet::singleton(look),
2560+
// Note, this field represents _general_ lookarounds (ones using
2561+
// LookAround) and not simple ones (using Look).
2562+
contains_lookaround_expr: false,
25422563
// This requires a little explanation. Basically, we don't consider
25432564
// matching an empty string to be equivalent to matching invalid
25442565
// UTF-8, even though technically matching every empty string will
@@ -2569,6 +2590,7 @@ impl Properties {
25692590
maximum_len: Some(0),
25702591
literal: false,
25712592
alternation_literal: false,
2593+
contains_lookaround_expr: true,
25722594
..*sub_p.0.clone()
25732595
};
25742596
Properties(Box::new(inner))
@@ -2595,6 +2617,7 @@ impl Properties {
25952617
look_set_suffix: LookSet::empty(),
25962618
look_set_prefix_any: p.look_set_prefix_any(),
25972619
look_set_suffix_any: p.look_set_suffix_any(),
2620+
contains_lookaround_expr: p.contains_lookaround_expr(),
25982621
utf8: p.is_utf8(),
25992622
explicit_captures_len: p.explicit_captures_len(),
26002623
static_explicit_captures_len: p.static_explicit_captures_len(),
@@ -2656,6 +2679,7 @@ impl Properties {
26562679
look_set_suffix: LookSet::empty(),
26572680
look_set_prefix_any: LookSet::empty(),
26582681
look_set_suffix_any: LookSet::empty(),
2682+
contains_lookaround_expr: false,
26592683
utf8: true,
26602684
explicit_captures_len: 0,
26612685
static_explicit_captures_len: Some(0),
@@ -2667,6 +2691,8 @@ impl Properties {
26672691
let p = x.properties();
26682692
props.look_set.set_union(p.look_set());
26692693
props.utf8 = props.utf8 && p.is_utf8();
2694+
props.contains_lookaround_expr =
2695+
props.contains_lookaround_expr || p.contains_lookaround_expr();
26702696
props.explicit_captures_len = props
26712697
.explicit_captures_len
26722698
.saturating_add(p.explicit_captures_len());

0 commit comments

Comments
 (0)