Skip to content

Commit f97aa92

Browse files
Multimodcrafter, shilangyu
authored and committed
Implement lookaround compilation
These changes implement the compilation of lookaround assertions from HIR to NFA. The subexpression of each lookaround assertion is patched into a top-level reverse union. This is necessary so that the NFA explores the innermost subexpression first, which ensures that every subexpression result is available by the time it needs to be checked. That is, any `WriteLookaround` state must be visited before any `CheckLookaround` state with the same index.
1 parent edba197 commit f97aa92

File tree

3 files changed

+134
-11
lines changed

3 files changed

+134
-11
lines changed

regex-automata/src/nfa/thompson/builder.rs

Lines changed: 74 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,9 @@ enum State {
4141
},
4242
/// A state that only transitions to another state if the current input
4343
/// byte is in a particular range of bytes.
44-
ByteRange { trans: Transition },
44+
ByteRange {
45+
trans: Transition,
46+
},
4547
/// A state with possibly many transitions, represented in a sparse
4648
/// fashion. Transitions must be ordered lexicographically by input range
4749
/// and be non-overlapping. As such, this may only be used when every
@@ -55,10 +57,15 @@ enum State {
5557
/// that `Sparse` is used for via `Union`. But this creates a more bloated
5658
/// NFA with more epsilon transitions than is necessary in the special case
5759
/// of character classes.
58-
Sparse { transitions: Vec<Transition> },
60+
Sparse {
61+
transitions: Vec<Transition>,
62+
},
5963
/// A conditional epsilon transition satisfied via some sort of
6064
/// look-around.
61-
Look { look: Look, next: StateID },
65+
Look {
66+
look: Look,
67+
next: StateID,
68+
},
6269
/// An empty state that records the start of a capture location. This is an
6370
/// unconditional epsilon transition like `Empty`, except it can be used to
6471
/// record position information for a capture group when using the NFA for
@@ -91,10 +98,20 @@ enum State {
9198
/// The next state that this state should transition to.
9299
next: StateID,
93100
},
101+
WriteLookaround {
102+
lookaround_index: usize,
103+
},
104+
CheckLookaround {
105+
lookaround_index: usize,
106+
positive: bool,
107+
next: StateID,
108+
},
94109
/// An alternation such that there exists an epsilon transition to all
95110
/// states in `alternates`, where matches found via earlier transitions
96111
/// are preferred over later transitions.
97-
Union { alternates: Vec<StateID> },
112+
Union {
113+
alternates: Vec<StateID>,
114+
},
98115
/// An alternation such that there exists an epsilon transition to all
99116
/// states in `alternates`, where matches found via later transitions are
100117
/// preferred over earlier transitions.
@@ -110,7 +127,9 @@ enum State {
110127
/// to be amortized constant time. But if we used a `Union`, we'd need to
111128
/// prepend the state, which takes O(n) time. There are other approaches we
112129
/// could use to solve this, but this seems simple enough.
113-
UnionReverse { alternates: Vec<StateID> },
130+
UnionReverse {
131+
alternates: Vec<StateID>,
132+
},
114133
/// A state that cannot be transitioned out of. This is useful for cases
115134
/// where you want to prevent matching from occurring. For example, if your
116135
/// regex parser permits empty character classes, then one could choose a
@@ -124,7 +143,9 @@ enum State {
124143
///
125144
/// `pattern_id` refers to the ID of the pattern itself, which corresponds
126145
/// to the pattern's index (starting at 0).
127-
Match { pattern_id: PatternID },
146+
Match {
147+
pattern_id: PatternID,
148+
},
128149
}
129150

130151
impl State {
@@ -154,7 +175,9 @@ impl State {
154175
| State::CaptureStart { .. }
155176
| State::CaptureEnd { .. }
156177
| State::Fail
157-
| State::Match { .. } => 0,
178+
| State::Match { .. }
179+
| State::CheckLookaround { .. }
180+
| State::WriteLookaround { .. } => 0,
158181
State::Sparse { ref transitions } => {
159182
transitions.len() * mem::size_of::<Transition>()
160183
}
@@ -470,6 +493,22 @@ impl Builder {
470493
State::Look { look, next } => {
471494
remap[sid] = nfa.add(nfa::State::Look { look, next });
472495
}
496+
State::WriteLookaround { lookaround_index } => {
497+
remap[sid] = nfa.add(nfa::State::WriteLookaround {
498+
look_idx: lookaround_index,
499+
});
500+
}
501+
State::CheckLookaround {
502+
lookaround_index,
503+
positive,
504+
next,
505+
} => {
506+
remap[sid] = nfa.add(nfa::State::CheckLookaround {
507+
look_idx: lookaround_index,
508+
positive,
509+
next,
510+
});
511+
}
473512
State::CaptureStart { pattern_id, group_index, next } => {
474513
// We can't remove this empty state because of the side
475514
// effect of capturing an offset for this capture slot.
@@ -693,6 +732,30 @@ impl Builder {
693732
self.add(State::Empty { next: StateID::ZERO })
694733
}
695734

735+
/// Add a state which will record that the lookaround with the given index
736+
/// is satisfied at the current position.
///
/// The resulting `WriteLookaround` state stores only `index`; it has no
/// `next` field, and `patch` treats it as a no-op source, so it acts as
/// the end of the lookaround sub-expression it is attached to.
///
/// Returns the ID of the new state, or whatever error `add` reports.
737+
pub fn add_write_lookaround(
738+
&mut self,
739+
index: usize,
740+
) -> Result<StateID, BuildError> {
741+
self.add(State::WriteLookaround { lookaround_index: index })
742+
}
743+
744+
/// Add a state which will check whether the lookaround with the given
745+
/// index is satisfied at the current position.
///
/// * `index` - the lookaround index, shared with the matching
///   `WriteLookaround` state.
/// * `positive` - `true` for a positive assertion, `false` for a negative
///   one (this is how the compiler's `c_lookaround` sets it). Presumably
///   the transition is only followed when the recorded result agrees with
///   this flag -- confirm against the NFA simulation code.
/// * `next` - the state to transition to; it can be replaced later via
///   `patch`, which overwrites `next` for this state kind.
///
/// Returns the ID of the new state, or whatever error `add` reports.
746+
pub fn add_check_lookaround(
747+
&mut self,
748+
index: usize,
749+
positive: bool,
750+
next: StateID,
751+
) -> Result<StateID, BuildError> {
752+
self.add(State::CheckLookaround {
753+
lookaround_index: index,
754+
positive,
755+
next,
756+
})
757+
}
758+
696759
/// Add a "union" NFA state.
697760
///
698761
/// A "union" NFA state that contains zero or more unconditional epsilon
@@ -1159,6 +1222,9 @@ impl Builder {
11591222
State::Look { ref mut next, .. } => {
11601223
*next = to;
11611224
}
1225+
State::CheckLookaround { ref mut next, .. } => {
1226+
*next = to;
1227+
}
11621228
State::Union { ref mut alternates } => {
11631229
alternates.push(to);
11641230
self.memory_states += mem::size_of::<StateID>();
@@ -1173,6 +1239,7 @@ impl Builder {
11731239
State::CaptureEnd { ref mut next, .. } => {
11741240
*next = to;
11751241
}
1242+
State::WriteLookaround { .. } => {}
11761243
State::Fail => {}
11771244
State::Match { .. } => {}
11781245
}

regex-automata/src/nfa/thompson/compiler.rs

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use core::{borrow::Borrow, cell::RefCell};
33
use alloc::{sync::Arc, vec, vec::Vec};
44

55
use regex_syntax::{
6-
hir::{self, Hir},
6+
hir::{self, Hir, LookAround},
77
utf8::{Utf8Range, Utf8Sequences},
88
ParserBuilder,
99
};
@@ -711,6 +711,7 @@ pub struct Compiler {
711711
/// State used for caching common suffixes when compiling reverse UTF-8
712712
/// automata (for Unicode character classes).
713713
utf8_suffix: RefCell<Utf8SuffixMap>,
714+
lookaround_alt: RefCell<Option<StateID>>,
714715
}
715716

716717
impl Compiler {
@@ -723,6 +724,7 @@ impl Compiler {
723724
utf8_state: RefCell::new(Utf8State::new()),
724725
trie_state: RefCell::new(RangeTrie::new()),
725726
utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
727+
lookaround_alt: RefCell::new(None),
726728
}
727729
}
728730

@@ -977,11 +979,20 @@ impl Compiler {
977979

978980
let compiled = self.c_alt_iter(exprs.iter().map(|e| {
979981
let _ = self.start_pattern()?;
982+
let lookaround_prefix =
983+
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
984+
let lookaround_alt = self.add_union_reverse()?;
985+
self.patch(lookaround_prefix.end, lookaround_alt)?;
986+
let top_level_alt = self.add_union()?;
987+
self.patch(top_level_alt, lookaround_prefix.start)?;
988+
self.lookaround_alt.borrow_mut().replace(lookaround_alt);
980989
let one = self.c_cap(0, None, e.borrow())?;
981990
let match_state_id = self.add_match()?;
982991
self.patch(one.end, match_state_id)?;
983-
let _ = self.finish_pattern(one.start)?;
984-
Ok(ThompsonRef { start: one.start, end: match_state_id })
992+
self.patch(top_level_alt, one.start)?;
993+
let _ = self.finish_pattern(top_level_alt)?;
994+
self.lookaround_alt.borrow_mut().take();
995+
Ok(ThompsonRef { start: top_level_alt, end: match_state_id })
985996
}))?;
986997
self.patch(unanchored_prefix.end, compiled.start)?;
987998
let nfa = self
@@ -1003,14 +1014,39 @@ impl Compiler {
10031014
Class(Class::Bytes(ref c)) => self.c_byte_class(c),
10041015
Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
10051016
Look(ref look) => self.c_look(look),
1006-
LookAround(_) => todo!("implement lookaround NFA compilation"),
1017+
LookAround(ref lookaround) => self.c_lookaround(lookaround),
10071018
Repetition(ref rep) => self.c_repetition(rep),
10081019
Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
10091020
Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
10101021
Alternation(ref es) => self.c_alt_slice(es),
10111022
}
10121023
}
10131024

1025+
/// Compile a look-behind assertion (positive or negative) into NFA states.
///
/// The assertion's sub-expression is compiled on its own and wired up as
/// follows:
///
/// * the end of the sub-expression is patched to a new `WriteLookaround`
///   state;
/// * the start of the sub-expression is added as an alternate of the
///   state stored in `self.lookaround_alt`.
///
/// The returned `ThompsonRef` consists solely of the new `CheckLookaround`
/// state (it is both `start` and `end`), so the sub-expression itself is
/// not part of the returned fragment.
///
/// # Panics
///
/// The lookaround index is not implemented yet: the `todo!` below panics
/// on every call, after the sub-expression has been compiled. Once that
/// is filled in, this also panics if `self.lookaround_alt` is `None`.
fn c_lookaround(
1026+
&self,
1027+
lookaround: &LookAround,
1028+
) -> Result<ThompsonRef, BuildError> {
1029+
// Both variants carry a sub-expression that is compiled the same way.
let sub = match lookaround {
1030+
LookAround::NegativeLookBehind(ref sub)
1031+
| LookAround::PositiveLookBehind(ref sub) => self.c(sub)?,
1032+
};
1033+
// Polarity of the assertion: `true` for positive, `false` for negative.
let pos = match lookaround {
1034+
LookAround::NegativeLookBehind(_) => false,
1035+
LookAround::PositiveLookBehind(_) => true,
1036+
};
1037+
// NOTE(review): placeholder -- this diverges, so everything below is
// currently unreachable. The write and check states must share one index.
let idx = todo!("get index");
1038+
let check = self.add_check_lookaround(idx, pos)?;
1039+
let write = self.add_write_lookaround(idx)?;
1040+
// Reaching the end of the sub-expression leads to the write state.
self.patch(sub.end, write)?;
1041+
// Register the sub-expression as an alternate of the lookaround union.
self.patch(
1042+
self.lookaround_alt
1043+
.borrow()
1044+
.expect("Cannot compile lookaround outside pattern"),
1045+
sub.start,
1046+
)?;
1047+
Ok(ThompsonRef { start: check, end: check })
1048+
}
1049+
10141050
/// Compile a concatenation of the sub-expressions yielded by the given
10151051
/// iterator. If the iterator yields no elements, then this compiles down
10161052
/// to an "empty" state that always matches.
@@ -1631,6 +1667,25 @@ impl Compiler {
16311667
self.builder.borrow_mut().add_empty()
16321668
}
16331669

1670+
/// Add a `WriteLookaround` state for the lookaround with the given index
/// by forwarding to the underlying builder, and return the new state's ID.
fn add_write_lookaround(
1671+
&self,
1672+
index: usize,
1673+
) -> Result<StateID, BuildError> {
1674+
self.builder.borrow_mut().add_write_lookaround(index)
1675+
}
1676+
1677+
/// Add a `CheckLookaround` state for the lookaround with the given index
/// and polarity by forwarding to the underlying builder, and return the
/// new state's ID.
///
/// The state's `next` transition is initialised to `StateID::ZERO` as a
/// placeholder; the caller is expected to patch it to the real target.
fn add_check_lookaround(
1678+
&self,
1679+
index: usize,
1680+
positive: bool,
1681+
) -> Result<StateID, BuildError> {
1682+
self.builder.borrow_mut().add_check_lookaround(
1683+
index,
1684+
positive,
1685+
// Placeholder target, replaced when this state is patched.
StateID::ZERO,
1686+
)
1687+
}
1688+
16341689
fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
16351690
self.builder.borrow_mut().add_range(Transition {
16361691
start,

regex-cli/cmd/generate/fowler.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,5 +421,6 @@ fn count_capturing_groups_ast(ast: &regex_syntax::ast::Ast) -> usize {
421421
Ast::Concat(ref concat) => {
422422
concat.asts.iter().map(count_capturing_groups_ast).sum()
423423
}
424+
Ast::LookAround(_) => todo!(),
424425
}
425426
}

0 commit comments

Comments
 (0)