Skip to content

Commit c587256

Browse files
authored
Add more detailed explanations to control-flow RegexOpcode values (#112170)
Add more detailed explanations to control-flow RegexOpcode values
1 parent 9caf9f1 commit c587256

File tree

1 file changed

+134
-19
lines changed
  • src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions

1 file changed

+134
-19
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs

Lines changed: 134 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -117,54 +117,169 @@ internal enum RegexOpcode
117117
UpdateBumpalong = 46,
118118

119119
// Primitive control structures
120-
// TODO: Figure out what these comments mean / what these control structures actually do :)
121120

122-
/// <summary>back jump straight first.</summary>
121+
/// <summary>Lazy branch in an alternation or conditional construct.</summary>
122+
/// <remarks>
123+
/// On first execution, the opcode records the current input position (via the tracking stack) and continues straight
124+
/// without taking the jump. If the match attempt that follows fails, backtracking restores the saved position,
125+
/// at which point the interpreter will jump to the alternative branch (using the patched jump offset in operand 0).
126+
/// This opcode is used to implement alternation in a non-greedy (lazy) manner.
127+
/// </remarks>
123128
Lazybranch = 23,
124-
/// <summary>back jump branch first for loop.</summary>
129+
130+
/// <summary>Branch in a quantified loop that uses a saved mark to decide whether to repeat or exit.</summary>
131+
/// <remarks>
132+
/// When executed, this opcode pops a previously saved input mark (from a <see cref="Setmark"/> or <see cref="Nullmark"/>)
133+
/// and compares it to the current input position. If the loop's inner expression has consumed input (non-empty match), it
134+
/// pushes updated state (saving the old mark and the current position) and jumps back (via the jump offset in operand 0)
135+
/// to repeat the loop. If no progress has been made (empty match), it records state for backtracking and proceeds.
136+
/// This opcode is used for greedy (non-lazy) quantified loops when no explicit counter is needed.
137+
/// </remarks>
125138
Branchmark = 24,
126-
/// <summary>back jump straight first for loop.</summary>
139+
140+
/// <summary>Lazy branch in a quantified loop that uses a saved mark.</summary>
141+
/// <remarks>
142+
/// Similar in spirit to <see cref="Branchmark"/>, this opcode is used for lazy loops.
143+
/// It initially does not jump back to repeat the loop, preferring to let the overall match continue.
144+
/// However, it saves the loop state so that if subsequent matching fails, backtracking will re-enter the loop body.
145+
/// Special care is taken to handle empty matches so as to avoid infinite loops.
146+
/// </remarks>
127147
Lazybranchmark = 25,
128-
/// <summary>back val set counter, null mark.</summary>
148+
149+
/// <summary>Initialize the loop counter for a quantifier when the minimum repetition is zero.</summary>
150+
/// <remarks>
151+
/// For quantified constructs with a minimum of zero (<see cref="RegexNode.M"/> == 0), this opcode pushes a counter
152+
/// value (-1) along with a marker (implicitly indicating no match so far) onto the grouping stack. The operand (always 0
153+
/// in this case) is used in later comparisons within a <see cref="Branchcount"/> or <see cref="Lazybranchcount"/> opcode.
154+
/// </remarks>
129155
Nullcount = 26,
130-
/// <summary>back val set counter, make mark</summary>
156+
157+
/// <summary>Initialize the loop counter for a quantifier with a positive minimum.</summary>
158+
/// <remarks>
159+
/// When the quantifier requires at least one match (M > 0), this opcode pushes the current input position as a marker and a
160+
/// counter value computed as (1 - M) onto the grouping stack. This counter will be adjusted in subsequent loop iterations
161+
/// (via <see cref="Branchcount"/> or <see cref="Lazybranchcount"/>) to decide whether the loop should continue.
162+
/// </remarks>
131163
Setcount = 27,
132-
/// <summary>back jump,limit branch++ if zero&lt;=c&lt;limit.</summary>
164+
165+
/// <summary>Greedy counted branch for quantified loops.</summary>
166+
/// <remarks>
167+
/// This opcode is used for quantified loops that require a counter. When executed, it pops the previously stored marker and counter
168+
/// from the grouping stack, computes the difference between the current input position and the marker, and compares the counter
169+
/// against a limit (given in operand 1). If the counter indicates that more iterations are allowed (and the inner expression consumed
170+
/// input), it increments the counter, updates the marker with the new position, and jumps (via the jump offset in operand 0) to
171+
/// repeat the loop. Otherwise, the interpreter continues straight. On backtracking, the previous state is restored so that a decreased
172+
/// count may be tried.
173+
/// </remarks>
133174
Branchcount = 28,
134-
/// <summary>back jump,limit same, but straight first.</summary>
175+
176+
/// <summary>Lazy counted branch for quantified loops.</summary>
177+
/// <remarks>
178+
/// This opcode is the lazy counterpart to <see cref="Branchcount"/>. It is used in quantified loops that use a counter and prefer
179+
/// to exit the loop as early as possible. On initial execution it will choose the straight path (i.e. not repeating the loop) if
180+
/// the counter is nonnegative, but if the inner expression consumed input and the counter is below the maximum (given in operand 1),
181+
/// it will re-enter the loop on backtracking.
182+
/// </remarks>
135183
Lazybranchcount = 29,
136-
/// <summary>back save position.</summary>
184+
185+
/// <summary>Push a null marker onto the grouping stack for quantifiers with a minimum of zero when no explicit counter is needed.</summary>
186+
/// <remarks>
187+
/// This opcode is similar to <see cref="Nullcount"/> but is used in cases where the quantified construct does not require counting;
188+
/// it pushes a null marker value (-1) onto the grouping stack in place of a saved starting position. On backtracking, the marker is simply removed.
189+
/// </remarks>
137190
Nullmark = 30,
138-
/// <summary>back save position.</summary>
191+
192+
/// <summary>Push the current input position onto the grouping stack.</summary>
193+
/// <remarks>
194+
/// Used by grouping constructs (for capturing or to detect empty matches in loops), this opcode saves the current input position
195+
/// so that the interpreter can later compare it against the then-current position to decide whether progress was made. It is the non-counting
196+
/// counterpart to <see cref="Setcount"/>.
197+
/// </remarks>
139198
Setmark = 31,
140-
/// <summary>back group define group.</summary>
199+
200+
/// <summary>Completes a capturing group.</summary>
201+
/// <remarks>
202+
/// When executed, this opcode pops a previously saved marker (the start position of the group) from the grouping stack and uses the
203+
/// current input position as the end position. Operand 0 specifies the capture slot number. If operand 1 is not -1 then a prior capture
204+
/// must have been made and a transfer of capture is performed. On backtracking, the capture is undone.
205+
/// </remarks>
141206
Capturemark = 32,
142-
/// <summary>back recall position.</summary>
207+
208+
/// <summary>Recall a previously saved marker.</summary>
209+
/// <remarks>
210+
/// This opcode restores the input position from a marker saved on the grouping stack (typically via a <see cref="Setmark"/> or
211+
/// <see cref="Nullmark"/>). It is used in lookaround constructs to revert the input position to the point where the lookaround began.
212+
/// On backtracking, the marker is re-pushed onto the grouping stack.
213+
/// </remarks>
143214
Getmark = 33,
144-
/// <summary>back save backtrack state.</summary>
215+
216+
/// <summary>Mark the beginning of a non-backtracking / atomic region.</summary>
217+
/// <remarks>
218+
/// This opcode is used at the start of constructs that must not be re-entered on backtracking (such as lookahead/lookbehind or atomic groups).
219+
/// It saves the current backtracking state (including the current tracking and crawl positions) onto the grouping stack.
220+
/// When the region is later exited (by <see cref="Forejump"/>) the saved state is used to prevent further backtracking into the region.
221+
/// </remarks>
145222
Setjump = 34,
146-
/// <summary>zap back to saved state.</summary>
223+
224+
/// <summary>Restore state for a non-backtracking / atomic region on backtracking.</summary>
225+
/// <remarks>
226+
/// Used in negative lookaround constructs, this opcode pops the saved backtracking and capture state (stored by a prior <see cref="Setjump"/>)
227+
/// and erases any changes made within the non-backtracking region. It thereby restores the state to what it was before entering the region.
228+
/// </remarks>
147229
Backjump = 35,
148-
/// <summary>zap backtracking state.</summary>
230+
231+
/// <summary>Finalize a non-backtracking / atomic region.</summary>
232+
/// <remarks>
233+
/// This opcode is used at the end of lookaround or atomic group constructs to commit to the current matching path.
234+
/// It pops the saved state from the grouping stack (stored by <see cref="Setjump"/>), updates the tracking pointer (thereby
235+
/// discarding any backtracking state from within the region), and then continues execution. On backtracking from such a region,
236+
/// a variant of this opcode will undo any captures made.
237+
/// </remarks>
149238
Forejump = 36,
150-
/// <summary>Backtrack if ref undefined.</summary>
239+
240+
/// <summary>Test whether the capture group referenced by a backreference has already matched.</summary>
241+
/// <remarks>
242+
/// Operand 0 is the capture group number to test. When executed, if the specified group has not captured any text,
243+
/// the match fails and control transfers to backtracking. Otherwise, execution continues. This opcode is used in conditional
244+
/// constructs where a branch is taken only if a given capture exists.
245+
/// </remarks>
151246
TestBackreference = 37,
152-
/// <summary>jump just go.</summary>
247+
248+
/// <summary>Unconditional jump.</summary>
249+
/// <remarks>
250+
/// Operand 0 holds the target offset. When executed, the interpreter jumps unconditionally to that location.
251+
/// This opcode is used to implement control flow for alternation and loop constructs.
252+
/// </remarks>
153253
Goto = 38,
154-
/// <summary>done!</summary>
254+
255+
/// <summary>Halt the interpreter.</summary>
256+
/// <remarks>
257+
/// This opcode marks the end of the opcode stream. When reached, the matching process terminates and the result
258+
/// (whether a match was found) is returned.
259+
/// </remarks>
155260
Stop = 40,
156261

157262
// Modifiers for alternate modes
158263

159264
/// <summary>Mask to get unmodified ordinary operator.</summary>
160265
OperatorMask = 63,
266+
161267
/// <summary>Indicates that we're reverse scanning.</summary>
162268
RightToLeft = 64,
269+
163270
/// <summary>Indicates that we're backtracking.</summary>
164271
Backtracking = 128,
272+
165273
/// <summary>Indicates that we're backtracking on a second branch.</summary>
274+
/// <remarks>
275+
/// In patterns with alternations or complex quantifiers, multiple backtracking paths may be available.
276+
/// This flag marks opcodes that are being processed on an alternate (or secondary) branch during backtracking,
277+
/// as opposed to the primary branch. The interpreter uses this flag to apply specialized state restoration
278+
/// or branch-selection logic when reverting from one branch to another.
279+
/// </remarks>
166280
BacktrackingSecond = 256,
167-
/// <summary>Indicates that we're case-insensitive</summary>
281+
282+
/// <summary>Indicates that we're case-insensitive.</summary>
168283
CaseInsensitive = 512,
169284
}
170285
}

0 commit comments

Comments
 (0)