Skip to content

Commit d885650

Browse files
[release/6.0-preview7] Eliminate backtracking in the interpreter for patterns with .* (#55960)
1 parent 71c078d commit d885650

File tree

3 files changed

+210
-18
lines changed

3 files changed

+210
-18
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 123 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ internal abstract class RegexCompiler
6464
private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!;
6565
private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
6666
private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!;
67+
private static readonly MethodInfo s_spanLastIndexOfMethod = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
6768
private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!;
6869
private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
6970
private static readonly MethodInfo s_stringIndexOfCharInt = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int) })!;
@@ -90,6 +91,7 @@ internal abstract class RegexCompiler
9091
private LocalBuilder? _runstackLocal;
9192
private LocalBuilder? _textInfoLocal; // cached to avoid extraneous TLS hits from CurrentCulture and virtual calls to TextInfo
9293
private LocalBuilder? _loopTimeoutCounterLocal; // timeout counter for setrep and setloop
94+
private LocalBuilder? _maxBacktrackPositionLocal;
9395

9496
protected RegexOptions _options; // options
9597
protected RegexCode? _code; // the RegexCode object
@@ -891,6 +893,8 @@ private void GenerateForwardSection()
891893
Mvfldloc(s_runtrackposField, _runtrackposLocal!);
892894
Mvfldloc(s_runstackField, _runstackLocal!);
893895
Mvfldloc(s_runstackposField, _runstackposLocal!);
896+
Ldc(-1);
897+
Stloc(_maxBacktrackPositionLocal!);
894898

895899
_backpos = -1;
896900

@@ -1705,7 +1709,7 @@ protected void GenerateFindFirstChar()
17051709
// if (!CharInClass(textSpan[i + 2], prefix[2], "...")) goto returnFalse;
17061710
// ...
17071711
Debug.Assert(charClassIndex == 0 || charClassIndex == 1);
1708-
for ( ; charClassIndex < _leadingCharClasses.Length; charClassIndex++)
1712+
for (; charClassIndex < _leadingCharClasses.Length; charClassIndex++)
17091713
{
17101714
Debug.Assert(needLoop);
17111715
Ldloca(textSpanLocal);
@@ -3310,6 +3314,7 @@ protected void GenerateGo()
33103314
}
33113315
_runtextbegLocal = DeclareInt32();
33123316
_runtextendLocal = DeclareInt32();
3317+
_maxBacktrackPositionLocal = DeclareInt32();
33133318

33143319
InitializeCultureForGoIfNecessary();
33153320

@@ -4258,7 +4263,61 @@ private void GenerateOneCode()
42584263
//: break Backward;
42594264
{
42604265
string str = _strings![Operand(0)];
4266+
Label multiCode = DefineLabel();
4267+
if (!IsRightToLeft())
4268+
{
4269+
// if (runtextend - runtextpos < c)
4270+
Ldloc(_runtextendLocal!);
4271+
Ldloc(_runtextposLocal!);
4272+
Sub();
4273+
Ldc(str.Length);
4274+
BgeFar(multiCode);
4275+
// if (!caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition)
4276+
if (!IsCaseInsensitive())
4277+
{
4278+
Ldloc(_maxBacktrackPositionLocal!);
4279+
Ldc(-1);
4280+
BeqFar(_backtrack);
4281+
Ldloc(_runtextposLocal!);
4282+
Ldloc(_maxBacktrackPositionLocal!);
4283+
BleFar(_backtrack);
4284+
// runtextpos = _maxBacktrackPosition;
4285+
Ldloc(_maxBacktrackPositionLocal!);
4286+
Stloc(_runtextposLocal!);
4287+
// ReadOnlySpan<char> runtextSpan = runtext.AsSpan(_maxBacktrackPosition, runtextend - _maxBacktrackPosition);
4288+
Ldloc(_runtextLocal!);
4289+
Ldloc(_maxBacktrackPositionLocal!);
4290+
Ldloc(_runtextendLocal!);
4291+
Ldloc(_maxBacktrackPositionLocal!);
4292+
Sub();
4293+
using (RentedLocalBuilder runtextSpanLocal = RentReadOnlySpanCharLocal())
4294+
{
4295+
Call(s_stringAsSpanIntIntMethod);
4296+
Stloc(runtextSpanLocal);
4297+
using (RentedLocalBuilder lastIndexOfLocal = RentInt32Local())
4298+
{
4299+
// int lastIndexOf = runtextSpan.LastIndexOf(str.AsSpan());
4300+
Ldloc(runtextSpanLocal);
4301+
Ldstr(str);
4302+
Call(s_stringAsSpanMethod);
4303+
Call(s_spanLastIndexOfMethod);
4304+
Stloc(lastIndexOfLocal);
4305+
// if (lastIndexOf > -1)
4306+
Ldloc(lastIndexOfLocal);
4307+
Ldc(-1);
4308+
BleFar(_backtrack);
4309+
// runtextpos = lastIndexOf + _maxBacktrackPosition;
4310+
Ldloc(lastIndexOfLocal);
4311+
Ldloc(_maxBacktrackPositionLocal!);
4312+
Add();
4313+
Stloc(_runtextposLocal!);
4314+
BrFar(_backtrack);
4315+
}
4316+
}
4317+
}
4318+
}
42614319

4320+
MarkLabel(multiCode);
42624321
Ldc(str.Length);
42634322
Ldloc(_runtextendLocal!);
42644323
Ldloc(_runtextposLocal!);
@@ -4598,6 +4657,9 @@ private void GenerateOneCode()
45984657

45994658
using RentedLocalBuilder lenLocal = RentInt32Local();
46004659
using RentedLocalBuilder iLocal = RentInt32Local();
4660+
using RentedLocalBuilder tempMaxBacktrackPositionLocal = RentInt32Local();
4661+
Ldloc(_runtextposLocal!);
4662+
Stloc(tempMaxBacktrackPositionLocal);
46014663

46024664
if (!IsRightToLeft())
46034665
{
@@ -4847,6 +4909,12 @@ private void GenerateOneCode()
48474909
DoPush();
48484910

48494911
Track();
4912+
// if (_operator == RegexCode.Notoneloop) maxBacktrackPosition = tempMaxBacktrackPosition
4913+
if (_regexopcode == RegexCode.Notoneloop)
4914+
{
4915+
Ldloc(tempMaxBacktrackPositionLocal);
4916+
Stloc(_maxBacktrackPositionLocal!);
4917+
}
48504918
}
48514919
break;
48524920
}
@@ -4870,28 +4938,66 @@ private void GenerateOneCode()
48704938
//: if (i > 0)
48714939
//: Track(i - 1, pos - 1);
48724940
//: Advance(2);
4873-
PopTrack();
4874-
Stloc(_runtextposLocal!);
4941+
Label noBacktrackPositionBranch = DefineLabel();
48754942
PopTrack();
48764943
using (RentedLocalBuilder posLocal = RentInt32Local())
48774944
{
48784945
Stloc(posLocal);
4879-
Ldloc(posLocal);
4880-
Ldc(0);
4881-
BleFar(AdvanceLabel());
4946+
PopTrack();
4947+
using (RentedLocalBuilder iBacktrackLocal = RentInt32Local())
4948+
{
4949+
Stloc(iBacktrackLocal);
4950+
// if (!caseInsensitive && maxBacktrackPosition != -1 && pos > maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft)
4951+
if (!IsCaseInsensitive() && _regexopcode == (RegexCode.Notoneloop | RegexCode.Back) && !IsRightToLeft())
4952+
{
4953+
Ldloc(_maxBacktrackPositionLocal!);
4954+
Ldc(-1);
4955+
Beq(noBacktrackPositionBranch);
4956+
Ldloc(posLocal);
4957+
Ldloc(_maxBacktrackPositionLocal!);
4958+
Ble(noBacktrackPositionBranch);
4959+
Ldloc(_runtextposLocal!);
4960+
Ldloc(posLocal);
4961+
Bge(noBacktrackPositionBranch);
4962+
/*
4963+
int difference = pos - maxBacktrackPosition;
4964+
pos = runtextpos;
4965+
i -= difference;
4966+
maxBacktrackPosition = -1;
4967+
*/
4968+
// i -= (pos - maxBacktrackPosition); the difference is computed on the evaluation stack rather than stored in a separate local.
4969+
Ldloc(iBacktrackLocal);
4970+
Ldloc(posLocal);
4971+
Ldloc(_maxBacktrackPositionLocal!);
4972+
Sub();
4973+
Sub();
4974+
Stloc(iBacktrackLocal);
4975+
Ldloc(_runtextposLocal!);
4976+
Stloc(posLocal);
4977+
Ldc(-1);
4978+
Stloc(_maxBacktrackPositionLocal!);
4979+
}
4980+
4981+
MarkLabel(noBacktrackPositionBranch);
4982+
Ldloc(posLocal);
4983+
Stloc(_runtextposLocal!);
4984+
Ldloc(iBacktrackLocal);
4985+
Ldc(0);
4986+
BleFar(AdvanceLabel());
4987+
ReadyPushTrack();
4988+
Ldloc(iBacktrackLocal);
4989+
}
4990+
Ldc(1);
4991+
Sub();
4992+
DoPush();
48824993
ReadyPushTrack();
4883-
Ldloc(posLocal);
4994+
Ldloc(_runtextposLocal!);
4995+
Ldc(1);
4996+
Sub(IsRightToLeft());
4997+
DoPush();
4998+
Trackagain();
4999+
Advance();
48845000
}
4885-
Ldc(1);
4886-
Sub();
4887-
DoPush();
4888-
ReadyPushTrack();
4889-
Ldloc(_runtextposLocal!);
4890-
Ldc(1);
4891-
Sub(IsRightToLeft());
4892-
DoPush();
4893-
Trackagain();
4894-
Advance();
48955001
break;
48965002

48975003
case RegexCode.Onelazy:

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ internal sealed class RegexInterpreter : RegexRunner
2020
private int _codepos;
2121
private bool _rightToLeft;
2222
private bool _caseInsensitive;
23+
private int _maxBacktrackPosition = -1;
2324

2425
public RegexInterpreter(RegexCode code, CultureInfo culture)
2526
{
@@ -223,6 +224,20 @@ private bool MatchString(string str)
223224
{
224225
if (runtextend - runtextpos < c)
225226
{
227+
// If MatchString was called after a greedy op such as a .*, we would have zipped runtextpos to the end without really examining any characters. Reset runtextpos to _maxBacktrackPosition here as an optimization.
228+
if (!_caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition)
229+
{
230+
// If lastIndexOf is -1, we backtrack to the max extent possible.
231+
runtextpos = _maxBacktrackPosition;
232+
ReadOnlySpan<char> runtextSpan = runtext.AsSpan(_maxBacktrackPosition, runtextend - _maxBacktrackPosition);
233+
int lastIndexOf = runtextSpan.LastIndexOf(str);
234+
if (lastIndexOf > -1)
235+
{
236+
// Found the next position to match. Move runtextpos here
237+
runtextpos = _maxBacktrackPosition + lastIndexOf;
238+
}
239+
}
240+
226241
return false;
227242
}
228243

@@ -1185,6 +1200,7 @@ protected override void Go()
11851200
int len = Math.Min(Operand(1), Forwardchars());
11861201
char ch = (char)Operand(0);
11871202
int i;
1203+
int tempMaxBacktrackPosition = runtextpos;
11881204

11891205
if (!_rightToLeft && !_caseInsensitive)
11901206
{
@@ -1217,6 +1233,7 @@ protected override void Go()
12171233
if (len > i && _operator == RegexCode.Notoneloop)
12181234
{
12191235
TrackPush(len - i - 1, runtextpos - Bump());
1236+
_maxBacktrackPosition = tempMaxBacktrackPosition;
12201237
}
12211238
}
12221239
advance = 2;
@@ -1261,6 +1278,16 @@ protected override void Go()
12611278
{
12621279
int i = TrackPeek();
12631280
int pos = TrackPeek(1);
1281+
if (!_caseInsensitive && _maxBacktrackPosition != -1 && pos > _maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft)
1282+
{
1283+
// The Multi node has bumped us along already
1284+
int difference = pos - _maxBacktrackPosition;
1285+
Debug.Assert(difference > 0);
1286+
pos = runtextpos;
1287+
i -= difference;
1288+
// We shouldn't be backtracking anymore.
1289+
_maxBacktrackPosition = -1;
1290+
}
12641291
runtextpos = pos;
12651292
if (i > 0)
12661293
{

0 commit comments

Comments
 (0)