Skip to content

[RegexDiff X64] [stephentoub] Remove capture groups from negative lookarounds #1285

@MihuBot

Description

@MihuBot

Job completed in 17 minutes 34 seconds (remote runner delay: 44 seconds).
dotnet/runtime#118084
Using arguments: regexdiff

70 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"&(?!(amp;)|(lt;)|(gt;)|(quot;))" (1847 uses)
[GeneratedRegex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase)]
  /// ○ Match '&amp;'.<br/>
  /// ○ Zero-width negative lookahead.<br/>
  ///     ○ Match with 4 alternative expressions, atomically.<br/>
-   ///         ○ 1st capture group.<br/>
+   ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match a character in the set [Aa].<br/>
  ///             ○ Match a character in the set [Mm].<br/>
  ///             ○ Match a character in the set [Pp].<br/>
  ///             ○ Match ';'.<br/>
-   ///         ○ 2nd capture group.<br/>
+   ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match a character in the set [Ll].<br/>
  ///             ○ Match a character in the set [Tt].<br/>
  ///             ○ Match ';'.<br/>
-   ///         ○ 3rd capture group.<br/>
+   ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match a character in the set [Gg].<br/>
  ///             ○ Match a character in the set [Tt].<br/>
  ///             ○ Match ';'.<br/>
-   ///         ○ 4th capture group.<br/>
+   ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match a character in the set [Qq].<br/>
  ///             ○ Match a character in the set [Uu].<br/>
  ///             ○ Match a character in the set [Oo].<br/>
              {
                  int pos = base.runtextpos;
                  int matchStart = pos;
-                   int capture_starting_pos = 0;
-                   int capture_starting_pos1 = 0;
-                   int capture_starting_pos2 = 0;
-                   int capture_starting_pos3 = 0;
-                   int negativelookahead__capture_pos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Match '&'.
                  if (slice.IsEmpty || slice[0] != '&')
                  {
-                       UncaptureUntil(0);
                      return false; // The input didn't match.
                  }
                  
                          base.CheckTimeout();
                      }
                      
-                       negativelookahead__capture_pos = base.Crawlpos();
                      // Match with 4 alternative expressions, atomically.
                      {
                          if ((uint)slice.Length < 2)
                          switch (slice[1])
                          {
                              case 'A' or 'a':
-                                   // 1st capture group.
+                                   
+                                   if ((uint)slice.Length < 5 ||
+                                       !slice.Slice(2).StartsWith("mp;", StringComparison.OrdinalIgnoreCase)) // Match the string "mp;" (ordinal case-insensitive)
                                  {
-                                       pos++;
-                                       slice = inputSpan.Slice(pos);
-                                       capture_starting_pos = pos;
-                                       
-                                       if ((uint)slice.Length < 4 ||
-                                           !slice.StartsWith("amp;", StringComparison.OrdinalIgnoreCase)) // Match the string "amp;" (ordinal case-insensitive)
-                                       {
-                                           goto NegativeLookaroundMatch;
-                                       }
-                                       
-                                       pos += 4;
-                                       slice = inputSpan.Slice(pos);
-                                       base.Capture(1, capture_starting_pos, pos);
+                                       goto NegativeLookaroundMatch;
                                  }
                                  
+                                   pos += 5;
+                                   slice = inputSpan.Slice(pos);
                                  break;
                                  
                              case 'L' or 'l':
-                                   // 2nd capture group.
+                                   
+                                   if ((uint)slice.Length < 4 ||
+                                       !slice.Slice(2).StartsWith("t;", StringComparison.OrdinalIgnoreCase)) // Match the string "t;" (ordinal case-insensitive)
                                  {
-                                       pos++;
-                                       slice = inputSpan.Slice(pos);
-                                       capture_starting_pos1 = pos;
-                                       
-                                       if ((uint)slice.Length < 3 ||
-                                           !slice.StartsWith("lt;", StringComparison.OrdinalIgnoreCase)) // Match the string "lt;" (ordinal case-insensitive)
-                                       {
-                                           goto NegativeLookaroundMatch;
-                                       }
-                                       
-                                       pos += 3;
-                                       slice = inputSpan.Slice(pos);
-                                       base.Capture(2, capture_starting_pos1, pos);
+                                       goto NegativeLookaroundMatch;
                                  }
                                  
+                                   pos += 4;
+                                   slice = inputSpan.Slice(pos);
                                  break;
                                  
                              case 'G' or 'g':
-                                   // 3rd capture group.
+                                   
+                                   if ((uint)slice.Length < 4 ||
+                                       !slice.Slice(2).StartsWith("t;", StringComparison.OrdinalIgnoreCase)) // Match the string "t;" (ordinal case-insensitive)
                                  {
-                                       pos++;
-                                       slice = inputSpan.Slice(pos);
-                                       capture_starting_pos2 = pos;
-                                       
-                                       if ((uint)slice.Length < 3 ||
-                                           !slice.StartsWith("gt;", StringComparison.OrdinalIgnoreCase)) // Match the string "gt;" (ordinal case-insensitive)
-                                       {
-                                           goto NegativeLookaroundMatch;
-                                       }
-                                       
-                                       pos += 3;
-                                       slice = inputSpan.Slice(pos);
-                                       base.Capture(3, capture_starting_pos2, pos);
+                                       goto NegativeLookaroundMatch;
                                  }
                                  
+                                   pos += 4;
+                                   slice = inputSpan.Slice(pos);
                                  break;
                                  
                              case 'Q' or 'q':
-                                   // 4th capture group.
+                                   
+                                   if ((uint)slice.Length < 6 ||
+                                       !slice.Slice(2).StartsWith("uot;", StringComparison.OrdinalIgnoreCase)) // Match the string "uot;" (ordinal case-insensitive)
                                  {
-                                       pos++;
-                                       slice = inputSpan.Slice(pos);
-                                       capture_starting_pos3 = pos;
-                                       
-                                       if ((uint)slice.Length < 5 ||
-                                           !slice.StartsWith("quot;", StringComparison.OrdinalIgnoreCase)) // Match the string "quot;" (ordinal case-insensitive)
-                                       {
-                                           goto NegativeLookaroundMatch;
-                                       }
-                                       
-                                       pos += 5;
-                                       slice = inputSpan.Slice(pos);
-                                       base.Capture(4, capture_starting_pos3, pos);
+                                       goto NegativeLookaroundMatch;
                                  }
                                  
+                                   pos += 6;
+                                   slice = inputSpan.Slice(pos);
                                  break;
                                  
                              default:
                          }
                      }
                      
-                       UncaptureUntil(0);
                      return false; // The input didn't match.
                      
                      NegativeLookaroundMatch:
                      pos = negativelookahead__starting_pos;
                      slice = inputSpan.Slice(pos);
-                       UncaptureUntil(negativelookahead__capture_pos);
                  }
                  
                  // The input matched.
                  base.runtextpos = pos;
                  base.Capture(0, matchStart, pos);
                  return true;
-                   
-                   // <summary>Undo captures until it reaches the specified capture position.</summary>
-                   [MethodImpl(MethodImplOptions.AggressiveInlining)]
-                   void UncaptureUntil(int capturePosition)
-                   {
-                       while (base.Crawlpos() > capturePosition)
-                       {
-                           base.Uncapture();
-                       }
-                   }
              }
          }
      }
"&(?!(amp;)|(lt;)|(gt;)|(quot;)|(nbsp;)|(reg;))" (783 uses)
[GeneratedRegex("&(?!(amp;)|(lt;)|(gt;)|(quot;)|(nbsp;)|(reg;))", RegexOptions.IgnoreCase)]
  /// ○ Match '&amp;'.<br/>
  /// ○ Zero-width negative lookahead.<br/>
  ///     ○ Match with 6 alternative expressions, atomically.<br/>
-   ///         ○ 1st capture group.<br/>
+   ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match a character in the set [Aa].<br/>
  ///             ○ Match a character in the set [Mm].<br/>
  ///             ○ Match a character in the set [Pp].<br/>
  ///             ○ Match ';'.<br/>
-   ///         ○ 2nd capture group.<br/>
+   ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match a character in the set [Ll].<br/>
  ///             ○ Match a character in the set [Tt].<br/>
  ///             ○ Match ';'.<br/>
-   ///         ○ 3rd capture group.<br/>
+   ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match a character in the set [Gg].<br/>
  ///             ○ Match a character in the set [Tt].<br/>
  ///             ○ Match ';'.<br/>
-   ///         ○ 4th capture group.<br/>
+   ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match a character in the set [Qq].<br/>
  ///             ○ Match a character in the set [Uu].<br/>
  ///             ○ Match a character in the set [Oo].<br/>
  ///             ○ Match a character in the set [Tt].<br/>
  ///             ○ Match ';'.<br/>
-   ///         ○ 5th capture group.<br/>
+   ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match a character in the set [Nn].<br/>
  ///             ○ Match a character in the set [Bb].<br/>
  ///             ○ Match a character in the set [Ss].<br/>
  ///             ○ Match a character in the set [Pp].<br/>
  ///             ○ Match ';'.<br/>
-   ///         ○ 6th capture group.<br/>
+   ///         ○ Match a sequence of expressions.<br/>
  ///             ○ Match a character in the set [Rr].<br/>
  ///             ○ Match a character in the set [Ee].<br/>
  ///             ○ Match a character in the set [Gg].<br/>
              {
                  int pos = base.runtextpos;
                  int matchStart = pos;
-                   int capture_starting_pos = 0;
-                   int capture_starting_pos1 = 0;
-                   int capture_starting_pos2 = 0;
-                   int capture_starting_pos3 = 0;
-                   int capture_starting_pos4 = 0;
-                   int capture_starting_pos5 = 0;
-                   int negativelookahead__capture_pos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Match '&'.
                  if (slice.IsEmpty || slice[0] != '&')
                  {
-                       UncaptureUntil(0);
                      return false; // The input didn't match.
                  }
                  
                          base.CheckTimeout();
                      }
                      
-                       negativelookahead__capture_pos = base.Crawlpos();
                      // Match with 6 alternative expressions, atomically.
                      {
                          if ((uint)slice.Length < 2)
                          switch (slice[1])
                          {
                              case 'A' or 'a':
-                                   // 1st capture group.
+                                   
+                                   if ((uint)slice.Length < 5 ||
+                                       !slice.Slice(2).StartsWith("mp;", StringComparison.OrdinalIgnoreCase)) // Match the string "mp;" (ordinal case-insensitive)
                                  {
-                                       pos++;
-                                       slice = inputSpan.Slice(pos);
-                                       capture_starting_pos = pos;
-                                       
-                                       if ((uint)slice.Length < 4 ||
-                                           !slice.StartsWith("amp;", StringComparison.OrdinalIgnoreCase)) // Match the string "amp;" (ordinal case-insensitive)
-                                       {
-                                           goto NegativeLookaroundMatch;
-                                       }
-                                       
-                                       pos += 4;
-                                       slice = inputSpan.Slice(pos);
-                                       base.Capture(1, capture_starting_pos, pos);
+                                       goto NegativeLookaroundMatch;
                                  }
                                  
+                                   pos += 5;
+                                   slice = inputSpan.Slice(pos);
                                  break;
                                  
                              case 'L' or 'l':
-                                   // 2nd capture group.
+                                   
+                                   if ((uint)slice.Length < 4 ||
+                                       !slice.Slice(2).StartsWith("t;", StringComparison.OrdinalIgnoreCase)) // Match the string "t;" (ordinal case-insensitive)
                                  {
-                                       pos++;
-                                       slice = inputSpan.Slice(pos);
-                                       capture_starting_pos1 = pos;
-                                       
-                                       if ((uint)slice.Length < 3 ||
-                                           !slice.StartsWith("lt;", StringComparison.OrdinalIgnoreCase)) // Match the string "lt;" (ordinal case-insensitive)
-                                       {
-                                           goto NegativeLookaroundMatch;
-                                       }
-                                       
-                                       pos += 3;
-                                       slice = inputSpan.Slice(pos);
-                                       base.Capture(2, capture_starting_pos1, pos);
+                                       goto NegativeLookaroundMatch;
                                  }
                                  
+                                   pos += 4;
+                                   slice = inputSpan.Slice(pos);
                                  break;
                                  
                              case 'G' or 'g':
-                                   // 3rd capture group.
+                                   
+                                   if ((uint)slice.Length < 4 ||
+                                       !slice.Slice(2).StartsWith("t;", StringComparison.OrdinalIgnoreCase)) // Match the string "t;" (ordinal case-insensitive)
                                  {
-                                       pos++;
-                                       slice = inputSpan.Slice(pos);
-                                       capture_starting_pos2 = pos;
-                                       
-                                       if ((uint)slice.Length < 3 ||
-                                           !slice.StartsWith("gt;", StringComparison.OrdinalIgnoreCase)) // Match the string "gt;" (ordinal case-insensitive)
-                                       {
-                                           goto NegativeLookaroundMatch;
-                                       }
-                                       
-                                       pos += 3;
-                                       slice = inputSpan.Slice(pos);
-                                       base.Capture(3, capture_starting_pos2, pos);
+                                       goto NegativeLookaroundMatch;
                                  }
                                  
+                                   pos += 4;
+                                   slice = inputSpan.Slice(pos);
                                  break;
                                  
                              case 'Q' or 'q':
-                                   // 4th capture group.
+                                   
+                                   if ((uint)slice.Length < 6 ||
+                                       !slice.Slice(2).StartsWith("uot;", StringComparison.OrdinalIgnoreCase)) // Match the string "uot;" (ordinal case-insensitive)
                                  {
-                                       pos++;
-                                       slice = inputSpan.Slice(pos);
-                                       capture_starting_pos3 = pos;
-                                       
-                                       if ((uint)slice.Length < 5 ||
-                                           !slice.StartsWith("quot;", StringComparison.OrdinalIgnoreCase)) // Match the string "quot;" (ordinal case-insensitive)
-                                       {
-                                           goto NegativeLookaroundMatch;
-                                       }
-                                       
-                                       pos += 5;
-                                       slice = inputSpan.Slice(pos);
-                                       base.Capture(4, capture_starting_pos3, pos);
+                                       goto NegativeLookaroundMatch;
                                  }
                                  
+                                   pos += 6;
+                                   slice = inputSpan.Slice(pos);
                                  break;
                                  
                              case 'N' or 'n':
-                                   // 5th capture group.
+                                   
+                                   if ((uint)slice.Length < 6 ||
+                                       !slice.Slice(2).StartsWith("bsp;", StringComparison.OrdinalIgnoreCase)) // Match the string "bsp;" (ordinal case-insensitive)
                                  {
-                                       pos++;
-                                       slice = inputSpan.Slice(pos);
-                                       capture_starting_pos4 = pos;
-                                       
-                                       if ((uint)slice.Length < 5 ||
-                                           !slice.StartsWith("nbsp;", StringComparison.OrdinalIgnoreCase)) // Match the string "nbsp;" (ordinal case-insensitive)
-                                       {
-                                           goto NegativeLookaroundMatch;
-                                       }
-                                       
-                                       pos += 5;
-                                       slice = inputSpan.Slice(pos);
-                                       base.Capture(5, capture_starting_pos4, pos);
+                                       goto NegativeLookaroundMatch;
                                  }
                                  
+                                   pos += 6;
+                                   slice = inputSpan.Slice(pos);
                                  break;
                                  
                              case 'R' or 'r':
-                                   // 6th capture group.
+                                   
+                                   if ((uint)slice.Length < 5 ||
+                                       !slice.Slice(2).StartsWith("eg;", StringComparison.OrdinalIgnoreCase)) // Match the string "eg;" (ordinal case-insensitive)
                                  {
-                                       pos++;
-                                       slice = inputSpan.Slice(pos);
-                                       capture_starting_pos5 = pos;
-                                       
-                                       if ((uint)slice.Length < 4 ||
-                                           !slice.StartsWith("reg;", StringComparison.OrdinalIgnoreCase)) // Match the string "reg;" (ordinal case-insensitive)
-                                       {
-                                           goto NegativeLookaroundMatch;
-                                       }
-                                       
-                                       pos += 4;
-                                       slice = inputSpan.Slice(pos);
-                                       base.Capture(6, capture_starting_pos5, pos);
+                                       goto NegativeLookaroundMatch;
                                  }
                                  
+                                   pos += 5;
+                                   slice = inputSpan.Slice(pos);
                                  break;
                                  
                              default:
                          }
                      }
                      
-                       UncaptureUntil(0);
                      return false; // The input didn't match.
                      
                      NegativeLookaroundMatch:
                      pos = negativelookahead__starting_pos;
                      slice = inputSpan.Slice(pos);
-                       UncaptureUntil(negativelookahead__capture_pos);
                  }
                  
                  // The input matched.
                  base.runtextpos = pos;
                  base.Capture(0, matchStart, pos);
                  return true;
-                   
-                   // <summary>Undo captures until it reaches the specified capture position.</summary>
-                   [MethodImpl(MethodImplOptions.AggressiveInlining)]
-                   void UncaptureUntil(int capturePosition)
-                   {
-                       while (base.Crawlpos() > capturePosition)
-                       {
-                           base.Uncapture();
-                       }
-                   }
              }
          }
      }
"(?!(^[A-Z]))([A-Z])" (70 uses)
[GeneratedRegex("(?!(^[A-Z]))([A-Z])")]
  /// Explanation:<br/>
  /// <code>
  /// ○ Zero-width negative lookahead.<br/>
-   ///     ○ 1st capture group.<br/>
-   ///         ○ Match if at the beginning of the string.<br/>
-   ///         ○ Match a character in the set [A-Z].<br/>
+   ///     ○ Match if at the beginning of the string.<br/>
+   ///     ○ Match a character in the set [A-Z].<br/>
  /// ○ 2nd capture group.<br/>
  ///     ○ Match a character in the set [A-Z].<br/>
  /// </code>
                  int pos = base.runtextpos;
                  int matchStart = pos;
                  int capture_starting_pos = 0;
-                   int capture_starting_pos1 = 0;
-                   int negativelookahead__capture_pos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Zero-width negative lookahead.
                          base.CheckTimeout();
                      }
                      
-                       negativelookahead__capture_pos = base.Crawlpos();
-                       // 1st capture group.
+                       // Match if at the beginning of the string.
+                       if (pos != 0)
                      {
-                           capture_starting_pos = pos;
-                           
-                           // Match if at the beginning of the string.
-                           if (pos != 0)
-                           {
-                               goto NegativeLookaroundMatch;
-                           }
-                           
-                           // Match a character in the set [A-Z].
-                           if (slice.IsEmpty || !char.IsAsciiLetterUpper(slice[0]))
-                           {
-                               goto NegativeLookaroundMatch;
-                           }
-                           
-                           pos++;
-                           slice = inputSpan.Slice(pos);
-                           base.Capture(1, capture_starting_pos, pos);
+                           goto NegativeLookaroundMatch;
+                       }
+                       
+                       // Match a character in the set [A-Z].
+                       if (slice.IsEmpty || !char.IsAsciiLetterUpper(slice[0]))
+                       {
+                           goto NegativeLookaroundMatch;
                      }
                      
                      UncaptureUntil(0);
                      NegativeLookaroundMatch:
                      pos = negativelookahead__starting_pos;
                      slice = inputSpan.Slice(pos);
-                       UncaptureUntil(negativelookahead__capture_pos);
                  }
                  
                  // 2nd capture group.
                  {
-                       capture_starting_pos1 = pos;
+                       capture_starting_pos = pos;
                      
                      // Match a character in the set [A-Z].
                      if (slice.IsEmpty || !char.IsAsciiLetterUpper(slice[0]))
                      
                      pos++;
                      slice = inputSpan.Slice(pos);
-                       base.Capture(2, capture_starting_pos1, pos);
+                       base.Capture(2, capture_starting_pos, pos);
                  }
                  
                  // The input matched.

For more diff examples, see https://gist.github.com/MihuBot/92ee92ddd485ed9c74d00540c589475f

Total bytes of base: 54274946
Total bytes of diff: 54227335
Total bytes of delta: -47611 (-0.09 % of base)
Total relative delta: -14.88
    diff is an improvement.
    relative diff is an improvement.

For a list of JIT diff improvements, see Improvements.md

Sample source code for further analysis
// Sample analysis script: ensures the regex-diff results exist locally, then loads them.
const string JsonPath = "RegexResults-1285.json";

// Download and unpack the results archive only when no local copy exists yet,
// so repeated runs reuse the previously extracted file.
if (!File.Exists(JsonPath))
{
    // Own the client in a using declaration so its handler and connections are
    // released once the download is finished.
    using var httpClient = new HttpClient();
    await using var archiveStream = await httpClient.GetStreamAsync("https://mihubot.xyz/r/E2kAZabA");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);

    // First(...) throws if the archive has no "Results.json" entry.
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);

// IncludeFields makes the serializer populate public fields in addition to properties.
// Deserialize returns null for a JSON 'null' document; fail with a descriptive error
// instead of a NullReferenceException on the next line.
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })
    ?? throw new InvalidDataException($"'{JsonPath}' deserialized to null; expected a JSON array of regex entries.");
Console.WriteLine($"Working with {entries.Length} patterns");



/// <summary>
/// Identifies one regex from the results file by its pattern text and options.
/// </summary>
/// <param name="Pattern">The regular expression pattern string.</param>
/// <param name="Options">The <see cref="RegexOptions"/> the pattern is used with.</param>
/// <param name="Count">
/// NOTE(review): presumably the number of known uses of this pattern/options pair
/// (the report lists e.g. "1847 uses" next to each pattern) - confirm against the data.
/// </param>
record KnownPattern(string Pattern, RegexOptions Options, int Count);

/// <summary>
/// One element of the results JSON array that the sample script deserializes.
/// </summary>
/// <remarks>
/// The three <c>required</c> members must be supplied for every entry; the nullable
/// members may be absent.
/// </remarks>
sealed class RegexEntry
{
    /// <summary>The pattern, options and count this entry describes.</summary>
    public required KnownPattern Regex { get; set; }

    /// <summary>
    /// NOTE(review): presumably the source generated for this regex by the baseline (main) build - confirm.
    /// </summary>
    public required string MainSource { get; set; }

    /// <summary>
    /// NOTE(review): presumably the source generated for this regex by the pull-request build - confirm.
    /// </summary>
    public required string PrSource { get; set; }

    /// <summary>
    /// NOTE(review): presumably the complete diff between the two sources; may be null - confirm when it is omitted.
    /// </summary>
    public string? FullDiff { get; set; }

    /// <summary>
    /// NOTE(review): presumably an abbreviated diff like the excerpts shown in the report; may be null - confirm.
    /// </summary>
    public string? ShortDiff { get; set; }

    /// <summary>
    /// Optional list of (name, values) pairs.
    /// NOTE(review): looks like the char-based SearchValues instances emitted by the generated code - confirm.
    /// </summary>
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }

    /// <summary>
    /// Optional list of (values, comparison type) pairs.
    /// NOTE(review): looks like the string-based SearchValues instances emitted by the generated code - confirm.
    /// </summary>
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions