Skip to content

Commit fb0ebb5

Browse files
committed
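Use an offset to traverse the input while matching
Removes unnecessary string allocations when detecting a match: the tokenizer now passes a start offset to Regex.Match (with \G-anchored patterns) instead of taking a Substring of the remaining text after every token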
1 parent 3f69b59 commit fb0ebb5

File tree

5 files changed

+22
-21
lines changed

5 files changed

+22
-21
lines changed

src/TurnerSoftware.RobotsExclusionTools/Tokenization/TokenDefinition.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,15 @@ public TokenDefinition(TokenType tokenType, string regex)
1616
TokenType = tokenType;
1717
}
1818

19-
public TokenMatch Match(string input)
19+
public TokenMatch Match(string input, int offset = 0)
2020
{
21-
var match = Regex.Match(input);
21+
var match = Regex.Match(input, offset);
2222
if (match.Success)
2323
{
2424
return new TokenMatch
2525
{
2626
IsMatch = true,
27-
RemainingText = input.Substring(match.Length),
27+
MatchLength = match.Length,
2828
TokenType = TokenType,
2929
Value = match.Value
3030
};

src/TurnerSoftware.RobotsExclusionTools/Tokenization/TokenMatch.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,6 @@ public class TokenMatch
99
public bool IsMatch { get; set; }
1010
public TokenType TokenType { get; set; }
1111
public string Value { get; set; }
12-
public string RemainingText { get; set; }
12+
public int MatchLength { get; set; }
1313
}
1414
}

src/TurnerSoftware.RobotsExclusionTools/Tokenization/TokenizerBase.cs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,20 @@ public abstract class TokenizerBase : ITokenizer
1717
public IEnumerable<Token> Tokenize(string text)
1818
{
1919
var tokens = new List<Token>();
20-
var remainingText = text;
20+
var offset = 0;
21+
var numberOfChars = text.Length;
2122

22-
while (!string.IsNullOrWhiteSpace(remainingText))
23+
while (offset < numberOfChars)
2324
{
24-
var match = FindMatch(remainingText);
25+
var match = FindMatch(text, offset);
2526
if (match.IsMatch)
2627
{
2728
tokens.Add(new Token(match.TokenType, match.Value));
28-
remainingText = match.RemainingText;
29+
offset += match.MatchLength;
2930
}
3031
else
3132
{
32-
remainingText = remainingText.Substring(1);
33+
offset++;
3334
}
3435
}
3536

@@ -60,11 +61,11 @@ public async Task<IEnumerable<Token>> TokenizeAsync(TextReader reader)
6061
return tokens;
6162
}
6263

63-
private TokenMatch FindMatch(string text)
64+
private TokenMatch FindMatch(string text, int offset)
6465
{
6566
foreach (var tokenDefinition in GetTokenDefinitions())
6667
{
67-
var match = tokenDefinition.Match(text);
68+
var match = tokenDefinition.Match(text, offset);
6869
if (match.IsMatch)
6970
{
7071
return match;

src/TurnerSoftware.RobotsExclusionTools/Tokenization/Tokenizers/RobotsFileTokenizer.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,11 @@ public class RobotsFileTokenizer : TokenizerBase
3333
//This can be expressed as the following:
3434
//\x21\x23-\x27\x2a\x2b\x2d\x2e\x41-\x5a\x5e-\x7a\x7c\x7e
3535

36-
new TokenDefinition(TokenType.Comment, @"^#[^\x0A\x0D]*"),
37-
new TokenDefinition(TokenType.Field, @"^[\x21\x23-\x27\x2a\x2b\x2d\x2e\x41-\x5a\x5e-\x7a\x7c\x7e]+(?=:[ ])"),
38-
new TokenDefinition(TokenType.FieldValueDelimiter, "^:[ ]"),
39-
new TokenDefinition(TokenType.Value, @"^[^\x0A\x0D#]+"),
40-
new TokenDefinition(TokenType.NewLine, @"^\x0D?\x0A")
36+
new TokenDefinition(TokenType.Comment, @"\G#[^\x0A\x0D]*"),
37+
new TokenDefinition(TokenType.Field, @"\G[\x21\x23-\x27\x2a\x2b\x2d\x2e\x41-\x5a\x5e-\x7a\x7c\x7e]+(?=:[ ])"),
38+
new TokenDefinition(TokenType.FieldValueDelimiter, @"\G:[ ]"),
39+
new TokenDefinition(TokenType.Value, @"\G[^\x0A\x0D#]+"),
40+
new TokenDefinition(TokenType.NewLine, @"\G\x0D?\x0A")
4141
};
4242

4343
protected override IEnumerable<TokenDefinition> GetTokenDefinitions()

src/TurnerSoftware.RobotsExclusionTools/Tokenization/Tokenizers/RobotsPageTokenizer.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ public class RobotsPageTokenizer : TokenizerBase
99
private static readonly IEnumerable<TokenDefinition> TokenDefinitions = new []
1010
{
1111
//Based on the same character restriction rules as the RobotsFileTokenizer
12-
new TokenDefinition(TokenType.Field, @"^[\x21\x23-\x27\x2a\x2b\x2d\x2e\x41-\x5a\x5e-\x7a\x7c\x7e]+(?=:[ ])"),
13-
new TokenDefinition(TokenType.FieldValueDelimiter, "^:[ ]"),
14-
new TokenDefinition(TokenType.Value, @"^[^\x0A\x0D#,]+"),
15-
new TokenDefinition(TokenType.NewLine, @"^\x0D?\x0A"),
16-
new TokenDefinition(TokenType.ValueDelimiter, "^,[ ]+")
12+
new TokenDefinition(TokenType.Field, @"\G[\x21\x23-\x27\x2a\x2b\x2d\x2e\x41-\x5a\x5e-\x7a\x7c\x7e]+(?=:[ ])"),
13+
new TokenDefinition(TokenType.FieldValueDelimiter, @"\G:[ ]"),
14+
new TokenDefinition(TokenType.Value, @"\G[^\x0A\x0D#,]+"),
15+
new TokenDefinition(TokenType.NewLine, @"\G\x0D?\x0A"),
16+
new TokenDefinition(TokenType.ValueDelimiter, @"\G,[ ]+")
1717
};
1818

1919
protected override IEnumerable<TokenDefinition> GetTokenDefinitions()

0 commit comments

Comments
 (0)