Skip to content

Commit

Permalink
fixes dlclark#24 changes $ behavior in singleline when using RE2 and …
Browse files Browse the repository at this point in the history
…ECMAScript modes

PCRE and .NET have a different definition of $ than RE2 and ECMAScript
engines in singleline mode.  PCRE defines it as "$ asserts position at the end
of the string, or before the line terminator right at the end of the string (if any)."
This means that a pattern of "^ac$\n" is valid and can match "ac\n", and that a pattern of "^ac$" can match "ac\n" OR "ac".

This behavior is different in RE2 and ECMAScript engines.  For these engines the
pattern "^ac$\n" won't match any inputs in singleline mode because the $ demands the
string ends but the pattern requires an extra \n so they both cannot be true.

The PCRE/.NET behavior feels wrong, but for this project I maintain compatibility with
them in "default" mode.  The other, less surprising behavior is enabled by using either
the RE2 option or the ECMAScript option.
  • Loading branch information
dlclark committed Aug 25, 2020
1 parent f48b8c1 commit 0537a49
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 1 deletion.
23 changes: 23 additions & 0 deletions regexp_re2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,26 @@ func TestRE2NamedAscii_Concat(t *testing.T) {
t.Fatal("Expected match")
}
}

// TestRE2Dollar_Singleline checks that, with the RE2 option and without
// Multiline, the pattern `^ac$\n` matches neither "ac" nor "ac\n".
func TestRE2Dollar_Singleline(t *testing.T) {
	// PCRE allows for \n after the $ and RE2 doesn't
	r := MustCompile(`^ac$\n`, RE2)

	// Surface matcher errors explicitly: a discarded error would leave m
	// false and let the "no match" assertion pass for the wrong reason.
	if m, err := r.MatchString("ac"); err != nil {
		t.Fatal(err)
	} else if m {
		t.Fatal("Expected no match")
	}
	if m, err := r.MatchString("ac\n"); err != nil {
		t.Fatal(err)
	} else if m {
		t.Fatal("Expected no match")
	}
}

// TestRE2Dollar_Multiline checks that, with RE2|Multiline, the pattern
// `^ac$\n` matches "ac\n" but does not match "ac".
func TestRE2Dollar_Multiline(t *testing.T) {
	r := MustCompile(`^ac$\n`, RE2|Multiline)

	// Check the error here too: a discarded error would leave m false and
	// let the "no match" assertion pass for the wrong reason.
	if m, err := r.MatchString("ac"); err != nil {
		t.Fatal(err)
	} else if m {
		t.Fatal("Expected no match")
	}
	if m, err := r.MatchString("ac\n"); err != nil {
		t.Fatal(err)
	} else if !m {
		t.Fatal("Expected match")
	}
}
19 changes: 19 additions & 0 deletions regexp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,25 @@ func TestECMANegateRange(t *testing.T) {
}
}

// TestDollar checks that, with no options set, `ac$` matches "ac\n".
func TestDollar(t *testing.T) {
	// In singleline mode PCRE/C# let $ match before a trailing \n at the end
	// of the string. This odd edge case is kept for compatibility; the
	// ECMAScript/RE2 modes don't allow it.
	pattern := MustCompile(`ac$`, 0)

	matched, err := pattern.MatchString("ac\n")
	if err != nil {
		t.Fatal(err)
	}
	if !matched {
		t.Fatal("Expected match")
	}
}
// TestECMADollar checks that, with the ECMAScript option, `ac$` does not
// match "ac\n".
func TestECMADollar(t *testing.T) {
	pattern := MustCompile(`ac$`, ECMAScript)

	matched, err := pattern.MatchString("ac\n")
	if err != nil {
		t.Fatal(err)
	}
	if matched {
		t.Fatal("Expected no match")
	}
}

func TestThreeByteUnicode_InputOnly(t *testing.T) {
// confirm the bmprefix properly ignores 3-byte unicode in the input value
// this used to panic
Expand Down
15 changes: 14 additions & 1 deletion runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -566,9 +566,22 @@ func (r *runner) execute() error {
continue

case syntax.EndZ:
if r.rightchars() > 1 || r.rightchars() == 1 && r.charAt(r.textPos()) != '\n' {
rchars := r.rightchars()
if rchars > 1 {
break
}
// RE2 and EcmaScript define $ as "asserts position at the end of the string"
// PCRE/.NET adds "or before the line terminator right at the end of the string (if any)"
if (r.re.options & (RE2 | ECMAScript)) != 0 {
// RE2/Ecmascript mode
if rchars > 0 {
break
}
} else if rchars == 1 && r.charAt(r.textPos()) != '\n' {
// "regular" mode
break
}

r.advance(0)
continue

Expand Down

0 comments on commit 0537a49

Please sign in to comment.