Skip to content

Commit

Permalink
fixes dlclark#47 change RE2 option to match the same characters for \…
Browse files Browse the repository at this point in the history
…s \w and \d as RE2
  • Loading branch information
dlclark committed Mar 29, 2022
1 parent a2a8dda commit d0559a0
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 10 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ The default behavior of `regexp2` is to match the .NET regexp engine, however th
* add support for named ascii character classes (e.g. `[[:foo:]]`)
* add support for python-style capture groups (e.g. `(?P<name>re)`)
* change singleline behavior for `$` to only match end of string (like RE2) (see [#24](https://github.com/dlclark/regexp2/issues/24))
* change the character classes `\d`, `\s`, and `\w` to match the same characters as RE2. NOTE: if you also use the `ECMAScript` option then this will change the `\s` character class to match ECMAScript instead of RE2. ECMAScript allows more whitespace characters in `\s` than RE2 (but still fewer than the default behavior).

```go
re := regexp2.MustCompile(`Your RE2-compatible pattern`, regexp2.RE2)
Expand Down
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/dlclark/regexp2

go 1.13
77 changes: 76 additions & 1 deletion regexp_re2_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package regexp2

import "testing"
import (
"testing"
)

func TestRE2CompatCapture(t *testing.T) {
r := MustCompile(`re(?P<a>2)`, RE2)
Expand Down Expand Up @@ -119,3 +121,76 @@ func TestRE2Dollar_Multiline(t *testing.T) {
t.Fatal("Expected match")
}
}

// TestRE2ExtendedZero checks that, with the RE2 option, \d does not match
// U+07C0 (a character outside ASCII 0-9) and that \D does match it.
func TestRE2ExtendedZero(t *testing.T) {
	notZero := "߀" // \u07c0

	r := MustCompile(`^\d$`, RE2)
	m, err := r.MatchString(notZero)
	if err != nil {
		// Without this check a failed match call would look like "no match"
		// and the assertion below would pass vacuously.
		t.Fatalf("unexpected error: %v", err)
	}
	if m {
		t.Fatal("Expected no match")
	}

	r = MustCompile(`^\D$`, RE2)
	m, err = r.MatchString(notZero)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !m {
		t.Fatal("Expected match")
	}
}

// TestRegularExtendedZero checks that, with no options set, \d matches
// U+07C0 (a character outside ASCII 0-9) and that \D does not match it.
func TestRegularExtendedZero(t *testing.T) {
	notZero := "߀" // \u07c0

	r := MustCompile(`^\d$`, 0)
	m, err := r.MatchString(notZero)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !m {
		t.Fatal("Expected match")
	}

	r = MustCompile(`^\D$`, 0)
	m, err = r.MatchString(notZero)
	if err != nil {
		// Without this check a failed match call would look like "no match"
		// and the assertion below would pass vacuously.
		t.Fatalf("unexpected error: %v", err)
	}
	if m {
		t.Fatal("Expected no match")
	}
}

// TestRE2Word checks that, with the RE2 option, \w does not match the
// non-ASCII letter "å" and that \W does match it.
func TestRE2Word(t *testing.T) {
	r := MustCompile(`\w`, RE2)
	m, err := r.MatchString("å")
	if err != nil {
		// Without this check a failed match call would look like "no match"
		// and the assertion below would pass vacuously.
		t.Fatalf("unexpected error: %v", err)
	}
	if m {
		t.Fatal("Expected no match")
	}

	r = MustCompile(`\W`, RE2)
	m, err = r.MatchString("å")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !m {
		t.Fatal("Expected match")
	}
}

// TestRegularWord checks that, with no options set, \w matches the non-ASCII
// letter "å" and that \W does not match it.
func TestRegularWord(t *testing.T) {
	r := MustCompile(`\w`, 0)
	m, err := r.MatchString("å")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !m {
		t.Fatal("Expected match")
	}

	r = MustCompile(`\W`, 0)
	m, err = r.MatchString("å")
	if err != nil {
		// Without this check a failed match call would look like "no match"
		// and the assertion below would pass vacuously.
		t.Fatalf("unexpected error: %v", err)
	}
	if m {
		t.Fatal("Expected no match")
	}
}

// TestRE2Space checks that, with the RE2 option, \s does not match the
// vertical tab (0x0b) and that \S does match it.
func TestRE2Space(t *testing.T) {
	r := MustCompile(`\s`, RE2)
	m, err := r.MatchString("\x0b")
	if err != nil {
		// Without this check a failed match call would look like "no match"
		// and the assertion below would pass vacuously.
		t.Fatalf("unexpected error: %v", err)
	}
	if m {
		t.Fatal("Expected no match")
	}

	r = MustCompile(`\S`, RE2)
	m, err = r.MatchString("\x0b")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !m {
		t.Fatal("Expected match")
	}
}

// TestRegularSpace checks that, with no options set, \s matches the vertical
// tab (0x0b) and that \S does not match it.
func TestRegularSpace(t *testing.T) {
	r := MustCompile(`\s`, 0)
	m, err := r.MatchString("\x0b")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !m {
		t.Fatal("Expected match")
	}

	r = MustCompile(`\S`, 0)
	m, err = r.MatchString("\x0b")
	if err != nil {
		// Without this check a failed match call would look like "no match"
		// and the assertion below would pass vacuously.
		t.Fatalf("unexpected error: %v", err)
	}
	if m {
		t.Fatal("Expected no match")
	}
}
15 changes: 13 additions & 2 deletions syntax/charclass.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ var (
ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
ecmaDigit = []rune{0x0030, 0x003a}

re2Space = []rune{0x0009, 0x000b, 0x000c, 0x000e, 0x0020, 0x0021}
)

var (
Expand All @@ -56,6 +58,9 @@ var (
NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
DigitClass = getCharSetFromCategoryString(false, false, "Nd")
NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")

RE2SpaceClass = getCharSetFromOldString(re2Space, false)
NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)
)

var unicodeCategories = func() map[string]*unicode.RangeTable {
Expand Down Expand Up @@ -401,13 +406,19 @@ func (c *CharSet) addChar(ch rune) {
c.addRange(ch, ch)
}

func (c *CharSet) addSpace(ecma, negate bool) {
func (c *CharSet) addSpace(ecma, re2, negate bool) {
if ecma {
if negate {
c.addRanges(NotECMASpaceClass().ranges)
} else {
c.addRanges(ECMASpaceClass().ranges)
}
} else if re2 {
if negate {
c.addRanges(NotRE2SpaceClass().ranges)
} else {
c.addRanges(RE2SpaceClass().ranges)
}
} else {
c.addCategories(category{cat: spaceCategoryText, negate: negate})
}
Expand Down Expand Up @@ -563,7 +574,7 @@ func (c *CharSet) addNamedASCII(name string, negate bool) bool {
case "punct": //[!-/:-@[-`{-~]
rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
case "space":
c.addSpace(true, negate)
c.addSpace(true, false, negate)
case "upper":
rs = []singleRange{singleRange{'A', 'Z'}}
case "word":
Expand Down
18 changes: 11 additions & 7 deletions syntax/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -1121,14 +1121,14 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {

case 'w':
p.moveRight(1)
if p.useOptionE() {
if p.useOptionE() || p.useRE2() {
return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
}
return newRegexNodeSet(ntSet, p.options, WordClass()), nil

case 'W':
p.moveRight(1)
if p.useOptionE() {
if p.useOptionE() || p.useRE2() {
return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
}
return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
Expand All @@ -1137,26 +1137,30 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
p.moveRight(1)
if p.useOptionE() {
return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
} else if p.useRE2() {
return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
}
return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil

case 'S':
p.moveRight(1)
if p.useOptionE() {
return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
} else if p.useRE2() {
return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
}
return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil

case 'd':
p.moveRight(1)
if p.useOptionE() {
if p.useOptionE() || p.useRE2() {
return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
}
return newRegexNodeSet(ntSet, p.options, DigitClass()), nil

case 'D':
p.moveRight(1)
if p.useOptionE() {
if p.useOptionE() || p.useRE2() {
return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
}
return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
Expand Down Expand Up @@ -1462,7 +1466,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
if inRange {
return nil, p.getErr(ErrBadClassInCharRange, ch)
}
cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
}
continue

Expand All @@ -1471,7 +1475,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
if inRange {
return nil, p.getErr(ErrBadClassInCharRange, ch)
}
cc.addSpace(p.useOptionE(), ch == 'S')
cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
}
continue

Expand All @@ -1481,7 +1485,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
return nil, p.getErr(ErrBadClassInCharRange, ch)
}

cc.addWord(p.useOptionE(), ch == 'W')
cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
}
continue

Expand Down

0 comments on commit d0559a0

Please sign in to comment.