fixes dlclark#47 change RE2 option to match the same characters for \s \w and \d as RE2
natfrp · Mar 29, 2022 · d0559a0 · d0559a0
1 parent a2a8dda
commit d0559a0
Show file tree

Hide file tree

Showing 5 changed files with 104 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -80,6 +80,7 @@ The default behavior of `regexp2` is to match the .NET regexp engine, however th
 * add support for named ascii character classes (e.g. `[[:foo:]]`)
 * add support for python-style capture groups (e.g. `(?P<name>re)`)
 * change singleline behavior for `$` to only match end of string (like RE2) (see [#24](https://github.com/dlclark/regexp2/issues/24))
+* change the character classes `\d`, `\s`, and `\w` to match the same characters as RE2. NOTE: if you also use the `ECMAScript` option then this will change the `\s` character class to match ECMAScript instead of RE2. ECMAScript allows more whitespace characters in `\s` than RE2 (but still fewer than the default behavior).
 
 ```go
 re := regexp2.MustCompile(`Your RE2-compatible pattern`, regexp2.RE2)

diff --git a/go.mod b/go.mod
@@ -0,0 +1,3 @@
+module github.com/dlclark/regexp2
+
+go 1.13
diff --git a/regexp_re2_test.go b/regexp_re2_test.go
@@ -1,6 +1,8 @@
 package regexp2
 
-import "testing"
+import (
+	"testing"
+)
 
 func TestRE2CompatCapture(t *testing.T) {
 	r := MustCompile(`re(?P<a>2)`, RE2)
@@ -119,3 +121,76 @@ func TestRE2Dollar_Multiline(t *testing.T) {
 		t.Fatal("Expected match")
 	}
 }
+
+func TestRE2ExtendedZero(t *testing.T) {
+	notZero := "߀" // \u07c0
+	r := MustCompile(`^\d$`, RE2)
+	if m, _ := r.MatchString(notZero); m {
+		t.Fatal("Expected no match")
+	}
+
+	r = MustCompile(`^\D$`, RE2)
+	if m, _ := r.MatchString(notZero); !m {
+		t.Fatal("Expected match")
+	}
+}
+
+func TestRegularExtendedZero(t *testing.T) {
+	notZero := "߀" // \u07c0
+
+	r := MustCompile(`^\d$`, 0)
+	if m, _ := r.MatchString(notZero); !m {
+		t.Fatal("Expected match")
+	}
+
+	r = MustCompile(`^\D$`, 0)
+	if m, _ := r.MatchString(notZero); m {
+		t.Fatal("Expected no match")
+	}
+}
+
+func TestRE2Word(t *testing.T) {
+	r := MustCompile(`\w`, RE2)
+	if m, _ := r.MatchString("å"); m {
+		t.Fatal("Expected no match")
+	}
+
+	r = MustCompile(`\W`, RE2)
+	if m, _ := r.MatchString("å"); !m {
+		t.Fatal("Expected match")
+	}
+
+}
+
+func TestRegularWord(t *testing.T) {
+	r := MustCompile(`\w`, 0)
+	if m, _ := r.MatchString("å"); !m {
+		t.Fatal("Expected match")
+	}
+	r = MustCompile(`\W`, 0)
+	if m, _ := r.MatchString("å"); m {
+		t.Fatal("Expected no match")
+	}
+}
+
+func TestRE2Space(t *testing.T) {
+	r := MustCompile(`\s`, RE2)
+	if m, _ := r.MatchString("\x0b"); m {
+		t.Fatal("Expected no match")
+	}
+	r = MustCompile(`\S`, RE2)
+	if m, _ := r.MatchString("\x0b"); !m {
+		t.Fatal("Expected match")
+	}
+}
+
+func TestRegularSpace(t *testing.T) {
+	r := MustCompile(`\s`, 0)
+	if m, _ := r.MatchString("\x0b"); !m {
+		t.Fatal("Expected match")
+	}
+	r = MustCompile(`\S`, 0)
+	if m, _ := r.MatchString("\x0b"); m {
+		t.Fatal("Expected no match")
+	}
+}
diff --git a/syntax/charclass.go b/syntax/charclass.go
@@ -37,6 +37,8 @@ var (
 	ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
 	ecmaWord  = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
 	ecmaDigit = []rune{0x0030, 0x003a}
+
+	re2Space = []rune{0x0009, 0x000b, 0x000c, 0x000e, 0x0020, 0x0021}
 )
 
 var (
@@ -56,6 +58,9 @@ var (
 	NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
 	DigitClass    = getCharSetFromCategoryString(false, false, "Nd")
 	NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
+
+	RE2SpaceClass    = getCharSetFromOldString(re2Space, false)
+	NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)
 )
 
 var unicodeCategories = func() map[string]*unicode.RangeTable {
@@ -401,13 +406,19 @@ func (c *CharSet) addChar(ch rune) {
 	c.addRange(ch, ch)
 }
 
-func (c *CharSet) addSpace(ecma, negate bool) {
+func (c *CharSet) addSpace(ecma, re2, negate bool) {
 	if ecma {
 		if negate {
 			c.addRanges(NotECMASpaceClass().ranges)
 		} else {
 			c.addRanges(ECMASpaceClass().ranges)
 		}
+	} else if re2 {
+		if negate {
+			c.addRanges(NotRE2SpaceClass().ranges)
+		} else {
+			c.addRanges(RE2SpaceClass().ranges)
+		}
 	} else {
 		c.addCategories(category{cat: spaceCategoryText, negate: negate})
 	}
@@ -563,7 +574,7 @@ func (c *CharSet) addNamedASCII(name string, negate bool) bool {
 	case "punct": //[!-/:-@[-`{-~]
 		rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
 	case "space":
-		c.addSpace(true, negate)
+		c.addSpace(true, false, negate)
 	case "upper":
 		rs = []singleRange{singleRange{'A', 'Z'}}
 	case "word":

diff --git a/syntax/parser.go b/syntax/parser.go
@@ -1121,14 +1121,14 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
 
 	case 'w':
 		p.moveRight(1)
-		if p.useOptionE() {
+		if p.useOptionE() || p.useRE2() {
 			return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, WordClass()), nil
 
 	case 'W':
 		p.moveRight(1)
-		if p.useOptionE() {
+		if p.useOptionE() || p.useRE2() {
 			return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
@@ -1137,26 +1137,30 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
 		p.moveRight(1)
 		if p.useOptionE() {
 			return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
+		} else if p.useRE2() {
+			return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
 
 	case 'S':
 		p.moveRight(1)
 		if p.useOptionE() {
 			return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
+		} else if p.useRE2() {
+			return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
 
 	case 'd':
 		p.moveRight(1)
-		if p.useOptionE() {
+		if p.useOptionE() || p.useRE2() {
 			return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
 
 	case 'D':
 		p.moveRight(1)
-		if p.useOptionE() {
+		if p.useOptionE() || p.useRE2() {
 			return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
 		}
 		return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
@@ -1462,7 +1466,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 					if inRange {
 						return nil, p.getErr(ErrBadClassInCharRange, ch)
 					}
-					cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
+					cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
 				}
 				continue
 
@@ -1471,7 +1475,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 					if inRange {
 						return nil, p.getErr(ErrBadClassInCharRange, ch)
 					}
-					cc.addSpace(p.useOptionE(), ch == 'S')
+					cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
 				}
 				continue
 
@@ -1481,7 +1485,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 						return nil, p.getErr(ErrBadClassInCharRange, ch)
 					}
 
-					cc.addWord(p.useOptionE(), ch == 'W')
+					cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
 				}
 				continue