Skip to content

Commit 930cf59

Browse files
committed
regexp/syntax: recognize category aliases like \p{Letter}
The Unicode specification defines aliases for some of the general category names. For example, the category "L" has alias "Letter".

The regexp package supports \p{L} but not \p{Letter}, because there was nothing in the Unicode tables that let regexp know about Letter. Now that package unicode provides CategoryAliases (see #70780), we can use it to provide \p{Letter} as well.

This is the only feature missing from making package regexp suitable for use in a JSON-API Schema implementation. (The official test suite includes usage of aliases like \p{Letter} instead of \p{L}.)

For better conformity with Unicode TR18, also accept case-insensitive matches for names and ignore underscores, hyphens, and spaces; and add Any, ASCII, and Assigned.

Fixes #70781.

Change-Id: I50ff024d99255338fa8d92663881acb47f1e92a5
Reviewed-on: https://go-review.googlesource.com/c/go/+/641377
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
1 parent 28fd9fa commit 930cf59

File tree

4 files changed

+113
-9
lines changed

4 files changed

+113
-9
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
The `\p{name}` and `\P{name}` character class syntaxes now accept the names
2+
Any, ASCII, Assigned, Cn, and LC, as well as Unicode category aliases like `\p{Letter}` for `\pL`.
3+
Following [Unicode TR18](https://unicode.org/reports/tr18/), they also now use
4+
case-insensitive name lookups, ignoring spaces, underscores, and hyphens.

src/regexp/syntax/doc.go

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/regexp/syntax/parse.go

Lines changed: 101 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package syntax
77
import (
88
"sort"
99
"strings"
10+
"sync"
1011
"unicode"
1112
"unicode/utf8"
1213
)
@@ -1639,20 +1640,109 @@ var anyTable = &unicode.RangeTable{
16391640
R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
16401641
}
16411642

1643+
var asciiTable = &unicode.RangeTable{
1644+
R16: []unicode.Range16{{Lo: 0, Hi: 0x7F, Stride: 1}},
1645+
}
1646+
1647+
var asciiFoldTable = &unicode.RangeTable{
1648+
R16: []unicode.Range16{
1649+
{Lo: 0, Hi: 0x7F, Stride: 1},
1650+
{Lo: 0x017F, Hi: 0x017F, Stride: 1}, // Old English long s (ſ), folds to S/s.
1651+
{Lo: 0x212A, Hi: 0x212A, Stride: 1}, // Kelvin K, folds to K/k.
1652+
},
1653+
}
1654+
1655+
// categoryAliases is a lazily constructed copy of unicode.CategoryAliases
1656+
// but with the keys passed through canonicalName, to support inexact matches.
1657+
var categoryAliases struct {
1658+
once sync.Once
1659+
m map[string]string
1660+
}
1661+
1662+
// initCategoryAliases initializes categoryAliases by canonicalizing unicode.CategoryAliases.
1663+
func initCategoryAliases() {
1664+
categoryAliases.m = make(map[string]string)
1665+
for name, actual := range unicode.CategoryAliases {
1666+
categoryAliases.m[canonicalName(name)] = actual
1667+
}
1668+
}
1669+
1670+
// canonicalName returns the canonical lookup string for name.
1671+
// The canonical name has a leading uppercase letter and then lowercase letters,
1672+
// and it omits all underscores, spaces, and hyphens.
1673+
// (We could have used all lowercase, but this way most package unicode
1674+
// map keys are already canonical.)
1675+
func canonicalName(name string) string {
1676+
var b []byte
1677+
first := true
1678+
for i := range len(name) {
1679+
c := name[i]
1680+
switch {
1681+
case c == '_' || c == '-' || c == ' ':
1682+
c = ' '
1683+
case first:
1684+
if 'a' <= c && c <= 'z' {
1685+
c -= 'a' - 'A'
1686+
}
1687+
first = false
1688+
default:
1689+
if 'A' <= c && c <= 'Z' {
1690+
c += 'a' - 'A'
1691+
}
1692+
}
1693+
if b == nil {
1694+
if c == name[i] && c != ' ' {
1695+
// No changes so far, avoid allocating b.
1696+
continue
1697+
}
1698+
b = make([]byte, i, len(name))
1699+
copy(b, name[:i])
1700+
}
1701+
if c == ' ' {
1702+
continue
1703+
}
1704+
b = append(b, c)
1705+
}
1706+
if b == nil {
1707+
return name
1708+
}
1709+
return string(b)
1710+
}
1711+
16421712
// unicodeTable returns the unicode.RangeTable identified by name
16431713
// and the table of additional fold-equivalent code points.
1644-
func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
1645-
// Special case: "Any" means any.
1646-
if name == "Any" {
1647-
return anyTable, anyTable
1714+
// If sign < 0, the result should be inverted.
1715+
func unicodeTable(name string) (tab, fold *unicode.RangeTable, sign int) {
1716+
name = canonicalName(name)
1717+
1718+
// Special cases: Any, Assigned, and ASCII.
1719+
// Also LC is the only non-canonical Categories key, so handle it here.
1720+
switch name {
1721+
case "Any":
1722+
return anyTable, anyTable, +1
1723+
case "Assigned":
1724+
return unicode.Cn, unicode.Cn, -1 // invert Cn (unassigned)
1725+
case "Ascii":
1726+
return asciiTable, asciiFoldTable, +1
1727+
case "Lc":
1728+
return unicode.Categories["LC"], unicode.FoldCategory["LC"], +1
16481729
}
16491730
if t := unicode.Categories[name]; t != nil {
1650-
return t, unicode.FoldCategory[name]
1731+
return t, unicode.FoldCategory[name], +1
16511732
}
16521733
if t := unicode.Scripts[name]; t != nil {
1653-
return t, unicode.FoldScript[name]
1734+
return t, unicode.FoldScript[name], +1
1735+
}
1736+
1737+
// unicode.CategoryAliases makes liberal use of underscores in its names
1738+
// (they are defined that way by Unicode), but we want to match ignoring
1739+
// the underscores, so make our own map with canonical names.
1740+
categoryAliases.once.Do(initCategoryAliases)
1741+
if actual := categoryAliases.m[name]; actual != "" {
1742+
t := unicode.Categories[actual]
1743+
return t, unicode.FoldCategory[actual], +1
16541744
}
1655-
return nil, nil
1745+
return nil, nil, 0
16561746
}
16571747

16581748
// parseUnicodeClass parses a leading Unicode character class like \p{Han}
@@ -1700,10 +1790,13 @@ func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string,
17001790
name = name[1:]
17011791
}
17021792

1703-
tab, fold := unicodeTable(name)
1793+
tab, fold, tsign := unicodeTable(name)
17041794
if tab == nil {
17051795
return nil, "", &Error{ErrInvalidCharRange, seq}
17061796
}
1797+
if tsign < 0 {
1798+
sign = -sign
1799+
}
17071800

17081801
if p.flags&FoldCase == 0 || fold == nil {
17091802
if sign > 0 {

src/regexp/syntax/parse_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,16 @@ var parseTests = []parseTest{
107107
{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
108108
{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
109109
{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
110+
{`\p{Uppercase_Letter}`, mkCharClass(unicode.IsUpper)},
111+
{`\p{upper case-let ter}`, mkCharClass(unicode.IsUpper)},
112+
{`\p{__upper case-let ter}`, mkCharClass(unicode.IsUpper)},
110113
{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
111114
{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
112115
{`\p{Any}`, `dot{}`},
113116
{`\p{^Any}`, `cc{}`},
117+
{`(?i)\p{ascii}`, `cc{0x0-0x7f 0x17f 0x212a}`},
118+
{`\p{Assigned}`, mkCharClass(func(r rune) bool { return !unicode.In(r, unicode.Cn) })},
119+
{`\p{^Assigned}`, mkCharClass(func(r rune) bool { return unicode.In(r, unicode.Cn) })},
114120

115121
// Hex, octal.
116122
{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},

0 commit comments

Comments
 (0)