Skip to content

Commit 750b2a7

Browse files
committed
Tokenize unicode escapes to prevent ambiguous results
1 parent bd8b3a2 commit 750b2a7

File tree

2 files changed

+49
-4
lines changed

2 files changed

+49
-4
lines changed

src/index.js

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,24 @@ import Selector from './Selector';
44
import PropertyTest from './PropertyTest';
55
import CSS from './escape';
66

7+
/**
 * Using the location and additional offset of unicode escape tokens,
 * fix the location values for parsed selector tokens.
 *
 * A unicode escape is handed to the parser as a single token even though it
 * spans several characters of the input. Each escape recorded before a token's
 * location therefore adds its `offset` (the extra characters it covers) to
 * that location. Tokens are updated in place, and nested `tokens` arrays are
 * processed recursively.
 *
 * @param {Array} tokens parsed selector tokens; their `location` is mutated
 * @param {Array} unicodeLocations location (`index`) and `offset` of each unicode escape
 */
function fixLocation(tokens, unicodeLocations) {
  for (const token of tokens) {
    // A missing location has nothing to fix, and a location of 0 can have no
    // escape in front of it, so skipping every falsy value is safe.
    if (token.location) {
      let shift = 0;
      for (const entry of unicodeLocations) {
        // only escapes strictly before the token move it
        if (entry.index < token.location) {
          shift += entry.offset;
        }
      }
      token.location += shift;
    }
    if (Array.isArray(token.tokens)) {
      fixLocation(token.tokens, unicodeLocations);
    }
  }
}
24+
725
export default {
826

927
// classes
@@ -17,8 +35,26 @@ export default {
1735
* @return {array} Selector object(s)
1836
*/
1937
parse(selector) {
38+
// tokenize the input string
// (the only special tokens are unicode escapes, which need to be matched greedily to avoid ambiguous results)
var tokens = []
  // a unicode escape is a backslash, 1-6 *hex* digits and one optional trailing whitespace;
  // non-hex letters must not match here: `\g` is a plain character escape that the grammar
  // handles char-by-char, and parseInt(..., 16) would yield NaN or a truncated value for it
  , unicodeReg = /\\[0-9a-fA-F]{1,6}(?:\r\n|[ \n\r\t\f])?/g
  , lastIndex = 0
  , result
  , unicodeLocations = [] // keep track of where unicode tokens are to fix the "location" values of selector tokens
  ;

// find all the unicode escapes, split everything else into individual chars
while ((result = unicodeReg.exec(selector)) !== null) {
  // substring() takes (start, end); substr() takes (start, length) and would copy
  // characters past the escape as soon as lastIndex is greater than 0
  Array.prototype.push.apply(tokens, selector.substring(lastIndex, result.index).split(''));
  // the escape is a single token but spans result[0].length characters of the input
  unicodeLocations.push({index: tokens.length, offset: result[0].length - 1});
  tokens.push(result[0]);
  lastIndex = unicodeReg.lastIndex;
}
// everything after the last escape (or the whole string when there is none)
Array.prototype.push.apply(tokens, selector.substring(lastIndex).split(''));
55+
2056
var parser = new Parser(grammar.ParserRules, grammar.ParserStart)
21-
, results = parser.feed(selector).results
57+
, results = parser.feed(tokens).results
2258
;
2359

2460
// usually a parse error is thrown by nearley, unless there are no results due to
@@ -39,7 +75,12 @@ export default {
3975
}
4076
}
4177

42-
return results[0].selectors.map((selector) => new Selector(selector.tokens));
78+
return results[0].selectors.map((selector) => {
79+
if (unicodeLocations.length) {
80+
fixLocation(selector.tokens, unicodeLocations);
81+
}
82+
return new Selector(selector.tokens);
83+
});
4384
},
4485

4586
/**

src/selector-grammar.ne

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ const combinatorTypes = {
3535
'~': 'generalSiblingCombinator',
3636
' ': 'descendantCombinator',
3737
};
38+
39+
// identify unicode escape tokens
40+
const unicodeReg = /^\\[0-9a-zA-Z]{1,6}(?:\r\n|[ \n\r\t\f])?$/;
41+
var unicode = {test: (x) => unicodeReg.test(x)};
3842
%}
3943

4044
# @see: https://www.w3.org/TR/css3-selectors/#w3cselgrammar
@@ -124,8 +128,8 @@ ident -> "-":? nmstart nmchar:*
124128
name -> nmchar:+
125129
nmstart -> [_a-zA-Z] | nonascii | escape
126130
nonascii -> [^\0-\177]
127-
unicode -> "\\" ( hex hex:? hex:? hex:? hex:? hex:? ) ( "\r\n" | space ):?
128-
{% (d) => { return {parsed: String.fromCodePoint(parseInt(collapse(d[1]), 16)), raw: collapse(d)} } %}
131+
# `%unicode` matches a single pre-built token through the `unicode` tester object
# ({test: ...}) declared in this file's preamble, instead of matching the escape
# character by character.
# The postprocessor drops the leading backslash and parses the rest as base 16 to get
# the code point; parseInt stops at the optional trailing whitespace, so it is ignored.
# NOTE(review): collapse/collapseRaw are defined outside this excerpt - presumably they
# join the matched data into a string; confirm.
# NOTE(review): the preamble tester regex accepts [0-9a-zA-Z], i.e. non-hex letters too;
# for such input parseInt yields NaN or a truncated value - confirm whether
# [0-9a-fA-F] was intended.
unicode -> %unicode
132+
{% (d) => { return {parsed: String.fromCodePoint(parseInt(collapse(d).substr(1), 16)), raw: collapseRaw(d)} } %}
129133
escape -> unicode
130134
| "\\" [^\n\r\f0-9a-fA-F] {% (d) => { return {parsed: d[1], raw: collapseRaw(d)} } %}
131135
escaped_nl -> "\\" nl

0 commit comments

Comments
 (0)