Tokenize unicode escapes to prevent ambiguous results

balbuf · balbuf · commit 750b2a76e66b · 2017-06-11T02:19:56.000-04:00
diff --git a/src/index.js b/src/index.js
@@ -4,6 +4,24 @@ import Selector from './Selector';
 import PropertyTest from './PropertyTest';
 import CSS from './escape';
 
+/**
+ * Using the location and additional offset of unicode escape tokens,
+ * fix the location values for parsed selector tokens.
+ *
+ * @param  {Array} tokens
+ * @param  {Array} unicodeLocations  location and offset of each unicode escape
+ */
+function fixLocation(tokens, unicodeLocations) {
+	for (var i = 0; i < tokens.length; i++) {
+		if (tokens[i].location) {
+			tokens[i].location += unicodeLocations.filter((l) => l.index < tokens[i].location).reduce((offset, l) => offset + l.offset, 0);
+		}
+		if (Array.isArray(tokens[i].tokens)) {
+			fixLocation(tokens[i].tokens, unicodeLocations);
+		}
+	}
+}
+
 export default {
 
 	// classes
@@ -17,8 +35,26 @@ export default {
 	 * @return {array}          Selector object(s)
 	 */
 	parse(selector) {
+		// tokenize the input string
+		// (the only special tokens are unicode escapes, which need to be matched greedily to avoid ambiguous results)
+		var tokens = []
+		  , unicodeReg = /\\[0-9a-zA-Z]{1,6}(?:\r\n|[ \n\r\t\f])?/g
+		  , lastIndex = 0
+		  , result
+		  , unicodeLocations = [] // keep track of where unicode tokens are to fix the "location" values of selector tokens
+		;
+
+		// find all the unicode escapes, split everything else into individual chars
+		while ((result = unicodeReg.exec(selector)) !== null) {
+			Array.prototype.push.apply(tokens, selector.substr(lastIndex, result.index).split(''));
+			unicodeLocations.push({index: tokens.length, offset: result[0].length - 1});
+			tokens.push(result[0]);
+			lastIndex = unicodeReg.lastIndex;
+		}
+		Array.prototype.push.apply(tokens, selector.substr(lastIndex).split(''));
+
 		var parser = new Parser(grammar.ParserRules, grammar.ParserStart)
-		  , results = parser.feed(selector).results
+		  , results = parser.feed(tokens).results
 		;
 
 		// usually a parse error is thrown by nearley, unless there are no results due to
@@ -39,7 +75,12 @@ export default {
 			}
 		}
 
-		return results[0].selectors.map((selector) => new Selector(selector.tokens));
+		return results[0].selectors.map((selector) => {
+			if (unicodeLocations.length) {
+				fixLocation(selector.tokens, unicodeLocations);
+			}
+			return new Selector(selector.tokens);
+		});
 	},
 
 	/**
diff --git a/src/selector-grammar.ne b/src/selector-grammar.ne
@@ -35,6 +35,10 @@ const combinatorTypes = {
 	'~': 'generalSiblingCombinator',
 	' ': 'descendantCombinator',
 };
+
+// matcher for the %unicode token: the whole token must be a backslash, 1-6 alphanumerics, then optional whitespace (NOTE(review): [a-zA-Z] is wider than hex digits; confirm this is intended)
+const unicodeReg = /^\\[0-9a-zA-Z]{1,6}(?:\r\n|[ \n\r\t\f])?$/;
+var unicode = {test: (x) => unicodeReg.test(x)};
 %}
 
 # @see: https://www.w3.org/TR/css3-selectors/#w3cselgrammar
@@ -124,8 +128,8 @@ ident -> "-":? nmstart nmchar:*
 name -> nmchar:+
 nmstart -> [_a-zA-Z] | nonascii | escape
 nonascii -> [^\0-\177]
-unicode -> "\\" ( hex hex:? hex:? hex:? hex:? hex:? ) ( "\r\n" | space ):?
-	{% (d) => { return {parsed: String.fromCodePoint(parseInt(collapse(d[1]), 16)), raw: collapse(d)} } %}
+unicode -> %unicode
+	{% (d) => { /* assumes collapse(d) yields the token text: substr(1) skips the leading backslash and parseInt(.., 16) stops at any trailing whitespace */ return {parsed: String.fromCodePoint(parseInt(collapse(d).substr(1), 16)), raw: collapseRaw(d)} } %}
 escape -> unicode
 	| "\\" [^\n\r\f0-9a-fA-F] {% (d) => { return {parsed: d[1], raw: collapseRaw(d)} } %}
 escaped_nl -> "\\" nl