Improve extraction of tokens from regex-based filters

Regex-based static network filters are those most likely to cause performance degradation, and as such the best guard against undue performance degradation caused by regex-based filters is the ability to extract valid and good tokens from regex patterns. This commit introduces a complete regex parser so that the static network filtering engine can now safely extract tokens regardless of the complexity of the regex pattern. The regex parser is a library imported from: https://github.com/foo123/RegexAnalyzer The syntax highlighter adds an underline to regex-based filters as a visual aid to filter authors so as to avoid mistakenly creating regex-based filters. This commit further colors the underline as a warning when a regex-based filter is found to be untokenizable. Filter list authors are invited to spot these untokenizable regex-based filters in their lists to verify that no mistakes were made for those filters, causing them to be untokenizable. For example, what appears to be a mistake: /^https?:\/\/.*\/sw.js?.[a-zA-Z0-9%]{50,}/ Though the mistake is minor, the regex-based filter above is untokenizable as a result, and becomes tokenizable when the `.` is properly escaped: /^https?:\/\/.*\/sw\.js?.[a-zA-Z0-9%]{50,}/ Filter list authors can use this search expression in the asset viewer to find instances of regex-based filters: /^(@@)?\/[^\n]+\/(\$|$)/
gorhill · Dec 26, 2020 · 426395a · 426395a
1 parent fdcb110
commit 426395a
Show file tree

Hide file tree

Showing 9 changed files with 2,300 additions and 25 deletions.
diff --git a/src/1p-filters.html b/src/1p-filters.html
@@ -49,6 +49,7 @@
 <script src="lib/codemirror/addon/scroll/annotatescrollbar.js"></script>
 <script src="lib/codemirror/addon/search/searchcursor.js"></script>
 <script src="lib/codemirror/addon/selection/active-line.js"></script>
+<script src="lib/regexanalyzer/regex.js"></script>
 
 <script src="js/codemirror/search.js"></script>
 <script src="js/codemirror/search-thread.js"></script>

diff --git a/src/about.html b/src/about.html
@@ -39,6 +39,7 @@
         <div class="li"><span><a href="https://github.com/rsms/inter" target="_blank">Inter font family</a> by <a href="https://github.com/rsms">Rasmus Andersson</a></span></div>
         <div class="li"><span><a href="https://fontawesome.com/" target="_blank">FontAwesome font family</a> by <a href="https://github.com/davegandy">Dave Gandy</a></span></div>
         <div class="li"><span><a href="https://github.com/Swatinem/diff" target="_blank">An implementation of Myers' diff algorithm</a> by <a href="https://github.com/Swatinem">Arpad Borsos</a></span></div>
+        <div class="li"><span><a href="https://github.com/foo123/RegexAnalyzer" target="_blank">Regular Expression Analyzer</a> by <a href="https://github.com/foo123">Nikos M.</a></span></div>
     </div>
     <hr>
     <div id="dev">

diff --git a/src/asset-viewer.html b/src/asset-viewer.html
@@ -33,6 +33,7 @@
 <script src="lib/codemirror/addon/scroll/annotatescrollbar.js"></script>
 <script src="lib/codemirror/addon/search/searchcursor.js"></script>
 <script src="lib/codemirror/addon/selection/active-line.js"></script>
+<script src="lib/regexanalyzer/regex.js"></script>
 
 <script src="js/codemirror/search.js"></script>
 <script src="js/codemirror/search-thread.js"></script>

diff --git a/src/background.html b/src/background.html
@@ -9,6 +9,7 @@
 <script src="lib/lz4/lz4-block-codec-any.js"></script>
 <script src="lib/punycode.js"></script>
 <script src="lib/publicsuffixlist/publicsuffixlist.js"></script>
+<script src="lib/regexanalyzer/regex.js"></script>
 <script src="js/webext.js"></script>
 <script src="js/vapi.js"></script>
 <script src="js/vapi-common.js"></script>

diff --git a/src/js/codemirror/ubo-static-filtering.js b/src/js/codemirror/ubo-static-filtering.js
@@ -274,7 +274,9 @@ CodeMirror.defineMode('ubo-static-filtering', function() {
             if ( parser.patternIsRegex() ) {
                 stream.pos = parser.slices[parser.optionsAnchorSpan.i+1];
                 parserSlot = parser.optionsAnchorSpan.i;
-                return 'variable notice';
+                return parser.patternIsTokenizable()
+                    ? 'variable notice'
+                    : 'variable warning';
             }
             if ( (parser.slices[parserSlot] & (parser.BITAsterisk | parser.BITCaret)) !== 0 ) {
                 stream.pos += parser.slices[parserSlot+2];

diff --git a/src/js/static-filtering-parser.js b/src/js/static-filtering-parser.js
@@ -1003,6 +1003,18 @@ const Parser = class {
         return (this.flavorBits & BITFlavorNetRegex) !== 0;
     }
 
+    patternIsTokenizable() {
+        // TODO: not necessarily true, this needs more work.
+        if ( this.patternIsRegex === false ) { return true; }
+        const s = Parser.tokenizableStrFromRegex(this.getNetPattern());
+        try {
+            return /(?<![\x01%0-9A-Za-z]|^)[%0-9A-Za-z]{7,}/.test(s) ||
+                /(?<![\x01%0-9A-Za-z]|^)[%0-9A-Za-z]{1,6}(?![\x01%0-9A-Za-z]|$)/.test(s);
+        } catch(ex) {
+        }
+        return true;
+    }
+
     patternHasWildcard() {
         return hasBits(this.patternBits, BITAsterisk);
     }
@@ -2748,6 +2760,109 @@ const ExtOptionsIterator = class {
 
 /******************************************************************************/
 
+// Depends on:
+// https://github.com/foo123/RegexAnalyzer
+
+Parser.tokenizableStrFromRegex = (( ) => {
+
+    const firstCharCodeClass = s => {
+        return /^[\x01%0-9A-Za-z]/.test(s) ? 1 : 0;
+    };
+
+    const lastCharCodeClass = s => {
+        return /[\x01%0-9A-Za-z]$/.test(s) ? 1 : 0;
+    };
+
+    const toTokenizableString = node => {
+        switch ( node.type ) {
+            case 1: /* T_SEQUENCE, 'Sequence' */ {
+                let s = '';
+                for ( let i = 0; i < node.val.length; i++ ) {
+                    s += toTokenizableString(node.val[i]);
+                }
+                return s;
+            }
+            case 2: /* T_ALTERNATION,'Alternation' */
+            case 8: /* T_CHARGROUP, 'CharacterGroup' */ {
+                let firstChar = 0;
+                let lastChar = 0;
+                for ( let i = 0; i < node.val.length; i++ ) {
+                    const s = toTokenizableString(node.val[i]);
+                    if ( firstChar === 0 && firstCharCodeClass(s) === 1 ) {
+                        firstChar = 1;
+                    }
+                    if ( lastChar === 0 && lastCharCodeClass(s) === 1 ) {
+                        lastChar = 1;
+                    }
+                    if ( firstChar === 1 && lastChar === 1 ) { break; }
+                }
+                return String.fromCharCode(firstChar, lastChar);
+            }
+            case 4: /* T_GROUP, 'Group' */ {
+                if ( node.flags.NegativeLookAhead === 1 ) { return '\x01'; }
+                if ( node.flags.NegativeLookBehind === 1 ) { return '\x01'; }
+                return toTokenizableString(node.val);
+            }
+            case 16: /* T_QUANTIFIER, 'Quantifier' */ {
+                const s = toTokenizableString(node.val);
+                const first = firstCharCodeClass(s);
+                const last = lastCharCodeClass(s);
+                if ( node.flags.min === 0 && first === 0 && last === 0 ) {
+                    return '';
+                }
+                return String.fromCharCode(first, last);
+            }
+            case 64: /* T_HEXCHAR, 'HexChar' */ {
+                return String.fromCharCode(parseInt(node.val.slice(1), 16));
+            }
+            case 128: /* T_SPECIAL, 'Special' */ {
+                const flags = node.flags;
+                if ( flags.MatchEnd === 1 ) { return '\x00'; }
+                if ( flags.MatchStart === 1 ) { return '\x00'; }
+                if ( flags.MatchWordBoundary === 1 ) { return '\x00'; }
+                return '\x01';
+            }
+            case 256: /* T_CHARS, 'Characters' */ {
+                for ( let i = 0; i < node.val.length; i++ ) {
+                    if ( firstCharCodeClass(node.val[i]) === 1 ) {
+                        return '\x01';
+                    }
+                }
+                return '\x00';
+            }
+            // Ranges are assumed to always involve token-related characters.
+            case 512: /* T_CHARRANGE, 'CharacterRange' */ {
+                return '\x01';
+            }
+            case 1024: /* T_STRING, 'String' */ {
+                return node.val;
+            }
+            case 2048: /* T_COMMENT, 'Comment' */ {
+                return '';
+            }
+            default:
+                break;
+        }
+        return '\x01';
+    };
+
+    return function(reStr) {
+        if (
+            self.Regex instanceof Object === false ||
+            self.Regex.Analyzer instanceof Object === false
+        ) {
+            return '';
+        }
+        try {
+            return toTokenizableString(self.Regex.Analyzer(reStr, false).tree());
+        } catch(ex) {
+        }
+        return '';
+    };
+})();
+
+/******************************************************************************/
+
 if ( typeof vAPI === 'object' && vAPI !== null ) {
     vAPI.StaticFilteringParser = Parser;
 } else {

diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js
@@ -2622,15 +2622,9 @@ const FilterParser = class {
         if ( other !== undefined ) {
             return Object.assign(this, other);
         }
-        this.cantWebsocket = vAPI.cantWebsocket;
         this.noTokenHash = urlTokenizer.noTokenHash;
-        this.reIsolateHostname = /^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/;
         this.reBadCSP = /(?:=|;)\s*report-(?:to|uri)\b/;
         this.reToken = /[%0-9A-Za-z]+/g;
-        this.reRegexTokenAbort = /[\(\)\[\]]/;
-        this.reRegexBadPrefix = /(^|[^\\]\.|\\[%SDWsdw]|[^\\][()*+?[\\\]{}])$/;
-        this.reRegexBadSuffix = /^([^\\]\.|\\[%SDWsdw]|[()*+?[\]{}]|$)/;
-        this.reGoodToken = /[%0-9a-z]{1,}/g;
         this.domainOptList = [];
         this.tokenIdToNormalizedType = new Map([
             [ parser.OPTTokenCname, bitFromType('cname') ],
@@ -3175,32 +3169,22 @@ const FilterParser = class {
     //   not `bads`.
     extractTokenFromRegex() {
         this.reToken.lastIndex = 0;
-        const pattern = this.pattern;
+        const pattern =
+            vAPI.StaticFilteringParser.tokenizableStrFromRegex(this.pattern);
         let bestToken;
         let bestBadness = 0x7FFFFFFF;
         for (;;) {
             const matches = this.reToken.exec(pattern);
             if ( matches === null ) { break; }
-            let token = matches[0];
-            let prefix = pattern.slice(0, matches.index);
-            let suffix = pattern.slice(this.reToken.lastIndex);
-            if (
-                this.reRegexTokenAbort.test(prefix) &&
-                this.reRegexTokenAbort.test(suffix)
-            ) {
+            const { 0: token, index } = matches;
+            if ( index === 0 || pattern.charAt(index - 1) === '\x01' ) {
                 continue;
             }
-            if ( token.charCodeAt(0) === 0x62 /* 'b' */ ) {
-                const match = /\\+$/.exec(prefix);
-                if ( match !== null && (match[0].length & 1) !== 0 ) {
-                    prefix += 'b';
-                    token = token.slice(1);
-                }
-            }
+            const { lastIndex } = this.reToken;
             if (
-                this.reRegexBadPrefix.test(prefix) || (
-                    token.length < this.maxTokenLen &&
-                    this.reRegexBadSuffix.test(suffix)
+                token.length < this.maxTokenLen && (
+                    lastIndex === pattern.length ||
+                    pattern.charAt(lastIndex) === '\x01'
                 )
             ) {
                 continue;

diff --git a/src/lib/regexanalyzer/README.md b/src/lib/regexanalyzer/README.md
@@ -0,0 +1,14 @@
+https://github.com/foo123/RegexAnalyzer/issues/1#issuecomment-750039255
+
+> The (implied) license is as free as it can get. You can modify it and use
+> it anywhere you want if it suits you.
+> 
+> An attribution to original author would be appreciated but even this is not
+> mandatory.
+> 
+> Copy Left
+
+References:
+
+- https://en.wikipedia.org/wiki/Copyleft
+- http://gplv3.fsf.org/wiki/index.php/Compatible_licenses