Skip to content

Commit

Permalink
Introduce three more specialized filter classes to avoid regexes
Browse files Browse the repository at this point in the history
Performance- and memory-related work. Three more classes have
been created to avoid regex-based filters internally.

FilterWildcard1: its purpose is to enforce filters which have exactly
one wildcard in their pattern, a common occurrence. The filter
pattern is split into two literal string segments.

FilterWildcard1HnAnchored and FilterWildcard2HnAnchored: similar to the
above, with the added condition that the filter is
hostname-anchored (`||`). The "Wildcard2" variant is a further
specialization to enforce filters where the only wildcard
is immediately preceded by the `^` special character, again
a very common occurrence.

Using two literal string segments in lieu of regexes makes it
possible to quickly detect a mismatch by just testing the first
segment. Additionally, this reduces the memory footprint, as regexes
are much more expensive memory-wise than plain strings.

These three new filter classes make it possible to replace
5276 internal regex-based filters with plain string-based
filters.

The often-called isHnAnchored() has been further fine-tuned to
avoid as much work as possible. I have also observed that
using an arrow function for closure purposes measurably helps
performance, as per the built-in benchmark.
  • Loading branch information
gorhill committed Apr 25, 2019
1 parent dfd6076 commit 9939039
Show file tree
Hide file tree
Showing 2 changed files with 232 additions and 29 deletions.
2 changes: 1 addition & 1 deletion src/js/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ const µBlock = (function() { // jshint ignore:line

// Read-only
systemSettings: {
compiledMagic: 10, // Increase when compiled format changes
compiledMagic: 11, // Increase when compiled format changes
selfieMagic: 11 // Increase when selfie format changes
},

Expand Down
259 changes: 231 additions & 28 deletions src/js/static-net-filtering.js
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ const typeValueToTypeName = {

const BlockImportant = BlockAction | Important;

// Matches any string containing at least one `^` or `*` character.
const reIsWildcarded = /[\^\*]/;

// ABP filters: https://adblockplus.org/en/filters
// regex tester: http://regex101.com/

Expand All @@ -110,10 +112,39 @@ const BlockImportant = BlockAction | Important;
// See the following as short-lived registers, used during evaluation. They are
// valid until the next evaluation.

let pageHostnameRegister = '',
requestHostnameRegister = '';
//var filterRegister = null;
//var categoryRegister = '';
let pageHostnameRegister = '';
let requestHostnameRegister = '';

/******************************************************************************/

// First character of match must be within the hostname part of the url.
//
// https://github.com/gorhill/uBlock/issues/1929
// Match only hostname label boundaries.

const isHnAnchored = (( ) => {
    // Bounds computed by the most recent evaluation, reused for as long as
    // they remain valid:
    // - lastLen: length of requestHostnameRegister used for these bounds
    // - lastBeg: index just past the first `://` in the url, or -1
    // - lastEnd: lastBeg + lastLen, or -1
    let lastLen = 0, lastBeg = -1, lastEnd = -1;

    // url: the URL being evaluated.
    // matchStart: index in url where the candidate match starts.
    // Returns true when matchStart falls inside [lastBeg, lastEnd) and is
    // either the very first character of that span or is immediately
    // preceded by a `.`.
    return (url, matchStart) => {
        const len = requestHostnameRegister.length;
        // Recompute when the hostname length changed, or when this url does
        // not have `://` ending exactly at the cached start position.
        if ( len !== lastLen || url.endsWith('://', lastBeg) === false ) {
            lastBeg = len !== 0 ? url.indexOf('://') : -1;
            if ( lastBeg !== -1 ) {
                lastBeg += 3;
                lastEnd = lastBeg + len;
            } else {
                lastEnd = -1;
            }
            // Remember which hostname length the bounds were computed for.
            // Without this the cache is never reused for a non-empty
            // hostname, and stale bounds can be reused for an empty one.
            lastLen = len;
        }
        return matchStart < lastEnd && (
            matchStart === lastBeg ||
            matchStart > lastBeg &&
                url.charCodeAt(matchStart - 1) === 0x2E /* '.' */
        );
    };
})();

/******************************************************************************/

// Local helpers

Expand Down Expand Up @@ -204,27 +235,6 @@ const toLogDataInternal = function(categoryBits, tokenHash, filter) {
return logData;
};

// First character of match must be within the hostname part of the url.
//
// https://github.com/gorhill/uBlock/issues/1929
// Match only hostname label boundaries.
const isHnAnchored = (function() {
    // Hostname for which the bounds below were last computed.
    let cachedHostname = '';
    // Start (inclusive) and stop (exclusive) of that hostname in the url,
    // or -1 for both when there is no hostname or it was not found.
    let hnStart = -1;
    let hnStop = -1;

    // Returns true when matchStart lies within [hnStart, hnStop) and is
    // either the first character of that span or directly follows a `.`.
    return function(url, matchStart) {
        const current = requestHostnameRegister;
        if ( current !== cachedHostname ) {
            hnStart = current === '' ? -1 : url.indexOf(current);
            hnStop = hnStart === -1 ? -1 : hnStart + current.length;
            cachedHostname = current;
        }
        if ( matchStart >= hnStop || matchStart < hnStart ) {
            return false;
        }
        if ( matchStart === hnStart ) {
            return true;
        }
        return url.charCodeAt(matchStart - 1) === 0x2E /* '.' */;
    };
})();

/*******************************************************************************
Each filter class will register itself in the map. A filter class
Expand Down Expand Up @@ -536,6 +546,52 @@ FilterPlainHnAnchored.prototype.trieableId = 1;

registerFilterClass(FilterPlainHnAnchored);

/*******************************************************************************
Filters with only one single occurrence of wildcard `*`
*/

const FilterWildcard1 = class {
    // s0: literal text located before the `*`
    // s1: literal text located after the `*`
    constructor(s0, s1) {
        this.s0 = s0;
        this.s1 = s1;
    }

    // A url matches when s0 is present and s1 is present somewhere at or
    // after the end of the first occurrence of s0.
    match(url) {
        const headPos = url.indexOf(this.s0);
        if ( headPos === -1 ) { return false; }
        const tailFrom = headPos + this.s0.length;
        return url.indexOf(this.s1, tailFrom) !== -1;
    }

    // Details used for logging: the raw pattern, its regex string form, and
    // the compiled representation.
    logData() {
        const pattern = `${this.s0}*${this.s1}`;
        return {
            raw: pattern,
            regex: rawToRegexStr(pattern, 0),
            compiled: this.compile()
        };
    }

    // Serialized form of this instance: [ class id, s0, s1 ].
    compile() {
        const { s0, s1 } = this;
        return [ this.fid, s0, s1 ];
    }

    // Returns a compiled array when `details` describes a pattern with a
    // non-zero-free anchor (details.anchor === 0) and exactly one `*` and no
    // `^`; returns undefined otherwise.
    static compile(details) {
        if ( details.anchor !== 0 ) { return; }
        const pattern = details.f;
        const starPos = pattern.indexOf('*');
        if ( starPos === -1 ) { return; }
        const tail = pattern.slice(starPos + 1);
        if ( reIsWildcarded.test(tail) ) { return; }
        const head = pattern.slice(0, starPos);
        if ( reIsWildcarded.test(head) ) { return; }
        return [ FilterWildcard1.fid, head, tail ];
    }

    // Rebuilds an instance from the array produced by compile().
    static load(args) {
        const [ , s0, s1 ] = args;
        return new FilterWildcard1(s0, s1);
    }
};

registerFilterClass(FilterWildcard1);

/******************************************************************************/

const FilterGeneric = class {
Expand Down Expand Up @@ -571,6 +627,8 @@ const FilterGeneric = class {
}

static compile(details) {
const compiled = FilterWildcard1.compile(details);
if ( compiled !== undefined ) { return compiled; }
return [ FilterGeneric.fid, details.f, details.anchor ];
}

Expand All @@ -583,6 +641,117 @@ FilterGeneric.prototype.re = null;

registerFilterClass(FilterGeneric);

/*******************************************************************************
Hostname-anchored filters with only one occurrence of wildcard `*`
*/

const FilterWildcard1HnAnchored = class {
    // s0: literal segment located before the `*`
    // s1: literal segment located after the `*`
    constructor(s0, s1) {
        this.s0 = s0;
        this.s1 = s1;
    }

    // A url matches when s0 is found at a position accepted by
    // isHnAnchored(), and s1 is found at or after the end of that s0.
    // NOTE(review): only the first occurrence of s0 in the url is tested
    // against isHnAnchored() -- confirm a later occurrence never needs to be
    // considered.
    match(url) {
        const pos = url.indexOf(this.s0);
        return pos !== -1 &&
               isHnAnchored(url, pos) &&
               url.indexOf(this.s1, pos + this.s0.length) !== -1;
    }

    // Details used for logging: the raw pattern (with its `||` prefix), its
    // regex string form, and the compiled representation.
    // NOTE(review): the regex string is built with an anchor argument of 0
    // -- confirm whether the hostname anchor should be passed instead.
    logData() {
        return {
            raw: `||${this.s0}*${this.s1}`,
            regex: rawToRegexStr(`${this.s0}*${this.s1}`, 0),
            compiled: this.compile()
        };
    }

    // Serialized form of this instance: [ class id, s0, s1 ].
    compile() {
        return [ this.fid, this.s0, this.s1 ];
    }

    // Returns a compiled array when `details.f` contains exactly one `*`
    // and no other `^`/`*`, except for a `^` immediately before the `*`, in
    // which case compilation is delegated to FilterWildcard2HnAnchored.
    // Returns undefined when the pattern does not qualify.
    static compile(details) {
        // Bail out when the lowest bit of the anchor is set (presumably the
        // right-anchor flag -- confirm against the filter parser). This mask
        // used to be spelled `0x0b001`, which is the hexadecimal value
        // 0xB001 and therefore also tested unrelated higher bits.
        if ( (details.anchor & 0b001) !== 0 ) { return; }
        const s = details.f;
        let pos = s.indexOf('*');
        if ( pos === -1 ) { return; }
        if ( reIsWildcarded.test(s.slice(pos + 1)) ) { return; }
        const needSeparator =
            pos !== 0 && s.charCodeAt(pos - 1) === 0x5E /* '^' */;
        // Step back over the `^` so that it is excluded from the first
        // segment and from the wildcard test below.
        if ( needSeparator ) { pos -= 1; }
        if ( reIsWildcarded.test(s.slice(0, pos)) ) { return; }
        if ( needSeparator ) {
            return FilterWildcard2HnAnchored.compile(details, pos);
        }
        return [
            FilterWildcard1HnAnchored.fid,
            s.slice(0, pos),
            s.slice(pos + 1),
        ];
    }

    // Rebuilds an instance from the array produced by compile().
    static load(args) {
        return new FilterWildcard1HnAnchored(args[1], args[2]);
    }
};

registerFilterClass(FilterWildcard1HnAnchored);

/*******************************************************************************
Hostname-anchored filters with one occurrence of the wildcard
sequence `^*` and no other wildcard-equivalent character
*/

const FilterWildcard2HnAnchored = class {
    // s0: literal segment located before the `^*` sequence
    // s1: literal segment located after the `^*` sequence
    constructor(s0, s1) {
        this.s0 = s0;
        this.s1 = s1;
    }

    // A url matches when:
    // - s0 is found at a position accepted by isHnAnchored();
    // - the character immediately following s0 is a separator, as per
    //   this.reSeparators (assigned on the prototype, outside the class
    //   body);
    // - s1 is found somewhere after that separator character.
    // NOTE(review): only the first occurrence of s0 in the url is tested
    // against isHnAnchored() -- confirm a later occurrence never needs to be
    // considered.
    match(url) {
        const pos0 = url.indexOf(this.s0);
        if ( pos0 === -1 || isHnAnchored(url, pos0) === false ) {
            return false;
        }
        const pos1 = pos0 + this.s0.length;
        // `^` stands for a single separator located right where s0 ends.
        // Testing the whole span between s0 and s1 would accept a separator
        // found anywhere in between, e.g. `||example.com^*ad.js` would match
        // `https://example.community/ad.js` through the `/`.
        if ( this.reSeparators.test(url.charAt(pos1)) === false ) {
            return false;
        }
        return url.indexOf(this.s1, pos1 + 1) !== -1;
    }

    // Details used for logging: the raw pattern (with its `||` prefix), its
    // regex string form, and the compiled representation.
    logData() {
        return {
            raw: `||${this.s0}^*${this.s1}`,
            regex: rawToRegexStr(`${this.s0}^*${this.s1}`, 0),
            compiled: this.compile()
        };
    }

    // Serialized form of this instance: [ class id, s0, s1 ].
    compile() {
        return [ this.fid, this.s0, this.s1 ];
    }

    // details.f: the filter pattern.
    // pos: index of the `^` in details.f; the two characters at pos and
    // pos + 1 are left out of the resulting segments.
    static compile(details, pos) {
        return [
            FilterWildcard2HnAnchored.fid,
            details.f.slice(0, pos),
            details.f.slice(pos + 2),
        ];
    }

    // Rebuilds an instance from the array produced by compile().
    static load(args) {
        return new FilterWildcard2HnAnchored(args[1], args[2]);
    }
};

FilterWildcard2HnAnchored.prototype.reSeparators = /[^0-9a-z.%_-]/;

registerFilterClass(FilterWildcard2HnAnchored);

/******************************************************************************/

const FilterGenericHnAnchored = class {
Expand Down Expand Up @@ -610,6 +779,8 @@ const FilterGenericHnAnchored = class {
}

static compile(details) {
const compiled = FilterWildcard1HnAnchored.compile(details);
if ( compiled !== undefined ) { return compiled; }
return [ FilterGenericHnAnchored.fid, details.f ];
}

Expand Down Expand Up @@ -1377,7 +1548,10 @@ const FilterBucket = class {
return true;
}
}
if ( this.plainHnAnchoredTrie !== null && isHnAnchored(url, tokenBeg) ) {
if (
this.plainHnAnchoredTrie !== null &&
isHnAnchored(url, tokenBeg)
) {
const pos = this.plainHnAnchoredTrie.matches(url, tokenBeg);
if ( pos !== -1 ) {
this.plainHnAnchoredFilter.s = url.slice(tokenBeg, pos);
Expand Down Expand Up @@ -1524,7 +1698,6 @@ const FilterParser = function() {
this.reHasUnicode = /[^\x00-\x7F]/;
this.reWebsocketAny = /^ws[s*]?(?::\/?\/?)?\*?$/;
this.reBadCSP = /(?:^|;)\s*report-(?:to|uri)\b/;
this.reIsWildcarded = /[\^\*]/;
this.domainOpt = '';
this.noTokenHash = µb.urlTokenizer.noTokenHash;
this.unsupportedTypeBit = this.bitFromType('unsupported');
Expand Down Expand Up @@ -1917,7 +2090,7 @@ FilterParser.prototype.parse = function(raw) {
this.anchor = 0;
}

this.wildcarded = this.reIsWildcarded.test(s);
this.wildcarded = reIsWildcarded.test(s);

// This might look weird but we gain memory footprint by not going through
// toLowerCase(), at least on Chromium. Because copy-on-write?
Expand Down Expand Up @@ -2985,6 +3158,36 @@ FilterContainer.prototype.bucketHistogram = function() {
- FilterPlainHnAnchored and FilterPlainPrefix1 are good candidates
for storing in a plain string trie.
As of 2019-04-25:
{"FilterPlainHnAnchored" => 11078}
{"FilterPlainPrefix1" => 7195}
{"FilterPrefix1Trie" => 5720}
{"FilterOriginHit" => 3561}
{"FilterWildcard2HnAnchored" => 2943}
{"FilterPair" => 2391}
{"FilterBucket" => 1922}
{"FilterWildcard1HnAnchored" => 1910}
{"FilterHnAnchoredTrie" => 1586}
{"FilterPlainHostname" => 1391}
{"FilterOriginHitSet" => 1155}
{"FilterPlain" => 634}
{"FilterWildcard1" => 423}
{"FilterGenericHnAnchored" => 389}
{"FilterOriginMiss" => 302}
{"FilterGeneric" => 163}
{"FilterOriginMissSet" => 150}
{"FilterRegex" => 124}
{"FilterPlainRightAnchored" => 110}
{"FilterGenericHnAndRightAnchored" => 95}
{"FilterHostnameDict" => 59}
{"FilterPlainLeftAnchored" => 30}
{"FilterJustOrigin" => 22}
{"FilterHTTPJustOrigin" => 19}
{"FilterHTTPSJustOrigin" => 18}
{"FilterExactMatch" => 5}
{"FilterOriginMixedSet" => 3}
*/

FilterContainer.prototype.filterClassHistogram = function() {
Expand Down

0 comments on commit 9939039

Please sign in to comment.