Skip to content

Commit

Permalink
add bench scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
pyloque committed Oct 27, 2018
1 parent 26101df commit 1c28655
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 35 deletions.
76 changes: 76 additions & 0 deletions index.bench.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
var FastScanner = require('./index')
// Alphabet that every random benchmark string is drawn from.
var chars = 'abcdefghijklmnopqrstuv'

/**
 * Build a random string out of the characters in `chars`.
 *
 * When `min == max` the string has exactly `min` characters. Otherwise the
 * length is a random integer of the form min + floor(random * (max - min)),
 * so for max > min it lies in [min, max) and `max` itself is never produced.
 *
 * @param {number} min - smallest possible length
 * @param {number} max - upper bound on the length (exclusive unless equal to min)
 * @returns {string} the generated string
 */
function randomString(min, max) {
  var len = min == max ? min : Math.floor(Math.random() * (max - min)) + min
  var out = ''
  // Append one uniformly chosen alphabet character per step.
  while (out.length < len) {
    out += chars.charAt(Math.floor(Math.random() * chars.length))
  }
  return out
}

/**
 * Generate a list of random words by calling randomString repeatedly.
 *
 * @param {number} wordNum - how many words to generate
 * @param {number} min - length bound forwarded to randomString as `min`
 * @param {number} max - length bound forwarded to randomString as `max`
 * @returns {string[]} array holding `wordNum` generated words
 */
function randomWords(wordNum, min, max) {
  var list = []
  while (list.length < wordNum) {
    list.push(randomString(min, max))
  }
  return list
}

/**
 * Benchmark how long it takes to construct a FastScanner.
 *
 * For each dictionary size a fresh set of random words is generated
 * (length bounds 10 and 20 are passed to randomWords), the constructor call
 * alone is timed with the millisecond wall clock, and one line is logged.
 */
function benchBuild() {
  var minLen = 10
  var maxLen = 20
  var sizes = [20000, 40000, 60000, 80000, 100000]
  sizes.forEach(function (size) {
    var words = randomWords(size, minLen, maxLen)
    var startedAt = Date.now()
    // Only construction is measured; the instance itself is not used afterwards.
    new FastScanner(words)
    var elapsed = Date.now() - startedAt
    console.log("build ac tree of %d words costs %dms", words.length, elapsed)
  })
}

/**
 * Benchmark search time across dictionary sizes and article lengths.
 *
 * Five random articles (20000 to 100000 characters) are generated once up
 * front. Then, for every dictionary size, one scanner is built from fresh
 * random words and each article is searched with it; only the search() call
 * is timed, using the millisecond wall clock. One line is logged per
 * (dictionary size, article) pair.
 */
function benchSearch() {
  var minLen = 10
  var maxLen = 20
  var sizes = [20000, 40000, 60000, 80000, 100000]
  // Passing the same value as min and max yields an article of exactly that length.
  var articles = [20000, 40000, 60000, 80000, 100000].map(function (len) {
    return randomString(len, len)
  })
  sizes.forEach(function (size) {
    var scanner = new FastScanner(randomWords(size, minLen, maxLen))
    articles.forEach(function (article) {
      var startedAt = Date.now()
      scanner.search(article)
      var elapsed = Date.now() - startedAt
      console.log("search article of %d chars by %s words tree costs %dms", article.length, size, elapsed)
    })
  })
}

/**
 * Benchmark the memory retained by a FastScanner for several dictionary sizes.
 *
 * For each size, a garbage collection is forced before and after building
 * the scanner and the difference in process.memoryUsage() (rss, heapTotal,
 * heapUsed) is logged in whole MiB.
 *
 * The global gc() function only exists when node is started with
 * --expose-gc. Without it this benchmark is skipped with a message instead
 * of aborting the whole script with a ReferenceError.
 */
function benchMemory() {
  if (typeof gc !== 'function') {
    console.log("skip memory bench: gc() is not available, run node with --expose-gc")
    return
  }
  // Bytes -> whole MiB, rounded down. Division is used instead of `>> 20`
  // so that deltas outside the signed 32-bit range do not wrap around.
  function toMiB(bytes) {
    return Math.floor(bytes / 1048576)
  }
  var wordLen = [10, 20]
  var wordNums = [0, 20000, 40000, 60000, 80000, 100000]
  for (var i = 0; i < wordNums.length; i++) {
    var words = randomWords(wordNums[i], wordLen[0], wordLen[1])
    gc()
    var before = process.memoryUsage()
    var scanner = new FastScanner(words)
    gc()
    var after = process.memoryUsage()
    // NOTE(review): presumably this call exists to keep `scanner` referenced
    // past the second gc() so the structure is not collected before `after`
    // is sampled; the search result itself is discarded.
    scanner.search('abcdefg')
    console.log("build tree of %d words costs rss=%dM heapTotal=%dM heapUsed=%dM", wordNums[i], toMiB(after.rss - before.rss), toMiB(after.heapTotal - before.heapTotal), toMiB(after.heapUsed - before.heapUsed))
  }
}

// Run the benchmarks one after another: build time, search time, memory use.
var benchmarks = [benchBuild, benchSearch, benchMemory]
benchmarks.forEach(function (run) {
  run()
})
64 changes: 33 additions & 31 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
var root = {
next: {}, // 子节点指针
val: null, // 当前节点的字符,null表示根节点
back: null, // 回溯指针,也称失败指针
back: null, // 跳跃指针,也称失败指针
parent: null, // 父节点指针,
depth: 0, // 节点深度
accept: false // 是否形成了一个完整的词汇,中间节点也可能为true
Expand Down Expand Up @@ -78,7 +78,7 @@
var parent = node.parent
var back = parent.back
while(back != null) {
// 匹配父节点的回溯节点的子节点
// 匹配父节点的跳跃节点的子节点
var child = back.next[node.val]
if (child) {
node.back = child
Expand All @@ -97,15 +97,14 @@
var c = word[i]
var parent = current.parent
var back = parent.back
// 第一层节点也谈不上回溯
if (back == null) {
current = current.next[c]
continue;
}
// 匹配父节点的回溯节点的子节点
var child = back.next[current.val]
if (child) {
current.back = child
while (back != null) {
// 匹配父节点的跳跃节点的子节点
var child = back.next[current.val]
if (child) {
current.back = child
break
}
back = back.back
}
current = current.next[c]
}
Expand Down Expand Up @@ -136,8 +135,6 @@
return
}
addWord(this.root, word)
// var util = require('util')
// console.log(util.inspect(this.root, null, 8))
fallback(this.root, word)
}

Expand Down Expand Up @@ -182,7 +179,26 @@
for (var i = 0; i < content.length;i++) {
var c = content[i];
var next = current.next[c];
if (next) {
if(!next) {
// 递归匹配跳跃节点的子节点
var back = current.back
while(back != null) {
if(back.accept) {
var word = collect(back)
offWords.push([i - word.length, word]);
// 只选第一个词
if (options.quick) {
return offWords
}
}
next = back.next[c]
if(next) {
break
}
back = back.back
}
}
if(next) {
current = next;
// 收集匹配的词汇
if (current.accept) {
Expand All @@ -193,24 +209,10 @@
return offWords
}
}
continue;
}
var back = current.back;
if (back == null || back == this.root) {
current = this.root
continue;
}
// 跳跃
current = back;
// 收集匹配的词汇
if (current.accept) {
var word = collect(current)
offWords.push([i - word.length, word]);
// 只选第一个词
if (options.quick) {
return offWords
}
continue
}
// 重置
current = this.root
}
// 同一个位置选最长的
if (options.longest) {
Expand Down
5 changes: 3 additions & 2 deletions index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,18 @@ describe('测试叠加词汇', function () {
var scanner = new FastScanner(["近平", "习近平棒", "习近平好"])
var content = "习近平拽"
var offWords = scanner.search(content)
console.log(offWords)
assert.deepEqual([[1, '近平']], offWords)
});
it('扫的狠一点', function () {
var scanner = new FastScanner(["近平", "习近平", "习近平好"])
var content = "我不说习近平好,也不是习近平坏"
var offWords = scanner.search(content)
assert.deepEqual([[3, '习近平'], [3, '习近平好'], [4, '近平'], [11, '习近平'], [12, '近平']], offWords)
assert.deepEqual([[3, '习近平'], [3, '习近平好'], [11, '习近平'], [12, '近平']], offWords)
var offWords = scanner.search(content, { quick: true })
assert.deepEqual([[3, '习近平']], offWords)
var offWords = scanner.search(content, { longest: true })
assert.deepEqual([[3, '习近平好'], [4, '近平'], [11, '习近平'], [12, '近平']], offWords)
assert.deepEqual([[3, '习近平好'], [11, '习近平'], [12, '近平']], offWords)
});
});
describe('动态增加词汇', function () {
Expand Down
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
{
"name": "fastscan",
"version": "1.0.4",
"version": "1.0.5",
"description": "quickly search by ahocorasick algorithm ",
"main": "index.min.js",
"scripts": {
"test": "mocha index.test.js"
"test": "mocha index.test.js",
"bench": "node --expose-gc index.bench.js"
},
"repository": {
"type": "git",
Expand Down

0 comments on commit 1c28655

Please sign in to comment.