From d15d8cce27efd8720f9bb3955dbeb91f4f741378 Mon Sep 17 00:00:00 2001 From: Lukasz Date: Sun, 10 Nov 2013 19:19:15 +0000 Subject: [PATCH] fix link duplication issue --- test/webcrawler/job/driller.js | 10 ++++++++-- webcrawler/job/driller.js | 26 ++++++++++++++++---------- webcrawler/job/solr.js | 33 ++++++++++++++++++++++++++++++--- webcrawler/utils/urltool.js | 1 + webcrawler/visittedurls.js | 15 +++++++++++++++ 5 files changed, 70 insertions(+), 15 deletions(-) create mode 100644 webcrawler/visittedurls.js diff --git a/test/webcrawler/job/driller.js b/test/webcrawler/job/driller.js index bbd649e..1c3323b 100644 --- a/test/webcrawler/job/driller.js +++ b/test/webcrawler/job/driller.js @@ -91,6 +91,10 @@ describe('Driller', function(){ env: 'http://www.testing.com/download.php?a=10', expect: 'http://www.testing.com/privacy.php' }, + { input: '/hello/id/5?_a=1#comment=1023', + env: 'http://www.testing.com/example/id/1', + expect: 'http://www.testing.com/hello/id/5?_a=1' }, + ]; for( i in tests ) { @@ -119,9 +123,10 @@ describe('Driller', function(){ }); describe('#execute()', function(){ - - it('should drill urls only from http://*example.com/', function( done ) { + var VisittedUrls = require('../../../webcrawler/visittedurls'); + it('should drill urls only from http://*example.com/', function( done ) { + VisittedUrls.setLinks( [] ); var env = helper.getEnv( 'http://www.example.com/view/1' ); drillerExecute( '/html/site01.html', 'example.com', env, function(docs){ assert.equal( docs.length, 6 ); @@ -131,6 +136,7 @@ describe('Driller', function(){ }); it('should attach url source to all documents', function(done){ + VisittedUrls.setLinks( [] ); var env = helper.getEnv( 'http://www.example.com/view/1' ); env.task.data = { source: [ 'http://www.example.com/1', 'http://www.example.com/2'] }; diff --git a/webcrawler/job/driller.js b/webcrawler/job/driller.js index 5340063..04f7923 100755 --- a/webcrawler/job/driller.js +++ b/webcrawler/job/driller.js @@ -2,6 +2,7 @@ var async = require( 'async' ); var UrlDoc = require( '../storage/doc/urldoc' ); var cheerio = require('cheerio'); var UrlTool = require('../utils/urltool'); +var VisittedUrls = require('../visittedurls'); exports = module.exports = Driller; @@ -18,7 +19,6 @@ function Driller(options) { maxDepth: false }; - this._links = {}; this.initOptions( options ); } @@ -26,7 +26,7 @@ Driller.prototype.execute = function(callback, data, env) { if( env.res.headers['content-type'] == undefined || ! env.res.headers['content-type'].match( /^text\/html/) ) { return callback(); } - + if( this.options.maxDepth !== false && env.task.data != undefined && env.task.data.source != undefined) { if( env.task.data.source.length >= this.options.maxDepth ) { return callback(); @@ -77,6 +77,9 @@ Driller.prototype.initOptions = function( options ) { if( i == "domainRestriction" ) { this.setDomainRestriction( options[ i ] ); } + else if( i == "patterns" ) { + this.setPatterns( options[ i ] ); + } else { this.options[ i ] = options[ i ]; } @@ -90,6 +93,12 @@ Driller.prototype.setDomainRestriction = function( domain ) { this.options[ "domain" ] = new RegExp( domain, 'i' ); } +Driller.prototype.setPatterns = function( patterns ) { + for( i in patterns ) { + this.options.patterns.push( new RegExp( patterns[ i ] ) ); + } +} + Driller.prototype.getOverwrite = function( url ) { for( i in this.options.overwrite ) { var rule = this.options.overwrite[ i ]; @@ -140,14 +149,11 @@ Driller.prototype.isValidUrl = function( url ) { } } - /** - * @todo: release some data to avoid memory leak - */ - if( this._links[ url ] == undefined ) { - this._links[ url ] = 1; - return true; + if( VisittedUrls.exists( url ) ) { + return false; } - this._links[ url ] += 1; - return false; + VisittedUrls.add( url ); + + return true; } \ No newline at end of file diff --git a/webcrawler/job/solr.js b/webcrawler/job/solr.js index d9bf3ce..c2c8982 100644 --- a/webcrawler/job/solr.js +++ b/webcrawler/job/solr.js @@ -6,9 +6,22 @@ exports = module.exports = Solr; function Solr(options) { for( i in options.rules ) { if( options.rules[ i ].filter != undefined ) { - options.rules[ i ].filter.pattern = new RegExp( options.rules[ i ].filter.pattern, 'g' ); + if( options.rules[ i ].filter[0] != undefined ) { + for( y in options.rules[ i ].filter ) { + options.rules[ i ].filter[ y ].pattern = new RegExp( options.rules[ i ].filter[ y ].pattern, 'g' ); + } + } + else { + options.rules[ i ].filter.pattern = new RegExp( options.rules[ i ].filter.pattern, 'g' ); + } } + } + if( options.urlPattern != undefined ) { + options.urlPattern = new RegExp( options.urlPattern ); + } + else { + options.urlPattern = false; } this.options = options; @@ -58,14 +71,28 @@ Solr.prototype.applyFilter = function( str, rule ) { return str; } - return str.replace( rule.filter.pattern, rule.filter.replacement ); + if( rule.filter[ 0 ] == undefined ) { + return str.replace( rule.filter.pattern, rule.filter.replacement ); + } + + for( i in rule.filter ) { + str = str.replace( rule.filter[ i ].pattern, rule.filter[ i ].replacement ); + } + + return str; } Solr.prototype.execute = function(callback, data, env) { + return callback(); + if( env.res.headers['content-type'] == undefined || ! env.res.headers['content-type'].match( /^text\/html/) ) { return callback(); } + if( this.options.urlPattern && ! env.task.href.match( this.options.urlPattern ) ) { + return callback(); + } + var $ = cheerio.load( data ), self = this, saveDoc = false, @@ -87,7 +114,7 @@ Solr.prototype.execute = function(callback, data, env) { if( doc[ rule.field ] == undefined ) { doc[ rule.field ] = content; } - else if( typeof( doc ) == 'string' ) { + else if( typeof( doc[ rule.field ] ) == 'string' ) { doc[ rule.field ] = [ doc[ rule.field ], content ]; } else { diff --git a/webcrawler/utils/urltool.js b/webcrawler/utils/urltool.js index ac7b954..7f638d9 100644 --- a/webcrawler/utils/urltool.js +++ b/webcrawler/utils/urltool.js @@ -31,6 +31,7 @@ UrlTool.nomalise = function( url, env, plugins ) { } url = url.replace( /([^:])\/[\/]+/, '$1/' ); + url = url.replace( /#.*$/, '' ); return url; } \ No newline at end of file diff --git a/webcrawler/visittedurls.js b/webcrawler/visittedurls.js new file mode 100644 index 0000000..a797c23 --- /dev/null +++ b/webcrawler/visittedurls.js @@ -0,0 +1,15 @@ +var VisittedUrls = exports = module.exports = {}; + +VisittedUrls.links = {}; + +VisittedUrls.add = function( url ) { + VisittedUrls.links[ url ] = 1; +} + +VisittedUrls.exists = function( url ) { + return VisittedUrls.links[ url ] != undefined; +} + +VisittedUrls.setLinks = function( links ) { + VisittedUrls.links = links; +} \ No newline at end of file