Skip to content

Commit

Permalink
fix link duplication issue
Browse files Browse the repository at this point in the history
  • Loading branch information
Lukasz authored and Lukasz committed Nov 10, 2013
1 parent 290cacb commit d15d8cc
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 15 deletions.
10 changes: 8 additions & 2 deletions test/webcrawler/job/driller.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ describe('Driller', function(){
env: 'http://www.testing.com/download.php?a=10',
expect: 'http://www.testing.com/privacy.php' },

{ input: '/hello/id/5?_a=1#comment=1023',
env: 'http://www.testing.com/example/id/1',
expect: 'http://www.testing.com/hello/id/5?_a=1' },

];

for( i in tests ) {
Expand Down Expand Up @@ -119,9 +123,10 @@ describe('Driller', function(){
});

describe('#execute()', function(){

it('should drill urls only from http://*example.com/', function( done ) {
var VisittedUrls = require('../../../webcrawler/visittedurls');

it('should drill urls only from http://*example.com/', function( done ) {
VisittedUrls.setLinks( [] );
var env = helper.getEnv( 'http://www.example.com/view/1' );
drillerExecute( '/html/site01.html', 'example.com', env, function(docs){
assert.equal( docs.length, 6 );
Expand All @@ -131,6 +136,7 @@ describe('Driller', function(){
});

it('should attach url source to all documents', function(done){
VisittedUrls.setLinks( [] );
var env = helper.getEnv( 'http://www.example.com/view/1' );

env.task.data = { source: [ 'http://www.example.com/1', 'http://www.example.com/2'] };
Expand Down
26 changes: 16 additions & 10 deletions webcrawler/job/driller.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ var async = require( 'async' );
var UrlDoc = require( '../storage/doc/urldoc' );
var cheerio = require('cheerio');
var UrlTool = require('../utils/urltool');
var VisittedUrls = require('../visittedurls');

exports = module.exports = Driller;

Expand All @@ -18,15 +19,14 @@ function Driller(options) {
maxDepth: false
};

this._links = {};
this.initOptions( options );
}

Driller.prototype.execute = function(callback, data, env) {
if( env.res.headers['content-type'] == undefined || ! env.res.headers['content-type'].match( /^text\/html/) ) {
return callback();
}

if( this.options.maxDepth !== false && env.task.data != undefined && env.task.data.source != undefined) {
if( env.task.data.source.length >= this.options.maxDepth ) {
return callback();
Expand Down Expand Up @@ -77,6 +77,9 @@ Driller.prototype.initOptions = function( options ) {
if( i == "domainRestriction" ) {
this.setDomainRestriction( options[ i ] );
}
else if( i == "patterns" ) {
this.setPatterns( options[ i ] );
}
else {
this.options[ i ] = options[ i ];
}
Expand All @@ -90,6 +93,12 @@ Driller.prototype.setDomainRestriction = function( domain ) {
this.options[ "domain" ] = new RegExp( domain, 'i' );
}

/**
 * Compiles every supplied pattern into a RegExp and appends it to
 * `this.options.patterns`.
 *
 * @param {Array|Object} patterns Collection whose values are handed to
 *                                `new RegExp(...)`. A missing collection
 *                                (undefined/null) is a no-op.
 */
Driller.prototype.setPatterns = function( patterns ) {
	// Declare the key locally: an undeclared loop variable becomes an
	// implicit global (shared with every other loop that does the same)
	// and throws a ReferenceError in strict mode.
	var key;

	for( key in patterns ) {
		// Skip enumerable members inherited through the prototype chain;
		// only the caller's own entries are patterns.
		if( ! Object.prototype.hasOwnProperty.call( patterns, key ) ) {
			continue;
		}

		this.options.patterns.push( new RegExp( patterns[ key ] ) );
	}
};

Driller.prototype.getOverwrite = function( url ) {
for( i in this.options.overwrite ) {
var rule = this.options.overwrite[ i ];
Expand Down Expand Up @@ -140,14 +149,11 @@ Driller.prototype.isValidUrl = function( url ) {
}
}

/**
* @todo: release some data to avoid memory leak
*/
if( this._links[ url ] == undefined ) {
this._links[ url ] = 1;
return true;
if( VisittedUrls.exists( url ) ) {
return false;
}

this._links[ url ] += 1;
return false;
VisittedUrls.add( url );

return true;
}
33 changes: 30 additions & 3 deletions webcrawler/job/solr.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,22 @@ exports = module.exports = Solr;
function Solr(options) {
for( i in options.rules ) {
if( options.rules[ i ].filter != undefined ) {
options.rules[ i ].filter.pattern = new RegExp( options.rules[ i ].filter.pattern, 'g' );
if( options.rules[ i ].filter[0] != undefined ) {
for( y in options.rules[ i ].filter ) {
options.rules[ i ].filter[ y ].pattern = new RegExp( options.rules[ i ].filter[ y ].pattern, 'g' );
}
}
else {
options.rules[ i ].filter.pattern = new RegExp( options.rules[ i ].filter.pattern, 'g' );
}
}
}

if( options.urlPattern != undefined ) {
options.urlPattern = new RegExp( options.urlPattern );
}
else {
options.urlPattern = false;
}

this.options = options;
Expand Down Expand Up @@ -58,14 +71,28 @@ Solr.prototype.applyFilter = function( str, rule ) {
return str;
}

return str.replace( rule.filter.pattern, rule.filter.replacement );
if( rule.filter[ 0 ] == undefined ) {
return str.replace( rule.filter.pattern, rule.filter.replacement );
}

for( i in rule.filter ) {
str = str.replace( rule.filter[ i ].pattern, rule.filter[ i ].replacement );
}

return str;
}

Solr.prototype.execute = function(callback, data, env) {
return callback();

if( env.res.headers['content-type'] == undefined || ! env.res.headers['content-type'].match( /^text\/html/) ) {
return callback();
}

if( this.options.urlPattern && ! env.task.href.match( this.options.urlPattern ) ) {
return callback();
}

var $ = cheerio.load( data ),
self = this,
saveDoc = false,
Expand All @@ -87,7 +114,7 @@ Solr.prototype.execute = function(callback, data, env) {
if( doc[ rule.field ] == undefined ) {
doc[ rule.field ] = content;
}
else if( typeof( doc ) == 'string' ) {
else if( typeof( doc[ rule.field ] ) == 'string' ) {
doc[ rule.field ] = [ doc[ rule.field ], content ];
}
else {
Expand Down
1 change: 1 addition & 0 deletions webcrawler/utils/urltool.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ UrlTool.nomalise = function( url, env, plugins ) {
}

url = url.replace( /([^:])\/[\/]+/, '$1/' );
url = url.replace( /#.*$/, '' );

return url;
}
15 changes: 15 additions & 0 deletions webcrawler/visittedurls.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
 * Registry of URLs that have already been seen by the crawler.
 *
 * The state lives on the exported object itself (`VisittedUrls.links`), so
 * it is shared by everything that holds a reference to this export.
 */
var VisittedUrls = exports = module.exports = {};

// Map of url -> 1 for every URL recorded so far.
VisittedUrls.links = {};

/**
 * Records a URL as seen.
 *
 * @param {string} url URL to remember.
 */
VisittedUrls.add = function( url ) {
	VisittedUrls.links[ url ] = 1;
};

/**
 * Tells whether a URL has been recorded.
 *
 * Only the registry's own keys count: a plain property read would also
 * match members inherited from Object.prototype (e.g. "constructor",
 * "toString") and wrongly report them as already seen.
 *
 * @param {string} url URL to look up.
 * @returns {boolean} true when the URL was recorded before.
 */
VisittedUrls.exists = function( url ) {
	return Object.prototype.hasOwnProperty.call( VisittedUrls.links, url );
};

/**
 * Replaces the whole registry.
 *
 * @param {Object|Array} links Either a url -> value map (stored as is) or
 *                             an array of URLs (each entry is recorded as
 *                             seen). A missing value resets the registry
 *                             to empty.
 */
VisittedUrls.setLinks = function( links ) {
	var map, idx;

	if( Array.isArray( links ) ) {
		// An array holds URLs as values, not as keys; index them so that
		// exists() can find them.
		map = {};
		for( idx = 0; idx < links.length; idx++ ) {
			map[ links[ idx ] ] = 1;
		}
		VisittedUrls.links = map;
		return;
	}

	// Fall back to an empty map so add()/exists() never hit undefined.
	VisittedUrls.links = links || {};
};

0 comments on commit d15d8cc

Please sign in to comment.