diff --git a/conf.example.json b/conf.example.json index df4e6c0..7fc7e95 100644 --- a/conf.example.json +++ b/conf.example.json @@ -1,8 +1,8 @@ { - "workers": 2, + "workers": 5, "seedUrl": "http://stackoverflow.com/", "database": { - "name": "web-crawler-stackoverflow", + "name": "web-crawler-so", "rebuild": true }, "jobs": [ @@ -11,4 +11,4 @@ { "name": "driller", "domainRestriction": "stackoverflow.com" }, { "name": "scheduler" } ] -} \ No newline at end of file +} diff --git a/webcrawler/agent.js b/webcrawler/agent.js index a9e3882..431cf60 100755 --- a/webcrawler/agent.js +++ b/webcrawler/agent.js @@ -95,6 +95,8 @@ agent.init = function( options ) { agent.queue = function( url, data ) { var self = this; + console.log( url ); + process.nextTick(function(){ var task = Url.parse( url ); if( data != undefined ) { @@ -161,7 +163,12 @@ agent.followRedirect = function( res, task, callback ) { } else { process.nextTick(function() { - agent.worker( task, callback ); + try { + agent.worker( task, callback ); + } + catch( e ) { + agent.onError({message: e.message}, task, callback); + } }); } } @@ -176,8 +183,7 @@ agent.followRedirect = function( res, task, callback ) { } agent.onRequest = function( res, task, callback ) { - var self = this, - data = ''; + var self = this; if( this.followRedirect( res, task, callback ) ) { return; @@ -191,12 +197,16 @@ agent.onRequest = function( res, task, callback ) { } res.on('data', function (chunk) { - data += chunk; + if( this._data == undefined ) { + this._data = ''; + } + + this._data += chunk; }); res.on('end', function() { + var data = this._data; self.handleData( data, task, res, callback ); - data = null; }); } @@ -219,10 +229,6 @@ agent.handleData = function( data, task, res, callback ) { } async.series( chain, callback ); - - res = null; - chain = null; - data = null; } agent.getJobFunction = function( job, data, env ) { diff --git a/webcrawler/job/driller.js b/webcrawler/job/driller.js index 1ceec18..0b1c541 100755 --- a/webcrawler/job/driller.js +++ b/webcrawler/job/driller.js @@ -129,11 +129,17 @@ Driller.prototype.isValidUrl = function( url ) { /** * Ignore links like "mailto:" or "javascript:" */ + + /* var protocol = url.match( /^[\ ]*([a-zA-Z0-0]+):/ ); if( protocol && protocol[1].toLowerCase() != 'http' && protocol[1].toLowerCase() != 'https' ) { return false; } + */ + if( ! url.match( /^http[s]{0,1}:\/\//i ) ) { + return false; + } if( this.options.domain && ! url.match( this.options.domain ) ) { return false;