Skip to content

Commit

Permalink
update
Browse files: browse the repository at this point in the history
  • Loading branch information
Lukasz authored and Lukasz committed Nov 14, 2013
1 parent 34dcbf3 commit 1e7e138
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 12 deletions.
6 changes: 3 additions & 3 deletions conf.example.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"workers": 2,
"workers": 5,
"seedUrl": "http://stackoverflow.com/",
"database": {
"name": "web-crawler-stackoverflow",
"name": "web-crawler-so",
"rebuild": true
},
"jobs": [
Expand All @@ -11,4 +11,4 @@
{ "name": "driller", "domainRestriction": "stackoverflow.com" },
{ "name": "scheduler" }
]
}
}
24 changes: 15 additions & 9 deletions webcrawler/agent.js
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ agent.init = function( options ) {
agent.queue = function( url, data ) {
var self = this;

console.log( url );

process.nextTick(function(){
var task = Url.parse( url );
if( data != undefined ) {
Expand Down Expand Up @@ -161,7 +163,12 @@ agent.followRedirect = function( res, task, callback ) {
}
else {
process.nextTick(function() {
agent.worker( task, callback );
try {
agent.worker( task, callback );
}
catch( e ) {
agent.onError({message: e.message}, task, callback);
}
});
}
}
Expand All @@ -176,8 +183,7 @@ agent.followRedirect = function( res, task, callback ) {
}

agent.onRequest = function( res, task, callback ) {
var self = this,
data = '';
var self = this;

if( this.followRedirect( res, task, callback ) ) {
return;
Expand All @@ -191,12 +197,16 @@ agent.onRequest = function( res, task, callback ) {
}

res.on('data', function (chunk) {
data += chunk;
if( this._data == undefined ) {
this._data = '';
}

this._data += chunk;
});

res.on('end', function() {
var data = this._data;
self.handleData( data, task, res, callback );
data = null;
});
}

Expand All @@ -219,10 +229,6 @@ agent.handleData = function( data, task, res, callback ) {
}

async.series( chain, callback );

res = null;
chain = null;
data = null;
}

agent.getJobFunction = function( job, data, env ) {
Expand Down
6 changes: 6 additions & 0 deletions webcrawler/job/driller.js
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,17 @@ Driller.prototype.isValidUrl = function( url ) {
/**
* Ignore links like "mailto:" or "javascript:"
*/

/*
var protocol = url.match( /^[\ ]*([a-zA-Z0-0]+):/ );
if( protocol && protocol[1].toLowerCase() != 'http' && protocol[1].toLowerCase() != 'https' ) {
return false;
}
*/

if( ! url.match( /^http[s]{0,1}:\/\//i ) ) {
return false;
}

if( this.options.domain && ! url.match( this.options.domain ) ) {
return false;
Expand Down

0 comments on commit 1e7e138

Please sign in to comment.