Skip to content

Commit

Permalink
more flexible configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
Lukasz authored and Lukasz committed Nov 13, 2013
1 parent a02c096 commit 17c0d45
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 34 deletions.
41 changes: 41 additions & 0 deletions test/webcrawler/job/solr.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
var assert = require( "assert" )
var helper = require( "../../helper" );
var fs = require('fs');
var Solr = require( '../../../webcrawler/job/solr' );
var cheerio = require('cheerio');

/**
 * Test helper: load an HTML fixture and run a Solr job against it.
 *
 * The job's `save` method is replaced by `callback`, so nothing is sent to a
 * real Solr instance; `callback` is also invoked when `execute` completes,
 * receiving whatever arguments `execute` passes to its completion callback.
 *
 * @param {string}   testFile  fixture path, appended to __dirname
 * @param {Object}   options   options handed to the Solr constructor
 * @param {Object}   env       environment object forwarded to execute()
 * @param {Function} callback  receives saved documents and the completion call
 * @throws {Error} the read error reported by fs.readFile, unchanged
 */
function solrExecute( testFile, options, env, callback ) {
    var solr = new Solr(options);

    fs.readFile( __dirname + testFile , 'utf8', function (err, html) {
        // JavaScript has no `Exception` constructor; rethrow the original
        // error so the failure reason (e.g. ENOENT) is not masked.
        if( err ) { throw err; }

        solr.save = callback;

        solr.execute( function(){
            // Forward the completion arguments instead of reading an
            // undeclared `docs` variable.
            callback.apply( null, arguments );
        }, cheerio.load( html ), env );

    });

}

describe('Solr', function(){

    describe('#getRules', function(){

        // The assertion lives inside an it() so that a failure is reported as
        // a failing test case rather than thrown while the suite is collected.
        it('returns the rules passed to the constructor', function(){
            var solr = new Solr({
                rules: [
                    { "selector": "div a",
                      "attribute": false,
                      "field": "title" }
                ]
            });

            assert.equal( solr.getRules().length, 1 );
        });

    });

});
61 changes: 50 additions & 11 deletions webcrawler/agent.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,24 +44,45 @@ agent._run = function( config ) {
function (err, result) {
var workersCount = config.getWorkers();
var seedUrl = config.getSeedUrl();

if( ! seedUrl ) {
var scheduler = new Scheduler();
var env = { agent: self };
}
for( var i = 0 ; i < workersCount ; i++ ) {
setTimeout(function () {
if( seedUrl ) {
agent.queue( seedUrl );
}
else {
scheduler.execute( false, false, env );
}
}, 500 * i );
}

if( typeof( seedUrl ) == 'object' ) {
self._runFromArray( seedUrl, 500 );
}
else {
self._runFromString( seedUrl, workersCount, 500 );
}
}
);
}

/**
 * Queue every seed URL of a collection, staggering the calls in time.
 *
 * The n-th entry (counting from 0) is queued after `delay * n` milliseconds.
 *
 * @param {Array|Object} seedUrl  collection of seed URLs; a falsy value
 *                                (e.g. null, whose typeof is also 'object')
 *                                queues nothing
 * @param {number}       delay    milliseconds between consecutive entries
 */
agent._runFromArray = function(seedUrl, delay) {
    if( ! seedUrl ) {
        return;
    }

    // Object.keys visits own keys only and needs no loop variable, so nothing
    // leaks into the global scope and each timer closes over its own key.
    Object.keys( seedUrl ).forEach(function( key, position ) {
        setTimeout(function () {
            agent.queue( seedUrl[ key ] );
        }, delay * position );
    });
}

/**
 * Start `workersCount` workers, one every `delay` milliseconds.
 *
 * With a seed URL each worker queues that URL. Without one, each worker asks
 * a scheduler for work instead.
 *
 * @param {string} [seedUrl]     URL to queue; falsy means "use the scheduler"
 * @param {number} workersCount  number of timers to start
 * @param {number} delay         milliseconds between consecutive workers
 */
agent._runFromString = function(seedUrl, workersCount, delay ) {
    var scheduler = null;
    var env = null;

    // `scheduler` and `env` used to be locals of the caller and were not
    // visible here, so the no-seed branch threw a ReferenceError. Build them
    // once and share them between all workers of this run.
    if( ! seedUrl ) {
        scheduler = new Scheduler();
        env = { agent: agent };
    }

    for( var i = 0 ; i < workersCount ; i++ ) {
        setTimeout(function () {
            if( seedUrl ) {
                agent.queue( seedUrl );
            }
            else {
                scheduler.execute( false, false, env );
            }
        }, delay * i );
    }
}

agent.init = function( options ) {
var self = this;
this.options =
Expand Down Expand Up @@ -128,7 +149,21 @@ agent.followRedirect = function( res, task, callback ) {
source.push( task.href );
task = Url.parse( UrlTool.nomalise( res.headers['location'], env ) );
task.source = source;
this.worker( task, callback );

if( task.redirectTTL == undefined ) {
task.redirectTTL = 30;
}

task.redirectTTL -= 1;

if( task.redirectTTL == 0 ) {
agent.onError({message: "Broken redirection loop"}, task, callback );
}
else {
process.nextTick(function() {
agent.worker( task, callback );
});
}
}
catch( e ) {
callback();
Expand Down Expand Up @@ -184,6 +219,10 @@ agent.handleData = function( data, task, res, callback ) {
}

async.series( chain, callback );

res = null;
chain = null;
data = null;
}

agent.getJobFunction = function( job, data, env ) {
Expand Down
2 changes: 2 additions & 0 deletions webcrawler/job/driller.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ Driller.prototype.execute = function(callback, $, env) {
});

async.parallel( docs, callback );

docs = null;
}

Driller.prototype.addOverwriteRule = function( rule ) {
Expand Down
98 changes: 75 additions & 23 deletions webcrawler/job/solr.js
Original file line number Diff line number Diff line change
@@ -1,23 +1,17 @@
var cheerio = require('cheerio');
var solr = require('solr-client');
var async = require( 'async' );

exports = module.exports = Solr;

function Solr(options) {
for( i in options.rules ) {
if( options.rules[ i ].filter != undefined ) {
if( options.rules[ i ].filter[0] != undefined ) {
for( y in options.rules[ i ].filter ) {
options.rules[ i ].filter[ y ].pattern = new RegExp( options.rules[ i ].filter[ y ].pattern, 'g' );
}
}
else {
options.rules[ i ].filter.pattern = new RegExp( options.rules[ i ].filter.pattern, 'g' );
}
this.initFilters( options.rules[ i ] );
}
}

if( options.urlPattern != undefined ) {
if( options.urlPattern != undefined && options.urlPattern != false ) {
options.urlPattern = new RegExp( options.urlPattern );
}
else {
Expand All @@ -27,6 +21,33 @@ function Solr(options) {
this.options = options;
}

/**
 * Prepare the filter(s) attached to a rule.
 *
 * `rule.filter` is either a single filter object or a list of them; a value
 * with a defined element at index 0 is treated as a list.
 *
 * @param {Object} rule  rule whose `filter` property is defined
 */
Solr.prototype.initFilters = function( rule ) {
    var self = this;

    if( rule.filter[0] != undefined ) {
        // Own keys only, and no undeclared loop variable leaking into the
        // global scope (which is also a ReferenceError in strict mode).
        Object.keys( rule.filter ).forEach(function( key ) {
            self._initFilter( rule.filter[ key ] );
        });
    }
    else {
        this._initFilter( rule.filter );
    }
}

/**
 * Attach a `run( str )` function to a single filter definition.
 *
 * - A filter with `pattern` has the pattern compiled to a global RegExp
 *   (in place) and gets a `run` that replaces every match with
 *   `filter.replacement`.
 * - A filter with `module` gets the `run` export of that module.
 *
 * @param {Object} filter  filter definition, modified in place
 * @throws {Error} when the filter has neither `pattern` nor `module`
 */
Solr.prototype._initFilter = function( filter ) {
    if( filter.pattern != undefined ) {
        filter.pattern = new RegExp( filter.pattern, 'g' );
        // Invoked as filter.run( str ), so `this` is the filter itself.
        filter.run = function( str ) {
            return str.replace( this.pattern, this.replacement );
        }
    }
    else if( filter.module != undefined ) {
        filter.run = require( filter.module ).run;
    }
    else {
        // JavaScript has no `Exception` constructor; using it raised a
        // ReferenceError that hid the real message.
        throw new Error( "Incorrect filter configuration" );
    }
}

/**
 * Expose the extraction rules this job was configured with.
 *
 * @returns {Array|undefined} the `rules` entry of the stored options, as-is
 */
Solr.prototype.getRules = function() {
    var configured = this.options;

    return configured.rules;
}
Expand Down Expand Up @@ -72,35 +93,64 @@ Solr.prototype.applyFilter = function( str, rule ) {
}

if( rule.filter[ 0 ] == undefined ) {
return str.replace( rule.filter.pattern, rule.filter.replacement );
return rule.filter.run( str );
//return str.replace( rule.filter.pattern, rule.filter.replacement );
}

for( i in rule.filter ) {
str = str.replace( rule.filter[ i ].pattern, rule.filter[ i ].replacement );
str = rule.filter[ i ].run( str );
//str = str.replace( rule.filter[ i ].pattern, rule.filter[ i ].replacement );
}

return str;
}

/**
 * Run the Solr job against one parsed page.
 *
 * NOTE(review): this span comes from a diff view, so lines from before and
 * after the change appear side by side (for example the two `typeof( $ )`
 * guards and the two `doc` declarations). It is not valid JavaScript as
 * listed; the removed lines must be dropped before use.
 *
 * As far as the visible lines show:
 *  - callback() is invoked without arguments when `$` is not a function, or
 *    when options.urlPattern is set and env.task.href does not match it;
 *  - when options.each is set, one task per element matched by that selector
 *    is pushed to `docs`, each calling parseAndInsert with a fresh empty doc,
 *    and the tasks are handed to async.parallel together with callback;
 *  - otherwise a single doc whose id is env.task.href is passed to
 *    parseAndInsert.
 */
Solr.prototype.execute = function(callback, $, env) {
if( typeof( $ ) != 'function' ) {
if( typeof( $ ) != 'function' ||
( this.options.urlPattern && ! env.task.href.match( this.options.urlPattern ) ) ) {
return callback();
}

if( this.options.urlPattern && ! env.task.href.match( this.options.urlPattern ) ) {
return callback();
}
// Kept so the per-element tasks below can reach the job instance.
var self = this;

if( this.options.each != undefined ) {
/**
* Create multiple Solr documents from one HTML page
*/
var docs = [];
$( this.options.each ).each(function(i, el) {
docs.push(function(callback){
var doc = {};
self.parseAndInsert( $, doc, el, callback );
});
});

var self = this,
saveDoc = false,
doc = {
async.parallel( docs, callback );
}
else {
// Single-document mode: the page URL is used as the document id.
var doc = {
id: env.task.href
};

this.parseAndInsert( $, doc, false, callback );
}
}

Solr.prototype.parseAndInsert = function( $, doc, el, callback ) {
var self = this,
saveDoc = false;

this.eachRule( function( rule ) {
$( rule.selector ).each( function() {
if( el ) {
var obj = $(el).find( rule.selector );
}
else {
var obj = $( rule.selector );
}

obj.each( function() {
if( rule.attribute == false ) {
var content = $(this).text();
var content = $(this).html();
}
else {
var content = $(this).attr( rule.attribute );
Expand All @@ -122,12 +172,14 @@ Solr.prototype.execute = function(callback, $, env) {
});
});

/**
* Save only if there is at least one positive match
*/
if( saveDoc ) {
this.save( doc, callback );
console.log( doc );
//this.save( doc, callback );
}
else {
callback();
}

}

0 comments on commit 17c0d45

Please sign in to comment.