From 17c0d450204823ef04a30191d36060d3138893aa Mon Sep 17 00:00:00 2001 From: Lukasz Date: Wed, 13 Nov 2013 22:37:45 +0000 Subject: [PATCH] more flexible configuration --- test/webcrawler/job/solr.js | 41 ++++++++++++++++ webcrawler/agent.js | 61 ++++++++++++++++++----- webcrawler/job/driller.js | 2 + webcrawler/job/solr.js | 98 ++++++++++++++++++++++++++++--------- 4 files changed, 168 insertions(+), 34 deletions(-) create mode 100644 test/webcrawler/job/solr.js diff --git a/test/webcrawler/job/solr.js b/test/webcrawler/job/solr.js new file mode 100644 index 0000000..9b83e28 --- /dev/null +++ b/test/webcrawler/job/solr.js @@ -0,0 +1,41 @@ +var assert = require( "assert" ) +var helper = require( "../../helper" ); +var fs = require('fs'); +var Solr = require( '../../../webcrawler/job/solr' ); +var cheerio = require('cheerio'); + +function solrExecute( testFile, options, env, callback ) { + var solr = new Solr(options); + + fs.readFile( __dirname + testFile , 'utf8', function (err, html) { + if( err ) { throw new Exception( err ); } + + solr.save = callback; + + solr.execute( function(){ + callback( docs ); + }, cheerio.load( html ), env ); + + }); + +} + +describe('Solr', function(){ + + describe('#getRules', function(){ + + var solr = new Solr({ + rules: [ + { "selector": "div a", + "attribute": false, + "field": "title" } + ] + }); + + assert.equal( solr.getRules().length, 1 ); + + + + }); + +}); \ No newline at end of file diff --git a/webcrawler/agent.js b/webcrawler/agent.js index 4b01f27..a9e3882 100755 --- a/webcrawler/agent.js +++ b/webcrawler/agent.js @@ -44,24 +44,45 @@ agent._run = function( config ) { function (err, result) { var workersCount = config.getWorkers(); var seedUrl = config.getSeedUrl(); + if( ! seedUrl ) { var scheduler = new Scheduler(); var env = { agent: self }; } - for( var i = 0 ; i < workersCount ; i++ ) { - setTimeout(function () { - if( seedUrl ) { - agent.queue( seedUrl ); - } - else { - scheduler.execute( false, false, env ); - } - }, 500 * i ); - } + + if( typeof( seedUrl ) == 'object' ) { + self._runFromArray( seedUrl, 500 ); + } + else { + self._runFromString( seedUrl, workersCount, 500 ); + } } ); } +agent._runFromArray = function(seedUrl, delay) { + for( i in seedUrl ) { + (function( i ) { + setTimeout(function () { + agent.queue( seedUrl[ i ] ); + }, delay * i ); + })( i ); + } +} + +agent._runFromString = function(seedUrl, workersCount, delay ) { + for( var i = 0 ; i < workersCount ; i++ ) { + setTimeout(function () { + if( seedUrl ) { + agent.queue( seedUrl ); + } + else { + scheduler.execute( false, false, env ); + } + }, delay * i ); + } +} + agent.init = function( options ) { var self = this; this.options = @@ -128,7 +149,21 @@ agent.followRedirect = function( res, task, callback ) { source.push( task.href ); task = Url.parse( UrlTool.nomalise( res.headers['location'], env ) ); task.source = source; - this.worker( task, callback ); + + if( task.redirectTTL == undefined ) { + task.redirectTTL = 30; + } + + task.redirectTTL -= 1; + + if( task.redirectTTL == 0 ) { + agent.onError({message: "Broken redirection loop"}, task, callback ); + } + else { + process.nextTick(function() { + agent.worker( task, callback ); + }); + } } catch( e ) { callback(); @@ -184,6 +219,10 @@ agent.handleData = function( data, task, res, callback ) { } async.series( chain, callback ); + + res = null; + chain = null; + data = null; } agent.getJobFunction = function( job, data, env ) { diff --git a/webcrawler/job/driller.js b/webcrawler/job/driller.js index d1c51fb..1ceec18 100755 --- a/webcrawler/job/driller.js +++ b/webcrawler/job/driller.js @@ -59,6 +59,8 @@ Driller.prototype.execute = function(callback, $, env) { }); async.parallel( docs, callback ); + + docs = null; } Driller.prototype.addOverwriteRule = function( rule ) { diff --git a/webcrawler/job/solr.js b/webcrawler/job/solr.js index 203064c..ad27d76 100644 --- a/webcrawler/job/solr.js +++ b/webcrawler/job/solr.js @@ -1,23 +1,17 @@ var cheerio = require('cheerio'); var solr = require('solr-client'); +var async = require( 'async' ); exports = module.exports = Solr; function Solr(options) { for( i in options.rules ) { if( options.rules[ i ].filter != undefined ) { - if( options.rules[ i ].filter[0] != undefined ) { - for( y in options.rules[ i ].filter ) { - options.rules[ i ].filter[ y ].pattern = new RegExp( options.rules[ i ].filter[ y ].pattern, 'g' ); - } - } - else { - options.rules[ i ].filter.pattern = new RegExp( options.rules[ i ].filter.pattern, 'g' ); - } + this.initFilters( options.rules[ i ] ); } } - if( options.urlPattern != undefined ) { + if( options.urlPattern != undefined && options.urlPattern != false ) { options.urlPattern = new RegExp( options.urlPattern ); } else { @@ -27,6 +21,33 @@ function Solr(options) { this.options = options; } +Solr.prototype.initFilters = function( rule ) { + if( rule.filter[0] != undefined ) { + for( y in rule.filter ) { + this._initFilter( rule.filter[ y ] ); + } + } + else { + this._initFilter( rule.filter ); + } +} + +Solr.prototype._initFilter = function( filter ) { + if( filter.pattern != undefined ) { + filter.pattern = new RegExp( filter.pattern, 'g' ); + filter.run = function( str ) { + return str.replace( this.pattern, this.replacement ); + } + } + else if( filter.module != undefined ) { + console.log( filter.module ); + filter.run = require( filter.module ).run; + } + else { + throw new Exception( "Incorrect filter configuration" ); + } +} + Solr.prototype.getRules = function() { return this.options.rules; } @@ -72,35 +93,64 @@ Solr.prototype.applyFilter = function( str, rule ) { } if( rule.filter[ 0 ] == undefined ) { - return str.replace( rule.filter.pattern, rule.filter.replacement ); + return rule.filter.run( str ); + //return str.replace( rule.filter.pattern, rule.filter.replacement ); } for( i in rule.filter ) { - str = str.replace( rule.filter[ i ].pattern, rule.filter[ i ].replacement ); + str = rule.filter[ i ].run( str ); + //str = str.replace( rule.filter[ i ].pattern, rule.filter[ i ].replacement ); } return str; } Solr.prototype.execute = function(callback, $, env) { - if( typeof( $ ) != 'function' ) { + if( typeof( $ ) != 'function' || + ( this.options.urlPattern && ! env.task.href.match( this.options.urlPattern ) ) ) { return callback(); } - if( this.options.urlPattern && ! env.task.href.match( this.options.urlPattern ) ) { - return callback(); - } + var self = this; + + if( this.options.each != undefined ) { + /** + * Create multiple Solr document from one HTML page + */ + var docs = []; + $( this.options.each ).each(function(i, el) { + docs.push(function(callback){ + var doc = {}; + self.parseAndInsert( $, doc, el, callback ); + }); + }); - var self = this, - saveDoc = false, - doc = { + async.parallel( docs, callback ); + } + else { + var doc = { id: env.task.href }; + this.parseAndInsert( $, doc, false, callback ); + } +} + +Solr.prototype.parseAndInsert = function( $, doc, el, callback ) { + var self = this, + saveDoc = false; + this.eachRule( function( rule ) { - $( rule.selector ).each( function() { + if( el ) { + var obj = $(el).find( rule.selector ); + } + else { + var obj = $( rule.selector ); + } + + obj.each( function() { if( rule.attribute == false ) { - var content = $(this).text(); + var content = $(this).html(); } else { var content = $(this).attr( rule.attribute ); @@ -122,12 +172,14 @@ Solr.prototype.execute = function(callback, $, env) { }); }); + /** + * Save only if there is at least one positive match + */ if( saveDoc ) { - this.save( doc, callback ); + console.log( doc ); + //this.save( doc, callback ); } else { callback(); } - } -