Skip to content

Commit

Permalink
fix some bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
Lukasz Kujawa committed Dec 30, 2013
1 parent 7ac195f commit 7e59e0d
Show file tree
Hide file tree
Showing 7 changed files with 17 additions and 40 deletions.
10 changes: 6 additions & 4 deletions conf.example.json
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
{
"workers": 5,
"seedUrl": "http://stackoverflow.com/",
"workers": 2,
"seedUrl": "http://127.0.0.1:5984/_utils/docs/",
"database": {
"name": "web-crawler-so",
"name": "web-crawler-example2",
"rebuild": true
},
"jobs": [
{ "name": "logger" },
{ "name": "saver" },
{ "name": "driller", "domainRestriction": "stackoverflow.com" },
{ "name": "driller",
"domainRestriction": "127.0.0.1"
},
{ "name": "scheduler" }
]
}
21 changes: 1 addition & 20 deletions crawler.js
Original file line number Diff line number Diff line change
@@ -1,34 +1,15 @@
/*
var heapdump = require('heapdump')
var http = require('http');
http.createServer(function (req, res) {
heapdump.writeSnapshot();
res.writeHead(200, {'Content-Type': 'text/plain'});
res.end("Dumped!");
}).listen(9999);
*/

var argv = require('optimist').argv;
var fs = require('fs');

var agent = require('./webcrawler/agent');
var Config = require( './webcrawler/config' );

/*
setInterval(function () {
if (typeof gc === 'function') {
gc();
}
console.log( process.memoryUsage());
}, 60000);
*/

console.log( "" );

if( argv._[0] == undefined ) {
console.log( "Usage: node crawler.js [config.json]" );
console.log( "" );
console.log( "Example: node crawler.js [conf.example.json]" );
console.log( "Example: node crawler.js conf.example.json" );
console.log( "" );
process.exit(1);
}
Expand Down
3 changes: 1 addition & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
"version": "0.0.1",
"private": true,
"scripts": {
"start": "node monitor.js"
"start": "node crawler.js conf.example.json"
},
"dependencies": {
"express": "3.4.2",
"jade": "*",
"cheerio": "*",
"nano": "*",
Expand Down
1 change: 0 additions & 1 deletion webcrawler/agent.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ var cheerio = require('cheerio');

agent.initFromConfig = function( config ) {
var async = require( 'async' );
var job = require('./job');

agent.init({ workers: config.getWorkers() });

Expand Down
9 changes: 7 additions & 2 deletions webcrawler/job/driller.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ function Driller(options) {
filters: [],
patterns: [],
verbose: false,
maxDepth: false
maxDepth: false,
storeSource: true
};

this.initOptions( options );
Expand Down Expand Up @@ -73,6 +74,10 @@ Driller.prototype.execute = function(callback, $, env) {
}

Driller.prototype.addSourceToWebDoc = function( url, urls, source ) {
if( ! this.options.storeSource ) {
return;
}

if( urls[ url ] != undefined ) {
return;
}
Expand Down Expand Up @@ -124,7 +129,7 @@ Driller.prototype.setNormalasiers = function( normalisers ) {

/**
 * Restricts the driller to URLs on the given domain or any of its subdomains.
 *
 * Builds a case-insensitive RegExp and stores it in `this.options.domain`.
 * The pattern matches http/https URLs whose host is exactly `domain`, or ends
 * with `.` + `domain`, and is followed by `/`, `:` (a port), or end of string.
 *
 * @param {string} domain - Literal host name, e.g. "stackoverflow.com" or "127.0.0.1".
 */
Driller.prototype.setDomainRestriction = function( domain ) {
    // Escape EVERY dot so it matches a literal "." in the host. The replacement
    // must be written '\\.' — a bare '\.' in a string literal is just '.', which
    // left the dots acting as regex wildcards (e.g. "stackoverflowxcom" matched).
    var escapedDomain = domain.replace( /\./g, '\\.' );

    // Optional "<subdomain>." prefix, then the domain, then a path, port or end.
    var pattern = '^http[s]{0,1}:\/\/(([^\/]+?\\.)|())' + escapedDomain + '(\/|:|$)';

    this.options[ "domain" ] = new RegExp( pattern, 'i' );
};

Expand Down
11 changes: 0 additions & 11 deletions webcrawler/job/index.js

This file was deleted.

2 changes: 2 additions & 0 deletions webcrawler/storage/couchdb.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ couchdb.init = function(options) {

this.nano = require('nano')( this.options.host );
this.db = this.nano.use( this.options.dbname );

couchdb._nano = this.nano;
}

couchdb.getDB = function() {
Expand Down

0 comments on commit 7e59e0d

Please sign in to comment.