Skip to content

Commit

Permalink
Improved URL normalising
Browse files Browse the repository at this point in the history
  • Loading branch information
Lukasz Kujawa committed Dec 30, 2013
1 parent a08a3bb commit 7ac195f
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 14 deletions.
4 changes: 3 additions & 1 deletion test/helper.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ module.exports.getEnv = function( url ) {

return {
task: Url.parse( url ),
agent: {},
agent: {
log: function(){}
},
res: {
headers: {
"content-type": "text/html"
Expand Down
14 changes: 1 addition & 13 deletions test/webcrawler/job/driller.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ var Driller = require( '../../../webcrawler/job/driller' );
var cheerio = require('cheerio');

function drillerExecute( testFile, domainRestriction, env, callback ) {
var driller = new Driller({ domainRestriction: domainRestriction });
var driller = new Driller({ domainRestriction: domainRestriction, storeSource: false, verbose: false });
var docs = [];

driller.getDocInsertFunction = function( doc ) {
Expand Down Expand Up @@ -136,18 +136,6 @@ describe('Driller', function(){

});

it('should attach url source to all documents', function(done){
VisittedUrls.setLinks( [] );
var env = helper.getEnv( 'http://www.example.com/view/1' );

env.task.data = { source: [ 'http://www.example.com/1', 'http://www.example.com/2'] };
drillerExecute( '/html/site01.html', 'example.com', env, function(docs){
assert.equal( docs.length, 6 );
done();
});

});

});

});
38 changes: 38 additions & 0 deletions test/webcrawler/utils/urltool.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
var assert = require( "assert" );
var UrlTool = require('../../../webcrawler/utils/urltool');
var helper = require( "../../helper" );

/**
 * Tests for UrlTool.nomalise: parent-directory ("../") segments in a URL path
 * must be collapsed, and surplus "../" segments must not climb above the host.
 */
describe('UrlDoc', function(){

    describe('#UrlDoc', function(){

        it('should handle ../ urls', function(){
            // Crawl environment whose current task is http://www.example.com/foo.
            var env = helper.getEnv( 'http://www.example.com/foo' );

            // Each case pairs a raw URL with the normalised form expected back.
            var urls = [

                { input: 'http://www.example.com/foo/../bar',
                  output: 'http://www.example.com/bar' },

                { input: 'http://www.example.com/foo/../../../bar',
                  output: 'http://www.example.com/bar' },

                { input: 'http://www.example.com/../',
                  output: 'http://www.example.com/' },

                { input: 'http://www.example.com/foo/../../../bar/../',
                  output: 'http://www.example.com/' }
            ];

            // forEach keeps every variable local (no implicit globals) and,
            // unlike for...in, never visits inherited enumerable properties.
            urls.forEach(function( testCase ) {
                var normalised = UrlTool.nomalise( testCase.input, env );
                assert.equal( normalised, testCase.output );
            });
        });

    });

});
23 changes: 23 additions & 0 deletions webcrawler/utils/urltool.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,29 @@ UrlTool.nomalise = function( url, env, plugins ) {
url = env.task.protocol + url;
}

/**
* handle go back "/../"
*/
if( url.match(/\/\.\.\//) ) {
var tmp = url.split('/');
for( var x = 3 ; x < tmp.length ; x++ ) {
if( tmp[x] == '..' ) {
delete tmp[x];
}
else if( tmp[ x + 1 ] != undefined && tmp[ x + 1 ] == '..' ) {
delete tmp[x];
delete tmp[x+1];
x += 1;
}
}
url = tmp.join('/');
/*
while( url.match(/\/\.\.\//) ) {
url = url.replace( /[^\/]+\/\.\.\//, '' );
}
*/
}

for( i in plugins ) {
var plugin = plugins[ i ];
if( plugin.replacement != undefined ) {
Expand Down

0 comments on commit 7ac195f

Please sign in to comment.