Merged
19 changes: 11 additions & 8 deletions README.md
@@ -8,8 +8,8 @@ Via NPM: `npm install robots-txt-parser --save`.

After installing robots-txt-parser it needs to be required and initialised:
```js
-var robotsParser = require('robots-txt-parser');
-var robots = robotsParser(
+const robotsParser = require('robots-txt-parser');
+const robots = robotsParser(
{
userAgent: 'Googlebot', // The default user agent to use when looking for allow/disallow rules, if this agent isn't listed in the active robots.txt, we use *.
allowOnNeutral: false // The value to use when the robots.txt rules for allow and disallow are balanced on whether a link can be crawled.
@@ -20,9 +20,9 @@ Example Usage:


```js
-var robotsParser = require('robots-txt-parser');
+const robotsParser = require('robots-txt-parser');

-var robots = robotsParser(
+const robots = robotsParser(
{
userAgent: 'Googlebot', // The default user agent to use when looking for allow/disallow rules, if this agent isn't listed in the active robots.txt, we use *.
allowOnNeutral: false // The value to use when the robots.txt rules for allow and disallow are balanced on whether a link can be crawled.
@@ -31,11 +31,14 @@ var robots = robotsParser(
robots.useRobotsFor('http://Example.com')
.then(() => {
robots.canCrawlSync('http://example.com/news'); // Returns true if the link can be crawled, false if not.
-    robots.canCrawl('http://example.com/news', (value) => { console.log('Crawlable: ', value); }) // Calls the callback with true if the link is crawlable, false if not.
+    robots.canCrawl('http://example.com/news', (value) => {
+      console.log('Crawlable: ', value);
+    }); // Calls the callback with true if the link is crawlable, false if not.
robots.canCrawl('http://example.com/news') // If no callback is provided, returns a promise which resolves with true if the link is crawlable, false if not.
-    .then((value) => {
-      console.log('Crawlable: ', value);
-    });
+      .then((value) => {
+        console.log('Crawlable: ', value);
+      });
});
```
## Docs
### parseRobots(key, string)
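For readers following along, the same flow reads a little more cleanly with async/await. A minimal sketch relying only on the calls shown in the README above (`useRobotsFor` and `canCrawl` both return promises, as the examples indicate):

```js
// Minimal sketch using only the API shown above; assumes useRobotsFor
// and canCrawl return promises, as the README examples indicate.
const robotsParser = require('robots-txt-parser');

const robots = robotsParser({
  userAgent: 'Googlebot', // Default agent; falls back to * when absent.
  allowOnNeutral: false,  // Disallow when allow/disallow rules tie.
});

async function main() {
  await robots.useRobotsFor('http://example.com'); // Fetch and cache robots.txt.
  const crawlable = await robots.canCrawl('http://example.com/news');
  console.log('Crawlable: ', crawlable);
}

main();
```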
18 changes: 11 additions & 7 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "robots-txt-parser",
"version": "1.0.2",
"version": "1.0.3",
"description": "A lightweight robots.txt parser for Node.js with support for wildcards, caching and promises.",
"keywords": [
"robots",
@@ -41,10 +41,14 @@
"simple-get": "^2.4.0"
},
"devDependencies": {
"chai": "^3.5.0",
"istanbul": "^0.4.5",
"lodash": "^4.17.2",
"mocha": "^3.2.0",
"nyc": "^10.3.0"
}
"chai": "^4.3.4",
"lodash": "^4.17.21",
"mocha": "^8.3.2",
"nyc": "^15.1.0"
},
"files": [
"LICENSE",
"README.md",
"src/*.js"
]
}
36 changes: 26 additions & 10 deletions src/parser.js
@@ -10,19 +10,26 @@ const HOST = 'host';
const comments = /#.*$/gm;
const whitespace = ' ';
const lineEndings = /[\r\n]+/g;
+const recordSlices = /(\w+-)?\w+:\s\S*/g;

-function cleanString(rawString) {
+function cleanComments(rawString) {
// Replace comments and whitespace
return rawString
-    .replace(comments, '')
-    .replace(whitespace, '')
-    .trim();
+    .replace(comments, '');
}

+function cleanSpaces(rawString) {
+  return rawString.replace(whitespace, '').trim();
+}

function splitOnLines(string) {
return string.split(lineEndings);
}

+function robustSplit(string) {
+  return [...string.match(recordSlices)].map(cleanSpaces);
+}

function parseRecord(line) {
// Find first colon and assume is the field delimiter.
const firstColonI = line.indexOf(':');
@@ -55,25 +62,34 @@ function groupMemberRecord(value) {
};
}


function parser(rawString) {
-  const lines = splitOnLines(cleanString(rawString));
+  let lines = splitOnLines(cleanSpaces(cleanComments(rawString)));

+  // Fallback to the record-based split method if we find only one line.
+  if (lines.length === 1) {
+    lines = robustSplit(cleanComments(rawString));
+  }

const robotsObj = {
sitemaps: [],
};
let agent = '';

lines.forEach((line) => {
const record = parseRecord(line);
switch (record.field) {
case USER_AGENT:
-      // Bot names are non-case sensitive.
-      agent = record.value = record.value.toLowerCase();
-      if (agent.length > 0) {
+      const recordValue = record.value.toLowerCase();
+      if (recordValue !== agent && recordValue.length > 0) {
+        // Bot names are case-insensitive.
+        agent = recordValue;
robotsObj[agent] = {
allow: [],
disallow: [],
crawlDelay: 0,
};
+      } else if (recordValue.length === 0) { // Malformed user-agent, ignore its rules.
+        agent = '';
+      }
break;
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#order-of-precedence-for-group-member-records
@@ -93,7 +109,6 @@ function parser(rawString) {
robotsObj.sitemaps.push(record.value);
}
break;
-      // @TODO test crawl delay parameter.
case CRAWL_DELAY:
if (agent.length > 0) {
robotsObj[agent].crawlDelay = Number.parseInt(record.value, 10);
@@ -109,6 +124,7 @@
break;
}
});

// Return only unique sitemaps.
robotsObj.sitemaps = robotsObj.sitemaps.filter((val, i, s) => s.indexOf(val) === i);
return robotsObj;
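The most interesting change here is the single-line fallback: some servers return robots.txt bodies without usable line endings, so splitting on `[\r\n]+` yields one giant line, and `robustSplit` re-tokenises it by matching whole `field: value` records instead. A rough illustration of what the new `recordSlices` regex extracts (note that `String.prototype.match` returns `null` on no match, so a defensive caller would guard before spreading):

```js
// Illustration of the record-based fallback using the recordSlices
// regex added above, on input whose newlines were lost in transit.
const recordSlices = /(\w+-)?\w+:\s\S*/g;

const flattened = 'User-agent: * Disallow: /search Crawl-delay: 10';

// match() returns null when nothing matches, so guard before spreading.
const records = flattened.match(recordSlices) || [];
console.log(records);
// -> [ 'User-agent: *', 'Disallow: /search', 'Crawl-delay: 10' ]
```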
2 changes: 1 addition & 1 deletion src/robots.js
@@ -11,7 +11,7 @@ const DFLT_OPTS = {
function Robots(opts = {}) {
this.robotsCache = {};
this.opts = {
-    userAgent: opts.userAgent ? opts.userAgent.toLowerCase() : DFLT_OPTS.userAgent,
+    userAgent: opts.userAgent ? opts.userAgent.toLowerCase() : DFLT_OPTS.userAgent.toLowerCase(),
allowOnNeutral: opts.allowOnNeutral ? opts.allowOnNeutral : DFLT_OPTS.allowOnNeutral,
};

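This one-line fix matters because the parser keys parsed rules by lowercased agent names; if the built-in default were mixed-case, lookups using the default agent would always miss. A tiny sketch of the failure mode (the object literal is hypothetical; `DFLT_OPTS` itself is defined above the hunk shown):

```js
// Hypothetical parsed output: the parser stores agent names lowercased.
const robotsObj = { googlebot: { allow: [], disallow: ['/private'], crawlDelay: 0 } };

console.log(robotsObj['Googlebot']);               // undefined - mixed-case key misses
console.log(robotsObj['Googlebot'.toLowerCase()]); // the intended rule set
```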
69 changes: 69 additions & 0 deletions test/parser/can-parse-test-files.js
@@ -0,0 +1,69 @@
const chai = require('chai');
const exampleRobotsBBC = require('../test-data/example-robots-txt-bbc.js');
const exampleRobotsBcc = require('../test-data/example-robots-txt-bcc.js');
const exampleRobotsKarwei = require('../test-data/example-robots-txt-karwei.js');
const exampleRobotsShort = require('../test-data/example-robots-txt-short.js');
const exampleRobotsZalando = require('../test-data/example-robots-txt-zalando.js');
const parse = require('../../src/parser.js');

const expect = chai.expect;

describe('can-parse-test-files', () => {
it('Should completely parse robots-txt-bbc', () => {
const parseResult = parse(exampleRobotsBBC);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
expect(userAgents).to.have.lengthOf(2);
expect(parseResult).to.have.keys(['*', 'magpie-crawler', 'sitemaps']);
expect(parseResult['*'].allow).to.have.lengthOf(0);
expect(parseResult['*'].disallow).to.have.lengthOf(44);
expect(parseResult['magpie-crawler'].allow).to.have.lengthOf(0);
expect(parseResult['magpie-crawler'].disallow).to.have.lengthOf(1);
expect(parseResult.sitemaps).to.have.lengthOf(10);
});

it('Should completely parse robots-txt-bcc', () => {
const parseResult = parse(exampleRobotsBcc);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
expect(userAgents).to.have.lengthOf(1);
expect(parseResult).to.have.keys(['*', 'sitemaps']);
expect(parseResult['*'].allow).to.have.lengthOf(0);
expect(parseResult['*'].disallow).to.have.lengthOf(20);
expect(parseResult.sitemaps).to.have.lengthOf(6);
});

it('Should completely parse robots-txt-karwei', () => {
const parseResult = parse(exampleRobotsKarwei);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
expect(userAgents).to.have.lengthOf(1);
expect(parseResult).to.have.keys(['*', 'sitemaps']);
expect(parseResult['*'].allow).to.have.lengthOf(0);
expect(parseResult['*'].disallow).to.have.lengthOf(33);
expect(parseResult.sitemaps).to.have.lengthOf(1);
});

it('Should completely parse robots-txt-short', () => {
const parseResult = parse(exampleRobotsShort);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
expect(userAgents).to.have.lengthOf(3);
expect(parseResult).to.have.keys(['*', 'longbot', 'morebot', 'sitemaps', 'host']);
expect(parseResult['*'].allow).to.have.lengthOf(3);
expect(parseResult['*'].disallow).to.have.lengthOf(2);
expect(parseResult['morebot'].allow).to.have.lengthOf(2);
expect(parseResult['morebot'].disallow).to.have.lengthOf(2);
expect(parseResult['longbot'].allow).to.have.lengthOf(3);
expect(parseResult['longbot'].disallow).to.have.lengthOf(1);
expect(parseResult.sitemaps).to.have.lengthOf(5);
});

it('Should completely parse robots-txt-zalando', () => {
const parseResult = parse(exampleRobotsZalando);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
expect(userAgents).to.have.lengthOf(2);
expect(parseResult).to.have.keys(['*', 'screaming frog seo spider', 'sitemaps']);
expect(parseResult['*'].allow).to.have.lengthOf(0);
expect(parseResult['*'].disallow).to.have.lengthOf(16);
expect(parseResult['screaming frog seo spider'].allow).to.have.lengthOf(0);
expect(parseResult['screaming frog seo spider'].disallow).to.have.lengthOf(1);
expect(parseResult.sitemaps).to.have.lengthOf(0);
});
});
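Taken together, these assertions pin down the shape `parse()` is expected to return: one key per user agent, each holding `allow`/`disallow` arrays and a `crawlDelay`, plus a top-level `sitemaps` array (and `host`, when a Host record is present). A sketch of that shape with illustrative values only, not the actual fixture contents:

```js
// Shape implied by the assertions above (illustrative values only).
const parseResult = {
  sitemaps: ['https://example.com/sitemap.xml'],
  host: 'example.com',
  '*': { allow: [], disallow: ['/private'], crawlDelay: 0 },
  longbot: { allow: ['/public'], disallow: ['/'], crawlDelay: 10 },
};

// The tests isolate agent keys by filtering out the shared ones:
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
console.log(userAgents); // -> [ '*', 'longbot' ]
```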
2 changes: 1 addition & 1 deletion test/parser/ignores-malformed-values.js
@@ -4,7 +4,7 @@ const parse = require('../../src/parser.js');

const expect = chai.expect;

-describe('can-parse-user-agents', () => {
+describe('Ignores Malformed Values', () => {
const parseResult = parse(exampleRobotsMalformed);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');

1 change: 1 addition & 0 deletions test/parser/parser-tests.js
@@ -5,3 +5,4 @@ require('./can-parse-allow.js');
require('./can-parse-disallow.js');
require('./can-parse-crawl-delay.js');
require('./ignores-malformed-values.js');
+require('./can-parse-test-files.js');
65 changes: 65 additions & 0 deletions test/test-data/example-robots-txt-bbc.js
@@ -0,0 +1,65 @@
module.exports = `
# version: d167f71fcf4277403ce3b7118a1fad5d25a41310

# HTTPS www.bbc.co.uk

User-agent: *
Sitemap: https://www.bbc.co.uk/sitemap.xml
Sitemap: https://www.bbc.co.uk/sitemaps/https-index-uk-archive.xml
Sitemap: https://www.bbc.co.uk/sitemaps/https-index-uk-news.xml
Sitemap: https://www.bbc.co.uk/food/sitemap.xml
Sitemap: https://www.bbc.co.uk/bitesize/sitemap/sitemapindex.xml
Sitemap: https://www.bbc.co.uk/teach/sitemap/sitemapindex.xml
Sitemap: https://www.bbc.co.uk/sitemaps/https-index-uk-archive_video.xml
Sitemap: https://www.bbc.co.uk/sitemaps/https-index-uk-video.xml
Sitemap: https://www.bbc.co.uk/sitemaps/sitemap-uk-ws-topics.xml
Sitemap: https://www.bbc.co.uk/sport/sitemap.xml

Disallow: /cbbc/search$
Disallow: /cbbc/search/
Disallow: /cbbc/search?
Disallow: /cbeebies/search$
Disallow: /cbeebies/search/
Disallow: /cbeebies/search?
Disallow: /chwilio/
Disallow: /chwilio$
Disallow: /chwilio?
Disallow: /iplayer/bigscreen/
Disallow: /iplayer/cbbc/episodes/
Disallow: /iplayer/cbbc/search
Disallow: /iplayer/cbeebies/episodes/
Disallow: /iplayer/cbeebies/search
Disallow: /iplayer/search
Disallow: /indepthtoolkit/smallprox$
Disallow: /indepthtoolkit/smallprox/
Disallow: /modules/musicnav/language/
Disallow: /news/0
Disallow: /radio/aod/
Disallow: /radio/aod$
Disallow: /radio/imda
Disallow: /radio/player/
Disallow: /radio/player$
Disallow: /search/
Disallow: /search$
Disallow: /search?
Disallow: /sport/videos/*
Disallow: /sounds/player/
Disallow: /sounds/player$
Disallow: /ugc$
Disallow: /ugc/
Disallow: /ugcsupport$
Disallow: /ugcsupport/
Disallow: /userinfo/
Disallow: /userinfo
Disallow: /food/favourites
Disallow: /food/menus/*/shopping-list
Disallow: /food/recipes/*/shopping-list
Disallow: /food/search*?*
Disallow: /sounds/search$
Disallow: /sounds/search/
Disallow: /sounds/search?
Disallow: /ws/includes

User-agent: magpie-crawler
Disallow: /
`;
35 changes: 35 additions & 0 deletions test/test-data/example-robots-txt-bcc.js
@@ -0,0 +1,35 @@
module.exports = `
User-agent: *
Disallow: /*?fh_location*
Disallow: /search?*
Disallow: /*?price*
Disallow: /*?aspxerrorpath*
Disallow: /*?product*
Disallow: /*?tt*
Disallow: /*?category*
Disallow: /*?CID*
Disallow: /*?nietgevonden*
Disallow: /*?viewmode*
Disallow: /*?cIdee*
Disallow: /*?orderId*
Disallow: /*?token*
Disallow: /klantenservice?searchQuery=*
Disallow: /productdetail*
Disallow: /artikel=*

#Blog
User-agent: *
Disallow: /blog/wp-admin/
Disallow: /blog/tag/
Disallow: /page/
Disallow: /blog/ultimate_slider*

#Sitemap
User-Agent: *
Sitemap: https://www.bcc.nl/sitemap.xml
Sitemap: https://www.bcc.nl/cms-sitemap.xml
Sitemap: https://www.bcc.nl/video-sitemap.xml
Sitemap: https://www.bcc.nl/blog/sitemap.xml
Sitemap: https://www.bcc.nl/brands-sitemap.xml
Sitemap: https://www.bcc.nl/products-sitemap.xml
`;
1 change: 1 addition & 0 deletions test/test-data/example-robots-txt-karwei.js

Some generated files are not rendered by default.

22 changes: 22 additions & 0 deletions test/test-data/example-robots-txt-zalando.js
@@ -0,0 +1,22 @@
module.exports = `
User-agent: *
Disallow: /cart/*
Disallow: /wardrobe/*
Disallow: /myaccount/*
Disallow: /opinions*
Disallow: /api/recos/reco
Disallow: /api/catalog/logs
Disallow: /api/pdp/sizereco
Disallow: /api/shop-the-look/looks
Disallow: /api/pdp/partner
Disallow: /api/cmag
Disallow: /api/navigation/wishlist-count
Disallow: /api/navigation/cart-count
Disallow: /api/catalog/logs
Disallow: /api/catalog/seo*
Disallow: /api/t/i
Disallow: /api/rr

User-agent: Screaming Frog SEO Spider
Disallow: /
`;
12 changes: 12 additions & 0 deletions test/util/format-link.js
@@ -25,6 +25,18 @@ const links = [
}, {
string: 'mailto:someone@example.com',
result: 'http://example.com',
+  }, {
+    string: 'fakeaddress.de',
+    result: 'http://fakeaddress.de',
+  }, {
+    string: 'cheese.nl',
+    result: 'http://cheese.nl',
+  }, {
+    string: 'mailto:someone@example.co.uk',
+    result: 'http://example.co.uk',
+  }, {
+    string: 'https://cheese.nl',
+    result: 'https://cheese.nl',
+  },
},
];

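The new fixtures pin down how `formatLink` is expected to normalise input: bare domains gain an `http://` prefix, `mailto:` addresses collapse to their domain, and existing schemes are preserved. A rough sketch of logic consistent with these fixtures, not the module's actual implementation:

```js
// Sketch consistent with the fixtures above; not the real formatLink.
function formatLink(link) {
  // mailto:someone@example.co.uk -> example.co.uk
  let result = link.replace(/^mailto:[^@]+@/, '');
  // Bare domains get a default http:// scheme; https:// is preserved.
  if (!/^https?:\/\//.test(result)) {
    result = `http://${result}`;
  }
  return result;
}

console.log(formatLink('cheese.nl'));                    // http://cheese.nl
console.log(formatLink('https://cheese.nl'));            // https://cheese.nl
console.log(formatLink('mailto:someone@example.co.uk')); // http://example.co.uk
```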