Merged
19 changes: 11 additions & 8 deletions README.md
@@ -8,8 +8,8 @@ Via NPM: `npm install robots-txt-parser --save`.

After installing robots-txt-parser it needs to be required and initialised:
```js
-var robotsParser = require('robots-txt-parser');
-var robots = robotsParser(
+const robotsParser = require('robots-txt-parser');
+const robots = robotsParser(
{
userAgent: 'Googlebot', // The default user agent to use when looking for allow/disallow rules, if this agent isn't listed in the active robots.txt, we use *.
allowOnNeutral: false // The value to use when the robots.txt rules for allow and disallow are balanced on whether a link can be crawled.
@@ -20,9 +20,9 @@ Example Usage:


```js
-var robotsParser = require('robots-txt-parser');
+const robotsParser = require('robots-txt-parser');

-var robots = robotsParser(
+const robots = robotsParser(
{
userAgent: 'Googlebot', // The default user agent to use when looking for allow/disallow rules, if this agent isn't listed in the active robots.txt, we use *.
allowOnNeutral: false // The value to use when the robots.txt rules for allow and disallow are balanced on whether a link can be crawled.
@@ -31,11 +31,14 @@ var robots = robotsParser(
robots.useRobotsFor('http://Example.com')
.then(() => {
robots.canCrawlSync('http://example.com/news'); // Returns true if the link can be crawled, false if not.
-    robots.canCrawl('http://example.com/news', (value) => { console.log('Crawlable: ', value); }) // Calls the callback with true if the link is crawlable, false if not.
+    robots.canCrawl('http://example.com/news', (value) => {
+      console.log('Crawlable: ', value);
+    }); // Calls the callback with true if the link is crawlable, false if not.
robots.canCrawl('http://example.com/news') // If no callback is provided, returns a promise which resolves with true if the link is crawlable, false if not.
-    .then((value) => {
-      console.log('Crawlable: ', value);
-    });
+      .then((value) => {
+        console.log('Crawlable: ', value);
+      });
});
```
## Docs
### parseRobots(key, string)
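For readers following along, the same flow reads a little more cleanly with async/await. A minimal sketch relying only on the calls shown in the README above (`useRobotsFor` and `canCrawl` both return promises, as the examples indicate):

```js
// Minimal sketch using only the API shown above; assumes useRobotsFor
// and canCrawl return promises, as the README examples indicate.
const robotsParser = require('robots-txt-parser');

const robots = robotsParser({
  userAgent: 'Googlebot', // Default agent; falls back to * when absent.
  allowOnNeutral: false,  // Disallow when allow/disallow rules tie.
});

async function main() {
  await robots.useRobotsFor('http://example.com'); // Fetch and cache robots.txt.
  const crawlable = await robots.canCrawl('http://example.com/news');
  console.log('Crawlable: ', crawlable);
}

main();
```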
18 changes: 11 additions & 7 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "robots-txt-parser",
"version": "1.0.2",
"version": "1.0.3",
"description": "A lightweight robots.txt parser for Node.js with support for wildcards, caching and promises.",
"keywords": [
"robots",
@@ -41,10 +41,14 @@
"simple-get": "^2.4.0"
},
"devDependencies": {
"chai": "^3.5.0",
"istanbul": "^0.4.5",
"lodash": "^4.17.2",
"mocha": "^3.2.0",
"nyc": "^10.3.0"
}
"chai": "^4.3.4",
"lodash": "^4.17.21",
"mocha": "^8.3.2",
"nyc": "^15.1.0"
},
"files": [
"LICENSE",
"README.md",
"src/*.js"
]
}
36 changes: 26 additions & 10 deletions src/parser.js
@@ -10,19 +10,26 @@ const HOST = 'host';
const comments = /#.*$/gm;
const whitespace = ' ';
const lineEndings = /[\r\n]+/g;
+const recordSlices = /(\w+-)?\w+:\s\S*/g;

-function cleanString(rawString) {
+function cleanComments(rawString) {
// Replace comments and whitespace
return rawString
-    .replace(comments, '')
-    .replace(whitespace, '')
-    .trim();
+    .replace(comments, '');
}

+function cleanSpaces(rawString) {
+  return rawString.replace(whitespace, '').trim();
+}

function splitOnLines(string) {
return string.split(lineEndings);
}

+function robustSplit(string) {
+  return [...string.match(recordSlices)].map(cleanSpaces);
+}

function parseRecord(line) {
// Find first colon and assume is the field delimiter.
const firstColonI = line.indexOf(':');
@@ -55,25 +62,34 @@ function groupMemberRecord(value) {
};
}


function parser(rawString) {
-  const lines = splitOnLines(cleanString(rawString));
+  let lines = splitOnLines(cleanSpaces(cleanComments(rawString)));

+  // Fallback to the record-based split method if we find only one line.
+  if (lines.length === 1) {
+    lines = robustSplit(cleanComments(rawString));
+  }

const robotsObj = {
sitemaps: [],
};
let agent = '';

lines.forEach((line) => {
const record = parseRecord(line);
switch (record.field) {
case USER_AGENT:
-      // Bot names are non-case sensitive.
-      agent = record.value = record.value.toLowerCase();
-      if (agent.length > 0) {
+      const recordValue = record.value.toLowerCase();
+      if (recordValue !== agent && recordValue.length > 0) {
+        // Bot names are case-insensitive.
+        agent = recordValue;
robotsObj[agent] = {
allow: [],
disallow: [],
crawlDelay: 0,
};
+      } else if (recordValue.length === 0) { // Malformed user-agent, ignore its rules.
+        agent = '';
+      }
break;
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#order-of-precedence-for-group-member-records
@@ -93,7 +109,6 @@ function parser(rawString) {
robotsObj.sitemaps.push(record.value);
}
break;
-      // @TODO test crawl delay parameter.
case CRAWL_DELAY:
if (agent.length > 0) {
robotsObj[agent].crawlDelay = Number.parseInt(record.value, 10);
@@ -109,6 +124,7 @@
break;
}
});

// Return only unique sitemaps.
robotsObj.sitemaps = robotsObj.sitemaps.filter((val, i, s) => s.indexOf(val) === i);
return robotsObj;
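The most interesting change here is the single-line fallback: some servers return robots.txt bodies without usable line endings, so splitting on `[\r\n]+` yields one giant line, and `robustSplit` re-tokenises it by matching whole `field: value` records instead. A rough illustration of what the new `recordSlices` regex extracts (note that `String.prototype.match` returns `null` on no match, so a defensive caller would guard before spreading):

```js
// Illustration of the record-based fallback using the recordSlices
// regex added above, on input whose newlines were lost in transit.
const recordSlices = /(\w+-)?\w+:\s\S*/g;

const flattened = 'User-agent: * Disallow: /search Crawl-delay: 10';

// match() returns null when nothing matches, so guard before spreading.
const records = flattened.match(recordSlices) || [];
console.log(records);
// -> [ 'User-agent: *', 'Disallow: /search', 'Crawl-delay: 10' ]
```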
2 changes: 1 addition & 1 deletion src/robots.js
@@ -11,7 +11,7 @@ const DFLT_OPTS = {
function Robots(opts = {}) {
this.robotsCache = {};
this.opts = {
-    userAgent: opts.userAgent ? opts.userAgent.toLowerCase() : DFLT_OPTS.userAgent,
+    userAgent: opts.userAgent ? opts.userAgent.toLowerCase() : DFLT_OPTS.userAgent.toLowerCase(),
allowOnNeutral: opts.allowOnNeutral ? opts.allowOnNeutral : DFLT_OPTS.allowOnNeutral,
};

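This one-line fix matters because the parser keys parsed rules by lowercased agent names; if the built-in default were mixed-case, lookups using the default agent would always miss. A tiny sketch of the failure mode (the object literal is hypothetical; `DFLT_OPTS` itself is defined above the hunk shown):

```js
// Hypothetical parsed output: the parser stores agent names lowercased.
const robotsObj = { googlebot: { allow: [], disallow: ['/private'], crawlDelay: 0 } };

console.log(robotsObj['Googlebot']);               // undefined - mixed-case key misses
console.log(robotsObj['Googlebot'.toLowerCase()]); // the intended rule set
```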
69 changes: 69 additions & 0 deletions test/parser/can-parse-test-files.js
@@ -0,0 +1,69 @@
const chai = require('chai');
const exampleRobotsBBC = require('../test-data/example-robots-txt-bbc.js');
const exampleRobotsBcc = require('../test-data/example-robots-txt-bcc.js');
const exampleRobotsKarwei = require('../test-data/example-robots-txt-karwei.js');
const exampleRobotsShort = require('../test-data/example-robots-txt-short.js');
const exampleRobotsZalando = require('../test-data/example-robots-txt-zalando.js');
const parse = require('../../src/parser.js');

const expect = chai.expect;

describe('can-parse-test-files', () => {
it('Should completely parse robots-txt-bbc', () => {
const parseResult = parse(exampleRobotsBBC);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
expect(userAgents).to.have.lengthOf(2);
expect(parseResult).to.have.keys(['*', 'magpie-crawler', 'sitemaps']);
expect(parseResult['*'].allow).to.have.lengthOf(0);
expect(parseResult['*'].disallow).to.have.lengthOf(44);
expect(parseResult['magpie-crawler'].allow).to.have.lengthOf(0);
expect(parseResult['magpie-crawler'].disallow).to.have.lengthOf(1);
expect(parseResult.sitemaps).to.have.lengthOf(10);
});

it('Should completely parse robots-txt-bcc', () => {
const parseResult = parse(exampleRobotsBcc);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
expect(userAgents).to.have.lengthOf(1);
expect(parseResult).to.have.keys(['*', 'sitemaps']);
expect(parseResult['*'].allow).to.have.lengthOf(0);
expect(parseResult['*'].disallow).to.have.lengthOf(20);
expect(parseResult.sitemaps).to.have.lengthOf(6);
});

it('Should completely parse robots-txt-karwei', () => {
const parseResult = parse(exampleRobotsKarwei);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
expect(userAgents).to.have.lengthOf(1);
expect(parseResult).to.have.keys(['*', 'sitemaps']);
expect(parseResult['*'].allow).to.have.lengthOf(0);
expect(parseResult['*'].disallow).to.have.lengthOf(33);
expect(parseResult.sitemaps).to.have.lengthOf(1);
});

it('Should completely parse robots-txt-short', () => {
const parseResult = parse(exampleRobotsShort);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
expect(userAgents).to.have.lengthOf(3);
expect(parseResult).to.have.keys(['*', 'longbot', 'morebot', 'sitemaps', 'host']);
expect(parseResult['*'].allow).to.have.lengthOf(3);
expect(parseResult['*'].disallow).to.have.lengthOf(2);
expect(parseResult['morebot'].allow).to.have.lengthOf(2);
expect(parseResult['morebot'].disallow).to.have.lengthOf(2);
expect(parseResult['longbot'].allow).to.have.lengthOf(3);
expect(parseResult['longbot'].disallow).to.have.lengthOf(1);
expect(parseResult.sitemaps).to.have.lengthOf(5);
});

it('Should completely parse robots-txt-zalando', () => {
const parseResult = parse(exampleRobotsZalando);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
expect(userAgents).to.have.lengthOf(2);
expect(parseResult).to.have.keys(['*', 'screaming frog seo spider', 'sitemaps']);
expect(parseResult['*'].allow).to.have.lengthOf(0);
expect(parseResult['*'].disallow).to.have.lengthOf(16);
expect(parseResult['screaming frog seo spider'].allow).to.have.lengthOf(0);
expect(parseResult['screaming frog seo spider'].disallow).to.have.lengthOf(1);
expect(parseResult.sitemaps).to.have.lengthOf(0);
});
});
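Taken together, these assertions pin down the shape `parse()` is expected to return: one key per user agent, each holding `allow`/`disallow` arrays and a `crawlDelay`, plus a top-level `sitemaps` array (and `host`, when a Host record is present). A sketch of that shape with illustrative values only, not the actual fixture contents:

```js
// Shape implied by the assertions above (illustrative values only).
const parseResult = {
  sitemaps: ['https://example.com/sitemap.xml'],
  host: 'example.com',
  '*': { allow: [], disallow: ['/private'], crawlDelay: 0 },
  longbot: { allow: ['/public'], disallow: ['/'], crawlDelay: 10 },
};

// The tests isolate agent keys by filtering out the shared ones:
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');
console.log(userAgents); // -> [ '*', 'longbot' ]
```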
2 changes: 1 addition & 1 deletion test/parser/ignores-malformed-values.js
@@ -4,7 +4,7 @@ const parse = require('../../src/parser.js');

const expect = chai.expect;

-describe('can-parse-user-agents', () => {
+describe('Ignores Malformed Values', () => {
const parseResult = parse(exampleRobotsMalformed);
const userAgents = Object.keys(parseResult).filter(val => val !== 'sitemaps' && val !== 'host');

1 change: 1 addition & 0 deletions test/parser/parser-tests.js
@@ -5,3 +5,4 @@ require('./can-parse-allow.js');
require('./can-parse-disallow.js');
require('./can-parse-crawl-delay.js');
require('./ignores-malformed-values.js');
+require('./can-parse-test-files.js');
65 changes: 65 additions & 0 deletions test/test-data/example-robots-txt-bbc.js
@@ -0,0 +1,65 @@
module.exports = `
# version: d167f71fcf4277403ce3b7118a1fad5d25a41310

# HTTPS www.bbc.co.uk

User-agent: *
Sitemap: https://www.bbc.co.uk/sitemap.xml
Sitemap: https://www.bbc.co.uk/sitemaps/https-index-uk-archive.xml
Sitemap: https://www.bbc.co.uk/sitemaps/https-index-uk-news.xml
Sitemap: https://www.bbc.co.uk/food/sitemap.xml
Sitemap: https://www.bbc.co.uk/bitesize/sitemap/sitemapindex.xml
Sitemap: https://www.bbc.co.uk/teach/sitemap/sitemapindex.xml
Sitemap: https://www.bbc.co.uk/sitemaps/https-index-uk-archive_video.xml
Sitemap: https://www.bbc.co.uk/sitemaps/https-index-uk-video.xml
Sitemap: https://www.bbc.co.uk/sitemaps/sitemap-uk-ws-topics.xml
Sitemap: https://www.bbc.co.uk/sport/sitemap.xml

Disallow: /cbbc/search$
Disallow: /cbbc/search/
Disallow: /cbbc/search?
Disallow: /cbeebies/search$
Disallow: /cbeebies/search/
Disallow: /cbeebies/search?
Disallow: /chwilio/
Disallow: /chwilio$
Disallow: /chwilio?
Disallow: /iplayer/bigscreen/
Disallow: /iplayer/cbbc/episodes/
Disallow: /iplayer/cbbc/search
Disallow: /iplayer/cbeebies/episodes/
Disallow: /iplayer/cbeebies/search
Disallow: /iplayer/search
Disallow: /indepthtoolkit/smallprox$
Disallow: /indepthtoolkit/smallprox/
Disallow: /modules/musicnav/language/
Disallow: /news/0
Disallow: /radio/aod/
Disallow: /radio/aod$
Disallow: /radio/imda
Disallow: /radio/player/
Disallow: /radio/player$
Disallow: /search/
Disallow: /search$
Disallow: /search?
Disallow: /sport/videos/*
Disallow: /sounds/player/
Disallow: /sounds/player$
Disallow: /ugc$
Disallow: /ugc/
Disallow: /ugcsupport$
Disallow: /ugcsupport/
Disallow: /userinfo/
Disallow: /userinfo
Disallow: /food/favourites
Disallow: /food/menus/*/shopping-list
Disallow: /food/recipes/*/shopping-list
Disallow: /food/search*?*
Disallow: /sounds/search$
Disallow: /sounds/search/
Disallow: /sounds/search?
Disallow: /ws/includes

User-agent: magpie-crawler
Disallow: /
`;
35 changes: 35 additions & 0 deletions test/test-data/example-robots-txt-bcc.js
@@ -0,0 +1,35 @@
module.exports = `
User-agent: *
Disallow: /*?fh_location*
Disallow: /search?*
Disallow: /*?price*
Disallow: /*?aspxerrorpath*
Disallow: /*?product*
Disallow: /*?tt*
Disallow: /*?category*
Disallow: /*?CID*
Disallow: /*?nietgevonden*
Disallow: /*?viewmode*
Disallow: /*?cIdee*
Disallow: /*?orderId*
Disallow: /*?token*
Disallow: /klantenservice?searchQuery=*
Disallow: /productdetail*
Disallow: /artikel=*

#Blog
User-agent: *
Disallow: /blog/wp-admin/
Disallow: /blog/tag/
Disallow: /page/
Disallow: /blog/ultimate_slider*

#Sitemap
User-Agent: *
Sitemap: https://www.bcc.nl/sitemap.xml
Sitemap: https://www.bcc.nl/cms-sitemap.xml
Sitemap: https://www.bcc.nl/video-sitemap.xml
Sitemap: https://www.bcc.nl/blog/sitemap.xml
Sitemap: https://www.bcc.nl/brands-sitemap.xml
Sitemap: https://www.bcc.nl/products-sitemap.xml
`;
1 change: 1 addition & 0 deletions test/test-data/example-robots-txt-karwei.js

Some generated files are not rendered by default.

22 changes: 22 additions & 0 deletions test/test-data/example-robots-txt-zalando.js
@@ -0,0 +1,22 @@
module.exports = `
User-agent: *
Disallow: /cart/*
Disallow: /wardrobe/*
Disallow: /myaccount/*
Disallow: /opinions*
Disallow: /api/recos/reco
Disallow: /api/catalog/logs
Disallow: /api/pdp/sizereco
Disallow: /api/shop-the-look/looks
Disallow: /api/pdp/partner
Disallow: /api/cmag
Disallow: /api/navigation/wishlist-count
Disallow: /api/navigation/cart-count
Disallow: /api/catalog/logs
Disallow: /api/catalog/seo*
Disallow: /api/t/i
Disallow: /api/rr

User-agent: Screaming Frog SEO Spider
Disallow: /
`;
12 changes: 12 additions & 0 deletions test/util/format-link.js
@@ -25,6 +25,18 @@ const links = [
}, {
string: 'mailto:someone@example.com',
result: 'http://example.com',
+  }, {
+    string: 'fakeaddress.de',
+    result: 'http://fakeaddress.de',
+  }, {
+    string: 'cheese.nl',
+    result: 'http://cheese.nl',
+  }, {
+    string: 'mailto:someone@example.co.uk',
+    result: 'http://example.co.uk',
+  }, {
+    string: 'https://cheese.nl',
+    result: 'https://cheese.nl',
+  },
},
];

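The new fixtures pin down how `formatLink` is expected to normalise input: bare domains gain an `http://` prefix, `mailto:` addresses collapse to their domain, and existing schemes are preserved. A rough sketch of logic consistent with these fixtures, not the module's actual implementation:

```js
// Sketch consistent with the fixtures above; not the real formatLink.
function formatLink(link) {
  // mailto:someone@example.co.uk -> example.co.uk
  let result = link.replace(/^mailto:[^@]+@/, '');
  // Bare domains get a default http:// scheme; https:// is preserved.
  if (!/^https?:\/\//.test(result)) {
    result = `http://${result}`;
  }
  return result;
}

console.log(formatLink('cheese.nl'));                    // http://cheese.nl
console.log(formatLink('https://cheese.nl'));            // https://cheese.nl
console.log(formatLink('mailto:someone@example.co.uk')); // http://example.co.uk
```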