Skip to content

String can be used as source & work as ES module #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
node_modules
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,23 @@ Promise based parser for robots.txt files. Allows webcrawlers and other scraping

## Getting Started

### Fetch from URL

```js
const Robots = require("parse-robots");
import Robots from "parse-robots";

const robot = await Robots("https://facebook.com/robots.txt");
```

### Parse robots.txt as String

```js
import Robots from "parse-robots";

const robots_txt = "User-agent: Some-UA\nAllow: /";
const robot = await Robots(robots_txt, true);
```

## Methods

- `isAllowed(path, agent)` - Boolean
Expand Down
38 changes: 22 additions & 16 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
const fetch = require("./lib/fetch");
module.exports = async function(url, options = {}) {
var _res = await fetch(url, options);
if (_res.status != 200) return null;
if (_res.content.toLowerCase().includes("<html")) return null;
var _content = _res.content
"use strict";

import * as fetch from "./lib/fetch.js";

export default async function(source, options = {}) {
let _res = source;
if (options !== true) {
_res = await fetch(source, options);
if (_res.status != 200) return null;
if (_res.content.toLowerCase().includes("<html")) return null;
}
let _content = _res.content
.split(/\r?\n/)
.filter(row => row.match(/^[ -~]+$/gim))
.join("\r\n");
var _sitemaps = [];
var _agents = [];
var _groups = {};
var _crawldelay = null;
var _host = null;
let _sitemaps = [];
let _agents = [];
let _groups = {};
let _crawldelay = null;
let _host = null;
_getGroups();
function _getGroups() {
var currentGroup = "";
var unsortedGroups = _content
let currentGroup = "";
let unsortedGroups = _content
.split(/\r?\n/)
.filter(row => row.trim().match(/^(Allow|Disallow|User-agent).*/gim))
.reduce((acc, cur) => {
Expand Down Expand Up @@ -99,7 +105,7 @@ module.exports = async function(url, options = {}) {
},
getSitemaps: function() {
const reg = /Sitemap: *([^\r\n]*)/gi;
var match = reg.exec(_content);
let match = reg.exec(_content);

while (match != null) {
_sitemaps.push(match[1]);
Expand All @@ -109,15 +115,15 @@ module.exports = async function(url, options = {}) {
},
getCrawlDelay: function(max = 60) {
const reg = /crawl-delay: *([^\r\n]*)/gi;
var match = reg.exec(_content);
let match = reg.exec(_content);
if (match) {
_crawldelay = parseInt(match[1]) > max ? max : parseInt(match[1]);
}
return _crawldelay;
},
getHost: function() {
const reg = /host: *([^\r\n]*)/gi;
var match = reg.exec(_content);
let match = reg.exec(_content);
if (match) _host = match[1];
return _host;
}
Expand Down
6 changes: 3 additions & 3 deletions lib/fetch.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"use strict";
const got = require("got");

module.exports = async function fetch(url, options = {}) {
import * as got from "got";

export default async function fetch(url, options = {}) {
const userAgent =
options.agent ||
"Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0";
Expand All @@ -28,7 +29,6 @@ module.exports = async function fetch(url, options = {}) {
headers: response.headers,
content: null
});
throw Error(url);
}
});

Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"version": "3.2.0",
"description": "Promise based parser for robots.txt files.",
"main": "index.js",
"type": "module",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
Expand Down