a minimal puppeteer crawler api
- crwlr:
- handles the boring boilerplate work of actually crawling a site
- You provide:
- <String> `url` to start from
- <Puppeteer Browser> `browser` instance with your own `.launch(options)`
- `pageOptions` as you wish:
  - <Object> `goto` to be provided as options to `page.goto(url, options)`
  - <Function> `prepare(page)` binds event handlers and/or sets properties for every new page
  - <Function> `resolved(response, page)` fires after every `page.goto()` has resolved
- <Object>
- <String>
$ npm install --save crwlr
'use strict';
const puppeteer = require('puppeteer');
const crwlr = require('crwlr');
const site = 'https://buster.neocities.org/crwlr/';
// *** Basic Example Without Any Options *** //
// Launch a browser, crawl `site` without any page options, and print the
// array of crawled URLs that crwlr resolves with.
(async () => {
  const browser = await puppeteer.launch();
  // NOTE(review): nothing in this example closes `browser`; confirm whether
  // crwlr does so itself, otherwise add `await browser.close()` after the crawl.
  const crawledPages = await crwlr(browser, site);
  console.log(crawledPages);
})().catch(error => {
  // Report launch/crawl failures instead of leaving the rejection unhandled.
  console.error(error);
  process.exitCode = 1;
});
/*
[ 'https://buster.neocities.org/crwlr/',
'https://buster.neocities.org/crwlr/other.html',
'https://buster.neocities.org/crwlr/mixed-content.html',
'https://buster.neocities.org/crwlr/missing.html',
'https://buster.neocities.org/crwlr/dummy.pdf' ]
*/
'use strict';
const puppeteer = require('puppeteer');
const crwlr = require('crwlr');
const site = 'https://buster.neocities.org/crwlr/';
// *** Advanced Example With Options *** //
// Crawl `site` in a non-headless browser, logging every requested script URL
// and the HTTP status of each page once its navigation has resolved.
(async () => {
  const browser = await puppeteer.launch({
    headless: false
  });
  const pageOptions = {
    // Called for every new page: bind event handlers and/or set properties.
    prepare: page => {
      page.on('request', request => {
        // Only report requests whose URL ends in ".js".
        if (/\.js$/.test(request.url())) {
          console.log(`${page.url()} => requested: ${request.url()}`);
        }
      });
    },
    // Provided as the options argument of page.goto(url, options).
    goto: {
      waitUntil: 'networkidle2'
    },
    // Fires after every page.goto() has resolved.
    resolved: (response, page) => {
      console.log(`=> resolved: ${response.status()} ${page.url()}`);
    }
  };
  await crwlr(browser, site, pageOptions);
})().catch(error => {
  // Report launch/crawl failures instead of leaving the rejection unhandled.
  console.error(error);
  process.exitCode = 1;
});
/*
=> resolved: 200 https://buster.neocities.org/crwlr/
=> resolved: 200 https://buster.neocities.org/crwlr/other.html
https://buster.neocities.org/crwlr/mixed-content.html => requested: https://mixed-script.badssl.com/nonsecure.js
=> resolved: 200 https://buster.neocities.org/crwlr/mixed-content.html
=> resolved: 404 https://buster.neocities.org/crwlr/missing.html
=> resolved: 200 https://buster.neocities.org/crwlr/dummy.pdf
*/
ISC © Buster Collings