Skip to content
This repository has been archived by the owner on Jan 31, 2021. It is now read-only.

Commit

Permalink
Merge pull request #1 from mperreux/parallelize-scraper
Browse files Browse the repository at this point in the history
Parallelize scraper
  • Loading branch information
99littlebugs authored Jan 16, 2021
2 parents fe94b8c + b2da122 commit 972ce67
Showing 1 changed file with 30 additions and 27 deletions.
57 changes: 30 additions & 27 deletions scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -127,38 +127,41 @@ const nyGovSites = [{
}];

(async () => {
// todo: parallelize?
const browser = await puppeteer.launch();
const [page] = await browser.pages();
const promises = [];
for (const site of nyGovSites) {
site.events = [];
try {
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75');
await page.goto(site.link);
await page.waitForSelector("#pagetitle, h1", { timeout: 10000 });
const isError = await page.$("h1");
if (!isError) {
site.events = await page.evaluate(() => {
const events = [];
const eventsWeb = $(".event-type");
for (const event of eventsWeb) {
const isAvailable = $(event).find("button").text() !== "Event Full";
if (isAvailable) {
events.push({
date: new Date($(event).find("div div:contains('Date:'):last").first().text().substring(6) + " UTC").toISOString(),
time: $(event).find("div div:contains('Time:'):last").first().text().substring(6),
appointments: $(event).find("div div:contains('Appointments Available:'):last").first().text().substring(24),
linkId: $(event).parent().attr("id")
});
promises.push((async () => {
site.events = [];
try {
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75');
await page.goto(site.link, {timeout: 10000});
await page.waitForSelector("#pagetitle, h1", { timeout: 10000 });
const isError = await page.$("h1");
if (!isError) {
site.events = await page.evaluate(() => {
const events = [];
const eventsWeb = $(".event-type");
for (const event of eventsWeb) {
const isAvailable = $(event).find("button").text() !== "Event Full";
if (isAvailable) {
events.push({
date: new Date($(event).find("div div:contains('Date:'):last").first().text().substring(6) + " UTC").toISOString(),
time: $(event).find("div div:contains('Time:'):last").first().text().substring(6),
appointments: $(event).find("div div:contains('Appointments Available:'):last").first().text().substring(24),
linkId: $(event).parent().attr("id")
});
}
}
}
return events;
});
return events;
});
}
} catch (err) {
console.error(site.link + ": " + err); // todo: log oout to node
}
} catch (err) {
console.error(site.link + ": " + err); // todo: log oout to node
}
})())
}
await Promise.all(promises);
await browser.close();

const data = JSON.stringify({ sites: nyGovSites }, null, 2);
Expand Down

0 comments on commit 972ce67

Please sign in to comment.