Skip to content

Commit

Permalink
modularize
Browse files Browse the repository at this point in the history
  • Loading branch information
Ronan LE MEILLAT committed Sep 17, 2022
1 parent 38aa69a commit 0a753f8
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 39 deletions.
11 changes: 11 additions & 0 deletions index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import {tripHarvest} from './trip-harvester'
import * as dotenv from "dotenv";
import fs from "fs/promises";

// Load variables from a local .env file into process.env before reading them.
dotenv.config();

// process.env values are `string | undefined`; tripHarvest requires a string.
// Fail fast instead of scraping ".../Attraction_Review-undefined-Reviews" and
// writing an "out-undefined-*.json" file.
const tripAdvisorId = process.env.TRIPADVISOR_ID;
if (!tripAdvisorId) {
  throw new Error("TRIPADVISOR_ID environment variable is not set");
}

const reviews = await tripHarvest(tripAdvisorId);

// Output file name embeds the TripAdvisor ID and the current Unix time in seconds.
await fs.writeFile(
  `out-${tripAdvisorId}-${Math.floor(Date.now() / 1000)}.json`,
  JSON.stringify(reviews)
);
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"description": "",
"type": "module",
"scripts": {
"scrap": "node --loader ts-node/esm ./harvester.ts",
"scrap": "node --experimental-specifier-resolution=node --loader ts-node/esm ./index.ts",
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
Expand Down
84 changes: 48 additions & 36 deletions harvester.ts → trip-harvester/index.ts
Original file line number Diff line number Diff line change
@@ -1,22 +1,7 @@
import * as dotenv from "dotenv"; // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import
import { launch, Browser } from "puppeteer";
import fs from 'fs/promises'

dotenv.config();

const TRIPADVISOR_ID = process.env.TRIPADVISOR_ID;
const TRIPADVISOR_BASE = process.env.TRIPADVISOR_BASE;
const TRIPADVISOR_FULL = process.env.TRIPADVISOR_FULL;
const TRIPADVISOR_USER_REVIEW_BASE = `${TRIPADVISOR_BASE}ShowUserReviews-${TRIPADVISOR_ID}`;
const TRIPADVISOR_BASE_ACITVITY = `${TRIPADVISOR_BASE}Attraction_Review-${TRIPADVISOR_ID}-Reviews`; //?filterLang=ALL
const TRIPADVISOR_PAGES = [
process.env.TRIPADVISOR_PAGES_0,
process.env.RIPADVISOR_PAGES_1,
process.env.RIPADVISOR_PAGES_2,
];
const MAX_REVIEW_PAGE = 50
import { launch, Browser } from "puppeteer";

console.log(`Working for : ${TRIPADVISOR_BASE_ACITVITY}`);
const MAX_REVIEW_PAGE = 1000; // there are 5 reviews per page

type TripadvisorReview = {
reviewId?: string;
Expand Down Expand Up @@ -54,7 +39,9 @@ const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
resolve(results);
}
if (document.querySelector("body").innerText.includes("Google")) {
console.log("start scraping review in this page\n"+window.location.href);
console.log(
"start scraping review in this page\n" + window.location.href
);
clearInterval(interval);
let items = document.body.querySelectorAll("[data-reviewid]");
for (let i = 0; i < items.length; i++) {
Expand All @@ -70,7 +57,9 @@ const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
'[data-test-target="review-title"]'
) as HTMLElement
).innerText;
console.log(`\t got item ${i+1} id:${reviewId} title:${reviewTitle}`);
console.log(
`\t got item ${i + 1} id:${reviewId} title:${reviewTitle}`
);
let experience = item
.querySelectorAll("span")[6]
.innerText.replace(/^.*: /, "");
Expand All @@ -85,7 +74,9 @@ const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
}
resolve(results);
} else {
console.log(`still wait for page being ready (perhaps there is no more element)\n${window.location.href}`);
console.log(
`still wait for page being ready (perhaps there is no more element)\n${window.location.href}`
);
}
}, WAITFOR_LANGUAGE_RADIO_INTERVAL);
});
Expand All @@ -96,7 +87,7 @@ const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
return parse();
};

const processor = (browser: Browser, href: string) => {
const processor = (browser: Browser, href: string, tripAdvisorReviewBase:string) => {
return new Promise<TripadvisorReview[]>((resolve, reject) => {
browser.newPage().then((page) => {
page.on("console", (msg) => console.log("BROWSER LOG:", msg.text())); //capture in browser console
Expand Down Expand Up @@ -128,7 +119,7 @@ const processor = (browser: Browser, href: string) => {
review = review.concat(
await page.evaluate(
evaluateTripAdvisorPage,
TRIPADVISOR_USER_REVIEW_BASE
tripAdvisorReviewBase
)
);
const aNext = await page.$("span.pageNum + a");
Expand All @@ -138,7 +129,7 @@ const processor = (browser: Browser, href: string) => {
page.click("span.pageNum + a"),
]);
} else {
i = MAX_REVIEW_PAGE+1;
i = MAX_REVIEW_PAGE + 1;
break;
}
}
Expand All @@ -155,19 +146,40 @@ const processor = (browser: Browser, href: string) => {
});
});
};
launch({
headless: true,
devtools: true,

args: ["--no-sandbox", "--disable-setuid-sandbox", "--window-size=1920,1080"],
}).then(async (browser: Browser) => {
let promises = [];
/**
 * Collect the reviews of one TripAdvisor attraction.
 *
 * Launches a puppeteer browser, hands the attraction's review listing URL to
 * `processor`, and closes the browser again whether or not scraping succeeded.
 *
 * @param tripAdvisorID - TripAdvisor identifier inserted into the
 *   `Attraction_Review-<id>-Reviews` and `ShowUserReviews-<id>` URLs.
 * @returns The reviews produced by `processor` for that attraction.
 * @throws Rejects with the underlying error if the browser cannot be launched
 *   or `processor` rejects (previously the returned promise never settled).
 */
const tripHarvest = async (
  tripAdvisorID: string
): Promise<TripadvisorReview[]> => {
  const baseUrl = "https://www.tripadvisor.com/";
  const userReviewBase = `${baseUrl}ShowUserReviews-${tripAdvisorID}`;
  const activityUrl = `${baseUrl}Attraction_Review-${tripAdvisorID}-Reviews`; //?filterLang=ALL

  console.log(`Working for : ${activityUrl}`);

  const browser = await launch({
    headless: true,
    devtools: true,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--window-size=1920,1080",
    ],
  });

  try {
    // The third argument is forwarded by `processor` to the in-page scraper,
    // whose parameter is the user-review base URL, not the bare site root.
    const reviews = await processor(browser, activityUrl, userReviewBase);
    console.log(`Retrieved ${reviews.length} reviews`);
    return reviews;
  } finally {
    // Always release the browser process, even when scraping failed.
    await browser.close();
  }
};

promises.push(processor(browser, TRIPADVISOR_BASE_ACITVITY));
const reviews = await Promise.all(promises);
await browser.close();
console.log(`Retrieved ${reviews[0].length} reviews`)
await fs.writeFile(`out-${Math.floor(Date.now() / 1000)}.json`,JSON.stringify(reviews[0]))
});
export {tripHarvest}
export type {TripadvisorReview}
3 changes: 1 addition & 2 deletions tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
"esModuleInterop": true,
"resolveJsonModule": true,
"jsx": "preserve",
"baseUrl": ".",
"paths": {}
"baseUrl": "."
}
}

0 comments on commit 0a753f8

Please sign in to comment.