diff --git a/index.ts b/index.ts
new file mode 100644
index 0000000..df8d544
--- /dev/null
+++ b/index.ts
@@ -0,0 +1,18 @@
+import {tripHarvest} from './trip-harvester'
+import * as dotenv from "dotenv";
+import fs from "fs/promises";
+
+dotenv.config();
+
+// Fail fast: without an ID the scraper would request "Attraction_Review-undefined-Reviews".
+const tripAdvisorId = process.env.TRIPADVISOR_ID;
+if (!tripAdvisorId) {
+  throw new Error("TRIPADVISOR_ID is not set (environment or .env file)");
+}
+
+// Scrape the reviews, then dump them into a timestamped JSON file.
+const reviews = await tripHarvest(tripAdvisorId);
+await fs.writeFile(
+  `out-${tripAdvisorId}-${Math.floor(Date.now() / 1000)}.json`,
+  JSON.stringify(reviews)
+);
\ No newline at end of file
diff --git a/package.json b/package.json
index 53d30ed..106ea0f 100644
--- a/package.json
+++ b/package.json
@@ -4,7 +4,7 @@
   "description": "",
   "type": "module",
   "scripts": {
-    "scrap": "node --loader ts-node/esm ./harvester.ts",
+    "scrap": "node --experimental-specifier-resolution=node --loader ts-node/esm ./index.ts",
     "test": "echo \"Error: no test specified\" && exit 1"
   },
   "author": "",
diff --git a/harvester.ts b/trip-harvester/index.ts
similarity index 73%
rename from harvester.ts
rename to trip-harvester/index.ts
index 0a54cf6..af128d5 100644
--- a/harvester.ts
+++ b/trip-harvester/index.ts
@@ -1,22 +1,7 @@
-import * as dotenv from "dotenv"; // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import
-import { launch, Browser } from "puppeteer";
-import fs from 'fs/promises'
-
-dotenv.config();
-const TRIPADVISOR_ID = process.env.TRIPADVISOR_ID;
-const TRIPADVISOR_BASE = process.env.TRIPADVISOR_BASE;
-const TRIPADVISOR_FULL = process.env.TRIPADVISOR_FULL;
-const TRIPADVISOR_USER_REVIEW_BASE = `${TRIPADVISOR_BASE}ShowUserReviews-${TRIPADVISOR_ID}`;
-const TRIPADVISOR_BASE_ACITVITY = `${TRIPADVISOR_BASE}Attraction_Review-${TRIPADVISOR_ID}-Reviews`; //?filterLang=ALL
-const TRIPADVISOR_PAGES = [
-  process.env.TRIPADVISOR_PAGES_0,
-  process.env.RIPADVISOR_PAGES_1,
-  process.env.RIPADVISOR_PAGES_2,
-];
-const MAX_REVIEW_PAGE = 50
+import { launch, Browser } from "puppeteer";
 
 
-console.log(`Working for : ${TRIPADVISOR_BASE_ACITVITY}`);
+const MAX_REVIEW_PAGE = 1000; // there are 5 reviews per page
 
 type TripadvisorReview = {
   reviewId?: string;
@@ -54,7 +39,9 @@ const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
           resolve(results);
         }
         if (document.querySelector("body").innerText.includes("Google")) {
-          console.log("start scraping review in this page\n"+window.location.href);
+          console.log(
+            "start scraping review in this page\n" + window.location.href
+          );
           clearInterval(interval);
           let items = document.body.querySelectorAll("[data-reviewid]");
           for (let i = 0; i < items.length; i++) {
@@ -70,7 +57,9 @@ const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
                 '[data-test-target="review-title"]'
               ) as HTMLElement
             ).innerText;
-            console.log(`\t got item ${i+1} id:${reviewId} title:${reviewTitle}`);
+            console.log(
+              `\t got item ${i + 1} id:${reviewId} title:${reviewTitle}`
+            );
             let experience = item
               .querySelectorAll("span")[6]
               .innerText.replace(/^.*: /, "");
@@ -85,7 +74,9 @@ const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
           }
           resolve(results);
         } else {
-          console.log(`still wait for page being ready (perhaps there is no more element)\n${window.location.href}`);
+          console.log(
+            `still wait for page being ready (perhaps there is no more element)\n${window.location.href}`
+          );
         }
       }, WAITFOR_LANGUAGE_RADIO_INTERVAL);
     });
@@ -96,7 +87,7 @@ const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
   return parse();
 };
 
-const processor = (browser: Browser, href: string) => {
+const processor = (browser: Browser, href: string, tripAdvisorReviewBase:string) => {
   return new Promise((resolve, reject) => {
     browser.newPage().then((page) => {
       page.on("console", (msg) => console.log("BROWSER LOG:", msg.text())); //capture in browser console
@@ -128,7 +119,7 @@ const processor = (browser: Browser, href: string) => {
             review = review.concat(
               await page.evaluate(
                 evaluateTripAdvisorPage,
-                TRIPADVISOR_USER_REVIEW_BASE
+                tripAdvisorReviewBase
               )
             );
             const aNext = await page.$("span.pageNum + a");
@@ -138,7 +129,7 @@ const processor = (browser: Browser, href: string) => {
                 page.click("span.pageNum + a"),
               ]);
             } else {
-              i = MAX_REVIEW_PAGE+1;
+              i = MAX_REVIEW_PAGE + 1;
               break;
             }
           }
@@ -155,19 +146,49 @@ const processor = (browser: Browser, href: string) => {
     });
   });
 };
-launch({
-  headless: true,
-  devtools: true,
-  args: ["--no-sandbox", "--disable-setuid-sandbox", "--window-size=1920,1080"],
-}).then(async (browser: Browser) => {
-  let promises = [];
+/**
+ * Scrapes the reviews of one TripAdvisor attraction.
+ *
+ * Launches a Puppeteer browser, hands the attraction's review URL to
+ * `processor`, and always closes the browser again, even when scraping fails.
+ *
+ * @param tripAdvisorID Identifier inserted into the TripAdvisor URLs
+ *   (`Attraction_Review-<id>-Reviews` and `ShowUserReviews-<id>`).
+ * @returns The reviews resolved by `processor`.
+ * @throws Rejects when the browser cannot be launched or `processor` rejects.
+ */
+const tripHarvest = async (
+  tripAdvisorID: string
+): Promise<TripadvisorReview[]> => {
+  const TRIPADVISOR_BASE = "https://www.tripadvisor.com/";
+  const TRIPADVISOR_USER_REVIEW_BASE = `${TRIPADVISOR_BASE}ShowUserReviews-${tripAdvisorID}`;
+  const TRIPADVISOR_BASE_ACITVITY = `${TRIPADVISOR_BASE}Attraction_Review-${tripAdvisorID}-Reviews`; //?filterLang=ALL
+
+  console.log(`Working for : ${TRIPADVISOR_BASE_ACITVITY}`);
 
-  const href = TRIPADVISOR_BASE_ACITVITY; //`${TRIPADVISOR_BASE_ACITVITY}${subpage}`;
+  const browser = await launch({
+    headless: true,
+    devtools: true,
+    args: [
+      "--no-sandbox",
+      "--disable-setuid-sandbox",
+      "--window-size=1920,1080",
+    ],
+  });
+  try {
+    // processor forwards this to evaluateTripAdvisorPage, which expects the ShowUserReviews base, not the site root.
+    const reviews = (await processor(
+      browser,
+      TRIPADVISOR_BASE_ACITVITY,
+      TRIPADVISOR_USER_REVIEW_BASE
+    )) as TripadvisorReview[];
+    console.log(`Retrieved ${reviews.length} reviews`);
+    return reviews;
+  } finally {
+    await browser.close();
+  }
+};
 
 
-  promises.push(processor(browser, TRIPADVISOR_BASE_ACITVITY));
-  const reviews = await Promise.all(promises);
-  await browser.close();
-  console.log(`Retrieved ${reviews[0].length} reviews`)
-  await fs.writeFile(`out-${Math.floor(Date.now() / 1000)}.json`,JSON.stringify(reviews[0]))
-});
+export {tripHarvest}
+export type {TripadvisorReview}
\ No newline at end of file
diff --git a/tsconfig.json b/tsconfig.json
index d2726f1..2438ecd 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -9,7 +9,6 @@
     "esModuleInterop": true,
     "resolveJsonModule": true,
     "jsx": "preserve",
-    "baseUrl": ".",
-    "paths": {}
+    "baseUrl": "."
   }
 }
\ No newline at end of file