Skip to content

Commit

Permalink
final
Browse files Browse the repository at this point in the history
  • Loading branch information
Ronan LE MEILLAT committed Sep 17, 2022
1 parent 0a753f8 commit b609386
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 16 deletions.
68 changes: 68 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Retrieve reviews from Tripadvisor with Puppeteer (in Sept 2022)!

With Puppeteer it wasn't so difficult to retrieve all the reviews for a specific "Thing to do" on Tripadvisor.


# What to do

First, retrieve the Tripadvisor ID from the URL of the attraction page.

For example, `g187261-d7680662`,
taken from a long Tripadvisor URL.

## Let's run

Just edit the `index.ts` TypeScript file to fit your requirements. Here is an example for the ID we grabbed above:
```javascript
import { tripReviewHarvest } from './trip-harvester'
import fs from "fs/promises";

const reviews = await tripReviewHarvest('g187261-d7680662')

await fs.writeFile(
`out-reviews-${'g187261-d7680662'}-${Math.floor(Date.now() / 1000)}.json`,
JSON.stringify(reviews)
);
```
After that we can run the harvester:
```bash
npm run scrap
```
After a while, we get all the reviews in a JSON file.
```javascript
{
"globalRating": 4.5,
"reviews": [
{
"reviewId": "852911023",
"reviewerUrl": "https://www.tripadvisor.com/Profile/kleclercq",
"reviewerName": "klerclercq",
"reviewDate": "Aug 2022",
"rating": 5,
"title": "Une superbe orga canyoning",
"text": "ne superbe matinee avec Jeremy pour une sortie canyoning Nous en faisons chaque année avec nos deux filles et c’est le meilleur spot, super diversifié (tyrolienne saut rappel …) On a adoré En plus un service au top : emmenés et ramenés car nous ne souhaitons pas toucher à la",
"exp": "",
"url": "https://www.tripadvisor.com/ShowUserReviews-g187261-d7680662-r852911023"
},
{
"reviewId": "849994274",
"reviewerUrl": "https://www.tripadvisor.com/Profile/tedsouder",
"reviewerName": "Ted S",
"reviewDate": "Jul 2022",
"rating": 5,
"title": "Paragliding in Chamonix is not to be missed! Call the team at Evolution 2 to book your trip today!",
"text": "Really incredible experience in Chamonix. I was there with my son and nephew and was able to schedule a last-minute, 45-minute paragliding session for all three of us. Our guides met us at the lift at the agreed-upon time, they were super nice and helpful answering all our",
"exp": "",
"url": "https://www.tripadvisor.com/ShowUserReviews-g187261-d7680662-r849994274"
},
// lot of entries
]
}
```
## Is it legal?

So is it legal or illegal? Web scraping and crawling aren’t illegal by themselves. After all, you could scrape or crawl your own website without a hitch. If you scrape the reviews of your own company, it is not illegal.

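## How to debug?

Set the `headless` parameter (the second argument of `tripReviewHarvest`, `true` by default) to `false` so that the browser window is shown while scraping.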
6 changes: 3 additions & 3 deletions index.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import {tripHarvest} from './trip-harvester'
import {tripReviewHarvest} from './trip-harvester'
import * as dotenv from "dotenv";
import fs from "fs/promises";

dotenv.config();

const reviews = await tripHarvest(process.env.TRIPADVISOR_ID)
const reviews = await tripReviewHarvest(process.env.TRIPADVISOR_ID,false)
await fs.writeFile(
`out-${process.env.TRIPADVISOR_ID}-${Math.floor(Date.now() / 1000)}.json`,
`out-reviews-${process.env.TRIPADVISOR_ID}-${Math.floor(Date.now() / 1000)}.json`,
JSON.stringify(reviews)
);
57 changes: 44 additions & 13 deletions trip-harvester/index.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@

import { launch, Browser } from "puppeteer";

const MAX_REVIEW_PAGE = 1000; // there are 5 reviews per page

/**
 * A single review scraped from a Tripadvisor attraction page.
 *
 * Every field is optional, so a partially populated review is still a
 * valid value of this type.
 */
type TripadvisorReview = {
// Tripadvisor identifier of the review (NOTE(review): presumably read from the
// element's `data-reviewid` attribute; the assignment is not visible here).
reviewId?: string;
// `href` of the reviewer's `a.ui_header_link` profile anchor.
reviewerUrl?: string;
// Inner text of the same `a.ui_header_link` anchor.
reviewerName?: string;
// Last two words of the text surrounding the reviewer link, e.g. "Aug 2022".
reviewDate?: string;
// Rating parsed from the review's `.ui_bubble_rating` element.
rating?: number;
// Title of the review.
title?: string;
// Review body, taken from the `innerHTML` of the fourth `span` in the review element.
text?: string;
// NOTE(review): meaning not visible in this excerpt; sample output shows an empty string.
exp?: string;
// NOTE(review): presumably the permalink of the review; confirm against the scraper.
url?: string;
};

/**
 * Result of harvesting one attraction: its overall rating together with
 * every review collected while paging through the listing.
 */
type TripadvisorRating = {
// Overall rating of the attraction: the digits found in the rating bubble's
// `class` attribute, divided by 10.
globalRating: number;
// Reviews accumulated across all visited review pages.
reviews: TripadvisorReview[];
};

const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
////////////
const WAITFOR_LANGUAGE_RADIO_INTERVAL = 1000;
Expand Down Expand Up @@ -46,6 +53,9 @@ const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
let items = document.body.querySelectorAll("[data-reviewid]");
for (let i = 0; i < items.length; i++) {
const item = items[i];
const reviewerUrl = (item.parentElement.querySelector('a.ui_header_link')as HTMLAnchorElement).href
const reviewerName = (item.parentElement.querySelector('a.ui_header_link') as HTMLAnchorElement).innerText
const reviewDate = item.parentElement.querySelector('a.ui_header_link').parentElement.innerText.match(/\w+\W+\w+$/)[0]
let ratingElement = item
.querySelector(".ui_bubble_rating")
.getAttribute("class");
Expand All @@ -65,6 +75,9 @@ const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
.innerText.replace(/^.*: /, "");
results.push({
reviewId: reviewId,
reviewerUrl: reviewerUrl,
reviewerName: reviewerName,
reviewDate: reviewDate,
rating: parsedRating,
title: reviewTitle,
text: item.querySelectorAll("span")[3].innerHTML,
Expand All @@ -87,8 +100,12 @@ const evaluateTripAdvisorPage = (TRIPADVISOR_USER_REVIEW_BASE: string) => {
return parse();
};

const processor = (browser: Browser, href: string, tripAdvisorReviewBase:string) => {
return new Promise<TripadvisorReview[]>((resolve, reject) => {
const processor = (
browser: Browser,
href: string,
tripAdvisorReviewBase: string
) => {
return new Promise<TripadvisorRating>((resolve, reject) => {
browser.newPage().then((page) => {
page.on("console", (msg) => console.log("BROWSER LOG:", msg.text())); //capture in browser console
page.goto(href).then((data) => {
Expand All @@ -114,9 +131,21 @@ const processor = (browser: Browser, href: string, tripAdvisorReviewBase:string)
{ timeout: 3002 }
)
.then(async (data) => {
let review = [] as TripadvisorReview[];
let rating = {
globalRating: 5,
reviews: [] as TripadvisorReview,
} as TripadvisorRating;
rating.globalRating = await page.$eval(
'div.ui_poi_review_rating > span.ui_bubble_rating',
(el) =>
parseInt(
el
.getAttribute("class")
.replace(/[^0-9]/g, "")
) / 10
);
for (let i = 0; i < MAX_REVIEW_PAGE; i++) {
review = review.concat(
rating.reviews = rating.reviews.concat(
await page.evaluate(
evaluateTripAdvisorPage,
tripAdvisorReviewBase
Expand All @@ -134,7 +163,7 @@ const processor = (browser: Browser, href: string, tripAdvisorReviewBase:string)
}
}

resolve(review);
resolve(rating);
});
});
});
Expand All @@ -147,8 +176,8 @@ const processor = (browser: Browser, href: string, tripAdvisorReviewBase:string)
});
};

const tripHarvest = (
tripAdvisorID: string
const tripReviewHarvest = (
tripAdvisorID: string, headless=true
): Promise<TripadvisorReview[]> => {
const TRIPADVISOR_ID = tripAdvisorID;
const TRIPADVISOR_BASE = "https://www.tripadvisor.com/";
Expand All @@ -159,7 +188,7 @@ const tripHarvest = (

return new Promise<TripadvisorReview[]>((resolve) => {
launch({
headless: true,
headless: headless,
devtools: true,

args: [
Expand All @@ -172,14 +201,16 @@ const tripHarvest = (

const href = TRIPADVISOR_BASE_ACITVITY; //`${TRIPADVISOR_BASE_ACITVITY}${subpage}`;

promises.push(processor(browser, TRIPADVISOR_BASE_ACITVITY,TRIPADVISOR_BASE));
promises.push(
processor(browser, TRIPADVISOR_BASE_ACITVITY, TRIPADVISOR_USER_REVIEW_BASE)
);
const reviews = await Promise.all(promises);
await browser.close();
console.log(`Retrieved ${reviews[0].length} reviews`);
console.log(`Retrieved ${(reviews[0] as TripadvisorRating).reviews.length} reviews`);
resolve(reviews[0]);
});
});
};

export {tripHarvest}
export type {TripadvisorReview}
export { tripReviewHarvest };
export type { TripadvisorReview };

0 comments on commit b609386

Please sign in to comment.