Skip to content
This repository was archived by the owner on Dec 22, 2023. It is now read-only.

Commit 52dd1cc

Browse files
authored
Merge pull request #238 from swaroopmaddu/master
Playstore Reviews Scrapper
2 parents 0db85a8 + ab499ac commit 52dd1cc

File tree

8 files changed

+376
-1
lines changed

8 files changed

+376
-1
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
bin/
2+
__pycache__/
3+
lib/
4+
lib64
5+
pyvenv.cfg
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Playstore Review Scrapper
2+
3+
* Automated Playstore reviews scraper using Python 3 and Selenium. It saves the reviews to a CSV file for further data exploration!
4+
* Provide the application ids in a text file, one id per line
5+
6+
7+
8+
# Requirements
9+
10+
- sudo apt-get install chromium-chromedriver
11+
- Modules specified in requirements.txt
12+
13+
14+
15+
# Usage
16+
17+
* python main.py
18+
19+
* Provide filename for gathering reviews from playstore
20+
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
com.whatsapp

Scripts/Web_Scrappers/PlayStoreReviewScrapper/com.whatsapp .csv

Lines changed: 201 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import os
2+
import errno
3+
import csv
4+
5+
6+
# Write the reviews result to csv_file
def writecsv(filename, dict_data):
    """Append one review row to ``<filename>.csv``.

    The header row is written only when the target file is missing or
    empty, so repeated calls build up a single table.

    Args:
        filename: Output path without the ``.csv`` extension.
        dict_data: Mapping with the keys ``Sno``, ``User``, ``Rating`` and
            ``Review``.

    I/O failures are reported on stdout and are not re-raised.
    """
    csv_columns = ['Sno', 'User', 'Rating', 'Review']
    csv_file = filename + ".csv"

    # An existing but empty file still needs its header row.
    needs_header = (
        not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0
    )

    try:
        # newline='' lets the csv module control line endings itself
        # (otherwise blank rows appear on Windows); an explicit encoding
        # keeps non-ASCII review text independent of the locale.
        with open(csv_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            if needs_header:
                writer.writeheader()
            writer.writerow(dict_data)
    except IOError as ioex:
        print("I/O error")
        print("Error occured while extracting Reviews")
        print('errno:', ioex.errno)
        # errno can be None or unknown to the table; .get() avoids raising
        # a KeyError from inside the error handler.
        print('err code:', errno.errorcode.get(ioex.errno))
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
#!/usr/bin/python3
2+
3+
import extracter
4+
import time
5+
import re
6+
from bs4 import BeautifulSoup
7+
from selenium import webdriver
8+
from selenium.webdriver.common.keys import Keys
9+
10+
11+
def getPlaystoreReviews(app_id):
    """Scrape the Play Store reviews of one app and store them as CSV rows.

    Opens the app's review page in Chrome through Selenium, scrolls the
    page with PAGE_DOWN key presses, parses the resulting page source with
    BeautifulSoup and passes every review to ``extracter.writecsv``, using
    the application id as the output file name.

    Args:
        app_id: Play Store application id, e.g. ``com.whatsapp``.
            Surrounding whitespace (such as the newline left by
            ``readline``) is ignored.
    """
    # Ids read from a file keep their trailing newline; left in place it
    # would end up in both the URL and the output file name.
    app_id = app_id.strip()

    # url of the playstore with application_id
    url = "https://play.google.com/store/apps/details?id=" + app_id + "&showAllReviews=true"

    browser = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
    try:
        browser.get(url)
        time.sleep(1)

        # get body content to click buttons
        elem = browser.find_element_by_tag_name("body")

        no_of_pagedowns = 400

        # Candidate XPaths of the element that is clicked while scrolling
        # (presumably the "Show more" button -- verify against the page).
        path1 = '//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span'
        path2 = '/html/body/div[1]/div[4]/c-wiz[2]/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span'
        path3 = '/html/body/div[1]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span'
        paths = (path1, path2, path3)

        # Scroll web page to get data
        while no_of_pagedowns:
            elem.send_keys(Keys.PAGE_DOWN)

            if (no_of_pagedowns - 1) % 12 == 0:
                for path in paths:
                    try:
                        python_button = browser.find_elements_by_xpath(path)[0]
                        python_button.click()
                    except IndexError:
                        # Nothing matched this XPath: step back up a page.
                        elem.send_keys(Keys.PAGE_UP)

            if (no_of_pagedowns - 1) % 25 == 0:
                for path in paths:
                    try:
                        elem.send_keys(Keys.PAGE_UP)
                        python_button = browser.find_elements_by_xpath(path)[0]
                        python_button.click()
                    except IndexError:
                        pass

            no_of_pagedowns -= 1

        # Now that the page is fully scrolled, grab the source code.
        print("Gathering source information")
        source_data = browser.page_source
    finally:
        # quit() ends the whole driver session even when scraping failed;
        # the page source is already captured, so parsing needs no browser.
        browser.quit()

    print("Parsing source data")
    soup = BeautifulSoup(source_data, 'html.parser')

    print("Getting source information..")

    # Review main div
    review_divs = soup.find("div", {"jsname": "fk8dgd"})
    if review_divs is None:
        # Without the container there is nothing to iterate over; report it
        # instead of failing with an AttributeError on None.
        print("No reviews found for {}".format(app_id))
        return

    print("Gathering reviews information")
    # Find each review div elements
    reviews = review_divs.findAll("div", {"jscontroller": "H6eOGe"})

    print("Gathering Reviews")
    print("\t=============Reviews=============")
    # Iterate through each review and get all fields of the comment
    for review_count, div in enumerate(reviews, start=1):
        user = div.find("span", {"class": "X43Kjb"})
        user = user.text.encode('unicode-escape').decode('utf-8')

        # attrs must be a dict; the original passed a set literal here.
        rating = div.find("div", {"class": "pf5lIe"})
        rating = rating.find("div", {"aria-label": re.compile('Rated')})
        rating = str(rating.get('aria-label'))
        # The label starts with "Rated "; index 6 is the character after it.
        rating = rating[6]

        review = div.find("span", {"jsname": "fbQN7e"})
        review = review.text.encode('unicode-escape').decode('utf-8')
        if review == "":
            # Fall back to the other review span when the first one is empty.
            review = div.find("span", {"jsname": "bN97Pc"})
            review = review.text

        content = {'Sno': review_count, 'User': user, "Rating": rating, "Review": review}

        print("{} {}".format(review_count, review[0:150]))
        extracter.writecsv(app_id, content)
97+
98+
99+
def main():
    """Prompt for a file of application ids and scrape reviews for each one.

    The file is read line by line; every non-blank line is treated as one
    Play Store application id and passed to ``getPlaystoreReviews``.
    """
    infile = input("Enter file name: ")
    # for each application id get the application reviews
    with open(infile, 'r') as file:
        for line in file:
            # Lines keep their trailing newline; strip it so it does not
            # leak into the URL and the CSV file name.
            application_id = line.strip()
            if not application_id:
                # Skip blank lines instead of scraping an empty id.
                continue
            getPlaystoreReviews(application_id)


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
113+
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
beautifulsoup4==4.8.2
2+
certifi==2019.11.28
3+
chardet==3.0.4
4+
idna==2.8
5+
numpy==1.18.1
6+
pandas==1.0.0
7+
python-dateutil==2.8.1
8+
pytz==2019.3
9+
requests==2.22.0
10+
selenium==3.141.0
11+
six==1.14.0
12+
soupsieve==1.9.5
13+
urllib3==1.25.8

Scripts/Web_Scrappers/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-

0 commit comments

Comments
 (0)