Skip to content
This repository was archived by the owner on Dec 22, 2023. It is now read-only.

Playstore Reviews Scrapper #238

Merged
merged 9 commits into from
Oct 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Scripts/Web_Scrappers/PlayStoreReviewScrapper/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bin/
__pycache__/
lib/
lib64
pyvenv.cfg
20 changes: 20 additions & 0 deletions Scripts/Web_Scrappers/PlayStoreReviewScrapper/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Playstore Review Scrapper

* Automated Play Store review scraper using Python 3 and Selenium. It saves the reviews to a CSV file for further data exploration!
* Provide application ids in a file, one per line



# Requirements

- sudo apt-get install chromium-chromedriver
- Modules specified in requirements.txt



# Usage

* python main.py

* When prompted, provide the name of the file containing the application ids whose reviews should be gathered

1 change: 1 addition & 0 deletions Scripts/Web_Scrappers/PlayStoreReviewScrapper/appids.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
com.whatsapp
201 changes: 201 additions & 0 deletions Scripts/Web_Scrappers/PlayStoreReviewScrapper/com.whatsapp .csv

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions Scripts/Web_Scrappers/PlayStoreReviewScrapper/extracter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os
import errno
import csv


def writecsv(filename, dict_data):
    """Append one review row to ``<filename>.csv``.

    Creates the file and writes the header row on first use; subsequent
    calls only append the data row.

    Args:
        filename: Base name of the output file (".csv" is appended).
        dict_data: Mapping with the keys 'Sno', 'User', 'Rating', 'Review'.
    """
    csv_columns = ['Sno', 'User', 'Rating', 'Review']
    csv_file = filename + ".csv"

    # Only write the header when the file is being created.
    file_exists = os.path.isfile(csv_file)

    try:
        # newline='' stops the csv module from emitting blank lines between
        # rows on Windows; utf-8 keeps non-ASCII review text intact.
        with open(csv_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            if not file_exists:
                writer.writeheader()
            writer.writerow(dict_data)
    except IOError as ioex:
        print("I/O error")
        print("Error occured while extracting Reviews")
        print('errno:', ioex.errno)
        # ioex.errno can be None, and not every value has a symbolic name;
        # use .get() so reporting the error cannot itself raise.
        if ioex.errno is not None:
            print('err code:', errno.errorcode.get(ioex.errno, 'unknown'))
113 changes: 113 additions & 0 deletions Scripts/Web_Scrappers/PlayStoreReviewScrapper/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/python3

import extracter
import time
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


def getPlaystoreReviews(app_id):
    """Scrape visible user reviews for *app_id* from the Google Play Store.

    Opens the app's "all reviews" page in Chrome, scrolls repeatedly
    (clicking the "Show More" button whenever it appears) so reviews are
    lazily loaded, parses the final page source with BeautifulSoup, and
    appends each review (serial number, user, star rating, text) to
    ``<app_id>.csv`` via extracter.writecsv.

    Args:
        app_id: Play Store application id, e.g. "com.whatsapp".
    """
    # Tolerate ids read from a file with a trailing newline or spaces,
    # which would otherwise corrupt the URL and the CSV filename.
    app_id = app_id.strip()

    # URL of the Play Store page with the review list expanded.
    url = "https://play.google.com/store/apps/details?id=" + app_id + "&showAllReviews=true"

    browser = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
    try:
        browser.get(url)
        time.sleep(1)

        # The <body> element receives the PAGE_DOWN / PAGE_UP keystrokes.
        elem = browser.find_element_by_tag_name("body")

        no_of_pagedowns = 400

        # Known XPaths of the "Show More" button; the generated page
        # structure varies, so each candidate is tried in turn.
        path1 = '//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span'
        path2 = '/html/body/div[1]/div[4]/c-wiz[2]/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span'
        path3 = '/html/body/div[1]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span'

        # Scroll down repeatedly so more reviews are lazily loaded.
        while no_of_pagedowns:
            elem.send_keys(Keys.PAGE_DOWN)

            # Every 12 scrolls, try to click the "Show More" button.
            if (no_of_pagedowns - 1) % 12 == 0:
                for path in [path1, path2, path3]:
                    try:
                        browser.find_elements_by_xpath(path)[0].click()
                    except IndexError:
                        # Button not found under this XPath; nudge the
                        # viewport and try the next candidate.
                        elem.send_keys(Keys.PAGE_UP)

            # Every 25 scrolls, scroll up first, then try to click.
            if (no_of_pagedowns - 1) % 25 == 0:
                for path in [path1, path2, path3]:
                    try:
                        elem.send_keys(Keys.PAGE_UP)
                        browser.find_elements_by_xpath(path)[0].click()
                    except IndexError:
                        pass

            no_of_pagedowns -= 1

        # Page fully scrolled: grab the source and parse it.
        print("Gathering source information")
        source_data = browser.page_source
        time.sleep(1)
        print("Parsing source data")
        soup = BeautifulSoup(source_data, 'html.parser')
        time.sleep(1)

        print("Getting source information..")
        time.sleep(1)

        # Main container of the review list.
        review_divs = soup.find("div", {"jsname": "fk8dgd"})
        print("Gathering reviews information")
        time.sleep(1)
        # Individual review elements inside the container.
        reviews = review_divs.findAll("div", {"jscontroller": "H6eOGe"})

        print("Gathering Reviews")
        print("\t=============Reviews=============")
        review_count = 0
        # Extract user, rating and text from every review element.
        for div in reviews:
            review_count += 1
            user = div.find("span", {"class": "X43Kjb"})
            user = user.text.encode('unicode-escape').decode('utf-8')
            # Fixed: the attrs argument must be a dict; the original passed
            # the set {"class", "pf5lIe"}, which BeautifulSoup cannot match.
            rating = div.find("div", {"class": "pf5lIe"})
            rating = rating.find("div", {"aria-label": re.compile('Rated')})
            rating = str(rating.get('aria-label'))
            # aria-label reads "Rated N stars ..."; index 6 is the digit N.
            rating = rating[6]
            review = div.find("span", {"jsname": "fbQN7e"})
            review = review.text.encode('unicode-escape').decode('utf-8')
            if review == "":
                # Truncated reviews render in a different span; fall back
                # to the full-text element.
                review = div.find("span", {"jsname": "bN97Pc"})
                review = review.text
            content = {'Sno': review_count, 'User': user, "Rating": rating, "Review": review}

            print("{} {}".format(review_count, review[0:150]))
            extracter.writecsv(app_id, content)
    finally:
        # quit() (unlike close()) also terminates the chromedriver process,
        # so the browser is not leaked even if scraping raises.
        browser.quit()


def main():
    """Prompt for a file of Play Store application ids and scrape each one.

    The file is expected to contain one application id per line
    (e.g. ``com.whatsapp``); blank lines are skipped.
    """
    infile = input("Enter file name: ")
    # For each application id, gather that application's reviews.
    with open(infile, 'r') as file:
        for line in file:
            # Fixed: readline()-style lines keep their trailing newline,
            # which previously leaked into the Play Store URL and the CSV
            # filename; strip it and skip empty lines.
            application_id = line.strip()
            if application_id:
                getPlaystoreReviews(application_id)


if __name__ == '__main__':
    main()

13 changes: 13 additions & 0 deletions Scripts/Web_Scrappers/PlayStoreReviewScrapper/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
beautifulsoup4==4.8.2
certifi==2019.11.28
chardet==3.0.4
idna==2.8
numpy==1.18.1
pandas==1.0.0
python-dateutil==2.8.1
pytz==2019.3
requests==2.22.0
selenium==3.141.0
six==1.14.0
soupsieve==1.9.5
urllib3==1.25.8
1 change: 0 additions & 1 deletion Scripts/Web_Scrappers/README.md
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@