-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscraper.py
93 lines (81 loc) · 3.11 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import time
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
def save(rate, review):
    """Append one or many (rate, review) pairs to ``reviews.csv``.

    The CSV in the current working directory is read, the new entries are
    appended after the rows already stored, and the whole file is rewritten
    with the two columns ``rate`` and ``review`` (no index column).

    Args:
        rate: A single rating, or a list of ratings.
        review: A single review text, or a list of review texts. It must be
            a list exactly when ``rate`` is a list.
    """
    try:
        df = pd.read_csv('reviews.csv')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First run (no file yet) or a zero-byte file: start from scratch
        # instead of failing before anything can ever be written.
        rates, reviews = [], []
    else:
        rates, reviews = df.rate.values.tolist(), df.review.values.tolist()

    if isinstance(rate, list):
        rates.extend(rate)
        reviews.extend(review)
    else:
        rates.append(rate)
        reviews.append(review)

    pd.DataFrame({'rate': rates, 'review': reviews}).to_csv('reviews.csv', index=False)
def extract(driver, loading_time):
    """Scroll the review dialog and collect ratings and review texts.

    The element matching ``//div[@class="review-dialog-list"]`` is scrolled
    to its bottom 20 times so that more reviews get loaded, then every
    review container currently in the page is parsed.

    Args:
        driver: Selenium WebDriver; the page is expected to already show the
            review dialog, otherwise the first lookup raises.
        loading_time: Seconds to sleep after each scroll step.

    Returns:
        A tuple ``(rates, reviews)`` of two parallel lists: the ratings
        rounded to integers and the matching review texts. Reviews whose
        rounded rating equals 3 are left out.
    """
    # Function-scope import so this block does not depend on edits elsewhere.
    from selenium.common.exceptions import WebDriverException

    label_prefix = 'Diberi nilai'

    dialog_box = driver.find_element_by_xpath('//div[@class="review-dialog-list"]')
    for _ in range(20):
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', dialog_box)
        time.sleep(loading_time)

    user_review_containers = driver.find_elements_by_xpath(
        '//div[@class="WMbnJf gws-localreviews__google-review"]')
    rates, reviews = [], []
    for container in user_review_containers:
        # The leading './/' keeps the search inside this container. A bare
        # '//' is evaluated against the whole document and would return the
        # first rating element on the page for every single review.
        rate_score = container.find_element_by_xpath('.//div[@class="PuaHbe"]').find_element_by_tag_name(
            'span').get_attribute('aria-label')
        # The (up to) four characters after the prefix hold the score with a
        # decimal comma, e.g. "4,0" -- presumably; verify against live markup.
        rating = rate_score[len(label_prefix):len(label_prefix) + 4].strip()
        rating = round(float(rating.replace(',', '.')))
        print()
        print(rate_score)
        print(rating)
        try:
            # Expand the review through its 'Lengkapnya' link and read the
            # full text.
            container.find_element_by_partial_link_text('Lengkapnya').click()
            review = container.find_element_by_class_name('review-full-text').text
        except WebDriverException:
            # No expand link, or the expanded text could not be read: fall
            # back to the text element that is shown by default.
            review = container.find_element_by_class_name('Jtu6Td').text
        print(review)
        if rating != 3:
            rates.append(rating)
            reviews.append(review)
    return rates, reviews
def _save_sorted_reviews(driver, menu_item, loading_time):
    """Pick entry number *menu_item* of the sort menu, then extract and save."""
    driver.find_element_by_class_name('S7TGef').click()
    time.sleep(1)  # give the sort menu a moment to open
    driver.find_element_by_xpath(
        '//*[@id="lb"]/div/g-menu/g-menu-item[{}]/div'.format(menu_item)).click()
    time.sleep(loading_time)
    rates, reviews = extract(driver, loading_time)
    save(rates, reviews)


def scrap(keyword):
    """Search Google for *keyword* and store the reviews that are found.

    A Chrome session is started, ``google.co.id`` is searched for the
    keyword, the "Lihat semua ulasan Google" link is opened, and the reviews
    are extracted and saved twice: once sorted lower to higher and once
    sorted higher to lower.

    Collecting reviews is best-effort: any error after the search page has
    loaded (for instance a missing reviews link) is reported on stdout and
    the keyword is skipped. Errors while starting Chrome or loading the
    search page still propagate. The browser session is always shut down.

    Args:
        keyword: Free-text search phrase; it is URL-encoded before use.
    """
    loading_time = 3
    # Encode the phrase so spaces, '&', '#', ... cannot corrupt the query.
    url = 'https://google.co.id/search?q=' + quote_plus(keyword)
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(loading_time)
        try:
            driver.find_element_by_link_text('Lihat semua ulasan Google').click()
            time.sleep(loading_time)
            _save_sorted_reviews(driver, 4, loading_time)  # lower to higher
            _save_sorted_reviews(driver, 3, loading_time)  # higher to lower
        except Exception as exc:
            # Deliberately broad: one failing keyword must not stop the run,
            # but say so instead of dropping the error silently.
            print('Skipping {!r}: {}'.format(keyword, exc))
    finally:
        # quit() ends the whole session (browser and driver process);
        # close() would only close the current window.
        driver.quit()
# Disabled (commented-out) keywords:
#   'bakso boedjangan cibubur'
#   'saung mang engking cibubur'
#   'Gubug Udang Situ Cibubur'
#   'Pondok kemangi cibubur'

# Each keyword below is scraped in order when this module is executed.
for keyword in (
    'Fat bubble cibubur',
    'keibar cibubur',
    'upnormal cibubur',
    'coffee toffee cibubur',
    'coffee toffee kota wisata',
    'honeycomb kota wisata',
    'fly the wind kota wisata',
):
    scrap(keyword)