all_categories_crawler.py
"""
-----------------------------------------------------------------------------------------------------------------------
Example use of crawler code to scrape all categories from an arbitrary website that uses dynamic dropdown listboxes
to display the categories and contains a redirect "Go" button that only redirects to the selected category if clicked.
This particular website contains a primary dropdown listbox for the categories and subcategories and a secondary
dropdown listbox for regions.
Remember to download and upgrade all necessary requirements.
Edit code to fit your own website. Remember to replace the base_url with your own website URL and all the xpaths
with the relevant xpaths of your desired elements to be crawled.
Author: Catherine Di, catherine_di_2004@outlook.com
-----------------------------------------------------------------------------------------------------------------------
"""
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException,\
NoSuchElementException, NoSuchAttributeException, TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time


def init_browser():
print("Loading Chrome ...")
service = webdriver.ChromeService(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
# options.add_argument("--headless") # test first -> then uncomment to run in headless mode
options.add_argument("--disable-extensions")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-images") # disable images to speed up
options.add_experimental_option("excludeSwitches", ["enable-logging"]) # suppress certain ChromeDriver log messages
options.add_argument("--ignore-certificate-errors") # ignore certificate errors
options.add_argument("--disable-notifications")
options.add_argument("--log-level=3")
return webdriver.Chrome(service=service, options=options)


def wait_clickable(driver, xpath, timeout=10, click=False):
    # wait until the element located by xpath is clickable and optionally click it
ec = EC.element_to_be_clickable((By.XPATH, xpath))
try: # wait for the element to be clickable
element = WebDriverWait(driver, timeout=timeout).until(ec)
except TimeoutException:
return None
if not click:
return element
    # use JavaScript to perform the click;
    # this works even when the element is not visible,
    # whereas a plain element.click() fails if the element is not visible
driver.execute_script("arguments[0].click();", element)
return element
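# Example usage (the xpath is hypothetical, not taken from any real site):
#   go_button = wait_clickable(driver, '//button[@id="go"]', click=True)
#   if go_button is None:
#       print("the element never became clickable within the timeout")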


def wait_all_elements(driver, xpath, timeout=10):
# wait for all elements under the parent xpath to be present after reload
ec = EC.presence_of_all_elements_located((By.XPATH, xpath)) # wait for all elements to be present
try:
WebDriverWait(driver, timeout=timeout).until(ec)
except TimeoutException:
return False
return True


def page_load_complete(driver, timeout=10):
    # test whether the page is fully loaded;
    # skip this check if you don't need the entire page to be loaded
try:
WebDriverWait(driver, timeout=timeout).until(
# wait for the page to be completely loaded
lambda driver: driver.execute_script("return document.readyState") == "complete")
except TimeoutException:
return False
return True


def get_text_list(driver, xpathx, attribute=None):
    # get the text of all list items located by the xpath template xpathx,
    # optionally together with the value of a single attribute;
    # each item of the returned list then has the format element_text:attribute_value
text_list = []
i = 1
    while True:  # loop until the element lookup fails
        xp = xpathx.replace("{#}", f"{i}")  # build the xpath of the i-th list item from the template
try:
element = driver.find_element(By.XPATH, xp)
except NoSuchElementException: # no more elements; exit the loop
break
try:
text = element.text
except StaleElementReferenceException:
# re-locate the element
element = wait_clickable(driver, xp) # wait for the element to be clickable
if element is None:
break
text = element.text
if attribute is not None:
av = ""
try:
                av = element.get_attribute(attribute) or ""  # map a missing attribute (None) to "" so the concatenation below can't fail
except NoSuchAttributeException:
pass
text += ":" + av
text_list.append(text)
i += 1
#
return text_list
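# Example usage (hypothetical xpath template; "{#}" is replaced by 1, 2, 3, ... in turn):
#   links = get_text_list(driver, '//ul[@id="menu"]/li[{#}]/a', attribute="href")
#   # -> ["Home:https://...", "About:https://...", ...]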


def scrape_all_categories():
base_url = "https://some_website.com" # start url of the website you want to crawl -> replace with your own url
driver = init_browser() # start the chrome browser
driver.get(base_url) # navigate to the base_url
page_load_complete(driver) # wait for the page to be fully loaded
# dismiss the cookie inquiry dialog box immediately
reject_cookie_xpath = '//*[@id="onetrust-reject-all-handler"]'
ec = EC.element_to_be_clickable((By.XPATH, reject_cookie_xpath))
try:
reject_button = WebDriverWait(driver, 10).until(ec)
reject_button.click()
except TimeoutException:
print("Warning: cannot find the Cookie Inquiry Dialog Box!")
# the region dropdown listbox (secondary listbox)
rbox_xpath = '//*[@id="app"]/div/main/div/div/section[1]/div/div/div[1]/div/div[2]/div/button'
# the category dropdown listbox (primary listbox)
cbox_xpath = '//*[@id="app"]/div/main/div/div/section[1]/div/div/div[1]/div/div[1]/div/button'
    # the redirect button which triggers the reload of the domain table - e.g. a "Go" button next to the dropdown list
# delete related code if your website does not have a redirect button
redirect_xpath = '//*[@id="app"]/div/main/div/div/section[1]/div/div/div[1]/button'
# the domain table
dbox_xpath = '//*[@id="app"]/div/main/div/div/section[2]/div/div/div[1]/table/tbody'
if wait_clickable(driver, rbox_xpath, click=True) is None:
print("# Error: can not find the Region Listbox!")
return
# the template xpath of for the secondary list items
ritem_xpathx = '//*[@id="app"]/div/main/div/div/section[1]/div/div/div[1]/div/div[2]/div/div/button[{#}]'
region_list = get_text_list(driver, ritem_xpathx)
region_list[0] = "" # region string for the first item
print(f"Find total {len(region_list)} regions")
print(region_list)
if wait_clickable(driver, cbox_xpath, click=True) is None:
print("# Error: can not find the Category Listbox!")
return
# the template xpath of the primary list items
citem_xpathx = '//*[@id="app"]/div/main/div/div/section[1]/div/div/div[1]/div/div[1]/div/div/button[{#}]'
# get text and attribute("class") value of each item of the listbox
ta_list = get_text_list(driver, citem_xpathx, attribute="class")
# extract the category and subcategory (if any) strings from the list items
category_list = []
cat_name = None # the category name, initially None
for ta in ta_list:
        text, av = ta.rsplit(":", 1)  # split off the attribute value that was appended last (the text itself may contain ":")
        if "subcategory" in av:  # the class attribute marks subcategory items
category = cat_name + "/" + text.lower() # text contains the subcategory
else:
if cat_name is None:
cat_name = "" # category string for the first item
else:
cat_name = text.lower() # text contains the category
category = cat_name
category = category.replace(" - other", "").replace("&", "and").replace(" ", "-")
        category_list.append(category)  # category = the final, normalized category string
#
print(f"Find total {len(category_list)} categories")
print(category_list)
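    # Worked example of the normalization above (the dropdown entries are hypothetical):
    #   category "Arts & Entertainment"      -> "arts-and-entertainment"
    #   subcategory "Books - Other" under it -> "arts-and-entertainment/books"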
    # first test that the redirect button exists
# remove if no redirect button on your website
if wait_clickable(driver, redirect_xpath) is None:
print("# Error: can not find the redirect button!")
return
all_data = {} # the scraped data goes here {region : {category : domain_list}}
# for each region and category combination, scrape the dynamically loaded domain list:
# delete the outer loop if your website does not have a secondary dropdown list
    for i, region in enumerate(region_list[:10], start=1):  # limit to the first 10 regions for testing -> remove the [:10] to scrape all regions
region_data = all_data.setdefault(region, {})
# click the region dropdown listbox
if wait_clickable(driver, rbox_xpath, click=True) is None:
print("# Error: can not find the Region Listbox!")
continue
ritem_xpath = f'//*[@id="app"]/div/main/div/div/section[1]/div/div/div[1]/div/div[2]/div/div/button[{i}]'
# wait for the dynamically loaded list item to be clickable, then click it to make the selection
if wait_clickable(driver, ritem_xpath, click=True) is None:
continue
for j, cat in enumerate(category_list[:10], start=1): # limit to the first 10 categories for testing -> remove the [:10] to scrape all categories
domain_list = region_data.setdefault(cat, [])
print(f"\nScraping region[{i}] = {region}, cat[{j}] = {cat} ...") # print the region and category
# wait for the category dropdown listbox to be clickable; then click
if wait_clickable(driver, cbox_xpath, click=True) is None:
print("# Error: can not find the Category Listbox!")
continue
citem_xpath = f'//*[@id="app"]/div/main/div/div/section[1]/div/div/div[1]/div/div[1]/div/div/button[{j}]'
# wait for the specific category or subcategory to be clickable; then click
if wait_clickable(driver, citem_xpath, click=True) is None:
continue
# wait for the redirect button to be clickable; then click
# remove code if no redirect button on your website
if wait_clickable(driver, redirect_xpath, click=True) is None:
print("# Error: could not find the redirect button!")
continue
            # check that the redirect finished in time by waiting for the button to become clickable again
            if wait_clickable(driver, redirect_xpath) is None:
                print("# Error: the page took too long to refresh!")
continue
            # check that the domain table has loaded on the new page
            if not wait_all_elements(driver, dbox_xpath):
                print("# Error: the domain table took too long to load!")
continue
time.sleep(2.0) # wait for the table to be refreshed
print(driver.current_url)
# the template xpath of the list items
ditem_xpathx = '//*[@id="app"]/div/main/div/div/section[2]/div/div/div[1]/table/tbody/tr[{#}]/td[2]/a/span[2]'
domain_list += get_text_list(driver, ditem_xpathx)
print(domain_list)
        driver.delete_all_cookies()  # delete cookies to work around the site's anti-scraping measures - important!
driver.quit() # close the browser
return all_data
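# Shape of the returned data (all values below are hypothetical):
#   {"": {"": ["example.com", ...], "arts-and-entertainment/books": [...]},
#    "europe": {...},
#    ...}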
if __name__ == "__main__":
all_data = scrape_all_categories()
    # write the scraped data to a plain text file (an optional Excel export sketch follows below)
    fname = "YOUR_FILE_NAME.txt"
with open(fname, mode="w", encoding="utf-8") as f:
print(all_data, file=f)
print("Done scraping!")