Skip to content

Commit 4ae5b9e

Browse files
committed
feat: add extra arguments when initializing the Selenium webdriver. This is required to run it inside a Docker container.
1 parent 6650e02 commit 4ae5b9e

File tree

1 file changed

+19
-3
lines changed

1 file changed

+19
-3
lines changed

amazon_book_scraper/automated_book_scraper.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from abc import ABC, abstractmethod
33
import time
44
from selenium import webdriver
5+
from selenium.webdriver.chrome.options import Options as ChromeOptions
6+
from selenium.webdriver.firefox.options import Options as FirefoxOptions
57
from entities import Book
68
from book_attribute_scraper import BookAttributeScraper
79
from book_review_scraper import AutomatedBookReviewScraper
@@ -21,7 +23,8 @@ def __init__(
2123
automated_book_review_scraper: AutomatedBookReviewScraper,
2224
raw_data_storage: RawDataStorage,
2325
rds_data_storage: RDSDataStorage = None,
24-
browser: str = 'chrome') -> None:
26+
browser: str = 'chrome',
27+
mode: str = 'normal') -> None:
2528
"""
2629
Args:
2730
url (str): starting url for the book scraper
@@ -31,6 +34,7 @@ def __init__(
3134
raw_data_storage (RawDataStorage): object for saving raw data
3235
rds_data_storage (RDSDataStorage, optional): RDS interface object
3336
browser (str, optional): select the browser.
37+
mode (str, optional): normal or headless mode
3438
"""
3539
if not isinstance(book_attribute_scraper, BookAttributeScraper):
3640
raise TypeError('Invalid type')
@@ -49,14 +53,26 @@ def __init__(
4953
# init Selenium
5054
try:
5155
if browser == 'chrome':
56+
chrome_options = ChromeOptions()
57+
if mode == 'headless':
58+
chrome_options.add_argument("--headless")
59+
chrome_options.add_argument('--no-sandbox')
60+
chrome_options.add_argument('--disable-dev-shm-usage')
61+
self._driver = webdriver.Chrome(options=chrome_options)
5262
self._driver.implicitly_wait(10)
5363
elif browser == 'firefox':
64+
firfox_options = FirefoxOptions()
65+
if mode == 'headless':
66+
firfox_options.add_argument("--headless")
67+
firfox_options.add_argument('--no-sandbox')
68+
firfox_options.add_argument('--disable-dev-shm-usage')
69+
self._driver = webdriver.Firefox(options=firfox_options)
5470
self._driver.implicitly_wait(10)
5571
else:
5672
raise NotImplementedError(
5773
'Only Chrome and Firefox are supported.')
58-
except:
59-
print('Selenium driver error')
74+
except Exception as e:
75+
print('Selenium driver error: ', e)
6076

6177
# get to the url
6278
try:

0 commit comments

Comments
 (0)