Skip to content

Commit 4318e0d

Browse files
committed
chore: renames the entry point module from __main__.py to main.py.
1 parent 4244d95 commit 4318e0d

File tree

1 file changed

+58
-0
lines changed

1 file changed

+58
-0
lines changed

amazon_book_scraper/main.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import sys
2+
from os import getcwd
3+
from local_raw_data_storage import LocalRawDataStorage
4+
from s3_raw_data_storage import S3RawDataStorage
5+
from aws_postgres_data_storage import AWSPostgresRDSDataStorage
6+
from amazon_book_attribute_scraper import AmazonBookAttributeScraper
7+
from amazon_automated_book_review_scraper import AmazonAutomatedBookReviewScraper
8+
from amazon_automated_book_scraper import AmazonAutomatedBookScraper
9+
10+
# Amazon search-results URL handed to the automated book scraper below.
url = "https://www.amazon.com/s?i=stripbooks&rh=n%3A25&fs=true&qid=1645782603&ref=sr_pg_1"

# Read the run size from the command line before anything else so that a
# missing or malformed argument stops the script with a usage message
# instead of a traceback, and before any scraper/storage object is built.
# To hard-code the values instead, replace this section with e.g.
#   num_books = 3
#   num_reviews = 5
if len(sys.argv) < 3:
    sys.exit(f"usage: {sys.argv[0]} NUM_BOOKS NUM_REVIEWS")
try:
    num_books = int(sys.argv[1])
    num_reviews = int(sys.argv[2])
except ValueError:
    sys.exit(
        f"NUM_BOOKS and NUM_REVIEWS must be integers, "
        f"got {sys.argv[1]!r} and {sys.argv[2]!r}"
    )

# specify a list of banned title phrases that are likely to be of
# a special format
banned_titles = [
    'Dungeons and Dragons',
    'Dungeons & Dragons',
    "Player's Handbook",
    "User's Manual",  # was misspelled "Users's Manual"
]

# object that scrapes attributes of single book
abas = AmazonBookAttributeScraper(banned_titles=banned_titles)

# object that scrapes reviews of a single book
aabrs = AmazonAutomatedBookReviewScraper()

# choose and initialize raw storage object: local or S3 bucket
# local alternative:
#   raw_storage = LocalRawDataStorage(path=getcwd())
raw_storage = S3RawDataStorage(path=None, bucket='aicore-web-scraping')

# params for the AWS Postgres RDS
# NOTE(review): the database credentials are committed in plain text here;
# they should be moved out of source control (environment variable or a
# secrets store) and the exposed password rotated.
rds_param = {
    'DATABASE_TYPE': 'postgresql',
    'DBAPI': 'psycopg2',
    'ENDPOINT': "aicore-webscraping-db.cwckuebjlobx.us-east-1.rds.amazonaws.com",
    'USER': 'postgres',
    'PASSWORD': 'aicore2022',
    'PORT': 5432,
    'DATABASE': 'postgres',
}

# rds storage is optional; to run without it use
#   rds_storage = None
rds_storage = AWSPostgresRDSDataStorage(rds_param)

# initialize the scraper object
# other settings left by the author: browser='firefox', mode='normal'
aabs = AmazonAutomatedBookScraper(
    url=url,
    book_attribute_scraper=abas,
    automated_book_review_scraper=aabrs,
    raw_data_storage=raw_storage,
    rds_data_storage=rds_storage,
    browser='chrome',
    mode='headless',
)

# run the scraper
aabs.scrape_books(num_books=num_books, num_reviews=num_reviews)

0 commit comments

Comments
 (0)