|
"""Command-line entry point that runs the automated Amazon book scraper.

Usage:
    python <this_script> NUM_BOOKS NUM_REVIEWS

The script wires together a book-attribute scraper, a book-review scraper,
a raw-data storage backend (S3 by default) and an optional AWS Postgres RDS
storage, then calls ``scrape_books`` with the two integer arguments.
"""
import os
import sys
from os import getcwd

from local_raw_data_storage import LocalRawDataStorage
from s3_raw_data_storage import S3RawDataStorage
from aws_postgres_data_storage import AWSPostgresRDSDataStorage
from amazon_book_attribute_scraper import AmazonBookAttributeScraper
from amazon_automated_book_review_scraper import AmazonAutomatedBookReviewScraper
from amazon_automated_book_scraper import AmazonAutomatedBookScraper

# Validate the command line first, so that a bad invocation fails fast with
# a usage message instead of raising IndexError/ValueError after the storage
# backends and the browser-driven scraper have already been created.
if len(sys.argv) < 3:
    sys.exit("usage: {} NUM_BOOKS NUM_REVIEWS".format(sys.argv[0]))
try:
    num_books = int(sys.argv[1])
    num_reviews = int(sys.argv[2])
except ValueError:
    sys.exit(
        "NUM_BOOKS and NUM_REVIEWS must be integers, got {!r} and {!r}".format(
            sys.argv[1], sys.argv[2]
        )
    )
# For a quick manual run, hard-code instead: num_books = 3, num_reviews = 5

# Amazon search-results page from which the scraper starts.
url = "https://www.amazon.com/s?i=stripbooks&rh=n%3A25&fs=true&qid=1645782603&ref=sr_pg_1"

# Title phrases to ban: books matching these are likely to be of a special
# format.
# NOTE(review): "Users's Manual" looks like a typo for "User's Manual" --
# confirm how the attribute scraper matches titles before changing it.
banned_titles = [
    'Dungeons and Dragons',
    'Dungeons & Dragons',
    "Player's Handbook",
    "Users's Manual",
]

# Object that scrapes the attributes of a single book.
abas = AmazonBookAttributeScraper(banned_titles=banned_titles)

# Object that scrapes the reviews of a single book.
aabrs = AmazonAutomatedBookReviewScraper()

# Raw storage backend: local directory or S3 bucket.
# Local alternative: raw_storage = LocalRawDataStorage(path=getcwd())
raw_storage = S3RawDataStorage(path=None, bucket='aicore-web-scraping')

# Connection parameters for the AWS Postgres RDS instance.
# NOTE(security): the password should not live in source control. It can be
# supplied through the RDS_PASSWORD environment variable; the literal below
# is kept only as a backward-compatible fallback and should be rotated and
# removed.
rds_param = {
    'DATABASE_TYPE': 'postgresql',
    'DBAPI': 'psycopg2',
    'ENDPOINT': "aicore-webscraping-db.cwckuebjlobx.us-east-1.rds.amazonaws.com",
    'USER': 'postgres',
    'PASSWORD': os.environ.get('RDS_PASSWORD', 'aicore2022'),
    'PORT': 5432,
    'DATABASE': 'postgres',
}

# RDS storage is optional; set rds_storage = None to disable it.
rds_storage = AWSPostgresRDSDataStorage(rds_param)

# Initialize the scraper object.
# Other options present in the original script: browser='firefox',
# mode='normal'.
aabs = AmazonAutomatedBookScraper(
    url=url,
    book_attribute_scraper=abas,
    automated_book_review_scraper=aabrs,
    raw_data_storage=raw_storage,
    rds_data_storage=rds_storage,
    browser='chrome',
    mode='headless',
)

# Run the scraper.
aabs.scrape_books(num_books=num_books, num_reviews=num_reviews)
0 commit comments