# Recovered from a git patch whose line breaks had been lost:
#   commit:  af4efe617f7f9c632195b38cc7534898cc7de188
#   author:  Markus Schepke
#   date:    Tue, 31 Oct 2023 23:15:14 +0200
#   subject: first version of board_game_scraper/download_bgg_dump.py
# The same patch appends the following lines to .env.example:
#   # BGG username and password
#   BGG_USERNAME=
#   BGG_PASSWORD=
"""Download the latest BGG data dump.

The script logs in to BoardGameGeek, fetches the data dump page and saves
every file linked from it (anchors carrying a ``download`` attribute inside
``#maincontent``) into a target directory.

Credentials are read from the ``BGG_USERNAME`` and ``BGG_PASSWORD``
environment variables when run from the command line.
"""

import argparse
import logging
import os
import sys

from pathlib import Path
from typing import Optional, Sequence, Union
from urllib.parse import unquote, urljoin, urlparse

import requests

from scrapy.selector import Selector

BASE_DIR = Path(__file__).resolve().parent.parent
LOGGER = logging.getLogger(__name__)

LOGIN_URL = "https://boardgamegeek.com/login/api/v1"
DUMP_PAGE_URL = "https://boardgamegeek.com/data_dumps/bg_ranks"

# requests never times out by default; without this a stalled connection
# would block the script forever.
DEFAULT_TIMEOUT = 60.0
# Size of the pieces in which a dump is streamed to disk.
CHUNK_SIZE = 1024 * 1024


def _safe_file_name(download_attr: Optional[str], download_url: str) -> str:
    """Return a bare file name for a download link.

    The ``download`` attribute is preferred; if it is missing or empty the
    last path segment of ``download_url`` is used instead. Any directory
    components are stripped so that the page content cannot make the script
    write outside of the target directory.

    Raises:
        ValueError: If neither source yields a usable file name.
    """

    url_path = unquote(urlparse(download_url).path)

    for candidate in (download_attr, url_path):
        # Treat backslashes as separators too, so that "..\\name" cannot
        # slip through as a single path component on POSIX systems.
        name = Path((candidate or "").replace("\\", "/")).name
        if name and name != "..":
            return name

    raise ValueError(f"Unable to derive a file name for <{download_url}>")


def _download_file(
    session: requests.Session,
    url: str,
    file_path: Path,
    timeout: float,
) -> None:
    """Stream ``url`` to ``file_path`` without holding it all in memory.

    The data is first written to a sibling ``.part`` file which is renamed
    once the transfer has finished, so an interrupted download never leaves
    a truncated file under the final name.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """

    partial_path = file_path.with_name(file_path.name + ".part")

    try:
        with session.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with partial_path.open("wb") as file:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    file.write(chunk)
        partial_path.replace(file_path)
    finally:
        # Only still present if the transfer failed part-way through.
        if partial_path.exists():
            partial_path.unlink()


def download_bgg_dump(
    username: str,
    password: str,
    target_dir: Union[str, Path],
    *,
    timeout: float = DEFAULT_TIMEOUT,
) -> None:
    """Download the latest BGG data dump.

    Args:
        username: BGG account name used for the login request.
        password: Password belonging to ``username``.
        target_dir: Directory the files are saved to; created if missing.
        timeout: Seconds to wait for the server on each request.

    Raises:
        requests.HTTPError: If the login, the dump page or a download
            responds with an error status.
        ValueError: If no file name can be determined for a download link.
    """

    target_dir = Path(target_dir).resolve()
    target_dir.mkdir(parents=True, exist_ok=True)

    LOGGER.info("Downloading latest BGG dump to <%s>…", target_dir)

    with requests.Session() as session:
        credentials = {
            "credentials": {
                "username": username,
                "password": password,
            }
        }
        # The session is reused for all later requests; presumably the login
        # response sets the cookies that authorise access to the dump page.
        login_response = session.post(
            LOGIN_URL,
            json=credentials,
            timeout=timeout,
        )
        login_response.raise_for_status()

        html_response = session.get(DUMP_PAGE_URL, timeout=timeout)
        html_response.raise_for_status()

        selector = Selector(text=html_response.text)
        links = selector.css("#maincontent a[download]")

        if not links:
            LOGGER.warning(
                "No download links found on <%s>, nothing was downloaded",
                html_response.url,
            )

        for link in links:
            href = link.xpath("@href").get()
            if not href:
                LOGGER.warning("Skipping a download link without a target URL")
                continue

            # Resolve relative links against the page they were found on.
            download_url = urljoin(html_response.url, href)
            file_name = _safe_file_name(
                link.xpath("@download").get(),
                download_url,
            )
            file_path = target_dir / file_name
            LOGGER.info("Downloading <%s> to <%s>…", download_url, file_path)

            _download_file(session, download_url, file_path, timeout)

    LOGGER.info("Done.")


def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command line options.

    Args:
        argv: Arguments to parse; defaults to ``sys.argv[1:]`` when ``None``.
    """

    parser = argparse.ArgumentParser(description="Download the latest BGG data dump.")
    parser.add_argument(
        "--out-dir",
        "-d",
        default=BASE_DIR / "feeds" / "bgg_dump",
        help="Output directory",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="count",
        default=0,
        help="Log level (repeat for more verbosity)",
    )

    return parser.parse_args(argv)


def main() -> None:
    """Command line entry point.

    Reads the credentials from the ``BGG_USERNAME`` and ``BGG_PASSWORD``
    environment variables and exits with status 1 if either is unset or
    empty.
    """

    args = _parse_args()

    logging.basicConfig(
        stream=sys.stderr,
        level=logging.DEBUG if args.verbose > 0 else logging.INFO,
        format="%(asctime)s %(levelname)-8.8s [%(name)s:%(lineno)s] %(message)s",
    )

    LOGGER.info(args)

    username = os.getenv("BGG_USERNAME")
    password = os.getenv("BGG_PASSWORD")

    # Fail early with a clear message instead of sending empty credentials.
    if not username or not password:
        LOGGER.error(
            "Please set the BGG_USERNAME and BGG_PASSWORD environment variables",
        )
        sys.exit(1)

    download_bgg_dump(username, password, args.out_dir)


if __name__ == "__main__":
    main()