Skip to content

Commit

Permalink
first version of board_game_scraper/download_bgg_dump.py
Browse files Browse the repository at this point in the history
  • Loading branch information
MarkusShepherd committed Oct 31, 2023
1 parent 9eb4490 commit af4efe6
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,6 @@ LIMIT_IMAGES_TO_DOWNLOAD_DBPEDIA=0
LIMIT_IMAGES_TO_DOWNLOAD_LUDING=0
LIMIT_IMAGES_TO_DOWNLOAD_SPIELEN=0
LIMIT_IMAGES_TO_DOWNLOAD_WIKIDATA=0
# BGG username and password
BGG_USERNAME=
BGG_PASSWORD=
103 changes: 103 additions & 0 deletions board_game_scraper/download_bgg_dump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""Download the latest BGG data dump."""

import argparse
import logging
import os
import sys

from pathlib import Path
from typing import Union
from urllib.parse import urljoin

import requests

from scrapy.selector import Selector

# Directory two levels above this file, i.e. the parent of the directory
# that contains this module; used to build the default output path.
BASE_DIR = Path(__file__).resolve().parent.parent
# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)


def download_bgg_dump(
    username: str,
    password: str,
    target_dir: Union[str, Path],
) -> None:
    """Download the latest BGG data dump.

    Logs in to BoardGameGeek with the given credentials, fetches the data
    dump page and saves every file linked there through an ``<a download>``
    element below ``#maincontent`` into ``target_dir``.

    Args:
        username: BGG account name used for the login request.
        password: BGG account password used for the login request.
        target_dir: Directory to write the files to; created if missing.

    Raises:
        requests.HTTPError: If the login, the dump page or one of the
            downloads answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """

    target_dir = Path(target_dir).resolve()
    target_dir.mkdir(parents=True, exist_ok=True)

    LOGGER.info("Downloading latest BGG dump to <%s>…", target_dir)

    login_url = "https://boardgamegeek.com/login/api/v1"
    html_url = "https://boardgamegeek.com/data_dumps/bg_ranks"
    # requests never times out by default; without this a stalled
    # connection would block the script forever.
    timeout = 60
    chunk_size = 1024 * 1024

    with requests.Session() as session:
        credentials = {
            "credentials": {
                "username": username,
                "password": password,
            }
        }
        login_response = session.post(login_url, json=credentials, timeout=timeout)
        login_response.raise_for_status()

        html_response = session.get(html_url, timeout=timeout)
        html_response.raise_for_status()

        selector = Selector(text=html_response.text)

        for link in selector.css("#maincontent a[download]"):
            href = link.xpath("@href").get()
            # The file name is taken from scraped HTML, so keep only its last
            # path component: this prevents "../x" or absolute paths from
            # escaping target_dir.
            file_name = Path(link.xpath("@download").get() or "").name

            if not href or file_name in ("", ".", ".."):
                LOGGER.warning(
                    "Skipping link without usable URL or file name: %s",
                    link.get(),
                )
                continue

            # Resolve relative links against the page they were found on;
            # absolute URLs pass through unchanged.
            download_url = urljoin(html_response.url, href)
            file_path = target_dir / file_name
            LOGGER.info("Downloading <%s> to <%s>…", download_url, file_path)

            # Stream to disk instead of buffering the whole dump in memory.
            with session.get(
                download_url,
                stream=True,
                timeout=timeout,
            ) as download_response:
                download_response.raise_for_status()

                with file_path.open("wb") as file:
                    for chunk in download_response.iter_content(chunk_size=chunk_size):
                        file.write(chunk)

    LOGGER.info("Done.")


def _parse_args():
    """Parse the command line options of this script.

    Returns the ``argparse`` namespace with ``out_dir`` (output directory,
    defaulting to ``feeds/bgg_dump`` below ``BASE_DIR``) and ``verbose``
    (number of times the verbosity flag was given).
    """
    default_out_dir = BASE_DIR / "feeds" / "bgg_dump"

    arg_parser = argparse.ArgumentParser(
        description="Download the latest BGG data dump.",
    )

    # Where the downloaded files end up.
    arg_parser.add_argument(
        "--out-dir",
        "-d",
        help="Output directory",
        default=default_out_dir,
    )
    # Counted flag: every repetition increases the value by one.
    arg_parser.add_argument(
        "--verbose",
        "-v",
        help="Log level (repeat for more verbosity)",
        action="count",
        default=0,
    )

    parsed = arg_parser.parse_args()
    return parsed


def main():
    """Command line entry point.

    Parses the command line arguments, configures logging to stderr, reads
    the credentials from the ``BGG_USERNAME`` and ``BGG_PASSWORD``
    environment variables and downloads the dump into the chosen directory.

    Exits with status 1 if either credential is missing or empty.
    """

    args = _parse_args()

    logging.basicConfig(
        stream=sys.stderr,
        level=logging.DEBUG if args.verbose > 0 else logging.INFO,
        format="%(asctime)s %(levelname)-8.8s [%(name)s:%(lineno)s] %(message)s",
    )

    LOGGER.info(args)

    username = os.getenv("BGG_USERNAME")
    password = os.getenv("BGG_PASSWORD")

    # os.getenv returns None for unset variables (and "" for empty ones);
    # fail early with a clear message instead of sending a doomed login.
    if not username or not password:
        LOGGER.error(
            "Please set the BGG_USERNAME and BGG_PASSWORD environment variables."
        )
        sys.exit(1)

    download_bgg_dump(username, password, args.out_dir)

# Run the command line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

0 comments on commit af4efe6

Please sign in to comment.