diff --git a/.env.example b/.env.example
index 8a9210d..a6b9cc9 100644
--- a/.env.example
+++ b/.env.example
@@ -5,8 +5,6 @@ LOG_LEVEL=DEBUG
 LOG_SCRAPED_ITEMS=0
 ROBOTSTXT_OBEY=1
 SCRAPER_FILE_TAG=
-# get your Board Game Atlas credentials at https://www.boardgameatlas.com/api/docs/apps
-BGA_CLIENT_ID=
 # you only need these settings if you want to prioritise certain BGG users
 GOOGLE_APPLICATION_CREDENTIALS=/path/to/gs.json
 # PubSub queue settings
@@ -24,7 +22,6 @@ AWS_ACCESS_KEY_ID=
 AWS_SECRET_ACCESS_KEY=
 # limit downloaded images per game and spider
 # use 0 to disable download; -1 for all images
-LIMIT_IMAGES_TO_DOWNLOAD_BGA=0
 LIMIT_IMAGES_TO_DOWNLOAD_BGG=0
 LIMIT_IMAGES_TO_DOWNLOAD_DBPEDIA=0
 LIMIT_IMAGES_TO_DOWNLOAD_LUDING=0
diff --git a/README.md b/README.md
index 0075f5e..b11c009 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,6 @@ pip install board-game-scraper

 ## Sources

-* [Board Game Atlas](https://www.boardgameatlas.com/) (`bga`)
 * [BoardGameGeek](https://boardgamegeek.com/) (`bgg`)
 * [DBpedia](https://wiki.dbpedia.org/) (`dbpedia`)
 * [Luding.org](https://luding.org/) (`luding`)
diff --git a/board_game_scraper/fields.yaml b/board_game_scraper/fields.yaml
index a2e1f84..fd625a7 100644
--- a/board_game_scraper/fields.yaml
+++ b/board_game_scraper/fields.yaml
@@ -31,5 +31,3 @@
   type: Exact
 - field: spielen_id
   type: Exact
-- field: bga_id
-  type: Exact
diff --git a/board_game_scraper/full_merge.py b/board_game_scraper/full_merge.py
index cf0b64e..21c1770 100644
--- a/board_game_scraper/full_merge.py
+++ b/board_game_scraper/full_merge.py
@@ -60,19 +60,6 @@ def merge_configs(spider, full=False):

     full = parse_bool(full)

-    if spider == "bga":
-        yield merge_config(spider="bga", item="GameItem", full=full)
-        yield merge_config(
-            spider="bga",
-            item="RatingItem",
-            full=full,
-            keys=("bga_user_id", "bga_id"),
-            fieldnames_exclude=("bgg_user_play_count",)
-            if parse_bool(full)
-            else ("bgg_user_play_count", "published_at", "updated_at", "scraped_at"),
-        )
-        return
-
     if spider == "bgg":
         yield merge_config(spider="bgg", item="GameItem", full=full)
         yield merge_config(
diff --git a/board_game_scraper/items.py b/board_game_scraper/items.py
index f65ddfc..3e24102 100644
--- a/board_game_scraper/items.py
+++ b/board_game_scraper/items.py
@@ -441,7 +441,6 @@ class GameItem(TypedItem):
         default=None,
     )
     spielen_id = Field(dtype=str)
-    bga_id = Field(dtype=str)

     published_at = Field(
         dtype=datetime,
@@ -628,16 +627,6 @@ class RatingItem(TypedItem):
         dtype=int, dtype_convert=parse_int, input_processor=NN_INT_PROCESSOR, default=0
     )

-    bga_id = Field(dtype=str)
-    bga_user_id = Field(dtype=str)
-    bga_user_name = Field(dtype=str, input_processor=MapCompose(identity, str))
-    bga_user_rating = Field(
-        dtype=float,
-        dtype_convert=parse_float,
-        default=None,
-        input_processor=POS_FLOAT_PROCESSOR,
-    )
-
     comment = Field(
         dtype=str,
         input_processor=MapCompose(
diff --git a/board_game_scraper/settings.py b/board_game_scraper/settings.py
index 568a8c4..96e7327 100644
--- a/board_game_scraper/settings.py
+++ b/board_game_scraper/settings.py
@@ -88,7 +88,6 @@
     "dbpedia_id",
     "luding_id",
     "spielen_id",
-    "bga_id",
     "published_at",
     "updated_at",
     "scraped_at",
@@ -129,10 +128,6 @@
     "bgg_user_preordered",
     "bgg_user_wishlist",
     "bgg_user_play_count",
-    "bga_id",
-    "bga_user_id",
-    "bga_user_name",
-    "bga_user_rating",
     "comment",
     "published_at",
     "updated_at",
@@ -297,12 +292,6 @@ CLEAN_ITEM_DROP_FALSEY = True
 CLEAN_ITEM_DROP_VALUES = None

-# Board Game Atlas
-BGA_CLIENT_ID = os.getenv("BGA_CLIENT_ID")
-BGA_SCRAPE_IMAGES = False
-BGA_SCRAPE_VIDEOS = False
-BGA_SCRAPE_REVIEWS = False
-
 PULL_QUEUE_ENABLED = False
 PULL_QUEUE_PROJECT = os.getenv("PULL_QUEUE_PROJECT")
 PULL_QUEUE_SUBSCRIPTION = os.getenv("PULL_QUEUE_SUBSCRIPTION")
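
The per-spider image limits that remain in .env.example and settings keep working as before: each spider reads its own LIMIT_IMAGES_TO_DOWNLOAD_* variable from the environment, exactly as the deleted bga spider did in its custom_settings. A minimal sketch of that lookup, reusing the pattern from the removed code (the BGG variable name is taken from .env.example above):

    import os

    from pytility import parse_int

    # 0 disables image downloads, -1 downloads all images (see .env.example)
    limit = parse_int(os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_BGG")) or 0
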
diff --git a/board_game_scraper/spiders/bga.py b/board_game_scraper/spiders/bga.py
deleted file mode 100644
index ddaeeb8..0000000
--- a/board_game_scraper/spiders/bga.py
+++ /dev/null
@@ -1,293 +0,0 @@
-# -*- coding: utf-8 -*-
-
-""" Board Game Atlas spider """
-
-import os
-
-from functools import partial
-from itertools import chain
-from urllib.parse import urlencode
-
-from pytility import parse_float, parse_int
-from scrapy import Request, Spider
-from scrapy.utils.project import get_project_settings
-
-from ..items import GameItem, RatingItem
-from ..loaders import GameJsonLoader, RatingJsonLoader
-from ..utils import (
-    extract_bga_id,
-    extract_meta,
-    extract_item,
-    extract_query_param,
-    extract_url,
-    json_from_response,
-    now,
-)
-
-API_URL = "https://api.boardgameatlas.com/api"
-
-
-def _extract_bga_id(item=None, response=None):
-    if item and item.get("bga_id"):
-        return item["bga_id"]
-    meta = extract_meta(response)
-    if meta.get("bga_id"):
-        return meta["bga_id"]
-    url = extract_url(item, response)
-    return extract_bga_id(url)
-
-
-def _extract_requests(response=None):
-    meta = extract_meta(response)
-    return meta.get("game_requests")
-
-
-class BgaSpider(Spider):
-    """Board Game Atlas spider"""
-
-    name = "bga"
-    allowed_domains = ("boardgameatlas.com",)
-    item_classes = (GameItem, RatingItem)
-    api_url = API_URL
-
-    custom_settings = {
-        "IMAGES_URLS_FIELD": None,
-        "DOWNLOAD_DELAY": 30,
-        "CONCURRENT_REQUESTS_PER_DOMAIN": 4,
-        "AUTOTHROTTLE_TARGET_CONCURRENCY": 2,
-        "LIMIT_IMAGES_TO_DOWNLOAD": parse_int(os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_BGA"))
-        or 0,
-    }
-
-    @classmethod
-    def from_crawler(cls, crawler, *args, **kwargs):
-        """initialise spider from crawler"""
-
-        kwargs.setdefault("settings", crawler.settings)
-        spider = cls(*args, **kwargs)
-        spider._set_crawler(crawler)
-        return spider
-
-    def __init__(self, *args, settings=None, **kwargs):
-        super().__init__(*args, **kwargs)
-        settings = settings or get_project_settings()
-        self.client_id = settings.get("BGA_CLIENT_ID")
-        self.scrape_images = settings.getbool("BGA_SCRAPE_IMAGES")
-        self.scrape_videos = settings.getbool("BGA_SCRAPE_VIDEOS")
-        self.scrape_reviews = settings.getbool("BGA_SCRAPE_REVIEWS")
-
-    def _api_url(self, path="search", query=None):
-        query = query or {}
-        query.setdefault("client_id", self.client_id)
-        query.setdefault("limit", 100)
-        return "{}/{}?{}".format(
-            self.api_url, path, urlencode(sorted(query.items(), key=lambda x: x[0]))
-        )
-
-    def _game_requests(self, bga_id):
-        if self.scrape_images:
-            yield self._api_url("game/images", {"game_id": bga_id}), self.parse_images
-        if self.scrape_videos:
-            yield self._api_url("game/videos", {"game_id": bga_id}), self.parse_videos
-        if self.scrape_reviews:
-            yield self._api_url("game/reviews", {"game_id": bga_id}), self.parse_reviews
-
-    # pylint: disable=no-self-use
-    def _next_request_or_item(self, item, requests):
-        if not requests:
-            return item
-
-        url, callback = requests.pop(0)
-        callback = partial(callback, item=item)
-        return Request(
-            url=url,
-            callback=callback,
-            errback=callback,
-            meta={"item": item, "game_requests": requests},
-        )
-
-    def start_requests(self):
-        """generate start requests"""
-
-        yield Request(
-            url=self._api_url(query={"order_by": "popularity"}),
-            callback=self.parse,
-            priority=2,
-        )
-        yield Request(
-            url=self._api_url(path="reviews"),
-            callback=self.parse_user_reviews,
-            priority=1,
-        )
-
-    # pylint: disable=line-too-long
-    def parse(self, response):
-        """
-        @url https://api.boardgameatlas.com/api/search?client_id=8jfqHypg2l&order_by=popularity&limit=100
-        @returns items 100 100
-        @returns requests 1 1
-        @scrapes name description url image_url bga_id scraped_at worst_rating best_rating
-        """
-
-        result = json_from_response(response)
-        games = result.get("games") or ()
-        scraped_at = now()
-
-        if games:
-            skip = parse_int(extract_query_param(response.url, "skip")) or 0
-            limit = parse_int(extract_query_param(response.url, "limit")) or 100
-            query = {"order_by": "popularity", "skip": skip + limit, "limit": limit}
-            yield Request(
-                url=self._api_url(query=query), callback=self.parse, priority=2
-            )
-
-        for game in games:
-            bga_id = game.get("id") or extract_bga_id(game.get("url"))
-            ldr = GameJsonLoader(
-                item=GameItem(
-                    bga_id=bga_id, scraped_at=scraped_at, worst_rating=1, best_rating=5
-                ),
-                json_obj=game,
-                response=response,
-            )
-
-            ldr.add_jmes("name", "name")
-            ldr.add_jmes("alt_name", "names")
-            ldr.add_jmes("year", "year_published")
-            ldr.add_jmes("description", "description_preview")
-            ldr.add_jmes("description", "description")
-
-            ldr.add_jmes("designer", "designers")
-            ldr.add_jmes("artist", "artists")
-            ldr.add_jmes("publisher", "primary_publisher")
-            ldr.add_jmes("publisher", "publishers")
-
-            ldr.add_jmes("url", "url")
-            ldr.add_jmes("image_url", "image_url")
-            ldr.add_jmes("image_url", "thumb_url")
-            ldr.add_jmes("rules_url", "rules_url")
-            ldr.add_jmes("external_link", "official_url")
-
-            list_price = ldr.get_jmes("msrp")
-            list_price = map(
-                "USD{:.2f}".format, filter(None, map(parse_float, list_price))
-            )
-            ldr.add_value("list_price", list_price)
-
-            ldr.add_jmes("min_players", "min_players")
-            ldr.add_jmes("max_players", "max_players")
-            ldr.add_jmes("min_age", "min_age")
-            ldr.add_jmes("min_time", "min_playtime")
-            ldr.add_jmes("max_time", "max_playtime")
-
-            # TODO resolve mechanic and category (#48)
-            # https://www.boardgameatlas.com/api/docs/game/categories
-            # https://www.boardgameatlas.com/api/docs/game/mechanics
-            ldr.add_jmes("category", "categories[].id")
-            ldr.add_jmes("mechanic", "mechanics[].id")
-
-            ldr.add_jmes("num_votes", "num_user_ratings")
-            ldr.add_jmes("avg_rating", "average_user_rating")
-
-            item = ldr.load_item()
-            requests = list(self._game_requests(bga_id))
-            yield self._next_request_or_item(item, requests)
-
-    def parse_images(self, response, item=None):
-        """
-        @url https://api.boardgameatlas.com/api/game/images?client_id=8jfqHypg2l&game_id=OIXt3DmJU0&limit=100
-        @returns items 1 1
-        @returns requests 0 0
-        @scrapes image_url
-        """
-
-        item = extract_item(item, response, GameItem)
-        result = json_from_response(response)
-
-        ldr = GameJsonLoader(item=item, json_obj=result, response=response)
-        ldr.add_value("image_url", item.get("image_url"))
-        ldr.add_jmes("image_url", "images[].url")
-        ldr.add_jmes("image_url", "images[].thumb")
-
-        item = ldr.load_item()
-        requests = _extract_requests(response)
-        return self._next_request_or_item(item, requests)
-
ldr.add_value("video_url", item.get("video_url")) - ldr.add_jmes("video_url", "videos[].url") - - item = ldr.load_item() - requests = _extract_requests(response) - return self._next_request_or_item(item, requests) - - # pylint: disable=no-self-use - def parse_reviews(self, response, item=None): - """ - @url https://api.boardgameatlas.com/api/game/reviews?client_id=8jfqHypg2l&game_id=OIXt3DmJU0&limit=100 - @returns items 1 1 - @returns requests 0 0 - @scrapes review_url - """ - - item = extract_item(item, response, GameItem) - result = json_from_response(response) - - ldr = GameJsonLoader(item=item, json_obj=result, response=response) - ldr.add_value("review_url", item.get("review_url")) - ldr.add_jmes("review_url", "reviews[].url") - - item = ldr.load_item() - requests = _extract_requests(response) - return self._next_request_or_item(item, requests) - - def parse_user_reviews(self, response): - """ - @url https://api.boardgameatlas.com/api/reviews?client_id=8jfqHypg2l&limit=100 - @returns items 100 100 - @returns requests 1 1 - @scrapes item_id bga_id bga_user_id bga_user_name - """ - - result = json_from_response(response) - reviews = result.get("reviews") or () - scraped_at = now() - - if reviews: - skip = parse_int(extract_query_param(response.url, "skip")) or 0 - limit = parse_int(extract_query_param(response.url, "limit")) or 100 - query = {"skip": skip + limit, "limit": limit} - yield Request( - url=self._api_url(path="reviews", query=query), - callback=self.parse_user_reviews, - priority=1, - ) - - for review in reviews: - ldr = RatingJsonLoader( - item=RatingItem(scraped_at=scraped_at), - json_obj=review, - response=response, - ) - - ldr.add_jmes("item_id", "id") - - ldr.add_jmes("bga_id", "game.id.objectId") - ldr.add_jmes("bga_user_id", "user.id") - ldr.add_jmes("bga_user_name", "user.username") - ldr.add_jmes("bga_user_rating", "rating") - comments = chain(ldr.get_jmes("title"), ldr.get_jmes("description")) - ldr.add_value("comment", "\n".join(filter(None, comments))) - - yield ldr.load_item() diff --git a/board_game_scraper/spiders/wikidata.py b/board_game_scraper/spiders/wikidata.py index 7a2b4f1..a62102e 100644 --- a/board_game_scraper/spiders/wikidata.py +++ b/board_game_scraper/spiders/wikidata.py @@ -181,7 +181,7 @@ def parse_game(self, response): @returns items 1 1 @returns requests 0 0 @scrapes name alt_name designer publisher url official_url image_url external_link \ - min_players max_players bgg_id wikidata_id wikipedia_id freebase_id luding_id bga_id + min_players max_players bgg_id wikidata_id wikipedia_id freebase_id luding_id """ try: @@ -246,7 +246,6 @@ def parse_game(self, response): ldr.add_jmes("wikidata_id", "id") ldr.add_jmes("wikidata_id", "title") ldr.add_jmes("luding_id", "claims.P3528[].mainsnak.datavalue.value") - ldr.add_jmes("bga_id", "claims.P6491[].mainsnak.datavalue.value") ldr.add_value( None, extract_ids( diff --git a/board_game_scraper/utils.py b/board_game_scraper/utils.py index 17a4b8e..86dd78b 100644 --- a/board_game_scraper/utils.py +++ b/board_game_scraper/utils.py @@ -19,7 +19,6 @@ clear_list, normalize_space, parse_int, - take_first, to_str, parse_date, ) @@ -48,7 +47,6 @@ r"^/(alle-brettspiele|messeneuheiten|ausgezeichnet-\d+)/(\w[^/]*).*$" ) REGEX_FREEBASE_ID = re.compile(r"^/ns/(g|m)\.([^/]+).*$") -REGEX_BGA_ID = re.compile(r"^.*/game/([a-zA-Z0-9]+)(/.*)?$") def to_lower(string): @@ -369,19 +367,6 @@ def extract_freebase_id(url: Union[str, ParseResult, None]) -> Optional[str]: ) -def extract_bga_id(url: Union[str, ParseResult, None]) 
diff --git a/board_game_scraper/spiders/wikidata.py b/board_game_scraper/spiders/wikidata.py
index 7a2b4f1..a62102e 100644
--- a/board_game_scraper/spiders/wikidata.py
+++ b/board_game_scraper/spiders/wikidata.py
@@ -181,7 +181,7 @@ def parse_game(self, response):
         @returns items 1 1
         @returns requests 0 0
         @scrapes name alt_name designer publisher url official_url image_url external_link \
-            min_players max_players bgg_id wikidata_id wikipedia_id freebase_id luding_id bga_id
+            min_players max_players bgg_id wikidata_id wikipedia_id freebase_id luding_id
         """

         try:
@@ -246,7 +246,6 @@
         ldr.add_jmes("wikidata_id", "id")
         ldr.add_jmes("wikidata_id", "title")
         ldr.add_jmes("luding_id", "claims.P3528[].mainsnak.datavalue.value")
-        ldr.add_jmes("bga_id", "claims.P6491[].mainsnak.datavalue.value")
         ldr.add_value(
             None,
             extract_ids(
diff --git a/board_game_scraper/utils.py b/board_game_scraper/utils.py
index 17a4b8e..86dd78b 100644
--- a/board_game_scraper/utils.py
+++ b/board_game_scraper/utils.py
@@ -19,7 +19,6 @@
     clear_list,
     normalize_space,
     parse_int,
-    take_first,
     to_str,
     parse_date,
 )
@@ -48,7 +47,6 @@
     r"^/(alle-brettspiele|messeneuheiten|ausgezeichnet-\d+)/(\w[^/]*).*$"
 )
 REGEX_FREEBASE_ID = re.compile(r"^/ns/(g|m)\.([^/]+).*$")
-REGEX_BGA_ID = re.compile(r"^.*/game/([a-zA-Z0-9]+)(/.*)?$")


 def to_lower(string):
@@ -369,19 +367,6 @@ def extract_freebase_id(url: Union[str, ParseResult, None]) -> Optional[str]:
     )


-def extract_bga_id(url: Union[str, ParseResult, None]) -> Optional[str]:
-    """extract Board Game Atlas ID from URL"""
-    url = parse_url(url, ("boardgameatlas.com", "www.boardgameatlas.com"))
-    if not url:
-        return None
-    match = REGEX_BGA_ID.match(url.path)
-    if match:
-        return match.group(1)
-    ids_str = extract_query_param(url, "ids")
-    ids = ids_str.split(",") if ids_str else ()
-    return take_first(map(normalize_space, ids)) or extract_query_param(url, "game-id")
-
-
 def extract_ids(*urls: Optional[str]) -> Dict[str, List[Union[int, str]]]:
     """extract all possible IDs from all the URLs"""
     urls = tuple(map(urlparse, urls))
@@ -393,7 +378,6 @@ def extract_ids(*urls: Optional[str]) -> Dict[str, List[Union[int, str]]]:
         "dbpedia_id": clear_list(map(extract_dbpedia_id, urls)),
         "luding_id": clear_list(map(extract_luding_id, urls)),
         "spielen_id": clear_list(map(extract_spielen_id, urls)),
-        "bga_id": clear_list(map(extract_bga_id, urls)),
     }
diff --git a/merge.sh b/merge.sh
index 4b45cf0..8892cc4 100755
--- a/merge.sh
+++ b/merge.sh
@@ -6,7 +6,7 @@ BASE_DIR="$(dirname "$(readlink --canonicalize "${BASH_SOURCE[0]}")")"
 LOGS_DIR="$(readlink --canonicalize "${BASE_DIR}/logs")"

 if [ $# -eq 0 ] || [[ "${1}" == 'all' ]]; then
-    SPIDERS=('bga' 'bgg_hotness' 'dbpedia' 'luding' 'spielen' 'wikidata' 'bgg')
+    SPIDERS=('bgg_hotness' 'dbpedia' 'luding' 'spielen' 'wikidata' 'bgg')
 else
     SPIDERS=("$@")
 fi
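
As a follow-up sanity check, it can help to confirm that no stale references to the removed code survive outside the files touched here. A minimal sketch (the tokens are taken from this diff; the script itself is not part of the repo):

    from pathlib import Path

    # identifiers removed by this change; none should appear in the package any more
    STALE = ("bga_id", "extract_bga_id", "BGA_CLIENT_ID", "BgaSpider", "REGEX_BGA_ID")

    for path in Path("board_game_scraper").rglob("*.py"):
        text = path.read_text(encoding="utf-8")
        for token in STALE:
            if token in text:
                print(f"{path}: still references {token!r}")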