Merge branch '102-premium-users' into 'master'
Resolve "Scrape premium users on a regular basis"

Closes #102

See merge request recommend.games/board-game-scraper!69
MarkusShepherd committed May 25, 2023
2 parents 90cf644 + 866a3e5 commit 7eb87d0
Showing 3 changed files with 104 additions and 2 deletions.
4 changes: 4 additions & 0 deletions .env.example
@@ -14,6 +14,10 @@ PULL_QUEUE_PROJECT=<pubsub-project>
PULL_QUEUE_SUBSCRIPTION_LOGS=<pubsub-subscription-logs>
PULL_QUEUE_SUBSCRIPTION_RESPONSES=<pubsub-subscription-responses>
PULL_QUEUE_INTERVAL=300
# Scrape premium users
SCRAPE_PREMIUM_USERS_LIST=
SCRAPE_PREMIUM_USERS_INTERVAL=1800
SCRAPE_PREMIUM_USERS_PREVENT_RESCRAPE_FOR=10800
# AWS credentials if you need access to S3
AWS_ACCESS_KEY_ID=<access-key>
AWS_SECRET_ACCESS_KEY=<secret-access-key>
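The three new variables configure the premium-user loop: SCRAPE_PREMIUM_USERS_LIST is a comma-separated list of user names, SCRAPE_PREMIUM_USERS_INTERVAL is the number of seconds between scheduling passes, and SCRAPE_PREMIUM_USERS_PREVENT_RESCRAPE_FOR is the minimum number of seconds before the same user is scraped again. A filled-in example might look like the following (the user names are placeholders, not part of the commit):

# Hypothetical values; the user names are placeholders
SCRAPE_PREMIUM_USERS_LIST=alice,bob,carol
# Run a scheduling pass every 30 minutes
SCRAPE_PREMIUM_USERS_INTERVAL=1800
# Skip users scraped within the last 3 hours
SCRAPE_PREMIUM_USERS_PREVENT_RESCRAPE_FOR=10800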
89 changes: 88 additions & 1 deletion board_game_scraper/extensions.py
@@ -4,14 +4,16 @@

import logging
import os

import sys
from datetime import timedelta, timezone
from pathlib import Path
from typing import Iterable, Union

from pytility import parse_date, parse_float
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.job import job_dir
from scrapy.utils.misc import arg_to_iter
from scrapy_extensions import LoopingExtension

from .utils import now, pubsub_client
@@ -151,6 +153,91 @@ def process_message(self, message, spider, encoding="utf-8"):
        return True


class ScrapePremiumUsersExtension(LoopingExtension):
    """Schedule collection requests for premium users at regular intervals."""

    @classmethod
    def from_crawler(cls, crawler):
        """Initialise from crawler."""

        if not crawler.settings.getbool("SCRAPE_PREMIUM_USERS_ENABLED"):
            raise NotConfigured

        premium_users = tuple(
            arg_to_iter(crawler.settings.getlist("SCRAPE_PREMIUM_USERS_LIST"))
        )

        if not premium_users:
            raise NotConfigured

        interval = crawler.settings.getfloat("SCRAPE_PREMIUM_USERS_INTERVAL", 60 * 60)

        prevent_rescrape_for = (
            crawler.settings.getfloat("SCRAPE_PREMIUM_USERS_PREVENT_RESCRAPE_FOR")
            or None
        )

        return cls(
            crawler=crawler,
            premium_users=premium_users,
            interval=interval,
            prevent_rescrape_for=prevent_rescrape_for,
        )

    def __init__(
        self,
        crawler,
        premium_users: Iterable[str],
        interval: float,
        prevent_rescrape_for: Union[float, timedelta, None] = None,
    ):
        self.premium_users = tuple(user.lower() for user in premium_users)

        prevent_rescrape_for = (
            prevent_rescrape_for
            if isinstance(prevent_rescrape_for, timedelta)
            else parse_float(prevent_rescrape_for)
        )
        self.prevent_rescrape_for = (
            timedelta(seconds=prevent_rescrape_for)
            if isinstance(prevent_rescrape_for, float)
            else prevent_rescrape_for
        )
        self.last_scraped = {}

        self.setup_looping_task(self._schedule_requests, crawler, interval)

    def _schedule_requests(self, spider):
        if not hasattr(spider, "collection_request"):
            return

        for user_name in self.premium_users:
            if self.prevent_rescrape_for:
                last_scraped = self.last_scraped.get(user_name)
                curr_time = now()

                if (
                    last_scraped
                    and last_scraped + self.prevent_rescrape_for > curr_time
                ):
                    LOGGER.info(
                        "Dropped <%s>: last scraped %s",
                        user_name,
                        last_scraped,
                    )
                    continue

                self.last_scraped[user_name] = curr_time

            LOGGER.info("Scheduling collection request for <%s>", user_name)
            request = spider.collection_request(
                user_name=user_name,
                priority=sys.maxsize,
                dont_filter=True,
            )
            spider.crawler.engine.crawl(request, spider)


class StateTag:
    """Writes a tag into JOBDIR with the state of the spider."""

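For context, ScrapePremiumUsersExtension only schedules work for spiders that expose a collection_request factory; any other spider is silently skipped. A minimal sketch of that contract, assuming a BGG-style XML API endpoint (the spider name, URL, and callback below are illustrative assumptions, not part of this commit):

from scrapy import Request, Spider


class CollectionSpider(Spider):
    name = "bgg"

    def collection_request(self, user_name: str, **kwargs) -> Request:
        # Extra kwargs (priority, dont_filter) are forwarded to the Request,
        # matching how the extension calls this factory.
        return Request(
            f"https://boardgamegeek.com/xmlapi2/collection?username={user_name}",
            callback=self.parse_collection,
            **kwargs,
        )

    def parse_collection(self, response):
        # Placeholder: parse the user's collection here.
        ...

The extension schedules these requests with priority=sys.maxsize and dont_filter=True, so premium users jump ahead of other queued requests and bypass Scrapy's duplicate filter; rescrape throttling is handled instead by the extension's own last_scraped map.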
13 changes: 12 additions & 1 deletion board_game_scraper/settings.py
@@ -192,7 +192,8 @@
    "scrapy_extensions.NicerAutoThrottle": 0,
    "board_game_scraper.extensions.StateTag": 0,
    "board_game_scraper.extensions.DontRunBeforeTag": 0,
    "board_game_scraper.extensions.PullQueueExtension": 100,
    "board_game_scraper.extensions.ScrapePremiumUsersExtension": 100,
    "board_game_scraper.extensions.PullQueueExtension": 200,
    "scrapy_extensions.MonitorDownloadsExtension": 500,
    "scrapy_extensions.DumpStatsExtension": 500,
}
@@ -311,3 +312,13 @@
    os.getenv("PULL_QUEUE_PREVENT_RESCRAPE_FOR") or 6 * 60 * 60  # 6 hours
)
PULL_QUEUE_PULL_TIMEOUT = os.getenv("PULL_QUEUE_PULL_TIMEOUT") or 5  # 5 seconds

# Scrape premium users
SCRAPE_PREMIUM_USERS_ENABLED = True
SCRAPE_PREMIUM_USERS_LIST = os.getenv("SCRAPE_PREMIUM_USERS_LIST")
SCRAPE_PREMIUM_USERS_INTERVAL = (
    os.getenv("SCRAPE_PREMIUM_USERS_INTERVAL") or 60 * 60  # 1 hour
)
SCRAPE_PREMIUM_USERS_PREVENT_RESCRAPE_FOR = (
    os.getenv("SCRAPE_PREMIUM_USERS_PREVENT_RESCRAPE_FOR") or 3 * 60 * 60  # 3 hours
)
