Merge branch '103-premium-config' into 'master'
Resolve "Use premium users from config repo"

Closes #103

See merge request recommend.games/board-game-scraper!71
MarkusShepherd committed Sep 14, 2023
2 parents 7adb07f + 687602e commit dbb5656
Showing 5 changed files with 76 additions and 5 deletions.
.env.example (1 addition, 0 deletions)
```diff
@@ -16,6 +16,7 @@ PULL_QUEUE_SUBSCRIPTION_RESPONSES=<pubsub-subscription-responses>
 PULL_QUEUE_INTERVAL=300
 # Scrape premium users
 SCRAPE_PREMIUM_USERS_LIST=
+SCRAPE_PREMIUM_USERS_CONFIG_DIR=
 SCRAPE_PREMIUM_USERS_INTERVAL=1800
 SCRAPE_PREMIUM_USERS_PREVENT_RESCRAPE_FOR=10800
 # AWS credentials if you need access to S3
```
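
The new `SCRAPE_PREMIUM_USERS_CONFIG_DIR` variable takes a path to a directory of YAML files (parsed by `load_premium_users` in `board_game_scraper/utils.py` below) and supplements the comma-separated `SCRAPE_PREMIUM_USERS_LIST`.
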
board_game_scraper/extensions.py (10 additions, 3 deletions)
```diff
@@ -16,7 +16,7 @@
 from scrapy.utils.misc import arg_to_iter
 from scrapy_extensions import LoopingExtension
 
-from .utils import now, pubsub_client
+from .utils import load_premium_users, now, pubsub_client
 
 LOGGER = logging.getLogger(__name__)
 
@@ -163,9 +163,15 @@ def from_crawler(cls, crawler):
         if not crawler.settings.getbool("SCRAPE_PREMIUM_USERS_ENABLED"):
             raise NotConfigured
 
-        premium_users = tuple(
+        premium_users_list = frozenset(
             arg_to_iter(crawler.settings.getlist("SCRAPE_PREMIUM_USERS_LIST"))
         )
+        premium_users_from_dir = frozenset(
+            load_premium_users(
+                dirs=crawler.settings.get("SCRAPE_PREMIUM_USERS_CONFIG_DIR"),
+            )
+        )
+        premium_users = premium_users_list | premium_users_from_dir
 
         if not premium_users:
             raise NotConfigured
@@ -191,7 +197,8 @@ def __init__(
         interval: float,
         prevent_rescrape_for: Union[float, timedelta, None] = None,
     ):
-        self.premium_users = tuple(user.lower() for user in premium_users)
+        self.premium_users = frozenset(user.lower() for user in premium_users)
+        LOGGER.info("Scraping %d premium users", len(self.premium_users))
 
         prevent_rescrape_for = (
             prevent_rescrape_for
```
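
Switching from `tuple` to `frozenset` lets the two user sources be merged with a plain set union, so a username that appears both in the env list and in the config dir is counted once. A minimal sketch (the usernames below are made up, not from the codebase):

```python
# Hypothetical values standing in for the two settings sources:
env_users = frozenset({"alice", "bob"})      # from SCRAPE_PREMIUM_USERS_LIST
config_users = frozenset({"bob", "carol"})   # from SCRAPE_PREMIUM_USERS_CONFIG_DIR

premium_users = env_users | config_users     # set union deduplicates "bob"
assert premium_users == {"alice", "bob", "carol"}
```
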
board_game_scraper/settings.py (1 addition, 0 deletions)
```diff
@@ -316,6 +316,7 @@
 # Scrape premium users
 SCRAPE_PREMIUM_USERS_ENABLED = True
 SCRAPE_PREMIUM_USERS_LIST = os.getenv("SCRAPE_PREMIUM_USERS_LIST")
+SCRAPE_PREMIUM_USERS_CONFIG_DIR = os.getenv("SCRAPE_PREMIUM_USERS_CONFIG_DIR")
 SCRAPE_PREMIUM_USERS_INTERVAL = (
     os.getenv("SCRAPE_PREMIUM_USERS_INTERVAL") or 60 * 60  # 1 hour
 )
```
board_game_scraper/utils.py (62 additions, 0 deletions)
```diff
@@ -25,6 +25,7 @@
 )
 from scrapy.item import BaseItem, Item
 from w3lib.html import replace_entities
+import yaml
 
 try:
     # pylint: disable=redefined-builtin
@@ -409,3 +410,64 @@ def pubsub_client():
         LOGGER.exception("unable to initialise PubSub client")
 
     return None
+
+
+def _load_yaml(
+    path: Union[str, Path],
+    encoding: str = "utf-8",
+) -> Iterable[Dict[str, Any]]:
+    path = Path(path).resolve()
+    LOGGER.info("Loading YAML from <%s>", path)
+    try:
+        with path.open(encoding=encoding) as yaml_file:
+            yield from yaml.safe_load(yaml_file)
+    except Exception:
+        LOGGER.exception("Unable to load YAML from <%s>", path)
+
+
+def _load_yamls(
+    paths: Iterable[Union[str, Path]],
+    encoding: str = "utf-8",
+) -> Iterable[Dict[str, Any]]:
+    for path in paths:
+        yield from _load_yaml(path, encoding)
+
+
+def load_premium_users(
+    dirs: Union[str, Path, Iterable[Union[str, Path]], None] = None,
+    files: Union[str, Path, Iterable[Union[str, Path]], None] = None,
+    compare_date: Union[datetime, str, None] = None,
+    encoding: str = "utf-8",
+) -> Iterable[str]:
+    """Load premium users from YAML files and compare against given date."""
+
+    compare_date = parse_date(compare_date, tzinfo=timezone.utc) or parse_date(
+        datetime.utcnow(),
+        tzinfo=timezone.utc,
+    )
+    LOGGER.info("Comparing premium expiration dates against <%s>", compare_date)
+
+    for file_dir in arg_to_iter(dirs):
+        file_dir = Path(file_dir).resolve()
+        if file_dir.is_dir():
+            LOGGER.info("Loading YAML files from config dir <%s>", file_dir)
+            yield from load_premium_users(
+                files=file_dir.glob("*.yaml"),
+                compare_date=compare_date,
+                encoding=encoding,
+            )
+        else:
+            LOGGER.warning("Skipping non-existing config dir <%s>", file_dir)
+
+    for row in _load_yamls(arg_to_iter(files), encoding):
+        for username, expiry_date in row.items():
+            username = username.lower()
+            expiry_date = parse_date(expiry_date, tzinfo=timezone.utc)
+            if expiry_date < compare_date:
+                LOGGER.info(
+                    "Premium for user <%s> ended on <%s>",
+                    username,
+                    expiry_date,
+                )
+            else:
+                yield username
```
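
The loader implies a particular file shape: each `*.yaml` file in the config dir must parse to a list of `{username: expiry date}` mappings, since `_load_yaml` iterates over the top-level document and `load_premium_users` then iterates over each row's items. A minimal sketch of that round trip, with made-up usernames and dates:

```python
import yaml

# Hypothetical contents of one *.yaml file in the config dir:
EXAMPLE = """
- alice: 2031-12-31
- Bob: 2023-01-01
"""

# PyYAML's safe_load resolves unquoted ISO dates to datetime.date objects:
rows = yaml.safe_load(EXAMPLE)
# -> [{'alice': datetime.date(2031, 12, 31)}, {'Bob': datetime.date(2023, 1, 1)}]

for row in rows:
    for username, expiry_date in row.items():
        # load_premium_users lowercases each username and yields it
        # only if its expiry date has not passed the compare date
        print(username.lower(), expiry_date)
```

Given such a directory, the extension's `from_crawler` call reduces to something like `frozenset(load_premium_users(dirs="/app/premium"))`, with expired users filtered out against the current UTC time.
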
docker-compose.yaml (2 additions, 2 deletions)
```diff
@@ -10,16 +10,16 @@ services:
     environment:
       CLOSESPIDER_TIMEOUT: 10800 # 3 hours
       DONT_RUN_BEFORE_SEC: 3600 # 1 hour
-      # GOOGLE_APPLICATION_CREDENTIALS: /app/gs.json
       TELNETCONSOLE_ENABLED: 1
       TELNETCONSOLE_USERNAME: scrapy
       TELNETCONSOLE_PASSWORD: recommend.games
+      SCRAPE_PREMIUM_USERS_CONFIG_DIR: /app/premium
     ports:
       - 6023:6023
     volumes:
       - ./feeds:/app/feeds
       - ./images:/app/images
-      # - ../recommend-games-server/gs.json:/app/gs.json
+      - ../recommend-games-config/users/premium:/app/premium
     restart: unless-stopped
     stop_grace_period: 15m
     stop_signal: SIGINT
```
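
Taken together, the compose changes mount the `users/premium` folder of a sibling `recommend-games-config` checkout at `/app/premium` inside the container and point `SCRAPE_PREMIUM_USERS_CONFIG_DIR` at that path, so the scraper picks up the premium YAML files at runtime; the stale Google-credentials lines are dropped at the same time.
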
