Skip to content

Commit

Permalink
Merge branch 'update' into 'master'
Browse files (browse the repository at this point in the history)
Update

See merge request recommend.games/board-game-scraper!74
  • Loading branch information
MarkusShepherd committed Aug 23, 2024
2 parents 7579b8e + 4eec3fc commit dae4f59
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 60 deletions.
57 changes: 1 addition & 56 deletions board_game_scraper/news.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,13 +4,11 @@

import argparse
import logging
import os.path
import sys

from datetime import date, timedelta, timezone
from pathlib import Path
from shutil import rmtree
from subprocess import run
from time import sleep
from typing import TYPE_CHECKING, Optional, Union

Expand Down Expand Up @@ -54,12 +52,10 @@ def _get_git_repo(path: Union[Path, str, None]) -> Optional["Repo"]:

def update_news(
*,
s3_src,
path_feeds,
path_merged,
path_split,
split_git_update=False,
s3_dst=None,
split_size=None,
log_level=None,
dry_run: bool = False,
Expand All @@ -73,9 +69,8 @@ def update_news(
path_split = Path(path_split).resolve()

LOGGER.info(
"%sSync from <%s>, merge from <%s> into <%s>, split into <%s>",
"%sMerge from <%s> into <%s> and split into <%s>",
dry_run_prefix,
s3_src,
path_feeds,
path_merged,
path_split,
Expand Down Expand Up @@ -112,9 +107,6 @@ def update_news(
else:
repo = None

if s3_dst:
LOGGER.info("%sUpload results to <%s>", dry_run_prefix, s3_dst)

LOGGER.info("%sDeleting existing dir <%s>", dry_run_prefix, path_split.parent)
if not dry_run:
if repo is None:
Expand All @@ -141,10 +133,6 @@ def update_news(
path_merged.parent.mkdir(parents=True, exist_ok=True)
path_split.parent.mkdir(parents=True, exist_ok=True)

LOGGER.info("%sS3 sync from <%s> to <%s>", dry_run_prefix, s3_src, path_feeds)
if not dry_run:
run(["aws", "s3", "sync", s3_src, os.path.join(path_feeds, "")], check=True)

merge_files(
in_paths=path_feeds.rglob("*.jl"),
out_path=path_merged,
Expand Down Expand Up @@ -197,54 +185,13 @@ def update_news(
remote,
)

if s3_dst:
LOGGER.info(
"%sS3 sync from <%s> to <%s>",
dry_run_prefix,
path_split.parent,
s3_dst,
)
if not dry_run:
run(
[
"aws",
"s3",
"sync",
"--acl",
"public-read",
"--exclude",
".gitignore",
"--exclude",
".DS_Store",
"--exclude",
".bucket",
"--size-only",
"--delete",
os.path.join(path_split.parent, ""),
s3_dst,
],
check=True,
)

LOGGER.info("%sDone updating news.", dry_run_prefix)


def _parse_args():
parser = argparse.ArgumentParser(
description="News syncing, merging, splitting, and uploading.",
)
parser.add_argument(
"--src-bucket",
"-b",
default="scrape.news.recommend.games",
help="S3 bucket with scraped data",
)
parser.add_argument(
"--dst-bucket",
"-B",
# default="news.recommend.games",
help="S3 bucket to upload to",
)
parser.add_argument(
"--feeds",
"-f",
Expand Down Expand Up @@ -339,11 +286,9 @@ def main():
file_obj.write(dont_run_before.isoformat())

update_news(
s3_src=f"s3://{args.src_bucket}/",
path_feeds=args.feeds,
path_merged=args.merged,
path_split=args.split,
s3_dst=f"s3://{args.dst_bucket}/" if args.dst_bucket else None,
split_size=args.split_size,
split_git_update=args.git,
log_level="DEBUG"
Expand Down
14 changes: 10 additions & 4 deletions docker-compose.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,6 +4,7 @@ services:
bgg:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-bgg
platform: linux/amd64
build: '.'
command: ['python', '-m', 'board_game_scraper', 'bgg']
env_file: .env
Expand All @@ -27,6 +28,7 @@ services:
bgg-hotness:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-bgg-hotness
platform: linux/amd64
build: '.'
command: ['python', '-m', 'board_game_scraper', 'bgg_hotness']
env_file: .env
Expand All @@ -43,6 +45,7 @@ services:
dbpedia:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-dbpedia
platform: linux/amd64
build: '.'
command: ['python', '-m', 'board_game_scraper', 'dbpedia']
env_file: .env
Expand All @@ -59,6 +62,7 @@ services:
luding:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-luding
platform: linux/amd64
build: '.'
command: ['python', '-m', 'board_game_scraper', 'luding']
env_file: .env
Expand All @@ -75,6 +79,7 @@ services:
spielen:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-spielen
platform: linux/amd64
build: '.'
command: ['python', '-m', 'board_game_scraper', 'spielen']
env_file: .env
Expand All @@ -91,6 +96,7 @@ services:
wikidata:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-wikidata
platform: linux/amd64
build: '.'
command: ['python', '-m', 'board_game_scraper', 'wikidata']
env_file: .env
Expand All @@ -107,15 +113,15 @@ services:
news:
image: registry.gitlab.com/mshepherd/news-scraper:0.18.0
container_name: bg-scraper-news
platform: linux/amd64
volumes:
- ./feeds/news/output:/root/output
- ~/.aws:/root/.aws
- ./feeds/news:/root/output
env_file: .env
environment:
ENVIRONMENT: docker
OUTPUT_DIR: s3://scrape.news.recommend.games
OUTPUT_DIR: /root/output
ELASTICSEARCH_STORAGE_ENABLED: 0
COUCHBASE_CACHE_ENABLED: 1
COUCHBASE_CACHE_ENABLED: 0
COUCHBASE_ENTITY_LINKING_ENABLED: 0
LOGSTASH_HOST: ''
LOGSTASH_PORT: ''
Expand Down

0 comments on commit dae4f59

Please sign in to comment.