-import requests
-from datetime import datetime, timedelta
-import time, json, os
-from typing import Any, Optional
+import logging
+from datetime import timedelta
+from typing import Optional, Tuple
+import os
+import json
 
-from bytewax.connectors.periodic import SimplePollingInput
+import requests
+from bytewax import operators as op
 from bytewax.dataflow import Dataflow
+from bytewax.inputs import SimplePollingSource
 
-import logging
-
-from proton import ProtonOutput
+from proton import ProtonSink
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# logger.setLevel(logging.DEBUG)
-
-class HNInput(SimplePollingInput):
-    def __init__(self, interval: timedelta, align_to: Optional[datetime] = None, init_item: Optional[int] = None):
-        super().__init__(interval, align_to)
-        '''
-        By default, only get the recent events
-        '''
-        if not init_item or init_item == 0:
-            self.max_id = int(requests.get("https://hacker-news.firebaseio.com/v0/maxitem.json").json()*0.999998)
-        else:
-            self.max_id = init_item
-        logger.info(f"received starting id: {init_item}")
-
 
+
+class HNSource(SimplePollingSource):
     def next_item(self):
-        '''
-        Get all the items from hacker news API between the last max id and the current max id.
-        '''
-        new_max_id = requests.get("https://hacker-news.firebaseio.com/v0/maxitem.json").json()
-        logger.info(f"current id: {self.max_id}, new id: {new_max_id}. {new_max_id-self.max_id} items to fetch")
-        ids = [int(i) for i in range(self.max_id, new_max_id)]
-        self.max_id = new_max_id
-        logger.debug(ids)
-        return ids
-
-def download_metadata(hn_id):
+        return (
+            "GLOBAL_ID",
+            requests.get("https://hacker-news.firebaseio.com/v0/maxitem.json").json(),
+        )
+
+
+def get_id_stream(old_max_id, new_max_id) -> Tuple[str, list]:
+    if old_max_id is None:
+        # Get the last 10 items on the first run.
+        old_max_id = new_max_id - 10
+    return (new_max_id, range(old_max_id, new_max_id))
+
+
+def download_metadata(hn_id) -> Optional[Tuple[str, dict]]:
     # Given a Hacker News id returned from the API, fetch its metadata
-    logger.info(f"Fetching https://hacker-news.firebaseio.com/v0/item/{hn_id}.json")
-    req = requests.get(
+    # If the API returns no payload for this id, log a warning and skip it
+    data = requests.get(
         f"https://hacker-news.firebaseio.com/v0/item/{hn_id}.json"
-    )
-    if not req.json():
-        logger.warning(f"error getting payload from item {hn_id} trying again")
-        time.sleep(0.5)
-        return download_metadata(hn_id)
-    return req.json()
-
-def recurse_tree(metadata):
+    ).json()
+
+    if data is None:
+        logger.warning(f"Couldn't fetch item {hn_id}, skipping")
+        return None
+    return (str(hn_id), data)
+
+
+def recurse_tree(metadata, og_metadata=None) -> any:
+    if not og_metadata:
+        og_metadata = metadata
     try:
         parent_id = metadata["parent"]
         parent_metadata = download_metadata(parent_id)
-        return recurse_tree(parent_metadata)
+        return recurse_tree(parent_metadata[1], og_metadata)
     except KeyError:
-        return (metadata["id"], {**metadata, "key_id": metadata["id"]})
-
-def key_on_parent(metadata: dict) -> tuple:
-    key, metadata = recurse_tree(metadata)
-    return json.dumps(metadata, indent=4, sort_keys=True)
-
-def run_hn_flow(init_item):
-    flow = Dataflow()
-    flow.input("in", HNInput(timedelta(seconds=15), None, init_item)) # skip the align_to argument
-    flow.flat_map(lambda x: x)
-    # If you run this dataflow with multiple workers, downloads in
-    # the next `map` will be parallelized thanks to .redistribute()
-    flow.redistribute()
-    flow.map(download_metadata)
-    flow.inspect(logger.debug)
-
-    # We want to keep related data together so let's build a
-    # traversal function to get the ultimate parent
-    flow.map(key_on_parent)
-
-    flow.output("out", ProtonOutput("hn", os.environ["PROTON_HOST"]))
-    return flow
+        return (metadata["id"],
+                {
+                    **og_metadata,
+                    "root_id": metadata["id"]
+                }
+               )
+
+
+def key_on_parent(key__metadata) -> tuple:
+    key, metadata = recurse_tree(key__metadata[1])
+    return (str(key), metadata)
+
+
+def format(id__metadata):
+    id, metadata = id__metadata
+    return json.dumps(metadata)
+
+flow = Dataflow("hn_scraper")
+max_id = op.input("in", flow, HNSource(timedelta(seconds=15)))
+id_stream = op.stateful_map("range", max_id, lambda: None, get_id_stream).then(
+    op.flat_map, "strip_key_flatten", lambda key_ids: key_ids[1]).then(
+    op.redistribute, "redist")
+id_stream = op.filter_map("meta_download", id_stream, download_metadata)
+split_stream = op.branch("split_comments", id_stream, lambda item: item[1]["type"] == "story")
+story_stream = split_stream.trues
+story_stream = op.map("format_stories", story_stream, format)
+comment_stream = split_stream.falses
+comment_stream = op.map("key_on_parent", comment_stream, key_on_parent)
+comment_stream = op.map("format_comments", comment_stream, format)
+op.inspect("stories", story_stream)
+op.inspect("comments", comment_stream)
+op.output("stories-out", story_stream, ProtonSink("hn_stories", os.environ["PROTON_HOST"]))
+op.output("comments-out", comment_stream, ProtonSink("hn_comments", os.environ["PROTON_HOST"]))
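
For reference, a minimal way to try the rewritten dataflow with the standard Bytewax runner, assuming the file is saved as hn_dataflow.py (a placeholder name), the proton module providing ProtonSink is importable, and PROTON_HOST points at a reachable Proton instance:

    PROTON_HOST=localhost python -m bytewax.run hn_dataflow:flow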