From c1631fee97b589710d743ab461815839f884bbfc Mon Sep 17 00:00:00 2001 From: zmoon Date: Sat, 27 Jan 2024 15:38:20 -0700 Subject: [PATCH 01/13] Initial retry loop applied to the readers catching connection exceptions that I saw in the GH logs (don't seem to happen locally in my testing on Win...) --- uscrn/data.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/uscrn/data.py b/uscrn/data.py index e952437..91f8913 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -16,6 +16,40 @@ """Restrict how many files to load, for testing purposes.""" +def retry(func): + """Retry a function on connection error. + Up to 60 s, with Fibonacci backoff (1, 1, 2, 3, ...). + """ + import urllib + from functools import wraps + + import requests + + max_time = 60 # seconds + + @wraps(func) + def wrapper(*args, **kwargs): + from time import perf_counter_ns, sleep + + t0 = perf_counter_ns() + a, b = 1, 1 + while True: + try: + return func(*args, **kwargs) + except (urllib.error.URLError, requests.exceptions.ConnectionError): + if perf_counter_ns() - t0 > max_time * 1_000_000_000: + raise + warnings.warn( # TODO: remove or switch to logging? + f"Retrying {func.__name__} in {a} s after connection error", + stacklevel=2, + ) + sleep(a) + a, b = b, a + b # Fibonacci backoff + + return wrapper + + +@retry def load_meta(*, cat: bool = False) -> pd.DataFrame: """Load the station metadata table. @@ -98,6 +132,7 @@ def parse_url(url: str) -> _ParseRes: return parse_fp(urlsplit(url).path) +@retry def read_subhourly(fp, *, cat: bool = False) -> pd.DataFrame: """Read a subhourly USCRN file. @@ -148,6 +183,7 @@ def read_subhourly(fp, *, cat: bool = False) -> pd.DataFrame: return df +@retry def read_hourly(fp, *, cat: bool = False) -> pd.DataFrame: """Read an hourly USCRN file. @@ -194,6 +230,7 @@ def read_hourly(fp, *, cat: bool = False) -> pd.DataFrame: return df +@retry def read_daily(fp, *, cat: bool = False) -> pd.DataFrame: """Read a daily USCRN file. @@ -236,6 +273,7 @@ def read_daily(fp, *, cat: bool = False) -> pd.DataFrame: return df +@retry def read_monthly(fp, *, cat: bool = False) -> pd.DataFrame: """Read a monthly USCRN file. From 4e492311dacf8f782d927068d5861592a7108306 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sat, 27 Jan 2024 15:47:52 -0700 Subject: [PATCH 02/13] Add generous timeouts for downloading non-data text --- uscrn/attrs.py | 2 +- uscrn/data.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/uscrn/attrs.py b/uscrn/attrs.py index 5ab11ef..66082ff 100644 --- a/uscrn/attrs.py +++ b/uscrn/attrs.py @@ -306,7 +306,7 @@ def get(url: str, fp: Path) -> str: needs_update = True if needs_update: - r = requests.get(url) + r = requests.get(url, timeout=10) r.raise_for_status() with open(fp, "w") as f: f.write(r.text) diff --git a/uscrn/data.py b/uscrn/data.py index 91f8913..2c17edd 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -406,7 +406,7 @@ def get_data( # Get available years from the main page # e.g. `>2000/<` print("Discovering files...") - r = requests.get(f"{base_url}/") + r = requests.get(f"{base_url}/", timeout=10) # TODO: could cache this info like the docs r.raise_for_status() urls: list[str] if which == "monthly": @@ -438,7 +438,7 @@ def get_year_urls(year): # Get filenames from the year page # e.g. 
`>CRND0103-2020-TX_Palestine_6_WNW.txt<` url = f"{base_url}/{year}/" - r = requests.get(url) + r = requests.get(url, timeout=10) # TODO: could cache this info like the docs r.raise_for_status() fns = re.findall(r">(CRN[a-zA-Z0-9\-_]*\.txt)<", r.text) if not fns: # pragma: no cover From 6ec82f1340540fb9f39cbd2d2ac67df041fd1e95 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 10:10:53 -0700 Subject: [PATCH 03/13] Don't know when connection error will happen would need some more complex testing with mocking --- uscrn/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uscrn/data.py b/uscrn/data.py index 2c17edd..3f8bca0 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -36,7 +36,7 @@ def wrapper(*args, **kwargs): while True: try: return func(*args, **kwargs) - except (urllib.error.URLError, requests.exceptions.ConnectionError): + except (urllib.error.URLError, requests.exceptions.ConnectionError): # pragma: no cover if perf_counter_ns() - t0 > max_time * 1_000_000_000: raise warnings.warn( # TODO: remove or switch to logging? From e7786dcc30dc986dbea7881120feb15f3f7291e2 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 10:34:30 -0700 Subject: [PATCH 04/13] More retry --- uscrn/attrs.py | 5 ++++- uscrn/data.py | 52 +++++++++++++++----------------------------------- uscrn/util.py | 36 ++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 38 deletions(-) create mode 100644 uscrn/util.py diff --git a/uscrn/attrs.py b/uscrn/attrs.py index 66082ff..f3bee2a 100644 --- a/uscrn/attrs.py +++ b/uscrn/attrs.py @@ -289,15 +289,18 @@ def _get_docs( import pandas as pd import requests + from .util import retry + validate_which(which) stored_attrs = load_attrs() base_url = stored_attrs[which]["base_url"] + @retry def get(url: str, fp: Path) -> str: needs_update: bool if fp.is_file(): - r = requests.head(url) + r = requests.head(url, timeout=10) r.raise_for_status() last_modified_url = pd.Timestamp(r.headers["Last-Modified"]) last_modified_local = pd.Timestamp(fp.stat().st_mtime, unit="s", tz="UTC") diff --git a/uscrn/data.py b/uscrn/data.py index 3f8bca0..e1a2cc0 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -12,43 +12,12 @@ import pandas as pd import xarray as xr +from .util import retry + _GET_CAP: int | None = None """Restrict how many files to load, for testing purposes.""" -def retry(func): - """Retry a function on connection error. - Up to 60 s, with Fibonacci backoff (1, 1, 2, 3, ...). - """ - import urllib - from functools import wraps - - import requests - - max_time = 60 # seconds - - @wraps(func) - def wrapper(*args, **kwargs): - from time import perf_counter_ns, sleep - - t0 = perf_counter_ns() - a, b = 1, 1 - while True: - try: - return func(*args, **kwargs) - except (urllib.error.URLError, requests.exceptions.ConnectionError): # pragma: no cover - if perf_counter_ns() - t0 > max_time * 1_000_000_000: - raise - warnings.warn( # TODO: remove or switch to logging? - f"Retrying {func.__name__} in {a} s after connection error", - stacklevel=2, - ) - sleep(a) - a, b = b, a + b # Fibonacci backoff - - return wrapper - - @retry def load_meta(*, cat: bool = False) -> pd.DataFrame: """Load the station metadata table. @@ -405,19 +374,27 @@ def get_data( # Get available years from the main page # e.g. 
`>2000/<` + # TODO: could cache available years and files like the docs pages print("Discovering files...") - r = requests.get(f"{base_url}/", timeout=10) # TODO: could cache this info like the docs - r.raise_for_status() + + @retry + def get_main_list_page(): + r = requests.get(f"{base_url}/", timeout=10) + r.raise_for_status() + return r.text + + main_list_page = get_main_list_page() + urls: list[str] if which == "monthly": # No year subdirectories - fns = re.findall(r">(CRN[a-zA-Z0-9\-_]*\.txt)<", r.text) + fns = re.findall(r">(CRN[a-zA-Z0-9\-_]*\.txt)<", main_list_page) urls = [f"{base_url}/{fn}" for fn in fns] else: # Year subdirectories from multiprocessing.pool import ThreadPool - available_years: list[int] = [int(s) for s in re.findall(r">([0-9]{4})/?<", r.text)] + available_years: list[int] = [int(s) for s in re.findall(r">([0-9]{4})/?<", main_list_page)] years_: list[int] if isinstance(years, int): @@ -429,6 +406,7 @@ def get_data( if len(years_) == 0: raise ValueError("years should not be empty") + @retry def get_year_urls(year): if year not in available_years: raise ValueError( diff --git a/uscrn/util.py b/uscrn/util.py new file mode 100644 index 0000000..4b2a29f --- /dev/null +++ b/uscrn/util.py @@ -0,0 +1,36 @@ +import logging + +logger = logging.getLogger("uscrn") + + +def retry(func): + """Decorator to retry a function on web connection error. + Up to 60 s, with Fibonacci backoff (1, 1, 2, 3, ...). + """ + import urllib + from functools import wraps + + import requests + + max_time = 60 # seconds + + @wraps(func) + def wrapper(*args, **kwargs): + from time import perf_counter_ns, sleep + + t0 = perf_counter_ns() + a, b = 1, 1 + while True: + try: + return func(*args, **kwargs) + except (urllib.error.URLError, requests.exceptions.ConnectionError): # pragma: no cover + if perf_counter_ns() - t0 > max_time * 1_000_000_000: + raise + logger.info( + f"Retrying {func.__name__} in {a} s after connection error", + stacklevel=2, + ) + sleep(a) + a, b = b, a + b # Fibonacci backoff + + return wrapper From ecd8ffa94f952126c0f494d26e3f319726923f00 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 11:05:06 -0700 Subject: [PATCH 05/13] Test get-docs download --- tests/test_attrs.py | 37 +++++++++++++++++++++++++++++++++++++ uscrn/attrs.py | 6 ++++-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/tests/test_attrs.py b/tests/test_attrs.py index a825ca0..26a396f 100644 --- a/tests/test_attrs.py +++ b/tests/test_attrs.py @@ -1,12 +1,16 @@ import inspect +from contextlib import contextmanager +from pathlib import Path from typing import get_args import pytest +import uscrn from uscrn.attrs import ( _ALL_WHICHS, DEFAULT_WHICH, WHICHS, + _get_docs, _map_dtype, expand_str, expand_strs, @@ -94,6 +98,39 @@ def test_load_attrs(): } +@contextmanager +def change_cache_dir(p: Path): + default = uscrn.attrs._CACHE_DIR + uscrn.attrs._CACHE_DIR = p + try: + yield + finally: + uscrn.attrs._CACHE_DIR = default + + +def test_get_docs_dl(tmp_path, caplog): + which = "daily" + d = tmp_path + + p_headers = d / f"{which}_headers.txt" + p_readme = d / f"{which}_readme.txt" + assert not p_headers.exists() + assert not p_readme.exists() + + with change_cache_dir(d), caplog.at_level("INFO"): + _get_docs(which) + assert "downloading" in caplog.text + + assert p_headers.is_file() + assert p_readme.is_file() + + # Now should load from disk instead of web + caplog.clear() + with change_cache_dir(d), caplog.at_level("INFO"): + _get_docs(which) + assert "downloading" not in caplog.text + + def 
test_load_col_info(): get_col_info("subhourly") get_col_info("daily") diff --git a/uscrn/attrs.py b/uscrn/attrs.py index f3bee2a..a688ccf 100644 --- a/uscrn/attrs.py +++ b/uscrn/attrs.py @@ -8,6 +8,7 @@ import numpy as np HERE = Path(__file__).parent +_CACHE_DIR = HERE / "cache" def expand_str(s: str) -> list[str]: @@ -289,7 +290,7 @@ def _get_docs( import pandas as pd import requests - from .util import retry + from .util import logger, retry validate_which(which) @@ -309,6 +310,7 @@ def get(url: str, fp: Path) -> str: needs_update = True if needs_update: + logger.info(f"downloading {url} to {fp}") r = requests.get(url, timeout=10) r.raise_for_status() with open(fp, "w") as f: @@ -319,7 +321,7 @@ def get(url: str, fp: Path) -> str: return text - cache_dir = HERE / "cache" + cache_dir = _CACHE_DIR cache_dir.mkdir(exist_ok=True) headers_txt = get(f"{base_url}/headers.txt", cache_dir / f"{which}_headers.txt") From 2ccdea8333f6ab3884fbdd0f23b7162f2e0a5251 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 12:24:21 -0700 Subject: [PATCH 06/13] Add requests read timeout to catch Trying CI multiple times, got `requests.exceptions.ReadTimeout` for URL /pub/data/uscrn/products/daily01/2001/ (in `get_year_urls` with `get_data`) --- uscrn/util.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/uscrn/util.py b/uscrn/util.py index 4b2a29f..356e897 100644 --- a/uscrn/util.py +++ b/uscrn/util.py @@ -23,7 +23,11 @@ def wrapper(*args, **kwargs): while True: try: return func(*args, **kwargs) - except (urllib.error.URLError, requests.exceptions.ConnectionError): # pragma: no cover + except ( + urllib.error.URLError, + requests.exceptions.ConnectionError, + requests.exceptions.ReadTimeout, + ): # pragma: no cover if perf_counter_ns() - t0 > max_time * 1_000_000_000: raise logger.info( From 27e96a60764ee68decb1044d44d80746c50cf9c7 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 12:32:44 -0700 Subject: [PATCH 07/13] Set timeout for pytest step in CI if many URLs are retried, could take long for now, for some runs this is < 1 min, for some, as long as 3 min --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a174f02..4ac23c9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,6 +33,7 @@ jobs: - name: Test with pytest run: pytest --cov uscrn --cov-report xml --cov-report term + timeout-minutes: 10 - name: Upload coverage to Codecov if: ${{ matrix.python-version == '3.11' }} From b6dd1cead167a94a9b61ffd3c5cd181c105c16cd Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 12:46:05 -0700 Subject: [PATCH 08/13] `_util.py` internal utilities --- uscrn/{util.py => _util.py} | 0 uscrn/attrs.py | 2 +- uscrn/data.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename uscrn/{util.py => _util.py} (100%) diff --git a/uscrn/util.py b/uscrn/_util.py similarity index 100% rename from uscrn/util.py rename to uscrn/_util.py diff --git a/uscrn/attrs.py b/uscrn/attrs.py index a688ccf..c2ce1ec 100644 --- a/uscrn/attrs.py +++ b/uscrn/attrs.py @@ -290,7 +290,7 @@ def _get_docs( import pandas as pd import requests - from .util import logger, retry + from ._util import logger, retry validate_which(which) diff --git a/uscrn/data.py b/uscrn/data.py index e1a2cc0..020c165 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -12,7 +12,7 @@ import pandas as pd import xarray as xr -from .util import retry +from ._util import retry _GET_CAP: int | None = None """Restrict how 
many files to load, for testing purposes.""" From 7868dd09c044b456568d32972a222f4b08366862 Mon Sep 17 00:00:00 2001 From: Zachary Moon Date: Mon, 29 Jan 2024 10:39:46 -0500 Subject: [PATCH 09/13] Remove comment the above one applies to this too --- uscrn/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uscrn/data.py b/uscrn/data.py index 020c165..b3e9a1e 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -416,7 +416,7 @@ def get_year_urls(year): # Get filenames from the year page # e.g. `>CRND0103-2020-TX_Palestine_6_WNW.txt<` url = f"{base_url}/{year}/" - r = requests.get(url, timeout=10) # TODO: could cache this info like the docs + r = requests.get(url, timeout=10) r.raise_for_status() fns = re.findall(r">(CRN[a-zA-Z0-9\-_]*\.txt)<", r.text) if not fns: # pragma: no cover From 78c085a7db96a7d717119fd818a0005f7aa1bbd6 Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 29 Jan 2024 10:39:07 -0700 Subject: [PATCH 10/13] Add simple test of retry deco --- tests/test_util.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 tests/test_util.py diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..2adde74 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,20 @@ +from uscrn._util import retry + + +def test_retry(caplog): + from requests.exceptions import ConnectionError + + n = 0 + + @retry + def func(): + nonlocal n + if n < 2: + n += 1 + raise ConnectionError + + with caplog.at_level("INFO"): + func() + + for r in caplog.records: + assert r.getMessage() == "Retrying func in 1 s after connection error" From 2158e60272a716ba5f9d29614d4aed5bd1ccc26a Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 29 Jan 2024 10:42:21 -0700 Subject: [PATCH 11/13] Add result --- tests/test_util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_util.py b/tests/test_util.py index 2adde74..66c872c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -12,9 +12,12 @@ def func(): if n < 2: n += 1 raise ConnectionError + return "result" with caplog.at_level("INFO"): - func() + res = func() for r in caplog.records: assert r.getMessage() == "Retrying func in 1 s after connection error" + + assert res == "result" From 9401568bbe3683d656f4d0ed8d855a4bd237f184 Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 29 Jan 2024 10:47:29 -0700 Subject: [PATCH 12/13] Now covered --- uscrn/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uscrn/_util.py b/uscrn/_util.py index 356e897..4d4ae2a 100644 --- a/uscrn/_util.py +++ b/uscrn/_util.py @@ -27,7 +27,7 @@ def wrapper(*args, **kwargs): urllib.error.URLError, requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout, - ): # pragma: no cover + ): if perf_counter_ns() - t0 > max_time * 1_000_000_000: raise logger.info( From c8ced8e73ffdfa85a67514c7b78cd545702d254d Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 29 Jan 2024 10:59:58 -0700 Subject: [PATCH 13/13] Not currently testing max time until making it configurable so it can be shorter for the test --- uscrn/_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/uscrn/_util.py b/uscrn/_util.py index 4d4ae2a..8b4d672 100644 --- a/uscrn/_util.py +++ b/uscrn/_util.py @@ -12,7 +12,7 @@ def retry(func): import requests - max_time = 60 # seconds + max_time = 60_000_000_000 # 60 s (in ns) @wraps(func) def wrapper(*args, **kwargs): @@ -28,7 +28,7 @@ def wrapper(*args, **kwargs): requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout, ): - if 
perf_counter_ns() - t0 > max_time * 1_000_000_000: + if perf_counter_ns() - t0 > max_time: # pragma: no cover raise logger.info( f"Retrying {func.__name__} in {a} s after connection error",
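Note: for reference, here is roughly where uscrn/_util.py ends up after PATCH 13, assembled from the hunks above. This is a reading aid, not another patch; the only deviation is importing urllib.error explicitly so the snippet stands alone.

import logging

logger = logging.getLogger("uscrn")


def retry(func):
    """Decorator to retry a function on web connection error.
    Up to 60 s, with Fibonacci backoff (1, 1, 2, 3, ...).
    """
    import urllib.error  # explicit submodule import so the snippet stands alone
    from functools import wraps

    import requests

    max_time = 60_000_000_000  # 60 s (in ns)

    @wraps(func)
    def wrapper(*args, **kwargs):
        from time import perf_counter_ns, sleep

        t0 = perf_counter_ns()
        a, b = 1, 1
        while True:
            try:
                return func(*args, **kwargs)
            except (
                urllib.error.URLError,
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout,
            ):
                if perf_counter_ns() - t0 > max_time:  # pragma: no cover
                    raise
                logger.info(
                    f"Retrying {func.__name__} in {a} s after connection error",
                    stacklevel=2,
                )
                sleep(a)
                a, b = b, a + b  # Fibonacci backoff

    return wrapper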
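On PATCH 13's point about making the max time configurable so the give-up path can be tested: one possible shape (an assumption, not part of this series) is a decorator factory with a max_time argument, sketched below. Call sites would change from @retry to @retry() or @retry(max_time=...), which is the main cost of this form.

import logging
import time
import urllib.error
from functools import wraps

import pytest
import requests

logger = logging.getLogger("uscrn")


def retry(max_time=60):
    """Return a decorator that retries on connection errors for up to ``max_time`` seconds."""

    def deco(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            t0 = time.perf_counter_ns()
            a, b = 1, 1  # Fibonacci backoff (1, 1, 2, 3, ...)
            while True:
                try:
                    return func(*args, **kwargs)
                except (
                    urllib.error.URLError,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.ReadTimeout,
                ):
                    if time.perf_counter_ns() - t0 > max_time * 1_000_000_000:
                        raise
                    logger.info(f"Retrying {func.__name__} in {a} s after connection error")
                    time.sleep(a)
                    a, b = b, a + b

        return wrapper

    return deco


def test_retry_gives_up():
    """With a zero budget, the first connection error propagates immediately."""

    @retry(max_time=0)
    def always_fails():
        raise requests.exceptions.ConnectionError

    with pytest.raises(requests.exceptions.ConnectionError):
        always_fails()

A lighter-touch alternative would be hoisting max_time to a module-level constant that a test monkeypatches, which keeps the plain @retry call sites unchanged.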
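And on PATCH 03's note that exercising retry against real connection failures would need mocking: a possible shape could simulate a flaky requests.get around a small decorated function, as sketched below. fetch_text is an illustrative stand-in, not one of the actual readers; only uscrn._util.retry is real.

from unittest import mock

import requests

from uscrn._util import retry


@retry
def fetch_text(url):
    """Illustrative stand-in for a reader that hits the network via requests."""
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    return r.text


def test_fetch_text_retries_then_succeeds():
    ok = mock.Mock(text="hello")  # raise_for_status() is auto-mocked, so no error is raised
    flaky_get = mock.patch(
        "requests.get",
        side_effect=[requests.exceptions.ConnectionError, ok],  # fail once, then succeed
    )
    no_sleep = mock.patch("time.sleep")  # skip the 1 s backoff wait in the test
    with flaky_get as fake_get, no_sleep:
        assert fetch_text("https://example.invalid/data.txt") == "hello"
    assert fake_get.call_count == 2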