From c1631fee97b589710d743ab461815839f884bbfc Mon Sep 17 00:00:00 2001 From: zmoon Date: Sat, 27 Jan 2024 15:38:20 -0700 Subject: [PATCH 01/13] Initial retry loop applied to the readers catching connection exceptions that I saw in the GH logs (don't seem to happen locally in my testing on Win...) --- uscrn/data.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/uscrn/data.py b/uscrn/data.py index e952437..91f8913 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -16,6 +16,40 @@ """Restrict how many files to load, for testing purposes.""" +def retry(func): + """Retry a function on connection error. + Up to 60 s, with Fibonacci backoff (1, 1, 2, 3, ...). + """ + import urllib + from functools import wraps + + import requests + + max_time = 60 # seconds + + @wraps(func) + def wrapper(*args, **kwargs): + from time import perf_counter_ns, sleep + + t0 = perf_counter_ns() + a, b = 1, 1 + while True: + try: + return func(*args, **kwargs) + except (urllib.error.URLError, requests.exceptions.ConnectionError): + if perf_counter_ns() - t0 > max_time * 1_000_000_000: + raise + warnings.warn( # TODO: remove or switch to logging? + f"Retrying {func.__name__} in {a} s after connection error", + stacklevel=2, + ) + sleep(a) + a, b = b, a + b # Fibonacci backoff + + return wrapper + + +@retry def load_meta(*, cat: bool = False) -> pd.DataFrame: """Load the station metadata table. @@ -98,6 +132,7 @@ def parse_url(url: str) -> _ParseRes: return parse_fp(urlsplit(url).path) +@retry def read_subhourly(fp, *, cat: bool = False) -> pd.DataFrame: """Read a subhourly USCRN file. @@ -148,6 +183,7 @@ def read_subhourly(fp, *, cat: bool = False) -> pd.DataFrame: return df +@retry def read_hourly(fp, *, cat: bool = False) -> pd.DataFrame: """Read an hourly USCRN file. @@ -194,6 +230,7 @@ def read_hourly(fp, *, cat: bool = False) -> pd.DataFrame: return df +@retry def read_daily(fp, *, cat: bool = False) -> pd.DataFrame: """Read a daily USCRN file. @@ -236,6 +273,7 @@ def read_daily(fp, *, cat: bool = False) -> pd.DataFrame: return df +@retry def read_monthly(fp, *, cat: bool = False) -> pd.DataFrame: """Read a monthly USCRN file. From 4e492311dacf8f782d927068d5861592a7108306 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sat, 27 Jan 2024 15:47:52 -0700 Subject: [PATCH 02/13] Add generous timeouts for downloading non-data text --- uscrn/attrs.py | 2 +- uscrn/data.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/uscrn/attrs.py b/uscrn/attrs.py index 5ab11ef..66082ff 100644 --- a/uscrn/attrs.py +++ b/uscrn/attrs.py @@ -306,7 +306,7 @@ def get(url: str, fp: Path) -> str: needs_update = True if needs_update: - r = requests.get(url) + r = requests.get(url, timeout=10) r.raise_for_status() with open(fp, "w") as f: f.write(r.text) diff --git a/uscrn/data.py b/uscrn/data.py index 91f8913..2c17edd 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -406,7 +406,7 @@ def get_data( # Get available years from the main page # e.g. `>2000/<` print("Discovering files...") - r = requests.get(f"{base_url}/") + r = requests.get(f"{base_url}/", timeout=10) # TODO: could cache this info like the docs r.raise_for_status() urls: list[str] if which == "monthly": @@ -438,7 +438,7 @@ def get_year_urls(year): # Get filenames from the year page # e.g. 
`>CRND0103-2020-TX_Palestine_6_WNW.txt<` url = f"{base_url}/{year}/" - r = requests.get(url) + r = requests.get(url, timeout=10) # TODO: could cache this info like the docs r.raise_for_status() fns = re.findall(r">(CRN[a-zA-Z0-9\-_]*\.txt)<", r.text) if not fns: # pragma: no cover From 6ec82f1340540fb9f39cbd2d2ac67df041fd1e95 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 10:10:53 -0700 Subject: [PATCH 03/13] Don't know when connection error will happen would need some more complex testing with mocking --- uscrn/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uscrn/data.py b/uscrn/data.py index 2c17edd..3f8bca0 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -36,7 +36,7 @@ def wrapper(*args, **kwargs): while True: try: return func(*args, **kwargs) - except (urllib.error.URLError, requests.exceptions.ConnectionError): + except (urllib.error.URLError, requests.exceptions.ConnectionError): # pragma: no cover if perf_counter_ns() - t0 > max_time * 1_000_000_000: raise warnings.warn( # TODO: remove or switch to logging? From e7786dcc30dc986dbea7881120feb15f3f7291e2 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 10:34:30 -0700 Subject: [PATCH 04/13] More retry --- uscrn/attrs.py | 5 ++++- uscrn/data.py | 52 +++++++++++++++----------------------------------- uscrn/util.py | 36 ++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 38 deletions(-) create mode 100644 uscrn/util.py diff --git a/uscrn/attrs.py b/uscrn/attrs.py index 66082ff..f3bee2a 100644 --- a/uscrn/attrs.py +++ b/uscrn/attrs.py @@ -289,15 +289,18 @@ def _get_docs( import pandas as pd import requests + from .util import retry + validate_which(which) stored_attrs = load_attrs() base_url = stored_attrs[which]["base_url"] + @retry def get(url: str, fp: Path) -> str: needs_update: bool if fp.is_file(): - r = requests.head(url) + r = requests.head(url, timeout=10) r.raise_for_status() last_modified_url = pd.Timestamp(r.headers["Last-Modified"]) last_modified_local = pd.Timestamp(fp.stat().st_mtime, unit="s", tz="UTC") diff --git a/uscrn/data.py b/uscrn/data.py index 3f8bca0..e1a2cc0 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -12,43 +12,12 @@ import pandas as pd import xarray as xr +from .util import retry + _GET_CAP: int | None = None """Restrict how many files to load, for testing purposes.""" -def retry(func): - """Retry a function on connection error. - Up to 60 s, with Fibonacci backoff (1, 1, 2, 3, ...). - """ - import urllib - from functools import wraps - - import requests - - max_time = 60 # seconds - - @wraps(func) - def wrapper(*args, **kwargs): - from time import perf_counter_ns, sleep - - t0 = perf_counter_ns() - a, b = 1, 1 - while True: - try: - return func(*args, **kwargs) - except (urllib.error.URLError, requests.exceptions.ConnectionError): # pragma: no cover - if perf_counter_ns() - t0 > max_time * 1_000_000_000: - raise - warnings.warn( # TODO: remove or switch to logging? - f"Retrying {func.__name__} in {a} s after connection error", - stacklevel=2, - ) - sleep(a) - a, b = b, a + b # Fibonacci backoff - - return wrapper - - @retry def load_meta(*, cat: bool = False) -> pd.DataFrame: """Load the station metadata table. @@ -405,19 +374,27 @@ def get_data( # Get available years from the main page # e.g. 
`>2000/<` + # TODO: could cache available years and files like the docs pages print("Discovering files...") - r = requests.get(f"{base_url}/", timeout=10) # TODO: could cache this info like the docs - r.raise_for_status() + + @retry + def get_main_list_page(): + r = requests.get(f"{base_url}/", timeout=10) + r.raise_for_status() + return r.text + + main_list_page = get_main_list_page() + urls: list[str] if which == "monthly": # No year subdirectories - fns = re.findall(r">(CRN[a-zA-Z0-9\-_]*\.txt)<", r.text) + fns = re.findall(r">(CRN[a-zA-Z0-9\-_]*\.txt)<", main_list_page) urls = [f"{base_url}/{fn}" for fn in fns] else: # Year subdirectories from multiprocessing.pool import ThreadPool - available_years: list[int] = [int(s) for s in re.findall(r">([0-9]{4})/?<", r.text)] + available_years: list[int] = [int(s) for s in re.findall(r">([0-9]{4})/?<", main_list_page)] years_: list[int] if isinstance(years, int): @@ -429,6 +406,7 @@ def get_data( if len(years_) == 0: raise ValueError("years should not be empty") + @retry def get_year_urls(year): if year not in available_years: raise ValueError( diff --git a/uscrn/util.py b/uscrn/util.py new file mode 100644 index 0000000..4b2a29f --- /dev/null +++ b/uscrn/util.py @@ -0,0 +1,36 @@ +import logging + +logger = logging.getLogger("uscrn") + + +def retry(func): + """Decorator to retry a function on web connection error. + Up to 60 s, with Fibonacci backoff (1, 1, 2, 3, ...). + """ + import urllib + from functools import wraps + + import requests + + max_time = 60 # seconds + + @wraps(func) + def wrapper(*args, **kwargs): + from time import perf_counter_ns, sleep + + t0 = perf_counter_ns() + a, b = 1, 1 + while True: + try: + return func(*args, **kwargs) + except (urllib.error.URLError, requests.exceptions.ConnectionError): # pragma: no cover + if perf_counter_ns() - t0 > max_time * 1_000_000_000: + raise + logger.info( + f"Retrying {func.__name__} in {a} s after connection error", + stacklevel=2, + ) + sleep(a) + a, b = b, a + b # Fibonacci backoff + + return wrapper From ecd8ffa94f952126c0f494d26e3f319726923f00 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 11:05:06 -0700 Subject: [PATCH 05/13] Test get-docs download --- tests/test_attrs.py | 37 +++++++++++++++++++++++++++++++++++++ uscrn/attrs.py | 6 ++++-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/tests/test_attrs.py b/tests/test_attrs.py index a825ca0..26a396f 100644 --- a/tests/test_attrs.py +++ b/tests/test_attrs.py @@ -1,12 +1,16 @@ import inspect +from contextlib import contextmanager +from pathlib import Path from typing import get_args import pytest +import uscrn from uscrn.attrs import ( _ALL_WHICHS, DEFAULT_WHICH, WHICHS, + _get_docs, _map_dtype, expand_str, expand_strs, @@ -94,6 +98,39 @@ def test_load_attrs(): } +@contextmanager +def change_cache_dir(p: Path): + default = uscrn.attrs._CACHE_DIR + uscrn.attrs._CACHE_DIR = p + try: + yield + finally: + uscrn.attrs._CACHE_DIR = default + + +def test_get_docs_dl(tmp_path, caplog): + which = "daily" + d = tmp_path + + p_headers = d / f"{which}_headers.txt" + p_readme = d / f"{which}_readme.txt" + assert not p_headers.exists() + assert not p_readme.exists() + + with change_cache_dir(d), caplog.at_level("INFO"): + _get_docs(which) + assert "downloading" in caplog.text + + assert p_headers.is_file() + assert p_readme.is_file() + + # Now should load from disk instead of web + caplog.clear() + with change_cache_dir(d), caplog.at_level("INFO"): + _get_docs(which) + assert "downloading" not in caplog.text + + def 
test_load_col_info(): get_col_info("subhourly") get_col_info("daily") diff --git a/uscrn/attrs.py b/uscrn/attrs.py index f3bee2a..a688ccf 100644 --- a/uscrn/attrs.py +++ b/uscrn/attrs.py @@ -8,6 +8,7 @@ import numpy as np HERE = Path(__file__).parent +_CACHE_DIR = HERE / "cache" def expand_str(s: str) -> list[str]: @@ -289,7 +290,7 @@ def _get_docs( import pandas as pd import requests - from .util import retry + from .util import logger, retry validate_which(which) @@ -309,6 +310,7 @@ def get(url: str, fp: Path) -> str: needs_update = True if needs_update: + logger.info(f"downloading {url} to {fp}") r = requests.get(url, timeout=10) r.raise_for_status() with open(fp, "w") as f: @@ -319,7 +321,7 @@ def get(url: str, fp: Path) -> str: return text - cache_dir = HERE / "cache" + cache_dir = _CACHE_DIR cache_dir.mkdir(exist_ok=True) headers_txt = get(f"{base_url}/headers.txt", cache_dir / f"{which}_headers.txt") From 2ccdea8333f6ab3884fbdd0f23b7162f2e0a5251 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 12:24:21 -0700 Subject: [PATCH 06/13] Add requests read timeout to catch Trying CI multiple times, got `requests.exceptions.ReadTimeout` for URL /pub/data/uscrn/products/daily01/2001/ (in `get_year_urls` with `get_data`) --- uscrn/util.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/uscrn/util.py b/uscrn/util.py index 4b2a29f..356e897 100644 --- a/uscrn/util.py +++ b/uscrn/util.py @@ -23,7 +23,11 @@ def wrapper(*args, **kwargs): while True: try: return func(*args, **kwargs) - except (urllib.error.URLError, requests.exceptions.ConnectionError): # pragma: no cover + except ( + urllib.error.URLError, + requests.exceptions.ConnectionError, + requests.exceptions.ReadTimeout, + ): # pragma: no cover if perf_counter_ns() - t0 > max_time * 1_000_000_000: raise logger.info( From 27e96a60764ee68decb1044d44d80746c50cf9c7 Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 12:32:44 -0700 Subject: [PATCH 07/13] Set timeout for pytest step in CI if many URLs are retried, could take long for now, for some runs this is < 1 min, for some, as long as 3 min --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a174f02..4ac23c9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,6 +33,7 @@ jobs: - name: Test with pytest run: pytest --cov uscrn --cov-report xml --cov-report term + timeout-minutes: 10 - name: Upload coverage to Codecov if: ${{ matrix.python-version == '3.11' }} From b6dd1cead167a94a9b61ffd3c5cd181c105c16cd Mon Sep 17 00:00:00 2001 From: zmoon Date: Sun, 28 Jan 2024 12:46:05 -0700 Subject: [PATCH 08/13] `_util.py` internal utilities --- uscrn/{util.py => _util.py} | 0 uscrn/attrs.py | 2 +- uscrn/data.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename uscrn/{util.py => _util.py} (100%) diff --git a/uscrn/util.py b/uscrn/_util.py similarity index 100% rename from uscrn/util.py rename to uscrn/_util.py diff --git a/uscrn/attrs.py b/uscrn/attrs.py index a688ccf..c2ce1ec 100644 --- a/uscrn/attrs.py +++ b/uscrn/attrs.py @@ -290,7 +290,7 @@ def _get_docs( import pandas as pd import requests - from .util import logger, retry + from ._util import logger, retry validate_which(which) diff --git a/uscrn/data.py b/uscrn/data.py index e1a2cc0..020c165 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -12,7 +12,7 @@ import pandas as pd import xarray as xr -from .util import retry +from ._util import retry _GET_CAP: int | None = None """Restrict how 
many files to load, for testing purposes.""" From 7868dd09c044b456568d32972a222f4b08366862 Mon Sep 17 00:00:00 2001 From: Zachary Moon Date: Mon, 29 Jan 2024 10:39:46 -0500 Subject: [PATCH 09/13] Remove comment the above one applies to this too --- uscrn/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uscrn/data.py b/uscrn/data.py index 020c165..b3e9a1e 100644 --- a/uscrn/data.py +++ b/uscrn/data.py @@ -416,7 +416,7 @@ def get_year_urls(year): # Get filenames from the year page # e.g. `>CRND0103-2020-TX_Palestine_6_WNW.txt<` url = f"{base_url}/{year}/" - r = requests.get(url, timeout=10) # TODO: could cache this info like the docs + r = requests.get(url, timeout=10) r.raise_for_status() fns = re.findall(r">(CRN[a-zA-Z0-9\-_]*\.txt)<", r.text) if not fns: # pragma: no cover From 78c085a7db96a7d717119fd818a0005f7aa1bbd6 Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 29 Jan 2024 10:39:07 -0700 Subject: [PATCH 10/13] Add simple test of retry deco --- tests/test_util.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 tests/test_util.py diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..2adde74 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,20 @@ +from uscrn._util import retry + + +def test_retry(caplog): + from requests.exceptions import ConnectionError + + n = 0 + + @retry + def func(): + nonlocal n + if n < 2: + n += 1 + raise ConnectionError + + with caplog.at_level("INFO"): + func() + + for r in caplog.records: + assert r.getMessage() == "Retrying func in 1 s after connection error" From 2158e60272a716ba5f9d29614d4aed5bd1ccc26a Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 29 Jan 2024 10:42:21 -0700 Subject: [PATCH 11/13] Add result --- tests/test_util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_util.py b/tests/test_util.py index 2adde74..66c872c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -12,9 +12,12 @@ def func(): if n < 2: n += 1 raise ConnectionError + return "result" with caplog.at_level("INFO"): - func() + res = func() for r in caplog.records: assert r.getMessage() == "Retrying func in 1 s after connection error" + + assert res == "result" From 9401568bbe3683d656f4d0ed8d855a4bd237f184 Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 29 Jan 2024 10:47:29 -0700 Subject: [PATCH 12/13] Now covered --- uscrn/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uscrn/_util.py b/uscrn/_util.py index 356e897..4d4ae2a 100644 --- a/uscrn/_util.py +++ b/uscrn/_util.py @@ -27,7 +27,7 @@ def wrapper(*args, **kwargs): urllib.error.URLError, requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout, - ): # pragma: no cover + ): if perf_counter_ns() - t0 > max_time * 1_000_000_000: raise logger.info( From c8ced8e73ffdfa85a67514c7b78cd545702d254d Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 29 Jan 2024 10:59:58 -0700 Subject: [PATCH 13/13] Not currently testing max time until making it configurable so it can be shorter for the test --- uscrn/_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/uscrn/_util.py b/uscrn/_util.py index 4d4ae2a..8b4d672 100644 --- a/uscrn/_util.py +++ b/uscrn/_util.py @@ -12,7 +12,7 @@ def retry(func): import requests - max_time = 60 # seconds + max_time = 60_000_000_000 # 60 s (in ns) @wraps(func) def wrapper(*args, **kwargs): @@ -28,7 +28,7 @@ def wrapper(*args, **kwargs): requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout, ): - if 
perf_counter_ns() - t0 > max_time * 1_000_000_000: + if perf_counter_ns() - t0 > max_time: # pragma: no cover raise logger.info( f"Retrying {func.__name__} in {a} s after connection error",
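Note: for reference, here is roughly where uscrn/_util.py ends up after PATCH 13, assembled from the hunks above. This is a reading aid, not another patch; the only deviation is importing urllib.error explicitly so the snippet stands alone.

import logging

logger = logging.getLogger("uscrn")


def retry(func):
    """Decorator to retry a function on web connection error.
    Up to 60 s, with Fibonacci backoff (1, 1, 2, 3, ...).
    """
    import urllib.error  # explicit submodule import so the snippet stands alone
    from functools import wraps

    import requests

    max_time = 60_000_000_000  # 60 s (in ns)

    @wraps(func)
    def wrapper(*args, **kwargs):
        from time import perf_counter_ns, sleep

        t0 = perf_counter_ns()
        a, b = 1, 1
        while True:
            try:
                return func(*args, **kwargs)
            except (
                urllib.error.URLError,
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout,
            ):
                if perf_counter_ns() - t0 > max_time:  # pragma: no cover
                    raise
                logger.info(
                    f"Retrying {func.__name__} in {a} s after connection error",
                    stacklevel=2,
                )
                sleep(a)
                a, b = b, a + b  # Fibonacci backoff

    return wrapper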
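On PATCH 13's point about making the max time configurable so the give-up path can be tested: one possible shape (an assumption, not part of this series) is a decorator factory with a max_time argument, sketched below. Call sites would change from @retry to @retry() or @retry(max_time=...), which is the main cost of this form.

import logging
import time
import urllib.error
from functools import wraps

import pytest
import requests

logger = logging.getLogger("uscrn")


def retry(max_time=60):
    """Return a decorator that retries on connection errors for up to ``max_time`` seconds."""

    def deco(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            t0 = time.perf_counter_ns()
            a, b = 1, 1  # Fibonacci backoff (1, 1, 2, 3, ...)
            while True:
                try:
                    return func(*args, **kwargs)
                except (
                    urllib.error.URLError,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.ReadTimeout,
                ):
                    if time.perf_counter_ns() - t0 > max_time * 1_000_000_000:
                        raise
                    logger.info(f"Retrying {func.__name__} in {a} s after connection error")
                    time.sleep(a)
                    a, b = b, a + b

        return wrapper

    return deco


def test_retry_gives_up():
    """With a zero budget, the first connection error propagates immediately."""

    @retry(max_time=0)
    def always_fails():
        raise requests.exceptions.ConnectionError

    with pytest.raises(requests.exceptions.ConnectionError):
        always_fails()

A lighter-touch alternative would be hoisting max_time to a module-level constant that a test monkeypatches, which keeps the plain @retry call sites unchanged.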
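And on PATCH 03's note that exercising retry against real connection failures would need mocking: a possible shape could simulate a flaky requests.get around a small decorated function, as sketched below. fetch_text is an illustrative stand-in, not one of the actual readers; only uscrn._util.retry is real.

from unittest import mock

import requests

from uscrn._util import retry


@retry
def fetch_text(url):
    """Illustrative stand-in for a reader that hits the network via requests."""
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    return r.text


def test_fetch_text_retries_then_succeeds():
    ok = mock.Mock(text="hello")  # raise_for_status() is auto-mocked, so no error is raised
    flaky_get = mock.patch(
        "requests.get",
        side_effect=[requests.exceptions.ConnectionError, ok],  # fail once, then succeed
    )
    no_sleep = mock.patch("time.sleep")  # skip the 1 s backoff wait in the test
    with flaky_get as fake_get, no_sleep:
        assert fetch_text("https://example.invalid/data.txt") == "hello"
    assert fake_get.call_count == 2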