
frequencies: Fix pivot logic to always include the end date #1121

Merged: 7 commits, Jan 19, 2023
5 changes: 5 additions & 0 deletions CHANGES.md
@@ -2,6 +2,11 @@

## __NEXT__

### Major Changes

* frequencies: Changes the logic for calculating the time points when frequencies are estimated to ensure that the user-provided "end date" is always included. This change in the behavior of the frequencies command fixes a bug where large intervals between time points (e.g., 3 months) could cause recent data to be omitted from frequency calculations. See the pull request for more details, including the scientific implications of this bug. [#1121][] (@huddlej)

[#1121]: https://github.com/nextstrain/augur/pull/1121

## 19.3.0 (19 January 2023)

36 changes: 26 additions & 10 deletions augur/frequency_estimators.py
@@ -1,14 +1,17 @@
# estimates clade frequencies
from __future__ import division, print_function
from collections import defaultdict
from collections import defaultdict, deque
import datetime
import isodate
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
from scipy.stats import norm
import sys
import time

from .dates import numeric_date

debug = False
log_thres = 10.0

@@ -24,7 +27,9 @@ def get_pivots(observations, pivot_interval, start_date=None, end_date=None, piv
interval between pivots.

Start and end pivots will be based on the range of given observed dates,
unless a start or end date are provided to override these defaults.
unless a start or end date are provided to override these defaults. Pivots
include the end date and, where possible for the given interval, the start
date.

Parameters
----------
@@ -54,18 +59,29 @@ def get_pivots(observations, pivot_interval, start_date=None, end_date=None, piv
pivot_end = end_date if end_date else np.ceil(np.max(observations) / pivot_frequency) * pivot_frequency

if pivot_interval_units == "months":
offset = "%sMS" % pivot_interval
duration_str = f'P{pivot_interval}M'
elif pivot_interval_units == "weeks":
offset = "%sW" % pivot_interval
duration_str = f'P{pivot_interval}W'
else:
raise ValueError(f"The given interval unit '{pivot_interval_units}' is not supported.")

datetime_pivots = pd.date_range(
float_to_datestring(pivot_start),
float_to_datestring(pivot_end),
freq = offset
)
pivots = np.array([timestamp_to_float(pivot) for pivot in datetime_pivots])
# Construct standard Python date instances from numeric dates via ISO-8601
# dates and the corresponding delta time for the interval between pivots.
start = datetime.datetime.strptime(float_to_datestring(pivot_start), "%Y-%m-%d")
end = datetime.datetime.strptime(float_to_datestring(pivot_end), "%Y-%m-%d")
Member:

Above, pivot_end is defined as a floating point (not changed in this PR), but doesn't this defeat the entire purpose of doing month/week increments to maintain human-readable dates? And if the last data point is on Jan 2nd and the interval is 3 months, the pivot_end is 3 months into the year.

What if we instead did something like this:

last_data_point = pd.Timestamp(datetime_from_numeric(end_date or np.max(observations))).ceil(freq='D')

if pivot_interval_units == 'months':
    while not last_data_point.is_month_end:
        last_data_point += pd.Timedelta(days=1)
elif pivot_interval_units == 'weeks':
    # Advance to the next Sunday (weekday 6).
    while last_data_point.weekday() != 6:
        last_data_point += pd.Timedelta(days=1)

pivot_end = last_data_point

Member:

This would put the end point to the next full month or week after the last data point.

Contributor Author @huddlej, Jan 11, 2023:

Above, pivot_end is defined as a floating point (not changed in this PR), but doesn't this defeat the entire purpose of doing month/week increments to maintain human-readable dates?

The floating point representation of pivot_start and pivot_end is just an intermediate state as far as date calculations are concerned. The user can provide a human-readable (ISO-8601) date from the command line, it gets turned into a floating point value for historical reasons, but then it gets converted back to ISO-8601 before performing date math. pivot_end and pivot_start could be defined as human-readable (ISO-8601) dates from the start and only lead to floating point values at the end of the function. The floating point values returned by get_pivots can be converted back to the expected ISO-8601 dates with float_to_datestring. The benefit of the ISO-8601 representation is that we can perform standard date math like:

from datetime import date
from dateutil.relativedelta import relativedelta
from augur.frequency_estimators import get_pivots, float_to_datestring
from augur.dates import numeric_date

# Find one month prior to March 6, 2023 with standard date logic.
# This should be February 6, 2023.
>>> date(2023, 3, 6) - relativedelta(months=1)
datetime.date(2023, 2, 6)

# Find one month prior to March 6, 2023 with floating point logic.
# What is "one month" in floating point values? If it is 1 / 12, we get the following result.
>>> float_to_datestring(numeric_date("2023-03-06") - (1 / 12))
'2023-02-04'

This is obviously only a problem for months since they don't have the same number of days. Week intervals are consistent between standard date math and floating point math.

And if the last data point is on Jan 2nd and the interval is 3 months, the pivot_end is 3 months into the year.

Ah, that's right and not great. With a single observation from that date, you'd get:

>>> [float_to_datestring(pivot) for pivot in get_pivots([numeric_date("2023-01-02")], 3)]
['2023-01-02', '2023-04-02']

We usually run augur frequencies with an end date of the current day, though. In the current PR, if you had a single observation from Jan 2 and an end date of today, you'd get:

>>> [float_to_datestring(pivot) for pivot in get_pivots([numeric_date("2023-01-02")], 3, None, numeric_date("2023-01-11"))]
['2023-01-11']

But in the current implementation in master, you'd get:

>>> [float_to_datestring(pivot) for pivot in get_pivots([numeric_date("2023-01-02")], 3, None, numeric_date("2023-01-11"))]
['2023-01-01']

and throw out your single observation. For a monthly interval, your code example above would produce:

>>> import datetime
>>> import numpy as np
>>> import pandas as pd
>>> from treetime.utils import datetime_from_numeric
>>> from augur.dates import numeric_date
>>> end_date = None
>>> observations = [numeric_date("2023-01-02")]
>>> last_data_point = pd.Timestamp(datetime_from_numeric(end_date or np.max(observations))).ceil(freq='D')
>>> while not last_data_point.is_month_end:
...     last_data_point += pd.Timedelta(days=1)
>>> last_data_point
Timestamp('2023-01-31 00:00:00')

Since the pivots slice through a specific time in the KDE distributions instead of summing over a time interval, is there a benefit to moving the last pivot any later than we have data instead of using the latest observation date as the end date?

Contributor Author:

Also, I didn't realize TreeTime already had datetime_from_numeric and datestring_from_numeric. We could just use the latter instead of Augur's float_to_datestring.

Member:

Thanks, John, and you are right: the end_date solves this problem in most cases. But it also means that our monthly pivots are not aligned with the beginning or end of a month. If that's OK, we might just do month=30 days and week=7 days.

Member:

Pivots for KDE frequencies should never extend more than the narrow width beyond the last time window where there is more or less representative data. They would be too heavily influenced by fluctuations in that case. This is less of an issue for the diffusion frequencies.

Contributor Author:

But it also means that our monthly pivots are not aligned with the beginning or end of a month.

That alignment to the beginning of a month was the original goal of the pandas-based implementation, but it is a pretty arbitrary goal. Even representing intervals by month feels arbitrary when "month" doesn't have a fixed value. The simpler approach of using weeks is appealing without needing to cast "month" to a fixed number of days. I would be happy to switch flu pivots to every 4 weeks, for instance.

Pivots for KDE frequencies should never extend more than the narrow width beyond the last time window where there is more or less representative data.

This also makes a lot of sense. Estimating KDE frequencies at a date beyond the most recent data is an error in the opposite direction of the original bug addressed in this PR. By that logic, we should change the following logic from:

pivot_start = start_date if start_date else np.floor(np.min(observations) / pivot_frequency) * pivot_frequency
pivot_end = end_date if end_date else np.ceil(np.max(observations) / pivot_frequency) * pivot_frequency

to always use a min/max date based on the earliest and latest sample, instead of expanding the range of pivots:

pivot_start = start_date if start_date else np.min(observations)
pivot_end = end_date if end_date else np.max(observations)

Our tree can't plot any dates beyond the most recent tip, so it makes sense that frequencies shouldn't extend beyond the most recent tip either. The code I propose above doesn't account for our most common use case, where we do provide an "end date" that is usually today even though the most recent data were not collected today. Providing an "end date" only makes sense in the context of retrospective work like the flu forecasting project, where we want to estimate frequencies in well-defined windows across existing data.

Providing an earlier "start date" makes sense, though, since it allows the pivots to include all available data. The code above could produce pivots that don't include the earliest observation, depending on the pivot interval and end date. So, we probably want something like the following, which keeps a start date earlier than the minimum observation but uses the maximum observation as the end date.

pivot_start = start_date if start_date else np.floor(np.min(observations) / pivot_frequency) * pivot_frequency
pivot_end = end_date if end_date else np.max(observations)

The existing changes in this PR will ensure that the end date is always the last pivot.
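The difference between the two defaults discussed above can be sketched numerically. This is a minimal illustration, assuming (as in the snippets above) that numeric dates are fractions of a year and that `pivot_frequency` for a 3-month interval is `3 / 12`:

```python
import numpy as np

# Assumption: a 3-month interval expressed as a fraction of a year.
pivot_frequency = 3 / 12
observations = np.array([2023.003])  # roughly Jan 2, 2023 as a numeric date

# Old default: round the latest observation outward to a whole interval,
# which can push the last pivot months past the data.
old_end = np.ceil(observations.max() / pivot_frequency) * pivot_frequency

# Proposed default: stop at the latest observation itself.
new_end = observations.max()

print(old_end)  # 2023.25, i.e. three months past the single observation
print(new_end)  # 2023.003
```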

Contributor Author:

There is more work we could do in this area, but I think we should leave treatment of observed data (without start and end dates) to another PR. This PR fixes a bug that immediately affects our flu work, so I'd like to merge and release it soon. I documented this conversation as a new issue.

delta = isodate.parse_duration(duration_str)

# Start calculating pivots from the end date (inclusive), working backwards
# in time by a time delta that matches the user-requested interval. Include
# the start date in the final pivots when the interval spacing allows that
# date to be included.
pivots = deque([])
pivot = end
while pivot >= start:
pivots.appendleft(pivot)
pivot = pivot - delta

pivots = np.array([numeric_date(pivot) for pivot in pivots])

return np.around(pivots, 4)
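The backwards-iteration approach in the diff above can be sketched in isolation. The real implementation parses ISO-8601 durations with `isodate` so that month intervals use calendar math; this standalone sketch uses only the standard library and weekly intervals to show the key property, namely that the end date is always the last pivot:

```python
from collections import deque
from datetime import date, timedelta

def weekly_pivots(start, end, interval_weeks):
    """Return pivot dates every `interval_weeks` weeks, working backwards from `end`."""
    # Walking backwards guarantees the end date is always the last pivot;
    # the start date appears only when the interval divides the range evenly.
    delta = timedelta(weeks=interval_weeks)
    pivots = deque()
    pivot = end
    while pivot >= start:
        pivots.appendleft(pivot)
        pivot -= delta
    return list(pivots)

pivots = weekly_pivots(date(2023, 1, 2), date(2023, 3, 6), 2)
print(pivots[-1])  # 2023-03-06: the end date is always included
```

Here the start date (Jan 2) is not itself a pivot because the 2-week spacing does not land on it exactly; the first pivot is the earliest step that is still on or after the start date.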

90 changes: 45 additions & 45 deletions tests/builds/zika/auspice/zika_tip-frequencies.json
@@ -1,92 +1,92 @@
{
"BRA/2016/FC_6706": {
"frequencies": [
0.022579,
0.011312,
0.206983,
0.102282
0.022368,
0.011759,
0.207309,
0.102551
]
},
"COL/FLR_00008/2015": {
"frequencies": [
0.243453,
0.208443,
0.008645,
0.010659
0.244555,
0.204597,
0.008456,
0.010737
]
},
"Colombia/2016/ZC204Se": {
"frequencies": [
0.126056,
0.231597,
0.014167,
0.017099
0.12489,
0.234574,
0.013671,
0.017245
]
},
"DOM/2016/BB_0183": {
"frequencies": [
0.017888,
0.009342,
0.183671,
0.148759
0.017736,
0.009645,
0.185763,
0.148494
]
},
"EcEs062_16": {
"frequencies": [
0.018981,
0.009763,
0.190994,
0.134398
0.018817,
0.010092,
0.192704,
0.134289
]
},
"HND/2016/HU_ME59": {
"frequencies": [
0.009484,
0.006254,
0.090552,
0.457824
0.009425,
0.006444,
0.093589,
0.456546
]
},
"PAN/CDC_259359_V1_V3/2015": {
"frequencies": [
0.224832,
0.214575,
0.008963,
0.011176
0.225577,
0.211235,
0.008761,
0.011258
]
},
"PRVABC59": {
"frequencies": [
0.243453,
0.208443,
0.008645,
0.010659
0.244555,
0.204597,
0.008456,
0.010737
]
},
"VEN/UF_1/2016": {
"frequencies": [
0.030662,
0.016483,
0.206983,
0.070196
0.030336,
0.017436,
0.204461,
0.07079
]
},
"ZKC2/2016": {
"frequencies": [
0.062612,
0.083787,
0.080395,
0.036947
0.06174,
0.089622,
0.076833,
0.037353
]
},
"generated_by": {
"program": "augur",
"version": "7.0.2"
"version": "19.2.0"
},
"pivots": [
2015.75,
2016.0,
2016.25,
2016.5
2015.7521,
2016.0041,
2016.2527,
2016.5014
]
}
44 changes: 21 additions & 23 deletions tests/functional/frequencies.t
@@ -34,26 +34,24 @@ Pivots get calculated from the requested date range.
Calculate KDE-based tip frequencies for a time period with relative dates.
Testing relative dates deterministically from the shell is tricky.
To keep these tests simple and avoid freezing the system date to specific values, this test checks the logical consistency of the requested relative dates and pivot interval.
With a minimum date of 1 year (12 months) ago, a maximum date of 6 months ago, and a pivot interval of 3 months, we expect only 2 pivots in the final output.
Since the test data are much older than the time period requested, all strains will always have frequencies of 0 for the 2 requested pivots.
As long as we always calculate 2 pivots with frequencies of 0 for all strains, we can ignore the actual pivot values calculated for the relative dates in the diff below.

TODO: un-comment this test, which is failing on 2022-06-01 with 3 pivot points: https://github.com/nextstrain/augur/runs/6694014041

$ ${AUGUR} frequencies \
> --method kde \
> --tree "frequencies/tree.nwk" \
> --metadata "frequencies/metadata.tsv" \
> --pivot-interval 3 \
> --min-date 1Y \
> --max-date 6M \
> --output "$TMP/tip-frequencies.json" > /dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" \
> --exclude-paths "root['generated_by']['version']" "root['pivots']" -- \
> "frequencies/zika_tip-frequencies_with_relative_dates.json" \
> "$TMP/tip-frequencies.json"
{}
$ rm -f "$TMP/tip-frequencies.json"

$ popd > /dev/null
With a minimum date of 12 months ago, a maximum date of 6 months ago, and a pivot interval of 3 months, we always expect 3 pivots in the final output corresponding to the end date (always included), the 3-month pivot before that end date, and the start date (also should always be included).
Since the test data are much older than the time period requested, all strains will always have frequencies of 0 for the 3 requested pivots.
As long as we always calculate 3 pivots with frequencies of 0 for all strains, we can ignore the actual pivot values calculated for the relative dates in the diff below.

$ ${AUGUR} frequencies \
> --method kde \
> --tree "frequencies/tree.nwk" \
> --metadata "frequencies/metadata.tsv" \
> --pivot-interval 3 \
> --min-date 12M \
> --max-date 6M \
> --output "$TMP/tip-frequencies.json" > /dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" \
> --exclude-paths "root['generated_by']['version']" "root['pivots']" -- \
> "frequencies/zika_tip-frequencies_with_relative_dates.json" \
> "$TMP/tip-frequencies.json"
{}
$ rm -f "$TMP/tip-frequencies.json"

$ popd > /dev/null
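The 3-pivot expectation in the test above can be sanity-checked with standard date math. This is a sketch; `dateutil`'s `relativedelta` stands in for the ISO-8601 durations the implementation parses, and the "today" value is an arbitrary example:

```python
from datetime import date
from dateutil.relativedelta import relativedelta

# Assume an arbitrary "today"; min-date is 12 months ago, max-date 6 months ago.
today = date(2023, 1, 19)
start = today - relativedelta(months=12)  # 2022-01-19
end = today - relativedelta(months=6)     # 2022-07-19

# Walk backwards from the end date in 3-month steps, as get_pivots now does.
pivots = []
pivot = end
while pivot >= start:
    pivots.insert(0, pivot)
    pivot -= relativedelta(months=3)

print(pivots)  # three pivots: 2022-01-19, 2022-04-19, 2022-07-19
```

The end date and the 3-month pivot before it are always present, and here the start date lands exactly on a pivot as well, giving the 3 pivots the test asserts.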