Skip to content

Commit

Permalink
Merge pull request #1121 from nextstrain/fix-monthly-pivots
Browse files Browse the repository at this point in the history
frequencies: Fix pivot logic to always include the end date
  • Loading branch information
huddlej authored Jan 19, 2023
2 parents 5e1babb + 0591500 commit 85d2edd
Show file tree
Hide file tree
Showing 8 changed files with 231 additions and 158 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

## __NEXT__

### Major Changes

* frequencies: Changes the logic for calculating the time points when frequencies are estimated to ensure that the user-provided "end date" is always included. This change in the behavior of the frequencies command fixes a bug where large intervals between time points (e.g., 3 months) could cause recent data to be omitted from frequency calculations. See the pull request for more details included the scientific implications of this bug. [#1121][] (@huddlej)

[#1121]: https://github.com/nextstrain/augur/pull/1121

## 19.3.0 (19 January 2023)

Expand Down
36 changes: 26 additions & 10 deletions augur/frequency_estimators.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
# estimates clade frequencies
from __future__ import division, print_function
from collections import defaultdict
from collections import defaultdict, deque
import datetime
import isodate
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
from scipy.stats import norm
import sys
import time

from .dates import numeric_date

debug = False
log_thres = 10.0

Expand All @@ -24,7 +27,9 @@ def get_pivots(observations, pivot_interval, start_date=None, end_date=None, piv
interval between pivots.
Start and end pivots will be based on the range of given observed dates,
unless a start or end date are provided to override these defaults.
unless a start or end date are provided to override these defaults. Pivots
include the end date and, where possible for the given interval, the start
date.
Parameters
----------
Expand Down Expand Up @@ -54,18 +59,29 @@ def get_pivots(observations, pivot_interval, start_date=None, end_date=None, piv
pivot_end = end_date if end_date else np.ceil(np.max(observations) / pivot_frequency) * pivot_frequency

if pivot_interval_units == "months":
offset = "%sMS" % pivot_interval
duration_str = f'P{pivot_interval}M'
elif pivot_interval_units == "weeks":
offset = "%sW" % pivot_interval
duration_str = f'P{pivot_interval}W'
else:
raise ValueError(f"The given interval unit '{pivot_interval_units}' is not supported.")

datetime_pivots = pd.date_range(
float_to_datestring(pivot_start),
float_to_datestring(pivot_end),
freq = offset
)
pivots = np.array([timestamp_to_float(pivot) for pivot in datetime_pivots])
# Construct standard Python date instances from numeric dates via ISO-8601
# dates and the corresponding delta time for the interval between pivots.
start = datetime.datetime.strptime(float_to_datestring(pivot_start), "%Y-%m-%d")
end = datetime.datetime.strptime(float_to_datestring(pivot_end), "%Y-%m-%d")
delta = isodate.parse_duration(duration_str)

# Start calculating pivots from the end date (inclusive), working backwards
# in time by a time delta that matches the user-requested interval. Include
# the start date in the final pivots when the interval spacing allows that
# date to be included.
pivots = deque([])
pivot = end
while pivot >= start:
pivots.appendleft(pivot)
pivot = pivot - delta

pivots = np.array([numeric_date(pivot) for pivot in pivots])

return np.around(pivots, 4)

Expand Down
90 changes: 45 additions & 45 deletions tests/builds/zika/auspice/zika_tip-frequencies.json
Original file line number Diff line number Diff line change
@@ -1,92 +1,92 @@
{
"BRA/2016/FC_6706": {
"frequencies": [
0.022579,
0.011312,
0.206983,
0.102282
0.022368,
0.011759,
0.207309,
0.102551
]
},
"COL/FLR_00008/2015": {
"frequencies": [
0.243453,
0.208443,
0.008645,
0.010659
0.244555,
0.204597,
0.008456,
0.010737
]
},
"Colombia/2016/ZC204Se": {
"frequencies": [
0.126056,
0.231597,
0.014167,
0.017099
0.12489,
0.234574,
0.013671,
0.017245
]
},
"DOM/2016/BB_0183": {
"frequencies": [
0.017888,
0.009342,
0.183671,
0.148759
0.017736,
0.009645,
0.185763,
0.148494
]
},
"EcEs062_16": {
"frequencies": [
0.018981,
0.009763,
0.190994,
0.134398
0.018817,
0.010092,
0.192704,
0.134289
]
},
"HND/2016/HU_ME59": {
"frequencies": [
0.009484,
0.006254,
0.090552,
0.457824
0.009425,
0.006444,
0.093589,
0.456546
]
},
"PAN/CDC_259359_V1_V3/2015": {
"frequencies": [
0.224832,
0.214575,
0.008963,
0.011176
0.225577,
0.211235,
0.008761,
0.011258
]
},
"PRVABC59": {
"frequencies": [
0.243453,
0.208443,
0.008645,
0.010659
0.244555,
0.204597,
0.008456,
0.010737
]
},
"VEN/UF_1/2016": {
"frequencies": [
0.030662,
0.016483,
0.206983,
0.070196
0.030336,
0.017436,
0.204461,
0.07079
]
},
"ZKC2/2016": {
"frequencies": [
0.062612,
0.083787,
0.080395,
0.036947
0.06174,
0.089622,
0.076833,
0.037353
]
},
"generated_by": {
"program": "augur",
"version": "7.0.2"
"version": "19.2.0"
},
"pivots": [
2015.75,
2016.0,
2016.25,
2016.5
2015.7521,
2016.0041,
2016.2527,
2016.5014
]
}
44 changes: 21 additions & 23 deletions tests/functional/frequencies.t
Original file line number Diff line number Diff line change
Expand Up @@ -34,26 +34,24 @@ Pivots get calculated from the requested date range.
Calculate KDE-based tip frequencies for a time period with relative dates.
Testing relative dates deterministically from the shell is tricky.
To keep these tests simple and avoid freezing the system date to specific values, this test checks the logical consistency of the requested relative dates and pivot interval.
With a minimum date of 1 year (12 months) ago, a maximum date of 6 months ago, and a pivot interval of 3 months, we expect only 2 pivots in the final output.
Since the test data are much older than the time period requested, all strains will always have frequencies of 0 for the 2 requested pivots.
As long as we always calculate 2 pivots with frequencies of 0 for all strains, we can ignore the actual pivot values calculated for the relative dates in the diff below.

TODO: un-comment this test, which is failing on 2022-06-01 with 3 pivot points: https://github.com/nextstrain/augur/runs/6694014041

$ ${AUGUR} frequencies \
> --method kde \
> --tree "frequencies/tree.nwk" \
> --metadata "frequencies/metadata.tsv" \
> --pivot-interval 3 \
> --min-date 1Y \
> --max-date 6M \
> --output "$TMP/tip-frequencies.json" > /dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" \
> --exclude-paths "root['generated_by']['version']" "root['pivots']" -- \
> "frequencies/zika_tip-frequencies_with_relative_dates.json" \
> "$TMP/tip-frequencies.json"
{}
$ rm -f "$TMP/tip-frequencies.json"

$ popd > /dev/null
With a minimum date of 12 months ago, a maximum date of 6 months ago, and a pivot interval of 3 months, we always expect 3 pivots in the final output corresponding to the end date (always included), the 3-month pivot before that end date, and the start date (also should always be included).
Since the test data are much older than the time period requested, all strains will always have frequencies of 0 for the 3 requested pivots.
As long as we always calculate 3 pivots with frequencies of 0 for all strains, we can ignore the actual pivot values calculated for the relative dates in the diff below.

$ ${AUGUR} frequencies \
> --method kde \
> --tree "frequencies/tree.nwk" \
> --metadata "frequencies/metadata.tsv" \
> --pivot-interval 3 \
> --min-date 12M \
> --max-date 6M \
> --output "$TMP/tip-frequencies.json" > /dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" \
> --exclude-paths "root['generated_by']['version']" "root['pivots']" -- \
> "frequencies/zika_tip-frequencies_with_relative_dates.json" \
> "$TMP/tip-frequencies.json"
{}
$ rm -f "$TMP/tip-frequencies.json"

$ popd > /dev/null
Loading

0 comments on commit 85d2edd

Please sign in to comment.