diff --git a/CHANGES.md b/CHANGES.md index 931751e13..c79722a2e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,11 @@ ## __NEXT__ +### Major Changes + +* frequencies: Changes the logic for calculating the time points when frequencies are estimated to ensure that the user-provided "end date" is always included. This change in the behavior of the frequencies command fixes a bug where large intervals between time points (e.g., 3 months) could cause recent data to be omitted from frequency calculations. See the pull request for more details included the scientific implications of this bug. [#1121][] (@huddlej) + +[#1121]: https://github.com/nextstrain/augur/pull/1121 ## 19.3.0 (19 January 2023) diff --git a/augur/frequency_estimators.py b/augur/frequency_estimators.py index 48d335e3d..1c5117419 100644 --- a/augur/frequency_estimators.py +++ b/augur/frequency_estimators.py @@ -1,7 +1,8 @@ # estimates clade frequencies from __future__ import division, print_function -from collections import defaultdict +from collections import defaultdict, deque import datetime +import isodate import numpy as np import pandas as pd from scipy.interpolate import interp1d @@ -9,6 +10,8 @@ import sys import time +from .dates import numeric_date + debug = False log_thres = 10.0 @@ -24,7 +27,9 @@ def get_pivots(observations, pivot_interval, start_date=None, end_date=None, piv interval between pivots. Start and end pivots will be based on the range of given observed dates, - unless a start or end date are provided to override these defaults. + unless a start or end date are provided to override these defaults. Pivots + include the end date and, where possible for the given interval, the start + date. Parameters ---------- @@ -54,18 +59,29 @@ def get_pivots(observations, pivot_interval, start_date=None, end_date=None, piv pivot_end = end_date if end_date else np.ceil(np.max(observations) / pivot_frequency) * pivot_frequency if pivot_interval_units == "months": - offset = "%sMS" % pivot_interval + duration_str = f'P{pivot_interval}M' elif pivot_interval_units == "weeks": - offset = "%sW" % pivot_interval + duration_str = f'P{pivot_interval}W' else: raise ValueError(f"The given interval unit '{pivot_interval_units}' is not supported.") - datetime_pivots = pd.date_range( - float_to_datestring(pivot_start), - float_to_datestring(pivot_end), - freq = offset - ) - pivots = np.array([timestamp_to_float(pivot) for pivot in datetime_pivots]) + # Construct standard Python date instances from numeric dates via ISO-8601 + # dates and the corresponding delta time for the interval between pivots. + start = datetime.datetime.strptime(float_to_datestring(pivot_start), "%Y-%m-%d") + end = datetime.datetime.strptime(float_to_datestring(pivot_end), "%Y-%m-%d") + delta = isodate.parse_duration(duration_str) + + # Start calculating pivots from the end date (inclusive), working backwards + # in time by a time delta that matches the user-requested interval. Include + # the start date in the final pivots when the interval spacing allows that + # date to be included. + pivots = deque([]) + pivot = end + while pivot >= start: + pivots.appendleft(pivot) + pivot = pivot - delta + + pivots = np.array([numeric_date(pivot) for pivot in pivots]) return np.around(pivots, 4) diff --git a/tests/builds/zika/auspice/zika_tip-frequencies.json b/tests/builds/zika/auspice/zika_tip-frequencies.json index b95d444ae..086656e64 100644 --- a/tests/builds/zika/auspice/zika_tip-frequencies.json +++ b/tests/builds/zika/auspice/zika_tip-frequencies.json @@ -1,92 +1,92 @@ { "BRA/2016/FC_6706": { "frequencies": [ - 0.022579, - 0.011312, - 0.206983, - 0.102282 + 0.022368, + 0.011759, + 0.207309, + 0.102551 ] }, "COL/FLR_00008/2015": { "frequencies": [ - 0.243453, - 0.208443, - 0.008645, - 0.010659 + 0.244555, + 0.204597, + 0.008456, + 0.010737 ] }, "Colombia/2016/ZC204Se": { "frequencies": [ - 0.126056, - 0.231597, - 0.014167, - 0.017099 + 0.12489, + 0.234574, + 0.013671, + 0.017245 ] }, "DOM/2016/BB_0183": { "frequencies": [ - 0.017888, - 0.009342, - 0.183671, - 0.148759 + 0.017736, + 0.009645, + 0.185763, + 0.148494 ] }, "EcEs062_16": { "frequencies": [ - 0.018981, - 0.009763, - 0.190994, - 0.134398 + 0.018817, + 0.010092, + 0.192704, + 0.134289 ] }, "HND/2016/HU_ME59": { "frequencies": [ - 0.009484, - 0.006254, - 0.090552, - 0.457824 + 0.009425, + 0.006444, + 0.093589, + 0.456546 ] }, "PAN/CDC_259359_V1_V3/2015": { "frequencies": [ - 0.224832, - 0.214575, - 0.008963, - 0.011176 + 0.225577, + 0.211235, + 0.008761, + 0.011258 ] }, "PRVABC59": { "frequencies": [ - 0.243453, - 0.208443, - 0.008645, - 0.010659 + 0.244555, + 0.204597, + 0.008456, + 0.010737 ] }, "VEN/UF_1/2016": { "frequencies": [ - 0.030662, - 0.016483, - 0.206983, - 0.070196 + 0.030336, + 0.017436, + 0.204461, + 0.07079 ] }, "ZKC2/2016": { "frequencies": [ - 0.062612, - 0.083787, - 0.080395, - 0.036947 + 0.06174, + 0.089622, + 0.076833, + 0.037353 ] }, "generated_by": { "program": "augur", - "version": "7.0.2" + "version": "19.2.0" }, "pivots": [ - 2015.75, - 2016.0, - 2016.25, - 2016.5 + 2015.7521, + 2016.0041, + 2016.2527, + 2016.5014 ] } \ No newline at end of file diff --git a/tests/functional/frequencies.t b/tests/functional/frequencies.t index 4ba9c0f92..51d7d6cfd 100644 --- a/tests/functional/frequencies.t +++ b/tests/functional/frequencies.t @@ -34,26 +34,24 @@ Pivots get calculated from the requested date range. Calculate KDE-based tip frequencies for a time period with relative dates. Testing relative dates deterministically from the shell is tricky. To keep these tests simple and avoid freezing the system date to specific values, this test checks the logical consistency of the requested relative dates and pivot interval. -With a minimum date of 1 year (12 months) ago, a maximum date of 6 months ago, and a pivot interval of 3 months, we expect only 2 pivots in the final output. -Since the test data are much older than the time period requested, all strains will always have frequencies of 0 for the 2 requested pivots. -As long as we always calculate 2 pivots with frequencies of 0 for all strains, we can ignore the actual pivot values calculated for the relative dates in the diff below. - -TODO: un-comment this test, which is failing on 2022-06-01 with 3 pivot points: https://github.com/nextstrain/augur/runs/6694014041 - -$ ${AUGUR} frequencies \ -> --method kde \ -> --tree "frequencies/tree.nwk" \ -> --metadata "frequencies/metadata.tsv" \ -> --pivot-interval 3 \ -> --min-date 1Y \ -> --max-date 6M \ -> --output "$TMP/tip-frequencies.json" > /dev/null - -$ python3 "$TESTDIR/../../scripts/diff_jsons.py" \ -> --exclude-paths "root['generated_by']['version']" "root['pivots']" -- \ -> "frequencies/zika_tip-frequencies_with_relative_dates.json" \ -> "$TMP/tip-frequencies.json" -{} -$ rm -f "$TMP/tip-frequencies.json" - -$ popd > /dev/null +With a minimum date of 12 months ago, a maximum date of 6 months ago, and a pivot interval of 3 months, we always expect 3 pivots in the final output corresponding to the end date (always included), the 3-month pivot before that end date, and the start date (also should always be included). +Since the test data are much older than the time period requested, all strains will always have frequencies of 0 for the 3 requested pivots. +As long as we always calculate 3 pivots with frequencies of 0 for all strains, we can ignore the actual pivot values calculated for the relative dates in the diff below. + + $ ${AUGUR} frequencies \ + > --method kde \ + > --tree "frequencies/tree.nwk" \ + > --metadata "frequencies/metadata.tsv" \ + > --pivot-interval 3 \ + > --min-date 12M \ + > --max-date 6M \ + > --output "$TMP/tip-frequencies.json" > /dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" \ + > --exclude-paths "root['generated_by']['version']" "root['pivots']" -- \ + > "frequencies/zika_tip-frequencies_with_relative_dates.json" \ + > "$TMP/tip-frequencies.json" + {} + $ rm -f "$TMP/tip-frequencies.json" + + $ popd > /dev/null diff --git a/tests/functional/frequencies/zika_tip-frequencies.json b/tests/functional/frequencies/zika_tip-frequencies.json index b95d444ae..086656e64 100644 --- a/tests/functional/frequencies/zika_tip-frequencies.json +++ b/tests/functional/frequencies/zika_tip-frequencies.json @@ -1,92 +1,92 @@ { "BRA/2016/FC_6706": { "frequencies": [ - 0.022579, - 0.011312, - 0.206983, - 0.102282 + 0.022368, + 0.011759, + 0.207309, + 0.102551 ] }, "COL/FLR_00008/2015": { "frequencies": [ - 0.243453, - 0.208443, - 0.008645, - 0.010659 + 0.244555, + 0.204597, + 0.008456, + 0.010737 ] }, "Colombia/2016/ZC204Se": { "frequencies": [ - 0.126056, - 0.231597, - 0.014167, - 0.017099 + 0.12489, + 0.234574, + 0.013671, + 0.017245 ] }, "DOM/2016/BB_0183": { "frequencies": [ - 0.017888, - 0.009342, - 0.183671, - 0.148759 + 0.017736, + 0.009645, + 0.185763, + 0.148494 ] }, "EcEs062_16": { "frequencies": [ - 0.018981, - 0.009763, - 0.190994, - 0.134398 + 0.018817, + 0.010092, + 0.192704, + 0.134289 ] }, "HND/2016/HU_ME59": { "frequencies": [ - 0.009484, - 0.006254, - 0.090552, - 0.457824 + 0.009425, + 0.006444, + 0.093589, + 0.456546 ] }, "PAN/CDC_259359_V1_V3/2015": { "frequencies": [ - 0.224832, - 0.214575, - 0.008963, - 0.011176 + 0.225577, + 0.211235, + 0.008761, + 0.011258 ] }, "PRVABC59": { "frequencies": [ - 0.243453, - 0.208443, - 0.008645, - 0.010659 + 0.244555, + 0.204597, + 0.008456, + 0.010737 ] }, "VEN/UF_1/2016": { "frequencies": [ - 0.030662, - 0.016483, - 0.206983, - 0.070196 + 0.030336, + 0.017436, + 0.204461, + 0.07079 ] }, "ZKC2/2016": { "frequencies": [ - 0.062612, - 0.083787, - 0.080395, - 0.036947 + 0.06174, + 0.089622, + 0.076833, + 0.037353 ] }, "generated_by": { "program": "augur", - "version": "7.0.2" + "version": "19.2.0" }, "pivots": [ - 2015.75, - 2016.0, - 2016.25, - 2016.5 + 2015.7521, + 2016.0041, + 2016.2527, + 2016.5014 ] } \ No newline at end of file diff --git a/tests/functional/frequencies/zika_tip-frequencies_with_fixed_dates.json b/tests/functional/frequencies/zika_tip-frequencies_with_fixed_dates.json index f370935a6..318920e30 100644 --- a/tests/functional/frequencies/zika_tip-frequencies_with_fixed_dates.json +++ b/tests/functional/frequencies/zika_tip-frequencies_with_fixed_dates.json @@ -10,11 +10,11 @@ }, "COL/FLR_00008/2015": { "frequencies": [ - 0.342553, - 0.340179, - 0.337787, - 0.342054, - 0.330097 + 0.34254, + 0.340199, + 0.337813, + 0.34201, + 0.329984 ] }, "Colombia/2016/ZC204Se": { @@ -55,20 +55,20 @@ }, "PAN/CDC_259359_V1_V3/2015": { "frequencies": [ - 0.314894, - 0.319642, - 0.324426, - 0.315891, - 0.339807 + 0.314921, + 0.319602, + 0.324374, + 0.31598, + 0.340033 ] }, "PRVABC59": { "frequencies": [ - 0.342553, - 0.340179, - 0.337787, - 0.342054, - 0.330097 + 0.34254, + 0.340199, + 0.337813, + 0.34201, + 0.329984 ] }, "VEN/UF_1/2016": { @@ -91,13 +91,13 @@ }, "generated_by": { "program": "augur", - "version": "14.1.0" + "version": "19.2.0" }, "pivots": [ - 2015.0, - 2015.25, - 2015.5, - 2015.75, - 2016.0 + 2015.0014, + 2015.2479, + 2015.4973, + 2015.7493, + 2016.0014 ] } \ No newline at end of file diff --git a/tests/functional/frequencies/zika_tip-frequencies_with_relative_dates.json b/tests/functional/frequencies/zika_tip-frequencies_with_relative_dates.json index 056fc9175..8bb7ae524 100644 --- a/tests/functional/frequencies/zika_tip-frequencies_with_relative_dates.json +++ b/tests/functional/frequencies/zika_tip-frequencies_with_relative_dates.json @@ -1,70 +1,81 @@ { "BRA/2016/FC_6706": { "frequencies": [ + 0.0, 0.0, 0.0 ] }, "COL/FLR_00008/2015": { "frequencies": [ + 0.0, 0.0, 0.0 ] }, "Colombia/2016/ZC204Se": { "frequencies": [ + 0.0, 0.0, 0.0 ] }, "DOM/2016/BB_0183": { "frequencies": [ + 0.0, 0.0, 0.0 ] }, "EcEs062_16": { "frequencies": [ + 0.0, 0.0, 0.0 ] }, "HND/2016/HU_ME59": { "frequencies": [ + 0.0, 0.0, 0.0 ] }, "PAN/CDC_259359_V1_V3/2015": { "frequencies": [ + 0.0, 0.0, 0.0 ] }, "PRVABC59": { "frequencies": [ + 0.0, 0.0, 0.0 ] }, "VEN/UF_1/2016": { "frequencies": [ + 0.0, 0.0, 0.0 ] }, "ZKC2/2016": { "frequencies": [ + 0.0, 0.0, 0.0 ] }, "generated_by": { "program": "augur", - "version": "14.1.0" + "version": "19.2.0" }, "pivots": [ - 2021.3333, - 2021.5833 + 2022.0178, + 2022.2644, + 2022.5137 ] } \ No newline at end of file diff --git a/tests/test_frequencies.py b/tests/test_frequencies.py index aa6be91d8..43e1088ff 100644 --- a/tests/test_frequencies.py +++ b/tests/test_frequencies.py @@ -12,7 +12,8 @@ # we assume (and assert) that this script is running from the tests/ directory sys.path.append(str(Path(__file__).parent.parent.parent)) -from augur.frequency_estimators import get_pivots, TreeKdeFrequencies, AlignmentKdeFrequencies, TreeKdeFrequenciesError +from augur.dates import numeric_date +from augur.frequency_estimators import float_to_datestring, get_pivots, TreeKdeFrequencies, AlignmentKdeFrequencies, TreeKdeFrequenciesError from augur.utils import json_to_tree # Define regions to use for testing weighted frequencies. @@ -60,9 +61,15 @@ def test_get_pivots_from_tree_only(tree): pivots = get_pivots(observations, pivot_frequency) assert isinstance(pivots, np.ndarray) - # Floating point pivot values should be separated by the given number of - # months divided by number of months in a year. - assert pivots[1] - pivots[0] == pivot_frequency / 12.0 + # Floating point pivot values should be roughly separated by the given + # number of months divided by number of months in a year. Numeric dates from + # pivots calculate decimal fractions as the proportion of days in the year, + # instead of months in the year (e.g., the fraction for the first three + # months of a non-leap year is (31 + 28 + 31) / 365 or 0.246575 instead of 3 + # / 12 or 0.25). As a result of this difference in how we calculate + # fractions, we need to round to pivots to 2 decimal places which is the + # precision of month-based fractions. + assert np.round(pivots[1] - pivots[0], 2) == (pivot_frequency / 12.0) def test_get_pivots_from_start_and_end_date(): """ @@ -87,16 +94,45 @@ def test_get_pivots_by_months(): """ pivots = get_pivots(observations=[], pivot_interval=1, start_date=2015.0, end_date=2016.0, pivot_interval_units="months") # Pivots should include all 12 months of the year plus the month represented - # by the end date, since the pandas month interval uses "month starts". See - # pandas date offsets documentation for more details: - # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases + # by the end date because pivots represent time slices through data and we + # want to evaluate frequencies at the start and end time slices. assert len(pivots) == 13 + assert float_to_datestring(pivots[-1]) == "2016-01-01" + +def test_get_pivots_by_months_with_realistic_start_end_dates(): + """Get pivots where intervals are defined by months and realistic start and end dates. + """ + # 6 years (72 months) of pivots with 3-month intervals should produce 24 + 1 + # pivots (4 pivots per year plus the pivot for the beginning of the year + # associated with the end date). + pivots = get_pivots( + observations=[], + pivot_interval=3, + start_date=numeric_date("2017-01-06"), + end_date=numeric_date("2023-01-06"), + pivot_interval_units="months" + ) + assert len(pivots) == 25 + assert float_to_datestring(pivots[-1]) == "2023-01-06" def test_get_pivots_by_weeks(): """Get pivots where intervals are defined as weeks instead of months. """ + # As with monthly pivots, weekly pivots should include the first and last + # values in the range. So, a 1-year interval will represent 52 weekly pivots + # from the beginning plus the first pivot from the next year. pivots = get_pivots(observations=[], pivot_interval=1, start_date=2015.0, end_date=2016.0, pivot_interval_units="weeks") - assert len(pivots) == 52 + assert len(pivots) == 53 + + pivots = get_pivots( + observations=[], + pivot_interval=1, + start_date=numeric_date("2022-01-06"), + end_date=numeric_date("2023-01-06"), + pivot_interval_units="weeks" + ) + assert len(pivots) == 53 + def test_get_pivots_by_invalid_unit(): with pytest.raises(ValueError, match=r".*invalid_unit.*is not supported.*"): @@ -133,7 +169,7 @@ def test_estimate_with_time_interval(self, tree): ) frequencies = kde_frequencies.estimate(tree) assert hasattr(kde_frequencies, "pivots") - assert kde_frequencies.pivots[0] == 2015.5833 + assert kde_frequencies.pivots[0] == 2015.5 assert hasattr(kde_frequencies, "frequencies") assert list(frequencies.values())[0].shape == kde_frequencies.pivots.shape @@ -407,9 +443,16 @@ def test_estimate(self, alignment): frequencies = kde_frequencies.estimate(alignment, observations) assert hasattr(kde_frequencies, "pivots") - assert np.around(kde_frequencies.pivots[1] - kde_frequencies.pivots[0], 2) == np.around(1 / 12.0, 2) + pivots = kde_frequencies.pivots + + # Pivot decimal fractions represent fractions of a year by days, so + # comparisons with fractions of a year by months need to take into + # account rounding error between these calculations. In this test, we + # allow pivots to be spaced apart by a value that is "close" to 1 / 12 + # months by the rounding error of the month-based precision. + assert np.isclose((pivots[-1] - pivots[-2]), 1 / 12.0, atol=0.005) assert hasattr(kde_frequencies, "frequencies") - assert list(frequencies.values())[0].shape == kde_frequencies.pivots.shape + assert list(frequencies.values())[0].shape == pivots.shape # Find a position with frequencies estimated for multiple bases. position = list(frequencies.keys())[0].split(":")[0]