From 039ff0e59fb0c13ebd304ae919304e8958c26bfa Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 27 Sep 2024 16:01:45 -0400 Subject: [PATCH] Improve duration handling for unknown years --- .../shxco_partial_date_durations.ipynb | 628 ++++++++---------- src/undate/undate.py | 21 +- tests/test_undate.py | 21 +- 3 files changed, 316 insertions(+), 354 deletions(-) diff --git a/examples/notebooks/shxco_partial_date_durations.ipynb b/examples/notebooks/shxco_partial_date_durations.ipynb index 8d00a66..b89661f 100644 --- a/examples/notebooks/shxco_partial_date_durations.ipynb +++ b/examples/notebooks/shxco_partial_date_durations.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -29,8 +29,8 @@ "output_type": "stream", "text": [ "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/Users/rkoeser/workarea/env/undate/bin/python -m pip install --upgrade pip\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/Users/rkoeser/workarea/env/undate-py3.10/bin/python3.10 -m pip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } @@ -311,17 +311,18 @@ "\n", "Define a method to initialize an `UndateInterval` from start and end date strings in ISO format as used in S&co datasets\n", "\n", - "**Note:** There's an off-by-one discrepancy between how we currently calculate duration in Undate and in the Shakespeare and Company Project code; becauS&co code counts the first day in the range but not the last (this could also be thought of as counting half of the start and end dates). For simplicity of comparison here, we subtract one day from the result returned by `UndateInterval.duration`." + "**Note:** There's an off-by-one discrepancy between how we currently calculate duration in Undate and in the Shakespeare and Company Project code. This is because S&co code counts the first day in the range but not the last (this could also be thought of as counting half of the start and end dates). For simplicity of comparison here, we subtract one day from the result returned by `UndateInterval.duration`." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "id": "y_MqgrQW64uI" }, "outputs": [], "source": [ + "from undate.date import ONE_DAY\n", "from undate.undate import UndateInterval\n", "from undate.dateformat.iso8601 import ISO8601DateFormat\n", "\n", @@ -333,9 +334,8 @@ " interval = UndateInterval(earliest=unstart, latest=unend)\n", "\n", " # subtract one here for simplicity of comparison,\n", - " # to reconcile difference between how duration logic\n", - "\n", - " return interval.duration().days - 1" + " # to reconcile differences between duration logic\n", + " return interval.duration() - ONE_DAY" ] }, { @@ -353,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -452,7 +452,7 @@ "260 4 months 122.0 " ] }, - "execution_count": 18, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -478,7 +478,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -497,15 +497,15 @@ "91.0 397\n", "365.0 337\n", " ... \n", - "69.0 1\n", - "36.0 1\n", - "73.0 1\n", - "574.0 1\n", - "171.0 1\n", + "200.0 1\n", + "277.0 1\n", + "169.0 1\n", + "45.0 1\n", + "38.0 1\n", "Name: count, Length: 133, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -517,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -540,7 +540,7 @@ "Name: subscription_duration_days, dtype: float64" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -558,7 +558,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -589,44 +589,25 @@ " \n", " \n", " \n", - " event_type\n", + " member_names\n", " start_date\n", " end_date\n", - " member_uris\n", - " member_names\n", - " member_sort_names\n", - " subscription_price_paid\n", - " subscription_deposit\n", " subscription_duration\n", " subscription_duration_days\n", - " ...\n", - " item_uri\n", - " item_title\n", - " item_volume\n", - " item_authors\n", - " item_year\n", - " item_notes\n", - " source_type\n", - " source_citation\n", - " source_manifest\n", - " source_image\n", " \n", " \n", " \n", " \n", "\n", - "

0 rows × 28 columns

\n", "" ], "text/plain": [ "Empty DataFrame\n", - "Columns: [event_type, start_date, end_date, member_uris, member_names, member_sort_names, subscription_price_paid, subscription_deposit, subscription_duration, subscription_duration_days, subscription_volumes, subscription_category, subscription_purchase_date, reimbursement_refund, borrow_status, borrow_duration_days, purchase_price, currency, item_uri, item_title, item_volume, item_authors, item_year, item_notes, source_type, source_citation, source_manifest, source_image]\n", - "Index: []\n", - "\n", - "[0 rows x 28 columns]" + "Columns: [member_names, start_date, end_date, subscription_duration, subscription_duration_days]\n", + "Index: []" ] }, - "execution_count": 12, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -638,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -707,7 +688,7 @@ "13686 NaN 31.0 " ] }, - "execution_count": 19, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -726,7 +707,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "metadata": { "id": "jwvN9-CgLQRx" }, @@ -746,7 +727,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -793,7 +774,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 70\n", @@ -802,7 +783,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 233\n", @@ -811,7 +792,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " \n", " \n", " 234\n", @@ -820,7 +801,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " \n", " \n", " 260\n", @@ -829,7 +810,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " \n", " \n", "\n", @@ -843,15 +824,15 @@ "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell 1921-09 1922-02 \n", "260 Victor Llona 1923-06 1923-10 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "233 1 month 31.0 61 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 " + " subscription_duration subscription_duration_days undate_duration \n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "233 1 month 31.0 61 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days " ] }, - "execution_count": 21, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -864,7 +845,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -911,7 +892,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 70\n", @@ -920,7 +901,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 233\n", @@ -929,7 +910,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " \n", " \n", " 234\n", @@ -938,7 +919,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " \n", " \n", " 260\n", @@ -947,7 +928,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " \n", " \n", "\n", @@ -961,15 +942,15 @@ "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell 1921-09 1922-02 \n", "260 Victor Llona 1923-06 1923-10 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "233 1 month 31.0 61 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 " + " subscription_duration subscription_duration_days undate_duration \n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "233 1 month 31.0 61 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days " ] }, - "execution_count": 23, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -981,7 +962,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1029,7 +1010,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1039,7 +1020,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1049,7 +1030,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " 30.0\n", " \n", " \n", @@ -1059,7 +1040,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " 27.0\n", " \n", " \n", @@ -1069,7 +1050,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " 30.0\n", " \n", " \n", @@ -1089,7 +1070,7 @@ " 1941-12-24\n", " 1 month\n", " 30.0\n", - " 30\n", + " 30 days\n", " 0.0\n", " \n", " \n", @@ -1099,7 +1080,7 @@ " 1941-12-24\n", " 1 month\n", " 30.0\n", - " 30\n", + " 30 days\n", " 0.0\n", " \n", " \n", @@ -1109,7 +1090,7 @@ " 1942-01-04\n", " 1 month\n", " 31.0\n", - " 31\n", + " 31 days\n", " 0.0\n", " \n", " \n", @@ -1119,7 +1100,7 @@ " 1942-03-08\n", " 3 months\n", " 90.0\n", - " 90\n", + " 90 days\n", " 0.0\n", " \n", " \n", @@ -1129,7 +1110,7 @@ " 1942-01-09\n", " 1 month\n", " 31.0\n", - " 31\n", + " 31 days\n", " 0.0\n", " \n", " \n", @@ -1164,36 +1145,36 @@ "35118 1942-03-08 3 months 90.0 \n", "35119 1942-01-09 1 month 31.0 \n", "\n", - " undate_duration duration_diff \n", - "28 730 365.0 \n", - "70 730 365.0 \n", - "233 61 30.0 \n", - "234 180 27.0 \n", - "260 152 30.0 \n", - "... ... ... \n", - "35114 30 0.0 \n", - "35115 30 0.0 \n", - "35116 31 0.0 \n", - "35118 90 0.0 \n", - "35119 31 0.0 \n", + " undate_duration duration_diff \n", + "28 730 days 365.0 \n", + "70 730 days 365.0 \n", + "233 61 days 30.0 \n", + "234 180 days 27.0 \n", + "260 152 days 30.0 \n", + "... ... ... \n", + "35114 30 days 0.0 \n", + "35115 30 days 0.0 \n", + "35116 31 days 0.0 \n", + "35118 90 days 0.0 \n", + "35119 31 days 0.0 \n", "\n", "[9144 rows x 7 columns]" ] }, - "execution_count": 24, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# what's the difference between the two?\n", - "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration - row.subscription_duration_days, axis=1)\n", + "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration.astype(\"int\") - row.subscription_duration_days, axis=1)\n", "subs_duration" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1206,20 +1187,20 @@ "data": { "text/plain": [ "duration_diff\n", - " 0.0 9065\n", - " 30.0 30\n", - " 29.0 21\n", - " 1.0 10\n", - "-1.0 9\n", - " 28.0 4\n", - " 365.0 2\n", - " 27.0 1\n", - " 2.0 1\n", - "-3.0 1\n", + "0.0 9065\n", + "30.0 30\n", + "29.0 21\n", + "1.0 10\n", + "-1.0 9\n", + "28.0 4\n", + "365.0 2\n", + "27.0 1\n", + "2.0 1\n", + "-3.0 1\n", "Name: count, dtype: int64" ] }, - "execution_count": 25, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1239,7 +1220,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1287,7 +1268,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1297,7 +1278,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1307,7 +1288,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " 30.0\n", " \n", " \n", @@ -1317,7 +1298,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " 27.0\n", " \n", " \n", @@ -1327,7 +1308,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " 30.0\n", " \n", " \n", @@ -1337,7 +1318,7 @@ " 1923-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1347,7 +1328,7 @@ " 1924-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1357,7 +1338,7 @@ " 1924-04\n", " 2 months\n", " 60.0\n", - " 89\n", + " 89 days\n", " 29.0\n", " \n", " \n", @@ -1367,7 +1348,7 @@ " 1926-10\n", " 7 months\n", " 214.0\n", - " 244\n", + " 244 days\n", " 30.0\n", " \n", " \n", @@ -1377,7 +1358,7 @@ " 1926-12\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1397,32 +1378,32 @@ "293 Madeleine Lorsignol 1926-03 1926-10 \n", "313 M. Mathieu 1926-11 1926-12 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \\\n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "233 1 month 31.0 61 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 \n", - "261 1 month 31.0 60 \n", - "271 1 month 29.0 59 \n", - "272 2 months 60.0 89 \n", - "293 7 months 214.0 244 \n", - "313 1 month 30.0 60 \n", + " subscription_duration subscription_duration_days undate_duration \\\n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "233 1 month 31.0 61 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days \n", + "261 1 month 31.0 60 days \n", + "271 1 month 29.0 59 days \n", + "272 2 months 60.0 89 days \n", + "293 7 months 214.0 244 days \n", + "313 1 month 30.0 60 days \n", "\n", - " duration_diff \n", - "28 365.0 \n", - "70 365.0 \n", - "233 30.0 \n", - "234 27.0 \n", - "260 30.0 \n", - "261 29.0 \n", - "271 30.0 \n", - "272 29.0 \n", - "293 30.0 \n", - "313 30.0 " + " duration_diff \n", + "28 365.0 \n", + "70 365.0 \n", + "233 30.0 \n", + "234 27.0 \n", + "260 30.0 \n", + "261 29.0 \n", + "271 30.0 \n", + "272 29.0 \n", + "293 30.0 \n", + "313 30.0 " ] }, - "execution_count": 41, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1435,7 +1416,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1455,14 +1436,14 @@ "4 months 5\n", "5 months 3\n", "1 year 2\n", - "7 months 2\n", "8 months 2\n", + "7 months 2\n", "11 months 1\n", "10 months 1\n", "Name: count, dtype: int64" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1474,7 +1455,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1522,7 +1503,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " 30.0\n", " \n", " \n", @@ -1532,7 +1513,7 @@ " 1923-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1542,7 +1523,7 @@ " 1924-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1552,7 +1533,7 @@ " 1926-12\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1562,7 +1543,7 @@ " 1928-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1572,7 +1553,7 @@ " 1928-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1582,7 +1563,7 @@ " 1929-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1592,7 +1573,7 @@ " 1929-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1602,7 +1583,7 @@ " 1930-06\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1612,7 +1593,7 @@ " 1930-12\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1622,7 +1603,7 @@ " 1931-06\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1632,7 +1613,7 @@ " 1931-07\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1642,7 +1623,7 @@ " 1931-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " 30.0\n", " \n", " \n", @@ -1652,7 +1633,7 @@ " 1931-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1662,7 +1643,7 @@ " 1931-10\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1687,25 +1668,25 @@ "468 Elaine Cammett 1931-08 1931-09 1 month \n", "472 Frederick McWilliam 1931-09 1931-10 1 month \n", "\n", - " subscription_duration_days undate_duration duration_diff \n", - "233 31.0 61 30.0 \n", - "261 31.0 60 29.0 \n", - "271 29.0 59 30.0 \n", - "313 30.0 60 30.0 \n", - "354 29.0 59 30.0 \n", - "356 29.0 59 30.0 \n", - "393 31.0 60 29.0 \n", - "394 31.0 60 29.0 \n", - "430 31.0 60 29.0 \n", - "444 30.0 60 30.0 \n", - "462 31.0 60 29.0 \n", - "464 30.0 60 30.0 \n", - "466 31.0 61 30.0 \n", - "468 31.0 60 29.0 \n", - "472 30.0 60 30.0 " + " subscription_duration_days undate_duration duration_diff \n", + "233 31.0 61 days 30.0 \n", + "261 31.0 60 days 29.0 \n", + "271 29.0 59 days 30.0 \n", + "313 30.0 60 days 30.0 \n", + "354 29.0 59 days 30.0 \n", + "356 29.0 59 days 30.0 \n", + "393 31.0 60 days 29.0 \n", + "394 31.0 60 days 29.0 \n", + "430 31.0 60 days 29.0 \n", + "444 30.0 60 days 30.0 \n", + "462 31.0 60 days 29.0 \n", + "464 30.0 60 days 30.0 \n", + "466 31.0 61 days 30.0 \n", + "468 31.0 60 days 29.0 \n", + "472 30.0 60 days 30.0 " ] }, - "execution_count": 43, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1728,7 +1709,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1776,7 +1757,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1786,7 +1767,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1796,7 +1777,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " 27.0\n", " \n", " \n", @@ -1806,7 +1787,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " 30.0\n", " \n", " \n", @@ -1816,7 +1797,7 @@ " 1924-04\n", " 2 months\n", " 60.0\n", - " 89\n", + " 89 days\n", " 29.0\n", " \n", " \n", @@ -1826,7 +1807,7 @@ " 1926-10\n", " 7 months\n", " 214.0\n", - " 244\n", + " 244 days\n", " 30.0\n", " \n", " \n", @@ -1836,7 +1817,7 @@ " 1928-02\n", " 11 months\n", " 337.0\n", - " 365\n", + " 365 days\n", " 28.0\n", " \n", " \n", @@ -1846,7 +1827,7 @@ " 1927-10\n", " 3 months\n", " 92.0\n", - " 122\n", + " 122 days\n", " 30.0\n", " \n", " \n", @@ -1856,7 +1837,7 @@ " 1928-06\n", " 8 months\n", " 244.0\n", - " 273\n", + " 273 days\n", " 29.0\n", " \n", " \n", @@ -1866,7 +1847,7 @@ " 1928-04\n", " 3 months\n", " 91.0\n", - " 120\n", + " 120 days\n", " 29.0\n", " \n", " \n", @@ -1876,7 +1857,7 @@ " 1930-04\n", " 10 months\n", " 304.0\n", - " 333\n", + " 333 days\n", " 29.0\n", " \n", " \n", @@ -1886,7 +1867,7 @@ " 1930-04\n", " 3 months\n", " 90.0\n", - " 119\n", + " 119 days\n", " 29.0\n", " \n", " \n", @@ -1896,7 +1877,7 @@ " 1930-04\n", " 3 months\n", " 90.0\n", - " 119\n", + " 119 days\n", " 29.0\n", " \n", " \n", @@ -1906,7 +1887,7 @@ " 1930-09\n", " 8 months\n", " 243.0\n", - " 272\n", + " 272 days\n", " 29.0\n", " \n", " \n", @@ -1916,7 +1897,7 @@ " 1930-06\n", " 4 months\n", " 120.0\n", - " 149\n", + " 149 days\n", " 29.0\n", " \n", " \n", @@ -1941,42 +1922,42 @@ "412 Jacques Delmond 1930-01 1930-09 \n", "415 Loren Mozley 1930-02 1930-06 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \\\n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 \n", - "272 2 months 60.0 89 \n", - "293 7 months 214.0 244 \n", - "321 11 months 337.0 365 \n", - "331 3 months 92.0 122 \n", - "337 8 months 244.0 273 \n", - "349 3 months 91.0 120 \n", - "388 10 months 304.0 333 \n", - "408 3 months 90.0 119 \n", - "409 3 months 90.0 119 \n", - "412 8 months 243.0 272 \n", - "415 4 months 120.0 149 \n", + " subscription_duration subscription_duration_days undate_duration \\\n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days \n", + "272 2 months 60.0 89 days \n", + "293 7 months 214.0 244 days \n", + "321 11 months 337.0 365 days \n", + "331 3 months 92.0 122 days \n", + "337 8 months 244.0 273 days \n", + "349 3 months 91.0 120 days \n", + "388 10 months 304.0 333 days \n", + "408 3 months 90.0 119 days \n", + "409 3 months 90.0 119 days \n", + "412 8 months 243.0 272 days \n", + "415 4 months 120.0 149 days \n", "\n", - " duration_diff \n", - "28 365.0 \n", - "70 365.0 \n", - "234 27.0 \n", - "260 30.0 \n", - "272 29.0 \n", - "293 30.0 \n", - "321 28.0 \n", - "331 30.0 \n", - "337 29.0 \n", - "349 29.0 \n", - "388 29.0 \n", - "408 29.0 \n", - "409 29.0 \n", - "412 29.0 \n", - "415 29.0 " + " duration_diff \n", + "28 365.0 \n", + "70 365.0 \n", + "234 27.0 \n", + "260 30.0 \n", + "272 29.0 \n", + "293 30.0 \n", + "321 28.0 \n", + "331 30.0 \n", + "337 29.0 \n", + "349 29.0 \n", + "388 29.0 \n", + "408 29.0 \n", + "409 29.0 \n", + "412 29.0 \n", + "415 29.0 " ] }, - "execution_count": 44, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -2001,7 +1982,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2087,7 +2068,7 @@ "606 G. E. Pulsford --01-20 --01-28 8.0" ] }, - "execution_count": 32, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -2101,7 +2082,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2187,7 +2168,7 @@ "29908 Ann Samyn 1961-10-04 1962-03-21 168.0" ] }, - "execution_count": 33, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -2198,7 +2179,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2243,7 +2224,7 @@ " --01-07\n", " --01-13\n", " 6.0\n", - " 6\n", + " 6 days\n", " \n", " \n", " 603\n", @@ -2251,7 +2232,7 @@ " --01-12\n", " --01-20\n", " 8.0\n", - " 8\n", + " 8 days\n", " \n", " \n", " 604\n", @@ -2259,7 +2240,7 @@ " --01-16\n", " --02-16\n", " 31.0\n", - " 31\n", + " 31 days\n", " \n", " \n", " 605\n", @@ -2267,7 +2248,7 @@ " --01-19\n", " --01-24\n", " 5.0\n", - " 5\n", + " 5 days\n", " \n", " \n", " 606\n", @@ -2275,7 +2256,7 @@ " --01-20\n", " --01-28\n", " 8.0\n", - " 8\n", + " 8 days\n", " \n", " \n", " 607\n", @@ -2283,7 +2264,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " \n", " \n", " 608\n", @@ -2291,7 +2272,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " \n", " \n", " 609\n", @@ -2299,7 +2280,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " \n", " \n", " 610\n", @@ -2307,7 +2288,7 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " \n", " \n", " 611\n", @@ -2315,27 +2296,27 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " \n", " \n", "\n", "" ], "text/plain": [ - " member_names start_date end_date borrow_duration_days undate_duration\n", - "602 G. E. Pulsford --01-07 --01-13 6.0 6\n", - "603 G. E. Pulsford --01-12 --01-20 8.0 8\n", - "604 Robert D. Sage --01-16 --02-16 31.0 31\n", - "605 Gertrude Stein --01-19 --01-24 5.0 5\n", - "606 G. E. Pulsford --01-20 --01-28 8.0 8\n", - "607 Gertrude Stein --01-24 --03-20 55.0 55\n", - "608 Gertrude Stein --01-24 --03-20 55.0 55\n", - "609 Gertrude Stein --01-24 --03-20 55.0 55\n", - "610 Gertrude Stein --01-24 --05-30 126.0 126\n", - "611 Gertrude Stein --01-24 --05-30 126.0 126" + " member_names start_date end_date borrow_duration_days undate_duration\n", + "602 G. E. Pulsford --01-07 --01-13 6.0 6 days\n", + "603 G. E. Pulsford --01-12 --01-20 8.0 8 days\n", + "604 Robert D. Sage --01-16 --02-16 31.0 31 days\n", + "605 Gertrude Stein --01-19 --01-24 5.0 5 days\n", + "606 G. E. Pulsford --01-20 --01-28 8.0 8 days\n", + "607 Gertrude Stein --01-24 --03-20 55.0 55 days\n", + "608 Gertrude Stein --01-24 --03-20 55.0 55 days\n", + "609 Gertrude Stein --01-24 --03-20 55.0 55 days\n", + "610 Gertrude Stein --01-24 --05-30 126.0 126 days\n", + "611 Gertrude Stein --01-24 --05-30 126.0 126 days" ] }, - "execution_count": 34, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2348,7 +2329,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2394,7 +2375,7 @@ " --01-07\n", " --01-13\n", " 6.0\n", - " 6\n", + " 6 days\n", " 0.0\n", " \n", " \n", @@ -2403,7 +2384,7 @@ " --01-12\n", " --01-20\n", " 8.0\n", - " 8\n", + " 8 days\n", " 0.0\n", " \n", " \n", @@ -2412,7 +2393,7 @@ " --01-16\n", " --02-16\n", " 31.0\n", - " 31\n", + " 31 days\n", " 0.0\n", " \n", " \n", @@ -2421,7 +2402,7 @@ " --01-19\n", " --01-24\n", " 5.0\n", - " 5\n", + " 5 days\n", " 0.0\n", " \n", " \n", @@ -2430,7 +2411,7 @@ " --01-20\n", " --01-28\n", " 8.0\n", - " 8\n", + " 8 days\n", " 0.0\n", " \n", " \n", @@ -2439,7 +2420,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " 0.0\n", " \n", " \n", @@ -2448,7 +2429,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " 0.0\n", " \n", " \n", @@ -2457,7 +2438,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " 0.0\n", " \n", " \n", @@ -2466,7 +2447,7 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " 0.0\n", " \n", " \n", @@ -2475,7 +2456,7 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " 0.0\n", " \n", " \n", @@ -2483,45 +2464,45 @@ "" ], "text/plain": [ - " member_names start_date end_date borrow_duration_days \\\n", - "602 G. E. Pulsford --01-07 --01-13 6.0 \n", - "603 G. E. Pulsford --01-12 --01-20 8.0 \n", - "604 Robert D. Sage --01-16 --02-16 31.0 \n", - "605 Gertrude Stein --01-19 --01-24 5.0 \n", - "606 G. E. Pulsford --01-20 --01-28 8.0 \n", - "607 Gertrude Stein --01-24 --03-20 55.0 \n", - "608 Gertrude Stein --01-24 --03-20 55.0 \n", - "609 Gertrude Stein --01-24 --03-20 55.0 \n", - "610 Gertrude Stein --01-24 --05-30 126.0 \n", - "611 Gertrude Stein --01-24 --05-30 126.0 \n", + " member_names start_date end_date borrow_duration_days undate_duration \\\n", + "602 G. E. Pulsford --01-07 --01-13 6.0 6 days \n", + "603 G. E. Pulsford --01-12 --01-20 8.0 8 days \n", + "604 Robert D. Sage --01-16 --02-16 31.0 31 days \n", + "605 Gertrude Stein --01-19 --01-24 5.0 5 days \n", + "606 G. E. Pulsford --01-20 --01-28 8.0 8 days \n", + "607 Gertrude Stein --01-24 --03-20 55.0 55 days \n", + "608 Gertrude Stein --01-24 --03-20 55.0 55 days \n", + "609 Gertrude Stein --01-24 --03-20 55.0 55 days \n", + "610 Gertrude Stein --01-24 --05-30 126.0 126 days \n", + "611 Gertrude Stein --01-24 --05-30 126.0 126 days \n", "\n", - " undate_duration duration_diff \n", - "602 6 0.0 \n", - "603 8 0.0 \n", - "604 31 0.0 \n", - "605 5 0.0 \n", - "606 8 0.0 \n", - "607 55 0.0 \n", - "608 55 0.0 \n", - "609 55 0.0 \n", - "610 126 0.0 \n", - "611 126 0.0 " + " duration_diff \n", + "602 0.0 \n", + "603 0.0 \n", + "604 0.0 \n", + "605 0.0 \n", + "606 0.0 \n", + "607 0.0 \n", + "608 0.0 \n", + "609 0.0 \n", + "610 0.0 \n", + "611 0.0 " ] }, - "execution_count": 36, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# what's the difference between the two?\n", - "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration - row.borrow_duration_days, axis=1)\n", + "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration.astype(\"int\") - row.borrow_duration_days, axis=1)\n", "borrow_duration.head(10)" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2538,7 +2519,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 37, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2556,14 +2537,21 @@ "source": [ "Woohoo, everything matches! 🎉\n", "\n", + "* * * \n", + "\n", "In a previous run, there were two borrow events where the calculation did not match; this was due to an error in undate duration method when the start and end dates have unknown years and dates wrap to the following year (e.g., december to january), which has now been corrected.\n", "\n", - "**Note:** One of those events has a range (--06-07/--06-06) that looks like a data error in S&co, but the data matches what is [written on the lending card](https://shakespeareandco.princeton.edu/members/davet-yvonne/cards/cf96d38f-e651-491c-a575-131ea32ce425/#)." + "**Note:** One of those events has a range (--06-07/--06-06) that looks like a data error in S&co, but the data matches what is [written on the lending card](https://shakespeareandco.princeton.edu/members/davet-yvonne/cards/cf96d38f-e651-491c-a575-131ea32ce425/#).\n", + "\n", + "* * * \n", + "\n", + "In a preliminary implementation of the numpy datetime64 integration, the new earliest possible year turned out to be a leap year, resulting in the counts for Gertrude Stein's borrows from January to March to be off by one. This was corrected by adjusting the minimum year by one to ensure it is not a leap year.\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 28, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2572,54 +2560,10 @@ "id": "-Bq76gtDWljg", "outputId": "f1ee526d-b938-4cbf-e93c-c6c91c077ae7" }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
member_namesstart_dateend_dateborrow_duration_daysundate_durationduration_diff
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [member_names, start_date, end_date, borrow_duration_days, undate_duration, duration_diff]\n", - "Index: []" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "borrow_duration[borrow_duration.duration_diff != 0]" + "# Confirm that we have no mismatches\n", + "assert len(borrow_duration[borrow_duration.duration_diff != 0]) == 0" ] } ], @@ -2648,7 +2592,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/src/undate/undate.py b/src/undate/undate.py index cb7d30a..bf248ff 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -5,7 +5,7 @@ # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None from typing import Dict, Optional, Union -from undate.date import ONE_DAY, ONE_MONTH_MAX, ONE_YEAR, Date, DatePrecision +from undate.date import ONE_DAY, ONE_MONTH_MAX, ONE_YEAR, Date, DatePrecision, Timedelta from undate.dateformat.base import BaseDateFormat @@ -33,8 +33,8 @@ class Undate: # See https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units # It just so happens that int(2.5e16) is a leap year, which is a weird default, # so let's increase our lower bound by one year. - MIN_ALLOWABLE_YEAR = int(2.5e16) + 1 - MAX_ALLOWABLE_YEAR = int(-2.5e16) + MIN_ALLOWABLE_YEAR = int(-2.5e16) + 1 + MAX_ALLOWABLE_YEAR = int(2.5e16) def __init__( self, @@ -73,8 +73,8 @@ def __init__( else: # use the configured min/max allowable years if we # don't have any other bounds - max_year = self.MIN_ALLOWABLE_YEAR - min_year = self.MAX_ALLOWABLE_YEAR + min_year = self.MIN_ALLOWABLE_YEAR + max_year = self.MAX_ALLOWABLE_YEAR # if month is passed in as a string but completely unknown, # treat as none @@ -126,6 +126,9 @@ def __init__( if day is not None: min_day, max_day = self._missing_digit_minmax(day, min_day, max_day) + # TODO: special case, if we get a Feb 29 date with unknown year, + # must switch the min/max years to known leap years! + # for unknowns, assume smallest possible value for earliest and # largest valid for latest self.earliest = Date(min_year, min_month, min_day) @@ -290,7 +293,7 @@ def is_known(self, part: str) -> bool: def is_partially_known(self, part: str) -> bool: return isinstance(self.initial_values[part], str) - def duration(self): # -> np.timedelta64: + def duration(self) -> Timedelta: """What is the duration of this date? Calculate based on earliest and latest date within range, taking into account the precision of the date even if not all @@ -399,11 +402,11 @@ def __eq__(self, other) -> bool: # consider interval equal if both dates are equal return self.earliest == other.earliest and self.latest == other.latest - def duration(self): # -> np.timedelta64: + def duration(self) -> Timedelta: """Calculate the duration between two undates. :returns: A duration - :rtype: numpy.timedelta64 + :rtype: Timedelta """ # what is the duration of this date range? @@ -423,7 +426,7 @@ def duration(self): # -> np.timedelta64: # if we get a negative, we've wrapped from end of one year # to the beginning of the next; # recalculate assuming second date is in the subsequent year - if duration.astype("int") < 0: + if duration.days < 0: end = self.latest.earliest + ONE_YEAR duration = end - self.earliest.earliest diff --git a/tests/test_undate.py b/tests/test_undate.py index 9e81f97..c1d3792 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -1,3 +1,4 @@ +import calendar from datetime import date, timedelta import numpy as np @@ -113,6 +114,10 @@ def test_init_partially_known_day(self): uncertain_day = Undate(2024, 2, "2X") assert uncertain_day.latest.day == 29 + # TODO: handle leap day in an unknown year + # (currently causes an exception because min/max years are not leap years) + # Undate(None, 2, 29) + def test_init_invalid(self): with pytest.raises(ValueError): Undate("19xx") @@ -294,7 +299,7 @@ def test_duration(self): assert january_duration.days == 31 feb_duration = Undate(2022, 2).duration() assert feb_duration.days == 28 - # next leap year will be 2024 + # 2024 is a known leap year leapyear_feb_duration = Undate(2024, 2).duration() assert leapyear_feb_duration.days == 29 @@ -391,6 +396,9 @@ def test_not_eq(self): ) assert UndateInterval(Undate(2022, 5)) != UndateInterval(Undate(2022, 6)) + def test_min_year_non_leapyear(self): + assert not calendar.isleap(Undate.MIN_ALLOWABLE_YEAR) + def test_duration(self): week_duration = UndateInterval( Undate(2022, 11, 1), Undate(2022, 11, 7) @@ -415,14 +423,21 @@ def test_duration(self): month_noyear_duration = UndateInterval( Undate(None, 12, 1), Undate(None, 1, 1) ).duration() - assert month_noyear_duration.days == 31 + assert month_noyear_duration.days == 32 - # real case from Shakespeare and Company Project data; + # real world test cases from Shakespeare and Company Project data; # second date is a year minus one day in the future month_noyear_duration = UndateInterval( Undate(None, 6, 7), Undate(None, 6, 6) ).duration() assert month_noyear_duration.days == 365 + # durations that span february in unknown years should assume + # non-leap years + jan_march_duration = UndateInterval( + Undate(None, 2, 28), Undate(None, 3, 1) + ).duration() + assert jan_march_duration.days == 2 + # duration is not supported for open-ended intervals assert UndateInterval(Undate(2000), None).duration() == NotImplemented