Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance and move ISO-8601 parser to coding.times #9899

Merged
merged 21 commits into from
Dec 31, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ Internal Changes
within ``as_compatible_data``. This is consistent with how lists of these objects
will be converted (:pull:`9900`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
- Move ISO-8601 parser from coding.cftimeindex to coding.times to make it available there (prevents circular import) (:pull:`9899`).
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.


.. _whats-new.2024.11.0:

Expand Down
9 changes: 9 additions & 0 deletions properties/test_encode_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
# isort: split

import hypothesis.extra.numpy as npst
import hypothesis.strategies as st
import numpy as np
from hypothesis import given

import xarray as xr
from xarray.coding.times import _parse_iso8601_without_reso
from xarray.testing.strategies import variables


Expand Down Expand Up @@ -43,3 +45,10 @@ def test_CFScaleOffset_coder_roundtrip(original) -> None:
coder = xr.coding.variables.CFScaleOffsetCoder()
roundtripped = coder.decode(coder.encode(original))
xr.testing.assert_identical(original, roundtripped)


# TODO: add cftime.datetime
@given(dt=st.datetimes())
def test_iso8601_decode(dt):
    """Round-trip property: parsing ``dt.isoformat()`` must reproduce ``dt``."""
    # The parser is handed the concrete class of the generated value so the
    # result is built with the same date type that produced the string.
    parsed = _parse_iso8601_without_reso(type(dt), dt.isoformat())
    assert dt == parsed
3 changes: 2 additions & 1 deletion xarray/coding/cftime_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@
import pandas as pd
from packaging.version import Version

from xarray.coding.cftimeindex import CFTimeIndex, _parse_iso8601_with_reso
from xarray.coding.cftimeindex import CFTimeIndex
from xarray.coding.times import (
_is_standard_calendar,
_parse_iso8601_with_reso,
_should_cftime_be_used,
convert_time_or_go_back,
format_cftime_datetime,
Expand Down
73 changes: 2 additions & 71 deletions xarray/coding/cftimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
from __future__ import annotations

import math
import re
import warnings
from datetime import timedelta
from typing import TYPE_CHECKING, Any
Expand All @@ -53,6 +52,8 @@

from xarray.coding.times import (
_STANDARD_CALENDARS,
_parse_iso8601_with_reso,
_parse_iso8601_without_reso,
cftime_to_nptime,
infer_calendar_name,
)
Expand All @@ -78,71 +79,6 @@
OUT_OF_BOUNDS_TIMEDELTA_ERRORS = (OverflowError,)


def named(name, pattern):
    """Wrap ``pattern`` in a named capturing group called ``name``."""
    return "(?P<" + name + ">" + pattern + ")"


def optional(x):
    """Wrap ``x`` in an optional non-capturing group."""
    return "(?:" + x + ")?"


def trailing_optional(xs):
    """Chain the regex fragments in ``xs`` so that each one may only appear
    if every fragment before it is present.

    The first fragment is mandatory; every later fragment is nested inside
    an optional group together with everything that follows it. An empty
    sequence yields the empty string.
    """
    if not xs:
        return ""
    return xs[0] + optional(trailing_optional(xs[1:]))


def build_pattern(date_sep=r"\-", datetime_sep=r"T", time_sep=r"\:", micro_sep=r"\."):
    """Build an anchored regex for ISO-8601-like datetime strings.

    The pattern has the named groups ``year``, ``month``, ``day``, ``hour``,
    ``minute``, ``second`` and ``microsecond``. Only ``year`` is mandatory;
    each later component is optional but requires all earlier ones.

    Parameters
    ----------
    date_sep, datetime_sep, time_sep, micro_sep : str
        Regex fragments placed before the month/day, hour, minute/second and
        microsecond groups respectively. They are inserted verbatim into the
        pattern, so regex metacharacters must be escaped by the caller.

    Returns
    -------
    str
        The regex source, anchored with ``^`` and ``$``.
    """
    pieces = [
        (None, "year", r"\d{4}"),
        (date_sep, "month", r"\d{2}"),
        (date_sep, "day", r"\d{2}"),
        (datetime_sep, "hour", r"\d{2}"),
        (time_sep, "minute", r"\d{2}"),
        (time_sep, "second", r"\d{2}"),
        # micro_sep defaults to an escaped literal dot: an unescaped "." would
        # match any character and silently swallow a stray digit or letter
        # between the seconds and the fractional part.
        (micro_sep, "microsecond", r"\d{1,6}"),
    ]
    pattern_list = [
        (sep or "") + named(name, sub_pattern) for sep, name, sub_pattern in pieces
    ]
    # TODO: allow timezone offsets?
    return "^" + trailing_optional(pattern_list) + "$"


# Pre-built patterns; parse_iso8601_like tries them in the order of _PATTERNS.
# Basic form: no separators inside the date or inside the time.
_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
# Extended form: default separators, with "T" between date and time.
_EXTENDED_PATTERN = build_pattern()
# Like the extended form, but with a space between date and time.
_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]


def parse_iso8601_like(datetime_string):
    """Match ``datetime_string`` against the known datetime patterns.

    Returns the ``groupdict()`` of the first pattern in ``_PATTERNS`` that
    matches; components absent from the string map to ``None``.

    Raises
    ------
    ValueError
        If none of the patterns matches.
    """
    attempts = (re.match(candidate, datetime_string) for candidate in _PATTERNS)
    found = next((hit for hit in attempts if hit), None)
    if found is None:
        raise ValueError(
            f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
        )
    return found.groupdict()


def _parse_iso8601_with_reso(date_type, timestr):
    """Parse ``timestr`` into a ``date_type`` instance plus its resolution.

    Returns a tuple ``(date, resolution)``. ``date`` is ``date_type(1, 1, 1)``
    with every component found in the string replaced; ``resolution`` is the
    name of the last (finest) component that was present.
    """
    # Return value is discarded; presumably this raises when cftime cannot be
    # imported -- confirm against attempt_import.
    attempt_import("cftime")

    default = date_type(1, 1, 1)
    fields = parse_iso8601_like(timestr)
    components = {}

    for name in ("year", "month", "day", "hour", "minute", "second", "microsecond"):
        text = fields.get(name, None)
        if text is None:
            continue
        if name == "microsecond":
            # Scale the fractional digits up to a microsecond count,
            # e.g. "5" -> 500000.
            text = 10 ** (6 - len(text)) * int(text)
        components[name] = int(text)
        resolution = name
    return default.replace(**components), resolution


def _parsed_string_to_bounds(date_type, resolution, parsed):
"""Generalization of
pandas.tseries.index.DatetimeIndex._parsed_string_to_bounds
Expand Down Expand Up @@ -811,11 +747,6 @@ def is_leap_year(self):
return func(self.year, calendar=self.calendar)


def _parse_iso8601_without_reso(date_type, datetime_str):
    """Parse ``datetime_str`` into a ``date_type`` instance, dropping the resolution."""
    return _parse_iso8601_with_reso(date_type, datetime_str)[0]


def _parse_array_of_cftime_strings(strings, date_type):
"""Create a numpy array from an array of strings.

Expand Down
70 changes: 70 additions & 0 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,76 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
return delta_units, ref_date


def named(name, pattern):
    """Wrap ``pattern`` in a named capturing group called ``name``."""
    return "(?P<" + name + ">" + pattern + ")"


def optional(x):
    """Wrap ``x`` in an optional non-capturing group."""
    return "(?:" + x + ")?"


def trailing_optional(xs):
    """Chain the regex fragments in ``xs`` so that each one may only appear
    if every fragment before it is present.

    The first fragment is mandatory; every later fragment is nested inside
    an optional group together with everything that follows it. An empty
    sequence yields the empty string.
    """
    if not xs:
        return ""
    return xs[0] + optional(trailing_optional(xs[1:]))


def build_pattern(date_sep=r"\-", datetime_sep=r"T", time_sep=r"\:", micro_sep=r"\."):
    """Build an anchored regex for ISO-8601-like datetime strings.

    The pattern has the named groups ``year``, ``month``, ``day``, ``hour``,
    ``minute``, ``second`` and ``microsecond``. Only ``year`` is mandatory;
    each later component is optional but requires all earlier ones. The year
    may carry a leading sign and have four or five digits.

    Parameters
    ----------
    date_sep, datetime_sep, time_sep, micro_sep : str
        Regex fragments placed before the month/day, hour, minute/second and
        microsecond groups respectively. They are inserted verbatim into the
        pattern, so regex metacharacters must be escaped by the caller.

    Returns
    -------
    str
        The regex source, anchored with ``^`` and ``$``.
    """
    pieces = [
        (None, "year", r"[+-]?\d{4,5}"),
        (date_sep, "month", r"\d{2}"),
        (date_sep, "day", r"\d{2}"),
        (datetime_sep, "hour", r"\d{2}"),
        (time_sep, "minute", r"\d{2}"),
        (time_sep, "second", r"\d{2}"),
        # micro_sep defaults to an escaped literal dot: an unescaped "." would
        # match any character and silently swallow a stray digit or letter
        # between the seconds and the fractional part.
        (micro_sep, "microsecond", r"\d{1,6}"),
    ]
    pattern_list = [
        (sep or "") + named(name, sub_pattern) for sep, name, sub_pattern in pieces
    ]
    # TODO: allow timezone offsets?
    return "^" + trailing_optional(pattern_list) + "$"


# Pre-built patterns; parse_iso8601_like tries them in the order of _PATTERNS.
# Basic form: no separators inside the date or inside the time.
_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
# Extended form: default separators, with "T" between date and time.
_EXTENDED_PATTERN = build_pattern()
# Like the extended form, but with a space between date and time.
_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]


def parse_iso8601_like(datetime_string):
    """Match ``datetime_string`` against the known datetime patterns.

    Returns the ``groupdict()`` of the first pattern in ``_PATTERNS`` that
    matches; components absent from the string map to ``None``.

    Raises
    ------
    ValueError
        If none of the patterns matches.
    """
    attempts = (re.match(candidate, datetime_string) for candidate in _PATTERNS)
    found = next((hit for hit in attempts if hit), None)
    if found is None:
        raise ValueError(
            f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
        )
    return found.groupdict()


def _parse_iso8601_with_reso(date_type, timestr):
    """Parse ``timestr`` into a ``date_type`` instance plus its resolution.

    Returns a tuple ``(date, resolution)``. ``date`` is ``date_type(1, 1, 1)``
    with every component found in the string replaced. ``resolution`` is the
    name of the last (finest) component that was present, except that a
    fractional part of at most three digits is reported as ``"millisecond"``
    rather than ``"microsecond"``.
    """
    default = date_type(1, 1, 1)
    fields = parse_iso8601_like(timestr)
    components = {}

    for name in ("year", "month", "day", "hour", "minute", "second", "microsecond"):
        text = fields.get(name, None)
        if text is None:
            continue
        resolution = name
        if name == "microsecond":
            if len(text) <= 3:
                resolution = "millisecond"
            # Scale the fractional digits up to a microsecond count,
            # e.g. "5" -> 500000.
            text = 10 ** (6 - len(text)) * int(text)
        components[name] = int(text)
    return default.replace(**components), resolution


def _parse_iso8601_without_reso(date_type, datetime_str):
    """Parse ``datetime_str`` into a ``date_type`` instance, dropping the resolution."""
    return _parse_iso8601_with_reso(date_type, datetime_str)[0]


def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
# same as _unpack_netcdf_time_units but finalizes ref_date for
# processing in encode_cf_datetime
Expand Down
30 changes: 23 additions & 7 deletions xarray/tests/test_cftimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@
from xarray.coding.cftimeindex import (
CFTimeIndex,
_parse_array_of_cftime_strings,
_parse_iso8601_with_reso,
_parsed_string_to_bounds,
assert_all_valid_date_type,
)
from xarray.coding.times import (
_parse_iso8601_with_reso,
parse_iso8601_like,
)
from xarray.tests import (
Expand Down Expand Up @@ -132,16 +134,30 @@ def date_dict(
list(ISO8601_LIKE_STRING_TESTS.values()),
ids=list(ISO8601_LIKE_STRING_TESTS.keys()),
)
def test_parse_iso8601_like(string, expected):
result = parse_iso8601_like(string)
@pytest.mark.parametrize("five_digit_year", [False, True], ids=["4Y", "5Y"])
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize("sign", ["", "+", "-"], ids=["None", "plus", "minus"])
def test_parse_iso8601_like(five_digit_year, sign, string, expected):
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
pre = "1" if five_digit_year else ""
datestring = sign + pre + string
result = parse_iso8601_like(datestring)
expected = expected.copy()
expected.update(year=sign + pre + expected["year"])
assert result == expected

if result["microsecond"] is None:
# check malformed single digit addendum
# this check is only performed when at least "hour" is given,
# like "1999010101", where a single added digit should raise.
# For "1999" (year), "199901" (month) and "19990101" (day),
# a single added digit would just be interpreted
# as part of a 5-digit year.
if result["microsecond"] is None and result["hour"] is not None:
with pytest.raises(ValueError):
parse_iso8601_like(string + "3")
if result["second"] is None:
parse_iso8601_like(datestring + "3")

# check malformed floating point addendum
if result["second"] is None or result["microsecond"] is not None:
with pytest.raises(ValueError):
parse_iso8601_like(string + ".3")
parse_iso8601_like(datestring + ".3")


_CFTIME_CALENDARS = [
Expand Down
Loading