Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated regular expressions, fixed a few bugs, added tests for v0.15.0 #639

Merged
merged 19 commits into from
Sep 8, 2019
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion arrow/arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1335,7 +1335,7 @@ def _get_iteration_params(cls, end, limit):
if end is None:

if limit is None:
raise Exception("one of 'end' or 'limit' is required")
raise ValueError("one of 'end' or 'limit' is required")

return cls.max, limit

Expand Down
26 changes: 0 additions & 26 deletions arrow/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from __future__ import absolute_import

import calendar
import warnings
from datetime import date, datetime, tzinfo
from time import struct_time

Expand All @@ -20,21 +19,6 @@
from arrow.util import is_timestamp, isstr


class ArrowParseWarning(DeprecationWarning):
"""Raised when arrow.get() is passed a string with no formats and matches incorrectly
on one of the default formats.

e.g.
arrow.get('blabla2016') -> <Arrow [2016-01-01T00:00:00+00:00]>
arrow.get('13/4/2045') -> <Arrow [2045-01-01T00:00:00+00:00]>

In version 0.15.0 this warning will become a ParserError.
"""


warnings.simplefilter("always", ArrowParseWarning)


class ArrowFactory(object):
""" A factory for generating :class:`Arrow <arrow.arrow.Arrow>` objects.

Expand Down Expand Up @@ -195,11 +179,6 @@ def get(self, *args, **kwargs):

# (str) -> parse.
elif isstr(arg):
warnings.warn(
"The .get() parsing method without a format string will parse more strictly in version 0.15.0."
"See https://github.com/crsmithdev/arrow/issues/612 for more details.",
ArrowParseWarning,
)
dt = parser.DateTimeParser(locale).parse_iso(arg)
return self.type.fromdatetime(dt, tz)

Expand Down Expand Up @@ -242,11 +221,6 @@ def get(self, *args, **kwargs):

# (str, format) -> parse.
elif isstr(arg_1) and (isstr(arg_2) or isinstance(arg_2, list)):
warnings.warn(
"The .get() parsing method with a format string will parse more strictly in version 0.15.0."
"See https://github.com/crsmithdev/arrow/issues/612 for more details.",
ArrowParseWarning,
)
dt = parser.DateTimeParser(locale).parse(args[0], args[1])
return self.type.fromdatetime(dt, tzinfo=tz)

Expand Down
156 changes: 82 additions & 74 deletions arrow/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,16 @@
from backports.functools_lru_cache import lru_cache # pragma: no cover


class ParserError(RuntimeError):
class ParserError(ValueError):
pass


# Raised by parse() when the input string does not match the format pattern.
# _parse_multiformat() catches only this subclass, so other ParserErrors
# (e.g. the day_of_year errors raised in _build_datetime()) propagate to
# the user with their message intact. Previously the try/except in
# _parse_multiformat() caught every ParserError and that message was lost.
class ParserMatchError(ParserError):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea

pass


Expand All @@ -25,18 +34,20 @@ class DateTimeParser(object):
)
_ESCAPE_RE = re.compile(r"\[[^\[\]]*\]")

_ONE_OR_MORE_DIGIT_RE = re.compile(r"\d+")
_ONE_OR_TWO_DIGIT_RE = re.compile(r"\d{1,2}")
_ONE_OR_TWO_OR_THREE_DIGIT_RE = re.compile(r"\d{1,3}")
_ONE_OR_MORE_DIGIT_RE = re.compile(r"\d+")
_TWO_DIGIT_RE = re.compile(r"\d{2}")
_THREE_DIGIT_RE = re.compile(r"\d{3}")
_FOUR_DIGIT_RE = re.compile(r"\d{4}")
_TWO_DIGIT_RE = re.compile(r"\d{2}")
_TZ_RE = re.compile(r"[+\-]?\d{2}:?(\d{2})?|Z")
_TZ_Z_RE = re.compile(r"([\+\-])(\d{2})(?:(\d{2}))?|Z")
_TZ_ZZ_RE = re.compile(r"([\+\-])(\d{2})(?:\:(\d{2}))?|Z")
_TZ_NAME_RE = re.compile(r"\w[\w+\-/]+")
# NOTE: the ^...$ anchors are required. Removing them to allow timestamps to be parsed out of natural
# language strings breaks cases like "15 Jul 2000" combined with a format list (see issue #447).
_TIMESTAMP_RE = re.compile(r"^\d+\.?\d+$")
# TODO: test timestamp thoroughly
_TIME_RE = re.compile(r"^(\d{2})(?:\:?(\d{2}))?(?:\:?(\d{2}))?(?:([\.\,])(\d+))?$")
systemcatch marked this conversation as resolved.
Show resolved Hide resolved

# TODO: test new regular expressions
_BASE_INPUT_RE_MAP = {
"YYYY": _FOUR_DIGIT_RE,
"YY": _TWO_DIGIT_RE,
Expand All @@ -56,8 +67,8 @@ class DateTimeParser(object):
"s": _ONE_OR_TWO_DIGIT_RE,
"X": _TIMESTAMP_RE,
"ZZZ": _TZ_NAME_RE,
"ZZ": _TZ_RE,
"Z": _TZ_RE,
"ZZ": _TZ_ZZ_RE,
"Z": _TZ_Z_RE,
"S": _ONE_OR_MORE_DIGIT_RE,
}

Expand Down Expand Up @@ -95,23 +106,14 @@ def __init__(self, locale="en_us", cache_size=0):
# TODO: since we support more than ISO-8601, we should rename this function
# IDEA: break into multiple functions
def parse_iso(self, datetime_string):
# strip leading and trailing whitespace
datetime_string = datetime_string.strip()

# TODO: add a flag to normalize whitespace (useful in logs, ref issue #421)
systemcatch marked this conversation as resolved.
Show resolved Hide resolved
has_space_divider = " " in datetime_string
has_t_divider = "T" in datetime_string

num_spaces = datetime_string.count(" ")
if has_space_divider and num_spaces != 1:
if has_space_divider and num_spaces != 1 or has_t_divider and num_spaces > 0:
systemcatch marked this conversation as resolved.
Show resolved Hide resolved
raise ParserError(
"Expected an ISO 8601-like string, but was given '{}' which contains multiple spaces. Try passing in a format string to resolve this.".format(
datetime_string
)
)

if has_t_divider and num_spaces > 0:
raise ParserError(
"Expected an ISO 8601-like string, but was given '{}' which contains \"T\" separator and spaces. Try passing in a format string to resolve this.".format(
"Expected an ISO 8601-like string, but was given '{}'. Try passing in a format string to resolve this.".format(
datetime_string
)
)
Expand All @@ -120,6 +122,7 @@ def parse_iso(self, datetime_string):
has_tz = False

# TODO: add tests for all the new formats, especially basic format
# NOTE: YYYYMM is omitted to avoid confusion with YYMMDD, which is no longer part of ISO 8601 but is still often used
systemcatch marked this conversation as resolved.
Show resolved Hide resolved
# date formats (ISO-8601 and others) to test against
formats = [
"YYYY-MM-DD",
Expand All @@ -141,53 +144,61 @@ def parse_iso(self, datetime_string):
]

if has_time:
# Z is ignored entirely because fromdatetime defaults to UTC in arrow.py
if datetime_string[-1] == "Z":
datetime_string = datetime_string[:-1]

if has_space_divider:
date_string, time_string = datetime_string.split(" ", 1)
else:
date_string, time_string = datetime_string.split("T", 1)

time_parts = re.split("[+-]", time_string, 1)
colon_count = time_parts[0].count(":")
time_parts = re.split(r"[\+\-Z]", time_string, 1, re.IGNORECASE)
# TODO: should we prevent mixing basic and extended formats? would need to ensure that dates, times, and timezones are in same format
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can leave that for a future release

time_colon_count = time_parts[0].count(":")

is_basic_time_format = time_colon_count == 0
tz_format = "Z"

# use 'ZZ' token instead since tz offset is present in non-basic format
if len(time_parts) == 2 and ":" in time_parts[1]:
tz_format = "ZZ"

time_components = self._TIME_RE.match(time_parts[0])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👏 regex to the rescue again!


is_basic_time_format = colon_count == 0
if time_components is None:
raise ParserError(
"Invalid time component provided. Please specify a format or provide a valid time component in the basic or extended ISO 8601 time format."
)

hours, minutes, seconds, subseconds_sep, subseconds = (
time_components.groups()
)

has_tz = len(time_parts) > 1
has_hours = len(time_parts[0]) == 2
has_minutes = colon_count == 1 or len(time_parts[0]) == 4
has_seconds = colon_count == 2 or len(time_parts[0]) == 6
has_subseconds = re.search("[.,]", time_parts[0])
has_tz = len(time_parts) == 2
has_minutes = minutes is not None
has_seconds = seconds is not None
has_subseconds = subseconds is not None

time_sep = "" if is_basic_time_format else ":"

if has_subseconds:
time_string = "HH:mm:ss{}S".format(has_subseconds.group())
time_string = "HH{time_sep}mm{time_sep}ss{subseconds_sep}S".format(
time_sep=time_sep, subseconds_sep=subseconds_sep
)
elif has_seconds:
time_string = "HH:mm:ss"
time_string = "HH{time_sep}mm{time_sep}ss".format(time_sep=time_sep)
elif has_minutes:
time_string = "HH:mm"
elif has_hours:
time_string = "HH"
time_string = "HH{time_sep}mm".format(time_sep=time_sep)
else:
raise ParserError(
"Invalid time component provided. Please specify a format or provide a time in the form 'HH:mm:ss.S', 'HH:mm:ss', 'HH:mm', or 'HH'."
)

if is_basic_time_format:
time_string = time_string.replace(":", "")
time_string = "HH"

if has_space_divider:
formats = ["{} {}".format(f, time_string) for f in formats]
else:
formats = ["{}T{}".format(f, time_string) for f in formats]

# TODO: reduce set of date formats for basic? test earlier?

if has_time and has_tz:
# Add "Z" to format strings to indicate to _parse_tokens
# that a timezone needs to be parsed
formats = ["{}Z".format(f) for f in formats]
# Add "Z" or "ZZ" to the format strings to indicate to
# _parse_token() that a timezone needs to be parsed
formats = ["{}{}".format(f, tz_format) for f in formats]

return self._parse_multiformat(datetime_string, formats)

Expand All @@ -200,7 +211,7 @@ def parse(self, datetime_string, fmt):

match = fmt_pattern_re.search(datetime_string)
if match is None:
raise ParserError(
raise ParserMatchError(
"Failed to match '{}' when parsing '{}'".format(fmt, datetime_string)
)

Expand Down Expand Up @@ -231,7 +242,7 @@ def _generate_pattern_re(self, fmt):

# Any number of S is the same as one.
# TODO: allow users to specify the number of digits to parse
escaped_fmt = re.sub("S+", "S", escaped_fmt)
escaped_fmt = re.sub(r"S+", "S", escaped_fmt)

escaped_data = re.findall(self._ESCAPE_RE, fmt)

Expand Down Expand Up @@ -276,13 +287,11 @@ def _generate_pattern_re(self, fmt):
# Reference: https://stackoverflow.com/q/14232931/3820660
starting_word_boundary = r"(?<![\S])"
ending_word_boundary = r"(?![\S])"
final_fmt_pattern = r"{starting_word_boundary}{final_fmt_pattern}Z?{ending_word_boundary}".format(
starting_word_boundary=starting_word_boundary,
final_fmt_pattern=final_fmt_pattern,
ending_word_boundary=ending_word_boundary,
bounded_fmt_pattern = r"{}{}{}".format(
starting_word_boundary, final_fmt_pattern, ending_word_boundary
)

return tokens, re.compile(final_fmt_pattern, flags=re.IGNORECASE)
return tokens, re.compile(bounded_fmt_pattern, flags=re.IGNORECASE)

def _parse_token(self, token, value, parts):

Expand Down Expand Up @@ -335,7 +344,7 @@ def _parse_token(self, token, value, parts):
parts["microsecond"] = int(value[:6]) + rounding

elif token == "X":
parts["timestamp"] = int(value)
parts["timestamp"] = float(value)

elif token in ["ZZZ", "ZZ", "Z"]:
parts["tzinfo"] = TzinfoParser.parse(value)
Expand All @@ -351,33 +360,30 @@ def _build_datetime(parts):

timestamp = parts.get("timestamp")

if timestamp:
tz_utc = tz.tzutc()
return datetime.fromtimestamp(timestamp, tz=tz_utc)
if timestamp is not None:
systemcatch marked this conversation as resolved.
Show resolved Hide resolved
return datetime.fromtimestamp(timestamp, tz=tz.tzutc())

day_of_year = parts.get("day_of_year")

if day_of_year:
if day_of_year is not None:
year = parts.get("year")
month = parts.get("month")
if year is None:
raise ParserError(
"Year component is required with the DDD and DDDD tokens"
"Year component is required with the DDD and DDDD tokens."
)

if month is not None:
raise ParserError(
"Month component is not allowed with the DDD and DDDD tokens"
"Month component is not allowed with the DDD and DDDD tokens."
)

date_string = "{}-{}".format(year, day_of_year)
try:
dt = datetime.strptime(date_string, "%Y-%j")
except ValueError:
raise ParserError(
"Expected a valid day of year, but received '{}'".format(
day_of_year
)
"The provided day of year '{}' is invalid.".format(day_of_year)
)

parts["year"] = dt.year
Expand Down Expand Up @@ -411,12 +417,12 @@ def _parse_multiformat(self, string, formats):
try:
_datetime = self.parse(string, fmt)
break
except ParserError:
except ParserMatchError:
pass

if _datetime is None:
raise ParserError(
"Could not match input '{}' to any of the formats provided: {}".format(
"Could not match input '{}' to any of the following formats: {}".format(
string, ", ".join(formats)
)
)
Expand All @@ -429,23 +435,23 @@ def _choice_re(choices, flags=0):


class TzinfoParser(object):

_TZINFO_RE = re.compile(r"([+\-])?(\d\d):?(\d\d)?")
# TODO: test against full timezone DB
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's a good idea

_TZINFO_RE = re.compile(r"^([\+\-])?(\d{2})(?:\:?(\d{2}))?$")
systemcatch marked this conversation as resolved.
Show resolved Hide resolved

@classmethod
def parse(cls, string):
def parse(cls, tzinfo_string):

tzinfo = None

if string == "local":
if tzinfo_string == "local":
tzinfo = tz.tzlocal()

elif string in ["utc", "UTC", "Z"]:
elif tzinfo_string in ["utc", "UTC", "Z"]:
tzinfo = tz.tzutc()

else:

iso_match = cls._TZINFO_RE.match(string)
iso_match = cls._TZINFO_RE.match(tzinfo_string)

if iso_match:
sign, hours, minutes = iso_match.groups()
Expand All @@ -459,9 +465,11 @@ def parse(cls, string):
tzinfo = tz.tzoffset(None, seconds)

else:
tzinfo = tz.gettz(string)
tzinfo = tz.gettz(tzinfo_string)

if tzinfo is None:
raise ParserError('Could not parse timezone expression "{}"'.format(string))
raise ParserError(
'Could not parse timezone expression "{}"'.format(tzinfo_string)
)

return tzinfo
Loading