From e3126c77b24e6d6132be3faa169c3815bd011144 Mon Sep 17 00:00:00 2001 From: Jad Chaar Date: Sun, 7 Jul 2019 23:54:03 -0400 Subject: [PATCH 1/2] Fixed a number of parsing issues with a couple of regex tweaks and additions --- arrow/parser.py | 114 ++++++++++-------------------------------- tests/parser_tests.py | 73 +++++++++++++-------------- 2 files changed, 60 insertions(+), 127 deletions(-) diff --git a/arrow/parser.py b/arrow/parser.py index cec107171..d04ecabb2 100644 --- a/arrow/parser.py +++ b/arrow/parser.py @@ -47,6 +47,7 @@ class DateTimeParser(object): _TWO_DIGIT_RE = re.compile(r"\d{2}") _TZ_RE = re.compile(r"[+\-]?\d{2}:?(\d{2})?|Z") _TZ_NAME_RE = re.compile(r"\w[\w+\-/]+") + _TIMESTAMP_RE = re.compile(r"\d+") _BASE_INPUT_RE_MAP = { "YYYY": _FOUR_DIGIT_RE, @@ -63,7 +64,7 @@ class DateTimeParser(object): "m": _ONE_OR_TWO_DIGIT_RE, "ss": _TWO_DIGIT_RE, "s": _ONE_OR_TWO_DIGIT_RE, - "X": re.compile(r"\d+"), + "X": _TIMESTAMP_RE, "ZZZ": _TZ_NAME_RE, "ZZ": _TZ_RE, "Z": _TZ_RE, @@ -101,9 +102,14 @@ def __init__(self, locale="en_us", cache_size=0): self._generate_pattern_re ) + # TODO: since we support more than ISO-8601, we should rename this function def parse_iso(self, string): + # TODO: account for more than 1 space like arrow.get(" 2016") + # string = string.strip() - has_time = "T" in string or " " in string.strip() + has_space_divider = " " in string and len(string.strip().split(" ")) == 2 + + has_time = "T" in string or has_space_divider space_divider = " " in string.strip() has_tz = False @@ -120,11 +126,11 @@ def parse_iso(self, string): time_parts = re.split("[+-]", time_string, 1) colon_count = time_parts[0].count(":") - # TODO "20160504T010203Z" parses incorectly, time part is HH only, due to Z changing len + # TODO "20160504T010203Z" parses incorrectly, time part is HH only, due to Z changing len is_basic_time_format = colon_count == 0 has_tz = len(time_parts) > 1 - has_hours = colon_count == 0 or len(time_string) == 2 + has_hours = len(time_string) == 2 has_minutes = colon_count == 1 or len(time_string) == 4 has_seconds = colon_count == 2 or len(time_string) == 6 has_subseconds = re.search("[.,]", time_parts[0]) @@ -146,7 +152,7 @@ def parse_iso(self, string): # IDEA reduced set of date formats for basic - # TODO: add tests for all the new formats + # TODO: add tests for all the new formats, especially basic format # required date formats to test against formats = [ "YYYY-MM-DD", @@ -227,6 +233,21 @@ def _generate_pattern_re(self, fmt): if i < len(escaped_data): final_fmt_pattern += escaped_data[i][1:-1] + # Wrap final_fmt_pattern in a custom word boundary to strictly + # match the formatting pattern and filter out date and time formats + # that include junk such as: blah1998-09-12 blah, blah 1998-09-12blah, + # blah1998-09-12blah. The custom word boundary matches every character + # that is not a whitespace character to allow for searching for a date + # and time string in a natural language sentence. Therefore, searching + # for a string of the form YYYY-MM-DD in "blah 1998-09-12 blah" will + # work properly. + # Reference: https://stackoverflow.com/q/14232931/3820660 + starting_word_boundary = r"(? - # FIXME arrow.get("Call 01-02-03 on 79-01-01 12:05:10", "YY-MM-DD HH:mm:ss") warns incorrectly - # FIXME arrow.get("79-01-01 12:05:10", "YY-MM-DD HH:mm:ss") warns incorrectly - # IDEA test for whitespace on either side of match? - if "YY" in fmt_tokens and match.start != 0 or match.end() != len(string): - warnings.warn( - "Parser loosely matched {fmt} on '{string}', in version " - "0.15.0 this will raise a ParserError.".format( - fmt=fmt, string=string - ), - category=GetParseWarning, - ) - raise ParserError - - if fmt == "YYYY": - # accounts for arrow.get('05/02/2017', ['YYYY', 'MM/DD/YYYY']) - if match.start() != 0 or match.end() != len(string): - warnings.warn( - "Parser loosely matched {fmt} on '{string}', in version " - "0.15.0 this will raise a ParserError.".format( - fmt=fmt, string=string - ), - category=GetParseWarning, - ) - raise ParserError - - # Fixes bug where "15/01/2019" matches to "D/M/YY" - # arrow.get("15/01/2019", ["D/M/YY", "D/M/YYYY"]) - if "YY" in fmt_tokens and match.end() != len(string): - raise ParserError - - # TODO: talk to Chris about these conditionals - # if string[-1] == "Z" and match.end() != len(string) - 1: - # # TODO: add an exception message - # raise ParserError - # - # if string[-1] != "Z" and match.end() != len(string): - # # TODO: add an exception message - # raise ParserError - # - # if match.start() != 0: - # # TODO: add an exception message - # raise ParserError - - # if ("YY" in fmt_tokens or "YYYY" in fmt_tokens) and (match.end() != len(string) or match.start() != 0): - # # TODO: add an exception message - # raise ParserError - parts = {} for token in fmt_tokens: if token == "Do": diff --git a/tests/parser_tests.py b/tests/parser_tests.py index deec9a09d..f1cd3229f 100644 --- a/tests/parser_tests.py +++ b/tests/parser_tests.py @@ -136,23 +136,6 @@ def test_YY_and_YYYY_format_list(self): datetime(2019, 1, 15, 4, 5, 6, 789120, tzinfo=tz.tzutc()), ) - def test_long_year_input(self): - - # TODO: ask Chris if this should throw a ParserError - # Pendulum does not throw an error - self.assertEqual( - self.parser.parse("09 January 123456789101112", "DD MMMM YYYY"), - datetime(1234, 1, 9), - ) - - # Pendulum throws an error - with self.assertRaises(ParserError): - self.parser.parse("123456789101112 09 January", "YYYY DD MMMM") - - # Pendulum throws an error - with self.assertRaises(ParserError): - self.parser.parse("68096653015/01/19", "YY/M/DD") - class DateTimeParserParseTests(Chai): def setUp(self): @@ -357,6 +340,17 @@ def test_parse_subsecond_rounding(self): self.assertEqual(self.parser.parse(string, format), self.expected) self.assertEqual(self.parser.parse_iso(string), self.expected) + # Regression tests for issue #560 + def test_parse_long_year(self): + with self.assertRaises(ParserError): + self.parser.parse("09 January 123456789101112", "DD MMMM YYYY"), + + with self.assertRaises(ParserError): + self.parser.parse("123456789101112 09 January", "YYYY DD MMMM") + + with self.assertRaises(ParserError): + self.parser.parse("68096653015/01/19", "YY/M/DD") + class DateTimeParserRegexTests(Chai): def setUp(self): @@ -653,31 +647,32 @@ def test_isoformat(self): self.assertEqual(self.parser.parse_iso(dt.isoformat()), dt) - def test_iso8601_string_with_extra_words_at_start_and_end(self): + def test_parse_with_extra_words_at_start_and_end(self): + input_format_pairs = [ + ("blah2016", "YYYY"), + ("blah2016blah", "YYYY"), + ("2016blah", "YYYY"), + ("2016-05blah", "YYYY-MM"), + ("2016-05-16blah", "YYYY-MM-DD"), + ("2016-05T04:05:06.789120blah", "YYYY-MM-DDThh:mm:ss.S"), + ("2016-05T04:05:06.789120ZblahZ", "YYYY-MM-DDThh:mm:ss.SZ"), + ("2016-05T04:05:06.789120Zblah", "YYYY-MM-DDThh:mm:ss.SZ"), + ("2016-05T04:05:06.789120blahZ", "YYYY-MM-DDThh:mm:ss.SZ"), + ] - with self.assertRaises(ParserError): - self.parser.parse_iso("2016-05blah") + for pair in input_format_pairs: + with self.assertRaises(ParserError): + self.parser.parse_iso(pair[0]) - with self.assertRaises(ParserError): - self.parser.parse_iso("2016-05-16blah") + with self.assertRaises(ParserError): + self.parser.parse(pair[0], pair[1]) - with self.assertRaises(ParserError): - self.parser.parse_iso("2016-05T04:05:06.78912ZblahZ") - - with self.assertRaises(ParserError): - self.parser.parse_iso("2016-05T04:05:06.78912Zblah") - - with self.assertRaises(ParserError): - self.parser.parse_iso("2016-05T04:05:06.78912blahZ") - - with self.assertRaises(ParserError): - self.parser.parse_iso("blah2016") - - with self.assertRaises(ParserError): - self.parser.parse_iso("2016blah") - - with self.assertRaises(ParserError): - self.parser.parse_iso("blah2016blah") + # Spaces surrounding the parsable date are ok because we + # allow the parsing of natural language input + self.assertEqual(self.parser.parse_iso("blah 2016 blah"), datetime(2016, 1, 1)) + self.assertEqual( + self.parser.parse("blah 2016 blah", "YYYY"), datetime(2016, 1, 1) + ) class TzinfoParserTests(Chai): From f77e6984ffe5b088fa30b8226d6590bbdc0bc1ba Mon Sep 17 00:00:00 2001 From: Jad Chaar Date: Mon, 8 Jul 2019 01:09:55 -0400 Subject: [PATCH 2/2] Added unit tests and added cleanup/validation to datetimte string in parse_iso --- arrow/parser.py | 129 ++++++++++++++++++++++-------------------- tests/parser_tests.py | 74 +++++++++++++++++++++--- 2 files changed, 135 insertions(+), 68 deletions(-) diff --git a/arrow/parser.py b/arrow/parser.py index d04ecabb2..d9ac5bee3 100644 --- a/arrow/parser.py +++ b/arrow/parser.py @@ -103,22 +103,51 @@ def __init__(self, locale="en_us", cache_size=0): ) # TODO: since we support more than ISO-8601, we should rename this function - def parse_iso(self, string): - # TODO: account for more than 1 space like arrow.get(" 2016") - # string = string.strip() + def parse_iso(self, datetime_string): + # TODO: talk to Chris about this => the below space divider checks + # are not really necessary thanks to the new regex changes, but I think + # it is good to include them to provide better error messages. - has_space_divider = " " in string and len(string.strip().split(" ")) == 2 + # strip leading and trailing whitespace + datetime_string = datetime_string.strip() - has_time = "T" in string or has_space_divider - space_divider = " " in string.strip() + has_space_divider = " " in datetime_string + num_space_dividers = len(datetime_string.split(" ")) + if has_space_divider and num_space_dividers != 2: + raise ParserError( + "Expected 1 space divider, but was given {}. Try passing in a format string to resolve this.".format( + num_space_dividers + ) + ) + + has_time = has_space_divider or "T" in datetime_string has_tz = False + # TODO: add tests for all the new formats, especially basic format + # required date formats to test against + formats = [ + "YYYY-MM-DD", + "YYYY-M-DD", + "YYYY-M-D", + "YYYY/MM/DD", + "YYYY/M/DD", + "YYYY/M/D", + "YYYY.MM.DD", + "YYYY.M.DD", + "YYYY.M.D", + "YYYYMMDD", + "YYYY-MM", + "YYYY/MM", + "YYYY.MM", + "YYYY", + ] + if has_time: - if space_divider: - date_string, time_string = string.split(" ", 1) + if has_space_divider: + date_string, time_string = datetime_string.split(" ", 1) else: - date_string, time_string = string.split("T", 1) + date_string, time_string = datetime_string.split("T", 1) # TODO: understand why we are not accounting for Z directly # currently Z is ignored entirely but fromdatetime defaults to UTC, see arrow.py L196 @@ -150,40 +179,45 @@ def parse_iso(self, string): if is_basic_time_format: time_string = time_string.replace(":", "") - # IDEA reduced set of date formats for basic + if has_space_divider: + formats = ["{} {}".format(f, time_string) for f in formats] + else: + formats = ["{}T{}".format(f, time_string) for f in formats] - # TODO: add tests for all the new formats, especially basic format - # required date formats to test against - formats = [ - "YYYY-MM-DD", - "YYYY-M-DD", - "YYYY-M-D", - "YYYY/MM/DD", - "YYYY/M/DD", - "YYYY/M/D", - "YYYY.MM.DD", - "YYYY.M.DD", - "YYYY.M.D", - "YYYYMMDD", - "YYYY-MM", - "YYYY/MM", - "YYYY.MM", - "YYYY", - # "YY", this is not a good format to try by default? - ] - - if has_time: - formats = ["{}T{}".format(f, time_string) for f in formats] + # TODO: reduce set of date formats for basic? if has_time and has_tz: # Add "Z" to format strings to indicate to _parse_tokens # that a timezone needs to be parsed formats = ["{}Z".format(f) for f in formats] - if space_divider: - formats = [item.replace("T", " ", 1) for item in formats] + # TODO: make thrown error messages less cryptic and more informative + return self._parse_multiformat(datetime_string, formats, True) + + def parse(self, datetime_string, fmt, from_parse_iso=False): + + if isinstance(fmt, list): + return self._parse_multiformat(datetime_string, fmt) + + fmt_tokens, fmt_pattern_re = self._generate_pattern_re(fmt) - return self._parse_multiformat(string, formats, True) + match = fmt_pattern_re.search(datetime_string) + if match is None: + raise ParserError( + "Failed to match '{}' when parsing '{}'".format( + fmt_pattern_re.pattern, datetime_string + ) + ) + + parts = {} + for token in fmt_tokens: + if token == "Do": + value = match.group("value") + else: + value = match.group(token) + self._parse_token(token, value, parts) + + return self._build_datetime(parts) def _generate_pattern_re(self, fmt): @@ -250,31 +284,6 @@ def _generate_pattern_re(self, fmt): return tokens, re.compile(final_fmt_pattern, flags=re.IGNORECASE) - def parse(self, string, fmt, from_parse_iso=False): - - if isinstance(fmt, list): - return self._parse_multiformat(string, fmt) - - fmt_tokens, fmt_pattern_re = self._generate_pattern_re(fmt) - - match = fmt_pattern_re.search(string) - if match is None: - raise ParserError( - "Failed to match '{}' when parsing '{}'".format( - fmt_pattern_re.pattern, string - ) - ) - - parts = {} - for token in fmt_tokens: - if token == "Do": - value = match.group("value") - else: - value = match.group(token) - self._parse_token(token, value, parts) - - return self._build_datetime(parts) - def _parse_token(self, token, value, parts): if token == "YYYY": diff --git a/tests/parser_tests.py b/tests/parser_tests.py index f1cd3229f..c1679e245 100644 --- a/tests/parser_tests.py +++ b/tests/parser_tests.py @@ -647,33 +647,91 @@ def test_isoformat(self): self.assertEqual(self.parser.parse_iso(dt.isoformat()), dt) - def test_parse_with_extra_words_at_start_and_end(self): + def test_parse_with_extra_words_at_start_and_end_invalid(self): + # The tuple's second entry is None if the datetime string + # is valid when a format string is passed in input_format_pairs = [ ("blah2016", "YYYY"), ("blah2016blah", "YYYY"), + ("blah 2016 blah", None), ("2016blah", "YYYY"), ("2016-05blah", "YYYY-MM"), ("2016-05-16blah", "YYYY-MM-DD"), - ("2016-05T04:05:06.789120blah", "YYYY-MM-DDThh:mm:ss.S"), - ("2016-05T04:05:06.789120ZblahZ", "YYYY-MM-DDThh:mm:ss.SZ"), - ("2016-05T04:05:06.789120Zblah", "YYYY-MM-DDThh:mm:ss.SZ"), - ("2016-05T04:05:06.789120blahZ", "YYYY-MM-DDThh:mm:ss.SZ"), + ("2016-05-16T04:05:06.789120blah", "YYYY-MM-DDThh:mm:ss.S"), + ("2016-05-16T04:05:06.789120ZblahZ", "YYYY-MM-DDThh:mm:ss.SZ"), + ("2016-05-16T04:05:06.789120Zblah", "YYYY-MM-DDThh:mm:ss.SZ"), + ("2016-05-16T04:05:06.789120blahZ", "YYYY-MM-DDThh:mm:ss.SZ"), + ("Meet me at 2016-05-16T04:05:06.789120 on Tuesday", None), + ("Meet me at 2016-05-16 04:05:06.789120 on Tuesday", None), ] for pair in input_format_pairs: with self.assertRaises(ParserError): self.parser.parse_iso(pair[0]) - with self.assertRaises(ParserError): - self.parser.parse(pair[0], pair[1]) + if pair[1] is not None: + with self.assertRaises(ParserError): + self.parser.parse(pair[0], pair[1]) + def test_parse_with_extra_words_at_start_and_end_valid(self): # Spaces surrounding the parsable date are ok because we # allow the parsing of natural language input - self.assertEqual(self.parser.parse_iso("blah 2016 blah"), datetime(2016, 1, 1)) self.assertEqual( self.parser.parse("blah 2016 blah", "YYYY"), datetime(2016, 1, 1) ) + self.assertEqual( + self.parser.parse( + "Meet me at 2016-05-16T04:05:06.789120 on Tuesday", + "YYYY-MM-DDThh:mm:ss.S", + ), + datetime(2016, 5, 16, 4, 5, 6, 789120), + ) + + self.assertEqual( + self.parser.parse( + "Meet me at 2016-05-16 04:05:06.789120 on Tuesday", + "YYYY-MM-DD hh:mm:ss.S", + ), + datetime(2016, 5, 16, 4, 5, 6, 789120), + ) + + def test_parse_with_leading_and_trailing_whitespace(self): + self.assertEqual(self.parser.parse_iso(" 2016"), datetime(2016, 1, 1)) + self.assertEqual(self.parser.parse(" 2016", "YYYY"), datetime(2016, 1, 1)) + + self.assertEqual(self.parser.parse_iso("2016 "), datetime(2016, 1, 1)) + self.assertEqual(self.parser.parse("2016 ", "YYYY"), datetime(2016, 1, 1)) + + self.assertEqual( + self.parser.parse_iso(" 2016 "), datetime(2016, 1, 1) + ) + self.assertEqual( + self.parser.parse(" 2016 ", "YYYY"), datetime(2016, 1, 1) + ) + + self.assertEqual( + self.parser.parse_iso(" 2016-05-16 04:05:06.789120 "), + datetime(2016, 5, 16, 4, 5, 6, 789120), + ) + self.assertEqual( + self.parser.parse( + " 2016-05-16 04:05:06.789120 ", "YYYY-MM-DD hh:mm:ss.S" + ), + datetime(2016, 5, 16, 4, 5, 6, 789120), + ) + + self.assertEqual( + self.parser.parse_iso(" 2016-05-16T04:05:06.789120 "), + datetime(2016, 5, 16, 4, 5, 6, 789120), + ) + self.assertEqual( + self.parser.parse( + " 2016-05-16T04:05:06.789120 ", "YYYY-MM-DDThh:mm:ss.S" + ), + datetime(2016, 5, 16, 4, 5, 6, 789120), + ) + class TzinfoParserTests(Chai): def setUp(self):