From e3126c77b24e6d6132be3faa169c3815bd011144 Mon Sep 17 00:00:00 2001
From: Jad Chaar <jad.chaar@gmail.com>
Date: Sun, 7 Jul 2019 23:54:03 -0400
Subject: [PATCH 1/2] Fixed a number of parsing issues with a couple of regex
 tweaks and additions

---
 arrow/parser.py       | 114 ++++++++++--------------------------------
 tests/parser_tests.py |  73 +++++++++++++--------------
 2 files changed, 60 insertions(+), 127 deletions(-)

diff --git a/arrow/parser.py b/arrow/parser.py
index cec107171..d04ecabb2 100644
--- a/arrow/parser.py
+++ b/arrow/parser.py
@@ -47,6 +47,7 @@ class DateTimeParser(object):
     _TWO_DIGIT_RE = re.compile(r"\d{2}")
     _TZ_RE = re.compile(r"[+\-]?\d{2}:?(\d{2})?|Z")
     _TZ_NAME_RE = re.compile(r"\w[\w+\-/]+")
+    _TIMESTAMP_RE = re.compile(r"\d+")
 
     _BASE_INPUT_RE_MAP = {
         "YYYY": _FOUR_DIGIT_RE,
@@ -63,7 +64,7 @@ class DateTimeParser(object):
         "m": _ONE_OR_TWO_DIGIT_RE,
         "ss": _TWO_DIGIT_RE,
         "s": _ONE_OR_TWO_DIGIT_RE,
-        "X": re.compile(r"\d+"),
+        "X": _TIMESTAMP_RE,
         "ZZZ": _TZ_NAME_RE,
         "ZZ": _TZ_RE,
         "Z": _TZ_RE,
@@ -101,9 +102,14 @@ def __init__(self, locale="en_us", cache_size=0):
                 self._generate_pattern_re
             )
 
+    # TODO: since we support more than ISO-8601, we should rename this function
     def parse_iso(self, string):
+        # TODO: account for more than 1 space like arrow.get("     2016")
+        # string = string.strip()
 
-        has_time = "T" in string or " " in string.strip()
+        has_space_divider = " " in string and len(string.strip().split(" ")) == 2
+
+        has_time = "T" in string or has_space_divider
         space_divider = " " in string.strip()
 
         has_tz = False
@@ -120,11 +126,11 @@ def parse_iso(self, string):
             time_parts = re.split("[+-]", time_string, 1)
             colon_count = time_parts[0].count(":")
 
-            # TODO "20160504T010203Z" parses incorectly, time part is HH only, due to Z changing len
+            # TODO "20160504T010203Z" parses incorrectly, time part is HH only, due to Z changing len
             is_basic_time_format = colon_count == 0
 
             has_tz = len(time_parts) > 1
-            has_hours = colon_count == 0 or len(time_string) == 2
+            has_hours = len(time_string) == 2
             has_minutes = colon_count == 1 or len(time_string) == 4
             has_seconds = colon_count == 2 or len(time_string) == 6
             has_subseconds = re.search("[.,]", time_parts[0])
@@ -146,7 +152,7 @@ def parse_iso(self, string):
 
         # IDEA reduced set of date formats for basic
 
-        # TODO: add tests for all the new formats
+        # TODO: add tests for all the new formats, especially basic format
         # required date formats to test against
         formats = [
             "YYYY-MM-DD",
@@ -227,6 +233,21 @@ def _generate_pattern_re(self, fmt):
             if i < len(escaped_data):
                 final_fmt_pattern += escaped_data[i][1:-1]
 
+        # Wrap final_fmt_pattern in a custom word boundary to strictly
+        # match the formatting pattern and filter out date and time formats
+        # that include junk such as: blah1998-09-12 blah, blah 1998-09-12blah,
+        # blah1998-09-12blah. The custom word boundary matches every character
+        # that is not a whitespace character to allow for searching for a date
+        # and time string in a natural language sentence. Therefore, searching
+        # for a string of the form YYYY-MM-DD in "blah 1998-09-12 blah" will
+        # work properly.
+        # Reference: https://stackoverflow.com/q/14232931/3820660
+        starting_word_boundary = r"(?<![\S])"
+        ending_word_boundary = r"(?![\S])"
+        final_fmt_pattern = r"{}{}Z?{}".format(
+            starting_word_boundary, final_fmt_pattern, ending_word_boundary
+        )
+
         return tokens, re.compile(final_fmt_pattern, flags=re.IGNORECASE)
 
     def parse(self, string, fmt, from_parse_iso=False):
@@ -244,89 +265,6 @@ def parse(self, string, fmt, from_parse_iso=False):
                 )
             )
 
-        if from_parse_iso:
-            # Accounts for cases such as "blahblah2016"
-            if match.start() != 0:
-                warnings.warn(
-                    "Parser loosely matched {fmt} on '{string}', in version "
-                    "0.15.0 this will raise a ParserError.".format(
-                        fmt=fmt, string=string
-                    ),
-                    category=GetParseWarning,
-                )
-                raise ParserError
-
-            # TODO arrow.get('2013-02-03 04:05:06.78912Z') is warning incorrectly due to this
-            # Accounts for cases such as "2016-05T04:05:06.78912blahZ"
-            if string[-1] == "Z" and match.end() != len(string) - 1:
-                warnings.warn(
-                    "Parser loosely matched {fmt} on '{string}', in version "
-                    "0.15.0 this will raise a ParserError.".format(
-                        fmt=fmt, string=string
-                    ),
-                    category=GetParseWarning,
-                )
-                raise ParserError
-
-            # Accounts for cases such as "2016-05T04:05:06.78912Zblah"
-            if string[-1] != "Z" and match.end() != len(string):
-                warnings.warn(
-                    "Parser loosely matched {fmt} on '{string}', in version "
-                    "0.15.0 this will raise a ParserError.".format(
-                        fmt=fmt, string=string
-                    ),
-                    category=GetParseWarning,
-                )
-                raise ParserError
-        else:
-            # fixes arrow.get("15/01/2019", ["D/M/YY","D/M/YYYY"]) => <Arrow [2020-01-15T00:00:00+00:00]>
-            # FIXME arrow.get("Call 01-02-03 on 79-01-01 12:05:10", "YY-MM-DD HH:mm:ss") warns incorrectly
-            # FIXME arrow.get("79-01-01 12:05:10", "YY-MM-DD HH:mm:ss") warns incorrectly
-            # IDEA test for whitespace on either side of match?
-            if "YY" in fmt_tokens and match.start != 0 or match.end() != len(string):
-                warnings.warn(
-                    "Parser loosely matched {fmt} on '{string}', in version "
-                    "0.15.0 this will raise a ParserError.".format(
-                        fmt=fmt, string=string
-                    ),
-                    category=GetParseWarning,
-                )
-                raise ParserError
-
-            if fmt == "YYYY":
-                # accounts for arrow.get('05/02/2017', ['YYYY', 'MM/DD/YYYY'])
-                if match.start() != 0 or match.end() != len(string):
-                    warnings.warn(
-                        "Parser loosely matched {fmt} on '{string}', in version "
-                        "0.15.0 this will raise a ParserError.".format(
-                            fmt=fmt, string=string
-                        ),
-                        category=GetParseWarning,
-                    )
-                    raise ParserError
-
-        # Fixes bug where "15/01/2019" matches to "D/M/YY"
-        # arrow.get("15/01/2019", ["D/M/YY", "D/M/YYYY"])
-        if "YY" in fmt_tokens and match.end() != len(string):
-            raise ParserError
-
-        # TODO: talk to Chris about these conditionals
-        # if string[-1] == "Z" and match.end() != len(string) - 1:
-        #     # TODO: add an exception message
-        #     raise ParserError
-        #
-        # if string[-1] != "Z" and match.end() != len(string):
-        #     # TODO: add an exception message
-        #     raise ParserError
-        #
-        # if match.start() != 0:
-        #     # TODO: add an exception message
-        #     raise ParserError
-
-        # if ("YY" in fmt_tokens or "YYYY" in fmt_tokens) and (match.end() != len(string) or match.start() != 0):
-        #     # TODO: add an exception message
-        #     raise ParserError
-
         parts = {}
         for token in fmt_tokens:
             if token == "Do":
diff --git a/tests/parser_tests.py b/tests/parser_tests.py
index deec9a09d..f1cd3229f 100644
--- a/tests/parser_tests.py
+++ b/tests/parser_tests.py
@@ -136,23 +136,6 @@ def test_YY_and_YYYY_format_list(self):
             datetime(2019, 1, 15, 4, 5, 6, 789120, tzinfo=tz.tzutc()),
         )
 
-    def test_long_year_input(self):
-
-        # TODO: ask Chris if this should throw a ParserError
-        # Pendulum does not throw an error
-        self.assertEqual(
-            self.parser.parse("09 January 123456789101112", "DD MMMM YYYY"),
-            datetime(1234, 1, 9),
-        )
-
-        # Pendulum throws an error
-        with self.assertRaises(ParserError):
-            self.parser.parse("123456789101112 09 January", "YYYY DD MMMM")
-
-        # Pendulum throws an error
-        with self.assertRaises(ParserError):
-            self.parser.parse("68096653015/01/19", "YY/M/DD")
-
 
 class DateTimeParserParseTests(Chai):
     def setUp(self):
@@ -357,6 +340,17 @@ def test_parse_subsecond_rounding(self):
         self.assertEqual(self.parser.parse(string, format), self.expected)
         self.assertEqual(self.parser.parse_iso(string), self.expected)
 
+    # Regression tests for issue #560
+    def test_parse_long_year(self):
+        with self.assertRaises(ParserError):
+            self.parser.parse("09 January 123456789101112", "DD MMMM YYYY"),
+
+        with self.assertRaises(ParserError):
+            self.parser.parse("123456789101112 09 January", "YYYY DD MMMM")
+
+        with self.assertRaises(ParserError):
+            self.parser.parse("68096653015/01/19", "YY/M/DD")
+
 
 class DateTimeParserRegexTests(Chai):
     def setUp(self):
@@ -653,31 +647,32 @@ def test_isoformat(self):
 
         self.assertEqual(self.parser.parse_iso(dt.isoformat()), dt)
 
-    def test_iso8601_string_with_extra_words_at_start_and_end(self):
+    def test_parse_with_extra_words_at_start_and_end(self):
+        input_format_pairs = [
+            ("blah2016", "YYYY"),
+            ("blah2016blah", "YYYY"),
+            ("2016blah", "YYYY"),
+            ("2016-05blah", "YYYY-MM"),
+            ("2016-05-16blah", "YYYY-MM-DD"),
+            ("2016-05T04:05:06.789120blah", "YYYY-MM-DDThh:mm:ss.S"),
+            ("2016-05T04:05:06.789120ZblahZ", "YYYY-MM-DDThh:mm:ss.SZ"),
+            ("2016-05T04:05:06.789120Zblah", "YYYY-MM-DDThh:mm:ss.SZ"),
+            ("2016-05T04:05:06.789120blahZ", "YYYY-MM-DDThh:mm:ss.SZ"),
+        ]
 
-        with self.assertRaises(ParserError):
-            self.parser.parse_iso("2016-05blah")
+        for pair in input_format_pairs:
+            with self.assertRaises(ParserError):
+                self.parser.parse_iso(pair[0])
 
-        with self.assertRaises(ParserError):
-            self.parser.parse_iso("2016-05-16blah")
+            with self.assertRaises(ParserError):
+                self.parser.parse(pair[0], pair[1])
 
-        with self.assertRaises(ParserError):
-            self.parser.parse_iso("2016-05T04:05:06.78912ZblahZ")
-
-        with self.assertRaises(ParserError):
-            self.parser.parse_iso("2016-05T04:05:06.78912Zblah")
-
-        with self.assertRaises(ParserError):
-            self.parser.parse_iso("2016-05T04:05:06.78912blahZ")
-
-        with self.assertRaises(ParserError):
-            self.parser.parse_iso("blah2016")
-
-        with self.assertRaises(ParserError):
-            self.parser.parse_iso("2016blah")
-
-        with self.assertRaises(ParserError):
-            self.parser.parse_iso("blah2016blah")
+        # Spaces surrounding the parsable date are ok because we
+        # allow the parsing of natural language input
+        self.assertEqual(self.parser.parse_iso("blah 2016 blah"), datetime(2016, 1, 1))
+        self.assertEqual(
+            self.parser.parse("blah 2016 blah", "YYYY"), datetime(2016, 1, 1)
+        )
 
 
 class TzinfoParserTests(Chai):

From f77e6984ffe5b088fa30b8226d6590bbdc0bc1ba Mon Sep 17 00:00:00 2001
From: Jad Chaar <jad.chaar@gmail.com>
Date: Mon, 8 Jul 2019 01:09:55 -0400
Subject: [PATCH 2/2] Added unit tests and added cleanup/validation to
 datetimte string in parse_iso

---
 arrow/parser.py       | 129 ++++++++++++++++++++++--------------------
 tests/parser_tests.py |  74 +++++++++++++++++++++---
 2 files changed, 135 insertions(+), 68 deletions(-)

diff --git a/arrow/parser.py b/arrow/parser.py
index d04ecabb2..d9ac5bee3 100644
--- a/arrow/parser.py
+++ b/arrow/parser.py
@@ -103,22 +103,51 @@ def __init__(self, locale="en_us", cache_size=0):
             )
 
     # TODO: since we support more than ISO-8601, we should rename this function
-    def parse_iso(self, string):
-        # TODO: account for more than 1 space like arrow.get("     2016")
-        # string = string.strip()
+    def parse_iso(self, datetime_string):
+        # TODO: talk to Chris about this => the below space divider checks
+        # are not really necessary thanks to the new regex changes, but I think
+        # it is good to include them to provide better error messages.
 
-        has_space_divider = " " in string and len(string.strip().split(" ")) == 2
+        # strip leading and trailing whitespace
+        datetime_string = datetime_string.strip()
 
-        has_time = "T" in string or has_space_divider
-        space_divider = " " in string.strip()
+        has_space_divider = " " in datetime_string
 
+        num_space_dividers = len(datetime_string.split(" "))
+        if has_space_divider and num_space_dividers != 2:
+            raise ParserError(
+                "Expected 1 space divider, but was given {}. Try passing in a format string to resolve this.".format(
+                    num_space_dividers
+                )
+            )
+
+        has_time = has_space_divider or "T" in datetime_string
         has_tz = False
 
+        # TODO: add tests for all the new formats, especially basic format
+        # required date formats to test against
+        formats = [
+            "YYYY-MM-DD",
+            "YYYY-M-DD",
+            "YYYY-M-D",
+            "YYYY/MM/DD",
+            "YYYY/M/DD",
+            "YYYY/M/D",
+            "YYYY.MM.DD",
+            "YYYY.M.DD",
+            "YYYY.M.D",
+            "YYYYMMDD",
+            "YYYY-MM",
+            "YYYY/MM",
+            "YYYY.MM",
+            "YYYY",
+        ]
+
         if has_time:
-            if space_divider:
-                date_string, time_string = string.split(" ", 1)
+            if has_space_divider:
+                date_string, time_string = datetime_string.split(" ", 1)
             else:
-                date_string, time_string = string.split("T", 1)
+                date_string, time_string = datetime_string.split("T", 1)
 
             # TODO: understand why we are not accounting for Z directly
             # currently Z is ignored entirely but fromdatetime defaults to UTC, see arrow.py L196
@@ -150,40 +179,45 @@ def parse_iso(self, string):
             if is_basic_time_format:
                 time_string = time_string.replace(":", "")
 
-        # IDEA reduced set of date formats for basic
+            if has_space_divider:
+                formats = ["{} {}".format(f, time_string) for f in formats]
+            else:
+                formats = ["{}T{}".format(f, time_string) for f in formats]
 
-        # TODO: add tests for all the new formats, especially basic format
-        # required date formats to test against
-        formats = [
-            "YYYY-MM-DD",
-            "YYYY-M-DD",
-            "YYYY-M-D",
-            "YYYY/MM/DD",
-            "YYYY/M/DD",
-            "YYYY/M/D",
-            "YYYY.MM.DD",
-            "YYYY.M.DD",
-            "YYYY.M.D",
-            "YYYYMMDD",
-            "YYYY-MM",
-            "YYYY/MM",
-            "YYYY.MM",
-            "YYYY",
-            # "YY", this is not a good format to try by default?
-        ]
-
-        if has_time:
-            formats = ["{}T{}".format(f, time_string) for f in formats]
+        # TODO: reduce set of date formats for basic?
 
         if has_time and has_tz:
             # Add "Z" to format strings to indicate to _parse_tokens
             # that a timezone needs to be parsed
             formats = ["{}Z".format(f) for f in formats]
 
-        if space_divider:
-            formats = [item.replace("T", " ", 1) for item in formats]
+        # TODO: make thrown error messages less cryptic and more informative
+        return self._parse_multiformat(datetime_string, formats, True)
+
+    def parse(self, datetime_string, fmt, from_parse_iso=False):
+
+        if isinstance(fmt, list):
+            return self._parse_multiformat(datetime_string, fmt)
+
+        fmt_tokens, fmt_pattern_re = self._generate_pattern_re(fmt)
 
-        return self._parse_multiformat(string, formats, True)
+        match = fmt_pattern_re.search(datetime_string)
+        if match is None:
+            raise ParserError(
+                "Failed to match '{}' when parsing '{}'".format(
+                    fmt_pattern_re.pattern, datetime_string
+                )
+            )
+
+        parts = {}
+        for token in fmt_tokens:
+            if token == "Do":
+                value = match.group("value")
+            else:
+                value = match.group(token)
+            self._parse_token(token, value, parts)
+
+        return self._build_datetime(parts)
 
     def _generate_pattern_re(self, fmt):
 
@@ -250,31 +284,6 @@ def _generate_pattern_re(self, fmt):
 
         return tokens, re.compile(final_fmt_pattern, flags=re.IGNORECASE)
 
-    def parse(self, string, fmt, from_parse_iso=False):
-
-        if isinstance(fmt, list):
-            return self._parse_multiformat(string, fmt)
-
-        fmt_tokens, fmt_pattern_re = self._generate_pattern_re(fmt)
-
-        match = fmt_pattern_re.search(string)
-        if match is None:
-            raise ParserError(
-                "Failed to match '{}' when parsing '{}'".format(
-                    fmt_pattern_re.pattern, string
-                )
-            )
-
-        parts = {}
-        for token in fmt_tokens:
-            if token == "Do":
-                value = match.group("value")
-            else:
-                value = match.group(token)
-            self._parse_token(token, value, parts)
-
-        return self._build_datetime(parts)
-
     def _parse_token(self, token, value, parts):
 
         if token == "YYYY":
diff --git a/tests/parser_tests.py b/tests/parser_tests.py
index f1cd3229f..c1679e245 100644
--- a/tests/parser_tests.py
+++ b/tests/parser_tests.py
@@ -647,33 +647,91 @@ def test_isoformat(self):
 
         self.assertEqual(self.parser.parse_iso(dt.isoformat()), dt)
 
-    def test_parse_with_extra_words_at_start_and_end(self):
+    def test_parse_with_extra_words_at_start_and_end_invalid(self):
+        # The tuple's second entry is None if the datetime string
+        # is valid when a format string is passed in
         input_format_pairs = [
             ("blah2016", "YYYY"),
             ("blah2016blah", "YYYY"),
+            ("blah 2016 blah", None),
             ("2016blah", "YYYY"),
             ("2016-05blah", "YYYY-MM"),
             ("2016-05-16blah", "YYYY-MM-DD"),
-            ("2016-05T04:05:06.789120blah", "YYYY-MM-DDThh:mm:ss.S"),
-            ("2016-05T04:05:06.789120ZblahZ", "YYYY-MM-DDThh:mm:ss.SZ"),
-            ("2016-05T04:05:06.789120Zblah", "YYYY-MM-DDThh:mm:ss.SZ"),
-            ("2016-05T04:05:06.789120blahZ", "YYYY-MM-DDThh:mm:ss.SZ"),
+            ("2016-05-16T04:05:06.789120blah", "YYYY-MM-DDThh:mm:ss.S"),
+            ("2016-05-16T04:05:06.789120ZblahZ", "YYYY-MM-DDThh:mm:ss.SZ"),
+            ("2016-05-16T04:05:06.789120Zblah", "YYYY-MM-DDThh:mm:ss.SZ"),
+            ("2016-05-16T04:05:06.789120blahZ", "YYYY-MM-DDThh:mm:ss.SZ"),
+            ("Meet me at 2016-05-16T04:05:06.789120 on Tuesday", None),
+            ("Meet me at 2016-05-16 04:05:06.789120 on Tuesday", None),
         ]
 
         for pair in input_format_pairs:
             with self.assertRaises(ParserError):
                 self.parser.parse_iso(pair[0])
 
-            with self.assertRaises(ParserError):
-                self.parser.parse(pair[0], pair[1])
+            if pair[1] is not None:
+                with self.assertRaises(ParserError):
+                    self.parser.parse(pair[0], pair[1])
 
+    def test_parse_with_extra_words_at_start_and_end_valid(self):
         # Spaces surrounding the parsable date are ok because we
         # allow the parsing of natural language input
-        self.assertEqual(self.parser.parse_iso("blah 2016 blah"), datetime(2016, 1, 1))
         self.assertEqual(
             self.parser.parse("blah 2016 blah", "YYYY"), datetime(2016, 1, 1)
         )
 
+        self.assertEqual(
+            self.parser.parse(
+                "Meet me at 2016-05-16T04:05:06.789120 on Tuesday",
+                "YYYY-MM-DDThh:mm:ss.S",
+            ),
+            datetime(2016, 5, 16, 4, 5, 6, 789120),
+        )
+
+        self.assertEqual(
+            self.parser.parse(
+                "Meet me at 2016-05-16 04:05:06.789120 on Tuesday",
+                "YYYY-MM-DD hh:mm:ss.S",
+            ),
+            datetime(2016, 5, 16, 4, 5, 6, 789120),
+        )
+
+    def test_parse_with_leading_and_trailing_whitespace(self):
+        self.assertEqual(self.parser.parse_iso("      2016"), datetime(2016, 1, 1))
+        self.assertEqual(self.parser.parse("      2016", "YYYY"), datetime(2016, 1, 1))
+
+        self.assertEqual(self.parser.parse_iso("2016      "), datetime(2016, 1, 1))
+        self.assertEqual(self.parser.parse("2016      ", "YYYY"), datetime(2016, 1, 1))
+
+        self.assertEqual(
+            self.parser.parse_iso("      2016      "), datetime(2016, 1, 1)
+        )
+        self.assertEqual(
+            self.parser.parse("      2016      ", "YYYY"), datetime(2016, 1, 1)
+        )
+
+        self.assertEqual(
+            self.parser.parse_iso("      2016-05-16 04:05:06.789120      "),
+            datetime(2016, 5, 16, 4, 5, 6, 789120),
+        )
+        self.assertEqual(
+            self.parser.parse(
+                "      2016-05-16 04:05:06.789120      ", "YYYY-MM-DD hh:mm:ss.S"
+            ),
+            datetime(2016, 5, 16, 4, 5, 6, 789120),
+        )
+
+        self.assertEqual(
+            self.parser.parse_iso("      2016-05-16T04:05:06.789120      "),
+            datetime(2016, 5, 16, 4, 5, 6, 789120),
+        )
+        self.assertEqual(
+            self.parser.parse(
+                "      2016-05-16T04:05:06.789120      ", "YYYY-MM-DDThh:mm:ss.S"
+            ),
+            datetime(2016, 5, 16, 4, 5, 6, 789120),
+        )
+
 
 class TzinfoParserTests(Chai):
     def setUp(self):