Merge pull request #3898 from Zac-HD/more-rewrites

Rewrite regex filters on `text()` and `binary()`
HypothesisWorks · Feb 27, 2024 · c27727f · c27727f
2 parents 1bc9b88 + 6816a6c
commit c27727f
Showing 7 changed files with 188 additions and 55 deletions.
diff --git a/.gitignore b/.gitignore
@@ -13,6 +13,7 @@
 .runtimes
 /hypothesis-python/branch-check
 /pythonpython3.*
+/pythonpypy3.*
 .pyodide-xbuildenv
 
 # python
@@ -104,3 +105,4 @@ __pycache__
 HypothesisWorks.github.io.iml
 jekyll.log
 /website/output/
+/t.py
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,6 @@
+RELEASE_TYPE: patch
+
+This patch implements filter-rewriting for :func:`~hypothesis.strategies.text`
+and :func:`~hypothesis.strategies.binary` with the :meth:`~re.Pattern.search`,
+:meth:`~re.Pattern.match`, or :meth:`~re.Pattern.fullmatch` method of a
+:func:`re.compile`\ d regex.
diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/core.py b/hypothesis-python/src/hypothesis/strategies/_internal/core.py
@@ -134,7 +134,7 @@
     one_of,
 )
 from hypothesis.strategies._internal.strings import (
-    FixedSizeBytes,
+    BytesStrategy,
     OneCharStringStrategy,
     TextStrategy,
 )
@@ -963,11 +963,7 @@ def binary(
     values.
     """
     check_valid_sizes(min_size, max_size)
-    if min_size == max_size:
-        return FixedSizeBytes(min_size)
-    return lists(
-        integers(min_value=0, max_value=255), min_size=min_size, max_size=max_size
-    ).map(bytes)
+    return BytesStrategy(min_size, max_size)
 
 
 @cacheable

diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/strings.py b/hypothesis-python/src/hypothesis/strategies/_internal/strings.py
@@ -11,14 +11,17 @@
 import copy
 import re
 import warnings
-from functools import lru_cache
+from functools import lru_cache, partial
 
 from hypothesis.errors import HypothesisWarning, InvalidArgument
 from hypothesis.internal import charmap
+from hypothesis.internal.filtering import max_len, min_len
 from hypothesis.internal.intervalsets import IntervalSet
 from hypothesis.strategies._internal.collections import ListStrategy
 from hypothesis.strategies._internal.lazy import unwrap_strategies
+from hypothesis.strategies._internal.numbers import IntegersStrategy
 from hypothesis.strategies._internal.strategies import SearchStrategy
+from hypothesis.vendor.pretty import pretty
 
 
 class OneCharStringStrategy(SearchStrategy):
@@ -76,6 +79,33 @@ def do_draw(self, data):
         return data.draw_string(self.intervals, min_size=1, max_size=1)
 
 
+# Method names looked up on both ``str`` and ``bytes`` (via ``getattr``) to
+# build each strategy's ``_nonempty_filters``: used as a filter condition,
+# these are treated as implying *only* that the value is nonempty.
+_nonempty_names = (
+    "capitalize",
+    "expandtabs",
+    "join",
+    "lower",
+    "rsplit",
+    "split",
+    "splitlines",
+    "swapcase",
+    "title",
+    "upper",
+)
+# Method names looked up on both ``str`` and ``bytes`` to build each strategy's
+# ``_nonempty_and_content_filters``: these imply the value is nonempty *and*
+# has specific contents, so the condition itself must still be applied.
+_nonempty_and_content_names = (
+    "islower",
+    "isupper",
+    "isalnum",
+    "isalpha",
+    "isascii",
+    "isdigit",
+    "isspace",
+    "istitle",
+    "lstrip",
+    "rstrip",
+    "strip",
+)
+
+
 class TextStrategy(ListStrategy):
     def do_draw(self, data):
         # if our element strategy is OneCharStringStrategy, we can skip the
@@ -104,44 +134,17 @@ def __repr__(self):
     _nonempty_filters = (
         *ListStrategy._nonempty_filters,
         str,
-        str.capitalize,
         str.casefold,
         str.encode,
-        str.expandtabs,
-        str.join,
-        str.lower,
-        str.rsplit,
-        str.split,
-        str.splitlines,
-        str.swapcase,
-        str.title,
-        str.upper,
+        *(getattr(str, n) for n in _nonempty_names),
     )
     _nonempty_and_content_filters = (
-        str.isidentifier,
-        str.islower,
-        str.isupper,
-        str.isalnum,
-        str.isalpha,
-        str.isascii,
         str.isdecimal,
-        str.isdigit,
         str.isnumeric,
-        str.isspace,
-        str.istitle,
-        str.lstrip,
-        str.rstrip,
-        str.strip,
+        *(getattr(str, n) for n in _nonempty_and_content_names),
     )
 
     def filter(self, condition):
-        if condition in (str.lower, str.title, str.upper):
-            warnings.warn(
-                f"You applied str.{condition.__name__} as a filter, but this allows "
-                f"all nonempty strings!  Did you mean str.is{condition.__name__}?",
-                HypothesisWarning,
-                stacklevel=2,
-            )
         elems = unwrap_strategies(self.element_strategy)
         if (
             condition is str.isidentifier
@@ -163,17 +166,76 @@ def filter(self, condition):
                 ),
                 # Filter to ensure that NFKC normalization keeps working in future
             ).filter(str.isidentifier)
+        if (new := _string_filter_rewrite(self, str, condition)) is not None:
+            return new
+        return super().filter(condition)
 
-        # We use ListStrategy filter logic for the conditions that *only* imply
-        # the string is nonempty.  Here, we increment the min_size but still apply
-        # the filter for conditions that imply nonempty *and specific contents*.
-        if condition in self._nonempty_and_content_filters:
-            assert self.max_size >= 1, "Always-empty is special cased in st.text()"
-            self = copy.copy(self)
-            self.min_size = max(1, self.min_size)
-            return ListStrategy.filter(self, condition)
 
-        return super().filter(condition)
+def _string_filter_rewrite(self, kind, condition):
+    """Try to rewrite ``self.filter(condition)`` into a more efficient strategy.
+
+    ``self`` is the text or bytes strategy being filtered, ``kind`` is the
+    matching type (``str`` or ``bytes``), and ``condition`` is the predicate
+    passed to ``.filter()``.  This helper is meant to be called directly from
+    a strategy's ``filter`` method, which the warning ``stacklevel``s assume.
+
+    Returns the replacement strategy, or ``None`` when no rewrite applies and
+    the caller should fall back to its inherited ``filter`` implementation.
+    """
+    if condition in (kind.lower, kind.title, kind.upper):
+        k = kind.__name__
+        warnings.warn(
+            f"You applied {k}.{condition.__name__} as a filter, but this allows "
+            f"all nonempty strings!  Did you mean {k}.is{condition.__name__}?",
+            HypothesisWarning,
+            # Skip this helper's frame as well as the calling filter() method,
+            # consistent with the other warnings below.
+            stacklevel=3,
+        )
+
+    elems = unwrap_strategies(self.element_strategy)
+    if (
+        (kind is bytes or isinstance(elems, OneCharStringStrategy))
+        and isinstance(pattern := getattr(condition, "__self__", None), re.Pattern)
+        and isinstance(pattern.pattern, kind)
+    ):
+        from hypothesis.strategies._internal.regex import regex_strategy
+
+        if condition.__name__ == "match":
+            # Replace with an easier-to-handle equivalent `search` condition.
+            # `\A` anchors at the very start of the string even under
+            # re.MULTILINE (where `^` also matches after each newline), and the
+            # non-capturing group makes the anchor apply to every branch of a
+            # top-level alternation such as `a|b`.
+            prefix, suffix = (r"\A(?:", ")") if kind is str else (rb"\A(?:", b")")
+            try:
+                anchored = re.compile(
+                    prefix + pattern.pattern + suffix, flags=pattern.flags
+                )
+            except re.error:
+                # Wrapping can make an otherwise valid pattern invalid (e.g. a
+                # leading global inline flag like `(?i)` is no longer at the
+                # start).  Leave `condition` untouched so that we fall through
+                # to the ordinary, unrewritten filter below.
+                pass
+            else:
+                pattern = anchored
+                condition = pattern.search
+
+        if condition.__name__ in ("search", "findall", "fullmatch"):
+            rewritten = regex_strategy(
+                pattern,
+                fullmatch=condition.__name__ == "fullmatch",
+                alphabet=self.element_strategy if kind is str else None,
+            )
+            # The size bounds of the original strategy are not passed to
+            # regex_strategy(), so re-apply whichever ones actually constrain.
+            if self.min_size > 0:
+                rewritten = rewritten.filter(partial(min_len, self.min_size))
+            if self.max_size < float("inf"):
+                rewritten = rewritten.filter(partial(max_len, self.max_size))
+            return rewritten
+        elif condition.__name__ in ("finditer", "scanner"):
+            # PyPy implements `finditer` as an alias to their `scanner` method
+            warnings.warn(
+                f"You applied {pretty(condition)} as a filter, but this allows "
+                "any string at all!  Did you mean .findall ?",
+                HypothesisWarning,
+                stacklevel=3,
+            )
+            return self
+        elif condition.__name__ == "split":
+            # Pattern.split() always returns a list with at least one item
+            # (e.g. [""] for empty input), so the result is truthy for *every*
+            # input and the empty string must not be excluded.
+            warnings.warn(
+                f"You applied {pretty(condition)} as a filter, but this allows "
+                "any string at all!  Did you mean .search ?",
+                HypothesisWarning,
+                stacklevel=3,
+            )
+            return self
+
+    # We use ListStrategy filter logic for the conditions that *only* imply
+    # the string is nonempty.  Here, we increment the min_size but still apply
+    # the filter for conditions that imply nonempty *and specific contents*.
+    if condition in self._nonempty_and_content_filters and self.max_size >= 1:
+        self = copy.copy(self)
+        self.min_size = max(1, self.min_size)
+        return ListStrategy.filter(self, condition)
+
+    return None
 
 
 # Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
@@ -229,9 +291,26 @@ def _identifier_characters():
     return id_start, id_continue
 
 
-class FixedSizeBytes(SearchStrategy):
-    def __init__(self, size):
-        self.size = size
+class BytesStrategy(ListStrategy):
+    """Strategy for ``bytes`` values, built on a list of integers in 0..255.
+
+    Filter conditions are first offered to ``_string_filter_rewrite``; if that
+    returns ``None`` the inherited ``ListStrategy.filter`` is used instead.
+    """
+
+    def __init__(self, min_size, max_size):
+        super().__init__(IntegersStrategy(0, 255), min_size=min_size, max_size=max_size)
 
     def do_draw(self, data):
-        return bytes(data.draw_bytes(self.size))
+        # TODO: refactor the underlying provider to support variable-length bytes
+        # Fixed-size case: draw exactly that many bytes in one call.
+        if self.min_size == self.max_size:
+            return bytes(data.draw_bytes(self.min_size))
+        # Variable-size case: draw a list of ints via ListStrategy and convert.
+        return bytes(super().do_draw(data))
+
+    # Conditions treated as implying only that the value is nonempty.
+    _nonempty_filters = (
+        *ListStrategy._nonempty_filters,
+        bytes,
+        *(getattr(bytes, n) for n in _nonempty_names),
+    )
+    # Conditions treated as implying nonempty *and* specific contents.
+    _nonempty_and_content_filters = (
+        *(getattr(bytes, n) for n in _nonempty_and_content_names),
+    )
+
+    def filter(self, condition):
+        # Prefer a rewritten strategy when one is available for this condition.
+        if (new := _string_filter_rewrite(self, bytes, condition)) is not None:
+            return new
+        return super().filter(condition)
diff --git a/hypothesis-python/src/hypothesis/vendor/pretty.py b/hypothesis-python/src/hypothesis/vendor/pretty.py
@@ -747,7 +747,7 @@ def _repr_float_counting_nans(obj, p, cycle):
     type: _type_pprint,
     types.FunctionType: _function_pprint,
     types.BuiltinFunctionType: _function_pprint,
-    types.MethodType: _repr_pprint,
+    types.MethodType: _function_pprint,
     datetime.datetime: _repr_pprint,
     datetime.timedelta: _repr_pprint,
     BaseException: _exception_pprint,

diff --git a/hypothesis-python/tests/cover/test_filter_rewriting.py b/hypothesis-python/tests/cover/test_filter_rewriting.py
@@ -11,6 +11,7 @@
 import decimal
 import math
 import operator
+import re
 from fractions import Fraction
 from functools import partial
 from sys import float_info
@@ -31,6 +32,8 @@
 from tests.common.debug import check_can_generate_examples
 from tests.common.utils import fails_with
 
+A_FEW = 15  # speed up massively-parametrized tests
+
 
 @pytest.mark.parametrize(
     "strategy, predicate, start, end",
@@ -84,6 +87,7 @@
     ],
     ids=get_pretty_function_description,
 )
+@settings(max_examples=A_FEW)
 @given(data=st.data())
 def test_filter_rewriting_ints(data, strategy, predicate, start, end):
     s = strategy.filter(predicate)
@@ -147,6 +151,7 @@ def test_filter_rewriting_ints(data, strategy, predicate, start, end):
     ],
     ids=get_pretty_function_description,
 )
+@settings(max_examples=A_FEW)
 @given(data=st.data())
 def test_filter_rewriting_floats(data, strategy, predicate, min_value, max_value):
     s = strategy.filter(predicate)
@@ -405,6 +410,7 @@ def test_filter_floats_can_skip_subnormals(op, attr, value, expected):
     ],
     ids=get_pretty_function_description,
 )
+@settings(max_examples=A_FEW)
 @given(data=st.data())
 def test_filter_rewriting_text_partial_len(data, strategy, predicate, start, end):
     s = strategy.filter(predicate)
@@ -477,7 +483,7 @@ def test_can_rewrite_multiple_length_filters_if_not_lambdas(data):
     ],
     ids=get_pretty_function_description,
 )
-@settings(max_examples=15)
+@settings(max_examples=A_FEW)
 @given(data=st.data())
 def test_filter_rewriting_text_lambda_len(data, strategy, predicate, start, end):
     s = strategy.filter(predicate)
@@ -541,6 +547,7 @@ def test_filter_rewriting_text_lambda_len(data, strategy, predicate, start, end)
     ],
     ids=get_pretty_function_description,
 )
+@settings(max_examples=A_FEW)
 @given(data=st.data())
 def test_filter_rewriting_lambda_len_unique_elements(
     data, strategy, predicate, start, end
@@ -573,3 +580,39 @@ def test_does_not_rewrite_unsatisfiable_len_filter(predicate):
     # Rewriting to nothing() would correctly express the constraint.  However
     # we don't want _only rewritable strategies_ to work in e.g. one_of, so:
     assert not strategy.is_empty
+
+
+# Every listed method of a compiled pattern is tried against every strategy.
+@pytest.mark.parametrize(
+    "method", ["match", "search", "findall", "fullmatch", "finditer", "split"]
+)
+@pytest.mark.parametrize(
+    "strategy, pattern",
+    [
+        (st.text(), "ab+c"),
+        (st.text(alphabet="abcdef"), "ab+c"),
+        (st.text(min_size=5, max_size=10), "ab+c"),
+        (st.binary(), b"ab+c"),
+        (st.binary(min_size=5, max_size=10), b"ab+c"),
+    ],
+    ids=repr,
+)
+@settings(max_examples=A_FEW)
+@given(data=st.data())
+def test_regex_filter_rewriting(data, strategy, pattern, method):
+    # This would raise a HealthCheck without rewriting, so checking that
+    # we can draw a valid value is sufficient.
+    predicate = getattr(re.compile(pattern), method)
+    s = strategy.filter(predicate)
+    if method in ("finditer", "split"):
+        # These methods do not test for a match, so using them as a filter is
+        # expected to emit a HypothesisWarning suggesting an alternative.
+        msg = r"You applied re.compile\(.+?\).\w+ as a filter, but this allows"
+        with pytest.warns(HypothesisWarning, match=msg):
+            value = data.draw(s)
+    else:
+        value = data.draw(s)
+    # Whatever was drawn must satisfy the original, unrewritten predicate.
+    assert predicate(value)
+
+
+# `Pattern.sub` requires a replacement argument in addition to the string, so
+# calling it with only the drawn value is expected to raise TypeError.
+@fails_with(TypeError)
+@given(st.text().filter(re.compile("abc").sub))
+def test_error_on_method_which_requires_multiple_args(_):
+    pass