diff --git a/.gitignore b/.gitignore
index 9a0c7886fc..5a404bcaaa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@
 .runtimes
 /hypothesis-python/branch-check
 /pythonpython3.*
+/pythonpypy3.*
 .pyodide-xbuildenv
 
 # python
@@ -104,3 +105,4 @@ __pycache__
 HypothesisWorks.github.io.iml
 jekyll.log
 /website/output/
+/t.py
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
new file mode 100644
index 0000000000..4a15bf6da8
--- /dev/null
+++ b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,6 @@
+RELEASE_TYPE: patch
+
+This patch implements filter-rewriting for :func:`~hypothesis.strategies.text`
+and :func:`~hypothesis.strategies.binary` with the :meth:`~re.Pattern.search`,
+:meth:`~re.Pattern.match`, or :meth:`~re.Pattern.fullmatch` method of a
+:func:`re.compile`\ d regex.
diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/core.py b/hypothesis-python/src/hypothesis/strategies/_internal/core.py
index 1efe75caec..83edacbfef 100644
--- a/hypothesis-python/src/hypothesis/strategies/_internal/core.py
+++ b/hypothesis-python/src/hypothesis/strategies/_internal/core.py
@@ -134,7 +134,7 @@
     one_of,
 )
 from hypothesis.strategies._internal.strings import (
-    FixedSizeBytes,
+    BytesStrategy,
     OneCharStringStrategy,
     TextStrategy,
 )
@@ -963,11 +963,7 @@ def binary(
     values.
     """
     check_valid_sizes(min_size, max_size)
-    if min_size == max_size:
-        return FixedSizeBytes(min_size)
-    return lists(
-        integers(min_value=0, max_value=255), min_size=min_size, max_size=max_size
-    ).map(bytes)
+    return BytesStrategy(min_size, max_size)
 
 
 @cacheable
diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/strings.py b/hypothesis-python/src/hypothesis/strategies/_internal/strings.py
index 021ce3c6e6..8df955e632 100644
--- a/hypothesis-python/src/hypothesis/strategies/_internal/strings.py
+++ b/hypothesis-python/src/hypothesis/strategies/_internal/strings.py
@@ -11,14 +11,17 @@
 import copy
 import re
 import warnings
-from functools import lru_cache
+from functools import lru_cache, partial
 
 from hypothesis.errors import HypothesisWarning, InvalidArgument
 from hypothesis.internal import charmap
+from hypothesis.internal.filtering import max_len, min_len
 from hypothesis.internal.intervalsets import IntervalSet
 from hypothesis.strategies._internal.collections import ListStrategy
 from hypothesis.strategies._internal.lazy import unwrap_strategies
+from hypothesis.strategies._internal.numbers import IntegersStrategy
 from hypothesis.strategies._internal.strategies import SearchStrategy
+from hypothesis.vendor.pretty import pretty
 
 
 class OneCharStringStrategy(SearchStrategy):
@@ -76,6 +79,33 @@ def do_draw(self, data):
         return data.draw_string(self.intervals, min_size=1, max_size=1)
 
 
+_nonempty_names = (
+    "capitalize",
+    "expandtabs",
+    "join",
+    "lower",
+    "rsplit",
+    "split",
+    "splitlines",
+    "swapcase",
+    "title",
+    "upper",
+)
+_nonempty_and_content_names = (
+    "islower",
+    "isupper",
+    "isalnum",
+    "isalpha",
+    "isascii",
+    "isdigit",
+    "isspace",
+    "istitle",
+    "lstrip",
+    "rstrip",
+    "strip",
+)
+
+
 class TextStrategy(ListStrategy):
     def do_draw(self, data):
         # if our element strategy is OneCharStringStrategy, we can skip the
@@ -104,44 +134,17 @@ def __repr__(self):
     _nonempty_filters = (
         *ListStrategy._nonempty_filters,
         str,
-        str.capitalize,
         str.casefold,
         str.encode,
-        str.expandtabs,
-        str.join,
-        str.lower,
-        str.rsplit,
-        str.split,
-        str.splitlines,
-        str.swapcase,
-        str.title,
-        str.upper,
+        *(getattr(str, n) for n in _nonempty_names),
     )
     _nonempty_and_content_filters = (
-        str.isidentifier,
-        str.islower,
-        str.isupper,
-        str.isalnum,
-        str.isalpha,
-        str.isascii,
         str.isdecimal,
-        str.isdigit,
         str.isnumeric,
-        str.isspace,
-        str.istitle,
-        str.lstrip,
-        str.rstrip,
-        str.strip,
+        *(getattr(str, n) for n in _nonempty_and_content_names),
     )
 
     def filter(self, condition):
-        if condition in (str.lower, str.title, str.upper):
-            warnings.warn(
-                f"You applied str.{condition.__name__} as a filter, but this allows "
-                f"all nonempty strings! Did you mean str.is{condition.__name__}?",
-                HypothesisWarning,
-                stacklevel=2,
-            )
         elems = unwrap_strategies(self.element_strategy)
         if (
             condition is str.isidentifier
@@ -163,17 +166,89 @@ def filter(self, condition):
                 ),
                 # Filter to ensure that NFKC normalization keeps working in future
             ).filter(str.isidentifier)
+        if (new := _string_filter_rewrite(self, str, condition)) is not None:
+            return new
+        return super().filter(condition)
 
-        # We use ListStrategy filter logic for the conditions that *only* imply
-        # the string is nonempty. Here, we increment the min_size but still apply
-        # the filter for conditions that imply nonempty *and specific contents*.
-        if condition in self._nonempty_and_content_filters:
-            assert self.max_size >= 1, "Always-empty is special cased in st.text()"
-            self = copy.copy(self)
-            self.min_size = max(1, self.min_size)
-            return ListStrategy.filter(self, condition)
 
-        return super().filter(condition)
+def _string_filter_rewrite(self, kind, condition):
+    """Try to rewrite ``self.filter(condition)`` for a text or bytes strategy.
+
+    ``kind`` is ``str`` or ``bytes``.  Bound ``search``/``match``/``findall``/
+    ``fullmatch`` methods of a compiled pattern of that kind become a regex
+    strategy, and content predicates raise ``min_size`` to at least one.
+    Returns None when no rewrite applies, so the caller can fall back.
+    """
+    if condition in (kind.lower, kind.title, kind.upper):
+        k = kind.__name__
+        warnings.warn(
+            f"You applied {k}.{condition.__name__} as a filter, but this allows "
+            f"all nonempty strings! Did you mean {k}.is{condition.__name__}?",
+            HypothesisWarning,
+            stacklevel=3,
+        )
+
+    elems = unwrap_strategies(self.element_strategy)
+    if (
+        (kind is bytes or isinstance(elems, OneCharStringStrategy))
+        and isinstance(pattern := getattr(condition, "__self__", None), re.Pattern)
+        and isinstance(pattern.pattern, kind)
+    ):
+        from hypothesis.strategies._internal.regex import regex_strategy
+
+        if condition.__name__ == "match":
+            # Replace with an equivalent .search condition.  \A anchors at the
+            # start of the string even under re.MULTILINE, and the group keeps
+            # a top-level alternation such as "a|b" from escaping the anchor.
+            anchor, close = (r"\A(?:", ")") if kind is str else (rb"\A(?:", b")")
+            try:
+                pattern = re.compile(
+                    anchor + pattern.pattern + close, flags=pattern.flags
+                )
+            except re.error:
+                # e.g. leading inline flags like "(?i)" cannot be prefixed;
+                # leave such filters unrewritten rather than crash.
+                return None
+            condition = pattern.search
+
+        if condition.__name__ in ("search", "findall", "fullmatch"):
+            s = regex_strategy(
+                pattern,
+                fullmatch=condition.__name__ == "fullmatch",
+                alphabet=self.element_strategy if kind is str else None,
+            )
+            if self.min_size > 0:
+                s = s.filter(partial(min_len, self.min_size))
+            if self.max_size < 1e999:
+                s = s.filter(partial(max_len, self.max_size))
+            return s
+        elif condition.__name__ in ("finditer", "scanner"):
+            # PyPy implements `finditer` as an alias to their `scanner` method
+            warnings.warn(
+                f"You applied {pretty(condition)} as a filter, but this allows "
+                f"any string at all! Did you mean .findall ?",
+                HypothesisWarning,
+                stacklevel=3,
+            )
+            return self
+        elif condition.__name__ == "split":
+            warnings.warn(
+                f"You applied {pretty(condition)} as a filter, but this allows "
+                f"any nonempty string! Did you mean .search ?",
+                HypothesisWarning,
+                stacklevel=3,
+            )
+            return self.filter(bool)
+
+    # We use ListStrategy filter logic for the conditions that *only* imply
+    # the string is nonempty. Here, we increment the min_size but still apply
+    # the filter for conditions that imply nonempty *and specific contents*.
+    if condition in self._nonempty_and_content_filters and self.max_size >= 1:
+        self = copy.copy(self)
+        self.min_size = max(1, self.min_size)
+        return ListStrategy.filter(self, condition)
+
+    return None
 
 
 # Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
@@ -229,9 +304,28 @@ def _identifier_characters():
     return id_start, id_continue
 
 
-class FixedSizeBytes(SearchStrategy):
-    def __init__(self, size):
-        self.size = size
+class BytesStrategy(ListStrategy):
+    """Strategy for ``bytes``, built on a ListStrategy of integers in [0, 255]."""
+
+    def __init__(self, min_size, max_size):
+        super().__init__(IntegersStrategy(0, 255), min_size=min_size, max_size=max_size)
 
     def do_draw(self, data):
-        return bytes(data.draw_bytes(self.size))
+        # TODO: refactor the underlying provider to support variable-length bytes
+        if self.min_size == self.max_size:
+            return bytes(data.draw_bytes(self.min_size))
+        return bytes(super().do_draw(data))
+
+    _nonempty_filters = (
+        *ListStrategy._nonempty_filters,
+        bytes,
+        *(getattr(bytes, n) for n in _nonempty_names),
+    )
+    _nonempty_and_content_filters = (
+        *(getattr(bytes, n) for n in _nonempty_and_content_names),
+    )
+
+    def filter(self, condition):
+        if (new := _string_filter_rewrite(self, bytes, condition)) is not None:
+            return new
+        return super().filter(condition)
diff --git a/hypothesis-python/src/hypothesis/vendor/pretty.py b/hypothesis-python/src/hypothesis/vendor/pretty.py
index 35451b9961..ceffe3a6aa 100644
--- a/hypothesis-python/src/hypothesis/vendor/pretty.py
+++ b/hypothesis-python/src/hypothesis/vendor/pretty.py
@@ -747,7 +747,7 @@ def _repr_float_counting_nans(obj, p, cycle):
     type: _type_pprint,
     types.FunctionType: _function_pprint,
     types.BuiltinFunctionType: _function_pprint,
-    types.MethodType: _repr_pprint,
+    types.MethodType: _function_pprint,
     datetime.datetime: _repr_pprint,
     datetime.timedelta: _repr_pprint,
     BaseException: _exception_pprint,
diff --git a/hypothesis-python/tests/cover/test_filter_rewriting.py b/hypothesis-python/tests/cover/test_filter_rewriting.py
index d5268f7a03..ef31eea749 100644
--- a/hypothesis-python/tests/cover/test_filter_rewriting.py
+++ b/hypothesis-python/tests/cover/test_filter_rewriting.py
@@ -11,6 +11,7 @@
 import decimal
 import math
 import operator
+import re
 from fractions import Fraction
 from functools import partial
 from sys import float_info
@@ -31,6 +32,8 @@
 from tests.common.debug import check_can_generate_examples
 from tests.common.utils import fails_with
 
+A_FEW = 15  # speed up massively-parametrized tests
+
 
 @pytest.mark.parametrize(
     "strategy, predicate, start, end",
@@ -84,6 +87,7 @@
     ],
     ids=get_pretty_function_description,
 )
+@settings(max_examples=A_FEW)
 @given(data=st.data())
 def test_filter_rewriting_ints(data, strategy, predicate, start, end):
     s = strategy.filter(predicate)
@@ -147,6 +151,7 @@ def test_filter_rewriting_ints(data, strategy, predicate, start, end):
     ],
     ids=get_pretty_function_description,
 )
+@settings(max_examples=A_FEW)
 @given(data=st.data())
 def test_filter_rewriting_floats(data, strategy, predicate, min_value, max_value):
     s = strategy.filter(predicate)
@@ -405,6 +410,7 @@ def test_filter_floats_can_skip_subnormals(op, attr, value, expected):
     ],
     ids=get_pretty_function_description,
 )
+@settings(max_examples=A_FEW)
 @given(data=st.data())
 def test_filter_rewriting_text_partial_len(data, strategy, predicate, start, end):
     s = strategy.filter(predicate)
@@ -477,7 +483,7 @@ def test_can_rewrite_multiple_length_filters_if_not_lambdas(data):
     ],
     ids=get_pretty_function_description,
 )
-@settings(max_examples=15)
+@settings(max_examples=A_FEW)
 @given(data=st.data())
 def test_filter_rewriting_text_lambda_len(data, strategy, predicate, start, end):
     s = strategy.filter(predicate)
@@ -541,6 +547,7 @@ def test_filter_rewriting_text_lambda_len(data, strategy, predicate, start, end)
     ],
     ids=get_pretty_function_description,
 )
+@settings(max_examples=A_FEW)
 @given(data=st.data())
 def test_filter_rewriting_lambda_len_unique_elements(
     data, strategy, predicate, start, end
@@ -573,3 +580,40 @@ def test_does_not_rewrite_unsatisfiable_len_filter(predicate):
     # Rewriting to nothing() would correctly express the constraint. However
     # we don't want _only rewritable strategies_ to work in e.g. one_of, so:
     assert not strategy.is_empty
+
+
+@pytest.mark.parametrize(
+    "method", ["match", "search", "findall", "fullmatch", "finditer", "split"]
+)
+@pytest.mark.parametrize(
+    "strategy, pattern",
+    [
+        (st.text(), "ab+c"),
+        (st.text(), "a|b"),  # alternation must stay anchored for .match
+        (st.text(alphabet="abcdef"), "ab+c"),
+        (st.text(min_size=5, max_size=10), "ab+c"),
+        (st.binary(), b"ab+c"),
+        (st.binary(min_size=5, max_size=10), b"ab+c"),
+    ],
+    ids=repr,
+)
+@settings(max_examples=A_FEW)
+@given(data=st.data())
+def test_regex_filter_rewriting(data, strategy, pattern, method):
+    # This would raise a HealthCheck without rewriting, so checking that
+    # we can draw a valid value is sufficient.
+    predicate = getattr(re.compile(pattern), method)
+    s = strategy.filter(predicate)
+    if method in ("finditer", "split"):
+        msg = r"You applied re.compile\(.+?\).\w+ as a filter, but this allows"
+        with pytest.warns(HypothesisWarning, match=msg):
+            value = data.draw(s)
+    else:
+        value = data.draw(s)
+    assert predicate(value)
+
+
+@fails_with(TypeError)
+@given(st.text().filter(re.compile("abc").sub))
+def test_error_on_method_which_requires_multiple_args(_):
+    pass
diff --git a/hypothesis-python/tests/cover/test_health_checks.py b/hypothesis-python/tests/cover/test_health_checks.py
index ae55f3a700..5cc9217d13 100644
--- a/hypothesis-python/tests/cover/test_health_checks.py
+++ b/hypothesis-python/tests/cover/test_health_checks.py
@@ -18,6 +18,7 @@
 from hypothesis.errors import FailedHealthCheck, InvalidArgument
 from hypothesis.internal.compat import int_from_bytes
 from hypothesis.internal.conjecture.data import ConjectureData
+from hypothesis.internal.conjecture.engine import BUFFER_SIZE
 from hypothesis.internal.entropy import deterministic_PRNG
 from hypothesis.stateful import (
     RuleBasedStateMachine,
@@ -121,8 +122,12 @@ def test(x):
     assert "filter" in e.value.args[0]
 
 
+large_strategy = st.binary(min_size=7000, max_size=7000)
+too_large_strategy = st.tuples(large_strategy, large_strategy)
+
+
 def test_large_data_will_fail_a_health_check():
-    @given(st.none() | st.binary(min_size=10**5, max_size=10**5))
+    @given(st.none() | too_large_strategy)
     @settings(database=None)
     def test(x):
         pass
@@ -160,7 +165,7 @@ def a(x):
 
 
 def test_large_base_example_fails_health_check():
-    @given(st.binary(min_size=7000, max_size=7000))
+    @given(large_strategy)
     def test(b):
         pass
 
@@ -171,7 +176,7 @@ def test(b):
 
 
 def test_example_that_shrinks_to_overrun_fails_health_check():
-    @given(st.binary(min_size=9000, max_size=9000) | st.none())
+    @given(too_large_strategy | st.none())
     def test(b):
         pass
 
@@ -260,7 +265,9 @@ def test_does_not_trigger_health_check_when_most_examples_are_small(monkeypatch)
             @settings(database=None, max_examples=11, phases=[Phase.generate])
             @given(
                 st.integers(0, 100).flatmap(
-                    lambda n: st.binary(min_size=n * 100, max_size=n * 100)
+                    lambda n: st.binary(
+                        min_size=min(n * 100, BUFFER_SIZE), max_size=n * 100
+                    )
                 )
             )
             def test(b):