Skip to content

Commit

Permalink
Merge pull request #3898 from Zac-HD/more-rewrites
Browse files Browse the repository at this point in the history
Rewrite regex filters on `text()` and `binary()`
  • Loading branch information
Zac-HD authored Feb 27, 2024

Verified

This commit was signed with the committer’s verified signature.
mikz Michal Cichra
2 parents 1bc9b88 + 6816a6c commit c27727f
Showing 7 changed files with 188 additions and 55 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -13,6 +13,7 @@
.runtimes
/hypothesis-python/branch-check
/pythonpython3.*
/pythonpypy3.*
.pyodide-xbuildenv

# python
@@ -104,3 +105,4 @@ __pycache__
HypothesisWorks.github.io.iml
jekyll.log
/website/output/
/t.py
6 changes: 6 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
RELEASE_TYPE: patch

This patch implements filter-rewriting for :func:`~hypothesis.strategies.text`
and :func:`~hypothesis.strategies.binary` with the :meth:`~re.Pattern.search`,
:meth:`~re.Pattern.match`, or :meth:`~re.Pattern.fullmatch` method of a
:func:`re.compile`\ d regex.
8 changes: 2 additions & 6 deletions hypothesis-python/src/hypothesis/strategies/_internal/core.py
Original file line number Diff line number Diff line change
@@ -134,7 +134,7 @@
one_of,
)
from hypothesis.strategies._internal.strings import (
FixedSizeBytes,
BytesStrategy,
OneCharStringStrategy,
TextStrategy,
)
@@ -963,11 +963,7 @@ def binary(
values.
"""
check_valid_sizes(min_size, max_size)
if min_size == max_size:
return FixedSizeBytes(min_size)
return lists(
integers(min_value=0, max_value=255), min_size=min_size, max_size=max_size
).map(bytes)
return BytesStrategy(min_size, max_size)


@cacheable
165 changes: 122 additions & 43 deletions hypothesis-python/src/hypothesis/strategies/_internal/strings.py
Original file line number Diff line number Diff line change
@@ -11,14 +11,17 @@
import copy
import re
import warnings
from functools import lru_cache
from functools import lru_cache, partial

from hypothesis.errors import HypothesisWarning, InvalidArgument
from hypothesis.internal import charmap
from hypothesis.internal.filtering import max_len, min_len
from hypothesis.internal.intervalsets import IntervalSet
from hypothesis.strategies._internal.collections import ListStrategy
from hypothesis.strategies._internal.lazy import unwrap_strategies
from hypothesis.strategies._internal.numbers import IntegersStrategy
from hypothesis.strategies._internal.strategies import SearchStrategy
from hypothesis.vendor.pretty import pretty


class OneCharStringStrategy(SearchStrategy):
@@ -76,6 +79,33 @@ def do_draw(self, data):
return data.draw_string(self.intervals, min_size=1, max_size=1)


_nonempty_names = (
"capitalize",
"expandtabs",
"join",
"lower",
"rsplit",
"split",
"splitlines",
"swapcase",
"title",
"upper",
)
_nonempty_and_content_names = (
"islower",
"isupper",
"isalnum",
"isalpha",
"isascii",
"isdigit",
"isspace",
"istitle",
"lstrip",
"rstrip",
"strip",
)


class TextStrategy(ListStrategy):
def do_draw(self, data):
# if our element strategy is OneCharStringStrategy, we can skip the
@@ -104,44 +134,17 @@ def __repr__(self):
_nonempty_filters = (
*ListStrategy._nonempty_filters,
str,
str.capitalize,
str.casefold,
str.encode,
str.expandtabs,
str.join,
str.lower,
str.rsplit,
str.split,
str.splitlines,
str.swapcase,
str.title,
str.upper,
*(getattr(str, n) for n in _nonempty_names),
)
_nonempty_and_content_filters = (
str.isidentifier,
str.islower,
str.isupper,
str.isalnum,
str.isalpha,
str.isascii,
str.isdecimal,
str.isdigit,
str.isnumeric,
str.isspace,
str.istitle,
str.lstrip,
str.rstrip,
str.strip,
*(getattr(str, n) for n in _nonempty_and_content_names),
)

def filter(self, condition):
if condition in (str.lower, str.title, str.upper):
warnings.warn(
f"You applied str.{condition.__name__} as a filter, but this allows "
f"all nonempty strings! Did you mean str.is{condition.__name__}?",
HypothesisWarning,
stacklevel=2,
)
elems = unwrap_strategies(self.element_strategy)
if (
condition is str.isidentifier
@@ -163,17 +166,76 @@ def filter(self, condition):
),
# Filter to ensure that NFKC normalization keeps working in future
).filter(str.isidentifier)
if (new := _string_filter_rewrite(self, str, condition)) is not None:
return new
return super().filter(condition)

# We use ListStrategy filter logic for the conditions that *only* imply
# the string is nonempty. Here, we increment the min_size but still apply
# the filter for conditions that imply nonempty *and specific contents*.
if condition in self._nonempty_and_content_filters:
assert self.max_size >= 1, "Always-empty is special cased in st.text()"
self = copy.copy(self)
self.min_size = max(1, self.min_size)
return ListStrategy.filter(self, condition)

return super().filter(condition)
def _string_filter_rewrite(self, kind, condition):
    """Try to rewrite ``self.filter(condition)`` into a better strategy.

    Shared by the ``filter`` methods of the text and bytes strategies.

    :param self: the strategy being filtered; this function reads its
        ``element_strategy``, ``min_size``, ``max_size`` and
        ``_nonempty_and_content_filters`` attributes.
    :param kind: ``str`` or ``bytes``, the type of value ``self`` generates.
    :param condition: the predicate passed to ``.filter()``.
    :returns: a replacement strategy, or ``None`` when no rewrite applies, in
        which case the caller falls back to its generic filter handling.
    """
    if condition in (kind.lower, kind.title, kind.upper):
        k = kind.__name__
        warnings.warn(
            f"You applied {k}.{condition.__name__} as a filter, but this allows "
            f"all nonempty strings! Did you mean {k}.is{condition.__name__}?",
            HypothesisWarning,
            # This helper sits one frame below the ``filter`` method that calls
            # it, so use the same stacklevel as the other warnings below.
            stacklevel=3,
        )

    elems = unwrap_strategies(self.element_strategy)
    if (
        (kind is bytes or isinstance(elems, OneCharStringStrategy))
        and isinstance(pattern := getattr(condition, "__self__", None), re.Pattern)
        and isinstance(pattern.pattern, kind)
    ):
        # ``condition`` is a bound method of a compiled regex whose pattern
        # has the same type as the values we generate.
        from hypothesis.strategies._internal.regex import regex_strategy

        method = condition.__name__

        if method == "match":
            # Replace with an easier-to-handle equivalent: an anchored search.
            # The original pattern is wrapped in a non-capturing group so that
            # the anchor applies to the whole pattern and not just the first
            # alternative ("^a|b" would also accept "xb").  In verbose mode a
            # trailing "# comment" would swallow the closing parenthesis, so
            # end the comment with a newline first.
            # NOTE(review): under re.MULTILINE, "^" also matches after each
            # newline whereas Pattern.match only matches at the very start;
            # r"\A" would be stricter - confirm regex_strategy supports it
            # before switching.
            verbose = bool(pattern.flags & re.VERBOSE)
            if kind is str:
                anchored = "^(?:" + pattern.pattern + ("\n)" if verbose else ")")
            else:
                anchored = b"^(?:" + pattern.pattern + (b"\n)" if verbose else b")")
            try:
                pattern = re.compile(anchored, flags=pattern.flags)
            except re.error:
                # e.g. the pattern starts with inline global flags such as
                # "(?i)", which may only appear at the very start of a regex.
                # Decline to rewrite rather than break a filter which works
                # perfectly well when applied as-is.
                return None
            method = "search"

        if method in ("search", "findall", "fullmatch"):
            s = regex_strategy(
                pattern,
                fullmatch=method == "fullmatch",
                alphabet=self.element_strategy if kind is str else None,
            )
            # Carry the size bounds of the original strategy over as filters
            # on the regex strategy.
            if self.min_size > 0:
                s = s.filter(partial(min_len, self.min_size))
            if self.max_size < float("inf"):
                s = s.filter(partial(max_len, self.max_size))
            return s
        elif method in ("finditer", "scanner"):
            # PyPy implements `finditer` as an alias to their `scanner` method
            warnings.warn(
                f"You applied {pretty(condition)} as a filter, but this allows "
                "any string at all! Did you mean .findall ?",
                HypothesisWarning,
                stacklevel=3,
            )
            return self
        elif method == "split":
            warnings.warn(
                f"You applied {pretty(condition)} as a filter, but this allows "
                "any nonempty string! Did you mean .search ?",
                HypothesisWarning,
                stacklevel=3,
            )
            return self.filter(bool)

    # We use ListStrategy filter logic for the conditions that *only* imply
    # the string is nonempty.  Here, we increment the min_size but still apply
    # the filter for conditions that imply nonempty *and specific contents*.
    if condition in self._nonempty_and_content_filters and self.max_size >= 1:
        # Work on a copy so the strategy we were called on is left unchanged.
        bumped = copy.copy(self)
        bumped.min_size = max(1, bumped.min_size)
        return ListStrategy.filter(bumped, condition)

    return None


# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
@@ -229,9 +291,26 @@ def _identifier_characters():
return id_start, id_continue


class FixedSizeBytes(SearchStrategy):
def __init__(self, size):
self.size = size
class BytesStrategy(ListStrategy):
def __init__(self, min_size, max_size):
super().__init__(IntegersStrategy(0, 255), min_size=min_size, max_size=max_size)

def do_draw(self, data):
return bytes(data.draw_bytes(self.size))
# TODO: refactor the underlying provider to support variable-length bytes
if self.min_size == self.max_size:
return bytes(data.draw_bytes(self.min_size))
return bytes(super().do_draw(data))

_nonempty_filters = (
*ListStrategy._nonempty_filters,
bytes,
*(getattr(bytes, n) for n in _nonempty_names),
)
_nonempty_and_content_filters = (
*(getattr(bytes, n) for n in _nonempty_and_content_names),
)

def filter(self, condition):
if (new := _string_filter_rewrite(self, bytes, condition)) is not None:
return new
return super().filter(condition)
2 changes: 1 addition & 1 deletion hypothesis-python/src/hypothesis/vendor/pretty.py
Original file line number Diff line number Diff line change
@@ -747,7 +747,7 @@ def _repr_float_counting_nans(obj, p, cycle):
type: _type_pprint,
types.FunctionType: _function_pprint,
types.BuiltinFunctionType: _function_pprint,
types.MethodType: _repr_pprint,
types.MethodType: _function_pprint,
datetime.datetime: _repr_pprint,
datetime.timedelta: _repr_pprint,
BaseException: _exception_pprint,
45 changes: 44 additions & 1 deletion hypothesis-python/tests/cover/test_filter_rewriting.py
Original file line number Diff line number Diff line change
@@ -11,6 +11,7 @@
import decimal
import math
import operator
import re
from fractions import Fraction
from functools import partial
from sys import float_info
@@ -31,6 +32,8 @@
from tests.common.debug import check_can_generate_examples
from tests.common.utils import fails_with

A_FEW = 15 # speed up massively-parametrized tests


@pytest.mark.parametrize(
"strategy, predicate, start, end",
@@ -84,6 +87,7 @@
],
ids=get_pretty_function_description,
)
@settings(max_examples=A_FEW)
@given(data=st.data())
def test_filter_rewriting_ints(data, strategy, predicate, start, end):
s = strategy.filter(predicate)
@@ -147,6 +151,7 @@ def test_filter_rewriting_ints(data, strategy, predicate, start, end):
],
ids=get_pretty_function_description,
)
@settings(max_examples=A_FEW)
@given(data=st.data())
def test_filter_rewriting_floats(data, strategy, predicate, min_value, max_value):
s = strategy.filter(predicate)
@@ -405,6 +410,7 @@ def test_filter_floats_can_skip_subnormals(op, attr, value, expected):
],
ids=get_pretty_function_description,
)
@settings(max_examples=A_FEW)
@given(data=st.data())
def test_filter_rewriting_text_partial_len(data, strategy, predicate, start, end):
s = strategy.filter(predicate)
@@ -477,7 +483,7 @@ def test_can_rewrite_multiple_length_filters_if_not_lambdas(data):
],
ids=get_pretty_function_description,
)
@settings(max_examples=15)
@settings(max_examples=A_FEW)
@given(data=st.data())
def test_filter_rewriting_text_lambda_len(data, strategy, predicate, start, end):
s = strategy.filter(predicate)
@@ -541,6 +547,7 @@ def test_filter_rewriting_text_lambda_len(data, strategy, predicate, start, end)
],
ids=get_pretty_function_description,
)
@settings(max_examples=A_FEW)
@given(data=st.data())
def test_filter_rewriting_lambda_len_unique_elements(
data, strategy, predicate, start, end
@@ -573,3 +580,39 @@ def test_does_not_rewrite_unsatisfiable_len_filter(predicate):
# Rewriting to nothing() would correctly express the constraint. However
# we don't want _only rewritable strategies_ to work in e.g. one_of, so:
assert not strategy.is_empty


@pytest.mark.parametrize(
    "method", ["match", "search", "findall", "fullmatch", "finditer", "split"]
)
@pytest.mark.parametrize(
    "strategy, pattern",
    [
        (st.text(), "ab+c"),
        (st.text(alphabet="abcdef"), "ab+c"),
        (st.text(min_size=5, max_size=10), "ab+c"),
        (st.binary(), b"ab+c"),
        (st.binary(min_size=5, max_size=10), b"ab+c"),
    ],
    ids=repr,
)
@settings(max_examples=A_FEW)
@given(data=st.data())
def test_regex_filter_rewriting(data, strategy, pattern, method):
    """Values drawn through a compiled-regex method filter satisfy that method."""
    # Without rewriting this would raise a HealthCheck, so being able to draw
    # a value which passes the predicate is a sufficient check.
    predicate = getattr(re.compile(pattern), method)
    filtered = strategy.filter(predicate)
    if method not in ("finditer", "split"):
        value = data.draw(filtered)
    else:
        # These two methods are expected to warn when used as a filter.
        expected = r"You applied re.compile\(.+?\).\w+ as a filter, but this allows"
        with pytest.warns(HypothesisWarning, match=expected):
            value = data.draw(filtered)
    assert predicate(value)


@fails_with(TypeError)
@given(st.text().filter(re.compile("abc").sub))
def test_error_on_method_which_requires_multiple_args(_):
    """Filtering with ``Pattern.sub`` is expected to fail with a TypeError."""
Loading

0 comments on commit c27727f

Please sign in to comment.