New per-phase statistics system
The per-phase reporting is the big part, but the new system is also designed for easier dumping of machine-readable data (as JSON).

This is useful for research evaluations, and for general performance monitoring too.
Zac-HD committed May 12, 2020
1 parent 0de2cf8 commit 31b02a2
Showing 8 changed files with 204 additions and 196 deletions.
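To illustrate the machine-readable (JSON) use case from the commit message: each test's statistics now arrive as a plain dict, so they can be captured with the same collector hook the pytest plugin uses (see pytestplugin.py below) and serialised directly. A minimal sketch, not part of this commit, assuming the collector hook (a semi-internal API) stays usable outside the plugin:

import json

from hypothesis import given, strategies as st
from hypothesis.statistics import collector

captured = []  # one statistics dict per Hypothesis test run inside the block


@given(st.integers())
def test_integers(i):
    pass


# Install a callback that receives each raw statistics dict, then run the
# test function directly (i.e. outside pytest).
with collector.with_value(captured.append):
    test_integers()

print(json.dumps(captured[0], indent=2))  # per-phase data, ready for analysis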
6 changes: 6 additions & 0 deletions hypothesis-python/RELEASE.rst
@@ -0,0 +1,6 @@
RELEASE_TYPE: minor

This release upgrades the test statistics available via the
:ref:`--hypothesis-show-statistics <statistics>` option to include
separate information on each of the :attr:`~hypothesis.settings.phases`
(:issue:`1555`).
40 changes: 19 additions & 21 deletions hypothesis-python/docs/details.rst
@@ -72,11 +72,9 @@ You would see:

.. code-block:: none
test_integers:
- 100 passing examples, 0 failing examples, 0 invalid examples
- Typical runtimes: ~ 1ms
- Fraction of time spent in data generation: ~ 12%
- during generate phase (0.06 seconds):
- Typical runtimes: < 1ms, ~ 47% in data generation
- 100 passing examples, 0 failing examples, 0 invalid examples
- Stopped because settings.max_examples=100
The final "Stopped because" line is particularly important to note: It tells you the
@@ -102,13 +100,13 @@ You would see something like:
test_even_integers:
- 100 passing examples, 0 failing examples, 36 invalid examples
- Typical runtimes: 0-1 ms
- Fraction of time spent in data generation: ~ 16%
- Stopped because settings.max_examples=100
- Events:
* 80.88%, Retried draw from integers().filter(lambda x: <unknown>) to satisfy filter
* 26.47%, Aborted test because unable to satisfy integers().filter(lambda x: <unknown>)
- during generate phase (0.08 seconds):
- Typical runtimes: < 1ms, ~ 57% in data generation
- 100 passing examples, 0 failing examples, 12 invalid examples
- Events:
* 51.79%, Retried draw from integers().filter(lambda x: x % 2 == 0) to satisfy filter
* 10.71%, Aborted test because unable to satisfy integers().filter(lambda x: x % 2 == 0)
- Stopped because settings.max_examples=100
You can also mark custom events in a test using the ``event`` function:

@@ -130,16 +128,16 @@ You will then see output like:
test_even_integers:
- 100 passing examples, 0 failing examples, 38 invalid examples
- Typical runtimes: 0-1 ms
- Fraction of time spent in data generation: ~ 16%
- during generate phase (0.09 seconds):
- Typical runtimes: < 1ms, ~ 59% in data generation
- 100 passing examples, 0 failing examples, 32 invalid examples
- Events:
* 54.55%, Retried draw from integers().filter(lambda x: x % 2 == 0) to satisfy filter
* 31.06%, i mod 3 = 2
* 28.79%, i mod 3 = 0
* 24.24%, Aborted test because unable to satisfy integers().filter(lambda x: x % 2 == 0)
* 15.91%, i mod 3 = 1
- Stopped because settings.max_examples=100
- Events:
* 80.43%, Retried draw from integers().filter(lambda x: <unknown>) to satisfy filter
* 31.88%, i mod 3 = 0
* 27.54%, Aborted test because unable to satisfy integers().filter(lambda x: <unknown>)
* 21.74%, i mod 3 = 1
* 18.84%, i mod 3 = 2
Arguments to ``event`` can be any hashable type, but two events will be considered the same
if they are the same when converted to a string with :obj:`python:str`.
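The test whose statistics are shown above is in the collapsed part of details.rst; it is roughly the following (a reconstruction for context, not the literal docs example): an even-integers strategy built with .filter(), plus an event() call recording i mod 3.

from hypothesis import event, given, strategies as st


@given(st.integers().filter(lambda x: x % 2 == 0))
def test_even_integers(i):
    # Frequencies of these custom events show up under "Events" in the
    # statistics report, alongside the built-in filter-retry events.
    event("i mod 3 = %d" % (i % 3,))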
4 changes: 2 additions & 2 deletions hypothesis-python/src/hypothesis/core.py
@@ -89,7 +89,7 @@
verbose_report,
with_reporter,
)
from hypothesis.statistics import describe_targets, note_engine_for_statistics
from hypothesis.statistics import describe_targets, note_statistics
from hypothesis.strategies._internal.collections import TupleStrategy
from hypothesis.strategies._internal.strategies import (
Ex,
@@ -730,7 +730,7 @@ def run_engine(self):
# Use the Conjecture engine to run the test function many times
# on different inputs.
runner.run()
note_engine_for_statistics(runner)
note_statistics(runner.statistics)

if runner.call_count == 0:
return
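hypothesis/statistics.py itself is one of the changed files not shown on this page. From the call site above (note_statistics(runner.statistics)) and the collector usage in pytestplugin.py below, it presumably just forwards the runner's dict to whatever callback is currently installed. A guess at the implementation, not taken from this diff; the DynamicVariable import path is assumed:

from hypothesis.utils.dynamicvariables import DynamicVariable

# Consumers (such as the pytest plugin) install a callback via
# collector.with_value(callback); by default nothing is collected.
collector = DynamicVariable(None)


def note_statistics(stats_dict):
    # Hand the engine's raw per-phase statistics dict to the active consumer.
    callback = collector.value
    if callback is not None:
        callback(stats_dict)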
11 changes: 5 additions & 6 deletions hypothesis-python/src/hypothesis/extra/pytestplugin.py
@@ -23,7 +23,7 @@
from hypothesis.errors import InvalidArgument
from hypothesis.internal.detection import is_hypothesis_test
from hypothesis.reporting import default as default_reporter, with_reporter
from hypothesis.statistics import collector
from hypothesis.statistics import collector, describe_statistics

LOAD_PROFILE_OPTION = "--hypothesis-profile"
VERBOSITY_OPTION = "--hypothesis-verbosity"
@@ -177,8 +177,8 @@ def pytest_runtest_call(item):
store = StoringReporter(item.config)

def note_statistics(stats):
lines = [item.nodeid + ":", ""] + stats.get_description() + [""]
item.hypothesis_statistics = lines
stats["nodeid"] = item.nodeid
item.hypothesis_statistics = describe_statistics(stats)

with collector.with_value(note_statistics):
with with_reporter(store):
@@ -205,10 +205,9 @@ def pytest_terminal_summary(terminalreporter):
# always be the key for a list of _pytest.reports.TestReport objects
# (where we stored the statistics data in pytest_runtest_makereport above)
for test_report in terminalreporter.stats.get("", []):
for name, lines in test_report.user_properties:
for name, value in test_report.user_properties:
if name == "hypothesis-stats" and test_report.when == "teardown":
for li in lines:
terminalreporter.write_line(li)
terminalreporter.write_line(value + "\n\n")

def pytest_collection_modifyitems(items):
for item in items:
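The comment in pytest_terminal_summary refers to pytest_runtest_makereport, which is also collapsed here. The mechanism it describes is roughly this (a sketch of the idea, not the plugin's literal code): copy the statistics string from the item onto the teardown report's user_properties, under the key the summary looks for.

import pytest


@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    report = (yield).get_result()
    # Statistics were attached to the item in pytest_runtest_call above;
    # stash them on the teardown report so pytest_terminal_summary finds them.
    if hasattr(item, "hypothesis_statistics") and report.when == "teardown":
        report.user_properties.append(("hypothesis-stats", item.hypothesis_statistics))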
67 changes: 43 additions & 24 deletions hypothesis-python/src/hypothesis/internal/conjecture/engine.py
@@ -13,7 +13,9 @@
#
# END HEADER

from collections import Counter, defaultdict
import math
from collections import defaultdict
from contextlib import contextmanager
from enum import Enum
from random import Random, getrandbits
from time import perf_counter
@@ -90,14 +92,14 @@ def __init__(self, test_function, settings=None, random=None, database_key=None)
self.shrinks = 0
self.finish_shrinking_deadline = None
self.call_count = 0
self.event_call_counts = Counter()
self.valid_examples = 0
self.random = random or Random(getrandbits(128))
self.database_key = database_key
self.status_runtimes = {}

self.all_drawtimes = []
self.all_runtimes = []
# Global dict of per-phase statistics, and a list of per-call stats
# which transfer to the global dict at the end of each phase.
self.statistics = {}
self.stats_per_test_case = []

self.events_to_strings = WeakKeyDictionary()

@@ -131,6 +133,20 @@ def __init__(self, test_function, settings=None, random=None, database_key=None)
# executed test case.
self.__data_cache = LRUReusedCache(CACHE_SIZE)

@contextmanager
def _log_phase_statistics(self, phase):
self.stats_per_test_case.clear()
start_time = perf_counter()
try:
yield
finally:
self.statistics[phase + "-phase"] = {
"duration-seconds": perf_counter() - start_time,
"test-cases": list(self.stats_per_test_case),
"distinct-failures": len(self.interesting_examples),
"shrinks-successful": self.shrinks,
}

@property
def should_optimise(self):
return Phase.target in self.settings.phases
@@ -173,7 +189,14 @@ def test_function(self, data):
# the KeyboardInterrupt, never continue to the code below.
if not interrupted: # pragma: no branch
data.freeze()
self.note_details(data)
call_stats = {
"status": data.status.name.lower(),
"runtime": data.finish_time - data.start_time,
"drawtime": math.fsum(data.draw_times),
"events": sorted({self.event_to_string(e) for e in data.events}),
}
self.stats_per_test_case.append(call_stats)
self.__data_cache[data.buffer] = data.as_result()

self.debug_data(data)

@@ -386,15 +409,6 @@ def secondary_key(self):
def pareto_key(self):
return self.sub_key(b"pareto")

def note_details(self, data):
self.__data_cache[data.buffer] = data.as_result()
runtime = max(data.finish_time - data.start_time, 0.0)
self.all_runtimes.append(runtime)
self.all_drawtimes.extend(data.draw_times)
self.status_runtimes.setdefault(data.status, []).append(runtime)
for event in set(map(self.event_to_string, data.events)):
self.event_call_counts[event] += 1

def debug(self, message):
if self.settings.verbosity >= Verbosity.debug:
base_report(message)
@@ -525,6 +539,9 @@ def reuse_existing_examples(self):
break

def exit_with(self, reason):
self.statistics["stopped-because"] = reason.describe(self.settings)
if self.best_observed_targets:
self.statistics["targets"] = dict(self.best_observed_targets)
self.debug("exit_with(%s)" % (reason.name,))
self.exit_reason = reason
raise RunIsComplete()
@@ -834,15 +851,17 @@ def pareto_optimise(self):
ParetoOptimiser(self).run()

def _run(self):
self.reuse_existing_examples()
self.generate_new_examples()

# We normally run the targeting phase mixed in with the generate phase,
# but if we've been asked to run it but not generation then we have to
# run it explicitly on its own here.
if Phase.generate not in self.settings.phases:
self.optimise_targets()
self.shrink_interesting_examples()
with self._log_phase_statistics("reuse"):
self.reuse_existing_examples()
with self._log_phase_statistics("generate"):
self.generate_new_examples()
# We normally run the targeting phase mixed in with the generate phase,
# but if we've been asked to run it but not generation then we have to
# run it explicitly on its own here.
if Phase.generate not in self.settings.phases:
self.optimise_targets()
with self._log_phase_statistics("shrink"):
self.shrink_interesting_examples()
self.exit_with(ExitReason.finished)

def new_conjecture_data(self, prefix, max_length=BUFFER_SIZE, observer=None):
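Taken together, these hooks mean that runner.statistics, the dict handed to note_statistics() in core.py, ends up with one entry per executed phase plus exit information. The keys below come from the code above; the values and the exact set of optional keys are purely illustrative:

example_statistics = {
    "generate-phase": {
        "duration-seconds": 0.08,
        "test-cases": [
            {
                "status": "valid",       # data.status.name.lower()
                "runtime": 0.0004,       # seconds spent in the test function
                "drawtime": 0.0002,      # seconds spent generating data
                "events": ["i mod 3 = 1"],
            },
            # ... one dict per test case run during this phase
        ],
        "distinct-failures": 0,
        "shrinks-successful": 0,
    },
    # "reuse-phase" and "shrink-phase" entries have the same shape, and only
    # appear for phases that actually ran.
    "targets": {},                       # only present if targets were observed
    "stopped-because": "settings.max_examples=100",  # reason.describe(settings)
    # The pytest plugin also adds a "nodeid" key before formatting the report.
}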