New per-phase statistics system
The per-phase reporting is the big part, but the new system is also designed for easier dumping of machine-readable data (as JSON).

This is useful for research evaluations, and for general performance monitoring too.
Zac-HD committed May 12, 2020
1 parent 0de2cf8 commit 31b02a2
Showing 8 changed files with 204 additions and 196 deletions.
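To illustrate the machine-readable (JSON) use case from the commit message: each test's statistics now arrive as a plain dict, so they can be captured with the same collector hook the pytest plugin uses (see pytestplugin.py below) and serialised directly. A minimal sketch, not part of this commit, assuming the collector hook (a semi-internal API) stays usable outside the plugin:

import json

from hypothesis import given, strategies as st
from hypothesis.statistics import collector

captured = []  # one statistics dict per Hypothesis test run inside the block


@given(st.integers())
def test_integers(i):
    pass


# Install a callback that receives each raw statistics dict, then run the
# test function directly (i.e. outside pytest).
with collector.with_value(captured.append):
    test_integers()

print(json.dumps(captured[0], indent=2))  # per-phase data, ready for analysis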
6 changes: 6 additions & 0 deletions hypothesis-python/RELEASE.rst
@@ -0,0 +1,6 @@
RELEASE_TYPE: minor

This release upgrades the test statistics available via the
:ref:`--hypothesis-show-statistics <statistics>` option to include
separate information on each of the :attr:`~hypothesis.settings.phases`
(:issue:`1555`).
40 changes: 19 additions & 21 deletions hypothesis-python/docs/details.rst
@@ -72,11 +72,9 @@ You would see:

.. code-block:: none
test_integers:
- 100 passing examples, 0 failing examples, 0 invalid examples
- Typical runtimes: ~ 1ms
- Fraction of time spent in data generation: ~ 12%
- during generate phase (0.06 seconds):
- Typical runtimes: < 1ms, ~ 47% in data generation
- 100 passing examples, 0 failing examples, 0 invalid examples
- Stopped because settings.max_examples=100
The final "Stopped because" line is particularly important to note: It tells you the
@@ -102,13 +100,13 @@ You would see something like:
test_even_integers:
- 100 passing examples, 0 failing examples, 36 invalid examples
- Typical runtimes: 0-1 ms
- Fraction of time spent in data generation: ~ 16%
- Stopped because settings.max_examples=100
- Events:
* 80.88%, Retried draw from integers().filter(lambda x: <unknown>) to satisfy filter
* 26.47%, Aborted test because unable to satisfy integers().filter(lambda x: <unknown>)
- during generate phase (0.08 seconds):
- Typical runtimes: < 1ms, ~ 57% in data generation
- 100 passing examples, 0 failing examples, 12 invalid examples
- Events:
* 51.79%, Retried draw from integers().filter(lambda x: x % 2 == 0) to satisfy filter
* 10.71%, Aborted test because unable to satisfy integers().filter(lambda x: x % 2 == 0)
- Stopped because settings.max_examples=100
You can also mark custom events in a test using the ``event`` function:

@@ -130,16 +128,16 @@ You will then see output like:
test_even_integers:
- 100 passing examples, 0 failing examples, 38 invalid examples
- Typical runtimes: 0-1 ms
- Fraction of time spent in data generation: ~ 16%
- during generate phase (0.09 seconds):
- Typical runtimes: < 1ms, ~ 59% in data generation
- 100 passing examples, 0 failing examples, 32 invalid examples
- Events:
* 54.55%, Retried draw from integers().filter(lambda x: x % 2 == 0) to satisfy filter
* 31.06%, i mod 3 = 2
* 28.79%, i mod 3 = 0
* 24.24%, Aborted test because unable to satisfy integers().filter(lambda x: x % 2 == 0)
* 15.91%, i mod 3 = 1
- Stopped because settings.max_examples=100
- Events:
* 80.43%, Retried draw from integers().filter(lambda x: <unknown>) to satisfy filter
* 31.88%, i mod 3 = 0
* 27.54%, Aborted test because unable to satisfy integers().filter(lambda x: <unknown>)
* 21.74%, i mod 3 = 1
* 18.84%, i mod 3 = 2
Arguments to ``event`` can be any hashable type, but two events will be considered the same
if they are the same when converted to a string with :obj:`python:str`.
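The test whose statistics are shown above is in the collapsed part of details.rst; it is roughly the following (a reconstruction for context, not the literal docs example): an even-integers strategy built with .filter(), plus an event() call recording i mod 3.

from hypothesis import event, given, strategies as st


@given(st.integers().filter(lambda x: x % 2 == 0))
def test_even_integers(i):
    # Frequencies of these custom events show up under "Events" in the
    # statistics report, alongside the built-in filter-retry events.
    event("i mod 3 = %d" % (i % 3,))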
4 changes: 2 additions & 2 deletions hypothesis-python/src/hypothesis/core.py
@@ -89,7 +89,7 @@
verbose_report,
with_reporter,
)
from hypothesis.statistics import describe_targets, note_engine_for_statistics
from hypothesis.statistics import describe_targets, note_statistics
from hypothesis.strategies._internal.collections import TupleStrategy
from hypothesis.strategies._internal.strategies import (
Ex,
@@ -730,7 +730,7 @@ def run_engine(self):
# Use the Conjecture engine to run the test function many times
# on different inputs.
runner.run()
note_engine_for_statistics(runner)
note_statistics(runner.statistics)

if runner.call_count == 0:
return
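hypothesis/statistics.py itself is one of the changed files not shown on this page. From the call site above (note_statistics(runner.statistics)) and the collector usage in pytestplugin.py below, it presumably just forwards the runner's dict to whatever callback is currently installed. A guess at the implementation, not taken from this diff; the DynamicVariable import path is assumed:

from hypothesis.utils.dynamicvariables import DynamicVariable

# Consumers (such as the pytest plugin) install a callback via
# collector.with_value(callback); by default nothing is collected.
collector = DynamicVariable(None)


def note_statistics(stats_dict):
    # Hand the engine's raw per-phase statistics dict to the active consumer.
    callback = collector.value
    if callback is not None:
        callback(stats_dict)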
11 changes: 5 additions & 6 deletions hypothesis-python/src/hypothesis/extra/pytestplugin.py
@@ -23,7 +23,7 @@
from hypothesis.errors import InvalidArgument
from hypothesis.internal.detection import is_hypothesis_test
from hypothesis.reporting import default as default_reporter, with_reporter
from hypothesis.statistics import collector
from hypothesis.statistics import collector, describe_statistics

LOAD_PROFILE_OPTION = "--hypothesis-profile"
VERBOSITY_OPTION = "--hypothesis-verbosity"
@@ -177,8 +177,8 @@ def pytest_runtest_call(item):
store = StoringReporter(item.config)

def note_statistics(stats):
lines = [item.nodeid + ":", ""] + stats.get_description() + [""]
item.hypothesis_statistics = lines
stats["nodeid"] = item.nodeid
item.hypothesis_statistics = describe_statistics(stats)

with collector.with_value(note_statistics):
with with_reporter(store):
@@ -205,10 +205,9 @@ def pytest_terminal_summary(terminalreporter):
# always be the key for a list of _pytest.reports.TestReport objects
# (where we stored the statistics data in pytest_runtest_makereport above)
for test_report in terminalreporter.stats.get("", []):
for name, lines in test_report.user_properties:
for name, value in test_report.user_properties:
if name == "hypothesis-stats" and test_report.when == "teardown":
for li in lines:
terminalreporter.write_line(li)
terminalreporter.write_line(value + "\n\n")

def pytest_collection_modifyitems(items):
for item in items:
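The comment in pytest_terminal_summary refers to pytest_runtest_makereport, which is also collapsed here. The mechanism it describes is roughly this (a sketch of the idea, not the plugin's literal code): copy the statistics string from the item onto the teardown report's user_properties, under the key the summary looks for.

import pytest


@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    report = (yield).get_result()
    # Statistics were attached to the item in pytest_runtest_call above;
    # stash them on the teardown report so pytest_terminal_summary finds them.
    if hasattr(item, "hypothesis_statistics") and report.when == "teardown":
        report.user_properties.append(("hypothesis-stats", item.hypothesis_statistics))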
67 changes: 43 additions & 24 deletions hypothesis-python/src/hypothesis/internal/conjecture/engine.py
@@ -13,7 +13,9 @@
#
# END HEADER

from collections import Counter, defaultdict
import math
from collections import defaultdict
from contextlib import contextmanager
from enum import Enum
from random import Random, getrandbits
from time import perf_counter
@@ -90,14 +92,14 @@ def __init__(self, test_function, settings=None, random=None, database_key=None)
self.shrinks = 0
self.finish_shrinking_deadline = None
self.call_count = 0
self.event_call_counts = Counter()
self.valid_examples = 0
self.random = random or Random(getrandbits(128))
self.database_key = database_key
self.status_runtimes = {}

self.all_drawtimes = []
self.all_runtimes = []
# Global dict of per-phase statistics, and a list of per-call stats
# which transfer to the global dict at the end of each phase.
self.statistics = {}
self.stats_per_test_case = []

self.events_to_strings = WeakKeyDictionary()

@@ -131,6 +133,20 @@ def __init__(self, test_function, settings=None, random=None, database_key=None)
# executed test case.
self.__data_cache = LRUReusedCache(CACHE_SIZE)

@contextmanager
def _log_phase_statistics(self, phase):
self.stats_per_test_case.clear()
start_time = perf_counter()
try:
yield
finally:
self.statistics[phase + "-phase"] = {
"duration-seconds": perf_counter() - start_time,
"test-cases": list(self.stats_per_test_case),
"distinct-failures": len(self.interesting_examples),
"shrinks-successful": self.shrinks,
}

@property
def should_optimise(self):
return Phase.target in self.settings.phases
@@ -173,7 +189,14 @@ def test_function(self, data):
# the KeyboardInterrupt, never continue to the code below.
if not interrupted: # pragma: no branch
data.freeze()
self.note_details(data)
call_stats = {
"status": data.status.name.lower(),
"runtime": data.finish_time - data.start_time,
"drawtime": math.fsum(data.draw_times),
"events": sorted({self.event_to_string(e) for e in data.events}),
}
self.stats_per_test_case.append(call_stats)
self.__data_cache[data.buffer] = data.as_result()

self.debug_data(data)

@@ -386,15 +409,6 @@ def secondary_key(self):
def pareto_key(self):
return self.sub_key(b"pareto")

def note_details(self, data):
self.__data_cache[data.buffer] = data.as_result()
runtime = max(data.finish_time - data.start_time, 0.0)
self.all_runtimes.append(runtime)
self.all_drawtimes.extend(data.draw_times)
self.status_runtimes.setdefault(data.status, []).append(runtime)
for event in set(map(self.event_to_string, data.events)):
self.event_call_counts[event] += 1

def debug(self, message):
if self.settings.verbosity >= Verbosity.debug:
base_report(message)
@@ -525,6 +539,9 @@ def reuse_existing_examples(self):
break

def exit_with(self, reason):
self.statistics["stopped-because"] = reason.describe(self.settings)
if self.best_observed_targets:
self.statistics["targets"] = dict(self.best_observed_targets)
self.debug("exit_with(%s)" % (reason.name,))
self.exit_reason = reason
raise RunIsComplete()
@@ -834,15 +851,17 @@ def pareto_optimise(self):
ParetoOptimiser(self).run()

def _run(self):
self.reuse_existing_examples()
self.generate_new_examples()

# We normally run the targeting phase mixed in with the generate phase,
# but if we've been asked to run it but not generation then we have to
# run it explicitly on its own here.
if Phase.generate not in self.settings.phases:
self.optimise_targets()
self.shrink_interesting_examples()
with self._log_phase_statistics("reuse"):
self.reuse_existing_examples()
with self._log_phase_statistics("generate"):
self.generate_new_examples()
# We normally run the targeting phase mixed in with the generate phase,
# but if we've been asked to run it but not generation then we have to
# run it explicitly on its own here.
if Phase.generate not in self.settings.phases:
self.optimise_targets()
with self._log_phase_statistics("shrink"):
self.shrink_interesting_examples()
self.exit_with(ExitReason.finished)

def new_conjecture_data(self, prefix, max_length=BUFFER_SIZE, observer=None):
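Taken together, these hooks mean that runner.statistics, the dict handed to note_statistics() in core.py, ends up with one entry per executed phase plus exit information. The keys below come from the code above; the values and the exact set of optional keys are purely illustrative:

example_statistics = {
    "generate-phase": {
        "duration-seconds": 0.08,
        "test-cases": [
            {
                "status": "valid",       # data.status.name.lower()
                "runtime": 0.0004,       # seconds spent in the test function
                "drawtime": 0.0002,      # seconds spent generating data
                "events": ["i mod 3 = 1"],
            },
            # ... one dict per test case run during this phase
        ],
        "distinct-failures": 0,
        "shrinks-successful": 0,
    },
    # "reuse-phase" and "shrink-phase" entries have the same shape, and only
    # appear for phases that actually ran.
    "targets": {},                       # only present if targets were observed
    "stopped-because": "settings.max_examples=100",  # reason.describe(settings)
    # The pytest plugin also adds a "nodeid" key before formatting the report.
}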