Merge pull request #4214 from tybug/automate-shrinking-benchmark
Automate shrinking benchmark more
tybug authored Dec 27, 2024
2 parents 2530e74 + 79fef72 commit 3dbfae2
Showing 8 changed files with 604 additions and 194 deletions.
18 changes: 8 additions & 10 deletions hypothesis-python/benchmark/README.md
@@ -1,14 +1,12 @@
-This directory contains code for benchmarking Hypothesis' shrinking. This was written for [pull/3962](https://github.com/HypothesisWorks/hypothesis/pull/3962) and is a manual process at the moment, though we may eventually integrate it more closely with ci for automated benchmarking.
+This directory contains plotting code for our shrinker benchmarking. The code for collecting the data is in `conftest.py`. This directory handles plotting the results.

-To run a benchmark:
-
-* Add the contents of `conftest.py` to the bottom of `hypothesis-python/tests/conftest.py`
-* In `hypothesis-python/tests/common/debug.py`, change `derandomize=True` to `derandomize=False` (if you are running more than one trial)
-* Run the tests: `pytest hypothesis-python/tests/`
-* Note that the benchmarking script does not currently support xdist, so do not use `-n 8` or similar.
+The plotting script (but not collecting benchmark data) requires additional dependencies: `pip install scipy vl-convert-python`.

-When pytest finishes the output will contain a dictionary of the benchmarking results. Add that as a new entry in `data.json`. Repeat for however many trials you want; n=5 seems reasonable.
+To run a benchmark:

-Also repeat for both your baseline ("old") and your comparison ("new") code.
+- `pytest tests/ --hypothesis-benchmark-shrinks new --hypothesis-benchmark-output data.json` (starting on the newer version)
+- `pytest tests/ --hypothesis-benchmark-shrinks old --hypothesis-benchmark-output data.json` (after switching to the old version)
+- Use the same `data.json` path; the benchmark will append data. You can append `-k ...` to both commands to subset the benchmark.
+- `python benchmark/graph.py data.json shrinking.png`

-Then run `python graph.py` to generate a graph comparing the old and new results.
+This hooks any `minimal()` calls and reports the number of shrinks. The default (and currently unchangeable) number of iterations is 5 per test.
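For reference, the `data.json` written by the two pytest runs above is what `graph.py` consumes. A rough sketch of the shape it expects, inferred from `_process_benchmark_data` in the new `graph.py` below (the node id and the numbers are made up, and each "old"/"new" entry may carry more than the `calls` mapping read there):

{
    "old": {"calls": {"tests/nocover/test_example.py::test_foo": [12, 11, 13, 12, 12]}},
    "new": {"calls": {"tests/nocover/test_example.py::test_foo": [4, 5, 4, 4, 5]}}
}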
71 changes: 0 additions & 71 deletions hypothesis-python/benchmark/conftest.py

This file was deleted.

4 changes: 0 additions & 4 deletions hypothesis-python/benchmark/data.json

This file was deleted.

218 changes: 111 additions & 107 deletions hypothesis-python/benchmark/graph.py
@@ -9,113 +9,117 @@
# obtain one at https://mozilla.org/MPL/2.0/.

import json
import math
import statistics
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

data_path = Path(__file__).parent / "data.json"
with open(data_path) as f:
    data = json.loads(f.read())

old_runs = data["old"]
new_runs = data["new"]
all_runs = old_runs + new_runs

# every run should involve the same functions
names = set()
for run in all_runs:
    names.add(frozenset(run.keys()))

intersection = frozenset.intersection(*names)
diff = frozenset.union(*[intersection.symmetric_difference(n) for n in names])

print(f"skipping these tests which were not present in all runs: {', '.join(diff)}")
names = list(intersection)

# the similar invariant for number of minimal calls per run is not true: functions
# may make a variable number of minimal() calls.
# it would be nice to compare identically just the ones which don't vary, to get
# a very fine grained comparison instead of averaging.
# sizes = []
# for run in all_runs:
#     sizes.append(tuple(len(value) for value in run.values()))
# assert len(set(sizes)) == 1

new_names = []
for name in names:
    if all(all(x == 0 for x in run[name]) for run in all_runs):
        print(f"no shrinks for {name}, skipping")
        continue
    new_names.append(name)
names = new_names

# either "time" or "calls"
statistic = "time"
# name : average calls
old_values = {}
new_values = {}
for name in names:

    # mean across the different minimal() calls in a single test function, then
    # median across the n iterations we ran that for to reduce error
    old_vals = [statistics.mean(r[statistic] for r in run[name]) for run in old_runs]
    new_vals = [statistics.mean(r[statistic] for r in run[name]) for run in new_runs]
    old_values[name] = statistics.median(old_vals)
    new_values[name] = statistics.median(new_vals)

# name : (absolute difference, times difference)
diffs = {}
for name in names:
    old = old_values[name]
    new = new_values[name]
    diff = old - new
    if old == 0:
        diff_times = 0
    else:
        diff_times = (old - new) / old
    if 0 < diff_times < 1:
        diff_times = (1 / (1 - diff_times)) - 1
    diffs[name] = (diff, diff_times)

    print(f"{name} {diff} ({old} -> {new}, {round(diff_times, 1)}✕)")

diffs = dict(sorted(diffs.items(), key=lambda kv: kv[1][0]))
diffs_value = [v[0] for v in diffs.values()]
diffs_percentage = [v[1] for v in diffs.values()]

print(f"mean: {statistics.mean(diffs_value)}, median: {statistics.median(diffs_value)}")


# https://stackoverflow.com/a/65824524
def align_axes(ax1, ax2):
    ax1_ylims = ax1.axes.get_ylim()
    ax1_yratio = ax1_ylims[0] / ax1_ylims[1]

    ax2_ylims = ax2.axes.get_ylim()
    ax2_yratio = ax2_ylims[0] / ax2_ylims[1]

    if ax1_yratio < ax2_yratio:
        ax2.set_ylim(bottom=ax2_ylims[1] * ax1_yratio)
    else:
        ax1.set_ylim(bottom=ax1_ylims[1] * ax2_yratio)


ax1 = sns.barplot(diffs_value, color="b", alpha=0.7, label="absolute change")
ax2 = plt.twinx()
sns.barplot(diffs_percentage, color="r", alpha=0.7, ax=ax2, label="n✕ change")

ax1.set_title(
    "old shrinks - new shrinks (aka shrinks saved, higher is better)"
    if statistic == "calls"
    else "old time - new time in seconds (aka time saved, higher is better)"
)
ax1.set_xticks([])
align_axes(ax1, ax2)
legend1 = ax1.legend(loc="upper left")
legend1.legend_handles[0].set_color("b")
legend2 = ax2.legend(loc="lower right")
legend2.legend_handles[0].set_color("r")

plt.show()
import click


def plot_vega(vega_spec, data, *, to, parameters=None):
    import vl_convert

    parameters = parameters or {}

    spec = json.loads(vega_spec.read_text())
    spec["data"].insert(0, {"name": "source", "values": data})
    if "signals" not in spec:
        spec["signals"] = []

    for key, value in parameters.items():
        spec["signals"].append({"name": key, "value": value})

    with open(to, "wb") as f:
        # default ppi is 72, which is somewhat blurry.
        f.write(vl_convert.vega_to_png(spec, ppi=200))


def _mean_difference_ci(n1, n2, *, confidence):
    from scipy import stats

    var1 = statistics.variance(n1)
    var2 = statistics.variance(n2)
    df = len(n1) + len(n2) - 2
    # this assumes equal variances between the populations of n1 and n2. This
    # is not necessarily true (new might be more consistent than old), but it's
    # good enough.
    pooled_std = math.sqrt(((len(n1) - 1) * var1 + (len(n2) - 1) * var2) / df)
    se = pooled_std * math.sqrt(1 / len(n1) + 1 / len(n2))
    t_crit = stats.t.ppf((1 + confidence) / 2, df)
    return t_crit * se
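
# A worked check of the half-width above, with purely illustrative numbers: for
# 5 iterations on each side, df = 5 + 5 - 2 = 8 and stats.t.ppf(0.975, 8) is
# about 2.306, so the reported 95% interval is the mean difference plus or
# minus roughly 2.3 pooled standard errors.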


def _process_benchmark_data(data):
    assert set(data) == {"old", "new"}
    old_calls = data["old"]["calls"]
    new_calls = data["new"]["calls"]
    assert set(old_calls) == set(new_calls), set(old_calls).symmetric_difference(
        set(new_calls)
    )

    graph_data = []

    def _diff_times(old, new):
        if old == 0 and new == 0:
            return 0
        if old == 0:
            # there aren't any great options here, but 0 is more reasonable than inf.
            return 0
        v = (old - new) / old
        if 0 < v < 1:
            v = (1 / (1 - v)) - 1
        return v

    sums = {"old": 0, "new": 0}
    for node_id in old_calls:
        old = old_calls[node_id]
        new = new_calls[node_id]
        if set(old) | set(new) == {0} or len(old) != len(new):
            print(f"skipping {node_id}")
            continue

        sums["old"] += statistics.mean(old)
        sums["new"] += statistics.mean(new)
        diffs = [n_old - n_new for n_old, n_new in zip(old, new)]
        diffs_times = [_diff_times(n_old, n_new) for n_old, n_new in zip(old, new)]
        ci_shrink = (
            _mean_difference_ci(old, new, confidence=0.95) if len(old) > 1 else 0
        )

        graph_data.append(
            {
                "node_id": node_id,
                "absolute": statistics.mean(diffs),
                "absolute_ci_lower": ci_shrink,
                "absolute_ci_upper": ci_shrink,
                "nx": statistics.mean(diffs_times),
                "nx_ci_lower": 0,
                "nx_ci_upper": 0,
            }
        )

    graph_data = sorted(graph_data, key=lambda d: d["absolute"])
    return graph_data, sums


@click.command()
@click.argument("data", type=click.Path(exists=True, path_type=Path))
@click.argument("out", type=click.Path(path_type=Path))
def plot(data, out):
    data = json.loads(data.read_text())
    data, sums = _process_benchmark_data(data)
    plot_vega(
        Path(__file__).parent / "spec.json",
        data=data,
        to=out,
        parameters={
            "title": "Shrinking benchmark (calls)",
            "sum_old": sums["old"],
            "sum_new": sums["new"],
            "absolute_axis_title": ("shrink call change (old - new, larger is good)"),
        },
    )


if __name__ == "__main__":
    plot()
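
For intuition on the "nx" values plotted above: `_diff_times` folds a fractional saving into an "n times fewer calls" figure. A small standalone sketch of that arithmetic, with made-up numbers, mirroring the logic in `graph.py` rather than importing it:

def times_fewer(old, new):
    # same shape as _diff_times above: guard the old == 0 case, then fold a
    # fractional saving in (0, 1) into a "times fewer" multiplier
    if old == 0:
        return 0
    v = (old - new) / old
    if 0 < v < 1:
        v = (1 / (1 - v)) - 1
    return v


# shrinking with 25 calls instead of 100 is a 75% saving, reported as 3x fewer
assert times_fewer(100, 25) == 3.0
# regressions stay as a plain fraction: twice as many calls reads as -1.0
assert times_fewer(100, 200) == -1.0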