Merge pull request #4214 from tybug/automate-shrinking-benchmark
Automate shrinking benchmark more
tybug authored Dec 27, 2024
2 parents 2530e74 + 79fef72 commit 3dbfae2
Showing 8 changed files with 604 additions and 194 deletions.
18 changes: 8 additions & 10 deletions hypothesis-python/benchmark/README.md
@@ -1,14 +1,12 @@
-This directory contains code for benchmarking Hypothesis' shrinking. This was written for [pull/3962](https://github.com/HypothesisWorks/hypothesis/pull/3962) and is a manual process at the moment, though we may eventually integrate it more closely with ci for automated benchmarking.
+This directory contains plotting code for our shrinker benchmarking. The code for collecting the data is in `conftest.py`. This directory handles plotting the results.

-To run a benchmark:
-
-* Add the contents of `conftest.py` to the bottom of `hypothesis-python/tests/conftest.py`
-* In `hypothesis-python/tests/common/debug.py`, change `derandomize=True` to `derandomize=False` (if you are running more than one trial)
-* Run the tests: `pytest hypothesis-python/tests/`
-* Note that the benchmarking script does not currently support xdist, so do not use `-n 8` or similar.
+The plotting script (but not collecting benchmark data) requires additional dependencies: `pip install scipy vl-convert-python`.

-When pytest finishes the output will contain a dictionary of the benchmarking results. Add that as a new entry in `data.json`. Repeat for however many trials you want; n=5 seems reasonable.
+To run a benchmark:

-Also repeat for both your baseline ("old") and your comparison ("new") code.
+- `pytest tests/ --hypothesis-benchmark-shrinks new --hypothesis-benchmark-output data.json` (starting on the newer version)
+- `pytest tests/ --hypothesis-benchmark-shrinks old --hypothesis-benchmark-output data.json` (after switching to the old version)
+- Use the same `data.json` path; the benchmark will append data. You can append `-k ...` to both commands to subset the benchmark.
+- `python benchmark/graph.py data.json shrinking.png`

-Then run `python graph.py` to generate a graph comparing the old and new results.
+This hooks any `minimal()` calls and reports the number of shrinks. The default (and currently unchangeable) number of iterations is 5 per test.
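For reference, the `data.json` written by the two pytest runs above is what `graph.py` consumes. A rough sketch of the shape it expects, inferred from `_process_benchmark_data` in the new `graph.py` below (the node id and the numbers are made up, and each "old"/"new" entry may carry more than the `calls` mapping read there):

{
    "old": {"calls": {"tests/nocover/test_example.py::test_foo": [12, 11, 13, 12, 12]}},
    "new": {"calls": {"tests/nocover/test_example.py::test_foo": [4, 5, 4, 4, 5]}}
}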
71 changes: 0 additions & 71 deletions hypothesis-python/benchmark/conftest.py

This file was deleted.

4 changes: 0 additions & 4 deletions hypothesis-python/benchmark/data.json

This file was deleted.

218 changes: 111 additions & 107 deletions hypothesis-python/benchmark/graph.py
@@ -9,113 +9,117 @@
# obtain one at https://mozilla.org/MPL/2.0/.

import json
import math
import statistics
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

data_path = Path(__file__).parent / "data.json"
with open(data_path) as f:
    data = json.loads(f.read())

old_runs = data["old"]
new_runs = data["new"]
all_runs = old_runs + new_runs

# every run should involve the same functions
names = set()
for run in all_runs:
    names.add(frozenset(run.keys()))

intersection = frozenset.intersection(*names)
diff = frozenset.union(*[intersection.symmetric_difference(n) for n in names])

print(f"skipping these tests which were not present in all runs: {', '.join(diff)}")
names = list(intersection)

# the similar invariant for number of minimal calls per run is not true: functions
# may make a variable number of minimal() calls.
# it would be nice to compare identically just the ones which don't vary, to get
# a very fine grained comparison instead of averaging.
# sizes = []
# for run in all_runs:
#     sizes.append(tuple(len(value) for value in run.values()))
# assert len(set(sizes)) == 1

new_names = []
for name in names:
    if all(all(x == 0 for x in run[name]) for run in all_runs):
        print(f"no shrinks for {name}, skipping")
        continue
    new_names.append(name)
names = new_names

# either "time" or "calls"
statistic = "time"
# name : average calls
old_values = {}
new_values = {}
for name in names:

    # mean across the different minimal() calls in a single test function, then
    # median across the n iterations we ran that for to reduce error
    old_vals = [statistics.mean(r[statistic] for r in run[name]) for run in old_runs]
    new_vals = [statistics.mean(r[statistic] for r in run[name]) for run in new_runs]
    old_values[name] = statistics.median(old_vals)
    new_values[name] = statistics.median(new_vals)

# name : (absolute difference, times difference)
diffs = {}
for name in names:
    old = old_values[name]
    new = new_values[name]
    diff = old - new
    if old == 0:
        diff_times = 0
    else:
        diff_times = (old - new) / old
    if 0 < diff_times < 1:
        diff_times = (1 / (1 - diff_times)) - 1
    diffs[name] = (diff, diff_times)

    print(f"{name} {diff} ({old} -> {new}, {round(diff_times, 1)}✕)")

diffs = dict(sorted(diffs.items(), key=lambda kv: kv[1][0]))
diffs_value = [v[0] for v in diffs.values()]
diffs_percentage = [v[1] for v in diffs.values()]

print(f"mean: {statistics.mean(diffs_value)}, median: {statistics.median(diffs_value)}")


# https://stackoverflow.com/a/65824524
def align_axes(ax1, ax2):
    ax1_ylims = ax1.axes.get_ylim()
    ax1_yratio = ax1_ylims[0] / ax1_ylims[1]

    ax2_ylims = ax2.axes.get_ylim()
    ax2_yratio = ax2_ylims[0] / ax2_ylims[1]

    if ax1_yratio < ax2_yratio:
        ax2.set_ylim(bottom=ax2_ylims[1] * ax1_yratio)
    else:
        ax1.set_ylim(bottom=ax1_ylims[1] * ax2_yratio)


ax1 = sns.barplot(diffs_value, color="b", alpha=0.7, label="absolute change")
ax2 = plt.twinx()
sns.barplot(diffs_percentage, color="r", alpha=0.7, ax=ax2, label="n✕ change")

ax1.set_title(
    "old shrinks - new shrinks (aka shrinks saved, higher is better)"
    if statistic == "calls"
    else "old time - new time in seconds (aka time saved, higher is better)"
)
ax1.set_xticks([])
align_axes(ax1, ax2)
legend1 = ax1.legend(loc="upper left")
legend1.legend_handles[0].set_color("b")
legend2 = ax2.legend(loc="lower right")
legend2.legend_handles[0].set_color("r")

plt.show()
import click


def plot_vega(vega_spec, data, *, to, parameters=None):
    import vl_convert

    parameters = parameters or {}

    spec = json.loads(vega_spec.read_text())
    spec["data"].insert(0, {"name": "source", "values": data})
    if "signals" not in spec:
        spec["signals"] = []

    for key, value in parameters.items():
        spec["signals"].append({"name": key, "value": value})

    with open(to, "wb") as f:
        # default ppi is 72, which is somewhat blurry.
        f.write(vl_convert.vega_to_png(spec, ppi=200))


def _mean_difference_ci(n1, n2, *, confidence):
    from scipy import stats

    var1 = statistics.variance(n1)
    var2 = statistics.variance(n2)
    df = len(n1) + len(n2) - 2
    # this assumes equal variances between the populations of n1 and n2. This
    # is not necessarily true (new might be more consistent than old), but it's
    # good enough.
    pooled_std = math.sqrt(((len(n1) - 1) * var1 + (len(n2) - 1) * var2) / df)
    se = pooled_std * math.sqrt(1 / len(n1) + 1 / len(n2))
    t_crit = stats.t.ppf((1 + confidence) / 2, df)
    return t_crit * se
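
# A worked check of the half-width above, with purely illustrative numbers: for
# 5 iterations on each side, df = 5 + 5 - 2 = 8 and stats.t.ppf(0.975, 8) is
# about 2.306, so the reported 95% interval is the mean difference plus or
# minus roughly 2.3 pooled standard errors.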


def _process_benchmark_data(data):
    assert set(data) == {"old", "new"}
    old_calls = data["old"]["calls"]
    new_calls = data["new"]["calls"]
    assert set(old_calls) == set(new_calls), set(old_calls).symmetric_difference(
        set(new_calls)
    )

    graph_data = []

    def _diff_times(old, new):
        if old == 0 and new == 0:
            return 0
        if old == 0:
            # there aren't any great options here, but 0 is more reasonable than inf.
            return 0
        v = (old - new) / old
        if 0 < v < 1:
            v = (1 / (1 - v)) - 1
        return v

    sums = {"old": 0, "new": 0}
    for node_id in old_calls:
        old = old_calls[node_id]
        new = new_calls[node_id]
        if set(old) | set(new) == {0} or len(old) != len(new):
            print(f"skipping {node_id}")
            continue

        sums["old"] += statistics.mean(old)
        sums["new"] += statistics.mean(new)
        diffs = [n_old - n_new for n_old, n_new in zip(old, new)]
        diffs_times = [_diff_times(n_old, n_new) for n_old, n_new in zip(old, new)]
        ci_shrink = (
            _mean_difference_ci(old, new, confidence=0.95) if len(old) > 1 else 0
        )

        graph_data.append(
            {
                "node_id": node_id,
                "absolute": statistics.mean(diffs),
                "absolute_ci_lower": ci_shrink,
                "absolute_ci_upper": ci_shrink,
                "nx": statistics.mean(diffs_times),
                "nx_ci_lower": 0,
                "nx_ci_upper": 0,
            }
        )

    graph_data = sorted(graph_data, key=lambda d: d["absolute"])
    return graph_data, sums


@click.command()
@click.argument("data", type=click.Path(exists=True, path_type=Path))
@click.argument("out", type=click.Path(path_type=Path))
def plot(data, out):
    data = json.loads(data.read_text())
    data, sums = _process_benchmark_data(data)
    plot_vega(
        Path(__file__).parent / "spec.json",
        data=data,
        to=out,
        parameters={
            "title": "Shrinking benchmark (calls)",
            "sum_old": sums["old"],
            "sum_new": sums["new"],
            "absolute_axis_title": ("shrink call change (old - new, larger is good)"),
        },
    )


if __name__ == "__main__":
    plot()
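
For intuition on the "nx" values plotted above: `_diff_times` folds a fractional saving into an "n times fewer calls" figure. A small standalone sketch of that arithmetic, with made-up numbers, mirroring the logic in `graph.py` rather than importing it:

def times_fewer(old, new):
    # same shape as _diff_times above: guard the old == 0 case, then fold a
    # fractional saving in (0, 1) into a "times fewer" multiplier
    if old == 0:
        return 0
    v = (old - new) / old
    if 0 < v < 1:
        v = (1 / (1 - v)) - 1
    return v


# shrinking with 25 calls instead of 100 is a 75% saving, reported as 3x fewer
assert times_fewer(100, 25) == 3.0
# regressions stay as a plain fraction: twice as many calls reads as -1.0
assert times_fewer(100, 200) == -1.0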