From 47afab5e45fd31dfe5cf6ca8222fd84db42fb068 Mon Sep 17 00:00:00 2001
From: Kareem Khazem
Date: Fri, 28 Apr 2023 09:06:44 +0100
Subject: [PATCH] Add extra_column to benchcomp markdown visualizer (#2415)

---
 .../benchcomp/visualizers/__init__.py         | 105 ++++++++++++++++--
 tools/benchcomp/configs/perf-regression.yaml  |  55 +++++++++
 tools/benchcomp/test/test_regression.py      |  39 +++++--
 3 files changed, 181 insertions(+), 18 deletions(-)

diff --git a/tools/benchcomp/benchcomp/visualizers/__init__.py b/tools/benchcomp/benchcomp/visualizers/__init__.py
index c4870c425f19..9639cf426b88 100644
--- a/tools/benchcomp/benchcomp/visualizers/__init__.py
+++ b/tools/benchcomp/benchcomp/visualizers/__init__.py
@@ -82,20 +82,84 @@ def __call__(self, results):
 
 class dump_markdown_results_table:
-    """Print a Markdown-formatted table displaying benchmark results
+    """Print Markdown-formatted tables displaying benchmark results
+
+    For each metric, this visualization prints out a table of benchmarks,
+    showing the value of the metric for each variant.
 
     The 'out_file' key is mandatory; specify '-' to print to stdout.
+
+    'extra_columns' can be an empty dict. The sample configuration below
+    assumes that each benchmark result has a 'success' and 'runtime' metric
+    for both variants, 'variant_1' and 'variant_2'. It adds a 'ratio' column
+    to the table for the 'runtime' metric, and a 'change' column to the
+    table for the 'success' metric. The 'text' lambda is called once for
+    each benchmark; it accepts a single argument---a dict---that maps
+    variant names to the value of that variant for a particular metric. The
+    lambda returns a string that is rendered in the benchmark's row in the
+    new column. This allows you to emit arbitrary text or markdown
+    formatting in response to particular combinations of values for
+    different variants, such as regressions or performance improvements.
+
+    Sample configuration:
+    ```
     visualize:
     - type: dump_markdown_results_table
-      out_file: '-'
+      out_file: "-"
+      extra_columns:
+        runtime:
+        - column_name: ratio
+          text: >
+            lambda b: str(b["variant_2"]/b["variant_1"])
+            if b["variant_2"] < (1.5 * b["variant_1"])
+            else "**" + str(b["variant_2"]/b["variant_1"]) + "**"
+        success:
+        - column_name: change
+          text: >
+            lambda b: "" if b["variant_2"] == b["variant_1"]
+            else "newly passing" if b["variant_2"]
+            else "regressed"
+    ```
+
+    Example output:
+
+    ```
+    ## runtime
+
+    | Benchmark | variant_1 | variant_2 | ratio |
+    | --- | --- | --- | --- |
+    | bench_1 | 5 | 10 | **2.0** |
+    | bench_2 | 10 | 5 | 0.5 |
+
+    ## success
+
+    | Benchmark | variant_1 | variant_2 | change |
+    | --- | --- | --- | --- |
+    | bench_1 | True | True | |
+    | bench_2 | True | False | regressed |
+    | bench_3 | False | True | newly passing |
+    ```
     """
 
-    def __init__(self, out_file):
+    def __init__(self, out_file, extra_columns=None):
         self.get_out_file = benchcomp.Outfile(out_file)
+        self.extra_columns = self._eval_column_text(extra_columns or {})
+
+
+    @staticmethod
+    def _eval_column_text(column_spec):
+        for columns in column_spec.values():
+            for column in columns:
+                try:
+                    column["text"] = eval(column["text"])
+                except SyntaxError:
+                    logging.error(
+                        "This column text is not a valid python program: '%s'",
+                        column["text"])
+                    sys.exit(1)
+        return column_spec
 
 
     @staticmethod
@@ -104,10 +168,10 @@ def _get_template():
         {% for metric, benchmarks in d["metrics"].items() %}
         ## {{ metric }}
 
-        | Benchmark | {% for variant in d["variants"] %} {{ variant }} |{% endfor %}
-        | --- | {% for variant in d["variants"] %}--- |{% endfor -%}
+        | Benchmark | {% for variant in d["variants"][metric] %} {{ variant }} |{% endfor %}
+        | --- |{% for variant in d["variants"][metric] %} --- |{% endfor -%}
         {% for bench_name, bench_variants in benchmarks.items () %}
-        | {{ bench_name }} {% for variant in d["variants"] -%}
+        | {{ bench_name }} {% for variant in d["variants"][metric] -%}
         | {{ bench_variants[variant] }} {% endfor %}|
         {%- endfor %}
         {% endfor -%}
@@ -134,10 +198,35 @@ def _organize_results_into_metrics(results):
         return ret
 
 
+    def _add_extra_columns(self, metrics):
+        for metric, benches in metrics.items():
+            try:
+                columns = self.extra_columns[metric]
+            except KeyError:
+                continue
+            for bench, variants in benches.items():
+                tmp_variants = dict(variants)
+                for column in columns:
+                    variants[column["column_name"]] = column["text"](tmp_variants)
+
+
+    @staticmethod
+    def _get_variants(metrics):
+        ret = {}
+        for metric, benches in metrics.items():
+            for bench, variants in benches.items():
+                ret[metric] = list(variants.keys())
+                break
+        return ret
+
+
     def __call__(self, results):
+        metrics = self._organize_results_into_metrics(results)
+        self._add_extra_columns(metrics)
+
         data = {
-            "metrics": self._organize_results_into_metrics(results),
-            "variants": list(results["benchmarks"].values())[0]["variants"],
+            "metrics": metrics,
+            "variants": self._get_variants(metrics),
         }
         env = jinja2.Environment(
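The two new helpers above carry the whole feature: `_eval_column_text` turns each `text` string into a callable when the visualizer is constructed, and `_add_extra_columns` calls that callable on a snapshot of each benchmark's row. Here is a minimal standalone sketch of that eval-and-apply flow; it is not part of the patch, and the metric, benchmark, and variant names are invented for illustration:

```python
# Sketch of the eval-and-apply flow in dump_markdown_results_table.
# All names and values are invented; only the two loops mirror the patch.

extra_columns = {
    "runtime": [{
        "column_name": "ratio",
        "text":
            "lambda b: str(b['variant_2']/b['variant_1']) "
            "if b['variant_2'] < 1.5 * b['variant_1'] "
            "else '**' + str(b['variant_2']/b['variant_1']) + '**'",
    }],
}

# Like _eval_column_text: compile each 'text' string into a callable.
for columns in extra_columns.values():
    for column in columns:
        column["text"] = eval(column["text"])

# Like _add_extra_columns: compute one extra cell per benchmark row.
metrics = {"runtime": {"bench_1": {"variant_1": 5, "variant_2": 10}}}
for metric, benches in metrics.items():
    for variants in benches.values():
        # Snapshot the row first (the patch's tmp_variants), so every
        # extra column sees the row as it was before any were added.
        snapshot = dict(variants)
        for column in extra_columns.get(metric, []):
            variants[column["column_name"]] = column["text"](snapshot)

print(metrics["runtime"]["bench_1"]["ratio"])  # prints: **2.0**
```

The snapshot detail is worth noting: because each lambda receives a copy taken before any extra columns are written back, the order of entries in the configuration cannot change the computed values.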
diff --git a/tools/benchcomp/configs/perf-regression.yaml b/tools/benchcomp/configs/perf-regression.yaml
index a76cb5ab54bf..a0d88e0558db 100644
--- a/tools/benchcomp/configs/perf-regression.yaml
+++ b/tools/benchcomp/configs/perf-regression.yaml
@@ -33,6 +33,61 @@ visualize:
 - type: dump_markdown_results_table
   out_file: '-'
+  extra_columns:
+
+    # For these two metrics, display the difference between old and new and
+    # embolden it if the absolute difference is more than 10% of the old value
+    number_vccs:
+    - column_name: diff old → new
+      text: >
+        lambda b: "" if b["kani_new"] == b["kani_old"]
+        else ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.1 else "")
+        + ("+" if b["kani_new"] > b["kani_old"] else "")
+        + str(b["kani_new"] - b["kani_old"])
+        + ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.1 else "")
+    number_program_steps:
+    - column_name: diff old → new
+      text: >
+        lambda b: "" if b["kani_new"] == b["kani_old"]
+        else ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.1 else "")
+        + ("+" if b["kani_new"] > b["kani_old"] else "")
+        + str(b["kani_new"] - b["kani_old"])
+        + ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.1 else "")
+
+    # For 'runtime' metrics, display the % change from old to new, emboldening
+    # cells whose absolute change is >50%
+    solver_runtime:
+    - column_name: "% change old → new"
+      text: >
+        lambda b: "" if b["kani_new"] == b["kani_old"]
+        else ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.5 else "")
+        + ("+" if b["kani_new"] > b["kani_old"] else "")
+        + "%.3f%%" % ((b["kani_new"] - b["kani_old"]) * 100 / b["kani_old"])
+        + ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.5 else "")
+    verification_time:
+    - column_name: "% change old → new"
+      text: >
+        lambda b: "" if b["kani_new"] == b["kani_old"]
+        else ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.5 else "")
+        + ("+" if b["kani_new"] > b["kani_old"] else "")
+        + "%.3f%%" % ((b["kani_new"] - b["kani_old"]) * 100 / b["kani_old"])
+        + ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.5 else "")
+    symex_runtime:
+    - column_name: "% change old → new"
+      text: >
+        lambda b: "" if b["kani_new"] == b["kani_old"]
+        else ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.5 else "")
+        + ("+" if b["kani_new"] > b["kani_old"] else "")
+        + "%.3f%%" % ((b["kani_new"] - b["kani_old"]) * 100 / b["kani_old"])
+        + ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.5 else "")
+
+    # For the success metric, display some text if success has changed
+    success:
+    - column_name: change
+      text: >
+        lambda b: "" if b["kani_new"] == b["kani_old"]
+        else "❌ newly failing" if b["kani_old"]
+        else "✅ newly passing"
 - type: error_on_regression
   variant_pairs: [[kani_old, kani_new]]
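Since this config ships its lambdas as YAML strings, a quoting or precedence mistake only surfaces when benchcomp renders a report. The throwaway check below (not part of the patch; the kani_old/kani_new values are invented) evaluates one of the '% change' lambdas the same way `_eval_column_text` would:

```python
# Evaluate the solver_runtime '% change old → new' lambda from the config
# above against invented values. 10 → 18 is +80%, which crosses the 0.5
# threshold, so the cell is wrapped in ** for bold.
text = (
    'lambda b: "" if b["kani_new"] == b["kani_old"] '
    'else ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.5 else "") '
    '+ ("+" if b["kani_new"] > b["kani_old"] else "") '
    '+ "%.3f%%" % ((b["kani_new"] - b["kani_old"]) * 100 / b["kani_old"]) '
    '+ ("**" if abs((b["kani_new"]-b["kani_old"])/b["kani_old"]) > 0.5 else "")'
)
render = eval(text)
print(render({"kani_old": 10, "kani_new": 18}))  # prints: **+80.000%**
print(render({"kani_old": 10, "kani_new": 10}))  # prints an empty string
print(render({"kani_old": 10, "kani_new": 9}))   # prints: -10.000%
```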
diff --git a/tools/benchcomp/test/test_regression.py b/tools/benchcomp/test/test_regression.py
index 258005f84b57..1aaa1ec5c320 100644
--- a/tools/benchcomp/test/test_regression.py
+++ b/tools/benchcomp/test/test_regression.py
@@ -402,9 +402,10 @@ def test_markdown_results_table(self):
                 "config": {
                     "directory": str(tmp),
                     "command_line":
-                        "mkdir bench_1 bench_2"
+                        "mkdir bench_1 bench_2 bench_3"
                         "&& echo true > bench_1/success"
                         "&& echo true > bench_2/success"
+                        "&& echo false > bench_3/success"
                         "&& echo 5 > bench_1/runtime"
                         "&& echo 10 > bench_2/runtime"
                 },
@@ -413,9 +414,10 @@
                 "config": {
                     "directory": str(tmp),
                     "command_line":
-                        "mkdir bench_1 bench_2"
+                        "mkdir bench_1 bench_2 bench_3"
                         "&& echo true > bench_1/success"
                         "&& echo false > bench_2/success"
+                        "&& echo true > bench_3/success"
                         "&& echo 10 > bench_1/runtime"
                         "&& echo 5 > bench_2/runtime"
                 }
@@ -432,6 +434,22 @@
             "visualize": [{
                 "type": "dump_markdown_results_table",
                 "out_file": "-",
+                "extra_columns": {
+                    "runtime": [{
+                        "column_name": "ratio",
+                        "text":
+                            "lambda b: str(b['variant_2']/b['variant_1'])"
+                            "if b['variant_2'] < 1.5 * b['variant_1'] "
+                            "else '**' + str(b['variant_2']/b['variant_1']) + '**'"
+                    }],
+                    "success": [{
+                        "column_name": "notes",
+                        "text":
+                            "lambda b: '' if b['variant_2'] == b['variant_1']"
+                            "else 'newly passing' if b['variant_2'] "
+                            "else 'regressed'"
+                    }]
+                }
             }]
         })
         run_bc()
@@ -441,17 +459,18 @@
             run_bc.stdout, textwrap.dedent("""
                 ## runtime

-                | Benchmark | variant_1 | variant_2 |
-                | --- | --- |--- |
-                | bench_1 | 5 | 10 |
-                | bench_2 | 10 | 5 |
+                | Benchmark | variant_1 | variant_2 | ratio |
+                | --- | --- | --- | --- |
+                | bench_1 | 5 | 10 | **2.0** |
+                | bench_2 | 10 | 5 | 0.5 |

                 ## success

-                | Benchmark | variant_1 | variant_2 |
-                | --- | --- |--- |
-                | bench_1 | True | True |
-                | bench_2 | True | False |
+                | Benchmark | variant_1 | variant_2 | notes |
+                | --- | --- | --- | --- |
+                | bench_1 | True | True | |
+                | bench_2 | True | False | regressed |
+                | bench_3 | False | True | newly passing |
                 """))
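The tables asserted in this test come from the Jinja template in `_get_template`. The standalone snippet below (not part of the patch) renders a cut-down copy of that template over hand-written data; it shows how the per-metric `variants` mapping introduced by this change is what lets the 'ratio' column appear only in the 'runtime' table:

```python
import jinja2

# Cut-down copy of the template from _get_template(), with invented data.
# d["variants"] is now a dict keyed by metric, so each table gets its own
# column list, including any extra columns added for that metric alone.
TEMPLATE = """\
{% for metric, benchmarks in d["metrics"].items() %}
## {{ metric }}

| Benchmark |{% for v in d["variants"][metric] %} {{ v }} |{% endfor %}
| --- |{% for v in d["variants"][metric] %} --- |{% endfor %}
{% for name, row in benchmarks.items() %}| {{ name }} {% for v in d["variants"][metric] %}| {{ row[v] }} {% endfor %}|
{% endfor %}{% endfor %}"""

data = {
    "metrics": {
        "runtime": {
            "bench_1": {"variant_1": 5, "variant_2": 10, "ratio": "**2.0**"},
            "bench_2": {"variant_1": 10, "variant_2": 5, "ratio": "0.5"},
        },
    },
    "variants": {"runtime": ["variant_1", "variant_2", "ratio"]},
}

# Prints a table matching the shape asserted in the test above.
print(jinja2.Template(TEMPLATE).render(d=data))
```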