Add the ability to compare benchmark runs (flutter#6918)
kenzieschmoll authored Dec 8, 2023
1 parent 24d5d61 commit 6dd6896
Showing 6 changed files with 349 additions and 3 deletions.
17 changes: 15 additions & 2 deletions packages/devtools_app/benchmark/README.md
@@ -21,7 +21,6 @@ All of the commands below should be run from the `packages/devtools_app` directory
To run the performance benchmark tests locally, run:
```sh
dart run benchmark/scripts/run_benchmarks.dart
dart run benchmark/run_benchmarks.dart
```

To run the test that verifies we can run benchmark tests, run:
@@ -48,4 +47,18 @@ the other running tests are using.

The tests are defined by "automators", which live in the `benchmark/test_infra/automators`
directory. To add a new test or test case, either modify an existing automator or add
a new one for a new screen. Follow existing examples in that directory for guidance.

## Comparing two benchmark test runs

To compare two benchmark runs, first run the benchmark tests and save the results from each run to a file:
```sh
dart run benchmark/scripts/run_benchmarks.dart --save-to-file=baseline.json
dart run benchmark/scripts/run_benchmarks.dart --save-to-file=test.json
```

Then, to compare the benchmarks and calculate deltas, run:
```sh
dart run benchmark/scripts/compare_benchmarks.dart baseline.json test.json
```
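
The script prints the test results as indented JSON, with a `delta` entry (test value minus baseline value) added to every metric that also exists in the baseline. A rough sketch of the output shape, reusing the placeholder benchmark name and values from the test fixtures in `devtools_benchmarks_test.dart` below:
```json
{
 "foo": [
  {"metric": "preroll_frame.average", "value": 65.5, "delta": 5.0},
  {"metric": "totalUiFrame.average", "value": 4150.0, "delta": -16.0}
 ]
}
```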
147 changes: 147 additions & 0 deletions packages/devtools_app/benchmark/devtools_benchmarks_test.dart
@@ -11,6 +11,7 @@ import 'dart:io';
import 'package:test/test.dart';
import 'package:web_benchmarks/server.dart';

import 'scripts/compare_benchmarks.dart';
import 'test_infra/common.dart';
import 'test_infra/project_root_directory.dart';

@@ -37,6 +38,21 @@ void main() {
timeout: const Timeout(Duration(minutes: 10)),
);

test(
'Can compare web benchmarks',
() {
final benchmark1 = BenchmarkResults.parse(testBenchmarkResults1);
final benchmark2 = BenchmarkResults.parse(testBenchmarkResults2);
final comparison = compareBenchmarks(
benchmark1,
benchmark2,
baselineSource: 'path/to/baseline',
);
expect(comparison, testBenchmarkComparison);
},
timeout: const Timeout(Duration(minutes: 10)),
);

// TODO(kenz): add tests that verify performance meets some expected threshold
}

@@ -86,3 +102,134 @@ Future<void> _runBenchmarks({bool useWasm = false}) async {
isA<String>(),
);
}

final testBenchmarkResults1 = {
'foo': [
{'metric': 'preroll_frame.average', 'value': 60.5},
{'metric': 'preroll_frame.outlierAverage', 'value': 1400},
{'metric': 'preroll_frame.outlierRatio', 'value': 20.2},
{'metric': 'preroll_frame.noise', 'value': 0.85},
{'metric': 'apply_frame.average', 'value': 80.0},
{'metric': 'apply_frame.outlierAverage', 'value': 200.6},
{'metric': 'apply_frame.outlierRatio', 'value': 2.5},
{'metric': 'apply_frame.noise', 'value': 0.4},
{'metric': 'drawFrameDuration.average', 'value': 2058.9},
{'metric': 'drawFrameDuration.outlierAverage', 'value': 24000},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 12.05},
{'metric': 'drawFrameDuration.noise', 'value': 0.34},
{'metric': 'totalUiFrame.average', 'value': 4166},
],
'bar': [
{'metric': 'preroll_frame.average', 'value': 60.5},
{'metric': 'preroll_frame.outlierAverage', 'value': 1400},
{'metric': 'preroll_frame.outlierRatio', 'value': 20.2},
{'metric': 'preroll_frame.noise', 'value': 0.85},
{'metric': 'apply_frame.average', 'value': 80.0},
{'metric': 'apply_frame.outlierAverage', 'value': 200.6},
{'metric': 'apply_frame.outlierRatio', 'value': 2.5},
{'metric': 'apply_frame.noise', 'value': 0.4},
{'metric': 'drawFrameDuration.average', 'value': 2058.9},
{'metric': 'drawFrameDuration.outlierAverage', 'value': 24000},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 12.05},
{'metric': 'drawFrameDuration.noise', 'value': 0.34},
{'metric': 'totalUiFrame.average', 'value': 4166},
],
};

final testBenchmarkResults2 = {
'foo': [
{'metric': 'preroll_frame.average', 'value': 65.5},
{'metric': 'preroll_frame.outlierAverage', 'value': 1410},
{'metric': 'preroll_frame.outlierRatio', 'value': 20.0},
{'metric': 'preroll_frame.noise', 'value': 1.5},
{'metric': 'apply_frame.average', 'value': 50.0},
{'metric': 'apply_frame.outlierAverage', 'value': 100.0},
{'metric': 'apply_frame.outlierRatio', 'value': 2.55},
{'metric': 'apply_frame.noise', 'value': 0.9},
{'metric': 'drawFrameDuration.average', 'value': 2000.0},
{'metric': 'drawFrameDuration.outlierAverage', 'value': 20000},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 11.05},
{'metric': 'drawFrameDuration.noise', 'value': 1.34},
{'metric': 'totalUiFrame.average', 'value': 4150},
],
'bar': [
{'metric': 'preroll_frame.average', 'value': 65.5},
{'metric': 'preroll_frame.outlierAverage', 'value': 1410},
{'metric': 'preroll_frame.outlierRatio', 'value': 20.0},
{'metric': 'preroll_frame.noise', 'value': 1.5},
{'metric': 'apply_frame.average', 'value': 50.0},
{'metric': 'apply_frame.outlierAverage', 'value': 100.0},
{'metric': 'apply_frame.outlierRatio', 'value': 2.55},
{'metric': 'apply_frame.noise', 'value': 0.9},
{'metric': 'drawFrameDuration.average', 'value': 2000.0},
{'metric': 'drawFrameDuration.outlierAverage', 'value': 20000},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 11.05},
{'metric': 'drawFrameDuration.noise', 'value': 1.34},
{'metric': 'totalUiFrame.average', 'value': 4150},
],
};

final testBenchmarkComparison = {
'foo': [
{'metric': 'preroll_frame.average', 'value': 65.5, 'delta': 5.0},
{'metric': 'preroll_frame.outlierAverage', 'value': 1410.0, 'delta': 10.0},
{
'metric': 'preroll_frame.outlierRatio',
'value': 20.0,
'delta': -0.1999999999999993,
},
{'metric': 'preroll_frame.noise', 'value': 1.5, 'delta': 0.65},
{'metric': 'apply_frame.average', 'value': 50.0, 'delta': -30.0},
{'metric': 'apply_frame.outlierAverage', 'value': 100.0, 'delta': -100.6},
{
'metric': 'apply_frame.outlierRatio',
'value': 2.55,
'delta': 0.04999999999999982,
},
{'metric': 'apply_frame.noise', 'value': 0.9, 'delta': 0.5},
{
'metric': 'drawFrameDuration.average',
'value': 2000.0,
'delta': -58.90000000000009,
},
{
'metric': 'drawFrameDuration.outlierAverage',
'value': 20000.0,
'delta': -4000.0,
},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 11.05, 'delta': -1.0},
{'metric': 'drawFrameDuration.noise', 'value': 1.34, 'delta': 1.0},
{'metric': 'totalUiFrame.average', 'value': 4150.0, 'delta': -16.0},
],
'bar': [
{'metric': 'preroll_frame.average', 'value': 65.5, 'delta': 5.0},
{'metric': 'preroll_frame.outlierAverage', 'value': 1410.0, 'delta': 10.0},
{
'metric': 'preroll_frame.outlierRatio',
'value': 20.0,
'delta': -0.1999999999999993,
},
{'metric': 'preroll_frame.noise', 'value': 1.5, 'delta': 0.65},
{'metric': 'apply_frame.average', 'value': 50.0, 'delta': -30.0},
{'metric': 'apply_frame.outlierAverage', 'value': 100.0, 'delta': -100.6},
{
'metric': 'apply_frame.outlierRatio',
'value': 2.55,
'delta': 0.04999999999999982,
},
{'metric': 'apply_frame.noise', 'value': 0.9, 'delta': 0.5},
{
'metric': 'drawFrameDuration.average',
'value': 2000.0,
'delta': -58.90000000000009,
},
{
'metric': 'drawFrameDuration.outlierAverage',
'value': 20000.0,
'delta': -4000.0,
},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 11.05, 'delta': -1.0},
{'metric': 'drawFrameDuration.noise', 'value': 1.34, 'delta': 1.0},
{'metric': 'totalUiFrame.average', 'value': 4150.0, 'delta': -16.0},
],
};
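
For reference, each `delta` in `testBenchmarkComparison` above is just the test value minus the baseline value for the same metric. A minimal, self-contained check against one of the fixture entries:
```dart
void main() {
  // preroll_frame.average for benchmark 'foo':
  const baselineAverage = 60.5; // from testBenchmarkResults1
  const testAverage = 65.5; // from testBenchmarkResults2
  // Matches the 'delta' of 5.0 expected in testBenchmarkComparison.
  print(testAverage - baselineAverage); // 5.0
}
```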
125 changes: 125 additions & 0 deletions packages/devtools_app/benchmark/scripts/compare_benchmarks.dart
@@ -0,0 +1,125 @@
// Copyright 2023 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

import 'dart:convert';
import 'dart:io';

import 'package:collection/collection.dart';
import 'package:web_benchmarks/server.dart';

import 'utils.dart';

/// Compares two sets of web benchmarks and calculates the delta between each
/// matching metric.
void main(List<String> args) {
if (args.length != 2) {
throw Exception(
'Expected 2 arguments (<baseline-file>, <test-file>), but instead there '
'were ${args.length}.',
);
}

final baselineSource = args[0];
final testSource = args[1];

stdout
..writeln('Comparing the following benchmark results:')
..writeln(' "$testSource" (test)')
..writeln(' "$baselineSource" (baseline)');

final baselineFile = checkFileExists(baselineSource);
final testFile = checkFileExists(testSource);
if (baselineFile == null || testFile == null) {
if (baselineFile == null) {
throw Exception('Cannot find baseline file $baselineSource');
}
if (testFile == null) {
throw Exception('Cannot find test file $testSource');
}
}

final baselineResults =
BenchmarkResults.parse(jsonDecode(baselineFile.readAsStringSync()));
final testResults =
BenchmarkResults.parse(jsonDecode(testFile.readAsStringSync()));
compareBenchmarks(
baselineResults,
testResults,
baselineSource: baselineSource,
);
}

Map<String, List<Map<String, Object?>>> compareBenchmarks(
BenchmarkResults baseline,
BenchmarkResults test, {
required String baselineSource,
}) {
stdout.writeln('Starting baseline comparison...');

for (final benchmarkName in test.scores.keys) {
stdout.writeln('Comparing metrics for benchmark "$benchmarkName".');

// Lookup this benchmark in the baseline.
final baselineScores = baseline.scores[benchmarkName];
if (baselineScores == null) {
stdout.writeln(
'Baseline does not contain results for benchmark "$benchmarkName".',
);
continue;
}

final testScores = test.scores[benchmarkName]!;

for (final score in testScores) {
// Lookup this metric in the baseline.
final baselineScore =
baselineScores.firstWhereOrNull((s) => s.metric == score.metric);
if (baselineScore == null) {
stdout.writeln(
'Baseline does not contain metric "${score.metric}" for '
'benchmark "$benchmarkName".',
);
continue;
}

// Record the delta for this test score relative to the baseline.
_benchmarkDeltas[score] = (score.value - baselineScore.value).toDouble();
}
}
stdout.writeln('Baseline comparison finished.');

final comparisonAsMap = test.toJsonWithDeltas();
stdout
..writeln('==== Comparison with baseline $baselineSource ====')
..writeln(const JsonEncoder.withIndent(' ').convert(comparisonAsMap))
..writeln('==== End of baseline comparison ====');
return comparisonAsMap;
}

Expando<double> _benchmarkDeltas = Expando<double>();

extension ScoreDeltaExtension on BenchmarkScore {
double? get deltaFromBaseline => _benchmarkDeltas[this];
}

extension ResultDeltaExtension on BenchmarkResults {
Map<String, List<Map<String, Object?>>> toJsonWithDeltas() {
return scores.map<String, List<Map<String, Object?>>>(
(String benchmarkName, List<BenchmarkScore> scores) {
return MapEntry<String, List<Map<String, Object?>>>(
benchmarkName,
scores.map<Map<String, Object?>>(
(BenchmarkScore score) {
final delta = _benchmarkDeltas[score];
return <String, Object?>{
...score.toJson(),
if (delta != null) 'delta': delta,
};
},
).toList(),
);
},
);
}
}
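
A note on the design: `compareBenchmarks` attaches each computed delta to the existing `BenchmarkScore` object through an `Expando` rather than copying scores into a new structure, which is why `toJsonWithDeltas()` emits a `delta` only for scores that had a matching baseline metric. A standalone sketch of how `Expando` behaves (the `Score` class here is hypothetical, purely for illustration):
```dart
// An Expando associates extra data with object identities without
// modifying the objects' class.
final deltas = Expando<double>('delta');

class Score {
  Score(this.value);
  final double value;
}

void main() {
  final score = Score(65.5);
  deltas[score] = 5.0; // attach a delta to this particular instance
  print(deltas[score]); // 5.0
  print(deltas[Score(65.5)]); // null: a different instance has no delta attached
}
```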