Add the ability to compare benchmark runs (flutter#6918)
kenzieschmoll authored Dec 8, 2023
1 parent 24d5d61 commit 6dd6896
Showing 6 changed files with 349 additions and 3 deletions.
17 changes: 15 additions & 2 deletions packages/devtools_app/benchmark/README.md
@@ -21,7 +21,6 @@ All of the commands below should be run from the `packages/devtools_app` directory
To run the performance benchmark tests locally, run:
```sh
dart run benchmark/scripts/run_benchmarks.dart
dart run benchmark/run_benchmarks.dart
```

To run the test that verifies we can run benchmark tests, run:
@@ -48,4 +47,18 @@ the other running tests are using.

The tests are defined by "automators", which live in the `benchmark/test_infra/automators`
directory. To add a new test or test case, either modify an existing automator or add
a new one for a new screen. Follow existing examples in that directory for guidance.

## Comparing two benchmark test runs

To compare two benchmark runs, first run the benchmark tests and save the results from each run to a file:
```sh
dart run benchmark/scripts/run_benchmarks.dart --save-to-file=baseline.json
dart run benchmark/scripts/run_benchmarks.dart --save-to-file=test.json
```

Then, to compare the benchmarks and calculate deltas, run:
```sh
dart run benchmark/scripts/compare_benchmarks.dart baseline.json test.json
```
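
The script prints the test results as indented JSON, with a `delta` entry (test value minus baseline value) added to every metric that also exists in the baseline. A rough sketch of the output shape, reusing the placeholder benchmark name and values from the test fixtures in `devtools_benchmarks_test.dart` below:
```json
{
 "foo": [
  {"metric": "preroll_frame.average", "value": 65.5, "delta": 5.0},
  {"metric": "totalUiFrame.average", "value": 4150.0, "delta": -16.0}
 ]
}
```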
147 changes: 147 additions & 0 deletions packages/devtools_app/benchmark/devtools_benchmarks_test.dart
@@ -11,6 +11,7 @@ import 'dart:io';
import 'package:test/test.dart';
import 'package:web_benchmarks/server.dart';

import 'scripts/compare_benchmarks.dart';
import 'test_infra/common.dart';
import 'test_infra/project_root_directory.dart';

@@ -37,6 +38,21 @@ void main() {
timeout: const Timeout(Duration(minutes: 10)),
);

test(
'Can compare web benchmarks',
() {
final benchmark1 = BenchmarkResults.parse(testBenchmarkResults1);
final benchmark2 = BenchmarkResults.parse(testBenchmarkResults2);
final comparison = compareBenchmarks(
benchmark1,
benchmark2,
baselineSource: 'path/to/baseline',
);
expect(comparison, testBenchmarkComparison);
},
timeout: const Timeout(Duration(minutes: 10)),
);

// TODO(kenz): add tests that verify performance meets some expected threshold
}

@@ -86,3 +102,134 @@ Future<void> _runBenchmarks({bool useWasm = false}) async {
isA<String>(),
);
}

final testBenchmarkResults1 = {
'foo': [
{'metric': 'preroll_frame.average', 'value': 60.5},
{'metric': 'preroll_frame.outlierAverage', 'value': 1400},
{'metric': 'preroll_frame.outlierRatio', 'value': 20.2},
{'metric': 'preroll_frame.noise', 'value': 0.85},
{'metric': 'apply_frame.average', 'value': 80.0},
{'metric': 'apply_frame.outlierAverage', 'value': 200.6},
{'metric': 'apply_frame.outlierRatio', 'value': 2.5},
{'metric': 'apply_frame.noise', 'value': 0.4},
{'metric': 'drawFrameDuration.average', 'value': 2058.9},
{'metric': 'drawFrameDuration.outlierAverage', 'value': 24000},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 12.05},
{'metric': 'drawFrameDuration.noise', 'value': 0.34},
{'metric': 'totalUiFrame.average', 'value': 4166},
],
'bar': [
{'metric': 'preroll_frame.average', 'value': 60.5},
{'metric': 'preroll_frame.outlierAverage', 'value': 1400},
{'metric': 'preroll_frame.outlierRatio', 'value': 20.2},
{'metric': 'preroll_frame.noise', 'value': 0.85},
{'metric': 'apply_frame.average', 'value': 80.0},
{'metric': 'apply_frame.outlierAverage', 'value': 200.6},
{'metric': 'apply_frame.outlierRatio', 'value': 2.5},
{'metric': 'apply_frame.noise', 'value': 0.4},
{'metric': 'drawFrameDuration.average', 'value': 2058.9},
{'metric': 'drawFrameDuration.outlierAverage', 'value': 24000},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 12.05},
{'metric': 'drawFrameDuration.noise', 'value': 0.34},
{'metric': 'totalUiFrame.average', 'value': 4166},
],
};

final testBenchmarkResults2 = {
'foo': [
{'metric': 'preroll_frame.average', 'value': 65.5},
{'metric': 'preroll_frame.outlierAverage', 'value': 1410},
{'metric': 'preroll_frame.outlierRatio', 'value': 20.0},
{'metric': 'preroll_frame.noise', 'value': 1.5},
{'metric': 'apply_frame.average', 'value': 50.0},
{'metric': 'apply_frame.outlierAverage', 'value': 100.0},
{'metric': 'apply_frame.outlierRatio', 'value': 2.55},
{'metric': 'apply_frame.noise', 'value': 0.9},
{'metric': 'drawFrameDuration.average', 'value': 2000.0},
{'metric': 'drawFrameDuration.outlierAverage', 'value': 20000},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 11.05},
{'metric': 'drawFrameDuration.noise', 'value': 1.34},
{'metric': 'totalUiFrame.average', 'value': 4150},
],
'bar': [
{'metric': 'preroll_frame.average', 'value': 65.5},
{'metric': 'preroll_frame.outlierAverage', 'value': 1410},
{'metric': 'preroll_frame.outlierRatio', 'value': 20.0},
{'metric': 'preroll_frame.noise', 'value': 1.5},
{'metric': 'apply_frame.average', 'value': 50.0},
{'metric': 'apply_frame.outlierAverage', 'value': 100.0},
{'metric': 'apply_frame.outlierRatio', 'value': 2.55},
{'metric': 'apply_frame.noise', 'value': 0.9},
{'metric': 'drawFrameDuration.average', 'value': 2000.0},
{'metric': 'drawFrameDuration.outlierAverage', 'value': 20000},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 11.05},
{'metric': 'drawFrameDuration.noise', 'value': 1.34},
{'metric': 'totalUiFrame.average', 'value': 4150},
],
};

final testBenchmarkComparison = {
'foo': [
{'metric': 'preroll_frame.average', 'value': 65.5, 'delta': 5.0},
{'metric': 'preroll_frame.outlierAverage', 'value': 1410.0, 'delta': 10.0},
{
'metric': 'preroll_frame.outlierRatio',
'value': 20.0,
'delta': -0.1999999999999993,
},
{'metric': 'preroll_frame.noise', 'value': 1.5, 'delta': 0.65},
{'metric': 'apply_frame.average', 'value': 50.0, 'delta': -30.0},
{'metric': 'apply_frame.outlierAverage', 'value': 100.0, 'delta': -100.6},
{
'metric': 'apply_frame.outlierRatio',
'value': 2.55,
'delta': 0.04999999999999982,
},
{'metric': 'apply_frame.noise', 'value': 0.9, 'delta': 0.5},
{
'metric': 'drawFrameDuration.average',
'value': 2000.0,
'delta': -58.90000000000009,
},
{
'metric': 'drawFrameDuration.outlierAverage',
'value': 20000.0,
'delta': -4000.0,
},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 11.05, 'delta': -1.0},
{'metric': 'drawFrameDuration.noise', 'value': 1.34, 'delta': 1.0},
{'metric': 'totalUiFrame.average', 'value': 4150.0, 'delta': -16.0},
],
'bar': [
{'metric': 'preroll_frame.average', 'value': 65.5, 'delta': 5.0},
{'metric': 'preroll_frame.outlierAverage', 'value': 1410.0, 'delta': 10.0},
{
'metric': 'preroll_frame.outlierRatio',
'value': 20.0,
'delta': -0.1999999999999993,
},
{'metric': 'preroll_frame.noise', 'value': 1.5, 'delta': 0.65},
{'metric': 'apply_frame.average', 'value': 50.0, 'delta': -30.0},
{'metric': 'apply_frame.outlierAverage', 'value': 100.0, 'delta': -100.6},
{
'metric': 'apply_frame.outlierRatio',
'value': 2.55,
'delta': 0.04999999999999982,
},
{'metric': 'apply_frame.noise', 'value': 0.9, 'delta': 0.5},
{
'metric': 'drawFrameDuration.average',
'value': 2000.0,
'delta': -58.90000000000009,
},
{
'metric': 'drawFrameDuration.outlierAverage',
'value': 20000.0,
'delta': -4000.0,
},
{'metric': 'drawFrameDuration.outlierRatio', 'value': 11.05, 'delta': -1.0},
{'metric': 'drawFrameDuration.noise', 'value': 1.34, 'delta': 1.0},
{'metric': 'totalUiFrame.average', 'value': 4150.0, 'delta': -16.0},
],
};
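
For reference, each `delta` in `testBenchmarkComparison` above is just the test value minus the baseline value for the same metric. A minimal, self-contained check against one of the fixture entries:
```dart
void main() {
  // preroll_frame.average for benchmark 'foo':
  const baselineAverage = 60.5; // from testBenchmarkResults1
  const testAverage = 65.5; // from testBenchmarkResults2
  // Matches the 'delta' of 5.0 expected in testBenchmarkComparison.
  print(testAverage - baselineAverage); // 5.0
}
```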
125 changes: 125 additions & 0 deletions packages/devtools_app/benchmark/scripts/compare_benchmarks.dart
@@ -0,0 +1,125 @@
// Copyright 2023 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

import 'dart:convert';
import 'dart:io';

import 'package:collection/collection.dart';
import 'package:web_benchmarks/server.dart';

import 'utils.dart';

/// Compares two sets of web benchmarks and calculates the delta between each
/// matching metric.
void main(List<String> args) {
if (args.length != 2) {
throw Exception(
'Expected 2 arguments (<baseline-file>, <test-file>), but instead there '
'were ${args.length}.',
);
}

final baselineSource = args[0];
final testSource = args[1];

stdout
..writeln('Comparing the following benchmark results:')
..writeln(' "$testSource" (test)')
..writeln(' "$baselineSource" (baseline)');

final baselineFile = checkFileExists(baselineSource);
final testFile = checkFileExists(testSource);
if (baselineFile == null || testFile == null) {
if (baselineFile == null) {
throw Exception('Cannot find baseline file $baselineSource');
}
if (testFile == null) {
throw Exception('Cannot find test file $testSource');
}
}

final baselineResults =
BenchmarkResults.parse(jsonDecode(baselineFile.readAsStringSync()));
final testResults =
BenchmarkResults.parse(jsonDecode(testFile.readAsStringSync()));
compareBenchmarks(
baselineResults,
testResults,
baselineSource: baselineSource,
);
}

Map<String, List<Map<String, Object?>>> compareBenchmarks(
BenchmarkResults baseline,
BenchmarkResults test, {
required String baselineSource,
}) {
stdout.writeln('Starting baseline comparison...');

for (final benchmarkName in test.scores.keys) {
stdout.writeln('Comparing metrics for benchmark "$benchmarkName".');

// Lookup this benchmark in the baseline.
final baselineScores = baseline.scores[benchmarkName];
if (baselineScores == null) {
stdout.writeln(
'Baseline does not contain results for benchmark "$benchmarkName".',
);
continue;
}

final testScores = test.scores[benchmarkName]!;

for (final score in testScores) {
// Lookup this metric in the baseline.
final baselineScore =
baselineScores.firstWhereOrNull((s) => s.metric == score.metric);
if (baselineScore == null) {
stdout.writeln(
'Baseline does not contain metric "${score.metric}" for '
'benchmark "$benchmarkName".',
);
continue;
}

// Record the delta for this test score relative to the baseline.
_benchmarkDeltas[score] = (score.value - baselineScore.value).toDouble();
}
}
stdout.writeln('Baseline comparison finished.');

final comparisonAsMap = test.toJsonWithDeltas();
stdout
..writeln('==== Comparison with baseline $baselineSource ====')
..writeln(const JsonEncoder.withIndent(' ').convert(comparisonAsMap))
..writeln('==== End of baseline comparison ====');
return comparisonAsMap;
}

Expando<double> _benchmarkDeltas = Expando<double>();

extension ScoreDeltaExtension on BenchmarkScore {
double? get deltaFromBaseline => _benchmarkDeltas[this];
}

extension ResultDeltaExtension on BenchmarkResults {
Map<String, List<Map<String, Object?>>> toJsonWithDeltas() {
return scores.map<String, List<Map<String, Object?>>>(
(String benchmarkName, List<BenchmarkScore> scores) {
return MapEntry<String, List<Map<String, Object?>>>(
benchmarkName,
scores.map<Map<String, Object?>>(
(BenchmarkScore score) {
final delta = _benchmarkDeltas[score];
return <String, Object?>{
...score.toJson(),
if (delta != null) 'delta': delta,
};
},
).toList(),
);
},
);
}
}
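
A note on the design: `compareBenchmarks` attaches each computed delta to the existing `BenchmarkScore` object through an `Expando` rather than copying scores into a new structure, which is why `toJsonWithDeltas()` emits a `delta` only for scores that had a matching baseline metric. A standalone sketch of how `Expando` behaves (the `Score` class here is hypothetical, purely for illustration):
```dart
// An Expando associates extra data with object identities without
// modifying the objects' class.
final deltas = Expando<double>('delta');

class Score {
  Score(this.value);
  final double value;
}

void main() {
  final score = Score(65.5);
  deltas[score] = 5.0; // attach a delta to this particular instance
  print(deltas[score]); // 5.0
  print(deltas[Score(65.5)]); // null: a different instance has no delta attached
}
```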