From 7a35839599c1b1bd89618459e7e385525f05f90a Mon Sep 17 00:00:00 2001
From: Luke Gorrie <luke@snabb.co>
Date: Mon, 27 Mar 2017 10:43:00 +0000
Subject: [PATCH 1/4] Import revamped nix benchmark framework

The nix code for running benchmarks is now imported in-tree. This
makes it easy to automatically run benchmarks and also generate the
fancy R visualizations. See README.md for example usage.

The visualizations are now generated as PNG files instead of an
Rmarkdown document. This is to make it easier to introduce new
visualizations, or variations of existing ones, and easier to load
large images for zooming.

Under the hood the nix code is also updated so that each test run is a
separate derivation. This means that nix is able to parallelize the
test runs (one execution of each benchmark with one raptorjit version)
and distribute them between machines. This should make the tests run
faster on the Hydra CI cluster and also avoid tying up whole servers
with hours-long derivations that run hundreds of test runs at the same
time. (It also allows you to parallelize test runs on the local machine
to use multiple cores.)
---
 README.md                   |  33 ++++++++++++
 testsuite/bench/bench.R     |  49 +++++++++++++++++
 testsuite/bench/default.nix | 103 ++++++++++++++++++++++++++++++++++++
 testsuite/bench/generate.R  |  25 +++++++++
 4 files changed, 210 insertions(+)
 create mode 100644 testsuite/bench/bench.R
 create mode 100644 testsuite/bench/default.nix
 create mode 100755 testsuite/bench/generate.R

diff --git a/README.md b/README.md
index 244264570e..2c3a954c61 100644
--- a/README.md
+++ b/README.md
@@ -116,6 +116,39 @@ $ make
 
 ... but make sure you have at least `make`, `clang`, and `luajit` in your `$PATH`.
 
+### Run the benchmarks
+
+Nix can also run the full benchmark suite and generate visualizations
+with R/ggplot2.
+
+The simplest incantation tests one branch:
+
+```shell
+$ nix-build testsuite/bench --arg Asrc ./.   # note: ./. means ./
+```
+
+You can also test several branches (A-E), give them names, specify
+command-line arguments, say how many tests to run, and allow parallel
+execution:
+
+```shell
+$ nix-build testsuite/bench                     \
+            --arg    Asrc ~/git/raptorjit       \
+            --argstr Aname master               \
+            --arg    Bsrc ~/git/raptorjit-hack  \
+            --argstr Bname hacked               \
+            --arg    Csrc ~/git/raptorjit-hack2 \
+            --argstr Cname hacked-O1            \
+            --argstr Cargs -O1                  \
+            --arg    runs 100                   \
+            -j 5           # Run up to 5 tests in parallel
+''
+
+If you are using a distributed nix environment such
+as [Hydra](https://nixos.org/hydra/) then the tests can be
+automatically parallelized and distributed across a suitable build
+farm.
+
 ### Quotes
 
 Here are some borrowed words to put this branch into context:
diff --git a/testsuite/bench/bench.R b/testsuite/bench/bench.R
new file mode 100644
index 0000000000..e76382bf15
--- /dev/null
+++ b/testsuite/bench/bench.R
@@ -0,0 +1,49 @@
+# R subroutines for reading and visualizing benchmark results.
+
+suppressPackageStartupMessages({
+  library(dplyr)
+  library(ggplot2)
+})
+
+## R library routines for analyzing benchmark results
+bench.read <- function(filename) {
+  results <- read.csv(filename)
+  ## Reference level per benchmark: mean cycle count over the letter "A" rows
+  a.rows    <- filter(results, letter=="A")
+  reference <- summarize(group_by(a.rows, benchmark), baseline = mean(cycles))
+  ## Attach the reference to every row (rows of a benchmark that has no "A"
+  ## measurements get an NA baseline from the left join)
+  joined  <- left_join(results, reference, by="benchmark")
+  ## 'relative' = baseline cycles / measured cycles, so values above 1 mean
+  ## fewer cycles than the baseline average
+  grouped <- group_by(joined, benchmark, version)
+  scored  <- mutate(grouped, relative = first(baseline) / cycles)
+  return(scored)
+}
+
+## Jitter plot of relative performance by version, one panel per benchmark
+bench.jitterplot <- function(data) {
+  ## Rotate the version labels and keep each panel square
+  look <- theme(aspect.ratio = 1, axis.text.x = element_text(angle=90))
+  ggplot(data, aes(x=version, y=relative, color=version)) +
+    geom_jitter(shape=1, alpha=0.5) +
+    scale_y_continuous(breaks=seq(0, 3, 0.1), labels=scales::percent) +
+    facet_wrap(~ benchmark, scales="free_x") +
+    ggtitle("Comparative performance between RaptorJIT versions") +
+    ylab("Performance relative to baseline average") + look
+}
+
+## ECDF of relative performance per version, one panel per benchmark
+bench.ecdfplot <- function(data) {
+  ggplot(aes(x=relative, color=version), data=data) +
+  stat_ecdf() +
+  scale_x_continuous(labels=scales::percent) +
+  scale_y_log10(labels=scales::percent) +
+  theme(aspect.ratio = 1, axis.text.x = element_text(angle=90)) +
+  ## x carries 'relative'; stat_ecdf puts the cumulative fraction on y
+  xlab("Performance relative to baseline average") +
+  ylab("Percentage of results at or below this performance level") +
+  ggtitle("Comparative performance between RaptorJIT variants") +
+  facet_wrap(~ benchmark)
+}
+
diff --git a/testsuite/bench/default.nix b/testsuite/bench/default.nix
new file mode 100644
index 0000000000..8d95a071d2
--- /dev/null
+++ b/testsuite/bench/default.nix
@@ -0,0 +1,103 @@
+# Run a large parallel benchmark campaign and generate R/ggplot2 reports.
+
+{ pkgs ? (import ../../pkgs.nix) {},
+  Asrc,        Aname ? "A", Aargs ? "",
+  Bsrc ? null, Bname ? "B", Bargs ? "",
+  Csrc ? null, Cname ? "C", Cargs ? "",
+  Dsrc ? null, Dname ? "D", Dargs ? "",
+  Esrc ? null, Ename ? "E", Eargs ? "",
+  hardware ? null,
+  runs ? 30 }:
+
+with pkgs;
+with stdenv;
+
+# Derivation for a single run: benchmark raptorjit 'name' (built from 'src', invoked with 'args') and produce bench.csv.
+let benchmark = letter: name: src: args: run:
+  let raptorjit = (import src {inherit pkgs; version = name;}).raptorjit; in
+  mkDerivation {
+    name = "benchmark-${name}-${toString run}";
+    src = pkgs.lib.cleanSource ./.;
+    # Force consistent hardware
+    requiredSystemFeatures = if hardware != null then [hardware] else [];
+    buildInputs = [ raptorjit linuxPackages.perf utillinux ];
+    buildPhase = ''
+      # The run number is a nix value, not a shell variable: interpolate it
+      echo "Run ${toString run}"
+      mkdir -p result
+      # Run each individual benchmark listed in PARAM_x86_CI.txt
+      cat PARAM_x86_CI.txt |
+        (while read benchmark params; do
+           echo "running $benchmark"
+           # Execute with performance monitoring & time supervision; discard the
+           # overwhelming stdout and drop the perf log if the run fails or times out
+           timeout -sKILL 60 \
+             perf stat -x, -o result/$benchmark.perf \
+             raptorjit ${args} -e "math.randomseed(${toString run})" $benchmark.lua $params \
+                > /dev/null || \
+                rm -f result/$benchmark.perf
+        done)
+    '';
+    installPhase = ''
+      # Copy the raw perf output for reference
+      cp -r result $out
+      # Log the exact CPU
+      lscpu > $out/cpu.txt
+      # Create the CSV rows from the perf logs, which the build phase
+      # writes directly into result/
+      for result in result/*.perf; do
+        version=${name}
+        benchmark=$(basename -s.perf -a $result)
+        instructions=$(awk -F, -e '$3 == "instructions" { print $1; }' $result)
+        cycles=$(      awk -F, -e '$3 == "cycles"       { print $1; }' $result)
+        echo ${letter},$version,$benchmark,${toString run},$instructions,$cycles >> $out/bench.csv
+      done
+    '';
+  };
+
+# Aggregate all 'runs' executions of one raptorjit version into one CSV file.
+# Each run is its own derivation, so nix can parallelize and distribute them.
+  benchmarkSet = letter: name: src: args:
+    let
+      perRun  = map (run: benchmark letter name src args run) (pkgs.lib.range 1 runs);
+      runDirs = pkgs.lib.concatMapStrings (drv: "${drv} ") perRun;
+    in runCommand "benchmarks-${name}" { buildInputs = perRun; } ''
+        source $stdenv/setup
+        mkdir -p $out
+        for dir in ${runDirs}; do
+          cat $dir/bench.csv >> $out/bench.csv
+        done
+      '';
+
+  benchA = benchmarkSet "A" Aname Asrc Aargs;  # Asrc has no default, so A always runs
+  benchB = if Bsrc == null then "" else benchmarkSet "B" Bname Bsrc Bargs;  # "" = not requested
+  benchC = if Csrc == null then "" else benchmarkSet "C" Cname Csrc Cargs;
+  benchD = if Dsrc == null then "" else benchmarkSet "D" Dname Dsrc Dargs;
+  benchE = if Esrc == null then "" else benchmarkSet "E" Ename Esrc Eargs;
+in
+# Final product: the merged bench.csv plus the PNG reports made by generate.R.
+rec {
+  benchmarkResults = mkDerivation {
+    name = "benchmark-results";
+    buildInputs = with pkgs.rPackages; [ pkgs.R ggplot2 dplyr ];
+    builder = pkgs.writeText "builder.sh" ''
+      source $stdenv/setup
+      # Get the CSV file (versions without a source interpolate as empty and are skipped)
+      mkdir -p $out/nix-support
+      echo "letter,version,benchmark,run,instructions,cycles" > bench.csv
+                            cat ${benchA}/bench.csv >> bench.csv
+      [ -n "${benchB}" ] && cat ${benchB}/bench.csv >> bench.csv
+      [ -n "${benchC}" ] && cat ${benchC}/bench.csv >> bench.csv
+      [ -n "${benchD}" ] && cat ${benchD}/bench.csv >> bench.csv
+      [ -n "${benchE}" ] && cat ${benchE}/bench.csv >> bench.csv
+      cp bench.csv $out
+      echo "file CSV $out/bench.csv" >> $out/nix-support/hydra-build-products
+      # Generate the report
+      (cd ${./.}; Rscript ./generate.R $out/bench.csv $out)
+      for png in $out/*.png; do
+        echo "file PNG $png" >> $out/nix-support/hydra-build-products
+      done
+    '';
+  };
+}
+
diff --git a/testsuite/bench/generate.R b/testsuite/bench/generate.R
new file mode 100755
index 0000000000..fb43f01b1f
--- /dev/null
+++ b/testsuite/bench/generate.R
@@ -0,0 +1,25 @@
+#!/usr/bin/env nix-shell
+#!nix-shell -i Rscript -p R rPackages.dplyr rPackages.ggplot2
+
+# R command-line program for making visualizations from benchmark results.
+# Run from this directory: bench.R is sourced relative to the working directory.
+suppressWarnings(source("bench.R"))
+
+args <- commandArgs(trailingOnly=TRUE)
+if (length(args) != 2) {
+    message("Usage: generate.R <csv> <outdir>"); quit(status=1)
+}
+
+filename <- args[[1]]
+outdir   <- args[[2]]
+
+data <- bench.read(filename)
+if (!dir.exists(outdir)) { dir.create(outdir, recursive=TRUE) }
+
+# Save each visualization as a PNG file in outdir
+ggsave(filename = file.path(outdir,"bench-jitter.png"),
+       plot = bench.jitterplot(data),
+       width=12, height=12)
+
+ggsave(filename = file.path(outdir,"bench-ecdf.png"),
+       plot = bench.ecdfplot(data),
+       width=12, height=12)

From b1b52db19b38e4b86ce77dcae9a734e3f77f148e Mon Sep 17 00:00:00 2001
From: Luke Gorrie <luke@snabb.co>
Date: Mon, 27 Mar 2017 11:00:18 +0000
Subject: [PATCH 2/4] Fix markdown goof in README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2c3a954c61..daf8fa972c 100644
--- a/README.md
+++ b/README.md
@@ -142,7 +142,7 @@ $ nix-build testsuite/bench                     \
             --argstr Cargs -O1                  \
             --arg    runs 100                   \
             -j 5           # Run up to 5 tests in parallel
-''
+```
 
 If you are using a distributed nix environment such
 as [Hydra](https://nixos.org/hydra/) then the tests can be

From f5aa32905a4da78be60d581ca0e12e381856278b Mon Sep 17 00:00:00 2001
From: Luke Gorrie <luke@snabb.co>
Date: Mon, 27 Mar 2017 11:02:37 +0000
Subject: [PATCH 3/4] README.md: Add note about where benchmark visualizations
 go

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index daf8fa972c..3d81927184 100644
--- a/README.md
+++ b/README.md
@@ -132,6 +132,7 @@ command-line arguments, say how many tests to run, and allow parallel
 execution:
 
 ```shell
+# Run the benchmarks and create result visualizations in result/
 $ nix-build testsuite/bench                     \
             --arg    Asrc ~/git/raptorjit       \
             --argstr Aname master               \

From 43181637fb66dab6d4286be3b4c511cbecf4cfed Mon Sep 17 00:00:00 2001
From: Luke Gorrie <luke@snabb.co>
Date: Fri, 24 Mar 2017 19:41:23 +0000
Subject: [PATCH 4/4] testsuite/bench: Tweaked tests to run longer

Tweaked parameters so that shorter tests run for longer, typically 2-3
seconds each. This is to reduce sensitivity to potential disturbance
of results by small events during startup e.g. slow system calls.
---
 testsuite/bench/PARAM_x86_CI.txt | 13 ++++++-------
 testsuite/bench/life.lua         |  2 +-
 testsuite/bench/roulette.lua     |  2 +-
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/testsuite/bench/PARAM_x86_CI.txt b/testsuite/bench/PARAM_x86_CI.txt
index 0a90016863..5f8752c23b 100644
--- a/testsuite/bench/PARAM_x86_CI.txt
+++ b/testsuite/bench/PARAM_x86_CI.txt
@@ -1,22 +1,21 @@
-array3d 300
+array3d 500
 binary-trees 16
 chameneos 1e7
-coroutine-ring 2e7
+coroutine-ring 5e7
 euler14-bit 2e7
 fannkuch 11
 fasta 5e6
 life
 mandelbrot 5000
 mandelbrot-bit 5000
-md5 20000
-nbody 5e6
+md5 30000
+nbody 8e6
 nsieve 12
-nsieve-bit 12
+nsieve-bit 13
 nsieve-bit-fp 12
-partialsums 1e7
+partialsums 3e7
 pidigits-nogmp 5000
 ray 9
-recursive-ack 10
 recursive-fib 40
 scimark-fft 50000
 scimark-lu 5000
diff --git a/testsuite/bench/life.lua b/testsuite/bench/life.lua
index 911d9fe177..4b7029dac6 100644
--- a/testsuite/bench/life.lua
+++ b/testsuite/bench/life.lua
@@ -103,7 +103,7 @@ function LIFE(w,h)
     thisgen:draw()
     write("Life - generation ",gen,"\n")
     gen=gen+1
-    if gen>2000 then break end
+    if gen>10000 then break end
     --delay()		-- no delay
   end
 end
diff --git a/testsuite/bench/roulette.lua b/testsuite/bench/roulette.lua
index 968d42a744..84afe8fdd6 100644
--- a/testsuite/bench/roulette.lua
+++ b/testsuite/bench/roulette.lua
@@ -6,7 +6,7 @@
 -- (Let the test harness determine the random seed)
 -- math.randomseed(os.time())
 
-local population = 100e6
+local population = 200e6
 local live = 0
 local die  = 0