From 9efc716dec4f4dcea5fc0091eaa3166ee61c1bd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Bylica?= <chfast@gmail.com>
Date: Thu, 19 Nov 2020 21:55:58 +0100
Subject: [PATCH 1/2] bench: Add EVM instructions synthetic benchmarks

This introduces a set of EVM bytecodes generated on demand and available
in the evmone-bench tool. Each bytecode tries to stress a single
"low-level" EVM instruction. Instructions are grouped by their stack
requirements, and the generated bytecodes come in two main structural
modes: minimal stack usage and full stack usage.
---
 test/bench/CMakeLists.txt           |   1 +
 test/bench/bench.cpp                |   2 +
 test/bench/synthetic_benchmarks.cpp | 277 ++++++++++++++++++++++++++++
 test/bench/synthetic_benchmarks.hpp |   9 +
 4 files changed, 289 insertions(+)
 create mode 100644 test/bench/synthetic_benchmarks.cpp
 create mode 100644 test/bench/synthetic_benchmarks.hpp

diff --git a/test/bench/CMakeLists.txt b/test/bench/CMakeLists.txt
index 875c7eb45e..eee3c10425 100644
--- a/test/bench/CMakeLists.txt
+++ b/test/bench/CMakeLists.txt
@@ -11,6 +11,7 @@ target_sources(
     evmone-bench PRIVATE
     bench.cpp
     helpers.hpp
+    synthetic_benchmarks.cpp synthetic_benchmarks.hpp
 )
 
 set(HAVE_STD_FILESYSTEM 0)
diff --git a/test/bench/bench.cpp b/test/bench/bench.cpp
index ea72861e7f..8cda55e0ca 100644
--- a/test/bench/bench.cpp
+++ b/test/bench/bench.cpp
@@ -3,6 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "helpers.hpp"
+#include "synthetic_benchmarks.hpp"
 #include <benchmark/benchmark.h>
 #include <evmc/evmc.hpp>
 #include <evmc/loader.h>
@@ -287,6 +288,7 @@ int main(int argc, char** argv)
         registered_vms["advanced"] = evmc::VM{evmc_create_evmone(), {{"O", "2"}}};
         registered_vms["baseline"] = evmc::VM{evmc_create_evmone(), {{"O", "0"}}};
         register_benchmarks(benchmark_cases);
+        register_synthetic_benchmarks();
         RunSpecifiedBenchmarks();
         return 0;
     }
diff --git a/test/bench/synthetic_benchmarks.cpp b/test/bench/synthetic_benchmarks.cpp
new file mode 100644
index 0000000000..b88df2978c
--- /dev/null
+++ b/test/bench/synthetic_benchmarks.cpp
@@ -0,0 +1,277 @@
+// evmone: Fast Ethereum Virtual Machine implementation
+// Copyright 2020 The evmone Authors.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "synthetic_benchmarks.hpp"
+#include "helpers.hpp"
+#include "test/utils/bytecode.hpp"
+#include <evmc/instructions.h>
+#include <evmone/instruction_traits.hpp>
+
+using namespace benchmark;
+
+namespace evmone::test
+{
+namespace
+{
+/// Max stack items usable by the benchmark loop inner code (one more item holds the loop counter).
+constexpr auto stack_limit = 1023;
+
+enum class Mode
+{
+    min_stack = 0,   ///< The code uses as little stack as possible.
+    full_stack = 1,  ///< The code fills the stack up to its limit.
+};
+
+/// Instruction grouping by EVM stack requirements. The char value is used in the benchmark name.
+enum class InstructionCategory : char
+{
+    nop = 'n',     ///< No-op - needs no stack items and keeps the stack height unchanged.
+    nullop = 'a',  ///< Nullary operator - produces a result without any stack input.
+    unop = 'u',    ///< Unary operator - needs 1 stack item and keeps the stack height unchanged.
+    binop = 'b',   ///< Binary operator - needs 2 stack items and lowers the stack height by 1.
+    push = 'p',    ///< PUSH instruction.
+    dup = 'd',     ///< DUP instruction.
+    swap = 's',    ///< SWAP instruction.
+    other = 'X',   ///< None of the categories above.
+};
+
+constexpr InstructionCategory get_instruction_category(evmc_opcode opcode) noexcept
+{
+    const auto trait = instr::traits[opcode];  // Only consulted if not PUSH, SWAP or DUP.
+    if (opcode >= OP_PUSH1 && opcode <= OP_PUSH32)  // PUSH, SWAP, DUP: matched by opcode range.
+        return InstructionCategory::push;
+    else if (opcode >= OP_SWAP1 && opcode <= OP_SWAP16)
+        return InstructionCategory::swap;
+    else if (opcode >= OP_DUP1 && opcode <= OP_DUP16)
+        return InstructionCategory::dup;
+    else if (trait.stack_height_required == 0 && trait.stack_height_change == 0)
+        return InstructionCategory::nop;
+    else if (trait.stack_height_required == 0 && trait.stack_height_change == 1)
+        return InstructionCategory::nullop;
+    else if (trait.stack_height_required == 1 && trait.stack_height_change == 0)
+        return InstructionCategory::unop;
+    else if (trait.stack_height_required == 2 && trait.stack_height_change == -1)
+        return InstructionCategory::binop;
+    else
+        return InstructionCategory::other;  // No code is generated for this category.
+}
+
+struct CodeParams
+{
+    evmc_opcode opcode;  ///< The instruction the generated code is built around.
+    Mode mode;           ///< The structure (stack usage) of the generated code.
+};
+
+/// Lexicographic less-than by (opcode, mode). Needed to use CodeParams as the std::map key.
+[[maybe_unused]] inline constexpr bool operator<(const CodeParams& a, const CodeParams& b) noexcept
+{
+    return std::tuple(a.opcode, a.mode) < std::tuple(b.opcode, b.mode);
+}
+
+std::string to_string(const CodeParams& params)
+{
+    return std::string{instr::traits[params.opcode].name} + '/' +        // "<instruction name>/"
+           static_cast<char>(get_instruction_category(params.opcode)) +  // category letter
+           std::to_string(static_cast<int>(params.mode));                // mode number
+}
+
+/// Generates the benchmark loop inner code for the given opcode and mode (empty if unsupported).
+bytecode generate_loop_inner_code(CodeParams params)
+{
+    const auto [opcode, mode] = params;
+    const auto category = get_instruction_category(opcode);
+    switch (mode)
+    {
+    case Mode::min_stack:
+        switch (category)
+        {
+        case InstructionCategory::nop:
+            // JUMPDEST JUMPDEST ...
+            return stack_limit * 2 * bytecode{opcode};
+
+        case InstructionCategory::nullop:
+            // CALLER POP CALLER POP ...
+            return stack_limit * (bytecode{opcode} + OP_POP);
+
+        case InstructionCategory::unop:
+            // DUP1 NOT NOT ... POP
+            return OP_DUP1 + stack_limit * 2 * bytecode{opcode} + OP_POP;
+
+        case InstructionCategory::binop:
+            // DUP1 DUP1 ADD DUP1 ADD DUP1 ADD ... POP
+            return OP_DUP1 + (stack_limit - 1) * (OP_DUP1 + bytecode{opcode}) + OP_POP;
+
+        case InstructionCategory::push:
+            // PUSH1 POP PUSH1 POP ...
+            return stack_limit * (push(opcode, {}) + OP_POP);
+
+        case InstructionCategory::dup:
+        {
+            // The required n stack height for DUPn is provided by
+            // duplicating the loop counter n-1 times with DUP1.
+            const auto n = opcode - OP_DUP1 + 1;
+            // DUP1 ...  DUPn POP DUPn POP ...  POP ...
+            // \ n-1  /                         \ n-1 /
+            return (n - 1) * OP_DUP1 +                // Required n stack height.
+                   (stack_limit - (n - 1)) *          //
+                       (bytecode{opcode} + OP_POP) +  // Multiple DUPn POP pairs.
+                   (n - 1) * OP_POP;                  // Pop initially duplicated values.
+        }
+
+        case InstructionCategory::swap:
+        {
+            // The required n+1 stack height for SWAPn is provided by duplicating the loop counter
+            // n times with DUP1. This also guarantees the loop counter remains unchanged because
+            // every SWAPn exchanges it with one of its own copies.
+            const auto n = opcode - OP_SWAP1 + 1;
+            // DUP1 ...  SWAPn SWAPn ...  POP ...
+            // \  n   /                   \  n  /
+            return n * OP_DUP1 +                         // Required n+1 stack height.
+                   stack_limit * 2 * bytecode{opcode} +  // Multiple SWAPns.
+                   n * OP_POP;                           // Pop initially duplicated values.
+        }
+
+        default:
+            break;
+        }
+        break;
+
+    case Mode::full_stack:
+        switch (category)
+        {
+        case InstructionCategory::nullop:
+            // CALLER CALLER ... POP POP ...
+            return stack_limit * opcode + stack_limit * OP_POP;
+
+        case InstructionCategory::binop:
+            // DUP1 DUP1 DUP1 ... ADD ADD ADD ... POP
+            return stack_limit * OP_DUP1 + (stack_limit - 1) * opcode + OP_POP;
+
+        case InstructionCategory::push:
+            // PUSH1 PUSH1 PUSH1 ... POP POP POP ...
+            return stack_limit * push(opcode, {}) + stack_limit * OP_POP;
+
+        case InstructionCategory::dup:
+        {
+            // The required initial n stack height for DUPn is provided by
+            // duplicating the loop counter n-1 times with DUP1.
+            const auto n = opcode - OP_DUP1 + 1;
+            // DUP1 ...  DUPn DUPn ...  POP POP ...     (S is stack_limit)
+            // \ n-1  /  \  S-(n-1)  /  \    S    /
+            return (n - 1) * OP_DUP1 +                           // Required n stack height.
+                   (stack_limit - (n - 1)) * bytecode{opcode} +  // Fill the stack with DUPn.
+                   stack_limit * OP_POP;                         // Clear whole stack.
+        }
+
+        default:
+            break;
+        }
+        break;
+    }
+
+    return {};  // No inner code for this category/mode combination.
+}
+
+/// Generates a benchmark loop with the given inner code.
+///
+/// This generates a do-while loop with 255 iterations. It starts with PUSH1 of 255 as the loop
+/// counter. The while check is done as `(counter += -1) != 0`. SUB is avoided because it
+/// consumes arguments in an unnatural order and an additional SWAP would be required.
+///
+/// The loop counter stays on the stack top. The inner code is allowed to duplicate it, but must not
+/// modify it.
+bytecode generate_loop_v1(const bytecode& inner_code)
+{
+    const auto counter = push(255);
+    const auto jumpdest_offset = counter.size();  // The JUMPDEST directly follows the counter push.
+    return counter + OP_JUMPDEST + inner_code +  // loop label + inner code
+           push("ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff") +  // -1
+           OP_ADD + OP_DUP1 +                 // counter += (-1)
+           push(jumpdest_offset) + OP_JUMPI;  // jump to jumpdest_offset if counter != 0
+}
+
+/// Generates a benchmark loop with the given inner code.
+///
+/// This is an improved variant of v1. It has exactly the same instructions and consumes the same
+/// amount of gas, but according to the performed benchmarks (see "loop_v1" and "loop_v2") it runs
+/// faster, and we want the lowest possible loop overhead.
+/// The change is to set the loop counter to -255 and check `(counter += 1) != 0`.
+bytecode generate_loop_v2(const bytecode& inner_code)
+{
+    const auto counter =
+        push("ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff01");  // -255
+    const auto jumpdest_offset = counter.size();  // The JUMPDEST directly follows the counter push.
+    return counter + OP_JUMPDEST + inner_code +  // loop label + inner code
+           push(1) + OP_ADD + OP_DUP1 +          // counter += 1
+           push(jumpdest_offset) + OP_JUMPI;     // jump to jumpdest_offset if counter != 0
+}
+
+bytes_view generate_code(CodeParams params)
+{
+    static std::map<CodeParams, bytecode> cache;  // Presumably backs the returned views - confirm.
+
+    auto& code = cache[params];  // Inserts an empty entry on the first use of these params.
+    if (!code.empty())
+        return code;  // Already generated.
+
+    code = generate_loop_v2(generate_loop_inner_code(params));  // Cache it.
+    return code;
+}
+}  // namespace
+
+void register_synthetic_benchmarks()
+{
+    std::vector<CodeParams> params_list;
+
+    // Nops & unops (no full_stack code is generated for them).
+    for (const auto opcode : {OP_JUMPDEST, OP_ISZERO, OP_NOT})
+        params_list.push_back({opcode, Mode::min_stack});
+
+    // Binops (both modes).
+    for (const auto opcode : {OP_ADD, OP_MUL, OP_SUB, OP_SIGNEXTEND, OP_LT, OP_GT, OP_SLT, OP_SGT,
+             OP_EQ, OP_AND, OP_OR, OP_XOR, OP_BYTE, OP_SHL, OP_SHR, OP_SAR})
+        params_list.insert(
+            params_list.end(), {{opcode, Mode::min_stack}, {opcode, Mode::full_stack}});
+
+    // Nullops (both modes).
+    for (const auto opcode : {OP_ADDRESS, OP_CALLER, OP_CALLVALUE, OP_CALLDATASIZE, OP_CODESIZE,
+             OP_RETURNDATASIZE, OP_PC, OP_MSIZE, OP_GAS})
+        params_list.insert(
+            params_list.end(), {{opcode, Mode::min_stack}, {opcode, Mode::full_stack}});
+
+    // PUSH (both modes).
+    for (auto opcode = OP_PUSH1; opcode <= OP_PUSH32; opcode = static_cast<evmc_opcode>(opcode + 1))
+        params_list.insert(
+            params_list.end(), {{opcode, Mode::min_stack}, {opcode, Mode::full_stack}});
+
+    // SWAP (no full_stack code is generated for it).
+    for (auto opcode = OP_SWAP1; opcode <= OP_SWAP16; opcode = static_cast<evmc_opcode>(opcode + 1))
+        params_list.insert(params_list.end(), {{opcode, Mode::min_stack}});
+
+    // DUP (both modes).
+    for (auto opcode = OP_DUP1; opcode <= OP_DUP16; opcode = static_cast<evmc_opcode>(opcode + 1))
+        params_list.insert(
+            params_list.end(), {{opcode, Mode::min_stack}, {opcode, Mode::full_stack}});
+
+    // Both loop variants with empty inner code, registered for every VM.
+    for (auto& [vm_name, vm] : registered_vms)
+    {
+        RegisterBenchmark((std::string{vm_name} + "/execute/synth/loop_v1").c_str(),
+            [&vm = vm](State& state) { execute(state, vm, generate_loop_v1({})); });
+        RegisterBenchmark((std::string{vm_name} + "/execute/synth/loop_v2").c_str(),
+            [&vm = vm](State& state) { execute(state, vm, generate_loop_v2({})); });
+    }
+
+    for (const auto params : params_list)  // One benchmark per params and VM.
+    {
+        for (auto& [vm_name, vm] : registered_vms)
+        {
+            RegisterBenchmark(
+                (std::string{vm_name} + "/execute/synth/" + to_string(params)).c_str(),
+                [&vm = vm, params](State& state) { execute(state, vm, generate_code(params)); })
+                ->Unit(kMicrosecond);
+        }
+    }
+}
+}  // namespace evmone::test
diff --git a/test/bench/synthetic_benchmarks.hpp b/test/bench/synthetic_benchmarks.hpp
new file mode 100644
index 0000000000..0fc923a1e1
--- /dev/null
+++ b/test/bench/synthetic_benchmarks.hpp
@@ -0,0 +1,9 @@
+// evmone: Fast Ethereum Virtual Machine implementation
+// Copyright 2020 The evmone Authors.
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+namespace evmone::test
+{
+void register_synthetic_benchmarks();  // Registers the built-in synthetic EVM benchmarks.
+}

From 0b3c2c4345176cd309a3d1782121c434698b423e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Bylica?= <chfast@gmail.com>
Date: Tue, 19 Jan 2021 14:48:40 +0100
Subject: [PATCH 2/2] bench: Allow omitting DIR argument to evmone-bench

---
 test/bench/CMakeLists.txt | 13 +++++++------
 test/bench/bench.cpp      | 12 +++++++-----
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/test/bench/CMakeLists.txt b/test/bench/CMakeLists.txt
index eee3c10425..ff35bc1597 100644
--- a/test/bench/CMakeLists.txt
+++ b/test/bench/CMakeLists.txt
@@ -47,11 +47,12 @@ set(PREFIX evmone/bench)
 
 # Check if DIR argument works.
 add_test(NAME ${PREFIX}/dir COMMAND evmone-bench ${CMAKE_CURRENT_SOURCE_DIR}/../benchmarks --benchmark_list_tests)
+set_tests_properties(${PREFIX}/dir PROPERTIES PASS_REGULAR_EXPRESSION "execute/synth")
 
-# Empty DIR name should run no benchmarks.
-add_test(NAME ${PREFIX}/dirname_empty COMMAND evmone-bench "" --benchmark_list_tests)
-set_tests_properties(${PREFIX}/dirname_empty PROPERTIES PASS_REGULAR_EXPRESSION "Failed to match any benchmarks")
+# Omitting DIR is fine.
+add_test(NAME ${PREFIX}/no_dir COMMAND evmone-bench --benchmark_list_tests)
+set_tests_properties(${PREFIX}/no_dir PROPERTIES PASS_REGULAR_EXPRESSION "execute/synth")
 
-# Missing DIR argument is an error.
-add_test(NAME ${PREFIX}/no_dir COMMAND evmone-bench)
-set_tests_properties(${PREFIX}/no_dir PROPERTIES PASS_REGULAR_EXPRESSION "DIR argument .* missing")
+# Empty DIR name should list only the built-in benchmarks.
+add_test(NAME ${PREFIX}/dirname_empty COMMAND evmone-bench "" --benchmark_list_tests)
+set_tests_properties(${PREFIX}/dirname_empty PROPERTIES PASS_REGULAR_EXPRESSION "execute/synth")
diff --git a/test/bench/bench.cpp b/test/bench/bench.cpp
index 8cda55e0ca..4a2098a6b3 100644
--- a/test/bench/bench.cpp
+++ b/test/bench/bench.cpp
@@ -191,12 +191,14 @@ constexpr auto cli_parsing_error = -3;
 ///
 /// The following variants of number arguments are supported (including argv[0]):
 ///
+/// 1: evmone-bench
+///    Uses evmone VMs, only synthetic benchmarks are available.
 /// 2: evmone-bench benchmarks_dir
-///    Uses evmone VM, loads all benchmarks from benchmarks_dir.
+///    Uses evmone VMs, loads all benchmarks from benchmarks_dir.
 /// 3: evmone-bench evmc_config benchmarks_dir
-///    The same as (2) but loads custom EVMC VM.
+///    The same as (2) but loads an additional custom EVMC VM.
 /// 4: evmone-bench code_hex_file input_hex expected_output_hex.
-///    Uses evmone VM, registers custom benchmark with the code from the given file,
+///    Uses evmone VMs, registers custom benchmark with the code from the given file,
 ///    and the given input. The benchmark will compare the output with the provided
 ///    expected one.
 std::tuple<int, std::vector<BenchmarkCase>> parseargs(int argc, char** argv)
@@ -211,8 +213,8 @@ std::tuple<int, std::vector<BenchmarkCase>> parseargs(int argc, char** argv)
     switch (argc)
     {
     case 1:
-        std::cerr << "DIR argument (path to a directory with benchmarks) missing\n";
-        return {cli_parsing_error, {}};
+        // Run with built-in synthetic benchmarks only.
+        break;
     case 2:
         benchmarks_dir = argv[1];
         break;