From 6b455ff206f1ba1c43cf54b6eb920ceefb9004a1 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 12 Nov 2019 22:24:37 +0100 Subject: [PATCH] Upgrade Google Benchmark to v1.5.0 (#3386) --- Makefile-ponyc | 3 +- lib/gbenchmark/.clang-format | 2 +- lib/gbenchmark/.gitignore | 4 + lib/gbenchmark/.travis.yml | 75 +- lib/gbenchmark/AUTHORS | 6 + lib/gbenchmark/BUILD.bazel | 32 +- lib/gbenchmark/CMakeLists.txt | 57 +- lib/gbenchmark/CONTRIBUTORS | 8 + lib/gbenchmark/README.md | 1152 ++++++++++------- lib/gbenchmark/WORKSPACE | 10 +- lib/gbenchmark/_config.yml | 1 + lib/gbenchmark/appveyor.yml | 6 - lib/gbenchmark/cmake/CXXFeatureCheck.cmake | 10 +- lib/gbenchmark/cmake/GetGitVersion.cmake | 5 +- lib/gbenchmark/cmake/GoogleTest.cmake | 41 + lib/gbenchmark/cmake/GoogleTest.cmake.in | 58 + lib/gbenchmark/cmake/HandleGTest.cmake | 115 -- lib/gbenchmark/cmake/benchmark.pc.in | 1 + lib/gbenchmark/cmake/split_list.cmake | 3 + lib/gbenchmark/conan/CMakeLists.txt | 7 + .../conan/test_package/CMakeLists.txt | 10 + .../conan/test_package/conanfile.py | 19 + .../conan/test_package/test_package.cpp | 18 + lib/gbenchmark/conanfile.py | 79 ++ lib/gbenchmark/dependencies.md | 18 + lib/gbenchmark/docs/_config.yml | 1 + lib/gbenchmark/docs/tools.md | 99 +- lib/gbenchmark/include/benchmark/benchmark.h | 412 ++++-- lib/gbenchmark/src/CMakeLists.txt | 50 +- lib/gbenchmark/src/benchmark.cc | 340 ++--- lib/gbenchmark/src/benchmark_api_internal.cc | 15 + lib/gbenchmark/src/benchmark_api_internal.h | 16 +- lib/gbenchmark/src/benchmark_main.cc | 17 + lib/gbenchmark/src/benchmark_name.cc | 58 + lib/gbenchmark/src/benchmark_register.cc | 95 +- lib/gbenchmark/src/benchmark_register.h | 92 +- lib/gbenchmark/src/benchmark_runner.cc | 361 ++++++ lib/gbenchmark/src/benchmark_runner.h | 51 + lib/gbenchmark/src/check.h | 5 +- lib/gbenchmark/src/colorprint.cc | 2 +- lib/gbenchmark/src/commandlineflags.cc | 6 +- lib/gbenchmark/src/commandlineflags.h | 6 - lib/gbenchmark/src/complexity.cc | 44 +- lib/gbenchmark/src/console_reporter.cc | 57 +- lib/gbenchmark/src/counter.cc | 26 +- lib/gbenchmark/src/counter.h | 9 +- lib/gbenchmark/src/csv_reporter.cc | 61 +- lib/gbenchmark/src/cycleclock.h | 8 +- lib/gbenchmark/src/internal_macros.h | 27 +- lib/gbenchmark/src/json_reporter.cc | 124 +- lib/gbenchmark/src/log.h | 3 +- lib/gbenchmark/src/re.h | 24 +- lib/gbenchmark/src/reporter.cc | 22 +- lib/gbenchmark/src/sleep.cc | 2 +- lib/gbenchmark/src/statistics.cc | 67 +- lib/gbenchmark/src/string_util.cc | 92 +- lib/gbenchmark/src/string_util.h | 29 +- lib/gbenchmark/src/sysinfo.cc | 160 ++- lib/gbenchmark/src/thread_manager.h | 4 +- lib/gbenchmark/src/thread_timer.h | 23 +- lib/gbenchmark/src/timers.cc | 8 +- lib/gbenchmark/test/AssemblyTests.cmake | 1 + lib/gbenchmark/test/BUILD | 10 +- lib/gbenchmark/test/CMakeLists.txt | 42 +- lib/gbenchmark/test/basic_test.cc | 9 +- lib/gbenchmark/test/benchmark_gtest.cc | 97 +- lib/gbenchmark/test/benchmark_name_gtest.cc | 74 ++ lib/gbenchmark/test/commandlineflags_gtest.cc | 78 ++ lib/gbenchmark/test/complexity_test.cc | 80 +- lib/gbenchmark/test/cxx03_test.cc | 2 +- .../test/display_aggregates_only_test.cc | 43 + .../test/internal_threading_test.cc | 184 +++ lib/gbenchmark/test/link_main_test.cc | 8 + lib/gbenchmark/test/memory_manager_test.cc | 44 + lib/gbenchmark/test/multiple_ranges_test.cc | 3 +- lib/gbenchmark/test/options_test.cc | 10 + lib/gbenchmark/test/output_test.h | 36 +- lib/gbenchmark/test/output_test_helper.cc | 180 ++- .../test/register_benchmark_test.cc | 6 +- 
.../test/report_aggregates_only_test.cc | 39 + lib/gbenchmark/test/reporter_output_test.cc | 532 ++++++-- lib/gbenchmark/test/skip_with_error_test.cc | 9 +- lib/gbenchmark/test/state_assembly_test.cc | 4 +- lib/gbenchmark/test/statistics_gtest.cc | 53 +- lib/gbenchmark/test/string_util_gtest.cc | 146 +++ lib/gbenchmark/test/templated_fixture_test.cc | 6 +- .../test/user_counters_tabular_test.cc | 199 +-- lib/gbenchmark/test/user_counters_test.cc | 333 ++++- .../test/user_counters_thousands_test.cc | 173 +++ lib/gbenchmark/tools/compare.py | 108 +- lib/gbenchmark/tools/compare_bench.py | 67 - .../tools/gbench/Inputs/test1_run1.json | 19 +- .../tools/gbench/Inputs/test1_run2.json | 19 +- .../tools/gbench/Inputs/test3_run0.json | 65 + .../tools/gbench/Inputs/test3_run1.json | 65 + lib/gbenchmark/tools/gbench/report.py | 409 +++++- lib/gbenchmark/tools/gbench/util.py | 15 +- 97 files changed, 5482 insertions(+), 1842 deletions(-) create mode 100644 lib/gbenchmark/_config.yml create mode 100644 lib/gbenchmark/cmake/GoogleTest.cmake create mode 100644 lib/gbenchmark/cmake/GoogleTest.cmake.in delete mode 100644 lib/gbenchmark/cmake/HandleGTest.cmake create mode 100644 lib/gbenchmark/cmake/split_list.cmake create mode 100644 lib/gbenchmark/conan/CMakeLists.txt create mode 100644 lib/gbenchmark/conan/test_package/CMakeLists.txt create mode 100644 lib/gbenchmark/conan/test_package/conanfile.py create mode 100644 lib/gbenchmark/conan/test_package/test_package.cpp create mode 100644 lib/gbenchmark/conanfile.py create mode 100644 lib/gbenchmark/dependencies.md create mode 100644 lib/gbenchmark/docs/_config.yml create mode 100644 lib/gbenchmark/src/benchmark_api_internal.cc create mode 100644 lib/gbenchmark/src/benchmark_main.cc create mode 100644 lib/gbenchmark/src/benchmark_name.cc create mode 100644 lib/gbenchmark/src/benchmark_runner.cc create mode 100644 lib/gbenchmark/src/benchmark_runner.h create mode 100644 lib/gbenchmark/test/benchmark_name_gtest.cc create mode 100644 lib/gbenchmark/test/commandlineflags_gtest.cc create mode 100644 lib/gbenchmark/test/display_aggregates_only_test.cc create mode 100644 lib/gbenchmark/test/internal_threading_test.cc create mode 100644 lib/gbenchmark/test/link_main_test.cc create mode 100644 lib/gbenchmark/test/memory_manager_test.cc create mode 100644 lib/gbenchmark/test/report_aggregates_only_test.cc create mode 100644 lib/gbenchmark/test/string_util_gtest.cc create mode 100644 lib/gbenchmark/test/user_counters_thousands_test.cc delete mode 100755 lib/gbenchmark/tools/compare_bench.py create mode 100644 lib/gbenchmark/tools/gbench/Inputs/test3_run0.json create mode 100644 lib/gbenchmark/tools/gbench/Inputs/test3_run1.json diff --git a/Makefile-ponyc b/Makefile-ponyc index dbcb664519..cb689308fc 100644 --- a/Makefile-ponyc +++ b/Makefile-ponyc @@ -616,7 +616,8 @@ libponyc.benchmarks.buildoptions += -DLLVM_BUILD_MODE=$(LLVM_BUILD_MODE) libgbenchmark.buildoptions := \ -Wshadow -pedantic -pedantic-errors \ - -Wfloat-equal -fstrict-aliasing -Wstrict-aliasing -Wno-invalid-offsetof \ + -fstrict-aliasing -Wstrict-aliasing \ + -Wno-invalid-offsetof -Wno-deprecated-declarations \ -DHAVE_POSIX_REGEX -DHAVE_STD_REGEX -DHAVE_STEADY_CLOCK ifneq ($(ALPINE),) diff --git a/lib/gbenchmark/.clang-format b/lib/gbenchmark/.clang-format index 4b3f13fa55..e7d00feaa0 100644 --- a/lib/gbenchmark/.clang-format +++ b/lib/gbenchmark/.clang-format @@ -1,5 +1,5 @@ --- Language: Cpp BasedOnStyle: Google +PointerAlignment: Left ... 
- diff --git a/lib/gbenchmark/.gitignore b/lib/gbenchmark/.gitignore index 050e46987f..806d04c6b3 100644 --- a/lib/gbenchmark/.gitignore +++ b/lib/gbenchmark/.gitignore @@ -48,6 +48,7 @@ bazel-* # out-of-source build top-level folders. build/ _build/ +build*/ # in-source dependencies /googletest/ @@ -55,3 +56,6 @@ _build/ # Visual Studio 2015/2017 cache/options directory .vs/ CMakeSettings.json + +# Visual Studio Code cache/options directory +.vscode/ diff --git a/lib/gbenchmark/.travis.yml b/lib/gbenchmark/.travis.yml index 09c058c544..6b6cfc7046 100644 --- a/lib/gbenchmark/.travis.yml +++ b/lib/gbenchmark/.travis.yml @@ -23,13 +23,25 @@ matrix: apt: packages: - g++-multilib - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug BUILD_32_BITS=ON + - libc6:i386 + env: + - COMPILER=g++ + - C_COMPILER=gcc + - BUILD_TYPE=Debug + - BUILD_32_BITS=ON + - EXTRA_FLAGS="-m32" - compiler: gcc addons: apt: packages: - g++-multilib - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release BUILD_32_BITS=ON + - libc6:i386 + env: + - COMPILER=g++ + - C_COMPILER=gcc + - BUILD_TYPE=Release + - BUILD_32_BITS=ON + - EXTRA_FLAGS="-m32" - compiler: gcc env: - INSTALL_GCC6_FROM_PPA=1 @@ -42,81 +54,102 @@ matrix: env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release # Clang w/ libc++ - compiler: clang + dist: xenial addons: apt: packages: clang-3.8 env: + - INSTALL_GCC6_FROM_PPA=1 - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - LIBCXX_BUILD=1 - - EXTRA_FLAGS="-stdlib=libc++" + - EXTRA_CXX_FLAGS="-stdlib=libc++" - compiler: clang + dist: xenial addons: apt: packages: clang-3.8 env: + - INSTALL_GCC6_FROM_PPA=1 - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release - LIBCXX_BUILD=1 - - EXTRA_FLAGS="-stdlib=libc++" + - EXTRA_CXX_FLAGS="-stdlib=libc++" # Clang w/ 32bit libc++ - compiler: clang + dist: xenial addons: apt: packages: - clang-3.8 - g++-multilib + - libc6:i386 env: + - INSTALL_GCC6_FROM_PPA=1 - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - LIBCXX_BUILD=1 - BUILD_32_BITS=ON - - EXTRA_FLAGS="-stdlib=libc++ -m32" + - EXTRA_FLAGS="-m32" + - EXTRA_CXX_FLAGS="-stdlib=libc++" # Clang w/ 32bit libc++ - compiler: clang + dist: xenial addons: apt: packages: - clang-3.8 - g++-multilib + - libc6:i386 env: + - INSTALL_GCC6_FROM_PPA=1 - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release - LIBCXX_BUILD=1 - BUILD_32_BITS=ON - - EXTRA_FLAGS="-stdlib=libc++ -m32" + - EXTRA_FLAGS="-m32" + - EXTRA_CXX_FLAGS="-stdlib=libc++" # Clang w/ libc++, ASAN, UBSAN - compiler: clang + dist: xenial addons: apt: packages: clang-3.8 env: + - INSTALL_GCC6_FROM_PPA=1 - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address" - ENABLE_SANITIZER=1 - - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all" + - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all" + - EXTRA_CXX_FLAGS="-stdlib=libc++" - UBSAN_OPTIONS=print_stacktrace=1 # Clang w/ libc++ and MSAN - compiler: clang + dist: xenial addons: apt: packages: clang-3.8 env: + - INSTALL_GCC6_FROM_PPA=1 - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins - ENABLE_SANITIZER=1 - - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" + - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" + - EXTRA_CXX_FLAGS="-stdlib=libc++" # 
Clang w/ libc++ and MSAN - compiler: clang + dist: xenial addons: apt: packages: clang-3.8 env: + - INSTALL_GCC6_FROM_PPA=1 - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread - ENABLE_SANITIZER=1 - - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" + - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" + - EXTRA_CXX_FLAGS="-stdlib=libc++" - os: osx osx_image: xcode8.3 compiler: clang @@ -127,6 +160,14 @@ matrix: compiler: clang env: - COMPILER=clang++ BUILD_TYPE=Release + - os: osx + osx_image: xcode8.3 + compiler: clang + env: + - COMPILER=clang++ + - BUILD_TYPE=Release + - BUILD_32_BITS=ON + - EXTRA_FLAGS="-m32" - os: osx osx_image: xcode8.3 compiler: gcc @@ -155,35 +196,35 @@ before_install: install: - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then - sudo -E apt-get -yq --no-install-suggests --no-install-recommends install g++-6; + travis_wait sudo -E apt-get -yq --no-install-suggests --no-install-recommends install g++-6; fi - if [ "${TRAVIS_OS_NAME}" == "linux" -a "${BUILD_32_BITS}" == "OFF" ]; then - sudo -E apt-get -y --no-install-suggests --no-install-recommends install llvm-3.9-tools; + travis_wait sudo -E apt-get -y --no-install-suggests --no-install-recommends install llvm-3.9-tools; sudo cp /usr/lib/llvm-3.9/bin/FileCheck /usr/local/bin/; fi - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then PATH=~/.local/bin:${PATH}; pip install --user --upgrade pip; - pip install --user cpp-coveralls; + travis_wait pip install --user cpp-coveralls; fi - if [ "${C_COMPILER}" == "gcc-7" -a "${TRAVIS_OS_NAME}" == "osx" ]; then rm -f /usr/local/include/c++; brew update; - brew install gcc@7; + travis_wait brew install gcc@7; fi - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then sudo apt-get update -qq; - sudo apt-get install -qq unzip; + sudo apt-get install -qq unzip cmake3; wget https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-linux-x86_64.sh --output-document bazel-installer.sh; - sudo bash bazel-installer.sh; + travis_wait sudo bash bazel-installer.sh; fi - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-darwin-x86_64.sh; - sudo bash bazel-installer.sh; + travis_wait sudo bash bazel-installer.sh; fi script: - - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} .. + - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_C_FLAGS="${EXTRA_FLAGS}" -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS} ${EXTRA_CXX_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} .. - make - ctest -C ${BUILD_TYPE} --output-on-failure - bazel test -c dbg --define google_benchmark.have_regex=posix --announce_rc --verbose_failures --test_output=errors --keep_going //test/... diff --git a/lib/gbenchmark/AUTHORS b/lib/gbenchmark/AUTHORS index 45adb27ee5..912cbbc13c 100644 --- a/lib/gbenchmark/AUTHORS +++ b/lib/gbenchmark/AUTHORS @@ -9,15 +9,20 @@ # Please keep the list sorted. 
Albert Pretorius +Alex Steele +Andriy Berestovskyy Arne Beer Carto Christopher Seymour +Daniel Harvey David Coeurjolly +Deniz Evrenci Dirac Research Dominik Czarnota Eric Fiselier Eugene Zhuk Evgeny Safronov +Federico Ficarelli Felix Homann Google Inc. International Business Machines Corporation @@ -34,6 +39,7 @@ Maxim Vafin MongoDB Inc. Nick Hutchinson Oleksandr Sochka +Ori Livneh Paul Redmond Radoslav Yovchev Roman Lebedev diff --git a/lib/gbenchmark/BUILD.bazel b/lib/gbenchmark/BUILD.bazel index 35605ccb91..6ee69f2907 100644 --- a/lib/gbenchmark/BUILD.bazel +++ b/lib/gbenchmark/BUILD.bazel @@ -1,14 +1,38 @@ licenses(["notice"]) +config_setting( + name = "windows", + values = { + "cpu": "x64_windows", + }, + visibility = [":__subpackages__"], +) + cc_library( name = "benchmark", - srcs = glob([ - "src/*.cc", - "src/*.h", - ]), + srcs = glob( + [ + "src/*.cc", + "src/*.h", + ], + exclude = ["src/benchmark_main.cc"], + ), + hdrs = ["include/benchmark/benchmark.h"], + linkopts = select({ + ":windows": ["-DEFAULTLIB:shlwapi.lib"], + "//conditions:default": ["-pthread"], + }), + strip_include_prefix = "include", + visibility = ["//visibility:public"], +) + +cc_library( + name = "benchmark_main", + srcs = ["src/benchmark_main.cc"], hdrs = ["include/benchmark/benchmark.h"], strip_include_prefix = "include", visibility = ["//visibility:public"], + deps = [":benchmark"], ) cc_library( diff --git a/lib/gbenchmark/CMakeLists.txt b/lib/gbenchmark/CMakeLists.txt index 4c107937ef..9db1361212 100644 --- a/lib/gbenchmark/CMakeLists.txt +++ b/lib/gbenchmark/CMakeLists.txt @@ -1,22 +1,28 @@ -cmake_minimum_required (VERSION 2.8.12) - -project (benchmark) +cmake_minimum_required (VERSION 3.5.1) foreach(p + CMP0048 # OK to clear PROJECT_VERSION on project() CMP0054 # CMake 3.1 CMP0056 # export EXE_LINKER_FLAGS to try_run CMP0057 # Support no if() IN_LIST operator + CMP0063 # Honor visibility properties for all targets ) if(POLICY ${p}) cmake_policy(SET ${p} NEW) endif() endforeach() +project (benchmark CXX) + option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON) option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON) option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF) option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF) -option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF) +if(NOT MSVC) + option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF) +else() + set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE) +endif() option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. 
(Projects embedding benchmark may want to turn this OFF.)" ON) # Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which @@ -75,7 +81,7 @@ get_git_version(GIT_VERSION) # Tell the user what versions we are using string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION}) -message("-- Version: ${VERSION}") +message(STATUS "Version: ${VERSION}") # The version of the libraries set(GENERIC_LIB_VERSION ${VERSION}) @@ -90,7 +96,7 @@ if (BENCHMARK_BUILD_32_BITS) add_required_cxx_compiler_flag(-m32) endif() -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") +if (MSVC) # Turn compiler warnings up to 11 string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") @@ -99,6 +105,7 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") if (NOT BENCHMARK_ENABLE_EXCEPTIONS) add_cxx_compiler_flag(-EHs-) add_cxx_compiler_flag(-EHa-) + add_definitions(-D_HAS_EXCEPTIONS=0) endif() # Link time optimisation if (BENCHMARK_ENABLE_LTO) @@ -130,7 +137,6 @@ else() # Turn compiler warnings up to 11 add_cxx_compiler_flag(-Wall) - add_cxx_compiler_flag(-Wextra) add_cxx_compiler_flag(-Wshadow) add_cxx_compiler_flag(-Werror RELEASE) @@ -139,8 +145,20 @@ else() add_cxx_compiler_flag(-pedantic) add_cxx_compiler_flag(-pedantic-errors) add_cxx_compiler_flag(-Wshorten-64-to-32) - add_cxx_compiler_flag(-Wfloat-equal) add_cxx_compiler_flag(-fstrict-aliasing) + # Disable warnings regarding deprecated parts of the library while building + # and testing those parts of the library. + add_cxx_compiler_flag(-Wno-deprecated-declarations) + if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + # Intel silently ignores '-Wno-deprecated-declarations', + # warning no. 1786 must be explicitly disabled. + # See #631 for rationale. + add_cxx_compiler_flag(-wd1786) + endif() + # Disable deprecation warnings for release builds (when -Werror is enabled). 
+ add_cxx_compiler_flag(-Wno-deprecated RELEASE) + add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO) + add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL) if (NOT BENCHMARK_ENABLE_EXCEPTIONS) add_cxx_compiler_flag(-fno-exceptions) endif() @@ -152,7 +170,7 @@ else() endif() # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden # (because of deprecated overload) - add_cxx_compiler_flag(-wd654) + add_cxx_compiler_flag(-wd654) add_cxx_compiler_flag(-Wthread-safety) if (HAVE_CXX_FLAG_WTHREAD_SAFETY) cxx_feature_check(THREAD_SAFETY_ATTRIBUTES) @@ -166,10 +184,14 @@ else() add_definitions(-D_GNU_SOURCE=1) endif() + if (QNXNTO) + add_definitions(-D_QNX_SOURCE) + endif() + # Link time optimisation if (BENCHMARK_ENABLE_LTO) add_cxx_compiler_flag(-flto) - if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") find_program(GCC_AR gcc-ar) if (GCC_AR) set(CMAKE_AR ${GCC_AR}) @@ -178,7 +200,7 @@ else() if (GCC_RANLIB) set(CMAKE_RANLIB ${GCC_RANLIB}) endif() - elseif("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") + elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") include(llvm-toolchain) endif() endif() @@ -203,12 +225,12 @@ else() endif() if (BENCHMARK_USE_LIBCXX) - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") add_cxx_compiler_flag(-stdlib=libc++) elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") add_cxx_compiler_flag(-nostdinc++) - message("libc++ header path must be manually specified using CMAKE_CXX_FLAGS") + message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS") # Adding -nodefaultlibs directly to CMAKE__LINKER_FLAGS will break # configuration checks such as 'find_package(Threads)' list(APPEND BENCHMARK_CXX_LINKER_FLAGS -nodefaultlibs) @@ -216,7 +238,7 @@ if (BENCHMARK_USE_LIBCXX) # linker flags appear before all linker inputs and -lc++ must appear after. list(APPEND BENCHMARK_CXX_LIBRARIES c++) else() - message(FATAL "-DBENCHMARK_USE_LIBCXX:BOOL=ON is not supported for compiler") + message(FATAL_ERROR "-DBENCHMARK_USE_LIBCXX:BOOL=ON is not supported for compiler") endif() endif(BENCHMARK_USE_LIBCXX) @@ -234,6 +256,7 @@ if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX endif() cxx_feature_check(STEADY_CLOCK) # Ensure we have pthreads +set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) # Set up directories @@ -244,8 +267,10 @@ add_subdirectory(src) if (BENCHMARK_ENABLE_TESTING) enable_testing() - if (BENCHMARK_ENABLE_GTEST_TESTS) - include(HandleGTest) + if (BENCHMARK_ENABLE_GTEST_TESTS AND + NOT (TARGET gtest AND TARGET gtest_main AND + TARGET gmock AND TARGET gmock_main)) + include(GoogleTest) endif() add_subdirectory(test) endif() diff --git a/lib/gbenchmark/CONTRIBUTORS b/lib/gbenchmark/CONTRIBUTORS index 2f1999be74..b680efc8c4 100644 --- a/lib/gbenchmark/CONTRIBUTORS +++ b/lib/gbenchmark/CONTRIBUTORS @@ -23,17 +23,24 @@ # Please keep the list sorted. 
Albert Pretorius +Alex Steele +Andriy Berestovskyy Arne Beer Billy Robert O'Neal III Chris Kennelly Christopher Seymour +Cyrille Faucheux +Daniel Harvey David Coeurjolly +Deniz Evrenci Dominic Hamon Dominik Czarnota Eric Fiselier Eugene Zhuk Evgeny Safronov +Federico Ficarelli Felix Homann +Hannes Hauswedell Ismael Jimenez Martinez Jern-Kuan Leong JianXiong Zhou @@ -48,6 +55,7 @@ Matt Clarkson Maxim Vafin Nick Hutchinson Oleksandr Sochka +Ori Livneh Pascal Leroy Paul Redmond Pierre Phaneuf diff --git a/lib/gbenchmark/README.md b/lib/gbenchmark/README.md index c8f8c01f0f..45e4158843 100644 --- a/lib/gbenchmark/README.md +++ b/lib/gbenchmark/README.md @@ -1,82 +1,137 @@ -# benchmark +# Benchmark [![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark) [![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master) [![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark) [![slackin](https://slackin-iqtfqnpzxd.now.sh/badge.svg)](https://slackin-iqtfqnpzxd.now.sh/) -A library to support the benchmarking of functions, similar to unit-tests. -Discussion group: https://groups.google.com/d/forum/benchmark-discuss +A library to benchmark code snippets, similar to unit tests. Example: -IRC channel: https://freenode.net #googlebenchmark +```c++ +#include <benchmark/benchmark.h> + +static void BM_SomeFunction(benchmark::State& state) { + // Perform setup here + for (auto _ : state) { + // This code gets timed + SomeFunction(); + } +} +// Register the function as a benchmark +BENCHMARK(BM_SomeFunction); +// Run the benchmark +BENCHMARK_MAIN(); +``` + +To get started, see [Requirements](#requirements) and +[Installation](#installation). See [Usage](#usage) for a full example and the +[User Guide](#user-guide) for a more comprehensive feature overview. + +It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/googletest/docs/primer.md) +as some of the structural aspects of the APIs are similar. + +### Resources -[Known issues and common problems](#known-issues) +[Discussion group](https://groups.google.com/d/forum/benchmark-discuss) + +IRC channel: [freenode](https://freenode.net) #googlebenchmark [Additional Tooling Documentation](docs/tools.md) [Assembly Testing Documentation](docs/AssemblyTests.md) +## Requirements + +The library can be used with C++03. However, it requires C++11 to build, +including compiler and standard library support. + +The following minimum versions are required to build the library: + +* GCC 4.8 +* Clang 3.4 +* Visual Studio 2013 +* Intel 2015 Update 1 -## Building +## Installation -The basic steps for configuring and building the library look like this: +This describes the installation process using cmake. As prerequisites, you'll +need git and cmake installed. + +_See [dependencies.md](dependencies.md) for more details regarding supported +versions of build tools._ ```bash +# Check out the library. $ git clone https://github.com/google/benchmark.git -# Benchmark requires GTest as a dependency. Add the source tree as a subdirectory. +# Benchmark requires Google Test as a dependency. Add the source tree as a subdirectory. $ git clone https://github.com/google/googletest.git benchmark/googletest +# Make a build directory to place the build output.
$ mkdir build && cd build -$ cmake -G [options] ../benchmark -# Assuming a makefile generator was used +# Generate a Makefile with cmake. +# Use cmake -G <generator> to generate a different file type. +$ cmake ../benchmark +# Build the library. $ make ``` +This builds the `benchmark` and `benchmark_main` libraries and tests. +On a Unix system, the build directory should now look something like this: + +``` +/benchmark +/build + /src + /libbenchmark.a + /libbenchmark_main.a + /test + ... +``` + +Next, you can run the tests to check the build. + +```bash +$ make test +``` + +If you want to install the library globally, also run: + +``` +sudo make install +``` -Note that Google Benchmark requires GTest to build and run the tests. This -dependency can be provided three ways: +Note that Google Benchmark requires Google Test to build and run the tests. This +dependency can be provided two ways: -* Checkout the GTest sources into `benchmark/googletest`. +* Checkout the Google Test sources into `benchmark/googletest` as above. * Otherwise, if `-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON` is specified during configuration, the library will automatically download and build any required dependencies. -* Otherwise, if nothing is done, CMake will use `find_package(GTest REQUIRED)` - to resolve the required GTest dependency. If you do not wish to build and run the tests, add `-DBENCHMARK_ENABLE_GTEST_TESTS=OFF` to `CMAKE_ARGS`. +### Debug vs Release -## Installation Guide - -For Ubuntu and Debian Based System - -First make sure you have git and cmake installed (If not please install it) +By default, benchmark builds as a debug library. You will see a warning in the +output when this is the case. To build it as a release library instead, use: ``` -sudo apt-get install git -sudo apt-get install cmake +cmake -DCMAKE_BUILD_TYPE=Release ``` -Now, let's clone the repository and build it +To enable link-time optimisation, use ``` -git clone https://github.com/google/benchmark.git -cd benchmark -mkdir build -cd build -cmake .. -DCMAKE_BUILD_TYPE=RELEASE -make +cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true ``` -We need to install the library globally now +If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake +cache variables, if autodetection fails. -``` -sudo make install -``` +If you are using clang, you may need to set `LLVMAR_EXECUTABLE`, +`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables. -Now you have google/benchmark installed in your machine -Note: Don't forget to link to pthread library while building -## Stable and Experimental Library Versions +### Stable and Experimental Library Versions The main branch contains the latest stable version of the benchmarking library; the API of which can be considered largely stable, with source breaking changes @@ -88,10 +143,11 @@ to use, test, and provide feedback on the new features are encouraged to try this branch. However, this branch provides no stability guarantees and reserves the right to change and break the API at any time. - -## Example usage +## Usage ### Basic usage -Define a function that executes the code to be measured. +Define a function that executes the code to measure, register it as a benchmark +function using the `BENCHMARK` macro, and ensure an appropriate `main` function +is available: ```c++ #include <benchmark/benchmark.h> @@ -114,11 +170,218 @@ BENCHMARK(BM_StringCopy); BENCHMARK_MAIN(); ``` -Don't forget to inform your linker to add benchmark library e.g. through `-lbenchmark` compilation flag.
+To run the benchmark, compile and link against the `benchmark` library +(libbenchmark.a/.so). If you followed the build steps above, this +library will be under the build directory you created. + +```bash +# Example on Linux after running the build steps above. Assumes the +# `benchmark` and `build` directories are under the current directory. +$ g++ -std=c++11 -isystem benchmark/include -Lbuild/src -lpthread \ + -lbenchmark mybenchmark.cc -o mybenchmark +``` + +Alternatively, link against the `benchmark_main` library and remove +`BENCHMARK_MAIN();` above to get the same behavior. + +The compiled executable will run all benchmarks by default. Pass the `--help` +flag for option information or see the guide below. + +### Platform-specific instructions + +When the library is built using GCC it is necessary to link with the pthread +library due to how GCC implements `std::thread`. Failing to link to pthread will +lead to runtime exceptions (unless you're using libc++), not linker errors. See +[issue #67](https://github.com/google/benchmark/issues/67) for more details. You +can link to pthread by adding `-pthread` to your linker command. Note, you can +also use `-lpthread`, but there are potential issues with ordering of command +line parameters if you use that. + +If you're running benchmarks on Windows, the shlwapi library (`-lshlwapi`) is +also required. + +If you're running benchmarks on Solaris, you'll want the kstat library linked in +too (`-lkstat`). + +## User Guide + +### Command Line +[Output Formats](#output-formats) + +[Output Files](#output-files) + +[Running a Subset of Benchmarks](#running-a-subset-of-benchmarks) + +[Result Comparison](#result-comparison) + +### Library +[Runtime and Reporting Considerations](#runtime-and-reporting-considerations) + +[Passing Arguments](#passing-arguments) + +[Calculating Asymptotic Complexity](#asymptotic-complexity) + +[Templated Benchmarks](#templated-benchmarks) + +[Fixtures](#fixtures) + +[Custom Counters](#custom-counters) + +[Multithreaded Benchmarks](#multithreaded-benchmarks) + +[CPU Timers](#cpu-timers) + +[Manual Timing](#manual-timing) + +[Setting the Time Unit](#setting-the-time-unit) + +[Preventing Optimization](#preventing-optimization) + +[Reporting Statistics](#reporting-statistics) + +[Custom Statistics](#custom-statistics) + +[Using RegisterBenchmark](#using-register-benchmark) + +[Exiting with an Error](#exiting-with-an-error) + +[A Faster KeepRunning Loop](#a-faster-keep-running-loop) + +[Disabling CPU Frequency Scaling](#disabling-cpu-frequency-scaling) + + + +### Output Formats + +The library supports multiple output formats. Use the +`--benchmark_format=<console|json|csv>` flag to set the format type. `console` +is the default format. + +The Console format is intended to be a human readable format. By default +the format generates color output. Context is output on stderr and the +tabular data on stdout. Example tabular output looks like: +``` +Benchmark Time(ns) CPU(ns) Iterations +---------------------------------------------------------------------- +BM_SetInsert/1024/1 28928 29349 23853 133.097kB/s 33.2742k items/s +BM_SetInsert/1024/8 32065 32913 21375 949.487kB/s 237.372k items/s +BM_SetInsert/1024/10 33157 33648 21431 1.13369MB/s 290.225k items/s +``` + +The JSON format outputs human readable json split into two top level attributes. +The `context` attribute contains information about the run in general, including +information about the CPU and the date. +The `benchmarks` attribute contains a list of every benchmark run.
Example json +output looks like: +```json +{ + "context": { + "date": "2015/03/17-18:40:25", + "num_cpus": 40, + "mhz_per_cpu": 2801, + "cpu_scaling_enabled": false, + "build_type": "debug" + }, + "benchmarks": [ + { + "name": "BM_SetInsert/1024/1", + "iterations": 94877, + "real_time": 29275, + "cpu_time": 29836, + "bytes_per_second": 134066, + "items_per_second": 33516 + }, + { + "name": "BM_SetInsert/1024/8", + "iterations": 21609, + "real_time": 32317, + "cpu_time": 32429, + "bytes_per_second": 986770, + "items_per_second": 246693 + }, + { + "name": "BM_SetInsert/1024/10", + "iterations": 21393, + "real_time": 32724, + "cpu_time": 33355, + "bytes_per_second": 1199226, + "items_per_second": 299807 + } + ] +} +``` + +The CSV format outputs comma-separated values. The `context` is output on stderr +and the CSV itself on stdout. Example CSV output looks like: +``` +name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label +"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942, +"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115, +"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06, +``` + + + +### Output Files + +Write benchmark results to a file with the `--benchmark_out=<filename>` option. +Specify the output format with `--benchmark_out_format={json|console|csv}`. Note that specifying +`--benchmark_out` does not suppress the console output. + + + +### Running a Subset of Benchmarks + +The `--benchmark_filter=<regex>` option can be used to only run the benchmarks +which match the specified `<regex>`. For example: + +```bash +$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32 +Run on (1 X 2300 MHz CPU ) +2016-06-25 19:34:24 +Benchmark Time CPU Iterations +---------------------------------------------------- +BM_memcpy/32 11 ns 11 ns 79545455 +BM_memcpy/32k 2181 ns 2185 ns 324074 +BM_memcpy/32 12 ns 12 ns 54687500 +BM_memcpy/32k 1834 ns 1837 ns 357143 +``` + + + +### Result comparison + +It is possible to compare the benchmarking results. See [Additional Tooling Documentation](docs/tools.md). + + + +### Runtime and Reporting Considerations + +When the benchmark binary is executed, each benchmark function is run serially. +The number of iterations to run is determined dynamically by running the +benchmark a few times and measuring the time taken and ensuring that the +ultimate result will be statistically stable. As such, faster benchmark +functions will be run for more iterations than slower benchmark functions, and +the number of iterations is thus reported. + +In all cases, the number of iterations for which the benchmark is run is +governed by the amount of time the benchmark takes. Concretely, the number of +iterations is at least one, not more than 1e9, until CPU time is greater than +the minimum time, or the wallclock time is 5x minimum time. The minimum time is +set per benchmark by calling `MinTime` on the registered benchmark object. + +Average timings are then reported over the iterations run. If multiple +repetitions are requested using the `--benchmark_repetitions` command-line +option, or at registration time, the benchmark function will be run several +times and statistical results across these repetitions will also be reported. + +As well as the per-benchmark entries, a preamble in the report will include +information about the machine on which the benchmarks are run. -The benchmark library will reporting the timing for the code within the `for(...)` loop.
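For example, a minimal sketch of how these options look at registration time (reusing the `BM_StringCopy` benchmark defined earlier; the specific values here are illustrative only, not recommendations):

```c++
// Require at least 2 seconds of benchmark time per repetition, and repeat
// the whole measurement 4 times so that aggregate statistics are reported.
BENCHMARK(BM_StringCopy)->MinTime(2.0)->Repetitions(4);
```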
+ + +### Passing Arguments -### Passing arguments Sometimes a family of benchmarks can be implemented with just one routine that takes an extra argument to specify which one of the family of benchmarks to run. For example, the following code defines a family of benchmarks for @@ -205,7 +468,31 @@ static void CustomArguments(benchmark::internal::Benchmark* b) { BENCHMARK(BM_SetInsert)->Apply(CustomArguments); ``` -### Calculate asymptotic complexity (Big O) +#### Passing Arbitrary Arguments to a Benchmark + +In C++11 it is possible to define a benchmark that takes an arbitrary number +of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)` +macro creates a benchmark that invokes `func` with the `benchmark::State` as +the first argument followed by the specified `args...`. +The `test_case_name` is appended to the name of the benchmark and +should describe the values passed. + +```c++ +template <class ...ExtraArgs> +void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) { + [...] +} +// Registers a benchmark named "BM_takes_args/int_string_test" that passes +// the specified values to `extra_args`. +BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc")); +``` +Note that elements of `...args` may refer to global variables. Users should +avoid modifying global state inside of a benchmark. + + + +### Calculating Asymptotic Complexity (Big O) + Asymptotic complexity might be calculated for a family of benchmarks. The following code will calculate the coefficient for the high-order term in the running time and the normalized root-mean square error of string comparison. @@ -236,16 +523,18 @@ that might be used to customize high-order term calculation. ```c++ BENCHMARK(BM_StringCompare)->RangeMultiplier(2) - ->Range(1<<10, 1<<18)->Complexity([](int n)->double{return n; }); + ->Range(1<<10, 1<<18)->Complexity([](int64_t n)->double{return n; }); ``` -### Templated benchmarks -Templated benchmarks work the same way: This example produces and consumes -messages of size `sizeof(v)` `range_x` times. It also outputs throughput in the -absence of multiprogramming. + + +### Templated Benchmarks + +This example produces and consumes messages of size `sizeof(v)` `range_x` +times. It also outputs throughput in the absence of multiprogramming. ```c++ -template <class Q> int BM_Sequential(benchmark::State& state) { +template <class Q> void BM_Sequential(benchmark::State& state) { Q q; typename Q::value_type v; for (auto _ : state) { @@ -273,110 +562,210 @@ Three macros are provided for adding benchmark templates. ```c++ #define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters. #define BENCHMARK_TEMPLATE1(func, arg) #define BENCHMARK_TEMPLATE2(func, arg1, arg2) ``` -### A Faster KeepRunning loop + -In C++11 mode, a ranged-based for loop should be used in preference to -the `KeepRunning` loop for running the benchmarks. For example: +### Fixtures + +Fixture tests are created by first defining a type that derives from +`::benchmark::Fixture` and then creating/registering the tests using the +following macros: + +* `BENCHMARK_F(ClassName, Method)` +* `BENCHMARK_DEFINE_F(ClassName, Method)` +* `BENCHMARK_REGISTER_F(ClassName, Method)` + +For Example: ```c++ -static void BM_Fast(benchmark::State &state) { - for (auto _ : state) { - FastOperation(); +class MyFixture : public benchmark::Fixture { +public: + void SetUp(const ::benchmark::State& state) { + } + + void TearDown(const ::benchmark::State& state) { + } +}; + +BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) { + for (auto _ : st) { + ...
} } -BENCHMARK(BM_Fast); + +BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) { + for (auto _ : st) { + ... + } +} +/* BarTest is NOT registered */ +BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2); +/* BarTest is now registered */ ``` -The reason the ranged-for loop is faster than using `KeepRunning`, is -because `KeepRunning` requires a memory load and store of the iteration count -ever iteration, whereas the ranged-for variant is able to keep the iteration count -in a register. +#### Templated Fixtures -For example, an empty inner loop of using the ranged-based for method looks like: +You can also create templated fixtures by using the following macros: -```asm -# Loop Init - mov rbx, qword ptr [r14 + 104] - call benchmark::State::StartKeepRunning() - test rbx, rbx - je .LoopEnd -.LoopHeader: # =>This Inner Loop Header: Depth=1 - add rbx, -1 - jne .LoopHeader -.LoopEnd: -``` +* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)` +* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)` -Compared to an empty `KeepRunning` loop, which looks like: +For example: +```c++ +template <typename T> +class MyFixture : public benchmark::Fixture {}; -```asm -.LoopHeader: # in Loop: Header=BB0_3 Depth=1 - cmp byte ptr [rbx], 1 - jne .LoopInit -.LoopBody: # =>This Inner Loop Header: Depth=1 - mov rax, qword ptr [rbx + 8] - lea rcx, [rax + 1] - mov qword ptr [rbx + 8], rcx - cmp rax, qword ptr [rbx + 104] - jb .LoopHeader - jmp .LoopEnd -.LoopInit: - mov rdi, rbx - call benchmark::State::StartKeepRunning() - jmp .LoopBody -.LoopEnd: +BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) { + for (auto _ : st) { + ... + } +} + +BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) { + for (auto _ : st) { + ... + } +} + +BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2); ``` -Unless C++03 compatibility is required, the ranged-for variant of writing -the benchmark loop should be preferred. + -## Passing arbitrary arguments to a benchmark -In C++11 it is possible to define a benchmark that takes an arbitrary number -of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)` -macro creates a benchmark that invokes `func` with the `benchmark::State` as -the first argument followed by the specified `args...`. -The `test_case_name` is appended to the name of the benchmark and -should describe the values passed. +### Custom Counters + +You can add your own counters with user-defined names. The example below +will add columns "Foo", "Bar" and "Baz" in its output: ```c++ -template -void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) { - [...] +static void UserCountersExample1(benchmark::State& state) { + double numFoos = 0, numBars = 0, numBazs = 0; + for (auto _ : state) { + // ... count Foo,Bar,Baz events + } + state.counters["Foo"] = numFoos; + state.counters["Bar"] = numBars; + state.counters["Baz"] = numBazs; } -// Registers a benchmark named "BM_takes_args/int_string_test" that passes -// the specified values to `extra_args`. -BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc")); ``` -Note that elements of `...args` may refer to global variables. Users should -avoid modifying global state inside of a benchmark. -## Using RegisterBenchmark(name, fn, args...) +The `state.counters` object is a `std::map` with `std::string` keys +and `Counter` values. The latter is a `double`-like class, via an implicit +conversion to `double&`.
Thus you can use all of the standard arithmetic +assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter. -The `RegisterBenchmark(name, func, args...)` function provides an alternative -way to create and register benchmarks. -`RegisterBenchmark(name, func, args...)` creates, registers, and returns a -pointer to a new benchmark with the specified `name` that invokes -`func(st, args...)` where `st` is a `benchmark::State` object. +In multithreaded benchmarks, each counter is set on the calling thread only. +When the benchmark finishes, the counters from each thread will be summed; +the resulting sum is the value which will be shown for the benchmark. -Unlike the `BENCHMARK` registration macros, which can only be used at the global -scope, the `RegisterBenchmark` can be called anywhere. This allows for -benchmark tests to be registered programmatically. +The `Counter` constructor accepts three parameters: the value as a `double`; +a bit flag which allows you to show counters as rates, and/or as per-thread +iteration, and/or as per-thread averages, and/or iteration invariants; +and a flag specifying the 'unit' - i.e. is 1k a 1000 (default, +`benchmark::Counter::OneK::kIs1000`), or 1024 +(`benchmark::Counter::OneK::kIs1024`)? -Additionally `RegisterBenchmark` allows any callable object to be registered -as a benchmark. Including capturing lambdas and function objects. +```c++ + // sets a simple counter + state.counters["Foo"] = numFoos; + + // Set the counter as a rate. It will be presented divided + // by the duration of the benchmark. + state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate); + + // Set the counter as a thread-average quantity. It will + // be presented divided by the number of threads. + state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads); + + // There's also a combined flag: + state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate); + + // This says that we process with the rate of state.range(0) bytes every iteration: + state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024); +``` + +When you're compiling in C++11 mode or later you can use `insert()` with +`std::initializer_list`: -For Example: ```c++ -auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ }; + // With C++11, this can be done: + state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}}); + // ... instead of: + state.counters["Foo"] = numFoos; + state.counters["Bar"] = numBars; + state.counters["Baz"] = numBazs; +``` -int main(int argc, char** argv) { - for (auto& test_input : { /* ... */ }) - benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input); - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); -} +#### Counter Reporting + +When using the console reporter, by default, user counters are printed at +the end after the table, the same way as ``bytes_processed`` and +``items_processed``. This is best for cases in which there are few counters, +or where there are only a couple of lines per benchmark. Here's an example of +the default output: + +``` +------------------------------------------------------------------------------ +Benchmark Time CPU Iterations UserCounters...
+------------------------------------------------------------------------------ +BM_UserCounter/threads:8 2248 ns 10277 ns 68808 Bar=16 Bat=40 Baz=24 Foo=8 +BM_UserCounter/threads:1 9797 ns 9788 ns 71523 Bar=2 Bat=5 Baz=3 Foo=1024m +BM_UserCounter/threads:2 4924 ns 9842 ns 71036 Bar=4 Bat=10 Baz=6 Foo=2 +BM_UserCounter/threads:4 2589 ns 10284 ns 68012 Bar=8 Bat=20 Baz=12 Foo=4 +BM_UserCounter/threads:8 2212 ns 10287 ns 68040 Bar=16 Bat=40 Baz=24 Foo=8 +BM_UserCounter/threads:16 1782 ns 10278 ns 68144 Bar=32 Bat=80 Baz=48 Foo=16 +BM_UserCounter/threads:32 1291 ns 10296 ns 68256 Bar=64 Bat=160 Baz=96 Foo=32 +BM_UserCounter/threads:4 2615 ns 10307 ns 68040 Bar=8 Bat=20 Baz=12 Foo=4 +BM_Factorial 26 ns 26 ns 26608979 40320 +BM_Factorial/real_time 26 ns 26 ns 26587936 40320 +BM_CalculatePiRange/1 16 ns 16 ns 45704255 0 +BM_CalculatePiRange/8 73 ns 73 ns 9520927 3.28374 +BM_CalculatePiRange/64 609 ns 609 ns 1140647 3.15746 +BM_CalculatePiRange/512 4900 ns 4901 ns 142696 3.14355 +``` + +If this doesn't suit you, you can print each counter as a table column by +passing the flag `--benchmark_counters_tabular=true` to the benchmark +application. This is best for cases in which there are a lot of counters, or +a lot of lines per individual benchmark. Note that this will trigger a +reprinting of the table header any time the counter set changes between +individual benchmarks. Here's an example of corresponding output when +`--benchmark_counters_tabular=true` is passed: + +``` +--------------------------------------------------------------------------------------- +Benchmark Time CPU Iterations Bar Bat Baz Foo +--------------------------------------------------------------------------------------- +BM_UserCounter/threads:8 2198 ns 9953 ns 70688 16 40 24 8 +BM_UserCounter/threads:1 9504 ns 9504 ns 73787 2 5 3 1 +BM_UserCounter/threads:2 4775 ns 9550 ns 72606 4 10 6 2 +BM_UserCounter/threads:4 2508 ns 9951 ns 70332 8 20 12 4 +BM_UserCounter/threads:8 2055 ns 9933 ns 70344 16 40 24 8 +BM_UserCounter/threads:16 1610 ns 9946 ns 70720 32 80 48 16 +BM_UserCounter/threads:32 1192 ns 9948 ns 70496 64 160 96 32 +BM_UserCounter/threads:4 2506 ns 9949 ns 70332 8 20 12 4 +-------------------------------------------------------------- +Benchmark Time CPU Iterations +-------------------------------------------------------------- +BM_Factorial 26 ns 26 ns 26392245 40320 +BM_Factorial/real_time 26 ns 26 ns 26494107 40320 +BM_CalculatePiRange/1 15 ns 15 ns 45571597 0 +BM_CalculatePiRange/8 74 ns 74 ns 9450212 3.28374 +BM_CalculatePiRange/64 595 ns 595 ns 1173901 3.15746 +BM_CalculatePiRange/512 4752 ns 4752 ns 147380 3.14355 +BM_CalculatePiRange/4k 37970 ns 37972 ns 18453 3.14184 +BM_CalculatePiRange/32k 303733 ns 303744 ns 2305 3.14162 +BM_CalculatePiRange/256k 2434095 ns 2434186 ns 288 3.1416 +BM_CalculatePiRange/1024k 9721140 ns 9721413 ns 71 3.14159 +BM_CalculatePi/threads:8 2255 ns 9943 ns 70936 ``` +Note above the additional header printed when the benchmark changes from +``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does +not have the same counter set as ``BM_UserCounter``. 
+ + + +### Multithreaded Benchmarks -### Multithreaded benchmarks In a multithreaded test (benchmark invoked by multiple threads simultaneously), it is guaranteed that none of the threads will start until all have reached the start of the benchmark loop, and all will have finished before any thread @@ -409,8 +798,78 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime(); Without `UseRealTime`, CPU time is used by default. + + +### CPU Timers + +By default, the CPU timer only measures the time spent by the main thread. +If the benchmark itself uses threads internally, this measurement may not +be what you are looking for. Instead, there is a way to measure the total +CPU usage of the process, by all the threads. + +```c++ +void callee(int i); + +static void MyMain(int size) { +#pragma omp parallel for + for(int i = 0; i < size; i++) + callee(i); +} + +static void BM_OpenMP(benchmark::State& state) { + for (auto _ : state) + MyMain(state.range(0)); +} + +// Measure the time spent by the main thread, use it to decide for how long to +// run the benchmark loop. Depending on the internal implementation, this may +// measure anywhere from near-zero (the overhead spent before/after work +// handoff to worker thread[s]) to the whole single-thread time. +BENCHMARK(BM_OpenMP)->Range(8, 8<<10); + +// Measure the user-visible time, the wall clock (literally, the time that +// has passed on the clock on the wall), use it to decide for how long to +// run the benchmark loop. This will always be meaningful, and will match the +// time spent by the main thread in the single-threaded case, in general decreasing +// with the number of internal threads doing the work. +BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime(); + +// Measure the total CPU consumption, use it to decide for how long to +// run the benchmark loop. This will always measure to no less than the +// time spent by the main thread in the single-threaded case. +BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime(); + +// A mixture of the last two. Measure the total CPU consumption, but use the +// wall clock to decide for how long to run the benchmark loop. +BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime(); +``` + +#### Controlling Timers + +Normally, the entire duration of the work loop (`for (auto _ : state) {}`) +is measured. But sometimes, it is necessary to do some work inside of +that loop, every iteration, but without counting that time to the benchmark time. +That is possible, although it is not recommended, since it has high overhead. + +```c++ +static void BM_SetInsert_With_Timer_Control(benchmark::State& state) { + std::set<int> data; + for (auto _ : state) { + state.PauseTiming(); // Stop timers. They will not count until they are resumed. + data = ConstructRandomSet(state.range(0)); // Do something that should not be measured + state.ResumeTiming(); // And resume timers. They are now counting again. + // The rest will be measured. + for (int j = 0; j < state.range(1); ++j) + data.insert(RandomNumber()); + } +} +BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}}); ``` + + + +### Manual Timing -## Manual timing For benchmarking something for which neither CPU time nor real-time are correct or accurate enough, completely manual timing is supported using the `UseManualTime` function.
@@ -448,7 +907,22 @@ static void BM_ManualTiming(benchmark::State& state) { BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime(); } -### Preventing optimisation + + +### Setting the Time Unit + +If a benchmark runs a few milliseconds it may be hard to visually compare the +measured times, since the output data is given in nanoseconds by default. In +order to set the time unit manually, you can specify it: + +```c++ +BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); +``` + + + +### Preventing Optimization + To prevent a value or expression from being optimized away by the compiler the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()` functions can be used. @@ -506,24 +980,10 @@ static void BM_vector_push_back(benchmark::State& state) { Note that `ClobberMemory()` is only available for GNU or MSVC based compilers. -### Set time unit manually -If a benchmark runs a few milliseconds it may be hard to visually compare the -measured times, since the output data is given in nanoseconds per default. In -order to manually set the time unit, you can specify it manually: - -```c++ -BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); -``` + -## Controlling number of iterations -In all cases, the number of iterations for which the benchmark is run is -governed by the amount of time the benchmark takes. Concretely, the number of -iterations is at least one, not more than 1e9, until CPU time is greater than -the minimum time, or the wallclock time is 5x minimum time. The minimum time is -set as a flag `--benchmark_min_time` or per benchmark by calling `MinTime` on -the registered benchmark object. +### Statistics: Reporting the Mean, Median and Standard Deviation of Repeated Benchmarks -## Reporting the mean, median and standard deviation by repeated benchmarks By default each benchmark is run once and that single result is reported. However benchmarks are often noisy and a single result may not be representative of the overall behavior. For this reason it's possible to repeatedly rerun the @@ -534,17 +994,28 @@ The number of runs of each benchmark is specified globally by the `Repetitions` on the registered benchmark object. When a benchmark is run more than once the mean, median and standard deviation of the runs will be reported. -Additionally the `--benchmark_report_aggregates_only={true|false}` flag or -`ReportAggregatesOnly(bool)` function can be used to change how repeated tests -are reported. By default the result of each repeated run is reported. When this -option is `true` only the mean, median and standard deviation of the runs is reported. -Calling `ReportAggregatesOnly(bool)` on a registered benchmark object overrides -the value of the flag for that benchmark. +Additionally the `--benchmark_report_aggregates_only={true|false}`, +`--benchmark_display_aggregates_only={true|false}` flags or +`ReportAggregatesOnly(bool)`, `DisplayAggregatesOnly(bool)` functions can be +used to change how repeated tests are reported. By default the result of each +repeated run is reported. When the `report aggregates only` option is `true`, +only the aggregates (i.e. mean, median and standard deviation, maybe complexity +measurements if they were requested) of the runs are reported, to both +reporters: the standard output (console) and the file. +However, when only the `display aggregates only` option is `true`, +only the aggregates are displayed in the standard output, while the file +output still contains everything.
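For example, a minimal sketch of requesting repetitions and aggregate-only reporting at registration time (reusing `BM_StringCopy` from earlier; the repetition count is illustrative only):

```c++
// Run 10 repetitions and report only the aggregates (mean, median,
// standard deviation) to both the console and any output file.
BENCHMARK(BM_StringCopy)->Repetitions(10)->ReportAggregatesOnly(true);
```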
+Calling `ReportAggregatesOnly(bool)` / `DisplayAggregatesOnly(bool)` on a +registered benchmark object overrides the value of the appropriate flag for that +benchmark. + + + +### Custom Statistics -## User-defined statistics for repeated benchmarks While having mean, median and standard deviation is nice, this may not be -enough for everyone. For example you may want to know what is the largest -observation, e.g. because you have some real-time constraints. This is easy. +enough for everyone. For example you may want to know what the largest +observation is, e.g. because you have some real-time constraints. This is easy. The following code will specify a custom statistic to be calculated, defined by a lambda function. @@ -564,188 +1035,38 @@ BENCHMARK(BM_spin_empty) ->Arg(512); ``` -## Fixtures -Fixture tests are created by -first defining a type that derives from `::benchmark::Fixture` and then -creating/registering the tests using the following macros: + -* `BENCHMARK_F(ClassName, Method)` -* `BENCHMARK_DEFINE_F(ClassName, Method)` -* `BENCHMARK_REGISTER_F(ClassName, Method)` +### Using RegisterBenchmark(name, fn, args...) -For Example: +The `RegisterBenchmark(name, func, args...)` function provides an alternative +way to create and register benchmarks. +`RegisterBenchmark(name, func, args...)` creates, registers, and returns a +pointer to a new benchmark with the specified `name` that invokes +`func(st, args...)` where `st` is a `benchmark::State` object. + +Unlike the `BENCHMARK` registration macros, which can only be used at the global +scope, the `RegisterBenchmark` can be called anywhere. This allows for +benchmark tests to be registered programmatically. + +Additionally `RegisterBenchmark` allows any callable object to be registered +as a benchmark. Including capturing lambdas and function objects. +For Example: ```c++ -class MyFixture : public benchmark::Fixture {}; +auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ }; -BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) { - for (auto _ : st) { - ... - } +int main(int argc, char** argv) { + for (auto& test_input : { /* ... */ }) + benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input); + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); } +``` -BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) { - for (auto _ : st) { - ... - } -} -/* BarTest is NOT registered */ -BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2); -/* BarTest is now registered */ -``` - -### Templated fixtures -Also you can create templated fixture by using the following macros: - -* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)` -* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)` - -For example: -```c++ -template -class MyFixture : public benchmark::Fixture {}; - -BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) { - for (auto _ : st) { - ... - } -} - -BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) { - for (auto _ : st) { - ... - } -} - -BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2); -``` - -## User-defined counters - -You can add your own counters with user-defined names. The example below -will add columns "Foo", "Bar" and "Baz" in its output: - -```c++ -static void UserCountersExample1(benchmark::State& state) { - double numFoos = 0, numBars = 0, numBazs = 0; - for (auto _ : state) { - // ... 
count Foo,Bar,Baz events - } - state.counters["Foo"] = numFoos; - state.counters["Bar"] = numBars; - state.counters["Baz"] = numBazs; -} -``` - -The `state.counters` object is a `std::map` with `std::string` keys -and `Counter` values. The latter is a `double`-like class, via an implicit -conversion to `double&`. Thus you can use all of the standard arithmetic -assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter. - -In multithreaded benchmarks, each counter is set on the calling thread only. -When the benchmark finishes, the counters from each thread will be summed; -the resulting sum is the value which will be shown for the benchmark. - -The `Counter` constructor accepts two parameters: the value as a `double` -and a bit flag which allows you to show counters as rates and/or as -per-thread averages: - -```c++ - // sets a simple counter - state.counters["Foo"] = numFoos; - - // Set the counter as a rate. It will be presented divided - // by the duration of the benchmark. - state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate); - - // Set the counter as a thread-average quantity. It will - // be presented divided by the number of threads. - state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads); - - // There's also a combined flag: - state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate); -``` - -When you're compiling in C++11 mode or later you can use `insert()` with -`std::initializer_list`: - -```c++ - // With C++11, this can be done: - state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}}); - // ... instead of: - state.counters["Foo"] = numFoos; - state.counters["Bar"] = numBars; - state.counters["Baz"] = numBazs; -``` - -### Counter reporting - -When using the console reporter, by default, user counters are are printed at -the end after the table, the same way as ``bytes_processed`` and -``items_processed``. This is best for cases in which there are few counters, -or where there are only a couple of lines per benchmark. Here's an example of -the default output: - -``` ------------------------------------------------------------------------------- -Benchmark Time CPU Iterations UserCounters... ------------------------------------------------------------------------------- -BM_UserCounter/threads:8 2248 ns 10277 ns 68808 Bar=16 Bat=40 Baz=24 Foo=8 -BM_UserCounter/threads:1 9797 ns 9788 ns 71523 Bar=2 Bat=5 Baz=3 Foo=1024m -BM_UserCounter/threads:2 4924 ns 9842 ns 71036 Bar=4 Bat=10 Baz=6 Foo=2 -BM_UserCounter/threads:4 2589 ns 10284 ns 68012 Bar=8 Bat=20 Baz=12 Foo=4 -BM_UserCounter/threads:8 2212 ns 10287 ns 68040 Bar=16 Bat=40 Baz=24 Foo=8 -BM_UserCounter/threads:16 1782 ns 10278 ns 68144 Bar=32 Bat=80 Baz=48 Foo=16 -BM_UserCounter/threads:32 1291 ns 10296 ns 68256 Bar=64 Bat=160 Baz=96 Foo=32 -BM_UserCounter/threads:4 2615 ns 10307 ns 68040 Bar=8 Bat=20 Baz=12 Foo=4 -BM_Factorial 26 ns 26 ns 26608979 40320 -BM_Factorial/real_time 26 ns 26 ns 26587936 40320 -BM_CalculatePiRange/1 16 ns 16 ns 45704255 0 -BM_CalculatePiRange/8 73 ns 73 ns 9520927 3.28374 -BM_CalculatePiRange/64 609 ns 609 ns 1140647 3.15746 -BM_CalculatePiRange/512 4900 ns 4901 ns 142696 3.14355 -``` - -If this doesn't suit you, you can print each counter as a table column by -passing the flag `--benchmark_counters_tabular=true` to the benchmark -application. This is best for cases in which there are a lot of counters, or -a lot of lines per individual benchmark. 
Note that this will trigger a
-reprinting of the table header any time the counter set changes between
-individual benchmarks. Here's an example of corresponding output when
-`--benchmark_counters_tabular=true` is passed:
-
-```
----------------------------------------------------------------------------------------
-Benchmark                        Time           CPU Iterations    Bar   Bat   Baz   Foo
----------------------------------------------------------------------------------------
-BM_UserCounter/threads:8      2198 ns       9953 ns      70688     16    40    24     8
-BM_UserCounter/threads:1      9504 ns       9504 ns      73787      2     5     3     1
-BM_UserCounter/threads:2      4775 ns       9550 ns      72606      4    10     6     2
-BM_UserCounter/threads:4      2508 ns       9951 ns      70332      8    20    12     4
-BM_UserCounter/threads:8      2055 ns       9933 ns      70344     16    40    24     8
-BM_UserCounter/threads:16     1610 ns       9946 ns      70720     32    80    48    16
-BM_UserCounter/threads:32     1192 ns       9948 ns      70496     64   160    96    32
-BM_UserCounter/threads:4      2506 ns       9949 ns      70332      8    20    12     4
---------------------------------------------------------------
-Benchmark                      Time           CPU Iterations
---------------------------------------------------------------
-BM_Factorial                    26 ns         26 ns   26392245 40320
-BM_Factorial/real_time          26 ns         26 ns   26494107 40320
-BM_CalculatePiRange/1           15 ns         15 ns   45571597 0
-BM_CalculatePiRange/8           74 ns         74 ns    9450212 3.28374
-BM_CalculatePiRange/64         595 ns        595 ns    1173901 3.15746
-BM_CalculatePiRange/512       4752 ns       4752 ns     147380 3.14355
-BM_CalculatePiRange/4k       37970 ns      37972 ns      18453 3.14184
-BM_CalculatePiRange/32k     303733 ns     303744 ns       2305 3.14162
-BM_CalculatePiRange/256k   2434095 ns    2434186 ns        288 3.1416
-BM_CalculatePiRange/1024k  9721140 ns    9721413 ns         71 3.14159
-BM_CalculatePi/threads:8      2255 ns       9943 ns      70936
-```
-Note above the additional header printed when the benchmark changes from
-``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does
-not have the same counter set as ``BM_UserCounter``.
+
 
-## Exiting Benchmarks in Error
+### Exiting with an Error
 
 When errors caused by external influences, such as file I/O and network
 communication, occur within a benchmark the
@@ -785,141 +1106,67 @@ static void BM_test_ranged_fo(benchmark::State & state) {
   }
 }
 ```
+
 
-## Running a subset of the benchmarks
-
-The `--benchmark_filter=<regex>` option can be used to only run the benchmarks
-which match the specified `<regex>`. For example:
+### A Faster KeepRunning Loop
 
-```bash
-$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32
-Run on (1 X 2300 MHz CPU )
-2016-06-25 19:34:24
-Benchmark              Time           CPU Iterations
-----------------------------------------------------
-BM_memcpy/32          11 ns         11 ns   79545455
-BM_memcpy/32k       2181 ns       2185 ns     324074
-BM_memcpy/32          12 ns         12 ns   54687500
-BM_memcpy/32k       1834 ns       1837 ns     357143
-```
-
-
-## Output Formats
-The library supports multiple output formats. Use the
-`--benchmark_format=<console|json|csv>` flag to set the format type. `console`
-is the default format.
-
-The Console format is intended to be a human readable format. By default
-the format generates color output. Context is output on stderr and the
-tabular data on stdout. Example tabular output looks like:
-```
-Benchmark                               Time(ns)    CPU(ns) Iterations
-----------------------------------------------------------------------
-BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s   33.2742k items/s
-BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s   237.372k items/s
-BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s   290.225k items/s
-```
+In C++11 mode, a range-based for loop should be used in preference to
+the `KeepRunning` loop for running the benchmarks.
For example:
-The JSON format outputs human readable json split into two top level attributes.
-The `context` attribute contains information about the run in general, including
-information about the CPU and the date.
-The `benchmarks` attribute contains a list of every benchmark run. Example json
-output looks like:
-```json
-{
-  "context": {
-    "date": "2015/03/17-18:40:25",
-    "num_cpus": 40,
-    "mhz_per_cpu": 2801,
-    "cpu_scaling_enabled": false,
-    "build_type": "debug"
-  },
-  "benchmarks": [
-    {
-      "name": "BM_SetInsert/1024/1",
-      "iterations": 94877,
-      "real_time": 29275,
-      "cpu_time": 29836,
-      "bytes_per_second": 134066,
-      "items_per_second": 33516
-    },
-    {
-      "name": "BM_SetInsert/1024/8",
-      "iterations": 21609,
-      "real_time": 32317,
-      "cpu_time": 32429,
-      "bytes_per_second": 986770,
-      "items_per_second": 246693
-    },
-    {
-      "name": "BM_SetInsert/1024/10",
-      "iterations": 21393,
-      "real_time": 32724,
-      "cpu_time": 33355,
-      "bytes_per_second": 1199226,
-      "items_per_second": 299807
-    }
-  ]
+```c++
+static void BM_Fast(benchmark::State &state) {
+  for (auto _ : state) {
+    FastOperation();
+  }
 }
+BENCHMARK(BM_Fast);
 ```
 
-The CSV format outputs comma-separated values. The `context` is output on stderr
-and the CSV itself on stdout. Example CSV output looks like:
-```
-name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
-"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942,
-"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115,
-"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
-```
-
-## Output Files
-The library supports writing the output of the benchmark to a file specified
-by `--benchmark_out=<filename>`. The format of the output can be specified
-using `--benchmark_out_format={json|console|csv}`. Specifying
-`--benchmark_out` does not suppress the console output.
+The reason the ranged-for loop is faster than using `KeepRunning` is
+that `KeepRunning` requires a memory load and store of the iteration count
+every iteration, whereas the ranged-for variant is able to keep the iteration
+count in a register.
 
-## Debug vs Release
-By default, benchmark builds as a debug library. You will see a warning in the output when this is the case. To build it as a release library instead, use:
+For example, an empty inner loop using the range-based for method looks like:
 
-```
-cmake -DCMAKE_BUILD_TYPE=Release
+```asm
+# Loop Init
+  mov rbx, qword ptr [r14 + 104]
+  call benchmark::State::StartKeepRunning()
+  test rbx, rbx
+  je .LoopEnd
+.LoopHeader: # =>This Inner Loop Header: Depth=1
+  add rbx, -1
+  jne .LoopHeader
+.LoopEnd:
 ```
 
-To enable link-time optimisation, use
+Compared to an empty `KeepRunning` loop, which looks like:
 
-```
-cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true
+```asm
+.LoopHeader: # in Loop: Header=BB0_3 Depth=1
+  cmp byte ptr [rbx], 1
+  jne .LoopInit
+.LoopBody: # =>This Inner Loop Header: Depth=1
+  mov rax, qword ptr [rbx + 8]
+  lea rcx, [rax + 1]
+  mov qword ptr [rbx + 8], rcx
+  cmp rax, qword ptr [rbx + 104]
+  jb .LoopHeader
+  jmp .LoopEnd
+.LoopInit:
+  mov rdi, rbx
+  call benchmark::State::StartKeepRunning()
+  jmp .LoopBody
+.LoopEnd:
 ```
 
-If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake cache variables, if autodetection fails.
-If you are using clang, you may need to set `LLVMAR_EXECUTABLE`, `LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
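+For comparison, here is a sketch of the same benchmark written against the
+legacy `KeepRunning` interface (`BM_FastLegacy` is a hypothetical name, and
+`FastOperation()` is the same placeholder as above):
+
+```c++
+static void BM_FastLegacy(benchmark::State &state) {
+  // KeepRunning() must load and store the iteration count on every pass.
+  while (state.KeepRunning()) {
+    FastOperation();
+  }
+}
+BENCHMARK(BM_FastLegacy);
+```
+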
- -## Linking against the library - -When the library is built using GCC it is necessary to link with `-pthread`, -due to how GCC implements `std::thread`. - -For GCC 4.x failing to link to pthreads will lead to runtime exceptions, not linker errors. -See [issue #67](https://github.com/google/benchmark/issues/67) for more details. - -## Compiler Support - -Google Benchmark uses C++11 when building the library. As such we require -a modern C++ toolchain, both compiler and standard library. - -The following minimum versions are strongly recommended build the library: - -* GCC 4.8 -* Clang 3.4 -* Visual Studio 2013 -* Intel 2015 Update 1 - -Anything older *may* work. +Unless C++03 compatibility is required, the ranged-for variant of writing +the benchmark loop should be preferred. -Note: Using the library and its headers in C++03 is supported. C++11 is only -required to build the library. + -## Disable CPU frequency scaling +### Disabling CPU Frequency Scaling If you see this error: ``` ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. @@ -930,14 +1177,3 @@ sudo cpupower frequency-set --governor performance ./mybench sudo cpupower frequency-set --governor powersave ``` - -# Known Issues - -### Windows - -* Users must manually link `shlwapi.lib`. Failure to do so may result -in unresolved symbols. - -### Solaris - -* Users must explicitly link with kstat library (-lkstat compilation flag). diff --git a/lib/gbenchmark/WORKSPACE b/lib/gbenchmark/WORKSPACE index 9ba32fd75e..9a75f968d9 100644 --- a/lib/gbenchmark/WORKSPACE +++ b/lib/gbenchmark/WORKSPACE @@ -1,9 +1,9 @@ workspace(name = "com_github_google_benchmark") -load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") -git_repository( - name = "com_google_googletest", - commit = "3f0cf6b62ad1eb50d8736538363d3580dd640c3e", # HEAD - remote = "https://github.com/google/googletest", +http_archive( + name = "com_google_googletest", + urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"], + strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e", ) diff --git a/lib/gbenchmark/_config.yml b/lib/gbenchmark/_config.yml new file mode 100644 index 0000000000..18854876c6 --- /dev/null +++ b/lib/gbenchmark/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-midnight \ No newline at end of file diff --git a/lib/gbenchmark/appveyor.yml b/lib/gbenchmark/appveyor.yml index e99c6e77f0..cf240190be 100644 --- a/lib/gbenchmark/appveyor.yml +++ b/lib/gbenchmark/appveyor.yml @@ -20,12 +20,6 @@ environment: - compiler: msvc-14-seh generator: "Visual Studio 14 2015 Win64" - - compiler: msvc-12-seh - generator: "Visual Studio 12 2013" - - - compiler: msvc-12-seh - generator: "Visual Studio 12 2013 Win64" - - compiler: gcc-5.3.0-posix generator: "MinGW Makefiles" cxx_path: 'C:\mingw-w64\i686-5.3.0-posix-dwarf-rt_v4-rev0\mingw32\bin' diff --git a/lib/gbenchmark/cmake/CXXFeatureCheck.cmake b/lib/gbenchmark/cmake/CXXFeatureCheck.cmake index c4c4d660f1..99b56dd623 100644 --- a/lib/gbenchmark/cmake/CXXFeatureCheck.cmake +++ b/lib/gbenchmark/cmake/CXXFeatureCheck.cmake @@ -28,7 +28,7 @@ function(cxx_feature_check FILE) endif() if (NOT DEFINED COMPILE_${FEATURE}) - message("-- Performing Test ${FEATURE}") + message(STATUS "Performing Test ${FEATURE}") if(CMAKE_CROSSCOMPILING) try_compile(COMPILE_${FEATURE} ${CMAKE_BINARY_DIR} 
${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp @@ -42,7 +42,7 @@ function(cxx_feature_check FILE) set(RUN_${FEATURE} 1) endif() else() - message("-- Performing Test ${FEATURE}") + message(STATUS "Performing Test ${FEATURE}") try_run(RUN_${FEATURE} COMPILE_${FEATURE} ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS} @@ -51,14 +51,14 @@ function(cxx_feature_check FILE) endif() if(RUN_${FEATURE} EQUAL 0) - message("-- Performing Test ${FEATURE} -- success") + message(STATUS "Performing Test ${FEATURE} -- success") set(HAVE_${VAR} 1 PARENT_SCOPE) add_definitions(-DHAVE_${VAR}) else() if(NOT COMPILE_${FEATURE}) - message("-- Performing Test ${FEATURE} -- failed to compile") + message(STATUS "Performing Test ${FEATURE} -- failed to compile") else() - message("-- Performing Test ${FEATURE} -- compiled but failed to run") + message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run") endif() endif() endfunction() diff --git a/lib/gbenchmark/cmake/GetGitVersion.cmake b/lib/gbenchmark/cmake/GetGitVersion.cmake index 8dd9480045..4f10f226d7 100644 --- a/lib/gbenchmark/cmake/GetGitVersion.cmake +++ b/lib/gbenchmark/cmake/GetGitVersion.cmake @@ -21,6 +21,7 @@ set(__get_git_version INCLUDED) function(get_git_version var) if(GIT_EXECUTABLE) execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8 + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} RESULT_VARIABLE status OUTPUT_VARIABLE GIT_VERSION ERROR_QUIET) @@ -33,9 +34,11 @@ function(get_git_version var) # Work out if the repository is dirty execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} OUTPUT_QUIET ERROR_QUIET) execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD -- + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GIT_DIFF_INDEX ERROR_QUIET) string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY) @@ -46,6 +49,6 @@ function(get_git_version var) set(GIT_VERSION "v0.0.0") endif() - message("-- git Version: ${GIT_VERSION}") + message(STATUS "git Version: ${GIT_VERSION}") set(${var} ${GIT_VERSION} PARENT_SCOPE) endfunction() diff --git a/lib/gbenchmark/cmake/GoogleTest.cmake b/lib/gbenchmark/cmake/GoogleTest.cmake new file mode 100644 index 0000000000..fb7c6be25e --- /dev/null +++ b/lib/gbenchmark/cmake/GoogleTest.cmake @@ -0,0 +1,41 @@ +# Download and unpack googletest at configure time +set(GOOGLETEST_PREFIX "${benchmark_BINARY_DIR}/third_party/googletest") +configure_file(${benchmark_SOURCE_DIR}/cmake/GoogleTest.cmake.in ${GOOGLETEST_PREFIX}/CMakeLists.txt @ONLY) + +set(GOOGLETEST_PATH "${CMAKE_CURRENT_SOURCE_DIR}/googletest") # Mind the quotes +execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" + -DALLOW_DOWNLOADING_GOOGLETEST=${BENCHMARK_DOWNLOAD_DEPENDENCIES} -DGOOGLETEST_PATH:PATH=${GOOGLETEST_PATH} . + RESULT_VARIABLE result + WORKING_DIRECTORY ${GOOGLETEST_PREFIX} +) + +if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") +endif() + +execute_process( + COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${GOOGLETEST_PREFIX} +) + +if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") +endif() + +# Prevent overriding the parent project's compiler/linker +# settings on Windows +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +include(${GOOGLETEST_PREFIX}/googletest-paths.cmake) + +# Add googletest directly to our build. 
This defines
+# the gtest and gtest_main targets.
+add_subdirectory(${GOOGLETEST_SOURCE_DIR}
+                 ${GOOGLETEST_BINARY_DIR}
+                 EXCLUDE_FROM_ALL)
+
+set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES>)
+set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES>)
diff --git a/lib/gbenchmark/cmake/GoogleTest.cmake.in b/lib/gbenchmark/cmake/GoogleTest.cmake.in
new file mode 100644
index 0000000000..28818ee293
--- /dev/null
+++ b/lib/gbenchmark/cmake/GoogleTest.cmake.in
@@ -0,0 +1,58 @@
+cmake_minimum_required(VERSION 2.8.12)
+
+project(googletest-download NONE)
+
+# Enable ExternalProject CMake module
+include(ExternalProject)
+
+option(ALLOW_DOWNLOADING_GOOGLETEST "If googletest src tree is not found in location specified by GOOGLETEST_PATH, do fetch the archive from internet" OFF)
+set(GOOGLETEST_PATH "/usr/src/googletest" CACHE PATH
+    "Path to the googletest root tree. Should contain googletest and googlemock subdirs. And CMakeLists.txt in root, and in both of these subdirs")
+
+# Download and install GoogleTest
+
+message(STATUS "Looking for Google Test sources")
+message(STATUS "Looking for Google Test sources in ${GOOGLETEST_PATH}")
+if(EXISTS "${GOOGLETEST_PATH}" AND IS_DIRECTORY "${GOOGLETEST_PATH}" AND EXISTS "${GOOGLETEST_PATH}/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googletest" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googletest" AND EXISTS "${GOOGLETEST_PATH}/googletest/CMakeLists.txt" AND
+   EXISTS "${GOOGLETEST_PATH}/googlemock" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googlemock" AND EXISTS "${GOOGLETEST_PATH}/googlemock/CMakeLists.txt")
+  message(STATUS "Found Google Test in ${GOOGLETEST_PATH}")
+
+  ExternalProject_Add(
+    googletest
+    PREFIX            "${CMAKE_BINARY_DIR}"
+    DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
+    SOURCE_DIR        "${GOOGLETEST_PATH}" # use existing src dir.
+    BINARY_DIR        "${CMAKE_BINARY_DIR}/build"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+  )
+else()
+  if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
+    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable ALLOW_DOWNLOADING_GOOGLETEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+  else()
+    message(WARNING "Did not find Google Test sources!
Fetching from web...") + ExternalProject_Add( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG master + PREFIX "${CMAKE_BINARY_DIR}" + STAMP_DIR "${CMAKE_BINARY_DIR}/stamp" + DOWNLOAD_DIR "${CMAKE_BINARY_DIR}/download" + SOURCE_DIR "${CMAKE_BINARY_DIR}/src" + BINARY_DIR "${CMAKE_BINARY_DIR}/build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) + endif() +endif() + +ExternalProject_Get_Property(googletest SOURCE_DIR BINARY_DIR) +file(WRITE googletest-paths.cmake +"set(GOOGLETEST_SOURCE_DIR \"${SOURCE_DIR}\") +set(GOOGLETEST_BINARY_DIR \"${BINARY_DIR}\") +") diff --git a/lib/gbenchmark/cmake/HandleGTest.cmake b/lib/gbenchmark/cmake/HandleGTest.cmake deleted file mode 100644 index 3ebe9f32f0..0000000000 --- a/lib/gbenchmark/cmake/HandleGTest.cmake +++ /dev/null @@ -1,115 +0,0 @@ - -macro(split_list listname) - string(REPLACE ";" " " ${listname} "${${listname}}") -endmacro() - -macro(build_external_gtest) - include(ExternalProject) - set(GTEST_FLAGS "") - if (BENCHMARK_USE_LIBCXX) - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - list(APPEND GTEST_FLAGS -stdlib=libc++) - else() - message(WARNING "Unsupported compiler (${CMAKE_CXX_COMPILER}) when using libc++") - endif() - endif() - if (BENCHMARK_BUILD_32_BITS) - list(APPEND GTEST_FLAGS -m32) - endif() - if (NOT "${CMAKE_CXX_FLAGS}" STREQUAL "") - list(APPEND GTEST_FLAGS ${CMAKE_CXX_FLAGS}) - endif() - string(TOUPPER "${CMAKE_BUILD_TYPE}" GTEST_BUILD_TYPE) - if ("${GTEST_BUILD_TYPE}" STREQUAL "COVERAGE") - set(GTEST_BUILD_TYPE "DEBUG") - endif() - # FIXME: Since 10/Feb/2017 the googletest trunk has had a bug where - # -Werror=unused-function fires during the build on OS X. This is a temporary - # workaround to keep our travis bots from failing. It should be removed - # once gtest is fixed. - if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - list(APPEND GTEST_FLAGS "-Wno-unused-function") - endif() - split_list(GTEST_FLAGS) - set(EXCLUDE_FROM_ALL_OPT "") - set(EXCLUDE_FROM_ALL_VALUE "") - if (${CMAKE_VERSION} VERSION_GREATER "3.0.99") - set(EXCLUDE_FROM_ALL_OPT "EXCLUDE_FROM_ALL") - set(EXCLUDE_FROM_ALL_VALUE "ON") - endif() - ExternalProject_Add(googletest - ${EXCLUDE_FROM_ALL_OPT} ${EXCLUDE_FROM_ALL_VALUE} - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG master - PREFIX "${CMAKE_BINARY_DIR}/googletest" - INSTALL_DIR "${CMAKE_BINARY_DIR}/googletest" - CMAKE_CACHE_ARGS - -DCMAKE_BUILD_TYPE:STRING=${GTEST_BUILD_TYPE} - -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} - -DCMAKE_INSTALL_PREFIX:PATH= - -DCMAKE_INSTALL_LIBDIR:PATH=/lib - -DCMAKE_CXX_FLAGS:STRING=${GTEST_FLAGS} - -Dgtest_force_shared_crt:BOOL=ON - ) - - ExternalProject_Get_Property(googletest install_dir) - set(GTEST_INCLUDE_DIRS ${install_dir}/include) - file(MAKE_DIRECTORY ${GTEST_INCLUDE_DIRS}) - - set(LIB_SUFFIX "${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(LIB_PREFIX "${CMAKE_STATIC_LIBRARY_PREFIX}") - if("${GTEST_BUILD_TYPE}" STREQUAL "DEBUG") - set(LIB_SUFFIX "d${CMAKE_STATIC_LIBRARY_SUFFIX}") - endif() - - # Use gmock_main instead of gtest_main because it initializes gtest as well. - # Note: The libraries are listed in reverse order of their dependancies. 
- foreach(LIB gtest gmock gmock_main) - add_library(${LIB} UNKNOWN IMPORTED) - set_target_properties(${LIB} PROPERTIES - IMPORTED_LOCATION ${install_dir}/lib/${LIB_PREFIX}${LIB}${LIB_SUFFIX} - INTERFACE_INCLUDE_DIRECTORIES ${GTEST_INCLUDE_DIRS} - INTERFACE_LINK_LIBRARIES "${GTEST_BOTH_LIBRARIES}" - ) - add_dependencies(${LIB} googletest) - list(APPEND GTEST_BOTH_LIBRARIES ${LIB}) - endforeach() -endmacro(build_external_gtest) - -if (BENCHMARK_ENABLE_GTEST_TESTS) - if (IS_DIRECTORY ${CMAKE_SOURCE_DIR}/googletest) - set(GTEST_ROOT "${CMAKE_SOURCE_DIR}/googletest") - set(INSTALL_GTEST OFF CACHE INTERNAL "") - set(INSTALL_GMOCK OFF CACHE INTERNAL "") - add_subdirectory(${CMAKE_SOURCE_DIR}/googletest) - set(GTEST_BOTH_LIBRARIES gtest gmock gmock_main) - foreach(HEADER test mock) - # CMake 2.8 and older don't respect INTERFACE_INCLUDE_DIRECTORIES, so we - # have to add the paths ourselves. - set(HFILE g${HEADER}/g${HEADER}.h) - set(HPATH ${GTEST_ROOT}/google${HEADER}/include) - find_path(HEADER_PATH_${HEADER} ${HFILE} - NO_DEFAULT_PATHS - HINTS ${HPATH} - ) - if (NOT HEADER_PATH_${HEADER}) - message(FATAL "Failed to find header ${HFILE} in ${HPATH}") - endif() - list(APPEND GTEST_INCLUDE_DIRS ${HEADER_PATH_${HEADER}}) - endforeach() - elseif(BENCHMARK_DOWNLOAD_DEPENDENCIES) - build_external_gtest() - else() - find_package(GTest REQUIRED) - find_path(GMOCK_INCLUDE_DIRS gmock/gmock.h - HINTS ${GTEST_INCLUDE_DIRS}) - if (NOT GMOCK_INCLUDE_DIRS) - message(FATAL "Failed to find header gmock/gmock.h with hint ${GTEST_INCLUDE_DIRS}") - endif() - set(GTEST_INCLUDE_DIRS ${GTEST_INCLUDE_DIRS} ${GMOCK_INCLUDE_DIRS}) - # FIXME: We don't currently require the gmock library to build the tests, - # and it's likely we won't find it, so we don't try. As long as we've - # found the gmock/gmock.h header and gtest_main that should be good enough. 
- endif() -endif() diff --git a/lib/gbenchmark/cmake/benchmark.pc.in b/lib/gbenchmark/cmake/benchmark.pc.in index 1e84bff68d..43ca8f91d7 100644 --- a/lib/gbenchmark/cmake/benchmark.pc.in +++ b/lib/gbenchmark/cmake/benchmark.pc.in @@ -8,4 +8,5 @@ Description: Google microbenchmark framework Version: @VERSION@ Libs: -L${libdir} -lbenchmark +Libs.private: -lpthread Cflags: -I${includedir} diff --git a/lib/gbenchmark/cmake/split_list.cmake b/lib/gbenchmark/cmake/split_list.cmake new file mode 100644 index 0000000000..67aed3fdc8 --- /dev/null +++ b/lib/gbenchmark/cmake/split_list.cmake @@ -0,0 +1,3 @@ +macro(split_list listname) + string(REPLACE ";" " " ${listname} "${${listname}}") +endmacro() diff --git a/lib/gbenchmark/conan/CMakeLists.txt b/lib/gbenchmark/conan/CMakeLists.txt new file mode 100644 index 0000000000..15b92ca91a --- /dev/null +++ b/lib/gbenchmark/conan/CMakeLists.txt @@ -0,0 +1,7 @@ +cmake_minimum_required(VERSION 2.8.11) +project(cmake_wrapper) + +include(conanbuildinfo.cmake) +conan_basic_setup() + +include(${CMAKE_SOURCE_DIR}/CMakeListsOriginal.txt) diff --git a/lib/gbenchmark/conan/test_package/CMakeLists.txt b/lib/gbenchmark/conan/test_package/CMakeLists.txt new file mode 100644 index 0000000000..089a6c729d --- /dev/null +++ b/lib/gbenchmark/conan/test_package/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 2.8.11) +project(test_package) + +set(CMAKE_VERBOSE_MAKEFILE TRUE) + +include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake) +conan_basic_setup() + +add_executable(${PROJECT_NAME} test_package.cpp) +target_link_libraries(${PROJECT_NAME} ${CONAN_LIBS}) diff --git a/lib/gbenchmark/conan/test_package/conanfile.py b/lib/gbenchmark/conan/test_package/conanfile.py new file mode 100644 index 0000000000..d63f4088c9 --- /dev/null +++ b/lib/gbenchmark/conan/test_package/conanfile.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from conans import ConanFile, CMake +import os + + +class TestPackageConan(ConanFile): + settings = "os", "compiler", "build_type", "arch" + generators = "cmake" + + def build(self): + cmake = CMake(self) + cmake.configure() + cmake.build() + + def test(self): + bin_path = os.path.join("bin", "test_package") + self.run(bin_path, run_environment=True) diff --git a/lib/gbenchmark/conan/test_package/test_package.cpp b/lib/gbenchmark/conan/test_package/test_package.cpp new file mode 100644 index 0000000000..4fa7ec0bf9 --- /dev/null +++ b/lib/gbenchmark/conan/test_package/test_package.cpp @@ -0,0 +1,18 @@ +#include "benchmark/benchmark.h" + +void BM_StringCreation(benchmark::State& state) { + while (state.KeepRunning()) + std::string empty_string; +} + +BENCHMARK(BM_StringCreation); + +void BM_StringCopy(benchmark::State& state) { + std::string x = "hello"; + while (state.KeepRunning()) + std::string copy(x); +} + +BENCHMARK(BM_StringCopy); + +BENCHMARK_MAIN(); diff --git a/lib/gbenchmark/conanfile.py b/lib/gbenchmark/conanfile.py new file mode 100644 index 0000000000..e31fc5268a --- /dev/null +++ b/lib/gbenchmark/conanfile.py @@ -0,0 +1,79 @@ +from conans import ConanFile, CMake, tools +from conans.errors import ConanInvalidConfiguration +import shutil +import os + + +class GoogleBenchmarkConan(ConanFile): + name = "benchmark" + description = "A microbenchmark support library." + topics = ("conan", "benchmark", "google", "microbenchmark") + url = "https://github.com/google/benchmark" + homepage = "https://github.com/google/benchmark" + author = "Google Inc." 
+ license = "Apache-2.0" + exports_sources = ["*"] + generators = "cmake" + + settings = "arch", "build_type", "compiler", "os" + options = { + "shared": [True, False], + "fPIC": [True, False], + "enable_lto": [True, False], + "enable_exceptions": [True, False] + } + default_options = {"shared": False, "fPIC": True, "enable_lto": False, "enable_exceptions": True} + + _build_subfolder = "." + + def source(self): + # Wrap the original CMake file to call conan_basic_setup + shutil.move("CMakeLists.txt", "CMakeListsOriginal.txt") + shutil.move(os.path.join("conan", "CMakeLists.txt"), "CMakeLists.txt") + + def config_options(self): + if self.settings.os == "Windows": + if self.settings.compiler == "Visual Studio" and float(self.settings.compiler.version.value) <= 12: + raise ConanInvalidConfiguration("{} {} does not support Visual Studio <= 12".format(self.name, self.version)) + del self.options.fPIC + + def configure(self): + if self.settings.os == "Windows" and self.options.shared: + raise ConanInvalidConfiguration("Windows shared builds are not supported right now, see issue #639") + + def _configure_cmake(self): + cmake = CMake(self) + + cmake.definitions["BENCHMARK_ENABLE_TESTING"] = "OFF" + cmake.definitions["BENCHMARK_ENABLE_GTEST_TESTS"] = "OFF" + cmake.definitions["BENCHMARK_ENABLE_LTO"] = "ON" if self.options.enable_lto else "OFF" + cmake.definitions["BENCHMARK_ENABLE_EXCEPTIONS"] = "ON" if self.options.enable_exceptions else "OFF" + + # See https://github.com/google/benchmark/pull/638 for Windows 32 build explanation + if self.settings.os != "Windows": + cmake.definitions["BENCHMARK_BUILD_32_BITS"] = "ON" if "64" not in str(self.settings.arch) else "OFF" + cmake.definitions["BENCHMARK_USE_LIBCXX"] = "ON" if (str(self.settings.compiler.libcxx) == "libc++") else "OFF" + else: + cmake.definitions["BENCHMARK_USE_LIBCXX"] = "OFF" + + cmake.configure(build_folder=self._build_subfolder) + return cmake + + def build(self): + cmake = self._configure_cmake() + cmake.build() + + def package(self): + cmake = self._configure_cmake() + cmake.install() + + self.copy(pattern="LICENSE", dst="licenses") + + def package_info(self): + self.cpp_info.libs = tools.collect_libs(self) + if self.settings.os == "Linux": + self.cpp_info.libs.extend(["pthread", "rt"]) + elif self.settings.os == "Windows": + self.cpp_info.libs.append("shlwapi") + elif self.settings.os == "SunOS": + self.cpp_info.libs.append("kstat") diff --git a/lib/gbenchmark/dependencies.md b/lib/gbenchmark/dependencies.md new file mode 100644 index 0000000000..6289b4e354 --- /dev/null +++ b/lib/gbenchmark/dependencies.md @@ -0,0 +1,18 @@ +# Build tool dependency policy + +To ensure the broadest compatibility when building the benchmark library, but +still allow forward progress, we require any build tooling to be available for: + +* Debian stable AND +* The last two Ubuntu LTS releases AND + +Currently, this means using build tool versions that are available for Ubuntu +16.04 (Xenial), Ubuntu 18.04 (Bionic), and Debian stretch. + +_Note, [travis](.travis.yml) runs under Ubuntu 14.04 (Trusty) for linux builds._ + +## cmake +The current supported version is cmake 3.5.1 as of 2018-06-06. 
+
+_Note, this version is also available for Ubuntu 14.04, the previous Ubuntu LTS
+release, as `cmake3`._
diff --git a/lib/gbenchmark/docs/_config.yml b/lib/gbenchmark/docs/_config.yml
new file mode 100644
index 0000000000..18854876c6
--- /dev/null
+++ b/lib/gbenchmark/docs/_config.yml
@@ -0,0 +1 @@
+theme: jekyll-theme-midnight
\ No newline at end of file
diff --git a/lib/gbenchmark/docs/tools.md b/lib/gbenchmark/docs/tools.md
index 70500bd322..4a3b2e9bd2 100644
--- a/lib/gbenchmark/docs/tools.md
+++ b/lib/gbenchmark/docs/tools.md
@@ -1,84 +1,25 @@
 # Benchmark Tools
 
-## compare_bench.py
-
-The `compare_bench.py` utility which can be used to compare the result of benchmarks.
-The program is invoked like:
-
-``` bash
-$ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]...
-```
-
-Where `<old-benchmark>` and `<new-benchmark>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
-
-`[benchmark options]` will be passed to the benchmarks invocations. They can be anything that binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
-
-The sample output using the JSON test files under `Inputs/` gives:
-
-``` bash
-$ ./compare_bench.py ./gbench/Inputs/test1_run1.json ./gbench/Inputs/test1_run2.json
-Comparing ./gbench/Inputs/test1_run1.json to ./gbench/Inputs/test1_run2.json
-Benchmark                        Time           CPU      Time Old      Time New       CPU Old       CPU New
--------------------------------------------------------------------------------------------------------------
-BM_SameTimes                  +0.0000       +0.0000            10            10            10            10
-BM_2xFaster                   -0.5000       -0.5000            50            25            50            25
-BM_2xSlower                   +1.0000       +1.0000            50           100            50           100
-BM_1PercentFaster             -0.0100       -0.0100           100            99           100            99
-BM_1PercentSlower             +0.0100       +0.0100           100           101           100           101
-BM_10PercentFaster            -0.1000       -0.1000           100            90           100            90
-BM_10PercentSlower            +0.1000       +0.1000           100           110           100           110
-BM_100xSlower                +99.0000      +99.0000           100         10000           100         10000
-BM_100xFaster                 -0.9900       -0.9900         10000           100         10000           100
-BM_10PercentCPUToTime         +0.1000       -0.1000           100           110           100            90
-BM_ThirdFaster                -0.3333       -0.3334           100            67           100            67
-BM_BadTimeUnit                -0.9000       +0.2000             0             0             0             1
-```
-
-As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+## compare.py
 
-When a benchmark executable is run, the raw output from the benchmark is printed in real time to stdout. The sample output using `benchmark/basic_test` for both arguments looks like:
+The `compare.py` utility can be used to compare the results of benchmarks.
-
-```
-./compare_bench.py test/basic_test test/basic_test --benchmark_filter=BM_empty.*
-RUNNING: test/basic_test --benchmark_filter=BM_empty.* --benchmark_out=/tmp/tmpN7LF3a
-Run on (8 X 4000 MHz CPU s)
-2017-11-07 23:28:36
----------------------------------------------------------------------
-Benchmark                      Time           CPU Iterations
----------------------------------------------------------------------
-BM_empty                       4 ns          4 ns  170178757
-BM_empty/threads:8             1 ns          7 ns  103868920
-BM_empty_stop_start            0 ns          0 ns 1000000000
-BM_empty_stop_start/threads:8  0 ns          0 ns 1403031720
-RUNNING: /test/basic_test --benchmark_filter=BM_empty.* --benchmark_out=/tmp/tmplvrIp8
-Run on (8 X 4000 MHz CPU s)
-2017-11-07 23:28:38
----------------------------------------------------------------------
-Benchmark                      Time           CPU Iterations
----------------------------------------------------------------------
-BM_empty                       4 ns          4 ns  169534855
-BM_empty/threads:8             1 ns          7 ns  104188776
-BM_empty_stop_start            0 ns          0 ns 1000000000
-BM_empty_stop_start/threads:8  0 ns          0 ns 1404159424
-Comparing ../build/test/basic_test to ../build/test/basic_test
-Benchmark                          Time           CPU      Time Old      Time New       CPU Old       CPU New
----------------------------------------------------------------------------------------------------------------------
-BM_empty                        -0.0048       -0.0049             4             4             4             4
-BM_empty/threads:8              -0.0123       -0.0054             1             1             7             7
-BM_empty_stop_start             -0.0000       -0.0000             0             0             0             0
-BM_empty_stop_start/threads:8   -0.0029       +0.0001             0             0             0             0
+**NOTE**: the utility relies on the scipy package which can be installed using [these instructions](https://www.scipy.org/install.html).
 
-```
+### Displaying aggregates only
 
-As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
-Obviously this example doesn't give any useful output, but it's intended to show the output format when 'compare_bench.py' needs to run benchmarks.
+The switch `-a` / `--display_aggregates_only` can be used to control the
+display of the normal iterations vs the aggregates. When passed, it will
+be passed through to the benchmark binaries being run, and will be accounted for
+in the tool itself; only the aggregates will be displayed, but not normal runs.
+It only affects the display; the separate runs will still be used to calculate
+the U test.
 
-## compare.py
+### Modes of operation
 
-The `compare.py` can be used to compare the result of benchmarks.
 There are three modes of operation:
-1. Just compare two benchmarks, what `compare_bench.py` did.
+1. Just compare two benchmarks
 The program is invoked like:
 
 ``` bash
@@ -240,3 +181,19 @@ Benchmark                               Time             CPU      Time Old
 ```
 
 This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
 As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+
+### U test
+
+If there is a sufficient repetition count of the benchmarks, the tool can do
+a [U Test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test) of the
+null hypothesis that it is equally likely that a randomly selected value from
+one sample will be less than or greater than a randomly selected value from a
+second sample.
+
+If the calculated p-value is lower than the significance level alpha, then
+the result is said to be statistically significant and the null hypothesis
+is rejected; in other words, the two benchmarks are unlikely
+to be identical.
+
+**WARNING**: the U test requires a **LARGE** number of repetitions (no fewer
+than 9) to be meaningful!
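+
+Since the U test needs repeated measurements, one way to guarantee enough of
+them is to request the repetitions in the benchmark itself rather than on the
+command line. A minimal sketch (`BM_compared` is a placeholder name):
+
+```c++
+static void BM_compared(benchmark::State& state) {
+  for (auto _ : state) {
+    // ... code under comparison ...
+  }
+}
+// At least 9 repetitions, so the U test done by compare.py is meaningful.
+BENCHMARK(BM_compared)->Repetitions(9);
+```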
diff --git a/lib/gbenchmark/include/benchmark/benchmark.h b/lib/gbenchmark/include/benchmark/benchmark.h index 6be8c12ad4..6cb96f546d 100644 --- a/lib/gbenchmark/include/benchmark/benchmark.h +++ b/lib/gbenchmark/include/benchmark/benchmark.h @@ -56,8 +56,7 @@ static void BM_memcpy(benchmark::State& state) { memset(src, 'x', state.range(0)); for (auto _ : state) memcpy(dst, src, state.range(0)); - state.SetBytesProcessed(int64_t(state.iterations()) * - int64_t(state.range(0))); + state.SetBytesProcessed(state.iterations() * state.range(0)); delete[] src; delete[] dst; } BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10); @@ -122,8 +121,7 @@ template int BM_Sequential(benchmark::State& state) { q.Wait(&v); } // actually messages, not bytes: - state.SetBytesProcessed( - static_cast(state.iterations())*state.range(0)); + state.SetBytesProcessed(state.iterations() * state.range(0)); } BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue)->Range(1<<0, 1<<10); @@ -164,7 +162,6 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); #ifndef BENCHMARK_BENCHMARK_H_ #define BENCHMARK_BENCHMARK_H_ - // The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer. #if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L) #define BENCHMARK_HAS_CXX11 @@ -176,19 +173,19 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); #include #include #include -#include -#include #include #include +#include +#include #if defined(BENCHMARK_HAS_CXX11) -#include #include +#include #include #endif #if defined(_MSC_VER) -#include // for _ReadWriteBarrier +#include // for _ReadWriteBarrier #endif #ifndef BENCHMARK_HAS_CXX11 @@ -227,21 +224,36 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); #define BENCHMARK_INTERNAL_TOSTRING2(x) #x #define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x) -#if defined(__GNUC__) +#if defined(__GNUC__) || defined(__clang__) #define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y) #define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg))) #else #define BENCHMARK_BUILTIN_EXPECT(x, y) x #define BENCHMARK_DEPRECATED_MSG(msg) -#define BENCHMARK_WARNING_MSG(msg) __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING(__LINE__) ") : warning note: " msg)) +#define BENCHMARK_WARNING_MSG(msg) \ + __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING( \ + __LINE__) ") : warning note: " msg)) #endif #if defined(__GNUC__) && !defined(__clang__) #define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) #endif +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#if defined(__GNUC__) || __has_builtin(__builtin_unreachable) +#define BENCHMARK_UNREACHABLE() __builtin_unreachable() +#elif defined(_MSC_VER) +#define BENCHMARK_UNREACHABLE() __assume(false) +#else +#define BENCHMARK_UNREACHABLE() ((void)0) +#endif + namespace benchmark { class BenchmarkReporter; +class MemoryManager; void Initialize(int* argc, char** argv); @@ -254,7 +266,7 @@ bool ReportUnrecognizedArguments(int argc, char** argv); // of each matching benchmark. Otherwise run each matching benchmark and // report the results. // -// The second and third overload use the specified 'console_reporter' and +// The second and third overload use the specified 'display_reporter' and // 'file_reporter' respectively. 'file_reporter' will write to the file // specified // by '--benchmark_output'. 
If '--benchmark_output' is not given the @@ -262,16 +274,13 @@ bool ReportUnrecognizedArguments(int argc, char** argv); // // RETURNS: The number of matching benchmarks. size_t RunSpecifiedBenchmarks(); -size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter); -size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter, +size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter); +size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, BenchmarkReporter* file_reporter); -// If this routine is called, peak memory allocation past this point in the -// benchmark is reported at the end of the benchmark report line. (It is -// computed by running the benchmark once with a single iteration and a memory -// tracer.) -// TODO(dominic) -// void MemoryUsage(); +// Register a MemoryManager instance that will be used to collect and report +// allocation measurements for benchmark runs. +void RegisterMemoryManager(MemoryManager* memory_manager); namespace internal { class Benchmark; @@ -290,22 +299,19 @@ BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams(); } // namespace internal - #if (!defined(__GNUC__) && !defined(__clang__)) || defined(__pnacl__) || \ - defined(EMSCRIPTN) -# define BENCHMARK_HAS_NO_INLINE_ASSEMBLY + defined(__EMSCRIPTEN__) +#define BENCHMARK_HAS_NO_INLINE_ASSEMBLY #endif - // The DoNotOptimize(...) function can be used to prevent a value or // expression from being optimized away by the compiler. This function is // intended to add little to no overhead. // See: https://youtu.be/nXaxk27zwlk?t=2441 #ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY template -inline BENCHMARK_ALWAYS_INLINE -void DoNotOptimize(Tp const& value) { - asm volatile("" : : "r,m"(value) : "memory"); +inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { + asm volatile("" : : "r,m"(value) : "memory"); } template @@ -329,9 +335,7 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { _ReadWriteBarrier(); } -inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { - _ReadWriteBarrier(); -} +inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); } #else template inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { @@ -340,39 +344,63 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { // FIXME Add ClobberMemory() for non-gnu and non-msvc compilers #endif - - // This class is used for user-defined counters. class Counter { -public: - + public: enum Flags { - kDefaults = 0, + kDefaults = 0, // Mark the counter as a rate. It will be presented divided // by the duration of the benchmark. - kIsRate = 1, + kIsRate = 1U << 0U, // Mark the counter as a thread-average quantity. It will be // presented divided by the number of threads. - kAvgThreads = 2, + kAvgThreads = 1U << 1U, // Mark the counter as a thread-average rate. See above. - kAvgThreadsRate = kIsRate|kAvgThreads + kAvgThreadsRate = kIsRate | kAvgThreads, + // Mark the counter as a constant value, valid/same for *every* iteration. + // When reporting, it will be *multiplied* by the iteration count. + kIsIterationInvariant = 1U << 2U, + // Mark the counter as a constant rate. + // When reporting, it will be *multiplied* by the iteration count + // and then divided by the duration of the benchmark. + kIsIterationInvariantRate = kIsRate | kIsIterationInvariant, + // Mark the counter as a iteration-average quantity. + // It will be presented divided by the number of iterations. 
+ kAvgIterations = 1U << 3U, + // Mark the counter as a iteration-average rate. See above. + kAvgIterationsRate = kIsRate | kAvgIterations + }; + + enum OneK { + // 1'000 items per 1k + kIs1000 = 1000, + // 1'024 items per 1k + kIs1024 = 1024 }; double value; - Flags flags; + Flags flags; + OneK oneK; BENCHMARK_ALWAYS_INLINE - Counter(double v = 0., Flags f = kDefaults) : value(v), flags(f) {} - - BENCHMARK_ALWAYS_INLINE operator double const& () const { return value; } - BENCHMARK_ALWAYS_INLINE operator double & () { return value; } + Counter(double v = 0., Flags f = kDefaults, OneK k = kIs1000) + : value(v), flags(f), oneK(k) {} + BENCHMARK_ALWAYS_INLINE operator double const&() const { return value; } + BENCHMARK_ALWAYS_INLINE operator double&() { return value; } }; +// A helper for user code to create unforeseen combinations of Flags, without +// having to do this cast manually each time, or providing this operator. +Counter::Flags inline operator|(const Counter::Flags& LHS, + const Counter::Flags& RHS) { + return static_cast(static_cast(LHS) | + static_cast(RHS)); +} + // This is the container for the user-defined counters. typedef std::map UserCounters; - // TimeUnit is passed to a benchmark in order to specify the order of magnitude // for the measured time. enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond }; @@ -383,36 +411,49 @@ enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond }; // calculated automatically to the best fit. enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda }; +typedef uint64_t IterationCount; + // BigOFunc is passed to a benchmark in order to specify the asymptotic // computational complexity for the benchmark. -typedef double(BigOFunc)(int64_t); +typedef double(BigOFunc)(IterationCount); // StatisticsFunc is passed to a benchmark in order to compute some descriptive // statistics over all the measurements of some type typedef double(StatisticsFunc)(const std::vector&); +namespace internal { struct Statistics { std::string name_; StatisticsFunc* compute_; - Statistics(std::string name, StatisticsFunc* compute) - : name_(name), compute_(compute) {} + Statistics(const std::string& name, StatisticsFunc* compute) + : name_(name), compute_(compute) {} }; -namespace internal { +struct BenchmarkInstance; class ThreadTimer; class ThreadManager; -enum ReportMode +enum AggregationReportMode #if defined(BENCHMARK_HAS_CXX11) - : unsigned + : unsigned #else #endif - { - RM_Unspecified, // The mode has not been manually specified - RM_Default, // The mode is user-specified as default. - RM_ReportAggregatesOnly +{ + // The mode has not been manually specified + ARM_Unspecified = 0, + // The mode is user-specified. + // This may or may not be set when the following bit-flags are set. + ARM_Default = 1U << 0U, + // File reporter should only output aggregates. + ARM_FileReportAggregatesOnly = 1U << 1U, + // Display reporter should only output aggregates + ARM_DisplayReportAggregatesOnly = 1U << 2U, + // Both reporters should only display aggregates. + ARM_ReportAggregatesOnly = + ARM_FileReportAggregatesOnly | ARM_DisplayReportAggregatesOnly }; + } // namespace internal // State is passed to a running Benchmark and contains state for the @@ -447,7 +488,7 @@ class State { // while (state.KeepRunningBatch(1000)) { // // process 1000 elements // } - bool KeepRunningBatch(size_t n); + bool KeepRunningBatch(IterationCount n); // REQUIRES: timer is running and 'SkipWithError(...)' has not been called // by the current thread. 
@@ -508,16 +549,21 @@ class State { // Set the number of bytes processed by the current benchmark // execution. This routine is typically called once at the end of a - // throughput oriented benchmark. If this routine is called with a - // value > 0, the report is printed in MB/sec instead of nanoseconds - // per iteration. + // throughput oriented benchmark. // // REQUIRES: a benchmark has exited its benchmarking loop. BENCHMARK_ALWAYS_INLINE - void SetBytesProcessed(int64_t bytes) { bytes_processed_ = bytes; } + void SetBytesProcessed(int64_t bytes) { + counters["bytes_per_second"] = + Counter(static_cast(bytes), Counter::kIsRate, Counter::kIs1024); + } BENCHMARK_ALWAYS_INLINE - int64_t bytes_processed() const { return bytes_processed_; } + int64_t bytes_processed() const { + if (counters.find("bytes_per_second") != counters.end()) + return static_cast(counters.at("bytes_per_second")); + return 0; + } // If this routine is called with complexity_n > 0 and complexity report is // requested for the @@ -537,10 +583,17 @@ class State { // // REQUIRES: a benchmark has exited its benchmarking loop. BENCHMARK_ALWAYS_INLINE - void SetItemsProcessed(int64_t items) { items_processed_ = items; } + void SetItemsProcessed(int64_t items) { + counters["items_per_second"] = + Counter(static_cast(items), benchmark::Counter::kIsRate); + } BENCHMARK_ALWAYS_INLINE - int64_t items_processed() const { return items_processed_; } + int64_t items_processed() const { + if (counters.find("items_per_second") != counters.end()) + return static_cast(counters.at("items_per_second")); + return 0; + } // If this routine is called, the specified label is printed at the // end of the benchmark report line for the currently executing @@ -574,35 +627,35 @@ class State { int64_t range_y() const { return range(1); } BENCHMARK_ALWAYS_INLINE - size_t iterations() const { - return (max_iterations - total_iterations_ + batch_leftover_); + IterationCount iterations() const { + if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) { + return 0; + } + return max_iterations - total_iterations_ + batch_leftover_; } -private: // items we expect on the first cache line (ie 64 bytes of the struct) - + private + : // items we expect on the first cache line (ie 64 bytes of the struct) // When total_iterations_ is 0, KeepRunning() and friends will return false. // May be larger than max_iterations. - size_t total_iterations_; + IterationCount total_iterations_; // When using KeepRunningBatch(), batch_leftover_ holds the number of // iterations beyond max_iters that were run. Used to track // completed_iterations_ accurately. - size_t batch_leftover_; + IterationCount batch_leftover_; -public: - const size_t max_iterations; + public: + const IterationCount max_iterations; -private: + private: bool started_; bool finished_; bool error_occurred_; -private: // items we don't need on the first cache line + private: // items we don't need on the first cache line std::vector range_; - int64_t bytes_processed_; - int64_t items_processed_; - int64_t complexity_n_; public: @@ -613,35 +666,32 @@ class State { // Number of threads concurrently executing the benchmark. 
const int threads; - - // TODO(EricWF) make me private - State(size_t max_iters, const std::vector& ranges, int thread_i, - int n_threads, internal::ThreadTimer* timer, + private: + State(IterationCount max_iters, const std::vector& ranges, + int thread_i, int n_threads, internal::ThreadTimer* timer, internal::ThreadManager* manager); - private: void StartKeepRunning(); // Implementation of KeepRunning() and KeepRunningBatch(). // is_batch must be true unless n is 1. - bool KeepRunningInternal(size_t n, bool is_batch); + bool KeepRunningInternal(IterationCount n, bool is_batch); void FinishKeepRunning(); internal::ThreadTimer* timer_; internal::ThreadManager* manager_; - BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State); + + friend struct internal::BenchmarkInstance; }; -inline BENCHMARK_ALWAYS_INLINE -bool State::KeepRunning() { - return KeepRunningInternal(1, /*is_batch=*/ false); +inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() { + return KeepRunningInternal(1, /*is_batch=*/false); } -inline BENCHMARK_ALWAYS_INLINE -bool State::KeepRunningBatch(size_t n) { - return KeepRunningInternal(n, /*is_batch=*/ true); +inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningBatch(IterationCount n) { + return KeepRunningInternal(n, /*is_batch=*/true); } -inline BENCHMARK_ALWAYS_INLINE -bool State::KeepRunningInternal(size_t n, bool is_batch) { +inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(IterationCount n, + bool is_batch) { // total_iterations_ is set to 0 by the constructor, and always set to a // nonzero value by StartKepRunning(). assert(n > 0); @@ -654,13 +704,13 @@ bool State::KeepRunningInternal(size_t n, bool is_batch) { if (!started_) { StartKeepRunning(); if (!error_occurred_ && total_iterations_ >= n) { - total_iterations_-= n; + total_iterations_ -= n; return true; } } // For non-batch runs, total_iterations_ must be 0 by now. if (is_batch && total_iterations_ != 0) { - batch_leftover_ = n - total_iterations_; + batch_leftover_ = n - total_iterations_; total_iterations_ = 0; return true; } @@ -704,7 +754,7 @@ struct State::StateIterator { } private: - size_t cached_; + IterationCount cached_; State* const parent_; }; @@ -808,7 +858,7 @@ class Benchmark { // NOTE: This function should only be used when *exact* iteration control is // needed and never to control or limit how long a benchmark runs, where // `--benchmark_min_time=N` or `MinTime(...)` should be used instead. - Benchmark* Iterations(size_t n); + Benchmark* Iterations(IterationCount n); // Specify the amount of times to repeat this benchmark. This option overrides // the `benchmark_repetitions` flag. @@ -818,13 +868,24 @@ class Benchmark { // Specify if each repetition of the benchmark should be reported separately // or if only the final statistics should be reported. If the benchmark // is not repeated then the single result is always reported. + // Applies to *ALL* reporters (display and file). Benchmark* ReportAggregatesOnly(bool value = true); - // If a particular benchmark is I/O bound, runs multiple threads internally or - // if for some reason CPU timings are not representative, call this method. If - // called, the elapsed time will be used to control how many iterations are - // run, and in the printing of items/second or MB/seconds values. If not - // called, the cpu time used by the benchmark will be used. + // Same as ReportAggregatesOnly(), but applies to display reporter only. 
+ Benchmark* DisplayAggregatesOnly(bool value = true); + + // By default, the CPU time is measured only for the main thread, which may + // be unrepresentative if the benchmark uses threads internally. If called, + // the total CPU time spent by all the threads will be measured instead. + // By default, the only the main thread CPU time will be measured. + Benchmark* MeasureProcessCPUTime(); + + // If a particular benchmark should use the Wall clock instead of the CPU time + // (be it either the CPU time of the main thread only (default), or the + // total CPU usage of the benchmark), call this method. If called, the elapsed + // (wall) time will be used to control how many iterations are run, and in the + // printing of items/second or MB/seconds values. + // If not called, the CPU time used by the benchmark will be used. Benchmark* UseRealTime(); // If a benchmark must measure time manually (e.g. if GPU execution time is @@ -879,9 +940,6 @@ class Benchmark { virtual void Run(State& state) = 0; - // Used inside the benchmark implementation - struct Instance; - protected: explicit Benchmark(const char* name); Benchmark(Benchmark const&); @@ -893,14 +951,15 @@ class Benchmark { friend class BenchmarkFamilies; std::string name_; - ReportMode report_mode_; - std::vector arg_names_; // Args for all benchmark runs + AggregationReportMode aggregation_report_mode_; + std::vector arg_names_; // Args for all benchmark runs std::vector > args_; // Args for all benchmark runs TimeUnit time_unit_; int range_multiplier_; double min_time_; - size_t iterations_; + IterationCount iterations_; int repetitions_; + bool measure_process_cpu_time_; bool use_real_time_; bool use_manual_time_; BigO complexity_; @@ -1119,7 +1178,7 @@ class Fixture : public internal::Benchmark { class BaseClass##_##Method##_Benchmark : public BaseClass { \ public: \ BaseClass##_##Method##_Benchmark() : BaseClass() { \ - this->SetName(#BaseClass"<" #a ">/" #Method); \ + this->SetName(#BaseClass "<" #a ">/" #Method); \ } \ \ protected: \ @@ -1130,7 +1189,7 @@ class Fixture : public internal::Benchmark { class BaseClass##_##Method##_Benchmark : public BaseClass { \ public: \ BaseClass##_##Method##_Benchmark() : BaseClass() { \ - this->SetName(#BaseClass"<" #a "," #b ">/" #Method); \ + this->SetName(#BaseClass "<" #a "," #b ">/" #Method); \ } \ \ protected: \ @@ -1142,14 +1201,15 @@ class Fixture : public internal::Benchmark { class BaseClass##_##Method##_Benchmark : public BaseClass<__VA_ARGS__> { \ public: \ BaseClass##_##Method##_Benchmark() : BaseClass<__VA_ARGS__>() { \ - this->SetName(#BaseClass"<" #__VA_ARGS__ ">/" #Method); \ + this->SetName(#BaseClass "<" #__VA_ARGS__ ">/" #Method); \ } \ \ protected: \ virtual void BenchmarkCase(::benchmark::State&); \ }; #else -#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(n, a) +#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) \ + BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(n, a) #endif #define BENCHMARK_DEFINE_F(BaseClass, Method) \ @@ -1169,7 +1229,8 @@ class Fixture : public internal::Benchmark { BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \ void BaseClass##_##Method##_Benchmark::BenchmarkCase #else -#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a) +#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) \ + BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a) #endif #define BENCHMARK_REGISTER_F(BaseClass, Method) \ @@ -1196,24 +1257,24 @@ class Fixture : public 
internal::Benchmark { void BaseClass##_##Method##_Benchmark::BenchmarkCase #ifdef BENCHMARK_HAS_CXX11 -#define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...) \ +#define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...) \ BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \ - BENCHMARK_REGISTER_F(BaseClass, Method); \ + BENCHMARK_REGISTER_F(BaseClass, Method); \ void BaseClass##_##Method##_Benchmark::BenchmarkCase #else -#define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) BENCHMARK_TEMPLATE1_F(BaseClass, Method, a) +#define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) \ + BENCHMARK_TEMPLATE1_F(BaseClass, Method, a) #endif // Helper macro to create a main routine in a test that runs the benchmarks -#define BENCHMARK_MAIN() \ - int main(int argc, char** argv) { \ - ::benchmark::Initialize(&argc, argv); \ +#define BENCHMARK_MAIN() \ + int main(int argc, char** argv) { \ + ::benchmark::Initialize(&argc, argv); \ if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \ - ::benchmark::RunSpecifiedBenchmarks(); \ - } \ + ::benchmark::RunSpecifiedBenchmarks(); \ + } \ int main(int, char**) - // ------------------------------------------------------ // Benchmark Reporters @@ -1231,6 +1292,7 @@ struct CPUInfo { double cycles_per_second; std::vector caches; bool scaling_enabled; + std::vector load_avg; static const CPUInfo& Get(); @@ -1239,6 +1301,33 @@ struct CPUInfo { BENCHMARK_DISALLOW_COPY_AND_ASSIGN(CPUInfo); }; +// Adding Struct for System Information +struct SystemInfo { + std::string name; + static const SystemInfo& Get(); + + private: + SystemInfo(); + BENCHMARK_DISALLOW_COPY_AND_ASSIGN(SystemInfo); +}; + +// BenchmarkName contains the components of the Benchmark's name +// which allows individual fields to be modified or cleared before +// building the final name using 'str()'. +struct BenchmarkName { + std::string function_name; + std::string args; + std::string min_time; + std::string iterations; + std::string repetitions; + std::string time_type; + std::string threads; + + // Return the full name of the benchmark with each non-empty + // field separated by a '/' + std::string str() const; +}; + // Interface for custom benchmark result printers. // By default, benchmark reports are printed to stdout. However an application // can control the destination of the reports by calling @@ -1248,35 +1337,48 @@ class BenchmarkReporter { public: struct Context { CPUInfo const& cpu_info; + SystemInfo const& sys_info; // The number of chars in the longest benchmark name. size_t name_field_width; - static const char *executable_name; + static const char* executable_name; Context(); }; struct Run { + static const int64_t no_repetition_index = -1; + enum RunType { RT_Iteration, RT_Aggregate }; + Run() - : error_occurred(false), + : run_type(RT_Iteration), + error_occurred(false), iterations(1), + threads(1), time_unit(kNanosecond), real_accumulated_time(0), cpu_accumulated_time(0), - bytes_per_second(0), - items_per_second(0), max_heapbytes_used(0), complexity(oNone), complexity_lambda(), complexity_n(0), report_big_o(false), report_rms(false), - counters() {} - - std::string benchmark_name; + counters(), + has_memory_result(false), + allocs_per_iter(0.0), + max_bytes_used(0) {} + + std::string benchmark_name() const; + BenchmarkName run_name; + RunType run_type; + std::string aggregate_name; std::string report_label; // Empty if not set by benchmark. 
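// (Illustrative note with a hypothetical name: a per-repetition row is
// run_type RT_Iteration with an empty aggregate_name, whereas a row printed
// as "BM_Foo/64_mean" is assembled from run_name "BM_Foo/64", run_type
// RT_Aggregate and aggregate_name "mean".)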
bool error_occurred; std::string error_message; - int64_t iterations; + IterationCount iterations; + int64_t threads; + int64_t repetition_index; + int64_t repetitions; TimeUnit time_unit; double real_accumulated_time; double cpu_accumulated_time; @@ -1293,10 +1395,6 @@ class BenchmarkReporter { // accumulated time. double GetAdjustedCPUTime() const; - // Zero if not set by benchmark. - double bytes_per_second; - double items_per_second; - // This is set to 0.0 if memory tracing is not enabled. double max_heapbytes_used; @@ -1306,13 +1404,18 @@ class BenchmarkReporter { int64_t complexity_n; // what statistics to compute from the measurements - const std::vector* statistics; + const std::vector* statistics; // Inform print function whether the current run is a complexity report bool report_big_o; bool report_rms; UserCounters counters; + + // Memory metrics. + bool has_memory_result; + double allocs_per_iter; + int64_t max_bytes_used; }; // Construct a BenchmarkReporter with the output stream set to 'std::cout' @@ -1373,17 +1476,19 @@ class BenchmarkReporter { // Simple reporter that outputs benchmark data to the console. This is the // default reporter used by RunSpecifiedBenchmarks(). class ConsoleReporter : public BenchmarkReporter { -public: + public: enum OutputOptions { OO_None = 0, OO_Color = 1, OO_Tabular = 2, - OO_ColorTabular = OO_Color|OO_Tabular, + OO_ColorTabular = OO_Color | OO_Tabular, OO_Defaults = OO_ColorTabular }; explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults) - : output_options_(opts_), name_field_width_(0), - prev_counters_(), printed_header_(false) {} + : output_options_(opts_), + name_field_width_(0), + prev_counters_(), + printed_header_(false) {} virtual bool ReportContext(const Context& context); virtual void ReportRuns(const std::vector& reports); @@ -1411,7 +1516,9 @@ class JSONReporter : public BenchmarkReporter { bool first_report_; }; -class CSVReporter : public BenchmarkReporter { +class BENCHMARK_DEPRECATED_MSG( + "The CSV Reporter will be removed in a future release") CSVReporter + : public BenchmarkReporter { public: CSVReporter() : printed_header_(false) {} virtual bool ReportContext(const Context& context); @@ -1421,7 +1528,30 @@ class CSVReporter : public BenchmarkReporter { void PrintRunData(const Run& report); bool printed_header_; - std::set< std::string > user_counter_names_; + std::set user_counter_names_; +}; + +// If a MemoryManager is registered, it can be used to collect and report +// allocation metrics for a run of the benchmark. +class MemoryManager { + public: + struct Result { + Result() : num_allocs(0), max_bytes_used(0) {} + + // The number of allocations made in total between Start and Stop. + int64_t num_allocs; + + // The peak memory use between Start and Stop. + int64_t max_bytes_used; + }; + + virtual ~MemoryManager() {} + + // Implement this to start recording allocation information. + virtual void Start() = 0; + + // Implement this to stop recording and fill out the given Result structure. 
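// (A sketch of a conforming implementation, names hypothetical: a manager
// wrapping a counting allocator could snapshot its counters in Start(),
// compute the deltas here in Stop() to fill result->num_allocs and
// result->max_bytes_used, and be installed once at startup via
// RegisterMemoryManager(&manager).)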
+ virtual void Stop(Result* result) = 0; }; inline const char* GetTimeUnitString(TimeUnit unit) { @@ -1431,9 +1561,9 @@ inline const char* GetTimeUnitString(TimeUnit unit) { case kMicrosecond: return "us"; case kNanosecond: - default: return "ns"; } + BENCHMARK_UNREACHABLE(); } inline double GetTimeUnitMultiplier(TimeUnit unit) { @@ -1443,11 +1573,11 @@ inline double GetTimeUnitMultiplier(TimeUnit unit) { case kMicrosecond: return 1e6; case kNanosecond: - default: return 1e9; } + BENCHMARK_UNREACHABLE(); } -} // namespace benchmark +} // namespace benchmark #endif // BENCHMARK_BENCHMARK_H_ diff --git a/lib/gbenchmark/src/CMakeLists.txt b/lib/gbenchmark/src/CMakeLists.txt index 836549e3ca..b47de6791c 100644 --- a/lib/gbenchmark/src/CMakeLists.txt +++ b/lib/gbenchmark/src/CMakeLists.txt @@ -1,4 +1,5 @@ # Allow the source files to find headers in src/ +include(GNUInstallDirs) include_directories(${PROJECT_SOURCE_DIR}/src) if (DEFINED BENCHMARK_CXX_LINKER_FLAGS) @@ -11,6 +12,10 @@ file(GLOB *.cc ${PROJECT_SOURCE_DIR}/include/benchmark/*.h ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +file(GLOB BENCHMARK_MAIN "benchmark_main.cc") +foreach(item ${BENCHMARK_MAIN}) + list(REMOVE_ITEM SOURCE_FILES "${item}") +endforeach() add_library(benchmark ${SOURCE_FILES}) set_target_properties(benchmark PROPERTIES @@ -29,6 +34,14 @@ if(LIBRT) target_link_libraries(benchmark ${LIBRT}) endif() +if(CMAKE_BUILD_TYPE) + string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER) +endif() +if(NOT CMAKE_THREAD_LIBS_INIT AND "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}" MATCHES ".*-fsanitize=[^ ]*address.*") + message(WARNING "CMake's FindThreads.cmake did not fail, but CMAKE_THREAD_LIBS_INIT ended up being empty. This was fixed in https://github.com/Kitware/CMake/commit/d53317130e84898c5328c237186dbd995aaf1c12 Let's guess that -pthread is sufficient.") + target_link_libraries(benchmark -pthread) +endif() + # We need extra libraries on Windows if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") target_link_libraries(benchmark Shlwapi) @@ -39,11 +52,18 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS") target_link_libraries(benchmark kstat) endif() -set(include_install_dir "include") -set(lib_install_dir "lib/") -set(bin_install_dir "bin/") -set(config_install_dir "lib/cmake/${PROJECT_NAME}") -set(pkgconfig_install_dir "lib/pkgconfig") +# Benchmark main library +add_library(benchmark_main "benchmark_main.cc") +set_target_properties(benchmark_main PROPERTIES + OUTPUT_NAME "benchmark_main" + VERSION ${GENERIC_LIB_VERSION} + SOVERSION ${GENERIC_LIB_SOVERSION} +) +target_include_directories(benchmark PUBLIC + $ + ) +target_link_libraries(benchmark_main benchmark) + set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated") @@ -56,7 +76,7 @@ set(namespace "${PROJECT_NAME}::") include(CMakePackageConfigHelpers) write_basic_package_version_file( - "${version_config}" VERSION ${GIT_VERSION} COMPATIBILITY SameMajorVersion + "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion ) configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY) @@ -65,28 +85,28 @@ configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ON if (BENCHMARK_ENABLE_INSTALL) # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable) install( - TARGETS benchmark + TARGETS benchmark benchmark_main EXPORT ${targets_export_name} - ARCHIVE DESTINATION ${lib_install_dir} - LIBRARY DESTINATION ${lib_install_dir} - RUNTIME DESTINATION ${bin_install_dir} - INCLUDES 
DESTINATION ${include_install_dir}) + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install( DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark" - DESTINATION ${include_install_dir} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.*h") install( FILES "${project_config}" "${version_config}" - DESTINATION "${config_install_dir}") + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") install( FILES "${pkg_config}" - DESTINATION "${pkgconfig_install_dir}") + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") install( EXPORT "${targets_export_name}" NAMESPACE "${namespace}" - DESTINATION "${config_install_dir}") + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") endif() diff --git a/lib/gbenchmark/src/benchmark.cc b/lib/gbenchmark/src/benchmark.cc index 82b15ac709..29bfa3512f 100644 --- a/lib/gbenchmark/src/benchmark.cc +++ b/lib/gbenchmark/src/benchmark.cc @@ -14,6 +14,7 @@ #include "benchmark/benchmark.h" #include "benchmark_api_internal.h" +#include "benchmark_runner.h" #include "internal_macros.h" #ifndef BENCHMARK_OS_WINDOWS @@ -34,6 +35,7 @@ #include #include #include +#include #include "check.h" #include "colorprint.h" @@ -55,9 +57,9 @@ DEFINE_bool(benchmark_list_tests, false, DEFINE_string(benchmark_filter, ".", "A regular expression that specifies the set of benchmarks " - "to execute. If this flag is empty, no benchmarks are run. " - "If this flag is the string \"all\", all benchmarks linked " - "into the process are run."); + "to execute. If this flag is empty, or if this flag is the " + "string \"all\", all benchmarks linked into the binary are " + "run."); DEFINE_double(benchmark_min_time, 0.5, "Minimum number of seconds we should run benchmark before " @@ -72,10 +74,19 @@ DEFINE_int32(benchmark_repetitions, 1, "The number of runs of each benchmark. If greater than 1, the " "mean and standard deviation of the runs will be reported."); -DEFINE_bool(benchmark_report_aggregates_only, false, - "Report the result of each benchmark repetitions. When 'true' is " - "specified only the mean, standard deviation, and other statistics " - "are reported for repeated benchmarks."); +DEFINE_bool( + benchmark_report_aggregates_only, false, + "Report the result of each benchmark repetitions. When 'true' is specified " + "only the mean, standard deviation, and other statistics are reported for " + "repeated benchmarks. Affects all reporters."); + +DEFINE_bool( + benchmark_display_aggregates_only, false, + "Display the result of each benchmark repetitions. When 'true' is " + "specified only the mean, standard deviation, and other statistics are " + "displayed for repeated benchmarks. Unlike " + "benchmark_report_aggregates_only, only affects the display reporter, but " + "*NOT* file reporter, which will still contain all the output."); DEFINE_string(benchmark_format, "console", "The format to use for console output. Valid values are " @@ -103,195 +114,15 @@ DEFINE_int32(v, 0, "The level of verbose logging to output"); namespace benchmark { -namespace { -static const size_t kMaxIterations = 1000000000; -} // end namespace - namespace internal { +// FIXME: wouldn't LTO mess this up? 
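// (Context, inferred from benchmark.h rather than stated here: this is the
// out-of-line sink used by the fallback DoNotOptimize(); routing a value's
// address through an opaque function taking a `char const volatile*` forces
// the compiler to treat the value as observed. The FIXME worries that
// link-time optimization could see through the call and defeat that.)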
void UseCharPointer(char const volatile*) {} -namespace { - -BenchmarkReporter::Run CreateRunReport( - const benchmark::internal::Benchmark::Instance& b, - const internal::ThreadManager::Result& results, - double seconds) { - // Create report about this benchmark run. - BenchmarkReporter::Run report; - - report.benchmark_name = b.name; - report.error_occurred = results.has_error_; - report.error_message = results.error_message_; - report.report_label = results.report_label_; - // This is the total iterations across all threads. - report.iterations = results.iterations; - report.time_unit = b.time_unit; - - if (!report.error_occurred) { - double bytes_per_second = 0; - if (results.bytes_processed > 0 && seconds > 0.0) { - bytes_per_second = (results.bytes_processed / seconds); - } - double items_per_second = 0; - if (results.items_processed > 0 && seconds > 0.0) { - items_per_second = (results.items_processed / seconds); - } - - if (b.use_manual_time) { - report.real_accumulated_time = results.manual_time_used; - } else { - report.real_accumulated_time = results.real_time_used; - } - report.cpu_accumulated_time = results.cpu_time_used; - report.bytes_per_second = bytes_per_second; - report.items_per_second = items_per_second; - report.complexity_n = results.complexity_n; - report.complexity = b.complexity; - report.complexity_lambda = b.complexity_lambda; - report.statistics = b.statistics; - report.counters = results.counters; - internal::Finish(&report.counters, seconds, b.threads); - } - return report; -} - -// Execute one thread of benchmark b for the specified number of iterations. -// Adds the stats collected for the thread into *total. -void RunInThread(const benchmark::internal::Benchmark::Instance* b, - size_t iters, int thread_id, - internal::ThreadManager* manager) { - internal::ThreadTimer timer; - State st(iters, b->arg, thread_id, b->threads, &timer, manager); - b->benchmark->Run(st); - CHECK(st.iterations() >= st.max_iterations) - << "Benchmark returned before State::KeepRunning() returned false!"; - { - MutexLock l(manager->GetBenchmarkMutex()); - internal::ThreadManager::Result& results = manager->results; - results.iterations += st.iterations(); - results.cpu_time_used += timer.cpu_time_used(); - results.real_time_used += timer.real_time_used(); - results.manual_time_used += timer.manual_time_used(); - results.bytes_processed += st.bytes_processed(); - results.items_processed += st.items_processed(); - results.complexity_n += st.complexity_length_n(); - internal::Increment(&results.counters, st.counters); - } - manager->NotifyThreadComplete(); -} - -std::vector RunBenchmark( - const benchmark::internal::Benchmark::Instance& b, - std::vector* complexity_reports) { - std::vector reports; // return value - - const bool has_explicit_iteration_count = b.iterations != 0; - size_t iters = has_explicit_iteration_count ? b.iterations : 1; - std::unique_ptr manager; - std::vector pool(b.threads - 1); - const int repeats = - b.repetitions != 0 ? b.repetitions : FLAGS_benchmark_repetitions; - const bool report_aggregates_only = - repeats != 1 && - (b.report_mode == internal::RM_Unspecified - ? 
FLAGS_benchmark_report_aggregates_only - : b.report_mode == internal::RM_ReportAggregatesOnly); - for (int repetition_num = 0; repetition_num < repeats; repetition_num++) { - for (;;) { - // Try benchmark - VLOG(2) << "Running " << b.name << " for " << iters << "\n"; - - manager.reset(new internal::ThreadManager(b.threads)); - for (std::size_t ti = 0; ti < pool.size(); ++ti) { - pool[ti] = std::thread(&RunInThread, &b, iters, - static_cast(ti + 1), manager.get()); - } - RunInThread(&b, iters, 0, manager.get()); - manager->WaitForAllThreads(); - for (std::thread& thread : pool) thread.join(); - internal::ThreadManager::Result results; - { - MutexLock l(manager->GetBenchmarkMutex()); - results = manager->results; - } - manager.reset(); - // Adjust real/manual time stats since they were reported per thread. - results.real_time_used /= b.threads; - results.manual_time_used /= b.threads; - - VLOG(2) << "Ran in " << results.cpu_time_used << "/" - << results.real_time_used << "\n"; - - // Base decisions off of real time if requested by this benchmark. - double seconds = results.cpu_time_used; - if (b.use_manual_time) { - seconds = results.manual_time_used; - } else if (b.use_real_time) { - seconds = results.real_time_used; - } - - const double min_time = - !IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time; - - // Determine if this run should be reported; Either it has - // run for a sufficient amount of time or because an error was reported. - const bool should_report = repetition_num > 0 - || has_explicit_iteration_count // An exact iteration count was requested - || results.has_error_ - || iters >= kMaxIterations // No chance to try again, we hit the limit. - || seconds >= min_time // the elapsed time is large enough - // CPU time is specified but the elapsed real time greatly exceeds the - // minimum time. Note that user provided timers are except from this - // sanity check. - || ((results.real_time_used >= 5 * min_time) && !b.use_manual_time); - - if (should_report) { - BenchmarkReporter::Run report = CreateRunReport(b, results, seconds); - if (!report.error_occurred && b.complexity != oNone) - complexity_reports->push_back(report); - reports.push_back(report); - break; - } - - // See how much iterations should be increased by - // Note: Avoid division by zero with max(seconds, 1ns). - double multiplier = min_time * 1.4 / std::max(seconds, 1e-9); - // If our last run was at least 10% of FLAGS_benchmark_min_time then we - // use the multiplier directly. Otherwise we use at most 10 times - // expansion. - // NOTE: When the last run was at least 10% of the min time the max - // expansion should be 14x. - bool is_significant = (seconds / min_time) > 0.1; - multiplier = is_significant ? 
multiplier : std::min(10.0, multiplier); - if (multiplier <= 1.0) multiplier = 2.0; - double next_iters = std::max(multiplier * iters, iters + 1.0); - if (next_iters > kMaxIterations) { - next_iters = kMaxIterations; - } - VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n"; - iters = static_cast(next_iters + 0.5); - } - } - // Calculate additional statistics - auto stat_reports = ComputeStats(reports); - if ((b.complexity != oNone) && b.last_benchmark_instance) { - auto additional_run_stats = ComputeBigO(*complexity_reports); - stat_reports.insert(stat_reports.end(), additional_run_stats.begin(), - additional_run_stats.end()); - complexity_reports->clear(); - } - - if (report_aggregates_only) reports.clear(); - reports.insert(reports.end(), stat_reports.begin(), stat_reports.end()); - return reports; -} - -} // namespace } // namespace internal -State::State(size_t max_iters, const std::vector& ranges, int thread_i, - int n_threads, internal::ThreadTimer* timer, +State::State(IterationCount max_iters, const std::vector& ranges, + int thread_i, int n_threads, internal::ThreadTimer* timer, internal::ThreadManager* manager) : total_iterations_(0), batch_leftover_(0), @@ -300,8 +131,6 @@ State::State(size_t max_iters, const std::vector& ranges, int thread_i, finished_(false), error_occurred_(false), range_(ranges), - bytes_processed_(0), - items_processed_(0), complexity_n_(0), counters(), thread_index(thread_i), @@ -317,15 +146,21 @@ State::State(size_t max_iters, const std::vector& ranges, int thread_i, // demonstrated since constexpr evaluation must diagnose all undefined // behavior). However, GCC and Clang also warn about this use of offsetof, // which must be suppressed. -#ifdef __GNUC__ +#if defined(__INTEL_COMPILER) +#pragma warning push +#pragma warning(disable:1875) +#elif defined(__GNUC__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Winvalid-offsetof" #endif // Offset tests to ensure commonly accessed data is on the first cache line. const int cache_line_size = 64; static_assert(offsetof(State, error_occurred_) <= - (cache_line_size - sizeof(error_occurred_)), ""); -#ifdef __GNUC__ + (cache_line_size - sizeof(error_occurred_)), + ""); +#if defined(__INTEL_COMPILER) +#pragma warning pop +#elif defined(__GNUC__) #pragma GCC diagnostic pop #endif } @@ -386,25 +221,25 @@ void State::FinishKeepRunning() { namespace internal { namespace { -void RunBenchmarks(const std::vector& benchmarks, - BenchmarkReporter* console_reporter, - BenchmarkReporter* file_reporter) { +void RunBenchmarks(const std::vector& benchmarks, + BenchmarkReporter* display_reporter, + BenchmarkReporter* file_reporter) { // Note the file_reporter can be null. - CHECK(console_reporter != nullptr); + CHECK(display_reporter != nullptr); // Determine the width of the name field using a minimum width of 10. 
- bool has_repetitions = FLAGS_benchmark_repetitions > 1; + bool might_have_aggregates = FLAGS_benchmark_repetitions > 1; size_t name_field_width = 10; size_t stat_field_width = 0; - for (const Benchmark::Instance& benchmark : benchmarks) { + for (const BenchmarkInstance& benchmark : benchmarks) { name_field_width = - std::max(name_field_width, benchmark.name.size()); - has_repetitions |= benchmark.repetitions > 1; + std::max(name_field_width, benchmark.name.str().size()); + might_have_aggregates |= benchmark.repetitions > 1; - for(const auto& Stat : *benchmark.statistics) + for (const auto& Stat : *benchmark.statistics) stat_field_width = std::max(stat_field_width, Stat.name_.size()); } - if (has_repetitions) name_field_width += 1 + stat_field_width; + if (might_have_aggregates) name_field_width += 1 + stat_field_width; // Print header here BenchmarkReporter::Context context; @@ -421,22 +256,36 @@ void RunBenchmarks(const std::vector& benchmarks, std::flush(reporter->GetErrorStream()); }; - if (console_reporter->ReportContext(context) && + if (display_reporter->ReportContext(context) && (!file_reporter || file_reporter->ReportContext(context))) { - flushStreams(console_reporter); + flushStreams(display_reporter); flushStreams(file_reporter); + for (const auto& benchmark : benchmarks) { - std::vector reports = - RunBenchmark(benchmark, &complexity_reports); - console_reporter->ReportRuns(reports); - if (file_reporter) file_reporter->ReportRuns(reports); - flushStreams(console_reporter); + RunResults run_results = RunBenchmark(benchmark, &complexity_reports); + + auto report = [&run_results](BenchmarkReporter* reporter, + bool report_aggregates_only) { + assert(reporter); + // If there are no aggregates, do output non-aggregates. + report_aggregates_only &= !run_results.aggregates_only.empty(); + if (!report_aggregates_only) + reporter->ReportRuns(run_results.non_aggregates); + if (!run_results.aggregates_only.empty()) + reporter->ReportRuns(run_results.aggregates_only); + }; + + report(display_reporter, run_results.display_report_aggregates_only); + if (file_reporter) + report(file_reporter, run_results.file_report_aggregates_only); + + flushStreams(display_reporter); flushStreams(file_reporter); } } - console_reporter->Finalize(); + display_reporter->Finalize(); if (file_reporter) file_reporter->Finalize(); - flushStreams(console_reporter); + flushStreams(display_reporter); flushStreams(file_reporter); } @@ -463,21 +312,26 @@ bool IsZero(double n) { ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) { int output_opts = ConsoleReporter::OO_Defaults; - if ((FLAGS_benchmark_color == "auto" && IsColorTerminal()) || - IsTruthyFlagValue(FLAGS_benchmark_color)) { + auto is_benchmark_color = [force_no_color] () -> bool { + if (force_no_color) { + return false; + } + if (FLAGS_benchmark_color == "auto") { + return IsColorTerminal(); + } + return IsTruthyFlagValue(FLAGS_benchmark_color); + }; + if (is_benchmark_color()) { output_opts |= ConsoleReporter::OO_Color; } else { output_opts &= ~ConsoleReporter::OO_Color; } - if(force_no_color) { - output_opts &= ~ConsoleReporter::OO_Color; - } - if(FLAGS_benchmark_counters_tabular) { + if (FLAGS_benchmark_counters_tabular) { output_opts |= ConsoleReporter::OO_Tabular; } else { output_opts &= ~ConsoleReporter::OO_Tabular; } - return static_cast< ConsoleReporter::OutputOptions >(output_opts); + return static_cast(output_opts); } } // end namespace internal @@ -486,11 +340,11 @@ size_t RunSpecifiedBenchmarks() { return 
RunSpecifiedBenchmarks(nullptr, nullptr); } -size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter) { - return RunSpecifiedBenchmarks(console_reporter, nullptr); +size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter) { + return RunSpecifiedBenchmarks(display_reporter, nullptr); } -size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter, +size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, BenchmarkReporter* file_reporter) { std::string spec = FLAGS_benchmark_filter; if (spec.empty() || spec == "all") @@ -498,15 +352,15 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter, // Setup the reporters std::ofstream output_file; - std::unique_ptr default_console_reporter; + std::unique_ptr default_display_reporter; std::unique_ptr default_file_reporter; - if (!console_reporter) { - default_console_reporter = internal::CreateReporter( - FLAGS_benchmark_format, internal::GetOutputOptions()); - console_reporter = default_console_reporter.get(); + if (!display_reporter) { + default_display_reporter = internal::CreateReporter( + FLAGS_benchmark_format, internal::GetOutputOptions()); + display_reporter = default_display_reporter.get(); } - auto& Out = console_reporter->GetOutputStream(); - auto& Err = console_reporter->GetErrorStream(); + auto& Out = display_reporter->GetOutputStream(); + auto& Err = display_reporter->GetErrorStream(); std::string const& fname = FLAGS_benchmark_out; if (fname.empty() && file_reporter) { @@ -530,7 +384,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter, file_reporter->SetErrorStream(&output_file); } - std::vector benchmarks; + std::vector benchmarks; if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0; if (benchmarks.empty()) { @@ -539,14 +393,19 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter, } if (FLAGS_benchmark_list_tests) { - for (auto const& benchmark : benchmarks) Out << benchmark.name << "\n"; + for (auto const& benchmark : benchmarks) + Out << benchmark.name.str() << "\n"; } else { - internal::RunBenchmarks(benchmarks, console_reporter, file_reporter); + internal::RunBenchmarks(benchmarks, display_reporter, file_reporter); } return benchmarks.size(); } +void RegisterMemoryManager(MemoryManager* manager) { + internal::memory_manager = manager; +} + namespace internal { void PrintUsageAndExit() { @@ -556,7 +415,8 @@ void PrintUsageAndExit() { " [--benchmark_filter=]\n" " [--benchmark_min_time=]\n" " [--benchmark_repetitions=]\n" - " [--benchmark_report_aggregates_only={true|false}\n" + " [--benchmark_report_aggregates_only={true|false}]\n" + " [--benchmark_display_aggregates_only={true|false}]\n" " [--benchmark_format=]\n" " [--benchmark_out=]\n" " [--benchmark_out_format=]\n" @@ -568,7 +428,8 @@ void PrintUsageAndExit() { void ParseCommandLineFlags(int* argc, char** argv) { using namespace benchmark; - BenchmarkReporter::Context::executable_name = argv[0]; + BenchmarkReporter::Context::executable_name = + (argc && *argc > 0) ? 
argv[0] : "unknown"; for (int i = 1; i < *argc; ++i) { if (ParseBoolFlag(argv[i], "benchmark_list_tests", &FLAGS_benchmark_list_tests) || @@ -579,6 +440,8 @@ void ParseCommandLineFlags(int* argc, char** argv) { &FLAGS_benchmark_repetitions) || ParseBoolFlag(argv[i], "benchmark_report_aggregates_only", &FLAGS_benchmark_report_aggregates_only) || + ParseBoolFlag(argv[i], "benchmark_display_aggregates_only", + &FLAGS_benchmark_display_aggregates_only) || ParseStringFlag(argv[i], "benchmark_format", &FLAGS_benchmark_format) || ParseStringFlag(argv[i], "benchmark_out", &FLAGS_benchmark_out) || ParseStringFlag(argv[i], "benchmark_out_format", @@ -588,7 +451,7 @@ void ParseCommandLineFlags(int* argc, char** argv) { // TODO: Remove this. ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) || ParseBoolFlag(argv[i], "benchmark_counters_tabular", - &FLAGS_benchmark_counters_tabular) || + &FLAGS_benchmark_counters_tabular) || ParseInt32Flag(argv[i], "v", &FLAGS_v)) { for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1]; @@ -622,7 +485,8 @@ void Initialize(int* argc, char** argv) { bool ReportUnrecognizedArguments(int argc, char** argv) { for (int i = 1; i < argc; ++i) { - fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0], argv[i]); + fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0], + argv[i]); } return argc > 1; } diff --git a/lib/gbenchmark/src/benchmark_api_internal.cc b/lib/gbenchmark/src/benchmark_api_internal.cc new file mode 100644 index 0000000000..d468a257e3 --- /dev/null +++ b/lib/gbenchmark/src/benchmark_api_internal.cc @@ -0,0 +1,15 @@ +#include "benchmark_api_internal.h" + +namespace benchmark { +namespace internal { + +State BenchmarkInstance::Run(IterationCount iters, int thread_id, + internal::ThreadTimer* timer, + internal::ThreadManager* manager) const { + State st(iters, arg, thread_id, threads, timer, manager); + benchmark->Run(st); + return st; +} + +} // internal +} // benchmark diff --git a/lib/gbenchmark/src/benchmark_api_internal.h b/lib/gbenchmark/src/benchmark_api_internal.h index dd7a3ffe8c..264eff95c5 100644 --- a/lib/gbenchmark/src/benchmark_api_internal.h +++ b/lib/gbenchmark/src/benchmark_api_internal.h @@ -2,10 +2,12 @@ #define BENCHMARK_API_INTERNAL_H #include "benchmark/benchmark.h" +#include "commandlineflags.h" #include #include #include +#include #include #include @@ -13,13 +15,14 @@ namespace benchmark { namespace internal { // Information kept per benchmark we may want to run -struct Benchmark::Instance { - std::string name; +struct BenchmarkInstance { + BenchmarkName name; Benchmark* benchmark; - ReportMode report_mode; + AggregationReportMode aggregation_report_mode; std::vector arg; TimeUnit time_unit; int range_multiplier; + bool measure_process_cpu_time; bool use_real_time; bool use_manual_time; BigO complexity; @@ -29,12 +32,15 @@ struct Benchmark::Instance { bool last_benchmark_instance; int repetitions; double min_time; - size_t iterations; + IterationCount iterations; int threads; // Number of concurrent threads to us + + State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer, + internal::ThreadManager* manager) const; }; bool FindBenchmarksInternal(const std::string& re, - std::vector* benchmarks, + std::vector* benchmarks, std::ostream* Err); bool IsZero(double n); diff --git a/lib/gbenchmark/src/benchmark_main.cc b/lib/gbenchmark/src/benchmark_main.cc new file mode 100644 index 0000000000..b3b2478314 --- /dev/null +++ b/lib/gbenchmark/src/benchmark_main.cc @@ 
-0,0 +1,17 @@ +// Copyright 2018 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "benchmark/benchmark.h" + +BENCHMARK_MAIN(); diff --git a/lib/gbenchmark/src/benchmark_name.cc b/lib/gbenchmark/src/benchmark_name.cc new file mode 100644 index 0000000000..2a17ebce27 --- /dev/null +++ b/lib/gbenchmark/src/benchmark_name.cc @@ -0,0 +1,58 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +namespace benchmark { + +namespace { + +// Compute the total size of a pack of std::strings +size_t size_impl() { return 0; } + +template +size_t size_impl(const Head& head, const Tail&... tail) { + return head.size() + size_impl(tail...); +} + +// Join a pack of std::strings using a delimiter +// TODO: use absl::StrJoin +void join_impl(std::string&, char) {} + +template +void join_impl(std::string& s, const char delimiter, const Head& head, + const Tail&... tail) { + if (!s.empty() && !head.empty()) { + s += delimiter; + } + + s += head; + + join_impl(s, delimiter, tail...); +} + +template +std::string join(char delimiter, const Ts&... ts) { + std::string s; + s.reserve(sizeof...(Ts) + size_impl(ts...)); + join_impl(s, delimiter, ts...); + return s; +} +} // namespace + +std::string BenchmarkName::str() const { + return join('/', function_name, args, min_time, iterations, repetitions, + time_type, threads); +} +} // namespace benchmark diff --git a/lib/gbenchmark/src/benchmark_register.cc b/lib/gbenchmark/src/benchmark_register.cc index 4fea6d915f..6696c382b8 100644 --- a/lib/gbenchmark/src/benchmark_register.cc +++ b/lib/gbenchmark/src/benchmark_register.cc @@ -34,6 +34,9 @@ #include #include +#define __STDC_FORMAT_MACROS +#include + #include "benchmark/benchmark.h" #include "benchmark_api_internal.h" #include "check.h" @@ -77,8 +80,8 @@ class BenchmarkFamilies { // Extract the list of benchmark instances that match the specified // regular expression. 
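// As of this change the spec may additionally start with '-', which inverts
// the match; e.g. running a binary with --benchmark_filter=-BM_Slow.*
// executes every benchmark except those whose full name matches BM_Slow.*
// (BM_Slow is a hypothetical prefix used for illustration).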
- bool FindBenchmarks(const std::string& re, - std::vector* benchmarks, + bool FindBenchmarks(std::string re, + std::vector* benchmarks, std::ostream* Err); private: @@ -107,13 +110,18 @@ void BenchmarkFamilies::ClearBenchmarks() { } bool BenchmarkFamilies::FindBenchmarks( - const std::string& spec, std::vector* benchmarks, + std::string spec, std::vector* benchmarks, std::ostream* ErrStream) { CHECK(ErrStream); auto& Err = *ErrStream; // Make regular expression out of command-line flag std::string error_msg; Regex re; + bool isNegativeFilter = false; + if (spec[0] == '-') { + spec.replace(0, 1, ""); + isNegativeFilter = true; + } if (!re.Init(spec, &error_msg)) { Err << "Could not compile benchmark re: " << error_msg << std::endl; return false; @@ -147,16 +155,17 @@ bool BenchmarkFamilies::FindBenchmarks( for (auto const& args : family->args_) { for (int num_threads : *thread_counts) { - Benchmark::Instance instance; - instance.name = family->name_; + BenchmarkInstance instance; + instance.name.function_name = family->name_; instance.benchmark = family.get(); - instance.report_mode = family->report_mode_; + instance.aggregation_report_mode = family->aggregation_report_mode_; instance.arg = args; instance.time_unit = family->time_unit_; instance.range_multiplier = family->range_multiplier_; instance.min_time = family->min_time_; instance.iterations = family->iterations_; instance.repetitions = family->repetitions_; + instance.measure_process_cpu_time = family->measure_process_cpu_time_; instance.use_real_time = family->use_real_time_; instance.use_manual_time = family->use_manual_time_; instance.complexity = family->complexity_; @@ -167,39 +176,57 @@ bool BenchmarkFamilies::FindBenchmarks( // Add arguments to instance name size_t arg_i = 0; for (auto const& arg : args) { - instance.name += "/"; + if (!instance.name.args.empty()) { + instance.name.args += '/'; + } if (arg_i < family->arg_names_.size()) { const auto& arg_name = family->arg_names_[arg_i]; if (!arg_name.empty()) { - instance.name += - StrFormat("%s:", family->arg_names_[arg_i].c_str()); + instance.name.args += StrFormat("%s:", arg_name.c_str()); } } - instance.name += StrFormat("%d", arg); + instance.name.args += StrFormat("%" PRId64, arg); ++arg_i; } if (!IsZero(family->min_time_)) - instance.name += StrFormat("/min_time:%0.3f", family->min_time_); - if (family->iterations_ != 0) - instance.name += StrFormat("/iterations:%d", family->iterations_); + instance.name.min_time = + StrFormat("min_time:%0.3f", family->min_time_); + if (family->iterations_ != 0) { + instance.name.iterations = + StrFormat("iterations:%lu", + static_cast(family->iterations_)); + } if (family->repetitions_ != 0) - instance.name += StrFormat("/repeats:%d", family->repetitions_); + instance.name.repetitions = + StrFormat("repeats:%d", family->repetitions_); + + if (family->measure_process_cpu_time_) { + instance.name.time_type = "process_time"; + } if (family->use_manual_time_) { - instance.name += "/manual_time"; + if (!instance.name.time_type.empty()) { + instance.name.time_type += '/'; + } + instance.name.time_type += "manual_time"; } else if (family->use_real_time_) { - instance.name += "/real_time"; + if (!instance.name.time_type.empty()) { + instance.name.time_type += '/'; + } + instance.name.time_type += "real_time"; } // Add the number of threads used to the name if (!family->thread_counts_.empty()) { - instance.name += StrFormat("/threads:%d", instance.threads); + instance.name.threads = StrFormat("threads:%d", instance.threads); } - if 
(re.Match(instance.name)) { + const auto full_name = instance.name.str(); + if ((re.Match(full_name) && !isNegativeFilter) || + (!re.Match(full_name) && isNegativeFilter)) { instance.last_benchmark_instance = (&args == &family->args_.back()); benchmarks->push_back(std::move(instance)); } @@ -219,7 +246,7 @@ Benchmark* RegisterBenchmarkInternal(Benchmark* bench) { // FIXME: This function is a hack so that benchmark.cc can access // `BenchmarkFamilies` bool FindBenchmarksInternal(const std::string& re, - std::vector* benchmarks, + std::vector* benchmarks, std::ostream* Err) { return BenchmarkFamilies::GetInstance()->FindBenchmarks(re, benchmarks, Err); } @@ -230,12 +257,13 @@ bool FindBenchmarksInternal(const std::string& re, Benchmark::Benchmark(const char* name) : name_(name), - report_mode_(RM_Unspecified), + aggregation_report_mode_(ARM_Unspecified), time_unit_(kNanosecond), range_multiplier_(kRangeMultiplier), min_time_(0), iterations_(0), repetitions_(0), + measure_process_cpu_time_(false), use_real_time_(false), use_manual_time_(false), complexity_(oNone), @@ -317,7 +345,6 @@ Benchmark* Benchmark::ArgNames(const std::vector& names) { Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) { CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); - CHECK_GE(start, 0); CHECK_LE(start, limit); for (int64_t arg = start; arg <= limit; arg += step) { args_.push_back({arg}); @@ -349,7 +376,7 @@ Benchmark* Benchmark::MinTime(double t) { return this; } -Benchmark* Benchmark::Iterations(size_t n) { +Benchmark* Benchmark::Iterations(IterationCount n) { CHECK(n > 0); CHECK(IsZero(min_time_)); iterations_ = n; @@ -363,7 +390,29 @@ Benchmark* Benchmark::Repetitions(int n) { } Benchmark* Benchmark::ReportAggregatesOnly(bool value) { - report_mode_ = value ? RM_ReportAggregatesOnly : RM_Default; + aggregation_report_mode_ = value ? ARM_ReportAggregatesOnly : ARM_Default; + return this; +} + +Benchmark* Benchmark::DisplayAggregatesOnly(bool value) { + // If we were called, the report mode is no longer 'unspecified', in any case. + aggregation_report_mode_ = static_cast( + aggregation_report_mode_ | ARM_Default); + + if (value) { + aggregation_report_mode_ = static_cast( + aggregation_report_mode_ | ARM_DisplayReportAggregatesOnly); + } else { + aggregation_report_mode_ = static_cast( + aggregation_report_mode_ & ~ARM_DisplayReportAggregatesOnly); + } + + return this; +} + +Benchmark* Benchmark::MeasureProcessCPUTime() { + // Can be used together with UseRealTime() / UseManualTime(). + measure_process_cpu_time_ = true; return this; } diff --git a/lib/gbenchmark/src/benchmark_register.h b/lib/gbenchmark/src/benchmark_register.h index 0705e219f2..61377d7423 100644 --- a/lib/gbenchmark/src/benchmark_register.h +++ b/lib/gbenchmark/src/benchmark_register.h @@ -5,29 +5,103 @@ #include "check.h" +namespace benchmark { +namespace internal { + +// Append the powers of 'mult' in the closed interval [lo, hi]. +// Returns iterator to the start of the inserted range. 
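// For example, with T = int, AddPowers(&v, 2, 64, 4) appends 4, 16 and 64
// (the powers of 4 that lie inside [2, 64]; 1 is skipped because it is below
// lo) and returns an iterator pointing at the first appended element.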
template -void AddRange(std::vector* dst, T lo, T hi, int mult) { +typename std::vector::iterator +AddPowers(std::vector* dst, T lo, T hi, int mult) { CHECK_GE(lo, 0); CHECK_GE(hi, lo); CHECK_GE(mult, 2); - // Add "lo" - dst->push_back(lo); + const size_t start_offset = dst->size(); static const T kmax = std::numeric_limits::max(); - // Now space out the benchmarks in multiples of "mult" - for (T i = 1; i < kmax / mult; i *= mult) { - if (i >= hi) break; - if (i > lo) { + // Space out the values in multiples of "mult" + for (T i = 1; i <= hi; i *= mult) { + if (i >= lo) { dst->push_back(i); } + // Break the loop here since multiplying by + // 'mult' would move outside of the range of T + if (i > kmax / mult) break; + } + + return dst->begin() + start_offset; +} + +template +void AddNegatedPowers(std::vector* dst, T lo, T hi, int mult) { + // We negate lo and hi so we require that they cannot be equal to 'min'. + CHECK_GT(lo, std::numeric_limits::min()); + CHECK_GT(hi, std::numeric_limits::min()); + CHECK_GE(hi, lo); + CHECK_LE(hi, 0); + + // Add positive powers, then negate and reverse. + // Casts necessary since small integers get promoted + // to 'int' when negating. + const auto lo_complement = static_cast(-lo); + const auto hi_complement = static_cast(-hi); + + const auto it = AddPowers(dst, hi_complement, lo_complement, mult); + + std::for_each(it, dst->end(), [](T& t) { t *= -1; }); + std::reverse(it, dst->end()); +} + +template +void AddRange(std::vector* dst, T lo, T hi, int mult) { + static_assert(std::is_integral::value && std::is_signed::value, + "Args type must be a signed integer"); + + CHECK_GE(hi, lo); + CHECK_GE(mult, 2); + + // Add "lo" + dst->push_back(lo); + + // Handle lo == hi as a special case, so we then know + // lo < hi and so it is safe to add 1 to lo and subtract 1 + // from hi without falling outside of the range of T. + if (lo == hi) return; + + // Ensure that lo_inner <= hi_inner below. + if (lo + 1 == hi) { + dst->push_back(hi); + return; } - // Add "hi" (if different from "lo") - if (hi != lo) { + // Add all powers of 'mult' in the range [lo+1, hi-1] (inclusive). + const auto lo_inner = static_cast(lo + 1); + const auto hi_inner = static_cast(hi - 1); + + // Insert negative values + if (lo_inner < 0) { + AddNegatedPowers(dst, lo_inner, std::min(hi_inner, T{-1}), mult); + } + + // Treat 0 as a special case (see discussion on #762). + if (lo <= 0 && hi >= 0) { + dst->push_back(0); + } + + // Insert positive values + if (hi_inner > 0) { + AddPowers(dst, std::max(lo_inner, T{1}), hi_inner, mult); + } + + // Add "hi" (if different from last value). + if (hi != dst->back()) { dst->push_back(hi); } } +} // namespace internal +} // namespace benchmark + #endif // BENCHMARK_REGISTER_H diff --git a/lib/gbenchmark/src/benchmark_runner.cc b/lib/gbenchmark/src/benchmark_runner.cc new file mode 100644 index 0000000000..f770190223 --- /dev/null +++ b/lib/gbenchmark/src/benchmark_runner.cc @@ -0,0 +1,361 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "benchmark_runner.h" +#include "benchmark/benchmark.h" +#include "benchmark_api_internal.h" +#include "internal_macros.h" + +#ifndef BENCHMARK_OS_WINDOWS +#ifndef BENCHMARK_OS_FUCHSIA +#include +#endif +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "check.h" +#include "colorprint.h" +#include "commandlineflags.h" +#include "complexity.h" +#include "counter.h" +#include "internal_macros.h" +#include "log.h" +#include "mutex.h" +#include "re.h" +#include "statistics.h" +#include "string_util.h" +#include "thread_manager.h" +#include "thread_timer.h" + +namespace benchmark { + +namespace internal { + +MemoryManager* memory_manager = nullptr; + +namespace { + +static constexpr IterationCount kMaxIterations = 1000000000; + +BenchmarkReporter::Run CreateRunReport( + const benchmark::internal::BenchmarkInstance& b, + const internal::ThreadManager::Result& results, + IterationCount memory_iterations, + const MemoryManager::Result& memory_result, double seconds, + int64_t repetition_index) { + // Create report about this benchmark run. + BenchmarkReporter::Run report; + + report.run_name = b.name; + report.error_occurred = results.has_error_; + report.error_message = results.error_message_; + report.report_label = results.report_label_; + // This is the total iterations across all threads. + report.iterations = results.iterations; + report.time_unit = b.time_unit; + report.threads = b.threads; + report.repetition_index = repetition_index; + report.repetitions = b.repetitions; + + if (!report.error_occurred) { + if (b.use_manual_time) { + report.real_accumulated_time = results.manual_time_used; + } else { + report.real_accumulated_time = results.real_time_used; + } + report.cpu_accumulated_time = results.cpu_time_used; + report.complexity_n = results.complexity_n; + report.complexity = b.complexity; + report.complexity_lambda = b.complexity_lambda; + report.statistics = b.statistics; + report.counters = results.counters; + + if (memory_iterations > 0) { + report.has_memory_result = true; + report.allocs_per_iter = + memory_iterations ? static_cast(memory_result.num_allocs) / + memory_iterations + : 0; + report.max_bytes_used = memory_result.max_bytes_used; + } + + internal::Finish(&report.counters, results.iterations, seconds, b.threads); + } + return report; +} + +// Execute one thread of benchmark b for the specified number of iterations. +// Adds the stats collected for the thread into *total. +void RunInThread(const BenchmarkInstance* b, IterationCount iters, + int thread_id, ThreadManager* manager) { + internal::ThreadTimer timer( + b->measure_process_cpu_time + ? 
internal::ThreadTimer::CreateProcessCpuTime() + : internal::ThreadTimer::Create()); + State st = b->Run(iters, thread_id, &timer, manager); + CHECK(st.iterations() >= st.max_iterations) + << "Benchmark returned before State::KeepRunning() returned false!"; + { + MutexLock l(manager->GetBenchmarkMutex()); + internal::ThreadManager::Result& results = manager->results; + results.iterations += st.iterations(); + results.cpu_time_used += timer.cpu_time_used(); + results.real_time_used += timer.real_time_used(); + results.manual_time_used += timer.manual_time_used(); + results.complexity_n += st.complexity_length_n(); + internal::Increment(&results.counters, st.counters); + } + manager->NotifyThreadComplete(); +} + +class BenchmarkRunner { + public: + BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_, + std::vector* complexity_reports_) + : b(b_), + complexity_reports(*complexity_reports_), + min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time), + repeats(b.repetitions != 0 ? b.repetitions + : FLAGS_benchmark_repetitions), + has_explicit_iteration_count(b.iterations != 0), + pool(b.threads - 1), + iters(has_explicit_iteration_count ? b.iterations : 1) { + run_results.display_report_aggregates_only = + (FLAGS_benchmark_report_aggregates_only || + FLAGS_benchmark_display_aggregates_only); + run_results.file_report_aggregates_only = + FLAGS_benchmark_report_aggregates_only; + if (b.aggregation_report_mode != internal::ARM_Unspecified) { + run_results.display_report_aggregates_only = + (b.aggregation_report_mode & + internal::ARM_DisplayReportAggregatesOnly); + run_results.file_report_aggregates_only = + (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly); + } + + for (int repetition_num = 0; repetition_num < repeats; repetition_num++) { + DoOneRepetition(repetition_num); + } + + // Calculate additional statistics + run_results.aggregates_only = ComputeStats(run_results.non_aggregates); + + // Maybe calculate complexity report + if ((b.complexity != oNone) && b.last_benchmark_instance) { + auto additional_run_stats = ComputeBigO(complexity_reports); + run_results.aggregates_only.insert(run_results.aggregates_only.end(), + additional_run_stats.begin(), + additional_run_stats.end()); + complexity_reports.clear(); + } + } + + RunResults&& get_results() { return std::move(run_results); } + + private: + RunResults run_results; + + const benchmark::internal::BenchmarkInstance& b; + std::vector& complexity_reports; + + const double min_time; + const int repeats; + const bool has_explicit_iteration_count; + + std::vector pool; + + IterationCount iters; // preserved between repetitions! + // So only the first repetition has to find/calculate it, + // the other repetitions will just use that precomputed iteration count. + + struct IterationResults { + internal::ThreadManager::Result results; + IterationCount iters; + double seconds; + }; + IterationResults DoNIterations() { + VLOG(2) << "Running " << b.name.str() << " for " << iters << "\n"; + + std::unique_ptr manager; + manager.reset(new internal::ThreadManager(b.threads)); + + // Run all but one thread in separate threads + for (std::size_t ti = 0; ti < pool.size(); ++ti) { + pool[ti] = std::thread(&RunInThread, &b, iters, static_cast(ti + 1), + manager.get()); + } + // And run one thread here directly. + // (If we were asked to run just one thread, we don't create new threads.) + // Yes, we need to do this here *after* we start the separate threads. 
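// (The calling thread itself acts as thread 0, which is why the pool above
// holds only b.threads - 1 workers; and since RunInThread() blocks until its
// shard completes, it can only be called once the workers have been launched.)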
+    RunInThread(&b, iters, 0, manager.get());
+
+    // The main thread has finished. Now let's wait for the other threads.
+    manager->WaitForAllThreads();
+    for (std::thread& thread : pool) thread.join();
+
+    IterationResults i;
+    // Acquire the measurements/counters from the manager, UNDER THE LOCK!
+    {
+      MutexLock l(manager->GetBenchmarkMutex());
+      i.results = manager->results;
+    }
+
+    // And get rid of the manager.
+    manager.reset();
+
+    // Adjust real/manual time stats since they were reported per thread.
+    i.results.real_time_used /= b.threads;
+    i.results.manual_time_used /= b.threads;
+    // If we were measuring whole-process CPU usage, adjust the CPU time too.
+    if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads;
+
+    VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
+            << i.results.real_time_used << "\n";
+
+    // So for how long were we running?
+    i.iters = iters;
+    // Base decisions off of real time if requested by this benchmark.
+    i.seconds = i.results.cpu_time_used;
+    if (b.use_manual_time) {
+      i.seconds = i.results.manual_time_used;
+    } else if (b.use_real_time) {
+      i.seconds = i.results.real_time_used;
+    }
+
+    return i;
+  }
+
+  IterationCount PredictNumItersNeeded(const IterationResults& i) const {
+    // See by how much the iteration count should be increased.
+    // Note: Avoid division by zero with max(seconds, 1ns).
+    double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
+    // If our last run was at least 10% of FLAGS_benchmark_min_time then we
+    // use the multiplier directly.
+    // Otherwise we use at most 10 times expansion.
+    // NOTE: When the last run was at least 10% of the min time the max
+    // expansion should be 14x.
+    bool is_significant = (i.seconds / min_time) > 0.1;
+    multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
+    if (multiplier <= 1.0) multiplier = 2.0;
+
+    // So what seems to be the sufficiently-large iteration count? Round up.
+    const IterationCount max_next_iters = static_cast<IterationCount>(
+        0.5 + std::max(multiplier * i.iters, i.iters + 1.0));
+    // But we do have *some* sanity limits though.
+    const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
+
+    VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+    return next_iters;  // round up before conversion to integer.
+  }
+
+  bool ShouldReportIterationResults(const IterationResults& i) const {
+    // Determine if this run should be reported;
+    // either it has run for a sufficient amount of time
+    // or an error was reported.
+    return i.results.has_error_ ||
+           i.iters >= kMaxIterations ||  // Too many iterations already.
+           i.seconds >= min_time ||      // The elapsed time is large enough.
+           // CPU time is specified but the elapsed real time greatly exceeds
+           // the minimum time.
+           // Note that user-provided timers are exempt from this sanity check.
+           ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time);
+  }
+
+  void DoOneRepetition(int64_t repetition_index) {
+    const bool is_the_first_repetition = repetition_index == 0;
+    IterationResults i;
+
+    // We *may* be gradually increasing the length (iteration count)
+    // of the benchmark until we decide the results are significant.
+    // And once we do, we report those last results and exit.
+    // Please do note that if there are repetitions, the iteration count
+    // is *only* calculated for the *first* repetition, and other repetitions
+    // simply use that precomputed iteration count.
+    for (;;) {
+      i = DoNIterations();
+
+      // Do we consider the results to be significant?
+      // If we are doing repetitions, and the first repetition was already
+      // done, it has calculated the correct iteration count, so we have run
+      // that very iteration count just now. No need to calculate anything.
+      // Just report. Else, the normal rules apply.
+      const bool results_are_significant = !is_the_first_repetition ||
+                                           has_explicit_iteration_count ||
+                                           ShouldReportIterationResults(i);
+
+      if (results_are_significant) break;  // Good, let's report them!
+
+      // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
+      // iteration count, and run the benchmark again...
+
+      iters = PredictNumItersNeeded(i);
+      assert(iters > i.iters &&
+             "if we did more iterations than we want to do the next time, "
+             "then we should have accepted the current iteration run.");
+    }
+
+    // Oh, one last thing: we also need to produce the 'memory measurements'.
+    MemoryManager::Result memory_result;
+    IterationCount memory_iterations = 0;
+    if (memory_manager != nullptr) {
+      // Only run a few iterations to reduce the impact of one-time
+      // allocations in benchmarks that are not properly managed.
+      memory_iterations = std::min<IterationCount>(16, iters);
+      memory_manager->Start();
+      std::unique_ptr<internal::ThreadManager> manager;
+      manager.reset(new internal::ThreadManager(1));
+      RunInThread(&b, memory_iterations, 0, manager.get());
+      manager->WaitForAllThreads();
+      manager.reset();
+
+      memory_manager->Stop(&memory_result);
+    }
+
+    // Ok, now actually report.
+    BenchmarkReporter::Run report =
+        CreateRunReport(b, i.results, memory_iterations, memory_result,
+                        i.seconds, repetition_index);
+
+    if (!report.error_occurred && b.complexity != oNone)
+      complexity_reports.push_back(report);
+
+    run_results.non_aggregates.push_back(report);
+  }
+};
+
+}  // end namespace
+
+RunResults RunBenchmark(
+    const benchmark::internal::BenchmarkInstance& b,
+    std::vector<BenchmarkReporter::Run>* complexity_reports) {
+  internal::BenchmarkRunner r(b, complexity_reports);
+  return r.get_results();
+}
+
+}  // end namespace internal
+
+}  // end namespace benchmark diff --git a/lib/gbenchmark/src/benchmark_runner.h b/lib/gbenchmark/src/benchmark_runner.h new file mode 100644 index 0000000000..96e8282a11 --- /dev/null +++ b/lib/gbenchmark/src/benchmark_runner.h @@ -0,0 +1,51 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+
+#ifndef BENCHMARK_RUNNER_H_
+#define BENCHMARK_RUNNER_H_
+
+#include "benchmark_api_internal.h"
+#include "internal_macros.h"
+
+DECLARE_double(benchmark_min_time);
+
+DECLARE_int32(benchmark_repetitions);
+
+DECLARE_bool(benchmark_report_aggregates_only);
+
+DECLARE_bool(benchmark_display_aggregates_only);
+
+namespace benchmark {
+
+namespace internal {
+
+extern MemoryManager* memory_manager;
+
+struct RunResults {
+ std::vector<BenchmarkReporter::Run> non_aggregates;
+ std::vector<BenchmarkReporter::Run> aggregates_only;
+
+ bool display_report_aggregates_only = false;
+ bool file_report_aggregates_only = false;
+};
+
+RunResults RunBenchmark(
+ const benchmark::internal::BenchmarkInstance& b,
+ std::vector<BenchmarkReporter::Run>* complexity_reports);
+
+} // namespace internal
+
+} // end namespace benchmark
+
+#endif // BENCHMARK_RUNNER_H_
diff --git a/lib/gbenchmark/src/check.h b/lib/gbenchmark/src/check.h
index 73bead2fb5..f5f8253f80 100644
--- a/lib/gbenchmark/src/check.h
+++ b/lib/gbenchmark/src/check.h
@@ -1,9 +1,9 @@
 #ifndef CHECK_H_
 #define CHECK_H_
 
+#include <cmath>
 #include <cstddef>
 #include <ostream>
-#include <cmath>
 
 #include "internal_macros.h"
 #include "log.h"
@@ -62,6 +62,8 @@ class CheckHandler {
 #define CHECK(b) ::benchmark::internal::GetNullLogInstance()
 #endif
 
+// clang-format off
+// preserve whitespacing between operators for alignment
 #define CHECK_EQ(a, b) CHECK((a) == (b))
 #define CHECK_NE(a, b) CHECK((a) != (b))
 #define CHECK_GE(a, b) CHECK((a) >= (b))
@@ -75,5 +77,6 @@ class CheckHandler {
 #define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
 #define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) > (eps))
 #define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) > (eps))
+// clang-format on
 
 #endif // CHECK_H_
diff --git a/lib/gbenchmark/src/colorprint.cc b/lib/gbenchmark/src/colorprint.cc
index 2dec4a8b28..fff6a98818 100644
--- a/lib/gbenchmark/src/colorprint.cc
+++ b/lib/gbenchmark/src/colorprint.cc
@@ -25,7 +25,7 @@
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
-#include <Windows.h>
+#include <windows.h>
 #include <io.h>
 #else
 #include <unistd.h>
diff --git a/lib/gbenchmark/src/commandlineflags.cc b/lib/gbenchmark/src/commandlineflags.cc
index 2fc92517a3..6bd65c5ae7 100644
--- a/lib/gbenchmark/src/commandlineflags.cc
+++ b/lib/gbenchmark/src/commandlineflags.cc
@@ -21,6 +21,8 @@
 #include <limits>
 
 namespace benchmark {
+namespace {
+
 // Parses 'str' for a 32-bit signed integer. If successful, writes
 // the result to *value and returns true; otherwise leaves *value
 // unchanged and returns false.
@@ -45,7 +47,7 @@ bool ParseInt32(const std::string& src_text, const char* str, int32_t* value) {
 // LONG_MAX or LONG_MIN when the input overflows.)
 result != long_value
 // The parsed value overflows as an Int32.
- ) {
+ ) {
 std::cerr << src_text << " is expected to be a 32-bit integer, "
 << "but actually has value \"" << str << "\", "
 << "which overflows.\n";
@@ -88,6 +90,8 @@ static std::string FlagToEnvVar(const char* flag) {
 return "BENCHMARK_" + env_var;
}
 
+} // namespace
+
 // Reads and returns the Boolean environment variable corresponding to
 // the given flag; if it's not set, returns default_value.
 //
diff --git a/lib/gbenchmark/src/commandlineflags.h b/lib/gbenchmark/src/commandlineflags.h
index 945c9a9fc4..5eaea82a59 100644
--- a/lib/gbenchmark/src/commandlineflags.h
+++ b/lib/gbenchmark/src/commandlineflags.h
@@ -23,16 +23,10 @@ std::string FLAG(name) = (default_val)
 
 namespace benchmark {
 
-// Parses 'str' for a 32-bit signed integer. If successful, writes the result
-// to *value and returns true; otherwise leaves *value unchanged and returns
-// false.
-bool ParseInt32(const std::string& src_text, const char* str, int32_t* value);
-
 // Parses a bool/Int32/string from the environment variable
 // corresponding to the given Google Test flag.
 bool BoolFromEnv(const char* flag, bool default_val);
 int32_t Int32FromEnv(const char* flag, int32_t default_val);
-double DoubleFromEnv(const char* flag, double default_val);
 const char* StringFromEnv(const char* flag, const char* default_val);
 
 // Parses a string for a bool flag, in the form of either
diff --git a/lib/gbenchmark/src/complexity.cc b/lib/gbenchmark/src/complexity.cc
index 97bf6e09b3..aeed67f0c7 100644
--- a/lib/gbenchmark/src/complexity.cc
+++ b/lib/gbenchmark/src/complexity.cc
@@ -26,20 +26,26 @@ namespace benchmark {
 
 // Internal function to calculate the different scalability forms
 BigOFunc* FittingCurve(BigO complexity) {
+ static const double kLog2E = 1.44269504088896340736;
 switch (complexity) {
 case oN:
- return [](int64_t n) -> double { return static_cast<double>(n); };
+ return [](IterationCount n) -> double { return static_cast<double>(n); };
 case oNSquared:
- return [](int64_t n) -> double { return std::pow(n, 2); };
+ return [](IterationCount n) -> double { return std::pow(n, 2); };
 case oNCubed:
- return [](int64_t n) -> double { return std::pow(n, 3); };
+ return [](IterationCount n) -> double { return std::pow(n, 3); };
 case oLogN:
- return [](int64_t n) { return log2(n); };
+ /* Note: can't use log2 because Android's GNU STL lacks it */
+ return
+ [](IterationCount n) { return kLog2E * log(static_cast<double>(n)); };
 case oNLogN:
- return [](int64_t n) { return n * log2(n); };
+ /* Note: can't use log2 because Android's GNU STL lacks it */
+ return [](IterationCount n) {
+ return kLog2E * n * log(static_cast<double>(n));
+ };
 case o1:
 default:
- return [](int64_t) { return 1.0; };
+ return [](IterationCount) { return 1.0; };
 }
}
@@ -70,8 +76,8 @@ std::string GetBigOString(BigO complexity) {
 // - time : Vector containing the times for the benchmark tests.
 // - fitting_curve : lambda expression (e.g. [](int64_t n) {return n; };).
 
-// For a deeper explanation on the algorithm logic, look the README file at
-// http://github.com/ismaelJimenez/Minimal-Cpp-Least-Squared-Fit
+// For a deeper explanation on the algorithm logic, please refer to
+// https://en.wikipedia.org/wiki/Least_squares#Least_squares,_regression_analysis_and_statistics
 
 LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
 const std::vector<double>& time,
@@ -179,12 +185,20 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
 result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity);
 result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
 }
- std::string benchmark_name =
- reports[0].benchmark_name.substr(0, reports[0].benchmark_name.find('/'));
+
+ // Drop the 'args' when reporting complexity.
+ auto run_name = reports[0].run_name;
+ run_name.args.clear();
 
 // Get the data from the accumulator to BenchmarkReporter::Run's.
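// (Two notes on the fit above: the kLog2E constant rewrites the removed
// log2 lambdas exactly, since log2(n) == kLog2E * log(n) with
// kLog2E = log2(e) = 1/ln(2); and the fitted results land in the two
// synthetic aggregate rows built below, aggregate_name "BigO" carrying the
// coefficients and "RMS" the normalized root-mean-square error, which
// reporters render as "<name>_BigO" and "<name>_RMS".)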
Run big_o; - big_o.benchmark_name = benchmark_name + "_BigO"; + big_o.run_name = run_name; + big_o.run_type = BenchmarkReporter::Run::RT_Aggregate; + big_o.repetitions = reports[0].repetitions; + big_o.repetition_index = Run::no_repetition_index; + big_o.threads = reports[0].threads; + big_o.aggregate_name = "BigO"; + big_o.report_label = reports[0].report_label; big_o.iterations = 0; big_o.real_accumulated_time = result_real.coef; big_o.cpu_accumulated_time = result_cpu.coef; @@ -200,10 +214,14 @@ std::vector ComputeBigO( // Only add label to mean/stddev if it is same for all runs Run rms; - big_o.report_label = reports[0].report_label; - rms.benchmark_name = benchmark_name + "_RMS"; + rms.run_name = run_name; + rms.run_type = BenchmarkReporter::Run::RT_Aggregate; + rms.aggregate_name = "RMS"; rms.report_label = big_o.report_label; rms.iterations = 0; + rms.repetition_index = Run::no_repetition_index; + rms.repetitions = reports[0].repetitions; + rms.threads = reports[0].threads; rms.real_accumulated_time = result_real.rms / multiplier; rms.cpu_accumulated_time = result_cpu.rms / multiplier; rms.report_rms = true; diff --git a/lib/gbenchmark/src/console_reporter.cc b/lib/gbenchmark/src/console_reporter.cc index 48920ca782..cc8ae276f6 100644 --- a/lib/gbenchmark/src/console_reporter.cc +++ b/lib/gbenchmark/src/console_reporter.cc @@ -53,7 +53,7 @@ bool ConsoleReporter::ReportContext(const Context& context) { } void ConsoleReporter::PrintHeader(const Run& run) { - std::string str = FormatString("%-*s %13s %13s %10s", static_cast(name_field_width_), + std::string str = FormatString("%-*s %13s %15s %12s", static_cast(name_field_width_), "Benchmark", "Time", "CPU", "Iterations"); if(!run.counters.empty()) { if(output_options_ & OO_Tabular) { @@ -64,9 +64,8 @@ void ConsoleReporter::PrintHeader(const Run& run) { str += " UserCounters..."; } } - str += "\n"; std::string line = std::string(str.length(), '-'); - GetOutputStream() << line << "\n" << str << line << "\n"; + GetOutputStream() << line << "\n" << str << "\n" << line << "\n"; } void ConsoleReporter::ReportRuns(const std::vector& reports) { @@ -98,6 +97,21 @@ static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt, va_end(args); } + +static std::string FormatTime(double time) { + // Align decimal places... + if (time < 1.0) { + return FormatString("%10.3f", time); + } + if (time < 10.0) { + return FormatString("%10.2f", time); + } + if (time < 100.0) { + return FormatString("%10.1f", time); + } + return FormatString("%10.0f", time); +} + void ConsoleReporter::PrintRunData(const Run& result) { typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...); auto& Out = GetOutputStream(); @@ -106,7 +120,7 @@ void ConsoleReporter::PrintRunData(const Run& result) { auto name_color = (result.report_big_o || result.report_rms) ? 
COLOR_BLUE : COLOR_GREEN; printer(Out, name_color, "%-*s ", name_field_width_, - result.benchmark_name.c_str()); + result.benchmark_name().c_str()); if (result.error_occurred) { printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'", @@ -114,33 +128,24 @@ void ConsoleReporter::PrintRunData(const Run& result) { printer(Out, COLOR_DEFAULT, "\n"); return; } - // Format bytes per second - std::string rate; - if (result.bytes_per_second > 0) { - rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s"); - } - - // Format items per second - std::string items; - if (result.items_per_second > 0) { - items = - StrCat(" ", HumanReadableNumber(result.items_per_second), " items/s"); - } const double real_time = result.GetAdjustedRealTime(); const double cpu_time = result.GetAdjustedCPUTime(); + const std::string real_time_str = FormatTime(real_time); + const std::string cpu_time_str = FormatTime(cpu_time); + if (result.report_big_o) { std::string big_o = GetBigOString(result.complexity); - printer(Out, COLOR_YELLOW, "%10.2f %s %10.2f %s ", real_time, big_o.c_str(), + printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time, big_o.c_str(), cpu_time, big_o.c_str()); } else if (result.report_rms) { - printer(Out, COLOR_YELLOW, "%10.0f %% %10.0f %% ", real_time * 100, - cpu_time * 100); + printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%", + cpu_time * 100, "%"); } else { const char* timeLabel = GetTimeUnitString(result.time_unit); - printer(Out, COLOR_YELLOW, "%10.0f %s %10.0f %s ", real_time, timeLabel, - cpu_time, timeLabel); + printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), timeLabel, + cpu_time_str.c_str(), timeLabel); } if (!result.report_big_o && !result.report_rms) { @@ -150,7 +155,7 @@ void ConsoleReporter::PrintRunData(const Run& result) { for (auto& c : result.counters) { const std::size_t cNameLen = std::max(std::string::size_type(10), c.first.length()); - auto const& s = HumanReadableNumber(c.second.value, 1000); + auto const& s = HumanReadableNumber(c.second.value, c.second.oneK); if (output_options_ & OO_Tabular) { if (c.second.flags & Counter::kIsRate) { printer(Out, COLOR_DEFAULT, " %*s/s", cNameLen - 2, s.c_str()); @@ -164,14 +169,6 @@ void ConsoleReporter::PrintRunData(const Run& result) { } } - if (!rate.empty()) { - printer(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str()); - } - - if (!items.empty()) { - printer(Out, COLOR_DEFAULT, " %*s", 18, items.c_str()); - } - if (!result.report_label.empty()) { printer(Out, COLOR_DEFAULT, " %s", result.report_label.c_str()); } diff --git a/lib/gbenchmark/src/counter.cc b/lib/gbenchmark/src/counter.cc index ed1aa044ee..c248ea110b 100644 --- a/lib/gbenchmark/src/counter.cc +++ b/lib/gbenchmark/src/counter.cc @@ -17,7 +17,8 @@ namespace benchmark { namespace internal { -double Finish(Counter const& c, double cpu_time, double num_threads) { +double Finish(Counter const& c, IterationCount iterations, double cpu_time, + double num_threads) { double v = c.value; if (c.flags & Counter::kIsRate) { v /= cpu_time; @@ -25,25 +26,32 @@ double Finish(Counter const& c, double cpu_time, double num_threads) { if (c.flags & Counter::kAvgThreads) { v /= num_threads; } + if (c.flags & Counter::kIsIterationInvariant) { + v *= iterations; + } + if (c.flags & Counter::kAvgIterations) { + v /= iterations; + } return v; } -void Finish(UserCounters *l, double cpu_time, double num_threads) { - for (auto &c : *l) { - c.second.value = Finish(c.second, cpu_time, num_threads); +void Finish(UserCounters* l, 
IterationCount iterations, double cpu_time, + double num_threads) { + for (auto& c : *l) { + c.second.value = Finish(c.second, iterations, cpu_time, num_threads); } } -void Increment(UserCounters *l, UserCounters const& r) { +void Increment(UserCounters* l, UserCounters const& r) { // add counters present in both or just in *l - for (auto &c : *l) { + for (auto& c : *l) { auto it = r.find(c.first); if (it != r.end()) { c.second.value = c.second + it->second; } } // add counters present in r, but not in *l - for (auto const &tc : r) { + for (auto const& tc : r) { auto it = l->find(tc.first); if (it == l->end()) { (*l)[tc.first] = tc.second; @@ -64,5 +72,5 @@ bool SameNames(UserCounters const& l, UserCounters const& r) { return true; } -} // end namespace internal -} // end namespace benchmark +} // end namespace internal +} // end namespace benchmark diff --git a/lib/gbenchmark/src/counter.h b/lib/gbenchmark/src/counter.h index dd6865a31d..1ad46d4940 100644 --- a/lib/gbenchmark/src/counter.h +++ b/lib/gbenchmark/src/counter.h @@ -18,9 +18,10 @@ namespace benchmark { // these counter-related functions are hidden to reduce API surface. namespace internal { -void Finish(UserCounters *l, double time, double num_threads); -void Increment(UserCounters *l, UserCounters const& r); +void Finish(UserCounters* l, IterationCount iterations, double time, + double num_threads); +void Increment(UserCounters* l, UserCounters const& r); bool SameNames(UserCounters const& l, UserCounters const& r); -} // end namespace internal +} // end namespace internal -} //end namespace benchmark +} // end namespace benchmark diff --git a/lib/gbenchmark/src/csv_reporter.cc b/lib/gbenchmark/src/csv_reporter.cc index 35510645b0..af2c18fc8a 100644 --- a/lib/gbenchmark/src/csv_reporter.cc +++ b/lib/gbenchmark/src/csv_reporter.cc @@ -22,9 +22,9 @@ #include #include +#include "check.h" #include "string_util.h" #include "timers.h" -#include "check.h" // File format reference: http://edoceo.com/utilitas/csv-file-format. 
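The iteration-scaling counter flags threaded through Finish() above are set
from benchmark code roughly like this (a minimal usage sketch against the
v1.5.0 counter API; BM_Copy and the buffer size are illustrative, not from
this patch):

#include <algorithm>
#include <vector>
#include <benchmark/benchmark.h>

static void BM_Copy(benchmark::State& state) {
  std::vector<char> src(static_cast<size_t>(state.range(0)), 'x');
  std::vector<char> dst(src.size());
  for (auto _ : state) {
    std::copy(src.begin(), src.end(), dst.begin());
    benchmark::DoNotOptimize(dst.data());
  }
  // One buffer per iteration: kIsIterationInvariantRate multiplies the value
  // by the iteration count and divides by CPU time (bytes/s), while
  // Counter::kIs1024 makes HumanReadableNumber use binary (Ki/Mi) prefixes.
  state.counters["BytesCopied"] = benchmark::Counter(
      static_cast<double>(src.size()),
      benchmark::Counter::kIsIterationInvariantRate,
      benchmark::Counter::kIs1024);
}
BENCHMARK(BM_Copy)->Arg(1 << 20);
BENCHMARK_MAIN();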
@@ -37,18 +37,32 @@ std::vector elements = { "error_occurred", "error_message"}; } // namespace +std::string CsvEscape(const std::string & s) { + std::string tmp; + tmp.reserve(s.size() + 2); + for (char c : s) { + switch (c) { + case '"' : tmp += "\"\""; break; + default : tmp += c; break; + } + } + return '"' + tmp + '"'; +} + bool CSVReporter::ReportContext(const Context& context) { PrintBasicContext(&GetErrorStream(), context); return true; } -void CSVReporter::ReportRuns(const std::vector & reports) { +void CSVReporter::ReportRuns(const std::vector& reports) { std::ostream& Out = GetOutputStream(); if (!printed_header_) { // save the names of all the user counters for (const auto& run : reports) { for (const auto& cnt : run.counters) { + if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second") + continue; user_counter_names_.insert(cnt.first); } } @@ -58,7 +72,8 @@ void CSVReporter::ReportRuns(const std::vector & reports) { Out << *B++; if (B != elements.end()) Out << ","; } - for (auto B = user_counter_names_.begin(); B != user_counter_names_.end();) { + for (auto B = user_counter_names_.begin(); + B != user_counter_names_.end();) { Out << ",\"" << *B++ << "\""; } Out << "\n"; @@ -68,10 +83,12 @@ void CSVReporter::ReportRuns(const std::vector & reports) { // check that all the current counters are saved in the name set for (const auto& run : reports) { for (const auto& cnt : run.counters) { + if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second") + continue; CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end()) - << "All counters must be present in each run. " - << "Counter named \"" << cnt.first - << "\" was not in a run after being added to the header"; + << "All counters must be present in each run. " + << "Counter named \"" << cnt.first + << "\" was not in a run after being added to the header"; } } } @@ -80,23 +97,15 @@ void CSVReporter::ReportRuns(const std::vector & reports) { for (const auto& run : reports) { PrintRunData(run); } - } -void CSVReporter::PrintRunData(const Run & run) { +void CSVReporter::PrintRunData(const Run& run) { std::ostream& Out = GetOutputStream(); - - // Field with embedded double-quote characters must be doubled and the field - // delimited with double-quotes. - std::string name = run.benchmark_name; - ReplaceAll(&name, "\"", "\"\""); - Out << '"' << name << "\","; + Out << CsvEscape(run.benchmark_name()) << ","; if (run.error_occurred) { Out << std::string(elements.size() - 3, ','); Out << "true,"; - std::string msg = run.error_message; - ReplaceAll(&msg, "\"", "\"\""); - Out << '"' << msg << "\"\n"; + Out << CsvEscape(run.error_message) << "\n"; return; } @@ -117,27 +126,23 @@ void CSVReporter::PrintRunData(const Run & run) { } Out << ","; - if (run.bytes_per_second > 0.0) { - Out << run.bytes_per_second; + if (run.counters.find("bytes_per_second") != run.counters.end()) { + Out << run.counters.at("bytes_per_second"); } Out << ","; - if (run.items_per_second > 0.0) { - Out << run.items_per_second; + if (run.counters.find("items_per_second") != run.counters.end()) { + Out << run.counters.at("items_per_second"); } Out << ","; if (!run.report_label.empty()) { - // Field with embedded double-quote characters must be doubled and the field - // delimited with double-quotes. 
- std::string label = run.report_label;
- ReplaceAll(&label, "\"", "\"\"");
- Out << "\"" << label << "\"";
+ Out << CsvEscape(run.report_label);
 }
 Out << ",,"; // for error_occurred and error_message
 
 // Print user counters
- for (const auto &ucn : user_counter_names_) {
+ for (const auto& ucn : user_counter_names_) {
 auto it = run.counters.find(ucn);
- if(it == run.counters.end()) {
+ if (it == run.counters.end()) {
 Out << ",";
 } else {
 Out << "," << it->second;
diff --git a/lib/gbenchmark/src/cycleclock.h b/lib/gbenchmark/src/cycleclock.h
index 3b376ac57d..f5e37b011b 100644
--- a/lib/gbenchmark/src/cycleclock.h
+++ b/lib/gbenchmark/src/cycleclock.h
@@ -41,7 +41,7 @@ extern "C" uint64_t __rdtsc();
 #pragma intrinsic(__rdtsc)
 #endif
 
-#ifndef BENCHMARK_OS_WINDOWS
+#if !defined(BENCHMARK_OS_WINDOWS) || defined(BENCHMARK_OS_MINGW)
 #include <sys/time.h>
 #include <time.h>
 #endif
@@ -121,7 +121,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
 // because it provides nanosecond resolution (which is noticeable at
 // least for PNaCl modules running on x86 Mac & Linux).
 // Initialize to always return 0 if clock_gettime fails.
- struct timespec ts = { 0, 0 };
+ struct timespec ts = {0, 0};
 clock_gettime(CLOCK_MONOTONIC, &ts);
 return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
#elif defined(__aarch64__)
@@ -159,10 +159,10 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
 struct timeval tv;
 gettimeofday(&tv, nullptr);
 return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-#elif defined(__s390__) // Covers both s390 and s390x.
+#elif defined(__s390__) // Covers both s390 and s390x.
 // Return the CPU clock.
 uint64_t tsc;
- asm("stck %0" : "=Q" (tsc) : : "cc");
+ asm("stck %0" : "=Q"(tsc) : : "cc");
 return tsc;
 #else
// The soft failover to a generic implementation is automatic only for ARM.
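Several branches above (older ARM, for instance) fall back to plain
gettimeofday arithmetic; as a standalone sketch (POSIX-only, mirroring the
pattern in the diff rather than adding anything new):

#include <cstdint>
#include <sys/time.h>

// Wall-clock microseconds: the same computation the portable branches return
// when no architecture-specific cycle counter is available.
int64_t NowMicros() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}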
diff --git a/lib/gbenchmark/src/internal_macros.h b/lib/gbenchmark/src/internal_macros.h
index f7b9203e53..6adf00d056 100644
--- a/lib/gbenchmark/src/internal_macros.h
+++ b/lib/gbenchmark/src/internal_macros.h
@@ -3,12 +3,14 @@
 
 #include "benchmark/benchmark.h"
 
+/* Needed to detect STL */
+#include <cstdlib>
+
+// clang-format off
+
 #ifndef __has_feature
 #define __has_feature(x) 0
 #endif
-#ifndef __has_builtin
-#define __has_builtin(x) 0
-#endif
 
 #if defined(__clang__)
 #if !defined(COMPILER_CLANG)
@@ -38,6 +40,9 @@
 #define BENCHMARK_OS_CYGWIN 1
 #elif defined(_WIN32)
 #define BENCHMARK_OS_WINDOWS 1
+ #if defined(__MINGW32__)
+ #define BENCHMARK_OS_MINGW 1
+ #endif
 #elif defined(__APPLE__)
 #define BENCHMARK_OS_APPLE 1
 #include "TargetConditionals.h"
@@ -57,7 +62,7 @@
 #define BENCHMARK_OS_LINUX 1
 #elif defined(__native_client__)
 #define BENCHMARK_OS_NACL 1
-#elif defined(EMSCRIPTEN)
+#elif defined(__EMSCRIPTEN__)
 #define BENCHMARK_OS_EMSCRIPTEN 1
 #elif defined(__rtems__)
 #define BENCHMARK_OS_RTEMS 1
@@ -65,6 +70,12 @@
 #define BENCHMARK_OS_FUCHSIA 1
 #elif defined (__SVR4) && defined (__sun)
 #define BENCHMARK_OS_SOLARIS 1
+#elif defined(__QNX__)
+#define BENCHMARK_OS_QNX 1
+#endif
+
+#if defined(__ANDROID__) && defined(__GLIBCXX__)
+#define BENCHMARK_STL_ANDROID_GNUSTL 1
 #endif
 
 #if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \
@@ -78,12 +89,6 @@
 #define BENCHMARK_MAYBE_UNUSED
 #endif
 
-#if defined(COMPILER_GCC) || __has_builtin(__builtin_unreachable)
- #define BENCHMARK_UNREACHABLE() __builtin_unreachable()
-#elif defined(COMPILER_MSVC)
- #define BENCHMARK_UNREACHABLE() __assume(false)
-#else
- #define BENCHMARK_UNREACHABLE() ((void)0)
-#endif
+// clang-format on
 
 #endif // BENCHMARK_INTERNAL_MACROS_H_
diff --git a/lib/gbenchmark/src/json_reporter.cc b/lib/gbenchmark/src/json_reporter.cc
index 685d6b097d..11db2b99d5 100644
--- a/lib/gbenchmark/src/json_reporter.cc
+++ b/lib/gbenchmark/src/json_reporter.cc
@@ -16,13 +16,14 @@
 #include "complexity.h"
 
 #include <algorithm>
+#include <cmath>
 #include <cstdint>
+#include <iomanip> // for setprecision
 #include <iostream>
+#include <limits>
 #include <string>
 #include <tuple>
 #include <vector>
-#include <iomanip> // for setprecision
-#include <limits>
 
 #include "string_util.h"
 #include "timers.h"
@@ -31,32 +32,63 @@ namespace benchmark {
 
 namespace {
 
+std::string StrEscape(const std::string & s) {
+ std::string tmp;
+ tmp.reserve(s.size());
+ for (char c : s) {
+ switch (c) {
+ case '\b': tmp += "\\b"; break;
+ case '\f': tmp += "\\f"; break;
+ case '\n': tmp += "\\n"; break;
+ case '\r': tmp += "\\r"; break;
+ case '\t': tmp += "\\t"; break;
+ case '\\': tmp += "\\\\"; break;
+ case '"' : tmp += "\\\""; break;
+ default : tmp += c; break;
+ }
+ }
+ return tmp;
+}
+
 std::string FormatKV(std::string const& key, std::string const& value) {
- return StrFormat("\"%s\": \"%s\"", key.c_str(), value.c_str());
+ return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
}
 
 std::string FormatKV(std::string const& key, const char* value) {
- return StrFormat("\"%s\": \"%s\"", key.c_str(), value);
+ return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
}
 
 std::string FormatKV(std::string const& key, bool value) {
- return StrFormat("\"%s\": %s", key.c_str(), value ? "true" : "false");
+ return StrFormat("\"%s\": %s", StrEscape(key).c_str(), value ?
"true" : "false"); } std::string FormatKV(std::string const& key, int64_t value) { std::stringstream ss; - ss << '"' << key << "\": " << value; + ss << '"' << StrEscape(key) << "\": " << value; return ss.str(); } -std::string FormatKV(std::string const& key, double value) { +std::string FormatKV(std::string const& key, IterationCount value) { std::stringstream ss; - ss << '"' << key << "\": "; - - const auto max_digits10 = std::numeric_limits::max_digits10; - const auto max_fractional_digits10 = max_digits10 - 1; + ss << '"' << StrEscape(key) << "\": " << value; + return ss.str(); +} - ss << std::scientific << std::setprecision(max_fractional_digits10) << value; +std::string FormatKV(std::string const& key, double value) { + std::stringstream ss; + ss << '"' << StrEscape(key) << "\": "; + + if (std::isnan(value)) + ss << (value < 0 ? "-" : "") << "NaN"; + else if (std::isinf(value)) + ss << (value < 0 ? "-" : "") << "Infinity"; + else { + const auto max_digits10 = + std::numeric_limits::max_digits10; + const auto max_fractional_digits10 = max_digits10 - 1; + ss << std::scientific << std::setprecision(max_fractional_digits10) + << value; + } return ss.str(); } @@ -77,6 +109,8 @@ bool JSONReporter::ReportContext(const Context& context) { std::string walltime_value = LocalDateTimeString(); out << indent << FormatKV("date", walltime_value) << ",\n"; + out << indent << FormatKV("host_name", context.sys_info.name) << ",\n"; + if (Context::executable_name) { out << indent << FormatKV("executable", Context::executable_name) << ",\n"; } @@ -111,6 +145,12 @@ bool JSONReporter::ReportContext(const Context& context) { } indent = std::string(4, ' '); out << indent << "],\n"; + out << indent << "\"load_avg\": ["; + for (auto it = info.load_avg.begin(); it != info.load_avg.end();) { + out << *it++; + if (it != info.load_avg.end()) out << ","; + } + out << "],\n"; #if defined(NDEBUG) const char build_type[] = "release"; @@ -154,52 +194,60 @@ void JSONReporter::Finalize() { void JSONReporter::PrintRunData(Run const& run) { std::string indent(6, ' '); std::ostream& out = GetOutputStream(); - out << indent << FormatKV("name", run.benchmark_name) << ",\n"; + out << indent << FormatKV("name", run.benchmark_name()) << ",\n"; + out << indent << FormatKV("run_name", run.run_name.str()) << ",\n"; + out << indent << FormatKV("run_type", [&run]() -> const char* { + switch (run.run_type) { + case BenchmarkReporter::Run::RT_Iteration: + return "iteration"; + case BenchmarkReporter::Run::RT_Aggregate: + return "aggregate"; + } + BENCHMARK_UNREACHABLE(); + }()) << ",\n"; + out << indent << FormatKV("repetitions", run.repetitions) << ",\n"; + if (run.run_type != BenchmarkReporter::Run::RT_Aggregate) { + out << indent << FormatKV("repetition_index", run.repetition_index) + << ",\n"; + } + out << indent << FormatKV("threads", run.threads) << ",\n"; + if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) { + out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n"; + } if (run.error_occurred) { out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n"; out << indent << FormatKV("error_message", run.error_message) << ",\n"; } if (!run.report_big_o && !run.report_rms) { out << indent << FormatKV("iterations", run.iterations) << ",\n"; - out << indent - << FormatKV("real_time", run.GetAdjustedRealTime()) - << ",\n"; - out << indent - << FormatKV("cpu_time", run.GetAdjustedCPUTime()); + out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n"; + out << indent << 
FormatKV("cpu_time", run.GetAdjustedCPUTime()); out << ",\n" << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit)); } else if (run.report_big_o) { - out << indent - << FormatKV("cpu_coefficient", run.GetAdjustedCPUTime()) + out << indent << FormatKV("cpu_coefficient", run.GetAdjustedCPUTime()) << ",\n"; - out << indent - << FormatKV("real_coefficient", run.GetAdjustedRealTime()) + out << indent << FormatKV("real_coefficient", run.GetAdjustedRealTime()) << ",\n"; out << indent << FormatKV("big_o", GetBigOString(run.complexity)) << ",\n"; out << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit)); } else if (run.report_rms) { - out << indent - << FormatKV("rms", run.GetAdjustedCPUTime()); + out << indent << FormatKV("rms", run.GetAdjustedCPUTime()); } - if (run.bytes_per_second > 0.0) { - out << ",\n" - << indent - << FormatKV("bytes_per_second", run.bytes_per_second); - } - if (run.items_per_second > 0.0) { - out << ",\n" - << indent - << FormatKV("items_per_second", run.items_per_second); + + for (auto& c : run.counters) { + out << ",\n" << indent << FormatKV(c.first, c.second); } - for(auto &c : run.counters) { - out << ",\n" - << indent - << FormatKV(c.first, c.second); + + if (run.has_memory_result) { + out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter); + out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used); } + if (!run.report_label.empty()) { out << ",\n" << indent << FormatKV("label", run.report_label); } out << '\n'; } -} // end namespace benchmark +} // end namespace benchmark diff --git a/lib/gbenchmark/src/log.h b/lib/gbenchmark/src/log.h index d06e1031db..47d0c35c01 100644 --- a/lib/gbenchmark/src/log.h +++ b/lib/gbenchmark/src/log.h @@ -66,8 +66,9 @@ inline LogType& GetLogInstanceForLevel(int level) { } // end namespace internal } // end namespace benchmark +// clang-format off #define VLOG(x) \ (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \ " ") - +// clang-format on #endif diff --git a/lib/gbenchmark/src/re.h b/lib/gbenchmark/src/re.h index 924d2f0ba7..fbe25037b4 100644 --- a/lib/gbenchmark/src/re.h +++ b/lib/gbenchmark/src/re.h @@ -17,6 +17,8 @@ #include "internal_macros.h" +// clang-format off + #if !defined(HAVE_STD_REGEX) && \ !defined(HAVE_GNU_POSIX_REGEX) && \ !defined(HAVE_POSIX_REGEX) @@ -45,6 +47,9 @@ #else #error No regular expression backend was found! 
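// (Reaching this #else means none of HAVE_STD_REGEX, HAVE_GNU_POSIX_REGEX or
// HAVE_POSIX_REGEX was defined; exactly one backend is expected, normally
// selected by the build system's feature checks.)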
#endif + +// clang-format on + #include #include "check.h" @@ -76,7 +81,7 @@ class Regex { #elif defined(HAVE_POSIX_REGEX) || defined(HAVE_GNU_POSIX_REGEX) regex_t re_; #else - #error No regular expression backend implementation available +#error No regular expression backend implementation available #endif }; @@ -84,20 +89,21 @@ class Regex { inline bool Regex::Init(const std::string& spec, std::string* error) { #ifdef BENCHMARK_HAS_NO_EXCEPTIONS - ((void)error); // suppress unused warning + ((void)error); // suppress unused warning #else try { #endif - re_ = std::regex(spec, std::regex_constants::extended); - init_ = true; + re_ = std::regex(spec, std::regex_constants::extended); + init_ = true; #ifndef BENCHMARK_HAS_NO_EXCEPTIONS - } catch (const std::regex_error& e) { - if (error) { - *error = e.what(); - } +} +catch (const std::regex_error& e) { + if (error) { + *error = e.what(); } +} #endif - return init_; +return init_; } inline Regex::~Regex() {} diff --git a/lib/gbenchmark/src/reporter.cc b/lib/gbenchmark/src/reporter.cc index 4b40aaec8b..4d3e477d44 100644 --- a/lib/gbenchmark/src/reporter.cc +++ b/lib/gbenchmark/src/reporter.cc @@ -22,6 +22,7 @@ #include #include "check.h" +#include "string_util.h" namespace benchmark { @@ -54,6 +55,14 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out, Out << "\n"; } } + if (!info.load_avg.empty()) { + Out << "Load Average: "; + for (auto It = info.load_avg.begin(); It != info.load_avg.end();) { + Out << StrFormat("%.2f", *It++); + if (It != info.load_avg.end()) Out << ", "; + } + Out << "\n"; + } if (info.scaling_enabled) { Out << "***WARNING*** CPU scaling is enabled, the benchmark " @@ -68,9 +77,18 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out, } // No initializer because it's already initialized to NULL. -const char* BenchmarkReporter::Context::executable_name; +const char *BenchmarkReporter::Context::executable_name; + +BenchmarkReporter::Context::Context() + : cpu_info(CPUInfo::Get()), sys_info(SystemInfo::Get()) {} -BenchmarkReporter::Context::Context() : cpu_info(CPUInfo::Get()) {} +std::string BenchmarkReporter::Run::benchmark_name() const { + std::string name = run_name.str(); + if (run_type == RT_Aggregate) { + name += "_" + aggregate_name; + } + return name; +} double BenchmarkReporter::Run::GetAdjustedRealTime() const { double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit); diff --git a/lib/gbenchmark/src/sleep.cc b/lib/gbenchmark/src/sleep.cc index 54aa04a422..1512ac90f7 100644 --- a/lib/gbenchmark/src/sleep.cc +++ b/lib/gbenchmark/src/sleep.cc @@ -21,7 +21,7 @@ #include "internal_macros.h" #ifdef BENCHMARK_OS_WINDOWS -#include +#include #endif namespace benchmark { diff --git a/lib/gbenchmark/src/statistics.cc b/lib/gbenchmark/src/statistics.cc index 1c91e1015a..bd5a3d6597 100644 --- a/lib/gbenchmark/src/statistics.cc +++ b/lib/gbenchmark/src/statistics.cc @@ -17,9 +17,9 @@ #include #include +#include #include #include -#include #include "check.h" #include "statistics.h" @@ -43,9 +43,9 @@ double StatisticsMedian(const std::vector& v) { // did we have an odd number of samples? 
// if yes, then center is the median
- // it no, then we are looking for the average between center and the value before
- if(v.size() % 2 == 1)
- return *center;
+ // if no, then we are looking for the average between center and the value
+ // before
+ if (v.size() % 2 == 1) return *center;
 auto center2 = copy.begin() + v.size() / 2 - 1;
 std::nth_element(copy.begin(), center2, copy.end());
 return (*center + *center2) / 2.0;
@@ -68,8 +68,7 @@ double StatisticsStdDev(const std::vector<double>& v) {
 if (v.empty()) return mean;
 
 // Sample standard deviation is undefined for n = 1
- if (v.size() == 1)
- return 0.0;
+ if (v.size() == 1) return 0.0;
 
 const double avg_squares = SumSquares(v) * (1.0 / v.size());
 return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
@@ -92,27 +91,23 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
 // Accumulators.
 std::vector<double> real_accumulated_time_stat;
 std::vector<double> cpu_accumulated_time_stat;
- std::vector<double> bytes_per_second_stat;
- std::vector<double> items_per_second_stat;
 real_accumulated_time_stat.reserve(reports.size());
 cpu_accumulated_time_stat.reserve(reports.size());
- bytes_per_second_stat.reserve(reports.size());
- items_per_second_stat.reserve(reports.size());
 
 // All repetitions should be run with the same number of iterations so we
 // can take this information from the first benchmark.
- int64_t const run_iterations = reports.front().iterations;
+ const IterationCount run_iterations = reports.front().iterations;
 
 // create stats for user counters
 struct CounterStat {
 Counter c;
 std::vector<double> s;
 };
- std::map< std::string, CounterStat > counter_stats;
- for(Run const& r : reports) {
- for(auto const& cnt : r.counters) {
+ std::map<std::string, CounterStat> counter_stats;
+ for (Run const& r : reports) {
+ for (auto const& cnt : r.counters) {
 auto it = counter_stats.find(cnt.first);
- if(it == counter_stats.end()) {
+ if (it == counter_stats.end()) {
 counter_stats.insert({cnt.first, {cnt.second, std::vector<double>{}}});
 it = counter_stats.find(cnt.first);
 it->second.s.reserve(reports.size());
@@ -124,15 +119,13 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
 
 // Populate the accumulators.
 for (Run const& run : reports) {
- CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
+ CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
 CHECK_EQ(run_iterations, run.iterations);
 if (run.error_occurred) continue;
 real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
 cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
- items_per_second_stat.emplace_back(run.items_per_second);
- bytes_per_second_stat.emplace_back(run.bytes_per_second);
 // user counters
- for(auto const& cnt : run.counters) {
+ for (auto const& cnt : run.counters) {
 auto it = counter_stats.find(cnt.first);
 CHECK_NE(it, counter_stats.end());
 it->second.s.emplace_back(cnt.second);
@@ -148,24 +141,46 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
 }
 }
 
- for(const auto& Stat : *reports[0].statistics) {
+ const double iteration_rescale_factor =
+ double(reports.size()) / double(run_iterations);
+
+ for (const auto& Stat : *reports[0].statistics) {
 // Get the data from the accumulator to BenchmarkReporter::Run's.
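// (Concretely: with 5 repetitions of 1000 iterations each, every aggregate
// below is computed over 5 measurements, so data.iterations = 5, and
// iteration_rescale_factor = 5 / 1000 rescales the per-repetition time sums
// so that dividing by data.iterations at report time still yields
// per-iteration figures.)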
 Run data;
- data.benchmark_name = reports[0].benchmark_name + "_" + Stat.name_;
+ data.run_name = reports[0].run_name;
+ data.run_type = BenchmarkReporter::Run::RT_Aggregate;
+ data.threads = reports[0].threads;
+ data.repetitions = reports[0].repetitions;
+ data.repetition_index = Run::no_repetition_index;
+ data.aggregate_name = Stat.name_;
 data.report_label = report_label;
- data.iterations = run_iterations;
+
+ // It is incorrect to say that an aggregate is computed over
+ // run's iterations, because those iterations already got averaged.
+ // Similarly, if there are N repetitions with 1 iteration each,
+ // an aggregate will be computed over N measurements, not 1.
+ // Thus it is best to simply use the count of separate reports.
+ data.iterations = reports.size();
 
 data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
 data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
- data.bytes_per_second = Stat.compute_(bytes_per_second_stat);
- data.items_per_second = Stat.compute_(items_per_second_stat);
+
+ // We will divide these times by data.iterations when reporting, but the
+ // data.iterations is not necessarily the scale of these measurements,
+ // because in each repetition, these timers are summed over all the
+ // iterations. And if we want to say that the stats are over N repetitions
+ // and not M iterations, we need to multiply these by (N/M).
+ data.real_accumulated_time *= iteration_rescale_factor;
+ data.cpu_accumulated_time *= iteration_rescale_factor;
 
 data.time_unit = reports[0].time_unit;
 
 // user counters
- for(auto const& kv : counter_stats) {
+ for (auto const& kv : counter_stats) {
+ // Do *NOT* rescale the custom counters. They are already properly scaled.
 const auto uc_stat = Stat.compute_(kv.second.s);
- auto c = Counter(uc_stat, counter_stats[kv.first].c.flags);
+ auto c = Counter(uc_stat, counter_stats[kv.first].c.flags,
+ counter_stats[kv.first].c.oneK);
 data.counters[kv.first] = c;
 }
diff --git a/lib/gbenchmark/src/string_util.cc b/lib/gbenchmark/src/string_util.cc
index ebc3acebd2..39b01a1719 100644
--- a/lib/gbenchmark/src/string_util.cc
+++ b/lib/gbenchmark/src/string_util.cc
@@ -160,13 +160,93 @@ std::string StrFormat(const char* format, ...) {
 return tmp;
}
 
-void ReplaceAll(std::string* str, const std::string& from,
- const std::string& to) {
- std::size_t start = 0;
- while ((start = str->find(from, start)) != std::string::npos) {
- str->replace(start, from.length(), to);
- start += to.length();
+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
+/*
+ * GNU STL in Android NDK lacks support for some C++11 functions, including
+ * stoul, stoi, stod. We reimplement them here using C functions strtoul,
+ * strtol, strtod. Note that reimplemented functions are in benchmark::
+ * namespace, not std:: namespace.
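 * (With these fallbacks, call sites in this library spell the calls
 * benchmark::stoul / benchmark::stoi / benchmark::stod -- see the
 * sysinfo.cc hunks below -- so the same code builds against both gnustl
 * and a complete C++11 standard library.)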
+ */
+unsigned long stoul(const std::string& str, size_t* pos, int base) {
+ /* Record previous errno */
+ const int oldErrno = errno;
+ errno = 0;
+
+ const char* strStart = str.c_str();
+ char* strEnd = const_cast<char*>(strStart);
+ const unsigned long result = strtoul(strStart, &strEnd, base);
+
+ const int strtoulErrno = errno;
+ /* Restore previous errno */
+ errno = oldErrno;
+
+ /* Check for errors and return */
+ if (strtoulErrno == ERANGE) {
+ throw std::out_of_range(
+ "stoul failed: " + str + " is outside of range of unsigned long");
+ } else if (strEnd == strStart || strtoulErrno != 0) {
+ throw std::invalid_argument(
+ "stoul failed: " + str + " is not an integer");
 }
+ if (pos != nullptr) {
+ *pos = static_cast<size_t>(strEnd - strStart);
+ }
+ return result;
+}
+
+int stoi(const std::string& str, size_t* pos, int base) {
+ /* Record previous errno */
+ const int oldErrno = errno;
+ errno = 0;
+
+ const char* strStart = str.c_str();
+ char* strEnd = const_cast<char*>(strStart);
+ const long result = strtol(strStart, &strEnd, base);
+
+ const int strtolErrno = errno;
+ /* Restore previous errno */
+ errno = oldErrno;
+
+ /* Check for errors and return */
+ if (strtolErrno == ERANGE || long(int(result)) != result) {
+ throw std::out_of_range(
+ "stoi failed: " + str + " is outside of range of int");
+ } else if (strEnd == strStart || strtolErrno != 0) {
+ throw std::invalid_argument(
+ "stoi failed: " + str + " is not an integer");
+ }
+ if (pos != nullptr) {
+ *pos = static_cast<size_t>(strEnd - strStart);
+ }
+ return int(result);
+}
+
+double stod(const std::string& str, size_t* pos) {
+ /* Record previous errno */
+ const int oldErrno = errno;
+ errno = 0;
+
+ const char* strStart = str.c_str();
+ char* strEnd = const_cast<char*>(strStart);
+ const double result = strtod(strStart, &strEnd);
+
+ /* Restore previous errno */
+ const int strtodErrno = errno;
+ errno = oldErrno;
+
+ /* Check for errors and return */
+ if (strtodErrno == ERANGE) {
+ throw std::out_of_range(
+ "stod failed: " + str + " is outside of range of double");
+ } else if (strEnd == strStart || strtodErrno != 0) {
+ throw std::invalid_argument(
+ "stod failed: " + str + " is not a number");
+ }
+ if (pos != nullptr) {
+ *pos = static_cast<size_t>(strEnd - strStart);
+ }
+ return result;
}
+#endif
 
} // end namespace benchmark
diff --git a/lib/gbenchmark/src/string_util.h b/lib/gbenchmark/src/string_util.h
index e70e769872..09d7b4bd2a 100644
--- a/lib/gbenchmark/src/string_util.h
+++ b/lib/gbenchmark/src/string_util.h
@@ -12,15 +12,20 @@ void AppendHumanReadable(int n, std::string* str);
 
 std::string HumanReadableNumber(double n, double one_k = 1024.0);
 
-std::string StrFormat(const char* format, ...);
+#if defined(__MINGW32__)
+__attribute__((format(__MINGW_PRINTF_FORMAT, 1, 2)))
+#elif defined(__GNUC__)
+__attribute__((format(printf, 1, 2)))
+#endif
+std::string
+StrFormat(const char* format, ...);
 
 inline std::ostream& StrCatImp(std::ostream& out) BENCHMARK_NOEXCEPT {
 return out;
}
 
 template <class First, class... Rest>
-inline std::ostream& StrCatImp(std::ostream& out, First&& f,
- Rest&&... rest) {
+inline std::ostream& StrCatImp(std::ostream& out, First&& f, Rest&&... rest) {
 out << std::forward<First>(f);
 return StrCatImp(out, std::forward<Rest>(rest)...);
}
 
@@ -32,8 +37,22 @@ inline std::string StrCat(Args&&... args) {
 return ss.str();
}
 
-void ReplaceAll(std::string* str, const std::string& from,
- const std::string& to);
+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
+/*
+ * GNU STL in Android NDK lacks support for some C++11 functions, including
+ * stoul, stoi, stod.
We reimplement them here using C functions strtoul, + * strtol, strtod. Note that reimplemented functions are in benchmark:: + * namespace, not std:: namespace. + */ +unsigned long stoul(const std::string& str, size_t* pos = nullptr, + int base = 10); +int stoi(const std::string& str, size_t* pos = nullptr, int base = 10); +double stod(const std::string& str, size_t* pos = nullptr); +#else +using std::stoul; +using std::stoi; +using std::stod; +#endif } // end namespace benchmark diff --git a/lib/gbenchmark/src/sysinfo.cc b/lib/gbenchmark/src/sysinfo.cc index affaf24343..f094ffad0f 100644 --- a/lib/gbenchmark/src/sysinfo.cc +++ b/lib/gbenchmark/src/sysinfo.cc @@ -15,10 +15,11 @@ #include "internal_macros.h" #ifdef BENCHMARK_OS_WINDOWS -#include +#include #undef StrCat // Don't let StrCat in string_util.h be renamed to lstrcatA -#include -#include +#include +#include +#include #else #include #ifndef BENCHMARK_OS_FUCHSIA @@ -28,7 +29,7 @@ #include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD #include #if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || \ - defined BENCHMARK_OS_NETBSD + defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD #define BENCHMARK_HAS_SYSCTL #include #endif @@ -36,6 +37,9 @@ #if defined(BENCHMARK_OS_SOLARIS) #include #endif +#if defined(BENCHMARK_OS_QNX) +#include +#endif #include #include @@ -52,6 +56,7 @@ #include #include #include +#include #include "check.h" #include "cycleclock.h" @@ -136,6 +141,26 @@ struct ValueUnion { }; ValueUnion GetSysctlImp(std::string const& Name) { +#if defined BENCHMARK_OS_OPENBSD + int mib[2]; + + mib[0] = CTL_HW; + if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")){ + ValueUnion buff(sizeof(int)); + + if (Name == "hw.ncpu") { + mib[1] = HW_NCPU; + } else { + mib[1] = HW_CPUSPEED; + } + + if (sysctl(mib, 2, buff.data(), &buff.Size, nullptr, 0) == -1) { + return ValueUnion(); + } + return buff; + } + return ValueUnion(); +#else size_t CurBuffSize = 0; if (sysctlbyname(Name.c_str(), nullptr, &CurBuffSize, nullptr, 0) == -1) return ValueUnion(); @@ -144,6 +169,7 @@ ValueUnion GetSysctlImp(std::string const& Name) { if (sysctlbyname(Name.c_str(), buff.data(), &buff.Size, nullptr, 0) == 0) return buff; return ValueUnion(); +#endif } BENCHMARK_MAYBE_UNUSED @@ -186,6 +212,9 @@ bool ReadFromFile(std::string const& fname, ArgT* arg) { bool CpuScalingEnabled(int num_cpus) { // We don't have a valid CPU count, so don't even bother. if (num_cpus <= 0) return false; +#ifdef BENCHMARK_OS_QNX + return false; +#endif #ifndef BENCHMARK_OS_WINDOWS // On Linux, the CPUfreq subsystem exposes CPU information as files on the // local file system. 
If reading the exported files fails, then we may not be @@ -204,7 +233,7 @@ int CountSetBitsInCPUMap(std::string Val) { auto CountBits = [](std::string Part) { using CPUMask = std::bitset; Part = "0x" + Part; - CPUMask Mask(std::stoul(Part, nullptr, 16)); + CPUMask Mask(benchmark::stoul(Part, nullptr, 16)); return static_cast(Mask.count()); }; size_t Pos; @@ -267,7 +296,7 @@ std::vector GetCacheSizesMacOSX() { std::string name; std::string type; int level; - size_t num_sharing; + uint64_t num_sharing; } Cases[] = {{"hw.l1dcachesize", "Data", 1, CacheCounts[1]}, {"hw.l1icachesize", "Instruction", 1, CacheCounts[1]}, {"hw.l2cachesize", "Unified", 2, CacheCounts[2]}, @@ -333,6 +362,40 @@ std::vector GetCacheSizesWindows() { } return res; } +#elif BENCHMARK_OS_QNX +std::vector GetCacheSizesQNX() { + std::vector res; + struct cacheattr_entry *cache = SYSPAGE_ENTRY(cacheattr); + uint32_t const elsize = SYSPAGE_ELEMENT_SIZE(cacheattr); + int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize ; + for(int i = 0; i < num; ++i ) { + CPUInfo::CacheInfo info; + switch (cache->flags){ + case CACHE_FLAG_INSTR : + info.type = "Instruction"; + info.level = 1; + break; + case CACHE_FLAG_DATA : + info.type = "Data"; + info.level = 1; + break; + case CACHE_FLAG_UNIFIED : + info.type = "Unified"; + info.level = 2; + case CACHE_FLAG_SHARED : + info.type = "Shared"; + info.level = 3; + default : + continue; + break; + } + info.size = cache->line_size * cache->num_lines; + info.num_sharing = 0; + res.push_back(std::move(info)); + cache = SYSPAGE_ARRAY_ADJ_OFFSET(cacheattr, cache, elsize); + } + return res; +} #endif std::vector GetCacheSizes() { @@ -340,11 +403,44 @@ std::vector GetCacheSizes() { return GetCacheSizesMacOSX(); #elif defined(BENCHMARK_OS_WINDOWS) return GetCacheSizesWindows(); +#elif defined(BENCHMARK_OS_QNX) + return GetCacheSizesQNX(); #else return GetCacheSizesFromKVFS(); #endif } +std::string GetSystemName() { +#if defined(BENCHMARK_OS_WINDOWS) + std::string str; + const unsigned COUNT = MAX_COMPUTERNAME_LENGTH+1; + TCHAR hostname[COUNT] = {'\0'}; + DWORD DWCOUNT = COUNT; + if (!GetComputerName(hostname, &DWCOUNT)) + return std::string(""); +#ifndef UNICODE + str = std::string(hostname, DWCOUNT); +#else + //Using wstring_convert, Is deprecated in C++17 + using convert_type = std::codecvt_utf8; + std::wstring_convert converter; + std::wstring wStr(hostname, DWCOUNT); + str = converter.to_bytes(wStr); +#endif + return str; +#else // defined(BENCHMARK_OS_WINDOWS) +#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined +#define HOST_NAME_MAX 64 +#elif defined(BENCHMARK_OS_QNX) +#define HOST_NAME_MAX 154 +#endif + char hostname[HOST_NAME_MAX]; + int retVal = gethostname(hostname, HOST_NAME_MAX); + if (retVal != 0) return std::string(""); + return std::string(hostname); +#endif // Catch-all POSIX block. 
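// (Both branches above produce a plain std::string: the Windows TCHAR buffer
// is converted through std::wstring_convert when UNICODE is defined, and the
// POSIX path uses gethostname() with the HOST_NAME_MAX fallbacks defined
// just above for platforms whose headers omit the constant.)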
+} + int GetNumCPUs() { #ifdef BENCHMARK_HAS_SYSCTL int NumCPU = -1; @@ -369,6 +465,8 @@ int GetNumCPUs() { strerror(errno)); } return NumCPU; +#elif defined(BENCHMARK_OS_QNX) + return static_cast(_syspage_ptr->num_cpu); #else int NumCPUs = 0; int MaxID = -1; @@ -383,11 +481,17 @@ int GetNumCPUs() { if (ln.empty()) continue; size_t SplitIdx = ln.find(':'); std::string value; +#if defined(__s390__) + // s390 has another format in /proc/cpuinfo + // it needs to be parsed differently + if (SplitIdx != std::string::npos) value = ln.substr(Key.size()+1,SplitIdx-Key.size()-1); +#else if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1); +#endif if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) { NumCPUs++; if (!value.empty()) { - int CurID = std::stoi(value); + int CurID = benchmark::stoi(value); MaxID = std::max(CurID, MaxID); } } @@ -460,12 +564,12 @@ double GetCPUCyclesPerSecond() { // which would cause infinite looping in WallTime_Init. if (startsWithKey(ln, "cpu MHz")) { if (!value.empty()) { - double cycles_per_second = std::stod(value) * 1000000.0; + double cycles_per_second = benchmark::stod(value) * 1000000.0; if (cycles_per_second > 0) return cycles_per_second; } } else if (startsWithKey(ln, "bogomips")) { if (!value.empty()) { - bogo_clock = std::stod(value) * 1000000.0; + bogo_clock = benchmark::stod(value) * 1000000.0; if (bogo_clock < 0.0) bogo_clock = error_value; } } @@ -489,12 +593,17 @@ double GetCPUCyclesPerSecond() { #if defined(BENCHMARK_OS_FREEBSD) || defined(BENCHMARK_OS_NETBSD) || \ defined(BENCHMARK_OS_OPENBSD) "machdep.tsc_freq"; +#elif defined BENCHMARK_OS_OPENBSD + "hw.cpuspeed"; #else "hw.cpufrequency"; #endif unsigned long long hz = 0; +#if defined BENCHMARK_OS_OPENBSD + if (GetSysctl(FreqStr, &hz)) return hz * 1000000; +#else if (GetSysctl(FreqStr, &hz)) return hz; - +#endif fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n", FreqStr, strerror(errno)); @@ -538,6 +647,9 @@ double GetCPUCyclesPerSecond() { double clock_hz = knp->value.ui64; kstat_close(kc); return clock_hz; +#elif defined (BENCHMARK_OS_QNX) + return static_cast((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) * + (int64_t)(1000 * 1000)); #endif // If we've fallen through, attempt to roughly estimate the CPU clock rate. 
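// (The estimate is simple arithmetic: with estimate_time_ms = 1000, the
// cycleclock tick delta measured across roughly one second of waiting is
// itself the tick rate per second, so it can be returned directly below.)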
const int estimate_time_ms = 1000; @@ -546,6 +658,24 @@ double GetCPUCyclesPerSecond() { return static_cast(cycleclock::Now() - start_ticks); } +std::vector GetLoadAvg() { +#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || \ + defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD || \ + defined BENCHMARK_OS_OPENBSD) && !defined(__ANDROID__) + constexpr int kMaxSamples = 3; + std::vector res(kMaxSamples, 0.0); + const int nelem = getloadavg(res.data(), kMaxSamples); + if (nelem < 1) { + res.clear(); + } else { + res.resize(nelem); + } + return res; +#else + return {}; +#endif +} + } // end namespace const CPUInfo& CPUInfo::Get() { @@ -557,6 +687,14 @@ CPUInfo::CPUInfo() : num_cpus(GetNumCPUs()), cycles_per_second(GetCPUCyclesPerSecond()), caches(GetCacheSizes()), - scaling_enabled(CpuScalingEnabled(num_cpus)) {} + scaling_enabled(CpuScalingEnabled(num_cpus)), + load_avg(GetLoadAvg()) {} + + +const SystemInfo& SystemInfo::Get() { + static const SystemInfo* info = new SystemInfo(); + return *info; +} +SystemInfo::SystemInfo() : name(GetSystemName()) {} } // end namespace benchmark diff --git a/lib/gbenchmark/src/thread_manager.h b/lib/gbenchmark/src/thread_manager.h index 82b4d72b62..1720281f0a 100644 --- a/lib/gbenchmark/src/thread_manager.h +++ b/lib/gbenchmark/src/thread_manager.h @@ -38,12 +38,10 @@ class ThreadManager { public: struct Result { - int64_t iterations = 0; + IterationCount iterations = 0; double real_time_used = 0; double cpu_time_used = 0; double manual_time_used = 0; - int64_t bytes_processed = 0; - int64_t items_processed = 0; int64_t complexity_n = 0; std::string report_label_; std::string error_message_; diff --git a/lib/gbenchmark/src/thread_timer.h b/lib/gbenchmark/src/thread_timer.h index eaf108e017..fbd298d3bd 100644 --- a/lib/gbenchmark/src/thread_timer.h +++ b/lib/gbenchmark/src/thread_timer.h @@ -8,14 +8,22 @@ namespace benchmark { namespace internal { class ThreadTimer { + explicit ThreadTimer(bool measure_process_cpu_time_) + : measure_process_cpu_time(measure_process_cpu_time_) {} + public: - ThreadTimer() = default; + static ThreadTimer Create() { + return ThreadTimer(/*measure_process_cpu_time_=*/false); + } + static ThreadTimer CreateProcessCpuTime() { + return ThreadTimer(/*measure_process_cpu_time_=*/true); + } // Called by each thread void StartTimer() { running_ = true; start_real_time_ = ChronoClockNow(); - start_cpu_time_ = ThreadCPUUsage(); + start_cpu_time_ = ReadCpuTimerOfChoice(); } // Called by each thread @@ -25,7 +33,8 @@ class ThreadTimer { real_time_used_ += ChronoClockNow() - start_real_time_; // Floating point error can result in the subtraction producing a negative // time. Guard against that. - cpu_time_used_ += std::max(ThreadCPUUsage() - start_cpu_time_, 0); + cpu_time_used_ += + std::max(ReadCpuTimerOfChoice() - start_cpu_time_, 0); } // Called by each thread @@ -52,6 +61,14 @@ class ThreadTimer { } private: + double ReadCpuTimerOfChoice() const { + if (measure_process_cpu_time) return ProcessCPUUsage(); + return ThreadCPUUsage(); + } + + // should the thread, or the process, time be measured? 
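// (Selected per benchmark at registration time, e.g.
//   BENCHMARK(BM_ContendedWork)->Threads(4)->MeasureProcessCPUTime();
// where BM_ContendedWork is a made-up name: every worker then reads
// ProcessCPUUsage() instead of ThreadCPUUsage(), so the reported CPU time
// covers all threads of the process.)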
+ const bool measure_process_cpu_time; + bool running_ = false; // Is the timer running double start_real_time_ = 0; // If running_ double start_cpu_time_ = 0; // If running_ diff --git a/lib/gbenchmark/src/timers.cc b/lib/gbenchmark/src/timers.cc index b8a0d87986..81ce466c82 100644 --- a/lib/gbenchmark/src/timers.cc +++ b/lib/gbenchmark/src/timers.cc @@ -16,10 +16,10 @@ #include "internal_macros.h" #ifdef BENCHMARK_OS_WINDOWS -#include +#include #undef StrCat // Don't let StrCat in string_util.h be renamed to lstrcatA -#include -#include +#include +#include #else #include #ifndef BENCHMARK_OS_FUCHSIA @@ -194,7 +194,6 @@ std::string DateTimeString(bool local) { std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now)); #else std::tm timeinfo; - std::memset(&timeinfo, 0, sizeof(std::tm)); ::localtime_r(&now, &timeinfo); written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo); #endif @@ -203,7 +202,6 @@ std::string DateTimeString(bool local) { written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now)); #else std::tm timeinfo; - std::memset(&timeinfo, 0, sizeof(std::tm)); ::gmtime_r(&now, &timeinfo); written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo); #endif diff --git a/lib/gbenchmark/test/AssemblyTests.cmake b/lib/gbenchmark/test/AssemblyTests.cmake index d8f321aa61..3d078586f1 100644 --- a/lib/gbenchmark/test/AssemblyTests.cmake +++ b/lib/gbenchmark/test/AssemblyTests.cmake @@ -1,4 +1,5 @@ +include(split_list) set(ASM_TEST_FLAGS "") check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG) diff --git a/lib/gbenchmark/test/BUILD b/lib/gbenchmark/test/BUILD index 2b3a391296..3f174c486f 100644 --- a/lib/gbenchmark/test/BUILD +++ b/lib/gbenchmark/test/BUILD @@ -53,5 +53,13 @@ cc_library( # FIXME: Add support for assembly tests to bazel. 
# See Issue #556 # https://github.com/google/benchmark/issues/556 - ) for test_src in glob(["*test.cc"], exclude = ["*_assembly_test.cc"]) + ) for test_src in glob(["*test.cc"], exclude = ["*_assembly_test.cc", "link_main_test.cc"]) ] + +cc_test( + name = "link_main_test", + size = "small", + srcs = ["link_main_test.cc"], + copts = TEST_COPTS, + deps = ["//:benchmark_main"], +) diff --git a/lib/gbenchmark/test/CMakeLists.txt b/lib/gbenchmark/test/CMakeLists.txt index 63c0e58ef3..030f35aae3 100644 --- a/lib/gbenchmark/test/CMakeLists.txt +++ b/lib/gbenchmark/test/CMakeLists.txt @@ -41,6 +41,10 @@ macro(compile_benchmark_test name) target_link_libraries(${name} benchmark ${CMAKE_THREAD_LIBS_INIT}) endmacro(compile_benchmark_test) +macro(compile_benchmark_test_with_main name) + add_executable(${name} "${name}.cc") + target_link_libraries(${name} benchmark_main) +endmacro(compile_benchmark_test_with_main) macro(compile_output_test name) add_executable(${name} "${name}.cc" output_test.h) @@ -59,14 +63,23 @@ macro(add_filter_test name filter expect) endmacro(add_filter_test) add_filter_test(filter_simple "Foo" 3) +add_filter_test(filter_simple_negative "-Foo" 2) add_filter_test(filter_suffix "BM_.*" 4) +add_filter_test(filter_suffix_negative "-BM_.*" 1) add_filter_test(filter_regex_all ".*" 5) +add_filter_test(filter_regex_all_negative "-.*" 0) add_filter_test(filter_regex_blank "" 5) +add_filter_test(filter_regex_blank_negative "-" 0) add_filter_test(filter_regex_none "monkey" 0) +add_filter_test(filter_regex_none_negative "-monkey" 5) add_filter_test(filter_regex_wildcard ".*Foo.*" 3) +add_filter_test(filter_regex_wildcard_negative "-.*Foo.*" 2) add_filter_test(filter_regex_begin "^BM_.*" 4) +add_filter_test(filter_regex_begin_negative "-^BM_.*" 1) add_filter_test(filter_regex_begin2 "^N" 1) +add_filter_test(filter_regex_begin2_negative "-^N" 4) add_filter_test(filter_regex_end ".*Ba$" 1) +add_filter_test(filter_regex_end_negative "-.*Ba$" 4) compile_benchmark_test(options_test) add_test(options_benchmarks options_test --benchmark_min_time=0.01) @@ -100,6 +113,9 @@ add_test(map_test map_test --benchmark_min_time=0.01) compile_benchmark_test(multiple_ranges_test) add_test(multiple_ranges_test multiple_ranges_test --benchmark_min_time=0.01) +compile_benchmark_test_with_main(link_main_test) +add_test(link_main_test link_main_test --benchmark_min_time=0.01) + compile_output_test(reporter_output_test) add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01) @@ -109,9 +125,24 @@ add_test(templated_fixture_test templated_fixture_test --benchmark_min_time=0.01 compile_output_test(user_counters_test) add_test(user_counters_test user_counters_test --benchmark_min_time=0.01) +compile_output_test(internal_threading_test) +add_test(internal_threading_test internal_threading_test --benchmark_min_time=0.01) + +compile_output_test(report_aggregates_only_test) +add_test(report_aggregates_only_test report_aggregates_only_test --benchmark_min_time=0.01) + +compile_output_test(display_aggregates_only_test) +add_test(display_aggregates_only_test display_aggregates_only_test --benchmark_min_time=0.01) + compile_output_test(user_counters_tabular_test) add_test(user_counters_tabular_test user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01) +compile_output_test(user_counters_thousands_test) +add_test(user_counters_thousands_test user_counters_thousands_test --benchmark_min_time=0.01) + +compile_output_test(memory_manager_test) +add_test(memory_manager_test 
memory_manager_test --benchmark_min_time=0.01) + check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG) if (BENCHMARK_HAS_CXX03_FLAG) compile_benchmark_test(cxx03_test) @@ -147,14 +178,8 @@ add_test(complexity_benchmark complexity_test --benchmark_min_time=${COMPLEXITY_ if (BENCHMARK_ENABLE_GTEST_TESTS) macro(compile_gtest name) add_executable(${name} "${name}.cc") - if (TARGET googletest) - add_dependencies(${name} googletest) - endif() - if (GTEST_INCLUDE_DIRS) - target_include_directories(${name} PRIVATE ${GTEST_INCLUDE_DIRS}) - endif() target_link_libraries(${name} benchmark - ${GTEST_BOTH_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) + gmock_main ${CMAKE_THREAD_LIBS_INIT}) endmacro(compile_gtest) macro(add_gtest name) @@ -163,7 +188,10 @@ if (BENCHMARK_ENABLE_GTEST_TESTS) endmacro() add_gtest(benchmark_gtest) + add_gtest(benchmark_name_gtest) + add_gtest(commandlineflags_gtest) add_gtest(statistics_gtest) + add_gtest(string_util_gtest) endif(BENCHMARK_ENABLE_GTEST_TESTS) ############################################################################### diff --git a/lib/gbenchmark/test/basic_test.cc b/lib/gbenchmark/test/basic_test.cc index 100f68985c..5f3dd1a3ee 100644 --- a/lib/gbenchmark/test/basic_test.cc +++ b/lib/gbenchmark/test/basic_test.cc @@ -98,7 +98,8 @@ BENCHMARK(BM_empty_stop_start)->ThreadPerCpu(); void BM_KeepRunning(benchmark::State& state) { - size_t iter_count = 0; + benchmark::IterationCount iter_count = 0; + assert(iter_count == state.iterations()); while (state.KeepRunning()) { ++iter_count; } @@ -108,8 +109,8 @@ BENCHMARK(BM_KeepRunning); void BM_KeepRunningBatch(benchmark::State& state) { // Choose a prime batch size to avoid evenly dividing max_iterations. - const size_t batch_size = 101; - size_t iter_count = 0; + const benchmark::IterationCount batch_size = 101; + benchmark::IterationCount iter_count = 0; while (state.KeepRunningBatch(batch_size)) { iter_count += batch_size; } @@ -118,7 +119,7 @@ void BM_KeepRunningBatch(benchmark::State& state) { BENCHMARK(BM_KeepRunningBatch); void BM_RangedFor(benchmark::State& state) { - size_t iter_count = 0; + benchmark::IterationCount iter_count = 0; for (auto _ : state) { ++iter_count; } diff --git a/lib/gbenchmark/test/benchmark_gtest.cc b/lib/gbenchmark/test/benchmark_gtest.cc index 10683b433a..9557b20ec7 100644 --- a/lib/gbenchmark/test/benchmark_gtest.cc +++ b/lib/gbenchmark/test/benchmark_gtest.cc @@ -4,6 +4,8 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +namespace benchmark { +namespace internal { namespace { TEST(AddRangeTest, Simple) { @@ -30,4 +32,97 @@ TEST(AddRangeTest, Advanced64) { EXPECT_THAT(dst, testing::ElementsAre(5, 8, 15)); } -} // end namespace +TEST(AddRangeTest, FullRange8) { + std::vector<int8_t> dst; + AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), 8); + EXPECT_THAT(dst, testing::ElementsAre(1, 8, 64, 127)); +} + +TEST(AddRangeTest, FullRange64) { + std::vector<int64_t> dst; + AddRange(&dst, int64_t{1}, std::numeric_limits<int64_t>::max(), 1024); + EXPECT_THAT( + dst, testing::ElementsAre(1LL, 1024LL, 1048576LL, 1073741824LL, + 1099511627776LL, 1125899906842624LL, + 1152921504606846976LL, 9223372036854775807LL)); +} + +TEST(AddRangeTest, NegativeRanges) { + std::vector<int> dst; + AddRange(&dst, -8, 0, 2); + EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1, 0)); +} + +TEST(AddRangeTest, StrictlyNegative) { + std::vector<int> dst; + AddRange(&dst, -8, -1, 2); + EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1)); +} + +TEST(AddRangeTest, SymmetricNegativeRanges) { + std::vector<int> dst; + AddRange(&dst, -8, 8, 2); +
EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1, 0, 1, 2, 4, 8)); +} + +TEST(AddRangeTest, SymmetricNegativeRangesOddMult) { + std::vector<int> dst; + AddRange(&dst, -30, 32, 5); + EXPECT_THAT(dst, testing::ElementsAre(-30, -25, -5, -1, 0, 1, 5, 25, 32)); +} + +TEST(AddRangeTest, NegativeRangesAsymmetric) { + std::vector<int> dst; + AddRange(&dst, -3, 5, 2); + EXPECT_THAT(dst, testing::ElementsAre(-3, -2, -1, 0, 1, 2, 4, 5)); +} + +TEST(AddRangeTest, NegativeRangesLargeStep) { + // Always include -1, 0, 1 when crossing zero. + std::vector<int> dst; + AddRange(&dst, -8, 8, 10); + EXPECT_THAT(dst, testing::ElementsAre(-8, -1, 0, 1, 8)); +} + +TEST(AddRangeTest, ZeroOnlyRange) { + std::vector<int> dst; + AddRange(&dst, 0, 0, 2); + EXPECT_THAT(dst, testing::ElementsAre(0)); +} + +TEST(AddRangeTest, NegativeRange64) { + std::vector<int64_t> dst; + AddRange<int64_t>(&dst, -4, 4, 2); + EXPECT_THAT(dst, testing::ElementsAre(-4, -2, -1, 0, 1, 2, 4)); +} + +TEST(AddRangeTest, NegativeRangePreservesExistingOrder) { + // If elements already exist in the range, ensure we don't change + // their ordering by adding negative values. + std::vector<int64_t> dst = {1, 2, 3}; + AddRange<int64_t>(&dst, -2, 2, 2); + EXPECT_THAT(dst, testing::ElementsAre(1, 2, 3, -2, -1, 0, 1, 2)); +} + +TEST(AddRangeTest, FullNegativeRange64) { + std::vector<int64_t> dst; + const auto min = std::numeric_limits<int64_t>::min(); + const auto max = std::numeric_limits<int64_t>::max(); + AddRange(&dst, min, max, 1024); + EXPECT_THAT( + dst, testing::ElementsAreArray(std::vector<int64_t>{ + min, -1152921504606846976LL, -1125899906842624LL, + -1099511627776LL, -1073741824LL, -1048576LL, -1024LL, -1LL, 0LL, + 1LL, 1024LL, 1048576LL, 1073741824LL, 1099511627776LL, + 1125899906842624LL, 1152921504606846976LL, max})); +} + +TEST(AddRangeTest, Simple8) { + std::vector<int8_t> dst; + AddRange<int8_t>(&dst, 1, 8, 2); + EXPECT_THAT(dst, testing::ElementsAre(1, 2, 4, 8)); +} + +} // namespace +} // namespace internal +} // namespace benchmark diff --git a/lib/gbenchmark/test/benchmark_name_gtest.cc b/lib/gbenchmark/test/benchmark_name_gtest.cc new file mode 100644 index 0000000000..afb401c1f5 --- /dev/null +++ b/lib/gbenchmark/test/benchmark_name_gtest.cc @@ -0,0 +1,74 @@ +#include "benchmark/benchmark.h" +#include "gtest/gtest.h" + +namespace { + +using namespace benchmark; +using namespace benchmark::internal; + +TEST(BenchmarkNameTest, Empty) { + const auto name = BenchmarkName(); + EXPECT_EQ(name.str(), std::string()); +} + +TEST(BenchmarkNameTest, FunctionName) { + auto name = BenchmarkName(); + name.function_name = "function_name"; + EXPECT_EQ(name.str(), "function_name"); +} + +TEST(BenchmarkNameTest, FunctionNameAndArgs) { + auto name = BenchmarkName(); + name.function_name = "function_name"; + name.args = "some_args:3/4/5"; + EXPECT_EQ(name.str(), "function_name/some_args:3/4/5"); +} + +TEST(BenchmarkNameTest, MinTime) { + auto name = BenchmarkName(); + name.function_name = "function_name"; + name.args = "some_args:3/4"; + name.min_time = "min_time:3.4s"; + EXPECT_EQ(name.str(), "function_name/some_args:3/4/min_time:3.4s"); +} + +TEST(BenchmarkNameTest, Iterations) { + auto name = BenchmarkName(); + name.function_name = "function_name"; + name.min_time = "min_time:3.4s"; + name.iterations = "iterations:42"; + EXPECT_EQ(name.str(), "function_name/min_time:3.4s/iterations:42"); +} + +TEST(BenchmarkNameTest, Repetitions) { + auto name = BenchmarkName(); + name.function_name = "function_name"; + name.min_time = "min_time:3.4s"; + name.repetitions = "repetitions:24"; + EXPECT_EQ(name.str(), "function_name/min_time:3.4s/repetitions:24"); +}
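+// (Annotation, not upstream code: BenchmarkName is a plain struct of name
+// segments, and str() joins whichever fields are non-empty with '/'. Taken
+// together, the cases above and below pin the relative segment order down;
+// it is consistent with
+// function_name/args/min_time/iterations/repetitions/time_type/threads.)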
+ +TEST(BenchmarkNameTest, TimeType) { + auto name = BenchmarkName(); + name.function_name = "function_name"; + name.min_time = "min_time:3.4s"; + name.time_type = "hammer_time"; + EXPECT_EQ(name.str(), "function_name/min_time:3.4s/hammer_time"); +} + +TEST(BenchmarkNameTest, Threads) { + auto name = BenchmarkName(); + name.function_name = "function_name"; + name.min_time = "min_time:3.4s"; + name.threads = "threads:256"; + EXPECT_EQ(name.str(), "function_name/min_time:3.4s/threads:256"); +} + +TEST(BenchmarkNameTest, TestEmptyFunctionName) { + auto name = BenchmarkName(); + name.args = "first:3/second:4"; + name.threads = "threads:22"; + EXPECT_EQ(name.str(), "first:3/second:4/threads:22"); +} + +} // end namespace diff --git a/lib/gbenchmark/test/commandlineflags_gtest.cc b/lib/gbenchmark/test/commandlineflags_gtest.cc new file mode 100644 index 0000000000..5460778c48 --- /dev/null +++ b/lib/gbenchmark/test/commandlineflags_gtest.cc @@ -0,0 +1,78 @@ +#include <cstdlib> + +#include "../src/commandlineflags.h" +#include "../src/internal_macros.h" +#include "gtest/gtest.h" + +namespace benchmark { +namespace { + +#if defined(BENCHMARK_OS_WINDOWS) +int setenv(const char* name, const char* value, int overwrite) { + if (!overwrite) { + // NOTE: getenv_s is far superior but not available under mingw. + char* env_value = getenv(name); + if (env_value == nullptr) { + return -1; + } + } + return _putenv_s(name, value); +} + +int unsetenv(const char* name) { + return _putenv_s(name, ""); +} + +#endif // BENCHMARK_OS_WINDOWS + +TEST(BoolFromEnv, Default) { + ASSERT_EQ(unsetenv("BENCHMARK_NOT_IN_ENV"), 0); + EXPECT_EQ(BoolFromEnv("not_in_env", true), true); +} + +TEST(BoolFromEnv, False) { + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "0", 1), 0); + EXPECT_EQ(BoolFromEnv("in_env", true), false); + unsetenv("BENCHMARK_IN_ENV"); +} + +TEST(BoolFromEnv, True) { + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "1", 1), 0); + EXPECT_EQ(BoolFromEnv("in_env", false), true); + unsetenv("BENCHMARK_IN_ENV"); + + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "foo", 1), 0); + EXPECT_EQ(BoolFromEnv("in_env", false), true); + unsetenv("BENCHMARK_IN_ENV"); +} + +TEST(Int32FromEnv, NotInEnv) { + ASSERT_EQ(unsetenv("BENCHMARK_NOT_IN_ENV"), 0); + EXPECT_EQ(Int32FromEnv("not_in_env", 42), 42); +} + +TEST(Int32FromEnv, InvalidInteger) { + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "foo", 1), 0); + EXPECT_EQ(Int32FromEnv("in_env", 42), 42); + ASSERT_EQ(unsetenv("BENCHMARK_IN_ENV"), 0); +} + +TEST(Int32FromEnv, ValidInteger) { + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "42", 1), 0); + EXPECT_EQ(Int32FromEnv("in_env", 64), 42); + unsetenv("BENCHMARK_IN_ENV"); +} + +TEST(StringFromEnv, Default) { + ASSERT_EQ(unsetenv("BENCHMARK_NOT_IN_ENV"), 0); + EXPECT_STREQ(StringFromEnv("not_in_env", "foo"), "foo"); +} + +TEST(StringFromEnv, Valid) { + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "foo", 1), 0); + EXPECT_STREQ(StringFromEnv("in_env", "bar"), "foo"); + unsetenv("BENCHMARK_IN_ENV"); +} + +} // namespace +} // namespace benchmark diff --git a/lib/gbenchmark/test/complexity_test.cc b/lib/gbenchmark/test/complexity_test.cc index ab832861ec..d4febbbc15 100644 --- a/lib/gbenchmark/test/complexity_test.cc +++ b/lib/gbenchmark/test/complexity_test.cc @@ -12,9 +12,10 @@ namespace { #define ADD_COMPLEXITY_CASES(...)
\ int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__) -int AddComplexityTest(std::string big_o_test_name, std::string rms_test_name, - std::string big_o) { - SetSubstitutions({{"%bigo_name", big_o_test_name}, +int AddComplexityTest(std::string test_name, std::string big_o_test_name, + std::string rms_test_name, std::string big_o) { + SetSubstitutions({{"%name", test_name}, + {"%bigo_name", big_o_test_name}, {"%rms_name", rms_test_name}, {"%bigo_str", "[ ]* %float " + big_o}, {"%bigo", big_o}, @@ -25,12 +26,22 @@ int AddComplexityTest(std::string big_o_test_name, std::string rms_test_name, {"^%bigo_name", MR_Not}, // Assert that we didn't only match a name. {"^%rms_name %rms %rms[ ]*$", MR_Next}}); AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"}, + {"\"run_name\": \"%name\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": %int,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"BigO\",$", MR_Next}, {"\"cpu_coefficient\": %float,$", MR_Next}, {"\"real_coefficient\": %float,$", MR_Next}, {"\"big_o\": \"%bigo\",$", MR_Next}, {"\"time_unit\": \"ns\"$", MR_Next}, {"}", MR_Next}, {"\"name\": \"%rms_name\",$"}, + {"\"run_name\": \"%name\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": %int,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"RMS\",$", MR_Next}, {"\"rms\": %float$", MR_Next}, {"}", MR_Next}}); AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"}, @@ -55,10 +66,11 @@ void BM_Complexity_O1(benchmark::State& state) { } BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1); BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(); -BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity([](int64_t) { - return 1.0; -}); +BENCHMARK(BM_Complexity_O1) + ->Range(1, 1 << 18) + ->Complexity([](benchmark::IterationCount) { return 1.0; }); +const char *one_test_name = "BM_Complexity_O1"; const char *big_o_1_test_name = "BM_Complexity_O1_BigO"; const char *rms_o_1_test_name = "BM_Complexity_O1_RMS"; const char *enum_big_o_1 = "\\([0-9]+\\)"; @@ -69,13 +81,16 @@ const char *auto_big_o_1 = "(\\([0-9]+\\))|(lgN)"; const char *lambda_big_o_1 = "f\\(N\\)"; // Add enum tests -ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, enum_big_o_1); +ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name, + enum_big_o_1); // Add auto enum tests -ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, auto_big_o_1); +ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name, + auto_big_o_1); // Add lambda tests -ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, lambda_big_o_1); +ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name, + lambda_big_o_1); // ========================================================================= // // --------------------------- Testing BigO O(N) --------------------------- // @@ -85,7 +100,7 @@ std::vector<int> ConstructRandomVector(int64_t size) { std::vector<int> v; v.reserve(static_cast<size_t>(size)); for (int i = 0; i < size; ++i) { - v.push_back(std::rand() % size); + v.push_back(static_cast<int>(std::rand() % size)); } return v; } @@ -106,22 +121,27 @@ BENCHMARK(BM_Complexity_O_N) BENCHMARK(BM_Complexity_O_N) ->RangeMultiplier(2) ->Range(1 << 10, 1 << 16) - ->Complexity([](int64_t n) -> double { return n; }); + ->Complexity([](benchmark::IterationCount n) -> double { + return static_cast<double>(n); + }); BENCHMARK(BM_Complexity_O_N) ->RangeMultiplier(2) ->Range(1 << 10,
1 << 16) ->Complexity(); +const char *n_test_name = "BM_Complexity_O_N"; const char *big_o_n_test_name = "BM_Complexity_O_N_BigO"; const char *rms_o_n_test_name = "BM_Complexity_O_N_RMS"; const char *enum_auto_big_o_n = "N"; const char *lambda_big_o_n = "f\\(N\\)"; // Add enum tests -ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, enum_auto_big_o_n); +ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name, + enum_auto_big_o_n); // Add lambda tests -ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, lambda_big_o_n); +ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name, + lambda_big_o_n); // ========================================================================= // // ------------------------- Testing BigO O(N*lgN) ------------------------- // @@ -134,6 +154,7 @@ static void BM_Complexity_O_N_log_N(benchmark::State& state) { for (auto _ : state) { } state.SetComplexityN(state.range(0)); } +static const double kLog2E = 1.44269504088896340736; BENCHMARK(BM_Complexity_O_N_log_N) ->RangeMultiplier(2) ->Range(1 << 10, 1 << 16) @@ -141,24 +162,47 @@ BENCHMARK(BM_Complexity_O_N_log_N) BENCHMARK(BM_Complexity_O_N_log_N) ->RangeMultiplier(2) ->Range(1 << 10, 1 << 16) - ->Complexity([](int64_t n) { return n * log2(n); }); + ->Complexity([](benchmark::IterationCount n) { + return kLog2E * n * log(static_cast<double>(n)); + }); BENCHMARK(BM_Complexity_O_N_log_N) ->RangeMultiplier(2) ->Range(1 << 10, 1 << 16) ->Complexity(); +const char *n_lg_n_test_name = "BM_Complexity_O_N_log_N"; const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO"; const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS"; const char *enum_auto_big_o_n_lg_n = "NlgN"; const char *lambda_big_o_n_lg_n = "f\\(N\\)"; // Add enum tests -ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, - enum_auto_big_o_n_lg_n); +ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name, + rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n); // Add lambda tests -ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name, - lambda_big_o_n_lg_n); +ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name, + rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n); + +// ========================================================================= // +// -------- Testing formatting of Complexity with captured args ------------ // +// ========================================================================= // + +void BM_ComplexityCaptureArgs(benchmark::State& state, int n) { + for (auto _ : state) { + } + state.SetComplexityN(n); +} + +BENCHMARK_CAPTURE(BM_ComplexityCaptureArgs, capture_test, 100) + ->Complexity(benchmark::oN) + ->Ranges({{1, 2}, {3, 4}}); + +const std::string complexity_capture_name = + "BM_ComplexityCaptureArgs/capture_test"; + +ADD_COMPLEXITY_CASES(complexity_capture_name, complexity_capture_name + "_BigO", + complexity_capture_name + "_RMS", "N"); // ========================================================================= // // --------------------------- TEST CASES END ------------------------------ // diff --git a/lib/gbenchmark/test/cxx03_test.cc b/lib/gbenchmark/test/cxx03_test.cc index baa9ed9262..c4c9a52273 100644 --- a/lib/gbenchmark/test/cxx03_test.cc +++ b/lib/gbenchmark/test/cxx03_test.cc @@ -14,7 +14,7 @@ void BM_empty(benchmark::State& state) { while (state.KeepRunning()) { - volatile std::size_t x = state.iterations(); + volatile benchmark::IterationCount x = state.iterations(); ((void)x); } } diff --git a/lib/gbenchmark/test/display_aggregates_only_test.cc
b/lib/gbenchmark/test/display_aggregates_only_test.cc new file mode 100644 index 0000000000..3c36d3f03c --- /dev/null +++ b/lib/gbenchmark/test/display_aggregates_only_test.cc @@ -0,0 +1,43 @@ + +#undef NDEBUG +#include <cstdio> +#include <string> + +#include "benchmark/benchmark.h" +#include "output_test.h" + +// Ok this test is super ugly. We want to check what happens with the file +// reporter in the presence of DisplayAggregatesOnly(). +// We do not care about console output, the normal tests check that already. + +void BM_SummaryRepeat(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->DisplayAggregatesOnly(); + +int main(int argc, char* argv[]) { + const std::string output = GetFileReporterOutput(argc, argv); + + if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 6 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3\"") != 3 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") != + 1 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") != + 1) { + std::cout << "Precondition mismatch. Expected to only find 6 " + "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n" + "\"name\": \"BM_SummaryRepeat/repeats:3\", " + "\"name\": \"BM_SummaryRepeat/repeats:3\", " + "\"name\": \"BM_SummaryRepeat/repeats:3\", " + "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", " + "\"name\": \"BM_SummaryRepeat/repeats:3_median\", " + "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire " + "output:\n"; + std::cout << output; + return 1; + } + + return 0; +} diff --git a/lib/gbenchmark/test/internal_threading_test.cc b/lib/gbenchmark/test/internal_threading_test.cc new file mode 100644 index 0000000000..039d7c14a8 --- /dev/null +++ b/lib/gbenchmark/test/internal_threading_test.cc @@ -0,0 +1,184 @@ + +#undef NDEBUG + +#include <chrono> +#include <thread> +#include "../src/timers.h" +#include "benchmark/benchmark.h" +#include "output_test.h" + +static const std::chrono::duration<double, std::milli> time_frame(50); +static const double time_frame_in_sec( + std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>( + time_frame) + .count()); + +void MyBusySpinwait() { + const auto start = benchmark::ChronoClockNow(); + + while (true) { + const auto now = benchmark::ChronoClockNow(); + const auto elapsed = now - start; + + if (std::chrono::duration<double, std::chrono::seconds::period>(elapsed) >= + time_frame) + return; + } +} + +// ========================================================================= // +// --------------------------- TEST CASES BEGIN ---------------------------- // +// ========================================================================= // + +// ========================================================================= // +// BM_MainThread + +void BM_MainThread(benchmark::State& state) { + for (auto _ : state) { + MyBusySpinwait(); + state.SetIterationTime(time_frame_in_sec); + } + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1); +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseRealTime(); +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseManualTime(); +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_MainThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_MainThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); +
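+// (Annotation, not upstream code: this file enumerates every combination of
+// thread count and timing mode. The property under test: with
+// MeasureProcessCPUTime(), CPU time burned by any thread of the process
+// during an iteration, including workers spawned inside the benchmark body,
+// counts toward the measurement, whereas the default clock only observes
+// the benchmark's own thread. The kIsRate counter "invtime" reports
+// 1 / measured_time, giving the output checks a single value to compare
+// across the modes.)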
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2); +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseRealTime(); +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseManualTime(); +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_MainThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_MainThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// ========================================================================= // +// BM_WorkerThread + +void BM_WorkerThread(benchmark::State& state) { + for (auto _ : state) { + std::thread Worker(&MyBusySpinwait); + Worker.join(); + state.SetIterationTime(time_frame_in_sec); + } + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseRealTime(); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseManualTime(); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_WorkerThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_WorkerThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseRealTime(); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseManualTime(); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_WorkerThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_WorkerThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// ========================================================================= // +// BM_MainThreadAndWorkerThread + +void BM_MainThreadAndWorkerThread(benchmark::State& state) { + for (auto _ : state) { + std::thread Worker(&MyBusySpinwait); + MyBusySpinwait(); + Worker.join(); + state.SetIterationTime(time_frame_in_sec); + } + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(1); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(1) + ->UseRealTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(1) + ->UseManualTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(2); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(2) + ->UseRealTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(2) + ->UseManualTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// 
========================================================================= // +// ---------------------------- TEST CASES END ----------------------------- // +// ========================================================================= // + +int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/lib/gbenchmark/test/link_main_test.cc b/lib/gbenchmark/test/link_main_test.cc new file mode 100644 index 0000000000..241ad5c390 --- /dev/null +++ b/lib/gbenchmark/test/link_main_test.cc @@ -0,0 +1,8 @@ +#include "benchmark/benchmark.h" + +void BM_empty(benchmark::State& state) { + for (auto _ : state) { + benchmark::DoNotOptimize(state.iterations()); + } +} +BENCHMARK(BM_empty); diff --git a/lib/gbenchmark/test/memory_manager_test.cc b/lib/gbenchmark/test/memory_manager_test.cc new file mode 100644 index 0000000000..90bed16cff --- /dev/null +++ b/lib/gbenchmark/test/memory_manager_test.cc @@ -0,0 +1,44 @@ +#include <memory> + +#include "../src/check.h" +#include "benchmark/benchmark.h" +#include "output_test.h" + +class TestMemoryManager : public benchmark::MemoryManager { + void Start() {} + void Stop(Result* result) { + result->num_allocs = 42; + result->max_bytes_used = 42000; + } +}; + +void BM_empty(benchmark::State& state) { + for (auto _ : state) { + benchmark::DoNotOptimize(state.iterations()); + } +} +BENCHMARK(BM_empty); + +ADD_CASES(TC_ConsoleOut, {{"^BM_empty %console_report$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"}, + {"\"run_name\": \"BM_empty\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"allocs_per_iter\": %float,$", MR_Next}, + {"\"max_bytes_used\": 42000$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_empty\",%csv_report$"}}); + +int main(int argc, char* argv[]) { + std::unique_ptr<benchmark::MemoryManager> mm(new TestMemoryManager()); + + benchmark::RegisterMemoryManager(mm.get()); + RunOutputTests(argc, argv); + benchmark::RegisterMemoryManager(nullptr); +} diff --git a/lib/gbenchmark/test/multiple_ranges_test.cc b/lib/gbenchmark/test/multiple_ranges_test.cc index c64acabc25..b25f40eb52 100644 --- a/lib/gbenchmark/test/multiple_ranges_test.cc +++ b/lib/gbenchmark/test/multiple_ranges_test.cc @@ -40,8 +40,7 @@ class MultipleRangesFixture : public ::benchmark::Fixture { // NOTE: This is not TearDown as we want to check after _all_ runs are // complete.
virtual ~MultipleRangesFixture() { - assert(actualValues.size() == expectedValues.size()); - if (actualValues.size() != expectedValues.size()) { + if (actualValues != expectedValues) { std::cout << "EXPECTED\n"; for (auto v : expectedValues) { std::cout << "{"; diff --git a/lib/gbenchmark/test/options_test.cc b/lib/gbenchmark/test/options_test.cc index fdec69174e..7bfc235465 100644 --- a/lib/gbenchmark/test/options_test.cc +++ b/lib/gbenchmark/test/options_test.cc @@ -35,6 +35,16 @@ BENCHMARK(BM_basic)->UseRealTime(); BENCHMARK(BM_basic)->ThreadRange(2, 4); BENCHMARK(BM_basic)->ThreadPerCpu(); BENCHMARK(BM_basic)->Repetitions(3); +BENCHMARK(BM_basic) + ->RangeMultiplier(std::numeric_limits<int>::max()) + ->Range(std::numeric_limits<int64_t>::min(), + std::numeric_limits<int64_t>::max()); + +// Negative ranges +BENCHMARK(BM_basic)->Range(-64, -1); +BENCHMARK(BM_basic)->RangeMultiplier(4)->Range(-8, 8); +BENCHMARK(BM_basic)->DenseRange(-2, 2, 1); +BENCHMARK(BM_basic)->Ranges({{-64, 1}, {-8, -1}}); void CustomArgs(benchmark::internal::Benchmark* b) { for (int i = 0; i < 10; ++i) { diff --git a/lib/gbenchmark/test/output_test.h b/lib/gbenchmark/test/output_test.h index 897a13866b..9385761b21 100644 --- a/lib/gbenchmark/test/output_test.h +++ b/lib/gbenchmark/test/output_test.h @@ -2,13 +2,13 @@ #define TEST_OUTPUT_TEST_H #undef NDEBUG +#include <functional> #include <initializer_list> #include <map> +#include <memory> #include <sstream> #include <string> #include <vector> -#include <functional> -#include <memory> #include "../src/re.h" #include "benchmark/benchmark.h" @@ -60,6 +60,13 @@ int SetSubstitutions( // Run all output tests. void RunOutputTests(int argc, char* argv[]); +// Count the number of 'pat' substrings in the 'haystack' string. +int SubstrCnt(const std::string& haystack, const std::string& pat); + +// Run registered benchmarks with file reporter enabled, and return the content +// outputted by the file reporter. +std::string GetFileReporterOutput(int argc, char* argv[]); + // ========================================================================= // // ------------------------- Results checking ------------------------------ // // ========================================================================= // @@ -73,26 +80,27 @@ void RunOutputTests(int argc, char* argv[]); // will be the subject of a call to checker_function // checker_function: should be of type ResultsCheckFn (see below) #define CHECK_BENCHMARK_RESULTS(bm_name_pattern, checker_function) \ - size_t CONCAT(dummy, __LINE__) = AddChecker(bm_name_pattern, checker_function) + size_t CONCAT(dummy, __LINE__) = AddChecker(bm_name_pattern, checker_function) struct Results; -typedef std::function< void(Results const&) > ResultsCheckFn; +typedef std::function<void(Results const&)> ResultsCheckFn; size_t AddChecker(const char* bm_name_pattern, ResultsCheckFn fn); // Class holding the results of a benchmark. // It is passed in calls to checker functions. struct Results { - // the benchmark name std::string name; // the benchmark fields - std::map< std::string, std::string > values; + std::map<std::string, std::string> values; Results(const std::string& n) : name(n) {} int NumThreads() const; + double NumIterations() const; + typedef enum { kCpuTime, kRealTime } BenchmarkTime; // get cpu_time or real_time in seconds @@ -102,18 +110,18 @@ struct Results { // it is better to use fuzzy float checks for this, as the float // ASCII formatting is lossy.
double DurationRealTime() const { - return GetAs< double >("iterations") * GetTime(kRealTime); + return NumIterations() * GetTime(kRealTime); } // get the cpu_time duration of the benchmark in seconds double DurationCPUTime() const { - return GetAs< double >("iterations") * GetTime(kCpuTime); + return NumIterations() * GetTime(kCpuTime); } // get the string for a result by name, or nullptr if the name // is not found const std::string* Get(const char* entry_name) const { auto it = values.find(entry_name); - if(it == values.end()) return nullptr; + if (it == values.end()) return nullptr; return &it->second; } @@ -126,15 +134,15 @@ struct Results { // as a double, and only then converted to the asked type. template <class T> T GetCounterAs(const char* entry_name) const { - double dval = GetAs< double >(entry_name); - T tval = static_cast< T >(dval); + double dval = GetAs<double>(entry_name); + T tval = static_cast<T>(dval); return tval; } }; template <class T> T Results::GetAs(const char* entry_name) const { - auto *sv = Get(entry_name); + auto* sv = Get(entry_name); CHECK(sv != nullptr && !sv->empty()); std::stringstream ss; ss << *sv; @@ -148,6 +156,8 @@ T Results::GetAs(const char* entry_name) const { // Macros to help in result checking. Do not use them with arguments causing // side-effects. +// clang-format off + #define _CHECK_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value) \ CONCAT(CHECK_, relationship) \ (entry.getfn< var_type >(var_name), (value)) << "\n" \ @@ -188,6 +198,8 @@ T Results::GetAs(const char* entry_name) const { #define CHECK_FLOAT_COUNTER_VALUE(entry, var_name, relationship, value, eps_factor) \ _CHECK_FLOAT_RESULT_VALUE(entry, GetCounterAs, double, var_name, relationship, value, eps_factor) +// clang-format on + // ========================================================================= // // --------------------------- Misc Utilities ------------------------------ // // ========================================================================= // diff --git a/lib/gbenchmark/test/output_test_helper.cc b/lib/gbenchmark/test/output_test_helper.cc index 6b18fe4359..5dc951d2bc 100644 --- a/lib/gbenchmark/test/output_test_helper.cc +++ b/lib/gbenchmark/test/output_test_helper.cc @@ -1,13 +1,17 @@ +#include <cstdio> +#include <cstring> +#include <fstream> #include <iostream> #include <map> #include <memory> +#include <random> #include <sstream> -#include <cstring> +#include <streambuf> +#include "../src/benchmark_api_internal.h" #include "../src/check.h" // NOTE: check.h is for internal use only! #include "../src/re.h" // NOTE: re.h is for internal use only #include "output_test.h" -#include "../src/benchmark_api_internal.h" // ========================================================================= // // ------------------------------ Internals -------------------------------- // @@ -33,16 +37,20 @@ TestCaseList& GetTestCaseList(TestCaseID ID) { SubMap& GetSubstitutions() { // Don't use 'dec_re' from header because it may not yet be initialized.
+ // clang-format off static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"; + static std::string time_re = "([0-9]+[.])?[0-9]+"; static SubMap map = { {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"}, // human-readable float {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"}, {"%int", "[ ]*[0-9]+"}, {" %s ", "[ ]+"}, - {"%time", "[ ]*[0-9]{1,6} ns"}, - {"%console_report", "[ ]*[0-9]{1,6} ns [ ]*[0-9]{1,6} ns [ ]*[0-9]+"}, - {"%console_us_report", "[ ]*[0-9] us [ ]*[0-9] us [ ]*[0-9]+"}, + {"%time", "[ ]*" + time_re + "[ ]+ns"}, + {"%console_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns [ ]*[0-9]+"}, + {"%console_time_only_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns"}, + {"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"}, + {"%console_us_time_only_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us"}, {"%csv_header", "name,iterations,real_time,cpu_time,time_unit,bytes_per_second," "items_per_second,label,error_occurred,error_message"}, @@ -57,6 +65,7 @@ SubMap& GetSubstitutions() { "," + safe_dec_re + ",,,"}, {"%csv_label_report_begin", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,"}, {"%csv_label_report_end", ",,"}}; + // clang-format on return map; } @@ -147,9 +156,9 @@ class TestReporter : public benchmark::BenchmarkReporter { } private: - std::vector<benchmark::BenchmarkReporter *> reporters_; + std::vector<benchmark::BenchmarkReporter*> reporters_; }; -} +} // namespace } // end namespace internal @@ -163,28 +172,25 @@ namespace internal { // It works by parsing the CSV output to read the results. class ResultsChecker { public: - - struct PatternAndFn : public TestCase { // reusing TestCase for its regexes + struct PatternAndFn : public TestCase { // reusing TestCase for its regexes PatternAndFn(const std::string& rx, ResultsCheckFn fn_) - : TestCase(rx), fn(fn_) {} + : TestCase(rx), fn(fn_) {} ResultsCheckFn fn; }; - std::vector< PatternAndFn > check_patterns; - std::vector< Results > results; - std::vector< std::string > field_names; + std::vector<PatternAndFn> check_patterns; + std::vector<Results> results; + std::vector<std::string> field_names; void Add(const std::string& entry_pattern, ResultsCheckFn fn); void CheckResults(std::stringstream& output); private: - void SetHeader_(const std::string& csv_header); void SetValues_(const std::string& entry_csv_line); - std::vector< std::string > SplitCsv_(const std::string& line); - + std::vector<std::string> SplitCsv_(const std::string& line); }; // store the static ResultsChecker in a function to prevent initialization @@ -203,11 +209,11 @@ void ResultsChecker::Add(const std::string& entry_pattern, ResultsCheckFn fn) { void ResultsChecker::CheckResults(std::stringstream& output) { // first reset the stream to the start { - auto start = std::ios::streampos(0); + auto start = std::stringstream::pos_type(0); // clear before calling tellg() output.clear(); // seek to zero only when needed - if(output.tellg() > start) output.seekg(start); + if (output.tellg() > start) output.seekg(start); // and just in case output.clear(); } @@ -218,18 +224,18 @@ void ResultsChecker::CheckResults(std::stringstream& output) { CHECK(output.good()); std::getline(output, line); if (on_first) { - SetHeader_(line); // this is important + SetHeader_(line); // this is important on_first = false; continue; } SetValues_(line); } // finally we can call the subscribed check functions - for(const auto& p : check_patterns) { + for (const auto& p : check_patterns) { VLOG(2) << "--------------------------------\n"; VLOG(2) << "checking for benchmarks matching "
<< p.regex_str << "...\n"; - for(const auto& r : results) { - if(!p.regex->Match(r.name)) { + for (const auto& r : results) { + if (!p.regex->Match(r.name)) { VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n"; continue; } else { @@ -249,51 +255,50 @@ void ResultsChecker::SetHeader_(const std::string& csv_header) { // set the values for a benchmark void ResultsChecker::SetValues_(const std::string& entry_csv_line) { - if(entry_csv_line.empty()) return; // some lines are empty + if (entry_csv_line.empty()) return; // some lines are empty CHECK(!field_names.empty()); auto vals = SplitCsv_(entry_csv_line); CHECK_EQ(vals.size(), field_names.size()); - results.emplace_back(vals[0]); // vals[0] is the benchmark name - auto &entry = results.back(); + results.emplace_back(vals[0]); // vals[0] is the benchmark name + auto& entry = results.back(); for (size_t i = 1, e = vals.size(); i < e; ++i) { entry.values[field_names[i]] = vals[i]; } } // a quick'n'dirty csv splitter (eliminating quotes) -std::vector< std::string > ResultsChecker::SplitCsv_(const std::string& line) { - std::vector< std::string > out; - if(line.empty()) return out; - if(!field_names.empty()) out.reserve(field_names.size()); +std::vector<std::string> ResultsChecker::SplitCsv_(const std::string& line) { + std::vector<std::string> out; + if (line.empty()) return out; + if (!field_names.empty()) out.reserve(field_names.size()); size_t prev = 0, pos = line.find_first_of(','), curr = pos; - while(pos != line.npos) { + while (pos != line.npos) { CHECK(curr > 0); - if(line[prev] == '"') ++prev; - if(line[curr-1] == '"') --curr; - out.push_back(line.substr(prev, curr-prev)); + if (line[prev] == '"') ++prev; + if (line[curr - 1] == '"') --curr; + out.push_back(line.substr(prev, curr - prev)); prev = pos + 1; pos = line.find_first_of(',', pos + 1); curr = pos; } curr = line.size(); - if(line[prev] == '"') ++prev; - if(line[curr-1] == '"') --curr; - out.push_back(line.substr(prev, curr-prev)); + if (line[prev] == '"') ++prev; + if (line[curr - 1] == '"') --curr; + out.push_back(line.substr(prev, curr - prev)); return out; } } // end namespace internal -size_t AddChecker(const char* bm_name, ResultsCheckFn fn) -{ - auto &rc = internal::GetResultsChecker(); +size_t AddChecker(const char* bm_name, ResultsCheckFn fn) { + auto& rc = internal::GetResultsChecker(); rc.Add(bm_name, fn); return rc.results.size(); } int Results::NumThreads() const { auto pos = name.find("/threads:"); - if(pos == name.npos) return 1; + if (pos == name.npos) return 1; auto end = name.find('/', pos + 9); std::stringstream ss; ss << name.substr(pos + 9, end); @@ -303,19 +308,23 @@ int Results::NumThreads() const { return num; } +double Results::NumIterations() const { + return GetAs<double>("iterations"); +} + double Results::GetTime(BenchmarkTime which) const { CHECK(which == kCpuTime || which == kRealTime); - const char *which_str = which == kCpuTime ?
"cpu_time" : "real_time"; + double val = GetAs(which_str); auto unit = Get("time_unit"); CHECK(unit); - if(*unit == "ns") { + if (*unit == "ns") { return val * 1.e-9; - } else if(*unit == "us") { + } else if (*unit == "us") { return val * 1.e-6; - } else if(*unit == "ms") { + } else if (*unit == "ms") { return val * 1.e-3; - } else if(*unit == "s") { + } else if (*unit == "s") { return val; } else { CHECK(1 == 0) << "unknown time unit: " << *unit; @@ -333,7 +342,7 @@ TestCase::TestCase(std::string re, int rule) substituted_regex(internal::PerformSubstitutions(regex_str)), regex(std::make_shared()) { std::string err_str; - regex->Init(substituted_regex,& err_str); + regex->Init(substituted_regex, &err_str); CHECK(err_str.empty()) << "Could not construct regex \"" << substituted_regex << "\"" << "\n originally \"" << regex_str << "\"" @@ -367,7 +376,7 @@ int SetSubstitutions( void RunOutputTests(int argc, char* argv[]) { using internal::GetTestCaseList; benchmark::Initialize(&argc, argv); - auto options = benchmark::internal::GetOutputOptions(/*force_no_color*/true); + auto options = benchmark::internal::GetOutputOptions(/*force_no_color*/ true); benchmark::ConsoleReporter CR(options); benchmark::JSONReporter JR; benchmark::CSVReporter CSVR; @@ -416,8 +425,81 @@ void RunOutputTests(int argc, char* argv[]) { // now that we know the output is as expected, we can dispatch // the checks to subscribees. - auto &csv = TestCases[2]; + auto& csv = TestCases[2]; // would use == but gcc spits a warning CHECK(std::strcmp(csv.name, "CSVReporter") == 0); internal::GetResultsChecker().CheckResults(csv.out_stream); } + +int SubstrCnt(const std::string& haystack, const std::string& pat) { + if (pat.length() == 0) return 0; + int count = 0; + for (size_t offset = haystack.find(pat); offset != std::string::npos; + offset = haystack.find(pat, offset + pat.length())) + ++count; + return count; +} + +static char ToHex(int ch) { + return ch < 10 ? static_cast('0' + ch) + : static_cast('a' + (ch - 10)); +} + +static char RandomHexChar() { + static std::mt19937 rd{std::random_device{}()}; + static std::uniform_int_distribution mrand{0, 15}; + return ToHex(mrand(rd)); +} + +static std::string GetRandomFileName() { + std::string model = "test.%%%%%%"; + for (auto & ch : model) { + if (ch == '%') + ch = RandomHexChar(); + } + return model; +} + +static bool FileExists(std::string const& name) { + std::ifstream in(name.c_str()); + return in.good(); +} + +static std::string GetTempFileName() { + // This function attempts to avoid race conditions where two tests + // create the same file at the same time. However, it still introduces races + // similar to tmpnam. + int retries = 3; + while (--retries) { + std::string name = GetRandomFileName(); + if (!FileExists(name)) + return name; + } + std::cerr << "Failed to create unique temporary file name" << std::endl; + std::abort(); +} + +std::string GetFileReporterOutput(int argc, char* argv[]) { + std::vector new_argv(argv, argv + argc); + assert(static_cast(argc) == new_argv.size()); + + std::string tmp_file_name = GetTempFileName(); + std::cout << "Will be using this as the tmp file: " << tmp_file_name << '\n'; + + std::string tmp = "--benchmark_out="; + tmp += tmp_file_name; + new_argv.emplace_back(const_cast(tmp.c_str())); + + argc = int(new_argv.size()); + + benchmark::Initialize(&argc, new_argv.data()); + benchmark::RunSpecifiedBenchmarks(); + + // Read the output back from the file, and delete the file. 
+ std::ifstream tmp_stream(tmp_file_name); + std::string output = std::string((std::istreambuf_iterator<char>(tmp_stream)), + std::istreambuf_iterator<char>()); + std::remove(tmp_file_name.c_str()); + + return output; +} diff --git a/lib/gbenchmark/test/register_benchmark_test.cc b/lib/gbenchmark/test/register_benchmark_test.cc index 8ab2c29939..3ac5b21fb3 100644 --- a/lib/gbenchmark/test/register_benchmark_test.cc +++ b/lib/gbenchmark/test/register_benchmark_test.cc @@ -29,14 +29,16 @@ struct TestCase { typedef benchmark::BenchmarkReporter::Run Run; void CheckRun(Run const& run) const { - CHECK(name == run.benchmark_name) << "expected " << name << " got " - << run.benchmark_name; + // clang-format off + CHECK(name == run.benchmark_name()) << "expected " << name << " got " + << run.benchmark_name(); if (label) { CHECK(run.report_label == label) << "expected " << label << " got " << run.report_label; } else { CHECK(run.report_label == ""); } + // clang-format on } }; diff --git a/lib/gbenchmark/test/report_aggregates_only_test.cc b/lib/gbenchmark/test/report_aggregates_only_test.cc new file mode 100644 index 0000000000..9646b9be53 --- /dev/null +++ b/lib/gbenchmark/test/report_aggregates_only_test.cc @@ -0,0 +1,39 @@ + +#undef NDEBUG +#include <cstdio> +#include <string> + +#include "benchmark/benchmark.h" +#include "output_test.h" + +// Ok this test is super ugly. We want to check what happens with the file +// reporter in the presence of ReportAggregatesOnly(). +// We do not care about console output, the normal tests check that already. + +void BM_SummaryRepeat(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly(); + +int main(int argc, char* argv[]) { + const std::string output = GetFileReporterOutput(argc, argv); + + if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 3 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") != + 1 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") != + 1) { + std::cout << "Precondition mismatch. 
Expected to only find three " + "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n" + "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", " + "\"name\": \"BM_SummaryRepeat/repeats:3_median\", " + "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire " + "output:\n"; + std::cout << output; + return 1; + } + + return 0; +} diff --git a/lib/gbenchmark/test/reporter_output_test.cc b/lib/gbenchmark/test/reporter_output_test.cc index 23eb1baf63..c8090d4aca 100644 --- a/lib/gbenchmark/test/reporter_output_test.cc +++ b/lib/gbenchmark/test/reporter_output_test.cc @@ -9,26 +9,29 @@ // ---------------------- Testing Prologue Output -------------------------- // // ========================================================================= // -ADD_CASES(TC_ConsoleOut, - {{"^[-]+$", MR_Next}, - {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next}, - {"^[-]+$", MR_Next}}); +ADD_CASES(TC_ConsoleOut, {{"^[-]+$", MR_Next}, + {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next}, + {"^[-]+$", MR_Next}}); static int AddContextCases() { AddCases(TC_ConsoleErr, { {"%int[-/]%int[-/]%int %int:%int:%int$", MR_Default}, {"Running .*/reporter_output_test(\\.exe)?$", MR_Next}, - {"Run on \\(%int X %float MHz CPU s\\)", MR_Next}, + {"Run on \\(%int X %float MHz CPU s?\\)", MR_Next}, }); - AddCases(TC_JSONOut, {{"^\\{", MR_Default}, - {"\"context\":", MR_Next}, - {"\"date\": \"", MR_Next}, - {"\"executable\": \".*/reporter_output_test(\\.exe)?\",", MR_Next}, - {"\"num_cpus\": %int,$", MR_Next}, - {"\"mhz_per_cpu\": %float,$", MR_Next}, - {"\"cpu_scaling_enabled\": ", MR_Next}, - {"\"caches\": \\[$", MR_Next}}); - auto const& Caches = benchmark::CPUInfo::Get().caches; + AddCases(TC_JSONOut, + {{"^\\{", MR_Default}, + {"\"context\":", MR_Next}, + {"\"date\": \"", MR_Next}, + {"\"host_name\":", MR_Next}, + {"\"executable\": \".*(/|\\\\)reporter_output_test(\\.exe)?\",", + MR_Next}, + {"\"num_cpus\": %int,$", MR_Next}, + {"\"mhz_per_cpu\": %float,$", MR_Next}, + {"\"cpu_scaling_enabled\": ", MR_Next}, + {"\"caches\": \\[$", MR_Next}}); + auto const& Info = benchmark::CPUInfo::Get(); + auto const& Caches = Info.caches; if (!Caches.empty()) { AddCases(TC_ConsoleErr, {{"CPU Caches:$", MR_Next}}); } @@ -45,8 +48,13 @@ static int AddContextCases() { {"\"num_sharing\": %int$", MR_Next}, {"}[,]{0,1}$", MR_Next}}); } - AddCases(TC_JSONOut, {{"],$"}}); + auto const& LoadAvg = Info.load_avg; + if (!LoadAvg.empty()) { + AddCases(TC_ConsoleErr, + {{"Load Average: (%float, ){0,2}%float$", MR_Next}}); + } + AddCases(TC_JSONOut, {{"\"load_avg\": \\[(%float,?){0,3}],$", MR_Next}}); return 0; } int dummy_register = AddContextCases(); @@ -64,6 +72,11 @@ BENCHMARK(BM_basic); ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}}); ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"}, + {"\"run_name\": \"BM_basic\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"iterations\": %int,$", MR_Next}, {"\"real_time\": %float,$", MR_Next}, {"\"cpu_time\": %float,$", MR_Next}, @@ -82,9 +95,14 @@ void BM_bytes_per_second(benchmark::State& state) { } BENCHMARK(BM_bytes_per_second); -ADD_CASES(TC_ConsoleOut, - {{"^BM_bytes_per_second %console_report +%float[kM]{0,1}B/s$"}}); +ADD_CASES(TC_ConsoleOut, {{"^BM_bytes_per_second %console_report " + "bytes_per_second=%float[kM]{0,1}/s$"}}); ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"}, + {"\"run_name\": \"BM_bytes_per_second\",$", MR_Next}, + 
{"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"iterations\": %int,$", MR_Next}, {"\"real_time\": %float,$", MR_Next}, {"\"cpu_time\": %float,$", MR_Next}, @@ -104,9 +122,14 @@ void BM_items_per_second(benchmark::State& state) { } BENCHMARK(BM_items_per_second); -ADD_CASES(TC_ConsoleOut, - {{"^BM_items_per_second %console_report +%float[kM]{0,1} items/s$"}}); +ADD_CASES(TC_ConsoleOut, {{"^BM_items_per_second %console_report " + "items_per_second=%float[kM]{0,1}/s$"}}); ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"}, + {"\"run_name\": \"BM_items_per_second\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"iterations\": %int,$", MR_Next}, {"\"real_time\": %float,$", MR_Next}, {"\"cpu_time\": %float,$", MR_Next}, @@ -128,6 +151,11 @@ BENCHMARK(BM_label); ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}}); ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"}, + {"\"run_name\": \"BM_label\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"iterations\": %int,$", MR_Next}, {"\"real_time\": %float,$", MR_Next}, {"\"cpu_time\": %float,$", MR_Next}, @@ -149,6 +177,11 @@ void BM_error(benchmark::State& state) { BENCHMARK(BM_error); ADD_CASES(TC_ConsoleOut, {{"^BM_error[ ]+ERROR OCCURRED: 'message'$"}}); ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_error\",$"}, + {"\"run_name\": \"BM_error\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"error_occurred\": true,$", MR_Next}, {"\"error_message\": \"message\",$", MR_Next}}); @@ -165,7 +198,12 @@ void BM_no_arg_name(benchmark::State& state) { } BENCHMARK(BM_no_arg_name)->Arg(3); ADD_CASES(TC_ConsoleOut, {{"^BM_no_arg_name/3 %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"}, + {"\"run_name\": \"BM_no_arg_name/3\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}}); // ========================================================================= // @@ -178,7 +216,12 @@ void BM_arg_name(benchmark::State& state) { } BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3); ADD_CASES(TC_ConsoleOut, {{"^BM_arg_name/first:3 %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"}, + {"\"run_name\": \"BM_arg_name/first:3\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_arg_name/first:3\",%csv_report$"}}); // ========================================================================= // @@ -192,9 +235,27 @@ void BM_arg_names(benchmark::State& state) { BENCHMARK(BM_arg_names)->Args({2, 5, 4})->ArgNames({"first", "", "third"}); ADD_CASES(TC_ConsoleOut, {{"^BM_arg_names/first:2/5/third:4 %console_report$"}}); -ADD_CASES(TC_JSONOut, 
{{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"}, + {"\"run_name\": \"BM_arg_names/first:2/5/third:4\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_arg_names/first:2/5/third:4\",%csv_report$"}}); +// ========================================================================= // +// ------------------------ Testing Big Args Output ------------------------ // +// ========================================================================= // + +void BM_BigArgs(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_BigArgs)->RangeMultiplier(2)->Range(1U << 30U, 1U << 31U); +ADD_CASES(TC_ConsoleOut, {{"^BM_BigArgs/1073741824 %console_report$"}, + {"^BM_BigArgs/2147483648 %console_report$"}}); + // ========================================================================= // // ----------------------- Testing Complexity Output ----------------------- // // ========================================================================= // @@ -221,16 +282,45 @@ void BM_Repeat(benchmark::State& state) { } // need two repetitions min to be able to output any aggregate output BENCHMARK(BM_Repeat)->Repetitions(2); -ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:2 %console_report$"}, - {"^BM_Repeat/repeats:2 %console_report$"}, - {"^BM_Repeat/repeats:2_mean %console_report$"}, - {"^BM_Repeat/repeats:2_median %console_report$"}, - {"^BM_Repeat/repeats:2_stddev %console_report$"}}); +ADD_CASES(TC_ConsoleOut, + {{"^BM_Repeat/repeats:2 %console_report$"}, + {"^BM_Repeat/repeats:2 %console_report$"}, + {"^BM_Repeat/repeats:2_mean %console_time_only_report [ ]*2$"}, + {"^BM_Repeat/repeats:2_median %console_time_only_report [ ]*2$"}, + {"^BM_Repeat/repeats:2_stddev %console_time_only_report [ ]*2$"}}); ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:2\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:2\"", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:2\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"repetition_index\": 1,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:2_mean\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:2_median\",$"}, - {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"}}); + {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"}, 
{"^\"BM_Repeat/repeats:2\",%csv_report$"}, {"^\"BM_Repeat/repeats:2_mean\",%csv_report$"}, @@ -238,18 +328,52 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"}, {"^\"BM_Repeat/repeats:2_stddev\",%csv_report$"}}); // but for two repetitions, mean and median is the same, so let's repeat.. BENCHMARK(BM_Repeat)->Repetitions(3); -ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:3 %console_report$"}, - {"^BM_Repeat/repeats:3 %console_report$"}, - {"^BM_Repeat/repeats:3 %console_report$"}, - {"^BM_Repeat/repeats:3_mean %console_report$"}, - {"^BM_Repeat/repeats:3_median %console_report$"}, - {"^BM_Repeat/repeats:3_stddev %console_report$"}}); +ADD_CASES(TC_ConsoleOut, + {{"^BM_Repeat/repeats:3 %console_report$"}, + {"^BM_Repeat/repeats:3 %console_report$"}, + {"^BM_Repeat/repeats:3 %console_report$"}, + {"^BM_Repeat/repeats:3_mean %console_time_only_report [ ]*3$"}, + {"^BM_Repeat/repeats:3_median %console_time_only_report [ ]*3$"}, + {"^BM_Repeat/repeats:3_stddev %console_time_only_report [ ]*3$"}}); ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:3\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 1,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:3\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:3_mean\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:3_median\",$"}, - {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"}}); + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"}, {"^\"BM_Repeat/repeats:3\",%csv_report$"}, {"^\"BM_Repeat/repeats:3\",%csv_report$"}, @@ -258,20 +382,59 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"}, {"^\"BM_Repeat/repeats:3_stddev\",%csv_report$"}}); // median differs between even/odd number of repetitions, so just to be sure BENCHMARK(BM_Repeat)->Repetitions(4); -ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:4 %console_report$"}, - {"^BM_Repeat/repeats:4 %console_report$"}, - {"^BM_Repeat/repeats:4 %console_report$"}, - {"^BM_Repeat/repeats:4 %console_report$"}, - {"^BM_Repeat/repeats:4_mean %console_report$"}, - {"^BM_Repeat/repeats:4_median %console_report$"}, - {"^BM_Repeat/repeats:4_stddev %console_report$"}}); 
+ADD_CASES(TC_ConsoleOut, + {{"^BM_Repeat/repeats:4 %console_report$"}, + {"^BM_Repeat/repeats:4 %console_report$"}, + {"^BM_Repeat/repeats:4 %console_report$"}, + {"^BM_Repeat/repeats:4 %console_report$"}, + {"^BM_Repeat/repeats:4_mean %console_time_only_report [ ]*4$"}, + {"^BM_Repeat/repeats:4_median %console_time_only_report [ ]*4$"}, + {"^BM_Repeat/repeats:4_stddev %console_time_only_report [ ]*4$"}}); ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:4\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:4\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"repetition_index\": 1,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:4\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"repetition_index\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:4\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"repetition_index\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:4_mean\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 4,$", MR_Next}, {"\"name\": \"BM_Repeat/repeats:4_median\",$"}, - {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"}}); + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 4,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 4,$", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:4\",%csv_report$"}, {"^\"BM_Repeat/repeats:4\",%csv_report$"}, {"^\"BM_Repeat/repeats:4\",%csv_report$"}, @@ -288,7 +451,12 @@ void BM_RepeatOnce(benchmark::State& state) { } BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly(); ADD_CASES(TC_ConsoleOut, {{"^BM_RepeatOnce/repeats:1 %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"}, + {"\"run_name\": \"BM_RepeatOnce/repeats:1\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 1,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_RepeatOnce/repeats:1\",%csv_report$"}}); // Test that non-aggregate data is not reported @@ -297,20 +465,84 @@ void BM_SummaryRepeat(benchmark::State& state) { } } BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly(); -ADD_CASES(TC_ConsoleOut, +ADD_CASES( + TC_ConsoleOut, + {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, + {"^BM_SummaryRepeat/repeats:3_mean 
%console_time_only_report [ ]*3$"}, + {"^BM_SummaryRepeat/repeats:3_median %console_time_only_report [ ]*3$"}, + {"^BM_SummaryRepeat/repeats:3_stddev %console_time_only_report [ ]*3$"}}); +ADD_CASES(TC_JSONOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, - {"^BM_SummaryRepeat/repeats:3_mean %console_report$"}, - {"^BM_SummaryRepeat/repeats:3_median %console_report$"}, - {"^BM_SummaryRepeat/repeats:3_stddev %console_report$"}}); -ADD_CASES(TC_JSONOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, - {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"}, - {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"}, - {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"}}); + {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"}, + {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"}, + {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"}, + {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}}); ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, {"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"}, {"^\"BM_SummaryRepeat/repeats:3_median\",%csv_report$"}, {"^\"BM_SummaryRepeat/repeats:3_stddev\",%csv_report$"}}); +// Test that non-aggregate data is not displayed. +// NOTE: this test is kinda bad. we are only testing the display output. +// But we don't check that the file output still contains everything... 
+void BM_SummaryDisplay(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_SummaryDisplay)->Repetitions(2)->DisplayAggregatesOnly(); +ADD_CASES( + TC_ConsoleOut, + {{".*BM_SummaryDisplay/repeats:2 ", MR_Not}, + {"^BM_SummaryDisplay/repeats:2_mean %console_time_only_report [ ]*2$"}, + {"^BM_SummaryDisplay/repeats:2_median %console_time_only_report [ ]*2$"}, + {"^BM_SummaryDisplay/repeats:2_stddev %console_time_only_report [ ]*2$"}}); +ADD_CASES(TC_JSONOut, + {{".*BM_SummaryDisplay/repeats:2 ", MR_Not}, + {"\"name\": \"BM_SummaryDisplay/repeats:2_mean\",$"}, + {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"name\": \"BM_SummaryDisplay/repeats:2_median\",$"}, + {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"name\": \"BM_SummaryDisplay/repeats:2_stddev\",$"}, + {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}}); +ADD_CASES(TC_CSVOut, + {{".*BM_SummaryDisplay/repeats:2 ", MR_Not}, + {"^\"BM_SummaryDisplay/repeats:2_mean\",%csv_report$"}, + {"^\"BM_SummaryDisplay/repeats:2_median\",%csv_report$"}, + {"^\"BM_SummaryDisplay/repeats:2_stddev\",%csv_report$"}}); + +// Test repeats with custom time unit. 
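+//
+// Editor's note (illustrative sketch; BM_Foo is a hypothetical placeholder):
+// a custom time unit is requested at registration time and affects both the
+// console columns and the JSON "time_unit" field, which otherwise defaults
+// to nanoseconds:
+//
+//   BENCHMARK(BM_Foo)->Unit(benchmark::kMicrosecond);  // reported as "us"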
void BM_RepeatTimeUnit(benchmark::State& state) { for (auto _ : state) { } @@ -319,18 +551,40 @@ BENCHMARK(BM_RepeatTimeUnit) ->Repetitions(3) ->ReportAggregatesOnly() ->Unit(benchmark::kMicrosecond); -ADD_CASES(TC_ConsoleOut, +ADD_CASES( + TC_ConsoleOut, + {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, + {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_time_only_report [ ]*3$"}, + {"^BM_RepeatTimeUnit/repeats:3_median %console_us_time_only_report [ " + "]*3$"}, + {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_time_only_report [ " + "]*3$"}}); +ADD_CASES(TC_JSONOut, {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, - {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_report$"}, - {"^BM_RepeatTimeUnit/repeats:3_median %console_us_report$"}, - {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_report$"}}); -ADD_CASES(TC_JSONOut, {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, - {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"}, - {"\"time_unit\": \"us\",?$"}, - {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"}, - {"\"time_unit\": \"us\",?$"}, - {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"}, - {"\"time_unit\": \"us\",?$"}}); + {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"}, + {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"time_unit\": \"us\",?$"}, + {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"}, + {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"time_unit\": \"us\",?$"}, + {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"}, + {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"time_unit\": \"us\",?$"}}); ADD_CASES(TC_CSVOut, {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, {"^\"BM_RepeatTimeUnit/repeats:3_mean\",%csv_us_report$"}, @@ -346,34 +600,140 @@ const auto UserStatistics = [](const std::vector& v) { }; void BM_UserStats(benchmark::State& state) { for (auto _ : state) { + state.SetIterationTime(150 / 10e8); } } +// clang-format off BENCHMARK(BM_UserStats) - ->Repetitions(3) - ->ComputeStatistics("", UserStatistics); + ->Repetitions(3) + ->Iterations(5) + ->UseManualTime() + ->ComputeStatistics("", UserStatistics); +// clang-format on + // check that user-provided stats is calculated, and is after the default-ones // empty string as name is intentional, it would sort before anything else -ADD_CASES(TC_ConsoleOut, {{"^BM_UserStats/repeats:3 %console_report$"}, - {"^BM_UserStats/repeats:3 %console_report$"}, - {"^BM_UserStats/repeats:3 %console_report$"}, - {"^BM_UserStats/repeats:3_mean %console_report$"}, - {"^BM_UserStats/repeats:3_median %console_report$"}, - {"^BM_UserStats/repeats:3_stddev %console_report$"}, - {"^BM_UserStats/repeats:3_ %console_report$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_UserStats/repeats:3\",$"}, - {"\"name\": \"BM_UserStats/repeats:3\",$"}, - {"\"name\": \"BM_UserStats/repeats:3\",$"}, - {"\"name\": \"BM_UserStats/repeats:3_mean\",$"}, - {"\"name\": \"BM_UserStats/repeats:3_median\",$"}, 
- {"\"name\": \"BM_UserStats/repeats:3_stddev\",$"}, - {"\"name\": \"BM_UserStats/repeats:3_\",$"}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_UserStats/repeats:3\",%csv_report$"}, - {"^\"BM_UserStats/repeats:3\",%csv_report$"}, - {"^\"BM_UserStats/repeats:3\",%csv_report$"}, - {"^\"BM_UserStats/repeats:3_mean\",%csv_report$"}, - {"^\"BM_UserStats/repeats:3_median\",%csv_report$"}, - {"^\"BM_UserStats/repeats:3_stddev\",%csv_report$"}, - {"^\"BM_UserStats/repeats:3_\",%csv_report$"}}); +ADD_CASES(TC_ConsoleOut, {{"^BM_UserStats/iterations:5/repeats:3/manual_time [ " + "]* 150 ns %time [ ]*5$"}, + {"^BM_UserStats/iterations:5/repeats:3/manual_time [ " + "]* 150 ns %time [ ]*5$"}, + {"^BM_UserStats/iterations:5/repeats:3/manual_time [ " + "]* 150 ns %time [ ]*5$"}, + {"^BM_UserStats/iterations:5/repeats:3/" + "manual_time_mean [ ]* 150 ns %time [ ]*3$"}, + {"^BM_UserStats/iterations:5/repeats:3/" + "manual_time_median [ ]* 150 ns %time [ ]*3$"}, + {"^BM_UserStats/iterations:5/repeats:3/" + "manual_time_stddev [ ]* 0.000 ns %time [ ]*3$"}, + {"^BM_UserStats/iterations:5/repeats:3/manual_time_ " + "[ ]* 150 ns %time [ ]*3$"}}); +ADD_CASES( + TC_JSONOut, + {{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": 5,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 1,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": 5,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": 5,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_median\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_stddev\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 3,$", 
MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}}); +ADD_CASES( + TC_CSVOut, + {{"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/" + "manual_time_median\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/" + "manual_time_stddev\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_\",%csv_report$"}}); + +// ========================================================================= // +// ------------------------- Testing StrEscape JSON ------------------------ // +// ========================================================================= // +#if 0 // enable when csv testing code correctly handles multi-line fields +void BM_JSON_Format(benchmark::State& state) { + state.SkipWithError("val\b\f\n\r\t\\\"with\"es,capes"); + for (auto _ : state) { + } +} +BENCHMARK(BM_JSON_Format); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_JSON_Format\",$"}, + {"\"run_name\": \"BM_JSON_Format\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"error_occurred\": true,$", MR_Next}, + {R"("error_message": "val\\b\\f\\n\\r\\t\\\\\\"with\\"es,capes",$)", MR_Next}}); +#endif +// ========================================================================= // +// -------------------------- Testing CsvEscape ---------------------------- // +// ========================================================================= // + +void BM_CSV_Format(benchmark::State& state) { + state.SkipWithError("\"freedom\""); + for (auto _ : state) { + } +} +BENCHMARK(BM_CSV_Format); +ADD_CASES(TC_CSVOut, {{"^\"BM_CSV_Format\",,,,,,,,true,\"\"\"freedom\"\"\"$"}}); // ========================================================================= // // --------------------------- TEST CASES END ------------------------------ // diff --git a/lib/gbenchmark/test/skip_with_error_test.cc b/lib/gbenchmark/test/skip_with_error_test.cc index 8d2c342a9a..06579772ff 100644 --- a/lib/gbenchmark/test/skip_with_error_test.cc +++ b/lib/gbenchmark/test/skip_with_error_test.cc @@ -33,8 +33,8 @@ struct TestCase { typedef benchmark::BenchmarkReporter::Run Run; void CheckRun(Run const& run) const { - CHECK(name == run.benchmark_name) << "expected " << name << " got " - << run.benchmark_name; + CHECK(name == run.benchmark_name()) + << "expected " << name << " got " << run.benchmark_name(); CHECK(error_occurred == run.error_occurred); CHECK(error_message == run.error_message); if (error_occurred) { @@ -70,7 +70,6 @@ void BM_error_before_running(benchmark::State& state) { BENCHMARK(BM_error_before_running); ADD_CASES("BM_error_before_running", {{"", true, "error message"}}); - void BM_error_before_running_batch(benchmark::State& state) { state.SkipWithError("error message"); while (state.KeepRunningBatch(17)) { @@ -124,7 +123,7 @@ void 
BM_error_during_running_ranged_for(benchmark::State& state) { // Test the unfortunate but documented behavior that the ranged-for loop // doesn't automatically terminate when SkipWithError is set. assert(++It != End); - break; // Required behavior + break; // Required behavior } } } @@ -133,8 +132,6 @@ ADD_CASES("BM_error_during_running_ranged_for", {{"/1/iterations:5", true, "error message"}, {"/2/iterations:5", false, ""}}); - - void BM_error_after_running(benchmark::State& state) { for (auto _ : state) { benchmark::DoNotOptimize(state.iterations()); diff --git a/lib/gbenchmark/test/state_assembly_test.cc b/lib/gbenchmark/test/state_assembly_test.cc index e2c5c8648d..7ddbb3b2a9 100644 --- a/lib/gbenchmark/test/state_assembly_test.cc +++ b/lib/gbenchmark/test/state_assembly_test.cc @@ -4,11 +4,13 @@ #pragma clang diagnostic ignored "-Wreturn-type" #endif +// clang-format off extern "C" { extern int ExternInt; benchmark::State& GetState(); void Fn(); } +// clang-format on using benchmark::State; @@ -23,7 +25,7 @@ extern "C" int test_for_auto_loop() { for (auto _ : S) { // CHECK: .L[[LOOP_HEAD:[a-zA-Z0-9_]+]]: // CHECK-GNU-NEXT: subq $1, %rbx - // CHECK-CLANG-NEXT: {{(addq \$1,|incq)}} %rax + // CHECK-CLANG-NEXT: {{(addq \$1, %rax|incq %rax|addq \$-1, %rbx)}} // CHECK-NEXT: jne .L[[LOOP_HEAD]] benchmark::DoNotOptimize(x); } diff --git a/lib/gbenchmark/test/statistics_gtest.cc b/lib/gbenchmark/test/statistics_gtest.cc index b4d6abbb57..99e314920c 100644 --- a/lib/gbenchmark/test/statistics_gtest.cc +++ b/lib/gbenchmark/test/statistics_gtest.cc @@ -7,55 +7,22 @@ namespace { TEST(StatisticsTest, Mean) { - std::vector<double> Inputs; - { - Inputs = {42, 42, 42, 42}; - double Res = benchmark::StatisticsMean(Inputs); - EXPECT_DOUBLE_EQ(Res, 42.0); - } - { - Inputs = {1, 2, 3, 4}; - double Res = benchmark::StatisticsMean(Inputs); - EXPECT_DOUBLE_EQ(Res, 2.5); - } - { - Inputs = {1, 2, 5, 10, 10, 14}; - double Res = benchmark::StatisticsMean(Inputs); - EXPECT_DOUBLE_EQ(Res, 7.0); - } + EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({42, 42, 42, 42}), 42.0); + EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({1, 2, 3, 4}), 2.5); + EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({1, 2, 5, 10, 10, 14}), 7.0); } TEST(StatisticsTest, Median) { - std::vector<double> Inputs; - { - Inputs = {42, 42, 42, 42}; - double Res = benchmark::StatisticsMedian(Inputs); - EXPECT_DOUBLE_EQ(Res, 42.0); - } - { - Inputs = {1, 2, 3, 4}; - double Res = benchmark::StatisticsMedian(Inputs); - EXPECT_DOUBLE_EQ(Res, 2.5); - } - { - Inputs = {1, 2, 5, 10, 10}; - double Res = benchmark::StatisticsMedian(Inputs); - EXPECT_DOUBLE_EQ(Res, 5.0); - } + EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({42, 42, 42, 42}), 42.0); + EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({1, 2, 3, 4}), 2.5); + EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({1, 2, 5, 10, 10}), 5.0); } TEST(StatisticsTest, StdDev) { - std::vector<double> Inputs; - { - Inputs = {101, 101, 101, 101}; - double Res = benchmark::StatisticsStdDev(Inputs); - EXPECT_DOUBLE_EQ(Res, 0.0); - } - { - Inputs = {1, 2, 3}; - double Res = benchmark::StatisticsStdDev(Inputs); - EXPECT_DOUBLE_EQ(Res, 1.0); - } + EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({101, 101, 101, 101}), 0.0); + EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({1, 2, 3}), 1.0); + EXPECT_FLOAT_EQ(benchmark::StatisticsStdDev({1.5, 2.4, 3.3, 4.2, 5.1}), + 1.42302495); } } // end namespace diff --git a/lib/gbenchmark/test/string_util_gtest.cc b/lib/gbenchmark/test/string_util_gtest.cc new file mode 100644 index 0000000000..2c5d073f61 --- /dev/null +++ 
b/lib/gbenchmark/test/string_util_gtest.cc @@ -0,0 +1,146 @@ +//===---------------------------------------------------------------------===// +// string_util_gtest - Unit tests for src/string_util.cc +//===---------------------------------------------------------------------===// + +#include "../src/string_util.h" +#include "gtest/gtest.h" + +namespace { +TEST(StringUtilTest, stoul) { + { + size_t pos = 0; + EXPECT_EQ(0ul, benchmark::stoul("0", &pos)); + EXPECT_EQ(1ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(7ul, benchmark::stoul("7", &pos)); + EXPECT_EQ(1ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(135ul, benchmark::stoul("135", &pos)); + EXPECT_EQ(3ul, pos); + } +#if ULONG_MAX == 0xFFFFFFFFul + { + size_t pos = 0; + EXPECT_EQ(0xFFFFFFFFul, benchmark::stoul("4294967295", &pos)); + EXPECT_EQ(10ul, pos); + } +#elif ULONG_MAX == 0xFFFFFFFFFFFFFFFFul + { + size_t pos = 0; + EXPECT_EQ(0xFFFFFFFFFFFFFFFFul, benchmark::stoul("18446744073709551615", &pos)); + EXPECT_EQ(20ul, pos); + } +#endif + { + size_t pos = 0; + EXPECT_EQ(10ul, benchmark::stoul("1010", &pos, 2)); + EXPECT_EQ(4ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(520ul, benchmark::stoul("1010", &pos, 8)); + EXPECT_EQ(4ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(1010ul, benchmark::stoul("1010", &pos, 10)); + EXPECT_EQ(4ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(4112ul, benchmark::stoul("1010", &pos, 16)); + EXPECT_EQ(4ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(0xBEEFul, benchmark::stoul("BEEF", &pos, 16)); + EXPECT_EQ(4ul, pos); + } + { + ASSERT_THROW(benchmark::stoul("this is a test"), std::invalid_argument); + } +} + +TEST(StringUtilTest, stoi) { + { + size_t pos = 0; + EXPECT_EQ(0, benchmark::stoi("0", &pos)); + EXPECT_EQ(1ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(-17, benchmark::stoi("-17", &pos)); + EXPECT_EQ(3ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(1357, benchmark::stoi("1357", &pos)); + EXPECT_EQ(4ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2)); + EXPECT_EQ(4ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8)); + EXPECT_EQ(4ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10)); + EXPECT_EQ(4ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16)); + EXPECT_EQ(4ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16)); + EXPECT_EQ(4ul, pos); + } + { + ASSERT_THROW(benchmark::stoi("this is a test"), std::invalid_argument); + } +} + +TEST(StringUtilTest, stod) { + { + size_t pos = 0; + EXPECT_EQ(0.0, benchmark::stod("0", &pos)); + EXPECT_EQ(1ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(-84.0, benchmark::stod("-84", &pos)); + EXPECT_EQ(3ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(1234.0, benchmark::stod("1234", &pos)); + EXPECT_EQ(4ul, pos); + } + { + size_t pos = 0; + EXPECT_EQ(1.5, benchmark::stod("1.5", &pos)); + EXPECT_EQ(3ul, pos); + } + { + size_t pos = 0; + /* Note: exactly representable as double */ + EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos)); + EXPECT_EQ(8ul, pos); + } + { + ASSERT_THROW(benchmark::stod("this is a test"), std::invalid_argument); + } +} + +} // end namespace diff --git a/lib/gbenchmark/test/templated_fixture_test.cc b/lib/gbenchmark/test/templated_fixture_test.cc index ec5b4c0cc0..fe9865cc77 100644 --- a/lib/gbenchmark/test/templated_fixture_test.cc +++ b/lib/gbenchmark/test/templated_fixture_test.cc @@ -4,15 +4,15 @@ #include <cassert> #include <memory> -template<typename T> +template <typename T> class MyFixture : public 
::benchmark::Fixture { -public: + public: MyFixture() : data(0) {} T data; }; -BENCHMARK_TEMPLATE_F(MyFixture, Foo, int)(benchmark::State &st) { +BENCHMARK_TEMPLATE_F(MyFixture, Foo, int)(benchmark::State& st) { for (auto _ : st) { data += 1; } diff --git a/lib/gbenchmark/test/user_counters_tabular_test.cc b/lib/gbenchmark/test/user_counters_tabular_test.cc index 9b8a6132e6..099464ef99 100644 --- a/lib/gbenchmark/test/user_counters_tabular_test.cc +++ b/lib/gbenchmark/test/user_counters_tabular_test.cc @@ -7,9 +7,11 @@ // @todo: this checks the full output at once; the rule for // CounterSet1 was failing because it was not matching "^[-]+$". // @todo: check that the counters are vertically aligned. -ADD_CASES(TC_ConsoleOut, { -// keeping these lines long improves readability, so: -// clang-format off +ADD_CASES( + TC_ConsoleOut, + { + // keeping these lines long improves readability, so: + // clang-format off {"^[-]+$", MR_Next}, {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Bat %s Baz %s Foo %s Frob %s Lob$", MR_Next}, {"^[-]+$", MR_Next}, @@ -44,8 +46,8 @@ ADD_CASES(TC_ConsoleOut, { {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$"}, -// clang-format on -}); + // clang-format on + }); ADD_CASES(TC_CSVOut, {{"%csv_header," "\"Bar\",\"Bat\",\"Baz\",\"Foo\",\"Frob\",\"Lob\""}}); @@ -58,27 +60,33 @@ void BM_Counters_Tabular(benchmark::State& state) { } namespace bm = benchmark; state.counters.insert({ - {"Foo", { 1, bm::Counter::kAvgThreads}}, - {"Bar", { 2, bm::Counter::kAvgThreads}}, - {"Baz", { 4, bm::Counter::kAvgThreads}}, - {"Bat", { 8, bm::Counter::kAvgThreads}}, - {"Frob", {16, bm::Counter::kAvgThreads}}, - {"Lob", {32, bm::Counter::kAvgThreads}}, + {"Foo", {1, bm::Counter::kAvgThreads}}, + {"Bar", {2, bm::Counter::kAvgThreads}}, + {"Baz", {4, bm::Counter::kAvgThreads}}, + {"Bat", {8, bm::Counter::kAvgThreads}}, + {"Frob", {16, bm::Counter::kAvgThreads}}, + {"Lob", {32, bm::Counter::kAvgThreads}}, }); } BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %float,$", MR_Next}, - {"\"cpu_time\": %float,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Bat\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float,$", MR_Next}, - {"\"Frob\": %float,$", MR_Next}, - {"\"Lob\": %float$", MR_Next}, - {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"}, + {"\"run_name\": \"BM_Counters_Tabular/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"Bar\": %float,$", MR_Next}, + {"\"Bat\": %float,$", MR_Next}, + {"\"Baz\": %float,$", MR_Next}, + {"\"Foo\": %float,$", MR_Next}, + {"\"Frob\": %float,$", MR_Next}, + {"\"Lob\": %float$", MR_Next}, + {"}", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Tabular/threads:%int\",%csv_report," "%float,%float,%float,%float,%float,%float$"}}); // VS2013 does not 
allow this function to be passed as a lambda argument @@ -102,39 +110,46 @@ void BM_CounterRates_Tabular(benchmark::State& state) { } namespace bm = benchmark; state.counters.insert({ - {"Foo", { 1, bm::Counter::kAvgThreadsRate}}, - {"Bar", { 2, bm::Counter::kAvgThreadsRate}}, - {"Baz", { 4, bm::Counter::kAvgThreadsRate}}, - {"Bat", { 8, bm::Counter::kAvgThreadsRate}}, - {"Frob", {16, bm::Counter::kAvgThreadsRate}}, - {"Lob", {32, bm::Counter::kAvgThreadsRate}}, + {"Foo", {1, bm::Counter::kAvgThreadsRate}}, + {"Bar", {2, bm::Counter::kAvgThreadsRate}}, + {"Baz", {4, bm::Counter::kAvgThreadsRate}}, + {"Bat", {8, bm::Counter::kAvgThreadsRate}}, + {"Frob", {16, bm::Counter::kAvgThreadsRate}}, + {"Lob", {32, bm::Counter::kAvgThreadsRate}}, }); } BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %float,$", MR_Next}, - {"\"cpu_time\": %float,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Bat\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float,$", MR_Next}, - {"\"Frob\": %float,$", MR_Next}, - {"\"Lob\": %float$", MR_Next}, - {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"}, + {"\"run_name\": \"BM_CounterRates_Tabular/threads:%int\",$", + MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"Bar\": %float,$", MR_Next}, + {"\"Bat\": %float,$", MR_Next}, + {"\"Baz\": %float,$", MR_Next}, + {"\"Foo\": %float,$", MR_Next}, + {"\"Frob\": %float,$", MR_Next}, + {"\"Lob\": %float$", MR_Next}, + {"}", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_CounterRates_Tabular/threads:%int\",%csv_report," "%float,%float,%float,%float,%float,%float$"}}); // VS2013 does not allow this function to be passed as a lambda argument // to CHECK_BENCHMARK_RESULTS() void CheckTabularRate(Results const& e) { double t = e.DurationCPUTime(); - CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Bat", EQ, 8./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Frob", EQ, 16./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "Lob", EQ, 32./t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "Bat", EQ, 8. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "Frob", EQ, 16. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "Lob", EQ, 32. 
/ t, 0.001); } CHECK_BENCHMARK_RESULTS("BM_CounterRates_Tabular/threads:%int", &CheckTabularRate); @@ -149,21 +164,27 @@ void BM_CounterSet0_Tabular(benchmark::State& state) { } namespace bm = benchmark; state.counters.insert({ - {"Foo", {10, bm::Counter::kAvgThreads}}, - {"Bar", {20, bm::Counter::kAvgThreads}}, - {"Baz", {40, bm::Counter::kAvgThreads}}, + {"Foo", {10, bm::Counter::kAvgThreads}}, + {"Bar", {20, bm::Counter::kAvgThreads}}, + {"Baz", {40, bm::Counter::kAvgThreads}}, }); } BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %float,$", MR_Next}, - {"\"cpu_time\": %float,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float$", MR_Next}, - {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"}, + {"\"run_name\": \"BM_CounterSet0_Tabular/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"Bar\": %float,$", MR_Next}, + {"\"Baz\": %float,$", MR_Next}, + {"\"Foo\": %float$", MR_Next}, + {"}", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet0_Tabular/threads:%int\",%csv_report," "%float,,%float,%float,,"}}); // VS2013 does not allow this function to be passed as a lambda argument @@ -181,21 +202,27 @@ void BM_CounterSet1_Tabular(benchmark::State& state) { } namespace bm = benchmark; state.counters.insert({ - {"Foo", {15, bm::Counter::kAvgThreads}}, - {"Bar", {25, bm::Counter::kAvgThreads}}, - {"Baz", {45, bm::Counter::kAvgThreads}}, + {"Foo", {15, bm::Counter::kAvgThreads}}, + {"Bar", {25, bm::Counter::kAvgThreads}}, + {"Baz", {45, bm::Counter::kAvgThreads}}, }); } BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %float,$", MR_Next}, - {"\"cpu_time\": %float,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bar\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float$", MR_Next}, - {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"}, + {"\"run_name\": \"BM_CounterSet1_Tabular/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"Bar\": %float,$", MR_Next}, + {"\"Baz\": %float,$", MR_Next}, + {"\"Foo\": %float$", MR_Next}, + {"}", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet1_Tabular/threads:%int\",%csv_report," "%float,,%float,%float,,"}}); // VS2013 does not allow this function to be passed as a lambda argument @@ -217,21 +244,27 @@ void BM_CounterSet2_Tabular(benchmark::State& state) { } namespace bm = benchmark; state.counters.insert({ - {"Foo", {10, bm::Counter::kAvgThreads}}, - {"Bat", {30, bm::Counter::kAvgThreads}}, - {"Baz", {40, bm::Counter::kAvgThreads}}, + {"Foo", {10, 
bm::Counter::kAvgThreads}}, + {"Bat", {30, bm::Counter::kAvgThreads}}, + {"Baz", {40, bm::Counter::kAvgThreads}}, }); } BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %float,$", MR_Next}, - {"\"cpu_time\": %float,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"Bat\": %float,$", MR_Next}, - {"\"Baz\": %float,$", MR_Next}, - {"\"Foo\": %float$", MR_Next}, - {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"}, + {"\"run_name\": \"BM_CounterSet2_Tabular/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"Bat\": %float,$", MR_Next}, + {"\"Baz\": %float,$", MR_Next}, + {"\"Foo\": %float$", MR_Next}, + {"}", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet2_Tabular/threads:%int\",%csv_report," ",%float,%float,%float,,"}}); // VS2013 does not allow this function to be passed as a lambda argument diff --git a/lib/gbenchmark/test/user_counters_test.cc b/lib/gbenchmark/test/user_counters_test.cc index 06aafb1fa1..0775bc01f7 100644 --- a/lib/gbenchmark/test/user_counters_test.cc +++ b/lib/gbenchmark/test/user_counters_test.cc @@ -8,12 +8,16 @@ // ---------------------- Testing Prologue Output -------------------------- // // ========================================================================= // +// clang-format off + ADD_CASES(TC_ConsoleOut, {{"^[-]+$", MR_Next}, {"^Benchmark %s Time %s CPU %s Iterations UserCounters...$", MR_Next}, {"^[-]+$", MR_Next}}); ADD_CASES(TC_CSVOut, {{"%csv_header,\"bar\",\"foo\""}}); +// clang-format on + // ========================================================================= // // ------------------------- Simple Counters Output ------------------------ // // ========================================================================= // @@ -25,8 +29,14 @@ void BM_Counters_Simple(benchmark::State& state) { state.counters["bar"] = 2 * (double)state.iterations(); } BENCHMARK(BM_Counters_Simple); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}}); +ADD_CASES(TC_ConsoleOut, + {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}}); ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"}, + {"\"run_name\": \"BM_Counters_Simple\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"iterations\": %int,$", MR_Next}, {"\"real_time\": %float,$", MR_Next}, {"\"cpu_time\": %float,$", MR_Next}, @@ -38,10 +48,10 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Simple\",%csv_report,%float,%float$"}}); // VS2013 does not allow this function to be passed as a lambda argument // to CHECK_BENCHMARK_RESULTS() void CheckSimple(Results const& e) { - double its = e.GetAs< double >("iterations"); + double its = e.NumIterations(); CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1); // check that the value of bar is within 0.1% of the expected value - CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2.*its, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. 
* its, 0.001); } CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple); @@ -49,7 +59,9 @@ CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple); // --------------------- Counters+Items+Bytes/s Output --------------------- // // ========================================================================= // -namespace { int num_calls1 = 0; } +namespace { +int num_calls1 = 0; +} void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) { for (auto _ : state) { } @@ -59,30 +71,36 @@ void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) { state.SetItemsProcessed(150); } BENCHMARK(BM_Counters_WithBytesAndItemsPSec); -ADD_CASES(TC_ConsoleOut, - {{"^BM_Counters_WithBytesAndItemsPSec %console_report " - "bar=%hrfloat foo=%hrfloat +%hrfloatB/s +%hrfloat items/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %float,$", MR_Next}, - {"\"cpu_time\": %float,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bytes_per_second\": %float,$", MR_Next}, - {"\"items_per_second\": %float,$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_WithBytesAndItemsPSec %console_report " + "bar=%hrfloat bytes_per_second=%hrfloat/s " + "foo=%hrfloat items_per_second=%hrfloat/s$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"}, + {"\"run_name\": \"BM_Counters_WithBytesAndItemsPSec\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"bytes_per_second\": %float,$", MR_Next}, + {"\"foo\": %float,$", MR_Next}, + {"\"items_per_second\": %float$", MR_Next}, + {"}", MR_Next}}); ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_WithBytesAndItemsPSec\"," "%csv_bytes_items_report,%float,%float$"}}); // VS2013 does not allow this function to be passed as a lambda argument // to CHECK_BENCHMARK_RESULTS() void CheckBytesAndItemsPSec(Results const& e) { - double t = e.DurationCPUTime(); // this (and not real time) is the time used + double t = e.DurationCPUTime(); // this (and not real time) is the time used CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1); CHECK_COUNTER_VALUE(e, int, "bar", EQ, num_calls1); // check that the values are within 0.1% of the expected values - CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ, 364./t, 0.001); - CHECK_FLOAT_RESULT_VALUE(e, "items_per_second", EQ, 150./t, 0.001); + CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ, 364. / t, 0.001); + CHECK_FLOAT_RESULT_VALUE(e, "items_per_second", EQ, 150. 
/ t, 0.001); } CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec", &CheckBytesAndItemsPSec); @@ -99,8 +117,15 @@ void BM_Counters_Rate(benchmark::State& state) { state.counters["bar"] = bm::Counter{2, bm::Counter::kIsRate}; } BENCHMARK(BM_Counters_Rate); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); +ADD_CASES( + TC_ConsoleOut, + {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"}, + {"\"run_name\": \"BM_Counters_Rate\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, {"\"iterations\": %int,$", MR_Next}, {"\"real_time\": %float,$", MR_Next}, {"\"cpu_time\": %float,$", MR_Next}, @@ -112,10 +137,10 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Rate\",%csv_report,%float,%float$"}}); // VS2013 does not allow this function to be passed as a lambda argument // to CHECK_BENCHMARK_RESULTS() void CheckRate(Results const& e) { - double t = e.DurationCPUTime(); // this (and not real time) is the time used + double t = e.DurationCPUTime(); // this (and not real time) is the time used // check that the values are within 0.1% of the expected values - CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./t, 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / t, 0.001); } CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate); @@ -130,16 +155,25 @@ void BM_Counters_Threads(benchmark::State& state) { state.counters["bar"] = 2; } BENCHMARK(BM_Counters_Threads)->ThreadRange(1, 8); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %float,$", MR_Next}, - {"\"cpu_time\": %float,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}}); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report " + "bar=%hrfloat foo=%hrfloat$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"}, + {"\"run_name\": \"BM_Counters_Threads/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES( + TC_CSVOut, + {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}}); // VS2013 does not allow this function to be passed as a lambda argument // to CHECK_BENCHMARK_RESULTS() void CheckThreads(Results const& e) { @@ -160,16 +194,25 @@ void BM_Counters_AvgThreads(benchmark::State& state) { state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreads}; } BENCHMARK(BM_Counters_AvgThreads)->ThreadRange(1, 8); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": 
\"BM_Counters_AvgThreads/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %float,$", MR_Next}, - {"\"cpu_time\": %float,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}}); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int " + "%console_report bar=%hrfloat foo=%hrfloat$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"}, + {"\"run_name\": \"BM_Counters_AvgThreads/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES( + TC_CSVOut, + {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}}); // VS2013 does not allow this function to be passed as a lambda argument // to CHECK_BENCHMARK_RESULTS() void CheckAvgThreads(Results const& e) { @@ -191,25 +234,203 @@ void BM_Counters_AvgThreadsRate(benchmark::State& state) { state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreadsRate}; } BENCHMARK(BM_Counters_AvgThreadsRate)->ThreadRange(1, 8); -ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int %console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); -ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"}, - {"\"iterations\": %int,$", MR_Next}, - {"\"real_time\": %float,$", MR_Next}, - {"\"cpu_time\": %float,$", MR_Next}, - {"\"time_unit\": \"ns\",$", MR_Next}, - {"\"bar\": %float,$", MR_Next}, - {"\"foo\": %float$", MR_Next}, - {"}", MR_Next}}); -ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/threads:%int\",%csv_report,%float,%float$"}}); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int " + "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"}, + {"\"run_name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$", + MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/" + "threads:%int\",%csv_report,%float,%float$"}}); // VS2013 does not allow this function to be passed as a lambda argument // to CHECK_BENCHMARK_RESULTS() void CheckAvgThreadsRate(Results const& e) { - CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./e.DurationCPUTime(), 0.001); - CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./e.DurationCPUTime(), 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / e.DurationCPUTime(), 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. 
/ e.DurationCPUTime(), 0.001); } CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int", &CheckAvgThreadsRate); +// ========================================================================= // +// ------------------- IterationInvariant Counters Output ------------------ // +// ========================================================================= // + +void BM_Counters_IterationInvariant(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters["foo"] = bm::Counter{1, bm::Counter::kIsIterationInvariant}; + state.counters["bar"] = bm::Counter{2, bm::Counter::kIsIterationInvariant}; +} +BENCHMARK(BM_Counters_IterationInvariant); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_IterationInvariant %console_report " + "bar=%hrfloat foo=%hrfloat$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_IterationInvariant\",$"}, + {"\"run_name\": \"BM_Counters_IterationInvariant\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, + {{"^\"BM_Counters_IterationInvariant\",%csv_report,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckIterationInvariant(Results const& e) { + double its = e.NumIterations(); + // check that the values are within 0.1% of the expected value + CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, its, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. 
* its, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_IterationInvariant", + &CheckIterationInvariant); + +// ========================================================================= // +// ----------------- IterationInvariantRate Counters Output ---------------- // +// ========================================================================= // + +void BM_Counters_kIsIterationInvariantRate(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters["foo"] = + bm::Counter{1, bm::Counter::kIsIterationInvariantRate}; + state.counters["bar"] = + bm::Counter{2, bm::Counter::kIsRate | bm::Counter::kIsIterationInvariant}; +} +BENCHMARK(BM_Counters_kIsIterationInvariantRate); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kIsIterationInvariantRate " + "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_kIsIterationInvariantRate\",$"}, + {"\"run_name\": \"BM_Counters_kIsIterationInvariantRate\",$", + MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_kIsIterationInvariantRate\",%csv_report," + "%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckIsIterationInvariantRate(Results const& e) { + double its = e.NumIterations(); + double t = e.DurationCPUTime(); // this (and not real time) is the time used + // check that the values are within 0.1% of the expected values + CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, its * 1. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, its * 2. 
/ t, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_kIsIterationInvariantRate", + &CheckIsIterationInvariantRate); + +// ========================================================================= // +// ------------------- AvgIterations Counters Output ------------------ // +// ========================================================================= // + +void BM_Counters_AvgIterations(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterations}; + state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgIterations}; +} +BENCHMARK(BM_Counters_AvgIterations); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgIterations %console_report " + "bar=%hrfloat foo=%hrfloat$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_AvgIterations\",$"}, + {"\"run_name\": \"BM_Counters_AvgIterations\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, + {{"^\"BM_Counters_AvgIterations\",%csv_report,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckAvgIterations(Results const& e) { + double its = e.NumIterations(); + // check that the values are within 0.1% of the expected value + CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / its, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / its, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_AvgIterations", &CheckAvgIterations); + +// ========================================================================= // +// ----------------- AvgIterationsRate Counters Output ---------------- // +// ========================================================================= // + +void BM_Counters_kAvgIterationsRate(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterationsRate}; + state.counters["bar"] = + bm::Counter{2, bm::Counter::kIsRate | bm::Counter::kAvgIterations}; +} +BENCHMARK(BM_Counters_kAvgIterationsRate); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kAvgIterationsRate " + "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_kAvgIterationsRate\",$"}, + {"\"run_name\": \"BM_Counters_kAvgIterationsRate\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_kAvgIterationsRate\",%csv_report," + "%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckAvgIterationsRate(Results const& e) { + double its = e.NumIterations(); + double t = e.DurationCPUTime(); // this (and not real time) is the time used + // check that the values are within 0.1% of the expected values + 
CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / its / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / its / t, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_kAvgIterationsRate", + &CheckAvgIterationsRate); + // ========================================================================= // // --------------------------- TEST CASES END ------------------------------ // // ========================================================================= // diff --git a/lib/gbenchmark/test/user_counters_thousands_test.cc b/lib/gbenchmark/test/user_counters_thousands_test.cc new file mode 100644 index 0000000000..21d8285ded --- /dev/null +++ b/lib/gbenchmark/test/user_counters_thousands_test.cc @@ -0,0 +1,173 @@ + +#undef NDEBUG + +#include "benchmark/benchmark.h" +#include "output_test.h" + +// ========================================================================= // +// ------------------------ Thousands Customisation ------------------------ // +// ========================================================================= // + +void BM_Counters_Thousands(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters.insert({ + {"t0_1000000DefaultBase", + bm::Counter(1000 * 1000, bm::Counter::kDefaults)}, + {"t1_1000000Base1000", bm::Counter(1000 * 1000, bm::Counter::kDefaults, + benchmark::Counter::OneK::kIs1000)}, + {"t2_1000000Base1024", bm::Counter(1000 * 1000, bm::Counter::kDefaults, + benchmark::Counter::OneK::kIs1024)}, + {"t3_1048576Base1000", bm::Counter(1024 * 1024, bm::Counter::kDefaults, + benchmark::Counter::OneK::kIs1000)}, + {"t4_1048576Base1024", bm::Counter(1024 * 1024, bm::Counter::kDefaults, + benchmark::Counter::OneK::kIs1024)}, + }); +} +BENCHMARK(BM_Counters_Thousands)->Repetitions(2); +ADD_CASES( + TC_ConsoleOut, + { + {"^BM_Counters_Thousands/repeats:2 %console_report " + "t0_1000000DefaultBase=1000k " + "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k " + "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"}, + {"^BM_Counters_Thousands/repeats:2 %console_report " + "t0_1000000DefaultBase=1000k " + "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k " + "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"}, + {"^BM_Counters_Thousands/repeats:2_mean %console_report " + "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k " + "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k " + "t4_1048576Base1024=1024k$"}, + {"^BM_Counters_Thousands/repeats:2_median %console_report " + "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k " + "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k " + "t4_1048576Base1024=1024k$"}, + {"^BM_Counters_Thousands/repeats:2_stddev %console_time_only_report [ " + "]*2 t0_1000000DefaultBase=0 t1_1000000Base1000=0 " + "t2_1000000Base1024=0 t3_1048576Base1000=0 t4_1048576Base1024=0$"}, + }); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"}, + {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t3_1048576Base1000\": 
1\\.048576(0)*e\\+(0)*6,$", MR_Next}, + {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"}, + {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"repetition_index\": 1,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next}, + {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Thousands/repeats:2_mean\",$"}, + {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next}, + {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Thousands/repeats:2_median\",$"}, + {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next}, + {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Thousands/repeats:2_stddev\",$"}, + {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"t0_1000000DefaultBase\": 0\\.(0)*e\\+(0)*,$", MR_Next}, + {"\"t1_1000000Base1000\": 0\\.(0)*e\\+(0)*,$", MR_Next}, + {"\"t2_1000000Base1024\": 0\\.(0)*e\\+(0)*,$", MR_Next}, + {"\"t3_1048576Base1000\": 0\\.(0)*e\\+(0)*,$", MR_Next}, + {"\"t4_1048576Base1024\": 0\\.(0)*e\\+(0)*$", MR_Next}, + {"}", MR_Next}}); + +ADD_CASES( + TC_CSVOut, + {{"^\"BM_Counters_Thousands/" + "repeats:2\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\.04858e\\+(" + "0)*6,1\\.04858e\\+(0)*6$"}, + {"^\"BM_Counters_Thousands/" + 
"repeats:2\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\.04858e\\+(" + "0)*6,1\\.04858e\\+(0)*6$"}, + {"^\"BM_Counters_Thousands/" + "repeats:2_mean\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\." + "04858e\\+(0)*6,1\\.04858e\\+(0)*6$"}, + {"^\"BM_Counters_Thousands/" + "repeats:2_median\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\." + "04858e\\+(0)*6,1\\.04858e\\+(0)*6$"}, + {"^\"BM_Counters_Thousands/repeats:2_stddev\",%csv_report,0,0,0,0,0$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckThousands(Results const& e) { + if (e.name != "BM_Counters_Thousands/repeats:2") + return; // Do not check the aggregates! + + // check that the values are within 0.01% of the expected values + CHECK_FLOAT_COUNTER_VALUE(e, "t0_1000000DefaultBase", EQ, 1000 * 1000, + 0.0001); + CHECK_FLOAT_COUNTER_VALUE(e, "t1_1000000Base1000", EQ, 1000 * 1000, 0.0001); + CHECK_FLOAT_COUNTER_VALUE(e, "t2_1000000Base1024", EQ, 1000 * 1000, 0.0001); + CHECK_FLOAT_COUNTER_VALUE(e, "t3_1048576Base1000", EQ, 1024 * 1024, 0.0001); + CHECK_FLOAT_COUNTER_VALUE(e, "t4_1048576Base1024", EQ, 1024 * 1024, 0.0001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_Thousands", &CheckThousands); + +// ========================================================================= // +// --------------------------- TEST CASES END ------------------------------ // +// ========================================================================= // + +int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/lib/gbenchmark/tools/compare.py b/lib/gbenchmark/tools/compare.py index c4a47e8d50..539ace6fb1 100755 --- a/lib/gbenchmark/tools/compare.py +++ b/lib/gbenchmark/tools/compare.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +import unittest """ compare.py - versatile benchmark output compare tool """ @@ -35,6 +36,34 @@ def check_inputs(in1, in2, flags): def create_parser(): parser = ArgumentParser( description='versatile benchmark output compare tool') + + parser.add_argument( + '-a', + '--display_aggregates_only', + dest='display_aggregates_only', + action="store_true", + help="If there are repetitions, by default, we display everything - the" + " actual runs, and the aggregates computed. Sometimes, it is " + "desirable to only view the aggregates. E.g. when there are a lot " + "of repetitions. Do note that only the display is affected. " + "Internally, all the actual runs are still used, e.g. for U test.") + + utest = parser.add_argument_group() + utest.add_argument( + '--no-utest', + dest='utest', + default=True, + action="store_false", + help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than {}) number of repetitions to be meaningful!\nThe test is being done by default, if at least {} repetitions were done.\nThis option can disable the U Test.".format(report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS)) + alpha_default = 0.05 + utest.add_argument( + "--alpha", + dest='utest_alpha', + default=alpha_default, + type=float, + help=("significance level alpha. 
if the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)") % + alpha_default) + subparsers = parser.add_subparsers( help='This tool has multiple modes of operation:', dest='mode') @@ -138,6 +167,9 @@ def main(): # Parse the command line flags parser = create_parser() args, unknown_args = parser.parse_known_args() + if args.mode is None: + parser.print_help() + exit(1) assert not unknown_args benchmark_options = args.benchmark_options @@ -175,10 +207,14 @@ def main(): else: # should never happen print("Unrecognized mode of operation: '%s'" % args.mode) + parser.print_help() exit(1) check_inputs(test_baseline, test_contender, benchmark_options) + if args.display_aggregates_only: + benchmark_options += ['--benchmark_display_aggregates_only=true'] + options_baseline = [] options_contender = [] @@ -201,15 +237,14 @@ def main(): json2_orig, filter_contender, replacement) # Diff and output - output_lines = gbench.report.generate_difference_report(json1, json2) + output_lines = gbench.report.generate_difference_report( + json1, json2, args.display_aggregates_only, + args.utest, args.utest_alpha) print(description) for ln in output_lines: print(ln) -import unittest - - class TestParser(unittest.TestCase): def setUp(self): self.parser = create_parser() @@ -218,12 +253,57 @@ def setUp(self): os.path.realpath(__file__)), 'gbench', 'Inputs') - self.testInput0 = os.path.join(testInputs, 'test_baseline_run1.json') - self.testInput1 = os.path.join(testInputs, 'test_baseline_run2.json') + self.testInput0 = os.path.join(testInputs, 'test1_run1.json') + self.testInput1 = os.path.join(testInputs, 'test1_run2.json') def test_benchmarks_basic(self): parsed = self.parser.parse_args( ['benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_without_utest(self): + parsed = self.parser.parse_args( + ['--no-utest', 'benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertFalse(parsed.utest) + self.assertEqual(parsed.utest_alpha, 0.05) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_display_aggregates_only(self): + parsed = self.parser.parse_args( + ['-a', 'benchmarks', self.testInput0, self.testInput1]) + self.assertTrue(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_with_utest_alpha(self): + parsed = self.parser.parse_args( + ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.utest_alpha, 0.314) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + 
self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_without_utest_with_utest_alpha(self): + parsed = self.parser.parse_args( + ['--no-utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertFalse(parsed.utest) + self.assertEqual(parsed.utest_alpha, 0.314) self.assertEqual(parsed.mode, 'benchmarks') self.assertEqual(parsed.test_baseline[0].name, self.testInput0) self.assertEqual(parsed.test_contender[0].name, self.testInput1) @@ -232,6 +312,8 @@ def test_benchmarks_basic(self): def test_benchmarks_with_remainder(self): parsed = self.parser.parse_args( ['benchmarks', self.testInput0, self.testInput1, 'd']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) self.assertEqual(parsed.mode, 'benchmarks') self.assertEqual(parsed.test_baseline[0].name, self.testInput0) self.assertEqual(parsed.test_contender[0].name, self.testInput1) @@ -240,6 +322,8 @@ def test_benchmarks_with_remainder(self): def test_benchmarks_with_remainder_after_doubleminus(self): parsed = self.parser.parse_args( ['benchmarks', self.testInput0, self.testInput1, '--', 'e']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) self.assertEqual(parsed.mode, 'benchmarks') self.assertEqual(parsed.test_baseline[0].name, self.testInput0) self.assertEqual(parsed.test_contender[0].name, self.testInput1) @@ -248,6 +332,8 @@ def test_benchmarks_with_remainder_after_doubleminus(self): def test_filters_basic(self): parsed = self.parser.parse_args( ['filters', self.testInput0, 'c', 'd']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) self.assertEqual(parsed.mode, 'filters') self.assertEqual(parsed.test[0].name, self.testInput0) self.assertEqual(parsed.filter_baseline[0], 'c') @@ -257,6 +343,8 @@ def test_filters_basic(self): def test_filters_with_remainder(self): parsed = self.parser.parse_args( ['filters', self.testInput0, 'c', 'd', 'e']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) self.assertEqual(parsed.mode, 'filters') self.assertEqual(parsed.test[0].name, self.testInput0) self.assertEqual(parsed.filter_baseline[0], 'c') @@ -266,6 +354,8 @@ def test_filters_with_remainder(self): def test_filters_with_remainder_after_doubleminus(self): parsed = self.parser.parse_args( ['filters', self.testInput0, 'c', 'd', '--', 'f']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) self.assertEqual(parsed.mode, 'filters') self.assertEqual(parsed.test[0].name, self.testInput0) self.assertEqual(parsed.filter_baseline[0], 'c') @@ -275,6 +365,8 @@ def test_filters_with_remainder_after_doubleminus(self): def test_benchmarksfiltered_basic(self): parsed = self.parser.parse_args( ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) self.assertEqual(parsed.mode, 'benchmarksfiltered') self.assertEqual(parsed.test_baseline[0].name, self.testInput0) self.assertEqual(parsed.filter_baseline[0], 'c') @@ -285,6 +377,8 @@ def test_benchmarksfiltered_basic(self): def test_benchmarksfiltered_with_remainder(self): parsed = self.parser.parse_args( ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) self.assertEqual(parsed.mode, 
'benchmarksfiltered') self.assertEqual(parsed.test_baseline[0].name, self.testInput0) self.assertEqual(parsed.filter_baseline[0], 'c') @@ -295,6 +389,8 @@ def test_benchmarksfiltered_with_remainder(self): def test_benchmarksfiltered_with_remainder_after_doubleminus(self): parsed = self.parser.parse_args( ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) self.assertEqual(parsed.mode, 'benchmarksfiltered') self.assertEqual(parsed.test_baseline[0].name, self.testInput0) self.assertEqual(parsed.filter_baseline[0], 'c') diff --git a/lib/gbenchmark/tools/compare_bench.py b/lib/gbenchmark/tools/compare_bench.py deleted file mode 100755 index 7bbf0d0157..0000000000 --- a/lib/gbenchmark/tools/compare_bench.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python -""" -compare_bench.py - Compare two benchmarks or their results and report the - difference. -""" -import argparse -from argparse import ArgumentParser -import sys -import gbench -from gbench import util, report -from gbench.util import * - -def check_inputs(in1, in2, flags): - """ - Perform checking on the user provided inputs and diagnose any abnormalities - """ - in1_kind, in1_err = classify_input_file(in1) - in2_kind, in2_err = classify_input_file(in2) - output_file = find_benchmark_flag('--benchmark_out=', flags) - output_type = find_benchmark_flag('--benchmark_out_format=', flags) - if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file: - print(("WARNING: '--benchmark_out=%s' will be passed to both " - "benchmarks causing it to be overwritten") % output_file) - if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0: - print("WARNING: passing --benchmark flags has no effect since both " - "inputs are JSON") - if output_type is not None and output_type != 'json': - print(("ERROR: passing '--benchmark_out_format=%s' to 'compare_bench.py`" - " is not supported.") % output_type) - sys.exit(1) - - -def main(): - parser = ArgumentParser( - description='compare the results of two benchmarks') - parser.add_argument( - 'test1', metavar='test1', type=str, nargs=1, - help='A benchmark executable or JSON output file') - parser.add_argument( - 'test2', metavar='test2', type=str, nargs=1, - help='A benchmark executable or JSON output file') - parser.add_argument( - 'benchmark_options', metavar='benchmark_options', nargs=argparse.REMAINDER, - help='Arguments to pass when running benchmark executables' - ) - args, unknown_args = parser.parse_known_args() - # Parse the command line flags - test1 = args.test1[0] - test2 = args.test2[0] - if unknown_args: - # should never happen - print("Unrecognized positional argument arguments: '%s'" - % unknown_args) - exit(1) - benchmark_options = args.benchmark_options - check_inputs(test1, test2, benchmark_options) - # Run the benchmarks and report the results - json1 = gbench.util.run_or_load_benchmark(test1, benchmark_options) - json2 = gbench.util.run_or_load_benchmark(test2, benchmark_options) - output_lines = gbench.report.generate_difference_report(json1, json2) - print('Comparing %s to %s' % (test1, test2)) - for ln in output_lines: - print(ln) - - -if __name__ == '__main__': - main() diff --git a/lib/gbenchmark/tools/gbench/Inputs/test1_run1.json b/lib/gbenchmark/tools/gbench/Inputs/test1_run1.json index d7ec6a9c8f..601e327aef 100644 --- a/lib/gbenchmark/tools/gbench/Inputs/test1_run1.json +++ b/lib/gbenchmark/tools/gbench/Inputs/test1_run1.json @@ -85,7 +85,24 @@ 
"time_unit": "ns" }, { - "name": "BM_BadTimeUnit", + "name": "MyComplexityTest_BigO", + "run_name": "MyComplexityTest", + "run_type": "aggregate", + "aggregate_name": "BigO", + "cpu_coefficient": 4.2749856294592886e+00, + "real_coefficient": 6.4789275289789780e+00, + "big_o": "N", + "time_unit": "ns" + }, + { + "name": "MyComplexityTest_RMS", + "run_name": "MyComplexityTest", + "run_type": "aggregate", + "aggregate_name": "RMS", + "rms": 4.5097802512472874e-03 + }, + { + "name": "BM_NotBadTimeUnit", "iterations": 1000, "real_time": 0.4, "cpu_time": 0.5, diff --git a/lib/gbenchmark/tools/gbench/Inputs/test1_run2.json b/lib/gbenchmark/tools/gbench/Inputs/test1_run2.json index 59a5ffaca4..3cbcf39b0c 100644 --- a/lib/gbenchmark/tools/gbench/Inputs/test1_run2.json +++ b/lib/gbenchmark/tools/gbench/Inputs/test1_run2.json @@ -85,7 +85,24 @@ "time_unit": "ns" }, { - "name": "BM_BadTimeUnit", + "name": "MyComplexityTest_BigO", + "run_name": "MyComplexityTest", + "run_type": "aggregate", + "aggregate_name": "BigO", + "cpu_coefficient": 5.6215779594361486e+00, + "real_coefficient": 5.6288314793554610e+00, + "big_o": "N", + "time_unit": "ns" + }, + { + "name": "MyComplexityTest_RMS", + "run_name": "MyComplexityTest", + "run_type": "aggregate", + "aggregate_name": "RMS", + "rms": 3.3128901852342174e-03 + }, + { + "name": "BM_NotBadTimeUnit", "iterations": 1000, "real_time": 0.04, "cpu_time": 0.6, diff --git a/lib/gbenchmark/tools/gbench/Inputs/test3_run0.json b/lib/gbenchmark/tools/gbench/Inputs/test3_run0.json new file mode 100644 index 0000000000..49f8b06143 --- /dev/null +++ b/lib/gbenchmark/tools/gbench/Inputs/test3_run0.json @@ -0,0 +1,65 @@ +{ + "context": { + "date": "2016-08-02 17:44:46", + "num_cpus": 4, + "mhz_per_cpu": 4228, + "cpu_scaling_enabled": false, + "library_build_type": "release" + }, + "benchmarks": [ + { + "name": "BM_One", + "run_type": "aggregate", + "iterations": 1000, + "real_time": 10, + "cpu_time": 100, + "time_unit": "ns" + }, + { + "name": "BM_Two", + "iterations": 1000, + "real_time": 9, + "cpu_time": 90, + "time_unit": "ns" + }, + { + "name": "BM_Two", + "iterations": 1000, + "real_time": 8, + "cpu_time": 86, + "time_unit": "ns" + }, + { + "name": "short", + "run_type": "aggregate", + "iterations": 1000, + "real_time": 8, + "cpu_time": 80, + "time_unit": "ns" + }, + { + "name": "short", + "run_type": "aggregate", + "iterations": 1000, + "real_time": 8, + "cpu_time": 77, + "time_unit": "ns" + }, + { + "name": "medium", + "run_type": "iteration", + "iterations": 1000, + "real_time": 8, + "cpu_time": 80, + "time_unit": "ns" + }, + { + "name": "medium", + "run_type": "iteration", + "iterations": 1000, + "real_time": 9, + "cpu_time": 82, + "time_unit": "ns" + } + ] +} diff --git a/lib/gbenchmark/tools/gbench/Inputs/test3_run1.json b/lib/gbenchmark/tools/gbench/Inputs/test3_run1.json new file mode 100644 index 0000000000..acc5ba17ae --- /dev/null +++ b/lib/gbenchmark/tools/gbench/Inputs/test3_run1.json @@ -0,0 +1,65 @@ +{ + "context": { + "date": "2016-08-02 17:44:46", + "num_cpus": 4, + "mhz_per_cpu": 4228, + "cpu_scaling_enabled": false, + "library_build_type": "release" + }, + "benchmarks": [ + { + "name": "BM_One", + "iterations": 1000, + "real_time": 9, + "cpu_time": 110, + "time_unit": "ns" + }, + { + "name": "BM_Two", + "run_type": "aggregate", + "iterations": 1000, + "real_time": 10, + "cpu_time": 89, + "time_unit": "ns" + }, + { + "name": "BM_Two", + "iterations": 1000, + "real_time": 7, + "cpu_time": 72, + "time_unit": "ns" + }, + { + "name": "short", + "run_type": 
"aggregate", + "iterations": 1000, + "real_time": 7, + "cpu_time": 75, + "time_unit": "ns" + }, + { + "name": "short", + "run_type": "aggregate", + "iterations": 762, + "real_time": 4.54, + "cpu_time": 66.6, + "time_unit": "ns" + }, + { + "name": "short", + "run_type": "iteration", + "iterations": 1000, + "real_time": 800, + "cpu_time": 1, + "time_unit": "ns" + }, + { + "name": "medium", + "run_type": "iteration", + "iterations": 1200, + "real_time": 5, + "cpu_time": 53, + "time_unit": "ns" + } + ] +} diff --git a/lib/gbenchmark/tools/gbench/report.py b/lib/gbenchmark/tools/gbench/report.py index 8d68fe96ee..5bd3a8d85d 100644 --- a/lib/gbenchmark/tools/gbench/report.py +++ b/lib/gbenchmark/tools/gbench/report.py @@ -1,9 +1,13 @@ +import unittest """report.py - Utilities for reporting statistics about benchmark results """ import os import re import copy +from scipy.stats import mannwhitneyu + + class BenchmarkColor(object): def __init__(self, name, code): self.name = name @@ -16,11 +20,13 @@ def __repr__(self): def __format__(self, format): return self.code + # Benchmark Colors Enumeration BC_NONE = BenchmarkColor('NONE', '') BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m') BC_CYAN = BenchmarkColor('CYAN', '\033[96m') BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m') +BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m') BC_HEADER = BenchmarkColor('HEADER', '\033[92m') BC_WARNING = BenchmarkColor('WARNING', '\033[93m') BC_WHITE = BenchmarkColor('WHITE', '\033[97m') @@ -29,6 +35,11 @@ def __format__(self, format): BC_BOLD = BenchmarkColor('BOLD', '\033[1m') BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m') +UTEST_MIN_REPETITIONS = 2 +UTEST_OPTIMAL_REPETITIONS = 9 # Lowest reasonable number, More is better. +UTEST_COL_NAME = "_pvalue" + + def color_format(use_color, fmt_str, *args, **kwargs): """ Return the result of 'fmt_str.format(*args, **kwargs)' after transforming @@ -78,64 +89,242 @@ def filter_benchmark(json_orig, family, replacement=""): for be in json_orig['benchmarks']: if not regex.search(be['name']): continue - filteredbench = copy.deepcopy(be) # Do NOT modify the old name! + filteredbench = copy.deepcopy(be) # Do NOT modify the old name! filteredbench['name'] = regex.sub(replacement, filteredbench['name']) filtered['benchmarks'].append(filteredbench) return filtered -def generate_difference_report(json1, json2, use_color=True): +def get_unique_benchmark_names(json): + """ + While *keeping* the order, give all the unique 'names' used for benchmarks. + """ + seen = set() + uniqued = [x['name'] for x in json['benchmarks'] + if x['name'] not in seen and + (seen.add(x['name']) or True)] + return uniqued + + +def intersect(list1, list2): + """ + Given two lists, get a new list consisting of the elements only contained + in *both of the input lists*, while preserving the ordering. + """ + return [x for x in list1 if x in list2] + + +def is_potentially_comparable_benchmark(x): + return ('time_unit' in x and 'real_time' in x and 'cpu_time' in x) + + +def partition_benchmarks(json1, json2): + """ + While preserving the ordering, find benchmarks with the same names in + both of the inputs, and group them. + (i.e. partition/filter into groups with common name) + """ + json1_unique_names = get_unique_benchmark_names(json1) + json2_unique_names = get_unique_benchmark_names(json2) + names = intersect(json1_unique_names, json2_unique_names) + partitions = [] + for name in names: + time_unit = None + # Pick the time unit from the first entry of the lhs benchmark. 
+ # We should be careful not to crash with unexpected input. + for x in json1['benchmarks']: + if (x['name'] == name and is_potentially_comparable_benchmark(x)): + time_unit = x['time_unit'] + break + if time_unit is None: + continue + # Filter by name and time unit. + # All the repetitions are assumed to be comparable. + lhs = [x for x in json1['benchmarks'] if x['name'] == name and + x['time_unit'] == time_unit] + rhs = [x for x in json2['benchmarks'] if x['name'] == name and + x['time_unit'] == time_unit] + partitions.append([lhs, rhs]) + return partitions + + +def extract_field(partition, field_name): + # The count of elements may be different. We want *all* of them. + lhs = [x[field_name] for x in partition[0]] + rhs = [x[field_name] for x in partition[1]] + return [lhs, rhs] + +def calc_utest(timings_cpu, timings_time): + min_rep_cnt = min(len(timings_time[0]), + len(timings_time[1]), + len(timings_cpu[0]), + len(timings_cpu[1])) + + # Does *everything* have at least UTEST_MIN_REPETITIONS repetitions? + if min_rep_cnt < UTEST_MIN_REPETITIONS: + return False, None, None + + time_pvalue = mannwhitneyu( + timings_time[0], timings_time[1], alternative='two-sided').pvalue + cpu_pvalue = mannwhitneyu( + timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue + + return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue + +def print_utest(partition, utest_alpha, first_col_width, use_color=True): + def get_utest_color(pval): + return BC_FAIL if pval >= utest_alpha else BC_OKGREEN + + timings_time = extract_field(partition, 'real_time') + timings_cpu = extract_field(partition, 'cpu_time') + have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time) + + # Bail out if we did not meet even the minimum repetition count for the U test. + if not have_optimal_repetitions and cpu_pvalue is None and time_pvalue is None: + return [] + + dsc = "U Test, Repetitions: {} vs {}".format( + len(timings_cpu[0]), len(timings_cpu[1])) + dsc_color = BC_OKGREEN + + # We still have some results to show, but issue a warning about them. + if not have_optimal_repetitions: + dsc_color = BC_WARNING + dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format( + UTEST_OPTIMAL_REPETITIONS) + + special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{} {}" + + last_name = partition[0][0]['name'] + return [color_format(use_color, + special_str, + BC_HEADER, + "{}{}".format(last_name, UTEST_COL_NAME), + first_col_width, + get_utest_color(time_pvalue), time_pvalue, + get_utest_color(cpu_pvalue), cpu_pvalue, + dsc_color, dsc, + endc=BC_ENDC)] + + +def generate_difference_report( + json1, + json2, + display_aggregates_only=False, + utest=False, + utest_alpha=0.05, + use_color=True): """ Calculate and report the difference between each test of two benchmark runs specified as 'json1' and 'json2'. 
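+ When 'utest' is True and both runs carry enough repetitions, a Mann-Whitney + U test row (the benchmark name suffixed with UTEST_COL_NAME) is appended + after each benchmark family; 'utest_alpha' is the significance level used + when colorizing the reported p-values.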
""" + assert utest is True or utest is False first_col_width = find_longest_name(json1['benchmarks']) + def find_test(name): for b in json2['benchmarks']: if b['name'] == name: return b return None - first_col_width = max(first_col_width, len('Benchmark')) + + first_col_width = max( + first_col_width, + len('Benchmark')) + first_col_width += len(UTEST_COL_NAME) first_line = "{:<{}s}Time CPU Time Old Time New CPU Old CPU New".format( 'Benchmark', 12 + first_col_width) output_strs = [first_line, '-' * len(first_line)] - gen = (bn for bn in json1['benchmarks'] if 'real_time' in bn and 'cpu_time' in bn) - for bn in gen: - other_bench = find_test(bn['name']) - if not other_bench: - continue + partitions = partition_benchmarks(json1, json2) + for partition in partitions: + # Careful, we may have different repetition count. + for i in range(min(len(partition[0]), len(partition[1]))): + bn = partition[0][i] + other_bench = partition[1][i] - if bn['time_unit'] != other_bench['time_unit']: - continue + # *If* we were asked to only display aggregates, + # and if it is non-aggregate, then skip it. + if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench: + assert bn['run_type'] == other_bench['run_type'] + if bn['run_type'] != 'aggregate': + continue + + fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}" + + def get_color(res): + if res > 0.05: + return BC_FAIL + elif res > -0.07: + return BC_WHITE + else: + return BC_CYAN + + tres = calculate_change(bn['real_time'], other_bench['real_time']) + cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time']) + output_strs += [color_format(use_color, + fmt_str, + BC_HEADER, + bn['name'], + first_col_width, + get_color(tres), + tres, + get_color(cpures), + cpures, + bn['real_time'], + other_bench['real_time'], + bn['cpu_time'], + other_bench['cpu_time'], + endc=BC_ENDC)] + + # After processing the whole partition, if requested, do the U test. 
+ if utest: + output_strs += print_utest(partition, + utest_alpha=utest_alpha, + first_col_width=first_col_width, + use_color=use_color) - def get_color(res): - if res > 0.05: - return BC_FAIL - elif res > -0.07: - return BC_WHITE - else: - return BC_CYAN - fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}" - tres = calculate_change(bn['real_time'], other_bench['real_time']) - cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time']) - output_strs += [color_format(use_color, fmt_str, - BC_HEADER, bn['name'], first_col_width, - get_color(tres), tres, get_color(cpures), cpures, - bn['real_time'], other_bench['real_time'], - bn['cpu_time'], other_bench['cpu_time'], - endc=BC_ENDC)] return output_strs + ############################################################################### # Unit tests -import unittest + +class TestGetUniqueBenchmarkNames(unittest.TestCase): + def load_results(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput = os.path.join(testInputs, 'test3_run0.json') + with open(testOutput, 'r') as f: + json = json.load(f) + return json + + def test_basic(self): + expect_lines = [ + 'BM_One', + 'BM_Two', + 'short', # These two are not sorted + 'medium', # These two are not sorted + ] + json = self.load_results() + output_lines = get_unique_benchmark_names(json) + print("\n") + print("\n".join(output_lines)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + self.assertEqual(expect_lines[i], output_lines[i]) + class TestReportDifference(unittest.TestCase): def load_results(self): import json - testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs') + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') testOutput1 = os.path.join(testInputs, 'test1_run1.json') testOutput2 = os.path.join(testInputs, 'test1_run2.json') with open(testOutput1, 'r') as f: @@ -153,27 +342,35 @@ def test_basic(self): ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'], ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'], ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'], - ['BM_100xSlower', '+99.0000', '+99.0000', '100', '10000', '100', '10000'], - ['BM_100xFaster', '-0.9900', '-0.9900', '10000', '100', '10000', '100'], - ['BM_10PercentCPUToTime', '+0.1000', '-0.1000', '100', '110', '100', '90'], + ['BM_100xSlower', '+99.0000', '+99.0000', + '100', '10000', '100', '10000'], + ['BM_100xFaster', '-0.9900', '-0.9900', + '10000', '100', '10000', '100'], + ['BM_10PercentCPUToTime', '+0.1000', + '-0.1000', '100', '110', '100', '90'], ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'], - ['BM_BadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'], + ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'], ] json1, json2 = self.load_results() - output_lines_with_header = generate_difference_report(json1, json2, use_color=False) + output_lines_with_header = generate_difference_report( + json1, json2, use_color=False) output_lines = output_lines_with_header[2:] + print("\n") print("\n".join(output_lines_with_header)) self.assertEqual(len(output_lines), len(expect_lines)) for i in range(0, len(output_lines)): parts = [x for x in output_lines[i].split(' ') if x] self.assertEqual(len(parts), 7) - self.assertEqual(parts, expect_lines[i]) + self.assertEqual(expect_lines[i], parts) class 
TestReportDifferenceBetweenFamilies(unittest.TestCase): def load_result(self): import json - testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs') + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') testOutput = os.path.join(testInputs, 'test2_run.json') with open(testOutput, 'r') as f: json = json.load(f) @@ -189,15 +386,151 @@ def test_basic(self): json = self.load_result() json1 = filter_benchmark(json, "BM_Z.ro", ".") json2 = filter_benchmark(json, "BM_O.e", ".") - output_lines_with_header = generate_difference_report(json1, json2, use_color=False) + output_lines_with_header = generate_difference_report( + json1, json2, use_color=False) output_lines = output_lines_with_header[2:] - print "\n" + print("\n") print("\n".join(output_lines_with_header)) self.assertEqual(len(output_lines), len(expect_lines)) for i in range(0, len(output_lines)): parts = [x for x in output_lines[i].split(' ') if x] self.assertEqual(len(parts), 7) - self.assertEqual(parts, expect_lines[i]) + self.assertEqual(expect_lines[i], parts) + + +class TestReportDifferenceWithUTest(unittest.TestCase): + def load_results(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test3_run0.json') + testOutput2 = os.path.join(testInputs, 'test3_run1.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + def test_utest(self): + expect_lines = [ + ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], + ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], + ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'], + ['BM_Two_pvalue', + '0.6985', + '0.6985', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '2.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'], + ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'], + ['short_pvalue', + '0.7671', + '0.1489', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '3.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'], + ] + json1, json2 = self.load_results() + output_lines_with_header = generate_difference_report( + json1, json2, utest=True, utest_alpha=0.05, use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(expect_lines[i], parts) + + +class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly( + unittest.TestCase): + def load_results(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test3_run0.json') + testOutput2 = os.path.join(testInputs, 'test3_run1.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + def test_utest(self): + expect_lines = [ + ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], + ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], + ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'], + ['BM_Two_pvalue', + 
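# Token order: real-time p-value, then CPU-time p-value, then the + # whitespace-split U test description text. +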
'0.6985', + '0.6985', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '2.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'], + ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'], + ['short_pvalue', + '0.7671', + '0.1489', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '3.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ] + json1, json2 = self.load_results() + output_lines_with_header = generate_difference_report( + json1, json2, display_aggregates_only=True, + utest=True, utest_alpha=0.05, use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(expect_lines[i], parts) if __name__ == '__main__': diff --git a/lib/gbenchmark/tools/gbench/util.py b/lib/gbenchmark/tools/gbench/util.py index 07c2377275..1f8e8e2c47 100644 --- a/lib/gbenchmark/tools/gbench/util.py +++ b/lib/gbenchmark/tools/gbench/util.py @@ -7,11 +7,13 @@ import sys # Input file type enumeration -IT_Invalid = 0 -IT_JSON = 1 +IT_Invalid = 0 +IT_JSON = 1 IT_Executable = 2 _num_magic_bytes = 2 if sys.platform.startswith('win') else 4 + + def is_executable_file(filename): """ Return 'True' if 'filename' names a valid file which is likely @@ -46,7 +48,7 @@ def is_json_file(filename): with open(filename, 'r') as f: json.load(f) return True - except: + except BaseException: pass return False @@ -84,6 +86,7 @@ def check_input_file(filename): sys.exit(1) return ftype + def find_benchmark_flag(prefix, benchmark_flags): """ Search the specified list of flags for a flag matching `<prefix>` and @@ -97,6 +100,7 @@ result = f[len(prefix):] return result + def remove_benchmark_flags(prefix, benchmark_flags): """ Return a new list containing the specified benchmark_flags except those @@ -105,6 +109,7 @@ assert prefix.startswith('--') and prefix.endswith('=') return [f for f in benchmark_flags if not f.startswith(prefix)] + def load_benchmark_results(fname): """ Read benchmark output from a file and return the JSON object. @@ -129,7 +134,7 @@ thandle, output_name = tempfile.mkstemp() os.close(thandle) benchmark_flags = list(benchmark_flags) + \ - ['--benchmark_out=%s' % output_name] + ['--benchmark_out=%s' % output_name] cmd = [exe_name] + benchmark_flags print("RUNNING: %s" % ' '.join(cmd)) @@ -156,4 +161,4 @@ def run_or_load_benchmark(filename, benchmark_flags): elif ftype == IT_Executable: return run_benchmark(filename, benchmark_flags) else: - assert False # This branch is unreachable \ No newline at end of file + assert False # This branch is unreachable