Update google benchmark

git-svn-id: https://llvm.org/svn/llvm-project/libcxx/trunk@300530 91177308-0d34-0410-b5e6-96231b3b80d8
Eric Fiselier 2017-04-18 07:17:20 +00:00
parent 937aecfb3d
commit 688edc78f9
41 changed files with 1230 additions and 162 deletions

View File

@ -18,12 +18,15 @@ Eugene Zhuk <eugene.zhuk@gmail.com>
Evgeny Safronov <division494@gmail.com>
Felix Homann <linuxaudio@showlabor.de>
Google Inc.
International Business Machines Corporation
Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
JianXiong Zhou <zhoujianxiong2@gmail.com>
Jussi Knuuttila <jussi.knuuttila@gmail.com>
Kaito Udagawa <umireon@gmail.com>
Lei Xu <eddyxu@gmail.com>
Matt Clarkson <mattyclarkson@gmail.com>
Maxim Vafin <maxvafin@gmail.com>
Nick Hutchinson <nshutchinson@gmail.com>
Oleksandr Sochka <sasha.sochka@gmail.com>
Paul Redmond <paul.redmond@gmail.com>

View File

@ -1,4 +1,4 @@
cmake_minimum_required (VERSION 2.8.11)
cmake_minimum_required (VERSION 2.8.12)
project (benchmark)
foreach(p
@ -11,8 +11,11 @@ foreach(p
endforeach()
option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library" OFF)
# Make sure we can import our CMake functions
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
@ -33,12 +36,20 @@ include(CheckCXXCompilerFlag)
include(AddCXXCompilerFlag)
include(CXXFeatureCheck)
if (BENCHMARK_BUILD_32_BITS)
add_required_cxx_compiler_flag(-m32)
endif()
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
# Turn compiler warnings up to 11
string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
add_cxx_compiler_flag(-EHs-)
add_cxx_compiler_flag(-EHa-)
endif()
# Link time optimisation
if (BENCHMARK_ENABLE_LTO)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL")
@ -80,12 +91,20 @@ else()
add_cxx_compiler_flag(-Wshorten-64-to-32)
add_cxx_compiler_flag(-Wfloat-equal)
add_cxx_compiler_flag(-fstrict-aliasing)
if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
add_cxx_compiler_flag(-fno-exceptions)
endif()
if (NOT BENCHMARK_USE_LIBCXX)
add_cxx_compiler_flag(-Wzero-as-null-pointer-constant)
endif()
if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
add_cxx_compiler_flag(-Wstrict-aliasing)
if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
add_cxx_compiler_flag(-Wstrict-aliasing)
endif()
endif()
# ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden
# (because of deprecated overload)
add_cxx_compiler_flag(-wd654)
add_cxx_compiler_flag(-Wthread-safety)
if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
@ -162,7 +181,10 @@ cxx_feature_check(POSIX_REGEX)
if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
endif()
if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
message(WARNING "Using std::regex with exceptions disabled is not fully supported")
endif()
cxx_feature_check(STEADY_CLOCK)
# Ensure we have pthreads
find_package(Threads REQUIRED)

View File

@ -34,18 +34,21 @@ Eugene Zhuk <eugene.zhuk@gmail.com>
Evgeny Safronov <division494@gmail.com>
Felix Homann <linuxaudio@showlabor.de>
Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
JianXiong Zhou <zhoujianxiong2@gmail.com>
Jussi Knuuttila <jussi.knuuttila@gmail.com>
Kaito Udagawa <umireon@gmail.com>
Kai Wolf <kai.wolf@gmail.com>
Lei Xu <eddyxu@gmail.com>
Matt Clarkson <mattyclarkson@gmail.com>
Maxim Vafin <maxvafin@gmail.com>
Nick Hutchinson <nshutchinson@gmail.com>
Oleksandr Sochka <sasha.sochka@gmail.com>
Pascal Leroy <phl@google.com>
Paul Redmond <paul.redmond@gmail.com>
Pierre Phaneuf <pphaneuf@google.com>
Radoslav Yovchev <radoslav.tm@gmail.com>
Ray Glover <ray.glover@uk.ibm.com>
Shuo Chen <chenshuo@chenshuo.com>
Yusuke Suzuki <utatane.tea@gmail.com>
Tobias Ulvgård <tobias.ulvgard@dirac.se>

View File

@ -11,6 +11,8 @@ IRC channel: https://freenode.net #googlebenchmark
[Known issues and common problems](#known-issues)
[Additional Tooling Documentation](docs/tools.md)
## Example usage
### Basic usage
Define a function that executes the code to be measured.
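The basic example itself is elided by this hunk; a minimal sketch in the `KeepRunning()` style this version uses (along the lines of the README's `BM_StringCreation` example):
```c++
#include <benchmark/benchmark.h>
#include <string>

static void BM_StringCreation(benchmark::State& state) {
  while (state.KeepRunning())
    std::string empty_string;
}
// Register the function as a benchmark.
BENCHMARK(BM_StringCreation);

BENCHMARK_MAIN()
```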
@ -363,7 +365,7 @@ static void BM_vector_push_back(benchmark::State& state) {
}
```
Note that `ClobberMemory()` is only available for GNU based compilers.
Note that `ClobberMemory()` is only available for GNU or MSVC based compilers.
### Set time unit manually
If a benchmark runs for a few milliseconds, it may be hard to visually compare the
@ -430,6 +432,65 @@ BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
/* BarTest is now registered */
```
## User-defined counters
You can add your own counters with user-defined names. The example below
will add columns "Foo", "Bar" and "Baz" to its output:
```c++
static void UserCountersExample1(benchmark::State& state) {
double numFoos = 0, numBars = 0, numBazs = 0;
while (state.KeepRunning()) {
// ... count Foo,Bar,Baz events
}
state.counters["Foo"] = numFoos;
state.counters["Bar"] = numBars;
state.counters["Baz"] = numBazs;
}
```
The `state.counters` object is a `std::map` with `std::string` keys
and `Counter` values. The latter is a `double`-like class that converts
implicitly to `double&`, so you can use all of the standard assignment and
arithmetic-assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter.
In multithreaded benchmarks, each counter is set on the calling thread only.
When the benchmark finishes, the counters from each thread will be summed;
the resulting sum is the value which will be shown for the benchmark.
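For example (an illustrative sketch, not part of this diff; `BM_ThreadedWork` is a hypothetical name), a benchmark run with `Threads(2)` in which each thread sets `"Foo"` to 1 will report `Foo` as 2:
```c++
#include <benchmark/benchmark.h>

static void BM_ThreadedWork(benchmark::State& state) {
  while (state.KeepRunning()) {
    // ... per-thread work ...
  }
  // Set on each calling thread; the reported value is the sum across threads.
  state.counters["Foo"] = 1;
}
BENCHMARK(BM_ThreadedWork)->Threads(2);
```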
The `Counter` constructor accepts two parameters: the value as a `double`
and a bit flag which allows you to show counters as rates and/or as
per-thread averages:
```c++
// sets a simple counter
state.counters["Foo"] = numFoos;
// Set the counter as a rate. It will be presented divided
// by the duration of the benchmark.
state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate);
// Set the counter as a thread-average quantity. It will
// be presented divided by the number of threads.
state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads);
// There's also a combined flag:
state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate);
```
When you're compiling in C++11 mode or later, you can use `insert()` with
`std::initializer_list`:
```c++
// With C++11, this can be done:
state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
// ... instead of:
state.counters["Foo"] = numFoos;
state.counters["Bar"] = numBars;
state.counters["Baz"] = numBazs;
```
## Exiting Benchmarks in Error
When errors caused by external influences, such as file I/O and network
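The rest of this section is elided by the hunk; the mechanism it documents is `State::SkipWithError()`. A hedged sketch (`ResourceAvailable` is a hypothetical stand-in):
```c++
#include <benchmark/benchmark.h>

static bool ResourceAvailable() { return false; }  // hypothetical stand-in

static void BM_WithError(benchmark::State& state) {
  if (!ResourceAvailable()) {
    state.SkipWithError("resource unavailable");
    return;  // returning early is allowed once SkipWithError() has been called
  }
  while (state.KeepRunning()) {
    // ... measured work ...
  }
}
BENCHMARK(BM_WithError);
```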
@ -501,7 +562,7 @@ The `context` attribute contains information about the run in general, including
information about the CPU and the date.
The `benchmarks` attribute contains a list of every benchmark run. Example json
output looks like:
``` json
```json
{
"context": {
"date": "2015/03/17-18:40:25",
@ -582,6 +643,7 @@ The following minimum versions are strongly recommended to build the library:
* GCC 4.8
* Clang 3.4
* Visual Studio 2013
* Intel 2015 Update 1
Anything older *may* work.

View File

@ -19,14 +19,21 @@ set(__add_cxx_compiler_flag INCLUDED)
include(CheckCXXCompilerFlag)
function(add_cxx_compiler_flag FLAG)
function(mangle_compiler_flag FLAG OUTPUT)
string(TOUPPER "HAVE_CXX_FLAG_${FLAG}" SANITIZED_FLAG)
string(REPLACE "+" "X" SANITIZED_FLAG ${SANITIZED_FLAG})
string(REGEX REPLACE "[^A-Za-z_0-9]" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
string(REGEX REPLACE "_+" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
set(CMAKE_REQUIRED_FLAGS "${FLAG}")
check_cxx_compiler_flag("${FLAG}" ${SANITIZED_FLAG})
if(${SANITIZED_FLAG})
set(${OUTPUT} "${SANITIZED_FLAG}" PARENT_SCOPE)
endfunction(mangle_compiler_flag)
function(add_cxx_compiler_flag FLAG)
mangle_compiler_flag("${FLAG}" MANGLED_FLAG)
set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}")
check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
if(${MANGLED_FLAG})
set(VARIANT ${ARGV1})
if(ARGV1)
string(TOUPPER "_${VARIANT}" VARIANT)
@ -35,3 +42,23 @@ function(add_cxx_compiler_flag FLAG)
endif()
endfunction()
function(add_required_cxx_compiler_flag FLAG)
mangle_compiler_flag("${FLAG}" MANGLED_FLAG)
set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}")
check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
if(${MANGLED_FLAG})
set(VARIANT ${ARGV1})
if(ARGV1)
string(TOUPPER "_${VARIANT}" VARIANT)
endif()
set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}" PARENT_SCOPE)
else()
message(FATAL_ERROR "Required flag '${FLAG}' is not supported by the compiler")
endif()
endfunction()

View File

@ -10,7 +10,7 @@
#
# include(CXXFeatureCheck)
# cxx_feature_check(STD_REGEX)
# Requires CMake 2.6+
# Requires CMake 2.8.12+
if(__cxx_feature_check)
return()

View File

@ -0,0 +1 @@
include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")

View File

@ -0,0 +1,59 @@
# Benchmark Tools
## compare_bench.py
The `compare_bench.py` utility can be used to compare the results of two benchmarks.
The program is invoked like:
``` bash
$ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]...
```
Here `<old-benchmark>` and `<new-benchmark>` specify either a benchmark executable or a JSON output file. The type of the input file is detected automatically. If a benchmark executable is specified, the benchmark is run to obtain the results; otherwise the results are simply loaded from the output file.
The sample output using the JSON test files under `Inputs/` gives:
``` bash
$ ./compare_bench.py ./gbench/Inputs/test1_run1.json ./gbench/Inputs/test1_run2.json
Comparing ./gbench/Inputs/test1_run1.json to ./gbench/Inputs/test1_run2.json
Benchmark Time CPU
----------------------------------------------
BM_SameTimes +0.00 +0.00
BM_2xFaster -0.50 -0.50
BM_2xSlower +1.00 +1.00
BM_10PercentFaster -0.10 -0.10
BM_10PercentSlower +0.10 +0.10
```
When a benchmark executable is run, the raw output from the benchmark is printed in real time to stdout. The sample output using `benchmark/basic_test` for both arguments looks like:
```
./compare_bench.py test/basic_test test/basic_test --benchmark_filter=BM_empty.*
RUNNING: test/basic_test --benchmark_filter=BM_empty.*
Run on (4 X 4228.32 MHz CPU s)
2016-08-02 19:21:33
Benchmark Time CPU Iterations
--------------------------------------------------------------------
BM_empty 9 ns 9 ns 79545455
BM_empty/threads:4 4 ns 9 ns 75268816
BM_empty_stop_start 8 ns 8 ns 83333333
BM_empty_stop_start/threads:4 3 ns 8 ns 83333332
RUNNING: test/basic_test --benchmark_filter=BM_empty.*
Run on (4 X 4228.32 MHz CPU s)
2016-08-02 19:21:35
Benchmark Time CPU Iterations
--------------------------------------------------------------------
BM_empty 9 ns 9 ns 76086957
BM_empty/threads:4 4 ns 9 ns 76086956
BM_empty_stop_start 8 ns 8 ns 87500000
BM_empty_stop_start/threads:4 3 ns 8 ns 88607596
Comparing test/basic_test to test/basic_test
Benchmark Time CPU
---------------------------------------------------------
BM_empty +0.00 +0.00
BM_empty/threads:4 +0.00 +0.00
BM_empty_stop_start +0.00 +0.00
BM_empty_stop_start/threads:4 +0.00 +0.00
```
Obviously this example doesn't give any useful output, but it's intended to show the output format when `compare_bench.py` needs to run benchmarks.

View File

@ -155,19 +155,29 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
#include <string>
#include <vector>
#include <map>
#include "macros.h"
#if defined(BENCHMARK_HAS_CXX11)
#include <type_traits>
#include <initializer_list>
#include <utility>
#endif
#if defined(_MSC_VER)
#include <intrin.h> // for _ReadWriteBarrier
#endif
namespace benchmark {
class BenchmarkReporter;
void Initialize(int* argc, char** argv);
// Report to stderr all arguments in 'argv' as unrecognized, except the first.
// Returns true if there is at least one unrecognized argument (i.e. 'argc' > 1).
bool ReportUnrecognizedArguments(int argc, char** argv);
// Generate a list of benchmarks matching the specified --benchmark_filter flag
// and if --benchmark_list_tests is specified return after printing the name
// of each matching benchmark. Otherwise run each matching benchmark and
@ -197,19 +207,6 @@ class Benchmark;
class BenchmarkImp;
class BenchmarkFamilies;
template <class T>
struct Voider {
typedef void type;
};
template <class T, class = void>
struct EnableIfString {};
template <class T>
struct EnableIfString<T, typename Voider<typename T::basic_string>::type> {
typedef int type;
};
void UseCharPointer(char const volatile*);
// Take ownership of the pointer and register the benchmark. Return the
@ -222,11 +219,16 @@ BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
} // end namespace internal
#if !defined(__GNUC__) || defined(__pnacl__) || defined(EMSCRIPTEN)
# define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
#endif
// The DoNotOptimize(...) function can be used to prevent a value or
// expression from being optimized away by the compiler. This function is
// intended to add little to no overhead.
// See: https://youtu.be/nXaxk27zwlk?t=2441
#if defined(__GNUC__)
#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
asm volatile("" : : "g"(value) : "memory");
@ -236,14 +238,57 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
asm volatile("" : : : "memory");
}
#elif defined(_MSC_VER)
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
_ReadWriteBarrier();
}
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
_ReadWriteBarrier();
}
#else
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
}
// FIXME Add ClobberMemory() for non-gnu compilers
// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
#endif
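As an editorial aside (not part of the header): the README's `BM_vector_push_back` example exercises both of these barriers, roughly:
```c++
#include <benchmark/benchmark.h>
#include <vector>

static void BM_vector_push_back(benchmark::State& state) {
  while (state.KeepRunning()) {
    std::vector<int> v;
    v.reserve(1);
    benchmark::DoNotOptimize(v.data());  // make the allocation observable
    v.push_back(42);
    benchmark::ClobberMemory();          // force the store to be considered visible
  }
}
BENCHMARK(BM_vector_push_back);
```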
// This class is used for user-defined counters.
class Counter {
public:
enum Flags {
kDefaults = 0,
// Mark the counter as a rate. It will be presented divided
// by the duration of the benchmark.
kIsRate = 1,
// Mark the counter as a thread-average quantity. It will be
// presented divided by the number of threads.
kAvgThreads = 2,
// Mark the counter as a thread-average rate. See above.
kAvgThreadsRate = kIsRate|kAvgThreads
};
double value;
Flags flags;
BENCHMARK_ALWAYS_INLINE
Counter(double v = 0., Flags f = kDefaults) : value(v), flags(f) {}
BENCHMARK_ALWAYS_INLINE operator double const& () const { return value; }
BENCHMARK_ALWAYS_INLINE operator double & () { return value; }
};
// This is the container for the user-defined counters.
typedef std::map<std::string, Counter> UserCounters;
// TimeUnit is passed to a benchmark in order to specify the order of magnitude
// for the measured time.
enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond };
@ -393,13 +438,7 @@ class State {
// REQUIRES: a benchmark has exited its KeepRunning loop.
void SetLabel(const char* label);
// Allow the use of std::string without actually including <string>.
// This function does not participate in overload resolution unless StringType
// has the nested typename `basic_string`. This typename should be provided
// as an injected class name in the case of std::string.
template <class StringType>
void SetLabel(StringType const& str,
typename internal::EnableIfString<StringType>::type = 1) {
void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) {
this->SetLabel(str.c_str());
}
@ -434,6 +473,8 @@ class State {
bool error_occurred_;
public:
// Container for user-defined counters.
UserCounters counters;
// Index of the executing thread. Values from [0, threads).
const int thread_index;
// Number of threads concurrently executing the benchmark.
@ -536,9 +577,17 @@ class Benchmark {
// Set the minimum amount of time to use when running this benchmark. This
// option overrides the `benchmark_min_time` flag.
// REQUIRES: `t > 0`
// REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark.
Benchmark* MinTime(double t);
// Specify the number of iterations that should be run by this benchmark.
// REQUIRES: `n > 0` and `MinTime` has not been called on this benchmark.
//
// NOTE: This function should only be used when *exact* iteration control is
// needed and never to control or limit how long a benchmark runs, where
// `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
Benchmark* Iterations(size_t n);
// Specify the number of times to repeat this benchmark. This option overrides
// the `benchmark_repetitions` flag.
// REQUIRES: `n > 0`
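An editorial sketch of the new `Iterations` API added here (not part of the header; `BM_Spin` is a hypothetical name). Note that `Iterations` and `MinTime` are mutually exclusive; each `CHECK`s that the other is unset:
```c++
static void BM_Spin(benchmark::State& state) {
  while (state.KeepRunning()) {
  }
}
// Run exactly 1000 iterations, bypassing the timing-based iteration search.
BENCHMARK(BM_Spin)->Iterations(1000);
```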
@ -627,6 +676,7 @@ class Benchmark {
TimeUnit time_unit_;
int range_multiplier_;
double min_time_;
size_t iterations_;
int repetitions_;
bool use_real_time_;
bool use_manual_time_;
@ -858,6 +908,7 @@ class Fixture : public internal::Benchmark {
#define BENCHMARK_MAIN() \
int main(int argc, char** argv) { \
::benchmark::Initialize(&argc, argv); \
if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
::benchmark::RunSpecifiedBenchmarks(); \
}

View File

@ -19,6 +19,7 @@
#include <string>
#include <utility>
#include <vector>
#include <set>
#include "benchmark_api.h" // For forward declaration of BenchmarkReporter
@ -54,7 +55,8 @@ class BenchmarkReporter {
complexity_lambda(),
complexity_n(0),
report_big_o(false),
report_rms(false) {}
report_rms(false),
counters() {}
std::string benchmark_name;
std::string report_label; // Empty if not set by benchmark.
@ -93,6 +95,8 @@ class BenchmarkReporter {
// Inform print function whether the current run is a complexity report
bool report_big_o;
bool report_rms;
UserCounters counters;
};
// Construct a BenchmarkReporter with the output stream set to 'std::cout'
@ -163,7 +167,10 @@ class ConsoleReporter : public BenchmarkReporter {
protected:
virtual void PrintRunData(const Run& report);
virtual void PrintHeader(const Run& report);
size_t name_field_width_;
bool printed_header_;
private:
bool color_output_;
@ -184,11 +191,15 @@ class JSONReporter : public BenchmarkReporter {
class CSVReporter : public BenchmarkReporter {
public:
CSVReporter() : printed_header_(false) {}
virtual bool ReportContext(const Context& context);
virtual void ReportRuns(const std::vector<Run>& reports);
private:
void PrintRunData(const Run& report);
bool printed_header_;
std::set< std::string > user_counter_names_;
};
inline const char* GetTimeUnitString(TimeUnit unit) {

View File

@ -0,0 +1,320 @@
#! /usr/bin/env python
# encoding: utf-8
import argparse
import errno
import logging
import os
import platform
import re
import sys
import subprocess
import tempfile
try:
import winreg
except ImportError:
import _winreg as winreg
try:
import urllib.request as request
except ImportError:
import urllib as request
try:
import urllib.parse as parse
except ImportError:
import urlparse as parse
class EmptyLogger(object):
'''
Provides an implementation that performs no logging
'''
def debug(self, *k, **kw):
pass
def info(self, *k, **kw):
pass
def warn(self, *k, **kw):
pass
def error(self, *k, **kw):
pass
def critical(self, *k, **kw):
pass
def setLevel(self, *k, **kw):
pass
urls = (
'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20'
'targetting%20Win32/Personal%20Builds/mingw-builds/installer/'
'repository.txt',
'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/'
'repository.txt'
)
'''
A list of mingw-build repositories
'''
def repository(urls = urls, log = EmptyLogger()):
'''
Downloads mingw-builds repository files and parses them
'''
log.info('getting mingw-builds repository')
versions = {}
re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files')
re_sub = r'http://downloads.sourceforge.net/project/\1'
for url in urls:
log.debug(' - requesting: %s', url)
socket = request.urlopen(url)
repo = socket.read()
if not isinstance(repo, str):
repo = repo.decode()
socket.close()
for entry in repo.split('\n')[:-1]:
value = entry.split('|')
version = tuple([int(n) for n in value[0].strip().split('.')])
version = versions.setdefault(version, {})
arch = value[1].strip()
if arch == 'x32':
arch = 'i686'
elif arch == 'x64':
arch = 'x86_64'
arch = version.setdefault(arch, {})
threading = arch.setdefault(value[2].strip(), {})
exceptions = threading.setdefault(value[3].strip(), {})
revision = exceptions.setdefault(int(value[4].strip()[3:]),
re_sourceforge.sub(re_sub, value[5].strip()))
return versions
def find_in_path(file, path=None):
'''
Attempts to find an executable in the path
'''
if platform.system() == 'Windows':
file += '.exe'
if path is None:
path = os.environ.get('PATH', '')
if type(path) is type(''):
path = path.split(os.pathsep)
return list(filter(os.path.exists,
map(lambda dir, file=file: os.path.join(dir, file), path)))
def find_7zip(log = EmptyLogger()):
'''
Attempts to find 7zip for unpacking the mingw-build archives
'''
log.info('finding 7zip')
path = find_in_path('7z')
if not path:
key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip')
path, _ = winreg.QueryValueEx(key, 'Path')
path = [os.path.join(path, '7z.exe')]
log.debug('found \'%s\'', path[0])
return path[0]
find_7zip()
def unpack(archive, location, log = EmptyLogger()):
'''
Unpacks a mingw-builds archive
'''
sevenzip = find_7zip(log)
log.info('unpacking %s', os.path.basename(archive))
cmd = [sevenzip, 'x', archive, '-o' + location, '-y']
log.debug(' - %r', cmd)
with open(os.devnull, 'w') as devnull:
subprocess.check_call(cmd, stdout = devnull)
def download(url, location, log = EmptyLogger()):
'''
Downloads and unpacks a mingw-builds archive
'''
log.info('downloading MinGW')
log.debug(' - url: %s', url)
log.debug(' - location: %s', location)
re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*')
stream = request.urlopen(url)
try:
content = stream.getheader('Content-Disposition') or ''
except AttributeError:
content = stream.headers.getheader('Content-Disposition') or ''
matches = re_content.match(content)
if matches:
filename = matches.group(2)
else:
parsed = parse.urlparse(stream.geturl())
filename = os.path.basename(parsed.path)
try:
os.makedirs(location)
except OSError as e:
if e.errno == errno.EEXIST and os.path.isdir(location):
pass
else:
raise
archive = os.path.join(location, filename)
with open(archive, 'wb') as out:
while True:
buf = stream.read(1024)
if not buf:
break
out.write(buf)
unpack(archive, location, log = log)
os.remove(archive)
possible = os.path.join(location, 'mingw64')
if not os.path.exists(possible):
possible = os.path.join(location, 'mingw32')
if not os.path.exists(possible):
raise ValueError('Failed to find unpacked MinGW: ' + possible)
return possible
def root(location = None, arch = None, version = None, threading = None,
exceptions = None, revision = None, log = EmptyLogger()):
'''
Returns the root folder of a specific version of the mingw-builds variant
of gcc. Will download the compiler if needed
'''
# Get the repository if we don't have all the information
if not (arch and version and threading and exceptions and revision):
versions = repository(log = log)
# Determine some defaults
version = version or max(versions.keys())
if not arch:
arch = platform.machine().lower()
if arch == 'x86':
arch = 'i686'
elif arch == 'amd64':
arch = 'x86_64'
if not threading:
keys = versions[version][arch].keys()
if 'posix' in keys:
threading = 'posix'
elif 'win32' in keys:
threading = 'win32'
else:
threading = keys[0]
if not exceptions:
keys = versions[version][arch][threading].keys()
if 'seh' in keys:
exceptions = 'seh'
elif 'sjlj' in keys:
exceptions = 'sjlj'
else:
exceptions = keys[0]
if revision == None:
revision = max(versions[version][arch][threading][exceptions].keys())
if not location:
location = os.path.join(tempfile.gettempdir(), 'mingw-builds')
# Get the download url
url = versions[version][arch][threading][exceptions][revision]
# Tell the user whatzzup
log.info('finding MinGW %s', '.'.join(str(v) for v in version))
log.debug(' - arch: %s', arch)
log.debug(' - threading: %s', threading)
log.debug(' - exceptions: %s', exceptions)
log.debug(' - revision: %s', revision)
log.debug(' - url: %s', url)
# Store each specific revision differently
slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}'
slug = slug.format(
version = '.'.join(str(v) for v in version),
arch = arch,
threading = threading,
exceptions = exceptions,
revision = revision
)
if arch == 'x86_64':
root_dir = os.path.join(location, slug, 'mingw64')
elif arch == 'i686':
root_dir = os.path.join(location, slug, 'mingw32')
else:
raise ValueError('Unknown MinGW arch: ' + arch)
# Download if needed
if not os.path.exists(root_dir):
downloaded = download(url, os.path.join(location, slug), log = log)
if downloaded != root_dir:
raise ValueError('The location of mingw did not match\n%s\n%s'
% (downloaded, root_dir))
return root_dir
def str2ver(string):
'''
Converts a version string into a tuple
'''
try:
version = tuple(int(v) for v in string.split('.'))
if len(version) != 3:
raise ValueError()
except ValueError:
raise argparse.ArgumentTypeError(
'please provide a three digit version string')
return version
def main():
'''
Invoked when the script is run directly by the python interpreter
'''
parser = argparse.ArgumentParser(
description = 'Downloads a specific version of MinGW',
formatter_class = argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('--location',
help = 'the location to download the compiler to',
default = os.path.join(tempfile.gettempdir(), 'mingw-builds'))
parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'],
help = 'the target MinGW architecture string')
parser.add_argument('--version', type = str2ver,
help = 'the version of GCC to download')
parser.add_argument('--threading', choices = ['posix', 'win32'],
help = 'the threading type of the compiler')
parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'],
help = 'the method to throw exceptions')
parser.add_argument('--revision', type=int,
help = 'the revision of the MinGW release')
group = parser.add_mutually_exclusive_group()
group.add_argument('-v', '--verbose', action='store_true',
help='increase the script output verbosity')
group.add_argument('-q', '--quiet', action='store_true',
help='only print errors and warnings')
args = parser.parse_args()
# Create the logger
logger = logging.getLogger('mingw')
handler = logging.StreamHandler()
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
if args.quiet:
logger.setLevel(logging.WARN)
if args.verbose:
logger.setLevel(logging.DEBUG)
# Get MinGW
root_dir = root(location = args.location, arch = args.arch,
version = args.version, threading = args.threading,
exceptions = args.exceptions, revision = args.revision,
log = logger)
sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin'))
if __name__ == '__main__':
try:
main()
except IOError as e:
sys.stderr.write('IO error: %s\n' % e)
sys.exit(1)
except OSError as e:
sys.stderr.write('OS error: %s\n' % e)
sys.exit(1)
except KeyboardInterrupt as e:
sys.stderr.write('Killed\n')
sys.exit(1)

View File

@ -21,24 +21,55 @@ set_target_properties(benchmark PROPERTIES
# Link threads.
target_link_libraries(benchmark ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
find_library(LIBRT rt)
if(LIBRT)
target_link_libraries(benchmark ${LIBRT})
endif()
# We need extra libraries on Windows
if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
target_link_libraries(benchmark Shlwapi)
endif()
# Expose public API
target_include_directories(benchmark PUBLIC ${PROJECT_SOURCE_DIR}/include)
set(include_install_dir "include")
set(lib_install_dir "lib/")
set(bin_install_dir "bin/")
set(config_install_dir "lib/cmake/${PROJECT_NAME}")
set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
set(targets_export_name "${PROJECT_NAME}Targets")
set(namespace "${PROJECT_NAME}::")
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
"${version_config}" VERSION ${GIT_VERSION} COMPATIBILITY SameMajorVersion
)
configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
# Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
install(
TARGETS benchmark
ARCHIVE DESTINATION lib
LIBRARY DESTINATION lib
RUNTIME DESTINATION bin
COMPONENT library)
EXPORT ${targets_export_name}
ARCHIVE DESTINATION ${lib_install_dir}
LIBRARY DESTINATION ${lib_install_dir}
RUNTIME DESTINATION ${bin_install_dir}
INCLUDES DESTINATION ${include_install_dir})
install(
DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
DESTINATION include
DESTINATION ${include_install_dir}
FILES_MATCHING PATTERN "*.*h")
install(
FILES "${project_config}" "${version_config}"
DESTINATION "${config_install_dir}")
install(
EXPORT "${targets_export_name}"
NAMESPACE "${namespace}"
DESTINATION "${config_install_dir}")

View File

@ -37,6 +37,7 @@
#include "colorprint.h"
#include "commandlineflags.h"
#include "complexity.h"
#include "counter.h"
#include "log.h"
#include "mutex.h"
#include "re.h"
@ -145,6 +146,7 @@ class ThreadManager {
std::string report_label_;
std::string error_message_;
bool has_error_ = false;
UserCounters counters;
};
GUARDED_BY(GetBenchmarkMutex()) Result results;
@ -249,6 +251,7 @@ BenchmarkReporter::Run CreateRunReport(
report.complexity_n = results.complexity_n;
report.complexity = b.complexity;
report.complexity_lambda = b.complexity_lambda;
report.counters = results.counters;
}
return report;
}
@ -272,6 +275,7 @@ void RunInThread(const benchmark::internal::Benchmark::Instance* b,
results.bytes_processed += st.bytes_processed();
results.items_processed += st.items_processed();
results.complexity_n += st.complexity_length_n();
internal::Increment(&results.counters, st.counters);
}
manager->NotifyThreadComplete();
}
@ -281,7 +285,8 @@ std::vector<BenchmarkReporter::Run> RunBenchmark(
std::vector<BenchmarkReporter::Run>* complexity_reports) {
std::vector<BenchmarkReporter::Run> reports; // return value
size_t iters = 1;
const bool has_explicit_iteration_count = b.iterations != 0;
size_t iters = has_explicit_iteration_count ? b.iterations : 1;
std::unique_ptr<internal::ThreadManager> manager;
std::vector<std::thread> pool(b.threads - 1);
const int repeats =
@ -291,7 +296,7 @@ std::vector<BenchmarkReporter::Run> RunBenchmark(
(b.report_mode == internal::RM_Unspecified
? FLAGS_benchmark_report_aggregates_only
: b.report_mode == internal::RM_ReportAggregatesOnly);
for (int i = 0; i < repeats; i++) {
for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
for (;;) {
// Try benchmark
VLOG(2) << "Running " << b.name << " for " << iters << "\n";
@ -327,10 +332,20 @@ std::vector<BenchmarkReporter::Run> RunBenchmark(
const double min_time =
!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time;
// If this was the first run, was elapsed time or cpu time large enough?
// If this is not the first run, go with the current value of iter.
if ((i > 0) || results.has_error_ || (iters >= kMaxIterations) ||
(seconds >= min_time) || (results.real_time_used >= 5 * min_time)) {
// Determine whether this run should be reported: either it has
// run for a sufficient amount of time, or an error was reported.
const bool should_report = repetition_num > 0
|| has_explicit_iteration_count // An exact iteration count was requested
|| results.has_error_
|| iters >= kMaxIterations
|| seconds >= min_time // the elapsed time is large enough
// CPU time is specified but the elapsed real time greatly exceeds the
// minimum time. Note that user-provided timers are exempt from this
// sanity check.
|| ((results.real_time_used >= 5 * min_time) && !b.use_manual_time);
if (should_report) {
BenchmarkReporter::Run report =
CreateRunReport(b, results, iters, seconds);
if (!report.error_occurred && b.complexity != oNone)
@ -386,6 +401,7 @@ State::State(size_t max_iters, const std::vector<int>& ranges, int thread_i,
items_processed_(0),
complexity_n_(0),
error_occurred_(false),
counters(),
thread_index(thread_i),
threads(n_threads),
max_iterations(max_iters),
@ -634,7 +650,7 @@ void ParseCommandLineFlags(int* argc, char** argv) {
// TODO: Remove this.
ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) ||
ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];
for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
--(*argc);
--i;
@ -664,4 +680,11 @@ void Initialize(int* argc, char** argv) {
internal::LogLevel() = FLAGS_v;
}
bool ReportUnrecognizedArguments(int argc, char** argv) {
for (int i = 1; i < argc; ++i) {
fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0], argv[i]);
}
return argc > 1;
}
} // end namespace benchmark

View File

@ -24,9 +24,11 @@ struct Benchmark::Instance {
bool use_manual_time;
BigO complexity;
BigOFunc* complexity_lambda;
UserCounters counters;
bool last_benchmark_instance;
int repetitions;
double min_time;
size_t iterations;
int threads; // Number of concurrent threads to use
};

View File

@ -143,6 +143,7 @@ bool BenchmarkFamilies::FindBenchmarks(
instance.time_unit = family->time_unit_;
instance.range_multiplier = family->range_multiplier_;
instance.min_time = family->min_time_;
instance.iterations = family->iterations_;
instance.repetitions = family->repetitions_;
instance.use_real_time = family->use_real_time_;
instance.use_manual_time = family->use_manual_time_;
@ -163,16 +164,17 @@ bool BenchmarkFamilies::FindBenchmarks(
}
}
AppendHumanReadable(arg, &instance.name);
instance.name += std::to_string(arg);
++arg_i;
}
if (!IsZero(family->min_time_)) {
if (!IsZero(family->min_time_))
instance.name += StringPrintF("/min_time:%0.3f", family->min_time_);
}
if (family->repetitions_ != 0) {
if (family->iterations_ != 0)
instance.name += StringPrintF("/iterations:%d", family->iterations_);
if (family->repetitions_ != 0)
instance.name += StringPrintF("/repeats:%d", family->repetitions_);
}
if (family->use_manual_time_) {
instance.name += "/manual_time";
} else if (family->use_real_time_) {
@ -219,6 +221,7 @@ Benchmark::Benchmark(const char* name)
time_unit_(kNanosecond),
range_multiplier_(kRangeMultiplier),
min_time_(0),
iterations_(0),
repetitions_(0),
use_real_time_(false),
use_manual_time_(false),
@ -344,6 +347,22 @@ Benchmark* Benchmark::RangeMultiplier(int multiplier) {
return this;
}
Benchmark* Benchmark::MinTime(double t) {
CHECK(t > 0.0);
CHECK(iterations_ == 0);
min_time_ = t;
return this;
}
Benchmark* Benchmark::Iterations(size_t n) {
CHECK(n > 0);
CHECK(IsZero(min_time_));
iterations_ = n;
return this;
}
Benchmark* Benchmark::Repetitions(int n) {
CHECK(n > 0);
repetitions_ = n;
@ -355,12 +374,6 @@ Benchmark* Benchmark::ReportAggregatesOnly(bool value) {
return this;
}
Benchmark* Benchmark::MinTime(double t) {
CHECK(t > 0.0);
min_time_ = t;
return this;
}
Benchmark* Benchmark::UseRealTime() {
CHECK(!use_manual_time_)
<< "Cannot set UseRealTime and UseManualTime simultaneously.";

View File

@ -171,6 +171,22 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
// All repetitions should be run with the same number of iterations so we
// can take this information from the first benchmark.
int64_t const run_iterations = reports.front().iterations;
// create stats for user counters
struct CounterStat {
Counter c;
Stat1_d s;
};
std::map< std::string, CounterStat > counter_stats;
for(Run const& r : reports) {
for(auto const& cnt : r.counters) {
auto it = counter_stats.find(cnt.first);
if(it == counter_stats.end()) {
counter_stats.insert({cnt.first, {cnt.second, Stat1_d{}}});
} else {
CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
}
}
}
// Populate the accumulators.
for (Run const& run : reports) {
@ -183,6 +199,12 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
Stat1_d(run.cpu_accumulated_time / run.iterations, run.iterations);
items_per_second_stat += Stat1_d(run.items_per_second, run.iterations);
bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations);
// user counters
for(auto const& cnt : run.counters) {
auto it = counter_stats.find(cnt.first);
CHECK_NE(it, counter_stats.end());
it->second.s += Stat1_d(cnt.second, run.iterations);
}
}
// Get the data from the accumulator to BenchmarkReporter::Run's.
@ -196,6 +218,11 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
mean_data.bytes_per_second = bytes_per_second_stat.Mean();
mean_data.items_per_second = items_per_second_stat.Mean();
mean_data.time_unit = reports[0].time_unit;
// user counters
for(auto const& kv : counter_stats) {
auto c = Counter(kv.second.s.Mean(), counter_stats[kv.first].c.flags);
mean_data.counters[kv.first] = c;
}
// Only add label to mean/stddev if it is same for all runs
mean_data.report_label = reports[0].report_label;
@ -215,6 +242,11 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
stddev_data.bytes_per_second = bytes_per_second_stat.StdDev();
stddev_data.items_per_second = items_per_second_stat.StdDev();
stddev_data.time_unit = reports[0].time_unit;
// user counters
for(auto const& kv : counter_stats) {
auto c = Counter(kv.second.s.StdDev(), counter_stats[kv.first].c.flags);
stddev_data.counters[kv.first] = c;
}
results.push_back(mean_data);
results.push_back(stddev_data);
@ -263,6 +295,11 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
big_o.report_big_o = true;
big_o.complexity = result_cpu.complexity;
// All the time results are reported after being multiplied by the
// time unit multiplier. But since RMS is a relative quantity it
// should not be multiplied at all. So, here, we _divide_ it by the
// multiplier so that when it is multiplied later the result is the
// correct one.
double multiplier = GetTimeUnitMultiplier(reports[0].time_unit);
// Only add label to mean/stddev if it is same for all runs
@ -275,6 +312,9 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
rms.cpu_accumulated_time = result_cpu.rms / multiplier;
rms.report_rms = true;
rms.complexity = result_cpu.complexity;
// don't forget to keep the time unit, or we won't be able to
// recover the correct value.
rms.time_unit = reports[0].time_unit;
results.push_back(big_o);
results.push_back(rms);

View File

@ -14,6 +14,7 @@
#include "benchmark/reporter.h"
#include "complexity.h"
#include "counter.h"
#include <algorithm>
#include <cstdint>
@ -34,6 +35,7 @@ namespace benchmark {
bool ConsoleReporter::ReportContext(const Context& context) {
name_field_width_ = context.name_field_width;
printed_header_ = false;
PrintBasicContext(&GetErrorStream(), context);
@ -45,16 +47,32 @@ bool ConsoleReporter::ReportContext(const Context& context) {
color_output_ = false;
}
#endif
std::string str =
FormatString("%-*s %13s %13s %10s\n", static_cast<int>(name_field_width_),
"Benchmark", "Time", "CPU", "Iterations");
GetOutputStream() << str << std::string(str.length() - 1, '-') << "\n";
return true;
}
void ConsoleReporter::PrintHeader(const Run& run) {
std::string str =
FormatString("%-*s %13s %13s %10s\n", static_cast<int>(name_field_width_),
"Benchmark", "Time", "CPU", "Iterations");
if(!run.counters.empty()) {
str += " UserCounters...";
}
std::string line = std::string(str.length(), '-');
GetOutputStream() << line << "\n" << str << line << "\n";
}
void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
for (const auto& run : reports) PrintRunData(run);
for (const auto& run : reports) {
// print the header if none was printed yet
if (!printed_header_) {
printed_header_ = true;
PrintHeader(run);
}
// As an alternative to printing the headers like this, we could sort
// the benchmarks by header and then print like that.
PrintRunData(run);
}
}
static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
@ -114,6 +132,11 @@ void ConsoleReporter::PrintRunData(const Run& result) {
printer(Out, COLOR_CYAN, "%10lld", result.iterations);
}
for (auto& c : result.counters) {
auto const& s = HumanReadableNumber(c.second.value);
printer(Out, COLOR_DEFAULT, " %s=%s", c.first.c_str(), s.c_str());
}
if (!rate.empty()) {
printer(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str());
}

View File

@ -0,0 +1,68 @@
// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "counter.h"
namespace benchmark {
namespace internal {
double Finish(Counter const& c, double cpu_time, double num_threads) {
double v = c.value;
if (c.flags & Counter::kIsRate) {
v /= cpu_time;
}
if (c.flags & Counter::kAvgThreads) {
v /= num_threads;
}
return v;
}
void Finish(UserCounters *l, double cpu_time, double num_threads) {
for (auto &c : *l) {
c.second = Finish(c.second, cpu_time, num_threads);
}
}
void Increment(UserCounters *l, UserCounters const& r) {
// add counters present in both or just in *l
for (auto &c : *l) {
auto it = r.find(c.first);
if (it != r.end()) {
c.second = c.second + it->second;
}
}
// add counters present in r, but not in *l
for (auto const &tc : r) {
auto it = l->find(tc.first);
if (it == l->end()) {
(*l)[tc.first] = tc.second;
}
}
}
bool SameNames(UserCounters const& l, UserCounters const& r) {
if (&l == &r) return true;
if (l.size() != r.size()) {
return false;
}
for (auto const& c : l) {
if ( r.find(c.first) == r.end()) {
return false;
}
}
return true;
}
} // end namespace internal
} // end namespace benchmark
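A worked sketch of the `Finish` arithmetic above, with hypothetical inputs:
```c++
// Given a counter value of 10.0, cpu_time = 2.0 (seconds), num_threads = 4:
//   kIsRate:         10.0 / 2.0     = 5.0   (events per second)
//   kAvgThreads:     10.0 / 4       = 2.5   (events per thread)
//   kAvgThreadsRate: 10.0 / 2.0 / 4 = 1.25  (events per second per thread)
```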

View File

@ -0,0 +1,26 @@
// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "benchmark/benchmark_api.h"
namespace benchmark {
// these counter-related functions are hidden to reduce API surface.
namespace internal {
void Finish(UserCounters *l, double time, double num_threads);
void Increment(UserCounters *l, UserCounters const& r);
bool SameNames(UserCounters const& l, UserCounters const& r);
} // end namespace internal
} //end namespace benchmark

View File

@ -24,6 +24,7 @@
#include "string_util.h"
#include "timers.h"
#include "check.h"
// File format reference: http://edoceo.com/utilitas/csv-file-format.
@ -38,21 +39,51 @@ std::vector<std::string> elements = {
bool CSVReporter::ReportContext(const Context& context) {
PrintBasicContext(&GetErrorStream(), context);
std::ostream& Out = GetOutputStream();
for (auto B = elements.begin(); B != elements.end();) {
Out << *B++;
if (B != elements.end()) Out << ",";
}
Out << "\n";
return true;
}
void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
for (const auto& run : reports) PrintRunData(run);
void CSVReporter::ReportRuns(const std::vector<Run> & reports) {
std::ostream& Out = GetOutputStream();
if (!printed_header_) {
// save the names of all the user counters
for (const auto& run : reports) {
for (const auto& cnt : run.counters) {
user_counter_names_.insert(cnt.first);
}
}
// print the header
for (auto B = elements.begin(); B != elements.end();) {
Out << *B++;
if (B != elements.end()) Out << ",";
}
for (auto B = user_counter_names_.begin(); B != user_counter_names_.end();) {
Out << ",\"" << *B++ << "\"";
}
Out << "\n";
printed_header_ = true;
} else {
// check that all the current counters are saved in the name set
for (const auto& run : reports) {
for (const auto& cnt : run.counters) {
CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
<< "All counters must be present in each run. "
<< "Counter named \"" << cnt.first
<< "\" was not in a run after being added to the header";
}
}
}
// print results for each run
for (const auto& run : reports) {
PrintRunData(run);
}
}
void CSVReporter::PrintRunData(const Run& run) {
void CSVReporter::PrintRunData(const Run & run) {
std::ostream& Out = GetOutputStream();
// Field with embedded double-quote characters must be doubled and the field
@ -102,6 +133,13 @@ void CSVReporter::PrintRunData(const Run& run) {
Out << "\"" << label << "\"";
}
Out << ",,"; // for error_occurred and error_message
// Print user counters
for (const auto &ucn : user_counter_names_) {
auto it = run.counters.find(ucn);
CHECK(it != run.counters.end());
Out << "," << it->second;
}
Out << '\n';
}

View File

@ -43,6 +43,11 @@ extern "C" uint64_t __rdtsc();
#ifndef BENCHMARK_OS_WINDOWS
#include <sys/time.h>
#include <time.h>
#endif
#ifdef BENCHMARK_OS_EMSCRIPTEN
#include <emscripten.h>
#endif
namespace benchmark {
@ -65,6 +70,10 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
// counter pauses; it does not continue counting, nor does it
// reset to zero.
return mach_absolute_time();
#elif defined(BENCHMARK_OS_EMSCRIPTEN)
// this goes above x86-specific code because old versions of Emscripten
// define __x86_64__, although they have nothing to do with it.
return static_cast<int64_t>(emscripten_get_now() * 1e+6);
#elif defined(__i386__)
int64_t ret;
__asm__ volatile("rdtsc" : "=A"(ret));
@ -79,7 +88,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
asm("mftbu %0" : "=r"(tbu0));
asm("mftb %0" : "=r"(tbl));
asm("mftbu %0" : "=r"(tbu1));
tbl &= -static_cast<int64>(tbu0 == tbu1);
tbl &= -static_cast<int64_t>(tbu0 == tbu1);
// high 32 bits in tbu1; low 32 bits in tbl (tbu0 is garbage)
return (tbu1 << 32) | tbl;
#elif defined(__sparc__)
@ -99,6 +108,22 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
_asm rdtsc
#elif defined(COMPILER_MSVC)
return __rdtsc();
#elif defined(BENCHMARK_OS_NACL)
// Native Client validator on x86/x86-64 allows RDTSC instructions,
// and this case is handled above. Native Client validator on ARM
// rejects MRC instructions (used in the ARM-specific sequence below),
// so we handle it here. Portable Native Client compiles to
// architecture-agnostic bytecode, which doesn't provide any
// cycle counter access mnemonics.
// Native Client does not provide any API to access the cycle counter.
// Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
// because it provides nanosecond resolution (which is noticeable at
// least for PNaCl modules running on x86 Mac & Linux).
// Initialize to always return 0 if clock_gettime fails.
struct timespec ts = { 0, 0 };
clock_gettime(CLOCK_MONOTONIC, &ts);
return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
#elif defined(__aarch64__)
// System timer of ARMv8 runs at a different frequency than the CPU's.
// The frequency is fixed, typically in the range 1-50MHz. It can be
@ -108,7 +133,9 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
return virtual_timer_value;
#elif defined(__ARM_ARCH)
#if (__ARM_ARCH >= 6) // V6 is the earliest arch that has a standard cyclecount
// V6 is the earliest arch that has a standard cyclecount
// Native Client validator doesn't allow MRC instructions.
#if (__ARM_ARCH >= 6)
uint32_t pmccntr;
uint32_t pmuseren;
uint32_t pmcntenset;

View File

@ -30,13 +30,26 @@
#elif defined(_WIN32)
#define BENCHMARK_OS_WINDOWS 1
#elif defined(__APPLE__)
// TODO(ericwf) This doesn't actually check that it is a Mac OSX system. Just
// that it is an apple system.
#define BENCHMARK_OS_MACOSX 1
#include "TargetConditionals.h"
#if defined(TARGET_OS_MAC)
#define BENCHMARK_OS_MACOSX 1
#if defined(TARGET_OS_IPHONE)
#define BENCHMARK_OS_IOS 1
#endif
#endif
#elif defined(__FreeBSD__)
#define BENCHMARK_OS_FREEBSD 1
#elif defined(__linux__)
#define BENCHMARK_OS_LINUX 1
#elif defined(__native_client__)
#define BENCHMARK_OS_NACL 1
#elif defined(EMSCRIPTEN)
#define BENCHMARK_OS_EMSCRIPTEN 1
#endif
#if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \
&& !defined(__EXCEPTIONS)
#define BENCHMARK_HAS_NO_EXCEPTIONS
#endif
#endif // BENCHMARK_INTERNAL_MACROS_H_

View File

@ -154,10 +154,15 @@ void JSONReporter::PrintRunData(Run const& run) {
<< indent
<< FormatKV("items_per_second", RoundDouble(run.items_per_second));
}
for(auto &c : run.counters) {
out << ",\n"
<< indent
<< FormatKV(c.first, RoundDouble(c.second));
}
if (!run.report_label.empty()) {
out << ",\n" << indent << FormatKV("label", run.report_label);
}
out << '\n';
}
} // end namespace benchmark
} // end namespace benchmark

View File

@ -15,6 +15,15 @@
#ifndef BENCHMARK_RE_H_
#define BENCHMARK_RE_H_
#include "internal_macros.h"
// Prefer C regex libraries when compiling w/o exceptions so that we can
// correctly report errors.
#if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && defined(HAVE_STD_REGEX) && \
(defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX))
#undef HAVE_STD_REGEX
#endif
#if defined(HAVE_STD_REGEX)
#include <regex>
#elif defined(HAVE_GNU_POSIX_REGEX)
@ -62,15 +71,20 @@ class Regex {
#if defined(HAVE_STD_REGEX)
inline bool Regex::Init(const std::string& spec, std::string* error) {
#ifdef BENCHMARK_HAS_NO_EXCEPTIONS
((void)error); // suppress unused warning
#else
try {
#endif
re_ = std::regex(spec, std::regex_constants::extended);
init_ = true;
#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
} catch (const std::regex_error& e) {
if (error) {
*error = e.what();
}
}
#endif
return init_;
}

View File

@ -15,6 +15,7 @@
#include "sleep.h"
#include <cerrno>
#include <cstdlib>
#include <ctime>
#include "internal_macros.h"
@ -40,7 +41,7 @@ void SleepForMicroseconds(int microseconds) {
}
void SleepForMilliseconds(int milliseconds) {
SleepForMicroseconds(static_cast<int>(milliseconds) * kNumMicrosPerMilli);
SleepForMicroseconds(milliseconds * kNumMicrosPerMilli);
}
void SleepForSeconds(double seconds) {

View File

@ -1,14 +1,12 @@
#ifndef BENCHMARK_SLEEP_H_
#define BENCHMARK_SLEEP_H_
#include <cstdint>
namespace benchmark {
const int64_t kNumMillisPerSecond = 1000LL;
const int64_t kNumMicrosPerMilli = 1000LL;
const int64_t kNumMicrosPerSecond = kNumMillisPerSecond * 1000LL;
const int64_t kNumNanosPerMicro = 1000LL;
const int64_t kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
const int kNumMillisPerSecond = 1000;
const int kNumMicrosPerMilli = 1000;
const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000;
const int kNumNanosPerMicro = 1000;
const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
void SleepForMilliseconds(int milliseconds);
void SleepForSeconds(double seconds);

View File

@ -45,6 +45,8 @@ void ToExponentAndMantissa(double val, double thresh, int precision,
std::max(thresh, 1.0 / std::pow(10.0, precision));
const double big_threshold = adjusted_threshold * one_k;
const double small_threshold = adjusted_threshold;
// Values in (simple_threshold, small_threshold) will be printed as-is
const double simple_threshold = 0.01;
if (val > big_threshold) {
// Positive powers
@ -62,14 +64,16 @@ void ToExponentAndMantissa(double val, double thresh, int precision,
*exponent = 0;
} else if (val < small_threshold) {
// Negative powers
double scaled = val;
for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) {
scaled *= one_k;
if (scaled >= small_threshold) {
mantissa_stream << scaled;
*exponent = -static_cast<int64_t>(i + 1);
*mantissa = mantissa_stream.str();
return;
if (val < simple_threshold) {
double scaled = val;
for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) {
scaled *= one_k;
if (scaled >= small_threshold) {
mantissa_stream << scaled;
*exponent = -static_cast<int64_t>(i + 1);
*mantissa = mantissa_stream.str();
return;
}
}
}
mantissa_stream << val;

View File

@ -75,7 +75,9 @@ bool ReadIntFromFile(const char* file, long* value) {
char line[1024];
char* err;
memset(line, '\0', sizeof(line));
CHECK(read(fd, line, sizeof(line) - 1));
ssize_t read_err = read(fd, line, sizeof(line) - 1);
((void)read_err); // prevent unused warning
CHECK(read_err >= 0);
const long temp_value = strtol(line, &err, 10);
if (line[0] != '\0' && (*err == '\n' || *err == '\0')) {
*value = temp_value;
@ -295,8 +297,13 @@ void InitializeSystemInfo() {
(size == sizeof(cpu_freq))) {
cpuinfo_cycles_per_second = cpu_freq;
} else {
#if defined BENCHMARK_OS_IOS
fprintf(stderr, "CPU frequency cannot be detected. \n");
cpuinfo_cycles_per_second = 0;
#else
fprintf(stderr, "%s\n", strerror(errno));
std::exit(EXIT_FAILURE);
#endif
}
#else
// Generic cycles per second counter

View File

@ -35,6 +35,10 @@
#endif
#endif
#ifdef BENCHMARK_OS_EMSCRIPTEN
#include <emscripten.h>
#endif
#include <cerrno>
#include <cstdint>
#include <cstdio>
@ -100,14 +104,7 @@ BENCHMARK_NORETURN static void DiagnoseAndExit(const char* msg) {
} // end namespace
double ProcessCPUUsage() {
// FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
// https://github.com/google/benchmark/pull/292
#if defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
struct timespec spec;
if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
return MakeTime(spec);
DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
#elif defined(BENCHMARK_OS_WINDOWS)
#if defined(BENCHMARK_OS_WINDOWS)
HANDLE proc = GetCurrentProcess();
FILETIME creation_time;
FILETIME exit_time;
@ -117,21 +114,28 @@ double ProcessCPUUsage() {
&user_time))
return MakeTime(kernel_time, user_time);
DiagnoseAndExit("GetProccessTimes() failed");
#elif defined(BENCHMARK_OS_EMSCRIPTEN)
// clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten.
// Use Emscripten-specific API. Reported CPU time would be exactly the
// same as total time, but this is ok because there aren't long-latency
// synchronous system calls in Emscripten.
return emscripten_get_now() * 1e-3;
#elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
// FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
// https://github.com/google/benchmark/pull/292
struct timespec spec;
if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
return MakeTime(spec);
DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
#else
struct rusage ru;
if (getrusage(RUSAGE_SELF, &ru) == 0) return MakeTime(ru);
DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
DiagnoseAndExit("getrusage(RUSAGE_SELF, ...) failed");
#endif
}
double ThreadCPUUsage() {
// FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
// https://github.com/google/benchmark/pull/292
#if defined(CLOCK_THREAD_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
struct timespec ts;
if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts);
DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
#elif defined(BENCHMARK_OS_WINDOWS)
#if defined(BENCHMARK_OS_WINDOWS)
HANDLE this_thread = GetCurrentThread();
FILETIME creation_time;
FILETIME exit_time;
@ -141,6 +145,8 @@ double ThreadCPUUsage() {
&user_time);
return MakeTime(kernel_time, user_time);
#elif defined(BENCHMARK_OS_MACOSX)
// FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
// https://github.com/google/benchmark/pull/292
mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
thread_basic_info_data_t info;
mach_port_t thread = pthread_mach_thread_np(pthread_self());
@ -149,6 +155,13 @@ double ThreadCPUUsage() {
return MakeTime(info);
}
DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
#elif defined(BENCHMARK_OS_EMSCRIPTEN)
// Emscripten doesn't support traditional threads
return ProcessCPUUsage();
#elif defined(CLOCK_THREAD_CPUTIME_ID)
struct timespec ts;
if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts);
DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
#else
#error Per-thread timing is not available on your system.
#endif

View File

@ -2,6 +2,25 @@
find_package(Threads REQUIRED)
# NOTE: Some tests use `<cassert>` to perform the test. Therefore we must
# strip -DNDEBUG from the default CMake flags in non-Debug modes.
string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
add_definitions( -UNDEBUG )
add_definitions(-DTEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS)
# Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines.
foreach (flags_var_to_scrub
CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS_MINSIZEREL
CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS_MINSIZEREL)
string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " "
"${flags_var_to_scrub}" "${${flags_var_to_scrub}}")
endforeach()
endif()
# NOTE: These flags must be added after find_package(Threads REQUIRED),
# otherwise they will break the configuration check.
if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)

View File

@ -150,7 +150,7 @@ static void BM_LongTest(benchmark::State& state) {
BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
static void BM_ParallelMemset(benchmark::State& state) {
int size = state.range(0) / sizeof(int);
int size = state.range(0) / static_cast<int>(sizeof(int));
int thread_size = size / state.threads;
int from = thread_size * state.thread_index;
int to = from + thread_size;
@ -209,11 +209,27 @@ BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test, std::string("abc"),
std::pair<int, double>(42, 3.8));
void BM_non_template_args(benchmark::State& state, int, double) {
while (state.KeepRunning()) {
}
while(state.KeepRunning()) {}
}
BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
static void BM_UserCounter(benchmark::State& state) {
static const int depth = 1024;
while (state.KeepRunning()) {
benchmark::DoNotOptimize(CalculatePi(depth));
}
state.counters["Foo"] = 1;
state.counters["Bar"] = 2;
state.counters["Baz"] = 3;
state.counters["Bat"] = 5;
#ifdef BENCHMARK_HAS_CXX11
state.counters.insert({{"Foo", 2}, {"Bar", 3}, {"Baz", 5}, {"Bat", 6}});
#endif
}
BENCHMARK(BM_UserCounter)->Threads(8);
BENCHMARK(BM_UserCounter)->ThreadRange(1, 32);
BENCHMARK(BM_UserCounter)->ThreadPerCpu();
#endif // __cplusplus >= 201103L
static void BM_DenseThreadRanges(benchmark::State& st) {
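A hedged sketch of how the new user counters can be consumed downstream, assuming they are emitted as extra per-benchmark keys in the JSON output (the file name counters.json is hypothetical):

import json

with open('counters.json') as f:  # hypothetical --benchmark_out file
    for b in json.load(f)['benchmarks']:
        # 'Foo' and 'Bar' are the counter names set in BM_UserCounter above.
        print(b['name'], b.get('Foo'), b.get('Bar'))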

View File

@ -39,4 +39,10 @@ void BM_template1(benchmark::State& state) {
BENCHMARK_TEMPLATE(BM_template1, long);
BENCHMARK_TEMPLATE1(BM_template1, int);
void BM_counters(benchmark::State& state) {
BM_empty(state);
state.counters["Foo"] = 2;
}
BENCHMARK(BM_counters);
BENCHMARK_MAIN()

View File

@ -26,7 +26,7 @@ void TestHandler() {
}
void try_invalid_pause_resume(benchmark::State& state) {
#if !defined(NDEBUG) && !defined(TEST_HAS_NO_EXCEPTIONS)
#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && !defined(TEST_HAS_NO_EXCEPTIONS)
try {
state.PauseTiming();
std::abort();

View File

@ -1,8 +1,12 @@
#include "benchmark/benchmark_api.h"
#include <chrono>
#include <thread>
#if defined(NDEBUG)
#undef NDEBUG
#endif
#include <cassert>
void BM_basic(benchmark::State& state) {
while (state.KeepRunning()) {
}
@ -40,4 +44,22 @@ void CustomArgs(benchmark::internal::Benchmark* b) {
BENCHMARK(BM_basic)->Apply(CustomArgs);
void BM_explicit_iteration_count(benchmark::State& st) {
// Test that benchmarks specified with an explicit iteration count are
// only run once.
static bool invoked_before = false;
assert(!invoked_before);
invoked_before = true;
// Test that the requested iteration count is respected.
assert(st.max_iterations == 42);
size_t actual_iterations = 0;
while (st.KeepRunning())
++actual_iterations;
assert(st.iterations() == st.max_iterations);
assert(st.iterations() == 42);
}
BENCHMARK(BM_explicit_iteration_count)->Iterations(42);
BENCHMARK_MAIN()

View File

@ -31,7 +31,7 @@ TestCaseList& GetTestCaseList(TestCaseID ID) {
SubMap& GetSubstitutions() {
// Don't use 'dec_re' from header because it may not yet be initialized.
static std::string dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
static SubMap map = {
{"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"},
{"%int", "[ ]*[0-9]+"},
@ -39,13 +39,13 @@ SubMap& GetSubstitutions() {
{"%time", "[ ]*[0-9]{1,5} ns"},
{"%console_report", "[ ]*[0-9]{1,5} ns [ ]*[0-9]{1,5} ns [ ]*[0-9]+"},
{"%console_us_report", "[ ]*[0-9] us [ ]*[0-9] us [ ]*[0-9]+"},
{"%csv_report", "[0-9]+," + dec_re + "," + dec_re + ",ns,,,,,"},
{"%csv_us_report", "[0-9]+," + dec_re + "," + dec_re + ",us,,,,,"},
{"%csv_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,,,"},
{"%csv_us_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",us,,,,,"},
{"%csv_bytes_report",
"[0-9]+," + dec_re + "," + dec_re + ",ns," + dec_re + ",,,,"},
"[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + ",,,,"},
{"%csv_items_report",
"[0-9]+," + dec_re + "," + dec_re + ",ns,," + dec_re + ",,,"},
{"%csv_label_report_begin", "[0-9]+," + dec_re + "," + dec_re + ",ns,,,"},
"[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,," + safe_dec_re + ",,,"},
{"%csv_label_report_begin", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,"},
{"%csv_label_report_end", ",,"}};
return map;
}
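A minimal sketch of what one of these substitutions matches, reusing the same pattern text (the sample report line is illustrative):

import re

dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"
csv_report = "[0-9]+," + dec_re + "," + dec_re + ",ns,,,,,"
# A typical CSV report line: name, iterations, real time, CPU time, unit.
assert re.search(csv_report, "BM_foo,1000,42.1,41.9,ns,,,,,")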

View File

@ -9,8 +9,10 @@
// ---------------------- Testing Prologue Output -------------------------- //
// ========================================================================= //
ADD_CASES(TC_ConsoleOut, {{"^Benchmark %s Time %s CPU %s Iterations$", MR_Next},
{"^[-]+$", MR_Next}});
ADD_CASES(TC_ConsoleOut,
{{"^[-]+$", MR_Next},
{"^Benchmark %s Time %s CPU %s Iterations$", MR_Next},
{"^[-]+$", MR_Next}});
ADD_CASES(TC_CSVOut,
{{"name,iterations,real_time,cpu_time,time_unit,bytes_per_second,"
"items_per_second,label,error_occurred,error_message"}});

View File

@ -3,25 +3,63 @@
compare_bench.py - Compare two benchmarks or their results and report the
difference.
"""
import argparse
from argparse import ArgumentParser
import sys
import gbench
from gbench import util, report
from gbench.util import *
def check_inputs(in1, in2, flags):
"""
Perform checking on the user-provided inputs and diagnose any abnormalities
"""
in1_kind, in1_err = classify_input_file(in1)
in2_kind, in2_err = classify_input_file(in2)
output_file = find_benchmark_flag('--benchmark_out=', flags)
output_type = find_benchmark_flag('--benchmark_out_format=', flags)
if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
print(("WARNING: '--benchmark_out=%s' will be passed to both "
"benchmarks causing it to be overwritten") % output_file)
if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
print("WARNING: passing --benchmark flags has no effect since both "
"inputs are JSON")
if output_type is not None and output_type != 'json':
print(("ERROR: passing '--benchmark_out_format=%s' to 'compare_bench.py`"
" is not supported.") % output_type)
sys.exit(1)
def main():
parser = ArgumentParser(
description='compare the results of two benchmarks')
parser.add_argument(
'test1', metavar='test1', type=str, nargs=1,
help='A benchmark executable or JSON output file')
parser.add_argument(
'test2', metavar='test2', type=str, nargs=1,
help='A benchmark executable or JSON output file')
# FIXME this is a dummy argument which will never actually match
# any --benchmark flags, but it helps generate a better usage message
parser.add_argument(
'benchmark_options', metavar='benchmark_option', nargs='*',
help='Arguments to pass when running benchmark executables'
)
args, unknown_args = parser.parse_known_args()
# Parse the command line flags
def usage():
print('compare_bench.py <test1> <test2> [benchmark options]...')
test1 = args.test1[0]
test2 = args.test2[0]
if args.benchmark_options:
print("Unrecognized positional argument arguments: '%s'"
% args.benchmark_options)
exit(1)
if '--help' in sys.argv or len(sys.argv) < 3:
usage()
tests = sys.argv[1:3]
bench_opts = sys.argv[3:]
bench_opts = list(bench_opts)
benchmark_options = unknown_args
check_inputs(test1, test2, benchmark_options)
# Run the benchmarks and report the results
json1 = gbench.util.run_or_load_benchmark(tests[0], bench_opts)
json2 = gbench.util.run_or_load_benchmark(tests[1], bench_opts)
json1 = gbench.util.run_or_load_benchmark(test1, benchmark_options)
json2 = gbench.util.run_or_load_benchmark(test2, benchmark_options)
output_lines = gbench.report.generate_difference_report(json1, json2)
print 'Comparing %s to %s' % (tests[0], tests[1])
print('Comparing %s to %s' % (test1, test2))
for ln in output_lines:
print(ln)
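The same comparison can also be driven programmatically; a minimal sketch, assuming two benchmark binaries ./bench_old and ./bench_new (hypothetical names; the filter flag is just an example):

import gbench
from gbench import util, report

flags = ['--benchmark_filter=BM_memcpy']
json1 = gbench.util.run_or_load_benchmark('./bench_old', flags)
json2 = gbench.util.run_or_load_benchmark('./bench_new', flags)
for ln in gbench.report.generate_difference_report(json1, json2):
    print(ln)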

View File

@ -41,6 +41,20 @@
"real_time": 100,
"cpu_time": 100,
"time_unit": "ns"
},
{
"name": "BM_100xSlower",
"iterations": 1000,
"real_time": 100,
"cpu_time": 100,
"time_unit": "ns"
},
{
"name": "BM_100xFaster",
"iterations": 1000,
"real_time": 10000,
"cpu_time": 10000,
"time_unit": "ns"
}
]
}

View File

@ -41,6 +41,20 @@
"real_time": 110,
"cpu_time": 110,
"time_unit": "ns"
},
{
"name": "BM_100xSlower",
"iterations": 1000,
"real_time": 10000,
"cpu_time": 10000,
"time_unit": "ns"
},
{
"name": "BM_100xFaster",
"iterations": 1000,
"real_time": 100,
"cpu_time": 100,
"time_unit": "ns"
}
]
}

View File

@ -92,7 +92,7 @@ def generate_difference_report(json1, json2, use_color=True):
return BC_WHITE
else:
return BC_CYAN
fmt_str = "{}{:<{}s}{endc} {}{:+.2f}{endc} {}{:+.2f}{endc} {:4d} {:4d}"
fmt_str = "{}{:<{}s}{endc}{}{:+9.2f}{endc}{}{:+14.2f}{endc}{:14d}{:14d}"
tres = calculate_change(bn['real_time'], other_bench['real_time'])
cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
output_strs += [color_format(use_color, fmt_str,
@ -121,19 +121,22 @@ class TestReportDifference(unittest.TestCase):
def test_basic(self):
expect_lines = [
['BM_SameTimes', '+0.00', '+0.00'],
['BM_2xFaster', '-0.50', '-0.50'],
['BM_2xSlower', '+1.00', '+1.00'],
['BM_10PercentFaster', '-0.10', '-0.10'],
['BM_10PercentSlower', '+0.10', '+0.10']
['BM_SameTimes', '+0.00', '+0.00', '10', '10'],
['BM_2xFaster', '-0.50', '-0.50', '50', '25'],
['BM_2xSlower', '+1.00', '+1.00', '50', '100'],
['BM_10PercentFaster', '-0.10', '-0.10', '100', '90'],
['BM_10PercentSlower', '+0.10', '+0.10', '100', '110'],
['BM_100xSlower', '+99.00', '+99.00', '100', '10000'],
['BM_100xFaster', '-0.99', '-0.99', '10000', '100'],
]
json1, json2 = self.load_results()
output_lines = generate_difference_report(json1, json2, use_color=False)
print output_lines
output_lines_with_header = generate_difference_report(json1, json2, use_color=False)
output_lines = output_lines_with_header[2:]
print("\n".join(output_lines_with_header))
self.assertEqual(len(output_lines), len(expect_lines))
for i in xrange(0, len(output_lines)):
parts = [x for x in output_lines[i].split(' ') if x]
self.assertEqual(len(parts), 3)
self.assertEqual(len(parts), 5)
self.assertEqual(parts, expect_lines[i])
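The expected columns follow directly from the relative-change formula; a sketch, under the assumption that calculate_change(old, new) computes (new - old) / abs(old):

def calculate_change(old_val, new_val):
    return float(new_val - old_val) / abs(old_val)

assert calculate_change(50, 25) == -0.5       # BM_2xFaster
assert calculate_change(100, 10000) == 99.0   # BM_100xSlower
assert calculate_change(10000, 100) == -0.99  # BM_100xFaster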

View File

@ -20,21 +20,21 @@ def is_executable_file(filename):
"""
if not os.path.isfile(filename):
return False
with open(filename, 'r') as f:
with open(filename, mode='rb') as f:
magic_bytes = f.read(_num_magic_bytes)
if sys.platform == 'darwin':
return magic_bytes in [
'\xfe\xed\xfa\xce', # MH_MAGIC
'\xce\xfa\xed\xfe', # MH_CIGAM
'\xfe\xed\xfa\xcf', # MH_MAGIC_64
'\xcf\xfa\xed\xfe', # MH_CIGAM_64
'\xca\xfe\xba\xbe', # FAT_MAGIC
'\xbe\xba\xfe\xca' # FAT_CIGAM
b'\xfe\xed\xfa\xce', # MH_MAGIC
b'\xce\xfa\xed\xfe', # MH_CIGAM
b'\xfe\xed\xfa\xcf', # MH_MAGIC_64
b'\xcf\xfa\xed\xfe', # MH_CIGAM_64
b'\xca\xfe\xba\xbe', # FAT_MAGIC
b'\xbe\xba\xfe\xca' # FAT_CIGAM
]
elif sys.platform.startswith('win'):
return magic_bytes == 'MZ'
return magic_bytes == b'MZ'
else:
return magic_bytes == '\x7FELF'
return magic_bytes == b'\x7FELF'
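The switch to mode='rb' matters on Python 3, where text-mode reads return str, which never compares equal to the bytes literals above; a quick check (the path is illustrative):

with open('/bin/ls', mode='rb') as f:  # illustrative Linux path
    magic = f.read(4)
print(magic == b'\x7fELF')  # True for an ELF binary
print('MZ' == b'MZ')        # False on Python 3: str != bytes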
def is_json_file(filename):
@ -68,7 +68,7 @@ def classify_input_file(filename):
elif is_json_file(filename):
ftype = IT_JSON
else:
err_msg = "'%s' does not name a valid benchmark executable or JSON file"
err_msg = "'%s' does not name a valid benchmark executable or JSON file" % filename
return ftype, err_msg
@ -80,10 +80,30 @@ def check_input_file(filename):
"""
ftype, msg = classify_input_file(filename)
if ftype == IT_Invalid:
print "Invalid input file: %s" % msg
print("Invalid input file: %s" % msg)
sys.exit(1)
return ftype
def find_benchmark_flag(prefix, benchmark_flags):
"""
Search the specified list of flags for a flag matching `<prefix><arg>` and,
if it is found, return the arg it specifies. If the flag is specified more
than once, the last value is returned; if it is not found, None is returned.
"""
assert prefix.startswith('--') and prefix.endswith('=')
result = None
for f in benchmark_flags:
if f.startswith(prefix):
result = f[len(prefix):]
return result
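A short usage sketch (the flag values are illustrative):

flags = ['--benchmark_filter=BM_foo',
         '--benchmark_out=run1.json',
         '--benchmark_out=run2.json']
print(find_benchmark_flag('--benchmark_out=', flags))     # 'run2.json' (last wins)
print(find_benchmark_flag('--benchmark_format=', flags))  # None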
def remove_benchmark_flags(prefix, benchmark_flags):
"""
Return a new list containing the specified benchmark_flags except those
with the specified prefix.
"""
assert prefix.startswith('--') and prefix.endswith('=')
return [f for f in benchmark_flags if not f.startswith(prefix)]
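And its counterpart, continuing the example above:

print(remove_benchmark_flags('--benchmark_out=', flags))
# ['--benchmark_filter=BM_foo']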
def load_benchmark_results(fname):
"""
@ -101,16 +121,25 @@ def run_benchmark(exe_name, benchmark_flags):
real time console output.
RETURNS: A JSON object representing the benchmark output
"""
thandle, tname = tempfile.mkstemp()
os.close(thandle)
output_name = find_benchmark_flag('--benchmark_out=',
benchmark_flags)
is_temp_output = False
if output_name is None:
is_temp_output = True
thandle, output_name = tempfile.mkstemp()
os.close(thandle)
benchmark_flags = list(benchmark_flags) + \
['--benchmark_out=%s' % output_name]
cmd = [exe_name] + benchmark_flags
print("RUNNING: %s" % ' '.join(cmd))
exitCode = subprocess.call(cmd + ['--benchmark_out=%s' % tname])
exitCode = subprocess.call(cmd)
if exitCode != 0:
print('TEST FAILED...')
sys.exit(exitCode)
json_res = load_benchmark_results(tname)
os.unlink(tname)
json_res = load_benchmark_results(output_name)
if is_temp_output:
os.unlink(output_name)
return json_res
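With this change a caller-supplied output flag is honored rather than clobbered by a temp file; a usage sketch ('./mybench' is a hypothetical executable):

res = run_benchmark('./mybench',
                    ['--benchmark_out=results.json',
                     '--benchmark_out_format=json'])
# results.json stays on disk because the flag came from the caller.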