!17 benchmark: upgrade from 1.5.2 to 1.6.1

Merge pull request !17 from mipengwei/master
openharmony_ci 2022-07-21 02:07:50 +00:00 committed by Gitee
commit 39a7777fad
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
128 changed files with 6994 additions and 2740 deletions

7
.clang-tidy Normal file

@ -0,0 +1,7 @@
---
Checks: 'clang-analyzer-*,readability-redundant-*,performance-*'
WarningsAsErrors: 'clang-analyzer-*,readability-redundant-*,performance-*'
HeaderFilterRegex: '.*'
AnalyzeTemporaryDtors: false
FormatStyle: none
User: user

24
.github/.libcxx-setup.sh vendored Normal file

@ -0,0 +1,24 @@
#!/usr/bin/env bash
# Checkout LLVM sources
git clone --depth=1 https://github.com/llvm/llvm-project.git llvm-project
# Setup libc++ options
if [ -z "$BUILD_32_BITS" ]; then
export BUILD_32_BITS=OFF && echo disabling 32 bit build
fi
# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
cd ./llvm-project
cmake -DCMAKE_C_COMPILER=${C_COMPILER} \
-DCMAKE_CXX_COMPILER=${COMPILER} \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_INSTALL_PREFIX=/usr \
-DLIBCXX_ABI_UNSTABLE=OFF \
-DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
-DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
-DLLVM_ENABLE_PROJECTS='libcxx;libcxxabi' \
-S llvm -B llvm-build -G "Unix Makefiles"
make -C llvm-build -j3 cxx cxxabi
sudo make -C llvm-build install-cxx install-cxxabi
cd ..

30
.github/workflows/bazel.yml vendored Normal file

@ -0,0 +1,30 @@
name: bazel
on:
push: {}
pull_request: {}
jobs:
build-and-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- name: mount bazel cache
uses: actions/cache@v2.0.0
env:
cache-name: bazel-cache
with:
path: "~/.cache/bazel"
key: ${{ env.cache-name }}-${{ runner.os }}-${{ github.ref }}
restore-keys: |
${{ env.cache-name }}-${{ runner.os }}-main
- name: build
run: |
bazel build //:benchmark //:benchmark_main //test/...
- name: test
run: |
bazel test --test_output=all //test/...


@ -0,0 +1,97 @@
name: build-and-test-perfcounters
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
job:
# TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
name: ${{ matrix.os }}.${{ matrix.build_type }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, ubuntu-20.04]
build_type: ['Release', 'Debug']
steps:
- uses: actions/checkout@v2
- name: install libpfm
run: sudo apt -y install libpfm4-dev
- name: setup cmake
uses: jwlawson/actions-setup-cmake@v1.9
with:
cmake-version: '3.5.1'
- name: create build environment
run: cmake -E make_directory ${{ runner.workspace }}/_build
- name: configure cmake
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: >
cmake $GITHUB_WORKSPACE
-DBENCHMARK_ENABLE_LIBPFM=1
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- name: build
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: cmake --build . --config ${{ matrix.build_type }}
# Skip testing, for now. It seems perf_event_open does not succeed on the
# hosting machine, very likely a permissions issue.
# TODO(mtrofin): Enable test.
# - name: test
# shell: bash
# working-directory: ${{ runner.workspace }}/_build
# run: ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure
ubuntu-16_04:
name: ubuntu-16.04.${{ matrix.build_type }}
runs-on: [ubuntu-latest]
strategy:
fail-fast: false
matrix:
build_type: ['Release', 'Debug']
container: ubuntu:16.04
steps:
- uses: actions/checkout@v2
- name: install required bits
run: |
apt update
apt -y install clang cmake g++ git
- name: install libpfm
run: apt -y install libpfm4-dev
- name: create build environment
run: cmake -E make_directory $GITHUB_WORKSPACE/_build
- name: configure cmake
shell: bash
working-directory: ${{ github.workspace }}/_build
run: >
cmake $GITHUB_WORKSPACE
-DBENCHMARK_ENABLE_LIBPFM=1
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- name: build
shell: bash
working-directory: ${{ github.workspace }}/_build
run: cmake --build . --config ${{ matrix.build_type }}
# Skip testing, for now. It seems perf_event_open does not succeed on the
# hosting machine, very likely a permissions issue.
# TODO(mtrofin): Enable test.
# - name: test
# shell: bash
# working-directory: ${{ runner.workspace }}/_build
# run: ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure


@ -1,38 +1,149 @@
name: build-and-test
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
push: {}
pull_request: {}
jobs:
# TODO: add 32-bit builds (g++ and clang++) for ubuntu
# (requires g++-multilib and libc6:i386)
# TODO: add coverage build (requires lcov)
# TODO: add clang + libc++ builds for ubuntu
job:
# TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
name: ${{ matrix.os }}.${{ matrix.build_type }}
name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.compiler }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, ubuntu-16.04, ubuntu-20.04, macos-latest, windows-latest]
os: [ubuntu-latest, ubuntu-20.04, macos-latest]
build_type: ['Release', 'Debug']
compiler: [g++, clang++]
include:
- displayTargetName: windows-latest-release
os: windows-latest
build_type: 'Release'
- displayTargetName: windows-latest-debug
os: windows-latest
build_type: 'Debug'
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v2
- name: create build environment
run: cmake -E make_directory ${{ runner.workspace }}/_build
- name: create build environment
run: cmake -E make_directory ${{ runner.workspace }}/_build
- name: configure cmake
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- name: configure cmake
env:
CXX: ${{ matrix.compiler }}
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: >
cmake $GITHUB_WORKSPACE
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- name: build
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: cmake --build . --config ${{ matrix.build_type }}
- name: build
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: cmake --build . --config ${{ matrix.build_type }}
- name: test
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: ctest -C ${{ matrix.build_type }}
- name: test
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: ctest -C ${{ matrix.build_type }} -VV
ubuntu-16_04:
name: ubuntu-16.04.${{ matrix.build_type }}.${{ matrix.compiler }}
runs-on: [ubuntu-latest]
strategy:
fail-fast: false
matrix:
build_type: ['Release', 'Debug']
compiler: [g++, clang++]
container: ubuntu:16.04
steps:
- uses: actions/checkout@v2
- name: install required bits
run: |
apt update
apt -y install clang cmake g++ git
- name: create build environment
run: cmake -E make_directory $GITHUB_WORKSPACE/_build
- name: configure cmake
env:
CXX: ${{ matrix.compiler }}
shell: bash
working-directory: ${{ github.workspace }}/_build
run: >
cmake $GITHUB_WORKSPACE
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- name: build
shell: bash
working-directory: ${{ github.workspace }}/_build
run: cmake --build . --config ${{ matrix.build_type }}
- name: test
shell: bash
working-directory: ${{ github.workspace }}/_build
run: ctest -C ${{ matrix.build_type }} -VV
ubuntu-14_04:
name: ubuntu-14.04.${{ matrix.build_type }}.${{ matrix.compiler }}
runs-on: [ubuntu-latest]
strategy:
fail-fast: false
matrix:
build_type: ['Release', 'Debug']
compiler: [g++-4.8, clang++-3.6]
include:
- compiler: g++-6
build_type: 'Debug'
run_tests: true
- compiler: g++-6
build_type: 'Release'
run_tests: true
container: ubuntu:14.04
steps:
- uses: actions/checkout@v2
- name: install required bits
run: |
sudo apt update
sudo apt -y install clang-3.6 cmake3 g++-4.8 git
- name: install other bits
if: ${{ matrix.compiler }} == g++-6
run: |
sudo apt -y install software-properties-common
sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test"
sudo apt update
sudo apt -y install g++-6
- name: create build environment
run: cmake -E make_directory $GITHUB_WORKSPACE/_build
- name: configure cmake
env:
CXX: ${{ matrix.compiler }}
shell: bash
working-directory: ${{ github.workspace }}/_build
run: >
cmake $GITHUB_WORKSPACE
-DBENCHMARK_ENABLE_TESTING=${{ matrix.run_tests }}
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=${{ matrix.run_tests }}
- name: build
shell: bash
working-directory: ${{ github.workspace }}/_build
run: cmake --build . --config ${{ matrix.build_type }}
- name: test
if: ${{ matrix.run_tests }}
shell: bash
working-directory: ${{ github.workspace }}/_build
run: ctest -C ${{ matrix.build_type }} -VV

17
.github/workflows/clang-format-lint.yml vendored Normal file

@ -0,0 +1,17 @@
name: clang-format-lint
on:
push: {}
pull_request: {}
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: DoozyX/clang-format-lint-action@v0.13
with:
source: './include/benchmark ./src ./test'
extensions: 'h,cc'
clangFormatVersion: 12
style: Google

38
.github/workflows/clang-tidy.yml vendored Normal file

@ -0,0 +1,38 @@
name: clang-tidy
on:
push: {}
pull_request: {}
jobs:
job:
name: run-clang-tidy
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- uses: actions/checkout@v2
- name: install clang-tidy
run: sudo apt update && sudo apt -y install clang-tidy
- name: create build environment
run: cmake -E make_directory ${{ runner.workspace }}/_build
- name: configure cmake
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: >
cmake $GITHUB_WORKSPACE
-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
-DBENCHMARK_ENABLE_LIBPFM=OFF
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DCMAKE_C_COMPILER=clang
-DCMAKE_CXX_COMPILER=clang++
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DGTEST_COMPILE_COMMANDS=OFF
- name: run
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: run-clang-tidy

26
.github/workflows/doxygen.yml vendored Normal file

@ -0,0 +1,26 @@
name: doxygen
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
build-and-deploy:
name: Build HTML documentation
runs-on: ubuntu-latest
steps:
- name: Fetching sources
uses: actions/checkout@v2
- name: Installing build dependencies
run: |
sudo apt update
sudo apt install cmake doxygen gcc git
- name: Creating build directory
run: |
mkdir build
- name: Building HTML documentation with Doxygen
run: |
cmake -S . -B build -DBENCHMARK_ENABLE_TESTING:BOOL=OFF -DBENCHMARK_ENABLE_DOXYGEN:BOOL=ON -DBENCHMARK_INSTALL_DOCS:BOOL=ON
cmake --build build --target benchmark_doxygen


@ -2,9 +2,9 @@ name: pylint
on:
push:
branches: [ master ]
branches: [ main ]
pull_request:
branches: [ master ]
branches: [ main ]
jobs:
pylint:

92
.github/workflows/sanitizer.yml vendored Normal file

@ -0,0 +1,92 @@
name: sanitizer
on:
push: {}
pull_request: {}
env:
UBSAN_OPTIONS: "print_stacktrace=1"
jobs:
job:
name: ${{ matrix.sanitizer }}.${{ matrix.build_type }}.${{ matrix.compiler }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
build_type: ['Debug', 'RelWithDebInfo']
sanitizer: ['asan', 'ubsan', 'tsan']
compiler: ['clang', 'gcc']
# TODO: add 'msan' above. currently failing and needs investigation.
steps:
- uses: actions/checkout@v2
- name: configure msan env
if: matrix.sanitizer == 'msan'
run: |
echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" >> $GITHUB_ENV
echo "LIBCXX_SANITIZER=MemoryWithOrigins" >> $GITHUB_ENV
- name: configure ubsan env
if: matrix.sanitizer == 'ubsan'
run: |
echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all" >> $GITHUB_ENV
echo "LIBCXX_SANITIZER=Undefined" >> $GITHUB_ENV
- name: configure asan env
if: matrix.sanitizer == 'asan'
run: |
echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" >> $GITHUB_ENV
echo "LIBCXX_SANITIZER=Address" >> $GITHUB_ENV
- name: configure tsan env
if: matrix.sanitizer == 'tsan'
run: |
echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" >> $GITHUB_ENV
echo "LIBCXX_SANITIZER=Thread" >> $GITHUB_ENV
- name: configure clang
if: matrix.compiler == 'clang'
run: |
echo "CC=clang" >> $GITHUB_ENV
echo "CXX=clang++" >> $GITHUB_ENV
- name: configure gcc
if: matrix.compiler == 'gcc'
run: |
sudo apt update && sudo apt -y install gcc-10 g++-10
echo "CC=gcc-10" >> $GITHUB_ENV
echo "CXX=g++-10" >> $GITHUB_ENV
- name: install llvm stuff
if: matrix.compiler == 'clang'
run: |
"${GITHUB_WORKSPACE}/.github/.libcxx-setup.sh"
echo "EXTRA_CXX_FLAGS=\"-stdlib=libc++\"" >> $GITHUB_ENV
- name: create build environment
run: cmake -E make_directory ${{ runner.workspace }}/_build
- name: configure cmake
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: >
cmake $GITHUB_WORKSPACE
-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
-DBENCHMARK_ENABLE_LIBPFM=OFF
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DCMAKE_C_COMPILER=${{ env.CC }}
-DCMAKE_CXX_COMPILER=${{ env.CXX }}
-DCMAKE_C_FLAGS="${{ env.EXTRA_FLAGS }}"
-DCMAKE_CXX_FLAGS="${{ env.EXTRA_FLAGS }} ${{ env.EXTRA_CXX_FLAGS }}"
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- name: build
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: cmake --build . --config ${{ matrix.build_type }}
- name: test
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: ctest -C ${{ matrix.build_type }} -VV


@ -2,9 +2,9 @@ name: test-bindings
on:
push:
branches: [master]
branches: [main]
pull_request:
branches: [master]
branches: [main]
jobs:
python_bindings:

133
.github/workflows/wheels.yml vendored Normal file

@ -0,0 +1,133 @@
name: Build and upload Python wheels
on:
workflow_dispatch:
release:
types:
- published
jobs:
build_sdist:
name: Build source distribution
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v2
- name: Install Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
- name: Build and check sdist
run: |
python setup.py sdist
- name: Upload sdist
uses: actions/upload-artifact@v2
with:
name: dist
path: dist/*.tar.gz
build_linux:
name: Build google-benchmark manylinux wheels
runs-on: ubuntu-latest
steps:
- name: Check out Google Benchmark
uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
# TODO: Bazel does not seem to work in an emulated Docker environment, see
# https://github.com/bazelbuild/bazel/issues/11379
# - name: Set up QEMU
# uses: docker/setup-qemu-action@v1
# with:
# platforms: all
- name: Build Python wheels on ubuntu-latest
env:
CIBW_BUILD: 'cp37-* cp38-* cp39-* cp310-*'
CIBW_SKIP: "*-musllinux_*"
# Bazel repo only exists on CentOS 7 for x86 and ppc, so no manylinux2010
# TODO: Build ppc64le, aarch64 using some other trick
CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
CIBW_ARCHS_LINUX: x86_64
CIBW_BEFORE_ALL: >
curl -O --retry-delay 5 --retry 5 https://copr.fedorainfracloud.org/coprs/vbatts/bazel/repo/epel-7/vbatts-bazel-epel-7.repo &&
cp vbatts-bazel-epel-7.repo /etc/yum.repos.d/bazel.repo &&
yum install -y bazel4
CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
run: |
pip install cibuildwheel
python -m cibuildwheel --output-dir wheelhouse
- name: Upload Linux wheels
uses: actions/upload-artifact@v2
with:
name: dist
path: wheelhouse/*.whl
build_macos:
name: Build google-benchmark macOS wheels
runs-on: macos-latest
steps:
- name: Check out Google Benchmark
uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
- name: Build Python wheels on macOS
env:
CIBW_ARCHS_MACOS: "x86_64 arm64"
CIBW_BUILD: 'cp37-* cp38-* cp39-* cp310-*'
# ARM64 requires Python 3.8 minimum
CIBW_SKIP: 'cp37-*-arm64'
CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
CIBW_TEST_SKIP: "*_arm64"
run: |
pip install cibuildwheel
python -m cibuildwheel --output-dir wheelhouse
- name: Upload macOS wheels
uses: actions/upload-artifact@v2
with:
name: dist
path: wheelhouse/*.whl
build_windows:
name: Build google-benchmark wheels on Windows
runs-on: windows-latest
steps:
- name: Check out Google Benchmark
uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
- name: Build Python wheels on Windows
env:
CIBW_BUILD: 'cp37-* cp38-* cp39-* cp310-*'
CIBW_ARCHS_WINDOWS: AMD64
# otherwise, pip crashes the job by trying to remove an in-use bazel DLL
PIP_NO_CLEAN: true
CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
run: |
pip install cibuildwheel
python -m cibuildwheel --output-dir wheelhouse
- name: Upload wheels
uses: actions/upload-artifact@v2
with:
name: dist
path: wheelhouse/*.whl

1
.gitignore vendored

@ -11,6 +11,7 @@
*.swp
*.pyc
__pycache__
.DS_Store
# lcov
*.lcov


@ -10,10 +10,6 @@ matrix:
packages:
- lcov
env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
- compiler: gcc
env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug
- compiler: gcc
env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release
- compiler: gcc
addons:
apt:
@ -44,10 +40,6 @@ matrix:
- COMPILER=g++-6 C_COMPILER=gcc-6 BUILD_TYPE=Debug
- ENABLE_SANITIZER=1
- EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
- compiler: clang
env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
- compiler: clang
env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
# Clang w/ libc++
- compiler: clang
dist: xenial
@ -146,16 +138,6 @@ matrix:
- ENABLE_SANITIZER=1
- EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
- EXTRA_CXX_FLAGS="-stdlib=libc++"
- os: osx
osx_image: xcode8.3
compiler: clang
env:
- COMPILER=clang++ BUILD_TYPE=Debug
- os: osx
osx_image: xcode8.3
compiler: clang
env:
- COMPILER=clang++ BUILD_TYPE=Release
- os: osx
osx_image: xcode8.3
compiler: clang
@ -164,15 +146,10 @@ matrix:
- BUILD_TYPE=Release
- BUILD_32_BITS=ON
- EXTRA_FLAGS="-m32"
- os: osx
osx_image: xcode9.4
compiler: gcc
env:
- COMPILER=g++-7 C_COMPILER=gcc-7 BUILD_TYPE=Debug
before_script:
- if [ -n "${LIBCXX_BUILD}" ]; then
source .travis-libcxx-setup.sh;
source .libcxx-setup.sh;
fi
- if [ -n "${ENABLE_SANITIZER}" ]; then
export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";


@ -21,6 +21,8 @@ David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
Deniz Evrenci <denizevrenci@gmail.com>
Dirac Research
Dominik Czarnota <dominik.b.czarnota@gmail.com>
Dominik Korman <kormandominik@gmail.com>
Donald Aingworth <donalds_junk_mail@yahoo.com>
Eric Backus <eric_backus@alum.mit.edu>
Eric Fiselier <eric@efcs.ca>
Eugene Zhuk <eugene.zhuk@gmail.com>
@ -43,6 +45,7 @@ Matt Clarkson <mattyclarkson@gmail.com>
Maxim Vafin <maxvafin@gmail.com>
MongoDB Inc.
Nick Hutchinson <nshutchinson@gmail.com>
Norman Heino <norman.heino@gmail.com>
Oleksandr Sochka <sasha.sochka@gmail.com>
Ori Livneh <ori.livneh@gmail.com>
Paul Redmond <paul.redmond@gmail.com>
@ -50,8 +53,11 @@ Radoslav Yovchev <radoslav.tm@gmail.com>
Roman Lebedev <lebedev.ri@gmail.com>
Sayan Bhattacharjee <aero.sayan@gmail.com>
Shuo Chen <chenshuo@chenshuo.com>
Staffan Tjernstrom <staffantj@gmail.com>
Steinar H. Gunderson <sgunderson@bigfoot.com>
Stripe, Inc.
Tobias Schmidt <tobias.schmidt@in.tum.de>
Yixuan Qiu <yixuanq@gmail.com>
Yusuke Suzuki <utatane.tea@gmail.com>
Zbigniew Skowron <zbychs@gmail.com>
Min-Yih Hsu <yihshyng223@gmail.com>


@ -1,9 +1,17 @@
load("@rules_cc//cc:defs.bzl", "cc_library")
licenses(["notice"])
config_setting(
name = "qnx",
constraint_values = ["@platforms//os:qnx"],
values = {
"cpu": "x64_qnx",
},
visibility = [":__subpackages__"],
)
config_setting(
name = "windows",
constraint_values = ["@platforms//os:windows"],
values = {
"cpu": "x64_windows",
},


@ -15,25 +15,21 @@ import("//build/ohos.gni")
# Lets callers do '#include <benchmark.h>'
config("benchmark_config") {
include_dirs = [
"include",
]
include_dirs = [ "include" ]
}
# This is the configuration used to build benchmark itself.
# It should not be needed outside of this library.
config("benchmark_private_config") {
visibility = [ ":*" ]
include_dirs = [
"include",
]
include_dirs = [ "include" ]
}
ohos_static_library("benchmark") {
sources = [
"src/benchmark.cc",
"src/benchmark_api_internal.cc",
"src/benchmark_name.cc",
"src/benchmark.cc",
"src/benchmark_register.cc",
"src/benchmark_runner.cc",
"src/colorprint.cc",
@ -43,6 +39,7 @@ ohos_static_library("benchmark") {
"src/counter.cc",
"src/csv_reporter.cc",
"src/json_reporter.cc",
"src/perf_counters.cc",
"src/reporter.cc",
"src/sleep.cc",
"src/statistics.cc",
@ -50,21 +47,15 @@ ohos_static_library("benchmark") {
"src/sysinfo.cc",
"src/timers.cc",
]
public_configs = [ ":benchmark_config" ]
configs = [ ":benchmark_private_config" ]
}
ohos_static_library("benchmark_main") {
include_dirs = [
"include",
]
include_dirs = [ "include" ]
sources = [
"src/benchmark_main.cc",
]
sources = [ "src/benchmark_main.cc" ]
deps = [
":benchmark"
]
deps = [ ":benchmark" ]
}


@ -13,18 +13,31 @@ foreach(p
endif()
endforeach()
project (benchmark CXX)
project (benchmark VERSION 1.6.1 LANGUAGES CXX)
option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
option(BENCHMARK_ENABLE_WERROR "Build Release candidates with -Werror." ON)
option(BENCHMARK_FORCE_WERROR "Build Release candidates with -Werror regardless of compiler issues." OFF)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
# PGC++ maybe reporting false positives.
set(BENCHMARK_ENABLE_WERROR OFF)
endif()
if(BENCHMARK_FORCE_WERROR)
set(BENCHMARK_ENABLE_WERROR ON)
endif(BENCHMARK_FORCE_WERROR)
if(NOT MSVC)
option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
else()
set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
endif()
option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
option(BENCHMARK_ENABLE_DOXYGEN "Build documentation with Doxygen." OFF)
option(BENCHMARK_INSTALL_DOCS "Enable installation of documentation." ON)
# Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
# may require downloading the source code.
@ -33,8 +46,22 @@ option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree buildi
# This option can be used to disable building and running unit tests which depend on gtest
# in cases where it is not possible to build or find a valid version of gtest.
option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
option(BENCHMARK_USE_BUNDLED_GTEST "Use bundled GoogleTest. If disabled, the find_package(GTest) will be used." ON)
option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
if(MSVC)
# As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
# cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
# undocumented, but working variable.
# See https://gitlab.kitware.com/cmake/cmake/-/issues/15170
set(CMAKE_SYSTEM_PROCESSOR ${MSVC_CXX_ARCHITECTURE_ID})
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ARM")
set(CMAKE_CROSSCOMPILING TRUE)
endif()
endif()
set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
function(should_enable_assembly_tests)
if(CMAKE_BUILD_TYPE)
@ -81,8 +108,14 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
include(GetGitVersion)
get_git_version(GIT_VERSION)
# If no git version can be determined, use the version
# from the project() command
if ("${GIT_VERSION}" STREQUAL "0.0.0")
set(VERSION "${benchmark_VERSION}")
else()
set(VERSION "${GIT_VERSION}")
endif()
# Tell the user what versions we are using
string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
message(STATUS "Version: ${VERSION}")
# The version of the libraries
@ -93,6 +126,9 @@ string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
include(CheckCXXCompilerFlag)
include(AddCXXCompilerFlag)
include(CXXFeatureCheck)
include(CheckLibraryExists)
check_library_exists(rt shm_open "" HAVE_LIB_RT)
if (BENCHMARK_BUILD_32_BITS)
add_required_cxx_compiler_flag(-m32)
@ -141,12 +177,17 @@ else()
add_cxx_compiler_flag(-Wall)
add_cxx_compiler_flag(-Wextra)
add_cxx_compiler_flag(-Wshadow)
add_cxx_compiler_flag(-Werror RELEASE)
add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
add_cxx_compiler_flag(-Werror MINSIZEREL)
# Disabled until googletest (gmock) stops emitting variadic macro warnings
#add_cxx_compiler_flag(-pedantic)
#add_cxx_compiler_flag(-pedantic-errors)
if(BENCHMARK_ENABLE_WERROR)
add_cxx_compiler_flag(-Werror RELEASE)
add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
add_cxx_compiler_flag(-Werror MINSIZEREL)
endif()
if (NOT BENCHMARK_ENABLE_TESTING)
# Disable warning when compiling tests as gtest does not use 'override'.
add_cxx_compiler_flag(-Wsuggest-override)
endif()
add_cxx_compiler_flag(-pedantic)
add_cxx_compiler_flag(-pedantic-errors)
add_cxx_compiler_flag(-Wshorten-64-to-32)
add_cxx_compiler_flag(-fstrict-aliasing)
# Disable warnings regarding deprecated parts of the library while building
@ -159,9 +200,11 @@ else()
add_cxx_compiler_flag(-wd1786)
endif()
# Disable deprecation warnings for release builds (when -Werror is enabled).
add_cxx_compiler_flag(-Wno-deprecated RELEASE)
add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
if(BENCHMARK_ENABLE_WERROR)
add_cxx_compiler_flag(-Wno-deprecated RELEASE)
add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
endif()
if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
add_cxx_compiler_flag(-fno-exceptions)
endif()
@ -194,6 +237,7 @@ else()
# Link time optimisation
if (BENCHMARK_ENABLE_LTO)
add_cxx_compiler_flag(-flto)
add_cxx_compiler_flag(-Wno-lto-type-mismatch)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
find_program(GCC_AR gcc-ar)
if (GCC_AR)
@ -269,6 +313,10 @@ cxx_feature_check(STEADY_CLOCK)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
if (BENCHMARK_ENABLE_LIBPFM)
find_package(PFM)
endif()
# Set up directories
include_directories(${PROJECT_SOURCE_DIR}/include)
@ -280,7 +328,15 @@ if (BENCHMARK_ENABLE_TESTING)
if (BENCHMARK_ENABLE_GTEST_TESTS AND
NOT (TARGET gtest AND TARGET gtest_main AND
TARGET gmock AND TARGET gmock_main))
include(GoogleTest)
if (BENCHMARK_USE_BUNDLED_GTEST)
include(GoogleTest)
else()
find_package(GTest CONFIG REQUIRED)
add_library(gtest ALIAS GTest::gtest)
add_library(gtest_main ALIAS GTest::gtest_main)
add_library(gmock ALIAS GTest::gmock)
add_library(gmock_main ALIAS GTest::gmock_main)
endif()
endif()
add_subdirectory(test)
endif()


@ -22,6 +22,7 @@
#
# Please keep the list sorted.
Abhina Sreeskantharajan <abhina.sreeskantharajan@ibm.com>
Albert Pretorius <pretoalb@gmail.com>
Alex Steele <steelal123@gmail.com>
Andriy Berestovskyy <berestovskyy@gmail.com>
@ -37,10 +38,13 @@ David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
Deniz Evrenci <denizevrenci@gmail.com>
Dominic Hamon <dma@stripysock.com> <dominic@google.com>
Dominik Czarnota <dominik.b.czarnota@gmail.com>
Dominik Korman <kormandominik@gmail.com>
Donald Aingworth <donalds_junk_mail@yahoo.com>
Eric Backus <eric_backus@alum.mit.edu>
Eric Fiselier <eric@efcs.ca>
Eugene Zhuk <eugene.zhuk@gmail.com>
Evgeny Safronov <division494@gmail.com>
Fanbo Meng <fanbo.meng@ibm.com>
Federico Ficarelli <federico.ficarelli@gmail.com>
Felix Homann <linuxaudio@showlabor.de>
Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
@ -60,6 +64,7 @@ Lei Xu <eddyxu@gmail.com>
Matt Clarkson <mattyclarkson@gmail.com>
Maxim Vafin <maxvafin@gmail.com>
Nick Hutchinson <nshutchinson@gmail.com>
Norman Heino <norman.heino@gmail.com>
Oleksandr Sochka <sasha.sochka@gmail.com>
Ori Livneh <ori.livneh@gmail.com>
Pascal Leroy <phl@google.com>
@ -72,8 +77,11 @@ Robert Guo <robert.guo@mongodb.com>
Roman Lebedev <lebedev.ri@gmail.com>
Sayan Bhattacharjee <aero.sayan@gmail.com>
Shuo Chen <chenshuo@chenshuo.com>
Steven Wan <wan.yu@ibm.com>
Tobias Schmidt <tobias.schmidt@in.tum.de>
Tobias Ulvgård <tobias.ulvgard@dirac.se>
Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
Yixuan Qiu <yixuanq@gmail.com>
Yusuke Suzuki <utatane.tea@gmail.com>
Zbigniew Skowron <zbychs@gmail.com>
Min-Yih Hsu <yihshyng223@gmail.com>


@ -38,7 +38,6 @@
<filteritem type="filepath" name="cmake/.*" desc="sources files"/>
<filteritem type="filepath" name="bindings/python/.*" desc="This file can not declare copyright"/>
<filteritem type="filepath" name="bindings/python/google_benchmark/.*" desc="This file can not declare copyright"/>
<filteritem type="filepath" name="conan/test_package/.*" desc="This file can not declare copyright"/>
<filteritem type="filepath" name="include/benchmark/.*" desc="This file can not declare copyright"/>
<filteritem type="filename" name="test/BUILD" desc="This file can not declare copyright"/>
<filteritem type="filename" name="bindings/python/BUILD" desc="This file can not declare copyright"/>


@ -3,9 +3,9 @@
"Name": "benchmark",
"License": "Apache License V2.0",
"License File": "LICENSE",
"Version Number": "1.5.2",
"Owner": "renxiang11@huawei.com",
"Upstream URL": "https://github.com/google/benchmark/releases/tag/v1.5.2",
"Version Number": "1.6.1",
"Owner": "mipengwei@huawei.com",
"Upstream URL": "https://github.com/google/benchmark/releases/tag/v1.6.1",
"Description": "A microbenchmark support library"
}
]

1144
README.md

File diff suppressed because it is too large.


@ -1,12 +1,7 @@
workspace(name = "com_github_google_benchmark")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
http_archive(
name = "rules_cc",
strip_prefix = "rules_cc-a508235df92e71d537fcbae0c7c952ea6957a912",
urls = ["https://github.com/bazelbuild/rules_cc/archive/a508235df92e71d537fcbae0c7c952ea6957a912.zip"],
)
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
http_archive(
name = "com_google_absl",
@ -15,10 +10,10 @@ http_archive(
urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
)
http_archive(
git_repository(
name = "com_google_googletest",
strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
remote = "https://github.com/google/googletest.git",
tag = "release-1.11.0",
)
http_archive(
@ -34,3 +29,16 @@ new_local_repository(
build_file = "@//bindings/python:python_headers.BUILD",
path = "/usr/include/python3.6", # May be overwritten by setup.py.
)
http_archive(
name = "rules_python",
url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz",
sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0",
)
load("@rules_python//python:pip.bzl", pip3_install="pip_install")
pip3_install(
name = "py_deps",
requirements = "//:requirements.txt",
)


@ -1 +1,2 @@
theme: jekyll-theme-midnight
theme: jekyll-theme-midnight
markdown: GFM


@ -34,6 +34,7 @@ from google_benchmark._benchmark import (
kNanosecond,
kMicrosecond,
kMillisecond,
kSecond,
oNone,
o1,
oN,
@ -53,6 +54,7 @@ __all__ = [
"kNanosecond",
"kMicrosecond",
"kMillisecond",
"kSecond",
"oNone",
"o1",
"oN",
@ -64,7 +66,7 @@ __all__ = [
"oLambda",
]
__version__ = "0.2.0"
__version__ = "1.6.1"
class __OptionMaker:
@ -108,7 +110,7 @@ class __OptionMaker:
# Alias for nicer API.
# We have to instanciate an object, even if stateless, to be able to use __getattr__
# We have to instantiate an object, even if stateless, to be able to use __getattr__
# on option.range
option = __OptionMaker()


@ -49,6 +49,7 @@ PYBIND11_MODULE(_benchmark, m) {
.value("kNanosecond", TimeUnit::kNanosecond)
.value("kMicrosecond", TimeUnit::kMicrosecond)
.value("kMillisecond", TimeUnit::kMillisecond)
.value("kSecond", TimeUnit::kSecond)
.export_values();
using benchmark::BigO;
@ -156,7 +157,7 @@ PYBIND11_MODULE(_benchmark, m) {
.def("pause_timing", &State::PauseTiming)
.def("resume_timing", &State::ResumeTiming)
.def("skip_with_error", &State::SkipWithError)
.def_property_readonly("error_occured", &State::error_occurred)
.def_property_readonly("error_occurred", &State::error_occurred)
.def("set_iteration_time", &State::SetIterationTime)
.def_property("bytes_processed", &State::bytes_processed,
&State::SetBytesProcessed)
@ -164,12 +165,12 @@ PYBIND11_MODULE(_benchmark, m) {
&State::SetComplexityN)
.def_property("items_processed", &State::items_processed,
&State::SetItemsProcessed)
.def("set_label", (void (State::*)(const char*)) & State::SetLabel)
.def("set_label", (void(State::*)(const char*)) & State::SetLabel)
.def("range", &State::range, py::arg("pos") = 0)
.def_property_readonly("iterations", &State::iterations)
.def_readwrite("counters", &State::counters)
.def_readonly("thread_index", &State::thread_index)
.def_readonly("threads", &State::threads);
.def_property_readonly("thread_index", &State::thread_index)
.def_property_readonly("threads", &State::threads);
m.def("Initialize", Initialize);
m.def("RegisterBenchmark", RegisterBenchmark,


@ -102,7 +102,7 @@ def with_options(state):
@benchmark.register(name="sum_million_microseconds")
@benchmark.option.unit(benchmark.kMicrosecond)
def with_options(state):
def with_options2(state):
while state:
sum(range(1_000_000))


@ -34,9 +34,11 @@ function(add_cxx_compiler_flag FLAG)
check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
if(${MANGLED_FLAG})
set(VARIANT ${ARGV1})
if(ARGV1)
if(ARGC GREATER 1)
set(VARIANT ${ARGV1})
string(TOUPPER "_${VARIANT}" VARIANT)
else()
set(VARIANT "")
endif()
set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
endif()
@ -49,9 +51,11 @@ function(add_required_cxx_compiler_flag FLAG)
check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
if(${MANGLED_FLAG})
set(VARIANT ${ARGV1})
if(ARGV1)
if(ARGC GREATER 1)
set(VARIANT ${ARGV1})
string(TOUPPER "_${VARIANT}" VARIANT)
else()
set(VARIANT "")
endif()
set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)


@ -1 +1,7 @@
@PACKAGE_INIT@
include (CMakeFindDependencyMacro)
find_dependency (Threads)
include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")


@ -20,16 +20,20 @@ set(__get_git_version INCLUDED)
function(get_git_version var)
if(GIT_EXECUTABLE)
execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
RESULT_VARIABLE status
OUTPUT_VARIABLE GIT_VERSION
OUTPUT_VARIABLE GIT_DESCRIBE_VERSION
ERROR_QUIET)
if(${status})
set(GIT_VERSION "v0.0.0")
if(status)
set(GIT_DESCRIBE_VERSION "v0.0.0")
endif()
string(STRIP ${GIT_DESCRIBE_VERSION} GIT_DESCRIBE_VERSION)
if(GIT_DESCRIBE_VERSION MATCHES v[^-]*-)
string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2" GIT_VERSION ${GIT_DESCRIBE_VERSION})
else()
string(STRIP ${GIT_VERSION} GIT_VERSION)
string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
string(REGEX REPLACE "v(.*)" "\\1" GIT_VERSION ${GIT_DESCRIBE_VERSION})
endif()
# Work out if the repository is dirty
@ -43,12 +47,12 @@ function(get_git_version var)
ERROR_QUIET)
string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
if (${GIT_DIRTY})
set(GIT_VERSION "${GIT_VERSION}-dirty")
set(GIT_DESCRIBE_VERSION "${GIT_DESCRIBE_VERSION}-dirty")
endif()
message(STATUS "git version: ${GIT_DESCRIBE_VERSION} normalized to ${GIT_VERSION}")
else()
set(GIT_VERSION "v0.0.0")
set(GIT_VERSION "0.0.0")
endif()
message(STATUS "git Version: ${GIT_VERSION}")
set(${var} ${GIT_VERSION} PARENT_SCOPE)
endfunction()


@ -29,13 +29,20 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
add_compile_options(-w)
# Add googletest directly to our build. This defines
# the gtest and gtest_main targets.
add_subdirectory(${GOOGLETEST_SOURCE_DIR}
${GOOGLETEST_BINARY_DIR}
EXCLUDE_FROM_ALL)
set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES>)
set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES>)
set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES>)
set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES>)
if(NOT DEFINED GTEST_COMPILE_COMMANDS)
set(GTEST_COMPILE_COMMANDS ON)
endif()
set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})


@ -31,13 +31,14 @@ if(EXISTS "${GOOGLETEST_PATH}" AND IS_DIRECTORY "${GOOGLETEST_PATH}"
)
else()
if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
return()
else()
message(WARNING "Did not find Google Test sources! Fetching from web...")
ExternalProject_Add(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG master
GIT_TAG "release-1.11.0"
PREFIX "${CMAKE_BINARY_DIR}"
STAMP_DIR "${CMAKE_BINARY_DIR}/stamp"
DOWNLOAD_DIR "${CMAKE_BINARY_DIR}/download"


@ -1,7 +1,7 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=${prefix}/lib
includedir=${prefix}/include
libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: @PROJECT_NAME@
Description: Google microbenchmark framework


@ -1,7 +0,0 @@
cmake_minimum_required(VERSION 2.8.11)
project(cmake_wrapper)
include(conanbuildinfo.cmake)
conan_basic_setup()
include(${CMAKE_SOURCE_DIR}/CMakeListsOriginal.txt)


@ -1,10 +0,0 @@
cmake_minimum_required(VERSION 2.8.11)
project(test_package)
set(CMAKE_VERBOSE_MAKEFILE TRUE)
include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
conan_basic_setup()
add_executable(${PROJECT_NAME} test_package.cpp)
target_link_libraries(${PROJECT_NAME} ${CONAN_LIBS})


@ -1,19 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from conans import ConanFile, CMake
import os
class TestPackageConan(ConanFile):
settings = "os", "compiler", "build_type", "arch"
generators = "cmake"
def build(self):
cmake = CMake(self)
cmake.configure()
cmake.build()
def test(self):
bin_path = os.path.join("bin", "test_package")
self.run(bin_path, run_environment=True)


@ -1,18 +0,0 @@
#include "benchmark/benchmark.h"
void BM_StringCreation(benchmark::State& state) {
while (state.KeepRunning())
std::string empty_string;
}
BENCHMARK(BM_StringCreation);
void BM_StringCopy(benchmark::State& state) {
std::string x = "hello";
while (state.KeepRunning())
std::string copy(x);
}
BENCHMARK(BM_StringCopy);
BENCHMARK_MAIN();


@ -1,79 +0,0 @@
from conans import ConanFile, CMake, tools
from conans.errors import ConanInvalidConfiguration
import shutil
import os
class GoogleBenchmarkConan(ConanFile):
name = "benchmark"
description = "A microbenchmark support library."
topics = ("conan", "benchmark", "google", "microbenchmark")
url = "https://github.com/google/benchmark"
homepage = "https://github.com/google/benchmark"
author = "Google Inc."
license = "Apache-2.0"
exports_sources = ["*"]
generators = "cmake"
settings = "arch", "build_type", "compiler", "os"
options = {
"shared": [True, False],
"fPIC": [True, False],
"enable_lto": [True, False],
"enable_exceptions": [True, False]
}
default_options = {"shared": False, "fPIC": True, "enable_lto": False, "enable_exceptions": True}
_build_subfolder = "."
def source(self):
# Wrap the original CMake file to call conan_basic_setup
shutil.move("CMakeLists.txt", "CMakeListsOriginal.txt")
shutil.move(os.path.join("conan", "CMakeLists.txt"), "CMakeLists.txt")
def config_options(self):
if self.settings.os == "Windows":
if self.settings.compiler == "Visual Studio" and float(self.settings.compiler.version.value) <= 12:
raise ConanInvalidConfiguration("{} {} does not support Visual Studio <= 12".format(self.name, self.version))
del self.options.fPIC
def configure(self):
if self.settings.os == "Windows" and self.options.shared:
raise ConanInvalidConfiguration("Windows shared builds are not supported right now, see issue #639")
def _configure_cmake(self):
cmake = CMake(self)
cmake.definitions["BENCHMARK_ENABLE_TESTING"] = "OFF"
cmake.definitions["BENCHMARK_ENABLE_GTEST_TESTS"] = "OFF"
cmake.definitions["BENCHMARK_ENABLE_LTO"] = "ON" if self.options.enable_lto else "OFF"
cmake.definitions["BENCHMARK_ENABLE_EXCEPTIONS"] = "ON" if self.options.enable_exceptions else "OFF"
# See https://github.com/google/benchmark/pull/638 for Windows 32 build explanation
if self.settings.os != "Windows":
cmake.definitions["BENCHMARK_BUILD_32_BITS"] = "ON" if "64" not in str(self.settings.arch) else "OFF"
cmake.definitions["BENCHMARK_USE_LIBCXX"] = "ON" if (str(self.settings.compiler.libcxx) == "libc++") else "OFF"
else:
cmake.definitions["BENCHMARK_USE_LIBCXX"] = "OFF"
cmake.configure(build_folder=self._build_subfolder)
return cmake
def build(self):
cmake = self._configure_cmake()
cmake.build()
def package(self):
cmake = self._configure_cmake()
cmake.install()
self.copy(pattern="LICENSE", dst="licenses")
def package_info(self):
self.cpp_info.libs = tools.collect_libs(self)
if self.settings.os == "Linux":
self.cpp_info.libs.extend(["pthread", "rt"])
elif self.settings.os == "Windows":
self.cpp_info.libs.append("shlwapi")
elif self.settings.os == "SunOS":
self.cpp_info.libs.append("kstat")


@ -1 +1 @@
theme: jekyll-theme-midnight
theme: jekyll-theme-minimal

19
docs/dependencies.md Normal file

@ -0,0 +1,19 @@
# Build tool dependency policy
To ensure the broadest compatibility when building the benchmark library, but
still allow forward progress, we require any build tooling to be available for:
* Debian stable _and_
* The last two Ubuntu LTS releases
Currently, this means using build tool versions that are available for Ubuntu
18.04 (Bionic Beaver), Ubuntu 20.04 (Focal Fossa), and Debian 11 (bullseye).
_Note, CI also runs ubuntu-16.04 and ubuntu-14.04 to ensure best effort support
for older versions._
## cmake
The current supported version is cmake 3.5.1 as of 2018-06-06.
_Note, this version is also available for Ubuntu 14.04, an older Ubuntu LTS
release, as `cmake3`._

10
docs/index.md Normal file

@ -0,0 +1,10 @@
# Benchmark
* [Assembly Tests](AssemblyTests.md)
* [Dependencies](dependencies.md)
* [Perf Counters](perf_counters.md)
* [Platform Specific Build Instructions](platform_specific_build_instructions.md)
* [Random Interleaving](random_interleaving.md)
* [Releasing](releasing.md)
* [Tools](tools.md)
* [User Guide](user_guide.md)

34
docs/perf_counters.md Normal file

@ -0,0 +1,34 @@
<a name="perf-counters" />
# User-Requested Performance Counters
When running benchmarks, the user may choose to request collection of
performance counters. This may be useful in investigation scenarios - narrowing
down the cause of a regression; or verifying that the underlying cause of a
performance improvement matches expectations.
This feature is available if:
* The benchmark is run on an architecture featuring a Performance Monitoring
Unit (PMU),
* The benchmark is compiled with support for collecting counters. Currently,
this requires [libpfm](http://perfmon2.sourceforge.net/) be available at build
time
The feature does not require modifying benchmark code. Counter collection is
handled at the boundaries where timer collection is also handled.
To opt-in:
* Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
* Enable the cmake flag BENCHMARK_ENABLE_LIBPFM.
To use, pass a comma-separated list of counter names through the
`--benchmark_perf_counters` flag. The names are decoded through libpfm - meaning,
they are platform specific, but some (e.g. `CYCLES` or `INSTRUCTIONS`) are
mapped by libpfm to platform-specifics - see libpfm
[documentation](http://perfmon2.sourceforge.net/docs.html) for more details.
The counter values are reported back through the [User Counters](../README.md#custom-counters)
mechanism, meaning, they are available in all the formats (e.g. JSON) supported
by User Counters.
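
For illustration only (not part of this change): a minimal C++ benchmark that could be built with `BENCHMARK_ENABLE_LIBPFM` and run with the new flag. The benchmark name `BM_memcpy`, the binary name `mybench`, and the counter list are placeholders; the flag and counter names come from the doc above.
```c++
#include <cstring>
#include <vector>

#include "benchmark/benchmark.h"

// Any ordinary benchmark works; no code changes are needed for perf counters.
static void BM_memcpy(benchmark::State& state) {
  std::vector<char> src(4096, 'x'), dst(4096);
  for (auto _ : state) {
    std::memcpy(dst.data(), src.data(), src.size());
    benchmark::ClobberMemory();
  }
}
BENCHMARK(BM_memcpy);
BENCHMARK_MAIN();

// Built with -DBENCHMARK_ENABLE_LIBPFM=ON, counters are requested at run time:
//   ./mybench --benchmark_perf_counters=CYCLES,INSTRUCTIONS
// and reported through the User Counters mechanism (console, JSON, CSV).
```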


@ -0,0 +1,48 @@
# Platform Specific Build Instructions
## Building with GCC
When the library is built using GCC it is necessary to link with the pthread
library due to how GCC implements `std::thread`. Failing to link to pthread will
lead to runtime exceptions (unless you're using libc++), not linker errors. See
[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
can link to pthread by adding `-pthread` to your linker command. Note, you can
also use `-lpthread`, but there are potential issues with ordering of command
line parameters if you use that.
On QNX, the pthread library is part of libc and usually included automatically
(see
[`pthread_create()`](https://www.qnx.com/developers/docs/7.1/index.html#com.qnx.doc.neutrino.lib_ref/topic/p/pthread_create.html)).
There's no separate pthread library to link.
## Building with Visual Studio 2015 or 2017
The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
```
// Alternatively, can add libraries using linker options.
#ifdef _WIN32
#pragma comment ( lib, "Shlwapi.lib" )
#ifdef _DEBUG
#pragma comment ( lib, "benchmarkd.lib" )
#else
#pragma comment ( lib, "benchmark.lib" )
#endif
#endif
```
Can also use the graphical version of CMake:
* Open `CMake GUI`.
* Under `Where to build the binaries`, same path as source plus `build`.
* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
* Click `Configure`, `Generate`, `Open Project`.
* If build fails, try deleting entire directory and starting again, or unticking options to build less.
## Building with Intel 2015 Update 1 or Intel System Studio Update 4
See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
## Building on Solaris
If you're running benchmarks on solaris, you'll want the kstat library linked in
too (`-lkstat`).


@ -0,0 +1,13 @@
<a name="interleaving" />
# Random Interleaving
[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
technique to lower run-to-run variance. It randomly interleaves repetitions of a
microbenchmark with repetitions from other microbenchmarks in the same benchmark
test. Data shows it is able to lower run-to-run variance by
[40%](https://github.com/google/benchmark/issues/1051) on average.
To use, you mainly need to set `--benchmark_enable_random_interleaving=true`,
and optionally specify non-zero repetition count `--benchmark_repetitions=9`
and optionally decrease the per-repetition time `--benchmark_min_time=0.1`.
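
As a sketch of how this is typically driven (not part of the diff; the benchmark and binary names are made up), the flags are parsed by `benchmark::Initialize` inside `BENCHMARK_MAIN()`, so no code change is needed to enable interleaving:
```c++
#include "benchmark/benchmark.h"

static void BM_Accumulate(benchmark::State& state) {
  long sum = 0;
  for (auto _ : state) {
    for (int i = 0; i < 1000; ++i) sum += i;
    benchmark::DoNotOptimize(sum);  // keep the loop from being optimized away
  }
}
BENCHMARK(BM_Accumulate);
BENCHMARK_MAIN();

// Example invocation enabling random interleaving of the 9 repetitions:
//   ./mybench --benchmark_enable_random_interleaving=true \
//             --benchmark_repetitions=9 --benchmark_min_time=0.1
```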


@ -1,6 +1,6 @@
# How to release
* Make sure you're on master and synced to HEAD
* Make sure you're on main and synced to HEAD
* Ensure the project builds and tests run (sanity check only, obviously)
* `parallel -j0 exec ::: test/*_test` can help ensure everything at least
passes
@ -8,9 +8,28 @@
* `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
commits between the last annotated tag and HEAD
* Pick the most interesting.
* Create one last commit that updates the version saved in `CMakeLists.txt` and the
`__version__` variable in `bindings/python/google_benchmark/__init__.py`to the release
version you're creating. (This version will be used if benchmark is installed from the
archive you'll be creating in the next step.)
```
project (benchmark VERSION 1.6.0 LANGUAGES CXX)
```
```python
# bindings/python/google_benchmark/__init__.py
# ...
__version__ = "1.6.0" # <-- change this to the release version you are creating
# ...
```
* Create a release through github's interface
* Note this will create a lightweight tag.
* Update this to an annotated tag:
* `git pull --tags`
* `git tag -a -f <tag> <tag>`
* `git push --force origin`
* `git push --force --tags origin`

1200
docs/user_guide.md Normal file

File diff suppressed because it is too large.


@ -34,7 +34,7 @@ static void BM_StringCopy(benchmark::State& state) {
BENCHMARK(BM_StringCopy);
// Augment the main() program to invoke benchmarks if specified
// via the --benchmarks command line flag. E.g.,
// via the --benchmark_filter command line flag. E.g.,
// my_unittest --benchmark_filter=all
// my_unittest --benchmark_filter=BM_StringCreation
// my_unittest --benchmark_filter=String
@ -42,6 +42,7 @@ BENCHMARK(BM_StringCopy);
int main(int argc, char** argv) {
benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();
return 0;
}
@ -139,13 +140,13 @@ thread exits the loop body. As such, any global setup or teardown you want to
do can be wrapped in a check against the thread index:
static void BM_MultiThreaded(benchmark::State& state) {
if (state.thread_index == 0) {
if (state.thread_index() == 0) {
// Setup code here.
}
for (auto _ : state) {
// Run the test as normal.
}
if (state.thread_index == 0) {
if (state.thread_index() == 0) {
// Teardown code here.
}
}
@ -167,12 +168,19 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
#define BENCHMARK_HAS_CXX11
#endif
// This _MSC_VER check should detect VS 2017 v15.3 and newer.
#if __cplusplus >= 201703L || \
(defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L)
#define BENCHMARK_HAS_CXX17
#endif
#include <stdint.h>
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iosfwd>
#include <limits>
#include <map>
#include <set>
#include <string>
@ -180,6 +188,7 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
#include <vector>
#if defined(BENCHMARK_HAS_CXX11)
#include <atomic>
#include <initializer_list>
#include <type_traits>
#include <utility>
@ -199,13 +208,19 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
TypeName& operator=(const TypeName&) = delete
#endif
#if defined(__GNUC__)
#ifdef BENCHMARK_HAS_CXX17
#define BENCHMARK_UNUSED [[maybe_unused]]
#elif defined(__GNUC__) || defined(__clang__)
#define BENCHMARK_UNUSED __attribute__((unused))
#else
#define BENCHMARK_UNUSED
#endif
#if defined(__GNUC__) || defined(__clang__)
#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
#define BENCHMARK_NOEXCEPT noexcept
#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
#elif defined(_MSC_VER) && !defined(__clang__)
#define BENCHMARK_UNUSED
#define BENCHMARK_ALWAYS_INLINE __forceinline
#if _MSC_VER >= 1900
#define BENCHMARK_NOEXCEPT noexcept
@ -216,7 +231,6 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
#endif
#define __func__ __FUNCTION__
#else
#define BENCHMARK_UNUSED
#define BENCHMARK_ALWAYS_INLINE
#define BENCHMARK_NOEXCEPT
#define BENCHMARK_NOEXCEPT_OP(x)
@ -225,16 +239,24 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
#define BENCHMARK_INTERNAL_TOSTRING2(x) #x
#define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x)
// clang-format off
#if defined(__GNUC__) || defined(__clang__)
#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
#define BENCHMARK_DISABLE_DEPRECATED_WARNING \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
#define BENCHMARK_RESTORE_DEPRECATED_WARNING _Pragma("GCC diagnostic pop")
#else
#define BENCHMARK_BUILTIN_EXPECT(x, y) x
#define BENCHMARK_DEPRECATED_MSG(msg)
#define BENCHMARK_WARNING_MSG(msg) \
__pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING( \
__LINE__) ") : warning note: " msg))
#define BENCHMARK_DISABLE_DEPRECATED_WARNING
#define BENCHMARK_RESTORE_DEPRECATED_WARNING
#endif
// clang-format on
#if defined(__GNUC__) && !defined(__clang__)
#define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
@ -252,21 +274,34 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
#define BENCHMARK_UNREACHABLE() ((void)0)
#endif
#ifdef BENCHMARK_HAS_CXX11
#define BENCHMARK_OVERRIDE override
#else
#define BENCHMARK_OVERRIDE
#endif
namespace benchmark {
class BenchmarkReporter;
class MemoryManager;
void Initialize(int* argc, char** argv);
void Shutdown();
// Report to stdout all arguments in 'argv' as unrecognized except the first.
// Returns true there is at least on unrecognized argument (i.e. 'argc' > 1).
bool ReportUnrecognizedArguments(int argc, char** argv);
// Returns the current value of --benchmark_filter.
std::string GetBenchmarkFilter();
// Generate a list of benchmarks matching the specified --benchmark_filter flag
// and if --benchmark_list_tests is specified return after printing the name
// of each matching benchmark. Otherwise run each matching benchmark and
// report the results.
//
// spec : Specify the benchmarks to run. If users do not specify this arg,
// then the value of FLAGS_benchmark_filter
// will be used.
//
// The second and third overload use the specified 'display_reporter' and
// 'file_reporter' respectively. 'file_reporter' will write to the file
// specified
@ -275,14 +310,70 @@ bool ReportUnrecognizedArguments(int argc, char** argv);
//
// RETURNS: The number of matching benchmarks.
size_t RunSpecifiedBenchmarks();
size_t RunSpecifiedBenchmarks(std::string spec);
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter);
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
std::string spec);
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter);
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter,
std::string spec);
// If a MemoryManager is registered (via RegisterMemoryManager()),
// it can be used to collect and report allocation metrics for a run of the
// benchmark.
class MemoryManager {
public:
static const int64_t TombstoneValue;
struct Result {
Result()
: num_allocs(0),
max_bytes_used(0),
total_allocated_bytes(TombstoneValue),
net_heap_growth(TombstoneValue) {}
// The number of allocations made in total between Start and Stop.
int64_t num_allocs;
// The peak memory use between Start and Stop.
int64_t max_bytes_used;
// The total memory allocated, in bytes, between Start and Stop.
// Init'ed to TombstoneValue if metric not available.
int64_t total_allocated_bytes;
// The net changes in memory, in bytes, between Start and Stop.
// ie., total_allocated_bytes - total_deallocated_bytes.
// Init'ed to TombstoneValue if metric not available.
int64_t net_heap_growth;
};
virtual ~MemoryManager() {}
// Implement this to start recording allocation information.
virtual void Start() = 0;
// Implement this to stop recording and fill out the given Result structure.
BENCHMARK_DEPRECATED_MSG("Use Stop(Result&) instead")
virtual void Stop(Result* result) = 0;
// FIXME(vyng): Make this pure virtual once we've migrated current users.
BENCHMARK_DISABLE_DEPRECATED_WARNING
virtual void Stop(Result& result) { Stop(&result); }
BENCHMARK_RESTORE_DEPRECATED_WARNING
};
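A sketch of a user-supplied manager against this interface; the counters are placeholders a real implementation would feed from allocator hooks, and only Start(), the Stop() overloads and TombstoneValue come from the class above.
#include <benchmark/benchmark.h>
#include <cstdint>
class SimpleMemoryManager : public benchmark::MemoryManager {
 public:
  void Start() override { num_allocs_ = 0; peak_bytes_ = 0; }
  // The pointer overload is still the pure-virtual hook in this release, so a
  // concrete manager must implement it.
  void Stop(Result* result) override {
    result->num_allocs = num_allocs_;
    result->max_bytes_used = peak_bytes_;
    // total_allocated_bytes / net_heap_growth keep their TombstoneValue
    // defaults because this sketch does not track them.
  }
 private:
  int64_t num_allocs_ = 0;  // placeholder: fed by allocator hooks in real code
  int64_t peak_bytes_ = 0;
};
// Registered once before RunSpecifiedBenchmarks():
//   static SimpleMemoryManager mm;
//   benchmark::RegisterMemoryManager(&mm);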
// Register a MemoryManager instance that will be used to collect and report
// allocation measurements for benchmark runs.
void RegisterMemoryManager(MemoryManager* memory_manager);
// Add a key-value pair to output as part of the context stanza in the report.
void AddCustomContext(const std::string& key, const std::string& value);
namespace internal {
class Benchmark;
class BenchmarkImp;
@ -305,6 +396,14 @@ BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
#define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
#endif
// Force the compiler to flush pending writes to global memory. Acts as an
// effective read/write barrier
#ifdef BENCHMARK_HAS_CXX11
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
std::atomic_signal_fence(std::memory_order_acq_rel);
}
#endif
// The DoNotOptimize(...) function can be used to prevent a value or
// expression from being optimized away by the compiler. This function is
// intended to add little to no overhead.
@ -324,11 +423,11 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
#endif
}
// Force the compiler to flush pending writes to global memory. Acts as an
// effective read/write barrier
#ifndef BENCHMARK_HAS_CXX11
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
asm volatile("" : : : "memory");
}
#endif
#elif defined(_MSC_VER)
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
@ -336,13 +435,15 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
_ReadWriteBarrier();
}
#ifndef BENCHMARK_HAS_CXX11
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
#endif
#else
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
}
// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers, before C++11.
#endif
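For reference, the usual pattern these two helpers enable inside a benchmark body (BM_VectorPush and the pushed value are illustrative).
#include <benchmark/benchmark.h>
#include <vector>
static void BM_VectorPush(benchmark::State& state) {
  for (auto _ : state) {
    std::vector<int> v;
    v.reserve(1);
    // Stop the optimizer from proving 'v' unused and deleting the work.
    benchmark::DoNotOptimize(v.data());
    v.push_back(42);
    // Make the write above observable before the iteration ends.
    benchmark::ClobberMemory();
  }
}
BENCHMARK(BM_VectorPush);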
// This class is used for user-defined counters.
@ -352,27 +453,27 @@ class Counter {
kDefaults = 0,
// Mark the counter as a rate. It will be presented divided
// by the duration of the benchmark.
kIsRate = 1U << 0U,
kIsRate = 1 << 0,
// Mark the counter as a thread-average quantity. It will be
// presented divided by the number of threads.
kAvgThreads = 1U << 1U,
kAvgThreads = 1 << 1,
// Mark the counter as a thread-average rate. See above.
kAvgThreadsRate = kIsRate | kAvgThreads,
// Mark the counter as a constant value, valid/same for *every* iteration.
// When reporting, it will be *multiplied* by the iteration count.
kIsIterationInvariant = 1U << 2U,
kIsIterationInvariant = 1 << 2,
// Mark the counter as a constant rate.
// When reporting, it will be *multiplied* by the iteration count
// and then divided by the duration of the benchmark.
kIsIterationInvariantRate = kIsRate | kIsIterationInvariant,
// Mark the counter as an iteration-average quantity.
// It will be presented divided by the number of iterations.
kAvgIterations = 1U << 3U,
kAvgIterations = 1 << 3,
// Mark the counter as an iteration-average rate. See above.
kAvgIterationsRate = kIsRate | kAvgIterations,
// In the end, invert the result. This is always done last!
kInvert = 1U << 31U
kInvert = 1 << 31
};
enum OneK {
@ -390,7 +491,7 @@ class Counter {
Counter(double v = 0., Flags f = kDefaults, OneK k = kIs1000)
: value(v), flags(f), oneK(k) {}
BENCHMARK_ALWAYS_INLINE operator double const&() const { return value; }
BENCHMARK_ALWAYS_INLINE operator double const &() const { return value; }
BENCHMARK_ALWAYS_INLINE operator double&() { return value; }
};
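As a usage reminder, the flags combine as in this sketch; BM_Parse and the byte count are invented.
#include <benchmark/benchmark.h>
static void BM_Parse(benchmark::State& state) {
  int64_t bytes = 0;
  for (auto _ : state) {
    bytes += 1024;  // stand-in for real parsing work
  }
  // Reported as a rate: divided by the duration of the benchmark.
  state.counters["bytes_per_second"] = benchmark::Counter(
      static_cast<double>(bytes), benchmark::Counter::kIsRate,
      benchmark::Counter::kIs1024);
  // Averaged over the number of iterations instead.
  state.counters["bytes_per_iter"] = benchmark::Counter(
      static_cast<double>(bytes), benchmark::Counter::kAvgIterations);
}
BENCHMARK(BM_Parse);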
@ -407,7 +508,7 @@ typedef std::map<std::string, Counter> UserCounters;
// TimeUnit is passed to a benchmark in order to specify the order of magnitude
// for the measured time.
enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond };
enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond, kSecond };
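kSecond is new in this release; opting a slow benchmark into it looks like the following (BM_LongJob is illustrative).
static void BM_LongJob(benchmark::State& state) {
  for (auto _ : state) {
    // work that takes on the order of seconds per iteration
  }
}
BENCHMARK(BM_LongJob)->Unit(benchmark::kSecond);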
// BigO is passed to a benchmark in order to specify the asymptotic
// computational
@ -417,6 +518,8 @@ enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
typedef uint64_t IterationCount;
enum StatisticUnit { kTime, kPercentage };
// BigOFunc is passed to a benchmark in order to specify the asymptotic
// computational complexity for the benchmark.
typedef double(BigOFunc)(IterationCount);
@ -429,14 +532,17 @@ namespace internal {
struct Statistics {
std::string name_;
StatisticsFunc* compute_;
StatisticUnit unit_;
Statistics(const std::string& name, StatisticsFunc* compute)
: name_(name), compute_(compute) {}
Statistics(const std::string& name, StatisticsFunc* compute,
StatisticUnit unit = kTime)
: name_(name), compute_(compute), unit_(unit) {}
};
struct BenchmarkInstance;
class BenchmarkInstance;
class ThreadTimer;
class ThreadManager;
class PerfCountersMeasurement;
enum AggregationReportMode
#if defined(BENCHMARK_HAS_CXX11)
@ -633,6 +739,14 @@ class State {
BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead")
int64_t range_y() const { return range(1); }
// Number of threads concurrently executing the benchmark.
BENCHMARK_ALWAYS_INLINE
int threads() const { return threads_; }
// Index of the executing thread. Values from [0, threads).
BENCHMARK_ALWAYS_INLINE
int thread_index() const { return thread_index_; }
BENCHMARK_ALWAYS_INLINE
IterationCount iterations() const {
if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
@ -641,8 +755,8 @@ class State {
return max_iterations - total_iterations_ + batch_leftover_;
}
private
: // items we expect on the first cache line (ie 64 bytes of the struct)
private:
// items we expect on the first cache line (ie 64 bytes of the struct)
// When total_iterations_ is 0, KeepRunning() and friends will return false.
// May be larger than max_iterations.
IterationCount total_iterations_;
@ -660,7 +774,7 @@ class State {
bool finished_;
bool error_occurred_;
private: // items we don't need on the first cache line
// items we don't need on the first cache line
std::vector<int64_t> range_;
int64_t complexity_n_;
@ -668,25 +782,27 @@ class State {
public:
// Container for user-defined counters.
UserCounters counters;
// Index of the executing thread. Values from [0, threads).
const int thread_index;
// Number of threads concurrently executing the benchmark.
const int threads;
private:
State(IterationCount max_iters, const std::vector<int64_t>& ranges,
int thread_i, int n_threads, internal::ThreadTimer* timer,
internal::ThreadManager* manager);
internal::ThreadManager* manager,
internal::PerfCountersMeasurement* perf_counters_measurement);
void StartKeepRunning();
// Implementation of KeepRunning() and KeepRunningBatch().
// is_batch must be true unless n is 1.
bool KeepRunningInternal(IterationCount n, bool is_batch);
void FinishKeepRunning();
internal::ThreadTimer* timer_;
internal::ThreadManager* manager_;
friend struct internal::BenchmarkInstance;
const int thread_index_;
const int threads_;
internal::ThreadTimer* const timer_;
internal::ThreadManager* const manager_;
internal::PerfCountersMeasurement* const perf_counters_measurement_;
friend class internal::BenchmarkInstance;
};
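With thread_index and threads now exposed as accessors rather than public members, a multi-threaded benchmark reads them as below (BM_Contended and the thread count are illustrative).
#include <benchmark/benchmark.h>
static void BM_Contended(benchmark::State& state) {
  if (state.thread_index() == 0) {
    // Per-run setup performed by exactly one of state.threads() workers.
  }
  for (auto _ : state) {
    // contended work shared by all threads of the run
  }
}
BENCHMARK(BM_Contended)->Threads(4);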
inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
@ -790,6 +906,9 @@ class Benchmark {
// Note: the following methods all return "this" so that multiple
// method calls can be chained together in one expression.
// Specify the name of the benchmark
Benchmark* Name(const std::string& name);
// Run this benchmark once with "x" as the extra argument passed
// to the function.
// REQUIRES: The function passed to the constructor must accept an arg1.
@ -850,6 +969,23 @@ class Benchmark {
return Ranges(ranges);
}
// Have "setup" and/or "teardown" invoked once for every benchmark run.
// If the benchmark is multi-threaded (will run in k threads concurrently),
// the setup callback will be invoked exactly once (not k times) before
// each run with k threads. Time allowing (e.g. for a short benchmark), there
// may be multiple such runs per benchmark, each run with its own
// "setup"/"teardown".
//
// If the benchmark uses different size groups of threads (e.g. via
// ThreadRange), the above will be true for each size group.
//
// The callback will be passed a State object, which includes the number
// of threads, thread-index, benchmark arguments, etc.
//
// The callback must not be NULL or self-deleting.
Benchmark* Setup(void (*setup)(const benchmark::State&));
Benchmark* Teardown(void (*teardown)(const benchmark::State&));
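A hedged sketch of wiring up the new hooks; DoSetup, DoTeardown and BM_WithHooks are invented names.
#include <benchmark/benchmark.h>
static void DoSetup(const benchmark::State&) {
  // Invoked once before each run, even when the run uses several threads;
  // the State argument reports, via threads(), how many workers that run has.
}
static void DoTeardown(const benchmark::State&) {
  // Invoked once after each run, mirroring DoSetup.
}
static void BM_WithHooks(benchmark::State& state) {
  for (auto _ : state) {
    // measured work
  }
}
BENCHMARK(BM_WithHooks)->Threads(2)->Setup(DoSetup)->Teardown(DoTeardown);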
// Pass this benchmark object to *func, which can customize
// the benchmark by calling various methods like Arg, Args,
// Threads, etc.
@ -918,7 +1054,9 @@ class Benchmark {
Benchmark* Complexity(BigOFunc* complexity);
// Add this statistics to be computed over all the values of benchmark run
Benchmark* ComputeStatistics(std::string name, StatisticsFunc* statistics);
Benchmark* ComputeStatistics(const std::string& name,
StatisticsFunc* statistics,
StatisticUnit unit = kTime);
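A sketch of registering an extra statistic under the extended signature; the "max" lambda is an assumption and the unit is left at its kTime default.
#include <benchmark/benchmark.h>
#include <algorithm>
#include <vector>
static void BM_Sort(benchmark::State& state) {
  for (auto _ : state) {
    // measured work
  }
}
BENCHMARK(BM_Sort)
    ->Repetitions(10)
    ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
      return *std::max_element(v.begin(), v.end());
    });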
// Support for running multiple copies of the same benchmark concurrently
// in multiple threads. This may be useful when measuring the scaling
@ -961,6 +1099,7 @@ class Benchmark {
private:
friend class BenchmarkFamilies;
friend class BenchmarkInstance;
std::string name_;
AggregationReportMode aggregation_report_mode_;
@ -979,6 +1118,10 @@ class Benchmark {
std::vector<Statistics> statistics_;
std::vector<int> thread_counts_;
typedef void (*callback_function)(const benchmark::State&);
callback_function setup_;
callback_function teardown_;
Benchmark& operator=(Benchmark const&);
};
@ -1008,7 +1151,7 @@ class FunctionBenchmark : public Benchmark {
FunctionBenchmark(const char* name, Function* func)
: Benchmark(name), func_(func) {}
virtual void Run(State& st);
virtual void Run(State& st) BENCHMARK_OVERRIDE;
private:
Function* func_;
@ -1018,7 +1161,7 @@ class FunctionBenchmark : public Benchmark {
template <class Lambda>
class LambdaBenchmark : public Benchmark {
public:
virtual void Run(State& st) { lambda_(st); }
virtual void Run(State& st) BENCHMARK_OVERRIDE { lambda_(st); }
private:
template <class OLambda>
@ -1027,8 +1170,7 @@ class LambdaBenchmark : public Benchmark {
LambdaBenchmark(LambdaBenchmark const&) = delete;
private:
template <class Lam>
template <class Lam> // NOLINTNEXTLINE(readability-redundant-declaration)
friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&);
Lambda lambda_;
@ -1070,7 +1212,7 @@ class Fixture : public internal::Benchmark {
public:
Fixture() : internal::Benchmark("") {}
virtual void Run(State& st) {
virtual void Run(State& st) BENCHMARK_OVERRIDE {
this->SetUp(st);
this->BenchmarkCase(st);
this->TearDown(st);
@ -1102,19 +1244,37 @@ class Fixture : public internal::Benchmark {
#endif
// Helpers for generating unique variable names
#ifdef BENCHMARK_HAS_CXX11
#define BENCHMARK_PRIVATE_NAME(...) \
BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, \
__VA_ARGS__)
#else
#define BENCHMARK_PRIVATE_NAME(n) \
BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
#endif // BENCHMARK_HAS_CXX11
#define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
#define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
// Helper for concatenation with macro name expansion
#define BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method) \
BaseClass##_##Method##_Benchmark
#define BENCHMARK_PRIVATE_DECLARE(n) \
static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
BENCHMARK_UNUSED
#ifdef BENCHMARK_HAS_CXX11
#define BENCHMARK(...) \
BENCHMARK_PRIVATE_DECLARE(_benchmark_) = \
(::benchmark::internal::RegisterBenchmarkInternal( \
new ::benchmark::internal::FunctionBenchmark(#__VA_ARGS__, \
&__VA_ARGS__)))
#else
#define BENCHMARK(n) \
BENCHMARK_PRIVATE_DECLARE(n) = \
(::benchmark::internal::RegisterBenchmarkInternal( \
new ::benchmark::internal::FunctionBenchmark(#n, n)))
#endif // BENCHMARK_HAS_CXX11
// Old-style macros
#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
@ -1175,49 +1335,49 @@ class Fixture : public internal::Benchmark {
#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
#endif
#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
class BaseClass##_##Method##_Benchmark : public BaseClass { \
public: \
BaseClass##_##Method##_Benchmark() : BaseClass() { \
this->SetName(#BaseClass "/" #Method); \
} \
\
protected: \
virtual void BenchmarkCase(::benchmark::State&); \
#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
class BaseClass##_##Method##_Benchmark : public BaseClass { \
public: \
BaseClass##_##Method##_Benchmark() { \
this->SetName(#BaseClass "/" #Method); \
} \
\
protected: \
virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
};
#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
class BaseClass##_##Method##_Benchmark : public BaseClass<a> { \
public: \
BaseClass##_##Method##_Benchmark() : BaseClass<a>() { \
this->SetName(#BaseClass "<" #a ">/" #Method); \
} \
\
protected: \
virtual void BenchmarkCase(::benchmark::State&); \
#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
class BaseClass##_##Method##_Benchmark : public BaseClass<a> { \
public: \
BaseClass##_##Method##_Benchmark() { \
this->SetName(#BaseClass "<" #a ">/" #Method); \
} \
\
protected: \
virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
};
#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> { \
public: \
BaseClass##_##Method##_Benchmark() : BaseClass<a, b>() { \
this->SetName(#BaseClass "<" #a "," #b ">/" #Method); \
} \
\
protected: \
virtual void BenchmarkCase(::benchmark::State&); \
#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> { \
public: \
BaseClass##_##Method##_Benchmark() { \
this->SetName(#BaseClass "<" #a "," #b ">/" #Method); \
} \
\
protected: \
virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
};
#ifdef BENCHMARK_HAS_CXX11
#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, ...) \
class BaseClass##_##Method##_Benchmark : public BaseClass<__VA_ARGS__> { \
public: \
BaseClass##_##Method##_Benchmark() : BaseClass<__VA_ARGS__>() { \
BaseClass##_##Method##_Benchmark() { \
this->SetName(#BaseClass "<" #__VA_ARGS__ ">/" #Method); \
} \
\
protected: \
virtual void BenchmarkCase(::benchmark::State&); \
virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
};
#else
#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) \
@ -1226,27 +1386,27 @@ class Fixture : public internal::Benchmark {
#define BENCHMARK_DEFINE_F(BaseClass, Method) \
BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
void BaseClass##_##Method##_Benchmark::BenchmarkCase
void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
#define BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a) \
BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
void BaseClass##_##Method##_Benchmark::BenchmarkCase
void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
#define BENCHMARK_TEMPLATE2_DEFINE_F(BaseClass, Method, a, b) \
BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
void BaseClass##_##Method##_Benchmark::BenchmarkCase
void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
#ifdef BENCHMARK_HAS_CXX11
#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, ...) \
BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
void BaseClass##_##Method##_Benchmark::BenchmarkCase
void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
#else
#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) \
BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)
#endif
#define BENCHMARK_REGISTER_F(BaseClass, Method) \
BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
BENCHMARK_PRIVATE_REGISTER_F(BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method))
#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
BENCHMARK_PRIVATE_DECLARE(TestName) = \
@ -1256,23 +1416,23 @@ class Fixture : public internal::Benchmark {
#define BENCHMARK_F(BaseClass, Method) \
BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
BENCHMARK_REGISTER_F(BaseClass, Method); \
void BaseClass##_##Method##_Benchmark::BenchmarkCase
void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
#define BENCHMARK_TEMPLATE1_F(BaseClass, Method, a) \
BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
BENCHMARK_REGISTER_F(BaseClass, Method); \
void BaseClass##_##Method##_Benchmark::BenchmarkCase
void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
#define BENCHMARK_TEMPLATE2_F(BaseClass, Method, a, b) \
BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
BENCHMARK_REGISTER_F(BaseClass, Method); \
void BaseClass##_##Method##_Benchmark::BenchmarkCase
void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
#ifdef BENCHMARK_HAS_CXX11
#define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...) \
BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
BENCHMARK_REGISTER_F(BaseClass, Method); \
void BaseClass##_##Method##_Benchmark::BenchmarkCase
void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
#else
#define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) \
BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)
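For orientation, the user-facing shape these fixture macros wrap (MyFixture and its data member are illustrative).
#include <benchmark/benchmark.h>
#include <vector>
class MyFixture : public benchmark::Fixture {
 public:
  void SetUp(const ::benchmark::State&) override { data.assign(1024, 1); }
  void TearDown(const ::benchmark::State&) override { data.clear(); }
  std::vector<int> data;
};
// Declares MyFixture_Sum_Benchmark, registers it, and lets the BenchmarkCase
// body follow inline.
BENCHMARK_F(MyFixture, Sum)(benchmark::State& state) {
  for (auto _ : state) {
    int64_t sum = 0;
    for (int v : data) sum += v;
    benchmark::DoNotOptimize(sum);
  }
}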
@ -1284,6 +1444,8 @@ class Fixture : public internal::Benchmark {
::benchmark::Initialize(&argc, argv); \
if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
::benchmark::RunSpecifiedBenchmarks(); \
::benchmark::Shutdown(); \
return 0; \
} \
int main(int, char**)
@ -1300,16 +1462,12 @@ struct CPUInfo {
int num_sharing;
};
enum Scaling {
UNKNOWN,
ENABLED,
DISABLED
};
enum Scaling { UNKNOWN, ENABLED, DISABLED };
int num_cpus;
Scaling scaling;
double cycles_per_second;
std::vector<CacheInfo> caches;
Scaling scaling;
std::vector<double> load_avg;
static const CPUInfo& Get();
@ -1368,6 +1526,7 @@ class BenchmarkReporter {
Run()
: run_type(RT_Iteration),
aggregate_unit(kTime),
error_occurred(false),
iterations(1),
threads(1),
@ -1380,15 +1539,16 @@ class BenchmarkReporter {
complexity_n(0),
report_big_o(false),
report_rms(false),
counters(),
has_memory_result(false),
allocs_per_iter(0.0),
max_bytes_used(0) {}
memory_result(NULL),
allocs_per_iter(0.0) {}
std::string benchmark_name() const;
BenchmarkName run_name;
int64_t family_index;
int64_t per_family_instance_index;
RunType run_type;
std::string aggregate_name;
StatisticUnit aggregate_unit;
std::string report_label; // Empty if not set by benchmark.
bool error_occurred;
std::string error_message;
@ -1431,9 +1591,21 @@ class BenchmarkReporter {
UserCounters counters;
// Memory metrics.
bool has_memory_result;
const MemoryManager::Result* memory_result;
double allocs_per_iter;
int64_t max_bytes_used;
};
struct PerFamilyRunReports {
PerFamilyRunReports() : num_runs_total(0), num_runs_done(0) {}
// How many runs will all instances of this benchmark perform?
int num_runs_total;
// How many runs have happened already?
int num_runs_done;
// The reports about (non-erroneous!) runs of this family.
std::vector<BenchmarkReporter::Run> Runs;
};
// Construct a BenchmarkReporter with the output stream set to 'std::cout'
@ -1503,13 +1675,10 @@ class ConsoleReporter : public BenchmarkReporter {
OO_Defaults = OO_ColorTabular
};
explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
: output_options_(opts_),
name_field_width_(0),
prev_counters_(),
printed_header_(false) {}
: output_options_(opts_), name_field_width_(0), printed_header_(false) {}
virtual bool ReportContext(const Context& context);
virtual void ReportRuns(const std::vector<Run>& reports);
virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
protected:
virtual void PrintRunData(const Run& report);
@ -1524,9 +1693,9 @@ class ConsoleReporter : public BenchmarkReporter {
class JSONReporter : public BenchmarkReporter {
public:
JSONReporter() : first_report_(true) {}
virtual bool ReportContext(const Context& context);
virtual void ReportRuns(const std::vector<Run>& reports);
virtual void Finalize();
virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
virtual void Finalize() BENCHMARK_OVERRIDE;
private:
void PrintRunData(const Run& report);
@ -1539,8 +1708,8 @@ class BENCHMARK_DEPRECATED_MSG(
: public BenchmarkReporter {
public:
CSVReporter() : printed_header_(false) {}
virtual bool ReportContext(const Context& context);
virtual void ReportRuns(const std::vector<Run>& reports);
virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
private:
void PrintRunData(const Run& report);
@ -1549,31 +1718,10 @@ class BENCHMARK_DEPRECATED_MSG(
std::set<std::string> user_counter_names_;
};
// If a MemoryManager is registered, it can be used to collect and report
// allocation metrics for a run of the benchmark.
class MemoryManager {
public:
struct Result {
Result() : num_allocs(0), max_bytes_used(0) {}
// The number of allocations made in total between Start and Stop.
int64_t num_allocs;
// The peak memory use between Start and Stop.
int64_t max_bytes_used;
};
virtual ~MemoryManager() {}
// Implement this to start recording allocation information.
virtual void Start() = 0;
// Implement this to stop recording and fill out the given Result structure.
virtual void Stop(Result* result) = 0;
};
inline const char* GetTimeUnitString(TimeUnit unit) {
switch (unit) {
case kSecond:
return "s";
case kMillisecond:
return "ms";
case kMicrosecond:
@ -1586,6 +1734,8 @@ inline const char* GetTimeUnitString(TimeUnit unit) {
inline double GetTimeUnitMultiplier(TimeUnit unit) {
switch (unit) {
case kSecond:
return 1;
case kMillisecond:
return 1e3;
case kMicrosecond:
@ -1596,6 +1746,20 @@ inline double GetTimeUnitMultiplier(TimeUnit unit) {
BENCHMARK_UNREACHABLE();
}
// Creates a list of integer values for the given range and multiplier.
// This can be used together with ArgsProduct() to allow multiple ranges
// with different multipliers.
// Example:
// ArgsProduct({
// CreateRange(0, 1024, /*multi=*/32),
// CreateRange(0, 100, /*multi=*/4),
// CreateDenseRange(0, 4, /*step=*/1),
// });
std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi);
// Creates a list of integer values for the given range and step.
std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step);
} // namespace benchmark
#endif // BENCHMARK_BENCHMARK_H_
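A concrete instance of the ArgsProduct pattern described in the comment above; BM_Grid is an invented name.
#include <benchmark/benchmark.h>
static void BM_Grid(benchmark::State& state) {
  const int64_t rows = state.range(0);
  const int64_t cols = state.range(1);
  for (auto _ : state) {
    benchmark::DoNotOptimize(rows * cols);
  }
}
// Every combination of {8, 16, ..., 1024} x {0, 1, 2, 3, 4}.
BENCHMARK(BM_Grid)->ArgsProduct({
    benchmark::CreateRange(8, 1024, /*multi=*/2),
    benchmark::CreateDenseRange(0, 4, /*step=*/1),
});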

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
numpy == 1.19.4
scipy == 1.5.4
pandas == 1.1.5

View File

@ -1,5 +1,6 @@
import os
import posixpath
import platform
import re
import shutil
import sys
@ -89,6 +90,8 @@ class BuildBazelExtension(build_ext.build_ext):
# Link with python*.lib.
for library_dir in self.library_dirs:
bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
elif sys.platform == "darwin" and platform.machine() == "x86_64":
bazel_argv.append("--macos_minimum_os=10.9")
self.spawn(bazel_argv)

View File

@ -25,32 +25,32 @@ set_target_properties(benchmark PROPERTIES
SOVERSION ${GENERIC_LIB_SOVERSION}
)
target_include_directories(benchmark PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
)
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>)
# libpfm, if available
if (HAVE_LIBPFM)
target_link_libraries(benchmark PRIVATE pfm)
add_definitions(-DHAVE_LIBPFM)
endif()
# Link threads.
target_link_libraries(benchmark ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
find_library(LIBRT rt)
if(LIBRT)
target_link_libraries(benchmark ${LIBRT})
endif()
target_link_libraries(benchmark PRIVATE Threads::Threads)
target_link_libraries(benchmark PRIVATE ${BENCHMARK_CXX_LIBRARIES})
if(HAVE_LIB_RT)
target_link_libraries(benchmark PRIVATE rt)
endif(HAVE_LIB_RT)
if(CMAKE_BUILD_TYPE)
string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER)
endif()
if(NOT CMAKE_THREAD_LIBS_INIT AND "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}" MATCHES ".*-fsanitize=[^ ]*address.*")
message(WARNING "CMake's FindThreads.cmake did not fail, but CMAKE_THREAD_LIBS_INIT ended up being empty. This was fixed in https://github.com/Kitware/CMake/commit/d53317130e84898c5328c237186dbd995aaf1c12 Let's guess that -pthread is sufficient.")
target_link_libraries(benchmark -pthread)
endif()
# We need extra libraries on Windows
if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
target_link_libraries(benchmark shlwapi)
target_link_libraries(benchmark PRIVATE shlwapi)
endif()
# We need extra libraries on Solaris
if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
target_link_libraries(benchmark kstat)
target_link_libraries(benchmark PRIVATE kstat)
endif()
# Benchmark main library
@ -61,33 +61,44 @@ set_target_properties(benchmark_main PROPERTIES
VERSION ${GENERIC_LIB_VERSION}
SOVERSION ${GENERIC_LIB_SOVERSION}
)
target_include_directories(benchmark PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
)
target_link_libraries(benchmark_main benchmark::benchmark)
target_link_libraries(benchmark_main PUBLIC benchmark::benchmark)
set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
set(generated_dir "${PROJECT_BINARY_DIR}")
set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
set(targets_to_export benchmark benchmark_main)
set(targets_export_name "${PROJECT_NAME}Targets")
set(namespace "${PROJECT_NAME}::")
include(CMakePackageConfigHelpers)
configure_package_config_file (
${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in
${project_config}
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
NO_SET_AND_CHECK_MACRO
NO_CHECK_REQUIRED_COMPONENTS_MACRO
)
write_basic_package_version_file(
"${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion
)
configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
export (
TARGETS ${targets_to_export}
NAMESPACE "${namespace}"
FILE ${generated_dir}/${targets_export_name}.cmake
)
if (BENCHMARK_ENABLE_INSTALL)
# Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
install(
TARGETS benchmark benchmark_main
TARGETS ${targets_to_export}
EXPORT ${targets_export_name}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
@ -112,3 +123,37 @@ if (BENCHMARK_ENABLE_INSTALL)
NAMESPACE "${namespace}"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
endif()
if (BENCHMARK_ENABLE_DOXYGEN)
find_package(Doxygen REQUIRED)
set(DOXYGEN_QUIET YES)
set(DOXYGEN_RECURSIVE YES)
set(DOXYGEN_GENERATE_HTML YES)
set(DOXYGEN_GENERATE_MAN NO)
set(DOXYGEN_MARKDOWN_SUPPORT YES)
set(DOXYGEN_BUILTIN_STL_SUPPORT YES)
set(DOXYGEN_EXTRACT_PACKAGE YES)
set(DOXYGEN_EXTRACT_STATIC YES)
set(DOXYGEN_SHOW_INCLUDE_FILES YES)
set(DOXYGEN_BINARY_TOC YES)
set(DOXYGEN_TOC_EXPAND YES)
set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "index.md")
doxygen_add_docs(benchmark_doxygen
docs
include
src
ALL
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
COMMENT "Building documentation with Doxygen.")
if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
install(
DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html/"
DESTINATION ${CMAKE_INSTALL_DOCDIR})
endif()
else()
if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
install(
DIRECTORY "${PROJECT_SOURCE_DIR}/docs/"
DESTINATION ${CMAKE_INSTALL_DOCDIR})
endif()
endif()

View File

@ -13,6 +13,7 @@
// limitations under the License.
#include "benchmark/benchmark.h"
#include "benchmark_api_internal.h"
#include "benchmark_runner.h"
#include "internal_macros.h"
@ -32,7 +33,10 @@
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <thread>
#include <utility>
@ -45,71 +49,85 @@
#include "internal_macros.h"
#include "log.h"
#include "mutex.h"
#include "perf_counters.h"
#include "re.h"
#include "statistics.h"
#include "string_util.h"
#include "thread_manager.h"
#include "thread_timer.h"
namespace benchmark {
// Print a list of benchmarks. This option overrides all other options.
DEFINE_bool(benchmark_list_tests, false);
BM_DEFINE_bool(benchmark_list_tests, false);
// A regular expression that specifies the set of benchmarks to execute. If
// this flag is empty, or if this flag is the string \"all\", all benchmarks
// linked into the binary are run.
DEFINE_string(benchmark_filter, ".");
BM_DEFINE_string(benchmark_filter, "");
// Minimum number of seconds we should run benchmark before results are
// considered significant. For cpu-time based tests, this is the lower bound
// on the total cpu time used by all threads that make up the test. For
// real-time based tests, this is the lower bound on the elapsed time of the
// benchmark execution, regardless of number of threads.
DEFINE_double(benchmark_min_time, 0.5);
BM_DEFINE_double(benchmark_min_time, 0.5);
// The number of runs of each benchmark. If greater than 1, the mean and
// standard deviation of the runs will be reported.
DEFINE_int32(benchmark_repetitions, 1);
BM_DEFINE_int32(benchmark_repetitions, 1);
// If set, enable random interleaving of repetitions of all benchmarks.
// See http://github.com/google/benchmark/issues/1051 for details.
BM_DEFINE_bool(benchmark_enable_random_interleaving, false);
// Report the result of each benchmark repetitions. When 'true' is specified
// only the mean, standard deviation, and other statistics are reported for
// repeated benchmarks. Affects all reporters.
DEFINE_bool(benchmark_report_aggregates_only, false);
BM_DEFINE_bool(benchmark_report_aggregates_only, false);
// Display the result of each benchmark repetitions. When 'true' is specified
// only the mean, standard deviation, and other statistics are displayed for
// repeated benchmarks. Unlike benchmark_report_aggregates_only, only affects
// the display reporter, but *NOT* file reporter, which will still contain
// all the output.
DEFINE_bool(benchmark_display_aggregates_only, false);
BM_DEFINE_bool(benchmark_display_aggregates_only, false);
// The format to use for console output.
// Valid values are 'console', 'json', or 'csv'.
DEFINE_string(benchmark_format, "console");
BM_DEFINE_string(benchmark_format, "console");
// The format to use for file output.
// Valid values are 'console', 'json', or 'csv'.
DEFINE_string(benchmark_out_format, "json");
BM_DEFINE_string(benchmark_out_format, "json");
// The file to write additional output to.
DEFINE_string(benchmark_out, "");
BM_DEFINE_string(benchmark_out, "");
// Whether to use colors in the output. Valid values:
// 'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use colors if
// the output is being sent to a terminal and the TERM environment variable is
// set to a terminal type that supports colors.
DEFINE_string(benchmark_color, "auto");
BM_DEFINE_string(benchmark_color, "auto");
// Whether to use tabular format when printing user counters to the console.
// Valid values: 'true'/'yes'/1, 'false'/'no'/0. Defaults to false.
DEFINE_bool(benchmark_counters_tabular, false);
BM_DEFINE_bool(benchmark_counters_tabular, false);
// List of additional perf counters to collect, in libpfm format. For more
// information about libpfm: https://man7.org/linux/man-pages/man3/libpfm.3.html
BM_DEFINE_string(benchmark_perf_counters, "");
// Extra context to include in the output formatted as comma-separated key-value
// pairs. Kept internal as it's only used for parsing from env/command line.
BM_DEFINE_kvpairs(benchmark_context, {});
// The level of verbose logging to output
DEFINE_int32(v, 0);
namespace benchmark {
BM_DEFINE_int32(v, 0);
namespace internal {
std::map<std::string, std::string>* global_context = nullptr;
// FIXME: wouldn't LTO mess this up?
void UseCharPointer(char const volatile*) {}
@ -117,7 +135,8 @@ void UseCharPointer(char const volatile*) {}
State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
int thread_i, int n_threads, internal::ThreadTimer* timer,
internal::ThreadManager* manager)
internal::ThreadManager* manager,
internal::PerfCountersMeasurement* perf_counters_measurement)
: total_iterations_(0),
batch_leftover_(0),
max_iterations(max_iters),
@ -126,13 +145,14 @@ State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
error_occurred_(false),
range_(ranges),
complexity_n_(0),
counters(),
thread_index(thread_i),
threads(n_threads),
thread_index_(thread_i),
threads_(n_threads),
timer_(timer),
manager_(manager) {
CHECK(max_iterations != 0) << "At least one iteration must be run";
CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
manager_(manager),
perf_counters_measurement_(perf_counters_measurement) {
BM_CHECK(max_iterations != 0) << "At least one iteration must be run";
BM_CHECK_LT(thread_index_, threads_)
<< "thread_index must be less than threads";
// Note: The use of offsetof below is technically undefined until C++17
// because State is not a standard layout type. However, all compilers
@ -161,17 +181,29 @@ State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
void State::PauseTiming() {
// Add in time accumulated so far
CHECK(started_ && !finished_ && !error_occurred_);
BM_CHECK(started_ && !finished_ && !error_occurred_);
timer_->StopTimer();
if (perf_counters_measurement_) {
auto measurements = perf_counters_measurement_->StopAndGetMeasurements();
for (const auto& name_and_measurement : measurements) {
auto name = name_and_measurement.first;
auto measurement = name_and_measurement.second;
BM_CHECK_EQ(counters[name], 0.0);
counters[name] = Counter(measurement, Counter::kAvgIterations);
}
}
}
void State::ResumeTiming() {
CHECK(started_ && !finished_ && !error_occurred_);
BM_CHECK(started_ && !finished_ && !error_occurred_);
timer_->StartTimer();
if (perf_counters_measurement_) {
perf_counters_measurement_->Start();
}
}
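These are the hooks behind the usual exclude-setup-from-timing idiom; note they now also stop and restart the optional perf counters. BM_ProcessBatch and its input are invented.
#include <benchmark/benchmark.h>
#include <vector>
static void BM_ProcessBatch(benchmark::State& state) {
  for (auto _ : state) {
    state.PauseTiming();  // clock (and perf counters) stop here
    std::vector<int> input(static_cast<std::size_t>(state.range(0)), 1);
    state.ResumeTiming();  // restart before the measured part
    int64_t sum = 0;
    for (int v : input) sum += v;
    benchmark::DoNotOptimize(sum);
  }
}
BENCHMARK(BM_ProcessBatch)->Arg(1 << 10);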
void State::SkipWithError(const char* msg) {
CHECK(msg);
BM_CHECK(msg);
error_occurred_ = true;
{
MutexLock l(manager_->GetBenchmarkMutex());
@ -194,7 +226,7 @@ void State::SetLabel(const char* label) {
}
void State::StartKeepRunning() {
CHECK(!started_ && !finished_);
BM_CHECK(!started_ && !finished_);
started_ = true;
total_iterations_ = error_occurred_ ? 0 : max_iterations;
manager_->StartStopBarrier();
@ -202,7 +234,7 @@ void State::StartKeepRunning() {
}
void State::FinishKeepRunning() {
CHECK(started_ && (!finished_ || error_occurred_));
BM_CHECK(started_ && (!finished_ || error_occurred_));
if (!error_occurred_) {
PauseTiming();
}
@ -215,11 +247,42 @@ void State::FinishKeepRunning() {
namespace internal {
namespace {
// Flushes streams after invoking reporter methods that write to them. This
// ensures users get timely updates even when streams are not line-buffered.
void FlushStreams(BenchmarkReporter* reporter) {
if (!reporter) return;
std::flush(reporter->GetOutputStream());
std::flush(reporter->GetErrorStream());
}
// Reports in both display and file reporters.
void Report(BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter, const RunResults& run_results) {
auto report_one = [](BenchmarkReporter* reporter, bool aggregates_only,
const RunResults& results) {
assert(reporter);
// If there are no aggregates, do output non-aggregates.
aggregates_only &= !results.aggregates_only.empty();
if (!aggregates_only) reporter->ReportRuns(results.non_aggregates);
if (!results.aggregates_only.empty())
reporter->ReportRuns(results.aggregates_only);
};
report_one(display_reporter, run_results.display_report_aggregates_only,
run_results);
if (file_reporter)
report_one(file_reporter, run_results.file_report_aggregates_only,
run_results);
FlushStreams(display_reporter);
FlushStreams(file_reporter);
}
void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter) {
// Note the file_reporter can be null.
CHECK(display_reporter != nullptr);
BM_CHECK(display_reporter != nullptr);
// Determine the width of the name field using a minimum width of 10.
bool might_have_aggregates = FLAGS_benchmark_repetitions > 1;
@ -227,10 +290,10 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
size_t stat_field_width = 0;
for (const BenchmarkInstance& benchmark : benchmarks) {
name_field_width =
std::max<size_t>(name_field_width, benchmark.name.str().size());
might_have_aggregates |= benchmark.repetitions > 1;
std::max<size_t>(name_field_width, benchmark.name().str().size());
might_have_aggregates |= benchmark.repetitions() > 1;
for (const auto& Stat : *benchmark.statistics)
for (const auto& Stat : benchmark.statistics())
stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
}
if (might_have_aggregates) name_field_width += 1 + stat_field_width;
@ -239,56 +302,82 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
BenchmarkReporter::Context context;
context.name_field_width = name_field_width;
// Keep track of running times of all instances of current benchmark
std::vector<BenchmarkReporter::Run> complexity_reports;
// We flush streams after invoking reporter methods that write to them. This
// ensures users get timely updates even when streams are not line-buffered.
auto flushStreams = [](BenchmarkReporter* reporter) {
if (!reporter) return;
std::flush(reporter->GetOutputStream());
std::flush(reporter->GetErrorStream());
};
// Keep track of running times of all instances of each benchmark family.
std::map<int /*family_index*/, BenchmarkReporter::PerFamilyRunReports>
per_family_reports;
if (display_reporter->ReportContext(context) &&
(!file_reporter || file_reporter->ReportContext(context))) {
flushStreams(display_reporter);
flushStreams(file_reporter);
FlushStreams(display_reporter);
FlushStreams(file_reporter);
for (const auto& benchmark : benchmarks) {
RunResults run_results = RunBenchmark(benchmark, &complexity_reports);
size_t num_repetitions_total = 0;
auto report = [&run_results](BenchmarkReporter* reporter,
bool report_aggregates_only) {
assert(reporter);
// If there are no aggregates, do output non-aggregates.
report_aggregates_only &= !run_results.aggregates_only.empty();
if (!report_aggregates_only)
reporter->ReportRuns(run_results.non_aggregates);
if (!run_results.aggregates_only.empty())
reporter->ReportRuns(run_results.aggregates_only);
};
std::vector<internal::BenchmarkRunner> runners;
runners.reserve(benchmarks.size());
for (const BenchmarkInstance& benchmark : benchmarks) {
BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
if (benchmark.complexity() != oNone)
reports_for_family = &per_family_reports[benchmark.family_index()];
report(display_reporter, run_results.display_report_aggregates_only);
if (file_reporter)
report(file_reporter, run_results.file_report_aggregates_only);
runners.emplace_back(benchmark, reports_for_family);
int num_repeats_of_this_instance = runners.back().GetNumRepeats();
num_repetitions_total += num_repeats_of_this_instance;
if (reports_for_family)
reports_for_family->num_runs_total += num_repeats_of_this_instance;
}
assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
flushStreams(display_reporter);
flushStreams(file_reporter);
std::vector<size_t> repetition_indices;
repetition_indices.reserve(num_repetitions_total);
for (size_t runner_index = 0, num_runners = runners.size();
runner_index != num_runners; ++runner_index) {
const internal::BenchmarkRunner& runner = runners[runner_index];
std::fill_n(std::back_inserter(repetition_indices),
runner.GetNumRepeats(), runner_index);
}
assert(repetition_indices.size() == num_repetitions_total &&
"Unexpected number of repetition indexes.");
if (FLAGS_benchmark_enable_random_interleaving) {
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);
}
for (size_t repetition_index : repetition_indices) {
internal::BenchmarkRunner& runner = runners[repetition_index];
runner.DoOneRepetition();
if (runner.HasRepeatsRemaining()) continue;
// FIXME: report each repetition separately, not all of them in bulk.
RunResults run_results = runner.GetResults();
// Maybe calculate complexity report
if (const auto* reports_for_family = runner.GetReportsForFamily()) {
if (reports_for_family->num_runs_done ==
reports_for_family->num_runs_total) {
auto additional_run_stats = ComputeBigO(reports_for_family->Runs);
run_results.aggregates_only.insert(run_results.aggregates_only.end(),
additional_run_stats.begin(),
additional_run_stats.end());
per_family_reports.erase(
static_cast<int>(reports_for_family->Runs.front().family_index));
}
}
Report(display_reporter, file_reporter, run_results);
}
}
display_reporter->Finalize();
if (file_reporter) file_reporter->Finalize();
flushStreams(display_reporter);
flushStreams(file_reporter);
FlushStreams(display_reporter);
FlushStreams(file_reporter);
}
// Disable deprecated warnings temporarily because we need to reference
// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
BENCHMARK_DISABLE_DEPRECATED_WARNING
std::unique_ptr<BenchmarkReporter> CreateReporter(
std::string const& name, ConsoleReporter::OutputOptions output_opts) {
@ -305,9 +394,7 @@ std::unique_ptr<BenchmarkReporter> CreateReporter(
}
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
BENCHMARK_RESTORE_DEPRECATED_WARNING
} // end namespace
@ -342,16 +429,32 @@ ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
} // end namespace internal
size_t RunSpecifiedBenchmarks() {
return RunSpecifiedBenchmarks(nullptr, nullptr);
return RunSpecifiedBenchmarks(nullptr, nullptr, FLAGS_benchmark_filter);
}
size_t RunSpecifiedBenchmarks(std::string spec) {
return RunSpecifiedBenchmarks(nullptr, nullptr, std::move(spec));
}
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter) {
return RunSpecifiedBenchmarks(display_reporter, nullptr);
return RunSpecifiedBenchmarks(display_reporter, nullptr,
FLAGS_benchmark_filter);
}
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
std::string spec) {
return RunSpecifiedBenchmarks(display_reporter, nullptr, std::move(spec));
}
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter) {
std::string spec = FLAGS_benchmark_filter;
return RunSpecifiedBenchmarks(display_reporter, file_reporter,
FLAGS_benchmark_filter);
}
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter,
std::string spec) {
if (spec.empty() || spec == "all")
spec = "."; // Regexp that matches all benchmarks
@ -377,7 +480,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
if (!fname.empty()) {
output_file.open(fname);
if (!output_file.is_open()) {
Err << "invalid file name: '" << fname << std::endl;
Err << "invalid file name: '" << fname << "'" << std::endl;
std::exit(1);
}
if (!file_reporter) {
@ -399,7 +502,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
if (FLAGS_benchmark_list_tests) {
for (auto const& benchmark : benchmarks)
Out << benchmark.name.str() << "\n";
Out << benchmark.name().str() << "\n";
} else {
internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
}
@ -407,10 +510,22 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
return benchmarks.size();
}
std::string GetBenchmarkFilter() { return FLAGS_benchmark_filter; }
void RegisterMemoryManager(MemoryManager* manager) {
internal::memory_manager = manager;
}
void AddCustomContext(const std::string& key, const std::string& value) {
if (internal::global_context == nullptr) {
internal::global_context = new std::map<std::string, std::string>();
}
if (!internal::global_context->emplace(key, value).second) {
std::cerr << "Failed to add custom context \"" << key << "\" as it already "
<< "exists with value \"" << value << "\"\n";
}
}
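The same function backs the new --benchmark_context flag; direct use from a custom main might look like this (the key/value strings are arbitrary examples).
#include <benchmark/benchmark.h>
int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);
  // Appears in the report's context block next to the CPU and cache info.
  benchmark::AddCustomContext("build_type", "Release");
  benchmark::AddCustomContext("dataset", "synthetic-v2");
  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();
  return 0;
}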
namespace internal {
void PrintUsageAndExit() {
@ -420,6 +535,7 @@ void PrintUsageAndExit() {
" [--benchmark_filter=<regex>]\n"
" [--benchmark_min_time=<min_time>]\n"
" [--benchmark_repetitions=<num_repetitions>]\n"
" [--benchmark_enable_random_interleaving={true|false}]\n"
" [--benchmark_report_aggregates_only={true|false}]\n"
" [--benchmark_display_aggregates_only={true|false}]\n"
" [--benchmark_format=<console|json|csv>]\n"
@ -427,6 +543,8 @@ void PrintUsageAndExit() {
" [--benchmark_out_format=<json|console|csv>]\n"
" [--benchmark_color={auto|true|false}]\n"
" [--benchmark_counters_tabular={true|false}]\n"
" [--benchmark_perf_counters=<counter>,...]\n"
" [--benchmark_context=<key>=<value>,...]\n"
" [--v=<verbosity>]\n");
exit(0);
}
@ -443,6 +561,8 @@ void ParseCommandLineFlags(int* argc, char** argv) {
&FLAGS_benchmark_min_time) ||
ParseInt32Flag(argv[i], "benchmark_repetitions",
&FLAGS_benchmark_repetitions) ||
ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
&FLAGS_benchmark_enable_random_interleaving) ||
ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
&FLAGS_benchmark_report_aggregates_only) ||
ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
@ -452,11 +572,12 @@ void ParseCommandLineFlags(int* argc, char** argv) {
ParseStringFlag(argv[i], "benchmark_out_format",
&FLAGS_benchmark_out_format) ||
ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) ||
// "color_print" is the deprecated name for "benchmark_color".
// TODO: Remove this.
ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) ||
ParseBoolFlag(argv[i], "benchmark_counters_tabular",
&FLAGS_benchmark_counters_tabular) ||
ParseStringFlag(argv[i], "benchmark_perf_counters",
&FLAGS_benchmark_perf_counters) ||
ParseKeyValueFlag(argv[i], "benchmark_context",
&FLAGS_benchmark_context) ||
ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
@ -467,13 +588,17 @@ void ParseCommandLineFlags(int* argc, char** argv) {
}
}
for (auto const* flag :
{&FLAGS_benchmark_format, &FLAGS_benchmark_out_format})
{&FLAGS_benchmark_format, &FLAGS_benchmark_out_format}) {
if (*flag != "console" && *flag != "json" && *flag != "csv") {
PrintUsageAndExit();
}
}
if (FLAGS_benchmark_color.empty()) {
PrintUsageAndExit();
}
for (const auto& kv : FLAGS_benchmark_context) {
AddCustomContext(kv.first, kv.second);
}
}
int InitializeStreams() {
@ -488,6 +613,8 @@ void Initialize(int* argc, char** argv) {
internal::LogLevel() = FLAGS_v;
}
void Shutdown() { delete internal::global_context; }
bool ReportUnrecognizedArguments(int argc, char** argv) {
for (int i = 1; i < argc; ++i) {
fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0],

View File

@ -1,15 +1,112 @@
#include "benchmark_api_internal.h"
#include <cinttypes>
#include "string_util.h"
namespace benchmark {
namespace internal {
State BenchmarkInstance::Run(IterationCount iters, int thread_id,
internal::ThreadTimer* timer,
internal::ThreadManager* manager) const {
State st(iters, arg, thread_id, threads, timer, manager);
benchmark->Run(st);
BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
int per_family_instance_idx,
const std::vector<int64_t>& args,
int thread_count)
: benchmark_(*benchmark),
family_index_(family_idx),
per_family_instance_index_(per_family_instance_idx),
aggregation_report_mode_(benchmark_.aggregation_report_mode_),
args_(args),
time_unit_(benchmark_.time_unit_),
measure_process_cpu_time_(benchmark_.measure_process_cpu_time_),
use_real_time_(benchmark_.use_real_time_),
use_manual_time_(benchmark_.use_manual_time_),
complexity_(benchmark_.complexity_),
complexity_lambda_(benchmark_.complexity_lambda_),
statistics_(benchmark_.statistics_),
repetitions_(benchmark_.repetitions_),
min_time_(benchmark_.min_time_),
iterations_(benchmark_.iterations_),
threads_(thread_count) {
name_.function_name = benchmark_.name_;
size_t arg_i = 0;
for (const auto& arg : args) {
if (!name_.args.empty()) {
name_.args += '/';
}
if (arg_i < benchmark->arg_names_.size()) {
const auto& arg_name = benchmark_.arg_names_[arg_i];
if (!arg_name.empty()) {
name_.args += StrFormat("%s:", arg_name.c_str());
}
}
name_.args += StrFormat("%" PRId64, arg);
++arg_i;
}
if (!IsZero(benchmark->min_time_)) {
name_.min_time = StrFormat("min_time:%0.3f", benchmark_.min_time_);
}
if (benchmark_.iterations_ != 0) {
name_.iterations = StrFormat(
"iterations:%lu", static_cast<unsigned long>(benchmark_.iterations_));
}
if (benchmark_.repetitions_ != 0) {
name_.repetitions = StrFormat("repeats:%d", benchmark_.repetitions_);
}
if (benchmark_.measure_process_cpu_time_) {
name_.time_type = "process_time";
}
if (benchmark_.use_manual_time_) {
if (!name_.time_type.empty()) {
name_.time_type += '/';
}
name_.time_type += "manual_time";
} else if (benchmark_.use_real_time_) {
if (!name_.time_type.empty()) {
name_.time_type += '/';
}
name_.time_type += "real_time";
}
if (!benchmark_.thread_counts_.empty()) {
name_.threads = StrFormat("threads:%d", threads_);
}
setup_ = benchmark_.setup_;
teardown_ = benchmark_.teardown_;
}
State BenchmarkInstance::Run(
IterationCount iters, int thread_id, internal::ThreadTimer* timer,
internal::ThreadManager* manager,
internal::PerfCountersMeasurement* perf_counters_measurement) const {
State st(iters, args_, thread_id, threads_, timer, manager,
perf_counters_measurement);
benchmark_.Run(st);
return st;
}
} // internal
} // benchmark
void BenchmarkInstance::Setup() const {
if (setup_) {
State st(/*iters*/ 1, args_, /*thread_id*/ 0, threads_, nullptr, nullptr,
nullptr);
setup_(st);
}
}
void BenchmarkInstance::Teardown() const {
if (teardown_) {
State st(/*iters*/ 1, args_, /*thread_id*/ 0, threads_, nullptr, nullptr,
nullptr);
teardown_(st);
}
}
} // namespace internal
} // namespace benchmark

View File

@ -1,9 +1,6 @@
#ifndef BENCHMARK_API_INTERNAL_H
#define BENCHMARK_API_INTERNAL_H
#include "benchmark/benchmark.h"
#include "commandlineflags.h"
#include <cmath>
#include <iosfwd>
#include <limits>
@ -11,32 +8,66 @@
#include <string>
#include <vector>
#include "benchmark/benchmark.h"
#include "commandlineflags.h"
namespace benchmark {
namespace internal {
// Information kept per benchmark we may want to run
struct BenchmarkInstance {
BenchmarkName name;
Benchmark* benchmark;
AggregationReportMode aggregation_report_mode;
std::vector<int64_t> arg;
TimeUnit time_unit;
int range_multiplier;
bool measure_process_cpu_time;
bool use_real_time;
bool use_manual_time;
BigO complexity;
BigOFunc* complexity_lambda;
UserCounters counters;
const std::vector<Statistics>* statistics;
bool last_benchmark_instance;
int repetitions;
double min_time;
IterationCount iterations;
int threads; // Number of concurrent threads to use
class BenchmarkInstance {
public:
BenchmarkInstance(Benchmark* benchmark, int family_index,
int per_family_instance_index,
const std::vector<int64_t>& args, int threads);
const BenchmarkName& name() const { return name_; }
int family_index() const { return family_index_; }
int per_family_instance_index() const { return per_family_instance_index_; }
AggregationReportMode aggregation_report_mode() const {
return aggregation_report_mode_;
}
TimeUnit time_unit() const { return time_unit_; }
bool measure_process_cpu_time() const { return measure_process_cpu_time_; }
bool use_real_time() const { return use_real_time_; }
bool use_manual_time() const { return use_manual_time_; }
BigO complexity() const { return complexity_; }
BigOFunc* complexity_lambda() const { return complexity_lambda_; }
const std::vector<Statistics>& statistics() const { return statistics_; }
int repetitions() const { return repetitions_; }
double min_time() const { return min_time_; }
IterationCount iterations() const { return iterations_; }
int threads() const { return threads_; }
void Setup() const;
void Teardown() const;
State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
internal::ThreadManager* manager) const;
internal::ThreadManager* manager,
internal::PerfCountersMeasurement* perf_counters_measurement) const;
private:
BenchmarkName name_;
Benchmark& benchmark_;
const int family_index_;
const int per_family_instance_index_;
AggregationReportMode aggregation_report_mode_;
const std::vector<int64_t>& args_;
TimeUnit time_unit_;
bool measure_process_cpu_time_;
bool use_real_time_;
bool use_manual_time_;
BigO complexity_;
BigOFunc* complexity_lambda_;
UserCounters counters_;
const std::vector<Statistics>& statistics_;
int repetitions_;
double min_time_;
IterationCount iterations_;
int threads_; // Number of concurrent threads to use
typedef void (*callback_function)(const benchmark::State&);
callback_function setup_ = nullptr;
callback_function teardown_ = nullptr;
};
bool FindBenchmarksInternal(const std::string& re,

View File

@ -24,6 +24,7 @@
#include <algorithm>
#include <atomic>
#include <cinttypes>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
@ -35,11 +36,6 @@
#include <sstream>
#include <thread>
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include "benchmark/benchmark.h"
#include "benchmark_api_internal.h"
#include "check.h"
@ -115,7 +111,7 @@ void BenchmarkFamilies::ClearBenchmarks() {
bool BenchmarkFamilies::FindBenchmarks(
std::string spec, std::vector<BenchmarkInstance>* benchmarks,
std::ostream* ErrStream) {
CHECK(ErrStream);
BM_CHECK(ErrStream);
auto& Err = *ErrStream;
// Make regular expression out of command-line flag
std::string error_msg;
@ -133,8 +129,13 @@ bool BenchmarkFamilies::FindBenchmarks(
// Special list of thread counts to use when none are specified
const std::vector<int> one_thread = {1};
int next_family_index = 0;
MutexLock l(mutex_);
for (std::unique_ptr<Benchmark>& family : families_) {
int family_index = next_family_index;
int per_family_instance_index = 0;
// Family was deleted or benchmark doesn't match
if (!family) continue;
@ -154,84 +155,24 @@ bool BenchmarkFamilies::FindBenchmarks(
}
// reserve in the special case the regex ".", since we know the final
// family size.
if (spec == ".") benchmarks->reserve(family_size);
if (spec == ".") benchmarks->reserve(benchmarks->size() + family_size);
for (auto const& args : family->args_) {
for (int num_threads : *thread_counts) {
BenchmarkInstance instance;
instance.name.function_name = family->name_;
instance.benchmark = family.get();
instance.aggregation_report_mode = family->aggregation_report_mode_;
instance.arg = args;
instance.time_unit = family->time_unit_;
instance.range_multiplier = family->range_multiplier_;
instance.min_time = family->min_time_;
instance.iterations = family->iterations_;
instance.repetitions = family->repetitions_;
instance.measure_process_cpu_time = family->measure_process_cpu_time_;
instance.use_real_time = family->use_real_time_;
instance.use_manual_time = family->use_manual_time_;
instance.complexity = family->complexity_;
instance.complexity_lambda = family->complexity_lambda_;
instance.statistics = &family->statistics_;
instance.threads = num_threads;
BenchmarkInstance instance(family.get(), family_index,
per_family_instance_index, args,
num_threads);
// Add arguments to instance name
size_t arg_i = 0;
for (auto const& arg : args) {
if (!instance.name.args.empty()) {
instance.name.args += '/';
}
if (arg_i < family->arg_names_.size()) {
const auto& arg_name = family->arg_names_[arg_i];
if (!arg_name.empty()) {
instance.name.args += StrFormat("%s:", arg_name.c_str());
}
}
instance.name.args += StrFormat("%" PRId64, arg);
++arg_i;
}
if (!IsZero(family->min_time_))
instance.name.min_time =
StrFormat("min_time:%0.3f", family->min_time_);
if (family->iterations_ != 0) {
instance.name.iterations =
StrFormat("iterations:%lu",
static_cast<unsigned long>(family->iterations_));
}
if (family->repetitions_ != 0)
instance.name.repetitions =
StrFormat("repeats:%d", family->repetitions_);
if (family->measure_process_cpu_time_) {
instance.name.time_type = "process_time";
}
if (family->use_manual_time_) {
if (!instance.name.time_type.empty()) {
instance.name.time_type += '/';
}
instance.name.time_type += "manual_time";
} else if (family->use_real_time_) {
if (!instance.name.time_type.empty()) {
instance.name.time_type += '/';
}
instance.name.time_type += "real_time";
}
// Add the number of threads used to the name
if (!family->thread_counts_.empty()) {
instance.name.threads = StrFormat("threads:%d", instance.threads);
}
const auto full_name = instance.name.str();
const auto full_name = instance.name().str();
if ((re.Match(full_name) && !isNegativeFilter) ||
(!re.Match(full_name) && isNegativeFilter)) {
instance.last_benchmark_instance = (&args == &family->args_.back());
benchmarks->push_back(std::move(instance));
++per_family_instance_index;
// Only bump the next family index once we've established that
// at least one instance of this family will be run.
if (next_family_index == family_index) ++next_family_index;
}
}
}
@ -270,16 +211,24 @@ Benchmark::Benchmark(const char* name)
use_real_time_(false),
use_manual_time_(false),
complexity_(oNone),
complexity_lambda_(nullptr) {
complexity_lambda_(nullptr),
setup_(nullptr),
teardown_(nullptr) {
ComputeStatistics("mean", StatisticsMean);
ComputeStatistics("median", StatisticsMedian);
ComputeStatistics("stddev", StatisticsStdDev);
ComputeStatistics("cv", StatisticsCV, kPercentage);
}
Benchmark::~Benchmark() {}
Benchmark* Benchmark::Name(const std::string& name) {
SetName(name.c_str());
return this;
}
Benchmark* Benchmark::Arg(int64_t x) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
args_.push_back({x});
return this;
}
@ -290,7 +239,7 @@ Benchmark* Benchmark::Unit(TimeUnit unit) {
}
Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
std::vector<int64_t> arglist;
AddRange(&arglist, start, limit, range_multiplier_);
@ -302,7 +251,7 @@ Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
Benchmark* Benchmark::Ranges(
const std::vector<std::pair<int64_t, int64_t>>& ranges) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
std::vector<std::vector<int64_t>> arglists(ranges.size());
for (std::size_t i = 0; i < ranges.size(); i++) {
AddRange(&arglists[i], ranges[i].first, ranges[i].second,
@ -316,7 +265,7 @@ Benchmark* Benchmark::Ranges(
Benchmark* Benchmark::ArgsProduct(
const std::vector<std::vector<int64_t>>& arglists) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
std::vector<std::size_t> indices(arglists.size());
const std::size_t total = std::accumulate(
@ -343,20 +292,20 @@ Benchmark* Benchmark::ArgsProduct(
}
Benchmark* Benchmark::ArgName(const std::string& name) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
arg_names_ = {name};
return this;
}
Benchmark* Benchmark::ArgNames(const std::vector<std::string>& names) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
arg_names_ = names;
return this;
}
Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
CHECK_LE(start, limit);
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
BM_CHECK_LE(start, limit);
for (int64_t arg = start; arg <= limit; arg += step) {
args_.push_back({arg});
}
@ -364,7 +313,7 @@ Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
}
Benchmark* Benchmark::Args(const std::vector<int64_t>& args) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
args_.push_back(args);
return this;
}
@ -374,28 +323,40 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
return this;
}
Benchmark* Benchmark::Setup(void (*setup)(const benchmark::State&)) {
BM_CHECK(setup != nullptr);
setup_ = setup;
return this;
}
Benchmark* Benchmark::Teardown(void (*teardown)(const benchmark::State&)) {
BM_CHECK(teardown != nullptr);
teardown_ = teardown;
return this;
}
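The Setup/Teardown hooks registered above are invoked around each benchmark run (DoOneRepetition below calls them around DoNIterations). A minimal usage sketch, not part of this patch (BM_Sample, DoSetup and DoTeardown are illustrative names):
#include <benchmark/benchmark.h>
static void DoSetup(const benchmark::State&) {
  // acquire resources before the measured iterations run
}
static void DoTeardown(const benchmark::State&) {
  // release whatever DoSetup acquired
}
static void BM_Sample(benchmark::State& state) {
  for (auto _ : state) {
    // workload under test
  }
}
BENCHMARK(BM_Sample)->Setup(DoSetup)->Teardown(DoTeardown);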
Benchmark* Benchmark::RangeMultiplier(int multiplier) {
CHECK(multiplier > 1);
BM_CHECK(multiplier > 1);
range_multiplier_ = multiplier;
return this;
}
Benchmark* Benchmark::MinTime(double t) {
CHECK(t > 0.0);
CHECK(iterations_ == 0);
BM_CHECK(t > 0.0);
BM_CHECK(iterations_ == 0);
min_time_ = t;
return this;
}
Benchmark* Benchmark::Iterations(IterationCount n) {
CHECK(n > 0);
CHECK(IsZero(min_time_));
BM_CHECK(n > 0);
BM_CHECK(IsZero(min_time_));
iterations_ = n;
return this;
}
Benchmark* Benchmark::Repetitions(int n) {
CHECK(n > 0);
BM_CHECK(n > 0);
repetitions_ = n;
return this;
}
@ -428,14 +389,14 @@ Benchmark* Benchmark::MeasureProcessCPUTime() {
}
Benchmark* Benchmark::UseRealTime() {
CHECK(!use_manual_time_)
BM_CHECK(!use_manual_time_)
<< "Cannot set UseRealTime and UseManualTime simultaneously.";
use_real_time_ = true;
return this;
}
Benchmark* Benchmark::UseManualTime() {
CHECK(!use_real_time_)
BM_CHECK(!use_real_time_)
<< "Cannot set UseRealTime and UseManualTime simultaneously.";
use_manual_time_ = true;
return this;
@ -452,21 +413,22 @@ Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
return this;
}
Benchmark* Benchmark::ComputeStatistics(std::string name,
StatisticsFunc* statistics) {
statistics_.emplace_back(name, statistics);
Benchmark* Benchmark::ComputeStatistics(const std::string& name,
StatisticsFunc* statistics,
StatisticUnit unit) {
statistics_.emplace_back(name, statistics, unit);
return this;
}
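Usage sketch for the extended overload above; the third StatisticUnit argument is the new part (kPercentage is what the new built-in "cv" statistic uses). MaxOf is an illustrative name and BM_Sample reuses the sketch above:
#include <algorithm>
#include <vector>
#include <benchmark/benchmark.h>
static double MaxOf(const std::vector<double>& v) {
  return *std::max_element(v.begin(), v.end());
}
BENCHMARK(BM_Sample)
    ->Repetitions(10)
    ->ComputeStatistics("max", MaxOf, benchmark::StatisticUnit::kTime);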
Benchmark* Benchmark::Threads(int t) {
CHECK_GT(t, 0);
BM_CHECK_GT(t, 0);
thread_counts_.push_back(t);
return this;
}
Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
CHECK_GT(min_threads, 0);
CHECK_GE(max_threads, min_threads);
BM_CHECK_GT(min_threads, 0);
BM_CHECK_GE(max_threads, min_threads);
AddRange(&thread_counts_, min_threads, max_threads, 2);
return this;
@ -474,9 +436,9 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads,
int stride) {
CHECK_GT(min_threads, 0);
CHECK_GE(max_threads, min_threads);
CHECK_GE(stride, 1);
BM_CHECK_GT(min_threads, 0);
BM_CHECK_GE(max_threads, min_threads);
BM_CHECK_GE(stride, 1);
for (auto i = min_threads; i < max_threads; i += stride) {
thread_counts_.push_back(i);
@ -512,4 +474,19 @@ void ClearRegisteredBenchmarks() {
internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks();
}
std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi) {
std::vector<int64_t> args;
internal::AddRange(&args, lo, hi, multi);
return args;
}
std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step) {
BM_CHECK_LE(start, limit);
std::vector<int64_t> args;
for (int64_t arg = start; arg <= limit; arg += step) {
args.push_back(arg);
}
return args;
}
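The two helpers above are the building blocks for ArgsProduct; a usage sketch (again reusing the illustrative BM_Sample):
BENCHMARK(BM_Sample)->ArgsProduct({
    benchmark::CreateRange(8, 128, /*multi=*/2),    // {8, 16, 32, 64, 128}
    benchmark::CreateDenseRange(1, 4, /*step=*/1),  // {1, 2, 3, 4}
});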
} // end namespace benchmark


@ -1,6 +1,7 @@
#ifndef BENCHMARK_REGISTER_H
#define BENCHMARK_REGISTER_H
#include <limits>
#include <vector>
#include "check.h"
@ -11,18 +12,18 @@ namespace internal {
// Append the powers of 'mult' in the closed interval [lo, hi].
// Returns iterator to the start of the inserted range.
template <typename T>
typename std::vector<T>::iterator
AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
CHECK_GE(lo, 0);
CHECK_GE(hi, lo);
CHECK_GE(mult, 2);
typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
int mult) {
BM_CHECK_GE(lo, 0);
BM_CHECK_GE(hi, lo);
BM_CHECK_GE(mult, 2);
const size_t start_offset = dst->size();
static const T kmax = std::numeric_limits<T>::max();
// Space out the values in multiples of "mult"
for (T i = 1; i <= hi; i *= mult) {
for (T i = static_cast<T>(1); i <= hi; i *= mult) {
if (i >= lo) {
dst->push_back(i);
}
@ -37,10 +38,10 @@ AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
template <typename T>
void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
// We negate lo and hi so we require that they cannot be equal to 'min'.
CHECK_GT(lo, std::numeric_limits<T>::min());
CHECK_GT(hi, std::numeric_limits<T>::min());
CHECK_GE(hi, lo);
CHECK_LE(hi, 0);
BM_CHECK_GT(lo, std::numeric_limits<T>::min());
BM_CHECK_GT(hi, std::numeric_limits<T>::min());
BM_CHECK_GE(hi, lo);
BM_CHECK_LE(hi, 0);
// Add positive powers, then negate and reverse.
// Casts necessary since small integers get promoted
@ -59,8 +60,8 @@ void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
"Args type must be a signed integer");
CHECK_GE(hi, lo);
CHECK_GE(mult, 2);
BM_CHECK_GE(hi, lo);
BM_CHECK_GE(mult, 2);
// Add "lo"
dst->push_back(lo);
@ -86,7 +87,7 @@ void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
}
// Treat 0 as a special case (see discussion on #762).
if (lo <= 0 && hi >= 0) {
if (lo < 0 && hi >= 0) {
dst->push_back(0);
}


@ -13,6 +13,7 @@
// limitations under the License.
#include "benchmark_runner.h"
#include "benchmark/benchmark.h"
#include "benchmark_api_internal.h"
#include "internal_macros.h"
@ -45,6 +46,7 @@
#include "internal_macros.h"
#include "log.h"
#include "mutex.h"
#include "perf_counters.h"
#include "re.h"
#include "statistics.h"
#include "string_util.h"
@ -65,59 +67,64 @@ BenchmarkReporter::Run CreateRunReport(
const benchmark::internal::BenchmarkInstance& b,
const internal::ThreadManager::Result& results,
IterationCount memory_iterations,
const MemoryManager::Result& memory_result, double seconds,
int64_t repetition_index) {
const MemoryManager::Result* memory_result, double seconds,
int64_t repetition_index, int64_t repeats) {
// Create report about this benchmark run.
BenchmarkReporter::Run report;
report.run_name = b.name;
report.run_name = b.name();
report.family_index = b.family_index();
report.per_family_instance_index = b.per_family_instance_index();
report.error_occurred = results.has_error_;
report.error_message = results.error_message_;
report.report_label = results.report_label_;
// This is the total iterations across all threads.
report.iterations = results.iterations;
report.time_unit = b.time_unit;
report.threads = b.threads;
report.time_unit = b.time_unit();
report.threads = b.threads();
report.repetition_index = repetition_index;
report.repetitions = b.repetitions;
report.repetitions = repeats;
if (!report.error_occurred) {
if (b.use_manual_time) {
if (b.use_manual_time()) {
report.real_accumulated_time = results.manual_time_used;
} else {
report.real_accumulated_time = results.real_time_used;
}
report.cpu_accumulated_time = results.cpu_time_used;
report.complexity_n = results.complexity_n;
report.complexity = b.complexity;
report.complexity_lambda = b.complexity_lambda;
report.statistics = b.statistics;
report.complexity = b.complexity();
report.complexity_lambda = b.complexity_lambda();
report.statistics = &b.statistics();
report.counters = results.counters;
if (memory_iterations > 0) {
report.has_memory_result = true;
assert(memory_result != nullptr);
report.memory_result = memory_result;
report.allocs_per_iter =
memory_iterations ? static_cast<double>(memory_result.num_allocs) /
memory_iterations ? static_cast<double>(memory_result->num_allocs) /
memory_iterations
: 0;
report.max_bytes_used = memory_result.max_bytes_used;
}
internal::Finish(&report.counters, results.iterations, seconds, b.threads);
internal::Finish(&report.counters, results.iterations, seconds,
b.threads());
}
return report;
}
// Execute one thread of benchmark b for the specified number of iterations.
// Adds the stats collected for the thread into *total.
// Adds the stats collected for the thread into manager->results.
void RunInThread(const BenchmarkInstance* b, IterationCount iters,
int thread_id, ThreadManager* manager) {
int thread_id, ThreadManager* manager,
PerfCountersMeasurement* perf_counters_measurement) {
internal::ThreadTimer timer(
b->measure_process_cpu_time
b->measure_process_cpu_time()
? internal::ThreadTimer::CreateProcessCpuTime()
: internal::ThreadTimer::Create());
State st = b->Run(iters, thread_id, &timer, manager);
CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
State st =
b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
BM_CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
<< "Benchmark returned before State::KeepRunning() returned false!";
{
MutexLock l(manager->GetBenchmarkMutex());
@ -132,229 +139,219 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
manager->NotifyThreadComplete();
}
class BenchmarkRunner {
public:
BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
std::vector<BenchmarkReporter::Run>* complexity_reports_)
: b(b_),
complexity_reports(*complexity_reports_),
min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time),
repeats(b.repetitions != 0 ? b.repetitions
: FLAGS_benchmark_repetitions),
has_explicit_iteration_count(b.iterations != 0),
pool(b.threads - 1),
iters(has_explicit_iteration_count ? b.iterations : 1) {
run_results.display_report_aggregates_only =
(FLAGS_benchmark_report_aggregates_only ||
FLAGS_benchmark_display_aggregates_only);
run_results.file_report_aggregates_only =
FLAGS_benchmark_report_aggregates_only;
if (b.aggregation_report_mode != internal::ARM_Unspecified) {
run_results.display_report_aggregates_only =
(b.aggregation_report_mode &
internal::ARM_DisplayReportAggregatesOnly);
run_results.file_report_aggregates_only =
(b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly);
}
for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
DoOneRepetition(repetition_num);
}
// Calculate additional statistics
run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
// Maybe calculate complexity report
if ((b.complexity != oNone) && b.last_benchmark_instance) {
auto additional_run_stats = ComputeBigO(complexity_reports);
run_results.aggregates_only.insert(run_results.aggregates_only.end(),
additional_run_stats.begin(),
additional_run_stats.end());
complexity_reports.clear();
}
}
RunResults&& get_results() { return std::move(run_results); }
private:
RunResults run_results;
const benchmark::internal::BenchmarkInstance& b;
std::vector<BenchmarkReporter::Run>& complexity_reports;
const double min_time;
const int repeats;
const bool has_explicit_iteration_count;
std::vector<std::thread> pool;
IterationCount iters; // preserved between repetitions!
// So only the first repetition has to find/calculate it,
// the other repetitions will just use that precomputed iteration count.
struct IterationResults {
internal::ThreadManager::Result results;
IterationCount iters;
double seconds;
};
IterationResults DoNIterations() {
VLOG(2) << "Running " << b.name.str() << " for " << iters << "\n";
std::unique_ptr<internal::ThreadManager> manager;
manager.reset(new internal::ThreadManager(b.threads));
// Run all but one thread in separate threads
for (std::size_t ti = 0; ti < pool.size(); ++ti) {
pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
manager.get());
}
// And run one thread here directly.
// (If we were asked to run just one thread, we don't create new threads.)
// Yes, we need to do this here *after* we start the separate threads.
RunInThread(&b, iters, 0, manager.get());
// The main thread has finished. Now let's wait for the other threads.
manager->WaitForAllThreads();
for (std::thread& thread : pool) thread.join();
IterationResults i;
// Acquire the measurements/counters from the manager, UNDER THE LOCK!
{
MutexLock l(manager->GetBenchmarkMutex());
i.results = manager->results;
}
// And get rid of the manager.
manager.reset();
// Adjust real/manual time stats since they were reported per thread.
i.results.real_time_used /= b.threads;
i.results.manual_time_used /= b.threads;
// If we were measuring whole-process CPU usage, adjust the CPU time too.
if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads;
VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
<< i.results.real_time_used << "\n";
// So for how long were we running?
i.iters = iters;
// Base decisions off of real time if requested by this benchmark.
i.seconds = i.results.cpu_time_used;
if (b.use_manual_time) {
i.seconds = i.results.manual_time_used;
} else if (b.use_real_time) {
i.seconds = i.results.real_time_used;
}
return i;
}
IterationCount PredictNumItersNeeded(const IterationResults& i) const {
// See how much the iteration count should be increased by.
// Note: Avoid division by zero with max(seconds, 1ns).
double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
// If our last run was at least 10% of FLAGS_benchmark_min_time then we
// use the multiplier directly.
// Otherwise we use at most 10 times expansion.
// NOTE: When the last run was at least 10% of the min time the max
// expansion should be 14x.
bool is_significant = (i.seconds / min_time) > 0.1;
multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
if (multiplier <= 1.0) multiplier = 2.0;
// So what seems to be the sufficiently-large iteration count? Round up.
const IterationCount max_next_iters = static_cast<IterationCount>(
std::lround(std::max(multiplier * static_cast<double>(i.iters),
static_cast<double>(i.iters) + 1.0)));
// But we do have *some* sanity limits though..
const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
return next_iters; // round up before conversion to integer.
}
bool ShouldReportIterationResults(const IterationResults& i) const {
// Determine if this run should be reported:
// either it has run for a sufficient amount of time,
// or an error was reported.
return i.results.has_error_ ||
i.iters >= kMaxIterations || // Too many iterations already.
i.seconds >= min_time || // The elapsed time is large enough.
// CPU time is specified but the elapsed real time greatly exceeds
// the minimum time.
// Note that user-provided timers are exempt from this sanity check.
((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time);
}
void DoOneRepetition(int64_t repetition_index) {
const bool is_the_first_repetition = repetition_index == 0;
IterationResults i;
// We *may* be gradually increasing the length (iteration count)
// of the benchmark until we decide the results are significant.
// And once we do, we report those last results and exit.
// Please do note that if there are repetitions, the iteration count
// is *only* calculated for the *first* repetition, and other repetitions
// simply use that precomputed iteration count.
for (;;) {
i = DoNIterations();
// Do we consider the results to be significant?
// If we are doing repetitions, and the first repetition was already done,
// it has calculated the correct iteration time, so we have run that very
// iteration count just now. No need to calculate anything. Just report.
// Else, the normal rules apply.
const bool results_are_significant = !is_the_first_repetition ||
has_explicit_iteration_count ||
ShouldReportIterationResults(i);
if (results_are_significant) break; // Good, let's report them!
// Nope, bad iteration. Let's re-estimate the hopefully-sufficient
// iteration count, and run the benchmark again...
iters = PredictNumItersNeeded(i);
assert(iters > i.iters &&
"if we did more iterations than we want to do the next time, "
"then we should have accepted the current iteration run.");
}
// Oh, one last thing, we need to also produce the 'memory measurements'..
MemoryManager::Result memory_result;
IterationCount memory_iterations = 0;
if (memory_manager != nullptr) {
// Only run a few iterations to reduce the impact of one-time
// allocations in benchmarks that are not properly managed.
memory_iterations = std::min<IterationCount>(16, iters);
memory_manager->Start();
std::unique_ptr<internal::ThreadManager> manager;
manager.reset(new internal::ThreadManager(1));
RunInThread(&b, memory_iterations, 0, manager.get());
manager->WaitForAllThreads();
manager.reset();
memory_manager->Stop(&memory_result);
}
// Ok, now actually report.
BenchmarkReporter::Run report =
CreateRunReport(b, i.results, memory_iterations, memory_result,
i.seconds, repetition_index);
if (!report.error_occurred && b.complexity != oNone)
complexity_reports.push_back(report);
run_results.non_aggregates.push_back(report);
}
};
} // end namespace
RunResults RunBenchmark(
const benchmark::internal::BenchmarkInstance& b,
std::vector<BenchmarkReporter::Run>* complexity_reports) {
internal::BenchmarkRunner r(b, complexity_reports);
return r.get_results();
BenchmarkRunner::BenchmarkRunner(
const benchmark::internal::BenchmarkInstance& b_,
BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
: b(b_),
reports_for_family(reports_for_family_),
min_time(!IsZero(b.min_time()) ? b.min_time() : FLAGS_benchmark_min_time),
repeats(b.repetitions() != 0 ? b.repetitions()
: FLAGS_benchmark_repetitions),
has_explicit_iteration_count(b.iterations() != 0),
pool(b.threads() - 1),
iters(has_explicit_iteration_count ? b.iterations() : 1),
perf_counters_measurement(
PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))),
perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
? &perf_counters_measurement
: nullptr) {
run_results.display_report_aggregates_only =
(FLAGS_benchmark_report_aggregates_only ||
FLAGS_benchmark_display_aggregates_only);
run_results.file_report_aggregates_only =
FLAGS_benchmark_report_aggregates_only;
if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
run_results.display_report_aggregates_only =
(b.aggregation_report_mode() &
internal::ARM_DisplayReportAggregatesOnly);
run_results.file_report_aggregates_only =
(b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
perf_counters_measurement.IsValid())
<< "Perf counters were requested but could not be set up.";
}
}
BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";
std::unique_ptr<internal::ThreadManager> manager;
manager.reset(new internal::ThreadManager(b.threads()));
// Run all but one thread in separate threads
for (std::size_t ti = 0; ti < pool.size(); ++ti) {
pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
manager.get(), perf_counters_measurement_ptr);
}
// And run one thread here directly.
// (If we were asked to run just one thread, we don't create new threads.)
// Yes, we need to do this here *after* we start the separate threads.
RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);
// The main thread has finished. Now let's wait for the other threads.
manager->WaitForAllThreads();
for (std::thread& thread : pool) thread.join();
IterationResults i;
// Acquire the measurements/counters from the manager, UNDER THE LOCK!
{
MutexLock l(manager->GetBenchmarkMutex());
i.results = manager->results;
}
// And get rid of the manager.
manager.reset();
// Adjust real/manual time stats since they were reported per thread.
i.results.real_time_used /= b.threads();
i.results.manual_time_used /= b.threads();
// If we were measuring whole-process CPU usage, adjust the CPU time too.
if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
<< i.results.real_time_used << "\n";
// By using KeepRunningBatch a benchmark can iterate more times than
// requested, so take the iteration count from i.results.
i.iters = i.results.iterations / b.threads();
// Base decisions off of real time if requested by this benchmark.
i.seconds = i.results.cpu_time_used;
if (b.use_manual_time()) {
i.seconds = i.results.manual_time_used;
} else if (b.use_real_time()) {
i.seconds = i.results.real_time_used;
}
return i;
}
IterationCount BenchmarkRunner::PredictNumItersNeeded(
const IterationResults& i) const {
// See how much the iteration count should be increased by.
// Note: Avoid division by zero with max(seconds, 1ns).
double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
// If our last run was at least 10% of FLAGS_benchmark_min_time then we
// use the multiplier directly.
// Otherwise we use at most 10 times expansion.
// NOTE: When the last run was at least 10% of the min time the max
// expansion should be 14x.
bool is_significant = (i.seconds / min_time) > 0.1;
multiplier = is_significant ? multiplier : 10.0;
// So what seems to be the sufficiently-large iteration count? Round up.
const IterationCount max_next_iters = static_cast<IterationCount>(
std::lround(std::max(multiplier * static_cast<double>(i.iters),
static_cast<double>(i.iters) + 1.0)));
// But we do have *some* sanity limits though..
const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
return next_iters; // round up before conversion to integer.
}
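A worked pass through the estimator above, with illustrative numbers: with min_time = 0.5 s, if the first probe ran 1000 iterations in 0.004 s, the raw multiplier would be 0.5 * 1.4 / 0.004 = 175; but 0.004 / 0.5 = 0.008 is below the 10% significance threshold, so the multiplier is clamped to 10 and the next attempt runs max(10 * 1000, 1001) = 10000 iterations, still subject to the kMaxIterations cap.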
bool BenchmarkRunner::ShouldReportIterationResults(
const IterationResults& i) const {
// Determine if this run should be reported:
// either it has run for a sufficient amount of time,
// or an error was reported.
return i.results.has_error_ ||
i.iters >= kMaxIterations || // Too many iterations already.
i.seconds >= min_time || // The elapsed time is large enough.
// CPU time is specified but the elapsed real time greatly exceeds
// the minimum time.
// Note that user-provided timers are exempt from this sanity check.
((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time());
}
void BenchmarkRunner::DoOneRepetition() {
assert(HasRepeatsRemaining() && "Already done all repetitions?");
const bool is_the_first_repetition = num_repetitions_done == 0;
IterationResults i;
// We *may* be gradually increasing the length (iteration count)
// of the benchmark until we decide the results are significant.
// And once we do, we report those last results and exit.
// Please do note that if there are repetitions, the iteration count
// is *only* calculated for the *first* repetition, and other repetitions
// simply use that precomputed iteration count.
for (;;) {
b.Setup();
i = DoNIterations();
b.Teardown();
// Do we consider the results to be significant?
// If we are doing repetitions, and the first repetition was already done,
// it has calculated the correct iteration time, so we have run that very
// iteration count just now. No need to calculate anything. Just report.
// Else, the normal rules apply.
const bool results_are_significant = !is_the_first_repetition ||
has_explicit_iteration_count ||
ShouldReportIterationResults(i);
if (results_are_significant) break; // Good, let's report them!
// Nope, bad iteration. Let's re-estimate the hopefully-sufficient
// iteration count, and run the benchmark again...
iters = PredictNumItersNeeded(i);
assert(iters > i.iters &&
"if we did more iterations than we want to do the next time, "
"then we should have accepted the current iteration run.");
}
// Oh, one last thing, we need to also produce the 'memory measurements'..
MemoryManager::Result* memory_result = nullptr;
IterationCount memory_iterations = 0;
if (memory_manager != nullptr) {
// TODO(vyng): Consider making BenchmarkReporter::Run::memory_result an
// optional so we don't have to own the Result here.
// Can't do it now due to cxx03.
memory_results.push_back(MemoryManager::Result());
memory_result = &memory_results.back();
// Only run a few iterations to reduce the impact of one-time
// allocations in benchmarks that are not properly managed.
memory_iterations = std::min<IterationCount>(16, iters);
memory_manager->Start();
std::unique_ptr<internal::ThreadManager> manager;
manager.reset(new internal::ThreadManager(1));
b.Setup();
RunInThread(&b, memory_iterations, 0, manager.get(),
perf_counters_measurement_ptr);
manager->WaitForAllThreads();
manager.reset();
b.Teardown();
BENCHMARK_DISABLE_DEPRECATED_WARNING
memory_manager->Stop(memory_result);
BENCHMARK_RESTORE_DEPRECATED_WARNING
}
// Ok, now actually report.
BenchmarkReporter::Run report =
CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
num_repetitions_done, repeats);
if (reports_for_family) {
++reports_for_family->num_runs_done;
if (!report.error_occurred) reports_for_family->Runs.push_back(report);
}
run_results.non_aggregates.push_back(report);
++num_repetitions_done;
}
RunResults&& BenchmarkRunner::GetResults() {
assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
// Calculate additional statistics over the repetitions of this instance.
run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
return std::move(run_results);
}
} // end namespace internal


@ -15,19 +15,22 @@
#ifndef BENCHMARK_RUNNER_H_
#define BENCHMARK_RUNNER_H_
#include <thread>
#include <vector>
#include "benchmark_api_internal.h"
#include "internal_macros.h"
DECLARE_double(benchmark_min_time);
DECLARE_int32(benchmark_repetitions);
DECLARE_bool(benchmark_report_aggregates_only);
DECLARE_bool(benchmark_display_aggregates_only);
#include "perf_counters.h"
#include "thread_manager.h"
namespace benchmark {
BM_DECLARE_double(benchmark_min_time);
BM_DECLARE_int32(benchmark_repetitions);
BM_DECLARE_bool(benchmark_report_aggregates_only);
BM_DECLARE_bool(benchmark_display_aggregates_only);
BM_DECLARE_string(benchmark_perf_counters);
namespace internal {
extern MemoryManager* memory_manager;
@ -40,9 +43,59 @@ struct RunResults {
bool file_report_aggregates_only = false;
};
RunResults RunBenchmark(
const benchmark::internal::BenchmarkInstance& b,
std::vector<BenchmarkReporter::Run>* complexity_reports);
class BenchmarkRunner {
public:
BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
BenchmarkReporter::PerFamilyRunReports* reports_for_family);
int GetNumRepeats() const { return repeats; }
bool HasRepeatsRemaining() const {
return GetNumRepeats() != num_repetitions_done;
}
void DoOneRepetition();
RunResults&& GetResults();
BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
return reports_for_family;
}
private:
RunResults run_results;
const benchmark::internal::BenchmarkInstance& b;
BenchmarkReporter::PerFamilyRunReports* reports_for_family;
const double min_time;
const int repeats;
const bool has_explicit_iteration_count;
int num_repetitions_done = 0;
std::vector<std::thread> pool;
std::vector<MemoryManager::Result> memory_results;
IterationCount iters; // preserved between repetitions!
// So only the first repetition has to find/calculate it,
// the other repetitions will just use that precomputed iteration count.
PerfCountersMeasurement perf_counters_measurement;
PerfCountersMeasurement* const perf_counters_measurement_ptr;
struct IterationResults {
internal::ThreadManager::Result results;
IterationCount iters;
double seconds;
};
IterationResults DoNIterations();
IterationCount PredictNumItersNeeded(const IterationResults& i) const;
bool ShouldReportIterationResults(const IterationResults& i) const;
};
} // namespace internal
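For orientation, the extracted BenchmarkRunner is now driven from the caller side roughly like this (a sketch under the assumption that the interleaving loop lives in benchmark.cc; 'instance' stands for a BenchmarkInstance produced by FindBenchmarks):
void RunOneBenchmark(const internal::BenchmarkInstance& instance) {
  internal::BenchmarkRunner runner(instance, /*reports_for_family=*/nullptr);
  while (runner.HasRepeatsRemaining()) runner.DoOneRepetition();
  RunResults results = runner.GetResults();
  (void)results;  // handed off to the reporters in the real driver
}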


@ -23,8 +23,9 @@ BENCHMARK_NORETURN inline void CallAbortHandler() {
std::abort(); // fallback to enforce noreturn
}
// CheckHandler is the class constructed by failing CHECK macros. CheckHandler
// will log information about the failures and abort when it is destructed.
// CheckHandler is the class constructed by failing BM_CHECK macros.
// CheckHandler will log information about the failures and abort when it is
// destructed.
class CheckHandler {
public:
CheckHandler(const char* check, const char* file, const char* func, int line)
@ -35,10 +36,17 @@ class CheckHandler {
LogType& GetLog() { return log_; }
#if defined(COMPILER_MSVC)
#pragma warning(push)
#pragma warning(disable : 4722)
#endif
BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
log_ << std::endl;
CallAbortHandler();
}
#if defined(COMPILER_MSVC)
#pragma warning(pop)
#endif
CheckHandler& operator=(const CheckHandler&) = delete;
CheckHandler(const CheckHandler&) = delete;
@ -51,32 +59,32 @@ class CheckHandler {
} // end namespace internal
} // end namespace benchmark
// The CHECK macro returns a std::ostream object that can have extra information
// written to it.
// The BM_CHECK macro returns a std::ostream object that can have extra
// information written to it.
#ifndef NDEBUG
#define CHECK(b) \
#define BM_CHECK(b) \
(b ? ::benchmark::internal::GetNullLogInstance() \
: ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
.GetLog())
#else
#define CHECK(b) ::benchmark::internal::GetNullLogInstance()
#define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance()
#endif
// clang-format off
// preserve whitespace between operators for alignment
#define CHECK_EQ(a, b) CHECK((a) == (b))
#define CHECK_NE(a, b) CHECK((a) != (b))
#define CHECK_GE(a, b) CHECK((a) >= (b))
#define CHECK_LE(a, b) CHECK((a) <= (b))
#define CHECK_GT(a, b) CHECK((a) > (b))
#define CHECK_LT(a, b) CHECK((a) < (b))
#define BM_CHECK_EQ(a, b) BM_CHECK((a) == (b))
#define BM_CHECK_NE(a, b) BM_CHECK((a) != (b))
#define BM_CHECK_GE(a, b) BM_CHECK((a) >= (b))
#define BM_CHECK_LE(a, b) BM_CHECK((a) <= (b))
#define BM_CHECK_GT(a, b) BM_CHECK((a) > (b))
#define BM_CHECK_LT(a, b) BM_CHECK((a) < (b))
#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) < (eps))
#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) > (eps))
#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) > (eps))
#define BM_CHECK_FLOAT_EQ(a, b, eps) BM_CHECK(std::fabs((a) - (b)) < (eps))
#define BM_CHECK_FLOAT_NE(a, b, eps) BM_CHECK(std::fabs((a) - (b)) >= (eps))
#define BM_CHECK_FLOAT_GE(a, b, eps) BM_CHECK((a) - (b) > -(eps))
#define BM_CHECK_FLOAT_LE(a, b, eps) BM_CHECK((b) - (a) > -(eps))
#define BM_CHECK_FLOAT_GT(a, b, eps) BM_CHECK((a) - (b) > (eps))
#define BM_CHECK_FLOAT_LT(a, b, eps) BM_CHECK((b) - (a) > (eps))
// clang-format on
#endif // CHECK_H_
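Internal call sites keep the stream form after the rename, e.g. (illustrative, mirroring the thread-range checks in benchmark_register.cc):
BM_CHECK_GE(max_threads, min_threads)
    << "thread range is inverted: " << min_threads << " > " << max_threads;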


@ -25,8 +25,8 @@
#include "internal_macros.h"
#ifdef BENCHMARK_OS_WINDOWS
#include <windows.h>
#include <io.h>
#include <windows.h>
#else
#include <unistd.h>
#endif // BENCHMARK_OS_WINDOWS
@ -94,7 +94,7 @@ std::string FormatString(const char* msg, va_list args) {
va_end(args_cp);
// currently there is no error handling for failure, so this is a hack.
CHECK(ret >= 0);
BM_CHECK(ret >= 0);
if (ret == 0) // handle empty expansion
return {};
@ -102,10 +102,10 @@ std::string FormatString(const char* msg, va_list args) {
return local_buff;
else {
// we did not provide a long enough buffer on our first attempt.
size = (size_t)ret + 1; // + 1 for the null byte
size = static_cast<size_t>(ret) + 1; // + 1 for the null byte
std::unique_ptr<char[]> buff(new char[size]);
ret = vsnprintf(buff.get(), size, msg, args);
CHECK(ret > 0 && ((size_t)ret) < size);
BM_CHECK(ret > 0 && (static_cast<size_t>(ret)) < size);
return buff.get();
}
}


@ -20,6 +20,10 @@
#include <cstring>
#include <iostream>
#include <limits>
#include <map>
#include <utility>
#include "../src/string_util.h"
namespace benchmark {
namespace {
@ -78,6 +82,30 @@ bool ParseDouble(const std::string& src_text, const char* str, double* value) {
return true;
}
// Parses 'str' into KV pairs. If successful, writes the result to *value and
// returns true; otherwise leaves *value unchanged and returns false.
bool ParseKvPairs(const std::string& src_text, const char* str,
std::map<std::string, std::string>* value) {
std::map<std::string, std::string> kvs;
for (const auto& kvpair : StrSplit(str, ',')) {
const auto kv = StrSplit(kvpair, '=');
if (kv.size() != 2) {
std::cerr << src_text << " is expected to be a comma-separated list of "
<< "<key>=<value> strings, but actually has value \"" << str
<< "\".\n";
return false;
}
if (!kvs.emplace(kv[0], kv[1]).second) {
std::cerr << src_text << " is expected to contain unique keys but key \""
<< kv[0] << "\" was repeated.\n";
return false;
}
}
*value = kvs;
return true;
}
// Returns the name of the environment variable corresponding to the
// given flag. For example, FlagToEnvVar("foo") will return
// "BENCHMARK_FOO" in the open-source version.
@ -129,6 +157,20 @@ const char* StringFromEnv(const char* flag, const char* default_val) {
return value == nullptr ? default_val : value;
}
std::map<std::string, std::string> KvPairsFromEnv(
const char* flag, std::map<std::string, std::string> default_val) {
const std::string env_var = FlagToEnvVar(flag);
const char* const value_str = getenv(env_var.c_str());
if (value_str == nullptr) return default_val;
std::map<std::string, std::string> value;
if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) {
return default_val;
}
return value;
}
// Parses a string as a command line flag. The string should have
// the format "--flag=value". When def_optional is true, the "=value"
// part can be omitted.
@ -206,6 +248,21 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
return true;
}
bool ParseKeyValueFlag(const char* str, const char* flag,
std::map<std::string, std::string>* value) {
const char* const value_str = ParseFlagValue(str, flag, false);
if (value_str == nullptr) return false;
for (const auto& kvpair : StrSplit(value_str, ',')) {
const auto kv = StrSplit(kvpair, '=');
if (kv.size() != 2) return false;
value->emplace(kv[0], kv[1]);
}
return true;
}
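A usage sketch for the new key/value parser (the flag name here mirrors the context flag this release appears to add; treat it as an assumption):
std::map<std::string, std::string> ctx;
if (benchmark::ParseKeyValueFlag("--benchmark_context=pid=1234,host=ci",
                                 "benchmark_context", &ctx)) {
  // ctx now holds {"pid" -> "1234", "host" -> "ci"}
}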
bool IsFlag(const char* str, const char* flag) {
return (ParseFlagValue(str, flag, true) != nullptr);
}


@ -2,61 +2,70 @@
#define BENCHMARK_COMMANDLINEFLAGS_H_
#include <cstdint>
#include <map>
#include <string>
// Macro for referencing flags.
#define FLAG(name) FLAGS_##name
// Macros for declaring flags.
#define DECLARE_bool(name) extern bool FLAG(name)
#define DECLARE_int32(name) extern int32_t FLAG(name)
#define DECLARE_double(name) extern double FLAG(name)
#define DECLARE_string(name) extern std::string FLAG(name)
#define BM_DECLARE_bool(name) extern bool FLAG(name)
#define BM_DECLARE_int32(name) extern int32_t FLAG(name)
#define BM_DECLARE_double(name) extern double FLAG(name)
#define BM_DECLARE_string(name) extern std::string FLAG(name)
#define BM_DECLARE_kvpairs(name) \
extern std::map<std::string, std::string> FLAG(name)
// Macros for defining flags.
#define DEFINE_bool(name, default_val) \
bool FLAG(name) = \
benchmark::BoolFromEnv(#name, default_val)
#define DEFINE_int32(name, default_val) \
int32_t FLAG(name) = \
benchmark::Int32FromEnv(#name, default_val)
#define DEFINE_double(name, default_val) \
double FLAG(name) = \
benchmark::DoubleFromEnv(#name, default_val)
#define DEFINE_string(name, default_val) \
std::string FLAG(name) = \
benchmark::StringFromEnv(#name, default_val)
#define BM_DEFINE_bool(name, default_val) \
bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
#define BM_DEFINE_int32(name, default_val) \
int32_t FLAG(name) = benchmark::Int32FromEnv(#name, default_val)
#define BM_DEFINE_double(name, default_val) \
double FLAG(name) = benchmark::DoubleFromEnv(#name, default_val)
#define BM_DEFINE_string(name, default_val) \
std::string FLAG(name) = benchmark::StringFromEnv(#name, default_val)
#define BM_DEFINE_kvpairs(name, default_val) \
std::map<std::string, std::string> FLAG(name) = \
benchmark::KvPairsFromEnv(#name, default_val)
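With the BM_ prefix, wiring up a flag looks like this (sketch with a hypothetical flag name):
BM_DECLARE_string(my_output_label);            // in a header
BM_DEFINE_string(my_output_label, "default");  // in exactly one .cc file
// read anywhere via the FLAG() macro, i.e. FLAGS_my_output_label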
namespace benchmark {
// Parses a bool from the environment variable
// corresponding to the given flag.
// Parses a bool from the environment variable corresponding to the given flag.
//
// If the variable exists, returns IsTruthyFlagValue() value; if not,
// returns the given default value.
bool BoolFromEnv(const char* flag, bool default_val);
// Parses an Int32 from the environment variable
// corresponding to the given flag.
// Parses an Int32 from the environment variable corresponding to the given
// flag.
//
// If the variable exists, returns ParseInt32() value; if not, returns
// the given default value.
int32_t Int32FromEnv(const char* flag, int32_t default_val);
// Parses a Double from the environment variable
// corresponding to the given flag.
// Parses a Double from the environment variable corresponding to the given
// flag.
//
// If the variable exists, returns ParseDouble(); if not, returns
// the given default value.
double DoubleFromEnv(const char* flag, double default_val);
// Parses a string from the environment variable
// corresponding to the given flag.
// Parses a string from the environment variable corresponding to the given
// flag.
//
// If variable exists, returns its value; if not, returns
// the given default value.
const char* StringFromEnv(const char* flag, const char* default_val);
// Parses a set of kvpairs from the environment variable corresponding to the
// given flag.
//
// If variable exists, returns its value; if not, returns
// the given default value.
std::map<std::string, std::string> KvPairsFromEnv(
const char* flag, std::map<std::string, std::string> default_val);
// Parses a string for a bool flag, in the form of either
// "--flag=value" or "--flag".
//
@ -68,27 +77,31 @@ const char* StringFromEnv(const char* flag, const char* default_val);
// true. On failure, returns false without changing *value.
bool ParseBoolFlag(const char* str, const char* flag, bool* value);
// Parses a string for an Int32 flag, in the form of
// "--flag=value".
// Parses a string for an Int32 flag, in the form of "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
// Parses a string for a Double flag, in the form of
// "--flag=value".
// Parses a string for a Double flag, in the form of "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
bool ParseDoubleFlag(const char* str, const char* flag, double* value);
// Parses a string for a string flag, in the form of
// "--flag=value".
// Parses a string for a string flag, in the form of "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
bool ParseStringFlag(const char* str, const char* flag, std::string* value);
// Parses a string for a kvpairs flag in the form "--flag=key=value,key=value"
//
// On success, stores the value of the flag in *value and returns true. On
// failure returns false, though *value may have been mutated.
bool ParseKeyValueFlag(const char* str, const char* flag,
std::map<std::string, std::string>* value);
// Returns true if the string matches the flag.
bool IsFlag(const char* str, const char* flag);


@ -15,12 +15,13 @@
// Source project : https://github.com/ismaelJimenez/cpp.leastsq
// Adapted to be used with google benchmark
#include "benchmark/benchmark.h"
#include "complexity.h"
#include <algorithm>
#include <cmath>
#include "benchmark/benchmark.h"
#include "check.h"
#include "complexity.h"
namespace benchmark {
@ -82,7 +83,6 @@ std::string GetBigOString(BigO complexity) {
LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
const std::vector<double>& time,
BigOFunc* fitting_curve) {
double sigma_gn = 0.0;
double sigma_gn_squared = 0.0;
double sigma_time = 0.0;
double sigma_time_gn = 0.0;
@ -90,7 +90,6 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
// Calculate least square fitting parameter
for (size_t i = 0; i < n.size(); ++i) {
double gn_i = fitting_curve(n[i]);
sigma_gn += gn_i;
sigma_gn_squared += gn_i * gn_i;
sigma_time += time[i];
sigma_time_gn += time[i] * gn_i;
@ -125,10 +124,10 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
// fitting curve.
LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
const std::vector<double>& time, const BigO complexity) {
CHECK_EQ(n.size(), time.size());
CHECK_GE(n.size(), 2);  // Do not compute fitting curve if fewer than two
// benchmark runs are given
CHECK_NE(complexity, oNone);
BM_CHECK_EQ(n.size(), time.size());
BM_CHECK_GE(n.size(), 2);  // Do not compute fitting curve if fewer than two
// benchmark runs are given
BM_CHECK_NE(complexity, oNone);
LeastSq best_fit;
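For context on dropping sigma_gn above: the fit is a one-parameter least squares through the origin, so the closed-form coefficient only needs the two accumulators that survive the loop,
    coef = \frac{\sum_i t_i \, g(n_i)}{\sum_i g(n_i)^2}
while sigma_time is still accumulated, presumably to normalize the reported RMS by the mean time later in the function.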
@ -169,7 +168,8 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
// Populate the accumulators.
for (const Run& run : reports) {
CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?";
BM_CHECK_GT(run.complexity_n, 0)
<< "Did you forget to call SetComplexityN?";
n.push_back(run.complexity_n);
real_time.push_back(run.real_accumulated_time / run.iterations);
cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
@ -193,11 +193,14 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
// Get the data from the accumulator to BenchmarkReporter::Run's.
Run big_o;
big_o.run_name = run_name;
big_o.family_index = reports[0].family_index;
big_o.per_family_instance_index = reports[0].per_family_instance_index;
big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
big_o.repetitions = reports[0].repetitions;
big_o.repetition_index = Run::no_repetition_index;
big_o.threads = reports[0].threads;
big_o.aggregate_name = "BigO";
big_o.aggregate_unit = StatisticUnit::kTime;
big_o.report_label = reports[0].report_label;
big_o.iterations = 0;
big_o.real_accumulated_time = result_real.coef;
@ -215,8 +218,11 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
// Only add label to mean/stddev if it is same for all runs
Run rms;
rms.run_name = run_name;
rms.family_index = reports[0].family_index;
rms.per_family_instance_index = reports[0].per_family_instance_index;
rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
rms.aggregate_name = "RMS";
rms.aggregate_unit = StatisticUnit::kPercentage;
rms.report_label = big_o.report_label;
rms.iterations = 0;
rms.repetition_index = Run::no_repetition_index;


@ -45,7 +45,7 @@ bool ConsoleReporter::ReportContext(const Context& context) {
GetErrorStream()
<< "Color printing is only supported for stdout on windows."
" Disabling color printing\n";
output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color);
output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
}
#endif
@ -53,11 +53,12 @@ bool ConsoleReporter::ReportContext(const Context& context) {
}
void ConsoleReporter::PrintHeader(const Run& run) {
std::string str = FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
"Benchmark", "Time", "CPU", "Iterations");
if(!run.counters.empty()) {
if(output_options_ & OO_Tabular) {
for(auto const& c : run.counters) {
std::string str =
FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
"Benchmark", "Time", "CPU", "Iterations");
if (!run.counters.empty()) {
if (output_options_ & OO_Tabular) {
for (auto const& c : run.counters) {
str += FormatString(" %10s", c.first.c_str());
}
} else {
@ -97,7 +98,6 @@ static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
va_end(args);
}
static std::string FormatTime(double time) {
// Align decimal places...
if (time < 1.0) {
@ -115,8 +115,9 @@ static std::string FormatTime(double time) {
void ConsoleReporter::PrintRunData(const Run& result) {
typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
auto& Out = GetOutputStream();
PrinterFn* printer = (output_options_ & OO_Color) ?
(PrinterFn*)ColorPrintf : IgnoreColorPrint;
PrinterFn* printer = (output_options_ & OO_Color)
? static_cast<PrinterFn*>(ColorPrintf)
: IgnoreColorPrint;
auto name_color =
(result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
printer(Out, name_color, "%-*s ", name_field_width_,
@ -134,18 +135,23 @@ void ConsoleReporter::PrintRunData(const Run& result) {
const std::string real_time_str = FormatTime(real_time);
const std::string cpu_time_str = FormatTime(cpu_time);
if (result.report_big_o) {
std::string big_o = GetBigOString(result.complexity);
printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time, big_o.c_str(),
cpu_time, big_o.c_str());
printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time,
big_o.c_str(), cpu_time, big_o.c_str());
} else if (result.report_rms) {
printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%",
cpu_time * 100, "%");
} else {
} else if (result.run_type != Run::RT_Aggregate ||
result.aggregate_unit == StatisticUnit::kTime) {
const char* timeLabel = GetTimeUnitString(result.time_unit);
printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), timeLabel,
cpu_time_str.c_str(), timeLabel);
printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(),
timeLabel, cpu_time_str.c_str(), timeLabel);
} else {
assert(result.aggregate_unit == StatisticUnit::kPercentage);
printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ",
(100. * result.real_accumulated_time), "%",
(100. * result.cpu_accumulated_time), "%");
}
if (!result.report_big_o && !result.report_rms) {
@ -153,12 +159,19 @@ void ConsoleReporter::PrintRunData(const Run& result) {
}
for (auto& c : result.counters) {
const std::size_t cNameLen = std::max(std::string::size_type(10),
c.first.length());
auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
const std::size_t cNameLen =
std::max(std::string::size_type(10), c.first.length());
std::string s;
const char* unit = "";
if (c.second.flags & Counter::kIsRate)
unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
if (result.run_type == Run::RT_Aggregate &&
result.aggregate_unit == StatisticUnit::kPercentage) {
s = StrFormat("%.2f", 100. * c.second.value);
unit = "%";
} else {
s = HumanReadableNumber(c.second.value, c.second.oneK);
if (c.second.flags & Counter::kIsRate)
unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
}
if (output_options_ & OO_Tabular) {
printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
unit);


@ -12,9 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "benchmark/benchmark.h"
#include "complexity.h"
#include <algorithm>
#include <cstdint>
#include <iostream>
@ -22,7 +19,9 @@
#include <tuple>
#include <vector>
#include "benchmark/benchmark.h"
#include "check.h"
#include "complexity.h"
#include "string_util.h"
#include "timers.h"
@ -37,13 +36,17 @@ std::vector<std::string> elements = {
"error_occurred", "error_message"};
} // namespace
std::string CsvEscape(const std::string & s) {
std::string CsvEscape(const std::string& s) {
std::string tmp;
tmp.reserve(s.size() + 2);
for (char c : s) {
switch (c) {
case '"' : tmp += "\"\""; break;
default : tmp += c; break;
case '"':
tmp += "\"\"";
break;
default:
tmp += c;
break;
}
}
return '"' + tmp + '"';
@ -85,7 +88,8 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
for (const auto& cnt : run.counters) {
if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
continue;
CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
BM_CHECK(user_counter_names_.find(cnt.first) !=
user_counter_names_.end())
<< "All counters must be present in each run. "
<< "Counter named \"" << cnt.first
<< "\" was not in a run after being added to the header";


@ -36,7 +36,7 @@
// declarations of some other intrinsics, breaking compilation.
// Therefore, we simply declare __rdtsc ourselves. See also
// http://connect.microsoft.com/VisualStudio/feedback/details/262047
#if defined(COMPILER_MSVC) && !defined(_M_IX86)
#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64)
extern "C" uint64_t __rdtsc();
#pragma intrinsic(__rdtsc)
#endif
@ -92,7 +92,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
uint32_t tbl, tbu0, tbu1;
asm volatile(
"mftbu %0\n"
"mftbl %1\n"
"mftb %1\n"
"mftbu %2"
: "=r"(tbu0), "=r"(tbl), "=r"(tbu1));
tbl &= -static_cast<int32_t>(tbu0 == tbu1);
@ -114,6 +114,12 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
// when I know it will work. Otherwise, I'll use __rdtsc and hope
// the code is being compiled with a non-ancient compiler.
_asm rdtsc
#elif defined(COMPILER_MSVC) && defined(_M_ARM64)
// See https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
// and https://reviews.llvm.org/D53115
int64_t virtual_timer_value;
virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT);
return virtual_timer_value;
#elif defined(COMPILER_MSVC)
return __rdtsc();
#elif defined(BENCHMARK_OS_NACL)
@ -161,18 +167,27 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#elif defined(__mips__)
#elif defined(__mips__) || defined(__m68k__)
// mips apparently only allows rdtsc for superusers, so we fall
// back to gettimeofday. It's possible clock_gettime would be better.
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#elif defined(__loongarch__)
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#elif defined(__s390__) // Covers both s390 and s390x.
// Return the CPU clock.
uint64_t tsc;
#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL)
// z/OS XL compiler HLASM syntax.
asm(" stck %0" : "=m"(tsc) : : "cc");
#else
asm("stck %0" : "=Q"(tsc) : : "cc");
#endif
return tsc;
#elif defined(__riscv) // RISC-V
#elif defined(__riscv) // RISC-V
// Use RDCYCLE (and RDCYCLEH on riscv32)
#if __riscv_xlen == 32
uint32_t cycles_lo, cycles_hi0, cycles_hi1;
@ -193,6 +208,10 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
asm volatile("rdcycle %0" : "=r"(cycles));
return cycles;
#endif
#elif defined(__e2k__) || defined(__elbrus__)
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#else
// The soft failover to a generic implementation is automatic only for ARM.
// For other platforms the developer is expected to make an attempt to create
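A sketch of what such a generic fallback could look like (not part of this patch; assumes <chrono> is available on the target):
#include <chrono>
#include <cstdint>
inline int64_t FallbackNow() {
  // Monotonic tick count in nanoseconds: coarser than a true cycle counter,
  // but it keeps the int64_t contract of the implementations above.
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
      .count();
}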


@ -13,7 +13,11 @@
#endif
#if defined(__clang__)
#if !defined(COMPILER_CLANG)
#if defined(__ibmxl__)
#if !defined(COMPILER_IBMXL)
#define COMPILER_IBMXL
#endif
#elif !defined(COMPILER_CLANG)
#define COMPILER_CLANG
#endif
#elif defined(_MSC_VER)
@ -58,6 +62,8 @@
#define BENCHMARK_OS_NETBSD 1
#elif defined(__OpenBSD__)
#define BENCHMARK_OS_OPENBSD 1
#elif defined(__DragonFly__)
#define BENCHMARK_OS_DRAGONFLY 1
#elif defined(__linux__)
#define BENCHMARK_OS_LINUX 1
#elif defined(__native_client__)
@ -72,6 +78,8 @@
#define BENCHMARK_OS_SOLARIS 1
#elif defined(__QNX__)
#define BENCHMARK_OS_QNX 1
#elif defined(__MVS__)
#define BENCHMARK_OS_ZOS 1
#endif
#if defined(__ANDROID__) && defined(__GLIBCXX__)


@ -12,9 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "benchmark/benchmark.h"
#include "complexity.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
@ -25,41 +22,65 @@
#include <tuple>
#include <vector>
#include "benchmark/benchmark.h"
#include "complexity.h"
#include "string_util.h"
#include "timers.h"
namespace benchmark {
namespace internal {
extern std::map<std::string, std::string>* global_context;
}
namespace {
std::string StrEscape(const std::string & s) {
std::string StrEscape(const std::string& s) {
std::string tmp;
tmp.reserve(s.size());
for (char c : s) {
switch (c) {
case '\b': tmp += "\\b"; break;
case '\f': tmp += "\\f"; break;
case '\n': tmp += "\\n"; break;
case '\r': tmp += "\\r"; break;
case '\t': tmp += "\\t"; break;
case '\\': tmp += "\\\\"; break;
case '"' : tmp += "\\\""; break;
default : tmp += c; break;
case '\b':
tmp += "\\b";
break;
case '\f':
tmp += "\\f";
break;
case '\n':
tmp += "\\n";
break;
case '\r':
tmp += "\\r";
break;
case '\t':
tmp += "\\t";
break;
case '\\':
tmp += "\\\\";
break;
case '"':
tmp += "\\\"";
break;
default:
tmp += c;
break;
}
}
return tmp;
}
std::string FormatKV(std::string const& key, std::string const& value) {
return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
StrEscape(value).c_str());
}
std::string FormatKV(std::string const& key, const char* value) {
return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
StrEscape(value).c_str());
}
std::string FormatKV(std::string const& key, bool value) {
return StrFormat("\"%s\": %s", StrEscape(key).c_str(), value ? "true" : "false");
return StrFormat("\"%s\": %s", StrEscape(key).c_str(),
value ? "true" : "false");
}
std::string FormatKV(std::string const& key, int64_t value) {
@ -123,7 +144,9 @@ bool JSONReporter::ReportContext(const Context& context) {
RoundDouble(info.cycles_per_second / 1000000.0))
<< ",\n";
if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
out << indent << FormatKV("cpu_scaling_enabled", info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
out << indent
<< FormatKV("cpu_scaling_enabled",
info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
<< ",\n";
}
@ -136,8 +159,8 @@ bool JSONReporter::ReportContext(const Context& context) {
out << cache_indent << FormatKV("type", CI.type) << ",\n";
out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
<< ",\n";
out << cache_indent
<< FormatKV("size", static_cast<int64_t>(CI.size)) << ",\n";
out << cache_indent << FormatKV("size", static_cast<int64_t>(CI.size))
<< ",\n";
out << cache_indent
<< FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
<< "\n";
@ -159,7 +182,16 @@ bool JSONReporter::ReportContext(const Context& context) {
#else
const char build_type[] = "debug";
#endif
out << indent << FormatKV("library_build_type", build_type) << "\n";
out << indent << FormatKV("library_build_type", build_type);
if (internal::global_context != nullptr) {
for (const auto& kv : *internal::global_context) {
out << ",\n";
out << indent << FormatKV(kv.first, kv.second);
}
}
out << "\n";
// Close context block and open the list of benchmarks.
out << inner_indent << "},\n";
out << inner_indent << "\"benchmarks\": [\n";
@ -197,6 +229,10 @@ void JSONReporter::PrintRunData(Run const& run) {
std::string indent(6, ' ');
std::ostream& out = GetOutputStream();
out << indent << FormatKV("name", run.benchmark_name()) << ",\n";
out << indent << FormatKV("family_index", run.family_index) << ",\n";
out << indent
<< FormatKV("per_family_instance_index", run.per_family_instance_index)
<< ",\n";
out << indent << FormatKV("run_name", run.run_name.str()) << ",\n";
out << indent << FormatKV("run_type", [&run]() -> const char* {
switch (run.run_type) {
@ -215,6 +251,15 @@ void JSONReporter::PrintRunData(Run const& run) {
out << indent << FormatKV("threads", run.threads) << ",\n";
if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) {
out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n";
out << indent << FormatKV("aggregate_unit", [&run]() -> const char* {
switch (run.aggregate_unit) {
case StatisticUnit::kTime:
return "time";
case StatisticUnit::kPercentage:
return "percentage";
}
BENCHMARK_UNREACHABLE();
}()) << ",\n";
}
if (run.error_occurred) {
out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
@ -222,8 +267,17 @@ void JSONReporter::PrintRunData(Run const& run) {
}
if (!run.report_big_o && !run.report_rms) {
out << indent << FormatKV("iterations", run.iterations) << ",\n";
out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n";
out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
if (run.run_type != Run::RT_Aggregate ||
run.aggregate_unit == StatisticUnit::kTime) {
out << indent << FormatKV("real_time", run.GetAdjustedRealTime())
<< ",\n";
out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
} else {
assert(run.aggregate_unit == StatisticUnit::kPercentage);
out << indent << FormatKV("real_time", run.real_accumulated_time)
<< ",\n";
out << indent << FormatKV("cpu_time", run.cpu_accumulated_time);
}
out << ",\n"
<< indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
} else if (run.report_big_o) {
@ -241,9 +295,20 @@ void JSONReporter::PrintRunData(Run const& run) {
out << ",\n" << indent << FormatKV(c.first, c.second);
}
if (run.has_memory_result) {
if (run.memory_result) {
const MemoryManager::Result memory_result = *run.memory_result;
out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used);
out << ",\n"
<< indent << FormatKV("max_bytes_used", memory_result.max_bytes_used);
auto report_if_present = [&out, &indent](const char* label, int64_t val) {
if (val != MemoryManager::TombstoneValue)
out << ",\n" << indent << FormatKV(label, val);
};
report_if_present("total_allocated_bytes",
memory_result.total_allocated_bytes);
report_if_present("net_heap_growth", memory_result.net_heap_growth);
}
if (!run.report_label.empty()) {
@ -252,4 +317,7 @@ void JSONReporter::PrintRunData(Run const& run) {
out << '\n';
}
const int64_t MemoryManager::TombstoneValue =
std::numeric_limits<int64_t>::max();
} // end namespace benchmark
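A minimal sketch of how the new global_context plumbing above is meant to be fed from user code, assuming the benchmark::AddCustomContext helper added alongside it (the key/value names below are illustrative only): each registered pair is appended to the JSON "context" block right after "library_build_type", and the console reporter prints the same pairs as "key: value" lines (see the reporter.cc hunk further below).
#include <benchmark/benchmark.h>
static void BM_Noop(benchmark::State& state) {
  for (auto _ : state) {
  }
}
BENCHMARK(BM_Noop);
int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);
  // Stored in internal::global_context and emitted by both reporters.
  benchmark::AddCustomContext("compiler", "clang-12");
  benchmark::AddCustomContext("dataset", "synthetic");
  benchmark::RunSpecifiedBenchmarks();
  return 0;
}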

View File

@ -67,7 +67,7 @@ inline LogType& GetLogInstanceForLevel(int level) {
} // end namespace benchmark
// clang-format off
#define VLOG(x) \
#define BM_VLOG(x) \
(::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \
" ")
// clang-format on

View File

@ -9,60 +9,60 @@
// Enable thread safety attributes only with clang.
// The attributes can be safely erased when compiling with other compilers.
#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
#define THREAD_ANNOTATION_ATTRIBUTE_(x) __attribute__((x))
#else
#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op
#define THREAD_ANNOTATION_ATTRIBUTE_(x) // no-op
#endif
#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(capability(x))
#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE_(scoped_lockable)
#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(guarded_by(x))
#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(pt_guarded_by(x))
#define ACQUIRED_BEFORE(...) \
THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(acquired_before(__VA_ARGS__))
#define ACQUIRED_AFTER(...) \
THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(acquired_after(__VA_ARGS__))
#define REQUIRES(...) \
THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(requires_capability(__VA_ARGS__))
#define REQUIRES_SHARED(...) \
THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(requires_shared_capability(__VA_ARGS__))
#define ACQUIRE(...) \
THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(acquire_capability(__VA_ARGS__))
#define ACQUIRE_SHARED(...) \
THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(acquire_shared_capability(__VA_ARGS__))
#define RELEASE(...) \
THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(release_capability(__VA_ARGS__))
#define RELEASE_SHARED(...) \
THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(release_shared_capability(__VA_ARGS__))
#define TRY_ACQUIRE(...) \
THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_capability(__VA_ARGS__))
#define TRY_ACQUIRE_SHARED(...) \
THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_shared_capability(__VA_ARGS__))
#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE_(locks_excluded(__VA_ARGS__))
#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(assert_capability(x))
#define ASSERT_SHARED_CAPABILITY(x) \
THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
THREAD_ANNOTATION_ATTRIBUTE_(assert_shared_capability(x))
#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(lock_returned(x))
#define NO_THREAD_SAFETY_ANALYSIS \
THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
THREAD_ANNOTATION_ATTRIBUTE_(no_thread_safety_analysis)
namespace benchmark {
@ -130,7 +130,7 @@ class Barrier {
// entered the barrier. Returns iff this is the last thread to
// enter the barrier.
bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
CHECK_LT(entered_, running_threads_);
BM_CHECK_LT(entered_, running_threads_);
entered_++;
if (entered_ < running_threads_) {
// Wait for all threads to enter

132
src/perf_counters.cc Normal file
View File

@ -0,0 +1,132 @@
// Copyright 2021 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "perf_counters.h"
#include <cstring>
#include <vector>
#if defined HAVE_LIBPFM
#include "perfmon/pfmlib.h"
#include "perfmon/pfmlib_perf_event.h"
#endif
namespace benchmark {
namespace internal {
constexpr size_t PerfCounterValues::kMaxCounters;
#if defined HAVE_LIBPFM
const bool PerfCounters::kSupported = true;
bool PerfCounters::Initialize() { return pfm_initialize() == PFM_SUCCESS; }
PerfCounters PerfCounters::Create(
const std::vector<std::string>& counter_names) {
if (counter_names.empty()) {
return NoCounters();
}
if (counter_names.size() > PerfCounterValues::kMaxCounters) {
GetErrorLogInstance()
<< counter_names.size()
<< " counters were requested. The minimum is 1, the maximum is "
<< PerfCounterValues::kMaxCounters << "\n";
return NoCounters();
}
std::vector<int> counter_ids(counter_names.size());
const int mode = PFM_PLM3; // user mode only
for (size_t i = 0; i < counter_names.size(); ++i) {
const bool is_first = i == 0;
struct perf_event_attr attr {};
attr.size = sizeof(attr);
const int group_id = !is_first ? counter_ids[0] : -1;
const auto& name = counter_names[i];
if (name.empty()) {
GetErrorLogInstance() << "A counter name was the empty string\n";
return NoCounters();
}
pfm_perf_encode_arg_t arg{};
arg.attr = &attr;
const int pfm_get =
pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg);
if (pfm_get != PFM_SUCCESS) {
GetErrorLogInstance() << "Unknown counter name: " << name << "\n";
return NoCounters();
}
attr.disabled = is_first;
// Note: the man page for perf_event_open suggests inherit = true and
// read_format = PERF_FORMAT_GROUP don't work together, but that's not the
// case.
attr.inherit = true;
attr.pinned = is_first;
attr.exclude_kernel = true;
attr.exclude_user = false;
attr.exclude_hv = true;
// Read all counters in one read.
attr.read_format = PERF_FORMAT_GROUP;
int id = -1;
static constexpr size_t kNrOfSyscallRetries = 5;
// Retry syscall as it was interrupted often (b/64774091).
for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
++num_retries) {
id = perf_event_open(&attr, 0, -1, group_id, 0);
if (id >= 0 || errno != EINTR) {
break;
}
}
if (id < 0) {
GetErrorLogInstance()
<< "Failed to get a file descriptor for " << name << "\n";
return NoCounters();
}
counter_ids[i] = id;
}
if (ioctl(counter_ids[0], PERF_EVENT_IOC_ENABLE) != 0) {
GetErrorLogInstance() << "Failed to start counters\n";
return NoCounters();
}
return PerfCounters(counter_names, std::move(counter_ids));
}
PerfCounters::~PerfCounters() {
if (counter_ids_.empty()) {
return;
}
ioctl(counter_ids_[0], PERF_EVENT_IOC_DISABLE);
for (int fd : counter_ids_) {
close(fd);
}
}
#else // defined HAVE_LIBPFM
const bool PerfCounters::kSupported = false;
bool PerfCounters::Initialize() { return false; }
PerfCounters PerfCounters::Create(
const std::vector<std::string>& counter_names) {
if (!counter_names.empty()) {
GetErrorLogInstance() << "Performance counters not supported.";
}
return NoCounters();
}
PerfCounters::~PerfCounters() = default;
#endif // defined HAVE_LIBPFM
} // namespace internal
} // namespace benchmark

172
src/perf_counters.h Normal file
View File

@ -0,0 +1,172 @@
// Copyright 2021 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BENCHMARK_PERF_COUNTERS_H
#define BENCHMARK_PERF_COUNTERS_H
#include <array>
#include <cstdint>
#include <vector>
#include "benchmark/benchmark.h"
#include "check.h"
#include "log.h"
#ifndef BENCHMARK_OS_WINDOWS
#include <unistd.h>
#endif
namespace benchmark {
namespace internal {
// Typically, we can only read a small number of counters. There is also a
// padding preceding counter values, when reading multiple counters with one
// syscall (which is desirable). PerfCounterValues abstracts these details.
// The implementation ensures the storage is inlined, and allows 0-based
// indexing into the counter values.
// The object is used in conjunction with a PerfCounters object, by passing it
// to Snapshot(). The values are populated such that
// perfCounters->names()[i]'s value is obtained at position i (as given by
// operator[]) of this object.
class PerfCounterValues {
public:
explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
BM_CHECK_LE(nr_counters_, kMaxCounters);
}
uint64_t operator[](size_t pos) const { return values_[kPadding + pos]; }
static constexpr size_t kMaxCounters = 3;
private:
friend class PerfCounters;
// Get the byte buffer in which perf counters can be captured.
// This is used by PerfCounters::Read
std::pair<char*, size_t> get_data_buffer() {
return {reinterpret_cast<char*>(values_.data()),
sizeof(uint64_t) * (kPadding + nr_counters_)};
}
static constexpr size_t kPadding = 1;
std::array<uint64_t, kPadding + kMaxCounters> values_;
const size_t nr_counters_;
};
// Collect PMU counters. The object, once constructed, is ready to be used by
// calling read(). PMU counter collection is enabled from the time create() is
// called, to obtain the object, until the object's destructor is called.
class PerfCounters final {
public:
// True iff this platform supports performance counters.
static const bool kSupported;
bool IsValid() const { return is_valid_; }
static PerfCounters NoCounters() { return PerfCounters(); }
~PerfCounters();
PerfCounters(PerfCounters&&) = default;
PerfCounters(const PerfCounters&) = delete;
// Platform-specific implementations may choose to do some library
// initialization here.
static bool Initialize();
// Return a PerfCounters object ready to read the counters with the names
// specified. The values are user-mode only. The counter name format is
// implementation and OS specific.
// TODO: once we move to C++17, this should be a std::optional, and then the
// IsValid() boolean can be dropped.
static PerfCounters Create(const std::vector<std::string>& counter_names);
// Take a snapshot of the current value of the counters into the provided
// valid PerfCounterValues storage. The values are populated such that:
// names()[i]'s value is (*values)[i]
BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
#ifndef BENCHMARK_OS_WINDOWS
assert(values != nullptr);
assert(IsValid());
auto buffer = values->get_data_buffer();
auto read_bytes = ::read(counter_ids_[0], buffer.first, buffer.second);
return static_cast<size_t>(read_bytes) == buffer.second;
#else
(void)values;
return false;
#endif
}
const std::vector<std::string>& names() const { return counter_names_; }
size_t num_counters() const { return counter_names_.size(); }
private:
PerfCounters(const std::vector<std::string>& counter_names,
std::vector<int>&& counter_ids)
: counter_ids_(std::move(counter_ids)),
counter_names_(counter_names),
is_valid_(true) {}
PerfCounters() : is_valid_(false) {}
std::vector<int> counter_ids_;
const std::vector<std::string> counter_names_;
const bool is_valid_;
};
// Typical usage of the above primitives.
class PerfCountersMeasurement final {
public:
PerfCountersMeasurement(PerfCounters&& c)
: counters_(std::move(c)),
start_values_(counters_.IsValid() ? counters_.names().size() : 0),
end_values_(counters_.IsValid() ? counters_.names().size() : 0) {}
bool IsValid() const { return counters_.IsValid(); }
BENCHMARK_ALWAYS_INLINE void Start() {
assert(IsValid());
// Tell the compiler to not move instructions above/below where we take
// the snapshot.
ClobberMemory();
counters_.Snapshot(&start_values_);
ClobberMemory();
}
BENCHMARK_ALWAYS_INLINE std::vector<std::pair<std::string, double>>
StopAndGetMeasurements() {
assert(IsValid());
// Tell the compiler to not move instructions above/below where we take
// the snapshot.
ClobberMemory();
counters_.Snapshot(&end_values_);
ClobberMemory();
std::vector<std::pair<std::string, double>> ret;
for (size_t i = 0; i < counters_.names().size(); ++i) {
double measurement = static_cast<double>(end_values_[i]) -
static_cast<double>(start_values_[i]);
ret.push_back({counters_.names()[i], measurement});
}
return ret;
}
private:
PerfCounters counters_;
PerfCounterValues start_values_;
PerfCounterValues end_values_;
};
BENCHMARK_UNUSED static bool perf_init_anchor = PerfCounters::Initialize();
} // namespace internal
} // namespace benchmark
#endif // BENCHMARK_PERF_COUNTERS_H
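To make the header's intended flow concrete, a hedged sketch using the internal classes declared above (they live in benchmark::internal and are not public API; end users normally just pass --benchmark_perf_counters=CYCLES,BRANCHES as the new CMake test below does, and those names are only examples of libpfm event names):
#include <iostream>
#include <utility>
#include "perf_counters.h"  // internal header; include path is illustrative
void SketchPerfCounterFlow() {
  using benchmark::internal::PerfCounters;
  using benchmark::internal::PerfCountersMeasurement;
  if (!PerfCounters::kSupported) return;  // built without libpfm
  PerfCounters counters = PerfCounters::Create({"CYCLES", "BRANCHES"});
  if (!counters.IsValid()) return;  // unknown event name, perf_event_open failed, ...
  PerfCountersMeasurement measurement(std::move(counters));
  measurement.Start();
  // ... code under measurement ...
  for (const auto& counter : measurement.StopAndGetMeasurements()) {
    std::cout << counter.first << " = " << counter.second << "\n";
  }
}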

View File

@ -126,7 +126,7 @@ inline bool Regex::Init(const std::string& spec, std::string* error) {
// regerror returns the number of bytes necessary to null terminate
// the string, so we move that when assigning to error.
CHECK_NE(needed, 0);
BM_CHECK_NE(needed, 0);
error->assign(errbuf, needed - 1);
delete[] errbuf;

View File

@ -12,19 +12,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "benchmark/benchmark.h"
#include "timers.h"
#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include <tuple>
#include <vector>
#include "benchmark/benchmark.h"
#include "check.h"
#include "string_util.h"
#include "timers.h"
namespace benchmark {
namespace internal {
extern std::map<std::string, std::string> *global_context;
}
BenchmarkReporter::BenchmarkReporter()
: output_stream_(&std::cout), error_stream_(&std::cerr) {}
@ -33,7 +36,7 @@ BenchmarkReporter::~BenchmarkReporter() {}
void BenchmarkReporter::PrintBasicContext(std::ostream *out,
Context const &context) {
CHECK(out) << "cannot be null";
BM_CHECK(out) << "cannot be null";
auto &Out = *out;
Out << LocalDateTimeString() << "\n";
@ -64,6 +67,12 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
Out << "\n";
}
if (internal::global_context != nullptr) {
for (const auto &kv : *internal::global_context) {
Out << kv.first << ": " << kv.second << "\n";
}
}
if (CPUInfo::Scaling::ENABLED == info.scaling) {
Out << "***WARNING*** CPU scaling is enabled, the benchmark "
"real time measurements may be noisy and will incur extra "

View File

@ -24,6 +24,10 @@
#include <windows.h>
#endif
#ifdef BENCHMARK_OS_ZOS
#include <unistd.h>
#endif
namespace benchmark {
#ifdef BENCHMARK_OS_WINDOWS
// Window's Sleep takes milliseconds argument.
@ -31,13 +35,24 @@ void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); }
void SleepForSeconds(double seconds) {
SleepForMilliseconds(static_cast<int>(kNumMillisPerSecond * seconds));
}
#else // BENCHMARK_OS_WINDOWS
#else // BENCHMARK_OS_WINDOWS
void SleepForMicroseconds(int microseconds) {
#ifdef BENCHMARK_OS_ZOS
// z/OS does not support nanosleep. Instead call sleep() and then usleep() to
// sleep for the remaining microseconds because usleep() will fail if its
// argument is greater than 1000000.
div_t sleepTime = div(microseconds, kNumMicrosPerSecond);
int seconds = sleepTime.quot;
while (seconds != 0) seconds = sleep(seconds);
while (usleep(sleepTime.rem) == -1 && errno == EINTR)
;
#else
struct timespec sleep_time;
sleep_time.tv_sec = microseconds / kNumMicrosPerSecond;
sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
; // Ignore signals and wait for the full interval to elapse.
#endif
}
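A worked illustration of the z/OS split described in the comment above (a sketch only; kNumMicrosPerSecond is assumed to be the 1,000,000 constant from sleep.h):
#include <cstdlib>
#include <iostream>
int main() {
  const int kNumMicrosPerSecond = 1000 * 1000;  // assumed to match sleep.h
  std::div_t split = std::div(2500000, kNumMicrosPerSecond);
  // sleep() consumes the whole seconds; usleep() gets the remainder,
  // which stays below the 1000000 limit that would make usleep() fail.
  std::cout << "sleep(" << split.quot << ")"       // sleep(2)
            << ", usleep(" << split.rem << ")\n";  // usleep(500000)
  return 0;
}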
void SleepForMilliseconds(int milliseconds) {

View File

@ -13,15 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "benchmark/benchmark.h"
#include "statistics.h"
#include <algorithm>
#include <cmath>
#include <numeric>
#include <string>
#include <vector>
#include "benchmark/benchmark.h"
#include "check.h"
#include "statistics.h"
namespace benchmark {
@ -74,6 +75,15 @@ double StatisticsStdDev(const std::vector<double>& v) {
return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
}
double StatisticsCV(const std::vector<double>& v) {
if (v.size() < 2) return 0.0;
const auto stddev = StatisticsStdDev(v);
const auto mean = StatisticsMean(v);
return stddev / mean;
}
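The new StatisticsCV helper backs the "cv" aggregate added in this release: the coefficient of variation, i.e. the sample standard deviation divided by the mean, returned as a ratio (the reporters format it as a percentage). A small worked sketch, assuming it links against statistics.cc:
#include <cassert>
#include <cmath>
#include <vector>
namespace benchmark {
double StatisticsCV(const std::vector<double>& v);  // declared in statistics.h
}
void CvSketch() {
  // Hypothetical per-repetition times: mean = 12, sample stddev = 2,
  // so CV = 2 / 12 ~= 0.1667 (about 16.7% when shown as a percentage).
  std::vector<double> times = {10.0, 12.0, 14.0};
  assert(std::fabs(benchmark::StatisticsCV(times) - 2.0 / 12.0) < 1e-9);
  // Fewer than two samples: defined to be 0.0.
  assert(benchmark::StatisticsCV({42.0}) == 0.0);
}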
std::vector<BenchmarkReporter::Run> ComputeStats(
const std::vector<BenchmarkReporter::Run>& reports) {
typedef BenchmarkReporter::Run Run;
@ -112,22 +122,22 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
it = counter_stats.find(cnt.first);
it->second.s.reserve(reports.size());
} else {
CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
BM_CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
}
}
}
// Populate the accumulators.
for (Run const& run : reports) {
CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
CHECK_EQ(run_iterations, run.iterations);
BM_CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
BM_CHECK_EQ(run_iterations, run.iterations);
if (run.error_occurred) continue;
real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
// user counters
for (auto const& cnt : run.counters) {
auto it = counter_stats.find(cnt.first);
CHECK_NE(it, counter_stats.end());
BM_CHECK_NE(it, counter_stats.end());
it->second.s.emplace_back(cnt.second);
}
}
@ -148,11 +158,14 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
// Get the data from the accumulator to BenchmarkReporter::Run's.
Run data;
data.run_name = reports[0].run_name;
data.family_index = reports[0].family_index;
data.per_family_instance_index = reports[0].per_family_instance_index;
data.run_type = BenchmarkReporter::Run::RT_Aggregate;
data.threads = reports[0].threads;
data.repetitions = reports[0].repetitions;
data.repetition_index = Run::no_repetition_index;
data.aggregate_name = Stat.name_;
data.aggregate_unit = Stat.unit_;
data.report_label = report_label;
// It is incorrect to say that an aggregate is computed over
@ -165,13 +178,15 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
// We will divide these times by data.iterations when reporting, but the
// data.iterations is not nessesairly the scale of these measurements,
// because in each repetition, these timers are sum over all the iterations.
// And if we want to say that the stats are over N repetitions and not
// M iterations, we need to multiply these by (N/M).
data.real_accumulated_time *= iteration_rescale_factor;
data.cpu_accumulated_time *= iteration_rescale_factor;
if (data.aggregate_unit == StatisticUnit::kTime) {
// We will divide these times by data.iterations when reporting, but the
// data.iterations is not necessarily the scale of these measurements,
// because in each repetition, these timers are sum over all the iters.
// And if we want to say that the stats are over N repetitions and not
// M iterations, we need to multiply these by (N/M).
data.real_accumulated_time *= iteration_rescale_factor;
data.cpu_accumulated_time *= iteration_rescale_factor;
}
data.time_unit = reports[0].time_unit;

View File

@ -31,6 +31,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
double StatisticsMean(const std::vector<double>& v);
double StatisticsMedian(const std::vector<double>& v);
double StatisticsStdDev(const std::vector<double>& v);
double StatisticsCV(const std::vector<double>& v);
} // end namespace benchmark

View File

@ -151,7 +151,7 @@ std::string StrFormatImp(const char* msg, va_list args) {
auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
// 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation
// in the android-ndk
ret = vsnprintf(buff_ptr.get(), size, msg, args);
vsnprintf(buff_ptr.get(), size, msg, args);
return std::string(buff_ptr.get());
}
@ -163,6 +163,19 @@ std::string StrFormat(const char* format, ...) {
return tmp;
}
std::vector<std::string> StrSplit(const std::string& str, char delim) {
if (str.empty()) return {};
std::vector<std::string> ret;
size_t first = 0;
size_t next = str.find(delim);
for (; next != std::string::npos;
first = next + 1, next = str.find(delim, first)) {
ret.push_back(str.substr(first, next - first));
}
ret.push_back(str.substr(first));
return ret;
}
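A hedged usage sketch of the new StrSplit helper; the counter names are just an example of the comma-separated flag values it is meant to break apart (the --benchmark_perf_counters flag exercised in test/CMakeLists.txt below):
#include <cassert>
#include <string>
#include <vector>
namespace benchmark {
std::vector<std::string> StrSplit(const std::string& str, char delim);  // string_util.h
}
void StrSplitSketch() {
  std::vector<std::string> parts = benchmark::StrSplit("CYCLES,BRANCHES", ',');
  assert(parts.size() == 2 && parts[0] == "CYCLES" && parts[1] == "BRANCHES");
  // Edge cases, per the implementation above:
  assert(benchmark::StrSplit("", ',').empty());          // empty input -> empty vector
  assert(benchmark::StrSplit("a,,b", ',').size() == 3);  // empty fields are preserved
}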
#ifdef BENCHMARK_STL_ANDROID_GNUSTL
/*
* GNU STL in Android NDK lacks support for some C++11 functions, including
@ -185,11 +198,10 @@ unsigned long stoul(const std::string& str, size_t* pos, int base) {
/* Check for errors and return */
if (strtoulErrno == ERANGE) {
throw std::out_of_range(
"stoul failed: " + str + " is outside of range of unsigned long");
throw std::out_of_range("stoul failed: " + str +
" is outside of range of unsigned long");
} else if (strEnd == strStart || strtoulErrno != 0) {
throw std::invalid_argument(
"stoul failed: " + str + " is not an integer");
throw std::invalid_argument("stoul failed: " + str + " is not an integer");
}
if (pos != nullptr) {
*pos = static_cast<size_t>(strEnd - strStart);
@ -212,11 +224,10 @@ int stoi(const std::string& str, size_t* pos, int base) {
/* Check for errors and return */
if (strtolErrno == ERANGE || long(int(result)) != result) {
throw std::out_of_range(
"stoul failed: " + str + " is outside of range of int");
throw std::out_of_range("stoul failed: " + str +
" is outside of range of int");
} else if (strEnd == strStart || strtolErrno != 0) {
throw std::invalid_argument(
"stoul failed: " + str + " is not an integer");
throw std::invalid_argument("stoul failed: " + str + " is not an integer");
}
if (pos != nullptr) {
*pos = static_cast<size_t>(strEnd - strStart);
@ -239,11 +250,10 @@ double stod(const std::string& str, size_t* pos) {
/* Check for errors and return */
if (strtodErrno == ERANGE) {
throw std::out_of_range(
"stoul failed: " + str + " is outside of range of int");
throw std::out_of_range("stoul failed: " + str +
" is outside of range of int");
} else if (strEnd == strStart || strtodErrno != 0) {
throw std::invalid_argument(
"stoul failed: " + str + " is not an integer");
throw std::invalid_argument("stoul failed: " + str + " is not an integer");
}
if (pos != nullptr) {
*pos = static_cast<size_t>(strEnd - strStart);

View File

@ -4,6 +4,7 @@
#include <sstream>
#include <string>
#include <utility>
#include "internal_macros.h"
namespace benchmark {
@ -37,6 +38,10 @@ inline std::string StrCat(Args&&... args) {
return ss.str();
}
std::vector<std::string> StrSplit(const std::string& str, char delim);
// Disable lint checking for this block since it re-implements C functions.
// NOLINTBEGIN
#ifdef BENCHMARK_STL_ANDROID_GNUSTL
/*
* GNU STL in Android NDK lacks support for some C++11 functions, including
@ -45,14 +50,15 @@ inline std::string StrCat(Args&&... args) {
* namespace, not std:: namespace.
*/
unsigned long stoul(const std::string& str, size_t* pos = nullptr,
int base = 10);
int base = 10);
int stoi(const std::string& str, size_t* pos = nullptr, int base = 10);
double stod(const std::string& str, size_t* pos = nullptr);
#else
using std::stoul;
using std::stoi;
using std::stod;
using std::stod; // NOLINT(misc-unused-using-decls)
using std::stoi; // NOLINT(misc-unused-using-decls)
using std::stoul; // NOLINT(misc-unused-using-decls)
#endif
// NOLINTEND
} // end namespace benchmark

View File

@ -19,6 +19,7 @@
#undef StrCat // Don't let StrCat in string_util.h be renamed to lstrcatA
#include <versionhelpers.h>
#include <windows.h>
#include <codecvt>
#else
#include <fcntl.h>
@ -29,7 +30,8 @@
#include <sys/types.h> // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
#include <unistd.h>
#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || \
defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD
defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD || \
defined BENCHMARK_OS_DRAGONFLY
#define BENCHMARK_HAS_SYSCTL
#include <sys/sysctl.h>
#endif
@ -54,9 +56,9 @@
#include <iostream>
#include <iterator>
#include <limits>
#include <locale>
#include <memory>
#include <sstream>
#include <locale>
#include <utility>
#include "check.h"
@ -134,7 +136,7 @@ struct ValueUnion {
template <class T, int N>
std::array<T, N> GetAsArray() {
const int ArrSize = sizeof(T) * N;
CHECK_LE(ArrSize, Size);
BM_CHECK_LE(ArrSize, Size);
std::array<T, N> Arr;
std::memcpy(Arr.data(), data(), ArrSize);
return Arr;
@ -146,7 +148,7 @@ ValueUnion GetSysctlImp(std::string const& Name) {
int mib[2];
mib[0] = CTL_HW;
if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")){
if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")) {
ValueUnion buff(sizeof(int));
if (Name == "hw.ncpu") {
@ -213,10 +215,9 @@ bool ReadFromFile(std::string const& fname, ArgT* arg) {
CPUInfo::Scaling CpuScaling(int num_cpus) {
// We don't have a valid CPU count, so don't even bother.
if (num_cpus <= 0) return CPUInfo::Scaling::UNKNOWN;
#ifdef BENCHMARK_OS_QNX
#if defined(BENCHMARK_OS_QNX)
return CPUInfo::Scaling::UNKNOWN;
#endif
#ifndef BENCHMARK_OS_WINDOWS
#elif !defined(BENCHMARK_OS_WINDOWS)
// On Linux, the CPUfreq subsystem exposes CPU information as files on the
// local file system. If reading the exported files fails, then we may not be
// running on Linux, so we silently ignore all the read errors.
@ -224,11 +225,13 @@ CPUInfo::Scaling CpuScaling(int num_cpus) {
for (int cpu = 0; cpu < num_cpus; ++cpu) {
std::string governor_file =
StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
if (ReadFromFile(governor_file, &res) && res != "performance") return CPUInfo::Scaling::ENABLED;
if (ReadFromFile(governor_file, &res) && res != "performance")
return CPUInfo::Scaling::ENABLED;
}
return CPUInfo::Scaling::DISABLED;
#endif
#else
return CPUInfo::Scaling::UNKNOWN;
#endif
}
int CountSetBitsInCPUMap(std::string Val) {
@ -343,6 +346,7 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
C.num_sharing = static_cast<int>(B.count());
C.level = Cache->Level;
C.size = Cache->Size;
C.type = "Unknown";
switch (Cache->Type) {
case CacheUnified:
C.type = "Unified";
@ -356,9 +360,6 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
case CacheTrace:
C.type = "Trace";
break;
default:
C.type = "Unknown";
break;
}
res.push_back(C);
}
@ -367,29 +368,29 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
#elif BENCHMARK_OS_QNX
std::vector<CPUInfo::CacheInfo> GetCacheSizesQNX() {
std::vector<CPUInfo::CacheInfo> res;
struct cacheattr_entry *cache = SYSPAGE_ENTRY(cacheattr);
struct cacheattr_entry* cache = SYSPAGE_ENTRY(cacheattr);
uint32_t const elsize = SYSPAGE_ELEMENT_SIZE(cacheattr);
int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize ;
for(int i = 0; i < num; ++i ) {
int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize;
for (int i = 0; i < num; ++i) {
CPUInfo::CacheInfo info;
switch (cache->flags){
case CACHE_FLAG_INSTR :
switch (cache->flags) {
case CACHE_FLAG_INSTR:
info.type = "Instruction";
info.level = 1;
break;
case CACHE_FLAG_DATA :
case CACHE_FLAG_DATA:
info.type = "Data";
info.level = 1;
break;
case CACHE_FLAG_UNIFIED :
case CACHE_FLAG_UNIFIED:
info.type = "Unified";
info.level = 2;
break;
case CACHE_FLAG_SHARED :
case CACHE_FLAG_SHARED:
info.type = "Shared";
info.level = 3;
break;
default :
default:
continue;
break;
}
@ -417,24 +418,23 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
std::string GetSystemName() {
#if defined(BENCHMARK_OS_WINDOWS)
std::string str;
const unsigned COUNT = MAX_COMPUTERNAME_LENGTH+1;
TCHAR hostname[COUNT] = {'\0'};
const unsigned COUNT = MAX_COMPUTERNAME_LENGTH + 1;
TCHAR hostname[COUNT] = {'\0'};
DWORD DWCOUNT = COUNT;
if (!GetComputerName(hostname, &DWCOUNT))
return std::string("");
if (!GetComputerName(hostname, &DWCOUNT)) return std::string("");
#ifndef UNICODE
str = std::string(hostname, DWCOUNT);
#else
//Using wstring_convert, Is deprecated in C++17
// Using wstring_convert, Is deprecated in C++17
using convert_type = std::codecvt_utf8<wchar_t>;
std::wstring_convert<convert_type, wchar_t> converter;
std::wstring wStr(hostname, DWCOUNT);
str = converter.to_bytes(wStr);
#endif
return str;
#else // defined(BENCHMARK_OS_WINDOWS)
#else // defined(BENCHMARK_OS_WINDOWS)
#ifndef HOST_NAME_MAX
#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined
#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined
#define HOST_NAME_MAX 64
#elif defined(BENCHMARK_OS_NACL)
#define HOST_NAME_MAX 64
@ -443,15 +443,15 @@ std::string GetSystemName() {
#elif defined(BENCHMARK_OS_RTEMS)
#define HOST_NAME_MAX 256
#else
#warning "HOST_NAME_MAX not defined. using 64"
#pragma message("HOST_NAME_MAX not defined. using 64")
#define HOST_NAME_MAX 64
#endif
#endif // def HOST_NAME_MAX
#endif // def HOST_NAME_MAX
char hostname[HOST_NAME_MAX];
int retVal = gethostname(hostname, HOST_NAME_MAX);
if (retVal != 0) return std::string("");
return std::string(hostname);
#endif // Catch-all POSIX block.
#endif // Catch-all POSIX block.
}
int GetNumCPUs() {
@ -473,8 +473,7 @@ int GetNumCPUs() {
// Returns -1 in case of a failure.
int NumCPU = sysconf(_SC_NPROCESSORS_ONLN);
if (NumCPU < 0) {
fprintf(stderr,
"sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n",
fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n",
strerror(errno));
}
return NumCPU;
@ -497,7 +496,8 @@ int GetNumCPUs() {
#if defined(__s390__)
// s390 has another format in /proc/cpuinfo
// it needs to be parsed differently
if (SplitIdx != std::string::npos) value = ln.substr(Key.size()+1,SplitIdx-Key.size()-1);
if (SplitIdx != std::string::npos)
value = ln.substr(Key.size() + 1, SplitIdx - Key.size() - 1);
#else
if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
#endif
@ -529,7 +529,11 @@ int GetNumCPUs() {
BENCHMARK_UNREACHABLE();
}
double GetCPUCyclesPerSecond() {
double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
// Currently, scaling is only used on the Linux path here;
// suppress diagnostics about it being unused on other paths.
(void)scaling;
#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
long freq;
@ -540,8 +544,15 @@ double GetCPUCyclesPerSecond() {
// cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
// well.
if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)
// If CPU scaling is in effect, we want to use the *maximum* frequency,
// not whatever CPU speed some random processor happens to be using now.
// If CPU scaling is disabled, use the *current* frequency.
// Note that we specifically don't want to read cpuinfo_cur_freq,
// because it is only readable by root.
|| (scaling == CPUInfo::Scaling::DISABLED &&
ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq",
&freq))
// Otherwise, if CPU scaling may be in effect, we want to use
// the *maximum* frequency, not whatever CPU speed some random processor
// happens to be using now.
|| ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
&freq)) {
// The value is in kHz (as the file name suggests). For example, on a
@ -607,6 +618,8 @@ double GetCPUCyclesPerSecond() {
"machdep.tsc_freq";
#elif defined BENCHMARK_OS_OPENBSD
"hw.cpuspeed";
#elif defined BENCHMARK_OS_DRAGONFLY
"hw.tsc_frequency";
#else
"hw.cpufrequency";
#endif
@ -630,13 +643,13 @@ double GetCPUCyclesPerSecond() {
"~MHz", nullptr, &data, &data_size)))
return static_cast<double>((int64_t)data *
(int64_t)(1000 * 1000)); // was mhz
#elif defined (BENCHMARK_OS_SOLARIS)
kstat_ctl_t *kc = kstat_open();
#elif defined(BENCHMARK_OS_SOLARIS)
kstat_ctl_t* kc = kstat_open();
if (!kc) {
std::cerr << "failed to open /dev/kstat\n";
return -1;
}
kstat_t *ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0");
kstat_t* ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0");
if (!ksp) {
std::cerr << "failed to lookup in /dev/kstat\n";
return -1;
@ -645,7 +658,7 @@ double GetCPUCyclesPerSecond() {
std::cerr << "failed to read from /dev/kstat\n";
return -1;
}
kstat_named_t *knp =
kstat_named_t* knp =
(kstat_named_t*)kstat_data_lookup(ksp, (char*)"current_clock_Hz");
if (!knp) {
std::cerr << "failed to lookup data in /dev/kstat\n";
@ -659,7 +672,7 @@ double GetCPUCyclesPerSecond() {
double clock_hz = knp->value.ui64;
kstat_close(kc);
return clock_hz;
#elif defined (BENCHMARK_OS_QNX)
#elif defined(BENCHMARK_OS_QNX)
return static_cast<double>((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) *
(int64_t)(1000 * 1000));
#endif
@ -671,9 +684,10 @@ double GetCPUCyclesPerSecond() {
}
std::vector<double> GetLoadAvg() {
#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || \
defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD || \
defined BENCHMARK_OS_OPENBSD) && !defined(__ANDROID__)
#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || \
defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD || \
defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY) && \
!defined(__ANDROID__)
constexpr int kMaxSamples = 3;
std::vector<double> res(kMaxSamples, 0.0);
const int nelem = getloadavg(res.data(), kMaxSamples);
@ -697,12 +711,11 @@ const CPUInfo& CPUInfo::Get() {
CPUInfo::CPUInfo()
: num_cpus(GetNumCPUs()),
cycles_per_second(GetCPUCyclesPerSecond()),
caches(GetCacheSizes()),
scaling(CpuScaling(num_cpus)),
cycles_per_second(GetCPUCyclesPerSecond(scaling)),
caches(GetCacheSizes()),
load_avg(GetLoadAvg()) {}
const SystemInfo& SystemInfo::Get() {
static const SystemInfo* info = new SystemInfo();
return *info;

View File

@ -36,7 +36,6 @@ class ThreadManager {
[this]() { return alive_threads_ == 0; });
}
public:
struct Result {
IterationCount iterations = 0;
double real_time_used = 0;

View File

@ -28,7 +28,7 @@ class ThreadTimer {
// Called by each thread
void StopTimer() {
CHECK(running_);
BM_CHECK(running_);
running_ = false;
real_time_used_ += ChronoClockNow() - start_real_time_;
// Floating point error can result in the subtraction producing a negative
@ -44,19 +44,19 @@ class ThreadTimer {
// REQUIRES: timer is not running
double real_time_used() const {
CHECK(!running_);
BM_CHECK(!running_);
return real_time_used_;
}
// REQUIRES: timer is not running
double cpu_time_used() const {
CHECK(!running_);
BM_CHECK(!running_);
return cpu_time_used_;
}
// REQUIRES: timer is not running
double manual_time_used() const {
CHECK(!running_);
BM_CHECK(!running_);
return manual_time_used_;
}

View File

@ -13,6 +13,7 @@
// limitations under the License.
#include "timers.h"
#include "internal_macros.h"
#ifdef BENCHMARK_OS_WINDOWS
@ -28,7 +29,8 @@
#include <sys/time.h>
#include <sys/types.h> // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
#include <unistd.h>
#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_DRAGONFLY || \
defined BENCHMARK_OS_MACOSX
#include <sys/sysctl.h>
#endif
#if defined(BENCHMARK_OS_MACOSX)
@ -124,8 +126,8 @@ double ProcessCPUUsage() {
// synchronous system calls in Emscripten.
return emscripten_get_now() * 1e-3;
#elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
// FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
// https://github.com/google/benchmark/pull/292
// FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
// See https://github.com/google/benchmark/pull/292
struct timespec spec;
if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
return MakeTime(spec);
@ -148,13 +150,14 @@ double ThreadCPUUsage() {
&user_time);
return MakeTime(kernel_time, user_time);
#elif defined(BENCHMARK_OS_MACOSX)
// FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
// https://github.com/google/benchmark/pull/292
// FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
// See https://github.com/google/benchmark/pull/292
mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
thread_basic_info_data_t info;
mach_port_t thread = pthread_mach_thread_np(pthread_self());
if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)&info, &count) ==
KERN_SUCCESS) {
if (thread_info(thread, THREAD_BASIC_INFO,
reinterpret_cast<thread_info_t>(&info),
&count) == KERN_SUCCESS) {
return MakeTime(info);
}
DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
@ -189,15 +192,26 @@ std::string LocalDateTimeString() {
std::size_t timestamp_len;
long int offset_minutes;
char tz_offset_sign = '+';
// Long enough buffers to avoid format-overflow warnings
char tz_offset[128];
// tz_offset is set in one of three ways:
// * strftime with %z - This either returns empty or the ISO 8601 time. The
//   maximum length an ISO 8601 string can be is 7 (e.g. -03:30, plus
//   trailing zero).
// * snprintf with %c%02li:%02li - The maximum length is 41 (one for %c, up
//   to 19 for %02li, one for :, up to 19 %02li, plus trailing zero).
// * A fixed string of "-00:00". The maximum length is 7 (-00:00, plus
//   trailing zero).
//
// Thus, the maximum size this needs to be is 41.
char tz_offset[41];
// Long enough buffer to avoid format-overflow warnings
char storage[128];
#if defined(BENCHMARK_OS_WINDOWS)
std::tm *timeinfo_p = ::localtime(&now);
std::tm* timeinfo_p = ::localtime(&now);
#else
std::tm timeinfo;
std::tm *timeinfo_p = &timeinfo;
std::tm* timeinfo_p = &timeinfo;
::localtime_r(&now, &timeinfo);
#endif
@ -214,10 +228,11 @@ std::string LocalDateTimeString() {
tz_offset_sign = '-';
}
tz_len = ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
CHECK(tz_len == kTzOffsetLen);
((void)tz_len); // Prevent unused variable warning in optimized build.
tz_len =
::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
BM_CHECK(tz_len == kTzOffsetLen);
((void)tz_len); // Prevent unused variable warning in optimized build.
} else {
// Unknown offset. RFC3339 specifies that unknown local offsets should be
// written as UTC time with -00:00 timezone.
@ -231,9 +246,9 @@ std::string LocalDateTimeString() {
strncpy(tz_offset, "-00:00", kTzOffsetLen + 1);
}
timestamp_len = std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S",
timeinfo_p);
CHECK(timestamp_len == kTimestampLen);
timestamp_len =
std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S", timeinfo_p);
BM_CHECK(timestamp_len == kTimestampLen);
// Prevent unused variable warning in optimized build.
((void)kTimestampLen);

View File

@ -20,6 +20,8 @@ TEST_ARGS = ["--benchmark_min_time=0.01"]
PER_SRC_TEST_ARGS = ({
"user_counters_tabular_test.cc": ["--benchmark_counters_tabular=true"],
"repetitions_test.cc": [" --benchmark_repetitions=3"],
"spec_arg_test.cc" : ["--benchmark_filter=BM_NotChosen"],
})
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")

View File

@ -56,6 +56,12 @@ endmacro(compile_output_test)
compile_benchmark_test(benchmark_test)
add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01)
compile_benchmark_test(spec_arg_test)
add_test(NAME spec_arg COMMAND spec_arg_test --benchmark_filter=BM_NotChosen)
compile_benchmark_test(benchmark_setup_teardown_test)
add_test(NAME benchmark_setup_teardown COMMAND benchmark_setup_teardown_test)
compile_benchmark_test(filter_test)
macro(add_filter_test name filter expect)
add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
@ -87,6 +93,9 @@ add_test(NAME options_benchmarks COMMAND options_test --benchmark_min_time=0.01)
compile_benchmark_test(basic_test)
add_test(NAME basic_benchmark COMMAND basic_test --benchmark_min_time=0.01)
compile_output_test(repetitions_test)
add_test(NAME repetitions_benchmark COMMAND repetitions_test --benchmark_min_time=0.01 --benchmark_repetitions=3)
compile_benchmark_test(diagnostics_test)
add_test(NAME diagnostics_test COMMAND diagnostics_test --benchmark_min_time=0.01)
@ -128,6 +137,9 @@ add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_
compile_output_test(user_counters_test)
add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01)
compile_output_test(perf_counters_test)
add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time=0.01 --benchmark_perf_counters=CYCLES,BRANCHES)
compile_output_test(internal_threading_test)
add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01)
@ -193,9 +205,11 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)
add_gtest(benchmark_gtest)
add_gtest(benchmark_name_gtest)
add_gtest(benchmark_random_interleaving_gtest)
add_gtest(commandlineflags_gtest)
add_gtest(statistics_gtest)
add_gtest(string_util_gtest)
add_gtest(perf_counters_gtest)
endif(BENCHMARK_ENABLE_GTEST_TESTS)
###############################################################################

View File

@ -1,10 +1,10 @@
#include "benchmark/benchmark.h"
#include <cassert>
#include <iostream>
#include <set>
#include <vector>
#include "benchmark/benchmark.h"
class ArgsProductFixture : public ::benchmark::Fixture {
public:
ArgsProductFixture()
@ -23,7 +23,7 @@ class ArgsProductFixture : public ::benchmark::Fixture {
{2, 15, 10, 9},
{4, 5, 6, 11}}) {}
void SetUp(const ::benchmark::State& state) {
void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
std::vector<int64_t> ranges = {state.range(0), state.range(1),
state.range(2), state.range(3)};
@ -37,7 +37,7 @@ class ArgsProductFixture : public ::benchmark::Fixture {
virtual ~ArgsProductFixture() {
if (actualValues != expectedValues) {
std::cout << "EXPECTED\n";
for (auto v : expectedValues) {
for (const auto& v : expectedValues) {
std::cout << "{";
for (int64_t iv : v) {
std::cout << iv << ", ";
@ -45,7 +45,7 @@ class ArgsProductFixture : public ::benchmark::Fixture {
std::cout << "}\n";
}
std::cout << "ACTUAL\n";
for (auto v : actualValues) {
for (const auto& v : actualValues) {
std::cout << "{";
for (int64_t iv : v) {
std::cout << iv << ", ";

View File

@ -13,7 +13,7 @@ BENCHMARK(BM_empty)->ThreadPerCpu();
void BM_spin_empty(benchmark::State& state) {
for (auto _ : state) {
for (int x = 0; x < state.range(0); ++x) {
for (auto x = 0; x < state.range(0); ++x) {
benchmark::DoNotOptimize(x);
}
}
@ -22,11 +22,11 @@ BASIC_BENCHMARK_TEST(BM_spin_empty);
BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
void BM_spin_pause_before(benchmark::State& state) {
for (int i = 0; i < state.range(0); ++i) {
for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
for (auto _ : state) {
for (int i = 0; i < state.range(0); ++i) {
for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
@ -37,11 +37,11 @@ BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
void BM_spin_pause_during(benchmark::State& state) {
for (auto _ : state) {
state.PauseTiming();
for (int i = 0; i < state.range(0); ++i) {
for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
state.ResumeTiming();
for (int i = 0; i < state.range(0); ++i) {
for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
@ -62,11 +62,11 @@ BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu();
void BM_spin_pause_after(benchmark::State& state) {
for (auto _ : state) {
for (int i = 0; i < state.range(0); ++i) {
for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
for (int i = 0; i < state.range(0); ++i) {
for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
@ -74,15 +74,15 @@ BASIC_BENCHMARK_TEST(BM_spin_pause_after);
BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
void BM_spin_pause_before_and_after(benchmark::State& state) {
for (int i = 0; i < state.range(0); ++i) {
for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
for (auto _ : state) {
for (int i = 0; i < state.range(0); ++i) {
for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
for (int i = 0; i < state.range(0); ++i) {
for (auto i = 0; i < state.range(0); ++i) {
benchmark::DoNotOptimize(i);
}
}
@ -96,7 +96,6 @@ void BM_empty_stop_start(benchmark::State& state) {
BENCHMARK(BM_empty_stop_start);
BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
void BM_KeepRunning(benchmark::State& state) {
benchmark::IterationCount iter_count = 0;
assert(iter_count == state.iterations());
@ -108,15 +107,30 @@ void BM_KeepRunning(benchmark::State& state) {
BENCHMARK(BM_KeepRunning);
void BM_KeepRunningBatch(benchmark::State& state) {
// Choose a prime batch size to avoid evenly dividing max_iterations.
const benchmark::IterationCount batch_size = 101;
// Choose a batch size >1000 to skip the typical runs with iteration
// targets of 10, 100 and 1000. If these are not actually skipped the
// bug would be detectable as consecutive runs with the same iteration
// count. Below we assert that this does not happen.
const benchmark::IterationCount batch_size = 1009;
static benchmark::IterationCount prior_iter_count = 0;
benchmark::IterationCount iter_count = 0;
while (state.KeepRunningBatch(batch_size)) {
iter_count += batch_size;
}
assert(state.iterations() == iter_count);
// Verify that the iteration count always increases across runs (see
// comment above).
assert(iter_count == batch_size // max_iterations == 1
|| iter_count > prior_iter_count); // max_iterations > batch_size
prior_iter_count = iter_count;
}
BENCHMARK(BM_KeepRunningBatch);
// Register with a fixed repetition count to establish the invariant that
// the iteration count should always change across runs. This overrides
// the --benchmark_repetitions command line flag, which would otherwise
// cause this test to fail if set > 1.
BENCHMARK(BM_KeepRunningBatch)->Repetitions(1);
void BM_RangedFor(benchmark::State& state) {
benchmark::IterationCount iter_count = 0;
@ -127,10 +141,39 @@ void BM_RangedFor(benchmark::State& state) {
}
BENCHMARK(BM_RangedFor);
#ifdef BENCHMARK_HAS_CXX11
template <typename T>
void BM_OneTemplateFunc(benchmark::State& state) {
auto arg = state.range(0);
T sum = 0;
for (auto _ : state) {
sum += arg;
}
}
BENCHMARK(BM_OneTemplateFunc<int>)->Arg(1);
BENCHMARK(BM_OneTemplateFunc<double>)->Arg(1);
template <typename A, typename B>
void BM_TwoTemplateFunc(benchmark::State& state) {
auto arg = state.range(0);
A sum = 0;
B prod = 1;
for (auto _ : state) {
sum += arg;
prod *= arg;
}
}
BENCHMARK(BM_TwoTemplateFunc<int, double>)->Arg(1);
BENCHMARK(BM_TwoTemplateFunc<double, int>)->Arg(1);
#endif // BENCHMARK_HAS_CXX11
// Ensure that StateIterator provides all the necessary typedefs required to
// instantiate std::iterator_traits.
static_assert(std::is_same<
typename std::iterator_traits<benchmark::State::StateIterator>::value_type,
typename benchmark::State::StateIterator::value_type>::value, "");
static_assert(
std::is_same<typename std::iterator_traits<
benchmark::State::StateIterator>::value_type,
typename benchmark::State::StateIterator::value_type>::value,
"");
BENCHMARK_MAIN();

View File

@ -1,3 +1,5 @@
#include <map>
#include <string>
#include <vector>
#include "../src/benchmark_register.h"
@ -6,6 +8,8 @@
namespace benchmark {
namespace internal {
extern std::map<std::string, std::string>* global_context;
namespace {
TEST(AddRangeTest, Simple) {
@ -90,6 +94,12 @@ TEST(AddRangeTest, ZeroOnlyRange) {
EXPECT_THAT(dst, testing::ElementsAre(0));
}
TEST(AddRangeTest, ZeroStartingRange) {
std::vector<int> dst;
AddRange(&dst, 0, 2, 2);
EXPECT_THAT(dst, testing::ElementsAre(0, 1, 2));
}
TEST(AddRangeTest, NegativeRange64) {
std::vector<int64_t> dst;
AddRange<int64_t>(&dst, -4, 4, 2);
@ -123,6 +133,33 @@ TEST(AddRangeTest, Simple8) {
EXPECT_THAT(dst, testing::ElementsAre(1, 2, 4, 8));
}
TEST(AddCustomContext, Simple) {
EXPECT_THAT(global_context, nullptr);
AddCustomContext("foo", "bar");
AddCustomContext("baz", "qux");
EXPECT_THAT(*global_context,
testing::UnorderedElementsAre(testing::Pair("foo", "bar"),
testing::Pair("baz", "qux")));
delete global_context;
global_context = nullptr;
}
TEST(AddCustomContext, DuplicateKey) {
EXPECT_THAT(global_context, nullptr);
AddCustomContext("foo", "bar");
AddCustomContext("foo", "qux");
EXPECT_THAT(*global_context,
testing::UnorderedElementsAre(testing::Pair("foo", "bar")));
delete global_context;
global_context = nullptr;
}
} // namespace
} // namespace internal
} // namespace benchmark

View File

@ -0,0 +1,127 @@
#include <queue>
#include <string>
#include <vector>
#include "../src/commandlineflags.h"
#include "../src/string_util.h"
#include "benchmark/benchmark.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace benchmark {
BM_DECLARE_bool(benchmark_enable_random_interleaving);
BM_DECLARE_string(benchmark_filter);
BM_DECLARE_int32(benchmark_repetitions);
namespace internal {
namespace {
class EventQueue : public std::queue<std::string> {
public:
void Put(const std::string& event) { push(event); }
void Clear() {
while (!empty()) {
pop();
}
}
std::string Get() {
std::string event = front();
pop();
return event;
}
};
EventQueue* queue = new EventQueue();
class NullReporter : public BenchmarkReporter {
public:
bool ReportContext(const Context& /*context*/) override { return true; }
void ReportRuns(const std::vector<Run>& /* report */) override {}
};
class BenchmarkTest : public testing::Test {
public:
static void SetupHook(int /* num_threads */) { queue->push("Setup"); }
static void TeardownHook(int /* num_threads */) { queue->push("Teardown"); }
void Execute(const std::string& pattern) {
queue->Clear();
BenchmarkReporter* reporter = new NullReporter;
FLAGS_benchmark_filter = pattern;
RunSpecifiedBenchmarks(reporter);
delete reporter;
queue->Put("DONE"); // End marker
}
};
void BM_Match1(benchmark::State& state) {
const int64_t arg = state.range(0);
for (auto _ : state) {
}
queue->Put(StrFormat("BM_Match1/%d", static_cast<int>(arg)));
}
BENCHMARK(BM_Match1)
->Iterations(100)
->Arg(1)
->Arg(2)
->Arg(3)
->Range(10, 80)
->Args({90})
->Args({100});
TEST_F(BenchmarkTest, Match1) {
Execute("BM_Match1");
ASSERT_EQ("BM_Match1/1", queue->Get());
ASSERT_EQ("BM_Match1/2", queue->Get());
ASSERT_EQ("BM_Match1/3", queue->Get());
ASSERT_EQ("BM_Match1/10", queue->Get());
ASSERT_EQ("BM_Match1/64", queue->Get());
ASSERT_EQ("BM_Match1/80", queue->Get());
ASSERT_EQ("BM_Match1/90", queue->Get());
ASSERT_EQ("BM_Match1/100", queue->Get());
ASSERT_EQ("DONE", queue->Get());
}
TEST_F(BenchmarkTest, Match1WithRepetition) {
FLAGS_benchmark_repetitions = 2;
Execute("BM_Match1/(64|80)");
ASSERT_EQ("BM_Match1/64", queue->Get());
ASSERT_EQ("BM_Match1/64", queue->Get());
ASSERT_EQ("BM_Match1/80", queue->Get());
ASSERT_EQ("BM_Match1/80", queue->Get());
ASSERT_EQ("DONE", queue->Get());
}
TEST_F(BenchmarkTest, Match1WithRandomInterleaving) {
FLAGS_benchmark_enable_random_interleaving = true;
FLAGS_benchmark_repetitions = 100;
std::map<std::string, int> element_count;
std::map<std::string, int> interleaving_count;
Execute("BM_Match1/(64|80)");
for (int i = 0; i < 100; ++i) {
std::vector<std::string> interleaving;
interleaving.push_back(queue->Get());
interleaving.push_back(queue->Get());
element_count[interleaving[0]]++;
element_count[interleaving[1]]++;
interleaving_count[StrFormat("%s,%s", interleaving[0].c_str(),
interleaving[1].c_str())]++;
}
EXPECT_EQ(element_count["BM_Match1/64"], 100) << "Unexpected repetitions.";
EXPECT_EQ(element_count["BM_Match1/80"], 100) << "Unexpected repetitions.";
EXPECT_GE(interleaving_count.size(), 2) << "Interleaving was not randomized.";
ASSERT_EQ("DONE", queue->Get());
}
} // namespace
} // namespace internal
} // namespace benchmark

View File

@ -0,0 +1,157 @@
#include <atomic>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <string>
#include "benchmark/benchmark.h"
// Test that Setup() and Teardown() are called exactly once
// for each benchmark run (single-threaded).
namespace single {
static int setup_call = 0;
static int teardown_call = 0;
} // namespace single
static void DoSetup1(const benchmark::State& state) {
++single::setup_call;
// Setup/Teardown should never be called with any thread_idx != 0.
assert(state.thread_index() == 0);
}
static void DoTeardown1(const benchmark::State& state) {
++single::teardown_call;
assert(state.thread_index() == 0);
}
static void BM_with_setup(benchmark::State& state) {
for (auto s : state) {
}
}
BENCHMARK(BM_with_setup)
->Arg(1)
->Arg(3)
->Arg(5)
->Arg(7)
->Iterations(100)
->Setup(DoSetup1)
->Teardown(DoTeardown1);
// Test that Setup() and Teardown() are called once for each group of threads.
namespace concurrent {
static std::atomic<int> setup_call(0);
static std::atomic<int> teardown_call(0);
static std::atomic<int> func_call(0);
} // namespace concurrent
static void DoSetup2(const benchmark::State& state) {
concurrent::setup_call.fetch_add(1, std::memory_order_acquire);
assert(state.thread_index() == 0);
}
static void DoTeardown2(const benchmark::State& state) {
concurrent::teardown_call.fetch_add(1, std::memory_order_acquire);
assert(state.thread_index() == 0);
}
static void BM_concurrent(benchmark::State& state) {
for (auto s : state) {
}
concurrent::func_call.fetch_add(1, std::memory_order_acquire);
}
BENCHMARK(BM_concurrent)
->Setup(DoSetup2)
->Teardown(DoTeardown2)
->Iterations(100)
->Threads(5)
->Threads(10)
->Threads(15);
// Testing interaction with Fixture::Setup/Teardown
namespace fixture_interaction {
int setup = 0;
int fixture_setup = 0;
} // namespace fixture_interaction
#define FIXTURE_BECHMARK_NAME MyFixture
class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
public:
void SetUp(const ::benchmark::State&) BENCHMARK_OVERRIDE {
fixture_interaction::fixture_setup++;
}
~FIXTURE_BECHMARK_NAME() {}
};
BENCHMARK_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)(benchmark::State& st) {
for (auto _ : st) {
}
}
static void DoSetupWithFixture(const benchmark::State&) {
fixture_interaction::setup++;
}
BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)
->Arg(1)
->Arg(3)
->Arg(5)
->Arg(7)
->Setup(DoSetupWithFixture)
->Repetitions(1)
->Iterations(100);
// Testing repetitions.
namespace repetitions {
int setup = 0;
}
static void DoSetupWithRepetitions(const benchmark::State&) {
repetitions::setup++;
}
static void BM_WithRep(benchmark::State& state) {
for (auto _ : state) {
}
}
BENCHMARK(BM_WithRep)
->Arg(1)
->Arg(3)
->Arg(5)
->Arg(7)
->Setup(DoSetupWithRepetitions)
->Iterations(100)
->Repetitions(4);
int main(int argc, char** argv) {
benchmark::Initialize(&argc, argv);
size_t ret = benchmark::RunSpecifiedBenchmarks(".");
assert(ret > 0);
// Setup/Teardown are called once for each arg (1, 3, 5, 7).
assert(single::setup_call == 4);
assert(single::teardown_call == 4);
// Three groups of threads (5, 10, 15) call this function.
assert(concurrent::setup_call.load(std::memory_order_relaxed) == 3);
assert(concurrent::teardown_call.load(std::memory_order_relaxed) == 3);
assert((5 + 10 + 15) ==
concurrent::func_call.load(std::memory_order_relaxed));
// Setup is called 4 times, once for each arg group (1,3,5,7)
assert(fixture_interaction::setup == 4);
// Fixture::SetUp is called every time the benchmark routine is run.
// The exact number is nondeterministic, so we just assert that it is
// greater than the plain Setup count.
assert(fixture_interaction::fixture_setup > fixture_interaction::setup);
// Setup is called once per repetition for each arg: 4 * 4 = 16.
assert(repetitions::setup == 16);
return 0;
}
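As a usage note, a minimal sketch (not part of this commit) of the per-run Setup/Teardown hooks this test exercises; the helper names are illustrative only.

#include "benchmark/benchmark.h"

static void GlobalSetup(const benchmark::State&) { /* acquire resources */ }
static void GlobalTeardown(const benchmark::State&) { /* release resources */ }

static void BM_WithHooks(benchmark::State& state) {
  for (auto _ : state) {
  }
}
// The hooks run once per benchmark run (per arg, per repetition, per group
// of threads), on thread 0 only, not once per iteration.
BENCHMARK(BM_WithHooks)->Arg(8)->Setup(GlobalSetup)->Teardown(GlobalTeardown);
BENCHMARK_MAIN();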

View File

@ -93,8 +93,9 @@ static void BM_SetInsert(benchmark::State& state) {
state.SetBytesProcessed(state.iterations() * state.range(1) * sizeof(int));
}
// Test many inserts at once to reduce the total iterations needed. Otherwise, the slower,
// non-timed part of each iteration will make the benchmark take forever.
// Test many inserts at once to reduce the total iterations needed. Otherwise,
// the slower, non-timed part of each iteration will make the benchmark take
// forever.
BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {128, 512}});
template <typename Container,
@ -126,7 +127,7 @@ static void BM_StringCompare(benchmark::State& state) {
BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
static void BM_SetupTeardown(benchmark::State& state) {
if (state.thread_index == 0) {
if (state.thread_index() == 0) {
// No need to lock test_vector_mu here as this is running single-threaded.
test_vector = new std::vector<int>();
}
@ -139,7 +140,7 @@ static void BM_SetupTeardown(benchmark::State& state) {
test_vector->pop_back();
++i;
}
if (state.thread_index == 0) {
if (state.thread_index() == 0) {
delete test_vector;
}
}
@ -156,11 +157,11 @@ BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
static void BM_ParallelMemset(benchmark::State& state) {
int64_t size = state.range(0) / static_cast<int64_t>(sizeof(int));
int thread_size = static_cast<int>(size) / state.threads;
int from = thread_size * state.thread_index;
int thread_size = static_cast<int>(size) / state.threads();
int from = thread_size * state.thread_index();
int to = from + thread_size;
if (state.thread_index == 0) {
if (state.thread_index() == 0) {
test_vector = new std::vector<int>(static_cast<size_t>(size));
}
@ -172,7 +173,7 @@ static void BM_ParallelMemset(benchmark::State& state) {
}
}
if (state.thread_index == 0) {
if (state.thread_index() == 0) {
delete test_vector;
}
}
@ -214,7 +215,8 @@ BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test, std::string("abc"),
std::pair<int, double>(42, 3.8));
void BM_non_template_args(benchmark::State& state, int, double) {
while(state.KeepRunning()) {}
while (state.KeepRunning()) {
}
}
BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
@ -223,14 +225,14 @@ BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
static void BM_DenseThreadRanges(benchmark::State& st) {
switch (st.range(0)) {
case 1:
assert(st.threads == 1 || st.threads == 2 || st.threads == 3);
assert(st.threads() == 1 || st.threads() == 2 || st.threads() == 3);
break;
case 2:
assert(st.threads == 1 || st.threads == 3 || st.threads == 4);
assert(st.threads() == 1 || st.threads() == 3 || st.threads() == 4);
break;
case 3:
assert(st.threads == 5 || st.threads == 8 || st.threads == 11 ||
st.threads == 14);
assert(st.threads() == 5 || st.threads() == 8 || st.threads() == 11 ||
st.threads() == 14);
break;
default:
assert(false && "Invalid test case number");

View File

@ -9,7 +9,6 @@ extern "C" {
extern int ExternInt;
extern int ExternInt2;
extern int ExternInt3;
}
// CHECK-LABEL: test_basic:

View File

@ -2,6 +2,7 @@
#include "../src/commandlineflags.h"
#include "../src/internal_macros.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace benchmark {
@ -19,9 +20,7 @@ int setenv(const char* name, const char* value, int overwrite) {
return _putenv_s(name, value);
}
int unsetenv(const char* name) {
return _putenv_s(name, "");
}
int unsetenv(const char* name) { return _putenv_s(name, ""); }
#endif // BENCHMARK_OS_WINDOWS
@ -197,5 +196,33 @@ TEST(StringFromEnv, Valid) {
unsetenv("IN_ENV");
}
TEST(KvPairsFromEnv, Default) {
ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
EXPECT_THAT(KvPairsFromEnv("not_in_env", {{"foo", "bar"}}),
testing::ElementsAre(testing::Pair("foo", "bar")));
}
TEST(KvPairsFromEnv, MalformedReturnsDefault) {
ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
EXPECT_THAT(KvPairsFromEnv("in_env", {{"foo", "bar"}}),
testing::ElementsAre(testing::Pair("foo", "bar")));
unsetenv("IN_ENV");
}
TEST(KvPairsFromEnv, Single) {
ASSERT_EQ(setenv("IN_ENV", "foo=bar", 1), 0);
EXPECT_THAT(KvPairsFromEnv("in_env", {}),
testing::ElementsAre(testing::Pair("foo", "bar")));
unsetenv("IN_ENV");
}
TEST(KvPairsFromEnv, Multiple) {
ASSERT_EQ(setenv("IN_ENV", "foo=bar,baz=qux", 1), 0);
EXPECT_THAT(KvPairsFromEnv("in_env", {}),
testing::UnorderedElementsAre(testing::Pair("foo", "bar"),
testing::Pair("baz", "qux")));
unsetenv("IN_ENV");
}
} // namespace
} // namespace benchmark

View File

@ -4,6 +4,7 @@
#include <cmath>
#include <cstdlib>
#include <vector>
#include "benchmark/benchmark.h"
#include "output_test.h"
@ -12,8 +13,10 @@ namespace {
#define ADD_COMPLEXITY_CASES(...) \
int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
int AddComplexityTest(std::string test_name, std::string big_o_test_name,
std::string rms_test_name, std::string big_o) {
int AddComplexityTest(const std::string &test_name,
const std::string &big_o_test_name,
const std::string &rms_test_name,
const std::string &big_o, int family_index) {
SetSubstitutions({{"%name", test_name},
{"%bigo_name", big_o_test_name},
{"%rms_name", rms_test_name},
@ -25,25 +28,33 @@ int AddComplexityTest(std::string test_name, std::string big_o_test_name,
{{"^%bigo_name %bigo_str %bigo_str[ ]*$"},
{"^%bigo_name", MR_Not}, // Assert we we didn't only matched a name.
{"^%rms_name %rms %rms[ ]*$", MR_Next}});
AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"},
{"\"run_name\": \"%name\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": %int,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"BigO\",$", MR_Next},
{"\"cpu_coefficient\": %float,$", MR_Next},
{"\"real_coefficient\": %float,$", MR_Next},
{"\"big_o\": \"%bigo\",$", MR_Next},
{"\"time_unit\": \"ns\"$", MR_Next},
{"}", MR_Next},
{"\"name\": \"%rms_name\",$"},
{"\"run_name\": \"%name\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": %int,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"RMS\",$", MR_Next},
{"\"rms\": %float$", MR_Next},
{"}", MR_Next}});
AddCases(
TC_JSONOut,
{{"\"name\": \"%bigo_name\",$"},
{"\"family_index\": " + std::to_string(family_index) + ",$", MR_Next},
{"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"%name\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": %int,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"BigO\",$", MR_Next},
{"\"aggregate_unit\": \"time\",$", MR_Next},
{"\"cpu_coefficient\": %float,$", MR_Next},
{"\"real_coefficient\": %float,$", MR_Next},
{"\"big_o\": \"%bigo\",$", MR_Next},
{"\"time_unit\": \"ns\"$", MR_Next},
{"}", MR_Next},
{"\"name\": \"%rms_name\",$"},
{"\"family_index\": " + std::to_string(family_index) + ",$", MR_Next},
{"\"per_family_instance_index\": 0,$", MR_Next},
{"\"run_name\": \"%name\",$", MR_Next},
{"\"run_type\": \"aggregate\",$", MR_Next},
{"\"repetitions\": %int,$", MR_Next},
{"\"threads\": 1,$", MR_Next},
{"\"aggregate_name\": \"RMS\",$", MR_Next},
{"\"aggregate_unit\": \"percentage\",$", MR_Next},
{"\"rms\": %float$", MR_Next},
{"}", MR_Next}});
AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"},
{"^\"%bigo_name\"", MR_Not},
{"^\"%rms_name\",,%float,%float,,,,,,$", MR_Next}});
@ -56,7 +67,7 @@ int AddComplexityTest(std::string test_name, std::string big_o_test_name,
// --------------------------- Testing BigO O(1) --------------------------- //
// ========================================================================= //
void BM_Complexity_O1(benchmark::State& state) {
void BM_Complexity_O1(benchmark::State &state) {
for (auto _ : state) {
for (int i = 0; i < 1024; ++i) {
benchmark::DoNotOptimize(&i);
@ -82,15 +93,15 @@ const char *lambda_big_o_1 = "f\\(N\\)";
// Add enum tests
ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
enum_big_o_1);
enum_big_o_1, /*family_index=*/0);
// Add auto enum tests
ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
auto_big_o_1);
auto_big_o_1, /*family_index=*/1);
// Add lambda tests
ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
lambda_big_o_1);
lambda_big_o_1, /*family_index=*/2);
// ========================================================================= //
// --------------------------- Testing BigO O(N) --------------------------- //
@ -105,7 +116,7 @@ std::vector<int> ConstructRandomVector(int64_t size) {
return v;
}
void BM_Complexity_O_N(benchmark::State& state) {
void BM_Complexity_O_N(benchmark::State &state) {
auto v = ConstructRandomVector(state.range(0));
// Test worst case scenario (item not in vector)
const int64_t item_not_in_vector = state.range(0) * 2;
@ -137,17 +148,17 @@ const char *lambda_big_o_n = "f\\(N\\)";
// Add enum tests
ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
enum_auto_big_o_n);
enum_auto_big_o_n, /*family_index=*/3);
// Add lambda tests
ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
lambda_big_o_n);
lambda_big_o_n, /*family_index=*/4);
// ========================================================================= //
// ------------------------- Testing BigO O(N*lgN) ------------------------- //
// ========================================================================= //
static void BM_Complexity_O_N_log_N(benchmark::State& state) {
static void BM_Complexity_O_N_log_N(benchmark::State &state) {
auto v = ConstructRandomVector(state.range(0));
for (auto _ : state) {
std::sort(v.begin(), v.end());
@ -178,17 +189,19 @@ const char *lambda_big_o_n_lg_n = "f\\(N\\)";
// Add enum tests
ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n);
rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n,
/*family_index=*/6);
// Add lambda tests
ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n);
rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n,
/*family_index=*/7);
// ========================================================================= //
// -------- Testing formatting of Complexity with captured args ------------ //
// ========================================================================= //
void BM_ComplexityCaptureArgs(benchmark::State& state, int n) {
void BM_ComplexityCaptureArgs(benchmark::State &state, int n) {
for (auto _ : state) {
// This test requires a non-zero CPU time to avoid divide-by-zero
benchmark::DoNotOptimize(state.iterations());
@ -204,7 +217,7 @@ const std::string complexity_capture_name =
"BM_ComplexityCaptureArgs/capture_test";
ADD_COMPLEXITY_CASES(complexity_capture_name, complexity_capture_name + "_BigO",
complexity_capture_name + "_RMS", "N");
complexity_capture_name + "_RMS", "N", /*family_index=*/9);
// ========================================================================= //
// --------------------------- TEST CASES END ------------------------------ //

View File

@ -44,8 +44,7 @@ BENCHMARK_TEMPLATE(BM_template1, long);
BENCHMARK_TEMPLATE1(BM_template1, int);
template <class T>
struct BM_Fixture : public ::benchmark::Fixture {
};
struct BM_Fixture : public ::benchmark::Fixture {};
BENCHMARK_TEMPLATE_F(BM_Fixture, BM_template1, long)(benchmark::State& state) {
BM_empty(state);
@ -55,8 +54,8 @@ BENCHMARK_TEMPLATE1_F(BM_Fixture, BM_template2, int)(benchmark::State& state) {
}
void BM_counters(benchmark::State& state) {
BM_empty(state);
state.counters["Foo"] = 2;
BM_empty(state);
state.counters["Foo"] = 2;
}
BENCHMARK(BM_counters);

View File

@ -26,7 +26,8 @@ void TestHandler() {
}
void try_invalid_pause_resume(benchmark::State& state) {
#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && !defined(TEST_HAS_NO_EXCEPTIONS)
#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && \
!defined(TEST_HAS_NO_EXCEPTIONS)
try {
state.PauseTiming();
std::abort();
@ -57,13 +58,12 @@ void BM_diagnostic_test(benchmark::State& state) {
}
BENCHMARK(BM_diagnostic_test);
void BM_diagnostic_test_keep_running(benchmark::State& state) {
static bool called_once = false;
if (called_once == false) try_invalid_pause_resume(state);
while(state.KeepRunning()) {
while (state.KeepRunning()) {
benchmark::DoNotOptimize(state.iterations());
}

View File

@ -19,21 +19,23 @@ BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->DisplayAggregatesOnly();
int main(int argc, char* argv[]) {
const std::string output = GetFileReporterOutput(argc, argv);
if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 6 ||
if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 7 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3\"") != 3 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
1 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
1) {
std::cout << "Precondition mismatch. Expected to only find 6 "
1 ||
SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"") != 1) {
std::cout << "Precondition mismatch. Expected to only find 8 "
"occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
"\"name\": \"BM_SummaryRepeat/repeats:3\", "
"\"name\": \"BM_SummaryRepeat/repeats:3\", "
"\"name\": \"BM_SummaryRepeat/repeats:3\", "
"\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
"\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\", "
"\"name\": \"BM_SummaryRepeat/repeats:3_cv\"\nThe entire "
"output:\n";
std::cout << output;
return 1;

View File

@ -15,7 +15,7 @@ inline int Add42(int x) { return x + 42; }
struct NotTriviallyCopyable {
NotTriviallyCopyable();
explicit NotTriviallyCopyable(int x) : value(x) {}
NotTriviallyCopyable(NotTriviallyCopyable const&);
NotTriviallyCopyable(NotTriviallyCopyable const &);
int value;
};
@ -23,7 +23,6 @@ struct Large {
int value;
int data[2];
};
}
// CHECK-LABEL: test_with_rvalue:
extern "C" void test_with_rvalue() {
@ -118,8 +117,7 @@ extern "C" int test_div_by_two(int input) {
// CHECK-LABEL: test_inc_integer:
extern "C" int test_inc_integer() {
int x = 0;
for (int i=0; i < 5; ++i)
benchmark::DoNotOptimize(++x);
for (int i = 0; i < 5; ++i) benchmark::DoNotOptimize(++x);
// CHECK: movl $1, [[DEST:.*]]
// CHECK: {{(addl \$1,|incl)}} [[DEST]]
// CHECK: {{(addl \$1,|incl)}} [[DEST]]
@ -147,7 +145,7 @@ extern "C" void test_pointer_const_lvalue() {
// CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]])
// CHECK: ret
int x = 42;
int * const xp = &x;
int *const xp = &x;
benchmark::DoNotOptimize(xp);
}

View File

@ -1,27 +1,28 @@
#include "benchmark/benchmark.h"
#include <cstdint>
#include "benchmark/benchmark.h"
namespace {
#if defined(__GNUC__)
std::uint64_t double_up(const std::uint64_t x) __attribute__((const));
#endif
std::uint64_t double_up(const std::uint64_t x) { return x * 2; }
}
} // namespace
// Using DoNotOptimize on types like BitRef seems to cause a lot of problems
// with the inline assembly on both GCC and Clang.
struct BitRef {
int index;
unsigned char &byte;
unsigned char& byte;
public:
public:
static BitRef Make() {
static unsigned char arr[2] = {};
BitRef b(1, arr[0]);
return b;
}
private:
private:
BitRef(int i, unsigned char& b) : index(i), byte(b) {}
};

View File

@ -1,36 +1,41 @@
#include "benchmark/benchmark.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include "benchmark/benchmark.h"
namespace {
class TestReporter : public benchmark::ConsoleReporter {
public:
virtual bool ReportContext(const Context& context) {
virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
return ConsoleReporter::ReportContext(context);
};
virtual void ReportRuns(const std::vector<Run>& report) {
virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
++count_;
max_family_index_ =
std::max<size_t>(max_family_index_, report[0].family_index);
ConsoleReporter::ReportRuns(report);
};
TestReporter() : count_(0) {}
TestReporter() : count_(0), max_family_index_(0) {}
virtual ~TestReporter() {}
size_t GetCount() const { return count_; }
size_t GetMaxFamilyIndex() const { return max_family_index_; }
private:
mutable size_t count_;
mutable size_t max_family_index_;
};
} // end namespace
@ -65,7 +70,7 @@ static void BM_FooBa(benchmark::State& state) {
}
BENCHMARK(BM_FooBa);
int main(int argc, char **argv) {
int main(int argc, char** argv) {
bool list_only = false;
for (int i = 0; i < argc; ++i)
list_only |= std::string(argv[i]).find("--benchmark_list_tests") !=
@ -98,6 +103,15 @@ int main(int argc, char **argv) {
<< std::endl;
return -1;
}
const size_t max_family_index = test_reporter.GetMaxFamilyIndex();
const size_t num_families = reports_count == 0 ? 0 : 1 + max_family_index;
if (num_families != expected_reports) {
std::cerr << "ERROR: Expected " << expected_reports
<< " test families to be run but num_families = "
<< num_families << std::endl;
return -1;
}
}
return 0;

View File

@ -1,39 +1,41 @@
#include "benchmark/benchmark.h"
#include <cassert>
#include <memory>
class MyFixture : public ::benchmark::Fixture {
#include "benchmark/benchmark.h"
#define FIXTURE_BECHMARK_NAME MyFixture
class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
public:
void SetUp(const ::benchmark::State& state) {
if (state.thread_index == 0) {
void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
if (state.thread_index() == 0) {
assert(data.get() == nullptr);
data.reset(new int(42));
}
}
void TearDown(const ::benchmark::State& state) {
if (state.thread_index == 0) {
void TearDown(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
if (state.thread_index() == 0) {
assert(data.get() != nullptr);
data.reset();
}
}
~MyFixture() { assert(data == nullptr); }
~FIXTURE_BECHMARK_NAME() { assert(data == nullptr); }
std::unique_ptr<int> data;
};
BENCHMARK_F(MyFixture, Foo)(benchmark::State &st) {
BENCHMARK_F(FIXTURE_BECHMARK_NAME, Foo)(benchmark::State& st) {
assert(data.get() != nullptr);
assert(*data == 42);
for (auto _ : st) {
}
}
BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) {
if (st.thread_index == 0) {
BENCHMARK_DEFINE_F(FIXTURE_BECHMARK_NAME, Bar)(benchmark::State& st) {
if (st.thread_index() == 0) {
assert(data.get() != nullptr);
assert(*data == 42);
}
@ -43,7 +45,7 @@ BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) {
}
st.SetItemsProcessed(st.range(0));
}
BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42);
BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu();
BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, Bar)->Arg(42);
BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, Bar)->Arg(42)->ThreadPerCpu();
BENCHMARK_MAIN();
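The thread_index/threads edits above follow the 1.6 change from public data members to accessor functions; a minimal before/after sketch (not part of this commit), with an illustrative benchmark name:

#include "benchmark/benchmark.h"

static void BM_PerThread(benchmark::State& state) {
  // 1.5.x: state.thread_index / state.threads were data members.
  // 1.6.x: they are member functions.
  const int shard = state.thread_index();
  const int nthreads = state.threads();
  for (auto _ : state) {
    benchmark::DoNotOptimize(shard + nthreads);
  }
}
BENCHMARK(BM_PerThread)->Threads(4);
BENCHMARK_MAIN();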

Some files were not shown because too many files have changed in this diff.