Mirror of https://github.com/FEX-Emu/vixl.git (synced 2024-11-23 14:40:17 +00:00)

commit 6e2c8275d5 (parent 5289c5900f)

VIXL Release 1.9

Refer to the README.md and LICENCE files for details.
README.md (57 lines changed)
@@ -1,44 +1,24 @@
-VIXL: AArch64 Runtime Code Generation Library Version 1.8
+VIXL: AArch64 Runtime Code Generation Library Version 1.9
 =========================================================
 
 Contents:
 
-* Requirements
 * Overview
+* Requirements
 * Known limitations
 * Usage
 
-Requirements
-============
-
-To build VIXL the following software is required:
-
-1. Python 2.7
-2. SCons 2.0
-3. GCC 4.6+
-
-A 64-bit host machine is required, implementing an LP64 data model. VIXL has
-only been tested using GCC on AArch64 Debian and amd64 Ubuntu systems.
-
-To run the linter stage of the tests, the following software is also required:
-
-1. Git
-2. [Google's `cpplint.py`][cpplint]
-
-Refer to the 'Usage' section for details.
-
-
 Overview
 ========
 
-VIXL is made of three components.
+VIXL contains three components.
 
-1. A programmatic assembler to generate A64 code at runtime. The assembler
+1. A programmatic **assembler** to generate A64 code at runtime. The assembler
    abstracts some of the constraints of the A64 ISA; for example, most
    instructions support any immediate.
-2. A disassembler which can print any instruction emitted by the assembler.
-3. A simulator which can simulate any instruction emitted by the assembler.
+2. A **disassembler** that can print any instruction emitted by the assembler.
+3. A **simulator** that can simulate any instruction emitted by the assembler.
    The simulator allows generated code to be run on another architecture
    without the need for a full ISA model.
 
@@ -48,11 +28,32 @@ Changes from previous versions of VIXL can be found in the
 [Changelog](doc/changelog.md).
 
 
+Requirements
+============
+
+To build VIXL the following software is required:
+
+1. Python 2.7
+2. SCons 2.0
+3. GCC 4.8+ or Clang 3.4+
+
+A 64-bit host machine is required, implementing an LP64 data model. VIXL has
+been tested using GCC on AArch64 Debian, GCC and Clang on amd64 Ubuntu
+systems.
+
+To run the linter stage of the tests, the following software is also required:
+
+1. Git
+2. [Google's `cpplint.py`][cpplint]
+
+Refer to the 'Usage' section for details.
+
+
 Known Limitations
 =================
 
-VIXL was developed to target JavaScript engines so a number of features from A64
-were deemed unnecessary:
+VIXL was developed for JavaScript engines so a number of features from A64 were
+deemed unnecessary:
 
 * Limited rounding mode support for floating point.
 * Limited support for synchronisation instructions.
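As a quick illustration of how the three components described in the README's overview fit together (not part of this commit): the sketch below follows the style of the bundled examples and uses the post-rename include paths introduced by this commit; the constructor and accessor signatures are recalled from the VIXL 1.x API and may differ in detail.

// Minimal sketch: generate A64 code with the assembler, run it on the simulator.
#include <cinttypes>
#include <cstdio>

#include "vixl/a64/macro-assembler-a64.h"
#include "vixl/a64/simulator-a64.h"

using namespace vixl;

int main() {
  byte buffer[4096];
  MacroAssembler masm(buffer, sizeof(buffer));

  // Generate a tiny function that returns 42. The macro-assembler accepts
  // (almost) any immediate and synthesises the required instructions.
  masm.Mov(x0, 42);
  masm.Ret();
  masm.FinalizeCode();

  // Run the generated code on the simulator (requires a USE_SIMULATOR build).
  Decoder decoder;
  Simulator simulator(&decoder);
  simulator.RunFrom(reinterpret_cast<Instruction*>(buffer));
  printf("x0 = %" PRIi64 "\n", simulator.xreg(0));
  return 0;
}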
SConstruct (63 lines changed)

@@ -49,18 +49,19 @@ Some common build targets are:
 # Global configuration.
 PROJ_SRC_DIR = 'src'
 PROJ_SRC_FILES = '''
-src/a64/assembler-a64.cc
-src/a64/cpu-a64.cc
-src/a64/debugger-a64.cc
-src/a64/decoder-a64.cc
-src/a64/disasm-a64.cc
-src/a64/instructions-a64.cc
-src/a64/instrument-a64.cc
-src/a64/logic-a64.cc
-src/a64/macro-assembler-a64.cc
-src/a64/simulator-a64.cc
-src/code-buffer.cc
-src/utils.cc
+src/vixl/a64/assembler-a64.cc
+src/vixl/a64/cpu-a64.cc
+src/vixl/a64/debugger-a64.cc
+src/vixl/a64/decoder-a64.cc
+src/vixl/a64/disasm-a64.cc
+src/vixl/a64/instructions-a64.cc
+src/vixl/a64/instrument-a64.cc
+src/vixl/a64/logic-a64.cc
+src/vixl/a64/macro-assembler-a64.cc
+src/vixl/a64/simulator-a64.cc
+src/vixl/code-buffer.cc
+src/vixl/compiler-intrinsics.cc
+src/vixl/utils.cc
 '''.split()
 PROJ_EXAMPLES_DIR = 'examples'
 PROJ_EXAMPLES_SRC_FILES = '''

@@ -119,9 +120,7 @@ TARGET_SRC_FILES = {
     benchmarks/bench-branch-link-masm.cc
     '''.split()
 }
-RELEASE_OBJ_DIR = 'obj/release'
-DEBUG_OBJ_DIR = 'obj/debug'
+OBJ_DIR = 'obj'
 
 # Helper functions.
 def abort(message):

@@ -133,6 +132,10 @@ def list_target(obj_dir, src_files):
   return map(lambda x: os.path.join(obj_dir, x), src_files)
 
 
+def is_compiler(compiler):
+  return env['CXX'].find(compiler) == 0
+
+
 def create_variant(obj_dir, targets_dir):
   VariantDir(os.path.join(obj_dir, PROJ_SRC_DIR), PROJ_SRC_DIR)
   for directory in targets_dir.itervalues():

@@ -146,10 +149,9 @@ args.Add(EnumVariable('mode', 'Build mode', 'release',
 sim_default = 'off' if platform.machine() == 'aarch64' else 'on'
 args.Add(EnumVariable('simulator', 'build for the simulator', sim_default,
                       allowed_values = ['on', 'off']))
+args.Add('std', 'c++ standard')
 
 # Configure the environment.
-create_variant(RELEASE_OBJ_DIR, TARGET_SRC_DIR)
-create_variant(DEBUG_OBJ_DIR, TARGET_SRC_DIR)
 env = Environment(variables=args)
 
 # Commandline help.

@@ -175,18 +177,32 @@ if os.environ.get('LINKFLAGS'):
   env.Append(LINKFLAGS = os.environ.get('LINKFLAGS').split())
 
 # Always look in 'src' for include files.
+# TODO: Restore the '-Wunreachable-code' flag. This flag breaks builds for clang
+# 3.4 with std=c++98. So we need to re-enable this conditionally when clang is at
+# version 3.5 or later.
 env.Append(CPPPATH = [PROJ_SRC_DIR])
 env.Append(CPPFLAGS = ['-Wall',
                        '-Werror',
                        '-fdiagnostics-show-option',
                        '-Wextra',
                        '-Wredundant-decls',
                        '-pedantic',
                        # Explicitly enable the write-strings warning. VIXL uses
                        # const correctly when handling string constants.
                        '-Wwrite-strings'])
 
+build_suffix = ''
+std_path = 'default-std'
+
+if 'std' in env:
+  env.Append(CPPFLAGS = ['-std=' + env['std']])
+  std_path = env['std']
+
+if is_compiler('clang++'):
+  # This warning only works for Clang, when compiling the code base as C++11
+  # or newer. The compiler does not complain if the option is passed when
+  # compiling earlier C++ standards.
+  env.Append(CPPFLAGS = ['-Wimplicit-fallthrough'])
+
 if env['simulator'] == 'on':
   env.Append(CPPFLAGS = ['-DUSE_SIMULATOR'])

@@ -196,11 +212,9 @@ if env['mode'] == 'debug':
   env.Append(CPPFLAGS = ['-g', '-DVIXL_DEBUG'])
   # Append the debug mode suffix to the executable name.
   build_suffix += '_g'
-  build_dir = DEBUG_OBJ_DIR
 else:
   # Release mode.
   env.Append(CPPFLAGS = ['-O3'])
-  build_dir = RELEASE_OBJ_DIR
   process = subprocess.Popen(env['CXX'] + ' --version | grep "gnu.*4\.8"',
                              shell = True,
                              stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

@@ -214,6 +228,9 @@ else:
     # GCC 4.8.
     env.Append(CPPFLAGS = ['-Wno-maybe-uninitialized'])
 
+# Configure build directory
+build_dir = os.path.join(OBJ_DIR, env['mode'], env['CXX'], std_path, '')
+create_variant(build_dir, TARGET_SRC_DIR)
+
 # The lists of available targets and target names.
 targets = []

@@ -226,7 +243,7 @@ def create_alias(name, target):
 
 
 # The vixl library.
-libvixl = env.Library('vixl' + build_suffix,
+libvixl = env.Library(build_dir + 'vixl' + build_suffix,
                       list_target(build_dir, PROJ_SRC_FILES))
 create_alias('libvixl', libvixl)

@@ -238,7 +255,7 @@ test_ex_vdir = os.path.join(build_dir, 'test_examples')
 VariantDir(test_ex_vdir, '.')
 test_ex_obj = env.Object(list_target(test_ex_vdir, PROJ_EXAMPLES_SRC_FILES),
                          CPPFLAGS = env['CPPFLAGS'] + ['-DTEST_EXAMPLES'])
-test = env.Program('test-runner' + build_suffix,
+test = env.Program(build_dir + 'test-runner' + build_suffix,
                    list_target(build_dir, TARGET_SRC_FILES['test']) +
                    test_ex_obj + libvixl,
                    CPPPATH = env['CPPPATH'] + [PROJ_EXAMPLES_DIR])

@@ -248,7 +265,7 @@ create_alias('test', test)
 benchmarks = ['bench-dataop', 'bench-branch', 'bench-branch-link',
               'bench-branch-masm', 'bench-branch-link-masm']
 for bench in benchmarks:
-  prog = env.Program(bench + build_suffix,
+  prog = env.Program(build_dir + bench + build_suffix,
                      list_target(build_dir, TARGET_SRC_FILES[bench]) + libvixl)
   create_alias(bench, prog)
 # Alias to build all benchmarks.

@@ -258,7 +275,7 @@ create_alias('benchmarks', benchmarks)
 examples = []
 for example in PROJ_EXAMPLES_SRC_FILES:
   example_name = "example-" + os.path.splitext(os.path.basename(example))[0]
-  prog = env.Program(example_name,
+  prog = env.Program(build_dir + example_name,
                      [os.path.join(build_dir, example)] + libvixl,
                      CPPPATH = env['CPPPATH'] + [PROJ_EXAMPLES_DIR])
   create_alias(example_name, prog)
@@ -24,9 +24,9 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/instructions-a64.h"
-#include "globals.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/globals.h"
 
 using namespace vixl;
 

@@ -24,9 +24,9 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/instructions-a64.h"
-#include "globals.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/globals.h"
 
 using namespace vixl;
 

@@ -24,10 +24,10 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "globals.h"
+#include "vixl/globals.h"
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/instructions-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
 
 using namespace vixl;
 

@@ -24,9 +24,9 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/instructions-a64.h"
-#include "globals.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/globals.h"
 
 using namespace vixl;
 

@@ -24,9 +24,9 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/instructions-a64.h"
-#include "globals.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/globals.h"
 
 using namespace vixl;
 
@@ -1,6 +1,13 @@
 VIXL Change Log
 ===============
 
+* 1.9
+    + Improved compatibility with Android build system.
+    + Improved compatibility with Clang toolchain.
+    + Added support for `umulh` instruction.
+    + Added support for `fcmpe` and `fccmpe` instructions.
+    + Other small bug fixes and improvements.
+
 * 1.8
     + Complete NEON instruction set support.
     + Support long branches using veneers.
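The `umulh`, `fcmpe`, and `fccmpe` entries above correspond to the new Assembler methods added later in this commit. A hedged usage sketch (the surrounding MacroAssembler setup is assumed, as in the bundled examples):

// Sketch only: emitting the instructions added in 1.9.
masm.umulh(x0, x1, x2);           // x0 = bits <127:64> of the unsigned x1 * x2.
masm.fcmpe(d0, d1);               // Signaling compare: raises Invalid Operation
                                  // even for quiet NaN operands.
masm.fcmpe(s0, 0.0);              // Signaling compare against +0.0.
masm.fccmpe(d0, d1, NoFlag, eq);  // Conditional signaling compare.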
@@ -27,7 +27,7 @@
 #ifndef VIXL_EXAMPLES_CUSTOM_DISASSEMBLER_H_
 #define VIXL_EXAMPLES_CUSTOM_DISASSEMBLER_H_
 
-#include "a64/disasm-a64.h"
+#include "vixl/a64/disasm-a64.h"
 
 using namespace vixl;
 
@@ -27,9 +27,9 @@
 #ifndef VIXL_EXAMPLE_EXAMPLES_H_
 #define VIXL_EXAMPLE_EXAMPLES_H_
 
-#include "a64/simulator-a64.h"
-#include "a64/debugger-a64.h"
-#include "a64/macro-assembler-a64.h"
+#include "vixl/a64/simulator-a64.h"
+#include "vixl/a64/debugger-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
 
 using namespace vixl;
 
@@ -24,8 +24,8 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/simulator-a64.h"
-#include "a64/macro-assembler-a64.h"
+#include "vixl/a64/simulator-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
 
 #define BUF_SIZE (4096)
 #define __ masm->
 
@@ -117,7 +117,7 @@ int main(void) {
   float mat1[kLength], mat2[kLength], output[kLength];
 
   // Initialise the output matrix to the zero matrix.
-  memset(output, 0, sizeof(float)*kLength);
+  memset(output, 0, sizeof(output[0]) * kLength);
 
   // Fill the two input matrices with some 32 bit floating point values.
   // Array initialisation using curly brackets is also possible like so:
@@ -26,7 +26,7 @@
 
 
 #include <cmath>
-#include "a64/assembler-a64.h"
+#include "vixl/a64/assembler-a64.h"
 
 namespace vixl {
 

@@ -35,7 +35,7 @@ CPURegister CPURegList::PopLowestIndex() {
   if (IsEmpty()) {
     return NoCPUReg;
   }
-  int index = CountTrailingZeros(list_, kRegListSizeInBits);
+  int index = CountTrailingZeros(list_);
   VIXL_ASSERT((1 << index) & list_);
   Remove(index);
   return CPURegister(index, size_, type_);

@@ -47,7 +47,7 @@ CPURegister CPURegList::PopHighestIndex() {
   if (IsEmpty()) {
     return NoCPUReg;
   }
-  int index = CountLeadingZeros(list_, kRegListSizeInBits);
+  int index = CountLeadingZeros(list_);
   index = kRegListSizeInBits - 1 - index;
   VIXL_ASSERT((1 << index) & list_);
   Remove(index);

@@ -463,6 +463,12 @@ bool MemOperand::IsPostIndex() const {
 }
 
 
+void MemOperand::AddOffset(int64_t offset) {
+  VIXL_ASSERT(IsImmediateOffset());
+  offset_ += offset;
+}
+
+
 // Assembler
 Assembler::Assembler(byte* buffer, size_t capacity,
                      PositionIndependentCodeOption pic)

@@ -1349,6 +1355,14 @@ void Assembler::smulh(const Register& xd,
 }
 
 
+void Assembler::umulh(const Register& xd,
+                      const Register& xn,
+                      const Register& xm) {
+  VIXL_ASSERT(xd.Is64Bits() && xn.Is64Bits() && xm.Is64Bits());
+  DataProcessing3Source(xd, xn, xm, xzr, UMULH_x);
+}
+
+
 void Assembler::udiv(const Register& rd,
                      const Register& rn,
                      const Register& rm) {

@@ -2628,33 +2642,78 @@ void Assembler::fnmul(const VRegister& vd,
 }
 
 
-void Assembler::fcmp(const VRegister& vn,
-                     const VRegister& vm) {
+void Assembler::FPCompareMacro(const VRegister& vn,
+                               double value,
+                               FPTrapFlags trap) {
+  USE(value);
+  // Although the fcmp{e} instructions can strictly only take an immediate
+  // value of +0.0, we don't need to check for -0.0 because the sign of 0.0
+  // doesn't affect the result of the comparison.
+  VIXL_ASSERT(value == 0.0);
+  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
+  Instr op = (trap == EnableTrap) ? FCMPE_zero : FCMP_zero;
+  Emit(FPType(vn) | op | Rn(vn));
+}
+
+
+void Assembler::FPCompareMacro(const VRegister& vn,
+                               const VRegister& vm,
+                               FPTrapFlags trap) {
   VIXL_ASSERT(vn.Is1S() || vn.Is1D());
   VIXL_ASSERT(vn.IsSameSizeAndType(vm));
-  Emit(FPType(vn) | FCMP | Rm(vm) | Rn(vn));
+  Instr op = (trap == EnableTrap) ? FCMPE : FCMP;
+  Emit(FPType(vn) | op | Rm(vm) | Rn(vn));
+}
+
+
+void Assembler::fcmp(const VRegister& vn,
+                     const VRegister& vm) {
+  FPCompareMacro(vn, vm, DisableTrap);
+}
+
+
+void Assembler::fcmpe(const VRegister& vn,
+                      const VRegister& vm) {
+  FPCompareMacro(vn, vm, EnableTrap);
+}
 
 
 void Assembler::fcmp(const VRegister& vn,
                      double value) {
-  USE(value);
-  // Although the fcmp instruction can strictly only take an immediate value of
-  // +0.0, we don't need to check for -0.0 because the sign of 0.0 doesn't
-  // affect the result of the comparison.
-  VIXL_ASSERT(value == 0.0);
-  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
-  Emit(FPType(vn) | FCMP_zero | Rn(vn));
+  FPCompareMacro(vn, value, DisableTrap);
+}
+
+
+void Assembler::fcmpe(const VRegister& vn,
+                      double value) {
+  FPCompareMacro(vn, value, EnableTrap);
+}
+
+
+void Assembler::FPCCompareMacro(const VRegister& vn,
+                                const VRegister& vm,
+                                StatusFlags nzcv,
+                                Condition cond,
+                                FPTrapFlags trap) {
+  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
+  VIXL_ASSERT(vn.IsSameSizeAndType(vm));
+  Instr op = (trap == EnableTrap) ? FCCMPE : FCCMP;
+  Emit(FPType(vn) | op | Rm(vm) | Cond(cond) | Rn(vn) | Nzcv(nzcv));
+}
 
 
 void Assembler::fccmp(const VRegister& vn,
                       const VRegister& vm,
                       StatusFlags nzcv,
                       Condition cond) {
-  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
-  VIXL_ASSERT(vn.IsSameSizeAndType(vm));
-  Emit(FPType(vn) | FCCMP | Rm(vm) | Cond(cond) | Rn(vn) | Nzcv(nzcv));
+  FPCCompareMacro(vn, vm, nzcv, cond, DisableTrap);
+}
+
+
+void Assembler::fccmpe(const VRegister& vn,
+                       const VRegister& vm,
+                       StatusFlags nzcv,
+                       Condition cond) {
+  FPCCompareMacro(vn, vm, nzcv, cond, EnableTrap);
 }
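To make the refactoring above concrete: the four public comparison entry points now differ only in the FPTrapFlags value they pass down, which selects between the trapping and non-trapping opcodes. A short sketch (register choices arbitrary):

// Each call funnels into one of the two FPCompareMacro overloads:
masm.fcmp(s0, s1);    // DisableTrap -> FCMP
masm.fcmpe(s0, s1);   // EnableTrap  -> FCMPE
masm.fcmp(d2, 0.0);   // DisableTrap -> FCMP_zero (immediate must be +/-0.0)
masm.fcmpe(d2, 0.0);  // EnableTrap  -> FCMPE_zero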
@@ -4948,6 +5007,7 @@ bool Assembler::IsImmFP64(double imm) {
 
 
 bool Assembler::IsImmLSPair(int64_t offset, unsigned access_size) {
+  VIXL_ASSERT(access_size <= kQRegSizeInBytesLog2);
   bool offset_is_size_multiple =
       (((offset >> access_size) << access_size) == offset);
   return offset_is_size_multiple && is_int7(offset >> access_size);

@@ -4955,6 +5015,7 @@ bool Assembler::IsImmLSPair(int64_t offset, unsigned access_size) {
 
 
 bool Assembler::IsImmLSScaled(int64_t offset, unsigned access_size) {
+  VIXL_ASSERT(access_size <= kQRegSizeInBytesLog2);
   bool offset_is_size_multiple =
       (((offset >> access_size) << access_size) == offset);
   return offset_is_size_multiple && is_uint12(offset >> access_size);

@@ -5319,10 +5380,8 @@ bool AreAliased(const CPURegister& reg1, const CPURegister& reg2,
     }
   }
 
-  int number_of_unique_regs =
-      CountSetBits(unique_regs, sizeof(unique_regs) * 8);
-  int number_of_unique_fpregs =
-      CountSetBits(unique_fpregs, sizeof(unique_fpregs) * 8);
+  int number_of_unique_regs = CountSetBits(unique_regs);
+  int number_of_unique_fpregs = CountSetBits(unique_fpregs);
 
   VIXL_ASSERT(number_of_valid_regs >= number_of_unique_regs);
   VIXL_ASSERT(number_of_valid_fpregs >= number_of_unique_fpregs);
@@ -28,11 +28,11 @@
 #define VIXL_A64_ASSEMBLER_A64_H_
 
 
-#include "globals.h"
-#include "invalset.h"
-#include "utils.h"
-#include "code-buffer.h"
-#include "a64/instructions-a64.h"
+#include "vixl/globals.h"
+#include "vixl/invalset.h"
+#include "vixl/utils.h"
+#include "vixl/code-buffer.h"
+#include "vixl/a64/instructions-a64.h"
 
 namespace vixl {
 

@@ -55,6 +55,7 @@ class CPURegister {
     kInvalid = 0,
     kRegister,
     kVRegister,
+    kFPRegister = kVRegister,
     kNoRegister
   };
 

@@ -556,6 +557,10 @@ class CPURegList {
                             const CPURegList& list_3,
                             const CPURegList& list_4);
 
+  bool Overlaps(const CPURegList& other) const {
+    return (type_ == other.type_) && ((list_ & other.list_) != 0);
+  }
+
   RegList list() const {
     VIXL_ASSERT(IsValid());
     return list_;

@@ -600,7 +605,7 @@ class CPURegList {
 
   int Count() const {
     VIXL_ASSERT(IsValid());
-    return CountSetBits(list_, kRegListSizeInBits);
+    return CountSetBits(list_);
   }
 
   unsigned RegisterSizeInBits() const {

@@ -630,7 +635,7 @@ class CPURegList {
 
 // AAPCS64 callee-saved registers.
 extern const CPURegList kCalleeSaved;
-extern const CPURegList kCalleeSavedFP;
+extern const CPURegList kCalleeSavedV;
 
 
 // AAPCS64 caller-saved registers. Note that this includes lr.

@@ -710,17 +715,17 @@ class MemOperand {
   explicit MemOperand(Register base,
                      int64_t offset = 0,
                      AddrMode addrmode = Offset);
-  explicit MemOperand(Register base,
-                      Register regoffset,
-                      Shift shift = LSL,
-                      unsigned shift_amount = 0);
-  explicit MemOperand(Register base,
-                      Register regoffset,
-                      Extend extend,
-                      unsigned shift_amount = 0);
-  explicit MemOperand(Register base,
-                      const Operand& offset,
-                      AddrMode addrmode = Offset);
+  MemOperand(Register base,
+             Register regoffset,
+             Shift shift = LSL,
+             unsigned shift_amount = 0);
+  MemOperand(Register base,
+             Register regoffset,
+             Extend extend,
+             unsigned shift_amount = 0);
+  MemOperand(Register base,
+             const Operand& offset,
+             AddrMode addrmode = Offset);
 
   const Register& base() const { return base_; }
   const Register& regoffset() const { return regoffset_; }

@@ -734,6 +739,8 @@ class MemOperand {
   bool IsPreIndex() const;
   bool IsPostIndex() const;
 
+  void AddOffset(int64_t offset);
+
  private:
   Register base_;
   Register regoffset_;

@@ -1606,6 +1613,11 @@ class Assembler {
     umaddl(rd, rn, rm, xzr);
   }
 
+  // Unsigned multiply high: 64 x 64 -> 64-bit <127:64>.
+  void umulh(const Register& xd,
+             const Register& xn,
+             const Register& xm);
+
   // Signed long multiply and subtract: 64 - (32 x 32) -> 64-bit.
   void smsubl(const Register& rd,
               const Register& rn,

@@ -2022,18 +2034,44 @@ class Assembler {
   // FP round to integer, towards zero.
   void frintz(const VRegister& vd, const VRegister& vn);
 
+  void FPCompareMacro(const VRegister& vn,
+                      double value,
+                      FPTrapFlags trap);
+
+  void FPCompareMacro(const VRegister& vn,
+                      const VRegister& vm,
+                      FPTrapFlags trap);
+
   // FP compare registers.
   void fcmp(const VRegister& vn, const VRegister& vm);
 
   // FP compare immediate.
   void fcmp(const VRegister& vn, double value);
 
+  void FPCCompareMacro(const VRegister& vn,
+                       const VRegister& vm,
+                       StatusFlags nzcv,
+                       Condition cond,
+                       FPTrapFlags trap);
+
   // FP conditional compare.
   void fccmp(const VRegister& vn,
              const VRegister& vm,
             StatusFlags nzcv,
             Condition cond);
 
+  // FP signaling compare registers.
+  void fcmpe(const VRegister& vn, const VRegister& vm);
+
+  // FP signaling compare immediate.
+  void fcmpe(const VRegister& vn, double value);
+
+  // FP conditional signaling compare.
+  void fccmpe(const VRegister& vn,
+              const VRegister& vm,
+              StatusFlags nzcv,
+              Condition cond);
+
   // FP conditional select.
   void fcsel(const VRegister& vd,
              const VRegister& vn,

@@ -3949,8 +3987,8 @@ class Assembler {
                            unsigned* n = NULL,
                            unsigned* imm_s = NULL,
                            unsigned* imm_r = NULL);
-  static bool IsImmLSPair(int64_t offset, unsigned size);
-  static bool IsImmLSScaled(int64_t offset, unsigned size);
+  static bool IsImmLSPair(int64_t offset, unsigned access_size);
+  static bool IsImmLSScaled(int64_t offset, unsigned access_size);
   static bool IsImmLSUnscaled(int64_t offset);
   static bool IsImmMovn(uint64_t imm, unsigned reg_size);
   static bool IsImmMovz(uint64_t imm, unsigned reg_size);
@@ -225,6 +225,11 @@ inline Condition InvertCondition(Condition cond) {
   return static_cast<Condition>(cond ^ 1);
 }
 
+enum FPTrapFlags {
+  EnableTrap = 1,
+  DisableTrap = 0
+};
+
 enum FlagsUpdate {
   SetFlags = 1,
   LeaveFlags = 0

@@ -1092,8 +1097,10 @@ enum FPCompareOp {
   FCMP_zero = FCMP_s_zero,
   FCMPE_s = FPCompareFixed | 0x00000010,
   FCMPE_d = FPCompareFixed | FP64 | 0x00000010,
+  FCMPE = FCMPE_s,
   FCMPE_s_zero = FPCompareFixed | 0x00000018,
-  FCMPE_d_zero = FPCompareFixed | FP64 | 0x00000018
+  FCMPE_d_zero = FPCompareFixed | FP64 | 0x00000018,
+  FCMPE_zero = FCMPE_s_zero
 };
 
 // Floating point conditional compare.
@@ -24,8 +24,8 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "utils.h"
-#include "a64/cpu-a64.h"
+#include "vixl/utils.h"
+#include "vixl/a64/cpu-a64.h"
 
 namespace vixl {
 
@@ -27,8 +27,8 @@
 #ifndef VIXL_CPU_A64_H
 #define VIXL_CPU_A64_H
 
-#include "globals.h"
-#include "instructions-a64.h"
+#include "vixl/globals.h"
+#include "vixl/a64/instructions-a64.h"
 
 namespace vixl {
 
@@ -26,7 +26,7 @@
 
 #ifdef USE_SIMULATOR
 
-#include "a64/debugger-a64.h"
+#include "vixl/a64/debugger-a64.h"
 
 namespace vixl {
 

@@ -645,7 +645,8 @@ void Debugger::VisitException(const Instruction* instr) {
     case BRK:
       DoBreakpoint(instr);
       return;
-    case HLT:  // Fall through.
+    case HLT:
+      VIXL_FALLTHROUGH();
     default: Simulator::VisitException(instr);
   }
 }

@@ -994,6 +995,7 @@ Token* FormatToken::Tokenize(const char* arg) {
       break;
     case 'i':
      if (length == 1) return new Format<uint32_t>("%08" PRIx32, 'i');
+      VIXL_FALLTHROUGH();
    default: return NULL;
  }
 
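The VIXL_FALLTHROUGH() calls introduced here (and throughout this commit) replace bare `// Fall through.` comments so that Clang's -Wimplicit-fallthrough, enabled in the SConstruct changes above, can verify intentional fall-throughs. Its definition lives in globals.h, which this diff does not show; a plausible reconstruction, assuming Clang with C++11 attributes:

// Hypothetical sketch -- the real definition is in src/vixl/globals.h,
// which this commit does not touch.
#if defined(__clang__) && (__cplusplus >= 201103L)
#define VIXL_FALLTHROUGH() [[clang::fallthrough]]  // Checked by the compiler.
#else
#define VIXL_FALLTHROUGH() do {} while (0)         // No-op elsewhere.
#endif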
@@ -32,10 +32,10 @@
 #include <errno.h>
 #include <vector>
 
-#include "globals.h"
-#include "utils.h"
-#include "a64/constants-a64.h"
-#include "a64/simulator-a64.h"
+#include "vixl/globals.h"
+#include "vixl/utils.h"
+#include "vixl/a64/constants-a64.h"
+#include "vixl/a64/simulator-a64.h"
 
 namespace vixl {
 
@@ -24,9 +24,9 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "globals.h"
-#include "utils.h"
-#include "a64/decoder-a64.h"
+#include "vixl/globals.h"
+#include "vixl/utils.h"
+#include "vixl/a64/decoder-a64.h"
 
 namespace vixl {
 

@@ -488,6 +488,7 @@ void Decoder::DecodeDataProcessing(const Instruction* instr) {
     case 6: {
       if (instr->Bit(29) == 0x1) {
         VisitUnallocated(instr);
+        VIXL_FALLTHROUGH();
       } else {
         if (instr->Bit(30) == 0) {
           if ((instr->Bit(15) == 0x1) ||
@@ -29,8 +29,8 @@
 
 #include <list>
 
-#include "globals.h"
-#include "a64/instructions-a64.h"
+#include "vixl/globals.h"
+#include "vixl/a64/instructions-a64.h"
 
 
 // List macro containing all visitors needed by the decoder class.
@@ -25,7 +25,7 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <cstdlib>
-#include "a64/disasm-a64.h"
+#include "vixl/a64/disasm-a64.h"
 
 namespace vixl {
 

@@ -890,9 +890,9 @@ void Disassembler::VisitLoadStoreUnscaledOffset(const Instruction* instr) {
     case LDUR_s: mnemonic = "ldur"; form = form_s; break;
     case LDUR_d: mnemonic = "ldur"; form = form_d; break;
     case LDUR_q: mnemonic = "ldur"; form = form_q; break;
-    case LDURSB_x: form = form_x;  // Fall through.
+    case LDURSB_x: form = form_x; VIXL_FALLTHROUGH();
     case LDURSB_w: mnemonic = "ldursb"; break;
-    case LDURSH_x: form = form_x;  // Fall through.
+    case LDURSH_x: form = form_x; VIXL_FALLTHROUGH();
     case LDURSH_w: mnemonic = "ldursh"; break;
     case LDURSW_x: mnemonic = "ldursw"; form = form_x; break;
     case PRFUM: mnemonic = "prfum"; form = form_prefetch; break;

@@ -1054,9 +1054,13 @@ void Disassembler::VisitFPCompare(const Instruction* instr) {
 
   switch (instr->Mask(FPCompareMask)) {
     case FCMP_s_zero:
-    case FCMP_d_zero: form = form_zero;  // Fall through.
+    case FCMP_d_zero: form = form_zero; VIXL_FALLTHROUGH();
     case FCMP_s:
     case FCMP_d: mnemonic = "fcmp"; break;
+    case FCMPE_s_zero:
+    case FCMPE_d_zero: form = form_zero; VIXL_FALLTHROUGH();
+    case FCMPE_s:
+    case FCMPE_d: mnemonic = "fcmpe"; break;
     default: form = "(FPCompare)";
   }
   Format(instr, mnemonic, form);

@@ -2884,8 +2888,8 @@ int Disassembler::SubstituteRegisterField(const Instruction* instr,
     field_len = 3;
   }
 
-  CPURegister::RegisterType reg_type;
-  unsigned reg_size;
+  CPURegister::RegisterType reg_type = CPURegister::kRegister;
+  unsigned reg_size = kXRegSize;
 
   if (reg_prefix == 'R') {
     reg_prefix = instr->SixtyFourBits() ? 'X' : 'W';

@@ -2913,8 +2917,6 @@ int Disassembler::SubstituteRegisterField(const Instruction* instr,
       return field_len;
     default:
       VIXL_UNREACHABLE();
-      reg_type = CPURegister::kRegister;
-      reg_size = kXRegSize;
   }
 
   if ((reg_type == CPURegister::kRegister) &&

@@ -3087,6 +3089,7 @@ int Disassembler::SubstituteImmediateField(const Instruction* instr,
           return 0;
         }
       }
+      VIXL_FALLTHROUGH();
     }
     case 'L': {  // IVLSLane[0123] - suffix indicates access size shift.
       AppendToOutput("%d", instr->NEONLSIndex(format[8] - '0'));

@@ -3236,7 +3239,8 @@ int Disassembler::SubstituteShiftField(const Instruction* instr,
   switch (format[1]) {
     case 'D': {  // HDP.
       VIXL_ASSERT(instr->ShiftDP() != ROR);
-    }  // Fall through.
+      VIXL_FALLTHROUGH();
+    }
     case 'L': {  // HLo.
       if (instr->ImmDPShift() != 0) {
         const char* shift_type[] = {"lsl", "lsr", "asr", "ror"};
@@ -27,11 +27,11 @@
 #ifndef VIXL_A64_DISASM_A64_H
 #define VIXL_A64_DISASM_A64_H
 
-#include "globals.h"
-#include "utils.h"
-#include "instructions-a64.h"
-#include "decoder-a64.h"
-#include "assembler-a64.h"
+#include "vixl/globals.h"
+#include "vixl/utils.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/a64/decoder-a64.h"
+#include "vixl/a64/assembler-a64.h"
 
 namespace vixl {
 
@@ -24,8 +24,8 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/instructions-a64.h"
-#include "a64/assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/a64/assembler-a64.h"
 
 namespace vixl {
 
@@ -27,9 +27,9 @@
 #ifndef VIXL_A64_INSTRUCTIONS_A64_H_
 #define VIXL_A64_INSTRUCTIONS_A64_H_
 
-#include "globals.h"
-#include "utils.h"
-#include "a64/constants-a64.h"
+#include "vixl/globals.h"
+#include "vixl/utils.h"
+#include "vixl/a64/constants-a64.h"
 
 namespace vixl {
 // ISA constants. --------------------------------------------------------------
@@ -24,7 +24,7 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/instrument-a64.h"
+#include "vixl/a64/instrument-a64.h"
 
 namespace vixl {
 

@@ -421,22 +421,26 @@ void Instrument::InstrumentLoadStore(const Instruction* instr) {
   static Counter* store_fp_counter = GetCounter("Store FP");
 
   switch (instr->Mask(LoadStoreMask)) {
-    case STRB_w:   // Fall through.
-    case STRH_w:   // Fall through.
-    case STR_w:    // Fall through.
+    case STRB_w:
+    case STRH_w:
+    case STR_w:
+      VIXL_FALLTHROUGH();
     case STR_x:    store_int_counter->Increment(); break;
-    case STR_s:    // Fall through.
+    case STR_s:
+      VIXL_FALLTHROUGH();
     case STR_d:    store_fp_counter->Increment(); break;
-    case LDRB_w:   // Fall through.
-    case LDRH_w:   // Fall through.
-    case LDR_w:    // Fall through.
-    case LDR_x:    // Fall through.
-    case LDRSB_x:  // Fall through.
-    case LDRSH_x:  // Fall through.
-    case LDRSW_x:  // Fall through.
-    case LDRSB_w:  // Fall through.
+    case LDRB_w:
+    case LDRH_w:
+    case LDR_w:
+    case LDR_x:
+    case LDRSB_x:
+    case LDRSH_x:
+    case LDRSW_x:
+    case LDRSB_w:
+      VIXL_FALLTHROUGH();
     case LDRSH_w:  load_int_counter->Increment(); break;
-    case LDR_s:    // Fall through.
+    case LDR_s:
+      VIXL_FALLTHROUGH();
     case LDR_d:    load_fp_counter->Increment(); break;
   }
 }
@@ -27,11 +27,11 @@
 #ifndef VIXL_A64_INSTRUMENT_A64_H_
 #define VIXL_A64_INSTRUMENT_A64_H_
 
-#include "globals.h"
-#include "utils.h"
-#include "a64/decoder-a64.h"
-#include "a64/constants-a64.h"
-#include "a64/instrument-a64.h"
+#include "vixl/globals.h"
+#include "vixl/utils.h"
+#include "vixl/a64/decoder-a64.h"
+#include "vixl/a64/constants-a64.h"
+#include "vixl/a64/instrument-a64.h"
 
 namespace vixl {
 
@ -24,9 +24,365 @@
|
||||
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "a64/simulator-a64.h"
|
||||
#include <cmath>
|
||||
#include "vixl/a64/simulator-a64.h"
|
||||
|
||||
namespace vixl {
|
||||
|
||||
template<> double Simulator::FPDefaultNaN<double>() {
|
||||
return kFP64DefaultNaN;
|
||||
}
|
||||
|
||||
|
||||
template<> float Simulator::FPDefaultNaN<float>() {
|
||||
return kFP32DefaultNaN;
|
||||
}
|
||||
|
||||
// See FPRound for a description of this function.
|
||||
static inline double FPRoundToDouble(int64_t sign, int64_t exponent,
|
||||
uint64_t mantissa, FPRounding round_mode) {
|
||||
int64_t bits =
|
||||
FPRound<int64_t, kDoubleExponentBits, kDoubleMantissaBits>(sign,
|
||||
exponent,
|
||||
mantissa,
|
||||
round_mode);
|
||||
return rawbits_to_double(bits);
|
||||
}
|
||||
|
||||
|
||||
// See FPRound for a description of this function.
|
||||
static inline float FPRoundToFloat(int64_t sign, int64_t exponent,
|
||||
uint64_t mantissa, FPRounding round_mode) {
|
||||
int32_t bits =
|
||||
FPRound<int32_t, kFloatExponentBits, kFloatMantissaBits>(sign,
|
||||
exponent,
|
||||
mantissa,
|
||||
round_mode);
|
||||
return rawbits_to_float(bits);
|
||||
}
|
||||
|
||||
|
||||
// See FPRound for a description of this function.
|
||||
static inline float16 FPRoundToFloat16(int64_t sign,
|
||||
int64_t exponent,
|
||||
uint64_t mantissa,
|
||||
FPRounding round_mode) {
|
||||
return FPRound<float16, kFloat16ExponentBits, kFloat16MantissaBits>(
|
||||
sign, exponent, mantissa, round_mode);
|
||||
}
|
||||
|
||||
|
||||
double Simulator::FixedToDouble(int64_t src, int fbits, FPRounding round) {
|
||||
if (src >= 0) {
|
||||
return UFixedToDouble(src, fbits, round);
|
||||
} else {
|
||||
// This works for all negative values, including INT64_MIN.
|
||||
return -UFixedToDouble(-src, fbits, round);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
double Simulator::UFixedToDouble(uint64_t src, int fbits, FPRounding round) {
|
||||
// An input of 0 is a special case because the result is effectively
|
||||
// subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
|
||||
if (src == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
// Calculate the exponent. The highest significant bit will have the value
|
||||
// 2^exponent.
|
||||
const int highest_significant_bit = 63 - CountLeadingZeros(src);
|
||||
const int64_t exponent = highest_significant_bit - fbits;
|
||||
|
||||
return FPRoundToDouble(0, exponent, src, round);
|
||||
}
|
||||
|
||||
|
||||
float Simulator::FixedToFloat(int64_t src, int fbits, FPRounding round) {
|
||||
if (src >= 0) {
|
||||
return UFixedToFloat(src, fbits, round);
|
||||
} else {
|
||||
// This works for all negative values, including INT64_MIN.
|
||||
return -UFixedToFloat(-src, fbits, round);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
float Simulator::UFixedToFloat(uint64_t src, int fbits, FPRounding round) {
|
||||
// An input of 0 is a special case because the result is effectively
|
||||
// subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
|
||||
if (src == 0) {
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
// Calculate the exponent. The highest significant bit will have the value
|
||||
// 2^exponent.
|
||||
const int highest_significant_bit = 63 - CountLeadingZeros(src);
|
||||
const int32_t exponent = highest_significant_bit - fbits;
|
||||
|
||||
return FPRoundToFloat(0, exponent, src, round);
|
||||
}
|
||||
|
||||
|
||||
double Simulator::FPToDouble(float value) {
|
||||
switch (std::fpclassify(value)) {
|
||||
case FP_NAN: {
|
||||
if (IsSignallingNaN(value)) {
|
||||
FPProcessException();
|
||||
}
|
||||
if (DN()) return kFP64DefaultNaN;
|
||||
|
||||
// Convert NaNs as the processor would:
|
||||
// - The sign is propagated.
|
||||
// - The payload (mantissa) is transferred entirely, except that the top
|
||||
// bit is forced to '1', making the result a quiet NaN. The unused
|
||||
// (low-order) payload bits are set to 0.
|
||||
uint32_t raw = float_to_rawbits(value);
|
||||
|
||||
uint64_t sign = raw >> 31;
|
||||
uint64_t exponent = (1 << 11) - 1;
|
||||
uint64_t payload = unsigned_bitextract_64(21, 0, raw);
|
||||
payload <<= (52 - 23); // The unused low-order bits should be 0.
|
||||
payload |= (UINT64_C(1) << 51); // Force a quiet NaN.
|
||||
|
||||
return rawbits_to_double((sign << 63) | (exponent << 52) | payload);
|
||||
}
|
||||
|
||||
case FP_ZERO:
|
||||
case FP_NORMAL:
|
||||
case FP_SUBNORMAL:
|
||||
case FP_INFINITE: {
|
||||
// All other inputs are preserved in a standard cast, because every value
|
||||
// representable using an IEEE-754 float is also representable using an
|
||||
// IEEE-754 double.
|
||||
return static_cast<double>(value);
|
||||
}
|
||||
}
|
||||
|
||||
VIXL_UNREACHABLE();
|
||||
return static_cast<double>(value);
|
||||
}
|
||||
|
||||
|
||||
float Simulator::FPToFloat(float16 value) {
|
||||
uint32_t sign = value >> 15;
|
||||
uint32_t exponent = unsigned_bitextract_32(
|
||||
kFloat16MantissaBits + kFloat16ExponentBits - 1, kFloat16MantissaBits,
|
||||
value);
|
||||
uint32_t mantissa = unsigned_bitextract_32(
|
||||
kFloat16MantissaBits - 1, 0, value);
|
||||
|
||||
switch (float16classify(value)) {
|
||||
case FP_ZERO:
|
||||
return (sign == 0) ? 0.0f : -0.0f;
|
||||
|
||||
case FP_INFINITE:
|
||||
return (sign == 0) ? kFP32PositiveInfinity : kFP32NegativeInfinity;
|
||||
|
||||
case FP_SUBNORMAL: {
|
||||
// Calculate shift required to put mantissa into the most-significant bits
|
||||
// of the destination mantissa.
|
||||
int shift = CountLeadingZeros(mantissa << (32 - 10));
|
||||
|
||||
// Shift mantissa and discard implicit '1'.
|
||||
mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits) + shift + 1;
|
||||
mantissa &= (1 << kFloatMantissaBits) - 1;
|
||||
|
||||
// Adjust the exponent for the shift applied, and rebias.
|
||||
exponent = exponent - shift + (-15 + 127);
|
||||
break;
|
||||
}
|
||||
|
||||
case FP_NAN:
|
||||
if (IsSignallingNaN(value)) {
|
||||
FPProcessException();
|
||||
}
|
||||
if (DN()) return kFP32DefaultNaN;
|
||||
|
||||
// Convert NaNs as the processor would:
|
||||
// - The sign is propagated.
|
||||
// - The payload (mantissa) is transferred entirely, except that the top
|
||||
// bit is forced to '1', making the result a quiet NaN. The unused
|
||||
// (low-order) payload bits are set to 0.
|
||||
exponent = (1 << kFloatExponentBits) - 1;
|
||||
|
||||
// Increase bits in mantissa, making low-order bits 0.
|
||||
mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
|
||||
mantissa |= 1 << 22; // Force a quiet NaN.
|
||||
break;
|
||||
|
||||
case FP_NORMAL:
|
||||
// Increase bits in mantissa, making low-order bits 0.
|
||||
mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
|
||||
|
||||
// Change exponent bias.
|
||||
exponent += (-15 + 127);
|
||||
break;
|
||||
|
||||
default: VIXL_UNREACHABLE();
|
||||
}
|
||||
return rawbits_to_float((sign << 31) |
|
||||
(exponent << kFloatMantissaBits) |
|
||||
mantissa);
|
||||
}
|
||||
|
||||
|
||||
float16 Simulator::FPToFloat16(float value, FPRounding round_mode) {
|
||||
// Only the FPTieEven rounding mode is implemented.
|
||||
VIXL_ASSERT(round_mode == FPTieEven);
|
||||
USE(round_mode);
|
||||
|
||||
uint32_t raw = float_to_rawbits(value);
|
||||
int32_t sign = raw >> 31;
|
||||
int32_t exponent = unsigned_bitextract_32(30, 23, raw) - 127;
|
||||
uint32_t mantissa = unsigned_bitextract_32(22, 0, raw);
|
||||
|
||||
switch (std::fpclassify(value)) {
|
||||
case FP_NAN: {
|
||||
if (IsSignallingNaN(value)) {
|
||||
FPProcessException();
|
||||
}
|
||||
if (DN()) return kFP16DefaultNaN;
|
||||
|
||||
// Convert NaNs as the processor would:
|
||||
// - The sign is propagated.
|
||||
// - The payload (mantissa) is transferred as much as possible, except
|
||||
// that the top bit is forced to '1', making the result a quiet NaN.
|
||||
float16 result = (sign == 0) ? kFP16PositiveInfinity
|
||||
: kFP16NegativeInfinity;
|
||||
result |= mantissa >> (kFloatMantissaBits - kFloat16MantissaBits);
|
||||
result |= (1 << 9); // Force a quiet NaN;
|
||||
return result;
|
||||
}
|
||||
|
||||
case FP_ZERO:
|
||||
return (sign == 0) ? 0 : 0x8000;
|
||||
|
||||
case FP_INFINITE:
|
||||
return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
|
||||
|
||||
case FP_NORMAL:
|
||||
case FP_SUBNORMAL: {
|
||||
// Convert float-to-half as the processor would, assuming that FPCR.FZ
|
||||
// (flush-to-zero) is not set.
|
||||
|
||||
// Add the implicit '1' bit to the mantissa.
|
||||
mantissa += (1 << 23);
|
||||
return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
|
||||
}
|
||||
}
|
||||
|
||||
VIXL_UNREACHABLE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
float16 Simulator::FPToFloat16(double value, FPRounding round_mode) {
|
||||
// Only the FPTieEven rounding mode is implemented.
|
||||
VIXL_ASSERT(round_mode == FPTieEven);
|
||||
USE(round_mode);
|
||||
|
||||
uint64_t raw = double_to_rawbits(value);
|
||||
int32_t sign = raw >> 63;
|
||||
int64_t exponent = unsigned_bitextract_64(62, 52, raw) - 1023;
|
||||
uint64_t mantissa = unsigned_bitextract_64(51, 0, raw);
|
||||
|
||||
switch (std::fpclassify(value)) {
|
||||
case FP_NAN: {
|
||||
if (IsSignallingNaN(value)) {
|
||||
FPProcessException();
|
||||
}
|
||||
if (DN()) return kFP16DefaultNaN;
|
||||
|
||||
// Convert NaNs as the processor would:
|
||||
// - The sign is propagated.
|
||||
// - The payload (mantissa) is transferred as much as possible, except
|
||||
// that the top bit is forced to '1', making the result a quiet NaN.
|
||||
float16 result = (sign == 0) ? kFP16PositiveInfinity
|
||||
: kFP16NegativeInfinity;
|
||||
result |= mantissa >> (kDoubleMantissaBits - kFloat16MantissaBits);
|
||||
result |= (1 << 9); // Force a quiet NaN;
|
||||
return result;
|
||||
}
|
||||
|
||||
case FP_ZERO:
|
||||
return (sign == 0) ? 0 : 0x8000;
|
||||
|
||||
case FP_INFINITE:
|
||||
return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
|
||||
|
||||
case FP_NORMAL:
|
||||
case FP_SUBNORMAL: {
|
||||
// Convert double-to-half as the processor would, assuming that FPCR.FZ
|
||||
// (flush-to-zero) is not set.
|
||||
|
||||
// Add the implicit '1' bit to the mantissa.
|
||||
mantissa += (UINT64_C(1) << 52);
|
||||
return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
|
||||
}
|
||||
}
|
||||
|
||||
VIXL_UNREACHABLE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
float Simulator::FPToFloat(double value, FPRounding round_mode) {
|
||||
// Only the FPTieEven rounding mode is implemented.
|
||||
VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
|
||||
USE(round_mode);
|
||||
|
||||
switch (std::fpclassify(value)) {
|
||||
case FP_NAN: {
|
||||
if (IsSignallingNaN(value)) {
|
||||
FPProcessException();
|
||||
}
|
||||
if (DN()) return kFP32DefaultNaN;
|
||||
|
||||
// Convert NaNs as the processor would:
|
||||
// - The sign is propagated.
|
||||
// - The payload (mantissa) is transferred as much as possible, except
|
||||
// that the top bit is forced to '1', making the result a quiet NaN.
|
||||
uint64_t raw = double_to_rawbits(value);
|
||||
|
||||
uint32_t sign = raw >> 63;
|
||||
uint32_t exponent = (1 << 8) - 1;
|
||||
uint32_t payload = unsigned_bitextract_64(50, 52 - 23, raw);
|
||||
payload |= (1 << 22); // Force a quiet NaN.
|
||||
|
||||
return rawbits_to_float((sign << 31) | (exponent << 23) | payload);
|
||||
}
|
||||
|
||||
case FP_ZERO:
|
||||
case FP_INFINITE: {
|
||||
// In a C++ cast, any value representable in the target type will be
|
||||
// unchanged. This is always the case for +/-0.0 and infinities.
|
||||
return static_cast<float>(value);
|
||||
}
|
||||
|
||||
case FP_NORMAL:
|
||||
case FP_SUBNORMAL: {
|
||||
// Convert double-to-float as the processor would, assuming that FPCR.FZ
|
||||
// (flush-to-zero) is not set.
|
||||
uint64_t raw = double_to_rawbits(value);
|
||||
// Extract the IEEE-754 double components.
|
||||
uint32_t sign = raw >> 63;
|
||||
// Extract the exponent and remove the IEEE-754 encoding bias.
|
||||
int32_t exponent = unsigned_bitextract_64(62, 52, raw) - 1023;
|
||||
// Extract the mantissa and add the implicit '1' bit.
|
||||
uint64_t mantissa = unsigned_bitextract_64(51, 0, raw);
|
||||
if (std::fpclassify(value) == FP_NORMAL) {
|
||||
mantissa |= (UINT64_C(1) << 52);
|
||||
}
|
||||
return FPRoundToFloat(sign, exponent, mantissa, round_mode);
|
||||
}
|
||||
}
|
||||
|
||||
VIXL_UNREACHABLE();
|
||||
return value;
|
||||
}
|
||||
|
||||
|
||||
void Simulator::ld1(VectorFormat vform,
|
||||
LogicVRegister dst,
|
||||
uint64_t addr) {
|
||||
@ -1524,7 +1880,7 @@ LogicVRegister Simulator::sshl(VectorFormat vform,
|
||||
int64_t lj_src_val = src1.IntLeftJustified(vform, i);
|
||||
|
||||
// Set signed saturation state.
|
||||
if ((shift_val > CountLeadingSignBits(lj_src_val, 64)) &&
|
||||
if ((shift_val > CountLeadingSignBits(lj_src_val)) &&
|
||||
(lj_src_val != 0)) {
|
||||
dst.SetSignedSat(i, lj_src_val >= 0);
|
||||
}
|
||||
@ -1532,7 +1888,7 @@ LogicVRegister Simulator::sshl(VectorFormat vform,
|
||||
// Set unsigned saturation state.
|
||||
if (lj_src_val < 0) {
|
||||
dst.SetUnsignedSat(i, false);
|
||||
} else if ((shift_val > CountLeadingZeros(lj_src_val, 64)) &&
|
||||
} else if ((shift_val > CountLeadingZeros(lj_src_val)) &&
|
||||
(lj_src_val != 0)) {
|
||||
dst.SetUnsignedSat(i, true);
|
||||
}
|
||||
@ -1570,7 +1926,7 @@ LogicVRegister Simulator::ushl(VectorFormat vform,
|
||||
uint64_t lj_src_val = src1.UintLeftJustified(vform, i);
|
||||
|
||||
// Set saturation state.
|
||||
if ((shift_val > CountLeadingZeros(lj_src_val, 64)) && (lj_src_val != 0)) {
|
||||
if ((shift_val > CountLeadingZeros(lj_src_val)) && (lj_src_val != 0)) {
|
||||
dst.SetUnsignedSat(i, true);
|
||||
}
|
||||
|
||||
@ -3153,9 +3509,9 @@ LogicVRegister Simulator::uzp2(VectorFormat vform,
|
||||
template <typename T>
|
||||
T Simulator::FPAdd(T op1, T op2) {
|
||||
T result = FPProcessNaNs(op1, op2);
|
||||
if (isnan(result)) return result;
|
||||
if (std::isnan(result)) return result;
|
||||
|
||||
if (isinf(op1) && isinf(op2) && (op1 != op2)) {
|
||||
if (std::isinf(op1) && std::isinf(op2) && (op1 != op2)) {
|
||||
// inf + -inf returns the default NaN.
|
||||
FPProcessException();
|
||||
return FPDefaultNaN<T>();
|
||||
@ -3169,9 +3525,9 @@ T Simulator::FPAdd(T op1, T op2) {
|
||||
template <typename T>
|
||||
T Simulator::FPSub(T op1, T op2) {
|
||||
// NaNs should be handled elsewhere.
|
||||
VIXL_ASSERT(!isnan(op1) && !isnan(op2));
|
||||
VIXL_ASSERT(!std::isnan(op1) && !std::isnan(op2));
|
||||
|
||||
if (isinf(op1) && isinf(op2) && (op1 == op2)) {
|
||||
if (std::isinf(op1) && std::isinf(op2) && (op1 == op2)) {
|
||||
// inf - inf returns the default NaN.
|
||||
FPProcessException();
|
||||
return FPDefaultNaN<T>();
|
||||
@ -3185,9 +3541,9 @@ T Simulator::FPSub(T op1, T op2) {
|
||||
template <typename T>
|
||||
T Simulator::FPMul(T op1, T op2) {
|
||||
// NaNs should be handled elsewhere.
|
||||
VIXL_ASSERT(!isnan(op1) && !isnan(op2));
|
||||
VIXL_ASSERT(!std::isnan(op1) && !std::isnan(op2));
|
||||
|
||||
if ((isinf(op1) && (op2 == 0.0)) || (isinf(op2) && (op1 == 0.0))) {
|
||||
if ((std::isinf(op1) && (op2 == 0.0)) || (std::isinf(op2) && (op1 == 0.0))) {
|
||||
// inf * 0.0 returns the default NaN.
|
||||
FPProcessException();
|
||||
return FPDefaultNaN<T>();
|
||||
@ -3200,7 +3556,7 @@ T Simulator::FPMul(T op1, T op2) {
|
||||
|
||||
template<typename T>
|
||||
T Simulator::FPMulx(T op1, T op2) {
|
||||
if ((isinf(op1) && (op2 == 0.0)) || (isinf(op2) && (op1 == 0.0))) {
|
||||
if ((std::isinf(op1) && (op2 == 0.0)) || (std::isinf(op2) && (op1 == 0.0))) {
|
||||
// inf * 0.0 returns +/-2.0.
|
||||
T two = 2.0;
|
||||
return copysign(1.0, op1) * copysign(1.0, op2) * two;
|
||||
@ -3215,13 +3571,13 @@ T Simulator::FPMulAdd(T a, T op1, T op2) {
|
||||
|
||||
T sign_a = copysign(1.0, a);
|
||||
T sign_prod = copysign(1.0, op1) * copysign(1.0, op2);
|
||||
bool isinf_prod = isinf(op1) || isinf(op2);
|
||||
bool isinf_prod = std::isinf(op1) || std::isinf(op2);
|
||||
bool operation_generates_nan =
|
||||
(isinf(op1) && (op2 == 0.0)) || // inf * 0.0
|
||||
(isinf(op2) && (op1 == 0.0)) || // 0.0 * inf
|
||||
(isinf(a) && isinf_prod && (sign_a != sign_prod)); // inf - inf
|
||||
(std::isinf(op1) && (op2 == 0.0)) || // inf * 0.0
|
||||
(std::isinf(op2) && (op1 == 0.0)) || // 0.0 * inf
|
||||
(std::isinf(a) && isinf_prod && (sign_a != sign_prod)); // inf - inf
|
||||
|
||||
if (isnan(result)) {
|
||||
if (std::isnan(result)) {
|
||||
// Generated NaNs override quiet NaNs propagated from a.
|
||||
if (operation_generates_nan && IsQuietNaN(a)) {
|
||||
FPProcessException();
|
||||
@ -3244,7 +3600,7 @@ T Simulator::FPMulAdd(T a, T op1, T op2) {
|
||||
}
|
||||
|
||||
result = FusedMultiplyAdd(op1, op2, a);
|
||||
VIXL_ASSERT(!isnan(result));
|
||||
VIXL_ASSERT(!std::isnan(result));
|
||||
|
||||
// Work around broken fma implementations for rounded zero results: If a is
|
||||
// 0.0, the sign of the result is the sign of op1 * op2 before rounding.
|
||||
@ -3259,9 +3615,9 @@ T Simulator::FPMulAdd(T a, T op1, T op2) {
|
||||
template <typename T>
|
||||
T Simulator::FPDiv(T op1, T op2) {
|
||||
// NaNs should be handled elsewhere.
|
||||
VIXL_ASSERT(!isnan(op1) && !isnan(op2));
|
||||
VIXL_ASSERT(!std::isnan(op1) && !std::isnan(op2));
|
||||
|
||||
if ((isinf(op1) && isinf(op2)) || ((op1 == 0.0) && (op2 == 0.0))) {
|
||||
if ((std::isinf(op1) && std::isinf(op2)) || ((op1 == 0.0) && (op2 == 0.0))) {
|
||||
// inf / inf and 0.0 / 0.0 return the default NaN.
|
||||
FPProcessException();
|
||||
return FPDefaultNaN<T>();
|
||||
@ -3276,7 +3632,7 @@ T Simulator::FPDiv(T op1, T op2) {
|
||||
|
||||
template <typename T>
|
||||
T Simulator::FPSqrt(T op) {
|
||||
if (isnan(op)) {
|
||||
if (std::isnan(op)) {
|
||||
return FPProcessNaN(op);
|
||||
} else if (op < 0.0) {
|
||||
FPProcessException();
|
||||
@ -3290,7 +3646,7 @@ T Simulator::FPSqrt(T op) {
|
||||
template <typename T>
|
||||
T Simulator::FPMax(T a, T b) {
|
||||
T result = FPProcessNaNs(a, b);
|
||||
if (isnan(result)) return result;
|
||||
if (std::isnan(result)) return result;
|
||||
|
||||
if ((a == 0.0) && (b == 0.0) &&
|
||||
(copysign(1.0, a) != copysign(1.0, b))) {
|
||||
@ -3311,14 +3667,14 @@ T Simulator::FPMaxNM(T a, T b) {
|
||||
}
|
||||
|
||||
T result = FPProcessNaNs(a, b);
|
||||
return isnan(result) ? result : FPMax(a, b);
|
||||
return std::isnan(result) ? result : FPMax(a, b);
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
T Simulator::FPMin(T a, T b) {
|
||||
T result = FPProcessNaNs(a, b);
|
||||
if (isnan(result)) return result;
|
||||
if (std::isnan(result)) return result;
|
||||
|
||||
if ((a == 0.0) && (b == 0.0) &&
|
||||
(copysign(1.0, a) != copysign(1.0, b))) {
|
||||
@ -3339,16 +3695,17 @@ T Simulator::FPMinNM(T a, T b) {
|
||||
}
|
||||
|
||||
T result = FPProcessNaNs(a, b);
|
||||
return isnan(result) ? result : FPMin(a, b);
|
||||
return std::isnan(result) ? result : FPMin(a, b);
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
T Simulator::FPRecipStepFused(T op1, T op2) {
|
||||
const T two = 2.0;
|
||||
if ((isinf(op1) && (op2 == 0.0)) || ((op1 == 0.0) && (isinf(op2)))) {
|
||||
if ((std::isinf(op1) && (op2 == 0.0))
|
||||
|| ((op1 == 0.0) && (std::isinf(op2)))) {
|
||||
return two;
|
||||
} else if (isinf(op1) || isinf(op2)) {
|
||||
} else if (std::isinf(op1) || std::isinf(op2)) {
|
||||
// Return +inf if signs match, otherwise -inf.
|
||||
return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity
|
||||
: kFP64NegativeInfinity;
|
||||
@ -3363,9 +3720,10 @@ T Simulator::FPRSqrtStepFused(T op1, T op2) {
|
||||
const T one_point_five = 1.5;
|
||||
const T two = 2.0;
|
||||
|
||||
if ((isinf(op1) && (op2 == 0.0)) || ((op1 == 0.0) && (isinf(op2)))) {
|
||||
if ((std::isinf(op1) && (op2 == 0.0))
|
||||
|| ((op1 == 0.0) && (std::isinf(op2)))) {
|
||||
return one_point_five;
|
||||
} else if (isinf(op1) || isinf(op2)) {
|
||||
} else if (std::isinf(op1) || std::isinf(op2)) {
|
||||
// Return +inf if signs match, otherwise -inf.
|
||||
return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity
|
||||
: kFP64NegativeInfinity;
|
||||
@ -3373,9 +3731,9 @@ T Simulator::FPRSqrtStepFused(T op1, T op2) {
|
||||
// The multiply-add-halve operation must be fully fused, so avoid interim
|
||||
// rounding by checking which operand can be losslessly divided by two
|
||||
// before doing the multiply-add.
|
||||
if (isnormal(op1 / two)) {
|
||||
if (std::isnormal(op1 / two)) {
|
||||
return FusedMultiplyAdd(op1 / two, op2, one_point_five);
|
||||
} else if (isnormal(op2 / two)) {
|
||||
} else if (std::isnormal(op2 / two)) {
|
||||
return FusedMultiplyAdd(op1, op2 / two, one_point_five);
|
||||
} else {
|
||||
// Neither operand is normal after halving: the result is dominated by
|
||||
@ -3390,11 +3748,11 @@ double Simulator::FPRoundInt(double value, FPRounding round_mode) {
if ((value == 0.0) || (value == kFP64PositiveInfinity) ||
(value == kFP64NegativeInfinity)) {
return value;
} else if (isnan(value)) {
} else if (std::isnan(value)) {
return FPProcessNaN(value);
}

double int_result = floor(value);
double int_result = std::floor(value);
double error = value - int_result;
switch (round_mode) {
case FPTieAway: {
@ -3419,7 +3777,7 @@ double Simulator::FPRoundInt(double value, FPRounding round_mode) {
// If the error is greater than 0.5, or is equal to 0.5 and the integer
// result is odd, round up.
} else if ((error > 0.5) ||
((error == 0.5) && (fmod(int_result, 2) != 0))) {
((error == 0.5) && (std::fmod(int_result, 2) != 0))) {
int_result++;
}
break;
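The ties-to-even test above is easiest to see with concrete inputs. A small standalone sketch of that branch (the explicit form mirrors the simulator's logic; std::nearbyint under FE_TONEAREST would agree):

#include <cmath>

// Ties-to-even, expressed exactly as in the error test above.
static double RoundTieEvenSketch(double value) {
  double int_result = std::floor(value);
  double error = value - int_result;
  if ((error > 0.5) ||
      ((error == 0.5) && (std::fmod(int_result, 2) != 0))) {
    int_result++;
  }
  return int_result;
}
// RoundTieEvenSketch(2.5) == 2.0, RoundTieEvenSketch(3.5) == 4.0,
// RoundTieEvenSketch(2.6) == 3.0.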
@ -3461,7 +3819,7 @@ int32_t Simulator::FPToInt32(double value, FPRounding rmode) {
} else if (value < kWMinInt) {
return kWMinInt;
}
return isnan(value) ? 0 : static_cast<int32_t>(value);
return std::isnan(value) ? 0 : static_cast<int32_t>(value);
}


@ -3472,7 +3830,7 @@ int64_t Simulator::FPToInt64(double value, FPRounding rmode) {
} else if (value < kXMinInt) {
return kXMinInt;
}
return isnan(value) ? 0 : static_cast<int64_t>(value);
return std::isnan(value) ? 0 : static_cast<int64_t>(value);
}


@ -3483,7 +3841,7 @@ uint32_t Simulator::FPToUInt32(double value, FPRounding rmode) {
} else if (value < 0.0) {
return 0;
}
return isnan(value) ? 0 : static_cast<uint32_t>(value);
return std::isnan(value) ? 0 : static_cast<uint32_t>(value);
}


@ -3494,7 +3852,7 @@ uint64_t Simulator::FPToUInt64(double value, FPRounding rmode) {
} else if (value < 0.0) {
return 0;
}
return isnan(value) ? 0 : static_cast<uint64_t>(value);
return std::isnan(value) ? 0 : static_cast<uint64_t>(value);
}
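All four FPTo*Int* helpers share one shape: saturate out-of-range inputs, map NaN to zero, and otherwise truncate, the caller having already applied rmode via FPRoundInt. A self-contained sketch of the 32-bit signed case, assuming kWMaxInt/kWMinInt stand for INT32_MAX/INT32_MIN:

#include <cmath>
#include <cstdint>

// Sketch of the saturating double -> int32_t pattern above; `value` is
// assumed to have been rounded to an integral value already.
static int32_t FPToInt32Sketch(double value) {
  if (value >= 2147483647.0) {          // kWMaxInt.
    return INT32_MAX;
  } else if (value < -2147483648.0) {   // kWMinInt.
    return INT32_MIN;
  }
  return std::isnan(value) ? 0 : static_cast<int32_t>(value);
}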
@ -3511,7 +3869,7 @@ LogicVRegister Simulator::FN(VectorFormat vform, \
T result; \
if (PROCNAN) { \
result = FPProcessNaNs(op1, op2); \
if (!isnan(result)) { \
if (!std::isnan(result)) { \
result = OP(op1, op2); \
} \
} else { \
@ -3558,7 +3916,7 @@ LogicVRegister Simulator::frecps(VectorFormat vform,
T op1 = -src1.Float<T>(i);
T op2 = src2.Float<T>(i);
T result = FPProcessNaNs(op1, op2);
dst.SetFloat(i, isnan(result) ? result : FPRecipStepFused(op1, op2));
dst.SetFloat(i, std::isnan(result) ? result : FPRecipStepFused(op1, op2));
}
return dst;
}
@ -3588,7 +3946,7 @@ LogicVRegister Simulator::frsqrts(VectorFormat vform,
T op1 = -src1.Float<T>(i);
T op2 = src2.Float<T>(i);
T result = FPProcessNaNs(op1, op2);
dst.SetFloat(i, isnan(result) ? result : FPRSqrtStepFused(op1, op2));
dst.SetFloat(i, std::isnan(result) ? result : FPRSqrtStepFused(op1, op2));
}
return dst;
}
@ -3620,7 +3978,7 @@ LogicVRegister Simulator::fcmp(VectorFormat vform,
T op1 = src1.Float<T>(i);
T op2 = src2.Float<T>(i);
T nan_result = FPProcessNaNs(op1, op2);
if (!isnan(nan_result)) {
if (!std::isnan(nan_result)) {
switch (cond) {
case eq: result = (op1 == op2); break;
case ge: result = (op1 >= op2); break;
@ -4001,7 +4359,7 @@ LogicVRegister Simulator::frint(VectorFormat vform,
for (int i = 0; i < LaneCountFromFormat(vform); i++) {
float input = src.Float<float>(i);
float rounded = FPRoundInt(input, rounding_mode);
if (inexact_exception && !isnan(input) && (input != rounded)) {
if (inexact_exception && !std::isnan(input) && (input != rounded)) {
FPProcessException();
}
dst.SetFloat<float>(i, rounded);
@ -4011,7 +4369,7 @@ LogicVRegister Simulator::frint(VectorFormat vform,
for (int i = 0; i < LaneCountFromFormat(vform); i++) {
double input = src.Float<double>(i);
double rounded = FPRoundInt(input, rounding_mode);
if (inexact_exception && !isnan(input) && (input != rounded)) {
if (inexact_exception && !std::isnan(input) && (input != rounded)) {
FPProcessException();
}
dst.SetFloat<double>(i, rounded);
@ -4029,13 +4387,13 @@ LogicVRegister Simulator::fcvts(VectorFormat vform,
dst.ClearForWrite(vform);
if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
for (int i = 0; i < LaneCountFromFormat(vform); i++) {
float op = src.Float<float>(i) * powf(2.0f, fbits);
float op = src.Float<float>(i) * std::pow(2.0f, fbits);
dst.SetInt(vform, i, FPToInt32(op, rounding_mode));
}
} else {
VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
for (int i = 0; i < LaneCountFromFormat(vform); i++) {
double op = src.Float<double>(i) * pow(2.0, fbits);
double op = src.Float<double>(i) * std::pow(2.0, fbits);
dst.SetInt(vform, i, FPToInt64(op, rounding_mode));
}
}
@ -4051,13 +4409,13 @@ LogicVRegister Simulator::fcvtu(VectorFormat vform,
dst.ClearForWrite(vform);
if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
for (int i = 0; i < LaneCountFromFormat(vform); i++) {
float op = src.Float<float>(i) * powf(2.0f, fbits);
float op = src.Float<float>(i) * std::pow(2.0f, fbits);
dst.SetUint(vform, i, FPToUInt32(op, rounding_mode));
}
} else {
VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
for (int i = 0; i < LaneCountFromFormat(vform); i++) {
double op = src.Float<double>(i) * pow(2.0, fbits);
double op = src.Float<double>(i) * std::pow(2.0, fbits);
dst.SetUint(vform, i, FPToUInt64(op, rounding_mode));
}
}
@ -4182,7 +4540,7 @@ static inline uint64_t Bits(uint64_t val, int start_bit, int end_bit) {

template <typename T>
T Simulator::FPRecipSqrtEstimate(T op) {
if (isnan(op)) {
if (std::isnan(op)) {
return FPProcessNaN(op);
} else if (op == 0.0) {
if (copysign(1.0, op) < 0.0) {
@ -4193,7 +4551,7 @@ T Simulator::FPRecipSqrtEstimate(T op) {
} else if (copysign(1.0, op) < 0.0) {
FPProcessException();
return FPDefaultNaN<T>();
} else if (isinf(op)) {
} else if (std::isinf(op)) {
return 0.0;
} else {
uint64_t fraction;
@ -4271,17 +4629,17 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) {
sign = double_sign(op);
}

if (isnan(op)) {
if (std::isnan(op)) {
return FPProcessNaN(op);
} else if (isinf(op)) {
} else if (std::isinf(op)) {
return (sign == 1) ? -0.0 : 0.0;
} else if (op == 0.0) {
FPProcessException(); // FPExc_DivideByZero exception.
return (sign == 1) ? kFP64NegativeInfinity : kFP64PositiveInfinity;
} else if (((sizeof(T) == sizeof(float)) && // NOLINT(runtime/sizeof)
(fabsf(op) < pow(2.0, -128))) ||
(std::fabs(op) < std::pow(2.0, -128.0))) ||
((sizeof(T) == sizeof(double)) && // NOLINT(runtime/sizeof)
(fabs(op) < pow(2.0, -1024)))) {
(std::fabs(op) < std::pow(2.0, -1024.0)))) {
bool overflow_to_inf = false;
switch (rounding) {
case FPTieEven: overflow_to_inf = true; break;
@ -4338,9 +4696,9 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) {

fraction = double_mantissa(estimate);
if (result_exp == 0) {
fraction = (1L << 51) | Bits(fraction, 51, 1);
fraction = (UINT64_C(1) << 51) | Bits(fraction, 51, 1);
} else if (result_exp == -1) {
fraction = (1L << 50) | Bits(fraction, 51, 2);
fraction = (UINT64_C(1) << 50) | Bits(fraction, 51, 2);
result_exp = 0;
}
if (sizeof(T) == sizeof(float)) { // NOLINT(runtime/sizeof)
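The switch from 1L to UINT64_C(1) above is not cosmetic: where long is only 32 bits wide, 1L << 51 is undefined behaviour, whereas UINT64_C(1) is guaranteed to be a 64-bit constant. VIXL requires an LP64 host, so this is mostly belt-and-braces, but the macro makes the intended width explicit. A standalone illustration of the pattern:

#include <cstdint>

// With a 32-bit 'long', (1L << 51) is undefined behaviour;
// UINT64_C(1) << 51 always produces the intended 64-bit constant.
static uint64_t SetSubnormalTopBitSketch(uint64_t fraction) {
  return (UINT64_C(1) << 51) | (fraction & ((UINT64_C(1) << 51) - 1));
}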
@ -4384,8 +4742,8 @@ LogicVRegister Simulator::ursqrte(VectorFormat vform,
if (operand <= 0x3FFFFFFF) {
result = 0xFFFFFFFF;
} else {
dp_operand = operand * pow(2.0, -32);
dp_result = recip_sqrt_estimate(dp_operand) * pow(2.0, 31);
dp_operand = operand * std::pow(2.0, -32);
dp_result = recip_sqrt_estimate(dp_operand) * std::pow(2.0, 31);
result = static_cast<uint32_t>(dp_result);
}
dst.SetUint(vform, i, result);
@ -4416,8 +4774,8 @@ LogicVRegister Simulator::urecpe(VectorFormat vform,
if (operand <= 0x7FFFFFFF) {
result = 0xFFFFFFFF;
} else {
dp_operand = operand * pow(2.0, -32);
dp_result = recip_estimate(dp_operand) * pow(2.0, 31);
dp_operand = operand * std::pow(2.0, -32);
dp_result = recip_estimate(dp_operand) * std::pow(2.0, 31);
result = static_cast<uint32_t>(dp_result);
}
dst.SetUint(vform, i, result);
@ -4433,7 +4791,7 @@ LogicVRegister Simulator::frecpx(VectorFormat vform,
for (int i = 0; i < LaneCountFromFormat(vform); i++) {
T op = src.Float<T>(i);
T result;
if (isnan(op)) {
if (std::isnan(op)) {
result = FPProcessNaN(op);
} else {
int exp;
@ -24,7 +24,7 @@
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "a64/macro-assembler-a64.h"
#include "vixl/a64/macro-assembler-a64.h"

namespace vixl {

@ -43,8 +43,8 @@ void Pool::SetNextCheckpoint(ptrdiff_t checkpoint) {
}


LiteralPool::LiteralPool(MacroAssembler* masm) :
Pool(masm), size_(0), first_use_(-1) {
LiteralPool::LiteralPool(MacroAssembler* masm)
: Pool(masm), size_(0), first_use_(-1) {
}


@ -718,11 +718,13 @@ void MacroAssembler::LogicalMacro(const Register& rd,
case AND:
Mov(rd, 0);
return;
case ORR: // Fall through.
case ORR:
VIXL_FALLTHROUGH();
case EOR:
Mov(rd, rn);
return;
case ANDS: // Fall through.
case ANDS:
VIXL_FALLTHROUGH();
case BICS:
break;
default:
@ -740,7 +742,8 @@ void MacroAssembler::LogicalMacro(const Register& rd,
case EOR:
Mvn(rd, rn);
return;
case ANDS: // Fall through.
case ANDS:
VIXL_FALLTHROUGH();
case BICS:
break;
default:
@ -1131,13 +1134,14 @@ void MacroAssembler::Csel(const Register& rd,

void MacroAssembler::Add(const Register& rd,
const Register& rn,
const Operand& operand) {
const Operand& operand,
FlagsUpdate S) {
VIXL_ASSERT(allow_macro_instructions_);
if (operand.IsImmediate() && (operand.immediate() < 0) &&
IsImmAddSub(-operand.immediate())) {
AddSubMacro(rd, rn, -operand.immediate(), LeaveFlags, SUB);
AddSubMacro(rd, rn, -operand.immediate(), S, SUB);
} else {
AddSubMacro(rd, rn, operand, LeaveFlags, ADD);
AddSubMacro(rd, rn, operand, S, ADD);
}
}

@ -1145,25 +1149,20 @@ void MacroAssembler::Add(const Register& rd,
void MacroAssembler::Adds(const Register& rd,
const Register& rn,
const Operand& operand) {
VIXL_ASSERT(allow_macro_instructions_);
if (operand.IsImmediate() && (operand.immediate() < 0) &&
IsImmAddSub(-operand.immediate())) {
AddSubMacro(rd, rn, -operand.immediate(), SetFlags, SUB);
} else {
AddSubMacro(rd, rn, operand, SetFlags, ADD);
}
Add(rd, rn, operand, SetFlags);
}


void MacroAssembler::Sub(const Register& rd,
const Register& rn,
const Operand& operand) {
const Operand& operand,
FlagsUpdate S) {
VIXL_ASSERT(allow_macro_instructions_);
if (operand.IsImmediate() && (operand.immediate() < 0) &&
IsImmAddSub(-operand.immediate())) {
AddSubMacro(rd, rn, -operand.immediate(), LeaveFlags, ADD);
AddSubMacro(rd, rn, -operand.immediate(), S, ADD);
} else {
AddSubMacro(rd, rn, operand, LeaveFlags, SUB);
AddSubMacro(rd, rn, operand, S, SUB);
}
}

@ -1171,13 +1170,7 @@ void MacroAssembler::Sub(const Register& rd,
void MacroAssembler::Subs(const Register& rd,
const Register& rn,
const Operand& operand) {
VIXL_ASSERT(allow_macro_instructions_);
if (operand.IsImmediate() && (operand.immediate() < 0) &&
IsImmAddSub(-operand.immediate())) {
AddSubMacro(rd, rn, -operand.immediate(), SetFlags, ADD);
} else {
AddSubMacro(rd, rn, operand, SetFlags, SUB);
}
Sub(rd, rn, operand, SetFlags);
}
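With the FlagsUpdate parameter, Adds/Subs collapse into one-line wrappers, and the negative-immediate rewrite (an add of a negative immediate becomes a subtract of its absolute value, and vice versa, with identical NZCV results) lives in a single place. A hypothetical usage sketch of the resulting behaviour:

//   masm.Add(x0, x1, Operand(-16));   // Emits: sub x0, x1, #16 (flags untouched).
//   masm.Adds(x0, x1, Operand(-16));  // Now Add(x0, x1, Operand(-16), SetFlags):
//                                     // emits subs x0, x1, #16, setting NZCV.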
@ -1193,23 +1186,29 @@ void MacroAssembler::Cmp(const Register& rn, const Operand& operand) {
}


void MacroAssembler::Fcmp(const FPRegister& fn, double value) {
void MacroAssembler::Fcmp(const FPRegister& fn, double value,
FPTrapFlags trap) {
VIXL_ASSERT(allow_macro_instructions_);
// The worst case for size is:
// * 1 to materialise the constant, using literal pool if necessary
// * 1 instruction for fcmp
// * 1 instruction for fcmp{e}
MacroEmissionCheckScope guard(this);
if (value != 0.0) {
UseScratchRegisterScope temps(this);
FPRegister tmp = temps.AcquireSameSizeAs(fn);
Fmov(tmp, value);
fcmp(fn, tmp);
FPCompareMacro(fn, tmp, trap);
} else {
fcmp(fn, value);
FPCompareMacro(fn, value, trap);
}
}
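The special case exists because A64's fcmp only has an immediate form for #0.0; any other constant must first be materialised into a scratch FP register, which is why the worst-case emission count above includes the extra instruction. A hypothetical usage sketch:

//   masm.Fcmp(d0, 0.0);   // One instruction: fcmp d0, #0.0.
//   masm.Fcmp(d0, 1.5);   // Fmov into a scratch register (or a literal
//                         // pool load), then fcmp against it.
//   masm.Fcmpe(d0, 1.5);  // Same, but the signalling fcmpe variant.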
void MacroAssembler::Fcmpe(const FPRegister& fn, double value) {
Fcmp(fn, value, EnableTrap);
}


void MacroAssembler::Fmov(VRegister vd, double imm) {
VIXL_ASSERT(allow_macro_instructions_);
// Floating point immediates are loaded through the literal pool.
@ -1637,41 +1636,67 @@ void MacroAssembler::Pop(const CPURegister& dst0, const CPURegister& dst1,


void MacroAssembler::PushCPURegList(CPURegList registers) {
int size = registers.RegisterSizeInBytes();

PrepareForPush(registers.Count(), size);
// Push up to four registers at a time because if the current stack pointer is
// sp and reg_size is 32, registers must be pushed in blocks of four in order
// to maintain the 16-byte alignment for sp.
VIXL_ASSERT(!registers.Overlaps(*TmpList()));
VIXL_ASSERT(!registers.Overlaps(*FPTmpList()));
VIXL_ASSERT(allow_macro_instructions_);

int reg_size = registers.RegisterSizeInBytes();
PrepareForPush(registers.Count(), reg_size);

// Bump the stack pointer and store two registers at the bottom.
int size = registers.TotalSizeInBytes();
const CPURegister& bottom_0 = registers.PopLowestIndex();
const CPURegister& bottom_1 = registers.PopLowestIndex();
if (bottom_0.IsValid() && bottom_1.IsValid()) {
Stp(bottom_0, bottom_1, MemOperand(StackPointer(), -size, PreIndex));
} else if (bottom_0.IsValid()) {
Str(bottom_0, MemOperand(StackPointer(), -size, PreIndex));
}

int offset = 2 * reg_size;
while (!registers.IsEmpty()) {
int count_before = registers.Count();
const CPURegister& src0 = registers.PopHighestIndex();
const CPURegister& src1 = registers.PopHighestIndex();
const CPURegister& src2 = registers.PopHighestIndex();
const CPURegister& src3 = registers.PopHighestIndex();
int count = count_before - registers.Count();
PushHelper(count, size, src0, src1, src2, src3);
const CPURegister& src0 = registers.PopLowestIndex();
const CPURegister& src1 = registers.PopLowestIndex();
if (src1.IsValid()) {
Stp(src0, src1, MemOperand(StackPointer(), offset));
} else {
Str(src0, MemOperand(StackPointer(), offset));
}
offset += 2 * reg_size;
}
}


void MacroAssembler::PopCPURegList(CPURegList registers) {
int size = registers.RegisterSizeInBytes();

PrepareForPop(registers.Count(), size);
// Pop up to four registers at a time because if the current stack pointer is
// sp and reg_size is 32, registers must be pushed in blocks of four in order
// to maintain the 16-byte alignment for sp.
VIXL_ASSERT(!registers.Overlaps(*TmpList()));
VIXL_ASSERT(!registers.Overlaps(*FPTmpList()));
VIXL_ASSERT(allow_macro_instructions_);

int reg_size = registers.RegisterSizeInBytes();
PrepareForPop(registers.Count(), reg_size);


int size = registers.TotalSizeInBytes();
const CPURegister& bottom_0 = registers.PopLowestIndex();
const CPURegister& bottom_1 = registers.PopLowestIndex();

int offset = 2 * reg_size;
while (!registers.IsEmpty()) {
int count_before = registers.Count();
const CPURegister& dst0 = registers.PopLowestIndex();
const CPURegister& dst1 = registers.PopLowestIndex();
const CPURegister& dst2 = registers.PopLowestIndex();
const CPURegister& dst3 = registers.PopLowestIndex();
int count = count_before - registers.Count();
PopHelper(count, size, dst0, dst1, dst2, dst3);
if (dst1.IsValid()) {
Ldp(dst0, dst1, MemOperand(StackPointer(), offset));
} else {
Ldr(dst0, MemOperand(StackPointer(), offset));
}
offset += 2 * reg_size;
}

// Load the two registers at the bottom and drop the stack pointer.
if (bottom_0.IsValid() && bottom_1.IsValid()) {
Ldp(bottom_0, bottom_1, MemOperand(StackPointer(), size, PostIndex));
} else if (bottom_0.IsValid()) {
Ldr(bottom_0, MemOperand(StackPointer(), size, PostIndex));
}
}
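The rewritten Push/PopCPURegList walks the list in fixed pairs: one pre-indexed Stp claims the whole block and stores the two lowest-indexed registers at the bottom, then plain Stp/Str at increasing offsets fill the rest; Pop mirrors this with the post-indexed Ldp last. Under these assumptions, pushing four X registers would emit roughly:

//   masm.PushCPURegList(CPURegList(x0, x1, x2, x3));
//   // stp x0, x1, [sp, #-32]!   Claim all 32 bytes; bottom pair at the base.
//   // stp x2, x3, [sp, #16]     Next pair at offset 2 * reg_size.
//   masm.PopCPURegList(CPURegList(x0, x1, x2, x3));
//   // ldp x2, x3, [sp, #16]
//   // ldp x0, x1, [sp], #32     Bottom pair last; post-index drops the space.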
@ -1831,42 +1856,6 @@ void MacroAssembler::Peek(const Register& dst, const Operand& offset) {
}


void MacroAssembler::PeekCPURegList(CPURegList registers, int offset) {
VIXL_ASSERT(!registers.IncludesAliasOf(StackPointer()));
VIXL_ASSERT(offset >= 0);
int size = registers.RegisterSizeInBytes();

while (registers.Count() >= 2) {
const CPURegister& dst0 = registers.PopLowestIndex();
const CPURegister& dst1 = registers.PopLowestIndex();
Ldp(dst0, dst1, MemOperand(StackPointer(), offset));
offset += 2 * size;
}
if (!registers.IsEmpty()) {
Ldr(registers.PopLowestIndex(),
MemOperand(StackPointer(), offset));
}
}


void MacroAssembler::PokeCPURegList(CPURegList registers, int offset) {
VIXL_ASSERT(!registers.IncludesAliasOf(StackPointer()));
VIXL_ASSERT(offset >= 0);
int size = registers.RegisterSizeInBytes();

while (registers.Count() >= 2) {
const CPURegister& dst0 = registers.PopLowestIndex();
const CPURegister& dst1 = registers.PopLowestIndex();
Stp(dst0, dst1, MemOperand(StackPointer(), offset));
offset += 2 * size;
}
if (!registers.IsEmpty()) {
Str(registers.PopLowestIndex(),
MemOperand(StackPointer(), offset));
}
}


void MacroAssembler::Claim(const Operand& size) {
VIXL_ASSERT(allow_macro_instructions_);

@ -1956,6 +1945,80 @@ void MacroAssembler::PopCalleeSavedRegisters() {
ldp(x29, x30, tos);
}

void MacroAssembler::LoadCPURegList(CPURegList registers,
const MemOperand& src) {
LoadStoreCPURegListHelper(kLoad, registers, src);
}

void MacroAssembler::StoreCPURegList(CPURegList registers,
const MemOperand& dst) {
LoadStoreCPURegListHelper(kStore, registers, dst);
}


void MacroAssembler::LoadStoreCPURegListHelper(LoadStoreCPURegListAction op,
CPURegList registers,
const MemOperand& mem) {
// We do not handle pre-indexing or post-indexing.
VIXL_ASSERT(!(mem.IsPreIndex() || mem.IsPostIndex()));
VIXL_ASSERT(!registers.Overlaps(tmp_list_));
VIXL_ASSERT(!registers.Overlaps(fptmp_list_));
VIXL_ASSERT(!registers.IncludesAliasOf(sp));

UseScratchRegisterScope temps(this);

MemOperand loc = BaseMemOperandForLoadStoreCPURegList(registers,
mem,
&temps);

while (registers.Count() >= 2) {
const CPURegister& dst0 = registers.PopLowestIndex();
const CPURegister& dst1 = registers.PopLowestIndex();
if (op == kStore) {
Stp(dst0, dst1, loc);
} else {
VIXL_ASSERT(op == kLoad);
Ldp(dst0, dst1, loc);
}
loc.AddOffset(2 * registers.RegisterSizeInBytes());
}
if (!registers.IsEmpty()) {
if (op == kStore) {
Str(registers.PopLowestIndex(), loc);
} else {
VIXL_ASSERT(op == kLoad);
Ldr(registers.PopLowestIndex(), loc);
}
}
}

MemOperand MacroAssembler::BaseMemOperandForLoadStoreCPURegList(
const CPURegList& registers,
const MemOperand& mem,
UseScratchRegisterScope* scratch_scope) {
// If necessary, pre-compute the base address for the accesses.
if (mem.IsRegisterOffset()) {
Register reg_base = scratch_scope->AcquireX();
ComputeAddress(reg_base, mem);
return MemOperand(reg_base);

} else if (mem.IsImmediateOffset()) {
int reg_size = registers.RegisterSizeInBytes();
int total_size = registers.TotalSizeInBytes();
int64_t min_offset = mem.offset();
int64_t max_offset = mem.offset() + std::max(0, total_size - 2 * reg_size);
if ((registers.Count() >= 2) &&
(!Assembler::IsImmLSPair(min_offset, WhichPowerOf2(reg_size)) ||
!Assembler::IsImmLSPair(max_offset, WhichPowerOf2(reg_size)))) {
Register reg_base = scratch_scope->AcquireX();
ComputeAddress(reg_base, mem);
return MemOperand(reg_base);
}
}

return mem;
}
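LoadCPURegList/StoreCPURegList accept an arbitrary MemOperand, so a register set can be spilled to any base, not just the stack; when the offsets cannot be encoded as an LDP/STP immediate pair, the helper above computes the base into a scratch register first. A hypothetical usage sketch (register choices are illustrative):

//   CPURegList saved(x19, x20, x21, x22);
//   masm.StoreCPURegList(saved, MemOperand(x9));          // stp pairs to [x9].
//   masm.LoadCPURegList(saved, MemOperand(x9, 1 << 16));  // Offset too large
//       // for an ldp immediate: the helper materialises x9 + 65536 into a
//       // scratch register and loads relative to that.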
void MacroAssembler::BumpSystemStackPointer(const Operand& space) {
VIXL_ASSERT(!sp.Is(StackPointer()));
// TODO: Several callers rely on this not using scratch registers, so we use
@ -30,9 +30,9 @@
#include <algorithm>
#include <limits>

#include "globals.h"
#include "a64/assembler-a64.h"
#include "a64/debugger-a64.h"
#include "vixl/globals.h"
#include "vixl/a64/assembler-a64.h"
#include "vixl/a64/debugger-a64.h"


#define LS_MACRO_LIST(V) \
@ -56,6 +56,7 @@ namespace vixl {

// Forward declaration
class MacroAssembler;
class UseScratchRegisterScope;

class Pool {
public:
@ -631,13 +632,15 @@ class MacroAssembler : public Assembler {
// Add and sub macros.
void Add(const Register& rd,
const Register& rn,
const Operand& operand);
const Operand& operand,
FlagsUpdate S = LeaveFlags);
void Adds(const Register& rd,
const Register& rn,
const Operand& operand);
void Sub(const Register& rd,
const Register& rn,
const Operand& operand);
const Operand& operand,
FlagsUpdate S = LeaveFlags);
void Subs(const Register& rd,
const Register& rn,
const Operand& operand);
@ -844,39 +847,43 @@ class MacroAssembler : public Assembler {
// supported.
//
// Otherwise, (Peek|Poke)(CPU|X|W|D|S)RegList is preferred.
void PeekCPURegList(CPURegList registers, int offset);
void PokeCPURegList(CPURegList registers, int offset);
void PeekCPURegList(CPURegList registers, int64_t offset) {
LoadCPURegList(registers, MemOperand(StackPointer(), offset));
}
void PokeCPURegList(CPURegList registers, int64_t offset) {
StoreCPURegList(registers, MemOperand(StackPointer(), offset));
}

void PeekSizeRegList(RegList registers, int offset, unsigned reg_size,
void PeekSizeRegList(RegList registers, int64_t offset, unsigned reg_size,
CPURegister::RegisterType type = CPURegister::kRegister) {
PeekCPURegList(CPURegList(type, reg_size, registers), offset);
}
void PokeSizeRegList(RegList registers, int offset, unsigned reg_size,
void PokeSizeRegList(RegList registers, int64_t offset, unsigned reg_size,
CPURegister::RegisterType type = CPURegister::kRegister) {
PokeCPURegList(CPURegList(type, reg_size, registers), offset);
}
void PeekXRegList(RegList regs, int offset) {
void PeekXRegList(RegList regs, int64_t offset) {
PeekSizeRegList(regs, offset, kXRegSize);
}
void PokeXRegList(RegList regs, int offset) {
void PokeXRegList(RegList regs, int64_t offset) {
PokeSizeRegList(regs, offset, kXRegSize);
}
void PeekWRegList(RegList regs, int offset) {
void PeekWRegList(RegList regs, int64_t offset) {
PeekSizeRegList(regs, offset, kWRegSize);
}
void PokeWRegList(RegList regs, int offset) {
void PokeWRegList(RegList regs, int64_t offset) {
PokeSizeRegList(regs, offset, kWRegSize);
}
void PeekDRegList(RegList regs, int offset) {
void PeekDRegList(RegList regs, int64_t offset) {
PeekSizeRegList(regs, offset, kDRegSize, CPURegister::kVRegister);
}
void PokeDRegList(RegList regs, int offset) {
void PokeDRegList(RegList regs, int64_t offset) {
PokeSizeRegList(regs, offset, kDRegSize, CPURegister::kVRegister);
}
void PeekSRegList(RegList regs, int offset) {
void PeekSRegList(RegList regs, int64_t offset) {
PeekSizeRegList(regs, offset, kSRegSize, CPURegister::kVRegister);
}
void PokeSRegList(RegList regs, int offset) {
void PokeSRegList(RegList regs, int64_t offset) {
PokeSizeRegList(regs, offset, kSRegSize, CPURegister::kVRegister);
}

@ -911,6 +918,9 @@ class MacroAssembler : public Assembler {
// aligned to 16 bytes.
void PopCalleeSavedRegisters();

void LoadCPURegList(CPURegList registers, const MemOperand& src);
void StoreCPURegList(CPURegList registers, const MemOperand& dst);

// Remaining instructions are simple pass-through calls to the assembler.
void Adr(const Register& rd, Label* label) {
VIXL_ASSERT(allow_macro_instructions_);
@ -1135,18 +1145,31 @@ class MacroAssembler : public Assembler {
void Fccmp(const VRegister& vn,
const VRegister& vm,
StatusFlags nzcv,
Condition cond) {
Condition cond,
FPTrapFlags trap = DisableTrap) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT((cond != al) && (cond != nv));
SingleEmissionCheckScope guard(this);
fccmp(vn, vm, nzcv, cond);
FPCCompareMacro(vn, vm, nzcv, cond, trap);
}
void Fcmp(const VRegister& vn, const VRegister& vm) {
void Fccmpe(const VRegister& vn,
const VRegister& vm,
StatusFlags nzcv,
Condition cond) {
Fccmp(vn, vm, nzcv, cond, EnableTrap);
}
void Fcmp(const VRegister& vn, const VRegister& vm,
FPTrapFlags trap = DisableTrap) {
VIXL_ASSERT(allow_macro_instructions_);
SingleEmissionCheckScope guard(this);
fcmp(vn, vm);
FPCompareMacro(vn, vm, trap);
}
void Fcmp(const VRegister& vn, double value,
FPTrapFlags trap = DisableTrap);
void Fcmpe(const VRegister& vn, double value);
void Fcmpe(const VRegister& vn, const VRegister& vm) {
Fcmp(vn, vm, EnableTrap);
}
void Fcmp(const VRegister& vn, double value);
void Fcsel(const VRegister& vd,
const VRegister& vn,
const VRegister& vm,
@ -2000,6 +2023,14 @@ class MacroAssembler : public Assembler {
SingleEmissionCheckScope guard(this);
umull(rd, rn, rm);
}
void Umulh(const Register& xd, const Register& xn, const Register& xm) {
VIXL_ASSERT(allow_macro_instructions_);
VIXL_ASSERT(!xd.IsZero());
VIXL_ASSERT(!xn.IsZero());
VIXL_ASSERT(!xm.IsZero());
SingleEmissionCheckScope guard(this);
umulh(xd, xn, xm);
}
void Umsubl(const Register& rd,
const Register& rn,
const Register& rm,
@ -2989,6 +3020,23 @@ class MacroAssembler : public Assembler {
void PrepareForPush(int count, int size);
void PrepareForPop(int count, int size);

// The actual implementation of load and store operations for CPURegList.
enum LoadStoreCPURegListAction {
kLoad,
kStore
};
void LoadStoreCPURegListHelper(LoadStoreCPURegListAction operation,
CPURegList registers,
const MemOperand& mem);
// Returns a MemOperand suitable for loading or storing a CPURegList at `dst`.
// This helper may allocate registers from `scratch_scope` and generate code
// to compute an intermediate address. The resulting MemOperand is only valid
// as long as `scratch_scope` remains valid.
MemOperand BaseMemOperandForLoadStoreCPURegList(
const CPURegList& registers,
const MemOperand& mem,
UseScratchRegisterScope* scratch_scope);
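The validity caveat in the comment above is easy to trip over. A hypothetical illustration of the lifetime rule (BaseMemOperandForLoadStoreCPURegList is private, so this only shows the rule, not real client code):

//   {
//     UseScratchRegisterScope temps(&masm);
//     MemOperand loc = BaseMemOperandForLoadStoreCPURegList(regs, mem, &temps);
//     masm.Ldp(x0, x1, loc);  // OK: `temps` is still live.
//   }
//   // Using `loc` here would be wrong: its base may be a scratch register
//   // that `temps` has already released.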
bool LabelIsOutOfRange(Label* label, ImmBranchType branch_type) {
return !Instruction::IsValidImmPCOffset(branch_type,
label->location() - CursorOffset());
@ -27,8 +27,8 @@
#ifdef USE_SIMULATOR

#include <string.h>
#include <math.h>
#include "a64/simulator-a64.h"
#include <cmath>
#include "vixl/a64/simulator-a64.h"

namespace vixl {

@ -396,23 +396,18 @@ int64_t Simulator::ExtendValue(unsigned reg_size,
}


template<> double Simulator::FPDefaultNaN<double>() const {
return kFP64DefaultNaN;
}


template<> float Simulator::FPDefaultNaN<float>() const {
return kFP32DefaultNaN;
}


void Simulator::FPCompare(double val0, double val1) {
void Simulator::FPCompare(double val0, double val1, FPTrapFlags trap) {
AssertSupportedFPCR();

// TODO: This assumes that the C++ implementation handles comparisons in the
// way that we expect (as per AssertSupportedFPCR()).
if ((isnan(val0) != 0) || (isnan(val1) != 0)) {
bool process_exception = false;
if ((std::isnan(val0) != 0) || (std::isnan(val1) != 0)) {
nzcv().SetRawValue(FPUnorderedFlag);
if (IsSignallingNaN(val0) || IsSignallingNaN(val1) ||
(trap == EnableTrap)) {
process_exception = true;
}
} else if (val0 < val1) {
nzcv().SetRawValue(FPLessThanFlag);
} else if (val0 > val1) {
@ -423,6 +418,7 @@ void Simulator::FPCompare(double val0, double val1) {
VIXL_UNREACHABLE();
}
LogSystemRegister(NZCV);
if (process_exception) FPProcessException();
}
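FPCompare sets NZCV exactly as an A64 fcmp would: unordered when either input is NaN, and the usual less/greater/equal encodings otherwise; with the new trap argument, a signalling NaN, or any NaN under EnableTrap, additionally raises the exception. A condensed sketch of the mapping (flag encodings follow the A64 convention N=8, Z=4, C=2, V=1):

#include <cmath>
#include <cstdint>

// Sketch of the fcmp -> NZCV mapping implemented above.
static uint32_t FcmpNzcvSketch(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) return 0x3;  // 0011: unordered.
  if (a < b) return 0x8;                           // 1000: less than.
  if (a > b) return 0x2;                           // 0010: greater than.
  return 0x6;                                      // 0110: equal.
}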
@ -440,7 +436,7 @@ Simulator::PrintRegisterFormat Simulator::GetPrintRegisterFormatForSize(
}

switch (lane_size) {
default: VIXL_UNREACHABLE();
default: VIXL_UNREACHABLE(); break;
case kQRegSizeInBytes: format |= kPrintReg1Q; break;
case kDRegSizeInBytes: format |= kPrintReg1D; break;
case kSRegSizeInBytes: format |= kPrintReg1S; break;
@ -460,7 +456,7 @@ Simulator::PrintRegisterFormat Simulator::GetPrintRegisterFormatForSize(
Simulator::PrintRegisterFormat Simulator::GetPrintRegisterFormat(
VectorFormat vform) {
switch (vform) {
default: VIXL_UNREACHABLE();
default: VIXL_UNREACHABLE(); return kPrintReg16B;
case kFormat16B: return kPrintReg16B;
case kFormat8B: return kPrintReg8B;
case kFormat8H: return kPrintReg8H;
@ -841,7 +837,7 @@ void Simulator::VisitUnconditionalBranch(const Instruction* instr) {
switch (instr->Mask(UnconditionalBranchMask)) {
case BL:
set_lr(instr->NextInstruction());
// Fall through.
VIXL_FALLTHROUGH();
case B:
set_pc(instr->ImmPCOffsetTarget());
break;
@ -864,7 +860,7 @@ void Simulator::VisitUnconditionalBranchToRegister(const Instruction* instr) {
switch (instr->Mask(UnconditionalBranchToRegisterMask)) {
case BLR:
set_lr(instr->NextInstruction());
// Fall through.
VIXL_FALLTHROUGH();
case BR:
case RET: set_pc(target); break;
default: VIXL_UNREACHABLE();
@ -1007,7 +1003,7 @@ void Simulator::LogicalHelper(const Instruction* instr, int64_t op2) {
// Switch on the logical operation, stripping out the NOT bit, as it has a
// different meaning for logical immediate instructions.
switch (instr->Mask(LogicalOpMask & ~NOT)) {
case ANDS: update_flags = true; // Fall through.
case ANDS: update_flags = true; VIXL_FALLTHROUGH();
case AND: result = op1 & op2; break;
case ORR: result = op1 | op2; break;
case EOR: result = op1 ^ op2; break;
@ -1616,14 +1612,14 @@ void Simulator::VisitDataProcessing1Source(const Instruction* instr) {
case REV_w: set_wreg(dst, ReverseBytes(wreg(src), Reverse32)); break;
case REV32_x: set_xreg(dst, ReverseBytes(xreg(src), Reverse32)); break;
case REV_x: set_xreg(dst, ReverseBytes(xreg(src), Reverse64)); break;
case CLZ_w: set_wreg(dst, CountLeadingZeros(wreg(src), kWRegSize)); break;
case CLZ_x: set_xreg(dst, CountLeadingZeros(xreg(src), kXRegSize)); break;
case CLZ_w: set_wreg(dst, CountLeadingZeros(wreg(src))); break;
case CLZ_x: set_xreg(dst, CountLeadingZeros(xreg(src))); break;
case CLS_w: {
set_wreg(dst, CountLeadingSignBits(wreg(src), kWRegSize));
set_wreg(dst, CountLeadingSignBits(wreg(src)));
break;
}
case CLS_x: {
set_xreg(dst, CountLeadingSignBits(xreg(src), kXRegSize));
set_xreg(dst, CountLeadingSignBits(xreg(src)));
break;
}
default: VIXL_UNIMPLEMENTED();
@ -1831,9 +1827,13 @@ void Simulator::VisitDataProcessing2Source(const Instruction* instr) {
// The algorithm used is adapted from the one described in section 8.2 of
// Hacker's Delight, by Henry S. Warren, Jr.
// It assumes that a right shift on a signed integer is an arithmetic shift.
static int64_t MultiplyHighSigned(int64_t u, int64_t v) {
// Type T must be either uint64_t or int64_t.
template <typename T>
static T MultiplyHigh(T u, T v) {
uint64_t u0, v0, w0;
int64_t u1, v1, w1, w2, t;
T u1, v1, w1, w2, t;

VIXL_ASSERT(sizeof(u) == sizeof(u0));

u0 = u & 0xffffffff;
u1 = u >> 32;
@ -1872,8 +1872,12 @@ void Simulator::VisitDataProcessing3Source(const Instruction* instr) {
case SMSUBL_x: result = xreg(instr->Ra()) - (rn_s32 * rm_s32); break;
case UMADDL_x: result = xreg(instr->Ra()) + (rn_u32 * rm_u32); break;
case UMSUBL_x: result = xreg(instr->Ra()) - (rn_u32 * rm_u32); break;
case UMULH_x:
result = MultiplyHigh(reg<uint64_t>(instr->Rn()),
reg<uint64_t>(instr->Rm()));
break;
case SMULH_x:
result = MultiplyHighSigned(xreg(instr->Rn()), xreg(instr->Rm()));
result = MultiplyHigh(xreg(instr->Rn()), xreg(instr->Rm()));
break;
default: VIXL_UNIMPLEMENTED();
}
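Templating MultiplyHigh over uint64_t and int64_t lets UMULH and SMULH share the Hacker's Delight decomposition: split each operand into 32-bit halves, form the partial products, and carry into the top 64 bits. On compilers with the __int128 extension (an assumption for this sketch only; the simulator avoids __int128 for portability) the result can be cross-checked directly:

#include <cstdint>
#include <type_traits>

// Cross-check sketch for the Hacker's Delight high-multiply above.
template <typename T>
static T MultiplyHighReference(T u, T v) {
  // Promote through a 128-bit type of matching signedness.
  using Wide = typename std::conditional<std::is_signed<T>::value,
                                         __int128, unsigned __int128>::type;
  return static_cast<T>((static_cast<Wide>(u) * static_cast<Wide>(v)) >> 64);
}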
@ -2112,28 +2116,28 @@ void Simulator::VisitFPFixedPointConvert(const Instruction* instr) {
break;
}
case FCVTZS_xd_fixed:
set_xreg(dst, FPToInt64(dreg(src) * pow(2.0, fbits), FPZero));
set_xreg(dst, FPToInt64(dreg(src) * std::pow(2.0, fbits), FPZero));
break;
case FCVTZS_wd_fixed:
set_wreg(dst, FPToInt32(dreg(src) * pow(2.0, fbits), FPZero));
set_wreg(dst, FPToInt32(dreg(src) * std::pow(2.0, fbits), FPZero));
break;
case FCVTZU_xd_fixed:
set_xreg(dst, FPToUInt64(dreg(src) * pow(2.0, fbits), FPZero));
set_xreg(dst, FPToUInt64(dreg(src) * std::pow(2.0, fbits), FPZero));
break;
case FCVTZU_wd_fixed:
set_wreg(dst, FPToUInt32(dreg(src) * pow(2.0, fbits), FPZero));
set_wreg(dst, FPToUInt32(dreg(src) * std::pow(2.0, fbits), FPZero));
break;
case FCVTZS_xs_fixed:
set_xreg(dst, FPToInt64(sreg(src) * powf(2.0f, fbits), FPZero));
set_xreg(dst, FPToInt64(sreg(src) * std::pow(2.0f, fbits), FPZero));
break;
case FCVTZS_ws_fixed:
set_wreg(dst, FPToInt32(sreg(src) * powf(2.0f, fbits), FPZero));
set_wreg(dst, FPToInt32(sreg(src) * std::pow(2.0f, fbits), FPZero));
break;
case FCVTZU_xs_fixed:
set_xreg(dst, FPToUInt64(sreg(src) * powf(2.0f, fbits), FPZero));
set_xreg(dst, FPToUInt64(sreg(src) * std::pow(2.0f, fbits), FPZero));
break;
case FCVTZU_ws_fixed:
set_wreg(dst, FPToUInt32(sreg(src) * powf(2.0f, fbits), FPZero));
set_wreg(dst, FPToUInt32(sreg(src) * std::pow(2.0f, fbits), FPZero));
break;
default: VIXL_UNREACHABLE();
}
@ -2143,11 +2147,16 @@ void Simulator::VisitFPFixedPointConvert(const Instruction* instr) {
void Simulator::VisitFPCompare(const Instruction* instr) {
AssertSupportedFPCR();

FPTrapFlags trap = DisableTrap;
switch (instr->Mask(FPCompareMask)) {
case FCMP_s: FPCompare(sreg(instr->Rn()), sreg(instr->Rm())); break;
case FCMP_d: FPCompare(dreg(instr->Rn()), dreg(instr->Rm())); break;
case FCMP_s_zero: FPCompare(sreg(instr->Rn()), 0.0f); break;
case FCMP_d_zero: FPCompare(dreg(instr->Rn()), 0.0); break;
case FCMPE_s: trap = EnableTrap; VIXL_FALLTHROUGH();
case FCMP_s: FPCompare(sreg(instr->Rn()), sreg(instr->Rm()), trap); break;
case FCMPE_d: trap = EnableTrap; VIXL_FALLTHROUGH();
case FCMP_d: FPCompare(dreg(instr->Rn()), dreg(instr->Rm()), trap); break;
case FCMPE_s_zero: trap = EnableTrap; VIXL_FALLTHROUGH();
case FCMP_s_zero: FPCompare(sreg(instr->Rn()), 0.0f, trap); break;
case FCMPE_d_zero: trap = EnableTrap; VIXL_FALLTHROUGH();
case FCMP_d_zero: FPCompare(dreg(instr->Rn()), 0.0, trap); break;
default: VIXL_UNIMPLEMENTED();
}
}
@ -2156,18 +2165,23 @@ void Simulator::VisitFPCompare(const Instruction* instr) {
void Simulator::VisitFPConditionalCompare(const Instruction* instr) {
AssertSupportedFPCR();

FPTrapFlags trap = DisableTrap;
switch (instr->Mask(FPConditionalCompareMask)) {
case FCCMPE_s: trap = EnableTrap;
VIXL_FALLTHROUGH();
case FCCMP_s:
if (ConditionPassed(instr->Condition())) {
FPCompare(sreg(instr->Rn()), sreg(instr->Rm()));
FPCompare(sreg(instr->Rn()), sreg(instr->Rm()), trap);
} else {
nzcv().SetFlags(instr->Nzcv());
LogSystemRegister(NZCV);
}
break;
case FCCMPE_d: trap = EnableTrap;
VIXL_FALLTHROUGH();
case FCCMP_d:
if (ConditionPassed(instr->Condition())) {
FPCompare(dreg(instr->Rn()), dreg(instr->Rm()));
FPCompare(dreg(instr->Rn()), dreg(instr->Rm()), trap);
} else {
nzcv().SetFlags(instr->Nzcv());
LogSystemRegister(NZCV);
@ -2245,547 +2259,6 @@ void Simulator::VisitFPDataProcessing1Source(const Instruction* instr) {
}
// Assemble the specified IEEE-754 components into the target type and apply
// appropriate rounding.
// sign: 0 = positive, 1 = negative
// exponent: Unbiased IEEE-754 exponent.
// mantissa: The mantissa of the input. The top bit (which is not encoded for
// normal IEEE-754 values) must not be omitted. This bit has the
// value 'pow(2, exponent)'.
//
// The input value is assumed to be a normalized value. That is, the input may
// not be infinity or NaN. If the source value is subnormal, it must be
// normalized before calling this function such that the highest set bit in the
// mantissa has the value 'pow(2, exponent)'.
//
// Callers should use FPRoundToFloat or FPRoundToDouble directly, rather than
// calling a templated FPRound.
template <class T, int ebits, int mbits>
static T FPRound(int64_t sign, int64_t exponent, uint64_t mantissa,
FPRounding round_mode) {
VIXL_ASSERT((sign == 0) || (sign == 1));

// Only FPTieEven and FPRoundOdd rounding modes are implemented.
VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));

// Rounding can promote subnormals to normals, and normals to infinities. For
// example, a double with exponent 127 (FLT_MAX_EXP) would appear to be
// encodable as a float, but rounding based on the low-order mantissa bits
// could make it overflow. With ties-to-even rounding, this value would become
// an infinity.

// ---- Rounding Method ----
//
// The exponent is irrelevant in the rounding operation, so we treat the
// lowest-order bit that will fit into the result ('onebit') as having
// the value '1'. Similarly, the highest-order bit that won't fit into
// the result ('halfbit') has the value '0.5'. The 'point' sits between
// 'onebit' and 'halfbit':
//
//            These bits fit into the result.
//               |---------------------|
//  mantissa = 0bxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
//                                     ||
//                                    / |
//                                   /  halfbit
//                               onebit
//
// For subnormal outputs, the range of representable bits is smaller and
// the position of onebit and halfbit depends on the exponent of the
// input, but the method is otherwise similar.
//
//   onebit(frac)
//     |
//     | halfbit(frac)       halfbit(adjusted)
//     | /                   /
//     | |                   |
//  0b00.0 (exact)  -> 0b00.0 (exact)    -> 0b00
//  0b00.0...       -> 0b00.0...         -> 0b00
//  0b00.1 (exact)  -> 0b00.0111..111    -> 0b00
//  0b00.1...       -> 0b00.1...         -> 0b01
//  0b01.0 (exact)  -> 0b01.0 (exact)    -> 0b01
//  0b01.0...       -> 0b01.0...         -> 0b01
//  0b01.1 (exact)  -> 0b01.1 (exact)    -> 0b10
//  0b01.1...       -> 0b01.1...         -> 0b10
//  0b10.0 (exact)  -> 0b10.0 (exact)    -> 0b10
//  0b10.0...       -> 0b10.0...         -> 0b10
//  0b10.1 (exact)  -> 0b10.0111..111    -> 0b10
//  0b10.1...       -> 0b10.1...         -> 0b11
//  0b11.0 (exact)  -> 0b11.0 (exact)    -> 0b11
//  ...               /                     /
//                   /                     /
//                  /                     /
//  adjusted = frac - (halfbit(mantissa) & ~onebit(frac));
//
//  mantissa = (mantissa >> shift) + halfbit(adjusted);
static const int mantissa_offset = 0;
static const int exponent_offset = mantissa_offset + mbits;
static const int sign_offset = exponent_offset + ebits;
VIXL_ASSERT(sign_offset == (sizeof(T) * 8 - 1));

// Bail out early for zero inputs.
if (mantissa == 0) {
return sign << sign_offset;
}

// If all bits in the exponent are set, the value is infinite or NaN.
// This is true for all binary IEEE-754 formats.
static const int infinite_exponent = (1 << ebits) - 1;
static const int max_normal_exponent = infinite_exponent - 1;

// Apply the exponent bias to encode it for the result. Doing this early makes
// it easy to detect values that will be infinite or subnormal.
exponent += max_normal_exponent >> 1;

if (exponent > max_normal_exponent) {
// Overflow: the input is too large for the result type to represent.
if (round_mode == FPTieEven) {
// FPTieEven rounding mode handles overflows using infinities.
exponent = infinite_exponent;
mantissa = 0;
} else {
VIXL_ASSERT(round_mode == FPRoundOdd);
// FPRoundOdd rounding mode handles overflows using the largest magnitude
// normal number.
exponent = max_normal_exponent;
mantissa = (UINT64_C(1) << exponent_offset) - 1;
}
return (sign << sign_offset) |
(exponent << exponent_offset) |
(mantissa << mantissa_offset);
}

// Calculate the shift required to move the top mantissa bit to the proper
// place in the destination type.
const int highest_significant_bit = 63 - CountLeadingZeros(mantissa, 64);
int shift = highest_significant_bit - mbits;

if (exponent <= 0) {
// The output will be subnormal (before rounding).
// For subnormal outputs, the shift must be adjusted by the exponent. The +1
// is necessary because the exponent of a subnormal value (encoded as 0) is
// the same as the exponent of the smallest normal value (encoded as 1).
shift += -exponent + 1;

// Handle inputs that would produce a zero output.
//
// Shifts higher than highest_significant_bit+1 will always produce a zero
// result. A shift of exactly highest_significant_bit+1 might produce a
// non-zero result after rounding.
if (shift > (highest_significant_bit + 1)) {
if (round_mode == FPTieEven) {
// The result will always be +/-0.0.
return sign << sign_offset;
} else {
VIXL_ASSERT(round_mode == FPRoundOdd);
VIXL_ASSERT(mantissa != 0);
// For FPRoundOdd, if the mantissa is too small to represent and
// non-zero return the next "odd" value.
return (sign << sign_offset) | 1;
}
}

// Properly encode the exponent for a subnormal output.
exponent = 0;
} else {
// Clear the topmost mantissa bit, since this is not encoded in IEEE-754
// normal values.
mantissa &= ~(UINT64_C(1) << highest_significant_bit);
}

if (shift > 0) {
if (round_mode == FPTieEven) {
// We have to shift the mantissa to the right. Some precision is lost, so
// we need to apply rounding.
uint64_t onebit_mantissa = (mantissa >> (shift)) & 1;
uint64_t halfbit_mantissa = (mantissa >> (shift-1)) & 1;
uint64_t adjustment = (halfbit_mantissa & ~onebit_mantissa);
uint64_t adjusted = mantissa - adjustment;
T halfbit_adjusted = (adjusted >> (shift-1)) & 1;

T result = (sign << sign_offset) |
(exponent << exponent_offset) |
((mantissa >> shift) << mantissa_offset);

// A very large mantissa can overflow during rounding. If this happens,
// the exponent should be incremented and the mantissa set to 1.0
// (encoded as 0). Applying halfbit_adjusted after assembling the float
// has the nice side-effect that this case is handled for free.
//
// This also handles cases where a very large finite value overflows to
// infinity, or where a very large subnormal value overflows to become
// normal.
return result + halfbit_adjusted;
} else {
VIXL_ASSERT(round_mode == FPRoundOdd);
// If any bits at position halfbit or below are set, onebit (ie. the
// bottom bit of the resulting mantissa) must be set.
uint64_t fractional_bits = mantissa & ((UINT64_C(1) << shift) - 1);
if (fractional_bits != 0) {
mantissa |= UINT64_C(1) << shift;
}

return (sign << sign_offset) |
(exponent << exponent_offset) |
((mantissa >> shift) << mantissa_offset);
}
} else {
// We have to shift the mantissa to the left (or not at all). The input
// mantissa is exactly representable in the output mantissa, so apply no
// rounding correction.
return (sign << sign_offset) |
(exponent << exponent_offset) |
((mantissa << -shift) << mantissa_offset);
}
}
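A concrete trace helps check the halfbit arithmetic above. Take a mantissa being narrowed by shift = 2, i.e. the 0bxx.yy rows of the table: for frac = 0b1010 (2.5 in units of onebit) the adjustment clears the halfbit because onebit is 0, so the result rounds down to 0b10; for frac = 0b1110 (3.5) the adjustment is 0 and the result rounds up to 0b100. A sketch of just that branch:

#include <cstdint>

// Bit-level ties-to-even, as in FPRound's shift > 0 branch (sketch).
static uint64_t RoundMantissaTieEven(uint64_t mantissa, int shift) {
  uint64_t onebit = (mantissa >> shift) & 1;
  uint64_t halfbit = (mantissa >> (shift - 1)) & 1;
  uint64_t adjusted = mantissa - (halfbit & ~onebit);
  uint64_t halfbit_adjusted = (adjusted >> (shift - 1)) & 1;
  return (mantissa >> shift) + halfbit_adjusted;
}
// RoundMantissaTieEven(0xA /* 0b1010 */, 2) == 0x2  (2.5 -> 2)
// RoundMantissaTieEven(0xE /* 0b1110 */, 2) == 0x4  (3.5 -> 4)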
// See FPRound for a description of this function.
static inline double FPRoundToDouble(int64_t sign, int64_t exponent,
uint64_t mantissa, FPRounding round_mode) {
int64_t bits =
FPRound<int64_t, kDoubleExponentBits, kDoubleMantissaBits>(sign,
exponent,
mantissa,
round_mode);
return rawbits_to_double(bits);
}


// See FPRound for a description of this function.
static inline float FPRoundToFloat(int64_t sign, int64_t exponent,
uint64_t mantissa, FPRounding round_mode) {
int32_t bits =
FPRound<int32_t, kFloatExponentBits, kFloatMantissaBits>(sign,
exponent,
mantissa,
round_mode);
return rawbits_to_float(bits);
}


// See FPRound for a description of this function.
static inline float16 FPRoundToFloat16(int64_t sign,
int64_t exponent,
uint64_t mantissa,
FPRounding round_mode) {
return FPRound<float16, kFloat16ExponentBits, kFloat16MantissaBits>(
sign, exponent, mantissa, round_mode);
}


double Simulator::FixedToDouble(int64_t src, int fbits, FPRounding round) {
if (src >= 0) {
return UFixedToDouble(src, fbits, round);
} else {
// This works for all negative values, including INT64_MIN.
return -UFixedToDouble(-src, fbits, round);
}
}


double Simulator::UFixedToDouble(uint64_t src, int fbits, FPRounding round) {
// An input of 0 is a special case because the result is effectively
// subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
if (src == 0) {
return 0.0;
}

// Calculate the exponent. The highest significant bit will have the value
// 2^exponent.
const int highest_significant_bit = 63 - CountLeadingZeros(src, 64);
const int64_t exponent = highest_significant_bit - fbits;

return FPRoundToDouble(0, exponent, src, round);
}
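The exponent computation is easier to see with numbers: for src = 6 (0b110) and fbits = 0, the highest set bit is bit 2, so the value is 1.1b x 2^2; with fbits = 4 the same bits mean 6/16 and the exponent drops to -2. A sketch, using the GCC/Clang builtin as a stand-in for VIXL's CountLeadingZeros (an assumption for portability of the example):

#include <cstdint>

// Exponent selection for unsigned fixed-point -> FP (sketch). Requires
// src != 0.
static int64_t FixedPointExponent(uint64_t src, int fbits) {
  int highest_significant_bit = 63 - __builtin_clzll(src);
  return highest_significant_bit - fbits;
}
// FixedPointExponent(6, 0) == 2   (6 = 1.1b * 2^2)
// FixedPointExponent(6, 4) == -2  (6/16 = 1.1b * 2^-2)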
float Simulator::FixedToFloat(int64_t src, int fbits, FPRounding round) {
if (src >= 0) {
return UFixedToFloat(src, fbits, round);
} else {
// This works for all negative values, including INT64_MIN.
return -UFixedToFloat(-src, fbits, round);
}
}


float Simulator::UFixedToFloat(uint64_t src, int fbits, FPRounding round) {
// An input of 0 is a special case because the result is effectively
// subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
if (src == 0) {
return 0.0f;
}

// Calculate the exponent. The highest significant bit will have the value
// 2^exponent.
const int highest_significant_bit = 63 - CountLeadingZeros(src, 64);
const int32_t exponent = highest_significant_bit - fbits;

return FPRoundToFloat(0, exponent, src, round);
}


double Simulator::FPToDouble(float value) {
switch (fpclassify(value)) {
case FP_NAN: {
if (IsSignallingNaN(value)) {
FPProcessException();
}
if (DN()) return kFP64DefaultNaN;

// Convert NaNs as the processor would:
// - The sign is propagated.
// - The payload (mantissa) is transferred entirely, except that the top
// bit is forced to '1', making the result a quiet NaN. The unused
// (low-order) payload bits are set to 0.
uint32_t raw = float_to_rawbits(value);

uint64_t sign = raw >> 31;
uint64_t exponent = (1 << 11) - 1;
uint64_t payload = unsigned_bitextract_64(21, 0, raw);
payload <<= (52 - 23); // The unused low-order bits should be 0.
payload |= (UINT64_C(1) << 51); // Force a quiet NaN.

return rawbits_to_double((sign << 63) | (exponent << 52) | payload);
}

case FP_ZERO:
case FP_NORMAL:
case FP_SUBNORMAL:
case FP_INFINITE: {
// All other inputs are preserved in a standard cast, because every value
// representable using an IEEE-754 float is also representable using an
// IEEE-754 double.
return static_cast<double>(value);
}
}

VIXL_UNREACHABLE();
return static_cast<double>(value);
}
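The NaN widening above is worth a worked value. The float signalling NaN 0x7F800001 (payload 1, quiet bit clear) becomes the double 0x7FF8000020000000: sign 0, all-ones exponent, the payload shifted left by 52 - 23 = 29 bits, and bit 51 forced to 1 to quieten it. A sketch of the same bit manipulation:

#include <cstdint>

// Float -> double NaN payload transfer (sketch of the code above).
static uint64_t WidenNaNBits(uint32_t raw_float) {
  uint64_t sign = raw_float >> 31;
  uint64_t exponent = (1 << 11) - 1;
  uint64_t payload = raw_float & ((1u << 22) - 1);  // Bits 21:0.
  payload <<= (52 - 23);           // Left-align; low-order bits become 0.
  payload |= (UINT64_C(1) << 51);  // Force a quiet NaN.
  return (sign << 63) | (exponent << 52) | payload;
}
// WidenNaNBits(0x7F800001) == 0x7FF8000020000000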
|
||||
|
||||
float Simulator::FPToFloat(float16 value) {
|
||||
uint32_t sign = value >> 15;
|
||||
uint32_t exponent = unsigned_bitextract_32(
|
||||
kFloat16MantissaBits + kFloat16ExponentBits - 1, kFloat16MantissaBits,
|
||||
value);
|
||||
uint32_t mantissa = unsigned_bitextract_32(
|
||||
kFloat16MantissaBits - 1, 0, value);
|
||||
|
||||
switch (float16classify(value)) {
|
||||
case FP_ZERO:
|
||||
return (sign == 0) ? 0.0f : -0.0f;
|
||||
|
||||
case FP_INFINITE:
|
||||
return (sign == 0) ? kFP32PositiveInfinity : kFP32NegativeInfinity;
|
||||
|
||||
case FP_SUBNORMAL: {
|
||||
// Calculate shift required to put mantissa into the most-significant bits
|
||||
// of the destination mantissa.
|
||||
int shift = CountLeadingZeros(mantissa << (32 - 10), 32);
|
||||
|
||||
// Shift mantissa and discard implicit '1'.
|
||||
mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits) + shift + 1;
|
||||
mantissa &= (1 << kFloatMantissaBits) - 1;
|
||||
|
||||
// Adjust the exponent for the shift applied, and rebias.
|
||||
exponent = exponent - shift + (-15 + 127);
|
||||
break;
|
||||
}
|
||||
|
||||
case FP_NAN:
|
||||
if (IsSignallingNaN(value)) {
|
||||
FPProcessException();
|
||||
}
|
||||
if (DN()) return kFP32DefaultNaN;
|
||||
|
||||
// Convert NaNs as the processor would:
|
||||
// - The sign is propagated.
|
||||
// - The payload (mantissa) is transferred entirely, except that the top
|
||||
// bit is forced to '1', making the result a quiet NaN. The unused
|
||||
// (low-order) payload bits are set to 0.
|
||||
exponent = (1 << kFloatExponentBits) - 1;
|
||||
|
||||
// Increase bits in mantissa, making low-order bits 0.
|
||||
mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
|
||||
mantissa |= 1 << 22; // Force a quiet NaN.
|
||||
break;
|
||||
|
||||
case FP_NORMAL:
|
||||
// Increase bits in mantissa, making low-order bits 0.
|
||||
mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
|
||||
|
||||
// Change exponent bias.
|
||||
exponent += (-15 + 127);
|
||||
break;
|
||||
|
||||
default: VIXL_UNREACHABLE();
|
||||
}
|
||||
return rawbits_to_float((sign << 31) |
|
||||
(exponent << kFloatMantissaBits) |
|
||||
mantissa);
|
||||
}
|
||||
|
||||
|
||||
float16 Simulator::FPToFloat16(float value, FPRounding round_mode) {
|
||||
// Only the FPTieEven rounding mode is implemented.
|
||||
VIXL_ASSERT(round_mode == FPTieEven);
|
||||
USE(round_mode);
|
||||
|
||||
uint32_t raw = float_to_rawbits(value);
|
||||
int32_t sign = raw >> 31;
|
||||
int32_t exponent = unsigned_bitextract_32(30, 23, raw) - 127;
|
||||
uint32_t mantissa = unsigned_bitextract_32(22, 0, raw);
|
||||
|
||||
switch (fpclassify(value)) {
|
||||
case FP_NAN: {
|
||||
if (IsSignallingNaN(value)) {
|
||||
FPProcessException();
|
||||
}
|
||||
if (DN()) return kFP16DefaultNaN;
|
||||
|
||||
// Convert NaNs as the processor would:
|
||||
// - The sign is propagated.
|
||||
// - The payload (mantissa) is transferred as much as possible, except
|
||||
// that the top bit is forced to '1', making the result a quiet NaN.
|
||||
float16 result = (sign == 0) ? kFP16PositiveInfinity
|
||||
: kFP16NegativeInfinity;
|
||||
result |= mantissa >> (kFloatMantissaBits - kFloat16MantissaBits);
|
||||
result |= (1 << 9); // Force a quiet NaN;
|
||||
return result;
|
||||
}
|
||||
|
||||
case FP_ZERO:
|
||||
return (sign == 0) ? 0 : 0x8000;
|
||||
|
||||
case FP_INFINITE:
|
||||
return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
|
||||
|
||||
case FP_NORMAL:
|
||||
case FP_SUBNORMAL: {
|
||||
// Convert float-to-half as the processor would, assuming that FPCR.FZ
|
||||
// (flush-to-zero) is not set.
|
||||
|
||||
// Add the implicit '1' bit to the mantissa.
|
||||
mantissa += (1 << 23);
|
||||
return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
|
||||
}
|
||||
}
|
||||
|
||||
VIXL_UNREACHABLE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
float16 Simulator::FPToFloat16(double value, FPRounding round_mode) {
  // Only the FPTieEven rounding mode is implemented.
  VIXL_ASSERT(round_mode == FPTieEven);
  USE(round_mode);

  uint64_t raw = double_to_rawbits(value);
  int32_t sign = raw >> 63;
  int64_t exponent = unsigned_bitextract_64(62, 52, raw) - 1023;
  uint64_t mantissa = unsigned_bitextract_64(51, 0, raw);

  switch (fpclassify(value)) {
    case FP_NAN: {
      if (IsSignallingNaN(value)) {
        FPProcessException();
      }
      if (DN()) return kFP16DefaultNaN;

      // Convert NaNs as the processor would:
      //  - The sign is propagated.
      //  - The payload (mantissa) is transferred as much as possible, except
      //    that the top bit is forced to '1', making the result a quiet NaN.
      float16 result = (sign == 0) ? kFP16PositiveInfinity
                                   : kFP16NegativeInfinity;
      result |= mantissa >> (kDoubleMantissaBits - kFloat16MantissaBits);
      result |= (1 << 9);  // Force a quiet NaN.
      return result;
    }

    case FP_ZERO:
      return (sign == 0) ? 0 : 0x8000;

    case FP_INFINITE:
      return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;

    case FP_NORMAL:
    case FP_SUBNORMAL: {
      // Convert double-to-half as the processor would, assuming that FPCR.FZ
      // (flush-to-zero) is not set.

      // Add the implicit '1' bit to the mantissa.
      mantissa += (UINT64_C(1) << 52);
      return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
    }
  }

  VIXL_UNREACHABLE();
  return 0;
}

float Simulator::FPToFloat(double value, FPRounding round_mode) {
  // Only the FPTieEven rounding mode is implemented.
  VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
  USE(round_mode);

  switch (fpclassify(value)) {
    case FP_NAN: {
      if (IsSignallingNaN(value)) {
        FPProcessException();
      }
      if (DN()) return kFP32DefaultNaN;

      // Convert NaNs as the processor would:
      //  - The sign is propagated.
      //  - The payload (mantissa) is transferred as much as possible, except
      //    that the top bit is forced to '1', making the result a quiet NaN.
      uint64_t raw = double_to_rawbits(value);

      uint32_t sign = raw >> 63;
      uint32_t exponent = (1 << 8) - 1;
      uint32_t payload = unsigned_bitextract_64(50, 52 - 23, raw);
      payload |= (1 << 22);  // Force a quiet NaN.

      return rawbits_to_float((sign << 31) | (exponent << 23) | payload);
    }

    case FP_ZERO:
    case FP_INFINITE: {
      // In a C++ cast, any value representable in the target type will be
      // unchanged. This is always the case for +/-0.0 and infinities.
      return static_cast<float>(value);
    }

    case FP_NORMAL:
    case FP_SUBNORMAL: {
      // Convert double-to-float as the processor would, assuming that FPCR.FZ
      // (flush-to-zero) is not set.
      uint64_t raw = double_to_rawbits(value);
      // Extract the IEEE-754 double components.
      uint32_t sign = raw >> 63;
      // Extract the exponent and remove the IEEE-754 encoding bias.
      int32_t exponent = unsigned_bitextract_64(62, 52, raw) - 1023;
      // Extract the mantissa and add the implicit '1' bit.
      uint64_t mantissa = unsigned_bitextract_64(51, 0, raw);
      if (fpclassify(value) == FP_NORMAL) {
        mantissa |= (UINT64_C(1) << 52);
      }
      return FPRoundToFloat(sign, exponent, mantissa, round_mode);
    }
  }

  VIXL_UNREACHABLE();
  return value;
}


void Simulator::VisitFPDataProcessing2Source(const Instruction* instr) {
  AssertSupportedFPCR();

@ -2851,63 +2324,6 @@ void Simulator::VisitFPDataProcessing3Source(const Instruction* instr) {
}

template <typename T>
T Simulator::FPProcessNaN(T op) {
  VIXL_ASSERT(isnan(op));
  if (IsSignallingNaN(op)) {
    FPProcessException();
  }
  return DN() ? FPDefaultNaN<T>() : ToQuietNaN(op);
}

template float Simulator::FPProcessNaN(float op);
template double Simulator::FPProcessNaN(double op);

template <typename T>
T Simulator::FPProcessNaNs(T op1, T op2) {
  if (IsSignallingNaN(op1)) {
    return FPProcessNaN(op1);
  } else if (IsSignallingNaN(op2)) {
    return FPProcessNaN(op2);
  } else if (isnan(op1)) {
    VIXL_ASSERT(IsQuietNaN(op1));
    return FPProcessNaN(op1);
  } else if (isnan(op2)) {
    VIXL_ASSERT(IsQuietNaN(op2));
    return FPProcessNaN(op2);
  } else {
    return 0.0;
  }
}

template float Simulator::FPProcessNaNs(float op1, float op2);
template double Simulator::FPProcessNaNs(double op1, double op2);

template <typename T>
T Simulator::FPProcessNaNs3(T op1, T op2, T op3) {
  if (IsSignallingNaN(op1)) {
    return FPProcessNaN(op1);
  } else if (IsSignallingNaN(op2)) {
    return FPProcessNaN(op2);
  } else if (IsSignallingNaN(op3)) {
    return FPProcessNaN(op3);
  } else if (isnan(op1)) {
    VIXL_ASSERT(IsQuietNaN(op1));
    return FPProcessNaN(op1);
  } else if (isnan(op2)) {
    VIXL_ASSERT(IsQuietNaN(op2));
    return FPProcessNaN(op2);
  } else if (isnan(op3)) {
    VIXL_ASSERT(IsQuietNaN(op3));
    return FPProcessNaN(op3);
  } else {
    return 0.0;
  }
}

template float Simulator::FPProcessNaNs3(float op1, float op2, float op3);
template double Simulator::FPProcessNaNs3(double op1, double op2, double op3);

bool Simulator::FPProcessNaNs(const Instruction* instr) {
  unsigned fd = instr->Rd();
  unsigned fn = instr->Rn();
@ -2916,13 +2332,13 @@ bool Simulator::FPProcessNaNs(const Instruction* instr) {

  if (instr->Mask(FP64) == FP64) {
    double result = FPProcessNaNs(dreg(fn), dreg(fm));
    if (isnan(result)) {
    if (std::isnan(result)) {
      set_dreg(fd, result);
      done = true;
    }
  } else {
    float result = FPProcessNaNs(sreg(fn), sreg(fm));
    if (isnan(result)) {
    if (std::isnan(result)) {
      set_sreg(fd, result);
      done = true;
    }

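The quieting step that FPProcessNaN delegates to relies on the AArch64 convention that a NaN is quiet when the top bit of its payload is set. A minimal sketch of that operation, assuming the standard binary64 layout; the memcpy calls stand in for the library's rawbits helpers:

#include <cstdint>
#include <cstring>

// Set the top mantissa bit of a binary64 NaN, turning a signalling NaN into
// the corresponding quiet NaN while preserving the sign and payload bits.
double QuietNaNSketch(double num) {
  const uint64_t kFP64QuietNaNMask = UINT64_C(0x0008000000000000);
  uint64_t raw;
  memcpy(&raw, &num, sizeof(raw));
  raw |= kFP64QuietNaNMask;
  memcpy(&num, &raw, sizeof(num));
  return num;
}
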
@ -3618,13 +3034,13 @@ void Simulator::NEONLoadStoreMultiStructHelper(const Instruction* instr,
  switch (instr->Mask(NEONLoadStoreMultiStructPostIndexMask)) {
    case NEON_LD1_4v:
    case NEON_LD1_4v_post: ld1(vf, vreg(reg[3]), addr[3]); count++;
      // Fall through.
      VIXL_FALLTHROUGH();
    case NEON_LD1_3v:
    case NEON_LD1_3v_post: ld1(vf, vreg(reg[2]), addr[2]); count++;
      // Fall through.
      VIXL_FALLTHROUGH();
    case NEON_LD1_2v:
    case NEON_LD1_2v_post: ld1(vf, vreg(reg[1]), addr[1]); count++;
      // Fall through.
      VIXL_FALLTHROUGH();
    case NEON_LD1_1v:
    case NEON_LD1_1v_post:
      ld1(vf, vreg(reg[0]), addr[0]);
@ -3632,13 +3048,13 @@ void Simulator::NEONLoadStoreMultiStructHelper(const Instruction* instr,
      break;
    case NEON_ST1_4v:
    case NEON_ST1_4v_post: st1(vf, vreg(reg[3]), addr[3]); count++;
      // Fall through.
      VIXL_FALLTHROUGH();
    case NEON_ST1_3v:
    case NEON_ST1_3v_post: st1(vf, vreg(reg[2]), addr[2]); count++;
      // Fall through.
      VIXL_FALLTHROUGH();
    case NEON_ST1_2v:
    case NEON_ST1_2v_post: st1(vf, vreg(reg[1]), addr[1]); count++;
      // Fall through.
      VIXL_FALLTHROUGH();
    case NEON_ST1_1v:
    case NEON_ST1_1v_post:
      st1(vf, vreg(reg[0]), addr[0]);
@ -3745,6 +3161,7 @@ void Simulator::NEONLoadStoreSingleStructHelper(const Instruction* instr,
    case NEON_LD3_b_post:
    case NEON_LD4_b:
    case NEON_LD4_b_post: do_load = true;
      VIXL_FALLTHROUGH();
    case NEON_ST1_b:
    case NEON_ST1_b_post:
    case NEON_ST2_b:
@ -3762,6 +3179,7 @@ void Simulator::NEONLoadStoreSingleStructHelper(const Instruction* instr,
    case NEON_LD3_h_post:
    case NEON_LD4_h:
    case NEON_LD4_h_post: do_load = true;
      VIXL_FALLTHROUGH();
    case NEON_ST1_h:
    case NEON_ST1_h_post:
    case NEON_ST2_h:
@ -3778,6 +3196,7 @@ void Simulator::NEONLoadStoreSingleStructHelper(const Instruction* instr,
    case NEON_LD3_s_post:
    case NEON_LD4_s:
    case NEON_LD4_s_post: do_load = true;
      VIXL_FALLTHROUGH();
    case NEON_ST1_s:
    case NEON_ST1_s_post:
    case NEON_ST2_s:
@ -27,12 +27,12 @@
#ifndef VIXL_A64_SIMULATOR_A64_H_
#define VIXL_A64_SIMULATOR_A64_H_

#include "globals.h"
#include "utils.h"
#include "a64/instructions-a64.h"
#include "a64/assembler-a64.h"
#include "a64/disasm-a64.h"
#include "a64/instrument-a64.h"
#include "vixl/globals.h"
#include "vixl/utils.h"
#include "vixl/a64/instructions-a64.h"
#include "vixl/a64/assembler-a64.h"
#include "vixl/a64/disasm-a64.h"
#include "vixl/a64/instrument-a64.h"

namespace vixl {

@ -150,6 +150,201 @@ const unsigned kLogParamsOffset = 1 * kInstructionSize;
const unsigned kLogLength = 2 * kInstructionSize;

// Assemble the specified IEEE-754 components into the target type and apply
// appropriate rounding.
//  sign:     0 = positive, 1 = negative
//  exponent: Unbiased IEEE-754 exponent.
//  mantissa: The mantissa of the input. The top bit (which is not encoded for
//            normal IEEE-754 values) must not be omitted. This bit has the
//            value 'pow(2, exponent)'.
//
// The input value is assumed to be a normalized value. That is, the input may
// not be infinity or NaN. If the source value is subnormal, it must be
// normalized before calling this function such that the highest set bit in the
// mantissa has the value 'pow(2, exponent)'.
//
// Callers should use FPRoundToFloat or FPRoundToDouble directly, rather than
// calling a templated FPRound.
template <class T, int ebits, int mbits>
T FPRound(int64_t sign, int64_t exponent, uint64_t mantissa,
          FPRounding round_mode) {
  VIXL_ASSERT((sign == 0) || (sign == 1));

  // Only FPTieEven and FPRoundOdd rounding modes are implemented.
  VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));

  // Rounding can promote subnormals to normals, and normals to infinities. For
  // example, a double with exponent 127 (FLT_MAX_EXP) would appear to be
  // encodable as a float, but rounding based on the low-order mantissa bits
  // could make it overflow. With ties-to-even rounding, this value would
  // become an infinity.

  // ---- Rounding Method ----
  //
  // The exponent is irrelevant in the rounding operation, so we treat the
  // lowest-order bit that will fit into the result ('onebit') as having
  // the value '1'. Similarly, the highest-order bit that won't fit into
  // the result ('halfbit') has the value '0.5'. The 'point' sits between
  // 'onebit' and 'halfbit':
  //
  //            These bits fit into the result.
  //               |---------------------|
  //  mantissa = 0bxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
  //                                     ||
  //                                    / |
  //                                   /  halfbit
  //                              onebit
  //
  // For subnormal outputs, the range of representable bits is smaller and
  // the position of onebit and halfbit depends on the exponent of the
  // input, but the method is otherwise similar.
  //
  //   onebit(frac)
  //     |
  //     | halfbit(frac)          halfbit(adjusted)
  //     | /                      /
  //     | |                      |
  //  0b00.0 (exact)      -> 0b00.0 (exact)    -> 0b00
  //  0b00.0...           -> 0b00.0...         -> 0b00
  //  0b00.1 (exact)      -> 0b00.0111..111    -> 0b00
  //  0b00.1...           -> 0b00.1...         -> 0b01
  //  0b01.0 (exact)      -> 0b01.0 (exact)    -> 0b01
  //  0b01.0...           -> 0b01.0...         -> 0b01
  //  0b01.1 (exact)      -> 0b01.1 (exact)    -> 0b10
  //  0b01.1...           -> 0b01.1...         -> 0b10
  //  0b10.0 (exact)      -> 0b10.0 (exact)    -> 0b10
  //  0b10.0...           -> 0b10.0...         -> 0b10
  //  0b10.1 (exact)      -> 0b10.0111..111    -> 0b10
  //  0b10.1...           -> 0b10.1...         -> 0b11
  //  0b11.0 (exact)      -> 0b11.0 (exact)    -> 0b11
  //  ...
  //
  // where: adjusted = frac - (halfbit(mantissa) & ~onebit(frac));
  //        mantissa = (mantissa >> shift) + halfbit(adjusted);

  static const int mantissa_offset = 0;
  static const int exponent_offset = mantissa_offset + mbits;
  static const int sign_offset = exponent_offset + ebits;
  VIXL_ASSERT(sign_offset == (sizeof(T) * 8 - 1));

  // Bail out early for zero inputs.
  if (mantissa == 0) {
    return sign << sign_offset;
  }

  // If all bits in the exponent are set, the value is infinite or NaN.
  // This is true for all binary IEEE-754 formats.
  static const int infinite_exponent = (1 << ebits) - 1;
  static const int max_normal_exponent = infinite_exponent - 1;

  // Apply the exponent bias to encode it for the result. Doing this early
  // makes it easy to detect values that will be infinite or subnormal.
  exponent += max_normal_exponent >> 1;

  if (exponent > max_normal_exponent) {
    // Overflow: the input is too large for the result type to represent.
    if (round_mode == FPTieEven) {
      // FPTieEven rounding mode handles overflows using infinities.
      exponent = infinite_exponent;
      mantissa = 0;
    } else {
      VIXL_ASSERT(round_mode == FPRoundOdd);
      // FPRoundOdd rounding mode handles overflows using the largest magnitude
      // normal number.
      exponent = max_normal_exponent;
      mantissa = (UINT64_C(1) << exponent_offset) - 1;
    }
    return (sign << sign_offset) |
           (exponent << exponent_offset) |
           (mantissa << mantissa_offset);
  }

  // Calculate the shift required to move the top mantissa bit to the proper
  // place in the destination type.
  const int highest_significant_bit = 63 - CountLeadingZeros(mantissa);
  int shift = highest_significant_bit - mbits;

  if (exponent <= 0) {
    // The output will be subnormal (before rounding).
    // For subnormal outputs, the shift must be adjusted by the exponent. The
    // +1 is necessary because the exponent of a subnormal value (encoded as 0)
    // is the same as the exponent of the smallest normal value (encoded as 1).
    shift += -exponent + 1;

    // Handle inputs that would produce a zero output.
    //
    // Shifts higher than highest_significant_bit+1 will always produce a zero
    // result. A shift of exactly highest_significant_bit+1 might produce a
    // non-zero result after rounding.
    if (shift > (highest_significant_bit + 1)) {
      if (round_mode == FPTieEven) {
        // The result will always be +/-0.0.
        return sign << sign_offset;
      } else {
        VIXL_ASSERT(round_mode == FPRoundOdd);
        VIXL_ASSERT(mantissa != 0);
        // For FPRoundOdd, if the mantissa is too small to represent and
        // non-zero, return the next "odd" value.
        return (sign << sign_offset) | 1;
      }
    }

    // Properly encode the exponent for a subnormal output.
    exponent = 0;
  } else {
    // Clear the topmost mantissa bit, since this is not encoded in IEEE-754
    // normal values.
    mantissa &= ~(UINT64_C(1) << highest_significant_bit);
  }

  if (shift > 0) {
    if (round_mode == FPTieEven) {
      // We have to shift the mantissa to the right. Some precision is lost, so
      // we need to apply rounding.
      uint64_t onebit_mantissa = (mantissa >> (shift)) & 1;
      uint64_t halfbit_mantissa = (mantissa >> (shift-1)) & 1;
      uint64_t adjustment = (halfbit_mantissa & ~onebit_mantissa);
      uint64_t adjusted = mantissa - adjustment;
      T halfbit_adjusted = (adjusted >> (shift-1)) & 1;

      T result = (sign << sign_offset) |
                 (exponent << exponent_offset) |
                 ((mantissa >> shift) << mantissa_offset);

      // A very large mantissa can overflow during rounding. If this happens,
      // the exponent should be incremented and the mantissa set to 1.0
      // (encoded as 0). Applying halfbit_adjusted after assembling the float
      // has the nice side-effect that this case is handled for free.
      //
      // This also handles cases where a very large finite value overflows to
      // infinity, or where a very large subnormal value overflows to become
      // normal.
      return result + halfbit_adjusted;
    } else {
      VIXL_ASSERT(round_mode == FPRoundOdd);
      // If any bits at position halfbit or below are set, onebit (i.e. the
      // bottom bit of the resulting mantissa) must be set.
      uint64_t fractional_bits = mantissa & ((UINT64_C(1) << shift) - 1);
      if (fractional_bits != 0) {
        mantissa |= UINT64_C(1) << shift;
      }

      return (sign << sign_offset) |
             (exponent << exponent_offset) |
             ((mantissa >> shift) << mantissa_offset);
    }
  } else {
    // We have to shift the mantissa to the left (or not at all). The input
    // mantissa is exactly representable in the output mantissa, so apply no
    // rounding correction.
    return (sign << sign_offset) |
           (exponent << exponent_offset) |
           ((mantissa << -shift) << mantissa_offset);
  }
}

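The onebit/halfbit adjustment in the table above can be exercised in isolation on a bare mantissa. A minimal sketch, assuming shift is at least 1 (the function name is illustrative, not part of the library):

#include <cstdint>

// Round 'mantissa >> shift' to nearest, ties to even: a halfbit with nothing
// set below it only rounds up when the result would otherwise be odd.
uint64_t RoundShiftTiesToEven(uint64_t mantissa, int shift) {
  uint64_t onebit = (mantissa >> shift) & 1;
  uint64_t halfbit = (mantissa >> (shift - 1)) & 1;
  uint64_t adjusted = mantissa - (halfbit & ~onebit);
  uint64_t halfbit_adjusted = (adjusted >> (shift - 1)) & 1;
  return (mantissa >> shift) + halfbit_adjusted;
}
// With shift = 1: 0b101 (2.5) -> 0b10 (2), 0b111 (3.5) -> 0b100 (4), and
// 0b110 (3.0) -> 0b11 (3); exact halves round to the even neighbour.
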
// Representation of memory, with typed getters and setters for access.
class Memory {
@ -988,7 +1183,7 @@ class Simulator : public DecoderVisitor {

  PrintRegisterFormat GetPrintRegisterFormatForSizeFP(unsigned size) {
    switch (size) {
      default: VIXL_UNREACHABLE();
      default: VIXL_UNREACHABLE(); return kPrintDReg;
      case kDRegSizeInBytes: return kPrintDReg;
      case kSRegSizeInBytes: return kPrintSReg;
    }
@ -1170,7 +1365,8 @@ class Simulator : public DecoderVisitor {
        return !Z() && (N() == V());
      case le:
        return !(!Z() && (N() == V()));
      case nv:  // Fall through.
      case nv:
        VIXL_FALLTHROUGH();
      case al:
        return true;
      default:
@ -2317,8 +2513,6 @@ class Simulator : public DecoderVisitor {

  void SysOp_W(int op, int64_t val);

  template <typename T>
  T FPDefaultNaN() const;
  template <typename T>
  T FPRecipSqrtEstimate(T op);
  template <typename T>
@ -2326,7 +2520,7 @@ class Simulator : public DecoderVisitor {
  template <typename T, typename R>
  R FPToFixed(T op, int fbits, bool is_signed, FPRounding rounding);

  void FPCompare(double val0, double val1);
  void FPCompare(double val0, double val1, FPTrapFlags trap);
  double FPRoundInt(double value, FPRounding round_mode);
  double FPToDouble(float value);
  float FPToFloat(double value, FPRounding round_mode);
@ -2389,18 +2583,8 @@ class Simulator : public DecoderVisitor {
  // for cumulative exception bits or floating-point exceptions.
  void FPProcessException() { }

  // Standard NaN processing.
  template <typename T>
  T FPProcessNaN(T op);

  bool FPProcessNaNs(const Instruction* instr);

  template <typename T>
  T FPProcessNaNs(T op1, T op2);

  template <typename T>
  T FPProcessNaNs3(T op1, T op2, T op3);

  // Pseudo Printf instruction
  void DoPrintf(const Instruction* instr);

@ -2478,6 +2662,58 @@ class Simulator : public DecoderVisitor {
  static const Instruction* kEndOfSimAddress;

 private:
  template <typename T>
  static T FPDefaultNaN();

  // Standard NaN processing.
  template <typename T>
  T FPProcessNaN(T op) {
    VIXL_ASSERT(std::isnan(op));
    if (IsSignallingNaN(op)) {
      FPProcessException();
    }
    return DN() ? FPDefaultNaN<T>() : ToQuietNaN(op);
  }

  template <typename T>
  T FPProcessNaNs(T op1, T op2) {
    if (IsSignallingNaN(op1)) {
      return FPProcessNaN(op1);
    } else if (IsSignallingNaN(op2)) {
      return FPProcessNaN(op2);
    } else if (std::isnan(op1)) {
      VIXL_ASSERT(IsQuietNaN(op1));
      return FPProcessNaN(op1);
    } else if (std::isnan(op2)) {
      VIXL_ASSERT(IsQuietNaN(op2));
      return FPProcessNaN(op2);
    } else {
      return 0.0;
    }
  }

  template <typename T>
  T FPProcessNaNs3(T op1, T op2, T op3) {
    if (IsSignallingNaN(op1)) {
      return FPProcessNaN(op1);
    } else if (IsSignallingNaN(op2)) {
      return FPProcessNaN(op2);
    } else if (IsSignallingNaN(op3)) {
      return FPProcessNaN(op3);
    } else if (std::isnan(op1)) {
      VIXL_ASSERT(IsQuietNaN(op1));
      return FPProcessNaN(op1);
    } else if (std::isnan(op2)) {
      VIXL_ASSERT(IsQuietNaN(op2));
      return FPProcessNaN(op2);
    } else if (std::isnan(op3)) {
      VIXL_ASSERT(IsQuietNaN(op3));
      return FPProcessNaN(op3);
    } else {
      return 0.0;
    }
  }

  bool coloured_trace_;

  // A set of TraceParameters flags.
@ -24,8 +24,8 @@
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "code-buffer.h"
#include "utils.h"
#include "vixl/code-buffer.h"
#include "vixl/utils.h"

namespace vixl {

@ -28,7 +28,7 @@
#define VIXL_CODE_BUFFER_H

#include <string.h>
#include "globals.h"
#include "vixl/globals.h"

namespace vixl {

144
src/vixl/compiler-intrinsics.cc
Normal file
144
src/vixl/compiler-intrinsics.cc
Normal file
@ -0,0 +1,144 @@
// Copyright 2015, ARM Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//   * Neither the name of ARM Limited nor the names of its contributors may be
//     used to endorse or promote products derived from this software without
//     specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

#include "compiler-intrinsics.h"

namespace vixl {


int CountLeadingSignBitsFallBack(int64_t value, int width) {
  VIXL_ASSERT(IsPowerOf2(width) && (width <= 64));
  if (value >= 0) {
    return CountLeadingZeros(value, width) - 1;
  } else {
    return CountLeadingZeros(~value, width) - 1;
  }
}


int CountLeadingZerosFallBack(uint64_t value, int width) {
  VIXL_ASSERT(IsPowerOf2(width) && (width <= 64));
  if (value == 0) {
    return width;
  }
  int count = 0;
  value = value << (64 - width);
  if ((value & UINT64_C(0xffffffff00000000)) == 0) {
    count += 32;
    value = value << 32;
  }
  if ((value & UINT64_C(0xffff000000000000)) == 0) {
    count += 16;
    value = value << 16;
  }
  if ((value & UINT64_C(0xff00000000000000)) == 0) {
    count += 8;
    value = value << 8;
  }
  if ((value & UINT64_C(0xf000000000000000)) == 0) {
    count += 4;
    value = value << 4;
  }
  if ((value & UINT64_C(0xc000000000000000)) == 0) {
    count += 2;
    value = value << 2;
  }
  if ((value & UINT64_C(0x8000000000000000)) == 0) {
    count += 1;
  }
  count += (value == 0);
  return count;
}

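The loop-free search above halves the remaining range at each step, so it always finishes in six mask tests regardless of the input. A few spot checks, assuming the function behaves as written:

#include <cassert>
#include <cstdint>

void CheckClzFallBack() {
  assert(CountLeadingZerosFallBack(0, 64) == 64);      // Early-out case.
  assert(CountLeadingZerosFallBack(1, 64) == 63);      // Lowest bit only.
  assert(CountLeadingZerosFallBack(UINT64_C(0x8000000000000000), 64) == 0);
  assert(CountLeadingZerosFallBack(0x00ff, 16) == 8);  // Narrow width.
}
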
int CountSetBitsFallBack(uint64_t value, int width) {
  VIXL_ASSERT(IsPowerOf2(width) && (width <= 64));

  // Mask out unused bits to ensure that they are not counted.
  value &= (UINT64_C(0xffffffffffffffff) >> (64 - width));

  // Add up the set bits.
  // The algorithm works by adding pairs of bit fields together iteratively,
  // where the size of each bit field doubles each time.
  // An example for an 8-bit value:
  // Bits:   h  g  f  e  d  c  b  a
  //          \ |   \ |   \ |   \ |
  // value = h+g   f+e   d+c   b+a
  //             \  |       \  |
  // value =  h+g+f+e     d+c+b+a
  //                 \        |
  // value =   h+g+f+e+d+c+b+a
  const uint64_t kMasks[] = {
    UINT64_C(0x5555555555555555),
    UINT64_C(0x3333333333333333),
    UINT64_C(0x0f0f0f0f0f0f0f0f),
    UINT64_C(0x00ff00ff00ff00ff),
    UINT64_C(0x0000ffff0000ffff),
    UINT64_C(0x00000000ffffffff),
  };

  for (unsigned i = 0; i < (sizeof(kMasks) / sizeof(kMasks[0])); i++) {
    int shift = 1 << i;
    value = ((value >> shift) & kMasks[i]) + (value & kMasks[i]);
  }

  return value;
}

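A worked instance of the pairwise addition for the byte 0b10110101 (0xb5, five set bits), following the three mask rounds that matter for 8 bits:

// value = 0b10110101                                          (0xb5)
//   round 1 (mask 0x55): 0b01010000 + 0b00010101 = 0b01100101 (bit pairs)
//   round 2 (mask 0x33): 0b00010001 + 0b00100001 = 0b00110010 (nibbles)
//   round 3 (mask 0x0f): 0b00000011 + 0b00000010 = 0b00000101 (= 5)
// so, assuming the function above:
//   assert(CountSetBitsFallBack(0xb5, 8) == 5);
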
int CountTrailingZerosFallBack(uint64_t value, int width) {
  VIXL_ASSERT(IsPowerOf2(width) && (width <= 64));
  int count = 0;
  value = value << (64 - width);
  if ((value & UINT64_C(0xffffffff)) == 0) {
    count += 32;
    value = value >> 32;
  }
  if ((value & 0xffff) == 0) {
    count += 16;
    value = value >> 16;
  }
  if ((value & 0xff) == 0) {
    count += 8;
    value = value >> 8;
  }
  if ((value & 0xf) == 0) {
    count += 4;
    value = value >> 4;
  }
  if ((value & 0x3) == 0) {
    count += 2;
    value = value >> 2;
  }
  if ((value & 0x1) == 0) {
    count += 1;
  }
  count += (value == 0);
  return count - (64 - width);
}


}  // namespace vixl
155
src/vixl/compiler-intrinsics.h
Normal file
155
src/vixl/compiler-intrinsics.h
Normal file
@ -0,0 +1,155 @@
// Copyright 2015, ARM Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//   * Neither the name of ARM Limited nor the names of its contributors may be
//     used to endorse or promote products derived from this software without
//     specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.


#ifndef VIXL_COMPILER_INTRINSICS_H
#define VIXL_COMPILER_INTRINSICS_H

#include "globals.h"

namespace vixl {

// Helper to check whether the version of GCC used is greater than the
// specified requirement.
#define MAJOR 1000000
#define MINOR 1000
#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
#define GCC_VERSION_OR_NEWER(major, minor, patchlevel) \
    ((__GNUC__ * MAJOR + __GNUC_MINOR__ * MINOR + __GNUC_PATCHLEVEL__) >= \
     ((major) * MAJOR + (minor) * MINOR + (patchlevel)))
#elif defined(__GNUC__) && defined(__GNUC_MINOR__)
#define GCC_VERSION_OR_NEWER(major, minor, patchlevel) \
    ((__GNUC__ * MAJOR + __GNUC_MINOR__ * MINOR) >= \
     ((major) * MAJOR + (minor) * MINOR + (patchlevel)))
#else
#define GCC_VERSION_OR_NEWER(major, minor, patchlevel) 0
#endif

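Both sides of the comparison are folded onto a single integer scale (major * 1000000 + minor * 1000 + patchlevel), so a hypothetical build with GCC 4.9.2 would evaluate:

// GCC_VERSION_OR_NEWER(4, 7, 0)
//   => (4 * 1000000 + 9 * 1000 + 2) >= (4 * 1000000 + 7 * 1000 + 0)
//   => 4009002 >= 4007000
//   => true
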
#if defined(__clang__) && !defined(VIXL_NO_COMPILER_BUILTINS)

#define COMPILER_HAS_BUILTIN_CLRSB (__has_builtin(__builtin_clrsb))
#define COMPILER_HAS_BUILTIN_CLZ (__has_builtin(__builtin_clz))
#define COMPILER_HAS_BUILTIN_CTZ (__has_builtin(__builtin_ctz))
#define COMPILER_HAS_BUILTIN_FFS (__has_builtin(__builtin_ffs))
#define COMPILER_HAS_BUILTIN_POPCOUNT (__has_builtin(__builtin_popcount))

#elif defined(__GNUC__) && !defined(VIXL_NO_COMPILER_BUILTINS)
// The documentation for these builtins is available at:
// https://gcc.gnu.org/onlinedocs/gcc-$MAJOR.$MINOR.$PATCHLEVEL/gcc//Other-Builtins.html

# define COMPILER_HAS_BUILTIN_CLRSB (GCC_VERSION_OR_NEWER(4, 7, 0))
# define COMPILER_HAS_BUILTIN_CLZ (GCC_VERSION_OR_NEWER(3, 4, 0))
# define COMPILER_HAS_BUILTIN_CTZ (GCC_VERSION_OR_NEWER(3, 4, 0))
# define COMPILER_HAS_BUILTIN_FFS (GCC_VERSION_OR_NEWER(3, 4, 0))
# define COMPILER_HAS_BUILTIN_POPCOUNT (GCC_VERSION_OR_NEWER(3, 4, 0))

#else
// One can define VIXL_NO_COMPILER_BUILTINS to force using the manually
// implemented C++ methods.

#define COMPILER_HAS_BUILTIN_BSWAP false
#define COMPILER_HAS_BUILTIN_CLRSB false
#define COMPILER_HAS_BUILTIN_CLZ false
#define COMPILER_HAS_BUILTIN_CTZ false
#define COMPILER_HAS_BUILTIN_FFS false
#define COMPILER_HAS_BUILTIN_POPCOUNT false

#endif

template<typename V>
inline bool IsPowerOf2(V value) {
  return (value != 0) && ((value & (value - 1)) == 0);
}


// Declaration of fallback functions.
int CountLeadingSignBitsFallBack(int64_t value, int width);
int CountLeadingZerosFallBack(uint64_t value, int width);
int CountSetBitsFallBack(uint64_t value, int width);
int CountTrailingZerosFallBack(uint64_t value, int width);


// Implementation of intrinsics functions.
// TODO: The implementations could be improved for sizes different from 32bit
// and 64bit: we could mask the values and call the appropriate builtin.

template<typename V>
inline int CountLeadingSignBits(V value, int width = (sizeof(V) * 8)) {
#if COMPILER_HAS_BUILTIN_CLRSB
  if (width == 32) {
    return __builtin_clrsb(value);
  } else if (width == 64) {
    return __builtin_clrsbll(value);
  }
#endif
  return CountLeadingSignBitsFallBack(value, width);
}


template<typename V>
inline int CountLeadingZeros(V value, int width = (sizeof(V) * 8)) {
#if COMPILER_HAS_BUILTIN_CLZ
  if (width == 32) {
    return (value == 0) ? 32 : __builtin_clz(value);
  } else if (width == 64) {
    return (value == 0) ? 64 : __builtin_clzll(value);
  }
#endif
  return CountLeadingZerosFallBack(value, width);
}


template<typename V>
inline int CountSetBits(V value, int width = (sizeof(V) * 8)) {
#if COMPILER_HAS_BUILTIN_POPCOUNT
  if (width == 32) {
    return __builtin_popcount(value);
  } else if (width == 64) {
    return __builtin_popcountll(value);
  }
#endif
  return CountSetBitsFallBack(value, width);
}


template<typename V>
inline int CountTrailingZeros(V value, int width = (sizeof(V) * 8)) {
#if COMPILER_HAS_BUILTIN_CTZ
  if (width == 32) {
    return (value == 0) ? 32 : __builtin_ctz(value);
  } else if (width == 64) {
    return (value == 0) ? 64 : __builtin_ctzll(value);
  }
#endif
  return CountTrailingZerosFallBack(value, width);
}

}  // namespace vixl

#endif  // VIXL_COMPILER_INTRINSICS_H

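Callers pick up the operand width from the argument type, so the usual call sites pass no explicit width. A few illustrative values, assuming the templates behave as written above:

#include <cstdint>
// CountLeadingZeros(UINT32_C(0x00ff0000))         == 8
// CountLeadingZeros(UINT64_C(0x00ff000000000000)) == 8
// CountSetBits(UINT32_C(0xb5))                    == 5
// CountTrailingZeros(UINT32_C(0x58))              == 3   // 0b1011000.
// CountLeadingSignBits(INT32_C(-1))               == 31  // All sign bits.
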
@ -49,7 +49,7 @@
#include <stdint.h>
#include <stdlib.h>
#include <stddef.h>
#include "platform.h"
#include "vixl/platform.h"


typedef uint8_t byte;
@ -88,4 +88,20 @@ template <typename T> inline void USE(T, T, T, T) {}

#define VIXL_ALIGNMENT_EXCEPTION() printf("ALIGNMENT EXCEPTION\t"); VIXL_ABORT()

// The clang::fallthrough attribute is used along with the Wimplicit-fallthrough
// argument to annotate intentional fall-through between switch labels.
// For more information please refer to:
// http://clang.llvm.org/docs/AttributeReference.html#fallthrough-clang-fallthrough
#ifndef __has_warning
#define __has_warning(x) 0
#endif

// Note: this option is only available for Clang, and will only be enabled for
// C++11 (201103L).
#if __has_warning("-Wimplicit-fallthrough") && __cplusplus >= 201103L
#define VIXL_FALLTHROUGH() [[clang::fallthrough]] //NOLINT
#else
#define VIXL_FALLTHROUGH() do {} while (0)
#endif

#endif  // VIXL_GLOBALS_H
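A minimal usage sketch (the function itself is hypothetical): the macro sits exactly where a break would otherwise go, so Clang's -Wimplicit-fallthrough treats the fall-through as deliberate while other toolchains see a harmless no-op.

// Count how many case labels at or below 'n' execute.
int CountHandledCases(int n) {
  int handled = 0;
  switch (n) {
    case 3:
      handled++;
      VIXL_FALLTHROUGH();
    case 2:
      handled++;
      VIXL_FALLTHROUGH();
    case 1:
      handled++;
      break;
    default:
      break;
  }
  return handled;
}
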
@ -32,7 +32,7 @@
#include <algorithm>
#include <vector>

#include "globals.h"
#include "vixl/globals.h"

namespace vixl {

@ -250,7 +250,7 @@ template<class S> class InvalSetIterator {

  // Indicates if the iterator is looking at the vector or at the preallocated
  // elements.
  bool using_vector_;
  const bool using_vector_;
  // Used when looking at the preallocated elements, or in debug mode when using
  // the vector to track how many times the iterator has advanced.
  size_t index_;
@ -657,13 +657,14 @@ void InvalSet<TEMPLATE_INVALSET_P_DEF>::ReclaimMemory() {

template<class S>
InvalSetIterator<S>::InvalSetIterator(S* inval_set)
    : using_vector_(false), index_(0), inval_set_(inval_set) {
    : using_vector_((inval_set != NULL) && inval_set->IsUsingVector()),
      index_(0),
      inval_set_(inval_set) {
  if (inval_set != NULL) {
    inval_set->Sort(S::kSoftSort);
#ifdef VIXL_DEBUG
    inval_set->Acquire();
#endif
    using_vector_ = inval_set->IsUsingVector();
    if (using_vector_) {
      iterator_ = typename std::vector<ElementType>::iterator(
          inval_set_->vector_->begin());
@ -24,7 +24,7 @@
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "utils.h"
#include "vixl/utils.h"
#include <stdio.h>

namespace vixl {

@ -127,91 +127,6 @@ int float16classify(float16 value) {
}


int CountLeadingZeros(uint64_t value, int width) {
  VIXL_ASSERT((width == 8) || (width == 16) || (width == 32) || (width == 64));
  int count = 0;
  uint64_t bit_test = UINT64_C(1) << (width - 1);
  while ((count < width) && ((bit_test & value) == 0)) {
    count++;
    bit_test >>= 1;
  }
  return count;
}


int CountLeadingSignBits(int64_t value, int width) {
  VIXL_ASSERT((width == 8) || (width == 16) || (width == 32) || (width == 64));
  if (value >= 0) {
    return CountLeadingZeros(value, width) - 1;
  } else {
    return CountLeadingZeros(~value, width) - 1;
  }
}


int CountTrailingZeros(uint64_t value, int width) {
  VIXL_ASSERT((width == 32) || (width == 64));
  int count = 0;
  while ((count < width) && (((value >> count) & 1) == 0)) {
    count++;
  }
  return count;
}


int CountSetBits(uint64_t value, int width) {
  // TODO: Other widths could be added here, as the implementation already
  // supports them.
  VIXL_ASSERT((width == 32) || (width == 64));

  // Mask out unused bits to ensure that they are not counted.
  value &= (UINT64_C(0xffffffffffffffff) >> (64-width));

  // Add up the set bits.
  // The algorithm works by adding pairs of bit fields together iteratively,
  // where the size of each bit field doubles each time.
  // An example for an 8-bit value:
  // Bits:   h  g  f  e  d  c  b  a
  //          \ |   \ |   \ |   \ |
  // value = h+g   f+e   d+c   b+a
  //             \  |       \  |
  // value =  h+g+f+e     d+c+b+a
  //                 \        |
  // value =   h+g+f+e+d+c+b+a
  const uint64_t kMasks[] = {
    UINT64_C(0x5555555555555555),
    UINT64_C(0x3333333333333333),
    UINT64_C(0x0f0f0f0f0f0f0f0f),
    UINT64_C(0x00ff00ff00ff00ff),
    UINT64_C(0x0000ffff0000ffff),
    UINT64_C(0x00000000ffffffff),
  };

  for (unsigned i = 0; i < (sizeof(kMasks) / sizeof(kMasks[0])); i++) {
    int shift = 1 << i;
    value = ((value >> shift) & kMasks[i]) + (value & kMasks[i]);
  }

  return value;
}


uint64_t LowestSetBit(uint64_t value) {
  return value & -value;
}


int HighestSetBitPosition(uint64_t number) {
  VIXL_ASSERT(number != 0);
  return 63 - CountLeadingZeros(number, 64);
}


bool IsPowerOf2(int64_t value) {
  return (value != 0) && ((value & (value - 1)) == 0);
}


unsigned CountClearHalfWords(uint64_t imm, unsigned reg_size) {
  VIXL_ASSERT((reg_size % 8) == 0);
  int count = 0;
@ -27,9 +27,10 @@
#ifndef VIXL_UTILS_H
#define VIXL_UTILS_H

#include <math.h>
#include <string.h>
#include "globals.h"
#include <cmath>
#include "vixl/globals.h"
#include "vixl/compiler-intrinsics.h"

namespace vixl {

@ -121,7 +122,7 @@ int float16classify(float16 value);
inline bool IsSignallingNaN(double num) {
  const uint64_t kFP64QuietNaNMask = UINT64_C(0x0008000000000000);
  uint64_t raw = double_to_rawbits(num);
  if (isnan(num) && ((raw & kFP64QuietNaNMask) == 0)) {
  if (std::isnan(num) && ((raw & kFP64QuietNaNMask) == 0)) {
    return true;
  }
  return false;
@ -131,7 +132,7 @@ inline bool IsSignallingNaN(double num) {
inline bool IsSignallingNaN(float num) {
  const uint32_t kFP32QuietNaNMask = 0x00400000;
  uint32_t raw = float_to_rawbits(num);
  if (isnan(num) && ((raw & kFP32QuietNaNMask) == 0)) {
  if (std::isnan(num) && ((raw & kFP32QuietNaNMask) == 0)) {
    return true;
  }
  return false;
@ -147,21 +148,21 @@ inline bool IsSignallingNaN(float16 num) {

template <typename T>
inline bool IsQuietNaN(T num) {
  return isnan(num) && !IsSignallingNaN(num);
  return std::isnan(num) && !IsSignallingNaN(num);
}


// Convert the NaN in 'num' to a quiet NaN.
inline double ToQuietNaN(double num) {
  const uint64_t kFP64QuietNaNMask = UINT64_C(0x0008000000000000);
  VIXL_ASSERT(isnan(num));
  VIXL_ASSERT(std::isnan(num));
  return rawbits_to_double(double_to_rawbits(num) | kFP64QuietNaNMask);
}


inline float ToQuietNaN(float num) {
  const uint32_t kFP32QuietNaNMask = 0x00400000;
  VIXL_ASSERT(isnan(num));
  VIXL_ASSERT(std::isnan(num));
  return rawbits_to_float(float_to_rawbits(num) | kFP32QuietNaNMask);
}

@ -177,14 +178,23 @@ inline float FusedMultiplyAdd(float op1, float op2, float a) {
}


// Bit counting.
int CountLeadingZeros(uint64_t value, int width);
int CountLeadingSignBits(int64_t value, int width);
int CountTrailingZeros(uint64_t value, int width);
int CountSetBits(uint64_t value, int width);
uint64_t LowestSetBit(uint64_t value);
int HighestSetBitPosition(uint64_t value);
bool IsPowerOf2(int64_t value);
inline uint64_t LowestSetBit(uint64_t value) {
  return value & -value;
}


template<typename T>
inline int HighestSetBitPosition(T value) {
  VIXL_ASSERT(value != 0);
  return (sizeof(value) * 8 - 1) - CountLeadingZeros(value);
}


template<typename V>
inline int WhichPowerOf2(V value) {
  VIXL_ASSERT(IsPowerOf2(value));
  return CountTrailingZeros(value);
}


unsigned CountClearHalfWords(uint64_t imm, unsigned reg_size);

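The value & -value idiom in LowestSetBit works because two's complement negation flips every bit above the lowest set bit and leaves that bit, and the zeros below it, unchanged; the AND therefore keeps only that bit. For example:

//  value          = 0b1011000      (0x58)
// -value          = 0b...10101000  (~value + 1)
//  value & -value = 0b0001000      (0x08)
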
@ -24,9 +24,9 @@
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "a64/macro-assembler-a64.h"
#include "a64/debugger-a64.h"
#include "a64/simulator-a64.h"
#include "vixl/a64/macro-assembler-a64.h"
#include "vixl/a64/debugger-a64.h"
#include "vixl/a64/simulator-a64.h"
#include "examples.h"
#include "non-const-visitor.h"
#include "custom-disassembler.h"

@ -27,16 +27,16 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <cmath>

#include "test-runner.h"
#include "test-utils-a64.h"
#include "a64/macro-assembler-a64.h"
#include "a64/simulator-a64.h"
#include "a64/debugger-a64.h"
#include "a64/disasm-a64.h"
#include "a64/cpu-a64.h"
#include "vixl/a64/macro-assembler-a64.h"
#include "vixl/a64/simulator-a64.h"
#include "vixl/a64/debugger-a64.h"
#include "vixl/a64/disasm-a64.h"
#include "vixl/a64/cpu-a64.h"

namespace vixl {

@ -1072,28 +1072,28 @@ TEST(mul) {
  SETUP();

  START();
  __ Mov(x16, 0);
  __ Mov(x17, 1);
  __ Mov(x25, 0);
  __ Mov(x26, 1);
  __ Mov(x18, 0xffffffff);
  __ Mov(x19, 0xffffffffffffffff);

  __ Mul(w0, w16, w16);
  __ Mul(w1, w16, w17);
  __ Mul(w2, w17, w18);
  __ Mul(w0, w25, w25);
  __ Mul(w1, w25, w26);
  __ Mul(w2, w26, w18);
  __ Mul(w3, w18, w19);
  __ Mul(x4, x16, x16);
  __ Mul(x5, x17, x18);
  __ Mul(x4, x25, x25);
  __ Mul(x5, x26, x18);
  __ Mul(x6, x18, x19);
  __ Mul(x7, x19, x19);
  __ Smull(x8, w17, w18);
  __ Smull(x8, w26, w18);
  __ Smull(x9, w18, w18);
  __ Smull(x10, w19, w19);
  __ Mneg(w11, w16, w16);
  __ Mneg(w12, w16, w17);
  __ Mneg(w13, w17, w18);
  __ Mneg(w11, w25, w25);
  __ Mneg(w12, w25, w26);
  __ Mneg(w13, w26, w18);
  __ Mneg(w14, w18, w19);
  __ Mneg(x20, x16, x16);
  __ Mneg(x21, x17, x18);
  __ Mneg(x20, x25, x25);
  __ Mneg(x21, x26, x18);
  __ Mneg(x22, x18, x19);
  __ Mneg(x23, x19, x19);
  END();
@ -1333,6 +1333,54 @@ TEST(smulh) {
}

TEST(umulh) {
  SETUP();

  START();
  __ Mov(x20, 0);
  __ Mov(x21, 1);
  __ Mov(x22, 0x0000000100000000);
  __ Mov(x23, 0x0000000012345678);
  __ Mov(x24, 0x0123456789abcdef);
  __ Mov(x25, 0x0000000200000000);
  __ Mov(x26, 0x8000000000000000);
  __ Mov(x27, 0xffffffffffffffff);
  __ Mov(x28, 0x5555555555555555);
  __ Mov(x29, 0xaaaaaaaaaaaaaaaa);

  __ Umulh(x0, x20, x24);
  __ Umulh(x1, x21, x24);
  __ Umulh(x2, x22, x23);
  __ Umulh(x3, x22, x24);
  __ Umulh(x4, x24, x25);
  __ Umulh(x5, x23, x27);
  __ Umulh(x6, x26, x26);
  __ Umulh(x7, x26, x27);
  __ Umulh(x8, x27, x27);
  __ Umulh(x9, x28, x28);
  __ Umulh(x10, x28, x29);
  __ Umulh(x11, x29, x29);
  END();

  RUN();

  ASSERT_EQUAL_64(0, x0);
  ASSERT_EQUAL_64(0, x1);
  ASSERT_EQUAL_64(0, x2);
  ASSERT_EQUAL_64(0x0000000001234567, x3);
  ASSERT_EQUAL_64(0x0000000002468acf, x4);
  ASSERT_EQUAL_64(0x0000000012345677, x5);
  ASSERT_EQUAL_64(0x4000000000000000, x6);
  ASSERT_EQUAL_64(0x7fffffffffffffff, x7);
  ASSERT_EQUAL_64(0xfffffffffffffffe, x8);
  ASSERT_EQUAL_64(0x1c71c71c71c71c71, x9);
  ASSERT_EQUAL_64(0x38e38e38e38e38e3, x10);
  ASSERT_EQUAL_64(0x71c71c71c71c71c6, x11);

  TEARDOWN();
}

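The expected values can be cross-checked on the host with 128-bit arithmetic; this sketch assumes a compiler that provides the non-standard but widely supported __int128 extension:

#include <cstdint>

// Reference model for UMULH: the high 64 bits of an unsigned 64x64 multiply.
uint64_t UmulhReference(uint64_t a, uint64_t b) {
  unsigned __int128 product = static_cast<unsigned __int128>(a) * b;
  return static_cast<uint64_t>(product >> 64);
}
// e.g. UmulhReference(0x5555555555555555, 0x5555555555555555)
//        == 0x1c71c71c71c71c71, matching the x9 expectation above.
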
TEST(smaddl_umaddl_umull) {
  SETUP();

@ -9446,26 +9494,26 @@ static float MinMaxHelper(float n,
  uint32_t raw_n = float_to_rawbits(n);
  uint32_t raw_m = float_to_rawbits(m);

  if (isnan(n) && ((raw_n & kFP32QuietNaNMask) == 0)) {
  if (std::isnan(n) && ((raw_n & kFP32QuietNaNMask) == 0)) {
    // n is signalling NaN.
    return rawbits_to_float(raw_n | kFP32QuietNaNMask);
  } else if (isnan(m) && ((raw_m & kFP32QuietNaNMask) == 0)) {
  } else if (std::isnan(m) && ((raw_m & kFP32QuietNaNMask) == 0)) {
    // m is signalling NaN.
    return rawbits_to_float(raw_m | kFP32QuietNaNMask);
  } else if (quiet_nan_substitute == 0.0) {
    if (isnan(n)) {
    if (std::isnan(n)) {
      // n is quiet NaN.
      return n;
    } else if (isnan(m)) {
    } else if (std::isnan(m)) {
      // m is quiet NaN.
      return m;
    }
  } else {
    // Substitute n or m if one is quiet, but not both.
    if (isnan(n) && !isnan(m)) {
    if (std::isnan(n) && !std::isnan(m)) {
      // n is quiet NaN: replace with substitute.
      n = quiet_nan_substitute;
    } else if (!isnan(n) && isnan(m)) {
    } else if (!std::isnan(n) && std::isnan(m)) {
      // m is quiet NaN: replace with substitute.
      m = quiet_nan_substitute;
    }
@ -9488,26 +9536,26 @@ static double MinMaxHelper(double n,
  uint64_t raw_n = double_to_rawbits(n);
  uint64_t raw_m = double_to_rawbits(m);

  if (isnan(n) && ((raw_n & kFP64QuietNaNMask) == 0)) {
  if (std::isnan(n) && ((raw_n & kFP64QuietNaNMask) == 0)) {
    // n is signalling NaN.
    return rawbits_to_double(raw_n | kFP64QuietNaNMask);
  } else if (isnan(m) && ((raw_m & kFP64QuietNaNMask) == 0)) {
  } else if (std::isnan(m) && ((raw_m & kFP64QuietNaNMask) == 0)) {
    // m is signalling NaN.
    return rawbits_to_double(raw_m | kFP64QuietNaNMask);
  } else if (quiet_nan_substitute == 0.0) {
    if (isnan(n)) {
    if (std::isnan(n)) {
      // n is quiet NaN.
      return n;
    } else if (isnan(m)) {
    } else if (std::isnan(m)) {
      // m is quiet NaN.
      return m;
    }
  } else {
    // Substitute n or m if one is quiet, but not both.
    if (isnan(n) && !isnan(m)) {
    if (std::isnan(n) && !std::isnan(m)) {
      // n is quiet NaN: replace with substitute.
      n = quiet_nan_substitute;
    } else if (!isnan(n) && isnan(m)) {
    } else if (!std::isnan(n) && std::isnan(m)) {
      // m is quiet NaN: replace with substitute.
      m = quiet_nan_substitute;
    }
@ -9700,6 +9748,10 @@ TEST(fccmp) {
  __ Fmov(d18, -0.5);
  __ Fmov(d19, -1.0);
  __ Mov(x20, 0);
  __ Mov(x21, 0x7ff0000000000001);  // Double precision NaN.
  __ Fmov(d21, x21);
  __ Mov(w22, 0x7f800001);  // Single precision NaN.
  __ Fmov(s22, w22);

  __ Cmp(x20, 0);
  __ Fccmp(s16, s16, NoFlag, eq);
@ -9739,6 +9791,22 @@ TEST(fccmp) {

  __ fccmp(d18, d18, NFlag, nv);
  __ Mrs(x9, NZCV);

  __ Cmp(x20, 0);
  __ Fccmpe(s16, s16, NoFlag, eq);
  __ Mrs(x10, NZCV);

  __ Cmp(x20, 0);
  __ Fccmpe(d18, d19, ZCVFlag, ls);
  __ Mrs(x11, NZCV);

  __ Cmp(x20, 0);
  __ Fccmpe(d21, d21, NoFlag, eq);
  __ Mrs(x12, NZCV);

  __ Cmp(x20, 0);
  __ Fccmpe(s22, s22, NoFlag, eq);
  __ Mrs(x13, NZCV);
  END();

  RUN();
@ -9753,6 +9821,10 @@ TEST(fccmp) {
  ASSERT_EQUAL_32(NFlag, w7);
  ASSERT_EQUAL_32(ZCFlag, w8);
  ASSERT_EQUAL_32(ZCFlag, w9);
  ASSERT_EQUAL_32(ZCFlag, w10);
  ASSERT_EQUAL_32(CFlag, w11);
  ASSERT_EQUAL_32(CVFlag, w12);
  ASSERT_EQUAL_32(CVFlag, w13);

  TEARDOWN();
}
@ -9813,6 +9885,19 @@ TEST(fcmp) {
    __ Fcmp(d19, 12.3456);
    temps.Exclude(d0);
    __ Mrs(x16, NZCV);

    __ Fcmpe(s8, s8);
    __ Mrs(x22, NZCV);
    __ Fcmpe(s8, 0.0);
    __ Mrs(x23, NZCV);
    __ Fcmpe(d19, d19);
    __ Mrs(x24, NZCV);
    __ Fcmpe(d19, 0.0);
    __ Mrs(x25, NZCV);
    __ Fcmpe(s18, s18);
    __ Mrs(x26, NZCV);
    __ Fcmpe(d21, d21);
    __ Mrs(x27, NZCV);
  }

  END();
@ -9833,6 +9918,12 @@ TEST(fcmp) {
  ASSERT_EQUAL_32(CVFlag, w14);
  ASSERT_EQUAL_32(ZCFlag, w15);
  ASSERT_EQUAL_32(NFlag, w16);
  ASSERT_EQUAL_32(ZCFlag, w22);
  ASSERT_EQUAL_32(ZCFlag, w23);
  ASSERT_EQUAL_32(ZCFlag, w24);
  ASSERT_EQUAL_32(ZCFlag, w25);
  ASSERT_EQUAL_32(CVFlag, w26);
  ASSERT_EQUAL_32(CVFlag, w27);

  TEARDOWN();
}
@ -11869,16 +11960,16 @@ static void TestUScvtfHelper(uint64_t in,
  double expected_ucvtf_base = rawbits_to_double(expected_ucvtf_bits);

  for (int fbits = 0; fbits <= 32; fbits++) {
    double expected_scvtf = expected_scvtf_base / pow(2, fbits);
    double expected_ucvtf = expected_ucvtf_base / pow(2, fbits);
    double expected_scvtf = expected_scvtf_base / std::pow(2, fbits);
    double expected_ucvtf = expected_ucvtf_base / std::pow(2, fbits);
    ASSERT_EQUAL_FP64(expected_scvtf, results_scvtf_x[fbits]);
    ASSERT_EQUAL_FP64(expected_ucvtf, results_ucvtf_x[fbits]);
    if (cvtf_s32) ASSERT_EQUAL_FP64(expected_scvtf, results_scvtf_w[fbits]);
    if (cvtf_u32) ASSERT_EQUAL_FP64(expected_ucvtf, results_ucvtf_w[fbits]);
  }
  for (int fbits = 33; fbits <= 64; fbits++) {
    double expected_scvtf = expected_scvtf_base / pow(2, fbits);
    double expected_ucvtf = expected_ucvtf_base / pow(2, fbits);
    double expected_scvtf = expected_scvtf_base / std::pow(2, fbits);
    double expected_ucvtf = expected_ucvtf_base / std::pow(2, fbits);
    ASSERT_EQUAL_FP64(expected_scvtf, results_scvtf_x[fbits]);
    ASSERT_EQUAL_FP64(expected_ucvtf, results_ucvtf_x[fbits]);
  }
@ -12023,18 +12114,16 @@ static void TestUScvtf32Helper(uint64_t in,
  float expected_ucvtf_base = rawbits_to_float(expected_ucvtf_bits);

  for (int fbits = 0; fbits <= 32; fbits++) {
    float expected_scvtf = expected_scvtf_base / powf(2, fbits);
    float expected_ucvtf = expected_ucvtf_base / powf(2, fbits);
    float expected_scvtf = expected_scvtf_base / std::pow(2.0f, fbits);
    float expected_ucvtf = expected_ucvtf_base / std::pow(2.0f, fbits);
    ASSERT_EQUAL_FP32(expected_scvtf, results_scvtf_x[fbits]);
    ASSERT_EQUAL_FP32(expected_ucvtf, results_ucvtf_x[fbits]);
    if (cvtf_s32) ASSERT_EQUAL_FP32(expected_scvtf, results_scvtf_w[fbits]);
    if (cvtf_u32) ASSERT_EQUAL_FP32(expected_ucvtf, results_ucvtf_w[fbits]);
    break;
  }
  for (int fbits = 33; fbits <= 64; fbits++) {
    break;
    float expected_scvtf = expected_scvtf_base / powf(2, fbits);
    float expected_ucvtf = expected_ucvtf_base / powf(2, fbits);
    float expected_scvtf = expected_scvtf_base / std::pow(2.0f, fbits);
    float expected_ucvtf = expected_ucvtf_base / std::pow(2.0f, fbits);
    ASSERT_EQUAL_FP32(expected_scvtf, results_scvtf_x[fbits]);
    ASSERT_EQUAL_FP32(expected_ucvtf, results_ucvtf_x[fbits]);
  }
@ -12617,6 +12706,10 @@ TEST(peek_poke_mixed) {
  SETUP();
  START();

  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
  UseScratchRegisterScope temps(&masm);
  temps.ExcludeAll();

  // The literal base is chosen to have two useful properties:
  //  * When multiplied by small values (such as a register index), this value
  //    is clearly readable in the result.
@ -12687,6 +12780,10 @@ TEST(peek_poke_reglist) {
  SETUP();
  START();

  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
  UseScratchRegisterScope temps(&masm);
  temps.ExcludeAll();

  // The literal base is chosen to have two useful properties:
  //  * When multiplied by small values (such as a register index), this value
  //    is clearly readable in the result.
@ -12769,6 +12866,121 @@ TEST(peek_poke_reglist) {
}

TEST(load_store_reglist) {
  SETUP();
  START();

  // The literal base is chosen to have two useful properties:
  // * When multiplied by small values (such as a register index), this value
  //   is clearly readable in the result.
  // * The value is not formed from repeating fixed-size smaller values, so it
  //   can be used to detect endianness-related errors.
  uint64_t high_base = UINT32_C(0x01000010);
  uint64_t low_base = UINT32_C(0x00100101);
  uint64_t base = (high_base << 32) | low_base;
  uint64_t array[21];
  memset(array, 0, sizeof(array));

  // Initialize the registers.
  __ Mov(x1, base);
  __ Add(x2, x1, x1);
  __ Add(x3, x2, x1);
  __ Add(x4, x3, x1);
  __ Fmov(d1, x1);
  __ Fmov(d2, x2);
  __ Fmov(d3, x3);
  __ Fmov(d4, x4);
  __ Fmov(d5, x1);
  __ Fmov(d6, x2);
  __ Fmov(d7, x3);
  __ Fmov(d8, x4);

  Register reg_base = x20;
  Register reg_index = x21;
  int size_stored = 0;

  __ Mov(reg_base, reinterpret_cast<uintptr_t>(&array));

  // Test aligned accesses.
  CPURegList list_src(w1, w2, w3, w4);
  CPURegList list_dst(w11, w12, w13, w14);
  CPURegList list_fp_src_1(d1, d2, d3, d4);
  CPURegList list_fp_dst_1(d11, d12, d13, d14);

  __ StoreCPURegList(list_src, MemOperand(reg_base, 0 * sizeof(uint64_t)));
  __ LoadCPURegList(list_dst, MemOperand(reg_base, 0 * sizeof(uint64_t)));
  size_stored += 4 * kWRegSizeInBytes;

  __ Mov(reg_index, size_stored);
  __ StoreCPURegList(list_src, MemOperand(reg_base, reg_index));
  __ LoadCPURegList(list_dst, MemOperand(reg_base, reg_index));
  size_stored += 4 * kWRegSizeInBytes;

  __ StoreCPURegList(list_fp_src_1, MemOperand(reg_base, size_stored));
  __ LoadCPURegList(list_fp_dst_1, MemOperand(reg_base, size_stored));
  size_stored += 4 * kDRegSizeInBytes;

  __ Mov(reg_index, size_stored);
  __ StoreCPURegList(list_fp_src_1, MemOperand(reg_base, reg_index));
  __ LoadCPURegList(list_fp_dst_1, MemOperand(reg_base, reg_index));
  size_stored += 4 * kDRegSizeInBytes;

  // Test unaligned accesses.
  CPURegList list_fp_src_2(d5, d6, d7, d8);
  CPURegList list_fp_dst_2(d15, d16, d17, d18);

  __ Str(wzr, MemOperand(reg_base, size_stored));
  size_stored += 1 * kWRegSizeInBytes;
  __ StoreCPURegList(list_fp_src_2, MemOperand(reg_base, size_stored));
  __ LoadCPURegList(list_fp_dst_2, MemOperand(reg_base, size_stored));
  size_stored += 4 * kDRegSizeInBytes;

  __ Mov(reg_index, size_stored);
  __ StoreCPURegList(list_fp_src_2, MemOperand(reg_base, reg_index));
  __ LoadCPURegList(list_fp_dst_2, MemOperand(reg_base, reg_index));

  END();
  RUN();

  VIXL_CHECK(array[0] == (1 * low_base) + (2 * low_base << kWRegSize));
  VIXL_CHECK(array[1] == (3 * low_base) + (4 * low_base << kWRegSize));
  VIXL_CHECK(array[2] == (1 * low_base) + (2 * low_base << kWRegSize));
  VIXL_CHECK(array[3] == (3 * low_base) + (4 * low_base << kWRegSize));
  VIXL_CHECK(array[4] == 1 * base);
  VIXL_CHECK(array[5] == 2 * base);
  VIXL_CHECK(array[6] == 3 * base);
  VIXL_CHECK(array[7] == 4 * base);
  VIXL_CHECK(array[8] == 1 * base);
  VIXL_CHECK(array[9] == 2 * base);
  VIXL_CHECK(array[10] == 3 * base);
  VIXL_CHECK(array[11] == 4 * base);
  VIXL_CHECK(array[12] == ((1 * low_base) << kSRegSize));
  VIXL_CHECK(array[13] == (((2 * low_base) << kSRegSize) | (1 * high_base)));
  VIXL_CHECK(array[14] == (((3 * low_base) << kSRegSize) | (2 * high_base)));
  VIXL_CHECK(array[15] == (((4 * low_base) << kSRegSize) | (3 * high_base)));
  VIXL_CHECK(array[16] == (((1 * low_base) << kSRegSize) | (4 * high_base)));
  VIXL_CHECK(array[17] == (((2 * low_base) << kSRegSize) | (1 * high_base)));
  VIXL_CHECK(array[18] == (((3 * low_base) << kSRegSize) | (2 * high_base)));
  VIXL_CHECK(array[19] == (((4 * low_base) << kSRegSize) | (3 * high_base)));
  VIXL_CHECK(array[20] == (4 * high_base));

  ASSERT_EQUAL_64(1 * low_base, x11);
  ASSERT_EQUAL_64(2 * low_base, x12);
  ASSERT_EQUAL_64(3 * low_base, x13);
  ASSERT_EQUAL_64(4 * low_base, x14);
  ASSERT_EQUAL_FP64(rawbits_to_double(1 * base), d11);
  ASSERT_EQUAL_FP64(rawbits_to_double(2 * base), d12);
  ASSERT_EQUAL_FP64(rawbits_to_double(3 * base), d13);
  ASSERT_EQUAL_FP64(rawbits_to_double(4 * base), d14);
  ASSERT_EQUAL_FP64(rawbits_to_double(1 * base), d15);
  ASSERT_EQUAL_FP64(rawbits_to_double(2 * base), d16);
  ASSERT_EQUAL_FP64(rawbits_to_double(3 * base), d17);
  ASSERT_EQUAL_FP64(rawbits_to_double(4 * base), d18);

  TEARDOWN();
}
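
The literal-base constant deserves a closer look. A standalone sketch (not part of the test suite) of the readability property the comments describe: small multiples keep every field distinct, so a swapped register index or an endianness error is visible at a glance in the hex output.

#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
  // The same constant as TEST(load_store_reglist) above.
  uint64_t high_base = UINT32_C(0x01000010);
  uint64_t low_base = UINT32_C(0x00100101);
  uint64_t base = (high_base << 32) | low_base;
  for (uint64_t i = 1; i <= 4; i++) {
    // No field carries into its neighbour for small multiples:
    // 0x0100001000100101, 0x0200002000200202, 0x0300003000300303, ...
    printf("%" PRIu64 " * base = 0x%016" PRIx64 "\n", i, i * base);
  }
  return 0;
}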


// This enum is used only as an argument to the push-pop test helpers.
enum PushPopMethod {
  // Push or Pop using the Push and Pop methods, with blocks of up to four
@ -12814,6 +13026,10 @@ static void PushPopXRegSimpleHelper(int reg_count,
  RegList list = PopulateRegisterArray(NULL, x, r, reg_size, reg_count,
                                       allowed);

  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
  UseScratchRegisterScope temps(&masm);
  temps.ExcludeAll();

  // The literal base is chosen to have two useful properties:
  // * When multiplied by small values (such as a register index), this value
  //   is clearly readable in the result.
@ -12993,6 +13209,10 @@ static void PushPopFPXRegSimpleHelper(int reg_count,
  // Arbitrarily pick a register to use as a stack pointer.
  const Register& stack_pointer = x10;

  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
  UseScratchRegisterScope temps(&masm);
  temps.ExcludeAll();

  // The literal base is chosen to have two useful properties:
  // * When multiplied (using an integer) by small values (such as a register
  //   index), this value is clearly readable in the result.
@ -13167,6 +13387,10 @@ static void PushPopXRegMixedMethodsHelper(int claim, int reg_size) {
    r6_to_r9 |= x[i].Bit();
  }

  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
  UseScratchRegisterScope temps(&masm);
  temps.ExcludeAll();

  // The literal base is chosen to have two useful properties:
  // * When multiplied by small values (such as a register index), this value
  //   is clearly readable in the result.
@ -13267,6 +13491,10 @@ static void PushPopXRegWXOverlapHelper(int reg_count, int claim) {
    stack[i] = 0xdeadbeef;
  }

  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
  UseScratchRegisterScope temps(&masm);
  temps.ExcludeAll();

  // The literal base is chosen to have two useful properties:
  // * When multiplied by small values (such as a register index), this value
  //   is clearly readable in the result.
@ -13446,6 +13674,10 @@ TEST(push_pop_sp) {

  VIXL_ASSERT(sp.Is(__ StackPointer()));

  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
  UseScratchRegisterScope temps(&masm);
  temps.ExcludeAll();

  __ Mov(x3, 0x3333333333333333);
  __ Mov(x2, 0x2222222222222222);
  __ Mov(x1, 0x1111111111111111);
@ -14154,8 +14386,8 @@ TEST(process_nan_float) {


static void ProcessNaNsHelper(double n, double m, double expected) {
  VIXL_ASSERT(isnan(n) || isnan(m));
  VIXL_ASSERT(isnan(expected));
  VIXL_ASSERT(std::isnan(n) || std::isnan(m));
  VIXL_ASSERT(std::isnan(expected));

  SETUP();
  START();
@ -14225,8 +14457,8 @@ TEST(process_nans_double) {


static void ProcessNaNsHelper(float n, float m, float expected) {
  VIXL_ASSERT(isnan(n) || isnan(m));
  VIXL_ASSERT(isnan(expected));
  VIXL_ASSERT(std::isnan(n) || std::isnan(m));
  VIXL_ASSERT(std::isnan(expected));

  SETUP();
  START();
@ -14296,10 +14528,10 @@ TEST(process_nans_float) {


static void DefaultNaNHelper(float n, float m, float a) {
  VIXL_ASSERT(isnan(n) || isnan(m) || isnan(a));
  VIXL_ASSERT(std::isnan(n) || std::isnan(m) || std::isnan(a));

  bool test_1op = isnan(n);
  bool test_2op = isnan(n) || isnan(m);
  bool test_1op = std::isnan(n);
  bool test_2op = std::isnan(n) || std::isnan(m);

  SETUP();
  START();
@ -14423,10 +14655,10 @@ TEST(default_nan_float) {


static void DefaultNaNHelper(double n, double m, double a) {
  VIXL_ASSERT(isnan(n) || isnan(m) || isnan(a));
  VIXL_ASSERT(std::isnan(n) || std::isnan(m) || std::isnan(a));

  bool test_1op = isnan(n);
  bool test_2op = isnan(n) || isnan(m);
  bool test_1op = std::isnan(n);
  bool test_2op = std::isnan(n) || std::isnan(m);

  SETUP();
  START();
@ -28,8 +28,8 @@
#include <cstring>
#include "test-runner.h"

#include "a64/macro-assembler-a64.h"
#include "a64/disasm-a64.h"
#include "vixl/a64/macro-assembler-a64.h"
#include "vixl/a64/disasm-a64.h"

#define TEST(name) TEST_(DISASM_##name)

@ -457,6 +457,7 @@ TEST(mul_and_div) {
  COMPARE(smull(x0, w0, w1), "smull x0, w0, w1");
  COMPARE(smull(x30, w30, w0), "smull x30, w30, w0");
  COMPARE(smulh(x0, x1, x2), "smulh x0, x1, x2");
  COMPARE(umulh(x0, x2, x1), "umulh x0, x2, x1");

  COMPARE(sdiv(w0, w1, w2), "sdiv w0, w1, w2");
  COMPARE(sdiv(x3, x4, x5), "sdiv x3, x4, x5");
@ -2361,6 +2362,13 @@ TEST(fp_compare) {
  COMPARE(fcmp(s12, 0), "fcmp s12, #0.0");
  COMPARE(fcmp(d12, 0), "fcmp d12, #0.0");

  COMPARE(fcmpe(s0, s1), "fcmpe s0, s1");
  COMPARE(fcmpe(s31, s30), "fcmpe s31, s30");
  COMPARE(fcmpe(d0, d1), "fcmpe d0, d1");
  COMPARE(fcmpe(d31, d30), "fcmpe d31, d30");
  COMPARE(fcmpe(s12, 0), "fcmpe s12, #0.0");
  COMPARE(fcmpe(d12, 0), "fcmpe d12, #0.0");

  CLEANUP();
}

@ -2379,6 +2387,17 @@ TEST(fp_cond_compare) {
  COMPARE(fccmp(s14, s15, CVFlag, al), "fccmp s14, s15, #nzCV, al");
  COMPARE(fccmp(d16, d17, CFlag, nv), "fccmp d16, d17, #nzCv, nv");

  COMPARE(fccmpe(s0, s1, NoFlag, eq), "fccmpe s0, s1, #nzcv, eq");
  COMPARE(fccmpe(s2, s3, ZVFlag, ne), "fccmpe s2, s3, #nZcV, ne");
  COMPARE(fccmpe(s30, s16, NCFlag, pl), "fccmpe s30, s16, #NzCv, pl");
  COMPARE(fccmpe(s31, s31, NZCVFlag, le), "fccmpe s31, s31, #NZCV, le");
  COMPARE(fccmpe(d4, d5, VFlag, gt), "fccmpe d4, d5, #nzcV, gt");
  COMPARE(fccmpe(d6, d7, NFlag, vs), "fccmpe d6, d7, #Nzcv, vs");
  COMPARE(fccmpe(d30, d0, NZFlag, vc), "fccmpe d30, d0, #NZcv, vc");
  COMPARE(fccmpe(d31, d31, ZFlag, hs), "fccmpe d31, d31, #nZcv, hs");
  COMPARE(fccmpe(s14, s15, CVFlag, al), "fccmpe s14, s15, #nzCV, al");
  COMPARE(fccmpe(d16, d17, CFlag, nv), "fccmpe d16, d17, #nzCv, nv");

  CLEANUP();
}
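
In the expected strings above, the mixed-case #nzcv immediates encode the condition flags that fccmp/fccmpe write when the condition fails: a capital letter marks a set flag, so CFlag prints as #nzCv and NZCVFlag as #NZCV. A hypothetical helper (not VIXL's disassembler code) showing that formatting rule:

#include <cstdio>
#include <string>

// Illustrative only: uppercase each letter whose flag bit is set in the
// 4-bit immediate (bit 3 = N, bit 2 = Z, bit 1 = C, bit 0 = V).
std::string FormatNZCV(unsigned flags) {
  const char lower[] = "nzcv";
  const char upper[] = "NZCV";
  std::string out = "#";
  for (int i = 0; i < 4; i++) {
    bool set = (flags >> (3 - i)) & 1;
    out += set ? upper[i] : lower[i];
  }
  return out;
}

int main() {
  printf("%s\n", FormatNZCV(0x2).c_str());  // Prints "#nzCv" (only C set).
  printf("%s\n", FormatNZCV(0xf).c_str());  // Prints "#NZCV" (all set).
  return 0;
}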
@ -2655,6 +2674,12 @@ TEST(add_sub_negative) {
  COMPARE(Add(w19, w3, -0x344), "sub w19, w3, #0x344 (836)");
  COMPARE(Add(w20, w4, -2000), "sub w20, w4, #0x7d0 (2000)");

  COMPARE(Add(w0, w1, 5, LeaveFlags), "add w0, w1, #0x5 (5)");
  COMPARE(Add(w1, w2, 15, SetFlags), "adds w1, w2, #0xf (15)");

  COMPARE(Sub(w0, w1, 5, LeaveFlags), "sub w0, w1, #0x5 (5)");
  COMPARE(Sub(w1, w2, 15, SetFlags), "subs w1, w2, #0xf (15)");

  COMPARE(Sub(w21, w3, -0xbc), "add w21, w3, #0xbc (188)");
  COMPARE(Sub(w22, w4, -2000), "add w22, w4, #0x7d0 (2000)");
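
These expectations show the MacroAssembler canonicalising negative immediates: an Add with a negative immediate is emitted as a sub of the magnitude, and a Sub likewise becomes an add. A standalone sketch of that rewriting rule (illustrative only, not VIXL's implementation):

#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Print the instruction the macro would emit for Add(rd, rn, imm).
void PrintAddImmediate(const char* rd, const char* rn, int64_t imm) {
  const char* mnemonic = (imm < 0) ? "sub" : "add";
  int64_t magnitude = (imm < 0) ? -imm : imm;  // Assumes imm != INT64_MIN.
  printf("%s %s, %s, #0x%" PRIx64 " (%" PRId64 ")\n",
         mnemonic, rd, rn, magnitude, magnitude);
}

int main() {
  PrintAddImmediate("w19", "w3", -0x344);  // sub w19, w3, #0x344 (836)
  PrintAddImmediate("w20", "w4", -2000);   // sub w20, w4, #0x7d0 (2000)
  return 0;
}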
@ -27,8 +27,8 @@
#include <stdlib.h>
#include "test-runner.h"

#include "a64/decoder-a64.h"
#include "a64/disasm-a64.h"
#include "vixl/a64/decoder-a64.h"
#include "vixl/a64/disasm-a64.h"

#define TEST(name) TEST_(FUZZ_##name)

@ -26,7 +26,7 @@

#include "test-runner.h"

#include "invalset.h"
#include "vixl/invalset.h"

namespace vixl {

@ -27,7 +27,7 @@
#ifndef TEST_TEST_H_
#define TEST_TEST_H_

#include "utils.h"
#include "vixl/utils.h"

namespace vixl {

@ -31,8 +31,8 @@
#include "test-utils-a64.h"
#include "test-simulator-inputs-a64.h"
#include "test-simulator-traces-a64.h"
#include "a64/macro-assembler-a64.h"
#include "a64/simulator-a64.h"
#include "vixl/a64/macro-assembler-a64.h"
#include "vixl/a64/simulator-a64.h"

namespace vixl {

@ -26,13 +26,13 @@

#include "test-utils-a64.h"

#include <math.h>  // Needed for isnan().
#include <cmath>

#include "test-runner.h"
#include "a64/macro-assembler-a64.h"
#include "a64/simulator-a64.h"
#include "a64/disasm-a64.h"
#include "a64/cpu-a64.h"
#include "vixl/a64/macro-assembler-a64.h"
#include "vixl/a64/simulator-a64.h"
#include "vixl/a64/disasm-a64.h"
#include "vixl/a64/cpu-a64.h"

#define __ masm->
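
The include change above is the heart of the std::isnan migration seen throughout this commit: <cmath> declares the classification functions in namespace std, the portable C++ spelling, whereas the unqualified isnan from <math.h> is a C macro that some toolchains do not provide in C++ mode. A minimal sketch of the qualified usage:

#include <cmath>
#include <cstdio>
#include <limits>

int main() {
  double n = std::numeric_limits<double>::quiet_NaN();
  // std::isnan has overloads for float, double and long double.
  printf("isnan(NaN) = %d, isnan(1.0) = %d\n", std::isnan(n), std::isnan(1.0));
  return 0;
}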
@ -85,7 +85,7 @@ bool EqualFP32(float expected, const RegisterDump*, float result) {
  if (float_to_rawbits(expected) == float_to_rawbits(result)) {
    return true;
  } else {
    if (isnan(expected) || (expected == 0.0)) {
    if (std::isnan(expected) || (expected == 0.0)) {
      printf("Expected 0x%08" PRIx32 "\t Found 0x%08" PRIx32 "\n",
             float_to_rawbits(expected), float_to_rawbits(result));
    } else {
@ -104,7 +104,7 @@ bool EqualFP64(double expected, const RegisterDump*, double result) {
    return true;
  }

  if (isnan(expected) || (expected == 0.0)) {
  if (std::isnan(expected) || (expected == 0.0)) {
    printf("Expected 0x%016" PRIx64 "\t Found 0x%016" PRIx64 "\n",
           double_to_rawbits(expected), double_to_rawbits(result));
  } else {

@ -28,10 +28,10 @@
#define VIXL_A64_TEST_UTILS_A64_H_

#include "test-runner.h"
#include "a64/macro-assembler-a64.h"
#include "a64/simulator-a64.h"
#include "a64/disasm-a64.h"
#include "a64/cpu-a64.h"
#include "vixl/a64/macro-assembler-a64.h"
#include "vixl/a64/simulator-a64.h"
#include "vixl/a64/disasm-a64.h"
#include "vixl/a64/cpu-a64.h"

namespace vixl {
@ -40,6 +40,10 @@ import test
import util


SUPPORTED_COMPILERS = ['g++', 'clang++']
OBJ_DIR = './obj'


def BuildOptions():
  result = argparse.ArgumentParser(
      description='Run the linter and unit tests.',
@ -53,9 +57,11 @@ def BuildOptions():
                      help='Do not run the linter. Run the tests only.')
  result.add_argument('--noclean', action='store_true',
                      help='Do not clean before build.')
  result.add_argument('--fast', action='store_true',
                      help='Only test with one toolchain')
  result.add_argument('--jobs', '-j', metavar='N', type=int, nargs='?',
                      default=1, const=multiprocessing.cpu_count(),
                      help='''Runs the tests using N jobs. If the option is set
                      help='''Run the tests using N jobs. If the option is set
                      but no value is provided, the script will use as many jobs
                      as it thinks useful.''')
  sim_default = 'off' if platform.machine() == 'aarch64' else 'on'
@ -65,30 +71,72 @@ def BuildOptions():
  return result.parse_args()


def CleanBuildSystem():
  def clean(mode):
    if args.verbose: print('Cleaning ' + mode + ' mode test...')
    command = 'scons mode=%s simulator=%s all --clean' % \
              (mode, args.simulator)
def check_supported(compiler, mode, std):
  if compiler not in SUPPORTED_COMPILERS:
    print 'Invalid compiler.'
    sys.exit(1)
  if mode not in ['release', 'debug']:
    print 'Invalid mode.'
    sys.exit(1)
  if std not in ['c++98', 'c++11']:
    print 'Invalid c++ standard.'
    sys.exit(1)


def initalize_compiler_list():
  compiler_list = []
  for compiler in SUPPORTED_COMPILERS:
    if util.has_compiler(compiler) and (len(compiler_list) == 0 or not args.fast):
      compiler_list.append(compiler)
    else:
      # This warning suffices for args.fast too.
      print 'WARNING: Skipping ' + compiler + ' tests.'
  if len(compiler_list) == 0:
    util.abort('Found no supported compilers')
  return compiler_list


def CleanBuildSystem(compiler):
  def clean(compiler, mode, std):
    check_supported(compiler, mode, std)
    os.environ['CXX'] = compiler
    if args.verbose:
      print 'Cleaning ' + compiler + ' ' + std + ' ' \
            + mode + ' mode test...'
    command = 'scons mode=%s std=%s simulator=%s all --clean' % \
              (mode, std, args.simulator)
    status, output = util.getstatusoutput(command)
    if status != 0:
      print(output)
      util.abort('Failed cleaning test: ' + command)
  clean('debug')
  clean('release')

  clean(compiler, 'debug', 'c++98')
  clean(compiler, 'debug', 'c++11')
  clean(compiler, 'release', 'c++98')
  clean(compiler, 'release', 'c++11')


def BuildEverything():
  def build(mode):
    if args.verbose: print('Building ' + mode + ' mode test...')
    command = 'scons mode=%s simulator=%s all -j%u' % \
              (mode, args.simulator, args.jobs)
def BuildEverything(compiler):
  def build(compiler, mode, std):
    check_supported(compiler, mode, std)
    os.environ['CXX'] = compiler
    if args.verbose:
      print 'Building ' + compiler + ' ' + std + ' ' \
            + mode + ' mode test...'
    if args.jobs == 1:
      print '- This may take a while. Pass `-j` to use multiple threads.'
    command = 'scons mode=%s std=%s simulator=%s all -j%u' % \
              (mode, std, args.simulator, args.jobs)
    status, output = util.getstatusoutput(command)
    if status != 0:
      print(output)
      util.abort('Failed building test: ' + command)
  build('debug')
  build('release')

  print 'Building ' + compiler + ' tests...'
  build(compiler, 'debug', 'c++98')
  build(compiler, 'debug', 'c++11')
  build(compiler, 'release', 'c++98')
  build(compiler, 'release', 'c++11')


NOT_RUN = 'NOT RUN'
@ -101,7 +149,7 @@ class Test:
    self.status = NOT_RUN

  def name_prefix(self):
    return '%-26s : ' % self.name
    return '%-40s : ' % self.name


class Tester:
@ -121,33 +169,36 @@ class Tester:


class VIXLTest(Test):
  def __init__(self, mode, simulator, debugger = False, verbose = False):
    if not mode in ['release', 'debug']:
      print 'Invalid mode.'
      sys.exit(1)

    self.debugger = debugger
  def __init__(self, compiler, mode, std, simulator, debugger = False, verbose = False):
    check_supported(compiler, mode, std)
    self.verbose = verbose
    self.debugger = debugger
    self.compiler = compiler
    self.mode = mode
    self.std = std

    name = 'test ' + mode
    name = 'test ' + compiler + ' ' + std + ' ' + mode
    if simulator:
      name += ' (%s)' % ('debugger' if debugger else 'simulator')
    Test.__init__(self, name)

    self.exe = './test-runner'
    self.exe = 'test-runner'
    if simulator:
      self.exe += '_sim'
    if mode == 'debug':
      self.exe += '_g'

  def Run(self):
    manifest = test.ReadManifest(self.exe, [], self.debugger,
                                 False, self.verbose)
    self.status = PASSED
    command = os.path.join(OBJ_DIR, self.mode, self.compiler,
                           self.std, self.exe)
    manifest = test.ReadManifest(command, [], self.debugger, False, self.verbose)
    retcode = test.RunTests(manifest, jobs = args.jobs,
                            verbose = self.verbose, debugger = self.debugger,
                            progress_prefix = self.name_prefix())
    printer.EnsureNewLine()
    self.status = PASSED if retcode == 0 else FAILED
    if retcode != 0:
      self.status = FAILED


class LintTest(Test):
@ -167,13 +218,17 @@ details.'''
    n_errors = lint.LintFiles(lint.default_tracked_files,
                              jobs = args.jobs, verbose = args.verbose,
                              progress_prefix = self.name_prefix())

    self.status = PASSED if n_errors == 0 else FAILED


class BenchTest(Test):
  def __init__(self, mode, simulator):
    name = 'benchmarks ' + mode
  def __init__(self, compiler, mode, std, simulator):
    check_supported(compiler, mode, std)
    self.compiler = compiler
    self.mode = mode
    self.std = std

    name = 'benchmarks ' + compiler + ' ' + std + ' ' + mode
    Test.__init__(self, name)
    self.exe_suffix = ''
    if simulator:
@ -186,7 +241,8 @@ class BenchTest(Test):
                  'bench-branch-masm', 'bench-branch-link-masm']
    self.status = PASSED
    for bench in benchmarks:
      command = './' + bench + self.exe_suffix
      command = os.path.join(OBJ_DIR, self.mode, self.compiler, self.std,
                             bench + self.exe_suffix)
      (rc, out) = util.getstatusoutput(command)
      if rc != 0:
        self.status = FAILED
@ -206,31 +262,44 @@ if __name__ == '__main__':
    print 'WARNING: This is not a Git repository. The linter will not run.'
    args.nolint = True

  tester = Tester()
  if not args.nolint:
    import lint
    tester.AddTest(LintTest())
    LintTest().Run()

  if not args.notest:
    if not args.noclean:
      CleanBuildSystem()
    BuildEverything()
    tester = Tester()
    compiler_list = initalize_compiler_list()

    if args.simulator == 'on':
      # mode, sim, debugger, verbose
      tester.AddTest(VIXLTest('release', True, True, args.verbose))
      tester.AddTest(VIXLTest('debug', True, True, args.verbose))
      tester.AddTest(VIXLTest('release', True, False, args.verbose))
      tester.AddTest(VIXLTest('debug', True, False, args.verbose))
      tester.AddTest(BenchTest('release', True))
      tester.AddTest(BenchTest('debug', True))
    else:
      tester.AddTest(VIXLTest('release', False, False, args.verbose))
      tester.AddTest(VIXLTest('debug', False, False, args.verbose))
      tester.AddTest(BenchTest('release', False))
      tester.AddTest(BenchTest('debug', False))
    for compiler in compiler_list:
      if not args.noclean:
        CleanBuildSystem(compiler)
      BuildEverything(compiler)

  tester.RunAll()
      if args.simulator == 'on':
        # mode, std, sim, debugger, verbose
        tester.AddTest(VIXLTest(compiler, 'release', 'c++98', True, True, args.verbose))
        tester.AddTest(VIXLTest(compiler, 'debug', 'c++98', True, True, args.verbose))
        tester.AddTest(VIXLTest(compiler, 'release', 'c++98', True, False, args.verbose))
        tester.AddTest(VIXLTest(compiler, 'debug', 'c++98', True, False, args.verbose))
        tester.AddTest(VIXLTest(compiler, 'release', 'c++11', True, True, args.verbose))
        tester.AddTest(VIXLTest(compiler, 'debug', 'c++11', True, True, args.verbose))
        tester.AddTest(VIXLTest(compiler, 'release', 'c++11', True, False, args.verbose))
        tester.AddTest(VIXLTest(compiler, 'debug', 'c++11', True, False, args.verbose))
        tester.AddTest(BenchTest(compiler, 'release', 'c++98', True))
        tester.AddTest(BenchTest(compiler, 'debug', 'c++98', True))
        tester.AddTest(BenchTest(compiler, 'release', 'c++11', True))
        tester.AddTest(BenchTest(compiler, 'debug', 'c++11', True))
      else:
        tester.AddTest(VIXLTest(compiler, 'release', 'c++98', False, False, args.verbose))
        tester.AddTest(VIXLTest(compiler, 'debug', 'c++98', False, False, args.verbose))
        tester.AddTest(VIXLTest(compiler, 'release', 'c++11', False, False, args.verbose))
        tester.AddTest(VIXLTest(compiler, 'debug', 'c++11', False, False, args.verbose))
        tester.AddTest(BenchTest(compiler, 'release', 'c++98', False))
        tester.AddTest(BenchTest(compiler, 'debug', 'c++98', False))
        tester.AddTest(BenchTest(compiler, 'release', 'c++11', False))
        tester.AddTest(BenchTest(compiler, 'debug', 'c++11', False))

    tester.RunAll()

  if git.is_git_repository_root():
    untracked_files = git.get_untracked_files()
@ -24,6 +24,7 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import sys
import subprocess
import shlex
@ -49,3 +50,8 @@ def last_line(text):
  lines = text.split('\n')
  last = lines[-1].split('\r')
  return last[-1]


def has_compiler(compiler):
  status, output = getstatusoutput('which ' + compiler)
  return status == 0