diff --git a/.gitignore b/.gitignore
index bf5e319..37a6af4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Editor and engineering scratch files
 .vs
 .vscode
+.DS_Store
 *.log
 *.diff
 *.user
@@ -12,7 +13,7 @@ Proto
 Binaries
 
 # Build artifacts
-astcenc*
+astcenc
 build*
 
 # General build artifacts
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..4ff560c
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "Source/GoogleTest"]
+	path = Source/GoogleTest
+	url = https://github.com/google/googletest.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4800c9b..1dcc21e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,7 +15,6 @@
 #  under the License.
 #  ----------------------------------------------------------------------------
 
-
 # CMake configuration
 cmake_minimum_required(VERSION 3.15)
 cmake_policy(SET CMP0069 NEW)  # LTO support
@@ -29,38 +28,77 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
 set(PACKAGE_ROOT astcenc)
+include(CTest)
 
 # Command line configuration
-option(ISA_AVX2 "Enable builds for AVX2 SIMD")
+function(printopt optName optVal optArch tgtArch)
+    if(${optVal})
+        if(${optArch} MATCHES ${tgtArch})
+            message("  -- ${optName} backend - ON")
+        else()
+            message("  -- ${optName} backend - SKIPPED (${optArch} only)")
+        endif()
+    else()
+        message("  -- ${optName} backend - OFF")
+    endif()
+endfunction()
+
+set(VALID_ARCH aarch64 x64)
+set(ARCH x64 CACHE STRING "Target architecture")
+set_property(CACHE ARCH PROPERTY STRINGS ${VALID_ARCH})
+
+message("-- Selecting target astcenc backends:")
+list(FIND VALID_ARCH ${ARCH} index)
+if(index EQUAL -1)
+    message(FATAL_ERROR "ARCH must be one of ${VALID_ARCH}")
+endif()
 
-message("-- Selecting backend build type(s)")
 set(ANY_ISA 0)
-if(${ISA_AVX2})
+
+option(ISA_AVX2 "Enable builds for AVX2 SIMD")
+printopt("AVX2" ${ISA_AVX2} "x64" ${ARCH})
+if(${ISA_AVX2} AND ${ARCH} MATCHES "x64")
     set(ANY_ISA 1)
-    message("  -- AVX2 backend: ON")
 endif()
 
 option(ISA_SSE41 "Enable builds for SSE4.1 SIMD")
-if(${ISA_SSE41})
+printopt("SSE4.1" ${ISA_SSE41} "x64" ${ARCH})
+if(${ISA_SSE41} AND ${ARCH} MATCHES "x64")
     set(ANY_ISA 1)
-    message("  -- SSE4.1 backend: ON")
 endif()
 
 option(ISA_SSE2 "Enable builds for SSE2 SIMD")
-if(${ISA_SSE2})
+printopt("SSE2" ${ISA_SSE2} "x64" ${ARCH})
+if(${ISA_SSE2} AND ${ARCH} MATCHES "x64")
     set(ANY_ISA 1)
-    message("  -- SSE2 backend: ON")
 endif()
 
-option(ISA_NONE "Enable builds for noSIMD")
+option(ISA_NEON "Enable builds for NEON SIMD")
+printopt("NEON" ${ISA_NEON} "aarch64" ${ARCH})
+if(${ISA_NEON} AND ${ARCH} MATCHES "aarch64")
+    set(ANY_ISA 1)
+endif()
+
+option(ISA_NONE "Enable builds for no SIMD")
 if(${ISA_NONE})
     set(ANY_ISA 1)
-    message("  -- No SIMD backend: ON")
+    message("  -- No SIMD backend - ON")
+else()
+    message("  -- No SIMD backend - OFF")
 endif()
 
 option(ISA_INVARIANCE "Enable builds for ISA invariance")
 if(${ISA_INVARIANCE})
-    message("  -- ISA invariance: ON")
+    message("  -- ISA invariant backend - ON")
+else()
+    message("  -- ISA invariant backend - OFF")
+endif()
+
+option(UNITTEST "Enable builds for unit tests")
+if(${UNITTEST})
+    message("  -- Unit tests - ON")
+else()
+    message("  -- Unit tests - OFF")
 endif()
 
 if(NOT ${ANY_ISA})
diff --git a/Docs/Building.md b/Docs/Building.md
index 85b8bb0..e7c5c46 100644
--- a/Docs/Building.md
+++ b/Docs/Building.md
@@ -22,8 +22,14 @@ to generate the build system.
 mkdir build
 cd build
 
-# Create the build system
-cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
+# Configure your build of choice, for example:
+
+# Arm arch64
+cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
+    -DARCH=aarch64 -DISA_NEON=ON ..
+
+# x86-64
+cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
     -DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
 ```
 
@@ -62,7 +68,13 @@ export CXX=clang++
 mkdir build
 cd build
 
-# Create the build system
+# Configure your build of choice, for example:
+
+# Arm arch64
+cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
+    -DARCH=aarch64 -DISA_NEON=ON ..
+
+# x86-64
 cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
     -DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
 ```
@@ -88,13 +100,14 @@ For codec developers there are a number of useful features in the build system.
 
 ### No intrinsics build
 
-All normal builds will use SIMD accelerated code paths using instrinsics, as
-x86-64 guarantees availability of at least SSE2. For development purposes it
-is possible to build an intrinsic-free build which uses no explicit SIMD
-acceleration (the compiler may still auto-vectorize).
+All normal builds will use SIMD accelerated code paths using intrinsics, as all
+target architectures (x86-64 and aarch64) guarantee SIMD availability. For
+development purposes it is possible to build an intrinsic-free build which uses
+no explicit SIMD acceleration (the compiler may still auto-vectorize).
 
 To enable this binary variant add `-DISA_NONE=ON` to the CMake command line
-when configuring. It is NOT
+when configuring. It is NOT recommended to use this for production; it is
+significantly slower than the vectorized SIMD builds.
 
 ### ISA Invariance
 
@@ -126,12 +139,35 @@ We support and test the following `CMAKE_BUILD_TYPE` options.
 Note that optimized release builds are compiled with link-time optimization,
 which can make profiling more challenging ...
 
+### Testing
+
+We support building unit tests.
+
+These builds use the `googletest` framework, which is pulled in though a git
+submodule. On first use, you must fetch the submodule dependency:
+
+```shell
+git submodule init
+git submodule update
+```
+
+To build unit tests add `-DUNITTEST=ON` to the CMake command line when
+configuring.
+
+To run unit tests use the CMake `ctest` utility from your build directory after
+you have built the tests.
+
+```shell
+cd build
+ctest --verbose
+```
+
 ### Packaging
 
 We support building a release bundle of all enabled binary configurations in
 the current CMake configuration using the `package` build target
 
-```bash
+```shell
 # Run a build and package build outputs in `./astcenc-<ver>-<os>-<arch>.<fmt>`
 cd build
 make package -j16
diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md
index e1519d4..12ea10c 100644
--- a/Docs/ChangeLog.md
+++ b/Docs/ChangeLog.md
@@ -18,6 +18,9 @@ stable across versions, and this release is not compatible with 2.1. Please
 recompile your client-side code using the updated `astcenc.h` header.
 
 * **General:**
+  * **Feature:** New Arm aarch64 NEON accelerated vector library. Note that at
+    this time Arm builds must be built from source; pre-built binaries are not
+	provided in this release.
   * **Improvement:** SSE4.2 feature profile changed to SSE4.1, which more
     accurately reflects the feature set used.
   * **Improvement:** Build system changed to use CMake for all platforms.
@@ -35,8 +38,8 @@ recompile your client-side code using the updated `astcenc.h` header.
     scales RGB values by the alpha value. This can be useful to minimize
     cross-channel color bleed caused by GPU post-multiply filtering/blending.
   * **Improvements:** Command line tool cleanly traps and reports errors for
-    corrupt input images rather than relying on hard standard library
-    `assert()` calls.
+    corrupt input images rather than relying on standard library `assert()`
+	calls in release builds.
 * **Core API:**
   * **API Change:** Images using region-based metrics no longer need to include
     padding; all input images should be tightly packed and `dim_pad` is removed
diff --git a/Docs/Testing.md b/Docs/Testing.md
index bf985f0..7b11dc1 100644
--- a/Docs/Testing.md
+++ b/Docs/Testing.md
@@ -14,10 +14,27 @@ can be achieved by configuring the CMake build using the install prefix
 `-DCMAKE_INSTALL_PREFIX=../` and then running a build with the `install` build
 target.
 
-# Running unit tests
+# Running C++ unit tests
 
-To run the command line unit tests, which aim to get coverage of the command
-line options and core codec stability without testing the compression quality
+We support a small (but growing) number of C++ unit tests, which are written
+using the `googletest` framework and integrated in the CMake "CTest" test
+framework.
+
+To build unit tests pull the `googletest` git submodule and add `-DUNITTEST=ON`
+to the CMake command line when configuring.
+
+To run unit tests use the CMake `ctest` utility from your build directory after
+you have built the tests.
+
+```shell
+cd build
+ctest --verbose
+```
+
+# Running command line tests
+
+To run the command line tests, which aim to get coverage of the command line
+options and core codec stability without testing the compression quality
 itself, run the command line:
 
     python3 -m unittest discover -s Test -p astc_test*.py -v
diff --git a/Source/CMakeLists.txt b/Source/CMakeLists.txt
index fb17616..dc2455f 100644
--- a/Source/CMakeLists.txt
+++ b/Source/CMakeLists.txt
@@ -21,6 +21,25 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
     set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto")
 endif()
 
+# - - - - - - - - - - - - - - - - - -
+# No architecture-specific SIMD
+
+if (${ISA_NONE})
+    set(ISA_SIMD none)
+    include(cmake_core.cmake)
+endif()
+
+# - - - - - - - - - - - - - - - - - -
+# Armv8-A architecture-specific SIMD
+
+if (${ISA_NEON})
+    set(ISA_SIMD neon)
+    include(cmake_core.cmake)
+endif()
+
+# - - - - - - - - - - - - - - - - - -
+# x86-64 architecture-specific SIMD
+
 if (${ISA_AVX2})
     set(ISA_SIMD avx2)
     include(cmake_core.cmake)
@@ -36,7 +55,11 @@ if (${ISA_SSE2})
     include(cmake_core.cmake)
 endif()
 
-if (${ISA_NONE})
-    set(ISA_SIMD none)
-    include(cmake_core.cmake)
+# - - - - - - - - - - - - - - - - - -
+# Unit testing
+if (${UNITTEST})
+    set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
+    add_subdirectory(GoogleTest)
+    enable_testing()
+    add_subdirectory(UnitTest)
 endif()
diff --git a/Source/GoogleTest b/Source/GoogleTest
new file mode 160000
index 0000000..703bd9c
--- /dev/null
+++ b/Source/GoogleTest
@@ -0,0 +1 @@
+Subproject commit 703bd9caab50b139428cea1aaff9974ebee5742e
diff --git a/Source/UnitTest/CMakeLists.txt b/Source/UnitTest/CMakeLists.txt
new file mode 100644
index 0000000..e0357b9
--- /dev/null
+++ b/Source/UnitTest/CMakeLists.txt
@@ -0,0 +1,50 @@
+#  SPDX-License-Identifier: Apache-2.0
+#  ----------------------------------------------------------------------------
+#  Copyright 2020 Arm Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License"); you may not
+#  use this file except in compliance with the License. You may obtain a copy
+#  of the License at:
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#  License for the specific language governing permissions and limitations
+#  under the License.
+#  ----------------------------------------------------------------------------
+
+# - - - - - - - - - - - - - - - - - -
+# No architecture-specific SIMD
+
+if (${ISA_NONE})
+    set(ISA_SIMD none)
+    include(cmake_core.cmake)
+endif()
+
+# - - - - - - - - - - - - - - - - - -
+# Armv8-A architecture-specific SIMD
+
+if (${ISA_NEON})
+    set(ISA_SIMD neon)
+    include(cmake_core.cmake)
+endif()
+
+# - - - - - - - - - - - - - - - - - -
+# x86-64 architecture-specific SIMD
+
+if (${ISA_AVX2})
+    set(ISA_SIMD avx2)
+    include(cmake_core.cmake)
+endif()
+
+if (${ISA_SSE41})
+    set(ISA_SIMD sse4.1)
+    include(cmake_core.cmake)
+endif()
+
+if (${ISA_SSE2})
+    set(ISA_SIMD sse2)
+    include(cmake_core.cmake)
+endif()
diff --git a/Source/UnitTest/cmake_core.cmake b/Source/UnitTest/cmake_core.cmake
new file mode 100644
index 0000000..98858db
--- /dev/null
+++ b/Source/UnitTest/cmake_core.cmake
@@ -0,0 +1,112 @@
+#  SPDX-License-Identifier: Apache-2.0
+#  ----------------------------------------------------------------------------
+#  Copyright 2020 Arm Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License"); you may not
+#  use this file except in compliance with the License. You may obtain a copy
+#  of the License at:
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#  License for the specific language governing permissions and limitations
+#  under the License.
+#  ----------------------------------------------------------------------------
+
+add_executable(test-simd-${ISA_SIMD})
+
+target_sources(test-simd-${ISA_SIMD}
+    PRIVATE
+        test_simd.cpp)
+
+target_include_directories(test-simd-${ISA_SIMD}
+    PRIVATE
+        ${gtest_SOURCE_DIR}/include)
+
+target_compile_options(test-simd-${ISA_SIMD}
+    PRIVATE
+        # Use pthreads on Linux/macOS
+        $<$<PLATFORM_ID:Linux,Darwin>:-pthread>
+
+        # MSVC compiler defines
+        $<$<CXX_COMPILER_ID:MSVC>:/EHsc>
+
+        # G++ and Clang++ compiler defines
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wall>
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wextra>
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wpedantic>
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Werror>
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wshadow>
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wdouble-promotion>)
+
+# Set up configuration for SIMD ISA builds
+if(${ISA_SIMD} MATCHES "none")
+    target_compile_definitions(test-simd-${ISA_SIMD}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SSE=0
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0)
+
+    if (${ARCH} MATCHES x64)
+        target_compile_options(astcenc-${ISA_SIMD}
+            PRIVATE
+                $<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
+    endif()
+
+elseif(${ISA_SIMD} MATCHES "neon")
+    target_compile_definitions(test-simd-${ISA_SIMD}
+        PRIVATE
+            ASTCENC_NEON=1
+            ASTCENC_SSE=0
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0)
+
+elseif(${ISA_SIMD} MATCHES "sse2")
+    target_compile_definitions(test-simd-${ISA_SIMD}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SSE=20
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0)
+
+    target_compile_options(test-simd-${ISA_SIMD}
+        PRIVATE
+        $<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
+
+elseif(${ISA_SIMD} MATCHES "sse4.1")
+    target_compile_definitions(test-simd-${ISA_SIMD}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SSE=41
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=1)
+
+    target_compile_options(test-simd-${ISA_SIMD}
+        PRIVATE
+            $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -msse4.1 -mpopcnt>)
+
+elseif(${ISA_SIMD} MATCHES "avx2")
+    target_compile_definitions(test-simd-${ISA_SIMD}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SSE=41
+            ASTCENC_AVX=2
+            ASTCENC_POPCNT=1)
+
+    target_compile_options(test-simd-${ISA_SIMD}
+        PRIVATE
+            $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -mavx2 -mpopcnt>
+            $<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>)
+endif()
+
+target_link_libraries(test-simd-${ISA_SIMD}
+    PRIVATE
+        gtest_main)
+
+add_test(NAME test-simd-${ISA_SIMD}
+         COMMAND test-simd-${ISA_SIMD})
+
+install(TARGETS test-simd-${ISA_SIMD} DESTINATION ${PACKAGE_ROOT})
diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp
new file mode 100644
index 0000000..e5333ad
--- /dev/null
+++ b/Source/UnitTest/test_simd.cpp
@@ -0,0 +1,1004 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Unit tests for the vectorized SIMD functionality.
+ *
+ * This test suite is a partial implementation, focussing on 4-wide vectors.
+ * We're adding things as we touch related parts of the code, but there is some
+ * technical debt to catch up on to get full coverage.
+ */
+
+#include <limits>
+
+#include "gtest/gtest.h"
+
+#include "../astcenc_internal.h"
+#include "../astcenc_vecmathlib.h"
+
+namespace astcenc
+{
+
+#if ASTCENC_SIMD_WIDTH == 4
+
+// VLA (4-wide) tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/** \brief Test VLA change_sign. */
+TEST(vfloat, ChangeSign)
+{
+	vfloat a(-1.0f,  1.0f, -3.12f, 3.12f);
+	vfloat b(-1.0f, -1.0f,  3.12f, 3.12f);
+	vfloat r = change_sign(a, b);
+	EXPECT_EQ(r.lane<0>(),  1.0f);
+	EXPECT_EQ(r.lane<1>(), -1.0f);
+	EXPECT_EQ(r.lane<2>(), -3.12f);
+	EXPECT_EQ(r.lane<3>(),  3.12f);
+}
+
+/** \brief Test VLA atan. */
+TEST(vfloat, Atan)
+{
+	vfloat a(-0.15f, 0.0f, 0.9f, 2.1f);
+	vfloat r = atan(a);
+	EXPECT_NEAR(r.lane<0>(), -0.149061f, 0.005f);
+	EXPECT_NEAR(r.lane<1>(),  0.000000f, 0.005f);
+	EXPECT_NEAR(r.lane<2>(),  0.733616f, 0.005f);
+	EXPECT_NEAR(r.lane<3>(),  1.123040f, 0.005f);
+}
+
+/** \brief Test VLA atan2. */
+TEST(vfloat, Atan2)
+{
+	vfloat a(-0.15f, 0.0f, 0.9f, 2.1f);
+	vfloat b(1.15f, -3.0f, -0.9f, 1.1f);
+	vfloat r = atan2(a, b);
+	EXPECT_NEAR(r.lane<0>(), -0.129816f, 0.005f);
+	EXPECT_NEAR(r.lane<1>(),  3.141592f, 0.005f);
+	EXPECT_NEAR(r.lane<2>(),  2.360342f, 0.005f);
+	EXPECT_NEAR(r.lane<3>(),  1.084357f, 0.005f);
+}
+
+#endif
+
+#if ASTCENC_SIMD_WIDTH >= 4
+
+static const float qnan = std::numeric_limits<float>::quiet_NaN();
+alignas(16) static const float f32x4_data[5] { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f };
+alignas(16) static const int s32x4_data[5] {      0,    1,    2,    3,    4 };
+
+// VFLOAT4 tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/** \brief Test unaligned vfloat4 data load. */
+TEST(vfloat4, UnalignedLoad)
+{
+	vfloat4 a(&(f32x4_data[1]));
+	EXPECT_EQ(a.lane<0>(), 1.0f);
+	EXPECT_EQ(a.lane<1>(), 2.0f);
+	EXPECT_EQ(a.lane<2>(), 3.0f);
+	EXPECT_EQ(a.lane<3>(), 4.0f);
+}
+
+/** \brief Test scalar duplicated vfloat4 load. */
+TEST(vfloat4, ScalarDupLoad)
+{
+	vfloat4 a(1.1f);
+	EXPECT_EQ(a.lane<0>(), 1.1f);
+	EXPECT_EQ(a.lane<1>(), 1.1f);
+	EXPECT_EQ(a.lane<2>(), 1.1f);
+	EXPECT_EQ(a.lane<3>(), 1.1f);
+}
+
+/** \brief Test scalar vfloat4 load. */
+TEST(vfloat4, ScalarLoad)
+{
+	vfloat4 a(1.1f, 2.2f, 3.3f, 4.4f);
+	EXPECT_EQ(a.lane<0>(), 1.1f);
+	EXPECT_EQ(a.lane<1>(), 2.2f);
+	EXPECT_EQ(a.lane<2>(), 3.3f);
+	EXPECT_EQ(a.lane<3>(), 4.4f);
+}
+
+/** \brief Test copy vfloat4 load. */
+TEST(vfloat4, CopyLoad)
+{
+	vfloat4 s(1.1f, 2.2f, 3.3f, 4.4f);
+	vfloat4 a(s.m);
+	EXPECT_EQ(a.lane<0>(), 1.1f);
+	EXPECT_EQ(a.lane<1>(), 2.2f);
+	EXPECT_EQ(a.lane<2>(), 3.3f);
+	EXPECT_EQ(a.lane<3>(), 4.4f);
+}
+
+/** \brief Test vfloat4 scalar lane set. */
+TEST(vfloat4, SetLane)
+{
+	vfloat4 a(0.0f);
+
+	a.set_lane<0>(1.0f);
+	EXPECT_EQ(a.lane<0>(), 1.0f);
+	EXPECT_EQ(a.lane<1>(), 0.0f);
+	EXPECT_EQ(a.lane<2>(), 0.0f);
+	EXPECT_EQ(a.lane<3>(), 0.0f);
+
+	a.set_lane<1>(2.0f);
+	EXPECT_EQ(a.lane<0>(), 1.0f);
+	EXPECT_EQ(a.lane<1>(), 2.0f);
+	EXPECT_EQ(a.lane<2>(), 0.0f);
+	EXPECT_EQ(a.lane<3>(), 0.0f);
+
+	a.set_lane<2>(3.0f);
+	EXPECT_EQ(a.lane<0>(), 1.0f);
+	EXPECT_EQ(a.lane<1>(), 2.0f);
+	EXPECT_EQ(a.lane<2>(), 3.0f);
+	EXPECT_EQ(a.lane<3>(), 0.0f);
+
+	a.set_lane<3>(4.0f);
+	EXPECT_EQ(a.lane<0>(), 1.0f);
+	EXPECT_EQ(a.lane<1>(), 2.0f);
+	EXPECT_EQ(a.lane<2>(), 3.0f);
+	EXPECT_EQ(a.lane<3>(), 4.0f);
+}
+
+/** \brief Test vfloat4 zero. */
+TEST(vfloat4, Zero)
+{
+	vfloat4 a = vfloat4::zero();
+	EXPECT_EQ(a.lane<0>(), 0.0f);
+	EXPECT_EQ(a.lane<1>(), 0.0f);
+	EXPECT_EQ(a.lane<2>(), 0.0f);
+	EXPECT_EQ(a.lane<3>(), 0.0f);
+}
+
+/** \brief Test vfloat4 load1. */
+TEST(vfloat4, Load1)
+{
+	float s = 3.14f;
+	vfloat4 a = vfloat4::load1(&s);
+	EXPECT_EQ(a.lane<0>(), 3.14f);
+	EXPECT_EQ(a.lane<1>(), 3.14f);
+	EXPECT_EQ(a.lane<2>(), 3.14f);
+	EXPECT_EQ(a.lane<3>(), 3.14f);
+}
+
+/** \brief Test vfloat4 loada. */
+TEST(vfloat4, Loada)
+{
+	vfloat4 a(&(f32x4_data[0]));
+	EXPECT_EQ(a.lane<0>(), 0.0f);
+	EXPECT_EQ(a.lane<1>(), 1.0f);
+	EXPECT_EQ(a.lane<2>(), 2.0f);
+	EXPECT_EQ(a.lane<3>(), 3.0f);
+}
+
+/** \brief Test vfloat4 lane_id. */
+TEST(vfloat4, LaneID)
+{
+	vfloat4 a = vfloat4::lane_id();
+	EXPECT_EQ(a.lane<0>(), 0.0f);
+	EXPECT_EQ(a.lane<1>(), 1.0f);
+	EXPECT_EQ(a.lane<2>(), 2.0f);
+	EXPECT_EQ(a.lane<3>(), 3.0f);
+}
+
+/** \brief Test vfloat4 add. */
+TEST(vfloat4, vadd)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b(0.1f, 0.2f, 0.3f, 0.4f);
+	a = a + b;
+	EXPECT_EQ(a.lane<0>(), 1.0f + 0.1f);
+	EXPECT_EQ(a.lane<1>(), 2.0f + 0.2f);
+	EXPECT_EQ(a.lane<2>(), 3.0f + 0.3f);
+	EXPECT_EQ(a.lane<3>(), 4.0f + 0.4f);
+}
+
+/** \brief Test vfloat4 sub. */
+TEST(vfloat4, vsub)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b(0.1f, 0.2f, 0.3f, 0.4f);
+	a = a - b;
+	EXPECT_EQ(a.lane<0>(), 1.0f - 0.1f);
+	EXPECT_EQ(a.lane<1>(), 2.0f - 0.2f);
+	EXPECT_EQ(a.lane<2>(), 3.0f - 0.3f);
+	EXPECT_EQ(a.lane<3>(), 4.0f - 0.4f);
+}
+
+/** \brief Test vfloat4 mul. */
+TEST(vfloat4, vmul)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b(0.1f, 0.2f, 0.3f, 0.4f);
+	a = a * b;
+	EXPECT_EQ(a.lane<0>(), 1.0f * 0.1f);
+	EXPECT_EQ(a.lane<1>(), 2.0f * 0.2f);
+	EXPECT_EQ(a.lane<2>(), 3.0f * 0.3f);
+	EXPECT_EQ(a.lane<3>(), 4.0f * 0.4f);
+}
+
+/** \brief Test vfloat4 mul. */
+TEST(vfloat4, vsmul)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	float b = 3.14f;
+	a = a * b;
+	EXPECT_EQ(a.lane<0>(), 1.0f * 3.14f);
+	EXPECT_EQ(a.lane<1>(), 2.0f * 3.14f);
+	EXPECT_EQ(a.lane<2>(), 3.0f * 3.14f);
+	EXPECT_EQ(a.lane<3>(), 4.0f * 3.14f);
+}
+
+/** \brief Test vfloat4 mul. */
+TEST(vfloat4, svmul)
+{
+	float a = 3.14f;
+	vfloat4 b(1.0f, 2.0f, 3.0f, 4.0f);
+	b = a * b;
+	EXPECT_EQ(b.lane<0>(), 3.14f * 1.0f);
+	EXPECT_EQ(b.lane<1>(), 3.14f * 2.0f);
+	EXPECT_EQ(b.lane<2>(), 3.14f * 3.0f);
+	EXPECT_EQ(b.lane<3>(), 3.14f * 4.0f);
+}
+
+/** \brief Test vfloat4 div. */
+TEST(vfloat4, vdiv)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b(0.1f, 0.2f, 0.3f, 0.4f);
+	a = a / b;
+	EXPECT_EQ(a.lane<0>(), 1.0f / 0.1f);
+	EXPECT_EQ(a.lane<1>(), 2.0f / 0.2f);
+	EXPECT_EQ(a.lane<2>(), 3.0f / 0.3f);
+	EXPECT_EQ(a.lane<3>(), 4.0f / 0.4f);
+}
+
+/** \brief Test vfloat4 ceq. */
+TEST(vfloat4, ceq)
+{
+	vfloat4 a1(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b1(0.1f, 0.2f, 0.3f, 0.4f);
+	vmask r1 = a1 == b1;
+	EXPECT_EQ(0, mask(r1));
+	EXPECT_EQ(false, any(r1));
+	EXPECT_EQ(false, all(r1));
+
+	vfloat4 a2(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b2(1.0f, 0.2f, 0.3f, 0.4f);
+	vmask r2 = a2 == b2;
+	EXPECT_EQ(0x1, mask(r2));
+	EXPECT_EQ(true, any(r2));
+	EXPECT_EQ(false, all(r2));
+
+	vfloat4 a3(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b3(1.0f, 0.2f, 3.0f, 0.4f);
+	vmask r3 = a3 == b3;
+	EXPECT_EQ(0x5, mask(r3));
+	EXPECT_EQ(true, any(r3));
+	EXPECT_EQ(false, all(r3));
+
+	vfloat4 a4(1.0f, 2.0f, 3.0f, 4.0f);
+	vmask r4 = a4 == a4;
+	EXPECT_EQ(0xF, mask(r4));
+	EXPECT_EQ(true, any(r4));
+	EXPECT_EQ(true, all(r4));
+}
+
+/** \brief Test vfloat4 cne. */
+TEST(vfloat4, cne)
+{
+	vfloat4 a1(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b1(0.1f, 0.2f, 0.3f, 0.4f);
+	vmask r1 = a1 != b1;
+	EXPECT_EQ(0xF, mask(r1));
+	EXPECT_EQ(true, any(r1));
+	EXPECT_EQ(true, all(r1));
+
+	vfloat4 a2(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b2(1.0f, 0.2f, 0.3f, 0.4f);
+	vmask r2 = a2 != b2;
+	EXPECT_EQ(0xE, mask(r2));
+	EXPECT_EQ(true, any(r2));
+	EXPECT_EQ(false, all(r2));
+
+	vfloat4 a3(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b3(1.0f, 0.2f, 3.0f, 0.4f);
+	vmask r3 = a3 != b3;
+	EXPECT_EQ(0xA, mask(r3));
+	EXPECT_EQ(true, any(r3));
+	EXPECT_EQ(false, all(r3));
+
+	vfloat4 a4(1.0f, 2.0f, 3.0f, 4.0f);
+	vmask r4 = a4 != a4;
+	EXPECT_EQ(0, mask(r4));
+	EXPECT_EQ(false, any(r4));
+	EXPECT_EQ(false, all(r4));
+}
+
+/** \brief Test vfloat4 clt. */
+TEST(vfloat4, clt)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
+	vmask r = a < b;
+	EXPECT_EQ(0xA, mask(r));
+}
+
+/** \brief Test vfloat4 cle. */
+TEST(vfloat4, cle)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
+	vmask r = a <= b;
+	EXPECT_EQ(0xE, mask(r));
+}
+
+/** \brief Test vfloat4 cgt. */
+TEST(vfloat4, cgt)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
+	vmask r = a > b;
+	EXPECT_EQ(0x1, mask(r));
+}
+
+/** \brief Test vfloat4 cge. */
+TEST(vfloat4, cge)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
+	vmask r = a >= b;
+	EXPECT_EQ(0x5, mask(r));
+}
+
+/** \brief Test vfloat4 min. */
+TEST(vfloat4, min)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
+	vfloat4 r = min(a, b);
+	EXPECT_EQ(r.lane<0>(), 0.9f);
+	EXPECT_EQ(r.lane<1>(), 2.0f);
+	EXPECT_EQ(r.lane<2>(), 3.0f);
+	EXPECT_EQ(r.lane<3>(), 4.0f);
+}
+
+/** \brief Test vfloat4 max. */
+TEST(vfloat4, max)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
+	vfloat4 r = max(a, b);
+	EXPECT_EQ(r.lane<0>(), 1.0f);
+	EXPECT_EQ(r.lane<1>(), 2.1f);
+	EXPECT_EQ(r.lane<2>(), 3.0f);
+	EXPECT_EQ(r.lane<3>(), 4.1f);
+}
+
+/** \brief Test vfloat4 clamp. */
+TEST(vfloat4, clamp)
+{
+	vfloat4 a1(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 r1 = clamp(2.1f, 3.0f, a1);
+	EXPECT_EQ(r1.lane<0>(), 2.1f);
+	EXPECT_EQ(r1.lane<1>(), 2.1f);
+	EXPECT_EQ(r1.lane<2>(), 3.0f);
+	EXPECT_EQ(r1.lane<3>(), 3.0f);
+
+	vfloat4 a2(1.0f, 2.0f, qnan, 4.0f);
+	vfloat4 r2 = clamp(2.1f, 3.0f, a2);
+	EXPECT_EQ(r2.lane<0>(), 2.1f);
+	EXPECT_EQ(r2.lane<1>(), 2.1f);
+	EXPECT_EQ(r2.lane<2>(), 2.1f);
+	EXPECT_EQ(r2.lane<3>(), 3.0f);
+}
+
+/** \brief Test vfloat4 clampz. */
+TEST(vfloat4, clampz)
+{
+	vfloat4 a1(-1.0f, 0.0f, 0.1f, 4.0f);
+	vfloat4 r1 = clampz(3.0f, a1);
+	EXPECT_EQ(r1.lane<0>(), 0.0f);
+	EXPECT_EQ(r1.lane<1>(), 0.0f);
+	EXPECT_EQ(r1.lane<2>(), 0.1f);
+	EXPECT_EQ(r1.lane<3>(), 3.0f);
+
+	vfloat4 a2(-1.0f, 0.0f, qnan, 4.0f);
+	vfloat4 r2 = clampz(3.0f, a2);
+	EXPECT_EQ(r2.lane<0>(), 0.0f);
+	EXPECT_EQ(r2.lane<1>(), 0.0f);
+	EXPECT_EQ(r2.lane<2>(), 0.0f);
+	EXPECT_EQ(r2.lane<3>(), 3.0f);
+}
+
+/** \brief Test vfloat4 clampz. */
+TEST(vfloat4, clampzo)
+{
+	vfloat4 a1(-1.0f, 0.0f, 0.1f, 4.0f);
+	vfloat4 r1 = clampzo(a1);
+	EXPECT_EQ(r1.lane<0>(), 0.0f);
+	EXPECT_EQ(r1.lane<1>(), 0.0f);
+	EXPECT_EQ(r1.lane<2>(), 0.1f);
+	EXPECT_EQ(r1.lane<3>(), 1.0f);
+
+	vfloat4 a2(-1.0f, 0.0f, qnan, 4.0f);
+	vfloat4 r2 = clampzo(a2);
+	EXPECT_EQ(r2.lane<0>(), 0.0f);
+	EXPECT_EQ(r2.lane<1>(), 0.0f);
+	EXPECT_EQ(r2.lane<2>(), 0.0f);
+	EXPECT_EQ(r2.lane<3>(), 1.0f);
+}
+
+/** \brief Test vfloat4 abs. */
+TEST(vfloat4, abs)
+{
+	vfloat4 a(-1.0f, 0.0f, 0.1f, 4.0f);
+	vfloat4 r = abs(a);
+	EXPECT_EQ(r.lane<0>(), 1.0f);
+	EXPECT_EQ(r.lane<1>(), 0.0f);
+	EXPECT_EQ(r.lane<2>(), 0.1f);
+	EXPECT_EQ(r.lane<3>(), 4.0f);
+}
+
+/** \brief Test vfloat4 round. */
+TEST(vfloat4, round)
+{
+	vfloat4 a(1.1f, 1.5f, 1.6f, 4.0f);
+	vfloat4 r = round(a);
+	EXPECT_EQ(r.lane<0>(), 1.0f);
+	EXPECT_EQ(r.lane<1>(), 2.0f);
+	EXPECT_EQ(r.lane<2>(), 2.0f);
+	EXPECT_EQ(r.lane<3>(), 4.0f);
+}
+
+/** \brief Test vfloat4 hmin. */
+TEST(vfloat4, hmin)
+{
+	vfloat4 a1(1.1f, 1.5f, 1.6f, 4.0f);
+	vfloat4 r1 = hmin(a1);
+	EXPECT_EQ(r1.lane<0>(), 1.1f);
+	EXPECT_EQ(r1.lane<1>(), 1.1f);
+	EXPECT_EQ(r1.lane<2>(), 1.1f);
+	EXPECT_EQ(r1.lane<3>(), 1.1f);
+
+	vfloat4 a2(1.1f, 1.5f, 1.6f, 0.2f);
+	vfloat4 r2 = hmin(a2);
+	EXPECT_EQ(r2.lane<0>(), 0.2f);
+	EXPECT_EQ(r2.lane<1>(), 0.2f);
+	EXPECT_EQ(r2.lane<2>(), 0.2f);
+	EXPECT_EQ(r2.lane<3>(), 0.2f);
+}
+
+/** \brief Test vfloat4 sqrt. */
+TEST(vfloat4, sqrt)
+{
+	vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
+	vfloat4 r = sqrt(a);
+	EXPECT_EQ(r.lane<0>(), std::sqrt(1.0f));
+	EXPECT_EQ(r.lane<1>(), std::sqrt(2.0f));
+	EXPECT_EQ(r.lane<2>(), std::sqrt(3.0f));
+	EXPECT_EQ(r.lane<3>(), std::sqrt(4.0f));
+}
+
+/** \brief Test vfloat4 select. */
+TEST(vfloat4, select)
+{
+	vfloat4 m1(1.0f, 1.0f, 1.0f, 1.0f);
+	vfloat4 m2(1.0f, 2.0f, 1.0f, 2.0f);
+	vmask4 cond = m1 == m2;
+
+	vfloat4 a(1.0f, 3.0f, 3.0f, 1.0f);
+	vfloat4 b(4.0f, 2.0f, 2.0f, 4.0f);
+
+	// Select in one direction
+	vfloat4 r1 = select(a, b, cond);
+	EXPECT_EQ(r1.lane<0>(), 4.0f);
+	EXPECT_EQ(r1.lane<1>(), 3.0f);
+	EXPECT_EQ(r1.lane<2>(), 2.0f);
+	EXPECT_EQ(r1.lane<3>(), 1.0f);
+
+	// Select in the other
+	vfloat4 r2 = select(b, a, cond);
+	EXPECT_EQ(r2.lane<0>(), 1.0f);
+	EXPECT_EQ(r2.lane<1>(), 2.0f);
+	EXPECT_EQ(r2.lane<2>(), 3.0f);
+	EXPECT_EQ(r2.lane<3>(), 4.0f);
+}
+
+/** \brief Test vfloat4 select MSB only. */
+TEST(vfloat4, select_msb)
+{
+	vint4 msb(0x80000000u, 0, 0x80000000u, 0);
+	vmask4 cond(msb.m);
+
+	vfloat4 a(1.0f, 3.0f, 3.0f, 1.0f);
+	vfloat4 b(4.0f, 2.0f, 2.0f, 4.0f);
+
+	// Select in one direction
+	vfloat4 r1 = select(a, b, cond);
+	EXPECT_EQ(r1.lane<0>(), 4.0f);
+	EXPECT_EQ(r1.lane<1>(), 3.0f);
+	EXPECT_EQ(r1.lane<2>(), 2.0f);
+	EXPECT_EQ(r1.lane<3>(), 1.0f);
+
+	// Select in the other
+	vfloat4 r2 = select(b, a, cond);
+	EXPECT_EQ(r2.lane<0>(), 1.0f);
+	EXPECT_EQ(r2.lane<1>(), 2.0f);
+	EXPECT_EQ(r2.lane<2>(), 3.0f);
+	EXPECT_EQ(r2.lane<3>(), 4.0f);
+}
+
+/** \brief Test vfloat4 gatherf. */
+TEST(vfloat4, gatherf)
+{
+	vint4 indices(0, 4, 3, 2);
+	vfloat4 r = gatherf(f32x4_data, indices);
+	EXPECT_EQ(r.lane<0>(), 0.0f);
+	EXPECT_EQ(r.lane<1>(), 4.0f);
+	EXPECT_EQ(r.lane<2>(), 3.0f);
+	EXPECT_EQ(r.lane<3>(), 2.0f);
+}
+
+/** \brief Test vfloat4 store. */
+TEST(vfloat4, store)
+{
+	alignas(16) float out[5];
+	vfloat4 a(f32x4_data);
+	store(a, &(out[1]));
+	EXPECT_EQ(out[1], 0.0f);
+	EXPECT_EQ(out[2], 1.0f);
+	EXPECT_EQ(out[3], 2.0f);
+	EXPECT_EQ(out[4], 3.0f);
+}
+
+/** \brief Test vfloat4 storea. */
+TEST(vfloat4, storea)
+{
+	alignas(16) float out[4];
+	vfloat4 a(f32x4_data);
+	store(a, out);
+	EXPECT_EQ(out[0], 0.0f);
+	EXPECT_EQ(out[1], 1.0f);
+	EXPECT_EQ(out[2], 2.0f);
+	EXPECT_EQ(out[3], 3.0f);
+}
+
+/** \brief Test vfloat4 dot. */
+TEST(vfloat4, dot)
+{
+	vfloat4 a(1.0f, 2.0f, 4.0f, 8.0f);
+	vfloat4 b(1.0f, 0.5f, 0.25f, 0.125f);
+	vfloat4 r = dot(a, b);
+	EXPECT_EQ(r.lane<0>(), 4.0f);
+	EXPECT_EQ(r.lane<1>(), 4.0f);
+	EXPECT_EQ(r.lane<2>(), 4.0f);
+	EXPECT_EQ(r.lane<3>(), 4.0f);
+}
+
+/** \brief Test vfloat4 float_to_int. */
+TEST(vfloat4, float_to_int)
+{
+	vfloat4 a(1.1f, 1.5f, 1.6f, 4.0f);
+	vint4 r = float_to_int(a);
+	EXPECT_EQ(r.lane<0>(), 1);
+	EXPECT_EQ(r.lane<1>(), 1);
+	EXPECT_EQ(r.lane<2>(), 1);
+	EXPECT_EQ(r.lane<3>(), 4);
+}
+
+/** \brief Test vfloat4 round. */
+TEST(vfloat4, float_to_int_rtn)
+{
+	vfloat4 a(1.1f, 1.5f, 1.6f, 4.0f);
+	vint4 r = float_to_int_rtn(a);
+	EXPECT_EQ(r.lane<0>(), 1);
+	EXPECT_EQ(r.lane<1>(), 2);
+	EXPECT_EQ(r.lane<2>(), 2);
+	EXPECT_EQ(r.lane<3>(), 4);
+}
+
+
+// VINT4 tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/** \brief Test unaligned vint4 data load. */
+TEST(vint4, UnalignedLoad)
+{
+	vint4 a(&(s32x4_data[1]));
+	EXPECT_EQ(a.lane<0>(), 1);
+	EXPECT_EQ(a.lane<1>(), 2);
+	EXPECT_EQ(a.lane<2>(), 3);
+	EXPECT_EQ(a.lane<3>(), 4);
+}
+
+/** \brief Test scalar duplicated vint4 load. */
+TEST(vint4, ScalarDupLoad)
+{
+	vint4 a(42);
+	EXPECT_EQ(a.lane<0>(), 42);
+	EXPECT_EQ(a.lane<1>(), 42);
+	EXPECT_EQ(a.lane<2>(), 42);
+	EXPECT_EQ(a.lane<3>(), 42);
+}
+
+/** \brief Test scalar vint4 load. */
+TEST(vint4, ScalarLoad)
+{
+	vint4 a(11, 22, 33, 44);
+	EXPECT_EQ(a.lane<0>(), 11);
+	EXPECT_EQ(a.lane<1>(), 22);
+	EXPECT_EQ(a.lane<2>(), 33);
+	EXPECT_EQ(a.lane<3>(), 44);
+}
+
+/** \brief Test copy vint4 load. */
+TEST(vint4, CopyLoad)
+{
+	vint4 s(11, 22, 33, 44);
+	vint4 a(s.m);
+	EXPECT_EQ(a.lane<0>(), 11);
+	EXPECT_EQ(a.lane<1>(), 22);
+	EXPECT_EQ(a.lane<2>(), 33);
+	EXPECT_EQ(a.lane<3>(), 44);
+}
+
+/** \brief Test vint4 lane_id. */
+TEST(vint4, LaneID)
+{
+	vint4 a = vint4::lane_id();
+	EXPECT_EQ(a.lane<0>(), 0);
+	EXPECT_EQ(a.lane<1>(), 1);
+	EXPECT_EQ(a.lane<2>(), 2);
+	EXPECT_EQ(a.lane<3>(), 3);
+}
+
+/** \brief Test vint4 add. */
+TEST(vint4, vadd)
+{
+	vint4 a(1, 2, 3, 4);
+	vint4 b(2, 3, 4, 5);
+	a = a + b;
+	EXPECT_EQ(a.lane<0>(), 1 + 2);
+	EXPECT_EQ(a.lane<1>(), 2 + 3);
+	EXPECT_EQ(a.lane<2>(), 3 + 4);
+	EXPECT_EQ(a.lane<3>(), 4 + 5);
+}
+
+/** \brief Test vint4 sub. */
+TEST(vint4, vsub)
+{
+	vint4 a(1, 2, 4, 4);
+	vint4 b(2, 3, 3, 5);
+	a = a - b;
+	EXPECT_EQ(a.lane<0>(), 1 - 2);
+	EXPECT_EQ(a.lane<1>(), 2 - 3);
+	EXPECT_EQ(a.lane<2>(), 4 - 3);
+	EXPECT_EQ(a.lane<3>(), 4 - 5);
+}
+
+/** \brief Test vint4 bitwise invert. */
+TEST(vint4, bit_invert)
+{
+	vint4 a(-1, 0, 1, 2);
+	a = ~a;
+	EXPECT_EQ(a.lane<0>(), ~-1);
+	EXPECT_EQ(a.lane<1>(), ~0);
+	EXPECT_EQ(a.lane<2>(), ~1);
+	EXPECT_EQ(a.lane<3>(), ~2);
+}
+
+/** \brief Test vint4 bitwise or. */
+TEST(vint4, bit_vor)
+{
+	vint4 a(1, 2, 3, 4);
+	vint4 b(2, 3, 4, 5);
+	a = a | b;
+	EXPECT_EQ(a.lane<0>(), 3);
+	EXPECT_EQ(a.lane<1>(), 3);
+	EXPECT_EQ(a.lane<2>(), 7);
+	EXPECT_EQ(a.lane<3>(), 5);
+}
+
+/** \brief Test vint4 bitwise and. */
+TEST(vint4, bit_vand)
+{
+	vint4 a(1, 2, 3, 4);
+	vint4 b(2, 3, 4, 5);
+	a = a & b;
+	EXPECT_EQ(a.lane<0>(), 0);
+	EXPECT_EQ(a.lane<1>(), 2);
+	EXPECT_EQ(a.lane<2>(), 0);
+	EXPECT_EQ(a.lane<3>(), 4);
+}
+
+/** \brief Test vint4 bitwise xor. */
+TEST(vint4, bit_vxor)
+{
+	vint4 a(1, 2, 3, 4);
+	vint4 b(2, 3, 4, 5);
+	a = a ^ b;
+	EXPECT_EQ(a.lane<0>(), 3);
+	EXPECT_EQ(a.lane<1>(), 1);
+	EXPECT_EQ(a.lane<2>(), 7);
+	EXPECT_EQ(a.lane<3>(), 1);
+}
+
+/** \brief Test vint4 ceq. */
+TEST(vint4, ceq)
+{
+	vint4 a1(1, 2, 3, 4);
+	vint4 b1(0, 1, 2, 3);
+	vmask r1 = a1 == b1;
+	EXPECT_EQ(0, mask(r1));
+	EXPECT_EQ(false, any(r1));
+	EXPECT_EQ(false, all(r1));
+
+	vint4 a2(1, 2, 3, 4);
+	vint4 b2(1, 0, 0, 0);
+	vmask r2 = a2 == b2;
+	EXPECT_EQ(0x1, mask(r2));
+	EXPECT_EQ(true, any(r2));
+	EXPECT_EQ(false, all(r2));
+
+	vint4 a3(1, 2, 3, 4);
+	vint4 b3(1, 0, 3, 0);
+	vmask r3 = a3 == b3;
+	EXPECT_EQ(0x5, mask(r3));
+	EXPECT_EQ(true, any(r3));
+	EXPECT_EQ(false, all(r3));
+
+	vint4 a4(1, 2, 3, 4);
+	vmask r4 = a4 == a4;
+	EXPECT_EQ(0xF, mask(r4));
+	EXPECT_EQ(true, any(r4));
+	EXPECT_EQ(true, all(r4));
+}
+
+/** \brief Test vint4 cne. */
+TEST(vint4, cne)
+{
+	vint4 a1(1, 2, 3, 4);
+	vint4 b1(0, 1, 2, 3);
+	vmask r1 = a1 != b1;
+	EXPECT_EQ(0xF, mask(r1));
+	EXPECT_EQ(true, any(r1));
+	EXPECT_EQ(true, all(r1));
+
+	vint4 a2(1, 2, 3, 4);
+	vint4 b2(1, 0, 0, 0);
+	vmask r2 = a2 != b2;
+	EXPECT_EQ(0xE, mask(r2));
+	EXPECT_EQ(true, any(r2));
+	EXPECT_EQ(false, all(r2));
+
+	vint4 a3(1, 2, 3, 4);
+	vint4 b3(1, 0, 3, 0);
+	vmask r3 = a3 != b3;
+	EXPECT_EQ(0xA, mask(r3));
+	EXPECT_EQ(true, any(r3));
+	EXPECT_EQ(false, all(r3));
+
+	vint4 a4(1, 2, 3, 4);
+	vmask r4 = a4 != a4;
+	EXPECT_EQ(0, mask(r4));
+	EXPECT_EQ(false, any(r4));
+	EXPECT_EQ(false, all(r4));
+}
+
+/** \brief Test vint4 clt. */
+TEST(vint4, clt)
+{
+	vint4 a(1, 2, 3, 4);
+	vint4 b(0, 3, 3, 5);
+	vmask r = a < b;
+	EXPECT_EQ(0xA, mask(r));
+}
+
+/** \brief Test vint4 cgt. */
+TEST(vint4, cle)
+{
+	vint4 a(1, 2, 3, 4);
+	vint4 b(0, 3, 3, 5);
+	vmask r = a > b;
+	EXPECT_EQ(0x1, mask(r));
+}
+
+/** \brief Test vint4 min. */
+TEST(vint4, min)
+{
+	vint4 a(1, 2, 3, 4);
+	vint4 b(0, 3, 3, 5);
+	vint4 r = min(a, b);
+	EXPECT_EQ(r.lane<0>(), 0);
+	EXPECT_EQ(r.lane<1>(), 2);
+	EXPECT_EQ(r.lane<2>(), 3);
+	EXPECT_EQ(r.lane<3>(), 4);
+}
+
+/** \brief Test vint4 max. */
+TEST(vint4, max)
+{
+	vint4 a(1, 2, 3, 4);
+	vint4 b(0, 3, 3, 5);
+	vint4 r = max(a, b);
+	EXPECT_EQ(r.lane<0>(), 1);
+	EXPECT_EQ(r.lane<1>(), 3);
+	EXPECT_EQ(r.lane<2>(), 3);
+	EXPECT_EQ(r.lane<3>(), 5);
+}
+
+/** \brief Test vint4 hmin. */
+TEST(vint4, hmin)
+{
+	vint4 a1(1, 2, 1, 2);
+	vint4 r1 = hmin(a1);
+	EXPECT_EQ(r1.lane<0>(), 1);
+	EXPECT_EQ(r1.lane<1>(), 1);
+	EXPECT_EQ(r1.lane<2>(), 1);
+	EXPECT_EQ(r1.lane<3>(), 1);
+
+	vint4 a2(1, 2, -1, 5);
+	vint4 r2 = hmin(a2);
+	EXPECT_EQ(r2.lane<0>(), -1);
+	EXPECT_EQ(r2.lane<1>(), -1);
+	EXPECT_EQ(r2.lane<2>(), -1);
+	EXPECT_EQ(r2.lane<3>(), -1);
+}
+
+/** \brief Test vint4 storea. */
+TEST(vint4, storea)
+{
+	alignas(16) int out[4];
+	vint4 a(s32x4_data);
+	storea(a, out);
+	EXPECT_EQ(out[0], 0);
+	EXPECT_EQ(out[1], 1);
+	EXPECT_EQ(out[2], 2);
+	EXPECT_EQ(out[3], 3);
+}
+
+/** \brief Test vint4 store_nbytes. */
+TEST(vint4, store_nbytes)
+{
+	alignas(16) int out;
+	vint4 a(42, 314, 75, 90);
+	store_nbytes(a, (uint8_t*)&out);
+	EXPECT_EQ(out, 42);
+}
+
+/** \brief Test vint4 gatheri. */
+TEST(vint4, gatheri)
+{
+	vint4 indices(0, 4, 3, 2);
+	vint4 r = gatheri(s32x4_data, indices);
+	EXPECT_EQ(r.lane<0>(), 0);
+	EXPECT_EQ(r.lane<1>(), 4);
+	EXPECT_EQ(r.lane<2>(), 3);
+	EXPECT_EQ(r.lane<3>(), 2);
+}
+
+/** \brief Test vint4 pack_low_bytes. */
+TEST(vint4, pack_low_bytes)
+{
+	vint4 a(1, 2, 3, 4);
+	vint4 r = pack_low_bytes(a);
+	EXPECT_EQ(r.lane<0>(), (4 << 24) | (3 << 16) | (2  << 8) | (1 << 0));
+}
+
+/** \brief Test vint4 select. */
+TEST(vint4, select)
+{
+	vint4 m1(1, 1, 1, 1);
+	vint4 m2(1, 2, 1, 2);
+	vmask4 cond = m1 == m2;
+
+	vint4 a(1, 3, 3, 1);
+	vint4 b(4, 2, 2, 4);
+
+	vint4 r1 = select(a, b, cond);
+	EXPECT_EQ(r1.lane<0>(), 4);
+	EXPECT_EQ(r1.lane<1>(), 3);
+	EXPECT_EQ(r1.lane<2>(), 2);
+	EXPECT_EQ(r1.lane<3>(), 1);
+
+	vint4 r2 = select(b, a, cond);
+	EXPECT_EQ(r2.lane<0>(), 1);
+	EXPECT_EQ(r2.lane<1>(), 2);
+	EXPECT_EQ(r2.lane<2>(), 3);
+	EXPECT_EQ(r2.lane<3>(), 4);
+}
+
+/** \brief Test vint4 select MSB. */
+TEST(vint4, select_msb)
+{
+	vint4 msb(0x80000000u, 0, 0x80000000u, 0);
+	vmask4 cond(msb.m);
+
+	vint4 a(1, 3, 3, 1);
+	vint4 b(4, 2, 2, 4);
+
+	vint4 r1 = select(a, b, cond);
+	EXPECT_EQ(r1.lane<0>(), 4);
+	EXPECT_EQ(r1.lane<1>(), 3);
+	EXPECT_EQ(r1.lane<2>(), 2);
+	EXPECT_EQ(r1.lane<3>(), 1);
+
+	vint4 r2 = select(b, a, cond);
+	EXPECT_EQ(r2.lane<0>(), 1);
+	EXPECT_EQ(r2.lane<1>(), 2);
+	EXPECT_EQ(r2.lane<2>(), 3);
+	EXPECT_EQ(r2.lane<3>(), 4);
+}
+
+// VMASK4 tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+/** \brief Test vmask4 or. */
+TEST(vmask4, or)
+{
+	vfloat4 m1a(0, 1, 0, 1);
+	vfloat4 m1b(1, 1, 1, 1);
+	vmask4 m1 = m1a == m1b;
+
+	vfloat4 m2a(1, 1, 0, 0);
+	vfloat4 m2b(1, 1, 1, 1);
+	vmask4 m2 = m2a == m2b;
+
+	vmask4 r = m1 | m2;
+	EXPECT_EQ(mask(r), 0xB);
+}
+
+/** \brief Test vmask4 and. */
+TEST(vmask4, and)
+{
+	vfloat4 m1a(0, 1, 0, 1);
+	vfloat4 m1b(1, 1, 1, 1);
+	vmask4 m1 = m1a == m1b;
+
+	vfloat4 m2a(1, 1, 0, 0);
+	vfloat4 m2b(1, 1, 1, 1);
+	vmask4 m2 = m2a == m2b;
+
+	vmask4 r = m1 & m2;
+	EXPECT_EQ(mask(r), 0x2);
+}
+
+/** \brief Test vmask4 xor. */
+TEST(vmask4, xor)
+{
+	vfloat4 m1a(0, 1, 0, 1);
+	vfloat4 m1b(1, 1, 1, 1);
+	vmask4 m1 = m1a == m1b;
+
+	vfloat4 m2a(1, 1, 0, 0);
+	vfloat4 m2b(1, 1, 1, 1);
+	vmask4 m2 = m2a == m2b;
+
+	vmask4 r = m1 ^ m2;
+	EXPECT_EQ(mask(r), 0x9);
+}
+
+/** \brief Test vmask4 not. */
+TEST(vmask4, not)
+{
+	vfloat4 m1a(0, 1, 0, 1);
+	vfloat4 m1b(1, 1, 1, 1);
+	vmask4 m1 = m1a == m1b;
+	vmask r = ~m1;
+	EXPECT_EQ(mask(r), 0x5);
+}
+
+#endif
+
+}
diff --git a/Source/astcenc_internal.h b/Source/astcenc_internal.h
index 814c9c0..6ee35d1 100644
--- a/Source/astcenc_internal.h
+++ b/Source/astcenc_internal.h
@@ -63,6 +63,14 @@
   #endif
 #endif
 
+#ifndef ASTCENC_NEON
+  #if defined(__aarch64__)
+    #define ASTCENC_NEON 1
+  #else
+    #define ASTCENC_NEON 0
+  #endif
+#endif
+
 #if ASTCENC_AVX
   #define ASTCENC_VECALIGN 32
 #else
diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h
index 57b9e3c..1edc5af 100644
--- a/Source/astcenc_vecmathlib.h
+++ b/Source/astcenc_vecmathlib.h
@@ -32,7 +32,7 @@
  * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
  * types. These are provided for use by VLA code, but are also expected to be
  * used as a fixed-width type and will supported a reference C++ fallback for
- * use on platforms without SIMD intrinsics (TODO: not yet implemented).
+ * use on platforms without SIMD intrinsics.
  *
  * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
  * types. These are provide for use by VLA code, and are not expected to be
@@ -42,9 +42,10 @@
  * With the current implementation ISA support is provided for:
  *
  *     * 1-wide for scalar reference.
- *     * 4-wide for SSE2.
- *     * 4-wide for SSE4.1.
- *     * 8-wide for AVX2.
+ *     * 4-wide for Armv8-A NEON.
+ *     * 4-wide for x86-64 SSE2.
+ *     * 4-wide for x86-64 SSE4.1.
+ *     * 8-wide for x86-64 AVX2.
  *
  */
 
@@ -53,6 +54,8 @@
 
 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
 	#include <immintrin.h>
+#elif ASTCENC_NEON != 0
+	#include <arm_neon.h>
 #endif
 
 #if defined(_MSC_VER)
@@ -64,7 +67,7 @@
 #endif
 
 #if ASTCENC_AVX >= 2
-	/* If we have AVX2 expose 8-wide VLA, and 4-wide fixed width. */
+	/* If we have AVX2 expose 8-wide VLA. */
 	#include "astcenc_vecmathlib_avx2_8.h"
 	#include "astcenc_vecmathlib_sse_4.h"
 
@@ -89,6 +92,20 @@
 
 	constexpr auto loada = vfloat4::loada;
 	constexpr auto load1 = vfloat4::load1;
+
+#elif ASTCENC_NEON > 0
+	/* If we have NEON expose 4-wide VLA. */
+	#include "astcenc_vecmathlib_neon_4.h"
+
+	#define ASTCENC_SIMD_WIDTH 4
+
+	using vfloat = vfloat4;
+	using vint = vint4;
+	using vmask = vmask4;
+
+	constexpr auto loada = vfloat4::loada;
+	constexpr auto load1 = vfloat4::load1;
+
 #else
 	/* If we have nothing expose 1-wide VLA, and 4-wide fixed width. */
 	#include "astcenc_vecmathlib_none_1.h"
diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h
new file mode 100755
index 0000000..c5b9d60
--- /dev/null
+++ b/Source/astcenc_vecmathlib_neon_4.h
@@ -0,0 +1,799 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2020 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief 4x32-bit vectors, implemented using Armv8-A NEON.
+ *
+ * This module implements 4-wide 32-bit float, int, and mask vectors for
+ * Armv8-A NEON.
+ *
+ * There is a baseline level of functionality provided by all vector widths and
+ * implementations. This is implemented using identical function signatures,
+ * modulo data type, so we can use them as substitutable implementations in VLA
+ * code.
+ *
+ * The 4-wide vectors are also used as a fixed-width type, and significantly
+ * extend the functionality above that available to VLA code.
+ */
+
+#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
+#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <cstdio>
+
+// ============================================================================
+// vfloat4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide floats.
+ */
+struct vfloat4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vfloat4() {}
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
+	{
+		m = vld1q_f32(p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
+	{
+		m = vdupq_n_f32(a);
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
+	{
+		float32x4_t v { a, b, c, d };
+		m = v;
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Get the scalar value of a single lane.
+	 *
+	 * TODO: Can we do better for lane0, which is the common case for VLA?
+	 */
+	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	{
+		return vgetq_lane_f32(m, l);
+	}
+
+	/**
+	 * @brief Set the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
+	{
+		m = vld1q_lane_f32(&a, m, l);
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 zero()
+	{
+		return vfloat4(vdupq_n_f32(0.0f));
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
+	{
+		return vfloat4(vdupq_n_f32(*p));
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 16B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
+	{
+		return vfloat4(vld1q_f32(p));
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
+	{
+		alignas(16) float data[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
+		return vfloat4(vld1q_f32(data));
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	float32x4_t m;
+};
+
+// ============================================================================
+// vint4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide ints.
+ */
+struct vint4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vint4() {}
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const int *p)
+	{
+		m = vld1q_s32(p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using vfloat4::zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a)
+	{
+		m = vdupq_n_s32(a);
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
+	{
+		int32x4_t v = { a, b, c, d };
+		m = v;
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Get the scalar from a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE int lane() const
+	{
+		return vgetq_lane_s32(m, l);
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 lane_id()
+	{
+		alignas(ASTCENC_VECALIGN) int data[4] = { 0, 1, 2, 3 };
+		return vint4(vld1q_s32(data));
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	int32x4_t m;
+};
+
+// ============================================================================
+// vmask4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide control plane masks.
+ */
+struct vmask4
+{
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
+	{
+		m = vreinterpretq_u32_s32(a);
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	uint32x4_t m;
+};
+
+// ============================================================================
+// vmask4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: mask union (or).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
+{
+	return vmask4(vorrq_u32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask intersect (and).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
+{
+	return vmask4(vandq_u32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask difference (xor).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
+{
+	return vmask4(veorq_u32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask invert (not).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
+{
+	return vmask4(vmvnq_u32(a.m));
+}
+
+/**
+ * @brief Return a 4-bit mask code indicating mask status.
+ *
+ * bit0 = lane 0
+ */
+ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
+{
+	int32x4_t shift = { 0, 1, 2, 3 };
+	uint32x4_t tmp = vshrq_n_u32(a.m, 31);
+	return vaddvq_u32(vshlq_u32(tmp, shift));
+}
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
+}
+
+// ============================================================================
+// vint4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
+{
+	return vint4(vaddq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
+{
+	return vint4(vsubq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector bit invert.
+ */
+ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
+{
+	return vint4(vmvnq_s32(a.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
+{
+	return vint4(vorrq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
+{
+	return vint4(vandq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
+{
+	return vint4(veorq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
+{
+	return vmask4(vceqq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
+{
+	return ~vmask4(vceqq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
+{
+	return vmask4(vcltq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
+{
+	return vmask4(vcgtq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
+{
+	return vint4(vminq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
+{
+	return vint4(vmaxq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
+{
+	return vint4(vminvq_s32(a.m));
+}
+
+/**
+ * @brief Store a vector to a 16B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
+{
+	vst1q_s32(p, a.m);
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
+{
+	vst1q_lane_s32((int32_t*)p, a.m, 0);
+}
+
+/**
+ * @brief Gather N (vector width) indices from the array.
+ */
+ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
+{
+	alignas(16) int idx[4];
+	storea(indices, idx);
+	alignas(16) int vals[4];
+	vals[0] = base[idx[0]];
+	vals[1] = base[idx[1]];
+	vals[2] = base[idx[2]];
+	vals[3] = base[idx[3]];
+	return vint4(vals);
+}
+
+/**
+ * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
+ */
+ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+{
+	alignas(16) uint8_t shuf[16] = {
+		0, 4, 8, 12,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0
+	};
+	uint8x16_t idx = vld1q_u8(shuf);
+	int8x16_t av = vreinterpretq_s8_s32(a.m);
+    return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
+}
+
+/**
+ * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
+{
+	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
+	uint32x4_t mask = vcgeq_u32(cond.m, msb);
+	return vint4(vbslq_s32(mask, b.m, a.m));
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void print(vint4 a)
+{
+	alignas(16) int v[4];
+	storea(a, v);
+	printf("v4_i32:\n  %8u %8u %8u %8u\n",
+	       v[0], v[1], v[2], v[3]);
+}
+
+// ============================================================================
+// vfloat4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vaddq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vsubq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vmulq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
+{
+	float32x4_t bv = vld1q_dup_f32(&b);
+	return vfloat4(vmulq_f32(a.m, bv));
+}
+
+/**
+ * @brief Overload: scalar by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
+{
+	float32x4_t av = vld1q_dup_f32(&a);
+	return vfloat4(vmulq_f32(av, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vdivq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vceqq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcltq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcgtq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcleq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcgeq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return vfloat4(vminnmq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return vfloat4(vmaxnmq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Return the clamped value between min and max.
+ *
+ * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
+ * then @c min will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clamp(float min, float max, vfloat4 a)
+{
+	float32x4_t minv = vdupq_n_f32(min);
+	float32x4_t maxv = vdupq_n_f32(max);
+	return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
+}
+
+/**
+ * @brief Return a clamped value between 0.0f and max.
+ *
+ * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
+ * be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clampz(float max, vfloat4 a)
+{
+	// Do not reorder - second operand will return if either is NaN
+	float32x4_t minv = vdupq_n_f32(0.0f);
+	float32x4_t maxv = vdupq_n_f32(max);
+	return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
+}
+
+/**
+ * @brief Return a clamped value between 0.0f and 1.0f.
+ *
+ * If @c a is NaN then zero will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
+{
+	float32x4_t minv = vdupq_n_f32(0.0f);
+	float32x4_t maxv = vdupq_n_f32(1.0f);
+	return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
+}
+
+/**
+ * @brief Return the absolute value of the float vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
+{
+	float32x4_t zero = vdupq_n_f32(0.0f);
+	float32x4_t inv = vsubq_f32(zero, a.m);
+	return vfloat4(vmaxq_f32(a.m, inv));
+}
+
+/**
+ * @brief Return a float rounded to the nearest integer value.
+ *
+ * TODO: Can we do a better fallback here, if we exploit the fact that we
+ *       can assume that values are positive?
+ */
+ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
+{
+	return vfloat4(vrndnq_f32(a.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
+{
+	return vfloat4(vminvq_f32(a.m));
+}
+
+/**
+ * @brief Return the sqrt of the lanes in the vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
+{
+	return vfloat4(vsqrtq_f32(a.m));
+}
+
+/**
+ * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
+{
+	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
+	uint32x4_t mask = vcgeq_u32(cond.m, msb);
+	return vfloat4(vbslq_f32(mask, b.m, a.m));
+}
+
+/**
+ * @brief Load a vector of gathered results from an array;
+ */
+ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
+{
+	alignas(16) int idx[4];
+	storea(indices, idx);
+	alignas(16) float vals[4];
+	vals[0] = base[idx[0]];
+	vals[1] = base[idx[1]];
+	vals[2] = base[idx[2]];
+	vals[3] = base[idx[3]];
+	return vfloat4(vals);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
+{
+	vst1q_f32(p, a.m);
+}
+
+/**
+ * @brief Store a vector to a 16B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
+{
+	vst1q_f32(p, a.m);
+}
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vaddvq_f32(vmulq_f32(a.m, b.m)));
+}
+
+/**
+ * @brief Return a integer value for a float vector, using truncation.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
+{
+	return vint4(vcvtq_s32_f32(a.m));
+}
+
+/**
+ * @brief Return a integer value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
+{
+	a = round(a);
+	return vint4(vcvtq_s32_f32(a.m));
+}
+
+/**
+ * @brief Return a float value as an integer bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the first half of that flip.
+ */
+ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
+{
+	return vint4(vreinterpretq_s32_f32(a.m));
+}
+
+/**
+ * @brief Return a integer value as a float bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the second half of that flip.
+ */
+ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
+{
+	return vfloat4(vreinterpretq_f32_s32(v.m));
+}
+
+/**
+ * @brief Debug function to print a vector of floats.
+ */
+ASTCENC_SIMD_INLINE void print(vfloat4 a)
+{
+	alignas(16) float v[4];
+	storea(a, v);
+	printf("v4_f32:\n  %0.4f %0.4f %0.4f %0.4f\n",
+	       (double)v[0], (double)v[1], (double)v[2], (double)v[3]);
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake
index b14c02a..8ce4137 100644
--- a/Source/cmake_core.cmake
+++ b/Source/cmake_core.cmake
@@ -110,17 +110,29 @@ endif()
 if(${ISA_SIMD} MATCHES "none")
     target_compile_definitions(astcenc-${ISA_SIMD}
         PRIVATE
+            ASTCENC_NEON=0
             ASTCENC_SSE=0
             ASTCENC_AVX=0
             ASTCENC_POPCNT=0)
 
-    target_compile_options(astcenc-${ISA_SIMD}
+    if (${ARCH} MATCHES x64)
+        target_compile_options(astcenc-${ISA_SIMD}
+            PRIVATE
+                $<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
+    endif()
+
+elseif(${ISA_SIMD} MATCHES "neon")
+    target_compile_definitions(astcenc-${ISA_SIMD}
         PRIVATE
-            $<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
+            ASTCENC_NEON=1
+            ASTCENC_SSE=0
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0)
 
 elseif(${ISA_SIMD} MATCHES "sse2")
     target_compile_definitions(astcenc-${ISA_SIMD}
         PRIVATE
+            ASTCENC_NEON=0
             ASTCENC_SSE=20
             ASTCENC_AVX=0
             ASTCENC_POPCNT=0)
@@ -132,6 +144,7 @@ elseif(${ISA_SIMD} MATCHES "sse2")
 elseif(${ISA_SIMD} MATCHES "sse4.1")
     target_compile_definitions(astcenc-${ISA_SIMD}
         PRIVATE
+            ASTCENC_NEON=0
             ASTCENC_SSE=41
             ASTCENC_AVX=0
             ASTCENC_POPCNT=1)
@@ -143,6 +156,7 @@ elseif(${ISA_SIMD} MATCHES "sse4.1")
 elseif(${ISA_SIMD} MATCHES "avx2")
     target_compile_definitions(astcenc-${ISA_SIMD}
         PRIVATE
+            ASTCENC_NEON=0
             ASTCENC_SSE=41
             ASTCENC_AVX=2
             ASTCENC_POPCNT=1)
diff --git a/Test/astc_test_image.py b/Test/astc_test_image.py
index 4441ef5..5492375 100644
--- a/Test/astc_test_image.py
+++ b/Test/astc_test_image.py
@@ -285,23 +285,30 @@ def get_encoder_params(encoderName, referenceName, imageSet):
         outDir = "Test/Images/%s" % imageSet
         refName = None
     # Latest master
+    elif encoderName == "ref-master-neon":
+        # Warning: this option rebuilds a new reference test result for the
+        # master branch using the user's locally build encoder.
+        encoder = te.Encoder2x("neon")
+        name = "reference-master-neon"
+        outDir = "Test/Images/%s" % imageSet
+        refName = None
     elif encoderName == "ref-master-sse2":
         # Warning: this option rebuilds a new reference test result for the
-        # master branch using the user's locally build encoder in ./Source.
+        # master branch using the user's locally build encoder.
         encoder = te.Encoder2x("sse2")
         name = "reference-master-sse2"
         outDir = "Test/Images/%s" % imageSet
         refName = None
     elif encoderName == "ref-master-sse4.1":
         # Warning: this option rebuilds a new reference test result for the
-        # master branch using the user's locally build encoder in ./Source.
+        # master branch using the user's locally build encoder.
         encoder = te.Encoder2x("sse4.1")
         name = "reference-master-sse4.1"
         outDir = "Test/Images/%s" % imageSet
         refName = None
     elif encoderName == "ref-master-avx2":
         # Warning: this option rebuilds a new reference test result for the
-        # master branch using the user's locally build encoder in ./Source.
+        # master branch using the user's locally build encoder.
         encoder = te.Encoder2x("avx2")
         name = "reference-master-avx2"
         outDir = "Test/Images/%s" % imageSet
@@ -327,8 +334,8 @@ def parse_command_line():
     refcoders = ["ref-1.7",
                  "ref-2.0-sse2", "ref-2.0-sse4.1", "ref-2.0-avx2",
                  "ref-2.1-sse2", "ref-2.1-sse4.1", "ref-2.1-avx2",
-                 "ref-master-sse2", "ref-master-sse4.1", "ref-master-avx2"]
-    testcoders = ["none", "sse2", "sse4.1", "avx2"]
+                 "ref-master-neon", "ref-master-sse2", "ref-master-sse4.1", "ref-master-avx2"]
+    testcoders = ["none", "neon", "sse2", "sse4.1", "avx2"]
     coders = refcoders + testcoders + ["all", "all-ref"]
     parser.add_argument("--encoder", dest="encoders", default="avx2",
                         choices=coders, help="test encoder variant")
diff --git a/Test/testlib/image.py b/Test/testlib/image.py
index a466114..41320f9 100644
--- a/Test/testlib/image.py
+++ b/Test/testlib/image.py
@@ -28,6 +28,7 @@ The directory path is structured:
 
 from collections.abc import Iterable
 import os
+import re
 import subprocess as sp
 
 from PIL import Image as PILImage
@@ -35,6 +36,39 @@ from PIL import Image as PILImage
 import testlib.misc as misc
 
 
+CONVERT_BINARY = ["convert"]
+
+
+g_ConvertVersion = None
+
+
+def get_convert_version():
+    """
+    Get the major/minor version of ImageMagick on the system.
+    """
+    global g_ConvertVersion
+
+    if g_ConvertVersion is None:
+        command = list(CONVERT_BINARY)
+        command += ["--version"]
+        result = sp.run(command, stdout=sp.PIPE, stderr=sp.PIPE,
+                        check=True, universal_newlines=True)
+
+        # Version is top row
+        version = result.stdout.splitlines()[0]
+        # ... third token
+        version = re.split(" ", version)[2]
+        # ... major/minor/patch/subpatch
+        version = re.split("\\.|-", version)
+
+        numericVersion = float(version[0])
+        numericVersion += float(version[1]) / 10.0
+
+        g_ConvertVersion = numericVersion
+
+    return g_ConvertVersion
+
+
 class ImageException(Exception):
     """
     Exception thrown for bad image specification.
@@ -228,6 +262,14 @@ class Image():
         Args:
             filePath (str): The path to the image on disk.
         """
+        convert = get_convert_version()
+
+        # ImageMagick 7 started to use .tga file origin information. By default
+        # TGA files store data from bottom up, and define the origin as bottom
+        # left. We want our color samples to always use a top left origin, even
+        # if the data is stored in alternative layout.
+        self.invertYCoords = (convert >= 7.0) and filePath.endswith(".tga")
+
         self.filePath = filePath
         self.proxyPath = None
 
@@ -256,8 +298,14 @@ class Image():
             coords = [coords]
 
         for (x, y) in coords:
-            command = [
-                "convert", self.filePath,
+            command = list(CONVERT_BINARY)
+            command += [self.filePath]
+
+            # Invert coordinates if the format needs it
+            if self.invertYCoords:
+                command += ["-flip"]
+
+            command += [
                 "-format", "%%[pixel:p{%u,%u}]" % (x, y),
                 "info:"
             ]