Add Arm aarch64 builds and NEON acceleration (#191)

This PR adds support for Arm aarch64 builds, including the corresponding NEON accelerated vector library. As part of this work I also improved testing: - Native C++ unit tests support using `googletest` integrated into CMake/CTest. - First unit test suite added, for 4-wide SIMD implementations. - Command line functional tests can target any build, not just AVX2.
2026-06-30 23:07:54 -04:00 · 2021-01-01 23:27:18 +00:00
parent f395904288
commit fb388737fb
17 changed files with 2225 additions and 44 deletions
@@ -1,6 +1,7 @@
 # Editor and engineering scratch files
 .vs
 .vscode
+.DS_Store
 *.log
 *.diff
 *.user
@@ -12,7 +13,7 @@ Proto
 Binaries

 # Build artifacts
-astcenc*
+astcenc
 build*

 # General build artifacts
@@ -0,0 +1,3 @@
+[submodule "Source/GoogleTest"]
+	path = Source/GoogleTest
+	url = https://github.com/google/googletest.git
@@ -15,7 +15,6 @@
 #  under the License.
 #  ----------------------------------------------------------------------------

-
 # CMake configuration
 cmake_minimum_required(VERSION 3.15)
 cmake_policy(SET CMP0069 NEW)  # LTO support
@@ -29,38 +28,77 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
 set(PACKAGE_ROOT astcenc)
+include(CTest)

 # Command line configuration
-option(ISA_AVX2 "Enable builds for AVX2 SIMD")
+function(printopt optName optVal optArch tgtArch)
+    if(${optVal})
+        if(${optArch} MATCHES ${tgtArch})
+            message("  -- ${optName} backend - ON")
+        else()
+            message("  -- ${optName} backend - SKIPPED (${optArch} only)")
+        endif()
+    else()
+        message("  -- ${optName} backend - OFF")
+    endif()
+endfunction()
+
+set(VALID_ARCH aarch64 x64)
+set(ARCH x64 CACHE STRING "Target architecture")
+set_property(CACHE ARCH PROPERTY STRINGS ${VALID_ARCH})
+
+message("-- Selecting target astcenc backends:")
+list(FIND VALID_ARCH ${ARCH} index)
+if(index EQUAL -1)
+    message(FATAL_ERROR "ARCH must be one of ${VALID_ARCH}")
+endif()

-message("-- Selecting backend build type(s)")
 set(ANY_ISA 0)
-if(${ISA_AVX2})
+
+option(ISA_AVX2 "Enable builds for AVX2 SIMD")
+printopt("AVX2" ${ISA_AVX2} "x64" ${ARCH})
+if(${ISA_AVX2} AND ${ARCH} MATCHES "x64")
    set(ANY_ISA 1)
-    message("  -- AVX2 backend: ON")
 endif()

 option(ISA_SSE41 "Enable builds for SSE4.1 SIMD")
-if(${ISA_SSE41})
+printopt("SSE4.1" ${ISA_SSE41} "x64" ${ARCH})
+if(${ISA_SSE41} AND ${ARCH} MATCHES "x64")
    set(ANY_ISA 1)
-    message("  -- SSE4.1 backend: ON")
 endif()

 option(ISA_SSE2 "Enable builds for SSE2 SIMD")
-if(${ISA_SSE2})
+printopt("SSE2" ${ISA_SSE2} "x64" ${ARCH})
+if(${ISA_SSE2} AND ${ARCH} MATCHES "x64")
    set(ANY_ISA 1)
-    message("  -- SSE2 backend: ON")
 endif()

-option(ISA_NONE "Enable builds for noSIMD")
+option(ISA_NEON "Enable builds for NEON SIMD")
+printopt("NEON" ${ISA_NEON} "aarch64" ${ARCH})
+if(${ISA_NEON} AND ${ARCH} MATCHES "aarch64")
+    set(ANY_ISA 1)
+endif()
+
+option(ISA_NONE "Enable builds for no SIMD")
 if(${ISA_NONE})
    set(ANY_ISA 1)
-    message("  -- No SIMD backend: ON")
+    message("  -- No SIMD backend - ON")
+else()
+    message("  -- No SIMD backend - OFF")
 endif()

 option(ISA_INVARIANCE "Enable builds for ISA invariance")
 if(${ISA_INVARIANCE})
-    message("  -- ISA invariance: ON")
+    message("  -- ISA invariant backend - ON")
+else()
+    message("  -- ISA invariant backend - OFF")
+endif()
+
+option(UNITTEST "Enable builds for unit tests")
+if(${UNITTEST})
+    message("  -- Unit tests - ON")
+else()
+    message("  -- Unit tests - OFF")
 endif()

 if(NOT ${ANY_ISA})
@@ -22,8 +22,14 @@ to generate the build system.
 mkdir build
 cd build

-# Create the build system
-cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
+# Configure your build of choice, for example:
+
+# Arm arch64
+cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
+    -DARCH=aarch64 -DISA_NEON=ON ..
+
+# x86-64
+cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
    -DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
 ```

@@ -62,7 +68,13 @@ export CXX=clang++
 mkdir build
 cd build

-# Create the build system
+# Configure your build of choice, for example:
+
+# Arm arch64
+cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
+    -DARCH=aarch64 -DISA_NEON=ON ..
+
+# x86-64
 cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
    -DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
 ```
@@ -88,13 +100,14 @@ For codec developers there are a number of useful features in the build system.

 ### No intrinsics build

-All normal builds will use SIMD accelerated code paths using instrinsics, as
-x86-64 guarantees availability of at least SSE2. For development purposes it
-is possible to build an intrinsic-free build which uses no explicit SIMD
-acceleration (the compiler may still auto-vectorize).
+All normal builds will use SIMD accelerated code paths using intrinsics, as all
+target architectures (x86-64 and aarch64) guarantee SIMD availability. For
+development purposes it is possible to build an intrinsic-free build which uses
+no explicit SIMD acceleration (the compiler may still auto-vectorize).

 To enable this binary variant add `-DISA_NONE=ON` to the CMake command line
-when configuring. It is NOT
+when configuring. It is NOT recommended to use this for production; it is
+significantly slower than the vectorized SIMD builds.

 ### ISA Invariance

@@ -126,12 +139,35 @@ We support and test the following `CMAKE_BUILD_TYPE` options.
 Note that optimized release builds are compiled with link-time optimization,
 which can make profiling more challenging ...

+### Testing
+
+We support building unit tests.
+
+These builds use the `googletest` framework, which is pulled in though a git
+submodule. On first use, you must fetch the submodule dependency:
+
+```shell
+git submodule init
+git submodule update
+```
+
+To build unit tests add `-DUNITTEST=ON` to the CMake command line when
+configuring.
+
+To run unit tests use the CMake `ctest` utility from your build directory after
+you have built the tests.
+
+```shell
+cd build
+ctest --verbose
+```
+
 ### Packaging

 We support building a release bundle of all enabled binary configurations in
 the current CMake configuration using the `package` build target

-```bash
+```shell
 # Run a build and package build outputs in `./astcenc-<ver>-<os>-<arch>.<fmt>`
 cd build
 make package -j16
@@ -18,6 +18,9 @@ stable across versions, and this release is not compatible with 2.1. Please
 recompile your client-side code using the updated `astcenc.h` header.

 * **General:**
+  * **Feature:** New Arm aarch64 NEON accelerated vector library. Note that at
+    this time Arm builds must be built from source; pre-built binaries are not
+	provided in this release.
  * **Improvement:** SSE4.2 feature profile changed to SSE4.1, which more
    accurately reflects the feature set used.
  * **Improvement:** Build system changed to use CMake for all platforms.
@@ -35,8 +38,8 @@ recompile your client-side code using the updated `astcenc.h` header.
    scales RGB values by the alpha value. This can be useful to minimize
    cross-channel color bleed caused by GPU post-multiply filtering/blending.
  * **Improvements:** Command line tool cleanly traps and reports errors for
-    corrupt input images rather than relying on hard standard library
-    `assert()` calls.
+    corrupt input images rather than relying on standard library `assert()`
+	calls in release builds.
 * **Core API:**
  * **API Change:** Images using region-based metrics no longer need to include
    padding; all input images should be tightly packed and `dim_pad` is removed
@@ -14,10 +14,27 @@ can be achieved by configuring the CMake build using the install prefix
 `-DCMAKE_INSTALL_PREFIX=../` and then running a build with the `install` build
 target.

-# Running unit tests
+# Running C++ unit tests

-To run the command line unit tests, which aim to get coverage of the command
-line options and core codec stability without testing the compression quality
+We support a small (but growing) number of C++ unit tests, which are written
+using the `googletest` framework and integrated in the CMake "CTest" test
+framework.
+
+To build unit tests pull the `googletest` git submodule and add `-DUNITTEST=ON`
+to the CMake command line when configuring.
+
+To run unit tests use the CMake `ctest` utility from your build directory after
+you have built the tests.
+
+```shell
+cd build
+ctest --verbose
+```
+
+# Running command line tests
+
+To run the command line tests, which aim to get coverage of the command line
+options and core codec stability without testing the compression quality
 itself, run the command line:

    python3 -m unittest discover -s Test -p astc_test*.py -v
@@ -21,6 +21,25 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
    set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto")
 endif()

+# - - - - - - - - - - - - - - - - - -
+# No architecture-specific SIMD
+
+if (${ISA_NONE})
+    set(ISA_SIMD none)
+    include(cmake_core.cmake)
+endif()
+
+# - - - - - - - - - - - - - - - - - -
+# Armv8-A architecture-specific SIMD
+
+if (${ISA_NEON})
+    set(ISA_SIMD neon)
+    include(cmake_core.cmake)
+endif()
+
+# - - - - - - - - - - - - - - - - - -
+# x86-64 architecture-specific SIMD
+
 if (${ISA_AVX2})
    set(ISA_SIMD avx2)
    include(cmake_core.cmake)
@@ -36,7 +55,11 @@ if (${ISA_SSE2})
    include(cmake_core.cmake)
 endif()

-if (${ISA_NONE})
-    set(ISA_SIMD none)
-    include(cmake_core.cmake)
+# - - - - - - - - - - - - - - - - - -
+# Unit testing
+if (${UNITTEST})
+    set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
+    add_subdirectory(GoogleTest)
+    enable_testing()
+    add_subdirectory(UnitTest)
 endif()
@@ -0,0 +1,50 @@
+#  SPDX-License-Identifier: Apache-2.0
+#  ----------------------------------------------------------------------------
+#  Copyright 2020 Arm Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License"); you may not
+#  use this file except in compliance with the License. You may obtain a copy
+#  of the License at:
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#  License for the specific language governing permissions and limitations
+#  under the License.
+#  ----------------------------------------------------------------------------
+
+# - - - - - - - - - - - - - - - - - -
+# No architecture-specific SIMD
+
+if (${ISA_NONE})
+    set(ISA_SIMD none)
+    include(cmake_core.cmake)
+endif()
+
+# - - - - - - - - - - - - - - - - - -
+# Armv8-A architecture-specific SIMD
+
+if (${ISA_NEON})
+    set(ISA_SIMD neon)
+    include(cmake_core.cmake)
+endif()
+
+# - - - - - - - - - - - - - - - - - -
+# x86-64 architecture-specific SIMD
+
+if (${ISA_AVX2})
+    set(ISA_SIMD avx2)
+    include(cmake_core.cmake)
+endif()
+
+if (${ISA_SSE41})
+    set(ISA_SIMD sse4.1)
+    include(cmake_core.cmake)
+endif()
+
+if (${ISA_SSE2})
+    set(ISA_SIMD sse2)
+    include(cmake_core.cmake)
+endif()
@@ -0,0 +1,112 @@
+#  SPDX-License-Identifier: Apache-2.0
+#  ----------------------------------------------------------------------------
+#  Copyright 2020 Arm Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License"); you may not
+#  use this file except in compliance with the License. You may obtain a copy
+#  of the License at:
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#  License for the specific language governing permissions and limitations
+#  under the License.
+#  ----------------------------------------------------------------------------
+
+add_executable(test-simd-${ISA_SIMD})
+
+target_sources(test-simd-${ISA_SIMD}
+    PRIVATE
+        test_simd.cpp)
+
+target_include_directories(test-simd-${ISA_SIMD}
+    PRIVATE
+        ${gtest_SOURCE_DIR}/include)
+
+target_compile_options(test-simd-${ISA_SIMD}
+    PRIVATE
+        # Use pthreads on Linux/macOS
+        $<$<PLATFORM_ID:Linux,Darwin>:-pthread>
+
+        # MSVC compiler defines
+        $<$<CXX_COMPILER_ID:MSVC>:/EHsc>
+
+        # G++ and Clang++ compiler defines
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wall>
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wextra>
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wpedantic>
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Werror>
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wshadow>
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wdouble-promotion>)
+
+# Set up configuration for SIMD ISA builds
+if(${ISA_SIMD} MATCHES "none")
+    target_compile_definitions(test-simd-${ISA_SIMD}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SSE=0
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0)
+
+    if (${ARCH} MATCHES x64)
+        target_compile_options(astcenc-${ISA_SIMD}
+            PRIVATE
+                $<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
+    endif()
+
+elseif(${ISA_SIMD} MATCHES "neon")
+    target_compile_definitions(test-simd-${ISA_SIMD}
+        PRIVATE
+            ASTCENC_NEON=1
+            ASTCENC_SSE=0
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0)
+
+elseif(${ISA_SIMD} MATCHES "sse2")
+    target_compile_definitions(test-simd-${ISA_SIMD}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SSE=20
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0)
+
+    target_compile_options(test-simd-${ISA_SIMD}
+        PRIVATE
+        $<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
+
+elseif(${ISA_SIMD} MATCHES "sse4.1")
+    target_compile_definitions(test-simd-${ISA_SIMD}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SSE=41
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=1)
+
+    target_compile_options(test-simd-${ISA_SIMD}
+        PRIVATE
+            $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -msse4.1 -mpopcnt>)
+
+elseif(${ISA_SIMD} MATCHES "avx2")
+    target_compile_definitions(test-simd-${ISA_SIMD}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SSE=41
+            ASTCENC_AVX=2
+            ASTCENC_POPCNT=1)
+
+    target_compile_options(test-simd-${ISA_SIMD}
+        PRIVATE
+            $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -mavx2 -mpopcnt>
+            $<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>)
+endif()
+
+target_link_libraries(test-simd-${ISA_SIMD}
+    PRIVATE
+        gtest_main)
+
+add_test(NAME test-simd-${ISA_SIMD}
+         COMMAND test-simd-${ISA_SIMD})
+
+install(TARGETS test-simd-${ISA_SIMD} DESTINATION ${PACKAGE_ROOT})
@@ -63,6 +63,14 @@
  #endif
 #endif

+#ifndef ASTCENC_NEON
+  #if defined(__aarch64__)
+    #define ASTCENC_NEON 1
+  #else
+    #define ASTCENC_NEON 0
+  #endif
+#endif
+
 #if ASTCENC_AVX
  #define ASTCENC_VECALIGN 32
 #else
@@ -32,7 +32,7 @@
 * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
 * types. These are provided for use by VLA code, but are also expected to be
 * used as a fixed-width type and will supported a reference C++ fallback for
- * use on platforms without SIMD intrinsics (TODO: not yet implemented).
+ * use on platforms without SIMD intrinsics.
 *
 * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
 * types. These are provide for use by VLA code, and are not expected to be
@@ -42,9 +42,10 @@
 * With the current implementation ISA support is provided for:
 *
 *     * 1-wide for scalar reference.
- *     * 4-wide for SSE2.
- *     * 4-wide for SSE4.1.
- *     * 8-wide for AVX2.
+ *     * 4-wide for Armv8-A NEON.
+ *     * 4-wide for x86-64 SSE2.
+ *     * 4-wide for x86-64 SSE4.1.
+ *     * 8-wide for x86-64 AVX2.
 *
 */

@@ -53,6 +54,8 @@

 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
 	#include <immintrin.h>
+#elif ASTCENC_NEON != 0
+	#include <arm_neon.h>
 #endif

 #if defined(_MSC_VER)
@@ -64,7 +67,7 @@
 #endif

 #if ASTCENC_AVX >= 2
-	/* If we have AVX2 expose 8-wide VLA, and 4-wide fixed width. */
+	/* If we have AVX2 expose 8-wide VLA. */
 	#include "astcenc_vecmathlib_avx2_8.h"
 	#include "astcenc_vecmathlib_sse_4.h"

@@ -89,6 +92,20 @@

 	constexpr auto loada = vfloat4::loada;
 	constexpr auto load1 = vfloat4::load1;
+
+#elif ASTCENC_NEON > 0
+	/* If we have NEON expose 4-wide VLA. */
+	#include "astcenc_vecmathlib_neon_4.h"
+
+	#define ASTCENC_SIMD_WIDTH 4
+
+	using vfloat = vfloat4;
+	using vint = vint4;
+	using vmask = vmask4;
+
+	constexpr auto loada = vfloat4::loada;
+	constexpr auto load1 = vfloat4::load1;
+
 #else
 	/* If we have nothing expose 1-wide VLA, and 4-wide fixed width. */
 	#include "astcenc_vecmathlib_none_1.h"
@@ -0,0 +1,799 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2020 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief 4x32-bit vectors, implemented using Armv8-A NEON.
+ *
+ * This module implements 4-wide 32-bit float, int, and mask vectors for
+ * Armv8-A NEON.
+ *
+ * There is a baseline level of functionality provided by all vector widths and
+ * implementations. This is implemented using identical function signatures,
+ * modulo data type, so we can use them as substitutable implementations in VLA
+ * code.
+ *
+ * The 4-wide vectors are also used as a fixed-width type, and significantly
+ * extend the functionality above that available to VLA code.
+ */
+
+#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
+#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <cstdio>
+
+// ============================================================================
+// vfloat4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide floats.
+ */
+struct vfloat4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vfloat4() {}
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
+	{
+		m = vld1q_f32(p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
+	{
+		m = vdupq_n_f32(a);
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
+	{
+		float32x4_t v { a, b, c, d };
+		m = v;
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Get the scalar value of a single lane.
+	 *
+	 * TODO: Can we do better for lane0, which is the common case for VLA?
+	 */
+	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	{
+		return vgetq_lane_f32(m, l);
+	}
+
+	/**
+	 * @brief Set the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
+	{
+		m = vld1q_lane_f32(&a, m, l);
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 zero()
+	{
+		return vfloat4(vdupq_n_f32(0.0f));
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
+	{
+		return vfloat4(vdupq_n_f32(*p));
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 16B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
+	{
+		return vfloat4(vld1q_f32(p));
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
+	{
+		alignas(16) float data[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
+		return vfloat4(vld1q_f32(data));
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	float32x4_t m;
+};
+
+// ============================================================================
+// vint4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide ints.
+ */
+struct vint4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vint4() {}
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const int *p)
+	{
+		m = vld1q_s32(p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using vfloat4::zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a)
+	{
+		m = vdupq_n_s32(a);
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
+	{
+		int32x4_t v = { a, b, c, d };
+		m = v;
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Get the scalar from a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE int lane() const
+	{
+		return vgetq_lane_s32(m, l);
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 lane_id()
+	{
+		alignas(ASTCENC_VECALIGN) int data[4] = { 0, 1, 2, 3 };
+		return vint4(vld1q_s32(data));
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	int32x4_t m;
+};
+
+// ============================================================================
+// vmask4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide control plane masks.
+ */
+struct vmask4
+{
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
+	{
+		m = vreinterpretq_u32_s32(a);
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	uint32x4_t m;
+};
+
+// ============================================================================
+// vmask4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: mask union (or).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
+{
+	return vmask4(vorrq_u32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask intersect (and).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
+{
+	return vmask4(vandq_u32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask difference (xor).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
+{
+	return vmask4(veorq_u32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: mask invert (not).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
+{
+	return vmask4(vmvnq_u32(a.m));
+}
+
+/**
+ * @brief Return a 4-bit mask code indicating mask status.
+ *
+ * bit0 = lane 0
+ */
+ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
+{
+	int32x4_t shift = { 0, 1, 2, 3 };
+	uint32x4_t tmp = vshrq_n_u32(a.m, 31);
+	return vaddvq_u32(vshlq_u32(tmp, shift));
+}
+
+/**
+ * @brief True if any lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool any(vmask4 a)
+{
+	return mask(a) != 0;
+}
+
+/**
+ * @brief True if all lanes are enabled, false otherwise.
+ */
+ASTCENC_SIMD_INLINE bool all(vmask4 a)
+{
+	return mask(a) == 0xF;
+}
+
+// ============================================================================
+// vint4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
+{
+	return vint4(vaddq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
+{
+	return vint4(vsubq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector bit invert.
+ */
+ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
+{
+	return vint4(vmvnq_s32(a.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
+{
+	return vint4(vorrq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
+{
+	return vint4(vandq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
+{
+	return vint4(veorq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
+{
+	return vmask4(vceqq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
+{
+	return ~vmask4(vceqq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
+{
+	return vmask4(vcltq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
+{
+	return vmask4(vcgtq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
+{
+	return vint4(vminq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
+{
+	return vint4(vmaxq_s32(a.m, b.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
+{
+	return vint4(vminvq_s32(a.m));
+}
+
+/**
+ * @brief Store a vector to a 16B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
+{
+	vst1q_s32(p, a.m);
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
+{
+	vst1q_lane_s32((int32_t*)p, a.m, 0);
+}
+
+/**
+ * @brief Gather N (vector width) indices from the array.
+ */
+ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
+{
+	alignas(16) int idx[4];
+	storea(indices, idx);
+	alignas(16) int vals[4];
+	vals[0] = base[idx[0]];
+	vals[1] = base[idx[1]];
+	vals[2] = base[idx[2]];
+	vals[3] = base[idx[3]];
+	return vint4(vals);
+}
+
+/**
+ * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
+ */
+ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
+{
+	alignas(16) uint8_t shuf[16] = {
+		0, 4, 8, 12,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0
+	};
+	uint8x16_t idx = vld1q_u8(shuf);
+	int8x16_t av = vreinterpretq_s8_s32(a.m);
+    return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
+}
+
+/**
+ * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
+{
+	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
+	uint32x4_t mask = vcgeq_u32(cond.m, msb);
+	return vint4(vbslq_s32(mask, b.m, a.m));
+}
+
+/**
+ * @brief Debug function to print a vector of ints.
+ */
+ASTCENC_SIMD_INLINE void print(vint4 a)
+{
+	alignas(16) int v[4];
+	storea(a, v);
+	printf("v4_i32:\n  %8u %8u %8u %8u\n",
+	       v[0], v[1], v[2], v[3]);
+}
+
+// ============================================================================
+// vfloat4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vaddq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vsubq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vmulq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
+{
+	float32x4_t bv = vld1q_dup_f32(&b);
+	return vfloat4(vmulq_f32(a.m, bv));
+}
+
+/**
+ * @brief Overload: scalar by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
+{
+	float32x4_t av = vld1q_dup_f32(&a);
+	return vfloat4(vmulq_f32(av, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vdivq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vceqq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcltq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcgtq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcleq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than or equal.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
+{
+	return vmask4(vcgeq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return vfloat4(vminnmq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
+{
+	// Do not reorder - second operand will return if either is NaN
+	return vfloat4(vmaxnmq_f32(a.m, b.m));
+}
+
+/**
+ * @brief Return the clamped value between min and max.
+ *
+ * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
+ * then @c min will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clamp(float min, float max, vfloat4 a)
+{
+	float32x4_t minv = vdupq_n_f32(min);
+	float32x4_t maxv = vdupq_n_f32(max);
+	return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
+}
+
+/**
+ * @brief Return a clamped value between 0.0f and max.
+ *
+ * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
+ * be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clampz(float max, vfloat4 a)
+{
+	// Do not reorder - second operand will return if either is NaN
+	float32x4_t minv = vdupq_n_f32(0.0f);
+	float32x4_t maxv = vdupq_n_f32(max);
+	return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
+}
+
+/**
+ * @brief Return a clamped value between 0.0f and 1.0f.
+ *
+ * If @c a is NaN then zero will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
+{
+	float32x4_t minv = vdupq_n_f32(0.0f);
+	float32x4_t maxv = vdupq_n_f32(1.0f);
+	return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
+}
+
+/**
+ * @brief Return the absolute value of the float vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
+{
+	float32x4_t zero = vdupq_n_f32(0.0f);
+	float32x4_t inv = vsubq_f32(zero, a.m);
+	return vfloat4(vmaxq_f32(a.m, inv));
+}
+
+/**
+ * @brief Return a float rounded to the nearest integer value.
+ *
+ * TODO: Can we do a better fallback here, if we exploit the fact that we
+ *       can assume that values are positive?
+ */
+ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
+{
+	return vfloat4(vrndnq_f32(a.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
+{
+	return vfloat4(vminvq_f32(a.m));
+}
+
+/**
+ * @brief Return the sqrt of the lanes in the vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
+{
+	return vfloat4(vsqrtq_f32(a.m));
+}
+
+/**
+ * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
+{
+	static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
+	uint32x4_t mask = vcgeq_u32(cond.m, msb);
+	return vfloat4(vbslq_f32(mask, b.m, a.m));
+}
+
+/**
+ * @brief Load a vector of gathered results from an array;
+ */
+ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
+{
+	alignas(16) int idx[4];
+	storea(indices, idx);
+	alignas(16) float vals[4];
+	vals[0] = base[idx[0]];
+	vals[1] = base[idx[1]];
+	vals[2] = base[idx[2]];
+	vals[3] = base[idx[3]];
+	return vfloat4(vals);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
+{
+	vst1q_f32(p, a.m);
+}
+
+/**
+ * @brief Store a vector to a 16B aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
+{
+	vst1q_f32(p, a.m);
+}
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
+{
+	return vfloat4(vaddvq_f32(vmulq_f32(a.m, b.m)));
+}
+
+/**
+ * @brief Return a integer value for a float vector, using truncation.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
+{
+	return vint4(vcvtq_s32_f32(a.m));
+}
+
+/**
+ * @brief Return a integer value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
+{
+	a = round(a);
+	return vint4(vcvtq_s32_f32(a.m));
+}
+
+/**
+ * @brief Return a float value as an integer bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the first half of that flip.
+ */
+ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
+{
+	return vint4(vreinterpretq_s32_f32(a.m));
+}
+
+/**
+ * @brief Return a integer value as a float bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the second half of that flip.
+ */
+ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
+{
+	return vfloat4(vreinterpretq_f32_s32(v.m));
+}
+
+/**
+ * @brief Debug function to print a vector of floats.
+ */
+ASTCENC_SIMD_INLINE void print(vfloat4 a)
+{
+	alignas(16) float v[4];
+	storea(a, v);
+	printf("v4_f32:\n  %0.4f %0.4f %0.4f %0.4f\n",
+	       (double)v[0], (double)v[1], (double)v[2], (double)v[3]);
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
@@ -110,17 +110,29 @@ endif()
 if(${ISA_SIMD} MATCHES "none")
    target_compile_definitions(astcenc-${ISA_SIMD}
        PRIVATE
+            ASTCENC_NEON=0
            ASTCENC_SSE=0
            ASTCENC_AVX=0
            ASTCENC_POPCNT=0)

-    target_compile_options(astcenc-${ISA_SIMD}
+    if (${ARCH} MATCHES x64)
+        target_compile_options(astcenc-${ISA_SIMD}
+            PRIVATE
+                $<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
+    endif()
+
+elseif(${ISA_SIMD} MATCHES "neon")
+    target_compile_definitions(astcenc-${ISA_SIMD}
        PRIVATE
-            $<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
+            ASTCENC_NEON=1
+            ASTCENC_SSE=0
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0)

 elseif(${ISA_SIMD} MATCHES "sse2")
    target_compile_definitions(astcenc-${ISA_SIMD}
        PRIVATE
+            ASTCENC_NEON=0
            ASTCENC_SSE=20
            ASTCENC_AVX=0
            ASTCENC_POPCNT=0)
@@ -132,6 +144,7 @@ elseif(${ISA_SIMD} MATCHES "sse2")
 elseif(${ISA_SIMD} MATCHES "sse4.1")
    target_compile_definitions(astcenc-${ISA_SIMD}
        PRIVATE
+            ASTCENC_NEON=0
            ASTCENC_SSE=41
            ASTCENC_AVX=0
            ASTCENC_POPCNT=1)
@@ -143,6 +156,7 @@ elseif(${ISA_SIMD} MATCHES "sse4.1")
 elseif(${ISA_SIMD} MATCHES "avx2")
    target_compile_definitions(astcenc-${ISA_SIMD}
        PRIVATE
+            ASTCENC_NEON=0
            ASTCENC_SSE=41
            ASTCENC_AVX=2
            ASTCENC_POPCNT=1)
@@ -285,23 +285,30 @@ def get_encoder_params(encoderName, referenceName, imageSet):
        outDir = "Test/Images/%s" % imageSet
        refName = None
    # Latest master
+    elif encoderName == "ref-master-neon":
+        # Warning: this option rebuilds a new reference test result for the
+        # master branch using the user's locally build encoder.
+        encoder = te.Encoder2x("neon")
+        name = "reference-master-neon"
+        outDir = "Test/Images/%s" % imageSet
+        refName = None
    elif encoderName == "ref-master-sse2":
        # Warning: this option rebuilds a new reference test result for the
-        # master branch using the user's locally build encoder in ./Source.
+        # master branch using the user's locally build encoder.
        encoder = te.Encoder2x("sse2")
        name = "reference-master-sse2"
        outDir = "Test/Images/%s" % imageSet
        refName = None
    elif encoderName == "ref-master-sse4.1":
        # Warning: this option rebuilds a new reference test result for the
-        # master branch using the user's locally build encoder in ./Source.
+        # master branch using the user's locally build encoder.
        encoder = te.Encoder2x("sse4.1")
        name = "reference-master-sse4.1"
        outDir = "Test/Images/%s" % imageSet
        refName = None
    elif encoderName == "ref-master-avx2":
        # Warning: this option rebuilds a new reference test result for the
-        # master branch using the user's locally build encoder in ./Source.
+        # master branch using the user's locally build encoder.
        encoder = te.Encoder2x("avx2")
        name = "reference-master-avx2"
        outDir = "Test/Images/%s" % imageSet
@@ -327,8 +334,8 @@ def parse_command_line():
    refcoders = ["ref-1.7",
                 "ref-2.0-sse2", "ref-2.0-sse4.1", "ref-2.0-avx2",
                 "ref-2.1-sse2", "ref-2.1-sse4.1", "ref-2.1-avx2",
-                 "ref-master-sse2", "ref-master-sse4.1", "ref-master-avx2"]
-    testcoders = ["none", "sse2", "sse4.1", "avx2"]
+                 "ref-master-neon", "ref-master-sse2", "ref-master-sse4.1", "ref-master-avx2"]
+    testcoders = ["none", "neon", "sse2", "sse4.1", "avx2"]
    coders = refcoders + testcoders + ["all", "all-ref"]
    parser.add_argument("--encoder", dest="encoders", default="avx2",
                        choices=coders, help="test encoder variant")
@@ -28,6 +28,7 @@ The directory path is structured:

 from collections.abc import Iterable
 import os
+import re
 import subprocess as sp

 from PIL import Image as PILImage
@@ -35,6 +36,39 @@ from PIL import Image as PILImage
 import testlib.misc as misc


+CONVERT_BINARY = ["convert"]
+
+
+g_ConvertVersion = None
+
+
+def get_convert_version():
+    """
+    Get the major/minor version of ImageMagick on the system.
+    """
+    global g_ConvertVersion
+
+    if g_ConvertVersion is None:
+        command = list(CONVERT_BINARY)
+        command += ["--version"]
+        result = sp.run(command, stdout=sp.PIPE, stderr=sp.PIPE,
+                        check=True, universal_newlines=True)
+
+        # Version is top row
+        version = result.stdout.splitlines()[0]
+        # ... third token
+        version = re.split(" ", version)[2]
+        # ... major/minor/patch/subpatch
+        version = re.split("\\.|-", version)
+
+        numericVersion = float(version[0])
+        numericVersion += float(version[1]) / 10.0
+
+        g_ConvertVersion = numericVersion
+
+    return g_ConvertVersion
+
+
 class ImageException(Exception):
    """
    Exception thrown for bad image specification.
@@ -228,6 +262,14 @@ class Image():
        Args:
            filePath (str): The path to the image on disk.
        """
+        convert = get_convert_version()
+
+        # ImageMagick 7 started to use .tga file origin information. By default
+        # TGA files store data from bottom up, and define the origin as bottom
+        # left. We want our color samples to always use a top left origin, even
+        # if the data is stored in alternative layout.
+        self.invertYCoords = (convert >= 7.0) and filePath.endswith(".tga")
+
        self.filePath = filePath
        self.proxyPath = None

@@ -256,8 +298,14 @@ class Image():
            coords = [coords]

        for (x, y) in coords:
-            command = [
-                "convert", self.filePath,
+            command = list(CONVERT_BINARY)
+            command += [self.filePath]
+
+            # Invert coordinates if the format needs it
+            if self.invertYCoords:
+                command += ["-flip"]
+
+            command += [
                "-format", "%%[pixel:p{%u,%u}]" % (x, y),
                "info:"
            ]