Add Arm aarch64 builds and NEON acceleration (#191)

This PR adds support for Arm aarch64 builds, including the corresponding NEON accelerated vector library.

As part of this work I also improved testing:

- Native C++ unit tests support using `googletest` integrated into CMake/CTest. 
- First unit test suite added, for 4-wide SIMD implementations.
- Command line functional tests can target any build, not just AVX2.
This commit is contained in:
Pete Harris
2021-01-01 23:27:18 +00:00
committed by GitHub
parent f395904288
commit fb388737fb
17 changed files with 2225 additions and 44 deletions
+2 -1
View File
@@ -1,6 +1,7 @@
# Editor and engineering scratch files
.vs
.vscode
.DS_Store
*.log
*.diff
*.user
@@ -12,7 +13,7 @@ Proto
Binaries
# Build artifacts
astcenc*
astcenc
build*
# General build artifacts
+3
View File
@@ -0,0 +1,3 @@
[submodule "Source/GoogleTest"]
path = Source/GoogleTest
url = https://github.com/google/googletest.git
+50 -12
View File
@@ -15,7 +15,6 @@
# under the License.
# ----------------------------------------------------------------------------
# CMake configuration
cmake_minimum_required(VERSION 3.15)
cmake_policy(SET CMP0069 NEW) # LTO support
@@ -29,38 +28,77 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
set(PACKAGE_ROOT astcenc)
include(CTest)
# Command line configuration
option(ISA_AVX2 "Enable builds for AVX2 SIMD")
function(printopt optName optVal optArch tgtArch)
if(${optVal})
if(${optArch} MATCHES ${tgtArch})
message(" -- ${optName} backend - ON")
else()
message(" -- ${optName} backend - SKIPPED (${optArch} only)")
endif()
else()
message(" -- ${optName} backend - OFF")
endif()
endfunction()
set(VALID_ARCH aarch64 x64)
set(ARCH x64 CACHE STRING "Target architecture")
set_property(CACHE ARCH PROPERTY STRINGS ${VALID_ARCH})
message("-- Selecting target astcenc backends:")
list(FIND VALID_ARCH ${ARCH} index)
if(index EQUAL -1)
message(FATAL_ERROR "ARCH must be one of ${VALID_ARCH}")
endif()
message("-- Selecting backend build type(s)")
set(ANY_ISA 0)
if(${ISA_AVX2})
option(ISA_AVX2 "Enable builds for AVX2 SIMD")
printopt("AVX2" ${ISA_AVX2} "x64" ${ARCH})
if(${ISA_AVX2} AND ${ARCH} MATCHES "x64")
set(ANY_ISA 1)
message(" -- AVX2 backend: ON")
endif()
option(ISA_SSE41 "Enable builds for SSE4.1 SIMD")
if(${ISA_SSE41})
printopt("SSE4.1" ${ISA_SSE41} "x64" ${ARCH})
if(${ISA_SSE41} AND ${ARCH} MATCHES "x64")
set(ANY_ISA 1)
message(" -- SSE4.1 backend: ON")
endif()
option(ISA_SSE2 "Enable builds for SSE2 SIMD")
if(${ISA_SSE2})
printopt("SSE2" ${ISA_SSE2} "x64" ${ARCH})
if(${ISA_SSE2} AND ${ARCH} MATCHES "x64")
set(ANY_ISA 1)
message(" -- SSE2 backend: ON")
endif()
option(ISA_NONE "Enable builds for noSIMD")
option(ISA_NEON "Enable builds for NEON SIMD")
printopt("NEON" ${ISA_NEON} "aarch64" ${ARCH})
if(${ISA_NEON} AND ${ARCH} MATCHES "aarch64")
set(ANY_ISA 1)
endif()
option(ISA_NONE "Enable builds for no SIMD")
if(${ISA_NONE})
set(ANY_ISA 1)
message(" -- No SIMD backend: ON")
message(" -- No SIMD backend - ON")
else()
message(" -- No SIMD backend - OFF")
endif()
option(ISA_INVARIANCE "Enable builds for ISA invariance")
if(${ISA_INVARIANCE})
message(" -- ISA invariance: ON")
message(" -- ISA invariant backend - ON")
else()
message(" -- ISA invariant backend - OFF")
endif()
option(UNITTEST "Enable builds for unit tests")
if(${UNITTEST})
message(" -- Unit tests - ON")
else()
message(" -- Unit tests - OFF")
endif()
if(NOT ${ANY_ISA})
+45 -9
View File
@@ -22,8 +22,14 @@ to generate the build system.
mkdir build
cd build
# Create the build system
cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
# Configure your build of choice, for example:
# Arm arch64
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
-DARCH=aarch64 -DISA_NEON=ON ..
# x86-64
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
-DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
```
@@ -62,7 +68,13 @@ export CXX=clang++
mkdir build
cd build
# Create the build system
# Configure your build of choice, for example:
# Arm arch64
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
-DARCH=aarch64 -DISA_NEON=ON ..
# x86-64
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
-DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
```
@@ -88,13 +100,14 @@ For codec developers there are a number of useful features in the build system.
### No intrinsics build
All normal builds will use SIMD accelerated code paths using instrinsics, as
x86-64 guarantees availability of at least SSE2. For development purposes it
is possible to build an intrinsic-free build which uses no explicit SIMD
acceleration (the compiler may still auto-vectorize).
All normal builds will use SIMD accelerated code paths using intrinsics, as all
target architectures (x86-64 and aarch64) guarantee SIMD availability. For
development purposes it is possible to build an intrinsic-free build which uses
no explicit SIMD acceleration (the compiler may still auto-vectorize).
To enable this binary variant add `-DISA_NONE=ON` to the CMake command line
when configuring. It is NOT
when configuring. It is NOT recommended to use this for production; it is
significantly slower than the vectorized SIMD builds.
### ISA Invariance
@@ -126,12 +139,35 @@ We support and test the following `CMAKE_BUILD_TYPE` options.
Note that optimized release builds are compiled with link-time optimization,
which can make profiling more challenging ...
### Testing
We support building unit tests.
These builds use the `googletest` framework, which is pulled in though a git
submodule. On first use, you must fetch the submodule dependency:
```shell
git submodule init
git submodule update
```
To build unit tests add `-DUNITTEST=ON` to the CMake command line when
configuring.
To run unit tests use the CMake `ctest` utility from your build directory after
you have built the tests.
```shell
cd build
ctest --verbose
```
### Packaging
We support building a release bundle of all enabled binary configurations in
the current CMake configuration using the `package` build target
```bash
```shell
# Run a build and package build outputs in `./astcenc-<ver>-<os>-<arch>.<fmt>`
cd build
make package -j16
+5 -2
View File
@@ -18,6 +18,9 @@ stable across versions, and this release is not compatible with 2.1. Please
recompile your client-side code using the updated `astcenc.h` header.
* **General:**
* **Feature:** New Arm aarch64 NEON accelerated vector library. Note that at
this time Arm builds must be built from source; pre-built binaries are not
provided in this release.
* **Improvement:** SSE4.2 feature profile changed to SSE4.1, which more
accurately reflects the feature set used.
* **Improvement:** Build system changed to use CMake for all platforms.
@@ -35,8 +38,8 @@ recompile your client-side code using the updated `astcenc.h` header.
scales RGB values by the alpha value. This can be useful to minimize
cross-channel color bleed caused by GPU post-multiply filtering/blending.
* **Improvements:** Command line tool cleanly traps and reports errors for
corrupt input images rather than relying on hard standard library
`assert()` calls.
corrupt input images rather than relying on standard library `assert()`
calls in release builds.
* **Core API:**
* **API Change:** Images using region-based metrics no longer need to include
padding; all input images should be tightly packed and `dim_pad` is removed
+20 -3
View File
@@ -14,10 +14,27 @@ can be achieved by configuring the CMake build using the install prefix
`-DCMAKE_INSTALL_PREFIX=../` and then running a build with the `install` build
target.
# Running unit tests
# Running C++ unit tests
To run the command line unit tests, which aim to get coverage of the command
line options and core codec stability without testing the compression quality
We support a small (but growing) number of C++ unit tests, which are written
using the `googletest` framework and integrated in the CMake "CTest" test
framework.
To build unit tests pull the `googletest` git submodule and add `-DUNITTEST=ON`
to the CMake command line when configuring.
To run unit tests use the CMake `ctest` utility from your build directory after
you have built the tests.
```shell
cd build
ctest --verbose
```
# Running command line tests
To run the command line tests, which aim to get coverage of the command line
options and core codec stability without testing the compression quality
itself, run the command line:
python3 -m unittest discover -s Test -p astc_test*.py -v
+26 -3
View File
@@ -21,6 +21,25 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto")
endif()
# - - - - - - - - - - - - - - - - - -
# No architecture-specific SIMD
if (${ISA_NONE})
set(ISA_SIMD none)
include(cmake_core.cmake)
endif()
# - - - - - - - - - - - - - - - - - -
# Armv8-A architecture-specific SIMD
if (${ISA_NEON})
set(ISA_SIMD neon)
include(cmake_core.cmake)
endif()
# - - - - - - - - - - - - - - - - - -
# x86-64 architecture-specific SIMD
if (${ISA_AVX2})
set(ISA_SIMD avx2)
include(cmake_core.cmake)
@@ -36,7 +55,11 @@ if (${ISA_SSE2})
include(cmake_core.cmake)
endif()
if (${ISA_NONE})
set(ISA_SIMD none)
include(cmake_core.cmake)
# - - - - - - - - - - - - - - - - - -
# Unit testing
if (${UNITTEST})
set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
add_subdirectory(GoogleTest)
enable_testing()
add_subdirectory(UnitTest)
endif()
+1
Submodule Source/GoogleTest added at 703bd9caab
+50
View File
@@ -0,0 +1,50 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------
# - - - - - - - - - - - - - - - - - -
# No architecture-specific SIMD
if (${ISA_NONE})
set(ISA_SIMD none)
include(cmake_core.cmake)
endif()
# - - - - - - - - - - - - - - - - - -
# Armv8-A architecture-specific SIMD
if (${ISA_NEON})
set(ISA_SIMD neon)
include(cmake_core.cmake)
endif()
# - - - - - - - - - - - - - - - - - -
# x86-64 architecture-specific SIMD
if (${ISA_AVX2})
set(ISA_SIMD avx2)
include(cmake_core.cmake)
endif()
if (${ISA_SSE41})
set(ISA_SIMD sse4.1)
include(cmake_core.cmake)
endif()
if (${ISA_SSE2})
set(ISA_SIMD sse2)
include(cmake_core.cmake)
endif()
+112
View File
@@ -0,0 +1,112 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------
add_executable(test-simd-${ISA_SIMD})
target_sources(test-simd-${ISA_SIMD}
PRIVATE
test_simd.cpp)
target_include_directories(test-simd-${ISA_SIMD}
PRIVATE
${gtest_SOURCE_DIR}/include)
target_compile_options(test-simd-${ISA_SIMD}
PRIVATE
# Use pthreads on Linux/macOS
$<$<PLATFORM_ID:Linux,Darwin>:-pthread>
# MSVC compiler defines
$<$<CXX_COMPILER_ID:MSVC>:/EHsc>
# G++ and Clang++ compiler defines
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wall>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wextra>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wpedantic>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Werror>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wshadow>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wdouble-promotion>)
# Set up configuration for SIMD ISA builds
if(${ISA_SIMD} MATCHES "none")
target_compile_definitions(test-simd-${ISA_SIMD}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0)
if (${ARCH} MATCHES x64)
target_compile_options(astcenc-${ISA_SIMD}
PRIVATE
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
endif()
elseif(${ISA_SIMD} MATCHES "neon")
target_compile_definitions(test-simd-${ISA_SIMD}
PRIVATE
ASTCENC_NEON=1
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0)
elseif(${ISA_SIMD} MATCHES "sse2")
target_compile_definitions(test-simd-${ISA_SIMD}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=20
ASTCENC_AVX=0
ASTCENC_POPCNT=0)
target_compile_options(test-simd-${ISA_SIMD}
PRIVATE
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
elseif(${ISA_SIMD} MATCHES "sse4.1")
target_compile_definitions(test-simd-${ISA_SIMD}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=0
ASTCENC_POPCNT=1)
target_compile_options(test-simd-${ISA_SIMD}
PRIVATE
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -msse4.1 -mpopcnt>)
elseif(${ISA_SIMD} MATCHES "avx2")
target_compile_definitions(test-simd-${ISA_SIMD}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=2
ASTCENC_POPCNT=1)
target_compile_options(test-simd-${ISA_SIMD}
PRIVATE
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -mavx2 -mpopcnt>
$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>)
endif()
target_link_libraries(test-simd-${ISA_SIMD}
PRIVATE
gtest_main)
add_test(NAME test-simd-${ISA_SIMD}
COMMAND test-simd-${ISA_SIMD})
install(TARGETS test-simd-${ISA_SIMD} DESTINATION ${PACKAGE_ROOT})
File diff suppressed because it is too large Load Diff
+8
View File
@@ -63,6 +63,14 @@
#endif
#endif
#ifndef ASTCENC_NEON
#if defined(__aarch64__)
#define ASTCENC_NEON 1
#else
#define ASTCENC_NEON 0
#endif
#endif
#if ASTCENC_AVX
#define ASTCENC_VECALIGN 32
#else
+22 -5
View File
@@ -32,7 +32,7 @@
* Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
* types. These are provided for use by VLA code, but are also expected to be
* used as a fixed-width type and will supported a reference C++ fallback for
* use on platforms without SIMD intrinsics (TODO: not yet implemented).
* use on platforms without SIMD intrinsics.
*
* Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
* types. These are provide for use by VLA code, and are not expected to be
@@ -42,9 +42,10 @@
* With the current implementation ISA support is provided for:
*
* * 1-wide for scalar reference.
* * 4-wide for SSE2.
* * 4-wide for SSE4.1.
* * 8-wide for AVX2.
* * 4-wide for Armv8-A NEON.
* * 4-wide for x86-64 SSE2.
* * 4-wide for x86-64 SSE4.1.
* * 8-wide for x86-64 AVX2.
*
*/
@@ -53,6 +54,8 @@
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
#include <immintrin.h>
#elif ASTCENC_NEON != 0
#include <arm_neon.h>
#endif
#if defined(_MSC_VER)
@@ -64,7 +67,7 @@
#endif
#if ASTCENC_AVX >= 2
/* If we have AVX2 expose 8-wide VLA, and 4-wide fixed width. */
/* If we have AVX2 expose 8-wide VLA. */
#include "astcenc_vecmathlib_avx2_8.h"
#include "astcenc_vecmathlib_sse_4.h"
@@ -89,6 +92,20 @@
constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
#elif ASTCENC_NEON > 0
/* If we have NEON expose 4-wide VLA. */
#include "astcenc_vecmathlib_neon_4.h"
#define ASTCENC_SIMD_WIDTH 4
using vfloat = vfloat4;
using vint = vint4;
using vmask = vmask4;
constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
#else
/* If we have nothing expose 1-wide VLA, and 4-wide fixed width. */
#include "astcenc_vecmathlib_none_1.h"
+799
View File
@@ -0,0 +1,799 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2020 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief 4x32-bit vectors, implemented using Armv8-A NEON.
*
* This module implements 4-wide 32-bit float, int, and mask vectors for
* Armv8-A NEON.
*
* There is a baseline level of functionality provided by all vector widths and
* implementations. This is implemented using identical function signatures,
* modulo data type, so we can use them as substitutable implementations in VLA
* code.
*
* The 4-wide vectors are also used as a fixed-width type, and significantly
* extend the functionality above that available to VLA code.
*/
#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED
#ifndef ASTCENC_SIMD_INLINE
#error "Include astcenc_vecmathlib.h, do not include directly"
#endif
#include <cstdio>
// ============================================================================
// vfloat4 data type
// ============================================================================
/**
* @brief Data type for 4-wide floats.
*/
struct vfloat4
{
/**
* @brief Construct from zero-initialized value.
*/
ASTCENC_SIMD_INLINE vfloat4() {}
/**
* @brief Construct from 4 values loaded from an unaligned address.
*
* Consider using loada() which is better with vectors if data is aligned
* to vector length.
*/
ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
{
m = vld1q_f32(p);
}
/**
* @brief Construct from 1 scalar value replicated across all lanes.
*
* Consider using zero() for constexpr zeros.
*/
ASTCENC_SIMD_INLINE explicit vfloat4(float a)
{
m = vdupq_n_f32(a);
}
/**
* @brief Construct from 4 scalar values.
*
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
{
float32x4_t v { a, b, c, d };
m = v;
}
/**
* @brief Construct from an existing SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
{
m = a;
}
/**
* @brief Get the scalar value of a single lane.
*
* TODO: Can we do better for lane0, which is the common case for VLA?
*/
template <int l> ASTCENC_SIMD_INLINE float lane() const
{
return vgetq_lane_f32(m, l);
}
/**
* @brief Set the scalar value of a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
{
m = vld1q_lane_f32(&a, m, l);
}
/**
* @brief Factory that returns a vector of zeros.
*/
static ASTCENC_SIMD_INLINE vfloat4 zero()
{
return vfloat4(vdupq_n_f32(0.0f));
}
/**
* @brief Factory that returns a replicated scalar loaded from memory.
*/
static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
{
return vfloat4(vdupq_n_f32(*p));
}
/**
* @brief Factory that returns a vector loaded from 16B aligned memory.
*/
static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
{
return vfloat4(vld1q_f32(p));
}
/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
alignas(16) float data[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
return vfloat4(vld1q_f32(data));
}
/**
* @brief The vector ...
*/
float32x4_t m;
};
// ============================================================================
// vint4 data type
// ============================================================================
/**
* @brief Data type for 4-wide ints.
*/
struct vint4
{
/**
* @brief Construct from zero-initialized value.
*/
ASTCENC_SIMD_INLINE vint4() {}
/**
* @brief Construct from 4 values loaded from an unaligned address.
*
* Consider using loada() which is better with vectors if data is aligned
* to vector length.
*/
ASTCENC_SIMD_INLINE explicit vint4(const int *p)
{
m = vld1q_s32(p);
}
/**
* @brief Construct from 1 scalar value replicated across all lanes.
*
* Consider using vfloat4::zero() for constexpr zeros.
*/
ASTCENC_SIMD_INLINE explicit vint4(int a)
{
m = vdupq_n_s32(a);
}
/**
* @brief Construct from 4 scalar values.
*
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
{
int32x4_t v = { a, b, c, d };
m = v;
}
/**
* @brief Construct from an existing SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
{
m = a;
}
/**
* @brief Get the scalar from a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE int lane() const
{
return vgetq_lane_s32(m, l);
}
/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vint4 lane_id()
{
alignas(ASTCENC_VECALIGN) int data[4] = { 0, 1, 2, 3 };
return vint4(vld1q_s32(data));
}
/**
* @brief The vector ...
*/
int32x4_t m;
};
// ============================================================================
// vmask4 data type
// ============================================================================
/**
* @brief Data type for 4-wide control plane masks.
*/
struct vmask4
{
/**
* @brief Construct from an existing SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
{
m = a;
}
/**
* @brief Construct from an existing SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
{
m = vreinterpretq_u32_s32(a);
}
/**
* @brief The vector ...
*/
uint32x4_t m;
};
// ============================================================================
// vmask4 operators and functions
// ============================================================================
/**
* @brief Overload: mask union (or).
*/
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
return vmask4(vorrq_u32(a.m, b.m));
}
/**
* @brief Overload: mask intersect (and).
*/
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
return vmask4(vandq_u32(a.m, b.m));
}
/**
* @brief Overload: mask difference (xor).
*/
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
return vmask4(veorq_u32(a.m, b.m));
}
/**
* @brief Overload: mask invert (not).
*/
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
return vmask4(vmvnq_u32(a.m));
}
/**
* @brief Return a 4-bit mask code indicating mask status.
*
* bit0 = lane 0
*/
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
int32x4_t shift = { 0, 1, 2, 3 };
uint32x4_t tmp = vshrq_n_u32(a.m, 31);
return vaddvq_u32(vshlq_u32(tmp, shift));
}
/**
* @brief True if any lanes are enabled, false otherwise.
*/
ASTCENC_SIMD_INLINE bool any(vmask4 a)
{
return mask(a) != 0;
}
/**
* @brief True if all lanes are enabled, false otherwise.
*/
ASTCENC_SIMD_INLINE bool all(vmask4 a)
{
return mask(a) == 0xF;
}
// ============================================================================
// vint4 operators and functions
// ============================================================================
/**
* @brief Overload: vector by vector addition.
*/
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
return vint4(vaddq_s32(a.m, b.m));
}
/**
* @brief Overload: vector by vector subtraction.
*/
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
return vint4(vsubq_s32(a.m, b.m));
}
/**
* @brief Overload: vector bit invert.
*/
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
return vint4(vmvnq_s32(a.m));
}
/**
* @brief Overload: vector by vector bitwise or.
*/
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
return vint4(vorrq_s32(a.m, b.m));
}
/**
* @brief Overload: vector by vector bitwise and.
*/
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
return vint4(vandq_s32(a.m, b.m));
}
/**
* @brief Overload: vector by vector bitwise xor.
*/
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
return vint4(veorq_s32(a.m, b.m));
}
/**
* @brief Overload: vector by vector equality.
*/
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
return vmask4(vceqq_s32(a.m, b.m));
}
/**
* @brief Overload: vector by vector inequality.
*/
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
return ~vmask4(vceqq_s32(a.m, b.m));
}
/**
* @brief Overload: vector by vector less than.
*/
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
return vmask4(vcltq_s32(a.m, b.m));
}
/**
* @brief Overload: vector by vector greater than.
*/
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
return vmask4(vcgtq_s32(a.m, b.m));
}
/**
* @brief Return the min vector of two vectors.
*/
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
return vint4(vminq_s32(a.m, b.m));
}
/**
* @brief Return the max vector of two vectors.
*/
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
return vint4(vmaxq_s32(a.m, b.m));
}
/**
* @brief Return the horizontal minimum of a vector.
*/
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
return vint4(vminvq_s32(a.m));
}
/**
* @brief Store a vector to a 16B aligned memory address.
*/
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
vst1q_s32(p, a.m);
}
/**
* @brief Store lowest N (vector width) bytes into an unaligned address.
*/
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
vst1q_lane_s32((int32_t*)p, a.m, 0);
}
/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
alignas(16) int idx[4];
storea(indices, idx);
alignas(16) int vals[4];
vals[0] = base[idx[0]];
vals[1] = base[idx[1]];
vals[2] = base[idx[2]];
vals[3] = base[idx[3]];
return vint4(vals);
}
/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
{
alignas(16) uint8_t shuf[16] = {
0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
uint8x16_t idx = vld1q_u8(shuf);
int8x16_t av = vreinterpretq_s8_s32(a.m);
return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
}
/**
* @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
*/
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
uint32x4_t mask = vcgeq_u32(cond.m, msb);
return vint4(vbslq_s32(mask, b.m, a.m));
}
/**
* @brief Debug function to print a vector of ints.
*/
ASTCENC_SIMD_INLINE void print(vint4 a)
{
alignas(16) int v[4];
storea(a, v);
printf("v4_i32:\n %8u %8u %8u %8u\n",
v[0], v[1], v[2], v[3]);
}
// ============================================================================
// vfloat4 operators and functions
// ============================================================================
/**
* @brief Overload: vector by vector addition.
*/
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
return vfloat4(vaddq_f32(a.m, b.m));
}
/**
* @brief Overload: vector by vector subtraction.
*/
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
return vfloat4(vsubq_f32(a.m, b.m));
}
/**
* @brief Overload: vector by vector multiplication.
*/
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
return vfloat4(vmulq_f32(a.m, b.m));
}
/**
* @brief Overload: vector by scalar multiplication.
*/
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
{
float32x4_t bv = vld1q_dup_f32(&b);
return vfloat4(vmulq_f32(a.m, bv));
}
/**
* @brief Overload: scalar by vector multiplication.
*/
ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
{
float32x4_t av = vld1q_dup_f32(&a);
return vfloat4(vmulq_f32(av, b.m));
}
/**
* @brief Overload: vector by vector division.
*/
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
return vfloat4(vdivq_f32(a.m, b.m));
}
/**
* @brief Overload: vector by vector equality.
*/
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
return vmask4(vceqq_f32(a.m, b.m));
}
/**
* @brief Overload: vector by vector inequality.
*/
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
}
/**
* @brief Overload: vector by vector less than.
*/
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
return vmask4(vcltq_f32(a.m, b.m));
}
/**
* @brief Overload: vector by vector greater than.
*/
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
return vmask4(vcgtq_f32(a.m, b.m));
}
/**
* @brief Overload: vector by vector less than or equal.
*/
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
return vmask4(vcleq_f32(a.m, b.m));
}
/**
* @brief Overload: vector by vector greater than or equal.
*/
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
return vmask4(vcgeq_f32(a.m, b.m));
}
/**
* @brief Return the min vector of two vectors.
*
* If either lane value is NaN, @c b will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
// Do not reorder - second operand will return if either is NaN
return vfloat4(vminnmq_f32(a.m, b.m));
}
/**
* @brief Return the max vector of two vectors.
*
* If either lane value is NaN, @c b will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
// Do not reorder - second operand will return if either is NaN
return vfloat4(vmaxnmq_f32(a.m, b.m));
}
/**
* @brief Return the clamped value between min and max.
*
* It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
* then @c min will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clamp(float min, float max, vfloat4 a)
{
float32x4_t minv = vdupq_n_f32(min);
float32x4_t maxv = vdupq_n_f32(max);
return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
}
/**
* @brief Return a clamped value between 0.0f and max.
*
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
* be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clampz(float max, vfloat4 a)
{
// Do not reorder - second operand will return if either is NaN
float32x4_t minv = vdupq_n_f32(0.0f);
float32x4_t maxv = vdupq_n_f32(max);
return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
}
/**
* @brief Return a clamped value between 0.0f and 1.0f.
*
* If @c a is NaN then zero will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
{
float32x4_t minv = vdupq_n_f32(0.0f);
float32x4_t maxv = vdupq_n_f32(1.0f);
return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
}
/**
* @brief Return the absolute value of the float vector.
*/
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
float32x4_t zero = vdupq_n_f32(0.0f);
float32x4_t inv = vsubq_f32(zero, a.m);
return vfloat4(vmaxq_f32(a.m, inv));
}
/**
* @brief Return a float rounded to the nearest integer value.
*
* TODO: Can we do a better fallback here, if we exploit the fact that we
* can assume that values are positive?
*/
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
return vfloat4(vrndnq_f32(a.m));
}
/**
* @brief Return the horizontal minimum of a vector.
*/
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
return vfloat4(vminvq_f32(a.m));
}
/**
* @brief Return the sqrt of the lanes in the vector.
*/
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
return vfloat4(vsqrtq_f32(a.m));
}
/**
* @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
*/
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
uint32x4_t mask = vcgeq_u32(cond.m, msb);
return vfloat4(vbslq_f32(mask, b.m, a.m));
}
/**
* @brief Load a vector of gathered results from an array;
*/
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
alignas(16) int idx[4];
storea(indices, idx);
alignas(16) float vals[4];
vals[0] = base[idx[0]];
vals[1] = base[idx[1]];
vals[2] = base[idx[2]];
vals[3] = base[idx[3]];
return vfloat4(vals);
}
/**
* @brief Store a vector to an unaligned memory address.
*/
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
{
vst1q_f32(p, a.m);
}
/**
* @brief Store a vector to a 16B aligned memory address.
*/
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
{
vst1q_f32(p, a.m);
}
/**
* @brief Return the dot product for the full 4 lanes, returning vector.
*/
ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
{
return vfloat4(vaddvq_f32(vmulq_f32(a.m, b.m)));
}
/**
* @brief Return a integer value for a float vector, using truncation.
*/
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
return vint4(vcvtq_s32_f32(a.m));
}
/**
* @brief Return a integer value for a float vector, using round-to-nearest.
*/
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
a = round(a);
return vint4(vcvtq_s32_f32(a.m));
}
/**
* @brief Return a float value as an integer bit pattern (i.e. no conversion).
*
* It is a common trick to convert floats into integer bit patterns, perform
* some bit hackery based on knowledge they are IEEE 754 layout, and then
* convert them back again. This is the first half of that flip.
*/
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
return vint4(vreinterpretq_s32_f32(a.m));
}
/**
* @brief Return a integer value as a float bit pattern (i.e. no conversion).
*
* It is a common trick to convert floats into integer bit patterns, perform
* some bit hackery based on knowledge they are IEEE 754 layout, and then
* convert them back again. This is the second half of that flip.
*/
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
{
return vfloat4(vreinterpretq_f32_s32(v.m));
}
/**
* @brief Debug function to print a vector of floats.
*/
ASTCENC_SIMD_INLINE void print(vfloat4 a)
{
alignas(16) float v[4];
storea(a, v);
printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
(double)v[0], (double)v[1], (double)v[2], (double)v[3]);
}
#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
+16 -2
View File
@@ -110,17 +110,29 @@ endif()
if(${ISA_SIMD} MATCHES "none")
target_compile_definitions(astcenc-${ISA_SIMD}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0)
target_compile_options(astcenc-${ISA_SIMD}
if (${ARCH} MATCHES x64)
target_compile_options(astcenc-${ISA_SIMD}
PRIVATE
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
endif()
elseif(${ISA_SIMD} MATCHES "neon")
target_compile_definitions(astcenc-${ISA_SIMD}
PRIVATE
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
ASTCENC_NEON=1
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0)
elseif(${ISA_SIMD} MATCHES "sse2")
target_compile_definitions(astcenc-${ISA_SIMD}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=20
ASTCENC_AVX=0
ASTCENC_POPCNT=0)
@@ -132,6 +144,7 @@ elseif(${ISA_SIMD} MATCHES "sse2")
elseif(${ISA_SIMD} MATCHES "sse4.1")
target_compile_definitions(astcenc-${ISA_SIMD}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=0
ASTCENC_POPCNT=1)
@@ -143,6 +156,7 @@ elseif(${ISA_SIMD} MATCHES "sse4.1")
elseif(${ISA_SIMD} MATCHES "avx2")
target_compile_definitions(astcenc-${ISA_SIMD}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=2
ASTCENC_POPCNT=1)
+12 -5
View File
@@ -285,23 +285,30 @@ def get_encoder_params(encoderName, referenceName, imageSet):
outDir = "Test/Images/%s" % imageSet
refName = None
# Latest master
elif encoderName == "ref-master-neon":
# Warning: this option rebuilds a new reference test result for the
# master branch using the user's locally build encoder.
encoder = te.Encoder2x("neon")
name = "reference-master-neon"
outDir = "Test/Images/%s" % imageSet
refName = None
elif encoderName == "ref-master-sse2":
# Warning: this option rebuilds a new reference test result for the
# master branch using the user's locally build encoder in ./Source.
# master branch using the user's locally build encoder.
encoder = te.Encoder2x("sse2")
name = "reference-master-sse2"
outDir = "Test/Images/%s" % imageSet
refName = None
elif encoderName == "ref-master-sse4.1":
# Warning: this option rebuilds a new reference test result for the
# master branch using the user's locally build encoder in ./Source.
# master branch using the user's locally build encoder.
encoder = te.Encoder2x("sse4.1")
name = "reference-master-sse4.1"
outDir = "Test/Images/%s" % imageSet
refName = None
elif encoderName == "ref-master-avx2":
# Warning: this option rebuilds a new reference test result for the
# master branch using the user's locally build encoder in ./Source.
# master branch using the user's locally build encoder.
encoder = te.Encoder2x("avx2")
name = "reference-master-avx2"
outDir = "Test/Images/%s" % imageSet
@@ -327,8 +334,8 @@ def parse_command_line():
refcoders = ["ref-1.7",
"ref-2.0-sse2", "ref-2.0-sse4.1", "ref-2.0-avx2",
"ref-2.1-sse2", "ref-2.1-sse4.1", "ref-2.1-avx2",
"ref-master-sse2", "ref-master-sse4.1", "ref-master-avx2"]
testcoders = ["none", "sse2", "sse4.1", "avx2"]
"ref-master-neon", "ref-master-sse2", "ref-master-sse4.1", "ref-master-avx2"]
testcoders = ["none", "neon", "sse2", "sse4.1", "avx2"]
coders = refcoders + testcoders + ["all", "all-ref"]
parser.add_argument("--encoder", dest="encoders", default="avx2",
choices=coders, help="test encoder variant")
+50 -2
View File
@@ -28,6 +28,7 @@ The directory path is structured:
from collections.abc import Iterable
import os
import re
import subprocess as sp
from PIL import Image as PILImage
@@ -35,6 +36,39 @@ from PIL import Image as PILImage
import testlib.misc as misc
CONVERT_BINARY = ["convert"]
g_ConvertVersion = None
def get_convert_version():
"""
Get the major/minor version of ImageMagick on the system.
"""
global g_ConvertVersion
if g_ConvertVersion is None:
command = list(CONVERT_BINARY)
command += ["--version"]
result = sp.run(command, stdout=sp.PIPE, stderr=sp.PIPE,
check=True, universal_newlines=True)
# Version is top row
version = result.stdout.splitlines()[0]
# ... third token
version = re.split(" ", version)[2]
# ... major/minor/patch/subpatch
version = re.split("\\.|-", version)
numericVersion = float(version[0])
numericVersion += float(version[1]) / 10.0
g_ConvertVersion = numericVersion
return g_ConvertVersion
class ImageException(Exception):
"""
Exception thrown for bad image specification.
@@ -228,6 +262,14 @@ class Image():
Args:
filePath (str): The path to the image on disk.
"""
convert = get_convert_version()
# ImageMagick 7 started to use .tga file origin information. By default
# TGA files store data from bottom up, and define the origin as bottom
# left. We want our color samples to always use a top left origin, even
# if the data is stored in alternative layout.
self.invertYCoords = (convert >= 7.0) and filePath.endswith(".tga")
self.filePath = filePath
self.proxyPath = None
@@ -256,8 +298,14 @@ class Image():
coords = [coords]
for (x, y) in coords:
command = [
"convert", self.filePath,
command = list(CONVERT_BINARY)
command += [self.filePath]
# Invert coordinates if the format needs it
if self.invertYCoords:
command += ["-flip"]
command += [
"-format", "%%[pixel:p{%u,%u}]" % (x, y),
"info:"
]