diff --git a/.gitignore b/.gitignore index bf5e319..37a6af4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Editor and engineering scratch files .vs .vscode +.DS_Store *.log *.diff *.user @@ -12,7 +13,7 @@ Proto Binaries # Build artifacts -astcenc* +astcenc build* # General build artifacts diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..4ff560c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "Source/GoogleTest"] + path = Source/GoogleTest + url = https://github.com/google/googletest.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 4800c9b..1dcc21e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,6 @@ # under the License. # ---------------------------------------------------------------------------- - # CMake configuration cmake_minimum_required(VERSION 3.15) cmake_policy(SET CMP0069 NEW) # LTO support @@ -29,38 +28,77 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_EXPORT_COMPILE_COMMANDS 1) set(PACKAGE_ROOT astcenc) +include(CTest) # Command line configuration -option(ISA_AVX2 "Enable builds for AVX2 SIMD") +function(printopt optName optVal optArch tgtArch) + if(${optVal}) + if(${optArch} MATCHES ${tgtArch}) + message(" -- ${optName} backend - ON") + else() + message(" -- ${optName} backend - SKIPPED (${optArch} only)") + endif() + else() + message(" -- ${optName} backend - OFF") + endif() +endfunction() + +set(VALID_ARCH aarch64 x64) +set(ARCH x64 CACHE STRING "Target architecture") +set_property(CACHE ARCH PROPERTY STRINGS ${VALID_ARCH}) + +message("-- Selecting target astcenc backends:") +list(FIND VALID_ARCH ${ARCH} index) +if(index EQUAL -1) + message(FATAL_ERROR "ARCH must be one of ${VALID_ARCH}") +endif() -message("-- Selecting backend build type(s)") set(ANY_ISA 0) -if(${ISA_AVX2}) + +option(ISA_AVX2 "Enable builds for AVX2 SIMD") +printopt("AVX2" ${ISA_AVX2} "x64" ${ARCH}) +if(${ISA_AVX2} AND ${ARCH} MATCHES "x64") set(ANY_ISA 1) - message(" -- AVX2 backend: ON") endif() option(ISA_SSE41 "Enable builds for SSE4.1 SIMD") -if(${ISA_SSE41}) +printopt("SSE4.1" ${ISA_SSE41} "x64" ${ARCH}) +if(${ISA_SSE41} AND ${ARCH} MATCHES "x64") set(ANY_ISA 1) - message(" -- SSE4.1 backend: ON") endif() option(ISA_SSE2 "Enable builds for SSE2 SIMD") -if(${ISA_SSE2}) +printopt("SSE2" ${ISA_SSE2} "x64" ${ARCH}) +if(${ISA_SSE2} AND ${ARCH} MATCHES "x64") set(ANY_ISA 1) - message(" -- SSE2 backend: ON") endif() -option(ISA_NONE "Enable builds for noSIMD") +option(ISA_NEON "Enable builds for NEON SIMD") +printopt("NEON" ${ISA_NEON} "aarch64" ${ARCH}) +if(${ISA_NEON} AND ${ARCH} MATCHES "aarch64") + set(ANY_ISA 1) +endif() + +option(ISA_NONE "Enable builds for no SIMD") if(${ISA_NONE}) set(ANY_ISA 1) - message(" -- No SIMD backend: ON") + message(" -- No SIMD backend - ON") +else() + message(" -- No SIMD backend - OFF") endif() option(ISA_INVARIANCE "Enable builds for ISA invariance") if(${ISA_INVARIANCE}) - message(" -- ISA invariance: ON") + message(" -- ISA invariant backend - ON") +else() + message(" -- ISA invariant backend - OFF") +endif() + +option(UNITTEST "Enable builds for unit tests") +if(${UNITTEST}) + message(" -- Unit tests - ON") +else() + message(" -- Unit tests - OFF") endif() if(NOT ${ANY_ISA}) diff --git a/Docs/Building.md b/Docs/Building.md index 85b8bb0..e7c5c46 100644 --- a/Docs/Building.md +++ b/Docs/Building.md @@ -22,8 +22,14 @@ to generate the build system. mkdir build cd build -# Create the build system -cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \ +# Configure your build of choice, for example: + +# Arm arch64 +cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \ + -DARCH=aarch64 -DISA_NEON=ON .. + +# x86-64 +cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \ -DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON .. ``` @@ -62,7 +68,13 @@ export CXX=clang++ mkdir build cd build -# Create the build system +# Configure your build of choice, for example: + +# Arm arch64 +cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \ + -DARCH=aarch64 -DISA_NEON=ON .. + +# x86-64 cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \ -DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON .. ``` @@ -88,13 +100,14 @@ For codec developers there are a number of useful features in the build system. ### No intrinsics build -All normal builds will use SIMD accelerated code paths using instrinsics, as -x86-64 guarantees availability of at least SSE2. For development purposes it -is possible to build an intrinsic-free build which uses no explicit SIMD -acceleration (the compiler may still auto-vectorize). +All normal builds will use SIMD accelerated code paths using intrinsics, as all +target architectures (x86-64 and aarch64) guarantee SIMD availability. For +development purposes it is possible to build an intrinsic-free build which uses +no explicit SIMD acceleration (the compiler may still auto-vectorize). To enable this binary variant add `-DISA_NONE=ON` to the CMake command line -when configuring. It is NOT +when configuring. It is NOT recommended to use this for production; it is +significantly slower than the vectorized SIMD builds. ### ISA Invariance @@ -126,12 +139,35 @@ We support and test the following `CMAKE_BUILD_TYPE` options. Note that optimized release builds are compiled with link-time optimization, which can make profiling more challenging ... +### Testing + +We support building unit tests. + +These builds use the `googletest` framework, which is pulled in though a git +submodule. On first use, you must fetch the submodule dependency: + +```shell +git submodule init +git submodule update +``` + +To build unit tests add `-DUNITTEST=ON` to the CMake command line when +configuring. + +To run unit tests use the CMake `ctest` utility from your build directory after +you have built the tests. + +```shell +cd build +ctest --verbose +``` + ### Packaging We support building a release bundle of all enabled binary configurations in the current CMake configuration using the `package` build target -```bash +```shell # Run a build and package build outputs in `./astcenc---.` cd build make package -j16 diff --git a/Docs/ChangeLog.md b/Docs/ChangeLog.md index e1519d4..12ea10c 100644 --- a/Docs/ChangeLog.md +++ b/Docs/ChangeLog.md @@ -18,6 +18,9 @@ stable across versions, and this release is not compatible with 2.1. Please recompile your client-side code using the updated `astcenc.h` header. * **General:** + * **Feature:** New Arm aarch64 NEON accelerated vector library. Note that at + this time Arm builds must be built from source; pre-built binaries are not + provided in this release. * **Improvement:** SSE4.2 feature profile changed to SSE4.1, which more accurately reflects the feature set used. * **Improvement:** Build system changed to use CMake for all platforms. @@ -35,8 +38,8 @@ recompile your client-side code using the updated `astcenc.h` header. scales RGB values by the alpha value. This can be useful to minimize cross-channel color bleed caused by GPU post-multiply filtering/blending. * **Improvements:** Command line tool cleanly traps and reports errors for - corrupt input images rather than relying on hard standard library - `assert()` calls. + corrupt input images rather than relying on standard library `assert()` + calls in release builds. * **Core API:** * **API Change:** Images using region-based metrics no longer need to include padding; all input images should be tightly packed and `dim_pad` is removed diff --git a/Docs/Testing.md b/Docs/Testing.md index bf985f0..7b11dc1 100644 --- a/Docs/Testing.md +++ b/Docs/Testing.md @@ -14,10 +14,27 @@ can be achieved by configuring the CMake build using the install prefix `-DCMAKE_INSTALL_PREFIX=../` and then running a build with the `install` build target. -# Running unit tests +# Running C++ unit tests -To run the command line unit tests, which aim to get coverage of the command -line options and core codec stability without testing the compression quality +We support a small (but growing) number of C++ unit tests, which are written +using the `googletest` framework and integrated in the CMake "CTest" test +framework. + +To build unit tests pull the `googletest` git submodule and add `-DUNITTEST=ON` +to the CMake command line when configuring. + +To run unit tests use the CMake `ctest` utility from your build directory after +you have built the tests. + +```shell +cd build +ctest --verbose +``` + +# Running command line tests + +To run the command line tests, which aim to get coverage of the command line +options and core codec stability without testing the compression quality itself, run the command line: python3 -m unittest discover -s Test -p astc_test*.py -v diff --git a/Source/CMakeLists.txt b/Source/CMakeLists.txt index fb17616..dc2455f 100644 --- a/Source/CMakeLists.txt +++ b/Source/CMakeLists.txt @@ -21,6 +21,25 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto") endif() +# - - - - - - - - - - - - - - - - - - +# No architecture-specific SIMD + +if (${ISA_NONE}) + set(ISA_SIMD none) + include(cmake_core.cmake) +endif() + +# - - - - - - - - - - - - - - - - - - +# Armv8-A architecture-specific SIMD + +if (${ISA_NEON}) + set(ISA_SIMD neon) + include(cmake_core.cmake) +endif() + +# - - - - - - - - - - - - - - - - - - +# x86-64 architecture-specific SIMD + if (${ISA_AVX2}) set(ISA_SIMD avx2) include(cmake_core.cmake) @@ -36,7 +55,11 @@ if (${ISA_SSE2}) include(cmake_core.cmake) endif() -if (${ISA_NONE}) - set(ISA_SIMD none) - include(cmake_core.cmake) +# - - - - - - - - - - - - - - - - - - +# Unit testing +if (${UNITTEST}) + set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) + add_subdirectory(GoogleTest) + enable_testing() + add_subdirectory(UnitTest) endif() diff --git a/Source/GoogleTest b/Source/GoogleTest new file mode 160000 index 0000000..703bd9c --- /dev/null +++ b/Source/GoogleTest @@ -0,0 +1 @@ +Subproject commit 703bd9caab50b139428cea1aaff9974ebee5742e diff --git a/Source/UnitTest/CMakeLists.txt b/Source/UnitTest/CMakeLists.txt new file mode 100644 index 0000000..e0357b9 --- /dev/null +++ b/Source/UnitTest/CMakeLists.txt @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: Apache-2.0 +# ---------------------------------------------------------------------------- +# Copyright 2020 Arm Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# - - - - - - - - - - - - - - - - - - +# No architecture-specific SIMD + +if (${ISA_NONE}) + set(ISA_SIMD none) + include(cmake_core.cmake) +endif() + +# - - - - - - - - - - - - - - - - - - +# Armv8-A architecture-specific SIMD + +if (${ISA_NEON}) + set(ISA_SIMD neon) + include(cmake_core.cmake) +endif() + +# - - - - - - - - - - - - - - - - - - +# x86-64 architecture-specific SIMD + +if (${ISA_AVX2}) + set(ISA_SIMD avx2) + include(cmake_core.cmake) +endif() + +if (${ISA_SSE41}) + set(ISA_SIMD sse4.1) + include(cmake_core.cmake) +endif() + +if (${ISA_SSE2}) + set(ISA_SIMD sse2) + include(cmake_core.cmake) +endif() diff --git a/Source/UnitTest/cmake_core.cmake b/Source/UnitTest/cmake_core.cmake new file mode 100644 index 0000000..98858db --- /dev/null +++ b/Source/UnitTest/cmake_core.cmake @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +# ---------------------------------------------------------------------------- +# Copyright 2020 Arm Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +add_executable(test-simd-${ISA_SIMD}) + +target_sources(test-simd-${ISA_SIMD} + PRIVATE + test_simd.cpp) + +target_include_directories(test-simd-${ISA_SIMD} + PRIVATE + ${gtest_SOURCE_DIR}/include) + +target_compile_options(test-simd-${ISA_SIMD} + PRIVATE + # Use pthreads on Linux/macOS + $<$:-pthread> + + # MSVC compiler defines + $<$:/EHsc> + + # G++ and Clang++ compiler defines + $<$>:-Wall> + $<$>:-Wextra> + $<$>:-Wpedantic> + $<$>:-Werror> + $<$>:-Wshadow> + $<$>:-Wdouble-promotion>) + +# Set up configuration for SIMD ISA builds +if(${ISA_SIMD} MATCHES "none") + target_compile_definitions(test-simd-${ISA_SIMD} + PRIVATE + ASTCENC_NEON=0 + ASTCENC_SSE=0 + ASTCENC_AVX=0 + ASTCENC_POPCNT=0) + + if (${ARCH} MATCHES x64) + target_compile_options(astcenc-${ISA_SIMD} + PRIVATE + $<$:-mfpmath=sse -msse2>) + endif() + +elseif(${ISA_SIMD} MATCHES "neon") + target_compile_definitions(test-simd-${ISA_SIMD} + PRIVATE + ASTCENC_NEON=1 + ASTCENC_SSE=0 + ASTCENC_AVX=0 + ASTCENC_POPCNT=0) + +elseif(${ISA_SIMD} MATCHES "sse2") + target_compile_definitions(test-simd-${ISA_SIMD} + PRIVATE + ASTCENC_NEON=0 + ASTCENC_SSE=20 + ASTCENC_AVX=0 + ASTCENC_POPCNT=0) + + target_compile_options(test-simd-${ISA_SIMD} + PRIVATE + $<$:-mfpmath=sse -msse2>) + +elseif(${ISA_SIMD} MATCHES "sse4.1") + target_compile_definitions(test-simd-${ISA_SIMD} + PRIVATE + ASTCENC_NEON=0 + ASTCENC_SSE=41 + ASTCENC_AVX=0 + ASTCENC_POPCNT=1) + + target_compile_options(test-simd-${ISA_SIMD} + PRIVATE + $<$>:-mfpmath=sse -msse4.1 -mpopcnt>) + +elseif(${ISA_SIMD} MATCHES "avx2") + target_compile_definitions(test-simd-${ISA_SIMD} + PRIVATE + ASTCENC_NEON=0 + ASTCENC_SSE=41 + ASTCENC_AVX=2 + ASTCENC_POPCNT=1) + + target_compile_options(test-simd-${ISA_SIMD} + PRIVATE + $<$>:-mfpmath=sse -mavx2 -mpopcnt> + $<$:/arch:AVX2>) +endif() + +target_link_libraries(test-simd-${ISA_SIMD} + PRIVATE + gtest_main) + +add_test(NAME test-simd-${ISA_SIMD} + COMMAND test-simd-${ISA_SIMD}) + +install(TARGETS test-simd-${ISA_SIMD} DESTINATION ${PACKAGE_ROOT}) diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp new file mode 100644 index 0000000..e5333ad --- /dev/null +++ b/Source/UnitTest/test_simd.cpp @@ -0,0 +1,1004 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2020 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Unit tests for the vectorized SIMD functionality. + * + * This test suite is a partial implementation, focussing on 4-wide vectors. + * We're adding things as we touch related parts of the code, but there is some + * technical debt to catch up on to get full coverage. + */ + +#include + +#include "gtest/gtest.h" + +#include "../astcenc_internal.h" +#include "../astcenc_vecmathlib.h" + +namespace astcenc +{ + +#if ASTCENC_SIMD_WIDTH == 4 + +// VLA (4-wide) tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/** \brief Test VLA change_sign. */ +TEST(vfloat, ChangeSign) +{ + vfloat a(-1.0f, 1.0f, -3.12f, 3.12f); + vfloat b(-1.0f, -1.0f, 3.12f, 3.12f); + vfloat r = change_sign(a, b); + EXPECT_EQ(r.lane<0>(), 1.0f); + EXPECT_EQ(r.lane<1>(), -1.0f); + EXPECT_EQ(r.lane<2>(), -3.12f); + EXPECT_EQ(r.lane<3>(), 3.12f); +} + +/** \brief Test VLA atan. */ +TEST(vfloat, Atan) +{ + vfloat a(-0.15f, 0.0f, 0.9f, 2.1f); + vfloat r = atan(a); + EXPECT_NEAR(r.lane<0>(), -0.149061f, 0.005f); + EXPECT_NEAR(r.lane<1>(), 0.000000f, 0.005f); + EXPECT_NEAR(r.lane<2>(), 0.733616f, 0.005f); + EXPECT_NEAR(r.lane<3>(), 1.123040f, 0.005f); +} + +/** \brief Test VLA atan2. */ +TEST(vfloat, Atan2) +{ + vfloat a(-0.15f, 0.0f, 0.9f, 2.1f); + vfloat b(1.15f, -3.0f, -0.9f, 1.1f); + vfloat r = atan2(a, b); + EXPECT_NEAR(r.lane<0>(), -0.129816f, 0.005f); + EXPECT_NEAR(r.lane<1>(), 3.141592f, 0.005f); + EXPECT_NEAR(r.lane<2>(), 2.360342f, 0.005f); + EXPECT_NEAR(r.lane<3>(), 1.084357f, 0.005f); +} + +#endif + +#if ASTCENC_SIMD_WIDTH >= 4 + +static const float qnan = std::numeric_limits::quiet_NaN(); +alignas(16) static const float f32x4_data[5] { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f }; +alignas(16) static const int s32x4_data[5] { 0, 1, 2, 3, 4 }; + +// VFLOAT4 tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/** \brief Test unaligned vfloat4 data load. */ +TEST(vfloat4, UnalignedLoad) +{ + vfloat4 a(&(f32x4_data[1])); + EXPECT_EQ(a.lane<0>(), 1.0f); + EXPECT_EQ(a.lane<1>(), 2.0f); + EXPECT_EQ(a.lane<2>(), 3.0f); + EXPECT_EQ(a.lane<3>(), 4.0f); +} + +/** \brief Test scalar duplicated vfloat4 load. */ +TEST(vfloat4, ScalarDupLoad) +{ + vfloat4 a(1.1f); + EXPECT_EQ(a.lane<0>(), 1.1f); + EXPECT_EQ(a.lane<1>(), 1.1f); + EXPECT_EQ(a.lane<2>(), 1.1f); + EXPECT_EQ(a.lane<3>(), 1.1f); +} + +/** \brief Test scalar vfloat4 load. */ +TEST(vfloat4, ScalarLoad) +{ + vfloat4 a(1.1f, 2.2f, 3.3f, 4.4f); + EXPECT_EQ(a.lane<0>(), 1.1f); + EXPECT_EQ(a.lane<1>(), 2.2f); + EXPECT_EQ(a.lane<2>(), 3.3f); + EXPECT_EQ(a.lane<3>(), 4.4f); +} + +/** \brief Test copy vfloat4 load. */ +TEST(vfloat4, CopyLoad) +{ + vfloat4 s(1.1f, 2.2f, 3.3f, 4.4f); + vfloat4 a(s.m); + EXPECT_EQ(a.lane<0>(), 1.1f); + EXPECT_EQ(a.lane<1>(), 2.2f); + EXPECT_EQ(a.lane<2>(), 3.3f); + EXPECT_EQ(a.lane<3>(), 4.4f); +} + +/** \brief Test vfloat4 scalar lane set. */ +TEST(vfloat4, SetLane) +{ + vfloat4 a(0.0f); + + a.set_lane<0>(1.0f); + EXPECT_EQ(a.lane<0>(), 1.0f); + EXPECT_EQ(a.lane<1>(), 0.0f); + EXPECT_EQ(a.lane<2>(), 0.0f); + EXPECT_EQ(a.lane<3>(), 0.0f); + + a.set_lane<1>(2.0f); + EXPECT_EQ(a.lane<0>(), 1.0f); + EXPECT_EQ(a.lane<1>(), 2.0f); + EXPECT_EQ(a.lane<2>(), 0.0f); + EXPECT_EQ(a.lane<3>(), 0.0f); + + a.set_lane<2>(3.0f); + EXPECT_EQ(a.lane<0>(), 1.0f); + EXPECT_EQ(a.lane<1>(), 2.0f); + EXPECT_EQ(a.lane<2>(), 3.0f); + EXPECT_EQ(a.lane<3>(), 0.0f); + + a.set_lane<3>(4.0f); + EXPECT_EQ(a.lane<0>(), 1.0f); + EXPECT_EQ(a.lane<1>(), 2.0f); + EXPECT_EQ(a.lane<2>(), 3.0f); + EXPECT_EQ(a.lane<3>(), 4.0f); +} + +/** \brief Test vfloat4 zero. */ +TEST(vfloat4, Zero) +{ + vfloat4 a = vfloat4::zero(); + EXPECT_EQ(a.lane<0>(), 0.0f); + EXPECT_EQ(a.lane<1>(), 0.0f); + EXPECT_EQ(a.lane<2>(), 0.0f); + EXPECT_EQ(a.lane<3>(), 0.0f); +} + +/** \brief Test vfloat4 load1. */ +TEST(vfloat4, Load1) +{ + float s = 3.14f; + vfloat4 a = vfloat4::load1(&s); + EXPECT_EQ(a.lane<0>(), 3.14f); + EXPECT_EQ(a.lane<1>(), 3.14f); + EXPECT_EQ(a.lane<2>(), 3.14f); + EXPECT_EQ(a.lane<3>(), 3.14f); +} + +/** \brief Test vfloat4 loada. */ +TEST(vfloat4, Loada) +{ + vfloat4 a(&(f32x4_data[0])); + EXPECT_EQ(a.lane<0>(), 0.0f); + EXPECT_EQ(a.lane<1>(), 1.0f); + EXPECT_EQ(a.lane<2>(), 2.0f); + EXPECT_EQ(a.lane<3>(), 3.0f); +} + +/** \brief Test vfloat4 lane_id. */ +TEST(vfloat4, LaneID) +{ + vfloat4 a = vfloat4::lane_id(); + EXPECT_EQ(a.lane<0>(), 0.0f); + EXPECT_EQ(a.lane<1>(), 1.0f); + EXPECT_EQ(a.lane<2>(), 2.0f); + EXPECT_EQ(a.lane<3>(), 3.0f); +} + +/** \brief Test vfloat4 add. */ +TEST(vfloat4, vadd) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b(0.1f, 0.2f, 0.3f, 0.4f); + a = a + b; + EXPECT_EQ(a.lane<0>(), 1.0f + 0.1f); + EXPECT_EQ(a.lane<1>(), 2.0f + 0.2f); + EXPECT_EQ(a.lane<2>(), 3.0f + 0.3f); + EXPECT_EQ(a.lane<3>(), 4.0f + 0.4f); +} + +/** \brief Test vfloat4 sub. */ +TEST(vfloat4, vsub) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b(0.1f, 0.2f, 0.3f, 0.4f); + a = a - b; + EXPECT_EQ(a.lane<0>(), 1.0f - 0.1f); + EXPECT_EQ(a.lane<1>(), 2.0f - 0.2f); + EXPECT_EQ(a.lane<2>(), 3.0f - 0.3f); + EXPECT_EQ(a.lane<3>(), 4.0f - 0.4f); +} + +/** \brief Test vfloat4 mul. */ +TEST(vfloat4, vmul) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b(0.1f, 0.2f, 0.3f, 0.4f); + a = a * b; + EXPECT_EQ(a.lane<0>(), 1.0f * 0.1f); + EXPECT_EQ(a.lane<1>(), 2.0f * 0.2f); + EXPECT_EQ(a.lane<2>(), 3.0f * 0.3f); + EXPECT_EQ(a.lane<3>(), 4.0f * 0.4f); +} + +/** \brief Test vfloat4 mul. */ +TEST(vfloat4, vsmul) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + float b = 3.14f; + a = a * b; + EXPECT_EQ(a.lane<0>(), 1.0f * 3.14f); + EXPECT_EQ(a.lane<1>(), 2.0f * 3.14f); + EXPECT_EQ(a.lane<2>(), 3.0f * 3.14f); + EXPECT_EQ(a.lane<3>(), 4.0f * 3.14f); +} + +/** \brief Test vfloat4 mul. */ +TEST(vfloat4, svmul) +{ + float a = 3.14f; + vfloat4 b(1.0f, 2.0f, 3.0f, 4.0f); + b = a * b; + EXPECT_EQ(b.lane<0>(), 3.14f * 1.0f); + EXPECT_EQ(b.lane<1>(), 3.14f * 2.0f); + EXPECT_EQ(b.lane<2>(), 3.14f * 3.0f); + EXPECT_EQ(b.lane<3>(), 3.14f * 4.0f); +} + +/** \brief Test vfloat4 div. */ +TEST(vfloat4, vdiv) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b(0.1f, 0.2f, 0.3f, 0.4f); + a = a / b; + EXPECT_EQ(a.lane<0>(), 1.0f / 0.1f); + EXPECT_EQ(a.lane<1>(), 2.0f / 0.2f); + EXPECT_EQ(a.lane<2>(), 3.0f / 0.3f); + EXPECT_EQ(a.lane<3>(), 4.0f / 0.4f); +} + +/** \brief Test vfloat4 ceq. */ +TEST(vfloat4, ceq) +{ + vfloat4 a1(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b1(0.1f, 0.2f, 0.3f, 0.4f); + vmask r1 = a1 == b1; + EXPECT_EQ(0, mask(r1)); + EXPECT_EQ(false, any(r1)); + EXPECT_EQ(false, all(r1)); + + vfloat4 a2(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b2(1.0f, 0.2f, 0.3f, 0.4f); + vmask r2 = a2 == b2; + EXPECT_EQ(0x1, mask(r2)); + EXPECT_EQ(true, any(r2)); + EXPECT_EQ(false, all(r2)); + + vfloat4 a3(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b3(1.0f, 0.2f, 3.0f, 0.4f); + vmask r3 = a3 == b3; + EXPECT_EQ(0x5, mask(r3)); + EXPECT_EQ(true, any(r3)); + EXPECT_EQ(false, all(r3)); + + vfloat4 a4(1.0f, 2.0f, 3.0f, 4.0f); + vmask r4 = a4 == a4; + EXPECT_EQ(0xF, mask(r4)); + EXPECT_EQ(true, any(r4)); + EXPECT_EQ(true, all(r4)); +} + +/** \brief Test vfloat4 cne. */ +TEST(vfloat4, cne) +{ + vfloat4 a1(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b1(0.1f, 0.2f, 0.3f, 0.4f); + vmask r1 = a1 != b1; + EXPECT_EQ(0xF, mask(r1)); + EXPECT_EQ(true, any(r1)); + EXPECT_EQ(true, all(r1)); + + vfloat4 a2(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b2(1.0f, 0.2f, 0.3f, 0.4f); + vmask r2 = a2 != b2; + EXPECT_EQ(0xE, mask(r2)); + EXPECT_EQ(true, any(r2)); + EXPECT_EQ(false, all(r2)); + + vfloat4 a3(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b3(1.0f, 0.2f, 3.0f, 0.4f); + vmask r3 = a3 != b3; + EXPECT_EQ(0xA, mask(r3)); + EXPECT_EQ(true, any(r3)); + EXPECT_EQ(false, all(r3)); + + vfloat4 a4(1.0f, 2.0f, 3.0f, 4.0f); + vmask r4 = a4 != a4; + EXPECT_EQ(0, mask(r4)); + EXPECT_EQ(false, any(r4)); + EXPECT_EQ(false, all(r4)); +} + +/** \brief Test vfloat4 clt. */ +TEST(vfloat4, clt) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f); + vmask r = a < b; + EXPECT_EQ(0xA, mask(r)); +} + +/** \brief Test vfloat4 cle. */ +TEST(vfloat4, cle) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f); + vmask r = a <= b; + EXPECT_EQ(0xE, mask(r)); +} + +/** \brief Test vfloat4 cgt. */ +TEST(vfloat4, cgt) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f); + vmask r = a > b; + EXPECT_EQ(0x1, mask(r)); +} + +/** \brief Test vfloat4 cge. */ +TEST(vfloat4, cge) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f); + vmask r = a >= b; + EXPECT_EQ(0x5, mask(r)); +} + +/** \brief Test vfloat4 min. */ +TEST(vfloat4, min) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f); + vfloat4 r = min(a, b); + EXPECT_EQ(r.lane<0>(), 0.9f); + EXPECT_EQ(r.lane<1>(), 2.0f); + EXPECT_EQ(r.lane<2>(), 3.0f); + EXPECT_EQ(r.lane<3>(), 4.0f); +} + +/** \brief Test vfloat4 max. */ +TEST(vfloat4, max) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f); + vfloat4 r = max(a, b); + EXPECT_EQ(r.lane<0>(), 1.0f); + EXPECT_EQ(r.lane<1>(), 2.1f); + EXPECT_EQ(r.lane<2>(), 3.0f); + EXPECT_EQ(r.lane<3>(), 4.1f); +} + +/** \brief Test vfloat4 clamp. */ +TEST(vfloat4, clamp) +{ + vfloat4 a1(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 r1 = clamp(2.1f, 3.0f, a1); + EXPECT_EQ(r1.lane<0>(), 2.1f); + EXPECT_EQ(r1.lane<1>(), 2.1f); + EXPECT_EQ(r1.lane<2>(), 3.0f); + EXPECT_EQ(r1.lane<3>(), 3.0f); + + vfloat4 a2(1.0f, 2.0f, qnan, 4.0f); + vfloat4 r2 = clamp(2.1f, 3.0f, a2); + EXPECT_EQ(r2.lane<0>(), 2.1f); + EXPECT_EQ(r2.lane<1>(), 2.1f); + EXPECT_EQ(r2.lane<2>(), 2.1f); + EXPECT_EQ(r2.lane<3>(), 3.0f); +} + +/** \brief Test vfloat4 clampz. */ +TEST(vfloat4, clampz) +{ + vfloat4 a1(-1.0f, 0.0f, 0.1f, 4.0f); + vfloat4 r1 = clampz(3.0f, a1); + EXPECT_EQ(r1.lane<0>(), 0.0f); + EXPECT_EQ(r1.lane<1>(), 0.0f); + EXPECT_EQ(r1.lane<2>(), 0.1f); + EXPECT_EQ(r1.lane<3>(), 3.0f); + + vfloat4 a2(-1.0f, 0.0f, qnan, 4.0f); + vfloat4 r2 = clampz(3.0f, a2); + EXPECT_EQ(r2.lane<0>(), 0.0f); + EXPECT_EQ(r2.lane<1>(), 0.0f); + EXPECT_EQ(r2.lane<2>(), 0.0f); + EXPECT_EQ(r2.lane<3>(), 3.0f); +} + +/** \brief Test vfloat4 clampz. */ +TEST(vfloat4, clampzo) +{ + vfloat4 a1(-1.0f, 0.0f, 0.1f, 4.0f); + vfloat4 r1 = clampzo(a1); + EXPECT_EQ(r1.lane<0>(), 0.0f); + EXPECT_EQ(r1.lane<1>(), 0.0f); + EXPECT_EQ(r1.lane<2>(), 0.1f); + EXPECT_EQ(r1.lane<3>(), 1.0f); + + vfloat4 a2(-1.0f, 0.0f, qnan, 4.0f); + vfloat4 r2 = clampzo(a2); + EXPECT_EQ(r2.lane<0>(), 0.0f); + EXPECT_EQ(r2.lane<1>(), 0.0f); + EXPECT_EQ(r2.lane<2>(), 0.0f); + EXPECT_EQ(r2.lane<3>(), 1.0f); +} + +/** \brief Test vfloat4 abs. */ +TEST(vfloat4, abs) +{ + vfloat4 a(-1.0f, 0.0f, 0.1f, 4.0f); + vfloat4 r = abs(a); + EXPECT_EQ(r.lane<0>(), 1.0f); + EXPECT_EQ(r.lane<1>(), 0.0f); + EXPECT_EQ(r.lane<2>(), 0.1f); + EXPECT_EQ(r.lane<3>(), 4.0f); +} + +/** \brief Test vfloat4 round. */ +TEST(vfloat4, round) +{ + vfloat4 a(1.1f, 1.5f, 1.6f, 4.0f); + vfloat4 r = round(a); + EXPECT_EQ(r.lane<0>(), 1.0f); + EXPECT_EQ(r.lane<1>(), 2.0f); + EXPECT_EQ(r.lane<2>(), 2.0f); + EXPECT_EQ(r.lane<3>(), 4.0f); +} + +/** \brief Test vfloat4 hmin. */ +TEST(vfloat4, hmin) +{ + vfloat4 a1(1.1f, 1.5f, 1.6f, 4.0f); + vfloat4 r1 = hmin(a1); + EXPECT_EQ(r1.lane<0>(), 1.1f); + EXPECT_EQ(r1.lane<1>(), 1.1f); + EXPECT_EQ(r1.lane<2>(), 1.1f); + EXPECT_EQ(r1.lane<3>(), 1.1f); + + vfloat4 a2(1.1f, 1.5f, 1.6f, 0.2f); + vfloat4 r2 = hmin(a2); + EXPECT_EQ(r2.lane<0>(), 0.2f); + EXPECT_EQ(r2.lane<1>(), 0.2f); + EXPECT_EQ(r2.lane<2>(), 0.2f); + EXPECT_EQ(r2.lane<3>(), 0.2f); +} + +/** \brief Test vfloat4 sqrt. */ +TEST(vfloat4, sqrt) +{ + vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f); + vfloat4 r = sqrt(a); + EXPECT_EQ(r.lane<0>(), std::sqrt(1.0f)); + EXPECT_EQ(r.lane<1>(), std::sqrt(2.0f)); + EXPECT_EQ(r.lane<2>(), std::sqrt(3.0f)); + EXPECT_EQ(r.lane<3>(), std::sqrt(4.0f)); +} + +/** \brief Test vfloat4 select. */ +TEST(vfloat4, select) +{ + vfloat4 m1(1.0f, 1.0f, 1.0f, 1.0f); + vfloat4 m2(1.0f, 2.0f, 1.0f, 2.0f); + vmask4 cond = m1 == m2; + + vfloat4 a(1.0f, 3.0f, 3.0f, 1.0f); + vfloat4 b(4.0f, 2.0f, 2.0f, 4.0f); + + // Select in one direction + vfloat4 r1 = select(a, b, cond); + EXPECT_EQ(r1.lane<0>(), 4.0f); + EXPECT_EQ(r1.lane<1>(), 3.0f); + EXPECT_EQ(r1.lane<2>(), 2.0f); + EXPECT_EQ(r1.lane<3>(), 1.0f); + + // Select in the other + vfloat4 r2 = select(b, a, cond); + EXPECT_EQ(r2.lane<0>(), 1.0f); + EXPECT_EQ(r2.lane<1>(), 2.0f); + EXPECT_EQ(r2.lane<2>(), 3.0f); + EXPECT_EQ(r2.lane<3>(), 4.0f); +} + +/** \brief Test vfloat4 select MSB only. */ +TEST(vfloat4, select_msb) +{ + vint4 msb(0x80000000u, 0, 0x80000000u, 0); + vmask4 cond(msb.m); + + vfloat4 a(1.0f, 3.0f, 3.0f, 1.0f); + vfloat4 b(4.0f, 2.0f, 2.0f, 4.0f); + + // Select in one direction + vfloat4 r1 = select(a, b, cond); + EXPECT_EQ(r1.lane<0>(), 4.0f); + EXPECT_EQ(r1.lane<1>(), 3.0f); + EXPECT_EQ(r1.lane<2>(), 2.0f); + EXPECT_EQ(r1.lane<3>(), 1.0f); + + // Select in the other + vfloat4 r2 = select(b, a, cond); + EXPECT_EQ(r2.lane<0>(), 1.0f); + EXPECT_EQ(r2.lane<1>(), 2.0f); + EXPECT_EQ(r2.lane<2>(), 3.0f); + EXPECT_EQ(r2.lane<3>(), 4.0f); +} + +/** \brief Test vfloat4 gatherf. */ +TEST(vfloat4, gatherf) +{ + vint4 indices(0, 4, 3, 2); + vfloat4 r = gatherf(f32x4_data, indices); + EXPECT_EQ(r.lane<0>(), 0.0f); + EXPECT_EQ(r.lane<1>(), 4.0f); + EXPECT_EQ(r.lane<2>(), 3.0f); + EXPECT_EQ(r.lane<3>(), 2.0f); +} + +/** \brief Test vfloat4 store. */ +TEST(vfloat4, store) +{ + alignas(16) float out[5]; + vfloat4 a(f32x4_data); + store(a, &(out[1])); + EXPECT_EQ(out[1], 0.0f); + EXPECT_EQ(out[2], 1.0f); + EXPECT_EQ(out[3], 2.0f); + EXPECT_EQ(out[4], 3.0f); +} + +/** \brief Test vfloat4 storea. */ +TEST(vfloat4, storea) +{ + alignas(16) float out[4]; + vfloat4 a(f32x4_data); + store(a, out); + EXPECT_EQ(out[0], 0.0f); + EXPECT_EQ(out[1], 1.0f); + EXPECT_EQ(out[2], 2.0f); + EXPECT_EQ(out[3], 3.0f); +} + +/** \brief Test vfloat4 dot. */ +TEST(vfloat4, dot) +{ + vfloat4 a(1.0f, 2.0f, 4.0f, 8.0f); + vfloat4 b(1.0f, 0.5f, 0.25f, 0.125f); + vfloat4 r = dot(a, b); + EXPECT_EQ(r.lane<0>(), 4.0f); + EXPECT_EQ(r.lane<1>(), 4.0f); + EXPECT_EQ(r.lane<2>(), 4.0f); + EXPECT_EQ(r.lane<3>(), 4.0f); +} + +/** \brief Test vfloat4 float_to_int. */ +TEST(vfloat4, float_to_int) +{ + vfloat4 a(1.1f, 1.5f, 1.6f, 4.0f); + vint4 r = float_to_int(a); + EXPECT_EQ(r.lane<0>(), 1); + EXPECT_EQ(r.lane<1>(), 1); + EXPECT_EQ(r.lane<2>(), 1); + EXPECT_EQ(r.lane<3>(), 4); +} + +/** \brief Test vfloat4 round. */ +TEST(vfloat4, float_to_int_rtn) +{ + vfloat4 a(1.1f, 1.5f, 1.6f, 4.0f); + vint4 r = float_to_int_rtn(a); + EXPECT_EQ(r.lane<0>(), 1); + EXPECT_EQ(r.lane<1>(), 2); + EXPECT_EQ(r.lane<2>(), 2); + EXPECT_EQ(r.lane<3>(), 4); +} + + +// VINT4 tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/** \brief Test unaligned vint4 data load. */ +TEST(vint4, UnalignedLoad) +{ + vint4 a(&(s32x4_data[1])); + EXPECT_EQ(a.lane<0>(), 1); + EXPECT_EQ(a.lane<1>(), 2); + EXPECT_EQ(a.lane<2>(), 3); + EXPECT_EQ(a.lane<3>(), 4); +} + +/** \brief Test scalar duplicated vint4 load. */ +TEST(vint4, ScalarDupLoad) +{ + vint4 a(42); + EXPECT_EQ(a.lane<0>(), 42); + EXPECT_EQ(a.lane<1>(), 42); + EXPECT_EQ(a.lane<2>(), 42); + EXPECT_EQ(a.lane<3>(), 42); +} + +/** \brief Test scalar vint4 load. */ +TEST(vint4, ScalarLoad) +{ + vint4 a(11, 22, 33, 44); + EXPECT_EQ(a.lane<0>(), 11); + EXPECT_EQ(a.lane<1>(), 22); + EXPECT_EQ(a.lane<2>(), 33); + EXPECT_EQ(a.lane<3>(), 44); +} + +/** \brief Test copy vint4 load. */ +TEST(vint4, CopyLoad) +{ + vint4 s(11, 22, 33, 44); + vint4 a(s.m); + EXPECT_EQ(a.lane<0>(), 11); + EXPECT_EQ(a.lane<1>(), 22); + EXPECT_EQ(a.lane<2>(), 33); + EXPECT_EQ(a.lane<3>(), 44); +} + +/** \brief Test vint4 lane_id. */ +TEST(vint4, LaneID) +{ + vint4 a = vint4::lane_id(); + EXPECT_EQ(a.lane<0>(), 0); + EXPECT_EQ(a.lane<1>(), 1); + EXPECT_EQ(a.lane<2>(), 2); + EXPECT_EQ(a.lane<3>(), 3); +} + +/** \brief Test vint4 add. */ +TEST(vint4, vadd) +{ + vint4 a(1, 2, 3, 4); + vint4 b(2, 3, 4, 5); + a = a + b; + EXPECT_EQ(a.lane<0>(), 1 + 2); + EXPECT_EQ(a.lane<1>(), 2 + 3); + EXPECT_EQ(a.lane<2>(), 3 + 4); + EXPECT_EQ(a.lane<3>(), 4 + 5); +} + +/** \brief Test vint4 sub. */ +TEST(vint4, vsub) +{ + vint4 a(1, 2, 4, 4); + vint4 b(2, 3, 3, 5); + a = a - b; + EXPECT_EQ(a.lane<0>(), 1 - 2); + EXPECT_EQ(a.lane<1>(), 2 - 3); + EXPECT_EQ(a.lane<2>(), 4 - 3); + EXPECT_EQ(a.lane<3>(), 4 - 5); +} + +/** \brief Test vint4 bitwise invert. */ +TEST(vint4, bit_invert) +{ + vint4 a(-1, 0, 1, 2); + a = ~a; + EXPECT_EQ(a.lane<0>(), ~-1); + EXPECT_EQ(a.lane<1>(), ~0); + EXPECT_EQ(a.lane<2>(), ~1); + EXPECT_EQ(a.lane<3>(), ~2); +} + +/** \brief Test vint4 bitwise or. */ +TEST(vint4, bit_vor) +{ + vint4 a(1, 2, 3, 4); + vint4 b(2, 3, 4, 5); + a = a | b; + EXPECT_EQ(a.lane<0>(), 3); + EXPECT_EQ(a.lane<1>(), 3); + EXPECT_EQ(a.lane<2>(), 7); + EXPECT_EQ(a.lane<3>(), 5); +} + +/** \brief Test vint4 bitwise and. */ +TEST(vint4, bit_vand) +{ + vint4 a(1, 2, 3, 4); + vint4 b(2, 3, 4, 5); + a = a & b; + EXPECT_EQ(a.lane<0>(), 0); + EXPECT_EQ(a.lane<1>(), 2); + EXPECT_EQ(a.lane<2>(), 0); + EXPECT_EQ(a.lane<3>(), 4); +} + +/** \brief Test vint4 bitwise xor. */ +TEST(vint4, bit_vxor) +{ + vint4 a(1, 2, 3, 4); + vint4 b(2, 3, 4, 5); + a = a ^ b; + EXPECT_EQ(a.lane<0>(), 3); + EXPECT_EQ(a.lane<1>(), 1); + EXPECT_EQ(a.lane<2>(), 7); + EXPECT_EQ(a.lane<3>(), 1); +} + +/** \brief Test vint4 ceq. */ +TEST(vint4, ceq) +{ + vint4 a1(1, 2, 3, 4); + vint4 b1(0, 1, 2, 3); + vmask r1 = a1 == b1; + EXPECT_EQ(0, mask(r1)); + EXPECT_EQ(false, any(r1)); + EXPECT_EQ(false, all(r1)); + + vint4 a2(1, 2, 3, 4); + vint4 b2(1, 0, 0, 0); + vmask r2 = a2 == b2; + EXPECT_EQ(0x1, mask(r2)); + EXPECT_EQ(true, any(r2)); + EXPECT_EQ(false, all(r2)); + + vint4 a3(1, 2, 3, 4); + vint4 b3(1, 0, 3, 0); + vmask r3 = a3 == b3; + EXPECT_EQ(0x5, mask(r3)); + EXPECT_EQ(true, any(r3)); + EXPECT_EQ(false, all(r3)); + + vint4 a4(1, 2, 3, 4); + vmask r4 = a4 == a4; + EXPECT_EQ(0xF, mask(r4)); + EXPECT_EQ(true, any(r4)); + EXPECT_EQ(true, all(r4)); +} + +/** \brief Test vint4 cne. */ +TEST(vint4, cne) +{ + vint4 a1(1, 2, 3, 4); + vint4 b1(0, 1, 2, 3); + vmask r1 = a1 != b1; + EXPECT_EQ(0xF, mask(r1)); + EXPECT_EQ(true, any(r1)); + EXPECT_EQ(true, all(r1)); + + vint4 a2(1, 2, 3, 4); + vint4 b2(1, 0, 0, 0); + vmask r2 = a2 != b2; + EXPECT_EQ(0xE, mask(r2)); + EXPECT_EQ(true, any(r2)); + EXPECT_EQ(false, all(r2)); + + vint4 a3(1, 2, 3, 4); + vint4 b3(1, 0, 3, 0); + vmask r3 = a3 != b3; + EXPECT_EQ(0xA, mask(r3)); + EXPECT_EQ(true, any(r3)); + EXPECT_EQ(false, all(r3)); + + vint4 a4(1, 2, 3, 4); + vmask r4 = a4 != a4; + EXPECT_EQ(0, mask(r4)); + EXPECT_EQ(false, any(r4)); + EXPECT_EQ(false, all(r4)); +} + +/** \brief Test vint4 clt. */ +TEST(vint4, clt) +{ + vint4 a(1, 2, 3, 4); + vint4 b(0, 3, 3, 5); + vmask r = a < b; + EXPECT_EQ(0xA, mask(r)); +} + +/** \brief Test vint4 cgt. */ +TEST(vint4, cle) +{ + vint4 a(1, 2, 3, 4); + vint4 b(0, 3, 3, 5); + vmask r = a > b; + EXPECT_EQ(0x1, mask(r)); +} + +/** \brief Test vint4 min. */ +TEST(vint4, min) +{ + vint4 a(1, 2, 3, 4); + vint4 b(0, 3, 3, 5); + vint4 r = min(a, b); + EXPECT_EQ(r.lane<0>(), 0); + EXPECT_EQ(r.lane<1>(), 2); + EXPECT_EQ(r.lane<2>(), 3); + EXPECT_EQ(r.lane<3>(), 4); +} + +/** \brief Test vint4 max. */ +TEST(vint4, max) +{ + vint4 a(1, 2, 3, 4); + vint4 b(0, 3, 3, 5); + vint4 r = max(a, b); + EXPECT_EQ(r.lane<0>(), 1); + EXPECT_EQ(r.lane<1>(), 3); + EXPECT_EQ(r.lane<2>(), 3); + EXPECT_EQ(r.lane<3>(), 5); +} + +/** \brief Test vint4 hmin. */ +TEST(vint4, hmin) +{ + vint4 a1(1, 2, 1, 2); + vint4 r1 = hmin(a1); + EXPECT_EQ(r1.lane<0>(), 1); + EXPECT_EQ(r1.lane<1>(), 1); + EXPECT_EQ(r1.lane<2>(), 1); + EXPECT_EQ(r1.lane<3>(), 1); + + vint4 a2(1, 2, -1, 5); + vint4 r2 = hmin(a2); + EXPECT_EQ(r2.lane<0>(), -1); + EXPECT_EQ(r2.lane<1>(), -1); + EXPECT_EQ(r2.lane<2>(), -1); + EXPECT_EQ(r2.lane<3>(), -1); +} + +/** \brief Test vint4 storea. */ +TEST(vint4, storea) +{ + alignas(16) int out[4]; + vint4 a(s32x4_data); + storea(a, out); + EXPECT_EQ(out[0], 0); + EXPECT_EQ(out[1], 1); + EXPECT_EQ(out[2], 2); + EXPECT_EQ(out[3], 3); +} + +/** \brief Test vint4 store_nbytes. */ +TEST(vint4, store_nbytes) +{ + alignas(16) int out; + vint4 a(42, 314, 75, 90); + store_nbytes(a, (uint8_t*)&out); + EXPECT_EQ(out, 42); +} + +/** \brief Test vint4 gatheri. */ +TEST(vint4, gatheri) +{ + vint4 indices(0, 4, 3, 2); + vint4 r = gatheri(s32x4_data, indices); + EXPECT_EQ(r.lane<0>(), 0); + EXPECT_EQ(r.lane<1>(), 4); + EXPECT_EQ(r.lane<2>(), 3); + EXPECT_EQ(r.lane<3>(), 2); +} + +/** \brief Test vint4 pack_low_bytes. */ +TEST(vint4, pack_low_bytes) +{ + vint4 a(1, 2, 3, 4); + vint4 r = pack_low_bytes(a); + EXPECT_EQ(r.lane<0>(), (4 << 24) | (3 << 16) | (2 << 8) | (1 << 0)); +} + +/** \brief Test vint4 select. */ +TEST(vint4, select) +{ + vint4 m1(1, 1, 1, 1); + vint4 m2(1, 2, 1, 2); + vmask4 cond = m1 == m2; + + vint4 a(1, 3, 3, 1); + vint4 b(4, 2, 2, 4); + + vint4 r1 = select(a, b, cond); + EXPECT_EQ(r1.lane<0>(), 4); + EXPECT_EQ(r1.lane<1>(), 3); + EXPECT_EQ(r1.lane<2>(), 2); + EXPECT_EQ(r1.lane<3>(), 1); + + vint4 r2 = select(b, a, cond); + EXPECT_EQ(r2.lane<0>(), 1); + EXPECT_EQ(r2.lane<1>(), 2); + EXPECT_EQ(r2.lane<2>(), 3); + EXPECT_EQ(r2.lane<3>(), 4); +} + +/** \brief Test vint4 select MSB. */ +TEST(vint4, select_msb) +{ + vint4 msb(0x80000000u, 0, 0x80000000u, 0); + vmask4 cond(msb.m); + + vint4 a(1, 3, 3, 1); + vint4 b(4, 2, 2, 4); + + vint4 r1 = select(a, b, cond); + EXPECT_EQ(r1.lane<0>(), 4); + EXPECT_EQ(r1.lane<1>(), 3); + EXPECT_EQ(r1.lane<2>(), 2); + EXPECT_EQ(r1.lane<3>(), 1); + + vint4 r2 = select(b, a, cond); + EXPECT_EQ(r2.lane<0>(), 1); + EXPECT_EQ(r2.lane<1>(), 2); + EXPECT_EQ(r2.lane<2>(), 3); + EXPECT_EQ(r2.lane<3>(), 4); +} + +// VMASK4 tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +/** \brief Test vmask4 or. */ +TEST(vmask4, or) +{ + vfloat4 m1a(0, 1, 0, 1); + vfloat4 m1b(1, 1, 1, 1); + vmask4 m1 = m1a == m1b; + + vfloat4 m2a(1, 1, 0, 0); + vfloat4 m2b(1, 1, 1, 1); + vmask4 m2 = m2a == m2b; + + vmask4 r = m1 | m2; + EXPECT_EQ(mask(r), 0xB); +} + +/** \brief Test vmask4 and. */ +TEST(vmask4, and) +{ + vfloat4 m1a(0, 1, 0, 1); + vfloat4 m1b(1, 1, 1, 1); + vmask4 m1 = m1a == m1b; + + vfloat4 m2a(1, 1, 0, 0); + vfloat4 m2b(1, 1, 1, 1); + vmask4 m2 = m2a == m2b; + + vmask4 r = m1 & m2; + EXPECT_EQ(mask(r), 0x2); +} + +/** \brief Test vmask4 xor. */ +TEST(vmask4, xor) +{ + vfloat4 m1a(0, 1, 0, 1); + vfloat4 m1b(1, 1, 1, 1); + vmask4 m1 = m1a == m1b; + + vfloat4 m2a(1, 1, 0, 0); + vfloat4 m2b(1, 1, 1, 1); + vmask4 m2 = m2a == m2b; + + vmask4 r = m1 ^ m2; + EXPECT_EQ(mask(r), 0x9); +} + +/** \brief Test vmask4 not. */ +TEST(vmask4, not) +{ + vfloat4 m1a(0, 1, 0, 1); + vfloat4 m1b(1, 1, 1, 1); + vmask4 m1 = m1a == m1b; + vmask r = ~m1; + EXPECT_EQ(mask(r), 0x5); +} + +#endif + +} diff --git a/Source/astcenc_internal.h b/Source/astcenc_internal.h index 814c9c0..6ee35d1 100644 --- a/Source/astcenc_internal.h +++ b/Source/astcenc_internal.h @@ -63,6 +63,14 @@ #endif #endif +#ifndef ASTCENC_NEON + #if defined(__aarch64__) + #define ASTCENC_NEON 1 + #else + #define ASTCENC_NEON 0 + #endif +#endif + #if ASTCENC_AVX #define ASTCENC_VECALIGN 32 #else diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h index 57b9e3c..1edc5af 100644 --- a/Source/astcenc_vecmathlib.h +++ b/Source/astcenc_vecmathlib.h @@ -32,7 +32,7 @@ * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4 * types. These are provided for use by VLA code, but are also expected to be * used as a fixed-width type and will supported a reference C++ fallback for - * use on platforms without SIMD intrinsics (TODO: not yet implemented). + * use on platforms without SIMD intrinsics. * * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8 * types. These are provide for use by VLA code, and are not expected to be @@ -42,9 +42,10 @@ * With the current implementation ISA support is provided for: * * * 1-wide for scalar reference. - * * 4-wide for SSE2. - * * 4-wide for SSE4.1. - * * 8-wide for AVX2. + * * 4-wide for Armv8-A NEON. + * * 4-wide for x86-64 SSE2. + * * 4-wide for x86-64 SSE4.1. + * * 8-wide for x86-64 AVX2. * */ @@ -53,6 +54,8 @@ #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 #include +#elif ASTCENC_NEON != 0 + #include #endif #if defined(_MSC_VER) @@ -64,7 +67,7 @@ #endif #if ASTCENC_AVX >= 2 - /* If we have AVX2 expose 8-wide VLA, and 4-wide fixed width. */ + /* If we have AVX2 expose 8-wide VLA. */ #include "astcenc_vecmathlib_avx2_8.h" #include "astcenc_vecmathlib_sse_4.h" @@ -89,6 +92,20 @@ constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; + +#elif ASTCENC_NEON > 0 + /* If we have NEON expose 4-wide VLA. */ + #include "astcenc_vecmathlib_neon_4.h" + + #define ASTCENC_SIMD_WIDTH 4 + + using vfloat = vfloat4; + using vint = vint4; + using vmask = vmask4; + + constexpr auto loada = vfloat4::loada; + constexpr auto load1 = vfloat4::load1; + #else /* If we have nothing expose 1-wide VLA, and 4-wide fixed width. */ #include "astcenc_vecmathlib_none_1.h" diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h new file mode 100755 index 0000000..c5b9d60 --- /dev/null +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -0,0 +1,799 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2019-2020 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief 4x32-bit vectors, implemented using Armv8-A NEON. + * + * This module implements 4-wide 32-bit float, int, and mask vectors for + * Armv8-A NEON. + * + * There is a baseline level of functionality provided by all vector widths and + * implementations. This is implemented using identical function signatures, + * modulo data type, so we can use them as substitutable implementations in VLA + * code. + * + * The 4-wide vectors are also used as a fixed-width type, and significantly + * extend the functionality above that available to VLA code. + */ + +#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED +#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED + +#ifndef ASTCENC_SIMD_INLINE + #error "Include astcenc_vecmathlib.h, do not include directly" +#endif + +#include + +// ============================================================================ +// vfloat4 data type +// ============================================================================ + +/** + * @brief Data type for 4-wide floats. + */ +struct vfloat4 +{ + /** + * @brief Construct from zero-initialized value. + */ + ASTCENC_SIMD_INLINE vfloat4() {} + + /** + * @brief Construct from 4 values loaded from an unaligned address. + * + * Consider using loada() which is better with vectors if data is aligned + * to vector length. + */ + ASTCENC_SIMD_INLINE explicit vfloat4(const float *p) + { + m = vld1q_f32(p); + } + + /** + * @brief Construct from 1 scalar value replicated across all lanes. + * + * Consider using zero() for constexpr zeros. + */ + ASTCENC_SIMD_INLINE explicit vfloat4(float a) + { + m = vdupq_n_f32(a); + } + + /** + * @brief Construct from 4 scalar values. + * + * The value of @c a is stored to lane 0 (LSB) in the SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d) + { + float32x4_t v { a, b, c, d }; + m = v; + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a) + { + m = a; + } + + /** + * @brief Get the scalar value of a single lane. + * + * TODO: Can we do better for lane0, which is the common case for VLA? + */ + template ASTCENC_SIMD_INLINE float lane() const + { + return vgetq_lane_f32(m, l); + } + + /** + * @brief Set the scalar value of a single lane. + */ + template ASTCENC_SIMD_INLINE void set_lane(float a) + { + m = vld1q_lane_f32(&a, m, l); + } + + /** + * @brief Factory that returns a vector of zeros. + */ + static ASTCENC_SIMD_INLINE vfloat4 zero() + { + return vfloat4(vdupq_n_f32(0.0f)); + } + + /** + * @brief Factory that returns a replicated scalar loaded from memory. + */ + static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p) + { + return vfloat4(vdupq_n_f32(*p)); + } + + /** + * @brief Factory that returns a vector loaded from 16B aligned memory. + */ + static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p) + { + return vfloat4(vld1q_f32(p)); + } + + /** + * @brief Factory that returns a vector containing the lane IDs. + */ + static ASTCENC_SIMD_INLINE vfloat4 lane_id() + { + alignas(16) float data[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + return vfloat4(vld1q_f32(data)); + } + + /** + * @brief The vector ... + */ + float32x4_t m; +}; + +// ============================================================================ +// vint4 data type +// ============================================================================ + +/** + * @brief Data type for 4-wide ints. + */ +struct vint4 +{ + /** + * @brief Construct from zero-initialized value. + */ + ASTCENC_SIMD_INLINE vint4() {} + + /** + * @brief Construct from 4 values loaded from an unaligned address. + * + * Consider using loada() which is better with vectors if data is aligned + * to vector length. + */ + ASTCENC_SIMD_INLINE explicit vint4(const int *p) + { + m = vld1q_s32(p); + } + + /** + * @brief Construct from 1 scalar value replicated across all lanes. + * + * Consider using vfloat4::zero() for constexpr zeros. + */ + ASTCENC_SIMD_INLINE explicit vint4(int a) + { + m = vdupq_n_s32(a); + } + + /** + * @brief Construct from 4 scalar values. + * + * The value of @c a is stored to lane 0 (LSB) in the SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d) + { + int32x4_t v = { a, b, c, d }; + m = v; + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a) + { + m = a; + } + + /** + * @brief Get the scalar from a single lane. + */ + template ASTCENC_SIMD_INLINE int lane() const + { + return vgetq_lane_s32(m, l); + } + + /** + * @brief Factory that returns a vector containing the lane IDs. + */ + static ASTCENC_SIMD_INLINE vint4 lane_id() + { + alignas(ASTCENC_VECALIGN) int data[4] = { 0, 1, 2, 3 }; + return vint4(vld1q_s32(data)); + } + + /** + * @brief The vector ... + */ + int32x4_t m; +}; + +// ============================================================================ +// vmask4 data type +// ============================================================================ + +/** + * @brief Data type for 4-wide control plane masks. + */ +struct vmask4 +{ + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a) + { + m = a; + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a) + { + m = vreinterpretq_u32_s32(a); + } + + /** + * @brief The vector ... + */ + uint32x4_t m; +}; + +// ============================================================================ +// vmask4 operators and functions +// ============================================================================ + +/** + * @brief Overload: mask union (or). + */ +ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b) +{ + return vmask4(vorrq_u32(a.m, b.m)); +} + +/** + * @brief Overload: mask intersect (and). + */ +ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b) +{ + return vmask4(vandq_u32(a.m, b.m)); +} + +/** + * @brief Overload: mask difference (xor). + */ +ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b) +{ + return vmask4(veorq_u32(a.m, b.m)); +} + +/** + * @brief Overload: mask invert (not). + */ +ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a) +{ + return vmask4(vmvnq_u32(a.m)); +} + +/** + * @brief Return a 4-bit mask code indicating mask status. + * + * bit0 = lane 0 + */ +ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a) +{ + int32x4_t shift = { 0, 1, 2, 3 }; + uint32x4_t tmp = vshrq_n_u32(a.m, 31); + return vaddvq_u32(vshlq_u32(tmp, shift)); +} + +/** + * @brief True if any lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool any(vmask4 a) +{ + return mask(a) != 0; +} + +/** + * @brief True if all lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool all(vmask4 a) +{ + return mask(a) == 0xF; +} + +// ============================================================================ +// vint4 operators and functions +// ============================================================================ + +/** + * @brief Overload: vector by vector addition. + */ +ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b) +{ + return vint4(vaddq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector subtraction. + */ +ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b) +{ + return vint4(vsubq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector bit invert. + */ +ASTCENC_SIMD_INLINE vint4 operator~(vint4 a) +{ + return vint4(vmvnq_s32(a.m)); +} + +/** + * @brief Overload: vector by vector bitwise or. + */ +ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b) +{ + return vint4(vorrq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector bitwise and. + */ +ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b) +{ + return vint4(vandq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector bitwise xor. + */ +ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b) +{ + return vint4(veorq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector equality. + */ +ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b) +{ + return vmask4(vceqq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector inequality. + */ +ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b) +{ + return ~vmask4(vceqq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than. + */ +ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b) +{ + return vmask4(vcltq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than. + */ +ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b) +{ + return vmask4(vcgtq_s32(a.m, b.m)); +} + +/** + * @brief Return the min vector of two vectors. + */ +ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b) +{ + return vint4(vminq_s32(a.m, b.m)); +} + +/** + * @brief Return the max vector of two vectors. + */ +ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b) +{ + return vint4(vmaxq_s32(a.m, b.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE vint4 hmin(vint4 a) +{ + return vint4(vminvq_s32(a.m)); +} + +/** + * @brief Store a vector to a 16B aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vint4 a, int* p) +{ + vst1q_s32(p, a.m); +} + +/** + * @brief Store lowest N (vector width) bytes into an unaligned address. + */ +ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) +{ + vst1q_lane_s32((int32_t*)p, a.m, 0); +} + +/** + * @brief Gather N (vector width) indices from the array. + */ +ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices) +{ + alignas(16) int idx[4]; + storea(indices, idx); + alignas(16) int vals[4]; + vals[0] = base[idx[0]]; + vals[1] = base[idx[1]]; + vals[2] = base[idx[2]]; + vals[3] = base[idx[3]]; + return vint4(vals); +} + +/** + * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. + */ +ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +{ + alignas(16) uint8_t shuf[16] = { + 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + uint8x16_t idx = vld1q_u8(shuf); + int8x16_t av = vreinterpretq_s8_s32(a.m); + return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx))); +} + +/** + * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond) +{ + static const uint32x4_t msb = vdupq_n_u32(0x80000000u); + uint32x4_t mask = vcgeq_u32(cond.m, msb); + return vint4(vbslq_s32(mask, b.m, a.m)); +} + +/** + * @brief Debug function to print a vector of ints. + */ +ASTCENC_SIMD_INLINE void print(vint4 a) +{ + alignas(16) int v[4]; + storea(a, v); + printf("v4_i32:\n %8u %8u %8u %8u\n", + v[0], v[1], v[2], v[3]); +} + +// ============================================================================ +// vfloat4 operators and functions +// ============================================================================ + +/** + * @brief Overload: vector by vector addition. + */ +ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b) +{ + return vfloat4(vaddq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector subtraction. + */ +ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b) +{ + return vfloat4(vsubq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b) +{ + return vfloat4(vmulq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by scalar multiplication. + */ +ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b) +{ + float32x4_t bv = vld1q_dup_f32(&b); + return vfloat4(vmulq_f32(a.m, bv)); +} + +/** + * @brief Overload: scalar by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b) +{ + float32x4_t av = vld1q_dup_f32(&a); + return vfloat4(vmulq_f32(av, b.m)); +} + +/** + * @brief Overload: vector by vector division. + */ +ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b) +{ + return vfloat4(vdivq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector equality. + */ +ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b) +{ + return vmask4(vceqq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector inequality. + */ +ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b) +{ + return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m))); +} + +/** + * @brief Overload: vector by vector less than. + */ +ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b) +{ + return vmask4(vcltq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than. + */ +ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b) +{ + return vmask4(vcgtq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than or equal. + */ +ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b) +{ + return vmask4(vcleq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than or equal. + */ +ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b) +{ + return vmask4(vcgeq_f32(a.m, b.m)); +} + +/** + * @brief Return the min vector of two vectors. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b) +{ + // Do not reorder - second operand will return if either is NaN + return vfloat4(vminnmq_f32(a.m, b.m)); +} + +/** + * @brief Return the max vector of two vectors. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b) +{ + // Do not reorder - second operand will return if either is NaN + return vfloat4(vmaxnmq_f32(a.m, b.m)); +} + +/** + * @brief Return the clamped value between min and max. + * + * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN + * then @c min will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 clamp(float min, float max, vfloat4 a) +{ + float32x4_t minv = vdupq_n_f32(min); + float32x4_t maxv = vdupq_n_f32(max); + return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv)); +} + +/** + * @brief Return a clamped value between 0.0f and max. + * + * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will + * be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 clampz(float max, vfloat4 a) +{ + // Do not reorder - second operand will return if either is NaN + float32x4_t minv = vdupq_n_f32(0.0f); + float32x4_t maxv = vdupq_n_f32(max); + return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv)); +} + +/** + * @brief Return a clamped value between 0.0f and 1.0f. + * + * If @c a is NaN then zero will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a) +{ + float32x4_t minv = vdupq_n_f32(0.0f); + float32x4_t maxv = vdupq_n_f32(1.0f); + return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv)); +} + +/** + * @brief Return the absolute value of the float vector. + */ +ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a) +{ + float32x4_t zero = vdupq_n_f32(0.0f); + float32x4_t inv = vsubq_f32(zero, a.m); + return vfloat4(vmaxq_f32(a.m, inv)); +} + +/** + * @brief Return a float rounded to the nearest integer value. + * + * TODO: Can we do a better fallback here, if we exploit the fact that we + * can assume that values are positive? + */ +ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a) +{ + return vfloat4(vrndnq_f32(a.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a) +{ + return vfloat4(vminvq_f32(a.m)); +} + +/** + * @brief Return the sqrt of the lanes in the vector. + */ +ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a) +{ + return vfloat4(vsqrtq_f32(a.m)); +} + +/** + * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) +{ + static const uint32x4_t msb = vdupq_n_u32(0x80000000u); + uint32x4_t mask = vcgeq_u32(cond.m, msb); + return vfloat4(vbslq_f32(mask, b.m, a.m)); +} + +/** + * @brief Load a vector of gathered results from an array; + */ +ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) +{ + alignas(16) int idx[4]; + storea(indices, idx); + alignas(16) float vals[4]; + vals[0] = base[idx[0]]; + vals[1] = base[idx[1]]; + vals[2] = base[idx[2]]; + vals[3] = base[idx[3]]; + return vfloat4(vals); +} + +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p) +{ + vst1q_f32(p, a.m); +} + +/** + * @brief Store a vector to a 16B aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p) +{ + vst1q_f32(p, a.m); +} + +/** + * @brief Return the dot product for the full 4 lanes, returning vector. + */ +ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b) +{ + return vfloat4(vaddvq_f32(vmulq_f32(a.m, b.m))); +} + +/** + * @brief Return a integer value for a float vector, using truncation. + */ +ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a) +{ + return vint4(vcvtq_s32_f32(a.m)); +} + +/** + * @brief Return a integer value for a float vector, using round-to-nearest. + */ +ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a) +{ + a = round(a); + return vint4(vcvtq_s32_f32(a.m)); +} + +/** + * @brief Return a float value as an integer bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the first half of that flip. + */ +ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a) +{ + return vint4(vreinterpretq_s32_f32(a.m)); +} + +/** + * @brief Return a integer value as a float bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the second half of that flip. + */ +ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) +{ + return vfloat4(vreinterpretq_f32_s32(v.m)); +} + +/** + * @brief Debug function to print a vector of floats. + */ +ASTCENC_SIMD_INLINE void print(vfloat4 a) +{ + alignas(16) float v[4]; + storea(a, v); + printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n", + (double)v[0], (double)v[1], (double)v[2], (double)v[3]); +} + +#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake index b14c02a..8ce4137 100644 --- a/Source/cmake_core.cmake +++ b/Source/cmake_core.cmake @@ -110,17 +110,29 @@ endif() if(${ISA_SIMD} MATCHES "none") target_compile_definitions(astcenc-${ISA_SIMD} PRIVATE + ASTCENC_NEON=0 ASTCENC_SSE=0 ASTCENC_AVX=0 ASTCENC_POPCNT=0) - target_compile_options(astcenc-${ISA_SIMD} + if (${ARCH} MATCHES x64) + target_compile_options(astcenc-${ISA_SIMD} + PRIVATE + $<$:-mfpmath=sse -msse2>) + endif() + +elseif(${ISA_SIMD} MATCHES "neon") + target_compile_definitions(astcenc-${ISA_SIMD} PRIVATE - $<$:-mfpmath=sse -msse2>) + ASTCENC_NEON=1 + ASTCENC_SSE=0 + ASTCENC_AVX=0 + ASTCENC_POPCNT=0) elseif(${ISA_SIMD} MATCHES "sse2") target_compile_definitions(astcenc-${ISA_SIMD} PRIVATE + ASTCENC_NEON=0 ASTCENC_SSE=20 ASTCENC_AVX=0 ASTCENC_POPCNT=0) @@ -132,6 +144,7 @@ elseif(${ISA_SIMD} MATCHES "sse2") elseif(${ISA_SIMD} MATCHES "sse4.1") target_compile_definitions(astcenc-${ISA_SIMD} PRIVATE + ASTCENC_NEON=0 ASTCENC_SSE=41 ASTCENC_AVX=0 ASTCENC_POPCNT=1) @@ -143,6 +156,7 @@ elseif(${ISA_SIMD} MATCHES "sse4.1") elseif(${ISA_SIMD} MATCHES "avx2") target_compile_definitions(astcenc-${ISA_SIMD} PRIVATE + ASTCENC_NEON=0 ASTCENC_SSE=41 ASTCENC_AVX=2 ASTCENC_POPCNT=1) diff --git a/Test/astc_test_image.py b/Test/astc_test_image.py index 4441ef5..5492375 100644 --- a/Test/astc_test_image.py +++ b/Test/astc_test_image.py @@ -285,23 +285,30 @@ def get_encoder_params(encoderName, referenceName, imageSet): outDir = "Test/Images/%s" % imageSet refName = None # Latest master + elif encoderName == "ref-master-neon": + # Warning: this option rebuilds a new reference test result for the + # master branch using the user's locally build encoder. + encoder = te.Encoder2x("neon") + name = "reference-master-neon" + outDir = "Test/Images/%s" % imageSet + refName = None elif encoderName == "ref-master-sse2": # Warning: this option rebuilds a new reference test result for the - # master branch using the user's locally build encoder in ./Source. + # master branch using the user's locally build encoder. encoder = te.Encoder2x("sse2") name = "reference-master-sse2" outDir = "Test/Images/%s" % imageSet refName = None elif encoderName == "ref-master-sse4.1": # Warning: this option rebuilds a new reference test result for the - # master branch using the user's locally build encoder in ./Source. + # master branch using the user's locally build encoder. encoder = te.Encoder2x("sse4.1") name = "reference-master-sse4.1" outDir = "Test/Images/%s" % imageSet refName = None elif encoderName == "ref-master-avx2": # Warning: this option rebuilds a new reference test result for the - # master branch using the user's locally build encoder in ./Source. + # master branch using the user's locally build encoder. encoder = te.Encoder2x("avx2") name = "reference-master-avx2" outDir = "Test/Images/%s" % imageSet @@ -327,8 +334,8 @@ def parse_command_line(): refcoders = ["ref-1.7", "ref-2.0-sse2", "ref-2.0-sse4.1", "ref-2.0-avx2", "ref-2.1-sse2", "ref-2.1-sse4.1", "ref-2.1-avx2", - "ref-master-sse2", "ref-master-sse4.1", "ref-master-avx2"] - testcoders = ["none", "sse2", "sse4.1", "avx2"] + "ref-master-neon", "ref-master-sse2", "ref-master-sse4.1", "ref-master-avx2"] + testcoders = ["none", "neon", "sse2", "sse4.1", "avx2"] coders = refcoders + testcoders + ["all", "all-ref"] parser.add_argument("--encoder", dest="encoders", default="avx2", choices=coders, help="test encoder variant") diff --git a/Test/testlib/image.py b/Test/testlib/image.py index a466114..41320f9 100644 --- a/Test/testlib/image.py +++ b/Test/testlib/image.py @@ -28,6 +28,7 @@ The directory path is structured: from collections.abc import Iterable import os +import re import subprocess as sp from PIL import Image as PILImage @@ -35,6 +36,39 @@ from PIL import Image as PILImage import testlib.misc as misc +CONVERT_BINARY = ["convert"] + + +g_ConvertVersion = None + + +def get_convert_version(): + """ + Get the major/minor version of ImageMagick on the system. + """ + global g_ConvertVersion + + if g_ConvertVersion is None: + command = list(CONVERT_BINARY) + command += ["--version"] + result = sp.run(command, stdout=sp.PIPE, stderr=sp.PIPE, + check=True, universal_newlines=True) + + # Version is top row + version = result.stdout.splitlines()[0] + # ... third token + version = re.split(" ", version)[2] + # ... major/minor/patch/subpatch + version = re.split("\\.|-", version) + + numericVersion = float(version[0]) + numericVersion += float(version[1]) / 10.0 + + g_ConvertVersion = numericVersion + + return g_ConvertVersion + + class ImageException(Exception): """ Exception thrown for bad image specification. @@ -228,6 +262,14 @@ class Image(): Args: filePath (str): The path to the image on disk. """ + convert = get_convert_version() + + # ImageMagick 7 started to use .tga file origin information. By default + # TGA files store data from bottom up, and define the origin as bottom + # left. We want our color samples to always use a top left origin, even + # if the data is stored in alternative layout. + self.invertYCoords = (convert >= 7.0) and filePath.endswith(".tga") + self.filePath = filePath self.proxyPath = None @@ -256,8 +298,14 @@ class Image(): coords = [coords] for (x, y) in coords: - command = [ - "convert", self.filePath, + command = list(CONVERT_BINARY) + command += [self.filePath] + + # Invert coordinates if the format needs it + if self.invertYCoords: + command += ["-flip"] + + command += [ "-format", "%%[pixel:p{%u,%u}]" % (x, y), "info:" ]