mirror of
https://github.com/openharmony/third_party_astc-encoder.git
synced 2026-06-30 23:07:54 -04:00
Add Arm aarch64 builds and NEON acceleration (#191)
This PR adds support for Arm aarch64 builds, including the corresponding NEON accelerated vector library. As part of this work I also improved testing: - Native C++ unit tests support using `googletest` integrated into CMake/CTest. - First unit test suite added, for 4-wide SIMD implementations. - Command line functional tests can target any build, not just AVX2.
This commit is contained in:
+2
-1
@@ -1,6 +1,7 @@
|
||||
# Editor and engineering scratch files
|
||||
.vs
|
||||
.vscode
|
||||
.DS_Store
|
||||
*.log
|
||||
*.diff
|
||||
*.user
|
||||
@@ -12,7 +13,7 @@ Proto
|
||||
Binaries
|
||||
|
||||
# Build artifacts
|
||||
astcenc*
|
||||
astcenc
|
||||
build*
|
||||
|
||||
# General build artifacts
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
[submodule "Source/GoogleTest"]
|
||||
path = Source/GoogleTest
|
||||
url = https://github.com/google/googletest.git
|
||||
+50
-12
@@ -15,7 +15,6 @@
|
||||
# under the License.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
# CMake configuration
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
cmake_policy(SET CMP0069 NEW) # LTO support
|
||||
@@ -29,38 +28,77 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
|
||||
set(PACKAGE_ROOT astcenc)
|
||||
include(CTest)
|
||||
|
||||
# Command line configuration
|
||||
option(ISA_AVX2 "Enable builds for AVX2 SIMD")
|
||||
function(printopt optName optVal optArch tgtArch)
|
||||
if(${optVal})
|
||||
if(${optArch} MATCHES ${tgtArch})
|
||||
message(" -- ${optName} backend - ON")
|
||||
else()
|
||||
message(" -- ${optName} backend - SKIPPED (${optArch} only)")
|
||||
endif()
|
||||
else()
|
||||
message(" -- ${optName} backend - OFF")
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
set(VALID_ARCH aarch64 x64)
|
||||
set(ARCH x64 CACHE STRING "Target architecture")
|
||||
set_property(CACHE ARCH PROPERTY STRINGS ${VALID_ARCH})
|
||||
|
||||
message("-- Selecting target astcenc backends:")
|
||||
list(FIND VALID_ARCH ${ARCH} index)
|
||||
if(index EQUAL -1)
|
||||
message(FATAL_ERROR "ARCH must be one of ${VALID_ARCH}")
|
||||
endif()
|
||||
|
||||
message("-- Selecting backend build type(s)")
|
||||
set(ANY_ISA 0)
|
||||
if(${ISA_AVX2})
|
||||
|
||||
option(ISA_AVX2 "Enable builds for AVX2 SIMD")
|
||||
printopt("AVX2" ${ISA_AVX2} "x64" ${ARCH})
|
||||
if(${ISA_AVX2} AND ${ARCH} MATCHES "x64")
|
||||
set(ANY_ISA 1)
|
||||
message(" -- AVX2 backend: ON")
|
||||
endif()
|
||||
|
||||
option(ISA_SSE41 "Enable builds for SSE4.1 SIMD")
|
||||
if(${ISA_SSE41})
|
||||
printopt("SSE4.1" ${ISA_SSE41} "x64" ${ARCH})
|
||||
if(${ISA_SSE41} AND ${ARCH} MATCHES "x64")
|
||||
set(ANY_ISA 1)
|
||||
message(" -- SSE4.1 backend: ON")
|
||||
endif()
|
||||
|
||||
option(ISA_SSE2 "Enable builds for SSE2 SIMD")
|
||||
if(${ISA_SSE2})
|
||||
printopt("SSE2" ${ISA_SSE2} "x64" ${ARCH})
|
||||
if(${ISA_SSE2} AND ${ARCH} MATCHES "x64")
|
||||
set(ANY_ISA 1)
|
||||
message(" -- SSE2 backend: ON")
|
||||
endif()
|
||||
|
||||
option(ISA_NONE "Enable builds for noSIMD")
|
||||
option(ISA_NEON "Enable builds for NEON SIMD")
|
||||
printopt("NEON" ${ISA_NEON} "aarch64" ${ARCH})
|
||||
if(${ISA_NEON} AND ${ARCH} MATCHES "aarch64")
|
||||
set(ANY_ISA 1)
|
||||
endif()
|
||||
|
||||
option(ISA_NONE "Enable builds for no SIMD")
|
||||
if(${ISA_NONE})
|
||||
set(ANY_ISA 1)
|
||||
message(" -- No SIMD backend: ON")
|
||||
message(" -- No SIMD backend - ON")
|
||||
else()
|
||||
message(" -- No SIMD backend - OFF")
|
||||
endif()
|
||||
|
||||
option(ISA_INVARIANCE "Enable builds for ISA invariance")
|
||||
if(${ISA_INVARIANCE})
|
||||
message(" -- ISA invariance: ON")
|
||||
message(" -- ISA invariant backend - ON")
|
||||
else()
|
||||
message(" -- ISA invariant backend - OFF")
|
||||
endif()
|
||||
|
||||
option(UNITTEST "Enable builds for unit tests")
|
||||
if(${UNITTEST})
|
||||
message(" -- Unit tests - ON")
|
||||
else()
|
||||
message(" -- Unit tests - OFF")
|
||||
endif()
|
||||
|
||||
if(NOT ${ANY_ISA})
|
||||
|
||||
+45
-9
@@ -22,8 +22,14 @@ to generate the build system.
|
||||
mkdir build
|
||||
cd build
|
||||
|
||||
# Create the build system
|
||||
cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
|
||||
# Configure your build of choice, for example:
|
||||
|
||||
# Arm arch64
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
|
||||
-DARCH=aarch64 -DISA_NEON=ON ..
|
||||
|
||||
# x86-64
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
|
||||
-DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
|
||||
```
|
||||
|
||||
@@ -62,7 +68,13 @@ export CXX=clang++
|
||||
mkdir build
|
||||
cd build
|
||||
|
||||
# Create the build system
|
||||
# Configure your build of choice, for example:
|
||||
|
||||
# Arm arch64
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
|
||||
-DARCH=aarch64 -DISA_NEON=ON ..
|
||||
|
||||
# x86-64
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./ \
|
||||
-DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
|
||||
```
|
||||
@@ -88,13 +100,14 @@ For codec developers there are a number of useful features in the build system.
|
||||
|
||||
### No intrinsics build
|
||||
|
||||
All normal builds will use SIMD accelerated code paths using instrinsics, as
|
||||
x86-64 guarantees availability of at least SSE2. For development purposes it
|
||||
is possible to build an intrinsic-free build which uses no explicit SIMD
|
||||
acceleration (the compiler may still auto-vectorize).
|
||||
All normal builds will use SIMD accelerated code paths using intrinsics, as all
|
||||
target architectures (x86-64 and aarch64) guarantee SIMD availability. For
|
||||
development purposes it is possible to build an intrinsic-free build which uses
|
||||
no explicit SIMD acceleration (the compiler may still auto-vectorize).
|
||||
|
||||
To enable this binary variant add `-DISA_NONE=ON` to the CMake command line
|
||||
when configuring. It is NOT
|
||||
when configuring. It is NOT recommended to use this for production; it is
|
||||
significantly slower than the vectorized SIMD builds.
|
||||
|
||||
### ISA Invariance
|
||||
|
||||
@@ -126,12 +139,35 @@ We support and test the following `CMAKE_BUILD_TYPE` options.
|
||||
Note that optimized release builds are compiled with link-time optimization,
|
||||
which can make profiling more challenging ...
|
||||
|
||||
### Testing
|
||||
|
||||
We support building unit tests.
|
||||
|
||||
These builds use the `googletest` framework, which is pulled in though a git
|
||||
submodule. On first use, you must fetch the submodule dependency:
|
||||
|
||||
```shell
|
||||
git submodule init
|
||||
git submodule update
|
||||
```
|
||||
|
||||
To build unit tests add `-DUNITTEST=ON` to the CMake command line when
|
||||
configuring.
|
||||
|
||||
To run unit tests use the CMake `ctest` utility from your build directory after
|
||||
you have built the tests.
|
||||
|
||||
```shell
|
||||
cd build
|
||||
ctest --verbose
|
||||
```
|
||||
|
||||
### Packaging
|
||||
|
||||
We support building a release bundle of all enabled binary configurations in
|
||||
the current CMake configuration using the `package` build target
|
||||
|
||||
```bash
|
||||
```shell
|
||||
# Run a build and package build outputs in `./astcenc-<ver>-<os>-<arch>.<fmt>`
|
||||
cd build
|
||||
make package -j16
|
||||
|
||||
+5
-2
@@ -18,6 +18,9 @@ stable across versions, and this release is not compatible with 2.1. Please
|
||||
recompile your client-side code using the updated `astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** New Arm aarch64 NEON accelerated vector library. Note that at
|
||||
this time Arm builds must be built from source; pre-built binaries are not
|
||||
provided in this release.
|
||||
* **Improvement:** SSE4.2 feature profile changed to SSE4.1, which more
|
||||
accurately reflects the feature set used.
|
||||
* **Improvement:** Build system changed to use CMake for all platforms.
|
||||
@@ -35,8 +38,8 @@ recompile your client-side code using the updated `astcenc.h` header.
|
||||
scales RGB values by the alpha value. This can be useful to minimize
|
||||
cross-channel color bleed caused by GPU post-multiply filtering/blending.
|
||||
* **Improvements:** Command line tool cleanly traps and reports errors for
|
||||
corrupt input images rather than relying on hard standard library
|
||||
`assert()` calls.
|
||||
corrupt input images rather than relying on standard library `assert()`
|
||||
calls in release builds.
|
||||
* **Core API:**
|
||||
* **API Change:** Images using region-based metrics no longer need to include
|
||||
padding; all input images should be tightly packed and `dim_pad` is removed
|
||||
|
||||
+20
-3
@@ -14,10 +14,27 @@ can be achieved by configuring the CMake build using the install prefix
|
||||
`-DCMAKE_INSTALL_PREFIX=../` and then running a build with the `install` build
|
||||
target.
|
||||
|
||||
# Running unit tests
|
||||
# Running C++ unit tests
|
||||
|
||||
To run the command line unit tests, which aim to get coverage of the command
|
||||
line options and core codec stability without testing the compression quality
|
||||
We support a small (but growing) number of C++ unit tests, which are written
|
||||
using the `googletest` framework and integrated in the CMake "CTest" test
|
||||
framework.
|
||||
|
||||
To build unit tests pull the `googletest` git submodule and add `-DUNITTEST=ON`
|
||||
to the CMake command line when configuring.
|
||||
|
||||
To run unit tests use the CMake `ctest` utility from your build directory after
|
||||
you have built the tests.
|
||||
|
||||
```shell
|
||||
cd build
|
||||
ctest --verbose
|
||||
```
|
||||
|
||||
# Running command line tests
|
||||
|
||||
To run the command line tests, which aim to get coverage of the command line
|
||||
options and core codec stability without testing the compression quality
|
||||
itself, run the command line:
|
||||
|
||||
python3 -m unittest discover -s Test -p astc_test*.py -v
|
||||
|
||||
+26
-3
@@ -21,6 +21,25 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
|
||||
set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto")
|
||||
endif()
|
||||
|
||||
# - - - - - - - - - - - - - - - - - -
|
||||
# No architecture-specific SIMD
|
||||
|
||||
if (${ISA_NONE})
|
||||
set(ISA_SIMD none)
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
|
||||
# - - - - - - - - - - - - - - - - - -
|
||||
# Armv8-A architecture-specific SIMD
|
||||
|
||||
if (${ISA_NEON})
|
||||
set(ISA_SIMD neon)
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
|
||||
# - - - - - - - - - - - - - - - - - -
|
||||
# x86-64 architecture-specific SIMD
|
||||
|
||||
if (${ISA_AVX2})
|
||||
set(ISA_SIMD avx2)
|
||||
include(cmake_core.cmake)
|
||||
@@ -36,7 +55,11 @@ if (${ISA_SSE2})
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
|
||||
if (${ISA_NONE})
|
||||
set(ISA_SIMD none)
|
||||
include(cmake_core.cmake)
|
||||
# - - - - - - - - - - - - - - - - - -
|
||||
# Unit testing
|
||||
if (${UNITTEST})
|
||||
set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
|
||||
add_subdirectory(GoogleTest)
|
||||
enable_testing()
|
||||
add_subdirectory(UnitTest)
|
||||
endif()
|
||||
|
||||
Submodule
+1
Submodule Source/GoogleTest added at 703bd9caab
@@ -0,0 +1,50 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ----------------------------------------------------------------------------
|
||||
# Copyright 2020 Arm Limited
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
# use this file except in compliance with the License. You may obtain a copy
|
||||
# of the License at:
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# - - - - - - - - - - - - - - - - - -
|
||||
# No architecture-specific SIMD
|
||||
|
||||
if (${ISA_NONE})
|
||||
set(ISA_SIMD none)
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
|
||||
# - - - - - - - - - - - - - - - - - -
|
||||
# Armv8-A architecture-specific SIMD
|
||||
|
||||
if (${ISA_NEON})
|
||||
set(ISA_SIMD neon)
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
|
||||
# - - - - - - - - - - - - - - - - - -
|
||||
# x86-64 architecture-specific SIMD
|
||||
|
||||
if (${ISA_AVX2})
|
||||
set(ISA_SIMD avx2)
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
|
||||
if (${ISA_SSE41})
|
||||
set(ISA_SIMD sse4.1)
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
|
||||
if (${ISA_SSE2})
|
||||
set(ISA_SIMD sse2)
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
@@ -0,0 +1,112 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ----------------------------------------------------------------------------
|
||||
# Copyright 2020 Arm Limited
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
# use this file except in compliance with the License. You may obtain a copy
|
||||
# of the License at:
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
add_executable(test-simd-${ISA_SIMD})
|
||||
|
||||
target_sources(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
test_simd.cpp)
|
||||
|
||||
target_include_directories(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
${gtest_SOURCE_DIR}/include)
|
||||
|
||||
target_compile_options(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
# Use pthreads on Linux/macOS
|
||||
$<$<PLATFORM_ID:Linux,Darwin>:-pthread>
|
||||
|
||||
# MSVC compiler defines
|
||||
$<$<CXX_COMPILER_ID:MSVC>:/EHsc>
|
||||
|
||||
# G++ and Clang++ compiler defines
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wall>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wextra>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wpedantic>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Werror>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wshadow>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wdouble-promotion>)
|
||||
|
||||
# Set up configuration for SIMD ISA builds
|
||||
if(${ISA_SIMD} MATCHES "none")
|
||||
target_compile_definitions(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0)
|
||||
|
||||
if (${ARCH} MATCHES x64)
|
||||
target_compile_options(astcenc-${ISA_SIMD}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
|
||||
endif()
|
||||
|
||||
elseif(${ISA_SIMD} MATCHES "neon")
|
||||
target_compile_definitions(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=1
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0)
|
||||
|
||||
elseif(${ISA_SIMD} MATCHES "sse2")
|
||||
target_compile_definitions(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=20
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0)
|
||||
|
||||
target_compile_options(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
|
||||
|
||||
elseif(${ISA_SIMD} MATCHES "sse4.1")
|
||||
target_compile_definitions(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=1)
|
||||
|
||||
target_compile_options(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -msse4.1 -mpopcnt>)
|
||||
|
||||
elseif(${ISA_SIMD} MATCHES "avx2")
|
||||
target_compile_definitions(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=2
|
||||
ASTCENC_POPCNT=1)
|
||||
|
||||
target_compile_options(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -mavx2 -mpopcnt>
|
||||
$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>)
|
||||
endif()
|
||||
|
||||
target_link_libraries(test-simd-${ISA_SIMD}
|
||||
PRIVATE
|
||||
gtest_main)
|
||||
|
||||
add_test(NAME test-simd-${ISA_SIMD}
|
||||
COMMAND test-simd-${ISA_SIMD})
|
||||
|
||||
install(TARGETS test-simd-${ISA_SIMD} DESTINATION ${PACKAGE_ROOT})
|
||||
File diff suppressed because it is too large
Load Diff
@@ -63,6 +63,14 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef ASTCENC_NEON
|
||||
#if defined(__aarch64__)
|
||||
#define ASTCENC_NEON 1
|
||||
#else
|
||||
#define ASTCENC_NEON 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if ASTCENC_AVX
|
||||
#define ASTCENC_VECALIGN 32
|
||||
#else
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
* Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
|
||||
* types. These are provided for use by VLA code, but are also expected to be
|
||||
* used as a fixed-width type and will supported a reference C++ fallback for
|
||||
* use on platforms without SIMD intrinsics (TODO: not yet implemented).
|
||||
* use on platforms without SIMD intrinsics.
|
||||
*
|
||||
* Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
|
||||
* types. These are provide for use by VLA code, and are not expected to be
|
||||
@@ -42,9 +42,10 @@
|
||||
* With the current implementation ISA support is provided for:
|
||||
*
|
||||
* * 1-wide for scalar reference.
|
||||
* * 4-wide for SSE2.
|
||||
* * 4-wide for SSE4.1.
|
||||
* * 8-wide for AVX2.
|
||||
* * 4-wide for Armv8-A NEON.
|
||||
* * 4-wide for x86-64 SSE2.
|
||||
* * 4-wide for x86-64 SSE4.1.
|
||||
* * 8-wide for x86-64 AVX2.
|
||||
*
|
||||
*/
|
||||
|
||||
@@ -53,6 +54,8 @@
|
||||
|
||||
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
|
||||
#include <immintrin.h>
|
||||
#elif ASTCENC_NEON != 0
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
@@ -64,7 +67,7 @@
|
||||
#endif
|
||||
|
||||
#if ASTCENC_AVX >= 2
|
||||
/* If we have AVX2 expose 8-wide VLA, and 4-wide fixed width. */
|
||||
/* If we have AVX2 expose 8-wide VLA. */
|
||||
#include "astcenc_vecmathlib_avx2_8.h"
|
||||
#include "astcenc_vecmathlib_sse_4.h"
|
||||
|
||||
@@ -89,6 +92,20 @@
|
||||
|
||||
constexpr auto loada = vfloat4::loada;
|
||||
constexpr auto load1 = vfloat4::load1;
|
||||
|
||||
#elif ASTCENC_NEON > 0
|
||||
/* If we have NEON expose 4-wide VLA. */
|
||||
#include "astcenc_vecmathlib_neon_4.h"
|
||||
|
||||
#define ASTCENC_SIMD_WIDTH 4
|
||||
|
||||
using vfloat = vfloat4;
|
||||
using vint = vint4;
|
||||
using vmask = vmask4;
|
||||
|
||||
constexpr auto loada = vfloat4::loada;
|
||||
constexpr auto load1 = vfloat4::load1;
|
||||
|
||||
#else
|
||||
/* If we have nothing expose 1-wide VLA, and 4-wide fixed width. */
|
||||
#include "astcenc_vecmathlib_none_1.h"
|
||||
|
||||
Executable
+799
@@ -0,0 +1,799 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2019-2020 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief 4x32-bit vectors, implemented using Armv8-A NEON.
|
||||
*
|
||||
* This module implements 4-wide 32-bit float, int, and mask vectors for
|
||||
* Armv8-A NEON.
|
||||
*
|
||||
* There is a baseline level of functionality provided by all vector widths and
|
||||
* implementations. This is implemented using identical function signatures,
|
||||
* modulo data type, so we can use them as substitutable implementations in VLA
|
||||
* code.
|
||||
*
|
||||
* The 4-wide vectors are also used as a fixed-width type, and significantly
|
||||
* extend the functionality above that available to VLA code.
|
||||
*/
|
||||
|
||||
#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
|
||||
#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED
|
||||
|
||||
#ifndef ASTCENC_SIMD_INLINE
|
||||
#error "Include astcenc_vecmathlib.h, do not include directly"
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
// ============================================================================
|
||||
// vfloat4 data type
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* @brief Data type for 4-wide floats.
|
||||
*/
|
||||
struct vfloat4
|
||||
{
|
||||
/**
|
||||
* @brief Construct from zero-initialized value.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4() {}
|
||||
|
||||
/**
|
||||
* @brief Construct from 4 values loaded from an unaligned address.
|
||||
*
|
||||
* Consider using loada() which is better with vectors if data is aligned
|
||||
* to vector length.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
|
||||
{
|
||||
m = vld1q_f32(p);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Construct from 1 scalar value replicated across all lanes.
|
||||
*
|
||||
* Consider using zero() for constexpr zeros.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vfloat4(float a)
|
||||
{
|
||||
m = vdupq_n_f32(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Construct from 4 scalar values.
|
||||
*
|
||||
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
|
||||
{
|
||||
float32x4_t v { a, b, c, d };
|
||||
m = v;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Construct from an existing SIMD register.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
|
||||
{
|
||||
m = a;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the scalar value of a single lane.
|
||||
*
|
||||
* TODO: Can we do better for lane0, which is the common case for VLA?
|
||||
*/
|
||||
template <int l> ASTCENC_SIMD_INLINE float lane() const
|
||||
{
|
||||
return vgetq_lane_f32(m, l);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Set the scalar value of a single lane.
|
||||
*/
|
||||
template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
|
||||
{
|
||||
m = vld1q_lane_f32(&a, m, l);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector of zeros.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 zero()
|
||||
{
|
||||
return vfloat4(vdupq_n_f32(0.0f));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a replicated scalar loaded from memory.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
|
||||
{
|
||||
return vfloat4(vdupq_n_f32(*p));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from 16B aligned memory.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
|
||||
{
|
||||
return vfloat4(vld1q_f32(p));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector containing the lane IDs.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
|
||||
{
|
||||
alignas(16) float data[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
return vfloat4(vld1q_f32(data));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief The vector ...
|
||||
*/
|
||||
float32x4_t m;
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// vint4 data type
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* @brief Data type for 4-wide ints.
|
||||
*/
|
||||
struct vint4
|
||||
{
|
||||
/**
|
||||
* @brief Construct from zero-initialized value.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4() {}
|
||||
|
||||
/**
|
||||
* @brief Construct from 4 values loaded from an unaligned address.
|
||||
*
|
||||
* Consider using loada() which is better with vectors if data is aligned
|
||||
* to vector length.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vint4(const int *p)
|
||||
{
|
||||
m = vld1q_s32(p);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Construct from 1 scalar value replicated across all lanes.
|
||||
*
|
||||
* Consider using vfloat4::zero() for constexpr zeros.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vint4(int a)
|
||||
{
|
||||
m = vdupq_n_s32(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Construct from 4 scalar values.
|
||||
*
|
||||
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
|
||||
{
|
||||
int32x4_t v = { a, b, c, d };
|
||||
m = v;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Construct from an existing SIMD register.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
|
||||
{
|
||||
m = a;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the scalar from a single lane.
|
||||
*/
|
||||
template <int l> ASTCENC_SIMD_INLINE int lane() const
|
||||
{
|
||||
return vgetq_lane_s32(m, l);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector containing the lane IDs.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 lane_id()
|
||||
{
|
||||
alignas(ASTCENC_VECALIGN) int data[4] = { 0, 1, 2, 3 };
|
||||
return vint4(vld1q_s32(data));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief The vector ...
|
||||
*/
|
||||
int32x4_t m;
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// vmask4 data type
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* @brief Data type for 4-wide control plane masks.
|
||||
*/
|
||||
struct vmask4
|
||||
{
|
||||
/**
|
||||
* @brief Construct from an existing SIMD register.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
|
||||
{
|
||||
m = a;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Construct from an existing SIMD register.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
|
||||
{
|
||||
m = vreinterpretq_u32_s32(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief The vector ...
|
||||
*/
|
||||
uint32x4_t m;
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// vmask4 operators and functions
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* @brief Overload: mask union (or).
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
|
||||
{
|
||||
return vmask4(vorrq_u32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: mask intersect (and).
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
|
||||
{
|
||||
return vmask4(vandq_u32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: mask difference (xor).
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
|
||||
{
|
||||
return vmask4(veorq_u32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: mask invert (not).
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
|
||||
{
|
||||
return vmask4(vmvnq_u32(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a 4-bit mask code indicating mask status.
|
||||
*
|
||||
* bit0 = lane 0
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
|
||||
{
|
||||
int32x4_t shift = { 0, 1, 2, 3 };
|
||||
uint32x4_t tmp = vshrq_n_u32(a.m, 31);
|
||||
return vaddvq_u32(vshlq_u32(tmp, shift));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief True if any lanes are enabled, false otherwise.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE bool any(vmask4 a)
|
||||
{
|
||||
return mask(a) != 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief True if all lanes are enabled, false otherwise.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE bool all(vmask4 a)
|
||||
{
|
||||
return mask(a) == 0xF;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// vint4 operators and functions
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector addition.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
|
||||
{
|
||||
return vint4(vaddq_s32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector subtraction.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
|
||||
{
|
||||
return vint4(vsubq_s32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector bit invert.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
|
||||
{
|
||||
return vint4(vmvnq_s32(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector bitwise or.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
|
||||
{
|
||||
return vint4(vorrq_s32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector bitwise and.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
|
||||
{
|
||||
return vint4(vandq_s32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector bitwise xor.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
|
||||
{
|
||||
return vint4(veorq_s32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector equality.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
|
||||
{
|
||||
return vmask4(vceqq_s32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector inequality.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
|
||||
{
|
||||
return ~vmask4(vceqq_s32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector less than.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
|
||||
{
|
||||
return vmask4(vcltq_s32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector greater than.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
|
||||
{
|
||||
return vmask4(vcgtq_s32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the min vector of two vectors.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
|
||||
{
|
||||
return vint4(vminq_s32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the max vector of two vectors.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
|
||||
{
|
||||
return vint4(vmaxq_s32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the horizontal minimum of a vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
|
||||
{
|
||||
return vint4(vminvq_s32(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector to a 16B aligned memory address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
|
||||
{
|
||||
vst1q_s32(p, a.m);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store lowest N (vector width) bytes into an unaligned address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
|
||||
{
|
||||
vst1q_lane_s32((int32_t*)p, a.m, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gather N (vector width) indices from the array.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
|
||||
{
|
||||
alignas(16) int idx[4];
|
||||
storea(indices, idx);
|
||||
alignas(16) int vals[4];
|
||||
vals[0] = base[idx[0]];
|
||||
vals[1] = base[idx[1]];
|
||||
vals[2] = base[idx[2]];
|
||||
vals[3] = base[idx[3]];
|
||||
return vint4(vals);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
|
||||
{
|
||||
alignas(16) uint8_t shuf[16] = {
|
||||
0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
uint8x16_t idx = vld1q_u8(shuf);
|
||||
int8x16_t av = vreinterpretq_s8_s32(a.m);
|
||||
return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
|
||||
{
|
||||
static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
|
||||
uint32x4_t mask = vcgeq_u32(cond.m, msb);
|
||||
return vint4(vbslq_s32(mask, b.m, a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of ints.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vint4 a)
|
||||
{
|
||||
alignas(16) int v[4];
|
||||
storea(a, v);
|
||||
printf("v4_i32:\n %8u %8u %8u %8u\n",
|
||||
v[0], v[1], v[2], v[3]);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// vfloat4 operators and functions
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector addition.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
return vfloat4(vaddq_f32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector subtraction.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
return vfloat4(vsubq_f32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector multiplication.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
return vfloat4(vmulq_f32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by scalar multiplication.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
|
||||
{
|
||||
float32x4_t bv = vld1q_dup_f32(&b);
|
||||
return vfloat4(vmulq_f32(a.m, bv));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: scalar by vector multiplication.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
|
||||
{
|
||||
float32x4_t av = vld1q_dup_f32(&a);
|
||||
return vfloat4(vmulq_f32(av, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector division.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
return vfloat4(vdivq_f32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector equality.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
return vmask4(vceqq_f32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector inequality.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector less than.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
return vmask4(vcltq_f32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector greater than.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
return vmask4(vcgtq_f32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector less than or equal.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
return vmask4(vcleq_f32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector greater than or equal.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
return vmask4(vcgeq_f32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the min vector of two vectors.
|
||||
*
|
||||
* If either lane value is NaN, @c b will be returned for that lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
// Do not reorder - second operand will return if either is NaN
|
||||
return vfloat4(vminnmq_f32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the max vector of two vectors.
|
||||
*
|
||||
* If either lane value is NaN, @c b will be returned for that lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
// Do not reorder - second operand will return if either is NaN
|
||||
return vfloat4(vmaxnmq_f32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the clamped value between min and max.
|
||||
*
|
||||
* It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
|
||||
* then @c min will be returned for that lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 clamp(float min, float max, vfloat4 a)
|
||||
{
|
||||
float32x4_t minv = vdupq_n_f32(min);
|
||||
float32x4_t maxv = vdupq_n_f32(max);
|
||||
return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a clamped value between 0.0f and max.
|
||||
*
|
||||
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
|
||||
* be returned for that lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 clampz(float max, vfloat4 a)
|
||||
{
|
||||
// Do not reorder - second operand will return if either is NaN
|
||||
float32x4_t minv = vdupq_n_f32(0.0f);
|
||||
float32x4_t maxv = vdupq_n_f32(max);
|
||||
return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a clamped value between 0.0f and 1.0f.
|
||||
*
|
||||
* If @c a is NaN then zero will be returned for that lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
|
||||
{
|
||||
float32x4_t minv = vdupq_n_f32(0.0f);
|
||||
float32x4_t maxv = vdupq_n_f32(1.0f);
|
||||
return vfloat4(vminnmq_f32(vmaxnmq_f32(a.m, minv), maxv));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the absolute value of the float vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
|
||||
{
|
||||
float32x4_t zero = vdupq_n_f32(0.0f);
|
||||
float32x4_t inv = vsubq_f32(zero, a.m);
|
||||
return vfloat4(vmaxq_f32(a.m, inv));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a float rounded to the nearest integer value.
|
||||
*
|
||||
* TODO: Can we do a better fallback here, if we exploit the fact that we
|
||||
* can assume that values are positive?
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
|
||||
{
|
||||
return vfloat4(vrndnq_f32(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the horizontal minimum of a vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
|
||||
{
|
||||
return vfloat4(vminvq_f32(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the sqrt of the lanes in the vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
|
||||
{
|
||||
return vfloat4(vsqrtq_f32(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
|
||||
{
|
||||
static const uint32x4_t msb = vdupq_n_u32(0x80000000u);
|
||||
uint32x4_t mask = vcgeq_u32(cond.m, msb);
|
||||
return vfloat4(vbslq_f32(mask, b.m, a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Load a vector of gathered results from an array;
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
|
||||
{
|
||||
alignas(16) int idx[4];
|
||||
storea(indices, idx);
|
||||
alignas(16) float vals[4];
|
||||
vals[0] = base[idx[0]];
|
||||
vals[1] = base[idx[1]];
|
||||
vals[2] = base[idx[2]];
|
||||
vals[3] = base[idx[3]];
|
||||
return vfloat4(vals);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector to an unaligned memory address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
|
||||
{
|
||||
vst1q_f32(p, a.m);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector to a 16B aligned memory address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
|
||||
{
|
||||
vst1q_f32(p, a.m);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the dot product for the full 4 lanes, returning vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
return vfloat4(vaddvq_f32(vmulq_f32(a.m, b.m)));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a integer value for a float vector, using truncation.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
|
||||
{
|
||||
return vint4(vcvtq_s32_f32(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a integer value for a float vector, using round-to-nearest.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
|
||||
{
|
||||
a = round(a);
|
||||
return vint4(vcvtq_s32_f32(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a float value as an integer bit pattern (i.e. no conversion).
|
||||
*
|
||||
* It is a common trick to convert floats into integer bit patterns, perform
|
||||
* some bit hackery based on knowledge they are IEEE 754 layout, and then
|
||||
* convert them back again. This is the first half of that flip.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
|
||||
{
|
||||
return vint4(vreinterpretq_s32_f32(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a integer value as a float bit pattern (i.e. no conversion).
|
||||
*
|
||||
* It is a common trick to convert floats into integer bit patterns, perform
|
||||
* some bit hackery based on knowledge they are IEEE 754 layout, and then
|
||||
* convert them back again. This is the second half of that flip.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
|
||||
{
|
||||
return vfloat4(vreinterpretq_f32_s32(v.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of floats.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vfloat4 a)
|
||||
{
|
||||
alignas(16) float v[4];
|
||||
storea(a, v);
|
||||
printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
|
||||
(double)v[0], (double)v[1], (double)v[2], (double)v[3]);
|
||||
}
|
||||
|
||||
#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
|
||||
+16
-2
@@ -110,17 +110,29 @@ endif()
|
||||
if(${ISA_SIMD} MATCHES "none")
|
||||
target_compile_definitions(astcenc-${ISA_SIMD}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0)
|
||||
|
||||
target_compile_options(astcenc-${ISA_SIMD}
|
||||
if (${ARCH} MATCHES x64)
|
||||
target_compile_options(astcenc-${ISA_SIMD}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
|
||||
endif()
|
||||
|
||||
elseif(${ISA_SIMD} MATCHES "neon")
|
||||
target_compile_definitions(astcenc-${ISA_SIMD}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
|
||||
ASTCENC_NEON=1
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0)
|
||||
|
||||
elseif(${ISA_SIMD} MATCHES "sse2")
|
||||
target_compile_definitions(astcenc-${ISA_SIMD}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=20
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0)
|
||||
@@ -132,6 +144,7 @@ elseif(${ISA_SIMD} MATCHES "sse2")
|
||||
elseif(${ISA_SIMD} MATCHES "sse4.1")
|
||||
target_compile_definitions(astcenc-${ISA_SIMD}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=1)
|
||||
@@ -143,6 +156,7 @@ elseif(${ISA_SIMD} MATCHES "sse4.1")
|
||||
elseif(${ISA_SIMD} MATCHES "avx2")
|
||||
target_compile_definitions(astcenc-${ISA_SIMD}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=2
|
||||
ASTCENC_POPCNT=1)
|
||||
|
||||
+12
-5
@@ -285,23 +285,30 @@ def get_encoder_params(encoderName, referenceName, imageSet):
|
||||
outDir = "Test/Images/%s" % imageSet
|
||||
refName = None
|
||||
# Latest master
|
||||
elif encoderName == "ref-master-neon":
|
||||
# Warning: this option rebuilds a new reference test result for the
|
||||
# master branch using the user's locally build encoder.
|
||||
encoder = te.Encoder2x("neon")
|
||||
name = "reference-master-neon"
|
||||
outDir = "Test/Images/%s" % imageSet
|
||||
refName = None
|
||||
elif encoderName == "ref-master-sse2":
|
||||
# Warning: this option rebuilds a new reference test result for the
|
||||
# master branch using the user's locally build encoder in ./Source.
|
||||
# master branch using the user's locally build encoder.
|
||||
encoder = te.Encoder2x("sse2")
|
||||
name = "reference-master-sse2"
|
||||
outDir = "Test/Images/%s" % imageSet
|
||||
refName = None
|
||||
elif encoderName == "ref-master-sse4.1":
|
||||
# Warning: this option rebuilds a new reference test result for the
|
||||
# master branch using the user's locally build encoder in ./Source.
|
||||
# master branch using the user's locally build encoder.
|
||||
encoder = te.Encoder2x("sse4.1")
|
||||
name = "reference-master-sse4.1"
|
||||
outDir = "Test/Images/%s" % imageSet
|
||||
refName = None
|
||||
elif encoderName == "ref-master-avx2":
|
||||
# Warning: this option rebuilds a new reference test result for the
|
||||
# master branch using the user's locally build encoder in ./Source.
|
||||
# master branch using the user's locally build encoder.
|
||||
encoder = te.Encoder2x("avx2")
|
||||
name = "reference-master-avx2"
|
||||
outDir = "Test/Images/%s" % imageSet
|
||||
@@ -327,8 +334,8 @@ def parse_command_line():
|
||||
refcoders = ["ref-1.7",
|
||||
"ref-2.0-sse2", "ref-2.0-sse4.1", "ref-2.0-avx2",
|
||||
"ref-2.1-sse2", "ref-2.1-sse4.1", "ref-2.1-avx2",
|
||||
"ref-master-sse2", "ref-master-sse4.1", "ref-master-avx2"]
|
||||
testcoders = ["none", "sse2", "sse4.1", "avx2"]
|
||||
"ref-master-neon", "ref-master-sse2", "ref-master-sse4.1", "ref-master-avx2"]
|
||||
testcoders = ["none", "neon", "sse2", "sse4.1", "avx2"]
|
||||
coders = refcoders + testcoders + ["all", "all-ref"]
|
||||
parser.add_argument("--encoder", dest="encoders", default="avx2",
|
||||
choices=coders, help="test encoder variant")
|
||||
|
||||
+50
-2
@@ -28,6 +28,7 @@ The directory path is structured:
|
||||
|
||||
from collections.abc import Iterable
|
||||
import os
|
||||
import re
|
||||
import subprocess as sp
|
||||
|
||||
from PIL import Image as PILImage
|
||||
@@ -35,6 +36,39 @@ from PIL import Image as PILImage
|
||||
import testlib.misc as misc
|
||||
|
||||
|
||||
CONVERT_BINARY = ["convert"]
|
||||
|
||||
|
||||
g_ConvertVersion = None
|
||||
|
||||
|
||||
def get_convert_version():
|
||||
"""
|
||||
Get the major/minor version of ImageMagick on the system.
|
||||
"""
|
||||
global g_ConvertVersion
|
||||
|
||||
if g_ConvertVersion is None:
|
||||
command = list(CONVERT_BINARY)
|
||||
command += ["--version"]
|
||||
result = sp.run(command, stdout=sp.PIPE, stderr=sp.PIPE,
|
||||
check=True, universal_newlines=True)
|
||||
|
||||
# Version is top row
|
||||
version = result.stdout.splitlines()[0]
|
||||
# ... third token
|
||||
version = re.split(" ", version)[2]
|
||||
# ... major/minor/patch/subpatch
|
||||
version = re.split("\\.|-", version)
|
||||
|
||||
numericVersion = float(version[0])
|
||||
numericVersion += float(version[1]) / 10.0
|
||||
|
||||
g_ConvertVersion = numericVersion
|
||||
|
||||
return g_ConvertVersion
|
||||
|
||||
|
||||
class ImageException(Exception):
|
||||
"""
|
||||
Exception thrown for bad image specification.
|
||||
@@ -228,6 +262,14 @@ class Image():
|
||||
Args:
|
||||
filePath (str): The path to the image on disk.
|
||||
"""
|
||||
convert = get_convert_version()
|
||||
|
||||
# ImageMagick 7 started to use .tga file origin information. By default
|
||||
# TGA files store data from bottom up, and define the origin as bottom
|
||||
# left. We want our color samples to always use a top left origin, even
|
||||
# if the data is stored in alternative layout.
|
||||
self.invertYCoords = (convert >= 7.0) and filePath.endswith(".tga")
|
||||
|
||||
self.filePath = filePath
|
||||
self.proxyPath = None
|
||||
|
||||
@@ -256,8 +298,14 @@ class Image():
|
||||
coords = [coords]
|
||||
|
||||
for (x, y) in coords:
|
||||
command = [
|
||||
"convert", self.filePath,
|
||||
command = list(CONVERT_BINARY)
|
||||
command += [self.filePath]
|
||||
|
||||
# Invert coordinates if the format needs it
|
||||
if self.invertYCoords:
|
||||
command += ["-flip"]
|
||||
|
||||
command += [
|
||||
"-format", "%%[pixel:p{%u,%u}]" % (x, y),
|
||||
"info:"
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user