ASTC-encoder update to 4.7.0
Signed-off-by: zhaonan287 <zhaonan34@huawei.com>
@@ -38,7 +38,6 @@ ohos_source_set("astc_encoder_static") {
|
||||
"//third_party/astc-encoder/Source/astcenc_partition_tables.cpp",
|
||||
"//third_party/astc-encoder/Source/astcenc_percentile_tables.cpp",
|
||||
"//third_party/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp",
|
||||
"//third_party/astc-encoder/Source/astcenc_platform_isa_detection.cpp",
|
||||
"//third_party/astc-encoder/Source/astcenc_quantization.cpp",
|
||||
"//third_party/astc-encoder/Source/astcenc_symbolic_physical.cpp",
|
||||
"//third_party/astc-encoder/Source/astcenc_weight_align.cpp",
|
||||
@@ -51,7 +50,6 @@ ohos_source_set("astc_encoder_static") {
|
||||
ohos_shared_library("astc_encoder_shared") {
|
||||
public_configs = [ ":astc_encoder_config" ]
|
||||
deps = [ ":astc_encoder_static" ]
|
||||
output_extension = "so"
|
||||
install_enable = true
|
||||
part_name = "astc-encoder"
|
||||
innerapi_tags = [ "platformsdk" ]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ----------------------------------------------------------------------------
|
||||
# Copyright 2020-2022 Arm Limited
|
||||
# Copyright 2020-2024 Arm Limited
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
# use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -24,163 +24,83 @@ if(MSVC)
|
||||
add_compile_options("/wd4324") # Disable structure was padded due to alignment specifier
|
||||
endif()
|
||||
|
||||
project(astcencoder VERSION 3.7.0)
|
||||
project(astcencoder VERSION 4.7.0)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
|
||||
set(PACKAGE_ROOT astcenc)
|
||||
set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS "x86_64 x86_64h arm64")
|
||||
|
||||
include(CTest)
|
||||
|
||||
option(ISA_AVX2 "Enable builds for AVX2 SIMD")
|
||||
option(ISA_SSE41 "Enable builds for SSE4.1 SIMD")
|
||||
option(ISA_SSE2 "Enable builds for SSE2 SIMD")
|
||||
option(ISA_NEON "Enable builds for NEON SIMD")
|
||||
option(ISA_NONE "Enable builds for no SIMD")
|
||||
option(ISA_NATIVE "Enable builds for native SIMD")
|
||||
option(DECOMPRESSOR "Enable builds for decompression only")
|
||||
option(DIAGNOSTICS "Enable builds for diagnostic trace")
|
||||
option(ASAN "Enable builds width address sanitizer")
|
||||
option(UNITTEST "Enable builds for unit tests")
|
||||
option(NO_INVARIANCE "Enable builds without invariance")
|
||||
option(CLI "Enable build of CLI" ON)
|
||||
|
||||
set(UNIVERSAL_BUILD OFF)
|
||||
set(MACOS_BUILD OFF)
|
||||
set(MACOS_ARCH_LEN 0)
|
||||
option(ASTCENC_ISA_AVX2 "Enable astcenc builds for AVX2 SIMD")
|
||||
option(ASTCENC_ISA_SSE41 "Enable astcenc builds for SSE4.1 SIMD")
|
||||
option(ASTCENC_ISA_SSE2 "Enable astcenc builds for SSE2 SIMD")
|
||||
option(ASTCENC_ISA_NEON "Enable astcenc builds for NEON SIMD")
|
||||
option(ASTCENC_ISA_NONE "Enable astcenc builds for no SIMD")
|
||||
option(ASTCENC_ISA_NATIVE "Enable astcenc builds for native SIMD")
|
||||
option(ASTCENC_DECOMPRESSOR "Enable astcenc builds for decompression only")
|
||||
option(ASTCENC_SHAREDLIB "Enable astcenc builds with core library shared objects")
|
||||
option(ASTCENC_DIAGNOSTICS "Enable astcenc builds with diagnostic trace")
|
||||
option(ASTCENC_ASAN "Enable astcenc builds with address sanitizer")
|
||||
option(ASTCENC_UNITTEST "Enable astcenc builds with unit tests")
|
||||
option(ASTCENC_INVARIANCE "Enable astcenc floating point invariance" ON)
|
||||
option(ASTCENC_CLI "Enable build of astcenc command line tools" ON)
|
||||
|
||||
# Preflight for some macOS-specific build options
|
||||
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
|
||||
set(MACOS_BUILD ON)
|
||||
list(LENGTH CMAKE_OSX_ARCHITECTURES MACOS_ARCH_LEN)
|
||||
option(ASTCENC_UNIVERSAL_BUILD "Enable universal multi-arch build" ON)
|
||||
|
||||
if(${ASTCENC_UNIVERSAL_BUILD})
|
||||
set(ASTCENC_ISA_SSE41 ON)
|
||||
set(ASTCENC_ISA_AVX2 ON)
|
||||
set(ASTCENC_ISA_NEON ON)
|
||||
|
||||
if(${ASTCENC_ISA_SSE2})
|
||||
message(FATAL_ERROR "ISA_SSE2 cannot be used in a universal build")
|
||||
endif()
|
||||
|
||||
if(${ASTCENC_ISA_NONE})
|
||||
message(FATAL_ERROR "ISA_NONE cannot be used in a universal build")
|
||||
endif()
|
||||
|
||||
if(${ASTCENC_ISA_NATIVE})
|
||||
message(FATAL_ERROR "ISA_NATIVE cannot be used in a universal build")
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
set(ASTCENC_UNIVERSAL_BUILD OFF)
|
||||
endif()
|
||||
|
||||
# Count options which MUST be x64
|
||||
set(X64_ISA_COUNT 0)
|
||||
set(CONFIGS ${ISA_AVX2} ${ISA_SSE41} ${ISA_SSE2})
|
||||
foreach(CONFIG ${CONFIGS})
|
||||
if(${CONFIG})
|
||||
math(EXPR X64_ISA_COUNT "${X64_ISA_COUNT} + 1")
|
||||
set(ASTCENC_X64_ISA_COUNT 0)
|
||||
set(ASTCENC_CONFIGS ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
|
||||
foreach(ASTCENC_CONFIG ${ASTCENC_CONFIGS})
|
||||
if(${ASTCENC_CONFIG})
|
||||
math(EXPR ASTCENC_X64_ISA_COUNT "${ASTCENC_X64_ISA_COUNT} + 1")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
# Count options which MUST be arm64
|
||||
set(ARM64_ISA_COUNT 0)
|
||||
set(CONFIGS ${ISA_NEON})
|
||||
foreach(CONFIG ${CONFIGS})
|
||||
if(${CONFIG})
|
||||
math(EXPR ARM64_ISA_COUNT "${ARM64_ISA_COUNT} + 1")
|
||||
set(ASTCENC_ARM64_ISA_COUNT 0)
|
||||
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NEON})
|
||||
foreach(ASTCENC_CONFIG ${ASTCENC_CONFIGS})
|
||||
if(${ASTCENC_CONFIG})
|
||||
math(EXPR ASTCENC_ARM64_ISA_COUNT "${ASTCENC_ARM64_ISA_COUNT} + 1")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
# macOS builds
|
||||
if("${MACOS_BUILD}")
|
||||
list(FIND CMAKE_OSX_ARCHITECTURES "x86_64" IS_X64)
|
||||
list(FIND CMAKE_OSX_ARCHITECTURES "arm64" IS_ARM64)
|
||||
list(FIND CMAKE_OSX_ARCHITECTURES "$(ARCHS_STANDARD)" IS_AUTO)
|
||||
|
||||
# Turn list index into boolean
|
||||
if(${IS_X64} EQUAL -1)
|
||||
set(IS_X64 OFF)
|
||||
else()
|
||||
set(IS_X64 ON)
|
||||
endif()
|
||||
|
||||
if(${IS_ARM64} EQUAL -1)
|
||||
set(IS_ARM64 OFF)
|
||||
else()
|
||||
set(IS_ARM64 ON)
|
||||
endif()
|
||||
|
||||
if(${IS_AUTO} EQUAL -1)
|
||||
set(IS_AUTO OFF)
|
||||
else()
|
||||
set(IS_AUTO ON)
|
||||
endif()
|
||||
|
||||
# Set up defaults if no more specific ISA set - use XCode's own defaults
|
||||
if((IS_ARM64 OR IS_AUTO) AND ("${ARM64_ISA_COUNT}" EQUAL 0) AND (NOT "${ISA_NONE}"))
|
||||
set(ARM64_ISA_COUNT 1)
|
||||
set(ISA_NEON ON)
|
||||
endif()
|
||||
|
||||
if((IS_X64 OR IS_AUTO) AND ("${X64_ISA_COUNT}" EQUAL 0) AND (NOT "${ISA_NONE}"))
|
||||
set(X64_ISA_COUNT 1)
|
||||
set(ISA_SSE41 ON)
|
||||
endif()
|
||||
|
||||
# User might be doing multi-architecture - XCode sets this at runtime
|
||||
if("${IS_AUTO}")
|
||||
if(("${ARM64_ISA_COUNT}" GREATER 1) OR ("${X64_ISA_COUNT}" GREATER 1))
|
||||
message(FATAL_ERROR "For macOS universal binaries only one backend per architecture is allowed.")
|
||||
endif()
|
||||
|
||||
set(UNIVERSAL_BUILD ON)
|
||||
|
||||
# User requested explicit multi-architecture universal build
|
||||
elseif("${MACOS_ARCH_LEN}" GREATER 2)
|
||||
message(FATAL_ERROR "For macOS universal binaries only x86_64 and arm64 builds are allowed.")
|
||||
|
||||
elseif("${MACOS_ARCH_LEN}" EQUAL 2)
|
||||
if(NOT (${IS_X64} AND ${IS_ARM64}))
|
||||
message(FATAL_ERROR "For macOS universal binaries only x86_64 and arm64 builds are allowed.")
|
||||
endif()
|
||||
|
||||
if(("${ARM64_ISA_COUNT}" GREATER 1) OR ("${X64_ISA_COUNT}" GREATER 1))
|
||||
message(FATAL_ERROR "For macOS universal binaries only one backend per architecture is allowed.")
|
||||
endif()
|
||||
|
||||
set(UNIVERSAL_BUILD ON)
|
||||
|
||||
# User requested explicit single architecture build
|
||||
elseif("${MACOS_ARCH_LEN}" EQUAL 1)
|
||||
if("${IS_X64}" AND "${ARM64_ISA_COUNT}")
|
||||
message(FATAL_ERROR "For macOS x86_64 builds an arm64 backend cannot be specified.")
|
||||
endif()
|
||||
|
||||
if("${IS_ARM64}" AND "${X64_ISA_COUNT}")
|
||||
message(FATAL_ERROR "For macOS arm64 builds an x86_64 backend cannot be specified.")
|
||||
endif()
|
||||
|
||||
# Else is this an implicit multi-architecture universal build?
|
||||
elseif(("${ARM64_ISA_COUNT}" EQUAL 1) AND ("${X64_ISA_COUNT}" GREATER 1))
|
||||
string(CONCAT MSG "For macOS setting multiple architecture backends builds a universal binary. "
|
||||
"For universal binaries only one backend per architecture is allowed.")
|
||||
message(FATAL_ERROR "${MSG}")
|
||||
|
||||
elseif(("${X64_ISA_COUNT}" EQUAL 1) AND ("${ARM64_ISA_COUNT}" GREATER 1))
|
||||
string(CONCAT MSG "For macOS setting multiple architecture backends builds a universal binary. "
|
||||
"For universal binaries only one backend per architecture is allowed.")
|
||||
message(FATAL_ERROR "${MSG}")
|
||||
|
||||
elseif(("${ARM64_ISA_COUNT}" EQUAL 1) AND ("${X64_ISA_COUNT}" EQUAL 1))
|
||||
set(UNIVERSAL_BUILD ON)
|
||||
set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64")
|
||||
|
||||
# Else is this an implicit single architecture build?
|
||||
elseif("${ARM64_ISA_COUNT}" EQUAL 1)
|
||||
set(CMAKE_OSX_ARCHITECTURES "arm64")
|
||||
|
||||
elseif("${X64_ISA_COUNT}" EQUAL 1)
|
||||
set(CMAKE_OSX_ARCHITECTURES "x86_64")
|
||||
|
||||
else()
|
||||
# Do nothing here - assume it defaults to host?
|
||||
|
||||
endif()
|
||||
|
||||
# Non-macOS builds
|
||||
else()
|
||||
if(("${ARM64_ISA_COUNT}" GREATER 0) AND ("${X64_ISA_COUNT}" GREATER 0))
|
||||
if(NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
|
||||
if(("${ASTCENC_ARM64_ISA_COUNT}" GREATER 0) AND ("${ASTCENC_X64_ISA_COUNT}" GREATER 0))
|
||||
message(FATAL_ERROR "Builds can only support a single architecture per configure.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# If nothing more specific is set then fall back on the compiler's defaults
|
||||
if(("${ARM64_ISA_COUNT}" EQUAL 0) AND ("${X64_ISA_COUNT}" EQUAL 0) AND (NOT "${ISA_NONE}"))
|
||||
set(ISA_NATIVE ON)
|
||||
if(("${ASTCENC_ARM64_ISA_COUNT}" EQUAL 0) AND ("${ASTCENC_X64_ISA_COUNT}" EQUAL 0) AND (NOT "${ASTCENC_ISA_NONE}"))
|
||||
set(ASTCENC_ISA_NATIVE ON)
|
||||
endif()
|
||||
|
||||
function(printopt optName optVal)
|
||||
@@ -191,38 +111,38 @@ function(printopt optName optVal)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
if("${BLOCK_MAX_TEXELS}")
|
||||
message(STATUS " Max block texels - ${BLOCK_MAX_TEXELS}")
|
||||
if("${ASTCENC_BLOCK_MAX_TEXELS}")
|
||||
message(STATUS " Max block texels - ${ASTCENC_BLOCK_MAX_TEXELS}")
|
||||
endif()
|
||||
printopt("AVX2 backend " ${ISA_AVX2})
|
||||
printopt("SSE4.1 backend " ${ISA_SSE41})
|
||||
printopt("SSE2 backend " ${ISA_SSE2})
|
||||
printopt("NEON backend " ${ISA_NEON})
|
||||
printopt("NONE backend " ${ISA_NONE})
|
||||
printopt("NATIVE backend " ${ISA_NATIVE})
|
||||
if("${MACOS_BUILD}")
|
||||
printopt("Universal bin " ${UNIVERSAL_BUILD})
|
||||
|
||||
printopt("AVX2 backend " ${ASTCENC_ISA_AVX2})
|
||||
printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41})
|
||||
printopt("SSE2 backend " ${ASTCENC_ISA_SSE2})
|
||||
printopt("NEON backend " ${ASTCENC_ISA_NEON})
|
||||
printopt("NONE backend " ${ASTCENC_ISA_NONE})
|
||||
printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE})
|
||||
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
|
||||
printopt("Universal bin " ${ASTCENC_UNIVERSAL_BUILD})
|
||||
endif()
|
||||
printopt("Decompressor " ${DECOMPRESSOR})
|
||||
printopt("No invariance " ${NO_INVARIANCE})
|
||||
printopt("Diagnostics " ${DIAGNOSTICS})
|
||||
printopt("ASAN " ${ASAN})
|
||||
printopt("Unit tests " ${UNITTEST})
|
||||
printopt("Invariance " ${ASTCENC_INVARIANCE})
|
||||
printopt("Shared libs " ${ASTCENC_SHAREDLIB})
|
||||
printopt("Decompressor " ${ASTCENC_DECOMPRESSOR})
|
||||
printopt("Diagnostics " ${ASTCENC_DIAGNOSTICS})
|
||||
printopt("ASAN " ${ASTCENC_ASAN})
|
||||
printopt("Unit tests " ${ASTCENC_UNITTEST})
|
||||
|
||||
# Subcomponents
|
||||
add_subdirectory(Source)
|
||||
|
||||
# Configure package archive
|
||||
if(PACKAGE)
|
||||
if("${MACOS_BUILD}")
|
||||
string(TOLOWER "macOS" PKG_OS)
|
||||
if(ASTCENC_PACKAGE)
|
||||
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
|
||||
string(TOLOWER "macOS" ASTCENC_PKG_OS)
|
||||
else()
|
||||
string(TOLOWER ${CMAKE_SYSTEM_NAME} PKG_OS)
|
||||
string(TOLOWER ${CMAKE_SYSTEM_NAME} ASTCENC_PKG_OS)
|
||||
endif()
|
||||
|
||||
set(PKG_VER ${CMAKE_PROJECT_VERSION_MAJOR}.${CMAKE_PROJECT_VERSION_MINOR})
|
||||
|
||||
set(CPACK_PACKAGE_FILE_NAME "astcenc-${PKG_VER}-${PKG_OS}-${PACKAGE}")
|
||||
set(CPACK_PACKAGE_FILE_NAME "astcenc-${CMAKE_PROJECT_VERSION}-${ASTCENC_PKG_OS}-${ASTCENC_PACKAGE}")
|
||||
set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY FALSE)
|
||||
set(CPACK_PACKAGE_CHECKSUM SHA256)
|
||||
set(CPACK_GENERATOR ZIP)
|
||||
|
||||
@@ -10,7 +10,7 @@ backends.
|
||||
|
||||
## Windows
|
||||
|
||||
Builds for Windows are tested with CMake 3.17 and Visual Studio 2019.
|
||||
Builds for Windows are tested with CMake 3.17, and Visual Studio 2019 or newer.
|
||||
|
||||
### Configuring the build
|
||||
|
||||
@@ -25,13 +25,13 @@ cd build
|
||||
|
||||
# Configure your build of choice, for example:
|
||||
|
||||
# x86-64 using a Visual Studio solution
|
||||
cmake -G "Visual Studio 16 2019" -T ClangCL -DCMAKE_INSTALL_PREFIX=..\ ^
|
||||
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
|
||||
|
||||
# x86-64 using NMake
|
||||
cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=..\ ^
|
||||
-DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
|
||||
|
||||
# x86-64 using Visual Studio solution
|
||||
cmake -G "Visual Studio 16 2019" -T ClangCL -DCMAKE_INSTALL_PREFIX=..\ ^
|
||||
-DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
|
||||
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
|
||||
```
|
||||
|
||||
A single CMake configure can build multiple binaries for a single target CPU
|
||||
@@ -49,14 +49,15 @@ Once you have configured the build you can use NMake to compile the project
|
||||
from your build dir, and install to your target install directory.
|
||||
|
||||
```shell
|
||||
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/astcenc/`
|
||||
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/bin/`
|
||||
cd build
|
||||
nmake install
|
||||
```
|
||||
|
||||
## macOS and Linux
|
||||
## macOS and Linux using Make
|
||||
|
||||
Builds for macOS and Linux are tested with CMake 3.17 and clang++ 9.0.
|
||||
Builds for macOS and Linux are tested with CMake 3.17, and clang++ 9.0 or
|
||||
newer.
|
||||
|
||||
> Compiling using g++ is supported, but clang++ builds are faster by ~15%.
|
||||
|
||||
@@ -78,15 +79,14 @@ cd build
|
||||
|
||||
# Arm aarch64
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
|
||||
-DISA_NEON=ON ..
|
||||
-DASTCENC_ISA_NEON=ON ..
|
||||
|
||||
# x86-64
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
|
||||
-DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
|
||||
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
|
||||
|
||||
# macOS universal binary build
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
|
||||
-DISA_AVX2=ON -DISA_NEON=ON ..
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ ..
|
||||
```
|
||||
|
||||
A single CMake configure can build multiple binaries for a single target CPU
|
||||
@@ -94,14 +94,13 @@ architecture, for example building x64 for both SSE2 and AVX2. Each binary name
|
||||
will include the build variant as a postfix. It is possible to build any set of
|
||||
the supported SIMD variants by enabling only the ones you require.
|
||||
|
||||
For macOS, we additionally support the ability to build a universal binary,
|
||||
combining one x86 and one arm64 variant into a single output binary. The OS
|
||||
will select the correct variant to run for the machine being used to run the
|
||||
built binary. To build a universal binary select a single x86 variant and a
|
||||
single arm64 variant, and both will be included in a single output binary. It
|
||||
is not required, but if `CMAKE_OSX_ARCHITECTURES` is set on the command line
|
||||
(e.g. by XCode-generated build commands) it will be validated against the other
|
||||
configuration variant settings.
|
||||
For macOS, we additionally support the ability to build a universal binary.
|
||||
This build includes SSE4.1 (`x86_64`), AVX2 (`x86_64h`), and NEON (`arm64`)
|
||||
build slices in a single output binary. The OS will select the correct variant
|
||||
to run for the machine being used. This is the default build target for a macOS
|
||||
build, but single-target binaries can still be built by setting
|
||||
`-DASTCENC_UNIVERSAL_BUILD=OFF` and then manually selecting the specific ISA
|
||||
variants that are required.
|
||||
|
||||
### Building
|
||||
|
||||
@@ -109,11 +108,44 @@ Once you have configured the build you can use Make to compile the project from
|
||||
your build dir, and install to your target install directory.
|
||||
|
||||
```shell
|
||||
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/astcenc/`
|
||||
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/bin/`
|
||||
# for executable binaries and `${CMAKE_INSTALL_PREFIX}/lib/` for libraries
|
||||
cd build
|
||||
make install -j16
|
||||
```
|
||||
|
||||
## macOS using XCode
|
||||
|
||||
Builds for macOS are tested with CMake 3.17, and XCode 14.0 or
|
||||
newer.
|
||||
|
||||
### Configuring the build
|
||||
|
||||
To use CMake you must first configure the build. Create a build directory
|
||||
in the root of the astcenc checkout, and then run `cmake` inside that directory
|
||||
to generate the build system.
|
||||
|
||||
```shell
|
||||
# Create a build directory
|
||||
mkdir build
|
||||
cd build
|
||||
|
||||
# Configure a universal build
|
||||
cmake -G Xcode -DCMAKE_INSTALL_PREFIX=../ ..
|
||||
```
|
||||
|
||||
### Building
|
||||
|
||||
Once you have configured the build you can use CMake to compile the project
|
||||
from your build dir, and install to your target install directory.
|
||||
|
||||
```shell
|
||||
cmake --build . --config Release
|
||||
|
||||
# Optionally install the binaries to the installation directory
|
||||
cmake --install . --config Release
|
||||
```
|
||||
|
||||
## Advanced build options
|
||||
|
||||
For codec developers and power users there are a number of useful features in
|
||||
@@ -132,22 +164,33 @@ We support and test the following `CMAKE_BUILD_TYPE` options.
|
||||
Note that optimized release builds are compiled with link-time optimization,
|
||||
which can make profiling more challenging ...
|
||||
|
||||
### Shared Libraries
|
||||
|
||||
We support building the core library as a shared object by setting the CMake
|
||||
option `-DASTCENC_SHAREDLIB=ON` at configure time. For macOS build targets the
|
||||
shared library supports the same universal build configuration as the command
|
||||
line utility.
|
||||
|
||||
Note that the command line tool is always statically linked; the shared objects
|
||||
are an extra build output that is not currently used by the command line tool.
|
||||
|
||||
### Constrained block size builds
|
||||
|
||||
All normal builds will support all ASTC block sizes, including the worst case
|
||||
6x6x6 3D block size (216 texels per block). Compressor memory footprint and
|
||||
performance can be improved by limiting the block sizes supported in the build
|
||||
by adding `-DBLOCK_MAX_TEXELS=<texel_count>` to the CMake command line when
|
||||
configuring. Legal block sizes that are unavailable in a restricted build will
|
||||
return the error `ASTCENC_ERR_NOT_IMPLEMENTED` during context creation.
|
||||
by adding `-DASTCENC_BLOCK_MAX_TEXELS=<texel_count>` to the CMake command line
|
||||
when configuring. Legal block sizes that are unavailable in a restricted build
|
||||
will return the error `ASTCENC_ERR_NOT_IMPLEMENTED` during context creation.
|
||||
|
||||
### Non-invariant builds
|
||||
|
||||
All normal builds are designed to be invariant, so any build from the same git
|
||||
revision will produce bit-identical results for all compilers and CPU
|
||||
architectures. To achieve this we sacrifice some performance, so if this is
|
||||
not required you can specify `-DNO_INVARIANCE=ON` to enable additional
|
||||
optimizations.
|
||||
not required you can specify `-DASTCENC_INVARIANCE=OFF` to enable additional
|
||||
optimizations. This has most benefit for AVX2 builds where we are able to
|
||||
enable use of the FMA instruction set extensions.
|
||||
|
||||
### No intrinsics builds
|
||||
|
||||
@@ -156,8 +199,8 @@ supported target architectures (x86 and arm64) guarantee SIMD availability. For
|
||||
development purposes it is possible to build an intrinsic-free build which uses
|
||||
no explicit SIMD acceleration (the compiler may still auto-vectorize).
|
||||
|
||||
To enable this binary variant add `-DISA_NONE=ON` to the CMake command line
|
||||
when configuring. It is NOT recommended to use this for production; it is
|
||||
To enable this binary variant add `-DASTCENC_ISA_NONE=ON` to the CMake command
|
||||
line when configuring. It is NOT recommended to use this for production; it is
|
||||
significantly slower than the vectorized SIMD builds.
|
||||
|
||||
### Test builds
|
||||
@@ -171,7 +214,7 @@ git submodule init
|
||||
git submodule update
|
||||
```
|
||||
|
||||
To build unit tests add `-DUNITTEST=ON` to the CMake command line when
|
||||
To build unit tests add `-DASTCENC_UNITTEST=ON` to the CMake command line when
|
||||
configuring.
|
||||
|
||||
To run unit tests use the CMake `ctest` utility from your build directory after
|
||||
@@ -185,14 +228,56 @@ ctest --verbose
|
||||
### Address sanitizer builds
|
||||
|
||||
We support building with ASAN on Linux and macOS when using a compiler that
|
||||
supports it. To build binaries with ASAN checking enabled add `-DASAN=ON` to
|
||||
the CMake command line when configuring.
|
||||
supports it. To build binaries with ASAN checking enabled add `-DASTCENC_ASAN=ON`
|
||||
to the CMake command line when configuring.
|
||||
|
||||
### Android builds
|
||||
|
||||
Builds of the command line utility for Android are not officially supported, but can be a useful
|
||||
development build for testing on e.g. different Arm CPU microarchitectures.
|
||||
|
||||
The build script below shows one possible route to building the command line tool for Android. Once
|
||||
built the application can be pushed to e.g. `/data/local/tmp` and executed from an Android shell
|
||||
terminal over `adb`.
|
||||
|
||||
```shell
|
||||
ANDROID_ABI=arm64-v8a
|
||||
ANDROID_NDK=/work/tools/android/ndk/22.1.7171670
|
||||
|
||||
BUILD_TYPE=RelWithDebInfo
|
||||
|
||||
BUILD_DIR=build
|
||||
|
||||
mkdir -p ${BUILD_DIR}
|
||||
cd ${BUILD_DIR}
|
||||
|
||||
cmake \
|
||||
-DCMAKE_INSTALL_PREFIX=./ \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=${ANDROID_ABI} \
|
||||
-DANDROID_ARM_NEON=ON \
|
||||
-DANDROID_PLATFORM=android-21 \
|
||||
-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=clang \
|
||||
-DANDROID_TOOLCHAIN=clang \
|
||||
-DANDROID_STL=c++_static \
|
||||
-DARCH=aarch64 \
|
||||
-DASTCENC_ISA_NEON=ON \
|
||||
..
|
||||
|
||||
make -j16
|
||||
```
|
||||
|
||||
## Packaging a release bundle
|
||||
|
||||
We support building a release bundle of all enabled binary configurations in
|
||||
the current CMake configuration using the `package` build target.
|
||||
|
||||
Configure CMake with:
|
||||
|
||||
* `-DASTCENC_PACKAGE=<arch>` to set the package architecture/variant name used
|
||||
to name the package archive (not set by default).
|
||||
|
||||
```shell
|
||||
# Run a build and package build outputs in `./astcenc-<ver>-<os>-<arch>.<fmt>`
|
||||
cd build
|
||||
@@ -212,4 +297,4 @@ details.
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
|
||||
_Copyright © 2019-2023, Arm Limited and contributors. All rights reserved._
|
||||
|
||||
@@ -1,328 +0,0 @@
|
||||
# 2.x series change log
|
||||
|
||||
This page summarizes the major functional and performance changes in each
|
||||
release of the 2.x series.
|
||||
|
||||
All performance data on this page is measured on an Intel Core i5-9600K
|
||||
clocked at 4.2 GHz, running astcenc using 6 threads.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.5
|
||||
|
||||
**Status:** Released, March 2021
|
||||
|
||||
The 2.5 release is the last major release in the 2.x series. After this release
|
||||
a `2.x` branch will provide stable long-term support, and the `main` branch
|
||||
will switch to focusing on more radical changes for the 3.x series.
|
||||
|
||||
Reminder for users of the library interface - the API is not designed to be
|
||||
stable across versions, and this release is not compatible with earlier 2.x
|
||||
releases. Please update and rebuild your client-side code using the updated
|
||||
`astcenc.h` header.
|
||||
|
||||
**General:**
|
||||
* **Feature:** The `ISA_INVARIANCE` build option is no longer supported, as
|
||||
there is no longer any performance benefit from the variant paths. All
|
||||
builds are now using the equivalent of the `ISA_INVARIANCE=ON` setting, and
|
||||
all builds (except Armv7) are now believed to be invariant across operating
|
||||
systems, compilers, CPU architectures, and SIMD instruction sets.
|
||||
* **Feature:** Armv8 32-bit builds with NEON are now supported, with
|
||||
out-of-the-box support for Arm Linux soft-float and hard-float ABIs. There
|
||||
are no pre-built binaries for these targets; support is included for
|
||||
library users targeting older 32-bit Android and iOS devices.
|
||||
* **Feature:** A compressor mode for encoding HDR textures that have been
|
||||
encoded into LDR RGBM wrapper format is now supported. Note that this
|
||||
encoding has some strong recommendations for how the RGBM encoding is
|
||||
implemented to avoid block artifacts in the compressed image.
|
||||
* **Core API:**
|
||||
* **API Change:** The core API has been changed to be a pure C API, making it
|
||||
easier to wrap the codec in a stable shared library ABI. Some entry points
|
||||
that used to accept references now expect pointers.
|
||||
* **API Change:** The decompression functionality in the core API has been
|
||||
changed to allow use of multiple threads. The design pattern matches the
|
||||
compression functionality, requiring the caller to create the threads,
|
||||
synchronize them between images, and to call the new
|
||||
`astcenc_decompress_reset()` function between images.
|
||||
* **API Feature:** Defines to support exporting public API entry point
|
||||
symbols from a shared object are provided, but not exposed off-the-shelf by
|
||||
the CMake provided by the project.
|
||||
* **API Feature:** New `astcenc_get_block_info()` function added to the core
|
||||
API to allow users to perform high level analysis of compressed data. This
|
||||
API is not implemented in decompressor-only builds.
|
||||
* **API Feature:** Codec configuration structure has been extended to expose
|
||||
the new RGBM compression mode. See the API header for details.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.4
|
||||
|
||||
**Status:** Released, February 2021
|
||||
|
||||
The 2.4 release is the fifth release in the 2.x series. It is primarily a bug
|
||||
fix release for HDR image handling, which impacts all earlier 2.x series
|
||||
releases.
|
||||
|
||||
**General:**
|
||||
* **Feature:** When using the `-a` option, or the equivalent config option
|
||||
for the API, any 2D blocks that are entirely zero alpha after the alpha
|
||||
filter radius is taken into account are replaced by transparent black
|
||||
constant color blocks. This is an RDO-like technique to improve compression
|
||||
ratios of any additional application packaging compression that is applied.
|
||||
**Command Line:**
|
||||
* **Bug fix:** The command line wrapper now correctly loads HDR images that
|
||||
have a non-square aspect ratio.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.3
|
||||
|
||||
**Status:** Released, January 2021
|
||||
|
||||
The 2.3 release is the fourth release in the 2.x series. It includes a number
|
||||
of performance improvements and new features.
|
||||
|
||||
Reminder for users of the library interface - the API is not designed to be
|
||||
stable across versions, and this release is not compatible with 2.2. Please
|
||||
recompile your client-side code using the updated `astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** Decompressor-only builds of the codec are supported again.
|
||||
While this is primarily a feature for library users who want to shrink
|
||||
binary size, a variant command line tool `astcdec` can be built by
|
||||
specifying `DECOMPRESSOR=ON` on the CMake configure command line.
|
||||
* **Feature:** Diagnostic builds of the codec can now be built. These builds
|
||||
generate a JSON file containing a trace of the compressor execution.
|
||||
Diagnostic builds are only suitable for codec development; they are slower
|
||||
and JSON generation cannot be disabled. Build by setting `DIAGNOSTICS=ON`
|
||||
on the CMake configure command line.
|
||||
* **Feature:** Code compatibility improved with older versions of GCC,
|
||||
earliest compiler now tested is GCC 7.5 (was GCC 9.3).
|
||||
* **Feature:** Code compatibility improved with newer versions of LLVM,
|
||||
latest compiler now tested is Clang 12.0 (was Clang 9.0).
|
||||
* **Feature:** Code compatibility improved with the Visual Studio 2019 LLVM
|
||||
toolset (`clang-cl`). Using the LLVM toolset gives 25% performance
|
||||
improvements and is recommended.
|
||||
* **Command Line:**
|
||||
* **Feature:** Quality level now accepts either a preset (`-fast`, etc) or a
|
||||
float value between 0 and 100, allowing more control over the compression
|
||||
quality vs performance trade-off. The presets are not evenly spaced in the
|
||||
float range; they have been spaced to give the best distribution of points
|
||||
between the fast and thorough presets.
|
||||
* `-fastest`: 0.0
|
||||
* `-fast`: 10.0
|
||||
* `-medium`: 60.0
|
||||
* `-thorough`: 98.0
|
||||
* `-exhaustive`: 100.0
|
||||
* **Core API:**
|
||||
* **API Change:** Quality level preset enum replaced with a float value
|
||||
between 0 (`-fastest`) and 100 (`-exhaustive`). See above for more info.
|
||||
|
||||
### Performance
|
||||
|
||||
This release includes a number of optimizations to improve performance.
|
||||
|
||||
* New compressor algorithm for handling encoding candidates and refinement.
|
||||
* Vectorized implementation of `compute_error_of_weight_set()`.
|
||||
* Unrolled implementation of `encode_ise()`.
|
||||
* Many other small improvements!
|
||||
|
||||
The most significant change is the change to the compressor path, which now
|
||||
uses an adaptive approach to candidate trials and block refinement.
|
||||
|
||||
In earlier releases the quality level will determine the number of encoding
|
||||
candidates and the number of iterative refinement passes that are used for each
|
||||
major encoding trial. This is a fixed behavior; it will always try the full N
|
||||
candidates and M refinement iterations specified by the quality level for each
|
||||
encoding trial.
|
||||
|
||||
The new approach implements two optimizations for this:
|
||||
|
||||
* Compression will complete when a block candidate hits the specified target
|
||||
quality, after its M refinement iterations have been applied. Later block
|
||||
candidates are simply abandoned.
|
||||
* Block candidates will predict how much refinement can improve them, and
|
||||
abandon refinement if they are unlikely to improve upon the best known
|
||||
encoding already in-hand.
|
||||
|
||||
This pair of optimizations provides significant performance improvement to the
|
||||
high quality modes which use the most block candidates and refinement
|
||||
iterations. A minor loss of image quality is expected, as the blocks we no
|
||||
longer test or refine may have been better coding choices.
|
||||
|
||||
**Absolute performance vs 2.2 release:**
|
||||
|
||||

|
||||
|
||||
**Relative performance vs 2.2 release:**
|
||||
|
||||

|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.2
|
||||
|
||||
**Status:** Released, January 2021
|
||||
|
||||
The 2.2 release is the third release in the 2.x series. It includes a number
|
||||
of performance improvements and new features.
|
||||
|
||||
Reminder for users of the library interface - the API is not designed to be
|
||||
stable across versions, and this release is not compatible with 2.1. Please
|
||||
recompile your client-side code using the updated `astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** New Arm aarch64 NEON accelerated vector library support.
|
||||
* **Improvement:** New CMake build system for all platforms.
|
||||
* **Improvement:** SSE4.2 feature profile changed to SSE4.1, which more
|
||||
accurately reflects the feature set used.
|
||||
* **Binary releases:**
|
||||
* **Improvement:** Linux binaries changed to use Clang 9.0, which gives
|
||||
up to 15% performance improvement.
|
||||
* **Improvement:** Windows binaries are now code signed.
|
||||
* **Improvement:** macOS binaries for Apple silicon platforms now provided.
|
||||
* **Improvement:** macOS binaries are now code signed and notarized.
|
||||
* **Command Line:**
|
||||
* **Feature:** New image preprocess `-pp-normalize` option added. This forces
|
||||
normal vectors to be unit length, which is useful when compressing source
|
||||
textures that use normal length to encode an NDF, which is incompatible
|
||||
with ASTC's two channel encoding.
|
||||
* **Feature:** New image preprocess `-pp-premultiply` option added. This
|
||||
scales RGB values by the alpha value. This can be useful to minimize
|
||||
cross-channel color bleed caused by GPU post-multiply filtering/blending.
|
||||
* **Improvement:** Command line tool cleanly traps and reports errors for
|
||||
corrupt input images rather than relying on standard library `assert()`
|
||||
calls in release builds.
|
||||
* **Core API:**
|
||||
* **API Change:** Images using region-based metrics no longer need to include
|
||||
padding; all input images should be tightly packed and `dim_pad` is removed
|
||||
from the `astcenc_image` structure. This makes it easier to directly use
|
||||
images loaded from other libraries.
|
||||
* **API Change:** Image `data` is no longer a 3D array accessed using
|
||||
`data[z][y][x]` indexing, it's an array of 2D slices. This makes it easier
|
||||
to directly use images loaded from other libraries.
|
||||
* **API Change:** New `ASTCENC_FLG_SELF_DECOMPRESS_ONLY` flag added to the
|
||||
codec config. Using this flag enables additional optimizations that
|
||||
aggressively exploit implementation- and configuration-specific behavior
|
||||
to gain performance. When using this flag the codec can only reliably
|
||||
decompress images that were compressed in the same context session. Images
|
||||
produced via other means may fail to decompress correctly, even if they are
|
||||
otherwise valid ASTC files.
|
||||
|
||||
### Performance
|
||||
|
||||
There is one major set of optimizations in this release, related to the new
|
||||
`ASTCENC_FLG_SELF_DECOMPRESS_ONLY` mode. These allow the compressor to only
|
||||
create data tables it knows that it is going to use, based on its current set
|
||||
of heuristics, rather than needing the full set the format allows.
|
||||
|
||||
The first benefit of these changes is a reduced context creation time, which
|
||||
can be reduced by up to 250ms on our test machine. This is a significant
|
||||
percentage of the command line utility runtime for a small image when using a
|
||||
quick search preset. Compressing the whole Kodak test suite using the command
|
||||
line utility and the `-fastest` preset is ~30% faster with this release, which
|
||||
is mostly due to faster startup.
|
||||
|
||||
The reduction in the data table size in this mode also improves the core codec
|
||||
speed. Our test sets show an average of 12% improvement in the codec for
|
||||
`-fastest` mode, and an average of 3% for `-medium` mode.
|
||||
|
||||
Key for performance charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Absolute performance vs 2.1 release:**
|
||||
|
||||

|
||||
|
||||
**Relative performance vs 2.1 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.1
|
||||
|
||||
**Status:** Released, November 2020
|
||||
|
||||
The 2.1 release is the second release in the 2.x series. It includes a number
|
||||
of performance optimizations and new features.
|
||||
|
||||
Reminder for users of the library interface - the API is not designed to be
|
||||
stable across versions, and this release is not compatible with 2.0. Please
|
||||
recompile your client-side code using the updated `astcenc.h` header.
|
||||
|
||||
### Features:
|
||||
|
||||
* **Command line:**
|
||||
* **Bug fix:** The meaning of the `-tH\cH\dH` and `-th\ch\dh` compression
|
||||
modes was inverted. They now match the documentation; use `-*H` for HDR
|
||||
RGBA, and `-*h` for HDR RGB with LDR alpha.
|
||||
* **Feature:** A new `-fastest` quality preset is now available. This is
|
||||
designed for fast "roughing out" of new content, and sacrifices significant
|
||||
image quality compared to `-fast`. We do not recommend its use for
|
||||
production builds.
|
||||
* **Feature:** A new `-candidatelimit` compression tuning option is now
|
||||
available. This is a power-user control to determine how many candidates
|
||||
are returned for each block mode encoding trial. This feature is used
|
||||
automatically by the search presets; see `-help` for details.
|
||||
* **Improvement:** The compression test modes (`-tl\ts\th\tH`) now emit a
|
||||
MTex/s performance metric, in addition to coding time.
|
||||
* **Core API:**
|
||||
* **Feature:** A new quality preset `ASTCENC_PRE_FASTEST` is available. See
|
||||
`-fastest` above for details.
|
||||
* **Feature:** A new tuning option `tune_candidate_limit` is available in
|
||||
the config structure. See `-candidatelimit` above for details.
|
||||
* **Feature:** Image input/output can now use `ASTCENC_TYPE_F32` data types.
|
||||
* **Stability:**
|
||||
* **Feature:** The SSE2, SSE4.2, and AVX2 variants now produce identical
|
||||
compressed output when run on the same CPU when compiled with the
|
||||
preprocessor define `ASTCENC_ISA_INVARIANCE=1`. For Make builds this can
|
||||
be set on the command line by setting `ISA_INV=1`. ISA invariance is off
|
||||
by default; it reduces performance by 1-3%.
|
||||
|
||||
### Performance
|
||||
|
||||
Key for performance charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Absolute performance vs 2.0 release:**
|
||||
|
||||

|
||||
|
||||
**Relative performance vs 2.0 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.0
|
||||
|
||||
**Status:** Released, August 2020
|
||||
|
||||
The 2.0 release is the first release in the 2.x series. It includes a number of
|
||||
major changes over the earlier 1.7 series, and is not command-line compatible.
|
||||
|
||||
### Features:
|
||||
|
||||
* The core codec can be built as a library, exposed via a new codec API.
|
||||
* The core codec supports accelerated SIMD paths for SSE2, SSE4.2, and AVX2.
|
||||
* The command line syntax has a clearer mapping to Khronos feature profiles.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for performance charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Absolute performance vs 1.7 release:**
|
||||
|
||||

|
||||
|
||||
**Relative performance vs 1.7 release:**
|
||||
|
||||

|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
|
||||
@@ -299,10 +299,6 @@ Key for charts:
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Absolute performance vs 2.5 release:**
|
||||
|
||||

|
||||
|
||||
**Relative performance vs 2.5 release:**
|
||||
|
||||

|
||||
|
||||
@@ -0,0 +1,398 @@
|
||||
# 4.x series change log
|
||||
|
||||
This page summarizes the major functional and performance changes in each
|
||||
release of the 4.x series.
|
||||
|
||||
All performance data on this page is measured on an Intel Core i5-9600K
|
||||
clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.7.0
|
||||
|
||||
**Status:** January 2024
|
||||
|
||||
The 4.7.0 release is a major maintenance release, fixing rounding behavior in
|
||||
the decompressor to match the Khronos specification. This fix includes the
|
||||
addition of explicit support for optimizing for `decode_unorm8` rounding.
|
||||
|
||||
Reminder - the codec library API is not designed to be binary compatible across
|
||||
versions. We always recommend rebuilding your client-side code using the updated
|
||||
`astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Bug fix:** sRGB LDR decompression now uses the correct endpoint expansion
|
||||
method to create the 16-bit RGB endpoint colors, and removes the previous
|
||||
correction code from the interpolation function. This bug could result in
|
||||
LSB bit flips relative to the standard specification.
|
||||
* **Bug fix:** Decompressing to an 8-bit per component output image now matches
|
||||
the `decode_unorm8` extension rounding rules. This bug could result in
|
||||
LSB bit flips relative to the standard specification.
|
||||
* **Bug fix:** Code now avoids using `alignas()` in the reference C
|
||||
implementation, as the default `alignas(16)` is narrower than the
|
||||
native minimum alignment requirement on some CPUs.
|
||||
* **Feature:** Library configuration supports a new flag,
|
||||
`ASTCENC_FLG_USE_DECODE_UNORM8`. This flag indicates that the image will be
|
||||
used with the `decode_unorm8` decode mode. When set during compression
|
||||
this allows the compressor to use the correct rounding when determining the
|
||||
best encoding.
|
||||
* **Feature:** Command line tool supports a new option, `-decode_unorm8`.
|
||||
This option indicates that the image will be used with the `decode_unorm8`
|
||||
decode mode. This option will automatically be set for decompression
|
||||
(`-d*`) and trial (`-t*`) tool operation if the decompressed output image
|
||||
is stored to an 8-bit per component file format. This option must be set
|
||||
manually for compression (`-c*`) tool operation, as the desired decode mode
|
||||
cannot be reliably determined.
|
||||
* **Feature:** Library configuration supports a new optional progress
|
||||
reporting callback to be specified. This is called during compression to
|
||||
allow interactive tooling use cases to display incremental progress. The
|
||||
command line tool uses this feature to show compression progress unless
|
||||
`-silent` is used.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.6.1
|
||||
|
||||
**Status:** November 2023
|
||||
|
||||
The 4.6.1 release is a minor maintenance release to fix a scaling bug on
|
||||
large core count Windows systems.
|
||||
|
||||
* **General:**
|
||||
* **Optimization:** Windows builds of the `astcenc` command line tool can now
|
||||
use more than 64 cores on large core count systems. This change doubled
|
||||
command line performance for `-exhaustive` compression when testing on a
|
||||
96 core/192 thread system.
|
||||
* **Feature:** Windows Arm64 native builds of the `astcenc` command line tool
|
||||
are now included in the prebuilt release binaries.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.6.0
|
||||
|
||||
**Status:** November 2023
|
||||
|
||||
The 4.6.0 release retunes the compressor heuristics to give improvements to
|
||||
performance for trivial losses to image quality. It also includes some minor
|
||||
bug fixes and code quality improvements.
|
||||
|
||||
Reminder - the codec library API is not designed to be binary compatible across
|
||||
versions. We always recommend rebuilding your client-side code using the updated
|
||||
`astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Bug-fix:** Fixed context allocation for contexts allocated with the
|
||||
`ASTCENC_FLG_DECOMPRESS_ONLY` flag.
|
||||
* **Bug-fix:** Reduced use of `reinterpret_cast` in the core codec to
|
||||
avoid strict aliasing violations.
|
||||
* **Optimization:** `-medium` search quality no longer tests 4 partition
|
||||
encodings for block sizes between 25 and 83 texels (inclusive). This
|
||||
improves performance for a tiny drop in image quality.
|
||||
* **Optimization:** `-thorough` and higher search qualities no longer test the
|
||||
mode0 first search for block sizes between 25 and 83 texels (inclusive).
|
||||
This improves performance for a tiny drop in image quality.
|
||||
* **Optimization:** `TUNE_MAX_PARTITIONING_CANDIDATES` reduced from 32 to 8
|
||||
to reduce the size of stack allocated data structures. This causes a tiny
|
||||
drop in image quality for the `-verythorough` and `-exhaustive` presets.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.5.0
|
||||
|
||||
**Status:** June 2023
|
||||
|
||||
The 4.5.0 release is a maintenance release with small image quality
|
||||
improvements, and a number of build system quality of life improvements.
|
||||
|
||||
* **General:**
|
||||
* **Bug-fix:** Improved handling of compiler arguments in CMake, including
|
||||
consistent use of MSVC-style command line arguments for ClangCL.
|
||||
* **Bug-fix:** Invariant Clang builds now use `-ffp-model=precise` with
|
||||
`-ffp-contract=off` which is needed to restore invariance due to recent
|
||||
changes in compiler defaults.
|
||||
* **Change:** macOS binary releases are now distributed as a single universal
|
||||
binary for all platforms.
|
||||
* **Change:** Windows binary releases are now compiled with VS2022.
|
||||
* **Change:** Invariant MSVC builds for VS2022 now use `/fp:precise` instead
|
||||
of `/fp:strict`, which is now possible because precise no longer implies
|
||||
contraction. This should improve performance for MSVC builds.
|
||||
* **Change:** Non-invariant Clang builds now use `-ffp-model=precise` with
|
||||
`-ffp-contract=on`. This should improve performance on older Clang
|
||||
versions which defaulted to no contraction.
|
||||
* **Change:** Non-invariant MSVC builds for VS2022 now use `/fp:precise`
|
||||
with `/fp:contract`. This should improve performance for MSVC builds.
|
||||
* **Change:** CMake config variables now use an `ASTCENC_` prefix to add a
|
||||
namespace and group options when the library is used in a larger project.
|
||||
* **Change:** CMake config `ASTCENC_UNIVERSAL_BUILD` for building macOS
|
||||
universal binaries has been improved to include the `x86_64h` slice for
|
||||
AVX2 builds. Universal builds are now on by default for macOS, and always
|
||||
include NEON (arm64), SSE4.1 (x86_64), and AVX2 (x86_64h) variants.
|
||||
* **Change:** CMake config `ASTCENC_NO_INVARIANCE` has been inverted to
|
||||
remove the negated option, and is now `ASTCENC_INVARIANCE` with a default
|
||||
of `ON`. Disabling this option can substantially improve performance, but
|
||||
images can differ across platforms and compilers.
|
||||
* **Optimization:** Color quantization and packing for LDR RGB and RGBA has
|
||||
been vectorized to improve performance.
|
||||
* **Change:** Color quantization for LDR RGB and RGBA endpoints will now try
|
||||
multiple quantization packing methods, and pick the one with the lowest
|
||||
endpoint encoding error. This gives a minor image quality improvement, for
|
||||
no significant performance impact when combined with the vectorization
|
||||
optimizations.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.4.0
|
||||
|
||||
**Status:** March 2023
|
||||
|
||||
The 4.4.0 release is a minor release with image quality improvements, a small
|
||||
performance boost, and a few new quality-of-life features.
|
||||
|
||||
* **General:**
|
||||
* **Change:** Core library no longer checks availability of required
|
||||
instruction set extensions, such as SSE4.1 or AVX2. Checking compatibility
|
||||
is now the responsibility of the caller. See `astcenccli_entry.cpp` for
|
||||
an example of code performing this check.
|
||||
* **Change:** Core library can be built as a shared object by setting the
|
||||
`-DSHAREDLIB=ON` CMake option, resulting in e.g. `libastcenc-avx2-shared.so`.
|
||||
Note that the command line tool is always statically linked.
|
||||
* **Change:** Decompressed 3D images will now write one output file per
|
||||
slice, if the target format is a 2D image format.
|
||||
* **Change:** Command line errors print to stderr instead of stdout.
|
||||
* **Change:** Color encoding uses new quantization tables, that now factor
|
||||
in floating-point rounding if a distance tie is found when using the
|
||||
integer quant256 value. This improves image quality for 4x4 and 5x5 block
|
||||
sizes.
|
||||
* **Optimization:** Partition selection uses a simplified line calculation
|
||||
with a faster approximation. This improves performance for all block sizes.
|
||||
* **Bug-fix:** Fixed missing symbol error in decompressor-only builds.
|
||||
* **Bug-fix:** Fixed infinity handling in debug trace JSON files.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 4.3 release:**
|
||||
|
||||

|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.3.1
|
||||
|
||||
**Status:** January 2023
|
||||
|
||||
The 4.3.1 release is a minor maintenance release. No performance or image
|
||||
quality changes are expected.
|
||||
|
||||
* **General:**
|
||||
* **Bug-fix:** Fixed typo in `-2/3/4partitioncandidatelimit` CLI options.
|
||||
* **Bug-fix:** Fixed handling for `-3/4partitionindexlimit` CLI options.
|
||||
* **Bug-fix:** Updated to `stb_image.h` v2.28, which includes multiple fixes
|
||||
and improvements for image loading.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.3.0
|
||||
|
||||
**Status:** January 2023
|
||||
|
||||
The 4.3.0 release is an optimization release. There are minor performance
|
||||
and image quality improvements in this release.
|
||||
|
||||
Reminder - the codec library API is not designed to be binary compatible across
|
||||
versions. We always recommend rebuilding your client-side code using the updated
|
||||
`astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Bug-fix:** Use lower case `windows.h` include for MinGW compatibility.
|
||||
* **Change:** The `-mask` command line option, `ASTCENC_FLG_MAP_MASK` in the
|
||||
library API, has been removed.
|
||||
* **Optimization:** Always skip blue-contraction for `QUANT_256` encodings.
|
||||
This gives a small image quality improvement for the 4x4 block size.
|
||||
* **Optimization:** Always skip RGBO vector calculation for LDR encodings.
|
||||
* **Optimization:** Defer color packing and scrambling to physical layer.
|
||||
* **Optimization:** Remove folded `decimation_info` lookup tables. This
|
||||
significantly reduces compressor memory footprint and improves context
|
||||
creation time. Impact increases with the active block size.
|
||||
* **Optimization:** Increased trial and refinement pruning by using stricter
|
||||
target errors when determining whether to skip iterations.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 4.2 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.2.0
|
||||
|
||||
**Status:** November 2022
|
||||
|
||||
The 4.2.0 release is an optimization release. There are significant performance
|
||||
improvements, minor image quality improvements, and library interface changes in
|
||||
this release.
|
||||
|
||||
Reminder - the codec library API is not designed to be binary compatible across
|
||||
versions. We always recommend rebuilding your client-side code using the updated
|
||||
`astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Bug-fix:** Compression for RGB and RGBA base+offset encodings no
|
||||
longer generate endpoints with the incorrect blue-contract behavior.
|
||||
* **Bug-fix:** Lowest channel correlation calculation now correctly ignores
|
||||
constant color channels for the purposes of filtering 2 plane encodings.
|
||||
On average this improves both performance and image quality.
|
||||
* **Bug-fix:** ISA compatibility now checked in `config_init()` as well as
|
||||
in `context_alloc()`.
|
||||
* **Change:** Removed the low-weight count optimization, as more recent
|
||||
changes had significantly reduced its performance benefit. Option removed
|
||||
from both command line and configuration structure.
|
||||
* **Feature:** The `-exhaustive` mode now runs full trials on more
|
||||
partitioning candidates and block candidates. This improves image quality
|
||||
by 0.1 to 0.25 dB, but slows down compression by 3x. The `-verythorough`
|
||||
and `-thorough` modes also test more candidates.
|
||||
* **Feature:** A new preset, `-verythorough`, has been introduced to provide
|
||||
a standard performance point between `-thorough` and the re-tuned
|
||||
`-exhaustive` mode. This new mode is faster and higher quality than the
|
||||
`-exhaustive` preset in the 4.1 release.
|
||||
* **Feature:** The compressor can now independently vary the number of
|
||||
partitionings considered for error estimation for 2/3/4 partitions. This
|
||||
allows heuristics to put more effort into 2 partitions, and less into
|
||||
3/4 partitions.
|
||||
* **Feature:** The compressor can now run trials on a variable number of
|
||||
candidate partitionings, allowing high quality modes to explore more of the
|
||||
search space at the expense of slower compression. The number of trials is
|
||||
independently configurable for 2/3/4 partition cases.
|
||||
* **Optimization:** Introduce early-out threshold for 2/3/4 partition
|
||||
searches based on the results after 1 of 2 trials. This significantly
|
||||
improves performance for `-medium` and `-thorough` searches, for a minor
|
||||
loss in image quality.
|
||||
* **Optimization:** Reduce early-out threshold for 3/4 partition searches
|
||||
based on 2/3 partition results. This significantly improves performance,
|
||||
especially for `-thorough` searches, for a minor loss in image quality.
|
||||
* **Optimization:** Use direct vector compare to create a SIMD mask instead
|
||||
of a scalar compare that is broadcast to a vector mask.
|
||||
* **Optimization:** Remove obsolete partition validity masks from the
|
||||
partition selection algorithm.
|
||||
* **Optimization:** Removed obsolete channel scaling from partition
|
||||
`avgs_and_dirs()` calculation.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 4.0 and 4.1 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.1.0
|
||||
|
||||
**Status:** August 2022
|
||||
|
||||
The 4.1.0 release is a maintenance release. There is no performance or image
|
||||
quality change in this release.
|
||||
|
||||
* **General:**
|
||||
* **Change:** Command line decompressor no longer uses the legacy
|
||||
`GL_LUMINANCE` or `GL_LUMINANCE_ALPHA` format enums when writing KTX
|
||||
output files. Luminance textures now use the `GL_RED` format and
|
||||
luminance_alpha textures now use the `GL_RG` format.
|
||||
* **Change:** Command line tool gains a new `-dimage` option to generate
|
||||
diagnostic images showing aspects of the compression encoding. The output
|
||||
file name with its extension stripped is used as the stem of the diagnostic
|
||||
image file names.
|
||||
* **Bug-fix:** Library decompressor builds for SSE no longer use masked store
|
||||
`maskmovdqu` instructions, as they can generate faults on masked lanes.
|
||||
* **Bug-fix:** Command line decompressor now correctly uses sized type enums
|
||||
for the internal format when writing output KTX files.
|
||||
* **Bug-fix:** Command line compressor now correctly loads 16 and 32-bit per
|
||||
component input KTX files.
|
||||
* **Bug-fix:** Fixed GCC9 compiler warnings on Arm aarch64.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.0.0
|
||||
|
||||
**Status:** July 2022
|
||||
|
||||
The 4.0.0 release introduces some major performance enhancements, and a number
|
||||
of larger changes to the heuristics used in the codec to find a more effective
|
||||
cost:quality trade off.
|
||||
|
||||
* **General:**
|
||||
* **Change:** The `-array` option for specifying the number of image planes
|
||||
for ASTC 3D volumetric block compression has been renamed to `-zdim`.
|
||||
* **Change:** The build root package directory is now `bin` instead of
|
||||
`astcenc`, allowing the CMake install step to write binaries into
|
||||
`/usr/local/bin` if the user wishes to do so.
|
||||
* **Feature:** A new `-ssw` option for specifying the shader sampling swizzle
|
||||
has been added as a convenience alternative to the `-cw` option. This is
|
||||
needed to correct error weighting during compression if not all components
|
||||
are read in the shader. For example, to extract and compress two components
|
||||
from an RGBA input image, weighting the two components equally when
|
||||
sampling through .ra in the shader, use `-esw ggga -ssw ra`. In this
|
||||
example `-ssw ra` is equivalent to the alternative `-cw 1 0 0 1` encoding.
|
||||
* **Feature:** The `-a` alpha weighting option has been re-enabled in the
|
||||
backend, and now again applies alpha scaling to the RGB error metrics when
|
||||
encoding. This is based on the maximum alpha in each block, not the
|
||||
individual texel alpha values used in the earlier implementation.
|
||||
* **Feature:** The command line tool now has `-repeats <count>` for testing,
|
||||
which will iterate around compression and decompression `count` times.
|
||||
Reported performance metrics also now separate compression and
|
||||
decompression scores.
|
||||
* **Feature:** The core codec is now warning clean up to /W4 for both MSVC
|
||||
`cl.exe` and `clangcl.exe` compilers.
|
||||
* **Feature:** The core codec now supports arm64 for both MSVC `cl.exe` and
|
||||
`clangcl.exe` compilers.
|
||||
* **Feature:** `NO_INVARIANCE` builds will enable the `-ffp-contract=fast`
|
||||
option for all targets when using Clang or GCC. In addition AVX2 targets
|
||||
will also set the `-mfma` option. This reduces image quality by up to 0.2dB
|
||||
(normally much less), but improves performance by up to 5-20%.
|
||||
* **Optimization:** Angular endpoint min/max weight selection is restricted
|
||||
to weight `QUANT_11` or lower. Higher quantization levels assume default
|
||||
0-1 range, which is less accurate but much faster.
|
||||
* **Optimization:** Maximum weight quantization for later trials is selected
|
||||
based on the weight quantization of the best encoding from the 1 plane 1
|
||||
partition trial. This significantly reduces the search space for the later
|
||||
trials with more planes or partitions.
|
||||
* **Optimization:** Small data tables now use in-register SIMD permutes
|
||||
rather than gathers (AVX2) or unrolled scalar lookups (SSE/NEON). This can
|
||||
be a significant optimization for paths that are load unit limited.
|
||||
* **Optimization:** Decompressed image block writes in the decompressor now
|
||||
use a vectorized approach to writing each row of texels in the block,
|
||||
including the ability to exploit masked stores if the target supports them.
|
||||
* **Optimization:** Weight scrambling has been moved into the physical layer;
|
||||
the rest of the codec now uses linear order weights.
|
||||
* **Optimization:** Weight packing has been moved into the physical layer;
|
||||
the rest of the codec now uses unpacked weights in the 0-64 range.
|
||||
* **Optimization:** Consistently vectorize the creation of unquantized weight
|
||||
grids when they are needed.
|
||||
* **Optimization:** Remove redundant per-decimation mode copies of endpoint
|
||||
and weight structures, which were really read-only duplicates.
|
||||
* **Optimization:** Early-out the same endpoint mode color calculation if it
|
||||
cannot be applied.
|
||||
* **Optimization:** Numerous type size reductions applied to arrays to reduce
|
||||
both context working buffer size usage and stack usage.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 3.7 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2022-2024, Arm Limited and contributors. All rights reserved._
|
||||
|
Before Width: | Height: | Size: 111 KiB |
|
Before Width: | Height: | Size: 148 KiB |
|
Before Width: | Height: | Size: 141 KiB |
|
Before Width: | Height: | Size: 149 KiB |
|
Before Width: | Height: | Size: 134 KiB |
|
Before Width: | Height: | Size: 112 KiB |
|
Before Width: | Height: | Size: 120 KiB |
|
Before Width: | Height: | Size: 120 KiB |
|
Before Width: | Height: | Size: 123 KiB |
|
After Width: | Height: | Size: 121 KiB |
|
After Width: | Height: | Size: 126 KiB |
|
After Width: | Height: | Size: 116 KiB |
|
After Width: | Height: | Size: 108 KiB |
@@ -25,7 +25,7 @@ their compressed bitrate are shown in the table below.
|
||||
| BC3nm | G+R | 8 | BC1 G + BC4 R |
|
||||
| BC4 | R | 4 | L8 |
|
||||
| BC5 | R+G | 8 | BC1 R + BC1 G |
|
||||
| BC6 | RGB (HDR) | 8 | |
|
||||
| BC6H | RGB (HDR) | 8 | |
|
||||
| BC7 | RGB / RGBA | 8 | |
|
||||
| EAC_R11 | R | 4 | R11 |
|
||||
| EAC_RG11 | RG | 8 | RG11 |
|
||||
@@ -46,40 +46,40 @@ also a weakness (it reduces quality when compressing correlated signals).
|
||||
# ASTC Format Mapping
|
||||
|
||||
The main question which arises with the mapping of another format on to ASTC
|
||||
is how to handle cases where the input isn't a 4 channel RGBA input. ASTC is a
|
||||
container format which always decompresses in to a 4 channel RGBA result.
|
||||
is how to handle cases where the input isn't a 4 component RGBA input. ASTC is
|
||||
a container format which always decompresses in to a 4 component RGBA result.
|
||||
However, the internal compressed representation is very flexible and can store
|
||||
1-4 channels as needed on a per-block basis.
|
||||
1-4 components as needed on a per-block basis.
|
||||
|
||||
To get the best quality for a given bitrate, or the lowest bitrate for a given
|
||||
quality, it is important that as few channels as possible are stored in the
|
||||
quality, it is important that as few components as possible are stored in the
|
||||
internal representation to avoid wasting coding space.
|
||||
|
||||
Specific optimizations in the ASTC coding scheme exist for:
|
||||
|
||||
* Encoding the RGB channels as a single luminance channel, so only a single
|
||||
* Encoding the RGB components as a single luminance component, so only a single
|
||||
value needs to be stored in the coding instead of three.
|
||||
* Encoding the A channel as a constant 1.0 value, so the coding doesn't
|
||||
* Encoding the A component as a constant 1.0 value, so the coding doesn't
|
||||
actually need to store a per-pixel alpha value at all.
|
||||
|
||||
... so mapping your inputs given to the compressor to hit these paths is
|
||||
really important if you want to get the best output quality for your chosen
|
||||
bitrate.
|
||||
|
||||
## Encoding 1-4 channel data
|
||||
## Encoding 1-4 component data
|
||||
|
||||
The table below shows the recommended channel usage for data with different
|
||||
numbers of color channels present in the data.
|
||||
The table below shows the recommended component usage for data with different
|
||||
numbers of color components present in the data.
|
||||
|
||||
The coding swizzle should be applied when compressing an image. This can be
|
||||
handled by the compressor when reading an uncompressed input image by
|
||||
specifying the swizzle using the `-esw` command line option.
|
||||
|
||||
The sampling swizzle is what you should use in your shader programs to read
|
||||
the data from the compressed texture, assuming no additional API-level channel
|
||||
swizzling is specified by the application.
|
||||
the data from the compressed texture, assuming no additional API-level
|
||||
component swizzling is specified by the application.
|
||||
|
||||
| Input Channels | ASTC Endpoint | Coding Swizzle | Sampling Swizzle |
|
||||
| Input components | ASTC Endpoint | Coding Swizzle | Sampling Swizzle |
|
||||
| -------------- | ------------- | -------------- | ------------------ |
|
||||
| 1 | L + 1 | `rrr1` | `.g` <sup>1</sup> |
|
||||
| 2 | L + A | `rrrg` | `.ga` <sup>1</sup> |
|
||||
@@ -88,13 +88,13 @@ swizzling is specified by the application.
|
||||
|
||||
**1:** Sampling from `g` is preferred to sampling from `r` because it allows a
|
||||
single shader to be compatible with ASTC, BC1, or ETC formats. BC1 and ETC1
|
||||
store color endpoints as RGB565 data, so the `g` channel will have higher
|
||||
store color endpoints as RGB565 data, so the `g` component will have higher
|
||||
precision. For ASTC it doesn't actually make any difference; the same single
|
||||
channel luminance will be returned for all three of the `.rgb` channels.
|
||||
component luminance will be returned for all three of the `.rgb` components.
|
||||
|
||||
## Equivalence with other formats
|
||||
|
||||
Based on these channel encoding requirements we can now derive the ASTC
|
||||
Based on these component encoding requirements we can now derive the ASTC
|
||||
coding equivalents for most of the other texture compression formats in common
|
||||
use today.
|
||||
|
||||
@@ -105,7 +105,7 @@ use today.
|
||||
| BC3nm | `gggr` | `.ag` | |
|
||||
| BC4 | `rrr1` | `.r` | |
|
||||
| BC5 | `rrrg` | `.ra` <sup>2</sup> | |
|
||||
| BC6 | `rgb1` | `.rgb` | HDR profile only |
|
||||
| BC6H | `rgb1` | `.rgb` <sup>3</sup> | HDR profile only |
|
||||
| BC7 | `rgba` | `.rgba` | |
|
||||
| EAC_R11 | `rrr1` | `.r` | |
|
||||
| EAC_RG11 | `rrrg` | `.ra` <sup>2</sup> | |
|
||||
@@ -115,38 +115,66 @@ use today.
|
||||
| ETC2+EAC | `rgba` | `.rgba` | |
|
||||
|
||||
**1:** ASTC has no equivalent of the 1-bit punch-through alpha encoding
|
||||
supported by BC1 or ETC2; if alpha is present it will be a full alpha channel.
|
||||
supported by BC1 or ETC2; if alpha is present it will be a full alpha
|
||||
component.
|
||||
|
||||
**2:** ASTC relies on using the L+A color endpoint type for coding efficiency
|
||||
for two channel data. It therefore has no direct equivalent of a two-plane
|
||||
format sampled through the `.rg` channels such as BC5 or EAC_RG11. This can
|
||||
be emulated by setting texture channel swizzles in the runtime API - e.g. via
|
||||
for two component data. It therefore has no direct equivalent of a two-plane
|
||||
format sampled through the `.rg` components such as BC5 or EAC_RG11. This can
|
||||
be emulated by setting texture component swizzles in the runtime API - e.g. via
|
||||
`glTexParameteri()` for OpenGL ES - although it has been noted that API
|
||||
controlled swizzles are not available in WebGL.
|
||||
|
||||
**3:** ASTC can only store unsigned values, and has no equivalent of the BC6
|
||||
signed endpoint mode.
|
||||
|
||||
# Other Considerations
|
||||
|
||||
This section outlines some of the other things to consider when encoding
|
||||
textures using ASTC.
|
||||
|
||||
## Encoding non-correlated channels
|
||||
## Decode mode extensions
|
||||
|
||||
Most other texture compression formats have a static channel assignment in
|
||||
ASTC is specified to decompress into a 16-bit per component RGBA output by
|
||||
default, with the exception of the sRGB format which uses an 8-bit value for the
|
||||
RGB components.
|
||||
|
||||
Decompressing in to a 16-bit per component output format often gives higher precision than
|
||||
many use cases require, especially for LDR textures which originally came from
|
||||
an 8-bit per component source image. Most implementations of ASTC support the
|
||||
decode mode extensions, which allow an application to opt-in to a lower
|
||||
precision decompressed format (RGBA8 for LDR, RGB9E5 for HDR). Using these
|
||||
extensions can improve GPU texture cache efficiency, and even improve texture
|
||||
filtering throughput, for use cases that do not need the higher precision.
|
||||
|
||||
The ASTC format uses different data rounding rules when the decode mode
|
||||
extensions are used. To ensure that the compressor chooses the best encodings
|
||||
for the RGBA8 rounding rules, you can specify `-decode_unorm8` when compressing
|
||||
textures that will be decompressed into the RGBA8 intermediate. This gives a
|
||||
small image quality boost.
|
||||
|
||||
**Note:** This mode is automatically enabled if you use the `astcenc`
|
||||
decompressor to write an 8-bit per component output image.
|
||||
|
||||
## Encoding non-correlated components
|
||||
|
||||
Most other texture compression formats have a static component assignment in
|
||||
terms of the expected data correlation. For example, ETC2+EAC assumes that RGB
|
||||
are always correlated and that alpha is non-correlated. ASTC can automatically
|
||||
encode data as either fully correlated across all 4 channels, or with any one
|
||||
channel assigned to a separate non-correlated partition to the other three.
|
||||
encode data as either fully correlated across all 4 components, or with any one
|
||||
component assigned to a separate non-correlated partition to the other three.
|
||||
|
||||
The non-correlated channel can be changed on a block-by-block basis, so the
|
||||
The non-correlated component can be changed on a block-by-block basis, so the
|
||||
compressor can dynamically adjust the coding based on the data present in the
|
||||
image. This means that there is no need for non-correlated data to be stored
|
||||
in a specific channel in the input image.
|
||||
in a specific component in the input image.
|
||||
|
||||
It is however worth noting that the alpha channel is treated differently to
|
||||
the RGB color channels in some circumstances:
|
||||
It is however worth noting that the alpha component is treated differently to
|
||||
the RGB color components in some circumstances:
|
||||
|
||||
* When coding for sRGB the alpha channel will always be stored in linear space.
|
||||
* When coding for HDR the alpha channel can optionally be kept as LDR data.
|
||||
* When coding for sRGB the alpha component will always be stored in linear
|
||||
space.
|
||||
* When coding for HDR the alpha component can optionally be kept as LDR data.
|
||||
|
||||
## Encoding normal maps
|
||||
|
||||
@@ -155,21 +183,21 @@ BC5; store the X and Y components of a unit-length normal. The Z component of
|
||||
the normal can be reconstructed in shader code based on the knowledge that the
|
||||
vector is unit length.
|
||||
|
||||
To encode this we therefore want to store two input channels and should
|
||||
therefore use the `rrrg` coding swizzle, and the `.ga` sampling swizzle. The
|
||||
OpenGL ES shader code for reconstruction of the Z value is:
|
||||
To encode this we need to store only two input components in the compressed
|
||||
data, and therefore use the `rrrg` coding swizzle to align the data with the
|
||||
ASTC luminance+alpha endpoint. We can sample this in shader code using the
|
||||
`.ga` sampling swizzle, and reconstruct the Z value with:
|
||||
|
||||
vec3 nml;
|
||||
nml.xy = texture(...).ga; // Load normals (range 0 to 1)
|
||||
nml.xy = nml.xy * 2.0 - 1.0; // Unpack normals (range -1 to +1)
|
||||
nml.z = sqrt(1 - dot(nml.xy, nml.xy)); // Compute Z, given unit length
|
||||
|
||||
In addition to this it is useful to optimize for angular error in the resulting
|
||||
vector rather than for absolute color error in the data, which improves the
|
||||
perceptual quality of the image.
|
||||
|
||||
Both the encoding swizzle and the angular error function are enabled by using
|
||||
the `-normal` command line option.
|
||||
The encoding swizzle and appropriate component weighting is enabled by using
|
||||
the `-normal` command line option. If you wish to use a different pair of
|
||||
components you can specify a custom swizzle after setting the `-normal`
|
||||
parameter. For example, to match BC5n component ordering use
|
||||
`-normal -esw gggr` for compression and `-normal -dsw arz1` for decompression.
|
||||
|
||||
## Encoding sRGB data
|
||||
|
||||
@@ -182,8 +210,8 @@ For color data it is nearly always a perceptual quality win to use sRGB input
|
||||
source textures that are then compressed using the ASTC sRGB compression mode
|
||||
(compress using the `-cs` command line option rather than the `-cl` command
|
||||
line option). Note that sRGB gamma correction is only applied to the RGB
|
||||
channels during decode; the alpha channel is always treated as linear encoded
|
||||
data.
|
||||
components during decode; the alpha component is always treated as linear
|
||||
encoded data.
|
||||
|
||||
*Important:* The uncompressed input texture provided on the command line must
|
||||
be stored in the sRGB color space for `-cs` to function correctly.
|
||||
@@ -191,17 +219,17 @@ be stored in the sRGB color space for `-cs` to function correctly.
|
||||
## Encoding HDR data
|
||||
|
||||
HDR data can be encoded just like LDR data, but with some caveats around
|
||||
handling the alpha channel.
|
||||
handling the alpha component.
|
||||
|
||||
For many use cases the alpha channel is an actual alpha opacity channel and is
|
||||
therefore used for storing an LDR value between 0 and 1. For these cases use
|
||||
the `-ch` compressor option which will treat the RGB channels as HDR, but the
|
||||
A channel as LDR.
|
||||
For many use cases the alpha component is an actual alpha opacity component and
|
||||
is therefore used for storing an LDR value between 0 and 1. For these cases use
|
||||
the `-ch` compressor option which will treat the RGB components as HDR, but the
|
||||
A component as LDR.
|
||||
|
||||
For other use cases the alpha channel is simply a fourth data channel which is
|
||||
also storing an HDR value. For these cases use the `-cH` compressor option
|
||||
which will treat all channels as HDR data.
|
||||
For other use cases the alpha component is simply a fourth data component which
|
||||
is also storing an HDR value. For these cases use the `-cH` compressor option
|
||||
which will treat all components as HDR data.
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
|
||||
_Copyright © 2019-2024, Arm Limited and contributors. All rights reserved._
|
||||
|
||||
@@ -20,8 +20,8 @@ We support a small (but growing) number of C++ unit tests, which are written
|
||||
using the `googletest` framework and integrated in the CMake "CTest" test
|
||||
framework.
|
||||
|
||||
To build unit tests pull the `googletest` git submodule and add `-DUNITTEST=ON`
|
||||
to the CMake command line when configuring.
|
||||
To build unit tests pull the `googletest` git submodule and add
|
||||
`-DASTCENC_UNITTEST=ON` to the CMake command line when configuring.
|
||||
|
||||
To run unit tests use the CMake `ctest` utility from your build directory after
|
||||
you have built the tests.
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
"Name":"astc-encoder",
|
||||
"License":"Apache 2.0 License",
|
||||
"License File":"LICENSE",
|
||||
"Version Number":"3.7",
|
||||
"Version Number":"4.7",
|
||||
"Owner":"wangyonglang@huawei.com",
|
||||
"Upstream URL":"https://github.com/ARM-software/astc-encoder.git",
|
||||
"Description":"The Arm Adaptive Scalable Texture Compression (ASTC) Encoder, astcenc, is a command-line tool for compressing and decompressing images using the ASTC texture compression standard."
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
# About
|
||||
|
||||
This is the official repository for the Arm® Adaptive Scalable Texture
|
||||
Compression (ASTC) Encoder, `astcenc`, a command-line tool for compressing
|
||||
and decompressing images using the ASTC texture compression standard.
|
||||
The Arm® Adaptive Scalable Texture Compression (ASTC) Encoder, `astcenc`, is
|
||||
a command-line tool for compressing and decompressing images using the ASTC
|
||||
texture compression standard.
|
||||
|
||||
## The ASTC format
|
||||
|
||||
The ASTC compressed data format, developed by Arm® and AMD, has been adopted as
|
||||
an official extension to the Open GL®, OpenGL ES, and Vulkan® graphics APIs. It
|
||||
an official extension to the OpenGL®, OpenGL ES, and Vulkan® graphics APIs. It
|
||||
provides a major step forward in terms of both the image quality at a given
|
||||
bitrate, and the format and bitrate flexibility available to content creators.
|
||||
This allows more assets to use compression, often at a reduced bitrate compared
|
||||
@@ -20,7 +20,7 @@ read the full [Khronos Data Format Specification][2] for all the details.
|
||||
|
||||
This project is licensed under the Apache 2.0 license. By downloading any
|
||||
component from this repository you acknowledge that you accept terms specified
|
||||
in the [LICENSE](LICENSE) file.
|
||||
in the [LICENSE.txt](LICENSE.txt) file.
|
||||
|
||||
# Encoder feature support
|
||||
|
||||
@@ -33,8 +33,8 @@ dynamic range (BMP, PNG, TGA), high dynamic range (EXR, HDR), or DDS and KTX
|
||||
wrapped output images.
|
||||
|
||||
The encoder allows control over the compression time/quality tradeoff with
|
||||
`exhaustive`, `thorough`, `medium`, `fast`, and `fastest` encoding quality
|
||||
presets.
|
||||
`exhaustive`, `verythorough`, `thorough`, `medium`, `fast`, and `fastest`
|
||||
encoding quality presets.
|
||||
|
||||
The encoder allows compression time and quality analysis by reporting the
|
||||
compression time, and the Peak Signal-to-Noise Ratio (PSNR) between the input
|
||||
@@ -58,15 +58,15 @@ from 0.89 bits/pixel up to 8 bits/pixel.
|
||||
Release build binaries for the `astcenc` stable releases are provided in the
|
||||
[GitHub Releases page][3].
|
||||
|
||||
**Latest 3.x stable release:** 3.7
|
||||
* Change log: [4.x series](./Docs/ChangeLog-4x.md)
|
||||
* Change log: [3.x series](./Docs/ChangeLog-3x.md)
|
||||
|
||||
**Latest 2.x stable release:** 2.5
|
||||
* Change log: [2.x series](./Docs/ChangeLog-2x.md)
|
||||
Binaries are provided for 64-bit builds on Windows, macOS, and Linux.
|
||||
|
||||
Binaries are provided for 64-bit builds on Windows, macOS, and Linux. The
|
||||
builds of the astcenc are provided as multiple binaries, each tuned for a
|
||||
specific SIMD instruction set.
|
||||
## Windows and Linux
|
||||
|
||||
For Windows and Linux the builds of the astcenc are provided as multiple
|
||||
binaries, each tuned for a specific SIMD instruction set.
|
||||
|
||||
For x86-64 we provide, in order of increasing performance:
|
||||
|
||||
@@ -78,23 +78,33 @@ The x86-64 SSE2 builds will work on all x86-64 machines, but it is the slowest
|
||||
of the three. The other two require extended CPU instruction set support which
|
||||
is not universally available, but each step gains ~15% more performance.
|
||||
|
||||
For Apple silicon macOS devices we provide:
|
||||
For Arm, if binaries are available, we provide:
|
||||
|
||||
* `astcenc-neon` - uses NEON
|
||||
|
||||
## macOS
|
||||
|
||||
For macOS devices we provide a single universal binary `astcenc`, which allows
|
||||
the OS to automatically use the correct binary variant for the current host
|
||||
machine. Support is provided for three architecture slices:
|
||||
|
||||
* `x86_64` - uses the `astcenc-sse4.1` build defined above.
|
||||
* `x86_64h` - uses the `astcenc-avx2` build defined above.
|
||||
* `arm64` - uses the `astcenc-neon` build defined above.
|
||||
|
||||
## Repository branches
|
||||
|
||||
The `main` branch is an active development branch for the compressor. It aims
|
||||
to be a stable branch, but as it is used for ongoing development expect it to
|
||||
have some volatility.
|
||||
to be a stable branch for the latest major release series, but as it is used
|
||||
for ongoing development expect it to have some volatility. We recommend using
|
||||
the latest stable release tag for production development.
|
||||
|
||||
The `2.x` branch is a stable branch for the 2.x release series. It is no longer
|
||||
under active development, but is a supported branch that will continue to get
|
||||
The `3.x` branch is a stable branch for the 3.x release series. It is no longer
|
||||
under active development, but is a supported branch that continues to get
|
||||
backported bug fixes.
|
||||
|
||||
The `1.x` branch is a stable branch for the 1.x release series. It is no longer
|
||||
under active development or getting bug fixes.
|
||||
The `1.x` and `2.x` branches are stable branches for older releases. They are
|
||||
no longer under active development or getting bug fixes.
|
||||
|
||||
Any other branches you might find are development branches for new features or
|
||||
optimizations, so might be interesting to play with but should be considered
|
||||
@@ -135,6 +145,11 @@ The modes available are:
|
||||
* `-ch` : use the HDR color profile, tuned for HDR RGB and LDR A.
|
||||
* `-cH` : use the HDR color profile, tuned for HDR RGBA.
|
||||
|
||||
If you intend to use the resulting image with the decode mode extensions to
|
||||
limit the decompressed precision to UNORM8, it is recommended that you also
|
||||
specify the `-decode_unorm8` flag. This will ensure that the compressor uses
|
||||
the correct rounding rules when choosing encodings.
|
||||
|
||||
## Decompressing an image
|
||||
|
||||
Decompress an image using the `-dl` \ `-ds` \ `-dh` \ `-dH` modes. For example:
|
||||
@@ -180,11 +195,6 @@ The compression speed can be controlled from `-fastest`, through `-fast`,
|
||||
encoder has to spend looking for good encodings the better the results, but it
|
||||
does result in increasingly small improvements for the amount of time required.
|
||||
|
||||
:warning: The `-fastest` quality preset is designed for quickly roughing-out
|
||||
new content. It is tuned to give the fastest possible compression, often at the
|
||||
expense of significant image quality loss compared to `-fast`. We do not
|
||||
recommend using it for production builds.
|
||||
|
||||
There are many other command line options for tuning the encoder parameters
|
||||
which can be used to fine tune the compression algorithm. See the command line
|
||||
help message for more details.
|
||||
@@ -203,6 +213,9 @@ It covers:
|
||||
* How to efficiently encode normal maps, sRGB data, and HDR data.
|
||||
* Coding equivalents to other compression formats.
|
||||
|
||||
The [ASTC Developer Guide][5] document (external link) provides a more detailed
|
||||
guide for developers using the `astcenc` compressor.
|
||||
|
||||
The [.astc File Format](./Docs/FileFormat.md) page provides a light-weight
|
||||
specification for the `.astc` file format and how to read or write it.
|
||||
|
||||
@@ -217,10 +230,16 @@ how to test any modifications to the source code in this repository.
|
||||
If you have issues with the `astcenc` encoder, or questions about the ASTC
|
||||
texture format itself, please raise them in the GitHub issue tracker.
|
||||
|
||||
If you have any questions about Arm GPUs, application development for Arm GPUs,
|
||||
or general mobile graphics development or technology please submit them on the
|
||||
[Arm Community graphics forums][4].
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2013-2022, Arm Limited and contributors. All rights reserved._
|
||||
_Copyright © 2013-2024, Arm Limited and contributors. All rights reserved._
|
||||
|
||||
[1]: ./Docs/FormatOverview.md
|
||||
[2]: https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#ASTC
|
||||
[3]: https://github.com/ARM-software/astc-encoder/releases
|
||||
[4]: https://community.arm.com/support-forums/f/graphics-gaming-and-vr-forum/
|
||||
[5]: https://developer.arm.com/documentation/102162/latest/?lang=en
|
||||
|
||||
@@ -41,7 +41,7 @@ astc-encoder引入openharmony的thirdparty目录下,
|
||||
```
|
||||
./build.sh --product-name rk3568 --ccache
|
||||
```
|
||||
编译生成物对应路径:`out/rk3568/thirdparty/astc-encoder/libastc_encoder_shared.so`。
|
||||
编译生成物对应路径:`out/rk3568/thirdparty/astc-encoder/libastc_encoder_shared.z.so`。
|
||||
|
||||
## 许可证<a name="section126611612164217"></a>
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ----------------------------------------------------------------------------
|
||||
# Copyright 2020-2021 Arm Limited
|
||||
# Copyright 2020-2023 Arm Limited
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
# use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -17,45 +17,80 @@
|
||||
|
||||
# Overwrite the LTO flags to force fat LTO; worth 3-4% performance
|
||||
# See https://gitlab.kitware.com/cmake/cmake/-/issues/16808
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang" AND ${CLI})
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang" AND ${ASTCENC_CLI})
|
||||
set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto")
|
||||
endif()
|
||||
|
||||
if(${DECOMPRESSOR})
|
||||
set(CODEC dec)
|
||||
if(${ASTCENC_DECOMPRESSOR})
|
||||
set(ASTCENC_CODEC dec)
|
||||
else()
|
||||
set(CODEC enc)
|
||||
set(ASTCENC_CODEC enc)
|
||||
endif()
|
||||
|
||||
if(${UNIVERSAL_BUILD})
|
||||
if(${ISA_AVX2})
|
||||
set(ISA_SIMD "avx2")
|
||||
elseif(${ISA_SSE41})
|
||||
set(ISA_SIMD "sse4.1")
|
||||
elseif(${ISA_SSE2})
|
||||
set(ISA_SIMD "sse2")
|
||||
endif()
|
||||
include(cmake_core.cmake)
|
||||
else()
|
||||
set(ARTEFACTS native none neon avx2 sse4.1 sse2)
|
||||
set(CONFIGS ${ISA_NATIVE} ${ISA_NONE} ${ISA_NEON} ${ISA_AVX2} ${ISA_SSE41} ${ISA_SSE2})
|
||||
list(LENGTH ARTEFACTS ARTEFACTS_LEN)
|
||||
math(EXPR ARTEFACTS_LEN "${ARTEFACTS_LEN} - 1")
|
||||
set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2)
|
||||
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
|
||||
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
|
||||
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")
|
||||
|
||||
foreach(INDEX RANGE ${ARTEFACTS_LEN})
|
||||
list(GET ARTEFACTS ${INDEX} ARTEFACT)
|
||||
list(GET CONFIGS ${INDEX} CONFIG)
|
||||
if(${CONFIG})
|
||||
set(ISA_SIMD ${ARTEFACT})
|
||||
include(cmake_core.cmake)
|
||||
foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
|
||||
list(GET ASTCENC_ARTIFACTS ${INDEX} ASTCENC_ARTIFACT)
|
||||
list(GET ASTCENC_CONFIGS ${INDEX} ASTCENC_CONFIG)
|
||||
if(${ASTCENC_CONFIG})
|
||||
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})
|
||||
|
||||
if(${ASTCENC_ISA_SIMD} MATCHES "neon")
|
||||
set(CMAKE_OSX_ARCHITECTURES arm64)
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64h)
|
||||
elseif(NOT ${ASTCENC_ISA_SIMD} MATCHES "none")
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
# Universal command line binary: only when both the CLI and a universal build
# are requested. Defines an always-built custom target that runs `lipo` to
# combine the sse4.1 (x86_64), avx2 (x86_64h), and neon (arm64) executables
# into one file named astc${ASTCENC_CODEC}, written next to the sse4.1 binary.
if(${ASTCENC_CLI} AND ${ASTCENC_UNIVERSAL_BUILD})
|
||||
add_custom_target(
|
||||
astc${ASTCENC_CODEC}
|
||||
ALL
|
||||
COMMAND
|
||||
lipo -create -output $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1>/astc${ASTCENC_CODEC} -arch x86_64 $<TARGET_FILE:astc${ASTCENC_CODEC}-sse4.1> -arch x86_64h $<TARGET_FILE:astc${ASTCENC_CODEC}-avx2> -arch arm64 $<TARGET_FILE:astc${ASTCENC_CODEC}-neon>
|
||||
VERBATIM)
|
||||
|
||||
# The three per-ISA executables must be built before lipo can merge them.
add_dependencies(
|
||||
astc${ASTCENC_CODEC}
|
||||
astc${ASTCENC_CODEC}-sse4.1
|
||||
astc${ASTCENC_CODEC}-avx2
|
||||
astc${ASTCENC_CODEC}-neon)
|
||||
|
||||
# The merged file is a lipo output rather than a CMake executable target, so
# it is installed by path with install(PROGRAMS) into bin.
install(PROGRAMS $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1>/astc${ASTCENC_CODEC}
|
||||
DESTINATION bin)
|
||||
endif()
|
||||
|
||||
# Universal shared library: only when both the shared library and a universal
# build are requested. Defines an always-built custom target that runs `lipo`
# to combine the sse4.1 (x86_64), avx2 (x86_64h), and neon (arm64) shared
# libraries into libastc${ASTCENC_CODEC}-shared.dylib, written next to the
# sse4.1 shared library.
if(${ASTCENC_SHAREDLIB} AND ${ASTCENC_UNIVERSAL_BUILD})
|
||||
add_custom_target(
|
||||
astc${ASTCENC_CODEC}-shared
|
||||
ALL
|
||||
COMMAND
|
||||
lipo -create -output $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1-shared>/libastc${ASTCENC_CODEC}-shared.dylib -arch x86_64 $<TARGET_FILE:astc${ASTCENC_CODEC}-sse4.1-shared> -arch x86_64h $<TARGET_FILE:astc${ASTCENC_CODEC}-avx2-shared> -arch arm64 $<TARGET_FILE:astc${ASTCENC_CODEC}-neon-shared>
|
||||
VERBATIM)
|
||||
|
||||
# The three per-ISA shared libraries must be built before lipo can merge them.
add_dependencies(
|
||||
astc${ASTCENC_CODEC}-shared
|
||||
astc${ASTCENC_CODEC}-sse4.1-shared
|
||||
astc${ASTCENC_CODEC}-avx2-shared
|
||||
astc${ASTCENC_CODEC}-neon-shared)
|
||||
|
||||
# The merged dylib is a lipo output rather than a CMake library target, so it
# is installed by path with install(PROGRAMS) into lib.
install(PROGRAMS $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1-shared>/libastc${ASTCENC_CODEC}-shared.dylib
|
||||
DESTINATION lib)
|
||||
endif()
|
||||
|
||||
# - - - - - - - - - - - - - - - - - -
|
||||
# Unit testing
|
||||
if(${UNITTEST})
|
||||
if(${ASTCENC_UNITTEST})
|
||||
set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64;arm64)
|
||||
add_subdirectory(GoogleTest)
|
||||
enable_testing()
|
||||
add_subdirectory(UnitTest)
|
||||
|
||||
@@ -94,9 +94,9 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
|
||||
int i = stream.ConsumeIntegralInRange<int>(0, testSz.size() - 1);
|
||||
|
||||
// Populate the physical block
|
||||
physical_compressed_block pcb;
|
||||
uint8_t pcb[16];
|
||||
std::vector<uint8_t> buffer = stream.ConsumeBytes<uint8_t>(16);
|
||||
std::memcpy(&pcb, buffer.data(), 16);
|
||||
std::memcpy(pcb, buffer.data(), 16);
|
||||
|
||||
// Call the function under test
|
||||
symbolic_compressed_block scb;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ----------------------------------------------------------------------------
|
||||
# Copyright 2020-2021 Arm Limited
|
||||
# Copyright 2020-2023 Arm Limited
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
# use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -15,27 +15,25 @@
|
||||
# under the License.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
if(${UNIVERSAL_BUILD})
|
||||
if(${ISA_AVX2})
|
||||
set(ISA_SIMD "avx2")
|
||||
elseif(${ISA_SSE41})
|
||||
set(ISA_SIMD "sse4.1")
|
||||
elseif(${ISA_SSE2})
|
||||
set(ISA_SIMD "sse2")
|
||||
endif()
|
||||
include(cmake_core.cmake)
|
||||
else()
|
||||
set(ARTEFACTS native none neon avx2 sse4.1 sse2)
|
||||
set(CONFIGS ${ISA_NATIVE} ${ISA_NONE} ${ISA_NEON} ${ISA_AVX2} ${ISA_SSE41} ${ISA_SSE2})
|
||||
list(LENGTH ARTEFACTS ARTEFACTS_LEN)
|
||||
math(EXPR ARTEFACTS_LEN "${ARTEFACTS_LEN} - 1")
|
||||
set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2)
|
||||
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
|
||||
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
|
||||
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")
|
||||
|
||||
foreach(INDEX RANGE ${ARTEFACTS_LEN})
|
||||
list(GET ARTEFACTS ${INDEX} ARTEFACT)
|
||||
list(GET CONFIGS ${INDEX} CONFIG)
|
||||
if(${CONFIG})
|
||||
set(ISA_SIMD ${ARTEFACT})
|
||||
include(cmake_core.cmake)
|
||||
foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
|
||||
list(GET ASTCENC_ARTIFACTS ${INDEX} ASTCENC_ARTIFACT)
|
||||
list(GET ASTCENC_CONFIGS ${INDEX} ASTCENC_CONFIG)
|
||||
if(${ASTCENC_CONFIG})
|
||||
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})
|
||||
|
||||
if(${ASTCENC_ISA_SIMD} MATCHES "neon")
|
||||
set(CMAKE_OSX_ARCHITECTURES arm64)
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64h)
|
||||
elseif(NOT ${ASTCENC_ISA_SIMD} MATCHES "none")
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64)
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ----------------------------------------------------------------------------
|
||||
# Copyright 2020-2021 Arm Limited
|
||||
# Copyright 2020-2023 Arm Limited
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
# use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -15,25 +15,34 @@
|
||||
# under the License.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
if(${UNIVERSAL_BUILD})
|
||||
set(ASTC_TEST test-unit)
|
||||
else()
|
||||
set(ASTC_TEST test-unit-${ISA_SIMD})
|
||||
set(ASTCENC_TEST test-unit-${ASTCENC_ISA_SIMD})
|
||||
|
||||
add_executable(${ASTCENC_TEST})
|
||||
|
||||
# Enable LTO under the conditions where the codec library will use LTO.
|
||||
# The library link will fail if the settings don't match
|
||||
if(${ASTCENC_CLI})
|
||||
set_property(TARGET ${ASTCENC_TEST}
|
||||
PROPERTY
|
||||
INTERPROCEDURAL_OPTIMIZATION_RELEASE True)
|
||||
endif()
|
||||
|
||||
add_executable(${ASTC_TEST})
|
||||
|
||||
target_sources(${ASTC_TEST}
|
||||
target_sources(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
test_simd.cpp
|
||||
test_softfloat.cpp
|
||||
test_decode.cpp
|
||||
../astcenc_mathlib_softfloat.cpp)
|
||||
|
||||
target_include_directories(${ASTC_TEST}
|
||||
target_include_directories(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
${gtest_SOURCE_DIR}/include)
|
||||
|
||||
target_compile_options(${ASTC_TEST}
|
||||
target_link_libraries(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
astcenc-${ASTCENC_ISA_SIMD}-static)
|
||||
|
||||
target_compile_options(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
# Use pthreads on Linux/macOS
|
||||
$<$<PLATFORM_ID:Linux,Darwin>:-pthread>
|
||||
@@ -47,92 +56,83 @@ target_compile_options(${ASTC_TEST}
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wpedantic>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Werror>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wshadow>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wdouble-promotion>)
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-c++98-compat-pedantic>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-c++98-c++11-compat-pedantic>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-float-equal>
|
||||
|
||||
# Ignore things that the googletest build triggers
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-unknown-warning-option>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-double-promotion>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-undef>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-reserved-identifier>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-global-constructors>)
|
||||
|
||||
# Set up configuration for SIMD ISA builds
|
||||
if(${ISA_SIMD} MATCHES "none")
|
||||
if(NOT ${UNIVERSAL_BUILD})
|
||||
target_compile_definitions(${ASTC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
endif()
|
||||
|
||||
elseif(${ISA_SIMD} MATCHES "neon")
|
||||
if(NOT ${UNIVERSAL_BUILD})
|
||||
target_compile_definitions(${ASTC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=1
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
endif()
|
||||
|
||||
elseif(${ISA_SIMD} MATCHES "sse2")
|
||||
if(NOT ${UNIVERSAL_BUILD})
|
||||
target_compile_definitions(${ASTC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=20
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
endif()
|
||||
|
||||
target_compile_options(${ASTC_TEST}
|
||||
if(${ASTCENC_ISA_SIMD} MATCHES "none")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
elseif(${ISA_SIMD} MATCHES "sse4.1")
|
||||
if(NOT ${UNIVERSAL_BUILD})
|
||||
target_compile_definitions(${ASTC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=1
|
||||
ASTCENC_F16C=0)
|
||||
endif()
|
||||
|
||||
target_compile_options(${ASTC_TEST}
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -msse4.1 -mpopcnt>)
|
||||
ASTCENC_NEON=1
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
elseif(${ISA_SIMD} MATCHES "avx2")
|
||||
if(NOT ${UNIVERSAL_BUILD})
|
||||
target_compile_definitions(${ASTC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=2
|
||||
ASTCENC_POPCNT=1
|
||||
ASTCENC_F16C=1)
|
||||
endif()
|
||||
|
||||
target_compile_options(${ASTC_TEST}
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -mavx2 -mpopcnt -mf16c>
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=20
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
target_compile_options(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-msse2>)
|
||||
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=1
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
target_compile_options(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-msse4.1 -mpopcnt>)
|
||||
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=2
|
||||
ASTCENC_POPCNT=1
|
||||
ASTCENC_F16C=1)
|
||||
|
||||
target_compile_options(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mavx2 -mpopcnt -mf16c>
|
||||
$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>)
|
||||
|
||||
endif()
|
||||
|
||||
target_compile_options(${ASTC_TEST}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:${CLANG_LIKE}>:-fsanitize=undefined>)
|
||||
|
||||
target_link_options(${ASTC_TEST}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:${CLANG_LIKE}>:-fsanitize=undefined>)
|
||||
|
||||
target_link_libraries(${ASTC_TEST}
|
||||
target_link_libraries(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
gtest_main)
|
||||
|
||||
add_test(NAME ${ASTC_TEST}
|
||||
COMMAND ${ASTC_TEST})
|
||||
add_test(NAME ${ASTCENC_TEST}
|
||||
COMMAND ${ASTCENC_TEST})
|
||||
|
||||
install(TARGETS ${ASTC_TEST} DESTINATION ${PACKAGE_ROOT})
|
||||
install(TARGETS ${ASTCENC_TEST})
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Unit tests for the image decompression functionality.
|
||||
*/
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "../astcenc.h"
|
||||
|
||||
namespace astcenc
|
||||
{
|
||||
|
||||
/** @brief Test harness for exploring issue #447. */
|
||||
TEST(decode, decode12x12)
|
||||
{
|
||||
astcenc_error status;
|
||||
astcenc_config config;
|
||||
astcenc_context* context;
|
||||
|
||||
static const astcenc_swizzle swizzle {
|
||||
ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
|
||||
};
|
||||
|
||||
uint8_t data[16] {
|
||||
#if 0
|
||||
0x84,0x00,0x38,0xC8,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0xB3,0x4D,0x78
|
||||
#else
|
||||
0x29,0x00,0x1A,0x97,0x01,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0xCF,0x97,0x86
|
||||
#endif
|
||||
};
|
||||
|
||||
uint8_t output[12*12*4];
|
||||
astcenc_config_init(ASTCENC_PRF_LDR, 12, 12, 1, ASTCENC_PRE_MEDIUM, 0, &config);
|
||||
|
||||
status = astcenc_context_alloc(&config, 1, &context);
|
||||
EXPECT_EQ(status, ASTCENC_SUCCESS);
|
||||
|
||||
astcenc_image image;
|
||||
image.dim_x = 12;
|
||||
image.dim_y = 12;
|
||||
image.dim_z = 1;
|
||||
image.data_type = ASTCENC_TYPE_U8;
|
||||
uint8_t* slices = output;
|
||||
image.data = reinterpret_cast<void**>(&slices);
|
||||
|
||||
status = astcenc_decompress_image(context, data, 16, &image, &swizzle, 0);
|
||||
EXPECT_EQ(status, ASTCENC_SUCCESS);
|
||||
|
||||
for (int y = 0; y < 12; y++)
|
||||
{
|
||||
for (int x = 0; x < 12; x++)
|
||||
{
|
||||
uint8_t* pixel = output + (12 * 4 * y) + (4 * x);
|
||||
printf("[%2dx%2d] = %03d, %03d, %03d, %03d\n", x, y, pixel[0], pixel[1], pixel[2], pixel[3]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2020-2022 Arm Limited
|
||||
// Copyright 2020-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -31,15 +31,15 @@ namespace astcenc
|
||||
|
||||
// Misc utility tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
|
||||
static int round_down(int x)
|
||||
static unsigned int round_down(unsigned int x)
|
||||
{
|
||||
int remainder = x % ASTCENC_SIMD_WIDTH;
|
||||
unsigned int remainder = x % ASTCENC_SIMD_WIDTH;
|
||||
return x - remainder;
|
||||
}
|
||||
|
||||
static int round_up(int x)
|
||||
static unsigned int round_up(unsigned int x)
|
||||
{
|
||||
int remainder = x % ASTCENC_SIMD_WIDTH;
|
||||
unsigned int remainder = x % ASTCENC_SIMD_WIDTH;
|
||||
if (!remainder)
|
||||
{
|
||||
return x;
|
||||
@@ -52,9 +52,9 @@ static int round_up(int x)
|
||||
TEST(misc, RoundDownVLA)
|
||||
{
|
||||
// Static ones which are valid for all VLA widths
|
||||
EXPECT_EQ(round_down_to_simd_multiple_vla(0), 0);
|
||||
EXPECT_EQ(round_down_to_simd_multiple_vla(8), 8);
|
||||
EXPECT_EQ(round_down_to_simd_multiple_vla(16), 16);
|
||||
EXPECT_EQ(round_down_to_simd_multiple_vla(0), 0u);
|
||||
EXPECT_EQ(round_down_to_simd_multiple_vla(8), 8u);
|
||||
EXPECT_EQ(round_down_to_simd_multiple_vla(16), 16u);
|
||||
|
||||
// Variable ones which depend on VLA width
|
||||
EXPECT_EQ(round_down_to_simd_multiple_vla(3), round_down(3));
|
||||
@@ -67,9 +67,9 @@ TEST(misc, RoundDownVLA)
|
||||
TEST(misc, RoundUpVLA)
|
||||
{
|
||||
// Static ones which are valid for all VLA widths
|
||||
EXPECT_EQ(round_up_to_simd_multiple_vla(0), 0);
|
||||
EXPECT_EQ(round_up_to_simd_multiple_vla(8), 8);
|
||||
EXPECT_EQ(round_up_to_simd_multiple_vla(16), 16);
|
||||
EXPECT_EQ(round_up_to_simd_multiple_vla(0), 0u);
|
||||
EXPECT_EQ(round_up_to_simd_multiple_vla(8), 8u);
|
||||
EXPECT_EQ(round_up_to_simd_multiple_vla(16), 16u);
|
||||
|
||||
// Variable ones which depend on VLA width
|
||||
EXPECT_EQ(round_up_to_simd_multiple_vla(3), round_up(3));
|
||||
@@ -540,27 +540,27 @@ TEST(vfloat4, ceq)
|
||||
vfloat4 a1(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat4 b1(0.1f, 0.2f, 0.3f, 0.4f);
|
||||
vmask4 r1 = a1 == b1;
|
||||
EXPECT_EQ(0, mask(r1));
|
||||
EXPECT_EQ(0u, mask(r1));
|
||||
EXPECT_EQ(false, any(r1));
|
||||
EXPECT_EQ(false, all(r1));
|
||||
|
||||
vfloat4 a2(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat4 b2(1.0f, 0.2f, 0.3f, 0.4f);
|
||||
vmask4 r2 = a2 == b2;
|
||||
EXPECT_EQ(0x1, mask(r2));
|
||||
EXPECT_EQ(0x1u, mask(r2));
|
||||
EXPECT_EQ(true, any(r2));
|
||||
EXPECT_EQ(false, all(r2));
|
||||
|
||||
vfloat4 a3(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat4 b3(1.0f, 0.2f, 3.0f, 0.4f);
|
||||
vmask4 r3 = a3 == b3;
|
||||
EXPECT_EQ(0x5, mask(r3));
|
||||
EXPECT_EQ(0x5u, mask(r3));
|
||||
EXPECT_EQ(true, any(r3));
|
||||
EXPECT_EQ(false, all(r3));
|
||||
|
||||
vfloat4 a4(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vmask4 r4 = a4 == a4;
|
||||
EXPECT_EQ(0xF, mask(r4));
|
||||
EXPECT_EQ(0xFu, mask(r4));
|
||||
EXPECT_EQ(true, any(r4));
|
||||
EXPECT_EQ(true, all(r4));
|
||||
}
|
||||
@@ -571,27 +571,27 @@ TEST(vfloat4, cne)
|
||||
vfloat4 a1(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat4 b1(0.1f, 0.2f, 0.3f, 0.4f);
|
||||
vmask4 r1 = a1 != b1;
|
||||
EXPECT_EQ(0xF, mask(r1));
|
||||
EXPECT_EQ(0xFu, mask(r1));
|
||||
EXPECT_EQ(true, any(r1));
|
||||
EXPECT_EQ(true, all(r1));
|
||||
|
||||
vfloat4 a2(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat4 b2(1.0f, 0.2f, 0.3f, 0.4f);
|
||||
vmask4 r2 = a2 != b2;
|
||||
EXPECT_EQ(0xE, mask(r2));
|
||||
EXPECT_EQ(0xEu, mask(r2));
|
||||
EXPECT_EQ(true, any(r2));
|
||||
EXPECT_EQ(false, all(r2));
|
||||
|
||||
vfloat4 a3(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat4 b3(1.0f, 0.2f, 3.0f, 0.4f);
|
||||
vmask4 r3 = a3 != b3;
|
||||
EXPECT_EQ(0xA, mask(r3));
|
||||
EXPECT_EQ(0xAu, mask(r3));
|
||||
EXPECT_EQ(true, any(r3));
|
||||
EXPECT_EQ(false, all(r3));
|
||||
|
||||
vfloat4 a4(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vmask4 r4 = a4 != a4;
|
||||
EXPECT_EQ(0, mask(r4));
|
||||
EXPECT_EQ(0u, mask(r4));
|
||||
EXPECT_EQ(false, any(r4));
|
||||
EXPECT_EQ(false, all(r4));
|
||||
}
|
||||
@@ -602,7 +602,7 @@ TEST(vfloat4, clt)
|
||||
vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
|
||||
vmask4 r = a < b;
|
||||
EXPECT_EQ(0xA, mask(r));
|
||||
EXPECT_EQ(0xAu, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vfloat4 cle. */
|
||||
@@ -611,7 +611,7 @@ TEST(vfloat4, cle)
|
||||
vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
|
||||
vmask4 r = a <= b;
|
||||
EXPECT_EQ(0xE, mask(r));
|
||||
EXPECT_EQ(0xEu, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vfloat4 cgt. */
|
||||
@@ -620,7 +620,7 @@ TEST(vfloat4, cgt)
|
||||
vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
|
||||
vmask4 r = a > b;
|
||||
EXPECT_EQ(0x1, mask(r));
|
||||
EXPECT_EQ(0x1u, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vfloat4 cge. */
|
||||
@@ -629,7 +629,7 @@ TEST(vfloat4, cge)
|
||||
vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
|
||||
vmask4 r = a >= b;
|
||||
EXPECT_EQ(0x5, mask(r));
|
||||
EXPECT_EQ(0x5u, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vfloat4 min. */
|
||||
@@ -894,7 +894,8 @@ TEST(vfloat4, select)
|
||||
/** @brief Test vfloat4 select MSB only. */
|
||||
TEST(vfloat4, select_msb)
|
||||
{
|
||||
vint4 msb(0x80000000, 0, 0x80000000, 0);
|
||||
int msb_set = static_cast<int>(0x80000000);
|
||||
vint4 msb(msb_set, 0, msb_set, 0);
|
||||
vmask4 cond(msb.m);
|
||||
|
||||
vfloat4 a(1.0f, 3.0f, 3.0f, 1.0f);
|
||||
@@ -929,7 +930,7 @@ TEST(vfloat4, gatherf)
|
||||
/** @brief Test vfloat4 storea. */
|
||||
TEST(vfloat4, storea)
|
||||
{
|
||||
alignas(16) float out[4];
|
||||
ASTCENC_ALIGNAS float out[4];
|
||||
vfloat4 a(f32_data);
|
||||
storea(a, out);
|
||||
EXPECT_EQ(out[0], 0.0f);
|
||||
@@ -941,7 +942,7 @@ TEST(vfloat4, storea)
|
||||
/** @brief Test vfloat4 store. */
|
||||
TEST(vfloat4, store)
|
||||
{
|
||||
alignas(16) float out[5];
|
||||
ASTCENC_ALIGNAS float out[5];
|
||||
vfloat4 a(f32_data);
|
||||
store(a, &(out[1]));
|
||||
EXPECT_EQ(out[1], 0.0f);
|
||||
@@ -1439,27 +1440,27 @@ TEST(vint4, ceq)
|
||||
vint4 a1(1, 2, 3, 4);
|
||||
vint4 b1(0, 1, 2, 3);
|
||||
vmask4 r1 = a1 == b1;
|
||||
EXPECT_EQ(0, mask(r1));
|
||||
EXPECT_EQ(0u, mask(r1));
|
||||
EXPECT_EQ(false, any(r1));
|
||||
EXPECT_EQ(false, all(r1));
|
||||
|
||||
vint4 a2(1, 2, 3, 4);
|
||||
vint4 b2(1, 0, 0, 0);
|
||||
vmask4 r2 = a2 == b2;
|
||||
EXPECT_EQ(0x1, mask(r2));
|
||||
EXPECT_EQ(0x1u, mask(r2));
|
||||
EXPECT_EQ(true, any(r2));
|
||||
EXPECT_EQ(false, all(r2));
|
||||
|
||||
vint4 a3(1, 2, 3, 4);
|
||||
vint4 b3(1, 0, 3, 0);
|
||||
vmask4 r3 = a3 == b3;
|
||||
EXPECT_EQ(0x5, mask(r3));
|
||||
EXPECT_EQ(0x5u, mask(r3));
|
||||
EXPECT_EQ(true, any(r3));
|
||||
EXPECT_EQ(false, all(r3));
|
||||
|
||||
vint4 a4(1, 2, 3, 4);
|
||||
vmask4 r4 = a4 == a4;
|
||||
EXPECT_EQ(0xF, mask(r4));
|
||||
EXPECT_EQ(0xFu, mask(r4));
|
||||
EXPECT_EQ(true, any(r4));
|
||||
EXPECT_EQ(true, all(r4));
|
||||
}
|
||||
@@ -1470,27 +1471,27 @@ TEST(vint4, cne)
|
||||
vint4 a1(1, 2, 3, 4);
|
||||
vint4 b1(0, 1, 2, 3);
|
||||
vmask4 r1 = a1 != b1;
|
||||
EXPECT_EQ(0xF, mask(r1));
|
||||
EXPECT_EQ(0xFu, mask(r1));
|
||||
EXPECT_EQ(true, any(r1));
|
||||
EXPECT_EQ(true, all(r1));
|
||||
|
||||
vint4 a2(1, 2, 3, 4);
|
||||
vint4 b2(1, 0, 0, 0);
|
||||
vmask4 r2 = a2 != b2;
|
||||
EXPECT_EQ(0xE, mask(r2));
|
||||
EXPECT_EQ(0xEu, mask(r2));
|
||||
EXPECT_EQ(true, any(r2));
|
||||
EXPECT_EQ(false, all(r2));
|
||||
|
||||
vint4 a3(1, 2, 3, 4);
|
||||
vint4 b3(1, 0, 3, 0);
|
||||
vmask4 r3 = a3 != b3;
|
||||
EXPECT_EQ(0xA, mask(r3));
|
||||
EXPECT_EQ(0xAu, mask(r3));
|
||||
EXPECT_EQ(true, any(r3));
|
||||
EXPECT_EQ(false, all(r3));
|
||||
|
||||
vint4 a4(1, 2, 3, 4);
|
||||
vmask4 r4 = a4 != a4;
|
||||
EXPECT_EQ(0, mask(r4));
|
||||
EXPECT_EQ(0u, mask(r4));
|
||||
EXPECT_EQ(false, any(r4));
|
||||
EXPECT_EQ(false, all(r4));
|
||||
}
|
||||
@@ -1501,7 +1502,7 @@ TEST(vint4, clt)
|
||||
vint4 a(1, 2, 3, 4);
|
||||
vint4 b(0, 3, 3, 5);
|
||||
vmask4 r = a < b;
|
||||
EXPECT_EQ(0xA, mask(r));
|
||||
EXPECT_EQ(0xAu, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vint4 cgt. */
|
||||
@@ -1510,7 +1511,7 @@ TEST(vint4, cle)
|
||||
vint4 a(1, 2, 3, 4);
|
||||
vint4 b(0, 3, 3, 5);
|
||||
vmask4 r = a > b;
|
||||
EXPECT_EQ(0x1, mask(r));
|
||||
EXPECT_EQ(0x1u, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vint4 lsl. */
|
||||
@@ -1544,7 +1545,7 @@ TEST(vint4, lsr)
|
||||
EXPECT_EQ(a.lane<0>(), 1);
|
||||
EXPECT_EQ(a.lane<1>(), 2);
|
||||
EXPECT_EQ(a.lane<2>(), 4);
|
||||
EXPECT_EQ(a.lane<3>(), 0xFFFFFFFC);
|
||||
EXPECT_EQ(a.lane<3>(), static_cast<int>(0xFFFFFFFC));
|
||||
|
||||
a = lsr<1>(a);
|
||||
EXPECT_EQ(a.lane<0>(), 0);
|
||||
@@ -1681,7 +1682,8 @@ TEST(vint4, hadd_rgb_s)
|
||||
/** @brief Test vint4 clz. */
|
||||
TEST(vint4, clz)
|
||||
{
|
||||
vint4 a1(0x80000000, 0x40000000, 0x20000000, 0x10000000);
|
||||
int msb_set = static_cast<int>(0x80000000);
|
||||
vint4 a1(msb_set, 0x40000000, 0x20000000, 0x10000000);
|
||||
vint4 r1 = clz(a1);
|
||||
EXPECT_EQ(r1.lane<0>(), 0);
|
||||
EXPECT_EQ(r1.lane<1>(), 1);
|
||||
@@ -1723,7 +1725,7 @@ TEST(vint4, two_to_the_n)
|
||||
/** @brief Test vint4 storea. */
|
||||
TEST(vint4, storea)
|
||||
{
|
||||
alignas(16) int out[4];
|
||||
ASTCENC_ALIGNAS int out[4];
|
||||
vint4 a(s32_data);
|
||||
storea(a, out);
|
||||
EXPECT_EQ(out[0], 0);
|
||||
@@ -1735,7 +1737,7 @@ TEST(vint4, storea)
|
||||
/** @brief Test vint4 store. */
|
||||
TEST(vint4, store)
|
||||
{
|
||||
alignas(16) int out[5];
|
||||
ASTCENC_ALIGNAS int out[5];
|
||||
vint4 a(s32_data);
|
||||
store(a, &(out[1]));
|
||||
EXPECT_EQ(out[1], 0);
|
||||
@@ -1747,12 +1749,78 @@ TEST(vint4, store)
|
||||
/** @brief Test vint4 store_nbytes. */
|
||||
TEST(vint4, store_nbytes)
|
||||
{
|
||||
alignas(16) int out;
|
||||
ASTCENC_ALIGNAS int out;
|
||||
vint4 a(42, 314, 75, 90);
|
||||
store_nbytes(a, (uint8_t*)&out);
|
||||
store_nbytes(a, reinterpret_cast<uint8_t*>(&out));
|
||||
EXPECT_EQ(out, 42);
|
||||
}
|
||||
|
||||
/** @brief Test vint4 store_lanes_masked. */
|
||||
TEST(vint4, store_lanes_masked)
|
||||
{
|
||||
uint8_t resulta[16] { 0 };
|
||||
|
||||
// Store nothing
|
||||
vmask4 mask1 = vint4(0) == vint4(1);
|
||||
vint4 data1 = vint4(1);
|
||||
|
||||
store_lanes_masked(resulta, data1, mask1);
|
||||
vint4 result1v = vint4::load(resulta);
|
||||
vint4 expect1v = vint4::zero();
|
||||
EXPECT_TRUE(all(result1v == expect1v));
|
||||
|
||||
// Store half
|
||||
vmask4 mask2 = vint4(1, 1, 0, 0) == vint4(1);
|
||||
vint4 data2 = vint4(2);
|
||||
|
||||
store_lanes_masked(resulta, data2, mask2);
|
||||
vint4 result2v = vint4::load(resulta);
|
||||
vint4 expect2v = vint4(2, 2, 0, 0);
|
||||
EXPECT_TRUE(all(result2v == expect2v));
|
||||
|
||||
// Store all
|
||||
vmask4 mask3 = vint4(1) == vint4(1);
|
||||
vint4 data3 = vint4(3);
|
||||
|
||||
store_lanes_masked(resulta, data3, mask3);
|
||||
vint4 result3v = vint4::load(resulta);
|
||||
vint4 expect3v = vint4(3);
|
||||
EXPECT_TRUE(all(result3v == expect3v));
|
||||
}
|
||||
|
||||
/** @brief Test vint4 store_lanes_masked to unaligned address. */
|
||||
TEST(vint4, store_lanes_masked_unaligned)
|
||||
{
|
||||
uint8_t resulta[17] { 0 };
|
||||
|
||||
// Store nothing
|
||||
vmask4 mask1 = vint4(0) == vint4(1);
|
||||
vint4 data1 = vint4(1);
|
||||
|
||||
store_lanes_masked(resulta + 1, data1, mask1);
|
||||
vint4 result1v = vint4::load(resulta + 1);
|
||||
vint4 expect1v = vint4::zero();
|
||||
EXPECT_TRUE(all(result1v == expect1v));
|
||||
|
||||
// Store half
|
||||
vmask4 mask2 = vint4(1, 1, 0, 0) == vint4(1);
|
||||
vint4 data2 = vint4(2);
|
||||
|
||||
store_lanes_masked(resulta + 1, data2, mask2);
|
||||
vint4 result2v = vint4::load(resulta + 1);
|
||||
vint4 expect2v = vint4(2, 2, 0, 0);
|
||||
EXPECT_TRUE(all(result2v == expect2v));
|
||||
|
||||
// Store all
|
||||
vmask4 mask3 = vint4(1) == vint4(1);
|
||||
vint4 data3 = vint4(3);
|
||||
|
||||
store_lanes_masked(resulta + 1, data3, mask3);
|
||||
vint4 result3v = vint4::load(resulta + 1);
|
||||
vint4 expect3v = vint4(3);
|
||||
EXPECT_TRUE(all(result3v == expect3v));
|
||||
}
|
||||
|
||||
/** @brief Test vint4 gatheri. */
|
||||
TEST(vint4, gatheri)
|
||||
{
|
||||
@@ -1799,38 +1867,38 @@ TEST(vint4, select)
|
||||
/** @brief Test vmask4 scalar literal constructor. */
|
||||
TEST(vmask4, scalar_literal_construct)
|
||||
{
|
||||
vfloat4 m1a(0, 0, 0, 0);
|
||||
vfloat4 m1b(1, 1, 1, 1);
|
||||
vfloat4 m1a(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
vfloat4 m1b(1.0f, 1.0f, 1.0f, 1.0f);
|
||||
vmask4 m1(true);
|
||||
|
||||
vfloat4 r = select(m1a, m1b, m1);
|
||||
|
||||
EXPECT_EQ(r.lane<0>(), 1);
|
||||
EXPECT_EQ(r.lane<1>(), 1);
|
||||
EXPECT_EQ(r.lane<2>(), 1);
|
||||
EXPECT_EQ(r.lane<3>(), 1);
|
||||
EXPECT_EQ(r.lane<0>(), 1.0f);
|
||||
EXPECT_EQ(r.lane<1>(), 1.0f);
|
||||
EXPECT_EQ(r.lane<2>(), 1.0f);
|
||||
EXPECT_EQ(r.lane<3>(), 1.0f);
|
||||
|
||||
r = select(m1b, m1a, m1);
|
||||
|
||||
EXPECT_EQ(r.lane<0>(), 0);
|
||||
EXPECT_EQ(r.lane<1>(), 0);
|
||||
EXPECT_EQ(r.lane<2>(), 0);
|
||||
EXPECT_EQ(r.lane<3>(), 0);
|
||||
EXPECT_EQ(r.lane<0>(), 0.0f);
|
||||
EXPECT_EQ(r.lane<1>(), 0.0f);
|
||||
EXPECT_EQ(r.lane<2>(), 0.0f);
|
||||
EXPECT_EQ(r.lane<3>(), 0.0f);
|
||||
}
|
||||
|
||||
/** @brief Test vmask4 literal constructor. */
|
||||
TEST(vmask4, literal_construct)
|
||||
{
|
||||
vfloat4 m1a(0, 0, 0, 0);
|
||||
vfloat4 m1b(1, 1, 1, 1);
|
||||
vfloat4 m1a(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
vfloat4 m1b(1.0f, 1.0f, 1.0f, 1.0f);
|
||||
vmask4 m1(true, false, true, false);
|
||||
|
||||
vfloat4 r = select(m1a, m1b, m1);
|
||||
|
||||
EXPECT_EQ(r.lane<0>(), 1);
|
||||
EXPECT_EQ(r.lane<1>(), 0);
|
||||
EXPECT_EQ(r.lane<2>(), 1);
|
||||
EXPECT_EQ(r.lane<3>(), 0);
|
||||
EXPECT_EQ(r.lane<0>(), 1.0f);
|
||||
EXPECT_EQ(r.lane<1>(), 0.0f);
|
||||
EXPECT_EQ(r.lane<2>(), 1.0f);
|
||||
EXPECT_EQ(r.lane<3>(), 0.0f);
|
||||
}
|
||||
|
||||
/** @brief Test vmask4 or. */
|
||||
@@ -1845,7 +1913,7 @@ TEST(vmask4, or)
|
||||
vmask4 m2 = m2a == m2b;
|
||||
|
||||
vmask4 r = m1 | m2;
|
||||
EXPECT_EQ(mask(r), 0xB);
|
||||
EXPECT_EQ(mask(r), 0xBu);
|
||||
}
|
||||
|
||||
/** @brief Test vmask4 and. */
|
||||
@@ -1860,7 +1928,7 @@ TEST(vmask4, and)
|
||||
vmask4 m2 = m2a == m2b;
|
||||
|
||||
vmask4 r = m1 & m2;
|
||||
EXPECT_EQ(mask(r), 0x2);
|
||||
EXPECT_EQ(mask(r), 0x2u);
|
||||
}
|
||||
|
||||
/** @brief Test vmask4 xor. */
|
||||
@@ -1875,7 +1943,7 @@ TEST(vmask4, xor)
|
||||
vmask4 m2 = m2a == m2b;
|
||||
|
||||
vmask4 r = m1 ^ m2;
|
||||
EXPECT_EQ(mask(r), 0x9);
|
||||
EXPECT_EQ(mask(r), 0x9u);
|
||||
}
|
||||
|
||||
/** @brief Test vmask4 not. */
|
||||
@@ -1885,7 +1953,63 @@ TEST(vmask4, not)
|
||||
vfloat4 m1b(1, 1, 1, 1);
|
||||
vmask4 m1 = m1a == m1b;
|
||||
vmask4 r = ~m1;
|
||||
EXPECT_EQ(mask(r), 0x5);
|
||||
EXPECT_EQ(mask(r), 0x5u);
|
||||
}
|
||||
|
||||
/** @brief Test vint4 table permute. */
|
||||
TEST(vint4, vtable_8bt_32bi_32entry)
|
||||
{
|
||||
vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
|
||||
vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
|
||||
|
||||
vint4 table0p, table1p;
|
||||
vtable_prepare(table0, table1, table0p, table1p);
|
||||
|
||||
vint4 index(0, 7, 4, 31);
|
||||
|
||||
vint4 result = vtable_8bt_32bi(table0p, table1p, index);
|
||||
|
||||
EXPECT_EQ(result.lane<0>(), 3);
|
||||
EXPECT_EQ(result.lane<1>(), 4);
|
||||
EXPECT_EQ(result.lane<2>(), 7);
|
||||
EXPECT_EQ(result.lane<3>(), 28);
|
||||
}
|
||||
|
||||
/** @brief Test vint4 table permute. */
|
||||
TEST(vint4, vtable_8bt_32bi_64entry)
|
||||
{
|
||||
vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
|
||||
vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
|
||||
vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f);
|
||||
vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
|
||||
|
||||
vint4 table0p, table1p, table2p, table3p;
|
||||
vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p);
|
||||
|
||||
vint4 index(0, 7, 38, 63);
|
||||
|
||||
vint4 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index);
|
||||
|
||||
EXPECT_EQ(result.lane<0>(), 3);
|
||||
EXPECT_EQ(result.lane<1>(), 4);
|
||||
EXPECT_EQ(result.lane<2>(), 37);
|
||||
EXPECT_EQ(result.lane<3>(), 60);
|
||||
}
|
||||
|
||||
/** @brief Test vint4 rgba byte interleave. */
|
||||
TEST(vint4, interleave_rgba8)
|
||||
{
|
||||
vint4 r(0x01, 0x11, 0x21, 0x31);
|
||||
vint4 g(0x02, 0x12, 0x22, 0x32);
|
||||
vint4 b(0x03, 0x13, 0x23, 0x33);
|
||||
vint4 a(0x04, 0x14, 0x24, 0x34);
|
||||
|
||||
vint4 result = interleave_rgba8(r, g, b, a);
|
||||
|
||||
EXPECT_EQ(result.lane<0>(), 0x04030201);
|
||||
EXPECT_EQ(result.lane<1>(), 0x14131211);
|
||||
EXPECT_EQ(result.lane<2>(), 0x24232221);
|
||||
EXPECT_EQ(result.lane<3>(), 0x34333231);
|
||||
}
|
||||
|
||||
# if ASTCENC_SIMD_WIDTH == 8
|
||||
@@ -2142,27 +2266,27 @@ TEST(vfloat8, ceq)
|
||||
vfloat8 a1(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
|
||||
vfloat8 b1(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
|
||||
vmask8 r1 = a1 == b1;
|
||||
EXPECT_EQ(0, mask(r1));
|
||||
EXPECT_EQ(0u, mask(r1));
|
||||
EXPECT_EQ(false, any(r1));
|
||||
EXPECT_EQ(false, all(r1));
|
||||
|
||||
vfloat8 a2(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
|
||||
vfloat8 b2(1.0f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
|
||||
vmask8 r2 = a2 == b2;
|
||||
EXPECT_EQ(0x1, mask(r2));
|
||||
EXPECT_EQ(0x1u, mask(r2));
|
||||
EXPECT_EQ(true, any(r2));
|
||||
EXPECT_EQ(false, all(r2));
|
||||
|
||||
vfloat8 a3(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
|
||||
vfloat8 b3(1.0f, 0.2f, 3.0f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
|
||||
vmask8 r3 = a3 == b3;
|
||||
EXPECT_EQ(0x5, mask(r3));
|
||||
EXPECT_EQ(0x5u, mask(r3));
|
||||
EXPECT_EQ(true, any(r3));
|
||||
EXPECT_EQ(false, all(r3));
|
||||
|
||||
vfloat8 a4(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
|
||||
vmask8 r4 = a4 == a4;
|
||||
EXPECT_EQ(0xFF, mask(r4));
|
||||
EXPECT_EQ(0xFFu, mask(r4));
|
||||
EXPECT_EQ(true, any(r4));
|
||||
EXPECT_EQ(true, all(r4));
|
||||
}
|
||||
@@ -2173,27 +2297,27 @@ TEST(vfloat8, cne)
|
||||
vfloat8 a1(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
|
||||
vfloat8 b1(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
|
||||
vmask8 r1 = a1 != b1;
|
||||
EXPECT_EQ(0xFF, mask(r1));
|
||||
EXPECT_EQ(0xFFu, mask(r1));
|
||||
EXPECT_EQ(true, any(r1));
|
||||
EXPECT_EQ(true, all(r1));
|
||||
|
||||
vfloat8 a2(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
|
||||
vfloat8 b2(1.0f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
|
||||
vmask8 r2 = a2 != b2;
|
||||
EXPECT_EQ(0xFE, mask(r2));
|
||||
EXPECT_EQ(0xFEu, mask(r2));
|
||||
EXPECT_EQ(true, any(r2));
|
||||
EXPECT_EQ(false, all(r2));
|
||||
|
||||
vfloat8 a3(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
|
||||
vfloat8 b3(1.0f, 0.2f, 3.0f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
|
||||
vmask8 r3 = a3 != b3;
|
||||
EXPECT_EQ(0xFA, mask(r3));
|
||||
EXPECT_EQ(0xFAu, mask(r3));
|
||||
EXPECT_EQ(true, any(r3));
|
||||
EXPECT_EQ(false, all(r3));
|
||||
|
||||
vfloat8 a4(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
|
||||
vmask8 r4 = a4 != a4;
|
||||
EXPECT_EQ(0, mask(r4));
|
||||
EXPECT_EQ(0u, mask(r4));
|
||||
EXPECT_EQ(false, any(r4));
|
||||
EXPECT_EQ(false, all(r4));
|
||||
}
|
||||
@@ -2204,7 +2328,7 @@ TEST(vfloat8, clt)
|
||||
vfloat8 a(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat8 b(0.9f, 2.1f, 3.0f, 4.1f, 0.9f, 2.1f, 3.0f, 4.1f);
|
||||
vmask8 r = a < b;
|
||||
EXPECT_EQ(0xAA, mask(r));
|
||||
EXPECT_EQ(0xAAu, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vfloat8 cle. */
|
||||
@@ -2213,7 +2337,7 @@ TEST(vfloat8, cle)
|
||||
vfloat8 a(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat8 b(0.9f, 2.1f, 3.0f, 4.1f, 0.9f, 2.1f, 3.0f, 4.1f);
|
||||
vmask8 r = a <= b;
|
||||
EXPECT_EQ(0xEE, mask(r));
|
||||
EXPECT_EQ(0xEEu, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vfloat8 cgt. */
|
||||
@@ -2222,7 +2346,7 @@ TEST(vfloat8, cgt)
|
||||
vfloat8 a(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat8 b(0.9f, 2.1f, 3.0f, 4.1f, 0.9f, 2.1f, 3.0f, 4.1f);
|
||||
vmask8 r = a > b;
|
||||
EXPECT_EQ(0x11, mask(r));
|
||||
EXPECT_EQ(0x11u, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vfloat8 cge. */
|
||||
@@ -2231,7 +2355,7 @@ TEST(vfloat8, cge)
|
||||
vfloat8 a(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
|
||||
vfloat8 b(0.9f, 2.1f, 3.0f, 4.1f, 0.9f, 2.1f, 3.0f, 4.1f);
|
||||
vmask8 r = a >= b;
|
||||
EXPECT_EQ(0x55, mask(r));
|
||||
EXPECT_EQ(0x55u, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vfloat8 min. */
|
||||
@@ -2510,7 +2634,8 @@ TEST(vfloat8, select)
|
||||
/** @brief Test vfloat8 select MSB only. */
|
||||
TEST(vfloat8, select_msb)
|
||||
{
|
||||
vint8 msb(0x80000000, 0, 0x80000000, 0, 0x80000000, 0, 0x80000000, 0);
|
||||
int msb_set = static_cast<int>(0x80000000);
|
||||
vint8 msb(msb_set, 0, msb_set, 0, msb_set, 0, msb_set, 0);
|
||||
vmask8 cond(msb.m);
|
||||
|
||||
vfloat8 a(1.0f, 3.0f, 3.0f, 1.0f, 1.0f, 3.0f, 3.0f, 1.0f);
|
||||
@@ -2527,7 +2652,6 @@ TEST(vfloat8, select_msb)
|
||||
EXPECT_EQ(r1.lane<6>(), 2.0f);
|
||||
EXPECT_EQ(r1.lane<7>(), 1.0f);
|
||||
|
||||
|
||||
// Select in the other
|
||||
vfloat8 r2 = select(b, a, cond);
|
||||
EXPECT_EQ(r2.lane<0>(), 1.0f);
|
||||
@@ -2867,27 +2991,27 @@ TEST(vint8, ceq)
|
||||
vint8 a1(1, 2, 3, 4, 1, 2, 3, 4);
|
||||
vint8 b1(0, 1, 2, 3, 0, 1, 2, 3);
|
||||
vmask8 r1 = a1 == b1;
|
||||
EXPECT_EQ(0, mask(r1));
|
||||
EXPECT_EQ(0u, mask(r1));
|
||||
EXPECT_EQ(false, any(r1));
|
||||
EXPECT_EQ(false, all(r1));
|
||||
|
||||
vint8 a2(1, 2, 3, 4, 1, 2, 3, 4);
|
||||
vint8 b2(1, 0, 0, 0, 1, 0, 0, 0);
|
||||
vmask8 r2 = a2 == b2;
|
||||
EXPECT_EQ(0x11, mask(r2));
|
||||
EXPECT_EQ(0x11u, mask(r2));
|
||||
EXPECT_EQ(true, any(r2));
|
||||
EXPECT_EQ(false, all(r2));
|
||||
|
||||
vint8 a3(1, 2, 3, 4, 1, 2, 3, 4);
|
||||
vint8 b3(1, 0, 3, 0, 1, 0, 3, 0);
|
||||
vmask8 r3 = a3 == b3;
|
||||
EXPECT_EQ(0x55, mask(r3));
|
||||
EXPECT_EQ(0x55u, mask(r3));
|
||||
EXPECT_EQ(true, any(r3));
|
||||
EXPECT_EQ(false, all(r3));
|
||||
|
||||
vint8 a4(1, 2, 3, 4, 1, 2, 3, 4);
|
||||
vmask8 r4 = a4 == a4;
|
||||
EXPECT_EQ(0xFF, mask(r4));
|
||||
EXPECT_EQ(0xFFu, mask(r4));
|
||||
EXPECT_EQ(true, any(r4));
|
||||
EXPECT_EQ(true, all(r4));
|
||||
}
|
||||
@@ -2898,27 +3022,27 @@ TEST(vint8, cne)
|
||||
vint8 a1(1, 2, 3, 4, 1, 2, 3, 4);
|
||||
vint8 b1(0, 1, 2, 3, 0, 1, 2, 3);
|
||||
vmask8 r1 = a1 != b1;
|
||||
EXPECT_EQ(0xFF, mask(r1));
|
||||
EXPECT_EQ(0xFFu, mask(r1));
|
||||
EXPECT_EQ(true, any(r1));
|
||||
EXPECT_EQ(true, all(r1));
|
||||
|
||||
vint8 a2(1, 2, 3, 4, 1, 2, 3, 4);
|
||||
vint8 b2(1, 0, 0, 0, 1, 0, 0, 0);
|
||||
vmask8 r2 = a2 != b2;
|
||||
EXPECT_EQ(0xEE, mask(r2));
|
||||
EXPECT_EQ(0xEEu, mask(r2));
|
||||
EXPECT_EQ(true, any(r2));
|
||||
EXPECT_EQ(false, all(r2));
|
||||
|
||||
vint8 a3(1, 2, 3, 4, 1, 2, 3, 4);
|
||||
vint8 b3(1, 0, 3, 0, 1, 0, 3, 0);
|
||||
vmask8 r3 = a3 != b3;
|
||||
EXPECT_EQ(0xAA, mask(r3));
|
||||
EXPECT_EQ(0xAAu, mask(r3));
|
||||
EXPECT_EQ(true, any(r3));
|
||||
EXPECT_EQ(false, all(r3));
|
||||
|
||||
vint8 a4(1, 2, 3, 4, 1, 2, 3, 4);
|
||||
vmask8 r4 = a4 != a4;
|
||||
EXPECT_EQ(0, mask(r4));
|
||||
EXPECT_EQ(0u, mask(r4));
|
||||
EXPECT_EQ(false, any(r4));
|
||||
EXPECT_EQ(false, all(r4));
|
||||
}
|
||||
@@ -2929,7 +3053,7 @@ TEST(vint8, clt)
|
||||
vint8 a(1, 2, 3, 4, 1, 2, 3, 4);
|
||||
vint8 b(0, 3, 3, 5, 0, 3, 3, 5);
|
||||
vmask8 r = a < b;
|
||||
EXPECT_EQ(0xAA, mask(r));
|
||||
EXPECT_EQ(0xAAu, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vint8 cgt. */
|
||||
@@ -2938,7 +3062,7 @@ TEST(vint8, cgt)
|
||||
vint8 a(1, 2, 3, 4, 1, 2, 3, 4);
|
||||
vint8 b(0, 3, 3, 5, 0, 3, 3, 5);
|
||||
vmask8 r = a > b;
|
||||
EXPECT_EQ(0x11, mask(r));
|
||||
EXPECT_EQ(0x11u, mask(r));
|
||||
}
|
||||
|
||||
/** @brief Test vint8 min. */
|
||||
@@ -2973,30 +3097,66 @@ TEST(vint8, max)
|
||||
EXPECT_EQ(r.lane<7>(), 5);
|
||||
}
|
||||
|
||||
/** @brief Test vint8 lsl. */
|
||||
TEST(vint8, lsl)
|
||||
{
|
||||
vint8 a(1, 2, 4, -4, 1, 2, 4, -4);
|
||||
a = lsl<0>(a);
|
||||
EXPECT_EQ(a.lane<0>(), 1);
|
||||
EXPECT_EQ(a.lane<1>(), 2);
|
||||
EXPECT_EQ(a.lane<2>(), 4);
|
||||
EXPECT_EQ(a.lane<3>(), static_cast<int>(0xFFFFFFFC));
|
||||
EXPECT_EQ(a.lane<4>(), 1);
|
||||
EXPECT_EQ(a.lane<5>(), 2);
|
||||
EXPECT_EQ(a.lane<6>(), 4);
|
||||
EXPECT_EQ(a.lane<7>(), static_cast<int>(0xFFFFFFFC));
|
||||
|
||||
|
||||
a = lsl<1>(a);
|
||||
EXPECT_EQ(a.lane<0>(), 2);
|
||||
EXPECT_EQ(a.lane<1>(), 4);
|
||||
EXPECT_EQ(a.lane<2>(), 8);
|
||||
EXPECT_EQ(a.lane<3>(), static_cast<int>(0xFFFFFFF8));
|
||||
EXPECT_EQ(a.lane<4>(), 2);
|
||||
EXPECT_EQ(a.lane<5>(), 4);
|
||||
EXPECT_EQ(a.lane<6>(), 8);
|
||||
EXPECT_EQ(a.lane<7>(), static_cast<int>(0xFFFFFFF8));
|
||||
|
||||
a = lsl<2>(a);
|
||||
EXPECT_EQ(a.lane<0>(), 8);
|
||||
EXPECT_EQ(a.lane<1>(), 16);
|
||||
EXPECT_EQ(a.lane<2>(), 32);
|
||||
EXPECT_EQ(a.lane<3>(), static_cast<int>(0xFFFFFFE0));
|
||||
EXPECT_EQ(a.lane<4>(), 8);
|
||||
EXPECT_EQ(a.lane<5>(), 16);
|
||||
EXPECT_EQ(a.lane<6>(), 32);
|
||||
EXPECT_EQ(a.lane<7>(), static_cast<int>(0xFFFFFFE0));
|
||||
}
|
||||
|
||||
/** @brief Test vint8 lsr. */
|
||||
TEST(vint8, lsr)
|
||||
{
|
||||
vint8 a(1, 2, 4, -4, 1, 2, 4, -4);
|
||||
a = lsr<0>(a);
|
||||
EXPECT_EQ(a.lane<0>(), 1);
|
||||
EXPECT_EQ(a.lane<1>(), 2);
|
||||
EXPECT_EQ(a.lane<2>(), 4);
|
||||
EXPECT_EQ(a.lane<3>(), 0xFFFFFFFC);
|
||||
EXPECT_EQ(a.lane<4>(), 1);
|
||||
EXPECT_EQ(a.lane<5>(), 2);
|
||||
EXPECT_EQ(a.lane<6>(), 4);
|
||||
EXPECT_EQ(a.lane<7>(), 0xFFFFFFFC);
|
||||
EXPECT_EQ(a.lane<0>(), 1);
|
||||
EXPECT_EQ(a.lane<1>(), 2);
|
||||
EXPECT_EQ(a.lane<2>(), 4);
|
||||
EXPECT_EQ(a.lane<3>(), static_cast<int>(0xFFFFFFFC));
|
||||
EXPECT_EQ(a.lane<4>(), 1);
|
||||
EXPECT_EQ(a.lane<5>(), 2);
|
||||
EXPECT_EQ(a.lane<6>(), 4);
|
||||
EXPECT_EQ(a.lane<7>(), static_cast<int>(0xFFFFFFFC));
|
||||
|
||||
|
||||
a = lsr<1>(a);
|
||||
EXPECT_EQ(a.lane<0>(), 0);
|
||||
EXPECT_EQ(a.lane<1>(), 1);
|
||||
EXPECT_EQ(a.lane<2>(), 2);
|
||||
EXPECT_EQ(a.lane<3>(), 0x7FFFFFFE);
|
||||
EXPECT_EQ(a.lane<4>(), 0);
|
||||
EXPECT_EQ(a.lane<5>(), 1);
|
||||
EXPECT_EQ(a.lane<6>(), 2);
|
||||
EXPECT_EQ(a.lane<7>(), 0x7FFFFFFE);
|
||||
EXPECT_EQ(a.lane<0>(), 0);
|
||||
EXPECT_EQ(a.lane<1>(), 1);
|
||||
EXPECT_EQ(a.lane<2>(), 2);
|
||||
EXPECT_EQ(a.lane<3>(), 0x7FFFFFFE);
|
||||
EXPECT_EQ(a.lane<4>(), 0);
|
||||
EXPECT_EQ(a.lane<5>(), 1);
|
||||
EXPECT_EQ(a.lane<6>(), 2);
|
||||
EXPECT_EQ(a.lane<7>(), 0x7FFFFFFE);
|
||||
|
||||
a = lsr<2>(a);
|
||||
EXPECT_EQ(a.lane<0>(), 0);
|
||||
@@ -3134,11 +3294,77 @@ TEST(vint8, store_nbytes)
|
||||
{
|
||||
alignas(32) int out[2];
|
||||
vint8 a(42, 314, 75, 90, 42, 314, 75, 90);
|
||||
store_nbytes(a, (uint8_t*)&out);
|
||||
store_nbytes(a, reinterpret_cast<uint8_t*>(&out));
|
||||
EXPECT_EQ(out[0], 42);
|
||||
EXPECT_EQ(out[1], 314);
|
||||
}
|
||||
|
||||
/** @brief Test vint8 store_lanes_masked. */
|
||||
TEST(vint8, store_lanes_masked)
|
||||
{
|
||||
uint8_t resulta[32] { 0 };
|
||||
|
||||
// Store nothing
|
||||
vmask8 mask1 = vint8(0) == vint8(1);
|
||||
vint8 data1 = vint8(1);
|
||||
|
||||
store_lanes_masked(resulta, data1, mask1);
|
||||
vint8 result1v = vint8::load(resulta);
|
||||
vint8 expect1v = vint8::zero();
|
||||
EXPECT_TRUE(all(result1v == expect1v));
|
||||
|
||||
// Store half
|
||||
vmask8 mask2 = vint8(1, 1, 1, 1, 0, 0, 0, 0) == vint8(1);
|
||||
vint8 data2 = vint8(2);
|
||||
|
||||
store_lanes_masked(resulta, data2, mask2);
|
||||
vint8 result2v = vint8::load(resulta);
|
||||
vint8 expect2v = vint8(2, 2, 2, 2, 0, 0, 0, 0);
|
||||
EXPECT_TRUE(all(result2v == expect2v));
|
||||
|
||||
// Store all
|
||||
vmask8 mask3 = vint8(1) == vint8(1);
|
||||
vint8 data3 = vint8(3);
|
||||
|
||||
store_lanes_masked(resulta, data3, mask3);
|
||||
vint8 result3v = vint8::load(resulta);
|
||||
vint8 expect3v = vint8(3);
|
||||
EXPECT_TRUE(all(result3v == expect3v));
|
||||
}
|
||||
|
||||
/** @brief Test vint8 store_lanes_masked to unaligned address. */
|
||||
TEST(vint8, store_lanes_masked_unaligned)
|
||||
{
|
||||
uint8_t resulta[33] { 0 };
|
||||
|
||||
// Store nothing
|
||||
vmask8 mask1 = vint8(0) == vint8(1);
|
||||
vint8 data1 = vint8(1);
|
||||
|
||||
store_lanes_masked(resulta + 1, data1, mask1);
|
||||
vint8 result1v = vint8::load(resulta + 1);
|
||||
vint8 expect1v = vint8::zero();
|
||||
EXPECT_TRUE(all(result1v == expect1v));
|
||||
|
||||
// Store half
|
||||
vmask8 mask2 = vint8(1, 1, 1, 1, 0, 0, 0, 0) == vint8(1);
|
||||
vint8 data2 = vint8(2);
|
||||
|
||||
store_lanes_masked(resulta + 1, data2, mask2);
|
||||
vint8 result2v = vint8::load(resulta + 1);
|
||||
vint8 expect2v = vint8(2, 2, 2, 2, 0, 0, 0, 0);
|
||||
EXPECT_TRUE(all(result2v == expect2v));
|
||||
|
||||
// Store all
|
||||
vmask8 mask3 = vint8(1) == vint8(1);
|
||||
vint8 data3 = vint8(3);
|
||||
|
||||
store_lanes_masked(resulta + 1, data3, mask3);
|
||||
vint8 result3v = vint8::load(resulta + 1);
|
||||
vint8 expect3v = vint8(3);
|
||||
EXPECT_TRUE(all(result3v == expect3v));
|
||||
}
|
||||
|
||||
/** @brief Test vint8 gatheri. */
|
||||
TEST(vint8, gatheri)
|
||||
{
|
||||
@@ -3225,7 +3451,7 @@ TEST(vmask8, or)
|
||||
vmask8 m2 = m2a == m2b;
|
||||
|
||||
vmask8 r = m1 | m2;
|
||||
EXPECT_EQ(mask(r), 0xBB);
|
||||
EXPECT_EQ(mask(r), 0xBBu);
|
||||
}
|
||||
|
||||
/** @brief Test vmask8 and. */
|
||||
@@ -3240,7 +3466,7 @@ TEST(vmask8, and)
|
||||
vmask8 m2 = m2a == m2b;
|
||||
|
||||
vmask8 r = m1 & m2;
|
||||
EXPECT_EQ(mask(r), 0x22);
|
||||
EXPECT_EQ(mask(r), 0x22u);
|
||||
}
|
||||
|
||||
/** @brief Test vmask8 xor. */
|
||||
@@ -3255,7 +3481,7 @@ TEST(vmask8, xor)
|
||||
vmask8 m2 = m2a == m2b;
|
||||
|
||||
vmask8 r = m1 ^ m2;
|
||||
EXPECT_EQ(mask(r), 0x99);
|
||||
EXPECT_EQ(mask(r), 0x99u);
|
||||
}
|
||||
|
||||
/** @brief Test vmask8 not. */
|
||||
@@ -3265,7 +3491,55 @@ TEST(vmask8, not)
|
||||
vfloat8 m1b(1, 1, 1, 1, 1, 1, 1, 1);
|
||||
vmask8 m1 = m1a == m1b;
|
||||
vmask8 r = ~m1;
|
||||
EXPECT_EQ(mask(r), 0x55);
|
||||
EXPECT_EQ(mask(r), 0x55u);
|
||||
}
|
||||
|
||||
/** @brief Test vint8 table permute. */
|
||||
TEST(vint8, vtable_8bt_32bi_32entry)
|
||||
{
|
||||
vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
|
||||
vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
|
||||
|
||||
vint8 table0p, table1p;
|
||||
vtable_prepare(table0, table1, table0p, table1p);
|
||||
|
||||
vint8 index(0, 7, 4, 15, 16, 20, 23, 31);
|
||||
|
||||
vint8 result = vtable_8bt_32bi(table0p, table1p, index);
|
||||
|
||||
EXPECT_EQ(result.lane<0>(), 3);
|
||||
EXPECT_EQ(result.lane<1>(), 4);
|
||||
EXPECT_EQ(result.lane<2>(), 7);
|
||||
EXPECT_EQ(result.lane<3>(), 12);
|
||||
EXPECT_EQ(result.lane<4>(), 19);
|
||||
EXPECT_EQ(result.lane<5>(), 23);
|
||||
EXPECT_EQ(result.lane<6>(), 20);
|
||||
EXPECT_EQ(result.lane<7>(), 28);
|
||||
}
|
||||
|
||||
/** @brief Test vint8 table permute. */
|
||||
TEST(vint8, vtable_8bt_32bi_64entry)
|
||||
{
|
||||
vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
|
||||
vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
|
||||
vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f);
|
||||
vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
|
||||
|
||||
vint8 table0p, table1p, table2p, table3p;
|
||||
vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p);
|
||||
|
||||
vint8 index(0, 7, 4, 15, 16, 20, 38, 63);
|
||||
|
||||
vint8 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index);
|
||||
|
||||
EXPECT_EQ(result.lane<0>(), 3);
|
||||
EXPECT_EQ(result.lane<1>(), 4);
|
||||
EXPECT_EQ(result.lane<2>(), 7);
|
||||
EXPECT_EQ(result.lane<3>(), 12);
|
||||
EXPECT_EQ(result.lane<4>(), 19);
|
||||
EXPECT_EQ(result.lane<5>(), 23);
|
||||
EXPECT_EQ(result.lane<6>(), 37);
|
||||
EXPECT_EQ(result.lane<7>(), 60);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2020-2022 Arm Limited
|
||||
// Copyright 2020-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -43,10 +43,18 @@
|
||||
* for faster processing. The caller is responsible for creating the worker threads, and
|
||||
* synchronizing between images.
|
||||
*
|
||||
* Extended instruction set support
|
||||
* ================================
|
||||
*
|
||||
* This library supports use of extended instruction sets, such as SSE4.1 and AVX2. These are
|
||||
* enabled at compile time when building the library. There is no runtime checking in the core
|
||||
* library that the instruction sets used are actually available. Checking compatibility is the
|
||||
* responsibility of the calling code.
|
||||
*
|
||||
* Threading
|
||||
* =========
|
||||
*
|
||||
* In pseudocode, the usage for manual user threading looks like this:
|
||||
* In pseudo-code, the usage for manual user threading looks like this:
|
||||
*
|
||||
* // Configure the compressor run
|
||||
* astcenc_config my_config;
|
||||
@@ -74,7 +82,7 @@
|
||||
*
|
||||
* The codec supports compressing single images, which can be either 2D images or volumetric 3D
|
||||
* images. Calling code is responsible for any handling of aggregate types, such as mipmap chains,
|
||||
* texture arrays, or sliced 3D textures
|
||||
* texture arrays, or sliced 3D textures.
|
||||
*
|
||||
* Images are passed in as an astcenc_image structure. Inputs can be either 8-bit unorm, 16-bit
|
||||
* half-float, or 32-bit float, as indicated by the data_type field.
|
||||
@@ -82,7 +90,7 @@
|
||||
* Images can be any dimension; there is no requirement to be a multiple of the ASTC block size.
|
||||
*
|
||||
* Data is always passed in as 4 color components, and accessed as an array of 2D image slices. Data
|
||||
* within an image slice is always tightly packed without padding. Addresing looks like this:
|
||||
* within an image slice is always tightly packed without padding. Addressing looks like this:
|
||||
*
|
||||
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 ] // Red
|
||||
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 1] // Green
|
||||
@@ -112,8 +120,8 @@
|
||||
* Input images must contain unit-length normalized vectors and should be passed in using a two component
|
||||
* swizzle. The astcenc command line tool defaults to an RRRG swizzle, but some developers prefer
|
||||
* to use GGGR for compatibility with BC5n which will work just as well. The Z component can be
|
||||
* recovered programatically in shader code, using knowledge that the vector is unit length and that
|
||||
* Z must be positive for a tangent-space normal map.
|
||||
* recovered programmatically in shader code, using knowledge that the vector is unit length and
|
||||
* that Z must be positive for a tangent-space normal map.
|
||||
*
|
||||
* Decompress-only usage
|
||||
* =====================
|
||||
@@ -215,8 +223,6 @@ enum astcenc_error {
|
||||
ASTCENC_ERR_OUT_OF_MEM,
|
||||
/** @brief The call failed due to the build using fast math. */
|
||||
ASTCENC_ERR_BAD_CPU_FLOAT,
|
||||
/** @brief The call failed due to the build using an unsupported ISA. */
|
||||
ASTCENC_ERR_BAD_CPU_ISA,
|
||||
/** @brief The call failed due to an out-of-spec parameter. */
|
||||
ASTCENC_ERR_BAD_PARAM,
|
||||
/** @brief The call failed due to an out-of-spec block size. */
|
||||
@@ -233,6 +239,8 @@ enum astcenc_error {
|
||||
ASTCENC_ERR_BAD_CONTEXT,
|
||||
/** @brief The call failed due to unimplemented functionality. */
|
||||
ASTCENC_ERR_NOT_IMPLEMENTED,
|
||||
/** @brief The call failed due to an out-of-spec decode mode flag set. */
|
||||
ASTCENC_ERR_BAD_DECODE_MODE,
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
/** @brief The call failed due to an issue with diagnostic tracing. */
|
||||
ASTCENC_ERR_DTRACE_FAILURE,
|
||||
@@ -265,9 +273,12 @@ static const float ASTCENC_PRE_FAST = 10.0f;
|
||||
/** @brief The medium quality search preset. */
|
||||
static const float ASTCENC_PRE_MEDIUM = 60.0f;
|
||||
|
||||
/** @brief The throrough quality search preset. */
|
||||
/** @brief The thorough quality search preset. */
|
||||
static const float ASTCENC_PRE_THOROUGH = 98.0f;
|
||||
|
||||
/** @brief The very thorough quality search preset. */
|
||||
static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
|
||||
|
||||
/** @brief The exhaustive, highest quality, search preset. */
|
||||
static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
|
||||
|
||||
@@ -320,6 +331,11 @@ enum astcenc_type
|
||||
ASTCENC_TYPE_F32 = 2
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Function pointer type for compression progress reporting callback.
|
||||
*/
|
||||
extern "C" typedef void (*astcenc_progress_callback)(float);
|
||||
|
||||
/**
|
||||
* @brief Enable normal map compression.
|
||||
*
|
||||
@@ -331,35 +347,17 @@ enum astcenc_type
|
||||
static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0;
|
||||
|
||||
/**
|
||||
* @brief Enable mask map compression.
|
||||
* @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
|
||||
*
|
||||
* Input data will be treated a multi-layer mask map, where is is desirable for the color components
|
||||
* to be treated independently for the purposes of error analysis.
|
||||
* The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
|
||||
* flag during compression will allow the compressor to use the correct rounding when selecting
|
||||
* encodings. This will improve the compressed image quality if your application is using the
|
||||
* decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
|
||||
*
|
||||
* Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
|
||||
* this setting.
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_MAP_MASK = 1 << 1;
|
||||
|
||||
/**
|
||||
* @brief Enable RGBM map compression.
|
||||
*
|
||||
* Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
|
||||
* format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
|
||||
* compression function, this flag is only used to control the use of RGBM-specific heuristics and
|
||||
* error metrics.
|
||||
*
|
||||
* IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
|
||||
* M values can round to zero due to quantization and result in black or white pixels. It is highly
|
||||
* recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
|
||||
* 16 or 32). Applying this threshold reduces the number of very dark colors that can be
|
||||
* represented, but is still higher precision than 8-bit LDR.
|
||||
*
|
||||
* When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale
|
||||
* factor used during reconstruction. This defaults to 5 when in RGBM mode.
|
||||
*
|
||||
* It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier
|
||||
* scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode,
|
||||
* matching the default scale factor.
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_MAP_RGBM = 1 << 6;
|
||||
static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8 = 1 << 1;
|
||||
|
||||
/**
|
||||
* @brief Enable alpha weighting.
|
||||
@@ -396,15 +394,38 @@ static const unsigned int ASTCENC_FLG_DECOMPRESS_ONLY = 1 << 4;
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_SELF_DECOMPRESS_ONLY = 1 << 5;
|
||||
|
||||
/**
|
||||
* @brief Enable RGBM map compression.
|
||||
*
|
||||
* Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
|
||||
* format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
|
||||
* compression function, this flag is only used to control the use of RGBM-specific heuristics and
|
||||
* error metrics.
|
||||
*
|
||||
* IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
|
||||
* M values can round to zero due to quantization and result in black or white pixels. It is highly
|
||||
* recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
|
||||
* 16 or 32). Applying this threshold reduces the number of very dark colors that can be
|
||||
* represented, but is still higher precision than 8-bit LDR.
|
||||
*
|
||||
* When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale
|
||||
* factor used during reconstruction. This defaults to 5 when in RGBM mode.
|
||||
*
|
||||
* It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier
|
||||
* scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode,
|
||||
* matching the default scale factor.
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_MAP_RGBM = 1 << 6;
|
||||
|
||||
/**
|
||||
* @brief The bit mask of all valid flags.
|
||||
*/
|
||||
static const unsigned int ASTCENC_ALL_FLAGS =
|
||||
ASTCENC_FLG_MAP_MASK |
|
||||
ASTCENC_FLG_MAP_NORMAL |
|
||||
ASTCENC_FLG_MAP_RGBM |
|
||||
ASTCENC_FLG_USE_ALPHA_WEIGHT |
|
||||
ASTCENC_FLG_USE_PERCEPTUAL |
|
||||
ASTCENC_FLG_USE_DECODE_UNORM8 |
|
||||
ASTCENC_FLG_DECOMPRESS_ONLY |
|
||||
ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
|
||||
|
||||
@@ -452,7 +473,7 @@ struct astcenc_config
|
||||
*
|
||||
* It is recommended that this is set to 1 when using FLG_USE_ALPHA_WEIGHT on a texture that
|
||||
* will be sampled using linear texture filtering to minimize color bleed out of transparent
|
||||
* texels that are adjcent to non-transparent texels.
|
||||
* texels that are adjacent to non-transparent texels.
|
||||
*/
|
||||
unsigned int a_scale_radius;
|
||||
|
||||
@@ -467,11 +488,25 @@ struct astcenc_config
|
||||
unsigned int tune_partition_count_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum number of partitions searched (-partitionindexlimit).
|
||||
* @brief The maximum number of partitions searched (-2partitionindexlimit).
|
||||
*
|
||||
* Valid values are between 1 and 1024.
|
||||
*/
|
||||
unsigned int tune_partition_index_limit;
|
||||
unsigned int tune_2partition_index_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum number of partitions searched (-3partitionindexlimit).
|
||||
*
|
||||
* Valid values are between 1 and 1024.
|
||||
*/
|
||||
unsigned int tune_3partition_index_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum number of partitions searched (-4partitionindexlimit).
|
||||
*
|
||||
* Valid values are between 1 and 1024.
|
||||
*/
|
||||
unsigned int tune_4partition_index_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum centile for block modes searched (-blockmodelimit).
|
||||
@@ -491,10 +526,31 @@ struct astcenc_config
|
||||
/**
|
||||
* @brief The number of trial candidates per mode search (-candidatelimit).
|
||||
*
|
||||
* Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES (default 4).
|
||||
* Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES.
|
||||
*/
|
||||
unsigned int tune_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The number of trial partitionings per search (-2partitioncandidatelimit).
|
||||
*
|
||||
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
|
||||
*/
|
||||
unsigned int tune_2partitioning_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The number of trial partitionings per search (-3partitioncandidatelimit).
|
||||
*
|
||||
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
|
||||
*/
|
||||
unsigned int tune_3partitioning_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The number of trial partitionings per search (-4partitioncandidatelimit).
|
||||
*
|
||||
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
|
||||
*/
|
||||
unsigned int tune_4partitioning_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The dB threshold for stopping block search (-dblimit).
|
||||
*
|
||||
@@ -503,51 +559,54 @@ struct astcenc_config
|
||||
float tune_db_limit;
|
||||
|
||||
/**
|
||||
* @brief The amount of overshoot needed to early-out mode 0 fast path.
|
||||
* @brief The amount of MSE overshoot needed to early-out trials.
|
||||
*
|
||||
* We have a fast-path for mode 0 (1 partition, 1 plane) which uses only essential block modes
|
||||
* as an initital search. This can short-cut compression for simple blocks, but to avoid
|
||||
* shortcutting too much we* force this to overshoot the MSE threshold needed to hit the
|
||||
* block-local db_limit e.g. 1.0 = no overshoot, 2.0 = need half the error to trigger.
|
||||
* The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using
|
||||
* the high probability block modes. This can short-cut compression for simple blocks.
|
||||
*
|
||||
* The second early-out is for refinement trials, where we can exit refinement once quality is
|
||||
* reached.
|
||||
*/
|
||||
float tune_mode0_mse_overshoot;
|
||||
float tune_mse_overshoot;
|
||||
|
||||
/**
|
||||
* @brief The amount of overshoot needed to early-out refinement.
|
||||
*
|
||||
* The codec will refine block candidates iteratively to improve the encoding, based on the
|
||||
* @c tune_refinement_limit count. Earlier implementations will use all refinement iterations,
|
||||
* even if the target threshold is reached. This tuning parameter allows an early out, but with
|
||||
* an overshoot MSE threshold. Setting this to 1.0 will early-out as soon as the target is hit,
|
||||
* but does reduce image quality vs the default behavior of over-refinement.
|
||||
*/
|
||||
float tune_refinement_mse_overshoot;
|
||||
|
||||
/**
|
||||
* @brief The threshold for skipping 2.2/3.1/3.2/4.1 trials (-2partitionlimitfactor).
|
||||
* @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor).
|
||||
*
|
||||
* This option is further scaled for normal maps, so it skips less often.
|
||||
*/
|
||||
float tune_2_partition_early_out_limit_factor;
|
||||
float tune_2partition_early_out_limit_factor;
|
||||
|
||||
/**
|
||||
* @brief The threshold for skipping 3.2/4.1 trials (-3partitionlimitfactor).
|
||||
* @brief The threshold for skipping 4.1 trials (-3partitionlimitfactor).
|
||||
*
|
||||
* This option is further scaled for normal maps, so it skips less often.
|
||||
*/
|
||||
float tune_3_partition_early_out_limit_factor;
|
||||
float tune_3partition_early_out_limit_factor;
|
||||
|
||||
/**
|
||||
* @brief The threshold for skipping two weight planes (-2planelimitcorrelation).
|
||||
*
|
||||
* This option is ineffective for normal maps.
|
||||
*/
|
||||
float tune_2_plane_early_out_limit_correlation;
|
||||
float tune_2plane_early_out_limit_correlation;
|
||||
|
||||
/**
|
||||
* @brief The threshold below which (inclusive) we stop testing low/high/low+high cutoffs.
|
||||
* @brief The config enable for the mode0 fast-path search.
|
||||
*
|
||||
* If this is set to TUNE_MIN_TEXELS_MODE0 or higher then the early-out fast mode0
|
||||
* search is enabled. This option is ineffective for 3D block sizes.
|
||||
*/
|
||||
unsigned int tune_low_weight_count_limit;
|
||||
float tune_search_mode0_enable;
|
||||
|
||||
/**
|
||||
* @brief The progress callback, can be @c nullptr.
|
||||
*
|
||||
* If this is specified the codec will periodically report progress for
|
||||
* compression as a percentage between 0 and 100. The callback is called from one
|
||||
* of the compressor threads, so doing significant work in the callback will
|
||||
* reduce compression performance.
|
||||
*/
|
||||
astcenc_progress_callback progress_callback;
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
/**
|
||||
@@ -811,7 +870,7 @@ ASTCENC_PUBLIC void astcenc_context_free(
|
||||
* advanced content packaging pipelines.
|
||||
*
|
||||
* @param context Codec context.
|
||||
* @param data One block of compressesd ASTC data.
|
||||
* @param data One block of compressed ASTC data.
|
||||
* @param info The output info structure to populate.
|
||||
*
|
||||
* @return @c ASTCENC_SUCCESS if the block was decoded, or an error otherwise. Note that this
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -390,8 +390,6 @@ void compute_avgs_and_dirs_4_comp(
|
||||
const image_block& blk,
|
||||
partition_metrics pm[BLOCK_MAX_PARTITIONS]
|
||||
) {
|
||||
float texel_weight = hadd_s(blk.channel_weight) / 4.0f;
|
||||
|
||||
int partition_count = pi.partition_count;
|
||||
promise(partition_count > 0);
|
||||
|
||||
@@ -434,11 +432,6 @@ void compute_avgs_and_dirs_4_comp(
|
||||
sum_wp += select(zero, texel_datum, tdm3);
|
||||
}
|
||||
|
||||
sum_xp = sum_xp * texel_weight;
|
||||
sum_yp = sum_yp * texel_weight;
|
||||
sum_zp = sum_zp * texel_weight;
|
||||
sum_wp = sum_wp * texel_weight;
|
||||
|
||||
vfloat4 prod_xp = dot(sum_xp, sum_xp);
|
||||
vfloat4 prod_yp = dot(sum_yp, sum_yp);
|
||||
vfloat4 prod_zp = dot(sum_zp, sum_zp);
|
||||
@@ -473,8 +466,6 @@ void compute_avgs_and_dirs_3_comp(
|
||||
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
|
||||
compute_partition_averages_rgba(pi, blk, partition_averages);
|
||||
|
||||
float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
|
||||
|
||||
const float* data_vr = blk.data_r;
|
||||
const float* data_vg = blk.data_g;
|
||||
const float* data_vb = blk.data_b;
|
||||
@@ -482,8 +473,6 @@ void compute_avgs_and_dirs_3_comp(
|
||||
// TODO: Data-driven permute would be useful to avoid this ...
|
||||
if (omitted_component == 0)
|
||||
{
|
||||
texel_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>());
|
||||
|
||||
partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
|
||||
partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
|
||||
partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
|
||||
@@ -495,8 +484,6 @@ void compute_avgs_and_dirs_3_comp(
|
||||
}
|
||||
else if (omitted_component == 1)
|
||||
{
|
||||
texel_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
|
||||
|
||||
partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
|
||||
partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
|
||||
partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
|
||||
@@ -507,8 +494,6 @@ void compute_avgs_and_dirs_3_comp(
|
||||
}
|
||||
else if (omitted_component == 2)
|
||||
{
|
||||
texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
|
||||
|
||||
partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
|
||||
partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
|
||||
partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
|
||||
@@ -524,8 +509,6 @@ void compute_avgs_and_dirs_3_comp(
|
||||
partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
|
||||
}
|
||||
|
||||
texel_weight = texel_weight * (1.0f / 3.0f);
|
||||
|
||||
unsigned int partition_count = pi.partition_count;
|
||||
promise(partition_count > 0);
|
||||
|
||||
@@ -563,10 +546,6 @@ void compute_avgs_and_dirs_3_comp(
|
||||
sum_zp += select(zero, texel_datum, tdm2);
|
||||
}
|
||||
|
||||
sum_xp = sum_xp * texel_weight;
|
||||
sum_yp = sum_yp * texel_weight;
|
||||
sum_zp = sum_zp * texel_weight;
|
||||
|
||||
vfloat4 prod_xp = dot(sum_xp, sum_xp);
|
||||
vfloat4 prod_yp = dot(sum_yp, sum_yp);
|
||||
vfloat4 prod_zp = dot(sum_zp, sum_zp);
|
||||
@@ -591,8 +570,6 @@ void compute_avgs_and_dirs_3_comp_rgb(
|
||||
const image_block& blk,
|
||||
partition_metrics pm[BLOCK_MAX_PARTITIONS]
|
||||
) {
|
||||
float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) * (1.0f / 3.0f);
|
||||
|
||||
unsigned int partition_count = pi.partition_count;
|
||||
promise(partition_count > 0);
|
||||
|
||||
@@ -632,10 +609,6 @@ void compute_avgs_and_dirs_3_comp_rgb(
|
||||
sum_zp += select(zero, texel_datum, tdm2);
|
||||
}
|
||||
|
||||
sum_xp = sum_xp * texel_weight;
|
||||
sum_yp = sum_yp * texel_weight;
|
||||
sum_zp = sum_zp * texel_weight;
|
||||
|
||||
vfloat4 prod_xp = dot(sum_xp, sum_xp);
|
||||
vfloat4 prod_yp = dot(sum_yp, sum_yp);
|
||||
vfloat4 prod_zp = dot(sum_zp, sum_zp);
|
||||
@@ -662,7 +635,6 @@ void compute_avgs_and_dirs_2_comp(
|
||||
unsigned int component2,
|
||||
partition_metrics pm[BLOCK_MAX_PARTITIONS]
|
||||
) {
|
||||
float texel_weight;
|
||||
vfloat4 average;
|
||||
|
||||
const float* data_vr = nullptr;
|
||||
@@ -670,7 +642,6 @@ void compute_avgs_and_dirs_2_comp(
|
||||
|
||||
if (component1 == 0 && component2 == 1)
|
||||
{
|
||||
texel_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
|
||||
average = blk.data_mean.swz<0, 1>();
|
||||
|
||||
data_vr = blk.data_r;
|
||||
@@ -678,7 +649,6 @@ void compute_avgs_and_dirs_2_comp(
|
||||
}
|
||||
else if (component1 == 0 && component2 == 2)
|
||||
{
|
||||
texel_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
|
||||
average = blk.data_mean.swz<0, 2>();
|
||||
|
||||
data_vr = blk.data_r;
|
||||
@@ -688,7 +658,6 @@ void compute_avgs_and_dirs_2_comp(
|
||||
{
|
||||
assert(component1 == 1 && component2 == 2);
|
||||
|
||||
texel_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
|
||||
average = blk.data_mean.swz<1, 2>();
|
||||
|
||||
data_vr = blk.data_g;
|
||||
@@ -714,7 +683,7 @@ void compute_avgs_and_dirs_2_comp(
|
||||
average += vfloat2(data_vr[iwt], data_vg[iwt]);
|
||||
}
|
||||
|
||||
average = average * (1.0f / static_cast<float>(texel_count));
|
||||
average = average / static_cast<float>(texel_count);
|
||||
}
|
||||
|
||||
pm[partition].avg = average;
|
||||
@@ -737,9 +706,6 @@ void compute_avgs_and_dirs_2_comp(
|
||||
sum_yp += select(zero, texel_datum, tdm1);
|
||||
}
|
||||
|
||||
sum_xp = sum_xp * texel_weight;
|
||||
sum_yp = sum_yp * texel_weight;
|
||||
|
||||
vfloat4 prod_xp = dot(sum_xp, sum_xp);
|
||||
vfloat4 prod_yp = dot(sum_yp, sum_yp);
|
||||
|
||||
@@ -759,8 +725,7 @@ void compute_error_squared_rgba(
|
||||
const image_block& blk,
|
||||
const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
|
||||
const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
|
||||
float uncor_lengths[BLOCK_MAX_PARTITIONS],
|
||||
float samec_lengths[BLOCK_MAX_PARTITIONS],
|
||||
float line_lengths[BLOCK_MAX_PARTITIONS],
|
||||
float& uncor_error,
|
||||
float& samec_error
|
||||
) {
|
||||
@@ -774,12 +739,6 @@ void compute_error_squared_rgba(
|
||||
{
|
||||
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
|
||||
|
||||
float uncor_loparam = 1e10f;
|
||||
float uncor_hiparam = -1e10f;
|
||||
|
||||
float samec_loparam = 1e10f;
|
||||
float samec_hiparam = -1e10f;
|
||||
|
||||
processed_line4 l_uncor = uncor_plines[partition];
|
||||
processed_line4 l_samec = samec_plines[partition];
|
||||
|
||||
@@ -807,9 +766,6 @@ void compute_error_squared_rgba(
|
||||
vfloat uncor_loparamv(1e10f);
|
||||
vfloat uncor_hiparamv(-1e10f);
|
||||
|
||||
vfloat samec_loparamv(1e10f);
|
||||
vfloat samec_hiparamv(-1e10f);
|
||||
|
||||
vfloat ew_r(blk.channel_weight.lane<0>());
|
||||
vfloat ew_g(blk.channel_weight.lane<1>());
|
||||
vfloat ew_b(blk.channel_weight.lane<2>());
|
||||
@@ -822,17 +778,17 @@ void compute_error_squared_rgba(
|
||||
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vmask mask = lane_ids < vint(texel_count);
|
||||
vint texel_idxs(&(texel_indexes[i]));
|
||||
vint texel_idxs(texel_indexes + i);
|
||||
|
||||
vfloat data_r = gatherf(blk.data_r, texel_idxs);
|
||||
vfloat data_g = gatherf(blk.data_g, texel_idxs);
|
||||
vfloat data_b = gatherf(blk.data_b, texel_idxs);
|
||||
vfloat data_a = gatherf(blk.data_a, texel_idxs);
|
||||
|
||||
vfloat uncor_param = (data_r * l_uncor_bs0)
|
||||
+ (data_g * l_uncor_bs1)
|
||||
+ (data_b * l_uncor_bs2)
|
||||
+ (data_a * l_uncor_bs3);
|
||||
vfloat uncor_param = (data_r * l_uncor_bs0)
|
||||
+ (data_g * l_uncor_bs1)
|
||||
+ (data_b * l_uncor_bs2)
|
||||
+ (data_a * l_uncor_bs3);
|
||||
|
||||
uncor_loparamv = min(uncor_param, uncor_loparamv);
|
||||
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
|
||||
@@ -859,9 +815,6 @@ void compute_error_squared_rgba(
|
||||
+ (data_b * l_samec_bs2)
|
||||
+ (data_a * l_samec_bs3);
|
||||
|
||||
samec_loparamv = min(samec_param, samec_loparamv);
|
||||
samec_hiparamv = max(samec_param, samec_hiparamv);
|
||||
|
||||
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
|
||||
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
|
||||
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
|
||||
@@ -877,18 +830,9 @@ void compute_error_squared_rgba(
|
||||
lane_ids += vint(ASTCENC_SIMD_WIDTH);
|
||||
}
|
||||
|
||||
uncor_loparam = hmin_s(uncor_loparamv);
|
||||
uncor_hiparam = hmax_s(uncor_hiparamv);
|
||||
|
||||
samec_loparam = hmin_s(samec_loparamv);
|
||||
samec_hiparam = hmax_s(samec_hiparamv);
|
||||
|
||||
float uncor_linelen = uncor_hiparam - uncor_loparam;
|
||||
float samec_linelen = samec_hiparam - samec_loparam;
|
||||
|
||||
// Turn very small numbers and NaNs into a small number
|
||||
uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
|
||||
samec_lengths[partition] = astc::max(samec_linelen, 1e-7f);
|
||||
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
|
||||
line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
|
||||
}
|
||||
|
||||
uncor_error = hadd_s(uncor_errorsumv);
|
||||
@@ -916,19 +860,9 @@ void compute_error_squared_rgb(
|
||||
unsigned int texel_count = pi.partition_texel_count[partition];
|
||||
promise(texel_count > 0);
|
||||
|
||||
float uncor_loparam = 1e10f;
|
||||
float uncor_hiparam = -1e10f;
|
||||
|
||||
float samec_loparam = 1e10f;
|
||||
float samec_hiparam = -1e10f;
|
||||
|
||||
processed_line3 l_uncor = pl.uncor_pline;
|
||||
processed_line3 l_samec = pl.samec_pline;
|
||||
|
||||
// This implementation is an example vectorization of this function.
|
||||
// It works for - the codec is a 2-4% faster than not vectorizing - but
|
||||
// the benefit is limited by the use of gathers and register pressure
|
||||
|
||||
// Vectorize some useful scalar inputs
|
||||
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
|
||||
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
|
||||
@@ -947,9 +881,6 @@ void compute_error_squared_rgb(
|
||||
vfloat uncor_loparamv(1e10f);
|
||||
vfloat uncor_hiparamv(-1e10f);
|
||||
|
||||
vfloat samec_loparamv(1e10f);
|
||||
vfloat samec_hiparamv(-1e10f);
|
||||
|
||||
vfloat ew_r(blk.channel_weight.lane<0>());
|
||||
vfloat ew_g(blk.channel_weight.lane<1>());
|
||||
vfloat ew_b(blk.channel_weight.lane<2>());
|
||||
@@ -961,15 +892,15 @@ void compute_error_squared_rgb(
|
||||
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vmask mask = lane_ids < vint(texel_count);
|
||||
vint texel_idxs(&(texel_indexes[i]));
|
||||
vint texel_idxs(texel_indexes + i);
|
||||
|
||||
vfloat data_r = gatherf(blk.data_r, texel_idxs);
|
||||
vfloat data_g = gatherf(blk.data_g, texel_idxs);
|
||||
vfloat data_b = gatherf(blk.data_b, texel_idxs);
|
||||
|
||||
vfloat uncor_param = (data_r * l_uncor_bs0)
|
||||
+ (data_g * l_uncor_bs1)
|
||||
+ (data_b * l_uncor_bs2);
|
||||
vfloat uncor_param = (data_r * l_uncor_bs0)
|
||||
+ (data_g * l_uncor_bs1)
|
||||
+ (data_b * l_uncor_bs2);
|
||||
|
||||
uncor_loparamv = min(uncor_param, uncor_loparamv);
|
||||
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
|
||||
@@ -992,9 +923,6 @@ void compute_error_squared_rgb(
|
||||
+ (data_g * l_samec_bs1)
|
||||
+ (data_b * l_samec_bs2);
|
||||
|
||||
samec_loparamv = min(samec_param, samec_loparamv);
|
||||
samec_hiparamv = max(samec_param, samec_hiparamv);
|
||||
|
||||
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
|
||||
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
|
||||
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
|
||||
@@ -1008,18 +936,9 @@ void compute_error_squared_rgb(
|
||||
lane_ids += vint(ASTCENC_SIMD_WIDTH);
|
||||
}
|
||||
|
||||
uncor_loparam = hmin_s(uncor_loparamv);
|
||||
uncor_hiparam = hmax_s(uncor_hiparamv);
|
||||
|
||||
samec_loparam = hmin_s(samec_loparamv);
|
||||
samec_hiparam = hmax_s(samec_hiparamv);
|
||||
|
||||
float uncor_linelen = uncor_hiparam - uncor_loparam;
|
||||
float samec_linelen = samec_hiparam - samec_loparam;
|
||||
|
||||
// Turn very small numbers and NaNs into a small number
|
||||
pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f);
|
||||
pl.samec_line_len = astc::max(samec_linelen, 1e-7f);
|
||||
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
|
||||
pl.line_length = astc::max(uncor_linelen, 1e-7f);
|
||||
}
|
||||
|
||||
uncor_error = hadd_s(uncor_errorsumv);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -330,17 +330,17 @@ static void init_decimation_info_2d(
|
||||
|
||||
for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
|
||||
{
|
||||
di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j];
|
||||
di.texel_weights_float_4t[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
|
||||
di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j];
|
||||
di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j];
|
||||
di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
|
||||
di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j];
|
||||
}
|
||||
|
||||
// Init all 4 entries so we can rely on zeros for vectorization
|
||||
for (unsigned int j = wb.weight_count_of_texel[i]; j < 4; j++)
|
||||
{
|
||||
di.texel_weights_int_4t[j][i] = 0;
|
||||
di.texel_weights_float_4t[j][i] = 0.0f;
|
||||
di.texel_weights_4t[j][i] = 0;
|
||||
di.texel_weight_contribs_int_tr[j][i] = 0;
|
||||
di.texel_weight_contribs_float_tr[j][i] = 0.0f;
|
||||
di.texel_weights_tr[j][i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -356,43 +356,30 @@ static void init_decimation_info_2d(
|
||||
uint8_t texel = wb.texels_of_weight[i][j];
|
||||
|
||||
// Create transposed versions of these for better vectorization
|
||||
di.weight_texel[j][i] = texel;
|
||||
di.weights_flt[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
|
||||
di.weight_texels_tr[j][i] = texel;
|
||||
di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
|
||||
|
||||
// perform a layer of array unrolling. An aspect of this unrolling is that
|
||||
// one of the texel-weight indexes is an identity-mapped index; we will use this
|
||||
// fact to reorder the indexes so that the first one is the identity index.
|
||||
int swap_idx = -1;
|
||||
// Store the per-texel contribution of this weight for each texel it contributes to
|
||||
di.texel_contrib_for_weight[j][i] = 0.0f;
|
||||
for (unsigned int k = 0; k < 4; k++)
|
||||
{
|
||||
uint8_t dttw = di.texel_weights_4t[k][texel];
|
||||
float dttwf = di.texel_weights_float_4t[k][texel];
|
||||
uint8_t dttw = di.texel_weights_tr[k][texel];
|
||||
float dttwf = di.texel_weight_contribs_float_tr[k][texel];
|
||||
if (dttw == i && dttwf != 0.0f)
|
||||
{
|
||||
swap_idx = k;
|
||||
di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel];
|
||||
break;
|
||||
}
|
||||
di.texel_weights_texel[i][j][k] = dttw;
|
||||
di.texel_weights_float_texel[i][j][k] = dttwf;
|
||||
}
|
||||
|
||||
if (swap_idx != 0)
|
||||
{
|
||||
uint8_t vi = di.texel_weights_texel[i][j][0];
|
||||
float vf = di.texel_weights_float_texel[i][j][0];
|
||||
di.texel_weights_texel[i][j][0] = di.texel_weights_texel[i][j][swap_idx];
|
||||
di.texel_weights_float_texel[i][j][0] = di.texel_weights_float_texel[i][j][swap_idx];
|
||||
di.texel_weights_texel[i][j][swap_idx] = vi;
|
||||
di.texel_weights_float_texel[i][j][swap_idx] = vf;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
|
||||
// Match last texel in active lane in SIMD group, for better gathers
|
||||
uint8_t last_texel = di.weight_texel[texel_count_wt - 1][i];
|
||||
uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i];
|
||||
for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
|
||||
{
|
||||
di.weight_texel[j][i] = last_texel;
|
||||
di.weights_flt[j][i] = 0.0f;
|
||||
di.weight_texels_tr[j][i] = last_texel;
|
||||
di.weights_texel_contribs_tr[j][i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -404,16 +391,16 @@ static void init_decimation_info_2d(
|
||||
|
||||
for (unsigned int j = 0; j < 4; j++)
|
||||
{
|
||||
di.texel_weights_float_4t[j][i] = 0;
|
||||
di.texel_weights_4t[j][i] = 0;
|
||||
di.texel_weights_int_4t[j][i] = 0;
|
||||
di.texel_weight_contribs_float_tr[j][i] = 0;
|
||||
di.texel_weights_tr[j][i] = 0;
|
||||
di.texel_weight_contribs_int_tr[j][i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
|
||||
// Match last texel in active lane in SIMD group, for better gathers
|
||||
unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
|
||||
uint8_t last_texel = di.weight_texel[last_texel_count_wt - 1][weights_per_block - 1];
|
||||
uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
|
||||
|
||||
unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
|
||||
for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
|
||||
@@ -422,8 +409,8 @@ static void init_decimation_info_2d(
|
||||
|
||||
for (unsigned int j = 0; j < max_texel_count_of_weight; j++)
|
||||
{
|
||||
di.weight_texel[j][i] = last_texel;
|
||||
di.weights_flt[j][i] = 0.0f;
|
||||
di.weight_texels_tr[j][i] = last_texel;
|
||||
di.weights_texel_contribs_tr[j][i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -600,16 +587,16 @@ static void init_decimation_info_3d(
|
||||
// Init all 4 entries so we can rely on zeros for vectorization
|
||||
for (unsigned int j = 0; j < 4; j++)
|
||||
{
|
||||
di.texel_weights_int_4t[j][i] = 0;
|
||||
di.texel_weights_float_4t[j][i] = 0.0f;
|
||||
di.texel_weights_4t[j][i] = 0;
|
||||
di.texel_weight_contribs_int_tr[j][i] = 0;
|
||||
di.texel_weight_contribs_float_tr[j][i] = 0.0f;
|
||||
di.texel_weights_tr[j][i] = 0;
|
||||
}
|
||||
|
||||
for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
|
||||
{
|
||||
di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j];
|
||||
di.texel_weights_float_4t[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
|
||||
di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j];
|
||||
di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j];
|
||||
di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
|
||||
di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -625,43 +612,30 @@ static void init_decimation_info_3d(
|
||||
unsigned int texel = wb.texels_of_weight[i][j];
|
||||
|
||||
// Create transposed versions of these for better vectorization
|
||||
di.weight_texel[j][i] = static_cast<uint8_t>(texel);
|
||||
di.weights_flt[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
|
||||
di.weight_texels_tr[j][i] = static_cast<uint8_t>(texel);
|
||||
di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
|
||||
|
||||
// perform a layer of array unrolling. An aspect of this unrolling is that
|
||||
// one of the texel-weight indexes is an identity-mapped index; we will use this
|
||||
// fact to reorder the indexes so that the first one is the identity index.
|
||||
int swap_idx = -1;
|
||||
// Store the per-texel contribution of this weight for each texel it contributes to
|
||||
di.texel_contrib_for_weight[j][i] = 0.0f;
|
||||
for (unsigned int k = 0; k < 4; k++)
|
||||
{
|
||||
uint8_t dttw = di.texel_weights_4t[k][texel];
|
||||
float dttwf = di.texel_weights_float_4t[k][texel];
|
||||
uint8_t dttw = di.texel_weights_tr[k][texel];
|
||||
float dttwf = di.texel_weight_contribs_float_tr[k][texel];
|
||||
if (dttw == i && dttwf != 0.0f)
|
||||
{
|
||||
swap_idx = k;
|
||||
di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel];
|
||||
break;
|
||||
}
|
||||
di.texel_weights_texel[i][j][k] = dttw;
|
||||
di.texel_weights_float_texel[i][j][k] = dttwf;
|
||||
}
|
||||
|
||||
if (swap_idx != 0)
|
||||
{
|
||||
uint8_t vi = di.texel_weights_texel[i][j][0];
|
||||
float vf = di.texel_weights_float_texel[i][j][0];
|
||||
di.texel_weights_texel[i][j][0] = di.texel_weights_texel[i][j][swap_idx];
|
||||
di.texel_weights_float_texel[i][j][0] = di.texel_weights_float_texel[i][j][swap_idx];
|
||||
di.texel_weights_texel[i][j][swap_idx] = vi;
|
||||
di.texel_weights_float_texel[i][j][swap_idx] = vf;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
|
||||
// Match last texel in active lane in SIMD group, for better gathers
|
||||
uint8_t last_texel = di.weight_texel[texel_count_wt - 1][i];
|
||||
uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i];
|
||||
for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
|
||||
{
|
||||
di.weight_texel[j][i] = last_texel;
|
||||
di.weights_flt[j][i] = 0.0f;
|
||||
di.weight_texels_tr[j][i] = last_texel;
|
||||
di.weights_texel_contribs_tr[j][i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -673,16 +647,16 @@ static void init_decimation_info_3d(
|
||||
|
||||
for (unsigned int j = 0; j < 4; j++)
|
||||
{
|
||||
di.texel_weights_float_4t[j][i] = 0;
|
||||
di.texel_weights_4t[j][i] = 0;
|
||||
di.texel_weights_int_4t[j][i] = 0;
|
||||
di.texel_weight_contribs_float_tr[j][i] = 0;
|
||||
di.texel_weights_tr[j][i] = 0;
|
||||
di.texel_weight_contribs_int_tr[j][i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
|
||||
// Match last texel in active lane in SIMD group, for better gathers
|
||||
int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
|
||||
uint8_t last_texel = di.weight_texel[last_texel_count_wt - 1][weights_per_block - 1];
|
||||
uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
|
||||
|
||||
unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
|
||||
for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
|
||||
@@ -691,8 +665,8 @@ static void init_decimation_info_3d(
|
||||
|
||||
for (int j = 0; j < max_texel_count_of_weight; j++)
|
||||
{
|
||||
di.weight_texel[j][i] = last_texel;
|
||||
di.weights_flt[j][i] = 0.0f;
|
||||
di.weight_texels_tr[j][i] = last_texel;
|
||||
di.weights_texel_contribs_tr[j][i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -802,8 +776,8 @@ static void construct_dt_entry_2d(
|
||||
assert(maxprec_1plane >= 0 || maxprec_2planes >= 0);
|
||||
bsd.decimation_modes[index].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
|
||||
bsd.decimation_modes[index].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
|
||||
bsd.decimation_modes[index].ref_1_plane = 0;
|
||||
bsd.decimation_modes[index].ref_2_planes = 0;
|
||||
bsd.decimation_modes[index].refprec_1plane = 0;
|
||||
bsd.decimation_modes[index].refprec_2planes = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -957,16 +931,6 @@ static void construct_block_size_descriptor_2d(
|
||||
}
|
||||
|
||||
auto& bm = bsd.block_modes[packed_bm_idx];
|
||||
auto& dm = bsd.decimation_modes[decimation_mode];
|
||||
|
||||
if (is_dual_plane)
|
||||
{
|
||||
dm.ref_2_planes = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
dm.ref_1_plane = 1;
|
||||
}
|
||||
|
||||
bm.decimation_mode = static_cast<uint8_t>(decimation_mode);
|
||||
bm.quant_mode = static_cast<uint8_t>(quant_mode);
|
||||
@@ -974,6 +938,17 @@ static void construct_block_size_descriptor_2d(
|
||||
bm.weight_bits = static_cast<uint8_t>(weight_bits);
|
||||
bm.mode_index = static_cast<uint16_t>(i);
|
||||
|
||||
auto& dm = bsd.decimation_modes[decimation_mode];
|
||||
|
||||
if (is_dual_plane)
|
||||
{
|
||||
dm.set_ref_2plane(bm.get_weight_quant_mode());
|
||||
}
|
||||
else
|
||||
{
|
||||
dm.set_ref_1plane(bm.get_weight_quant_mode());
|
||||
}
|
||||
|
||||
bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_bm_idx);
|
||||
|
||||
packed_bm_idx++;
|
||||
@@ -1002,8 +977,8 @@ static void construct_block_size_descriptor_2d(
|
||||
{
|
||||
bsd.decimation_modes[i].maxprec_1plane = -1;
|
||||
bsd.decimation_modes[i].maxprec_2planes = -1;
|
||||
bsd.decimation_modes[i].ref_1_plane = 0;
|
||||
bsd.decimation_modes[i].ref_2_planes = 0;
|
||||
bsd.decimation_modes[i].refprec_1plane = 0;
|
||||
bsd.decimation_modes[i].refprec_2planes = 0;
|
||||
}
|
||||
|
||||
// Determine the texels to use for kmeans clustering.
|
||||
@@ -1013,7 +988,7 @@ static void construct_block_size_descriptor_2d(
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Allocate block modes and decimation tables for a single £D block size.
|
||||
* @brief Allocate block modes and decimation tables for a single 3D block size.
|
||||
*
|
||||
* TODO: This function doesn't include all of the heuristics that we use for 2D block sizes such as
|
||||
* the percentile mode cutoffs. If 3D becomes more widely used we should look at this.
|
||||
@@ -1088,8 +1063,8 @@ static void construct_block_size_descriptor_3d(
|
||||
|
||||
bsd.decimation_modes[decimation_mode_count].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
|
||||
bsd.decimation_modes[decimation_mode_count].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
|
||||
bsd.decimation_modes[decimation_mode_count].ref_1_plane = maxprec_1plane == -1 ? 0 : 1;
|
||||
bsd.decimation_modes[decimation_mode_count].ref_2_planes = maxprec_2planes == -1 ? 0 : 1;
|
||||
bsd.decimation_modes[decimation_mode_count].refprec_1plane = maxprec_1plane == -1 ? 0 : 0xFFFF;
|
||||
bsd.decimation_modes[decimation_mode_count].refprec_2planes = maxprec_2planes == -1 ? 0 : 0xFFFF;
|
||||
decimation_mode_count++;
|
||||
}
|
||||
}
|
||||
@@ -1100,15 +1075,14 @@ static void construct_block_size_descriptor_3d(
|
||||
{
|
||||
bsd.decimation_modes[i].maxprec_1plane = -1;
|
||||
bsd.decimation_modes[i].maxprec_2planes = -1;
|
||||
bsd.decimation_modes[i].ref_1_plane = 0;
|
||||
bsd.decimation_modes[i].ref_2_planes = 0;
|
||||
bsd.decimation_modes[i].refprec_1plane = 0;
|
||||
bsd.decimation_modes[i].refprec_2planes = 0;
|
||||
}
|
||||
|
||||
bsd.decimation_mode_count_always = 0; // Skipped for 3D modes
|
||||
bsd.decimation_mode_count_selected = decimation_mode_count;
|
||||
bsd.decimation_mode_count_all = decimation_mode_count;
|
||||
|
||||
// Construct the list of block formats
|
||||
// Construct the list of block formats referencing the decimation tables
|
||||
|
||||
// Clear the list to a known-bad value
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -23,43 +23,6 @@
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
/**
|
||||
* @brief Unquantize a color.
|
||||
*
|
||||
* This function uses a lookup table as the quantization is encoded to make
|
||||
* hardware implementations easier, and is not a simple lerp.
|
||||
*
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param inputq The input quantized color.
|
||||
*
|
||||
* @return The unquantized color.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 unquant_color(
|
||||
quant_method quant_level,
|
||||
vint4 inputq
|
||||
) {
|
||||
const uint8_t* unq = color_unquant_tables[quant_level - QUANT_6];
|
||||
return vint4(unq[inputq.lane<0>()], unq[inputq.lane<1>()],
|
||||
unq[inputq.lane<2>()], unq[inputq.lane<3>()]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Determine the quantized value given a quantization level.
|
||||
*
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param value The value to convert. This may be outside of the 0-255 range and will be
|
||||
* clamped before the value is looked up.
|
||||
*
|
||||
* @return The encoded quantized value. These are not necessarily in the order; the compressor
|
||||
* scrambles the values slightly to make hardware implementation easier.
|
||||
*/
|
||||
static inline int unquant_color(
|
||||
quant_method quant_level,
|
||||
int value
|
||||
) {
|
||||
return color_unquant_tables[quant_level - QUANT_6][value];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Un-blue-contract a color.
|
||||
*
|
||||
@@ -77,35 +40,14 @@ static ASTCENC_SIMD_INLINE vint4 uncontract_color(
|
||||
return select(input, bc0, mask);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR RGBA color that uses delta encoding.
|
||||
*
|
||||
* @param input0q The raw quantized endpoint 0 color.
|
||||
* @param input1q The raw quantized endpoint 1 color deltas.
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
*/
|
||||
static void rgba_delta_unpack(
|
||||
vint4 input0q,
|
||||
vint4 input1q,
|
||||
quant_method quant_level,
|
||||
void rgba_delta_unpack(
|
||||
vint4 input0,
|
||||
vint4 input1,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
// Unquantize color endpoints
|
||||
vint4 input0 = unquant_color(quant_level, input0q);
|
||||
vint4 input1 = unquant_color(quant_level, input1q);
|
||||
|
||||
// Perform bit-transfer
|
||||
input0 = input0 | lsl<1>(input1 & 0x80);
|
||||
input1 = input1 & 0x7F;
|
||||
vmask4 mask = (input1 & 0x40) != vint4::zero();
|
||||
input1 = select(input1, input1 - 0x80, mask);
|
||||
|
||||
// Scale
|
||||
input0 = asr<1>(input0);
|
||||
input1 = asr<1>(input1);
|
||||
// Apply bit transfer
|
||||
bit_transfer_signed(input1, input0);
|
||||
|
||||
// Apply blue-uncontraction if needed
|
||||
int rgb_sum = hadd_rgb_s(input1);
|
||||
@@ -126,44 +68,28 @@ static void rgba_delta_unpack(
|
||||
*
|
||||
* Output alpha set to 255.
|
||||
*
|
||||
* @param input0q The raw quantized endpoint 0 color.
|
||||
* @param input1q The raw quantized endpoint 1 color deltas.
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param input1 The packed endpoint 1 color deltas.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void rgb_delta_unpack(
|
||||
vint4 input0q,
|
||||
vint4 input1q,
|
||||
quant_method quant_level,
|
||||
vint4 input0,
|
||||
vint4 input1,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
rgba_delta_unpack(input0q, input1q, quant_level, output0, output1);
|
||||
rgba_delta_unpack(input0, input1, output0, output1);
|
||||
output0.set_lane<3>(255);
|
||||
output1.set_lane<3>(255);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR RGBA color that uses direct encoding.
|
||||
*
|
||||
* @param input0q The raw quantized endpoint 0 color.
|
||||
* @param input1q The raw quantized endpoint 1 color.
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
*/
|
||||
static void rgba_unpack(
|
||||
vint4 input0q,
|
||||
vint4 input1q,
|
||||
quant_method quant_level,
|
||||
void rgba_unpack(
|
||||
vint4 input0,
|
||||
vint4 input1,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
// Unquantize color endpoints
|
||||
vint4 input0 = unquant_color(quant_level, input0q);
|
||||
vint4 input1 = unquant_color(quant_level, input1q);
|
||||
|
||||
// Apply blue-uncontraction if needed
|
||||
if (hadd_rgb_s(input0) > hadd_rgb_s(input1))
|
||||
{
|
||||
@@ -181,20 +107,18 @@ static void rgba_unpack(
|
||||
*
|
||||
* Output alpha set to 255.
|
||||
*
|
||||
* @param input0q The raw quantized endpoint 0 color.
|
||||
* @param input1q The raw quantized endpoint 1 color.
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param input1 The packed endpoint 1 color.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void rgb_unpack(
|
||||
vint4 input0q,
|
||||
vint4 input1q,
|
||||
quant_method quant_level,
|
||||
vint4 input0,
|
||||
vint4 input1,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
rgba_unpack(input0q, input1q, quant_level, output0, output1);
|
||||
rgba_unpack(input0, input1, output0, output1);
|
||||
output0.set_lane<3>(255);
|
||||
output1.set_lane<3>(255);
|
||||
}
|
||||
@@ -204,31 +128,24 @@ static void rgb_unpack(
|
||||
*
|
||||
* Note only the RGB channels use the scaled encoding, alpha uses direct.
|
||||
*
|
||||
* @param input0q The raw quantized endpoint 0 color.
|
||||
* @param alpha1q The raw quantized endpoint 1 alpha value.
|
||||
* @param scaleq The raw quantized scale.
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param alpha1 The packed endpoint 1 alpha value.
|
||||
* @param scale The packed quantized scale.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void rgb_scale_alpha_unpack(
|
||||
vint4 input0q,
|
||||
uint8_t alpha1q,
|
||||
uint8_t scaleq,
|
||||
quant_method quant_level,
|
||||
vint4 input0,
|
||||
uint8_t alpha1,
|
||||
uint8_t scale,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
// Unquantize color endpoints
|
||||
vint4 input = unquant_color(quant_level, input0q);
|
||||
uint8_t alpha1 = unquant_color(quant_level, alpha1q);
|
||||
uint8_t scale = unquant_color(quant_level, scaleq);
|
||||
|
||||
output1 = input;
|
||||
output1 = input0;
|
||||
output1.set_lane<3>(alpha1);
|
||||
|
||||
output0 = asr<8>(input * scale);
|
||||
output0.set_lane<3>(input.lane<3>());
|
||||
output0 = asr<8>(input0 * scale);
|
||||
output0.set_lane<3>(input0.lane<3>());
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -236,26 +153,21 @@ static void rgb_scale_alpha_unpack(
|
||||
*
|
||||
* Output alpha is 255.
|
||||
*
|
||||
* @param input0q The raw quantized endpoint 0 color.
|
||||
* @param scaleq The raw quantized scale.
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param scale The packed scale.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void rgb_scale_unpack(
|
||||
vint4 input0q,
|
||||
int scaleq,
|
||||
quant_method quant_level,
|
||||
vint4 input0,
|
||||
int scale,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
vint4 input = unquant_color(quant_level, input0q);
|
||||
int scale = unquant_color(quant_level, scaleq);
|
||||
|
||||
output1 = input;
|
||||
output1 = input0;
|
||||
output1.set_lane<3>(255);
|
||||
|
||||
output0 = asr<8>(input * scale);
|
||||
output0 = asr<8>(input0 * scale);
|
||||
output0.set_lane<3>(255);
|
||||
}
|
||||
|
||||
@@ -264,19 +176,17 @@ static void rgb_scale_unpack(
|
||||
*
|
||||
* Output alpha is 255.
|
||||
*
|
||||
* @param input The raw quantized endpoints.
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input The packed endpoints.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void luminance_unpack(
|
||||
const uint8_t input[2],
|
||||
quant_method quant_level,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int lum0 = unquant_color(quant_level, input[0]);
|
||||
int lum1 = unquant_color(quant_level, input[1]);
|
||||
int lum0 = input[0];
|
||||
int lum1 = input[1];
|
||||
output0 = vint4(lum0, lum0, lum0, 255);
|
||||
output1 = vint4(lum1, lum1, lum1, 255);
|
||||
}
|
||||
@@ -286,19 +196,17 @@ static void luminance_unpack(
|
||||
*
|
||||
* Output alpha is 255.
|
||||
*
|
||||
* @param input The raw quantized endpoints (L0, L1).
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input The packed endpoints (L0, L1).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void luminance_delta_unpack(
|
||||
const uint8_t input[2],
|
||||
quant_method quant_level,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int v0 = unquant_color(quant_level, input[0]);
|
||||
int v1 = unquant_color(quant_level, input[1]);
|
||||
int v0 = input[0];
|
||||
int v1 = input[1];
|
||||
int l0 = (v0 >> 2) | (v1 & 0xC0);
|
||||
int l1 = l0 + (v1 & 0x3F);
|
||||
|
||||
@@ -311,21 +219,19 @@ static void luminance_delta_unpack(
|
||||
/**
|
||||
* @brief Unpack an LDR LA color that uses direct encoding.
|
||||
*
|
||||
* @param input The raw quantized endpoints (L0, L1, A0, A1).
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input The packed endpoints (L0, L1, A0, A1).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void luminance_alpha_unpack(
|
||||
const uint8_t input[4],
|
||||
quant_method quant_level,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int lum0 = unquant_color(quant_level, input[0]);
|
||||
int lum1 = unquant_color(quant_level, input[1]);
|
||||
int alpha0 = unquant_color(quant_level, input[2]);
|
||||
int alpha1 = unquant_color(quant_level, input[3]);
|
||||
int lum0 = input[0];
|
||||
int lum1 = input[1];
|
||||
int alpha0 = input[2];
|
||||
int alpha1 = input[3];
|
||||
output0 = vint4(lum0, lum0, lum0, alpha0);
|
||||
output1 = vint4(lum1, lum1, lum1, alpha1);
|
||||
}
|
||||
@@ -333,30 +239,34 @@ static void luminance_alpha_unpack(
|
||||
/**
|
||||
* @brief Unpack an LDR LA color that uses delta encoding.
|
||||
*
|
||||
* @param input The raw quantized endpoints (L0, L1, A0, A1).
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input The packed endpoints (L0, L1, A0, A1).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void luminance_alpha_delta_unpack(
|
||||
const uint8_t input[4],
|
||||
quant_method quant_level,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int lum0 = unquant_color(quant_level, input[0]);
|
||||
int lum1 = unquant_color(quant_level, input[1]);
|
||||
int alpha0 = unquant_color(quant_level, input[2]);
|
||||
int alpha1 = unquant_color(quant_level, input[3]);
|
||||
int lum0 = input[0];
|
||||
int lum1 = input[1];
|
||||
int alpha0 = input[2];
|
||||
int alpha1 = input[3];
|
||||
|
||||
lum0 |= (lum1 & 0x80) << 1;
|
||||
alpha0 |= (alpha1 & 0x80) << 1;
|
||||
lum1 &= 0x7F;
|
||||
alpha1 &= 0x7F;
|
||||
|
||||
if (lum1 & 0x40)
|
||||
{
|
||||
lum1 -= 0x80;
|
||||
}
|
||||
|
||||
if (alpha1 & 0x40)
|
||||
{
|
||||
alpha1 -= 0x80;
|
||||
}
|
||||
|
||||
lum0 >>= 1;
|
||||
lum1 >>= 1;
|
||||
@@ -375,21 +285,19 @@ static void luminance_alpha_delta_unpack(
|
||||
/**
|
||||
* @brief Unpack an HDR RGB + offset encoding.
|
||||
*
|
||||
* @param input The raw quantized endpoints (packed and modal).
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_rgbo_unpack(
|
||||
const uint8_t input[4],
|
||||
quant_method quant_level,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int v0 = unquant_color(quant_level, input[0]);
|
||||
int v1 = unquant_color(quant_level, input[1]);
|
||||
int v2 = unquant_color(quant_level, input[2]);
|
||||
int v3 = unquant_color(quant_level, input[3]);
|
||||
int v0 = input[0];
|
||||
int v1 = input[1];
|
||||
int v2 = input[2];
|
||||
int v3 = input[3];
|
||||
|
||||
int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3);
|
||||
|
||||
@@ -527,24 +435,22 @@ static void hdr_rgbo_unpack(
|
||||
/**
|
||||
* @brief Unpack an HDR RGB direct encoding.
|
||||
*
|
||||
* @param input The raw quantized endpoints (packed and modal).
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_rgb_unpack(
|
||||
const uint8_t input[6],
|
||||
quant_method quant_level,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
|
||||
int v0 = unquant_color(quant_level, input[0]);
|
||||
int v1 = unquant_color(quant_level, input[1]);
|
||||
int v2 = unquant_color(quant_level, input[2]);
|
||||
int v3 = unquant_color(quant_level, input[3]);
|
||||
int v4 = unquant_color(quant_level, input[4]);
|
||||
int v5 = unquant_color(quant_level, input[5]);
|
||||
int v0 = input[0];
|
||||
int v1 = input[1];
|
||||
int v2 = input[2];
|
||||
int v3 = input[3];
|
||||
int v4 = input[4];
|
||||
int v5 = input[5];
|
||||
|
||||
// extract all the fixed-placement bitfields
|
||||
int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2);
|
||||
@@ -695,21 +601,19 @@ static void hdr_rgb_unpack(
|
||||
/**
|
||||
* @brief Unpack an HDR RGB + LDR A direct encoding.
|
||||
*
|
||||
* @param input The raw quantized endpoints (packed and modal).
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_rgb_ldr_alpha_unpack(
|
||||
const uint8_t input[8],
|
||||
quant_method quant_level,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
hdr_rgb_unpack(input, quant_level, output0, output1);
|
||||
hdr_rgb_unpack(input, output0, output1);
|
||||
|
||||
int v6 = unquant_color(quant_level, input[6]);
|
||||
int v7 = unquant_color(quant_level, input[7]);
|
||||
int v6 = input[6];
|
||||
int v7 = input[7];
|
||||
output0.set_lane<3>(v6);
|
||||
output1.set_lane<3>(v7);
|
||||
}
|
||||
@@ -717,19 +621,17 @@ static void hdr_rgb_ldr_alpha_unpack(
|
||||
/**
|
||||
* @brief Unpack an HDR L (small range) direct encoding.
|
||||
*
|
||||
* @param input The raw quantized endpoints (packed and modal).
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_luminance_small_range_unpack(
|
||||
const uint8_t input[2],
|
||||
quant_method quant_level,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int v0 = unquant_color(quant_level, input[0]);
|
||||
int v1 = unquant_color(quant_level, input[1]);
|
||||
int v0 = input[0];
|
||||
int v1 = input[1];
|
||||
|
||||
int y0, y1;
|
||||
if (v0 & 0x80)
|
||||
@@ -745,7 +647,9 @@ static void hdr_luminance_small_range_unpack(
|
||||
|
||||
y1 += y0;
|
||||
if (y1 > 0xFFF)
|
||||
{
|
||||
y1 = 0xFFF;
|
||||
}
|
||||
|
||||
output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
|
||||
output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
|
||||
@@ -754,19 +658,17 @@ static void hdr_luminance_small_range_unpack(
|
||||
/**
|
||||
* @brief Unpack an HDR L (large range) direct encoding.
|
||||
*
|
||||
* @param input The raw quantized endpoints (packed and modal).
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_luminance_large_range_unpack(
|
||||
const uint8_t input[2],
|
||||
quant_method quant_level,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int v0 = unquant_color(quant_level, input[0]);
|
||||
int v1 = unquant_color(quant_level, input[1]);
|
||||
int v0 = input[0];
|
||||
int v1 = input[1];
|
||||
|
||||
int y0, y1;
|
||||
if (v1 >= v0)
|
||||
@@ -787,20 +689,18 @@ static void hdr_luminance_large_range_unpack(
|
||||
/**
|
||||
* @brief Unpack an HDR A direct encoding.
|
||||
*
|
||||
* @param input The raw quantized endpoints (packed and modal).
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_alpha_unpack(
|
||||
const uint8_t input[2],
|
||||
quant_method quant_level,
|
||||
int& output0,
|
||||
int& output1
|
||||
) {
|
||||
|
||||
int v6 = unquant_color(quant_level, input[0]);
|
||||
int v7 = unquant_color(quant_level, input[1]);
|
||||
int v6 = input[0];
|
||||
int v7 = input[1];
|
||||
|
||||
int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
|
||||
v6 &= 0x7F;
|
||||
@@ -821,9 +721,13 @@ static void hdr_alpha_unpack(
|
||||
v7 += v6;
|
||||
|
||||
if (v7 < 0)
|
||||
{
|
||||
v7 = 0;
|
||||
}
|
||||
else if (v7 > 0xFFF)
|
||||
{
|
||||
v7 = 0xFFF;
|
||||
}
|
||||
|
||||
output0 = v6;
|
||||
output1 = v7;
|
||||
@@ -836,21 +740,19 @@ static void hdr_alpha_unpack(
|
||||
/**
|
||||
* @brief Unpack an HDR RGBA direct encoding.
|
||||
*
|
||||
* @param input The raw quantized endpoints (packed and modal).
|
||||
* @param quant_level The quantization level to use.
|
||||
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
|
||||
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_rgb_hdr_alpha_unpack(
|
||||
const uint8_t input[8],
|
||||
quant_method quant_level,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
hdr_rgb_unpack(input, quant_level, output0, output1);
|
||||
hdr_rgb_unpack(input, output0, output1);
|
||||
|
||||
int alpha0, alpha1;
|
||||
hdr_alpha_unpack(input + 6, quant_level, alpha0, alpha1);
|
||||
hdr_alpha_unpack(input + 6, alpha0, alpha1);
|
||||
|
||||
output0.set_lane<3>(alpha0);
|
||||
output1.set_lane<3>(alpha1);
|
||||
@@ -860,7 +762,6 @@ static void hdr_rgb_hdr_alpha_unpack(
|
||||
void unpack_color_endpoints(
|
||||
astcenc_profile decode_mode,
|
||||
int format,
|
||||
quant_method quant_level,
|
||||
const uint8_t* input,
|
||||
bool& rgb_hdr,
|
||||
bool& alpha_hdr,
|
||||
@@ -876,38 +777,38 @@ void unpack_color_endpoints(
|
||||
switch (format)
|
||||
{
|
||||
case FMT_LUMINANCE:
|
||||
luminance_unpack(input, quant_level, output0, output1);
|
||||
luminance_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_LUMINANCE_DELTA:
|
||||
luminance_delta_unpack(input, quant_level, output0, output1);
|
||||
luminance_delta_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_HDR_LUMINANCE_SMALL_RANGE:
|
||||
rgb_hdr = true;
|
||||
alpha_hdr_default = true;
|
||||
hdr_luminance_small_range_unpack(input, quant_level, output0, output1);
|
||||
hdr_luminance_small_range_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_HDR_LUMINANCE_LARGE_RANGE:
|
||||
rgb_hdr = true;
|
||||
alpha_hdr_default = true;
|
||||
hdr_luminance_large_range_unpack(input, quant_level, output0, output1);
|
||||
hdr_luminance_large_range_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_LUMINANCE_ALPHA:
|
||||
luminance_alpha_unpack(input, quant_level, output0, output1);
|
||||
luminance_alpha_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_LUMINANCE_ALPHA_DELTA:
|
||||
luminance_alpha_delta_unpack(input, quant_level, output0, output1);
|
||||
luminance_alpha_delta_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_RGB_SCALE:
|
||||
{
|
||||
vint4 input0q(input[0], input[1], input[2], 0);
|
||||
uint8_t scale = input[3];
|
||||
rgb_scale_unpack(input0q, scale, quant_level, output0, output1);
|
||||
rgb_scale_unpack(input0q, scale, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -916,21 +817,21 @@ void unpack_color_endpoints(
|
||||
vint4 input0q(input[0], input[1], input[2], input[4]);
|
||||
uint8_t alpha1q = input[5];
|
||||
uint8_t scaleq = input[3];
|
||||
rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, quant_level, output0, output1);
|
||||
rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
case FMT_HDR_RGB_SCALE:
|
||||
rgb_hdr = true;
|
||||
alpha_hdr_default = true;
|
||||
hdr_rgbo_unpack(input, quant_level,output0, output1);
|
||||
hdr_rgbo_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_RGB:
|
||||
{
|
||||
vint4 input0q(input[0], input[2], input[4], 0);
|
||||
vint4 input1q(input[1], input[3], input[5], 0);
|
||||
rgb_unpack(input0q, input1q, quant_level, output0, output1);
|
||||
rgb_unpack(input0q, input1q, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -938,21 +839,21 @@ void unpack_color_endpoints(
|
||||
{
|
||||
vint4 input0q(input[0], input[2], input[4], 0);
|
||||
vint4 input1q(input[1], input[3], input[5], 0);
|
||||
rgb_delta_unpack(input0q, input1q, quant_level, output0, output1);
|
||||
rgb_delta_unpack(input0q, input1q, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
case FMT_HDR_RGB:
|
||||
rgb_hdr = true;
|
||||
alpha_hdr_default = true;
|
||||
hdr_rgb_unpack(input, quant_level, output0, output1);
|
||||
hdr_rgb_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_RGBA:
|
||||
{
|
||||
vint4 input0q(input[0], input[2], input[4], input[6]);
|
||||
vint4 input1q(input[1], input[3], input[5], input[7]);
|
||||
rgba_unpack(input0q, input1q, quant_level, output0, output1);
|
||||
rgba_unpack(input0q, input1q, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -960,19 +861,19 @@ void unpack_color_endpoints(
|
||||
{
|
||||
vint4 input0q(input[0], input[2], input[4], input[6]);
|
||||
vint4 input1q(input[1], input[3], input[5], input[7]);
|
||||
rgba_delta_unpack(input0q, input1q, quant_level, output0, output1);
|
||||
rgba_delta_unpack(input0q, input1q, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
case FMT_HDR_RGB_LDR_ALPHA:
|
||||
rgb_hdr = true;
|
||||
hdr_rgb_ldr_alpha_unpack(input, quant_level, output0, output1);
|
||||
hdr_rgb_ldr_alpha_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_HDR_RGBA:
|
||||
rgb_hdr = true;
|
||||
alpha_hdr = true;
|
||||
hdr_rgb_hdr_alpha_unpack(input, quant_level, output0, output1);
|
||||
hdr_rgb_hdr_alpha_unpack(input, output0, output1);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -993,32 +894,55 @@ void unpack_color_endpoints(
|
||||
}
|
||||
}
|
||||
|
||||
vint4 ldr_scale(257);
|
||||
vint4 hdr_scale(1);
|
||||
vint4 output_scale = ldr_scale;
|
||||
// Handle endpoint errors and expansion
|
||||
|
||||
// An LDR profile image
|
||||
if ((decode_mode == ASTCENC_PRF_LDR) ||
|
||||
(decode_mode == ASTCENC_PRF_LDR_SRGB))
|
||||
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
|
||||
if (decode_mode == ASTCENC_PRF_LDR)
|
||||
{
|
||||
// Also matches HDR alpha, as cannot have HDR alpha without HDR RGB
|
||||
if (rgb_hdr == true)
|
||||
// Error color - HDR endpoint in an LDR encoding
|
||||
if (rgb_hdr || alpha_hdr)
|
||||
{
|
||||
output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
|
||||
output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
|
||||
output_scale = hdr_scale;
|
||||
|
||||
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
rgb_hdr = false;
|
||||
alpha_hdr = false;
|
||||
}
|
||||
|
||||
output0 = output0 * 257;
|
||||
output1 = output1 * 257;
|
||||
}
|
||||
// An HDR profile image
|
||||
// sRGB LDR 8-bit endpoints are expanded to 16 bit by:
|
||||
// - RGB = shift left by 8 bits and OR with 0x80
|
||||
// - A = replication
|
||||
else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
// Error color - HDR endpoint in an LDR encoding
|
||||
if (rgb_hdr || alpha_hdr)
|
||||
{
|
||||
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
rgb_hdr = false;
|
||||
alpha_hdr = false;
|
||||
}
|
||||
|
||||
vmask4 mask(true, true, true, false);
|
||||
|
||||
vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
|
||||
vint4 output0a = output0 * 257;
|
||||
output0 = select(output0a, output0rgb, mask);
|
||||
|
||||
vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
|
||||
vint4 output1a = output1 * 257;
|
||||
output1 = select(output1a, output1rgb, mask);
|
||||
}
|
||||
// An HDR profile decode, but may be using linear LDR endpoints
|
||||
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
|
||||
// HDR endpoints are already 16-bit
|
||||
else
|
||||
{
|
||||
vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
|
||||
output_scale = select(ldr_scale, hdr_scale, hdr_lanes);
|
||||
vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
|
||||
output0 = output0 * output_scale;
|
||||
output1 = output1 * output_scale;
|
||||
}
|
||||
|
||||
output0 = output0 * output_scale;
|
||||
output1 = output1 * output_scale;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -79,10 +79,10 @@ static bool realign_weights_undecimated(
|
||||
// Get the quantization table
|
||||
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
|
||||
unsigned int weight_quant_level = bm.quant_mode;
|
||||
const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quant_level]);
|
||||
const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
|
||||
|
||||
unsigned int max_plane = bm.is_dual_plane;
|
||||
int plane2_component = bm.is_dual_plane ? scb.plane2_component : -1;
|
||||
int plane2_component = scb.plane2_component;
|
||||
vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
|
||||
|
||||
// Decode the color endpoints
|
||||
@@ -99,14 +99,13 @@ static bool realign_weights_undecimated(
|
||||
{
|
||||
unpack_color_endpoints(decode_mode,
|
||||
scb.color_formats[pa_idx],
|
||||
scb.get_color_quant_mode(),
|
||||
scb.color_values[pa_idx],
|
||||
rgb_hdr, alpha_hdr,
|
||||
endpnt0[pa_idx],
|
||||
endpnt1[pa_idx]);
|
||||
}
|
||||
|
||||
uint8_t* dec_weights_quant_pvalue = scb.weights;
|
||||
uint8_t* dec_weights_uquant = scb.weights;
|
||||
bool adjustments = false;
|
||||
|
||||
// For each plane and partition ...
|
||||
@@ -126,50 +125,48 @@ static bool realign_weights_undecimated(
|
||||
promise(bsd.texel_count > 0);
|
||||
for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
|
||||
{
|
||||
int uqw = qat->unquantized_value[dec_weights_quant_pvalue[texel]];
|
||||
int uqw = dec_weights_uquant[texel];
|
||||
|
||||
uint32_t prev_and_next = qat->prev_next_values[uqw];
|
||||
int prev_wt_uq = prev_and_next & 0xFF;
|
||||
int next_wt_uq = (prev_and_next >> 8) & 0xFF;
|
||||
uint32_t prev_and_next = qat.prev_next_values[uqw];
|
||||
int uqw_down = prev_and_next & 0xFF;
|
||||
int uqw_up = (prev_and_next >> 8) & 0xFF;
|
||||
|
||||
// Interpolate the colors to create the diffs
|
||||
float weight_base = static_cast<float>(uqw);
|
||||
float weight_down = static_cast<float>(uqw_down - uqw);
|
||||
float weight_up = static_cast<float>(uqw_up - uqw);
|
||||
|
||||
unsigned int partition = pi.partition_of_texel[texel];
|
||||
|
||||
float plane_weight = static_cast<float>(uqw);
|
||||
float plane_up_weight = static_cast<float>(next_wt_uq - uqw);
|
||||
float plane_down_weight = static_cast<float>(prev_wt_uq - uqw);
|
||||
|
||||
vfloat4 color_offset = offset[partition];
|
||||
vfloat4 color_base = endpnt0f[partition];
|
||||
|
||||
vfloat4 color = color_base + color_offset * plane_weight;
|
||||
|
||||
vfloat4 color = color_base + color_offset * weight_base;
|
||||
vfloat4 orig_color = blk.texel(texel);
|
||||
vfloat4 error_weight = blk.channel_weight;
|
||||
|
||||
vfloat4 color_diff = color - orig_color;
|
||||
vfloat4 color_up_diff = color_diff + color_offset * plane_up_weight;
|
||||
vfloat4 color_down_diff = color_diff + color_offset * plane_down_weight;
|
||||
vfloat4 color_diff_down = color_diff + color_offset * weight_down;
|
||||
vfloat4 color_diff_up = color_diff + color_offset * weight_up;
|
||||
|
||||
float current_error = dot_s(color_diff * color_diff, error_weight);
|
||||
float up_error = dot_s(color_up_diff * color_up_diff, error_weight);
|
||||
float down_error = dot_s(color_down_diff * color_down_diff, error_weight);
|
||||
float error_base = dot_s(color_diff * color_diff, error_weight);
|
||||
float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
|
||||
float error_up = dot_s(color_diff_up * color_diff_up, error_weight);
|
||||
|
||||
// Check if the prev or next error is better, and if so use it
|
||||
if ((up_error < current_error) && (up_error < down_error))
|
||||
if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
|
||||
{
|
||||
dec_weights_quant_pvalue[texel] = static_cast<uint8_t>((prev_and_next >> 24) & 0xFF);
|
||||
dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
|
||||
adjustments = true;
|
||||
}
|
||||
else if (down_error < current_error)
|
||||
else if ((error_down < error_base) && (uqw > 0))
|
||||
{
|
||||
dec_weights_quant_pvalue[texel] = static_cast<uint8_t>((prev_and_next >> 16) & 0xFF);
|
||||
dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
|
||||
adjustments = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare iteration for plane 2
|
||||
dec_weights_quant_pvalue += WEIGHTS_PLANE2_OFFSET;
|
||||
dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
|
||||
plane_mask = ~plane_mask;
|
||||
}
|
||||
|
||||
@@ -201,7 +198,7 @@ static bool realign_weights_decimated(
|
||||
// Get the quantization table
|
||||
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
|
||||
unsigned int weight_quant_level = bm.quant_mode;
|
||||
const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quant_level]);
|
||||
const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
|
||||
|
||||
// Get the decimation table
|
||||
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
@@ -209,7 +206,7 @@ static bool realign_weights_decimated(
|
||||
assert(weight_count != bsd.texel_count);
|
||||
|
||||
unsigned int max_plane = bm.is_dual_plane;
|
||||
int plane2_component = bm.is_dual_plane ? scb.plane2_component : -1;
|
||||
int plane2_component = scb.plane2_component;
|
||||
vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
|
||||
|
||||
// Decode the color endpoints
|
||||
@@ -227,16 +224,13 @@ static bool realign_weights_decimated(
|
||||
{
|
||||
unpack_color_endpoints(decode_mode,
|
||||
scb.color_formats[pa_idx],
|
||||
scb.get_color_quant_mode(),
|
||||
scb.color_values[pa_idx],
|
||||
rgb_hdr, alpha_hdr,
|
||||
endpnt0[pa_idx],
|
||||
endpnt1[pa_idx]);
|
||||
}
|
||||
|
||||
uint8_t uq_pl_weights[BLOCK_MAX_WEIGHTS];
|
||||
float uq_pl_weightsf[BLOCK_MAX_WEIGHTS];
|
||||
uint8_t* dec_weights_quant_pvalue = scb.weights;
|
||||
uint8_t* dec_weights_uquant = scb.weights;
|
||||
bool adjustments = false;
|
||||
|
||||
// For each plane and partition ...
|
||||
@@ -253,97 +247,90 @@ static bool realign_weights_decimated(
|
||||
}
|
||||
|
||||
// Create an unquantized weight grid for this decimation level
|
||||
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
|
||||
ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
|
||||
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
uq_pl_weights[we_idx] = qat->unquantized_value[dec_weights_quant_pvalue[we_idx]];
|
||||
uq_pl_weightsf[we_idx] = static_cast<float>(uq_pl_weights[we_idx]);
|
||||
vint unquant_value(dec_weights_uquant + we_idx);
|
||||
vfloat unquant_valuef = int_to_float(unquant_value);
|
||||
storea(unquant_valuef, uq_weightsf + we_idx);
|
||||
}
|
||||
|
||||
// For each weight compute previous, current, and next errors
|
||||
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
|
||||
{
|
||||
unsigned int uqw = uq_pl_weights[we_idx];
|
||||
float uqwf = uq_pl_weightsf[we_idx];
|
||||
int uqw = dec_weights_uquant[we_idx];
|
||||
uint32_t prev_and_next = qat.prev_next_values[uqw];
|
||||
|
||||
uint32_t prev_and_next = qat->prev_next_values[uqw];
|
||||
unsigned int prev_wt_uq = prev_and_next & 0xFF;
|
||||
unsigned int next_wt_uq = (prev_and_next >> 8) & 0xFF;
|
||||
float uqw_base = uq_weightsf[we_idx];
|
||||
float uqw_down = static_cast<float>(prev_and_next & 0xFF);
|
||||
float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
|
||||
|
||||
float uqw_next_dif = static_cast<float>(next_wt_uq) - uqwf;
|
||||
float uqw_prev_dif = static_cast<float>(prev_wt_uq) - uqwf;
|
||||
float uqw_diff_down = uqw_down - uqw_base;
|
||||
float uqw_diff_up = uqw_up - uqw_base;
|
||||
|
||||
vfloat4 current_errorv = vfloat4::zero();
|
||||
vfloat4 up_errorv = vfloat4::zero();
|
||||
vfloat4 down_errorv = vfloat4::zero();
|
||||
vfloat4 error_basev = vfloat4::zero();
|
||||
vfloat4 error_downv = vfloat4::zero();
|
||||
vfloat4 error_upv = vfloat4::zero();
|
||||
|
||||
// Interpolate the colors to create the diffs
|
||||
unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
|
||||
promise(texels_to_evaluate > 0);
|
||||
for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
|
||||
{
|
||||
unsigned int texel = di.weight_texel[te_idx][we_idx];
|
||||
float weight_base = uqwf;
|
||||
unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
|
||||
|
||||
const uint8_t *texel_weights = di.texel_weights_texel[we_idx][te_idx];
|
||||
const float *texel_weights_float = di.texel_weights_float_texel[we_idx][te_idx];
|
||||
float twf0 = texel_weights_float[0];
|
||||
float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
|
||||
|
||||
weight_base = (uqwf * twf0
|
||||
+ uq_pl_weightsf[texel_weights[1]] * texel_weights_float[1])
|
||||
+ (uq_pl_weightsf[texel_weights[2]] * texel_weights_float[2]
|
||||
+ uq_pl_weightsf[texel_weights[3]] * texel_weights_float[3]);
|
||||
|
||||
unsigned int partition = pi.partition_of_texel[texel];
|
||||
float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
|
||||
+ uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
|
||||
+ (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
|
||||
+ uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
|
||||
|
||||
// Ideally this is integer rounded, but IQ gain it isn't worth the overhead
|
||||
// float plane_weight = astc::flt_rd(weight_base + 0.5f);
|
||||
// float plane_up_weight = astc::flt_rd(weight_base + 0.5f + uqw_next_dif * twf0) - plane_weight;
|
||||
// float plane_down_weight = astc::flt_rd(weight_base + 0.5f + uqw_prev_dif * twf0) - plane_weight;
|
||||
|
||||
float plane_weight = weight_base;
|
||||
float plane_up_weight = weight_base + uqw_next_dif * twf0 - plane_weight;
|
||||
float plane_down_weight = weight_base + uqw_prev_dif * twf0 - plane_weight;
|
||||
// float weight = astc::flt_rd(weight_base + 0.5f);
|
||||
// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
|
||||
// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
|
||||
float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
|
||||
float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
|
||||
|
||||
unsigned int partition = pi.partition_of_texel[texel];
|
||||
vfloat4 color_offset = offset[partition];
|
||||
vfloat4 color_base = endpnt0f[partition];
|
||||
|
||||
vfloat4 color = color_base + color_offset * plane_weight;
|
||||
|
||||
vfloat4 orig_color = blk.texel(texel);
|
||||
vfloat4 color = color_base + color_offset * weight_base;
|
||||
vfloat4 orig_color = blk.texel(texel);
|
||||
|
||||
vfloat4 color_diff = color - orig_color;
|
||||
vfloat4 color_up_diff = color_diff + color_offset * plane_up_weight;
|
||||
vfloat4 color_down_diff = color_diff + color_offset * plane_down_weight;
|
||||
vfloat4 color_down_diff = color_diff + color_offset * weight_down;
|
||||
vfloat4 color_up_diff = color_diff + color_offset * weight_up;
|
||||
|
||||
current_errorv += color_diff * color_diff;
|
||||
up_errorv += color_up_diff * color_up_diff;
|
||||
down_errorv += color_down_diff * color_down_diff;
|
||||
error_basev += color_diff * color_diff;
|
||||
error_downv += color_down_diff * color_down_diff;
|
||||
error_upv += color_up_diff * color_up_diff;
|
||||
}
|
||||
|
||||
vfloat4 error_weight = blk.channel_weight;
|
||||
float current_error = hadd_s(current_errorv * error_weight);
|
||||
float up_error = hadd_s(up_errorv * error_weight);
|
||||
float down_error = hadd_s(down_errorv * error_weight);
|
||||
float error_base = hadd_s(error_basev * error_weight);
|
||||
float error_down = hadd_s(error_downv * error_weight);
|
||||
float error_up = hadd_s(error_upv * error_weight);
|
||||
|
||||
// Check if the prev or next error is better, and if so use it
|
||||
if ((up_error < current_error) && (up_error < down_error))
|
||||
if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
|
||||
{
|
||||
uq_pl_weights[we_idx] = static_cast<uint8_t>(next_wt_uq);
|
||||
uq_pl_weightsf[we_idx] = static_cast<float>(next_wt_uq);
|
||||
dec_weights_quant_pvalue[we_idx] = static_cast<uint8_t>((prev_and_next >> 24) & 0xFF);
|
||||
uq_weightsf[we_idx] = uqw_up;
|
||||
dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
|
||||
adjustments = true;
|
||||
}
|
||||
else if (down_error < current_error)
|
||||
else if ((error_down < error_base) && (uqw > 0))
|
||||
{
|
||||
uq_pl_weights[we_idx] = static_cast<uint8_t>(prev_wt_uq);
|
||||
uq_pl_weightsf[we_idx] = static_cast<float>(prev_wt_uq);
|
||||
dec_weights_quant_pvalue[we_idx] = static_cast<uint8_t>((prev_and_next >> 16) & 0xFF);
|
||||
uq_weightsf[we_idx] = uqw_down;
|
||||
dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
|
||||
adjustments = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare iteration for plane 2
|
||||
dec_weights_quant_pvalue += WEIGHTS_PLANE2_OFFSET;
|
||||
dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
|
||||
plane_mask = ~plane_mask;
|
||||
}
|
||||
|
||||
@@ -373,12 +360,15 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
unsigned int partition_count,
|
||||
unsigned int partition_index,
|
||||
symbolic_compressed_block& scb,
|
||||
compression_working_buffers& tmpbuf
|
||||
compression_working_buffers& tmpbuf,
|
||||
int quant_limit
|
||||
) {
|
||||
promise(partition_count > 0);
|
||||
promise(config.tune_candidate_limit > 0);
|
||||
promise(config.tune_refinement_limit > 0);
|
||||
|
||||
int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
|
||||
|
||||
auto compute_difference = &compute_symbolic_block_difference_1plane;
|
||||
if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
|
||||
{
|
||||
@@ -389,13 +379,11 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
|
||||
// Compute ideal weights and endpoint colors, with no quantization or decimation
|
||||
endpoints_and_weights& ei = tmpbuf.ei1;
|
||||
endpoints_and_weights *eix = tmpbuf.eix1;
|
||||
compute_ideal_colors_and_weights_1plane(blk, pi, ei);
|
||||
|
||||
// Compute ideal weights and endpoint colors for every decimation
|
||||
float *dec_weights_ideal_value = tmpbuf.dec_weights_ideal_value;
|
||||
float *dec_weights_quant_uvalue = tmpbuf.dec_weights_quant_uvalue;
|
||||
uint8_t *dec_weights_quant_pvalue = tmpbuf.dec_weights_quant_pvalue;
|
||||
float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
|
||||
uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
|
||||
|
||||
// For each decimation mode, compute an ideal set of weights with no quantization
|
||||
unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
|
||||
@@ -404,7 +392,7 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
for (unsigned int i = 0; i < max_decimation_modes; i++)
|
||||
{
|
||||
const auto& dm = bsd.get_decimation_mode(i);
|
||||
if (!dm.ref_1_plane)
|
||||
if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@@ -413,9 +401,8 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
|
||||
compute_ideal_weights_for_decimation(
|
||||
ei,
|
||||
eix[i],
|
||||
di,
|
||||
dec_weights_ideal_value + i * BLOCK_MAX_WEIGHTS);
|
||||
dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
|
||||
}
|
||||
|
||||
// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
|
||||
@@ -433,14 +420,11 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
|
||||
// For each mode, use the angular method to compute a shift
|
||||
compute_angular_endpoints_1plane(
|
||||
config.tune_low_weight_count_limit,
|
||||
only_always, bsd,
|
||||
dec_weights_ideal_value,
|
||||
tmpbuf);
|
||||
only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
|
||||
|
||||
float* weight_low_value = tmpbuf.weight_low_value1;
|
||||
float* weight_high_value = tmpbuf.weight_high_value1;
|
||||
int* qwt_bitcounts = tmpbuf.qwt_bitcounts;
|
||||
int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
|
||||
float* qwt_errors = tmpbuf.qwt_errors;
|
||||
|
||||
// For each mode (which specifies a decimation and a quantization):
|
||||
@@ -456,9 +440,16 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
|
||||
: bsd.block_mode_count_1plane_selected;
|
||||
promise(max_block_modes > 0);
|
||||
for (unsigned int i = 0; i < max_block_modes; ++i)
|
||||
for (unsigned int i = 0; i < max_block_modes; i++)
|
||||
{
|
||||
const block_mode& bm = bsd.block_modes[i];
|
||||
|
||||
if (bm.quant_mode > max_weight_quant)
|
||||
{
|
||||
qwt_errors[i] = 1e38f;
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(!bm.is_dual_plane);
|
||||
int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
|
||||
if (bitcount <= 0)
|
||||
@@ -475,26 +466,28 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
int decimation_mode = bm.decimation_mode;
|
||||
const auto& di = bsd.get_decimation_info(decimation_mode);
|
||||
|
||||
qwt_bitcounts[i] = bitcount;
|
||||
qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
|
||||
|
||||
ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
|
||||
|
||||
// Generate the optimized set of weights for the weight mode
|
||||
compute_quantized_weights_for_decimation(
|
||||
di,
|
||||
weight_low_value[i], weight_high_value[i],
|
||||
dec_weights_ideal_value + BLOCK_MAX_WEIGHTS * decimation_mode,
|
||||
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i,
|
||||
dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * i,
|
||||
dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
|
||||
dec_weights_uquantf,
|
||||
dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
|
||||
bm.get_weight_quant_mode());
|
||||
|
||||
// Compute weight quantization errors for the block mode
|
||||
qwt_errors[i] = compute_error_of_weight_set_1plane(
|
||||
eix[decimation_mode],
|
||||
ei,
|
||||
di,
|
||||
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i);
|
||||
dec_weights_uquantf);
|
||||
}
|
||||
|
||||
// Decide the optimal combination of color endpoint encodings and weight encodings
|
||||
int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
|
||||
uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
|
||||
int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
|
||||
|
||||
quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
|
||||
@@ -520,22 +513,22 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
|
||||
|
||||
int decimation_mode = qw_bm.decimation_mode;
|
||||
int weight_quant_mode = qw_bm.quant_mode;
|
||||
const auto& di = bsd.get_decimation_info(decimation_mode);
|
||||
promise(di.weight_count > 0);
|
||||
|
||||
trace_add_data("weight_x", di.weight_x);
|
||||
trace_add_data("weight_y", di.weight_y);
|
||||
trace_add_data("weight_z", di.weight_z);
|
||||
trace_add_data("weight_quant", weight_quant_mode);
|
||||
trace_add_data("weight_quant", qw_bm.quant_mode);
|
||||
|
||||
// Recompute the ideal color endpoints before storing them
|
||||
vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
|
||||
vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
|
||||
|
||||
symbolic_compressed_block workscb;
|
||||
endpoints workep = ei.ep;
|
||||
|
||||
uint8_t* u8_weight_src = dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * bm_packed_index;
|
||||
uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
|
||||
|
||||
for (unsigned int j = 0; j < di.weight_count; j++)
|
||||
{
|
||||
@@ -545,52 +538,56 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
|
||||
{
|
||||
recompute_ideal_colors_1plane(
|
||||
blk, pi, di,
|
||||
weight_quant_mode, workscb.weights,
|
||||
eix[decimation_mode].ep, rgbs_colors, rgbo_colors);
|
||||
blk, pi, di, workscb.weights,
|
||||
workep, rgbs_colors, rgbo_colors);
|
||||
|
||||
// Quantize the chosen color
|
||||
// Quantize the chosen color, tracking if worth trying the mod value
|
||||
bool all_same = color_quant_level[i] != color_quant_level_mod[i];
|
||||
for (unsigned int j = 0; j < partition_count; j++)
|
||||
{
|
||||
workscb.color_formats[j] = pack_color_endpoints(
|
||||
privateProfile,
|
||||
eix[decimation_mode].ep.endpt0[j],
|
||||
eix[decimation_mode].ep.endpt1[j],
|
||||
workep.endpt0[j],
|
||||
workep.endpt1[j],
|
||||
rgbs_colors[j],
|
||||
rgbo_colors[j],
|
||||
partition_format_specifiers[i][j],
|
||||
workscb.color_values[j],
|
||||
color_quant_level[i]);
|
||||
|
||||
all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
|
||||
}
|
||||
|
||||
// If all the color endpoint modes are the same, we get a few more bits to store colors;
|
||||
// let's see if we can take advantage of this: requantize all the colors and see if the
|
||||
// endpoint modes remain the same.
|
||||
workscb.color_formats_matched = 0;
|
||||
|
||||
if ((partition_count >= 2 && workscb.color_formats[0] == workscb.color_formats[1]
|
||||
&& color_quant_level[i] != color_quant_level_mod[i])
|
||||
&& (partition_count == 2 || (workscb.color_formats[0] == workscb.color_formats[2]
|
||||
&& (partition_count == 3 || (workscb.color_formats[0] == workscb.color_formats[3])))))
|
||||
if (partition_count >= 2 && all_same)
|
||||
{
|
||||
uint8_t colorvals[BLOCK_MAX_PARTITIONS][12];
|
||||
uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
|
||||
uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
|
||||
bool all_same_mod = true;
|
||||
for (unsigned int j = 0; j < partition_count; j++)
|
||||
{
|
||||
color_formats_mod[j] = pack_color_endpoints(
|
||||
privateProfile,
|
||||
eix[decimation_mode].ep.endpt0[j],
|
||||
eix[decimation_mode].ep.endpt1[j],
|
||||
workep.endpt0[j],
|
||||
workep.endpt1[j],
|
||||
rgbs_colors[j],
|
||||
rgbo_colors[j],
|
||||
partition_format_specifiers[i][j],
|
||||
colorvals[j],
|
||||
color_quant_level_mod[i]);
|
||||
|
||||
// Early out as soon as it's no longer possible to use mod
|
||||
if (color_formats_mod[j] != color_formats_mod[0])
|
||||
{
|
||||
all_same_mod = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (color_formats_mod[0] == color_formats_mod[1]
|
||||
&& (partition_count == 2 || (color_formats_mod[0] == color_formats_mod[2]
|
||||
&& (partition_count == 3 || (color_formats_mod[0] == color_formats_mod[3])))))
|
||||
if (all_same_mod)
|
||||
{
|
||||
workscb.color_formats_matched = 1;
|
||||
for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
|
||||
@@ -631,12 +628,12 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
trace_add_data("error_prerealign", errorval);
|
||||
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
|
||||
|
||||
// Average refinement improvement is 3.5% per iteration (allow 5%), but the first
|
||||
// iteration can help more so we give it an extra 10% leeway. Use this knowledge to
|
||||
// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
|
||||
// iteration can help more so we give it an extra 8% leeway. Use this knowledge to
|
||||
// drive a heuristic to skip blocks that are unlikely to catch up with the best
|
||||
// block we have already.
|
||||
unsigned int iters_remaining = config.tune_refinement_limit - l;
|
||||
float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.1f;
|
||||
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
|
||||
if (errorval > (threshold * best_errorval_in_scb))
|
||||
{
|
||||
break;
|
||||
@@ -681,10 +678,10 @@ static float compress_symbolic_block_for_partition_1plane(
|
||||
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
|
||||
|
||||
// Average refinement improvement is 3.5% per iteration, so skip blocks that are
|
||||
// unlikely to catch up with the best block we have already. Assume a 5% per step to
|
||||
// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
|
||||
// give benefit of the doubt ...
|
||||
unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
|
||||
float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.0f;
|
||||
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
|
||||
if (errorval > (threshold * best_errorval_in_scb))
|
||||
{
|
||||
break;
|
||||
@@ -733,29 +730,30 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
float tune_errorval_threshold,
|
||||
unsigned int plane2_component,
|
||||
symbolic_compressed_block& scb,
|
||||
compression_working_buffers& tmpbuf
|
||||
compression_working_buffers& tmpbuf,
|
||||
int quant_limit
|
||||
) {
|
||||
promise(config.tune_candidate_limit > 0);
|
||||
promise(config.tune_refinement_limit > 0);
|
||||
promise(bsd.decimation_mode_count_selected > 0);
|
||||
|
||||
int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
|
||||
|
||||
// Compute ideal weights and endpoint colors, with no quantization or decimation
|
||||
endpoints_and_weights& ei1 = tmpbuf.ei1;
|
||||
endpoints_and_weights& ei2 = tmpbuf.ei2;
|
||||
endpoints_and_weights* eix1 = tmpbuf.eix1;
|
||||
endpoints_and_weights* eix2 = tmpbuf.eix2;
|
||||
|
||||
compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
|
||||
|
||||
// Compute ideal weights and endpoint colors for every decimation
|
||||
float *dec_weights_ideal_value = tmpbuf.dec_weights_ideal_value;
|
||||
float *dec_weights_quant_uvalue = tmpbuf.dec_weights_quant_uvalue;
|
||||
uint8_t *dec_weights_quant_pvalue = tmpbuf.dec_weights_quant_pvalue;
|
||||
float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
|
||||
uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
|
||||
|
||||
// For each decimation mode, compute an ideal set of weights with no quantization
|
||||
for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
|
||||
{
|
||||
const auto& dm = bsd.get_decimation_mode(i);
|
||||
if (!dm.ref_2_planes)
|
||||
if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@@ -764,15 +762,13 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
|
||||
compute_ideal_weights_for_decimation(
|
||||
ei1,
|
||||
eix1[i],
|
||||
di,
|
||||
dec_weights_ideal_value + i * BLOCK_MAX_WEIGHTS);
|
||||
dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
|
||||
|
||||
compute_ideal_weights_for_decimation(
|
||||
ei2,
|
||||
eix2[i],
|
||||
di,
|
||||
dec_weights_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
|
||||
dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
|
||||
}
|
||||
|
||||
// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
|
||||
@@ -800,9 +796,7 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
|
||||
|
||||
compute_angular_endpoints_2planes(
|
||||
config.tune_low_weight_count_limit,
|
||||
bsd, dec_weights_ideal_value,
|
||||
tmpbuf);
|
||||
bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
|
||||
|
||||
// For each mode (which specifies a decimation and a quantization):
|
||||
// * Compute number of bits needed for the quantized weights
|
||||
@@ -814,7 +808,7 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
float* weight_low_value2 = tmpbuf.weight_low_value2;
|
||||
float* weight_high_value2 = tmpbuf.weight_high_value2;
|
||||
|
||||
int* qwt_bitcounts = tmpbuf.qwt_bitcounts;
|
||||
int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
|
||||
float* qwt_errors = tmpbuf.qwt_errors;
|
||||
|
||||
unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
|
||||
@@ -825,7 +819,13 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
const block_mode& bm = bsd.block_modes[i];
|
||||
assert(bm.is_dual_plane);
|
||||
|
||||
qwt_bitcounts[i] = 109 - bm.weight_bits;
|
||||
if (bm.quant_mode > max_weight_quant)
|
||||
{
|
||||
qwt_errors[i] = 1e38f;
|
||||
continue;
|
||||
}
|
||||
|
||||
qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
|
||||
|
||||
if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
|
||||
{
|
||||
@@ -840,36 +840,38 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
unsigned int decimation_mode = bm.decimation_mode;
|
||||
const auto& di = bsd.get_decimation_info(decimation_mode);
|
||||
|
||||
ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
|
||||
|
||||
// Generate the optimized set of weights for the mode
|
||||
compute_quantized_weights_for_decimation(
|
||||
di,
|
||||
weight_low_value1[i],
|
||||
weight_high_value1[i],
|
||||
dec_weights_ideal_value + BLOCK_MAX_WEIGHTS * decimation_mode,
|
||||
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i,
|
||||
dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * i,
|
||||
dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
|
||||
dec_weights_uquantf,
|
||||
dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
|
||||
bm.get_weight_quant_mode());
|
||||
|
||||
compute_quantized_weights_for_decimation(
|
||||
di,
|
||||
weight_low_value2[i],
|
||||
weight_high_value2[i],
|
||||
dec_weights_ideal_value + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
|
||||
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
|
||||
dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
|
||||
dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
|
||||
dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
|
||||
dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
|
||||
bm.get_weight_quant_mode());
|
||||
|
||||
// Compute weight quantization errors for the block mode
|
||||
qwt_errors[i] = compute_error_of_weight_set_2planes(
|
||||
eix1[decimation_mode],
|
||||
eix2[decimation_mode],
|
||||
ei1,
|
||||
ei2,
|
||||
di,
|
||||
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i,
|
||||
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET);
|
||||
dec_weights_uquantf,
|
||||
dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
|
||||
}
|
||||
|
||||
// Decide the optimal combination of color endpoint encodings and weight encodings
|
||||
int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
|
||||
uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
|
||||
int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
|
||||
|
||||
quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
|
||||
@@ -901,25 +903,22 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
|
||||
|
||||
int decimation_mode = qw_bm.decimation_mode;
|
||||
int weight_quant_mode = qw_bm.quant_mode;
|
||||
const auto& di = bsd.get_decimation_info(decimation_mode);
|
||||
promise(di.weight_count > 0);
|
||||
|
||||
trace_add_data("weight_x", di.weight_x);
|
||||
trace_add_data("weight_y", di.weight_y);
|
||||
trace_add_data("weight_z", di.weight_z);
|
||||
trace_add_data("weight_quant", weight_quant_mode);
|
||||
|
||||
// Recompute the ideal color endpoints before storing them.
|
||||
merge_endpoints(eix1[decimation_mode].ep, eix2[decimation_mode].ep, plane2_component, epm);
|
||||
trace_add_data("weight_quant", qw_bm.quant_mode);
|
||||
|
||||
vfloat4 rgbs_color;
|
||||
vfloat4 rgbo_color;
|
||||
|
||||
symbolic_compressed_block workscb;
|
||||
endpoints workep = epm;
|
||||
|
||||
uint8_t* u8_weight1_src = dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * bm_packed_index;
|
||||
uint8_t* u8_weight2_src = dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
|
||||
uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
|
||||
uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
|
||||
|
||||
for (int j = 0; j < di.weight_count; j++)
|
||||
{
|
||||
@@ -930,15 +929,15 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
|
||||
{
|
||||
recompute_ideal_colors_2planes(
|
||||
blk, bsd, di, weight_quant_mode,
|
||||
blk, bsd, di,
|
||||
workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
|
||||
epm, rgbs_color, rgbo_color, plane2_component);
|
||||
workep, rgbs_color, rgbo_color, plane2_component);
|
||||
|
||||
// Quantize the chosen color
|
||||
workscb.color_formats[0] = pack_color_endpoints(
|
||||
privateProfile,
|
||||
epm.endpt0[0],
|
||||
epm.endpt1[0],
|
||||
workep.endpt0[0],
|
||||
workep.endpt1[0],
|
||||
rgbs_color, rgbo_color,
|
||||
partition_format_specifiers[i][0],
|
||||
workscb.color_values[0],
|
||||
@@ -966,12 +965,12 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
trace_add_data("error_prerealign", errorval);
|
||||
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
|
||||
|
||||
// Average refinement improvement is 3.5% per iteration (allow 5%), but the first
|
||||
// iteration can help more so we give it an extra 10% leeway. Use this knowledge to
|
||||
// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
|
||||
// iteration can help more so we give it an extra 8% leeway. Use this knowledge to
|
||||
// drive a heuristic to skip blocks that are unlikely to catch up with the best
|
||||
// block we have already.
|
||||
unsigned int iters_remaining = config.tune_refinement_limit - l;
|
||||
float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.1f;
|
||||
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
|
||||
if (errorval > (threshold * best_errorval_in_scb))
|
||||
{
|
||||
break;
|
||||
@@ -1017,10 +1016,10 @@ static float compress_symbolic_block_for_partition_2planes(
|
||||
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
|
||||
|
||||
// Average refinement improvement is 3.5% per iteration, so skip blocks that are
|
||||
// unlikely to catch up with the best block we have already. Assume a 5% per step to
|
||||
// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
|
||||
// give benefit of the doubt ...
|
||||
unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
|
||||
float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.0f;
|
||||
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
|
||||
if (errorval > (threshold * best_errorval_in_scb))
|
||||
{
|
||||
break;
|
||||
@@ -1132,12 +1131,13 @@ static float prepare_block_statistics(
|
||||
|
||||
aa_var -= as * (as * rpt);
|
||||
|
||||
rg_cov *= astc::rsqrt(astc::max(rr_var * gg_var, 1e-30f));
|
||||
rb_cov *= astc::rsqrt(astc::max(rr_var * bb_var, 1e-30f));
|
||||
ra_cov *= astc::rsqrt(astc::max(rr_var * aa_var, 1e-30f));
|
||||
gb_cov *= astc::rsqrt(astc::max(gg_var * bb_var, 1e-30f));
|
||||
ga_cov *= astc::rsqrt(astc::max(gg_var * aa_var, 1e-30f));
|
||||
ba_cov *= astc::rsqrt(astc::max(bb_var * aa_var, 1e-30f));
|
||||
// These will give a NaN if a channel is constant - these are fixed up in the next step
|
||||
rg_cov *= astc::rsqrt(rr_var * gg_var);
|
||||
rb_cov *= astc::rsqrt(rr_var * bb_var);
|
||||
ra_cov *= astc::rsqrt(rr_var * aa_var);
|
||||
gb_cov *= astc::rsqrt(gg_var * bb_var);
|
||||
ga_cov *= astc::rsqrt(gg_var * aa_var);
|
||||
ba_cov *= astc::rsqrt(bb_var * aa_var);
|
||||
|
||||
if (astc::isnan(rg_cov)) rg_cov = 1.0f;
|
||||
if (astc::isnan(rb_cov)) rb_cov = 1.0f;
|
||||
@@ -1146,7 +1146,7 @@ static float prepare_block_statistics(
|
||||
if (astc::isnan(ga_cov)) ga_cov = 1.0f;
|
||||
if (astc::isnan(ba_cov)) ba_cov = 1.0f;
|
||||
|
||||
float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
|
||||
float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
|
||||
lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
|
||||
lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
|
||||
lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
|
||||
@@ -1173,9 +1173,9 @@ static float prepare_block_statistics(
|
||||
|
||||
/* See header for documentation. */
|
||||
void compress_block(
|
||||
const astcenc_context& ctx,
|
||||
const astcenc_contexti& ctx,
|
||||
const image_block& blk,
|
||||
physical_compressed_block& pcb,
|
||||
uint8_t pcb[16],
|
||||
#if QUALITY_CONTROL
|
||||
compression_working_buffers& tmpbuf,
|
||||
bool calQualityEnable,
|
||||
@@ -1206,16 +1206,28 @@ void compress_block(
|
||||
bool block_skip_two_plane = false;
|
||||
int max_partitions = (ctx.config.privateProfile == HIGH_SPEED_PROFILE) ? 1 : ctx.config.tune_partition_count_limit;
|
||||
|
||||
unsigned int requested_partition_indices[3] {
|
||||
ctx.config.tune_2partition_index_limit,
|
||||
ctx.config.tune_3partition_index_limit,
|
||||
ctx.config.tune_4partition_index_limit
|
||||
};
|
||||
|
||||
unsigned int requested_partition_trials[3] {
|
||||
ctx.config.tune_2partitioning_candidate_limit,
|
||||
ctx.config.tune_3partitioning_candidate_limit,
|
||||
ctx.config.tune_4partitioning_candidate_limit
|
||||
};
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
// Do this early in diagnostic builds so we can dump uniform metrics
|
||||
// for every block. Do it later in release builds to avoid redundant work!
|
||||
float error_weight_sum = hadd_s(blk.channel_weight) * bsd->texel_count;
|
||||
float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
|
||||
float error_threshold = ctx.config.tune_db_limit
|
||||
* error_weight_sum
|
||||
* block_is_l_scale
|
||||
* block_is_la_scale;
|
||||
|
||||
lowest_correl = prepare_block_statistics(bsd->texel_count, blk);
|
||||
lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
|
||||
trace_add_data("lowest_correl", lowest_correl);
|
||||
trace_add_data("tune_error_threshold", error_threshold);
|
||||
#endif
|
||||
@@ -1228,6 +1240,7 @@ void compress_block(
|
||||
trace_add_data("plane_count", 1);
|
||||
|
||||
scb.partition_count = 0;
|
||||
|
||||
// Encode as FP16 if using HDR
|
||||
if ((decode_mode == ASTCENC_PRF_HDR) ||
|
||||
(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
|
||||
@@ -1244,6 +1257,7 @@ void compress_block(
|
||||
vint4 color_u16 = float_to_int_rtn(color_f32);
|
||||
store(color_u16, scb.constant_color);
|
||||
}
|
||||
|
||||
trace_add_data("exit", "quality hit");
|
||||
if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
|
||||
{
|
||||
@@ -1258,7 +1272,7 @@ void compress_block(
|
||||
for (int w = 0; w < 16; w++) { // weights num is 16 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
|
||||
scb.weights[w] = 0;
|
||||
}
|
||||
for (int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit
|
||||
for (unsigned int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit
|
||||
scb.color_values[0][pixel << 1] = scb.constant_color[pixel] & BYTE_MASK; // low byte
|
||||
scb.color_values[0][(pixel << 1) + 1] = (scb.constant_color[pixel] >> 8) & BYTE_MASK; // high byte
|
||||
}
|
||||
@@ -1291,8 +1305,8 @@ void compress_block(
|
||||
|
||||
float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
|
||||
0.0f,
|
||||
ctx.config.tune_2_partition_early_out_limit_factor,
|
||||
ctx.config.tune_3_partition_early_out_limit_factor,
|
||||
ctx.config.tune_2partition_early_out_limit_factor,
|
||||
ctx.config.tune_3partition_early_out_limit_factor,
|
||||
0.0f
|
||||
};
|
||||
|
||||
@@ -1304,19 +1318,21 @@ void compress_block(
|
||||
// compression and slightly reduces image quality.
|
||||
|
||||
float errorval_mult[2] {
|
||||
1.0f / ctx.config.tune_mode0_mse_overshoot,
|
||||
1.0f / ctx.config.tune_mse_overshoot,
|
||||
1.0f
|
||||
};
|
||||
|
||||
static const float errorval_overshoot = 1.0f / ctx.config.tune_refinement_mse_overshoot;
|
||||
static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
|
||||
|
||||
// Only enable MODE0 fast path (trial 0) if 2D and more than 25 texels
|
||||
// Only enable MODE0 fast path if enabled
|
||||
// Never enable for 3D blocks as no "always" block modes are available
|
||||
int start_trial = 1;
|
||||
if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1))
|
||||
if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
|
||||
{
|
||||
start_trial = 0;
|
||||
}
|
||||
|
||||
int quant_limit = QUANT_32;
|
||||
for (int i = start_trial; i < 2; i++)
|
||||
{
|
||||
TRACE_NODE(node1, "pass");
|
||||
@@ -1328,7 +1344,11 @@ void compress_block(
|
||||
ctx.config.privateProfile,
|
||||
ctx.config, bsd, blk, i == 0,
|
||||
error_threshold * errorval_mult[i] * errorval_overshoot,
|
||||
1, 0, scb, tmpbuf);
|
||||
1, 0, scb, tmpbuf, QUANT_32);
|
||||
|
||||
// Record the quant level so we can use it to filter later searches
|
||||
const auto& bm = bsd.get_block_mode(scb.block_mode);
|
||||
quant_limit = bm.get_weight_quant_mode();
|
||||
|
||||
best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
|
||||
if ((ctx.config.privateProfile == HIGH_SPEED_PROFILE) || (errorval < (error_threshold * errorval_mult[i])))
|
||||
@@ -1342,7 +1362,7 @@ void compress_block(
|
||||
lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
|
||||
#endif
|
||||
|
||||
block_skip_two_plane = lowest_correl > ctx.config.tune_2_plane_early_out_limit_correlation;
|
||||
block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
|
||||
|
||||
// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
|
||||
// alpha is the most likely to be non-correlated if it is present in the data.
|
||||
@@ -1359,7 +1379,7 @@ void compress_block(
|
||||
|
||||
if (block_skip_two_plane)
|
||||
{
|
||||
trace_add_data("skip", "tune_2_plane_early_out_limit_correlation");
|
||||
trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1378,11 +1398,11 @@ void compress_block(
|
||||
float errorval = compress_symbolic_block_for_partition_2planes(
|
||||
ctx.config.privateProfile,
|
||||
ctx.config, bsd, blk, error_threshold * errorval_overshoot,
|
||||
i, scb, tmpbuf);
|
||||
i, scb, tmpbuf, quant_limit);
|
||||
|
||||
// If attempting two planes is much worse than the best one plane result
|
||||
// then further two plane searches are unlikely to help so move on ...
|
||||
if (errorval > (best_errorvals_for_pcount[0] * 2.0f))
|
||||
if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
|
||||
{
|
||||
break;
|
||||
}
|
||||
@@ -1397,13 +1417,19 @@ void compress_block(
|
||||
// Find best blocks for 2, 3 and 4 partitions
|
||||
for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
|
||||
{
|
||||
unsigned int partition_indices[2] { 0 };
|
||||
unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
|
||||
|
||||
find_best_partition_candidates(bsd, blk, partition_count,
|
||||
ctx.config.tune_partition_index_limit,
|
||||
partition_indices);
|
||||
unsigned int requested_indices = requested_partition_indices[partition_count - 2];
|
||||
|
||||
for (unsigned int i = 0; i < 2; i++)
|
||||
unsigned int requested_trials = requested_partition_trials[partition_count - 2];
|
||||
requested_trials = astc::min(requested_trials, requested_indices);
|
||||
|
||||
unsigned int actual_trials = find_best_partition_candidates(
|
||||
bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
|
||||
|
||||
float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
|
||||
|
||||
for (unsigned int i = 0; i < actual_trials; i++)
|
||||
{
|
||||
TRACE_NODE(node1, "pass");
|
||||
trace_add_data("partition_count", partition_count);
|
||||
@@ -1416,9 +1442,22 @@ void compress_block(
|
||||
ctx.config, bsd, blk, false,
|
||||
error_threshold * errorval_overshoot,
|
||||
partition_count, partition_indices[i],
|
||||
scb, tmpbuf);
|
||||
scb, tmpbuf, quant_limit);
|
||||
|
||||
best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
|
||||
|
||||
// If using N partitions doesn't improve much over using N-1 partitions then skip trying
|
||||
// N+1. Error can dramatically improve if the data is correlated or non-correlated and
|
||||
// aligns with a partitioning that suits that encoding, so for this inner loop check add
|
||||
// a large error scale because the "other" trial could be a lot better.
|
||||
float best_error = best_errorvals_for_pcount[partition_count - 1];
|
||||
float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
|
||||
if (best_error > (best_error_in_prev * best_error_scale))
|
||||
{
|
||||
trace_add_data("skip", "tune_partition_early_out_limit_factor");
|
||||
goto END_OF_TESTS;
|
||||
}
|
||||
|
||||
if (errorval < error_threshold)
|
||||
{
|
||||
trace_add_data("exit", "quality hit");
|
||||
@@ -1428,7 +1467,6 @@ void compress_block(
|
||||
|
||||
// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
|
||||
float best_error = best_errorvals_for_pcount[partition_count - 1];
|
||||
float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
|
||||
float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
|
||||
if (best_error > (best_error_in_prev * best_error_scale))
|
||||
{
|
||||
@@ -1455,7 +1493,6 @@ END_OF_TESTS:
|
||||
#endif
|
||||
|
||||
scb.block_type = SYM_BTYPE_CONST_U16;
|
||||
scb.block_mode = -2;
|
||||
vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
|
||||
vint4 color_u16 = float_to_int_rtn(color_f32);
|
||||
store(color_u16, scb.constant_color);
|
||||
|
||||
@@ -99,17 +99,9 @@ static void brent_kung_prefix_sum(
|
||||
} while (lc_stride > 2);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute averages for a pixel region.
|
||||
*
|
||||
* The routine computes both in a single pass, using a summed-area table to decouple the running
|
||||
* time from the averaging/variance kernel size.
|
||||
*
|
||||
* @param[out] ctx The compressor context storing the output data.
|
||||
* @param arg The input parameter structure.
|
||||
*/
|
||||
static void compute_pixel_region_variance(
|
||||
astcenc_context& ctx,
|
||||
/* See header for documentation. */
|
||||
void compute_pixel_region_variance(
|
||||
astcenc_contexti& ctx,
|
||||
const pixel_region_args& arg
|
||||
) {
|
||||
// Unpack the memory structure into local variables
|
||||
@@ -427,57 +419,6 @@ static void compute_pixel_region_variance(
|
||||
}
|
||||
}
|
||||
|
||||
void compute_averages(
|
||||
astcenc_context& ctx,
|
||||
const avg_args &ag
|
||||
) {
|
||||
pixel_region_args arg = ag.arg;
|
||||
arg.work_memory = new vfloat4[ag.work_memory_size];
|
||||
|
||||
int size_x = ag.img_size_x;
|
||||
int size_y = ag.img_size_y;
|
||||
int size_z = ag.img_size_z;
|
||||
|
||||
int step_xy = ag.blk_size_xy;
|
||||
int step_z = ag.blk_size_z;
|
||||
|
||||
int y_tasks = (size_y + step_xy - 1) / step_xy;
|
||||
|
||||
// All threads run this processing loop until there is no work remaining
|
||||
while (true)
|
||||
{
|
||||
unsigned int count;
|
||||
unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
|
||||
if (!count)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
for (unsigned int i = base; i < base + count; i++)
|
||||
{
|
||||
int z = (i / (y_tasks)) * step_z;
|
||||
int y = (i - (z * y_tasks)) * step_xy;
|
||||
|
||||
arg.size_z = astc::min(step_z, size_z - z);
|
||||
arg.offset_z = z;
|
||||
|
||||
arg.size_y = astc::min(step_xy, size_y - y);
|
||||
arg.offset_y = y;
|
||||
|
||||
for (int x = 0; x < size_x; x += step_xy)
|
||||
{
|
||||
arg.size_x = astc::min(step_xy, size_x - x);
|
||||
arg.offset_x = x;
|
||||
compute_pixel_region_variance(ctx, arg);
|
||||
}
|
||||
}
|
||||
|
||||
ctx.manage_avg.complete_task_assignment(count);
|
||||
}
|
||||
|
||||
delete[] arg.work_memory;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
unsigned int init_compute_averages(
|
||||
const astcenc_image& img,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -24,48 +24,18 @@
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
/**
|
||||
* @brief Compute a vector of texel weights by interpolating the decimated weight grid.
|
||||
*
|
||||
* @param base_texel_index The first texel to get; N (SIMD width) consecutive texels are loaded.
|
||||
* @param di The weight grid decimation to use.
|
||||
* @param weights The raw weights.
|
||||
*
|
||||
* @return The undecimated weight for N (SIMD width) texels.
|
||||
*/
|
||||
static vint compute_value_of_texel_weight_int_vla(
|
||||
int base_texel_index,
|
||||
const decimation_info& di,
|
||||
const int* weights
|
||||
) {
|
||||
vint summed_value(8);
|
||||
vint weight_count(di.texel_weight_count + base_texel_index);
|
||||
int max_weight_count = hmax(weight_count).lane<0>();
|
||||
|
||||
promise(max_weight_count > 0);
|
||||
for (int i = 0; i < max_weight_count; i++)
|
||||
{
|
||||
vint texel_weights(di.texel_weights_4t[i] + base_texel_index);
|
||||
vint texel_weights_int(di.texel_weights_int_4t[i] + base_texel_index);
|
||||
|
||||
summed_value += gatheri(weights, texel_weights) * texel_weights_int;
|
||||
}
|
||||
|
||||
return lsr<4>(summed_value);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute the integer linear interpolation of two color endpoints.
|
||||
*
|
||||
* @param decode_mode The ASTC profile (linear or sRGB)
|
||||
* @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16.
|
||||
* @param color0 The endpoint0 color.
|
||||
* @param color1 The endpoint1 color.
|
||||
* @param weights The interpolation weight (between 0 and 64).
|
||||
* @param weights The interpolation weight (between 0 and 64).
|
||||
*
|
||||
* @return The interpolated color.
|
||||
*/
|
||||
static vint4 lerp_color_int(
|
||||
astcenc_profile decode_mode,
|
||||
vmask4 u8_mask,
|
||||
vint4 color0,
|
||||
vint4 color1,
|
||||
vint4 weights
|
||||
@@ -73,24 +43,18 @@ static vint4 lerp_color_int(
|
||||
vint4 weight1 = weights;
|
||||
vint4 weight0 = vint4(64) - weight1;
|
||||
|
||||
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
color0 = asr<8>(color0);
|
||||
color1 = asr<8>(color1);
|
||||
}
|
||||
|
||||
vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
|
||||
color = asr<6>(color);
|
||||
|
||||
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
color = color * vint4(257);
|
||||
}
|
||||
// For decode_unorm8 values force the codec to bit replicate. This allows the
|
||||
// rest of the codec to assume the full 0xFFFF range for everything and ignore
|
||||
// the decode_mode setting
|
||||
vint4 color_u8 = asr<8>(color) * vint4(257);
|
||||
color = select(color, color_u8, u8_mask);
|
||||
|
||||
return color;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Convert integer color value into a float value for the decoder.
|
||||
*
|
||||
@@ -127,43 +91,74 @@ void unpack_weights(
|
||||
const symbolic_compressed_block& scb,
|
||||
const decimation_info& di,
|
||||
bool is_dual_plane,
|
||||
quant_method quant_level,
|
||||
int weights_plane1[BLOCK_MAX_TEXELS],
|
||||
int weights_plane2[BLOCK_MAX_TEXELS]
|
||||
) {
|
||||
// First, unquantize the weights ...
|
||||
int uq_plane1_weights[BLOCK_MAX_WEIGHTS];
|
||||
int uq_plane2_weights[BLOCK_MAX_WEIGHTS];
|
||||
unsigned int weight_count = di.weight_count;
|
||||
|
||||
const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[quant_level]);
|
||||
|
||||
// Second, undecimate the weights ...
|
||||
// Safe to overshoot as all arrays are allocated to full size
|
||||
if (!is_dual_plane)
|
||||
{
|
||||
for (unsigned int i = 0; i < weight_count; i++)
|
||||
{
|
||||
uq_plane1_weights[i] = qat->unquantized_value[scb.weights[i]];
|
||||
}
|
||||
// Build full 64-entry weight lookup table
|
||||
vint4 tab0 = vint4::load(scb.weights + 0);
|
||||
vint4 tab1 = vint4::load(scb.weights + 16);
|
||||
vint4 tab2 = vint4::load(scb.weights + 32);
|
||||
vint4 tab3 = vint4::load(scb.weights + 48);
|
||||
|
||||
vint tab0p, tab1p, tab2p, tab3p;
|
||||
vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
|
||||
|
||||
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
store(compute_value_of_texel_weight_int_vla(i, di, uq_plane1_weights), weights_plane1 + i);
|
||||
vint summed_value(8);
|
||||
vint weight_count(di.texel_weight_count + i);
|
||||
int max_weight_count = hmax(weight_count).lane<0>();
|
||||
|
||||
promise(max_weight_count > 0);
|
||||
for (int j = 0; j < max_weight_count; j++)
|
||||
{
|
||||
vint texel_weights(di.texel_weights_tr[j] + i);
|
||||
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
|
||||
|
||||
summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
|
||||
}
|
||||
|
||||
store(lsr<4>(summed_value), weights_plane1 + i);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned int i = 0; i < weight_count; i++)
|
||||
{
|
||||
uq_plane1_weights[i] = qat->unquantized_value[scb.weights[i]];
|
||||
uq_plane2_weights[i] = qat->unquantized_value[scb.weights[i + WEIGHTS_PLANE2_OFFSET]];
|
||||
}
|
||||
// Build a 32-entry weight lookup table per plane
|
||||
// Plane 1
|
||||
vint4 tab0_plane1 = vint4::load(scb.weights + 0);
|
||||
vint4 tab1_plane1 = vint4::load(scb.weights + 16);
|
||||
vint tab0_plane1p, tab1_plane1p;
|
||||
vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
|
||||
|
||||
// Plane 2
|
||||
vint4 tab0_plane2 = vint4::load(scb.weights + 32);
|
||||
vint4 tab1_plane2 = vint4::load(scb.weights + 48);
|
||||
vint tab0_plane2p, tab1_plane2p;
|
||||
vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
|
||||
|
||||
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
store(compute_value_of_texel_weight_int_vla(i, di, uq_plane1_weights), weights_plane1 + i);
|
||||
store(compute_value_of_texel_weight_int_vla(i, di, uq_plane2_weights), weights_plane2 + i);
|
||||
vint sum_plane1(8);
|
||||
vint sum_plane2(8);
|
||||
|
||||
vint weight_count(di.texel_weight_count + i);
|
||||
int max_weight_count = hmax(weight_count).lane<0>();
|
||||
|
||||
promise(max_weight_count > 0);
|
||||
for (int j = 0; j < max_weight_count; j++)
|
||||
{
|
||||
vint texel_weights(di.texel_weights_tr[j] + i);
|
||||
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
|
||||
|
||||
sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
|
||||
sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
|
||||
}
|
||||
|
||||
store(lsr<4>(sum_plane1), weights_plane1 + i);
|
||||
store(lsr<4>(sum_plane2), weights_plane2 + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -228,12 +223,13 @@ void decompress_symbolic_block(
|
||||
{
|
||||
vint4 colori(scb.constant_color);
|
||||
|
||||
// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
|
||||
// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
|
||||
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
colori = asr<8>(colori) * 257;
|
||||
}
|
||||
// Determine the UNORM8 rounding on the decode
|
||||
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
|
||||
|
||||
// The real decoder would just use the top 8 bits, but we rescale
|
||||
// in to a 16-bit value that rounds correctly.
|
||||
vint4 colori_u8 = asr<8>(colori) * 257;
|
||||
colori = select(colori, colori_u8, u8_mask);
|
||||
|
||||
vint4 colorf16 = unorm16_to_sf16(colori);
|
||||
color = float16_to_float(colorf16);
|
||||
@@ -277,17 +273,19 @@ void decompress_symbolic_block(
|
||||
const auto& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
int is_dual_plane = bm.is_dual_plane;
|
||||
bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
int plane2_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, is_dual_plane, bm.get_weight_quant_mode(), plane1_weights, plane2_weights);
|
||||
unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
|
||||
|
||||
// Now that we have endpoint colors and weights, we can unpack texel colors
|
||||
int plane2_component = is_dual_plane ? scb.plane2_component : -1;
|
||||
int plane2_component = scb.plane2_component;
|
||||
vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
|
||||
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
// Decode the color endpoints for this partition
|
||||
@@ -298,7 +296,6 @@ void decompress_symbolic_block(
|
||||
|
||||
unpack_color_endpoints(decode_mode,
|
||||
scb.color_formats[i],
|
||||
scb.get_color_quant_mode(),
|
||||
scb.color_values[i],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
@@ -310,7 +307,7 @@ void decompress_symbolic_block(
|
||||
{
|
||||
int tix = pi.texels_of_partition[i][j];
|
||||
vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
|
||||
vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
|
||||
vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
|
||||
vfloat4 colorf = decode_texel(color, lns_mask);
|
||||
|
||||
blk.data_r[tix] = colorf.lane<0>();
|
||||
@@ -347,7 +344,7 @@ float compute_symbolic_block_difference_2plane(
|
||||
// Unquantize and undecimate the weights
|
||||
int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
int plane2_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, true, bm.get_weight_quant_mode(), plane1_weights, plane2_weights);
|
||||
unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
|
||||
|
||||
vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
|
||||
|
||||
@@ -361,17 +358,18 @@ float compute_symbolic_block_difference_2plane(
|
||||
|
||||
unpack_color_endpoints(config.profile,
|
||||
scb.color_formats[0],
|
||||
scb.get_color_quant_mode(),
|
||||
scb.color_values[0],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
// Unpack and compute error for each texel in the partition
|
||||
unsigned int texel_count = bsd.texel_count;
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
|
||||
vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
|
||||
vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
|
||||
|
||||
vfloat4 color = int_to_float(colori);
|
||||
vfloat4 oldColor = blk.texel(i);
|
||||
@@ -443,7 +441,9 @@ float compute_symbolic_block_difference_1plane(
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, false, bm.get_weight_quant_mode(), plane1_weights, nullptr);
|
||||
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
vfloat4 summa = vfloat4::zero();
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
@@ -456,7 +456,6 @@ float compute_symbolic_block_difference_1plane(
|
||||
|
||||
unpack_color_endpoints(config.profile,
|
||||
scb.color_formats[i],
|
||||
scb.get_color_quant_mode(),
|
||||
scb.color_values[i],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
@@ -466,7 +465,7 @@ float compute_symbolic_block_difference_1plane(
|
||||
for (unsigned int j = 0; j < texel_count; j++)
|
||||
{
|
||||
unsigned int tix = pi.texels_of_partition[i][j];
|
||||
vint4 colori = lerp_color_int(config.profile, ep0, ep1,
|
||||
vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
|
||||
vint4(plane1_weights[tix]));
|
||||
|
||||
vfloat4 color = int_to_float(colori);
|
||||
@@ -534,8 +533,8 @@ float compute_symbolic_block_difference_1plane_1partition(
|
||||
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, false, bm.get_weight_quant_mode(), plane1_weights, nullptr);
|
||||
ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
|
||||
|
||||
// Decode the color endpoints for this partition
|
||||
vint4 ep0;
|
||||
@@ -545,24 +544,16 @@ float compute_symbolic_block_difference_1plane_1partition(
|
||||
|
||||
unpack_color_endpoints(config.profile,
|
||||
scb.color_formats[0],
|
||||
scb.get_color_quant_mode(),
|
||||
scb.color_values[0],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
|
||||
// Pre-shift sRGB so things round correctly
|
||||
if (config.profile == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
ep0 = asr<8>(ep0);
|
||||
ep1 = asr<8>(ep1);
|
||||
}
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
// Unpack and compute error for each texel in the partition
|
||||
vfloatacc summav = vfloatacc::zero();
|
||||
|
||||
vint lane_id = vint::lane_id();
|
||||
vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
|
||||
|
||||
unsigned int texel_count = bsd.texel_count;
|
||||
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
@@ -581,11 +572,25 @@ float compute_symbolic_block_difference_1plane_1partition(
|
||||
vint ep0_b = vint(ep0.lane<2>()) * weight0;
|
||||
vint ep0_a = vint(ep0.lane<3>()) * weight0;
|
||||
|
||||
// Shift so things round correctly
|
||||
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
|
||||
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
|
||||
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
|
||||
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
|
||||
// Combine contributions
|
||||
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
|
||||
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
|
||||
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
|
||||
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
|
||||
|
||||
// If using a U8 decode mode bit replicate top 8 bits
|
||||
// so rest of codec can assume 0xFFFF max range everywhere
|
||||
vint colori_r8 = asr<8>(colori_r) * vint(257);
|
||||
colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
|
||||
|
||||
vint colori_g8 = asr<8>(colori_g) * vint(257);
|
||||
colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
|
||||
|
||||
vint colori_b8 = asr<8>(colori_b) * vint(257);
|
||||
colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
|
||||
|
||||
vint colori_a8 = asr<8>(colori_a) * vint(257);
|
||||
colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
|
||||
|
||||
// Compute color diff
|
||||
vfloat color_r = int_to_float(colori_r);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2021 Arm Limited
|
||||
// Copyright 2021-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -24,6 +24,8 @@
|
||||
#include <cassert>
|
||||
#include <cstdarg>
|
||||
#include <cstdio>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
|
||||
#include "astcenc_diagnostic_trace.h"
|
||||
@@ -32,7 +34,7 @@
|
||||
static TraceLog* g_TraceLog = nullptr;
|
||||
|
||||
/** @brief The JSON indentation level. */
|
||||
static const int g_trace_indent = 2;
|
||||
static const size_t g_trace_indent = 2;
|
||||
|
||||
TraceLog::TraceLog(
|
||||
const char* file_name):
|
||||
@@ -55,7 +57,7 @@ TraceNode* TraceLog::get_current_leaf()
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
int TraceLog::get_depth()
|
||||
size_t TraceLog::get_depth()
|
||||
{
|
||||
return m_stack.size();
|
||||
}
|
||||
@@ -82,12 +84,12 @@ TraceNode::TraceNode(
|
||||
vsnprintf (buffer, bufsz, format, args);
|
||||
va_end (args);
|
||||
|
||||
// Guarantee there is a nul termintor
|
||||
// Guarantee there is a nul terminator
|
||||
buffer[bufsz - 1] = 0;
|
||||
|
||||
// Generate the node
|
||||
TraceNode* parent = g_TraceLog->get_current_leaf();
|
||||
int depth = g_TraceLog->get_depth();
|
||||
size_t depth = g_TraceLog->get_depth();
|
||||
g_TraceLog->m_stack.push_back(this);
|
||||
|
||||
bool comma = parent && parent->m_attrib_count;
|
||||
@@ -108,8 +110,8 @@ TraceNode::TraceNode(
|
||||
out << '\n';
|
||||
}
|
||||
|
||||
int out_indent = (depth * 2) * g_trace_indent;
|
||||
int in_indent = (depth * 2 + 1) * g_trace_indent;
|
||||
size_t out_indent = (depth * 2) * g_trace_indent;
|
||||
size_t in_indent = (depth * 2 + 1) * g_trace_indent;
|
||||
|
||||
std::string out_indents("");
|
||||
if (out_indent)
|
||||
@@ -131,8 +133,8 @@ void TraceNode::add_attrib(
|
||||
) {
|
||||
(void)type;
|
||||
|
||||
int depth = g_TraceLog->get_depth();
|
||||
int indent = (depth * 2) * g_trace_indent;
|
||||
size_t depth = g_TraceLog->get_depth();
|
||||
size_t indent = (depth * 2) * g_trace_indent;
|
||||
auto& out = g_TraceLog->m_file;
|
||||
bool comma = m_attrib_count;
|
||||
m_attrib_count++;
|
||||
@@ -154,9 +156,9 @@ TraceNode::~TraceNode()
|
||||
g_TraceLog->m_stack.pop_back();
|
||||
|
||||
auto& out = g_TraceLog->m_file;
|
||||
int depth = g_TraceLog->get_depth();
|
||||
int out_indent = (depth * 2) * g_trace_indent;
|
||||
int in_indent = (depth * 2 + 1) * g_trace_indent;
|
||||
size_t depth = g_TraceLog->get_depth();
|
||||
size_t out_indent = (depth * 2) * g_trace_indent;
|
||||
size_t in_indent = (depth * 2 + 1) * g_trace_indent;
|
||||
|
||||
std::string out_indents("");
|
||||
if (out_indent)
|
||||
@@ -189,7 +191,7 @@ void trace_add_data(
|
||||
vsnprintf (buffer, bufsz, format, args);
|
||||
va_end (args);
|
||||
|
||||
// Guarantee there is a nul termintor
|
||||
// Guarantee there is a nul terminator
|
||||
buffer[bufsz - 1] = 0;
|
||||
|
||||
std::string value = "\"" + std::string(buffer) + "\"";
|
||||
@@ -203,7 +205,20 @@ void trace_add_data(
|
||||
const char* key,
|
||||
float value
|
||||
) {
|
||||
char buffer[256];
|
||||
// Turn infinities into parseable values
|
||||
if (std::isinf(value))
|
||||
{
|
||||
if (value > 0.0f)
|
||||
{
|
||||
value = std::numeric_limits<float>::max();
|
||||
}
|
||||
else
|
||||
{
|
||||
value = -std::numeric_limits<float>::max();
|
||||
}
|
||||
}
|
||||
|
||||
char buffer[256];
|
||||
sprintf(buffer, "%.20g", (double)value);
|
||||
TraceNode* node = g_TraceLog->get_current_leaf();
|
||||
node->add_attrib("float", key, buffer);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2021 Arm Limited
|
||||
// Copyright 2021-2022 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -148,7 +148,7 @@ public:
|
||||
*
|
||||
* @return The current leaf node stack depth.
|
||||
*/
|
||||
int get_depth();
|
||||
size_t get_depth();
|
||||
|
||||
/**
|
||||
* @brief The file stream to write to.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -24,7 +24,7 @@
|
||||
#include <new>
|
||||
|
||||
#include "astcenc.h"
|
||||
#include "astcenc_internal.h"
|
||||
#include "astcenc_internal_entry.h"
|
||||
#include "astcenc_diagnostic_trace.h"
|
||||
|
||||
/**
|
||||
@@ -40,89 +40,96 @@ struct astcenc_preset_config
|
||||
{
|
||||
float quality;
|
||||
unsigned int tune_partition_count_limit;
|
||||
unsigned int tune_partition_index_limit;
|
||||
unsigned int tune_2partition_index_limit;
|
||||
unsigned int tune_3partition_index_limit;
|
||||
unsigned int tune_4partition_index_limit;
|
||||
unsigned int tune_block_mode_limit;
|
||||
unsigned int tune_refinement_limit;
|
||||
unsigned int tune_candidate_limit;
|
||||
unsigned int tune_2partitioning_candidate_limit;
|
||||
unsigned int tune_3partitioning_candidate_limit;
|
||||
unsigned int tune_4partitioning_candidate_limit;
|
||||
float tune_db_limit_a_base;
|
||||
float tune_db_limit_b_base;
|
||||
float tune_mode0_mse_overshoot;
|
||||
float tune_refinement_mse_overshoot;
|
||||
float tune_2_partition_early_out_limit_factor;
|
||||
float tune_3_partition_early_out_limit_factor;
|
||||
float tune_2_plane_early_out_limit_correlation;
|
||||
unsigned int tune_low_weight_count_limit;
|
||||
float tune_mse_overshoot;
|
||||
float tune_2partition_early_out_limit_factor;
|
||||
float tune_3partition_early_out_limit_factor;
|
||||
float tune_2plane_early_out_limit_correlation;
|
||||
float tune_search_mode0_enable;
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* @brief The static quality presets that are built-in for high bandwidth
|
||||
* presets (x < 25 texels per block).
|
||||
* @brief The static presets for high bandwidth encodings (x < 25 texels per block).
|
||||
*/
|
||||
static const std::array<astcenc_preset_config, 5> preset_configs_high {{
|
||||
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
|
||||
{
|
||||
ASTCENC_PRE_FASTEST,
|
||||
2, 8, 42, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 25
|
||||
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_FAST,
|
||||
3, 12, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.65f, 20
|
||||
3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_MEDIUM,
|
||||
4, 26, 76, 3, 3 , 95.0f, 70.0f, 2.5f, 2.5f, 1.2f, 1.25f, 0.85f, 16
|
||||
4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_THOROUGH,
|
||||
4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 12
|
||||
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_VERYTHOROUGH,
|
||||
4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_EXHAUSTIVE,
|
||||
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
|
||||
4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
|
||||
}
|
||||
}};
|
||||
|
||||
|
||||
/**
|
||||
* @brief The static quality presets that are built-in for medium bandwidth
|
||||
* presets (25 <= x < 64 texels per block).
|
||||
* @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
|
||||
*/
|
||||
static const std::array<astcenc_preset_config, 5> preset_configs_mid {{
|
||||
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
|
||||
{
|
||||
ASTCENC_PRE_FASTEST,
|
||||
2, 8, 40, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
|
||||
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_FAST,
|
||||
3, 12, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
|
||||
3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_MEDIUM,
|
||||
4, 26, 76, 3, 3, 95.0f, 70.0f, 3.0f, 3.0f, 1.2f, 1.25f, 0.75f, 14
|
||||
3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_THOROUGH,
|
||||
4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 10
|
||||
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_VERYTHOROUGH,
|
||||
4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
|
||||
}, {
|
||||
ASTCENC_PRE_EXHAUSTIVE,
|
||||
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
|
||||
4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
|
||||
}
|
||||
}};
|
||||
|
||||
|
||||
/**
|
||||
* @brief The static quality presets that are built-in for low bandwidth
|
||||
* presets (64 <= x texels per block).
|
||||
* @brief The static presets for low bandwidth encodings (64 <= x texels per block).
|
||||
*/
|
||||
static const std::array<astcenc_preset_config, 5> preset_configs_low {{
|
||||
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
|
||||
{
|
||||
ASTCENC_PRE_FASTEST,
|
||||
2, 6, 38, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
|
||||
2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_FAST,
|
||||
3, 10, 53, 3, 3, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
|
||||
2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_MEDIUM,
|
||||
3, 26, 76, 3, 3, 95.0f, 70.0f, 3.5f, 3.5f, 1.2f, 1.25f, 0.65f, 12
|
||||
3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_THOROUGH,
|
||||
4, 75, 92, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.85f, 10
|
||||
4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_VERYTHOROUGH,
|
||||
4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f
|
||||
}, {
|
||||
ASTCENC_PRE_EXHAUSTIVE,
|
||||
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
|
||||
4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f
|
||||
}
|
||||
}};
|
||||
|
||||
@@ -151,48 +158,6 @@ static astcenc_error validate_cpu_float()
|
||||
return ASTCENC_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Validate CPU ISA support meets the requirements of this build of the library.
|
||||
*
|
||||
* Each library build is statically compiled for a particular set of CPU ISA features, such as the
|
||||
* SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
|
||||
* actually supports everything this build needs.
|
||||
*
|
||||
* @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
|
||||
*/
|
||||
static astcenc_error validate_cpu_isa()
|
||||
{
|
||||
#if ASTCENC_SSE >= 41
|
||||
if (!cpu_supports_sse41())
|
||||
{
|
||||
return ASTCENC_ERR_BAD_CPU_ISA;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ASTCENC_POPCNT >= 1
|
||||
if (!cpu_supports_popcnt())
|
||||
{
|
||||
return ASTCENC_ERR_BAD_CPU_ISA;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ASTCENC_F16C >= 1
|
||||
if (!cpu_supports_f16c())
|
||||
{
|
||||
return ASTCENC_ERR_BAD_CPU_ISA;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ASTCENC_AVX >= 2
|
||||
if (!cpu_supports_avx2())
|
||||
{
|
||||
return ASTCENC_ERR_BAD_CPU_ISA;
|
||||
}
|
||||
#endif
|
||||
|
||||
return ASTCENC_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Validate config profile.
|
||||
*
|
||||
@@ -252,11 +217,13 @@ static astcenc_error validate_block_size(
|
||||
/**
|
||||
* @brief Validate flags.
|
||||
*
|
||||
* @param flags The flags to check.
|
||||
* @param profile The profile to check.
|
||||
* @param flags The flags to check.
|
||||
*
|
||||
* @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
|
||||
*/
|
||||
static astcenc_error validate_flags(
|
||||
astcenc_profile profile,
|
||||
unsigned int flags
|
||||
) {
|
||||
// Flags field must not contain any unknown flag bits
|
||||
@@ -267,14 +234,21 @@ static astcenc_error validate_flags(
|
||||
}
|
||||
|
||||
// Flags field must only contain at most a single map type
|
||||
exMask = ASTCENC_FLG_MAP_MASK
|
||||
| ASTCENC_FLG_MAP_NORMAL
|
||||
exMask = ASTCENC_FLG_MAP_NORMAL
|
||||
| ASTCENC_FLG_MAP_RGBM;
|
||||
if (popcount(flags & exMask) > 1)
|
||||
{
|
||||
return ASTCENC_ERR_BAD_FLAGS;
|
||||
}
|
||||
|
||||
// Decode_unorm8 must only be used with an LDR profile
|
||||
bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
|
||||
bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
|
||||
if (is_unorm8 && is_hdr)
|
||||
{
|
||||
return ASTCENC_ERR_BAD_DECODE_MODE;
|
||||
}
|
||||
|
||||
return ASTCENC_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -400,7 +374,7 @@ static astcenc_error validate_config(
|
||||
return status;
|
||||
}
|
||||
|
||||
status = validate_flags(config.flags);
|
||||
status = validate_flags(config.profile, config.flags);
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
return status;
|
||||
@@ -423,16 +397,20 @@ static astcenc_error validate_config(
|
||||
config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
|
||||
|
||||
config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
|
||||
config.tune_partition_index_limit = astc::clamp(config.tune_partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
|
||||
config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
|
||||
config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
|
||||
config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
|
||||
config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
|
||||
config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
|
||||
config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
|
||||
config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
|
||||
config.tune_mode0_mse_overshoot = astc::max(config.tune_mode0_mse_overshoot, 1.0f);
|
||||
config.tune_refinement_mse_overshoot = astc::max(config.tune_refinement_mse_overshoot, 1.0f);
|
||||
config.tune_2_partition_early_out_limit_factor = astc::max(config.tune_2_partition_early_out_limit_factor, 0.0f);
|
||||
config.tune_3_partition_early_out_limit_factor = astc::max(config.tune_3_partition_early_out_limit_factor, 0.0f);
|
||||
config.tune_2_plane_early_out_limit_correlation = astc::max(config.tune_2_plane_early_out_limit_correlation, 0.0f);
|
||||
config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f);
|
||||
config.tune_2partition_early_out_limit_factor = astc::max(config.tune_2partition_early_out_limit_factor, 0.0f);
|
||||
config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f);
|
||||
config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);
|
||||
|
||||
// Specifying a zero weight color component is not allowed; force to small value
|
||||
float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
|
||||
@@ -465,9 +443,15 @@ astcenc_error astcenc_config_init(
|
||||
astcenc_config* configp
|
||||
) {
|
||||
astcenc_error status;
|
||||
astcenc_config& config = *configp;
|
||||
|
||||
status = validate_cpu_float();
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
return status;
|
||||
}
|
||||
|
||||
// Zero init all config fields; although most of them will be overwritten
|
||||
astcenc_config& config = *configp;
|
||||
std::memset(&config, 0, sizeof(config));
|
||||
|
||||
// Process the block size
|
||||
@@ -494,7 +478,7 @@ astcenc_error astcenc_config_init(
|
||||
return ASTCENC_ERR_BAD_QUALITY;
|
||||
}
|
||||
|
||||
static const std::array<astcenc_preset_config, 5>* preset_configs;
|
||||
static const std::array<astcenc_preset_config, 6>* preset_configs;
|
||||
int texels_int = block_x * block_y * block_z;
|
||||
if (texels_int < 25)
|
||||
{
|
||||
@@ -526,21 +510,24 @@ astcenc_error astcenc_config_init(
|
||||
if (start == end)
|
||||
{
|
||||
config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
|
||||
config.tune_partition_index_limit = (*preset_configs)[start].tune_partition_index_limit;
|
||||
config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
|
||||
config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
|
||||
config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
|
||||
config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
|
||||
config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
|
||||
config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit,
|
||||
TUNE_MAX_TRIAL_CANDIDATES);
|
||||
config.tune_candidate_limit = (*preset_configs)[start].tune_candidate_limit;
|
||||
config.tune_2partitioning_candidate_limit = (*preset_configs)[start].tune_2partitioning_candidate_limit;
|
||||
config.tune_3partitioning_candidate_limit = (*preset_configs)[start].tune_3partitioning_candidate_limit;
|
||||
config.tune_4partitioning_candidate_limit = (*preset_configs)[start].tune_4partitioning_candidate_limit;
|
||||
config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
|
||||
(*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
|
||||
|
||||
config.tune_mode0_mse_overshoot = (*preset_configs)[start].tune_mode0_mse_overshoot;
|
||||
config.tune_refinement_mse_overshoot = (*preset_configs)[start].tune_refinement_mse_overshoot;
|
||||
config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;
|
||||
|
||||
config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
|
||||
config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor;
|
||||
config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation;
|
||||
config.tune_low_weight_count_limit = (*preset_configs)[start].tune_low_weight_count_limit;
|
||||
config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor;
|
||||
config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
|
||||
config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
|
||||
config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
|
||||
}
|
||||
// Start and end node are not the same - so interpolate between them
|
||||
else
|
||||
@@ -562,21 +549,24 @@ astcenc_error astcenc_config_init(
|
||||
#define LERPUI(param) static_cast<unsigned int>(LERPI(param))
|
||||
|
||||
config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
|
||||
config.tune_partition_index_limit = LERPI(tune_partition_index_limit);
|
||||
config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
|
||||
config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
|
||||
config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
|
||||
config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
|
||||
config.tune_refinement_limit = LERPI(tune_refinement_limit);
|
||||
config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
|
||||
TUNE_MAX_TRIAL_CANDIDATES);
|
||||
config.tune_candidate_limit = LERPUI(tune_candidate_limit);
|
||||
config.tune_2partitioning_candidate_limit = LERPUI(tune_2partitioning_candidate_limit);
|
||||
config.tune_3partitioning_candidate_limit = LERPUI(tune_3partitioning_candidate_limit);
|
||||
config.tune_4partitioning_candidate_limit = LERPUI(tune_4partitioning_candidate_limit);
|
||||
config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
|
||||
LERP(tune_db_limit_b_base) - 19 * ltexels);
|
||||
|
||||
config.tune_mode0_mse_overshoot = LERP(tune_mode0_mse_overshoot);
|
||||
config.tune_refinement_mse_overshoot = LERP(tune_refinement_mse_overshoot);
|
||||
config.tune_mse_overshoot = LERP(tune_mse_overshoot);
|
||||
|
||||
config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
|
||||
config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
|
||||
config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation);
|
||||
config.tune_low_weight_count_limit = LERPI(tune_low_weight_count_limit);
|
||||
config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor);
|
||||
config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
|
||||
config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
|
||||
config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
|
||||
#undef LERP
|
||||
#undef LERPI
|
||||
#undef LERPUI
|
||||
@@ -604,13 +594,14 @@ astcenc_error astcenc_config_init(
|
||||
case ASTCENC_PRF_HDR_RGB_LDR_A:
|
||||
case ASTCENC_PRF_HDR:
|
||||
config.tune_db_limit = 999.0f;
|
||||
config.tune_search_mode0_enable = 0.0f;
|
||||
break;
|
||||
default:
|
||||
return ASTCENC_ERR_BAD_PROFILE;
|
||||
}
|
||||
|
||||
// Flags field must not contain any unknown flag bits
|
||||
status = validate_flags(flags);
|
||||
status = validate_flags(profile, flags);
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
return status;
|
||||
@@ -625,20 +616,14 @@ astcenc_error astcenc_config_init(
|
||||
|
||||
config.cw_g_weight = 0.0f;
|
||||
config.cw_b_weight = 0.0f;
|
||||
config.tune_2_partition_early_out_limit_factor *= 1.5f;
|
||||
config.tune_3_partition_early_out_limit_factor *= 1.5f;
|
||||
config.tune_2_plane_early_out_limit_correlation = 0.99f;
|
||||
config.tune_2partition_early_out_limit_factor *= 1.5f;
|
||||
config.tune_3partition_early_out_limit_factor *= 1.5f;
|
||||
config.tune_2plane_early_out_limit_correlation = 0.99f;
|
||||
|
||||
// Normals are prone to blocking artifacts on smooth curves
|
||||
// so force compressor to try harder here ...
|
||||
config.tune_db_limit *= 1.03f;
|
||||
}
|
||||
else if (flags & ASTCENC_FLG_MAP_MASK)
|
||||
{
|
||||
// Masks are prone to blocking artifacts on mask edges
|
||||
// so force compressor to try harder here ...
|
||||
config.tune_db_limit *= 1.03f;
|
||||
}
|
||||
else if (flags & ASTCENC_FLG_MAP_RGBM)
|
||||
{
|
||||
config.rgbm_m_scale = 5.0f;
|
||||
@@ -655,7 +640,7 @@ astcenc_error astcenc_config_init(
|
||||
//
|
||||
// ... but we scale these up to keep a better balance between color and alpha. Note
|
||||
// that if the content is using alpha we'd recommend using the -a option to weight
|
||||
// the color conribution by the alpha transparency.
|
||||
// the color contribution by the alpha transparency.
|
||||
if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
|
||||
{
|
||||
config.cw_r_weight = 0.30f * 2.25f;
|
||||
@@ -683,12 +668,6 @@ astcenc_error astcenc_context_alloc(
|
||||
return status;
|
||||
}
|
||||
|
||||
status = validate_cpu_isa();
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
return status;
|
||||
}
|
||||
|
||||
if (thread_count == 0)
|
||||
{
|
||||
return ASTCENC_ERR_BAD_PARAM;
|
||||
@@ -702,7 +681,8 @@ astcenc_error astcenc_context_alloc(
|
||||
}
|
||||
#endif
|
||||
|
||||
astcenc_context* ctx = new astcenc_context;
|
||||
astcenc_context* ctxo = new astcenc_context;
|
||||
astcenc_contexti* ctx = &ctxo->context;
|
||||
ctx->thread_count = thread_count;
|
||||
ctx->config = config;
|
||||
ctx->working_buffers = nullptr;
|
||||
@@ -714,12 +694,18 @@ astcenc_error astcenc_context_alloc(
|
||||
status = validate_config(ctx->config);
|
||||
if (status != ASTCENC_SUCCESS)
|
||||
{
|
||||
delete ctx;
|
||||
delete ctxo;
|
||||
return status;
|
||||
}
|
||||
|
||||
ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
|
||||
bool can_omit_modes = config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
|
||||
if (!ctx->bsd)
|
||||
{
|
||||
delete ctxo;
|
||||
return ASTCENC_ERR_OUT_OF_MEM;
|
||||
}
|
||||
|
||||
bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
|
||||
init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
|
||||
can_omit_modes,
|
||||
config.tune_partition_count_limit,
|
||||
@@ -728,7 +714,7 @@ astcenc_error astcenc_context_alloc(
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
// Do setup only needed by compression
|
||||
if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
|
||||
if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
|
||||
{
|
||||
// Turn a dB limit into a per-texel error for faster use later
|
||||
if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
|
||||
@@ -742,12 +728,12 @@ astcenc_error astcenc_context_alloc(
|
||||
|
||||
size_t worksize = sizeof(compression_working_buffers) * thread_count;
|
||||
ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
|
||||
static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
|
||||
static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
|
||||
"compression_working_buffers size must be multiple of vector alignment");
|
||||
if (!ctx->working_buffers)
|
||||
{
|
||||
aligned_free<block_size_descriptor>(ctx->bsd);
|
||||
delete ctx;
|
||||
delete ctxo;
|
||||
*context = nullptr;
|
||||
return ASTCENC_ERR_OUT_OF_MEM;
|
||||
}
|
||||
@@ -766,7 +752,7 @@ astcenc_error astcenc_context_alloc(
|
||||
trace_add_data("block_z", config.block_z);
|
||||
#endif
|
||||
|
||||
*context = ctx;
|
||||
*context = ctxo;
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
prepare_angular_tables();
|
||||
@@ -777,16 +763,17 @@ astcenc_error astcenc_context_alloc(
|
||||
|
||||
/* See header for documentation. */
|
||||
void astcenc_context_free(
|
||||
astcenc_context* ctx
|
||||
astcenc_context* ctxo
|
||||
) {
|
||||
if (ctx)
|
||||
if (ctxo)
|
||||
{
|
||||
astcenc_contexti* ctx = &ctxo->context;
|
||||
aligned_free<compression_working_buffers>(ctx->working_buffers);
|
||||
aligned_free<block_size_descriptor>(ctx->bsd);
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
delete ctx->trace_log;
|
||||
#endif
|
||||
delete ctx;
|
||||
delete ctxo;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -795,14 +782,14 @@ void astcenc_context_free(
|
||||
/**
|
||||
* @brief Compress an image, after any preflight has completed.
|
||||
*
|
||||
* @param[out] ctx The compressor context.
|
||||
* @param[out] ctxo The compressor context.
|
||||
* @param thread_index The thread index.
|
||||
* @param image The input image.
|
||||
* @param swizzle The input swizzle.
|
||||
* @param[out] buffer The output array for the compressed data.
|
||||
*/
|
||||
static void compress_image(
|
||||
astcenc_context& ctx,
|
||||
astcenc_context& ctxo,
|
||||
unsigned int thread_index,
|
||||
const astcenc_image& image,
|
||||
const astcenc_swizzle& swizzle,
|
||||
@@ -814,6 +801,7 @@ static void compress_image(
|
||||
uint8_t* buffer
|
||||
#endif
|
||||
) {
|
||||
astcenc_contexti& ctx = ctxo.context;
|
||||
const block_size_descriptor& bsd = *ctx.bsd;
|
||||
astcenc_profile decode_mode = ctx.config.profile;
|
||||
|
||||
@@ -822,7 +810,7 @@ static void compress_image(
|
||||
int block_x = bsd.xdim;
|
||||
int block_y = bsd.ydim;
|
||||
int block_z = bsd.zdim;
|
||||
blk.texel_count = block_x * block_y * block_z;
|
||||
blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
|
||||
|
||||
int dim_x = image.dim_x;
|
||||
int dim_y = image.dim_y;
|
||||
@@ -836,6 +824,8 @@ static void compress_image(
|
||||
int row_blocks = xblocks;
|
||||
int plane_blocks = xblocks * yblocks;
|
||||
|
||||
blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
|
||||
|
||||
// Populate the block channel weights
|
||||
blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
|
||||
ctx.config.cw_g_weight,
|
||||
@@ -846,8 +836,7 @@ static void compress_image(
|
||||
auto& temp_buffers = ctx.working_buffers[thread_index];
|
||||
|
||||
// Only the first thread actually runs the initializer
|
||||
ctx.manage_compress.init(block_count);
|
||||
|
||||
ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
|
||||
|
||||
// Determine if we can use an optimized load function
|
||||
bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
|
||||
@@ -859,17 +848,17 @@ static void compress_image(
|
||||
bool use_fast_load = !needs_swz && !needs_hdr &&
|
||||
block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
|
||||
|
||||
auto load_func = fetch_image_block;
|
||||
auto load_func = load_image_block;
|
||||
if (use_fast_load)
|
||||
{
|
||||
load_func = fetch_image_block_fast_ldr;
|
||||
load_func = load_image_block_fast_ldr;
|
||||
}
|
||||
|
||||
// All threads run this processing loop until there is no work remaining
|
||||
while (true)
|
||||
{
|
||||
unsigned int count;
|
||||
unsigned int base = ctx.manage_compress.get_task_assignment(16, count);
|
||||
unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
|
||||
if (!count)
|
||||
{
|
||||
break;
|
||||
@@ -924,6 +913,18 @@ static void compress_image(
|
||||
if (use_full_block)
|
||||
{
|
||||
load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
|
||||
|
||||
// Scale RGB error contribution by the maximum alpha in the block
|
||||
// This encourages preserving alpha accuracy in regions with high
|
||||
// transparency, and can buy up to 0.5 dB PSNR.
|
||||
if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
|
||||
{
|
||||
float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
|
||||
blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
|
||||
ctx.config.cw_g_weight * alpha_scale,
|
||||
ctx.config.cw_b_weight * alpha_scale,
|
||||
ctx.config.cw_a_weight);
|
||||
}
|
||||
}
|
||||
// Apply alpha scale RDO - substitute constant color block
|
||||
else
|
||||
@@ -937,31 +938,92 @@ static void compress_image(
|
||||
|
||||
int offset = ((z * yblocks + y) * xblocks + x) * 16;
|
||||
uint8_t *bp = buffer + offset;
|
||||
physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp);
|
||||
#if QUALITY_CONTROL
|
||||
int32_t *mseBlock[RGBA_COM] = {nullptr, nullptr, nullptr, nullptr};
|
||||
if (calQualityEnable) {
|
||||
int offset = (z * yblocks + y) * xblocks + x;
|
||||
offset = (z * yblocks + y) * xblocks + x;
|
||||
mseBlock[R_COM] = mse[R_COM] + offset;
|
||||
mseBlock[G_COM] = mse[G_COM] + offset;
|
||||
mseBlock[B_COM] = mse[B_COM] + offset;
|
||||
mseBlock[A_COM] = mse[A_COM] + offset;
|
||||
}
|
||||
compress_block(ctx, blk, *pcb, temp_buffers, calQualityEnable, mseBlock);
|
||||
compress_block(ctx, blk, bp, temp_buffers, calQualityEnable, mseBlock);
|
||||
#else
|
||||
compress_block(ctx, blk, *pcb, temp_buffers);
|
||||
compress_block(ctx, blk, bp, temp_buffers);
|
||||
#endif
|
||||
}
|
||||
|
||||
ctx.manage_compress.complete_task_assignment(count);
|
||||
ctxo.manage_compress.complete_task_assignment(count);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute regional averages in an image.
|
||||
*
|
||||
* This function can be called by multiple threads, but only after a single
|
||||
* thread calls the setup function @c init_compute_averages().
|
||||
*
|
||||
* Results are written back into @c img->input_alpha_averages.
|
||||
*
|
||||
* @param[out] ctx The context.
|
||||
* @param ag The average and variance arguments created during setup.
|
||||
*/
|
||||
static void compute_averages(
|
||||
astcenc_context& ctx,
|
||||
const avg_args &ag
|
||||
) {
|
||||
pixel_region_args arg = ag.arg;
|
||||
arg.work_memory = new vfloat4[ag.work_memory_size];
|
||||
|
||||
int size_x = ag.img_size_x;
|
||||
int size_y = ag.img_size_y;
|
||||
int size_z = ag.img_size_z;
|
||||
|
||||
int step_xy = ag.blk_size_xy;
|
||||
int step_z = ag.blk_size_z;
|
||||
|
||||
int y_tasks = (size_y + step_xy - 1) / step_xy;
|
||||
|
||||
// All threads run this processing loop until there is no work remaining
|
||||
while (true)
|
||||
{
|
||||
unsigned int count;
|
||||
unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
|
||||
if (!count)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
for (unsigned int i = base; i < base + count; i++)
|
||||
{
|
||||
int z = (i / (y_tasks)) * step_z;
|
||||
int y = (i - (z * y_tasks)) * step_xy;
|
||||
|
||||
arg.size_z = astc::min(step_z, size_z - z);
|
||||
arg.offset_z = z;
|
||||
|
||||
arg.size_y = astc::min(step_xy, size_y - y);
|
||||
arg.offset_y = y;
|
||||
|
||||
for (int x = 0; x < size_x; x += step_xy)
|
||||
{
|
||||
arg.size_x = astc::min(step_xy, size_x - x);
|
||||
arg.offset_x = x;
|
||||
compute_pixel_region_variance(ctx.context, arg);
|
||||
}
|
||||
}
|
||||
|
||||
ctx.manage_avg.complete_task_assignment(count);
|
||||
}
|
||||
|
||||
delete[] arg.work_memory;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* See header for documentation. */
|
||||
astcenc_error astcenc_compress_image(
|
||||
astcenc_context* ctx,
|
||||
astcenc_context* ctxo,
|
||||
astcenc_image* imagep,
|
||||
const astcenc_swizzle* swizzle,
|
||||
uint8_t* data_out,
|
||||
@@ -973,7 +1035,7 @@ astcenc_error astcenc_compress_image(
|
||||
unsigned int thread_index
|
||||
) {
|
||||
#if defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
(void)ctx;
|
||||
(void)ctxo;
|
||||
(void)imagep;
|
||||
(void)swizzle;
|
||||
(void)data_out;
|
||||
@@ -981,6 +1043,7 @@ astcenc_error astcenc_compress_image(
|
||||
(void)thread_index;
|
||||
return ASTCENC_ERR_BAD_CONTEXT;
|
||||
#else
|
||||
astcenc_contexti* ctx = &ctxo->context;
|
||||
astcenc_error status;
|
||||
astcenc_image& image = *imagep;
|
||||
|
||||
@@ -1018,7 +1081,7 @@ astcenc_error astcenc_compress_image(
|
||||
// If context thread count is one then implicitly reset
|
||||
if (ctx->thread_count == 1)
|
||||
{
|
||||
astcenc_compress_reset(ctx);
|
||||
astcenc_compress_reset(ctxo);
|
||||
}
|
||||
|
||||
if (ctx->config.a_scale_radius != 0)
|
||||
@@ -1036,21 +1099,21 @@ astcenc_error astcenc_compress_image(
|
||||
};
|
||||
|
||||
// Only the first thread actually runs the initializer
|
||||
ctx->manage_avg.init(init_avg);
|
||||
ctxo->manage_avg.init(init_avg);
|
||||
|
||||
// All threads will enter this function and dynamically grab work
|
||||
compute_averages(*ctx, ctx->avg_preprocess_args);
|
||||
compute_averages(*ctxo, ctx->avg_preprocess_args);
|
||||
}
|
||||
|
||||
// Wait for compute_averages to complete before compressing
|
||||
ctx->manage_avg.wait();
|
||||
ctxo->manage_avg.wait();
|
||||
#if QUALITY_CONTROL
|
||||
compress_image(*ctx, thread_index, image, *swizzle, data_out, calQualityEnable, mse);
|
||||
compress_image(*ctxo, thread_index, image, *swizzle, data_out, calQualityEnable, mse);
|
||||
#else
|
||||
compress_image(*ctx, thread_index, image, *swizzle, data_out);
|
||||
compress_image(*ctxo, thread_index, image, *swizzle, data_out);
|
||||
#endif
|
||||
// Wait for compress to complete before freeing memory
|
||||
ctx->manage_compress.wait();
|
||||
ctxo->manage_compress.wait();
|
||||
|
||||
auto term_compress = [ctx]() {
|
||||
delete[] ctx->input_alpha_averages;
|
||||
@@ -1058,7 +1121,7 @@ astcenc_error astcenc_compress_image(
|
||||
};
|
||||
|
||||
// Only the first thread to arrive actually runs the term
|
||||
ctx->manage_compress.term(term_compress);
|
||||
ctxo->manage_compress.term(term_compress);
|
||||
|
||||
return ASTCENC_SUCCESS;
|
||||
#endif
|
||||
@@ -1066,26 +1129,27 @@ astcenc_error astcenc_compress_image(
|
||||
|
||||
/* See header for documentation. */
|
||||
astcenc_error astcenc_compress_reset(
|
||||
astcenc_context* ctx
|
||||
astcenc_context* ctxo
|
||||
) {
|
||||
#if defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
(void)ctx;
|
||||
(void)ctxo;
|
||||
return ASTCENC_ERR_BAD_CONTEXT;
|
||||
#else
|
||||
astcenc_contexti* ctx = &ctxo->context;
|
||||
if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
|
||||
{
|
||||
return ASTCENC_ERR_BAD_CONTEXT;
|
||||
}
|
||||
|
||||
ctx->manage_avg.reset();
|
||||
ctx->manage_compress.reset();
|
||||
ctxo->manage_avg.reset();
|
||||
ctxo->manage_compress.reset();
|
||||
return ASTCENC_SUCCESS;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
astcenc_error astcenc_decompress_image(
|
||||
astcenc_context* ctx,
|
||||
astcenc_context* ctxo,
|
||||
const uint8_t* data,
|
||||
size_t data_len,
|
||||
astcenc_image* image_outp,
|
||||
@@ -1094,6 +1158,7 @@ astcenc_error astcenc_decompress_image(
|
||||
) {
|
||||
astcenc_error status;
|
||||
astcenc_image& image_out = *image_outp;
|
||||
astcenc_contexti* ctx = &ctxo->context;
|
||||
|
||||
// Today this doesn't matter (working set on stack) but might in future ...
|
||||
if (thread_index >= ctx->thread_count)
|
||||
@@ -1114,6 +1179,7 @@ astcenc_error astcenc_decompress_image(
|
||||
unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
|
||||
unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
|
||||
unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
|
||||
unsigned int block_count = zblocks * yblocks * xblocks;
|
||||
|
||||
int row_blocks = xblocks;
|
||||
int plane_blocks = xblocks * yblocks;
|
||||
@@ -1126,22 +1192,25 @@ astcenc_error astcenc_decompress_image(
|
||||
}
|
||||
|
||||
image_block blk;
|
||||
blk.texel_count = block_x * block_y * block_z;
|
||||
blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
|
||||
|
||||
// Decode mode inferred from the output data type
|
||||
blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
|
||||
|
||||
// If context thread count is one then implicitly reset
|
||||
if (ctx->thread_count == 1)
|
||||
{
|
||||
astcenc_decompress_reset(ctx);
|
||||
astcenc_decompress_reset(ctxo);
|
||||
}
|
||||
|
||||
// Only the first thread actually runs the initializer
|
||||
ctx->manage_decompress.init(zblocks * yblocks * xblocks);
|
||||
ctxo->manage_decompress.init(block_count, nullptr);
|
||||
|
||||
// All threads run this processing loop until there is no work remaining
|
||||
while (true)
|
||||
{
|
||||
unsigned int count;
|
||||
unsigned int base = ctx->manage_decompress.get_task_assignment(128, count);
|
||||
unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
|
||||
if (!count)
|
||||
{
|
||||
break;
|
||||
@@ -1158,20 +1227,19 @@ astcenc_error astcenc_decompress_image(
|
||||
unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
|
||||
const uint8_t* bp = data + offset;
|
||||
|
||||
const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp);
|
||||
symbolic_compressed_block scb;
|
||||
|
||||
physical_to_symbolic(*ctx->bsd, pcb, scb);
|
||||
physical_to_symbolic(*ctx->bsd, bp, scb);
|
||||
|
||||
decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
|
||||
x * block_x, y * block_y, z * block_z,
|
||||
scb, blk);
|
||||
|
||||
write_image_block(image_out, blk, *ctx->bsd,
|
||||
store_image_block(image_out, blk, *ctx->bsd,
|
||||
x * block_x, y * block_y, z * block_z, *swizzle);
|
||||
}
|
||||
|
||||
ctx->manage_decompress.complete_task_assignment(count);
|
||||
ctxo->manage_decompress.complete_task_assignment(count);
|
||||
}
|
||||
|
||||
return ASTCENC_SUCCESS;
|
||||
@@ -1179,28 +1247,29 @@ astcenc_error astcenc_decompress_image(
|
||||
|
||||
/* See header for documentation. */
|
||||
astcenc_error astcenc_decompress_reset(
|
||||
astcenc_context* ctx
|
||||
astcenc_context* ctxo
|
||||
) {
|
||||
ctx->manage_decompress.reset();
|
||||
ctxo->manage_decompress.reset();
|
||||
return ASTCENC_SUCCESS;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
astcenc_error astcenc_get_block_info(
|
||||
astcenc_context* ctx,
|
||||
astcenc_context* ctxo,
|
||||
const uint8_t data[16],
|
||||
astcenc_block_info* info
|
||||
) {
|
||||
#if defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
(void)ctx;
|
||||
(void)ctxo;
|
||||
(void)data;
|
||||
(void)info;
|
||||
return ASTCENC_ERR_BAD_CONTEXT;
|
||||
#else
|
||||
astcenc_contexti* ctx = &ctxo->context;
|
||||
|
||||
// Decode the compressed data into a symbolic form
|
||||
const physical_compressed_block&pcb = *reinterpret_cast<const physical_compressed_block*>(data);
|
||||
symbolic_compressed_block scb;
|
||||
physical_to_symbolic(*ctx->bsd, pcb, scb);
|
||||
physical_to_symbolic(*ctx->bsd, data, scb);
|
||||
|
||||
// Fetch the appropriate partition and decimation tables
|
||||
block_size_descriptor& bsd = *ctx->bsd;
|
||||
@@ -1260,7 +1329,6 @@ astcenc_error astcenc_get_block_info(
|
||||
|
||||
unpack_color_endpoints(ctx->config.profile,
|
||||
scb.color_formats[i],
|
||||
scb.get_color_quant_mode(),
|
||||
scb.color_values[i],
|
||||
rgb_hdr, a_hdr,
|
||||
endpnt[0], endpnt[1]);
|
||||
@@ -1284,7 +1352,7 @@ astcenc_error astcenc_get_block_info(
|
||||
int weight_plane1[BLOCK_MAX_TEXELS];
|
||||
int weight_plane2[BLOCK_MAX_TEXELS];
|
||||
|
||||
unpack_weights(bsd, scb, di, bm.is_dual_plane, bm.get_weight_quant_mode(), weight_plane1, weight_plane2);
|
||||
unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
|
||||
for (unsigned int i = 0; i < bsd.texel_count; i++)
|
||||
{
|
||||
info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
|
||||
@@ -1318,8 +1386,6 @@ const char* astcenc_get_error_string(
|
||||
return "ASTCENC_ERR_OUT_OF_MEM";
|
||||
case ASTCENC_ERR_BAD_CPU_FLOAT:
|
||||
return "ASTCENC_ERR_BAD_CPU_FLOAT";
|
||||
case ASTCENC_ERR_BAD_CPU_ISA:
|
||||
return "ASTCENC_ERR_BAD_CPU_ISA";
|
||||
case ASTCENC_ERR_BAD_PARAM:
|
||||
return "ASTCENC_ERR_BAD_PARAM";
|
||||
case ASTCENC_ERR_BAD_BLOCK_SIZE:
|
||||
@@ -1336,6 +1402,8 @@ const char* astcenc_get_error_string(
|
||||
return "ASTCENC_ERR_BAD_CONTEXT";
|
||||
case ASTCENC_ERR_NOT_IMPLEMENTED:
|
||||
return "ASTCENC_ERR_NOT_IMPLEMENTED";
|
||||
case ASTCENC_ERR_BAD_DECODE_MODE:
|
||||
return "ASTCENC_ERR_BAD_DECODE_MODE";
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
case ASTCENC_ERR_DTRACE_FAILURE:
|
||||
return "ASTCENC_ERR_DTRACE_FAILURE";
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -46,15 +46,16 @@
|
||||
* lines for endpoint selection.
|
||||
*/
|
||||
|
||||
#include <limits>
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
/**
|
||||
* @brief Pick some initital kmeans cluster centers.
|
||||
* @brief Pick some initial kmeans cluster centers.
|
||||
*
|
||||
* @param blk The image block color data to compress.
|
||||
* @param texel_count The number of texels in the block.
|
||||
* @param partition_count The number of partitions in the block.
|
||||
* @param[out] cluster_centers The initital partition cluster center colors.
|
||||
* @param[out] cluster_centers The initial partition cluster center colors.
|
||||
*/
|
||||
static void kmeans_init(
|
||||
const image_block& blk,
|
||||
@@ -249,13 +250,16 @@ static void kmeans_update(
|
||||
*
|
||||
* @return The number of bit mismatches.
|
||||
*/
|
||||
static inline unsigned int partition_mismatch2(
|
||||
static inline uint8_t partition_mismatch2(
|
||||
const uint64_t a[2],
|
||||
const uint64_t b[2]
|
||||
) {
|
||||
int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
|
||||
int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
|
||||
return astc::min(v1, v2);
|
||||
|
||||
// Divide by 2 because XOR always counts errors twice, once when missing
|
||||
// in the expected position, and again when present in the wrong partition
|
||||
return static_cast<uint8_t>(astc::min(v1, v2) / 2);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -266,7 +270,7 @@ static inline unsigned int partition_mismatch2(
|
||||
*
|
||||
* @return The number of bit mismatches.
|
||||
*/
|
||||
static inline unsigned int partition_mismatch3(
|
||||
static inline uint8_t partition_mismatch3(
|
||||
const uint64_t a[3],
|
||||
const uint64_t b[3]
|
||||
) {
|
||||
@@ -294,7 +298,9 @@ static inline unsigned int partition_mismatch3(
|
||||
int s5 = p11 + p20;
|
||||
int v2 = astc::min(s4, s5) + p02;
|
||||
|
||||
return astc::min(v0, v1, v2);
|
||||
// Divide by 2 because XOR always counts errors twice, once when missing
|
||||
// in the expected position, and again when present in the wrong partition
|
||||
return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -305,7 +311,7 @@ static inline unsigned int partition_mismatch3(
|
||||
*
|
||||
* @return The number of bit mismatches.
|
||||
*/
|
||||
static inline unsigned int partition_mismatch4(
|
||||
static inline uint8_t partition_mismatch4(
|
||||
const uint64_t a[4],
|
||||
const uint64_t b[4]
|
||||
) {
|
||||
@@ -341,7 +347,9 @@ static inline unsigned int partition_mismatch4(
|
||||
int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
|
||||
int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
|
||||
|
||||
return astc::min(v0, v1, v2, v3);
|
||||
// Divide by 2 because XOR always counts errors twice, once when missing
|
||||
// in the expected position, and again when present in the wrong partition
|
||||
return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);
|
||||
}
|
||||
|
||||
using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
|
||||
@@ -358,32 +366,36 @@ static void count_partition_mismatch_bits(
|
||||
const block_size_descriptor& bsd,
|
||||
unsigned int partition_count,
|
||||
const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
|
||||
unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]
|
||||
uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
|
||||
) {
|
||||
unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
|
||||
promise(active_count > 0);
|
||||
|
||||
if (partition_count == 2)
|
||||
{
|
||||
for (unsigned int i = 0; i < active_count; i++)
|
||||
{
|
||||
int bitcount = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
|
||||
mismatch_counts[i] = astc::max(bitcount, static_cast<int>(bsd.partitioning_valid_2[i]));
|
||||
mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
|
||||
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
|
||||
assert(mismatch_counts[i] < bsd.texel_count);
|
||||
}
|
||||
}
|
||||
else if (partition_count == 3)
|
||||
{
|
||||
for (unsigned int i = 0; i < active_count; i++)
|
||||
{
|
||||
int bitcount = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
|
||||
mismatch_counts[i] = astc::max(bitcount, static_cast<int>(bsd.partitioning_valid_3[i]));
|
||||
mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
|
||||
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
|
||||
assert(mismatch_counts[i] < bsd.texel_count);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned int i = 0; i < active_count; i++)
|
||||
{
|
||||
int bitcount = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
|
||||
mismatch_counts[i] = astc::max(bitcount, static_cast<int>(bsd.partitioning_valid_4[i]));
|
||||
mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
|
||||
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
|
||||
assert(mismatch_counts[i] < bsd.texel_count);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -398,11 +410,13 @@ static void count_partition_mismatch_bits(
|
||||
* @return The number of active partitions in this selection.
|
||||
*/
|
||||
static unsigned int get_partition_ordering_by_mismatch_bits(
|
||||
unsigned int texel_count,
|
||||
unsigned int partitioning_count,
|
||||
const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS],
|
||||
unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
|
||||
const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
|
||||
uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
|
||||
) {
|
||||
unsigned int mscount[256] { 0 };
|
||||
promise(partitioning_count > 0);
|
||||
uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };
|
||||
|
||||
// Create the histogram of mismatch counts
|
||||
for (unsigned int i = 0; i < partitioning_count; i++)
|
||||
@@ -410,16 +424,14 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
|
||||
mscount[mismatch_count[i]]++;
|
||||
}
|
||||
|
||||
unsigned int active_count = partitioning_count - mscount[255];
|
||||
|
||||
// Create a running sum from the histogram array
|
||||
// Cells store previous values only; i.e. exclude self after sum
|
||||
unsigned int summa = 0;
|
||||
for (unsigned int i = 0; i < 256; i++)
|
||||
unsigned int sum = 0;
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
unsigned int cnt = mscount[i];
|
||||
mscount[i] = summa;
|
||||
summa += cnt;
|
||||
uint16_t cnt = mscount[i];
|
||||
mscount[i] = sum;
|
||||
sum += cnt;
|
||||
}
|
||||
|
||||
// Use the running sum as the index, incrementing after read to allow
|
||||
@@ -427,10 +439,10 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
|
||||
for (unsigned int i = 0; i < partitioning_count; i++)
|
||||
{
|
||||
unsigned int idx = mscount[mismatch_count[i]]++;
|
||||
partition_ordering[idx] = i;
|
||||
partition_ordering[idx] = static_cast<uint16_t>(i);
|
||||
}
|
||||
|
||||
return active_count;
|
||||
return partitioning_count;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -447,7 +459,7 @@ static unsigned int compute_kmeans_partition_ordering(
|
||||
const block_size_descriptor& bsd,
|
||||
const image_block& blk,
|
||||
unsigned int partition_count,
|
||||
unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
|
||||
uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
|
||||
) {
|
||||
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
|
||||
uint8_t texel_partitions[BLOCK_MAX_TEXELS];
|
||||
@@ -478,22 +490,71 @@ static unsigned int compute_kmeans_partition_ordering(
|
||||
}
|
||||
|
||||
// Count the mismatch between the block and the format's partition tables
|
||||
unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS];
|
||||
uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
|
||||
count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
|
||||
|
||||
// Sort the partitions based on the number of mismatched bits
|
||||
return get_partition_ordering_by_mismatch_bits(
|
||||
texels_to_process,
|
||||
bsd.partitioning_count_selected[partition_count - 1],
|
||||
mismatch_counts, partition_ordering);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Insert a partitioning into an order list of results, sorted by error.
|
||||
*
|
||||
* @param max_values The max number of entries in the best result arrays.
|
||||
* @param this_error The error of the new entry.
|
||||
* @param this_partition The partition ID of the new entry.
|
||||
* @param[out] best_errors The array of best error values.
|
||||
* @param[out] best_partitions The array of best partition values.
|
||||
*/
|
||||
static void insert_result(
|
||||
unsigned int max_values,
|
||||
float this_error,
|
||||
unsigned int this_partition,
|
||||
float* best_errors,
|
||||
unsigned int* best_partitions)
|
||||
{
|
||||
promise(max_values > 0);
|
||||
|
||||
// Don't bother searching if the current worst error beats the new error
|
||||
if (this_error >= best_errors[max_values - 1])
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Else insert into the list in error-order
|
||||
for (unsigned int i = 0; i < max_values; i++)
|
||||
{
|
||||
// Existing result is better - move on ...
|
||||
if (this_error > best_errors[i])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Move existing results down one
|
||||
for (unsigned int j = max_values - 1; j > i; j--)
|
||||
{
|
||||
best_errors[j] = best_errors[j - 1];
|
||||
best_partitions[j] = best_partitions[j - 1];
|
||||
}
|
||||
|
||||
// Insert new result
|
||||
best_errors[i] = this_error;
|
||||
best_partitions[i] = this_partition;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void find_best_partition_candidates(
|
||||
unsigned int find_best_partition_candidates(
|
||||
const block_size_descriptor& bsd,
|
||||
const image_block& blk,
|
||||
unsigned int partition_count,
|
||||
unsigned int partition_search_limit,
|
||||
unsigned int best_partitions[2]
|
||||
unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
|
||||
unsigned int requested_candidates
|
||||
) {
|
||||
// Constant used to estimate quantization error for a given partitioning; the optimal value for
|
||||
// this depends on bitrate. These values have been determined empirically.
|
||||
@@ -517,20 +578,26 @@ void find_best_partition_candidates(
|
||||
|
||||
weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
|
||||
|
||||
unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
|
||||
uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
|
||||
unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
|
||||
partition_search_limit = astc::min(partition_search_limit, sequence_len);
|
||||
requested_candidates = astc::min(partition_search_limit, requested_candidates);
|
||||
|
||||
bool uses_alpha = !blk.is_constant_channel(3);
|
||||
|
||||
// Partitioning errors assuming uncorrelated-chrominance endpoints
|
||||
float uncor_best_error { ERROR_CALC_DEFAULT };
|
||||
unsigned int uncor_best_partition { 0 };
|
||||
float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
|
||||
unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
|
||||
|
||||
// Partitioning errors assuming same-chrominance endpoints
|
||||
// Store two so we can always return one different to uncorr
|
||||
float samec_best_errors[2] { ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT };
|
||||
unsigned int samec_best_partitions[2] { 0, 0 };
|
||||
float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
|
||||
unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
|
||||
|
||||
for (unsigned int i = 0; i < requested_candidates; i++)
|
||||
{
|
||||
uncor_best_errors[i] = ERROR_CALC_DEFAULT;
|
||||
samec_best_errors[i] = ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
if (uses_alpha)
|
||||
{
|
||||
@@ -550,8 +617,7 @@ void find_best_partition_candidates(
|
||||
processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
|
||||
processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
|
||||
|
||||
float uncor_line_lens[BLOCK_MAX_PARTITIONS];
|
||||
float samec_line_lens[BLOCK_MAX_PARTITIONS];
|
||||
float line_lengths[BLOCK_MAX_PARTITIONS];
|
||||
|
||||
for (unsigned int j = 0; j < partition_count; j++)
|
||||
{
|
||||
@@ -561,13 +627,13 @@ void find_best_partition_candidates(
|
||||
uncor_lines[j].b = normalize_safe(pm.dir, unit4());
|
||||
|
||||
uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
|
||||
uncor_plines[j].bs = uncor_lines[j].b;
|
||||
uncor_plines[j].bs = uncor_lines[j].b;
|
||||
|
||||
samec_lines[j].a = vfloat4::zero();
|
||||
samec_lines[j].b = normalize_safe(pm.avg, unit4());
|
||||
|
||||
samec_plines[j].amod = vfloat4::zero();
|
||||
samec_plines[j].bs = samec_lines[j].b;
|
||||
samec_plines[j].bs = samec_lines[j].b;
|
||||
}
|
||||
|
||||
float uncor_error = 0.0f;
|
||||
@@ -577,8 +643,7 @@ void find_best_partition_candidates(
|
||||
blk,
|
||||
uncor_plines,
|
||||
samec_plines,
|
||||
uncor_line_lens,
|
||||
samec_line_lens,
|
||||
line_lengths,
|
||||
uncor_error,
|
||||
samec_error);
|
||||
|
||||
@@ -597,32 +662,15 @@ void find_best_partition_candidates(
|
||||
float tpp = static_cast<float>(pi.partition_texel_count[j]);
|
||||
vfloat4 error_weights(tpp * weight_imprecision_estim);
|
||||
|
||||
vfloat4 uncor_vector = uncor_lines[j].b * uncor_line_lens[j];
|
||||
vfloat4 samec_vector = samec_lines[j].b * samec_line_lens[j];
|
||||
vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
|
||||
vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
|
||||
|
||||
uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
|
||||
samec_error += dot_s(samec_vector * samec_vector, error_weights);
|
||||
}
|
||||
|
||||
if (uncor_error < uncor_best_error)
|
||||
{
|
||||
uncor_best_error = uncor_error;
|
||||
uncor_best_partition = partition;
|
||||
}
|
||||
|
||||
if (samec_error < samec_best_errors[0])
|
||||
{
|
||||
samec_best_errors[1] = samec_best_errors[0];
|
||||
samec_best_partitions[1] = samec_best_partitions[0];
|
||||
|
||||
samec_best_errors[0] = samec_error;
|
||||
samec_best_partitions[0] = partition;
|
||||
}
|
||||
else if (samec_error < samec_best_errors[1])
|
||||
{
|
||||
samec_best_errors[1] = samec_error;
|
||||
samec_best_partitions[1] = partition;
|
||||
}
|
||||
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
|
||||
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -644,10 +692,10 @@ void find_best_partition_candidates(
|
||||
partition_lines3& pl = plines[j];
|
||||
|
||||
pl.uncor_line.a = pm.avg;
|
||||
pl.uncor_line.b = normalize_safe(pm.dir.swz<0, 1, 2>(), unit3());
|
||||
pl.uncor_line.b = normalize_safe(pm.dir, unit3());
|
||||
|
||||
pl.samec_line.a = vfloat4::zero();
|
||||
pl.samec_line.b = normalize_safe(pm.avg.swz<0, 1, 2>(), unit3());
|
||||
pl.samec_line.b = normalize_safe(pm.avg, unit3());
|
||||
|
||||
pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
|
||||
pl.uncor_pline.bs = pl.uncor_line.b;
|
||||
@@ -682,57 +730,52 @@ void find_best_partition_candidates(
|
||||
float tpp = static_cast<float>(pi.partition_texel_count[j]);
|
||||
vfloat4 error_weights(tpp * weight_imprecision_estim);
|
||||
|
||||
vfloat4 uncor_vector = pl.uncor_line.b * pl.uncor_line_len;
|
||||
vfloat4 samec_vector = pl.samec_line.b * pl.samec_line_len;
|
||||
vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
|
||||
vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
|
||||
|
||||
uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
|
||||
samec_error += dot3_s(samec_vector * samec_vector, error_weights);
|
||||
}
|
||||
|
||||
if (uncor_error < uncor_best_error)
|
||||
{
|
||||
uncor_best_error = uncor_error;
|
||||
uncor_best_partition = partition;
|
||||
}
|
||||
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
|
||||
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
|
||||
}
|
||||
}
|
||||
|
||||
if (samec_error < samec_best_errors[0])
|
||||
{
|
||||
samec_best_errors[1] = samec_best_errors[0];
|
||||
samec_best_partitions[1] = samec_best_partitions[0];
|
||||
unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
|
||||
for (unsigned int i = 0; i < requested_candidates; i++)
|
||||
{
|
||||
interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
|
||||
interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
|
||||
}
|
||||
|
||||
samec_best_errors[0] = samec_error;
|
||||
samec_best_partitions[0] = partition;
|
||||
}
|
||||
else if (samec_error < samec_best_errors[1])
|
||||
uint64_t bitmasks[1024/64] { 0 };
|
||||
unsigned int emitted = 0;
|
||||
|
||||
// Deduplicate the first "requested" entries
|
||||
for (unsigned int i = 0; i < requested_candidates * 2; i++)
|
||||
{
|
||||
unsigned int partition = interleave[i];
|
||||
|
||||
unsigned int word = partition / 64;
|
||||
unsigned int bit = partition % 64;
|
||||
|
||||
bool written = bitmasks[word] & (1ull << bit);
|
||||
|
||||
if (!written)
|
||||
{
|
||||
best_partitions[emitted] = partition;
|
||||
bitmasks[word] |= 1ull << bit;
|
||||
emitted++;
|
||||
|
||||
if (emitted == requested_candidates)
|
||||
{
|
||||
samec_best_errors[1] = samec_error;
|
||||
samec_best_partitions[1] = partition;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Same partition is best for both, so use this first unconditionally
|
||||
if (uncor_best_partition == samec_best_partitions[0])
|
||||
{
|
||||
best_partitions[0] = samec_best_partitions[0];
|
||||
best_partitions[1] = samec_best_partitions[1];
|
||||
}
|
||||
// Uncor is best
|
||||
else if (uncor_best_error <= samec_best_errors[0])
|
||||
{
|
||||
best_partitions[0] = uncor_best_partition;
|
||||
best_partitions[1] = samec_best_partitions[0];
|
||||
}
|
||||
// Samec is best
|
||||
else
|
||||
{
|
||||
best_partitions[0] = samec_best_partitions[0];
|
||||
best_partitions[1] = uncor_best_partition;
|
||||
}
|
||||
|
||||
// Convert these back into canonical partition IDs for the rest of the codec
|
||||
best_partitions[0] = bsd.get_raw_partition_info(partition_count, best_partitions[0]).partition_index;
|
||||
best_partitions[1] = bsd.get_raw_partition_info(partition_count, best_partitions[1]).partition_index;
|
||||
return emitted;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -26,6 +26,76 @@
|
||||
#include "astcenc_internal.h"
|
||||
#include "astcenc_vecmathlib.h"
|
||||
|
||||
/**
|
||||
* @brief Compute the infilled weight for N texel indices in a decimated grid.
|
||||
*
|
||||
* @param di The weight grid decimation to use.
|
||||
* @param weights The decimated weight values to use.
|
||||
* @param index The first texel index to interpolate.
|
||||
*
|
||||
* @return The interpolated weight for the given set of SIMD_WIDTH texels.
|
||||
*/
|
||||
static vfloat bilinear_infill_vla(
|
||||
const decimation_info& di,
|
||||
const float* weights,
|
||||
unsigned int index
|
||||
) {
|
||||
// Load the bilinear filter texel weight indexes in the decimated grid
|
||||
vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
|
||||
vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
|
||||
vint weight_idx2 = vint(di.texel_weights_tr[2] + index);
|
||||
vint weight_idx3 = vint(di.texel_weights_tr[3] + index);
|
||||
|
||||
// Load the bilinear filter weights from the decimated grid
|
||||
vfloat weight_val0 = gatherf(weights, weight_idx0);
|
||||
vfloat weight_val1 = gatherf(weights, weight_idx1);
|
||||
vfloat weight_val2 = gatherf(weights, weight_idx2);
|
||||
vfloat weight_val3 = gatherf(weights, weight_idx3);
|
||||
|
||||
// Load the weight contribution factors for each decimated weight
|
||||
vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
|
||||
vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
|
||||
vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[2] + index);
|
||||
vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[3] + index);
|
||||
|
||||
// Compute the bilinear interpolation to generate the per-texel weight
|
||||
return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) +
|
||||
(weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute the infilled weight for N texel indices in a decimated grid.
|
||||
*
|
||||
* This is specialized version which computes only two weights per texel for
|
||||
* encodings that are only decimated in a single axis.
|
||||
*
|
||||
* @param di The weight grid decimation to use.
|
||||
* @param weights The decimated weight values to use.
|
||||
* @param index The first texel index to interpolate.
|
||||
*
|
||||
* @return The interpolated weight for the given set of SIMD_WIDTH texels.
|
||||
*/
|
||||
static vfloat bilinear_infill_vla_2(
|
||||
const decimation_info& di,
|
||||
const float* weights,
|
||||
unsigned int index
|
||||
) {
|
||||
// Load the bilinear filter texel weight indexes in the decimated grid
|
||||
vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
|
||||
vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
|
||||
|
||||
// Load the bilinear filter weights from the decimated grid
|
||||
vfloat weight_val0 = gatherf(weights, weight_idx0);
|
||||
vfloat weight_val1 = gatherf(weights, weight_idx1);
|
||||
|
||||
// Load the weight contribution factors for each decimated weight
|
||||
vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
|
||||
vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
|
||||
|
||||
// Compute the bilinear interpolation to generate the per-texel weight
|
||||
return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute the ideal endpoints and weights for 1 color component.
|
||||
*
|
||||
@@ -90,7 +160,7 @@ static void compute_ideal_colors_and_weights_1_comp(
|
||||
highvalue = astc::max(value, highvalue);
|
||||
}
|
||||
|
||||
if (highvalue < lowvalue)
|
||||
if (highvalue <= lowvalue)
|
||||
{
|
||||
lowvalue = 0.0f;
|
||||
highvalue = 1e-7f;
|
||||
@@ -198,13 +268,13 @@ static void compute_ideal_colors_and_weights_2_comp(
|
||||
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
vfloat4 dir = pms[i].dir.swz<0, 1>();
|
||||
vfloat4 dir = pms[i].dir;
|
||||
if (hadd_s(dir) < 0.0f)
|
||||
{
|
||||
dir = vfloat4::zero() - dir;
|
||||
}
|
||||
|
||||
line2 line { pms[i].avg.swz<0, 1>(), normalize_safe(dir, unit2()) };
|
||||
line2 line { pms[i].avg, normalize_safe(dir, unit2()) };
|
||||
float lowparam { 1e10f };
|
||||
float highparam { -1e10f };
|
||||
|
||||
@@ -222,7 +292,7 @@ static void compute_ideal_colors_and_weights_2_comp(
|
||||
|
||||
// It is possible for a uniform-color partition to produce length=0;
|
||||
// this causes NaN issues so set to small value to avoid this problem
|
||||
if (highparam < lowparam)
|
||||
if (highparam <= lowparam)
|
||||
{
|
||||
lowparam = 0.0f;
|
||||
highparam = 1e-7f;
|
||||
@@ -371,7 +441,7 @@ static void compute_ideal_colors_and_weights_3_comp(
|
||||
|
||||
// It is possible for a uniform-color partition to produce length=0;
|
||||
// this causes NaN issues so set to small value to avoid this problem
|
||||
if (highparam < lowparam)
|
||||
if (highparam <= lowparam)
|
||||
{
|
||||
lowparam = 0.0f;
|
||||
highparam = 1e-7f;
|
||||
@@ -493,7 +563,7 @@ static void compute_ideal_colors_and_weights_4_comp(
|
||||
|
||||
// It is possible for a uniform-color partition to produce length=0;
|
||||
// this causes NaN issues so set to small value to avoid this problem
|
||||
if (highparam < lowparam)
|
||||
if (highparam <= lowparam)
|
||||
{
|
||||
lowparam = 0.0f;
|
||||
highparam = 1e-7f;
|
||||
@@ -621,8 +691,8 @@ float compute_error_of_weight_set_1plane(
|
||||
const float* dec_weight_quant_uvalue
|
||||
) {
|
||||
vfloatacc error_summav = vfloatacc::zero();
|
||||
float error_summa = 0.0f;
|
||||
unsigned int texel_count = di.texel_count;
|
||||
promise(texel_count > 0);
|
||||
|
||||
// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
|
||||
if (di.max_texel_weight_count > 2)
|
||||
@@ -675,7 +745,7 @@ float compute_error_of_weight_set_1plane(
|
||||
}
|
||||
|
||||
// Resolve the final scalar accumulator sum
|
||||
return error_summa = hadd_s(error_summav);
|
||||
return hadd_s(error_summav);
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
@@ -688,6 +758,7 @@ float compute_error_of_weight_set_2planes(
|
||||
) {
|
||||
vfloatacc error_summav = vfloatacc::zero();
|
||||
unsigned int texel_count = di.texel_count;
|
||||
promise(texel_count > 0);
|
||||
|
||||
// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
|
||||
if (di.max_texel_weight_count > 2)
|
||||
@@ -772,8 +843,7 @@ float compute_error_of_weight_set_2planes(
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_ideal_weights_for_decimation(
|
||||
const endpoints_and_weights& eai_in,
|
||||
endpoints_and_weights& eai_out,
|
||||
const endpoints_and_weights& ei,
|
||||
const decimation_info& di,
|
||||
float* dec_weight_ideal_value
|
||||
) {
|
||||
@@ -783,49 +853,31 @@ void compute_ideal_weights_for_decimation(
|
||||
promise(texel_count > 0);
|
||||
promise(weight_count > 0);
|
||||
|
||||
// This function includes a copy of the epw from eai_in to eai_out. We do it here because we
|
||||
// want to load the data anyway, so we can avoid loading it from memory twice.
|
||||
eai_out.ep = eai_in.ep;
|
||||
eai_out.is_constant_weight_error_scale = eai_in.is_constant_weight_error_scale;
|
||||
|
||||
// Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
|
||||
// can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
|
||||
// arrays always contain space for 64 elements
|
||||
unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
|
||||
storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
|
||||
|
||||
// If we have a 1:1 mapping just shortcut the computation - clone the weights into both the
|
||||
// weight set and the output epw copy.
|
||||
|
||||
// Transfer enough to also copy zero initialized SIMD over-fetch region
|
||||
unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
|
||||
for (unsigned int i = 0; i < texel_count_simd; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vfloat weight(eai_in.weights + i);
|
||||
vfloat weight_error_scale(eai_in.weight_error_scale + i);
|
||||
|
||||
storea(weight, eai_out.weights + i);
|
||||
storea(weight_error_scale, eai_out.weight_error_scale + i);
|
||||
|
||||
// Direct 1:1 weight mapping, so clone weights directly
|
||||
// TODO: Can we just avoid the copy for direct cases?
|
||||
if (is_direct)
|
||||
{
|
||||
storea(weight, dec_weight_ideal_value + i);
|
||||
}
|
||||
}
|
||||
|
||||
// If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
|
||||
// zero-initialized SIMD over-fetch region
|
||||
if (is_direct)
|
||||
{
|
||||
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vfloat weight(ei.weights + i);
|
||||
storea(weight, dec_weight_ideal_value + i);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Otherwise compute an estimate and perform single refinement iteration
|
||||
alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
|
||||
|
||||
// Compute an initial average for each decimated weight
|
||||
bool constant_wes = eai_in.is_constant_weight_error_scale;
|
||||
vfloat weight_error_scale(eai_in.weight_error_scale[0]);
|
||||
bool constant_wes = ei.is_constant_weight_error_scale;
|
||||
vfloat weight_error_scale(ei.weight_error_scale[0]);
|
||||
|
||||
// This overshoots - this is OK as we initialize the array tails in the
|
||||
// decimation table structures to safe values ...
|
||||
@@ -842,24 +894,24 @@ void compute_ideal_weights_for_decimation(
|
||||
|
||||
for (unsigned int j = 0; j < max_texel_count; j++)
|
||||
{
|
||||
vint texel(di.weight_texel[j] + i);
|
||||
vfloat weight = loada(di.weights_flt[j] + i);
|
||||
vint texel(di.weight_texels_tr[j] + i);
|
||||
vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
|
||||
|
||||
if (!constant_wes)
|
||||
{
|
||||
weight_error_scale = gatherf(eai_in.weight_error_scale, texel);
|
||||
weight_error_scale = gatherf(ei.weight_error_scale, texel);
|
||||
}
|
||||
|
||||
vfloat contrib_weight = weight * weight_error_scale;
|
||||
|
||||
weight_weight += contrib_weight;
|
||||
initial_weight += gatherf(eai_in.weights, texel) * contrib_weight;
|
||||
initial_weight += gatherf(ei.weights, texel) * contrib_weight;
|
||||
}
|
||||
|
||||
storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
|
||||
}
|
||||
|
||||
// Populate the interpolated weight grid based on the initital average
|
||||
// Populate the interpolated weight grid based on the initial average
|
||||
// Process SIMD-width texel coordinates at at time while we can. Safe to
|
||||
// over-process full SIMD vectors - the tail is zeroed.
|
||||
if (di.max_texel_weight_count <= 2)
|
||||
@@ -900,17 +952,17 @@ void compute_ideal_weights_for_decimation(
|
||||
|
||||
for (unsigned int j = 0; j < max_texel_count; j++)
|
||||
{
|
||||
vint texel(di.weight_texel[j] + i);
|
||||
vfloat contrib_weight = loada(di.weights_flt[j] + i);
|
||||
vint texel(di.weight_texels_tr[j] + i);
|
||||
vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
|
||||
|
||||
if (!constant_wes)
|
||||
{
|
||||
weight_error_scale = gatherf(eai_in.weight_error_scale, texel);
|
||||
weight_error_scale = gatherf(ei.weight_error_scale, texel);
|
||||
}
|
||||
|
||||
vfloat scale = weight_error_scale * contrib_weight;
|
||||
vfloat old_weight = gatherf(infilled_weights, texel);
|
||||
vfloat ideal_weight = gatherf(eai_in.weights, texel);
|
||||
vfloat ideal_weight = gatherf(ei.weights, texel);
|
||||
|
||||
error_change0 += contrib_weight * scale;
|
||||
error_change1 += (old_weight - ideal_weight) * scale;
|
||||
@@ -919,7 +971,7 @@ void compute_ideal_weights_for_decimation(
|
||||
vfloat step = (error_change1 * chd_scale) / error_change0;
|
||||
step = clamp(-stepsize, stepsize, step);
|
||||
|
||||
// Update the weight; note this can store negative values.
|
||||
// Update the weight; note this can store negative values
|
||||
storea(weight_val + step, dec_weight_ideal_value + i);
|
||||
}
|
||||
}
|
||||
@@ -936,19 +988,20 @@ void compute_quantized_weights_for_decimation(
|
||||
) {
|
||||
int weight_count = di.weight_count;
|
||||
promise(weight_count > 0);
|
||||
const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[quant_level]);
|
||||
const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level];
|
||||
|
||||
// The available quant levels, stored with a minus 1 bias
|
||||
static const float quant_levels_m1[12] {
|
||||
1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
|
||||
};
|
||||
|
||||
vint steps_m1(get_quant_level(quant_level) - 1);
|
||||
float quant_level_m1 = quant_levels_m1[quant_level];
|
||||
|
||||
// Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
|
||||
|
||||
// TODO: Oddity to investigate; triggered by test in issue #265.
|
||||
if (high_bound < low_bound)
|
||||
if (high_bound <= low_bound)
|
||||
{
|
||||
low_bound = 0.0f;
|
||||
high_bound = 1.0f;
|
||||
@@ -968,29 +1021,72 @@ void compute_quantized_weights_for_decimation(
|
||||
|
||||
// This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
|
||||
// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
|
||||
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||
if (get_quant_level(quant_level) <= 16)
|
||||
{
|
||||
vfloat ix = loada(&dec_weight_ideal_value[i]) * scalev - scaled_low_boundv;
|
||||
ix = clampzo(ix);
|
||||
vint4 tab0 = vint4::load(qat.quant_to_unquant);
|
||||
vint tab0p;
|
||||
vtable_prepare(tab0, tab0p);
|
||||
|
||||
// Look up the two closest indexes and return the one that was closest
|
||||
vfloat ix1 = ix * quant_level_m1v;
|
||||
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
|
||||
ix = clampzo(ix);
|
||||
|
||||
vint weightl = float_to_int(ix1);
|
||||
vint weighth = weightl + vint(1);
|
||||
// Look up the two closest indexes and return the one that was closest
|
||||
vfloat ix1 = ix * quant_level_m1v;
|
||||
|
||||
vfloat ixl = gatherf(qat->unquantized_value_unsc, weightl);
|
||||
vfloat ixh = gatherf(qat->unquantized_value_unsc, weighth);
|
||||
vint weightl = float_to_int(ix1);
|
||||
vint weighth = min(weightl + vint(1), steps_m1);
|
||||
|
||||
vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
|
||||
vint weight = select(weightl, weighth, mask);
|
||||
ixl = select(ixl, ixh, mask);
|
||||
vint ixli = vtable_8bt_32bi(tab0p, weightl);
|
||||
vint ixhi = vtable_8bt_32bi(tab0p, weighth);
|
||||
|
||||
// Invert the weight-scaling that was done initially
|
||||
storea(ixl * rscalev + low_boundv, &weight_set_out[i]);
|
||||
vint scm = gatheri(qat->scramble_map, weight);
|
||||
vint scn = pack_low_bytes(scm);
|
||||
store_nbytes(scn, &quantized_weight_set[i]);
|
||||
vfloat ixl = int_to_float(ixli);
|
||||
vfloat ixh = int_to_float(ixhi);
|
||||
|
||||
vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
|
||||
vint weight = select(ixli, ixhi, mask);
|
||||
ixl = select(ixl, ixh, mask);
|
||||
|
||||
// Invert the weight-scaling that was done initially
|
||||
storea(ixl * rscalev + low_boundv, weight_set_out + i);
|
||||
vint scn = pack_low_bytes(weight);
|
||||
store_nbytes(scn, quantized_weight_set + i);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
vint4 tab0 = vint4::load(qat.quant_to_unquant + 0);
|
||||
vint4 tab1 = vint4::load(qat.quant_to_unquant + 16);
|
||||
vint tab0p, tab1p;
|
||||
vtable_prepare(tab0, tab1, tab0p, tab1p);
|
||||
|
||||
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
|
||||
ix = clampzo(ix);
|
||||
|
||||
// Look up the two closest indexes and return the one that was closest
|
||||
vfloat ix1 = ix * quant_level_m1v;
|
||||
|
||||
vint weightl = float_to_int(ix1);
|
||||
vint weighth = min(weightl + vint(1), steps_m1);
|
||||
|
||||
vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
|
||||
vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
|
||||
|
||||
vfloat ixl = int_to_float(ixli);
|
||||
vfloat ixh = int_to_float(ixhi);
|
||||
|
||||
vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
|
||||
vint weight = select(ixli, ixhi, mask);
|
||||
ixl = select(ixl, ixh, mask);
|
||||
|
||||
// Invert the weight-scaling that was done initially
|
||||
storea(ixl * rscalev + low_boundv, weight_set_out + i);
|
||||
vint scn = pack_low_bytes(weight);
|
||||
store_nbytes(scn, quantized_weight_set + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1062,8 +1158,7 @@ void recompute_ideal_colors_1plane(
|
||||
const image_block& blk,
|
||||
const partition_info& pi,
|
||||
const decimation_info& di,
|
||||
int weight_quant_mode,
|
||||
const uint8_t* dec_weights_quant_pvalue,
|
||||
const uint8_t* dec_weights_uquant,
|
||||
endpoints& ep,
|
||||
vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
|
||||
vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
|
||||
@@ -1076,15 +1171,15 @@ void recompute_ideal_colors_1plane(
|
||||
promise(total_texel_count > 0);
|
||||
promise(partition_count > 0);
|
||||
|
||||
const quantization_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_mode];
|
||||
|
||||
float dec_weight[BLOCK_MAX_WEIGHTS];
|
||||
for (unsigned int i = 0; i < weight_count; i++)
|
||||
ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
|
||||
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
dec_weight[i] = qat.unquantized_value[dec_weights_quant_pvalue[i]] * (1.0f / 64.0f);
|
||||
vint unquant_value(dec_weights_uquant + i);
|
||||
vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f);
|
||||
storea(unquant_valuef, dec_weight + i);
|
||||
}
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
|
||||
float* undec_weight_ref;
|
||||
if (di.max_texel_weight_count == 1)
|
||||
{
|
||||
@@ -1121,7 +1216,7 @@ void recompute_ideal_colors_1plane(
|
||||
// Only compute a partition mean if more than one partition
|
||||
if (partition_count > 1)
|
||||
{
|
||||
rgba_sum = vfloat4(1e-17f);
|
||||
rgba_sum = vfloat4::zero();
|
||||
promise(texel_count > 0);
|
||||
for (unsigned int j = 0; j < texel_count; j++)
|
||||
{
|
||||
@@ -1157,7 +1252,6 @@ void recompute_ideal_colors_1plane(
|
||||
for (unsigned int j = 0; j < texel_count; j++)
|
||||
{
|
||||
unsigned int tix = texel_indexes[j];
|
||||
|
||||
vfloat4 rgba = blk.texel(tix);
|
||||
|
||||
float idx0 = undec_weight_ref[tix];
|
||||
@@ -1190,14 +1284,11 @@ void recompute_ideal_colors_1plane(
|
||||
vfloat4 right_sum = vfloat4(right_sum_s) * color_weight;
|
||||
vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
|
||||
|
||||
vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
|
||||
float psum = right_sum_s * hadd_rgb_s(color_weight);
|
||||
|
||||
color_vec_x = color_vec_x * color_weight;
|
||||
color_vec_y = color_vec_y * color_weight;
|
||||
|
||||
// Initialize the luminance and scale vectors with a reasonable default
|
||||
float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f));
|
||||
float scalediv = scale_min / astc::max(scale_max, 1e-10f);
|
||||
scalediv = astc::clamp1f(scalediv);
|
||||
|
||||
vfloat4 sds = scale_dir * scale_max;
|
||||
@@ -1249,32 +1340,38 @@ void recompute_ideal_colors_1plane(
|
||||
|
||||
if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
|
||||
{
|
||||
float scalediv2 = scale_ep0 * (1.0f / scale_ep1);
|
||||
float scalediv2 = scale_ep0 / scale_ep1;
|
||||
vfloat4 sdsm = scale_dir * scale_ep1;
|
||||
rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
|
||||
}
|
||||
}
|
||||
|
||||
// Calculations specific to mode #7, the HDR RGB-scale mode
|
||||
vfloat4 rgbq_sum = color_vec_x + color_vec_y;
|
||||
rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
|
||||
|
||||
vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
|
||||
rgbo_vectors[i] = rgbovec;
|
||||
|
||||
// We can get a failure due to the use of a singular (non-invertible) matrix
|
||||
// If it failed, compute rgbo_vectors[] with a different method ...
|
||||
if (astc::isnan(dot_s(rgbovec, rgbovec)))
|
||||
// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
|
||||
if (blk.rgb_lns[0] || blk.alpha_lns[0])
|
||||
{
|
||||
vfloat4 v0 = ep.endpt0[i];
|
||||
vfloat4 v1 = ep.endpt1[i];
|
||||
vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
|
||||
float psum = right_sum_s * hadd_rgb_s(color_weight);
|
||||
|
||||
float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
|
||||
avgdif = astc::max(avgdif, 0.0f);
|
||||
vfloat4 rgbq_sum = color_vec_x + color_vec_y;
|
||||
rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
|
||||
|
||||
vfloat4 avg = (v0 + v1) * 0.5f;
|
||||
vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
|
||||
rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
|
||||
vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
|
||||
rgbo_vectors[i] = rgbovec;
|
||||
|
||||
// We can get a failure due to the use of a singular (non-invertible) matrix
|
||||
// If it failed, compute rgbo_vectors[] with a different method ...
|
||||
if (astc::isnan(dot_s(rgbovec, rgbovec)))
|
||||
{
|
||||
vfloat4 v0 = ep.endpt0[i];
|
||||
vfloat4 v1 = ep.endpt1[i];
|
||||
|
||||
float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
|
||||
avgdif = astc::max(avgdif, 0.0f);
|
||||
|
||||
vfloat4 avg = (v0 + v1) * 0.5f;
|
||||
vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
|
||||
rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1284,9 +1381,8 @@ void recompute_ideal_colors_2planes(
|
||||
const image_block& blk,
|
||||
const block_size_descriptor& bsd,
|
||||
const decimation_info& di,
|
||||
int weight_quant_mode,
|
||||
const uint8_t* dec_weights_quant_pvalue_plane1,
|
||||
const uint8_t* dec_weights_quant_pvalue_plane2,
|
||||
const uint8_t* dec_weights_uquant_plane1,
|
||||
const uint8_t* dec_weights_uquant_plane2,
|
||||
endpoints& ep,
|
||||
vfloat4& rgbs_vector,
|
||||
vfloat4& rgbo_vector,
|
||||
@@ -1298,20 +1394,24 @@ void recompute_ideal_colors_2planes(
|
||||
promise(total_texel_count > 0);
|
||||
promise(weight_count > 0);
|
||||
|
||||
const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quant_mode]);
|
||||
|
||||
float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
|
||||
float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
|
||||
ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
|
||||
ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
|
||||
|
||||
assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
|
||||
for (unsigned int i = 0; i < weight_count; i++)
|
||||
|
||||
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
dec_weight_plane1[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane1[i]] * (1.0f / 64.0f);
|
||||
dec_weight_plane2[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane2[i]] * (1.0f / 64.0f);
|
||||
vint unquant_value1(dec_weights_uquant_plane1 + i);
|
||||
vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f);
|
||||
storea(unquant_value1f, dec_weight_plane1 + i);
|
||||
|
||||
vint unquant_value2(dec_weights_uquant_plane2 + i);
|
||||
vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f);
|
||||
storea(unquant_value2f, dec_weight_plane2 + i);
|
||||
}
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
|
||||
alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
|
||||
ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];
|
||||
|
||||
float* undec_weight_plane1_ref;
|
||||
float* undec_weight_plane2_ref;
|
||||
@@ -1419,7 +1519,7 @@ void recompute_ideal_colors_2planes(
|
||||
color_vec_x += cwprod - cwiprod;
|
||||
|
||||
scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
|
||||
weight_weight_sum += (color_weight * color_idx);
|
||||
weight_weight_sum += color_idx;
|
||||
}
|
||||
|
||||
vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight;
|
||||
@@ -1431,13 +1531,11 @@ void recompute_ideal_colors_2planes(
|
||||
vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
|
||||
vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight;
|
||||
|
||||
float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
|
||||
|
||||
color_vec_x = color_vec_x * color_weight;
|
||||
color_vec_y = color_vec_y * color_weight;
|
||||
|
||||
// Initialize the luminance and scale vectors with a reasonable default
|
||||
float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f));
|
||||
float scalediv = scale_min / astc::max(scale_max, 1e-10f);
|
||||
scalediv = astc::clamp1f(scalediv);
|
||||
|
||||
vfloat4 sds = scale_dir * scale_max;
|
||||
@@ -1493,7 +1591,7 @@ void recompute_ideal_colors_2planes(
|
||||
|
||||
if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
|
||||
{
|
||||
float scalediv2 = scale_ep0 * (1.0f / scale_ep1);
|
||||
float scalediv2 = scale_ep0 / scale_ep1;
|
||||
vfloat4 sdsm = scale_dir * scale_ep1;
|
||||
rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
|
||||
}
|
||||
@@ -1533,26 +1631,32 @@ void recompute_ideal_colors_2planes(
|
||||
ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
|
||||
}
|
||||
|
||||
// Calculations specific to mode #7, the HDR RGB-scale mode
|
||||
vfloat4 rgbq_sum = color_vec_x + color_vec_y;
|
||||
rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
|
||||
|
||||
rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
|
||||
|
||||
// We can get a failure due to the use of a singular (non-invertible) matrix
|
||||
// If it failed, compute rgbo_vectors[] with a different method ...
|
||||
if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
|
||||
// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
|
||||
if (blk.rgb_lns[0] || blk.alpha_lns[0])
|
||||
{
|
||||
vfloat4 v0 = ep.endpt0[0];
|
||||
vfloat4 v1 = ep.endpt1[0];
|
||||
weight_weight_sum = weight_weight_sum * color_weight;
|
||||
float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
|
||||
|
||||
float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
|
||||
avgdif = astc::max(avgdif, 0.0f);
|
||||
vfloat4 rgbq_sum = color_vec_x + color_vec_y;
|
||||
rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
|
||||
|
||||
vfloat4 avg = (v0 + v1) * 0.5f;
|
||||
vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
|
||||
rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
|
||||
|
||||
rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
|
||||
// We can get a failure due to the use of a singular (non-invertible) matrix
|
||||
// If it failed, compute rgbo_vectors[] with a different method ...
|
||||
if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
|
||||
{
|
||||
vfloat4 v0 = ep.endpt0[0];
|
||||
vfloat4 v1 = ep.endpt1[0];
|
||||
|
||||
float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
|
||||
avgdif = astc::max(avgdif, 0.0f);
|
||||
|
||||
vfloat4 avg = (v0 + v1) * 0.5f;
|
||||
vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
|
||||
|
||||
rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -109,7 +109,7 @@ static vfloat4 swz_texel(
|
||||
vfloat4 data,
|
||||
const astcenc_swizzle& swz
|
||||
) {
|
||||
alignas(16) float datas[6];
|
||||
ASTCENC_ALIGNAS float datas[6];
|
||||
|
||||
storea(data, datas);
|
||||
datas[ASTCENC_SWZ_0] = 0.0f;
|
||||
@@ -143,12 +143,12 @@ static vfloat4 encode_texel_lns(
|
||||
vmask4 lns_mask
|
||||
) {
|
||||
vfloat4 datav_unorm = data * 65535.0f;
|
||||
vfloat4 datav_lns = float_to_lns(data);
|
||||
vfloat4 datav_lns = float_to_lns(data);
|
||||
return select(datav_unorm, datav_lns, lns_mask);
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void fetch_image_block(
|
||||
void load_image_block(
|
||||
astcenc_profile decode_mode,
|
||||
const astcenc_image& img,
|
||||
image_block& blk,
|
||||
@@ -265,7 +265,7 @@ void fetch_image_block(
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void fetch_image_block_fast_ldr(
|
||||
void load_image_block_fast_ldr(
|
||||
astcenc_profile decode_mode,
|
||||
const astcenc_image& img,
|
||||
image_block& blk,
|
||||
@@ -332,7 +332,7 @@ void fetch_image_block_fast_ldr(
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void write_image_block(
|
||||
void store_image_block(
|
||||
astcenc_image& img,
|
||||
const image_block& blk,
|
||||
const block_size_descriptor& bsd,
|
||||
@@ -341,24 +341,21 @@ void write_image_block(
|
||||
unsigned int zpos,
|
||||
const astcenc_swizzle& swz
|
||||
) {
|
||||
unsigned int xsize = img.dim_x;
|
||||
unsigned int ysize = img.dim_y;
|
||||
unsigned int zsize = img.dim_z;
|
||||
|
||||
unsigned int x_size = img.dim_x;
|
||||
unsigned int x_start = xpos;
|
||||
unsigned int x_end = std::min(xsize, xpos + bsd.xdim);
|
||||
unsigned int x_nudge = bsd.xdim - (x_end - x_start);
|
||||
unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
|
||||
unsigned int x_count = x_end - x_start;
|
||||
unsigned int x_nudge = bsd.xdim - x_count;
|
||||
|
||||
unsigned int y_size = img.dim_y;
|
||||
unsigned int y_start = ypos;
|
||||
unsigned int y_end = std::min(ysize, ypos + bsd.ydim);
|
||||
unsigned int y_nudge = (bsd.ydim - (y_end - y_start)) * bsd.xdim;
|
||||
unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
|
||||
unsigned int y_count = y_end - y_start;
|
||||
unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
|
||||
|
||||
unsigned int z_size = img.dim_z;
|
||||
unsigned int z_start = zpos;
|
||||
unsigned int z_end = std::min(zsize, zpos + bsd.zdim);
|
||||
|
||||
float data[7];
|
||||
data[ASTCENC_SWZ_0] = 0.0f;
|
||||
data[ASTCENC_SWZ_1] = 1.0f;
|
||||
unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
|
||||
|
||||
// True if any non-identity swizzle
|
||||
bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
|
||||
@@ -378,47 +375,68 @@ void write_image_block(
|
||||
|
||||
for (unsigned int y = y_start; y < y_end; y++)
|
||||
{
|
||||
for (unsigned int x = x_start; x < x_end; x++)
|
||||
{
|
||||
vint4 colori = vint4::zero();
|
||||
uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
|
||||
|
||||
// Errors are NaN encoded - convert to magenta error color
|
||||
if (blk.data_r[idx] != blk.data_r[idx])
|
||||
for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
unsigned int max_texels = ASTCENC_SIMD_WIDTH;
|
||||
unsigned int used_texels = astc::min(x_count - x, max_texels);
|
||||
|
||||
// Unaligned load as rows are not always SIMD_WIDTH long
|
||||
vfloat data_r(blk.data_r + idx);
|
||||
vfloat data_g(blk.data_g + idx);
|
||||
vfloat data_b(blk.data_b + idx);
|
||||
vfloat data_a(blk.data_a + idx);
|
||||
|
||||
vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
|
||||
vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
|
||||
vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
|
||||
vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
|
||||
|
||||
if (needs_swz)
|
||||
{
|
||||
colori = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
}
|
||||
else if (needs_swz)
|
||||
{
|
||||
data[ASTCENC_SWZ_R] = blk.data_r[idx];
|
||||
data[ASTCENC_SWZ_G] = blk.data_g[idx];
|
||||
data[ASTCENC_SWZ_B] = blk.data_b[idx];
|
||||
data[ASTCENC_SWZ_A] = blk.data_a[idx];
|
||||
vint swizzle_table[7];
|
||||
swizzle_table[ASTCENC_SWZ_0] = vint(0);
|
||||
swizzle_table[ASTCENC_SWZ_1] = vint(255);
|
||||
swizzle_table[ASTCENC_SWZ_R] = data_ri;
|
||||
swizzle_table[ASTCENC_SWZ_G] = data_gi;
|
||||
swizzle_table[ASTCENC_SWZ_B] = data_bi;
|
||||
swizzle_table[ASTCENC_SWZ_A] = data_ai;
|
||||
|
||||
if (needs_z)
|
||||
{
|
||||
float xcoord = (data[0] * 2.0f) - 1.0f;
|
||||
float ycoord = (data[3] * 2.0f) - 1.0f;
|
||||
float zcoord = 1.0f - xcoord * xcoord - ycoord * ycoord;
|
||||
if (zcoord < 0.0f)
|
||||
{
|
||||
zcoord = 0.0f;
|
||||
}
|
||||
data[ASTCENC_SWZ_Z] = (astc::sqrt(zcoord) * 0.5f) + 0.5f;
|
||||
vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
|
||||
vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
|
||||
vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
|
||||
data_z = max(data_z, 0.0f);
|
||||
data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
|
||||
|
||||
swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
|
||||
}
|
||||
|
||||
vfloat4 color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
|
||||
colori = float_to_int_rtn(min(color, 1.0f) * 255.0f);
|
||||
data_ri = swizzle_table[swz.r];
|
||||
data_gi = swizzle_table[swz.g];
|
||||
data_bi = swizzle_table[swz.b];
|
||||
data_ai = swizzle_table[swz.a];
|
||||
}
|
||||
else
|
||||
|
||||
// Errors are NaN encoded - convert to magenta error color
|
||||
// Branch is OK here - it is almost never true so predicts well
|
||||
vmask nan_mask = data_r != data_r;
|
||||
if (any(nan_mask))
|
||||
{
|
||||
vfloat4 color = blk.texel(idx);
|
||||
colori = float_to_int_rtn(min(color, 1.0f) * 255.0f);
|
||||
data_ri = select(data_ri, vint(0xFF), nan_mask);
|
||||
data_gi = select(data_gi, vint(0x00), nan_mask);
|
||||
data_bi = select(data_bi, vint(0xFF), nan_mask);
|
||||
data_ai = select(data_ai, vint(0xFF), nan_mask);
|
||||
}
|
||||
|
||||
colori = pack_low_bytes(colori);
|
||||
store_nbytes(colori, data8 + (4 * xsize * y) + (4 * x ));
|
||||
vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
|
||||
vmask store_mask = vint::lane_id() < vint(used_texels);
|
||||
store_lanes_masked(data8_row, data_rgbai, store_mask);
|
||||
|
||||
idx++;
|
||||
data8_row += ASTCENC_SIMD_WIDTH * 4;
|
||||
idx += used_texels;
|
||||
}
|
||||
idx += x_nudge;
|
||||
}
|
||||
@@ -434,13 +452,18 @@ void write_image_block(
|
||||
|
||||
for (unsigned int y = y_start; y < y_end; y++)
|
||||
{
|
||||
for (unsigned int x = x_start; x < x_end; x++)
|
||||
uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
|
||||
|
||||
for (unsigned int x = 0; x < x_count; x++)
|
||||
{
|
||||
vint4 color;
|
||||
|
||||
// NaNs are handled inline - no need to special case
|
||||
if (needs_swz)
|
||||
{
|
||||
float data[7];
|
||||
data[ASTCENC_SWZ_0] = 0.0f;
|
||||
data[ASTCENC_SWZ_1] = 1.0f;
|
||||
data[ASTCENC_SWZ_R] = blk.data_r[idx];
|
||||
data[ASTCENC_SWZ_G] = blk.data_g[idx];
|
||||
data[ASTCENC_SWZ_B] = blk.data_b[idx];
|
||||
@@ -467,11 +490,12 @@ void write_image_block(
|
||||
color = float_to_float16(colorf);
|
||||
}
|
||||
|
||||
data16[(4 * xsize * y) + (4 * x )] = static_cast<uint16_t>(color.lane<0>());
|
||||
data16[(4 * xsize * y) + (4 * x + 1)] = static_cast<uint16_t>(color.lane<1>());
|
||||
data16[(4 * xsize * y) + (4 * x + 2)] = static_cast<uint16_t>(color.lane<2>());
|
||||
data16[(4 * xsize * y) + (4 * x + 3)] = static_cast<uint16_t>(color.lane<3>());
|
||||
|
||||
// TODO: Vectorize with store N shorts?
|
||||
data16_row[0] = static_cast<uint16_t>(color.lane<0>());
|
||||
data16_row[1] = static_cast<uint16_t>(color.lane<1>());
|
||||
data16_row[2] = static_cast<uint16_t>(color.lane<2>());
|
||||
data16_row[3] = static_cast<uint16_t>(color.lane<3>());
|
||||
data16_row += 4;
|
||||
idx++;
|
||||
}
|
||||
idx += x_nudge;
|
||||
@@ -490,13 +514,18 @@ void write_image_block(
|
||||
|
||||
for (unsigned int y = y_start; y < y_end; y++)
|
||||
{
|
||||
for (unsigned int x = x_start; x < x_end; x++)
|
||||
float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
|
||||
|
||||
for (unsigned int x = 0; x < x_count; x++)
|
||||
{
|
||||
vfloat4 color = blk.texel(idx);
|
||||
|
||||
// NaNs are handled inline - no need to special case
|
||||
if (needs_swz)
|
||||
{
|
||||
float data[7];
|
||||
data[ASTCENC_SWZ_0] = 0.0f;
|
||||
data[ASTCENC_SWZ_1] = 1.0f;
|
||||
data[ASTCENC_SWZ_R] = color.lane<0>();
|
||||
data[ASTCENC_SWZ_G] = color.lane<1>();
|
||||
data[ASTCENC_SWZ_B] = color.lane<2>();
|
||||
@@ -517,8 +546,8 @@ void write_image_block(
|
||||
color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
|
||||
}
|
||||
|
||||
store(color, data32 + (4 * xsize * y) + (4 * x ));
|
||||
|
||||
store(color, data32_row);
|
||||
data32_row += 4;
|
||||
idx++;
|
||||
}
|
||||
idx += x_nudge;
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include <array>
|
||||
|
||||
/** @brief Unpacked quint triplets <low,middle,high> for each packed value */
|
||||
// TODO: Bitpack these into a uint16_t?
|
||||
static const uint8_t quints_of_integer[128][3] {
|
||||
{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0},
|
||||
{4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4},
|
||||
@@ -99,6 +100,7 @@ static const uint8_t integer_of_quints[5][5][5] {
|
||||
};
|
||||
|
||||
/** @brief Unpacked trit quintuplets <low,...,high> for each packed value */
|
||||
// TODO: Bitpack these into a uint16_t?
|
||||
static const uint8_t trits_of_integer[256][5] {
|
||||
{0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0},
|
||||
{0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0},
|
||||
@@ -334,44 +336,41 @@ static const uint8_t integer_of_trits[3][3][3][3][3] {
|
||||
*/
|
||||
struct btq_count
|
||||
{
|
||||
/** @brief The quantization level. */
|
||||
uint8_t quant;
|
||||
|
||||
/** @brief The number of bits. */
|
||||
uint8_t bits;
|
||||
uint8_t bits:6;
|
||||
|
||||
/** @brief The number of trits. */
|
||||
uint8_t trits;
|
||||
uint8_t trits:1;
|
||||
|
||||
/** @brief The number of quints. */
|
||||
uint8_t quints;
|
||||
uint8_t quints:1;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief The table of bits, trits, and quints needed for a quant encode.
|
||||
*/
|
||||
static const std::array<btq_count, 21> btq_counts {{
|
||||
{ QUANT_2, 1, 0, 0 },
|
||||
{ QUANT_3, 0, 1, 0 },
|
||||
{ QUANT_4, 2, 0, 0 },
|
||||
{ QUANT_5, 0, 0, 1 },
|
||||
{ QUANT_6, 1, 1, 0 },
|
||||
{ QUANT_8, 3, 0, 0 },
|
||||
{ QUANT_10, 1, 0, 1 },
|
||||
{ QUANT_12, 2, 1, 0 },
|
||||
{ QUANT_16, 4, 0, 0 },
|
||||
{ QUANT_20, 2, 0, 1 },
|
||||
{ QUANT_24, 3, 1, 0 },
|
||||
{ QUANT_32, 5, 0, 0 },
|
||||
{ QUANT_40, 3, 0, 1 },
|
||||
{ QUANT_48, 4, 1, 0 },
|
||||
{ QUANT_64, 6, 0, 0 },
|
||||
{ QUANT_80, 4, 0, 1 },
|
||||
{ QUANT_96, 5, 1, 0 },
|
||||
{ QUANT_128, 7, 0, 0 },
|
||||
{ QUANT_160, 5, 0, 1 },
|
||||
{ QUANT_192, 6, 1, 0 },
|
||||
{ QUANT_256, 8, 0, 0 }
|
||||
{ 1, 0, 0 }, // QUANT_2
|
||||
{ 0, 1, 0 }, // QUANT_3
|
||||
{ 2, 0, 0 }, // QUANT_4
|
||||
{ 0, 0, 1 }, // QUANT_5
|
||||
{ 1, 1, 0 }, // QUANT_6
|
||||
{ 3, 0, 0 }, // QUANT_8
|
||||
{ 1, 0, 1 }, // QUANT_10
|
||||
{ 2, 1, 0 }, // QUANT_12
|
||||
{ 4, 0, 0 }, // QUANT_16
|
||||
{ 2, 0, 1 }, // QUANT_20
|
||||
{ 3, 1, 0 }, // QUANT_24
|
||||
{ 5, 0, 0 }, // QUANT_32
|
||||
{ 3, 0, 1 }, // QUANT_40
|
||||
{ 4, 1, 0 }, // QUANT_48
|
||||
{ 6, 0, 0 }, // QUANT_64
|
||||
{ 4, 0, 1 }, // QUANT_80
|
||||
{ 5, 1, 0 }, // QUANT_96
|
||||
{ 7, 0, 0 }, // QUANT_128
|
||||
{ 5, 0, 1 }, // QUANT_160
|
||||
{ 6, 1, 0 }, // QUANT_192
|
||||
{ 8, 0, 0 } // QUANT_256
|
||||
}};
|
||||
|
||||
/**
|
||||
@@ -382,44 +381,38 @@ static const std::array<btq_count, 21> btq_counts {{
|
||||
*/
|
||||
struct ise_size
|
||||
{
|
||||
/** @brief The quantization level. */
|
||||
uint8_t quant;
|
||||
|
||||
/** @brief The scaling parameter. */
|
||||
uint8_t scale;
|
||||
|
||||
/** @brief The rounding parameter. */
|
||||
uint8_t round;
|
||||
uint8_t scale:6;
|
||||
|
||||
/** @brief The divisor parameter. */
|
||||
uint8_t divisor;
|
||||
uint8_t divisor:2;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief The table of scale, round, and divisors needed for quant sizing.
|
||||
*/
|
||||
static const std::array<ise_size, 21> ise_sizes {{
|
||||
{ QUANT_2, 1, 0, 1 },
|
||||
{ QUANT_3, 8, 4, 5 },
|
||||
{ QUANT_4, 2, 0, 1 },
|
||||
{ QUANT_5, 7, 2, 3 },
|
||||
{ QUANT_6, 13, 4, 5 },
|
||||
{ QUANT_8, 3, 0, 1 },
|
||||
{ QUANT_10, 10, 2, 3 },
|
||||
{ QUANT_12, 18, 4, 5 },
|
||||
{ QUANT_16, 4, 0, 1 },
|
||||
{ QUANT_20, 13, 2, 3 },
|
||||
{ QUANT_24, 23, 4, 5 },
|
||||
{ QUANT_32, 5, 0, 1 },
|
||||
{ QUANT_40, 16, 2, 3 },
|
||||
{ QUANT_48, 28, 4, 5 },
|
||||
{ QUANT_64, 6, 0, 1 },
|
||||
{ QUANT_80, 19, 2, 3 },
|
||||
{ QUANT_96, 33, 4, 5 },
|
||||
{ QUANT_128, 7, 0, 1 },
|
||||
{ QUANT_160, 22, 2, 3 },
|
||||
{ QUANT_192, 38, 4, 5 },
|
||||
{ QUANT_256, 8, 0, 1 }
|
||||
{ 1, 0 }, // QUANT_2
|
||||
{ 8, 2 }, // QUANT_3
|
||||
{ 2, 0 }, // QUANT_4
|
||||
{ 7, 1 }, // QUANT_5
|
||||
{ 13, 2 }, // QUANT_6
|
||||
{ 3, 0 }, // QUANT_8
|
||||
{ 10, 1 }, // QUANT_10
|
||||
{ 18, 2 }, // QUANT_12
|
||||
{ 4, 0 }, // QUANT_16
|
||||
{ 13, 1 }, // QUANT_20
|
||||
{ 23, 2 }, // QUANT_24
|
||||
{ 5, 0 }, // QUANT_32
|
||||
{ 16, 1 }, // QUANT_40
|
||||
{ 28, 2 }, // QUANT_48
|
||||
{ 6, 0 }, // QUANT_64
|
||||
{ 19, 1 }, // QUANT_80
|
||||
{ 33, 2 }, // QUANT_96
|
||||
{ 7, 0 }, // QUANT_128
|
||||
{ 22, 1 }, // QUANT_160
|
||||
{ 38, 2 }, // QUANT_192
|
||||
{ 8, 0 } // QUANT_256
|
||||
}};
|
||||
|
||||
/* See header for documentation. */
|
||||
@@ -435,7 +428,8 @@ unsigned int get_ise_sequence_bitcount(
|
||||
}
|
||||
|
||||
auto& entry = ise_sizes[quant_level];
|
||||
return (entry.scale * character_count + entry.round) / entry.divisor;
|
||||
unsigned int divisor = (entry.divisor << 1) + 1;
|
||||
return (entry.scale * character_count + divisor - 1) / divisor;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -645,7 +639,6 @@ void encode_ise(
|
||||
// Write out just bits
|
||||
else
|
||||
{
|
||||
promise(character_count > 0);
|
||||
for (unsigned int i = 0; i < character_count; i++)
|
||||
{
|
||||
write_bits(input_data[i], bits, bit_offset, output_data);
|
||||
@@ -685,10 +678,10 @@ void decode_ise(
|
||||
|
||||
if (trits)
|
||||
{
|
||||
static const unsigned int bits_to_read[5] { 2, 2, 1, 2, 1 };
|
||||
static const unsigned int block_shift[5] { 0, 2, 4, 5, 7 };
|
||||
static const unsigned int next_lcounter[5] { 1, 2, 3, 4, 0 };
|
||||
static const unsigned int hcounter_incr[5] { 0, 0, 0, 0, 1 };
|
||||
static const uint8_t bits_to_read[5] { 2, 2, 1, 2, 1 };
|
||||
static const uint8_t block_shift[5] { 0, 2, 4, 5, 7 };
|
||||
static const uint8_t next_lcounter[5] { 1, 2, 3, 4, 0 };
|
||||
static const uint8_t hcounter_incr[5] { 0, 0, 0, 0, 1 };
|
||||
unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
|
||||
bit_offset += bits_to_read[lcounter];
|
||||
tq_blocks[hcounter] |= tdata << block_shift[lcounter];
|
||||
@@ -698,10 +691,10 @@ void decode_ise(
|
||||
|
||||
if (quints)
|
||||
{
|
||||
static const unsigned int bits_to_read[3] { 3, 2, 2 };
|
||||
static const unsigned int block_shift[3] { 0, 3, 5 };
|
||||
static const unsigned int next_lcounter[3] { 1, 2, 0 };
|
||||
static const unsigned int hcounter_incr[3] { 0, 0, 1 };
|
||||
static const uint8_t bits_to_read[3] { 3, 2, 2 };
|
||||
static const uint8_t block_shift[3] { 0, 3, 5 };
|
||||
static const uint8_t next_lcounter[3] { 1, 2, 0 };
|
||||
static const uint8_t hcounter_incr[3] { 0, 0, 1 };
|
||||
unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
|
||||
bit_offset += bits_to_read[lcounter];
|
||||
tq_blocks[hcounter] |= tdata << block_shift[lcounter];
|
||||
@@ -714,6 +707,7 @@ void decode_ise(
|
||||
if (trits)
|
||||
{
|
||||
unsigned int trit_blocks = (character_count + 4) / 5;
|
||||
promise(trit_blocks > 0);
|
||||
for (unsigned int i = 0; i < trit_blocks; i++)
|
||||
{
|
||||
const uint8_t *tritptr = trits_of_integer[tq_blocks[i]];
|
||||
@@ -728,6 +722,7 @@ void decode_ise(
|
||||
if (quints)
|
||||
{
|
||||
unsigned int quint_blocks = (character_count + 2) / 3;
|
||||
promise(quint_blocks > 0);
|
||||
for (unsigned int i = 0; i < quint_blocks; i++)
|
||||
{
|
||||
const uint8_t *quintptr = quints_of_integer[tq_blocks[i]];
|
||||
|
||||
@@ -0,0 +1,330 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions and data declarations for the outer context.
|
||||
*
|
||||
* The outer context includes thread-pool management, which is slower to
|
||||
* compile due to increased use of C++ stdlib. The inner context used in the
|
||||
* majority of the codec library does not include this.
|
||||
*/
|
||||
|
||||
#ifndef ASTCENC_INTERNAL_ENTRY_INCLUDED
|
||||
#define ASTCENC_INTERNAL_ENTRY_INCLUDED
|
||||
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
/* ============================================================================
|
||||
Parallel execution control
|
||||
============================================================================ */
|
||||
|
||||
/**
|
||||
* @brief A simple counter-based manager for parallel task execution.
|
||||
*
|
||||
* The task processing execution consists of:
|
||||
*
|
||||
* * A single-threaded init stage.
|
||||
* * A multi-threaded processing stage.
|
||||
* * A condition variable so threads can wait for processing completion.
|
||||
*
|
||||
* The init stage will be executed by the first thread to arrive in the critical section, there is
|
||||
* no main thread in the thread pool.
|
||||
*
|
||||
* The processing stage uses dynamic dispatch to assign task tickets to threads on an on-demand
|
||||
* basis. Threads may each therefore execute different numbers of tasks, depending on their
|
||||
* processing complexity. The task queue and the task tickets are just counters; the caller must map
|
||||
* these integers to an actual processing partition in a specific problem domain.
|
||||
*
|
||||
* The exit wait condition is needed to ensure processing has finished before a worker thread can
|
||||
* progress to the next stage of the pipeline. Specifically a worker may exit the processing stage
|
||||
* because there are no new tasks to assign to it while other worker threads are still processing.
|
||||
* Calling @c wait() will ensure that all other workers have finished before the thread can proceed.
|
||||
*
|
||||
* The basic usage model:
|
||||
*
|
||||
* // --------- From single-threaded code ---------
|
||||
*
|
||||
* // Reset the tracker state
|
||||
* manager->reset()
|
||||
*
|
||||
* // --------- From multi-threaded code ---------
|
||||
*
|
||||
* // Run the stage init; only first thread actually runs the lambda
|
||||
* manager->init(<lambda>)
|
||||
*
|
||||
* do
|
||||
* {
|
||||
* // Request a task assignment
|
||||
* uint task_count;
|
||||
* uint base_index = manager->get_task_assignment(<granule>, task_count);
|
||||
*
|
||||
* // Process any tasks we were given (task_count <= granule size)
|
||||
* if (task_count)
|
||||
* {
|
||||
* // Run the user task processing code for N tasks here
|
||||
* ...
|
||||
*
|
||||
* // Flag these tasks as complete
|
||||
* manager->complete_task_assignment(task_count);
|
||||
* }
|
||||
* } while (task_count);
|
||||
*
|
||||
* // Wait for all threads to complete tasks before progressing
|
||||
* manager->wait()
|
||||
*
|
||||
* // Run the stage term; only first thread actually runs the lambda
|
||||
* manager->term(<lambda>)
|
||||
*/
|
||||
class ParallelManager
|
||||
{
|
||||
private:
|
||||
/** @brief Lock used for critical section and condition synchronization. */
|
||||
std::mutex m_lock;
|
||||
|
||||
/** @brief True if the stage init() step has been executed. */
|
||||
bool m_init_done;
|
||||
|
||||
/** @brief True if the stage term() step has been executed. */
|
||||
bool m_term_done;
|
||||
|
||||
/** @brief Condition variable for tracking stage processing completion. */
|
||||
std::condition_variable m_complete;
|
||||
|
||||
/** @brief Number of tasks started, but not necessarily finished. */
|
||||
std::atomic<unsigned int> m_start_count;
|
||||
|
||||
/** @brief Number of tasks finished. */
|
||||
unsigned int m_done_count;
|
||||
|
||||
/** @brief Number of tasks that need to be processed. */
|
||||
unsigned int m_task_count;
|
||||
|
||||
/** @brief Progress callback (optional). */
|
||||
astcenc_progress_callback m_callback;
|
||||
|
||||
/** @brief Lock used for callback synchronization. */
|
||||
std::mutex m_callback_lock;
|
||||
|
||||
/** @brief Minimum progress before making a callback. */
|
||||
float m_callback_min_diff;
|
||||
|
||||
/** @brief Last progress callback value. */
|
||||
float m_callback_last_value;
|
||||
|
||||
public:
|
||||
/** @brief Create a new ParallelManager. */
|
||||
ParallelManager()
|
||||
{
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Reset the tracker for a new processing batch.
|
||||
*
|
||||
* This must be called from single-threaded code before starting the multi-threaded processing
|
||||
* operations.
|
||||
*/
|
||||
void reset()
|
||||
{
|
||||
m_init_done = false;
|
||||
m_term_done = false;
|
||||
m_start_count = 0;
|
||||
m_done_count = 0;
|
||||
m_task_count = 0;
|
||||
m_callback_last_value = 0.0f;
|
||||
m_callback_min_diff = 1.0f;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Trigger the pipeline stage init step.
|
||||
*
|
||||
* This can be called from multi-threaded code. The first thread to hit this will process the
|
||||
* initialization. Other threads will block and wait for it to complete.
|
||||
*
|
||||
* @param init_func Callable which executes the stage initialization. It must return the
|
||||
* total number of tasks in the stage.
|
||||
*/
|
||||
void init(std::function<unsigned int(void)> init_func)
|
||||
{
|
||||
std::lock_guard<std::mutex> lck(m_lock);
|
||||
if (!m_init_done)
|
||||
{
|
||||
m_task_count = init_func();
|
||||
m_init_done = true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Trigger the pipeline stage init step.
|
||||
*
|
||||
* This can be called from multi-threaded code. The first thread to hit this will process the
|
||||
* initialization. Other threads will block and wait for it to complete.
|
||||
*
|
||||
* @param task_count Total number of tasks needing processing.
|
||||
* @param callback Function pointer for progress status callbacks.
|
||||
*/
|
||||
void init(unsigned int task_count, astcenc_progress_callback callback)
|
||||
{
|
||||
std::lock_guard<std::mutex> lck(m_lock);
|
||||
if (!m_init_done)
|
||||
{
|
||||
m_callback = callback;
|
||||
m_task_count = task_count;
|
||||
m_init_done = true;
|
||||
|
||||
// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
|
||||
float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
|
||||
m_callback_min_diff = astc::max(min_diff, 1.0f);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Request a task assignment.
|
||||
*
|
||||
* Assign up to @c granule tasks to the caller for processing.
|
||||
*
|
||||
* @param granule Maximum number of tasks that can be assigned.
|
||||
* @param[out] count Actual number of tasks assigned, or zero if no tasks were assigned.
|
||||
*
|
||||
* @return Task index of the first assigned task; assigned tasks increment from this.
|
||||
*/
|
||||
unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
|
||||
{
|
||||
unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
|
||||
if (base >= m_task_count)
|
||||
{
|
||||
count = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
count = astc::min(m_task_count - base, granule);
|
||||
return base;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Complete a task assignment.
|
||||
*
|
||||
* Mark @c count tasks as complete. This will notify all threads blocked on @c wait() if this
|
||||
* completes the processing of the stage.
|
||||
*
|
||||
* @param count The number of completed tasks.
|
||||
*/
|
||||
void complete_task_assignment(unsigned int count)
|
||||
{
|
||||
// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
|
||||
// update here and the wait() for other threads
|
||||
unsigned int local_count;
|
||||
float local_last_value;
|
||||
{
|
||||
std::unique_lock<std::mutex> lck(m_lock);
|
||||
m_done_count += count;
|
||||
local_count = m_done_count;
|
||||
local_last_value = m_callback_last_value;
|
||||
|
||||
if (m_done_count == m_task_count)
|
||||
{
|
||||
// Ensure the progress bar hits 100%
|
||||
if (m_callback)
|
||||
{
|
||||
std::unique_lock<std::mutex> cblck(m_callback_lock);
|
||||
m_callback(100.0f);
|
||||
m_callback_last_value = 100.0f;
|
||||
}
|
||||
|
||||
lck.unlock();
|
||||
m_complete.notify_all();
|
||||
}
|
||||
}
|
||||
|
||||
// Process progress callback if we have one
|
||||
if (m_callback)
|
||||
{
|
||||
// Initial lockless test - have we progressed enough to emit?
|
||||
float num = static_cast<float>(local_count);
|
||||
float den = static_cast<float>(m_task_count);
|
||||
float this_value = (num / den) * 100.0f;
|
||||
bool report_test = (this_value - local_last_value) > m_callback_min_diff;
|
||||
|
||||
// Recheck under lock, because another thread might report first
|
||||
if (report_test)
|
||||
{
|
||||
std::unique_lock<std::mutex> cblck(m_callback_lock);
|
||||
bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
|
||||
if (report_retest)
|
||||
{
|
||||
m_callback(this_value);
|
||||
m_callback_last_value = this_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Wait for stage processing to complete.
|
||||
*/
|
||||
void wait()
|
||||
{
|
||||
std::unique_lock<std::mutex> lck(m_lock);
|
||||
m_complete.wait(lck, [this]{ return m_done_count == m_task_count; });
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Trigger the pipeline stage term step.
|
||||
*
|
||||
* This can be called from multi-threaded code. The first thread to hit this will process the
|
||||
* work pool termination. Caller must have called @c wait() prior to calling this function to
|
||||
* ensure that processing is complete.
|
||||
*
|
||||
* @param term_func Callable which executes the stage termination.
|
||||
*/
|
||||
void term(std::function<void(void)> term_func)
|
||||
{
|
||||
std::lock_guard<std::mutex> lck(m_lock);
|
||||
if (!m_term_done)
|
||||
{
|
||||
term_func();
|
||||
m_term_done = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief The astcenc compression context.
|
||||
*/
|
||||
struct astcenc_context
|
||||
{
|
||||
/** @brief The context internal state. */
|
||||
astcenc_contexti context;
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
/** @brief The parallel manager for averages computation. */
|
||||
ParallelManager manage_avg;
|
||||
|
||||
/** @brief The parallel manager for compression. */
|
||||
ParallelManager manage_compress;
|
||||
#endif
|
||||
|
||||
/** @brief The parallel manager for decompression. */
|
||||
ParallelManager manage_decompress;
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -48,8 +48,6 @@
|
||||
#define ASTCENC_SSE 42
|
||||
#elif defined(__SSE4_1__)
|
||||
#define ASTCENC_SSE 41
|
||||
#elif defined(__SSE3__)
|
||||
#define ASTCENC_SSE 30
|
||||
#elif defined(__SSE2__)
|
||||
#define ASTCENC_SSE 20
|
||||
#else
|
||||
@@ -75,10 +73,22 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Force vector-sized SIMD alignment
|
||||
#if ASTCENC_AVX
|
||||
#define ASTCENC_VECALIGN 32
|
||||
#else
|
||||
#elif ASTCENC_SSE || ASTCENC_NEON
|
||||
#define ASTCENC_VECALIGN 16
|
||||
// Use default alignment for non-SIMD builds
|
||||
#else
|
||||
#define ASTCENC_VECALIGN 0
|
||||
#endif
|
||||
|
||||
// C++11 states that alignas(0) should be ignored but GCC doesn't do
|
||||
// this on some versions, so workaround and avoid emitting alignas(0)
|
||||
#if ASTCENC_VECALIGN > 0
|
||||
#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
|
||||
#else
|
||||
#define ASTCENC_ALIGNAS
|
||||
#endif
|
||||
|
||||
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
|
||||
|
||||
@@ -273,7 +273,7 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
|
||||
of the mantissa is set.)
|
||||
*/
|
||||
p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */
|
||||
return ((inp + vlx) >> 13) | (p >> 14);
|
||||
return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
|
||||
/*
|
||||
positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
|
||||
If it is, then return 0, else return 1 (the smallest representable nonzero number)
|
||||
@@ -283,7 +283,7 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
|
||||
-inp will set the MSB if the input number is nonzero.
|
||||
Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
|
||||
*/
|
||||
return static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31;
|
||||
return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);
|
||||
|
||||
/*
|
||||
negative, exponent = 0, round-mode == DOWN, need to check whether number is
|
||||
@@ -296,7 +296,7 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
|
||||
the MSB set if it isn't. We then right-shift the value by 31 places to
|
||||
get a value that is 0 if the input is -0.0 and 1 otherwise.
|
||||
*/
|
||||
return ((vlx - inp) >> 31) + UINT32_C(0x8000);
|
||||
return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));
|
||||
|
||||
/*
|
||||
for all other cases involving underflow/overflow, we don't need to
|
||||
@@ -330,7 +330,7 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
|
||||
case 47:
|
||||
case 48:
|
||||
case 49:
|
||||
return vlx;
|
||||
return static_cast<sf16>(vlx);
|
||||
|
||||
/*
|
||||
for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
|
||||
@@ -349,14 +349,14 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
|
||||
case 36:
|
||||
case 37:
|
||||
case 39:
|
||||
return (inp + vlx) >> 13;
|
||||
return static_cast<sf16>((inp + vlx) >> 13);
|
||||
|
||||
/* normal number, round-to-nearest-even. */
|
||||
case 33:
|
||||
case 38:
|
||||
p = inp + vlx;
|
||||
p += (inp >> 13) & 1;
|
||||
return p >> 13;
|
||||
return static_cast<sf16>(p >> 13);
|
||||
|
||||
/*
|
||||
the various denormal cases. These are not expected to be common, so their performance is a bit
|
||||
@@ -371,22 +371,22 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
|
||||
case 27:
|
||||
/* denormal, round towards zero. */
|
||||
p = 126 - ((inp >> 23) & 0xFF);
|
||||
return (((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx;
|
||||
return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);
|
||||
case 20:
|
||||
case 26:
|
||||
/* denormal, round away from zero. */
|
||||
p = 126 - ((inp >> 23) & 0xFF);
|
||||
return rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
|
||||
return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
|
||||
case 24:
|
||||
case 29:
|
||||
/* denormal, round to nearest-away */
|
||||
p = 126 - ((inp >> 23) & 0xFF);
|
||||
return rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
|
||||
return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
|
||||
case 23:
|
||||
case 28:
|
||||
/* denormal, round to nearest-even. */
|
||||
p = 126 - ((inp >> 23) & 0xFF);
|
||||
return rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
|
||||
return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -21,6 +21,9 @@
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
/** @brief The number of 64-bit words needed to represent a canonical partition bit pattern. */
|
||||
#define BIT_PATTERN_WORDS (((ASTCENC_BLOCK_MAX_TEXELS * 2) + 63) / 64)
|
||||
|
||||
/**
|
||||
* @brief Generate a canonical representation of a partition pattern.
|
||||
*
|
||||
@@ -28,22 +31,22 @@
|
||||
* the remapped texel index. Remapping ensures that we only match on the partition pattern,
|
||||
* independent of the partition order generated by the hash.
|
||||
*
|
||||
* @param texel_count The number of texels in the block.
|
||||
* @param partition_of_texel The partition assignments, in hash order.
|
||||
* @param[out] bit_pattern The output bit pattern representation.
|
||||
* @param texel_count The number of texels in the block.
|
||||
* @param partition_of_texel The partition assignments, in hash order.
|
||||
* @param[out] bit_pattern The output bit pattern representation.
|
||||
*/
|
||||
static void generate_canonical_partitioning(
|
||||
unsigned int texel_count,
|
||||
const uint8_t* partition_of_texel,
|
||||
uint64_t bit_pattern[7]
|
||||
uint64_t bit_pattern[BIT_PATTERN_WORDS]
|
||||
) {
|
||||
// Clear the pattern
|
||||
for (unsigned int i = 0; i < 7; i++)
|
||||
for (unsigned int i = 0; i < BIT_PATTERN_WORDS; i++)
|
||||
{
|
||||
bit_pattern[i] = 0;
|
||||
}
|
||||
|
||||
// Store a mapping to reorder the raw partitions so that the the partitions are ordered such
|
||||
// Store a mapping to reorder the raw partitions so that the partitions are ordered such
|
||||
// that the lowest texel index in partition N is smaller than the lowest texel index in
|
||||
// partition N + 1.
|
||||
int mapped_index[BLOCK_MAX_PARTITIONS];
|
||||
@@ -76,19 +79,35 @@ static void generate_canonical_partitioning(
|
||||
* @return @c true if the patterns are the same, @c false otherwise.
|
||||
*/
|
||||
static bool compare_canonical_partitionings(
|
||||
const uint64_t part1[7],
|
||||
const uint64_t part2[7]
|
||||
const uint64_t part1[BIT_PATTERN_WORDS],
|
||||
const uint64_t part2[BIT_PATTERN_WORDS]
|
||||
) {
|
||||
return (part1[0] == part2[0]) && (part1[1] == part2[1]) &&
|
||||
(part1[2] == part2[2]) && (part1[3] == part2[3]) &&
|
||||
(part1[4] == part2[4]) && (part1[5] == part2[5]) &&
|
||||
(part1[6] == part2[6]);
|
||||
return (part1[0] == part2[0])
|
||||
#if BIT_PATTERN_WORDS > 1
|
||||
&& (part1[1] == part2[1])
|
||||
#endif
|
||||
#if BIT_PATTERN_WORDS > 2
|
||||
&& (part1[2] == part2[2])
|
||||
#endif
|
||||
#if BIT_PATTERN_WORDS > 3
|
||||
&& (part1[3] == part2[3])
|
||||
#endif
|
||||
#if BIT_PATTERN_WORDS > 4
|
||||
&& (part1[4] == part2[4])
|
||||
#endif
|
||||
#if BIT_PATTERN_WORDS > 5
|
||||
&& (part1[5] == part2[5])
|
||||
#endif
|
||||
#if BIT_PATTERN_WORDS > 6
|
||||
&& (part1[6] == part2[6])
|
||||
#endif
|
||||
;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Hash function used for procedural partition assignment.
|
||||
*
|
||||
* @param inp The hash seed.
|
||||
* @param inp The hash seed.
|
||||
*
|
||||
* @return The hashed value.
|
||||
*/
|
||||
@@ -116,7 +135,7 @@ static uint32_t hash52(
|
||||
* @param y The texel Y coordinate in the block.
|
||||
* @param z The texel Z coordinate in the block.
|
||||
* @param partition_count The total partition count of this encoding.
|
||||
* @param small_block @c true if the blockhas fewer than 32 texels.
|
||||
* @param small_block @c true if the block has fewer than 32 texels.
|
||||
*
|
||||
* @return The assigned partition index for this texel.
|
||||
*/
|
||||
@@ -316,25 +335,21 @@ static bool generate_one_partition_info_entry(
|
||||
}
|
||||
|
||||
// Populate the partition index
|
||||
pi.partition_index = partition_index;
|
||||
pi.partition_index = static_cast<uint16_t>(partition_index);
|
||||
|
||||
// Populate the coverage bitmaps for 2/3/4 partitions
|
||||
uint64_t* bitmaps { nullptr };
|
||||
uint8_t* valids { nullptr };
|
||||
if (partition_count == 2)
|
||||
{
|
||||
bitmaps = bsd.coverage_bitmaps_2[partition_remap_index];
|
||||
valids = bsd.partitioning_valid_2;
|
||||
}
|
||||
else if (partition_count == 3)
|
||||
{
|
||||
bitmaps = bsd.coverage_bitmaps_3[partition_remap_index];
|
||||
valids = bsd.partitioning_valid_3;
|
||||
}
|
||||
else if (partition_count == 4)
|
||||
{
|
||||
bitmaps = bsd.coverage_bitmaps_4[partition_remap_index];
|
||||
valids = bsd.partitioning_valid_4;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
|
||||
@@ -347,9 +362,7 @@ static bool generate_one_partition_info_entry(
|
||||
|
||||
if (bitmaps)
|
||||
{
|
||||
// Populate the bitmap validity mask
|
||||
valids[partition_remap_index] = valid ? 0 : 255;
|
||||
|
||||
// Populate the partition coverage bitmap
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
bitmaps[i] = 0ULL;
|
||||
@@ -374,12 +387,6 @@ static void build_partition_table_for_one_partition_count(
|
||||
partition_info* ptab,
|
||||
uint64_t* canonical_patterns
|
||||
) {
|
||||
uint8_t* partitioning_valid[3] {
|
||||
bsd.partitioning_valid_2,
|
||||
bsd.partitioning_valid_3,
|
||||
bsd.partitioning_valid_4
|
||||
};
|
||||
|
||||
unsigned int next_index = 0;
|
||||
bsd.partitioning_count_selected[partition_count - 1] = 0;
|
||||
bsd.partitioning_count_all[partition_count - 1] = 0;
|
||||
@@ -397,7 +404,7 @@ static void build_partition_table_for_one_partition_count(
|
||||
|
||||
// Tracker for things we built in the first iteration
|
||||
uint8_t build[BLOCK_MAX_PARTITIONINGS] { 0 };
|
||||
for (unsigned int x = 0; x < max_iter; x++)
|
||||
for (unsigned int x = 0; x < max_iter; x++)
|
||||
{
|
||||
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++)
|
||||
{
|
||||
@@ -413,11 +420,11 @@ static void build_partition_table_for_one_partition_count(
|
||||
continue;
|
||||
}
|
||||
|
||||
generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * 7);
|
||||
generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * BIT_PATTERN_WORDS);
|
||||
bool keep_canonical = true;
|
||||
for (unsigned int j = 0; j < next_index; j++)
|
||||
{
|
||||
bool match = compare_canonical_partitionings(canonical_patterns + 7 * next_index, canonical_patterns + 7 * j);
|
||||
bool match = compare_canonical_partitionings(canonical_patterns + next_index * BIT_PATTERN_WORDS, canonical_patterns + j * BIT_PATTERN_WORDS);
|
||||
if (match)
|
||||
{
|
||||
keep_canonical = false;
|
||||
@@ -429,7 +436,7 @@ static void build_partition_table_for_one_partition_count(
|
||||
{
|
||||
if (x == 0)
|
||||
{
|
||||
bsd.partitioning_packed_index[partition_count - 2][i] = next_index;
|
||||
bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
|
||||
bsd.partitioning_count_selected[partition_count - 1]++;
|
||||
bsd.partitioning_count_all[partition_count - 1]++;
|
||||
build[i] = 1;
|
||||
@@ -440,9 +447,8 @@ static void build_partition_table_for_one_partition_count(
|
||||
{
|
||||
if (x == 1)
|
||||
{
|
||||
bsd.partitioning_packed_index[partition_count - 2][i] = next_index;
|
||||
bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
|
||||
bsd.partitioning_count_all[partition_count - 1]++;
|
||||
partitioning_valid[partition_count - 2][next_index] = 255;
|
||||
next_index++;
|
||||
}
|
||||
}
|
||||
@@ -465,7 +471,8 @@ void init_partition_tables(
|
||||
bsd.partitioning_count_selected[0] = 1;
|
||||
bsd.partitioning_count_all[0] = 1;
|
||||
|
||||
uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * 7];
|
||||
uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * BIT_PATTERN_WORDS];
|
||||
|
||||
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 2, par_tab2, canonical_patterns);
|
||||
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 3, par_tab3, canonical_patterns);
|
||||
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 4, par_tab4, canonical_patterns);
|
||||
|
||||
@@ -289,25 +289,13 @@ static void compute_encoding_choice_errors(
|
||||
vmask4 endpt_can_offset = endpt_diff < vfloat4(0.12f * 65535.0f);
|
||||
bool can_offset_encode = (mask(endpt_can_offset) & 0x7) == 0x7;
|
||||
|
||||
// Determine if we can blue contract encode RGB lanes
|
||||
vfloat4 endpt_diff_bc(
|
||||
endpt0.lane<0>() + (endpt0.lane<0>() - endpt0.lane<2>()),
|
||||
endpt1.lane<0>() + (endpt1.lane<0>() - endpt1.lane<2>()),
|
||||
endpt0.lane<1>() + (endpt0.lane<1>() - endpt0.lane<2>()),
|
||||
endpt1.lane<1>() + (endpt1.lane<1>() - endpt1.lane<2>())
|
||||
);
|
||||
|
||||
vmask4 endpt_can_bc_lo = endpt_diff_bc > vfloat4(0.01f * 65535.0f);
|
||||
vmask4 endpt_can_bc_hi = endpt_diff_bc < vfloat4(0.99f * 65535.0f);
|
||||
bool can_blue_contract = (mask(endpt_can_bc_lo & endpt_can_bc_hi) & 0x7) == 0x7;
|
||||
|
||||
// Store out the settings
|
||||
eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f; // empirical
|
||||
eci[i].rgb_luma_error = (rgb_luma_error - uncorr_rgb_error) * 1.5f; // wild guess
|
||||
eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f; // empirical
|
||||
eci[i].alpha_drop_error = alpha_drop_error * 3.0f;
|
||||
eci[i].can_offset_encode = can_offset_encode;
|
||||
eci[i].can_blue_contract = can_blue_contract;
|
||||
eci[i].can_blue_contract = !blk.is_luminance();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -333,15 +321,11 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
|
||||
const endpoints& ep,
|
||||
vfloat4 error_weight,
|
||||
float best_error[21][4],
|
||||
int format_of_choice[21][4]
|
||||
uint8_t format_of_choice[21][4]
|
||||
) {
|
||||
int partition_size = pi.partition_texel_count[partition_index];
|
||||
|
||||
static const float baseline_quant_error[21] {
|
||||
(65536.0f * 65536.0f / 18.0f), // 2 values, 1 step
|
||||
(65536.0f * 65536.0f / 18.0f) / (2 * 2), // 3 values, 2 steps
|
||||
(65536.0f * 65536.0f / 18.0f) / (3 * 3), // 4 values, 3 steps
|
||||
(65536.0f * 65536.0f / 18.0f) / (4 * 4), // 5 values
|
||||
static const float baseline_quant_error[21 - QUANT_6] {
|
||||
(65536.0f * 65536.0f / 18.0f) / (5 * 5),
|
||||
(65536.0f * 65536.0f / 18.0f) / (7 * 7),
|
||||
(65536.0f * 65536.0f / 18.0f) / (9 * 9),
|
||||
@@ -529,7 +513,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
|
||||
best_error[i][1] = ERROR_CALC_DEFAULT;
|
||||
best_error[i][0] = ERROR_CALC_DEFAULT;
|
||||
|
||||
format_of_choice[i][3] = encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA;
|
||||
format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA);
|
||||
format_of_choice[i][2] = FMT_HDR_RGB;
|
||||
format_of_choice[i][1] = FMT_HDR_RGB_SCALE;
|
||||
format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
|
||||
@@ -540,7 +524,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
|
||||
// The base_quant_error should depend on the scale-factor that would be used during
|
||||
// actual encode of the color value
|
||||
|
||||
float base_quant_error = baseline_quant_error[i] * static_cast<float>(partition_size);
|
||||
float base_quant_error = baseline_quant_error[i - QUANT_6] * static_cast<float>(partition_size);
|
||||
float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f;
|
||||
float alpha_quantization_error = error_weight.lane<3>() * base_quant_error * 2.0f;
|
||||
float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error;
|
||||
@@ -549,7 +533,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
|
||||
|
||||
float full_hdr_rgba_error = rgba_quantization_error + rgb_range_error + alpha_range_error;
|
||||
best_error[i][3] = full_hdr_rgba_error;
|
||||
format_of_choice[i][3] = encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA;
|
||||
format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA);
|
||||
|
||||
// For 6 integers, we have one HDR-RGB encoding
|
||||
float full_hdr_rgb_error = (rgb_quantization_error * mode11mult) + rgb_range_error + eci.alpha_drop_error;
|
||||
@@ -603,7 +587,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
|
||||
error_scale_oe_rgb = 1.0f;
|
||||
}
|
||||
|
||||
float base_quant_error = baseline_quant_error[i];
|
||||
float base_quant_error = baseline_quant_error[i - QUANT_6];
|
||||
float quant_error_rgb = base_quant_error_rgb * base_quant_error;
|
||||
float quant_error_rgba = base_quant_error_rgba * base_quant_error;
|
||||
|
||||
@@ -688,10 +672,10 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
|
||||
static float one_partition_find_best_combination_for_bitcount(
|
||||
QualityProfile privateProfile,
|
||||
const float best_combined_error[21][4],
|
||||
const int best_combined_format[21][4],
|
||||
const uint8_t best_combined_format[21][4],
|
||||
int bits_available,
|
||||
quant_method& best_quant_level,
|
||||
int& best_format
|
||||
uint8_t& best_quant_level,
|
||||
uint8_t& best_format
|
||||
) {
|
||||
int best_integer_count = 0;
|
||||
float best_integer_count_error = ERROR_CALC_DEFAULT;
|
||||
@@ -721,7 +705,7 @@ static float one_partition_find_best_combination_for_bitcount(
|
||||
|
||||
int ql = quant_mode_table[best_integer_count + 1][bits_available];
|
||||
|
||||
best_quant_level = static_cast<quant_method>(ql);
|
||||
best_quant_level = static_cast<uint8_t>(ql);
|
||||
if (privateProfile == HIGH_SPEED_PROFILE) // keep openSource code style
|
||||
{
|
||||
best_format = FMT_RGBA;
|
||||
@@ -749,9 +733,9 @@ static float one_partition_find_best_combination_for_bitcount(
|
||||
*/
|
||||
static void two_partitions_find_best_combination_for_every_quantization_and_integer_count(
|
||||
const float best_error[2][21][4], // indexed by (partition, quant-level, integer-pair-count-minus-1)
|
||||
const int best_format[2][21][4],
|
||||
const uint8_t best_format[2][21][4],
|
||||
float best_combined_error[21][7], // indexed by (quant-level, integer-pair-count-minus-2)
|
||||
int best_combined_format[21][7][2]
|
||||
uint8_t best_combined_format[21][7][2]
|
||||
) {
|
||||
for (int i = QUANT_2; i <= QUANT_256; i++)
|
||||
{
|
||||
@@ -801,11 +785,11 @@ static void two_partitions_find_best_combination_for_every_quantization_and_inte
|
||||
*/
|
||||
static float two_partitions_find_best_combination_for_bitcount(
|
||||
float best_combined_error[21][7],
|
||||
int best_combined_format[21][7][2],
|
||||
uint8_t best_combined_format[21][7][2],
|
||||
int bits_available,
|
||||
quant_method& best_quant_level,
|
||||
quant_method& best_quant_level_mod,
|
||||
int* best_formats
|
||||
uint8_t& best_quant_level,
|
||||
uint8_t& best_quant_level_mod,
|
||||
uint8_t* best_formats
|
||||
) {
|
||||
int best_integer_count = 0;
|
||||
float best_integer_count_error = ERROR_CALC_DEFAULT;
|
||||
@@ -832,8 +816,8 @@ static float two_partitions_find_best_combination_for_bitcount(
|
||||
int ql = quant_mode_table[best_integer_count][bits_available];
|
||||
int ql_mod = quant_mode_table[best_integer_count][bits_available + 2];
|
||||
|
||||
best_quant_level = static_cast<quant_method>(ql);
|
||||
best_quant_level_mod = static_cast<quant_method>(ql_mod);
|
||||
best_quant_level = static_cast<uint8_t>(ql);
|
||||
best_quant_level_mod = static_cast<uint8_t>(ql_mod);
|
||||
|
||||
if (ql >= QUANT_6)
|
||||
{
|
||||
@@ -863,9 +847,9 @@ static float two_partitions_find_best_combination_for_bitcount(
|
||||
*/
|
||||
static void three_partitions_find_best_combination_for_every_quantization_and_integer_count(
|
||||
const float best_error[3][21][4], // indexed by (partition, quant-level, integer-count)
|
||||
const int best_format[3][21][4],
|
||||
const uint8_t best_format[3][21][4],
|
||||
float best_combined_error[21][10],
|
||||
int best_combined_format[21][10][3]
|
||||
uint8_t best_combined_format[21][10][3]
|
||||
) {
|
||||
for (int i = QUANT_2; i <= QUANT_256; i++)
|
||||
{
|
||||
@@ -926,11 +910,11 @@ static void three_partitions_find_best_combination_for_every_quantization_and_in
|
||||
*/
|
||||
static float three_partitions_find_best_combination_for_bitcount(
|
||||
const float best_combined_error[21][10],
|
||||
const int best_combined_format[21][10][3],
|
||||
const uint8_t best_combined_format[21][10][3],
|
||||
int bits_available,
|
||||
quant_method& best_quant_level,
|
||||
quant_method& best_quant_level_mod,
|
||||
int* best_formats
|
||||
uint8_t& best_quant_level,
|
||||
uint8_t& best_quant_level_mod,
|
||||
uint8_t* best_formats
|
||||
) {
|
||||
int best_integer_count = 0;
|
||||
float best_integer_count_error = ERROR_CALC_DEFAULT;
|
||||
@@ -957,8 +941,8 @@ static float three_partitions_find_best_combination_for_bitcount(
|
||||
int ql = quant_mode_table[best_integer_count][bits_available];
|
||||
int ql_mod = quant_mode_table[best_integer_count][bits_available + 5];
|
||||
|
||||
best_quant_level = static_cast<quant_method>(ql);
|
||||
best_quant_level_mod = static_cast<quant_method>(ql_mod);
|
||||
best_quant_level = static_cast<uint8_t>(ql);
|
||||
best_quant_level_mod = static_cast<uint8_t>(ql_mod);
|
||||
|
||||
if (ql >= QUANT_6)
|
||||
{
|
||||
@@ -988,9 +972,9 @@ static float three_partitions_find_best_combination_for_bitcount(
|
||||
*/
|
||||
static void four_partitions_find_best_combination_for_every_quantization_and_integer_count(
|
||||
const float best_error[4][21][4], // indexed by (partition, quant-level, integer-count)
|
||||
const int best_format[4][21][4],
|
||||
const uint8_t best_format[4][21][4],
|
||||
float best_combined_error[21][13],
|
||||
int best_combined_format[21][13][4]
|
||||
uint8_t best_combined_format[21][13][4]
|
||||
) {
|
||||
for (int i = QUANT_2; i <= QUANT_256; i++)
|
||||
{
|
||||
@@ -1062,11 +1046,11 @@ static void four_partitions_find_best_combination_for_every_quantization_and_int
|
||||
*/
|
||||
static float four_partitions_find_best_combination_for_bitcount(
|
||||
const float best_combined_error[21][13],
|
||||
const int best_combined_format[21][13][4],
|
||||
const uint8_t best_combined_format[21][13][4],
|
||||
int bits_available,
|
||||
quant_method& best_quant_level,
|
||||
quant_method& best_quant_level_mod,
|
||||
int* best_formats
|
||||
uint8_t& best_quant_level,
|
||||
uint8_t& best_quant_level_mod,
|
||||
uint8_t* best_formats
|
||||
) {
|
||||
int best_integer_count = 0;
|
||||
float best_integer_count_error = ERROR_CALC_DEFAULT;
|
||||
@@ -1093,8 +1077,8 @@ static float four_partitions_find_best_combination_for_bitcount(
|
||||
int ql = quant_mode_table[best_integer_count][bits_available];
|
||||
int ql_mod = quant_mode_table[best_integer_count][bits_available + 8];
|
||||
|
||||
best_quant_level = static_cast<quant_method>(ql);
|
||||
best_quant_level_mod = static_cast<quant_method>(ql_mod);
|
||||
best_quant_level = static_cast<uint8_t>(ql);
|
||||
best_quant_level_mod = static_cast<uint8_t>(ql_mod);
|
||||
|
||||
if (ql >= QUANT_6)
|
||||
{
|
||||
@@ -1121,13 +1105,13 @@ unsigned int compute_ideal_endpoint_formats(
|
||||
const image_block& blk,
|
||||
const endpoints& ep,
|
||||
// bitcounts and errors computed for the various quantization methods
|
||||
const int* qwt_bitcounts,
|
||||
const int8_t* qwt_bitcounts,
|
||||
const float* qwt_errors,
|
||||
unsigned int tune_candidate_limit,
|
||||
unsigned int start_block_mode,
|
||||
unsigned int end_block_mode,
|
||||
// output data
|
||||
int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
|
||||
uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
|
||||
int block_mode[TUNE_MAX_TRIAL_CANDIDATES],
|
||||
quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],
|
||||
quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],
|
||||
@@ -1137,8 +1121,8 @@ unsigned int compute_ideal_endpoint_formats(
|
||||
|
||||
promise(partition_count > 0);
|
||||
|
||||
int encode_hdr_rgb = blk.rgb_lns[0];
|
||||
int encode_hdr_alpha = blk.alpha_lns[0];
|
||||
bool encode_hdr_rgb = static_cast<bool>(blk.rgb_lns[0]);
|
||||
bool encode_hdr_alpha = static_cast<bool>(blk.alpha_lns[0]);
|
||||
|
||||
// Compute the errors that result from various encoding choices (such as using luminance instead
|
||||
// of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on)
|
||||
@@ -1146,7 +1130,7 @@ unsigned int compute_ideal_endpoint_formats(
|
||||
compute_encoding_choice_errors(blk, pi, ep, eci);
|
||||
|
||||
float best_error[BLOCK_MAX_PARTITIONS][21][4];
|
||||
int format_of_choice[BLOCK_MAX_PARTITIONS][21][4];
|
||||
uint8_t format_of_choice[BLOCK_MAX_PARTITIONS][21][4];
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
compute_color_error_for_every_integer_count_and_quant_level(
|
||||
@@ -1156,28 +1140,24 @@ unsigned int compute_ideal_endpoint_formats(
|
||||
}
|
||||
|
||||
float* errors_of_best_combination = tmpbuf.errors_of_best_combination;
|
||||
quant_method* best_quant_levels = tmpbuf.best_quant_levels;
|
||||
quant_method* best_quant_levels_mod = tmpbuf.best_quant_levels_mod;
|
||||
int (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats;
|
||||
uint8_t* best_quant_levels = tmpbuf.best_quant_levels;
|
||||
uint8_t* best_quant_levels_mod = tmpbuf.best_quant_levels_mod;
|
||||
uint8_t (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats;
|
||||
|
||||
// Ensure that the "overstep" of the last iteration in the vectorized loop will contain data
|
||||
// that will never be picked as best candidate
|
||||
const unsigned int packed_end_block_mode = round_up_to_simd_multiple_vla(end_block_mode);
|
||||
// Ensure that the first iteration understep contains data that will never be picked
|
||||
vfloat clear_error(ERROR_CALC_DEFAULT);
|
||||
vint clear_quant(0);
|
||||
|
||||
// TODO: Can we avoid this?
|
||||
for (unsigned int i = 0; i < start_block_mode; i++)
|
||||
{
|
||||
errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
|
||||
best_quant_levels[i] = QUANT_2;
|
||||
best_quant_levels_mod[i] = QUANT_2;
|
||||
}
|
||||
unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
|
||||
storea(clear_error, errors_of_best_combination + packed_start_block_mode);
|
||||
store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode);
|
||||
store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode);
|
||||
|
||||
for (unsigned int i = end_block_mode; i < packed_end_block_mode; i++)
|
||||
{
|
||||
errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
|
||||
best_quant_levels[i] = QUANT_2;
|
||||
best_quant_levels_mod[i] = QUANT_2;
|
||||
}
|
||||
// Ensure that last iteration overstep contains data that will never be picked
|
||||
unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1);
|
||||
storea(clear_error, errors_of_best_combination + packed_end_block_mode);
|
||||
store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode);
|
||||
store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode);
|
||||
|
||||
// Track a scalar best to avoid expensive search at least once ...
|
||||
float error_of_best_combination = ERROR_CALC_DEFAULT;
|
||||
@@ -1186,7 +1166,7 @@ unsigned int compute_ideal_endpoint_formats(
|
||||
// The block contains 1 partition
|
||||
if (partition_count == 1)
|
||||
{
|
||||
for (unsigned int i = start_block_mode; i < end_block_mode; ++i)
|
||||
for (unsigned int i = start_block_mode; i < end_block_mode; i++)
|
||||
{
|
||||
if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
|
||||
{
|
||||
@@ -1214,13 +1194,13 @@ unsigned int compute_ideal_endpoint_formats(
|
||||
else if (partition_count == 2)
|
||||
{
|
||||
float combined_best_error[21][7];
|
||||
int formats_of_choice[21][7][2];
|
||||
uint8_t formats_of_choice[21][7][2];
|
||||
|
||||
two_partitions_find_best_combination_for_every_quantization_and_integer_count(
|
||||
best_error, format_of_choice, combined_best_error, formats_of_choice);
|
||||
|
||||
assert(start_block_mode == 0);
|
||||
for (unsigned int i = 0; i < end_block_mode; ++i)
|
||||
for (unsigned int i = 0; i < end_block_mode; i++)
|
||||
{
|
||||
if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
|
||||
{
|
||||
@@ -1247,13 +1227,13 @@ unsigned int compute_ideal_endpoint_formats(
|
||||
else if (partition_count == 3)
|
||||
{
|
||||
float combined_best_error[21][10];
|
||||
int formats_of_choice[21][10][3];
|
||||
uint8_t formats_of_choice[21][10][3];
|
||||
|
||||
three_partitions_find_best_combination_for_every_quantization_and_integer_count(
|
||||
best_error, format_of_choice, combined_best_error, formats_of_choice);
|
||||
|
||||
assert(start_block_mode == 0);
|
||||
for (unsigned int i = 0; i < end_block_mode; ++i)
|
||||
for (unsigned int i = 0; i < end_block_mode; i++)
|
||||
{
|
||||
if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
|
||||
{
|
||||
@@ -1281,13 +1261,13 @@ unsigned int compute_ideal_endpoint_formats(
|
||||
{
|
||||
assert(partition_count == 4);
|
||||
float combined_best_error[21][13];
|
||||
int formats_of_choice[21][13][4];
|
||||
uint8_t formats_of_choice[21][13][4];
|
||||
|
||||
four_partitions_find_best_combination_for_every_quantization_and_integer_count(
|
||||
best_error, format_of_choice, combined_best_error, formats_of_choice);
|
||||
|
||||
assert(start_block_mode == 0);
|
||||
for (unsigned int i = 0; i < end_block_mode; ++i)
|
||||
for (unsigned int i = 0; i < end_block_mode; i++)
|
||||
{
|
||||
if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
|
||||
{
|
||||
@@ -1330,10 +1310,8 @@ unsigned int compute_ideal_endpoint_formats(
|
||||
vint lane_ids = vint::lane_id() + vint(start_block_mode);
|
||||
for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vfloat err = vfloat(&errors_of_best_combination[j]);
|
||||
vmask mask1 = err < vbest_ep_error;
|
||||
vmask mask2 = vint(reinterpret_cast<int*>(best_quant_levels + j)) > vint(4);
|
||||
vmask mask = mask1 & mask2;
|
||||
vfloat err = vfloat(errors_of_best_combination + j);
|
||||
vmask mask = err < vbest_ep_error;
|
||||
vbest_ep_error = select(vbest_ep_error, err, mask);
|
||||
vbest_error_index = select(vbest_error_index, lane_ids, mask);
|
||||
lane_ids += vint(ASTCENC_SIMD_WIDTH);
|
||||
@@ -1368,8 +1346,8 @@ unsigned int compute_ideal_endpoint_formats(
|
||||
|
||||
block_mode[i] = best_error_weights[i];
|
||||
|
||||
quant_level[i] = best_quant_levels[best_error_weights[i]];
|
||||
quant_level_mod[i] = best_quant_levels_mod[best_error_weights[i]];
|
||||
quant_level[i] = static_cast<quant_method>(best_quant_levels[best_error_weights[i]]);
|
||||
quant_level_mod[i] = static_cast<quant_method>(best_quant_levels_mod[best_error_weights[i]]);
|
||||
|
||||
assert(quant_level[i] >= QUANT_6 && quant_level[i] <= QUANT_256);
|
||||
assert(quant_level_mod[i] >= QUANT_6 && quant_level_mod[i] <= QUANT_256);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -23,6 +23,50 @@
|
||||
|
||||
#include <cassert>
|
||||
|
||||
/**
|
||||
* @brief Reverse bits in a byte.
|
||||
*
|
||||
* @param p The value to reverse.
|
||||
*
|
||||
* @return The reversed result.
|
||||
*/
|
||||
static inline int bitrev8(int p)
|
||||
{
|
||||
p = ((p & 0x0F) << 4) | ((p >> 4) & 0x0F);
|
||||
p = ((p & 0x33) << 2) | ((p >> 2) & 0x33);
|
||||
p = ((p & 0x55) << 1) | ((p >> 1) & 0x55);
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Read up to 8 bits at an arbitrary bit offset.
|
||||
*
|
||||
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
|
||||
* span two separate bytes in memory.
|
||||
*
|
||||
* @param bitcount The number of bits to read.
|
||||
* @param bitoffset The bit offset to read from, between 0 and 7.
|
||||
* @param[in,out] ptr The data pointer to read from.
|
||||
*
|
||||
* @return The read value.
|
||||
*/
|
||||
static inline int read_bits(
|
||||
int bitcount,
|
||||
int bitoffset,
|
||||
const uint8_t* ptr
|
||||
) {
|
||||
int mask = (1 << bitcount) - 1;
|
||||
ptr += bitoffset >> 3;
|
||||
bitoffset &= 7;
|
||||
int value = ptr[0] | (ptr[1] << 8);
|
||||
value >>= bitoffset;
|
||||
value &= mask;
|
||||
return value;
|
||||
}
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
|
||||
/**
|
||||
* @brief Write up to 8 bits at an arbitrary bit offset.
|
||||
*
|
||||
@@ -54,74 +98,47 @@ static inline void write_bits(
|
||||
ptr[1] |= value >> 8;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Read up to 8 bits at an arbitrary bit offset.
|
||||
*
|
||||
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
|
||||
* span two separate bytes in memory.
|
||||
*
|
||||
* @param bitcount The number of bits to read.
|
||||
* @param bitoffset The bit offset to read from, between 0 and 7.
|
||||
* @param[in,out] ptr The data pointer to read from.
|
||||
*
|
||||
* @return The read value.
|
||||
*/
|
||||
static inline int read_bits(
|
||||
int bitcount,
|
||||
int bitoffset,
|
||||
const uint8_t* ptr
|
||||
) {
|
||||
int mask = (1 << bitcount) - 1;
|
||||
ptr += bitoffset >> 3;
|
||||
bitoffset &= 7;
|
||||
int value = ptr[0] | (ptr[1] << 8);
|
||||
value >>= bitoffset;
|
||||
value &= mask;
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Reverse bits in a byte.
|
||||
*
|
||||
* @param p The value to reverse.
|
||||
*
|
||||
* @return The reversed result.
|
||||
*/
|
||||
static inline int bitrev8(int p)
|
||||
{
|
||||
p = ((p & 0x0F) << 4) | ((p >> 4) & 0x0F);
|
||||
p = ((p & 0x33) << 2) | ((p >> 2) & 0x33);
|
||||
p = ((p & 0x55) << 1) | ((p >> 1) & 0x55);
|
||||
return p;
|
||||
}
|
||||
|
||||
static const int HIGH_SPEED_PROFILE_COLOR_BYTES = 8;
|
||||
static const int HIGH_SPEED_PROFILE_WEIGHT_BYTES = 16;
|
||||
/* See header for documentation. */
|
||||
void symbolic_to_physical(
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
physical_compressed_block& pcb
|
||||
uint8_t pcb[16]
|
||||
) {
|
||||
assert(scb.block_type != SYM_BTYPE_ERROR);
|
||||
const auto& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
int weight_count = di.weight_count;
|
||||
quant_method weight_quant_method = bm.get_weight_quant_mode();
|
||||
float weight_quant_levels = static_cast<float>(get_quant_level(weight_quant_method));
|
||||
const auto& qat = quant_and_xfer_tables[weight_quant_method];
|
||||
if (scb.privateProfile == HIGH_SPEED_PROFILE)
|
||||
{
|
||||
uint8_t weights[64];
|
||||
for (int i = 0; i < weight_count; i++)
|
||||
{
|
||||
float uqw = static_cast<float>(scb.weights[i]);
|
||||
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
|
||||
int qwi = static_cast<int>(qw + 0.5f);
|
||||
weights[i] = qat.scramble_map[qwi];
|
||||
}
|
||||
uint8_t weightbuf[HIGH_SPEED_PROFILE_WEIGHT_BYTES] = {0};
|
||||
encode_ise(QUANT_6, HIGH_SPEED_PROFILE_WEIGHT_BYTES, scb.weights, weightbuf, 0);
|
||||
encode_ise(QUANT_6, HIGH_SPEED_PROFILE_WEIGHT_BYTES, weights, weightbuf, 0);
|
||||
for (int i = 0; i < HIGH_SPEED_PROFILE_WEIGHT_BYTES; i++)
|
||||
{
|
||||
pcb.data[i] = static_cast<uint8_t>(bitrev8(weightbuf[HIGH_SPEED_PROFILE_WEIGHT_BYTES - 1 - i]));
|
||||
pcb[i] = static_cast<uint8_t>(bitrev8(weightbuf[HIGH_SPEED_PROFILE_WEIGHT_BYTES - 1 - i]));
|
||||
}
|
||||
pcb.data[0] = 0x43; // the first byte of every block stream is 0x43 for HIGH_SPEED_PROFILE
|
||||
pcb.data[1] = 0x80; // the second byte of every block stream is 0x80 for HIGH_SPEED_PROFILE
|
||||
pcb.data[2] = 0x01; // the third (2 idx) byte of every block stream is 0x01 for HIGH_SPEED_PROFILE
|
||||
pcb[0] = 0x43; // the first byte of every block stream is 0x43 for HIGH_SPEED_PROFILE
|
||||
pcb[1] = 0x80; // the second byte of every block stream is 0x80 for HIGH_SPEED_PROFILE
|
||||
pcb[2] = 0x01; // the third (2 idx) byte of every block stream is 0x01 for HIGH_SPEED_PROFILE
|
||||
uint8_t values_to_encode[HIGH_SPEED_PROFILE_COLOR_BYTES];
|
||||
for (int j = 0; j < HIGH_SPEED_PROFILE_COLOR_BYTES; j++)
|
||||
{
|
||||
values_to_encode[j] = scb.color_values[0][j];
|
||||
}
|
||||
encode_ise(scb.get_color_quant_mode(), HIGH_SPEED_PROFILE_COLOR_BYTES,
|
||||
values_to_encode, pcb.data, 17); // the color is starting from 17th bit for HIGH_SPEED_PROFILE
|
||||
values_to_encode, pcb, 17); // the color is starting from 17th bit for HIGH_SPEED_PROFILE
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -132,13 +149,13 @@ void symbolic_to_physical(
|
||||
static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
|
||||
for (unsigned int i = 0; i < 8; i++)
|
||||
{
|
||||
pcb.data[i] = cbytes[i];
|
||||
pcb[i] = cbytes[i];
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
|
||||
{
|
||||
pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
|
||||
pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
|
||||
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
|
||||
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
|
||||
}
|
||||
|
||||
return;
|
||||
@@ -151,13 +168,13 @@ void symbolic_to_physical(
|
||||
static const uint8_t cbytes[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
|
||||
for (unsigned int i = 0; i < 8; i++)
|
||||
{
|
||||
pcb.data[i] = cbytes[i];
|
||||
pcb[i] = cbytes[i];
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
|
||||
{
|
||||
pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
|
||||
pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
|
||||
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
|
||||
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
|
||||
}
|
||||
|
||||
return;
|
||||
@@ -169,50 +186,60 @@ void symbolic_to_physical(
|
||||
// They are encoded as an ordinary integer-sequence, then bit-reversed
|
||||
uint8_t weightbuf[16] { 0 };
|
||||
|
||||
const auto& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
int weight_count = di.weight_count;
|
||||
quant_method weight_quant_method = bm.get_weight_quant_mode();
|
||||
int is_dual_plane = bm.is_dual_plane;
|
||||
|
||||
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
|
||||
|
||||
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
|
||||
|
||||
uint8_t weights[64];
|
||||
if (is_dual_plane)
|
||||
{
|
||||
uint8_t weights[64];
|
||||
for (int i = 0; i < weight_count; i++)
|
||||
{
|
||||
weights[2 * i] = scb.weights[i];
|
||||
weights[2 * i + 1] = scb.weights[i + WEIGHTS_PLANE2_OFFSET];
|
||||
float uqw = static_cast<float>(scb.weights[i]);
|
||||
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
|
||||
int qwi = static_cast<int>(qw + 0.5f);
|
||||
weights[2 * i] = qat.scramble_map[qwi];
|
||||
|
||||
uqw = static_cast<float>(scb.weights[i + WEIGHTS_PLANE2_OFFSET]);
|
||||
qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
|
||||
qwi = static_cast<int>(qw + 0.5f);
|
||||
weights[2 * i + 1] = qat.scramble_map[qwi];
|
||||
}
|
||||
encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
encode_ise(weight_quant_method, weight_count, scb.weights, weightbuf, 0);
|
||||
for (int i = 0; i < weight_count; i++)
|
||||
{
|
||||
float uqw = static_cast<float>(scb.weights[i]);
|
||||
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
|
||||
int qwi = static_cast<int>(qw + 0.5f);
|
||||
weights[i] = qat.scramble_map[qwi];
|
||||
}
|
||||
}
|
||||
|
||||
encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0);
|
||||
|
||||
for (int i = 0; i < 16; i++)
|
||||
{
|
||||
pcb.data[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
|
||||
pcb[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
|
||||
}
|
||||
|
||||
write_bits(scb.block_mode, 11, 0, pcb.data);
|
||||
write_bits(partition_count - 1, 2, 11, pcb.data);
|
||||
write_bits(scb.block_mode, 11, 0, pcb);
|
||||
write_bits(partition_count - 1, 2, 11, pcb);
|
||||
|
||||
int below_weights_pos = 128 - bits_for_weights;
|
||||
|
||||
// Encode partition index and color endpoint types for blocks with 2+ partitions
|
||||
if (partition_count > 1)
|
||||
{
|
||||
write_bits(scb.partition_index, 6, 13, pcb.data);
|
||||
write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb.data);
|
||||
write_bits(scb.partition_index, 6, 13, pcb);
|
||||
write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb);
|
||||
|
||||
if (scb.color_formats_matched)
|
||||
{
|
||||
write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
|
||||
write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -251,44 +278,48 @@ void symbolic_to_physical(
|
||||
int encoded_type_highpart = encoded_type >> 6;
|
||||
int encoded_type_highpart_size = (3 * partition_count) - 4;
|
||||
int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
|
||||
write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
|
||||
write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb.data);
|
||||
write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb);
|
||||
write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb);
|
||||
below_weights_pos -= encoded_type_highpart_size;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
write_bits(scb.color_formats[0], 4, 13, pcb.data);
|
||||
write_bits(scb.color_formats[0], 4, 13, pcb);
|
||||
}
|
||||
|
||||
// In dual-plane mode, encode the color component of the second plane of weights
|
||||
if (is_dual_plane)
|
||||
{
|
||||
write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb.data);
|
||||
write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb);
|
||||
}
|
||||
|
||||
// Encode the color components
|
||||
uint8_t values_to_encode[32];
|
||||
int valuecount_to_encode = 0;
|
||||
|
||||
const uint8_t* pack_table = color_uquant_to_scrambled_pquant_tables[scb.quant_mode - QUANT_6];
|
||||
for (unsigned int i = 0; i < scb.partition_count; i++)
|
||||
{
|
||||
int vals = 2 * (scb.color_formats[i] >> 2) + 2;
|
||||
assert(vals <= 8);
|
||||
for (int j = 0; j < vals; j++)
|
||||
{
|
||||
values_to_encode[j + valuecount_to_encode] = scb.color_values[i][j];
|
||||
values_to_encode[j + valuecount_to_encode] = pack_table[scb.color_values[i][j]];
|
||||
}
|
||||
valuecount_to_encode += vals;
|
||||
}
|
||||
|
||||
encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb.data,
|
||||
encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb,
|
||||
scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* See header for documentation. */
|
||||
void physical_to_symbolic(
|
||||
const block_size_descriptor& bsd,
|
||||
const physical_compressed_block& pcb,
|
||||
const uint8_t pcb[16],
|
||||
symbolic_compressed_block& scb
|
||||
) {
|
||||
uint8_t bswapped[16];
|
||||
@@ -296,7 +327,7 @@ void physical_to_symbolic(
|
||||
scb.block_type = SYM_BTYPE_NONCONST;
|
||||
|
||||
// Extract header fields
|
||||
int block_mode = read_bits(11, 0, pcb.data);
|
||||
int block_mode = read_bits(11, 0, pcb);
|
||||
if ((block_mode & 0x1FF) == 0x1FC)
|
||||
{
|
||||
// Constant color block
|
||||
@@ -314,24 +345,24 @@ void physical_to_symbolic(
|
||||
scb.partition_count = 0;
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
scb.constant_color[i] = pcb.data[2 * i + 8] | (pcb.data[2 * i + 9] << 8);
|
||||
scb.constant_color[i] = pcb[2 * i + 8] | (pcb[2 * i + 9] << 8);
|
||||
}
|
||||
|
||||
// Additionally, check that the void-extent
|
||||
if (bsd.zdim == 1)
|
||||
{
|
||||
// 2D void-extent
|
||||
int rsvbits = read_bits(2, 10, pcb.data);
|
||||
int rsvbits = read_bits(2, 10, pcb);
|
||||
if (rsvbits != 3)
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
int vx_low_s = read_bits(8, 12, pcb.data) | (read_bits(5, 12 + 8, pcb.data) << 8);
|
||||
int vx_high_s = read_bits(8, 25, pcb.data) | (read_bits(5, 25 + 8, pcb.data) << 8);
|
||||
int vx_low_t = read_bits(8, 38, pcb.data) | (read_bits(5, 38 + 8, pcb.data) << 8);
|
||||
int vx_high_t = read_bits(8, 51, pcb.data) | (read_bits(5, 51 + 8, pcb.data) << 8);
|
||||
int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
|
||||
int vx_high_s = read_bits(8, 25, pcb) | (read_bits(5, 25 + 8, pcb) << 8);
|
||||
int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
|
||||
int vx_high_t = read_bits(8, 51, pcb) | (read_bits(5, 51 + 8, pcb) << 8);
|
||||
|
||||
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
|
||||
|
||||
@@ -344,12 +375,12 @@ void physical_to_symbolic(
|
||||
else
|
||||
{
|
||||
// 3D void-extent
|
||||
int vx_low_s = read_bits(9, 10, pcb.data);
|
||||
int vx_high_s = read_bits(9, 19, pcb.data);
|
||||
int vx_low_t = read_bits(9, 28, pcb.data);
|
||||
int vx_high_t = read_bits(9, 37, pcb.data);
|
||||
int vx_low_p = read_bits(9, 46, pcb.data);
|
||||
int vx_high_p = read_bits(9, 55, pcb.data);
|
||||
int vx_low_s = read_bits(9, 10, pcb);
|
||||
int vx_high_s = read_bits(9, 19, pcb);
|
||||
int vx_low_t = read_bits(9, 28, pcb);
|
||||
int vx_high_t = read_bits(9, 37, pcb);
|
||||
int vx_low_p = read_bits(9, 46, pcb);
|
||||
int vx_high_p = read_bits(9, 55, pcb);
|
||||
|
||||
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
|
||||
|
||||
@@ -374,38 +405,47 @@ void physical_to_symbolic(
|
||||
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
int weight_count = di.weight_count;
|
||||
promise(weight_count > 0);
|
||||
|
||||
quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode);
|
||||
int is_dual_plane = bm.is_dual_plane;
|
||||
|
||||
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
|
||||
|
||||
int partition_count = read_bits(2, 11, pcb.data) + 1;
|
||||
int partition_count = read_bits(2, 11, pcb) + 1;
|
||||
promise(partition_count > 0);
|
||||
|
||||
scb.block_mode = static_cast<uint16_t>(block_mode);
|
||||
scb.partition_count = static_cast<uint8_t>(partition_count);
|
||||
|
||||
for (int i = 0; i < 16; i++)
|
||||
{
|
||||
bswapped[i] = static_cast<uint8_t>(bitrev8(pcb.data[15 - i]));
|
||||
bswapped[i] = static_cast<uint8_t>(bitrev8(pcb[15 - i]));
|
||||
}
|
||||
|
||||
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
|
||||
|
||||
int below_weights_pos = 128 - bits_for_weights;
|
||||
|
||||
uint8_t indices[64];
|
||||
const auto& qat = quant_and_xfer_tables[weight_quant_method];
|
||||
|
||||
decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0);
|
||||
|
||||
if (is_dual_plane)
|
||||
{
|
||||
uint8_t indices[64];
|
||||
decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0);
|
||||
for (int i = 0; i < weight_count; i++)
|
||||
{
|
||||
scb.weights[i] = indices[2 * i];
|
||||
scb.weights[i + WEIGHTS_PLANE2_OFFSET] = indices[2 * i + 1];
|
||||
scb.weights[i] = qat.unscramble_and_unquant_map[indices[2 * i]];
|
||||
scb.weights[i + WEIGHTS_PLANE2_OFFSET] = qat.unscramble_and_unquant_map[indices[2 * i + 1]];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
decode_ise(weight_quant_method, weight_count, bswapped, scb.weights, 0);
|
||||
for (int i = 0; i < weight_count; i++)
|
||||
{
|
||||
scb.weights[i] = qat.unscramble_and_unquant_map[indices[i]];
|
||||
}
|
||||
}
|
||||
|
||||
if (is_dual_plane && partition_count == 4)
|
||||
@@ -421,14 +461,15 @@ void physical_to_symbolic(
|
||||
int encoded_type_highpart_size = 0;
|
||||
if (partition_count == 1)
|
||||
{
|
||||
color_formats[0] = read_bits(4, 13, pcb.data);
|
||||
color_formats[0] = read_bits(4, 13, pcb);
|
||||
scb.partition_index = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
encoded_type_highpart_size = (3 * partition_count) - 4;
|
||||
below_weights_pos -= encoded_type_highpart_size;
|
||||
int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb.data) | (read_bits(encoded_type_highpart_size, below_weights_pos, pcb.data) << 6);
|
||||
int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb) |
|
||||
(read_bits(encoded_type_highpart_size, below_weights_pos, pcb) << 6);
|
||||
int baseclass = encoded_type & 0x3;
|
||||
if (baseclass == 0)
|
||||
{
|
||||
@@ -458,7 +499,8 @@ void physical_to_symbolic(
|
||||
bitpos += 2;
|
||||
}
|
||||
}
|
||||
scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb.data) | (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb.data) << 6));
|
||||
scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb) |
|
||||
(read_bits(PARTITION_INDEX_BITS - 6, 19, pcb) << 6));
|
||||
}
|
||||
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
@@ -502,24 +544,27 @@ void physical_to_symbolic(
|
||||
|
||||
// Unpack the integer color values and assign to endpoints
|
||||
scb.quant_mode = static_cast<quant_method>(color_quant_level);
|
||||
|
||||
uint8_t values_to_decode[32];
|
||||
decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb.data,
|
||||
decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb,
|
||||
values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
|
||||
|
||||
int valuecount_to_decode = 0;
|
||||
const uint8_t* unpack_table = color_scrambled_pquant_to_uquant_tables[scb.quant_mode - QUANT_6];
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
int vals = 2 * (color_formats[i] >> 2) + 2;
|
||||
for (int j = 0; j < vals; j++)
|
||||
{
|
||||
scb.color_values[i][j] = values_to_decode[j + valuecount_to_decode];
|
||||
scb.color_values[i][j] = unpack_table[values_to_decode[j + valuecount_to_decode]];
|
||||
}
|
||||
valuecount_to_decode += vals;
|
||||
}
|
||||
|
||||
// Fetch component for second-plane in the case of dual plane of weights.
|
||||
scb.plane2_component = -1;
|
||||
if (is_dual_plane)
|
||||
{
|
||||
scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb.data));
|
||||
scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2019-2021 Arm Limited
|
||||
// Copyright 2019-2022 Arm Limited
|
||||
// Copyright 2008 Jose Fonseca
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
@@ -26,7 +26,7 @@
|
||||
* with that is available at compile time. The current vector width is
|
||||
* accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
|
||||
*
|
||||
* Explicit scalar types are acessible via the vint1, vfloat1, vmask1 types.
|
||||
* Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
|
||||
* These are provided primarily for prototyping and algorithm debug of VLA
|
||||
* implementations.
|
||||
*
|
||||
@@ -60,10 +60,13 @@
|
||||
|
||||
#if !defined(__clang__) && defined(_MSC_VER)
|
||||
#define ASTCENC_SIMD_INLINE __forceinline
|
||||
#define ASTCENC_NO_INLINE
|
||||
#elif defined(__GNUC__) && !defined(__clang__)
|
||||
#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
|
||||
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
|
||||
#else
|
||||
#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
|
||||
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
|
||||
#endif
|
||||
|
||||
#if ASTCENC_AVX >= 2
|
||||
@@ -160,7 +163,7 @@
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
|
||||
{
|
||||
return count & ~(8 - 1);
|
||||
return count & static_cast<unsigned int>(~(8 - 1));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -172,7 +175,7 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int coun
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
|
||||
{
|
||||
return count & ~(4 - 1);
|
||||
return count & static_cast<unsigned int>(~(4 - 1));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -186,7 +189,7 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int coun
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
|
||||
{
|
||||
return count & ~(ASTCENC_SIMD_WIDTH - 1);
|
||||
return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -200,7 +203,7 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int co
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
|
||||
{
|
||||
int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
|
||||
unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
|
||||
return multiples * ASTCENC_SIMD_WIDTH;
|
||||
}
|
||||
|
||||
@@ -219,7 +222,7 @@ ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
|
||||
/**
|
||||
* @brief Return fast, but approximate, vector atan(x).
|
||||
*
|
||||
* Max error of this implementaiton is 0.004883.
|
||||
* Max error of this implementation is 0.004883.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
|
||||
{
|
||||
@@ -399,7 +402,7 @@ static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
|
||||
// the original integer value into a 2^N encoding we can recover easily.
|
||||
|
||||
// Convert to float without risk of rounding up by keeping only top 8 bits.
|
||||
// This trick is is guranteed to keep top 8 bits and clear the 9th.
|
||||
// This trick is is guaranteed to keep top 8 bits and clear the 9th.
|
||||
a = (~lsr<8>(a)) & a;
|
||||
a = float_as_int(int_to_float(a));
|
||||
|
||||
@@ -413,7 +416,7 @@ static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
|
||||
/**
|
||||
* @brief Return lanewise 2^a for each lane in @c a.
|
||||
*
|
||||
* Use of signed int mean that this is only valid for values in range [0, 31].
|
||||
* Use of signed int means that this is only valid for values in range [0, 31].
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
|
||||
{
|
||||
@@ -507,7 +510,7 @@ static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
|
||||
exp = (lsr<23>(ai) & 0xFF) - 126;
|
||||
|
||||
// Extract and unbias the mantissa
|
||||
vint4 manti = (ai & 0x807FFFFF) | 0x3F000000;
|
||||
vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000;
|
||||
return int_as_float(manti);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2019-2021 Arm Limited
|
||||
// Copyright 2019-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -36,6 +36,9 @@
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
// Define convenience intrinsics that are missing on older compilers
|
||||
#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)
|
||||
|
||||
// ============================================================================
|
||||
// vfloat8 data type
|
||||
// ============================================================================
|
||||
@@ -86,7 +89,8 @@ struct vfloat8
|
||||
/**
|
||||
* @brief Construct from an existing SIMD register.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a) {
|
||||
ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
|
||||
{
|
||||
m = a;
|
||||
}
|
||||
|
||||
@@ -237,6 +241,14 @@ struct vint8
|
||||
return vint8(_mm256_broadcastd_epi32(a));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from unaligned memory.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
|
||||
{
|
||||
return vint8(_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(p)));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from 32B aligned memory.
|
||||
*/
|
||||
@@ -340,9 +352,9 @@ ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
|
||||
*
|
||||
* bit0 = lane 0
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE unsigned mask(vmask8 a)
|
||||
ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
|
||||
{
|
||||
return _mm256_movemask_ps(a.m);
|
||||
return static_cast<unsigned int>(_mm256_movemask_ps(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -354,7 +366,7 @@ ASTCENC_SIMD_INLINE bool any(vmask8 a)
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief True if any lanes are enabled, false otherwise.
|
||||
* @brief True if all lanes are enabled, false otherwise.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE bool all(vmask8 a)
|
||||
{
|
||||
@@ -461,6 +473,14 @@ ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
|
||||
return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Logical shift left.
|
||||
*/
|
||||
template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
|
||||
{
|
||||
return vint8(_mm256_slli_epi32(a.m, s));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Arithmetic shift right.
|
||||
*/
|
||||
@@ -503,16 +523,13 @@ ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
|
||||
m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
|
||||
m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
|
||||
|
||||
// This is the most logical implementation, but the convenience intrinsic
|
||||
// is missing on older compilers (supported in g++ 9 and clang++ 9).
|
||||
//__m256i r = _mm256_set_m128i(m, m)
|
||||
__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(m), m, 1);
|
||||
__m256i r = astcenc_mm256_set_m128i(m, m);
|
||||
vint8 vmin(r);
|
||||
return vmin;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the horizontal minimum of a vector.
|
||||
* @brief Return the horizontal maximum of a vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
|
||||
{
|
||||
@@ -521,10 +538,7 @@ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
|
||||
m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
|
||||
m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
|
||||
|
||||
// This is the most logical implementation, but the convenience intrinsic
|
||||
// is missing on older compilers (supported in g++ 9 and clang++ 9).
|
||||
//__m256i r = _mm256_set_m128i(m, m)
|
||||
__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(m), m, 1);
|
||||
__m256i r = astcenc_mm256_set_m128i(m, m);
|
||||
vint8 vmax(r);
|
||||
return vmax;
|
||||
}
|
||||
@@ -578,10 +592,7 @@ ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
|
||||
__m128i a1 = _mm256_extracti128_si256(a, 1);
|
||||
__m128i b = _mm_unpacklo_epi32(a0, a1);
|
||||
|
||||
// This is the most logical implementation, but the convenience intrinsic
|
||||
// is missing on older compilers (supported in g++ 9 and clang++ 9).
|
||||
//__m256i r = _mm256_set_m128i(b, b)
|
||||
__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(b), b, 1);
|
||||
__m256i r = astcenc_mm256_set_m128i(b, b);
|
||||
return vint8(r);
|
||||
}
|
||||
|
||||
@@ -731,6 +742,16 @@ ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
|
||||
return vfloat8(_mm256_min_ps(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the min vector of a vector and a scalar.
|
||||
*
|
||||
* If either lane value is NaN, @c b will be returned for that lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
|
||||
{
|
||||
return min(a, vfloat8(b));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the max vector of two vectors.
|
||||
*
|
||||
@@ -741,6 +762,16 @@ ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
|
||||
return vfloat8(_mm256_max_ps(a.m, b.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the max vector of a vector and a scalar.
|
||||
*
|
||||
* If either lane value is NaN, @c b will be returned for that lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
|
||||
{
|
||||
return max(a, vfloat8(b));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the clamped value between min and max.
|
||||
*
|
||||
@@ -805,13 +836,13 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
|
||||
{
|
||||
__m128 vlow = _mm256_castps256_ps128(a.m);
|
||||
__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
|
||||
vlow = _mm_min_ps(vlow, vhigh);
|
||||
vlow = _mm_min_ps(vlow, vhigh);
|
||||
|
||||
// First do an horizontal reduction.
|
||||
__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
__m128 mins = _mm_min_ps(vlow, shuf);
|
||||
shuf = _mm_movehl_ps(shuf, mins);
|
||||
mins = _mm_min_ss(mins, shuf);
|
||||
shuf = _mm_movehl_ps(shuf, mins);
|
||||
mins = _mm_min_ss(mins, shuf);
|
||||
|
||||
// This is the most logical implementation, but the convenience intrinsic
|
||||
// is missing on older compilers (supported in g++ 9 and clang++ 9).
|
||||
@@ -836,13 +867,13 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
|
||||
{
|
||||
__m128 vlow = _mm256_castps256_ps128(a.m);
|
||||
__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
|
||||
vhigh = _mm_max_ps(vlow, vhigh);
|
||||
vhigh = _mm_max_ps(vlow, vhigh);
|
||||
|
||||
// First do an horizontal reduction.
|
||||
__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
__m128 maxs = _mm_max_ps(vhigh, shuf);
|
||||
shuf = _mm_movehl_ps(shuf,maxs);
|
||||
maxs = _mm_max_ss(maxs, shuf);
|
||||
shuf = _mm_movehl_ps(shuf,maxs);
|
||||
maxs = _mm_max_ss(maxs, shuf);
|
||||
|
||||
// This is the most logical implementation, but the convenience intrinsic
|
||||
// is missing on older compilers (supported in g++ 9 and clang++ 9).
|
||||
@@ -972,6 +1003,16 @@ ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
|
||||
return vint8(_mm256_cvttps_epi32(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a integer value for a float vector, using round-to-nearest.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
|
||||
{
|
||||
a = a + vfloat8(0.5f);
|
||||
return vint8(_mm256_cvttps_epi32(a.m));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Return a float value for an integer vector.
|
||||
*/
|
||||
@@ -1004,23 +1045,154 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
|
||||
return vfloat8(_mm256_castsi256_ps(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
|
||||
{
|
||||
// AVX2 duplicates the table within each 128-bit lane
|
||||
__m128i t0n = t0.m;
|
||||
t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
|
||||
{
|
||||
// AVX2 duplicates the table within each 128-bit lane
|
||||
__m128i t0n = t0.m;
|
||||
t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
|
||||
|
||||
__m128i t1n = _mm_xor_si128(t0.m, t1.m);
|
||||
t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(
|
||||
vint4 t0, vint4 t1, vint4 t2, vint4 t3,
|
||||
vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
|
||||
{
|
||||
// AVX2 duplicates the table within each 128-bit lane
|
||||
__m128i t0n = t0.m;
|
||||
t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n));
|
||||
|
||||
__m128i t1n = _mm_xor_si128(t0.m, t1.m);
|
||||
t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n));
|
||||
|
||||
__m128i t2n = _mm_xor_si128(t1.m, t2.m);
|
||||
t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n));
|
||||
|
||||
__m128i t3n = _mm_xor_si128(t2.m, t3.m);
|
||||
t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
|
||||
{
|
||||
// Set index byte MSB to 1 for unused bytes so shuffle returns zero
|
||||
__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
|
||||
|
||||
__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
|
||||
return vint8(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
|
||||
{
|
||||
// Set index byte MSB to 1 for unused bytes so shuffle returns zero
|
||||
__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
|
||||
|
||||
__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
|
||||
idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
|
||||
|
||||
__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
|
||||
result = _mm256_xor_si256(result, result2);
|
||||
return vint8(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
|
||||
{
|
||||
// Set index byte MSB to 1 for unused bytes so shuffle returns zero
|
||||
__m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));
|
||||
|
||||
__m256i result = _mm256_shuffle_epi8(t0.m, idxx);
|
||||
idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
|
||||
|
||||
__m256i result2 = _mm256_shuffle_epi8(t1.m, idxx);
|
||||
result = _mm256_xor_si256(result, result2);
|
||||
idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
|
||||
|
||||
result2 = _mm256_shuffle_epi8(t2.m, idxx);
|
||||
result = _mm256_xor_si256(result, result2);
|
||||
idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));
|
||||
|
||||
result2 = _mm256_shuffle_epi8(t3.m, idxx);
|
||||
result = _mm256_xor_si256(result, result2);
|
||||
|
||||
return vint8(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a vector of interleaved RGBA data.
|
||||
*
|
||||
* Input vectors have the value stored in the bottom 8 bits of each lane,
|
||||
* with high bits set to zero.
|
||||
*
|
||||
* Output vector stores a single RGBA texel packed in each lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
|
||||
{
|
||||
return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector, skipping masked lanes.
|
||||
*
|
||||
* All masked lanes must be at the end of vector, after all non-masked lanes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
|
||||
{
|
||||
_mm256_maskstore_epi32(reinterpret_cast<int*>(base), _mm256_castps_si256(mask.m), data.m);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of ints.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vint8 a)
|
||||
{
|
||||
alignas(ASTCENC_VECALIGN) int v[8];
|
||||
alignas(32) int v[8];
|
||||
storea(a, v);
|
||||
printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
|
||||
v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of ints.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void printx(vint8 a)
|
||||
{
|
||||
alignas(32) int v[8];
|
||||
storea(a, v);
|
||||
printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
|
||||
v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of floats.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vfloat8 a)
|
||||
{
|
||||
alignas(ASTCENC_VECALIGN) float v[8];
|
||||
alignas(32) float v[8];
|
||||
storea(a, v);
|
||||
printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
|
||||
static_cast<double>(v[0]), static_cast<double>(v[1]),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2020-2021 Arm Limited
|
||||
// Copyright 2020-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -361,23 +361,51 @@ static inline int popcount(uint64_t v)
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Apply signed bit transfer.
|
||||
*
|
||||
* @param input0 The first encoded endpoint.
|
||||
* @param input1 The second encoded endpoint.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
|
||||
vint4& input0,
|
||||
vint4& input1
|
||||
) {
|
||||
input1 = lsr<1>(input1) | (input0 & 0x80);
|
||||
input0 = lsr<1>(input0) & 0x3F;
|
||||
|
||||
vmask4 mask = (input0 & 0x20) != vint4::zero();
|
||||
input0 = select(input0, input0 - 0x40, mask);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of ints.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vint4 a)
|
||||
{
|
||||
alignas(16) int v[4];
|
||||
ASTCENC_ALIGNAS int v[4];
|
||||
storea(a, v);
|
||||
printf("v4_i32:\n %8d %8d %8d %8d\n",
|
||||
v[0], v[1], v[2], v[3]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of ints.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void printx(vint4 a)
|
||||
{
|
||||
ASTCENC_ALIGNAS int v[4];
|
||||
storea(a, v);
|
||||
printf("v4_i32:\n %08x %08x %08x %08x\n",
|
||||
v[0], v[1], v[2], v[3]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of floats.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vfloat4 a)
|
||||
{
|
||||
alignas(16) float v[4];
|
||||
ASTCENC_ALIGNAS float v[4];
|
||||
storea(a, v);
|
||||
printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
|
||||
static_cast<double>(v[0]), static_cast<double>(v[1]),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2019-2022 Arm Limited
|
||||
// Copyright 2019-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -38,6 +38,7 @@
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
// ============================================================================
|
||||
// vfloat4 data type
|
||||
@@ -106,7 +107,7 @@ struct vfloat4
|
||||
*/
|
||||
template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
|
||||
{
|
||||
m = vld1q_lane_f32(&a, m, l);
|
||||
m = vsetq_lane_f32(a, m, l);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -122,7 +123,7 @@ struct vfloat4
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
|
||||
{
|
||||
return vfloat4(vdupq_n_f32(*p));
|
||||
return vfloat4(vld1q_dup_f32(p));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -202,9 +203,8 @@ struct vint4
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
|
||||
{
|
||||
uint32x2_t t8 {};
|
||||
// Cast is safe - NEON loads are allowed to be unaligned
|
||||
t8 = vld1_lane_u32((const uint32_t*)p, t8, 0);
|
||||
uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
|
||||
uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
|
||||
m = vreinterpretq_s32_u32(vmovl_u16(t16));
|
||||
}
|
||||
@@ -251,7 +251,7 @@ struct vint4
|
||||
*/
|
||||
template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
|
||||
{
|
||||
m = vld1q_lane_s32(&a, m, l);
|
||||
m = vsetq_lane_s32(a, m, l);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -270,6 +270,16 @@ struct vint4
|
||||
return vint4(*p);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from unaligned memory.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
|
||||
{
|
||||
vint4 data;
|
||||
std::memcpy(&data.m, p, 4 * sizeof(int));
|
||||
return data;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from 16B aligned memory.
|
||||
*/
|
||||
@@ -283,7 +293,7 @@ struct vint4
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 lane_id()
|
||||
{
|
||||
alignas(ASTCENC_VECALIGN) static const int data[4] { 0, 1, 2, 3 };
|
||||
alignas(16) static const int data[4] { 0, 1, 2, 3 };
|
||||
return vint4(vld1q_s32(data));
|
||||
}
|
||||
|
||||
@@ -346,6 +356,14 @@ struct vmask4
|
||||
m = vreinterpretq_u32_s32(ms);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the scalar from a single lane.
|
||||
*/
|
||||
template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
|
||||
{
|
||||
return vgetq_lane_u32(m, l) != 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief The vector ...
|
||||
*/
|
||||
@@ -577,12 +595,20 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
|
||||
vst1q_s32(p, a.m);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector to an unaligned memory address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
|
||||
{
|
||||
std::memcpy(p, &a.m, sizeof(int) * 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store lowest N (vector width) bytes into an unaligned address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
|
||||
{
|
||||
vst1q_lane_s32((int32_t*)p, a.m, 0);
|
||||
vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -842,7 +868,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
|
||||
{
|
||||
a = round(a);
|
||||
a = a + vfloat4(0.5f);
|
||||
return vint4(vcvtq_s32_f32(a.m));
|
||||
}
|
||||
|
||||
@@ -874,7 +900,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
|
||||
static inline uint16_t float_to_float16(float a)
|
||||
{
|
||||
vfloat4 av(a);
|
||||
return float_to_float16(av).lane<0>();
|
||||
return static_cast<uint16_t>(float_to_float16(av).lane<0>());
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -924,6 +950,138 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
|
||||
return vfloat4(vreinterpretq_f32_s32(v.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
|
||||
{
|
||||
t0p = t0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
|
||||
{
|
||||
t0p = t0;
|
||||
t1p = t1;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(
|
||||
vint4 t0, vint4 t1, vint4 t2, vint4 t3,
|
||||
vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
|
||||
{
|
||||
t0p = t0;
|
||||
t1p = t1;
|
||||
t2p = t2;
|
||||
t3p = t3;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
|
||||
{
|
||||
int8x16_t table {
|
||||
vreinterpretq_s8_s32(t0.m)
|
||||
};
|
||||
|
||||
// Set index byte above max index for unused bytes so table lookup returns zero
|
||||
int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
|
||||
uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
|
||||
|
||||
return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
|
||||
{
|
||||
int8x16x2_t table {
|
||||
vreinterpretq_s8_s32(t0.m),
|
||||
vreinterpretq_s8_s32(t1.m)
|
||||
};
|
||||
|
||||
// Set index byte above max index for unused bytes so table lookup returns zero
|
||||
int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
|
||||
uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
|
||||
|
||||
return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
|
||||
{
|
||||
int8x16x4_t table {
|
||||
vreinterpretq_s8_s32(t0.m),
|
||||
vreinterpretq_s8_s32(t1.m),
|
||||
vreinterpretq_s8_s32(t2.m),
|
||||
vreinterpretq_s8_s32(t3.m)
|
||||
};
|
||||
|
||||
// Set index byte above max index for unused bytes so table lookup returns zero
|
||||
int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
|
||||
uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
|
||||
|
||||
return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a vector of interleaved RGBA data.
|
||||
*
|
||||
* Input vectors have the value stored in the bottom 8 bits of each lane,
|
||||
* with high bits set to zero.
|
||||
*
|
||||
* Output vector stores a single RGBA texel packed in each lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
|
||||
{
|
||||
return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a single vector lane to an unaligned address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
|
||||
{
|
||||
std::memcpy(base, &data, sizeof(int));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector, skipping masked lanes.
|
||||
*
|
||||
* All masked lanes must be at the end of vector, after all non-masked lanes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
|
||||
{
|
||||
if (mask.lane<3>())
|
||||
{
|
||||
store(data, base);
|
||||
}
|
||||
else if (mask.lane<2>() != 0.0f)
|
||||
{
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
store_lane(base + 8, data.lane<2>());
|
||||
}
|
||||
else if (mask.lane<1>() != 0.0f)
|
||||
{
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
}
|
||||
else if (mask.lane<0>() != 0.0f)
|
||||
{
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
}
|
||||
}
|
||||
|
||||
#define ASTCENC_USE_NATIVE_POPCOUNT 1
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2019-2021 Arm Limited
|
||||
// Copyright 2019-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -275,6 +275,16 @@ struct vint4
|
||||
return vint4(*p);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from unaligned memory.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
|
||||
{
|
||||
vint4 data;
|
||||
std::memcpy(&data.m, p, 4 * sizeof(int));
|
||||
return data;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from 16B aligned memory.
|
||||
*/
|
||||
@@ -341,6 +351,13 @@ struct vmask4
|
||||
m[3] = d == false ? 0 : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the scalar value of a single lane.
|
||||
*/
|
||||
template <int l> ASTCENC_SIMD_INLINE float lane() const
|
||||
{
|
||||
return m[l] != 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief The vector ...
|
||||
@@ -550,10 +567,15 @@ template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
|
||||
*/
|
||||
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
|
||||
{
|
||||
return vint4((int)(((unsigned int)a.m[0]) >> s),
|
||||
(int)(((unsigned int)a.m[1]) >> s),
|
||||
(int)(((unsigned int)a.m[2]) >> s),
|
||||
(int)(((unsigned int)a.m[3]) >> s));
|
||||
unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
|
||||
unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
|
||||
unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
|
||||
unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
|
||||
|
||||
return vint4(static_cast<int>(as0),
|
||||
static_cast<int>(as1),
|
||||
static_cast<int>(as2),
|
||||
static_cast<int>(as3));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -639,13 +661,20 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
|
||||
p[3] = a.m[3];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector to an unaligned memory address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
|
||||
{
|
||||
std::memcpy(p, a.m, sizeof(int) * 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store lowest N (vector width) bytes into an unaligned address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
|
||||
{
|
||||
int* pi = (int*)p;
|
||||
*pi = a.m[0];
|
||||
std::memcpy(p, a.m, sizeof(uint8_t) * 4);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -678,10 +707,10 @@ ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
|
||||
{
|
||||
return vint4((cond.m[0] & 0x80000000) ? b.m[0] : a.m[0],
|
||||
(cond.m[1] & 0x80000000) ? b.m[1] : a.m[1],
|
||||
(cond.m[2] & 0x80000000) ? b.m[2] : a.m[2],
|
||||
(cond.m[3] & 0x80000000) ? b.m[3] : a.m[3]);
|
||||
return vint4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
|
||||
(cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
|
||||
(cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
|
||||
(cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@@ -892,10 +921,10 @@ ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
|
||||
{
|
||||
return vfloat4((cond.m[0] & 0x80000000) ? b.m[0] : a.m[0],
|
||||
(cond.m[1] & 0x80000000) ? b.m[1] : a.m[1],
|
||||
(cond.m[2] & 0x80000000) ? b.m[2] : a.m[2],
|
||||
(cond.m[3] & 0x80000000) ? b.m[3] : a.m[3]);
|
||||
return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
|
||||
(cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
|
||||
(cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
|
||||
(cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -903,10 +932,10 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
{
    // Only the most significant bit of each mask lane is inspected: lanes with
    // it set take b, all others take a. The constant is cast to int explicitly
    // so the AND is performed without an implicit signed/unsigned conversion.
    const int msb = static_cast<int>(0x80000000);

    return vfloat4((cond.m[0] & msb) ? b.m[0] : a.m[0],
                   (cond.m[1] & msb) ? b.m[1] : a.m[1],
                   (cond.m[2] & msb) ? b.m[2] : a.m[2],
                   (cond.m[3] & msb) ? b.m[3] : a.m[3]);
}
|
||||
|
||||
/**
|
||||
@@ -947,10 +976,10 @@ ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* ptr)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
    // Lane-wise float -> int conversion; static_cast<int> discards the
    // fractional part (truncation toward zero).
    return vint4(static_cast<int>(a.m[0]),
                 static_cast<int>(a.m[1]),
                 static_cast<int>(a.m[2]),
                 static_cast<int>(a.m[3]));
}
|
||||
|
||||
/**
|
||||
@@ -958,10 +987,11 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
    // Bias by 0.5 and then truncate. Because the truncation is toward zero
    // this only behaves as round-to-nearest for non-negative lane values.
    a = a + vfloat4(0.5f);

    return vint4(static_cast<int>(a.m[0]),
                 static_cast<int>(a.m[1]),
                 static_cast<int>(a.m[2]),
                 static_cast<int>(a.m[3]));
}
|
||||
|
||||
/**
|
||||
@@ -969,10 +999,10 @@ ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
    // Lane-wise int -> float value conversion (not a bit reinterpretation)
    return vfloat4(static_cast<float>(a.m[0]),
                   static_cast<float>(a.m[1]),
                   static_cast<float>(a.m[2]),
                   static_cast<float>(a.m[3]));
}
|
||||
|
||||
/**
|
||||
@@ -1001,10 +1031,10 @@ static inline uint16_t float_to_float16(float a)
|
||||
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
    // Each lane is narrowed to 16 bits explicitly before being handed to the
    // scalar half-float conversion routine.
    return vfloat4(
        sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
}
|
||||
|
||||
/**
|
||||
@@ -1025,7 +1055,7 @@ ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
|
||||
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
    // Bit-for-bit reinterpretation of the four lanes: the raw lane bytes are
    // copied across with a single std::memcpy, no value conversion happens.
    vint4 r;
    std::memcpy(r.m, a.m, 4 * 4);
    return r;
}
|
||||
|
||||
@@ -1039,8 +1069,138 @@ ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
|
||||
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
{
    // Bit-for-bit reinterpretation of the four lanes: the raw lane bytes are
    // copied across with a single std::memcpy, no value conversion happens.
    vfloat4 r;
    std::memcpy(r.m, a.m, 4 * 4);
    return r;
}
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
|
||||
{
|
||||
t0p = t0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
|
||||
{
|
||||
t0p = t0;
|
||||
t1p = t1;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(
|
||||
vint4 t0, vint4 t1, vint4 t2, vint4 t3,
|
||||
vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
|
||||
{
|
||||
t0p = t0;
|
||||
t1p = t1;
|
||||
t2p = t2;
|
||||
t3p = t3;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
|
||||
{
|
||||
uint8_t table[16];
|
||||
|
||||
std::memcpy(table + 0, t0.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
table[idx.lane<2>()],
|
||||
table[idx.lane<3>()]);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
|
||||
{
|
||||
uint8_t table[32];
|
||||
|
||||
std::memcpy(table + 0, t0.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 16, t1.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
table[idx.lane<2>()],
|
||||
table[idx.lane<3>()]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
|
||||
{
|
||||
uint8_t table[64];
|
||||
|
||||
std::memcpy(table + 0, t0.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 16, t1.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 32, t2.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 48, t3.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
table[idx.lane<2>()],
|
||||
table[idx.lane<3>()]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a vector of interleaved RGBA data.
|
||||
*
|
||||
* Input vectors have the value stored in the bottom 8 bits of each lane,
|
||||
* with high bits set to zero.
|
||||
*
|
||||
* Output vector stores a single RGBA texel packed in each lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
|
||||
{
|
||||
return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a single vector lane to an unaligned address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
|
||||
{
|
||||
std::memcpy(base, &data, sizeof(int));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector, skipping masked lanes.
|
||||
*
|
||||
* All masked lanes must be at the end of vector, after all non-masked lanes.
|
||||
* Input is a byte array of at least 4 bytes per unmasked entry.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
|
||||
{
|
||||
if (mask.m[3])
|
||||
{
|
||||
store(data, base);
|
||||
}
|
||||
else if (mask.m[2])
|
||||
{
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
store_lane(base + 8, data.lane<2>());
|
||||
}
|
||||
else if (mask.m[1])
|
||||
{
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
}
|
||||
else if (mask.m[0])
|
||||
{
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
}
|
||||
}
|
||||
|
||||
#endif // #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2019-2021 Arm Limited
|
||||
// Copyright 2019-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -39,6 +39,7 @@
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
// ============================================================================
|
||||
// vfloat4 data type
|
||||
@@ -292,6 +293,18 @@ struct vint4
|
||||
return vint4(*p);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from unaligned memory.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
|
||||
{
|
||||
#if ASTCENC_SSE >= 41
|
||||
return vint4(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(p)));
|
||||
#else
|
||||
return vint4(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p)));
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a vector loaded from 16B aligned memory.
|
||||
*/
|
||||
@@ -363,6 +376,14 @@ struct vmask4
|
||||
m = _mm_castsi128_ps(mask.m);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the scalar value of a single lane.
|
||||
*/
|
||||
template <int l> ASTCENC_SIMD_INLINE bool lane() const
|
||||
{
|
||||
return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)) != 0.0f;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief The vector ...
|
||||
*/
|
||||
@@ -412,7 +433,7 @@ ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
    // _mm_movemask_ps gathers the sign bit of each lane into the low four bits
    // of an int; convert explicitly to the unsigned return type.
    return static_cast<unsigned int>(_mm_movemask_ps(a.m));
}
|
||||
|
||||
// ============================================================================
|
||||
@@ -625,6 +646,14 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
|
||||
_mm_storeu_ps(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector to an unaligned memory address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
|
||||
{
|
||||
std::memcpy(p, &a.m, sizeof(int) * 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store lowest N (vector width) bytes into an unaligned address.
|
||||
*/
|
||||
@@ -801,7 +830,7 @@ ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
|
||||
return vfloat4(_mm_round_ps(a.m, flags));
|
||||
#else
|
||||
__m128 v = a.m;
|
||||
__m128 neg_zero = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
|
||||
__m128 neg_zero = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x80000000)));
|
||||
__m128 no_fraction = _mm_set1_ps(8388608.0f);
|
||||
__m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
|
||||
__m128 sign = _mm_and_ps(v, neg_zero);
|
||||
@@ -926,7 +955,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
    // Bias by 0.5 and then truncate (_mm_cvttps_epi32 converts with truncation).
    // A separate round() beforehand would round the value twice. Because the
    // truncation is toward zero this only behaves as round-to-nearest for
    // non-negative lane values.
    a = a + vfloat4(0.5f);
    return vint4(_mm_cvttps_epi32(a.m));
}
|
||||
|
||||
@@ -980,10 +1009,10 @@ ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
|
||||
return vfloat4(f32);
|
||||
#else
|
||||
return vfloat4(
|
||||
sf16_to_float(a.lane<0>()),
|
||||
sf16_to_float(a.lane<1>()),
|
||||
sf16_to_float(a.lane<2>()),
|
||||
sf16_to_float(a.lane<3>()));
|
||||
sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
|
||||
sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
|
||||
sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
|
||||
sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -993,7 +1022,7 @@ ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
|
||||
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
|
||||
{
|
||||
#if ASTCENC_F16C >= 1
|
||||
__m128i packed = _mm_set1_epi16(a);
|
||||
__m128i packed = _mm_set1_epi16(static_cast<short>(a));
|
||||
__m128 f32 = _mm_cvtph_ps(packed);
|
||||
return _mm_cvtss_f32(f32);
|
||||
#else
|
||||
@@ -1025,6 +1054,208 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
|
||||
return vfloat4(_mm_castsi128_ps(v.m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
|
||||
{
|
||||
t0p = t0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
|
||||
{
|
||||
#if ASTCENC_SSE >= 41
|
||||
t0p = t0;
|
||||
t1p = t0 ^ t1;
|
||||
#else
|
||||
t0p = t0;
|
||||
t1p = t1;
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Prepare a vtable lookup table for use with the native SIMD size.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void vtable_prepare(
|
||||
vint4 t0, vint4 t1, vint4 t2, vint4 t3,
|
||||
vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
|
||||
{
|
||||
#if ASTCENC_SSE >= 41
|
||||
t0p = t0;
|
||||
t1p = t0 ^ t1;
|
||||
t2p = t1 ^ t2;
|
||||
t3p = t2 ^ t3;
|
||||
#else
|
||||
t0p = t0;
|
||||
t1p = t1;
|
||||
t2p = t2;
|
||||
t3p = t3;
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
|
||||
{
|
||||
#if ASTCENC_SSE >= 41
|
||||
// Set index byte MSB to 1 for unused bytes so shuffle returns zero
|
||||
__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
|
||||
|
||||
__m128i result = _mm_shuffle_epi8(t0.m, idxx);
|
||||
return vint4(result);
|
||||
#else
|
||||
uint8_t table[16];
|
||||
|
||||
std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
table[idx.lane<2>()],
|
||||
table[idx.lane<3>()]);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
|
||||
{
|
||||
#if ASTCENC_SSE >= 41
|
||||
// Set index byte MSB to 1 for unused bytes so shuffle returns zero
|
||||
__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
|
||||
|
||||
__m128i result = _mm_shuffle_epi8(t0.m, idxx);
|
||||
idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
|
||||
|
||||
__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
|
||||
result = _mm_xor_si128(result, result2);
|
||||
|
||||
return vint4(result);
|
||||
#else
|
||||
uint8_t table[32];
|
||||
|
||||
std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
table[idx.lane<2>()],
|
||||
table[idx.lane<3>()]);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
|
||||
{
|
||||
#if ASTCENC_SSE >= 41
|
||||
// Set index byte MSB to 1 for unused bytes so shuffle returns zero
|
||||
__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
|
||||
|
||||
__m128i result = _mm_shuffle_epi8(t0.m, idxx);
|
||||
idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
|
||||
|
||||
__m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
|
||||
result = _mm_xor_si128(result, result2);
|
||||
idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
|
||||
|
||||
result2 = _mm_shuffle_epi8(t2.m, idxx);
|
||||
result = _mm_xor_si128(result, result2);
|
||||
idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
|
||||
|
||||
result2 = _mm_shuffle_epi8(t3.m, idxx);
|
||||
result = _mm_xor_si128(result, result2);
|
||||
|
||||
return vint4(result);
|
||||
#else
|
||||
uint8_t table[64];
|
||||
|
||||
std::memcpy(table + 0, &t0.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 16, &t1.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 32, &t2.m, 4 * sizeof(int));
|
||||
std::memcpy(table + 48, &t3.m, 4 * sizeof(int));
|
||||
|
||||
return vint4(table[idx.lane<0>()],
|
||||
table[idx.lane<1>()],
|
||||
table[idx.lane<2>()],
|
||||
table[idx.lane<3>()]);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return a vector of interleaved RGBA data.
|
||||
*
|
||||
* Input vectors have the value stored in the bottom 8 bits of each lane,
|
||||
* with high bits set to zero.
|
||||
*
|
||||
* Output vector stores a single RGBA texel packed in each lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
|
||||
{
|
||||
// Workaround an XCode compiler internal fault; note is slower than slli_epi32
|
||||
// so we should revert this when we get the opportunity
|
||||
#if defined(__APPLE__)
|
||||
__m128i value = r.m;
|
||||
value = _mm_add_epi32(value, _mm_bslli_si128(g.m, 1));
|
||||
value = _mm_add_epi32(value, _mm_bslli_si128(b.m, 2));
|
||||
value = _mm_add_epi32(value, _mm_bslli_si128(a.m, 3));
|
||||
return vint4(value);
|
||||
#else
|
||||
__m128i value = r.m;
|
||||
value = _mm_add_epi32(value, _mm_slli_epi32(g.m, 8));
|
||||
value = _mm_add_epi32(value, _mm_slli_epi32(b.m, 16));
|
||||
value = _mm_add_epi32(value, _mm_slli_epi32(a.m, 24));
|
||||
return vint4(value);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a single vector lane to an unaligned address.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
|
||||
{
|
||||
std::memcpy(base, &data, sizeof(int));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Store a vector, skipping masked lanes.
|
||||
*
|
||||
* All masked lanes must be at the end of vector, after all non-masked lanes.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
|
||||
{
|
||||
#if ASTCENC_AVX >= 2
|
||||
_mm_maskstore_epi32(reinterpret_cast<int*>(base), _mm_castps_si128(mask.m), data.m);
|
||||
#else
|
||||
// Note - we cannot use _mm_maskmoveu_si128 as the underlying hardware doesn't guarantee
|
||||
// fault suppression on masked lanes so we can get page faults at the end of an image.
|
||||
if (mask.lane<3>() != 0.0f)
|
||||
{
|
||||
store(data, base);
|
||||
}
|
||||
else if (mask.lane<2>() != 0.0f)
|
||||
{
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
store_lane(base + 8, data.lane<2>());
|
||||
}
|
||||
else if (mask.lane<1>() != 0.0f)
|
||||
{
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
store_lane(base + 4, data.lane<1>());
|
||||
}
|
||||
else if (mask.lane<0>() != 0.0f)
|
||||
{
|
||||
store_lane(base + 0, data.lane<0>());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41)
|
||||
|
||||
#define ASTCENC_USE_NATIVE_DOT_PRODUCT 1
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -44,26 +44,24 @@
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
|
||||
static constexpr unsigned int ANGULAR_STEPS { 40 };
|
||||
|
||||
// Store a reduced sin/cos table for 64 possible weight values; this causes slight quality loss
|
||||
// compared to using sin() and cos() directly. Must be 2^N.
|
||||
static constexpr unsigned int SINCOS_STEPS { 64 };
|
||||
static constexpr unsigned int ANGULAR_STEPS { 32 };
|
||||
|
||||
static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
|
||||
"ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
|
||||
|
||||
static uint8_t max_angular_steps_needed_for_quant_level[13];
|
||||
static_assert(ANGULAR_STEPS >= 32,
|
||||
"ANGULAR_STEPS must be at least max(steps_for_quant_level)");
|
||||
|
||||
// The next-to-last entry is supposed to have the value 33. This because the 32-weight mode leaves a
|
||||
// double-sized hole in the middle of the weight space, so we are better off matching 33 weights.
|
||||
static const uint8_t quantization_steps_for_level[13] {
|
||||
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33, 36
|
||||
// Store a reduced sin/cos table for 64 possible weight values; this causes
|
||||
// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
|
||||
static constexpr unsigned int SINCOS_STEPS { 64 };
|
||||
|
||||
static const uint8_t steps_for_quant_level[12] {
|
||||
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
|
||||
};
|
||||
|
||||
alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
static bool print_once { true };
|
||||
@@ -72,7 +70,6 @@ alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
|
||||
/* See header for documentation. */
|
||||
void prepare_angular_tables()
|
||||
{
|
||||
unsigned int max_angular_steps_needed_for_quant_steps[ANGULAR_STEPS + 1];
|
||||
for (unsigned int i = 0; i < ANGULAR_STEPS; i++)
|
||||
{
|
||||
float angle_step = static_cast<float>(i + 1);
|
||||
@@ -82,13 +79,6 @@ void prepare_angular_tables()
|
||||
sin_table[j][i] = static_cast<float>(sinf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
|
||||
cos_table[j][i] = static_cast<float>(cosf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
|
||||
}
|
||||
|
||||
max_angular_steps_needed_for_quant_steps[i + 1] = astc::min(i + 1, ANGULAR_STEPS - 1);
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < 13; i++)
|
||||
{
|
||||
max_angular_steps_needed_for_quant_level[i] = max_angular_steps_needed_for_quant_steps[quantization_steps_for_level[i]];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,7 +99,7 @@ static void compute_angular_offsets(
|
||||
promise(weight_count > 0);
|
||||
promise(max_angular_steps > 0);
|
||||
|
||||
alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS];
|
||||
ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
|
||||
|
||||
// Precompute isample; arrays are always allocated 64 elements long
|
||||
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||
@@ -165,7 +155,7 @@ static void compute_lowest_and_highest_weight(
|
||||
unsigned int max_angular_steps,
|
||||
unsigned int max_quant_steps,
|
||||
const float* offsets,
|
||||
int* lowest_weight,
|
||||
float* lowest_weight,
|
||||
int* weight_span,
|
||||
float* error,
|
||||
float* cut_low_weight_error,
|
||||
@@ -184,11 +174,11 @@ static void compute_lowest_and_highest_weight(
|
||||
vfloat errval = vfloat::zero();
|
||||
vfloat cut_low_weight_err = vfloat::zero();
|
||||
vfloat cut_high_weight_err = vfloat::zero();
|
||||
vfloat offset = loada(&offsets[sp]);
|
||||
vfloat offset = loada(offsets + sp);
|
||||
|
||||
for (unsigned int j = 0; j < weight_count; ++j)
|
||||
for (unsigned int j = 0; j < weight_count; j++)
|
||||
{
|
||||
vfloat sval = load1(&dec_weight_ideal_value[j]) * rcp_stepsize - offset;
|
||||
vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
|
||||
vfloat svalrte = round(sval);
|
||||
vfloat diff = sval - svalrte;
|
||||
errval += diff * diff;
|
||||
@@ -218,16 +208,16 @@ static void compute_lowest_and_highest_weight(
|
||||
vint span = float_to_int(maxidx - minidx + vfloat(1));
|
||||
span = min(span, vint(max_quant_steps + 3));
|
||||
span = max(span, vint(2));
|
||||
storea(float_to_int(minidx), &lowest_weight[sp]);
|
||||
storea(span, &weight_span[sp]);
|
||||
storea(minidx, lowest_weight + sp);
|
||||
storea(span, weight_span + sp);
|
||||
|
||||
// The cut_(lowest/highest)_weight_error indicate the error that results from forcing
|
||||
// samples that should have had the weight value one step (up/down).
|
||||
vfloat ssize = 1.0f / rcp_stepsize;
|
||||
vfloat errscale = ssize * ssize;
|
||||
storea(errval * errscale, &error[sp]);
|
||||
storea(cut_low_weight_err * errscale, &cut_low_weight_error[sp]);
|
||||
storea(cut_high_weight_err * errscale, &cut_high_weight_error[sp]);
|
||||
storea(errval * errscale, error + sp);
|
||||
storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
|
||||
storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
|
||||
|
||||
rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
|
||||
}
|
||||
@@ -246,21 +236,22 @@ static void compute_angular_endpoints_for_quant_levels(
|
||||
unsigned int weight_count,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_quant_level,
|
||||
float low_value[12],
|
||||
float high_value[12]
|
||||
float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
|
||||
float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
|
||||
) {
|
||||
unsigned int max_quant_steps = quantization_steps_for_level[max_quant_level];
|
||||
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
|
||||
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
|
||||
|
||||
ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
|
||||
unsigned int max_angular_steps = max_angular_steps_needed_for_quant_level[max_quant_level];
|
||||
compute_angular_offsets(weight_count, dec_weight_ideal_value,
|
||||
max_angular_steps, angular_offsets);
|
||||
|
||||
alignas(ASTCENC_VECALIGN) int32_t lowest_weight[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
|
||||
|
||||
compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
|
||||
max_angular_steps, max_quant_steps,
|
||||
@@ -270,7 +261,7 @@ static void compute_angular_endpoints_for_quant_levels(
|
||||
// For each quantization level, find the best error terms. Use packed vectors so data-dependent
|
||||
// branches can become selects. This involves some integer to float casts, but the values are
|
||||
// small enough so they never round the wrong way.
|
||||
vfloat4 best_results[40];
|
||||
vfloat4 best_results[36];
|
||||
|
||||
// Initialize the array to some safe defaults
|
||||
promise(max_quant_steps > 0);
|
||||
@@ -296,30 +287,30 @@ static void compute_angular_endpoints_for_quant_levels(
|
||||
// Check best error against record N
|
||||
vfloat4 best_result = best_results[idx_span];
|
||||
vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
|
||||
vmask4 mask1(best_result.lane<0>() > error[i]);
|
||||
best_results[idx_span] = select(best_result, new_result, mask1);
|
||||
vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
|
||||
best_results[idx_span] = select(best_result, new_result, mask);
|
||||
|
||||
// Check best error against record N-1 with either cut low or cut high
|
||||
best_result = best_results[idx_span - 1];
|
||||
|
||||
new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
|
||||
vmask4 mask2(best_result.lane<0>() > error_cut_low);
|
||||
best_result = select(best_result, new_result, mask2);
|
||||
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
|
||||
best_result = select(best_result, new_result, mask);
|
||||
|
||||
new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
|
||||
vmask4 mask3(best_result.lane<0>() > error_cut_high);
|
||||
best_results[idx_span - 1] = select(best_result, new_result, mask3);
|
||||
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
|
||||
best_results[idx_span - 1] = select(best_result, new_result, mask);
|
||||
|
||||
// Check best error against record N-2 with both cut low and high
|
||||
best_result = best_results[idx_span - 2];
|
||||
new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
|
||||
vmask4 mask4(best_result.lane<0>() > error_cut_low_high);
|
||||
best_results[idx_span - 2] = select(best_result, new_result, mask4);
|
||||
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
|
||||
best_results[idx_span - 2] = select(best_result, new_result, mask);
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i <= max_quant_level; i++)
|
||||
{
|
||||
unsigned int q = quantization_steps_for_level[i];
|
||||
unsigned int q = steps_for_quant_level[i];
|
||||
int bsi = static_cast<int>(best_results[q].lane<1>());
|
||||
|
||||
// Did we find anything?
|
||||
@@ -333,181 +324,28 @@ static void compute_angular_endpoints_for_quant_levels(
|
||||
|
||||
bsi = astc::max(0, bsi);
|
||||
|
||||
float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
|
||||
float hwi = lwi + static_cast<float>(q) - 1.0f;
|
||||
|
||||
float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
|
||||
int lwi = lowest_weight[bsi] + static_cast<int>(best_results[q].lane<2>());
|
||||
int hwi = lwi + q - 1;
|
||||
|
||||
float offset = angular_offsets[bsi] * stepsize;
|
||||
low_value[i] = offset + static_cast<float>(lwi) * stepsize;
|
||||
high_value[i] = offset + static_cast<float>(hwi) * stepsize;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief For a given step size compute the lowest and highest weight, variant for low weight count.
|
||||
*
|
||||
* Compute the lowest and highest weight that results from quantizing using the given stepsize and
|
||||
* offset, and then compute the resulting error. The cut errors indicate the error that results from
|
||||
* forcing samples that should have had one weight value one step up or down.
|
||||
*
|
||||
* @param weight_count The number of (decimated) weights.
|
||||
* @param dec_weight_quant_uvalue The decimated and quantized weight values.
|
||||
* @param max_angular_steps The maximum number of steps to be tested.
|
||||
* @param max_quant_steps The maximum quantization level to be tested.
|
||||
* @param offsets The angular offsets array.
|
||||
* @param[out] lowest_weight Per angular step, the lowest weight.
|
||||
* @param[out] weight_span Per angular step, the span between lowest and highest weight.
|
||||
* @param[out] error Per angular step, the error.
|
||||
*/
|
||||
static void compute_lowest_and_highest_weight_lwc(
|
||||
unsigned int weight_count,
|
||||
const float* dec_weight_quant_uvalue,
|
||||
unsigned int max_angular_steps,
|
||||
unsigned int max_quant_steps,
|
||||
const float* offsets,
|
||||
int* lowest_weight,
|
||||
int* weight_span,
|
||||
float* error
|
||||
) {
|
||||
promise(weight_count > 0);
|
||||
promise(max_angular_steps > 0);
|
||||
|
||||
// Per-lane multiplier applied to every weight below; each lane starts at its lane id plus one
vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
|
||||
|
||||
// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
|
||||
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
// Running per-lane minimum and maximum rounded value, and accumulated squared rounding error
vfloat minidx(128.0f);
|
||||
vfloat maxidx(-128.0f);
|
||||
vfloat errval = vfloat::zero();
|
||||
vfloat offset = loada(&offsets[sp]);
|
||||
|
||||
for (unsigned int j = 0; j < weight_count; ++j)
|
||||
{
|
||||
// Scale the weight by the lane's multiplier, subtract the lane's offset, and measure how
// far the result is from its rounded value
vfloat sval = load1(&dec_weight_quant_uvalue[j]) * rcp_stepsize - offset;
|
||||
vfloat svalrte = round(sval);
|
||||
vfloat diff = sval - svalrte;
|
||||
errval += diff * diff;
|
||||
|
||||
// Update the running minimum; mask marks lanes where svalrte is below the current minimum
|
||||
vmask mask = svalrte < minidx;
|
||||
minidx = select(minidx, svalrte, mask);
|
||||
|
||||
// Update the running maximum; mask marks lanes where svalrte is above the current maximum
|
||||
mask = svalrte > maxidx;
|
||||
maxidx = select(maxidx, svalrte, mask);
|
||||
}
|
||||
|
||||
// Write out min weight and weight span; clamp span to the range [2, max_quant_steps + 3]
|
||||
vint span = float_to_int(maxidx - minidx + vfloat(1.0f));
|
||||
span = min(span, vint(max_quant_steps + 3));
|
||||
span = max(span, vint(2));
|
||||
storea(float_to_int(minidx), &lowest_weight[sp]);
|
||||
storea(span, &weight_span[sp]);
|
||||
|
||||
// Rescale the accumulated squared error by the squared step size, where the step size is
|
||||
// ssize = 1 / rcp_stepsize, before writing it out.
|
||||
vfloat ssize = 1.0f / rcp_stepsize;
|
||||
vfloat errscale = ssize * ssize;
|
||||
storea(errval * errscale, &error[sp]);
|
||||
|
||||
// Advance every lane's multiplier by one vector width for the next batch of angular steps
rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief The main function for the angular algorithm, variant for low weight count.
|
||||
*
|
||||
* @param weight_count The number of (decimated) weights.
|
||||
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
|
||||
* @param max_quant_level The maximum quantization level to be tested.
|
||||
* @param[out] low_value Per angular step, the lowest weight value.
|
||||
* @param[out] high_value Per angular step, the highest weight value.
|
||||
*/
|
||||
static void compute_angular_endpoints_for_quant_levels_lwc(
|
||||
unsigned int weight_count,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_quant_level,
|
||||
float low_value[12],
|
||||
float high_value[12]
|
||||
) {
|
||||
unsigned int max_quant_steps = quantization_steps_for_level[max_quant_level];
|
||||
unsigned int max_angular_steps = max_angular_steps_needed_for_quant_level[max_quant_level];
|
||||
|
||||
alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) int32_t lowest_weight[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
|
||||
alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
|
||||
|
||||
compute_angular_offsets(weight_count, dec_weight_ideal_value,
|
||||
max_angular_steps, angular_offsets);
|
||||
|
||||
|
||||
compute_lowest_and_highest_weight_lwc(weight_count, dec_weight_ideal_value,
|
||||
max_angular_steps, max_quant_steps,
|
||||
angular_offsets, lowest_weight, weight_span, error);
|
||||
|
||||
// For each quantization level, find the best error terms. Use packed vectors so data-dependent
|
||||
// branches can become selects. This involves some integer to float casts, but the values are
|
||||
// small enough so they never round the wrong way.
|
||||
vfloat4 best_results[ANGULAR_STEPS];
|
||||
|
||||
// Initialize the array to some safe defaults
|
||||
promise(max_quant_steps > 0);
|
||||
for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
|
||||
{
|
||||
best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
promise(max_angular_steps > 0);
|
||||
for (unsigned int i = 0; i < max_angular_steps; i++)
|
||||
{
|
||||
int idx_span = weight_span[i];
|
||||
|
||||
// Check best error against record N
|
||||
vfloat4 current_best = best_results[idx_span];
|
||||
vfloat4 candidate = vfloat4(error[i], static_cast<float>(i), 0.0f, 0.0f);
|
||||
vmask4 mask(current_best.lane<0>() > error[i]);
|
||||
best_results[idx_span] = select(current_best, candidate, mask);
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i <= max_quant_level; i++)
|
||||
{
|
||||
unsigned int q = quantization_steps_for_level[i];
|
||||
int bsi = static_cast<int>(best_results[q].lane<1>());
|
||||
|
||||
// Did we find anything?
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
if ((bsi < 0) && print_once)
|
||||
{
|
||||
print_once = false;
|
||||
printf("INFO: Unable to find low weight encoding within search error limit.\n\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
bsi = astc::max(0, bsi);
|
||||
|
||||
int lwi = lowest_weight[bsi];
|
||||
int hwi = lwi + q - 1;
|
||||
|
||||
low_value[i] = (angular_offsets[bsi] + static_cast<float>(lwi)) / (1.0f + static_cast<float>(bsi));
|
||||
high_value[i] = (angular_offsets[bsi] + static_cast<float>(hwi)) / (1.0f + static_cast<float>(bsi));
|
||||
low_value[i] = (angular_offsets[bsi] + lwi) * stepsize;
|
||||
high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_angular_endpoints_1plane(
|
||||
unsigned int tune_low_weight_limit,
|
||||
bool only_always,
|
||||
const block_size_descriptor& bsd,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_weight_quant,
|
||||
compression_working_buffers& tmpbuf
|
||||
) {
|
||||
float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
|
||||
float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
|
||||
|
||||
float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values1;
|
||||
float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values1;
|
||||
float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
|
||||
float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
|
||||
|
||||
unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
|
||||
: bsd.decimation_mode_count_selected;
|
||||
@@ -515,33 +353,34 @@ void compute_angular_endpoints_1plane(
|
||||
for (unsigned int i = 0; i < max_decimation_modes; i++)
|
||||
{
|
||||
const decimation_mode& dm = bsd.decimation_modes[i];
|
||||
if (!dm.ref_1_plane)
|
||||
if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
|
||||
|
||||
if (weight_count < tune_low_weight_limit)
|
||||
unsigned int max_precision = dm.maxprec_1plane;
|
||||
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
|
||||
{
|
||||
compute_angular_endpoints_for_quant_levels_lwc(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
dm.maxprec_1plane, low_values[i], high_values[i]);
|
||||
max_precision = TUNE_MAX_ANGULAR_QUANT;
|
||||
}
|
||||
else
|
||||
|
||||
if (max_precision > max_weight_quant)
|
||||
{
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
dm.maxprec_1plane, low_values[i], high_values[i]);
|
||||
max_precision = max_weight_quant;
|
||||
}
|
||||
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
max_precision, low_values[i], high_values[i]);
|
||||
}
|
||||
|
||||
unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
|
||||
: bsd.block_mode_count_1plane_selected;
|
||||
promise(max_block_modes > 0);
|
||||
for (unsigned int i = 0; i < max_block_modes; ++i)
|
||||
for (unsigned int i = 0; i < max_block_modes; i++)
|
||||
{
|
||||
const block_mode& bm = bsd.block_modes[i];
|
||||
assert(!bm.is_dual_plane);
|
||||
@@ -549,16 +388,24 @@ void compute_angular_endpoints_1plane(
|
||||
unsigned int quant_mode = bm.quant_mode;
|
||||
unsigned int decim_mode = bm.decimation_mode;
|
||||
|
||||
low_value[i] = low_values[decim_mode][quant_mode];
|
||||
high_value[i] = high_values[decim_mode][quant_mode];
|
||||
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
|
||||
{
|
||||
low_value[i] = low_values[decim_mode][quant_mode];
|
||||
high_value[i] = high_values[decim_mode][quant_mode];
|
||||
}
|
||||
else
|
||||
{
|
||||
low_value[i] = 0.0f;
|
||||
high_value[i] = 1.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_angular_endpoints_2planes(
|
||||
unsigned int tune_low_weight_limit,
|
||||
const block_size_descriptor& bsd,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_weight_quant,
|
||||
compression_working_buffers& tmpbuf
|
||||
) {
|
||||
float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
|
||||
@@ -566,46 +413,42 @@ void compute_angular_endpoints_2planes(
|
||||
float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
|
||||
float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
|
||||
|
||||
float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values1;
|
||||
float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values1;
|
||||
float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values2;
|
||||
float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values2;
|
||||
float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
|
||||
float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
|
||||
float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
|
||||
float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;
|
||||
|
||||
promise(bsd.decimation_mode_count_selected > 0);
|
||||
for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
|
||||
{
|
||||
const decimation_mode& dm = bsd.decimation_modes[i];
|
||||
if (!dm.ref_2_planes)
|
||||
if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
|
||||
|
||||
if (weight_count < tune_low_weight_limit)
|
||||
unsigned int max_precision = dm.maxprec_2planes;
|
||||
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
|
||||
{
|
||||
compute_angular_endpoints_for_quant_levels_lwc(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
dm.maxprec_2planes, low_values1[i], high_values1[i]);
|
||||
|
||||
compute_angular_endpoints_for_quant_levels_lwc(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
|
||||
dm.maxprec_2planes, low_values2[i], high_values2[i]);
|
||||
max_precision = TUNE_MAX_ANGULAR_QUANT;
|
||||
}
|
||||
else
|
||||
|
||||
if (max_precision > max_weight_quant)
|
||||
{
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
dm.maxprec_2planes, low_values1[i], high_values1[i]);
|
||||
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
|
||||
dm.maxprec_2planes, low_values2[i], high_values2[i]);
|
||||
max_precision = max_weight_quant;
|
||||
}
|
||||
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
max_precision, low_values1[i], high_values1[i]);
|
||||
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
|
||||
max_precision, low_values2[i], high_values2[i]);
|
||||
}
|
||||
|
||||
unsigned int start = bsd.block_mode_count_1plane_selected;
|
||||
@@ -616,10 +459,20 @@ void compute_angular_endpoints_2planes(
|
||||
unsigned int quant_mode = bm.quant_mode;
|
||||
unsigned int decim_mode = bm.decimation_mode;
|
||||
|
||||
low_value1[i] = low_values1[decim_mode][quant_mode];
|
||||
high_value1[i] = high_values1[decim_mode][quant_mode];
|
||||
low_value2[i] = low_values2[decim_mode][quant_mode];
|
||||
high_value2[i] = high_values2[decim_mode][quant_mode];
|
||||
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
|
||||
{
|
||||
low_value1[i] = low_values1[decim_mode][quant_mode];
|
||||
high_value1[i] = high_values1[decim_mode][quant_mode];
|
||||
low_value2[i] = low_values2[decim_mode][quant_mode];
|
||||
high_value2[i] = high_values2[decim_mode][quant_mode];
|
||||
}
|
||||
else
|
||||
{
|
||||
low_value1[i] = 0.0f;
|
||||
high_value1[i] = 1.0f;
|
||||
low_value2[i] = 0.0f;
|
||||
high_value2[i] = 1.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,145 +23,125 @@
|
||||
|
||||
#define _ 0 // Using _ to indicate an entry that will not be used.
|
||||
|
||||
const quantization_and_transfer_table quant_and_xfer_tables[12] {
|
||||
// Quantization method 0, range 0..1
|
||||
const quant_and_transfer_table quant_and_xfer_tables[12] {
|
||||
// QUANT_2, range 0..1
|
||||
{
|
||||
QUANT_2,
|
||||
{0, 64, 255},
|
||||
{0, 64},
|
||||
{0, 1},
|
||||
{0, 64},
|
||||
{0x01004000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
{0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
0x01004000}
|
||||
0x4000}
|
||||
},
|
||||
// Quantization method 1, range 0..2
|
||||
// QUANT_3, range 0..2
|
||||
{
|
||||
QUANT_3,
|
||||
{0, 32, 64, 255},
|
||||
{0, 32, 64},
|
||||
{0, 1, 2},
|
||||
{0, 32, 64},
|
||||
{0x01002000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,0x02004000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,_,_,0x02014020}
|
||||
{0x2000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,_,_,0x4020}
|
||||
},
|
||||
// Quantization method 2, range 0..3
|
||||
// QUANT_4, range 0..3
|
||||
{
|
||||
QUANT_4,
|
||||
{0, 21, 43, 64, 255},
|
||||
{0, 21, 43, 64},
|
||||
{0, 1, 2, 3},
|
||||
{0, 21, 43, 64},
|
||||
{0x01001500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x02002b00,_,_,_,_,
|
||||
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x03014015,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,_,_,_,_,_,_,0x0302402b}
|
||||
{0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_,
|
||||
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,_,_,_,_,_,_,0x402b}
|
||||
},
|
||||
// Quantization method 3, range 0..4
|
||||
// QUANT_5, range 0..4
|
||||
{
|
||||
QUANT_5,
|
||||
{0, 16, 32, 48, 64, 255},
|
||||
{0, 16, 32, 48, 64},
|
||||
{0, 1, 2, 3, 4},
|
||||
{0, 16, 32, 48, 64},
|
||||
{0x01001000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x02002000,_,_,_,_,_,_,_,_,_,
|
||||
_,_,_,_,_,_,0x03013010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x04024020,_,_,_,
|
||||
_,_,_,_,_,_,_,_,_,_,_,_,0x04034030}
|
||||
{0x1000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2000,_,_,_,_,_,_,_,_,_,
|
||||
_,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_,
|
||||
_,_,_,_,_,_,_,_,_,_,_,_,0x4030}
|
||||
},
|
||||
// Quantization method 4, range 0..5
|
||||
// QUANT_6, range 0..5
|
||||
{
|
||||
QUANT_6,
|
||||
{0, 12, 25, 39, 52, 64, 255},
|
||||
{0, 12, 25, 39, 52, 64},
|
||||
{0, 2, 4, 5, 3, 1},
|
||||
{0, 64, 12, 52, 25, 39},
|
||||
{0x02000c00,_,_,_,_,_,_,_,_,_,_,_,0x04001900,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
0x0502270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x03043419,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,0x01054027,_,_,_,_,_,_,_,_,_,_,_,0x01034034}
|
||||
{0x0c00,_,_,_,_,_,_,_,_,_,_,_,0x1900,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034}
|
||||
},
|
||||
// Quantization method 5, range 0..7
|
||||
// QUANT_8, range 0..7
|
||||
{
|
||||
QUANT_8,
|
||||
{0, 9, 18, 27, 37, 46, 55, 64, 255},
|
||||
{0, 9, 18, 27, 37, 46, 55, 64},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7},
|
||||
{0, 9, 18, 27, 37, 46, 55, 64},
|
||||
{0x01000900,_,_,_,_,_,_,_,_,0x02001200,_,_,_,_,_,_,_,_,0x03011b09,_,_,
|
||||
_,_,_,_,_,_,0x04022512,_,_,_,_,_,_,_,_,_,0x05032e1b,_,_,_,_,_,_,_,_,
|
||||
0x06043725,_,_,_,_,_,_,_,_,0x0705402e,_,_,_,_,_,_,_,_,0x07064037}
|
||||
{0x0900,_,_,_,_,_,_,_,_,0x1200,_,_,_,_,_,_,_,_,0x1b09,_,_,
|
||||
_,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_,
|
||||
0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037}
|
||||
},
|
||||
// Quantization method 6, range 0..9
|
||||
// QUANT_10, range 0..9
|
||||
{
|
||||
QUANT_10,
|
||||
{0, 7, 14, 21, 28, 36, 43, 50, 57, 64, 255},
|
||||
{0, 7, 14, 21, 28, 36, 43, 50, 57, 64},
|
||||
{0, 2, 4, 6, 8, 9, 7, 5, 3, 1},
|
||||
{0, 64, 7, 57, 14, 50, 21, 43, 28, 36},
|
||||
{0x02000700,_,_,_,_,_,_,0x04000e00,_,_,_,_,_,_,0x06021507,_,_,_,_,_,_,
|
||||
0x08041c0e,_,_,_,_,_,_,0x09062415,_,_,_,_,_,_,_,0x07082b1c,_,_,_,_,_,
|
||||
_,0x05093224,_,_,_,_,_,_,0x0307392b,_,_,_,_,_,_,0x01054032,_,_,_,_,_,
|
||||
_,0x01034039}
|
||||
{0x0700,_,_,_,_,_,_,0x0e00,_,_,_,_,_,_,0x1507,_,_,_,_,_,_,
|
||||
0x1c0e,_,_,_,_,_,_,0x2415,_,_,_,_,_,_,_,0x2b1c,_,_,_,_,_,
|
||||
_,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_,
|
||||
_,0x4039}
|
||||
},
|
||||
// Quantization method 7, range 0..11
|
||||
// QUANT_12, range 0..11
|
||||
{
|
||||
QUANT_12,
|
||||
{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64, 255},
|
||||
{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64},
|
||||
{0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1},
|
||||
{0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36},
|
||||
{0x04000500,_,_,_,_,0x08000b00,_,_,_,_,_,0x02041105,_,_,_,_,_,
|
||||
0x0608170b,_,_,_,_,_,0x0a021c11,_,_,_,_,0x0b062417,_,_,_,_,_,_,_,
|
||||
0x070a291c,_,_,_,_,0x030b2f24,_,_,_,_,_,0x09073529,_,_,_,_,_,
|
||||
0x05033b2f,_,_,_,_,_,0x01094035,_,_,_,_,0x0105403b}
|
||||
{0x0500,_,_,_,_,0x0b00,_,_,_,_,_,0x1105,_,_,_,_,_,
|
||||
0x170b,_,_,_,_,_,0x1c11,_,_,_,_,0x2417,_,_,_,_,_,_,_,
|
||||
0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_,
|
||||
0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b}
|
||||
},
|
||||
// Quantization method 8, range 0..15
|
||||
// QUANT_16, range 0..15
|
||||
{
|
||||
QUANT_16,
|
||||
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64, 255},
|
||||
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
|
||||
{0x01000400,_,_,_,0x02000800,_,_,_,0x03010c04,_,_,_,0x04021108,_,_,_,_,
|
||||
0x0503150c,_,_,_,0x06041911,_,_,_,0x07051d15,_,_,_,0x08062319,_,_,_,_,
|
||||
_,0x0907271d,_,_,_,0x0a082b23,_,_,_,0x0b092f27,_,_,_,0x0c0a342b,_,_,_,
|
||||
_,0x0d0b382f,_,_,_,0x0e0c3c34,_,_,_,0x0f0d4038,_,_,_,0x0f0e403c}
|
||||
{0x0400,_,_,_,0x0800,_,_,_,0x0c04,_,_,_,0x1108,_,_,_,_,
|
||||
0x150c,_,_,_,0x1911,_,_,_,0x1d15,_,_,_,0x2319,_,_,_,_,
|
||||
_,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_,
|
||||
_,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c}
|
||||
},
|
||||
// Quantization method 9, range 0..19
|
||||
// QUANT_20, range 0..19
|
||||
{
|
||||
QUANT_20,
|
||||
{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58,
|
||||
61, 64, 255},
|
||||
{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64},
|
||||
{0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1},
|
||||
{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51,
|
||||
29, 35},
|
||||
{0x04000300,_,_,0x08000600,_,_,0x0c040903,_,_,0x10080d06,_,_,_,
|
||||
0x020c1009,_,_,0x0610130d,_,_,0x0a021710,_,_,_,0x0e061a13,_,_,
|
||||
0x120a1d17,_,_,0x130e231a,_,_,_,_,_,0x0f12261d,_,_,0x0b132923,_,_,
|
||||
0x070f2d26,_,_,_,0x030b3029,_,_,0x1107332d,_,_,0x0d033730,_,_,_,
|
||||
0x09113a33,_,_,0x050d3d37,_,_,0x0109403a,_,_,0x0105403d}
|
||||
{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35},
|
||||
{0x0300,_,_,0x0600,_,_,0x0903,_,_,0x0d06,_,_,_,
|
||||
0x1009,_,_,0x130d,_,_,0x1710,_,_,_,0x1a13,_,_,
|
||||
0x1d17,_,_,0x231a,_,_,_,_,_,0x261d,_,_,0x2923,_,_,
|
||||
0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_,
|
||||
0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d}
|
||||
},
|
||||
// Quantization method 10, range 0..23
|
||||
// QUANT_24, range 0..23
|
||||
{
|
||||
QUANT_24,
|
||||
{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48,
|
||||
51, 53, 56, 59, 62, 64, 255},
|
||||
{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19,
|
||||
11, 3, 17, 9, 1},
|
||||
{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59,
|
||||
13, 51, 22, 42, 30, 34},
|
||||
{0x08000200,_,0x10000500,_,_,0x02080802,_,_,0x0a100b05,_,_,0x12020d08,
|
||||
_,0x040a100b,_,_,0x0c12130d,_,_,0x14041610,_,_,0x060c1813,_,
|
||||
0x0e141b16,_,_,0x16061e18,_,_,0x170e221b,_,_,_,0x0f16251e,_,_,
|
||||
0x07172822,_,_,0x150f2a25,_,0x0d072d28,_,_,0x0515302a,_,_,0x130d332d,
|
||||
_,_,0x0b053530,_,0x03133833,_,_,0x110b3b35,_,_,0x09033e38,_,_,
|
||||
0x0111403b,_,0x0109403e}
|
||||
{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64},
|
||||
{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1},
|
||||
{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34},
|
||||
{0x0200,_,0x0500,_,_,0x0802,_,_,0x0b05,_,_,0x0d08,
|
||||
_,0x100b,_,_,0x130d,_,_,0x1610,_,_,0x1813,_,
|
||||
0x1b16,_,_,0x1e18,_,_,0x221b,_,_,_,0x251e,_,_,
|
||||
0x2822,_,_,0x2a25,_,0x2d28,_,_,0x302a,_,_,0x332d,
|
||||
_,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_,
|
||||
0x403b,_,0x403e}
|
||||
},
|
||||
// Quantization method 11, range 0..31
|
||||
// QUANT_32, range 0..31
|
||||
{
|
||||
QUANT_32,
|
||||
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38,
|
||||
40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 255},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
||||
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38,
|
||||
40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
|
||||
{0x01000200,_,0x02000400,_,0x03010602,_,0x04020804,_,0x05030a06,_,
|
||||
0x06040c08,_,0x07050e0a,_,0x0806100c,_,0x0907120e,_,0x0a081410,_,
|
||||
0x0b091612,_,0x0c0a1814,_,0x0d0b1a16,_,0x0e0c1c18,_,0x0f0d1e1a,_,
|
||||
0x100e221c,_,_,_,0x110f241e,_,0x12102622,_,0x13112824,_,0x14122a26,_,
|
||||
0x15132c28,_,0x16142e2a,_,0x1715302c,_,0x1816322e,_,0x19173430,_,
|
||||
0x1a183632,_,0x1b193834,_,0x1c1a3a36,_,0x1d1b3c38,_,0x1e1c3e3a,_,
|
||||
0x1f1d403c,_,0x1f1e403e}
|
||||
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
|
||||
{0x0200,_,0x0400,_,0x0602,_,0x0804,_,0x0a06,_,
|
||||
0x0c08,_,0x0e0a,_,0x100c,_,0x120e,_,0x1410,_,
|
||||
0x1612,_,0x1814,_,0x1a16,_,0x1c18,_,0x1e1a,_,
|
||||
0x221c,_,_,_,0x241e,_,0x2622,_,0x2824,_,0x2a26,_,
|
||||
0x2c28,_,0x2e2a,_,0x302c,_,0x322e,_,0x3430,_,
|
||||
0x3632,_,0x3834,_,0x3a36,_,0x3c38,_,0x3e3a,_,
|
||||
0x403c,_,0x403e}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2020-2021 Arm Limited
|
||||
// Copyright 2020-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -18,14 +18,25 @@
|
||||
/**
|
||||
* @brief Platform-specific function implementations.
|
||||
*
|
||||
* This module contains functions for querying the host extended ISA support.
|
||||
* This module contains the CLI entry point which also performs the role of
|
||||
* validating the host extended ISA support meets the needs of the tools.
|
||||
*/
|
||||
|
||||
// Include before the defines below to pick up any auto-setup based on compiler
|
||||
// built-in config, if not being set explicitly by the build system
|
||||
#include "astcenc_internal.h"
|
||||
#include <cstdio>
|
||||
|
||||
#if (ASTCENC_SSE > 0) || (ASTCENC_AVX > 0) || \
|
||||
/**
|
||||
* @brief The main entry point.
|
||||
*
|
||||
* @param argc The number of arguments.
|
||||
* @param argv The vector of arguments.
|
||||
*
|
||||
* @return 0 on success, non-zero otherwise.
|
||||
*/
|
||||
int astcenc_main(
|
||||
int argc,
|
||||
char **argv);
|
||||
|
||||
#if (ASTCENC_SSE > 20) || (ASTCENC_AVX > 0) || \
|
||||
(ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
|
||||
|
||||
static bool g_init { false };
|
||||
@@ -47,7 +58,7 @@ static bool g_cpu_has_f16c { false };
|
||||
============================================================================ */
|
||||
#if !defined(__clang__) && defined(_MSC_VER)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <Windows.h>
|
||||
#include <windows.h>
|
||||
#include <intrin.h>
|
||||
|
||||
/**
|
||||
@@ -119,8 +130,13 @@ static void detect_cpu_isa()
|
||||
}
|
||||
#endif
|
||||
|
||||
/* See header for documentation. */
|
||||
bool cpu_supports_popcnt()
|
||||
#if ASTCENC_POPCNT > 0
|
||||
/**
|
||||
* @brief Run-time detection if the host CPU supports the POPCNT extension.
|
||||
*
|
||||
* @return @c true if supported, @c false if not.
|
||||
*/
|
||||
static bool cpu_supports_popcnt()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
@@ -129,9 +145,15 @@ bool cpu_supports_popcnt()
|
||||
|
||||
return g_cpu_has_popcnt;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* See header for documentation. */
|
||||
bool cpu_supports_f16c()
|
||||
#if ASTCENC_F16C > 0
|
||||
/**
|
||||
* @brief Run-time detection if the host CPU supports F16C extension.
|
||||
*
|
||||
* @return @c true if supported, @c false if not.
|
||||
*/
|
||||
static bool cpu_supports_f16c()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
@@ -140,9 +162,15 @@ bool cpu_supports_f16c()
|
||||
|
||||
return g_cpu_has_f16c;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* See header for documentation. */
|
||||
bool cpu_supports_sse41()
|
||||
#if ASTCENC_SSE >= 41
|
||||
/**
|
||||
* @brief Run-time detection if the host CPU supports SSE 4.1 extension.
|
||||
*
|
||||
* @return @c true if supported, @c false if not.
|
||||
*/
|
||||
static bool cpu_supports_sse41()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
@@ -151,9 +179,15 @@ bool cpu_supports_sse41()
|
||||
|
||||
return g_cpu_has_sse41;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* See header for documentation. */
|
||||
bool cpu_supports_avx2()
|
||||
#if ASTCENC_AVX >= 2
|
||||
/**
|
||||
* @brief Run-time detection if the host CPU supports AVX 2 extension.
|
||||
*
|
||||
* @return @c true if supported, @c false if not.
|
||||
*/
|
||||
static bool cpu_supports_avx2()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
@@ -162,5 +196,81 @@ bool cpu_supports_avx2()
|
||||
|
||||
return g_cpu_has_avx2;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Print a string to stderr.
*
* The string is passed to fprintf as the argument of a "%s" format, so it is
* written as-is and any '%' characters it contains are not interpreted.
*
* @param format   The message text to print.
|
||||
*/
|
||||
static inline void print_error(
|
||||
const char* format
|
||||
) {
|
||||
fprintf(stderr, "%s", format);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Validate CPU ISA support meets the requirements of this build of the library.
|
||||
*
|
||||
* Each library build is statically compiled for a particular set of CPU ISA features, such as the
|
||||
* SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
|
||||
* actually supports everything this build needs.
|
||||
*
|
||||
* Each check is compiled in only when the matching ASTCENC_* macro is high enough. The first
* failing check prints an error message to stderr and stops validation.
*
* @return Return @c true if validated, @c false otherwise.
|
||||
*/
|
||||
static bool validate_cpu_isa()
|
||||
{
|
||||
// Only checked when the build is configured with ASTCENC_AVX >= 2
#if ASTCENC_AVX >= 2
|
||||
if (!cpu_supports_avx2())
|
||||
{
|
||||
print_error("ERROR: Host does not support AVX2 ISA extension\n");
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Only checked when the build is configured with ASTCENC_F16C >= 1
#if ASTCENC_F16C >= 1
|
||||
if (!cpu_supports_f16c())
|
||||
{
|
||||
print_error("ERROR: Host does not support F16C ISA extension\n");
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Only checked when the build is configured with ASTCENC_SSE >= 41
#if ASTCENC_SSE >= 41
|
||||
if (!cpu_supports_sse41())
|
||||
{
|
||||
print_error("ERROR: Host does not support SSE4.1 ISA extension\n");
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Only checked when the build is configured with ASTCENC_POPCNT >= 1
#if ASTCENC_POPCNT >= 1
|
||||
if (!cpu_supports_popcnt())
|
||||
{
|
||||
print_error("ERROR: Host does not support POPCNT ISA extension\n");
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Every compiled-in check passed (or none were compiled in)
return true;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// Fallback for builds with no dynamic ISA availability: there is nothing to
// check at run time, so validation always succeeds.
|
||||
static bool validate_cpu_isa()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/**
* @brief Process entry point; validates host ISA support before running the tool.
*
* If validate_cpu_isa() fails the process exits with status 1 without calling
* astcenc_main(); otherwise the arguments are forwarded unchanged.
*
* @param argc   The number of arguments.
* @param argv   The vector of arguments.
*
* @return 1 if ISA validation fails, otherwise the value returned by astcenc_main().
*/
int main(
|
||||
int argc,
|
||||
char **argv
|
||||
) {
|
||||
if (!validate_cpu_isa())
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
return astcenc_main(argc, argv);
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -25,44 +25,39 @@
|
||||
#include "astcenccli_internal.h"
|
||||
|
||||
/**
|
||||
* @brief An accumulator using Kahan compensated floating-point summation.
|
||||
*
|
||||
* This method keeps higher precision than direct summation by keeping track of
|
||||
* the error compensation factor @c comp which can be added into the next
|
||||
* calculation. This allows single precision floats to be used in places that
|
||||
* would otherwise need double precision, which is useful when vectorizing.
|
||||
* @brief An accumulator for errors.
|
||||
*/
|
||||
class kahan_accum4
|
||||
class error_accum4
|
||||
{
|
||||
public:
|
||||
/** @brief The running sum. */
|
||||
vfloat4 sum { vfloat4::zero() };
|
||||
|
||||
/** @brief The current compensation factor. */
|
||||
vfloat4 comp { vfloat4::zero() };
|
||||
double sum_r { 0.0 };
|
||||
double sum_g { 0.0 };
|
||||
double sum_b { 0.0 };
|
||||
double sum_a { 0.0 };
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief The incremental addition operator for Kahan summation.
|
||||
* @brief Incremental addition operator for error accumulators.
|
||||
*
|
||||
* @param val The Kahan accumulator to increment
|
||||
* @param val The accumulator to increment
|
||||
* @param inc The increment to apply
|
||||
*
|
||||
* @return The updated accumulator
|
||||
*/
|
||||
static kahan_accum4& operator+=(
|
||||
kahan_accum4 &val,
|
||||
static error_accum4& operator+=(
|
||||
error_accum4 &val,
|
||||
vfloat4 inc
|
||||
) {
|
||||
vfloat4 y = inc - val.comp;
|
||||
vfloat4 t = val.sum + y;
|
||||
val.comp = (t - val.sum) - y;
|
||||
val.sum = t;
|
||||
val.sum_r += static_cast<double>(inc.lane<0>());
|
||||
val.sum_g += static_cast<double>(inc.lane<1>());
|
||||
val.sum_b += static_cast<double>(inc.lane<2>());
|
||||
val.sum_a += static_cast<double>(inc.lane<3>());
|
||||
return val;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief mPSNR tonemapping operator for HDR images.
|
||||
* @brief mPSNR tone-mapping operator for HDR images.
|
||||
*
|
||||
* @param val The color value to tone map
|
||||
* @param fstop The exposure fstop; should be in range [-125, 125]
|
||||
@@ -124,10 +119,10 @@ void compute_error_metrics(
|
||||
static const int componentmasks[5] { 0x00, 0x07, 0x0C, 0x07, 0x0F };
|
||||
int componentmask = componentmasks[input_components];
|
||||
|
||||
kahan_accum4 errorsum;
|
||||
kahan_accum4 alpha_scaled_errorsum;
|
||||
kahan_accum4 log_errorsum;
|
||||
kahan_accum4 mpsnr_errorsum;
|
||||
error_accum4 errorsum;
|
||||
error_accum4 alpha_scaled_errorsum;
|
||||
error_accum4 log_errorsum;
|
||||
error_accum4 mpsnr_errorsum;
|
||||
double mean_angular_errorsum = 0.0;
|
||||
double worst_angular_errorsum = 0.0;
|
||||
|
||||
@@ -146,7 +141,7 @@ void compute_error_metrics(
|
||||
img2->dim_x, img2->dim_y, img2->dim_z);
|
||||
}
|
||||
|
||||
float rgb_peak = 0.0f;
|
||||
double rgb_peak = 0.0;
|
||||
unsigned int xsize1 = img1->dim_x;
|
||||
unsigned int xsize2 = img2->dim_x;
|
||||
|
||||
@@ -237,7 +232,10 @@ void compute_error_metrics(
|
||||
color2 = clamp(0, 65504.0f, color2);
|
||||
}
|
||||
|
||||
rgb_peak = astc::max(color1.lane<0>(), color1.lane<1>(), color1.lane<2>(), rgb_peak);
|
||||
rgb_peak = astc::max(static_cast<double>(color1.lane<0>()),
|
||||
static_cast<double>(color1.lane<1>()),
|
||||
static_cast<double>(color1.lane<2>()),
|
||||
rgb_peak);
|
||||
|
||||
vfloat4 diffcolor = color1 - color2;
|
||||
vfloat4 diffcolor_sq = diffcolor * diffcolor;
|
||||
@@ -291,106 +289,118 @@ void compute_error_metrics(
|
||||
}
|
||||
}
|
||||
|
||||
float pixels = static_cast<float>(dim_x * dim_y * dim_z);
|
||||
float num = 0.0f;
|
||||
float alpha_num = 0.0f;
|
||||
float log_num = 0.0f;
|
||||
float mpsnr_num = 0.0f;
|
||||
float samples = 0.0f;
|
||||
double pixels = static_cast<double>(dim_x * dim_y * dim_z);
|
||||
double samples = 0.0;
|
||||
|
||||
double num = 0.0;
|
||||
double alpha_num = 0.0;
|
||||
double log_num = 0.0;
|
||||
double mpsnr_num = 0.0;
|
||||
|
||||
if (componentmask & 1)
|
||||
{
|
||||
num += errorsum.sum.lane<0>();
|
||||
alpha_num += alpha_scaled_errorsum.sum.lane<0>();
|
||||
log_num += log_errorsum.sum.lane<0>();
|
||||
mpsnr_num += mpsnr_errorsum.sum.lane<0>();
|
||||
num += errorsum.sum_r;
|
||||
alpha_num += alpha_scaled_errorsum.sum_r;
|
||||
log_num += log_errorsum.sum_r;
|
||||
mpsnr_num += mpsnr_errorsum.sum_r;
|
||||
samples += pixels;
|
||||
}
|
||||
|
||||
if (componentmask & 2)
|
||||
{
|
||||
num += errorsum.sum.lane<1>();
|
||||
alpha_num += alpha_scaled_errorsum.sum.lane<1>();
|
||||
log_num += log_errorsum.sum.lane<1>();
|
||||
mpsnr_num += mpsnr_errorsum.sum.lane<1>();
|
||||
num += errorsum.sum_g;
|
||||
alpha_num += alpha_scaled_errorsum.sum_g;
|
||||
log_num += log_errorsum.sum_g;
|
||||
mpsnr_num += mpsnr_errorsum.sum_g;
|
||||
samples += pixels;
|
||||
}
|
||||
|
||||
if (componentmask & 4)
|
||||
{
|
||||
num += errorsum.sum.lane<2>();
|
||||
alpha_num += alpha_scaled_errorsum.sum.lane<2>();
|
||||
log_num += log_errorsum.sum.lane<2>();
|
||||
mpsnr_num += mpsnr_errorsum.sum.lane<2>();
|
||||
num += errorsum.sum_b;
|
||||
alpha_num += alpha_scaled_errorsum.sum_b;
|
||||
log_num += log_errorsum.sum_b;
|
||||
mpsnr_num += mpsnr_errorsum.sum_b;
|
||||
samples += pixels;
|
||||
}
|
||||
|
||||
if (componentmask & 8)
|
||||
{
|
||||
num += errorsum.sum.lane<3>();
|
||||
alpha_num += alpha_scaled_errorsum.sum.lane<3>();
|
||||
num += errorsum.sum_a;
|
||||
alpha_num += alpha_scaled_errorsum.sum_a;
|
||||
samples += pixels;
|
||||
}
|
||||
|
||||
float denom = samples;
|
||||
float stopcount = static_cast<float>(fstop_hi - fstop_lo + 1);
|
||||
float mpsnr_denom = pixels * 3.0f * stopcount * 255.0f * 255.0f;
|
||||
double denom = samples;
|
||||
double stopcount = static_cast<double>(fstop_hi - fstop_lo + 1);
|
||||
double mpsnr_denom = pixels * 3.0 * stopcount * 255.0 * 255.0;
|
||||
|
||||
float psnr;
|
||||
if (num == 0.0f)
|
||||
psnr = 999.0f;
|
||||
double psnr;
|
||||
if (num == 0.0)
|
||||
{
|
||||
psnr = 999.0;
|
||||
}
|
||||
else
|
||||
psnr = 10.0f * log10f(denom / num);
|
||||
{
|
||||
psnr = 10.0 * log10(denom / num);
|
||||
}
|
||||
|
||||
float rgb_psnr = psnr;
|
||||
double rgb_psnr = psnr;
|
||||
|
||||
printf("Quality metrics\n");
|
||||
printf("===============\n\n");
|
||||
|
||||
if (componentmask & 8)
|
||||
{
|
||||
printf(" PSNR (LDR-RGBA): %9.4f dB\n", static_cast<double>(psnr));
|
||||
printf(" PSNR (LDR-RGBA): %9.4f dB\n", psnr);
|
||||
|
||||
float alpha_psnr;
|
||||
if (alpha_num == 0.0f)
|
||||
alpha_psnr = 999.0f;
|
||||
double alpha_psnr;
|
||||
if (alpha_num == 0.0)
|
||||
{
|
||||
alpha_psnr = 999.0;
|
||||
}
|
||||
else
|
||||
alpha_psnr = 10.0f * log10f(denom / alpha_num);
|
||||
printf(" Alpha-weighted PSNR: %9.4f dB\n", static_cast<double>(alpha_psnr));
|
||||
{
|
||||
alpha_psnr = 10.0 * log10(denom / alpha_num);
|
||||
}
|
||||
printf(" Alpha-weighted PSNR: %9.4f dB\n", alpha_psnr);
|
||||
|
||||
float rgb_num = hadd_rgb_s(errorsum.sum);
|
||||
if (rgb_num == 0.0f)
|
||||
rgb_psnr = 999.0f;
|
||||
double rgb_num = errorsum.sum_r + errorsum.sum_g + errorsum.sum_b;
|
||||
if (rgb_num == 0.0)
|
||||
{
|
||||
rgb_psnr = 999.0;
|
||||
}
|
||||
else
|
||||
rgb_psnr = 10.0f * log10f(pixels * 3.0f / rgb_num);
|
||||
printf(" PSNR (LDR-RGB): %9.4f dB\n", static_cast<double>(rgb_psnr));
|
||||
{
|
||||
rgb_psnr = 10.0 * log10(pixels * 3.0 / rgb_num);
|
||||
}
|
||||
printf(" PSNR (LDR-RGB): %9.4f dB\n", rgb_psnr);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf(" PSNR (LDR-RGB): %9.4f dB\n", static_cast<double>(psnr));
|
||||
printf(" PSNR (LDR-RGB): %9.4f dB\n", psnr);
|
||||
}
|
||||
|
||||
if (compute_hdr_metrics)
|
||||
{
|
||||
printf(" PSNR (RGB norm to peak): %9.4f dB (peak %f)\n",
|
||||
static_cast<double>(rgb_psnr + 20.0f * log10f(rgb_peak)),
|
||||
static_cast<double>(rgb_peak));
|
||||
rgb_psnr + 20.0 * log10(rgb_peak), rgb_peak);
|
||||
|
||||
float mpsnr;
|
||||
if (mpsnr_num == 0.0f)
|
||||
double mpsnr;
|
||||
if (mpsnr_num == 0.0)
|
||||
{
|
||||
mpsnr = 999.0f;
|
||||
mpsnr = 999.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
mpsnr = 10.0f * log10f(mpsnr_denom / mpsnr_num);
|
||||
mpsnr = 10.0 * log10(mpsnr_denom / mpsnr_num);
|
||||
}
|
||||
|
||||
printf(" mPSNR (RGB): %9.4f dB (fstops %+d to %+d)\n",
|
||||
static_cast<double>(mpsnr), fstop_lo, fstop_hi);
|
||||
mpsnr, fstop_lo, fstop_hi);
|
||||
|
||||
float logrmse = astc::sqrt(log_num / pixels);
|
||||
printf(" LogRMSE (RGB): %9.4f\n", static_cast<double>(logrmse));
|
||||
double logrmse = sqrt(log_num / pixels);
|
||||
printf(" LogRMSE (RGB): %9.4f\n", logrmse);
|
||||
}
|
||||
|
||||
if (compute_normal_metrics)
|
||||
|
||||
@@ -36,12 +36,12 @@ astcenc_image *alloc_image(
|
||||
img->dim_y = dim_y;
|
||||
img->dim_z = dim_z;
|
||||
|
||||
void** data = new void*[dim_z];
|
||||
img->data = data;
|
||||
|
||||
if (bitness == 8)
|
||||
{
|
||||
void** data = new void*[dim_z];
|
||||
img->data_type = ASTCENC_TYPE_U8;
|
||||
img->data = data;
|
||||
|
||||
for (unsigned int z = 0; z < dim_z; z++)
|
||||
{
|
||||
data[z] = new uint8_t[dim_x * dim_y * 4];
|
||||
@@ -49,10 +49,7 @@ astcenc_image *alloc_image(
|
||||
}
|
||||
else if (bitness == 16)
|
||||
{
|
||||
void** data = new void*[dim_z];
|
||||
img->data_type = ASTCENC_TYPE_F16;
|
||||
img->data = data;
|
||||
|
||||
for (unsigned int z = 0; z < dim_z; z++)
|
||||
{
|
||||
data[z] = new uint16_t[dim_x * dim_y * 4];
|
||||
@@ -61,10 +58,7 @@ astcenc_image *alloc_image(
|
||||
else // if (bitness == 32)
|
||||
{
|
||||
assert(bitness == 32);
|
||||
void** data = new void*[dim_z];
|
||||
img->data_type = ASTCENC_TYPE_F32;
|
||||
img->data = data;
|
||||
|
||||
for (unsigned int z = 0; z < dim_z; z++)
|
||||
{
|
||||
data[z] = new float[dim_x * dim_y * 4];
|
||||
@@ -239,15 +233,18 @@ astcenc_image* astc_img_from_unorm8x4_array(
|
||||
/* See header for documentation. */
|
||||
float* floatx4_array_from_astc_img(
|
||||
const astcenc_image* img,
|
||||
bool y_flip
|
||||
bool y_flip,
|
||||
unsigned int z_index
|
||||
) {
|
||||
unsigned int dim_x = img->dim_x;
|
||||
unsigned int dim_y = img->dim_y;
|
||||
float *buf = new float[4 * dim_x * dim_y];
|
||||
|
||||
assert(z_index < img->dim_z);
|
||||
|
||||
if (img->data_type == ASTCENC_TYPE_U8)
|
||||
{
|
||||
uint8_t* data8 = static_cast<uint8_t*>(img->data[0]);
|
||||
uint8_t* data8 = static_cast<uint8_t*>(img->data[z_index]);
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
|
||||
@@ -264,7 +261,7 @@ float* floatx4_array_from_astc_img(
|
||||
}
|
||||
else if (img->data_type == ASTCENC_TYPE_F16)
|
||||
{
|
||||
uint16_t* data16 = static_cast<uint16_t*>(img->data[0]);
|
||||
uint16_t* data16 = static_cast<uint16_t*>(img->data[z_index]);
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
|
||||
@@ -287,7 +284,7 @@ float* floatx4_array_from_astc_img(
|
||||
else // if (img->data_type == ASTCENC_TYPE_F32)
|
||||
{
|
||||
assert(img->data_type == ASTCENC_TYPE_F32);
|
||||
float* data32 = static_cast<float*>(img->data[0]);
|
||||
float* data32 = static_cast<float*>(img->data[z_index]);
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -26,7 +26,7 @@
|
||||
|
||||
#include "astcenccli_internal.h"
|
||||
|
||||
// Configure the STB image imagewrite library build.
|
||||
// Configure the STB image write library build.
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#define STB_IMAGE_WRITE_IMPLEMENTATION
|
||||
#define STBI_NO_GIF
|
||||
@@ -61,7 +61,7 @@ static void astcenc_runtime_assert(bool condition)
|
||||
{
|
||||
if (!condition)
|
||||
{
|
||||
printf("ERROR: Corrupt input image\n");
|
||||
print_error("ERROR: Corrupt input image\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@@ -92,7 +92,7 @@ astcenc_image* load_png_with_wuffs(
|
||||
std::ifstream file(filename, std::ios::binary | std::ios::ate);
|
||||
if (!file)
|
||||
{
|
||||
printf("ERROR: Failed to load image %s (can't fopen)\n", filename);
|
||||
print_error("ERROR: Failed to load image %s (can't fopen)\n", filename);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -25,6 +25,8 @@
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
|
||||
#include "astcenccli_internal.h"
|
||||
|
||||
@@ -32,8 +34,39 @@
|
||||
#include "stb_image_write.h"
|
||||
#include "tinyexr.h"
|
||||
|
||||
/**
|
||||
* @brief Determine the output file name to use for a sliced image write.
|
||||
*
|
||||
* @param img The source data for the image.
|
||||
* @param filename The base name of the file to save.
|
||||
* @param index The slice index to write.
|
||||
*
|
||||
* @return The file name to use when saving the file.
|
||||
*/
|
||||
static std::string get_output_filename(
|
||||
const astcenc_image* img,
|
||||
const char* filename,
|
||||
unsigned int index
|
||||
) {
|
||||
if (img->dim_z <= 1)
|
||||
{
|
||||
return filename;
|
||||
}
|
||||
|
||||
std::string fnmod(filename);
|
||||
std::string fnext = fnmod.substr(fnmod.find_last_of("."));
|
||||
|
||||
// Remove the extension
|
||||
fnmod = fnmod.erase(fnmod.length() - fnext.size());
|
||||
|
||||
// Insert the file index into the base name, then append the extension
|
||||
std::stringstream ss;
|
||||
ss << fnmod << "_" << std::setw(3) << std::setfill('0') << index << fnext;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
/* ============================================================================
|
||||
Image load and store through the stb_iamge and tinyexr libraries
|
||||
Image load and store through the stb_image and tinyexr libraries
|
||||
============================================================================ */
|
||||
|
||||
/**
|
||||
@@ -59,7 +92,7 @@ static astcenc_image* load_image_with_tinyexr(
|
||||
int load_res = LoadEXR(&image, &dim_x, &dim_y, filename, &err);
|
||||
if (load_res != TINYEXR_SUCCESS)
|
||||
{
|
||||
printf("ERROR: Failed to load image %s (%s)\n", filename, err);
|
||||
print_error("ERROR: Failed to load image %s (%s)\n", filename, err);
|
||||
free(reinterpret_cast<void*>(const_cast<char*>(err)));
|
||||
return nullptr;
|
||||
}
|
||||
@@ -115,7 +148,7 @@ static astcenc_image* load_image_with_stb(
|
||||
}
|
||||
}
|
||||
|
||||
printf("ERROR: Failed to load image %s (%s)\n", filename, stbi_failure_reason());
|
||||
print_error("ERROR: Failed to load image %s (%s)\n", filename, stbi_failure_reason());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -133,9 +166,21 @@ static bool store_exr_image_with_tinyexr(
|
||||
const char* filename,
|
||||
int y_flip
|
||||
) {
|
||||
float *buf = floatx4_array_from_astc_img(img, y_flip);
|
||||
int res = SaveEXR(buf, img->dim_x, img->dim_y, 4, 1, filename, nullptr);
|
||||
delete[] buf;
|
||||
int res { 0 };
|
||||
|
||||
for (unsigned int i = 0; i < img->dim_z; i++)
|
||||
{
|
||||
std::string fnmod = get_output_filename(img, filename, i);
|
||||
float* buf = floatx4_array_from_astc_img(img, y_flip, i);
|
||||
|
||||
res = SaveEXR(buf, img->dim_x, img->dim_y, 4, 1, fnmod.c_str(), nullptr);
|
||||
delete[] buf;
|
||||
if (res < 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return res >= 0;
|
||||
}
|
||||
|
||||
@@ -153,11 +198,23 @@ static bool store_png_image_with_stb(
|
||||
const char* filename,
|
||||
int y_flip
|
||||
) {
|
||||
assert(img->data_type == ASTCENC_TYPE_U8);
|
||||
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[0]);
|
||||
int res { 0 };
|
||||
|
||||
assert(img->data_type == ASTCENC_TYPE_U8);
|
||||
|
||||
for (unsigned int i = 0; i < img->dim_z; i++)
|
||||
{
|
||||
std::string fnmod = get_output_filename(img, filename, i);
|
||||
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[i]);
|
||||
|
||||
stbi_flip_vertically_on_write(y_flip);
|
||||
res = stbi_write_png(fnmod.c_str(), img->dim_x, img->dim_y, 4, buf, img->dim_x * 4);
|
||||
if (res == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
stbi_flip_vertically_on_write(y_flip);
|
||||
int res = stbi_write_png(filename, img->dim_x, img->dim_y, 4, buf, img->dim_x * 4);
|
||||
return res != 0;
|
||||
}
|
||||
|
||||
@@ -175,11 +232,23 @@ static bool store_tga_image_with_stb(
|
||||
const char* filename,
|
||||
int y_flip
|
||||
) {
|
||||
assert(img->data_type == ASTCENC_TYPE_U8);
|
||||
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[0]);
|
||||
int res { 0 };
|
||||
|
||||
assert(img->data_type == ASTCENC_TYPE_U8);
|
||||
|
||||
for (unsigned int i = 0; i < img->dim_z; i++)
|
||||
{
|
||||
std::string fnmod = get_output_filename(img, filename, i);
|
||||
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[i]);
|
||||
|
||||
stbi_flip_vertically_on_write(y_flip);
|
||||
res = stbi_write_tga(fnmod.c_str(), img->dim_x, img->dim_y, 4, buf);
|
||||
if (res == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
stbi_flip_vertically_on_write(y_flip);
|
||||
int res = stbi_write_tga(filename, img->dim_x, img->dim_y, 4, buf);
|
||||
return res != 0;
|
||||
}
|
||||
|
||||
@@ -197,11 +266,23 @@ static bool store_bmp_image_with_stb(
|
||||
const char* filename,
|
||||
int y_flip
|
||||
) {
|
||||
assert(img->data_type == ASTCENC_TYPE_U8);
|
||||
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[0]);
|
||||
int res { 0 };
|
||||
|
||||
assert(img->data_type == ASTCENC_TYPE_U8);
|
||||
|
||||
for (unsigned int i = 0; i < img->dim_z; i++)
|
||||
{
|
||||
std::string fnmod = get_output_filename(img, filename, i);
|
||||
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[i]);
|
||||
|
||||
stbi_flip_vertically_on_write(y_flip);
|
||||
res = stbi_write_bmp(fnmod.c_str(), img->dim_x, img->dim_y, 4, buf);
|
||||
if (res == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
stbi_flip_vertically_on_write(y_flip);
|
||||
int res = stbi_write_bmp(filename, img->dim_x, img->dim_y, 4, buf);
|
||||
return res != 0;
|
||||
}
|
||||
|
||||
@@ -219,9 +300,21 @@ static bool store_hdr_image_with_stb(
|
||||
const char* filename,
|
||||
int y_flip
|
||||
) {
|
||||
float* buf = floatx4_array_from_astc_img(img, y_flip);
|
||||
int res = stbi_write_hdr(filename, img->dim_x, img->dim_y, 4, buf);
|
||||
delete[] buf;
|
||||
int res { 0 };
|
||||
|
||||
for (unsigned int i = 0; i < img->dim_z; i++)
|
||||
{
|
||||
std::string fnmod = get_output_filename(img, filename, i);
|
||||
float* buf = floatx4_array_from_astc_img(img, y_flip, i);
|
||||
|
||||
res = stbi_write_hdr(fnmod.c_str(), img->dim_x, img->dim_y, 4, buf);
|
||||
delete[] buf;
|
||||
if (res == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return res != 0;
|
||||
}
|
||||
|
||||
@@ -625,6 +718,16 @@ static uint32_t u32_byterev(uint32_t v)
|
||||
#define GL_LUMINANCE 0x1909
|
||||
#define GL_LUMINANCE_ALPHA 0x190A
|
||||
|
||||
#define GL_R8 0x8229
|
||||
#define GL_RG8 0x822B
|
||||
#define GL_RGB8 0x8051
|
||||
#define GL_RGBA8 0x8058
|
||||
|
||||
#define GL_R16F 0x822D
|
||||
#define GL_RG16F 0x822F
|
||||
#define GL_RGB16F 0x881B
|
||||
#define GL_RGBA16F 0x881A
|
||||
|
||||
#define GL_UNSIGNED_BYTE 0x1401
|
||||
#define GL_UNSIGNED_SHORT 0x1403
|
||||
#define GL_HALF_FLOAT 0x140B
|
||||
@@ -768,7 +871,7 @@ static unsigned int get_format(
|
||||
) {
|
||||
for (auto& it : ASTC_FORMATS)
|
||||
{
|
||||
if ((it.x == x) && (it.y == y) && (it.z == z) && (it.is_srgb == is_srgb))
|
||||
if ((it.x == x) && (it.y == y) && (it.z == z) && (it.is_srgb == is_srgb))
|
||||
{
|
||||
return it.format;
|
||||
}
|
||||
@@ -794,7 +897,7 @@ struct ktx_header
|
||||
uint32_t bytes_of_key_value_data; // size in bytes of the key-and-value area immediately following the header.
|
||||
};
|
||||
|
||||
// magic 12-byte sequence that must appear at the beginning of every KTX file.
|
||||
// Magic 12-byte sequence that must appear at the beginning of every KTX file.
|
||||
static uint8_t ktx_magic[12] {
|
||||
0xAB, 0x4B, 0x54, 0x58, 0x20, 0x31, 0x31, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A
|
||||
};
|
||||
@@ -909,9 +1012,9 @@ static astcenc_image* load_ktx_uncompressed_image(
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Although these are set up later, we include a default initializer to remove warnings
|
||||
int bytes_per_component = 1; // bytes per component in the KTX file.
|
||||
int bitness = 8; // internal precision we will use in the codec.
|
||||
// Although these are set up later, use default initializer to remove warnings
|
||||
int bitness = 8; // Internal precision after conversion
|
||||
int bytes_per_component = 1; // Bytes per component in the KTX file
|
||||
scanline_transfer copy_method = R8_TO_RGBA8;
|
||||
|
||||
switch (hdr.gl_type)
|
||||
@@ -1017,7 +1120,7 @@ static astcenc_image* load_ktx_uncompressed_image(
|
||||
}
|
||||
case GL_FLOAT:
|
||||
{
|
||||
bitness = 32;
|
||||
bitness = 16;
|
||||
bytes_per_component = 4;
|
||||
switch (hdr.gl_format)
|
||||
{
|
||||
@@ -1126,7 +1229,7 @@ static astcenc_image* load_ktx_uncompressed_image(
|
||||
}
|
||||
}
|
||||
|
||||
// then transfer data from the surface to our own image-data-structure.
|
||||
// Transfer data from the surface to our own image data structure
|
||||
astcenc_image *astc_img = alloc_image(bitness, dim_x, dim_y, dim_z);
|
||||
|
||||
for (unsigned int z = 0; z < dim_z; z++)
|
||||
@@ -1155,7 +1258,7 @@ static astcenc_image* load_ktx_uncompressed_image(
|
||||
}
|
||||
|
||||
delete[] buf;
|
||||
is_hdr = bitness == 32;
|
||||
is_hdr = bitness >= 16;
|
||||
component_count = components;
|
||||
return astc_img;
|
||||
}
|
||||
@@ -1352,7 +1455,15 @@ static bool store_ktx_uncompressed_image(
|
||||
ktx_header hdr;
|
||||
|
||||
static const int gl_format_of_components[4] {
|
||||
GL_LUMINANCE, GL_LUMINANCE_ALPHA, GL_RGB, GL_RGBA
|
||||
GL_RED, GL_RG, GL_RGB, GL_RGBA
|
||||
};
|
||||
|
||||
static const int gl_sized_format_of_components_ldr[4] {
|
||||
GL_R8, GL_RG8, GL_RGB8, GL_RGBA8
|
||||
};
|
||||
|
||||
static const int gl_sized_format_of_components_hdr[4] {
|
||||
GL_R16F, GL_RG16F, GL_RGB16F, GL_RGBA16F
|
||||
};
|
||||
|
||||
memcpy(hdr.magic, ktx_magic, 12);
|
||||
@@ -1360,8 +1471,15 @@ static bool store_ktx_uncompressed_image(
|
||||
hdr.gl_type = (bitness == 16) ? GL_HALF_FLOAT : GL_UNSIGNED_BYTE;
|
||||
hdr.gl_type_size = bitness / 8;
|
||||
hdr.gl_format = gl_format_of_components[image_components - 1];
|
||||
hdr.gl_internal_format = gl_format_of_components[image_components - 1];
|
||||
hdr.gl_base_internal_format = gl_format_of_components[image_components - 1];
|
||||
if (bitness == 16)
|
||||
{
|
||||
hdr.gl_internal_format = gl_sized_format_of_components_hdr[image_components - 1];
|
||||
}
|
||||
else
|
||||
{
|
||||
hdr.gl_internal_format = gl_sized_format_of_components_ldr[image_components - 1];
|
||||
}
|
||||
hdr.gl_base_internal_format = hdr.gl_format;
|
||||
hdr.pixel_width = dim_x;
|
||||
hdr.pixel_height = dim_y;
|
||||
hdr.pixel_depth = (dim_z == 1) ? 0 : dim_z;
|
||||
@@ -1915,7 +2033,7 @@ static astcenc_image* load_dds_uncompressed_image(
|
||||
}
|
||||
|
||||
delete[] buf;
|
||||
is_hdr = bitness == 16;
|
||||
is_hdr = bitness >= 16;
|
||||
component_count = components;
|
||||
return astc_img;
|
||||
}
|
||||
@@ -2295,7 +2413,7 @@ bool store_ncimage(
|
||||
eptr = ".ktx"; // use KTX file format if we don't have an ending.
|
||||
}
|
||||
|
||||
for (int i=0; i < storer_descr_count; i++)
|
||||
for (int i = 0; i < storer_descr_count; i++)
|
||||
{
|
||||
if (strcmp(eptr, storer_descs[i].ending1) == 0
|
||||
|| strcmp(eptr, storer_descs[i].ending2) == 0)
|
||||
@@ -2338,7 +2456,6 @@ static unsigned int unpack_bytes(
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
// TODO: Return a bool?
|
||||
int load_cimage(
|
||||
const char* filename,
|
||||
astc_compressed_image& img
|
||||
@@ -2346,22 +2463,22 @@ int load_cimage(
|
||||
std::ifstream file(filename, std::ios::in | std::ios::binary);
|
||||
if (!file)
|
||||
{
|
||||
printf("ERROR: File open failed '%s'\n", filename);
|
||||
print_error("ERROR: File open failed '%s'\n", filename);
|
||||
return 1;
|
||||
}
|
||||
|
||||
astc_header hdr;
|
||||
file.read(reinterpret_cast<char*>(&hdr), sizeof(astc_header));
|
||||
if (!file)
|
||||
if (file.fail())
|
||||
{
|
||||
printf("ERROR: File read failed '%s'\n", filename);
|
||||
print_error("ERROR: File read failed '%s'\n", filename);
|
||||
return 1;
|
||||
}
|
||||
|
||||
unsigned int magicval = unpack_bytes(hdr.magic[0], hdr.magic[1], hdr.magic[2], hdr.magic[3]);
|
||||
if (magicval != ASTC_MAGIC_ID)
|
||||
{
|
||||
printf("ERROR: File not recognized '%s'\n", filename);
|
||||
print_error("ERROR: File not recognized '%s'\n", filename);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -2376,7 +2493,7 @@ int load_cimage(
|
||||
|
||||
if (dim_x == 0 || dim_y == 0 || dim_z == 0)
|
||||
{
|
||||
printf("ERROR: File corrupt '%s'\n", filename);
|
||||
print_error("ERROR: Image header corrupt '%s'\n", filename);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -2388,9 +2505,10 @@ int load_cimage(
|
||||
uint8_t *buffer = new uint8_t[data_size];
|
||||
|
||||
file.read(reinterpret_cast<char*>(buffer), data_size);
|
||||
if (!file)
|
||||
if (file.fail())
|
||||
{
|
||||
printf("ERROR: File read failed '%s'\n", filename);
|
||||
print_error("ERROR: Image data size exceeded file size '%s'\n", filename);
|
||||
delete[] buffer;
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -2406,7 +2524,6 @@ int load_cimage(
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
// TODO: Return a bool?
|
||||
int store_cimage(
|
||||
const astc_compressed_image& img,
|
||||
const char* filename
|
||||
@@ -2436,7 +2553,7 @@ int store_cimage(
|
||||
std::ofstream file(filename, std::ios::out | std::ios::binary);
|
||||
if (!file)
|
||||
{
|
||||
printf("ERROR: File open failed '%s'\n", filename);
|
||||
print_error("ERROR: File open failed '%s'\n", filename);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -68,6 +68,9 @@ struct cli_config_options
|
||||
/** @brief The number of threads to use for processing. */
|
||||
unsigned int thread_count;
|
||||
|
||||
/** @brief The number of repeats to execute for benchmarking. */
|
||||
unsigned int repeat_count;
|
||||
|
||||
/** @brief The number of image slices to load for a 3D image. */
|
||||
unsigned int array_size;
|
||||
|
||||
@@ -77,6 +80,9 @@ struct cli_config_options
|
||||
/** @brief @c true if the images should be y-flipped. */
|
||||
bool y_flip;
|
||||
|
||||
/** @brief @c true if diagnostic images should be stored. */
|
||||
bool diagnostic_images;
|
||||
|
||||
/** @brief The low exposure fstop for error computation. */
|
||||
int low_fstop;
|
||||
|
||||
@@ -90,6 +96,26 @@ struct cli_config_options
|
||||
astcenc_swizzle swz_decode;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Print a string to stderr.
|
||||
*/
|
||||
static inline void print_error(
|
||||
const char* format
|
||||
) {
|
||||
fprintf(stderr, "%s", format);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Print a formatted string to stderr.
|
||||
*/
|
||||
template<typename ... _Args>
|
||||
static inline void print_error(
|
||||
const char* format,
|
||||
_Args...args
|
||||
) {
|
||||
fprintf(stderr, format, args...);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Load uncompressed image.
|
||||
*
|
||||
@@ -271,18 +297,20 @@ astcenc_image* astc_img_from_unorm8x4_array(
|
||||
bool y_flip);
|
||||
|
||||
/**
|
||||
* @brief Create a flattened RGBA FLOAT32 data array from an image structure.
|
||||
* @brief Create a flattened RGBA FLOAT32 data array for a single slice from an image structure.
|
||||
*
|
||||
* The returned data array is allocated with @c new[] and must be freed with a @c delete[] call.
|
||||
*
|
||||
* @param img The input image.
|
||||
* @param y_flip Should the data in the array be Y flipped?
|
||||
* @param img The input image.
|
||||
* @param y_flip Should the data in the array be Y flipped?
|
||||
* @param z_index The slice index to convert.
|
||||
*
|
||||
* @return The data array.
|
||||
*/
|
||||
float* floatx4_array_from_astc_img(
|
||||
const astcenc_image* img,
|
||||
bool y_flip);
|
||||
bool y_flip,
|
||||
unsigned int z_index);
|
||||
|
||||
/**
|
||||
* @brief Create a flattened RGBA UNORM8 data array from an image structure.
|
||||
@@ -357,14 +385,28 @@ int get_cpu_count();
|
||||
* All threads run the same thread function, and have the same thread payload, but are given a
|
||||
* unique thread ID (0 .. N-1) as a parameter to the run function to allow thread-specific behavior.
|
||||
*
|
||||
|* @param thread_count The number of threads to spawn.
|
||||
* @param func The function to execute. Must have the signature:
|
||||
* void (int thread_count, int thread_id, void* payload)
|
||||
* @param payload Pointer to an opaque thread payload object.
|
||||
* @param operation The name of the operation for this async task.
|
||||
* @param thread_count The number of threads to spawn.
|
||||
* @param func The function to execute. Must have the signature:
|
||||
* void (int thread_count, int thread_id, void* payload)
|
||||
* @param payload Pointer to an opaque thread payload object.
|
||||
*/
|
||||
void launch_threads(
|
||||
const char* operation,
|
||||
int thread_count,
|
||||
void (*func)(int, int, void*),
|
||||
void *payload);
|
||||
|
||||
/**
|
||||
* @brief The main entry point.
|
||||
*
|
||||
* @param argc The number of arguments.
|
||||
* @param argv The vector of arguments.
|
||||
*
|
||||
* @return 0 on success, non-zero otherwise.
|
||||
*/
|
||||
int astcenc_main(
|
||||
int argc,
|
||||
char **argv);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -38,7 +38,7 @@
|
||||
#if defined(_WIN32) && !defined(__CYGWIN__)
|
||||
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <Windows.h>
|
||||
#include <windows.h>
|
||||
|
||||
/** @brief Alias pthread_t to one of the internal Windows types. */
|
||||
typedef HANDLE pthread_t;
|
||||
@@ -58,9 +58,61 @@ static int pthread_create(
|
||||
static_cast<void>(attribs);
|
||||
LPTHREAD_START_ROUTINE func = reinterpret_cast<LPTHREAD_START_ROUTINE>(threadfunc);
|
||||
*thread = CreateThread(nullptr, 0, func, thread_arg, 0, nullptr);
|
||||
|
||||
// Ensure we return 0 on success, non-zero on error
|
||||
if (*thread == NULL)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Manually set CPU group and thread affinity.
|
||||
*
|
||||
* This is needed on Windows 10 or older to allow benefit from large core count
|
||||
* systems with more than 64 logical CPUs. The assignment is skipped on systems
|
||||
* with a single processor group, as it is not necessary.
|
||||
*/
|
||||
static void set_group_affinity(
|
||||
pthread_t thread,
|
||||
int thread_index
|
||||
) {
|
||||
// Skip thread assignment for hardware with a single CPU group
|
||||
int group_count = GetActiveProcessorGroupCount();
|
||||
if (group_count == 1)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Ensure we have a valid assign if user creates more threads than cores
|
||||
int assign_index = thread_index % get_cpu_count();
|
||||
int assign_group { 0 };
|
||||
int assign_group_cpu_count { 0 };
|
||||
|
||||
// Determine which core group and core in the group to use for this thread
|
||||
int group_cpu_count_sum { 0 };
|
||||
for (int group = 0; group < group_count; group++)
|
||||
{
|
||||
int group_cpu_count = static_cast<int>(GetMaximumProcessorCount(group));
|
||||
group_cpu_count_sum += group_cpu_count;
|
||||
|
||||
if (assign_index < group_cpu_count_sum)
|
||||
{
|
||||
assign_group = group;
|
||||
assign_group_cpu_count = group_cpu_count;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Set the affinity to the assigned group, and all supported cores
|
||||
GROUP_AFFINITY affinity {};
|
||||
affinity.Mask = (1 << assign_group_cpu_count) - 1;
|
||||
affinity.Group = assign_group;
|
||||
SetThreadGroupAffinity(thread, &affinity, nullptr);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Proxy Windows @c WaitForSingleObject underneath a pthreads-like wrapper.
|
||||
*/
|
||||
@@ -76,9 +128,8 @@ static int pthread_join(
|
||||
/* See header for documentation */
|
||||
int get_cpu_count()
|
||||
{
|
||||
SYSTEM_INFO sysinfo;
|
||||
GetSystemInfo(&sysinfo);
|
||||
return sysinfo.dwNumberOfProcessors;
|
||||
DWORD cpu_count = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
|
||||
return static_cast<int>(cpu_count);
|
||||
}
|
||||
|
||||
/* See header for documentation */
|
||||
@@ -151,6 +202,7 @@ static void* launch_threads_helper(
|
||||
|
||||
/* See header for documentation */
|
||||
void launch_threads(
|
||||
const char* operation,
|
||||
int thread_count,
|
||||
void (*func)(int, int, void*),
|
||||
void *payload
|
||||
@@ -163,22 +215,58 @@ void launch_threads(
|
||||
}
|
||||
|
||||
// Otherwise spawn worker threads
|
||||
launch_desc *thread_descs = new launch_desc[thread_count];
|
||||
launch_desc *thread_descs = new launch_desc[thread_count];
|
||||
int actual_thread_count { 0 };
|
||||
|
||||
for (int i = 0; i < thread_count; i++)
|
||||
{
|
||||
thread_descs[i].thread_count = thread_count;
|
||||
thread_descs[i].thread_id = i;
|
||||
thread_descs[i].payload = payload;
|
||||
thread_descs[i].func = func;
|
||||
thread_descs[actual_thread_count].thread_count = thread_count;
|
||||
thread_descs[actual_thread_count].thread_id = actual_thread_count;
|
||||
thread_descs[actual_thread_count].payload = payload;
|
||||
thread_descs[actual_thread_count].func = func;
|
||||
|
||||
pthread_create(&(thread_descs[i].thread_handle), nullptr,
|
||||
launch_threads_helper, reinterpret_cast<void*>(thread_descs + i));
|
||||
// Handle pthread_create failing by simply using fewer threads
|
||||
int error = pthread_create(
|
||||
&(thread_descs[actual_thread_count].thread_handle),
|
||||
nullptr,
|
||||
launch_threads_helper,
|
||||
reinterpret_cast<void*>(thread_descs + actual_thread_count));
|
||||
|
||||
// Track how many threads we actually created
|
||||
if (!error)
|
||||
{
|
||||
// Windows needs explicit thread assignment to handle large core count systems
|
||||
#if defined(_WIN32) && !defined(__CYGWIN__)
|
||||
set_group_affinity(
|
||||
thread_descs[actual_thread_count].thread_handle,
|
||||
actual_thread_count);
|
||||
#endif
|
||||
|
||||
actual_thread_count++;
|
||||
}
|
||||
}
|
||||
|
||||
// ... and then wait for them to complete
|
||||
for (int i = 0; i < thread_count; i++)
|
||||
// If we did not create thread_count threads then emit a warning
|
||||
if (actual_thread_count != thread_count)
|
||||
{
|
||||
int log_count = actual_thread_count == 0 ? 1 : actual_thread_count;
|
||||
const char* log_s = log_count == 1 ? "" : "s";
|
||||
printf("WARNING: %s using %d thread%s due to thread creation error\n\n",
|
||||
operation, log_count, log_s);
|
||||
}
|
||||
|
||||
// If we managed to spawn any threads wait for them to complete
|
||||
if (actual_thread_count != 0)
|
||||
{
|
||||
pthread_join(thread_descs[i].thread_handle, nullptr);
|
||||
for (int i = 0; i < actual_thread_count; i++)
|
||||
{
|
||||
pthread_join(thread_descs[i].thread_handle, nullptr);
|
||||
}
|
||||
}
|
||||
// Else fall back to using this thread
|
||||
else
|
||||
{
|
||||
func(1, 0, payload);
|
||||
}
|
||||
|
||||
delete[] thread_descs;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -25,7 +25,7 @@
|
||||
/** @brief The version header. */
|
||||
static const char *astcenc_copyright_string =
|
||||
R"(astcenc v%s, %u-bit %s%s%s
|
||||
Copyright 2011-%s Arm Limited, all rights reserved
|
||||
Copyright (c) 2011-%s Arm Limited. All rights reserved.
|
||||
)";
|
||||
|
||||
/** @brief The short-form help text. */
|
||||
@@ -138,14 +138,15 @@ COMPRESSION
|
||||
The quality level configures the quality-performance tradeoff for
|
||||
the compressor; more complete searches of the search space improve
|
||||
image quality at the expense of compression time. The quality level
|
||||
can be set to any value between 0 (fastest) and 100 (thorough), or
|
||||
to a fixed quality preset:
|
||||
can be set to any value between 0 (fastest) and 100 (exhaustive),
|
||||
or to a fixed quality preset:
|
||||
|
||||
-fastest (equivalent to quality = 0)
|
||||
-fast (equivalent to quality = 10)
|
||||
-medium (equivalent to quality = 60)
|
||||
-thorough (equivalent to quality = 98)
|
||||
-exhaustive (equivalent to quality = 100)
|
||||
-fastest (equivalent to quality = 0)
|
||||
-fast (equivalent to quality = 10)
|
||||
-medium (equivalent to quality = 60)
|
||||
-thorough (equivalent to quality = 98)
|
||||
-verythorough (equivalent to quality = 99)
|
||||
-exhaustive (equivalent to quality = 100)
|
||||
|
||||
For compression of production content we recommend using a quality
|
||||
level equivalent to -medium or higher.
|
||||
@@ -158,11 +159,17 @@ COMPRESSION
|
||||
to consider for common usage, based on the type of image data being
|
||||
compressed.
|
||||
|
||||
-mask
|
||||
The input texture is a mask texture with unrelated data stored
|
||||
in the various color components, so enable error heuristics that
|
||||
aim to improve quality by minimizing the effect of error
|
||||
cross-talk across the color components.
|
||||
-decode_unorm8
|
||||
Indicate that an LDR compressed texture will be used with
|
||||
the decode_unorm8 extension behavior, instead of the default
|
||||
decode_unorm16 decompression.
|
||||
|
||||
Matching the decode mode used during compression to the mode
|
||||
used at runtime will improve image quality as the compressor
|
||||
can ensure that rounding goes the right way.
|
||||
|
||||
This mode is used automatically if you decompress to an 8-bit
|
||||
per component output image format.
|
||||
|
||||
-normal
|
||||
The input texture is a three component linear LDR normal map
|
||||
@@ -175,6 +182,9 @@ COMPRESSION
|
||||
nml.xy = nml.xy * 2.0 - 1.0; // Unpack to [-1,1]
|
||||
nml.z = sqrt(1 - dot(nml.xy, nml.xy)); // Compute Z
|
||||
|
||||
Alternative component swizzles can be set with -esw and -dsw
|
||||
parameters.
|
||||
|
||||
-rgbm <max>
|
||||
The input texture is an RGBM encoded texture, storing HDR
|
||||
values between 0 and <max> in an LDR container format with a
|
||||
@@ -193,8 +203,8 @@ COMPRESSION
|
||||
typically lowers the measured PSNR score. Perceptual methods are
|
||||
currently only available for normal maps and RGB color data.
|
||||
|
||||
-array <size>
|
||||
Loads an array of <size> 2D image slices to use as a 3D image.
|
||||
-zdim <zdim>
|
||||
Load a sequence of <zdim> 2D image slices to use as a 3D image.
|
||||
The input filename given is decorated with the postfix
|
||||
"_<slice>" to find the file to load. For example, an input named
|
||||
"input.png" would load as input_0.png, input_1.png, etc.
|
||||
@@ -270,53 +280,71 @@ ADVANCED COMPRESSION
|
||||
Higher numbers give better quality, as more complex blocks can
|
||||
be encoded, but will increase search time. Preset defaults are:
|
||||
|
||||
-fastest : 2
|
||||
-fast : 3
|
||||
-medium : 4
|
||||
-thorough : 4
|
||||
-exhaustive : 4
|
||||
-fastest : 2
|
||||
-fast : 3
|
||||
-medium : 4
|
||||
-thorough : 4
|
||||
-verythorough : 4
|
||||
-exhaustive : 4
|
||||
|
||||
-partitionindexlimit <number>
|
||||
Test <number> block partition indices for each partition count.
|
||||
Higher numbers give better quality, however large values give
|
||||
diminishing returns especially for smaller block sizes. Preset
|
||||
defaults are:
|
||||
-[2|3|4]partitionindexlimit <number>
|
||||
Estimate errors for <number> block partition indices for this
|
||||
partition count. Higher numbers give better quality, however
|
||||
large values give diminishing returns especially for smaller
|
||||
block sizes. Preset defaults are:
|
||||
|
||||
-fastest : 8
|
||||
-fast : 12
|
||||
-medium : 26
|
||||
-thorough : 76
|
||||
-exhaustive : 1024
|
||||
-fastest : 10 | 6 | 4
|
||||
-fast : 18 | 10 | 8
|
||||
-medium : 34 | 28 | 16
|
||||
-thorough : 82 | 60 | 30
|
||||
-verythorough : 256 | 128 | 64
|
||||
-exhaustive : 512 | 512 | 512
|
||||
|
||||
-[2|3|4]partitioncandidatelimit <number>
|
||||
Calculate errors for <number> block partition indices for this
|
||||
partition count. Higher numbers give better quality, however
|
||||
large values give diminishing returns especially for smaller
|
||||
block sizes. Preset defaults are:
|
||||
|
||||
-fastest : 2 | 2 | 2
|
||||
-fast : 2 | 2 | 2
|
||||
-medium : 2 | 2 | 2
|
||||
-thorough : 3 | 2 | 2
|
||||
-verythorough : 20 | 14 | 8
|
||||
-exhaustive : 32 | 32 | 32
|
||||
|
||||
-blockmodelimit <number>
|
||||
Test block modes below <number> usage centile in an empirically
|
||||
determined distribution of block mode frequency. This option is
|
||||
ineffective for 3D textures. Preset defaults are:
|
||||
|
||||
-fastest : 40
|
||||
-fast : 55
|
||||
-medium : 76
|
||||
-thorough : 93
|
||||
-exhaustive : 100
|
||||
-fastest : 43
|
||||
-fast : 55
|
||||
-medium : 77
|
||||
-thorough : 94
|
||||
-verythorough : 98
|
||||
-exhaustive : 100
|
||||
|
||||
-refinementlimit <value>
|
||||
Iterate only <value> refinement iterations on colors and
|
||||
-refinementlimit <number>
|
||||
Iterate <number> refinement iterations on colors and
|
||||
weights. Minimum value is 1. Preset defaults are:
|
||||
|
||||
-fastest : 2
|
||||
-fast : 3
|
||||
-medium : 3
|
||||
-thorough : 4
|
||||
-exhaustive : 4
|
||||
-fastest : 2
|
||||
-fast : 3
|
||||
-medium : 3
|
||||
-thorough : 4
|
||||
-verythorough : 4
|
||||
-exhaustive : 4
|
||||
|
||||
-candidatelimit <value>
|
||||
Trial only <value> candidate encodings for each block mode:
|
||||
-candidatelimit <number>
|
||||
Trial <number> candidate encodings for each block mode:
|
||||
|
||||
-fastest : 2
|
||||
-fast : 3
|
||||
-medium : 3
|
||||
-thorough : 4
|
||||
-exhaustive : 4
|
||||
-fastest : 2
|
||||
-fast : 3
|
||||
-medium : 3
|
||||
-thorough : 4
|
||||
-verythorough : 6
|
||||
-exhaustive : 8
|
||||
|
||||
-dblimit <number>
|
||||
Stop compression work on a block as soon as the PSNR of the
|
||||
@@ -324,37 +352,26 @@ ADVANCED COMPRESSION
|
||||
ineffective for HDR textures. Preset defaults, where N is the
|
||||
number of texels in a block, are:
|
||||
|
||||
-fastest : MAX(63-19*log10(N), 85-35*log10(N))
|
||||
-fast : MAX(63-19*log10(N), 85-35*log10(N))
|
||||
-medium : MAX(70-19*log10(N), 95-35*log10(N))
|
||||
-thorough : MAX(77-19*log10(N), 105-35*log10(N))
|
||||
-exhaustive : 999
|
||||
-fastest : MAX(63-19*log10(N), 85-35*log10(N))
|
||||
-fast : MAX(63-19*log10(N), 85-35*log10(N))
|
||||
-medium : MAX(70-19*log10(N), 95-35*log10(N))
|
||||
-thorough : MAX(77-19*log10(N), 105-35*log10(N))
|
||||
-verythorough : 999
|
||||
-exhaustive : 999
|
||||
|
||||
-2partitionlimitfactor <factor>
|
||||
-[2|3]partitionlimitfactor <factor>
|
||||
Stop compression work on a block after only testing blocks with
|
||||
up to two partitions and one plane of weights, unless the two
|
||||
up to 2/3 partitions and one plane of weights, unless the 2/3
|
||||
partition error term is lower than the error term from encoding
|
||||
with one partition by more than the specified factor. Preset
|
||||
with 1/2 partitions by more than the specified factor. Preset
|
||||
defaults are:
|
||||
|
||||
-fastest : 1.0
|
||||
-fast : 1.0
|
||||
-medium : 1.2
|
||||
-thorough : 2.5
|
||||
-exhaustive : 10.0
|
||||
|
||||
-3partitionlimitfactor <factor>
|
||||
Stop compression work on a block after only testing blocks with
|
||||
up to three partitions and one plane of weights, unless the three
|
||||
partition error term is lower than the error term from encoding
|
||||
with two partitions by more than the specified factor. Preset
|
||||
defaults are:
|
||||
|
||||
-fastest : 1.00
|
||||
-fast : 1.10
|
||||
-medium : 1.25
|
||||
-thorough : 1.25
|
||||
-exhaustive : 10.00
|
||||
-fastest : 1.00 | 1.00
|
||||
-fast : 1.00 | 1.00
|
||||
-medium : 1.10 | 1.05
|
||||
-thorough : 1.35 | 1.15
|
||||
-verythorough : 1.60 | 1.40
|
||||
-exhaustive : 2.00 | 2.00
|
||||
|
||||
-2planelimitcorrelation <factor>
|
||||
Stop compression after testing only one plane of weights, unless
|
||||
@@ -362,53 +379,57 @@ ADVANCED COMPRESSION
|
||||
components is below this factor. This option is ineffective for
|
||||
normal maps. Preset defaults are:
|
||||
|
||||
-fastest : 0.50
|
||||
-fast : 0.65
|
||||
-medium : 0.85
|
||||
-thorough : 0.95
|
||||
-exhaustive : 0.99
|
||||
|
||||
-lowweightmodelimit <weight count>
|
||||
Use a simpler weight search for weight counts less than or
|
||||
equal to this threshold. Preset defaults are bitrate dependent:
|
||||
|
||||
-fastest : 25
|
||||
-fast : 20
|
||||
-medium : 16
|
||||
-thorough : 12
|
||||
-exhaustive : 0
|
||||
|
||||
-fastest : 0.50
|
||||
-fast : 0.65
|
||||
-medium : 0.85
|
||||
-thorough : 0.95
|
||||
-verythorough : 0.98
|
||||
-exhaustive : 0.99
|
||||
)"
|
||||
// This split in the literals is needed for Visual Studio; the compiler
|
||||
// will concatenate these two strings together ...
|
||||
R"(
|
||||
Other options
|
||||
-------------
|
||||
|
||||
-esw <swizzle>
|
||||
Swizzle the color components before compression. The swizzle is
|
||||
specified using a 4-character string, which defines the output
|
||||
format ordering. The characters may be taken from the set
|
||||
[rgba01], selecting either input color components or a literal
|
||||
zero or one. For example to swap the RG components, and replace
|
||||
alpha with 1, the swizzle 'grb1' should be used.
|
||||
Specify an encoding swizzle to reorder the color components
|
||||
before compression. The swizzle is specified using a four
|
||||
character string, which defines the format ordering used by
|
||||
the compressor.
|
||||
|
||||
The input swizzle takes place before any compression, and all
|
||||
error weighting applied using the -cw option is applied to the
|
||||
post-swizzle component ordering.
|
||||
The characters may be taken from the set [rgba01], selecting
|
||||
either input color components or a literal zero or one. For
|
||||
example to swap the RG components, and replace alpha with 1,
|
||||
the swizzle 'grb1' should be used.
|
||||
|
||||
By default all 4 post-swizzle components are included in the
|
||||
error metrics during compression. When using -esw to map two
|
||||
compression error metrics. When using -esw to map two
|
||||
component data to the L+A endpoint (e.g. -esw rrrg) the
|
||||
luminance data stored in the RGB components will be weighted 3
|
||||
times more strongly than the alpha component. This can be
|
||||
corrected using the -cw option to zero the weights of unused
|
||||
components; e.g. using -cw 1 0 0 1.
|
||||
corrected using the -ssw option to specify which components
|
||||
will be sampled at runtime e.g. -ssw ra.
|
||||
|
||||
-ssw <swizzle>
|
||||
Specify a sampling swizzle to identify which color components
|
||||
are actually read by the application shader program. For example,
|
||||
using -ssw ra tells the compressor that the green and blue error
|
||||
does not matter because the data is not actually read.
|
||||
|
||||
The sampling swizzle is based on the channel ordering after the
|
||||
-esw transform has been applied. Note -ssw exposes the same
|
||||
functionality as -cw, but in a more user-friendly form.
|
||||
|
||||
-dsw <swizzle>
|
||||
Swizzle the color components after decompression. The swizzle is
|
||||
specified using the same method as the -esw option, with support
|
||||
for an additional "z" character. This is used to specify that
|
||||
the compressed data stores an X+Y normal map, and that the Z
|
||||
output component should be reconstructed from the two components
|
||||
stored in the data. For the typical ASTC normal encoding, which
|
||||
uses an 'rrrg' compression swizzle, you should specify an 'raz1'
|
||||
Specify a decompression swizzle used to reorder the color
|
||||
components after decompression. The swizzle is specified using
|
||||
the same method as the -esw option, with support for an extra
|
||||
"z" character. This is used to specify that the compressed data
|
||||
stores an X+Y normal map, and that the Z output component
|
||||
should be reconstructed from the two components stored in the
|
||||
data. For the typical ASTC normal encoding, which uses an
|
||||
'rrrg' compression swizzle, you should specify an 'raz1'
|
||||
swizzle for decompression.
|
||||
|
||||
-yflip
|
||||
@@ -527,7 +548,7 @@ QUICK REFERENCE
|
||||
astcenc {-tl|-ts|-th|-tH} <in> <out> <blockdim> <quality> [options]
|
||||
|
||||
Mode -*l = linear LDR, -*s = sRGB LDR, -*h = HDR RGB/LDR A, -*H = HDR.
|
||||
Quality = -fastest/-fast/-medium/-thorough/-exhaustive/a float [0-100].
|
||||
Quality = -fastest/-fast/-medium/-thorough/-verythorough/-exhaustive/a float [0-100].
|
||||
)";
|
||||
|
||||
/* See header for documentation. */
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ----------------------------------------------------------------------------
|
||||
# Copyright 2020-2022 Arm Limited
|
||||
# Copyright 2020-2023 Arm Limited
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
# use this file except in compliance with the License. You may obtain a copy
|
||||
@@ -15,18 +15,32 @@
|
||||
# under the License.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
if(${UNIVERSAL_BUILD})
|
||||
set(ASTC_TARGET astc${CODEC})
|
||||
else()
|
||||
set(ASTC_TARGET astc${CODEC}-${ISA_SIMD})
|
||||
set(ASTCENC_TARGET astc${ASTCENC_CODEC}-${ASTCENC_ISA_SIMD})
|
||||
|
||||
project(${ASTCENC_TARGET})
|
||||
|
||||
# On CMake 3.25 or older CXX_COMPILER_FRONTEND_VARIANT is not always set
|
||||
if(CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "")
|
||||
set(CMAKE_CXX_COMPILER_FRONTEND_VARIANT "${CMAKE_CXX_COMPILER_ID}")
|
||||
endif()
|
||||
|
||||
project(${ASTC_TARGET})
|
||||
# Compiler accepts MSVC-style command line options
|
||||
set(is_msvc_fe "$<STREQUAL:${CMAKE_CXX_COMPILER_FRONTEND_VARIANT},MSVC>")
|
||||
# Compiler accepts GNU-style command line options
|
||||
set(is_gnu_fe1 "$<STREQUAL:${CMAKE_CXX_COMPILER_FRONTEND_VARIANT},GNU>")
|
||||
# Compiler accepts AppleClang-style command line options, which is also GNU-style
|
||||
set(is_gnu_fe2 "$<STREQUAL:${CMAKE_CXX_COMPILER_FRONTEND_VARIANT},AppleClang>")
|
||||
# Compiler accepts GNU-style command line options
|
||||
set(is_gnu_fe "$<OR:${is_gnu_fe1},${is_gnu_fe2}>")
|
||||
|
||||
set(GNU_LIKE "GNU,Clang,AppleClang")
|
||||
set(CLANG_LIKE "Clang,AppleClang")
|
||||
# Compiler is Visual Studio cl.exe
|
||||
set(is_msvccl "$<AND:${is_msvc_fe},$<CXX_COMPILER_ID:MSVC>>")
|
||||
# Compiler is Visual Studio clangcl.exe
|
||||
set(is_clangcl "$<AND:${is_msvc_fe},$<CXX_COMPILER_ID:Clang>>")
|
||||
# Compiler is upstream clang with the standard frontend
|
||||
set(is_clang "$<AND:${is_gnu_fe},$<CXX_COMPILER_ID:Clang,AppleClang>>")
|
||||
|
||||
add_library(${ASTC_TARGET}-static
|
||||
add_library(${ASTCENC_TARGET}-static
|
||||
STATIC
|
||||
astcenc_averages_and_directions.cpp
|
||||
astcenc_block_sizes.cpp
|
||||
@@ -46,19 +60,55 @@ add_library(${ASTC_TARGET}-static
|
||||
astcenc_partition_tables.cpp
|
||||
astcenc_percentile_tables.cpp
|
||||
astcenc_pick_best_endpoint_format.cpp
|
||||
astcenc_platform_isa_detection.cpp
|
||||
astcenc_quantization.cpp
|
||||
astcenc_symbolic_physical.cpp
|
||||
astcenc_weight_align.cpp
|
||||
astcenc_weight_quant_xfer_tables.cpp)
|
||||
|
||||
target_include_directories(${ASTC_TARGET}-static
|
||||
target_include_directories(${ASTCENC_TARGET}-static
|
||||
PUBLIC
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
|
||||
$<INSTALL_INTERFACE:.>)
|
||||
|
||||
if(${CLI})
|
||||
add_executable(${ASTC_TARGET}
|
||||
if(${ASTCENC_SHAREDLIB})
|
||||
add_library(${ASTCENC_TARGET}-shared
|
||||
SHARED
|
||||
astcenc_averages_and_directions.cpp
|
||||
astcenc_block_sizes.cpp
|
||||
astcenc_color_quantize.cpp
|
||||
astcenc_color_unquantize.cpp
|
||||
astcenc_compress_symbolic.cpp
|
||||
astcenc_compute_variance.cpp
|
||||
astcenc_decompress_symbolic.cpp
|
||||
astcenc_diagnostic_trace.cpp
|
||||
astcenc_entry.cpp
|
||||
astcenc_find_best_partitioning.cpp
|
||||
astcenc_ideal_endpoints_and_weights.cpp
|
||||
astcenc_image.cpp
|
||||
astcenc_integer_sequence.cpp
|
||||
astcenc_mathlib.cpp
|
||||
astcenc_mathlib_softfloat.cpp
|
||||
astcenc_partition_tables.cpp
|
||||
astcenc_percentile_tables.cpp
|
||||
astcenc_pick_best_endpoint_format.cpp
|
||||
astcenc_quantization.cpp
|
||||
astcenc_symbolic_physical.cpp
|
||||
astcenc_weight_align.cpp
|
||||
astcenc_weight_quant_xfer_tables.cpp)
|
||||
|
||||
target_include_directories(${ASTCENC_TARGET}-shared
|
||||
PUBLIC
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
|
||||
$<INSTALL_INTERFACE:.>)
|
||||
endif()
|
||||
|
||||
if(${ASTCENC_CLI})
|
||||
# Veneer is compiled without any extended ISA so we can safely do
|
||||
# ISA compatibility checks without triggering a SIGILL
|
||||
add_library(${ASTCENC_TARGET}-veneer
|
||||
astcenccli_entry.cpp)
|
||||
|
||||
add_executable(${ASTCENC_TARGET}
|
||||
astcenccli_error_metrics.cpp
|
||||
astcenccli_image.cpp
|
||||
astcenccli_image_external.cpp
|
||||
@@ -67,220 +117,313 @@ if(${CLI})
|
||||
astcenccli_toplevel.cpp
|
||||
astcenccli_toplevel_help.cpp)
|
||||
|
||||
target_link_libraries(${ASTC_TARGET}
|
||||
target_link_libraries(${ASTCENC_TARGET}
|
||||
PRIVATE
|
||||
${ASTC_TARGET}-static)
|
||||
${ASTCENC_TARGET}-veneer
|
||||
${ASTCENC_TARGET}-static)
|
||||
endif()
|
||||
|
||||
macro(astcenc_set_properties NAME)
|
||||
macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
|
||||
|
||||
target_compile_features(${NAME}
|
||||
target_compile_features(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
cxx_std_14)
|
||||
|
||||
target_compile_definitions(${NAME}
|
||||
target_compile_definitions(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
# MSVC defines
|
||||
$<$<CXX_COMPILER_ID:MSVC>:_CRT_SECURE_NO_WARNINGS>)
|
||||
$<${is_msvc_fe}:_CRT_SECURE_NO_WARNINGS>)
|
||||
|
||||
if(${DECOMPRESSOR})
|
||||
target_compile_definitions(${NAME}
|
||||
if(${ASTCENC_DECOMPRESSOR})
|
||||
target_compile_definitions(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
ASTCENC_DECOMPRESS_ONLY)
|
||||
endif()
|
||||
|
||||
if(${BLOCK_MAX_TEXELS})
|
||||
target_compile_definitions(${NAME}
|
||||
if(${ASTCENC_BLOCK_MAX_TEXELS})
|
||||
target_compile_definitions(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
ASTCENC_BLOCK_MAX_TEXELS=${BLOCK_MAX_TEXELS})
|
||||
ASTCENC_BLOCK_MAX_TEXELS=${ASTCENC_BLOCK_MAX_TEXELS})
|
||||
endif()
|
||||
|
||||
if(${DIAGNOSTICS})
|
||||
target_compile_definitions(${NAME}
|
||||
if(${ASTCENC_DIAGNOSTICS})
|
||||
target_compile_definitions(${ASTCENC_TARGET_NAME}
|
||||
PUBLIC
|
||||
ASTCENC_DIAGNOSTICS)
|
||||
endif()
|
||||
|
||||
target_compile_options(${NAME}
|
||||
target_compile_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
# Use pthreads on Linux/macOS
|
||||
$<$<PLATFORM_ID:Linux,Darwin>:-pthread>
|
||||
|
||||
# MSVC compiler defines
|
||||
$<$<CXX_COMPILER_ID:MSVC>:/EHsc>
|
||||
$<$<CXX_COMPILER_ID:MSVC>:/fp:strict>
|
||||
$<${is_msvc_fe}:/EHsc>
|
||||
$<${is_msvccl}:/wd4324>
|
||||
|
||||
# G++ and Clang++ compiler defines
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wall>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wextra>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wpedantic>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Werror>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wshadow>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wdouble-promotion>
|
||||
$<${is_gnu_fe}:-Wall>
|
||||
$<${is_gnu_fe}:-Wextra>
|
||||
$<${is_gnu_fe}:-Wpedantic>
|
||||
$<${is_gnu_fe}:-Werror>
|
||||
$<${is_gnu_fe}:-Wshadow>
|
||||
$<${is_gnu_fe}:-Wdouble-promotion>
|
||||
$<${is_clang}:-Wdocumentation>
|
||||
|
||||
# Hide noise thrown up by Clang 10 and clang-cl
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-unknown-warning-option>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-c++98-compat-pedantic>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-c++98-c++11-compat-pedantic>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-float-equal>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-atomic-implicit-seq-cst>
|
||||
$<${is_gnu_fe}:-Wno-unknown-warning-option>
|
||||
$<${is_gnu_fe}:-Wno-c++98-compat-pedantic>
|
||||
$<${is_gnu_fe}:-Wno-c++98-c++11-compat-pedantic>
|
||||
$<${is_gnu_fe}:-Wno-float-equal>
|
||||
$<${is_gnu_fe}:-Wno-deprecated-declarations>
|
||||
$<${is_gnu_fe}:-Wno-atomic-implicit-seq-cst>
|
||||
|
||||
# Clang 10 also throws up warnings we need to investigate (ours)
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-cast-align>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-sign-conversion>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-implicit-int-conversion>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-shift-sign-overflow>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-format-nonliteral>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-reserved-identifier>
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-cast-function-type>
|
||||
$<${is_gnu_fe}:-Wno-cast-align>
|
||||
$<${is_gnu_fe}:-Wno-sign-conversion>
|
||||
$<${is_gnu_fe}:-Wno-implicit-int-conversion>
|
||||
$<${is_gnu_fe}:-Wno-shift-sign-overflow>
|
||||
$<${is_gnu_fe}:-Wno-format-nonliteral>
|
||||
$<${is_gnu_fe}:-Wno-reserved-identifier>
|
||||
$<${is_gnu_fe}:-Wno-cast-function-type>
|
||||
|
||||
$<$<CXX_COMPILER_ID:Clang>:-Wdocumentation>)
|
||||
# Force DWARF4 for Valgrind profiling
|
||||
$<$<AND:$<PLATFORM_ID:Linux,Darwin>,${is_clang}>:-gdwarf-4>
|
||||
|
||||
target_link_options(${NAME}
|
||||
# Disable non-portable Windows.h warning (fixing it fails builds on MinGW)
|
||||
$<$<AND:$<PLATFORM_ID:Windows>,${is_clang}>:-Wno-nonportable-system-include-path>)
|
||||
|
||||
target_link_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
# Use pthreads on Linux/macOS
|
||||
$<$<PLATFORM_ID:Linux,Darwin>:-pthread>)
|
||||
|
||||
if(${ASAN})
|
||||
target_compile_options(${NAME}
|
||||
if(${ASTCENC_ASAN})
|
||||
target_compile_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:${CLANG_LIKE}>:-fsanitize=address>)
|
||||
$<${is_clang}:-fsanitize=address>)
|
||||
|
||||
target_link_options(${NAME}
|
||||
target_link_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:${CLANG_LIKE}>:-fsanitize=address>)
|
||||
$<${is_clang}:-fsanitize=address>)
|
||||
endif()
|
||||
|
||||
if(${NO_INVARIANCE})
|
||||
target_compile_definitions(${NAME}
|
||||
PRIVATE
|
||||
ASTCENC_NO_INVARIANCE=1)
|
||||
if(NOT ${ASTCENC_INVARIANCE})
|
||||
target_compile_definitions(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
ASTCENC_NO_INVARIANCE=1)
|
||||
|
||||
# For Visual Studio prior to 2022 (compiler < 19.30) /fp:precise
|
||||
# For Visual Studio 2022 (compiler >= 19.30) /fp:precise and /fp:contract
|
||||
|
||||
# For Visual Studio 2022 ClangCL seems to have accidentally enabled contraction by default,
|
||||
# so behaves differently to CL.exe. Use the -Xclang argument to work around this and allow access to the
|
||||
# GNU-style switch to control contraction on the assumption this gets fixed and disabled.
|
||||
# Note ClangCL does not accept /fp:contract as an argument as of v15.0.7.
|
||||
target_compile_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<${is_msvccl}:/fp:precise>
|
||||
$<${is_clangcl}:/fp:precise>
|
||||
$<$<AND:${is_msvccl},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,19.30>>:/fp:contract>
|
||||
$<$<AND:${is_clangcl},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,14.0.0>>:-Xclang -ffp-contract=fast>
|
||||
$<$<AND:${is_clang},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,10.0.0>>:-ffp-model=precise>
|
||||
$<${is_gnu_fe}:-ffp-contract=fast>)
|
||||
else()
|
||||
# For Visual Studio prior to 2022 (compiler < 19.30) /fp:strict
|
||||
# For Visual Studio 2022 (compiler >= 19.30) /fp:precise
|
||||
|
||||
# For Visual Studio 2022 ClangCL seems to have accidentally enabled contraction by default,
|
||||
# so behaves differently to CL.exe. Use the -Xclang argument to work around this and allow access to the
|
||||
# GNU-style switch to control contraction and force disable.
|
||||
target_compile_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<$<AND:${is_msvccl},$<VERSION_LESS:$<CXX_COMPILER_VERSION>,19.30>>:/fp:strict>
|
||||
$<$<AND:${is_msvccl},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,19.30>>:/fp:precise>
|
||||
$<${is_clangcl}:/fp:precise>
|
||||
$<$<AND:${is_clangcl},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,14.0.0>>:-Xclang -ffp-contract=off>
|
||||
$<$<AND:${is_clang},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,10.0.0>>:-ffp-model=precise>
|
||||
$<${is_gnu_fe}:-ffp-contract=off>)
|
||||
endif()
|
||||
|
||||
if(${CLI})
|
||||
if(${ASTCENC_CLI})
|
||||
# Enable LTO on release builds
|
||||
set_property(TARGET ${NAME}
|
||||
set_property(TARGET ${ASTCENC_TARGET_NAME}
|
||||
PROPERTY
|
||||
INTERPROCEDURAL_OPTIMIZATION_RELEASE True)
|
||||
|
||||
# Use a static runtime on MSVC builds (ignored on non-MSVC compilers)
|
||||
set_property(TARGET ${NAME}
|
||||
set_property(TARGET ${ASTCENC_TARGET_NAME}
|
||||
PROPERTY
|
||||
MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
|
||||
endif()
|
||||
|
||||
# Set up configuration for SIMD ISA builds
|
||||
if(${ISA_SIMD} MATCHES "none")
|
||||
if(NOT ${UNIVERSAL_BUILD})
|
||||
target_compile_definitions(${NAME}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
endif()
|
||||
|
||||
elseif(${ISA_SIMD} MATCHES "neon")
|
||||
if(NOT ${UNIVERSAL_BUILD})
|
||||
target_compile_definitions(${NAME}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=1
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
endif()
|
||||
|
||||
elseif((${ISA_SIMD} MATCHES "sse2") OR (${UNIVERSAL_BUILD} AND ${ISA_SSE2}))
|
||||
if(NOT ${UNIVERSAL_BUILD})
|
||||
target_compile_definitions(${NAME}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=20
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
endif()
|
||||
|
||||
# These settings are needed on AppleClang as SSE4.1 is on by default
|
||||
# Suppress unused argument for macOS universal build behavior
|
||||
target_compile_options(${NAME}
|
||||
if(${ASTCENC_ISA_SIMD} MATCHES "none")
|
||||
target_compile_definitions(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:AppleClang>:-msse2>
|
||||
$<$<CXX_COMPILER_ID:AppleClang>:-mno-sse4.1>
|
||||
$<$<CXX_COMPILER_ID:AppleClang>:-Wno-unused-command-line-argument>)
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
elseif((${ISA_SIMD} MATCHES "sse4.1") OR (${UNIVERSAL_BUILD} AND ${ISA_SSE41}))
|
||||
if(NOT ${UNIVERSAL_BUILD})
|
||||
target_compile_definitions(${NAME}
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
|
||||
target_compile_definitions(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=1
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
# Workaround MSVC codegen bug for NEON builds on VS 2022 17.2 or older
|
||||
# https://developercommunity.visualstudio.com/t/inlining-turns-constant-into-register-operand-for/1394798
|
||||
if((CMAKE_CXX_COMPILER_ID MATCHES "MSVC") AND (MSVC_VERSION LESS 1933))
|
||||
target_compile_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=1
|
||||
ASTCENC_F16C=0)
|
||||
$<${is_msvccl}:/d2ssa-cfg-sink->)
|
||||
endif()
|
||||
|
||||
# Suppress unused argument for macOS universal build behavior
|
||||
target_compile_options(${NAME}
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
|
||||
target_compile_definitions(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-msse4.1 -mpopcnt>
|
||||
$<$<CXX_COMPILER_ID:AppleClang>:-Wno-unused-command-line-argument>)
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=20
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
elseif((${ISA_SIMD} MATCHES "avx2") OR (${UNIVERSAL_BUILD} AND ${ISA_AVX2}))
|
||||
if(NOT ${UNIVERSAL_BUILD})
|
||||
target_compile_definitions(${NAME}
|
||||
# Force SSE2 on AppleClang (normally SSE4.1 is the default)
|
||||
target_compile_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<${is_clangcl}:-msse2>
|
||||
$<${is_gnu_fe}:-msse2>
|
||||
$<${is_gnu_fe}:-mno-sse4.1>
|
||||
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
|
||||
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
|
||||
target_compile_definitions(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=1
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
if (${ASTCENC_IS_VENEER})
|
||||
# Force SSE2 on AppleClang (normally SSE4.1 is the default)
|
||||
target_compile_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=2
|
||||
ASTCENC_POPCNT=1
|
||||
ASTCENC_F16C=1)
|
||||
$<${is_gnu_fe}:-msse2>
|
||||
$<${is_gnu_fe}:-mno-sse4.1>
|
||||
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
|
||||
else()
|
||||
target_compile_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<${is_clangcl}:-msse4.1 -mpopcnt>
|
||||
$<${is_gnu_fe}:-msse4.1 -mpopcnt>
|
||||
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
|
||||
endif()
|
||||
|
||||
# Suppress unused argument for macOS universal build behavior
|
||||
target_compile_options(${NAME}
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
|
||||
target_compile_definitions(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mavx2 -mpopcnt -mf16c>
|
||||
$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>
|
||||
$<$<CXX_COMPILER_ID:AppleClang>:-Wno-unused-command-line-argument>)
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=2
|
||||
ASTCENC_POPCNT=1
|
||||
ASTCENC_F16C=1)
|
||||
|
||||
if (${ASTCENC_IS_VENEER})
|
||||
# Force SSE2 on AppleClang (normally SSE4.1 is the default)
|
||||
target_compile_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<${is_gnu_fe}:-msse2>
|
||||
$<${is_gnu_fe}:-mno-sse4.1>
|
||||
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
|
||||
else()
|
||||
target_compile_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<${is_msvc_fe}:/arch:AVX2>
|
||||
$<${is_clangcl}:-mavx2 -mpopcnt -mf16c>
|
||||
$<${is_gnu_fe}:-mavx2 -mpopcnt -mf16c>
|
||||
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
|
||||
endif()
|
||||
|
||||
# Non-invariant builds enable us to loosen the compiler constraints on
|
||||
# floating point, but this is only worth doing on CPUs with AVX2 because
|
||||
# this implies we can also enable the FMA instruction set extensions
|
||||
# which significantly improve performance. Note that this DOES reduce
|
||||
# image quality by up to 0.2 dB (normally much less), but buys an
|
||||
# average of 10-15% performance improvement ...
|
||||
if((NOT ${ASTCENC_INVARIANCE}) AND (NOT ${ASTCENC_IS_VENEER}))
|
||||
target_compile_options(${ASTCENC_TARGET_NAME}
|
||||
PRIVATE
|
||||
$<${is_gnu_fe}:-mfma>)
|
||||
endif()
|
||||
|
||||
endif()
|
||||
|
||||
endmacro()
|
||||
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
|
||||
string(CONCAT EXTERNAL_CXX_FLAGS
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -fno-strict-aliasing>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-unused-parameter>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-old-style-cast>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-double-promotion>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-zero-as-null-pointer-constant>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-disabled-macro-expansion>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-reserved-id-macro>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-extra-semi-stmt>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-implicit-fallthrough>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-tautological-type-limit-compare>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-cast-qual>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-reserved-identifier>"
|
||||
" $<$<CXX_COMPILER_ID:Clang>: -Wno-missing-prototypes>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-suggest-override>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-used-but-marked-unused>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-noexcept-type>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-comma>"
|
||||
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-c99-extensions>")
|
||||
string(CONCAT EXTERNAL_CXX_FLAGS
|
||||
" $<${is_gnu_fe}: -fno-strict-aliasing>"
|
||||
" $<${is_gnu_fe}: -Wno-unused-parameter>"
|
||||
" $<${is_gnu_fe}: -Wno-old-style-cast>"
|
||||
" $<${is_gnu_fe}: -Wno-double-promotion>"
|
||||
" $<${is_gnu_fe}: -Wno-zero-as-null-pointer-constant>"
|
||||
" $<${is_gnu_fe}: -Wno-disabled-macro-expansion>"
|
||||
" $<${is_gnu_fe}: -Wno-reserved-id-macro>"
|
||||
" $<${is_gnu_fe}: -Wno-extra-semi-stmt>"
|
||||
" $<${is_gnu_fe}: -Wno-implicit-fallthrough>"
|
||||
" $<${is_gnu_fe}: -Wno-tautological-type-limit-compare>"
|
||||
" $<${is_gnu_fe}: -Wno-cast-qual>"
|
||||
" $<${is_gnu_fe}: -Wno-reserved-identifier>"
|
||||
" $<${is_clang}: -Wno-missing-prototypes>"
|
||||
" $<${is_gnu_fe}: -Wno-missing-field-initializers>"
|
||||
" $<${is_gnu_fe}: -Wno-suggest-override>"
|
||||
" $<${is_gnu_fe}: -Wno-used-but-marked-unused>"
|
||||
" $<${is_gnu_fe}: -Wno-noexcept-type>"
|
||||
" $<${is_gnu_fe}: -Wno-comma>"
|
||||
" $<${is_gnu_fe}: -Wno-c99-extensions>")
|
||||
|
||||
set_source_files_properties(astcenccli_image_external.cpp
|
||||
PROPERTIES
|
||||
COMPILE_FLAGS ${EXTERNAL_CXX_FLAGS})
|
||||
set_source_files_properties(astcenccli_image_external.cpp
|
||||
PROPERTIES
|
||||
COMPILE_FLAGS ${EXTERNAL_CXX_FLAGS})
|
||||
|
||||
astcenc_set_properties(${ASTCENC_TARGET}-static OFF)
|
||||
|
||||
target_compile_options(${ASTCENC_TARGET}-static
|
||||
PRIVATE
|
||||
$<${is_msvc_fe}:/W4>)
|
||||
|
||||
if(${ASTCENC_SHAREDLIB})
|
||||
astcenc_set_properties(${ASTCENC_TARGET}-shared OFF)
|
||||
|
||||
target_compile_definitions(${ASTCENC_TARGET}-shared
|
||||
PRIVATE
|
||||
ASTCENC_DYNAMIC_LIBRARY=1)
|
||||
|
||||
target_compile_options(${ASTCENC_TARGET}-shared
|
||||
PRIVATE
|
||||
$<${is_gnu_fe}:-fvisibility=hidden>
|
||||
$<${is_msvc_fe}:/W4>)
|
||||
|
||||
if(NOT ${ASTCENC_UNIVERSAL_BUILD})
|
||||
install(TARGETS ${ASTCENC_TARGET}-shared)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
astcenc_set_properties(${ASTC_TARGET}-static)
|
||||
if(${ASTCENC_CLI})
|
||||
astcenc_set_properties(${ASTCENC_TARGET}-veneer ON)
|
||||
astcenc_set_properties(${ASTCENC_TARGET} OFF)
|
||||
|
||||
if(${CLI})
|
||||
astcenc_set_properties(${ASTC_TARGET})
|
||||
target_compile_options(${ASTCENC_TARGET}
|
||||
PRIVATE
|
||||
$<${is_msvc_fe}:/W3>)
|
||||
|
||||
target_compile_options(${ASTCENC_TARGET}-veneer
|
||||
PRIVATE
|
||||
$<${is_msvc_fe}:/W3>)
|
||||
|
||||
string(TIMESTAMP astcencoder_YEAR "%Y")
|
||||
|
||||
@@ -289,9 +432,11 @@ if(${CLI})
|
||||
astcenccli_version.h
|
||||
ESCAPE_QUOTES @ONLY)
|
||||
|
||||
target_include_directories(${ASTC_TARGET}
|
||||
target_include_directories(${ASTCENC_TARGET}
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
install(TARGETS ${ASTC_TARGET} DESTINATION ${PACKAGE_ROOT})
|
||||
if(NOT ${ASTCENC_UNIVERSAL_BUILD})
|
||||
install(TARGETS ${ASTCENC_TARGET})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* stb_image - v2.27 - public domain image loader - http://nothings.org/stb
|
||||
/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb
|
||||
no warranty implied; use at your own risk
|
||||
|
||||
Do this:
|
||||
@@ -48,6 +48,7 @@ LICENSE
|
||||
|
||||
RECENT REVISION HISTORY:
|
||||
|
||||
2.28 (2023-01-29) many error fixes, security errors, just tons of stuff
|
||||
2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
|
||||
2.26 (2020-07-13) many minor fixes
|
||||
2.25 (2020-02-02) fix warnings
|
||||
@@ -108,7 +109,7 @@ RECENT REVISION HISTORY:
|
||||
Cass Everitt Ryamond Barbiero github:grim210
|
||||
Paul Du Bois Engin Manap Aldo Culquicondor github:sammyhw
|
||||
Philipp Wiesemann Dale Weiler Oriol Ferrer Mesia github:phprus
|
||||
Josh Tobin Matthew Gregan github:poppolopoppo
|
||||
Josh Tobin Neil Bickford Matthew Gregan github:poppolopoppo
|
||||
Julian Raschke Gregory Mullen Christian Floisand github:darealshinji
|
||||
Baldur Karlsson Kevin Schmidt JR Smith github:Michaelangel007
|
||||
Brad Weinberger Matvey Cherevko github:mosra
|
||||
@@ -140,7 +141,7 @@ RECENT REVISION HISTORY:
|
||||
// // ... x = width, y = height, n = # 8-bit components per pixel ...
|
||||
// // ... replace '0' with '1'..'4' to force that many components per pixel
|
||||
// // ... but 'n' will always be the number that it would have been if you said 0
|
||||
// stbi_image_free(data)
|
||||
// stbi_image_free(data);
|
||||
//
|
||||
// Standard parameters:
|
||||
// int *x -- outputs image width in pixels
|
||||
@@ -635,7 +636,7 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#if defined(_MSC_VER) || defined(__SYMBIAN32__)
|
||||
typedef unsigned short stbi__uint16;
|
||||
typedef signed short stbi__int16;
|
||||
typedef unsigned int stbi__uint32;
|
||||
@@ -1032,7 +1033,7 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add)
|
||||
}
|
||||
|
||||
// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
|
||||
#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
|
||||
#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
|
||||
static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
|
||||
{
|
||||
return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
|
||||
@@ -1055,7 +1056,7 @@ static void *stbi__malloc_mad3(int a, int b, int c, int add)
|
||||
return stbi__malloc(a*b*c + add);
|
||||
}
|
||||
|
||||
#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
|
||||
#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
|
||||
static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
|
||||
{
|
||||
if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
|
||||
@@ -1063,6 +1064,23 @@ static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
|
||||
}
|
||||
#endif
|
||||
|
||||
// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow.
|
||||
static int stbi__addints_valid(int a, int b)
|
||||
{
|
||||
if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow
|
||||
if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0.
|
||||
return a <= INT_MAX - b;
|
||||
}
|
||||
|
||||
// returns 1 if the product of two signed shorts is valid, 0 on overflow.
|
||||
static int stbi__mul2shorts_valid(short a, short b)
|
||||
{
|
||||
if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow
|
||||
if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid
|
||||
if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN
|
||||
return a >= SHRT_MIN / b;
|
||||
}
|
||||
|
||||
// stbi__err - error
|
||||
// stbi__errpf - error returning pointer to float
|
||||
// stbi__errpuc - error returning pointer to unsigned char
|
||||
@@ -1985,9 +2003,12 @@ static int stbi__build_huffman(stbi__huffman *h, int *count)
|
||||
int i,j,k=0;
|
||||
unsigned int code;
|
||||
// build size list for each symbol (from JPEG spec)
|
||||
for (i=0; i < 16; ++i)
|
||||
for (j=0; j < count[i]; ++j)
|
||||
for (i=0; i < 16; ++i) {
|
||||
for (j=0; j < count[i]; ++j) {
|
||||
h->size[k++] = (stbi_uc) (i+1);
|
||||
if(k >= 257) return stbi__err("bad size list","Corrupt JPEG");
|
||||
}
|
||||
}
|
||||
h->size[k] = 0;
|
||||
|
||||
// compute actual symbols (from jpeg spec)
|
||||
@@ -2112,6 +2133,8 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
|
||||
|
||||
// convert the huffman code to the symbol id
|
||||
c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
|
||||
if(c < 0 || c >= 256) // symbol id out of bounds!
|
||||
return -1;
|
||||
STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
|
||||
|
||||
// convert the id to a symbol
|
||||
@@ -2130,6 +2153,7 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
|
||||
unsigned int k;
|
||||
int sgn;
|
||||
if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
|
||||
if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
|
||||
|
||||
sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
|
||||
k = stbi_lrot(j->code_buffer, n);
|
||||
@@ -2144,6 +2168,7 @@ stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
|
||||
{
|
||||
unsigned int k;
|
||||
if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
|
||||
if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
|
||||
k = stbi_lrot(j->code_buffer, n);
|
||||
j->code_buffer = k & ~stbi__bmask[n];
|
||||
k &= stbi__bmask[n];
|
||||
@@ -2155,6 +2180,7 @@ stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
|
||||
{
|
||||
unsigned int k;
|
||||
if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
|
||||
if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing
|
||||
k = j->code_buffer;
|
||||
j->code_buffer <<= 1;
|
||||
--j->code_bits;
|
||||
@@ -2192,8 +2218,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
|
||||
memset(data,0,64*sizeof(data[0]));
|
||||
|
||||
diff = t ? stbi__extend_receive(j, t) : 0;
|
||||
if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
|
||||
dc = j->img_comp[b].dc_pred + diff;
|
||||
j->img_comp[b].dc_pred = dc;
|
||||
if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
|
||||
data[0] = (short) (dc * dequant[0]);
|
||||
|
||||
// decode AC components, see JPEG spec
|
||||
@@ -2207,6 +2235,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
|
||||
if (r) { // fast-AC path
|
||||
k += (r >> 4) & 15; // run
|
||||
s = r & 15; // combined length
|
||||
if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
|
||||
j->code_buffer <<= s;
|
||||
j->code_bits -= s;
|
||||
// decode into unzigzag'd location
|
||||
@@ -2246,8 +2275,10 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__
|
||||
if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
|
||||
diff = t ? stbi__extend_receive(j, t) : 0;
|
||||
|
||||
if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
|
||||
dc = j->img_comp[b].dc_pred + diff;
|
||||
j->img_comp[b].dc_pred = dc;
|
||||
if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
|
||||
data[0] = (short) (dc * (1 << j->succ_low));
|
||||
} else {
|
||||
// refinement scan for DC coefficient
|
||||
@@ -2282,6 +2313,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
|
||||
if (r) { // fast-AC path
|
||||
k += (r >> 4) & 15; // run
|
||||
s = r & 15; // combined length
|
||||
if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
|
||||
j->code_buffer <<= s;
|
||||
j->code_bits -= s;
|
||||
zig = stbi__jpeg_dezigzag[k++];
|
||||
@@ -3102,6 +3134,7 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
|
||||
sizes[i] = stbi__get8(z->s);
|
||||
n += sizes[i];
|
||||
}
|
||||
if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values!
|
||||
L -= 17;
|
||||
if (tc == 0) {
|
||||
if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
|
||||
@@ -3267,6 +3300,13 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
|
||||
if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
|
||||
}
|
||||
|
||||
// check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
|
||||
// and I've never seen a non-corrupted JPEG file actually use them
|
||||
for (i=0; i < s->img_n; ++i) {
|
||||
if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
|
||||
if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
|
||||
}
|
||||
|
||||
// compute interleaved mcu info
|
||||
z->img_h_max = h_max;
|
||||
z->img_v_max = v_max;
|
||||
@@ -3344,6 +3384,28 @@ static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
|
||||
{
|
||||
// some JPEGs have junk at end, skip over it but if we find what looks
|
||||
// like a valid marker, resume there
|
||||
while (!stbi__at_eof(j->s)) {
|
||||
int x = stbi__get8(j->s);
|
||||
while (x == 255) { // might be a marker
|
||||
if (stbi__at_eof(j->s)) return STBI__MARKER_none;
|
||||
x = stbi__get8(j->s);
|
||||
if (x != 0x00 && x != 0xff) {
|
||||
// not a stuffed zero or lead-in to another marker, looks
|
||||
// like an actual marker, return it
|
||||
return x;
|
||||
}
|
||||
// stuffed zero has x=0 now which ends the loop, meaning we go
|
||||
// back to regular scan loop.
|
||||
// repeated 0xff keeps trying to read the next byte of the marker.
|
||||
}
|
||||
}
|
||||
return STBI__MARKER_none;
|
||||
}
|
||||
|
||||
// decode image to YCbCr format
|
||||
static int stbi__decode_jpeg_image(stbi__jpeg *j)
|
||||
{
|
||||
@@ -3360,25 +3422,22 @@ static int stbi__decode_jpeg_image(stbi__jpeg *j)
|
||||
if (!stbi__process_scan_header(j)) return 0;
|
||||
if (!stbi__parse_entropy_coded_data(j)) return 0;
|
||||
if (j->marker == STBI__MARKER_none ) {
|
||||
// handle 0s at the end of image data from IP Kamera 9060
|
||||
while (!stbi__at_eof(j->s)) {
|
||||
int x = stbi__get8(j->s);
|
||||
if (x == 255) {
|
||||
j->marker = stbi__get8(j->s);
|
||||
break;
|
||||
}
|
||||
}
|
||||
j->marker = stbi__skip_jpeg_junk_at_end(j);
|
||||
// if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
|
||||
}
|
||||
m = stbi__get_marker(j);
|
||||
if (STBI__RESTART(m))
|
||||
m = stbi__get_marker(j);
|
||||
} else if (stbi__DNL(m)) {
|
||||
int Ld = stbi__get16be(j->s);
|
||||
stbi__uint32 NL = stbi__get16be(j->s);
|
||||
if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
|
||||
if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
|
||||
m = stbi__get_marker(j);
|
||||
} else {
|
||||
if (!stbi__process_marker(j, m)) return 0;
|
||||
if (!stbi__process_marker(j, m)) return 1;
|
||||
m = stbi__get_marker(j);
|
||||
}
|
||||
m = stbi__get_marker(j);
|
||||
}
|
||||
if (j->progressive)
|
||||
stbi__jpeg_finish(j);
|
||||
@@ -3969,6 +4028,7 @@ static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int re
|
||||
unsigned char* result;
|
||||
stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
|
||||
if (!j) return stbi__errpuc("outofmem", "Out of memory");
|
||||
memset(j, 0, sizeof(stbi__jpeg));
|
||||
STBI_NOTUSED(ri);
|
||||
j->s = s;
|
||||
stbi__setup_jpeg(j);
|
||||
@@ -3982,6 +4042,7 @@ static int stbi__jpeg_test(stbi__context *s)
|
||||
int r;
|
||||
stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
|
||||
if (!j) return stbi__err("outofmem", "Out of memory");
|
||||
memset(j, 0, sizeof(stbi__jpeg));
|
||||
j->s = s;
|
||||
stbi__setup_jpeg(j);
|
||||
r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
|
||||
@@ -4007,6 +4068,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
|
||||
int result;
|
||||
stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
|
||||
if (!j) return stbi__err("outofmem", "Out of memory");
|
||||
memset(j, 0, sizeof(stbi__jpeg));
|
||||
j->s = s;
|
||||
result = stbi__jpeg_info_raw(j, x, y, comp);
|
||||
STBI_FREE(j);
|
||||
@@ -4249,11 +4311,12 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
|
||||
a->zout = zout;
|
||||
return 1;
|
||||
}
|
||||
if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
|
||||
z -= 257;
|
||||
len = stbi__zlength_base[z];
|
||||
if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
|
||||
z = stbi__zhuffman_decode(a, &a->z_distance);
|
||||
if (z < 0) return stbi__err("bad huffman code","Corrupt PNG");
|
||||
if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
|
||||
dist = stbi__zdist_base[z];
|
||||
if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
|
||||
if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
|
||||
@@ -4948,7 +5011,7 @@ STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
|
||||
static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
|
||||
static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
|
||||
|
||||
STBIDEF void stbi__unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
|
||||
STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
|
||||
{
|
||||
stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
|
||||
stbi__unpremultiply_on_load_set = 1;
|
||||
@@ -5057,14 +5120,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
|
||||
if (!pal_img_n) {
|
||||
s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
|
||||
if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
|
||||
if (scan == STBI__SCAN_header) return 1;
|
||||
} else {
|
||||
// if paletted, then pal_n is our final components, and
|
||||
// img_n is # components to decompress/filter.
|
||||
s->img_n = 1;
|
||||
if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
|
||||
// if SCAN_header, have to scan to see if we have a tRNS
|
||||
}
|
||||
// even with SCAN_header, have to scan to see if we have a tRNS
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -5096,6 +5158,8 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
|
||||
if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
|
||||
if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
|
||||
has_trans = 1;
|
||||
// non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
|
||||
if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
|
||||
if (z->depth == 16) {
|
||||
for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
|
||||
} else {
|
||||
@@ -5108,7 +5172,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
|
||||
case STBI__PNG_TYPE('I','D','A','T'): {
|
||||
if (first) return stbi__err("first not IHDR", "Corrupt PNG");
|
||||
if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
|
||||
if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
|
||||
if (scan == STBI__SCAN_header) {
|
||||
// header scan definitely stops at first IDAT
|
||||
if (pal_img_n)
|
||||
s->img_n = pal_img_n;
|
||||
return 1;
|
||||
}
|
||||
if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
|
||||
if ((int)(ioff + c.length) < (int)ioff) return 0;
|
||||
if (ioff + c.length > idata_limit) {
|
||||
stbi__uint32 idata_limit_old = idata_limit;
|
||||
@@ -5491,8 +5561,22 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
|
||||
psize = (info.offset - info.extra_read - info.hsz) >> 2;
|
||||
}
|
||||
if (psize == 0) {
|
||||
if (info.offset != s->callback_already_read + (s->img_buffer - s->img_buffer_original)) {
|
||||
return stbi__errpuc("bad offset", "Corrupt BMP");
|
||||
// accept some number of extra bytes after the header, but if the offset points either to before
|
||||
// the header ends or implies a large amount of extra data, reject the file as malformed
|
||||
int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
|
||||
int header_limit = 1024; // max we actually read is below 256 bytes currently.
|
||||
int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
|
||||
if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
|
||||
return stbi__errpuc("bad header", "Corrupt BMP");
|
||||
}
|
||||
// we established that bytes_read_so_far is positive and sensible.
|
||||
// the first half of this test rejects offsets that are either too small positives, or
|
||||
// negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
|
||||
// ensures the number computed in the second half of the test can't overflow.
|
||||
if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
|
||||
return stbi__errpuc("bad offset", "Corrupt BMP");
|
||||
} else {
|
||||
stbi__skip(s, info.offset - bytes_read_so_far);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7180,12 +7264,12 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
|
||||
// Run
|
||||
value = stbi__get8(s);
|
||||
count -= 128;
|
||||
if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
|
||||
if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
|
||||
for (z = 0; z < count; ++z)
|
||||
scanline[i++ * 4 + k] = value;
|
||||
} else {
|
||||
// Dump
|
||||
if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
|
||||
if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
|
||||
for (z = 0; z < count; ++z)
|
||||
scanline[i++ * 4 + k] = stbi__get8(s);
|
||||
}
|
||||
@@ -7439,10 +7523,17 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req
|
||||
|
||||
out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
|
||||
if (!out) return stbi__errpuc("outofmem", "Out of memory");
|
||||
stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8));
|
||||
if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
|
||||
STBI_FREE(out);
|
||||
return stbi__errpuc("bad PNM", "PNM file truncated");
|
||||
}
|
||||
|
||||
if (req_comp && req_comp != s->img_n) {
|
||||
out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
|
||||
if (ri->bits_per_channel == 16) {
|
||||
out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y);
|
||||
} else {
|
||||
out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
|
||||
}
|
||||
if (out == NULL) return out; // stbi__convert_format frees input on failure
|
||||
}
|
||||
return out;
|
||||
@@ -7479,6 +7570,8 @@ static int stbi__pnm_getinteger(stbi__context *s, char *c)
|
||||
while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
|
||||
value = value*10 + (*c - '0');
|
||||
*c = (char) stbi__get8(s);
|
||||
if((value > 214748364) || (value == 214748364 && *c > '7'))
|
||||
return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
|
||||
}
|
||||
|
||||
return value;
|
||||
@@ -7509,9 +7602,13 @@ static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
|
||||
stbi__pnm_skip_whitespace(s, &c);
|
||||
|
||||
*x = stbi__pnm_getinteger(s, &c); // read width
|
||||
if(*x == 0)
|
||||
return stbi__err("invalid width", "PPM image header had zero or overflowing width");
|
||||
stbi__pnm_skip_whitespace(s, &c);
|
||||
|
||||
*y = stbi__pnm_getinteger(s, &c); // read height
|
||||
if (*y == 0)
|
||||
return stbi__err("invalid width", "PPM image header had zero or overflowing width");
|
||||
stbi__pnm_skip_whitespace(s, &c);
|
||||
|
||||
maxv = stbi__pnm_getinteger(s, &c); // read max value
|
||||
|
||||
@@ -511,7 +511,7 @@ static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, c
|
||||
|
||||
STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
|
||||
{
|
||||
stbi__write_context s {};
|
||||
stbi__write_context s = { 0 };
|
||||
stbi__start_write_callbacks(&s, func, context);
|
||||
return stbi_write_bmp_core(&s, x, y, comp, data);
|
||||
}
|
||||
@@ -519,7 +519,7 @@ STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x,
|
||||
#ifndef STBI_WRITE_NO_STDIO
|
||||
STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
|
||||
{
|
||||
stbi__write_context s {};
|
||||
stbi__write_context s = { 0 };
|
||||
if (stbi__start_write_file(&s,filename)) {
|
||||
int r = stbi_write_bmp_core(&s, x, y, comp, data);
|
||||
stbi__end_write_file(&s);
|
||||
@@ -610,7 +610,7 @@ static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, v
|
||||
|
||||
STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
|
||||
{
|
||||
stbi__write_context s {};
|
||||
stbi__write_context s = { 0 };
|
||||
stbi__start_write_callbacks(&s, func, context);
|
||||
return stbi_write_tga_core(&s, x, y, comp, (void *) data);
|
||||
}
|
||||
@@ -618,7 +618,7 @@ STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x,
|
||||
#ifndef STBI_WRITE_NO_STDIO
|
||||
STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
|
||||
{
|
||||
stbi__write_context s {};
|
||||
stbi__write_context s = { 0 };
|
||||
if (stbi__start_write_file(&s,filename)) {
|
||||
int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
|
||||
stbi__end_write_file(&s);
|
||||
@@ -786,14 +786,14 @@ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, f
|
||||
|
||||
STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
|
||||
{
|
||||
stbi__write_context s {};
|
||||
stbi__write_context s = { 0 };
|
||||
stbi__start_write_callbacks(&s, func, context);
|
||||
return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
|
||||
}
|
||||
|
||||
STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
|
||||
{
|
||||
stbi__write_context s {};
|
||||
stbi__write_context s = { 0 };
|
||||
if (stbi__start_write_file(&s,filename)) {
|
||||
int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
|
||||
stbi__end_write_file(&s);
|
||||
@@ -1606,7 +1606,7 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
|
||||
|
||||
STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
|
||||
{
|
||||
stbi__write_context s {};
|
||||
stbi__write_context s = { 0 };
|
||||
stbi__start_write_callbacks(&s, func, context);
|
||||
return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
|
||||
}
|
||||
@@ -1615,7 +1615,7 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x,
|
||||
#ifndef STBI_WRITE_NO_STDIO
|
||||
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
|
||||
{
|
||||
stbi__write_context s {};
|
||||
stbi__write_context s = { 0 };
|
||||
if (stbi__start_write_file(&s,filename)) {
|
||||
int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
|
||||
stbi__end_write_file(&s);
|
||||
|
||||
@@ -21987,15 +21987,25 @@ wuffs_adler32__hasher__up_arm_neon(
|
||||
}
|
||||
v_p.len = 0;
|
||||
}
|
||||
|
||||
static const uint16x4_t table_0 {32, 31, 30, 29};
|
||||
static const uint16x4_t table_1 {28, 27, 26, 25};
|
||||
static const uint16x4_t table_2 {24, 23, 22, 21};
|
||||
static const uint16x4_t table_3 {20, 19, 18, 17};
|
||||
static const uint16x4_t table_4 {16, 15, 14, 13};
|
||||
static const uint16x4_t table_5 {12, 11, 10, 9};
|
||||
static const uint16x4_t table_6 { 8, 7, 6, 5};
|
||||
static const uint16x4_t table_7 { 4, 3, 2, 1};
|
||||
|
||||
v_v2 = vshlq_n_u32(v_v2, 5);
|
||||
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col0), ((uint16x4_t){32, 31, 30, 29}));
|
||||
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col0), ((uint16x4_t){28, 27, 26, 25}));
|
||||
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col1), ((uint16x4_t){24, 23, 22, 21}));
|
||||
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col1), ((uint16x4_t){20, 19, 18, 17}));
|
||||
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col2), ((uint16x4_t){16, 15, 14, 13}));
|
||||
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col2), ((uint16x4_t){12, 11, 10, 9}));
|
||||
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col3), ((uint16x4_t){8, 7, 6, 5}));
|
||||
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col3), ((uint16x4_t){4, 3, 2, 1}));
|
||||
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col0), table_0);
|
||||
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col0), table_1);
|
||||
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col1), table_2);
|
||||
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col1), table_3);
|
||||
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col2), table_4);
|
||||
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col2), table_5);
|
||||
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col3), table_6);
|
||||
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col3), table_7);
|
||||
v_sum1 = vpadd_u32(vget_low_u32(v_v1), vget_high_u32(v_v1));
|
||||
v_sum2 = vpadd_u32(vget_low_u32(v_v2), vget_high_u32(v_v2));
|
||||
v_sum12 = vpadd_u32(v_sum1, v_sum2);
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.4642,0.9206,0.1086,5.4306
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6113,0.9023,0.1042,5.6594
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1249,0.9088,0.1151,5.1237
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.8620,0.9039,0.1064,5.5433
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4006,0.9604,0.1386,4.2548
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4855,0.9261,0.1123,5.2511
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8798,0.9130,0.1129,5.2252
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5073,0.9110,0.1164,5.0690
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9690,0.9134,0.1130,5.2194
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8563,0.9531,0.1314,4.4902
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9117,0.9368,0.1226,4.8125
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9417,0.9174,0.1235,4.7743
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6901,0.9159,0.1247,4.7282
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6034,0.9219,0.1237,4.7695
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4690,0.9657,0.1410,4.1836
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6259,1.0036,0.1842,3.2028
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.0949,0.9746,0.1730,3.4098
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4004,0.9796,0.1780,3.3143
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5218,0.9778,0.1735,3.3987
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6552,1.0249,0.1933,3.0509
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2083,1.0440,0.2153,2.7398
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1174,1.0073,0.1992,2.9616
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3098,1.0129,0.2058,2.8664
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7317,1.0167,0.2065,2.8566
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0473,1.0679,0.2263,2.6068
|
||||
|
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2326,0.8834,0.0732,8.0546
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2416,0.8664,0.0739,7.9788
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4153,0.8676,0.0738,7.9926
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.6138,0.8715,0.0738,7.9920
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.0126,0.9023,0.0794,7.4294
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3165,0.8916,0.0804,7.3356
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4653,0.8779,0.0810,7.2802
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0821,0.8755,0.0807,7.3127
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7249,0.8807,0.0825,7.1537
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.4955,0.9102,0.0845,6.9824
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8196,0.8892,0.0779,7.5716
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6921,0.8750,0.0797,7.4031
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4519,0.8730,0.0779,7.5736
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3978,0.8744,0.0797,7.3992
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3170,0.9022,0.0810,7.2811
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5303,0.9379,0.1203,4.9045
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8147,0.9207,0.1202,4.9074
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1762,0.9168,0.1183,4.9860
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3622,0.9242,0.1215,4.8552
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.5936,0.9590,0.1270,4.6442
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1316,0.9686,0.1448,4.0723
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6209,0.9452,0.1410,4.1826
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0262,0.9435,0.1409,4.1869
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6170,0.9515,0.1447,4.0759
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0061,0.9935,0.1550,3.8064
|
||||
|
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6813,1.0152,0.1984,2.9722
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9798,0.9898,0.1927,3.0610
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5307,1.0064,0.2162,2.7282
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2443,0.9997,0.1979,2.9801
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.7736,1.0887,0.2683,2.1981
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5886,1.0282,0.2159,2.7315
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2735,1.0275,0.2244,2.6287
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0266,1.0367,0.2354,2.5060
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.1990,1.0237,0.2220,2.6565
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0853,1.1085,0.2788,2.1153
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9799,1.0462,0.2313,2.5503
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3967,1.0207,0.2245,2.6272
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2283,1.0421,0.2421,2.4367
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7624,1.0325,0.2280,2.5873
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6249,1.1214,0.2926,2.0159
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7193,1.1333,0.3106,1.8989
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5399,1.0800,0.2739,2.1533
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,1.1059,0.3027,1.9487
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6493,1.1016,0.2954,1.9965
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7364,1.1932,0.3560,1.6567
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2550,1.2301,0.3959,1.4897
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3419,1.1312,0.3167,1.8626
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5225,1.1784,0.3665,1.6095
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8200,1.1691,0.3549,1.6619
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0791,1.2708,0.4258,1.3853
|
||||
|
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8140,1.2786,0.4654,1.2672
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3108,1.2803,0.4840,1.2185
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0416,1.2881,0.4957,1.1898
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4415,1.2576,0.4593,1.2841
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9172,1.3825,0.5584,1.0563
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6414,1.3319,0.5165,1.1419
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.5164,1.3635,0.5626,1.0484
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4423,1.3605,0.5630,1.0477
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3342,1.3238,0.5249,1.1236
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2127,1.4354,0.6116,0.9644
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0206,1.4166,0.6002,0.9827
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.5914,1.4251,0.6261,0.9421
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5720,1.4318,0.6347,0.9293
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8656,1.4023,0.6044,0.9758
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7315,1.5346,0.7063,0.8351
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7638,1.5949,0.7743,0.7618
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7385,1.5035,0.6977,0.8454
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0389,1.5403,0.7381,0.7991
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7390,1.5266,0.7178,0.8218
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8113,1.7122,0.8752,0.6739
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2896,1.7554,0.9179,0.6426
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5190,1.5453,0.7246,0.8140
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7063,1.6414,0.8256,0.7145
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8774,1.6395,0.8212,0.7183
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1152,1.8808,1.0319,0.5716
|
||||
|
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.4642,0.9467,0.1363,4.3277
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6113,0.9284,0.1309,4.5048
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1249,0.9385,0.1436,4.1086
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.8620,0.9319,0.1331,4.4325
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4006,0.9953,0.1724,3.4211
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4855,0.9690,0.1534,3.8461
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8798,0.9525,0.1534,3.8443
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5073,0.9539,0.1578,3.7371
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9690,0.9547,0.1531,3.8532
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8563,1.0047,0.1788,3.2995
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9117,0.9980,0.1808,3.2631
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9417,0.9821,0.1812,3.2554
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6901,0.9781,0.1823,3.2348
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6034,0.9832,0.1806,3.2657
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4690,1.0337,0.2048,2.8804
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6259,1.0986,0.2734,2.1570
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.0949,1.0655,0.2576,2.2900
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4004,1.0731,0.2682,2.1989
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5218,1.0685,0.2607,2.2623
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6552,1.1252,0.2871,2.0544
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2083,1.1537,0.3238,1.8215
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1174,1.1093,0.2999,1.9665
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3098,1.1162,0.3092,1.9078
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7317,1.1237,0.3094,1.9064
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0473,1.1800,0.3385,1.7423
|
||||
|
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2326,0.9060,0.0936,6.3045
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2416,0.8912,0.0937,6.2929
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4153,0.8881,0.0941,6.2703
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.6138,0.8931,0.0942,6.2615
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.0126,0.9247,0.1013,5.8225
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3165,0.9240,0.1109,5.3166
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4653,0.9099,0.1119,5.2715
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0821,0.9071,0.1110,5.3134
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7249,0.9123,0.1121,5.2598
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.4955,0.9404,0.1154,5.1095
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8196,0.9275,0.1130,5.2215
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6921,0.9147,0.1153,5.1166
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4519,0.9091,0.1130,5.2177
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3978,0.9152,0.1149,5.1337
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3170,0.9432,0.1167,5.0546
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5303,0.9951,0.1753,3.3650
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8147,0.9780,0.1750,3.3697
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1762,0.9739,0.1732,3.4059
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3622,0.9824,0.1776,3.3203
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.5936,1.0191,0.1866,3.1608
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1316,1.0419,0.2144,2.7516
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6209,1.0192,0.2113,2.7921
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0262,1.0160,0.2102,2.8064
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6170,1.0268,0.2166,2.7231
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0061,1.0702,0.2301,2.5638
|
||||
|
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6813,1.0841,0.2661,2.2164
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9798,1.0610,0.2612,2.2584
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5307,1.0883,0.2923,2.0177
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2443,1.0682,0.2673,2.2069
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.7736,1.1871,0.3598,1.6392
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5886,1.1287,0.3134,1.8818
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2735,1.1319,0.3276,1.8003
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0266,1.1436,0.3455,1.7071
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.1990,1.1306,0.3256,1.8118
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0853,1.2347,0.4070,1.4490
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9799,1.1702,0.3495,1.6877
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3967,1.1431,0.3414,1.7275
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2283,1.1688,0.3681,1.6022
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7624,1.1525,0.3465,1.7024
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6249,1.2719,0.4421,1.3342
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7193,1.3044,0.4816,1.2246
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5399,1.2323,0.4252,1.3872
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,1.2743,0.4709,1.2526
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6493,1.2624,0.4562,1.2929
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7364,1.3818,0.5486,1.0752
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2550,1.4356,0.6055,0.9742
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3419,1.3001,0.4875,1.2099
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5225,1.3711,0.5605,1.0523
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8200,1.3601,0.5422,1.0878
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0791,1.4984,0.6513,0.9057
|
||||
|
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8140,1.4350,0.6193,0.9524
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3108,1.4526,0.6532,0.9030
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0416,1.4670,0.6700,0.8804
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4415,1.4204,0.6200,0.9514
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9172,1.5700,0.7472,0.7894
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6414,1.5601,0.7446,0.7922
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.5164,1.6150,0.8121,0.7263
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4423,1.6092,0.8127,0.7257
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3342,1.5609,0.7556,0.7806
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2127,1.7099,0.8809,0.6696
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0206,1.7151,0.8957,0.6585
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.5914,1.7311,0.9290,0.6349
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5720,1.7488,0.9474,0.6226
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8656,1.7043,0.9018,0.6541
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7315,1.8778,1.0479,0.5629
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7638,2.0126,1.1857,0.4975
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7385,1.8810,1.0719,0.5502
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0389,1.9360,1.1320,0.5210
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7390,1.9138,1.1054,0.5336
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8113,2.1880,1.3469,0.4379
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2896,2.2359,1.3992,0.4215
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5190,1.9326,1.1153,0.5289
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7063,2.0784,1.2631,0.4670
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8774,2.0728,1.2533,0.4706
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1152,2.4112,1.5651,0.3769
|
||||
|
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.4642,0.9349,0.1215,4.8552
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6113,0.9154,0.1171,5.0354
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1249,0.9239,0.1284,4.5924
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.8620,0.9187,0.1184,4.9800
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4006,0.9803,0.1542,3.8248
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4855,0.9477,0.1327,4.4461
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8798,0.9348,0.1335,4.4189
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5073,0.9350,0.1373,4.2959
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9690,0.9350,0.1330,4.4341
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8563,0.9812,0.1558,3.7849
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9117,0.9740,0.1576,3.7418
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9417,0.9586,0.1579,3.7349
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6901,0.9566,0.1589,3.7111
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6034,0.9586,0.1571,3.7548
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4690,1.0069,0.1779,3.3146
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6259,1.0635,0.2402,2.4553
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.0949,1.0317,0.2274,2.5933
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4004,1.0395,0.2340,2.5210
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5218,1.0347,0.2271,2.5977
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6552,1.0872,0.2511,2.3487
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2083,1.1156,0.2853,2.0673
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1174,1.0738,0.2647,2.2281
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3098,1.0843,0.2727,2.1627
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7317,1.0871,0.2721,2.1680
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0473,1.1433,0.2984,1.9766
|
||||
|
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2326,0.8925,0.0828,7.1221
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2416,0.8796,0.0829,7.1145
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4153,0.8771,0.0830,7.1040
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.6138,0.8800,0.0830,7.1094
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.0126,0.9129,0.0899,6.5637
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3165,0.9087,0.0957,6.1635
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4653,0.8948,0.0965,6.1136
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0821,0.8909,0.0962,6.1328
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7249,0.8958,0.0964,6.1191
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.4955,0.9245,0.0994,5.9329
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8196,0.9113,0.0982,6.0056
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6921,0.8973,0.1003,5.8804
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4519,0.8922,0.0987,5.9764
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3978,0.8989,0.0997,5.9157
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3170,0.9262,0.1018,5.7928
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5303,0.9712,0.1539,3.8330
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8147,0.9563,0.1535,3.8420
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1762,0.9496,0.1499,3.9360
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3622,0.9614,0.1556,3.7901
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.5936,0.9963,0.1630,3.6182
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1316,1.0145,0.1901,3.1022
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6209,0.9923,0.1861,3.1699
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0262,0.9907,0.1852,3.1845
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6170,0.9988,0.1919,3.0730
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0061,1.0422,0.2040,2.8913
|
||||
|
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6813,1.0485,0.2338,2.5226
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9798,1.0310,0.2281,2.5855
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5307,1.0563,0.2549,2.3139
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2443,1.0411,0.2330,2.5316
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.7736,1.1405,0.3145,1.8756
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5886,1.0881,0.2695,2.1888
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2735,1.0834,0.2816,2.0946
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0266,1.0971,0.2964,1.9901
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.1990,1.0846,0.2788,2.1152
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0853,1.1794,0.3502,1.6841
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9799,1.1205,0.3011,1.9590
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3967,1.0966,0.2921,2.0192
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2283,1.1184,0.3173,1.8590
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7624,1.1048,0.2979,1.9796
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6249,1.2150,0.3838,1.5369
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7193,1.2415,0.4217,1.3986
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5399,1.1818,0.3740,1.5773
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,1.2190,0.4116,1.4330
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6493,1.2074,0.4001,1.4742
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7364,1.3194,0.4816,1.2247
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2550,1.3714,0.5360,1.1005
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3419,1.2455,0.4300,1.3717
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5225,1.3058,0.4948,1.1920
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8200,1.2958,0.4783,1.2333
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0791,1.4251,0.5758,1.0243
|
||||
|
@@ -1,26 +0,0 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8140,1.3668,0.5496,1.0731
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3108,1.3742,0.5744,1.0268
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0416,1.3844,0.5893,1.0009
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4415,1.3448,0.5442,1.0839
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9172,1.4844,0.6601,0.8936
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6414,1.4673,0.6500,0.9074
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.5164,1.5132,0.7083,0.8327
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4423,1.5122,0.7090,0.8319
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3342,1.4652,0.6606,0.8928
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2127,1.5933,0.7676,0.7684
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0206,1.5996,0.7801,0.7561
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.5914,1.6176,0.8131,0.7254
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5720,1.6268,0.8284,0.7120
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8656,1.5934,0.7870,0.7494
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7315,1.7475,0.9170,0.6432
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7638,1.8698,1.0465,0.5636
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7385,1.7500,0.9440,0.6248
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0389,1.8071,1.0011,0.5892
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7390,1.7833,0.9732,0.6061
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8113,2.0260,1.1907,0.4954
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2896,2.0752,1.2384,0.4763
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5190,1.8057,0.9886,0.5966
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7063,1.9364,1.1189,0.5272
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8774,1.9304,1.1109,0.5310
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1152,2.2427,1.3917,0.4238
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.5183,0.8421,0.0887,6.6478
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6496,0.8315,0.0853,6.9175
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1806,0.8445,0.0956,6.1696
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.9442,0.8358,0.0867,6.7996
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4810,0.8674,0.1186,4.9726
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4914,0.8318,0.0778,7.5826
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8696,0.8303,0.0787,7.4941
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5304,0.8328,0.0826,7.1426
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9886,0.8293,0.0781,7.5523
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8769,0.8495,0.0978,6.0323
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9125,0.8253,0.0767,7.6866
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9501,0.8287,0.0809,7.2888
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6974,0.8499,0.0871,6.7739
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6096,0.8252,0.0809,7.2903
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4848,0.8409,0.0923,6.3909
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6322,0.8621,0.1121,5.2605
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.2324,0.8611,0.1112,5.3048
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4874,0.8599,0.1111,5.3100
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5558,0.8643,0.1140,5.1743
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6748,0.8745,0.1243,4.7461
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2132,0.8974,0.1455,4.0527
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1310,0.8895,0.1391,4.2388
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3229,0.8925,0.1410,4.1844
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7302,0.8993,0.1480,3.9848
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0516,0.9102,0.1587,3.7176
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2906,0.8061,0.0538,10.9559
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2770,0.8041,0.0556,10.6034
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4522,0.8038,0.0559,10.5527
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.7385,0.8043,0.0549,10.7346
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.1353,0.8130,0.0635,9.2887
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3714,0.8049,0.0532,11.0825
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4977,0.8028,0.0541,10.9037
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0897,0.8034,0.0539,10.9509
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7504,0.8038,0.0534,11.0363
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.5270,0.8122,0.0598,9.8646
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8240,0.7991,0.0517,11.4146
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6977,0.7964,0.0511,11.5344
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4615,0.7980,0.0524,11.2538
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3650,0.7951,0.0508,11.6073
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3399,0.8016,0.0561,10.5226
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5378,0.8291,0.0778,7.5804
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8222,0.8210,0.0724,8.1516
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1845,0.8287,0.0744,7.9285
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3350,0.8249,0.0738,7.9934
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6111,0.8318,0.0847,6.9658
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1334,0.8493,0.0981,6.0141
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6166,0.8403,0.0915,6.4462
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0114,0.8417,0.0935,6.3050
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6068,0.8478,0.0977,6.0370
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0090,0.8590,0.1085,5.4348
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6967,0.8917,0.1381,4.2706
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9872,0.8862,0.1381,4.2711
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5481,0.9086,0.1568,3.7610
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2746,0.8945,0.1438,4.1016
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.8015,0.9469,0.1920,3.0717
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5897,0.8797,0.1264,4.6671
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2586,0.8937,0.1397,4.2232
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0351,0.8985,0.1443,4.0874
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.2103,0.8891,0.1370,4.3048
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0865,0.9240,0.1690,3.4898
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9740,0.8824,0.1327,4.4450
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3897,0.8941,0.1464,4.0275
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2248,0.9014,0.1505,3.9200
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7613,0.8937,0.1465,4.0265
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6388,0.9206,0.1685,3.5003
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7174,0.9347,0.1818,3.2437
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5395,0.9313,0.1833,3.2180
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,0.9488,0.1945,3.0318
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6514,0.9423,0.1897,3.1099
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7425,0.9624,0.2081,2.8345
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2551,1.0001,0.2434,2.4234
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3460,0.9773,0.2264,2.6047
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5142,1.0026,0.2488,2.3702
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8241,0.9929,0.2404,2.4538
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0771,1.0092,0.2552,2.3115
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8226,1.0510,0.2972,1.9845
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3071,1.0638,0.3161,1.8661
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0457,1.0919,0.3383,1.7434
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4455,1.0613,0.3117,1.8920
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9306,1.1433,0.3936,1.4987
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6472,1.0421,0.2851,2.0687
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.4971,1.0836,0.3321,1.7760
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4390,1.0780,0.3304,1.7854
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3310,1.0575,0.3077,1.9169
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2247,1.1193,0.3701,1.5936
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0267,1.0532,0.3053,1.9322
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.6037,1.1076,0.3576,1.6492
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5645,1.0999,0.3515,1.6779
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8686,1.0829,0.3357,1.7569
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7460,1.1268,0.3798,1.5528
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7716,1.1755,0.4190,1.4076
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7436,1.1967,0.4434,1.3303
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0304,1.2071,0.4571,1.2905
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7463,1.1927,0.4419,1.3346
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8190,1.2341,0.4767,1.2373
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2963,1.3386,0.5790,1.0187
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5470,1.2630,0.5098,1.1569
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7151,1.3345,0.5789,1.0188
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8836,1.3195,0.5665,1.0412
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1188,1.3807,0.6215,0.9491
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.5183,0.8977,0.1136,5.1909
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6496,0.8914,0.1097,5.3777
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1806,0.9059,0.1224,4.8195
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.9442,0.8956,0.1116,5.2860
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4810,0.9369,0.1510,3.9056
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4914,0.8935,0.1075,5.4888
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8696,0.8904,0.1081,5.4540
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5304,0.8972,0.1136,5.1936
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9886,0.8909,0.1073,5.4969
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8769,0.9189,0.1341,4.3987
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9125,0.8985,0.1130,5.2191
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9501,0.9012,0.1192,4.9491
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6974,0.9037,0.1202,4.9071
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6096,0.9013,0.1179,5.0030
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4848,0.9190,0.1341,4.3973
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6322,0.9551,0.1705,3.4595
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.2324,0.9568,0.1737,3.3965
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4874,0.9558,0.1712,3.4451
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5558,0.9571,0.1754,3.3625
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6748,0.9743,0.1891,3.1189
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2132,1.0035,0.2159,2.7315
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1310,0.9935,0.2095,2.8159
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3229,0.9974,0.2112,2.7923
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7302,1.0055,0.2209,2.6707
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0516,1.0217,0.2356,2.5036
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2906,0.8542,0.0694,8.5015
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2770,0.8533,0.0721,8.1804
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4522,0.8534,0.0722,8.1682
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.7385,0.8547,0.0714,8.2651
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.1353,0.8659,0.0817,7.2185
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3714,0.8568,0.0744,7.9233
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4977,0.8567,0.0752,7.8485
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0897,0.8561,0.0748,7.8864
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7504,0.8567,0.0740,7.9697
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.5270,0.8660,0.0828,7.1238
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8240,0.8601,0.0772,7.6419
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6977,0.8557,0.0762,7.7444
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4615,0.8602,0.0778,7.5834
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3650,0.8566,0.0751,7.8532
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3399,0.8670,0.0832,7.0880
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5378,0.9024,0.1177,5.0108
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8222,0.8925,0.1111,5.3085
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1845,0.8973,0.1128,5.2291
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3350,0.8956,0.1121,5.2622
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6111,0.9129,0.1283,4.5972
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1334,0.9332,0.1461,4.0371
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6166,0.9214,0.1369,4.3098
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0114,0.9244,0.1399,4.2166
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6068,0.9291,0.1452,4.0633
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0090,0.9473,0.1609,3.6648
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6967,0.9688,0.1804,3.2692
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9872,0.9690,0.1843,3.2001
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5481,0.9919,0.2064,2.8580
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2746,0.9747,0.1915,3.0806
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.8015,1.0371,0.2509,2.3508
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5897,0.9682,0.1812,3.2544
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2586,0.9877,0.2020,2.9206
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0351,0.9964,0.2064,2.8574
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.2103,0.9845,0.1963,3.0051
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0865,1.0287,0.2410,2.4476
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9740,0.9863,0.1994,2.9584
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3897,1.0056,0.2225,2.6511
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2248,1.0112,0.2255,2.6151
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7613,1.0062,0.2219,2.6583
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6388,1.0355,0.2497,2.3621
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7174,1.0714,0.2815,2.0953
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5395,1.0739,0.2891,2.0405
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,1.0919,0.3016,1.9554
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6514,1.0799,0.2937,2.0080
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7425,1.1059,0.3198,1.8444
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2551,1.1544,0.3623,1.6280
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3460,1.1298,0.3406,1.7318
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5142,1.1642,0.3728,1.5820
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8241,1.1478,0.3600,1.6385
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0771,1.1683,0.3769,1.5650
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8226,1.1860,0.3930,1.5008
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3071,1.2120,0.4245,1.3895
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0457,1.2394,0.4491,1.3134
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4455,1.2033,0.4194,1.4065
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9306,1.3087,0.5183,1.1380
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6472,1.1900,0.4010,1.4708
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.4971,1.2626,0.4771,1.2364
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4390,1.2636,0.4679,1.2605
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3310,1.2250,0.4385,1.3451
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2247,1.3242,0.5194,1.1355
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0267,1.2729,0.4619,1.2770
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.6037,1.3264,0.5368,1.0988
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5645,1.3113,0.5218,1.1304
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8686,1.2886,0.4997,1.1803
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7460,1.3489,0.5572,1.0585
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7716,1.4315,0.6375,0.9253
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7436,1.4856,0.6960,0.8475
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0304,1.4995,0.7046,0.8371
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7463,1.4690,0.6816,0.8654
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8190,1.5179,0.7243,0.8144
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2963,1.6495,0.8518,0.6925
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5470,1.5601,0.7638,0.7723
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7151,1.6553,0.8607,0.6853
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8836,1.6401,0.8407,0.7016
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1188,1.7103,0.9121,0.6467
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.5183,0.8375,0.0976,6.0429
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6496,0.8309,0.0941,6.2678
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1806,0.8434,0.1052,5.6072
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.9442,0.8347,0.0959,6.1512
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4810,0.8707,0.1303,4.5283
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4914,0.8287,0.0893,6.6069
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8696,0.8280,0.0901,6.5438
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5304,0.8346,0.0943,6.2538
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9886,0.8277,0.0898,6.5657
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8769,0.8531,0.1117,5.2783
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9125,0.8354,0.0936,6.2988
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9501,0.8363,0.0984,5.9934
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6974,0.8382,0.0996,5.9244
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6096,0.8361,0.0983,6.0003
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4848,0.8533,0.1115,5.2893
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6322,0.8826,0.1423,4.1464
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.2324,0.8836,0.1443,4.0874
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4874,0.8802,0.1419,4.1555
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5558,0.8856,0.1463,4.0306
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6748,0.8974,0.1580,3.7333
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2132,0.9334,0.1896,3.1109
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1310,0.9244,0.1842,3.2016
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3229,0.9278,0.1864,3.1645
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7302,0.9367,0.1941,3.0392
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0516,0.9506,0.2068,2.8518
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2906,0.7990,0.0592,9.9612
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2770,0.7989,0.0614,9.6041
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4522,0.7976,0.0616,9.5813
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.7385,0.7971,0.0609,9.6894
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.1353,0.8086,0.0701,8.4083
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3714,0.8009,0.0614,9.6050
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4977,0.7990,0.0622,9.4774
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0897,0.7989,0.0620,9.5133
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7504,0.7988,0.0615,9.5983
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.5270,0.8071,0.0688,8.5710
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8240,0.8015,0.0634,9.3057
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6977,0.7987,0.0623,9.4600
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4615,0.7996,0.0639,9.2372
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3650,0.7977,0.0621,9.5027
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3399,0.8063,0.0685,8.6155
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5378,0.8373,0.0977,6.0388
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8222,0.8319,0.0919,6.4150
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1845,0.8320,0.0935,6.3063
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3350,0.8313,0.0925,6.3766
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6111,0.8468,0.1060,5.5621
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1334,0.8697,0.1281,4.6052
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6166,0.8571,0.1199,4.9173
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0114,0.8608,0.1225,4.8147
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6068,0.8653,0.1269,4.6491
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0090,0.8808,0.1412,4.1757
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6967,0.8957,0.1550,3.8046
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9872,0.8975,0.1566,3.7654
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5481,0.9174,0.1758,3.3543
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2746,0.9025,0.1628,3.6220
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.8015,0.9585,0.2149,2.7447
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5897,0.8930,0.1487,3.9676
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2586,0.9060,0.1659,3.5557
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0351,0.9125,0.1703,3.4632
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.2103,0.9046,0.1621,3.6391
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0865,0.9448,0.1991,2.9627
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9740,0.9065,0.1633,3.6112
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3897,0.9236,0.1812,3.2545
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2248,0.9248,0.1852,3.1844
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7613,0.9188,0.1815,3.2495
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6388,0.9486,0.2071,2.8486
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7174,0.9762,0.2337,2.5237
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5395,0.9840,0.2405,2.4521
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,0.9928,0.2513,2.3470
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6514,0.9896,0.2459,2.3984
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7425,1.0085,0.2658,2.2190
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2551,1.0664,0.3183,1.8532
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3460,1.0430,0.2996,1.9688
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5142,1.0712,0.3277,1.7998
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8241,1.0608,0.3158,1.8676
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0771,1.0782,0.3310,1.7820
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8226,1.0813,0.3362,1.7544
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3071,1.0993,0.3600,1.6382
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0457,1.1243,0.3837,1.5372
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4455,1.0970,0.3554,1.6594
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9306,1.1874,0.4438,1.3289
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6472,1.0771,0.3339,1.7664
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.4971,1.1350,0.3937,1.4982
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4390,1.1311,0.3896,1.5139
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3310,1.1028,0.3635,1.6227
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2247,1.1745,0.4337,1.3599
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0267,1.1158,0.3735,1.5791
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.6037,1.1794,0.4390,1.3435
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5645,1.1736,0.4312,1.3680
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8686,1.1539,0.4120,1.4316
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7460,1.2102,0.4646,1.2695
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7716,1.2802,0.5371,1.0981
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7436,1.3283,0.5809,1.0154
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0304,1.3364,0.5920,0.9963
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7463,1.3139,0.5701,1.0346
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8190,1.3580,0.6114,0.9647
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2963,1.4986,0.7526,0.7837
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5470,1.4191,0.6744,0.8747
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7151,1.5040,0.7604,0.7757
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8836,1.4908,0.7417,0.7953
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1188,1.5523,0.8051,0.7327
|
||||
|
@@ -0,0 +1,26 @@
|
||||
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
|
||||
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.5079,0.8407,0.0886,6.6551
|
||||
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6274,0.8317,0.0845,6.9837
|
||||
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1770,0.8462,0.0950,6.2057
|
||||
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.9296,0.8367,0.0861,6.8505
|
||||
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4914,0.8665,0.1184,4.9802
|
||||
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4794,0.8293,0.0776,7.6039
|
||||
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8484,0.8294,0.0779,7.5674
|
||||
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5278,0.8328,0.0818,7.2112
|
||||
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9680,0.8298,0.0776,7.6040
|
||||
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8894,0.8484,0.0974,6.0580
|
||||
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9079,0.8256,0.0760,7.7585
|
||||
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9405,0.8255,0.0801,7.3633
|
||||
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6946,0.8277,0.0812,7.2660
|
||||
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6017,0.8275,0.0798,7.3903
|
||||
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4862,0.8389,0.0912,6.4670
|
||||
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6288,0.8625,0.1105,5.3399
|
||||
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.2302,0.8600,0.1106,5.3329
|
||||
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4837,0.8614,0.1104,5.3449
|
||||
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5502,0.8610,0.1128,5.2296
|
||||
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6749,0.8747,0.1227,4.8062
|
||||
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2114,0.8978,0.1443,4.0866
|
||||
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1351,0.8862,0.1386,4.2547
|
||||
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3187,0.8901,0.1399,4.2174
|
||||
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7270,0.8968,0.1463,4.0312
|
||||
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0489,0.9081,0.1586,3.7186
|
||||
|