ASTC-encoder update to 4.7.0

Signed-off-by: zhaonan287 <zhaonan34@huawei.com>
This commit is contained in:
ql
2024-05-15 17:46:24 +08:00
parent f273ad8f02
commit 17c0115f3c
345 changed files with 20691 additions and 10301 deletions
-2
View File
@@ -38,7 +38,6 @@ ohos_source_set("astc_encoder_static") {
"//third_party/astc-encoder/Source/astcenc_partition_tables.cpp",
"//third_party/astc-encoder/Source/astcenc_percentile_tables.cpp",
"//third_party/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp",
"//third_party/astc-encoder/Source/astcenc_platform_isa_detection.cpp",
"//third_party/astc-encoder/Source/astcenc_quantization.cpp",
"//third_party/astc-encoder/Source/astcenc_symbolic_physical.cpp",
"//third_party/astc-encoder/Source/astcenc_weight_align.cpp",
@@ -51,7 +50,6 @@ ohos_source_set("astc_encoder_static") {
ohos_shared_library("astc_encoder_shared") {
public_configs = [ ":astc_encoder_config" ]
deps = [ ":astc_encoder_static" ]
output_extension = "so"
install_enable = true
part_name = "astc-encoder"
innerapi_tags = [ "platformsdk" ]
+73 -153
View File
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020-2022 Arm Limited
# Copyright 2020-2024 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
@@ -24,163 +24,83 @@ if(MSVC)
add_compile_options("/wd4324") # Disable structure was padded due to alignment specifier
endif()
project(astcencoder VERSION 3.7.0)
project(astcencoder VERSION 4.7.0)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
set(PACKAGE_ROOT astcenc)
set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS "x86_64 x86_64h arm64")
include(CTest)
option(ISA_AVX2 "Enable builds for AVX2 SIMD")
option(ISA_SSE41 "Enable builds for SSE4.1 SIMD")
option(ISA_SSE2 "Enable builds for SSE2 SIMD")
option(ISA_NEON "Enable builds for NEON SIMD")
option(ISA_NONE "Enable builds for no SIMD")
option(ISA_NATIVE "Enable builds for native SIMD")
option(DECOMPRESSOR "Enable builds for decompression only")
option(DIAGNOSTICS "Enable builds for diagnostic trace")
option(ASAN "Enable builds width address sanitizer")
option(UNITTEST "Enable builds for unit tests")
option(NO_INVARIANCE "Enable builds without invariance")
option(CLI "Enable build of CLI" ON)
set(UNIVERSAL_BUILD OFF)
set(MACOS_BUILD OFF)
set(MACOS_ARCH_LEN 0)
option(ASTCENC_ISA_AVX2 "Enable astcenc builds for AVX2 SIMD")
option(ASTCENC_ISA_SSE41 "Enable astcenc builds for SSE4.1 SIMD")
option(ASTCENC_ISA_SSE2 "Enable astcenc builds for SSE2 SIMD")
option(ASTCENC_ISA_NEON "Enable astcenc builds for NEON SIMD")
option(ASTCENC_ISA_NONE "Enable astcenc builds for no SIMD")
option(ASTCENC_ISA_NATIVE "Enable astcenc builds for native SIMD")
option(ASTCENC_DECOMPRESSOR "Enable astcenc builds for decompression only")
option(ASTCENC_SHAREDLIB "Enable astcenc builds with core library shared objects")
option(ASTCENC_DIAGNOSTICS "Enable astcenc builds with diagnostic trace")
option(ASTCENC_ASAN "Enable astcenc builds with address sanitizer")
option(ASTCENC_UNITTEST "Enable astcenc builds with unit tests")
option(ASTCENC_INVARIANCE "Enable astcenc floating point invariance" ON)
option(ASTCENC_CLI "Enable build of astcenc command line tools" ON)
# Preflight for some macOS-specific build options
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
set(MACOS_BUILD ON)
list(LENGTH CMAKE_OSX_ARCHITECTURES MACOS_ARCH_LEN)
option(ASTCENC_UNIVERSAL_BUILD "Enable universal multi-arch build" ON)
if(${ASTCENC_UNIVERSAL_BUILD})
set(ASTCENC_ISA_SSE41 ON)
set(ASTCENC_ISA_AVX2 ON)
set(ASTCENC_ISA_NEON ON)
if(${ASTCENC_ISA_SSE2})
message(FATAL_ERROR "ISA_SSE2 cannot be used in a universal build")
endif()
if(${ASTCENC_ISA_NONE})
message(FATAL_ERROR "ISA_NONE cannot be used in a universal build")
endif()
if(${ASTCENC_ISA_NATIVE})
message(FATAL_ERROR "ISA_NATIVE cannot be used in a universal build")
endif()
endif()
else()
set(ASTCENC_UNIVERSAL_BUILD OFF)
endif()
# Count options which MUST be x64
set(X64_ISA_COUNT 0)
set(CONFIGS ${ISA_AVX2} ${ISA_SSE41} ${ISA_SSE2})
foreach(CONFIG ${CONFIGS})
if(${CONFIG})
math(EXPR X64_ISA_COUNT "${X64_ISA_COUNT} + 1")
set(ASTCENC_X64_ISA_COUNT 0)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
foreach(ASTCENC_CONFIG ${ASTCENC_CONFIGS})
if(${ASTCENC_CONFIG})
math(EXPR ASTCENC_X64_ISA_COUNT "${ASTCENC_X64_ISA_COUNT} + 1")
endif()
endforeach()
# Count options which MUST be arm64
set(ARM64_ISA_COUNT 0)
set(CONFIGS ${ISA_NEON})
foreach(CONFIG ${CONFIGS})
if(${CONFIG})
math(EXPR ARM64_ISA_COUNT "${ARM64_ISA_COUNT} + 1")
set(ASTCENC_ARM64_ISA_COUNT 0)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NEON})
foreach(ASTCENC_CONFIG ${ASTCENC_CONFIGS})
if(${ASTCENC_CONFIG})
math(EXPR ASTCENC_ARM64_ISA_COUNT "${ASTCENC_ARM64_ISA_COUNT} + 1")
endif()
endforeach()
# macOS builds
if("${MACOS_BUILD}")
list(FIND CMAKE_OSX_ARCHITECTURES "x86_64" IS_X64)
list(FIND CMAKE_OSX_ARCHITECTURES "arm64" IS_ARM64)
list(FIND CMAKE_OSX_ARCHITECTURES "$(ARCHS_STANDARD)" IS_AUTO)
# Turn list index into boolean
if(${IS_X64} EQUAL -1)
set(IS_X64 OFF)
else()
set(IS_X64 ON)
endif()
if(${IS_ARM64} EQUAL -1)
set(IS_ARM64 OFF)
else()
set(IS_ARM64 ON)
endif()
if(${IS_AUTO} EQUAL -1)
set(IS_AUTO OFF)
else()
set(IS_AUTO ON)
endif()
# Set up defaults if no more specific ISA set - use XCode's own defaults
if((IS_ARM64 OR IS_AUTO) AND ("${ARM64_ISA_COUNT}" EQUAL 0) AND (NOT "${ISA_NONE}"))
set(ARM64_ISA_COUNT 1)
set(ISA_NEON ON)
endif()
if((IS_X64 OR IS_AUTO) AND ("${X64_ISA_COUNT}" EQUAL 0) AND (NOT "${ISA_NONE}"))
set(X64_ISA_COUNT 1)
set(ISA_SSE41 ON)
endif()
# User might be doing multi-architecture - XCode sets this at runtime
if("${IS_AUTO}")
if(("${ARM64_ISA_COUNT}" GREATER 1) OR ("${X64_ISA_COUNT}" GREATER 1))
message(FATAL_ERROR "For macOS universal binaries only one backend per architecture is allowed.")
endif()
set(UNIVERSAL_BUILD ON)
# User requested explicit multi-architecture universal build
elseif("${MACOS_ARCH_LEN}" GREATER 2)
message(FATAL_ERROR "For macOS universal binaries only x86_64 and arm64 builds are allowed.")
elseif("${MACOS_ARCH_LEN}" EQUAL 2)
if(NOT (${IS_X64} AND ${IS_ARM64}))
message(FATAL_ERROR "For macOS universal binaries only x86_64 and arm64 builds are allowed.")
endif()
if(("${ARM64_ISA_COUNT}" GREATER 1) OR ("${X64_ISA_COUNT}" GREATER 1))
message(FATAL_ERROR "For macOS universal binaries only one backend per architecture is allowed.")
endif()
set(UNIVERSAL_BUILD ON)
# User requested explicit single architecture build
elseif("${MACOS_ARCH_LEN}" EQUAL 1)
if("${IS_X64}" AND "${ARM64_ISA_COUNT}")
message(FATAL_ERROR "For macOS x86_64 builds an arm64 backend cannot be specified.")
endif()
if("${IS_ARM64}" AND "${X64_ISA_COUNT}")
message(FATAL_ERROR "For macOS arm64 builds an x86_64 backend cannot be specified.")
endif()
# Else is this a implicit multi-architecture universal build?
elseif(("${ARM64_ISA_COUNT}" EQUAL 1) AND ("${X64_ISA_COUNT}" GREATER 1))
string(CONCAT MSG "For macOS setting multiple architecture backends builds a universal binary. "
"For universal binaries only one backend per architecture is allowed.")
message(FATAL_ERROR "${MSG}")
elseif(("${X64_ISA_COUNT}" EQUAL 1) AND ("${ARM64_ISA_COUNT}" GREATER 1))
string(CONCAT MSG "For macOS setting multiple architecture backends builds a universal binary. "
"For universal binaries only one backend per architecture is allowed.")
message(FATAL_ERROR "${MSG}")
elseif(("${ARM64_ISA_COUNT}" EQUAL 1) AND ("${X64_ISA_COUNT}" EQUAL 1))
set(UNIVERSAL_BUILD ON)
set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64")
# Else is this an implicit single architecture build?
elseif("${ARM64_ISA_COUNT}" EQUAL 1)
set(CMAKE_OSX_ARCHITECTURES "arm64")
elseif("${X64_ISA_COUNT}" EQUAL 1)
set(CMAKE_OSX_ARCHITECTURES "x86_64")
else()
# Do nothing here - assume it defaults to host?
endif()
# Non-macOS builds
else()
if(("${ARM64_ISA_COUNT}" GREATER 0) AND ("${X64_ISA_COUNT}" GREATER 0))
if(NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
if(("${ASTCENC_ARM64_ISA_COUNT}" GREATER 0) AND ("${ASTCENC_X64_ISA_COUNT}" GREATER 0))
message(FATAL_ERROR "Builds can only support a single architecture per configure.")
endif()
endif()
# If nothing more specific is set then fall back on the compiler's defaults
if(("${ARM64_ISA_COUNT}" EQUAL 0) AND ("${X64_ISA_COUNT}" EQUAL 0) AND (NOT "${ISA_NONE}"))
set(ISA_NATIVE ON)
if(("${ASTCENC_ARM64_ISA_COUNT}" EQUAL 0) AND ("${ASTCENC_X64_ISA_COUNT}" EQUAL 0) AND (NOT "${ASTCENC_ISA_NONE}"))
set(ASTCENC_ISA_NATIVE ON)
endif()
function(printopt optName optVal)
@@ -191,38 +111,38 @@ function(printopt optName optVal)
endif()
endfunction()
if("${BLOCK_MAX_TEXELS}")
message(STATUS " Max block texels - ${BLOCK_MAX_TEXELS}")
if("${ASTCENC_BLOCK_MAX_TEXELS}")
message(STATUS " Max block texels - ${ASTCENC_BLOCK_MAX_TEXELS}")
endif()
printopt("AVX2 backend " ${ISA_AVX2})
printopt("SSE4.1 backend " ${ISA_SSE41})
printopt("SSE2 backend " ${ISA_SSE2})
printopt("NEON backend " ${ISA_NEON})
printopt("NONE backend " ${ISA_NONE})
printopt("NATIVE backend " ${ISA_NATIVE})
if("${MACOS_BUILD}")
printopt("Universal bin " ${UNIVERSAL_BUILD})
printopt("AVX2 backend " ${ASTCENC_ISA_AVX2})
printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41})
printopt("SSE2 backend " ${ASTCENC_ISA_SSE2})
printopt("NEON backend " ${ASTCENC_ISA_NEON})
printopt("NONE backend " ${ASTCENC_ISA_NONE})
printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE})
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
printopt("Universal bin " ${ASTCENC_UNIVERSAL_BUILD})
endif()
printopt("Decompressor " ${DECOMPRESSOR})
printopt("No invariance " ${NO_INVARIANCE})
printopt("Diagnostics " ${DIAGNOSTICS})
printopt("ASAN " ${ASAN})
printopt("Unit tests " ${UNITTEST})
printopt("Invariance " ${ASTCENC_INVARIANCE})
printopt("Shared libs " ${ASTCENC_SHAREDLIB})
printopt("Decompressor " ${ASTCENC_DECOMPRESSOR})
printopt("Diagnostics " ${ASTCENC_DIAGNOSTICS})
printopt("ASAN " ${ASTCENC_ASAN})
printopt("Unit tests " ${ASTCENC_UNITTEST})
# Subcomponents
add_subdirectory(Source)
# Configure package archive
if(PACKAGE)
if("${MACOS_BUILD}")
string(TOLOWER "macOS" PKG_OS)
if(ASTCENC_PACKAGE)
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
string(TOLOWER "macOS" ASTCENC_PKG_OS)
else()
string(TOLOWER ${CMAKE_SYSTEM_NAME} PKG_OS)
string(TOLOWER ${CMAKE_SYSTEM_NAME} ASTCENC_PKG_OS)
endif()
set(PKG_VER ${CMAKE_PROJECT_VERSION_MAJOR}.${CMAKE_PROJECT_VERSION_MINOR})
set(CPACK_PACKAGE_FILE_NAME "astcenc-${PKG_VER}-${PKG_OS}-${PACKAGE}")
set(CPACK_PACKAGE_FILE_NAME "astcenc-${CMAKE_PROJECT_VERSION}-${ASTCENC_PKG_OS}-${ASTCENC_PACKAGE}")
set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY FALSE)
set(CPACK_PACKAGE_CHECKSUM SHA256)
set(CPACK_GENERATOR ZIP)
+118 -33
View File
@@ -10,7 +10,7 @@ backends.
## Windows
Builds for Windows are tested with CMake 3.17 and Visual Studio 2019.
Builds for Windows are tested with CMake 3.17, and Visual Studio 2019 or newer.
### Configuring the build
@@ -25,13 +25,13 @@ cd build
# Configure your build of choice, for example:
# x86-64 using a Visual Studio solution
cmake -G "Visual Studio 16 2019" -T ClangCL -DCMAKE_INSTALL_PREFIX=..\ ^
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
# x86-64 using NMake
cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=..\ ^
-DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
# x86-64 using Visual Studio solution
cmake -G "Visual Studio 16 2019" -T ClangCL -DCMAKE_INSTALL_PREFIX=..\ ^
-DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
```
A single CMake configure can build multiple binaries for a single target CPU
@@ -49,14 +49,15 @@ Once you have configured the build you can use NMake to compile the project
from your build dir, and install to your target install directory.
```shell
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/astcenc/`
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/bin/`
cd build
nmake install
```
## macOS and Linux
## macOS and Linux using Make
Builds for macOS and Linux are tested with CMake 3.17 and clang++ 9.0.
Builds for macOS and Linux are tested with CMake 3.17, and clang++ 9.0 or
newer.
> Compiling using g++ is supported, but clang++ builds are faster by ~15%.
@@ -78,15 +79,14 @@ cd build
# Arm aarch64
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
-DISA_NEON=ON ..
-DASTCENC_ISA_NEON=ON ..
# x86-64
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
-DISA_AVX2=ON -DISA_SSE41=ON -DISA_SSE2=ON ..
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
# macOS universal binary build
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
-DISA_AVX2=ON -DISA_NEON=ON ..
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ ..
```
A single CMake configure can build multiple binaries for a single target CPU
@@ -94,14 +94,13 @@ architecture, for example building x64 for both SSE2 and AVX2. Each binary name
will include the build variant as a postfix. It is possible to build any set of
the supported SIMD variants by enabling only the ones you require.
For macOS, we additionally support the ability to build a universal binary,
combining one x86 and one arm64 variant into a single output binary. The OS
will select the correct variant to run for the machine being used to run the
built binary. To build a universal binary select a single x86 variant and a
single arm64 variant, and both will be included in a single output binary. It
is not required, but if `CMAKE_OSX_ARCHITECTURES` is set on the command line
(e.g. by XCode-generated build commands) it will be validated against the other
configuration variant settings.
For macOS, we additionally support the ability to build a universal binary.
This build includes SSE4.1 (`x86_64`), AVX2 (`x86_64h`), and NEON (`arm64`)
build slices in a single output binary. The OS will select the correct variant
to run for the machine being used. This is the default build target for a macOS
build, but single-target binaries can still be built by setting
`-DASTCENC_UNIVERSAL_BUILD=OFF` and then manually selecting the specific ISA
variants that are required.
### Building
@@ -109,11 +108,44 @@ Once you have configured the build you can use Make to compile the project from
your build dir, and install to your target install directory.
```shell
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/astcenc/`
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/bin/`
# for executable binaries and `${CMAKE_INSTALL_PREFIX}/lib/` for libraries
cd build
make install -j16
```
## macOS using XCode
Builds for macOS are tested with CMake 3.17, and XCode 14.0 or
newer.
### Configuring the build
To use CMake you must first configure the build. Create a build directory
in the root of the astcenc checkout, and then run `cmake` inside that directory
to generate the build system.
```shell
# Create a build directory
mkdir build
cd build
# Configure a universal build
cmake -G Xcode -DCMAKE_INSTALL_PREFIX=../ ..
```
### Building
Once you have configured the build you can use CMake to compile the project
from your build dir, and install to your target install directory.
```shell
cmake --build . --config Release
# Optionally install the binaries to the installation directory
cmake --install . --config Release
```
## Advanced build options
For codec developers and power users there are a number of useful features in
@@ -132,22 +164,33 @@ We support and test the following `CMAKE_BUILD_TYPE` options.
Note that optimized release builds are compiled with link-time optimization,
which can make profiling more challenging ...
### Shared Libraries
We support building the core library as a shared object by setting the CMake
option `-DASTCENC_SHAREDLIB=ON` at configure time. For macOS build targets the
shared library supports the same universal build configuration as the command
line utility.
Note that the command line tool is always statically linked; the shared objects
are an extra build output that are not currently used by the command line tool.
### Constrained block size builds
All normal builds will support all ASTC block sizes, including the worst case
6x6x6 3D block size (216 texels per block). Compressor memory footprint and
performance can be improved by limiting the block sizes supported in the build
by adding `-DBLOCK_MAX_TEXELS=<texel_count>` to to CMake command line when
configuring. Legal block sizes that are unavailable in a restricted build will
return the error `ASTCENC_ERR_NOT_IMPLEMENTED` during context creation.
by adding `-DASTCENC_BLOCK_MAX_TEXELS=<texel_count>` to the CMake command line
when configuring. Legal block sizes that are unavailable in a restricted build
will return the error `ASTCENC_ERR_NOT_IMPLEMENTED` during context creation.
### Non-invariant builds
All normal builds are designed to be invariant, so any build from the same git
revision will produce bit-identical results for all compilers and CPU
architectures. To achieve this we sacrifice some performance, so if this is
not required you can specify `-DNO_INVARIANCE=ON` to enable additional
optimizations.
not required you can specify `-DASTCENC_INVARIANCE=OFF` to enable additional
optimizations. This has the most benefit for AVX2 builds where we are able to
enable use of the FMA instruction set extensions.
### No intrinsics builds
@@ -156,8 +199,8 @@ supported target architectures (x86 and arm64) guarantee SIMD availability. For
development purposes it is possible to build an intrinsic-free build which uses
no explicit SIMD acceleration (the compiler may still auto-vectorize).
To enable this binary variant add `-DISA_NONE=ON` to the CMake command line
when configuring. It is NOT recommended to use this for production; it is
To enable this binary variant add `-DASTCENC_ISA_NONE=ON` to the CMake command
line when configuring. It is NOT recommended to use this for production; it is
significantly slower than the vectorized SIMD builds.
### Test builds
@@ -171,7 +214,7 @@ git submodule init
git submodule update
```
To build unit tests add `-DUNITTEST=ON` to the CMake command line when
To build unit tests add `-DASTCENC_UNITTEST=ON` to the CMake command line when
configuring.
To run unit tests use the CMake `ctest` utility from your build directory after
@@ -185,14 +228,56 @@ ctest --verbose
### Address sanitizer builds
We support building with ASAN on Linux and macOS when using a compiler that
supports it. To build binaries with ASAN checking enabled add `-DASAN=ON` to
the CMake command line when configuring.
supports it. To build binaries with ASAN checking enabled add `-DASTCENC_ASAN=ON`
to the CMake command line when configuring.
### Android builds
Builds of the command line utility for Android are not officially supported, but can be a useful
development build for testing on e.g. different Arm CPU microarchitectures.
The build script below shows one possible route to building the command line tool for Android. Once
built the application can be pushed to e.g. `/data/local/tmp` and executed from an Android shell
terminal over `adb`.
```shell
ANDROID_ABI=arm64-v8a
ANDROID_NDK=/work/tools/android/ndk/22.1.7171670
BUILD_TYPE=RelWithDebInfo
BUILD_DIR=build
mkdir -p ${BUILD_DIR}
cd ${BUILD_DIR}
cmake \
-DCMAKE_INSTALL_PREFIX=./ \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=${ANDROID_ABI} \
-DANDROID_ARM_NEON=ON \
-DANDROID_PLATFORM=android-21 \
-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=clang \
-DANDROID_TOOLCHAIN=clang \
-DANDROID_STL=c++_static \
-DARCH=aarch64 \
-DASTCENC_ISA_NEON=ON \
..
make -j16
```
## Packaging a release bundle
We support building a release bundle of all enabled binary configurations in
the current CMake configuration using the `package` build target.
Configure CMake with:
* `-DASTCENC_PACKAGE=<arch>` to set the package architecture/variant name used
to name the package archive (not set by default).
```shell
# Run a build and package build outputs in `./astcenc-<ver>-<os>-<arch>.<fmt>`
cd build
@@ -212,4 +297,4 @@ details.
- - -
_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
_Copyright © 2019-2023, Arm Limited and contributors. All rights reserved._
-328
View File
@@ -1,328 +0,0 @@
# 2.x series change log
This page summarizes the major functional and performance changes in each
release of the 2.x series.
All performance data on this page is measured on an Intel Core i5-9600K
clocked at 4.2 GHz, running astcenc using 6 threads.
<!-- ---------------------------------------------------------------------- -->
## 2.5
**Status:** Released, March 2021
The 2.5 release is the last major release in the 2.x series. After this release
a `2.x` branch will provide stable long-term support, and the `main` branch
will switch to focusing on more radical changes for the 3.x series.
Reminder for users of the library interface - the API is not designed to be
stable across versions, and this release is not compatible with earlier 2.x
releases. Please update and rebuild your client-side code using the updated
`astcenc.h` header.
**General:**
* **Feature:** The `ISA_INVARIANCE` build option is no longer supported, as
there is no longer any performance benefit from the variant paths. All
builds are now using the equivalent of the `ISA_INVARIANCE=ON` setting, and
all builds (except Armv7) are now believed to be invariant across operating
systems, compilers, CPU architectures, and SIMD instruction sets.
* **Feature:** Armv8 32-bit builds with NEON are now supported, with
out-of-the-box support for Arm Linux soft-float and hard-float ABIs. There
are no pre-built binaries for these targets; support is included for
library users targeting older 32-bit Android and iOS devices.
* **Feature:** A compressor mode for encoding HDR textures that have been
encoded into LDR RGBM wrapper format is now supported. Note that this
encoding has some strong recommendations for how the RGBM encoding is
implemented to avoid block artifacts in the compressed image.
* **Core API:**
* **API Change:** The core API has been changed to be a pure C API, making it
easier to wrap the codec in a stable shared library ABI. Some entry points
that used to accept references now expect pointers.
* **API Change:** The decompression functionality in the core API has been
changed to allow use of multiple threads. The design pattern matches the
compression functionality, requiring the caller to create the threads,
synchronize them between images, and to call the new
`astcenc_decompress_reset()` function between images.
* **API Feature:** Defines to support exporting public API entry point
symbols from a shared object are provided, but not exposed off-the-shelf by
the CMake provided by the project.
* **API Feature:** New `astcenc_get_block_info()` function added to the core
API to allow users to perform high level analysis of compressed data. This
API is not implemented in decompressor-only builds.
* **API Feature:** Codec configuration structure has been extended to expose
the new RGBM compression mode. See the API header for details.
<!-- ---------------------------------------------------------------------- -->
## 2.4
**Status:** Released, February 2021
The 2.4 release is the fifth release in the 2.x series. It is primarily a bug
fix release for HDR image handling, which impacts all earlier 2.x series
releases.
**General:**
* **Feature:** When using the `-a` option, or the equivalent config option
for the API, any 2D blocks that are entirely zero alpha after the alpha
filter radius is taken into account are replaced by transparent black
constant color blocks. This is an RDO-like technique to improve compression
ratios of any additional application packaging compression that is applied.
**Command Line:**
* **Bug fix:** The command line wrapper now correctly loads HDR images that
have a non-square aspect ratio.
<!-- ---------------------------------------------------------------------- -->
## 2.3
**Status:** Released, January 2021
The 2.3 release is the fourth release in the 2.x series. It includes a number
of performance improvements and new features.
Reminder for users of the library interface - the API is not designed to be
stable across versions, and this release is not compatible with 2.2. Please
recompile your client-side code using the updated `astcenc.h` header.
* **General:**
* **Feature:** Decompressor-only builds of the codec are supported again.
While this is primarily a feature for library users who want to shrink
binary size, a variant command line tool `astcdec` can be built by
specifying `DECOMPRESSOR=ON` on the CMake configure command line.
* **Feature:** Diagnostic builds of the codec can now be built. These builds
generate a JSON file containing a trace of the compressor execution.
Diagnostic builds are only suitable for codec development; they are slower
and JSON generation cannot be disabled. Build by setting `DIAGNOSTICS=ON`
on the CMake configure command line.
* **Feature:** Code compatibility improved with older versions of GCC,
earliest compiler now tested is GCC 7.5 (was GCC 9.3).
* **Feature:** Code compatibility improved with newer versions of LLVM,
latest compiler now tested is Clang 12.0 (was Clang 9.0).
* **Feature:** Code compatibility improved with the Visual Studio 2019 LLVM
toolset (`clang-cl`). Using the LLVM toolset gives 25% performance
improvements and is recommended.
* **Command Line:**
* **Feature:** Quality level now accepts either a preset (`-fast`, etc) or a
float value between 0 and 100, allowing more control over the compression
quality vs performance trade-off. The presets are not evenly spaced in the
float range; they have been spaced to give the best distribution of points
between the fast and thorough presets.
* `-fastest`: 0.0
* `-fast`: 10.0
* `-medium`: 60.0
* `-thorough`: 98.0
* `-exhaustive`: 100.0
* **Core API:**
* **API Change:** Quality level preset enum replaced with a float value
between 0 (`-fastest`) and 100 (`-exhaustive`). See above for more info.
### Performance
This release includes a number of optimizations to improve performance.
* New compressor algorithm for handling encoding candidates and refinement.
* Vectorized implementation of `compute_error_of_weight_set()`.
* Unrolled implementation of `encode_ise()`.
* Many other small improvements!
The most significant change is the change to the compressor path, which now
uses an adaptive approach to candidate trials and block refinement.
In earlier releases the quality level will determine the number of encoding
candidates and the number of iterative refinement passes that are used for each
major encoding trial. This is a fixed behavior; it will always try the full N
candidates and M refinement iterations specified by the quality level for each
encoding trial.
The new approach implements two optimizations for this:
* Compression will complete when a block candidate hits the specified target
quality, after its M refinement iterations have been applied. Later block
candidates are simply abandoned.
* Block candidates will predict how much refinement can improve them, and
abandon refinement if they are unlikely to improve upon the best known
encoding already in-hand.
This pair of optimizations provides significant performance improvement to the
high quality modes which use the most block candidates and refinement
iterations. A minor loss of image quality is expected, as the blocks we no
longer test or refine may have been better coding choices.
**Absolute performance vs 2.2 release:**
![Absolute scores 2.3 vs 2.2](./ChangeLogImg/absolute-2.2-to-2.3.png)
**Relative performance vs 2.2 release:**
![Relative scores 2.3 vs 2.2](./ChangeLogImg/relative-2.2-to-2.3.png)
<!-- ---------------------------------------------------------------------- -->
## 2.2
**Status:** Released, January 2021
The 2.2 release is the third release in the 2.x series. It includes a number
of performance improvements and new features.
Reminder for users of the library interface - the API is not designed to be
stable across versions, and this release is not compatible with 2.1. Please
recompile your client-side code using the updated `astcenc.h` header.
* **General:**
* **Feature:** New Arm aarch64 NEON accelerated vector library support.
* **Improvement:** New CMake build system for all platforms.
* **Improvement:** SSE4.2 feature profile changed to SSE4.1, which more
accurately reflects the feature set used.
* **Binary releases:**
* **Improvement:** Linux binaries changed to use Clang 9.0, which gives
up to 15% performance improvement.
* **Improvement:** Windows binaries are now code signed.
* **Improvement:** macOS binaries for Apple silicon platforms now provided.
* **Improvement:** macOS binaries are now code signed and notarized.
* **Command Line:**
* **Feature:** New image preprocess `-pp-normalize` option added. This forces
normal vectors to be unit length, which is useful when compressing source
textures that use normal length to encode an NDF, which is incompatible
with ASTC's two channel encoding.
* **Feature:** New image preprocess `-pp-premultiply` option added. This
scales RGB values by the alpha value. This can be useful to minimize
cross-channel color bleed caused by GPU post-multiply filtering/blending.
* **Improvements:** Command line tool cleanly traps and reports errors for
corrupt input images rather than relying on standard library `assert()`
calls in release builds.
* **Core API:**
* **API Change:** Images using region-based metrics no longer need to include
padding; all input images should be tightly packed and `dim_pad` is removed
from the `astcenc_image` structure. This makes it easier to directly use
images loaded from other libraries.
* **API Change:** Image `data` is no longer a 3D array accessed using
`data[z][y][x]` indexing, it's an array of 2D slices. This makes it easier
to directly use images loaded from other libraries.
* **API Change:** New `ASTCENC_FLG_SELF_DECOMPRESS_ONLY` flag added to the
codec config. Using this flag enables additional optimizations that
aggressively exploit implementation- and configuration-specific behavior
to gain performance. When using this flag the codec can only reliably
decompress images that were compressed in the same context session. Images
produced via other means may fail to decompress correctly, even if they are
otherwise valid ASTC files.
### Performance
There is one major set of optimizations in this release, related to the new
`ASTCENC_FLG_SELF_DECOMPRESS_ONLY` mode. These allow the compressor to only
create data tables it knows that it is going to use, based on its current set
of heuristics, rather than needing the full set the format allows.
The first benefit of these changes is a reduced context creation time, which
can be reduced by up to 250ms on our test machine. This is a significant
percentage of the command line utility runtime for a small image when using a
quick search preset. Compressing the whole Kodak test suite using the command
line utility and the `-fastest` preset is ~30% faster with this release, which
is mostly due to faster startup.
The reduction in the data table size in this mode also improves the core codec
speed. Our test sets show an average of 12% improvement in the codec for
`-fastest` mode, and an average of 3% for `-medium` mode.
Key for performance charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Absolute performance vs 2.1 release:**
![Absolute scores 2.2 vs 2.1](./ChangeLogImg/absolute-2.1-to-2.2.png)
**Relative performance vs 2.1 release:**
![Relative scores 2.2 vs 2.1](./ChangeLogImg/relative-2.1-to-2.2.png)
<!-- ---------------------------------------------------------------------- -->
## 2.1
**Status:** Released, November 2020
The 2.1 release is the second release in the 2.x series. It includes a number
of performance optimizations and new features.
Reminder for users of the library interface - the API is not designed to be
stable across versions, and this release is not compatible with 2.0. Please
recompile your client-side code using the updated `astcenc.h` header.
### Features:
* **Command line:**
* **Bug fix:** The meaning of the `-tH\cH\dH` and `-th\ch\dh` compression
modes was inverted. They now match the documentation; use `-*H` for HDR
RGBA, and `-*h` for HDR RGB with LDR alpha.
* **Feature:** A new `-fastest` quality preset is now available. This is
designed for fast "roughing out" of new content, and sacrifices significant
image quality compared to `-fast`. We do not recommend its use for
production builds.
* **Feature:** A new `-candidatelimit` compression tuning option is now
available. This is a power-user control to determine how many candidates
are returned for each block mode encoding trial. This feature is used
automatically by the search presets; see `-help` for details.
* **Improvement:** The compression test modes (`-tl\ts\th\tH`) now emit a
MTex/s performance metric, in addition to coding time.
* **Core API:**
* **Feature:** A new quality preset `ASTCENC_PRE_FASTEST` is available. See
`-fastest` above for details.
* **Feature:** A new tuning option `tune_candidate_limit` is available in
the config structure. See `-candidatelimit` above for details.
* **Feature:** Image input/output can now use `ASTCENC_TYPE_F32` data types.
* **Stability:**
* **Feature:** The SSE2, SSE4.2, and AVX2 variants now produce identical
compressed output when run on the same CPU when compiled with the
preprocessor define `ASTCENC_ISA_INVARIANCE=1`. For Make builds this can
be set on the command line by setting `ISA_INV=1`. ISA invariance is off
by default; it reduces performance by 1-3%.
### Performance
Key for performance charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Absolute performance vs 2.0 release:**
![Absolute scores 2.1 vs 2.0](./ChangeLogImg/absolute-2.0-to-2.1.png)
**Relative performance vs 2.0 release:**
![Relative scores 2.1 vs 2.0](./ChangeLogImg/relative-2.0-to-2.1.png)
<!-- ---------------------------------------------------------------------- -->
## 2.0
**Status:** Released, August 2020
The 2.0 release is the first release in the 2.x series. It includes a number of
major changes over the earlier 1.7 series, and is not command-line compatible.
### Features:
* The core codec can be built as a library, exposed via a new codec API.
* The core codec supports accelerated SIMD paths for SSE2, SSE4.2, and AVX2.
* The command line syntax has a clearer mapping to Khronos feature profiles.
### Performance:
Key for performance charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Absolute performance vs 1.7 release:**
![Absolute scores 2.0 vs 1.7](./ChangeLogImg/absolute-1.7-to-2.0.png)
**Relative performance vs 1.7 release:**
![Relative scores 2.0 vs 1.7](./ChangeLogImg/relative-1.7-to-2.0.png)
- - -
_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
-4
View File
@@ -299,10 +299,6 @@ Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Absolute performance vs 2.5 release:**
![Absolute scores 3.0 vs 2.5](./ChangeLogImg/absolute-2.5-to-3.0.png)
**Relative performance vs 2.5 release:**
![Relative scores 3.0 vs 2.5](./ChangeLogImg/relative-2.5-to-3.0.png)
+398
View File
@@ -0,0 +1,398 @@
# 4.x series change log
This page summarizes the major functional and performance changes in each
release of the 4.x series.
All performance data on this page is measured on an Intel Core i5-9600K
clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
<!-- ---------------------------------------------------------------------- -->
## 4.7.0
**Status:** January 2024
The 4.7.0 release is a major maintenance release, fixing rounding behavior in
the decompressor to match the Khronos specification. This fix includes the
addition of explicit support for optimizing for `decode_unorm8` rounding.
Reminder - the codec library API is not designed to be binary compatible across
versions. We always recommend rebuilding your client-side code using the updated
`astcenc.h` header.
* **General:**
* **Bug fix:** sRGB LDR decompression now uses the correct endpoint expansion
method to create the 16-bit RGB endpoint colors, and removes the previous
correction code from the interpolation function. This bug could result in
LSB bit flips relative to the standard specification.
* **Bug fix:** Decompressing to an 8-bit per component output image now matches
the `decode_unorm8` extension rounding rules. This bug could result in
LSB bit flips relative to the standard specification.
* **Bug fix:** Code now avoids using `alignas()` in the reference C
implementation, as the default `alignas(16)` is narrower than the
native minimum alignment requirement on some CPUs.
* **Feature:** Library configuration supports a new flag,
`ASTCENC_FLG_USE_DECODE_UNORM8`. This flag indicates that the image will be
used with the `decode_unorm8` decode mode. When set during compression
this allows the compressor to use the correct rounding when determining the
best encoding.
* **Feature:** Command line tool supports a new option, `-decode_unorm8`.
This option indicates that the image will be used with the `decode_unorm8`
decode mode. This option will automatically be set for decompression
(`-d*`) and trial (`-t*`) tool operation if the decompressed output image
is stored to an 8-bit per component file format. This option must be set
manually for compression (`-c*`) tool operation, as the desired decode mode
cannot be reliably determined.
* **Feature:** Library configuration allows a new optional progress
reporting callback to be specified. This is called during compression to
allow interactive tooling use cases to display incremental progress. The
command line tool uses this feature to show compression progress unless
`-silent` is used.
<!-- ---------------------------------------------------------------------- -->
## 4.6.1
**Status:** November 2023
The 4.6.1 release is a minor maintenance release to fix a scaling bug on
large core count Windows systems.
* **General:**
* **Optimization:** Windows builds of the `astcenc` command line tool can now
use more than 64 cores on large core count systems. This change doubled
command line performance for `-exhaustive` compression when testing on a
96 core/192 thread system.
* **Feature:** Windows Arm64 native builds of the `astcenc` command line tool
are now included in the prebuilt release binaries.
<!-- ---------------------------------------------------------------------- -->
## 4.6.0
**Status:** November 2023
The 4.6.0 release retunes the compressor heuristics to give improvements to
performance for trivial losses to image quality. It also includes some minor
bug fixes and code quality improvements.
Reminder - the codec library API is not designed to be binary compatible across
versions. We always recommend rebuilding your client-side code using the updated
`astcenc.h` header.
* **General:**
* **Bug-fix:** Fixed context allocation for contexts allocated with the
`ASTCENC_FLG_DECOMPRESS_ONLY` flag.
* **Bug-fix:** Reduced use of `reinterpret_cast` in the core codec to
avoid strict aliasing violations.
* **Optimization:** `-medium` search quality no longer tests 4 partition
encodings for block sizes between 25 and 83 texels (inclusive). This
improves performance for a tiny drop in image quality.
* **Optimization:** `-thorough` and higher search qualities no longer test the
mode0 first search for block sizes between 25 and 83 texels (inclusive).
This improves performance for a tiny drop in image quality.
* **Optimization:** `TUNE_MAX_PARTITIONING_CANDIDATES` reduced from 32 to 8
to reduce the size of stack allocated data structures. This causes a tiny
drop in image quality for the `-verythorough` and `-exhaustive` presets.
<!-- ---------------------------------------------------------------------- -->
## 4.5.0
**Status:** June 2023
The 4.5.0 release is a maintenance release with small image quality
improvements, and a number of build system quality of life improvements.
* **General:**
* **Bug-fix:** Improved handling of compiler arguments in CMake, including
consistent use of MSVC-style command line arguments for ClangCL.
* **Bug-fix:** Invariant Clang builds now use `-ffp-model=precise` with
`-ffp-contract=off` which is needed to restore invariance due to recent
changes in compiler defaults.
* **Change:** macOS binary releases are now distributed as a single universal
binary for all platforms.
* **Change:** Windows binary releases are now compiled with VS2022.
* **Change:** Invariant MSVC builds for VS2022 now use `/fp:precise` instead
of `/fp:strict`, which is now possible because precise no longer implies
contraction. This should improve performance for MSVC builds.
* **Change:** Non-invariant Clang builds now use `-ffp-model=precise` with
`-ffp-contract=on`. This should improve performance on older Clang
versions which defaulted to no contraction.
* **Change:** Non-invariant MSVC builds for VS2022 now use `/fp:precise`
with `/fp:contract`. This should improve performance for MSVC builds.
* **Change:** CMake config variables now use an `ASTCENC_` prefix to add a
namespace and group options when the library is used in a larger project.
* **Change:** CMake config `ASTCENC_UNIVERSAL_BUILD` for building macOS
universal binaries has been improved to include the `x86_64h` slice for
AVX2 builds. Universal builds are now on by default for macOS, and always
include NEON (arm64), SSE4.1 (x86_64), and AVX2 (x86_64h) variants.
* **Change:** CMake config `ASTCENC_NO_INVARIANCE` has been inverted to
remove the negated option, and is now `ASTCENC_INVARIANCE` with a default
of `ON`. Disabling this option can substantially improve performance, but
images can differ across platforms and compilers.
* **Optimization:** Color quantization and packing for LDR RGB and RGBA has
been vectorized to improve performance.
* **Change:** Color quantization for LDR RGB and RGBA endpoints will now try
multiple quantization packing methods, and pick the one with the lowest
endpoint encoding error. This gives a minor image quality improvement, for
no significant performance impact when combined with the vectorization
optimizations.
<!-- ---------------------------------------------------------------------- -->
## 4.4.0
**Status:** March 2023
The 4.4.0 release is a minor release with image quality improvements, a small
performance boost, and a few new quality-of-life features.
* **General:**
* **Change:** Core library no longer checks availability of required
instruction set extensions, such as SSE4.1 or AVX2. Checking compatibility
is now the responsibility of the caller. See `astcenccli_entry.cpp` for
an example of code performing this check.
* **Change:** Core library can be built as a shared object by setting the
`-DSHAREDLIB=ON` CMake option, resulting in e.g. `libastcenc-avx2-shared.so`.
Note that the command line tool is always statically linked.
* **Change:** Decompressed 3D images will now write one output file per
slice, if the target format is a 2D image format.
* **Change:** Command line errors print to stderr instead of stdout.
* **Change:** Color encoding uses new quantization tables, that now factor
in floating-point rounding if a distance tie is found when using the
integer quant256 value. This improves image quality for 4x4 and 5x5 block
sizes.
* **Optimization:** Partition selection uses a simplified line calculation
with a faster approximation. This improves performance for all block sizes.
* **Bug-fix:** Fixed missing symbol error in decompressor-only builds.
* **Bug-fix:** Fixed infinity handling in debug trace JSON files.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 4.3 release:**
![Relative scores 4.4 vs 4.3](./ChangeLogImg/relative-4.3-to-4.4.png)
<!-- ---------------------------------------------------------------------- -->
## 4.3.1
**Status:** January 2023
The 4.3.1 release is a minor maintenance release. No performance or image
quality changes are expected.
* **General:**
* **Bug-fix:** Fixed typo in `-2/3/4partitioncandidatelimit` CLI options.
* **Bug-fix:** Fixed handling for `-3/4partitionindexlimit` CLI options.
* **Bug-fix:** Updated to `stb_image.h` v2.28, which includes multiple fixes
and improvements for image loading.
<!-- ---------------------------------------------------------------------- -->
## 4.3.0
**Status:** January 2023
The 4.3.0 release is an optimization release. There are minor performance
and image quality improvements in this release.
Reminder - the codec library API is not designed to be binary compatible across
versions. We always recommend rebuilding your client-side code using the updated
`astcenc.h` header.
* **General:**
* **Bug-fix:** Use lower case `windows.h` include for MinGW compatibility.
* **Change:** The `-mask` command line option, `ASTCENC_FLG_MAP_MASK` in the
library API, has been removed.
* **Optimization:** Always skip blue-contraction for `QUANT_256` encodings.
This gives a small image quality improvement for the 4x4 block size.
* **Optimization:** Always skip RGBO vector calculation for LDR encodings.
* **Optimization:** Defer color packing and scrambling to physical layer.
* **Optimization:** Remove folded `decimation_info` lookup tables. This
significantly reduces compressor memory footprint and improves context
creation time. Impact increases with the active block size.
* **Optimization:** Increased trial and refinement pruning by using stricter
target errors when determining whether to skip iterations.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 4.2 release:**
![Relative scores 4.3 vs 4.2](./ChangeLogImg/relative-4.2-to-4.3.png)
<!-- ---------------------------------------------------------------------- -->
## 4.2.0
**Status:** November 2022
The 4.2.0 release is an optimization release. There are significant performance
improvements, minor image quality improvements, and library interface changes in
this release.
Reminder - the codec library API is not designed to be binary compatible across
versions. We always recommend rebuilding your client-side code using the updated
`astcenc.h` header.
* **General:**
* **Bug-fix:** Compression for RGB and RGBA base+offset encodings no
longer generate endpoints with the incorrect blue-contract behavior.
* **Bug-fix:** Lowest channel correlation calculation now correctly ignores
constant color channels for the purposes of filtering 2 plane encodings.
On average this improves both performance and image quality.
* **Bug-fix:** ISA compatibility now checked in `config_init()` as well as
in `context_alloc()`.
* **Change:** Removed the low-weight count optimization, as more recent
changes had significantly reduced its performance benefit. Option removed
from both command line and configuration structure.
* **Feature:** The `-exhaustive` mode now runs full trials on more
partitioning candidates and block candidates. This improves image quality
by 0.1 to 0.25 dB, but slows down compression by 3x. The `-verythorough`
and `-thorough` modes also test more candidates.
* **Feature:** A new preset, `-verythorough`, has been introduced to provide
a standard performance point between `-thorough` and the re-tuned
`-exhaustive` mode. This new mode is faster and higher quality than the
`-exhaustive` preset in the 4.1 release.
* **Feature:** The compressor can now independently vary the number of
partitionings considered for error estimation for 2/3/4 partitions. This
allows heuristics to put more effort into 2 partitions, and less in to
3/4 partitions.
* **Feature:** The compressor can now run trials on a variable number of
candidate partitionings, allowing high quality modes to explore more of the
search space at the expense of slower compression. The number of trials is
independently configurable for 2/3/4 partition cases.
* **Optimization:** Introduce early-out threshold for 2/3/4 partition
searches based on the results after 1 of 2 trials. This significantly
improves performance for `-medium` and `-thorough` searches, for a minor
loss in image quality.
* **Optimization:** Reduce early-out threshold for 3/4 partition searches
based on 2/3 partition results. This significantly improves performance,
especially for `-thorough` searches, for a minor loss in image quality.
* **Optimization:** Use direct vector compare to create a SIMD mask instead
of a scalar compare that is broadcast to a vector mask.
* **Optimization:** Remove obsolete partition validity masks from the
partition selection algorithm.
* **Optimization:** Removed obsolete channel scaling from partition
`avgs_and_dirs()` calculation.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 4.0 and 4.1 release:**
![Relative scores 4.2 vs 4.0](./ChangeLogImg/relative-4.0-to-4.2.png)
<!-- ---------------------------------------------------------------------- -->
## 4.1.0
**Status:** August 2022
The 4.1.0 release is a maintenance release. There is no performance or image
quality change in this release.
* **General:**
* **Change:** Command line decompressor no longer uses the legacy
`GL_LUMINANCE` or `GL_LUMINANCE_ALPHA` format enums when writing KTX
output files. Luminance textures now use the `GL_RED` format and
luminance_alpha textures now use the `GL_RG` format.
* **Change:** Command line tool gains a new `-dimage` option to generate
diagnostic images showing aspects of the compression encoding. The output
file name with its extension stripped is used as the stem of the diagnostic
image file names.
* **Bug-fix:** Library decompressor builds for SSE no longer use masked store
`maskmovdqu` instructions, as they can generate faults on masked lanes.
* **Bug-fix:** Command line decompressor now correctly uses sized type enums
for the internal format when writing output KTX files.
* **Bug-fix:** Command line compressor now correctly loads 16 and 32-bit per
component input KTX files.
* **Bug-fix:** Fixed GCC9 compiler warnings on Arm aarch64.
<!-- ---------------------------------------------------------------------- -->
## 4.0.0
**Status:** July 2022
The 4.0.0 release introduces some major performance enhancements, and a number
of larger changes to the heuristics used in the codec to find a more effective
cost:quality trade off.
* **General:**
* **Change:** The `-array` option for specifying the number of image planes
for ASTC 3D volumetric block compression has been renamed to `-zdim`.
* **Change:** The build root package directory is now `bin` instead of
`astcenc`, allowing the CMake install step to write binaries into
`/usr/local/bin` if the user wishes to do so.
* **Feature:** A new `-ssw` option for specifying the shader sampling swizzle
has been added as a convenience alternative to the `-cw` option. This is
needed to correct error weighting during compression if not all components
are read in the shader. For example, to extract and compress two components
from an RGBA input image, weighting the two components equally when
sampling through .ra in the shader, use `-esw ggga -ssw ra`. In this
example `-ssw ra` is equivalent to the alternative `-cw 1 0 0 1` encoding.
* **Feature:** The `-a` alpha weighting option has been re-enabled in the
backend, and now again applies alpha scaling to the RGB error metrics when
encoding. This is based on the maximum alpha in each block, not the
individual texel alpha values used in the earlier implementation.
* **Feature:** The command line tool now has `-repeats <count>` for testing,
which will iterate around compression and decompression `count` times.
Reported performance metrics also now separate compression and
decompression scores.
* **Feature:** The core codec is now warning clean up to /W4 for both MSVC
`cl.exe` and `clangcl.exe` compilers.
* **Feature:** The core codec now supports arm64 for both MSVC `cl.exe` and
`clangcl.exe` compilers.
* **Feature:** `NO_INVARIANCE` builds will enable the `-ffp-contract=fast`
option for all targets when using Clang or GCC. In addition AVX2 targets
will also set the `-mfma` option. This reduces image quality by up to 0.2dB
(normally much less), but improves performance by up to 5-20%.
* **Optimization:** Angular endpoint min/max weight selection is restricted
to weight `QUANT_11` or lower. Higher quantization levels assume default
0-1 range, which is less accurate but much faster.
* **Optimization:** Maximum weight quantization for later trials is selected
based on the weight quantization of the best encoding from the 1 plane 1
partition trial. This significantly reduces the search space for the later
trials with more planes or partitions.
* **Optimization:** Small data tables now use in-register SIMD permutes
rather than gathers (AVX2) or unrolled scalar lookups (SSE/NEON). This can
be a significant optimization for paths that are load unit limited.
* **Optimization:** Decompressed image block writes in the decompressor now
use a vectorized approach to writing each row of texels in the block,
including the ability to exploit masked stores if the target supports them.
* **Optimization:** Weight scrambling has been moved into the physical layer;
the rest of the codec now uses linear order weights.
* **Optimization:** Weight packing has been moved into the physical layer;
the rest of the codec now uses unpacked weights in the 0-64 range.
* **Optimization:** Consistently vectorize the creation of unquantized weight
grids when they are needed.
* **Optimization:** Remove redundant per-decimation mode copies of endpoint
and weight structures, which were really read-only duplicates.
* **Optimization:** Early-out the same endpoint mode color calculation if it
cannot be applied.
* **Optimization:** Numerous type size reductions applied to arrays to reduce
both context working buffer size usage and stack usage.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 3.7 release:**
![Relative scores 4.0 vs 3.7](./ChangeLogImg/relative-3.7-to-4.0.png)
- - -
_Copyright © 2022-2024, Arm Limited and contributors. All rights reserved._
Binary file not shown.

Before

Width:  |  Height:  |  Size: 111 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 148 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 141 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 149 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 134 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 112 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 123 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 121 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 126 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 116 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 108 KiB

+79 -51
View File
@@ -25,7 +25,7 @@ their compressed bitrate are shown in the table below.
| BC3nm | G+R | 8 | BC1 G + BC4 R |
| BC4 | R | 4 | L8 |
| BC5 | R+G | 8 | BC1 R + BC1 G |
| BC6 | RGB (HDR) | 8 | |
| BC6H | RGB (HDR) | 8 | |
| BC7 | RGB / RGBA | 8 | |
| EAC_R11 | R | 4 | R11 |
| EAC_RG11 | RG | 8 | RG11 |
@@ -46,40 +46,40 @@ also a weakness (it reduces quality when compressing correlated signals).
# ASTC Format Mapping
The main question which arises with the mapping of another format on to ASTC
is how to handle cases where the input isn't a 4 channel RGBA input. ASTC is a
container format which always decompresses in to a 4 channel RGBA result.
is how to handle cases where the input isn't a 4 component RGBA input. ASTC is
a container format which always decompresses in to a 4 component RGBA result.
However, the internal compressed representation is very flexible and can store
1-4 channels as needed on a per-block basis.
1-4 components as needed on a per-block basis.
To get the best quality for a given bitrate, or the lowest bitrate for a given
quality, it is important that as few channels as possible are stored in the
quality, it is important that as few components as possible are stored in the
internal representation to avoid wasting coding space.
Specific optimizations in the ASTC coding scheme exist for:
* Encoding the RGB channels as a single luminance channel, so only a single
* Encoding the RGB components as a single luminance component, so only a single
value needs to be stored in the coding instead of three.
* Encoding the A channel as a constant 1.0 value, so the coding doesn't
* Encoding the A component as a constant 1.0 value, so the coding doesn't
actually need to store a per-pixel alpha value at all.
... so mapping your inputs given to the compressor to hit these paths is
really important if you want to get the best output quality for your chosen
bitrate.
## Encoding 1-4 channel data
## Encoding 1-4 component data
The table below shows the recommended channel usage for data with different
numbers of color channels present in the data.
The table below shows the recommended component usage for data with different
numbers of color components present in the data.
The coding swizzle should be applied when compressing an image. This can be
handled by the compressor when reading an uncompressed input image by
specifying the swizzle using the `-esw` command line option.
The sampling swizzle is what you should use in your shader programs to read
the data from the compressed texture, assuming no additional API-level channel
swizzling is specified by the application.
the data from the compressed texture, assuming no additional API-level
component swizzling is specified by the application.
| Input Channels | ASTC Endpoint | Coding Swizzle | Sampling Swizzle |
| Input components | ASTC Endpoint | Coding Swizzle | Sampling Swizzle |
| -------------- | ------------- | -------------- | ------------------ |
| 1 | L + 1 | `rrr1` | `.g` <sup>1</sup> |
| 2 | L + A | `rrrg` | `.ga` <sup>1</sup> |
@@ -88,13 +88,13 @@ swizzling is specified by the application.
**1:** Sampling from `g` is preferred to sampling from `r` because it allows a
single shader to be compatible with ASTC, BC1, or ETC formats. BC1 and ETC1
store color endpoints as RGB565 data, so the `g` channel will have higher
store color endpoints as RGB565 data, so the `g` component will have higher
precision. For ASTC it doesn't actually make any difference; the same single
channel luminance will be returned for all three of the `.rgb` channels.
component luminance will be returned for all three of the `.rgb` components.
## Equivalence with other formats
Based on these channel encoding requirements we can now derive the the ASTC
Based on these component encoding requirements we can now derive the ASTC
coding equivalents for most of the other texture compression formats in common
use today.
@@ -105,7 +105,7 @@ use today.
| BC3nm | `gggr` | `.ag` | |
| BC4 | `rrr1` | `.r` | |
| BC5 | `rrrg` | `.ra` <sup>2</sup> | |
| BC6 | `rgb1` | `.rgb` | HDR profile only |
| BC6H | `rgb1` | `.rgb` <sup>3</sup> | HDR profile only |
| BC7 | `rgba` | `.rgba` | |
| EAC_R11 | `rrr1` | `.r` | |
| EAC_RG11 | `rrrg` | `.ra` <sup>2</sup> | |
@@ -115,38 +115,66 @@ use today.
| ETC2+EAC | `rgba` | `.rgba` | |
**1:** ASTC has no equivalent of the 1-bit punch-through alpha encoding
supported by BC1 or ETC2; if alpha is present it will be a full alpha channel.
supported by BC1 or ETC2; if alpha is present it will be a full alpha
component.
**2:** ASTC relies on using the L+A color endpoint type for coding efficiency
for two channel data. It therefore has no direct equivalent of a two-plane
format sampled though the `.rg` channels such as BC5 or EAC_RG11. This can
be emulated by setting texture channel swizzles in the runtime API - e.g. via
for two component data. It therefore has no direct equivalent of a two-plane
format sampled through the `.rg` components such as BC5 or EAC_RG11. This can
be emulated by setting texture component swizzles in the runtime API - e.g. via
`glTexParameteri()` for OpenGL ES - although it has been noted that API
controlled swizzles are not available in WebGL.
**3:** ASTC can only store unsigned values, and has no equivalent of the BC6
signed endpoint mode.
# Other Considerations
This section outlines some of the other things to consider when encoding
textures using ASTC.
## Encoding non-correlated channels
## Decode mode extensions
Most other texture compression formats have a static channel assignment in
ASTC is specified to decompress into a 16-bit per component RGBA output by
default, with the exception of the sRGB format which uses an 8-bit value for the
RGB components.
Decompressing into a 16-bit per component output format often gives higher
precision than many use cases require, especially for LDR textures which
originally came from an 8-bit per component source image. Most implementations
of ASTC support the decode mode extensions, which allow an application to
opt-in to a lower precision decompressed format (RGBA8 for LDR, RGB9E5 for
HDR). Using these extensions can improve GPU texture cache efficiency, and even
improve texture filtering throughput, for use cases that do not need the higher
precision.
The ASTC format uses different data rounding rules when the decode mode
extensions are used. To ensure that the compressor chooses the best encodings
for the RGBA8 rounding rules, you can specify `-decode_unorm8` when compressing
textures that will be decompressed into the RGBA8 intermediate. This gives a
small image quality boost.
**Note:** This mode is automatically enabled if you use the `astcenc`
decompressor to write an 8-bit per component output image.
## Encoding non-correlated components
Most other texture compression formats have a static component assignment in
terms of the expected data correlation. For example, ETC2+EAC assumes that RGB
are always correlated and that alpha is non-correlated. ASTC can automatically
encode data as either fully correlated across all 4 channels, or with any one
channel assigned to a separate non-correlated partition to the other three.
encode data as either fully correlated across all 4 components, or with any one
component assigned to a separate non-correlated partition to the other three.
The non-correlated channel can be changed on a block-by-block basis, so the
The non-correlated component can be changed on a block-by-block basis, so the
compressor can dynamically adjust the coding based on the data present in the
image. This means that there is no need for non-correlated data to be stored
in a specific channel in the input image.
in a specific component in the input image.
It is however worth noting that the alpha channel is treated differently to
the RGB color channels in some circumstances:
It is however worth noting that the alpha component is treated differently to
the RGB color components in some circumstances:
* When coding for sRGB the alpha channel will always be stored in linear space.
* When coding for HDR the alpha channel can optionally be kept as LDR data.
* When coding for sRGB the alpha component will always be stored in linear
space.
* When coding for HDR the alpha component can optionally be kept as LDR data.
## Encoding normal maps
@@ -155,21 +183,21 @@ BC5; store the X and Y components of a unit-length normal. The Z component of
the normal can be reconstructed in shader code based on the knowledge that the
vector is unit length.
To encode this we therefore want to store two input channels and should
therefore use the `rrrg` coding swizzle, and the `.ga` sampling swizzle. The
OpenGL ES shader code for reconstruction of the Z value is:
To encode this we need to store only two input components in the compressed
data, and therefore use the `rrrg` coding swizzle to align the data with the
ASTC luminance+alpha endpoint. We can sample this in shader code using the
`.ga` sampling swizzle, and reconstruct the Z value with:
vec3 nml;
nml.xy = texture(...).ga; // Load normals (range 0 to 1)
nml.xy = nml.xy * 2.0 - 1.0; // Unpack normals (range -1 to +1)
nml.z = sqrt(1 - dot(nml.xy, nml.xy)); // Compute Z, given unit length
In addition to this it is useful to optimize for angular error in the resulting
vector rather than for absolute color error in the data, which improves the
perceptual quality of the image.
Both the encoding swizzle and the angular error function are enabled by using
the `-normal` command line option.
The encoding swizzle and appropriate component weighting are enabled by using
the `-normal` command line option. If you wish to use a different pair of
components you can specify a custom swizzle after setting the `-normal`
parameter. For example, to match BC5n component ordering use
`-normal -esw gggr` for compression and `-normal -dsw arz1` for decompression.
## Encoding sRGB data
@@ -182,8 +210,8 @@ For color data it is nearly always a perceptual quality win to use sRGB input
source textures that are then compressed using the ASTC sRGB compression mode
(compress using the `-cs` command line option rather than the `-cl` command
line option). Note that sRGB gamma correction is only applied to the RGB
channels during decode; the alpha channel is always treated as linear encoded
data.
components during decode; the alpha component is always treated as linear
encoded data.
*Important:* The uncompressed input texture provided on the command line must
be stored in the sRGB color space for `-cs` to function correctly.
@@ -191,17 +219,17 @@ be stored in the sRGB color space for `-cs` to function correctly.
## Encoding HDR data
HDR data can be encoded just like LDR data, but with some caveats around
handling the alpha channel.
handling the alpha component.
For many use cases the alpha channel is an actual alpha opacity channel and is
therefore used for storing an LDR value between 0 and 1. For these cases use
the `-ch` compressor option which will treat the RGB channels as HDR, but the
A channel as LDR.
For many use cases the alpha component is an actual alpha opacity component and
is therefore used for storing an LDR value between 0 and 1. For these cases use
the `-ch` compressor option which will treat the RGB components as HDR, but the
A component as LDR.
For other use cases the alpha channel is simply a fourth data channel which is
also storing an HDR value. For these cases use the `-cH` compressor option
which will treat all channels as HDR data.
For other use cases the alpha component is simply a fourth data component which
is also storing an HDR value. For these cases use the `-cH` compressor option
which will treat all components as HDR data.
- - -
_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
_Copyright © 2019-2024, Arm Limited and contributors. All rights reserved._
+2 -2
View File
@@ -20,8 +20,8 @@ We support a small (but growing) number of C++ unit tests, which are written
using the `googletest` framework and integrated in the CMake "CTest" test
framework.
To build unit tests pull the `googletest` git submodule and add `-DUNITTEST=ON`
to the CMake command line when configuring.
To build unit tests pull the `googletest` git submodule and add
`-DASTCENC_UNITTEST=ON` to the CMake command line when configuring.
To run unit tests use the CMake `ctest` utility from your build directory after
you have built the tests.
+1 -1
View File
@@ -3,7 +3,7 @@
"Name":"astc-encoder",
"License":"Apache 2.0 License",
"License File":"LICENSE",
"Version Number":"3.7",
"Version Number":"4.7",
"Owner":"wangyonglang@huawei.com",
"Upstream URL":"https://github.com/ARM-software/astc-encoder.git",
"Description":"The Arm Adaptive Scalable Texture Compression (ASTC) Encoder,astcenc,is a command-line tool for compressing and decompressing images using the ASTC texture compression standard."
+45 -26
View File
@@ -1,13 +1,13 @@
# About
This is the official repository for the Arm® Adaptive Scalable Texture
Compression (ASTC) Encoder, `astcenc`, a command-line tool for compressing
and decompressing images using the ASTC texture compression standard.
The Arm® Adaptive Scalable Texture Compression (ASTC) Encoder, `astcenc`, is
a command-line tool for compressing and decompressing images using the ASTC
texture compression standard.
## The ASTC format
The ASTC compressed data format, developed by Arm® and AMD, has been adopted as
an official extension to the Open GL®, OpenGL ES, and Vulkan® graphics APIs. It
an official extension to the OpenGL®, OpenGL ES, and Vulkan® graphics APIs. It
provides a major step forward in terms of both the image quality at a given
bitrate, and the format and bitrate flexibility available to content creators.
This allows more assets to use compression, often at a reduced bitrate compared
@@ -20,7 +20,7 @@ read the full [Khronos Data Format Specification][2] for all the details.
This project is licensed under the Apache 2.0 license. By downloading any
component from this repository you acknowledge that you accept terms specified
in the [LICENSE](LICENSE) file.
in the [LICENSE.txt](LICENSE.txt) file.
# Encoder feature support
@@ -33,8 +33,8 @@ dynamic range (BMP, PNG, TGA), high dynamic range (EXR, HDR), or DDS and KTX
wrapped output images.
The encoder allows control over the compression time/quality tradeoff with
`exhaustive`, `thorough`, `medium`, `fast`, and `fastest` encoding quality
presets.
`exhaustive`, `verythorough`, `thorough`, `medium`, `fast`, and `fastest`
encoding quality presets.
The encoder allows compression time and quality analysis by reporting the
compression time, and the Peak Signal-to-Noise Ratio (PSNR) between the input
@@ -58,15 +58,15 @@ from 0.89 bits/pixel up to 8 bits/pixel.
Release build binaries for the `astcenc` stable releases are provided in the
[GitHub Releases page][3].
**Latest 3.x stable release:** 3.7
* Change log: [4.x series](./Docs/ChangeLog-4x.md)
* Change log: [3.x series](./Docs/ChangeLog-3x.md)
**Latest 2.x stable release:** 2.5
* Change log: [2.x series](./Docs/ChangeLog-2x.md)
Binaries are provided for 64-bit builds on Windows, macOS, and Linux.
Binaries are provided for 64-bit builds on Windows, macOS, and Linux. The
`astcenc` builds are provided as multiple binaries, each tuned for a
specific SIMD instruction set.
## Windows and Linux
For Windows and Linux the builds of the astcenc are provided as multiple
binaries, each tuned for a specific SIMD instruction set.
For x86-64 we provide, in order of increasing performance:
@@ -78,23 +78,33 @@ The x86-64 SSE2 builds will work on all x86-64 machines, but it is the slowest
of the three. The other two require extended CPU instruction set support which
is not universally available, but each step gains ~15% more performance.
For Apple silicon macOS devices we provide:
For Arm, if binaries are available, we provide:
* `astcenc-neon` - uses NEON
## macOS
For macOS devices we provide a single universal binary `astcenc`, which allows
the OS to automatically use the correct binary variant for the current host
machine. Support is provided for three architecture slices:
* `x86_64` - uses the `astcenc-sse4.1` build defined above.
* `x86_64h` - uses the `astcenc-avx2` build defined above.
* `arm64` - uses the `astcenc-neon` build defined above.
## Repository branches
The `main` branch is an active development branch for the compressor. It aims
to be a stable branch, but as it is used for ongoing development expect it to
have some volatility.
to be a stable branch for the latest major release series, but as it is used
for ongoing development expect it to have some volatility. We recommend using
the latest stable release tag for production development.
The `2.x` branch is a stable branch for the 2.x release series. It is no longer
under active development, but is a supported branch that will continue to get
The `3.x` branch is a stable branch for the 3.x release series. It is no longer
under active development, but is a supported branch that continues to get
backported bug fixes.
The `1.x` branch is a stable branch for the 1.x release series. It is no longer
under active development or getting bug fixes.
The `1.x` and `2.x` branches are stable branches for older releases. They are
no longer under active development or getting bug fixes.
Any other branches you might find are development branches for new features or
optimizations, so might be interesting to play with but should be considered
@@ -135,6 +145,11 @@ The modes available are:
* `-ch` : use the HDR color profile, tuned for HDR RGB and LDR A.
* `-cH` : use the HDR color profile, tuned for HDR RGBA.
If you intend to use the resulting image with the decode mode extensions to
limit the decompressed precision to UNORM8, it is recommended that you also
specify the `-decode_unorm8` flag. This will ensure that the compressor uses
the correct rounding rules when choosing encodings.
## Decompressing an image
Decompress an image using the `-dl` \ `-ds` \ `-dh` \ `-dH` modes. For example:
@@ -180,11 +195,6 @@ The compression speed can be controlled from `-fastest`, through `-fast`,
encoder has to spend looking for good encodings the better the results, but it
does result in increasingly small improvements for the amount of time required.
:warning: The `-fastest` quality preset is designed for quickly roughing-out
new content. It is tuned to give the fastest possible compression, often at the
expense of significant image quality loss compared to `-fast`. We do not
recommend using it for production builds.
There are many other command line options for tuning the encoder parameters
which can be used to fine tune the compression algorithm. See the command line
help message for more details.
@@ -203,6 +213,9 @@ It covers:
* How to efficiently encode normal maps, sRGB data, and HDR data.
* Coding equivalents to other compression formats.
The [ASTC Developer Guide][5] document (external link) provides a more detailed
guide for developers using the `astcenc` compressor.
The [.astc File Format](./Docs/FileFormat.md) page provides a light-weight
specification for the `.astc` file format and how to read or write it.
@@ -217,10 +230,16 @@ how to test any modifications to the source code in this repository.
If you have issues with the `astcenc` encoder, or questions about the ASTC
texture format itself, please raise them in the GitHub issue tracker.
If you have any questions about Arm GPUs, application development for Arm GPUs,
or general mobile graphics development or technology please submit them on the
[Arm Community graphics forums][4].
- - -
_Copyright © 2013-2022, Arm Limited and contributors. All rights reserved._
_Copyright © 2013-2024, Arm Limited and contributors. All rights reserved._
[1]: ./Docs/FormatOverview.md
[2]: https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#ASTC
[3]: https://github.com/ARM-software/astc-encoder/releases
[4]: https://community.arm.com/support-forums/f/graphics-gaming-and-vr-forum/
[5]: https://developer.arm.com/documentation/102162/latest/?lang=en
+1 -1
View File
@@ -41,7 +41,7 @@ astc-encoder引入openharmony的thirdparty目录下,
```
./build.sh --product-name rk3568 --ccache
```
编译生成物对应路径:`out/rk3568/thirdparty/astc-encoder/libastc_encoder_shared.so`。
编译生成物对应路径:`out/rk3568/thirdparty/astc-encoder/libastc_encoder_shared.z.so`。
## 许可证<a name="section126611612164217"></a>
+62 -27
View File
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020-2021 Arm Limited
# Copyright 2020-2023 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
@@ -17,45 +17,80 @@
# Overwrite the LTO flags to force fat LTO; worth 3-4% performance
# See https://gitlab.kitware.com/cmake/cmake/-/issues/16808
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang" AND ${CLI})
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang" AND ${ASTCENC_CLI})
set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto")
endif()
if(${DECOMPRESSOR})
set(CODEC dec)
if(${ASTCENC_DECOMPRESSOR})
set(ASTCENC_CODEC dec)
else()
set(CODEC enc)
set(ASTCENC_CODEC enc)
endif()
if(${UNIVERSAL_BUILD})
if(${ISA_AVX2})
set(ISA_SIMD "avx2")
elseif(${ISA_SSE41})
set(ISA_SIMD "sse4.1")
elseif(${ISA_SSE2})
set(ISA_SIMD "sse2")
endif()
include(cmake_core.cmake)
else()
set(ARTEFACTS native none neon avx2 sse4.1 sse2)
set(CONFIGS ${ISA_NATIVE} ${ISA_NONE} ${ISA_NEON} ${ISA_AVX2} ${ISA_SSE41} ${ISA_SSE2})
list(LENGTH ARTEFACTS ARTEFACTS_LEN)
math(EXPR ARTEFACTS_LEN "${ARTEFACTS_LEN} - 1")
set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")
foreach(INDEX RANGE ${ARTEFACTS_LEN})
list(GET ARTEFACTS ${INDEX} ARTEFACT)
list(GET CONFIGS ${INDEX} CONFIG)
if(${CONFIG})
set(ISA_SIMD ${ARTEFACT})
include(cmake_core.cmake)
foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
list(GET ASTCENC_ARTIFACTS ${INDEX} ASTCENC_ARTIFACT)
list(GET ASTCENC_CONFIGS ${INDEX} ASTCENC_CONFIG)
if(${ASTCENC_CONFIG})
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})
if(${ASTCENC_ISA_SIMD} MATCHES "neon")
set(CMAKE_OSX_ARCHITECTURES arm64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
set(CMAKE_OSX_ARCHITECTURES x86_64h)
elseif(NOT ${ASTCENC_ISA_SIMD} MATCHES "none")
set(CMAKE_OSX_ARCHITECTURES x86_64)
endif()
endforeach()
include(cmake_core.cmake)
endif()
endforeach()
if(${ASTCENC_CLI} AND ${ASTCENC_UNIVERSAL_BUILD})
add_custom_target(
astc${ASTCENC_CODEC}
ALL
COMMAND
lipo -create -output $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1>/astc${ASTCENC_CODEC} -arch x86_64 $<TARGET_FILE:astc${ASTCENC_CODEC}-sse4.1> -arch x86_64h $<TARGET_FILE:astc${ASTCENC_CODEC}-avx2> -arch arm64 $<TARGET_FILE:astc${ASTCENC_CODEC}-neon>
VERBATIM)
add_dependencies(
astc${ASTCENC_CODEC}
astc${ASTCENC_CODEC}-sse4.1
astc${ASTCENC_CODEC}-avx2
astc${ASTCENC_CODEC}-neon)
install(PROGRAMS $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1>/astc${ASTCENC_CODEC}
DESTINATION bin)
endif()
if(${ASTCENC_SHAREDLIB} AND ${ASTCENC_UNIVERSAL_BUILD})
add_custom_target(
astc${ASTCENC_CODEC}-shared
ALL
COMMAND
lipo -create -output $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1-shared>/libastc${ASTCENC_CODEC}-shared.dylib -arch x86_64 $<TARGET_FILE:astc${ASTCENC_CODEC}-sse4.1-shared> -arch x86_64h $<TARGET_FILE:astc${ASTCENC_CODEC}-avx2-shared> -arch arm64 $<TARGET_FILE:astc${ASTCENC_CODEC}-neon-shared>
VERBATIM)
add_dependencies(
astc${ASTCENC_CODEC}-shared
astc${ASTCENC_CODEC}-sse4.1-shared
astc${ASTCENC_CODEC}-avx2-shared
astc${ASTCENC_CODEC}-neon-shared)
install(PROGRAMS $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1-shared>/libastc${ASTCENC_CODEC}-shared.dylib
DESTINATION lib)
endif()
# - - - - - - - - - - - - - - - - - -
# Unit testing
if(${UNITTEST})
if(${ASTCENC_UNITTEST})
set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
set(CMAKE_OSX_ARCHITECTURES x86_64;arm64)
add_subdirectory(GoogleTest)
enable_testing()
add_subdirectory(UnitTest)
@@ -94,9 +94,9 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
int i = stream.ConsumeIntegralInRange<int>(0, testSz.size() - 1);
// Populate the physical block
physical_compressed_block pcb;
uint8_t pcb[16];
std::vector<uint8_t> buffer = stream.ConsumeBytes<uint8_t>(16);
std::memcpy(&pcb, buffer.data(), 16);
std::memcpy(pcb, buffer.data(), 16);
// Call the function under test
symbolic_compressed_block scb;
+21 -23
View File
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020-2021 Arm Limited
# Copyright 2020-2023 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
@@ -15,27 +15,25 @@
# under the License.
# ----------------------------------------------------------------------------
if(${UNIVERSAL_BUILD})
if(${ISA_AVX2})
set(ISA_SIMD "avx2")
elseif(${ISA_SSE41})
set(ISA_SIMD "sse4.1")
elseif(${ISA_SSE2})
set(ISA_SIMD "sse2")
endif()
include(cmake_core.cmake)
else()
set(ARTEFACTS native none neon avx2 sse4.1 sse2)
set(CONFIGS ${ISA_NATIVE} ${ISA_NONE} ${ISA_NEON} ${ISA_AVX2} ${ISA_SSE41} ${ISA_SSE2})
list(LENGTH ARTEFACTS ARTEFACTS_LEN)
math(EXPR ARTEFACTS_LEN "${ARTEFACTS_LEN} - 1")
set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")
foreach(INDEX RANGE ${ARTEFACTS_LEN})
list(GET ARTEFACTS ${INDEX} ARTEFACT)
list(GET CONFIGS ${INDEX} CONFIG)
if(${CONFIG})
set(ISA_SIMD ${ARTEFACT})
include(cmake_core.cmake)
foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
list(GET ASTCENC_ARTIFACTS ${INDEX} ASTCENC_ARTIFACT)
list(GET ASTCENC_CONFIGS ${INDEX} ASTCENC_CONFIG)
if(${ASTCENC_CONFIG})
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})
if(${ASTCENC_ISA_SIMD} MATCHES "neon")
set(CMAKE_OSX_ARCHITECTURES arm64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
set(CMAKE_OSX_ARCHITECTURES x86_64h)
elseif(NOT ${ASTCENC_ISA_SIMD} MATCHES "none")
set(CMAKE_OSX_ARCHITECTURES x86_64)
endif()
endforeach()
endif()
include(cmake_core.cmake)
endif()
endforeach()
+84 -84
View File
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020-2021 Arm Limited
# Copyright 2020-2023 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
@@ -15,25 +15,34 @@
# under the License.
# ----------------------------------------------------------------------------
if(${UNIVERSAL_BUILD})
set(ASTC_TEST test-unit)
else()
set(ASTC_TEST test-unit-${ISA_SIMD})
set(ASTCENC_TEST test-unit-${ASTCENC_ISA_SIMD})
add_executable(${ASTCENC_TEST})
# Enable LTO under the conditions where the codec library will use LTO.
# The library link will fail if the settings don't match
if(${ASTCENC_CLI})
set_property(TARGET ${ASTCENC_TEST}
PROPERTY
INTERPROCEDURAL_OPTIMIZATION_RELEASE True)
endif()
add_executable(${ASTC_TEST})
target_sources(${ASTC_TEST}
target_sources(${ASTCENC_TEST}
PRIVATE
test_simd.cpp
test_softfloat.cpp
test_decode.cpp
../astcenc_mathlib_softfloat.cpp)
target_include_directories(${ASTC_TEST}
target_include_directories(${ASTCENC_TEST}
PRIVATE
${gtest_SOURCE_DIR}/include)
target_compile_options(${ASTC_TEST}
target_link_libraries(${ASTCENC_TEST}
PRIVATE
astcenc-${ASTCENC_ISA_SIMD}-static)
target_compile_options(${ASTCENC_TEST}
PRIVATE
# Use pthreads on Linux/macOS
$<$<PLATFORM_ID:Linux,Darwin>:-pthread>
@@ -47,92 +56,83 @@ target_compile_options(${ASTC_TEST}
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wpedantic>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Werror>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wshadow>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wdouble-promotion>)
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-c++98-compat-pedantic>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-c++98-c++11-compat-pedantic>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-float-equal>
# Ignore things that the googletest build triggers
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-unknown-warning-option>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-double-promotion>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-undef>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-reserved-identifier>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-global-constructors>)
# Set up configuration for SIMD ISA builds
if(${ISA_SIMD} MATCHES "none")
if(NOT ${UNIVERSAL_BUILD})
target_compile_definitions(${ASTC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
endif()
elseif(${ISA_SIMD} MATCHES "neon")
if(NOT ${UNIVERSAL_BUILD})
target_compile_definitions(${ASTC_TEST}
PRIVATE
ASTCENC_NEON=1
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
endif()
elseif(${ISA_SIMD} MATCHES "sse2")
if(NOT ${UNIVERSAL_BUILD})
target_compile_definitions(${ASTC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=20
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
endif()
target_compile_options(${ASTC_TEST}
if(${ASTCENC_ISA_SIMD} MATCHES "none")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-mfpmath=sse -msse2>)
ASTCENC_NEON=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
elseif(${ISA_SIMD} MATCHES "sse4.1")
if(NOT ${UNIVERSAL_BUILD})
target_compile_definitions(${ASTC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=0
ASTCENC_POPCNT=1
ASTCENC_F16C=0)
endif()
target_compile_options(${ASTC_TEST}
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -msse4.1 -mpopcnt>)
ASTCENC_NEON=1
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
elseif(${ISA_SIMD} MATCHES "avx2")
if(NOT ${UNIVERSAL_BUILD})
target_compile_definitions(${ASTC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=2
ASTCENC_POPCNT=1
ASTCENC_F16C=1)
endif()
target_compile_options(${ASTC_TEST}
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -mavx2 -mpopcnt -mf16c>
ASTCENC_NEON=0
ASTCENC_SSE=20
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
target_compile_options(${ASTCENC_TEST}
PRIVATE
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-msse2>)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=0
ASTCENC_POPCNT=1
ASTCENC_F16C=0)
target_compile_options(${ASTCENC_TEST}
PRIVATE
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-msse4.1 -mpopcnt>)
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=2
ASTCENC_POPCNT=1
ASTCENC_F16C=1)
target_compile_options(${ASTCENC_TEST}
PRIVATE
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mavx2 -mpopcnt -mf16c>
$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>)
endif()
target_compile_options(${ASTC_TEST}
PRIVATE
$<$<CXX_COMPILER_ID:${CLANG_LIKE}>:-fsanitize=undefined>)
target_link_options(${ASTC_TEST}
PRIVATE
$<$<CXX_COMPILER_ID:${CLANG_LIKE}>:-fsanitize=undefined>)
target_link_libraries(${ASTC_TEST}
target_link_libraries(${ASTCENC_TEST}
PRIVATE
gtest_main)
add_test(NAME ${ASTC_TEST}
COMMAND ${ASTC_TEST})
add_test(NAME ${ASTCENC_TEST}
COMMAND ${ASTCENC_TEST})
install(TARGETS ${ASTC_TEST} DESTINATION ${PACKAGE_ROOT})
install(TARGETS ${ASTCENC_TEST})
+79
View File
@@ -0,0 +1,79 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Unit tests for the image decompression functionality.
*/
#include <limits>
#include "gtest/gtest.h"
#include "../astcenc.h"
namespace astcenc
{

/**
 * @brief Test harness for exploring issue #447.
 *
 * Decompresses a single 16-byte compressed block into a 12x12 RGBA8 image
 * using the LDR profile, checks that context allocation and decompression
 * both report success, and prints every decoded texel for manual inspection.
 */
TEST(decode, decode12x12)
{
    astcenc_error status;
    astcenc_config config;

    // Start from a known value so a failed allocation can never leave an
    // indeterminate pointer behind.
    astcenc_context* context = nullptr;

    static const astcenc_swizzle swizzle {
        ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
    };

    // One compressed block; the first payload is disabled and kept only as an
    // alternate input that can be switched in by hand.
    uint8_t data[16] {
#if 0
        0x84,0x00,0x38,0xC8,0x00,0x00,0x00,0x00,
        0x00,0x00,0x00,0x00,0x00,0xB3,0x4D,0x78
#else
        0x29,0x00,0x1A,0x97,0x01,0x00,0x00,0x00,
        0x00,0x00,0x00,0x00,0x00,0xCF,0x97,0x86
#endif
    };

    // 12x12 texels, 4 bytes per texel
    uint8_t output[12*12*4];

    astcenc_config_init(ASTCENC_PRF_LDR, 12, 12, 1, ASTCENC_PRE_MEDIUM, 0, &config);

    // Fatal assertion: without a valid context the decompress call below
    // would operate on an invalid pointer, so stop the test here on failure.
    status = astcenc_context_alloc(&config, 1, &context);
    ASSERT_EQ(status, ASTCENC_SUCCESS);

    astcenc_image image;
    image.dim_x = 12;
    image.dim_y = 12;
    image.dim_z = 1;
    image.data_type = ASTCENC_TYPE_U8;

    // The image stores an array of per-slice pointers; there is one slice here
    uint8_t* slices = output;
    image.data = reinterpret_cast<void**>(&slices);

    status = astcenc_decompress_image(context, data, 16, &image, &swizzle, 0);
    EXPECT_EQ(status, ASTCENC_SUCCESS);

    for (int y = 0; y < 12; y++)
    {
        for (int x = 0; x < 12; x++)
        {
            uint8_t* pixel = output + (12 * 4 * y) + (4 * x);
            printf("[%2dx%2d] = %03d, %03d, %03d, %03d\n", x, y, pixel[0], pixel[1], pixel[2], pixel[3]);
        }
    }

    // Release the codec context allocated above so the test does not leak it
    astcenc_context_free(context);
}

}
+381 -107
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2022 Arm Limited
// Copyright 2020-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -31,15 +31,15 @@ namespace astcenc
// Misc utility tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
static int round_down(int x)
static unsigned int round_down(unsigned int x)
{
int remainder = x % ASTCENC_SIMD_WIDTH;
unsigned int remainder = x % ASTCENC_SIMD_WIDTH;
return x - remainder;
}
static int round_up(int x)
static unsigned int round_up(unsigned int x)
{
int remainder = x % ASTCENC_SIMD_WIDTH;
unsigned int remainder = x % ASTCENC_SIMD_WIDTH;
if (!remainder)
{
return x;
@@ -52,9 +52,9 @@ static int round_up(int x)
TEST(misc, RoundDownVLA)
{
// Static ones which are valid for all VLA widths
EXPECT_EQ(round_down_to_simd_multiple_vla(0), 0);
EXPECT_EQ(round_down_to_simd_multiple_vla(8), 8);
EXPECT_EQ(round_down_to_simd_multiple_vla(16), 16);
EXPECT_EQ(round_down_to_simd_multiple_vla(0), 0u);
EXPECT_EQ(round_down_to_simd_multiple_vla(8), 8u);
EXPECT_EQ(round_down_to_simd_multiple_vla(16), 16u);
// Variable ones which depend on VLA width
EXPECT_EQ(round_down_to_simd_multiple_vla(3), round_down(3));
@@ -67,9 +67,9 @@ TEST(misc, RoundDownVLA)
TEST(misc, RoundUpVLA)
{
// Static ones which are valid for all VLA widths
EXPECT_EQ(round_up_to_simd_multiple_vla(0), 0);
EXPECT_EQ(round_up_to_simd_multiple_vla(8), 8);
EXPECT_EQ(round_up_to_simd_multiple_vla(16), 16);
EXPECT_EQ(round_up_to_simd_multiple_vla(0), 0u);
EXPECT_EQ(round_up_to_simd_multiple_vla(8), 8u);
EXPECT_EQ(round_up_to_simd_multiple_vla(16), 16u);
// Variable ones which depend on VLA width
EXPECT_EQ(round_up_to_simd_multiple_vla(3), round_up(3));
@@ -540,27 +540,27 @@ TEST(vfloat4, ceq)
vfloat4 a1(1.0f, 2.0f, 3.0f, 4.0f);
vfloat4 b1(0.1f, 0.2f, 0.3f, 0.4f);
vmask4 r1 = a1 == b1;
EXPECT_EQ(0, mask(r1));
EXPECT_EQ(0u, mask(r1));
EXPECT_EQ(false, any(r1));
EXPECT_EQ(false, all(r1));
vfloat4 a2(1.0f, 2.0f, 3.0f, 4.0f);
vfloat4 b2(1.0f, 0.2f, 0.3f, 0.4f);
vmask4 r2 = a2 == b2;
EXPECT_EQ(0x1, mask(r2));
EXPECT_EQ(0x1u, mask(r2));
EXPECT_EQ(true, any(r2));
EXPECT_EQ(false, all(r2));
vfloat4 a3(1.0f, 2.0f, 3.0f, 4.0f);
vfloat4 b3(1.0f, 0.2f, 3.0f, 0.4f);
vmask4 r3 = a3 == b3;
EXPECT_EQ(0x5, mask(r3));
EXPECT_EQ(0x5u, mask(r3));
EXPECT_EQ(true, any(r3));
EXPECT_EQ(false, all(r3));
vfloat4 a4(1.0f, 2.0f, 3.0f, 4.0f);
vmask4 r4 = a4 == a4;
EXPECT_EQ(0xF, mask(r4));
EXPECT_EQ(0xFu, mask(r4));
EXPECT_EQ(true, any(r4));
EXPECT_EQ(true, all(r4));
}
@@ -571,27 +571,27 @@ TEST(vfloat4, cne)
vfloat4 a1(1.0f, 2.0f, 3.0f, 4.0f);
vfloat4 b1(0.1f, 0.2f, 0.3f, 0.4f);
vmask4 r1 = a1 != b1;
EXPECT_EQ(0xF, mask(r1));
EXPECT_EQ(0xFu, mask(r1));
EXPECT_EQ(true, any(r1));
EXPECT_EQ(true, all(r1));
vfloat4 a2(1.0f, 2.0f, 3.0f, 4.0f);
vfloat4 b2(1.0f, 0.2f, 0.3f, 0.4f);
vmask4 r2 = a2 != b2;
EXPECT_EQ(0xE, mask(r2));
EXPECT_EQ(0xEu, mask(r2));
EXPECT_EQ(true, any(r2));
EXPECT_EQ(false, all(r2));
vfloat4 a3(1.0f, 2.0f, 3.0f, 4.0f);
vfloat4 b3(1.0f, 0.2f, 3.0f, 0.4f);
vmask4 r3 = a3 != b3;
EXPECT_EQ(0xA, mask(r3));
EXPECT_EQ(0xAu, mask(r3));
EXPECT_EQ(true, any(r3));
EXPECT_EQ(false, all(r3));
vfloat4 a4(1.0f, 2.0f, 3.0f, 4.0f);
vmask4 r4 = a4 != a4;
EXPECT_EQ(0, mask(r4));
EXPECT_EQ(0u, mask(r4));
EXPECT_EQ(false, any(r4));
EXPECT_EQ(false, all(r4));
}
@@ -602,7 +602,7 @@ TEST(vfloat4, clt)
vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
vmask4 r = a < b;
EXPECT_EQ(0xA, mask(r));
EXPECT_EQ(0xAu, mask(r));
}
/** @brief Test vfloat4 cle. */
@@ -611,7 +611,7 @@ TEST(vfloat4, cle)
vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
vmask4 r = a <= b;
EXPECT_EQ(0xE, mask(r));
EXPECT_EQ(0xEu, mask(r));
}
/** @brief Test vfloat4 cgt. */
@@ -620,7 +620,7 @@ TEST(vfloat4, cgt)
vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
vmask4 r = a > b;
EXPECT_EQ(0x1, mask(r));
EXPECT_EQ(0x1u, mask(r));
}
/** @brief Test vfloat4 cge. */
@@ -629,7 +629,7 @@ TEST(vfloat4, cge)
vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
vfloat4 b(0.9f, 2.1f, 3.0f, 4.1f);
vmask4 r = a >= b;
EXPECT_EQ(0x5, mask(r));
EXPECT_EQ(0x5u, mask(r));
}
/** @brief Test vfloat4 min. */
@@ -894,7 +894,8 @@ TEST(vfloat4, select)
/** @brief Test vfloat4 select MSB only. */
TEST(vfloat4, select_msb)
{
vint4 msb(0x80000000, 0, 0x80000000, 0);
int msb_set = static_cast<int>(0x80000000);
vint4 msb(msb_set, 0, msb_set, 0);
vmask4 cond(msb.m);
vfloat4 a(1.0f, 3.0f, 3.0f, 1.0f);
@@ -929,7 +930,7 @@ TEST(vfloat4, gatherf)
/** @brief Test vfloat4 storea. */
TEST(vfloat4, storea)
{
alignas(16) float out[4];
ASTCENC_ALIGNAS float out[4];
vfloat4 a(f32_data);
storea(a, out);
EXPECT_EQ(out[0], 0.0f);
@@ -941,7 +942,7 @@ TEST(vfloat4, storea)
/** @brief Test vfloat4 store. */
TEST(vfloat4, store)
{
alignas(16) float out[5];
ASTCENC_ALIGNAS float out[5];
vfloat4 a(f32_data);
store(a, &(out[1]));
EXPECT_EQ(out[1], 0.0f);
@@ -1439,27 +1440,27 @@ TEST(vint4, ceq)
vint4 a1(1, 2, 3, 4);
vint4 b1(0, 1, 2, 3);
vmask4 r1 = a1 == b1;
EXPECT_EQ(0, mask(r1));
EXPECT_EQ(0u, mask(r1));
EXPECT_EQ(false, any(r1));
EXPECT_EQ(false, all(r1));
vint4 a2(1, 2, 3, 4);
vint4 b2(1, 0, 0, 0);
vmask4 r2 = a2 == b2;
EXPECT_EQ(0x1, mask(r2));
EXPECT_EQ(0x1u, mask(r2));
EXPECT_EQ(true, any(r2));
EXPECT_EQ(false, all(r2));
vint4 a3(1, 2, 3, 4);
vint4 b3(1, 0, 3, 0);
vmask4 r3 = a3 == b3;
EXPECT_EQ(0x5, mask(r3));
EXPECT_EQ(0x5u, mask(r3));
EXPECT_EQ(true, any(r3));
EXPECT_EQ(false, all(r3));
vint4 a4(1, 2, 3, 4);
vmask4 r4 = a4 == a4;
EXPECT_EQ(0xF, mask(r4));
EXPECT_EQ(0xFu, mask(r4));
EXPECT_EQ(true, any(r4));
EXPECT_EQ(true, all(r4));
}
@@ -1470,27 +1471,27 @@ TEST(vint4, cne)
vint4 a1(1, 2, 3, 4);
vint4 b1(0, 1, 2, 3);
vmask4 r1 = a1 != b1;
EXPECT_EQ(0xF, mask(r1));
EXPECT_EQ(0xFu, mask(r1));
EXPECT_EQ(true, any(r1));
EXPECT_EQ(true, all(r1));
vint4 a2(1, 2, 3, 4);
vint4 b2(1, 0, 0, 0);
vmask4 r2 = a2 != b2;
EXPECT_EQ(0xE, mask(r2));
EXPECT_EQ(0xEu, mask(r2));
EXPECT_EQ(true, any(r2));
EXPECT_EQ(false, all(r2));
vint4 a3(1, 2, 3, 4);
vint4 b3(1, 0, 3, 0);
vmask4 r3 = a3 != b3;
EXPECT_EQ(0xA, mask(r3));
EXPECT_EQ(0xAu, mask(r3));
EXPECT_EQ(true, any(r3));
EXPECT_EQ(false, all(r3));
vint4 a4(1, 2, 3, 4);
vmask4 r4 = a4 != a4;
EXPECT_EQ(0, mask(r4));
EXPECT_EQ(0u, mask(r4));
EXPECT_EQ(false, any(r4));
EXPECT_EQ(false, all(r4));
}
@@ -1501,7 +1502,7 @@ TEST(vint4, clt)
vint4 a(1, 2, 3, 4);
vint4 b(0, 3, 3, 5);
vmask4 r = a < b;
EXPECT_EQ(0xA, mask(r));
EXPECT_EQ(0xAu, mask(r));
}
/** @brief Test vint4 cgt. */
@@ -1510,7 +1511,7 @@ TEST(vint4, cle)
vint4 a(1, 2, 3, 4);
vint4 b(0, 3, 3, 5);
vmask4 r = a > b;
EXPECT_EQ(0x1, mask(r));
EXPECT_EQ(0x1u, mask(r));
}
/** @brief Test vint4 lsl. */
@@ -1544,7 +1545,7 @@ TEST(vint4, lsr)
EXPECT_EQ(a.lane<0>(), 1);
EXPECT_EQ(a.lane<1>(), 2);
EXPECT_EQ(a.lane<2>(), 4);
EXPECT_EQ(a.lane<3>(), 0xFFFFFFFC);
EXPECT_EQ(a.lane<3>(), static_cast<int>(0xFFFFFFFC));
a = lsr<1>(a);
EXPECT_EQ(a.lane<0>(), 0);
@@ -1681,7 +1682,8 @@ TEST(vint4, hadd_rgb_s)
/** @brief Test vint4 clz. */
TEST(vint4, clz)
{
vint4 a1(0x80000000, 0x40000000, 0x20000000, 0x10000000);
int msb_set = static_cast<int>(0x80000000);
vint4 a1(msb_set, 0x40000000, 0x20000000, 0x10000000);
vint4 r1 = clz(a1);
EXPECT_EQ(r1.lane<0>(), 0);
EXPECT_EQ(r1.lane<1>(), 1);
@@ -1723,7 +1725,7 @@ TEST(vint4, two_to_the_n)
/** @brief Test vint4 storea. */
TEST(vint4, storea)
{
alignas(16) int out[4];
ASTCENC_ALIGNAS int out[4];
vint4 a(s32_data);
storea(a, out);
EXPECT_EQ(out[0], 0);
@@ -1735,7 +1737,7 @@ TEST(vint4, storea)
/** @brief Test vint4 store. */
TEST(vint4, store)
{
alignas(16) int out[5];
ASTCENC_ALIGNAS int out[5];
vint4 a(s32_data);
store(a, &(out[1]));
EXPECT_EQ(out[1], 0);
@@ -1747,12 +1749,78 @@ TEST(vint4, store)
/** @brief Test vint4 store_nbytes. */
TEST(vint4, store_nbytes)
{
alignas(16) int out;
ASTCENC_ALIGNAS int out;
vint4 a(42, 314, 75, 90);
store_nbytes(a, (uint8_t*)&out);
store_nbytes(a, reinterpret_cast<uint8_t*>(&out));
EXPECT_EQ(out, 42);
}
/**
 * @brief Test vint4 store_lanes_masked.
 *
 * A single zero-filled buffer is reused for three stores in sequence, with
 * no lanes, half of the lanes, and all of the lanes enabled in the mask.
 * Each stage therefore builds on the bytes left behind by the previous one.
 */
TEST(vint4, store_lanes_masked)
{
	uint8_t buffer[16] { 0 };

	// Mask with no lanes set: the buffer must still read back as zero
	vmask4 lanes_none = vint4(0) == vint4(1);
	vint4 ones = vint4(1);
	store_lanes_masked(buffer, ones, lanes_none);
	EXPECT_TRUE(all(vint4::load(buffer) == vint4::zero()));

	// Mask with half of the lanes set: only those lanes take the new value
	vmask4 lanes_half = vint4(1, 1, 0, 0) == vint4(1);
	vint4 twos = vint4(2);
	store_lanes_masked(buffer, twos, lanes_half);
	EXPECT_TRUE(all(vint4::load(buffer) == vint4(2, 2, 0, 0)));

	// Mask with every lane set: the whole vector is overwritten
	vmask4 lanes_all = vint4(1) == vint4(1);
	vint4 threes = vint4(3);
	store_lanes_masked(buffer, threes, lanes_all);
	EXPECT_TRUE(all(vint4::load(buffer) == vint4(3)));
}
/**
 * @brief Test vint4 store_lanes_masked to unaligned address.
 *
 * Same three-stage sequence as the aligned test (no lanes, half of the lanes,
 * all lanes), but every store and load targets the buffer offset by one byte.
 * The buffer is one byte larger to leave room for that offset.
 */
TEST(vint4, store_lanes_masked_unaligned)
{
	uint8_t buffer[17] { 0 };
	uint8_t* target = buffer + 1;

	// Mask with no lanes set: the buffer must still read back as zero
	vmask4 lanes_none = vint4(0) == vint4(1);
	vint4 ones = vint4(1);
	store_lanes_masked(target, ones, lanes_none);
	EXPECT_TRUE(all(vint4::load(target) == vint4::zero()));

	// Mask with half of the lanes set: only those lanes take the new value
	vmask4 lanes_half = vint4(1, 1, 0, 0) == vint4(1);
	vint4 twos = vint4(2);
	store_lanes_masked(target, twos, lanes_half);
	EXPECT_TRUE(all(vint4::load(target) == vint4(2, 2, 0, 0)));

	// Mask with every lane set: the whole vector is overwritten
	vmask4 lanes_all = vint4(1) == vint4(1);
	vint4 threes = vint4(3);
	store_lanes_masked(target, threes, lanes_all);
	EXPECT_TRUE(all(vint4::load(target) == vint4(3)));
}
/** @brief Test vint4 gatheri. */
TEST(vint4, gatheri)
{
@@ -1799,38 +1867,38 @@ TEST(vint4, select)
/** @brief Test vmask4 scalar literal constructor. */
TEST(vmask4, scalar_literal_construct)
{
vfloat4 m1a(0, 0, 0, 0);
vfloat4 m1b(1, 1, 1, 1);
vfloat4 m1a(0.0f, 0.0f, 0.0f, 0.0f);
vfloat4 m1b(1.0f, 1.0f, 1.0f, 1.0f);
vmask4 m1(true);
vfloat4 r = select(m1a, m1b, m1);
EXPECT_EQ(r.lane<0>(), 1);
EXPECT_EQ(r.lane<1>(), 1);
EXPECT_EQ(r.lane<2>(), 1);
EXPECT_EQ(r.lane<3>(), 1);
EXPECT_EQ(r.lane<0>(), 1.0f);
EXPECT_EQ(r.lane<1>(), 1.0f);
EXPECT_EQ(r.lane<2>(), 1.0f);
EXPECT_EQ(r.lane<3>(), 1.0f);
r = select(m1b, m1a, m1);
EXPECT_EQ(r.lane<0>(), 0);
EXPECT_EQ(r.lane<1>(), 0);
EXPECT_EQ(r.lane<2>(), 0);
EXPECT_EQ(r.lane<3>(), 0);
EXPECT_EQ(r.lane<0>(), 0.0f);
EXPECT_EQ(r.lane<1>(), 0.0f);
EXPECT_EQ(r.lane<2>(), 0.0f);
EXPECT_EQ(r.lane<3>(), 0.0f);
}
/** @brief Test vmask4 literal constructor. */
TEST(vmask4, literal_construct)
{
vfloat4 m1a(0, 0, 0, 0);
vfloat4 m1b(1, 1, 1, 1);
vfloat4 m1a(0.0f, 0.0f, 0.0f, 0.0f);
vfloat4 m1b(1.0f, 1.0f, 1.0f, 1.0f);
vmask4 m1(true, false, true, false);
vfloat4 r = select(m1a, m1b, m1);
EXPECT_EQ(r.lane<0>(), 1);
EXPECT_EQ(r.lane<1>(), 0);
EXPECT_EQ(r.lane<2>(), 1);
EXPECT_EQ(r.lane<3>(), 0);
EXPECT_EQ(r.lane<0>(), 1.0f);
EXPECT_EQ(r.lane<1>(), 0.0f);
EXPECT_EQ(r.lane<2>(), 1.0f);
EXPECT_EQ(r.lane<3>(), 0.0f);
}
/** @brief Test vmask4 or. */
@@ -1845,7 +1913,7 @@ TEST(vmask4, or)
vmask4 m2 = m2a == m2b;
vmask4 r = m1 | m2;
EXPECT_EQ(mask(r), 0xB);
EXPECT_EQ(mask(r), 0xBu);
}
/** @brief Test vmask4 and. */
@@ -1860,7 +1928,7 @@ TEST(vmask4, and)
vmask4 m2 = m2a == m2b;
vmask4 r = m1 & m2;
EXPECT_EQ(mask(r), 0x2);
EXPECT_EQ(mask(r), 0x2u);
}
/** @brief Test vmask4 xor. */
@@ -1875,7 +1943,7 @@ TEST(vmask4, xor)
vmask4 m2 = m2a == m2b;
vmask4 r = m1 ^ m2;
EXPECT_EQ(mask(r), 0x9);
EXPECT_EQ(mask(r), 0x9u);
}
/** @brief Test vmask4 not. */
@@ -1885,7 +1953,63 @@ TEST(vmask4, not)
vfloat4 m1b(1, 1, 1, 1);
vmask4 m1 = m1a == m1b;
vmask4 r = ~m1;
EXPECT_EQ(mask(r), 0x5);
EXPECT_EQ(mask(r), 0x5u);
}
/**
 * @brief Test vint4 table permute using a table built from two vint4 values.
 *
 * The source tables are passed through vtable_prepare before the lookup.
 * Expected results are written in hex so they can be matched by eye against
 * the byte values in the table constants.
 */
TEST(vint4, vtable_8bt_32bi_32entry)
{
	vint4 src0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
	vint4 src1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);

	vint4 prep0, prep1;
	vtable_prepare(src0, src1, prep0, prep1);

	vint4 lookup(0, 7, 4, 31);
	vint4 found = vtable_8bt_32bi(prep0, prep1, lookup);

	EXPECT_EQ(found.lane<0>(), 0x03);
	EXPECT_EQ(found.lane<1>(), 0x04);
	EXPECT_EQ(found.lane<2>(), 0x07);
	EXPECT_EQ(found.lane<3>(), 0x1c);
}
/**
 * @brief Test vint4 table permute using a table built from four vint4 values.
 *
 * The source tables are passed through vtable_prepare before the lookup.
 * Expected results are written in hex so they can be matched by eye against
 * the byte values in the table constants.
 */
TEST(vint4, vtable_8bt_32bi_64entry)
{
	vint4 src0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
	vint4 src1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
	vint4 src2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f);
	vint4 src3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);

	vint4 prep0, prep1, prep2, prep3;
	vtable_prepare(src0, src1, src2, src3, prep0, prep1, prep2, prep3);

	vint4 lookup(0, 7, 38, 63);
	vint4 found = vtable_8bt_32bi(prep0, prep1, prep2, prep3, lookup);

	EXPECT_EQ(found.lane<0>(), 0x03);
	EXPECT_EQ(found.lane<1>(), 0x04);
	EXPECT_EQ(found.lane<2>(), 0x25);
	EXPECT_EQ(found.lane<3>(), 0x3c);
}
/**
 * @brief Test vint4 rgba byte interleave.
 *
 * Each input vector carries one 8-bit channel value per lane. The test
 * expects lane N of the result to hold the four channel values of lane N
 * packed into one 32-bit value, with red in the lowest byte and alpha in
 * the highest.
 */
TEST(vint4, interleave_rgba8)
{
	vint4 red(0x01, 0x11, 0x21, 0x31);
	vint4 green(0x02, 0x12, 0x22, 0x32);
	vint4 blue(0x03, 0x13, 0x23, 0x33);
	vint4 alpha(0x04, 0x14, 0x24, 0x34);

	vint4 packed = interleave_rgba8(red, green, blue, alpha);

	EXPECT_EQ(packed.lane<0>(), 0x04030201);
	EXPECT_EQ(packed.lane<1>(), 0x14131211);
	EXPECT_EQ(packed.lane<2>(), 0x24232221);
	EXPECT_EQ(packed.lane<3>(), 0x34333231);
}
# if ASTCENC_SIMD_WIDTH == 8
@@ -2142,27 +2266,27 @@ TEST(vfloat8, ceq)
vfloat8 a1(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
vfloat8 b1(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
vmask8 r1 = a1 == b1;
EXPECT_EQ(0, mask(r1));
EXPECT_EQ(0u, mask(r1));
EXPECT_EQ(false, any(r1));
EXPECT_EQ(false, all(r1));
vfloat8 a2(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
vfloat8 b2(1.0f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
vmask8 r2 = a2 == b2;
EXPECT_EQ(0x1, mask(r2));
EXPECT_EQ(0x1u, mask(r2));
EXPECT_EQ(true, any(r2));
EXPECT_EQ(false, all(r2));
vfloat8 a3(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
vfloat8 b3(1.0f, 0.2f, 3.0f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
vmask8 r3 = a3 == b3;
EXPECT_EQ(0x5, mask(r3));
EXPECT_EQ(0x5u, mask(r3));
EXPECT_EQ(true, any(r3));
EXPECT_EQ(false, all(r3));
vfloat8 a4(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
vmask8 r4 = a4 == a4;
EXPECT_EQ(0xFF, mask(r4));
EXPECT_EQ(0xFFu, mask(r4));
EXPECT_EQ(true, any(r4));
EXPECT_EQ(true, all(r4));
}
@@ -2173,27 +2297,27 @@ TEST(vfloat8, cne)
vfloat8 a1(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
vfloat8 b1(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
vmask8 r1 = a1 != b1;
EXPECT_EQ(0xFF, mask(r1));
EXPECT_EQ(0xFFu, mask(r1));
EXPECT_EQ(true, any(r1));
EXPECT_EQ(true, all(r1));
vfloat8 a2(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
vfloat8 b2(1.0f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
vmask8 r2 = a2 != b2;
EXPECT_EQ(0xFE, mask(r2));
EXPECT_EQ(0xFEu, mask(r2));
EXPECT_EQ(true, any(r2));
EXPECT_EQ(false, all(r2));
vfloat8 a3(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
vfloat8 b3(1.0f, 0.2f, 3.0f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
vmask8 r3 = a3 != b3;
EXPECT_EQ(0xFA, mask(r3));
EXPECT_EQ(0xFAu, mask(r3));
EXPECT_EQ(true, any(r3));
EXPECT_EQ(false, all(r3));
vfloat8 a4(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
vmask8 r4 = a4 != a4;
EXPECT_EQ(0, mask(r4));
EXPECT_EQ(0u, mask(r4));
EXPECT_EQ(false, any(r4));
EXPECT_EQ(false, all(r4));
}
@@ -2204,7 +2328,7 @@ TEST(vfloat8, clt)
vfloat8 a(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
vfloat8 b(0.9f, 2.1f, 3.0f, 4.1f, 0.9f, 2.1f, 3.0f, 4.1f);
vmask8 r = a < b;
EXPECT_EQ(0xAA, mask(r));
EXPECT_EQ(0xAAu, mask(r));
}
/** @brief Test vfloat8 cle. */
@@ -2213,7 +2337,7 @@ TEST(vfloat8, cle)
vfloat8 a(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
vfloat8 b(0.9f, 2.1f, 3.0f, 4.1f, 0.9f, 2.1f, 3.0f, 4.1f);
vmask8 r = a <= b;
EXPECT_EQ(0xEE, mask(r));
EXPECT_EQ(0xEEu, mask(r));
}
/** @brief Test vfloat8 cgt. */
@@ -2222,7 +2346,7 @@ TEST(vfloat8, cgt)
vfloat8 a(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
vfloat8 b(0.9f, 2.1f, 3.0f, 4.1f, 0.9f, 2.1f, 3.0f, 4.1f);
vmask8 r = a > b;
EXPECT_EQ(0x11, mask(r));
EXPECT_EQ(0x11u, mask(r));
}
/** @brief Test vfloat8 cge. */
@@ -2231,7 +2355,7 @@ TEST(vfloat8, cge)
vfloat8 a(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
vfloat8 b(0.9f, 2.1f, 3.0f, 4.1f, 0.9f, 2.1f, 3.0f, 4.1f);
vmask8 r = a >= b;
EXPECT_EQ(0x55, mask(r));
EXPECT_EQ(0x55u, mask(r));
}
/** @brief Test vfloat8 min. */
@@ -2510,7 +2634,8 @@ TEST(vfloat8, select)
/** @brief Test vfloat8 select MSB only. */
TEST(vfloat8, select_msb)
{
vint8 msb(0x80000000, 0, 0x80000000, 0, 0x80000000, 0, 0x80000000, 0);
int msb_set = static_cast<int>(0x80000000);
vint8 msb(msb_set, 0, msb_set, 0, msb_set, 0, msb_set, 0);
vmask8 cond(msb.m);
vfloat8 a(1.0f, 3.0f, 3.0f, 1.0f, 1.0f, 3.0f, 3.0f, 1.0f);
@@ -2527,7 +2652,6 @@ TEST(vfloat8, select_msb)
EXPECT_EQ(r1.lane<6>(), 2.0f);
EXPECT_EQ(r1.lane<7>(), 1.0f);
// Select in the other
vfloat8 r2 = select(b, a, cond);
EXPECT_EQ(r2.lane<0>(), 1.0f);
@@ -2867,27 +2991,27 @@ TEST(vint8, ceq)
vint8 a1(1, 2, 3, 4, 1, 2, 3, 4);
vint8 b1(0, 1, 2, 3, 0, 1, 2, 3);
vmask8 r1 = a1 == b1;
EXPECT_EQ(0, mask(r1));
EXPECT_EQ(0u, mask(r1));
EXPECT_EQ(false, any(r1));
EXPECT_EQ(false, all(r1));
vint8 a2(1, 2, 3, 4, 1, 2, 3, 4);
vint8 b2(1, 0, 0, 0, 1, 0, 0, 0);
vmask8 r2 = a2 == b2;
EXPECT_EQ(0x11, mask(r2));
EXPECT_EQ(0x11u, mask(r2));
EXPECT_EQ(true, any(r2));
EXPECT_EQ(false, all(r2));
vint8 a3(1, 2, 3, 4, 1, 2, 3, 4);
vint8 b3(1, 0, 3, 0, 1, 0, 3, 0);
vmask8 r3 = a3 == b3;
EXPECT_EQ(0x55, mask(r3));
EXPECT_EQ(0x55u, mask(r3));
EXPECT_EQ(true, any(r3));
EXPECT_EQ(false, all(r3));
vint8 a4(1, 2, 3, 4, 1, 2, 3, 4);
vmask8 r4 = a4 == a4;
EXPECT_EQ(0xFF, mask(r4));
EXPECT_EQ(0xFFu, mask(r4));
EXPECT_EQ(true, any(r4));
EXPECT_EQ(true, all(r4));
}
@@ -2898,27 +3022,27 @@ TEST(vint8, cne)
vint8 a1(1, 2, 3, 4, 1, 2, 3, 4);
vint8 b1(0, 1, 2, 3, 0, 1, 2, 3);
vmask8 r1 = a1 != b1;
EXPECT_EQ(0xFF, mask(r1));
EXPECT_EQ(0xFFu, mask(r1));
EXPECT_EQ(true, any(r1));
EXPECT_EQ(true, all(r1));
vint8 a2(1, 2, 3, 4, 1, 2, 3, 4);
vint8 b2(1, 0, 0, 0, 1, 0, 0, 0);
vmask8 r2 = a2 != b2;
EXPECT_EQ(0xEE, mask(r2));
EXPECT_EQ(0xEEu, mask(r2));
EXPECT_EQ(true, any(r2));
EXPECT_EQ(false, all(r2));
vint8 a3(1, 2, 3, 4, 1, 2, 3, 4);
vint8 b3(1, 0, 3, 0, 1, 0, 3, 0);
vmask8 r3 = a3 != b3;
EXPECT_EQ(0xAA, mask(r3));
EXPECT_EQ(0xAAu, mask(r3));
EXPECT_EQ(true, any(r3));
EXPECT_EQ(false, all(r3));
vint8 a4(1, 2, 3, 4, 1, 2, 3, 4);
vmask8 r4 = a4 != a4;
EXPECT_EQ(0, mask(r4));
EXPECT_EQ(0u, mask(r4));
EXPECT_EQ(false, any(r4));
EXPECT_EQ(false, all(r4));
}
@@ -2929,7 +3053,7 @@ TEST(vint8, clt)
vint8 a(1, 2, 3, 4, 1, 2, 3, 4);
vint8 b(0, 3, 3, 5, 0, 3, 3, 5);
vmask8 r = a < b;
EXPECT_EQ(0xAA, mask(r));
EXPECT_EQ(0xAAu, mask(r));
}
/** @brief Test vint8 cgt. */
@@ -2938,7 +3062,7 @@ TEST(vint8, cgt)
vint8 a(1, 2, 3, 4, 1, 2, 3, 4);
vint8 b(0, 3, 3, 5, 0, 3, 3, 5);
vmask8 r = a > b;
EXPECT_EQ(0x11, mask(r));
EXPECT_EQ(0x11u, mask(r));
}
/** @brief Test vint8 min. */
@@ -2973,30 +3097,66 @@ TEST(vint8, max)
EXPECT_EQ(r.lane<7>(), 5);
}
/**
 * @brief Test vint8 lsl.
 *
 * Shifts of 0, 1, and 2 bits are chained, each one applied to the result of
 * the previous shift, so the values checked correspond to total shifts of
 * 0, 1, and 3 bits from the starting vector.
 */
TEST(vint8, lsl)
{
	vint8 input(1, 2, 4, -4, 1, 2, 4, -4);

	// Shift by zero leaves every lane unchanged
	vint8 total0 = lsl<0>(input);
	EXPECT_EQ(total0.lane<0>(), 1);
	EXPECT_EQ(total0.lane<1>(), 2);
	EXPECT_EQ(total0.lane<2>(), 4);
	EXPECT_EQ(total0.lane<3>(), static_cast<int>(0xFFFFFFFC));
	EXPECT_EQ(total0.lane<4>(), 1);
	EXPECT_EQ(total0.lane<5>(), 2);
	EXPECT_EQ(total0.lane<6>(), 4);
	EXPECT_EQ(total0.lane<7>(), static_cast<int>(0xFFFFFFFC));

	// One further bit on top of the previous result
	vint8 total1 = lsl<1>(total0);
	EXPECT_EQ(total1.lane<0>(), 2);
	EXPECT_EQ(total1.lane<1>(), 4);
	EXPECT_EQ(total1.lane<2>(), 8);
	EXPECT_EQ(total1.lane<3>(), static_cast<int>(0xFFFFFFF8));
	EXPECT_EQ(total1.lane<4>(), 2);
	EXPECT_EQ(total1.lane<5>(), 4);
	EXPECT_EQ(total1.lane<6>(), 8);
	EXPECT_EQ(total1.lane<7>(), static_cast<int>(0xFFFFFFF8));

	// Two further bits on top of the previous result
	vint8 total3 = lsl<2>(total1);
	EXPECT_EQ(total3.lane<0>(), 8);
	EXPECT_EQ(total3.lane<1>(), 16);
	EXPECT_EQ(total3.lane<2>(), 32);
	EXPECT_EQ(total3.lane<3>(), static_cast<int>(0xFFFFFFE0));
	EXPECT_EQ(total3.lane<4>(), 8);
	EXPECT_EQ(total3.lane<5>(), 16);
	EXPECT_EQ(total3.lane<6>(), 32);
	EXPECT_EQ(total3.lane<7>(), static_cast<int>(0xFFFFFFE0));
}
/** @brief Test vint8 lsr. */
TEST(vint8, lsr)
{
vint8 a(1, 2, 4, -4, 1, 2, 4, -4);
a = lsr<0>(a);
EXPECT_EQ(a.lane<0>(), 1);
EXPECT_EQ(a.lane<1>(), 2);
EXPECT_EQ(a.lane<2>(), 4);
EXPECT_EQ(a.lane<3>(), 0xFFFFFFFC);
EXPECT_EQ(a.lane<4>(), 1);
EXPECT_EQ(a.lane<5>(), 2);
EXPECT_EQ(a.lane<6>(), 4);
EXPECT_EQ(a.lane<7>(), 0xFFFFFFFC);
EXPECT_EQ(a.lane<0>(), 1);
EXPECT_EQ(a.lane<1>(), 2);
EXPECT_EQ(a.lane<2>(), 4);
EXPECT_EQ(a.lane<3>(), static_cast<int>(0xFFFFFFFC));
EXPECT_EQ(a.lane<4>(), 1);
EXPECT_EQ(a.lane<5>(), 2);
EXPECT_EQ(a.lane<6>(), 4);
EXPECT_EQ(a.lane<7>(), static_cast<int>(0xFFFFFFFC));
a = lsr<1>(a);
EXPECT_EQ(a.lane<0>(), 0);
EXPECT_EQ(a.lane<1>(), 1);
EXPECT_EQ(a.lane<2>(), 2);
EXPECT_EQ(a.lane<3>(), 0x7FFFFFFE);
EXPECT_EQ(a.lane<4>(), 0);
EXPECT_EQ(a.lane<5>(), 1);
EXPECT_EQ(a.lane<6>(), 2);
EXPECT_EQ(a.lane<7>(), 0x7FFFFFFE);
EXPECT_EQ(a.lane<0>(), 0);
EXPECT_EQ(a.lane<1>(), 1);
EXPECT_EQ(a.lane<2>(), 2);
EXPECT_EQ(a.lane<3>(), 0x7FFFFFFE);
EXPECT_EQ(a.lane<4>(), 0);
EXPECT_EQ(a.lane<5>(), 1);
EXPECT_EQ(a.lane<6>(), 2);
EXPECT_EQ(a.lane<7>(), 0x7FFFFFFE);
a = lsr<2>(a);
EXPECT_EQ(a.lane<0>(), 0);
@@ -3134,11 +3294,77 @@ TEST(vint8, store_nbytes)
{
alignas(32) int out[2];
vint8 a(42, 314, 75, 90, 42, 314, 75, 90);
store_nbytes(a, (uint8_t*)&out);
store_nbytes(a, reinterpret_cast<uint8_t*>(&out));
EXPECT_EQ(out[0], 42);
EXPECT_EQ(out[1], 314);
}
/**
 * @brief Test vint8 store_lanes_masked.
 *
 * A single zero-filled buffer is reused for three stores in sequence, with
 * no lanes, half of the lanes, and all of the lanes enabled in the mask.
 * Each stage therefore builds on the bytes left behind by the previous one.
 */
TEST(vint8, store_lanes_masked)
{
	uint8_t buffer[32] { 0 };

	// Mask with no lanes set: the buffer must still read back as zero
	vmask8 lanes_none = vint8(0) == vint8(1);
	vint8 ones = vint8(1);
	store_lanes_masked(buffer, ones, lanes_none);
	EXPECT_TRUE(all(vint8::load(buffer) == vint8::zero()));

	// Mask with half of the lanes set: only those lanes take the new value
	vmask8 lanes_half = vint8(1, 1, 1, 1, 0, 0, 0, 0) == vint8(1);
	vint8 twos = vint8(2);
	store_lanes_masked(buffer, twos, lanes_half);
	EXPECT_TRUE(all(vint8::load(buffer) == vint8(2, 2, 2, 2, 0, 0, 0, 0)));

	// Mask with every lane set: the whole vector is overwritten
	vmask8 lanes_all = vint8(1) == vint8(1);
	vint8 threes = vint8(3);
	store_lanes_masked(buffer, threes, lanes_all);
	EXPECT_TRUE(all(vint8::load(buffer) == vint8(3)));
}
/**
 * @brief Test vint8 store_lanes_masked to unaligned address.
 *
 * Same three-stage sequence as the aligned test (no lanes, half of the lanes,
 * all lanes), but every store and load targets the buffer offset by one byte.
 * The buffer is one byte larger to leave room for that offset.
 */
TEST(vint8, store_lanes_masked_unaligned)
{
	uint8_t buffer[33] { 0 };
	uint8_t* target = buffer + 1;

	// Mask with no lanes set: the buffer must still read back as zero
	vmask8 lanes_none = vint8(0) == vint8(1);
	vint8 ones = vint8(1);
	store_lanes_masked(target, ones, lanes_none);
	EXPECT_TRUE(all(vint8::load(target) == vint8::zero()));

	// Mask with half of the lanes set: only those lanes take the new value
	vmask8 lanes_half = vint8(1, 1, 1, 1, 0, 0, 0, 0) == vint8(1);
	vint8 twos = vint8(2);
	store_lanes_masked(target, twos, lanes_half);
	EXPECT_TRUE(all(vint8::load(target) == vint8(2, 2, 2, 2, 0, 0, 0, 0)));

	// Mask with every lane set: the whole vector is overwritten
	vmask8 lanes_all = vint8(1) == vint8(1);
	vint8 threes = vint8(3);
	store_lanes_masked(target, threes, lanes_all);
	EXPECT_TRUE(all(vint8::load(target) == vint8(3)));
}
/** @brief Test vint8 gatheri. */
TEST(vint8, gatheri)
{
@@ -3225,7 +3451,7 @@ TEST(vmask8, or)
vmask8 m2 = m2a == m2b;
vmask8 r = m1 | m2;
EXPECT_EQ(mask(r), 0xBB);
EXPECT_EQ(mask(r), 0xBBu);
}
/** @brief Test vmask8 and. */
@@ -3240,7 +3466,7 @@ TEST(vmask8, and)
vmask8 m2 = m2a == m2b;
vmask8 r = m1 & m2;
EXPECT_EQ(mask(r), 0x22);
EXPECT_EQ(mask(r), 0x22u);
}
/** @brief Test vmask8 xor. */
@@ -3255,7 +3481,7 @@ TEST(vmask8, xor)
vmask8 m2 = m2a == m2b;
vmask8 r = m1 ^ m2;
EXPECT_EQ(mask(r), 0x99);
EXPECT_EQ(mask(r), 0x99u);
}
/** @brief Test vmask8 not. */
@@ -3265,7 +3491,55 @@ TEST(vmask8, not)
vfloat8 m1b(1, 1, 1, 1, 1, 1, 1, 1);
vmask8 m1 = m1a == m1b;
vmask8 r = ~m1;
EXPECT_EQ(mask(r), 0x55);
EXPECT_EQ(mask(r), 0x55u);
}
/**
 * @brief Test vint8 table permute using a table built from two vint4 values.
 *
 * The vint4 source tables are passed through vtable_prepare to produce the
 * vint8 tables used by the lookup. Expected results are written in hex so
 * they can be matched by eye against the byte values in the table constants.
 */
TEST(vint8, vtable_8bt_32bi_32entry)
{
	vint4 src0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
	vint4 src1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);

	vint8 prep0, prep1;
	vtable_prepare(src0, src1, prep0, prep1);

	vint8 lookup(0, 7, 4, 15, 16, 20, 23, 31);
	vint8 found = vtable_8bt_32bi(prep0, prep1, lookup);

	EXPECT_EQ(found.lane<0>(), 0x03);
	EXPECT_EQ(found.lane<1>(), 0x04);
	EXPECT_EQ(found.lane<2>(), 0x07);
	EXPECT_EQ(found.lane<3>(), 0x0c);
	EXPECT_EQ(found.lane<4>(), 0x13);
	EXPECT_EQ(found.lane<5>(), 0x17);
	EXPECT_EQ(found.lane<6>(), 0x14);
	EXPECT_EQ(found.lane<7>(), 0x1c);
}
/**
 * @brief Test vint8 table permute using a table built from four vint4 values.
 *
 * The vint4 source tables are passed through vtable_prepare to produce the
 * vint8 tables used by the lookup. Expected results are written in hex so
 * they can be matched by eye against the byte values in the table constants.
 */
TEST(vint8, vtable_8bt_32bi_64entry)
{
	vint4 src0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
	vint4 src1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
	vint4 src2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f);
	vint4 src3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);

	vint8 prep0, prep1, prep2, prep3;
	vtable_prepare(src0, src1, src2, src3, prep0, prep1, prep2, prep3);

	vint8 lookup(0, 7, 4, 15, 16, 20, 38, 63);
	vint8 found = vtable_8bt_32bi(prep0, prep1, prep2, prep3, lookup);

	EXPECT_EQ(found.lane<0>(), 0x03);
	EXPECT_EQ(found.lane<1>(), 0x04);
	EXPECT_EQ(found.lane<2>(), 0x07);
	EXPECT_EQ(found.lane<3>(), 0x0c);
	EXPECT_EQ(found.lane<4>(), 0x13);
	EXPECT_EQ(found.lane<5>(), 0x17);
	EXPECT_EQ(found.lane<6>(), 0x25);
	EXPECT_EQ(found.lane<7>(), 0x3c);
}
#endif
+125 -66
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2022 Arm Limited
// Copyright 2020-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -43,10 +43,18 @@
* for faster processing. The caller is responsible for creating the worker threads, and
* synchronizing between images.
*
* Extended instruction set support
* ================================
*
* This library supports use of extended instruction sets, such as SSE4.1 and AVX2. These are
* enabled at compile time when building the library. There is no runtime checking in the core
* library that the instruction sets used are actually available. Checking compatibility is the
* responsibility of the calling code.
*
* Threading
* =========
*
* In pseudocode, the usage for manual user threading looks like this:
* In pseudo-code, the usage for manual user threading looks like this:
*
* // Configure the compressor run
* astcenc_config my_config;
@@ -74,7 +82,7 @@
*
* The codec supports compressing single images, which can be either 2D images or volumetric 3D
* images. Calling code is responsible for any handling of aggregate types, such as mipmap chains,
* texture arrays, or sliced 3D textures
* texture arrays, or sliced 3D textures.
*
* Images are passed in as an astcenc_image structure. Inputs can be either 8-bit unorm, 16-bit
* half-float, or 32-bit float, as indicated by the data_type field.
@@ -82,7 +90,7 @@
* Images can be any dimension; there is no requirement to be a multiple of the ASTC block size.
*
* Data is always passed in as 4 color components, and accessed as an array of 2D image slices. Data
* within an image slice is always tightly packed without padding. Addresing looks like this:
* within an image slice is always tightly packed without padding. Addressing looks like this:
*
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 ] // Red
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 1] // Green
@@ -112,8 +120,8 @@
* Input images must contain unit-length normalized vectors and should be passed in using a two component
* swizzle. The astcenc command line tool defaults to an RRRG swizzle, but some developers prefer
* to use GGGR for compatibility with BC5n which will work just as well. The Z component can be
* recovered programatically in shader code, using knowledge that the vector is unit length and that
* Z must be positive for a tangent-space normal map.
* recovered programmatically in shader code, using knowledge that the vector is unit length and
* that Z must be positive for a tangent-space normal map.
*
* Decompress-only usage
* =====================
@@ -215,8 +223,6 @@ enum astcenc_error {
ASTCENC_ERR_OUT_OF_MEM,
/** @brief The call failed due to the build using fast math. */
ASTCENC_ERR_BAD_CPU_FLOAT,
/** @brief The call failed due to the build using an unsupported ISA. */
ASTCENC_ERR_BAD_CPU_ISA,
/** @brief The call failed due to an out-of-spec parameter. */
ASTCENC_ERR_BAD_PARAM,
/** @brief The call failed due to an out-of-spec block size. */
@@ -233,6 +239,8 @@ enum astcenc_error {
ASTCENC_ERR_BAD_CONTEXT,
/** @brief The call failed due to unimplemented functionality. */
ASTCENC_ERR_NOT_IMPLEMENTED,
/** @brief The call failed due to an out-of-spec decode mode flag set. */
ASTCENC_ERR_BAD_DECODE_MODE,
#if defined(ASTCENC_DIAGNOSTICS)
/** @brief The call failed due to an issue with diagnostic tracing. */
ASTCENC_ERR_DTRACE_FAILURE,
@@ -265,9 +273,12 @@ static const float ASTCENC_PRE_FAST = 10.0f;
/** @brief The medium quality search preset. */
static const float ASTCENC_PRE_MEDIUM = 60.0f;
/** @brief The throrough quality search preset. */
/** @brief The thorough quality search preset. */
static const float ASTCENC_PRE_THOROUGH = 98.0f;
/** @brief The very thorough quality search preset. */
static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
/** @brief The exhaustive, highest quality, search preset. */
static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
@@ -320,6 +331,11 @@ enum astcenc_type
ASTCENC_TYPE_F32 = 2
};
/**
 * @brief Function pointer type for compression progress reporting callback.
 *
 * The callback has C linkage, returns nothing, and receives a single float argument.
 *
 * NOTE(review): the meaning and range of the float argument (presumably a completion
 * percentage) are not visible at this declaration; confirm against the code that invokes it.
 */
extern "C" typedef void (*astcenc_progress_callback)(float);
/**
* @brief Enable normal map compression.
*
@@ -331,35 +347,17 @@ enum astcenc_type
static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0;
/**
* @brief Enable mask map compression.
* @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
*
* Input data will be treated a multi-layer mask map, where is is desirable for the color components
* to be treated independently for the purposes of error analysis.
* The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
* flag during compression will allow the compressor to use the correct rounding when selecting
* encodings. This will improve the compressed image quality if your application is using the
* decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
*
* Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
* this setting.
*/
static const unsigned int ASTCENC_FLG_MAP_MASK = 1 << 1;
/**
* @brief Enable RGBM map compression.
*
* Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
* format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
* compression function, this flag is only used to control the use of RGBM-specific heuristics and
* error metrics.
*
* IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
* M values can round to zero due to quantization and result in black or white pixels. It is highly
* recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
* 16 or 32). Applying this threshold reduces the number of very dark colors that can be
* represented, but is still higher precision than 8-bit LDR.
*
* When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale
* factor used during reconstruction. This defaults to 5 when in RGBM mode.
*
* It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier
* scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode,
* matching the default scale factor.
*/
static const unsigned int ASTCENC_FLG_MAP_RGBM = 1 << 6;
static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8 = 1 << 1;
/**
* @brief Enable alpha weighting.
@@ -396,15 +394,38 @@ static const unsigned int ASTCENC_FLG_DECOMPRESS_ONLY = 1 << 4;
*/
static const unsigned int ASTCENC_FLG_SELF_DECOMPRESS_ONLY = 1 << 5;
/**
 * @brief Enable RGBM map compression.
 *
 * Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
 * format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
 * compression function; this flag is only used to control the use of RGBM-specific heuristics and
 * error metrics.
 *
 * IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
 * M values can round to zero due to quantization and result in black or white pixels. It is highly
 * recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
 * 16 or 32). Applying this threshold reduces the number of very dark colors that can be
 * represented, but is still higher precision than 8-bit LDR.
 *
 * When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale
 * factor used during reconstruction. This defaults to 5 when in RGBM mode.
 *
 * It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier
 * scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode,
 * matching the default scale factor.
 */
static const unsigned int ASTCENC_FLG_MAP_RGBM = 1 << 6;
/**
* @brief The bit mask of all valid flags.
*/
static const unsigned int ASTCENC_ALL_FLAGS =
ASTCENC_FLG_MAP_MASK |
ASTCENC_FLG_MAP_NORMAL |
ASTCENC_FLG_MAP_RGBM |
ASTCENC_FLG_USE_ALPHA_WEIGHT |
ASTCENC_FLG_USE_PERCEPTUAL |
ASTCENC_FLG_USE_DECODE_UNORM8 |
ASTCENC_FLG_DECOMPRESS_ONLY |
ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
@@ -452,7 +473,7 @@ struct astcenc_config
*
* It is recommended that this is set to 1 when using FLG_USE_ALPHA_WEIGHT on a texture that
* will be sampled using linear texture filtering to minimize color bleed out of transparent
* texels that are adjcent to non-transparent texels.
* texels that are adjacent to non-transparent texels.
*/
unsigned int a_scale_radius;
@@ -467,11 +488,25 @@ struct astcenc_config
unsigned int tune_partition_count_limit;
/**
* @brief The maximum number of partitions searched (-partitionindexlimit).
* @brief The maximum number of partitions searched (-2partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_partition_index_limit;
unsigned int tune_2partition_index_limit;
/**
* @brief The maximum number of partitions searched (-3partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_3partition_index_limit;
/**
* @brief The maximum number of partitions searched (-4partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_4partition_index_limit;
/**
* @brief The maximum centile for block modes searched (-blockmodelimit).
@@ -491,10 +526,31 @@ struct astcenc_config
/**
* @brief The number of trial candidates per mode search (-candidatelimit).
*
* Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES (default 4).
* Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES.
*/
unsigned int tune_candidate_limit;
/**
* @brief The number of trial partitionings per search (-2partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_2partitioning_candidate_limit;
/**
* @brief The number of trial partitionings per search (-3partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_3partitioning_candidate_limit;
/**
* @brief The number of trial partitionings per search (-4partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_4partitioning_candidate_limit;
/**
* @brief The dB threshold for stopping block search (-dblimit).
*
@@ -503,51 +559,54 @@ struct astcenc_config
float tune_db_limit;
/**
* @brief The amount of overshoot needed to early-out mode 0 fast path.
* @brief The amount of MSE overshoot needed to early-out trials.
*
* We have a fast-path for mode 0 (1 partition, 1 plane) which uses only essential block modes
* as an initital search. This can short-cut compression for simple blocks, but to avoid
* shortcutting too much we* force this to overshoot the MSE threshold needed to hit the
* block-local db_limit e.g. 1.0 = no overshoot, 2.0 = need half the error to trigger.
* The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using
* the high probability block modes. This can short-cut compression for simple blocks.
*
* The second early-out is for refinement trials, where we can exit refinement once quality is
* reached.
*/
float tune_mode0_mse_overshoot;
float tune_mse_overshoot;
/**
* @brief The amount of overshoot needed to early-out refinement.
*
* The codec will refine block candidates iteratively to improve the encoding, based on the
* @c tune_refinement_limit count. Earlier implementations will use all refinement iterations,
* even if the target threshold is reached. This tuning parameter allows an early out, but with
* an overshoot MSE threshold. Setting this to 1.0 will early-out as soon as the target is hit,
* but does reduce image quality vs the default behavior of over-refinement.
*/
float tune_refinement_mse_overshoot;
/**
* @brief The threshold for skipping 2.2/3.1/3.2/4.1 trials (-2partitionlimitfactor).
* @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor).
*
* This option is further scaled for normal maps, so it skips less often.
*/
float tune_2_partition_early_out_limit_factor;
float tune_2partition_early_out_limit_factor;
/**
* @brief The threshold for skipping 3.2/4.1 trials (-3partitionlimitfactor).
* @brief The threshold for skipping 4.1 trials (-3partitionlimitfactor).
*
* This option is further scaled for normal maps, so it skips less often.
*/
float tune_3_partition_early_out_limit_factor;
float tune_3partition_early_out_limit_factor;
/**
* @brief The threshold for skipping two weight planes (-2planelimitcorrelation).
*
* This option is ineffective for normal maps.
*/
float tune_2_plane_early_out_limit_correlation;
float tune_2plane_early_out_limit_correlation;
/**
* @brief The threshold below which (inclusive) we stop testing low/high/low+high cutoffs.
* @brief The config enable for the mode0 fast-path search.
*
* If this is set to TUNE_MIN_TEXELS_MODE0 or higher then the early-out fast mode0
* search is enabled. This option is ineffective for 3D block sizes.
*/
unsigned int tune_low_weight_count_limit;
float tune_search_mode0_enable;
/**
* @brief The progress callback, can be @c nullptr.
*
* If this is specified the codec will periodically report progress for
* compression as a percentage between 0 and 100. The callback is called from one
* of the compressor threads, so doing significant work in the callback will
* reduce compression performance.
*/
astcenc_progress_callback progress_callback;
#if defined(ASTCENC_DIAGNOSTICS)
/**
@@ -811,7 +870,7 @@ ASTCENC_PUBLIC void astcenc_context_free(
* advanced content packaging pipelines.
*
* @param context Codec context.
* @param data One block of compressesd ASTC data.
* @param data One block of compressed ASTC data.
* @param info The output info structure to populate.
*
* @return @c ASTCENC_SUCCESS if the block was decoded, or an error otherwise. Note that this
+16 -97
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -390,8 +390,6 @@ void compute_avgs_and_dirs_4_comp(
const image_block& blk,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
float texel_weight = hadd_s(blk.channel_weight) / 4.0f;
int partition_count = pi.partition_count;
promise(partition_count > 0);
@@ -434,11 +432,6 @@ void compute_avgs_and_dirs_4_comp(
sum_wp += select(zero, texel_datum, tdm3);
}
sum_xp = sum_xp * texel_weight;
sum_yp = sum_yp * texel_weight;
sum_zp = sum_zp * texel_weight;
sum_wp = sum_wp * texel_weight;
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 prod_zp = dot(sum_zp, sum_zp);
@@ -473,8 +466,6 @@ void compute_avgs_and_dirs_3_comp(
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
compute_partition_averages_rgba(pi, blk, partition_averages);
float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
const float* data_vr = blk.data_r;
const float* data_vg = blk.data_g;
const float* data_vb = blk.data_b;
@@ -482,8 +473,6 @@ void compute_avgs_and_dirs_3_comp(
// TODO: Data-driven permute would be useful to avoid this ...
if (omitted_component == 0)
{
texel_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>());
partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
@@ -495,8 +484,6 @@ void compute_avgs_and_dirs_3_comp(
}
else if (omitted_component == 1)
{
texel_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
@@ -507,8 +494,6 @@ void compute_avgs_and_dirs_3_comp(
}
else if (omitted_component == 2)
{
texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
@@ -524,8 +509,6 @@ void compute_avgs_and_dirs_3_comp(
partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
}
texel_weight = texel_weight * (1.0f / 3.0f);
unsigned int partition_count = pi.partition_count;
promise(partition_count > 0);
@@ -563,10 +546,6 @@ void compute_avgs_and_dirs_3_comp(
sum_zp += select(zero, texel_datum, tdm2);
}
sum_xp = sum_xp * texel_weight;
sum_yp = sum_yp * texel_weight;
sum_zp = sum_zp * texel_weight;
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 prod_zp = dot(sum_zp, sum_zp);
@@ -591,8 +570,6 @@ void compute_avgs_and_dirs_3_comp_rgb(
const image_block& blk,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) * (1.0f / 3.0f);
unsigned int partition_count = pi.partition_count;
promise(partition_count > 0);
@@ -632,10 +609,6 @@ void compute_avgs_and_dirs_3_comp_rgb(
sum_zp += select(zero, texel_datum, tdm2);
}
sum_xp = sum_xp * texel_weight;
sum_yp = sum_yp * texel_weight;
sum_zp = sum_zp * texel_weight;
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 prod_zp = dot(sum_zp, sum_zp);
@@ -662,7 +635,6 @@ void compute_avgs_and_dirs_2_comp(
unsigned int component2,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
float texel_weight;
vfloat4 average;
const float* data_vr = nullptr;
@@ -670,7 +642,6 @@ void compute_avgs_and_dirs_2_comp(
if (component1 == 0 && component2 == 1)
{
texel_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
average = blk.data_mean.swz<0, 1>();
data_vr = blk.data_r;
@@ -678,7 +649,6 @@ void compute_avgs_and_dirs_2_comp(
}
else if (component1 == 0 && component2 == 2)
{
texel_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
average = blk.data_mean.swz<0, 2>();
data_vr = blk.data_r;
@@ -688,7 +658,6 @@ void compute_avgs_and_dirs_2_comp(
{
assert(component1 == 1 && component2 == 2);
texel_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
average = blk.data_mean.swz<1, 2>();
data_vr = blk.data_g;
@@ -714,7 +683,7 @@ void compute_avgs_and_dirs_2_comp(
average += vfloat2(data_vr[iwt], data_vg[iwt]);
}
average = average * (1.0f / static_cast<float>(texel_count));
average = average / static_cast<float>(texel_count);
}
pm[partition].avg = average;
@@ -737,9 +706,6 @@ void compute_avgs_and_dirs_2_comp(
sum_yp += select(zero, texel_datum, tdm1);
}
sum_xp = sum_xp * texel_weight;
sum_yp = sum_yp * texel_weight;
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
@@ -759,8 +725,7 @@ void compute_error_squared_rgba(
const image_block& blk,
const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
float uncor_lengths[BLOCK_MAX_PARTITIONS],
float samec_lengths[BLOCK_MAX_PARTITIONS],
float line_lengths[BLOCK_MAX_PARTITIONS],
float& uncor_error,
float& samec_error
) {
@@ -774,12 +739,6 @@ void compute_error_squared_rgba(
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
float uncor_loparam = 1e10f;
float uncor_hiparam = -1e10f;
float samec_loparam = 1e10f;
float samec_hiparam = -1e10f;
processed_line4 l_uncor = uncor_plines[partition];
processed_line4 l_samec = samec_plines[partition];
@@ -807,9 +766,6 @@ void compute_error_squared_rgba(
vfloat uncor_loparamv(1e10f);
vfloat uncor_hiparamv(-1e10f);
vfloat samec_loparamv(1e10f);
vfloat samec_hiparamv(-1e10f);
vfloat ew_r(blk.channel_weight.lane<0>());
vfloat ew_g(blk.channel_weight.lane<1>());
vfloat ew_b(blk.channel_weight.lane<2>());
@@ -822,17 +778,17 @@ void compute_error_squared_rgba(
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vmask mask = lane_ids < vint(texel_count);
vint texel_idxs(&(texel_indexes[i]));
vint texel_idxs(texel_indexes + i);
vfloat data_r = gatherf(blk.data_r, texel_idxs);
vfloat data_g = gatherf(blk.data_g, texel_idxs);
vfloat data_b = gatherf(blk.data_b, texel_idxs);
vfloat data_a = gatherf(blk.data_a, texel_idxs);
vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
+ (data_b * l_uncor_bs2)
+ (data_a * l_uncor_bs3);
vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
+ (data_b * l_uncor_bs2)
+ (data_a * l_uncor_bs3);
uncor_loparamv = min(uncor_param, uncor_loparamv);
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
@@ -859,9 +815,6 @@ void compute_error_squared_rgba(
+ (data_b * l_samec_bs2)
+ (data_a * l_samec_bs3);
samec_loparamv = min(samec_param, samec_loparamv);
samec_hiparamv = max(samec_param, samec_hiparamv);
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
@@ -877,18 +830,9 @@ void compute_error_squared_rgba(
lane_ids += vint(ASTCENC_SIMD_WIDTH);
}
uncor_loparam = hmin_s(uncor_loparamv);
uncor_hiparam = hmax_s(uncor_hiparamv);
samec_loparam = hmin_s(samec_loparamv);
samec_hiparam = hmax_s(samec_hiparamv);
float uncor_linelen = uncor_hiparam - uncor_loparam;
float samec_linelen = samec_hiparam - samec_loparam;
// Turn very small numbers and NaNs into a small number
uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
samec_lengths[partition] = astc::max(samec_linelen, 1e-7f);
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
}
uncor_error = hadd_s(uncor_errorsumv);
@@ -916,19 +860,9 @@ void compute_error_squared_rgb(
unsigned int texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
float uncor_loparam = 1e10f;
float uncor_hiparam = -1e10f;
float samec_loparam = 1e10f;
float samec_hiparam = -1e10f;
processed_line3 l_uncor = pl.uncor_pline;
processed_line3 l_samec = pl.samec_pline;
// This implementation is an example vectorization of this function.
// It works for - the codec is a 2-4% faster than not vectorizing - but
// the benefit is limited by the use of gathers and register pressure
// Vectorize some useful scalar inputs
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
@@ -947,9 +881,6 @@ void compute_error_squared_rgb(
vfloat uncor_loparamv(1e10f);
vfloat uncor_hiparamv(-1e10f);
vfloat samec_loparamv(1e10f);
vfloat samec_hiparamv(-1e10f);
vfloat ew_r(blk.channel_weight.lane<0>());
vfloat ew_g(blk.channel_weight.lane<1>());
vfloat ew_b(blk.channel_weight.lane<2>());
@@ -961,15 +892,15 @@ void compute_error_squared_rgb(
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vmask mask = lane_ids < vint(texel_count);
vint texel_idxs(&(texel_indexes[i]));
vint texel_idxs(texel_indexes + i);
vfloat data_r = gatherf(blk.data_r, texel_idxs);
vfloat data_g = gatherf(blk.data_g, texel_idxs);
vfloat data_b = gatherf(blk.data_b, texel_idxs);
vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
+ (data_b * l_uncor_bs2);
vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
+ (data_b * l_uncor_bs2);
uncor_loparamv = min(uncor_param, uncor_loparamv);
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
@@ -992,9 +923,6 @@ void compute_error_squared_rgb(
+ (data_g * l_samec_bs1)
+ (data_b * l_samec_bs2);
samec_loparamv = min(samec_param, samec_loparamv);
samec_hiparamv = max(samec_param, samec_hiparamv);
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
@@ -1008,18 +936,9 @@ void compute_error_squared_rgb(
lane_ids += vint(ASTCENC_SIMD_WIDTH);
}
uncor_loparam = hmin_s(uncor_loparamv);
uncor_hiparam = hmax_s(uncor_hiparamv);
samec_loparam = hmin_s(samec_loparamv);
samec_hiparam = hmax_s(samec_hiparamv);
float uncor_linelen = uncor_hiparam - uncor_loparam;
float samec_linelen = samec_hiparam - samec_loparam;
// Turn very small numbers and NaNs into a small number
pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f);
pl.samec_line_len = astc::max(samec_linelen, 1e-7f);
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
pl.line_length = astc::max(uncor_linelen, 1e-7f);
}
uncor_error = hadd_s(uncor_errorsumv);
+67 -93
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -330,17 +330,17 @@ static void init_decimation_info_2d(
for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
{
di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j];
di.texel_weights_float_4t[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j];
di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j];
di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j];
}
// Init all 4 entries so we can rely on zeros for vectorization
for (unsigned int j = wb.weight_count_of_texel[i]; j < 4; j++)
{
di.texel_weights_int_4t[j][i] = 0;
di.texel_weights_float_4t[j][i] = 0.0f;
di.texel_weights_4t[j][i] = 0;
di.texel_weight_contribs_int_tr[j][i] = 0;
di.texel_weight_contribs_float_tr[j][i] = 0.0f;
di.texel_weights_tr[j][i] = 0;
}
}
@@ -356,43 +356,30 @@ static void init_decimation_info_2d(
uint8_t texel = wb.texels_of_weight[i][j];
// Create transposed versions of these for better vectorization
di.weight_texel[j][i] = texel;
di.weights_flt[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
di.weight_texels_tr[j][i] = texel;
di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
// perform a layer of array unrolling. An aspect of this unrolling is that
// one of the texel-weight indexes is an identity-mapped index; we will use this
// fact to reorder the indexes so that the first one is the identity index.
int swap_idx = -1;
// Store the per-texel contribution of this weight for each texel it contributes to
di.texel_contrib_for_weight[j][i] = 0.0f;
for (unsigned int k = 0; k < 4; k++)
{
uint8_t dttw = di.texel_weights_4t[k][texel];
float dttwf = di.texel_weights_float_4t[k][texel];
uint8_t dttw = di.texel_weights_tr[k][texel];
float dttwf = di.texel_weight_contribs_float_tr[k][texel];
if (dttw == i && dttwf != 0.0f)
{
swap_idx = k;
di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel];
break;
}
di.texel_weights_texel[i][j][k] = dttw;
di.texel_weights_float_texel[i][j][k] = dttwf;
}
if (swap_idx != 0)
{
uint8_t vi = di.texel_weights_texel[i][j][0];
float vf = di.texel_weights_float_texel[i][j][0];
di.texel_weights_texel[i][j][0] = di.texel_weights_texel[i][j][swap_idx];
di.texel_weights_float_texel[i][j][0] = di.texel_weights_float_texel[i][j][swap_idx];
di.texel_weights_texel[i][j][swap_idx] = vi;
di.texel_weights_float_texel[i][j][swap_idx] = vf;
}
}
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
// Match last texel in active lane in SIMD group, for better gathers
uint8_t last_texel = di.weight_texel[texel_count_wt - 1][i];
uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i];
for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
{
di.weight_texel[j][i] = last_texel;
di.weights_flt[j][i] = 0.0f;
di.weight_texels_tr[j][i] = last_texel;
di.weights_texel_contribs_tr[j][i] = 0.0f;
}
}
@@ -404,16 +391,16 @@ static void init_decimation_info_2d(
for (unsigned int j = 0; j < 4; j++)
{
di.texel_weights_float_4t[j][i] = 0;
di.texel_weights_4t[j][i] = 0;
di.texel_weights_int_4t[j][i] = 0;
di.texel_weight_contribs_float_tr[j][i] = 0;
di.texel_weights_tr[j][i] = 0;
di.texel_weight_contribs_int_tr[j][i] = 0;
}
}
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
// Match last texel in active lane in SIMD group, for better gathers
unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
uint8_t last_texel = di.weight_texel[last_texel_count_wt - 1][weights_per_block - 1];
uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
@@ -422,8 +409,8 @@ static void init_decimation_info_2d(
for (unsigned int j = 0; j < max_texel_count_of_weight; j++)
{
di.weight_texel[j][i] = last_texel;
di.weights_flt[j][i] = 0.0f;
di.weight_texels_tr[j][i] = last_texel;
di.weights_texel_contribs_tr[j][i] = 0.0f;
}
}
@@ -600,16 +587,16 @@ static void init_decimation_info_3d(
// Init all 4 entries so we can rely on zeros for vectorization
for (unsigned int j = 0; j < 4; j++)
{
di.texel_weights_int_4t[j][i] = 0;
di.texel_weights_float_4t[j][i] = 0.0f;
di.texel_weights_4t[j][i] = 0;
di.texel_weight_contribs_int_tr[j][i] = 0;
di.texel_weight_contribs_float_tr[j][i] = 0.0f;
di.texel_weights_tr[j][i] = 0;
}
for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
{
di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j];
di.texel_weights_float_4t[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j];
di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j];
di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j];
}
}
@@ -625,43 +612,30 @@ static void init_decimation_info_3d(
unsigned int texel = wb.texels_of_weight[i][j];
// Create transposed versions of these for better vectorization
di.weight_texel[j][i] = static_cast<uint8_t>(texel);
di.weights_flt[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
di.weight_texels_tr[j][i] = static_cast<uint8_t>(texel);
di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
// perform a layer of array unrolling. An aspect of this unrolling is that
// one of the texel-weight indexes is an identity-mapped index; we will use this
// fact to reorder the indexes so that the first one is the identity index.
int swap_idx = -1;
// Store the per-texel contribution of this weight for each texel it contributes to
di.texel_contrib_for_weight[j][i] = 0.0f;
for (unsigned int k = 0; k < 4; k++)
{
uint8_t dttw = di.texel_weights_4t[k][texel];
float dttwf = di.texel_weights_float_4t[k][texel];
uint8_t dttw = di.texel_weights_tr[k][texel];
float dttwf = di.texel_weight_contribs_float_tr[k][texel];
if (dttw == i && dttwf != 0.0f)
{
swap_idx = k;
di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel];
break;
}
di.texel_weights_texel[i][j][k] = dttw;
di.texel_weights_float_texel[i][j][k] = dttwf;
}
if (swap_idx != 0)
{
uint8_t vi = di.texel_weights_texel[i][j][0];
float vf = di.texel_weights_float_texel[i][j][0];
di.texel_weights_texel[i][j][0] = di.texel_weights_texel[i][j][swap_idx];
di.texel_weights_float_texel[i][j][0] = di.texel_weights_float_texel[i][j][swap_idx];
di.texel_weights_texel[i][j][swap_idx] = vi;
di.texel_weights_float_texel[i][j][swap_idx] = vf;
}
}
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
// Match last texel in active lane in SIMD group, for better gathers
uint8_t last_texel = di.weight_texel[texel_count_wt - 1][i];
uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i];
for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
{
di.weight_texel[j][i] = last_texel;
di.weights_flt[j][i] = 0.0f;
di.weight_texels_tr[j][i] = last_texel;
di.weights_texel_contribs_tr[j][i] = 0.0f;
}
}
@@ -673,16 +647,16 @@ static void init_decimation_info_3d(
for (unsigned int j = 0; j < 4; j++)
{
di.texel_weights_float_4t[j][i] = 0;
di.texel_weights_4t[j][i] = 0;
di.texel_weights_int_4t[j][i] = 0;
di.texel_weight_contribs_float_tr[j][i] = 0;
di.texel_weights_tr[j][i] = 0;
di.texel_weight_contribs_int_tr[j][i] = 0;
}
}
// Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
// Match last texel in active lane in SIMD group, for better gathers
int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
uint8_t last_texel = di.weight_texel[last_texel_count_wt - 1][weights_per_block - 1];
uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1];
unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
@@ -691,8 +665,8 @@ static void init_decimation_info_3d(
for (int j = 0; j < max_texel_count_of_weight; j++)
{
di.weight_texel[j][i] = last_texel;
di.weights_flt[j][i] = 0.0f;
di.weight_texels_tr[j][i] = last_texel;
di.weights_texel_contribs_tr[j][i] = 0.0f;
}
}
@@ -802,8 +776,8 @@ static void construct_dt_entry_2d(
assert(maxprec_1plane >= 0 || maxprec_2planes >= 0);
bsd.decimation_modes[index].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
bsd.decimation_modes[index].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
bsd.decimation_modes[index].ref_1_plane = 0;
bsd.decimation_modes[index].ref_2_planes = 0;
bsd.decimation_modes[index].refprec_1plane = 0;
bsd.decimation_modes[index].refprec_2planes = 0;
}
/**
@@ -957,16 +931,6 @@ static void construct_block_size_descriptor_2d(
}
auto& bm = bsd.block_modes[packed_bm_idx];
auto& dm = bsd.decimation_modes[decimation_mode];
if (is_dual_plane)
{
dm.ref_2_planes = 1;
}
else
{
dm.ref_1_plane = 1;
}
bm.decimation_mode = static_cast<uint8_t>(decimation_mode);
bm.quant_mode = static_cast<uint8_t>(quant_mode);
@@ -974,6 +938,17 @@ static void construct_block_size_descriptor_2d(
bm.weight_bits = static_cast<uint8_t>(weight_bits);
bm.mode_index = static_cast<uint16_t>(i);
auto& dm = bsd.decimation_modes[decimation_mode];
if (is_dual_plane)
{
dm.set_ref_2plane(bm.get_weight_quant_mode());
}
else
{
dm.set_ref_1plane(bm.get_weight_quant_mode());
}
bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_bm_idx);
packed_bm_idx++;
@@ -1002,8 +977,8 @@ static void construct_block_size_descriptor_2d(
{
bsd.decimation_modes[i].maxprec_1plane = -1;
bsd.decimation_modes[i].maxprec_2planes = -1;
bsd.decimation_modes[i].ref_1_plane = 0;
bsd.decimation_modes[i].ref_2_planes = 0;
bsd.decimation_modes[i].refprec_1plane = 0;
bsd.decimation_modes[i].refprec_2planes = 0;
}
// Determine the texels to use for kmeans clustering.
@@ -1013,7 +988,7 @@ static void construct_block_size_descriptor_2d(
}
/**
* @brief Allocate block modes and decimation tables for a single £D block size.
* @brief Allocate block modes and decimation tables for a single 3D block size.
*
* TODO: This function doesn't include all of the heuristics that we use for 2D block sizes such as
* the percentile mode cutoffs. If 3D becomes more widely used we should look at this.
@@ -1088,8 +1063,8 @@ static void construct_block_size_descriptor_3d(
bsd.decimation_modes[decimation_mode_count].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
bsd.decimation_modes[decimation_mode_count].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
bsd.decimation_modes[decimation_mode_count].ref_1_plane = maxprec_1plane == -1 ? 0 : 1;
bsd.decimation_modes[decimation_mode_count].ref_2_planes = maxprec_2planes == -1 ? 0 : 1;
bsd.decimation_modes[decimation_mode_count].refprec_1plane = maxprec_1plane == -1 ? 0 : 0xFFFF;
bsd.decimation_modes[decimation_mode_count].refprec_2planes = maxprec_2planes == -1 ? 0 : 0xFFFF;
decimation_mode_count++;
}
}
@@ -1100,15 +1075,14 @@ static void construct_block_size_descriptor_3d(
{
bsd.decimation_modes[i].maxprec_1plane = -1;
bsd.decimation_modes[i].maxprec_2planes = -1;
bsd.decimation_modes[i].ref_1_plane = 0;
bsd.decimation_modes[i].ref_2_planes = 0;
bsd.decimation_modes[i].refprec_1plane = 0;
bsd.decimation_modes[i].refprec_2planes = 0;
}
bsd.decimation_mode_count_always = 0; // Skipped for 3D modes
bsd.decimation_mode_count_selected = decimation_mode_count;
bsd.decimation_mode_count_all = decimation_mode_count;
// Construct the list of block formats
// Construct the list of block formats referencing the decimation tables
// Clear the list to a known-bad value
File diff suppressed because it is too large Load Diff
+176 -252
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -23,43 +23,6 @@
#include "astcenc_internal.h"
/**
* @brief Unquantize a color.
*
* This function uses a lookup table as the quantization is encoded to make
* hardware implementations easier, and is not a simple lerp.
*
* @param quant_level The quantization level to use.
* @param inputq The input quantized color.
*
* @return The unquantized color.
*/
static ASTCENC_SIMD_INLINE vint4 unquant_color(
quant_method quant_level,
vint4 inputq
) {
const uint8_t* unq = color_unquant_tables[quant_level - QUANT_6];
return vint4(unq[inputq.lane<0>()], unq[inputq.lane<1>()],
unq[inputq.lane<2>()], unq[inputq.lane<3>()]);
}
/**
* @brief Determine the quantized value given a quantization level.
*
* @param quant_level The quantization level to use.
* @param value The value to convert. This may be outside of the 0-255 range and will be
* clamped before the value is looked up.
*
* @return The encoded quantized value. These are not necessarily in the order; the compressor
* scrambles the values slightly to make hardware implementation easier.
*/
static inline int unquant_color(
quant_method quant_level,
int value
) {
return color_unquant_tables[quant_level - QUANT_6][value];
}
/**
* @brief Un-blue-contract a color.
*
@@ -77,35 +40,14 @@ static ASTCENC_SIMD_INLINE vint4 uncontract_color(
return select(input, bc0, mask);
}
/**
* @brief Unpack an LDR RGBA color that uses delta encoding.
*
* @param input0q The raw quantized endpoint 0 color.
* @param input1q The raw quantized endpoint 1 color deltas.
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
*/
static void rgba_delta_unpack(
vint4 input0q,
vint4 input1q,
quant_method quant_level,
void rgba_delta_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
// Unquantize color endpoints
vint4 input0 = unquant_color(quant_level, input0q);
vint4 input1 = unquant_color(quant_level, input1q);
// Perform bit-transfer
input0 = input0 | lsl<1>(input1 & 0x80);
input1 = input1 & 0x7F;
vmask4 mask = (input1 & 0x40) != vint4::zero();
input1 = select(input1, input1 - 0x80, mask);
// Scale
input0 = asr<1>(input0);
input1 = asr<1>(input1);
// Apply bit transfer
bit_transfer_signed(input1, input0);
// Apply blue-uncontraction if needed
int rgb_sum = hadd_rgb_s(input1);
@@ -126,44 +68,28 @@ static void rgba_delta_unpack(
*
* Output alpha set to 255.
*
* @param input0q The raw quantized endpoint 0 color.
* @param input1q The raw quantized endpoint 1 color deltas.
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color deltas.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_delta_unpack(
vint4 input0q,
vint4 input1q,
quant_method quant_level,
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
rgba_delta_unpack(input0q, input1q, quant_level, output0, output1);
rgba_delta_unpack(input0, input1, output0, output1);
output0.set_lane<3>(255);
output1.set_lane<3>(255);
}
/**
* @brief Unpack an LDR RGBA color that uses direct encoding.
*
* @param input0q The raw quantized endpoint 0 color.
* @param input1q The raw quantized endpoint 1 color.
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
*/
static void rgba_unpack(
vint4 input0q,
vint4 input1q,
quant_method quant_level,
void rgba_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
// Unquantize color endpoints
vint4 input0 = unquant_color(quant_level, input0q);
vint4 input1 = unquant_color(quant_level, input1q);
// Apply blue-uncontraction if needed
if (hadd_rgb_s(input0) > hadd_rgb_s(input1))
{
@@ -181,20 +107,18 @@ static void rgba_unpack(
*
* Output alpha set to 255.
*
* @param input0q The raw quantized endpoint 0 color.
* @param input1q The raw quantized endpoint 1 color.
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_unpack(
vint4 input0q,
vint4 input1q,
quant_method quant_level,
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
rgba_unpack(input0q, input1q, quant_level, output0, output1);
rgba_unpack(input0, input1, output0, output1);
output0.set_lane<3>(255);
output1.set_lane<3>(255);
}
@@ -204,31 +128,24 @@ static void rgb_unpack(
*
* Note only the RGB channels use the scaled encoding, alpha uses direct.
*
* @param input0q The raw quantized endpoint 0 color.
* @param alpha1q The raw quantized endpoint 1 alpha value.
* @param scaleq The raw quantized scale.
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input0 The packed endpoint 0 color.
* @param alpha1 The packed endpoint 1 alpha value.
* @param scale The packed scale.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_scale_alpha_unpack(
vint4 input0q,
uint8_t alpha1q,
uint8_t scaleq,
quant_method quant_level,
vint4 input0,
uint8_t alpha1,
uint8_t scale,
vint4& output0,
vint4& output1
) {
// Unquantize color endpoints
vint4 input = unquant_color(quant_level, input0q);
uint8_t alpha1 = unquant_color(quant_level, alpha1q);
uint8_t scale = unquant_color(quant_level, scaleq);
output1 = input;
output1 = input0;
output1.set_lane<3>(alpha1);
output0 = asr<8>(input * scale);
output0.set_lane<3>(input.lane<3>());
output0 = asr<8>(input0 * scale);
output0.set_lane<3>(input0.lane<3>());
}
/**
@@ -236,26 +153,21 @@ static void rgb_scale_alpha_unpack(
*
* Output alpha is 255.
*
* @param input0q The raw quantized endpoint 0 color.
* @param scaleq The raw quantized scale.
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input0 The packed endpoint 0 color.
* @param scale The packed scale.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_scale_unpack(
vint4 input0q,
int scaleq,
quant_method quant_level,
vint4 input0,
int scale,
vint4& output0,
vint4& output1
) {
vint4 input = unquant_color(quant_level, input0q);
int scale = unquant_color(quant_level, scaleq);
output1 = input;
output1 = input0;
output1.set_lane<3>(255);
output0 = asr<8>(input * scale);
output0 = asr<8>(input0 * scale);
output0.set_lane<3>(255);
}
@@ -264,19 +176,17 @@ static void rgb_scale_unpack(
*
* Output alpha is 255.
*
* @param input The raw quantized endpoints.
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input The packed endpoints.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_unpack(
const uint8_t input[2],
quant_method quant_level,
vint4& output0,
vint4& output1
) {
int lum0 = unquant_color(quant_level, input[0]);
int lum1 = unquant_color(quant_level, input[1]);
int lum0 = input[0];
int lum1 = input[1];
output0 = vint4(lum0, lum0, lum0, 255);
output1 = vint4(lum1, lum1, lum1, 255);
}
@@ -286,19 +196,17 @@ static void luminance_unpack(
*
* Output alpha is 255.
*
* @param input The raw quantized endpoints (L0, L1).
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input The packed endpoints (L0, L1).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_delta_unpack(
const uint8_t input[2],
quant_method quant_level,
vint4& output0,
vint4& output1
) {
int v0 = unquant_color(quant_level, input[0]);
int v1 = unquant_color(quant_level, input[1]);
int v0 = input[0];
int v1 = input[1];
int l0 = (v0 >> 2) | (v1 & 0xC0);
int l1 = l0 + (v1 & 0x3F);
@@ -311,21 +219,19 @@ static void luminance_delta_unpack(
/**
* @brief Unpack an LDR LA color that uses direct encoding.
*
* @param input The raw quantized endpoints (L0, L1, A0, A1).
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input The packed endpoints (L0, L1, A0, A1).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_alpha_unpack(
const uint8_t input[4],
quant_method quant_level,
vint4& output0,
vint4& output1
) {
int lum0 = unquant_color(quant_level, input[0]);
int lum1 = unquant_color(quant_level, input[1]);
int alpha0 = unquant_color(quant_level, input[2]);
int alpha1 = unquant_color(quant_level, input[3]);
int lum0 = input[0];
int lum1 = input[1];
int alpha0 = input[2];
int alpha1 = input[3];
output0 = vint4(lum0, lum0, lum0, alpha0);
output1 = vint4(lum1, lum1, lum1, alpha1);
}
@@ -333,30 +239,34 @@ static void luminance_alpha_unpack(
/**
* @brief Unpack an LDR LA color that uses delta encoding.
*
* @param input The raw quantized endpoints (L0, L1, A0, A1).
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input The packed endpoints (L0, L1, A0, A1).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_alpha_delta_unpack(
const uint8_t input[4],
quant_method quant_level,
vint4& output0,
vint4& output1
) {
int lum0 = unquant_color(quant_level, input[0]);
int lum1 = unquant_color(quant_level, input[1]);
int alpha0 = unquant_color(quant_level, input[2]);
int alpha1 = unquant_color(quant_level, input[3]);
int lum0 = input[0];
int lum1 = input[1];
int alpha0 = input[2];
int alpha1 = input[3];
lum0 |= (lum1 & 0x80) << 1;
alpha0 |= (alpha1 & 0x80) << 1;
lum1 &= 0x7F;
alpha1 &= 0x7F;
if (lum1 & 0x40)
{
lum1 -= 0x80;
}
if (alpha1 & 0x40)
{
alpha1 -= 0x80;
}
lum0 >>= 1;
lum1 >>= 1;
@@ -375,21 +285,19 @@ static void luminance_alpha_delta_unpack(
/**
* @brief Unpack an HDR RGB + offset encoding.
*
* @param input The raw quantized endpoints (packed and modal).
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgbo_unpack(
const uint8_t input[4],
quant_method quant_level,
vint4& output0,
vint4& output1
) {
int v0 = unquant_color(quant_level, input[0]);
int v1 = unquant_color(quant_level, input[1]);
int v2 = unquant_color(quant_level, input[2]);
int v3 = unquant_color(quant_level, input[3]);
int v0 = input[0];
int v1 = input[1];
int v2 = input[2];
int v3 = input[3];
int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3);
@@ -527,24 +435,22 @@ static void hdr_rgbo_unpack(
/**
* @brief Unpack an HDR RGB direct encoding.
*
* @param input The raw quantized endpoints (packed and modal).
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgb_unpack(
const uint8_t input[6],
quant_method quant_level,
vint4& output0,
vint4& output1
) {
int v0 = unquant_color(quant_level, input[0]);
int v1 = unquant_color(quant_level, input[1]);
int v2 = unquant_color(quant_level, input[2]);
int v3 = unquant_color(quant_level, input[3]);
int v4 = unquant_color(quant_level, input[4]);
int v5 = unquant_color(quant_level, input[5]);
int v0 = input[0];
int v1 = input[1];
int v2 = input[2];
int v3 = input[3];
int v4 = input[4];
int v5 = input[5];
// extract all the fixed-placement bitfields
int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2);
@@ -695,21 +601,19 @@ static void hdr_rgb_unpack(
/**
* @brief Unpack an HDR RGB + LDR A direct encoding.
*
* @param input The raw quantized endpoints (packed and modal).
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgb_ldr_alpha_unpack(
const uint8_t input[8],
quant_method quant_level,
vint4& output0,
vint4& output1
) {
hdr_rgb_unpack(input, quant_level, output0, output1);
hdr_rgb_unpack(input, output0, output1);
int v6 = unquant_color(quant_level, input[6]);
int v7 = unquant_color(quant_level, input[7]);
int v6 = input[6];
int v7 = input[7];
output0.set_lane<3>(v6);
output1.set_lane<3>(v7);
}
@@ -717,19 +621,17 @@ static void hdr_rgb_ldr_alpha_unpack(
/**
* @brief Unpack an HDR L (small range) direct encoding.
*
* @param input The raw quantized endpoints (packed and modal).
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_luminance_small_range_unpack(
const uint8_t input[2],
quant_method quant_level,
vint4& output0,
vint4& output1
) {
int v0 = unquant_color(quant_level, input[0]);
int v1 = unquant_color(quant_level, input[1]);
int v0 = input[0];
int v1 = input[1];
int y0, y1;
if (v0 & 0x80)
@@ -745,7 +647,9 @@ static void hdr_luminance_small_range_unpack(
y1 += y0;
if (y1 > 0xFFF)
{
y1 = 0xFFF;
}
output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
@@ -754,19 +658,17 @@ static void hdr_luminance_small_range_unpack(
/**
* @brief Unpack an HDR L (large range) direct encoding.
*
* @param input The raw quantized endpoints (packed and modal).
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_luminance_large_range_unpack(
const uint8_t input[2],
quant_method quant_level,
vint4& output0,
vint4& output1
) {
int v0 = unquant_color(quant_level, input[0]);
int v1 = unquant_color(quant_level, input[1]);
int v0 = input[0];
int v1 = input[1];
int y0, y1;
if (v1 >= v0)
@@ -787,20 +689,18 @@ static void hdr_luminance_large_range_unpack(
/**
* @brief Unpack an HDR A direct encoding.
*
* @param input The raw quantized endpoints (packed and modal).
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_alpha_unpack(
const uint8_t input[2],
quant_method quant_level,
int& output0,
int& output1
) {
int v6 = unquant_color(quant_level, input[0]);
int v7 = unquant_color(quant_level, input[1]);
int v6 = input[0];
int v7 = input[1];
int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
v6 &= 0x7F;
@@ -821,9 +721,13 @@ static void hdr_alpha_unpack(
v7 += v6;
if (v7 < 0)
{
v7 = 0;
}
else if (v7 > 0xFFF)
{
v7 = 0xFFF;
}
output0 = v6;
output1 = v7;
@@ -836,21 +740,19 @@ static void hdr_alpha_unpack(
/**
* @brief Unpack an HDR RGBA direct encoding.
*
* @param input The raw quantized endpoints (packed and modal).
* @param quant_level The quantization level to use.
* @param[out] output0 The unpacked and unquantized endpoint 0 color.
* @param[out] output1 The unpacked and unquantized endpoint 1 color.
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgb_hdr_alpha_unpack(
const uint8_t input[8],
quant_method quant_level,
vint4& output0,
vint4& output1
) {
hdr_rgb_unpack(input, quant_level, output0, output1);
hdr_rgb_unpack(input, output0, output1);
int alpha0, alpha1;
hdr_alpha_unpack(input + 6, quant_level, alpha0, alpha1);
hdr_alpha_unpack(input + 6, alpha0, alpha1);
output0.set_lane<3>(alpha0);
output1.set_lane<3>(alpha1);
@@ -860,7 +762,6 @@ static void hdr_rgb_hdr_alpha_unpack(
void unpack_color_endpoints(
astcenc_profile decode_mode,
int format,
quant_method quant_level,
const uint8_t* input,
bool& rgb_hdr,
bool& alpha_hdr,
@@ -876,38 +777,38 @@ void unpack_color_endpoints(
switch (format)
{
case FMT_LUMINANCE:
luminance_unpack(input, quant_level, output0, output1);
luminance_unpack(input, output0, output1);
break;
case FMT_LUMINANCE_DELTA:
luminance_delta_unpack(input, quant_level, output0, output1);
luminance_delta_unpack(input, output0, output1);
break;
case FMT_HDR_LUMINANCE_SMALL_RANGE:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_luminance_small_range_unpack(input, quant_level, output0, output1);
hdr_luminance_small_range_unpack(input, output0, output1);
break;
case FMT_HDR_LUMINANCE_LARGE_RANGE:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_luminance_large_range_unpack(input, quant_level, output0, output1);
hdr_luminance_large_range_unpack(input, output0, output1);
break;
case FMT_LUMINANCE_ALPHA:
luminance_alpha_unpack(input, quant_level, output0, output1);
luminance_alpha_unpack(input, output0, output1);
break;
case FMT_LUMINANCE_ALPHA_DELTA:
luminance_alpha_delta_unpack(input, quant_level, output0, output1);
luminance_alpha_delta_unpack(input, output0, output1);
break;
case FMT_RGB_SCALE:
{
vint4 input0q(input[0], input[1], input[2], 0);
uint8_t scale = input[3];
rgb_scale_unpack(input0q, scale, quant_level, output0, output1);
rgb_scale_unpack(input0q, scale, output0, output1);
}
break;
@@ -916,21 +817,21 @@ void unpack_color_endpoints(
vint4 input0q(input[0], input[1], input[2], input[4]);
uint8_t alpha1q = input[5];
uint8_t scaleq = input[3];
rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, quant_level, output0, output1);
rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, output0, output1);
}
break;
case FMT_HDR_RGB_SCALE:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_rgbo_unpack(input, quant_level,output0, output1);
hdr_rgbo_unpack(input, output0, output1);
break;
case FMT_RGB:
{
vint4 input0q(input[0], input[2], input[4], 0);
vint4 input1q(input[1], input[3], input[5], 0);
rgb_unpack(input0q, input1q, quant_level, output0, output1);
rgb_unpack(input0q, input1q, output0, output1);
}
break;
@@ -938,21 +839,21 @@ void unpack_color_endpoints(
{
vint4 input0q(input[0], input[2], input[4], 0);
vint4 input1q(input[1], input[3], input[5], 0);
rgb_delta_unpack(input0q, input1q, quant_level, output0, output1);
rgb_delta_unpack(input0q, input1q, output0, output1);
}
break;
case FMT_HDR_RGB:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_rgb_unpack(input, quant_level, output0, output1);
hdr_rgb_unpack(input, output0, output1);
break;
case FMT_RGBA:
{
vint4 input0q(input[0], input[2], input[4], input[6]);
vint4 input1q(input[1], input[3], input[5], input[7]);
rgba_unpack(input0q, input1q, quant_level, output0, output1);
rgba_unpack(input0q, input1q, output0, output1);
}
break;
@@ -960,19 +861,19 @@ void unpack_color_endpoints(
{
vint4 input0q(input[0], input[2], input[4], input[6]);
vint4 input1q(input[1], input[3], input[5], input[7]);
rgba_delta_unpack(input0q, input1q, quant_level, output0, output1);
rgba_delta_unpack(input0q, input1q, output0, output1);
}
break;
case FMT_HDR_RGB_LDR_ALPHA:
rgb_hdr = true;
hdr_rgb_ldr_alpha_unpack(input, quant_level, output0, output1);
hdr_rgb_ldr_alpha_unpack(input, output0, output1);
break;
case FMT_HDR_RGBA:
rgb_hdr = true;
alpha_hdr = true;
hdr_rgb_hdr_alpha_unpack(input, quant_level, output0, output1);
hdr_rgb_hdr_alpha_unpack(input, output0, output1);
break;
}
@@ -993,32 +894,55 @@ void unpack_color_endpoints(
}
}
vint4 ldr_scale(257);
vint4 hdr_scale(1);
vint4 output_scale = ldr_scale;
// Handle endpoint errors and expansion
// An LDR profile image
if ((decode_mode == ASTCENC_PRF_LDR) ||
(decode_mode == ASTCENC_PRF_LDR_SRGB))
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
if (decode_mode == ASTCENC_PRF_LDR)
{
// Also matches HDR alpha, as cannot have HDR alpha without HDR RGB
if (rgb_hdr == true)
// Error color - HDR endpoint in an LDR encoding
if (rgb_hdr || alpha_hdr)
{
output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00);
output_scale = hdr_scale;
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
rgb_hdr = false;
alpha_hdr = false;
}
output0 = output0 * 257;
output1 = output1 * 257;
}
// An HDR profile image
// sRGB LDR 8-bit endpoints are expanded to 16 bit by:
// - RGB = shift left by 8 bits and OR with 0x80
// - A = replication
else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
// Error color - HDR endpoint in an LDR encoding
if (rgb_hdr || alpha_hdr)
{
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
rgb_hdr = false;
alpha_hdr = false;
}
vmask4 mask(true, true, true, false);
vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
vint4 output0a = output0 * 257;
output0 = select(output0a, output0rgb, mask);
vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
vint4 output1a = output1 * 257;
output1 = select(output1a, output1rgb, mask);
}
// An HDR profile decode, but may be using linear LDR endpoints
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
// HDR endpoints are already 16-bit
else
{
vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
output_scale = select(ldr_scale, hdr_scale, hdr_lanes);
vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
output0 = output0 * output_scale;
output1 = output1 * output_scale;
}
output0 = output0 * output_scale;
output1 = output1 * output_scale;
}
+240 -203
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -79,10 +79,10 @@ static bool realign_weights_undecimated(
// Get the quantization table
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
unsigned int weight_quant_level = bm.quant_mode;
const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quant_level]);
const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
unsigned int max_plane = bm.is_dual_plane;
int plane2_component = bm.is_dual_plane ? scb.plane2_component : -1;
int plane2_component = scb.plane2_component;
vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
// Decode the color endpoints
@@ -99,14 +99,13 @@ static bool realign_weights_undecimated(
{
unpack_color_endpoints(decode_mode,
scb.color_formats[pa_idx],
scb.get_color_quant_mode(),
scb.color_values[pa_idx],
rgb_hdr, alpha_hdr,
endpnt0[pa_idx],
endpnt1[pa_idx]);
}
uint8_t* dec_weights_quant_pvalue = scb.weights;
uint8_t* dec_weights_uquant = scb.weights;
bool adjustments = false;
// For each plane and partition ...
@@ -126,50 +125,48 @@ static bool realign_weights_undecimated(
promise(bsd.texel_count > 0);
for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
{
int uqw = qat->unquantized_value[dec_weights_quant_pvalue[texel]];
int uqw = dec_weights_uquant[texel];
uint32_t prev_and_next = qat->prev_next_values[uqw];
int prev_wt_uq = prev_and_next & 0xFF;
int next_wt_uq = (prev_and_next >> 8) & 0xFF;
uint32_t prev_and_next = qat.prev_next_values[uqw];
int uqw_down = prev_and_next & 0xFF;
int uqw_up = (prev_and_next >> 8) & 0xFF;
// Interpolate the colors to create the diffs
float weight_base = static_cast<float>(uqw);
float weight_down = static_cast<float>(uqw_down - uqw);
float weight_up = static_cast<float>(uqw_up - uqw);
unsigned int partition = pi.partition_of_texel[texel];
float plane_weight = static_cast<float>(uqw);
float plane_up_weight = static_cast<float>(next_wt_uq - uqw);
float plane_down_weight = static_cast<float>(prev_wt_uq - uqw);
vfloat4 color_offset = offset[partition];
vfloat4 color_base = endpnt0f[partition];
vfloat4 color = color_base + color_offset * plane_weight;
vfloat4 color = color_base + color_offset * weight_base;
vfloat4 orig_color = blk.texel(texel);
vfloat4 error_weight = blk.channel_weight;
vfloat4 color_diff = color - orig_color;
vfloat4 color_up_diff = color_diff + color_offset * plane_up_weight;
vfloat4 color_down_diff = color_diff + color_offset * plane_down_weight;
vfloat4 color_diff_down = color_diff + color_offset * weight_down;
vfloat4 color_diff_up = color_diff + color_offset * weight_up;
float current_error = dot_s(color_diff * color_diff, error_weight);
float up_error = dot_s(color_up_diff * color_up_diff, error_weight);
float down_error = dot_s(color_down_diff * color_down_diff, error_weight);
float error_base = dot_s(color_diff * color_diff, error_weight);
float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
float error_up = dot_s(color_diff_up * color_diff_up, error_weight);
// Check if the prev or next error is better, and if so use it
if ((up_error < current_error) && (up_error < down_error))
if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
{
dec_weights_quant_pvalue[texel] = static_cast<uint8_t>((prev_and_next >> 24) & 0xFF);
dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
adjustments = true;
}
else if (down_error < current_error)
else if ((error_down < error_base) && (uqw > 0))
{
dec_weights_quant_pvalue[texel] = static_cast<uint8_t>((prev_and_next >> 16) & 0xFF);
dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
adjustments = true;
}
}
// Prepare iteration for plane 2
dec_weights_quant_pvalue += WEIGHTS_PLANE2_OFFSET;
dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
plane_mask = ~plane_mask;
}
@@ -201,7 +198,7 @@ static bool realign_weights_decimated(
// Get the quantization table
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
unsigned int weight_quant_level = bm.quant_mode;
const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quant_level]);
const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
// Get the decimation table
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
@@ -209,7 +206,7 @@ static bool realign_weights_decimated(
assert(weight_count != bsd.texel_count);
unsigned int max_plane = bm.is_dual_plane;
int plane2_component = bm.is_dual_plane ? scb.plane2_component : -1;
int plane2_component = scb.plane2_component;
vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
// Decode the color endpoints
@@ -227,16 +224,13 @@ static bool realign_weights_decimated(
{
unpack_color_endpoints(decode_mode,
scb.color_formats[pa_idx],
scb.get_color_quant_mode(),
scb.color_values[pa_idx],
rgb_hdr, alpha_hdr,
endpnt0[pa_idx],
endpnt1[pa_idx]);
}
uint8_t uq_pl_weights[BLOCK_MAX_WEIGHTS];
float uq_pl_weightsf[BLOCK_MAX_WEIGHTS];
uint8_t* dec_weights_quant_pvalue = scb.weights;
uint8_t* dec_weights_uquant = scb.weights;
bool adjustments = false;
// For each plane and partition ...
@@ -253,97 +247,90 @@ static bool realign_weights_decimated(
}
// Create an unquantized weight grid for this decimation level
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
{
uq_pl_weights[we_idx] = qat->unquantized_value[dec_weights_quant_pvalue[we_idx]];
uq_pl_weightsf[we_idx] = static_cast<float>(uq_pl_weights[we_idx]);
vint unquant_value(dec_weights_uquant + we_idx);
vfloat unquant_valuef = int_to_float(unquant_value);
storea(unquant_valuef, uq_weightsf + we_idx);
}
// For each weight compute previous, current, and next errors
for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
{
unsigned int uqw = uq_pl_weights[we_idx];
float uqwf = uq_pl_weightsf[we_idx];
int uqw = dec_weights_uquant[we_idx];
uint32_t prev_and_next = qat.prev_next_values[uqw];
uint32_t prev_and_next = qat->prev_next_values[uqw];
unsigned int prev_wt_uq = prev_and_next & 0xFF;
unsigned int next_wt_uq = (prev_and_next >> 8) & 0xFF;
float uqw_base = uq_weightsf[we_idx];
float uqw_down = static_cast<float>(prev_and_next & 0xFF);
float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
float uqw_next_dif = static_cast<float>(next_wt_uq) - uqwf;
float uqw_prev_dif = static_cast<float>(prev_wt_uq) - uqwf;
float uqw_diff_down = uqw_down - uqw_base;
float uqw_diff_up = uqw_up - uqw_base;
vfloat4 current_errorv = vfloat4::zero();
vfloat4 up_errorv = vfloat4::zero();
vfloat4 down_errorv = vfloat4::zero();
vfloat4 error_basev = vfloat4::zero();
vfloat4 error_downv = vfloat4::zero();
vfloat4 error_upv = vfloat4::zero();
// Interpolate the colors to create the diffs
unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
promise(texels_to_evaluate > 0);
for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
{
unsigned int texel = di.weight_texel[te_idx][we_idx];
float weight_base = uqwf;
unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
const uint8_t *texel_weights = di.texel_weights_texel[we_idx][te_idx];
const float *texel_weights_float = di.texel_weights_float_texel[we_idx][te_idx];
float twf0 = texel_weights_float[0];
float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
weight_base = (uqwf * twf0
+ uq_pl_weightsf[texel_weights[1]] * texel_weights_float[1])
+ (uq_pl_weightsf[texel_weights[2]] * texel_weights_float[2]
+ uq_pl_weightsf[texel_weights[3]] * texel_weights_float[3]);
unsigned int partition = pi.partition_of_texel[texel];
float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
+ uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
+ (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
+ uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
// Ideally this would be integer rounded, but the IQ gain isn't worth the overhead
// float plane_weight = astc::flt_rd(weight_base + 0.5f);
// float plane_up_weight = astc::flt_rd(weight_base + 0.5f + uqw_next_dif * twf0) - plane_weight;
// float plane_down_weight = astc::flt_rd(weight_base + 0.5f + uqw_prev_dif * twf0) - plane_weight;
float plane_weight = weight_base;
float plane_up_weight = weight_base + uqw_next_dif * twf0 - plane_weight;
float plane_down_weight = weight_base + uqw_prev_dif * twf0 - plane_weight;
// float weight = astc::flt_rd(weight_base + 0.5f);
// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
unsigned int partition = pi.partition_of_texel[texel];
vfloat4 color_offset = offset[partition];
vfloat4 color_base = endpnt0f[partition];
vfloat4 color = color_base + color_offset * plane_weight;
vfloat4 orig_color = blk.texel(texel);
vfloat4 color = color_base + color_offset * weight_base;
vfloat4 orig_color = blk.texel(texel);
vfloat4 color_diff = color - orig_color;
vfloat4 color_up_diff = color_diff + color_offset * plane_up_weight;
vfloat4 color_down_diff = color_diff + color_offset * plane_down_weight;
vfloat4 color_down_diff = color_diff + color_offset * weight_down;
vfloat4 color_up_diff = color_diff + color_offset * weight_up;
current_errorv += color_diff * color_diff;
up_errorv += color_up_diff * color_up_diff;
down_errorv += color_down_diff * color_down_diff;
error_basev += color_diff * color_diff;
error_downv += color_down_diff * color_down_diff;
error_upv += color_up_diff * color_up_diff;
}
vfloat4 error_weight = blk.channel_weight;
float current_error = hadd_s(current_errorv * error_weight);
float up_error = hadd_s(up_errorv * error_weight);
float down_error = hadd_s(down_errorv * error_weight);
float error_base = hadd_s(error_basev * error_weight);
float error_down = hadd_s(error_downv * error_weight);
float error_up = hadd_s(error_upv * error_weight);
// Check if the prev or next error is better, and if so use it
if ((up_error < current_error) && (up_error < down_error))
if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
{
uq_pl_weights[we_idx] = static_cast<uint8_t>(next_wt_uq);
uq_pl_weightsf[we_idx] = static_cast<float>(next_wt_uq);
dec_weights_quant_pvalue[we_idx] = static_cast<uint8_t>((prev_and_next >> 24) & 0xFF);
uq_weightsf[we_idx] = uqw_up;
dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
adjustments = true;
}
else if (down_error < current_error)
else if ((error_down < error_base) && (uqw > 0))
{
uq_pl_weights[we_idx] = static_cast<uint8_t>(prev_wt_uq);
uq_pl_weightsf[we_idx] = static_cast<float>(prev_wt_uq);
dec_weights_quant_pvalue[we_idx] = static_cast<uint8_t>((prev_and_next >> 16) & 0xFF);
uq_weightsf[we_idx] = uqw_down;
dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
adjustments = true;
}
}
// Prepare iteration for plane 2
dec_weights_quant_pvalue += WEIGHTS_PLANE2_OFFSET;
dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
plane_mask = ~plane_mask;
}
@@ -373,12 +360,15 @@ static float compress_symbolic_block_for_partition_1plane(
unsigned int partition_count,
unsigned int partition_index,
symbolic_compressed_block& scb,
compression_working_buffers& tmpbuf
compression_working_buffers& tmpbuf,
int quant_limit
) {
promise(partition_count > 0);
promise(config.tune_candidate_limit > 0);
promise(config.tune_refinement_limit > 0);
int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
auto compute_difference = &compute_symbolic_block_difference_1plane;
if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
{
@@ -389,13 +379,11 @@ static float compress_symbolic_block_for_partition_1plane(
// Compute ideal weights and endpoint colors, with no quantization or decimation
endpoints_and_weights& ei = tmpbuf.ei1;
endpoints_and_weights *eix = tmpbuf.eix1;
compute_ideal_colors_and_weights_1plane(blk, pi, ei);
// Compute ideal weights and endpoint colors for every decimation
float *dec_weights_ideal_value = tmpbuf.dec_weights_ideal_value;
float *dec_weights_quant_uvalue = tmpbuf.dec_weights_quant_uvalue;
uint8_t *dec_weights_quant_pvalue = tmpbuf.dec_weights_quant_pvalue;
float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
// For each decimation mode, compute an ideal set of weights with no quantization
unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
@@ -404,7 +392,7 @@ static float compress_symbolic_block_for_partition_1plane(
for (unsigned int i = 0; i < max_decimation_modes; i++)
{
const auto& dm = bsd.get_decimation_mode(i);
if (!dm.ref_1_plane)
if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
{
continue;
}
@@ -413,9 +401,8 @@ static float compress_symbolic_block_for_partition_1plane(
compute_ideal_weights_for_decimation(
ei,
eix[i],
di,
dec_weights_ideal_value + i * BLOCK_MAX_WEIGHTS);
dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
}
// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
@@ -433,14 +420,11 @@ static float compress_symbolic_block_for_partition_1plane(
// For each mode, use the angular method to compute a shift
compute_angular_endpoints_1plane(
config.tune_low_weight_count_limit,
only_always, bsd,
dec_weights_ideal_value,
tmpbuf);
only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
float* weight_low_value = tmpbuf.weight_low_value1;
float* weight_high_value = tmpbuf.weight_high_value1;
int* qwt_bitcounts = tmpbuf.qwt_bitcounts;
int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
float* qwt_errors = tmpbuf.qwt_errors;
// For each mode (which specifies a decimation and a quantization):
@@ -456,9 +440,16 @@ static float compress_symbolic_block_for_partition_1plane(
unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
: bsd.block_mode_count_1plane_selected;
promise(max_block_modes > 0);
for (unsigned int i = 0; i < max_block_modes; ++i)
for (unsigned int i = 0; i < max_block_modes; i++)
{
const block_mode& bm = bsd.block_modes[i];
if (bm.quant_mode > max_weight_quant)
{
qwt_errors[i] = 1e38f;
continue;
}
assert(!bm.is_dual_plane);
int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
if (bitcount <= 0)
@@ -475,26 +466,28 @@ static float compress_symbolic_block_for_partition_1plane(
int decimation_mode = bm.decimation_mode;
const auto& di = bsd.get_decimation_info(decimation_mode);
qwt_bitcounts[i] = bitcount;
qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
// Generate the optimized set of weights for the weight mode
compute_quantized_weights_for_decimation(
di,
weight_low_value[i], weight_high_value[i],
dec_weights_ideal_value + BLOCK_MAX_WEIGHTS * decimation_mode,
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i,
dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * i,
dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
dec_weights_uquantf,
dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
bm.get_weight_quant_mode());
// Compute weight quantization errors for the block mode
qwt_errors[i] = compute_error_of_weight_set_1plane(
eix[decimation_mode],
ei,
di,
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i);
dec_weights_uquantf);
}
// Decide the optimal combination of color endpoint encodings and weight encodings
int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
@@ -520,22 +513,22 @@ static float compress_symbolic_block_for_partition_1plane(
const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
int decimation_mode = qw_bm.decimation_mode;
int weight_quant_mode = qw_bm.quant_mode;
const auto& di = bsd.get_decimation_info(decimation_mode);
promise(di.weight_count > 0);
trace_add_data("weight_x", di.weight_x);
trace_add_data("weight_y", di.weight_y);
trace_add_data("weight_z", di.weight_z);
trace_add_data("weight_quant", weight_quant_mode);
trace_add_data("weight_quant", qw_bm.quant_mode);
// Recompute the ideal color endpoints before storing them
vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
symbolic_compressed_block workscb;
endpoints workep = ei.ep;
uint8_t* u8_weight_src = dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * bm_packed_index;
uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
for (unsigned int j = 0; j < di.weight_count; j++)
{
@@ -545,52 +538,56 @@ static float compress_symbolic_block_for_partition_1plane(
for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
{
recompute_ideal_colors_1plane(
blk, pi, di,
weight_quant_mode, workscb.weights,
eix[decimation_mode].ep, rgbs_colors, rgbo_colors);
blk, pi, di, workscb.weights,
workep, rgbs_colors, rgbo_colors);
// Quantize the chosen color
// Quantize the chosen color, tracking if worth trying the mod value
bool all_same = color_quant_level[i] != color_quant_level_mod[i];
for (unsigned int j = 0; j < partition_count; j++)
{
workscb.color_formats[j] = pack_color_endpoints(
privateProfile,
eix[decimation_mode].ep.endpt0[j],
eix[decimation_mode].ep.endpt1[j],
workep.endpt0[j],
workep.endpt1[j],
rgbs_colors[j],
rgbo_colors[j],
partition_format_specifiers[i][j],
workscb.color_values[j],
color_quant_level[i]);
all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
}
// If all the color endpoint modes are the same, we get a few more bits to store colors;
// let's see if we can take advantage of this: requantize all the colors and see if the
// endpoint modes remain the same.
workscb.color_formats_matched = 0;
if ((partition_count >= 2 && workscb.color_formats[0] == workscb.color_formats[1]
&& color_quant_level[i] != color_quant_level_mod[i])
&& (partition_count == 2 || (workscb.color_formats[0] == workscb.color_formats[2]
&& (partition_count == 3 || (workscb.color_formats[0] == workscb.color_formats[3])))))
if (partition_count >= 2 && all_same)
{
uint8_t colorvals[BLOCK_MAX_PARTITIONS][12];
uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
bool all_same_mod = true;
for (unsigned int j = 0; j < partition_count; j++)
{
color_formats_mod[j] = pack_color_endpoints(
privateProfile,
eix[decimation_mode].ep.endpt0[j],
eix[decimation_mode].ep.endpt1[j],
workep.endpt0[j],
workep.endpt1[j],
rgbs_colors[j],
rgbo_colors[j],
partition_format_specifiers[i][j],
colorvals[j],
color_quant_level_mod[i]);
// Early out as soon as it's no longer possible to use mod
if (color_formats_mod[j] != color_formats_mod[0])
{
all_same_mod = false;
break;
}
}
if (color_formats_mod[0] == color_formats_mod[1]
&& (partition_count == 2 || (color_formats_mod[0] == color_formats_mod[2]
&& (partition_count == 3 || (color_formats_mod[0] == color_formats_mod[3])))))
if (all_same_mod)
{
workscb.color_formats_matched = 1;
for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
@@ -631,12 +628,12 @@ static float compress_symbolic_block_for_partition_1plane(
trace_add_data("error_prerealign", errorval);
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
// Average refinement improvement is 3.5% per iteration (allow 5%), but the first
// iteration can help more so we give it a extra 10% leeway. Use this knowledge to
// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
// iteration can help more so we give it an extra 8% leeway. Use this knowledge to
// drive a heuristic to skip blocks that are unlikely to catch up with the best
// block we have already.
unsigned int iters_remaining = config.tune_refinement_limit - l;
float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.1f;
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
if (errorval > (threshold * best_errorval_in_scb))
{
break;
@@ -681,10 +678,10 @@ static float compress_symbolic_block_for_partition_1plane(
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
// Average refinement improvement is 3.5% per iteration, so skip blocks that are
// unlikely to catch up with the best block we have already. Assume a 5% per step to
// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
// give benefit of the doubt ...
unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.0f;
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
if (errorval > (threshold * best_errorval_in_scb))
{
break;
@@ -733,29 +730,30 @@ static float compress_symbolic_block_for_partition_2planes(
float tune_errorval_threshold,
unsigned int plane2_component,
symbolic_compressed_block& scb,
compression_working_buffers& tmpbuf
compression_working_buffers& tmpbuf,
int quant_limit
) {
promise(config.tune_candidate_limit > 0);
promise(config.tune_refinement_limit > 0);
promise(bsd.decimation_mode_count_selected > 0);
int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
// Compute ideal weights and endpoint colors, with no quantization or decimation
endpoints_and_weights& ei1 = tmpbuf.ei1;
endpoints_and_weights& ei2 = tmpbuf.ei2;
endpoints_and_weights* eix1 = tmpbuf.eix1;
endpoints_and_weights* eix2 = tmpbuf.eix2;
compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
// Compute ideal weights and endpoint colors for every decimation
float *dec_weights_ideal_value = tmpbuf.dec_weights_ideal_value;
float *dec_weights_quant_uvalue = tmpbuf.dec_weights_quant_uvalue;
uint8_t *dec_weights_quant_pvalue = tmpbuf.dec_weights_quant_pvalue;
float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
// For each decimation mode, compute an ideal set of weights with no quantization
for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
{
const auto& dm = bsd.get_decimation_mode(i);
if (!dm.ref_2_planes)
if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
{
continue;
}
@@ -764,15 +762,13 @@ static float compress_symbolic_block_for_partition_2planes(
compute_ideal_weights_for_decimation(
ei1,
eix1[i],
di,
dec_weights_ideal_value + i * BLOCK_MAX_WEIGHTS);
dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
compute_ideal_weights_for_decimation(
ei2,
eix2[i],
di,
dec_weights_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
}
// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
@@ -800,9 +796,7 @@ static float compress_symbolic_block_for_partition_2planes(
float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
compute_angular_endpoints_2planes(
config.tune_low_weight_count_limit,
bsd, dec_weights_ideal_value,
tmpbuf);
bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
// For each mode (which specifies a decimation and a quantization):
// * Compute number of bits needed for the quantized weights
@@ -814,7 +808,7 @@ static float compress_symbolic_block_for_partition_2planes(
float* weight_low_value2 = tmpbuf.weight_low_value2;
float* weight_high_value2 = tmpbuf.weight_high_value2;
int* qwt_bitcounts = tmpbuf.qwt_bitcounts;
int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
float* qwt_errors = tmpbuf.qwt_errors;
unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
@@ -825,7 +819,13 @@ static float compress_symbolic_block_for_partition_2planes(
const block_mode& bm = bsd.block_modes[i];
assert(bm.is_dual_plane);
qwt_bitcounts[i] = 109 - bm.weight_bits;
if (bm.quant_mode > max_weight_quant)
{
qwt_errors[i] = 1e38f;
continue;
}
qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
{
@@ -840,36 +840,38 @@ static float compress_symbolic_block_for_partition_2planes(
unsigned int decimation_mode = bm.decimation_mode;
const auto& di = bsd.get_decimation_info(decimation_mode);
ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
// Generate the optimized set of weights for the mode
compute_quantized_weights_for_decimation(
di,
weight_low_value1[i],
weight_high_value1[i],
dec_weights_ideal_value + BLOCK_MAX_WEIGHTS * decimation_mode,
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i,
dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * i,
dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
dec_weights_uquantf,
dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
bm.get_weight_quant_mode());
compute_quantized_weights_for_decimation(
di,
weight_low_value2[i],
weight_high_value2[i],
dec_weights_ideal_value + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
bm.get_weight_quant_mode());
// Compute weight quantization errors for the block mode
qwt_errors[i] = compute_error_of_weight_set_2planes(
eix1[decimation_mode],
eix2[decimation_mode],
ei1,
ei2,
di,
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i,
dec_weights_quant_uvalue + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET);
dec_weights_uquantf,
dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
}
// Decide the optimal combination of color endpoint encodings and weight encodings
int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
@@ -901,25 +903,22 @@ static float compress_symbolic_block_for_partition_2planes(
const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
int decimation_mode = qw_bm.decimation_mode;
int weight_quant_mode = qw_bm.quant_mode;
const auto& di = bsd.get_decimation_info(decimation_mode);
promise(di.weight_count > 0);
trace_add_data("weight_x", di.weight_x);
trace_add_data("weight_y", di.weight_y);
trace_add_data("weight_z", di.weight_z);
trace_add_data("weight_quant", weight_quant_mode);
// Recompute the ideal color endpoints before storing them.
merge_endpoints(eix1[decimation_mode].ep, eix2[decimation_mode].ep, plane2_component, epm);
trace_add_data("weight_quant", qw_bm.quant_mode);
vfloat4 rgbs_color;
vfloat4 rgbo_color;
symbolic_compressed_block workscb;
endpoints workep = epm;
uint8_t* u8_weight1_src = dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * bm_packed_index;
uint8_t* u8_weight2_src = dec_weights_quant_pvalue + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
for (int j = 0; j < di.weight_count; j++)
{
@@ -930,15 +929,15 @@ static float compress_symbolic_block_for_partition_2planes(
for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
{
recompute_ideal_colors_2planes(
blk, bsd, di, weight_quant_mode,
blk, bsd, di,
workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
epm, rgbs_color, rgbo_color, plane2_component);
workep, rgbs_color, rgbo_color, plane2_component);
// Quantize the chosen color
workscb.color_formats[0] = pack_color_endpoints(
privateProfile,
epm.endpt0[0],
epm.endpt1[0],
workep.endpt0[0],
workep.endpt1[0],
rgbs_color, rgbo_color,
partition_format_specifiers[i][0],
workscb.color_values[0],
@@ -966,12 +965,12 @@ static float compress_symbolic_block_for_partition_2planes(
trace_add_data("error_prerealign", errorval);
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
// Average refinement improvement is 3.5% per iteration (allow 5%), but the first
// iteration can help more so we give it a extra 10% leeway. Use this knowledge to
// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
// iteration can help more so we give it an extra 8% leeway. Use this knowledge to
// drive a heuristic to skip blocks that are unlikely to catch up with the best
// block we have already.
unsigned int iters_remaining = config.tune_refinement_limit - l;
float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.1f;
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
if (errorval > (threshold * best_errorval_in_scb))
{
break;
@@ -1017,10 +1016,10 @@ static float compress_symbolic_block_for_partition_2planes(
best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
// Average refinement improvement is 3.5% per iteration, so skip blocks that are
// unlikely to catch up with the best block we have already. Assume a 5% per step to
// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
// give benefit of the doubt ...
unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
float threshold = (0.05f * static_cast<float>(iters_remaining)) + 1.0f;
float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
if (errorval > (threshold * best_errorval_in_scb))
{
break;
@@ -1132,12 +1131,13 @@ static float prepare_block_statistics(
aa_var -= as * (as * rpt);
rg_cov *= astc::rsqrt(astc::max(rr_var * gg_var, 1e-30f));
rb_cov *= astc::rsqrt(astc::max(rr_var * bb_var, 1e-30f));
ra_cov *= astc::rsqrt(astc::max(rr_var * aa_var, 1e-30f));
gb_cov *= astc::rsqrt(astc::max(gg_var * bb_var, 1e-30f));
ga_cov *= astc::rsqrt(astc::max(gg_var * aa_var, 1e-30f));
ba_cov *= astc::rsqrt(astc::max(bb_var * aa_var, 1e-30f));
// These will give a NaN if a channel is constant - these are fixed up in the next step
rg_cov *= astc::rsqrt(rr_var * gg_var);
rb_cov *= astc::rsqrt(rr_var * bb_var);
ra_cov *= astc::rsqrt(rr_var * aa_var);
gb_cov *= astc::rsqrt(gg_var * bb_var);
ga_cov *= astc::rsqrt(gg_var * aa_var);
ba_cov *= astc::rsqrt(bb_var * aa_var);
if (astc::isnan(rg_cov)) rg_cov = 1.0f;
if (astc::isnan(rb_cov)) rb_cov = 1.0f;
@@ -1146,7 +1146,7 @@ static float prepare_block_statistics(
if (astc::isnan(ga_cov)) ga_cov = 1.0f;
if (astc::isnan(ba_cov)) ba_cov = 1.0f;
float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
@@ -1173,9 +1173,9 @@ static float prepare_block_statistics(
/* See header for documentation. */
void compress_block(
const astcenc_context& ctx,
const astcenc_contexti& ctx,
const image_block& blk,
physical_compressed_block& pcb,
uint8_t pcb[16],
#if QUALITY_CONTROL
compression_working_buffers& tmpbuf,
bool calQualityEnable,
@@ -1206,16 +1206,28 @@ void compress_block(
bool block_skip_two_plane = false;
int max_partitions = (ctx.config.privateProfile == HIGH_SPEED_PROFILE) ? 1 : ctx.config.tune_partition_count_limit;
unsigned int requested_partition_indices[3] {
ctx.config.tune_2partition_index_limit,
ctx.config.tune_3partition_index_limit,
ctx.config.tune_4partition_index_limit
};
unsigned int requested_partition_trials[3] {
ctx.config.tune_2partitioning_candidate_limit,
ctx.config.tune_3partitioning_candidate_limit,
ctx.config.tune_4partitioning_candidate_limit
};
#if defined(ASTCENC_DIAGNOSTICS)
// Do this early in diagnostic builds so we can dump uniform metrics
// for every block. Do it later in release builds to avoid redundant work!
float error_weight_sum = hadd_s(blk.channel_weight) * bsd->texel_count;
float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
float error_threshold = ctx.config.tune_db_limit
* error_weight_sum
* block_is_l_scale
* block_is_la_scale;
lowest_correl = prepare_block_statistics(bsd->texel_count, blk);
lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
trace_add_data("lowest_correl", lowest_correl);
trace_add_data("tune_error_threshold", error_threshold);
#endif
@@ -1228,6 +1240,7 @@ void compress_block(
trace_add_data("plane_count", 1);
scb.partition_count = 0;
// Encode as FP16 if using HDR
if ((decode_mode == ASTCENC_PRF_HDR) ||
(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
@@ -1244,6 +1257,7 @@ void compress_block(
vint4 color_u16 = float_to_int_rtn(color_f32);
store(color_u16, scb.constant_color);
}
trace_add_data("exit", "quality hit");
if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
{
@@ -1258,7 +1272,7 @@ void compress_block(
for (int w = 0; w < 16; w++) { // weights num is 16 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
scb.weights[w] = 0;
}
for (int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit
for (unsigned int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit
scb.color_values[0][pixel << 1] = scb.constant_color[pixel] & BYTE_MASK; // low byte
scb.color_values[0][(pixel << 1) + 1] = (scb.constant_color[pixel] >> 8) & BYTE_MASK; // high byte
}
@@ -1291,8 +1305,8 @@ void compress_block(
float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
0.0f,
ctx.config.tune_2_partition_early_out_limit_factor,
ctx.config.tune_3_partition_early_out_limit_factor,
ctx.config.tune_2partition_early_out_limit_factor,
ctx.config.tune_3partition_early_out_limit_factor,
0.0f
};
@@ -1304,19 +1318,21 @@ void compress_block(
// compression and slightly reduces image quality.
float errorval_mult[2] {
1.0f / ctx.config.tune_mode0_mse_overshoot,
1.0f / ctx.config.tune_mse_overshoot,
1.0f
};
static const float errorval_overshoot = 1.0f / ctx.config.tune_refinement_mse_overshoot;
static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
// Only enable MODE0 fast path (trial 0) if 2D and more than 25 texels
// Only enable MODE0 fast path if enabled
// Never enable for 3D blocks as no "always" block modes are available
int start_trial = 1;
if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1))
if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
{
start_trial = 0;
}
int quant_limit = QUANT_32;
for (int i = start_trial; i < 2; i++)
{
TRACE_NODE(node1, "pass");
@@ -1328,7 +1344,11 @@ void compress_block(
ctx.config.privateProfile,
ctx.config, bsd, blk, i == 0,
error_threshold * errorval_mult[i] * errorval_overshoot,
1, 0, scb, tmpbuf);
1, 0, scb, tmpbuf, QUANT_32);
// Record the quant level so we can use it to filter later searches
const auto& bm = bsd.get_block_mode(scb.block_mode);
quant_limit = bm.get_weight_quant_mode();
best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
if ((ctx.config.privateProfile == HIGH_SPEED_PROFILE) || (errorval < (error_threshold * errorval_mult[i])))
@@ -1342,7 +1362,7 @@ void compress_block(
lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
#endif
block_skip_two_plane = lowest_correl > ctx.config.tune_2_plane_early_out_limit_correlation;
block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
// alpha is the most likely to be non-correlated if it is present in the data.
@@ -1359,7 +1379,7 @@ void compress_block(
if (block_skip_two_plane)
{
trace_add_data("skip", "tune_2_plane_early_out_limit_correlation");
trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
continue;
}
@@ -1378,11 +1398,11 @@ void compress_block(
float errorval = compress_symbolic_block_for_partition_2planes(
ctx.config.privateProfile,
ctx.config, bsd, blk, error_threshold * errorval_overshoot,
i, scb, tmpbuf);
i, scb, tmpbuf, quant_limit);
// If attempting two planes is much worse than the best one plane result
// then further two plane searches are unlikely to help so move on ...
if (errorval > (best_errorvals_for_pcount[0] * 2.0f))
if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
{
break;
}
@@ -1397,13 +1417,19 @@ void compress_block(
// Find best blocks for 2, 3 and 4 partitions
for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
{
unsigned int partition_indices[2] { 0 };
unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
find_best_partition_candidates(bsd, blk, partition_count,
ctx.config.tune_partition_index_limit,
partition_indices);
unsigned int requested_indices = requested_partition_indices[partition_count - 2];
for (unsigned int i = 0; i < 2; i++)
unsigned int requested_trials = requested_partition_trials[partition_count - 2];
requested_trials = astc::min(requested_trials, requested_indices);
unsigned int actual_trials = find_best_partition_candidates(
bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
for (unsigned int i = 0; i < actual_trials; i++)
{
TRACE_NODE(node1, "pass");
trace_add_data("partition_count", partition_count);
@@ -1416,9 +1442,22 @@ void compress_block(
ctx.config, bsd, blk, false,
error_threshold * errorval_overshoot,
partition_count, partition_indices[i],
scb, tmpbuf);
scb, tmpbuf, quant_limit);
best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
// If using N partitions doesn't improve much over using N-1 partitions then skip trying
// N+1. Error can dramatically improve if the data is correlated or non-correlated and
// aligns with a partitioning that suits that encoding, so for this inner loop check add
// a large error scale because the "other" trial could be a lot better.
float best_error = best_errorvals_for_pcount[partition_count - 1];
float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
if (best_error > (best_error_in_prev * best_error_scale))
{
trace_add_data("skip", "tune_partition_early_out_limit_factor");
goto END_OF_TESTS;
}
if (errorval < error_threshold)
{
trace_add_data("exit", "quality hit");
@@ -1428,7 +1467,6 @@ void compress_block(
// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
float best_error = best_errorvals_for_pcount[partition_count - 1];
float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
if (best_error > (best_error_in_prev * best_error_scale))
{
@@ -1455,7 +1493,6 @@ END_OF_TESTS:
#endif
scb.block_type = SYM_BTYPE_CONST_U16;
scb.block_mode = -2;
vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
vint4 color_u16 = float_to_int_rtn(color_f32);
store(color_u16, scb.constant_color);
+3 -62
View File
@@ -99,17 +99,9 @@ static void brent_kung_prefix_sum(
} while (lc_stride > 2);
}
/**
* @brief Compute averages for a pixel region.
*
* The routine computes both in a single pass, using a summed-area table to decouple the running
* time from the averaging/variance kernel size.
*
* @param[out] ctx The compressor context storing the output data.
* @param arg The input parameter structure.
*/
static void compute_pixel_region_variance(
astcenc_context& ctx,
/* See header for documentation. */
void compute_pixel_region_variance(
astcenc_contexti& ctx,
const pixel_region_args& arg
) {
// Unpack the memory structure into local variables
@@ -427,57 +419,6 @@ static void compute_pixel_region_variance(
}
}
/**
 * @brief Run the region-averaging worker loop for the calling thread.
 *
 * The thread repeatedly claims a batch of task indices from @c ctx.manage_avg and processes them
 * until no work remains. Each task index is split into a Z-slab index and a Y-row index; the task
 * then walks that row along X in steps of @c blk_size_xy, calling
 * @c compute_pixel_region_variance() once per region.
 *
 * @param[out] ctx   The compressor context; provides the task manager and is passed to the
 *                   per-region routine.
 * @param      ag    The averaging arguments: image size, region step sizes, scratch size, and the
 *                   template @c pixel_region_args copied for each region.
 */
void compute_averages(
	astcenc_context& ctx,
	const avg_args &ag
) {
	// Work on a private copy so per-region fields can be modified freely
	pixel_region_args arg = ag.arg;
	arg.work_memory = new vfloat4[ag.work_memory_size];

	int size_x = ag.img_size_x;
	int size_y = ag.img_size_y;
	int size_z = ag.img_size_z;

	int step_xy = ag.blk_size_xy;
	int step_z = ag.blk_size_z;

	// Number of Y rows per Z slab, rounding up to cover a partial final row
	int y_tasks = (size_y + step_xy - 1) / step_xy;

	// All threads run this processing loop until there is no work remaining
	while (true)
	{
		unsigned int count;
		unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
		if (!count)
		{
			break;
		}

		for (unsigned int i = base; i < base + count; i++)
		{
			// Split the flat task index into (z_task, y_task) BEFORE scaling to texel offsets.
			// Subtracting the already-scaled Z offset would only be correct when z_task is zero
			// or step_z is one.
			// NOTE(review): assumes tasks are numbered z_task * y_tasks + y_task - confirm against
			// the task count returned by init_compute_averages().
			int z_task = static_cast<int>(i) / y_tasks;
			int y_task = static_cast<int>(i) - (z_task * y_tasks);

			int z = z_task * step_z;
			int y = y_task * step_xy;

			// Clamp the region size at the image edges
			arg.size_z = astc::min(step_z, size_z - z);
			arg.offset_z = z;

			arg.size_y = astc::min(step_xy, size_y - y);
			arg.offset_y = y;

			for (int x = 0; x < size_x; x += step_xy)
			{
				arg.size_x = astc::min(step_xy, size_x - x);
				arg.offset_x = x;
				compute_pixel_region_variance(ctx, arg);
			}
		}

		ctx.manage_avg.complete_task_assignment(count);
	}

	delete[] arg.work_memory;
}
/* See header for documentation. */
unsigned int init_compute_averages(
const astcenc_image& img,
+104 -99
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -24,48 +24,18 @@
#include <stdio.h>
#include <assert.h>
/**
* @brief Compute a vector of texel weights by interpolating the decimated weight grid.
*
* @param base_texel_index The first texel to get; N (SIMD width) consecutive texels are loaded.
* @param di The weight grid decimation to use.
* @param weights The raw weights.
*
* @return The undecimated weight for N (SIMD width) texels.
*/
static vint compute_value_of_texel_weight_int_vla(
int base_texel_index,
const decimation_info& di,
const int* weights
) {
vint summed_value(8);
vint weight_count(di.texel_weight_count + base_texel_index);
int max_weight_count = hmax(weight_count).lane<0>();
promise(max_weight_count > 0);
for (int i = 0; i < max_weight_count; i++)
{
vint texel_weights(di.texel_weights_4t[i] + base_texel_index);
vint texel_weights_int(di.texel_weights_int_4t[i] + base_texel_index);
summed_value += gatheri(weights, texel_weights) * texel_weights_int;
}
return lsr<4>(summed_value);
}
/**
* @brief Compute the integer linear interpolation of two color endpoints.
*
* @param decode_mode The ASTC profile (linear or sRGB)
* @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16.
* @param color0 The endpoint0 color.
* @param color1 The endpoint1 color.
* @param weights The interpolation weight (between 0 and 64).
* @param weights The interpolation weight (between 0 and 64).
*
* @return The interpolated color.
*/
static vint4 lerp_color_int(
astcenc_profile decode_mode,
vmask4 u8_mask,
vint4 color0,
vint4 color1,
vint4 weights
@@ -73,24 +43,18 @@ static vint4 lerp_color_int(
vint4 weight1 = weights;
vint4 weight0 = vint4(64) - weight1;
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
color0 = asr<8>(color0);
color1 = asr<8>(color1);
}
vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
color = asr<6>(color);
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
color = color * vint4(257);
}
// For decode_unorm8 values force the codec to bit replicate. This allows the
// rest of the codec to assume the full 0xFFFF range for everything and ignore
// the decode_mode setting
vint4 color_u8 = asr<8>(color) * vint4(257);
color = select(color, color_u8, u8_mask);
return color;
}
/**
* @brief Convert integer color value into a float value for the decoder.
*
@@ -127,43 +91,74 @@ void unpack_weights(
const symbolic_compressed_block& scb,
const decimation_info& di,
bool is_dual_plane,
quant_method quant_level,
int weights_plane1[BLOCK_MAX_TEXELS],
int weights_plane2[BLOCK_MAX_TEXELS]
) {
// First, unquantize the weights ...
int uq_plane1_weights[BLOCK_MAX_WEIGHTS];
int uq_plane2_weights[BLOCK_MAX_WEIGHTS];
unsigned int weight_count = di.weight_count;
const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[quant_level]);
// Second, undecimate the weights ...
// Safe to overshoot as all arrays are allocated to full size
if (!is_dual_plane)
{
for (unsigned int i = 0; i < weight_count; i++)
{
uq_plane1_weights[i] = qat->unquantized_value[scb.weights[i]];
}
// Build full 64-entry weight lookup table
vint4 tab0 = vint4::load(scb.weights + 0);
vint4 tab1 = vint4::load(scb.weights + 16);
vint4 tab2 = vint4::load(scb.weights + 32);
vint4 tab3 = vint4::load(scb.weights + 48);
vint tab0p, tab1p, tab2p, tab3p;
vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
{
store(compute_value_of_texel_weight_int_vla(i, di, uq_plane1_weights), weights_plane1 + i);
vint summed_value(8);
vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
{
vint texel_weights(di.texel_weights_tr[j] + i);
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
}
store(lsr<4>(summed_value), weights_plane1 + i);
}
}
else
{
for (unsigned int i = 0; i < weight_count; i++)
{
uq_plane1_weights[i] = qat->unquantized_value[scb.weights[i]];
uq_plane2_weights[i] = qat->unquantized_value[scb.weights[i + WEIGHTS_PLANE2_OFFSET]];
}
// Build a 32-entry weight lookup table per plane
// Plane 1
vint4 tab0_plane1 = vint4::load(scb.weights + 0);
vint4 tab1_plane1 = vint4::load(scb.weights + 16);
vint tab0_plane1p, tab1_plane1p;
vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
// Plane 2
vint4 tab0_plane2 = vint4::load(scb.weights + 32);
vint4 tab1_plane2 = vint4::load(scb.weights + 48);
vint tab0_plane2p, tab1_plane2p;
vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
{
store(compute_value_of_texel_weight_int_vla(i, di, uq_plane1_weights), weights_plane1 + i);
store(compute_value_of_texel_weight_int_vla(i, di, uq_plane2_weights), weights_plane2 + i);
vint sum_plane1(8);
vint sum_plane2(8);
vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
{
vint texel_weights(di.texel_weights_tr[j] + i);
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
}
store(lsr<4>(sum_plane1), weights_plane1 + i);
store(lsr<4>(sum_plane2), weights_plane2 + i);
}
}
}
@@ -228,12 +223,13 @@ void decompress_symbolic_block(
{
vint4 colori(scb.constant_color);
// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
colori = asr<8>(colori) * 257;
}
// Determine the UNORM8 rounding on the decode
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
// The real decoder would just use the top 8 bits, but we rescale
// into a 16-bit value that rounds correctly.
vint4 colori_u8 = asr<8>(colori) * 257;
colori = select(colori, colori_u8, u8_mask);
vint4 colorf16 = unorm16_to_sf16(colori);
color = float16_to_float(colorf16);
@@ -277,17 +273,19 @@ void decompress_symbolic_block(
const auto& bm = bsd.get_block_mode(scb.block_mode);
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
int is_dual_plane = bm.is_dual_plane;
bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
// Unquantize and undecimate the weights
int plane1_weights[BLOCK_MAX_TEXELS];
int plane2_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, is_dual_plane, bm.get_weight_quant_mode(), plane1_weights, plane2_weights);
unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
// Now that we have endpoint colors and weights, we can unpack texel colors
int plane2_component = is_dual_plane ? scb.plane2_component : -1;
int plane2_component = scb.plane2_component;
vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
for (int i = 0; i < partition_count; i++)
{
// Decode the color endpoints for this partition
@@ -298,7 +296,6 @@ void decompress_symbolic_block(
unpack_color_endpoints(decode_mode,
scb.color_formats[i],
scb.get_color_quant_mode(),
scb.color_values[i],
rgb_lns, a_lns,
ep0, ep1);
@@ -310,7 +307,7 @@ void decompress_symbolic_block(
{
int tix = pi.texels_of_partition[i][j];
vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
vfloat4 colorf = decode_texel(color, lns_mask);
blk.data_r[tix] = colorf.lane<0>();
@@ -347,7 +344,7 @@ float compute_symbolic_block_difference_2plane(
// Unquantize and undecimate the weights
int plane1_weights[BLOCK_MAX_TEXELS];
int plane2_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, true, bm.get_weight_quant_mode(), plane1_weights, plane2_weights);
unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
@@ -361,17 +358,18 @@ float compute_symbolic_block_difference_2plane(
unpack_color_endpoints(config.profile,
scb.color_formats[0],
scb.get_color_quant_mode(),
scb.color_values[0],
rgb_lns, a_lns,
ep0, ep1);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Unpack and compute error for each texel in the partition
unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i++)
{
vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
vfloat4 color = int_to_float(colori);
vfloat4 oldColor = blk.texel(i);
@@ -443,7 +441,9 @@ float compute_symbolic_block_difference_1plane(
// Unquantize and undecimate the weights
int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, bm.get_weight_quant_mode(), plane1_weights, nullptr);
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
vfloat4 summa = vfloat4::zero();
for (unsigned int i = 0; i < partition_count; i++)
@@ -456,7 +456,6 @@ float compute_symbolic_block_difference_1plane(
unpack_color_endpoints(config.profile,
scb.color_formats[i],
scb.get_color_quant_mode(),
scb.color_values[i],
rgb_lns, a_lns,
ep0, ep1);
@@ -466,7 +465,7 @@ float compute_symbolic_block_difference_1plane(
for (unsigned int j = 0; j < texel_count; j++)
{
unsigned int tix = pi.texels_of_partition[i][j];
vint4 colori = lerp_color_int(config.profile, ep0, ep1,
vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
vint4(plane1_weights[tix]));
vfloat4 color = int_to_float(colori);
@@ -534,8 +533,8 @@ float compute_symbolic_block_difference_1plane_1partition(
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights
alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, bm.get_weight_quant_mode(), plane1_weights, nullptr);
ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
// Decode the color endpoints for this partition
vint4 ep0;
@@ -545,24 +544,16 @@ float compute_symbolic_block_difference_1plane_1partition(
unpack_color_endpoints(config.profile,
scb.color_formats[0],
scb.get_color_quant_mode(),
scb.color_values[0],
rgb_lns, a_lns,
ep0, ep1);
// Pre-shift sRGB so things round correctly
if (config.profile == ASTCENC_PRF_LDR_SRGB)
{
ep0 = asr<8>(ep0);
ep1 = asr<8>(ep1);
}
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Unpack and compute error for each texel in the partition
vfloatacc summav = vfloatacc::zero();
vint lane_id = vint::lane_id();
vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
@@ -581,11 +572,25 @@ float compute_symbolic_block_difference_1plane_1partition(
vint ep0_b = vint(ep0.lane<2>()) * weight0;
vint ep0_a = vint(ep0.lane<3>()) * weight0;
// Shift so things round correctly
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
// Combine contributions
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
// If using a U8 decode mode bit replicate top 8 bits
// so rest of codec can assume 0xFFFF max range everywhere
vint colori_r8 = asr<8>(colori_r) * vint(257);
colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
vint colori_g8 = asr<8>(colori_g) * vint(257);
colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
vint colori_b8 = asr<8>(colori_b) * vint(257);
colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
vint colori_a8 = asr<8>(colori_a) * vint(257);
colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
// Compute color diff
vfloat color_r = int_to_float(colori_r);
+29 -14
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2021 Arm Limited
// Copyright 2021-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -24,6 +24,8 @@
#include <cassert>
#include <cstdarg>
#include <cstdio>
#include <cmath>
#include <limits>
#include <string>
#include "astcenc_diagnostic_trace.h"
@@ -32,7 +34,7 @@
static TraceLog* g_TraceLog = nullptr;
/** @brief The JSON indentation level. */
static const int g_trace_indent = 2;
static const size_t g_trace_indent = 2;
TraceLog::TraceLog(
const char* file_name):
@@ -55,7 +57,7 @@ TraceNode* TraceLog::get_current_leaf()
}
/* See header for documentation. */
int TraceLog::get_depth()
size_t TraceLog::get_depth()
{
return m_stack.size();
}
@@ -82,12 +84,12 @@ TraceNode::TraceNode(
vsnprintf (buffer, bufsz, format, args);
va_end (args);
// Guarantee there is a nul termintor
// Guarantee there is a nul terminator
buffer[bufsz - 1] = 0;
// Generate the node
TraceNode* parent = g_TraceLog->get_current_leaf();
int depth = g_TraceLog->get_depth();
size_t depth = g_TraceLog->get_depth();
g_TraceLog->m_stack.push_back(this);
bool comma = parent && parent->m_attrib_count;
@@ -108,8 +110,8 @@ TraceNode::TraceNode(
out << '\n';
}
int out_indent = (depth * 2) * g_trace_indent;
int in_indent = (depth * 2 + 1) * g_trace_indent;
size_t out_indent = (depth * 2) * g_trace_indent;
size_t in_indent = (depth * 2 + 1) * g_trace_indent;
std::string out_indents("");
if (out_indent)
@@ -131,8 +133,8 @@ void TraceNode::add_attrib(
) {
(void)type;
int depth = g_TraceLog->get_depth();
int indent = (depth * 2) * g_trace_indent;
size_t depth = g_TraceLog->get_depth();
size_t indent = (depth * 2) * g_trace_indent;
auto& out = g_TraceLog->m_file;
bool comma = m_attrib_count;
m_attrib_count++;
@@ -154,9 +156,9 @@ TraceNode::~TraceNode()
g_TraceLog->m_stack.pop_back();
auto& out = g_TraceLog->m_file;
int depth = g_TraceLog->get_depth();
int out_indent = (depth * 2) * g_trace_indent;
int in_indent = (depth * 2 + 1) * g_trace_indent;
size_t depth = g_TraceLog->get_depth();
size_t out_indent = (depth * 2) * g_trace_indent;
size_t in_indent = (depth * 2 + 1) * g_trace_indent;
std::string out_indents("");
if (out_indent)
@@ -189,7 +191,7 @@ void trace_add_data(
vsnprintf (buffer, bufsz, format, args);
va_end (args);
// Guarantee there is a nul termintor
// Guarantee there is a nul terminator
buffer[bufsz - 1] = 0;
std::string value = "\"" + std::string(buffer) + "\"";
@@ -203,7 +205,20 @@ void trace_add_data(
const char* key,
float value
) {
char buffer[256];
// Turn infinities into parseable values
if (std::isinf(value))
{
if (value > 0.0f)
{
value = std::numeric_limits<float>::max();
}
else
{
value = -std::numeric_limits<float>::max();
}
}
char buffer[256];
sprintf(buffer, "%.20g", (double)value);
TraceNode* node = g_TraceLog->get_current_leaf();
node->add_attrib("float", key, buffer);
+2 -2
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2021 Arm Limited
// Copyright 2021-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -148,7 +148,7 @@ public:
*
* @return The current leaf node stack depth.
*/
int get_depth();
size_t get_depth();
/**
* @brief The file stream to write to.
+249 -181
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -24,7 +24,7 @@
#include <new>
#include "astcenc.h"
#include "astcenc_internal.h"
#include "astcenc_internal_entry.h"
#include "astcenc_diagnostic_trace.h"
/**
@@ -40,89 +40,96 @@ struct astcenc_preset_config
{
float quality;
unsigned int tune_partition_count_limit;
unsigned int tune_partition_index_limit;
unsigned int tune_2partition_index_limit;
unsigned int tune_3partition_index_limit;
unsigned int tune_4partition_index_limit;
unsigned int tune_block_mode_limit;
unsigned int tune_refinement_limit;
unsigned int tune_candidate_limit;
unsigned int tune_2partitioning_candidate_limit;
unsigned int tune_3partitioning_candidate_limit;
unsigned int tune_4partitioning_candidate_limit;
float tune_db_limit_a_base;
float tune_db_limit_b_base;
float tune_mode0_mse_overshoot;
float tune_refinement_mse_overshoot;
float tune_2_partition_early_out_limit_factor;
float tune_3_partition_early_out_limit_factor;
float tune_2_plane_early_out_limit_correlation;
unsigned int tune_low_weight_count_limit;
float tune_mse_overshoot;
float tune_2partition_early_out_limit_factor;
float tune_3partition_early_out_limit_factor;
float tune_2plane_early_out_limit_correlation;
float tune_search_mode0_enable;
};
/**
* @brief The static quality presets that are built-in for high bandwidth
* presets (x < 25 texels per block).
* @brief The static presets for high bandwidth encodings (x < 25 texels per block).
*/
static const std::array<astcenc_preset_config, 5> preset_configs_high {{
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
{
ASTCENC_PRE_FASTEST,
2, 8, 42, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 25
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f
}, {
ASTCENC_PRE_FAST,
3, 12, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.65f, 20
3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f
}, {
ASTCENC_PRE_MEDIUM,
4, 26, 76, 3, 3 , 95.0f, 70.0f, 2.5f, 2.5f, 1.2f, 1.25f, 0.85f, 16
4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f
}, {
ASTCENC_PRE_THOROUGH,
4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 12
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f
}, {
ASTCENC_PRE_VERYTHOROUGH,
4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
}, {
ASTCENC_PRE_EXHAUSTIVE,
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
}
}};
/**
* @brief The static quality presets that are built-in for medium bandwidth
* presets (25 <= x < 64 texels per block).
* @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
*/
static const std::array<astcenc_preset_config, 5> preset_configs_mid {{
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
{
ASTCENC_PRE_FASTEST,
2, 8, 40, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
}, {
ASTCENC_PRE_FAST,
3, 12, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
}, {
ASTCENC_PRE_MEDIUM,
4, 26, 76, 3, 3, 95.0f, 70.0f, 3.0f, 3.0f, 1.2f, 1.25f, 0.75f, 14
3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f
}, {
ASTCENC_PRE_THOROUGH,
4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 10
4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f
}, {
ASTCENC_PRE_VERYTHOROUGH,
4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
}, {
ASTCENC_PRE_EXHAUSTIVE,
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
}
}};
/**
* @brief The static quality presets that are built-in for low bandwidth
* presets (64 <= x texels per block).
* @brief The static presets for low bandwidth encodings (64 <= x texels per block).
*/
static const std::array<astcenc_preset_config, 5> preset_configs_low {{
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
{
ASTCENC_PRE_FASTEST,
2, 6, 38, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
}, {
ASTCENC_PRE_FAST,
3, 10, 53, 3, 3, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
}, {
ASTCENC_PRE_MEDIUM,
3, 26, 76, 3, 3, 95.0f, 70.0f, 3.5f, 3.5f, 1.2f, 1.25f, 0.65f, 12
3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f
}, {
ASTCENC_PRE_THOROUGH,
4, 75, 92, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.85f, 10
4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f
}, {
ASTCENC_PRE_VERYTHOROUGH,
4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f
}, {
ASTCENC_PRE_EXHAUSTIVE,
4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f
}
}};
@@ -151,48 +158,6 @@ static astcenc_error validate_cpu_float()
return ASTCENC_SUCCESS;
}
/**
* @brief Validate CPU ISA support meets the requirements of this build of the library.
*
* Each library build is statically compiled for a particular set of CPU ISA features, such as the
* SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
* actually supports everything this build needs.
*
* @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
*/
static astcenc_error validate_cpu_isa()
{
// Each check below is compiled in only when the corresponding build-time
// macro says this build uses that ISA feature; a build with none of them
// enabled falls straight through to the success return.

// SSE4.1 (or later) SIMD was selected at build time
#if ASTCENC_SSE >= 41
if (!cpu_supports_sse41())
{
return ASTCENC_ERR_BAD_CPU_ISA;
}
#endif
// POPCNT instruction usage was selected at build time
#if ASTCENC_POPCNT >= 1
if (!cpu_supports_popcnt())
{
return ASTCENC_ERR_BAD_CPU_ISA;
}
#endif
// F16C instruction usage was selected at build time
#if ASTCENC_F16C >= 1
if (!cpu_supports_f16c())
{
return ASTCENC_ERR_BAD_CPU_ISA;
}
#endif
// AVX2 SIMD was selected at build time
#if ASTCENC_AVX >= 2
if (!cpu_supports_avx2())
{
return ASTCENC_ERR_BAD_CPU_ISA;
}
#endif
// Every feature this build was compiled for is reported as supported
return ASTCENC_SUCCESS;
}
/**
* @brief Validate config profile.
*
@@ -252,11 +217,13 @@ static astcenc_error validate_block_size(
/**
* @brief Validate flags.
*
* @param flags The flags to check.
* @param profile The profile to check.
* @param flags The flags to check.
*
* @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
*/
static astcenc_error validate_flags(
astcenc_profile profile,
unsigned int flags
) {
// Flags field must not contain any unknown flag bits
@@ -267,14 +234,21 @@ static astcenc_error validate_flags(
}
// Flags field must only contain at most a single map type
exMask = ASTCENC_FLG_MAP_MASK
| ASTCENC_FLG_MAP_NORMAL
exMask = ASTCENC_FLG_MAP_NORMAL
| ASTCENC_FLG_MAP_RGBM;
if (popcount(flags & exMask) > 1)
{
return ASTCENC_ERR_BAD_FLAGS;
}
// Decode_unorm8 must only be used with an LDR profile
bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
if (is_unorm8 && is_hdr)
{
return ASTCENC_ERR_BAD_DECODE_MODE;
}
return ASTCENC_SUCCESS;
}
@@ -400,7 +374,7 @@ static astcenc_error validate_config(
return status;
}
status = validate_flags(config.flags);
status = validate_flags(config.profile, config.flags);
if (status != ASTCENC_SUCCESS)
{
return status;
@@ -423,16 +397,20 @@ static astcenc_error validate_config(
config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
config.tune_partition_index_limit = astc::clamp(config.tune_partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
config.tune_mode0_mse_overshoot = astc::max(config.tune_mode0_mse_overshoot, 1.0f);
config.tune_refinement_mse_overshoot = astc::max(config.tune_refinement_mse_overshoot, 1.0f);
config.tune_2_partition_early_out_limit_factor = astc::max(config.tune_2_partition_early_out_limit_factor, 0.0f);
config.tune_3_partition_early_out_limit_factor = astc::max(config.tune_3_partition_early_out_limit_factor, 0.0f);
config.tune_2_plane_early_out_limit_correlation = astc::max(config.tune_2_plane_early_out_limit_correlation, 0.0f);
config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f);
config.tune_2partition_early_out_limit_factor = astc::max(config.tune_2partition_early_out_limit_factor, 0.0f);
config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f);
config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);
// Specifying a zero weight color component is not allowed; force to small value
float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
@@ -465,9 +443,15 @@ astcenc_error astcenc_config_init(
astcenc_config* configp
) {
astcenc_error status;
astcenc_config& config = *configp;
status = validate_cpu_float();
if (status != ASTCENC_SUCCESS)
{
return status;
}
// Zero init all config fields, although most of them will be overwritten
astcenc_config& config = *configp;
std::memset(&config, 0, sizeof(config));
// Process the block size
@@ -494,7 +478,7 @@ astcenc_error astcenc_config_init(
return ASTCENC_ERR_BAD_QUALITY;
}
static const std::array<astcenc_preset_config, 5>* preset_configs;
static const std::array<astcenc_preset_config, 6>* preset_configs;
int texels_int = block_x * block_y * block_z;
if (texels_int < 25)
{
@@ -526,21 +510,24 @@ astcenc_error astcenc_config_init(
if (start == end)
{
config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
config.tune_partition_index_limit = (*preset_configs)[start].tune_partition_index_limit;
config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit,
TUNE_MAX_TRIAL_CANDIDATES);
config.tune_candidate_limit = (*preset_configs)[start].tune_candidate_limit;
config.tune_2partitioning_candidate_limit = (*preset_configs)[start].tune_2partitioning_candidate_limit;
config.tune_3partitioning_candidate_limit = (*preset_configs)[start].tune_3partitioning_candidate_limit;
config.tune_4partitioning_candidate_limit = (*preset_configs)[start].tune_4partitioning_candidate_limit;
config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
(*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
config.tune_mode0_mse_overshoot = (*preset_configs)[start].tune_mode0_mse_overshoot;
config.tune_refinement_mse_overshoot = (*preset_configs)[start].tune_refinement_mse_overshoot;
config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;
config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor;
config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation;
config.tune_low_weight_count_limit = (*preset_configs)[start].tune_low_weight_count_limit;
config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor;
config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
}
// Start and end node are not the same - so interpolate between them
else
@@ -562,21 +549,24 @@ astcenc_error astcenc_config_init(
#define LERPUI(param) static_cast<unsigned int>(LERPI(param))
config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
config.tune_partition_index_limit = LERPI(tune_partition_index_limit);
config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
config.tune_refinement_limit = LERPI(tune_refinement_limit);
config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
TUNE_MAX_TRIAL_CANDIDATES);
config.tune_candidate_limit = LERPUI(tune_candidate_limit);
config.tune_2partitioning_candidate_limit = LERPUI(tune_2partitioning_candidate_limit);
config.tune_3partitioning_candidate_limit = LERPUI(tune_3partitioning_candidate_limit);
config.tune_4partitioning_candidate_limit = LERPUI(tune_4partitioning_candidate_limit);
config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
LERP(tune_db_limit_b_base) - 19 * ltexels);
config.tune_mode0_mse_overshoot = LERP(tune_mode0_mse_overshoot);
config.tune_refinement_mse_overshoot = LERP(tune_refinement_mse_overshoot);
config.tune_mse_overshoot = LERP(tune_mse_overshoot);
config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation);
config.tune_low_weight_count_limit = LERPI(tune_low_weight_count_limit);
config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor);
config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
#undef LERP
#undef LERPI
#undef LERPUI
@@ -604,13 +594,14 @@ astcenc_error astcenc_config_init(
case ASTCENC_PRF_HDR_RGB_LDR_A:
case ASTCENC_PRF_HDR:
config.tune_db_limit = 999.0f;
config.tune_search_mode0_enable = 0.0f;
break;
default:
return ASTCENC_ERR_BAD_PROFILE;
}
// Flags field must not contain any unknown flag bits
status = validate_flags(flags);
status = validate_flags(profile, flags);
if (status != ASTCENC_SUCCESS)
{
return status;
@@ -625,20 +616,14 @@ astcenc_error astcenc_config_init(
config.cw_g_weight = 0.0f;
config.cw_b_weight = 0.0f;
config.tune_2_partition_early_out_limit_factor *= 1.5f;
config.tune_3_partition_early_out_limit_factor *= 1.5f;
config.tune_2_plane_early_out_limit_correlation = 0.99f;
config.tune_2partition_early_out_limit_factor *= 1.5f;
config.tune_3partition_early_out_limit_factor *= 1.5f;
config.tune_2plane_early_out_limit_correlation = 0.99f;
// Normals are prone to blocking artifacts on smooth curves
// so force compressor to try harder here ...
config.tune_db_limit *= 1.03f;
}
else if (flags & ASTCENC_FLG_MAP_MASK)
{
// Masks are prone to blocking artifacts on mask edges
// so force compressor to try harder here ...
config.tune_db_limit *= 1.03f;
}
else if (flags & ASTCENC_FLG_MAP_RGBM)
{
config.rgbm_m_scale = 5.0f;
@@ -655,7 +640,7 @@ astcenc_error astcenc_config_init(
//
// ... but we scale these up to keep a better balance between color and alpha. Note
// that if the content is using alpha we'd recommend using the -a option to weight
// the color conribution by the alpha transparency.
// the color contribution by the alpha transparency.
if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
{
config.cw_r_weight = 0.30f * 2.25f;
@@ -683,12 +668,6 @@ astcenc_error astcenc_context_alloc(
return status;
}
status = validate_cpu_isa();
if (status != ASTCENC_SUCCESS)
{
return status;
}
if (thread_count == 0)
{
return ASTCENC_ERR_BAD_PARAM;
@@ -702,7 +681,8 @@ astcenc_error astcenc_context_alloc(
}
#endif
astcenc_context* ctx = new astcenc_context;
astcenc_context* ctxo = new astcenc_context;
astcenc_contexti* ctx = &ctxo->context;
ctx->thread_count = thread_count;
ctx->config = config;
ctx->working_buffers = nullptr;
@@ -714,12 +694,18 @@ astcenc_error astcenc_context_alloc(
status = validate_config(ctx->config);
if (status != ASTCENC_SUCCESS)
{
delete ctx;
delete ctxo;
return status;
}
ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
bool can_omit_modes = config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
if (!ctx->bsd)
{
delete ctxo;
return ASTCENC_ERR_OUT_OF_MEM;
}
bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
can_omit_modes,
config.tune_partition_count_limit,
@@ -728,7 +714,7 @@ astcenc_error astcenc_context_alloc(
#if !defined(ASTCENC_DECOMPRESS_ONLY)
// Do setup only needed by compression
if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
{
// Turn a dB limit into a per-texel error for faster use later
if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
@@ -742,12 +728,12 @@ astcenc_error astcenc_context_alloc(
size_t worksize = sizeof(compression_working_buffers) * thread_count;
ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
"compression_working_buffers size must be multiple of vector alignment");
if (!ctx->working_buffers)
{
aligned_free<block_size_descriptor>(ctx->bsd);
delete ctx;
delete ctxo;
*context = nullptr;
return ASTCENC_ERR_OUT_OF_MEM;
}
@@ -766,7 +752,7 @@ astcenc_error astcenc_context_alloc(
trace_add_data("block_z", config.block_z);
#endif
*context = ctx;
*context = ctxo;
#if !defined(ASTCENC_DECOMPRESS_ONLY)
prepare_angular_tables();
@@ -777,16 +763,17 @@ astcenc_error astcenc_context_alloc(
/* See header for documentation. */
void astcenc_context_free(
astcenc_context* ctx
astcenc_context* ctxo
) {
if (ctx)
if (ctxo)
{
astcenc_contexti* ctx = &ctxo->context;
aligned_free<compression_working_buffers>(ctx->working_buffers);
aligned_free<block_size_descriptor>(ctx->bsd);
#if defined(ASTCENC_DIAGNOSTICS)
delete ctx->trace_log;
#endif
delete ctx;
delete ctxo;
}
}
@@ -795,14 +782,14 @@ void astcenc_context_free(
/**
* @brief Compress an image, after any preflight has completed.
*
* @param[out] ctx The compressor context.
* @param[out] ctxo The compressor context.
* @param thread_index The thread index.
* @param image The input image.
* @param swizzle The input swizzle.
* @param[out] buffer The output array for the compressed data.
*/
static void compress_image(
astcenc_context& ctx,
astcenc_context& ctxo,
unsigned int thread_index,
const astcenc_image& image,
const astcenc_swizzle& swizzle,
@@ -814,6 +801,7 @@ static void compress_image(
uint8_t* buffer
#endif
) {
astcenc_contexti& ctx = ctxo.context;
const block_size_descriptor& bsd = *ctx.bsd;
astcenc_profile decode_mode = ctx.config.profile;
@@ -822,7 +810,7 @@ static void compress_image(
int block_x = bsd.xdim;
int block_y = bsd.ydim;
int block_z = bsd.zdim;
blk.texel_count = block_x * block_y * block_z;
blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
int dim_x = image.dim_x;
int dim_y = image.dim_y;
@@ -836,6 +824,8 @@ static void compress_image(
int row_blocks = xblocks;
int plane_blocks = xblocks * yblocks;
blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
// Populate the block channel weights
blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
ctx.config.cw_g_weight,
@@ -846,8 +836,7 @@ static void compress_image(
auto& temp_buffers = ctx.working_buffers[thread_index];
// Only the first thread actually runs the initializer
ctx.manage_compress.init(block_count);
ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
// Determine if we can use an optimized load function
bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
@@ -859,17 +848,17 @@ static void compress_image(
bool use_fast_load = !needs_swz && !needs_hdr &&
block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
auto load_func = fetch_image_block;
auto load_func = load_image_block;
if (use_fast_load)
{
load_func = fetch_image_block_fast_ldr;
load_func = load_image_block_fast_ldr;
}
// All threads run this processing loop until there is no work remaining
while (true)
{
unsigned int count;
unsigned int base = ctx.manage_compress.get_task_assignment(16, count);
unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
if (!count)
{
break;
@@ -924,6 +913,18 @@ static void compress_image(
if (use_full_block)
{
load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
// Scale RGB error contribution by the maximum alpha in the block
// This encourages preserving alpha accuracy in regions with high
// transparency, and can buy up to 0.5 dB PSNR.
if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
{
float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
ctx.config.cw_g_weight * alpha_scale,
ctx.config.cw_b_weight * alpha_scale,
ctx.config.cw_a_weight);
}
}
// Apply alpha scale RDO - substitute constant color block
else
@@ -937,31 +938,92 @@ static void compress_image(
int offset = ((z * yblocks + y) * xblocks + x) * 16;
uint8_t *bp = buffer + offset;
physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp);
#if QUALITY_CONTROL
int32_t *mseBlock[RGBA_COM] = {nullptr, nullptr, nullptr, nullptr};
if (calQualityEnable) {
int offset = (z * yblocks + y) * xblocks + x;
offset = (z * yblocks + y) * xblocks + x;
mseBlock[R_COM] = mse[R_COM] + offset;
mseBlock[G_COM] = mse[G_COM] + offset;
mseBlock[B_COM] = mse[B_COM] + offset;
mseBlock[A_COM] = mse[A_COM] + offset;
}
compress_block(ctx, blk, *pcb, temp_buffers, calQualityEnable, mseBlock);
compress_block(ctx, blk, bp, temp_buffers, calQualityEnable, mseBlock);
#else
compress_block(ctx, blk, *pcb, temp_buffers);
compress_block(ctx, blk, bp, temp_buffers);
#endif
}
ctx.manage_compress.complete_task_assignment(count);
ctxo.manage_compress.complete_task_assignment(count);
}
}
/**
 * @brief Compute regional averages in an image.
 *
 * This function can be called by multiple threads, but only after a single
 * thread calls the setup function @c init_compute_averages().
 *
 * Results are written back into @c img->input_alpha_averages.
 *
 * Each task index handed out by the task manager is decomposed into a Z slab
 * index (@c i / y_tasks) and a Y row index (the remainder). The thread that
 * owns a task processes every X region in that row.
 *
 * @param[out] ctx   The context.
 * @param      ag    The average and variance arguments created during setup.
 */
static void compute_averages(
	astcenc_context& ctx,
	const avg_args &ag
) {
	// Work on a private copy of the region arguments, with scratch memory
	// allocated by (and released by) this call
	pixel_region_args arg = ag.arg;
	arg.work_memory = new vfloat4[ag.work_memory_size];

	int size_x = ag.img_size_x;
	int size_y = ag.img_size_y;
	int size_z = ag.img_size_z;

	int step_xy = ag.blk_size_xy;
	int step_z = ag.blk_size_z;

	// Number of row tasks needed to cover the Y axis, rounding up
	int y_tasks = (size_y + step_xy - 1) / step_xy;

	// All threads run this processing loop until there is no work remaining
	while (true)
	{
		unsigned int count;
		unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
		if (!count)
		{
			break;
		}

		for (unsigned int i = base; i < base + count; i++)
		{
			// Split the linear task index into slab and row indices BEFORE
			// scaling to texel coordinates. Deriving the row from the scaled
			// Z coordinate gives a negative Y offset whenever step_z > 1 and
			// the slab index is non-zero.
			int z_task = static_cast<int>(i) / y_tasks;
			int y_task = static_cast<int>(i) - (z_task * y_tasks);

			int z = z_task * step_z;
			int y = y_task * step_xy;

			// Clamp the region size at the far edges of the image
			arg.size_z = astc::min(step_z, size_z - z);
			arg.offset_z = z;

			arg.size_y = astc::min(step_xy, size_y - y);
			arg.offset_y = y;

			for (int x = 0; x < size_x; x += step_xy)
			{
				arg.size_x = astc::min(step_xy, size_x - x);
				arg.offset_x = x;
				compute_pixel_region_variance(ctx.context, arg);
			}
		}

		ctx.manage_avg.complete_task_assignment(count);
	}

	delete[] arg.work_memory;
}
#endif
/* See header for documentation. */
astcenc_error astcenc_compress_image(
astcenc_context* ctx,
astcenc_context* ctxo,
astcenc_image* imagep,
const astcenc_swizzle* swizzle,
uint8_t* data_out,
@@ -973,7 +1035,7 @@ astcenc_error astcenc_compress_image(
unsigned int thread_index
) {
#if defined(ASTCENC_DECOMPRESS_ONLY)
(void)ctx;
(void)ctxo;
(void)imagep;
(void)swizzle;
(void)data_out;
@@ -981,6 +1043,7 @@ astcenc_error astcenc_compress_image(
(void)thread_index;
return ASTCENC_ERR_BAD_CONTEXT;
#else
astcenc_contexti* ctx = &ctxo->context;
astcenc_error status;
astcenc_image& image = *imagep;
@@ -1018,7 +1081,7 @@ astcenc_error astcenc_compress_image(
// If context thread count is one then implicitly reset
if (ctx->thread_count == 1)
{
astcenc_compress_reset(ctx);
astcenc_compress_reset(ctxo);
}
if (ctx->config.a_scale_radius != 0)
@@ -1036,21 +1099,21 @@ astcenc_error astcenc_compress_image(
};
// Only the first thread actually runs the initializer
ctx->manage_avg.init(init_avg);
ctxo->manage_avg.init(init_avg);
// All threads will enter this function and dynamically grab work
compute_averages(*ctx, ctx->avg_preprocess_args);
compute_averages(*ctxo, ctx->avg_preprocess_args);
}
// Wait for compute_averages to complete before compressing
ctx->manage_avg.wait();
ctxo->manage_avg.wait();
#if QUALITY_CONTROL
compress_image(*ctx, thread_index, image, *swizzle, data_out, calQualityEnable, mse);
compress_image(*ctxo, thread_index, image, *swizzle, data_out, calQualityEnable, mse);
#else
compress_image(*ctx, thread_index, image, *swizzle, data_out);
compress_image(*ctxo, thread_index, image, *swizzle, data_out);
#endif
// Wait for compress to complete before freeing memory
ctx->manage_compress.wait();
ctxo->manage_compress.wait();
auto term_compress = [ctx]() {
delete[] ctx->input_alpha_averages;
@@ -1058,7 +1121,7 @@ astcenc_error astcenc_compress_image(
};
// Only the first thread to arrive actually runs the term
ctx->manage_compress.term(term_compress);
ctxo->manage_compress.term(term_compress);
return ASTCENC_SUCCESS;
#endif
@@ -1066,26 +1129,27 @@ astcenc_error astcenc_compress_image(
/* See header for documentation. */
astcenc_error astcenc_compress_reset(
astcenc_context* ctx
astcenc_context* ctxo
) {
#if defined(ASTCENC_DECOMPRESS_ONLY)
(void)ctx;
(void)ctxo;
return ASTCENC_ERR_BAD_CONTEXT;
#else
astcenc_contexti* ctx = &ctxo->context;
if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
{
return ASTCENC_ERR_BAD_CONTEXT;
}
ctx->manage_avg.reset();
ctx->manage_compress.reset();
ctxo->manage_avg.reset();
ctxo->manage_compress.reset();
return ASTCENC_SUCCESS;
#endif
}
/* See header for documentation. */
astcenc_error astcenc_decompress_image(
astcenc_context* ctx,
astcenc_context* ctxo,
const uint8_t* data,
size_t data_len,
astcenc_image* image_outp,
@@ -1094,6 +1158,7 @@ astcenc_error astcenc_decompress_image(
) {
astcenc_error status;
astcenc_image& image_out = *image_outp;
astcenc_contexti* ctx = &ctxo->context;
// Today this doesn't matter (working set on stack) but might in future ...
if (thread_index >= ctx->thread_count)
@@ -1114,6 +1179,7 @@ astcenc_error astcenc_decompress_image(
unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
unsigned int block_count = zblocks * yblocks * xblocks;
int row_blocks = xblocks;
int plane_blocks = xblocks * yblocks;
@@ -1126,22 +1192,25 @@ astcenc_error astcenc_decompress_image(
}
image_block blk;
blk.texel_count = block_x * block_y * block_z;
blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
// Decode mode inferred from the output data type
blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
// If context thread count is one then implicitly reset
if (ctx->thread_count == 1)
{
astcenc_decompress_reset(ctx);
astcenc_decompress_reset(ctxo);
}
// Only the first thread actually runs the initializer
ctx->manage_decompress.init(zblocks * yblocks * xblocks);
ctxo->manage_decompress.init(block_count, nullptr);
// All threads run this processing loop until there is no work remaining
while (true)
{
unsigned int count;
unsigned int base = ctx->manage_decompress.get_task_assignment(128, count);
unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
if (!count)
{
break;
@@ -1158,20 +1227,19 @@ astcenc_error astcenc_decompress_image(
unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
const uint8_t* bp = data + offset;
const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp);
symbolic_compressed_block scb;
physical_to_symbolic(*ctx->bsd, pcb, scb);
physical_to_symbolic(*ctx->bsd, bp, scb);
decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
x * block_x, y * block_y, z * block_z,
scb, blk);
write_image_block(image_out, blk, *ctx->bsd,
store_image_block(image_out, blk, *ctx->bsd,
x * block_x, y * block_y, z * block_z, *swizzle);
}
ctx->manage_decompress.complete_task_assignment(count);
ctxo->manage_decompress.complete_task_assignment(count);
}
return ASTCENC_SUCCESS;
@@ -1179,28 +1247,29 @@ astcenc_error astcenc_decompress_image(
/* See header for documentation. */
astcenc_error astcenc_decompress_reset(
astcenc_context* ctx
astcenc_context* ctxo
) {
ctx->manage_decompress.reset();
ctxo->manage_decompress.reset();
return ASTCENC_SUCCESS;
}
/* See header for documentation. */
astcenc_error astcenc_get_block_info(
astcenc_context* ctx,
astcenc_context* ctxo,
const uint8_t data[16],
astcenc_block_info* info
) {
#if defined(ASTCENC_DECOMPRESS_ONLY)
(void)ctx;
(void)ctxo;
(void)data;
(void)info;
return ASTCENC_ERR_BAD_CONTEXT;
#else
astcenc_contexti* ctx = &ctxo->context;
// Decode the compressed data into a symbolic form
const physical_compressed_block&pcb = *reinterpret_cast<const physical_compressed_block*>(data);
symbolic_compressed_block scb;
physical_to_symbolic(*ctx->bsd, pcb, scb);
physical_to_symbolic(*ctx->bsd, data, scb);
// Fetch the appropriate partition and decimation tables
block_size_descriptor& bsd = *ctx->bsd;
@@ -1260,7 +1329,6 @@ astcenc_error astcenc_get_block_info(
unpack_color_endpoints(ctx->config.profile,
scb.color_formats[i],
scb.get_color_quant_mode(),
scb.color_values[i],
rgb_hdr, a_hdr,
endpnt[0], endpnt[1]);
@@ -1284,7 +1352,7 @@ astcenc_error astcenc_get_block_info(
int weight_plane1[BLOCK_MAX_TEXELS];
int weight_plane2[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, bm.is_dual_plane, bm.get_weight_quant_mode(), weight_plane1, weight_plane2);
unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
for (unsigned int i = 0; i < bsd.texel_count; i++)
{
info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
@@ -1318,8 +1386,6 @@ const char* astcenc_get_error_string(
return "ASTCENC_ERR_OUT_OF_MEM";
case ASTCENC_ERR_BAD_CPU_FLOAT:
return "ASTCENC_ERR_BAD_CPU_FLOAT";
case ASTCENC_ERR_BAD_CPU_ISA:
return "ASTCENC_ERR_BAD_CPU_ISA";
case ASTCENC_ERR_BAD_PARAM:
return "ASTCENC_ERR_BAD_PARAM";
case ASTCENC_ERR_BAD_BLOCK_SIZE:
@@ -1336,6 +1402,8 @@ const char* astcenc_get_error_string(
return "ASTCENC_ERR_BAD_CONTEXT";
case ASTCENC_ERR_NOT_IMPLEMENTED:
return "ASTCENC_ERR_NOT_IMPLEMENTED";
case ASTCENC_ERR_BAD_DECODE_MODE:
return "ASTCENC_ERR_BAD_DECODE_MODE";
#if defined(ASTCENC_DIAGNOSTICS)
case ASTCENC_ERR_DTRACE_FAILURE:
return "ASTCENC_ERR_DTRACE_FAILURE";
+149 -106
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -46,15 +46,16 @@
* lines for endpoint selection.
*/
#include <limits>
#include "astcenc_internal.h"
/**
* @brief Pick some initital kmeans cluster centers.
* @brief Pick some initial kmeans cluster centers.
*
* @param blk The image block color data to compress.
* @param texel_count The number of texels in the block.
* @param partition_count The number of partitions in the block.
* @param[out] cluster_centers The initital partition cluster center colors.
* @param[out] cluster_centers The initial partition cluster center colors.
*/
static void kmeans_init(
const image_block& blk,
@@ -249,13 +250,16 @@ static void kmeans_update(
*
* @return The number of bit mismatches.
*/
static inline unsigned int partition_mismatch2(
static inline uint8_t partition_mismatch2(
const uint64_t a[2],
const uint64_t b[2]
) {
int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
return astc::min(v1, v2);
// Divide by 2 because XOR always counts errors twice, once when missing
// in the expected position, and again when present in the wrong partition
return static_cast<uint8_t>(astc::min(v1, v2) / 2);
}
/**
@@ -266,7 +270,7 @@ static inline unsigned int partition_mismatch2(
*
* @return The number of bit mismatches.
*/
static inline unsigned int partition_mismatch3(
static inline uint8_t partition_mismatch3(
const uint64_t a[3],
const uint64_t b[3]
) {
@@ -294,7 +298,9 @@ static inline unsigned int partition_mismatch3(
int s5 = p11 + p20;
int v2 = astc::min(s4, s5) + p02;
return astc::min(v0, v1, v2);
// Divide by 2 because XOR always counts errors twice, once when missing
// in the expected position, and again when present in the wrong partition
return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);
}
/**
@@ -305,7 +311,7 @@ static inline unsigned int partition_mismatch3(
*
* @return The number of bit mismatches.
*/
static inline unsigned int partition_mismatch4(
static inline uint8_t partition_mismatch4(
const uint64_t a[4],
const uint64_t b[4]
) {
@@ -341,7 +347,9 @@ static inline unsigned int partition_mismatch4(
int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
return astc::min(v0, v1, v2, v3);
// Divide by 2 because XOR always counts errors twice, once when missing
// in the expected position, and again when present in the wrong partition
return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);
}
using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
@@ -358,32 +366,36 @@ static void count_partition_mismatch_bits(
const block_size_descriptor& bsd,
unsigned int partition_count,
const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]
uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
) {
unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
promise(active_count > 0);
if (partition_count == 2)
{
for (unsigned int i = 0; i < active_count; i++)
{
int bitcount = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
mismatch_counts[i] = astc::max(bitcount, static_cast<int>(bsd.partitioning_valid_2[i]));
mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
assert(mismatch_counts[i] < bsd.texel_count);
}
}
else if (partition_count == 3)
{
for (unsigned int i = 0; i < active_count; i++)
{
int bitcount = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
mismatch_counts[i] = astc::max(bitcount, static_cast<int>(bsd.partitioning_valid_3[i]));
mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
assert(mismatch_counts[i] < bsd.texel_count);
}
}
else
{
for (unsigned int i = 0; i < active_count; i++)
{
int bitcount = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
mismatch_counts[i] = astc::max(bitcount, static_cast<int>(bsd.partitioning_valid_4[i]));
mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
assert(mismatch_counts[i] < bsd.texel_count);
}
}
}
@@ -398,11 +410,13 @@ static void count_partition_mismatch_bits(
* @return The number of active partitions in this selection.
*/
static unsigned int get_partition_ordering_by_mismatch_bits(
unsigned int texel_count,
unsigned int partitioning_count,
const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS],
unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
) {
unsigned int mscount[256] { 0 };
promise(partitioning_count > 0);
uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };
// Create the histogram of mismatch counts
for (unsigned int i = 0; i < partitioning_count; i++)
@@ -410,16 +424,14 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
mscount[mismatch_count[i]]++;
}
unsigned int active_count = partitioning_count - mscount[255];
// Create a running sum from the histogram array
// Cells store previous values only; i.e. exclude self after sum
unsigned int summa = 0;
for (unsigned int i = 0; i < 256; i++)
unsigned int sum = 0;
for (unsigned int i = 0; i < texel_count; i++)
{
unsigned int cnt = mscount[i];
mscount[i] = summa;
summa += cnt;
uint16_t cnt = mscount[i];
mscount[i] = sum;
sum += cnt;
}
// Use the running sum as the index, incrementing after read to allow
@@ -427,10 +439,10 @@ static unsigned int get_partition_ordering_by_mismatch_bits(
for (unsigned int i = 0; i < partitioning_count; i++)
{
unsigned int idx = mscount[mismatch_count[i]]++;
partition_ordering[idx] = i;
partition_ordering[idx] = static_cast<uint16_t>(i);
}
return active_count;
return partitioning_count;
}
/**
@@ -447,7 +459,7 @@ static unsigned int compute_kmeans_partition_ordering(
const block_size_descriptor& bsd,
const image_block& blk,
unsigned int partition_count,
unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
) {
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
uint8_t texel_partitions[BLOCK_MAX_TEXELS];
@@ -478,22 +490,71 @@ static unsigned int compute_kmeans_partition_ordering(
}
// Count the mismatch between the block and the format's partition tables
unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS];
uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
// Sort the partitions based on the number of mismatched bits
return get_partition_ordering_by_mismatch_bits(
texels_to_process,
bsd.partitioning_count_selected[partition_count - 1],
mismatch_counts, partition_ordering);
}
/**
 * @brief Insert a partitioning into an ordered list of results, sorted by error.
 *
 * The result arrays hold @c max_values entries in ascending error order. A new
 * entry that is not strictly better than the current worst entry is dropped;
 * otherwise it is placed ahead of the first stored entry that is not strictly
 * better than it, and the current worst entry falls off the end.
 *
 * @param      max_values        The max number of entries in the best result arrays.
 * @param      this_error        The error of the new entry.
 * @param      this_partition    The partition ID of the new entry.
 * @param[out] best_errors       The array of best error values.
 * @param[out] best_partitions   The array of best partition values.
 */
static void insert_result(
	unsigned int max_values,
	float this_error,
	unsigned int this_partition,
	float* best_errors,
	unsigned int* best_partitions)
{
	promise(max_values > 0);

	// Nothing to do if the new entry cannot displace the current worst one
	unsigned int last = max_values - 1;
	if (this_error >= best_errors[last])
	{
		return;
	}

	// Step past every stored entry that is strictly better than the new one
	unsigned int slot = 0;
	while (slot < max_values && this_error > best_errors[slot])
	{
		slot++;
	}

	// Every stored entry compared as better, so there is no slot to use
	if (slot == max_values)
	{
		return;
	}

	// Shuffle the tail down one place, dropping the current worst entry
	for (unsigned int k = last; k > slot; k--)
	{
		best_errors[k] = best_errors[k - 1];
		best_partitions[k] = best_partitions[k - 1];
	}

	// Store the new entry in the gap
	best_errors[slot] = this_error;
	best_partitions[slot] = this_partition;
}
/* See header for documentation. */
void find_best_partition_candidates(
unsigned int find_best_partition_candidates(
const block_size_descriptor& bsd,
const image_block& blk,
unsigned int partition_count,
unsigned int partition_search_limit,
unsigned int best_partitions[2]
unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
unsigned int requested_candidates
) {
// Constant used to estimate quantization error for a given partitioning; the optimal value for
// this depends on bitrate. These values have been determined empirically.
@@ -517,20 +578,26 @@ void find_best_partition_candidates(
weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
partition_search_limit = astc::min(partition_search_limit, sequence_len);
requested_candidates = astc::min(partition_search_limit, requested_candidates);
bool uses_alpha = !blk.is_constant_channel(3);
// Partitioning errors assuming uncorrelated-chrominance endpoints
float uncor_best_error { ERROR_CALC_DEFAULT };
unsigned int uncor_best_partition { 0 };
float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
// Partitioning errors assuming same-chrominance endpoints
// Store two so we can always return one different to uncorr
float samec_best_errors[2] { ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT };
unsigned int samec_best_partitions[2] { 0, 0 };
float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
for (unsigned int i = 0; i < requested_candidates; i++)
{
uncor_best_errors[i] = ERROR_CALC_DEFAULT;
samec_best_errors[i] = ERROR_CALC_DEFAULT;
}
if (uses_alpha)
{
@@ -550,8 +617,7 @@ void find_best_partition_candidates(
processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
float uncor_line_lens[BLOCK_MAX_PARTITIONS];
float samec_line_lens[BLOCK_MAX_PARTITIONS];
float line_lengths[BLOCK_MAX_PARTITIONS];
for (unsigned int j = 0; j < partition_count; j++)
{
@@ -561,13 +627,13 @@ void find_best_partition_candidates(
uncor_lines[j].b = normalize_safe(pm.dir, unit4());
uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
uncor_plines[j].bs = uncor_lines[j].b;
uncor_plines[j].bs = uncor_lines[j].b;
samec_lines[j].a = vfloat4::zero();
samec_lines[j].b = normalize_safe(pm.avg, unit4());
samec_plines[j].amod = vfloat4::zero();
samec_plines[j].bs = samec_lines[j].b;
samec_plines[j].bs = samec_lines[j].b;
}
float uncor_error = 0.0f;
@@ -577,8 +643,7 @@ void find_best_partition_candidates(
blk,
uncor_plines,
samec_plines,
uncor_line_lens,
samec_line_lens,
line_lengths,
uncor_error,
samec_error);
@@ -597,32 +662,15 @@ void find_best_partition_candidates(
float tpp = static_cast<float>(pi.partition_texel_count[j]);
vfloat4 error_weights(tpp * weight_imprecision_estim);
vfloat4 uncor_vector = uncor_lines[j].b * uncor_line_lens[j];
vfloat4 samec_vector = samec_lines[j].b * samec_line_lens[j];
vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
samec_error += dot_s(samec_vector * samec_vector, error_weights);
}
if (uncor_error < uncor_best_error)
{
uncor_best_error = uncor_error;
uncor_best_partition = partition;
}
if (samec_error < samec_best_errors[0])
{
samec_best_errors[1] = samec_best_errors[0];
samec_best_partitions[1] = samec_best_partitions[0];
samec_best_errors[0] = samec_error;
samec_best_partitions[0] = partition;
}
else if (samec_error < samec_best_errors[1])
{
samec_best_errors[1] = samec_error;
samec_best_partitions[1] = partition;
}
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
}
}
else
@@ -644,10 +692,10 @@ void find_best_partition_candidates(
partition_lines3& pl = plines[j];
pl.uncor_line.a = pm.avg;
pl.uncor_line.b = normalize_safe(pm.dir.swz<0, 1, 2>(), unit3());
pl.uncor_line.b = normalize_safe(pm.dir, unit3());
pl.samec_line.a = vfloat4::zero();
pl.samec_line.b = normalize_safe(pm.avg.swz<0, 1, 2>(), unit3());
pl.samec_line.b = normalize_safe(pm.avg, unit3());
pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
pl.uncor_pline.bs = pl.uncor_line.b;
@@ -682,57 +730,52 @@ void find_best_partition_candidates(
float tpp = static_cast<float>(pi.partition_texel_count[j]);
vfloat4 error_weights(tpp * weight_imprecision_estim);
vfloat4 uncor_vector = pl.uncor_line.b * pl.uncor_line_len;
vfloat4 samec_vector = pl.samec_line.b * pl.samec_line_len;
vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
samec_error += dot3_s(samec_vector * samec_vector, error_weights);
}
if (uncor_error < uncor_best_error)
{
uncor_best_error = uncor_error;
uncor_best_partition = partition;
}
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
}
}
if (samec_error < samec_best_errors[0])
{
samec_best_errors[1] = samec_best_errors[0];
samec_best_partitions[1] = samec_best_partitions[0];
unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
for (unsigned int i = 0; i < requested_candidates; i++)
{
interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
}
samec_best_errors[0] = samec_error;
samec_best_partitions[0] = partition;
}
else if (samec_error < samec_best_errors[1])
uint64_t bitmasks[1024/64] { 0 };
unsigned int emitted = 0;
// Deduplicate the first "requested" entries
for (unsigned int i = 0; i < requested_candidates * 2; i++)
{
unsigned int partition = interleave[i];
unsigned int word = partition / 64;
unsigned int bit = partition % 64;
bool written = bitmasks[word] & (1ull << bit);
if (!written)
{
best_partitions[emitted] = partition;
bitmasks[word] |= 1ull << bit;
emitted++;
if (emitted == requested_candidates)
{
samec_best_errors[1] = samec_error;
samec_best_partitions[1] = partition;
break;
}
}
}
// Same partition is best for both, so use this first unconditionally
if (uncor_best_partition == samec_best_partitions[0])
{
best_partitions[0] = samec_best_partitions[0];
best_partitions[1] = samec_best_partitions[1];
}
// Uncor is best
else if (uncor_best_error <= samec_best_errors[0])
{
best_partitions[0] = uncor_best_partition;
best_partitions[1] = samec_best_partitions[0];
}
// Samec is best
else
{
best_partitions[0] = samec_best_partitions[0];
best_partitions[1] = uncor_best_partition;
}
// Convert these back into canonical partition IDs for the rest of the codec
best_partitions[0] = bsd.get_raw_partition_info(partition_count, best_partitions[0]).partition_index;
best_partitions[1] = bsd.get_raw_partition_info(partition_count, best_partitions[1]).partition_index;
return emitted;
}
#endif
+238 -134
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -26,6 +26,76 @@
#include "astcenc_internal.h"
#include "astcenc_vecmathlib.h"
/**
 * @brief Compute the infilled weight for N texel indices in a decimated grid.
 *
 * Four decimated weights are gathered per texel and blended using the
 * per-texel contribution factors stored in @c di.
 *
 * @param di        The weight grid decimation to use.
 * @param weights   The decimated weight values to use.
 * @param index     The first texel index to interpolate.
 *
 * @return The interpolated weight for the given set of SIMD_WIDTH texels.
 */
static vfloat bilinear_infill_vla(
	const decimation_info& di,
	const float* weights,
	unsigned int index
) {
	// First filter tap: grid index, gathered weight, contribution factor
	vint idx0 = vint(di.texel_weights_tr[0] + index);
	vfloat val0 = gatherf(weights, idx0);
	vfloat contrib0 = loada(di.texel_weight_contribs_float_tr[0] + index);

	// Second filter tap
	vint idx1 = vint(di.texel_weights_tr[1] + index);
	vfloat val1 = gatherf(weights, idx1);
	vfloat contrib1 = loada(di.texel_weight_contribs_float_tr[1] + index);

	// Third filter tap
	vint idx2 = vint(di.texel_weights_tr[2] + index);
	vfloat val2 = gatherf(weights, idx2);
	vfloat contrib2 = loada(di.texel_weight_contribs_float_tr[2] + index);

	// Fourth filter tap
	vint idx3 = vint(di.texel_weights_tr[3] + index);
	vfloat val3 = gatherf(weights, idx3);
	vfloat contrib3 = loada(di.texel_weight_contribs_float_tr[3] + index);

	// Sum the taps pairwise, then combine the two partial sums, to generate
	// the per-texel weight
	vfloat sum01 = val0 * contrib0 + val1 * contrib1;
	vfloat sum23 = val2 * contrib2 + val3 * contrib3;
	return sum01 + sum23;
}
/**
 * @brief Compute the infilled weight for N texel indices in a decimated grid.
 *
 * This is a specialized version which computes only two weights per texel for
 * encodings that are only decimated in a single axis.
 *
 * @param di        The weight grid decimation to use.
 * @param weights   The decimated weight values to use.
 * @param index     The first texel index to interpolate.
 *
 * @return The interpolated weight for the given set of SIMD_WIDTH texels.
 */
static vfloat bilinear_infill_vla_2(
	const decimation_info& di,
	const float* weights,
	unsigned int index
) {
	// First filter tap: grid index, gathered weight, contribution factor
	vint idx0 = vint(di.texel_weights_tr[0] + index);
	vfloat val0 = gatherf(weights, idx0);
	vfloat contrib0 = loada(di.texel_weight_contribs_float_tr[0] + index);

	// Second filter tap
	vint idx1 = vint(di.texel_weights_tr[1] + index);
	vfloat val1 = gatherf(weights, idx1);
	vfloat contrib1 = loada(di.texel_weight_contribs_float_tr[1] + index);

	// Blend the two taps to generate the per-texel weight
	vfloat blended = val0 * contrib0 + val1 * contrib1;
	return blended;
}
/**
* @brief Compute the ideal endpoints and weights for 1 color component.
*
@@ -90,7 +160,7 @@ static void compute_ideal_colors_and_weights_1_comp(
highvalue = astc::max(value, highvalue);
}
if (highvalue < lowvalue)
if (highvalue <= lowvalue)
{
lowvalue = 0.0f;
highvalue = 1e-7f;
@@ -198,13 +268,13 @@ static void compute_ideal_colors_and_weights_2_comp(
for (unsigned int i = 0; i < partition_count; i++)
{
vfloat4 dir = pms[i].dir.swz<0, 1>();
vfloat4 dir = pms[i].dir;
if (hadd_s(dir) < 0.0f)
{
dir = vfloat4::zero() - dir;
}
line2 line { pms[i].avg.swz<0, 1>(), normalize_safe(dir, unit2()) };
line2 line { pms[i].avg, normalize_safe(dir, unit2()) };
float lowparam { 1e10f };
float highparam { -1e10f };
@@ -222,7 +292,7 @@ static void compute_ideal_colors_and_weights_2_comp(
// It is possible for a uniform-color partition to produce length=0;
// this causes NaN issues so set to small value to avoid this problem
if (highparam < lowparam)
if (highparam <= lowparam)
{
lowparam = 0.0f;
highparam = 1e-7f;
@@ -371,7 +441,7 @@ static void compute_ideal_colors_and_weights_3_comp(
// It is possible for a uniform-color partition to produce length=0;
// this causes NaN issues so set to small value to avoid this problem
if (highparam < lowparam)
if (highparam <= lowparam)
{
lowparam = 0.0f;
highparam = 1e-7f;
@@ -493,7 +563,7 @@ static void compute_ideal_colors_and_weights_4_comp(
// It is possible for a uniform-color partition to produce length=0;
// this causes NaN issues so set to small value to avoid this problem
if (highparam < lowparam)
if (highparam <= lowparam)
{
lowparam = 0.0f;
highparam = 1e-7f;
@@ -621,8 +691,8 @@ float compute_error_of_weight_set_1plane(
const float* dec_weight_quant_uvalue
) {
vfloatacc error_summav = vfloatacc::zero();
float error_summa = 0.0f;
unsigned int texel_count = di.texel_count;
promise(texel_count > 0);
// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
if (di.max_texel_weight_count > 2)
@@ -675,7 +745,7 @@ float compute_error_of_weight_set_1plane(
}
// Resolve the final scalar accumulator sum
return error_summa = hadd_s(error_summav);
return hadd_s(error_summav);
}
/* See header for documentation. */
@@ -688,6 +758,7 @@ float compute_error_of_weight_set_2planes(
) {
vfloatacc error_summav = vfloatacc::zero();
unsigned int texel_count = di.texel_count;
promise(texel_count > 0);
// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
if (di.max_texel_weight_count > 2)
@@ -772,8 +843,7 @@ float compute_error_of_weight_set_2planes(
/* See header for documentation. */
void compute_ideal_weights_for_decimation(
const endpoints_and_weights& eai_in,
endpoints_and_weights& eai_out,
const endpoints_and_weights& ei,
const decimation_info& di,
float* dec_weight_ideal_value
) {
@@ -783,49 +853,31 @@ void compute_ideal_weights_for_decimation(
promise(texel_count > 0);
promise(weight_count > 0);
// This function includes a copy of the epw from eai_in to eai_out. We do it here because we
// want to load the data anyway, so we can avoid loading it from memory twice.
eai_out.ep = eai_in.ep;
eai_out.is_constant_weight_error_scale = eai_in.is_constant_weight_error_scale;
// Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
// can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
// arrays always contain space for 64 elements
unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
// If we have a 1:1 mapping just shortcut the computation - clone the weights into both the
// weight set and the output epw copy.
// Transfer enough to also copy zero initialized SIMD over-fetch region
unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
for (unsigned int i = 0; i < texel_count_simd; i += ASTCENC_SIMD_WIDTH)
{
vfloat weight(eai_in.weights + i);
vfloat weight_error_scale(eai_in.weight_error_scale + i);
storea(weight, eai_out.weights + i);
storea(weight_error_scale, eai_out.weight_error_scale + i);
// Direct 1:1 weight mapping, so clone weights directly
// TODO: Can we just avoid the copy for direct cases?
if (is_direct)
{
storea(weight, dec_weight_ideal_value + i);
}
}
// If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
// zero-initialized SIMD over-fetch region
if (is_direct)
{
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vfloat weight(ei.weights + i);
storea(weight, dec_weight_ideal_value + i);
}
return;
}
// Otherwise compute an estimate and perform single refinement iteration
alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
// Compute an initial average for each decimated weight
bool constant_wes = eai_in.is_constant_weight_error_scale;
vfloat weight_error_scale(eai_in.weight_error_scale[0]);
bool constant_wes = ei.is_constant_weight_error_scale;
vfloat weight_error_scale(ei.weight_error_scale[0]);
// This overshoots - this is OK as we initialize the array tails in the
// decimation table structures to safe values ...
@@ -842,24 +894,24 @@ void compute_ideal_weights_for_decimation(
for (unsigned int j = 0; j < max_texel_count; j++)
{
vint texel(di.weight_texel[j] + i);
vfloat weight = loada(di.weights_flt[j] + i);
vint texel(di.weight_texels_tr[j] + i);
vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
if (!constant_wes)
{
weight_error_scale = gatherf(eai_in.weight_error_scale, texel);
weight_error_scale = gatherf(ei.weight_error_scale, texel);
}
vfloat contrib_weight = weight * weight_error_scale;
weight_weight += contrib_weight;
initial_weight += gatherf(eai_in.weights, texel) * contrib_weight;
initial_weight += gatherf(ei.weights, texel) * contrib_weight;
}
storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
}
// Populate the interpolated weight grid based on the initital average
// Populate the interpolated weight grid based on the initial average
// Process SIMD-width texel coordinates at a time while we can. Safe to
// over-process full SIMD vectors - the tail is zeroed.
if (di.max_texel_weight_count <= 2)
@@ -900,17 +952,17 @@ void compute_ideal_weights_for_decimation(
for (unsigned int j = 0; j < max_texel_count; j++)
{
vint texel(di.weight_texel[j] + i);
vfloat contrib_weight = loada(di.weights_flt[j] + i);
vint texel(di.weight_texels_tr[j] + i);
vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
if (!constant_wes)
{
weight_error_scale = gatherf(eai_in.weight_error_scale, texel);
weight_error_scale = gatherf(ei.weight_error_scale, texel);
}
vfloat scale = weight_error_scale * contrib_weight;
vfloat old_weight = gatherf(infilled_weights, texel);
vfloat ideal_weight = gatherf(eai_in.weights, texel);
vfloat ideal_weight = gatherf(ei.weights, texel);
error_change0 += contrib_weight * scale;
error_change1 += (old_weight - ideal_weight) * scale;
@@ -919,7 +971,7 @@ void compute_ideal_weights_for_decimation(
vfloat step = (error_change1 * chd_scale) / error_change0;
step = clamp(-stepsize, stepsize, step);
// Update the weight; note this can store negative values.
// Update the weight; note this can store negative values
storea(weight_val + step, dec_weight_ideal_value + i);
}
}
@@ -936,19 +988,20 @@ void compute_quantized_weights_for_decimation(
) {
int weight_count = di.weight_count;
promise(weight_count > 0);
const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[quant_level]);
const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level];
// The available quant levels, stored with a minus 1 bias
static const float quant_levels_m1[12] {
1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
};
vint steps_m1(get_quant_level(quant_level) - 1);
float quant_level_m1 = quant_levels_m1[quant_level];
// Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
// TODO: Oddity to investigate; triggered by test in issue #265.
if (high_bound < low_bound)
if (high_bound <= low_bound)
{
low_bound = 0.0f;
high_bound = 1.0f;
@@ -968,29 +1021,72 @@ void compute_quantized_weights_for_decimation(
// This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
if (get_quant_level(quant_level) <= 16)
{
vfloat ix = loada(&dec_weight_ideal_value[i]) * scalev - scaled_low_boundv;
ix = clampzo(ix);
vint4 tab0 = vint4::load(qat.quant_to_unquant);
vint tab0p;
vtable_prepare(tab0, tab0p);
// Look up the two closest indexes and return the one that was closest
vfloat ix1 = ix * quant_level_m1v;
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
ix = clampzo(ix);
vint weightl = float_to_int(ix1);
vint weighth = weightl + vint(1);
// Look up the two closest indexes and return the one that was closest
vfloat ix1 = ix * quant_level_m1v;
vfloat ixl = gatherf(qat->unquantized_value_unsc, weightl);
vfloat ixh = gatherf(qat->unquantized_value_unsc, weighth);
vint weightl = float_to_int(ix1);
vint weighth = min(weightl + vint(1), steps_m1);
vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
vint weight = select(weightl, weighth, mask);
ixl = select(ixl, ixh, mask);
vint ixli = vtable_8bt_32bi(tab0p, weightl);
vint ixhi = vtable_8bt_32bi(tab0p, weighth);
// Invert the weight-scaling that was done initially
storea(ixl * rscalev + low_boundv, &weight_set_out[i]);
vint scm = gatheri(qat->scramble_map, weight);
vint scn = pack_low_bytes(scm);
store_nbytes(scn, &quantized_weight_set[i]);
vfloat ixl = int_to_float(ixli);
vfloat ixh = int_to_float(ixhi);
vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
vint weight = select(ixli, ixhi, mask);
ixl = select(ixl, ixh, mask);
// Invert the weight-scaling that was done initially
storea(ixl * rscalev + low_boundv, weight_set_out + i);
vint scn = pack_low_bytes(weight);
store_nbytes(scn, quantized_weight_set + i);
}
}
else
{
vint4 tab0 = vint4::load(qat.quant_to_unquant + 0);
vint4 tab1 = vint4::load(qat.quant_to_unquant + 16);
vint tab0p, tab1p;
vtable_prepare(tab0, tab1, tab0p, tab1p);
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
ix = clampzo(ix);
// Look up the two closest indexes and return the one that was closest
vfloat ix1 = ix * quant_level_m1v;
vint weightl = float_to_int(ix1);
vint weighth = min(weightl + vint(1), steps_m1);
vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
vfloat ixl = int_to_float(ixli);
vfloat ixh = int_to_float(ixhi);
vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
vint weight = select(ixli, ixhi, mask);
ixl = select(ixl, ixh, mask);
// Invert the weight-scaling that was done initially
storea(ixl * rscalev + low_boundv, weight_set_out + i);
vint scn = pack_low_bytes(weight);
store_nbytes(scn, quantized_weight_set + i);
}
}
}
@@ -1062,8 +1158,7 @@ void recompute_ideal_colors_1plane(
const image_block& blk,
const partition_info& pi,
const decimation_info& di,
int weight_quant_mode,
const uint8_t* dec_weights_quant_pvalue,
const uint8_t* dec_weights_uquant,
endpoints& ep,
vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
@@ -1076,15 +1171,15 @@ void recompute_ideal_colors_1plane(
promise(total_texel_count > 0);
promise(partition_count > 0);
const quantization_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_mode];
float dec_weight[BLOCK_MAX_WEIGHTS];
for (unsigned int i = 0; i < weight_count; i++)
ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
dec_weight[i] = qat.unquantized_value[dec_weights_quant_pvalue[i]] * (1.0f / 64.0f);
vint unquant_value(dec_weights_uquant + i);
vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f);
storea(unquant_valuef, dec_weight + i);
}
alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
float* undec_weight_ref;
if (di.max_texel_weight_count == 1)
{
@@ -1121,7 +1216,7 @@ void recompute_ideal_colors_1plane(
// Only compute a partition mean if more than one partition
if (partition_count > 1)
{
rgba_sum = vfloat4(1e-17f);
rgba_sum = vfloat4::zero();
promise(texel_count > 0);
for (unsigned int j = 0; j < texel_count; j++)
{
@@ -1157,7 +1252,6 @@ void recompute_ideal_colors_1plane(
for (unsigned int j = 0; j < texel_count; j++)
{
unsigned int tix = texel_indexes[j];
vfloat4 rgba = blk.texel(tix);
float idx0 = undec_weight_ref[tix];
@@ -1190,14 +1284,11 @@ void recompute_ideal_colors_1plane(
vfloat4 right_sum = vfloat4(right_sum_s) * color_weight;
vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
float psum = right_sum_s * hadd_rgb_s(color_weight);
color_vec_x = color_vec_x * color_weight;
color_vec_y = color_vec_y * color_weight;
// Initialize the luminance and scale vectors with a reasonable default
float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f));
float scalediv = scale_min / astc::max(scale_max, 1e-10f);
scalediv = astc::clamp1f(scalediv);
vfloat4 sds = scale_dir * scale_max;
@@ -1249,32 +1340,38 @@ void recompute_ideal_colors_1plane(
if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
{
float scalediv2 = scale_ep0 * (1.0f / scale_ep1);
float scalediv2 = scale_ep0 / scale_ep1;
vfloat4 sdsm = scale_dir * scale_ep1;
rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
}
}
// Calculations specific to mode #7, the HDR RGB-scale mode
vfloat4 rgbq_sum = color_vec_x + color_vec_y;
rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
rgbo_vectors[i] = rgbovec;
// We can get a failure due to the use of a singular (non-invertible) matrix
// If it failed, compute rgbo_vectors[] with a different method ...
if (astc::isnan(dot_s(rgbovec, rgbovec)))
// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
if (blk.rgb_lns[0] || blk.alpha_lns[0])
{
vfloat4 v0 = ep.endpt0[i];
vfloat4 v1 = ep.endpt1[i];
vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
float psum = right_sum_s * hadd_rgb_s(color_weight);
float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
avgdif = astc::max(avgdif, 0.0f);
vfloat4 rgbq_sum = color_vec_x + color_vec_y;
rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
vfloat4 avg = (v0 + v1) * 0.5f;
vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
rgbo_vectors[i] = rgbovec;
// We can get a failure due to the use of a singular (non-invertible) matrix
// If it failed, compute rgbo_vectors[] with a different method ...
if (astc::isnan(dot_s(rgbovec, rgbovec)))
{
vfloat4 v0 = ep.endpt0[i];
vfloat4 v1 = ep.endpt1[i];
float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
avgdif = astc::max(avgdif, 0.0f);
vfloat4 avg = (v0 + v1) * 0.5f;
vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
}
}
}
}
@@ -1284,9 +1381,8 @@ void recompute_ideal_colors_2planes(
const image_block& blk,
const block_size_descriptor& bsd,
const decimation_info& di,
int weight_quant_mode,
const uint8_t* dec_weights_quant_pvalue_plane1,
const uint8_t* dec_weights_quant_pvalue_plane2,
const uint8_t* dec_weights_uquant_plane1,
const uint8_t* dec_weights_uquant_plane2,
endpoints& ep,
vfloat4& rgbs_vector,
vfloat4& rgbo_vector,
@@ -1298,20 +1394,24 @@ void recompute_ideal_colors_2planes(
promise(total_texel_count > 0);
promise(weight_count > 0);
const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quant_mode]);
float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
for (unsigned int i = 0; i < weight_count; i++)
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
dec_weight_plane1[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane1[i]] * (1.0f / 64.0f);
dec_weight_plane2[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane2[i]] * (1.0f / 64.0f);
vint unquant_value1(dec_weights_uquant_plane1 + i);
vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f);
storea(unquant_value1f, dec_weight_plane1 + i);
vint unquant_value2(dec_weights_uquant_plane2 + i);
vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f);
storea(unquant_value2f, dec_weight_plane2 + i);
}
alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];
float* undec_weight_plane1_ref;
float* undec_weight_plane2_ref;
@@ -1419,7 +1519,7 @@ void recompute_ideal_colors_2planes(
color_vec_x += cwprod - cwiprod;
scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
weight_weight_sum += (color_weight * color_idx);
weight_weight_sum += color_idx;
}
vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight;
@@ -1431,13 +1531,11 @@ void recompute_ideal_colors_2planes(
vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight;
float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
color_vec_x = color_vec_x * color_weight;
color_vec_y = color_vec_y * color_weight;
// Initialize the luminance and scale vectors with a reasonable default
float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f));
float scalediv = scale_min / astc::max(scale_max, 1e-10f);
scalediv = astc::clamp1f(scalediv);
vfloat4 sds = scale_dir * scale_max;
@@ -1493,7 +1591,7 @@ void recompute_ideal_colors_2planes(
if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
{
float scalediv2 = scale_ep0 * (1.0f / scale_ep1);
float scalediv2 = scale_ep0 / scale_ep1;
vfloat4 sdsm = scale_dir * scale_ep1;
rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
}
@@ -1533,26 +1631,32 @@ void recompute_ideal_colors_2planes(
ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
}
// Calculations specific to mode #7, the HDR RGB-scale mode
vfloat4 rgbq_sum = color_vec_x + color_vec_y;
rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
// We can get a failure due to the use of a singular (non-invertible) matrix
// If it failed, compute rgbo_vectors[] with a different method ...
if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
// Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
if (blk.rgb_lns[0] || blk.alpha_lns[0])
{
vfloat4 v0 = ep.endpt0[0];
vfloat4 v1 = ep.endpt1[0];
weight_weight_sum = weight_weight_sum * color_weight;
float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
avgdif = astc::max(avgdif, 0.0f);
vfloat4 rgbq_sum = color_vec_x + color_vec_y;
rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
vfloat4 avg = (v0 + v1) * 0.5f;
vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
// We can get a failure due to the use of a singular (non-invertible) matrix
// If it failed, compute rgbo_vectors[] with a different method ...
if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
{
vfloat4 v0 = ep.endpt0[0];
vfloat4 v1 = ep.endpt1[0];
float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
avgdif = astc::max(avgdif, 0.0f);
vfloat4 avg = (v0 + v1) * 0.5f;
vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
}
}
}
+86 -57
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -109,7 +109,7 @@ static vfloat4 swz_texel(
vfloat4 data,
const astcenc_swizzle& swz
) {
alignas(16) float datas[6];
ASTCENC_ALIGNAS float datas[6];
storea(data, datas);
datas[ASTCENC_SWZ_0] = 0.0f;
@@ -143,12 +143,12 @@ static vfloat4 encode_texel_lns(
vmask4 lns_mask
) {
vfloat4 datav_unorm = data * 65535.0f;
vfloat4 datav_lns = float_to_lns(data);
vfloat4 datav_lns = float_to_lns(data);
return select(datav_unorm, datav_lns, lns_mask);
}
/* See header for documentation. */
void fetch_image_block(
void load_image_block(
astcenc_profile decode_mode,
const astcenc_image& img,
image_block& blk,
@@ -265,7 +265,7 @@ void fetch_image_block(
}
/* See header for documentation. */
void fetch_image_block_fast_ldr(
void load_image_block_fast_ldr(
astcenc_profile decode_mode,
const astcenc_image& img,
image_block& blk,
@@ -332,7 +332,7 @@ void fetch_image_block_fast_ldr(
}
/* See header for documentation. */
void write_image_block(
void store_image_block(
astcenc_image& img,
const image_block& blk,
const block_size_descriptor& bsd,
@@ -341,24 +341,21 @@ void write_image_block(
unsigned int zpos,
const astcenc_swizzle& swz
) {
unsigned int xsize = img.dim_x;
unsigned int ysize = img.dim_y;
unsigned int zsize = img.dim_z;
unsigned int x_size = img.dim_x;
unsigned int x_start = xpos;
unsigned int x_end = std::min(xsize, xpos + bsd.xdim);
unsigned int x_nudge = bsd.xdim - (x_end - x_start);
unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
unsigned int x_count = x_end - x_start;
unsigned int x_nudge = bsd.xdim - x_count;
unsigned int y_size = img.dim_y;
unsigned int y_start = ypos;
unsigned int y_end = std::min(ysize, ypos + bsd.ydim);
unsigned int y_nudge = (bsd.ydim - (y_end - y_start)) * bsd.xdim;
unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
unsigned int y_count = y_end - y_start;
unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
unsigned int z_size = img.dim_z;
unsigned int z_start = zpos;
unsigned int z_end = std::min(zsize, zpos + bsd.zdim);
float data[7];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
// True if any non-identity swizzle
bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
@@ -378,47 +375,68 @@ void write_image_block(
for (unsigned int y = y_start; y < y_end; y++)
{
for (unsigned int x = x_start; x < x_end; x++)
{
vint4 colori = vint4::zero();
uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
// Errors are NaN encoded - convert to magenta error color
if (blk.data_r[idx] != blk.data_r[idx])
for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
{
unsigned int max_texels = ASTCENC_SIMD_WIDTH;
unsigned int used_texels = astc::min(x_count - x, max_texels);
// Unaligned load as rows are not always SIMD_WIDTH long
vfloat data_r(blk.data_r + idx);
vfloat data_g(blk.data_g + idx);
vfloat data_b(blk.data_b + idx);
vfloat data_a(blk.data_a + idx);
vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
if (needs_swz)
{
colori = vint4(0xFF, 0x00, 0xFF, 0xFF);
}
else if (needs_swz)
{
data[ASTCENC_SWZ_R] = blk.data_r[idx];
data[ASTCENC_SWZ_G] = blk.data_g[idx];
data[ASTCENC_SWZ_B] = blk.data_b[idx];
data[ASTCENC_SWZ_A] = blk.data_a[idx];
vint swizzle_table[7];
swizzle_table[ASTCENC_SWZ_0] = vint(0);
swizzle_table[ASTCENC_SWZ_1] = vint(255);
swizzle_table[ASTCENC_SWZ_R] = data_ri;
swizzle_table[ASTCENC_SWZ_G] = data_gi;
swizzle_table[ASTCENC_SWZ_B] = data_bi;
swizzle_table[ASTCENC_SWZ_A] = data_ai;
if (needs_z)
{
float xcoord = (data[0] * 2.0f) - 1.0f;
float ycoord = (data[3] * 2.0f) - 1.0f;
float zcoord = 1.0f - xcoord * xcoord - ycoord * ycoord;
if (zcoord < 0.0f)
{
zcoord = 0.0f;
}
data[ASTCENC_SWZ_Z] = (astc::sqrt(zcoord) * 0.5f) + 0.5f;
vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
data_z = max(data_z, 0.0f);
data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
}
vfloat4 color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
colori = float_to_int_rtn(min(color, 1.0f) * 255.0f);
data_ri = swizzle_table[swz.r];
data_gi = swizzle_table[swz.g];
data_bi = swizzle_table[swz.b];
data_ai = swizzle_table[swz.a];
}
else
// Errors are NaN encoded - convert to magenta error color
// Branch is OK here - it is almost never true so predicts well
vmask nan_mask = data_r != data_r;
if (any(nan_mask))
{
vfloat4 color = blk.texel(idx);
colori = float_to_int_rtn(min(color, 1.0f) * 255.0f);
data_ri = select(data_ri, vint(0xFF), nan_mask);
data_gi = select(data_gi, vint(0x00), nan_mask);
data_bi = select(data_bi, vint(0xFF), nan_mask);
data_ai = select(data_ai, vint(0xFF), nan_mask);
}
colori = pack_low_bytes(colori);
store_nbytes(colori, data8 + (4 * xsize * y) + (4 * x ));
vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
vmask store_mask = vint::lane_id() < vint(used_texels);
store_lanes_masked(data8_row, data_rgbai, store_mask);
idx++;
data8_row += ASTCENC_SIMD_WIDTH * 4;
idx += used_texels;
}
idx += x_nudge;
}
@@ -434,13 +452,18 @@ void write_image_block(
for (unsigned int y = y_start; y < y_end; y++)
{
for (unsigned int x = x_start; x < x_end; x++)
uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
for (unsigned int x = 0; x < x_count; x++)
{
vint4 color;
// NaNs are handled inline - no need to special case
if (needs_swz)
{
float data[7];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
data[ASTCENC_SWZ_R] = blk.data_r[idx];
data[ASTCENC_SWZ_G] = blk.data_g[idx];
data[ASTCENC_SWZ_B] = blk.data_b[idx];
@@ -467,11 +490,12 @@ void write_image_block(
color = float_to_float16(colorf);
}
data16[(4 * xsize * y) + (4 * x )] = static_cast<uint16_t>(color.lane<0>());
data16[(4 * xsize * y) + (4 * x + 1)] = static_cast<uint16_t>(color.lane<1>());
data16[(4 * xsize * y) + (4 * x + 2)] = static_cast<uint16_t>(color.lane<2>());
data16[(4 * xsize * y) + (4 * x + 3)] = static_cast<uint16_t>(color.lane<3>());
// TODO: Vectorize with store N shorts?
data16_row[0] = static_cast<uint16_t>(color.lane<0>());
data16_row[1] = static_cast<uint16_t>(color.lane<1>());
data16_row[2] = static_cast<uint16_t>(color.lane<2>());
data16_row[3] = static_cast<uint16_t>(color.lane<3>());
data16_row += 4;
idx++;
}
idx += x_nudge;
@@ -490,13 +514,18 @@ void write_image_block(
for (unsigned int y = y_start; y < y_end; y++)
{
for (unsigned int x = x_start; x < x_end; x++)
float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
for (unsigned int x = 0; x < x_count; x++)
{
vfloat4 color = blk.texel(idx);
// NaNs are handled inline - no need to special case
if (needs_swz)
{
float data[7];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
data[ASTCENC_SWZ_R] = color.lane<0>();
data[ASTCENC_SWZ_G] = color.lane<1>();
data[ASTCENC_SWZ_B] = color.lane<2>();
@@ -517,8 +546,8 @@ void write_image_block(
color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
}
store(color, data32 + (4 * xsize * y) + (4 * x ));
store(color, data32_row);
data32_row += 4;
idx++;
}
idx += x_nudge;
+61 -66
View File
@@ -24,6 +24,7 @@
#include <array>
/** @brief Unpacked quint triplets <low,middle,high> for each packed value */
// TODO: Bitpack these into a uint16_t?
static const uint8_t quints_of_integer[128][3] {
{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0},
{4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4},
@@ -99,6 +100,7 @@ static const uint8_t integer_of_quints[5][5][5] {
};
/** @brief Unpacked trit quintuplets <low,...,high> for each packed value */
// TODO: Bitpack these into a uint16_t?
static const uint8_t trits_of_integer[256][5] {
{0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0},
{0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0},
@@ -334,44 +336,41 @@ static const uint8_t integer_of_trits[3][3][3][3][3] {
*/
struct btq_count
{
/** @brief The quantization level. */
uint8_t quant;
/** @brief The number of bits. */
uint8_t bits;
uint8_t bits:6;
/** @brief The number of trits. */
uint8_t trits;
uint8_t trits:1;
/** @brief The number of quints. */
uint8_t quints;
uint8_t quints:1;
};
/**
* @brief The table of bits, trits, and quints needed for a quant encode.
*/
static const std::array<btq_count, 21> btq_counts {{
{ QUANT_2, 1, 0, 0 },
{ QUANT_3, 0, 1, 0 },
{ QUANT_4, 2, 0, 0 },
{ QUANT_5, 0, 0, 1 },
{ QUANT_6, 1, 1, 0 },
{ QUANT_8, 3, 0, 0 },
{ QUANT_10, 1, 0, 1 },
{ QUANT_12, 2, 1, 0 },
{ QUANT_16, 4, 0, 0 },
{ QUANT_20, 2, 0, 1 },
{ QUANT_24, 3, 1, 0 },
{ QUANT_32, 5, 0, 0 },
{ QUANT_40, 3, 0, 1 },
{ QUANT_48, 4, 1, 0 },
{ QUANT_64, 6, 0, 0 },
{ QUANT_80, 4, 0, 1 },
{ QUANT_96, 5, 1, 0 },
{ QUANT_128, 7, 0, 0 },
{ QUANT_160, 5, 0, 1 },
{ QUANT_192, 6, 1, 0 },
{ QUANT_256, 8, 0, 0 }
{ 1, 0, 0 }, // QUANT_2
{ 0, 1, 0 }, // QUANT_3
{ 2, 0, 0 }, // QUANT_4
{ 0, 0, 1 }, // QUANT_5
{ 1, 1, 0 }, // QUANT_6
{ 3, 0, 0 }, // QUANT_8
{ 1, 0, 1 }, // QUANT_10
{ 2, 1, 0 }, // QUANT_12
{ 4, 0, 0 }, // QUANT_16
{ 2, 0, 1 }, // QUANT_20
{ 3, 1, 0 }, // QUANT_24
{ 5, 0, 0 }, // QUANT_32
{ 3, 0, 1 }, // QUANT_40
{ 4, 1, 0 }, // QUANT_48
{ 6, 0, 0 }, // QUANT_64
{ 4, 0, 1 }, // QUANT_80
{ 5, 1, 0 }, // QUANT_96
{ 7, 0, 0 }, // QUANT_128
{ 5, 0, 1 }, // QUANT_160
{ 6, 1, 0 }, // QUANT_192
{ 8, 0, 0 } // QUANT_256
}};
/**
@@ -382,44 +381,38 @@ static const std::array<btq_count, 21> btq_counts {{
*/
struct ise_size
{
/** @brief The quantization level. */
uint8_t quant;
/** @brief The scaling parameter. */
uint8_t scale;
/** @brief The rounding parameter. */
uint8_t round;
uint8_t scale:6;
/** @brief The divisor parameter. */
uint8_t divisor;
uint8_t divisor:2;
};
/**
* @brief The table of scale, round, and divisors needed for quant sizing.
*/
static const std::array<ise_size, 21> ise_sizes {{
{ QUANT_2, 1, 0, 1 },
{ QUANT_3, 8, 4, 5 },
{ QUANT_4, 2, 0, 1 },
{ QUANT_5, 7, 2, 3 },
{ QUANT_6, 13, 4, 5 },
{ QUANT_8, 3, 0, 1 },
{ QUANT_10, 10, 2, 3 },
{ QUANT_12, 18, 4, 5 },
{ QUANT_16, 4, 0, 1 },
{ QUANT_20, 13, 2, 3 },
{ QUANT_24, 23, 4, 5 },
{ QUANT_32, 5, 0, 1 },
{ QUANT_40, 16, 2, 3 },
{ QUANT_48, 28, 4, 5 },
{ QUANT_64, 6, 0, 1 },
{ QUANT_80, 19, 2, 3 },
{ QUANT_96, 33, 4, 5 },
{ QUANT_128, 7, 0, 1 },
{ QUANT_160, 22, 2, 3 },
{ QUANT_192, 38, 4, 5 },
{ QUANT_256, 8, 0, 1 }
{ 1, 0 }, // QUANT_2
{ 8, 2 }, // QUANT_3
{ 2, 0 }, // QUANT_4
{ 7, 1 }, // QUANT_5
{ 13, 2 }, // QUANT_6
{ 3, 0 }, // QUANT_8
{ 10, 1 }, // QUANT_10
{ 18, 2 }, // QUANT_12
{ 4, 0 }, // QUANT_16
{ 13, 1 }, // QUANT_20
{ 23, 2 }, // QUANT_24
{ 5, 0 }, // QUANT_32
{ 16, 1 }, // QUANT_40
{ 28, 2 }, // QUANT_48
{ 6, 0 }, // QUANT_64
{ 19, 1 }, // QUANT_80
{ 33, 2 }, // QUANT_96
{ 7, 0 }, // QUANT_128
{ 22, 1 }, // QUANT_160
{ 38, 2 }, // QUANT_192
{ 8, 0 } // QUANT_256
}};
/* See header for documentation. */
@@ -435,7 +428,8 @@ unsigned int get_ise_sequence_bitcount(
}
auto& entry = ise_sizes[quant_level];
return (entry.scale * character_count + entry.round) / entry.divisor;
unsigned int divisor = (entry.divisor << 1) + 1;
return (entry.scale * character_count + divisor - 1) / divisor;
}
/**
@@ -645,7 +639,6 @@ void encode_ise(
// Write out just bits
else
{
promise(character_count > 0);
for (unsigned int i = 0; i < character_count; i++)
{
write_bits(input_data[i], bits, bit_offset, output_data);
@@ -685,10 +678,10 @@ void decode_ise(
if (trits)
{
static const unsigned int bits_to_read[5] { 2, 2, 1, 2, 1 };
static const unsigned int block_shift[5] { 0, 2, 4, 5, 7 };
static const unsigned int next_lcounter[5] { 1, 2, 3, 4, 0 };
static const unsigned int hcounter_incr[5] { 0, 0, 0, 0, 1 };
static const uint8_t bits_to_read[5] { 2, 2, 1, 2, 1 };
static const uint8_t block_shift[5] { 0, 2, 4, 5, 7 };
static const uint8_t next_lcounter[5] { 1, 2, 3, 4, 0 };
static const uint8_t hcounter_incr[5] { 0, 0, 0, 0, 1 };
unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
bit_offset += bits_to_read[lcounter];
tq_blocks[hcounter] |= tdata << block_shift[lcounter];
@@ -698,10 +691,10 @@ void decode_ise(
if (quints)
{
static const unsigned int bits_to_read[3] { 3, 2, 2 };
static const unsigned int block_shift[3] { 0, 3, 5 };
static const unsigned int next_lcounter[3] { 1, 2, 0 };
static const unsigned int hcounter_incr[3] { 0, 0, 1 };
static const uint8_t bits_to_read[3] { 3, 2, 2 };
static const uint8_t block_shift[3] { 0, 3, 5 };
static const uint8_t next_lcounter[3] { 1, 2, 0 };
static const uint8_t hcounter_incr[3] { 0, 0, 1 };
unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
bit_offset += bits_to_read[lcounter];
tq_blocks[hcounter] |= tdata << block_shift[lcounter];
@@ -714,6 +707,7 @@ void decode_ise(
if (trits)
{
unsigned int trit_blocks = (character_count + 4) / 5;
promise(trit_blocks > 0);
for (unsigned int i = 0; i < trit_blocks; i++)
{
const uint8_t *tritptr = trits_of_integer[tq_blocks[i]];
@@ -728,6 +722,7 @@ void decode_ise(
if (quints)
{
unsigned int quint_blocks = (character_count + 2) / 3;
promise(quint_blocks > 0);
for (unsigned int i = 0; i < quint_blocks; i++)
{
const uint8_t *quintptr = quints_of_integer[tq_blocks[i]];
+386 -640
View File
File diff suppressed because it is too large Load Diff
+330
View File
@@ -0,0 +1,330 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions and data declarations for the outer context.
*
* The outer context includes thread-pool management, which is slower to
* compile due to increased use of C++ stdlib. The inner context used in the
* majority of the codec library does not include this.
*/
#ifndef ASTCENC_INTERNAL_ENTRY_INCLUDED
#define ASTCENC_INTERNAL_ENTRY_INCLUDED
#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include "astcenc_internal.h"
/* ============================================================================
Parallel execution control
============================================================================ */
/**
* @brief A simple counter-based manager for parallel task execution.
*
* The task processing execution consists of:
*
* * A single-threaded init stage.
* * A multi-threaded processing stage.
* * A condition variable so threads can wait for processing completion.
*
* The init stage will be executed by the first thread to arrive in the critical section, there is
* no main thread in the thread pool.
*
* The processing stage uses dynamic dispatch to assign task tickets to threads on an on-demand
* basis. Threads may each therefore execute different numbers of tasks, depending on their
* processing complexity. The task queue and the task tickets are just counters; the caller must map
* these integers to an actual processing partition in a specific problem domain.
*
* The exit wait condition is needed to ensure processing has finished before a worker thread can
* progress to the next stage of the pipeline. Specifically a worker may exit the processing stage
* because there are no new tasks to assign to it while other worker threads are still processing.
* Calling @c wait() will ensure that all other workers have finished before the thread can proceed.
*
* The basic usage model:
*
* // --------- From single-threaded code ---------
*
* // Reset the tracker state
* manager->reset()
*
* // --------- From multi-threaded code ---------
*
* // Run the stage init; only first thread actually runs the lambda
* manager->init(<lambda>)
*
* do
* {
* // Request a task assignment
* uint task_count;
* uint base_index = manager->get_task_assignment(<granule>, task_count);
*
* // Process any tasks we were given (task_count <= granule size)
* if (task_count)
* {
* // Run the user task processing code for N tasks here
* ...
*
* // Flag these tasks as complete
* manager->complete_task_assignment(task_count);
* }
* } while (task_count);
*
* // Wait for all threads to complete tasks before progressing
* manager->wait()
*
* // Run the stage term; only first thread actually runs the lambda
* manager->term(<lambda>)
*/
class ParallelManager
{
private:
	/** @brief Lock used for critical section and condition synchronization. */
	std::mutex m_lock;

	/** @brief True if the stage init() step has been executed. */
	bool m_init_done;

	/** @brief True if the stage term() step has been executed. */
	bool m_term_done;

	/** @brief Condition variable for tracking stage processing completion. */
	std::condition_variable m_complete;

	/** @brief Number of tasks started, but not necessarily finished. */
	std::atomic<unsigned int> m_start_count;

	/** @brief Number of tasks finished; only accessed while holding @c m_lock. */
	unsigned int m_done_count;

	/** @brief Number of tasks that need to be processed. */
	unsigned int m_task_count;

	/** @brief Progress callback (optional); null when no callback is registered. */
	astcenc_progress_callback m_callback;

	/** @brief Lock used for callback synchronization. */
	std::mutex m_callback_lock;

	/** @brief Minimum progress before making a callback. */
	float m_callback_min_diff;

	/** @brief Last progress callback value. */
	float m_callback_last_value;

public:
	/** @brief Create a new ParallelManager. */
	ParallelManager()
	{
		reset();
	}

	/**
	 * @brief Reset the tracker for a new processing batch.
	 *
	 * This must be called from single-threaded code before starting the multi-threaded processing
	 * operations.
	 */
	void reset()
	{
		m_init_done = false;
		m_term_done = false;
		m_start_count = 0;
		m_done_count = 0;
		m_task_count = 0;

		// Only the init(task_count, callback) overload assigns the callback. Clear it here so
		// that a freshly constructed manager, or one reused for a stage that is started through
		// the init(init_func) overload, never tests or invokes an indeterminate or stale pointer
		// in complete_task_assignment().
		m_callback = nullptr;
		m_callback_last_value = 0.0f;
		m_callback_min_diff = 1.0f;
	}

	/**
	 * @brief Trigger the pipeline stage init step.
	 *
	 * This can be called from multi-threaded code. The first thread to hit this will process the
	 * initialization. Other threads will block and wait for it to complete.
	 *
	 * No progress callback is registered by this overload.
	 *
	 * @param init_func   Callable which executes the stage initialization. It must return the
	 *                    total number of tasks in the stage.
	 */
	void init(std::function<unsigned int(void)> init_func)
	{
		std::lock_guard<std::mutex> lck(m_lock);
		if (!m_init_done)
		{
			m_task_count = init_func();
			m_init_done = true;
		}
	}

	/**
	 * @brief Trigger the pipeline stage init step.
	 *
	 * This can be called from multi-threaded code. The first thread to hit this will process the
	 * initialization. Other threads will block and wait for it to complete.
	 *
	 * @param task_count   Total number of tasks needing processing.
	 * @param callback     Function pointer for progress status callbacks.
	 */
	void init(unsigned int task_count, astcenc_progress_callback callback)
	{
		std::lock_guard<std::mutex> lck(m_lock);
		if (!m_init_done)
		{
			m_callback = callback;
			m_task_count = task_count;
			m_init_done = true;

			// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
			float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
			m_callback_min_diff = astc::max(min_diff, 1.0f);
		}
	}

	/**
	 * @brief Request a task assignment.
	 *
	 * Assign up to @c granule tasks to the caller for processing.
	 *
	 * @param      granule   Maximum number of tasks that can be assigned.
	 * @param[out] count     Actual number of tasks assigned, or zero if no tasks were assigned.
	 *
	 * @return Task index of the first assigned task; assigned tasks increment from this.
	 */
	unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
	{
		// Claim a ticket range; the counter may run past m_task_count once the work is exhausted
		unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
		if (base >= m_task_count)
		{
			count = 0;
			return 0;
		}

		// The final assignment may be shorter than a full granule
		count = astc::min(m_task_count - base, granule);
		return base;
	}

	/**
	 * @brief Complete a task assignment.
	 *
	 * Mark @c count tasks as complete. This will notify all threads blocked on @c wait() if this
	 * completes the processing of the stage. If a progress callback is registered it is invoked
	 * when progress has advanced by more than the minimum reporting step, and once with 100.0f
	 * when the final task completes.
	 *
	 * @param count   The number of completed tasks.
	 */
	void complete_task_assignment(unsigned int count)
	{
		// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
		// update here and the wait() for other threads
		unsigned int local_count;
		float local_last_value;
		{
			std::unique_lock<std::mutex> lck(m_lock);
			m_done_count += count;
			local_count = m_done_count;
			local_last_value = m_callback_last_value;

			if (m_done_count == m_task_count)
			{
				// Ensure the progress bar hits 100%
				if (m_callback)
				{
					std::unique_lock<std::mutex> cblck(m_callback_lock);
					m_callback(100.0f);
					m_callback_last_value = 100.0f;
				}

				// Release the lock before waking waiters so they can re-acquire it immediately
				lck.unlock();
				m_complete.notify_all();
			}
		}

		// Process progress callback if we have one
		if (m_callback)
		{
			// Initial lockless test - have we progressed enough to emit?
			float num = static_cast<float>(local_count);
			float den = static_cast<float>(m_task_count);
			float this_value = (num / den) * 100.0f;
			bool report_test = (this_value - local_last_value) > m_callback_min_diff;

			// Recheck under lock, because another thread might report first
			if (report_test)
			{
				std::unique_lock<std::mutex> cblck(m_callback_lock);
				bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
				if (report_retest)
				{
					m_callback(this_value);
					m_callback_last_value = this_value;
				}
			}
		}
	}

	/**
	 * @brief Wait for stage processing to complete.
	 *
	 * Blocks the caller until the number of completed tasks equals the stage task count.
	 */
	void wait()
	{
		std::unique_lock<std::mutex> lck(m_lock);
		m_complete.wait(lck, [this]{ return m_done_count == m_task_count; });
	}

	/**
	 * @brief Trigger the pipeline stage term step.
	 *
	 * This can be called from multi-threaded code. The first thread to hit this will process the
	 * work pool termination. Caller must have called @c wait() prior to calling this function to
	 * ensure that processing is complete.
	 *
	 * @param term_func   Callable which executes the stage termination.
	 */
	void term(std::function<void(void)> term_func)
	{
		std::lock_guard<std::mutex> lck(m_lock);
		if (!m_term_done)
		{
			term_func();
			m_term_done = true;
		}
	}
};
/**
* @brief The astcenc compression context.
*
* Bundles the internal codec state with one ParallelManager per multi-threaded stage. The
* averages and compression managers are only present when ASTCENC_DECOMPRESS_ONLY is not
* defined; the decompression manager is always present.
*/
struct astcenc_context
{
/** @brief The context internal state. */
astcenc_contexti context;
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/** @brief The parallel manager for averages computation. */
ParallelManager manage_avg;
/** @brief The parallel manager for compression. */
ParallelManager manage_compress;
#endif
/** @brief The parallel manager for decompression. */
ParallelManager manage_decompress;
};
#endif
+14 -4
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -48,8 +48,6 @@
#define ASTCENC_SSE 42
#elif defined(__SSE4_1__)
#define ASTCENC_SSE 41
#elif defined(__SSE3__)
#define ASTCENC_SSE 30
#elif defined(__SSE2__)
#define ASTCENC_SSE 20
#else
@@ -75,10 +73,22 @@
#endif
#endif
// Force vector-sized SIMD alignment
#if ASTCENC_AVX
#define ASTCENC_VECALIGN 32
#else
#elif ASTCENC_SSE || ASTCENC_NEON
#define ASTCENC_VECALIGN 16
// Use default alignment for non-SIMD builds
#else
#define ASTCENC_VECALIGN 0
#endif
// C++11 states that alignas(0) should be ignored but GCC doesn't do
// this on some versions, so workaround and avoid emitting alignas(0)
#if ASTCENC_VECALIGN > 0
#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
#else
#define ASTCENC_ALIGNAS
#endif
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
+10 -10
View File
@@ -273,7 +273,7 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
of the mantissa is set.)
*/
p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */
return ((inp + vlx) >> 13) | (p >> 14);
return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
/*
positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
If it is, then return 0, else return 1 (the smallest representable nonzero number)
@@ -283,7 +283,7 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
-inp will set the MSB if the input number is nonzero.
Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
*/
return static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31;
return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);
/*
negative, exponent = 0, round-mode == DOWN, need to check whether number is
@@ -296,7 +296,7 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
the MSB set if it isn't. We then right-shift the value by 31 places to
get a value that is 0 if the input is -0.0 and 1 otherwise.
*/
return ((vlx - inp) >> 31) + UINT32_C(0x8000);
return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));
/*
for all other cases involving underflow/overflow, we don't need to
@@ -330,7 +330,7 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
case 47:
case 48:
case 49:
return vlx;
return static_cast<sf16>(vlx);
/*
for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
@@ -349,14 +349,14 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
case 36:
case 37:
case 39:
return (inp + vlx) >> 13;
return static_cast<sf16>((inp + vlx) >> 13);
/* normal number, round-to-nearest-even. */
case 33:
case 38:
p = inp + vlx;
p += (inp >> 13) & 1;
return p >> 13;
return static_cast<sf16>(p >> 13);
/*
the various denormal cases. These are not expected to be common, so their performance is a bit
@@ -371,22 +371,22 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
case 27:
/* denormal, round towards zero. */
p = 126 - ((inp >> 23) & 0xFF);
return (((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx;
return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);
case 20:
case 26:
/* denormal, round away from zero. */
p = 126 - ((inp >> 23) & 0xFF);
return rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
case 24:
case 29:
/* denormal, round to nearest-away */
p = 126 - ((inp >> 23) & 0xFF);
return rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
case 23:
case 28:
/* denormal, round to nearest-even. */
p = 126 - ((inp >> 23) & 0xFF);
return rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
}
return 0;
+43 -36
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -21,6 +21,9 @@
#include "astcenc_internal.h"
/** @brief The number of 64-bit words needed to represent a canonical partition bit pattern. */
#define BIT_PATTERN_WORDS (((ASTCENC_BLOCK_MAX_TEXELS * 2) + 63) / 64)
/**
* @brief Generate a canonical representation of a partition pattern.
*
@@ -28,22 +31,22 @@
* the remapped texel index. Remapping ensures that we only match on the partition pattern,
* independent of the partition order generated by the hash.
*
* @param texel_count The number of texels in the block.
* @param partition_of_texel The partition assignments, in hash order.
* @param[out] bit_pattern The output bit pattern representation.
* @param texel_count The number of texels in the block.
* @param partition_of_texel The partition assignments, in hash order.
* @param[out] bit_pattern The output bit pattern representation.
*/
static void generate_canonical_partitioning(
unsigned int texel_count,
const uint8_t* partition_of_texel,
uint64_t bit_pattern[7]
uint64_t bit_pattern[BIT_PATTERN_WORDS]
) {
// Clear the pattern
for (unsigned int i = 0; i < 7; i++)
for (unsigned int i = 0; i < BIT_PATTERN_WORDS; i++)
{
bit_pattern[i] = 0;
}
// Store a mapping to reorder the raw partitions so that the the partitions are ordered such
// Store a mapping to reorder the raw partitions so that the partitions are ordered such
// that the lowest texel index in partition N is smaller than the lowest texel index in
// partition N + 1.
int mapped_index[BLOCK_MAX_PARTITIONS];
@@ -76,19 +79,35 @@ static void generate_canonical_partitioning(
* @return @c true if the patterns are the same, @c false otherwise.
*/
static bool compare_canonical_partitionings(
const uint64_t part1[7],
const uint64_t part2[7]
const uint64_t part1[BIT_PATTERN_WORDS],
const uint64_t part2[BIT_PATTERN_WORDS]
) {
return (part1[0] == part2[0]) && (part1[1] == part2[1]) &&
(part1[2] == part2[2]) && (part1[3] == part2[3]) &&
(part1[4] == part2[4]) && (part1[5] == part2[5]) &&
(part1[6] == part2[6]);
return (part1[0] == part2[0])
#if BIT_PATTERN_WORDS > 1
&& (part1[1] == part2[1])
#endif
#if BIT_PATTERN_WORDS > 2
&& (part1[2] == part2[2])
#endif
#if BIT_PATTERN_WORDS > 3
&& (part1[3] == part2[3])
#endif
#if BIT_PATTERN_WORDS > 4
&& (part1[4] == part2[4])
#endif
#if BIT_PATTERN_WORDS > 5
&& (part1[5] == part2[5])
#endif
#if BIT_PATTERN_WORDS > 6
&& (part1[6] == part2[6])
#endif
;
}
/**
* @brief Hash function used for procedural partition assignment.
*
* @param inp The hash seed.
* @param inp The hash seed.
*
* @return The hashed value.
*/
@@ -116,7 +135,7 @@ static uint32_t hash52(
* @param y The texel Y coordinate in the block.
* @param z The texel Z coordinate in the block.
* @param partition_count The total partition count of this encoding.
* @param small_block @c true if the blockhas fewer than 32 texels.
* @param small_block @c true if the block has fewer than 32 texels.
*
* @return The assigned partition index for this texel.
*/
@@ -316,25 +335,21 @@ static bool generate_one_partition_info_entry(
}
// Populate the partition index
pi.partition_index = partition_index;
pi.partition_index = static_cast<uint16_t>(partition_index);
// Populate the coverage bitmaps for 2/3/4 partitions
uint64_t* bitmaps { nullptr };
uint8_t* valids { nullptr };
if (partition_count == 2)
{
bitmaps = bsd.coverage_bitmaps_2[partition_remap_index];
valids = bsd.partitioning_valid_2;
}
else if (partition_count == 3)
{
bitmaps = bsd.coverage_bitmaps_3[partition_remap_index];
valids = bsd.partitioning_valid_3;
}
else if (partition_count == 4)
{
bitmaps = bsd.coverage_bitmaps_4[partition_remap_index];
valids = bsd.partitioning_valid_4;
}
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
@@ -347,9 +362,7 @@ static bool generate_one_partition_info_entry(
if (bitmaps)
{
// Populate the bitmap validity mask
valids[partition_remap_index] = valid ? 0 : 255;
// Populate the partition coverage bitmap
for (unsigned int i = 0; i < partition_count; i++)
{
bitmaps[i] = 0ULL;
@@ -374,12 +387,6 @@ static void build_partition_table_for_one_partition_count(
partition_info* ptab,
uint64_t* canonical_patterns
) {
uint8_t* partitioning_valid[3] {
bsd.partitioning_valid_2,
bsd.partitioning_valid_3,
bsd.partitioning_valid_4
};
unsigned int next_index = 0;
bsd.partitioning_count_selected[partition_count - 1] = 0;
bsd.partitioning_count_all[partition_count - 1] = 0;
@@ -397,7 +404,7 @@ static void build_partition_table_for_one_partition_count(
// Tracker for things we built in the first iteration
uint8_t build[BLOCK_MAX_PARTITIONINGS] { 0 };
for (unsigned int x = 0; x < max_iter; x++)
for (unsigned int x = 0; x < max_iter; x++)
{
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++)
{
@@ -413,11 +420,11 @@ static void build_partition_table_for_one_partition_count(
continue;
}
generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * 7);
generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * BIT_PATTERN_WORDS);
bool keep_canonical = true;
for (unsigned int j = 0; j < next_index; j++)
{
bool match = compare_canonical_partitionings(canonical_patterns + 7 * next_index, canonical_patterns + 7 * j);
bool match = compare_canonical_partitionings(canonical_patterns + next_index * BIT_PATTERN_WORDS, canonical_patterns + j * BIT_PATTERN_WORDS);
if (match)
{
keep_canonical = false;
@@ -429,7 +436,7 @@ static void build_partition_table_for_one_partition_count(
{
if (x == 0)
{
bsd.partitioning_packed_index[partition_count - 2][i] = next_index;
bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
bsd.partitioning_count_selected[partition_count - 1]++;
bsd.partitioning_count_all[partition_count - 1]++;
build[i] = 1;
@@ -440,9 +447,8 @@ static void build_partition_table_for_one_partition_count(
{
if (x == 1)
{
bsd.partitioning_packed_index[partition_count - 2][i] = next_index;
bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
bsd.partitioning_count_all[partition_count - 1]++;
partitioning_valid[partition_count - 2][next_index] = 255;
next_index++;
}
}
@@ -465,7 +471,8 @@ void init_partition_tables(
bsd.partitioning_count_selected[0] = 1;
bsd.partitioning_count_all[0] = 1;
uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * 7];
uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * BIT_PATTERN_WORDS];
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 2, par_tab2, canonical_patterns);
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 3, par_tab3, canonical_patterns);
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 4, par_tab4, canonical_patterns);
File diff suppressed because it is too large Load Diff
+66 -88
View File
@@ -289,25 +289,13 @@ static void compute_encoding_choice_errors(
vmask4 endpt_can_offset = endpt_diff < vfloat4(0.12f * 65535.0f);
bool can_offset_encode = (mask(endpt_can_offset) & 0x7) == 0x7;
// Determine if we can blue contract encode RGB lanes
vfloat4 endpt_diff_bc(
endpt0.lane<0>() + (endpt0.lane<0>() - endpt0.lane<2>()),
endpt1.lane<0>() + (endpt1.lane<0>() - endpt1.lane<2>()),
endpt0.lane<1>() + (endpt0.lane<1>() - endpt0.lane<2>()),
endpt1.lane<1>() + (endpt1.lane<1>() - endpt1.lane<2>())
);
vmask4 endpt_can_bc_lo = endpt_diff_bc > vfloat4(0.01f * 65535.0f);
vmask4 endpt_can_bc_hi = endpt_diff_bc < vfloat4(0.99f * 65535.0f);
bool can_blue_contract = (mask(endpt_can_bc_lo & endpt_can_bc_hi) & 0x7) == 0x7;
// Store out the settings
eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f; // empirical
eci[i].rgb_luma_error = (rgb_luma_error - uncorr_rgb_error) * 1.5f; // wild guess
eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f; // empirical
eci[i].alpha_drop_error = alpha_drop_error * 3.0f;
eci[i].can_offset_encode = can_offset_encode;
eci[i].can_blue_contract = can_blue_contract;
eci[i].can_blue_contract = !blk.is_luminance();
}
}
@@ -333,15 +321,11 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
const endpoints& ep,
vfloat4 error_weight,
float best_error[21][4],
int format_of_choice[21][4]
uint8_t format_of_choice[21][4]
) {
int partition_size = pi.partition_texel_count[partition_index];
static const float baseline_quant_error[21] {
(65536.0f * 65536.0f / 18.0f), // 2 values, 1 step
(65536.0f * 65536.0f / 18.0f) / (2 * 2), // 3 values, 2 steps
(65536.0f * 65536.0f / 18.0f) / (3 * 3), // 4 values, 3 steps
(65536.0f * 65536.0f / 18.0f) / (4 * 4), // 5 values
static const float baseline_quant_error[21 - QUANT_6] {
(65536.0f * 65536.0f / 18.0f) / (5 * 5),
(65536.0f * 65536.0f / 18.0f) / (7 * 7),
(65536.0f * 65536.0f / 18.0f) / (9 * 9),
@@ -529,7 +513,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
best_error[i][1] = ERROR_CALC_DEFAULT;
best_error[i][0] = ERROR_CALC_DEFAULT;
format_of_choice[i][3] = encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA;
format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA);
format_of_choice[i][2] = FMT_HDR_RGB;
format_of_choice[i][1] = FMT_HDR_RGB_SCALE;
format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
@@ -540,7 +524,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
// The base_quant_error should depend on the scale-factor that would be used during
// actual encode of the color value
float base_quant_error = baseline_quant_error[i] * static_cast<float>(partition_size);
float base_quant_error = baseline_quant_error[i - QUANT_6] * static_cast<float>(partition_size);
float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f;
float alpha_quantization_error = error_weight.lane<3>() * base_quant_error * 2.0f;
float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error;
@@ -549,7 +533,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
float full_hdr_rgba_error = rgba_quantization_error + rgb_range_error + alpha_range_error;
best_error[i][3] = full_hdr_rgba_error;
format_of_choice[i][3] = encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA;
format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA);
// For 6 integers, we have one HDR-RGB encoding
float full_hdr_rgb_error = (rgb_quantization_error * mode11mult) + rgb_range_error + eci.alpha_drop_error;
@@ -603,7 +587,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
error_scale_oe_rgb = 1.0f;
}
float base_quant_error = baseline_quant_error[i];
float base_quant_error = baseline_quant_error[i - QUANT_6];
float quant_error_rgb = base_quant_error_rgb * base_quant_error;
float quant_error_rgba = base_quant_error_rgba * base_quant_error;
@@ -688,10 +672,10 @@ static void compute_color_error_for_every_integer_count_and_quant_level(
static float one_partition_find_best_combination_for_bitcount(
QualityProfile privateProfile,
const float best_combined_error[21][4],
const int best_combined_format[21][4],
const uint8_t best_combined_format[21][4],
int bits_available,
quant_method& best_quant_level,
int& best_format
uint8_t& best_quant_level,
uint8_t& best_format
) {
int best_integer_count = 0;
float best_integer_count_error = ERROR_CALC_DEFAULT;
@@ -721,7 +705,7 @@ static float one_partition_find_best_combination_for_bitcount(
int ql = quant_mode_table[best_integer_count + 1][bits_available];
best_quant_level = static_cast<quant_method>(ql);
best_quant_level = static_cast<uint8_t>(ql);
if (privateProfile == HIGH_SPEED_PROFILE) // keep openSource code style
{
best_format = FMT_RGBA;
@@ -749,9 +733,9 @@ static float one_partition_find_best_combination_for_bitcount(
*/
static void two_partitions_find_best_combination_for_every_quantization_and_integer_count(
const float best_error[2][21][4], // indexed by (partition, quant-level, integer-pair-count-minus-1)
const int best_format[2][21][4],
const uint8_t best_format[2][21][4],
float best_combined_error[21][7], // indexed by (quant-level, integer-pair-count-minus-2)
int best_combined_format[21][7][2]
uint8_t best_combined_format[21][7][2]
) {
for (int i = QUANT_2; i <= QUANT_256; i++)
{
@@ -801,11 +785,11 @@ static void two_partitions_find_best_combination_for_every_quantization_and_inte
*/
static float two_partitions_find_best_combination_for_bitcount(
float best_combined_error[21][7],
int best_combined_format[21][7][2],
uint8_t best_combined_format[21][7][2],
int bits_available,
quant_method& best_quant_level,
quant_method& best_quant_level_mod,
int* best_formats
uint8_t& best_quant_level,
uint8_t& best_quant_level_mod,
uint8_t* best_formats
) {
int best_integer_count = 0;
float best_integer_count_error = ERROR_CALC_DEFAULT;
@@ -832,8 +816,8 @@ static float two_partitions_find_best_combination_for_bitcount(
int ql = quant_mode_table[best_integer_count][bits_available];
int ql_mod = quant_mode_table[best_integer_count][bits_available + 2];
best_quant_level = static_cast<quant_method>(ql);
best_quant_level_mod = static_cast<quant_method>(ql_mod);
best_quant_level = static_cast<uint8_t>(ql);
best_quant_level_mod = static_cast<uint8_t>(ql_mod);
if (ql >= QUANT_6)
{
@@ -863,9 +847,9 @@ static float two_partitions_find_best_combination_for_bitcount(
*/
static void three_partitions_find_best_combination_for_every_quantization_and_integer_count(
const float best_error[3][21][4], // indexed by (partition, quant-level, integer-count)
const int best_format[3][21][4],
const uint8_t best_format[3][21][4],
float best_combined_error[21][10],
int best_combined_format[21][10][3]
uint8_t best_combined_format[21][10][3]
) {
for (int i = QUANT_2; i <= QUANT_256; i++)
{
@@ -926,11 +910,11 @@ static void three_partitions_find_best_combination_for_every_quantization_and_in
*/
static float three_partitions_find_best_combination_for_bitcount(
const float best_combined_error[21][10],
const int best_combined_format[21][10][3],
const uint8_t best_combined_format[21][10][3],
int bits_available,
quant_method& best_quant_level,
quant_method& best_quant_level_mod,
int* best_formats
uint8_t& best_quant_level,
uint8_t& best_quant_level_mod,
uint8_t* best_formats
) {
int best_integer_count = 0;
float best_integer_count_error = ERROR_CALC_DEFAULT;
@@ -957,8 +941,8 @@ static float three_partitions_find_best_combination_for_bitcount(
int ql = quant_mode_table[best_integer_count][bits_available];
int ql_mod = quant_mode_table[best_integer_count][bits_available + 5];
best_quant_level = static_cast<quant_method>(ql);
best_quant_level_mod = static_cast<quant_method>(ql_mod);
best_quant_level = static_cast<uint8_t>(ql);
best_quant_level_mod = static_cast<uint8_t>(ql_mod);
if (ql >= QUANT_6)
{
@@ -988,9 +972,9 @@ static float three_partitions_find_best_combination_for_bitcount(
*/
static void four_partitions_find_best_combination_for_every_quantization_and_integer_count(
const float best_error[4][21][4], // indexed by (partition, quant-level, integer-count)
const int best_format[4][21][4],
const uint8_t best_format[4][21][4],
float best_combined_error[21][13],
int best_combined_format[21][13][4]
uint8_t best_combined_format[21][13][4]
) {
for (int i = QUANT_2; i <= QUANT_256; i++)
{
@@ -1062,11 +1046,11 @@ static void four_partitions_find_best_combination_for_every_quantization_and_int
*/
static float four_partitions_find_best_combination_for_bitcount(
const float best_combined_error[21][13],
const int best_combined_format[21][13][4],
const uint8_t best_combined_format[21][13][4],
int bits_available,
quant_method& best_quant_level,
quant_method& best_quant_level_mod,
int* best_formats
uint8_t& best_quant_level,
uint8_t& best_quant_level_mod,
uint8_t* best_formats
) {
int best_integer_count = 0;
float best_integer_count_error = ERROR_CALC_DEFAULT;
@@ -1093,8 +1077,8 @@ static float four_partitions_find_best_combination_for_bitcount(
int ql = quant_mode_table[best_integer_count][bits_available];
int ql_mod = quant_mode_table[best_integer_count][bits_available + 8];
best_quant_level = static_cast<quant_method>(ql);
best_quant_level_mod = static_cast<quant_method>(ql_mod);
best_quant_level = static_cast<uint8_t>(ql);
best_quant_level_mod = static_cast<uint8_t>(ql_mod);
if (ql >= QUANT_6)
{
@@ -1121,13 +1105,13 @@ unsigned int compute_ideal_endpoint_formats(
const image_block& blk,
const endpoints& ep,
// bitcounts and errors computed for the various quantization methods
const int* qwt_bitcounts,
const int8_t* qwt_bitcounts,
const float* qwt_errors,
unsigned int tune_candidate_limit,
unsigned int start_block_mode,
unsigned int end_block_mode,
// output data
int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
int block_mode[TUNE_MAX_TRIAL_CANDIDATES],
quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],
quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],
@@ -1137,8 +1121,8 @@ unsigned int compute_ideal_endpoint_formats(
promise(partition_count > 0);
int encode_hdr_rgb = blk.rgb_lns[0];
int encode_hdr_alpha = blk.alpha_lns[0];
bool encode_hdr_rgb = static_cast<bool>(blk.rgb_lns[0]);
bool encode_hdr_alpha = static_cast<bool>(blk.alpha_lns[0]);
// Compute the errors that result from various encoding choices (such as using luminance instead
// of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on)
@@ -1146,7 +1130,7 @@ unsigned int compute_ideal_endpoint_formats(
compute_encoding_choice_errors(blk, pi, ep, eci);
float best_error[BLOCK_MAX_PARTITIONS][21][4];
int format_of_choice[BLOCK_MAX_PARTITIONS][21][4];
uint8_t format_of_choice[BLOCK_MAX_PARTITIONS][21][4];
for (int i = 0; i < partition_count; i++)
{
compute_color_error_for_every_integer_count_and_quant_level(
@@ -1156,28 +1140,24 @@ unsigned int compute_ideal_endpoint_formats(
}
float* errors_of_best_combination = tmpbuf.errors_of_best_combination;
quant_method* best_quant_levels = tmpbuf.best_quant_levels;
quant_method* best_quant_levels_mod = tmpbuf.best_quant_levels_mod;
int (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats;
uint8_t* best_quant_levels = tmpbuf.best_quant_levels;
uint8_t* best_quant_levels_mod = tmpbuf.best_quant_levels_mod;
uint8_t (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats;
// Ensure that the "overstep" of the last iteration in the vectorized loop will contain data
// that will never be picked as best candidate
const unsigned int packed_end_block_mode = round_up_to_simd_multiple_vla(end_block_mode);
// Ensure that the first iteration understep contains data that will never be picked
vfloat clear_error(ERROR_CALC_DEFAULT);
vint clear_quant(0);
// TODO: Can we avoid this?
for (unsigned int i = 0; i < start_block_mode; i++)
{
errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
best_quant_levels[i] = QUANT_2;
best_quant_levels_mod[i] = QUANT_2;
}
unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
storea(clear_error, errors_of_best_combination + packed_start_block_mode);
store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode);
store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode);
for (unsigned int i = end_block_mode; i < packed_end_block_mode; i++)
{
errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
best_quant_levels[i] = QUANT_2;
best_quant_levels_mod[i] = QUANT_2;
}
// Ensure that last iteration overstep contains data that will never be picked
unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1);
storea(clear_error, errors_of_best_combination + packed_end_block_mode);
store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode);
store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode);
// Track a scalar best to avoid expensive search at least once ...
float error_of_best_combination = ERROR_CALC_DEFAULT;
@@ -1186,7 +1166,7 @@ unsigned int compute_ideal_endpoint_formats(
// The block contains 1 partition
if (partition_count == 1)
{
for (unsigned int i = start_block_mode; i < end_block_mode; ++i)
for (unsigned int i = start_block_mode; i < end_block_mode; i++)
{
if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
{
@@ -1214,13 +1194,13 @@ unsigned int compute_ideal_endpoint_formats(
else if (partition_count == 2)
{
float combined_best_error[21][7];
int formats_of_choice[21][7][2];
uint8_t formats_of_choice[21][7][2];
two_partitions_find_best_combination_for_every_quantization_and_integer_count(
best_error, format_of_choice, combined_best_error, formats_of_choice);
assert(start_block_mode == 0);
for (unsigned int i = 0; i < end_block_mode; ++i)
for (unsigned int i = 0; i < end_block_mode; i++)
{
if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
{
@@ -1247,13 +1227,13 @@ unsigned int compute_ideal_endpoint_formats(
else if (partition_count == 3)
{
float combined_best_error[21][10];
int formats_of_choice[21][10][3];
uint8_t formats_of_choice[21][10][3];
three_partitions_find_best_combination_for_every_quantization_and_integer_count(
best_error, format_of_choice, combined_best_error, formats_of_choice);
assert(start_block_mode == 0);
for (unsigned int i = 0; i < end_block_mode; ++i)
for (unsigned int i = 0; i < end_block_mode; i++)
{
if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
{
@@ -1281,13 +1261,13 @@ unsigned int compute_ideal_endpoint_formats(
{
assert(partition_count == 4);
float combined_best_error[21][13];
int formats_of_choice[21][13][4];
uint8_t formats_of_choice[21][13][4];
four_partitions_find_best_combination_for_every_quantization_and_integer_count(
best_error, format_of_choice, combined_best_error, formats_of_choice);
assert(start_block_mode == 0);
for (unsigned int i = 0; i < end_block_mode; ++i)
for (unsigned int i = 0; i < end_block_mode; i++)
{
if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
{
@@ -1330,10 +1310,8 @@ unsigned int compute_ideal_endpoint_formats(
vint lane_ids = vint::lane_id() + vint(start_block_mode);
for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
{
vfloat err = vfloat(&errors_of_best_combination[j]);
vmask mask1 = err < vbest_ep_error;
vmask mask2 = vint(reinterpret_cast<int*>(best_quant_levels + j)) > vint(4);
vmask mask = mask1 & mask2;
vfloat err = vfloat(errors_of_best_combination + j);
vmask mask = err < vbest_ep_error;
vbest_ep_error = select(vbest_ep_error, err, mask);
vbest_error_index = select(vbest_error_index, lane_ids, mask);
lane_ids += vint(ASTCENC_SIMD_WIDTH);
@@ -1368,8 +1346,8 @@ unsigned int compute_ideal_endpoint_formats(
block_mode[i] = best_error_weights[i];
quant_level[i] = best_quant_levels[best_error_weights[i]];
quant_level_mod[i] = best_quant_levels_mod[best_error_weights[i]];
quant_level[i] = static_cast<quant_method>(best_quant_levels[best_error_weights[i]]);
quant_level_mod[i] = static_cast<quant_method>(best_quant_levels_mod[best_error_weights[i]]);
assert(quant_level[i] >= QUANT_6 && quant_level[i] <= QUANT_256);
assert(quant_level_mod[i] >= QUANT_6 && quant_level_mod[i] <= QUANT_256);
File diff suppressed because it is too large Load Diff
+148 -103
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -23,6 +23,50 @@
#include <cassert>
/**
 * @brief Mirror the bit order of an 8-bit value.
 *
 * Only the low 8 bits of @c p contribute to the result; any higher bits are
 * discarded by the masks.
 *
 * @param p   The value to reverse.
 *
 * @return The low byte of @c p with bit 0 swapped with bit 7, bit 1 with bit 6, and so on.
 */
static inline int bitrev8(int p)
{
	// Exchange the two nibbles
	int nibbles = ((p >> 4) & 0x0F) | ((p & 0x0F) << 4);

	// Exchange the bit pairs inside each nibble
	int pairs = ((nibbles >> 2) & 0x33) | ((nibbles & 0x33) << 2);

	// Exchange the neighbouring bits inside each pair
	return ((pairs >> 1) & 0x55) | ((pairs & 0x55) << 1);
}
/**
 * @brief Read a little-endian bit field at an arbitrary bit offset.
 *
 * The whole-byte part of @c bitoffset advances @c ptr; the remaining 0-7 bits
 * select the starting bit inside a 16-bit window built from the addressed byte
 * and the byte after it. The field must fit inside that window, so
 * @c bitcount + (@c bitoffset & 7) must not exceed 16. The second byte of the
 * window is always read, even when the field lies entirely in the first byte.
 *
 * @param bitcount    The number of bits to read.
 * @param bitoffset   The bit offset to read from, measured from the start of @c ptr.
 * @param ptr         The data pointer to read from.
 *
 * @return The read value, in the low @c bitcount bits of the result.
 */
static inline int read_bits(
	int bitcount,
	int bitoffset,
	const uint8_t* ptr
) {
	const uint8_t* base = ptr + (bitoffset >> 3);
	int shift = bitoffset & 7;

	// Two consecutive bytes form the 16-bit window holding the field
	int window = base[0] | (base[1] << 8);

	return (window >> shift) & ((1 << bitcount) - 1);
}
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
* @brief Write up to 8 bits at an arbitrary bit offset.
*
@@ -54,74 +98,47 @@ static inline void write_bits(
ptr[1] |= value >> 8;
}
/**
 * @brief Read a little-endian bit field at an arbitrary bit offset.
 *
 * The whole-byte part of @c bitoffset advances @c ptr; the remaining 0-7 bits
 * select the starting bit inside a 16-bit window built from the addressed byte
 * and the byte after it. The field must fit inside that window, so
 * @c bitcount + (@c bitoffset & 7) must not exceed 16. The second byte of the
 * window is always read, even when the field lies entirely in the first byte.
 *
 * @param bitcount    The number of bits to read.
 * @param bitoffset   The bit offset to read from, measured from the start of @c ptr.
 * @param ptr         The data pointer to read from.
 *
 * @return The read value, in the low @c bitcount bits of the result.
 */
static inline int read_bits(
int bitcount,
int bitoffset,
const uint8_t* ptr
) {
int mask = (1 << bitcount) - 1;
// Skip whole bytes, keeping only the sub-byte part of the offset
ptr += bitoffset >> 3;
bitoffset &= 7;
// Two consecutive bytes form the 16-bit window holding the field
int value = ptr[0] | (ptr[1] << 8);
value >>= bitoffset;
value &= mask;
return value;
}
/**
 * @brief Reverse bits in a byte.
 *
 * Only the low 8 bits of @c p contribute to the result; any higher bits are
 * discarded by the masks.
 *
 * @param p The value to reverse.
 *
 * @return The reversed result.
 */
static inline int bitrev8(int p)
{
// Swap the two nibbles
p = ((p & 0x0F) << 4) | ((p >> 4) & 0x0F);
// Swap the bit pairs inside each nibble
p = ((p & 0x33) << 2) | ((p >> 2) & 0x33);
// Swap the adjacent bits inside each pair
p = ((p & 0x55) << 1) | ((p >> 1) & 0x55);
return p;
}
static const int HIGH_SPEED_PROFILE_COLOR_BYTES = 8;
static const int HIGH_SPEED_PROFILE_WEIGHT_BYTES = 16;
/* See header for documentation. */
void symbolic_to_physical(
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
physical_compressed_block& pcb
uint8_t pcb[16]
) {
assert(scb.block_type != SYM_BTYPE_ERROR);
const auto& bm = bsd.get_block_mode(scb.block_mode);
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
int weight_count = di.weight_count;
quant_method weight_quant_method = bm.get_weight_quant_mode();
float weight_quant_levels = static_cast<float>(get_quant_level(weight_quant_method));
const auto& qat = quant_and_xfer_tables[weight_quant_method];
if (scb.privateProfile == HIGH_SPEED_PROFILE)
{
uint8_t weights[64];
for (int i = 0; i < weight_count; i++)
{
float uqw = static_cast<float>(scb.weights[i]);
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
int qwi = static_cast<int>(qw + 0.5f);
weights[i] = qat.scramble_map[qwi];
}
uint8_t weightbuf[HIGH_SPEED_PROFILE_WEIGHT_BYTES] = {0};
encode_ise(QUANT_6, HIGH_SPEED_PROFILE_WEIGHT_BYTES, scb.weights, weightbuf, 0);
encode_ise(QUANT_6, HIGH_SPEED_PROFILE_WEIGHT_BYTES, weights, weightbuf, 0);
for (int i = 0; i < HIGH_SPEED_PROFILE_WEIGHT_BYTES; i++)
{
pcb.data[i] = static_cast<uint8_t>(bitrev8(weightbuf[HIGH_SPEED_PROFILE_WEIGHT_BYTES - 1 - i]));
pcb[i] = static_cast<uint8_t>(bitrev8(weightbuf[HIGH_SPEED_PROFILE_WEIGHT_BYTES - 1 - i]));
}
pcb.data[0] = 0x43; // the first byte of every block stream is 0x43 for HIGH_SPEED_PROFILE
pcb.data[1] = 0x80; // the second byte of every block stream is 0x80 for HIGH_SPEED_PROFILE
pcb.data[2] = 0x01; // the third (2 idx) byte of every block stream is 0x01 for HIGH_SPEED_PROFILE
pcb[0] = 0x43; // the first byte of every block stream is 0x43 for HIGH_SPEED_PROFILE
pcb[1] = 0x80; // the second byte of every block stream is 0x80 for HIGH_SPEED_PROFILE
pcb[2] = 0x01; // the third (2 idx) byte of every block stream is 0x01 for HIGH_SPEED_PROFILE
uint8_t values_to_encode[HIGH_SPEED_PROFILE_COLOR_BYTES];
for (int j = 0; j < HIGH_SPEED_PROFILE_COLOR_BYTES; j++)
{
values_to_encode[j] = scb.color_values[0][j];
}
encode_ise(scb.get_color_quant_mode(), HIGH_SPEED_PROFILE_COLOR_BYTES,
values_to_encode, pcb.data, 17); // the color is starting from 17th bit for HIGH_SPEED_PROFILE
values_to_encode, pcb, 17); // the color is starting from 17th bit for HIGH_SPEED_PROFILE
return;
}
@@ -132,13 +149,13 @@ void symbolic_to_physical(
static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
for (unsigned int i = 0; i < 8; i++)
{
pcb.data[i] = cbytes[i];
pcb[i] = cbytes[i];
}
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
{
pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
}
return;
@@ -151,13 +168,13 @@ void symbolic_to_physical(
static const uint8_t cbytes[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
for (unsigned int i = 0; i < 8; i++)
{
pcb.data[i] = cbytes[i];
pcb[i] = cbytes[i];
}
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
{
pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
}
return;
@@ -169,50 +186,60 @@ void symbolic_to_physical(
// They are encoded as an ordinary integer-sequence, then bit-reversed
uint8_t weightbuf[16] { 0 };
const auto& bm = bsd.get_block_mode(scb.block_mode);
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
int weight_count = di.weight_count;
quant_method weight_quant_method = bm.get_weight_quant_mode();
int is_dual_plane = bm.is_dual_plane;
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
uint8_t weights[64];
if (is_dual_plane)
{
uint8_t weights[64];
for (int i = 0; i < weight_count; i++)
{
weights[2 * i] = scb.weights[i];
weights[2 * i + 1] = scb.weights[i + WEIGHTS_PLANE2_OFFSET];
float uqw = static_cast<float>(scb.weights[i]);
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
int qwi = static_cast<int>(qw + 0.5f);
weights[2 * i] = qat.scramble_map[qwi];
uqw = static_cast<float>(scb.weights[i + WEIGHTS_PLANE2_OFFSET]);
qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
qwi = static_cast<int>(qw + 0.5f);
weights[2 * i + 1] = qat.scramble_map[qwi];
}
encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0);
}
else
{
encode_ise(weight_quant_method, weight_count, scb.weights, weightbuf, 0);
for (int i = 0; i < weight_count; i++)
{
float uqw = static_cast<float>(scb.weights[i]);
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
int qwi = static_cast<int>(qw + 0.5f);
weights[i] = qat.scramble_map[qwi];
}
}
encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0);
for (int i = 0; i < 16; i++)
{
pcb.data[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
pcb[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
}
write_bits(scb.block_mode, 11, 0, pcb.data);
write_bits(partition_count - 1, 2, 11, pcb.data);
write_bits(scb.block_mode, 11, 0, pcb);
write_bits(partition_count - 1, 2, 11, pcb);
int below_weights_pos = 128 - bits_for_weights;
// Encode partition index and color endpoint types for blocks with 2+ partitions
if (partition_count > 1)
{
write_bits(scb.partition_index, 6, 13, pcb.data);
write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb.data);
write_bits(scb.partition_index, 6, 13, pcb);
write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb);
if (scb.color_formats_matched)
{
write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb);
}
else
{
@@ -251,44 +278,48 @@ void symbolic_to_physical(
int encoded_type_highpart = encoded_type >> 6;
int encoded_type_highpart_size = (3 * partition_count) - 4;
int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb.data);
write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb.data);
write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb);
write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb);
below_weights_pos -= encoded_type_highpart_size;
}
}
else
{
write_bits(scb.color_formats[0], 4, 13, pcb.data);
write_bits(scb.color_formats[0], 4, 13, pcb);
}
// In dual-plane mode, encode the color component of the second plane of weights
if (is_dual_plane)
{
write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb.data);
write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb);
}
// Encode the color components
uint8_t values_to_encode[32];
int valuecount_to_encode = 0;
const uint8_t* pack_table = color_uquant_to_scrambled_pquant_tables[scb.quant_mode - QUANT_6];
for (unsigned int i = 0; i < scb.partition_count; i++)
{
int vals = 2 * (scb.color_formats[i] >> 2) + 2;
assert(vals <= 8);
for (int j = 0; j < vals; j++)
{
values_to_encode[j + valuecount_to_encode] = scb.color_values[i][j];
values_to_encode[j + valuecount_to_encode] = pack_table[scb.color_values[i][j]];
}
valuecount_to_encode += vals;
}
encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb.data,
encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb,
scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
}
#endif
/* See header for documentation. */
void physical_to_symbolic(
const block_size_descriptor& bsd,
const physical_compressed_block& pcb,
const uint8_t pcb[16],
symbolic_compressed_block& scb
) {
uint8_t bswapped[16];
@@ -296,7 +327,7 @@ void physical_to_symbolic(
scb.block_type = SYM_BTYPE_NONCONST;
// Extract header fields
int block_mode = read_bits(11, 0, pcb.data);
int block_mode = read_bits(11, 0, pcb);
if ((block_mode & 0x1FF) == 0x1FC)
{
// Constant color block
@@ -314,24 +345,24 @@ void physical_to_symbolic(
scb.partition_count = 0;
for (int i = 0; i < 4; i++)
{
scb.constant_color[i] = pcb.data[2 * i + 8] | (pcb.data[2 * i + 9] << 8);
scb.constant_color[i] = pcb[2 * i + 8] | (pcb[2 * i + 9] << 8);
}
// Additionally, check that the void-extent fields are valid
if (bsd.zdim == 1)
{
// 2D void-extent
int rsvbits = read_bits(2, 10, pcb.data);
int rsvbits = read_bits(2, 10, pcb);
if (rsvbits != 3)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
int vx_low_s = read_bits(8, 12, pcb.data) | (read_bits(5, 12 + 8, pcb.data) << 8);
int vx_high_s = read_bits(8, 25, pcb.data) | (read_bits(5, 25 + 8, pcb.data) << 8);
int vx_low_t = read_bits(8, 38, pcb.data) | (read_bits(5, 38 + 8, pcb.data) << 8);
int vx_high_t = read_bits(8, 51, pcb.data) | (read_bits(5, 51 + 8, pcb.data) << 8);
int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
int vx_high_s = read_bits(8, 25, pcb) | (read_bits(5, 25 + 8, pcb) << 8);
int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
int vx_high_t = read_bits(8, 51, pcb) | (read_bits(5, 51 + 8, pcb) << 8);
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
@@ -344,12 +375,12 @@ void physical_to_symbolic(
else
{
// 3D void-extent
int vx_low_s = read_bits(9, 10, pcb.data);
int vx_high_s = read_bits(9, 19, pcb.data);
int vx_low_t = read_bits(9, 28, pcb.data);
int vx_high_t = read_bits(9, 37, pcb.data);
int vx_low_p = read_bits(9, 46, pcb.data);
int vx_high_p = read_bits(9, 55, pcb.data);
int vx_low_s = read_bits(9, 10, pcb);
int vx_high_s = read_bits(9, 19, pcb);
int vx_low_t = read_bits(9, 28, pcb);
int vx_high_t = read_bits(9, 37, pcb);
int vx_low_p = read_bits(9, 46, pcb);
int vx_high_p = read_bits(9, 55, pcb);
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
@@ -374,38 +405,47 @@ void physical_to_symbolic(
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
int weight_count = di.weight_count;
promise(weight_count > 0);
quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode);
int is_dual_plane = bm.is_dual_plane;
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
int partition_count = read_bits(2, 11, pcb.data) + 1;
int partition_count = read_bits(2, 11, pcb) + 1;
promise(partition_count > 0);
scb.block_mode = static_cast<uint16_t>(block_mode);
scb.partition_count = static_cast<uint8_t>(partition_count);
for (int i = 0; i < 16; i++)
{
bswapped[i] = static_cast<uint8_t>(bitrev8(pcb.data[15 - i]));
bswapped[i] = static_cast<uint8_t>(bitrev8(pcb[15 - i]));
}
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
int below_weights_pos = 128 - bits_for_weights;
uint8_t indices[64];
const auto& qat = quant_and_xfer_tables[weight_quant_method];
decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0);
if (is_dual_plane)
{
uint8_t indices[64];
decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0);
for (int i = 0; i < weight_count; i++)
{
scb.weights[i] = indices[2 * i];
scb.weights[i + WEIGHTS_PLANE2_OFFSET] = indices[2 * i + 1];
scb.weights[i] = qat.unscramble_and_unquant_map[indices[2 * i]];
scb.weights[i + WEIGHTS_PLANE2_OFFSET] = qat.unscramble_and_unquant_map[indices[2 * i + 1]];
}
}
else
{
decode_ise(weight_quant_method, weight_count, bswapped, scb.weights, 0);
for (int i = 0; i < weight_count; i++)
{
scb.weights[i] = qat.unscramble_and_unquant_map[indices[i]];
}
}
if (is_dual_plane && partition_count == 4)
@@ -421,14 +461,15 @@ void physical_to_symbolic(
int encoded_type_highpart_size = 0;
if (partition_count == 1)
{
color_formats[0] = read_bits(4, 13, pcb.data);
color_formats[0] = read_bits(4, 13, pcb);
scb.partition_index = 0;
}
else
{
encoded_type_highpart_size = (3 * partition_count) - 4;
below_weights_pos -= encoded_type_highpart_size;
int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb.data) | (read_bits(encoded_type_highpart_size, below_weights_pos, pcb.data) << 6);
int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb) |
(read_bits(encoded_type_highpart_size, below_weights_pos, pcb) << 6);
int baseclass = encoded_type & 0x3;
if (baseclass == 0)
{
@@ -458,7 +499,8 @@ void physical_to_symbolic(
bitpos += 2;
}
}
scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb.data) | (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb.data) << 6));
scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb) |
(read_bits(PARTITION_INDEX_BITS - 6, 19, pcb) << 6));
}
for (int i = 0; i < partition_count; i++)
@@ -502,24 +544,27 @@ void physical_to_symbolic(
// Unpack the integer color values and assign to endpoints
scb.quant_mode = static_cast<quant_method>(color_quant_level);
uint8_t values_to_decode[32];
decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb.data,
decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb,
values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
int valuecount_to_decode = 0;
const uint8_t* unpack_table = color_scrambled_pquant_to_uquant_tables[scb.quant_mode - QUANT_6];
for (int i = 0; i < partition_count; i++)
{
int vals = 2 * (color_formats[i] >> 2) + 2;
for (int j = 0; j < vals; j++)
{
scb.color_values[i][j] = values_to_decode[j + valuecount_to_decode];
scb.color_values[i][j] = unpack_table[values_to_decode[j + valuecount_to_decode]];
}
valuecount_to_decode += vals;
}
// Fetch component for second-plane in the case of dual plane of weights.
scb.plane2_component = -1;
if (is_dual_plane)
{
scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb.data));
scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb));
}
}
+13 -10
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2021 Arm Limited
// Copyright 2019-2022 Arm Limited
// Copyright 2008 Jose Fonseca
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
@@ -26,7 +26,7 @@
* with that is available at compile time. The current vector width is
* accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
*
* Explicit scalar types are acessible via the vint1, vfloat1, vmask1 types.
* Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
* These are provided primarily for prototyping and algorithm debug of VLA
* implementations.
*
@@ -60,10 +60,13 @@
#if !defined(__clang__) && defined(_MSC_VER)
#define ASTCENC_SIMD_INLINE __forceinline
#define ASTCENC_NO_INLINE
#elif defined(__GNUC__) && !defined(__clang__)
#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#else
#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#endif
#if ASTCENC_AVX >= 2
@@ -160,7 +163,7 @@
*/
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
{
return count & ~(8 - 1);
return count & static_cast<unsigned int>(~(8 - 1));
}
/**
@@ -172,7 +175,7 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int coun
*/
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
{
return count & ~(4 - 1);
return count & static_cast<unsigned int>(~(4 - 1));
}
/**
@@ -186,7 +189,7 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int coun
*/
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
{
return count & ~(ASTCENC_SIMD_WIDTH - 1);
return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
}
/**
@@ -200,7 +203,7 @@ ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int co
*/
ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
{
int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
return multiples * ASTCENC_SIMD_WIDTH;
}
@@ -219,7 +222,7 @@ ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
/**
* @brief Return fast, but approximate, vector atan(x).
*
* Max error of this implementaiton is 0.004883.
* Max error of this implementation is 0.004883.
*/
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
{
@@ -399,7 +402,7 @@ static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
// the original integer value into a 2^N encoding we can recover easily.
// Convert to float without risk of rounding up by keeping only top 8 bits.
// This trick is is guranteed to keep top 8 bits and clear the 9th.
// This trick is guaranteed to keep top 8 bits and clear the 9th.
a = (~lsr<8>(a)) & a;
a = float_as_int(int_to_float(a));
@@ -413,7 +416,7 @@ static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
/**
* @brief Return lanewise 2^a for each lane in @c a.
*
* Use of signed int mean that this is only valid for values in range [0, 31].
* Use of signed int means that this is only valid for values in range [0, 31].
*/
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
{
@@ -507,7 +510,7 @@ static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
exp = (lsr<23>(ai) & 0xFF) - 126;
// Extract and unbias the mantissa
vint4 manti = (ai & 0x807FFFFF) | 0x3F000000;
vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000;
return int_as_float(manti);
}
+198 -26
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2021 Arm Limited
// Copyright 2019-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -36,6 +36,9 @@
#include <cstdio>
// Define convenience intrinsics that are missing on older compilers
#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)
// ============================================================================
// vfloat8 data type
// ============================================================================
@@ -86,7 +89,8 @@ struct vfloat8
/**
* @brief Construct from an existing SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a) {
ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
{
m = a;
}
@@ -237,6 +241,14 @@ struct vint8
return vint8(_mm256_broadcastd_epi32(a));
}
/**
 * @brief Factory that returns a vector loaded from unaligned memory.
 *
 * A full 256-bit register (32 bytes) is read starting at @c p, with no
 * alignment requirement on the pointer.
 *
 * @param p   The address to load from.
 *
 * @return The loaded vector.
 */
static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
{
return vint8(_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(p)));
}
/**
* @brief Factory that returns a vector loaded from 32B aligned memory.
*/
@@ -340,9 +352,9 @@ ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
*
* bit0 = lane 0
*/
ASTCENC_SIMD_INLINE unsigned mask(vmask8 a)
ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
{
return _mm256_movemask_ps(a.m);
return static_cast<unsigned int>(_mm256_movemask_ps(a.m));
}
/**
@@ -354,7 +366,7 @@ ASTCENC_SIMD_INLINE bool any(vmask8 a)
}
/**
* @brief True if any lanes are enabled, false otherwise.
* @brief True if all lanes are enabled, false otherwise.
*/
ASTCENC_SIMD_INLINE bool all(vmask8 a)
{
@@ -461,6 +473,14 @@ ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
}
/**
 * @brief Logical shift left.
 *
 * Each 32-bit lane of @c a is shifted left by the compile-time constant @c s,
 * with zeros shifted in at the bottom.
 *
 * @tparam s   The shift distance in bits.
 * @param  a   The vector to shift.
 *
 * @return The shifted vector.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
{
return vint8(_mm256_slli_epi32(a.m, s));
}
/**
* @brief Arithmetic shift right.
*/
@@ -503,16 +523,13 @@ ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
// This is the most logical implementation, but the convenience intrinsic
// is missing on older compilers (supported in g++ 9 and clang++ 9).
//__m256i r = _mm256_set_m128i(m, m)
__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(m), m, 1);
__m256i r = astcenc_mm256_set_m128i(m, m);
vint8 vmin(r);
return vmin;
}
/**
* @brief Return the horizontal minimum of a vector.
* @brief Return the horizontal maximum of a vector.
*/
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
@@ -521,10 +538,7 @@ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
// This is the most logical implementation, but the convenience intrinsic
// is missing on older compilers (supported in g++ 9 and clang++ 9).
//__m256i r = _mm256_set_m128i(m, m)
__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(m), m, 1);
__m256i r = astcenc_mm256_set_m128i(m, m);
vint8 vmax(r);
return vmax;
}
@@ -578,10 +592,7 @@ ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
__m128i a1 = _mm256_extracti128_si256(a, 1);
__m128i b = _mm_unpacklo_epi32(a0, a1);
// This is the most logical implementation, but the convenience intrinsic
// is missing on older compilers (supported in g++ 9 and clang++ 9).
//__m256i r = _mm256_set_m128i(b, b)
__m256i r = _mm256_insertf128_si256(_mm256_castsi128_si256(b), b, 1);
__m256i r = astcenc_mm256_set_m128i(b, b);
return vint8(r);
}
@@ -731,6 +742,16 @@ ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
return vfloat8(_mm256_min_ps(a.m, b.m));
}
/**
 * @brief Return the lane-wise minimum of a vector and a broadcast scalar.
 *
 * The scalar is expanded to all lanes and the vector/vector overload is used,
 * so if either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
{
	vfloat8 broadcast(b);
	return min(a, broadcast);
}
/**
* @brief Return the max vector of two vectors.
*
@@ -741,6 +762,16 @@ ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
return vfloat8(_mm256_max_ps(a.m, b.m));
}
/**
 * @brief Return the lane-wise maximum of a vector and a broadcast scalar.
 *
 * The scalar is expanded to all lanes and the vector/vector overload is used,
 * so if either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
{
	vfloat8 broadcast(b);
	return max(a, broadcast);
}
/**
* @brief Return the clamped value between min and max.
*
@@ -805,13 +836,13 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
{
__m128 vlow = _mm256_castps256_ps128(a.m);
__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
vlow = _mm_min_ps(vlow, vhigh);
vlow = _mm_min_ps(vlow, vhigh);
// First do a horizontal reduction.
__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
__m128 mins = _mm_min_ps(vlow, shuf);
shuf = _mm_movehl_ps(shuf, mins);
mins = _mm_min_ss(mins, shuf);
shuf = _mm_movehl_ps(shuf, mins);
mins = _mm_min_ss(mins, shuf);
// This is the most logical implementation, but the convenience intrinsic
// is missing on older compilers (supported in g++ 9 and clang++ 9).
@@ -836,13 +867,13 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
{
__m128 vlow = _mm256_castps256_ps128(a.m);
__m128 vhigh = _mm256_extractf128_ps(a.m, 1);
vhigh = _mm_max_ps(vlow, vhigh);
vhigh = _mm_max_ps(vlow, vhigh);
// First do a horizontal reduction.
__m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
__m128 maxs = _mm_max_ps(vhigh, shuf);
shuf = _mm_movehl_ps(shuf,maxs);
maxs = _mm_max_ss(maxs, shuf);
shuf = _mm_movehl_ps(shuf,maxs);
maxs = _mm_max_ss(maxs, shuf);
// This is the most logical implementation, but the convenience intrinsic
// is missing on older compilers (supported in g++ 9 and clang++ 9).
@@ -972,6 +1003,16 @@ ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
return vint8(_mm256_cvttps_epi32(a.m));
}
/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 *
 * Implemented as "add 0.5, then convert with truncation", so the result only
 * matches true round-to-nearest for non-negative lanes.
 * NOTE(review): presumably callers only pass non-negative values - confirm.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
{
	vfloat8 biased = a + vfloat8(0.5f);
	return vint8(_mm256_cvttps_epi32(biased.m));
}
/**
* @brief Return a float value for an integer vector.
*/
@@ -1004,23 +1045,154 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
return vfloat8(_mm256_castsi256_ps(a.m));
}
/**
 * @brief Prepare a 16-entry vtable lookup table for use with the native SIMD size.
 *
 * The 128-bit table is replicated into both halves of the 256-bit output.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p)
{
	// AVX2 byte shuffles work per 128-bit half, so each half needs the table
	t0p = vint8(astcenc_mm256_set_m128i(t0.m, t0.m));
}

/**
 * @brief Prepare a 32-entry vtable lookup table for use with the native SIMD size.
 *
 * The first table is replicated as-is; the second is stored XORed with the
 * first, and each result is replicated into both 128-bit halves.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p)
{
	__m128i t01 = _mm_xor_si128(t0.m, t1.m);

	// AVX2 byte shuffles work per 128-bit half, so each half needs the table
	t0p = vint8(astcenc_mm256_set_m128i(t0.m, t0.m));
	t1p = vint8(astcenc_mm256_set_m128i(t01, t01));
}

/**
 * @brief Prepare a 64-entry vtable lookup table for use with the native SIMD size.
 *
 * The first table is replicated as-is; every later table is stored XORed with
 * its predecessor, and each result is replicated into both 128-bit halves.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p)
{
	__m128i t01 = _mm_xor_si128(t0.m, t1.m);
	__m128i t12 = _mm_xor_si128(t1.m, t2.m);
	__m128i t23 = _mm_xor_si128(t2.m, t3.m);

	// AVX2 byte shuffles work per 128-bit half, so each half needs the table
	t0p = vint8(astcenc_mm256_set_m128i(t0.m, t0.m));
	t1p = vint8(astcenc_mm256_set_m128i(t01, t01));
	t2p = vint8(astcenc_mm256_set_m128i(t12, t12));
	t3p = vint8(astcenc_mm256_set_m128i(t23, t23));
}
/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 *
 * The upper three bytes of every 32-bit index are forced to 0xFF; a shuffle
 * index byte with its MSB set yields zero, so only the low byte of each lane
 * carries a looked-up value.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx)
{
	const __m256i unused_bytes = _mm256_set1_epi32(static_cast<int>(0xFFFFFF00));

	__m256i byte_idx = _mm256_or_si256(idx.m, unused_bytes);
	return vint8(_mm256_shuffle_epi8(t0.m, byte_idx));
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 *
 * Each table is probed with the index rebased by 16 per table, and the
 * partial results are XORed together. This matches tables produced by the
 * two-table @c vtable_prepare(), which stores the second table XORed with
 * the first.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx)
{
	const __m256i unused_bytes = _mm256_set1_epi32(static_cast<int>(0xFFFFFF00));
	const __m256i table_step = _mm256_set1_epi8(16);

	// MSB-set index bytes make the shuffle return zero for unused bytes
	__m256i byte_idx = _mm256_or_si256(idx.m, unused_bytes);
	__m256i part0 = _mm256_shuffle_epi8(t0.m, byte_idx);

	byte_idx = _mm256_sub_epi8(byte_idx, table_step);
	__m256i part1 = _mm256_shuffle_epi8(t1.m, byte_idx);

	return vint8(_mm256_xor_si256(part0, part1));
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 *
 * Each table is probed with the index rebased by 16 per table, and the
 * partial results are XORed together. This matches tables produced by the
 * four-table @c vtable_prepare(), which stores each table XORed with its
 * predecessor.
 */
ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx)
{
	const __m256i unused_bytes = _mm256_set1_epi32(static_cast<int>(0xFFFFFF00));
	const __m256i table_step = _mm256_set1_epi8(16);

	// MSB-set index bytes make the shuffle return zero for unused bytes
	__m256i byte_idx = _mm256_or_si256(idx.m, unused_bytes);
	__m256i acc = _mm256_shuffle_epi8(t0.m, byte_idx);

	byte_idx = _mm256_sub_epi8(byte_idx, table_step);
	acc = _mm256_xor_si256(acc, _mm256_shuffle_epi8(t1.m, byte_idx));

	byte_idx = _mm256_sub_epi8(byte_idx, table_step);
	acc = _mm256_xor_si256(acc, _mm256_shuffle_epi8(t2.m, byte_idx));

	byte_idx = _mm256_sub_epi8(byte_idx, table_step);
	acc = _mm256_xor_si256(acc, _mm256_shuffle_epi8(t3.m, byte_idx));

	return vint8(acc);
}
/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Each input vector must hold its channel value in the bottom 8 bits of every
 * lane, with the high bits set to zero.
 *
 * Each output lane holds one packed texel: R in bits 0-7, G in bits 8-15,
 * B in bits 16-23, and A in bits 24-31.
 */
ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
{
	vint8 texels = r;
	texels = texels + lsl<8>(g);
	texels = texels + lsl<16>(b);
	texels = texels + lsl<24>(a);
	return texels;
}
/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 * The write is done with a single AVX2 masked store driven by @c mask.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
{
	int* dst = reinterpret_cast<int*>(base);
	__m256i lane_enable = _mm256_castps_si256(mask.m);
	_mm256_maskstore_epi32(dst, lane_enable, data.m);
}
/**
* @brief Debug function to print a vector of ints.
*/
ASTCENC_SIMD_INLINE void print(vint8 a)
{
alignas(ASTCENC_VECALIGN) int v[8];
alignas(32) int v[8];
storea(a, v);
printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}
/**
 * @brief Debug function to print a vector of ints as 8-digit hexadecimal.
 */
ASTCENC_SIMD_INLINE void printx(vint8 a)
{
	// storea() needs a 32-byte aligned destination
	alignas(32) int lanes[8];
	storea(a, lanes);

	printf("v8_i32:\n  %08x %08x %08x %08x %08x %08x %08x %08x\n",
	       lanes[0], lanes[1], lanes[2], lanes[3],
	       lanes[4], lanes[5], lanes[6], lanes[7]);
}
/**
* @brief Debug function to print a vector of floats.
*/
ASTCENC_SIMD_INLINE void print(vfloat8 a)
{
alignas(ASTCENC_VECALIGN) float v[8];
alignas(32) float v[8];
storea(a, v);
printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
static_cast<double>(v[0]), static_cast<double>(v[1]),
+31 -3
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2021 Arm Limited
// Copyright 2020-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -361,23 +361,51 @@ static inline int popcount(uint64_t v)
#endif
/**
 * @brief Apply signed bit transfer.
 *
 * Bit 7 of @c input0 is moved into bit 7 of @c input1 after @c input1 has been
 * shifted right by one. @c input0 is then shifted right by one, masked to six
 * bits, and sign-extended from bit 5.
 *
 * @param input0   The first encoded endpoint; updated in place.
 * @param input1   The second encoded endpoint; updated in place.
 */
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
	vint4& input0,
	vint4& input1
) {
	// Capture the bit to transfer before input0 is modified
	vint4 transferred_bit = input0 & 0x80;
	input1 = lsr<1>(input1) | transferred_bit;

	input0 = lsr<1>(input0) & 0x3F;

	// Bit 5 set means the 6-bit value is negative; subtract 64 to sign-extend
	vmask4 is_negative = (input0 & 0x20) != vint4::zero();
	input0 = select(input0, input0 - 0x40, is_negative);
}
/**
* @brief Debug function to print a vector of ints.
*/
ASTCENC_SIMD_INLINE void print(vint4 a)
{
alignas(16) int v[4];
ASTCENC_ALIGNAS int v[4];
storea(a, v);
printf("v4_i32:\n %8d %8d %8d %8d\n",
v[0], v[1], v[2], v[3]);
}
/**
 * @brief Debug function to print a vector of ints as 8-digit hexadecimal.
 */
ASTCENC_SIMD_INLINE void printx(vint4 a)
{
	// storea() needs an aligned destination buffer
	ASTCENC_ALIGNAS int lanes[4];
	storea(a, lanes);

	printf("v4_i32:\n  %08x %08x %08x %08x\n",
	       lanes[0], lanes[1], lanes[2], lanes[3]);
}
/**
* @brief Debug function to print a vector of floats.
*/
ASTCENC_SIMD_INLINE void print(vfloat4 a)
{
alignas(16) float v[4];
ASTCENC_ALIGNAS float v[4];
storea(a, v);
printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
static_cast<double>(v[0]), static_cast<double>(v[1]),
+168 -10
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
// Copyright 2019-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -38,6 +38,7 @@
#endif
#include <cstdio>
#include <cstring>
// ============================================================================
// vfloat4 data type
@@ -106,7 +107,7 @@ struct vfloat4
*/
template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
{
m = vld1q_lane_f32(&a, m, l);
m = vsetq_lane_f32(a, m, l);
}
/**
@@ -122,7 +123,7 @@ struct vfloat4
*/
static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
{
return vfloat4(vdupq_n_f32(*p));
return vfloat4(vld1q_dup_f32(p));
}
/**
@@ -202,9 +203,8 @@ struct vint4
*/
ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
{
uint32x2_t t8 {};
// Cast is safe - NEON loads are allowed to be unaligned
t8 = vld1_lane_u32((const uint32_t*)p, t8, 0);
uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
m = vreinterpretq_s32_u32(vmovl_u16(t16));
}
@@ -251,7 +251,7 @@ struct vint4
*/
template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
{
m = vld1q_lane_s32(&a, m, l);
m = vsetq_lane_s32(a, m, l);
}
/**
@@ -270,6 +270,16 @@ struct vint4
return vint4(*p);
}
/**
 * @brief Factory that returns a vector loaded from unaligned memory.
 *
 * Copies four ints' worth of bytes from @c p with memcpy, so the pointer needs
 * no particular alignment.
 */
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
{
	vint4 result;
	std::memcpy(&result.m, p, sizeof(int) * 4);
	return result;
}
/**
* @brief Factory that returns a vector loaded from 16B aligned memory.
*/
@@ -283,7 +293,7 @@ struct vint4
*/
static ASTCENC_SIMD_INLINE vint4 lane_id()
{
alignas(ASTCENC_VECALIGN) static const int data[4] { 0, 1, 2, 3 };
alignas(16) static const int data[4] { 0, 1, 2, 3 };
return vint4(vld1q_s32(data));
}
@@ -346,6 +356,14 @@ struct vmask4
m = vreinterpretq_u32_s32(ms);
}
/**
 * @brief Get the scalar from a single lane.
 *
 * @return @c true if any bit of lane @c l is set, @c false otherwise.
 */
template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
{
	uint32_t lane_bits = vgetq_lane_u32(m, l);
	return lane_bits != 0;
}
/**
* @brief The vector ...
*/
@@ -577,12 +595,20 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
vst1q_s32(p, a.m);
}
/**
 * @brief Store a vector to an unaligned memory address.
 *
 * Writes four ints' worth of bytes to @c p with memcpy, so the pointer needs
 * no particular alignment.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
	std::memcpy(p, &a.m, 4 * sizeof(int));
}
/**
* @brief Store lowest N (vector width) bytes into an unaligned address.
*/
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
vst1q_lane_s32((int32_t*)p, a.m, 0);
vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}
/**
@@ -842,7 +868,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
*/
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
a = round(a);
a = a + vfloat4(0.5f);
return vint4(vcvtq_s32_f32(a.m));
}
@@ -874,7 +900,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
static inline uint16_t float_to_float16(float a)
{
vfloat4 av(a);
return float_to_float16(av).lane<0>();
return static_cast<uint16_t>(float_to_float16(av).lane<0>());
}
/**
@@ -924,6 +950,138 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
return vfloat4(vreinterpretq_f32_s32(v.m));
}
/**
 * @brief Prepare a 16-entry vtable lookup table for use with the native SIMD size.
 *
 * This implementation needs no transformation; the table is passed through.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
	t0p = t0;
}

/**
 * @brief Prepare a 32-entry vtable lookup table for use with the native SIMD size.
 *
 * This implementation needs no transformation; the tables are passed through.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
	t1p = t1;
	t0p = t0;
}

/**
 * @brief Prepare a 64-entry vtable lookup table for use with the native SIMD size.
 *
 * This implementation needs no transformation; the tables are passed through.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
	t3p = t3;
	t2p = t2;
	t1p = t1;
	t0p = t0;
}
/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 *
 * The upper three bytes of every 32-bit index are forced to 0xFF so that they
 * fall outside the table, leaving only the low byte of each lane populated.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
	int8x16_t table {
		vreinterpretq_s8_s32(t0.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero.
	// The explicit cast avoids an implicit unsigned-to-signed literal conversion.
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(static_cast<int>(0xFFFFFF00)));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
	return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes)));
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 *
 * The upper three bytes of every 32-bit index are forced to 0xFF so that they
 * fall outside the table, leaving only the low byte of each lane populated.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
	int8x16x2_t table {
		vreinterpretq_s8_s32(t0.m),
		vreinterpretq_s8_s32(t1.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero.
	// The explicit cast avoids an implicit unsigned-to-signed literal conversion.
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(static_cast<int>(0xFFFFFF00)));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
	return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes)));
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 *
 * The upper three bytes of every 32-bit index are forced to 0xFF so that they
 * fall outside the table, leaving only the low byte of each lane populated.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
	int8x16x4_t table {
		vreinterpretq_s8_s32(t0.m),
		vreinterpretq_s8_s32(t1.m),
		vreinterpretq_s8_s32(t2.m),
		vreinterpretq_s8_s32(t3.m)
	};

	// Set index byte above max index for unused bytes so table lookup returns zero.
	// The explicit cast avoids an implicit unsigned-to-signed literal conversion.
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(static_cast<int>(0xFFFFFF00)));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);
	return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes)));
}
/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Each input vector must hold its channel value in the bottom 8 bits of every
 * lane, with the high bits set to zero.
 *
 * Each output lane holds one packed texel: R in bits 0-7, G in bits 8-15,
 * B in bits 16-23, and A in bits 24-31.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
	vint4 texels = r;
	texels = texels + lsl<8>(g);
	texels = texels + lsl<16>(b);
	texels = texels + lsl<24>(a);
	return texels;
}
/**
 * @brief Store a single vector lane to an unaligned address.
 *
 * The int is written with memcpy, so @c base needs no particular alignment.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
	std::memcpy(base, &data, sizeof(data));
}
/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 * The highest enabled lane decides how many leading lanes are written; if no
 * lane is enabled nothing is stored.
 *
 * @param base   Destination address; no alignment is required.
 * @param data   The lanes to store.
 * @param mask   The lane enable mask.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
	// vmask4::lane<>() returns bool, so test it directly rather than comparing
	// against a floating-point zero.
	if (mask.lane<3>())
	{
		store(data, base);
	}
	else if (mask.lane<2>())
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
		store_lane(base + 8, data.lane<2>());
	}
	else if (mask.lane<1>())
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
	}
	else if (mask.lane<0>())
	{
		store_lane(base + 0, data.lane<0>());
	}
}
#define ASTCENC_USE_NATIVE_POPCOUNT 1
/**
+197 -37
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2021 Arm Limited
// Copyright 2019-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -275,6 +275,16 @@ struct vint4
return vint4(*p);
}
/**
 * @brief Factory that returns a vector loaded from unaligned memory.
 *
 * Copies four ints' worth of bytes from @c p with memcpy, so the pointer needs
 * no particular alignment.
 */
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
{
	vint4 result;
	std::memcpy(&result.m, p, sizeof(int) * 4);
	return result;
}
/**
* @brief Factory that returns a vector loaded from 16B aligned memory.
*/
@@ -341,6 +351,13 @@ struct vmask4
m[3] = d == false ? 0 : -1;
}
/**
 * @brief Get the scalar value of a single lane.
 *
 * Returns @c bool, matching the NEON and SSE implementations of this
 * function. Callers that compare the result with zero or use it in a
 * condition behave the same as with the previous @c float return type.
 *
 * @return @c true if lane @c l is non-zero, @c false otherwise.
 */
template <int l> ASTCENC_SIMD_INLINE bool lane() const
{
	return m[l] != 0;
}
/**
* @brief The vector ...
@@ -550,10 +567,15 @@ template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
*/
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
return vint4((int)(((unsigned int)a.m[0]) >> s),
(int)(((unsigned int)a.m[1]) >> s),
(int)(((unsigned int)a.m[2]) >> s),
(int)(((unsigned int)a.m[3]) >> s));
unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
return vint4(static_cast<int>(as0),
static_cast<int>(as1),
static_cast<int>(as2),
static_cast<int>(as3));
}
/**
@@ -639,13 +661,20 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
p[3] = a.m[3];
}
/**
 * @brief Store a vector to an unaligned memory address.
 *
 * Writes four ints' worth of bytes to @c p with memcpy, so the pointer needs
 * no particular alignment.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
	std::memcpy(p, a.m, 4 * sizeof(int));
}
/**
* @brief Store lowest N (vector width) bytes into an unaligned address.
*/
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
int* pi = (int*)p;
*pi = a.m[0];
std::memcpy(p, a.m, sizeof(uint8_t) * 4);
}
/**
@@ -678,10 +707,10 @@ ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
*/
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
return vint4((cond.m[0] & 0x80000000) ? b.m[0] : a.m[0],
(cond.m[1] & 0x80000000) ? b.m[1] : a.m[1],
(cond.m[2] & 0x80000000) ? b.m[2] : a.m[2],
(cond.m[3] & 0x80000000) ? b.m[3] : a.m[3]);
return vint4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
(cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
(cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
(cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}
// ============================================================================
@@ -892,10 +921,10 @@ ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
*/
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
return vfloat4((cond.m[0] & 0x80000000) ? b.m[0] : a.m[0],
(cond.m[1] & 0x80000000) ? b.m[1] : a.m[1],
(cond.m[2] & 0x80000000) ? b.m[2] : a.m[2],
(cond.m[3] & 0x80000000) ? b.m[3] : a.m[3]);
return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
(cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
(cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
(cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}
/**
@@ -903,10 +932,10 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
*/
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
{
return vfloat4((cond.m[0] & 0x80000000) ? b.m[0] : a.m[0],
(cond.m[1] & 0x80000000) ? b.m[1] : a.m[1],
(cond.m[2] & 0x80000000) ? b.m[2] : a.m[2],
(cond.m[3] & 0x80000000) ? b.m[3] : a.m[3]);
return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
(cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
(cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
(cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}
/**
@@ -947,10 +976,10 @@ ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* ptr)
*/
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
return vint4((int)a.m[0],
(int)a.m[1],
(int)a.m[2],
(int)a.m[3]);
return vint4(static_cast<int>(a.m[0]),
static_cast<int>(a.m[1]),
static_cast<int>(a.m[2]),
static_cast<int>(a.m[3]));
}
/**
@@ -958,10 +987,11 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
*/
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
return vint4((int)(a.m[0] + 0.5f),
(int)(a.m[1] + 0.5f),
(int)(a.m[2] + 0.5f),
(int)(a.m[3] + 0.5f));
a = a + vfloat4(0.5f);
return vint4(static_cast<int>(a.m[0]),
static_cast<int>(a.m[1]),
static_cast<int>(a.m[2]),
static_cast<int>(a.m[3]));
}
/**
@@ -969,10 +999,10 @@ ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
*/
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
return vfloat4((float)a.m[0],
(float)a.m[1],
(float)a.m[2],
(float)a.m[3]);
return vfloat4(static_cast<float>(a.m[0]),
static_cast<float>(a.m[1]),
static_cast<float>(a.m[2]),
static_cast<float>(a.m[3]));
}
/**
@@ -1001,10 +1031,10 @@ static inline uint16_t float_to_float16(float a)
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
return vfloat4(
sf16_to_float(a.lane<0>()),
sf16_to_float(a.lane<1>()),
sf16_to_float(a.lane<2>()),
sf16_to_float(a.lane<3>()));
sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
}
/**
@@ -1025,7 +1055,7 @@ ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
vint4 r;
memcpy(r.m, a.m, 4 * 4);
std::memcpy(r.m, a.m, 4 * 4);
return r;
}
@@ -1039,8 +1069,138 @@ ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
{
vfloat4 r;
memcpy(r.m, a.m, 4 * 4);
std::memcpy(r.m, a.m, 4 * 4);
return r;
}
/**
 * @brief Prepare a 16-entry vtable lookup table for use with the native SIMD size.
 *
 * The scalar implementation needs no transformation; the table is passed through.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
	t0p = t0;
}

/**
 * @brief Prepare a 32-entry vtable lookup table for use with the native SIMD size.
 *
 * The scalar implementation needs no transformation; the tables are passed through.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
	t1p = t1;
	t0p = t0;
}

/**
 * @brief Prepare a 64-entry vtable lookup table for use with the native SIMD size.
 *
 * The scalar implementation needs no transformation; the tables are passed through.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
	t3p = t3;
	t2p = t2;
	t1p = t1;
	t0p = t0;
}
/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 *
 * The table vector is flattened into a byte array and each lane of @c idx
 * selects one byte from it. Indexes are not range checked.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
	uint8_t bytes[16];
	std::memcpy(bytes, t0.m, sizeof(int) * 4);

	int i0 = idx.lane<0>();
	int i1 = idx.lane<1>();
	int i2 = idx.lane<2>();
	int i3 = idx.lane<3>();

	return vint4(bytes[i0], bytes[i1], bytes[i2], bytes[i3]);
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 *
 * The table vectors are flattened back-to-back into a byte array and each lane
 * of @c idx selects one byte from it. Indexes are not range checked.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
	uint8_t bytes[32];
	std::memcpy(bytes, t0.m, sizeof(int) * 4);
	std::memcpy(bytes + 16, t1.m, sizeof(int) * 4);

	int i0 = idx.lane<0>();
	int i1 = idx.lane<1>();
	int i2 = idx.lane<2>();
	int i3 = idx.lane<3>();

	return vint4(bytes[i0], bytes[i1], bytes[i2], bytes[i3]);
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 *
 * The table vectors are flattened back-to-back into a byte array and each lane
 * of @c idx selects one byte from it. Indexes are not range checked.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
	uint8_t bytes[64];
	std::memcpy(bytes, t0.m, sizeof(int) * 4);
	std::memcpy(bytes + 16, t1.m, sizeof(int) * 4);
	std::memcpy(bytes + 32, t2.m, sizeof(int) * 4);
	std::memcpy(bytes + 48, t3.m, sizeof(int) * 4);

	int i0 = idx.lane<0>();
	int i1 = idx.lane<1>();
	int i2 = idx.lane<2>();
	int i3 = idx.lane<3>();

	return vint4(bytes[i0], bytes[i1], bytes[i2], bytes[i3]);
}
/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Each input vector must hold its channel value in the bottom 8 bits of every
 * lane, with the high bits set to zero.
 *
 * Each output lane holds one packed texel: R in bits 0-7, G in bits 8-15,
 * B in bits 16-23, and A in bits 24-31.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
	vint4 texels = r;
	texels = texels + lsl<8>(g);
	texels = texels + lsl<16>(b);
	texels = texels + lsl<24>(a);
	return texels;
}
/**
 * @brief Store a single vector lane to an unaligned address.
 *
 * The int is written with memcpy, so @c base needs no particular alignment.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
	std::memcpy(base, &data, sizeof(data));
}
/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 * Input is a byte array of at least 4 bytes per unmasked entry.
 *
 * The highest enabled lane decides how many leading lanes are written; if no
 * lane is enabled nothing is stored.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
	// Full vector enabled: write all four lanes in one go
	if (mask.m[3])
	{
		store(data, base);
		return;
	}

	// Otherwise work out how many leading lanes are enabled
	int active = 0;
	if (mask.m[2])
	{
		active = 3;
	}
	else if (mask.m[1])
	{
		active = 2;
	}
	else if (mask.m[0])
	{
		active = 1;
	}

	if (active > 0)
	{
		store_lane(base + 0, data.lane<0>());
	}

	if (active > 1)
	{
		store_lane(base + 4, data.lane<1>());
	}

	if (active > 2)
	{
		store_lane(base + 8, data.lane<2>());
	}
}
#endif // #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED
+240 -9
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2021 Arm Limited
// Copyright 2019-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -39,6 +39,7 @@
#endif
#include <cstdio>
#include <cstring>
// ============================================================================
// vfloat4 data type
@@ -292,6 +293,18 @@ struct vint4
return vint4(*p);
}
/**
 * @brief Factory that returns a vector loaded from unaligned memory.
 *
 * Builds with SSE4.1 or newer use the LDDQU load; older builds use MOVDQU.
 * Neither requires @c p to be aligned.
 */
static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
{
	const __m128i* src = reinterpret_cast<const __m128i*>(p);
#if ASTCENC_SSE >= 41
	return vint4(_mm_lddqu_si128(src));
#else
	return vint4(_mm_loadu_si128(src));
#endif
}
/**
* @brief Factory that returns a vector loaded from 16B aligned memory.
*/
@@ -363,6 +376,14 @@ struct vmask4
m = _mm_castsi128_ps(mask.m);
}
/**
 * @brief Get the scalar value of a single lane.
 *
 * Lane @c l is shuffled into element 0, read back as a float, and compared
 * with zero. A lane with all bits set reads as NaN, which also compares
 * unequal to zero and therefore reports @c true.
 */
template <int l> ASTCENC_SIMD_INLINE bool lane() const
{
	__m128 selected = _mm_shuffle_ps(m, m, l);
	float lane_value = _mm_cvtss_f32(selected);
	return lane_value != 0.0f;
}
/**
* @brief The vector ...
*/
@@ -412,7 +433,7 @@ ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
*/
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
return _mm_movemask_ps(a.m);
return static_cast<unsigned int>(_mm_movemask_ps(a.m));
}
// ============================================================================
@@ -625,6 +646,14 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
_mm_storeu_ps(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
}
/**
 * @brief Store a vector to an unaligned memory address.
 *
 * Writes four ints' worth of bytes to @c p with memcpy, so the pointer needs
 * no particular alignment.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
	std::memcpy(p, &a.m, 4 * sizeof(int));
}
/**
* @brief Store lowest N (vector width) bytes into an unaligned address.
*/
@@ -801,7 +830,7 @@ ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
return vfloat4(_mm_round_ps(a.m, flags));
#else
__m128 v = a.m;
__m128 neg_zero = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
__m128 neg_zero = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x80000000)));
__m128 no_fraction = _mm_set1_ps(8388608.0f);
__m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
__m128 sign = _mm_and_ps(v, neg_zero);
@@ -926,7 +955,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
*/
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
a = round(a);
a = a + vfloat4(0.5f);
return vint4(_mm_cvttps_epi32(a.m));
}
@@ -980,10 +1009,10 @@ ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
return vfloat4(f32);
#else
return vfloat4(
sf16_to_float(a.lane<0>()),
sf16_to_float(a.lane<1>()),
sf16_to_float(a.lane<2>()),
sf16_to_float(a.lane<3>()));
sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
#endif
}
@@ -993,7 +1022,7 @@ ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
#if ASTCENC_F16C >= 1
__m128i packed = _mm_set1_epi16(a);
__m128i packed = _mm_set1_epi16(static_cast<short>(a));
__m128 f32 = _mm_cvtph_ps(packed);
return _mm_cvtss_f32(f32);
#else
@@ -1025,6 +1054,208 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
return vfloat4(_mm_castsi128_ps(v.m));
}
/**
 * @brief Prepare a 16-entry vtable lookup table for use with the native SIMD size.
 *
 * A single table needs no transformation and is passed through.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
	t0p = t0;
}

/**
 * @brief Prepare a 32-entry vtable lookup table for use with the native SIMD size.
 *
 * On SSE4.1 and newer the second table is stored XORed with the first; older
 * builds pass both tables through unchanged.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
	// The first table is the same on both paths
	t0p = t0;

#if ASTCENC_SSE >= 41
	t1p = t0 ^ t1;
#else
	t1p = t1;
#endif
}

/**
 * @brief Prepare a 64-entry vtable lookup table for use with the native SIMD size.
 *
 * On SSE4.1 and newer every table after the first is stored XORed with its
 * predecessor; older builds pass all tables through unchanged.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
	// The first table is the same on both paths
	t0p = t0;

#if ASTCENC_SSE >= 41
	t1p = t0 ^ t1;
	t2p = t1 ^ t2;
	t3p = t2 ^ t3;
#else
	t1p = t1;
	t2p = t2;
	t3p = t3;
#endif
}
/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 *
 * SSE4.1 and newer builds use a byte shuffle; older builds flatten the table
 * to a byte array and index it lane by lane.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
#if ASTCENC_SSE >= 41
	// MSB-set index bytes make the shuffle return zero for unused bytes
	const __m128i unused_bytes = _mm_set1_epi32(static_cast<int>(0xFFFFFF00));
	__m128i byte_idx = _mm_or_si128(idx.m, unused_bytes);
	return vint4(_mm_shuffle_epi8(t0.m, byte_idx));
#else
	uint8_t bytes[16];
	std::memcpy(bytes, &t0.m, sizeof(int) * 4);

	int i0 = idx.lane<0>();
	int i1 = idx.lane<1>();
	int i2 = idx.lane<2>();
	int i3 = idx.lane<3>();

	return vint4(bytes[i0], bytes[i1], bytes[i2], bytes[i3]);
#endif
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 *
 * SSE4.1 and newer builds probe each table with the index rebased by 16 per
 * table and XOR the partial results together. This matches tables produced by
 * the two-table @c vtable_prepare(), which stores the second table XORed with
 * the first. Older builds flatten the tables to a byte array and index it
 * lane by lane.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
#if ASTCENC_SSE >= 41
	// MSB-set index bytes make the shuffle return zero for unused bytes
	const __m128i unused_bytes = _mm_set1_epi32(static_cast<int>(0xFFFFFF00));
	const __m128i table_step = _mm_set1_epi8(16);

	__m128i byte_idx = _mm_or_si128(idx.m, unused_bytes);
	__m128i part0 = _mm_shuffle_epi8(t0.m, byte_idx);

	byte_idx = _mm_sub_epi8(byte_idx, table_step);
	__m128i part1 = _mm_shuffle_epi8(t1.m, byte_idx);

	return vint4(_mm_xor_si128(part0, part1));
#else
	uint8_t bytes[32];
	std::memcpy(bytes, &t0.m, sizeof(int) * 4);
	std::memcpy(bytes + 16, &t1.m, sizeof(int) * 4);

	int i0 = idx.lane<0>();
	int i1 = idx.lane<1>();
	int i2 = idx.lane<2>();
	int i3 = idx.lane<3>();

	return vint4(bytes[i0], bytes[i1], bytes[i2], bytes[i3]);
#endif
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 *
 * SSE4.1 and newer builds probe each table with the index rebased by 16 per
 * table and XOR the partial results together. This matches tables produced by
 * the four-table @c vtable_prepare(), which stores each table XORed with its
 * predecessor. Older builds flatten the tables to a byte array and index it
 * lane by lane.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
#if ASTCENC_SSE >= 41
	// MSB-set index bytes make the shuffle return zero for unused bytes
	const __m128i unused_bytes = _mm_set1_epi32(static_cast<int>(0xFFFFFF00));
	const __m128i table_step = _mm_set1_epi8(16);

	__m128i byte_idx = _mm_or_si128(idx.m, unused_bytes);
	__m128i acc = _mm_shuffle_epi8(t0.m, byte_idx);

	byte_idx = _mm_sub_epi8(byte_idx, table_step);
	acc = _mm_xor_si128(acc, _mm_shuffle_epi8(t1.m, byte_idx));

	byte_idx = _mm_sub_epi8(byte_idx, table_step);
	acc = _mm_xor_si128(acc, _mm_shuffle_epi8(t2.m, byte_idx));

	byte_idx = _mm_sub_epi8(byte_idx, table_step);
	acc = _mm_xor_si128(acc, _mm_shuffle_epi8(t3.m, byte_idx));

	return vint4(acc);
#else
	uint8_t bytes[64];
	std::memcpy(bytes, &t0.m, sizeof(int) * 4);
	std::memcpy(bytes + 16, &t1.m, sizeof(int) * 4);
	std::memcpy(bytes + 32, &t2.m, sizeof(int) * 4);
	std::memcpy(bytes + 48, &t3.m, sizeof(int) * 4);

	int i0 = idx.lane<0>();
	int i1 = idx.lane<1>();
	int i2 = idx.lane<2>();
	int i3 = idx.lane<3>();

	return vint4(bytes[i0], bytes[i1], bytes[i2], bytes[i3]);
#endif
}
/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Each input vector must hold its channel value in the bottom 8 bits of every
 * lane, with the high bits set to zero.
 *
 * Each output lane holds one packed texel: R in bits 0-7, G in bits 8-15,
 * B in bits 16-23, and A in bits 24-31.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
#if defined(__APPLE__)
	// Workaround an XCode compiler internal fault; note is slower than slli_epi32
	// so we should revert this when we get the opportunity. The whole-register
	// byte shift is equivalent here because the high bytes of each lane are zero.
	__m128i g_shifted = _mm_bslli_si128(g.m, 1);
	__m128i b_shifted = _mm_bslli_si128(b.m, 2);
	__m128i a_shifted = _mm_bslli_si128(a.m, 3);
#else
	__m128i g_shifted = _mm_slli_epi32(g.m, 8);
	__m128i b_shifted = _mm_slli_epi32(b.m, 16);
	__m128i a_shifted = _mm_slli_epi32(a.m, 24);
#endif

	__m128i texels = _mm_add_epi32(r.m, g_shifted);
	texels = _mm_add_epi32(texels, b_shifted);
	texels = _mm_add_epi32(texels, a_shifted);
	return vint4(texels);
}
/**
 * @brief Write one vector lane value to a possibly unaligned address.
 *
 * The value is copied bytewise, so @c base needs no particular alignment.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
    const int lane_value = data;
    std::memcpy(base, &lane_value, sizeof(lane_value));
}
/**
 * @brief Store a vector to memory, leaving out the masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
#if ASTCENC_AVX >= 2
    _mm_maskstore_epi32(reinterpret_cast<int*>(base), _mm_castps_si128(mask.m), data.m);
#else
    // Note - _mm_maskmoveu_si128 cannot be used here as the underlying
    // hardware doesn't guarantee fault suppression on masked lanes, so it can
    // page fault at the end of an image.

    // Highest lane enabled: the whole vector can be written in one go
    if (mask.lane<3>() != 0.0f)
    {
        store(data, base);
        return;
    }

    // Otherwise the highest enabled lane decides how many leading lanes are
    // written; every lane below it is written as well
    const bool keep2 = mask.lane<2>() != 0.0f;
    const bool keep1 = keep2 || (mask.lane<1>() != 0.0f);
    const bool keep0 = keep1 || (mask.lane<0>() != 0.0f);

    if (keep0)
    {
        store_lane(base + 0, data.lane<0>());
    }

    if (keep1)
    {
        store_lane(base + 4, data.lane<1>());
    }

    if (keep2)
    {
        store_lane(base + 8, data.lane<2>());
    }
#endif
}
#if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41)
#define ASTCENC_USE_NATIVE_DOT_PRODUCT 1
+110 -257
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -44,26 +44,24 @@
#include <cassert>
#include <cstring>
static constexpr unsigned int ANGULAR_STEPS { 40 };
// Store a reduced sin/cos table for 64 possible weight values; this causes slight quality loss
// compared to using sin() and cos() directly. Must be 2^N.
static constexpr unsigned int SINCOS_STEPS { 64 };
static constexpr unsigned int ANGULAR_STEPS { 32 };
static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
"ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
static uint8_t max_angular_steps_needed_for_quant_level[13];
static_assert(ANGULAR_STEPS >= 32,
"ANGULAR_STEPS must be at least max(steps_for_quant_level)");
// The next-to-last entry is supposed to have the value 33. This because the 32-weight mode leaves a
// double-sized hole in the middle of the weight space, so we are better off matching 33 weights.
static const uint8_t quantization_steps_for_level[13] {
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33, 36
// Store a reduced sin/cos table for 64 possible weight values; this causes
// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
static constexpr unsigned int SINCOS_STEPS { 64 };
static const uint8_t steps_for_quant_level[12] {
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
};
alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
#if defined(ASTCENC_DIAGNOSTICS)
static bool print_once { true };
@@ -72,7 +70,6 @@ alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
/* See header for documentation. */
void prepare_angular_tables()
{
unsigned int max_angular_steps_needed_for_quant_steps[ANGULAR_STEPS + 1];
for (unsigned int i = 0; i < ANGULAR_STEPS; i++)
{
float angle_step = static_cast<float>(i + 1);
@@ -82,13 +79,6 @@ void prepare_angular_tables()
sin_table[j][i] = static_cast<float>(sinf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
cos_table[j][i] = static_cast<float>(cosf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
}
max_angular_steps_needed_for_quant_steps[i + 1] = astc::min(i + 1, ANGULAR_STEPS - 1);
}
for (unsigned int i = 0; i < 13; i++)
{
max_angular_steps_needed_for_quant_level[i] = max_angular_steps_needed_for_quant_steps[quantization_steps_for_level[i]];
}
}
@@ -109,7 +99,7 @@ static void compute_angular_offsets(
promise(weight_count > 0);
promise(max_angular_steps > 0);
alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS];
ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
// Precompute isample; arrays are always allocated 64 elements long
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
@@ -165,7 +155,7 @@ static void compute_lowest_and_highest_weight(
unsigned int max_angular_steps,
unsigned int max_quant_steps,
const float* offsets,
int* lowest_weight,
float* lowest_weight,
int* weight_span,
float* error,
float* cut_low_weight_error,
@@ -184,11 +174,11 @@ static void compute_lowest_and_highest_weight(
vfloat errval = vfloat::zero();
vfloat cut_low_weight_err = vfloat::zero();
vfloat cut_high_weight_err = vfloat::zero();
vfloat offset = loada(&offsets[sp]);
vfloat offset = loada(offsets + sp);
for (unsigned int j = 0; j < weight_count; ++j)
for (unsigned int j = 0; j < weight_count; j++)
{
vfloat sval = load1(&dec_weight_ideal_value[j]) * rcp_stepsize - offset;
vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
vfloat svalrte = round(sval);
vfloat diff = sval - svalrte;
errval += diff * diff;
@@ -218,16 +208,16 @@ static void compute_lowest_and_highest_weight(
vint span = float_to_int(maxidx - minidx + vfloat(1));
span = min(span, vint(max_quant_steps + 3));
span = max(span, vint(2));
storea(float_to_int(minidx), &lowest_weight[sp]);
storea(span, &weight_span[sp]);
storea(minidx, lowest_weight + sp);
storea(span, weight_span + sp);
// The cut_(lowest/highest)_weight_error indicate the error that results from forcing
// samples that should have had the weight value one step (up/down).
vfloat ssize = 1.0f / rcp_stepsize;
vfloat errscale = ssize * ssize;
storea(errval * errscale, &error[sp]);
storea(cut_low_weight_err * errscale, &cut_low_weight_error[sp]);
storea(cut_high_weight_err * errscale, &cut_high_weight_error[sp]);
storea(errval * errscale, error + sp);
storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
}
@@ -246,21 +236,22 @@ static void compute_angular_endpoints_for_quant_levels(
unsigned int weight_count,
const float* dec_weight_ideal_value,
unsigned int max_quant_level,
float low_value[12],
float high_value[12]
float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
) {
unsigned int max_quant_steps = quantization_steps_for_level[max_quant_level];
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
unsigned int max_angular_steps = max_angular_steps_needed_for_quant_level[max_quant_level];
compute_angular_offsets(weight_count, dec_weight_ideal_value,
max_angular_steps, angular_offsets);
alignas(ASTCENC_VECALIGN) int32_t lowest_weight[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS];
ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
max_angular_steps, max_quant_steps,
@@ -270,7 +261,7 @@ static void compute_angular_endpoints_for_quant_levels(
// For each quantization level, find the best error terms. Use packed vectors so data-dependent
// branches can become selects. This involves some integer to float casts, but the values are
// small enough so they never round the wrong way.
vfloat4 best_results[40];
vfloat4 best_results[36];
// Initialize the array to some safe defaults
promise(max_quant_steps > 0);
@@ -296,30 +287,30 @@ static void compute_angular_endpoints_for_quant_levels(
// Check best error against record N
vfloat4 best_result = best_results[idx_span];
vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
vmask4 mask1(best_result.lane<0>() > error[i]);
best_results[idx_span] = select(best_result, new_result, mask1);
vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
best_results[idx_span] = select(best_result, new_result, mask);
// Check best error against record N-1 with either cut low or cut high
best_result = best_results[idx_span - 1];
new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
vmask4 mask2(best_result.lane<0>() > error_cut_low);
best_result = select(best_result, new_result, mask2);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
best_result = select(best_result, new_result, mask);
new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
vmask4 mask3(best_result.lane<0>() > error_cut_high);
best_results[idx_span - 1] = select(best_result, new_result, mask3);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
best_results[idx_span - 1] = select(best_result, new_result, mask);
// Check best error against record N-2 with both cut low and high
best_result = best_results[idx_span - 2];
new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
vmask4 mask4(best_result.lane<0>() > error_cut_low_high);
best_results[idx_span - 2] = select(best_result, new_result, mask4);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
best_results[idx_span - 2] = select(best_result, new_result, mask);
}
for (unsigned int i = 0; i <= max_quant_level; i++)
{
unsigned int q = quantization_steps_for_level[i];
unsigned int q = steps_for_quant_level[i];
int bsi = static_cast<int>(best_results[q].lane<1>());
// Did we find anything?
@@ -333,181 +324,28 @@ static void compute_angular_endpoints_for_quant_levels(
bsi = astc::max(0, bsi);
float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
float hwi = lwi + static_cast<float>(q) - 1.0f;
float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
int lwi = lowest_weight[bsi] + static_cast<int>(best_results[q].lane<2>());
int hwi = lwi + q - 1;
float offset = angular_offsets[bsi] * stepsize;
low_value[i] = offset + static_cast<float>(lwi) * stepsize;
high_value[i] = offset + static_cast<float>(hwi) * stepsize;
}
}
/**
 * @brief For a given step size compute the lowest and highest weight, variant for low weight count.
 *
 * Compute the lowest and highest weight that results from quantizing using the given stepsize and
 * offset, and then compute the resulting error. Unlike the non-lwc variant this function takes no
 * cut-error outputs; it only writes the lowest weight, the weight span, and the error.
 *
 * @param      weight_count              The number of (decimated) weights.
 * @param      dec_weight_quant_uvalue   The decimated weight values to test. NOTE(review): the
 *                                       caller in this file passes its ideal weight array here,
 *                                       despite the "quant" in the name - confirm intended input.
 * @param      max_angular_steps         The maximum number of steps to be tested.
 * @param      max_quant_steps           The maximum quantization level to be tested.
 * @param      offsets                   The angular offsets array.
 * @param[out] lowest_weight             Per angular step, the lowest weight.
 * @param[out] weight_span               Per angular step, the span between lowest and highest weight.
 * @param[out] error                     Per angular step, the error.
 */
static void compute_lowest_and_highest_weight_lwc(
unsigned int weight_count,
const float* dec_weight_quant_uvalue,
unsigned int max_angular_steps,
unsigned int max_quant_steps,
const float* offsets,
int* lowest_weight,
int* weight_span,
float* error
) {
promise(weight_count > 0);
promise(max_angular_steps > 0);
// Lane k starts at the value (k + 1); advanced by ASTCENC_SIMD_WIDTH after every vector of steps
vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
{
// Seed the min/max trackers so that the first rounded sample replaces them
vfloat minidx(128.0f);
vfloat maxidx(-128.0f);
vfloat errval = vfloat::zero();
vfloat offset = loada(&offsets[sp]);
for (unsigned int j = 0; j < weight_count; ++j)
{
// Scale the weight by the step count, remove the offset, and accumulate the squared rounding error
vfloat sval = load1(&dec_weight_quant_uvalue[j]) * rcp_stepsize - offset;
vfloat svalrte = round(sval);
vfloat diff = sval - svalrte;
errval += diff * diff;
// Reset tracker on min hit
vmask mask = svalrte < minidx;
minidx = select(minidx, svalrte, mask);
// Reset tracker on max hit
mask = svalrte > maxidx;
maxidx = select(maxidx, svalrte, mask);
}
// Write out min weight and weight span; clamp span to a usable range
vint span = float_to_int(maxidx - minidx + vfloat(1.0f));
span = min(span, vint(max_quant_steps + 3));
span = max(span, vint(2));
storea(float_to_int(minidx), &lowest_weight[sp]);
storea(span, &weight_span[sp]);
// Rescale the accumulated squared error by the squared step size (1 / rcp_stepsize)^2 before
// writing it out; no cut-weight errors are tracked in this variant.
vfloat ssize = 1.0f / rcp_stepsize;
vfloat errscale = ssize * ssize;
storea(errval * errscale, &error[sp]);
rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
}
}
/**
* @brief The main function for the angular algorithm, variant for low weight count.
*
* @param weight_count The number of (decimated) weights.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_quant_level The maximum quantization level to be tested.
* @param[out] low_value Per angular step, the lowest weight value.
* @param[out] high_value Per angular step, the highest weight value.
*/
static void compute_angular_endpoints_for_quant_levels_lwc(
unsigned int weight_count,
const float* dec_weight_ideal_value,
unsigned int max_quant_level,
float low_value[12],
float high_value[12]
) {
unsigned int max_quant_steps = quantization_steps_for_level[max_quant_level];
unsigned int max_angular_steps = max_angular_steps_needed_for_quant_level[max_quant_level];
alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) int32_t lowest_weight[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS];
alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS];
compute_angular_offsets(weight_count, dec_weight_ideal_value,
max_angular_steps, angular_offsets);
compute_lowest_and_highest_weight_lwc(weight_count, dec_weight_ideal_value,
max_angular_steps, max_quant_steps,
angular_offsets, lowest_weight, weight_span, error);
// For each quantization level, find the best error terms. Use packed vectors so data-dependent
// branches can become selects. This involves some integer to float casts, but the values are
// small enough so they never round the wrong way.
vfloat4 best_results[ANGULAR_STEPS];
// Initialize the array to some safe defaults
promise(max_quant_steps > 0);
for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
{
best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
}
promise(max_angular_steps > 0);
for (unsigned int i = 0; i < max_angular_steps; i++)
{
int idx_span = weight_span[i];
// Check best error against record N
vfloat4 current_best = best_results[idx_span];
vfloat4 candidate = vfloat4(error[i], static_cast<float>(i), 0.0f, 0.0f);
vmask4 mask(current_best.lane<0>() > error[i]);
best_results[idx_span] = select(current_best, candidate, mask);
}
for (unsigned int i = 0; i <= max_quant_level; i++)
{
unsigned int q = quantization_steps_for_level[i];
int bsi = static_cast<int>(best_results[q].lane<1>());
// Did we find anything?
#if defined(ASTCENC_DIAGNOSTICS)
if ((bsi < 0) && print_once)
{
print_once = false;
printf("INFO: Unable to find low weight encoding within search error limit.\n\n");
}
#endif
bsi = astc::max(0, bsi);
int lwi = lowest_weight[bsi];
int hwi = lwi + q - 1;
low_value[i] = (angular_offsets[bsi] + static_cast<float>(lwi)) / (1.0f + static_cast<float>(bsi));
high_value[i] = (angular_offsets[bsi] + static_cast<float>(hwi)) / (1.0f + static_cast<float>(bsi));
low_value[i] = (angular_offsets[bsi] + lwi) * stepsize;
high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
}
}
/* See header for documentation. */
void compute_angular_endpoints_1plane(
unsigned int tune_low_weight_limit,
bool only_always,
const block_size_descriptor& bsd,
const float* dec_weight_ideal_value,
unsigned int max_weight_quant,
compression_working_buffers& tmpbuf
) {
float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values1;
float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values1;
float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
: bsd.decimation_mode_count_selected;
@@ -515,33 +353,34 @@ void compute_angular_endpoints_1plane(
for (unsigned int i = 0; i < max_decimation_modes; i++)
{
const decimation_mode& dm = bsd.decimation_modes[i];
if (!dm.ref_1_plane)
if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
{
continue;
}
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
if (weight_count < tune_low_weight_limit)
unsigned int max_precision = dm.maxprec_1plane;
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
{
compute_angular_endpoints_for_quant_levels_lwc(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
dm.maxprec_1plane, low_values[i], high_values[i]);
max_precision = TUNE_MAX_ANGULAR_QUANT;
}
else
if (max_precision > max_weight_quant)
{
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
dm.maxprec_1plane, low_values[i], high_values[i]);
max_precision = max_weight_quant;
}
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values[i], high_values[i]);
}
unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
: bsd.block_mode_count_1plane_selected;
promise(max_block_modes > 0);
for (unsigned int i = 0; i < max_block_modes; ++i)
for (unsigned int i = 0; i < max_block_modes; i++)
{
const block_mode& bm = bsd.block_modes[i];
assert(!bm.is_dual_plane);
@@ -549,16 +388,24 @@ void compute_angular_endpoints_1plane(
unsigned int quant_mode = bm.quant_mode;
unsigned int decim_mode = bm.decimation_mode;
low_value[i] = low_values[decim_mode][quant_mode];
high_value[i] = high_values[decim_mode][quant_mode];
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
{
low_value[i] = low_values[decim_mode][quant_mode];
high_value[i] = high_values[decim_mode][quant_mode];
}
else
{
low_value[i] = 0.0f;
high_value[i] = 1.0f;
}
}
}
/* See header for documentation. */
void compute_angular_endpoints_2planes(
unsigned int tune_low_weight_limit,
const block_size_descriptor& bsd,
const float* dec_weight_ideal_value,
unsigned int max_weight_quant,
compression_working_buffers& tmpbuf
) {
float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
@@ -566,46 +413,42 @@ void compute_angular_endpoints_2planes(
float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values1;
float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values1;
float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values2;
float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values2;
float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;
promise(bsd.decimation_mode_count_selected > 0);
for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
{
const decimation_mode& dm = bsd.decimation_modes[i];
if (!dm.ref_2_planes)
if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
{
continue;
}
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
if (weight_count < tune_low_weight_limit)
unsigned int max_precision = dm.maxprec_2planes;
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
{
compute_angular_endpoints_for_quant_levels_lwc(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
dm.maxprec_2planes, low_values1[i], high_values1[i]);
compute_angular_endpoints_for_quant_levels_lwc(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
dm.maxprec_2planes, low_values2[i], high_values2[i]);
max_precision = TUNE_MAX_ANGULAR_QUANT;
}
else
if (max_precision > max_weight_quant)
{
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
dm.maxprec_2planes, low_values1[i], high_values1[i]);
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
dm.maxprec_2planes, low_values2[i], high_values2[i]);
max_precision = max_weight_quant;
}
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values1[i], high_values1[i]);
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
max_precision, low_values2[i], high_values2[i]);
}
unsigned int start = bsd.block_mode_count_1plane_selected;
@@ -616,10 +459,20 @@ void compute_angular_endpoints_2planes(
unsigned int quant_mode = bm.quant_mode;
unsigned int decim_mode = bm.decimation_mode;
low_value1[i] = low_values1[decim_mode][quant_mode];
high_value1[i] = high_values1[decim_mode][quant_mode];
low_value2[i] = low_values2[decim_mode][quant_mode];
high_value2[i] = high_values2[decim_mode][quant_mode];
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
{
low_value1[i] = low_values1[decim_mode][quant_mode];
high_value1[i] = high_values1[decim_mode][quant_mode];
low_value2[i] = low_values2[decim_mode][quant_mode];
high_value2[i] = high_values2[decim_mode][quant_mode];
}
else
{
low_value1[i] = 0.0f;
high_value1[i] = 1.0f;
low_value2[i] = 0.0f;
high_value2[i] = 1.0f;
}
}
}
+77 -97
View File
@@ -23,145 +23,125 @@
#define _ 0 // Using _ to indicate an entry that will not be used.
const quantization_and_transfer_table quant_and_xfer_tables[12] {
// Quantization method 0, range 0..1
const quant_and_transfer_table quant_and_xfer_tables[12] {
// QUANT_2, range 0..1
{
QUANT_2,
{0, 64, 255},
{0, 64},
{0, 1},
{0, 64},
{0x01004000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
{0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
0x01004000}
0x4000}
},
// Quantization method 1, range 0..2
// QUANT_3, range 0..2
{
QUANT_3,
{0, 32, 64, 255},
{0, 32, 64},
{0, 1, 2},
{0, 32, 64},
{0x01002000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,0x02004000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,0x02014020}
{0x2000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,0x4020}
},
// Quantization method 2, range 0..3
// QUANT_4, range 0..3
{
QUANT_4,
{0, 21, 43, 64, 255},
{0, 21, 43, 64},
{0, 1, 2, 3},
{0, 21, 43, 64},
{0x01001500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x02002b00,_,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x03014015,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,_,_,0x0302402b}
{0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,_,_,0x402b}
},
// Quantization method 3, range 0..4
// QUANT_5, range 0..4
{
QUANT_5,
{0, 16, 32, 48, 64, 255},
{0, 16, 32, 48, 64},
{0, 1, 2, 3, 4},
{0, 16, 32, 48, 64},
{0x01001000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x02002000,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,0x03013010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x04024020,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,0x04034030}
{0x1000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2000,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,0x4030}
},
// Quantization method 4, range 0..5
// QUANT_6, range 0..5
{
QUANT_6,
{0, 12, 25, 39, 52, 64, 255},
{0, 12, 25, 39, 52, 64},
{0, 2, 4, 5, 3, 1},
{0, 64, 12, 52, 25, 39},
{0x02000c00,_,_,_,_,_,_,_,_,_,_,_,0x04001900,_,_,_,_,_,_,_,_,_,_,_,_,
0x0502270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x03043419,_,_,_,_,_,_,_,_,_,_,
_,_,0x01054027,_,_,_,_,_,_,_,_,_,_,_,0x01034034}
{0x0c00,_,_,_,_,_,_,_,_,_,_,_,0x1900,_,_,_,_,_,_,_,_,_,_,_,_,
0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_,
_,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034}
},
// Quantization method 5, range 0..7
// QUANT_8, range 0..7
{
QUANT_8,
{0, 9, 18, 27, 37, 46, 55, 64, 255},
{0, 9, 18, 27, 37, 46, 55, 64},
{0, 1, 2, 3, 4, 5, 6, 7},
{0, 9, 18, 27, 37, 46, 55, 64},
{0x01000900,_,_,_,_,_,_,_,_,0x02001200,_,_,_,_,_,_,_,_,0x03011b09,_,_,
_,_,_,_,_,_,0x04022512,_,_,_,_,_,_,_,_,_,0x05032e1b,_,_,_,_,_,_,_,_,
0x06043725,_,_,_,_,_,_,_,_,0x0705402e,_,_,_,_,_,_,_,_,0x07064037}
{0x0900,_,_,_,_,_,_,_,_,0x1200,_,_,_,_,_,_,_,_,0x1b09,_,_,
_,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_,
0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037}
},
// Quantization method 6, range 0..9
// QUANT_10, range 0..9
{
QUANT_10,
{0, 7, 14, 21, 28, 36, 43, 50, 57, 64, 255},
{0, 7, 14, 21, 28, 36, 43, 50, 57, 64},
{0, 2, 4, 6, 8, 9, 7, 5, 3, 1},
{0, 64, 7, 57, 14, 50, 21, 43, 28, 36},
{0x02000700,_,_,_,_,_,_,0x04000e00,_,_,_,_,_,_,0x06021507,_,_,_,_,_,_,
0x08041c0e,_,_,_,_,_,_,0x09062415,_,_,_,_,_,_,_,0x07082b1c,_,_,_,_,_,
_,0x05093224,_,_,_,_,_,_,0x0307392b,_,_,_,_,_,_,0x01054032,_,_,_,_,_,
_,0x01034039}
{0x0700,_,_,_,_,_,_,0x0e00,_,_,_,_,_,_,0x1507,_,_,_,_,_,_,
0x1c0e,_,_,_,_,_,_,0x2415,_,_,_,_,_,_,_,0x2b1c,_,_,_,_,_,
_,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_,
_,0x4039}
},
// Quantization method 7, range 0..11
// QUANT_12, range 0..11
{
QUANT_12,
{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64, 255},
{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64},
{0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1},
{0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36},
{0x04000500,_,_,_,_,0x08000b00,_,_,_,_,_,0x02041105,_,_,_,_,_,
0x0608170b,_,_,_,_,_,0x0a021c11,_,_,_,_,0x0b062417,_,_,_,_,_,_,_,
0x070a291c,_,_,_,_,0x030b2f24,_,_,_,_,_,0x09073529,_,_,_,_,_,
0x05033b2f,_,_,_,_,_,0x01094035,_,_,_,_,0x0105403b}
{0x0500,_,_,_,_,0x0b00,_,_,_,_,_,0x1105,_,_,_,_,_,
0x170b,_,_,_,_,_,0x1c11,_,_,_,_,0x2417,_,_,_,_,_,_,_,
0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_,
0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b}
},
// Quantization method 8, range 0..15
// QUANT_16, range 0..15
{
QUANT_16,
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64, 255},
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
{0x01000400,_,_,_,0x02000800,_,_,_,0x03010c04,_,_,_,0x04021108,_,_,_,_,
0x0503150c,_,_,_,0x06041911,_,_,_,0x07051d15,_,_,_,0x08062319,_,_,_,_,
_,0x0907271d,_,_,_,0x0a082b23,_,_,_,0x0b092f27,_,_,_,0x0c0a342b,_,_,_,
_,0x0d0b382f,_,_,_,0x0e0c3c34,_,_,_,0x0f0d4038,_,_,_,0x0f0e403c}
{0x0400,_,_,_,0x0800,_,_,_,0x0c04,_,_,_,0x1108,_,_,_,_,
0x150c,_,_,_,0x1911,_,_,_,0x1d15,_,_,_,0x2319,_,_,_,_,
_,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_,
_,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c}
},
// Quantization method 9, range 0..19
// QUANT_20, range 0..19
{
QUANT_20,
{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58,
61, 64, 255},
{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64},
{0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1},
{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51,
29, 35},
{0x04000300,_,_,0x08000600,_,_,0x0c040903,_,_,0x10080d06,_,_,_,
0x020c1009,_,_,0x0610130d,_,_,0x0a021710,_,_,_,0x0e061a13,_,_,
0x120a1d17,_,_,0x130e231a,_,_,_,_,_,0x0f12261d,_,_,0x0b132923,_,_,
0x070f2d26,_,_,_,0x030b3029,_,_,0x1107332d,_,_,0x0d033730,_,_,_,
0x09113a33,_,_,0x050d3d37,_,_,0x0109403a,_,_,0x0105403d}
{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35},
{0x0300,_,_,0x0600,_,_,0x0903,_,_,0x0d06,_,_,_,
0x1009,_,_,0x130d,_,_,0x1710,_,_,_,0x1a13,_,_,
0x1d17,_,_,0x231a,_,_,_,_,_,0x261d,_,_,0x2923,_,_,
0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_,
0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d}
},
// Quantization method 10, range 0..23
// QUANT_24, range 0..23
{
QUANT_24,
{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48,
51, 53, 56, 59, 62, 64, 255},
{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19,
11, 3, 17, 9, 1},
{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59,
13, 51, 22, 42, 30, 34},
{0x08000200,_,0x10000500,_,_,0x02080802,_,_,0x0a100b05,_,_,0x12020d08,
_,0x040a100b,_,_,0x0c12130d,_,_,0x14041610,_,_,0x060c1813,_,
0x0e141b16,_,_,0x16061e18,_,_,0x170e221b,_,_,_,0x0f16251e,_,_,
0x07172822,_,_,0x150f2a25,_,0x0d072d28,_,_,0x0515302a,_,_,0x130d332d,
_,_,0x0b053530,_,0x03133833,_,_,0x110b3b35,_,_,0x09033e38,_,_,
0x0111403b,_,0x0109403e}
{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64},
{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1},
{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34},
{0x0200,_,0x0500,_,_,0x0802,_,_,0x0b05,_,_,0x0d08,
_,0x100b,_,_,0x130d,_,_,0x1610,_,_,0x1813,_,
0x1b16,_,_,0x1e18,_,_,0x221b,_,_,_,0x251e,_,_,
0x2822,_,_,0x2a25,_,0x2d28,_,_,0x302a,_,_,0x332d,
_,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_,
0x403b,_,0x403e}
},
// Quantization method 11, range 0..31
// QUANT_32, range 0..31
{
QUANT_32,
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38,
40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 255},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38,
40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
{0x01000200,_,0x02000400,_,0x03010602,_,0x04020804,_,0x05030a06,_,
0x06040c08,_,0x07050e0a,_,0x0806100c,_,0x0907120e,_,0x0a081410,_,
0x0b091612,_,0x0c0a1814,_,0x0d0b1a16,_,0x0e0c1c18,_,0x0f0d1e1a,_,
0x100e221c,_,_,_,0x110f241e,_,0x12102622,_,0x13112824,_,0x14122a26,_,
0x15132c28,_,0x16142e2a,_,0x1715302c,_,0x1816322e,_,0x19173430,_,
0x1a183632,_,0x1b193834,_,0x1c1a3a36,_,0x1d1b3c38,_,0x1e1c3e3a,_,
0x1f1d403c,_,0x1f1e403e}
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
{0x0200,_,0x0400,_,0x0602,_,0x0804,_,0x0a06,_,
0x0c08,_,0x0e0a,_,0x100c,_,0x120e,_,0x1410,_,
0x1612,_,0x1814,_,0x1a16,_,0x1c18,_,0x1e1a,_,
0x221c,_,_,_,0x241e,_,0x2622,_,0x2824,_,0x2a26,_,
0x2c28,_,0x2e2a,_,0x302c,_,0x322e,_,0x3430,_,
0x3632,_,0x3834,_,0x3a36,_,0x3c38,_,0x3e3a,_,
0x403c,_,0x403e}
}
};
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2021 Arm Limited
// Copyright 2020-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -18,14 +18,25 @@
/**
* @brief Platform-specific function implementations.
*
* This module contains functions for querying the host extended ISA support.
* This module contains the CLI entry point which also performs the role of
* validating the host extended ISA support meets the needs of the tools.
*/
// Include before the defines below to pick up any auto-setup based on compiler
// built-in config, if not being set explicitly by the build system
#include "astcenc_internal.h"
#include <cstdio>
#if (ASTCENC_SSE > 0) || (ASTCENC_AVX > 0) || \
/**
* @brief The main entry point.
*
* @param argc The number of arguments.
* @param argv The vector of arguments.
*
* @return 0 on success, non-zero otherwise.
*/
int astcenc_main(
int argc,
char **argv);
#if (ASTCENC_SSE > 20) || (ASTCENC_AVX > 0) || \
(ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
static bool g_init { false };
@@ -47,7 +58,7 @@ static bool g_cpu_has_f16c { false };
============================================================================ */
#if !defined(__clang__) && defined(_MSC_VER)
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#include <windows.h>
#include <intrin.h>
/**
@@ -119,8 +130,13 @@ static void detect_cpu_isa()
}
#endif
/* See header for documentation. */
bool cpu_supports_popcnt()
#if ASTCENC_POPCNT > 0
/**
* @brief Run-time detection if the host CPU supports the POPCNT extension.
*
* @return @c true if supported, @c false if not.
*/
static bool cpu_supports_popcnt()
{
if (!g_init)
{
@@ -129,9 +145,15 @@ bool cpu_supports_popcnt()
return g_cpu_has_popcnt;
}
#endif
/* See header for documentation. */
bool cpu_supports_f16c()
#if ASTCENC_F16C > 0
/**
* @brief Run-time detection if the host CPU supports F16C extension.
*
* @return @c true if supported, @c false if not.
*/
static bool cpu_supports_f16c()
{
if (!g_init)
{
@@ -140,9 +162,15 @@ bool cpu_supports_f16c()
return g_cpu_has_f16c;
}
#endif
/* See header for documentation. */
bool cpu_supports_sse41()
#if ASTCENC_SSE >= 41
/**
* @brief Run-time detection if the host CPU supports SSE 4.1 extension.
*
* @return @c true if supported, @c false if not.
*/
static bool cpu_supports_sse41()
{
if (!g_init)
{
@@ -151,9 +179,15 @@ bool cpu_supports_sse41()
return g_cpu_has_sse41;
}
#endif
/* See header for documentation. */
bool cpu_supports_avx2()
#if ASTCENC_AVX >= 2
/**
* @brief Run-time detection if the host CPU supports AVX 2 extension.
*
* @return @c true if supported, @c false if not.
*/
static bool cpu_supports_avx2()
{
if (!g_init)
{
@@ -162,5 +196,81 @@ bool cpu_supports_avx2()
return g_cpu_has_avx2;
}
#endif
/**
* @brief Print a string to stderr.
*/
static inline void print_error(
const char* format
) {
fprintf(stderr, "%s", format);
}
/**
 * @brief Validate CPU ISA support meets the requirements of this build of the library.
 *
 * Each library build is statically compiled for a particular set of CPU ISA features, such as the
 * SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
 * actually supports everything this build needs.
 *
 * Only the checks for extensions enabled at compile time are built in. The first missing
 * extension found is reported on stderr and validation stops at that point.
 *
 * @return Return @c true if validated, @c false otherwise.
 */
static bool validate_cpu_isa()
{
// Each block below exists only if the build enables that extension
#if ASTCENC_AVX >= 2
if (!cpu_supports_avx2())
{
print_error("ERROR: Host does not support AVX2 ISA extension\n");
return false;
}
#endif
#if ASTCENC_F16C >= 1
if (!cpu_supports_f16c())
{
print_error("ERROR: Host does not support F16C ISA extension\n");
return false;
}
#endif
#if ASTCENC_SSE >= 41
if (!cpu_supports_sse41())
{
print_error("ERROR: Host does not support SSE4.1 ISA extension\n");
return false;
}
#endif
#if ASTCENC_POPCNT >= 1
if (!cpu_supports_popcnt())
{
print_error("ERROR: Host does not support POPCNT ISA extension\n");
return false;
}
#endif
// No compiled-in check failed
return true;
}
#else
// Fallback for cases with no dynamic ISA availability; this variant is built
// when the preprocessor condition guarding the run-time detection code is
// false, so there is nothing to check and validation always succeeds
static bool validate_cpu_isa()
{
return true;
}
#endif
int main(
    int argc,
    char **argv
) {
    // Refuse to start the tool if the host fails ISA validation; otherwise
    // hand over to the real CLI entry point and return its exit status
    const bool host_is_usable = validate_cpu_isa();
    return host_is_usable ? astcenc_main(argc, argv) : 1;
}
+86 -76
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -25,44 +25,39 @@
#include "astcenccli_internal.h"
/**
* @brief An accumulator using Kahan compensated floating-point summation.
*
* This method keeps higher precision than direct summation by keeping track of
* the error compensation factor @c comp which can be added into the next
* calculation. This allows single precision floats to be used in places that
* would otherwise need double precision, which is useful when vectorizing.
* @brief An accumulator for errors.
*/
class kahan_accum4
class error_accum4
{
public:
/** @brief The running sum. */
vfloat4 sum { vfloat4::zero() };
/** @brief The current compensation factor. */
vfloat4 comp { vfloat4::zero() };
double sum_r { 0.0 };
double sum_g { 0.0 };
double sum_b { 0.0 };
double sum_a { 0.0 };
};
/**
* @brief The incremental addition operator for Kahan summation.
* @brief Incremental addition operator for error accumulators.
*
* @param val The Kahan accumulator to increment
* @param val The accumulator to increment
* @param inc The increment to apply
*
* @return The updated accumulator
*/
static kahan_accum4& operator+=(
kahan_accum4 &val,
static error_accum4& operator+=(
error_accum4 &val,
vfloat4 inc
) {
vfloat4 y = inc - val.comp;
vfloat4 t = val.sum + y;
val.comp = (t - val.sum) - y;
val.sum = t;
val.sum_r += static_cast<double>(inc.lane<0>());
val.sum_g += static_cast<double>(inc.lane<1>());
val.sum_b += static_cast<double>(inc.lane<2>());
val.sum_a += static_cast<double>(inc.lane<3>());
return val;
}
/**
* @brief mPSNR tonemapping operator for HDR images.
* @brief mPSNR tone-mapping operator for HDR images.
*
* @param val The color value to tone map
* @param fstop The exposure fstop; should be in range [-125, 125]
@@ -124,10 +119,10 @@ void compute_error_metrics(
static const int componentmasks[5] { 0x00, 0x07, 0x0C, 0x07, 0x0F };
int componentmask = componentmasks[input_components];
kahan_accum4 errorsum;
kahan_accum4 alpha_scaled_errorsum;
kahan_accum4 log_errorsum;
kahan_accum4 mpsnr_errorsum;
error_accum4 errorsum;
error_accum4 alpha_scaled_errorsum;
error_accum4 log_errorsum;
error_accum4 mpsnr_errorsum;
double mean_angular_errorsum = 0.0;
double worst_angular_errorsum = 0.0;
@@ -146,7 +141,7 @@ void compute_error_metrics(
img2->dim_x, img2->dim_y, img2->dim_z);
}
float rgb_peak = 0.0f;
double rgb_peak = 0.0;
unsigned int xsize1 = img1->dim_x;
unsigned int xsize2 = img2->dim_x;
@@ -237,7 +232,10 @@ void compute_error_metrics(
color2 = clamp(0, 65504.0f, color2);
}
rgb_peak = astc::max(color1.lane<0>(), color1.lane<1>(), color1.lane<2>(), rgb_peak);
rgb_peak = astc::max(static_cast<double>(color1.lane<0>()),
static_cast<double>(color1.lane<1>()),
static_cast<double>(color1.lane<2>()),
rgb_peak);
vfloat4 diffcolor = color1 - color2;
vfloat4 diffcolor_sq = diffcolor * diffcolor;
@@ -291,106 +289,118 @@ void compute_error_metrics(
}
}
float pixels = static_cast<float>(dim_x * dim_y * dim_z);
float num = 0.0f;
float alpha_num = 0.0f;
float log_num = 0.0f;
float mpsnr_num = 0.0f;
float samples = 0.0f;
double pixels = static_cast<double>(dim_x * dim_y * dim_z);
double samples = 0.0;
double num = 0.0;
double alpha_num = 0.0;
double log_num = 0.0;
double mpsnr_num = 0.0;
if (componentmask & 1)
{
num += errorsum.sum.lane<0>();
alpha_num += alpha_scaled_errorsum.sum.lane<0>();
log_num += log_errorsum.sum.lane<0>();
mpsnr_num += mpsnr_errorsum.sum.lane<0>();
num += errorsum.sum_r;
alpha_num += alpha_scaled_errorsum.sum_r;
log_num += log_errorsum.sum_r;
mpsnr_num += mpsnr_errorsum.sum_r;
samples += pixels;
}
if (componentmask & 2)
{
num += errorsum.sum.lane<1>();
alpha_num += alpha_scaled_errorsum.sum.lane<1>();
log_num += log_errorsum.sum.lane<1>();
mpsnr_num += mpsnr_errorsum.sum.lane<1>();
num += errorsum.sum_g;
alpha_num += alpha_scaled_errorsum.sum_g;
log_num += log_errorsum.sum_g;
mpsnr_num += mpsnr_errorsum.sum_g;
samples += pixels;
}
if (componentmask & 4)
{
num += errorsum.sum.lane<2>();
alpha_num += alpha_scaled_errorsum.sum.lane<2>();
log_num += log_errorsum.sum.lane<2>();
mpsnr_num += mpsnr_errorsum.sum.lane<2>();
num += errorsum.sum_b;
alpha_num += alpha_scaled_errorsum.sum_b;
log_num += log_errorsum.sum_b;
mpsnr_num += mpsnr_errorsum.sum_b;
samples += pixels;
}
if (componentmask & 8)
{
num += errorsum.sum.lane<3>();
alpha_num += alpha_scaled_errorsum.sum.lane<3>();
num += errorsum.sum_a;
alpha_num += alpha_scaled_errorsum.sum_a;
samples += pixels;
}
float denom = samples;
float stopcount = static_cast<float>(fstop_hi - fstop_lo + 1);
float mpsnr_denom = pixels * 3.0f * stopcount * 255.0f * 255.0f;
double denom = samples;
double stopcount = static_cast<double>(fstop_hi - fstop_lo + 1);
double mpsnr_denom = pixels * 3.0 * stopcount * 255.0 * 255.0;
float psnr;
if (num == 0.0f)
psnr = 999.0f;
double psnr;
if (num == 0.0)
{
psnr = 999.0;
}
else
psnr = 10.0f * log10f(denom / num);
{
psnr = 10.0 * log10(denom / num);
}
float rgb_psnr = psnr;
double rgb_psnr = psnr;
printf("Quality metrics\n");
printf("===============\n\n");
if (componentmask & 8)
{
printf(" PSNR (LDR-RGBA): %9.4f dB\n", static_cast<double>(psnr));
printf(" PSNR (LDR-RGBA): %9.4f dB\n", psnr);
float alpha_psnr;
if (alpha_num == 0.0f)
alpha_psnr = 999.0f;
double alpha_psnr;
if (alpha_num == 0.0)
{
alpha_psnr = 999.0;
}
else
alpha_psnr = 10.0f * log10f(denom / alpha_num);
printf(" Alpha-weighted PSNR: %9.4f dB\n", static_cast<double>(alpha_psnr));
{
alpha_psnr = 10.0 * log10(denom / alpha_num);
}
printf(" Alpha-weighted PSNR: %9.4f dB\n", alpha_psnr);
float rgb_num = hadd_rgb_s(errorsum.sum);
if (rgb_num == 0.0f)
rgb_psnr = 999.0f;
double rgb_num = errorsum.sum_r + errorsum.sum_g + errorsum.sum_b;
if (rgb_num == 0.0)
{
rgb_psnr = 999.0;
}
else
rgb_psnr = 10.0f * log10f(pixels * 3.0f / rgb_num);
printf(" PSNR (LDR-RGB): %9.4f dB\n", static_cast<double>(rgb_psnr));
{
rgb_psnr = 10.0 * log10(pixels * 3.0 / rgb_num);
}
printf(" PSNR (LDR-RGB): %9.4f dB\n", rgb_psnr);
}
else
{
printf(" PSNR (LDR-RGB): %9.4f dB\n", static_cast<double>(psnr));
printf(" PSNR (LDR-RGB): %9.4f dB\n", psnr);
}
if (compute_hdr_metrics)
{
printf(" PSNR (RGB norm to peak): %9.4f dB (peak %f)\n",
static_cast<double>(rgb_psnr + 20.0f * log10f(rgb_peak)),
static_cast<double>(rgb_peak));
rgb_psnr + 20.0 * log10(rgb_peak), rgb_peak);
float mpsnr;
if (mpsnr_num == 0.0f)
double mpsnr;
if (mpsnr_num == 0.0)
{
mpsnr = 999.0f;
mpsnr = 999.0;
}
else
{
mpsnr = 10.0f * log10f(mpsnr_denom / mpsnr_num);
mpsnr = 10.0 * log10(mpsnr_denom / mpsnr_num);
}
printf(" mPSNR (RGB): %9.4f dB (fstops %+d to %+d)\n",
static_cast<double>(mpsnr), fstop_lo, fstop_hi);
mpsnr, fstop_lo, fstop_hi);
float logrmse = astc::sqrt(log_num / pixels);
printf(" LogRMSE (RGB): %9.4f\n", static_cast<double>(logrmse));
double logrmse = sqrt(log_num / pixels);
printf(" LogRMSE (RGB): %9.4f\n", logrmse);
}
if (compute_normal_metrics)
+10 -13
View File
@@ -36,12 +36,12 @@ astcenc_image *alloc_image(
img->dim_y = dim_y;
img->dim_z = dim_z;
void** data = new void*[dim_z];
img->data = data;
if (bitness == 8)
{
void** data = new void*[dim_z];
img->data_type = ASTCENC_TYPE_U8;
img->data = data;
for (unsigned int z = 0; z < dim_z; z++)
{
data[z] = new uint8_t[dim_x * dim_y * 4];
@@ -49,10 +49,7 @@ astcenc_image *alloc_image(
}
else if (bitness == 16)
{
void** data = new void*[dim_z];
img->data_type = ASTCENC_TYPE_F16;
img->data = data;
for (unsigned int z = 0; z < dim_z; z++)
{
data[z] = new uint16_t[dim_x * dim_y * 4];
@@ -61,10 +58,7 @@ astcenc_image *alloc_image(
else // if (bitness == 32)
{
assert(bitness == 32);
void** data = new void*[dim_z];
img->data_type = ASTCENC_TYPE_F32;
img->data = data;
for (unsigned int z = 0; z < dim_z; z++)
{
data[z] = new float[dim_x * dim_y * 4];
@@ -239,15 +233,18 @@ astcenc_image* astc_img_from_unorm8x4_array(
/* See header for documentation. */
float* floatx4_array_from_astc_img(
const astcenc_image* img,
bool y_flip
bool y_flip,
unsigned int z_index
) {
unsigned int dim_x = img->dim_x;
unsigned int dim_y = img->dim_y;
float *buf = new float[4 * dim_x * dim_y];
assert(z_index < img->dim_z);
if (img->data_type == ASTCENC_TYPE_U8)
{
uint8_t* data8 = static_cast<uint8_t*>(img->data[0]);
uint8_t* data8 = static_cast<uint8_t*>(img->data[z_index]);
for (unsigned int y = 0; y < dim_y; y++)
{
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
@@ -264,7 +261,7 @@ float* floatx4_array_from_astc_img(
}
else if (img->data_type == ASTCENC_TYPE_F16)
{
uint16_t* data16 = static_cast<uint16_t*>(img->data[0]);
uint16_t* data16 = static_cast<uint16_t*>(img->data[z_index]);
for (unsigned int y = 0; y < dim_y; y++)
{
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
@@ -287,7 +284,7 @@ float* floatx4_array_from_astc_img(
else // if (img->data_type == ASTCENC_TYPE_F32)
{
assert(img->data_type == ASTCENC_TYPE_F32);
float* data32 = static_cast<float*>(img->data[0]);
float* data32 = static_cast<float*>(img->data[z_index]);
for (unsigned int y = 0; y < dim_y; y++)
{
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
+4 -4
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -26,7 +26,7 @@
#include "astcenccli_internal.h"
// Configure the STB image imagewrite library build.
// Configure the STB image write library build.
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STBI_NO_GIF
@@ -61,7 +61,7 @@ static void astcenc_runtime_assert(bool condition)
{
if (!condition)
{
printf("ERROR: Corrupt input image\n");
print_error("ERROR: Corrupt input image\n");
exit(1);
}
}
@@ -92,7 +92,7 @@ astcenc_image* load_png_with_wuffs(
std::ifstream file(filename, std::ios::binary | std::ios::ate);
if (!file)
{
printf("ERROR: Failed to load image %s (can't fopen)\n", filename);
print_error("ERROR: Failed to load image %s (can't fopen)\n", filename);
return nullptr;
}
+162 -45
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -25,6 +25,8 @@
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <sstream>
#include "astcenccli_internal.h"
@@ -32,8 +34,39 @@
#include "stb_image_write.h"
#include "tinyexr.h"
/**
 * @brief Determine the output file name to use for a sliced image write.
 *
 * Images with at most one slice are written to @c filename unchanged. For
 * images with more slices the slice index, zero-padded to three digits, is
 * inserted before the file extension; for example slice 2 of "out.png" is
 * written to "out_002.png". A name without any '.' is treated as having no
 * extension, so the index is simply appended.
 *
 * @param img        The source data for the image.
 * @param filename   The base name of the file to save.
 * @param index      The slice index to write.
 *
 * @return The file name to use when saving the file.
 */
static std::string get_output_filename(
    const astcenc_image* img,
    const char* filename,
    unsigned int index
) {
    if (img->dim_z <= 1)
    {
        return filename;
    }

    std::string fnmod(filename);
    std::string fnext;

    // Split the extension off the base name. A name without a '.' has no
    // extension; calling substr(npos) in that case would throw
    // std::out_of_range, so leave the extension empty instead.
    std::string::size_type ext_pos = fnmod.find_last_of('.');
    if (ext_pos != std::string::npos)
    {
        fnext = fnmod.substr(ext_pos);
        fnmod.erase(ext_pos);
    }

    // Insert the file index into the base name, then append the extension
    std::stringstream ss;
    ss << fnmod << "_" << std::setw(3) << std::setfill('0') << index << fnext;
    return ss.str();
}
/* ============================================================================
Image load and store through the stb_iamge and tinyexr libraries
Image load and store through the stb_image and tinyexr libraries
============================================================================ */
/**
@@ -59,7 +92,7 @@ static astcenc_image* load_image_with_tinyexr(
int load_res = LoadEXR(&image, &dim_x, &dim_y, filename, &err);
if (load_res != TINYEXR_SUCCESS)
{
printf("ERROR: Failed to load image %s (%s)\n", filename, err);
print_error("ERROR: Failed to load image %s (%s)\n", filename, err);
free(reinterpret_cast<void*>(const_cast<char*>(err)));
return nullptr;
}
@@ -115,7 +148,7 @@ static astcenc_image* load_image_with_stb(
}
}
printf("ERROR: Failed to load image %s (%s)\n", filename, stbi_failure_reason());
print_error("ERROR: Failed to load image %s (%s)\n", filename, stbi_failure_reason());
return nullptr;
}
@@ -133,9 +166,21 @@ static bool store_exr_image_with_tinyexr(
const char* filename,
int y_flip
) {
float *buf = floatx4_array_from_astc_img(img, y_flip);
int res = SaveEXR(buf, img->dim_x, img->dim_y, 4, 1, filename, nullptr);
delete[] buf;
int res { 0 };
for (unsigned int i = 0; i < img->dim_z; i++)
{
std::string fnmod = get_output_filename(img, filename, i);
float* buf = floatx4_array_from_astc_img(img, y_flip, i);
res = SaveEXR(buf, img->dim_x, img->dim_y, 4, 1, fnmod.c_str(), nullptr);
delete[] buf;
if (res < 0)
{
break;
}
}
return res >= 0;
}
@@ -153,11 +198,23 @@ static bool store_png_image_with_stb(
const char* filename,
int y_flip
) {
assert(img->data_type == ASTCENC_TYPE_U8);
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[0]);
int res { 0 };
assert(img->data_type == ASTCENC_TYPE_U8);
for (unsigned int i = 0; i < img->dim_z; i++)
{
std::string fnmod = get_output_filename(img, filename, i);
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[i]);
stbi_flip_vertically_on_write(y_flip);
res = stbi_write_png(fnmod.c_str(), img->dim_x, img->dim_y, 4, buf, img->dim_x * 4);
if (res == 0)
{
break;
}
}
stbi_flip_vertically_on_write(y_flip);
int res = stbi_write_png(filename, img->dim_x, img->dim_y, 4, buf, img->dim_x * 4);
return res != 0;
}
@@ -175,11 +232,23 @@ static bool store_tga_image_with_stb(
const char* filename,
int y_flip
) {
assert(img->data_type == ASTCENC_TYPE_U8);
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[0]);
int res { 0 };
assert(img->data_type == ASTCENC_TYPE_U8);
for (unsigned int i = 0; i < img->dim_z; i++)
{
std::string fnmod = get_output_filename(img, filename, i);
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[i]);
stbi_flip_vertically_on_write(y_flip);
res = stbi_write_tga(fnmod.c_str(), img->dim_x, img->dim_y, 4, buf);
if (res == 0)
{
break;
}
}
stbi_flip_vertically_on_write(y_flip);
int res = stbi_write_tga(filename, img->dim_x, img->dim_y, 4, buf);
return res != 0;
}
@@ -197,11 +266,23 @@ static bool store_bmp_image_with_stb(
const char* filename,
int y_flip
) {
assert(img->data_type == ASTCENC_TYPE_U8);
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[0]);
int res { 0 };
assert(img->data_type == ASTCENC_TYPE_U8);
for (unsigned int i = 0; i < img->dim_z; i++)
{
std::string fnmod = get_output_filename(img, filename, i);
uint8_t* buf = reinterpret_cast<uint8_t*>(img->data[i]);
stbi_flip_vertically_on_write(y_flip);
res = stbi_write_bmp(fnmod.c_str(), img->dim_x, img->dim_y, 4, buf);
if (res == 0)
{
break;
}
}
stbi_flip_vertically_on_write(y_flip);
int res = stbi_write_bmp(filename, img->dim_x, img->dim_y, 4, buf);
return res != 0;
}
@@ -219,9 +300,21 @@ static bool store_hdr_image_with_stb(
const char* filename,
int y_flip
) {
float* buf = floatx4_array_from_astc_img(img, y_flip);
int res = stbi_write_hdr(filename, img->dim_x, img->dim_y, 4, buf);
delete[] buf;
int res { 0 };
for (unsigned int i = 0; i < img->dim_z; i++)
{
std::string fnmod = get_output_filename(img, filename, i);
float* buf = floatx4_array_from_astc_img(img, y_flip, i);
res = stbi_write_hdr(fnmod.c_str(), img->dim_x, img->dim_y, 4, buf);
delete[] buf;
if (res == 0)
{
break;
}
}
return res != 0;
}
@@ -625,6 +718,16 @@ static uint32_t u32_byterev(uint32_t v)
#define GL_LUMINANCE 0x1909
#define GL_LUMINANCE_ALPHA 0x190A
#define GL_R8 0x8229
#define GL_RG8 0x822B
#define GL_RGB8 0x8051
#define GL_RGBA8 0x8058
#define GL_R16F 0x822D
#define GL_RG16F 0x822F
#define GL_RGB16F 0x881B
#define GL_RGBA16F 0x881A
#define GL_UNSIGNED_BYTE 0x1401
#define GL_UNSIGNED_SHORT 0x1403
#define GL_HALF_FLOAT 0x140B
@@ -768,7 +871,7 @@ static unsigned int get_format(
) {
for (auto& it : ASTC_FORMATS)
{
if ((it.x == x) && (it.y == y) && (it.z == z) && (it.is_srgb == is_srgb))
if ((it.x == x) && (it.y == y) && (it.z == z) && (it.is_srgb == is_srgb))
{
return it.format;
}
@@ -794,7 +897,7 @@ struct ktx_header
uint32_t bytes_of_key_value_data; // size in bytes of the key-and-value area immediately following the header.
};
// magic 12-byte sequence that must appear at the beginning of every KTX file.
// Magic 12-byte sequence that must appear at the beginning of every KTX file.
static uint8_t ktx_magic[12] {
0xAB, 0x4B, 0x54, 0x58, 0x20, 0x31, 0x31, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A
};
@@ -909,9 +1012,9 @@ static astcenc_image* load_ktx_uncompressed_image(
return nullptr;
}
// Although these are set up later, we include a default initializer to remove warnings
int bytes_per_component = 1; // bytes per component in the KTX file.
int bitness = 8; // internal precision we will use in the codec.
// Although these are set up later, use default initializer to remove warnings
int bitness = 8; // Internal precision after conversion
int bytes_per_component = 1; // Bytes per component in the KTX file
scanline_transfer copy_method = R8_TO_RGBA8;
switch (hdr.gl_type)
@@ -1017,7 +1120,7 @@ static astcenc_image* load_ktx_uncompressed_image(
}
case GL_FLOAT:
{
bitness = 32;
bitness = 16;
bytes_per_component = 4;
switch (hdr.gl_format)
{
@@ -1126,7 +1229,7 @@ static astcenc_image* load_ktx_uncompressed_image(
}
}
// then transfer data from the surface to our own image-data-structure.
// Transfer data from the surface to our own image data structure
astcenc_image *astc_img = alloc_image(bitness, dim_x, dim_y, dim_z);
for (unsigned int z = 0; z < dim_z; z++)
@@ -1155,7 +1258,7 @@ static astcenc_image* load_ktx_uncompressed_image(
}
delete[] buf;
is_hdr = bitness == 32;
is_hdr = bitness >= 16;
component_count = components;
return astc_img;
}
@@ -1352,7 +1455,15 @@ static bool store_ktx_uncompressed_image(
ktx_header hdr;
static const int gl_format_of_components[4] {
GL_LUMINANCE, GL_LUMINANCE_ALPHA, GL_RGB, GL_RGBA
GL_RED, GL_RG, GL_RGB, GL_RGBA
};
static const int gl_sized_format_of_components_ldr[4] {
GL_R8, GL_RG8, GL_RGB8, GL_RGBA8
};
static const int gl_sized_format_of_components_hdr[4] {
GL_R16F, GL_RG16F, GL_RGB16F, GL_RGBA16F
};
memcpy(hdr.magic, ktx_magic, 12);
@@ -1360,8 +1471,15 @@ static bool store_ktx_uncompressed_image(
hdr.gl_type = (bitness == 16) ? GL_HALF_FLOAT : GL_UNSIGNED_BYTE;
hdr.gl_type_size = bitness / 8;
hdr.gl_format = gl_format_of_components[image_components - 1];
hdr.gl_internal_format = gl_format_of_components[image_components - 1];
hdr.gl_base_internal_format = gl_format_of_components[image_components - 1];
if (bitness == 16)
{
hdr.gl_internal_format = gl_sized_format_of_components_hdr[image_components - 1];
}
else
{
hdr.gl_internal_format = gl_sized_format_of_components_ldr[image_components - 1];
}
hdr.gl_base_internal_format = hdr.gl_format;
hdr.pixel_width = dim_x;
hdr.pixel_height = dim_y;
hdr.pixel_depth = (dim_z == 1) ? 0 : dim_z;
@@ -1915,7 +2033,7 @@ static astcenc_image* load_dds_uncompressed_image(
}
delete[] buf;
is_hdr = bitness == 16;
is_hdr = bitness >= 16;
component_count = components;
return astc_img;
}
@@ -2295,7 +2413,7 @@ bool store_ncimage(
eptr = ".ktx"; // use KTX file format if we don't have an ending.
}
for (int i=0; i < storer_descr_count; i++)
for (int i = 0; i < storer_descr_count; i++)
{
if (strcmp(eptr, storer_descs[i].ending1) == 0
|| strcmp(eptr, storer_descs[i].ending2) == 0)
@@ -2338,7 +2456,6 @@ static unsigned int unpack_bytes(
}
/* See header for documentation. */
// TODO: Return a bool?
int load_cimage(
const char* filename,
astc_compressed_image& img
@@ -2346,22 +2463,22 @@ int load_cimage(
std::ifstream file(filename, std::ios::in | std::ios::binary);
if (!file)
{
printf("ERROR: File open failed '%s'\n", filename);
print_error("ERROR: File open failed '%s'\n", filename);
return 1;
}
astc_header hdr;
file.read(reinterpret_cast<char*>(&hdr), sizeof(astc_header));
if (!file)
if (file.fail())
{
printf("ERROR: File read failed '%s'\n", filename);
print_error("ERROR: File read failed '%s'\n", filename);
return 1;
}
unsigned int magicval = unpack_bytes(hdr.magic[0], hdr.magic[1], hdr.magic[2], hdr.magic[3]);
if (magicval != ASTC_MAGIC_ID)
{
printf("ERROR: File not recognized '%s'\n", filename);
print_error("ERROR: File not recognized '%s'\n", filename);
return 1;
}
@@ -2376,7 +2493,7 @@ int load_cimage(
if (dim_x == 0 || dim_y == 0 || dim_z == 0)
{
printf("ERROR: File corrupt '%s'\n", filename);
print_error("ERROR: Image header corrupt '%s'\n", filename);
return 1;
}
@@ -2388,9 +2505,10 @@ int load_cimage(
uint8_t *buffer = new uint8_t[data_size];
file.read(reinterpret_cast<char*>(buffer), data_size);
if (!file)
if (file.fail())
{
printf("ERROR: File read failed '%s'\n", filename);
print_error("ERROR: Image data size exceeded file size '%s'\n", filename);
delete[] buffer;
return 1;
}
@@ -2406,7 +2524,6 @@ int load_cimage(
}
/* See header for documentation. */
// TODO: Return a bool?
int store_cimage(
const astc_compressed_image& img,
const char* filename
@@ -2436,7 +2553,7 @@ int store_cimage(
std::ofstream file(filename, std::ios::out | std::ios::binary);
if (!file)
{
printf("ERROR: File open failed '%s'\n", filename);
print_error("ERROR: File open failed '%s'\n", filename);
return 1;
}
+51 -9
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -68,6 +68,9 @@ struct cli_config_options
/** @brief The number of threads to use for processing. */
unsigned int thread_count;
/** @brief The number of repeats to execute for benchmarking. */
unsigned int repeat_count;
/** @brief The number of image slices to load for a 3D image. */
unsigned int array_size;
@@ -77,6 +80,9 @@ struct cli_config_options
/** @brief @c true if the images should be y-flipped. */
bool y_flip;
/** @brief @c true if diagnostic images should be stored. */
bool diagnostic_images;
/** @brief The low exposure fstop for error computation. */
int low_fstop;
@@ -90,6 +96,26 @@ struct cli_config_options
astcenc_swizzle swz_decode;
};
/**
* @brief Print a string to stderr.
*/
static inline void print_error(
const char* format
) {
fprintf(stderr, "%s", format);
}
/**
 * @brief Print a formatted string to stderr.
 *
 * @param format   The printf-style format string.
 * @param args     The values consumed by the conversion specifiers in @c format.
 */
template<typename ... Args>
static inline void print_error(
    const char* format,
    Args...args
) {
    // Note: the pack name deliberately avoids the underscore + uppercase form,
    // which is reserved for the implementation
    fprintf(stderr, format, args...);
}
/**
* @brief Load uncompressed image.
*
@@ -271,18 +297,20 @@ astcenc_image* astc_img_from_unorm8x4_array(
bool y_flip);
/**
* @brief Create a flattened RGBA FLOAT32 data array from an image structure.
* @brief Create a flattened RGBA FLOAT32 data array for a single slice from an image structure.
*
* The returned data array is allocated with @c new[] and must be freed with a @c delete[] call.
*
* @param img The input image.
* @param y_flip Should the data in the array be Y flipped?
* @param img The input image.
* @param y_flip Should the data in the array be Y flipped?
* @param z_index The slice index to convert.
*
* @return The data array.
*/
float* floatx4_array_from_astc_img(
const astcenc_image* img,
bool y_flip);
bool y_flip,
unsigned int z_index);
/**
* @brief Create a flattened RGBA UNORM8 data array from an image structure.
@@ -357,14 +385,28 @@ int get_cpu_count();
* All threads run the same thread function, and have the same thread payload, but are given a
* unique thread ID (0 .. N-1) as a parameter to the run function to allow thread-specific behavior.
*
 * @param thread_count The number of threads to spawn.
* @param func The function to execute. Must have the signature:
* void (int thread_count, int thread_id, void* payload)
* @param payload Pointer to an opaque thread payload object.
* @param operation The name of the operation for this async task.
* @param thread_count The number of threads to spawn.
* @param func The function to execute. Must have the signature:
* void (int thread_count, int thread_id, void* payload)
* @param payload Pointer to an opaque thread payload object.
*/
void launch_threads(
const char* operation,
int thread_count,
void (*func)(int, int, void*),
void *payload);
/**
* @brief The main entry point.
*
* @param argc The number of arguments.
* @param argv The vector of arguments.
*
* @return 0 on success, non-zero otherwise.
*/
int astcenc_main(
int argc,
char **argv);
#endif
+103 -15
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -38,7 +38,7 @@
#if defined(_WIN32) && !defined(__CYGWIN__)
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#include <windows.h>
/** @brief Alias pthread_t to one of the internal Windows types. */
typedef HANDLE pthread_t;
@@ -58,9 +58,61 @@ static int pthread_create(
static_cast<void>(attribs);
LPTHREAD_START_ROUTINE func = reinterpret_cast<LPTHREAD_START_ROUTINE>(threadfunc);
*thread = CreateThread(nullptr, 0, func, thread_arg, 0, nullptr);
// Ensure we return 0 on success, non-zero on error
if (*thread == NULL)
{
return 1;
}
return 0;
}
/**
 * @brief Manually set CPU group and thread affinity.
 *
 * This is needed on Windows 10 or older to benefit from large core count
 * systems with more than 64 logical CPUs. The assignment is skipped on systems
 * with a single processor group, as it is not necessary.
 *
 * Threads are mapped onto processor groups in order of thread index, wrapping
 * around once the index exceeds the CPU count. Each thread is allowed to run on
 * every core of its assigned group.
 *
 * @param thread         The handle of the thread to assign.
 * @param thread_index   The index of the thread, used to select the group.
 */
static void set_group_affinity(
    pthread_t thread,
    int thread_index
) {
    // Skip thread assignment for hardware with a single CPU group
    int group_count = GetActiveProcessorGroupCount();
    if (group_count == 1)
    {
        return;
    }

    // Ensure we have a valid assign if user creates more threads than cores
    int assign_index = thread_index % get_cpu_count();
    int assign_group { 0 };
    int assign_group_cpu_count { 0 };

    // Determine which core group and core in the group to use for this thread
    int group_cpu_count_sum { 0 };
    for (int group = 0; group < group_count; group++)
    {
        int group_cpu_count = static_cast<int>(GetMaximumProcessorCount(group));
        group_cpu_count_sum += group_cpu_count;
        if (assign_index < group_cpu_count_sum)
        {
            assign_group = group;
            assign_group_cpu_count = group_cpu_count;
            break;
        }
    }

    // No group matched, so there is no meaningful mask to apply
    if (assign_group_cpu_count <= 0)
    {
        return;
    }

    // Build the mask at the width of KAFFINITY. Shifting a plain int would be
    // undefined for 31 or more cores, and a completely full group needs every
    // bit set, which cannot be produced by a shift of the full type width.
    const int mask_bits = static_cast<int>(sizeof(KAFFINITY) * 8);
    KAFFINITY mask;
    if (assign_group_cpu_count >= mask_bits)
    {
        mask = ~static_cast<KAFFINITY>(0);
    }
    else
    {
        mask = (static_cast<KAFFINITY>(1) << assign_group_cpu_count) - 1;
    }

    // Set the affinity to the assigned group, and all supported cores
    GROUP_AFFINITY affinity {};
    affinity.Mask = mask;
    affinity.Group = static_cast<WORD>(assign_group);
    SetThreadGroupAffinity(thread, &affinity, nullptr);
}
/**
* @brief Proxy Windows @c WaitForSingleObject underneath a pthreads-like wrapper.
*/
@@ -76,9 +128,8 @@ static int pthread_join(
/* See header for documentation */
int get_cpu_count()
{
SYSTEM_INFO sysinfo;
GetSystemInfo(&sysinfo);
return sysinfo.dwNumberOfProcessors;
DWORD cpu_count = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
return static_cast<int>(cpu_count);
}
/* See header for documentation */
@@ -151,6 +202,7 @@ static void* launch_threads_helper(
/* See header for documentation */
void launch_threads(
const char* operation,
int thread_count,
void (*func)(int, int, void*),
void *payload
@@ -163,22 +215,58 @@ void launch_threads(
}
// Otherwise spawn worker threads
launch_desc *thread_descs = new launch_desc[thread_count];
launch_desc *thread_descs = new launch_desc[thread_count];
int actual_thread_count { 0 };
for (int i = 0; i < thread_count; i++)
{
thread_descs[i].thread_count = thread_count;
thread_descs[i].thread_id = i;
thread_descs[i].payload = payload;
thread_descs[i].func = func;
thread_descs[actual_thread_count].thread_count = thread_count;
thread_descs[actual_thread_count].thread_id = actual_thread_count;
thread_descs[actual_thread_count].payload = payload;
thread_descs[actual_thread_count].func = func;
pthread_create(&(thread_descs[i].thread_handle), nullptr,
launch_threads_helper, reinterpret_cast<void*>(thread_descs + i));
// Handle pthread_create failing by simply using fewer threads
int error = pthread_create(
&(thread_descs[actual_thread_count].thread_handle),
nullptr,
launch_threads_helper,
reinterpret_cast<void*>(thread_descs + actual_thread_count));
// Track how many threads we actually created
if (!error)
{
// Windows needs explicit thread assignment to handle large core count systems
#if defined(_WIN32) && !defined(__CYGWIN__)
set_group_affinity(
thread_descs[actual_thread_count].thread_handle,
actual_thread_count);
#endif
actual_thread_count++;
}
}
// ... and then wait for them to complete
for (int i = 0; i < thread_count; i++)
// If we did not create thread_count threads then emit a warning
if (actual_thread_count != thread_count)
{
int log_count = actual_thread_count == 0 ? 1 : actual_thread_count;
const char* log_s = log_count == 1 ? "" : "s";
printf("WARNING: %s using %d thread%s due to thread creation error\n\n",
operation, log_count, log_s);
}
// If we managed to spawn any threads wait for them to complete
if (actual_thread_count != 0)
{
pthread_join(thread_descs[i].thread_handle, nullptr);
for (int i = 0; i < actual_thread_count; i++)
{
pthread_join(thread_descs[i].thread_handle, nullptr);
}
}
// Else fall back to using this thread
else
{
func(1, 0, payload);
}
delete[] thread_descs;
File diff suppressed because it is too large Load Diff
+133 -112
View File
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -25,7 +25,7 @@
/** @brief The version header. */
static const char *astcenc_copyright_string =
R"(astcenc v%s, %u-bit %s%s%s
Copyright 2011-%s Arm Limited, all rights reserved
Copyright (c) 2011-%s Arm Limited. All rights reserved.
)";
/** @brief The short-form help text. */
@@ -138,14 +138,15 @@ COMPRESSION
The quality level configures the quality-performance tradeoff for
the compressor; more complete searches of the search space improve
image quality at the expense of compression time. The quality level
can be set to any value between 0 (fastest) and 100 (thorough), or
to a fixed quality preset:
can be set to any value between 0 (fastest) and 100 (exhaustive),
or to a fixed quality preset:
-fastest (equivalent to quality = 0)
-fast (equivalent to quality = 10)
-medium (equivalent to quality = 60)
-thorough (equivalent to quality = 98)
-exhaustive (equivalent to quality = 100)
-fastest (equivalent to quality = 0)
-fast (equivalent to quality = 10)
-medium (equivalent to quality = 60)
-thorough (equivalent to quality = 98)
-verythorough (equivalent to quality = 99)
-exhaustive (equivalent to quality = 100)
For compression of production content we recommend using a quality
level equivalent to -medium or higher.
@@ -158,11 +159,17 @@ COMPRESSION
to consider for common usage, based on the type of image data being
compressed.
-mask
The input texture is a mask texture with unrelated data stored
in the various color components, so enable error heuristics that
aim to improve quality by minimizing the effect of error
cross-talk across the color components.
-decode_unorm8
Indicate that an LDR compressed texture will be used with
the decode_unorm8 extension behavior, instead of the default
decode_unorm16 decompression.
Matching the decode mode used during compression to the mode
used at runtime will improve image quality as the compressor
can ensure that rounding goes the right way.
This mode is used automatically if you decompress to an 8-bit
per component output image format.
-normal
The input texture is a three component linear LDR normal map
@@ -175,6 +182,9 @@ COMPRESSION
nml.xy = nml.xy * 2.0 - 1.0; // Unpack to [-1,1]
nml.z = sqrt(1 - dot(nml.xy, nml.xy)); // Compute Z
Alternative component swizzles can be set with -esw and -dsw
parameters.
-rgbm <max>
The input texture is an RGBM encoded texture, storing HDR
values between 0 and <max> in an LDR container format with a
@@ -193,8 +203,8 @@ COMPRESSION
typically lowers the measured PSNR score. Perceptual methods are
currently only available for normal maps and RGB color data.
-array <size>
Loads an array of <size> 2D image slices to use as a 3D image.
-zdim <zdim>
Load a sequence of <zdim> 2D image slices to use as a 3D image.
The input filename given is decorated with the postfix
"_<slice>" to find the file to load. For example, an input named
"input.png" would load as input_0.png, input_1.png, etc.
@@ -270,53 +280,71 @@ ADVANCED COMPRESSION
Higher numbers give better quality, as more complex blocks can
be encoded, but will increase search time. Preset defaults are:
-fastest : 2
-fast : 3
-medium : 4
-thorough : 4
-exhaustive : 4
-fastest : 2
-fast : 3
-medium : 4
-thorough : 4
-verythorough : 4
-exhaustive : 4
-partitionindexlimit <number>
Test <number> block partition indices for each partition count.
Higher numbers give better quality, however large values give
diminishing returns especially for smaller block sizes. Preset
defaults are:
-[2|3|4]partitionindexlimit <number>
Estimate errors for <number> block partition indices for this
partition count. Higher numbers give better quality, however
large values give diminishing returns especially for smaller
block sizes. Preset defaults are:
-fastest : 8
-fast : 12
-medium : 26
-thorough : 76
-exhaustive : 1024
-fastest : 10 | 6 | 4
-fast : 18 | 10 | 8
-medium : 34 | 28 | 16
-thorough : 82 | 60 | 30
-verythorough : 256 | 128 | 64
-exhaustive : 512 | 512 | 512
-[2|3|4]partitioncandidatelimit <number>
Calculate errors for <number> block partition indices for this
partition count. Higher numbers give better quality, however
large values give diminishing returns especially for smaller
block sizes. Preset defaults are:
-fastest : 2 | 2 | 2
-fast : 2 | 2 | 2
-medium : 2 | 2 | 2
-thorough : 3 | 2 | 2
-verythorough : 20 | 14 | 8
-exhaustive : 32 | 32 | 32
-blockmodelimit <number>
Test block modes below <number> usage centile in an empirically
determined distribution of block mode frequency. This option is
ineffective for 3D textures. Preset defaults are:
-fastest : 40
-fast : 55
-medium : 76
-thorough : 93
-exhaustive : 100
-fastest : 43
-fast : 55
-medium : 77
-thorough : 94
-verythorough : 98
-exhaustive : 100
-refinementlimit <value>
Iterate only <value> refinement iterations on colors and
-refinementlimit <number>
Iterate <number> refinement iterations on colors and
weights. Minimum value is 1. Preset defaults are:
-fastest : 2
-fast : 3
-medium : 3
-thorough : 4
-exhaustive : 4
-fastest : 2
-fast : 3
-medium : 3
-thorough : 4
-verythorough : 4
-exhaustive : 4
-candidatelimit <value>
Trial only <value> candidate encodings for each block mode:
-candidatelimit <number>
Trial <number> candidate encodings for each block mode:
-fastest : 2
-fast : 3
-medium : 3
-thorough : 4
-exhaustive : 4
-fastest : 2
-fast : 3
-medium : 3
-thorough : 4
-verythorough : 6
-exhaustive : 8
-dblimit <number>
Stop compression work on a block as soon as the PSNR of the
@@ -324,37 +352,26 @@ ADVANCED COMPRESSION
ineffective for HDR textures. Preset defaults, where N is the
number of texels in a block, are:
-fastest : MAX(63-19*log10(N), 85-35*log10(N))
-fast : MAX(63-19*log10(N), 85-35*log10(N))
-medium : MAX(70-19*log10(N), 95-35*log10(N))
-thorough : MAX(77-19*log10(N), 105-35*log10(N))
-exhaustive : 999
-fastest : MAX(63-19*log10(N), 85-35*log10(N))
-fast : MAX(63-19*log10(N), 85-35*log10(N))
-medium : MAX(70-19*log10(N), 95-35*log10(N))
-thorough : MAX(77-19*log10(N), 105-35*log10(N))
-verythorough : 999
-exhaustive : 999
-2partitionlimitfactor <factor>
-[2|3]partitionlimitfactor <factor>
Stop compression work on a block after only testing blocks with
up to two partitions and one plane of weights, unless the two
up to 2/3 partitions and one plane of weights, unless the 2/3
partition error term is lower than the error term from encoding
with one partition by more than the specified factor. Preset
with 1/2 partitions by more than the specified factor. Preset
defaults are:
-fastest : 1.0
-fast : 1.0
-medium : 1.2
-thorough : 2.5
-exhaustive : 10.0
-3partitionlimitfactor <factor>
Stop compression work on a block after only testing blocks with
up to three partitions and one plane of weights, unless the three
partition error term is lower than the error term from encoding
with two partitions by more than the specified factor. Preset
defaults are:
-fastest : 1.00
-fast : 1.10
-medium : 1.25
-thorough : 1.25
-exhaustive : 10.00
-fastest : 1.00 | 1.00
-fast : 1.00 | 1.00
-medium : 1.10 | 1.05
-thorough : 1.35 | 1.15
-verythorough : 1.60 | 1.40
-exhaustive : 2.00 | 2.00
-2planelimitcorrelation <factor>
Stop compression after testing only one plane of weights, unless
@@ -362,53 +379,57 @@ ADVANCED COMPRESSION
components is below this factor. This option is ineffective for
normal maps. Preset defaults are:
-fastest : 0.50
-fast : 0.65
-medium : 0.85
-thorough : 0.95
-exhaustive : 0.99
-lowweightmodelimit <weight count>
Use a simpler weight search for weight counts less than or
equal to this threshold. Preset defaults are bitrate dependent:
-fastest : 25
-fast : 20
-medium : 16
-thorough : 12
-exhaustive : 0
-fastest : 0.50
-fast : 0.65
-medium : 0.85
-thorough : 0.95
-verythorough : 0.98
-exhaustive : 0.99
)"
// This split in the literals is needed for Visual Studio; the compiler
// will concatenate these two strings together ...
R"(
Other options
-------------
-esw <swizzle>
Swizzle the color components before compression. The swizzle is
specified using a 4-character string, which defines the output
format ordering. The characters may be taken from the set
[rgba01], selecting either input color components or a literal
zero or one. For example to swap the RG components, and replace
alpha with 1, the swizzle 'grb1' should be used.
Specify an encoding swizzle to reorder the color components
before compression. The swizzle is specified using a four
character string, which defines the format ordering used by
the compressor.
The input swizzle takes place before any compression, and all
error weighting applied using the -cw option is applied to the
post-swizzle component ordering.
The characters may be taken from the set [rgba01], selecting
either input color components or a literal zero or one. For
example to swap the RG components, and replace alpha with 1,
the swizzle 'grb1' should be used.
By default all 4 post-swizzle components are included in the
error metrics during compression. When using -esw to map two
compression error metrics. When using -esw to map two
component data to the L+A endpoint (e.g. -esw rrrg) the
luminance data stored in the RGB components will be weighted 3
times more strongly than the alpha component. This can be
corrected using the -cw option to zero the weights of unused
components; e.g. using -cw 1 0 0 1.
corrected using the -ssw option to specify which components
will be sampled at runtime e.g. -ssw ra.
-ssw <swizzle>
Specify a sampling swizzle to identify which color components
are actually read by the application shader program. For example,
using -ssw ra tells the compressor that the green and blue error
does not matter because the data is not actually read.
The sampling swizzle is based on the channel ordering after the
-esw transform has been applied. Note -ssw exposes the same
functionality as -cw, but in a more user-friendly form.
-dsw <swizzle>
Swizzle the color components after decompression. The swizzle is
specified using the same method as the -esw option, with support
for an additional "z" character. This is used to specify that
the compressed data stores an X+Y normal map, and that the Z
output component should be reconstructed from the two components
stored in the data. For the typical ASTC normal encoding, which
uses an 'rrrg' compression swizzle, you should specify an 'raz1'
Specify a decompression swizzle used to reorder the color
components after decompression. The swizzle is specified using
the same method as the -esw option, with support for an extra
"z" character. This is used to specify that the compressed data
stores an X+Y normal map, and that the Z output component
should be reconstructed from the two components stored in the
data. For the typical ASTC normal encoding, which uses an
'rrrg' compression swizzle, you should specify an 'raz1'
swizzle for decompression.
-yflip
@@ -527,7 +548,7 @@ QUICK REFERENCE
astcenc {-tl|-ts|-th|-tH} <in> <out> <blockdim> <quality> [options]
Mode -*l = linear LDR, -*s = sRGB LDR, -*h = HDR RGB/LDR A, -*H = HDR.
Quality = -fastest/-fast/-medium/-thorough/-exhaustive/a float [0-100].
Quality = -fastest/-fast/-medium/-thorough/-verythorough/-exhaustive/a float [0-100].
)";
/* See header for documentation. */
+300 -155
View File
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020-2022 Arm Limited
# Copyright 2020-2023 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
@@ -15,18 +15,32 @@
# under the License.
# ----------------------------------------------------------------------------
if(${UNIVERSAL_BUILD})
set(ASTC_TARGET astc${CODEC})
else()
set(ASTC_TARGET astc${CODEC}-${ISA_SIMD})
set(ASTCENC_TARGET astc${ASTCENC_CODEC}-${ASTCENC_ISA_SIMD})
project(${ASTCENC_TARGET})
# On CMake 3.25 or older CXX_COMPILER_FRONTEND_VARIANT is not always set
if(CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "")
set(CMAKE_CXX_COMPILER_FRONTEND_VARIANT "${CMAKE_CXX_COMPILER_ID}")
endif()
project(${ASTC_TARGET})
# Compiler accepts MSVC-style command line options
set(is_msvc_fe "$<STREQUAL:${CMAKE_CXX_COMPILER_FRONTEND_VARIANT},MSVC>")
# Compiler accepts GNU-style command line options
set(is_gnu_fe1 "$<STREQUAL:${CMAKE_CXX_COMPILER_FRONTEND_VARIANT},GNU>")
# Compiler accepts AppleClang-style command line options, which is also GNU-style
set(is_gnu_fe2 "$<STREQUAL:${CMAKE_CXX_COMPILER_FRONTEND_VARIANT},AppleClang>")
# Compiler accepts GNU-style command line options
set(is_gnu_fe "$<OR:${is_gnu_fe1},${is_gnu_fe2}>")
set(GNU_LIKE "GNU,Clang,AppleClang")
set(CLANG_LIKE "Clang,AppleClang")
# Compiler is Visual Studio cl.exe
set(is_msvccl "$<AND:${is_msvc_fe},$<CXX_COMPILER_ID:MSVC>>")
# Compiler is Visual Studio clangcl.exe
set(is_clangcl "$<AND:${is_msvc_fe},$<CXX_COMPILER_ID:Clang>>")
# Compiler is upstream clang with the standard frontend
set(is_clang "$<AND:${is_gnu_fe},$<CXX_COMPILER_ID:Clang,AppleClang>>")
add_library(${ASTC_TARGET}-static
add_library(${ASTCENC_TARGET}-static
STATIC
astcenc_averages_and_directions.cpp
astcenc_block_sizes.cpp
@@ -46,19 +60,55 @@ add_library(${ASTC_TARGET}-static
astcenc_partition_tables.cpp
astcenc_percentile_tables.cpp
astcenc_pick_best_endpoint_format.cpp
astcenc_platform_isa_detection.cpp
astcenc_quantization.cpp
astcenc_symbolic_physical.cpp
astcenc_weight_align.cpp
astcenc_weight_quant_xfer_tables.cpp)
target_include_directories(${ASTC_TARGET}-static
target_include_directories(${ASTCENC_TARGET}-static
PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:.>)
if(${CLI})
add_executable(${ASTC_TARGET}
# Optional shared-library build of the codec, enabled by ASTCENC_SHAREDLIB.
# The target is named ${ASTCENC_TARGET}-shared and is built directly from the
# listed codec sources rather than by linking the -static target.
# NOTE(review): this list is expected to stay in sync with the source list of
# the ${ASTCENC_TARGET}-static target - confirm whenever a source is added.
if(${ASTCENC_SHAREDLIB})
add_library(${ASTCENC_TARGET}-shared
SHARED
astcenc_averages_and_directions.cpp
astcenc_block_sizes.cpp
astcenc_color_quantize.cpp
astcenc_color_unquantize.cpp
astcenc_compress_symbolic.cpp
astcenc_compute_variance.cpp
astcenc_decompress_symbolic.cpp
astcenc_diagnostic_trace.cpp
astcenc_entry.cpp
astcenc_find_best_partitioning.cpp
astcenc_ideal_endpoints_and_weights.cpp
astcenc_image.cpp
astcenc_integer_sequence.cpp
astcenc_mathlib.cpp
astcenc_mathlib_softfloat.cpp
astcenc_partition_tables.cpp
astcenc_percentile_tables.cpp
astcenc_pick_best_endpoint_format.cpp
astcenc_quantization.cpp
astcenc_symbolic_physical.cpp
astcenc_weight_align.cpp
astcenc_weight_quant_xfer_tables.cpp)
# Expose this source directory to consumers in the build tree, and the
# install prefix root (".") to consumers of an installed package.
target_include_directories(${ASTCENC_TARGET}-shared
PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:.>)
endif()
if(${ASTCENC_CLI})
# Veneer is compiled without any extended ISA so we can safely do
# ISA compatibility checks without triggering a SIGILL
add_library(${ASTCENC_TARGET}-veneer
astcenccli_entry.cpp)
add_executable(${ASTCENC_TARGET}
astcenccli_error_metrics.cpp
astcenccli_image.cpp
astcenccli_image_external.cpp
@@ -67,220 +117,313 @@ if(${CLI})
astcenccli_toplevel.cpp
astcenccli_toplevel_help.cpp)
target_link_libraries(${ASTC_TARGET}
target_link_libraries(${ASTCENC_TARGET}
PRIVATE
${ASTC_TARGET}-static)
${ASTCENC_TARGET}-veneer
${ASTCENC_TARGET}-static)
endif()
macro(astcenc_set_properties NAME)
macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
target_compile_features(${NAME}
target_compile_features(${ASTCENC_TARGET_NAME}
PRIVATE
cxx_std_14)
target_compile_definitions(${NAME}
target_compile_definitions(${ASTCENC_TARGET_NAME}
PRIVATE
# MSVC defines
$<$<CXX_COMPILER_ID:MSVC>:_CRT_SECURE_NO_WARNINGS>)
$<${is_msvc_fe}:_CRT_SECURE_NO_WARNINGS>)
if(${DECOMPRESSOR})
target_compile_definitions(${NAME}
if(${ASTCENC_DECOMPRESSOR})
target_compile_definitions(${ASTCENC_TARGET_NAME}
PRIVATE
ASTCENC_DECOMPRESS_ONLY)
endif()
if(${BLOCK_MAX_TEXELS})
target_compile_definitions(${NAME}
if(${ASTCENC_BLOCK_MAX_TEXELS})
target_compile_definitions(${ASTCENC_TARGET_NAME}
PRIVATE
ASTCENC_BLOCK_MAX_TEXELS=${BLOCK_MAX_TEXELS})
ASTCENC_BLOCK_MAX_TEXELS=${ASTCENC_BLOCK_MAX_TEXELS})
endif()
if(${DIAGNOSTICS})
target_compile_definitions(${NAME}
if(${ASTCENC_DIAGNOSTICS})
target_compile_definitions(${ASTCENC_TARGET_NAME}
PUBLIC
ASTCENC_DIAGNOSTICS)
endif()
target_compile_options(${NAME}
target_compile_options(${ASTCENC_TARGET_NAME}
PRIVATE
# Use pthreads on Linux/macOS
$<$<PLATFORM_ID:Linux,Darwin>:-pthread>
# MSVC compiler defines
$<$<CXX_COMPILER_ID:MSVC>:/EHsc>
$<$<CXX_COMPILER_ID:MSVC>:/fp:strict>
$<${is_msvc_fe}:/EHsc>
$<${is_msvccl}:/wd4324>
# G++ and Clang++ compiler defines
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wall>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wextra>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wpedantic>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Werror>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wshadow>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wdouble-promotion>
$<${is_gnu_fe}:-Wall>
$<${is_gnu_fe}:-Wextra>
$<${is_gnu_fe}:-Wpedantic>
$<${is_gnu_fe}:-Werror>
$<${is_gnu_fe}:-Wshadow>
$<${is_gnu_fe}:-Wdouble-promotion>
$<${is_clang}:-Wdocumentation>
# Hide noise thrown up by Clang 10 and clang-cl
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-unknown-warning-option>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-c++98-compat-pedantic>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-c++98-c++11-compat-pedantic>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-float-equal>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-atomic-implicit-seq-cst>
$<${is_gnu_fe}:-Wno-unknown-warning-option>
$<${is_gnu_fe}:-Wno-c++98-compat-pedantic>
$<${is_gnu_fe}:-Wno-c++98-c++11-compat-pedantic>
$<${is_gnu_fe}:-Wno-float-equal>
$<${is_gnu_fe}:-Wno-deprecated-declarations>
$<${is_gnu_fe}:-Wno-atomic-implicit-seq-cst>
# Clang 10 also throws up warnings we need to investigate (ours)
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-cast-align>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-sign-conversion>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-implicit-int-conversion>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-shift-sign-overflow>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-format-nonliteral>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-reserved-identifier>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-cast-function-type>
$<${is_gnu_fe}:-Wno-cast-align>
$<${is_gnu_fe}:-Wno-sign-conversion>
$<${is_gnu_fe}:-Wno-implicit-int-conversion>
$<${is_gnu_fe}:-Wno-shift-sign-overflow>
$<${is_gnu_fe}:-Wno-format-nonliteral>
$<${is_gnu_fe}:-Wno-reserved-identifier>
$<${is_gnu_fe}:-Wno-cast-function-type>
$<$<CXX_COMPILER_ID:Clang>:-Wdocumentation>)
# Force DWARF4 for Valgrind profiling
$<$<AND:$<PLATFORM_ID:Linux,Darwin>,${is_clang}>:-gdwarf-4>
target_link_options(${NAME}
# Disable non-portable Windows.h warning (fixing it fails builds on MinGW)
$<$<AND:$<PLATFORM_ID:Windows>,${is_clang}>:-Wno-nonportable-system-include-path>)
target_link_options(${ASTCENC_TARGET_NAME}
PRIVATE
# Use pthreads on Linux/macOS
$<$<PLATFORM_ID:Linux,Darwin>:-pthread>)
if(${ASAN})
target_compile_options(${NAME}
if(${ASTCENC_ASAN})
target_compile_options(${ASTCENC_TARGET_NAME}
PRIVATE
$<$<CXX_COMPILER_ID:${CLANG_LIKE}>:-fsanitize=address>)
$<${is_clang}:-fsanitize=address>)
target_link_options(${NAME}
target_link_options(${ASTCENC_TARGET_NAME}
PRIVATE
$<$<CXX_COMPILER_ID:${CLANG_LIKE}>:-fsanitize=address>)
$<${is_clang}:-fsanitize=address>)
endif()
if(${NO_INVARIANCE})
target_compile_definitions(${NAME}
PRIVATE
ASTCENC_NO_INVARIANCE=1)
if(NOT ${ASTCENC_INVARIANCE})
target_compile_definitions(${ASTCENC_TARGET_NAME}
PRIVATE
ASTCENC_NO_INVARIANCE=1)
# For Visual Studio prior to 2022 (compiler < 19.30) /fp:precise
# For Visual Studio 2022 (compiler >= 19.30) /fp:precise and /fp:contract
# For Visual Studio 2022 ClangCL seems to have accidentally enabled contraction by default,
# so behaves differently to CL.exe. Use the -Xclang argument as a workaround to access the
# GNU-style switch that controls contraction, on the assumption this gets fixed and disabled.
# Note ClangCL does not accept /fp:contract as an argument as of v15.0.7.
target_compile_options(${ASTCENC_TARGET_NAME}
PRIVATE
$<${is_msvccl}:/fp:precise>
$<${is_clangcl}:/fp:precise>
$<$<AND:${is_msvccl},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,19.30>>:/fp:contract>
$<$<AND:${is_clangcl},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,14.0.0>>:-Xclang -ffp-contract=fast>
$<$<AND:${is_clang},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,10.0.0>>:-ffp-model=precise>
$<${is_gnu_fe}:-ffp-contract=fast>)
else()
# For Visual Studio prior to 2022 (compiler < 19.30) /fp:strict
# For Visual Studio 2022 (compiler >= 19.30) /fp:precise
# For Visual Studio 2022 ClangCL seems to have accidentally enabled contraction by default,
# so behaves differently to CL.exe. Use the -Xclang argument as a workaround to access the
# GNU-style switch that controls contraction, and force it to be disabled.
target_compile_options(${ASTCENC_TARGET_NAME}
PRIVATE
$<$<AND:${is_msvccl},$<VERSION_LESS:$<CXX_COMPILER_VERSION>,19.30>>:/fp:strict>
$<$<AND:${is_msvccl},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,19.30>>:/fp:precise>
$<${is_clangcl}:/fp:precise>
$<$<AND:${is_clangcl},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,14.0.0>>:-Xclang -ffp-contract=off>
$<$<AND:${is_clang},$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,10.0.0>>:-ffp-model=precise>
$<${is_gnu_fe}:-ffp-contract=off>)
endif()
if(${CLI})
if(${ASTCENC_CLI})
# Enable LTO on release builds
set_property(TARGET ${NAME}
set_property(TARGET ${ASTCENC_TARGET_NAME}
PROPERTY
INTERPROCEDURAL_OPTIMIZATION_RELEASE True)
# Use a static runtime on MSVC builds (ignored on non-MSVC compilers)
set_property(TARGET ${NAME}
set_property(TARGET ${ASTCENC_TARGET_NAME}
PROPERTY
MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
endif()
# Set up configuration for SIMD ISA builds
if(${ISA_SIMD} MATCHES "none")
if(NOT ${UNIVERSAL_BUILD})
target_compile_definitions(${NAME}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
endif()
elseif(${ISA_SIMD} MATCHES "neon")
if(NOT ${UNIVERSAL_BUILD})
target_compile_definitions(${NAME}
PRIVATE
ASTCENC_NEON=1
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
endif()
elseif((${ISA_SIMD} MATCHES "sse2") OR (${UNIVERSAL_BUILD} AND ${ISA_SSE2}))
if(NOT ${UNIVERSAL_BUILD})
target_compile_definitions(${NAME}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=20
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
endif()
# These settings are needed on AppleClang as SSE4.1 is on by default
# Suppress unused argument for macOS universal build behavior
target_compile_options(${NAME}
if(${ASTCENC_ISA_SIMD} MATCHES "none")
target_compile_definitions(${ASTCENC_TARGET_NAME}
PRIVATE
$<$<CXX_COMPILER_ID:AppleClang>:-msse2>
$<$<CXX_COMPILER_ID:AppleClang>:-mno-sse4.1>
$<$<CXX_COMPILER_ID:AppleClang>:-Wno-unused-command-line-argument>)
ASTCENC_NEON=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
elseif((${ISA_SIMD} MATCHES "sse4.1") OR (${UNIVERSAL_BUILD} AND ${ISA_SSE41}))
if(NOT ${UNIVERSAL_BUILD})
target_compile_definitions(${NAME}
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
target_compile_definitions(${ASTCENC_TARGET_NAME}
PRIVATE
ASTCENC_NEON=1
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
# Workaround MSVC codegen bug for NEON builds on VS 2022 17.2 or older
# https://developercommunity.visualstudio.com/t/inlining-turns-constant-into-register-operand-for/1394798
if((CMAKE_CXX_COMPILER_ID MATCHES "MSVC") AND (MSVC_VERSION LESS 1933))
target_compile_options(${ASTCENC_TARGET_NAME}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=0
ASTCENC_POPCNT=1
ASTCENC_F16C=0)
$<${is_msvccl}:/d2ssa-cfg-sink->)
endif()
# Suppress unused argument for macOS universal build behavior
target_compile_options(${NAME}
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
target_compile_definitions(${ASTCENC_TARGET_NAME}
PRIVATE
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-msse4.1 -mpopcnt>
$<$<CXX_COMPILER_ID:AppleClang>:-Wno-unused-command-line-argument>)
ASTCENC_NEON=0
ASTCENC_SSE=20
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
elseif((${ISA_SIMD} MATCHES "avx2") OR (${UNIVERSAL_BUILD} AND ${ISA_AVX2}))
if(NOT ${UNIVERSAL_BUILD})
target_compile_definitions(${NAME}
# Force SSE2 on AppleClang (normally SSE4.1 is the default)
target_compile_options(${ASTCENC_TARGET_NAME}
PRIVATE
$<${is_clangcl}:-msse2>
$<${is_gnu_fe}:-msse2>
$<${is_gnu_fe}:-mno-sse4.1>
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
target_compile_definitions(${ASTCENC_TARGET_NAME}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=0
ASTCENC_POPCNT=1
ASTCENC_F16C=0)
if (${ASTCENC_IS_VENEER})
# Force SSE2 on AppleClang (normally SSE4.1 is the default)
target_compile_options(${ASTCENC_TARGET_NAME}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=2
ASTCENC_POPCNT=1
ASTCENC_F16C=1)
$<${is_gnu_fe}:-msse2>
$<${is_gnu_fe}:-mno-sse4.1>
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
else()
target_compile_options(${ASTCENC_TARGET_NAME}
PRIVATE
$<${is_clangcl}:-msse4.1 -mpopcnt>
$<${is_gnu_fe}:-msse4.1 -mpopcnt>
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
endif()
# Suppress unused argument for macOS universal build behavior
target_compile_options(${NAME}
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
target_compile_definitions(${ASTCENC_TARGET_NAME}
PRIVATE
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mavx2 -mpopcnt -mf16c>
$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>
$<$<CXX_COMPILER_ID:AppleClang>:-Wno-unused-command-line-argument>)
ASTCENC_NEON=0
ASTCENC_SSE=41
ASTCENC_AVX=2
ASTCENC_POPCNT=1
ASTCENC_F16C=1)
if (${ASTCENC_IS_VENEER})
# Force SSE2 on AppleClang (normally SSE4.1 is the default)
target_compile_options(${ASTCENC_TARGET_NAME}
PRIVATE
$<${is_gnu_fe}:-msse2>
$<${is_gnu_fe}:-mno-sse4.1>
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
else()
target_compile_options(${ASTCENC_TARGET_NAME}
PRIVATE
$<${is_msvc_fe}:/arch:AVX2>
$<${is_clangcl}:-mavx2 -mpopcnt -mf16c>
$<${is_gnu_fe}:-mavx2 -mpopcnt -mf16c>
$<${is_gnu_fe}:-Wno-unused-command-line-argument>)
endif()
# Non-invariant builds enable us to loosen the compiler constraints on
# floating point, but this is only worth doing on CPUs with AVX2 because
# this implies we can also enable the FMA instruction set extensions
# which significantly improve performance. Note that this DOES reduce
# image quality by up to 0.2 dB (normally much less), but buys an
# average of 10-15% performance improvement ...
if((NOT ${ASTCENC_INVARIANCE}) AND (NOT ${ASTCENC_IS_VENEER}))
target_compile_options(${ASTCENC_TARGET_NAME}
PRIVATE
$<${is_gnu_fe}:-mfma>)
endif()
endif()
endmacro()
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
string(CONCAT EXTERNAL_CXX_FLAGS
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -fno-strict-aliasing>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-unused-parameter>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-old-style-cast>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-double-promotion>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-zero-as-null-pointer-constant>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-disabled-macro-expansion>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-reserved-id-macro>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-extra-semi-stmt>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-implicit-fallthrough>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-tautological-type-limit-compare>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-cast-qual>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-reserved-identifier>"
" $<$<CXX_COMPILER_ID:Clang>: -Wno-missing-prototypes>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-suggest-override>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-used-but-marked-unused>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-noexcept-type>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-comma>"
" $<$<NOT:$<CXX_COMPILER_ID:MSVC>>: -Wno-c99-extensions>")
string(CONCAT EXTERNAL_CXX_FLAGS
" $<${is_gnu_fe}: -fno-strict-aliasing>"
" $<${is_gnu_fe}: -Wno-unused-parameter>"
" $<${is_gnu_fe}: -Wno-old-style-cast>"
" $<${is_gnu_fe}: -Wno-double-promotion>"
" $<${is_gnu_fe}: -Wno-zero-as-null-pointer-constant>"
" $<${is_gnu_fe}: -Wno-disabled-macro-expansion>"
" $<${is_gnu_fe}: -Wno-reserved-id-macro>"
" $<${is_gnu_fe}: -Wno-extra-semi-stmt>"
" $<${is_gnu_fe}: -Wno-implicit-fallthrough>"
" $<${is_gnu_fe}: -Wno-tautological-type-limit-compare>"
" $<${is_gnu_fe}: -Wno-cast-qual>"
" $<${is_gnu_fe}: -Wno-reserved-identifier>"
" $<${is_clang}: -Wno-missing-prototypes>"
" $<${is_gnu_fe}: -Wno-missing-field-initializers>"
" $<${is_gnu_fe}: -Wno-suggest-override>"
" $<${is_gnu_fe}: -Wno-used-but-marked-unused>"
" $<${is_gnu_fe}: -Wno-noexcept-type>"
" $<${is_gnu_fe}: -Wno-comma>"
" $<${is_gnu_fe}: -Wno-c99-extensions>")
set_source_files_properties(astcenccli_image_external.cpp
PROPERTIES
COMPILE_FLAGS ${EXTERNAL_CXX_FLAGS})
set_source_files_properties(astcenccli_image_external.cpp
PROPERTIES
COMPILE_FLAGS ${EXTERNAL_CXX_FLAGS})
astcenc_set_properties(${ASTCENC_TARGET}-static OFF)
target_compile_options(${ASTCENC_TARGET}-static
PRIVATE
$<${is_msvc_fe}:/W4>)
# Configure the optional shared-library target. It gets the common codec
# settings with the veneer flag OFF, so it is built with the selected SIMD ISA.
if(${ASTCENC_SHAREDLIB})
astcenc_set_properties(${ASTCENC_TARGET}-shared OFF)
# NOTE(review): ASTCENC_DYNAMIC_LIBRARY presumably switches on the public API
# export annotations in the codec header - confirm against astcenc.h.
target_compile_definitions(${ASTCENC_TARGET}-shared
PRIVATE
ASTCENC_DYNAMIC_LIBRARY=1)
# GNU-style front ends: hide symbols by default (-fvisibility=hidden).
# MSVC-style front ends: build at warning level 4.
target_compile_options(${ASTCENC_TARGET}-shared
PRIVATE
$<${is_gnu_fe}:-fvisibility=hidden>
$<${is_msvc_fe}:/W4>)
# The install rule is only registered when this is not a universal build.
if(NOT ${ASTCENC_UNIVERSAL_BUILD})
install(TARGETS ${ASTCENC_TARGET}-shared)
endif()
endif()
astcenc_set_properties(${ASTC_TARGET}-static)
if(${ASTCENC_CLI})
astcenc_set_properties(${ASTCENC_TARGET}-veneer ON)
astcenc_set_properties(${ASTCENC_TARGET} OFF)
if(${CLI})
astcenc_set_properties(${ASTC_TARGET})
target_compile_options(${ASTCENC_TARGET}
PRIVATE
$<${is_msvc_fe}:/W3>)
target_compile_options(${ASTCENC_TARGET}-veneer
PRIVATE
$<${is_msvc_fe}:/W3>)
string(TIMESTAMP astcencoder_YEAR "%Y")
@@ -289,9 +432,11 @@ if(${CLI})
astcenccli_version.h
ESCAPE_QUOTES @ONLY)
target_include_directories(${ASTC_TARGET}
target_include_directories(${ASTCENC_TARGET}
PRIVATE
${CMAKE_CURRENT_BINARY_DIR})
install(TARGETS ${ASTC_TARGET} DESTINATION ${PACKAGE_ROOT})
if(NOT ${ASTCENC_UNIVERSAL_BUILD})
install(TARGETS ${ASTCENC_TARGET})
endif()
endif()
+126 -29
View File
@@ -1,4 +1,4 @@
/* stb_image - v2.27 - public domain image loader - http://nothings.org/stb
/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb
no warranty implied; use at your own risk
Do this:
@@ -48,6 +48,7 @@ LICENSE
RECENT REVISION HISTORY:
2.28 (2023-01-29) many error fixes, security errors, just tons of stuff
2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
2.26 (2020-07-13) many minor fixes
2.25 (2020-02-02) fix warnings
@@ -108,7 +109,7 @@ RECENT REVISION HISTORY:
Cass Everitt Ryamond Barbiero github:grim210
Paul Du Bois Engin Manap Aldo Culquicondor github:sammyhw
Philipp Wiesemann Dale Weiler Oriol Ferrer Mesia github:phprus
Josh Tobin Matthew Gregan github:poppolopoppo
Josh Tobin Neil Bickford Matthew Gregan github:poppolopoppo
Julian Raschke Gregory Mullen Christian Floisand github:darealshinji
Baldur Karlsson Kevin Schmidt JR Smith github:Michaelangel007
Brad Weinberger Matvey Cherevko github:mosra
@@ -140,7 +141,7 @@ RECENT REVISION HISTORY:
// // ... x = width, y = height, n = # 8-bit components per pixel ...
// // ... replace '0' with '1'..'4' to force that many components per pixel
// // ... but 'n' will always be the number that it would have been if you said 0
// stbi_image_free(data)
// stbi_image_free(data);
//
// Standard parameters:
// int *x -- outputs image width in pixels
@@ -635,7 +636,7 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
#endif
#endif
#ifdef _MSC_VER
#if defined(_MSC_VER) || defined(__SYMBIAN32__)
typedef unsigned short stbi__uint16;
typedef signed short stbi__int16;
typedef unsigned int stbi__uint32;
@@ -1032,7 +1033,7 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add)
}
// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
{
return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
@@ -1055,7 +1056,7 @@ static void *stbi__malloc_mad3(int a, int b, int c, int add)
return stbi__malloc(a*b*c + add);
}
#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
{
if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
@@ -1063,6 +1064,23 @@ static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
}
#endif
// Overflow guard for signed int addition: returns 1 when a + b can be computed
// without leaving the [INT_MIN, INT_MAX] range, 0 when it would overflow.
// The check itself never evaluates a + b, so it cannot overflow either.
static int stbi__addints_valid(int a, int b)
{
   int a_nonneg = (a >= 0);
   int b_nonneg = (b >= 0);

   // operands of opposite sign pull towards zero, so the sum always fits
   if (a_nonneg != b_nonneg) return 1;

   // both non-negative: a + b <= INT_MAX, rearranged so the subtraction is safe (b >= 0)
   if (a_nonneg) return a <= INT_MAX - b;

   // both negative: a + b >= INT_MIN, rearranged so the subtraction is safe (b < 0)
   return a >= INT_MIN - b;
}
// returns 1 if the product of two signed shorts is valid (fits in SHRT_MIN..SHRT_MAX), 0 on overflow.
//
// The product is never computed; each case compares a against a bound obtained by
// dividing the relevant limit by b. C integer division truncates toward zero, which
// yields exactly the tightest bound in every sign combination below.
static int stbi__mul2shorts_valid(short a, short b)
{
   if (b == 0) return 1;                    // multiplication by 0 is always 0
   if (b == -1) return a >= -SHRT_MAX;      // -a overflows only when a is below -SHRT_MAX (i.e. a == SHRT_MIN);
                                            // handled separately so SHRT_MIN / b is never evaluated with b == -1
   if (a >= 0) {
      if (b > 0) return a <= SHRT_MAX / b;  // both non-negative: need a * b <= SHRT_MAX
      return a <= SHRT_MIN / b;             // a >= 0, b < -1: need a * b >= SHRT_MIN (dividing by b < 0 flips the test)
   }
   if (b > 0) return a >= SHRT_MIN / b;     // a < 0, b > 0: need a * b >= SHRT_MIN
   return a >= SHRT_MAX / b;                // both negative: product is positive, need a * b <= SHRT_MAX
                                            // (dividing by b < 0 flips the test)
}
// stbi__err - error
// stbi__errpf - error returning pointer to float
// stbi__errpuc - error returning pointer to unsigned char
@@ -1985,9 +2003,12 @@ static int stbi__build_huffman(stbi__huffman *h, int *count)
int i,j,k=0;
unsigned int code;
// build size list for each symbol (from JPEG spec)
for (i=0; i < 16; ++i)
for (j=0; j < count[i]; ++j)
for (i=0; i < 16; ++i) {
for (j=0; j < count[i]; ++j) {
h->size[k++] = (stbi_uc) (i+1);
if(k >= 257) return stbi__err("bad size list","Corrupt JPEG");
}
}
h->size[k] = 0;
// compute actual symbols (from jpeg spec)
@@ -2112,6 +2133,8 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
// convert the huffman code to the symbol id
c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
if(c < 0 || c >= 256) // symbol id out of bounds!
return -1;
STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
// convert the id to a symbol
@@ -2130,6 +2153,7 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
unsigned int k;
int sgn;
if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
k = stbi_lrot(j->code_buffer, n);
@@ -2144,6 +2168,7 @@ stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
{
unsigned int k;
if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
k = stbi_lrot(j->code_buffer, n);
j->code_buffer = k & ~stbi__bmask[n];
k &= stbi__bmask[n];
@@ -2155,6 +2180,7 @@ stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
{
unsigned int k;
if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing
k = j->code_buffer;
j->code_buffer <<= 1;
--j->code_bits;
@@ -2192,8 +2218,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
memset(data,0,64*sizeof(data[0]));
diff = t ? stbi__extend_receive(j, t) : 0;
if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
dc = j->img_comp[b].dc_pred + diff;
j->img_comp[b].dc_pred = dc;
if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
data[0] = (short) (dc * dequant[0]);
// decode AC components, see JPEG spec
@@ -2207,6 +2235,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
if (r) { // fast-AC path
k += (r >> 4) & 15; // run
s = r & 15; // combined length
if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
j->code_buffer <<= s;
j->code_bits -= s;
// decode into unzigzag'd location
@@ -2246,8 +2275,10 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__
if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
diff = t ? stbi__extend_receive(j, t) : 0;
if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
dc = j->img_comp[b].dc_pred + diff;
j->img_comp[b].dc_pred = dc;
if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
data[0] = (short) (dc * (1 << j->succ_low));
} else {
// refinement scan for DC coefficient
@@ -2282,6 +2313,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
if (r) { // fast-AC path
k += (r >> 4) & 15; // run
s = r & 15; // combined length
if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
j->code_buffer <<= s;
j->code_bits -= s;
zig = stbi__jpeg_dezigzag[k++];
@@ -3102,6 +3134,7 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
sizes[i] = stbi__get8(z->s);
n += sizes[i];
}
if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values!
L -= 17;
if (tc == 0) {
if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
@@ -3267,6 +3300,13 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
}
// check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
// and I've never seen a non-corrupted JPEG file actually use them
for (i=0; i < s->img_n; ++i) {
if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
}
// compute interleaved mcu info
z->img_h_max = h_max;
z->img_v_max = v_max;
@@ -3344,6 +3384,28 @@ static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
return 1;
}
// Consumes bytes from the JPEG's input stream until something that looks like
// a marker is found or the stream ends.
//
// Returns the byte following a run of one or more 0xff bytes when that byte is
// neither 0x00 nor 0xff; returns STBI__MARKER_none if end of stream is reached first.
static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
{
   for (;;) {
      int byte;
      if (stbi__at_eof(j->s)) break;
      byte = stbi__get8(j->s);
      // 0xff may introduce a marker; anything else is junk and is simply skipped
      while (byte == 0xff) {
         if (stbi__at_eof(j->s)) return STBI__MARKER_none;
         byte = stbi__get8(j->s);
         // 0x00 is a stuffed zero: the inner loop condition fails and plain scanning resumes.
         // 0xff is a repeated lead-in byte: stay in the inner loop and look at the next byte.
         if (byte == 0x00 || byte == 0xff) continue;
         // anything else looks like an actual marker, hand it back to the caller
         return byte;
      }
   }
   return STBI__MARKER_none;
}
// decode image to YCbCr format
static int stbi__decode_jpeg_image(stbi__jpeg *j)
{
@@ -3360,25 +3422,22 @@ static int stbi__decode_jpeg_image(stbi__jpeg *j)
if (!stbi__process_scan_header(j)) return 0;
if (!stbi__parse_entropy_coded_data(j)) return 0;
if (j->marker == STBI__MARKER_none ) {
// handle 0s at the end of image data from IP Kamera 9060
while (!stbi__at_eof(j->s)) {
int x = stbi__get8(j->s);
if (x == 255) {
j->marker = stbi__get8(j->s);
break;
}
}
j->marker = stbi__skip_jpeg_junk_at_end(j);
// if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
}
m = stbi__get_marker(j);
if (STBI__RESTART(m))
m = stbi__get_marker(j);
} else if (stbi__DNL(m)) {
int Ld = stbi__get16be(j->s);
stbi__uint32 NL = stbi__get16be(j->s);
if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
m = stbi__get_marker(j);
} else {
if (!stbi__process_marker(j, m)) return 0;
if (!stbi__process_marker(j, m)) return 1;
m = stbi__get_marker(j);
}
m = stbi__get_marker(j);
}
if (j->progressive)
stbi__jpeg_finish(j);
@@ -3969,6 +4028,7 @@ static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int re
unsigned char* result;
stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
if (!j) return stbi__errpuc("outofmem", "Out of memory");
memset(j, 0, sizeof(stbi__jpeg));
STBI_NOTUSED(ri);
j->s = s;
stbi__setup_jpeg(j);
@@ -3982,6 +4042,7 @@ static int stbi__jpeg_test(stbi__context *s)
int r;
stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
if (!j) return stbi__err("outofmem", "Out of memory");
memset(j, 0, sizeof(stbi__jpeg));
j->s = s;
stbi__setup_jpeg(j);
r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
@@ -4007,6 +4068,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
int result;
stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
if (!j) return stbi__err("outofmem", "Out of memory");
memset(j, 0, sizeof(stbi__jpeg));
j->s = s;
result = stbi__jpeg_info_raw(j, x, y, comp);
STBI_FREE(j);
@@ -4249,11 +4311,12 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
a->zout = zout;
return 1;
}
if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
z -= 257;
len = stbi__zlength_base[z];
if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
z = stbi__zhuffman_decode(a, &a->z_distance);
if (z < 0) return stbi__err("bad huffman code","Corrupt PNG");
if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
dist = stbi__zdist_base[z];
if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
@@ -4948,7 +5011,7 @@ STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
STBIDEF void stbi__unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
{
stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
stbi__unpremultiply_on_load_set = 1;
@@ -5057,14 +5120,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
if (!pal_img_n) {
s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
if (scan == STBI__SCAN_header) return 1;
} else {
// if paletted, then pal_n is our final components, and
// img_n is # components to decompress/filter.
s->img_n = 1;
if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
// if SCAN_header, have to scan to see if we have a tRNS
}
// even with SCAN_header, have to scan to see if we have a tRNS
break;
}
@@ -5096,6 +5158,8 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
has_trans = 1;
// non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
if (z->depth == 16) {
for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
} else {
@@ -5108,7 +5172,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
case STBI__PNG_TYPE('I','D','A','T'): {
if (first) return stbi__err("first not IHDR", "Corrupt PNG");
if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
if (scan == STBI__SCAN_header) {
// header scan definitely stops at first IDAT
if (pal_img_n)
s->img_n = pal_img_n;
return 1;
}
if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
if ((int)(ioff + c.length) < (int)ioff) return 0;
if (ioff + c.length > idata_limit) {
stbi__uint32 idata_limit_old = idata_limit;
@@ -5491,8 +5561,22 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
psize = (info.offset - info.extra_read - info.hsz) >> 2;
}
if (psize == 0) {
if (info.offset != s->callback_already_read + (s->img_buffer - s->img_buffer_original)) {
return stbi__errpuc("bad offset", "Corrupt BMP");
// accept some number of extra bytes after the header, but if the offset points either to before
// the header ends or implies a large amount of extra data, reject the file as malformed
int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
int header_limit = 1024; // max we actually read is below 256 bytes currently.
int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
return stbi__errpuc("bad header", "Corrupt BMP");
}
// we established that bytes_read_so_far is positive and sensible.
// the first half of this test rejects offsets that are either too small positives, or
// negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
// ensures the number computed in the second half of the test can't overflow.
if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
return stbi__errpuc("bad offset", "Corrupt BMP");
} else {
stbi__skip(s, info.offset - bytes_read_so_far);
}
}
@@ -7180,12 +7264,12 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
// Run
value = stbi__get8(s);
count -= 128;
if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
for (z = 0; z < count; ++z)
scanline[i++ * 4 + k] = value;
} else {
// Dump
if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
for (z = 0; z < count; ++z)
scanline[i++ * 4 + k] = stbi__get8(s);
}
@@ -7439,10 +7523,17 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req
out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
if (!out) return stbi__errpuc("outofmem", "Out of memory");
stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8));
if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
STBI_FREE(out);
return stbi__errpuc("bad PNM", "PNM file truncated");
}
if (req_comp && req_comp != s->img_n) {
out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
if (ri->bits_per_channel == 16) {
out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y);
} else {
out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
}
if (out == NULL) return out; // stbi__convert_format frees input on failure
}
return out;
@@ -7479,6 +7570,8 @@ static int stbi__pnm_getinteger(stbi__context *s, char *c)
while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
value = value*10 + (*c - '0');
*c = (char) stbi__get8(s);
if((value > 214748364) || (value == 214748364 && *c > '7'))
return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
}
return value;
@@ -7509,9 +7602,13 @@ static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
stbi__pnm_skip_whitespace(s, &c);
*x = stbi__pnm_getinteger(s, &c); // read width
if(*x == 0)
return stbi__err("invalid width", "PPM image header had zero or overflowing width");
stbi__pnm_skip_whitespace(s, &c);
*y = stbi__pnm_getinteger(s, &c); // read height
if (*y == 0)
return stbi__err("invalid width", "PPM image header had zero or overflowing width");
stbi__pnm_skip_whitespace(s, &c);
maxv = stbi__pnm_getinteger(s, &c); // read max value
+8 -8
View File
@@ -511,7 +511,7 @@ static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, c
STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
{
stbi__write_context s {};
stbi__write_context s = { 0 };
stbi__start_write_callbacks(&s, func, context);
return stbi_write_bmp_core(&s, x, y, comp, data);
}
@@ -519,7 +519,7 @@ STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x,
#ifndef STBI_WRITE_NO_STDIO
STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
{
stbi__write_context s {};
stbi__write_context s = { 0 };
if (stbi__start_write_file(&s,filename)) {
int r = stbi_write_bmp_core(&s, x, y, comp, data);
stbi__end_write_file(&s);
@@ -610,7 +610,7 @@ static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, v
STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
{
stbi__write_context s {};
stbi__write_context s = { 0 };
stbi__start_write_callbacks(&s, func, context);
return stbi_write_tga_core(&s, x, y, comp, (void *) data);
}
@@ -618,7 +618,7 @@ STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x,
#ifndef STBI_WRITE_NO_STDIO
STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
{
stbi__write_context s {};
stbi__write_context s = { 0 };
if (stbi__start_write_file(&s,filename)) {
int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
stbi__end_write_file(&s);
@@ -786,14 +786,14 @@ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, f
STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
{
stbi__write_context s {};
stbi__write_context s = { 0 };
stbi__start_write_callbacks(&s, func, context);
return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
}
STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
{
stbi__write_context s {};
stbi__write_context s = { 0 };
if (stbi__start_write_file(&s,filename)) {
int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
stbi__end_write_file(&s);
@@ -1606,7 +1606,7 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
{
stbi__write_context s {};
stbi__write_context s = { 0 };
stbi__start_write_callbacks(&s, func, context);
return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
}
@@ -1615,7 +1615,7 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x,
#ifndef STBI_WRITE_NO_STDIO
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
{
stbi__write_context s {};
stbi__write_context s = { 0 };
if (stbi__start_write_file(&s,filename)) {
int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
stbi__end_write_file(&s);
+18 -8
View File
@@ -21987,15 +21987,25 @@ wuffs_adler32__hasher__up_arm_neon(
}
v_p.len = 0;
}
static const uint16x4_t table_0 {32, 31, 30, 29};
static const uint16x4_t table_1 {28, 27, 26, 25};
static const uint16x4_t table_2 {24, 23, 22, 21};
static const uint16x4_t table_3 {20, 19, 18, 17};
static const uint16x4_t table_4 {16, 15, 14, 13};
static const uint16x4_t table_5 {12, 11, 10, 9};
static const uint16x4_t table_6 { 8, 7, 6, 5};
static const uint16x4_t table_7 { 4, 3, 2, 1};
v_v2 = vshlq_n_u32(v_v2, 5);
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col0), ((uint16x4_t){32, 31, 30, 29}));
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col0), ((uint16x4_t){28, 27, 26, 25}));
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col1), ((uint16x4_t){24, 23, 22, 21}));
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col1), ((uint16x4_t){20, 19, 18, 17}));
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col2), ((uint16x4_t){16, 15, 14, 13}));
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col2), ((uint16x4_t){12, 11, 10, 9}));
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col3), ((uint16x4_t){8, 7, 6, 5}));
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col3), ((uint16x4_t){4, 3, 2, 1}));
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col0), table_0);
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col0), table_1);
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col1), table_2);
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col1), table_3);
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col2), table_4);
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col2), table_5);
v_v2 = vmlal_u16(v_v2, vget_low_u16(v_col3), table_6);
v_v2 = vmlal_u16(v_v2, vget_high_u16(v_col3), table_7);
v_sum1 = vpadd_u32(vget_low_u32(v_v1), vget_high_u32(v_v1));
v_sum2 = vpadd_u32(vget_low_u32(v_v2), vget_high_u32(v_v2));
v_sum12 = vpadd_u32(v_sum1, v_sum2);
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.4642,0.9206,0.1086,5.4306
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6113,0.9023,0.1042,5.6594
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1249,0.9088,0.1151,5.1237
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.8620,0.9039,0.1064,5.5433
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4006,0.9604,0.1386,4.2548
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4855,0.9261,0.1123,5.2511
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8798,0.9130,0.1129,5.2252
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5073,0.9110,0.1164,5.0690
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9690,0.9134,0.1130,5.2194
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8563,0.9531,0.1314,4.4902
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9117,0.9368,0.1226,4.8125
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9417,0.9174,0.1235,4.7743
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6901,0.9159,0.1247,4.7282
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6034,0.9219,0.1237,4.7695
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4690,0.9657,0.1410,4.1836
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6259,1.0036,0.1842,3.2028
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.0949,0.9746,0.1730,3.4098
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4004,0.9796,0.1780,3.3143
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5218,0.9778,0.1735,3.3987
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6552,1.0249,0.1933,3.0509
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2083,1.0440,0.2153,2.7398
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1174,1.0073,0.1992,2.9616
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3098,1.0129,0.2058,2.8664
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7317,1.0167,0.2065,2.8566
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0473,1.0679,0.2263,2.6068
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.4642 0.9206 0.1086 5.4306
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.6113 0.9023 0.1042 5.6594
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.1249 0.9088 0.1151 5.1237
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.8620 0.9039 0.1064 5.5433
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.4006 0.9604 0.1386 4.2548
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.4855 0.9261 0.1123 5.2511
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.8798 0.9130 0.1129 5.2252
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.5073 0.9110 0.1164 5.0690
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.9690 0.9134 0.1130 5.2194
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.8563 0.9531 0.1314 4.4902
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9117 0.9368 0.1226 4.8125
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.9417 0.9174 0.1235 4.7743
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.6901 0.9159 0.1247 4.7282
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.6034 0.9219 0.1237 4.7695
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.4690 0.9657 0.1410 4.1836
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.6259 1.0036 0.1842 3.2028
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.0949 0.9746 0.1730 3.4098
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.4004 0.9796 0.1780 3.3143
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.5218 0.9778 0.1735 3.3987
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.6552 1.0249 0.1933 3.0509
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2083 1.0440 0.2153 2.7398
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.1174 1.0073 0.1992 2.9616
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.3098 1.0129 0.2058 2.8664
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.7317 1.0167 0.2065 2.8566
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0473 1.0679 0.2263 2.6068
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2326,0.8834,0.0732,8.0546
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2416,0.8664,0.0739,7.9788
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4153,0.8676,0.0738,7.9926
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.6138,0.8715,0.0738,7.9920
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.0126,0.9023,0.0794,7.4294
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3165,0.8916,0.0804,7.3356
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4653,0.8779,0.0810,7.2802
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0821,0.8755,0.0807,7.3127
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7249,0.8807,0.0825,7.1537
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.4955,0.9102,0.0845,6.9824
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8196,0.8892,0.0779,7.5716
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6921,0.8750,0.0797,7.4031
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4519,0.8730,0.0779,7.5736
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3978,0.8744,0.0797,7.3992
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3170,0.9022,0.0810,7.2811
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5303,0.9379,0.1203,4.9045
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8147,0.9207,0.1202,4.9074
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1762,0.9168,0.1183,4.9860
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3622,0.9242,0.1215,4.8552
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.5936,0.9590,0.1270,4.6442
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1316,0.9686,0.1448,4.0723
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6209,0.9452,0.1410,4.1826
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0262,0.9435,0.1409,4.1869
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6170,0.9515,0.1447,4.0759
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0061,0.9935,0.1550,3.8064
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.2326 0.8834 0.0732 8.0546
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.2416 0.8664 0.0739 7.9788
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 48.4153 0.8676 0.0738 7.9926
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.6138 0.8715 0.0738 7.9920
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.0126 0.9023 0.0794 7.4294
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.3165 0.8916 0.0804 7.3356
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.4653 0.8779 0.0810 7.2802
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.0821 0.8755 0.0807 7.3127
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.7249 0.8807 0.0825 7.1537
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.4955 0.9102 0.0845 6.9824
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.8196 0.8892 0.0779 7.5716
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.6921 0.8750 0.0797 7.4031
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.4519 0.8730 0.0779 7.5736
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.3978 0.8744 0.0797 7.3992
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.3170 0.9022 0.0810 7.2811
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.5303 0.9379 0.1203 4.9045
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 44.8147 0.9207 0.1202 4.9074
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.1762 0.9168 0.1183 4.9860
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.3622 0.9242 0.1215 4.8552
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.5936 0.9590 0.1270 4.6442
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.1316 0.9686 0.1448 4.0723
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 40.6209 0.9452 0.1410 4.1826
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.0262 0.9435 0.1409 4.1869
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.6170 0.9515 0.1447 4.0759
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0061 0.9935 0.1550 3.8064
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6813,1.0152,0.1984,2.9722
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9798,0.9898,0.1927,3.0610
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5307,1.0064,0.2162,2.7282
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2443,0.9997,0.1979,2.9801
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.7736,1.0887,0.2683,2.1981
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5886,1.0282,0.2159,2.7315
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2735,1.0275,0.2244,2.6287
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0266,1.0367,0.2354,2.5060
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.1990,1.0237,0.2220,2.6565
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0853,1.1085,0.2788,2.1153
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9799,1.0462,0.2313,2.5503
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3967,1.0207,0.2245,2.6272
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2283,1.0421,0.2421,2.4367
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7624,1.0325,0.2280,2.5873
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6249,1.1214,0.2926,2.0159
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7193,1.1333,0.3106,1.8989
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5399,1.0800,0.2739,2.1533
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,1.1059,0.3027,1.9487
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6493,1.1016,0.2954,1.9965
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7364,1.1932,0.3560,1.6567
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2550,1.2301,0.3959,1.4897
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3419,1.1312,0.3167,1.8626
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5225,1.1784,0.3665,1.6095
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8200,1.1691,0.3549,1.6619
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0791,1.2708,0.4258,1.3853
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.6813 1.0152 0.1984 2.9722
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.9798 0.9898 0.1927 3.0610
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.5307 1.0064 0.2162 2.7282
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.2443 0.9997 0.1979 2.9801
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.7736 1.0887 0.2683 2.1981
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.5886 1.0282 0.2159 2.7315
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.2735 1.0275 0.2244 2.6287
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.0266 1.0367 0.2354 2.5060
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.1990 1.0237 0.2220 2.6565
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.0853 1.1085 0.2788 2.1153
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9799 1.0462 0.2313 2.5503
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.3967 1.0207 0.2245 2.6272
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.2283 1.0421 0.2421 2.4367
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.7624 1.0325 0.2280 2.5873
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.6249 1.1214 0.2926 2.0159
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7193 1.1333 0.3106 1.8989
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.5399 1.0800 0.2739 2.1533
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.7977 1.1059 0.3027 1.9487
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.6493 1.1016 0.2954 1.9965
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.7364 1.1932 0.3560 1.6567
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2550 1.2301 0.3959 1.4897
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.3419 1.1312 0.3167 1.8626
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.5225 1.1784 0.3665 1.6095
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8200 1.1691 0.3549 1.6619
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0791 1.2708 0.4258 1.3853
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8140,1.2786,0.4654,1.2672
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3108,1.2803,0.4840,1.2185
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0416,1.2881,0.4957,1.1898
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4415,1.2576,0.4593,1.2841
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9172,1.3825,0.5584,1.0563
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6414,1.3319,0.5165,1.1419
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.5164,1.3635,0.5626,1.0484
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4423,1.3605,0.5630,1.0477
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3342,1.3238,0.5249,1.1236
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2127,1.4354,0.6116,0.9644
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0206,1.4166,0.6002,0.9827
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.5914,1.4251,0.6261,0.9421
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5720,1.4318,0.6347,0.9293
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8656,1.4023,0.6044,0.9758
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7315,1.5346,0.7063,0.8351
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7638,1.5949,0.7743,0.7618
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7385,1.5035,0.6977,0.8454
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0389,1.5403,0.7381,0.7991
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7390,1.5266,0.7178,0.8218
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8113,1.7122,0.8752,0.6739
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2896,1.7554,0.9179,0.6426
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5190,1.5453,0.7246,0.8140
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7063,1.6414,0.8256,0.7145
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8774,1.6395,0.8212,0.7183
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1152,1.8808,1.0319,0.5716
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.8140 1.2786 0.4654 1.2672
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 56.3108 1.2803 0.4840 1.2185
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 50.0416 1.2881 0.4957 1.1898
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.4415 1.2576 0.4593 1.2841
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.9172 1.3825 0.5584 1.0563
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.6414 1.3319 0.5165 1.1419
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.5164 1.3635 0.5626 1.0484
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.4423 1.3605 0.5630 1.0477
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.3342 1.3238 0.5249 1.1236
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.2127 1.4354 0.6116 0.9644
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 38.0206 1.4166 0.6002 0.9827
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.5914 1.4251 0.6261 0.9421
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.5720 1.4318 0.6347 0.9293
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.8656 1.4023 0.6044 0.9758
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.7315 1.5346 0.7063 0.8351
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7638 1.5949 0.7743 0.7618
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.7385 1.5035 0.6977 0.8454
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 40.0389 1.5403 0.7381 0.7991
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.7390 1.5266 0.7178 0.8218
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.8113 1.7122 0.8752 0.6739
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2896 1.7554 0.9179 0.6426
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.5190 1.5453 0.7246 0.8140
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.7063 1.6414 0.8256 0.7145
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8774 1.6395 0.8212 0.7183
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.1152 1.8808 1.0319 0.5716
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.4642,0.9467,0.1363,4.3277
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6113,0.9284,0.1309,4.5048
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1249,0.9385,0.1436,4.1086
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.8620,0.9319,0.1331,4.4325
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4006,0.9953,0.1724,3.4211
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4855,0.9690,0.1534,3.8461
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8798,0.9525,0.1534,3.8443
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5073,0.9539,0.1578,3.7371
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9690,0.9547,0.1531,3.8532
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8563,1.0047,0.1788,3.2995
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9117,0.9980,0.1808,3.2631
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9417,0.9821,0.1812,3.2554
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6901,0.9781,0.1823,3.2348
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6034,0.9832,0.1806,3.2657
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4690,1.0337,0.2048,2.8804
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6259,1.0986,0.2734,2.1570
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.0949,1.0655,0.2576,2.2900
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4004,1.0731,0.2682,2.1989
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5218,1.0685,0.2607,2.2623
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6552,1.1252,0.2871,2.0544
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2083,1.1537,0.3238,1.8215
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1174,1.1093,0.2999,1.9665
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3098,1.1162,0.3092,1.9078
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7317,1.1237,0.3094,1.9064
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0473,1.1800,0.3385,1.7423
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.4642 0.9467 0.1363 4.3277
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.6113 0.9284 0.1309 4.5048
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.1249 0.9385 0.1436 4.1086
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.8620 0.9319 0.1331 4.4325
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.4006 0.9953 0.1724 3.4211
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.4855 0.9690 0.1534 3.8461
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.8798 0.9525 0.1534 3.8443
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.5073 0.9539 0.1578 3.7371
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.9690 0.9547 0.1531 3.8532
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.8563 1.0047 0.1788 3.2995
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9117 0.9980 0.1808 3.2631
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.9417 0.9821 0.1812 3.2554
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.6901 0.9781 0.1823 3.2348
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.6034 0.9832 0.1806 3.2657
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.4690 1.0337 0.2048 2.8804
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.6259 1.0986 0.2734 2.1570
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.0949 1.0655 0.2576 2.2900
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.4004 1.0731 0.2682 2.1989
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.5218 1.0685 0.2607 2.2623
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.6552 1.1252 0.2871 2.0544
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2083 1.1537 0.3238 1.8215
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.1174 1.1093 0.2999 1.9665
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.3098 1.1162 0.3092 1.9078
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.7317 1.1237 0.3094 1.9064
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0473 1.1800 0.3385 1.7423
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2326,0.9060,0.0936,6.3045
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2416,0.8912,0.0937,6.2929
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4153,0.8881,0.0941,6.2703
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.6138,0.8931,0.0942,6.2615
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.0126,0.9247,0.1013,5.8225
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3165,0.9240,0.1109,5.3166
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4653,0.9099,0.1119,5.2715
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0821,0.9071,0.1110,5.3134
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7249,0.9123,0.1121,5.2598
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.4955,0.9404,0.1154,5.1095
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8196,0.9275,0.1130,5.2215
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6921,0.9147,0.1153,5.1166
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4519,0.9091,0.1130,5.2177
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3978,0.9152,0.1149,5.1337
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3170,0.9432,0.1167,5.0546
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5303,0.9951,0.1753,3.3650
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8147,0.9780,0.1750,3.3697
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1762,0.9739,0.1732,3.4059
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3622,0.9824,0.1776,3.3203
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.5936,1.0191,0.1866,3.1608
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1316,1.0419,0.2144,2.7516
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6209,1.0192,0.2113,2.7921
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0262,1.0160,0.2102,2.8064
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6170,1.0268,0.2166,2.7231
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0061,1.0702,0.2301,2.5638
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.2326 0.9060 0.0936 6.3045
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.2416 0.8912 0.0937 6.2929
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 48.4153 0.8881 0.0941 6.2703
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.6138 0.8931 0.0942 6.2615
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.0126 0.9247 0.1013 5.8225
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.3165 0.9240 0.1109 5.3166
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.4653 0.9099 0.1119 5.2715
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.0821 0.9071 0.1110 5.3134
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.7249 0.9123 0.1121 5.2598
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.4955 0.9404 0.1154 5.1095
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.8196 0.9275 0.1130 5.2215
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.6921 0.9147 0.1153 5.1166
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.4519 0.9091 0.1130 5.2177
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.3978 0.9152 0.1149 5.1337
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.3170 0.9432 0.1167 5.0546
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.5303 0.9951 0.1753 3.3650
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 44.8147 0.9780 0.1750 3.3697
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.1762 0.9739 0.1732 3.4059
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.3622 0.9824 0.1776 3.3203
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.5936 1.0191 0.1866 3.1608
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.1316 1.0419 0.2144 2.7516
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 40.6209 1.0192 0.2113 2.7921
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.0262 1.0160 0.2102 2.8064
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.6170 1.0268 0.2166 2.7231
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0061 1.0702 0.2301 2.5638
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6813,1.0841,0.2661,2.2164
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9798,1.0610,0.2612,2.2584
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5307,1.0883,0.2923,2.0177
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2443,1.0682,0.2673,2.2069
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.7736,1.1871,0.3598,1.6392
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5886,1.1287,0.3134,1.8818
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2735,1.1319,0.3276,1.8003
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0266,1.1436,0.3455,1.7071
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.1990,1.1306,0.3256,1.8118
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0853,1.2347,0.4070,1.4490
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9799,1.1702,0.3495,1.6877
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3967,1.1431,0.3414,1.7275
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2283,1.1688,0.3681,1.6022
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7624,1.1525,0.3465,1.7024
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6249,1.2719,0.4421,1.3342
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7193,1.3044,0.4816,1.2246
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5399,1.2323,0.4252,1.3872
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,1.2743,0.4709,1.2526
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6493,1.2624,0.4562,1.2929
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7364,1.3818,0.5486,1.0752
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2550,1.4356,0.6055,0.9742
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3419,1.3001,0.4875,1.2099
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5225,1.3711,0.5605,1.0523
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8200,1.3601,0.5422,1.0878
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0791,1.4984,0.6513,0.9057
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.6813 1.0841 0.2661 2.2164
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.9798 1.0610 0.2612 2.2584
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.5307 1.0883 0.2923 2.0177
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.2443 1.0682 0.2673 2.2069
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.7736 1.1871 0.3598 1.6392
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.5886 1.1287 0.3134 1.8818
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.2735 1.1319 0.3276 1.8003
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.0266 1.1436 0.3455 1.7071
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.1990 1.1306 0.3256 1.8118
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.0853 1.2347 0.4070 1.4490
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9799 1.1702 0.3495 1.6877
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.3967 1.1431 0.3414 1.7275
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.2283 1.1688 0.3681 1.6022
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.7624 1.1525 0.3465 1.7024
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.6249 1.2719 0.4421 1.3342
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7193 1.3044 0.4816 1.2246
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.5399 1.2323 0.4252 1.3872
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.7977 1.2743 0.4709 1.2526
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.6493 1.2624 0.4562 1.2929
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.7364 1.3818 0.5486 1.0752
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2550 1.4356 0.6055 0.9742
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.3419 1.3001 0.4875 1.2099
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.5225 1.3711 0.5605 1.0523
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8200 1.3601 0.5422 1.0878
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0791 1.4984 0.6513 0.9057
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8140,1.4350,0.6193,0.9524
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3108,1.4526,0.6532,0.9030
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0416,1.4670,0.6700,0.8804
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4415,1.4204,0.6200,0.9514
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9172,1.5700,0.7472,0.7894
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6414,1.5601,0.7446,0.7922
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.5164,1.6150,0.8121,0.7263
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4423,1.6092,0.8127,0.7257
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3342,1.5609,0.7556,0.7806
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2127,1.7099,0.8809,0.6696
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0206,1.7151,0.8957,0.6585
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.5914,1.7311,0.9290,0.6349
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5720,1.7488,0.9474,0.6226
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8656,1.7043,0.9018,0.6541
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7315,1.8778,1.0479,0.5629
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7638,2.0126,1.1857,0.4975
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7385,1.8810,1.0719,0.5502
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0389,1.9360,1.1320,0.5210
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7390,1.9138,1.1054,0.5336
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8113,2.1880,1.3469,0.4379
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2896,2.2359,1.3992,0.4215
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5190,1.9326,1.1153,0.5289
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7063,2.0784,1.2631,0.4670
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8774,2.0728,1.2533,0.4706
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1152,2.4112,1.5651,0.3769
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.8140 1.4350 0.6193 0.9524
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 56.3108 1.4526 0.6532 0.9030
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 50.0416 1.4670 0.6700 0.8804
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.4415 1.4204 0.6200 0.9514
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.9172 1.5700 0.7472 0.7894
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.6414 1.5601 0.7446 0.7922
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.5164 1.6150 0.8121 0.7263
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.4423 1.6092 0.8127 0.7257
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.3342 1.5609 0.7556 0.7806
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.2127 1.7099 0.8809 0.6696
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 38.0206 1.7151 0.8957 0.6585
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.5914 1.7311 0.9290 0.6349
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.5720 1.7488 0.9474 0.6226
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.8656 1.7043 0.9018 0.6541
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.7315 1.8778 1.0479 0.5629
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7638 2.0126 1.1857 0.4975
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.7385 1.8810 1.0719 0.5502
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 40.0389 1.9360 1.1320 0.5210
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.7390 1.9138 1.1054 0.5336
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.8113 2.1880 1.3469 0.4379
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2896 2.2359 1.3992 0.4215
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.5190 1.9326 1.1153 0.5289
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.7063 2.0784 1.2631 0.4670
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8774 2.0728 1.2533 0.4706
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.1152 2.4112 1.5651 0.3769
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.4642,0.9349,0.1215,4.8552
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6113,0.9154,0.1171,5.0354
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1249,0.9239,0.1284,4.5924
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.8620,0.9187,0.1184,4.9800
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4006,0.9803,0.1542,3.8248
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4855,0.9477,0.1327,4.4461
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8798,0.9348,0.1335,4.4189
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5073,0.9350,0.1373,4.2959
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9690,0.9350,0.1330,4.4341
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8563,0.9812,0.1558,3.7849
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9117,0.9740,0.1576,3.7418
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9417,0.9586,0.1579,3.7349
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6901,0.9566,0.1589,3.7111
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6034,0.9586,0.1571,3.7548
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4690,1.0069,0.1779,3.3146
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6259,1.0635,0.2402,2.4553
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.0949,1.0317,0.2274,2.5933
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4004,1.0395,0.2340,2.5210
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5218,1.0347,0.2271,2.5977
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6552,1.0872,0.2511,2.3487
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2083,1.1156,0.2853,2.0673
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1174,1.0738,0.2647,2.2281
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3098,1.0843,0.2727,2.1627
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7317,1.0871,0.2721,2.1680
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0473,1.1433,0.2984,1.9766
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.4642 0.9349 0.1215 4.8552
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.6113 0.9154 0.1171 5.0354
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.1249 0.9239 0.1284 4.5924
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.8620 0.9187 0.1184 4.9800
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.4006 0.9803 0.1542 3.8248
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.4855 0.9477 0.1327 4.4461
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.8798 0.9348 0.1335 4.4189
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.5073 0.9350 0.1373 4.2959
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.9690 0.9350 0.1330 4.4341
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.8563 0.9812 0.1558 3.7849
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9117 0.9740 0.1576 3.7418
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.9417 0.9586 0.1579 3.7349
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.6901 0.9566 0.1589 3.7111
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.6034 0.9586 0.1571 3.7548
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.4690 1.0069 0.1779 3.3146
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.6259 1.0635 0.2402 2.4553
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.0949 1.0317 0.2274 2.5933
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.4004 1.0395 0.2340 2.5210
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.5218 1.0347 0.2271 2.5977
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.6552 1.0872 0.2511 2.3487
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2083 1.1156 0.2853 2.0673
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.1174 1.0738 0.2647 2.2281
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.3098 1.0843 0.2727 2.1627
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.7317 1.0871 0.2721 2.1680
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0473 1.1433 0.2984 1.9766
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2326,0.8925,0.0828,7.1221
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2416,0.8796,0.0829,7.1145
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4153,0.8771,0.0830,7.1040
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.6138,0.8800,0.0830,7.1094
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.0126,0.9129,0.0899,6.5637
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3165,0.9087,0.0957,6.1635
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4653,0.8948,0.0965,6.1136
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0821,0.8909,0.0962,6.1328
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7249,0.8958,0.0964,6.1191
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.4955,0.9245,0.0994,5.9329
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8196,0.9113,0.0982,6.0056
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6921,0.8973,0.1003,5.8804
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4519,0.8922,0.0987,5.9764
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3978,0.8989,0.0997,5.9157
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3170,0.9262,0.1018,5.7928
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5303,0.9712,0.1539,3.8330
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8147,0.9563,0.1535,3.8420
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1762,0.9496,0.1499,3.9360
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3622,0.9614,0.1556,3.7901
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.5936,0.9963,0.1630,3.6182
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1316,1.0145,0.1901,3.1022
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6209,0.9923,0.1861,3.1699
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0262,0.9907,0.1852,3.1845
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6170,0.9988,0.1919,3.0730
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0061,1.0422,0.2040,2.8913
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.2326 0.8925 0.0828 7.1221
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.2416 0.8796 0.0829 7.1145
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 48.4153 0.8771 0.0830 7.1040
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.6138 0.8800 0.0830 7.1094
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.0126 0.9129 0.0899 6.5637
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.3165 0.9087 0.0957 6.1635
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.4653 0.8948 0.0965 6.1136
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.0821 0.8909 0.0962 6.1328
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.7249 0.8958 0.0964 6.1191
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.4955 0.9245 0.0994 5.9329
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.8196 0.9113 0.0982 6.0056
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.6921 0.8973 0.1003 5.8804
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.4519 0.8922 0.0987 5.9764
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.3978 0.8989 0.0997 5.9157
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.3170 0.9262 0.1018 5.7928
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.5303 0.9712 0.1539 3.8330
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 44.8147 0.9563 0.1535 3.8420
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.1762 0.9496 0.1499 3.9360
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.3622 0.9614 0.1556 3.7901
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.5936 0.9963 0.1630 3.6182
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.1316 1.0145 0.1901 3.1022
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 40.6209 0.9923 0.1861 3.1699
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.0262 0.9907 0.1852 3.1845
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.6170 0.9988 0.1919 3.0730
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0061 1.0422 0.2040 2.8913
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6813,1.0485,0.2338,2.5226
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9798,1.0310,0.2281,2.5855
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5307,1.0563,0.2549,2.3139
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2443,1.0411,0.2330,2.5316
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.7736,1.1405,0.3145,1.8756
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5886,1.0881,0.2695,2.1888
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2735,1.0834,0.2816,2.0946
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0266,1.0971,0.2964,1.9901
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.1990,1.0846,0.2788,2.1152
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0853,1.1794,0.3502,1.6841
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9799,1.1205,0.3011,1.9590
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3967,1.0966,0.2921,2.0192
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2283,1.1184,0.3173,1.8590
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7624,1.1048,0.2979,1.9796
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6249,1.2150,0.3838,1.5369
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7193,1.2415,0.4217,1.3986
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5399,1.1818,0.3740,1.5773
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,1.2190,0.4116,1.4330
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6493,1.2074,0.4001,1.4742
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7364,1.3194,0.4816,1.2247
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2550,1.3714,0.5360,1.1005
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3419,1.2455,0.4300,1.3717
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5225,1.3058,0.4948,1.1920
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8200,1.2958,0.4783,1.2333
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0791,1.4251,0.5758,1.0243
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.6813 1.0485 0.2338 2.5226
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.9798 1.0310 0.2281 2.5855
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.5307 1.0563 0.2549 2.3139
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.2443 1.0411 0.2330 2.5316
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.7736 1.1405 0.3145 1.8756
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.5886 1.0881 0.2695 2.1888
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.2735 1.0834 0.2816 2.0946
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.0266 1.0971 0.2964 1.9901
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.1990 1.0846 0.2788 2.1152
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.0853 1.1794 0.3502 1.6841
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9799 1.1205 0.3011 1.9590
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.3967 1.0966 0.2921 2.0192
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.2283 1.1184 0.3173 1.8590
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.7624 1.1048 0.2979 1.9796
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.6249 1.2150 0.3838 1.5369
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7193 1.2415 0.4217 1.3986
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.5399 1.1818 0.3740 1.5773
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.7977 1.2190 0.4116 1.4330
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.6493 1.2074 0.4001 1.4742
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.7364 1.3194 0.4816 1.2247
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2550 1.3714 0.5360 1.1005
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.3419 1.2455 0.4300 1.3717
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.5225 1.3058 0.4948 1.1920
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8200 1.2958 0.4783 1.2333
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0791 1.4251 0.5758 1.0243
@@ -1,26 +0,0 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8140,1.3668,0.5496,1.0731
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3108,1.3742,0.5744,1.0268
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0416,1.3844,0.5893,1.0009
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4415,1.3448,0.5442,1.0839
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9172,1.4844,0.6601,0.8936
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6414,1.4673,0.6500,0.9074
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.5164,1.5132,0.7083,0.8327
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4423,1.5122,0.7090,0.8319
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3342,1.4652,0.6606,0.8928
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2127,1.5933,0.7676,0.7684
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0206,1.5996,0.7801,0.7561
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.5914,1.6176,0.8131,0.7254
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5720,1.6268,0.8284,0.7120
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8656,1.5934,0.7870,0.7494
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7315,1.7475,0.9170,0.6432
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7638,1.8698,1.0465,0.5636
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7385,1.7500,0.9440,0.6248
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0389,1.8071,1.0011,0.5892
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7390,1.7833,0.9732,0.6061
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8113,2.0260,1.1907,0.4954
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2896,2.0752,1.2384,0.4763
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5190,1.8057,0.9886,0.5966
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7063,1.9364,1.1189,0.5272
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8774,1.9304,1.1109,0.5310
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1152,2.2427,1.3917,0.4238
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.8140 1.3668 0.5496 1.0731
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 56.3108 1.3742 0.5744 1.0268
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 50.0416 1.3844 0.5893 1.0009
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.4415 1.3448 0.5442 1.0839
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.9172 1.4844 0.6601 0.8936
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.6414 1.4673 0.6500 0.9074
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.5164 1.5132 0.7083 0.8327
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.4423 1.5122 0.7090 0.8319
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.3342 1.4652 0.6606 0.8928
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.2127 1.5933 0.7676 0.7684
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 38.0206 1.5996 0.7801 0.7561
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.5914 1.6176 0.8131 0.7254
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.5720 1.6268 0.8284 0.7120
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.8656 1.5934 0.7870 0.7494
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.7315 1.7475 0.9170 0.6432
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7638 1.8698 1.0465 0.5636
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.7385 1.7500 0.9440 0.6248
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 40.0389 1.8071 1.0011 0.5892
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.7390 1.7833 0.9732 0.6061
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.8113 2.0260 1.1907 0.4954
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2896 2.0752 1.2384 0.4763
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.5190 1.8057 0.9886 0.5966
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.7063 1.9364 1.1189 0.5272
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8774 1.9304 1.1109 0.5310
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.1152 2.2427 1.3917 0.4238
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.5183,0.8421,0.0887,6.6478
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6496,0.8315,0.0853,6.9175
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1806,0.8445,0.0956,6.1696
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.9442,0.8358,0.0867,6.7996
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4810,0.8674,0.1186,4.9726
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4914,0.8318,0.0778,7.5826
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8696,0.8303,0.0787,7.4941
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5304,0.8328,0.0826,7.1426
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9886,0.8293,0.0781,7.5523
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8769,0.8495,0.0978,6.0323
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9125,0.8253,0.0767,7.6866
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9501,0.8287,0.0809,7.2888
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6974,0.8499,0.0871,6.7739
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6096,0.8252,0.0809,7.2903
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4848,0.8409,0.0923,6.3909
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6322,0.8621,0.1121,5.2605
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.2324,0.8611,0.1112,5.3048
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4874,0.8599,0.1111,5.3100
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5558,0.8643,0.1140,5.1743
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6748,0.8745,0.1243,4.7461
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2132,0.8974,0.1455,4.0527
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1310,0.8895,0.1391,4.2388
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3229,0.8925,0.1410,4.1844
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7302,0.8993,0.1480,3.9848
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0516,0.9102,0.1587,3.7176
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.5183 0.8421 0.0887 6.6478
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.6496 0.8315 0.0853 6.9175
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.1806 0.8445 0.0956 6.1696
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.9442 0.8358 0.0867 6.7996
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.4810 0.8674 0.1186 4.9726
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.4914 0.8318 0.0778 7.5826
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.8696 0.8303 0.0787 7.4941
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.5304 0.8328 0.0826 7.1426
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.9886 0.8293 0.0781 7.5523
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.8769 0.8495 0.0978 6.0323
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9125 0.8253 0.0767 7.6866
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.9501 0.8287 0.0809 7.2888
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.6974 0.8499 0.0871 6.7739
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.6096 0.8252 0.0809 7.2903
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.4848 0.8409 0.0923 6.3909
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.6322 0.8621 0.1121 5.2605
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.2324 0.8611 0.1112 5.3048
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.4874 0.8599 0.1111 5.3100
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.5558 0.8643 0.1140 5.1743
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.6748 0.8745 0.1243 4.7461
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2132 0.8974 0.1455 4.0527
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.1310 0.8895 0.1391 4.2388
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.3229 0.8925 0.1410 4.1844
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.7302 0.8993 0.1480 3.9848
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0516 0.9102 0.1587 3.7176
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2906,0.8061,0.0538,10.9559
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2770,0.8041,0.0556,10.6034
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4522,0.8038,0.0559,10.5527
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.7385,0.8043,0.0549,10.7346
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.1353,0.8130,0.0635,9.2887
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3714,0.8049,0.0532,11.0825
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4977,0.8028,0.0541,10.9037
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0897,0.8034,0.0539,10.9509
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7504,0.8038,0.0534,11.0363
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.5270,0.8122,0.0598,9.8646
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8240,0.7991,0.0517,11.4146
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6977,0.7964,0.0511,11.5344
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4615,0.7980,0.0524,11.2538
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3650,0.7951,0.0508,11.6073
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3399,0.8016,0.0561,10.5226
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5378,0.8291,0.0778,7.5804
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8222,0.8210,0.0724,8.1516
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1845,0.8287,0.0744,7.9285
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3350,0.8249,0.0738,7.9934
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6111,0.8318,0.0847,6.9658
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1334,0.8493,0.0981,6.0141
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6166,0.8403,0.0915,6.4462
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0114,0.8417,0.0935,6.3050
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6068,0.8478,0.0977,6.0370
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0090,0.8590,0.1085,5.4348
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.2906 0.8061 0.0538 10.9559
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.2770 0.8041 0.0556 10.6034
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 48.4522 0.8038 0.0559 10.5527
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.7385 0.8043 0.0549 10.7346
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.1353 0.8130 0.0635 9.2887
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.3714 0.8049 0.0532 11.0825
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.4977 0.8028 0.0541 10.9037
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.0897 0.8034 0.0539 10.9509
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.7504 0.8038 0.0534 11.0363
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.5270 0.8122 0.0598 9.8646
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.8240 0.7991 0.0517 11.4146
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.6977 0.7964 0.0511 11.5344
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.4615 0.7980 0.0524 11.2538
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.3650 0.7951 0.0508 11.6073
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.3399 0.8016 0.0561 10.5226
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.5378 0.8291 0.0778 7.5804
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 44.8222 0.8210 0.0724 8.1516
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.1845 0.8287 0.0744 7.9285
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.3350 0.8249 0.0738 7.9934
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.6111 0.8318 0.0847 6.9658
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.1334 0.8493 0.0981 6.0141
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 40.6166 0.8403 0.0915 6.4462
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.0114 0.8417 0.0935 6.3050
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.6068 0.8478 0.0977 6.0370
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0090 0.8590 0.1085 5.4348
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6967,0.8917,0.1381,4.2706
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9872,0.8862,0.1381,4.2711
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5481,0.9086,0.1568,3.7610
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2746,0.8945,0.1438,4.1016
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.8015,0.9469,0.1920,3.0717
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5897,0.8797,0.1264,4.6671
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2586,0.8937,0.1397,4.2232
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0351,0.8985,0.1443,4.0874
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.2103,0.8891,0.1370,4.3048
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0865,0.9240,0.1690,3.4898
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9740,0.8824,0.1327,4.4450
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3897,0.8941,0.1464,4.0275
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2248,0.9014,0.1505,3.9200
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7613,0.8937,0.1465,4.0265
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6388,0.9206,0.1685,3.5003
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7174,0.9347,0.1818,3.2437
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5395,0.9313,0.1833,3.2180
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,0.9488,0.1945,3.0318
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6514,0.9423,0.1897,3.1099
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7425,0.9624,0.2081,2.8345
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2551,1.0001,0.2434,2.4234
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3460,0.9773,0.2264,2.6047
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5142,1.0026,0.2488,2.3702
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8241,0.9929,0.2404,2.4538
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0771,1.0092,0.2552,2.3115
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.6967 0.8917 0.1381 4.2706
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.9872 0.8862 0.1381 4.2711
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.5481 0.9086 0.1568 3.7610
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.2746 0.8945 0.1438 4.1016
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.8015 0.9469 0.1920 3.0717
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.5897 0.8797 0.1264 4.6671
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.2586 0.8937 0.1397 4.2232
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.0351 0.8985 0.1443 4.0874
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.2103 0.8891 0.1370 4.3048
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.0865 0.9240 0.1690 3.4898
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9740 0.8824 0.1327 4.4450
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.3897 0.8941 0.1464 4.0275
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.2248 0.9014 0.1505 3.9200
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.7613 0.8937 0.1465 4.0265
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.6388 0.9206 0.1685 3.5003
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7174 0.9347 0.1818 3.2437
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.5395 0.9313 0.1833 3.2180
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.7977 0.9488 0.1945 3.0318
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.6514 0.9423 0.1897 3.1099
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.7425 0.9624 0.2081 2.8345
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2551 1.0001 0.2434 2.4234
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.3460 0.9773 0.2264 2.6047
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.5142 1.0026 0.2488 2.3702
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8241 0.9929 0.2404 2.4538
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0771 1.0092 0.2552 2.3115
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8226,1.0510,0.2972,1.9845
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3071,1.0638,0.3161,1.8661
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0457,1.0919,0.3383,1.7434
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4455,1.0613,0.3117,1.8920
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9306,1.1433,0.3936,1.4987
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6472,1.0421,0.2851,2.0687
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.4971,1.0836,0.3321,1.7760
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4390,1.0780,0.3304,1.7854
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3310,1.0575,0.3077,1.9169
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2247,1.1193,0.3701,1.5936
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0267,1.0532,0.3053,1.9322
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.6037,1.1076,0.3576,1.6492
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5645,1.0999,0.3515,1.6779
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8686,1.0829,0.3357,1.7569
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7460,1.1268,0.3798,1.5528
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7716,1.1755,0.4190,1.4076
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7436,1.1967,0.4434,1.3303
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0304,1.2071,0.4571,1.2905
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7463,1.1927,0.4419,1.3346
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8190,1.2341,0.4767,1.2373
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2963,1.3386,0.5790,1.0187
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5470,1.2630,0.5098,1.1569
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7151,1.3345,0.5789,1.0188
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8836,1.3195,0.5665,1.0412
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1188,1.3807,0.6215,0.9491
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.8226 1.0510 0.2972 1.9845
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 56.3071 1.0638 0.3161 1.8661
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 50.0457 1.0919 0.3383 1.7434
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.4455 1.0613 0.3117 1.8920
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.9306 1.1433 0.3936 1.4987
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.6472 1.0421 0.2851 2.0687
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.4971 1.0836 0.3321 1.7760
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.4390 1.0780 0.3304 1.7854
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.3310 1.0575 0.3077 1.9169
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.2247 1.1193 0.3701 1.5936
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 38.0267 1.0532 0.3053 1.9322
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.6037 1.1076 0.3576 1.6492
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.5645 1.0999 0.3515 1.6779
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.8686 1.0829 0.3357 1.7569
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.7460 1.1268 0.3798 1.5528
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7716 1.1755 0.4190 1.4076
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.7436 1.1967 0.4434 1.3303
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 40.0304 1.2071 0.4571 1.2905
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.7463 1.1927 0.4419 1.3346
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.8190 1.2341 0.4767 1.2373
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2963 1.3386 0.5790 1.0187
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.5470 1.2630 0.5098 1.1569
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.7151 1.3345 0.5789 1.0188
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8836 1.3195 0.5665 1.0412
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.1188 1.3807 0.6215 0.9491
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.5183,0.8977,0.1136,5.1909
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6496,0.8914,0.1097,5.3777
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1806,0.9059,0.1224,4.8195
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.9442,0.8956,0.1116,5.2860
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4810,0.9369,0.1510,3.9056
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4914,0.8935,0.1075,5.4888
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8696,0.8904,0.1081,5.4540
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5304,0.8972,0.1136,5.1936
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9886,0.8909,0.1073,5.4969
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8769,0.9189,0.1341,4.3987
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9125,0.8985,0.1130,5.2191
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9501,0.9012,0.1192,4.9491
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6974,0.9037,0.1202,4.9071
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6096,0.9013,0.1179,5.0030
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4848,0.9190,0.1341,4.3973
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6322,0.9551,0.1705,3.4595
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.2324,0.9568,0.1737,3.3965
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4874,0.9558,0.1712,3.4451
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5558,0.9571,0.1754,3.3625
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6748,0.9743,0.1891,3.1189
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2132,1.0035,0.2159,2.7315
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1310,0.9935,0.2095,2.8159
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3229,0.9974,0.2112,2.7923
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7302,1.0055,0.2209,2.6707
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0516,1.0217,0.2356,2.5036
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.5183 0.8977 0.1136 5.1909
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.6496 0.8914 0.1097 5.3777
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.1806 0.9059 0.1224 4.8195
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.9442 0.8956 0.1116 5.2860
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.4810 0.9369 0.1510 3.9056
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.4914 0.8935 0.1075 5.4888
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.8696 0.8904 0.1081 5.4540
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.5304 0.8972 0.1136 5.1936
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.9886 0.8909 0.1073 5.4969
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.8769 0.9189 0.1341 4.3987
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9125 0.8985 0.1130 5.2191
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.9501 0.9012 0.1192 4.9491
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.6974 0.9037 0.1202 4.9071
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.6096 0.9013 0.1179 5.0030
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.4848 0.9190 0.1341 4.3973
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.6322 0.9551 0.1705 3.4595
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.2324 0.9568 0.1737 3.3965
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.4874 0.9558 0.1712 3.4451
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.5558 0.9571 0.1754 3.3625
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.6748 0.9743 0.1891 3.1189
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2132 1.0035 0.2159 2.7315
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.1310 0.9935 0.2095 2.8159
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.3229 0.9974 0.2112 2.7923
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.7302 1.0055 0.2209 2.6707
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0516 1.0217 0.2356 2.5036
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2906,0.8542,0.0694,8.5015
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2770,0.8533,0.0721,8.1804
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4522,0.8534,0.0722,8.1682
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.7385,0.8547,0.0714,8.2651
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.1353,0.8659,0.0817,7.2185
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3714,0.8568,0.0744,7.9233
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4977,0.8567,0.0752,7.8485
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0897,0.8561,0.0748,7.8864
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7504,0.8567,0.0740,7.9697
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.5270,0.8660,0.0828,7.1238
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8240,0.8601,0.0772,7.6419
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6977,0.8557,0.0762,7.7444
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4615,0.8602,0.0778,7.5834
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3650,0.8566,0.0751,7.8532
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3399,0.8670,0.0832,7.0880
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5378,0.9024,0.1177,5.0108
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8222,0.8925,0.1111,5.3085
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1845,0.8973,0.1128,5.2291
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3350,0.8956,0.1121,5.2622
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6111,0.9129,0.1283,4.5972
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1334,0.9332,0.1461,4.0371
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6166,0.9214,0.1369,4.3098
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0114,0.9244,0.1399,4.2166
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6068,0.9291,0.1452,4.0633
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0090,0.9473,0.1609,3.6648
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.2906 0.8542 0.0694 8.5015
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.2770 0.8533 0.0721 8.1804
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 48.4522 0.8534 0.0722 8.1682
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.7385 0.8547 0.0714 8.2651
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.1353 0.8659 0.0817 7.2185
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.3714 0.8568 0.0744 7.9233
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.4977 0.8567 0.0752 7.8485
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.0897 0.8561 0.0748 7.8864
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.7504 0.8567 0.0740 7.9697
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.5270 0.8660 0.0828 7.1238
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.8240 0.8601 0.0772 7.6419
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.6977 0.8557 0.0762 7.7444
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.4615 0.8602 0.0778 7.5834
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.3650 0.8566 0.0751 7.8532
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.3399 0.8670 0.0832 7.0880
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.5378 0.9024 0.1177 5.0108
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 44.8222 0.8925 0.1111 5.3085
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.1845 0.8973 0.1128 5.2291
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.3350 0.8956 0.1121 5.2622
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.6111 0.9129 0.1283 4.5972
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.1334 0.9332 0.1461 4.0371
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 40.6166 0.9214 0.1369 4.3098
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.0114 0.9244 0.1399 4.2166
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.6068 0.9291 0.1452 4.0633
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0090 0.9473 0.1609 3.6648
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6967,0.9688,0.1804,3.2692
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9872,0.9690,0.1843,3.2001
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5481,0.9919,0.2064,2.8580
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2746,0.9747,0.1915,3.0806
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.8015,1.0371,0.2509,2.3508
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5897,0.9682,0.1812,3.2544
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2586,0.9877,0.2020,2.9206
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0351,0.9964,0.2064,2.8574
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.2103,0.9845,0.1963,3.0051
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0865,1.0287,0.2410,2.4476
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9740,0.9863,0.1994,2.9584
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3897,1.0056,0.2225,2.6511
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2248,1.0112,0.2255,2.6151
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7613,1.0062,0.2219,2.6583
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6388,1.0355,0.2497,2.3621
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7174,1.0714,0.2815,2.0953
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5395,1.0739,0.2891,2.0405
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,1.0919,0.3016,1.9554
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6514,1.0799,0.2937,2.0080
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7425,1.1059,0.3198,1.8444
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2551,1.1544,0.3623,1.6280
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3460,1.1298,0.3406,1.7318
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5142,1.1642,0.3728,1.5820
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8241,1.1478,0.3600,1.6385
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0771,1.1683,0.3769,1.5650
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.6967 0.9688 0.1804 3.2692
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.9872 0.9690 0.1843 3.2001
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.5481 0.9919 0.2064 2.8580
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.2746 0.9747 0.1915 3.0806
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.8015 1.0371 0.2509 2.3508
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.5897 0.9682 0.1812 3.2544
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.2586 0.9877 0.2020 2.9206
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.0351 0.9964 0.2064 2.8574
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.2103 0.9845 0.1963 3.0051
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.0865 1.0287 0.2410 2.4476
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9740 0.9863 0.1994 2.9584
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.3897 1.0056 0.2225 2.6511
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.2248 1.0112 0.2255 2.6151
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.7613 1.0062 0.2219 2.6583
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.6388 1.0355 0.2497 2.3621
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7174 1.0714 0.2815 2.0953
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.5395 1.0739 0.2891 2.0405
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.7977 1.0919 0.3016 1.9554
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.6514 1.0799 0.2937 2.0080
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.7425 1.1059 0.3198 1.8444
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2551 1.1544 0.3623 1.6280
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.3460 1.1298 0.3406 1.7318
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.5142 1.1642 0.3728 1.5820
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8241 1.1478 0.3600 1.6385
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0771 1.1683 0.3769 1.5650
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8226,1.1860,0.3930,1.5008
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3071,1.2120,0.4245,1.3895
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0457,1.2394,0.4491,1.3134
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4455,1.2033,0.4194,1.4065
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9306,1.3087,0.5183,1.1380
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6472,1.1900,0.4010,1.4708
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.4971,1.2626,0.4771,1.2364
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4390,1.2636,0.4679,1.2605
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3310,1.2250,0.4385,1.3451
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2247,1.3242,0.5194,1.1355
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0267,1.2729,0.4619,1.2770
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.6037,1.3264,0.5368,1.0988
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5645,1.3113,0.5218,1.1304
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8686,1.2886,0.4997,1.1803
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7460,1.3489,0.5572,1.0585
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7716,1.4315,0.6375,0.9253
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7436,1.4856,0.6960,0.8475
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0304,1.4995,0.7046,0.8371
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7463,1.4690,0.6816,0.8654
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8190,1.5179,0.7243,0.8144
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2963,1.6495,0.8518,0.6925
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5470,1.5601,0.7638,0.7723
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7151,1.6553,0.8607,0.6853
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8836,1.6401,0.8407,0.7016
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1188,1.7103,0.9121,0.6467
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.8226 1.1860 0.3930 1.5008
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 56.3071 1.2120 0.4245 1.3895
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 50.0457 1.2394 0.4491 1.3134
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.4455 1.2033 0.4194 1.4065
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.9306 1.3087 0.5183 1.1380
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.6472 1.1900 0.4010 1.4708
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.4971 1.2626 0.4771 1.2364
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.4390 1.2636 0.4679 1.2605
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.3310 1.2250 0.4385 1.3451
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.2247 1.3242 0.5194 1.1355
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 38.0267 1.2729 0.4619 1.2770
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.6037 1.3264 0.5368 1.0988
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.5645 1.3113 0.5218 1.1304
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.8686 1.2886 0.4997 1.1803
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.7460 1.3489 0.5572 1.0585
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7716 1.4315 0.6375 0.9253
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.7436 1.4856 0.6960 0.8475
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 40.0304 1.4995 0.7046 0.8371
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.7463 1.4690 0.6816 0.8654
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.8190 1.5179 0.7243 0.8144
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2963 1.6495 0.8518 0.6925
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.5470 1.5601 0.7638 0.7723
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.7151 1.6553 0.8607 0.6853
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8836 1.6401 0.8407 0.7016
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.1188 1.7103 0.9121 0.6467
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.5183,0.8375,0.0976,6.0429
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6496,0.8309,0.0941,6.2678
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1806,0.8434,0.1052,5.6072
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.9442,0.8347,0.0959,6.1512
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4810,0.8707,0.1303,4.5283
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4914,0.8287,0.0893,6.6069
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8696,0.8280,0.0901,6.5438
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5304,0.8346,0.0943,6.2538
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9886,0.8277,0.0898,6.5657
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8769,0.8531,0.1117,5.2783
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9125,0.8354,0.0936,6.2988
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9501,0.8363,0.0984,5.9934
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6974,0.8382,0.0996,5.9244
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6096,0.8361,0.0983,6.0003
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4848,0.8533,0.1115,5.2893
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6322,0.8826,0.1423,4.1464
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.2324,0.8836,0.1443,4.0874
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4874,0.8802,0.1419,4.1555
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5558,0.8856,0.1463,4.0306
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6748,0.8974,0.1580,3.7333
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2132,0.9334,0.1896,3.1109
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1310,0.9244,0.1842,3.2016
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3229,0.9278,0.1864,3.1645
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7302,0.9367,0.1941,3.0392
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0516,0.9506,0.2068,2.8518
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.5183 0.8375 0.0976 6.0429
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.6496 0.8309 0.0941 6.2678
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.1806 0.8434 0.1052 5.6072
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.9442 0.8347 0.0959 6.1512
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.4810 0.8707 0.1303 4.5283
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.4914 0.8287 0.0893 6.6069
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.8696 0.8280 0.0901 6.5438
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.5304 0.8346 0.0943 6.2538
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.9886 0.8277 0.0898 6.5657
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.8769 0.8531 0.1117 5.2783
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9125 0.8354 0.0936 6.2988
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.9501 0.8363 0.0984 5.9934
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.6974 0.8382 0.0996 5.9244
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.6096 0.8361 0.0983 6.0003
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.4848 0.8533 0.1115 5.2893
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.6322 0.8826 0.1423 4.1464
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.2324 0.8836 0.1443 4.0874
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.4874 0.8802 0.1419 4.1555
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.5558 0.8856 0.1463 4.0306
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.6748 0.8974 0.1580 3.7333
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2132 0.9334 0.1896 3.1109
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.1310 0.9244 0.1842 3.2016
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.3229 0.9278 0.1864 3.1645
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.7302 0.9367 0.1941 3.0392
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0516 0.9506 0.2068 2.8518
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.2906,0.7990,0.0592,9.9612
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.2770,0.7989,0.0614,9.6041
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,48.4522,0.7976,0.0616,9.5813
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.7385,0.7971,0.0609,9.6894
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.1353,0.8086,0.0701,8.4083
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.3714,0.8009,0.0614,9.6050
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.4977,0.7990,0.0622,9.4774
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.0897,0.7989,0.0620,9.5133
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.7504,0.7988,0.0615,9.5983
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.5270,0.8071,0.0688,8.5710
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.8240,0.8015,0.0634,9.3057
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.6977,0.7987,0.0623,9.4600
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.4615,0.7996,0.0639,9.2372
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.3650,0.7977,0.0621,9.5027
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.3399,0.8063,0.0685,8.6155
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.5378,0.8373,0.0977,6.0388
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,44.8222,0.8319,0.0919,6.4150
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.1845,0.8320,0.0935,6.3063
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.3350,0.8313,0.0925,6.3766
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6111,0.8468,0.1060,5.5621
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.1334,0.8697,0.1281,4.6052
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,40.6166,0.8571,0.1199,4.9173
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.0114,0.8608,0.1225,4.8147
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.6068,0.8653,0.1269,4.6491
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0090,0.8808,0.1412,4.1757
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.2906 0.7990 0.0592 9.9612
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.2770 0.7989 0.0614 9.6041
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 48.4522 0.7976 0.0616 9.5813
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.7385 0.7971 0.0609 9.6894
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.1353 0.8086 0.0701 8.4083
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.3714 0.8009 0.0614 9.6050
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.4977 0.7990 0.0622 9.4774
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.0897 0.7989 0.0620 9.5133
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.7504 0.7988 0.0615 9.5983
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.5270 0.8071 0.0688 8.5710
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.8240 0.8015 0.0634 9.3057
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.6977 0.7987 0.0623 9.4600
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.4615 0.7996 0.0639 9.2372
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.3650 0.7977 0.0621 9.5027
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.3399 0.8063 0.0685 8.6155
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.5378 0.8373 0.0977 6.0388
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 44.8222 0.8319 0.0919 6.4150
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.1845 0.8320 0.0935 6.3063
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.3350 0.8313 0.0925 6.3766
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.6111 0.8468 0.1060 5.5621
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.1334 0.8697 0.1281 4.6052
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 40.6166 0.8571 0.1199 4.9173
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.0114 0.8608 0.1225 4.8147
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.6068 0.8653 0.1269 4.6491
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0090 0.8808 0.1412 4.1757
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.6967,0.8957,0.1550,3.8046
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.9872,0.8975,0.1566,3.7654
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.5481,0.9174,0.1758,3.3543
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.2746,0.9025,0.1628,3.6220
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.8015,0.9585,0.2149,2.7447
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.5897,0.8930,0.1487,3.9676
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.2586,0.9060,0.1659,3.5557
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.0351,0.9125,0.1703,3.4632
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.2103,0.9046,0.1621,3.6391
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.0865,0.9448,0.1991,2.9627
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9740,0.9065,0.1633,3.6112
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.3897,0.9236,0.1812,3.2545
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.2248,0.9248,0.1852,3.1844
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.7613,0.9188,0.1815,3.2495
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.6388,0.9486,0.2071,2.8486
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7174,0.9762,0.2337,2.5237
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.5395,0.9840,0.2405,2.4521
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.7977,0.9928,0.2513,2.3470
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.6514,0.9896,0.2459,2.3984
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.7425,1.0085,0.2658,2.2190
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2551,1.0664,0.3183,1.8532
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.3460,1.0430,0.2996,1.9688
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.5142,1.0712,0.3277,1.7998
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8241,1.0608,0.3158,1.8676
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0771,1.0782,0.3310,1.7820
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.6967 0.8957 0.1550 3.8046
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.9872 0.8975 0.1566 3.7654
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.5481 0.9174 0.1758 3.3543
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.2746 0.9025 0.1628 3.6220
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.8015 0.9585 0.2149 2.7447
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.5897 0.8930 0.1487 3.9676
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.2586 0.9060 0.1659 3.5557
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.0351 0.9125 0.1703 3.4632
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.2103 0.9046 0.1621 3.6391
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.0865 0.9448 0.1991 2.9627
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9740 0.9065 0.1633 3.6112
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.3897 0.9236 0.1812 3.2545
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.2248 0.9248 0.1852 3.1844
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.7613 0.9188 0.1815 3.2495
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.6388 0.9486 0.2071 2.8486
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7174 0.9762 0.2337 2.5237
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.5395 0.9840 0.2405 2.4521
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.7977 0.9928 0.2513 2.3470
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.6514 0.9896 0.2459 2.3984
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.7425 1.0085 0.2658 2.2190
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2551 1.0664 0.3183 1.8532
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.3460 1.0430 0.2996 1.9688
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.5142 1.0712 0.3277 1.7998
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8241 1.0608 0.3158 1.8676
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0771 1.0782 0.3310 1.7820
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.8226,1.0813,0.3362,1.7544
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,56.3071,1.0993,0.3600,1.6382
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,50.0457,1.1243,0.3837,1.5372
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,52.4455,1.0970,0.3554,1.6594
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.9306,1.1874,0.4438,1.3289
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.6472,1.0771,0.3339,1.7664
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,52.4971,1.1350,0.3937,1.4982
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,46.4390,1.1311,0.3896,1.5139
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,49.3310,1.1028,0.3635,1.6227
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,31.2247,1.1745,0.4337,1.3599
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,38.0267,1.1158,0.3735,1.5791
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,49.6037,1.1794,0.4390,1.3435
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,43.5645,1.1736,0.4312,1.3680
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.8686,1.1539,0.4120,1.4316
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.7460,1.2102,0.4646,1.2695
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.7716,1.2802,0.5371,1.0981
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.7436,1.3283,0.5809,1.0154
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,40.0304,1.3364,0.5920,0.9963
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.7463,1.3139,0.5701,1.0346
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.8190,1.3580,0.6114,0.9647
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2963,1.4986,0.7526,0.7837
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.5470,1.4191,0.6744,0.8747
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.7151,1.5040,0.7604,0.7757
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.8836,1.4908,0.7417,0.7953
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.1188,1.5523,0.8051,0.7327
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.8226 1.0813 0.3362 1.7544
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 56.3071 1.0993 0.3600 1.6382
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 50.0457 1.1243 0.3837 1.5372
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 52.4455 1.0970 0.3554 1.6594
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.9306 1.1874 0.4438 1.3289
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.6472 1.0771 0.3339 1.7664
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 52.4971 1.1350 0.3937 1.4982
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 46.4390 1.1311 0.3896 1.5139
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 49.3310 1.1028 0.3635 1.6227
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 31.2247 1.1745 0.4337 1.3599
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 38.0267 1.1158 0.3735 1.5791
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 49.6037 1.1794 0.4390 1.3435
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 43.5645 1.1736 0.4312 1.3680
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.8686 1.1539 0.4120 1.4316
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.7460 1.2102 0.4646 1.2695
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.7716 1.2802 0.5371 1.0981
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.7436 1.3283 0.5809 1.0154
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 40.0304 1.3364 0.5920 0.9963
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.7463 1.3139 0.5701 1.0346
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.8190 1.3580 0.6114 0.9647
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2963 1.4986 0.7526 0.7837
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.5470 1.4191 0.6744 0.8747
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.7151 1.5040 0.7604 0.7757
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.8836 1.4908 0.7417 0.7953
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.1188 1.5523 0.8051 0.7327
@@ -0,0 +1,26 @@
Image Set,Block Size,Name,PSNR,Total Time,Coding Time,Coding Rate
HDRIHaven,4x4,hdr-rgb-arboretum.hdr,45.5079,0.8407,0.0886,6.6551
HDRIHaven,4x4,hdr-rgb-bellparkpier.hdr,55.6274,0.8317,0.0845,6.9837
HDRIHaven,4x4,hdr-rgb-canarywharf.hdr,49.1770,0.8462,0.0950,6.2057
HDRIHaven,4x4,hdr-rgb-eveningroad.hdr,51.9296,0.8367,0.0861,6.8505
HDRIHaven,4x4,hdr-rgb-riverwalk.hdr,34.4914,0.8665,0.1184,4.9802
HDRIHaven,5x5,hdr-rgb-arboretum.hdr,41.4794,0.8293,0.0776,7.6039
HDRIHaven,5x5,hdr-rgb-bellparkpier.hdr,51.8484,0.8294,0.0779,7.5674
HDRIHaven,5x5,hdr-rgb-canarywharf.hdr,45.5278,0.8328,0.0818,7.2112
HDRIHaven,5x5,hdr-rgb-eveningroad.hdr,48.9680,0.8298,0.0776,7.6040
HDRIHaven,5x5,hdr-rgb-riverwalk.hdr,30.8894,0.8484,0.0974,6.0580
HDRIHaven,6x6,hdr-rgb-arboretum.hdr,37.9079,0.8256,0.0760,7.7585
HDRIHaven,6x6,hdr-rgb-bellparkpier.hdr,48.9405,0.8255,0.0801,7.3633
HDRIHaven,6x6,hdr-rgb-canarywharf.hdr,42.6946,0.8277,0.0812,7.2660
HDRIHaven,6x6,hdr-rgb-eveningroad.hdr,46.6017,0.8275,0.0798,7.3903
HDRIHaven,6x6,hdr-rgb-riverwalk.hdr,28.4862,0.8389,0.0912,6.4670
HDRIHaven,8x8,hdr-rgb-arboretum.hdr,33.6288,0.8625,0.1105,5.3399
HDRIHaven,8x8,hdr-rgb-bellparkpier.hdr,45.2302,0.8600,0.1106,5.3329
HDRIHaven,8x8,hdr-rgb-canarywharf.hdr,39.4837,0.8614,0.1104,5.3449
HDRIHaven,8x8,hdr-rgb-eveningroad.hdr,43.5502,0.8610,0.1128,5.2296
HDRIHaven,8x8,hdr-rgb-riverwalk.hdr,25.6749,0.8747,0.1227,4.8062
HDRIHaven,12x12,hdr-rgb-arboretum.hdr,30.2114,0.8978,0.1443,4.0866
HDRIHaven,12x12,hdr-rgb-bellparkpier.hdr,41.1351,0.8862,0.1386,4.2547
HDRIHaven,12x12,hdr-rgb-canarywharf.hdr,36.3187,0.8901,0.1399,4.2174
HDRIHaven,12x12,hdr-rgb-eveningroad.hdr,40.7270,0.8968,0.1463,4.0312
HDRIHaven,12x12,hdr-rgb-riverwalk.hdr,23.0489,0.9081,0.1586,3.7186
1 Image Set Block Size Name PSNR Total Time Coding Time Coding Rate
2 HDRIHaven 4x4 hdr-rgb-arboretum.hdr 45.5079 0.8407 0.0886 6.6551
3 HDRIHaven 4x4 hdr-rgb-bellparkpier.hdr 55.6274 0.8317 0.0845 6.9837
4 HDRIHaven 4x4 hdr-rgb-canarywharf.hdr 49.1770 0.8462 0.0950 6.2057
5 HDRIHaven 4x4 hdr-rgb-eveningroad.hdr 51.9296 0.8367 0.0861 6.8505
6 HDRIHaven 4x4 hdr-rgb-riverwalk.hdr 34.4914 0.8665 0.1184 4.9802
7 HDRIHaven 5x5 hdr-rgb-arboretum.hdr 41.4794 0.8293 0.0776 7.6039
8 HDRIHaven 5x5 hdr-rgb-bellparkpier.hdr 51.8484 0.8294 0.0779 7.5674
9 HDRIHaven 5x5 hdr-rgb-canarywharf.hdr 45.5278 0.8328 0.0818 7.2112
10 HDRIHaven 5x5 hdr-rgb-eveningroad.hdr 48.9680 0.8298 0.0776 7.6040
11 HDRIHaven 5x5 hdr-rgb-riverwalk.hdr 30.8894 0.8484 0.0974 6.0580
12 HDRIHaven 6x6 hdr-rgb-arboretum.hdr 37.9079 0.8256 0.0760 7.7585
13 HDRIHaven 6x6 hdr-rgb-bellparkpier.hdr 48.9405 0.8255 0.0801 7.3633
14 HDRIHaven 6x6 hdr-rgb-canarywharf.hdr 42.6946 0.8277 0.0812 7.2660
15 HDRIHaven 6x6 hdr-rgb-eveningroad.hdr 46.6017 0.8275 0.0798 7.3903
16 HDRIHaven 6x6 hdr-rgb-riverwalk.hdr 28.4862 0.8389 0.0912 6.4670
17 HDRIHaven 8x8 hdr-rgb-arboretum.hdr 33.6288 0.8625 0.1105 5.3399
18 HDRIHaven 8x8 hdr-rgb-bellparkpier.hdr 45.2302 0.8600 0.1106 5.3329
19 HDRIHaven 8x8 hdr-rgb-canarywharf.hdr 39.4837 0.8614 0.1104 5.3449
20 HDRIHaven 8x8 hdr-rgb-eveningroad.hdr 43.5502 0.8610 0.1128 5.2296
21 HDRIHaven 8x8 hdr-rgb-riverwalk.hdr 25.6749 0.8747 0.1227 4.8062
22 HDRIHaven 12x12 hdr-rgb-arboretum.hdr 30.2114 0.8978 0.1443 4.0866
23 HDRIHaven 12x12 hdr-rgb-bellparkpier.hdr 41.1351 0.8862 0.1386 4.2547
24 HDRIHaven 12x12 hdr-rgb-canarywharf.hdr 36.3187 0.8901 0.1399 4.2174
25 HDRIHaven 12x12 hdr-rgb-eveningroad.hdr 40.7270 0.8968 0.1463 4.0312
26 HDRIHaven 12x12 hdr-rgb-riverwalk.hdr 23.0489 0.9081 0.1586 3.7186

Some files were not shown because too many files have changed in this diff Show More