Add NEON impl too, mostly for exercise purposes

This commit is contained in:
Henrik Rydgård 2022-12-01 17:09:54 +01:00
parent e6f0f84a45
commit d0e8cfa365
2 changed files with 25 additions and 0 deletions

View File

@ -52,6 +52,13 @@ inline uint32_t Float4ToUint8x4(const float f[4]) {
__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
return _mm_cvtsi128_si32(ivalue);
#elif PPSSPP_ARCH(ARM_NEON)
const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
uint32x4_t ivalue32 = vcvtq_u32_f32(value);
uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16)); // Is there no way to avoid the combine here?
uint32x2_t outValue32 = vreinterpret_u8_u32(ivalue8);
return vget_lane_u32(outValue32, 0);
#else
int i4[4];
for (int i = 0; i < 4; i++) {
@ -74,6 +81,13 @@ inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) {
__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
return _mm_cvtsi128_si32(ivalue);
#elif PPSSPP_ARCH(ARM_NEON)
const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
uint32x4_t ivalue32 = vcvtq_u32_f32(value);
uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16)); // Is there no way to avoid the combine here?
uint32x2_t outValue32 = vreinterpret_u8_u32(ivalue8);
return vget_lane_u32(outValue32, 0);
#else
u32 i4[4];
for (int i = 0; i < 4; i++) {

View File

@ -41,6 +41,7 @@
#endif
#include "Common/Data/Collections/TinySet.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Data/Text/Parsers.h"
#include "Common/Data/Text/WrapText.h"
#include "Common/Data/Encoding/Utf8.h"
@ -782,6 +783,15 @@ static bool TestWrapText() {
return true;
}
// Runs the same in-range input through both float[4] -> packed uint32_t
// converters and expects the identical packed value from each.
static bool TestSmallDataConvert() {
	// Each component is k/255 for k = 1..4; the expected word below has
	// component 0 in the lowest byte and component 3 in the highest.
	const float input[4] = { 1.0f / 255.0f, 2.0f / 255.0f, 3.0f / 255.0f, 4.0f / 255.0f };

	// Non-clamping variant first.
	const uint32_t unclamped = Float4ToUint8x4_NoClamp(input);
	EXPECT_EQ_HEX(unclamped, 0x04030201);

	// The clamping variant must agree for values already inside [0, 1].
	const uint32_t clamped = Float4ToUint8x4(input);
	EXPECT_EQ_HEX(clamped, 0x04030201);

	return true;
}
// Pointer type for a test entry point: takes no arguments and returns bool.
// NOTE(review): the test visible here returns true once its checks pass;
// presumably false signals failure - confirm against the test runner.
typedef bool (*TestFunc)();
struct TestItem {
const char *name;
@ -832,6 +842,7 @@ TestItem availableTests[] = {
TEST_ITEM(ThreadManager),
TEST_ITEM(WrapText),
TEST_ITEM(TinySet),
TEST_ITEM(SmallDataConvert),
};
int main(int argc, const char *argv[]) {