ppsspp/Common/Math/CrossSIMD.h

66 lines
1.7 KiB
C

// CrossSIMD
//
// Compatibility wrappers for SIMD dialects.
//
// In the long run, might do a more general single-source-SIMD wrapper here consisting
// of defines that translate to either NEON or SSE. It would be possible to write quite a lot of
// our various color conversion functions and so on in a pretty generic manner.
#include "ppsspp_config.h"
#include "stdint.h"
#ifdef __clang__
// Weird how you can't just use #pragma in a macro.
#define DO_NOT_VECTORIZE_LOOP _Pragma("clang loop vectorize(disable)")
#else
#define DO_NOT_VECTORIZE_LOOP
#endif
#if PPSSPP_ARCH(SSE2)
#include <emmintrin.h>
#endif
#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif
// Basic types
#if PPSSPP_ARCH(ARM64_NEON)
// No special ones here.
#elif PPSSPP_ARCH(ARM_NEON)
// Compatibility wrappers making ARM64 NEON code run on ARM32
// With optimization on, these should compile down to the optimal code.
inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
switch (lane & 3) {
case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
default: return vmulq_lane_f32(a, vget_high_f32(b), 1);
}
}
inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
switch (lane & 3) {
case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
}
}
inline uint32x4_t vcgezq_f32(float32x4_t v) {
return vcgeq_f32(v, vdupq_n_f32(0.0f));
}
#endif