NEON-optimize matrix transposes

This commit is contained in:
Henrik Rydgård 2023-11-27 23:21:47 +01:00
parent 45aae7b9da
commit 4ec2d76bc9
2 changed files with 17 additions and 1 deletions

View File

@ -1944,7 +1944,7 @@ bool GPUCommon::DescribeCodePtr(const u8 *ptr, std::string &name) {
}
void GPUCommon::UpdateUVScaleOffset() {
#ifdef _M_SSE
#if defined(_M_SSE)
__m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)&gstate.texscaleu), 8);
_mm_storeu_si128((__m128i *)&gstate_c.uv, values);
#elif PPSSPP_ARCH(ARM_NEON)

View File

@ -1152,6 +1152,13 @@ inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {
}
inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
// vld3q is a perfect match here!
float32x4x3_t packed = vld3q_f32(m4x3);
vst1q_f32(m4x4, packed.val[0]);
vst1q_f32(m4x4 + 4, packed.val[1]);
vst1q_f32(m4x4 + 8, packed.val[2]);
#else
m4x4[0] = m4x3[0];
m4x4[1] = m4x3[3];
m4x4[2] = m4x3[6];
@ -1164,6 +1171,7 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
m4x4[9] = m4x3[5];
m4x4[10] = m4x3[8];
m4x4[11] = m4x3[11];
#endif
m4x4[12] = 0.0f;
m4x4[13] = 0.0f;
m4x4[14] = 0.0f;
@ -1179,6 +1187,13 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
// 89AB
// Don't see a way to SIMD that without NEON's vld3q. Should be pretty fast anyway.
inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
// vld3q is a perfect match here!
float32x4x3_t packed = vld3q_f32(m4x3);
vst1q_f32(m4x4, packed.val[0]);
vst1q_f32(m4x4 + 4, packed.val[1]);
vst1q_f32(m4x4 + 8, packed.val[2]);
#else
m4x4[0] = m4x3[0];
m4x4[1] = m4x3[3];
m4x4[2] = m4x3[6];
@ -1191,6 +1206,7 @@ inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
m4x4[9] = m4x3[5];
m4x4[10] = m4x3[8];
m4x4[11] = m4x3[11];
#endif
}
inline void Transpose4x4(float out[16], const float in[16]) {