Merge pull request #15628 from hrydgard/arm64-neon-flag

Add ARM64_NEON compile arch flag
2024-11-24 14:00:03 +00:00 · 2022-06-25 16:29:03 -07:00 · 2022-06-25 16:29:03 -07:00 · 8ae673572a
commit 8ae673572a
parent ac057bdf91 cd92151de7
5 changed files with 29 additions and 28 deletions
--- a/Core/MIPS/IR/IRInterpreter.cpp
+++ b/Core/MIPS/IR/IRInterpreter.cpp
@ -258,7 +258,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 			vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));
 #else
 			memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
@ -270,7 +270,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 			vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
 #else
 			for (int i = 0; i < 4; i++)
@ -283,7 +283,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 			vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
 #else
 			for (int i = 0; i < 4; i++)
@ -296,7 +296,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 			vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
 #else
 			for (int i = 0; i < 4; i++)
@ -331,7 +331,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 			vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));
 #else
 			for (int i = 0; i < 4; i++)
@ -344,7 +344,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		{
 #if defined(_M_SSE)
 			_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 			vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));
 #else
 			for (int i = 0; i < 4; i++)
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@ -69,7 +69,7 @@ public:
 #if defined(_M_SSE)
 		__m128i ivec;
 		__m128 vec;
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 		int32x4_t ivec;
 		float32x4_t vec;
 #endif
@ -84,7 +84,7 @@ public:
 #if defined(_M_SSE)
 	Vec2(const __m128 &_vec) : vec(_vec) {}
 	Vec2(const __m128i &_ivec) : ivec(_ivec) {}
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 	Vec2(const float32x4_t &_vec) : vec(_vec) {}
 #if !defined(_MSC_VER)
 	Vec2(const int32x4_t &_ivec) : ivec(_ivec) {}
@ -217,7 +217,7 @@ public:
 #if defined(_M_SSE)
 		__m128i ivec;
 		__m128 vec;
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 		int32x4_t ivec;
 		float32x4_t vec;
 #endif
@ -236,7 +236,7 @@ public:
 	Vec3(const Vec3Packed<T> &_xyz) {
 		vec = _mm_loadu_ps(_xyz.AsArray());
 	}
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 	Vec3(const float32x4_t &_vec) : vec(_vec) {}
 #if !defined(_MSC_VER)
 	Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
@ -576,7 +576,7 @@ public:
 #if defined(_M_SSE)
 		__m128i ivec;
 		__m128 vec;
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 		int32x4_t ivec;
 		float32x4_t vec;
 #endif
@ -593,7 +593,7 @@ public:
 #if defined(_M_SSE)
 	Vec4(const __m128 &_vec) : vec(_vec) {}
 	Vec4(const __m128i &_ivec) : ivec(_ivec) {}
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 	Vec4(const float32x4_t &_vec) : vec(_vec) {}
 #if !defined(_MSC_VER)
 	Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
@ -918,7 +918,7 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
 	vecOut[0] = _mm_cvtss_f32(sum);
 	vecOut[1] = vectorGetByIndex<1>(sum);
 	vecOut[2] = vectorGetByIndex<2>(sum);
-#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 	float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(v), m);
 	vecOut[0] = vgetq_lane_f32(sum, 0);
 	vecOut[1] = vgetq_lane_f32(sum, 1);
@ -936,7 +936,7 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
 	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
 	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
 	return Vec3ByMatrix43Internal(x, y, z, m);
-#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 	return Vec3ByMatrix43Internal(v.vec, m);
 #else
 	Vec3f vecOut;
@ -957,7 +957,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix44Internal(__m128 x, __m128 y, __m128 z, c
 		_mm_add_ps(_mm_mul_ps(col2, z), col3));
 	return sum;
 }
-#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
 	float32x4_t col0 = vld1q_f32(m);
 	float32x4_t col1 = vld1q_f32(m + 4);
@ -977,7 +977,7 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
 	__m128 z = _mm_set1_ps(v[2]);
 	__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
 	_mm_storeu_ps(vecOut, sum);
-#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 	float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(v), m);
 	vst1q_f32(vecOut, sum);
 #else
@ -994,7 +994,7 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
 	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
 	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
 	return Vec3ByMatrix44Internal(x, y, z, m);
-#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 	return Vec3ByMatrix44Internal(v.vec, m);
 #else
 	Vec4f vecOut;
@ -1014,7 +1014,7 @@ inline __m128 MATH3D_CALL Norm3ByMatrix43Internal(__m128 x, __m128 y, __m128 z,
 		_mm_mul_ps(col2, z));
 	return sum;
 }
-#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
 	float32x4_t col0 = vld1q_f32(m);
 	float32x4_t col1 = vld1q_f32(m + 3);
@ -1035,7 +1035,7 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]
 	vecOut[0] = _mm_cvtss_f32(sum);
 	vecOut[1] = vectorGetByIndex<1>(sum);
 	vecOut[2] = vectorGetByIndex<2>(sum);
-#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 	float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
 	vecOut[0] = vgetq_lane_f32(sum, 0);
 	vecOut[1] = vgetq_lane_f32(sum, 1);
@ -1053,7 +1053,7 @@ inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
 	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
 	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
 	return Norm3ByMatrix43Internal(x, y, z, m);
-#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 	return Norm3ByMatrix43Internal(v.vec, m);
 #else
 	Vec3f vecOut;
--- a/GPU/Software/RasterizerRegCache.cpp
+++ b/GPU/Software/RasterizerRegCache.cpp
@ -24,7 +24,7 @@ namespace Rasterizer {
 void RegCache::SetupABI(const std::vector<Purpose> &args, bool forceRetain) {
 #if PPSSPP_ARCH(ARM)
 	_assert_msg_(false, "Not yet implemented");
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 	using namespace Arm64Gen;

 	// ARM64 has a generous allotment of registers.
@ -389,7 +389,7 @@ RegCache::RegStatus *RegCache::FindReg(Reg r, Purpose p) {
 }

 CodeBlock::CodeBlock(int size)
-#if PPSSPP_ARCH(ARM64)
+#if PPSSPP_ARCH(ARM64_NEON)
 	: fp(this)
 #endif
 {
--- a/GPU/Software/RasterizerRegCache.h
+++ b/GPU/Software/RasterizerRegCache.h
@ -28,7 +28,7 @@
 #if defined(_M_SSE)
 #include <emmintrin.h>
 #endif
-#if PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
+#if PPSSPP_ARCH(ARM64_NEON)
 #if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
 #include <arm64_neon.h>
 #else
@ -38,7 +38,7 @@

 #if PPSSPP_ARCH(ARM)
 #include "Common/ArmEmitter.h"
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 #include "Common/Arm64Emitter.h"
 #elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 #include "Common/x64Emitter.h"
@ -54,7 +54,7 @@ namespace Rasterizer {
 // While not part of the reg cache proper, this is the type it is built for.
 #if PPSSPP_ARCH(ARM)
 typedef ArmGen::ARMXCodeBlock BaseCodeBlock;
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 typedef Arm64Gen::ARM64CodeBlock BaseCodeBlock;
 #elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 typedef Gen::XCodeBlock BaseCodeBlock;
@ -65,7 +65,7 @@ typedef FakeGen::FakeXCodeBlock BaseCodeBlock;
 #endif

 // We also have the types of things that end up in regs.
-#if PPSSPP_ARCH(ARM64)
+#if PPSSPP_ARCH(ARM64_NEON)
 typedef int32x4_t Vec4IntArg;
 typedef int32x4_t Vec4IntResult;
 typedef float32x4_t Vec4FloatArg;
@ -160,7 +160,7 @@ struct RegCache {
 #if PPSSPP_ARCH(ARM)
 	typedef ArmGen::ARMReg Reg;
 	static constexpr Reg REG_INVALID_VALUE = ArmGen::INVALID_REG;
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM64_NEON)
 	typedef Arm64Gen::ARM64Reg Reg;
 	static constexpr Reg REG_INVALID_VALUE = Arm64Gen::INVALID_REG;
 #elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
@ -242,7 +242,7 @@ protected:
 	void WriteDynamicConst8x16(const u8 *&ptr, uint16_t value);
 	void WriteDynamicConst4x32(const u8 *&ptr, uint32_t value);

-#if PPSSPP_ARCH(ARM64)
+#if PPSSPP_ARCH(ARM64_NEON)
 	Arm64Gen::ARM64FloatEmitter fp;
 #endif

--- a/ppsspp_config.h
+++ b/ppsspp_config.h
@ -54,6 +54,7 @@
    #define PPSSPP_ARCH_ARM64 1
    #define PPSSPP_ARCH_64BIT 1
    #define PPSSPP_ARCH_ARM_NEON 1
+    #define PPSSPP_ARCH_ARM64_NEON 1
 #endif

 #if defined(__mips64__)