Merge pull request #15628 from hrydgard/arm64-neon-flag

Add ARM64_NEON compile arch flag
This commit is contained in:
Unknown W. Brackets 2022-06-25 16:29:03 -07:00 committed by GitHub
commit 8ae673572a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 29 additions and 28 deletions

View File

@ -258,7 +258,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));
#else
memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
@ -270,7 +270,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else
for (int i = 0; i < 4; i++)
@ -283,7 +283,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else
for (int i = 0; i < 4; i++)
@ -296,7 +296,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else
for (int i = 0; i < 4; i++)
@ -331,7 +331,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));
#else
for (int i = 0; i < 4; i++)
@ -344,7 +344,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));
#else
for (int i = 0; i < 4; i++)

View File

@ -69,7 +69,7 @@ public:
#if defined(_M_SSE)
__m128i ivec;
__m128 vec;
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
int32x4_t ivec;
float32x4_t vec;
#endif
@ -84,7 +84,7 @@ public:
#if defined(_M_SSE)
Vec2(const __m128 &_vec) : vec(_vec) {}
Vec2(const __m128i &_ivec) : ivec(_ivec) {}
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
Vec2(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER)
Vec2(const int32x4_t &_ivec) : ivec(_ivec) {}
@ -217,7 +217,7 @@ public:
#if defined(_M_SSE)
__m128i ivec;
__m128 vec;
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
int32x4_t ivec;
float32x4_t vec;
#endif
@ -236,7 +236,7 @@ public:
Vec3(const Vec3Packed<T> &_xyz) {
vec = _mm_loadu_ps(_xyz.AsArray());
}
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
Vec3(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER)
Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
@ -576,7 +576,7 @@ public:
#if defined(_M_SSE)
__m128i ivec;
__m128 vec;
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
int32x4_t ivec;
float32x4_t vec;
#endif
@ -593,7 +593,7 @@ public:
#if defined(_M_SSE)
Vec4(const __m128 &_vec) : vec(_vec) {}
Vec4(const __m128i &_ivec) : ivec(_ivec) {}
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
Vec4(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER)
Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
@ -918,7 +918,7 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
vecOut[0] = _mm_cvtss_f32(sum);
vecOut[1] = vectorGetByIndex<1>(sum);
vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(v), m);
vecOut[0] = vgetq_lane_f32(sum, 0);
vecOut[1] = vgetq_lane_f32(sum, 1);
@ -936,7 +936,7 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
return Vec3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
return Vec3ByMatrix43Internal(v.vec, m);
#else
Vec3f vecOut;
@ -957,7 +957,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix44Internal(__m128 x, __m128 y, __m128 z, c
_mm_add_ps(_mm_mul_ps(col2, z), col3));
return sum;
}
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
float32x4_t col0 = vld1q_f32(m);
float32x4_t col1 = vld1q_f32(m + 4);
@ -977,7 +977,7 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
__m128 z = _mm_set1_ps(v[2]);
__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
_mm_storeu_ps(vecOut, sum);
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(v), m);
vst1q_f32(vecOut, sum);
#else
@ -994,7 +994,7 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
return Vec3ByMatrix44Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
return Vec3ByMatrix44Internal(v.vec, m);
#else
Vec4f vecOut;
@ -1014,7 +1014,7 @@ inline __m128 MATH3D_CALL Norm3ByMatrix43Internal(__m128 x, __m128 y, __m128 z,
_mm_mul_ps(col2, z));
return sum;
}
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
float32x4_t col0 = vld1q_f32(m);
float32x4_t col1 = vld1q_f32(m + 3);
@ -1035,7 +1035,7 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]
vecOut[0] = _mm_cvtss_f32(sum);
vecOut[1] = vectorGetByIndex<1>(sum);
vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
vecOut[0] = vgetq_lane_f32(sum, 0);
vecOut[1] = vgetq_lane_f32(sum, 1);
@ -1053,7 +1053,7 @@ inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
return Norm3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
return Norm3ByMatrix43Internal(v.vec, m);
#else
Vec3f vecOut;

View File

@ -24,7 +24,7 @@ namespace Rasterizer {
void RegCache::SetupABI(const std::vector<Purpose> &args, bool forceRetain) {
#if PPSSPP_ARCH(ARM)
_assert_msg_(false, "Not yet implemented");
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
using namespace Arm64Gen;
// ARM64 has a generous allotment of registers.
@ -389,7 +389,7 @@ RegCache::RegStatus *RegCache::FindReg(Reg r, Purpose p) {
}
CodeBlock::CodeBlock(int size)
#if PPSSPP_ARCH(ARM64)
#if PPSSPP_ARCH(ARM64_NEON)
: fp(this)
#endif
{

View File

@ -28,7 +28,7 @@
#if defined(_M_SSE)
#include <emmintrin.h>
#endif
#if PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
#if PPSSPP_ARCH(ARM64_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
@ -38,7 +38,7 @@
#if PPSSPP_ARCH(ARM)
#include "Common/ArmEmitter.h"
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
#include "Common/Arm64Emitter.h"
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include "Common/x64Emitter.h"
@ -54,7 +54,7 @@ namespace Rasterizer {
// While not part of the reg cache proper, this is the type it is built for.
#if PPSSPP_ARCH(ARM)
typedef ArmGen::ARMXCodeBlock BaseCodeBlock;
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
typedef Arm64Gen::ARM64CodeBlock BaseCodeBlock;
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
typedef Gen::XCodeBlock BaseCodeBlock;
@ -65,7 +65,7 @@ typedef FakeGen::FakeXCodeBlock BaseCodeBlock;
#endif
// We also have the types of things that end up in regs.
#if PPSSPP_ARCH(ARM64)
#if PPSSPP_ARCH(ARM64_NEON)
typedef int32x4_t Vec4IntArg;
typedef int32x4_t Vec4IntResult;
typedef float32x4_t Vec4FloatArg;
@ -160,7 +160,7 @@ struct RegCache {
#if PPSSPP_ARCH(ARM)
typedef ArmGen::ARMReg Reg;
static constexpr Reg REG_INVALID_VALUE = ArmGen::INVALID_REG;
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
typedef Arm64Gen::ARM64Reg Reg;
static constexpr Reg REG_INVALID_VALUE = Arm64Gen::INVALID_REG;
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
@ -242,7 +242,7 @@ protected:
void WriteDynamicConst8x16(const u8 *&ptr, uint16_t value);
void WriteDynamicConst4x32(const u8 *&ptr, uint32_t value);
#if PPSSPP_ARCH(ARM64)
#if PPSSPP_ARCH(ARM64_NEON)
Arm64Gen::ARM64FloatEmitter fp;
#endif

View File

@ -54,6 +54,7 @@
#define PPSSPP_ARCH_ARM64 1
#define PPSSPP_ARCH_64BIT 1
#define PPSSPP_ARCH_ARM_NEON 1
#define PPSSPP_ARCH_ARM64_NEON 1
#endif
#if defined(__mips64__)