From ba01eac4673bba1cd28e83346a54239e14539e8c Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Thu, 24 Aug 2023 14:18:33 -0700
Subject: [PATCH] IR: Adds support for ARM's FCMA FCADD instruction

---
 .../Core/Interpreter/InterpreterDefines.h     | 15 ++++++
 .../Core/Interpreter/InterpreterOps.cpp       |  1 +
 .../Core/Interpreter/InterpreterOps.h         |  1 +
 .../Interface/Core/Interpreter/VectorOps.cpp  | 45 +++++++++++++++++++
 .../Source/Interface/Core/JIT/Arm64/JIT.cpp   |  1 +
 .../Interface/Core/JIT/Arm64/JITClass.h       |  1 +
 .../Interface/Core/JIT/Arm64/VectorOps.cpp    | 47 ++++++++++++++++++++
 FEXCore/Source/Interface/IR/IR.json           |  4 ++
 8 files changed, 115 insertions(+)

diff --git a/FEXCore/Source/Interface/Core/Interpreter/InterpreterDefines.h b/FEXCore/Source/Interface/Core/Interpreter/InterpreterDefines.h
index 63da19906..1b42c769d 100644
--- a/FEXCore/Source/Interface/Core/Interpreter/InterpreterDefines.h
+++ b/FEXCore/Source/Interface/Core/Interpreter/InterpreterDefines.h
@@ -65,6 +65,21 @@
     } \
     break; \
   }
+// Emits a `case size:` arm that walks the vectors two elements at a time and
+// calls func(dst, src1, src2) with pointers to the first element of each pair.
+// `Tmp`, `Src1`, `Src2` and `Elements` must be in scope at the expansion site;
+// results are written into `Tmp`.
+#define DO_VECTOR_FCADD_PAIR_OP(size, type, func) \
+  case size: { \
+  auto *Dst_d = reinterpret_cast<type*>(std::data(Tmp)); \
+  auto *Src1_d = reinterpret_cast<const type*>(Src1); \
+  auto *Src2_d = reinterpret_cast<const type*>(Src2); \
+  for (uint8_t i = 0; i < Elements; i += 2) { \
+    func(&Dst_d[i], &Src1_d[i], &Src2_d[i]); \
+  } \
+  break; \
+  }
+
 #define DO_VECTOR_SCALAR_OP(size, type, func) \
   case size: { \
   auto *Dst_d = reinterpret_cast<type*>(std::data(Tmp)); \
diff --git a/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.cpp b/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.cpp
index ad24d04ed..21c43accd 100644
--- a/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.cpp
+++ b/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.cpp
@@ -286,6 +286,7 @@ constexpr OpHandlerArray InterpreterOpHandlers = [] {
   REGISTER_OP(VREV64, VRev64);
   REGISTER_OP(VPCMPESTRX, VPCMPESTRX);
   REGISTER_OP(VPCMPISTRX, VPCMPISTRX);
+  REGISTER_OP(VFCADD, VFCADD);
 
   // Encryption ops
   REGISTER_OP(VAESIMC, AESImc);
diff --git a/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h b/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h
index 53d9dfabb..85e245278 100644
--- a/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h
+++ b/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h
@@ -314,6 +314,7 @@ namespace FEXCore::CPU {
   DEF_OP(VRev64);
   DEF_OP(VPCMPESTRX);
   DEF_OP(VPCMPISTRX);
+  DEF_OP(VFCADD);
 
   ///< Encryption ops
   DEF_OP(AESImc);
diff --git a/FEXCore/Source/Interface/Core/Interpreter/VectorOps.cpp b/FEXCore/Source/Interface/Core/Interpreter/VectorOps.cpp
index b49c273c0..8c27f8ea0 100644
--- a/FEXCore/Source/Interface/Core/Interpreter/VectorOps.cpp
+++ b/FEXCore/Source/Interface/Core/Interpreter/VectorOps.cpp
@@ -2561,6 +2561,51 @@ DEF_OP(VPCMPISTRX) {
   memcpy(GDP, &Result, sizeof(Result));
 }
 
+// Interpreter implementation of VFCADD (ARM FCADD semantics): each adjacent
+// pair of elements is treated as {real, imaginary}. Vector2 is rotated by 90
+// or 270 degrees in the complex plane and then added to Vector1.
+DEF_OP(VFCADD) {
+  const auto Op = IROp->C<IR::IROp_VFCADD>();
+  const uint8_t OpSize = IROp->Size;
+
+  const auto *Src1 = GetSrc<void*>(Data->SSAData, Op->Vector1);
+  const auto *Src2 = GetSrc<void*>(Data->SSAData, Op->Vector2);
+  const auto Rotate = Op->Rotate;
+  LOGMAN_THROW_A_FMT(Rotate == 90 || Rotate == 270, "Invalid rotate!");
+
+  TempVectorDataArray Tmp;
+
+  const uint8_t ElementSize = Op->Header.ElementSize;
+  const uint8_t Elements = OpSize / ElementSize;
+
+  // Operates on a single {real, imaginary} pair:
+  //   Rotate ==  90: dst = {src1.re - src2.im, src1.im + src2.re}
+  //   Rotate == 270: dst = {src1.re + src2.im, src1.im - src2.re}
+  const auto Func = [Rotate](auto dst, auto src1, auto src2) {
+    auto Element1 = src2[1];
+    auto Element3 = src2[0];
+    if (Rotate == 90) {
+      Element1 = -Element1;
+    }
+    else {
+      Element3 = -Element3;
+    }
+    dst[0] = src1[0] + Element1;
+    dst[1] = src1[1] + Element3;
+  };
+
+  switch (ElementSize) {
+    // Half-precision elements are not handled by the interpreter.
+    //DO_VECTOR_FCADD_PAIR_OP(2, float16_t, Func)
+    DO_VECTOR_FCADD_PAIR_OP(4, float, Func)
+    DO_VECTOR_FCADD_PAIR_OP(8, double, Func)
+    default:
+      LOGMAN_MSG_A_FMT("Unknown Element Size: {}", ElementSize);
+      break;
+  }
+  memcpy(GDP, Tmp.data(), OpSize);
+}
+
 #undef DEF_OP
 
 } // namespace FEXCore::CPU
diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp
index b195bb5af..9e0a0edde 100644
--- a/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp
+++ b/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp
@@ -1104,6 +1104,7 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry,
           REGISTER_OP(VTBL1, VTBL1);
           REGISTER_OP(VREV32, VRev32);
           REGISTER_OP(VREV64, VRev64);
+          REGISTER_OP(VFCADD, VFCADD);
 #undef REGISTER_OP
 
           default:
diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/JITClass.h b/FEXCore/Source/Interface/Core/JIT/Arm64/JITClass.h
index 19317adce..29e242d7b 100644
--- a/FEXCore/Source/Interface/Core/JIT/Arm64/JITClass.h
+++ b/FEXCore/Source/Interface/Core/JIT/Arm64/JITClass.h
@@ -470,6 +470,7 @@ private:
   DEF_OP(VTBL1);
   DEF_OP(VRev32);
   DEF_OP(VRev64);
+  DEF_OP(VFCADD);
 
   ///< Encryption ops
   DEF_OP(AESImc);
diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/VectorOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/VectorOps.cpp
index 39b256a78..af4f40b1b 100644
--- a/FEXCore/Source/Interface/Core/JIT/Arm64/VectorOps.cpp
+++ b/FEXCore/Source/Interface/Core/JIT/Arm64/VectorOps.cpp
@@ -3825,6 +3825,53 @@ DEF_OP(VRev64) {
   }
 }
 
+// Emits a host FCADD for VFCADD: the SVE form for 256-bit operations when the
+// host supports SVE256, otherwise the ASIMD form on the 64-bit or 128-bit view.
+DEF_OP(VFCADD) {
+  const auto Op = IROp->C<IR::IROp_VFCADD>();
+  const auto OpSize = IROp->Size;
+
+  const auto ElementSize = Op->Header.ElementSize;
+  const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
+
+  const auto Dst = GetVReg(Node);
+  const auto Vector1 = GetVReg(Op->Vector1.ID());
+  const auto Vector2 = GetVReg(Op->Vector2.ID());
+
+  LOGMAN_THROW_AA_FMT(ElementSize == 2 || ElementSize == 4 || ElementSize == 8, "Invalid size");
+  LOGMAN_THROW_A_FMT(Op->Rotate == 90 || Op->Rotate == 270, "Invalid rotate!");
+  const auto SubRegSize =
+    ElementSize == 2 ? ARMEmitter::SubRegSize::i16Bit :
+    ElementSize == 4 ? ARMEmitter::SubRegSize::i32Bit : ARMEmitter::SubRegSize::i64Bit;
+  const auto Rotate =
+    Op->Rotate == 90 ? ARMEmitter::Rotation::ROTATE_90 : ARMEmitter::Rotation::ROTATE_270;
+
+  if (HostSupportsSVE256 && Is256Bit) {
+    const auto Mask = PRED_TMP_32B.Merging();
+
+    if (Dst == Vector1) {
+      // Trivial case where we already have first vector in the destination
+      // register. We can just do the operation in place.
+      fcadd(SubRegSize, Dst.Z(), Mask, Vector1.Z(), Vector2.Z(), Rotate);
+    }
+    else {
+      // SVE FCADD is a destructive operation, so we need
+      // a temporary for performing operations.
+      movprfx(VTMP1.Z(), Vector1.Z());
+      fcadd(SubRegSize, VTMP1.Z(), Mask, VTMP1.Z(), Vector2.Z(), Rotate);
+      mov(Dst.Z(), VTMP1.Z());
+    }
+  } else {
+    // ASIMD FCADD takes a separate destination, so no temporary is needed.
+    if (OpSize == 8) {
+      fcadd(SubRegSize, Dst.D(), Vector1.D(), Vector2.D(), Rotate);
+    }
+    else {
+      fcadd(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q(), Rotate);
+    }
+  }
+}
+
 #undef DEF_OP
 
 }
diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json
index eddc13b48..154278404 100644
--- a/FEXCore/Source/Interface/IR/IR.json
+++ b/FEXCore/Source/Interface/IR/IR.json
@@ -1558,6 +1558,10 @@
           "course of creating the intermediate result"
         ],
         "DestSize": "4"
+      },
+      "FPR = VFCADD u8:#RegisterSize, u8:#ElementSize, FPR:$Vector1, FPR:$Vector2, u16:$Rotate": {
+        "DestSize": "RegisterSize",
+        "NumElements": "RegisterSize / ElementSize"
       }
     },
     "Conv": {