diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp
index c2a5ba8c4d..814fc7e0d6 100644
--- a/Common/x64Emitter.cpp
+++ b/Common/x64Emitter.cpp
@@ -1697,7 +1697,6 @@ void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, ar
 void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
 
-// THESE TWO ARE UNTESTED.
 void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
 void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
 
@@ -1892,6 +1891,9 @@ void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest
 void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
 void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}
 
+void XEmitter::INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteSSE41Op(0x66, 0x3A21, dest, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
+void XEmitter::EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg) { WriteSSE41Op(0x66, 0x3A17, arg, dest, 1); Write8(subreg); }
+
 void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
 void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}
 void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);}
@@ -2084,7 +2086,7 @@ void XEmitter::VCVTTPD2DQ(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits,
 void XEmitter::VCVTTSS2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF3, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
 void XEmitter::VCVTTSD2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF2, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
 void XEmitter::VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A17, regOp1, arg, 1); Write8(subreg); }
-void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8(subreg); }
+void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
 void XEmitter::VLDDQU(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0xF2, sseLDDQU, regOp1, arg); }
 void XEmitter::VMOVAPS(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x00, sseMOVAPfromRM, regOp1, arg); }
 void XEmitter::VMOVAPD(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x66, sseMOVAPfromRM, regOp1, arg); }
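Reviewer note on the encoding above: the imm8 that `Write8()` emits for INSERTPS packs three fields. Bits 7:6 select the source lane (only meaningful for an XMM source; a memory source loads one 32-bit float and ignores them, which is why the emitter defaults `srcsubreg` to 0), bits 5:4 select the destination lane, and bits 3:0 zero destination lanes. A minimal standalone sketch with SSE4.1 intrinsics (illustrative, not part of the patch) exercising the same layout:

```cpp
#include <smmintrin.h>  // SSE4.1
#include <cstdio>

int main() {
	__m128 dst = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes {1, 2, 3, 4}
	__m128 src = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);  // lanes {5, 6, 7, 8}
	// srcsubreg = 2, dstsubreg = 1, zmask = 0b1000: copy src lane 2 (7.0)
	// into dst lane 1, then zero dst lane 3 -- same layout as Write8() above.
	__m128 r = _mm_insert_ps(dst, src, (2 << 6) | (1 << 4) | 0b1000);
	float out[4];
	_mm_storeu_ps(out, r);
	printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 7 3 0
	return 0;
}
```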
diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h
index 16f30a35b0..832ed767cb 100644
--- a/Common/x64Emitter.h
+++ b/Common/x64Emitter.h
@@ -684,12 +684,14 @@ public:
 	// SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
 	void DPPD(X64Reg dest, OpArg src, u8 arg);
-
-	// These are probably useful for VFPU emulation.
-	void INSERTPS(X64Reg dest, OpArg src, u8 arg);
-	void EXTRACTPS(OpArg dest, X64Reg src, u8 arg);
 #endif
 
+	// SSE4: Insert and extract for floats.
+	// Note: the source can be memory or any lane of an XMM; zmask bits zero dest lanes.
+	void INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
+	// Extract to memory or a GPR.
+	void EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg);
+
 	// SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
 	void HADDPS(X64Reg dest, OpArg src);
@@ -1040,7 +1042,7 @@ public:
 	// Can only extract from the low 128 bits.
 	void VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg);
 	// Can only insert into the low 128 bits, zeros upper bits. Inserts from XMM.
-	void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg);
+	void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
 	void VLDDQU(int bits, X64Reg regOp1, OpArg arg);
 	void VMOVAPS(int bits, X64Reg regOp1, OpArg arg);
 	void VMOVAPD(int bits, X64Reg regOp1, OpArg arg);
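For reference, a sketch of how call sites read with the split immediates. The helper below is hypothetical (the register choices and the [RBX + 16] displacement are made up for illustration); `R()`, `MDisp()`, and the register enums come from the emitter headers:

```cpp
#include "Common/x64Emitter.h"

using namespace Gen;

// Hypothetical example, not part of this patch.
static void EmitLaneExample(XEmitter &emit) {
	// Copy lane 3 of XMM1 into lane 0 of XMM0; other lanes are unchanged.
	emit.INSERTPS(XMM0, R(XMM1), 0, 3);
	// Load a float from [RBX + 16] into lane 2, zeroing lane 3 via the zmask.
	emit.INSERTPS(XMM0, MDisp(RBX, 16), 2, 0, 0b1000);
	// Store lane 1 of XMM0 into EAX as raw IEEE-754 bits.
	emit.EXTRACTPS(R(EAX), XMM0, 1);
}
```

Callers of the old signature passed the whole imm8 by hand (e.g. 0x10 for destination lane 1); with the defaulted `srcsubreg` and `zmask`, that call is now simply `INSERTPS(dst, src, 1)`.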
diff --git a/Core/MIPS/x86/X64IRRegCache.cpp b/Core/MIPS/x86/X64IRRegCache.cpp
index ee176546c8..b8b2dd522f 100644
--- a/Core/MIPS/x86/X64IRRegCache.cpp
+++ b/Core/MIPS/x86/X64IRRegCache.cpp
@@ -453,6 +453,189 @@ void X64IRRegCache::StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
 	}
 }
 
+bool X64IRRegCache::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
+	bool allowed = !mr[nr[nreg].mipsReg].isStatic;
+	// There's currently no support for non-XMMs here.
+	allowed = allowed && type == MIPSLoc::FREG;
+
+	if (dest == -1)
+		dest = nreg;
+
+	if (allowed && (flags == MIPSMap::INIT || flags == MIPSMap::DIRTY)) {
+		// Alright, changing lane count (possibly including lane position).
+		IRReg oldfirst = nr[nreg].mipsReg;
+		int oldlanes = 0;
+		while (mr[oldfirst + oldlanes].nReg == nreg)
+			oldlanes++;
+		_assert_msg_(oldlanes != 0, "TransferNativeReg encountered nreg mismatch");
+		_assert_msg_(oldlanes != lanes, "TransferNativeReg transfer to same lanecount, misaligned?");
+
+		if (lanes == 1) {
+			// Okay, start by storing if dirty.
+			if (nr[nreg].isDirty) {
+				StoreNativeReg(nreg, oldfirst, oldlanes);
+				nr[nreg].isDirty = false;
+			}
+			// Next, shuffle the desired element into first place.
+			u8 shuf = VFPU_SWIZZLE(mr[first].lane, mr[first].lane, mr[first].lane, mr[first].lane);
+			if (mr[first].lane > 0 && cpu_info.bAVX && dest != nreg) {
+				emit_->VSHUFPS(128, FromNativeReg(dest), FromNativeReg(nreg), ::R(FromNativeReg(nreg)), shuf);
+			} else if (mr[first].lane <= 0 && dest != nreg) {
+				emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
+			} else if (mr[first].lane > 0) {
+				if (dest != nreg)
+					emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
+				emit_->SHUFPS(FromNativeReg(dest), ::R(FromNativeReg(dest)), shuf);
+			}
+
+			// TODO: Consider moving the others to free regs if available? Likely will be wanted later.
+
+			// Now update accounting.
+			for (int i = 0; i < oldlanes; ++i) {
+				auto &mreg = mr[oldfirst + i];
+				if (oldfirst + i == first) {
+					mreg.lane = 0;
+					mreg.nReg = dest;
+				} else {
+					// No longer in a register.
+					mreg.nReg = -1;
+					mreg.lane = -1;
+					mreg.loc = MIPSLoc::MEM;
+				}
+			}
+
+			if (dest != nreg) {
+				nr[dest].isDirty = nr[nreg].isDirty;
+				nr[nreg].mipsReg = -1;
+				nr[nreg].isDirty = false;
+			}
+			nr[dest].mipsReg = first;
+
+			return true;
+		}
+
+		if ((lanes == 4 || lanes == 2) && oldlanes == 1) {
+			X64Reg cur[4]{};
+			int numInRegs = 0;
+			int numDirty = 0;
+			bool unavail = false;
+			for (int i = 0; i < lanes; ++i) {
+				if (mr[first + i].lane != -1 || (i != 0 && mr[first + i].spillLockIRIndex >= irIndex_)) {
+					unavail = true;
+					break;
+				}
+
+				if (mr[first + i].nReg == -1) {
+					cur[i] = INVALID_REG;
+				} else {
+					cur[i] = FromNativeReg(mr[first + i].nReg);
+					numInRegs++;
+					if (nr[cur[i]].isDirty)
+						numDirty++;
+				}
+			}
+
+			if (numInRegs == 0)
+				unavail = true;
+
+			bool handled = false;
+			if (!unavail) {
+				// If everything's currently in a reg, move it into this reg.
+				if (lanes == 4) {
+					if (cur[0] == INVALID_REG) {
+						cur[0] = FromNativeReg(dest);
+						emit_->MOVSS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 0)));
+						numInRegs++;
+					}
+
+					// A lot of other methods are possible, but seem to make things slower in practice.
+					if (numInRegs == 4) {
+						// y = yw##, x = xz##, x = xyzw.
+						emit_->UNPCKLPS(cur[1], ::R(cur[3]));
+						emit_->UNPCKLPS(cur[0], ::R(cur[2]));
+						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
+						handled = true;
+					} else if (numInRegs == 2 && cur[1] != INVALID_REG) {
+						// x = xy##, then load zw.
+						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
+						emit_->MOVHPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)));
+						handled = true;
+					} else if (cpu_info.bSSE4_1 && cur[1] != INVALID_REG && cur[2] != INVALID_REG) {
+						// x = xz##, z = w###, y = yw##, x = xyzw.
+						emit_->UNPCKLPS(cur[0], ::R(cur[2]));
+						emit_->MOVSS(cur[2], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 3)));
+						emit_->UNPCKLPS(cur[1], ::R(cur[2]));
+						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
+						handled = true;
+					} else if (cpu_info.bSSE4_1 && numDirty != 0 && cur[1] != INVALID_REG && cur[3] != INVALID_REG) {
+						// y = yw##, load z into x[1], x = xyzw.
+						emit_->UNPCKLPS(cur[1], ::R(cur[3]));
+						emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)), 1);
+						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
+						handled = true;
+					} else if (cpu_info.bSSE4_1 && numDirty != 0 && cur[2] != INVALID_REG && cur[3] != INVALID_REG) {
+						// Load y into x[1], z = zw##, x = xyzw.
+						emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)), 1);
+						emit_->UNPCKLPS(cur[2], ::R(cur[3]));
+						emit_->MOVLHPS(cur[0], cur[2]);
+						handled = true;
+					} else if (cpu_info.bSSE4_1) {
+						// TODO: This might be worse than flushing, depending?
+						for (int i = 1; i < 4; ++i) {
+							if (cur[i] == INVALID_REG)
+								emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + i)), i);
+							else
+								emit_->INSERTPS(cur[0], ::R(cur[i]), i, 0);
+						}
+						handled = true;
+					}
+				} else if (lanes == 2) {
+					if (cur[0] != INVALID_REG && cur[1] != INVALID_REG) {
+						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
+						handled = true;
+					} else if (cur[0] != INVALID_REG && cpu_info.bSSE4_1) {
+						emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)), 1);
+						handled = true;
+					}
+				}
+			}
+
+			if (handled) {
+				mr[first].lane = 0;
+				for (int i = 0; i < lanes; ++i) {
+					if (mr[first + i].nReg != -1) {
+						// If this was dirty, the combined reg is now dirty.
+						if (nr[mr[first + i].nReg].isDirty)
+							nr[dest].isDirty = true;
+
+						// Throw away the other register we're no longer using.
+						if (i != 0)
+							DiscardNativeReg(mr[first + i].nReg);
+					}
+
+					// And set it as using the new one.
+					mr[first + i].lane = i;
+					mr[first + i].loc = type;
+					mr[first + i].nReg = dest;
+				}
+
+				if (cur[0] != FromNativeReg(dest))
+					emit_->MOVAPS(FromNativeReg(dest), ::R(cur[0]));
+
+				if (dest != nreg) {
+					nr[dest].mipsReg = first;
+					nr[nreg].mipsReg = -1;
+					nr[nreg].isDirty = false;
+				}
+
+				return true;
+			}
+		}
+	}
+
+	return IRNativeRegCacheBase::TransferNativeReg(nreg, dest, type, first, lanes, flags);
+}
+
 void X64IRRegCache::SetNativeRegValue(IRNativeReg nreg, uint32_t imm) {
 	X64Reg r = FromNativeReg(nreg);
 	_dbg_assert_(nreg >= 0 && nreg < NUM_X_REGS);
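The numInRegs == 4 case above is the standard unpack tree: two UNPCKLPS ops pair the scalars, a third merges the pairs. A standalone sketch with SSE intrinsics (illustrative, not part of the patch) of that exact sequence:

```cpp
#include <xmmintrin.h>  // SSE
#include <cstdio>

int main() {
	// Four scalars, each in lane 0 of its own register, as in the
	// numInRegs == 4 case (# marks lanes whose contents don't matter).
	__m128 x = _mm_set_ss(1.0f), y = _mm_set_ss(2.0f);
	__m128 z = _mm_set_ss(3.0f), w = _mm_set_ss(4.0f);
	y = _mm_unpacklo_ps(y, w);  // y = {y, w, #, #}
	x = _mm_unpacklo_ps(x, z);  // x = {x, z, #, #}
	x = _mm_unpacklo_ps(x, y);  // x = {x, y, z, w}
	float out[4];
	_mm_storeu_ps(out, x);
	printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
	return 0;
}
```

The other branches are the same idea with one or two elements filled from memory via MOVSS/MOVHPS/INSERTPS instead of a register interleave.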
diff --git a/Core/MIPS/x86/X64IRRegCache.h b/Core/MIPS/x86/X64IRRegCache.h
index f33e4e8d89..fd2a720bf8 100644
--- a/Core/MIPS/x86/X64IRRegCache.h
+++ b/Core/MIPS/x86/X64IRRegCache.h
@@ -117,6 +117,7 @@ protected:
 	void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
 	void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;
 	void StoreRegValue(IRReg mreg, uint32_t imm) override;
+	bool TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) override;
 
 private:
 	IRNativeReg GPRToNativeReg(Gen::X64Reg r) {
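For the narrowing direction, the lanes == 1 path in X64IRRegCache.cpp moves the wanted element into lane 0 with a single broadcast SHUFPS (or a non-destructive VSHUFPS when AVX is available and the destination differs, which saves the MOVAPS). A standalone sketch of the equivalent shuffle (illustrative, not part of the patch):

```cpp
#include <xmmintrin.h>  // SSE
#include <cstdio>

int main() {
	__m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes {1, 2, 3, 4}
	// Same effect as SHUFPS(dest, dest, shuf) with the broadcast swizzle
	// that VFPU_SWIZZLE(2, 2, 2, 2) builds in the lanes == 1 path.
	v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
	float out[4];
	_mm_storeu_ps(out, v);
	printf("%g\n", out[0]);  // 3: the old lane 2 is now lane 0
	return 0;
}
```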