diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp
index c2a5ba8c4d..814fc7e0d6 100644
--- a/Common/x64Emitter.cpp
+++ b/Common/x64Emitter.cpp
@@ -1697,7 +1697,6 @@ void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, ar
 void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
 
-// THESE TWO ARE UNTESTED.
 void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
 void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
 
@@ -1892,6 +1891,9 @@ void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest
 void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
 void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}
 
+void XEmitter::INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteSSE41Op(0x66, 0x3A21, dest, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
+void XEmitter::EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg) { WriteSSE41Op(0x66, 0x3A17, arg, dest, 1); Write8(subreg); }
+
 void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
 void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}
 void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);}
@@ -2084,7 +2086,7 @@ void XEmitter::VCVTTPD2DQ(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits,
 void XEmitter::VCVTTSS2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF3, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
 void XEmitter::VCVTTSD2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF2, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
 void XEmitter::VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A17, regOp1, arg, 1); Write8(subreg); }
-void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8(subreg); }
+void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
 void XEmitter::VLDDQU(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0xF2, sseLDDQU, regOp1, arg); }
 void XEmitter::VMOVAPS(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x00, sseMOVAPfromRM, regOp1, arg); }
 void XEmitter::VMOVAPD(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x66, sseMOVAPfromRM, regOp1, arg); }
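Reviewer note on the encoding above: the imm8 that `Write8()` emits for INSERTPS packs three fields. Bits 7:6 select the source lane (only meaningful for an XMM source; a memory source loads one 32-bit float and ignores them, which is why the emitter defaults `srcsubreg` to 0), bits 5:4 select the destination lane, and bits 3:0 zero destination lanes. A minimal standalone sketch with SSE4.1 intrinsics (illustrative, not part of the patch) exercising the same layout:

```cpp
#include <smmintrin.h>  // SSE4.1
#include <cstdio>

int main() {
	__m128 dst = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes {1, 2, 3, 4}
	__m128 src = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);  // lanes {5, 6, 7, 8}
	// srcsubreg = 2, dstsubreg = 1, zmask = 0b1000: copy src lane 2 (7.0)
	// into dst lane 1, then zero dst lane 3 -- same layout as Write8() above.
	__m128 r = _mm_insert_ps(dst, src, (2 << 6) | (1 << 4) | 0b1000);
	float out[4];
	_mm_storeu_ps(out, r);
	printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 7 3 0
	return 0;
}
```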
diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h
index 16f30a35b0..832ed767cb 100644
--- a/Common/x64Emitter.h
+++ b/Common/x64Emitter.h
@@ -684,12 +684,14 @@ public:
 	// SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
 	void DPPD(X64Reg dest, OpArg src, u8 arg);
-
-	// These are probably useful for VFPU emulation.
-	void INSERTPS(X64Reg dest, OpArg src, u8 arg);
-	void EXTRACTPS(OpArg dest, X64Reg src, u8 arg);
 #endif
 
+	// SSE4: Insert and extract for floats.
+	// Note: the source can be memory or any lane of an XMM; zmask bits zero dest lanes.
+	void INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
+	// Extract to memory or a GPR.
+	void EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg);
+
 	// SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
 	void HADDPS(X64Reg dest, OpArg src);
@@ -1040,7 +1042,7 @@ public:
 	// Can only extract from the low 128 bits.
 	void VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg);
 	// Can only insert into the low 128 bits, zeros upper bits. Inserts from XMM.
-	void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg);
+	void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
 	void VLDDQU(int bits, X64Reg regOp1, OpArg arg);
 	void VMOVAPS(int bits, X64Reg regOp1, OpArg arg);
 	void VMOVAPD(int bits, X64Reg regOp1, OpArg arg);
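For reference, a sketch of how call sites read with the split immediates. The helper below is hypothetical (the register choices and the [RBX + 16] displacement are made up for illustration); `R()`, `MDisp()`, and the register enums come from the emitter headers:

```cpp
#include "Common/x64Emitter.h"

using namespace Gen;

// Hypothetical example, not part of this patch.
static void EmitLaneExample(XEmitter &emit) {
	// Copy lane 3 of XMM1 into lane 0 of XMM0; other lanes are unchanged.
	emit.INSERTPS(XMM0, R(XMM1), 0, 3);
	// Load a float from [RBX + 16] into lane 2, zeroing lane 3 via the zmask.
	emit.INSERTPS(XMM0, MDisp(RBX, 16), 2, 0, 0b1000);
	// Store lane 1 of XMM0 into EAX as raw IEEE-754 bits.
	emit.EXTRACTPS(R(EAX), XMM0, 1);
}
```

Callers of the old signature passed the whole imm8 by hand (e.g. 0x10 for destination lane 1); with the defaulted `srcsubreg` and `zmask`, that call is now simply `INSERTPS(dst, src, 1)`.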
diff --git a/Core/MIPS/x86/X64IRRegCache.cpp b/Core/MIPS/x86/X64IRRegCache.cpp
index ee176546c8..b8b2dd522f 100644
--- a/Core/MIPS/x86/X64IRRegCache.cpp
+++ b/Core/MIPS/x86/X64IRRegCache.cpp
@@ -453,6 +453,189 @@ void X64IRRegCache::StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
 	}
 }
 
+bool X64IRRegCache::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
+	bool allowed = !mr[nr[nreg].mipsReg].isStatic;
+	// There's currently no support for non-XMMs here.
+	allowed = allowed && type == MIPSLoc::FREG;
+
+	if (dest == -1)
+		dest = nreg;
+
+	if (allowed && (flags == MIPSMap::INIT || flags == MIPSMap::DIRTY)) {
+		// Alright, changing lane count (possibly including lane position).
+		IRReg oldfirst = nr[nreg].mipsReg;
+		int oldlanes = 0;
+		while (mr[oldfirst + oldlanes].nReg == nreg)
+			oldlanes++;
+		_assert_msg_(oldlanes != 0, "TransferNativeReg encountered nreg mismatch");
+		_assert_msg_(oldlanes != lanes, "TransferNativeReg transfer to same lanecount, misaligned?");
+
+		if (lanes == 1) {
+			// Okay, start by storing if dirty.
+			if (nr[nreg].isDirty) {
+				StoreNativeReg(nreg, oldfirst, oldlanes);
+				nr[nreg].isDirty = false;
+			}
+			// Next, shuffle the desired element into first place.
+			u8 shuf = VFPU_SWIZZLE(mr[first].lane, mr[first].lane, mr[first].lane, mr[first].lane);
+			if (mr[first].lane > 0 && cpu_info.bAVX && dest != nreg) {
+				emit_->VSHUFPS(128, FromNativeReg(dest), FromNativeReg(nreg), ::R(FromNativeReg(nreg)), shuf);
+			} else if (mr[first].lane <= 0 && dest != nreg) {
+				emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
+			} else if (mr[first].lane > 0) {
+				if (dest != nreg)
+					emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
+				emit_->SHUFPS(FromNativeReg(dest), ::R(FromNativeReg(dest)), shuf);
+			}
+
+			// TODO: Consider moving the others to free regs if available? Likely will be wanted later.
+
+			// Now update accounting.
+			for (int i = 0; i < oldlanes; ++i) {
+				auto &mreg = mr[oldfirst + i];
+				if (oldfirst + i == first) {
+					mreg.lane = 0;
+					mreg.nReg = dest;
+				} else {
+					// No longer in a register.
+					mreg.nReg = -1;
+					mreg.lane = -1;
+					mreg.loc = MIPSLoc::MEM;
+				}
+			}
+
+			if (dest != nreg) {
+				nr[dest].isDirty = nr[nreg].isDirty;
+				nr[nreg].mipsReg = -1;
+				nr[nreg].isDirty = false;
+			}
+			nr[dest].mipsReg = first;
+
+			return true;
+		}
+
+		if ((lanes == 4 || lanes == 2) && oldlanes == 1) {
+			X64Reg cur[4]{};
+			int numInRegs = 0;
+			int numDirty = 0;
+			bool unavail = false;
+			for (int i = 0; i < lanes; ++i) {
+				if (mr[first + i].lane != -1 || (i != 0 && mr[first + i].spillLockIRIndex >= irIndex_)) {
+					unavail = true;
+					break;
+				}
+
+				if (mr[first + i].nReg == -1) {
+					cur[i] = INVALID_REG;
+				} else {
+					cur[i] = FromNativeReg(mr[first + i].nReg);
+					numInRegs++;
+					if (nr[cur[i]].isDirty)
+						numDirty++;
+				}
+			}
+
+			if (numInRegs == 0)
+				unavail = true;
+
+			bool handled = false;
+			if (!unavail) {
+				// If everything's currently in a reg, move it into this reg.
+				if (lanes == 4) {
+					if (cur[0] == INVALID_REG) {
+						cur[0] = FromNativeReg(dest);
+						emit_->MOVSS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 0)));
+						numInRegs++;
+					}
+
+					// A lot of other methods are possible, but seem to make things slower in practice.
+					if (numInRegs == 4) {
+						// y = yw##, x = xz##, x = xyzw.
+						emit_->UNPCKLPS(cur[1], ::R(cur[3]));
+						emit_->UNPCKLPS(cur[0], ::R(cur[2]));
+						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
+						handled = true;
+					} else if (numInRegs == 2 && cur[1] != INVALID_REG) {
+						// x = xy##, then load zw.
+						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
+						emit_->MOVHPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)));
+						handled = true;
+					} else if (cpu_info.bSSE4_1 && cur[1] != INVALID_REG && cur[2] != INVALID_REG) {
+						// x = xz##, z = w###, y = yw##, x = xyzw.
+						emit_->UNPCKLPS(cur[0], ::R(cur[2]));
+						emit_->MOVSS(cur[2], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 3)));
+						emit_->UNPCKLPS(cur[1], ::R(cur[2]));
+						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
+						handled = true;
+					} else if (cpu_info.bSSE4_1 && numDirty != 0 && cur[1] != INVALID_REG && cur[3] != INVALID_REG) {
+						// y = yw##, load z into x[1], x = xyzw.
+						emit_->UNPCKLPS(cur[1], ::R(cur[3]));
+						emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)), 1);
+						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
+						handled = true;
+					} else if (cpu_info.bSSE4_1 && numDirty != 0 && cur[2] != INVALID_REG && cur[3] != INVALID_REG) {
+						// Load y into x[1], z = zw##, x = xyzw.
+						emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)), 1);
+						emit_->UNPCKLPS(cur[2], ::R(cur[3]));
+						emit_->MOVLHPS(cur[0], cur[2]);
+						handled = true;
+					} else if (cpu_info.bSSE4_1) {
+						// TODO: This might be worse than flushing, depending?
+						for (int i = 1; i < 4; ++i) {
+							if (cur[i] == INVALID_REG)
+								emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + i)), i);
+							else
+								emit_->INSERTPS(cur[0], ::R(cur[i]), i, 0);
+						}
+						handled = true;
+					}
+				} else if (lanes == 2) {
+					if (cur[0] != INVALID_REG && cur[1] != INVALID_REG) {
+						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
+						handled = true;
+					} else if (cur[0] != INVALID_REG && cpu_info.bSSE4_1) {
+						emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)), 1);
+						handled = true;
+					}
+				}
+			}
+
+			if (handled) {
+				mr[first].lane = 0;
+				for (int i = 0; i < lanes; ++i) {
+					if (mr[first + i].nReg != -1) {
+						// If this was dirty, the combined reg is now dirty.
+						if (nr[mr[first + i].nReg].isDirty)
+							nr[dest].isDirty = true;
+
+						// Throw away the other register we're no longer using.
+						if (i != 0)
+							DiscardNativeReg(mr[first + i].nReg);
+					}
+
+					// And set it as using the new one.
+					mr[first + i].lane = i;
+					mr[first + i].loc = type;
+					mr[first + i].nReg = dest;
+				}
+
+				if (cur[0] != FromNativeReg(dest))
+					emit_->MOVAPS(FromNativeReg(dest), ::R(cur[0]));
+
+				if (dest != nreg) {
+					nr[dest].mipsReg = first;
+					nr[nreg].mipsReg = -1;
+					nr[nreg].isDirty = false;
+				}
+
+				return true;
+			}
+		}
+	}
+
+	return IRNativeRegCacheBase::TransferNativeReg(nreg, dest, type, first, lanes, flags);
+}
+
 void X64IRRegCache::SetNativeRegValue(IRNativeReg nreg, uint32_t imm) {
 	X64Reg r = FromNativeReg(nreg);
 	_dbg_assert_(nreg >= 0 && nreg < NUM_X_REGS);
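The numInRegs == 4 case above is the standard unpack tree: two UNPCKLPS ops pair the scalars, a third merges the pairs. A standalone sketch with SSE intrinsics (illustrative, not part of the patch) of that exact sequence:

```cpp
#include <xmmintrin.h>  // SSE
#include <cstdio>

int main() {
	// Four scalars, each in lane 0 of its own register, as in the
	// numInRegs == 4 case (# marks lanes whose contents don't matter).
	__m128 x = _mm_set_ss(1.0f), y = _mm_set_ss(2.0f);
	__m128 z = _mm_set_ss(3.0f), w = _mm_set_ss(4.0f);
	y = _mm_unpacklo_ps(y, w);  // y = {y, w, #, #}
	x = _mm_unpacklo_ps(x, z);  // x = {x, z, #, #}
	x = _mm_unpacklo_ps(x, y);  // x = {x, y, z, w}
	float out[4];
	_mm_storeu_ps(out, x);
	printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
	return 0;
}
```

The other branches are the same idea with one or two elements filled from memory via MOVSS/MOVHPS/INSERTPS instead of a register interleave.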
diff --git a/Core/MIPS/x86/X64IRRegCache.h b/Core/MIPS/x86/X64IRRegCache.h
index f33e4e8d89..fd2a720bf8 100644
--- a/Core/MIPS/x86/X64IRRegCache.h
+++ b/Core/MIPS/x86/X64IRRegCache.h
@@ -117,6 +117,7 @@ protected:
 	void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
 	void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;
 	void StoreRegValue(IRReg mreg, uint32_t imm) override;
+	bool TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) override;
 
 private:
 	IRNativeReg GPRToNativeReg(Gen::X64Reg r) {
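For the narrowing direction, the lanes == 1 path in X64IRRegCache.cpp moves the wanted element into lane 0 with a single broadcast SHUFPS (or a non-destructive VSHUFPS when AVX is available and the destination differs, which saves the MOVAPS). A standalone sketch of the equivalent shuffle (illustrative, not part of the patch):

```cpp
#include <xmmintrin.h>  // SSE
#include <cstdio>

int main() {
	__m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes {1, 2, 3, 4}
	// Same effect as SHUFPS(dest, dest, shuf) with the broadcast swizzle
	// that VFPU_SWIZZLE(2, 2, 2, 2) builds in the lanes == 1 path.
	v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
	float out[4];
	_mm_storeu_ps(out, v);
	printf("%g\n", out[0]);  // 3: the old lane 2 is now lane 0
	return 0;
}
```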