Common: Switch integer shuffle/insert/extract instructions to auto SSE/AVX

This commit is contained in:
TellowKrinkle
2025-06-02 01:15:42 -05:00
committed by TellowKrinkle
parent c9ddab444a
commit 0c8c798051
3 changed files with 93 additions and 87 deletions

View File

@@ -31,17 +31,17 @@ namespace x86Emitter
{
// Copies doublewords from src and inserts them into dest at dword locations selected
// with the order operand (8 bit immediate).
const xImplSimd_DestRegImmSSE D;
const xImplSimd_2ArgImm D;
// Copies words from the low quadword of src and inserts them into the low quadword
// of dest at word locations selected with the order operand (8 bit immediate).
// The high quadword of src is copied to the high quadword of dest.
const xImplSimd_DestRegImmSSE LW;
const xImplSimd_2ArgImm LW;
// Copies words from the high quadword of src and inserts them into the high quadword
// of dest at word locations selected with the order operand (8 bit immediate).
// The low quadword of src is copied to the low quadword of dest.
const xImplSimd_DestRegImmSSE HW;
const xImplSimd_2ArgImm HW;
// [sSSE-3] Performs in-place shuffles of bytes in dest according to the shuffle
// control mask in src. If the most significant bit (bit[7]) of each byte of the
@@ -50,42 +50,7 @@ namespace x86Emitter
// byte in dest. The value of each index is the least significant 4 bits (128-bit
// operation) or 3 bits (64-bit operation) of the shuffle control byte.
//
const xImplSimd_DestRegEither B;
// below is my test bed for a new system, free of subclasses. Was supposed to improve intellisense
// but it doesn't (makes it worse). Will try again in MSVC 2010. --air
#if 0
// Copies words from src and inserts them into dest at word locations selected with
// the order operand (8 bit immediate).
// Copies doublewords from src and inserts them into dest at dword locations selected
// with the order operand (8 bit immediate).
void D( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm ) const { xOpWrite0F( 0x66, 0x70, to, from, imm ); }
void D( const xRegisterSSE& to, const xIndirectVoid& from, u8 imm ) const { xOpWrite0F( 0x66, 0x70, to, from, imm ); }
// Copies words from the low quadword of src and inserts them into the low quadword
// of dest at word locations selected with the order operand (8 bit immediate).
// The high quadword of src is copied to the high quadword of dest.
void LW( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm ) const { xOpWrite0F( 0xf2, 0x70, to, from, imm ); }
void LW( const xRegisterSSE& to, const xIndirectVoid& from, u8 imm ) const { xOpWrite0F( 0xf2, 0x70, to, from, imm ); }
// Copies words from the high quadword of src and inserts them into the high quadword
// of dest at word locations selected with the order operand (8 bit immediate).
// The low quadword of src is copied to the low quadword of dest.
void HW( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm ) const { xOpWrite0F( 0xf3, 0x70, to, from, imm ); }
void HW( const xRegisterSSE& to, const xIndirectVoid& from, u8 imm ) const { xOpWrite0F( 0xf3, 0x70, to, from, imm ); }
// [sSSE-3] Performs in-place shuffles of bytes in dest according to the shuffle
// control mask in src. If the most significant bit (bit[7]) of each byte of the
// shuffle control mask is set, then constant zero is written in the result byte.
// Each byte in the shuffle control mask forms an index to permute the corresponding
// byte in dest. The value of each index is the least significant 4 bits (128-bit
// operation) or 3 bits (64-bit operation) of the shuffle control byte.
//
void B( const xRegisterSSE& to, const xRegisterSSE& from ) const { OpWriteSSE( 0x66, 0x0038 ); }
void B( const xRegisterSSE& to, const xIndirectVoid& from ) const { OpWriteSSE( 0x66, 0x0038 ); }
#endif
const xImplSimd_3Arg B;
};
// --------------------------------------------------------------------------------------
@@ -183,17 +148,25 @@ namespace x86Emitter
//
struct xImplSimd_PInsert
{
void B(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const;
void B(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const;
void B(const xRegisterSSE& dst, const xRegister32& src, u8 imm8) const { B(dst, dst, src, imm8); }
void B(const xRegisterSSE& dst, const xIndirect8& src, u8 imm8) const { B(dst, dst, src, imm8); }
void B(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const;
void B(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect8& src2, u8 imm8) const;
void W(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const;
void W(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const;
void W(const xRegisterSSE& dst, const xRegister32& src, u8 imm8) const { W(dst, dst, src, imm8); }
void W(const xRegisterSSE& dst, const xIndirect16& src, u8 imm8) const { W(dst, dst, src, imm8); }
void W(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const;
void W(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect16& src2, u8 imm8) const;
void D(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const;
void D(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const;
void D(const xRegisterSSE& dst, const xRegister32& src, u8 imm8) const { D(dst, dst, src, imm8); }
void D(const xRegisterSSE& dst, const xIndirect32& src, u8 imm8) const { D(dst, dst, src, imm8); }
void D(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const;
void D(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect32& src2, u8 imm8) const;
void Q(const xRegisterSSE& to, const xRegister64& from, u8 imm8) const;
void Q(const xRegisterSSE& to, const xIndirect64& from, u8 imm8) const;
void Q(const xRegisterSSE& dst, const xRegister64& src, u8 imm8) const { Q(dst, dst, src, imm8); }
void Q(const xRegisterSSE& dst, const xIndirect64& src, u8 imm8) const { Q(dst, dst, src, imm8); }
void Q(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister64& src2, u8 imm8) const;
void Q(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect64& src2, u8 imm8) const;
};
//////////////////////////////////////////////////////////////////////////////////////////
@@ -206,8 +179,8 @@ namespace x86Emitter
// [SSE-4.1] Copies the byte element specified by imm8 from src to dest. The upper bits
// of dest are zero-extended (cleared). This can be used to extract any single packed
// byte value from src into an x86 32 bit register.
void B(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const;
void B(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const;
void B(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const;
void B(const xIndirect8& dst, const xRegisterSSE& src, u8 imm8) const;
// Copies the word element specified by imm8 from src to dest. The upper bits
// of dest are zero-extended (cleared). This can be used to extract any single packed
@@ -215,16 +188,17 @@ namespace x86Emitter
//
// [SSE-4.1] Note: Indirect memory forms of this instruction are an SSE-4.1 extension!
//
void W(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const;
void W(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const;
void W(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const;
void W(const xIndirect16& dst, const xRegisterSSE& src, u8 imm8) const;
// [SSE-4.1] Copies the dword element specified by imm8 from src to dest. This can be
// used to extract any single packed dword value from src into an x86 32 bit register.
void D(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const;
void D(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const;
void D(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const;
void D(const xIndirect32& dst, const xRegisterSSE& src, u8 imm8) const;
// Insert a qword integer value from r/m64 into the xmm1 at the destination element specified by imm8.
void Q(const xRegister64& to, const xRegisterSSE& from, u8 imm8) const;
void Q(const xIndirect64& dest, const xRegisterSSE& from, u8 imm8) const;
// [SSE-4.1] Copies the dword element specified by imm8 from src to dest. This can be
// used to extract any single packed dword value from src into an x86 64 bit register.
void Q(const xRegister64& dst, const xRegisterSSE& src, u8 imm8) const;
void Q(const xIndirect64& dst, const xRegisterSSE& src, u8 imm8) const;
};
} // namespace x86Emitter

View File

@@ -562,39 +562,38 @@ namespace x86Emitter
EmitSIMD(SIMDInstructionInfo(0xc6).d().p66(), dst, src1, src2, selector);
}
void xImplSimd_PInsert::B(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const { xOpWrite0F(0x66, 0x203a, to, from, imm8); }
void xImplSimd_PInsert::B(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const { xOpWrite0F(0x66, 0x203a, to, from, imm8); }
void xImplSimd_PInsert::B(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x20).i().p66().m0f3a(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::B(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect8& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x20).i().p66().m0f3a(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::W(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const { xOpWrite0F(0x66, 0xc4, to, from, imm8); }
void xImplSimd_PInsert::W(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const { xOpWrite0F(0x66, 0xc4, to, from, imm8); }
void xImplSimd_PInsert::W(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0xc4).i().p66(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::W(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect16& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0xc4).i().p66(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::D(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const { xOpWrite0F(0x66, 0x223a, to, from, imm8); }
void xImplSimd_PInsert::D(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const { xOpWrite0F(0x66, 0x223a, to, from, imm8); }
void xImplSimd_PInsert::D(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::D(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::Q(const xRegisterSSE& to, const xRegister64& from, u8 imm8) const { xOpWrite0F(0x66, 0x223a, to, from, imm8); }
void xImplSimd_PInsert::Q(const xRegisterSSE& to, const xIndirect64& from, u8 imm8) const { xOpWrite0F(0x66, 0x223a, to, from, imm8); }
void xImplSimd_PInsert::Q(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister64& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::Q(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect64& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
void SimdImpl_PExtract::B(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x143a, from, to, imm8); }
void SimdImpl_PExtract::B(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x143a, from, dest, imm8); }
void SimdImpl_PExtract::B(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x14).mov().p66().m0f3a(), src, src, dst, imm8); }
void SimdImpl_PExtract::B(const xIndirect8& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x14).mov().p66().m0f3a(), src, src, dst, imm8); }
void SimdImpl_PExtract::W(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0xc5, from, to, imm8); }
void SimdImpl_PExtract::W(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x153a, from, dest, imm8); }
void SimdImpl_PExtract::W(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0xc5).mov().p66(), dst, dst, src, imm8); }
void SimdImpl_PExtract::W(const xIndirect16& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x15).mov().p66().m0f3a(), src, src, dst, imm8); }
void SimdImpl_PExtract::D(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x163a, from, to, imm8); }
void SimdImpl_PExtract::D(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x163a, from, dest, imm8); }
void SimdImpl_PExtract::D(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
void SimdImpl_PExtract::D(const xIndirect32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
void SimdImpl_PExtract::Q(const xRegister64& to, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x163a, from, to, imm8); }
void SimdImpl_PExtract::Q(const xIndirect64& dest, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x163a, from, dest, imm8); }
void SimdImpl_PExtract::Q(const xRegister64& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
void SimdImpl_PExtract::Q(const xIndirect64& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
const xImplSimd_Shuffle xSHUF = {};
const xImplSimd_PShuffle xPSHUF =
{
{0x66, 0x70}, // D
{0xf2, 0x70}, // LW
{0xf3, 0x70}, // HW
{0x66, 0x0038}, // B
{
{SIMDInstructionInfo(0x70).i().p66()}, // D
{SIMDInstructionInfo(0x70).i().pf2()}, // LW
{SIMDInstructionInfo(0x70).i().pf3()}, // HW
{SIMDInstructionInfo(0x00).i().p66().m0f38()}, // B
};
const SimdImpl_PUnpack xPUNPCK =