Common: Switch integer shuffle/insert/extract instructions to auto SSE/AVX

This commit is contained in:
TellowKrinkle
2025-06-02 01:15:42 -05:00
committed by TellowKrinkle
parent c9ddab444a
commit 0c8c798051
3 changed files with 93 additions and 87 deletions

View File

@@ -31,17 +31,17 @@ namespace x86Emitter
{
// Copies doublewords from src and inserts them into dest at dword locations selected
// with the order operand (8 bit immediate).
const xImplSimd_DestRegImmSSE D;
const xImplSimd_2ArgImm D;
// Copies words from the low quadword of src and inserts them into the low quadword
// of dest at word locations selected with the order operand (8 bit immediate).
// The high quadword of src is copied to the high quadword of dest.
const xImplSimd_DestRegImmSSE LW;
const xImplSimd_2ArgImm LW;
// Copies words from the high quadword of src and inserts them into the high quadword
// of dest at word locations selected with the order operand (8 bit immediate).
// The low quadword of src is copied to the low quadword of dest.
const xImplSimd_DestRegImmSSE HW;
const xImplSimd_2ArgImm HW;
// [sSSE-3] Performs in-place shuffles of bytes in dest according to the shuffle
// control mask in src. If the most significant bit (bit[7]) of each byte of the
@@ -50,42 +50,7 @@ namespace x86Emitter
// byte in dest. The value of each index is the least significant 4 bits (128-bit
// operation) or 3 bits (64-bit operation) of the shuffle control byte.
//
const xImplSimd_DestRegEither B;
// below is my test bed for a new system, free of subclasses. Was supposed to improve intellisense
// but it doesn't (makes it worse). Will try again in MSVC 2010. --air
#if 0
// Copies words from src and inserts them into dest at word locations selected with
// the order operand (8 bit immediate).
// Copies doublewords from src and inserts them into dest at dword locations selected
// with the order operand (8 bit immediate).
void D( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm ) const { xOpWrite0F( 0x66, 0x70, to, from, imm ); }
void D( const xRegisterSSE& to, const xIndirectVoid& from, u8 imm ) const { xOpWrite0F( 0x66, 0x70, to, from, imm ); }
// Copies words from the low quadword of src and inserts them into the low quadword
// of dest at word locations selected with the order operand (8 bit immediate).
// The high quadword of src is copied to the high quadword of dest.
void LW( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm ) const { xOpWrite0F( 0xf2, 0x70, to, from, imm ); }
void LW( const xRegisterSSE& to, const xIndirectVoid& from, u8 imm ) const { xOpWrite0F( 0xf2, 0x70, to, from, imm ); }
// Copies words from the high quadword of src and inserts them into the high quadword
// of dest at word locations selected with the order operand (8 bit immediate).
// The low quadword of src is copied to the low quadword of dest.
void HW( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm ) const { xOpWrite0F( 0xf3, 0x70, to, from, imm ); }
void HW( const xRegisterSSE& to, const xIndirectVoid& from, u8 imm ) const { xOpWrite0F( 0xf3, 0x70, to, from, imm ); }
// [sSSE-3] Performs in-place shuffles of bytes in dest according to the shuffle
// control mask in src. If the most significant bit (bit[7]) of each byte of the
// shuffle control mask is set, then constant zero is written in the result byte.
// Each byte in the shuffle control mask forms an index to permute the corresponding
// byte in dest. The value of each index is the least significant 4 bits (128-bit
// operation) or 3 bits (64-bit operation) of the shuffle control byte.
//
void B( const xRegisterSSE& to, const xRegisterSSE& from ) const { OpWriteSSE( 0x66, 0x0038 ); }
void B( const xRegisterSSE& to, const xIndirectVoid& from ) const { OpWriteSSE( 0x66, 0x0038 ); }
#endif
const xImplSimd_3Arg B;
};
// --------------------------------------------------------------------------------------
@@ -183,17 +148,25 @@ namespace x86Emitter
//
struct xImplSimd_PInsert
{
// Declares the packed-integer insert emitters, one overload set per element width:
// B, W, D and Q. In the dst/src overloads the memory operand type tracks that width
// (xIndirect8 / xIndirect16 / xIndirect32 / xIndirect64); the register source is an
// xRegister32 for B, W and D and an xRegister64 for Q.
//
// Each width offers two shapes:
//   (dst, src, imm8)        - defined inline; forwards to the four-argument overload
//                             with dst passed as both the destination and src1.
//   (dst, src1, src2, imm8) - declared here only; the definition is not in this struct.
//
// NOTE(review): this listing appears to be a rendered diff, so the `to`/`from`
// declarations look like the superseded lines and the `dst`/`src` declarations like
// their replacements -- confirm against the real header before relying on either set.

// --- B: byte element ---
void B(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const;
void B(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const;
void B(const xRegisterSSE& dst, const xRegister32& src, u8 imm8) const { B(dst, dst, src, imm8); }
void B(const xRegisterSSE& dst, const xIndirect8& src, u8 imm8) const { B(dst, dst, src, imm8); }
void B(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const;
void B(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect8& src2, u8 imm8) const;
// --- W: word element ---
void W(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const;
void W(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const;
void W(const xRegisterSSE& dst, const xRegister32& src, u8 imm8) const { W(dst, dst, src, imm8); }
void W(const xRegisterSSE& dst, const xIndirect16& src, u8 imm8) const { W(dst, dst, src, imm8); }
void W(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const;
void W(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect16& src2, u8 imm8) const;
// --- D: dword element ---
void D(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const;
void D(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const;
void D(const xRegisterSSE& dst, const xRegister32& src, u8 imm8) const { D(dst, dst, src, imm8); }
void D(const xRegisterSSE& dst, const xIndirect32& src, u8 imm8) const { D(dst, dst, src, imm8); }
void D(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const;
void D(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect32& src2, u8 imm8) const;
// --- Q: qword element (64-bit register / memory source) ---
void Q(const xRegisterSSE& to, const xRegister64& from, u8 imm8) const;
void Q(const xRegisterSSE& to, const xIndirect64& from, u8 imm8) const;
void Q(const xRegisterSSE& dst, const xRegister64& src, u8 imm8) const { Q(dst, dst, src, imm8); }
void Q(const xRegisterSSE& dst, const xIndirect64& src, u8 imm8) const { Q(dst, dst, src, imm8); }
void Q(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister64& src2, u8 imm8) const;
void Q(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect64& src2, u8 imm8) const;
};
//////////////////////////////////////////////////////////////////////////////////////////
@@ -206,8 +179,8 @@ namespace x86Emitter
// [SSE-4.1] Copies the byte element specified by imm8 from src to dest. The upper bits
// of dest are zero-extended (cleared). This can be used to extract any single packed
// byte value from src into an x86 32 bit register.
void B(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const;
void B(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const;
void B(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const;
void B(const xIndirect8& dst, const xRegisterSSE& src, u8 imm8) const;
// Copies the word element specified by imm8 from src to dest. The upper bits
// of dest are zero-extended (cleared). This can be used to extract any single packed
@@ -215,16 +188,17 @@ namespace x86Emitter
//
// [SSE-4.1] Note: Indirect memory forms of this instruction are an SSE-4.1 extension!
//
void W(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const;
void W(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const;
void W(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const;
void W(const xIndirect16& dst, const xRegisterSSE& src, u8 imm8) const;
// [SSE-4.1] Copies the dword element specified by imm8 from src to dest. This can be
// used to extract any single packed dword value from src into an x86 32 bit register.
void D(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const;
void D(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const;
void D(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const;
void D(const xIndirect32& dst, const xRegisterSSE& src, u8 imm8) const;
// Insert a qword integer value from r/m64 into the xmm1 at the destination element specified by imm8.
void Q(const xRegister64& to, const xRegisterSSE& from, u8 imm8) const;
void Q(const xIndirect64& dest, const xRegisterSSE& from, u8 imm8) const;
// [SSE-4.1] Copies the qword element specified by imm8 from src to dest. This can be
// used to extract any single packed qword value from src into an x86 64 bit register.
void Q(const xRegister64& dst, const xRegisterSSE& src, u8 imm8) const;
void Q(const xIndirect64& dst, const xRegisterSSE& src, u8 imm8) const;
};
} // namespace x86Emitter

View File

@@ -562,39 +562,38 @@ namespace x86Emitter
EmitSIMD(SIMDInstructionInfo(0xc6).d().p66(), dst, src1, src2, selector);
}
// Out-of-line definitions of the xImplSimd_PInsert emitters.
// NOTE(review): this listing appears to be a rendered diff -- for each width the
// xOpWrite0F pair looks like the superseded implementation and the EmitSIMD pair like
// its replacement; confirm against the real source file.
// The register and memory overloads of one width always use the same instruction
// description; only the type of the last source operand differs.

// B: opcode 0x20, 66 prefix, 0F 3A opcode map.
void xImplSimd_PInsert::B(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const { xOpWrite0F(0x66, 0x203a, to, from, imm8); }
void xImplSimd_PInsert::B(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const { xOpWrite0F(0x66, 0x203a, to, from, imm8); }
void xImplSimd_PInsert::B(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x20).i().p66().m0f3a(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::B(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect8& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x20).i().p66().m0f3a(), dst, src1, src2, imm8); }
// W: opcode 0xC4, 66 prefix; the only width here that does not select the 0F 3A map.
void xImplSimd_PInsert::W(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const { xOpWrite0F(0x66, 0xc4, to, from, imm8); }
void xImplSimd_PInsert::W(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const { xOpWrite0F(0x66, 0xc4, to, from, imm8); }
void xImplSimd_PInsert::W(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0xc4).i().p66(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::W(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect16& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0xc4).i().p66(), dst, src1, src2, imm8); }
// D: opcode 0x22, 66 prefix, 0F 3A opcode map, plus srcw().
void xImplSimd_PInsert::D(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const { xOpWrite0F(0x66, 0x223a, to, from, imm8); }
void xImplSimd_PInsert::D(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) const { xOpWrite0F(0x66, 0x223a, to, from, imm8); }
void xImplSimd_PInsert::D(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::D(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
// Q: identical instruction description to D; only the 64-bit source operand type
// differs. Presumably srcw() is what lets the operand width choose between the two
// encodings -- TODO confirm against SIMDInstructionInfo.
void xImplSimd_PInsert::Q(const xRegisterSSE& to, const xRegister64& from, u8 imm8) const { xOpWrite0F(0x66, 0x223a, to, from, imm8); }
void xImplSimd_PInsert::Q(const xRegisterSSE& to, const xIndirect64& from, u8 imm8) const { xOpWrite0F(0x66, 0x223a, to, from, imm8); }
void xImplSimd_PInsert::Q(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister64& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::Q(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect64& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
// Out-of-line definitions of the SimdImpl_PExtract emitters.
// NOTE(review): this listing appears to be a rendered diff -- for each width the
// xOpWrite0F pair looks like the superseded implementation and the EmitSIMD pair like
// its replacement; confirm against the real source file.
// Except for the W register form, the EmitSIMD calls pass the SSE source register in
// the first two operand slots and the extraction destination (general-purpose
// register or memory) in the third.

// B: opcode 0x14, 66 prefix, 0F 3A opcode map.
void SimdImpl_PExtract::B(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x143a, from, to, imm8); }
void SimdImpl_PExtract::B(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x143a, from, dest, imm8); }
void SimdImpl_PExtract::B(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x14).mov().p66().m0f3a(), src, src, dst, imm8); }
void SimdImpl_PExtract::B(const xIndirect8& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x14).mov().p66().m0f3a(), src, src, dst, imm8); }
// W: the register and memory forms use different opcodes. The register form is opcode
// 0xC5 (66 prefix, no 0F 3A map) and reverses the operand roles: the general-purpose
// destination goes in the first two slots and the SSE source goes last. The memory form
// is opcode 0x15 in the 0F 3A map and follows the same operand order as B/D/Q.
void SimdImpl_PExtract::W(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0xc5, from, to, imm8); }
void SimdImpl_PExtract::W(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x153a, from, dest, imm8); }
void SimdImpl_PExtract::W(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0xc5).mov().p66(), dst, dst, src, imm8); }
void SimdImpl_PExtract::W(const xIndirect16& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x15).mov().p66().m0f3a(), src, src, dst, imm8); }
// D: opcode 0x16, 66 prefix, 0F 3A opcode map, plus srcw().
void SimdImpl_PExtract::D(const xRegister32& to, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x163a, from, to, imm8); }
void SimdImpl_PExtract::D(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x163a, from, dest, imm8); }
void SimdImpl_PExtract::D(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
void SimdImpl_PExtract::D(const xIndirect32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
// Q: identical instruction description to D; only the 64-bit destination operand type
// differs. Presumably srcw() is what lets the operand width choose between the two
// encodings -- TODO confirm against SIMDInstructionInfo.
void SimdImpl_PExtract::Q(const xRegister64& to, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x163a, from, to, imm8); }
void SimdImpl_PExtract::Q(const xIndirect64& dest, const xRegisterSSE& from, u8 imm8) const { xOpWrite0F(0x66, 0x163a, from, dest, imm8); }
void SimdImpl_PExtract::Q(const xRegister64& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
void SimdImpl_PExtract::Q(const xIndirect64& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
// xSHUF is initialised with an empty brace list: no member values are supplied.
const xImplSimd_Shuffle xSHUF = {};
// xPSHUF supplies one initialiser per variant, listed in the order D, LW, HW, B as
// labelled on each row. D, LW and HW share opcode 0x70 and differ only in prefix
// (66 / F2 / F3); B uses opcode 0x00 with a 66 prefix in the 0F 38 opcode map.
// NOTE(review): this listing appears to be a rendered diff -- the first brace and its
// four {prefix, opcode} rows look like the superseded initialiser, and the second brace
// with the SIMDInstructionInfo rows like its replacement; confirm against the real file.
const xImplSimd_PShuffle xPSHUF =
{
{0x66, 0x70}, // D
{0xf2, 0x70}, // LW
{0xf3, 0x70}, // HW
{0x66, 0x0038}, // B
{
{SIMDInstructionInfo(0x70).i().p66()}, // D
{SIMDInstructionInfo(0x70).i().pf2()}, // LW
{SIMDInstructionInfo(0x70).i().pf3()}, // HW
{SIMDInstructionInfo(0x00).i().p66().m0f38()}, // B
};
const SimdImpl_PUnpack xPUNPCK =

View File

@@ -292,6 +292,27 @@ TEST(CodegenTests, SSETest)
CODEGEN_TEST(xEXTRACTPS(ptr32[r9], xmm3, 3), "66 41 0f 3a 17 19 03");
CODEGEN_TEST(xEXTRACTPS(ptr32[base], xmm1, 2), "66 0f 3a 17 0d f6 ff ff ff 02");
CODEGEN_TEST(xPSHUF.D(xmm2, ptr[r8], 0), "66 41 0f 70 10 00");
CODEGEN_TEST(xPSHUF.LW(xmm3, xmm8, 1), "f2 41 0f 70 d8 01");
CODEGEN_TEST(xPSHUF.HW(xmm4, xmm2, 8), "f3 0f 70 e2 08");
CODEGEN_TEST(xPSHUF.B(xmm2, ptr[r8]), "66 41 0f 38 00 10");
CODEGEN_TEST(xPINSR.B(xmm1, ebx, 1), "66 0f 3a 20 cb 01");
CODEGEN_TEST(xPINSR.W(xmm1, ebx, 1), "66 0f c4 cb 01");
CODEGEN_TEST(xPINSR.D(xmm1, ebx, 1), "66 0f 3a 22 cb 01");
CODEGEN_TEST(xPINSR.Q(xmm1, rbx, 1), "66 48 0f 3a 22 cb 01");
CODEGEN_TEST(xPINSR.B(xmm9, ptr8[rax], 1), "66 44 0f 3a 20 08 01");
CODEGEN_TEST(xPINSR.W(xmm9, ptr16[rax], 1), "66 44 0f c4 08 01");
CODEGEN_TEST(xPINSR.D(xmm9, ptr32[rax], 1), "66 44 0f 3a 22 08 01");
CODEGEN_TEST(xPINSR.Q(xmm9, ptr64[rax], 1), "66 4c 0f 3a 22 08 01");
CODEGEN_TEST(xPEXTR.B(ebx, xmm1, 1), "66 0f 3a 14 cb 01");
CODEGEN_TEST(xPEXTR.W(ebx, xmm1, 1), "66 0f c5 d9 01");
CODEGEN_TEST(xPEXTR.D(ebx, xmm1, 1), "66 0f 3a 16 cb 01");
CODEGEN_TEST(xPEXTR.Q(rbx, xmm1, 1), "66 48 0f 3a 16 cb 01");
CODEGEN_TEST(xPEXTR.B(ptr8[rax], xmm9, 1), "66 44 0f 3a 14 08 01");
CODEGEN_TEST(xPEXTR.W(ptr16[rax], xmm9, 1), "66 44 0f 3a 15 08 01");
CODEGEN_TEST(xPEXTR.D(ptr32[rax], xmm9, 1), "66 44 0f 3a 16 08 01");
CODEGEN_TEST(xPEXTR.Q(ptr64[rax], xmm9, 1), "66 4c 0f 3a 16 08 01");
CODEGEN_TEST(xMOVAPS(xmm0, xmm1), "0f 28 c1");
CODEGEN_TEST(xMOVAPS(xmm8, xmm9), "45 0f 28 c1");
CODEGEN_TEST(xMOVUPS(xmm8, ptr128[r8+r9]), "47 0f 10 04 08");
@@ -306,15 +327,6 @@ TEST(CodegenTests, SSETest)
CODEGEN_TEST(xMOVD(r10, xmm1), "66 49 0f 7e ca");
CODEGEN_TEST(xMOVD(rax, xmm10), "66 4c 0f 7e d0");
CODEGEN_TEST(xMOVD(r10, xmm10), "66 4d 0f 7e d2");
CODEGEN_TEST(xPINSR.B(xmm0, ebx, 1), "66 0f 3a 20 c3 01");
CODEGEN_TEST(xPINSR.W(xmm0, ebx, 1), "66 0f c4 c3 01");
CODEGEN_TEST(xPINSR.D(xmm0, ebx, 1), "66 0f 3a 22 c3 01");
CODEGEN_TEST(xPINSR.Q(xmm0, rbx, 1), "66 48 0f 3a 22 c3 01");
CODEGEN_TEST(xPEXTR.B(ebx, xmm0, 1), "66 0f 3a 14 c3 01");
CODEGEN_TEST(xPEXTR.W(ebx, xmm0, 1), "66 0f c5 c3 01");
CODEGEN_TEST(xPEXTR.D(ebx, xmm0, 1), "66 0f 3a 16 c3 01");
CODEGEN_TEST(xPEXTR.Q(rbx, xmm0, 1), "66 48 0f 3a 16 c3 01");
CODEGEN_TEST(xPEXTR.Q(ptr64[rax], xmm0, 1), "66 48 0f 3a 16 00 01");
}
TEST(CodegenTests, AVXTest)
@@ -464,6 +476,27 @@ TEST(CodegenTests, AVXTest)
CODEGEN_TEST(xEXTRACTPS(ptr32[r9], xmm3, 3), "c4 c3 79 17 19 03");
CODEGEN_TEST(xEXTRACTPS(ptr32[base], xmm1, 2), "c4 e3 79 17 0d f6 ff ff ff 02");
CODEGEN_TEST(xPSHUF.D(xmm2, ptr[r8], 0), "c4 c1 79 70 10 00");
CODEGEN_TEST(xPSHUF.LW(xmm3, xmm8, 1), "c4 c1 7b 70 d8 01");
CODEGEN_TEST(xPSHUF.HW(xmm4, xmm2, 8), "c5 fa 70 e2 08");
CODEGEN_TEST(xPSHUF.B(xmm2, ptr[r8]), "c4 c2 69 00 10");
CODEGEN_TEST(xPINSR.B(xmm1, ebx, 1), "c4 e3 71 20 cb 01");
CODEGEN_TEST(xPINSR.W(xmm1, ebx, 1), "c5 f1 c4 cb 01");
CODEGEN_TEST(xPINSR.D(xmm1, ebx, 1), "c4 e3 71 22 cb 01");
CODEGEN_TEST(xPINSR.Q(xmm1, rbx, 1), "c4 e3 f1 22 cb 01");
CODEGEN_TEST(xPINSR.B(xmm9, ptr8[rax], 1), "c4 63 31 20 08 01");
CODEGEN_TEST(xPINSR.W(xmm9, ptr16[rax], 1), "c5 31 c4 08 01");
CODEGEN_TEST(xPINSR.D(xmm9, ptr32[rax], 1), "c4 63 31 22 08 01");
CODEGEN_TEST(xPINSR.Q(xmm9, ptr64[rax], 1), "c4 63 b1 22 08 01");
CODEGEN_TEST(xPEXTR.B(ebx, xmm1, 1), "c4 e3 79 14 cb 01");
CODEGEN_TEST(xPEXTR.W(ebx, xmm1, 1), "c5 f9 c5 d9 01");
CODEGEN_TEST(xPEXTR.D(ebx, xmm1, 1), "c4 e3 79 16 cb 01");
CODEGEN_TEST(xPEXTR.Q(rbx, xmm1, 1), "c4 e3 f9 16 cb 01");
CODEGEN_TEST(xPEXTR.B(ptr8[rax], xmm9, 1), "c4 63 79 14 08 01");
CODEGEN_TEST(xPEXTR.W(ptr16[rax], xmm9, 1), "c4 63 79 15 08 01");
CODEGEN_TEST(xPEXTR.D(ptr32[rax], xmm9, 1), "c4 63 79 16 08 01");
CODEGEN_TEST(xPEXTR.Q(ptr64[rax], xmm9, 1), "c4 63 f9 16 08 01");
CODEGEN_TEST(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1");
CODEGEN_TEST(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07");
CODEGEN_TEST(xVMOVAPS(ptr32[rdi], xmm0), "c5 f8 29 07");