From 8ad9d7d04760e0e96336c307bb8670edf0be49a3 Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Sun, 1 Jun 2025 21:30:20 -0500 Subject: [PATCH] Common: Switch SIMD padd/pmul instructions to auto SSE/AVX --- common/emitter/implement/simd_arithmetic.h | 30 +++++------ common/emitter/simd.cpp | 52 +++++++++---------- .../common/x86emitter/codegen_tests_main.cpp | 48 +++++++++++++++++ 3 files changed, 89 insertions(+), 41 deletions(-) diff --git a/common/emitter/implement/simd_arithmetic.h b/common/emitter/implement/simd_arithmetic.h index adf2ec82d8..ef9993f213 100644 --- a/common/emitter/implement/simd_arithmetic.h +++ b/common/emitter/implement/simd_arithmetic.h @@ -51,32 +51,32 @@ namespace x86Emitter // struct xImplSimd_AddSub { - const xImplSimd_DestRegEither B; - const xImplSimd_DestRegEither W; - const xImplSimd_DestRegEither D; - const xImplSimd_DestRegEither Q; + const xImplSimd_3Arg B; + const xImplSimd_3Arg W; + const xImplSimd_3Arg D; + const xImplSimd_3Arg Q; // Add/Sub packed signed byte [8bit] integers from src into dest, and saturate the results. - const xImplSimd_DestRegEither SB; + const xImplSimd_3Arg SB; // Add/Sub packed signed word [16bit] integers from src into dest, and saturate the results. - const xImplSimd_DestRegEither SW; + const xImplSimd_3Arg SW; // Add/Sub packed unsigned byte [8bit] integers from src into dest, and saturate the results. - const xImplSimd_DestRegEither USB; + const xImplSimd_3Arg USB; // Add/Sub packed unsigned word [16bit] integers from src into dest, and saturate the results. 
- const xImplSimd_DestRegEither USW; + const xImplSimd_3Arg USW; }; ////////////////////////////////////////////////////////////////////////////////////////// // struct xImplSimd_PMul { - const xImplSimd_DestRegEither LW; - const xImplSimd_DestRegEither HW; - const xImplSimd_DestRegEither HUW; - const xImplSimd_DestRegEither UDQ; + const xImplSimd_3Arg LW; + const xImplSimd_3Arg HW; + const xImplSimd_3Arg HUW; + const xImplSimd_3Arg UDQ; // [SSE-3] PMULHRSW multiplies vertically each signed 16-bit integer from dest with the // corresponding signed 16-bit integer of source, producing intermediate signed 32-bit @@ -88,14 +88,14 @@ namespace x86Emitter // // Both operands can be MMX or XMM registers. Source can be register or memory. // - const xImplSimd_DestRegEither HRSW; + const xImplSimd_3Arg HRSW; // [SSE-4.1] Multiply the packed dword signed integers in dest with src, and store // the low 32 bits of each product in xmm1. - const xImplSimd_DestRegSSE LD; + const xImplSimd_3Arg LD; // [SSE-4.1] Multiply the packed signed dword integers in dest with src. 
- const xImplSimd_DestRegSSE DQ; + const xImplSimd_3Arg DQ; }; ////////////////////////////////////////////////////////////////////////////////////////// diff --git a/common/emitter/simd.cpp b/common/emitter/simd.cpp index 5edb75a5ea..6e595f9520 100644 --- a/common/emitter/simd.cpp +++ b/common/emitter/simd.cpp @@ -339,41 +339,41 @@ namespace x86Emitter }; const xImplSimd_AddSub xPADD = - { - {0x66, 0xdc + 0x20}, // B - {0x66, 0xdc + 0x21}, // W - {0x66, 0xdc + 0x22}, // D - {0x66, 0xd4}, // Q + { + {SIMDInstructionInfo(0xfc).p66().i().commutative()}, // B + {SIMDInstructionInfo(0xfd).p66().i().commutative()}, // W + {SIMDInstructionInfo(0xfe).p66().i().commutative()}, // D + {SIMDInstructionInfo(0xd4).p66().i().commutative()}, // Q - {0x66, 0xdc + 0x10}, // SB - {0x66, 0xdc + 0x11}, // SW - {0x66, 0xdc}, // USB - {0x66, 0xdc + 1}, // USW + {SIMDInstructionInfo(0xec).p66().i().commutative()}, // SB + {SIMDInstructionInfo(0xed).p66().i().commutative()}, // SW + {SIMDInstructionInfo(0xdc).p66().i().commutative()}, // USB + {SIMDInstructionInfo(0xdd).p66().i().commutative()}, // USW }; const xImplSimd_AddSub xPSUB = - { - {0x66, 0xd8 + 0x20}, // B - {0x66, 0xd8 + 0x21}, // W - {0x66, 0xd8 + 0x22}, // D - {0x66, 0xfb}, // Q + { + {SIMDInstructionInfo(0xf8).p66().i()}, // B + {SIMDInstructionInfo(0xf9).p66().i()}, // W + {SIMDInstructionInfo(0xfa).p66().i()}, // D + {SIMDInstructionInfo(0xfb).p66().i()}, // Q - {0x66, 0xd8 + 0x10}, // SB - {0x66, 0xd8 + 0x11}, // SW - {0x66, 0xd8}, // USB - {0x66, 0xd8 + 1}, // USW + {SIMDInstructionInfo(0xe8).p66().i()}, // SB + {SIMDInstructionInfo(0xe9).p66().i()}, // SW + {SIMDInstructionInfo(0xd8).p66().i()}, // USB + {SIMDInstructionInfo(0xd9).p66().i()}, // USW }; const xImplSimd_PMul xPMUL = - { - {0x66, 0xd5}, // LW - {0x66, 0xe5}, // HW - {0x66, 0xe4}, // HUW - {0x66, 0xf4}, // UDQ + { + {SIMDInstructionInfo(0xd5).p66().i().commutative()}, // LW + {SIMDInstructionInfo(0xe5).p66().i().commutative()}, // HW + 
{SIMDInstructionInfo(0xe4).p66().i().commutative()}, // HUW + {SIMDInstructionInfo(0xf4).p66().i().commutative()}, // UDQ - {0x66, 0x0b38}, // HRSW - {0x66, 0x4038}, // LD - {0x66, 0x2838}, // DQ + {SIMDInstructionInfo(0x0b).p66().m0f38().i().commutative()}, // HRSW + {SIMDInstructionInfo(0x40).p66().m0f38().i().commutative()}, // LD + {SIMDInstructionInfo(0x28).p66().m0f38().i().commutative()}, // DQ }; const xImplSimd_rSqrt xRSQRT = diff --git a/tests/ctest/common/x86emitter/codegen_tests_main.cpp b/tests/ctest/common/x86emitter/codegen_tests_main.cpp index 5c80b9670b..ae66077ae9 100644 --- a/tests/ctest/common/x86emitter/codegen_tests_main.cpp +++ b/tests/ctest/common/x86emitter/codegen_tests_main.cpp @@ -187,6 +187,30 @@ TEST(CodegenTests, SSETest) CODEGEN_TEST(xPSRL.Q(xmm7, 4), "66 0f 73 d7 04"); CODEGEN_TEST(xPSRL.DQ(xmm8, 5), "66 41 0f 73 d8 05"); + CODEGEN_TEST(xPADD.B(xmm1, xmm8), "66 41 0f fc c8"); + CODEGEN_TEST(xPADD.W(xmm4, xmm7), "66 0f fd e7"); + CODEGEN_TEST(xPADD.D(xmm2, ptr[rcx]), "66 0f fe 11"); + CODEGEN_TEST(xPADD.Q(xmm8, xmm2), "66 44 0f d4 c2"); + CODEGEN_TEST(xPADD.SB(xmm9, xmm8), "66 45 0f ec c8"); + CODEGEN_TEST(xPADD.SW(xmm2, ptr[r8]), "66 41 0f ed 10"); + CODEGEN_TEST(xPADD.USB(xmm3, xmm3), "66 0f dc db"); + CODEGEN_TEST(xPADD.USW(xmm2, xmm9), "66 41 0f dd d1"); + CODEGEN_TEST(xPSUB.B(xmm1, xmm8), "66 41 0f f8 c8"); + CODEGEN_TEST(xPSUB.W(xmm4, xmm7), "66 0f f9 e7"); + CODEGEN_TEST(xPSUB.D(xmm2, ptr[rcx]), "66 0f fa 11"); + CODEGEN_TEST(xPSUB.Q(xmm8, xmm2), "66 44 0f fb c2"); + CODEGEN_TEST(xPSUB.SB(xmm9, xmm8), "66 45 0f e8 c8"); + CODEGEN_TEST(xPSUB.SW(xmm2, ptr[r8]), "66 41 0f e9 10"); + CODEGEN_TEST(xPSUB.USB(xmm3, xmm3), "66 0f d8 db"); + CODEGEN_TEST(xPSUB.USW(xmm2, xmm9), "66 41 0f d9 d1"); + CODEGEN_TEST(xPMUL.LW(xmm2, xmm8), "66 41 0f d5 d0"); + CODEGEN_TEST(xPMUL.HW(xmm9, ptr[r9]), "66 45 0f e5 09"); + CODEGEN_TEST(xPMUL.HUW(xmm4, xmm3), "66 0f e4 e3"); + CODEGEN_TEST(xPMUL.UDQ(xmm1, xmm7), "66 0f f4 cf"); + 
CODEGEN_TEST(xPMUL.HRSW(xmm2, xmm4), "66 0f 38 0b d4"); + CODEGEN_TEST(xPMUL.LD(xmm1, xmm8), "66 41 0f 38 40 c8"); + CODEGEN_TEST(xPMUL.DQ(xmm4, xmm9), "66 41 0f 38 28 e1"); + CODEGEN_TEST(xMOVAPS(xmm0, xmm1), "0f 28 c1"); CODEGEN_TEST(xMOVAPS(xmm8, xmm9), "45 0f 28 c1"); CODEGEN_TEST(xMOVUPS(xmm8, ptr128[r8+r9]), "47 0f 10 04 08"); @@ -255,6 +279,30 @@ TEST(CodegenTests, AVXTest) CODEGEN_TEST(xPSRL.Q(xmm7, 4), "c5 c1 73 d7 04"); CODEGEN_TEST(xPSRL.DQ(xmm8, 5), "c4 c1 39 73 d8 05"); + CODEGEN_TEST(xPADD.B(xmm1, xmm8), "c5 b9 fc c9"); // => vpaddb xmm1, xmm8, xmm1 + CODEGEN_TEST(xPADD.W(xmm4, xmm7), "c5 d9 fd e7"); + CODEGEN_TEST(xPADD.D(xmm2, ptr[rcx]), "c5 e9 fe 11"); + CODEGEN_TEST(xPADD.Q(xmm8, xmm2), "c5 39 d4 c2"); + CODEGEN_TEST(xPADD.SB(xmm9, xmm8), "c4 41 31 ec c8"); + CODEGEN_TEST(xPADD.SW(xmm2, ptr[r8]), "c4 c1 69 ed 10"); + CODEGEN_TEST(xPADD.USB(xmm3, xmm3), "c5 e1 dc db"); + CODEGEN_TEST(xPADD.USW(xmm2, xmm9), "c5 b1 dd d2"); // => vpaddusw xmm2, xmm9, xmm2 + CODEGEN_TEST(xPSUB.B(xmm1, xmm8), "c4 c1 71 f8 c8"); + CODEGEN_TEST(xPSUB.W(xmm4, xmm7), "c5 d9 f9 e7"); + CODEGEN_TEST(xPSUB.D(xmm2, ptr[rcx]), "c5 e9 fa 11"); + CODEGEN_TEST(xPSUB.Q(xmm8, xmm2), "c5 39 fb c2"); + CODEGEN_TEST(xPSUB.SB(xmm9, xmm8), "c4 41 31 e8 c8"); + CODEGEN_TEST(xPSUB.SW(xmm2, ptr[r8]), "c4 c1 69 e9 10"); + CODEGEN_TEST(xPSUB.USB(xmm3, xmm3), "c5 e1 d8 db"); + CODEGEN_TEST(xPSUB.USW(xmm2, xmm9), "c4 c1 69 d9 d1"); + CODEGEN_TEST(xPMUL.LW(xmm2, xmm8), "c5 b9 d5 d2"); // => vpmullw xmm2, xmm8, xmm2 + CODEGEN_TEST(xPMUL.HW(xmm9, ptr[r9]), "c4 41 31 e5 09"); + CODEGEN_TEST(xPMUL.HUW(xmm4, xmm3), "c5 d9 e4 e3"); + CODEGEN_TEST(xPMUL.UDQ(xmm1, xmm7), "c5 f1 f4 cf"); + CODEGEN_TEST(xPMUL.HRSW(xmm2, xmm4), "c4 e2 69 0b d4"); + CODEGEN_TEST(xPMUL.LD(xmm1, xmm8), "c4 c2 71 40 c8"); + CODEGEN_TEST(xPMUL.DQ(xmm4, xmm9), "c4 c2 59 28 e1"); + CODEGEN_TEST(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1"); CODEGEN_TEST(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07"); CODEGEN_TEST(xVMOVAPS(ptr32[rdi], xmm0), "c5 
f8 29 07");