diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index e97493a55..ff1f52ce9 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -748,17 +748,26 @@ void OpDispatchBuilder::AVX128_MOVQ(OpcodeArgs) { void OpDispatchBuilder::AVX128_VMOVLP(OpcodeArgs) { auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); - if (Op->Dest.IsGPR()) { - auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); - + if (!Op->Dest.IsGPR()) { + ///< VMOVLPS/PD mem64, xmm1 + StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Src1.Low, OpSize::i64Bit, OpSize::i64Bit); + } else if (!Op->Src[1].IsGPR()) { + ///< VMOVLPS/PD xmm1, xmm2, mem64 // Bits[63:0] come from Src2[63:0] // Bits[127:64] come from Src1[127:64] - Ref Result_Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src2.Low, Src1.Low); + auto Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], OpSize::i64Bit, Op->Flags); + Ref Result_Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src2, Src1.Low); Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = ZeroVector}); } else { - StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Src1.Low, OpSize::i64Bit, OpSize::i64Bit); + ///< VMOVHLPS/PD xmm1, xmm2, xmm3 + auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); + + Ref Result_Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 0, 1, Src1.Low, Src2.Low); + Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); + + AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = ZeroVector}); } } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index f6033e36b..f06992052 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -132,15 +132,23 @@ void OpDispatchBuilder::MOVLPOp(OpcodeArgs) { } void OpDispatchBuilder::VMOVLPOp(OpcodeArgs) { - if (Op->Dest.IsGPR()) { - Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, {.Align = 16}); - Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags, {.Align = 8}); - Ref Result = _VInsElement(16, 8, 0, 0, Src1, Src2); + Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, {.Align = 16}); + if (!Op->Dest.IsGPR()) { + ///< VMOVLPS/PD mem64, xmm1 + StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Src1, 8, 8); + } else if (!Op->Src[1].IsGPR()) { + ///< VMOVLPS/PD xmm1, xmm2, mem64 + // Bits[63:0] come from Src2[63:0] + // Bits[127:64] come from Src1[127:64] + Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags, {.Align = 8}); + Ref Result = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src2, Src1); StoreResult(FPRClass, Op, Result, -1); } else { - Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, {.Align = 8}); - StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Src, 8, 8); + ///< VMOVHLPS/PD xmm1, xmm2, xmm3 + Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags, {.Align = 16}); + Ref Result = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 0, 1, Src1, Src2); + StoreResult(FPRClass, Op, Result, -1); } } diff --git a/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp b/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp index f9ec12e43..fb120eae7 100644 --- a/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp +++ b/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp @@ -27,8 +27,8 @@ std::array VEXTableOps = []() consteval { {OPD(1, 0b10, 0x11), 1, X86InstInfo{"VMOVSS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}}, {OPD(1, 0b11, 0x11), 1, X86InstInfo{"VMOVSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}}, - {OPD(1, 0b00, 0x12), 1, X86InstInfo{"VMOVLPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS | FLAGS_VEX_1ST_SRC, 0, nullptr}}, - {OPD(1, 0b01, 0x12), 1, X86InstInfo{"VMOVLPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS | FLAGS_VEX_1ST_SRC, 0, nullptr}}, + {OPD(1, 0b00, 0x12), 1, X86InstInfo{"VMOVLPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_1ST_SRC, 0, nullptr}}, + {OPD(1, 0b01, 0x12), 1, X86InstInfo{"VMOVLPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_1ST_SRC, 0, nullptr}}, {OPD(1, 0b10, 0x12), 1, X86InstInfo{"VMOVSLDUP", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0, nullptr}}, {OPD(1, 0b11, 0x12), 1, X86InstInfo{"VMOVDDUP", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0, nullptr}}, diff --git a/unittests/ASM/VEX/vmovhlps.asm b/unittests/ASM/VEX/vmovhlps.asm new file mode 100644 index 000000000..3507de7e9 --- /dev/null +++ b/unittests/ASM/VEX/vmovhlps.asm @@ -0,0 +1,34 @@ +%ifdef CONFIG +{ + "HostFeatures": ["AVX"], + "RegData": { + "XMM1": ["0x4150f0e342241b6c", "0xdddddddddddddddd", "0x0000000000000000", "0x0000000000000000"], + "XMM2": ["0xCCCCCCCCCCCCCCCC", "0xDDDDDDDDDDDDDDDD", "0xEEEEEEEEEEEEEEEE", "0xFFFFFFFFFFFFFFFF"], + "XMM3": ["0x4150f0e342241b6c", "0xdddddddddddddddd", "0x0000000000000000", "0x0000000000000000"], + "XMM5": ["0x428b029f42a63326", "0x4150f0e342241b6c", "0x41aff21340ab4706", "0x40aa5bea411ac802"], + "XMM6": ["0x428b029f42a63326", "0x4150f0e342241b6c", "0x41aff21340ab4706", "0x40aa5bea411ac802"] + } +} +%endif + +; Load inputs +vmovapd ymm1, [rel .data] +vmovapd ymm2, [rel .data + 32] +vmovapd ymm5, [rel .data_random] +vmovapd ymm6, [rel .data_random] + +vmovhlps xmm1, xmm2, xmm5 +vmovhlps xmm3, xmm1, xmm5 + +hlt + +align 32 +.data: +db 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +db 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +db 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD +db 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF + +.data_random: +dd 83.0999,69.50512,41.02678,13.05881,5.35242,21.9932,9.67383,5.32372,29.02872,66.50151,19.30764,91.3633,40.45086,50.96153,32.64489,23.97574,90.64316,24.22547,98.9394,91.21715,90.80143,99.48407,64.97245,74.39838,35.22761,25.35321,5.8732,90.19956,33.03133,52.02952,58.38554,10.17531,47.84703,84.04831,90.02965,65.81329,96.27991,6.64479,25.58971,95.00694,88.1929,37.16964,49.52602,10.27223,77.70605,20.21439,9.8056,41.29389,15.4071,57.54286,9.61117,55.54302,52.90745,4.88086,72.52882,3.0201,56.55091,71.22749,61.84736,88.74295,47.72641,24.17404,33.70564,96.71303 + diff --git a/unittests/InstructionCountCI/VEX_map1.json b/unittests/InstructionCountCI/VEX_map1.json index 275b1fae1..6ddaceb16 100644 --- a/unittests/InstructionCountCI/VEX_map1.json +++ b/unittests/InstructionCountCI/VEX_map1.json @@ -216,8 +216,8 @@ ], "ExpectedArm64ASM": [ "ldr q2, [x4]", - "mov v16.16b, v17.16b", - "mov v16.d[0], v2.d[0]" + "mov v16.16b, v2.16b", + "mov v16.d[1], v17.d[1]" ] }, "vmovlpd xmm0, xmm1, [rax]": { @@ -228,8 +228,8 @@ ], "ExpectedArm64ASM": [ "ldr q2, [x4]", - "mov v16.16b, v17.16b", - "mov v16.d[0], v2.d[0]" + "mov v16.16b, v2.16b", + "mov v16.d[1], v17.d[1]" ] }, "vmovsldup xmm0, [rax]": {