mirror of
https://github.com/FEX-Emu/FEX.git
synced 2025-02-16 04:47:32 +00:00
Merge pull request #2950 from Sonicadvance1/optimize_pmaddwd
OpcodeDispatcher: Optimize pmaddwd
This commit is contained in:
commit
185e3bfcb6
@ -3292,20 +3292,17 @@ OrderedNode* OpDispatchBuilder::PMADDWDOpImpl(OpcodeArgs, const X86Tables::Decod
|
||||
OrderedNode *Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags, -1);
|
||||
|
||||
if (Size == 8) {
|
||||
// MMX implementation can be slightly more optimal
|
||||
Size <<= 1;
|
||||
auto MullResult = _VSMull(Size, 2, Src1Node, Src2Node);
|
||||
return _VAddP(Size, 4, MullResult, MullResult);
|
||||
}
|
||||
|
||||
auto Src1_L = _VSXTL(Size, 2, Src1Node); // [15:0 ], [31:16], [32:47 ], [63:48 ]
|
||||
auto Src1_H = _VSXTL2(Size, 2, Src1Node); // [79:64], [95:80], [111:96], [127:112]
|
||||
|
||||
auto Src2_L = _VSXTL(Size, 2, Src2Node); // [15:0 ], [31:16], [32:47 ], [63:48 ]
|
||||
auto Src2_H = _VSXTL2(Size, 2, Src2Node); // [79:64], [95:80], [111:96], [127:112]
|
||||
|
||||
auto Res_L = _VSMul(Size, 4, Src1_L, Src2_L); // [15:0 ], [31:16], [32:47 ], [63:48 ] : Original elements
|
||||
auto Res_H = _VSMul(Size, 4, Src1_H, Src2_H); // [79:64], [95:80], [111:96], [127:112] : Original elements
|
||||
auto Lower = _VSMull(Size, 2, Src1Node, Src2Node);
|
||||
auto Upper = _VSMull2(Size, 2, Src1Node, Src2Node);
|
||||
|
||||
// [15:0 ] + [31:16], [47:32 ] + [63:48 ], [79:64] + [95:80], [111:96] + [127:112]
|
||||
return _VAddP(Size, 4, Res_L, Res_H);
|
||||
return _VAddP(Size, 4, Lower, Upper);
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::PMADDWD(OpcodeArgs) {
|
||||
|
@ -4410,19 +4410,14 @@
|
||||
]
|
||||
},
|
||||
"pmaddwd mm0, mm1": {
|
||||
"ExpectedInstructionCount": 10,
|
||||
"Optimal": "No",
|
||||
"ExpectedInstructionCount": 5,
|
||||
"Optimal": "Yes",
|
||||
"Comment": "0x0f 0xf5",
|
||||
"ExpectedArm64ASM": [
|
||||
"ldr d4, [x28, #752]",
|
||||
"ldr d5, [x28, #768]",
|
||||
"sxtl v6.4s, v4.4h",
|
||||
"sxtl2 v4.4s, v4.8h",
|
||||
"sxtl v7.4s, v5.4h",
|
||||
"sxtl2 v5.4s, v5.8h",
|
||||
"mul v6.4s, v6.4s, v7.4s",
|
||||
"mul v4.4s, v4.4s, v5.4s",
|
||||
"addp v4.4s, v6.4s, v4.4s",
|
||||
"smull v4.4s, v4.4h, v5.4h",
|
||||
"addp v4.4s, v4.4s, v4.4s",
|
||||
"str d4, [x28, #752]"
|
||||
]
|
||||
},
|
||||
|
@ -1087,16 +1087,12 @@
|
||||
]
|
||||
},
|
||||
"pmaddwd xmm0, xmm1": {
|
||||
"ExpectedInstructionCount": 7,
|
||||
"Optimal": "No",
|
||||
"ExpectedInstructionCount": 3,
|
||||
"Optimal": "Yes",
|
||||
"Comment": "0x66 0x0f 0xf5",
|
||||
"ExpectedArm64ASM": [
|
||||
"sxtl v4.4s, v16.4h",
|
||||
"sxtl2 v5.4s, v16.8h",
|
||||
"sxtl v6.4s, v17.4h",
|
||||
"sxtl2 v7.4s, v17.8h",
|
||||
"mul v4.4s, v4.4s, v6.4s",
|
||||
"mul v5.4s, v5.4s, v7.4s",
|
||||
"smull v4.4s, v16.4h, v17.4h",
|
||||
"smull2 v5.4s, v16.8h, v17.8h",
|
||||
"addp v16.4s, v4.4s, v5.4s"
|
||||
]
|
||||
},
|
||||
|
@ -6863,7 +6863,7 @@
|
||||
]
|
||||
},
|
||||
"vpmaddwd xmm0, xmm1, xmm2": {
|
||||
"ExpectedInstructionCount": 11,
|
||||
"ExpectedInstructionCount": 7,
|
||||
"Optimal": "No",
|
||||
"Comment": [
|
||||
"Map 1 0b01 0xf5 128-bit"
|
||||
@ -6871,12 +6871,8 @@
|
||||
"ExpectedArm64ASM": [
|
||||
"mov z4.d, p7/m, z17.d",
|
||||
"mov z5.d, p7/m, z18.d",
|
||||
"sxtl v6.4s, v4.4h",
|
||||
"sxtl2 v4.4s, v4.8h",
|
||||
"sxtl v7.4s, v5.4h",
|
||||
"sxtl2 v5.4s, v5.8h",
|
||||
"mul v6.4s, v6.4s, v7.4s",
|
||||
"mul v4.4s, v4.4s, v5.4s",
|
||||
"smull v6.4s, v4.4h, v5.4h",
|
||||
"smull2 v4.4s, v4.8h, v5.8h",
|
||||
"addp v4.4s, v6.4s, v4.4s",
|
||||
"mov v4.16b, v4.16b",
|
||||
"mov z16.d, p7/m, z4.d"
|
||||
@ -6891,12 +6887,12 @@
|
||||
"ExpectedArm64ASM": [
|
||||
"mov z4.d, p7/m, z17.d",
|
||||
"mov z5.d, p7/m, z18.d",
|
||||
"sunpklo z6.s, z4.h",
|
||||
"sunpkhi z4.s, z4.h",
|
||||
"sunpklo z7.s, z5.h",
|
||||
"sunpkhi z5.s, z5.h",
|
||||
"mul z6.s, z6.s, z7.s",
|
||||
"mul z4.s, z4.s, z5.s",
|
||||
"smullb z0.s, z4.h, z5.h",
|
||||
"smullt z1.s, z4.h, z5.h",
|
||||
"zip1 z6.s, z0.s, z1.s",
|
||||
"smullb z0.s, z4.h, z5.h",
|
||||
"smullt z1.s, z4.h, z5.h",
|
||||
"zip2 z4.s, z0.s, z1.s",
|
||||
"movprfx z0, z6",
|
||||
"addp z0.s, p7/m, z0.s, z4.s",
|
||||
"uzp1 z4.s, z0.s, z0.s",
|
||||
|
Loading…
x
Reference in New Issue
Block a user