Merge pull request #2950 from Sonicadvance1/optimize_pmaddwd

OpcodeDispatcher: Optimize pmaddwd
This commit is contained in:
Mai 2023-08-21 21:39:30 -04:00 committed by GitHub
commit 185e3bfcb6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 23 additions and 39 deletions

View File

@ -3292,20 +3292,17 @@ OrderedNode* OpDispatchBuilder::PMADDWDOpImpl(OpcodeArgs, const X86Tables::Decod
OrderedNode *Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags, -1);
if (Size == 8) {
// MMX implementation can be slightly more optimal
Size <<= 1;
auto MullResult = _VSMull(Size, 2, Src1Node, Src2Node);
return _VAddP(Size, 4, MullResult, MullResult);
}
auto Src1_L = _VSXTL(Size, 2, Src1Node); // [15:0 ], [31:16], [32:47 ], [63:48 ]
auto Src1_H = _VSXTL2(Size, 2, Src1Node); // [79:64], [95:80], [111:96], [127:112]
auto Src2_L = _VSXTL(Size, 2, Src2Node); // [15:0 ], [31:16], [32:47 ], [63:48 ]
auto Src2_H = _VSXTL2(Size, 2, Src2Node); // [79:64], [95:80], [111:96], [127:112]
auto Res_L = _VSMul(Size, 4, Src1_L, Src2_L); // [15:0 ], [31:16], [32:47 ], [63:48 ] : Original elements
auto Res_H = _VSMul(Size, 4, Src1_H, Src2_H); // [79:64], [95:80], [111:96], [127:112] : Original elements
auto Lower = _VSMull(Size, 2, Src1Node, Src2Node);
auto Upper = _VSMull2(Size, 2, Src1Node, Src2Node);
// [15:0 ] + [31:16], [47:32] + [63:48], [79:64] + [95:80], [111:96] + [127:112]
return _VAddP(Size, 4, Res_L, Res_H);
return _VAddP(Size, 4, Lower, Upper);
}
void OpDispatchBuilder::PMADDWD(OpcodeArgs) {

View File

@ -4410,19 +4410,14 @@
]
},
"pmaddwd mm0, mm1": {
"ExpectedInstructionCount": 10,
"Optimal": "No",
"ExpectedInstructionCount": 5,
"Optimal": "Yes",
"Comment": "0x0f 0xf5",
"ExpectedArm64ASM": [
"ldr d4, [x28, #752]",
"ldr d5, [x28, #768]",
"sxtl v6.4s, v4.4h",
"sxtl2 v4.4s, v4.8h",
"sxtl v7.4s, v5.4h",
"sxtl2 v5.4s, v5.8h",
"mul v6.4s, v6.4s, v7.4s",
"mul v4.4s, v4.4s, v5.4s",
"addp v4.4s, v6.4s, v4.4s",
"smull v4.4s, v4.4h, v5.4h",
"addp v4.4s, v4.4s, v4.4s",
"str d4, [x28, #752]"
]
},

View File

@ -1087,16 +1087,12 @@
]
},
"pmaddwd xmm0, xmm1": {
"ExpectedInstructionCount": 7,
"Optimal": "No",
"ExpectedInstructionCount": 3,
"Optimal": "Yes",
"Comment": "0x66 0x0f 0xf5",
"ExpectedArm64ASM": [
"sxtl v4.4s, v16.4h",
"sxtl2 v5.4s, v16.8h",
"sxtl v6.4s, v17.4h",
"sxtl2 v7.4s, v17.8h",
"mul v4.4s, v4.4s, v6.4s",
"mul v5.4s, v5.4s, v7.4s",
"smull v4.4s, v16.4h, v17.4h",
"smull2 v5.4s, v16.8h, v17.8h",
"addp v16.4s, v4.4s, v5.4s"
]
},

View File

@ -6863,7 +6863,7 @@
]
},
"vpmaddwd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 11,
"ExpectedInstructionCount": 7,
"Optimal": "No",
"Comment": [
"Map 1 0b01 0xf5 128-bit"
@ -6871,12 +6871,8 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"sxtl v6.4s, v4.4h",
"sxtl2 v4.4s, v4.8h",
"sxtl v7.4s, v5.4h",
"sxtl2 v5.4s, v5.8h",
"mul v6.4s, v6.4s, v7.4s",
"mul v4.4s, v4.4s, v5.4s",
"smull v6.4s, v4.4h, v5.4h",
"smull2 v4.4s, v4.8h, v5.8h",
"addp v4.4s, v6.4s, v4.4s",
"mov v4.16b, v4.16b",
"mov z16.d, p7/m, z4.d"
@ -6891,12 +6887,12 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"sunpklo z6.s, z4.h",
"sunpkhi z4.s, z4.h",
"sunpklo z7.s, z5.h",
"sunpkhi z5.s, z5.h",
"mul z6.s, z6.s, z7.s",
"mul z4.s, z4.s, z5.s",
"smullb z0.s, z4.h, z5.h",
"smullt z1.s, z4.h, z5.h",
"zip1 z6.s, z0.s, z1.s",
"smullb z0.s, z4.h, z5.h",
"smullt z1.s, z4.h, z5.h",
"zip2 z4.s, z0.s, z1.s",
"movprfx z0, z6",
"addp z0.s, p7/m, z0.s, z4.s",
"uzp1 z4.s, z0.s, z0.s",