OpcodeDispatcher: Remove unnecessary moves in {AVX}VectorUnaryOp

When a scalar operation's source is already in a vector register, we can load it
at the full vector length rather than loading a smaller size and zero-extending
the register, since the resulting value is just inserted into another vector
anyway. Memory operands still load the exact element size.
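
Concretely, taken from the instruction-count tests updated below: sqrtss xmm0, xmm1
previously zero-extended the source element into a temporary before operating on it:

    movi v0.2d, #0x0
    mov v0.s[0], v17.s[0]
    mov v4.16b, v0.16b
    fsqrt s4, s4
    mov v16.s[0], v4.s[0]

With the source loaded at the full vector length, the scalar operation reads the
source register directly and the three moves disappear:

    fsqrt s4, s17
    mov v16.s[0], v4.s[0]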
Lioncache 2023-08-19 19:41:31 -04:00
parent db60a2fd4b
commit 09addb217a
4 changed files with 28 additions and 38 deletions


@@ -651,13 +651,18 @@ template
 void OpDispatchBuilder::AVXVectorScalarALUOp<IR::OP_VFSUB, 8>(OpcodeArgs);
 
 void OpDispatchBuilder::VectorUnaryOpImpl(OpcodeArgs, IROps IROp, size_t ElementSize, bool Scalar) {
-  const auto Size = Scalar ? ElementSize : GetSrcSize(Op);
+  // In the event of a scalar operation and a vector source, then
+  // we can specify the entire vector length in order to avoid
+  // unnecessary sign extension on the element to be operated on.
+  // In the event of a memory operand, we load the exact element size.
+  const auto SrcSize = Scalar && Op->Src[0].IsGPR() ? 16U : GetSrcSize(Op);
+  const auto OpSize = Scalar ? ElementSize : GetSrcSize(Op);
   const auto DstSize = GetDstSize(Op);
 
-  OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1);
+  OrderedNode *Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags, -1);
   OrderedNode *Dest = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, DstSize, Op->Flags, -1);
 
-  auto ALUOp = _VFSqrt(Size, ElementSize, Src);
+  auto ALUOp = _VFSqrt(OpSize, ElementSize, Src);
 
   // Overwrite our IR's op type
   ALUOp.first->Header.Op = IROp;
@@ -702,19 +707,24 @@ template
 void OpDispatchBuilder::VectorUnaryOp<IR::OP_VABS, 4, false>(OpcodeArgs);
 
 void OpDispatchBuilder::AVXVectorUnaryOpImpl(OpcodeArgs, IROps IROp, size_t ElementSize, bool Scalar) {
-  const auto Size = Scalar ? ElementSize : GetSrcSize(Op);
+  // In the event of a scalar operation and a vector source, then
+  // we can specify the entire vector length in order to avoid
+  // unnecessary sign extension on the element to be operated on.
+  // In the event of a memory operand, we load the exact element size.
+  const auto SrcSize = Scalar && Op->Src[1].IsGPR() ? 16U : GetSrcSize(Op);
+  const auto OpSize = Scalar ? ElementSize : GetSrcSize(Op);
   const auto DstSize = GetDstSize(Op);
 
   OrderedNode *Src = [&] {
     const auto SrcIndex = Scalar ? 1 : 0;
-    return LoadSource(FPRClass, Op, Op->Src[SrcIndex], Op->Flags, -1);
+    return LoadSource_WithOpSize(FPRClass, Op, Op->Src[SrcIndex], SrcSize, Op->Flags, -1);
   }();
   OrderedNode *Dest = [&] {
     const auto& Operand = Scalar ? Op->Src[0] : Op->Dest;
     return LoadSource_WithOpSize(FPRClass, Op, Operand, DstSize, Op->Flags, -1);
   }();
 
-  auto ALUOp = _VFSqrt(Size, ElementSize, Src);
+  auto ALUOp = _VFSqrt(OpSize, ElementSize, Src);
 
   // Overwrite our IR's op type
   ALUOp.first->Header.Op = IROp;
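
Note for the AVX path: for scalar forms the element being operated on comes from the
second source operand (Op->Src[1]), while Op->Src[0] supplies the upper bits merged
into the destination, which is why the IsGPR check is on Src[1] rather than Src[0].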


@@ -206,47 +206,38 @@
     ]
   },
   "sqrtss xmm0, xmm1": {
-    "ExpectedInstructionCount": 5,
+    "ExpectedInstructionCount": 2,
     "Optimal": "No",
     "Comment": "0xf3 0x0f 0x51",
     "ExpectedArm64ASM": [
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v17.s[0]",
-      "mov v4.16b, v0.16b",
-      "fsqrt s4, s4",
+      "fsqrt s4, s17",
       "mov v16.s[0], v4.s[0]"
     ]
   },
   "rsqrtss xmm0, xmm1": {
-    "ExpectedInstructionCount": 7,
+    "ExpectedInstructionCount": 4,
     "Optimal": "No",
     "Comment": [
       "FEAT_FPRES could make this more optimal",
       "0xf3 0x0f 0x52"
     ],
     "ExpectedArm64ASM": [
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v17.s[0]",
-      "mov v4.16b, v0.16b",
       "fmov s0, #0x70 (1.0000)",
-      "fsqrt s1, s4",
+      "fsqrt s1, s17",
       "fdiv s4, s0, s1",
       "mov v16.s[0], v4.s[0]"
     ]
   },
   "rcpss xmm0, xmm1": {
-    "ExpectedInstructionCount": 6,
+    "ExpectedInstructionCount": 3,
     "Optimal": "No",
     "Comment": [
       "FEAT_FPRES could make this more optimal",
       "0xf3 0x0f 0x53"
     ],
     "ExpectedArm64ASM": [
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v17.s[0]",
-      "mov v4.16b, v0.16b",
       "fmov s0, #0x70 (1.0000)",
-      "fdiv s4, s0, s4",
+      "fdiv s4, s0, s17",
      "mov v16.s[0], v4.s[0]"
    ]
  },


@@ -187,15 +187,14 @@
     ]
   },
   "sqrtsd xmm0, xmm1": {
-    "ExpectedInstructionCount": 3,
+    "ExpectedInstructionCount": 2,
     "Optimal": "No",
     "Comment": [
       "With AFP mode FEX can remove an insert after the operation.",
       "0xf2 0x0f 0x51"
     ],
     "ExpectedArm64ASM": [
-      "mov v4.8b, v17.8b",
-      "fsqrt d4, d4",
+      "fsqrt d4, d17",
       "mov v16.d[0], v4.d[0]"
     ]
   },


@@ -729,7 +729,7 @@
     ]
   },
   "vsqrtss xmm0, xmm1, xmm2": {
-    "ExpectedInstructionCount": 11,
+    "ExpectedInstructionCount": 8,
     "Optimal": "No",
     "Comment": [
       "Insert in to first element could be more optimal, which is the common case.",
@@ -737,9 +737,6 @@
     ],
     "ExpectedArm64ASM": [
       "mov z4.d, p7/m, z18.d",
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v4.s[0]",
-      "mov v4.16b, v0.16b",
       "mov z5.d, p7/m, z17.d",
       "fsqrt s4, s4",
       "mov v0.16b, v5.16b",
@@ -750,7 +747,7 @@
     ]
   },
   "vsqrtsd xmm0, xmm1, xmm2": {
-    "ExpectedInstructionCount": 9,
+    "ExpectedInstructionCount": 8,
     "Optimal": "No",
     "Comment": [
       "Insert in to first element could be more optimal, which is the common case.",
@@ -758,7 +755,6 @@
     ],
     "ExpectedArm64ASM": [
       "mov z4.d, p7/m, z18.d",
-      "mov v4.8b, v4.8b",
       "mov z5.d, p7/m, z17.d",
       "fsqrt d4, d4",
       "mov v0.16b, v5.16b",
@@ -800,7 +796,7 @@
     ]
   },
   "vrsqrtss xmm0, xmm1, xmm2": {
-    "ExpectedInstructionCount": 13,
+    "ExpectedInstructionCount": 10,
     "Optimal": "No",
     "Comment": [
       "FEAT_FPRES could make this more optimal",
@@ -808,9 +804,6 @@
     ],
     "ExpectedArm64ASM": [
       "mov z4.d, p7/m, z18.d",
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v4.s[0]",
-      "mov v4.16b, v0.16b",
       "mov z5.d, p7/m, z17.d",
       "fmov s0, #0x70 (1.0000)",
       "fsqrt s1, s4",
@@ -853,7 +846,7 @@
     ]
   },
   "vrcpss xmm0, xmm1, xmm2": {
-    "ExpectedInstructionCount": 12,
+    "ExpectedInstructionCount": 9,
     "Optimal": "No",
     "Comment": [
       "FEAT_FPRES could make this more optimal",
@@ -861,9 +854,6 @@
     ],
     "ExpectedArm64ASM": [
       "mov z4.d, p7/m, z18.d",
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v4.s[0]",
-      "mov v4.16b, v0.16b",
       "mov z5.d, p7/m, z17.d",
       "fmov s0, #0x70 (1.0000)",
       "fdiv s4, s0, s4",