mirror of
https://github.com/FEX-Emu/FEX.git
synced 2025-02-12 18:39:18 +00:00
OpcodeDispatcher: Remove unnecessary moves in {AVX}VectorUnaryOp
When dealing with source vectors, we can use the vector length rather than using a smaller size and zero extending the register, especially since the resulting value is just inserted into another vector.
This commit is contained in:
parent
db60a2fd4b
commit
09addb217a
@ -651,13 +651,18 @@ template
|
||||
void OpDispatchBuilder::AVXVectorScalarALUOp<IR::OP_VFSUB, 8>(OpcodeArgs);
|
||||
|
||||
void OpDispatchBuilder::VectorUnaryOpImpl(OpcodeArgs, IROps IROp, size_t ElementSize, bool Scalar) {
|
||||
const auto Size = Scalar ? ElementSize : GetSrcSize(Op);
|
||||
// In the event of a scalar operation and a vector source, then
|
||||
// we can specify the entire vector length in order to avoid
|
||||
// unnecessary zero extension on the element to be operated on.
|
||||
// In the event of a memory operand, we load the exact element size.
|
||||
const auto SrcSize = Scalar && Op->Src[0].IsGPR() ? 16U : GetSrcSize(Op);
|
||||
const auto OpSize = Scalar ? ElementSize : GetSrcSize(Op);
|
||||
const auto DstSize = GetDstSize(Op);
|
||||
|
||||
OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1);
|
||||
OrderedNode *Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags, -1);
|
||||
OrderedNode *Dest = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, DstSize, Op->Flags, -1);
|
||||
|
||||
auto ALUOp = _VFSqrt(Size, ElementSize, Src);
|
||||
auto ALUOp = _VFSqrt(OpSize, ElementSize, Src);
|
||||
// Overwrite our IR's op type
|
||||
ALUOp.first->Header.Op = IROp;
|
||||
|
||||
@ -702,19 +707,24 @@ template
|
||||
void OpDispatchBuilder::VectorUnaryOp<IR::OP_VABS, 4, false>(OpcodeArgs);
|
||||
|
||||
void OpDispatchBuilder::AVXVectorUnaryOpImpl(OpcodeArgs, IROps IROp, size_t ElementSize, bool Scalar) {
|
||||
const auto Size = Scalar ? ElementSize : GetSrcSize(Op);
|
||||
// In the event of a scalar operation and a vector source, then
|
||||
// we can specify the entire vector length in order to avoid
|
||||
// unnecessary zero extension on the element to be operated on.
|
||||
// In the event of a memory operand, we load the exact element size.
|
||||
const auto SrcSize = Scalar && Op->Src[1].IsGPR() ? 16U : GetSrcSize(Op);
|
||||
const auto OpSize = Scalar ? ElementSize : GetSrcSize(Op);
|
||||
const auto DstSize = GetDstSize(Op);
|
||||
|
||||
OrderedNode *Src = [&] {
|
||||
const auto SrcIndex = Scalar ? 1 : 0;
|
||||
return LoadSource(FPRClass, Op, Op->Src[SrcIndex], Op->Flags, -1);
|
||||
return LoadSource_WithOpSize(FPRClass, Op, Op->Src[SrcIndex], SrcSize, Op->Flags, -1);
|
||||
}();
|
||||
OrderedNode *Dest = [&] {
|
||||
const auto& Operand = Scalar ? Op->Src[0] : Op->Dest;
|
||||
return LoadSource_WithOpSize(FPRClass, Op, Operand, DstSize, Op->Flags, -1);
|
||||
}();
|
||||
|
||||
auto ALUOp = _VFSqrt(Size, ElementSize, Src);
|
||||
auto ALUOp = _VFSqrt(OpSize, ElementSize, Src);
|
||||
// Overwrite our IR's op type
|
||||
ALUOp.first->Header.Op = IROp;
|
||||
|
||||
|
@ -206,47 +206,38 @@
|
||||
]
|
||||
},
|
||||
"sqrtss xmm0, xmm1": {
|
||||
"ExpectedInstructionCount": 5,
|
||||
"ExpectedInstructionCount": 2,
|
||||
"Optimal": "No",
|
||||
"Comment": "0xf3 0x0f 0x51",
|
||||
"ExpectedArm64ASM": [
|
||||
"movi v0.2d, #0x0",
|
||||
"mov v0.s[0], v17.s[0]",
|
||||
"mov v4.16b, v0.16b",
|
||||
"fsqrt s4, s4",
|
||||
"fsqrt s4, s17",
|
||||
"mov v16.s[0], v4.s[0]"
|
||||
]
|
||||
},
|
||||
"rsqrtss xmm0, xmm1": {
|
||||
"ExpectedInstructionCount": 7,
|
||||
"ExpectedInstructionCount": 4,
|
||||
"Optimal": "No",
|
||||
"Comment": [
|
||||
"FEAT_FPRES could make this more optimal",
|
||||
"0xf3 0x0f 0x52"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"movi v0.2d, #0x0",
|
||||
"mov v0.s[0], v17.s[0]",
|
||||
"mov v4.16b, v0.16b",
|
||||
"fmov s0, #0x70 (1.0000)",
|
||||
"fsqrt s1, s4",
|
||||
"fsqrt s1, s17",
|
||||
"fdiv s4, s0, s1",
|
||||
"mov v16.s[0], v4.s[0]"
|
||||
]
|
||||
},
|
||||
"rcpss xmm0, xmm1": {
|
||||
"ExpectedInstructionCount": 6,
|
||||
"ExpectedInstructionCount": 3,
|
||||
"Optimal": "No",
|
||||
"Comment": [
|
||||
"FEAT_FPRES could make this more optimal",
|
||||
"0xf3 0x0f 0x53"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"movi v0.2d, #0x0",
|
||||
"mov v0.s[0], v17.s[0]",
|
||||
"mov v4.16b, v0.16b",
|
||||
"fmov s0, #0x70 (1.0000)",
|
||||
"fdiv s4, s0, s4",
|
||||
"fdiv s4, s0, s17",
|
||||
"mov v16.s[0], v4.s[0]"
|
||||
]
|
||||
},
|
||||
|
@ -187,15 +187,14 @@
|
||||
]
|
||||
},
|
||||
"sqrtsd xmm0, xmm1": {
|
||||
"ExpectedInstructionCount": 3,
|
||||
"ExpectedInstructionCount": 2,
|
||||
"Optimal": "No",
|
||||
"Comment": [
|
||||
"With AFP mode FEX can remove an insert after the operation.",
|
||||
"0xf2 0x0f 0x51"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"mov v4.8b, v17.8b",
|
||||
"fsqrt d4, d4",
|
||||
"fsqrt d4, d17",
|
||||
"mov v16.d[0], v4.d[0]"
|
||||
]
|
||||
},
|
||||
|
@ -729,7 +729,7 @@
|
||||
]
|
||||
},
|
||||
"vsqrtss xmm0, xmm1, xmm2": {
|
||||
"ExpectedInstructionCount": 11,
|
||||
"ExpectedInstructionCount": 8,
|
||||
"Optimal": "No",
|
||||
"Comment": [
|
||||
"Insert in to first element could be more optimal, which is the common case.",
|
||||
@ -737,9 +737,6 @@
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"mov z4.d, p7/m, z18.d",
|
||||
"movi v0.2d, #0x0",
|
||||
"mov v0.s[0], v4.s[0]",
|
||||
"mov v4.16b, v0.16b",
|
||||
"mov z5.d, p7/m, z17.d",
|
||||
"fsqrt s4, s4",
|
||||
"mov v0.16b, v5.16b",
|
||||
@ -750,7 +747,7 @@
|
||||
]
|
||||
},
|
||||
"vsqrtsd xmm0, xmm1, xmm2": {
|
||||
"ExpectedInstructionCount": 9,
|
||||
"ExpectedInstructionCount": 8,
|
||||
"Optimal": "No",
|
||||
"Comment": [
|
||||
"Insert in to first element could be more optimal, which is the common case.",
|
||||
@ -758,7 +755,6 @@
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"mov z4.d, p7/m, z18.d",
|
||||
"mov v4.8b, v4.8b",
|
||||
"mov z5.d, p7/m, z17.d",
|
||||
"fsqrt d4, d4",
|
||||
"mov v0.16b, v5.16b",
|
||||
@ -800,7 +796,7 @@
|
||||
]
|
||||
},
|
||||
"vrsqrtss xmm0, xmm1, xmm2": {
|
||||
"ExpectedInstructionCount": 13,
|
||||
"ExpectedInstructionCount": 10,
|
||||
"Optimal": "No",
|
||||
"Comment": [
|
||||
"FEAT_FPRES could make this more optimal",
|
||||
@ -808,9 +804,6 @@
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"mov z4.d, p7/m, z18.d",
|
||||
"movi v0.2d, #0x0",
|
||||
"mov v0.s[0], v4.s[0]",
|
||||
"mov v4.16b, v0.16b",
|
||||
"mov z5.d, p7/m, z17.d",
|
||||
"fmov s0, #0x70 (1.0000)",
|
||||
"fsqrt s1, s4",
|
||||
@ -853,7 +846,7 @@
|
||||
]
|
||||
},
|
||||
"vrcpss xmm0, xmm1, xmm2": {
|
||||
"ExpectedInstructionCount": 12,
|
||||
"ExpectedInstructionCount": 9,
|
||||
"Optimal": "No",
|
||||
"Comment": [
|
||||
"FEAT_FPRES could make this more optimal",
|
||||
@ -861,9 +854,6 @@
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"mov z4.d, p7/m, z18.d",
|
||||
"movi v0.2d, #0x0",
|
||||
"mov v0.s[0], v4.s[0]",
|
||||
"mov v4.16b, v0.16b",
|
||||
"mov z5.d, p7/m, z17.d",
|
||||
"fmov s0, #0x70 (1.0000)",
|
||||
"fdiv s4, s0, s4",
|
||||
|
Loading…
x
Reference in New Issue
Block a user