Merge pull request #3692 from pmatos/AFP_RPRES_fix

Fixes AFP.NEP handling on scalar insertions
Ryan Houdek 2024-06-19 19:23:49 -07:00 committed by GitHub
commit da21ee3cda
11 changed files with 95 additions and 68 deletions

View File

@@ -224,16 +224,18 @@ DEF_FBINOP_SCALAR_INSERT(VFSubScalarInsert, fsub)
DEF_FBINOP_SCALAR_INSERT(VFMulScalarInsert, fmul)
DEF_FBINOP_SCALAR_INSERT(VFDivScalarInsert, fdiv)
+// VFScalarOperation performs the operation described through ScalarEmit between Vector1 and Vector2,
+// storing the result in Dst. This is a scalar operation, so only the lowest element of each vector is used.
+// The untouched bits of the destination come from Vector1, unless it's a 256-bit vector
+// and ZeroUpperBits is true, in which case the upper bits are zeroed.
void Arm64JITCore::VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit,
ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2) {
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
-if (!Is256Bit) {
-  LOGMAN_THROW_A_FMT(ZeroUpperBits == false, "128-bit operation doesn't support ZeroUpperBits in {}", __func__);
-}
+LOGMAN_THROW_A_FMT(Is256Bit || !ZeroUpperBits, "128-bit operation doesn't support ZeroUpperBits in {}", __func__);
// Bit of a tricky detail.
-// The upper bits of the destination comes from the first source.
+// The upper bits of the destination come from Vector1.
LOGMAN_THROW_AA_FMT(ElementSize == 2 || ElementSize == 4 || ElementSize == 8, "Invalid size");
const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == 2 ? ARMEmitter::SubRegSize::i16Bit :
ElementSize == 4 ? ARMEmitter::SubRegSize::i32Bit :
@@ -261,8 +263,8 @@ void Arm64JITCore::VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool Z
ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
}
}
-} else if (Dst != Vector2) {
-  if (!ZeroUpperBits && Is256Bit) {
+} else if (Dst != Vector2) { // Dst different from both Vector1 and Vector2
+  if (Is256Bit && !ZeroUpperBits) {
mov(Dst.Z(), Vector1.Z());
} else {
mov(Dst.Q(), Vector1.Q());
@@ -279,36 +281,30 @@ void Arm64JITCore::VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool Z
ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
}
}
-} else {
-  // Destination intersects Vector2, can't do anything optimal in this case.
-  // Do the scalar operation first and then move and insert.
+} else { // Dst same as Vector2
ScalarEmit(VTMP1, Vector1, Vector2);
if (!ZeroUpperBits && Is256Bit) {
mov(Dst.Z(), Vector1.Z());
-} else {
-  mov(Dst.Q(), Vector1.Q());
-}
-if (!ZeroUpperBits && Is256Bit) {
ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1);
mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z());
} else {
+mov(Dst.Q(), Vector1.Q());
ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
}
}
}
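
For reference, the contract all three aliasing branches above implement can be modeled in plain C++ (a minimal sketch with illustrative names, not the emitter itself; the branches only differ in how they avoid clobbering a source that aliases Dst):

#include <array>
#include <cstddef>

// Model of VFScalarOperation's result: lane 0 gets the scalar result,
// every other lane comes from Vector1 (or is zeroed for AVX-style ops).
template <std::size_t N, typename Op>
std::array<float, N> ScalarInsertModel(Op ScalarOp, const std::array<float, N>& Vector1,
                                       const std::array<float, N>& Vector2, bool ZeroUpperBits) {
  std::array<float, N> Dst{};
  Dst[0] = ScalarOp(Vector1[0], Vector2[0]);
  for (std::size_t i = 1; i < N; ++i) {
    Dst[i] = ZeroUpperBits ? 0.0f : Vector1[i];
  }
  return Dst;
}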
+// Similar to VFScalarOperation, this performs the operation described through ScalarEmit, operating only on Vector2.
+// However, the result of the scalar operation is inserted into Vector1 and moved to the destination.
+// The untouched bits of the destination come from Vector1, unless it's a 256-bit vector
+// and ZeroUpperBits is true, in which case the upper bits are zeroed.
void Arm64JITCore::VFScalarUnaryOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit,
ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1,
std::variant<ARMEmitter::VRegister, ARMEmitter::Register> Vector2) {
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
-if (!Is256Bit) {
-  LOGMAN_THROW_A_FMT(ZeroUpperBits == false, "128-bit operation doesn't support ZeroUpperBits in {}", __func__);
-}
-// Bit of a tricky detail.
-// The upper bits of the destination comes from the first source.
+LOGMAN_THROW_A_FMT(Is256Bit || !ZeroUpperBits, "128-bit operation doesn't support ZeroUpperBits in {}", __func__);
LOGMAN_THROW_AA_FMT(ElementSize == 2 || ElementSize == 4 || ElementSize == 8, "Invalid size");
const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == 2 ? ARMEmitter::SubRegSize::i16Bit :
@@ -327,7 +323,7 @@ void Arm64JITCore::VFScalarUnaryOperation(uint8_t OpSize, uint8_t ElementSize, b
mov(Dst.Q(), Vector1.Q());
}
-if (HostSupportsAFP) {
+if (HostSupportsAFP) { // or Dst (here Dst == Vector1)
// If the host CPU supports AFP then scalar does an insert without modifying upper bits.
ScalarEmit(Dst, Vector2);
} else {
@@ -366,14 +362,10 @@ void Arm64JITCore::VFScalarUnaryOperation(uint8_t OpSize, uint8_t ElementSize, b
if (!ZeroUpperBits && Is256Bit) {
mov(Dst.Z(), Vector1.Z());
-} else {
-  mov(Dst.Q(), Vector1.Q());
-}
-if (!ZeroUpperBits && Is256Bit) {
ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1);
mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z());
} else {
+mov(Dst.Q(), Vector1.Q());
ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
}
}
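
The folded sequence relies on SVE predication: "ptrue p0.d, vl1" activates only element 0, so the merging "mov" writes just that lane and leaves the rest of the 256-bit register intact, which is what makes the earlier unconditional 128-bit copy redundant. A rough model, assuming four 64-bit lanes:

#include <array>
#include <cstdint>

using ZReg = std::array<uint64_t, 4>; // a 256-bit Z register as 64-bit lanes

// Model of "ptrue p0.d, vl1" followed by "mov zd.d, p0/m, zs.d": only the
// single active element is copied; inactive lanes merge from the destination.
ZReg PredicatedLane0Move(ZReg Zd, const ZReg& Zs) {
  Zd[0] = Zs[0];
  return Zd;
}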
@@ -457,12 +449,17 @@ DEF_OP(VFRSqrtScalarInsert) {
fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f);
fsqrt(SubRegSize.Scalar, VTMP2, Src);
-fdiv(SubRegSize.Scalar, Dst, VTMP1, VTMP2);
+if (HostSupportsAFP) {
+  fdiv(SubRegSize.Scalar, VTMP1, VTMP1, VTMP2);
+  ins(SubRegSize.Vector, Dst, 0, VTMP1, 0);
+} else {
+  fdiv(SubRegSize.Scalar, Dst, VTMP1, VTMP2);
+}
};
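
The new branch matters because the scalar-insert helpers assume that, on an AFP host, ScalarEmit writes only the lowest element of Dst. This emulated rsqrt is a multi-instruction sequence whose final fdiv would clobber Dst's upper bits, so on AFP hosts it lands in VTMP1 and is inserted explicitly. A sketch of the intended semantics (illustrative C++, not the emitter):

#include <array>
#include <cmath>

using Vec4f = std::array<float, 4>;

// rsqrtss-style semantics: lane 0 becomes 1.0 / sqrt(src lane 0),
// lanes 1..3 of the destination are preserved.
Vec4f RSqrtScalarInsertModel(Vec4f Dst, const Vec4f& Src) {
  const float Tmp = 1.0f / std::sqrt(Src[0]); // fmov + fsqrt + fdiv into VTMP1
  Dst[0] = Tmp;                               // ins vd.s[0], vtmp.s[0]
  return Dst;
}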
auto ScalarEmitRPRES = [this, SubRegSize](ARMEmitter::VRegister Dst, std::variant<ARMEmitter::VRegister, ARMEmitter::Register> SrcVar) {
auto Src = *std::get_if<ARMEmitter::VRegister>(&SrcVar);
-frsqrte(SubRegSize.Scalar, Dst.S(), Src.S());
+frsqrte(SubRegSize.Scalar, Dst.D(), Src.D());
};
std::array<ScalarUnaryOpCaller, 2> Handlers = {
@@ -590,7 +587,28 @@ DEF_OP(VSToFVectorInsert) {
// Claim the element size is 8-bytes.
// Might be scalar 8-byte (cvtsi2ss xmm0, rax)
// Might be vector i32v2 (cvtpi2ps xmm0, mm0)
-VFScalarUnaryOperation(IROp->Size, ElementSize * (HasTwoElements ? 2 : 1), Op->ZeroUpperBits, ScalarEmit, Dst, Vector1, Vector2);
+if (!HasTwoElements) {
+  VFScalarUnaryOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, ScalarEmit, Dst, Vector1, Vector2);
+  return;
+}
+// Handle the odd case where this is actually a vector operation rather than a scalar one.
+const auto Is256Bit = IROp->Size == Core::CPUState::XMM_AVX_REG_SIZE;
+constexpr auto Predicate = ARMEmitter::PReg::p0;
+ScalarEmit(VTMP1, Vector2);
+if (!Op->ZeroUpperBits && Is256Bit) {
+  if (Dst != Vector1) {
+    mov(Dst.Z(), Vector1.Z());
+  }
+  ptrue(ARMEmitter::SubRegSize::i64Bit, Predicate, ARMEmitter::PredicatePattern::SVE_VL1);
+  mov(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Predicate.Merging(), VTMP1.Z());
+} else {
+  if (Dst != Vector1) {
+    mov(Dst.Q(), Vector1.Q());
+  }
+  ins(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), 0, VTMP1.Q(), 0);
+}
}
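
The HasTwoElements path models cvtpi2ps: two int32 elements are converted and the packed 64-bit result replaces only the low lane of the destination. In plain C++ (a sketch with illustrative names):

#include <array>
#include <cstdint>

// Convert two int32s into lanes 0..1 while preserving lanes 2..3,
// matching the "ins vd.d[0], vtmp.d[0]" that follows the scvtf.
std::array<float, 4> SToFVectorInsertModel(std::array<float, 4> Dst, const std::array<int32_t, 2>& Src) {
  Dst[0] = static_cast<float>(Src[0]); // scvtf vtmp.2s, vsrc.2s
  Dst[1] = static_cast<float>(Src[1]);
  return Dst;
}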
DEF_OP(VSToFGPRInsert) {
@@ -679,11 +697,11 @@ DEF_OP(VFCMPScalarInsert) {
auto ScalarEmitEQ = [this, SubRegSize](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) {
switch (SubRegSize.Scalar) {
case ARMEmitter::ScalarRegSize::i16Bit: {
-fcmeq(Dst.H(), Src1.H(), Src2.H());
+fcmeq(Dst.H(), Src2.H(), Src1.H());
break;
}
case ARMEmitter::ScalarRegSize::i32Bit:
-case ARMEmitter::ScalarRegSize::i64Bit: fcmeq(SubRegSize.Scalar, Dst, Src1, Src2); break;
+case ARMEmitter::ScalarRegSize::i64Bit: fcmeq(SubRegSize.Scalar, Dst, Src2, Src1); break;
default: break;
}
};
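
Swapping the sources is safe (floating-point equality is commutative, so lane 0 is unchanged), and it is the actual AFP.NEP fix: per our reading of FEAT_AFP, with FPCR.NEP set a scalar FCMEQ merges the untouched upper elements of the destination from the second source register, so that operand must be the one whose upper bits the x86 semantics preserve. A hedged model of that assumption:

#include <array>
#include <bit>
#include <cstdint>

using Vec4f = std::array<float, 4>;

// Model of scalar FCMEQ with FPCR.NEP set, assuming the upper elements
// merge from the second source (Src2). Swapping sources therefore selects
// whose upper bits survive without changing the lane 0 compare result.
Vec4f FcmeqNepModel(const Vec4f& Src1, const Vec4f& Src2) {
  Vec4f Dst = Src2;
  Dst[0] = (Src1[0] == Src2[0]) ? std::bit_cast<float>(UINT32_C(0xFFFFFFFF)) : 0.0f;
  return Dst;
}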
@@ -748,11 +766,11 @@ DEF_OP(VFCMPScalarInsert) {
[this, SubRegSize, ZeroUpperBits, Is256Bit](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) {
switch (SubRegSize.Scalar) {
case ARMEmitter::ScalarRegSize::i16Bit: {
-fcmeq(VTMP1.H(), Src1.H(), Src2.H());
+fcmeq(VTMP1.H(), Src2.H(), Src1.H());
break;
}
case ARMEmitter::ScalarRegSize::i32Bit:
-case ARMEmitter::ScalarRegSize::i64Bit: fcmeq(SubRegSize.Scalar, VTMP1, Src1, Src2); break;
+case ARMEmitter::ScalarRegSize::i64Bit: fcmeq(SubRegSize.Scalar, VTMP1, Src2, Src1); break;
default: break;
}
// If the destination is a temporary then it is going to do an insert after the operation.

View File

@@ -10,23 +10,27 @@
},
"Instructions": {
"cvtpi2ps xmm0, [rax]": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 4,
"Comment": [
"0x0f 0x2a"
],
"ExpectedArm64ASM": [
"ldr d2, [x4]",
"scvtf v16.2s, v2.2s"
"scvtf v0.2s, v2.2s",
"ptrue p0.d, vl1",
"mov z16.d, p0/m, z0.d"
]
},
"cvtpi2ps xmm0, mm0": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 4,
"Comment": [
"0x0f 0x2a"
],
"ExpectedArm64ASM": [
"ldr d2, [x28, #1040]",
"scvtf v16.2s, v2.2s"
"scvtf v0.2s, v2.2s",
"ptrue p0.d, vl1",
"mov z16.d, p0/m, z0.d"
]
}
}
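
The count increase is the price of correctness here: converting straight into v16 clobbered the preserved portion of xmm0, and with a 256-bit register file that portion includes the AVX upper lane. What the expected sequence computes, modeled in C++ (illustrative, four 64-bit lanes):

#include <array>
#include <cstdint>
#include <cstring>

// cvtpi2ps semantics: convert two int32s, write only bits [63:0] of the
// destination, and keep bits [255:64] (lanes 1..3 here) untouched.
std::array<uint64_t, 4> Cvtpi2psModel(std::array<uint64_t, 4> Xmm0, int32_t Lo, int32_t Hi) {
  const float F[2] = {static_cast<float>(Lo), static_cast<float>(Hi)}; // scvtf v0.2s, v2.2s
  std::memcpy(&Xmm0[0], F, sizeof(F)); // ptrue p0.d, vl1 + merging mov into z16
  return Xmm0;
}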

View File

@@ -46,7 +46,7 @@
]
},
"rsqrtss xmm0, xmm1": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 4,
"Comment": [
"FEAT_FPRES could make this more optimal",
"0xf3 0x0f 0x52"
@@ -54,7 +54,8 @@
"ExpectedArm64ASM": [
"fmov s0, #0x70 (1.0000)",
"fsqrt s1, s17",
"fdiv s16, s0, s1"
"fdiv s0, s0, s1",
"mov v16.s[0], v0.s[0]"
]
},
"rcpss xmm0, xmm1": {
@@ -143,7 +144,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s16, s16, s17"
"fcmeq s16, s17, s16"
]
},
"cmpss xmm0, xmm1, 1": {
@@ -184,7 +185,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s0, s16, s17",
"fcmeq s0, s17, s16",
"mvn v0.8b, v0.8b",
"ptrue p0.s, vl1",
"mov z16.s, p0/m, z0.s"

View File

@@ -135,7 +135,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d16, d16, d17"
"fcmeq d16, d17, d16"
]
},
"cmpsd xmm0, xmm1, 1": {
@@ -176,7 +176,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d0, d16, d17",
"fcmeq d0, d17, d16",
"mvn v0.8b, v0.8b",
"ptrue p0.d, vl1",
"mov z16.d, p0/m, z0.d"

View File

@@ -11,23 +11,25 @@
},
"Instructions": {
"cvtpi2ps xmm0, [rax]": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 3,
"Comment": [
"0x0f 0x2a"
],
"ExpectedArm64ASM": [
"ldr d2, [x4]",
"scvtf v16.2s, v2.2s"
"scvtf v0.2s, v2.2s",
"mov v16.d[0], v0.d[0]"
]
},
"cvtpi2ps xmm0, mm0": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 3,
"Comment": [
"0x0f 0x2a"
],
"ExpectedArm64ASM": [
"ldr d2, [x28, #1040]",
"scvtf v16.2s, v2.2s"
"scvtf v0.2s, v2.2s",
"mov v16.d[0], v0.d[0]"
]
}
}

View File

@@ -47,7 +47,7 @@
]
},
"rsqrtss xmm0, xmm1": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 4,
"Comment": [
"FEAT_FPRES could make this more optimal",
"0xf3 0x0f 0x52"
@@ -55,7 +55,8 @@
"ExpectedArm64ASM": [
"fmov s0, #0x70 (1.0000)",
"fsqrt s1, s17",
"fdiv s16, s0, s1"
"fdiv s0, s0, s1",
"mov v16.s[0], v0.s[0]"
]
},
"rcpss xmm0, xmm1": {
@@ -144,7 +145,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s16, s16, s17"
"fcmeq s16, s17, s16"
]
},
"cmpss xmm0, xmm1, 1": {
@@ -184,7 +185,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s0, s16, s17",
"fcmeq s0, s17, s16",
"mvn v0.8b, v0.8b",
"mov v16.s[0], v0.s[0]"
]

View File

@@ -136,7 +136,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d16, d16, d17"
"fcmeq d16, d17, d16"
]
},
"cmpsd xmm0, xmm1, 1": {
@@ -176,7 +176,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d0, d16, d17",
"fcmeq d0, d17, d16",
"mvn v0.8b, v0.8b",
"mov v16.d[0], v0.d[0]"
]

View File

@@ -30,7 +30,7 @@
]
},
"vrsqrtss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"ExpectedInstructionCount": 5,
"Comment": [
"FEAT_FPRES could make this more optimal",
"Map 1 0b10 0x52 128-bit"
@@ -39,7 +39,8 @@
"mov v16.16b, v17.16b",
"fmov s0, #0x70 (1.0000)",
"fsqrt s1, s18",
"fdiv s16, s0, s1"
"fdiv s0, s0, s1",
"mov v16.s[0], v0.s[0]"
]
},
"vrcpss xmm0, xmm1, xmm2": {
@@ -61,7 +62,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq s16, s17, s18"
"fcmeq s16, s18, s17"
]
},
"vcmpss xmm0, xmm1, xmm2, 0x01": {
@@ -105,7 +106,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq s0, s17, s18",
"fcmeq s0, s18, s17",
"mvn v0.8b, v0.8b",
"mov v16.s[0], v0.s[0]"
]
@@ -154,7 +155,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq d16, d17, d18"
"fcmeq d16, d18, d17"
]
},
"vcmpsd xmm0, xmm1, xmm2, 0x01": {
@@ -198,7 +199,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq d0, d17, d18",
"fcmeq d0, d18, d17",
"mvn v0.8b, v0.8b",
"mov v16.d[0], v0.d[0]"
]

View File

@@ -525,7 +525,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s0, s16, s17",
"fcmeq s0, s17, s16",
"mov v16.s[0], v0.s[0]"
]
},
@@ -568,7 +568,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s0, s16, s17",
"fcmeq s0, s17, s16",
"mvn v0.8b, v0.8b",
"mov v16.s[0], v0.s[0]"
]

View File

@@ -366,7 +366,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d0, d16, d17",
"fcmeq d0, d17, d16",
"mov v16.d[0], v0.d[0]"
]
},
@@ -409,7 +409,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d0, d16, d17",
"fcmeq d0, d17, d16",
"mvn v0.8b, v0.8b",
"mov v16.d[0], v0.d[0]"
]

View File

@@ -2394,7 +2394,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq s0, s17, s18",
"fcmeq s0, s18, s17",
"mov v16.s[0], v0.s[0]"
]
},
@@ -2441,7 +2441,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq s0, s17, s18",
"fcmeq s0, s18, s17",
"mvn v0.8b, v0.8b",
"mov v16.s[0], v0.s[0]"
]
@@ -2490,7 +2490,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq d0, d17, d18",
"fcmeq d0, d18, d17",
"mov v16.d[0], v0.d[0]"
]
},
@@ -2537,7 +2537,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq d0, d17, d18",
"fcmeq d0, d18, d17",
"mvn v0.8b, v0.8b",
"mov v16.d[0], v0.d[0]"
]