AVX512F: FMA intrinsic + FNEG - sequence optimization
The previous commit (r280368 - https://reviews.llvm.org/D23313) does not cover the AVX-512F (KNL) subtarget. There, FNEG(x) is lowered to (bitcast (vpxor (bitcast x), (bitcast constfp(0x80000000)))), because FP XOR is not supported for 512-bit data types on KNL and an integer XOR is used instead. I added a pattern match for the integer XOR form.

Differential Revision: https://reviews.llvm.org/D24221

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@280785 91177308-0d34-0410-b5e6-96231b3b80d8
parent 1da68f6209
commit 78405002cf
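To illustrate the pattern described in the message above, here is a minimal sketch, not part of the commit: the function name, mask/rounding arguments, and the expected instruction are assumptions modeled on the tests touched below. Negating the result of a 512-bit FMA intrinsic on a KNL target goes through the integer-XOR form of FNEG, which the patched isFNEG() now recognizes through the bitcasts, so the negation is expected to fold into a single negated FMA instead of a separate sign-bit xor:

; Illustrative only -- modeled on the tests in this patch; the function name
; and the expected output are assumptions, not taken verbatim from the commit.
; Hypothetical RUN line: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+fma
define <8 x double> @fneg_of_fmadd512(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
entry:
  ; fma(a, b, c): full mask (i8 -1), current rounding mode (i32 4)
  %fma = tail call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 -1, i32 4)
  ; FNEG of the FMA result; on KNL this is lowered to a 512-bit integer xor
  ; with the sign-bit constant, so with this patch the combine is expected to
  ; produce a single negated FMA (e.g. vfnmsub213pd) rather than xor + fma.
  %neg = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fma
  ret <8 x double> %neg
}

declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)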
@@ -29233,28 +29233,6 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
 }
 
-static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
-                          TargetLowering::DAGCombinerInfo &DCI,
-                          const X86Subtarget &Subtarget) {
-  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
-    return Cmp;
-
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
-    return RV;
-
-  if (Subtarget.hasCMov())
-    if (SDValue RV = combineIntegerAbs(N, DAG))
-      return RV;
-
-  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
-    return FPLogic;
-
-  return SDValue();
-}
-
 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
 /// X86ISD::AVG instruction.
@@ -30363,12 +30341,68 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   return combineVectorTruncation(N, DAG, Subtarget);
 }
 
+/// Returns the negated value if the node \p N flips sign of FP value.
+///
+/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
+/// AVX512F does not have FXOR, so FNEG is lowered as
+/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
+/// In this case we go through all bitcasts.
+static SDValue isFNEG(SDNode *N) {
+  if (N->getOpcode() == ISD::FNEG)
+    return N->getOperand(0);
+
+  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
+  if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
+    return SDValue();
+
+  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
+  if (!Op1.getValueType().isFloatingPoint())
+    return SDValue();
+
+  SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
+
+  unsigned EltBits = Op1.getValueType().getScalarSizeInBits();
+  auto isSignBitValue = [&](const ConstantFP *C) {
+    return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
+  };
+
+  // There is more than one way to represent the same constant on
+  // the different X86 targets. The type of the node may also depend on size.
+  //  - load scalar value and broadcast
+  //  - BUILD_VECTOR node
+  //  - load from a constant pool.
+  // We check all variants here.
+  if (Op1.getOpcode() == X86ISD::VBROADCAST) {
+    if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
+      if (isSignBitValue(cast<ConstantFP>(C)))
+        return Op0;
+
+  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
+    if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
+      if (isSignBitValue(CN->getConstantFPValue()))
+        return Op0;
+
+  } else if (auto *C = getTargetConstantFromNode(Op1)) {
+    if (C->getType()->isVectorTy()) {
+      if (auto *SplatV = C->getSplatValue())
+        if (isSignBitValue(cast<ConstantFP>(SplatV)))
+          return Op0;
+    } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
+      if (isSignBitValue(FPConst))
+        return Op0;
+  }
+  return SDValue();
+}
+
 /// Do target-specific dag combines on floating point negations.
 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
-  EVT VT = N->getValueType(0);
+  EVT OrigVT = N->getValueType(0);
+  SDValue Arg = isFNEG(N);
+  assert(Arg.getNode() && "N is expected to be an FNEG node");
+
+  EVT VT = Arg.getValueType();
   EVT SVT = VT.getScalarType();
-  SDValue Arg = N->getOperand(0);
   SDLoc DL(N);
 
   // Let legalize expand this if it isn't a legal type yet.
@@ -30381,40 +30415,30 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
       Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
-    return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
-                       Arg.getOperand(1), Zero);
+    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                                  Arg.getOperand(1), Zero);
+    return DAG.getBitcast(OrigVT, NewNode);
   }
 
   // If we're negating a FMA node, then we can adjust the
   // instruction to include the extra negation.
+  unsigned NewOpcode = 0;
   if (Arg.hasOneUse()) {
     switch (Arg.getOpcode()) {
-    case X86ISD::FMADD:
-      return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
-                         Arg.getOperand(1), Arg.getOperand(2));
-    case X86ISD::FMSUB:
-      return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
-                         Arg.getOperand(1), Arg.getOperand(2));
-    case X86ISD::FNMADD:
-      return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
-                         Arg.getOperand(1), Arg.getOperand(2));
-    case X86ISD::FNMSUB:
-      return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
-                         Arg.getOperand(1), Arg.getOperand(2));
-    case X86ISD::FMADD_RND:
-      return DAG.getNode(X86ISD::FNMSUB_RND, DL, VT, Arg.getOperand(0),
-                         Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
-    case X86ISD::FMSUB_RND:
-      return DAG.getNode(X86ISD::FNMADD_RND, DL, VT, Arg.getOperand(0),
-                         Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
-    case X86ISD::FNMADD_RND:
-      return DAG.getNode(X86ISD::FMSUB_RND, DL, VT, Arg.getOperand(0),
-                         Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
-    case X86ISD::FNMSUB_RND:
-      return DAG.getNode(X86ISD::FMADD_RND, DL, VT, Arg.getOperand(0),
-                         Arg.getOperand(1), Arg.getOperand(2), Arg.getOperand(3));
+    case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
+    case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
+    case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
+    case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
+    case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
+    case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
     }
   }
+  if (NewOpcode)
+    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
+                                              Arg.getNode()->ops()));
+
   return SDValue();
 }
 
@@ -30442,42 +30466,28 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// Returns true if the node \p N is FNEG(x) or FXOR (x, 0x80000000).
-bool isFNEG(const SDNode *N) {
-  if (N->getOpcode() == ISD::FNEG)
-    return true;
-
-  if (N->getOpcode() == X86ISD::FXOR) {
-    unsigned EltBits = N->getSimpleValueType(0).getScalarSizeInBits();
-    SDValue Op1 = N->getOperand(1);
-
-    auto isSignBitValue = [&](const ConstantFP *C) {
-      return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
-    };
-
-    // There is more than one way to represent the same constant on
-    // the different X86 targets. The type of the node may also depend on size.
-    //  - load scalar value and broadcast
-    //  - BUILD_VECTOR node
-    //  - load from a constant pool.
-    // We check all variants here.
-    if (Op1.getOpcode() == X86ISD::VBROADCAST) {
-      if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
-        return isSignBitValue(cast<ConstantFP>(C));
-
-    } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
-      if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
-        return isSignBitValue(CN->getConstantFPValue());
-
-    } else if (auto *C = getTargetConstantFromNode(Op1)) {
-      if (C->getType()->isVectorTy()) {
-        if (auto *SplatV = C->getSplatValue())
-          return isSignBitValue(cast<ConstantFP>(SplatV));
-      } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
-        return isSignBitValue(FPConst);
-    }
-  }
-  return false;
+static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
+                          TargetLowering::DAGCombinerInfo &DCI,
+                          const X86Subtarget &Subtarget) {
+  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
+    return Cmp;
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
+    return RV;
+
+  if (Subtarget.hasCMov())
+    if (SDValue RV = combineIntegerAbs(N, DAG))
+      return RV;
+
+  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+    return FPLogic;
+
+  if (isFNEG(N))
+    return combineFneg(N, DAG, Subtarget);
+  return SDValue();
 }
 
 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
@@ -30907,18 +30917,20 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
   SDValue B = N->getOperand(1);
   SDValue C = N->getOperand(2);
 
-  bool NegA = isFNEG(A.getNode());
-  bool NegB = isFNEG(B.getNode());
-  bool NegC = isFNEG(C.getNode());
+  auto invertIfNegative = [](SDValue &V) {
+    if (SDValue NegVal = isFNEG(V.getNode())) {
+      V = NegVal;
+      return true;
+    }
+    return false;
+  };
+
+  bool NegA = invertIfNegative(A);
+  bool NegB = invertIfNegative(B);
+  bool NegC = invertIfNegative(C);
 
   // Negative multiplication when NegA xor NegB
   bool NegMul = (NegA != NegB);
-  if (NegA)
-    A = A.getOperand(0);
-  if (NegB)
-    B = B.getOperand(0);
-  if (NegC)
-    C = C.getOperand(0);
 
   unsigned NewOpcode;
   if (!NegMul)
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
 
 ; This test checks combinations of FNEG and FMA intrinsics on AVX-512 target
 ; PR28892
@@ -88,11 +89,18 @@ entry:
 }
 
 define <8 x float> @test8(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
-; CHECK-LABEL: test8:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm2, %ymm2
-; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; SKX-LABEL: test8:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; SKX-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test8:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm3
+; KNL-NEXT: vxorps %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
+; KNL-NEXT: retq
 entry:
   %sub.c = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
   %0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %sub.c) #2
@@ -115,22 +123,9 @@ entry:
 
 declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8, i32)
 
-define <4 x double> @test10(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
+define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test10:
 ; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: retq
-entry:
-  %0 = tail call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 -1) #2
-  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %0
-  ret <4 x double> %sub.i
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8)
-
-define <2 x double> @test11(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: test11:
-; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq