Add support for matching psign & plendvb to the x86 target

Remove unnecessary pandn patterns, 'vnot' patfrag looks through bitcasts

llvm-svn: 122098
This commit is contained in:
Nate Begeman 2010-12-17 22:55:37 +00:00
parent 64c013d79d
commit ef5f3c0fa7
5 changed files with 186 additions and 41 deletions

View File

@ -981,6 +981,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
if (Subtarget->is64Bit())
@ -8870,6 +8871,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PINSRB: return "X86ISD::PINSRB";
case X86ISD::PINSRW: return "X86ISD::PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::PANDN: return "X86ISD::PANDN";
case X86ISD::PSIGNB: return "X86ISD::PSIGNB";
case X86ISD::PSIGNW: return "X86ISD::PSIGNW";
case X86ISD::PSIGND: return "X86ISD::PSIGND";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
@ -11053,6 +11058,36 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
return SDValue();
}
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
// Want to form PANDN nodes, in the hopes of then easily combining them with
// OR and AND nodes to form PBLEND/PSIGN.
EVT VT = N->getValueType(0);
if (VT != MVT::v2i64)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
DebugLoc DL = N->getDebugLoc();
// Check LHS for vnot
if (N0.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
return DAG.getNode(X86ISD::PANDN, DL, VT, N0.getOperand(0), N1);
// Check RHS for vnot
if (N1.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
return DAG.getNode(X86ISD::PANDN, DL, VT, N1.getOperand(0), N0);
return SDValue();
}
static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
@ -11060,12 +11095,101 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = N->getValueType(0);
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// look for psign/blend
if (Subtarget->hasSSSE3()) {
if (VT == MVT::v2i64) {
// Canonicalize pandn to RHS
if (N0.getOpcode() == X86ISD::PANDN)
std::swap(N0, N1);
// or (and (m, x), (pandn m, y))
if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::PANDN) {
SDValue Mask = N1.getOperand(0);
SDValue X = N1.getOperand(1);
SDValue Y;
if (N0.getOperand(0) == Mask)
Y = N0.getOperand(1);
if (N0.getOperand(1) == Mask)
Y = N0.getOperand(0);
// Check to see if the mask appeared in both the AND and PANDN and
if (!Y.getNode())
return SDValue();
// Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
if (Mask.getOpcode() != ISD::BITCAST ||
X.getOpcode() != ISD::BITCAST ||
Y.getOpcode() != ISD::BITCAST)
return SDValue();
// Look through mask bitcast.
Mask = Mask.getOperand(0);
EVT MaskVT = Mask.getValueType();
// Validate that the Mask operand is a vector sra node. The sra node
// will be an intrinsic.
if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
return SDValue();
// FIXME: what to do for bytes, since there is a psignb/pblendvb, but
// there is no psrai.b
switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
case Intrinsic::x86_sse2_psrai_w:
case Intrinsic::x86_sse2_psrai_d:
break;
default: return SDValue();
}
// Check that the SRA is all signbits.
SDValue SraC = Mask.getOperand(2);
unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
if ((SraAmt + 1) != EltBits)
return SDValue();
DebugLoc DL = N->getDebugLoc();
// Now we know we at least have a plendvb with the mask val. See if
// we can form a psignb/w/d.
// psign = x.type == y.type == mask.type && y = sub(0, x);
X = X.getOperand(0);
Y = Y.getOperand(0);
if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){
unsigned Opc = 0;
switch (EltBits) {
case 8: Opc = X86ISD::PSIGNB; break;
case 16: Opc = X86ISD::PSIGNW; break;
case 32: Opc = X86ISD::PSIGND; break;
default: break;
}
if (Opc) {
SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1));
return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign);
}
}
// PBLENDVB only available on SSE 4.1
if (!Subtarget->hasSSE41())
return SDValue();
unsigned IID = Intrinsic::x86_sse41_pblendvb;
X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X);
Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y);
Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask);
Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
DAG.getConstant(IID, MVT::i32), X, Y, Mask);
return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask);
}
}
}
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
@ -11116,7 +11240,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
DAG.getNode(ISD::TRUNCATE, DL,
MVT::i8, ShAmt0));
}
return SDValue();
}
@ -11334,6 +11458,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case X86ISD::FXOR:

View File

@ -159,7 +159,13 @@ namespace llvm {
/// PSHUFB - Shuffle 16 8-bit values within a vector.
PSHUFB,
/// PANDN - and with not'd value.
PANDN,
/// PSIGNB/W/D - Copy integer sign.
PSIGNB, PSIGNW, PSIGND,
/// FMAX, FMIN - Floating point max and min.
///
FMAX, FMIN,

View File

@ -43,6 +43,18 @@ def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
def X86pshufb : SDNode<"X86ISD::PSHUFB",
SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
def X86pandn : SDNode<"X86ISD::PANDN",
SDTypeProfile<1, 2, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
def X86psignb : SDNode<"X86ISD::PSIGNB",
SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
def X86psignw : SDNode<"X86ISD::PSIGNW",
SDTypeProfile<1, 2, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
def X86psignd : SDNode<"X86ISD::PSIGND",
SDTypeProfile<1, 2, [SDTCisVT<0, v4i32>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
def X86pextrb : SDNode<"X86ISD::PEXTRB",
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
def X86pextrw : SDNode<"X86ISD::PEXTRW",

View File

@ -1584,19 +1584,13 @@ defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
let isCommutable = 0 in
defm ANDN : sse12_fp_packed_logical<0x55, "andn", undef /* dummy */, 1, [
// single r+r
[(set VR128:$dst, (v2i64 (and (xor VR128:$src1,
(bc_v2i64 (v4i32 immAllOnesV))),
VR128:$src2)))],
[(set VR128:$dst, (X86pandn VR128:$src1, VR128:$src2))],
// double r+r
[(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
(bc_v2i64 (v2f64 VR128:$src2))))],
[],
// single r+m
[(set VR128:$dst, (v2i64 (and (xor (bc_v2i64 (v4f32 VR128:$src1)),
(bc_v2i64 (v4i32 immAllOnesV))),
(memopv2i64 addr:$src2))))],
[(set VR128:$dst, (X86pandn VR128:$src1, (memopv2i64 addr:$src2)))],
// double r+m
[(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
(memopv2i64 addr:$src2)))]]>;
[]]>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
@ -2536,15 +2530,11 @@ let ExeDomain = SSEPackedInt in {
}
def PANDNrr : PDI<0xDF, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"pandn\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
VR128:$src2)))]>;
"pandn\t{$src2, $dst|$dst, $src2}", []>;
def PANDNrm : PDI<0xDF, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"pandn\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
(memopv2i64 addr:$src2))))]>;
"pandn\t{$src2, $dst|$dst, $src2}", []>;
}
} // Constraints = "$src1 = $dst"
@ -3608,6 +3598,13 @@ def : Pat<(X86pshufb VR128:$src, VR128:$mask),
def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
(PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>;
def : Pat<(X86psignb VR128:$src1, VR128:$src2),
(PSIGNBrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
def : Pat<(X86psignw VR128:$src1, VR128:$src2),
(PSIGNWrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
def : Pat<(X86psignd VR128:$src1, VR128:$src2),
(PSIGNDrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//
@ -3896,27 +3893,6 @@ def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
(MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
// Some special case pandn patterns.
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
VR128:$src2)),
(PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
VR128:$src2)),
(PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
VR128:$src2)),
(PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
(memop addr:$src2))),
(PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
(memop addr:$src2))),
(PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
(memop addr:$src2))),
(PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
// vector -> vector casts
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
(Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>;

View File

@ -0,0 +1,26 @@
; RUN: llc < %s -mcpu=nehalem | FileCheck %s
define <4 x i32> @psignd(<4 x i32> %a, <4 x i32> %b) nounwind ssp {
entry:
; CHECK: psignd
; CHECK-NOT: sub
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = and <4 x i32> %a, %0
%2 = and <4 x i32> %b.lobit, %sub
%cond = or <4 x i32> %1, %2
ret <4 x i32> %cond
}
define <4 x i32> @pblendvb(<4 x i32> %b, <4 x i32> %a, <4 x i32> %c) nounwind ssp {
entry:
; CHECK: pblendvb
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = and <4 x i32> %c, %0
%2 = and <4 x i32> %a, %b.lobit
%cond = or <4 x i32> %1, %2
ret <4 x i32> %cond
}