Add wider vector/integer support for PR12312

- Enhance the fix to PR12312 to support wider integers, such as a 256-bit
  integer. If more than one fully evaluated vector is found, OR them together
  (POR) first, followed by the final PTEST.

llvm-svn: 163832
This commit is contained in:
Michael Liao 2012-09-13 20:24:54 +00:00
parent 7c620b0d5f
commit 0c0da113c5
3 changed files with 223 additions and 113 deletions

View File

@ -8347,6 +8347,98 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
}
/// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able, i.e. it
/// is a tree of EXTRACT_VECTOR_ELTs (with constant indices) OR'd together that
/// covers every lane of one or more 128-/256-bit vectors. If so, lower the
/// whole tree to a single PTEST producing EFLAGS (OR'ing the source vectors
/// together first when there is more than one). Returns a null SDValue when
/// the pattern does not match.
SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  // PTEST is only available from SSE4.1 onwards.
  if (!Subtarget->hasSSE41())
    return SDValue();

  // The OR must feed only this zero-test; otherwise its integer value is
  // still needed and cannot be replaced by a flags-only PTEST.
  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  DebugLoc DL = N->getDebugLoc();

  SmallVector<SDValue, 8> Opnds;
  // Maps each source vector to a bitmask of the element indices extracted
  // from it, so we can later check that every lane is covered.
  DenseMap<SDValue, unsigned> VecInMap;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is casted into wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if a non-EXTRACT_VECTOR_ELT
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if without a constant index.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type as the vectors seen so far.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
    }
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  // Fixed: the original message claimed "128-bit" only, but 256-bit vectors
  // are explicitly accepted above.
  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  // NOTE(review): 1U << NumElements is undefined for vectors with 32+
  // elements (e.g. v32i8); current callers extract i64-sized lanes, but
  // confirm if wider element counts can ever reach here.
  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
  SmallVector<SDValue, 8> VecIns;

  for (DenseMap<SDValue, unsigned>::const_iterator
        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
    VecIns.push_back(I->first);
  }

  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);

  // If more than one full vector is evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is only
    // 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }

  // PTEST sets ZF when (src & src) == 0, i.e. the whole vector is zero.
  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
                     VecIns.back(), VecIns.back());
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
@ -8486,9 +8578,17 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
switch (ArithOp.getOpcode()) {
default: llvm_unreachable("unexpected operator!");
case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::OR: Opcode = X86ISD::OR; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: {
if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
if (EFLAGS.getNode())
return EFLAGS;
}
Opcode = X86ISD::OR;
break;
}
}
NumOperands = 2;
@ -14205,84 +14305,6 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
return SDValue();
}
/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS
/// updated. If only flag result is used and the result is evaluated from a
/// series of element extraction, try to combine it into a PTEST.
/// Returns a null SDValue when the pattern does not match; otherwise returns
/// the PTEST node whose flags replace the OR's flags.
static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDNode *N = Or.getNode();
DebugLoc DL = N->getDebugLoc();
// Only SSE4.1 and beyond supports PTEST or like.
if (!Subtarget->hasSSE41())
return SDValue();
if (N->getOpcode() != X86ISD::OR)
return SDValue();
// Quit if the value result of OR is used.
// (PTEST produces only flags, so the integer result of the OR must be dead
// for the replacement to be valid.)
if (N->hasAnyUseOfValue(0))
return SDValue();
// Quit if not used as a boolean value.
// Only equality with zero (E/NE) maps onto PTEST's ZF semantics.
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
// Worklist of OR'd operands still to examine, the single source vector all
// extracts must come from, and a bitmask of extracted element indices.
SmallVector<SDValue, 8> Opnds;
SDValue VecIn;
EVT VT = MVT::Other;
unsigned Mask = 0;
// Recognize a special case where a vector is casted into wide integer to
// test all 0s.
Opnds.push_back(N->getOperand(0));
Opnds.push_back(N->getOperand(1));
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
// BFS traverse all OR'd operands.
if (I->getOpcode() == ISD::OR) {
Opnds.push_back(I->getOperand(0));
Opnds.push_back(I->getOperand(1));
// Re-evaluate the number of nodes to be traversed.
e += 2; // 2 more nodes (LHS and RHS) are pushed.
continue;
}
// Quit if a non-EXTRACT_VECTOR_ELT
if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// Quit if without a constant index.
SDValue Idx = I->getOperand(1);
if (!isa<ConstantSDNode>(Idx))
return SDValue();
// Check if all elements are extracted from the same vector.
SDValue ExtractedFromVec = I->getOperand(0);
if (VecIn.getNode() == 0) {
VT = ExtractedFromVec.getValueType();
// FIXME: only 128-bit vector is supported so far.
if (!VT.is128BitVector())
return SDValue();
VecIn = ExtractedFromVec;
} else if (VecIn != ExtractedFromVec)
return SDValue();
// Record the constant index.
Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
}
assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far.");
// Quit if not all elements are used.
if (Mask != (1U << VT.getVectorNumElements()) - 1U)
return SDValue();
// PTEST(v, v) sets ZF iff the whole vector is zero, matching the E/NE test.
return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn);
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@ -14321,14 +14343,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
Ops, array_lengthof(Ops));
}
Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget);
if (Flags.getNode()) {
SDValue Ops[] = { FalseOp, TrueOp,
DAG.getConstant(CC, MVT::i8), Flags };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
Ops, array_lengthof(Ops));
}
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
@ -15860,12 +15874,6 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
}
Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
}
return SDValue();
}
@ -15889,13 +15897,6 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
Flags);
}
Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
Flags);
}
return SDValue();
}

View File

@ -811,6 +811,8 @@ namespace llvm {
SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const;
virtual SDValue

View File

@ -1,7 +1,7 @@
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix AVX
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s | FileCheck %s --check-prefix AVX
define i32 @veccond(<4 x i32> %input) {
define i32 @veccond128(<4 x i32> %input) {
entry:
%0 = bitcast <4 x i32> %input to i128
%1 = icmp ne i128 %0, 0
@ -11,38 +11,145 @@ if-true-block: ; preds = %entry
ret i32 0
endif-block: ; preds = %entry,
ret i32 1
; SSE41: veccond
; SSE41: veccond128
; SSE41: ptest
; SSE41: ret
; AVX: veccond
; AVX: vptest
; AVX: veccond128
; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
; AVX: ret
}
define i32 @vectest(<4 x i32> %input) {
; Branch on an i256 all-zeros test of a bitcast <8 x i32>. With SSE41 the two
; 128-bit halves are OR'd (por) before a single ptest; with AVX one 256-bit
; vptest suffices.
define i32 @veccond256(<8 x i32> %input) {
entry:
%0 = bitcast <8 x i32> %input to i256
%1 = icmp ne i256 %0, 0
br i1 %1, label %if-true-block, label %endif-block
if-true-block: ; preds = %entry
ret i32 0
endif-block: ; preds = %entry,
ret i32 1
; SSE41: veccond256
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: veccond256
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
; Branch on an i512 all-zeros test. With SSE41 the four 128-bit pieces need
; three por instructions before the single ptest; with AVX the two 256-bit
; halves are OR'd (vorps) before one vptest.
define i32 @veccond512(<16 x i32> %input) {
entry:
%0 = bitcast <16 x i32> %input to i512
%1 = icmp ne i512 %0, 0
br i1 %1, label %if-true-block, label %endif-block
if-true-block: ; preds = %entry
ret i32 0
endif-block: ; preds = %entry,
ret i32 1
; SSE41: veccond512
; SSE41: por
; SSE41: por
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: veccond512
; AVX: vorps
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
; Zero-extend of an i128 all-zeros test of a bitcast <4 x i32> should lower to
; a single (v)ptest rather than scalar compares.
define i32 @vectest128(<4 x i32> %input) {
entry:
%0 = bitcast <4 x i32> %input to i128
%1 = icmp ne i128 %0, 0
%2 = zext i1 %1 to i32
ret i32 %2
; SSE41: vectest
; SSE41: vectest128
; SSE41: ptest
; SSE41: ret
; AVX: vectest
; AVX: vptest
; AVX: vectest128
; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
; AVX: ret
}
define i32 @vecsel(<4 x i32> %input, i32 %a, i32 %b) {
; Zero-extend of an i256 all-zeros test. SSE41 OR's the two 128-bit halves
; (por) before a single ptest; AVX uses one 256-bit vptest.
define i32 @vectest256(<8 x i32> %input) {
entry:
%0 = bitcast <8 x i32> %input to i256
%1 = icmp ne i256 %0, 0
%2 = zext i1 %1 to i32
ret i32 %2
; SSE41: vectest256
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: vectest256
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
; Zero-extend of an i512 all-zeros test. SSE41 needs three por instructions to
; combine the four 128-bit pieces before the single ptest; AVX OR's the two
; 256-bit halves (vorps) before one vptest.
define i32 @vectest512(<16 x i32> %input) {
entry:
%0 = bitcast <16 x i32> %input to i512
%1 = icmp ne i512 %0, 0
%2 = zext i1 %1 to i32
ret i32 %2
; SSE41: vectest512
; SSE41: por
; SSE41: por
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: vectest512
; AVX: vorps
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
; Select driven by an i128 all-zeros test of a bitcast <4 x i32>; should lower
; to a single (v)ptest feeding a conditional move.
define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
entry:
%0 = bitcast <4 x i32> %input to i128
%1 = icmp ne i128 %0, 0
%2 = select i1 %1, i32 %a, i32 %b
ret i32 %2
; SSE41: vecsel
; SSE41: vecsel128
; SSE41: ptest
; SSE41: ret
; AVX: vecsel
; AVX: vptest
; AVX: vecsel128
; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
; AVX: ret
}
; Select driven by an i256 all-zeros test. SSE41 OR's the two 128-bit halves
; (por) before a single ptest; AVX uses one 256-bit vptest.
define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
entry:
%0 = bitcast <8 x i32> %input to i256
%1 = icmp ne i256 %0, 0
%2 = select i1 %1, i32 %a, i32 %b
ret i32 %2
; SSE41: vecsel256
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: vecsel256
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
; Select driven by an i512 all-zeros test. SSE41 combines the four 128-bit
; pieces with three por instructions before the single ptest; AVX OR's the two
; 256-bit halves (vorps) before one vptest.
define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
entry:
%0 = bitcast <16 x i32> %input to i512
%1 = icmp ne i512 %0, 0
%2 = select i1 %1, i32 %a, i32 %b
ret i32 %2
; SSE41: vecsel512
; SSE41: por
; SSE41: por
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: vecsel512
; AVX: vorps
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}