[x86] Delete a bunch of really bad and totally unnecessary code in the
X86 target-specific DAG combining that tried to convert VSELECT nodes
into VECTOR_SHUFFLE nodes that it "knew" would lower into
immediate-controlled blend nodes.

Turns out, we have perfectly good lowering for all these VSELECT nodes,
and that lowering already knows how to go through BLENDI to
immediate-controlled blend nodes. It just wasn't getting used much
because this combine forced the world through the vector shuffle
lowering first. Yuck.
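
As an illustration (a sketch only -- the function name, RUN line, and
-mattr flag are mine, not from the tree), this is the kind of test that
exercises that path directly; it mirrors the updated vsel_8xi16 check
below:

  ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s

  ; The constant-condition vselect below needs no VECTOR_SHUFFLE detour;
  ; the VSELECT lowering matches it to a pblendw with an immediate mask,
  ; and the asm printer decodes that immediate into a shuffle comment.
  define <8 x i16> @blend_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
  ; CHECK-LABEL: blend_8xi16:
  ; CHECK: pblendw {{.*}} ## xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
    %sel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false,
                            i1 true, i1 false, i1 false, i1 false>,
                  <8 x i16> %v1, <8 x i16> %v2
    ret <8 x i16> %sel
  }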

This also exposes that I was too aggressive in avoiding domain crossing
in r218588 with that lowering -- when the other option is to expand into
two 128-bit vectors, it is worth domain crossing. Restore that behavior
now that we have nice tests covering it.
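
For example, a 256-bit integer blend on an AVX1-only target now stays a
single cross-domain vblendps rather than getting split into 128-bit
halves. This sketch mirrors the updated vsel_i328 check below (the
function name and RUN line are illustrative, not from the tree):

  ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s

  ; Without AVX2 there is no 256-bit integer blend, so the restored
  ; behavior crosses into the FP domain with one vblendps instead of
  ; splitting the operands into two 128-bit vectors.
  define <8 x i32> @blend_8xi32_avx1(<8 x i32> %v1, <8 x i32> %v2) {
  ; CHECK-LABEL: blend_8xi32_avx1:
  ; CHECK-NOT: vinsertf128
  ; CHECK: vblendps {{.*}} ## ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
    %sel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false,
                            i1 true, i1 false, i1 false, i1 false>,
                  <8 x i32> %v1, <8 x i32> %v2
    ret <8 x i32> %sel
  }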

The test updates here fall into two camps. One is tests where we
previously matched an unsigned encoding of the blend immediate and now
get a signed encoding. Most of those carried elaborate comments
explaining exactly what the operand really means; rather than keep
those, just switch the tests to the nicely decoded asm comments that
make it obvious the final shuffle matches.

The other updates are just removing pointless domain crossing by
blending integers with PBLENDW rather than BLENDPS.
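
A minimal sketch of that second camp, mirroring the updated vsel_i32
check below (again, the function name and RUN line are illustrative):

  ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s

  ; A v4i32 blend now stays in the integer domain: it is emitted as a
  ; v8i16 vpblendw (each i32 covers two i16 lanes) rather than crossing
  ; domains with a vblendps.
  define <4 x i32> @blend_4xi32(<4 x i32> %v1, <4 x i32> %v2) {
  ; CHECK-LABEL: blend_4xi32:
  ; CHECK-NOT: vblendps
  ; CHECK: vpblendw {{.*}} ## xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
    %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
    ret <4 x i32> %sel
  }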

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218589 91177308-0d34-0410-b5e6-96231b3b80d8
Chandler Carruth 2014-09-29 02:01:20 +00:00
parent 3589550b3e
commit d23f1883d3
4 changed files with 17 additions and 137 deletions

@@ -11797,43 +11797,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
// This function assumes its argument is a BUILD_VECTOR of constants or
// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
// true.
static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
unsigned &MaskValue) {
MaskValue = 0;
unsigned NumElems = BuildVector->getNumOperands();
// There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
unsigned NumLanes = (NumElems - 1) / 8 + 1;
unsigned NumElemsInLane = NumElems / NumLanes;
// Blend for v16i16 should be symmetric for both lanes.
for (unsigned i = 0; i < NumElemsInLane; ++i) {
SDValue EltCond = BuildVector->getOperand(i);
SDValue SndLaneEltCond =
(NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
int Lane1Cond = -1, Lane2Cond = -1;
if (isa<ConstantSDNode>(EltCond))
Lane1Cond = !isZero(EltCond);
if (isa<ConstantSDNode>(SndLaneEltCond))
Lane2Cond = !isZero(SndLaneEltCond);
if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
// Lane1Cond != 0, means we want the first argument.
// Lane1Cond == 0, means we want the second argument.
// The encoding of this argument is 0 for the first argument, 1
// for the second. Therefore, invert the condition.
MaskValue |= !Lane1Cond << i;
else if (Lane1Cond < 0)
MaskValue |= !Lane2Cond << i;
else
return false;
}
return true;
}
/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
/// instruction.
static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
@@ -11883,17 +11846,18 @@ static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
} else {
// Everything else uses a generic blend mask computation with a custom type.
if (VT.isInteger()) {
if (VT.is256BitVector()) {
// The 256-bit integer blend instructions are only available on AVX2.
if (!Subtarget->hasAVX2())
return SDValue();
// We do the blend on v8i32 for 256-bit integer types.
BlendVT = MVT::v8i32;
} else {
if (VT.is256BitVector())
// We cast to floating point types if integer blends aren't available,
// and we coerce integer blends when available to occur on the v8i32
// type.
BlendVT = Subtarget->hasAVX2()
? MVT::v8i32
: MVT::getVectorVT(
MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
VT.getVectorNumElements());
else
// For 128-bit vectors we do the blend on v8i16 types.
BlendVT = MVT::v8i16;
}
}
assert(BlendVT.getVectorNumElements() <= 8 &&
"Cannot blend more than 8 elements with an immediate!");
@@ -21718,57 +21682,6 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
return std::make_pair(Opc, NeedSplit);
}
static SDValue
TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDLoc dl(N);
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
SDValue CondSrc = Cond->getOperand(0);
if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
Cond = CondSrc->getOperand(0);
}
MVT VT = N->getSimpleValueType(0);
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
// There is no blend with immediate in AVX-512.
if (VT.is512BitVector())
return SDValue();
if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
return SDValue();
if (!Subtarget->hasInt256() && VT == MVT::v16i16)
return SDValue();
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
// A vselect where all conditions and data are constants can be optimized into
// a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
return SDValue();
unsigned MaskValue = 0;
if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
return SDValue();
SmallVector<int, 8> ShuffleMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
// Be sure we emit undef where we can.
if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
ShuffleMask[i] = -1;
else
ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
}
return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
}
/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -22318,23 +22231,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
DCI.CommitTargetLoweringOpt(TLO);
}
// We should generate an X86ISD::BLENDI from a vselect if its argument
// is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
// constants. This specific pattern gets generated when we split a
// selector for a 512 bit vector in a machine without AVX512 (but with
// 256-bit vectors), during legalization:
//
// (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
//
// Iff we find this pattern and the build_vectors are built from
// constants, we translate the vselect into a shuffle_vector that we
// know will be matched by LowerVECTOR_SHUFFLEtoBlend.
if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
if (Shuffle.getNode())
return Shuffle;
}
return SDValue();
}

@@ -21,7 +21,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
;CHECK-LABEL: vsel_i32:
;CHECK: vblendps $10, %xmm1, %xmm0, %xmm0
;CHECK: vpblendw {{.*}} ## xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
@@ -61,13 +61,7 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
;CHECK-LABEL: vsel_float8:
;CHECK-NOT: vinsertf128
; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
; which translates into the boolean mask (big endian representation):
; 00010001 = 17.
; '1' means takes the first argument, '0' means takes the second argument.
; This is the opposite of the intel syntax, thus we expect
; the inverted mask: 11101110 = 238.
;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0
;CHECK: vblendps {{.*}} ## ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
;CHECK: ret
define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
@@ -76,7 +70,7 @@ define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
;CHECK-LABEL: vsel_i328:
;CHECK-NOT: vinsertf128
;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0
;CHECK: vblendps {{.*}} ## ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
;CHECK-NEXT: ret
define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2

@@ -22,17 +22,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
}
;CHECK-LABEL: vsel_8xi16:
; The select mask is
; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
; which translates into the boolean mask (big endian representation):
; 00010001 = 17.
; '1' means takes the first argument, '0' means takes the second argument.
; This is the opposite of the intel syntax, thus we expect
; the inverted mask: 11101110 = 238.
; According to the ABI:
; v1 is in xmm0 => first argument is xmm0.
; v2 is in xmm1 => second argument is xmm1.
;CHECK: pblendw $238, %xmm1, %xmm0
;CHECK: pblendw {{.*}} ## xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
;CHECK: ret
define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2

@@ -10,7 +10,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
;CHECK-LABEL: vsel_4xi8:
;CHECK: blendps
;CHECK: blendw
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
@@ -18,7 +18,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
}
;CHECK-LABEL: vsel_4xi16:
;CHECK: blendps
;CHECK: blendw
;CHECK: ret
define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
@@ -27,7 +27,7 @@ define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
;CHECK-LABEL: vsel_i32:
;CHECK: blendps
;CHECK: blendw
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2