Fix a long standing deficiency in the X86 backend: we would

sometimes emit "zero" and "all one" vectors multiple times, for example: _test2: pcmpeqd %mm0, %mm0 movq %mm0, _M1 pcmpeqd %mm0, %mm0 movq %mm0, _M2 ret instead of: _test2: pcmpeqd %mm0, %mm0 movq %mm0, _M1 movq %mm0, _M2 ret This patch fixes this by always arranging for zero/one vectors to be defined as v4i32 or v2i32 (SSE/MMX) instead of letting them be any random type. This ensures they get trivially CSE'd on the dag. This fix is also important for LegalizeDAGTypes, as it gets unhappy when the x86 backend wants BUILD_VECTOR(i64 0) to be legal even when 'i64' isn't legal. This patch makes the following changes: 1) X86TargetLowering::LowerBUILD_VECTOR now lowers 0/1 vectors into their canonical types. 2) The now-dead patterns are removed from the SSE/MMX .td files. 3) All the patterns in the .td file that referred to immAllOnesV or immAllZerosV in the wrong form now use *_bc to match them with a bitcast wrapped around them. 4) X86DAGToDAGISel::SelectScalarSSELoad is generalized to handle bitcast'd zero vectors, which simplifies the code actually. 5) getShuffleVectorZeroOrUndef is updated to generate a shuffle that is legal, instead of generating one that is illegal and expecting a later legalize pass to clean it up. 6) isZeroShuffle is generalized to handle bitcast of zeros. 7) several other minor tweaks. This patch is definite goodness, but has the potential to cause random code quality regressions. Please be on the lookout for these and let me know if they happen. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@44310 91177308-0d34-0410-b5e6-96231b3b80d8
2024-12-03 09:21:13 +00:00 · 2007-11-25 00:24:49 +00:00 · 2007-11-25 00:24:49 +00:00 · 8a594489bf
commit 8a594489bf
parent c58d558a79
5 changed files with 128 additions and 97 deletions
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@ -842,20 +842,15 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
  // Also handle the case where we explicitly require zeros in the top
  // elements.  This is a vector shuffle from the zero vector.
  if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
-      N.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+      // Check to see if the top elements are all zeros (or bitcast of zeros).
+      ISD::isBuildVectorAllZeros(N.getOperand(0).Val) &&
      N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR && 
      N.getOperand(1).Val->hasOneUse() &&
      ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
      N.getOperand(1).getOperand(0).hasOneUse()) {
-    // Check to see if the BUILD_VECTOR is building a zero vector.
-    SDOperand BV = N.getOperand(0);
-    for (unsigned i = 0, e = BV.getNumOperands(); i != e; ++i)
-      if (!isZeroNode(BV.getOperand(i)) &&
-          BV.getOperand(i).getOpcode() != ISD::UNDEF)
-        return false;  // Not a zero/undef vector.
    // Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
    // from the LHS.
-    unsigned VecWidth = BV.getNumOperands();
+    unsigned VecWidth=MVT::getVectorNumElements(N.getOperand(0).getValueType());
    SDOperand ShufMask = N.getOperand(2);
    assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0))) {
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -2728,7 +2728,7 @@ static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
  return true;
 }

-/// CommuteVectorShuffle - Swap vector_shuffle operandsas well as
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as
 /// values in ther permute mask.
 static SDOperand CommuteVectorShuffle(SDOperand Op, SDOperand &V1,
                                      SDOperand &V2, SDOperand &Mask,
@ -2867,23 +2867,24 @@ static bool isZeroShuffle(SDNode *N) {
  unsigned NumElems = Mask.getNumOperands();
  for (unsigned i = 0; i != NumElems; ++i) {
    SDOperand Arg = Mask.getOperand(i);
-    if (Arg.getOpcode() != ISD::UNDEF) {
-      unsigned Idx = cast<ConstantSDNode>(Arg)->getValue();
-      if (Idx < NumElems) {
-        unsigned Opc = V1.Val->getOpcode();
-        if (Opc == ISD::UNDEF)
-          continue;
-        if (Opc != ISD::BUILD_VECTOR ||
-            !isZeroNode(V1.Val->getOperand(Idx)))
-          return false;
-      } else if (Idx >= NumElems) {
-        unsigned Opc = V2.Val->getOpcode();
-        if (Opc == ISD::UNDEF)
-          continue;
-        if (Opc != ISD::BUILD_VECTOR ||
-            !isZeroNode(V2.Val->getOperand(Idx - NumElems)))
-          return false;
-      }
+    if (Arg.getOpcode() == ISD::UNDEF)
+      continue;
+    
+    unsigned Idx = cast<ConstantSDNode>(Arg)->getValue();
+    if (Idx < NumElems) {
+      unsigned Opc = V1.Val->getOpcode();
+      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.Val))
+        continue;
+      if (Opc != ISD::BUILD_VECTOR ||
+          !isZeroNode(V1.Val->getOperand(Idx)))
+        return false;
+    } else if (Idx >= NumElems) {
+      unsigned Opc = V2.Val->getOpcode();
+      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.Val))
+        continue;
+      if (Opc != ISD::BUILD_VECTOR ||
+          !isZeroNode(V2.Val->getOperand(Idx - NumElems)))
+        return false;
    }
  }
  return true;
@ -2893,14 +2894,35 @@ static bool isZeroShuffle(SDNode *N) {
 ///
 static SDOperand getZeroVector(MVT::ValueType VT, SelectionDAG &DAG) {
  assert(MVT::isVector(VT) && "Expected a vector type");
-  unsigned NumElems = MVT::getVectorNumElements(VT);
-  MVT::ValueType EVT = MVT::getVectorElementType(VT);
-  bool isFP = MVT::isFloatingPoint(EVT);
-  SDOperand Zero = isFP ? DAG.getConstantFP(0.0, EVT) : DAG.getConstant(0, EVT);
-  SmallVector<SDOperand, 8> ZeroVec(NumElems, Zero);
-  return DAG.getNode(ISD::BUILD_VECTOR, VT, &ZeroVec[0], ZeroVec.size());
+  
+  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+  // type.  This ensures they get CSE'd.
+  SDOperand Cst = DAG.getTargetConstant(0, MVT::i32);
+  SDOperand Vec;
+  if (MVT::getSizeInBits(VT) == 64)  // MMX
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
+  else                                              // SSE
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst);
+  return DAG.getNode(ISD::BIT_CONVERT, VT, Vec);
 }

+/// getOnesVector - Returns a vector of specified type with all bits set.
+///
+static SDOperand getOnesVector(MVT::ValueType VT, SelectionDAG &DAG) {
+  assert(MVT::isVector(VT) && "Expected a vector type");
+  
+  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+  // type.  This ensures they get CSE'd.
+  SDOperand Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  SDOperand Vec;
+  if (MVT::getSizeInBits(VT) == 64)  // MMX
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
+  else                                              // SSE
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst);
+  return DAG.getNode(ISD::BIT_CONVERT, VT, Vec);
+}
+
+
 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
 /// that point to V2 points to its first element.
 static SDOperand NormalizeMask(SDOperand Mask, SelectionDAG &DAG) {
@ -2981,24 +3003,28 @@ static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG) {
  }
  V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1);

-  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
-  Mask = getZeroVector(MaskVT, DAG);
+  Mask = getZeroVector(MVT::v4i32, DAG);
  SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1,
                                  DAG.getNode(ISD::UNDEF, MVT::v4i32), Mask);
  return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);
 }

 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
-/// vector of zero or undef vector.
+/// vector of zero or undef vector.  This produces a shuffle where the low
+/// element of V2 is swizzled into the zero/undef vector, landing at element
+/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
 static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, MVT::ValueType VT,
                                             unsigned NumElems, unsigned Idx,
                                             bool isZero, SelectionDAG &DAG) {
  SDOperand V1 = isZero ? getZeroVector(VT, DAG) : DAG.getNode(ISD::UNDEF, VT);
  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
  MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
-  SDOperand Zero = DAG.getConstant(0, EVT);
-  SmallVector<SDOperand, 8> MaskVec(NumElems, Zero);
-  MaskVec[Idx] = DAG.getConstant(NumElems, EVT);
+  SmallVector<SDOperand, 16> MaskVec;
+  for (unsigned i = 0; i != NumElems; ++i)
+    if (i == Idx)  // If this is the insertion idx, put the low elt of V2 here.
+      MaskVec.push_back(DAG.getConstant(NumElems, EVT));
+    else
+      MaskVec.push_back(DAG.getConstant(i, EVT));
  SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                               &MaskVec[0], MaskVec.size());
  return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
@ -3078,13 +3104,18 @@ static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros,

 SDOperand
 X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
-  // All zero's are handled with pxor.
-  if (ISD::isBuildVectorAllZeros(Op.Val))
-    return Op;
+  // All zero's are handled with pxor, all one's are handled with pcmpeqd.
+  if (ISD::isBuildVectorAllZeros(Op.Val) || ISD::isBuildVectorAllOnes(Op.Val)) {
+    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
+    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
+    // eliminated on x86-32 hosts.
+    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
+      return Op;

-  // All one's are handled with pcmpeqd.
-  if (ISD::isBuildVectorAllOnes(Op.Val))
-    return Op;
+    if (ISD::isBuildVectorAllOnes(Op.Val))
+      return getOnesVector(Op.getValueType(), DAG);
+    return getZeroVector(Op.getValueType(), DAG);
+  }

  MVT::ValueType VT = Op.getValueType();
  MVT::ValueType EVT = MVT::getVectorElementType(VT);
@ -3113,12 +3144,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
  }

  if (NumNonZero == 0) {
-    if (NumZero == 0)
-      // All undef vector. Return an UNDEF.
-      return DAG.getNode(ISD::UNDEF, VT);
-    else
-      // A mix of zero and undef. Return a zero vector.
-      return getZeroVector(VT, DAG);
+    // All undef vector. Return an UNDEF.  All zero vectors were handled above.
+    return DAG.getNode(ISD::UNDEF, VT);
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
@ -3299,8 +3326,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
    return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);

  bool Commuted = false;
+  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
+  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.Val);
  V2IsSplat = isSplatVector(V2.Val);
+  
+  // Canonicalize the splat or undef, if present, to be on the RHS.
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    std::swap(V1IsSplat, V2IsSplat);
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@ -486,14 +486,13 @@ def MMX_MASKMOVQ : MMXI<0xF7, MRMDestMem, (outs), (ins VR64:$src, VR64:$mask),
 //===----------------------------------------------------------------------===//

 // Alias instructions that map zero vector to pxor.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let isReMaterializable = 1 in {
  def MMX_V_SET0       : MMXI<0xEF, MRMInitReg, (outs VR64:$dst), (ins),
                              "pxor\t$dst, $dst",
-                              [(set VR64:$dst, (v1i64 immAllZerosV))]>;
+                              [(set VR64:$dst, (v2i32 immAllZerosV))]>;
  def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (outs VR64:$dst), (ins),
                              "pcmpeqd\t$dst, $dst",
-                              [(set VR64:$dst, (v1i64 immAllOnesV))]>;
+                              [(set VR64:$dst, (v2i32 immAllOnesV))]>;
 }

 //===----------------------------------------------------------------------===//
@ -510,18 +509,6 @@ def : Pat<(store (v2i32 VR64:$src), addr:$dst),
 def : Pat<(store (v1i64 VR64:$src), addr:$dst),
          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;

-// 64-bit vector all zero's.
-def : Pat<(v8i8  immAllZerosV), (MMX_V_SET0)>;
-def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>;
-def : Pat<(v2i32 immAllZerosV), (MMX_V_SET0)>;
-def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>;
-
-// 64-bit vector all one's.
-def : Pat<(v8i8  immAllOnesV), (MMX_V_SETALLONES)>;
-def : Pat<(v4i16 immAllOnesV), (MMX_V_SETALLONES)>;
-def : Pat<(v2i32 immAllOnesV), (MMX_V_SETALLONES)>;
-def : Pat<(v1i64 immAllOnesV), (MMX_V_SETALLONES)>;
-
 // Bit convert.
 def : Pat<(v8i8  (bitconvert (v1i64 VR64:$src))), (v8i8  VR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v2i32 VR64:$src))), (v8i8  VR64:$src)>;
@ -551,10 +538,10 @@ def MMX_X86s2vec : SDNode<"X86ISD::S2VEC",  SDTypeProfile<1, 1, []>, []>;
 // Move scalar to XMM zero-extended
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
-  def : Pat<(v8i8 (vector_shuffle immAllZerosV,
+  def : Pat<(v8i8 (vector_shuffle immAllZerosV_bc,
                    (v8i8 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
            (MMX_MOVZDI2PDIrr GR32:$src)>;
-  def : Pat<(v4i16 (vector_shuffle immAllZerosV,
+  def : Pat<(v4i16 (vector_shuffle immAllZerosV_bc,
                    (v4i16 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
            (MMX_MOVZDI2PDIrr GR32:$src)>;
  def : Pat<(v2i32 (vector_shuffle immAllZerosV,
@ -606,19 +593,19 @@ let AddedComplexity = 20 in {
 def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
                  VR64:$src2)),
          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
                  VR64:$src2)),
          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8  immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8  immAllOnesV_bc))),
                  VR64:$src2)),
          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;

 def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
                  (load addr:$src2))),
          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
                  (load addr:$src2))),
          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8  immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8  immAllOnesV_bc))),
                  (load addr:$src2))),
          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@ -939,11 +939,10 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;

 // Alias instructions that map zero vector to pxor / xorp* for sse.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let isReMaterializable = 1 in
 def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins),
                 "xorps\t$dst, $dst",
-                 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+                 [(set VR128:$dst, (v4i32 immAllZerosV))]>;

 // FR32 to 128-bit vector conversion.
 def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR32:$src),
@ -991,7 +990,7 @@ let isTwoAddress = 1 in {
 let AddedComplexity = 20 in
 def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
                      "movss\t{$src, $dst|$dst, $src}",
-                      [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV,
+                      [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV_bc,
                                 (v4f32 (scalar_to_vector (loadf32 addr:$src))),
                                                MOVL_shuffle_mask)))]>;

@ -2119,11 +2118,10 @@ def MFENCE : I<0xAE, MRM6m, (outs), (ins),


 // Alias instructions that map zero vector to pxor / xorp* for sse.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let isReMaterializable = 1 in
  def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins),
                         "pcmpeqd\t$dst, $dst",
-                         [(set VR128:$dst, (v2f64 immAllOnesV))]>;
+                         [(set VR128:$dst, (v4i32 immAllOnesV))]>;

 // FR64 to 128-bit vector conversion.
 def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR64:$src),
@ -2220,7 +2218,7 @@ let AddedComplexity = 20 in
  def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                        "movsd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                          (v2f64 (vector_shuffle immAllZerosV,
+                          (v2f64 (vector_shuffle immAllZerosV_bc,
                                  (v2f64 (scalar_to_vector
                                          (loadf64 addr:$src))),
                                  MOVL_shuffle_mask)))]>;
@ -2692,21 +2690,6 @@ def : Pat<(v8i16 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
 def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
 def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;

-// 128-bit vector all zero's.
-def : Pat<(v16i8 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v8i16 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v4i32 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v2f64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-
-// 128-bit vector all one's.
-def : Pat<(v16i8 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v8i16 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v4i32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE1]>;
-
-
 // Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or
 // 16-bits matter.
 def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
@ -2751,17 +2734,17 @@ let Predicates = [HasSSE2] in {
 // Move scalar to XMM zero-extended
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
-def : Pat<(v8i16 (vector_shuffle immAllZerosV,
+def : Pat<(v8i16 (vector_shuffle immAllZerosV_bc,
                  (v8i16 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
          (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v16i8 (vector_shuffle immAllZerosV,
+def : Pat<(v16i8 (vector_shuffle immAllZerosV_bc,
                  (v16i8 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
          (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
 // Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
-def : Pat<(v2f64 (vector_shuffle immAllZerosV,
+def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
                  (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
          (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 (vector_shuffle immAllZerosV,
+def : Pat<(v4f32 (vector_shuffle immAllZerosV_bc,
                  (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
          (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
 }
@ -2911,7 +2894,7 @@ def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,

 // Set lowest element and zero upper elements.
 let AddedComplexity = 20 in
-def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV,
+def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV_bc,
                     (v2f64 (scalar_to_vector (loadf64 addr:$src))),
                     MOVL_shuffle_mask)),
          (MOVZQI2PQIrm addr:$src)>, Requires<[HasSSE2]>;
--- a/test/CodeGen/X86/vec_zero_cse.ll
+++ b/test/CodeGen/X86/vec_zero_cse.ll
@ -0,0 +1,35 @@
+; RUN: llvm-as < %s | llc -relocation-model=static -mcpu=yonah | grep pxor | count 1
+; RUN: llvm-as < %s | llc -relocation-model=static -mcpu=yonah | grep xorps | count 1
+; RUN: llvm-as < %s | llc -relocation-model=static -mcpu=yonah | grep pcmpeqd | count 2
+
+@M1 = external global <1 x i64>
+@M2 = external global <2 x i32>
+
+@S1 = external global <2 x i64>
+@S2 = external global <4 x i32>
+
+define void @test() {
+  store <1 x i64> zeroinitializer, <1 x i64>* @M1
+  store <2 x i32> zeroinitializer, <2 x i32>* @M2
+  ret void
+}
+
+define void @test2() {
+  store <1 x i64> < i64 -1 >, <1 x i64>* @M1
+  store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2
+  ret void
+}
+
+define void @test3() {
+  store <2 x i64> zeroinitializer, <2 x i64>* @S1
+  store <4 x i32> zeroinitializer, <4 x i32>* @S2
+  ret void
+}
+
+define void @test4() {
+  store <2 x i64> < i64 -1, i64 -1>, <2 x i64>* @S1
+  store <4 x i32> < i32 -1, i32 -1, i32 -1, i32 -1 >, <4 x i32>* @S2
+  ret void
+}
+
+