[x86] instcombine more cases of insertps into a shufflevector

This is a follow-on to D8833 (insertps optimization when the zero mask is not used). In this patch, we check for the case where the zmask is used, but both input vectors to the insertps intrinsic are the same operand or the zmask overrides the destination lane. This lets us replace the 2nd shuffle input operand with the zero vector. Differential Revision: http://reviews.llvm.org/D9257 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@235810 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-04 03:17:51 +00:00 · 2015-04-25 20:55:25 +00:00 · 2015-04-25 20:55:25 +00:00 · 1111a216ee
commit 1111a216ee
parent 3b91606783
2 changed files with 69 additions and 19 deletions
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@ -201,7 +201,7 @@ static Value *SimplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
    VectorType *VecTy = cast<VectorType>(II.getType());
-    ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+    assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
    
    // The immediate permute control byte looks like this:
    //    [3:0] - zero mask for each 32-bit lane
@ -213,25 +213,42 @@ static Value *SimplifyX86insertps(const IntrinsicInst &II,
    uint8_t DestLane = (Imm >> 4) & 0x3;
    uint8_t SourceLane = (Imm >> 6) & 0x3;

+    ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
    // If all zero mask bits are set, this was just a weird way to
    // generate a zero vector.
    if (ZMask == 0xf)
      return ZeroVector;
-    
-    // TODO: Model this case as two shuffles or a 'logical and' plus shuffle?
-    if (ZMask)
-      return nullptr;

-    assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
-
-    // If we're not zeroing anything, this is a single shuffle.
-    // Replace the selected destination lane with the selected source lane.
-    // For all other lanes, pass the first source bits through.
+    // Initialize by passing all of the first source bits through.
    int ShuffleMask[4] = { 0, 1, 2, 3 };
-    ShuffleMask[DestLane] = SourceLane + 4;
-    
-    return Builder.CreateShuffleVector(II.getArgOperand(0), II.getArgOperand(1),
-                                       ShuffleMask);
+
+    // We may replace the second operand with the zero vector.
+    Value *V1 = II.getArgOperand(1);
+
+    if (ZMask) {
+      // If the zero mask is being used with a single input or the zero mask
+      // overrides the destination lane, this is a shuffle with the zero vector.
+      if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
+          (ZMask & (1 << DestLane))) {
+        V1 = ZeroVector;
+        // We may still move 32-bits of the first source vector from one lane
+        // to another.
+        ShuffleMask[DestLane] = SourceLane;
+        // The zero mask may override the previous insert operation.
+        for (unsigned i = 0; i < 4; ++i)
+          if ((ZMask >> i) & 0x1)
+            ShuffleMask[i] = i + 4;
+      } else {
+        // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
+        return nullptr;
+      }
+    } else {
+      // Replace the selected destination lane with the selected source lane.
+      ShuffleMask[DestLane] = SourceLane + 4;
+    }
+  
+    return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
  }
  return nullptr;
 }
--- a/test/Transforms/InstCombine/x86-insertps.ll
+++ b/test/Transforms/InstCombine/x86-insertps.ll
@ -30,14 +30,47 @@ define <4 x float> @insertps_0xff(<4 x float> %v1, <4 x float> %v2) {
 ; CHECK-NEXT:  ret <4 x float> zeroinitializer
 }

-; If some zero mask bits are set, we do not change anything.
+; If some zero mask bits are set that do not override the insertion, we do not change anything.

-define <4 x float> @insertps_0x03(<4 x float> %v1, <4 x float> %v2) {
-  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+define <4 x float> @insertps_0x0c(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
  ret <4 x float> %res

-; CHECK-LABEL: @insertps_0x03
-; CHECK-NEXT:  call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+; CHECK-LABEL: @insertps_0x0c
+; CHECK-NEXT:  call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; ...unless both input vectors are the same operand.
+
+define <4 x float> @insertps_0x15_single_input(<4 x float> %v1) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 21)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x15_single_input
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float undef>, <4 x i32> <i32 4, i32 0, i32 6, i32 3>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; The zero mask overrides the insertion lane.
+
+define <4 x float> @insertps_0x1a_single_input(<4 x float> %v1) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 26)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x1a_single_input
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float undef, float 0.000000e+00, float undef, float 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; The zero mask overrides the insertion lane, so the second input vector is not used.
+
+define <4 x float> @insertps_0xc1(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 193)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xc1
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:  ret <4 x float>
 }