[AArch64] Teach perfect shuffles tables about D-lane movs

Similar to D123386, this adds D-movs to the AArch64 perfect shuffle tables, lowering the costs a little further. This is, roughly speaking, an improvement in general, especially if you ignore mov v0.16b, v2.16b type moves that are often artefacts of the calling convention. The D register movs are encoded as (0x4 | LaneIdx), and to generate a D register move we are required to bitcast into a higher type, but it is otherwise very similar to the S-lane movs already supported. Differential Revision: https://reviews.llvm.org/D125477
2025-02-20 01:57:37 +00:00 · 2022-05-17 18:16:45 +01:00 · 2022-05-17 18:16:45 +01:00 · 4c6a070a2c
commit 4c6a070a2c
parent bd93df937a
10 changed files with 501 additions and 479 deletions
--- a/clang/test/CodeGen/aarch64-neon-vcmla.c
+++ b/clang/test/CodeGen/aarch64-neon-vcmla.c
@ -148,10 +148,8 @@ float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t
 }

 // CHECK-LABEL: @test_vcmla_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
 // CHECK: ret <4 x half> [[RES]]
 float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_lane_f16(acc, lhs, rhs, 1);
@ -209,29 +207,25 @@ float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t r
 // CHECK-LABEL: @test_vcmlaq_lane_f32(
 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 // CHECK: ret <4 x float> [[RES]]
 float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_lane_f32(acc, lhs, rhs, 0);
 }

 // CHECK-LABEL: @test_vcmlaq_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 // CHECK: ret <4 x float> [[RES]]
 float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
 }

 // CHECK-LABEL: @test_vcmla_rot90_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
 // CHECK: ret <4 x half> [[RES]]
 float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
@ -289,29 +283,25 @@ float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32
 // CHECK-LABEL: @test_vcmlaq_rot90_lane_f32(
 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 // CHECK: ret <4 x float> [[RES]]
 float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
 }

 // CHECK-LABEL: @test_vcmlaq_rot90_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 // CHECK: ret <4 x float> [[RES]]
 float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
 }

 // CHECK-LABEL: @test_vcmla_rot180_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
 // CHECK: ret <4 x half> [[RES]]
 float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
@ -369,29 +359,25 @@ float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float3
 // CHECK-LABEL: @test_vcmlaq_rot180_lane_f32(
 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 // CHECK: ret <4 x float> [[RES]]
 float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
 }

 // CHECK-LABEL: @test_vcmlaq_rot180_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 // CHECK: ret <4 x float> [[RES]]
 float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
 }

 // CHECK-LABEL: @test_vcmla_rot270_lane_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x half> %rhs to <2 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i32> [[CPLX]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
 // CHECK: ret <4 x half> [[RES]]
 float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
  return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
@ -449,19 +435,17 @@ float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float3
 // CHECK-LABEL: @test_vcmlaq_rot270_lane_f32(
 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX_VEC]], <2 x i64> poison, <2 x i32> zeroinitializer
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 // CHECK: ret <4 x float> [[RES]]
 float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
  return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0);
 }

 // CHECK-LABEL: @test_vcmlaq_rot270_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP_FLT]])
+// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
 // CHECK: ret <4 x float> [[RES]]
 float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
  return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -9812,14 +9812,37 @@ static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
        LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
    EVT VT = OpLHS.getValueType();
    assert(RHSID < 8 && "Expected a lane index for RHSID!");
-    int MaskElt = getPFIDLane(ID, RHSID);
-    assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
-    unsigned ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
-    SDValue Input = MaskElt < 4 ? V1 : V2;
-    // Be careful about creating illegal types. Use f16 instead of i16.
-    if (VT == MVT::v4i16) {
-      Input = DAG.getBitcast(MVT::v4f16, Input);
-      OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
+    unsigned ExtLane = 0;
+    SDValue Input;
+
+    // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
+    // convert into a higher type.
+    if (RHSID & 0x4) {
+      int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
+      if (MaskElt == -1)
+        MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
+      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+      ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
+      Input = MaskElt < 2 ? V1 : V2;
+      if (VT.getScalarSizeInBits() == 16) {
+        Input = DAG.getBitcast(MVT::v2f32, Input);
+        OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
+      } else {
+        assert(VT.getScalarSizeInBits() == 32 &&
+               "Expected 16 or 32 bit shuffle elemements");
+        Input = DAG.getBitcast(MVT::v2f64, Input);
+        OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
+      }
+    } else {
+      int MaskElt = getPFIDLane(ID, RHSID);
+      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+      ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
+      Input = MaskElt < 4 ? V1 : V2;
+      // Be careful about creating illegal types. Use f16 instead of i16.
+      if (VT == MVT::v4i16) {
+        Input = DAG.getBitcast(MVT::v4f16, Input);
+        OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
+      }
    }
    SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                              Input.getValueType().getVectorElementType(),
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@ -97,8 +97,8 @@ define void @insert_subvec() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_2 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_3 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_1 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
@ -109,9 +109,9 @@ define void @insert_subvec() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_3 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@ -404,9 +404,10 @@ entry:
 define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
 ; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup.4h v0, v0[0]
-; CHECK-NEXT:    ext.8b v0, v0, v1, #4
+; CHECK-NEXT:    trn1.4h v0, v0, v0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov.s v0[1], v1[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
  %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x i16> %r
@ -415,9 +416,10 @@ define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) n
 define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
 ; CHECK-LABEL: test_perfectshuffle_dupext_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup.4h v0, v0[0]
-; CHECK-NEXT:    ext.8b v0, v0, v1, #4
+; CHECK-NEXT:    trn1.4h v0, v0, v0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov.s v0[1], v1[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
  %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x half> %r
@ -426,8 +428,8 @@ define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b
 define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; CHECK-LABEL: test_perfectshuffle_dupext_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup.4s v0, v0[0]
-; CHECK-NEXT:    ext.16b v0, v0, v1, #8
+; CHECK-NEXT:    trn1.4s v0, v0, v0
+; CHECK-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-NEXT:    ret
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x i32> %r
@ -436,8 +438,8 @@ define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) n
 define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
 ; CHECK-LABEL: test_perfectshuffle_dupext_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup.4s v0, v0[0]
-; CHECK-NEXT:    ext.16b v0, v0, v1, #8
+; CHECK-NEXT:    trn1.4s v0, v0, v0
+; CHECK-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-NEXT:    ret
  %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
  ret <4 x float> %r
--- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
@ -36,124 +36,121 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x2]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    usubl v7.4s, v3.4h, v5.4h
-; CHECK-NEXT:    usubl2 v5.4s, v3.8h, v5.8h
-; CHECK-NEXT:    usubl2 v3.4s, v2.8h, v4.8h
+; CHECK-NEXT:    usubl2 v3.4s, v3.8h, v5.8h
+; CHECK-NEXT:    usubl2 v5.4s, v2.8h, v4.8h
 ; CHECK-NEXT:    usubl v2.4s, v2.4h, v4.4h
 ; CHECK-NEXT:    ushll v4.8h, v6.8b, #0
-; CHECK-NEXT:    shl v3.4s, v3.4s, #16
+; CHECK-NEXT:    shl v5.4s, v5.4s, #16
 ; CHECK-NEXT:    usubl2 v6.4s, v0.8h, v4.8h
 ; CHECK-NEXT:    shl v2.4s, v2.4s, #16
 ; CHECK-NEXT:    usubl v0.4s, v0.4h, v4.4h
+; CHECK-NEXT:    add v1.4s, v5.4s, v1.4s
+; CHECK-NEXT:    shl v4.4s, v6.4s, #16
+; CHECK-NEXT:    shl v0.4s, v0.4s, #16
 ; CHECK-NEXT:    add v2.4s, v2.4s, v16.4s
-; CHECK-NEXT:    shl v6.4s, v6.4s, #16
-; CHECK-NEXT:    shl v4.4s, v0.4s, #16
-; CHECK-NEXT:    add v19.4s, v6.4s, v5.4s
-; CHECK-NEXT:    add v6.4s, v4.4s, v7.4s
-; CHECK-NEXT:    add v17.4s, v3.4s, v1.4s
-; CHECK-NEXT:    ext v18.16b, v2.16b, v2.16b, #12
-; CHECK-NEXT:    zip1 v7.4s, v6.4s, v19.4s
-; CHECK-NEXT:    uzp2 v16.4s, v2.4s, v17.4s
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    mov v4.16b, v17.16b
-; CHECK-NEXT:    mov v20.16b, v6.16b
-; CHECK-NEXT:    zip2 v0.4s, v17.4s, v2.4s
-; CHECK-NEXT:    zip2 v3.4s, v2.4s, v17.4s
-; CHECK-NEXT:    mov v1.s[0], v17.s[1]
-; CHECK-NEXT:    ext v5.16b, v17.16b, v18.16b, #12
-; CHECK-NEXT:    zip2 v17.4s, v6.4s, v19.4s
-; CHECK-NEXT:    mov v4.s[1], v2.s[0]
-; CHECK-NEXT:    ext v18.16b, v6.16b, v7.16b, #8
-; CHECK-NEXT:    mov v20.s[3], v19.s[2]
-; CHECK-NEXT:    uzp2 v6.4s, v16.4s, v2.4s
-; CHECK-NEXT:    mov v1.d[1], v7.d[1]
-; CHECK-NEXT:    mov v4.d[1], v18.d[1]
-; CHECK-NEXT:    mov v5.d[1], v17.d[1]
-; CHECK-NEXT:    mov v0.d[1], v20.d[1]
-; CHECK-NEXT:    mov v6.d[1], v17.d[1]
-; CHECK-NEXT:    mov v3.d[1], v20.d[1]
-; CHECK-NEXT:    add v2.4s, v1.4s, v4.4s
-; CHECK-NEXT:    sub v1.4s, v4.4s, v1.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v5.4s
-; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
-; CHECK-NEXT:    sub v5.4s, v1.4s, v0.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v7.4s
+; CHECK-NEXT:    uzp2 v6.4s, v2.4s, v1.4s
+; CHECK-NEXT:    ext v17.16b, v2.16b, v2.16b, #12
+; CHECK-NEXT:    zip1 v4.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    mov v16.16b, v2.16b
-; CHECK-NEXT:    mov v4.16b, v3.16b
-; CHECK-NEXT:    mov v16.s[0], v2.s[1]
-; CHECK-NEXT:    rev64 v1.4s, v5.4s
-; CHECK-NEXT:    rev64 v6.4s, v0.4s
-; CHECK-NEXT:    mov v4.s[0], v3.s[1]
-; CHECK-NEXT:    mov v16.s[1], v2.s[0]
-; CHECK-NEXT:    add v17.4s, v5.4s, v1.4s
-; CHECK-NEXT:    add v18.4s, v0.4s, v6.4s
+; CHECK-NEXT:    mov v19.16b, v1.16b
+; CHECK-NEXT:    zip2 v5.4s, v1.4s, v2.4s
+; CHECK-NEXT:    zip2 v18.4s, v2.4s, v1.4s
+; CHECK-NEXT:    mov v16.s[0], v1.s[1]
+; CHECK-NEXT:    uzp2 v6.4s, v6.4s, v2.4s
+; CHECK-NEXT:    zip2 v7.4s, v0.4s, v3.4s
+; CHECK-NEXT:    ext v1.16b, v1.16b, v17.16b, #12
+; CHECK-NEXT:    ext v17.16b, v0.16b, v4.16b, #8
+; CHECK-NEXT:    mov v19.s[1], v2.s[0]
+; CHECK-NEXT:    mov v0.s[3], v3.s[2]
+; CHECK-NEXT:    mov v6.d[1], v7.d[1]
+; CHECK-NEXT:    mov v16.d[1], v4.d[1]
+; CHECK-NEXT:    mov v19.d[1], v17.d[1]
+; CHECK-NEXT:    mov v18.d[1], v0.d[1]
+; CHECK-NEXT:    mov v1.d[1], v7.d[1]
+; CHECK-NEXT:    mov v5.d[1], v0.d[1]
+; CHECK-NEXT:    add v0.4s, v16.4s, v19.4s
+; CHECK-NEXT:    add v4.4s, v6.4s, v18.4s
+; CHECK-NEXT:    rev64 v3.4s, v0.4s
 ; CHECK-NEXT:    sub v1.4s, v5.4s, v1.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v6.4s
-; CHECK-NEXT:    mov v4.s[1], v3.s[0]
-; CHECK-NEXT:    ext v5.16b, v18.16b, v0.16b, #4
-; CHECK-NEXT:    ext v6.16b, v17.16b, v1.16b, #4
-; CHECK-NEXT:    add v3.4s, v3.4s, v16.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    rev64 v16.4s, v3.4s
-; CHECK-NEXT:    rev64 v4.4s, v5.4s
-; CHECK-NEXT:    rev64 v5.4s, v6.4s
-; CHECK-NEXT:    rev64 v6.4s, v2.4s
-; CHECK-NEXT:    add v19.4s, v3.4s, v16.4s
-; CHECK-NEXT:    sub v3.4s, v3.4s, v16.4s
-; CHECK-NEXT:    rev64 v16.4s, v19.4s
-; CHECK-NEXT:    add v20.4s, v2.4s, v6.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v6.4s
-; CHECK-NEXT:    ext v6.16b, v19.16b, v19.16b, #4
-; CHECK-NEXT:    ext v19.16b, v20.16b, v2.16b, #4
-; CHECK-NEXT:    mov v17.s[3], v1.s[3]
-; CHECK-NEXT:    ext v5.16b, v1.16b, v5.16b, #12
-; CHECK-NEXT:    mov v18.s[3], v0.s[3]
-; CHECK-NEXT:    rev64 v19.4s, v19.4s
-; CHECK-NEXT:    ext v4.16b, v0.16b, v4.16b, #12
-; CHECK-NEXT:    mov v20.s[3], v2.s[3]
-; CHECK-NEXT:    sub v22.4s, v17.4s, v5.4s
-; CHECK-NEXT:    trn2 v16.4s, v16.4s, v3.4s
-; CHECK-NEXT:    trn2 v3.4s, v3.4s, v6.4s
-; CHECK-NEXT:    mov v17.s[0], v1.s[0]
-; CHECK-NEXT:    ext v1.16b, v2.16b, v19.16b, #12
-; CHECK-NEXT:    sub v21.4s, v18.4s, v4.4s
-; CHECK-NEXT:    mov v18.s[0], v0.s[0]
-; CHECK-NEXT:    ext v0.16b, v3.16b, v3.16b, #4
-; CHECK-NEXT:    add v3.4s, v17.4s, v5.4s
-; CHECK-NEXT:    sub v5.4s, v20.4s, v1.4s
-; CHECK-NEXT:    mov v20.s[0], v2.s[0]
-; CHECK-NEXT:    add v4.4s, v18.4s, v4.4s
-; CHECK-NEXT:    add v6.4s, v16.4s, v0.4s
-; CHECK-NEXT:    mov v4.d[1], v21.d[1]
-; CHECK-NEXT:    sub v0.4s, v16.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v20.4s, v1.4s
-; CHECK-NEXT:    movi v7.8h, #1
-; CHECK-NEXT:    mov v3.d[1], v22.d[1]
-; CHECK-NEXT:    mov v1.d[1], v5.d[1]
-; CHECK-NEXT:    mov v6.d[1], v0.d[1]
-; CHECK-NEXT:    ushr v2.4s, v4.4s, #15
-; CHECK-NEXT:    ushr v5.4s, v3.4s, #15
-; CHECK-NEXT:    and v0.16b, v2.16b, v7.16b
-; CHECK-NEXT:    ushr v17.4s, v1.4s, #15
-; CHECK-NEXT:    movi v2.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    ushr v16.4s, v6.4s, #15
-; CHECK-NEXT:    and v5.16b, v5.16b, v7.16b
-; CHECK-NEXT:    and v17.16b, v17.16b, v7.16b
-; CHECK-NEXT:    and v7.16b, v16.16b, v7.16b
-; CHECK-NEXT:    mul v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    mul v5.4s, v5.4s, v2.4s
-; CHECK-NEXT:    mul v7.4s, v7.4s, v2.4s
-; CHECK-NEXT:    mul v2.4s, v17.4s, v2.4s
-; CHECK-NEXT:    add v4.4s, v0.4s, v4.4s
-; CHECK-NEXT:    add v3.4s, v5.4s, v3.4s
-; CHECK-NEXT:    add v6.4s, v7.4s, v6.4s
-; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v2.16b, v3.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    rev64 v5.4s, v4.4s
+; CHECK-NEXT:    sub v2.4s, v19.4s, v16.4s
+; CHECK-NEXT:    mov v3.d[1], v0.d[1]
+; CHECK-NEXT:    add v6.4s, v1.4s, v2.4s
+; CHECK-NEXT:    sub v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    mov v5.d[1], v4.d[1]
+; CHECK-NEXT:    rev64 v2.4s, v1.4s
+; CHECK-NEXT:    rev64 v7.4s, v6.4s
+; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v5.4s
+; CHECK-NEXT:    add v4.4s, v1.4s, v2.4s
+; CHECK-NEXT:    add v16.4s, v6.4s, v7.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    sub v2.4s, v6.4s, v7.4s
+; CHECK-NEXT:    rev64 v6.4s, v3.4s
+; CHECK-NEXT:    rev64 v17.4s, v0.4s
+; CHECK-NEXT:    ext v7.16b, v4.16b, v1.16b, #4
+; CHECK-NEXT:    ext v5.16b, v16.16b, v2.16b, #4
+; CHECK-NEXT:    add v18.4s, v3.4s, v6.4s
+; CHECK-NEXT:    add v19.4s, v0.4s, v17.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v17.4s
+; CHECK-NEXT:    sub v3.4s, v3.4s, v6.4s
+; CHECK-NEXT:    rev64 v6.4s, v7.4s
+; CHECK-NEXT:    rev64 v7.4s, v18.4s
+; CHECK-NEXT:    ext v17.16b, v18.16b, v18.16b, #4
+; CHECK-NEXT:    ext v18.16b, v19.16b, v0.16b, #4
+; CHECK-NEXT:    rev64 v5.4s, v5.4s
+; CHECK-NEXT:    mov v16.s[3], v2.s[3]
+; CHECK-NEXT:    mov v4.s[3], v1.s[3]
+; CHECK-NEXT:    rev64 v18.4s, v18.4s
+; CHECK-NEXT:    mov v19.s[3], v0.s[3]
+; CHECK-NEXT:    ext v5.16b, v2.16b, v5.16b, #12
+; CHECK-NEXT:    ext v6.16b, v1.16b, v6.16b, #12
+; CHECK-NEXT:    trn2 v7.4s, v7.4s, v3.4s
+; CHECK-NEXT:    trn2 v3.4s, v3.4s, v17.4s
+; CHECK-NEXT:    ext v18.16b, v0.16b, v18.16b, #12
+; CHECK-NEXT:    sub v17.4s, v16.4s, v5.4s
+; CHECK-NEXT:    sub v20.4s, v4.4s, v6.4s
+; CHECK-NEXT:    ext v3.16b, v3.16b, v3.16b, #4
+; CHECK-NEXT:    mov v16.s[0], v2.s[0]
+; CHECK-NEXT:    sub v2.4s, v19.4s, v18.4s
+; CHECK-NEXT:    mov v4.s[0], v1.s[0]
+; CHECK-NEXT:    mov v19.s[0], v0.s[0]
+; CHECK-NEXT:    add v1.4s, v7.4s, v3.4s
+; CHECK-NEXT:    sub v0.4s, v7.4s, v3.4s
+; CHECK-NEXT:    add v3.4s, v4.4s, v6.4s
+; CHECK-NEXT:    add v4.4s, v16.4s, v5.4s
+; CHECK-NEXT:    add v5.4s, v19.4s, v18.4s
+; CHECK-NEXT:    mov v4.d[1], v17.d[1]
+; CHECK-NEXT:    mov v3.d[1], v20.d[1]
+; CHECK-NEXT:    mov v1.d[1], v0.d[1]
+; CHECK-NEXT:    mov v5.d[1], v2.d[1]
+; CHECK-NEXT:    movi v0.8h, #1
+; CHECK-NEXT:    movi v17.2d, #0x00ffff0000ffff
+; CHECK-NEXT:    ushr v2.4s, v1.4s, #15
+; CHECK-NEXT:    ushr v6.4s, v4.4s, #15
+; CHECK-NEXT:    ushr v7.4s, v5.4s, #15
+; CHECK-NEXT:    ushr v16.4s, v3.4s, #15
+; CHECK-NEXT:    and v6.16b, v6.16b, v0.16b
+; CHECK-NEXT:    and v16.16b, v16.16b, v0.16b
+; CHECK-NEXT:    and v7.16b, v7.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    mul v2.4s, v6.4s, v17.4s
+; CHECK-NEXT:    mul v6.4s, v16.4s, v17.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v17.4s
+; CHECK-NEXT:    mul v7.4s, v7.4s, v17.4s
+; CHECK-NEXT:    add v4.4s, v2.4s, v4.4s
+; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
+; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    add v5.4s, v7.4s, v5.4s
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v1.16b, v5.16b, v7.16b
+; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    add v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    lsr w9, w8, #16
@ -301,96 +298,94 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v7.4s
 ; CHECK-NEXT:    uzp2 v6.4s, v2.4s, v1.4s
-; CHECK-NEXT:    zip1 v7.4s, v0.4s, v3.4s
+; CHECK-NEXT:    ext v17.16b, v2.16b, v2.16b, #12
+; CHECK-NEXT:    zip1 v4.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    mov v16.16b, v2.16b
-; CHECK-NEXT:    mov v4.16b, v1.16b
-; CHECK-NEXT:    zip2 v17.4s, v2.4s, v1.4s
+; CHECK-NEXT:    mov v19.16b, v0.16b
+; CHECK-NEXT:    zip2 v5.4s, v1.4s, v2.4s
+; CHECK-NEXT:    zip2 v18.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    mov v16.s[0], v1.s[1]
 ; CHECK-NEXT:    uzp2 v6.4s, v6.4s, v2.4s
-; CHECK-NEXT:    zip2 v18.4s, v0.4s, v3.4s
-; CHECK-NEXT:    mov v4.s[1], v2.s[0]
-; CHECK-NEXT:    ext v19.16b, v0.16b, v7.16b, #8
-; CHECK-NEXT:    mov v0.s[3], v3.s[2]
-; CHECK-NEXT:    zip2 v5.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ext v2.16b, v2.16b, v2.16b, #12
-; CHECK-NEXT:    mov v16.d[1], v7.d[1]
-; CHECK-NEXT:    mov v4.d[1], v19.d[1]
-; CHECK-NEXT:    mov v6.d[1], v18.d[1]
-; CHECK-NEXT:    mov v17.d[1], v0.d[1]
-; CHECK-NEXT:    ext v1.16b, v1.16b, v2.16b, #12
-; CHECK-NEXT:    add v2.4s, v16.4s, v4.4s
-; CHECK-NEXT:    mov v5.d[1], v0.d[1]
-; CHECK-NEXT:    add v6.4s, v6.4s, v17.4s
-; CHECK-NEXT:    mov v3.16b, v2.16b
-; CHECK-NEXT:    mov v0.16b, v6.16b
-; CHECK-NEXT:    mov v3.s[0], v2.s[1]
-; CHECK-NEXT:    mov v0.s[0], v6.s[1]
-; CHECK-NEXT:    mov v1.d[1], v18.d[1]
-; CHECK-NEXT:    sub v4.4s, v4.4s, v16.4s
-; CHECK-NEXT:    mov v3.s[1], v2.s[0]
-; CHECK-NEXT:    mov v0.s[1], v6.s[0]
-; CHECK-NEXT:    sub v1.4s, v5.4s, v1.4s
-; CHECK-NEXT:    add v5.4s, v1.4s, v4.4s
-; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
-; CHECK-NEXT:    sub v1.4s, v4.4s, v1.4s
-; CHECK-NEXT:    sub v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    zip1 v6.4s, v3.4s, v5.4s
-; CHECK-NEXT:    uzp2 v2.4s, v3.4s, v5.4s
+; CHECK-NEXT:    ext v7.16b, v1.16b, v17.16b, #12
+; CHECK-NEXT:    zip2 v17.4s, v0.4s, v3.4s
+; CHECK-NEXT:    ext v0.16b, v0.16b, v4.16b, #8
+; CHECK-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-NEXT:    mov v19.s[3], v3.s[2]
+; CHECK-NEXT:    mov v6.d[1], v17.d[1]
+; CHECK-NEXT:    mov v16.d[1], v4.d[1]
+; CHECK-NEXT:    mov v1.d[1], v0.d[1]
+; CHECK-NEXT:    mov v18.d[1], v19.d[1]
+; CHECK-NEXT:    mov v7.d[1], v17.d[1]
+; CHECK-NEXT:    mov v5.d[1], v19.d[1]
+; CHECK-NEXT:    add v0.4s, v16.4s, v1.4s
+; CHECK-NEXT:    add v3.4s, v6.4s, v18.4s
+; CHECK-NEXT:    rev64 v2.4s, v0.4s
+; CHECK-NEXT:    sub v4.4s, v5.4s, v7.4s
+; CHECK-NEXT:    rev64 v5.4s, v3.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v16.4s
+; CHECK-NEXT:    mov v2.d[1], v0.d[1]
+; CHECK-NEXT:    add v6.4s, v4.4s, v1.4s
+; CHECK-NEXT:    mov v5.d[1], v3.d[1]
+; CHECK-NEXT:    sub v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v5.4s
+; CHECK-NEXT:    zip1 v4.4s, v2.4s, v6.4s
+; CHECK-NEXT:    uzp2 v5.4s, v2.4s, v6.4s
 ; CHECK-NEXT:    zip1 v7.4s, v0.4s, v1.4s
-; CHECK-NEXT:    mov v16.16b, v3.16b
-; CHECK-NEXT:    zip2 v4.4s, v3.4s, v5.4s
-; CHECK-NEXT:    mov v16.s[1], v5.s[1]
-; CHECK-NEXT:    mov v5.16b, v0.16b
-; CHECK-NEXT:    trn2 v6.4s, v3.4s, v6.4s
+; CHECK-NEXT:    mov v16.16b, v2.16b
+; CHECK-NEXT:    zip2 v3.4s, v2.4s, v6.4s
+; CHECK-NEXT:    mov v16.s[1], v6.s[1]
+; CHECK-NEXT:    mov v6.16b, v0.16b
+; CHECK-NEXT:    trn2 v4.4s, v2.4s, v4.4s
 ; CHECK-NEXT:    zip2 v17.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v7.16b, #8
-; CHECK-NEXT:    uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT:    mov v5.s[3], v1.s[2]
+; CHECK-NEXT:    uzp2 v2.4s, v5.4s, v2.4s
+; CHECK-NEXT:    mov v6.s[3], v1.s[2]
 ; CHECK-NEXT:    mov v16.d[1], v7.d[1]
-; CHECK-NEXT:    mov v6.d[1], v0.d[1]
+; CHECK-NEXT:    mov v4.d[1], v0.d[1]
 ; CHECK-NEXT:    mov v2.d[1], v17.d[1]
-; CHECK-NEXT:    mov v4.d[1], v5.d[1]
+; CHECK-NEXT:    mov v3.d[1], v6.d[1]
 ; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    add v1.4s, v16.4s, v6.4s
-; CHECK-NEXT:    sub v3.4s, v6.4s, v16.4s
-; CHECK-NEXT:    add v6.4s, v4.4s, v2.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    zip2 v4.4s, v3.4s, v1.4s
-; CHECK-NEXT:    zip2 v7.4s, v2.4s, v6.4s
+; CHECK-NEXT:    add v1.4s, v16.4s, v4.4s
+; CHECK-NEXT:    sub v4.4s, v4.4s, v16.4s
+; CHECK-NEXT:    add v6.4s, v3.4s, v2.4s
+; CHECK-NEXT:    sub v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #4
+; CHECK-NEXT:    zip2 v3.4s, v4.4s, v1.4s
+; CHECK-NEXT:    zip2 v7.4s, v2.4s, v6.4s
 ; CHECK-NEXT:    ext v17.16b, v6.16b, v6.16b, #4
-; CHECK-NEXT:    zip1 v16.4s, v1.4s, v3.4s
-; CHECK-NEXT:    zip2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    add v4.4s, v4.4s, v7.4s
+; CHECK-NEXT:    zip1 v16.4s, v1.4s, v4.4s
+; CHECK-NEXT:    zip2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    add v3.4s, v3.4s, v7.4s
 ; CHECK-NEXT:    zip2 v7.4s, v6.4s, v2.4s
-; CHECK-NEXT:    ext v3.16b, v5.16b, v3.16b, #8
 ; CHECK-NEXT:    zip1 v6.4s, v6.4s, v2.4s
+; CHECK-NEXT:    ext v4.16b, v5.16b, v4.16b, #8
 ; CHECK-NEXT:    ext v2.16b, v17.16b, v2.16b, #8
 ; CHECK-NEXT:    sub v1.4s, v7.4s, v1.4s
-; CHECK-NEXT:    movi v7.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    ext v3.16b, v3.16b, v5.16b, #4
-; CHECK-NEXT:    ushr v5.4s, v4.4s, #15
-; CHECK-NEXT:    ext v2.16b, v2.16b, v17.16b, #4
-; CHECK-NEXT:    ushr v17.4s, v1.4s, #15
-; CHECK-NEXT:    and v5.16b, v5.16b, v0.16b
-; CHECK-NEXT:    mul v5.4s, v5.4s, v7.4s
 ; CHECK-NEXT:    sub v6.4s, v6.4s, v16.4s
-; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    ext v4.16b, v4.16b, v5.16b, #4
+; CHECK-NEXT:    ext v2.16b, v2.16b, v17.16b, #4
+; CHECK-NEXT:    movi v7.2d, #0x00ffff0000ffff
+; CHECK-NEXT:    ushr v5.4s, v3.4s, #15
 ; CHECK-NEXT:    ushr v16.4s, v6.4s, #15
-; CHECK-NEXT:    add v3.4s, v5.4s, v4.4s
+; CHECK-NEXT:    ushr v17.4s, v1.4s, #15
+; CHECK-NEXT:    add v2.4s, v4.4s, v2.4s
+; CHECK-NEXT:    and v5.16b, v5.16b, v0.16b
 ; CHECK-NEXT:    ushr v4.4s, v2.4s, #15
 ; CHECK-NEXT:    and v17.16b, v17.16b, v0.16b
 ; CHECK-NEXT:    and v16.16b, v16.16b, v0.16b
 ; CHECK-NEXT:    and v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    mul v5.4s, v5.4s, v7.4s
 ; CHECK-NEXT:    mul v16.4s, v16.4s, v7.4s
 ; CHECK-NEXT:    mul v17.4s, v17.4s, v7.4s
 ; CHECK-NEXT:    mul v0.4s, v0.4s, v7.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v5.16b
+; CHECK-NEXT:    add v3.4s, v5.4s, v3.4s
 ; CHECK-NEXT:    add v6.4s, v16.4s, v6.4s
 ; CHECK-NEXT:    add v1.4s, v17.4s, v1.4s
 ; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    eor v4.16b, v6.16b, v16.16b
 ; CHECK-NEXT:    eor v1.16b, v1.16b, v17.16b
+; CHECK-NEXT:    eor v3.16b, v3.16b, v5.16b
 ; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
 ; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v4.4s
--- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
+++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
@ -517,9 +517,10 @@ define <8 x i16> @shuffle3_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
 define <4 x i32> @shuffle3_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: shuffle3_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-NEXT:    trn1 v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-NEXT:    mov v1.s[2], v2.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
  %x = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %y = shufflevector <4 x i32> %c, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
--- a/llvm/test/CodeGen/AArch64/shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/shuffles.ll
@ -4,19 +4,19 @@
 define <16 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: test_shuf1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v3.4s, v4.s[0]
-; CHECK-NEXT:    ext v5.16b, v6.16b, v1.16b, #4
-; CHECK-NEXT:    uzp1 v16.4s, v1.4s, v0.4s
-; CHECK-NEXT:    uzp2 v17.4s, v2.4s, v4.4s
-; CHECK-NEXT:    mov v3.s[0], v6.s[3]
-; CHECK-NEXT:    trn2 v4.4s, v1.4s, v5.4s
-; CHECK-NEXT:    trn2 v1.4s, v16.4s, v1.4s
-; CHECK-NEXT:    trn1 v2.4s, v17.4s, v2.4s
-; CHECK-NEXT:    mov v3.s[2], v7.s[3]
+; CHECK-NEXT:    ext v16.16b, v6.16b, v1.16b, #4
+; CHECK-NEXT:    dup v5.4s, v4.s[0]
+; CHECK-NEXT:    uzp1 v17.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp2 v18.4s, v2.4s, v4.4s
+; CHECK-NEXT:    rev64 v3.4s, v7.4s
+; CHECK-NEXT:    trn2 v4.4s, v1.4s, v16.4s
+; CHECK-NEXT:    mov v5.s[0], v6.s[3]
+; CHECK-NEXT:    trn2 v1.4s, v17.4s, v1.4s
+; CHECK-NEXT:    trn1 v2.4s, v18.4s, v2.4s
 ; CHECK-NEXT:    mov v4.s[0], v7.s[1]
+; CHECK-NEXT:    mov v3.d[0], v5.d[0]
 ; CHECK-NEXT:    ext v1.16b, v0.16b, v1.16b, #12
 ; CHECK-NEXT:    mov v2.s[3], v7.s[0]
-; CHECK-NEXT:    mov v3.s[3], v7.s[2]
 ; CHECK-NEXT:    mov v0.16b, v4.16b
 ; CHECK-NEXT:    ret
  %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 29, i32 26, i32 7, i32 4, i32 3, i32 6, i32 5, i32 2, i32 9, i32 8, i32 17, i32 28, i32 27, i32 16, i32 31, i32 30>
@ -26,10 +26,10 @@ define <16 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) {
 define <4 x i32> @test_shuf2(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: test_shuf2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 v0.4s, v7.4s, v6.4s
-; CHECK-NEXT:    trn2 v0.4s, v7.4s, v0.4s
-; CHECK-NEXT:    mov v0.s[2], v1.s[3]
-; CHECK-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-NEXT:    zip2 v2.4s, v7.4s, v6.4s
+; CHECK-NEXT:    ext v0.16b, v1.16b, v1.16b, #4
+; CHECK-NEXT:    trn2 v1.4s, v7.4s, v2.4s
+; CHECK-NEXT:    mov v0.d[0], v1.d[0]
 ; CHECK-NEXT:    ret
  %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> <i32 29, i32 26, i32 7, i32 4>
  ret <4 x i32> %s3
@ -60,9 +60,9 @@ define <4 x i32> @test_shuf4(<16 x i32> %x, <16 x i32> %y) {
 define <4 x i32> @test_shuf5(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: test_shuf5:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v0.16b, v6.16b, v4.16b, #12
-; CHECK-NEXT:    mov v0.s[2], v7.s[3]
-; CHECK-NEXT:    mov v0.s[3], v7.s[2]
+; CHECK-NEXT:    rev64 v0.4s, v7.4s
+; CHECK-NEXT:    ext v1.16b, v6.16b, v4.16b, #12
+; CHECK-NEXT:    mov v0.d[0], v1.d[0]
 ; CHECK-NEXT:    ret
  %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> <i32 27, i32 16, i32 31, i32 30>
  ret <4 x i32> %s3
@ -96,9 +96,10 @@ define <4 x i32> @test4366(<4 x i32> %a, <4 x i32> %b)
 define <4 x i32> @test7367(<4 x i32> %a, <4 x i32> %b)
 ; CHECK-LABEL: test7367:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v1.s[0], v1.s[3]
-; CHECK-NEXT:    mov v1.s[1], v0.s[3]
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v2.16b, v1.16b
+; CHECK-NEXT:    mov v2.d[0], v0.d[1]
+; CHECK-NEXT:    mov v2.s[0], v1.s[3]
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
 {
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 3, i32 6, i32 7>
@ -108,9 +109,8 @@ define <4 x i32> @test7367(<4 x i32> %a, <4 x i32> %b)
 define <4 x i32> @test4045(<4 x i32> %a, <4 x i32> %b)
 ; CHECK-LABEL: test4045:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v2.16b, v0.16b, v1.16b, #4
-; CHECK-NEXT:    ext v0.16b, v2.16b, v0.16b, #4
-; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; CHECK-NEXT:    trn1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 {
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 4, i32 5>
@ -120,9 +120,8 @@ define <4 x i32> @test4045(<4 x i32> %a, <4 x i32> %b)
 define <4 x i32> @test0067(<4 x i32> %a, <4 x i32> %b)
 ; CHECK-LABEL: test0067:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v1.s[0], v0.s[0]
-; CHECK-NEXT:    mov v1.s[1], v0.s[0]
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    trn1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
 ; CHECK-NEXT:    ret
 {
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 7>
--- a/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll
@ -88,7 +88,7 @@ define i32 @reduceshuffle_twoin_concat_v4i32(<2 x i32> %a, <2 x i32> %b) {

 define i32 @reduceshuffle_twoin_lowelts_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_lowelts_v4i32(
-; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 5, i32 1, i32 4>
+; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@ -193,7 +193,7 @@ define i32 @reduceshuffle_twoin_extraotheruse_v4i32(<4 x i32> %a, <4 x i32> %b)

 define i32 @reduceshuffle_twoin_splat_v4i32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
 ; CHECK-LABEL: @reduceshuffle_twoin_splat_v4i32(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 5, i32 1, i32 4>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
 ; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[INSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[X:%.*]] = xor <4 x i32> [[S]], [[SPLAT]]
--- a/llvm/utils/PerfectShuffle/PerfectShuffle.cpp
+++ b/llvm/utils/PerfectShuffle/PerfectShuffle.cpp
@ -326,6 +326,24 @@ int main() {
            ShufTab[i].Arg1 = LaneIdx;
          }
        }
+
+        // Similar idea for using a D register mov, masking out 2 lanes to undef
+        for (unsigned LaneIdx = 0; LaneIdx < 4; LaneIdx += 2) {
+          unsigned Ln0 = getMaskElt(i, LaneIdx);
+          unsigned Ln1 = getMaskElt(i, LaneIdx + 1);
+          if ((Ln0 == 0 && Ln1 == 1) || (Ln0 == 2 && Ln1 == 3) ||
+              (Ln0 == 4 && Ln1 == 5) || (Ln0 == 6 && Ln1 == 7)) {
+            unsigned NewElt = setMaskElt(i, LaneIdx, 8);
+            NewElt = setMaskElt(NewElt, LaneIdx + 1, 8);
+            if (ShufTab[NewElt].Cost + 1 < ShufTab[i].Cost) {
+              MadeChange = true;
+              ShufTab[i].Cost = ShufTab[NewElt].Cost + 1;
+              ShufTab[i].Op = &InsOp;
+              ShufTab[i].Arg0 = NewElt;
+              ShufTab[i].Arg1 = (LaneIdx >> 1) | 0x4;
+            }
+          }
+        }
      }
 #endif
    }