[AArch64] Fix condition for "high-vector" DUP optimizations.

AArch64 NEON has a bunch of instructions with a "2" suffix that extract the top half of the source vectors, instead of the bottom half. We have some DAGCombines to try to take advantage of that. However, they assumed that any EXTRACT_VECTOR was extracting the high half of the vector in question. This issue has apparently existed since the AArch64 backend was merged. Fixes https://bugs.llvm.org/show_bug.cgi?id=40632 . Differential Revision: https://reviews.llvm.org/D57862 llvm-svn: 353486
2024-12-06 10:58:44 +00:00 · 2019-02-08 00:23:35 +00:00 · 2019-02-08 00:23:35 +00:00 · b87d675297
commit b87d675297
parent 22c328969d
4 changed files with 263 additions and 23 deletions
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -9722,12 +9722,13 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
                     DAG.getConstant(NumElems, dl, MVT::i64));
 }

-static bool isEssentiallyExtractSubvector(SDValue N) {
-  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
-    return true;
-
-  return N.getOpcode() == ISD::BITCAST &&
-         N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
+static bool isEssentiallyExtractHighSubvector(SDValue N) {
+  if (N.getOpcode() == ISD::BITCAST)
+    N = N.getOperand(0);
+  if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return false;
+  return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
+         N.getOperand(0).getValueType().getVectorNumElements() / 2;
 }

 /// Helper structure to keep track of ISD::SET_CC operands.
@ -9894,13 +9895,13 @@ static SDValue performAddSubLongCombine(SDNode *N,

  // It's not worth doing if at least one of the inputs isn't already an
  // extract, but we don't know which it'll be so we have to try both.
-  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
+  if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
    if (!RHS.getNode())
      return SDValue();

    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
-  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
+  } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
    if (!LHS.getNode())
      return SDValue();
@ -9933,11 +9934,11 @@ static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
  // Either node could be a DUP, but it's not worth doing both of them (you'd
  // just as well use the non-high version) so look for a corresponding extract
  // operation on the other "wing".
-  if (isEssentiallyExtractSubvector(LHS)) {
+  if (isEssentiallyExtractHighSubvector(LHS)) {
    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
    if (!RHS.getNode())
      return SDValue();
-  } else if (isEssentiallyExtractSubvector(RHS)) {
+  } else if (isEssentiallyExtractHighSubvector(RHS)) {
    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
    if (!LHS.getNode())
      return SDValue();
--- a/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/test/CodeGen/AArch64/arm64-vabs.ll
@ -885,6 +885,20 @@ declare double @llvm.fabs.f64(double) nounwind readnone
 define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: uabdl_from_extract_dup:
 ; CHECK-NOT: ext.16b
+; CHECK: uabdl.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+  %res1 = zext <2 x i32> %res to <2 x i64>
+  ret <2 x i64> %res1
+}
+
+define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uabdl2_from_extract_dup:
+; CHECK-NOT: ext.16b
 ; CHECK: uabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@ -899,6 +913,20 @@ define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
 define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: sabdl_from_extract_dup:
 ; CHECK-NOT: ext.16b
+; CHECK: sabdl.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+  %res1 = zext <2 x i32> %res to <2 x i64>
+  ret <2 x i64> %res1
+}
+
+define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: sabdl2_from_extract_dup:
+; CHECK-NOT: ext.16b
 ; CHECK: sabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
--- a/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/test/CodeGen/AArch64/arm64-vadd.ll
@ -738,6 +738,22 @@ declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) noun
 declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
 declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone

+define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uaddl_duprhs
+; CHECK-NOT: ext.16b
+; CHECK: uaddl.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+  %res = add <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
 define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: uaddl2_duprhs
 ; CHECK-NOT: ext.16b
@ -754,6 +770,22 @@ define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
  ret <2 x i64> %res
 }

+define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: saddl_duplhs
+; CHECK-NOT: ext.16b
+; CHECK: saddl.2d
+  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+  %res = add <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
 define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
 ; CHECK-LABEL: saddl2_duplhs
 ; CHECK-NOT: ext.16b
@ -770,6 +802,22 @@ define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
  ret <2 x i64> %res
 }

+define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: usubl_duprhs
+; CHECK-NOT: ext.16b
+; CHECK: usubl.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+  %res = sub <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
 define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: usubl2_duprhs
 ; CHECK-NOT: ext.16b
@ -786,8 +834,24 @@ define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
  ret <2 x i64> %res
 }

+define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: ssubl_duplhs:
+; CHECK-NOT: ext.16b
+; CHECK: ssubl.2d
+  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+  %res = sub <2 x i64> %lhs.ext, %rhs.ext
+  ret <2 x i64> %res
+}
+
 define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: ssubl2_duplhs
+; CHECK-LABEL: ssubl2_duplhs:
 ; CHECK-NOT: ext.16b
 ; CHECK: ssubl2.2d
  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
--- a/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/test/CodeGen/AArch64/arm64-vmul.ll
@ -1338,6 +1338,19 @@ entry:
  ret <4 x i32> %vmull2.i
 }

+define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo6a:
+; CHECK-NEXT: smull.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+  %0 = bitcast <8 x i16> %b to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+  ret <4 x i32> %vmull2.i
+}
+
 define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo7:
 ; CHECK-NEXT: smull2.2d v0, v1, v2[1]
@ -1351,6 +1364,20 @@ entry:
  ret <2 x i64> %vmull2.i
 }

+define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo7a:
+; CHECK-NEXT: smull.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+  %0 = bitcast <4 x i32> %b to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+  ret <2 x i64> %vmull2.i
+}
+
+
 define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo8:
 ; CHECK-NEXT: umull2.4s v0, v1, v2[1]
@ -1364,6 +1391,19 @@ entry:
  ret <4 x i32> %vmull2.i
 }

+define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo8a:
+; CHECK-NEXT: umull.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+  %0 = bitcast <8 x i16> %b to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+  ret <4 x i32> %vmull2.i
+}
+
 define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo9:
 ; CHECK-NEXT: umull2.2d v0, v1, v2[1]
@ -1377,6 +1417,19 @@ entry:
  ret <2 x i64> %vmull2.i
 }

+define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo9a:
+; CHECK-NEXT: umull.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+  %0 = bitcast <4 x i32> %b to <2 x i64>
+  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+  ret <2 x i64> %vmull2.i
+}
+
 define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
 ; CHECK-LABEL: bar0:
 ; CHECK: smlal2.8h v0, v1, v2
@ -1667,6 +1720,24 @@ entry:
  ret <2 x i64> %vmull2.i
 }

+define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK: vmull_low_n_s16_test
+; CHECK-NOT: ext
+; CHECK: smull.4s
+; CHECK-NEXT: ret
+  %conv = trunc i32 %d to i16
+  %0 = bitcast <8 x i16> %b to <2 x i64>
+  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
+  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
+  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
+  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+  ret <4 x i32> %vmull2.i.i
+}
+
 define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
 entry:
 ; CHECK: vmull_high_n_s16_test
@ -1804,8 +1875,21 @@ define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x
  ret <2 x i64> %sum
 }

-define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: mull_from_extract_dup:
+define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: mull_from_extract_dup_low:
+; CHECK-NOT: ext
+; CHECK: sqdmull.2d
+  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: mull_from_extract_dup_high:
 ; CHECK-NOT: ext
 ; CHECK: sqdmull2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
@ -1817,8 +1901,21 @@ define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
  ret <2 x i64> %res
 }

-define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {
-; CHECK-LABEL: pmull_from_extract_dup:
+define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: pmull_from_extract_dup_low:
+; CHECK-NOT: ext
+; CHECK: pmull.8h
+  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
+  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: pmull_from_extract_dup_high:
 ; CHECK-NOT: ext
 ; CHECK: pmull2.8h
  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
@ -1830,8 +1927,20 @@ define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {
  ret <8 x i16> %res
 }

-define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK-LABEL: pmull_from_extract_duplane:
+define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: pmull_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: pmull.8h
+
+  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: pmull_from_extract_duplane_high:
 ; CHECK-NOT: ext
 ; CHECK: pmull2.8h

@ -1842,8 +1951,20 @@ define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {
  ret <8 x i16> %res
 }

-define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: sqdmull_from_extract_duplane:
+define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmull_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: sqdmull.2d
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmull_from_extract_duplane_high:
 ; CHECK-NOT: ext
 ; CHECK: sqdmull2.2d

@ -1854,8 +1975,21 @@ define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {
  ret <2 x i64> %res
 }

-define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: sqdmlal_from_extract_duplane:
+define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: sqdmlal.2d
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+  ret <2 x i64> %sum
+}
+
+define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmlal_from_extract_duplane_high:
 ; CHECK-NOT: ext
 ; CHECK: sqdmlal2.2d

@ -1867,8 +2001,21 @@ define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs,
  ret <2 x i64> %sum
 }

-define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: umlal_from_extract_duplane:
+define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: umlal_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: umlal.2d
+
+  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+  %sum = add <2 x i64> %accum, %res
+  ret <2 x i64> %sum
+}
+
+define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: umlal_from_extract_duplane_high:
 ; CHECK-NOT: ext
 ; CHECK: umlal2.2d