[AArch64] Fix condition for "high-vector" DUP optimizations.

AArch64 NEON has a bunch of instructions with a "2" suffix that extract
the top half of the source vectors, instead of the bottom half.  We have
some DAGCombines to try to take advantage of that.  However, they
assumed that any EXTRACT_SUBVECTOR was extracting the high half of the
vector in question.

This issue has apparently existed since the AArch64 backend was merged.

Fixes https://bugs.llvm.org/show_bug.cgi?id=40632 .

Differential Revision: https://reviews.llvm.org/D57862

llvm-svn: 353486
This commit is contained in:
Eli Friedman 2019-02-08 00:23:35 +00:00
parent 22c328969d
commit b87d675297
4 changed files with 263 additions and 23 deletions

View File

@ -9722,12 +9722,13 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
DAG.getConstant(NumElems, dl, MVT::i64));
}
static bool isEssentiallyExtractSubvector(SDValue N) {
if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
return true;
return N.getOpcode() == ISD::BITCAST &&
N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
/// Returns true when \p N is an ISD::EXTRACT_SUBVECTOR -- optionally seen
/// through a single ISD::BITCAST -- whose index operand equals half the
/// element count of the vector being extracted from, i.e. the extract starts
/// at the midpoint of its source vector.
static bool isEssentiallyExtractHighSubvector(SDValue N) {
// Look through one bitcast so the test applies to the node underneath it.
if (N.getOpcode() == ISD::BITCAST)
N = N.getOperand(0);
if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
// Operand 0 is the source vector and operand 1 the start index. cast<> (not
// dyn_cast<>) is used, so the index is presumed to be a ConstantSDNode.
return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
N.getOperand(0).getValueType().getVectorNumElements() / 2;
}
/// Helper structure to keep track of ISD::SET_CC operands.
@ -9894,13 +9895,13 @@ static SDValue performAddSubLongCombine(SDNode *N,
// It's not worth doing if at least one of the inputs isn't already an
// extract, but we don't know which it'll be so we have to try both.
if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
if (!RHS.getNode())
return SDValue();
RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
} else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
} else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
if (!LHS.getNode())
return SDValue();
@ -9933,11 +9934,11 @@ static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
// Either node could be a DUP, but it's not worth doing both of them (you'd
// just as well use the non-high version) so look for a corresponding extract
// operation on the other "wing".
if (isEssentiallyExtractSubvector(LHS)) {
if (isEssentiallyExtractHighSubvector(LHS)) {
RHS = tryExtendDUPToExtractHigh(RHS, DAG);
if (!RHS.getNode())
return SDValue();
} else if (isEssentiallyExtractSubvector(RHS)) {
} else if (isEssentiallyExtractHighSubvector(RHS)) {
LHS = tryExtendDUPToExtractHigh(LHS, DAG);
if (!LHS.getNode())
return SDValue();

View File

@ -885,6 +885,20 @@ declare double @llvm.fabs.f64(double) nounwind readnone
; Low-half variant: the shuffle mask <0, 1> selects lanes 0-1 of %lhs, so
; despite its name %lhs.high holds the LOW half. %rhsvec carries %rhs in both
; lanes. The directives below require the plain uabdl.2d form and forbid any
; ext.16b before it.
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
%res1 = zext <2 x i32> %res to <2 x i64>
ret <2 x i64> %res1
}
define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl2_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@ -899,6 +913,20 @@ define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; Low-half variant: the shuffle mask <0, 1> selects lanes 0-1 of %lhs, so
; despite its name %lhs.high holds the LOW half. %rhsvec carries %rhs in both
; lanes. The directives below require the plain sabdl.2d form and forbid any
; ext.16b before it.
define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
%res1 = zext <2 x i32> %res to <2 x i64>
ret <2 x i64> %res1
}
define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl2_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

View File

@ -738,6 +738,22 @@ declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) noun
declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
; Zero-extending add of the low half of %lhs (mask <0, 1>; the value is named
; %lhs.high but holds lanes 0-1) with %rhs duplicated into both lanes.
; The directives below require uaddl.2d with no ext.16b before it.
define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uaddl_duprhs
; CHECK-NOT: ext.16b
; CHECK: uaddl.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
%rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
%res = add <2 x i64> %lhs.ext, %rhs.ext
ret <2 x i64> %res
}
define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uaddl2_duprhs
; CHECK-NOT: ext.16b
@ -754,6 +770,22 @@ define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
ret <2 x i64> %res
}
; Sign-extending add of %lhs duplicated into both lanes with the low half of
; %rhs (mask <0, 1>; the value is named %rhs.high but holds lanes 0-1).
; The directives below require saddl.2d with no ext.16b before it.
define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: saddl_duplhs
; CHECK-NOT: ext.16b
; CHECK: saddl.2d
%lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
%lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
%rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
%rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
%res = add <2 x i64> %lhs.ext, %rhs.ext
ret <2 x i64> %res
}
define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: saddl2_duplhs
; CHECK-NOT: ext.16b
@ -770,6 +802,22 @@ define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
ret <2 x i64> %res
}
; Zero-extending subtract: low half of %lhs (mask <0, 1>; the value is named
; %lhs.high but holds lanes 0-1) minus %rhs duplicated into both lanes.
; The directives below require usubl.2d with no ext.16b before it.
define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: usubl_duprhs
; CHECK-NOT: ext.16b
; CHECK: usubl.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
%rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
%res = sub <2 x i64> %lhs.ext, %rhs.ext
ret <2 x i64> %res
}
define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: usubl2_duprhs
; CHECK-NOT: ext.16b
@ -786,8 +834,24 @@ define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
ret <2 x i64> %res
}
; Sign-extending subtract: %lhs duplicated into both lanes minus the low half
; of %rhs (mask <0, 1>; the value is named %rhs.high but holds lanes 0-1).
; The directives below require ssubl.2d with no ext.16b before it.
define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: ssubl_duplhs:
; CHECK-NOT: ext.16b
; CHECK: ssubl.2d
%lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
%lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
%rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
%rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
%res = sub <2 x i64> %lhs.ext, %rhs.ext
ret <2 x i64> %res
}
define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: ssubl2_duplhs
; CHECK-LABEL: ssubl2_duplhs:
; CHECK-NOT: ext.16b
; CHECK: ssubl2.2d
%lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0

View File

@ -1338,6 +1338,19 @@ entry:
ret <4 x i32> %vmull2.i
}
; Signed widening multiply of the low 64 bits of %b (element 0 of %b viewed as
; <2 x i64>, bitcast back to <4 x i16>) by lane 1 of %c splatted to all lanes.
; The directives below expect the non-"2" by-element form, smull.4s with
; v2[1], immediately followed by ret.
define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo6a:
; CHECK-NEXT: smull.4s v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
%1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
%shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
ret <4 x i32> %vmull2.i
}
define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo7:
; CHECK-NEXT: smull2.2d v0, v1, v2[1]
@ -1351,6 +1364,20 @@ entry:
ret <2 x i64> %vmull2.i
}
; Signed widening multiply of the low 64 bits of %b (element 0 of %b viewed as
; <2 x i64>, bitcast back to <2 x i32>) by lane 1 of %c splatted to both lanes.
; The directives below expect the non-"2" by-element form, smull.2d with
; v2[1], immediately followed by ret.
define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo7a:
; CHECK-NEXT: smull.2d v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
%1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
%shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
ret <2 x i64> %vmull2.i
}
define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo8:
; CHECK-NEXT: umull2.4s v0, v1, v2[1]
@ -1364,6 +1391,19 @@ entry:
ret <4 x i32> %vmull2.i
}
; Unsigned widening multiply of the low 64 bits of %b (element 0 of %b viewed
; as <2 x i64>, bitcast back to <4 x i16>) by lane 1 of %c splatted to all
; lanes. The directives below expect the non-"2" by-element form, umull.4s
; with v2[1], immediately followed by ret.
define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo8a:
; CHECK-NEXT: umull.4s v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
%1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
%shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
ret <4 x i32> %vmull2.i
}
define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo9:
; CHECK-NEXT: umull2.2d v0, v1, v2[1]
@ -1377,6 +1417,19 @@ entry:
ret <2 x i64> %vmull2.i
}
; Unsigned widening multiply of the low 64 bits of %b (element 0 of %b viewed
; as <2 x i64>, bitcast back to <2 x i32>) by lane 1 of %c splatted to both
; lanes. The directives below expect the non-"2" by-element form, umull.2d
; with v2[1], immediately followed by ret.
define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo9a:
; CHECK-NEXT: umull.2d v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
%1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
%shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
ret <2 x i64> %vmull2.i
}
define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
; CHECK-LABEL: bar0:
; CHECK: smlal2.8h v0, v1, v2
@ -1667,6 +1720,24 @@ entry:
ret <2 x i64> %vmull2.i
}
; Signed widening multiply of the low 64 bits of %b (element 0 of %b viewed as
; <2 x i64>) by %d truncated to i16 and inserted into all four lanes.
; The directives below require smull.4s directly followed by ret, with no
; "ext" emitted before it.
define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
entry:
; CHECK: vmull_low_n_s16_test
; CHECK-NOT: ext
; CHECK: smull.4s
; CHECK-NEXT: ret
%conv = trunc i32 %d to i16
%0 = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
%1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
%vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
%vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
%vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
%vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
%vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
ret <4 x i32> %vmull2.i.i
}
define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
entry:
; CHECK: vmull_high_n_s16_test
@ -1804,8 +1875,21 @@ define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x
ret <2 x i64> %sum
}
define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: mull_from_extract_dup:
; sqdmull of the low half of %lhs (mask <0, 1>; the value is named %lhs.high
; but holds lanes 0-1) with %rhs duplicated into both lanes.
; The directives below require the plain sqdmull.2d form with no "ext".
define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: mull_from_extract_dup_low:
; CHECK-NOT: ext
; CHECK: sqdmull.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
ret <2 x i64> %res
}
define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: mull_from_extract_dup_high:
; CHECK-NOT: ext
; CHECK: sqdmull2.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
@ -1817,8 +1901,21 @@ define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
ret <2 x i64> %res
}
define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {
; CHECK-LABEL: pmull_from_extract_dup:
; pmull of the low half of %lhs (mask <0..7>; the value is named %lhs.high but
; holds lanes 0-7) with %rhs inserted in lane 0 and splatted to all 8 lanes.
; The directives below require the plain pmull.8h form with no "ext".
define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
; CHECK-LABEL: pmull_from_extract_dup_low:
; CHECK-NOT: ext
; CHECK: pmull.8h
%rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
%rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
%lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
ret <8 x i16> %res
}
define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
; CHECK-LABEL: pmull_from_extract_dup_high:
; CHECK-NOT: ext
; CHECK: pmull2.8h
%rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
@ -1830,8 +1927,20 @@ define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {
ret <8 x i16> %res
}
define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-LABEL: pmull_from_extract_duplane:
; pmull of the low half of %lhs (mask <0..7>; named %lhs.high but holds lanes
; 0-7) with lane 0 of %rhs splatted to all 8 lanes (named %rhs.high).
; The directives below require the plain pmull.8h form with no "ext".
define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-LABEL: pmull_from_extract_duplane_low:
; CHECK-NOT: ext
; CHECK: pmull.8h
%lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
%res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
ret <8 x i16> %res
}
define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-LABEL: pmull_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: pmull2.8h
@ -1842,8 +1951,20 @@ define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {
ret <8 x i16> %res
}
define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmull_from_extract_duplane:
; sqdmull of the low half of %lhs (mask <0, 1>; named %lhs.high but holds
; lanes 0-1) with lane 0 of %rhs splatted to both lanes (named %rhs.high).
; The directives below require the plain sqdmull.2d form with no "ext".
define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmull_from_extract_duplane_low:
; CHECK-NOT: ext
; CHECK: sqdmull.2d
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
%res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
ret <2 x i64> %res
}
define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmull_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: sqdmull2.2d
@ -1854,8 +1975,21 @@ define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {
ret <2 x i64> %res
}
define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane:
; sqdmull of the low half of %lhs (mask <0, 1>; named %lhs.high but holds
; lanes 0-1) with lane 0 of %rhs splatted to both lanes, then sqadd of the
; product into %accum. The directives below require the plain sqdmlal.2d form
; with no "ext".
define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
; CHECK-NOT: ext
; CHECK: sqdmlal.2d
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
%res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
%sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
ret <2 x i64> %sum
}
define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: sqdmlal2.2d
@ -1867,8 +2001,21 @@ define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs,
ret <2 x i64> %sum
}
define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane:
; umull of the low half of %lhs (mask <0, 1>; named %lhs.high but holds lanes
; 0-1) with lane 0 of %rhs splatted to both lanes, then a plain add of the
; product into %accum. The directives below require the plain umlal.2d form
; with no "ext".
define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane_low:
; CHECK-NOT: ext
; CHECK: umlal.2d
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
%res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
%sum = add <2 x i64> %accum, %res
ret <2 x i64> %sum
}
define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: umlal2.2d