[AArch64] Remove copy instruction between uaddlv with v4i16/v8i16 and dup (#66508)
If there are copy instructions between a uaddlv with a v4i16/v8i16 operand and a dup, transferring the value from a GPR to an FPR, try to remove them by generating a duplane instead. This is a follow-up to https://reviews.llvm.org/D159267.
parent ec7baca17e
commit 59c3dcafd8
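
For context, here is a minimal source-level sketch (not part of the commit) of the shape this patch targets, mirroring the new uaddlv_dup_v8i16 test below. It assumes an AArch64 toolchain with <arm_neon.h>; the pre-patch sequence in the comments is an approximation of the GPR round trip the commit message describes.

#include <arm_neon.h>

uint32x4_t broadcast_widened_sum(uint16x8_t v) {
  // Post-patch (cf. the CHECK lines added below):
  //   uaddlv s0, v0.8h      ; widening horizontal add into s0
  //   dup    v0.4s, v0.s[0] ; broadcast directly from the FPR lane
  // Pre-patch, the scalar sum was first copied to a general-purpose
  // register (fmov to a w register) and the dup issued from there.
  return vdupq_n_u32(vaddlvq_u16(v));
}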
@@ -5335,7 +5335,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::aarch64_neon_uaddlv: {
     EVT OpVT = Op.getOperand(1).getValueType();
     EVT ResVT = Op.getValueType();
-    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
+    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
+                              OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
       // Use v4i32 rather than v2i32 to avoid insert_subvector.
       SDValue UADDLV =
           DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
@@ -22273,6 +22274,7 @@ static SDValue performSelectCombine(SDNode *N,
 static SDValue performDUPCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
   EVT VT = N->getValueType(0);
+  SDLoc DL(N);
   // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
   // 128bit vector version.
   if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
@@ -22280,14 +22282,32 @@ static SDValue performDUPCombine(SDNode *N,
     SmallVector<SDValue> Ops(N->ops());
     if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
                                              DCI.DAG.getVTList(LVT), Ops)) {
-      SDLoc DL(N);
       return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
                              DCI.DAG.getConstant(0, DL, MVT::i64));
     }
   }
 
-  if (N->getOpcode() == AArch64ISD::DUP)
+  if (N->getOpcode() == AArch64ISD::DUP) {
+    if (DCI.isAfterLegalizeDAG()) {
+      // If the scalar DUP's operand is an extract_vector_elt, try to combine
+      // the pair into a DUPLANE. For example:
+      //
+      //    t21: i32 = extract_vector_elt t19, Constant:i64<0>
+      //  t18: v4i32 = AArch64ISD::DUP t21
+      //  ==>
+      //  t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
+      SDValue EXTRACT_VEC_ELT = N->getOperand(0);
+      if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
+          unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
+          return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
+                                 EXTRACT_VEC_ELT.getOperand(1));
+        }
+      }
+    }
+
     return performPostLD1Combine(N, DCI, false);
+  }
 
   return SDValue();
 }
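
As an aside, here is a standalone sketch of the guard the performDUPCombine change above applies (the struct and helper names are invented for illustration; this is not LLVM's API): a scalar DUP fed by extract_vector_elt is rewritten to DUPLANE only after legalization, and only when the DUP's result type matches the type of the vector being extracted from.

#include <cassert>

struct SimpleVT {
  unsigned NumElts;
  unsigned EltBits;
  bool operator==(const SimpleVT &O) const {
    return NumElts == O.NumElts && EltBits == O.EltBits;
  }
};

// Mirrors the combine's condition: the DUP's vector type must equal the
// type of EXTRACT_VEC_ELT's source vector, so the lane index remains
// valid for the same register shape.
bool canRewriteDupToDuplane(SimpleVT DupVT, SimpleVT ExtractSrcVT,
                            bool AfterLegalizeDAG) {
  return AfterLegalizeDAG && DupVT == ExtractSrcVT;
}

int main() {
  SimpleVT V4I32{4, 32}, V2I32{2, 32};
  // v4i32 DUP of a lane extracted from a v4i32 source: fold to DUPLANE32.
  assert(canRewriteDupToDuplane(V4I32, V4I32, /*AfterLegalizeDAG=*/true));
  // Mismatched shapes (v2i32 DUP from a v4i32 source) keep the DUP.
  assert(!canRewriteDupToDuplane(V2I32, V4I32, /*AfterLegalizeDAG=*/true));
  return 0;
}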
@@ -6472,12 +6472,24 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
             (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
             ssub))>;
 
+def : Pat<(v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub))>;
+
+def : Pat<(v4i32 (AArch64uaddlv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$op), hsub))>;
+
 def : Pat<(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))>;
 
+def : Pat<(v4i32 (AArch64uaddlv (v4i16 V64:$Rn))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv4i16v V64:$Rn), ssub))>;
+
 def : Pat<(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))>;
 
+def : Pat<(v4i32 (AArch64uaddlv (v8i16 V128:$Rn))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$Rn), ssub))>;
+
 // Patterns for across-vector intrinsics, that have a node equivalent, that
 // returns a vector (with only the low lane defined) instead of a scalar.
 // In effect, opNode is the same as (scalar_to_vector (IntNode)).
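
A hedged note on the ssub vs. hsub choice in the new AArch64InstrInfo.td patterns above: uaddlv widens i16 lanes to an i32 sum (an S register) but i8 lanes only to an i16 sum (an H register), which the following intrinsics sketch (assuming an AArch64 toolchain with <arm_neon.h>) makes visible in the return types.

#include <arm_neon.h>
#include <stdint.h>

uint32_t sum_u16x4(uint16x4_t v) { return vaddlv_u16(v); } // uaddlv s0, v0.4h
uint16_t sum_u8x8(uint8x8_t v) { return vaddlv_u8(v); }    // uaddlv h0, v0.8b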
@@ -14,8 +14,8 @@ define void @insert_vec_v2i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
 ; CHECK-NEXT:    uaddlv.8h s0, v0
 ; CHECK-NEXT:    mov.s v1[0], v0[0]
-; CHECK-NEXT:    ucvtf.2s v1, v1
-; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ucvtf.2s v0, v1
+; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -52,8 +52,8 @@ define void @insert_vec_v16i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    uaddlv.8h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.4s v2, v2
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    ucvtf.4s v1, v2
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -76,8 +76,8 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    st1.s { v0 }[2], [x8]
 ; CHECK-NEXT:    str d0, [x0, #80]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.4s v2, v2
-; CHECK-NEXT:    str q2, [x0]
+; CHECK-NEXT:    ucvtf.4s v1, v2
+; CHECK-NEXT:    str q1, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -256,9 +256,9 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
 ; CHECK-NEXT:    uaddlv.4h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.2d v2, v2
-; CHECK-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    ucvtf.2d v1, v2
+; CHECK-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -9,16 +9,15 @@ define i32 @widget(i64 %arg, <8 x i16> %arg1) {
 ; CHECK:       // %bb.0: // %bb
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    umov w9, v0.h[0]
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    bfi x10, x0, #1, #3
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    bfi x9, x0, #1, #3
 ; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    dup v1.8h, w9
-; CHECK-NEXT:    str q0, [sp]
-; CHECK-NEXT:    ld1 { v1.h }[1], [x10]
-; CHECK-NEXT:    str q1, [x8]
+; CHECK-NEXT:    str q1, [sp]
+; CHECK-NEXT:    ld1 { v0.h }[1], [x9]
+; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 bb:
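
The widget test above shows the same rewrite in situ: the broadcast now issues from a vector lane (dup v0.8h, v0.h[0]) rather than from a general-purpose register (umov + dup v1.8h, w9). For reference, the two shapes correspond to these intrinsics (a hedged sketch, <arm_neon.h> assumed):

#include <arm_neon.h>

// DUP from a general-purpose register, the pre-patch shape: dup v0.8h, w0.
uint16x8_t dup_from_gpr(uint16_t s) { return vdupq_n_u16(s); }

// DUP from a vector lane, the post-patch shape: dup v0.8h, v0.h[0].
uint16x8_t dup_from_lane(uint16x8_t v) { return vdupq_laneq_u16(v, 0); }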
@@ -195,7 +195,6 @@ entry:
 }
 
-declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
 
 declare i64 @llvm.aarch64.neon.urshl.i64(i64, i64)
 
 define <8 x i8> @uaddlv_v8i8_urshr(<8 x i8> %a) {
@@ -215,3 +214,36 @@ entry:
   %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> poison, <8 x i32> zeroinitializer
   ret <8 x i8> %vecinit7.i
 }
+
+define <4 x i32> @uaddlv_dup_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: uaddlv_dup_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.4h
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #3
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %a)
+  %vecinit.i = insertelement <4 x i32> undef, i32 %vaddlv.i, i64 0
+  %vecinit7.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> poison, <4 x i32> zeroinitializer
+  %vshr_n = lshr <4 x i32> %vecinit7.i, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %vshr_n
+}
+
+define <4 x i32> @uaddlv_dup_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: uaddlv_dup_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.8h
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #3
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
+  %vecinit.i = insertelement <4 x i32> undef, i32 %vaddlv.i, i64 0
+  %vecinit7.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> poison, <4 x i32> zeroinitializer
+  %vshr_n = lshr <4 x i32> %vecinit7.i, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %vshr_n
+}
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>)
@@ -26,7 +26,21 @@ define i16 @uaddlv_uaddlp_v16i8(<16 x i8> %0) {
   ret i16 %4
 }
 
+define i16 @uaddlv_uaddlp_v8i8(<8 x i8> %0) {
+; CHECK-LABEL: uaddlv_uaddlp_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %2 = tail call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %0)
+  %3 = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %2)
+  %4 = trunc i32 %3 to i16
+  ret i16 %4
+}
+
 declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>)
 declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>)
 declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>)
 declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>)
+declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>)