mirror of
https://github.com/RPCSX/llvm.git
synced 2025-02-17 11:39:11 +00:00
[ARM] Implement isExtractSubvectorCheap.
See https://reviews.llvm.org/D6678 for the history of isExtractSubvectorCheap. Essentially the same considerations apply to ARM. This temporarily breaks the formation of vpadd/vpaddl in certain cases; AddCombineToVPADDL essentially assumes that we won't form VUZP shuffles. See https://reviews.llvm.org/D27779 for followup fix. Differential Revision: https://reviews.llvm.org/D27774 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@290198 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
967c9cbd8f
commit
1e77c707b7
@ -12921,6 +12921,14 @@ bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Report whether an EXTRACT_SUBVECTOR producing \p ResVT at element offset
/// \p Index is considered cheap. The operation must be legal or custom for
/// \p ResVT, and \p Index must be either 0 or equal to the number of elements
/// in \p ResVT.
bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT,
                                                unsigned Index) const {
  if (isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) {
    // Offset 0 qualifies without needing the element count.
    if (Index == 0)
      return true;
    // Otherwise only an offset of exactly one ResVT's worth of elements does.
    return Index == ResVT.getVectorNumElements();
  }
  return false;
}
|
||||
|
||||
Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
|
||||
ARM_MB::MemBOpt Domain) const {
|
||||
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||
|
@ -431,6 +431,10 @@ namespace llvm {
|
||||
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
|
||||
Type *Ty) const override;
|
||||
|
||||
/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
|
||||
/// with this index.
|
||||
bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
|
||||
|
||||
/// \brief Returns true if an argument of type Ty needs to be passed in a
|
||||
/// contiguous block of registers in calling convention CallConv.
|
||||
bool functionArgumentNeedsConsecutiveRegisters(
|
||||
|
@ -217,21 +217,19 @@ define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
|
||||
; CHECK-LABEL: test_multisource:
|
||||
; CHECK: @ BB#0:
|
||||
; CHECK-NEXT: mov r1, r0
|
||||
; CHECK-NEXT: add r2, r0, #48
|
||||
; CHECK-NEXT: add r0, r0, #32
|
||||
; CHECK-NEXT: add r2, r0, #32
|
||||
; CHECK-NEXT: add r0, r0, #48
|
||||
; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!
|
||||
; CHECK-NEXT: vld1.64 {d20, d21}, [r2:128]
|
||||
; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128]
|
||||
; CHECK-NEXT: vmov.u16 r1, d16[0]
|
||||
; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128]
|
||||
; CHECK-NEXT: vmov.16 d22[0], r1
|
||||
; CHECK-NEXT: vmov.u16 r0, d18[0]
|
||||
; CHECK-NEXT: vmov.u16 r1, d16[0]
|
||||
; CHECK-NEXT: vmov.16 d22[1], r0
|
||||
; CHECK-NEXT: vmov.u16 r0, d20[0]
|
||||
; CHECK-NEXT: vmov.16 d22[2], r1
|
||||
; CHECK-NEXT: vmov.16 d22[3], r0
|
||||
; CHECK-NEXT: vmov r0, r1, d22
|
||||
; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]
|
||||
; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128]
|
||||
; CHECK-NEXT: vorr d24, d20, d20
|
||||
; CHECK-NEXT: vzip.16 d24, d18
|
||||
; CHECK-NEXT: vext.16 d18, d20, d24, #2
|
||||
; CHECK-NEXT: vtrn.16 q8, q11
|
||||
; CHECK-NEXT: vext.16 d16, d18, d16, #2
|
||||
; CHECK-NEXT: vext.16 d16, d16, d16, #2
|
||||
; CHECK-NEXT: vmov r0, r1, d16
|
||||
; CHECK-NEXT: mov pc, lr
|
||||
%tmp1 = load <32 x i16>, <32 x i16>* %B
|
||||
%tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
|
||||
@ -244,14 +242,8 @@ define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
|
||||
; CHECK-LABEL: test_largespan:
|
||||
; CHECK: @ BB#0:
|
||||
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
|
||||
; CHECK-NEXT: vmov.u16 r1, d16[0]
|
||||
; CHECK-NEXT: vmov.u16 r0, d16[2]
|
||||
; CHECK-NEXT: vmov.16 d18[0], r1
|
||||
; CHECK-NEXT: vmov.u16 r1, d17[0]
|
||||
; CHECK-NEXT: vmov.16 d18[1], r0
|
||||
; CHECK-NEXT: vmov.u16 r0, d17[2]
|
||||
; CHECK-NEXT: vmov.16 d18[2], r1
|
||||
; CHECK-NEXT: vmov.16 d18[3], r0
|
||||
; CHECK-NEXT: vorr d18, d16, d16
|
||||
; CHECK-NEXT: vuzp.16 d18, d17
|
||||
; CHECK-NEXT: vmov r0, r1, d18
|
||||
; CHECK-NEXT: mov pc, lr
|
||||
%tmp1 = load <8 x i16>, <8 x i16>* %B
|
||||
|
@ -213,36 +213,47 @@ define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
|
||||
ret <2 x i64> %tmp2
|
||||
}
|
||||
|
||||
; Test AddCombine optimization that generates a vpaddl.s
|
||||
define void @addCombineToVPADDL() nounwind ssp {
|
||||
; CHECK-LABEL: addCombineToVPADDL:
|
||||
; Combine vuzp+vadd->vpadd.
|
||||
; FIXME: Implement this optimization
|
||||
define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
|
||||
; CHECK-LABEL: addCombineToVPADD:
|
||||
; CHECK: @ BB#0:
|
||||
; CHECK-NEXT: .save {r11}
|
||||
; CHECK-NEXT: push {r11}
|
||||
; CHECK-NEXT: .setfp r11, sp
|
||||
; CHECK-NEXT: mov r11, sp
|
||||
; CHECK-NEXT: .pad #44
|
||||
; CHECK-NEXT: sub sp, sp, #44
|
||||
; CHECK-NEXT: bic sp, sp, #15
|
||||
; CHECK-NEXT: add r0, sp, #16
|
||||
; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128]
|
||||
; CHECK-NEXT: vpaddl.s8 q8, q8
|
||||
; CHECK-NEXT: vmovn.i16 d16, q8
|
||||
; CHECK-NEXT: vstr d16, [sp, #8]
|
||||
; CHECK-NEXT: mov sp, r11
|
||||
; CHECK-NEXT: pop {r11}
|
||||
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
|
||||
; CHECK-NEXT: vorr d18, d17, d17
|
||||
; CHECK-NEXT: vuzp.8 d16, d18
|
||||
; CHECK-NEXT: vadd.i8 d16, d18, d16
|
||||
; CHECK-NEXT: vstr d16, [r1]
|
||||
; CHECK-NEXT: mov pc, lr
|
||||
%cbcr = alloca <16 x i8>, align 16
|
||||
%X = alloca <8 x i8>, align 8
|
||||
%tmp = load <16 x i8>, <16 x i8>* %cbcr
|
||||
%tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
||||
%tmp2 = load <16 x i8>, <16 x i8>* %cbcr
|
||||
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
||||
%tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
||||
|
||||
%add = add <8 x i8> %tmp3, %tmp1
|
||||
store <8 x i8> %add, <8 x i8>* %X, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; Combine vuzp+vaddl->vpaddl
|
||||
; FIXME: Implement this optimization.
|
||||
define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
|
||||
; CHECK-LABEL: addCombineToVPADDL_sext:
|
||||
; CHECK: @ BB#0:
|
||||
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
|
||||
; CHECK-NEXT: vorr d18, d17, d17
|
||||
; CHECK-NEXT: vuzp.8 d16, d18
|
||||
; CHECK-NEXT: vaddl.s8 q8, d18, d16
|
||||
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
|
||||
; CHECK-NEXT: mov pc, lr
|
||||
%tmp = load <16 x i8>, <16 x i8>* %cbcr
|
||||
%tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
||||
%tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
||||
%tmp4 = sext <8 x i8> %tmp3 to <8 x i16>
|
||||
%tmp5 = sext <8 x i8> %tmp1 to <8 x i16>
|
||||
%add = add <8 x i16> %tmp4, %tmp5
|
||||
store <8 x i16> %add, <8 x i16>* %X, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; Legalization produces an EXTRACT_VECTOR_ELT DAG node which performs an extend from
|
||||
; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.
|
||||
define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {
|
||||
|
@ -7,14 +7,14 @@ define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
|
||||
; CHECK-NEXT: vldr d16, [r1]
|
||||
; CHECK-NEXT: vldr d17, [r0]
|
||||
; CHECK-NEXT: vuzp.8 d17, d16
|
||||
; CHECK-NEXT: vadd.i8 d16, d17, d16
|
||||
; CHECK-NEXT: vmul.i8 d16, d17, d16
|
||||
; CHECK-NEXT: vmov r0, r1, d16
|
||||
; CHECK-NEXT: mov pc, lr
|
||||
%tmp1 = load <8 x i8>, <8 x i8>* %A
|
||||
%tmp2 = load <8 x i8>, <8 x i8>* %B
|
||||
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
||||
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
||||
%tmp5 = add <8 x i8> %tmp3, %tmp4
|
||||
%tmp5 = mul <8 x i8> %tmp3, %tmp4
|
||||
ret <8 x i8> %tmp5
|
||||
}
|
||||
|
||||
@ -39,14 +39,14 @@ define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
|
||||
; CHECK-NEXT: vldr d16, [r1]
|
||||
; CHECK-NEXT: vldr d17, [r0]
|
||||
; CHECK-NEXT: vuzp.16 d17, d16
|
||||
; CHECK-NEXT: vadd.i16 d16, d17, d16
|
||||
; CHECK-NEXT: vmul.i16 d16, d17, d16
|
||||
; CHECK-NEXT: vmov r0, r1, d16
|
||||
; CHECK-NEXT: mov pc, lr
|
||||
%tmp1 = load <4 x i16>, <4 x i16>* %A
|
||||
%tmp2 = load <4 x i16>, <4 x i16>* %B
|
||||
%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
||||
%tmp5 = add <4 x i16> %tmp3, %tmp4
|
||||
%tmp5 = mul <4 x i16> %tmp3, %tmp4
|
||||
ret <4 x i16> %tmp5
|
||||
}
|
||||
|
||||
@ -207,14 +207,14 @@ define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
|
||||
; CHECK-NEXT: vldr d16, [r1]
|
||||
; CHECK-NEXT: vldr d17, [r0]
|
||||
; CHECK-NEXT: vuzp.8 d17, d16
|
||||
; CHECK-NEXT: vadd.i8 d16, d17, d16
|
||||
; CHECK-NEXT: vmul.i8 d16, d17, d16
|
||||
; CHECK-NEXT: vmov r0, r1, d16
|
||||
; CHECK-NEXT: mov pc, lr
|
||||
%tmp1 = load <8 x i8>, <8 x i8>* %A
|
||||
%tmp2 = load <8 x i8>, <8 x i8>* %B
|
||||
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
|
||||
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
|
||||
%tmp5 = add <8 x i8> %tmp3, %tmp4
|
||||
%tmp5 = mul <8 x i8> %tmp3, %tmp4
|
||||
ret <8 x i8> %tmp5
|
||||
}
|
||||
|
||||
@ -550,3 +550,22 @@ define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
|
||||
%rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
|
||||
ret <10 x i8> %rv
|
||||
}
|
||||
|
||||
%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
|
||||
; Both shufflevectors below read the same <16 x i8> argument: the first takes
; its even lanes (0, 2, ..., 14) and the second its odd lanes (1, 3, ..., 15).
; The CHECK lines expect the pair to be produced by a single vuzp.8 on the two
; d registers holding the argument (after a vorr copy of one of them).
define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
|
||||
; CHECK-LABEL: vuzp_extract_subvector:
|
||||
; CHECK: @ BB#0:
|
||||
; CHECK-NEXT: vmov d17, r2, r3
|
||||
; CHECK-NEXT: vmov d16, r0, r1
|
||||
; CHECK-NEXT: vorr d18, d17, d17
|
||||
; CHECK-NEXT: vuzp.8 d16, d18
|
||||
; CHECK-NEXT: vmov r0, r1, d16
|
||||
; CHECK-NEXT: vmov r2, r3, d18
|
||||
; CHECK-NEXT: mov pc, lr
|
||||
|
||||
%vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
||||
%vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
||||
%.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
|
||||
ret %struct.uint8x8x2_t %.fca.0.1.insert
|
||||
}
|
||||
|
@ -332,9 +332,8 @@ define void @vzip_vext_factor(<8 x i16>* %A, <4 x i16>* %B) {
|
||||
; CHECK-LABEL: vzip_vext_factor:
|
||||
; CHECK: @ BB#0: @ %entry
|
||||
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
|
||||
; CHECK-NEXT: vext.16 d16, d16, d17, #3
|
||||
; CHECK-NEXT: vext.16 d17, d16, d16, #1
|
||||
; CHECK-NEXT: vzip.16 d16, d17
|
||||
; CHECK-NEXT: vext.16 d18, d16, d17, #1
|
||||
; CHECK-NEXT: vext.16 d16, d18, d17, #2
|
||||
; CHECK-NEXT: vext.16 d16, d16, d16, #1
|
||||
; CHECK-NEXT: vstr d16, [r1]
|
||||
; CHECK-NEXT: mov pc, lr
|
||||
|
Loading…
x
Reference in New Issue
Block a user