mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-26 06:14:42 +00:00
Improve long vector sext/zext lowering on ARM
The ARM backend currently has poor codegen for long sext/zext operations, such as v8i8 -> v8i32. This patch addresses this by performing a custom expansion in ARMISelLowering. It also adds/changes the cost of such lowering in ARMTTI. This partially addresses PR14867. Patch by Pete Couperus. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@177380 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
54e57f8cb7
commit
5ad5f5931e
@ -564,6 +564,16 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
|
||||
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
|
||||
setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
|
||||
|
||||
// Custom expand long extensions to vectors.
|
||||
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
|
||||
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
|
||||
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
|
||||
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
|
||||
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
|
||||
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
|
||||
|
||||
// NEON does not have single instruction CTPOP for vectors with element
|
||||
// types wider than 8-bits. However, custom lowering can leverage the
|
||||
// v8i8/v16i8 vcnt instruction.
|
||||
@ -3433,6 +3443,47 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
|
||||
return FrameAddr;
|
||||
}
|
||||
|
||||
/// Custom-expand a long vector sign/zero extension, i.e. one where
|
||||
/// size(DestVec) is 4x or 8x size(SrcVec) and size(DestVec) > 128 bits.
|
||||
/// This is achieved by first extending the whole SrcVec to elements of twice
|
||||
/// the source width, splitting that result into a low and a high half,
|
||||
/// extending each half to the destination element width, and then
/// concatenating the two extended halves into the destination.
|
||||
///
/// \param N   The extension node; its opcode is reused for every extension
///            step created here, and operand 0 is the source vector.
/// \param DAG The SelectionDAG in which the replacement nodes are created.
/// \returns The CONCAT_VECTORS node producing DestVT, or an empty SDValue
///          when the size ratio is neither 4x nor 8x.
|
||||
static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
|
||||
SDValue Op = N->getOperand(0);
|
||||
EVT SrcVT = Op.getValueType();
|
||||
EVT DestVT = N->getValueType(0);
|
||||
|
||||
assert(DestVT.getSizeInBits() > 128 &&
|
||||
"Custom sext/zext expansion needs >128-bit vector.");
|
||||
// If this is a normal length extension, use the default expansion.
// Only total-size ratios of exactly 4x and 8x are handled below; for
// anything else an empty SDValue is returned.
|
||||
if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() &&
|
||||
SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
|
||||
return SDValue();
|
||||
|
||||
DebugLoc dl = N->getDebugLoc();
|
||||
unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
|
||||
unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
|
||||
unsigned NumElts = SrcVT.getVectorNumElements();
|
||||
LLVMContext &Ctx = *DAG.getContext();
|
||||
SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi;
|
||||
|
||||
// MidVT:   all NumElts elements, each twice the source element width.
// SplitVT: one half (NumElts/2 elements) of MidVT, same element width.
// ExtVT:   one half (NumElts/2 elements) at the destination element width.
EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
|
||||
NumElts);
|
||||
EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
|
||||
NumElts/2);
|
||||
EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize),
|
||||
NumElts/2);
|
||||
|
||||
// Step 1: extend the full source vector by one doubling of element width.
Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op);
|
||||
// Step 2: split the intermediate vector; the low half starts at element 0
// and the high half at element NumElts/2.
SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
|
||||
DAG.getIntPtrConstant(0));
|
||||
SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
|
||||
DAG.getIntPtrConstant(NumElts/2));
|
||||
// Step 3: extend each half to the destination element width.
// NOTE(review): in the 8x case (e.g. v8i8 -> v8i64) each of these is itself
// a 4x extension; presumably it is custom-expanded again when legalization
// revisits the new node -- confirm.
ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo);
|
||||
ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi);
|
||||
// Step 4: join the two extended halves into the full destination vector.
return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi);
|
||||
}
|
||||
|
||||
/// ExpandBITCAST - If the target supports VFP, this function is called to
|
||||
/// expand a bit convert where either the source or destination type is i64 to
|
||||
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
|
||||
@ -5621,6 +5672,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
|
||||
case ISD::BITCAST:
|
||||
Res = ExpandBITCAST(N, DAG);
|
||||
break;
|
||||
case ISD::SIGN_EXTEND:
|
||||
case ISD::ZERO_EXTEND:
|
||||
Res = ExpandVectorExtension(N, DAG);
|
||||
break;
|
||||
case ISD::SRL:
|
||||
case ISD::SRA:
|
||||
Res = Expand64BitShift(N, DAG, Subtarget);
|
||||
|
@ -211,11 +211,19 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
|
||||
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
|
||||
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
|
||||
|
||||
// The number of vmovl instructions for the extension.
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
|
||||
|
||||
// Operations that we legalize using load/stores to the stack.
|
||||
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 16*2 + 4*4 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 16*2 + 4*3 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 8*2 + 2*4 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 8*2 + 2*3 },
|
||||
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 },
|
||||
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 },
|
||||
|
||||
|
@ -152,15 +152,29 @@ define i32 @casts() {
|
||||
; CHECK: cost of 10 {{.*}} uitofp
|
||||
%r69 = uitofp i64 undef to double
|
||||
|
||||
; Vector cast cost of instructions lowering the cast to the stack.
|
||||
; CHECK: cost of 24 {{.*}} sext
|
||||
; CHECK: cost of 3 {{.*}} sext
|
||||
%r70 = sext <8 x i8> undef to <8 x i32>
|
||||
; CHECK: cost of 48 {{.*}} sext
|
||||
; CHECK: cost of 6 {{.*}} sext
|
||||
%r71 = sext <16 x i8> undef to <16 x i32>
|
||||
; CHECK: cost of 22 {{.*}} zext
|
||||
; CHECK: cost of 3 {{.*}} zext
|
||||
%r72 = zext <8 x i8> undef to <8 x i32>
|
||||
; CHECK: cost of 44 {{.*}} zext
|
||||
; CHECK: cost of 6 {{.*}} zext
|
||||
%r73 = zext <16 x i8> undef to <16 x i32>
|
||||
|
||||
; CHECK: cost of 7 {{.*}} sext
|
||||
%rext_0 = sext <8 x i8> undef to <8 x i64>
|
||||
; CHECK: cost of 7 {{.*}} zext
|
||||
%rext_1 = zext <8 x i8> undef to <8 x i64>
|
||||
; CHECK: cost of 6 {{.*}} sext
|
||||
%rext_2 = sext <8 x i16> undef to <8 x i64>
|
||||
; CHECK: cost of 6 {{.*}} zext
|
||||
%rext_3 = zext <8 x i16> undef to <8 x i64>
|
||||
; CHECK: cost of 3 {{.*}} sext
|
||||
%rext_4 = sext <4 x i16> undef to <4 x i64>
|
||||
; CHECK: cost of 3 {{.*}} zext
|
||||
%rext_5 = zext <4 x i16> undef to <4 x i64>
|
||||
|
||||
; Vector cast cost of instructions lowering the cast to the stack.
|
||||
; CHECK: cost of 19 {{.*}} trunc
|
||||
%r74 = trunc <8 x i32> undef to <8 x i8>
|
||||
; CHECK: cost of 38 {{.*}} trunc
|
||||
|
@ -165,17 +165,12 @@ declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone
|
||||
%T1_5 = type <8 x i32>
|
||||
; CHECK: func_cvt5:
|
||||
define void @func_cvt5(%T0_5* %loadaddr, %T1_5* %storeaddr) {
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: vmovl.s8
|
||||
; CHECK: vmovl.s16
|
||||
; CHECK: vmovl.s16
|
||||
%v0 = load %T0_5* %loadaddr
|
||||
; COST: func_cvt5
|
||||
; COST: cost of 24 {{.*}} sext
|
||||
; COST: cost of 3 {{.*}} sext
|
||||
%r = sext %T0_5 %v0 to %T1_5
|
||||
store %T1_5 %r, %T1_5* %storeaddr
|
||||
ret void
|
||||
@ -186,17 +181,12 @@ define void @func_cvt5(%T0_5* %loadaddr, %T1_5* %storeaddr) {
|
||||
%TA1_5 = type <8 x i32>
|
||||
; CHECK: func_cvt1:
|
||||
define void @func_cvt1(%TA0_5* %loadaddr, %TA1_5* %storeaddr) {
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: vmovl.u8
|
||||
; CHECK: vmovl.u16
|
||||
; CHECK: vmovl.u16
|
||||
%v0 = load %TA0_5* %loadaddr
|
||||
; COST: func_cvt1
|
||||
; COST: cost of 22 {{.*}} zext
|
||||
; COST: cost of 3 {{.*}} zext
|
||||
%r = zext %TA0_5 %v0 to %TA1_5
|
||||
store %TA1_5 %r, %TA1_5* %storeaddr
|
||||
ret void
|
||||
@ -228,25 +218,13 @@ define void @func_cvt51(%T0_51* %loadaddr, %T1_51* %storeaddr) {
|
||||
%TT1_5 = type <16 x i32>
|
||||
; CHECK: func_cvt52:
|
||||
define void @func_cvt52(%TT0_5* %loadaddr, %TT1_5* %storeaddr) {
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: vmovl.s16
|
||||
; CHECK: vmovl.s16
|
||||
; CHECK: vmovl.s16
|
||||
; CHECK: vmovl.s16
|
||||
%v0 = load %TT0_5* %loadaddr
|
||||
; COST: func_cvt52
|
||||
; COST: cost of 48 {{.*}} sext
|
||||
; COST: cost of 6 {{.*}} sext
|
||||
%r = sext %TT0_5 %v0 to %TT1_5
|
||||
store %TT1_5 %r, %TT1_5* %storeaddr
|
||||
ret void
|
||||
@ -257,25 +235,13 @@ define void @func_cvt52(%TT0_5* %loadaddr, %TT1_5* %storeaddr) {
|
||||
%TTA1_5 = type <16 x i32>
|
||||
; CHECK: func_cvt12:
|
||||
define void @func_cvt12(%TTA0_5* %loadaddr, %TTA1_5* %storeaddr) {
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: strh
|
||||
; CHECK: vmovl.u16
|
||||
; CHECK: vmovl.u16
|
||||
; CHECK: vmovl.u16
|
||||
; CHECK: vmovl.u16
|
||||
%v0 = load %TTA0_5* %loadaddr
|
||||
; COST: func_cvt12
|
||||
; COST: cost of 44 {{.*}} zext
|
||||
; COST: cost of 6 {{.*}} zext
|
||||
%r = zext %TTA0_5 %v0 to %TTA1_5
|
||||
store %TTA1_5 %r, %TTA1_5* %storeaddr
|
||||
ret void
|
||||
@ -309,3 +275,56 @@ define void @func_cvt512(%TT0_51* %loadaddr, %TT1_51* %storeaddr) {
|
||||
store %TT1_51 %r, %TT1_51* %storeaddr
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: sext_v4i16_v4i64:
|
||||
; Loads a <4 x i16>, sign-extends it to <4 x i64> and stores the result.
; The directives inside the body expect two vmovl.s32 instructions in the
; generated code and a cost-model cost of 3 for the sext.
define void @sext_v4i16_v4i64(<4 x i16>* %loadaddr, <4 x i64>* %storeaddr) {
|
||||
; CHECK: vmovl.s32
|
||||
; CHECK: vmovl.s32
|
||||
%v0 = load <4 x i16>* %loadaddr
|
||||
; COST: sext_v4i16_v4i64
|
||||
; COST: cost of 3 {{.*}} sext
|
||||
%r = sext <4 x i16> %v0 to <4 x i64>
|
||||
store <4 x i64> %r, <4 x i64>* %storeaddr
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: zext_v4i16_v4i64:
|
||||
; Loads a <4 x i16>, zero-extends it to <4 x i64> and stores the result.
; The directives inside the body expect two vmovl.u32 instructions in the
; generated code and a cost-model cost of 3 for the zext.
define void @zext_v4i16_v4i64(<4 x i16>* %loadaddr, <4 x i64>* %storeaddr) {
|
||||
; CHECK: vmovl.u32
|
||||
; CHECK: vmovl.u32
|
||||
%v0 = load <4 x i16>* %loadaddr
|
||||
; COST: zext_v4i16_v4i64
|
||||
; COST: cost of 3 {{.*}} zext
|
||||
%r = zext <4 x i16> %v0 to <4 x i64>
|
||||
store <4 x i64> %r, <4 x i64>* %storeaddr
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: sext_v8i16_v8i64:
|
||||
; Loads a <8 x i16>, sign-extends it to <8 x i64> and stores the result.
; The directives inside the body expect four vmovl.s32 instructions in the
; generated code and a cost-model cost of 6 for the sext.
define void @sext_v8i16_v8i64(<8 x i16>* %loadaddr, <8 x i64>* %storeaddr) {
|
||||
; CHECK: vmovl.s32
|
||||
; CHECK: vmovl.s32
|
||||
; CHECK: vmovl.s32
|
||||
; CHECK: vmovl.s32
|
||||
%v0 = load <8 x i16>* %loadaddr
|
||||
; COST: sext_v8i16_v8i64
|
||||
; COST: cost of 6 {{.*}} sext
|
||||
%r = sext <8 x i16> %v0 to <8 x i64>
|
||||
store <8 x i64> %r, <8 x i64>* %storeaddr
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: zext_v8i16_v8i64:
|
||||
; Loads a <8 x i16>, zero-extends it to <8 x i64> and stores the result.
; The directives inside the body expect four vmovl.u32 instructions in the
; generated code and a cost-model cost of 6 for the zext.
define void @zext_v8i16_v8i64(<8 x i16>* %loadaddr, <8 x i64>* %storeaddr) {
|
||||
; CHECK: vmovl.u32
|
||||
; CHECK: vmovl.u32
|
||||
; CHECK: vmovl.u32
|
||||
; CHECK: vmovl.u32
|
||||
%v0 = load <8 x i16>* %loadaddr
|
||||
; COST: zext_v8i16_v8i64
|
||||
; COST: cost of 6 {{.*}} zext
|
||||
%r = zext <8 x i16> %v0 to <8 x i64>
|
||||
store <8 x i64> %r, <8 x i64>* %storeaddr
|
||||
ret void
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user