mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-01-10 01:55:08 +00:00
[ppc] Correctly compute the cost of loading 32/64-bit values from memory into a VSR
VSX has the instructions lxsiwax/lxsdx, which can load a 32/64-bit value into a VSX register cheaply. This patch makes that known to the memory cost model, so that vectorization of the test case in pr30990 is considered beneficial. Differential Revision: https://reviews.llvm.org/D26713 llvm-svn: 288560
This commit is contained in:
parent
33f947057d
commit
835de1f3ab
@ -360,11 +360,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||
|
||||
int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
|
||||
|
||||
// Aligned loads and stores are easy.
|
||||
unsigned SrcBytes = LT.second.getStoreSize();
|
||||
if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
|
||||
return Cost;
|
||||
|
||||
bool IsAltivecType = ST->hasAltivec() &&
|
||||
(LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
|
||||
LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
|
||||
@ -373,6 +368,20 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||
bool IsQPXType = ST->hasQPX() &&
|
||||
(LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
|
||||
|
||||
// VSX has 32b/64b load instructions. Legalization can handle loading of
|
||||
// 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
|
||||
// PPCTargetLowering can't compute the cost appropriately. So here we
|
||||
// explicitly check this case.
|
||||
unsigned MemBytes = Src->getPrimitiveSizeInBits();
|
||||
if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
|
||||
(MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
|
||||
return 1;
|
||||
|
||||
// Aligned loads and stores are easy.
|
||||
unsigned SrcBytes = LT.second.getStoreSize();
|
||||
if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
|
||||
return Cost;
|
||||
|
||||
// If we can use the permutation-based load sequence, then this is also
|
||||
// relatively cheap (not counting loop-invariant instructions): one load plus
|
||||
// one permute (the last load in a series has extra cost, but we're
|
||||
|
19
llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
Normal file
19
llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
Normal file
@ -0,0 +1,19 @@
|
||||
; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s
|
||||
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
|
||||
target triple = "powerpc64-unknown-linux-gnu"
|
||||
|
||||
define i32 @loads(i32 %arg) {
|
||||
; CHECK: cost of 1 {{.*}} load
|
||||
load <4 x i8>, <4 x i8>* undef, align 1
|
||||
|
||||
; CHECK: cost of 1 {{.*}} load
|
||||
load <8 x i8>, <8 x i8>* undef, align 1
|
||||
|
||||
; CHECK: cost of 1 {{.*}} load
|
||||
load <2 x i16>, <2 x i16>* undef, align 2
|
||||
|
||||
; CHECK: cost of 1 {{.*}} load
|
||||
load <4 x i16>, <4 x i16>* undef, align 2
|
||||
|
||||
ret i32 undef
|
||||
}
|
140
llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
Normal file
140
llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
Normal file
@ -0,0 +1,140 @@
|
||||
; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx -force-vector-interleave=1 -vectorizer-maximize-bandwidth=0 -S | FileCheck %s
|
||||
|
||||
target triple = "powerpc64-unknown-linux-gnu"
|
||||
|
||||
define signext i32 @foo(i8* readonly %ptr, i32 signext %l) {
|
||||
entry:
|
||||
%idx.ext = sext i32 %l to i64
|
||||
%add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
|
||||
%cmp7 = icmp sgt i32 %l, 0
|
||||
br i1 %cmp7, label %while.body.preheader, label %while.end
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
br label %while.body
|
||||
|
||||
while.body: ; preds = %while.body.preheader, %while.body
|
||||
%count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
|
||||
%ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
|
||||
%0 = load i8, i8* %ptr.addr.08, align 1
|
||||
%cmp1 = icmp slt i8 %0, -64
|
||||
%cond = zext i1 %cmp1 to i32
|
||||
%add = add nsw i32 %cond, %count.09
|
||||
%incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
|
||||
%cmp = icmp ult i8* %incdec.ptr, %add.ptr
|
||||
br i1 %cmp, label %while.body, label %while.end.loopexit
|
||||
|
||||
while.end.loopexit: ; preds = %while.body
|
||||
%add.lcssa = phi i32 [ %add, %while.body ]
|
||||
br label %while.end
|
||||
|
||||
while.end: ; preds = %while.end.loopexit, %entry
|
||||
%count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
|
||||
ret i32 %count.0.lcssa
|
||||
|
||||
; CHECK: load <4 x i8>
|
||||
; CHECK: icmp slt <4 x i8>
|
||||
}
|
||||
|
||||
|
||||
define signext i16 @foo2(i8* readonly %ptr, i32 signext %l) {
|
||||
entry:
|
||||
%idx.ext = sext i32 %l to i64
|
||||
%add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
|
||||
%cmp7 = icmp sgt i32 %l, 0
|
||||
br i1 %cmp7, label %while.body.preheader, label %while.end
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
br label %while.body
|
||||
|
||||
while.body: ; preds = %while.body.preheader, %while.body
|
||||
%count.09 = phi i16 [ %add, %while.body ], [ 0, %while.body.preheader ]
|
||||
%ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
|
||||
%0 = load i8, i8* %ptr.addr.08, align 1
|
||||
%cmp1 = icmp slt i8 %0, -64
|
||||
%cond = zext i1 %cmp1 to i16
|
||||
%add = add nsw i16 %cond, %count.09
|
||||
%incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
|
||||
%cmp = icmp ult i8* %incdec.ptr, %add.ptr
|
||||
br i1 %cmp, label %while.body, label %while.end.loopexit
|
||||
|
||||
while.end.loopexit: ; preds = %while.body
|
||||
%add.lcssa = phi i16 [ %add, %while.body ]
|
||||
br label %while.end
|
||||
|
||||
while.end: ; preds = %while.end.loopexit, %entry
|
||||
%count.0.lcssa = phi i16 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
|
||||
ret i16 %count.0.lcssa
|
||||
|
||||
; CHECK-LABEL: foo2
|
||||
; CHECK: load <8 x i8>
|
||||
; CHECK: icmp slt <8 x i8>
|
||||
}
|
||||
|
||||
define signext i32 @foo3(i16* readonly %ptr, i32 signext %l) {
|
||||
entry:
|
||||
%idx.ext = sext i32 %l to i64
|
||||
%add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
|
||||
%cmp7 = icmp sgt i32 %l, 0
|
||||
br i1 %cmp7, label %while.body.preheader, label %while.end
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
br label %while.body
|
||||
|
||||
while.body: ; preds = %while.body.preheader, %while.body
|
||||
%count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
|
||||
%ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
|
||||
%0 = load i16, i16* %ptr.addr.16, align 1
|
||||
%cmp1 = icmp slt i16 %0, -64
|
||||
%cond = zext i1 %cmp1 to i32
|
||||
%add = add nsw i32 %cond, %count.09
|
||||
%incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
|
||||
%cmp = icmp ult i16* %incdec.ptr, %add.ptr
|
||||
br i1 %cmp, label %while.body, label %while.end.loopexit
|
||||
|
||||
while.end.loopexit: ; preds = %while.body
|
||||
%add.lcssa = phi i32 [ %add, %while.body ]
|
||||
br label %while.end
|
||||
|
||||
while.end: ; preds = %while.end.loopexit, %entry
|
||||
%count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
|
||||
ret i32 %count.0.lcssa
|
||||
|
||||
; CHECK-LABEL: foo3
|
||||
; CHECK: load <4 x i16>
|
||||
; CHECK: icmp slt <4 x i16>
|
||||
}
|
||||
|
||||
define i64 @foo4(i16* readonly %ptr, i32 signext %l) {
|
||||
entry:
|
||||
%idx.ext = sext i32 %l to i64
|
||||
%add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
|
||||
%cmp7 = icmp sgt i32 %l, 0
|
||||
br i1 %cmp7, label %while.body.preheader, label %while.end
|
||||
|
||||
while.body.preheader: ; preds = %entry
|
||||
br label %while.body
|
||||
|
||||
while.body: ; preds = %while.body.preheader, %while.body
|
||||
%count.09 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
|
||||
%ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
|
||||
%0 = load i16, i16* %ptr.addr.16, align 1
|
||||
%cmp1 = icmp slt i16 %0, -64
|
||||
%cond = zext i1 %cmp1 to i64
|
||||
%add = add nsw i64 %cond, %count.09
|
||||
%incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
|
||||
%cmp = icmp ult i16* %incdec.ptr, %add.ptr
|
||||
br i1 %cmp, label %while.body, label %while.end.loopexit
|
||||
|
||||
while.end.loopexit: ; preds = %while.body
|
||||
%add.lcssa = phi i64 [ %add, %while.body ]
|
||||
br label %while.end
|
||||
|
||||
while.end: ; preds = %while.end.loopexit, %entry
|
||||
%count.0.lcssa = phi i64 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
|
||||
ret i64 %count.0.lcssa
|
||||
|
||||
; CHECK-LABEL: foo4
|
||||
; CHECK: load <2 x i16>
|
||||
; CHECK: icmp slt <2 x i16>
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user