mirror of
https://github.com/RPCS3/llvm.git
synced 2025-03-01 15:19:05 +00:00
[AlignmentFromAssumptions] getNewAlignmentDiff(): use getURemExpr()
The alignment is calculated incorrectly, so aligned move instructions are sometimes not generated, as shown by the example below:

```
// b.cc
typedef long long index;

extern "C" index g_tid;
extern "C" index g_num;

void add3(float* __restrict__ a, float* __restrict__ b, float* __restrict__ c) {
  index n = 64*1024;
  index m = 16*1024;
  index k = 4*1024;
  index tid = g_tid;
  index num = g_num;
  __builtin_assume_aligned(a, 32);
  __builtin_assume_aligned(b, 32);
  __builtin_assume_aligned(c, 32);
  for (index i0=tid*k; i0<m; i0+=num*k)
    for (index i1=0; i1<n*m; i1+=m)
      for (index i2=0; i2<k; i2++)
        c[i1+i0+i2] = b[i0+i2] + a[i1+i0+i2];
}
```

Compile with `clang b.cc -Ofast -march=skylake -mavx2 -S`

```
vmovaps -224(%rdi,%rbx,4), %ymm0
vmovups -192(%rdi,%rbx,4), %ymm1 # should be movaps
vmovups -160(%rdi,%rbx,4), %ymm2 # should be movaps
vmovups -128(%rdi,%rbx,4), %ymm3 # should be movaps
vaddps -224(%rsi,%rbx,4), %ymm0, %ymm0
vaddps -192(%rsi,%rbx,4), %ymm1, %ymm1
vaddps -160(%rsi,%rbx,4), %ymm2, %ymm2
vaddps -128(%rsi,%rbx,4), %ymm3, %ymm3
vmovaps %ymm0, -224(%rdx,%rbx,4)
vmovups %ymm1, -192(%rdx,%rbx,4) # should be movaps
vmovups %ymm2, -160(%rdx,%rbx,4) # should be movaps
vmovups %ymm3, -128(%rdx,%rbx,4) # should be movaps
```

Differential Revision: https://reviews.llvm.org/D66575

Patch by Dun Liang

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@369723 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
4c91463616
commit
a8fb688f58
@ -93,9 +93,7 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
|
||||
const SCEV *AlignSCEV,
|
||||
ScalarEvolution *SE) {
|
||||
// DiffUnits = Diff % int64_t(Alignment)
|
||||
const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV);
|
||||
const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
|
||||
const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
|
||||
const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV);
|
||||
|
||||
LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
|
||||
<< *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
|
||||
|
@ -90,6 +90,61 @@ for.end: ; preds = %for.body
|
||||
; CHECK: ret i32 %add.lcssa
|
||||
}
|
||||
|
||||
; test D66575
|
||||
; def hoo2(a, id, num):
|
||||
; for i0 in range(id*64, 4096, num*64):
|
||||
; for i1 in range(0, 4096, 32):
|
||||
; for i2 in range(0, 4096, 32):
|
||||
; load(a, i0+i1+i2+32)
|
||||
define void @hoo2(i32* nocapture %a, i64 %id, i64 %num) nounwind uwtable readonly {
|
||||
entry:
|
||||
%ptrint = ptrtoint i32* %a to i64
|
||||
%maskedptr = and i64 %ptrint, 31
|
||||
%maskcond = icmp eq i64 %maskedptr, 0
|
||||
tail call void @llvm.assume(i1 %maskcond)
|
||||
%id.mul = shl nsw i64 %id, 6
|
||||
%num.mul = shl nsw i64 %num, 6
|
||||
br label %for0.body
|
||||
|
||||
for0.body:
|
||||
%i0 = phi i64 [ %id.mul, %entry ], [ %i0.next, %for0.end ]
|
||||
br label %for1.body
|
||||
|
||||
for1.body:
|
||||
%i1 = phi i64 [ 0, %for0.body ], [ %i1.next, %for1.end ]
|
||||
br label %for2.body
|
||||
|
||||
for2.body:
|
||||
%i2 = phi i64 [ 0, %for1.body ], [ %i2.next, %for2.body ]
|
||||
|
||||
%t1 = add nuw nsw i64 %i0, %i1
|
||||
%t2 = add nuw nsw i64 %t1, %i2
|
||||
%t3 = add nuw nsw i64 %t2, 32
|
||||
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %t3
|
||||
%x = load i32, i32* %arrayidx, align 4
|
||||
|
||||
%i2.next = add nuw nsw i64 %i2, 32
|
||||
%cmp2 = icmp ult i64 %i2.next, 4096
|
||||
br i1 %cmp2, label %for2.body, label %for1.end
|
||||
|
||||
for1.end:
|
||||
%i1.next = add nuw nsw i64 %i1, 32
|
||||
%cmp1 = icmp ult i64 %i1.next, 4096
|
||||
br i1 %cmp1, label %for1.body, label %for0.end
|
||||
|
||||
for0.end:
|
||||
%i0.next = add nuw nsw i64 %i0, %num.mul
|
||||
%cmp0 = icmp ult i64 %i0.next, 4096
|
||||
br i1 %cmp0, label %for0.body, label %return
|
||||
|
||||
return:
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: @hoo2
|
||||
; CHECK: load i32, i32* %arrayidx, align 32
|
||||
; CHECK: ret void
|
||||
}
|
||||
|
||||
define i32 @joo(i32* nocapture %a) nounwind uwtable readonly {
|
||||
entry:
|
||||
%ptrint = ptrtoint i32* %a to i64
|
||||
|
Loading…
x
Reference in New Issue
Block a user