mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-10 14:12:11 +00:00
a6425604c2
insertions. The old behavior could cause arbitrarily bad memory usage in the DAG combiner if there was heavy traffic of adding nodes already on the worklist to it. This commit switches the DAG combine worklist to work the same way as the instcombine worklist where we null-out removed entries and only add new entries to the worklist. My measurements of codegen time show slight improvement. The memory utilization is unsurprisingly dominated by other factors (the IR and DAG itself I suspect). This change results in subtle, frustrating churn in the particular order in which DAG combines are applied which causes a number of minor regressions where we fail to match a pattern previously matched by accident. AFAICT, all of these should be using AddToWorklist directly or should be written in a less brittle way. None of the changes seem drastically bad, and a few of the changes seem distinctly better. A major change required to make this work is to significantly harden the way in which the DAG combiner handles nodes which become dead (zero-uses). Previously, we relied on the ability to "priority-bump" them on the combine worklist to achieve recursive deletion of these nodes and ensure that the frontier of remaining live nodes all were added to the worklist. Instead, I've introduced a routine to just implement that precise logic with no indirection. It is a significantly simpler operation than that of the combiner worklist proper. I suspect this will also fix some other problems with the combiner. I think the x86 changes are really minor and uninteresting, but the avx512 change at least is hiding a "regression" (despite the test case being just noise, not testing some performance invariant) that might be looked into. Not sure if any of the others impact specific "important" code paths, but they didn't look terribly interesting to me, or the changes were really minor. The consensus in review is to fix any regressions that show up after the fact here.
Thanks to the other reviewers for checking the output on other architectures. There is a specific regression on ARM that Tim already has a fix prepped to commit. Differential Revision: http://reviews.llvm.org/D4616 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213727 91177308-0d34-0410-b5e6-96231b3b80d8
50 lines
1.5 KiB
LLVM
50 lines
1.5 KiB
LLVM
; RUN: llc -march=x86-64 < %s | FileCheck %s

; Target description for the whole module: little-endian ("e") data layout
; with 64-bit pointers, targeting x86-64 Linux.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"

; DAGCombiner should fold this code in finite time.
; rdar://8606584
;
; Compile-time regression test: this function carries no output checks of its
; own; it only has to make it through llc. The instruction sequence is kept
; exactly as reported, since the precise shape of the expression is what the
; combiner has to cope with.
define void @test1() nounwind readnone {
bb.nph:
  br label %while.cond

while.cond:                                       ; preds = %while.cond, %bb.nph
  %tmp6 = load i32* undef, align 4
  ; Note: despite its name, %and is produced by an 'or'.
  %and = or i64 undef, undef
  %conv11 = zext i32 undef to i64
  ; Widen the loaded i32, shift it left by one, and keep bits 1..31
  ; (4294967294 == 0xFFFFFFFE).
  %conv14 = zext i32 %tmp6 to i64
  %shl15 = shl i64 %conv14, 1
  %shl15.masked = and i64 %shl15, 4294967294
  ; Mix the masked value with the other operands: or, +1, xor, then add a
  ; second loaded value.
  %and17 = or i64 %shl15.masked, %conv11
  %add = add i64 %and17, 1
  %xor = xor i64 %add, %and
  %tmp20 = load i64* undef, align 8
  %add21 = add i64 %xor, %tmp20
  ; Narrow the 64-bit result back to i32 and store it.
  %conv22 = trunc i64 %add21 to i32
  store i32 %conv22, i32* undef, align 4
  ; The condition is the constant false, so control always takes the second
  ; target and returns to %while.cond.
  br i1 false, label %while.end, label %while.cond

while.end:                                        ; preds = %while.cond
  ret void
}

; DAG Combiner can't fold this into a load of the 1'th byte.
; PR8757
;
; The value read back from %P is truncated to i16, then:
;   shl 8   - moves the low byte into bits 8..15,
;   ashr 8  - brings it back down, sign-extending the low byte across the i16,
;   lshr 8  - keeps bits 8..15 of that, i.e. copies of the low byte's sign bit.
; The result therefore depends on byte 0 of the loaded value, not on byte 1.
; For the stored value 128 (low byte 0x80) the chain yields 0xFF, while byte 1
; of the stored word is 0 under the module's little-endian layout.
; The expected code below reflects this: a sign-extending byte load from
; offset 0 of %P followed by an extract of bits 8..15 (%ah).
define i32 @test3(i32 *%P) nounwind ssp {
  store volatile i32 128, i32* %P
  %tmp4.pre = load i32* %P
  %phitmp = trunc i32 %tmp4.pre to i16
  %phitmp13 = shl i16 %phitmp, 8
  %phitmp14 = ashr i16 %phitmp13, 8
  %phitmp15 = lshr i16 %phitmp14, 8
  %phitmp16 = zext i16 %phitmp15 to i32
  ret i32 %phitmp16

; Anchor the patterns to this function's label so they cannot be satisfied by
; code emitted for another function in the module.
; CHECK-LABEL: test3:
; CHECK: movl $128, (%rdi)
; CHECK-NEXT: movsbl (%rdi), %eax
; CHECK-NEXT: movzbl %ah, %eax
; CHECK-NEXT: ret
}