mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-12-11 21:45:16 +00:00
db63a47135
The original take 1 was 6102310d814ad73eab60a88b21dd70874f7a056f, which taught InstSimplify to do that, which seemed better at the time, since we got EarlyCSE support for free. However, it was proven that we cannot do that there: the simplified-to PHI would not be reachable from the original PHI, and that is not something InstSimplify is allowed to do, as noted in the commit ed90f15efb40d26b5d3ead3bb8e9e284218e0186 that reverted it: > It appears to cause compilation non-determinism and caused stage3 mismatches. Then there was take 2 3e69871ab5a66fb55913a2a2f5e7f5b42899a4c9, which was InstCombine-specific, but it again showed stage2-stage3 differences, and was reverted in bdaa3f86a040b138c58de41d73d35b76fdec1380. This is quite alarming. Here, let's try to change how we find an existing PHI candidate: due to the worklist order, and the way PHI nodes are inserted (it may be inserted as the first one, or maybe not), let's look at *all* PHI nodes in the block. Effects on vanilla llvm test-suite + RawSpeed: ``` | statistic name | baseline | proposed | Δ | % | \|%\| | |----------------------------------------------------|-----------|-----------|-------:|---------:|---------:| | asm-printer.EmittedInsts | 7942329 | 7942457 | 128 | 0.00% | 0.00% | | assembler.ObjectBytes | 254295632 | 254312480 | 16848 | 0.01% | 0.01% | | correlated-value-propagation.NumPhis | 18412 | 18347 | -65 | -0.35% | 0.35% | | early-cse.NumCSE | 2183283 | 2183267 | -16 | 0.00% | 0.00% | | early-cse.NumSimplify | 550105 | 541842 | -8263 | -1.50% | 1.50% | | instcombine.NumAggregateReconstructionsSimplified | 73 | 4506 | 4433 | 6072.60% | 6072.60% | | instcombine.NumCombined | 3640311 | 3644419 | 4108 | 0.11% | 0.11% | | instcombine.NumDeadInst | 1778204 | 1783205 | 5001 | 0.28% | 0.28% | | instcombine.NumPHICSEs | 0 | 22490 | 22490 | 0.00% | 0.00% | | instcombine.NumWorklistIterations | 2023272 | 2024400 | 1128 | 0.06% | 0.06% | | instcount.NumCallInst | 1758395 | 1758802 | 407 | 0.02% | 0.02% | | 
instcount.NumInvokeInst | 59478 | 59502 | 24 | 0.04% | 0.04% | | instcount.NumPHIInst | 330557 | 330545 | -12 | 0.00% | 0.00% | | instcount.TotalBlocks | 1077138 | 1077220 | 82 | 0.01% | 0.01% | | instcount.TotalFuncs | 101442 | 101441 | -1 | 0.00% | 0.00% | | instcount.TotalInsts | 8831946 | 8832606 | 660 | 0.01% | 0.01% | | simplifycfg.NumHoistCommonCode | 24186 | 24187 | 1 | 0.00% | 0.00% | | simplifycfg.NumInvokes | 4300 | 4410 | 110 | 2.56% | 2.56% | | simplifycfg.NumSimpl | 1019813 | 999767 | -20046 | -1.97% | 1.97% | ``` So it fires 22490 times, which is fewer than the ~24k times take 1 did, but more than what take 2 did (22228 times). It allows foldAggregateConstructionIntoAggregateReuse() to actually work after PHI-of-extractvalue folds did their thing. Previously SimplifyCFG would have done this PHI CSE, of all places. Additionally, it allows some more `invoke`->`call` folds to happen (+110, +2.56%). All in all, expectedly, this catches fewer things overall, but all the motivational cases are still caught, so all good. |
||
---|---|---|
.. | ||
AArch64 | ||
AMDGPU | ||
ARM | ||
Hexagon | ||
PowerPC | ||
SystemZ | ||
X86 | ||
XCore | ||
12-12-11-if-conv.ll | ||
2012-10-20-infloop.ll | ||
2012-10-22-isconsec.ll | ||
2016-07-27-loop-vec.ll | ||
alias-set-with-uncomputable-bounds.ll | ||
align.ll | ||
assume.ll | ||
bsd_regex.ll | ||
bzip_reverse_loops.ll | ||
calloc.ll | ||
cast-induction.ll | ||
check-prof-info.ll | ||
conditional-assignment.ll | ||
consec_no_gep.ll | ||
consecutive-ptr-uniforms.ll | ||
control-flow.ll | ||
cpp-new-array.ll | ||
dbg.value.ll | ||
dead_instructions.ll | ||
debugloc.ll | ||
demanded-bits-of-pointer-instruction.ll | ||
diag-missing-instr-debug-loc.ll | ||
diag-with-hotness-info-2.ll | ||
diag-with-hotness-info.ll | ||
disable_nonforced_enable.ll | ||
disable_nonforced.ll | ||
discriminator.ll | ||
ee-crash.ll | ||
exact.ll | ||
explicit_outer_detection.ll | ||
explicit_outer_nonuniform_inner.ll | ||
explicit_outer_uniform_diverg_branch.ll | ||
fcmp-vectorize.ll | ||
first-order-recurrence-complex.ll | ||
first-order-recurrence-multiply-recurrences.ll | ||
first-order-recurrence.ll | ||
fix-reduction-dbg.ll | ||
flags.ll | ||
float-induction.ll | ||
float-minmax-instruction-flag.ll | ||
float-reduction.ll | ||
fneg.ll | ||
followup.ll | ||
funcall.ll | ||
gcc-examples.ll | ||
gep_with_bitcast.ll | ||
global_alias.ll | ||
hints-trans.ll | ||
hoist-loads.ll | ||
i8-induction.ll | ||
icmp-uniforms.ll | ||
if-conv-crash.ll | ||
if-conversion-edgemasks.ll | ||
if-conversion-nest.ll | ||
if-conversion-reduction.ll | ||
if-conversion.ll | ||
if-pred-non-void.ll | ||
if-pred-not-when-safe.ll | ||
if-pred-stores.ll | ||
if-reduction.ll | ||
incorrect-dom-info.ll | ||
increment.ll | ||
induction_plus.ll | ||
induction-step.ll | ||
induction.ll | ||
infiniteloop.ll | ||
int_sideeffect.ll | ||
interleaved-accesses-1.ll | ||
interleaved-accesses-2.ll | ||
interleaved-accesses-3.ll | ||
interleaved-accesses-alias.ll | ||
interleaved-accesses-masked-group.ll | ||
interleaved-accesses-pred-stores.ll | ||
interleaved-accesses-uniform-load.ll | ||
interleaved-accesses.ll | ||
interleaved-acess-with-remarks.ll | ||
intrinsic.ll | ||
invariant-store-vectorization.ll | ||
iv_outside_user.ll | ||
lcssa-crash.ll | ||
libcall-remark.ll | ||
lifetime.ll | ||
loop-form.ll | ||
loop-legality-checks.ll | ||
loop-scalars.ll | ||
loop-vect-memdep.ll | ||
loop-vect-option.ll | ||
memdep-fold-tail.ll | ||
memdep.ll | ||
metadata-unroll.ll | ||
metadata-width.ll | ||
metadata.ll | ||
middle-block-dbg.ll | ||
miniters.ll | ||
minmax_reduction.ll | ||
multi-use-reduction-bug.ll | ||
multiple-address-spaces.ll | ||
multiple-strides-vectorization.ll | ||
no_array_bounds.ll | ||
no_idiv_reduction.ll | ||
no_int_induction.ll | ||
no_outside_user.ll | ||
no_switch_disable_vectorization.ll | ||
no_switch.ll | ||
no-interleave-up-front.ll | ||
noalias-md-licm.ll | ||
noalias-md.ll | ||
nofloat-report.ll | ||
nofloat.ll | ||
non-const-n.ll | ||
nontemporal.ll | ||
novect-lcssa-cfg-invalidation.ll | ||
nsw-crash.ll | ||
nuw.ll | ||
opt.ll | ||
optsize.ll | ||
outer_loop_test1.ll | ||
outer_loop_test2.ll | ||
partial-lcssa.ll | ||
phi-cost.ll | ||
phi-hang.ll | ||
pointer-induction.ll | ||
pr25281.ll | ||
pr28541.ll | ||
pr30654-phiscev-sext-trunc.ll | ||
pr30806-phi-scev.ll | ||
pr30806.ll | ||
pr31098.ll | ||
pr31190.ll | ||
pr32859.ll | ||
pr33706.ll | ||
pr34681.ll | ||
pr35743.ll | ||
pr35773.ll | ||
pr36311.ll | ||
pr36983.ll | ||
pr37248.ll | ||
pr37515.ll | ||
pr38697.ll | ||
pr38800.ll | ||
pr39099.ll | ||
pr39417-optsize-scevchecks.ll | ||
pr43166-fold-tail-by-masking.ll | ||
pr44488-predication.ll | ||
pr45259.ll | ||
pr45525.ll | ||
pr45679-fold-tail-by-masking.ll | ||
pr46525-expander-insertpoint.ll | ||
preserve-dbg-loc-and-loop-metadata.ll | ||
ptr_loops.ll | ||
ptr-induction.ll | ||
read-only.ll | ||
reduction-inloop-uf4.ll | ||
reduction-inloop.ll | ||
reduction-order.ll | ||
reduction-predselect.ll | ||
reduction-small-size.ll | ||
reduction.ll | ||
remove_metadata.ll | ||
reverse_induction.ll | ||
reverse_iter.ll | ||
runtime-check-address-space.ll | ||
runtime-check-needed-but-empty.ll | ||
runtime-check-readonly-address-space.ll | ||
runtime-check-readonly.ll | ||
runtime-check.ll | ||
runtime-limit.ll | ||
safegep.ll | ||
same-base-access.ll | ||
scalar_after_vectorization.ll | ||
scalar-select.ll | ||
scev-exitlim-crash.ll | ||
simple-unroll.ll | ||
skip-iterations.ll | ||
small-loop.ll | ||
start-non-zero.ll | ||
store-shuffle-bug.ll | ||
struct_access.ll | ||
tail-folding-counting-down.ll | ||
tail-folding-vectorization-factor-1.ll | ||
tbaa-nodep.ll | ||
tripcount.ll | ||
undef-inst-bug.ll | ||
unroll_novec.ll | ||
unroll-novec-memcheck-metadata.ll | ||
unroll.ll | ||
unsafe-dep-remark.ll | ||
unsized-pointee-crash.ll | ||
use-scalar-epilogue-if-tp-fails.ll | ||
value-ptr-bug.ll | ||
vect-phiscev-sext-trunc.ll | ||
vect.omp.persistence.ll | ||
vect.stats.ll | ||
vector-geps.ll | ||
vector-intrinsic-call-cost.ll | ||
vectorize-once.ll | ||
vectorizeVFone.ll | ||
version-mem-access.ll | ||
vplan_hcfg_stress_test.ll | ||
vplan-outer-loop-uncomputable-trip-count.ll | ||
vplan-stress-test-no-explict-vf.ll | ||
write-only.ll | ||
zero-sized-pointee-crash.ll |