mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-27 05:32:22 +00:00
update a bunch of entries.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@122700 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
67fb341f8b
commit
527b47d189
@ -2,38 +2,6 @@ Target Independent Opportunities:
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
We should recognize idioms for add-with-carry and turn them into the appropriate
|
||||
intrinsics. This example:
|
||||
|
||||
unsigned add32carry(unsigned sum, unsigned x) {
|
||||
unsigned z = sum + x;
|
||||
if (sum + x < x)
|
||||
z++;
|
||||
return z;
|
||||
}
|
||||
|
||||
Compiles to: clang t.c -S -o - -O3 -fomit-frame-pointer -m64 -mkernel
|
||||
|
||||
_add32carry: ## @add32carry
|
||||
addl %esi, %edi
|
||||
cmpl %esi, %edi
|
||||
sbbl %eax, %eax
|
||||
andl $1, %eax
|
||||
addl %edi, %eax
|
||||
ret
|
||||
|
||||
with clang, but to:
|
||||
|
||||
_add32carry:
|
||||
leal (%rsi,%rdi), %eax
|
||||
cmpl %esi, %eax
|
||||
adcl $0, %eax
|
||||
ret
|
||||
|
||||
with gcc.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Dead argument elimination should be enhanced to handle cases when an argument is
|
||||
dead to an externally visible function. Though the argument can't be removed
|
||||
from the externally visible function, the caller doesn't need to pass it in.
|
||||
@ -82,6 +50,9 @@ unsigned int mul(unsigned int a,unsigned int b) {
|
||||
return a*b;
|
||||
}
|
||||
|
||||
The legalization code for mul-with-overflow needs to be made more robust before
|
||||
this can be implemented though.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
|
||||
@ -92,41 +63,6 @@ right).
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Solve this DAG isel folding deficiency:
|
||||
|
||||
int X, Y;
|
||||
|
||||
void fn1(void)
|
||||
{
|
||||
X = X | (Y << 3);
|
||||
}
|
||||
|
||||
compiles to
|
||||
|
||||
fn1:
|
||||
movl Y, %eax
|
||||
shll $3, %eax
|
||||
orl X, %eax
|
||||
movl %eax, X
|
||||
ret
|
||||
|
||||
The problem is the store's chain operand is not the load X but rather
|
||||
a TokenFactor of the load X and load Y, which prevents the folding.
|
||||
|
||||
There are two ways to fix this:
|
||||
|
||||
1. The dag combiner can start using alias analysis to realize that X and Y
|
||||
don't alias, making the store to X not dependent on the load from Y.
|
||||
2. The generated isel could be made smarter in the case it can't
|
||||
disambiguate the pointers.
|
||||
|
||||
Number 1 is the preferred solution.
|
||||
|
||||
This has been "fixed" by a TableGen hack. But that is a short term workaround
|
||||
which will be removed once the proper fix is made.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
On targets with expensive 64-bit multiply, we could LSR this:
|
||||
|
||||
for (i = ...; ++i) {
|
||||
@ -339,14 +275,6 @@ unsigned long reverse(unsigned v) {
|
||||
return v ^ (t >> 8);
|
||||
}
|
||||
|
||||
Neither is this (very standard idiom):
|
||||
|
||||
int f(int n)
|
||||
{
|
||||
return (((n) << 24) | (((n) & 0xff00) << 8)
|
||||
| (((n) >> 8) & 0xff00) | ((n) >> 24));
|
||||
}
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
[LOOP RECOGNITION]
|
||||
@ -382,9 +310,7 @@ unsigned int popcount(unsigned int input) {
|
||||
return count;
|
||||
}
|
||||
|
||||
This is a form of idiom recognition for loops, the same thing that could be
|
||||
useful for recognizing memset/memcpy. This sort of thing should be added to the
|
||||
loop idiom pass.
|
||||
This sort of thing should be added to the loop idiom pass.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
@ -639,46 +565,21 @@ struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
|
||||
extern THotKey m_HotKey;
|
||||
THotKey GetHotKey () { return m_HotKey; }
|
||||
|
||||
into (-O3 -fno-exceptions -static -fomit-frame-pointer):
|
||||
into (-m64 -O3 -fno-exceptions -static -fomit-frame-pointer):
|
||||
|
||||
__Z9GetHotKeyv:
|
||||
pushl %esi
|
||||
movl 8(%esp), %eax
|
||||
movb _m_HotKey+3, %cl
|
||||
movb _m_HotKey+4, %dl
|
||||
movb _m_HotKey+2, %ch
|
||||
movw _m_HotKey, %si
|
||||
movw %si, (%eax)
|
||||
movb %ch, 2(%eax)
|
||||
movb %cl, 3(%eax)
|
||||
movb %dl, 4(%eax)
|
||||
popl %esi
|
||||
ret $4
|
||||
|
||||
GCC produces:
|
||||
|
||||
__Z9GetHotKeyv:
|
||||
movl _m_HotKey, %edx
|
||||
movl 4(%esp), %eax
|
||||
movl %edx, (%eax)
|
||||
movzwl _m_HotKey+4, %edx
|
||||
movw %dx, 4(%eax)
|
||||
ret $4
|
||||
|
||||
The LLVM IR contains the needed alignment info, so we should be able to
|
||||
merge the loads and stores into 4-byte loads:
|
||||
|
||||
%struct.THotKey = type { i16, i8, i8, i8 }
|
||||
define void @_Z9GetHotKeyv(%struct.THotKey* sret %agg.result) nounwind {
|
||||
...
|
||||
%tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
|
||||
%tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
|
||||
%tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
|
||||
%tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
|
||||
|
||||
Alternatively, we should use a small amount of base-offset alias analysis
|
||||
to make it so the scheduler doesn't need to hold all the loads in regs at
|
||||
once.
|
||||
__Z9GetHotKeyv: ## @_Z9GetHotKeyv
|
||||
movq _m_HotKey@GOTPCREL(%rip), %rax
|
||||
movzwl (%rax), %ecx
|
||||
movzbl 2(%rax), %edx
|
||||
shlq $16, %rdx
|
||||
orq %rcx, %rdx
|
||||
movzbl 3(%rax), %ecx
|
||||
shlq $24, %rcx
|
||||
orq %rdx, %rcx
|
||||
movzbl 4(%rax), %eax
|
||||
shlq $32, %rax
|
||||
orq %rcx, %rax
|
||||
ret
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
@ -764,20 +665,6 @@ etc. On X86, we miss a bunch of 'rotate by variable' cases because the rotate
|
||||
matching code in dag combine doesn't look through truncates aggressively
|
||||
enough. Here are some testcases reduced from GCC PR17886:
|
||||
|
||||
unsigned long long f(unsigned long long x, int y) {
|
||||
return (x << y) | (x >> 64-y);
|
||||
}
|
||||
unsigned f2(unsigned x, int y){
|
||||
return (x << y) | (x >> 32-y);
|
||||
}
|
||||
unsigned long long f3(unsigned long long x){
|
||||
int y = 9;
|
||||
return (x << y) | (x >> 64-y);
|
||||
}
|
||||
unsigned f4(unsigned x){
|
||||
int y = 10;
|
||||
return (x << y) | (x >> 32-y);
|
||||
}
|
||||
unsigned long long f5(unsigned long long x, unsigned long long y) {
|
||||
return (x << 8) | ((y >> 48) & 0xffull);
|
||||
}
|
||||
@ -796,11 +683,6 @@ unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
|
||||
}
|
||||
}
|
||||
|
||||
On X86-64, we only handle f2/f3/f4 right. On x86-32, a few of these
|
||||
generate truly horrible code, instead of using shld and friends. On
|
||||
ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
|
||||
badness. PPC64 misses f, f5 and f6. CellSPU aborts in isel.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
This (and similar related idioms):
|
||||
|
@ -1507,6 +1507,8 @@ loop, the value comes into the loop as two values, and
|
||||
RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
|
||||
constructed BUILD_PAIR which represents the cast value.
|
||||
|
||||
This can be handled by making CodeGenPrepare sink the cast.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Test instructions can be eliminated by using EFLAGS values from arithmetic
|
||||
@ -1847,3 +1849,38 @@ _foo:
|
||||
0 is the only unsigned number < 1.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
This code:
|
||||
|
||||
%0 = type { i32, i1 }
|
||||
|
||||
define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
|
||||
entry:
|
||||
%uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
|
||||
%cmp = extractvalue %0 %uadd, 1
|
||||
%inc = zext i1 %cmp to i32
|
||||
%add = add i32 %x, %sum
|
||||
%z.0 = add i32 %add, %inc
|
||||
ret i32 %z.0
|
||||
}
|
||||
|
||||
declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
|
||||
|
||||
compiles to:
|
||||
|
||||
_add32carry: ## @add32carry
|
||||
addl %esi, %edi
|
||||
sbbl %ecx, %ecx
|
||||
movl %edi, %eax
|
||||
subl %ecx, %eax
|
||||
ret
|
||||
|
||||
But it could be:
|
||||
|
||||
_add32carry:
|
||||
leal (%rsi,%rdi), %eax
|
||||
cmpl %esi, %eax
|
||||
adcl $0, %eax
|
||||
ret
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
Loading…
x
Reference in New Issue
Block a user