mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-26 20:57:15 +00:00
A few minor updates, removing implemented stuff and adding a couple of
new things. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@47458 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
69e6a8d5a8
commit
a2e7efa6d3
@ -54,6 +54,17 @@ One better solution for 1LL << x is:
|
||||
|
||||
But that requires good 8-bit subreg support.
|
||||
|
||||
Also, this might be better. It's an extra shift, but it's one instruction
|
||||
shorter, and doesn't stress 8-bit subreg support.
|
||||
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
|
||||
but without the unnecessary and.)
|
||||
movl %ecx, %eax
|
||||
shrl $5, %eax
|
||||
movl %eax, %edx
|
||||
xorl $1, %edx
|
||||
sall %cl, %eax
|
||||
sall %cl, %edx
|
||||
|
||||
64-bit shifts (in general) expand to really bad code. Instead of using
|
||||
cmovs, we should expand to a conditional branch like GCC produces.
|
||||
|
||||
@ -67,6 +78,9 @@ into:
|
||||
xorl $1, %eax
|
||||
ret
|
||||
|
||||
(Although note that this isn't a legal way to express the code that llvm-gcc
|
||||
currently generates for that function.)
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Some isel ideas:
|
||||
@ -94,34 +108,6 @@ the coalescer how to deal with it though.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Count leading zeros and count trailing zeros:
|
||||
|
||||
int clz(int X) { return __builtin_clz(X); }
|
||||
int ctz(int X) { return __builtin_ctz(X); }
|
||||
|
||||
$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
|
||||
clz:
|
||||
bsr %eax, DWORD PTR [%esp+4]
|
||||
xor %eax, 31
|
||||
ret
|
||||
ctz:
|
||||
bsf %eax, DWORD PTR [%esp+4]
|
||||
ret
|
||||
|
||||
however, check that these are defined for 0 and 32. Our intrinsics are, GCC's
|
||||
aren't.
|
||||
|
||||
Another example (use predsimplify to eliminate a select):
|
||||
|
||||
int foo (unsigned long j) {
|
||||
if (j)
|
||||
return __builtin_ffs (j) - 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
It appears icc uses push for parameter passing. Need to investigate.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
@ -236,32 +222,6 @@ which is probably slower, but it's interesting at least :)
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
The first BB of this code:
|
||||
|
||||
declare bool %foo()
|
||||
int %bar() {
|
||||
%V = call bool %foo()
|
||||
br bool %V, label %T, label %F
|
||||
T:
|
||||
ret int 1
|
||||
F:
|
||||
call bool %foo()
|
||||
ret int 12
|
||||
}
|
||||
|
||||
compiles to:
|
||||
|
||||
_bar:
|
||||
subl $12, %esp
|
||||
call L_foo$stub
|
||||
xorb $1, %al
|
||||
testb %al, %al
|
||||
jne LBB_bar_2 # F
|
||||
|
||||
It would be better to emit "cmp %al, 1" than a xor and test.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
|
||||
We should leave these as libcalls for everything over a much lower threshold,
|
||||
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
|
||||
@ -483,19 +443,24 @@ shorter than movl + leal.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
|
||||
__builtin_ffs codegen is messy.
|
||||
|
||||
int ctz_(unsigned X) { return __builtin_ctz(X); }
|
||||
int clz_(unsigned X) { return __builtin_clz(X); }
|
||||
int ffs_(unsigned X) { return __builtin_ffs(X); }
|
||||
|
||||
_ctz_:
|
||||
bsfl 4(%esp), %eax
|
||||
ret
|
||||
_clz_:
|
||||
bsrl 4(%esp), %eax
|
||||
xorl $31, %eax
|
||||
llvm produces:
|
||||
ffs_:
|
||||
movl 4(%esp), %ecx
|
||||
bsfl %ecx, %eax
|
||||
movl $32, %edx
|
||||
cmove %edx, %eax
|
||||
incl %eax
|
||||
xorl %edx, %edx
|
||||
testl %ecx, %ecx
|
||||
cmove %edx, %eax
|
||||
ret
|
||||
|
||||
vs gcc:
|
||||
|
||||
_ffs_:
|
||||
movl $-1, %edx
|
||||
bsfl 4(%esp), %eax
|
||||
@ -503,6 +468,15 @@ _ffs_:
|
||||
addl $1, %eax
|
||||
ret
|
||||
|
||||
Another example of __builtin_ffs (use predsimplify to eliminate a select):
|
||||
|
||||
int foo (unsigned long j) {
|
||||
if (j)
|
||||
return __builtin_ffs (j) - 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
It appears gcc places string data with linkonce linkage in
|
||||
@ -1062,6 +1036,8 @@ Should compile to:
|
||||
setae %al
|
||||
ret
|
||||
|
||||
FIXME: That code looks wrong; bool return is normally defined as zext.
|
||||
|
||||
on x86-64, not:
|
||||
|
||||
__Z11no_overflowjj:
|
||||
@ -1208,35 +1184,44 @@ void compare (long long foo) {
|
||||
|
||||
to:
|
||||
|
||||
_compare:
|
||||
subl $12, %esp
|
||||
cmpl $0, 16(%esp)
|
||||
compare:
|
||||
subl $4, %esp
|
||||
cmpl $0, 8(%esp)
|
||||
setne %al
|
||||
movzbw %al, %ax
|
||||
cmpl $1, 20(%esp)
|
||||
cmpl $1, 12(%esp)
|
||||
setg %cl
|
||||
movzbw %cl, %cx
|
||||
cmove %ax, %cx
|
||||
movw %cx, %ax
|
||||
testb $1, %al
|
||||
je LBB1_2 # cond_true
|
||||
testb $1, %cl
|
||||
jne .LBB1_2 # UnifiedReturnBlock
|
||||
.LBB1_1: # ifthen
|
||||
call abort
|
||||
.LBB1_2: # UnifiedReturnBlock
|
||||
addl $4, %esp
|
||||
ret
|
||||
|
||||
(also really horrible code on ppc). This is due to the expand code for 64-bit
|
||||
compares. GCC produces multiple branches, which is much nicer:
|
||||
|
||||
_compare:
|
||||
pushl %ebp
|
||||
movl %esp, %ebp
|
||||
subl $8, %esp
|
||||
movl 8(%ebp), %eax
|
||||
movl 12(%ebp), %edx
|
||||
subl $1, %edx
|
||||
jg L5
|
||||
L7:
|
||||
jl L4
|
||||
compare:
|
||||
subl $12, %esp
|
||||
movl 20(%esp), %edx
|
||||
movl 16(%esp), %eax
|
||||
decl %edx
|
||||
jle .L7
|
||||
.L5:
|
||||
addl $12, %esp
|
||||
ret
|
||||
.p2align 4,,7
|
||||
.L7:
|
||||
jl .L4
|
||||
cmpl $0, %eax
|
||||
jbe L4
|
||||
L5:
|
||||
.p2align 4,,8
|
||||
ja .L5
|
||||
.L4:
|
||||
.p2align 4,,9
|
||||
call abort
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
@ -1380,7 +1365,7 @@ Should compile into:
|
||||
|
||||
_foo:
|
||||
movzwl 4(%esp), %eax
|
||||
orb $-1, %al ;; 'orl 255' is also fine :)
|
||||
orl $255, %eax
|
||||
ret
|
||||
|
||||
instead of:
|
||||
@ -1550,6 +1535,48 @@ See PR2053 for more details.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
We should investigate using cdq/cltd (effect: edx = sar eax, 31)
|
||||
more aggressively; it should cost the same as a move+shift on any modern
|
||||
processor, but it's a lot shorter. Downside is that it puts more
|
||||
pressure on register allocation because it has fixed operands.
|
||||
|
||||
Example:
|
||||
int abs(int x) {return x < 0 ? -x : x;}
|
||||
|
||||
gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
|
||||
abs:
|
||||
movl 4(%esp), %eax
|
||||
cltd
|
||||
xorl %edx, %eax
|
||||
subl %edx, %eax
|
||||
ret
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Consider:
|
||||
|
||||
#include <inttypes.h>
|
||||
uint64_t a;
|
||||
uint16_t b;
|
||||
uint64_t mul(void) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
Currently, we generate the following:
|
||||
|
||||
mul:
|
||||
movzwl b, %ecx
|
||||
movl %ecx, %eax
|
||||
mull a
|
||||
imull a+4, %ecx
|
||||
addl %edx, %ecx
|
||||
movl %ecx, %edx
|
||||
ret
|
||||
|
||||
llvm should be able to commute the addl so that the movl isn't necessary.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Consider:
|
||||
int test(unsigned long a, unsigned long b) { return -(a < b); }
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user