mirror of
https://github.com/RPCS3/llvm.git
synced 2024-12-27 06:35:30 +00:00
Updates.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@30245 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
0f4aa6ee20
commit
f47d167c3b
@ -147,32 +147,6 @@ and ISD::FMAX node types?
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
The first basic block of this code:
|
||||
|
||||
declare bool %foo()
|
||||
int %bar() {
|
||||
%V = call bool %foo()
|
||||
br bool %V, label %T, label %F
|
||||
T:
|
||||
ret int 1
|
||||
F:
|
||||
call bool %foo()
|
||||
ret int 12
|
||||
}
|
||||
|
||||
compiles to:
|
||||
|
||||
_bar:
|
||||
subl $12, %esp
|
||||
call L_foo$stub
|
||||
xorb $1, %al
|
||||
testb %al, %al
|
||||
jne LBB_bar_2 # F
|
||||
|
||||
It would be better to emit a single "cmpb $1, %al" than an xor followed by a test.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Lower memcpy / memset to a series of 128-bit SSE move instructions when it's
|
||||
feasible.
|
||||
|
||||
@ -274,33 +248,6 @@ instead of por and movdqa. Does it matter?
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Use movddup to splat a v2f64 directly from a memory source. e.g.
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
void test(__m128d *r, double A) {
|
||||
*r = _mm_set1_pd(A);
|
||||
}
|
||||
|
||||
llc:
|
||||
|
||||
_test:
|
||||
movsd 8(%esp), %xmm0
|
||||
unpcklpd %xmm0, %xmm0
|
||||
movl 4(%esp), %eax
|
||||
movapd %xmm0, (%eax)
|
||||
ret
|
||||
|
||||
icc:
|
||||
|
||||
_test:
|
||||
movl 4(%esp), %eax
|
||||
movddup 8(%esp), %xmm0
|
||||
movapd %xmm0, (%eax)
|
||||
ret
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
|
||||
to choose between movaps, movapd, and movdqa based on types of source and
|
||||
destination?
|
||||
@ -311,69 +258,6 @@ shorter.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
We are emitting bad code for this:
|
||||
|
||||
float %test(float* %V, int %I, int %D, float %V) {
|
||||
entry:
|
||||
%tmp = seteq int %D, 0
|
||||
br bool %tmp, label %cond_true, label %cond_false23
|
||||
|
||||
cond_true:
|
||||
%tmp3 = getelementptr float* %V, int %I
|
||||
%tmp = load float* %tmp3
|
||||
%tmp5 = setgt float %tmp, %V
|
||||
%tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V )
|
||||
%tmp7 = or bool %tmp5, %tmp6
|
||||
br bool %tmp7, label %UnifiedReturnBlock, label %cond_next
|
||||
|
||||
cond_next:
|
||||
%tmp10 = add int %I, 1
|
||||
%tmp12 = getelementptr float* %V, int %tmp10
|
||||
%tmp13 = load float* %tmp12
|
||||
%tmp15 = setle float %tmp13, %V
|
||||
%tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V )
|
||||
%tmp17 = or bool %tmp15, %tmp16
|
||||
%retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00
|
||||
ret float %retval
|
||||
|
||||
cond_false23:
|
||||
%tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V )
|
||||
ret float %tmp28
|
||||
|
||||
UnifiedReturnBlock: ; preds = %cond_true
|
||||
ret float 0.000000e+00
|
||||
}
|
||||
|
||||
declare bool %llvm.isunordered.f32(float, float)
|
||||
|
||||
declare float %foo(float*, int, int, float)
|
||||
|
||||
|
||||
It exposes a known load folding problem:
|
||||
|
||||
movss (%edx,%ecx,4), %xmm1
|
||||
ucomiss %xmm1, %xmm0
|
||||
|
||||
As well as this:
|
||||
|
||||
LBB_test_2: # cond_next
|
||||
movss LCPI1_0, %xmm2
|
||||
pxor %xmm3, %xmm3
|
||||
ucomiss %xmm0, %xmm1
|
||||
jbe LBB_test_6 # cond_next
|
||||
LBB_test_5: # cond_next
|
||||
movaps %xmm2, %xmm3
|
||||
LBB_test_6: # cond_next
|
||||
movss %xmm3, 40(%esp)
|
||||
flds 40(%esp)
|
||||
addl $44, %esp
|
||||
ret
|
||||
|
||||
Clearly it's unnecessary to clear %xmm3. It's also not clear why we are emitting
|
||||
three moves (movss, movaps, movss).
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
External test Nurbs exposed some problems. Look for
|
||||
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
|
||||
emits:
|
||||
|
@ -390,44 +390,6 @@ require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
This input produces ugly code, probably because the instruction costs are off:
|
||||
|
||||
void %test(float* %P, <4 x float>* %P2 ) {
|
||||
%xFloat0.688 = load float* %P
|
||||
%loadVector37.712 = load <4 x float>* %P2
|
||||
%inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
|
||||
store <4 x float> %inFloat3.713, <4 x float>* %P2
|
||||
ret void
|
||||
}
|
||||
|
||||
Generates:
|
||||
|
||||
_test:
|
||||
pxor %xmm0, %xmm0
|
||||
movd %xmm0, %eax ;; EAX = 0!
|
||||
movl 8(%esp), %ecx
|
||||
movaps (%ecx), %xmm0
|
||||
pinsrw $6, %eax, %xmm0
|
||||
shrl $16, %eax ;; EAX = 0 again!
|
||||
pinsrw $7, %eax, %xmm0
|
||||
movaps %xmm0, (%ecx)
|
||||
ret
|
||||
|
||||
It would be better to generate:
|
||||
|
||||
_test:
|
||||
movl 8(%esp), %ecx
|
||||
movaps (%ecx), %xmm0
|
||||
xor %eax, %eax
|
||||
pinsrw $6, %eax, %xmm0
|
||||
pinsrw $7, %eax, %xmm0
|
||||
movaps %xmm0, (%ecx)
|
||||
ret
|
||||
|
||||
or use pxor (to make a zero vector) and shuffle (to insert it).
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
Bad codegen:
|
||||
|
||||
char foo(int x) { return x; }
|
||||
|
Loading…
Reference in New Issue
Block a user