Updates.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@30245 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-12 02:47:10 +00:00 · 2006-09-11 05:35:17 +00:00 · 2006-09-11 05:35:17 +00:00 · f47d167c3b
commit f47d167c3b
parent 0f4aa6ee20
2 changed files with 0 additions and 154 deletions
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -147,32 +147,6 @@ and ISD::FMAX node types?
 //===---------------------------------------------------------------------===//
 The first basic block of this code:
 declare bool %foo()
 int %bar() {
        %V = call bool %foo()
        br bool %V, label %T, label %F
 T:
        ret int 1
 F:
        call bool %foo()
        ret int 12
 }
 compiles to:
 _bar:
        subl $12, %esp
        call L_foo$stub
        xorb $1, %al
        testb %al, %al
        jne LBB_bar_2   # F
 It would be better to emit "cmpb $1, %al" rather than an xor and a test.
 //===---------------------------------------------------------------------===//
 Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
 feasible.
@@ -274,33 +248,6 @@ instead of por and movdqa. Does it matter?
 //===---------------------------------------------------------------------===//
 Use movddup to splat a v2f64 directly from a memory source, e.g.:
 #include <emmintrin.h>
 void test(__m128d *r, double A) {
  *r = _mm_set1_pd(A);
 }
 llc:
 _test:
 	movsd 8(%esp), %xmm0
 	unpcklpd %xmm0, %xmm0
 	movl 4(%esp), %eax
 	movapd %xmm0, (%eax)
 	ret
 icc:
 _test:
 	movl 4(%esp), %eax
 	movddup 8(%esp), %xmm0
 	movapd %xmm0, (%eax)
 	ret
 //===---------------------------------------------------------------------===//
 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 to choose between movaps, movapd, and movdqa based on types of source and
 destination?
@@ -311,69 +258,6 @@ shorter.
 //===---------------------------------------------------------------------===//
 We are emitting bad code for this:
 float %test(float* %V, int %I, int %D, float %V) {
 entry:
 	%tmp = seteq int %D, 0
 	br bool %tmp, label %cond_true, label %cond_false23
 cond_true:
 	%tmp3 = getelementptr float* %V, int %I
 	%tmp = load float* %tmp3
 	%tmp5 = setgt float %tmp, %V
 	%tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V )
 	%tmp7 = or bool %tmp5, %tmp6
 	br bool %tmp7, label %UnifiedReturnBlock, label %cond_next
 cond_next:
 	%tmp10 = add int %I, 1
 	%tmp12 = getelementptr float* %V, int %tmp10
 	%tmp13 = load float* %tmp12
 	%tmp15 = setle float %tmp13, %V
 	%tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V )
 	%tmp17 = or bool %tmp15, %tmp16
 	%retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00
 	ret float %retval
 cond_false23:
 	%tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V )
 	ret float %tmp28
 UnifiedReturnBlock:		; preds = %cond_true
 	ret float 0.000000e+00
 }
 declare bool %llvm.isunordered.f32(float, float)
 declare float %foo(float*, int, int, float)
 It exposes a known load-folding problem (the movss load is not folded into the
 ucomiss as a memory operand):
 	movss (%edx,%ecx,4), %xmm1
 	ucomiss %xmm1, %xmm0
 As well as this:
 LBB_test_2:	# cond_next
 	movss LCPI1_0, %xmm2
 	pxor %xmm3, %xmm3
 	ucomiss %xmm0, %xmm1
 	jbe LBB_test_6	# cond_next
 LBB_test_5:	# cond_next
 	movaps %xmm2, %xmm3
 LBB_test_6:	# cond_next
 	movss %xmm3, 40(%esp)
 	flds 40(%esp)
 	addl $44, %esp
 	ret
 Clearly it's unnecessary to clear %xmm3. It's also not clear why we are emitting
 three moves (movss, movaps, movss).
 //===---------------------------------------------------------------------===//
 External test Nurbs exposed some problems. Look for
 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 emits:
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -390,44 +390,6 @@ require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 //===---------------------------------------------------------------------===//
 This code compiles to ugly code, probably because the instruction costs are off:
 void %test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %loadVector37.712 = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
 }
 Generates:
 _test:
        pxor %xmm0, %xmm0
        movd %xmm0, %eax        ;; EAX = 0!
        movl 8(%esp), %ecx
        movaps (%ecx), %xmm0
        pinsrw $6, %eax, %xmm0
        shrl $16, %eax          ;; EAX = 0 again!
        pinsrw $7, %eax, %xmm0
        movaps %xmm0, (%ecx)
        ret
 It would be better to generate:
 _test:
        movl 8(%esp), %ecx
        movaps (%ecx), %xmm0
 	xor %eax, %eax
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0
        movaps %xmm0, (%ecx)
        ret
 or use pxor (to make a zero vector) and shuffle (to insert it).
 //===---------------------------------------------------------------------===//
 Bad codegen:
 char foo(int x) { return x; }