Another entry.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27784 91177308-0d34-0410-b5e6-96231b3b80d8
2025-01-26 20:57:15 +00:00 · 2006-04-18 00:21:01 +00:00 · 2006-04-18 00:21:01 +00:00 · 7fa094a261
commit 7fa094a261
parent cdfc3c82a7
1 changed files with 151 additions and 0 deletions
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@ -810,3 +810,154 @@ destination?
 How about andps, andpd, and pand? Do we really care about the type of the packed
 elements? If not, why not always use the "ps" variants which are likely to be
 shorter.
+
+//===---------------------------------------------------------------------===//
+
+We are emitting bad code for this:
+
+float %test(float* %V, int %I, int %D, float %V) {
+entry:
+	%tmp = seteq int %D, 0
+	br bool %tmp, label %cond_true, label %cond_false23
+
+cond_true:
+	%tmp3 = getelementptr float* %V, int %I
+	%tmp = load float* %tmp3
+	%tmp5 = setgt float %tmp, %V
+	%tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V )
+	%tmp7 = or bool %tmp5, %tmp6
+	br bool %tmp7, label %UnifiedReturnBlock, label %cond_next
+
+cond_next:
+	%tmp10 = add int %I, 1
+	%tmp12 = getelementptr float* %V, int %tmp10
+	%tmp13 = load float* %tmp12
+	%tmp15 = setle float %tmp13, %V
+	%tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V )
+	%tmp17 = or bool %tmp15, %tmp16
+	%retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00
+	ret float %retval
+
+cond_false23:
+	%tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V )
+	ret float %tmp28
+
+UnifiedReturnBlock:		; preds = %cond_true
+	ret float 0.000000e+00
+}
+
+declare bool %llvm.isunordered.f32(float, float)
+
+declare float %foo(float*, int, int, float)
+
+
+It exposes a known load folding problem:
+
+	movss (%edx,%ecx,4), %xmm1
+	ucomiss %xmm1, %xmm0
+
+As well as this:
+
+LBB_test_2:	# cond_next
+	movss LCPI1_0, %xmm2
+	pxor %xmm3, %xmm3
+	ucomiss %xmm0, %xmm1
+	jbe LBB_test_6	# cond_next
+LBB_test_5:	# cond_next
+	movaps %xmm2, %xmm3
+LBB_test_6:	# cond_next
+	movss %xmm3, 40(%esp)
+	flds 40(%esp)
+	addl $44, %esp
+	ret
+
+Clearly it's unnecessary to clear %xmm3. It's also not clear why we are emitting
+three moves (movss, movaps, movss).
+
+//===---------------------------------------------------------------------===//
+
+External test Nurbs exposed some problems. Look for
+__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
+emits:
+
+        movaps    (%edx), %xmm2                                 #59.21
+        movaps    (%edx), %xmm5                                 #60.21
+        movaps    (%edx), %xmm4                                 #61.21
+        movaps    (%edx), %xmm3                                 #62.21
+        movl      40(%ecx), %ebp                                #69.49
+        shufps    $0, %xmm2, %xmm5                              #60.21
+        movl      100(%esp), %ebx                               #69.20
+        movl      (%ebx), %edi                                  #69.20
+        imull     %ebp, %edi                                    #69.49
+        addl      (%eax), %edi                                  #70.33
+        shufps    $85, %xmm2, %xmm4                             #61.21
+        shufps    $170, %xmm2, %xmm3                            #62.21
+        shufps    $255, %xmm2, %xmm2                            #63.21
+        lea       (%ebp,%ebp,2), %ebx                           #69.49
+        negl      %ebx                                          #69.49
+        lea       -3(%edi,%ebx), %ebx                           #70.33
+        shll      $4, %ebx                                      #68.37
+        addl      32(%ecx), %ebx                                #68.37
+        testb     $15, %bl                                      #91.13
+        jne       L_B1.24       # Prob 5%                       #91.13
+
+This is the llvm code after instruction scheduling:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+	%reg1078 = MOV32ri -3
+	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
+	%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
+	%reg1080 = IMUL32rr %reg1079, %reg1037
+	%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
+	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
+	%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
+	%reg1082 = SHL32ri %reg1038, 4
+	%reg1039 = ADD32rr %reg1036, %reg1082
+	%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
+	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
+	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
+	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
+	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
+	%reg1040 = MOV32rr %reg1039
+	%reg1084 = AND32ri8 %reg1039, 15
+	CMP32ri8 %reg1084, 0
+	JE mbb<cond_next204,0xa914d30>
+
+Still ok. After register allocation:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+	%EAX = MOV32ri -3
+	%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
+	ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
+	%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
+	%EDX = MOV32rm %EDX, 1, %NOREG, 40
+	IMUL32rr %EAX<def&use>, %EDX
+	%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
+	%ESI = MOV32rm %ESI, 1, %NOREG, 0
+	MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
+	%EAX = LEA32r %ESI, 1, %EAX, -3
+	%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
+	%ESI = MOV32rm %ESI, 1, %NOREG, 32
+	%EDI = MOV32rr %EAX
+	SHL32ri %EDI<def&use>, 4
+	ADD32rr %EDI<def&use>, %ESI
+	%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
+	%XMM1 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM1<def&use>, %XMM1, 170
+	%XMM2 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM2<def&use>, %XMM2, 0
+	%XMM3 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM3<def&use>, %XMM3, 255
+	SHUFPSrr %XMM0<def&use>, %XMM0, 85
+	%EBX = MOV32rr %EDI
+	AND32ri8 %EBX<def&use>, 15
+	CMP32ri8 %EBX, 0
+	JE mbb<cond_next204,0xa914d30>
+
+This looks really bad. The problem is shufps is a destructive opcode. Since it
+appears as operand two in more than one shufps ops. It resulted in a number of
+copies. Note icc also suffers from the same problem. Either the instruction
+selector should select pshufd or The register allocator can made the two-address
+to three-address transformation.
+
+It also exposes some other problems. See MOV32ri -3 and the spills.