From a2e7efa6d3118ed58395578e4d871b2c415dff11 Mon Sep 17 00:00:00 2001
From: Eli Friedman
Date: Thu, 21 Feb 2008 21:16:49 +0000
Subject: [PATCH] A few minor updates, removing implemented stuff and adding a
 couple of new things.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@47458 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/README.txt | 193 ++++++++++++++++++++++----------------
 1 file changed, 110 insertions(+), 83 deletions(-)

diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 736e299ad80..e140c147c61 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -54,6 +54,17 @@ One better solution for 1LL << x is:
 
 But that requires good 8-bit subreg support.
 
+Also, this might be better.  It's an extra shift, but it's one instruction
+shorter, and doesn't stress 8-bit subreg support.
+(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
+but without the unnecessary and.)
+        movl %ecx, %eax
+        shrl $5, %eax
+        movl %eax, %edx
+        xorl $1, %edx
+        sall %cl, %eax
+        sall %cl, %edx
+
 64-bit shifts (in general) expand to really bad code.  Instead of using
 cmovs, we should expand to a conditional branch like GCC produces.
 
@@ -67,6 +78,9 @@ into:
 
        xorl $1, %eax
        ret
 
+(Although note that this isn't a legal way to express the code that llvm-gcc
+currently generates for that function.)
+
 //===---------------------------------------------------------------------===//
 
 Some isel ideas:
@@ -94,34 +108,6 @@ the coalescer how to deal with it though.
 
 //===---------------------------------------------------------------------===//
 
-Count leading zeros and count trailing zeros:
-
-int clz(int X) { return __builtin_clz(X); }
-int ctz(int X) { return __builtin_ctz(X); }
-
-$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
-clz:
-        bsr %eax, DWORD PTR [%esp+4]
-        xor %eax, 31
-        ret
-ctz:
-        bsf %eax, DWORD PTR [%esp+4]
-        ret
-
-however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
-aren't.
-
-Another example (use predsimplify to eliminate a select):
-
-int foo (unsigned long j) {
-  if (j)
-    return __builtin_ffs (j) - 1;
-  else
-    return 0;
-}
-
-//===---------------------------------------------------------------------===//
-
 It appears icc uses push for parameter passing. Need to investigate.
 
 //===---------------------------------------------------------------------===//
@@ -236,32 +222,6 @@ which is probably slower, but it's interesting at least :)
 
 //===---------------------------------------------------------------------===//
 
-The first BB of this code:
-
-declare bool %foo()
-int %bar() {
-        %V = call bool %foo()
-        br bool %V, label %T, label %F
-T:
-        ret int 1
-F:
-        call bool %foo()
-        ret int 12
-}
-
-compiles to:
-
-_bar:
-        subl $12, %esp
-        call L_foo$stub
-        xorb $1, %al
-        testb %al, %al
-        jne LBB_bar_2   # F
-
-It would be better to emit "cmp %al, 1" than a xor and test.
-
-//===---------------------------------------------------------------------===//
-
 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
 We should leave these as libcalls for everything over a much lower threshold,
 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
@@ -483,19 +443,24 @@ shorter than movl + leal.
 
 //===---------------------------------------------------------------------===//
 
-Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
+__builtin_ffs codegen is messy.
 
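+For reference, a minimal C sketch of the contract being lowered (ffs_ref is
+a hypothetical name, not something either compiler emits): ffs(0) must
+return 0, and otherwise the result is one plus the index of the lowest set
+bit, which is why the lowering needs a select.
+
+/* Hedged sketch of __builtin_ffs semantics. */
+int ffs_ref(unsigned X) {
+  if (X == 0)
+    return 0;              /* ffs(0) is defined to be 0 */
+  int n = 1;               /* the result is 1-based */
+  while ((X & 1u) == 0) {  /* scan for the lowest set bit, as bsf does */
+    X >>= 1;
+    ++n;
+  }
+  return n;
+}
+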
-int ctz_(unsigned X) { return __builtin_ctz(X); }
-int clz_(unsigned X) { return __builtin_clz(X); }
 int ffs_(unsigned X) { return __builtin_ffs(X); }
 
-_ctz_:
-        bsfl 4(%esp), %eax
-        ret
-_clz_:
-        bsrl 4(%esp), %eax
-        xorl $31, %eax
+llvm produces:
+ffs_:
+        movl 4(%esp), %ecx
+        bsfl %ecx, %eax
+        movl $32, %edx
+        cmove %edx, %eax
+        incl %eax
+        xorl %edx, %edx
+        testl %ecx, %ecx
+        cmove %edx, %eax
         ret
+
+vs gcc:
+
 _ffs_:
         movl $-1, %edx
         bsfl 4(%esp), %eax
@@ -503,6 +468,15 @@ _ffs_:
         addl $1, %eax
         ret
 
+Another example of __builtin_ffs (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+  if (j)
+    return __builtin_ffs (j) - 1;
+  else
+    return 0;
+}
+
 //===---------------------------------------------------------------------===//
 
 It appears gcc places string data with linkonce linkage in
@@ -1062,6 +1036,8 @@ Should compile to:
         setae   %al
         ret
 
+FIXME: That code looks wrong; bool return is normally defined as zext.
+
 on x86-64, not:
 
 __Z11no_overflowjj:
@@ -1208,35 +1184,44 @@ void compare (long long foo) {
 
 to:
 
-_compare:
-        subl $12, %esp
-        cmpl $0, 16(%esp)
+compare:
+        subl $4, %esp
+        cmpl $0, 8(%esp)
         setne %al
         movzbw %al, %ax
-        cmpl $1, 20(%esp)
+        cmpl $1, 12(%esp)
         setg %cl
         movzbw %cl, %cx
         cmove %ax, %cx
-        movw %cx, %ax
-        testb $1, %al
-        je LBB1_2 # cond_true
+        testb $1, %cl
+        jne .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+        call abort
+.LBB1_2: # UnifiedReturnBlock
+        addl $4, %esp
+        ret
 
 (also really horrible code on ppc).  This is due to the expand code for 64-bit
 compares.  GCC produces multiple branches, which is much nicer:
 
-_compare:
-        pushl %ebp
-        movl %esp, %ebp
-        subl $8, %esp
-        movl 8(%ebp), %eax
-        movl 12(%ebp), %edx
-        subl $1, %edx
-        jg L5
-L7:
-        jl L4
+compare:
+        subl $12, %esp
+        movl 20(%esp), %edx
+        movl 16(%esp), %eax
+        decl %edx
+        jle .L7
+.L5:
+        addl $12, %esp
+        ret
+        .p2align 4,,7
+.L7:
+        jl .L4
         cmpl $0, %eax
-        jbe L4
-L5:
+        .p2align 4,,8
+        ja .L5
+.L4:
+        .p2align 4,,9
+        call abort
 
 //===---------------------------------------------------------------------===//
@@ -1380,7 +1365,7 @@ Should compile into:
 
 _foo:
         movzwl  4(%esp), %eax
-        orb     $-1, %al           ;; 'orl 255' is also fine :)
+        orl     $255, %eax
         ret
 
 instead of:
@@ -1550,6 +1535,48 @@ See PR2053 for more details.
 
 //===---------------------------------------------------------------------===//
 
+We should investigate using cdq/cltd (effect: edx = sar eax, 31)
+more aggressively; it should cost the same as a move+shift on any modern
+processor, but it's a lot shorter.  Downside is that it puts more
+pressure on register allocation because it has fixed operands.
+
+Example:
+int abs(int x) {return x < 0 ? -x : x;}
+
+gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
+abs:
+        movl    4(%esp), %eax
+        cltd
+        xorl    %edx, %eax
+        subl    %edx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+#include <stdint.h>
+uint64_t a;
+uint16_t b;
+uint64_t mul(void) {
+  return a * b;
+}
+
+Currently, we generate the following:
+
+mul:
+        movzwl  b, %ecx
+        movl    %ecx, %eax
+        mull    a
+        imull   a+4, %ecx
+        addl    %edx, %ecx
+        movl    %ecx, %edx
+        ret
+
+llvm should be able to commute the addl so that the movl isn't necessary.
+
+//===---------------------------------------------------------------------===//
+
 Consider:
 
 int test(unsigned long a, unsigned long b) { return -(a < b); }
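+
+A hedged C sketch of why the -(a < b) form above is interesting (mask_lt is
+a hypothetical name): the comparison yields 0 or 1, so the negation yields
+0 or an all-ones mask, and x86 can materialize that directly with cmp
+followed by sbb, since cmp sets CF exactly when a < b (unsigned) and
+sbb %eax, %eax computes -CF.
+
+/* Hypothetical reference for the entry above. */
+unsigned long mask_lt(unsigned long a, unsigned long b) {
+  /* 0 if a >= b, ~0UL if a < b; maps to: cmpl; sbbl %eax, %eax */
+  return -(unsigned long)(a < b);
+}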
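+
+Similarly, for the cdq/cltd entry above, a hedged C rendering of gcc's
+sign-mask abs (abs_ref is a hypothetical name; x >> 31 assumes the usual
+arithmetic shift for signed int, which is exactly what cltd computes
+into %edx):
+
+/* Hypothetical reference for the cltd-based abs above. */
+int abs_ref(int x) {
+  int m = x >> 31;     /* sign mask: 0 or -1; cltd materializes this */
+  return (x ^ m) - m;  /* matches: xorl %edx, %eax; subl %edx, %eax */
+}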