Summary: The original heuristic to break critical edges during machine
sinking is relatively conservative: even when there is only one instruction
sinkable to the critical edge, the machine sink pass will likely not break
the edge. This leads to many speculative instructions executed at runtime.
However, with profile info we can model the benefit of splitting: if the
critical edge has a 50% taken rate, it is always beneficial to split it and
avoid the speculatively executed instructions. This patch uses the profile
to guide critical edge splitting in the machine sink pass.

The performance impact on speccpu2006 on Intel Sandy Bridge machines:

  spec/2006/fp/C++/444.namd          25.3   +0.26%
  spec/2006/fp/C++/447.dealII        45.96  -0.10%
  spec/2006/fp/C++/450.soplex        41.97  +1.49%
  spec/2006/fp/C++/453.povray        36.83  -0.96%
  spec/2006/fp/C/433.milc            23.81  +0.32%
  spec/2006/fp/C/470.lbm             41.17  +0.34%
  spec/2006/fp/C/482.sphinx3         48.13  +0.69%
  spec/2006/int/C++/471.omnetpp      22.45  +3.25%
  spec/2006/int/C++/473.astar        21.35  -2.06%
  spec/2006/int/C++/483.xalancbmk    36.02  -2.39%
  spec/2006/int/C/400.perlbench      33.7   -0.17%
  spec/2006/int/C/401.bzip2          22.9   +0.52%
  spec/2006/int/C/403.gcc            32.42  -0.54%
  spec/2006/int/C/429.mcf            39.59  +0.19%
  spec/2006/int/C/445.gobmk          26.98  -0.00%
  spec/2006/int/C/456.hmmer          24.52  -0.18%
  spec/2006/int/C/458.sjeng          28.26  +0.02%
  spec/2006/int/C/462.libquantum     55.44  +3.74%
  spec/2006/int/C/464.h264ref        46.67  -0.39%
  geometric mean                            +0.20%

Manually checked 473 and 471 to verify the diff is in the noise range.

Reviewers: rengolin, davidxl

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D24818

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@284757 91177308-0d34-0410-b5e6-96231b3b80d8
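
A rough sketch of the benefit model the summary describes, for illustration
only -- the names and the exact condition here are hypothetical, not the
actual MachineSink.cpp code:

  #include <cstdint>

  struct EdgeProfile {
    uint64_t ExecCount;   // executions of the edge's source block
    uint64_t TakenCount;  // executions in which the critical edge was taken
  };

  // Without splitting, an instruction that could sink across the edge runs
  // on every execution of the source block; ExecCount - TakenCount of those
  // runs are pure speculation. Splitting trades them for roughly one extra
  // jump on each taken execution.
  bool shouldSplitCriticalEdge(const EdgeProfile &P) {
    uint64_t SpeculativeRuns = P.ExecCount - P.TakenCount; // assumes ExecCount >= TakenCount
    uint64_t SplitCost = P.TakenCount;
    // At a 50% taken rate the two counts tie, and the summary argues the
    // split already wins there, so split on ties.
    return SpeculativeRuns >= SplitCost;
  }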

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi,+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=X32-CLZ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=X64-CLZ
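
; The four RUN lines cover both triples with and without BMI/LZCNT: the plain
; X32/X64 runs lower to bsf/bsr, while the -CLZ runs lower to tzcnt/lzcnt.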

declare i8 @llvm.cttz.i8(i8, i1)
declare i16 @llvm.cttz.i16(i16, i1)
declare i32 @llvm.cttz.i32(i32, i1)
declare i64 @llvm.cttz.i64(i64, i1)

declare i8 @llvm.ctlz.i8(i8, i1)
declare i16 @llvm.ctlz.i16(i16, i1)
declare i32 @llvm.ctlz.i32(i32, i1)
declare i64 @llvm.ctlz.i64(i64, i1)
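
; There is no byte-sized bsf/bsr/tzcnt/lzcnt, so the i8 cases below zero-extend
; the input and use the 32-bit instruction.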
define i8 @cttz_i8(i8 %x) {
; X32-LABEL: cttz_i8:
; X32: # BB#0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: bsfl %eax, %eax
; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-NEXT: retl
;
; X64-LABEL: cttz_i8:
; X64: # BB#0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: bsfl %eax, %eax
; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i8:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: tzcntl %eax, %eax
; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i8:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: movzbl %dil, %eax
; X64-CLZ-NEXT: tzcntl %eax, %eax
; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-CLZ-NEXT: retq
  %tmp = call i8 @llvm.cttz.i8( i8 %x, i1 true )
  ret i8 %tmp
}

define i16 @cttz_i16(i16 %x) {
; X32-LABEL: cttz_i16:
; X32: # BB#0:
; X32-NEXT: bsfw {{[0-9]+}}(%esp), %ax
; X32-NEXT: retl
;
; X64-LABEL: cttz_i16:
; X64: # BB#0:
; X64-NEXT: bsfw %di, %ax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i16:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: tzcntw {{[0-9]+}}(%esp), %ax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i16:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: tzcntw %di, %ax
; X64-CLZ-NEXT: retq
  %tmp = call i16 @llvm.cttz.i16( i16 %x, i1 true )
  ret i16 %tmp
}

define i32 @cttz_i32(i32 %x) {
; X32-LABEL: cttz_i32:
; X32: # BB#0:
; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: cttz_i32:
; X64: # BB#0:
; X64-NEXT: bsfl %edi, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i32:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i32:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: tzcntl %edi, %eax
; X64-CLZ-NEXT: retq
  %tmp = call i32 @llvm.cttz.i32( i32 %x, i1 true )
  ret i32 %tmp
}
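
; i686 has no 64-bit GPRs, so the X32 runs split the i64 count: branch on the
; half that decides the result, and add 32 when the count comes from the
; other half.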
define i64 @cttz_i64(i64 %x) {
; X32-LABEL: cttz_i64:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testl %eax, %eax
; X32-NEXT: jne .LBB3_1
; X32-NEXT: # BB#2:
; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl $32, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
; X32-NEXT: .LBB3_1:
; X32-NEXT: bsfl %eax, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
;
; X64-LABEL: cttz_i64:
; X64: # BB#0:
; X64-NEXT: bsfq %rdi, %rax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i64:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: testl %eax, %eax
; X32-CLZ-NEXT: jne .LBB3_1
; X32-CLZ-NEXT: # BB#2:
; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: addl $32, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
; X32-CLZ-NEXT: retl
; X32-CLZ-NEXT: .LBB3_1:
; X32-CLZ-NEXT: tzcntl %eax, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i64:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: tzcntq %rdi, %rax
; X64-CLZ-NEXT: retq
  %tmp = call i64 @llvm.cttz.i64( i64 %x, i1 true )
  ret i64 %tmp
}

define i8 @ctlz_i8(i8 %x) {
; X32-LABEL: ctlz_i8:
; X32: # BB#0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $7, %eax
; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i8:
; X64: # BB#0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: bsrl %eax, %eax
; X64-NEXT: xorl $7, %eax
; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i8:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: lzcntl %eax, %eax
; X32-CLZ-NEXT: addl $-24, %eax
; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i8:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: movzbl %dil, %eax
; X64-CLZ-NEXT: lzcntl %eax, %eax
; X64-CLZ-NEXT: addl $-24, %eax
; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-CLZ-NEXT: retq
  %tmp2 = call i8 @llvm.ctlz.i8( i8 %x, i1 true )
  ret i8 %tmp2
}

define i16 @ctlz_i16(i16 %x) {
; X32-LABEL: ctlz_i16:
; X32: # BB#0:
; X32-NEXT: bsrw {{[0-9]+}}(%esp), %ax
; X32-NEXT: xorl $15, %eax
; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i16:
; X64: # BB#0:
; X64-NEXT: bsrw %di, %ax
; X64-NEXT: xorl $15, %eax
; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i16:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i16:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: lzcntw %di, %ax
; X64-CLZ-NEXT: retq
  %tmp2 = call i16 @llvm.ctlz.i16( i16 %x, i1 true )
  ret i16 %tmp2
}

define i32 @ctlz_i32(i32 %x) {
; X32-LABEL: ctlz_i32:
; X32: # BB#0:
; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i32:
; X64: # BB#0:
; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i32:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i32:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: retq
  %tmp = call i32 @llvm.ctlz.i32( i32 %x, i1 true )
  ret i32 %tmp
}

define i64 @ctlz_i64(i64 %x) {
; X32-LABEL: ctlz_i64:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testl %eax, %eax
; X32-NEXT: jne .LBB7_1
; X32-NEXT: # BB#2:
; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: addl $32, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
; X32-NEXT: .LBB7_1:
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i64:
; X64: # BB#0:
; X64-NEXT: bsrq %rdi, %rax
; X64-NEXT: xorq $63, %rax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i64:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: testl %eax, %eax
; X32-CLZ-NEXT: jne .LBB7_1
; X32-CLZ-NEXT: # BB#2:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: addl $32, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
; X32-CLZ-NEXT: retl
; X32-CLZ-NEXT: .LBB7_1:
; X32-CLZ-NEXT: lzcntl %eax, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i64:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: lzcntq %rdi, %rax
; X64-CLZ-NEXT: retq
  %tmp = call i64 @llvm.ctlz.i64( i64 %x, i1 true )
  ret i64 %tmp
}

; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
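; lzcnt/tzcnt define the zero case (they return the operand width), so the
; -CLZ runs below stay branchless; for i8, the 32-bit lzcnt of the
; zero-extended byte is 24 more than the i8 ctlz, hence the addl $-24.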
define i8 @ctlz_i8_zero_test(i8 %n) {
; X32-LABEL: ctlz_i8_zero_test:
; X32: # BB#0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: testb %al, %al
; X32-NEXT: je .LBB8_1
; X32-NEXT: # BB#2: # %cond.false
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $7, %eax
; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-NEXT: retl
; X32-NEXT: .LBB8_1:
; X32-NEXT: movb $8, %al
; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i8_zero_test:
; X64: # BB#0:
; X64-NEXT: testb %dil, %dil
; X64-NEXT: je .LBB8_1
; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: bsrl %eax, %eax
; X64-NEXT: xorl $7, %eax
; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-NEXT: retq
; X64-NEXT: .LBB8_1:
; X64-NEXT: movb $8, %al
; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i8_zero_test:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: lzcntl %eax, %eax
; X32-CLZ-NEXT: addl $-24, %eax
; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i8_zero_test:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: movzbl %dil, %eax
; X64-CLZ-NEXT: lzcntl %eax, %eax
; X64-CLZ-NEXT: addl $-24, %eax
; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-CLZ-NEXT: retq
  %tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false)
  ret i8 %tmp1
}

; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i16 @ctlz_i16_zero_test(i16 %n) {
; X32-LABEL: ctlz_i16_zero_test:
; X32: # BB#0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testw %ax, %ax
; X32-NEXT: je .LBB9_1
; X32-NEXT: # BB#2: # %cond.false
; X32-NEXT: bsrw %ax, %ax
; X32-NEXT: xorl $15, %eax
; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X32-NEXT: retl
; X32-NEXT: .LBB9_1:
; X32-NEXT: movw $16, %ax
; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i16_zero_test:
; X64: # BB#0:
; X64-NEXT: testw %di, %di
; X64-NEXT: je .LBB9_1
; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: bsrw %di, %ax
; X64-NEXT: xorl $15, %eax
; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
; X64-NEXT: .LBB9_1:
; X64-NEXT: movw $16, %ax
; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i16_zero_test:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i16_zero_test:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: lzcntw %di, %ax
; X64-CLZ-NEXT: retq
  %tmp1 = call i16 @llvm.ctlz.i16(i16 %n, i1 false)
  ret i16 %tmp1
}

; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i32 @ctlz_i32_zero_test(i32 %n) {
; X32-LABEL: ctlz_i32_zero_test:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testl %eax, %eax
; X32-NEXT: je .LBB10_1
; X32-NEXT: # BB#2: # %cond.false
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: retl
; X32-NEXT: .LBB10_1:
; X32-NEXT: movl $32, %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i32_zero_test:
; X64: # BB#0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB10_1
; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB10_1:
; X64-NEXT: movl $32, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i32_zero_test:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i32_zero_test:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: retq
  %tmp1 = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
  ret i32 %tmp1
}

; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i64 @ctlz_i64_zero_test(i64 %n) {
; X32-LABEL: ctlz_i64_zero_test:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: bsrl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl $63, %eax
; X32-NEXT: je .LBB11_2
; X32-NEXT: # BB#1:
; X32-NEXT: movl %edx, %eax
; X32-NEXT: .LBB11_2:
; X32-NEXT: testl %ecx, %ecx
; X32-NEXT: jne .LBB11_3
; X32-NEXT: # BB#4:
; X32-NEXT: xorl $31, %eax
; X32-NEXT: addl $32, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
; X32-NEXT: .LBB11_3:
; X32-NEXT: bsrl %ecx, %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i64_zero_test:
; X64: # BB#0:
; X64-NEXT: testq %rdi, %rdi
; X64-NEXT: je .LBB11_1
; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: bsrq %rdi, %rax
; X64-NEXT: xorq $63, %rax
; X64-NEXT: retq
; X64-NEXT: .LBB11_1:
; X64-NEXT: movl $64, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i64_zero_test:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: testl %eax, %eax
; X32-CLZ-NEXT: jne .LBB11_1
; X32-CLZ-NEXT: # BB#2:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: addl $32, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
; X32-CLZ-NEXT: retl
; X32-CLZ-NEXT: .LBB11_1:
; X32-CLZ-NEXT: lzcntl %eax, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i64_zero_test:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: lzcntq %rdi, %rax
; X64-CLZ-NEXT: retq
  %tmp1 = call i64 @llvm.ctlz.i64(i64 %n, i1 false)
  ret i64 %tmp1
}

; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
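; The -CLZ runs below avoid the branch differently: orl $256 sets bit 8, so
; tzcnt stops there and yields 8 for a zero byte, matching cttz.i8(0) = 8.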
define i8 @cttz_i8_zero_test(i8 %n) {
; X32-LABEL: cttz_i8_zero_test:
; X32: # BB#0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: testb %al, %al
; X32-NEXT: je .LBB12_1
; X32-NEXT: # BB#2: # %cond.false
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: bsfl %eax, %eax
; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-NEXT: retl
; X32-NEXT: .LBB12_1:
; X32-NEXT: movb $8, %al
; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-NEXT: retl
;
; X64-LABEL: cttz_i8_zero_test:
; X64: # BB#0:
; X64-NEXT: testb %dil, %dil
; X64-NEXT: je .LBB12_1
; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: bsfl %eax, %eax
; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-NEXT: retq
; X64-NEXT: .LBB12_1:
; X64-NEXT: movb $8, %al
; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i8_zero_test:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: orl $256, %eax # imm = 0x100
; X32-CLZ-NEXT: tzcntl %eax, %eax
; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i8_zero_test:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: movzbl %dil, %eax
; X64-CLZ-NEXT: orl $256, %eax # imm = 0x100
; X64-CLZ-NEXT: tzcntl %eax, %eax
; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-CLZ-NEXT: retq
  %tmp1 = call i8 @llvm.cttz.i8(i8 %n, i1 false)
  ret i8 %tmp1
}

; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i16 @cttz_i16_zero_test(i16 %n) {
; X32-LABEL: cttz_i16_zero_test:
; X32: # BB#0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testw %ax, %ax
; X32-NEXT: je .LBB13_1
; X32-NEXT: # BB#2: # %cond.false
; X32-NEXT: bsfw %ax, %ax
; X32-NEXT: retl
; X32-NEXT: .LBB13_1:
; X32-NEXT: movw $16, %ax
; X32-NEXT: retl
;
; X64-LABEL: cttz_i16_zero_test:
; X64: # BB#0:
; X64-NEXT: testw %di, %di
; X64-NEXT: je .LBB13_1
; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: bsfw %di, %ax
; X64-NEXT: retq
; X64-NEXT: .LBB13_1:
; X64-NEXT: movw $16, %ax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i16_zero_test:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: tzcntw {{[0-9]+}}(%esp), %ax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i16_zero_test:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: tzcntw %di, %ax
; X64-CLZ-NEXT: retq
  %tmp1 = call i16 @llvm.cttz.i16(i16 %n, i1 false)
  ret i16 %tmp1
}

; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i32 @cttz_i32_zero_test(i32 %n) {
; X32-LABEL: cttz_i32_zero_test:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testl %eax, %eax
; X32-NEXT: je .LBB14_1
; X32-NEXT: # BB#2: # %cond.false
; X32-NEXT: bsfl %eax, %eax
; X32-NEXT: retl
; X32-NEXT: .LBB14_1:
; X32-NEXT: movl $32, %eax
; X32-NEXT: retl
;
; X64-LABEL: cttz_i32_zero_test:
; X64: # BB#0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB14_1
; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: bsfl %edi, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB14_1:
; X64-NEXT: movl $32, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i32_zero_test:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i32_zero_test:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: tzcntl %edi, %eax
; X64-CLZ-NEXT: retq
  %tmp1 = call i32 @llvm.cttz.i32(i32 %n, i1 false)
  ret i32 %tmp1
}

; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i64 @cttz_i64_zero_test(i64 %n) {
; X32-LABEL: cttz_i64_zero_test:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: bsfl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl $32, %eax
; X32-NEXT: je .LBB15_2
; X32-NEXT: # BB#1:
; X32-NEXT: movl %edx, %eax
; X32-NEXT: .LBB15_2:
; X32-NEXT: testl %ecx, %ecx
; X32-NEXT: jne .LBB15_3
; X32-NEXT: # BB#4:
; X32-NEXT: addl $32, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
; X32-NEXT: .LBB15_3:
; X32-NEXT: bsfl %ecx, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
;
; X64-LABEL: cttz_i64_zero_test:
; X64: # BB#0:
; X64-NEXT: testq %rdi, %rdi
; X64-NEXT: je .LBB15_1
; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: bsfq %rdi, %rax
; X64-NEXT: retq
; X64-NEXT: .LBB15_1:
; X64-NEXT: movl $64, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i64_zero_test:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: testl %eax, %eax
; X32-CLZ-NEXT: jne .LBB15_1
; X32-CLZ-NEXT: # BB#2:
; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: addl $32, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
; X32-CLZ-NEXT: retl
; X32-CLZ-NEXT: .LBB15_1:
; X32-CLZ-NEXT: tzcntl %eax, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i64_zero_test:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: tzcntq %rdi, %rax
; X64-CLZ-NEXT: retq
  %tmp1 = call i64 @llvm.cttz.i64(i64 %n, i1 false)
  ret i64 %tmp1
}

; Don't generate the cmovne when the source is known non-zero (and bsr would
; not set ZF).
; rdar://9490949
; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
; codegen doesn't know how to delete the movl and je.
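; The 'or 1' below makes the input provably non-zero, so the zero branch is
; dead; note the -CLZ runs are already branchless.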
define i32 @ctlz_i32_fold_cmov(i32 %n) {
; X32-LABEL: ctlz_i32_fold_cmov:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: orl $1, %eax
; X32-NEXT: je .LBB16_1
; X32-NEXT: # BB#2: # %cond.false
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: retl
; X32-NEXT: .LBB16_1:
; X32-NEXT: movl $32, %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i32_fold_cmov:
; X64: # BB#0:
; X64-NEXT: orl $1, %edi
; X64-NEXT: je .LBB16_1
; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB16_1:
; X64-NEXT: movl $32, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i32_fold_cmov:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: orl $1, %eax
; X32-CLZ-NEXT: lzcntl %eax, %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i32_fold_cmov:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: orl $1, %edi
; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: retq
  %or = or i32 %n, 1
  %tmp1 = call i32 @llvm.ctlz.i32(i32 %or, i1 false)
  ret i32 %tmp1
}

; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
; the most significant bit, which is what 'bsr' does natively.
; FIXME: We should probably select BSR instead of LZCNT in these circumstances.
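; For non-zero inputs, bsr returns the index of the most significant set bit,
; i.e. 31 - ctlz, and 'ctlz ^ 31' is the same value, so it folds to a bare
; bsr; the -CLZ runs still need the explicit xor after lzcnt.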
define i32 @ctlz_bsr(i32 %n) {
; X32-LABEL: ctlz_bsr:
; X32: # BB#0:
; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_bsr:
; X64: # BB#0:
; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_bsr:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: xorl $31, %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_bsr:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: xorl $31, %eax
; X64-CLZ-NEXT: retq
  %ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
  %bsr = xor i32 %ctlz, 31
  ret i32 %bsr
}

; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
; codegen doesn't know how to combine the $32 and $31 into $63.
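; (On the zero path the result is 32 ^ 31 = 63, so the movl $32/xorl $31 pair
; could fold to a single movl $63.)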
define i32 @ctlz_bsr_zero_test(i32 %n) {
; X32-LABEL: ctlz_bsr_zero_test:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testl %eax, %eax
; X32-NEXT: je .LBB18_1
; X32-NEXT: # BB#2: # %cond.false
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: retl
; X32-NEXT: .LBB18_1:
; X32-NEXT: movl $32, %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_bsr_zero_test:
; X64: # BB#0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB18_1
; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB18_1:
; X64-NEXT: movl $32, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_bsr_zero_test:
; X32-CLZ: # BB#0:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: xorl $31, %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_bsr_zero_test:
; X64-CLZ: # BB#0:
; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: xorl $31, %eax
; X64-CLZ-NEXT: retq
  %ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
  %bsr = xor i32 %ctlz, 31
  ret i32 %bsr
}