llvm/test/CodeGen/X86/2008-08-05-SpillerBug.ll
Evan Cheng 206d1856ad Added a linearscan register allocation optimization. When the register allocator spills an interval with multiple uses in the same basic block, it creates a different virtual register for each of the reloads. e.g.
%reg1498<def> = MOV32rm %reg1024, 1, %reg0, 12, %reg0, Mem:LD(4,4) [sunkaddr39 + 0]
        %reg1506<def> = MOV32rm %reg1024, 1, %reg0, 8, %reg0, Mem:LD(4,4) [sunkaddr42 + 0]
        %reg1486<def> = MOV32rr %reg1506
        %reg1486<def> = XOR32rr %reg1486, %reg1498, %EFLAGS<imp-def,dead>
        %reg1510<def> = MOV32rm %reg1024, 1, %reg0, 4, %reg0, Mem:LD(4,4) [sunkaddr45 + 0]

=>

        %reg1498<def> = MOV32rm %reg2036, 1, %reg0, 12, %reg0, Mem:LD(4,4) [sunkaddr39 + 0]
        %reg1506<def> = MOV32rm %reg2037, 1, %reg0, 8, %reg0, Mem:LD(4,4) [sunkaddr42 + 0]
        %reg1486<def> = MOV32rr %reg1506
        %reg1486<def> = XOR32rr %reg1486, %reg1498, %EFLAGS<imp-def,dead>
        %reg1510<def> = MOV32rm %reg2038, 1, %reg0, 4, %reg0, Mem:LD(4,4) [sunkaddr45 + 0]

From linearscan's point of view, reg2036, reg2037, and reg2038 are separate registers, each of which is "killed" after a single use. The reloaded register is then available, and it is often clobbered right away. For example, in this case reg1498 is allocated EAX while reg2036 is allocated RAX. This means we end up with multiple reloads from the same stack slot in the same basic block.

Now linearscan recognizes that there are other reloads from the same stack slot (SS) in the same basic block (BB). So it will "downgrade" RAX (and its aliases) after reg2036 is allocated until the next reload (reg2037) is done. This greatly increases the likelihood that reloads from the SS are reused.

This speeds up sha1 from OpenSSL by 5.8%. It is also an across the board win for SPEC2000 and 2006.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@69585 91177308-0d34-0410-b5e6-96231b3b80d8
2009-04-20 08:01:12 +00:00

45 lines
1.7 KiB
LLVM

; Regression test driver: the run line below assembles this file with llvm-as,
; compiles it with llc for i386-apple-darwin (frame-pointer elimination
; disabled, statistics enabled), and passes only if the "asm-printer" line of
; the statistics output contains 56.
; NOTE(review): 56 is presumably the number of machine instructions emitted;
; confirm against llc's -stats output before changing it.
; RUN: llvm-as < %s | llc -mtriple=i386-apple-darwin -disable-fp-elim -stats |& grep asm-printer | grep 56
; PR2568
; External globals read (and, for @g_5, written) by the function under test.
@g_3 = external global i16 ; <i16*> [#uses=1]
@g_5 = external global i32 ; <i32*> [#uses=3]
; External callee; it is invoked twice by the function under test.
declare i32 @func_15(i16 signext , i16 signext , i32) nounwind
; Function under test. Takes a single i8 argument (%p_11), returns void, and
; stores its final result to @g_5.
; Unnamed instructions are numbered implicitly in order (%0, %1, ...); the
; trailing "<type>:N" comments record those numbers, so instructions must not
; be added, removed, or reordered without renumbering every use.
; NOTE(review): the block names (newFuncRoot, *.exitStub) suggest this function
; was produced by an automated code extractor/reducer - confirm.
define void @func_9_entry_2E_ce(i8 %p_11) nounwind {
newFuncRoot:
br label %entry.ce
; Sole exit block: reached only from %entry.ce.
entry.ce.ret.exitStub: ; preds = %entry.ce
ret void
entry.ce: ; preds = %newFuncRoot
; %2 = (g_3 > 0, signed) widened to i32.
load i16* @g_3, align 2 ; <i16>:0 [#uses=1]
icmp sgt i16 %0, 0 ; <i1>:1 [#uses=1]
zext i1 %1 to i32 ; <i32>:2 [#uses=1]
; %3 = first load of g_5; %5 = (%2 > %3, unsigned) widened to i32.
load i32* @g_5, align 4 ; <i32>:3 [#uses=4]
icmp ugt i32 %2, %3 ; <i1>:4 [#uses=1]
zext i1 %4 to i32 ; <i32>:5 [#uses=1]
; Substitute 1 for a zero divisor, then %7 = 1 urem (g_5 or 1).
icmp eq i32 %3, 0 ; <i1>:6 [#uses=1]
%.0 = select i1 %6, i32 1, i32 %3 ; <i32> [#uses=1]
urem i32 1, %.0 ; <i32>:7 [#uses=2]
; First call: func_15(sext(p_11), trunc(g_5), 1); the result is unused.
sext i8 %p_11 to i16 ; <i16>:8 [#uses=1]
trunc i32 %3 to i16 ; <i16>:9 [#uses=1]
tail call i32 @func_15( i16 signext %8, i16 signext %9, i32 1 ) nounwind ; <i32>:10 [#uses=0]
; g_5 is loaded again after the call rather than reusing %3.
; Second call: func_15(trunc(g_5), 1, %7); the result is unused.
load i32* @g_5, align 4 ; <i32>:11 [#uses=1]
trunc i32 %11 to i16 ; <i16>:12 [#uses=1]
tail call i32 @func_15( i16 signext %12, i16 signext 1, i32 %7 ) nounwind ; <i32>:13 [#uses=0]
; %p_11.lobit = top (sign) bit of p_11; %tmp.not is its inverse, i.e. 1 when
; that bit is clear and 0 when it is set. %.015 = sext(p_11) ashr %tmp.not.
sext i8 %p_11 to i32 ; <i32>:14 [#uses=1]
%p_11.lobit = lshr i8 %p_11, 7 ; <i8> [#uses=1]
%tmp = zext i8 %p_11.lobit to i32 ; <i32> [#uses=1]
%tmp.not = xor i32 %tmp, 1 ; <i32> [#uses=1]
%.015 = ashr i32 %14, %tmp.not ; <i32> [#uses=2]
; Substitute 1 for a zero divisor, then %16 = %7 udiv (%.015 or 1).
icmp eq i32 %.015, 0 ; <i1>:15 [#uses=1]
%.016 = select i1 %15, i32 1, i32 %.015 ; <i32> [#uses=1]
udiv i32 %7, %.016 ; <i32>:16 [#uses=1]
; Final result: (%5 < %16, unsigned) widened to i32 and stored to g_5.
icmp ult i32 %5, %16 ; <i1>:17 [#uses=1]
zext i1 %17 to i32 ; <i32>:18 [#uses=1]
store i32 %18, i32* @g_5, align 4
br label %entry.ce.ret.exitStub
}