From 5ef7ab8d718b71aaf86fd267a3428fe9ea8cd6d1 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 4 Apr 2016 21:02:46 +0000 Subject: [PATCH] Re-commit r265039 "[X86] Merge adjacent stack adjustments in eliminateCallFramePseudoInstr (PR27140)" The original commit miscompiled things on 32-bit Windows, e.g. a Clang boostrap. It turns out that mergeSPUpdates() was a bit too generous in what it interpreted as a stack adjustment, causing the following code: addl $12, %esp leal -4(%ebp), %esp To be "optimized" into simply: addl $8, %esp This commit tightens up mergeSPUpdates() and includes a new test (test14 in movtopush.ll) for this situation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@265345 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86FrameLowering.cpp | 39 ++++++++---- test/CodeGen/X86/2006-05-02-InstrSched1.ll | 2 +- test/CodeGen/X86/fold-push.ll | 2 +- test/CodeGen/X86/force-align-stack-alloca.ll | 14 +++-- test/CodeGen/X86/localescape.ll | 3 +- test/CodeGen/X86/memset-2.ll | 2 +- test/CodeGen/X86/movtopush.ll | 64 ++++++++++++++++++-- test/CodeGen/X86/push-cfi-debug.ll | 4 +- test/CodeGen/X86/push-cfi.ll | 4 +- 9 files changed, 103 insertions(+), 31 deletions(-) diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 89fa44142a4..c9b29b59c95 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -378,11 +378,16 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && PI->getOperand(0).getReg() == StackPtr){ + assert(PI->getOperand(1).getReg() == StackPtr); Offset += PI->getOperand(2).getImm(); MBB.erase(PI); if (!doMergeWithPrevious) MBBI = NI; } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) && - PI->getOperand(0).getReg() == StackPtr) { + PI->getOperand(0).getReg() == StackPtr && + PI->getOperand(1).getReg() == StackPtr && + PI->getOperand(2).getImm() == 1 && + PI->getOperand(3).getReg() == X86::NoRegister && + PI->getOperand(5).getReg() == X86::NoRegister) { // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg. Offset += PI->getOperand(4).getImm(); MBB.erase(PI); @@ -390,6 +395,7 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && PI->getOperand(0).getReg() == StackPtr) { + assert(PI->getOperand(1).getReg() == StackPtr); Offset -= PI->getOperand(2).getImm(); MBB.erase(PI); if (!doMergeWithPrevious) MBBI = NI; @@ -2533,13 +2539,22 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); - if (Amount) { - // Add Amount to SP to destroy a frame, and subtract to setup. - int Offset = isDestroy ? Amount : -Amount; + // Add Amount to SP to destroy a frame, or subtract to setup. + int64_t StackAdjustment = isDestroy ? Amount : -Amount; - if (!(Fn->optForMinSize() && - adjustStackWithPops(MBB, I, DL, Offset))) - BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); + if (StackAdjustment) { + // Merge with any previous or following adjustment instruction. + StackAdjustment += mergeSPUpdates(MBB, I, true); + StackAdjustment += mergeSPUpdates(MBB, I, false); + + if (!StackAdjustment) { + // This and the merged instruction canceled out each other. + return I; + } + + if (!(Fn->optForMinSize() && + adjustStackWithPops(MBB, I, DL, StackAdjustment))) + BuildStackAdjustment(MBB, I, DL, StackAdjustment, /*InEpilogue=*/false); } if (DwarfCFI && !hasFP(MF)) { @@ -2549,14 +2564,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // CFI only for EH purposes or for debugging. EH only requires the CFA // offset to be correct at each call site, while for debugging we want // it to be more precise. - int CFAOffset = Amount; + // TODO: When not using precise CFA, we also need to adjust for the // InternalAmt here. - - if (CFAOffset) { - CFAOffset = isDestroy ? -CFAOffset : CFAOffset; - BuildCFI(MBB, I, DL, - MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); + if (StackAdjustment) { + BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset( + nullptr, -StackAdjustment)); } } diff --git a/test/CodeGen/X86/2006-05-02-InstrSched1.ll b/test/CodeGen/X86/2006-05-02-InstrSched1.ll index eae0ec21c09..acd32e49e60 100644 --- a/test/CodeGen/X86/2006-05-02-InstrSched1.ll +++ b/test/CodeGen/X86/2006-05-02-InstrSched1.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts ; RUN: llc < %s -mtriple=i686-unknown-linux -relocation-model=static -stats 2>&1 | \ -; RUN: grep asm-printer | grep 15 +; RUN: grep asm-printer | grep 14 ; ; It's possible to schedule this in 14 instructions by avoiding ; callee-save registers, but the scheduler isn't currently that diff --git a/test/CodeGen/X86/fold-push.ll b/test/CodeGen/X86/fold-push.ll index eaf91351021..9d3afd1c449 100644 --- a/test/CodeGen/X86/fold-push.ll +++ b/test/CodeGen/X86/fold-push.ll @@ -14,7 +14,7 @@ define void @test(i32 %a, i32 %b) optsize nounwind { ; SLM: movl (%esp), [[RELOAD:%e..]] ; SLM-NEXT: pushl [[RELOAD]] ; CHECK: calll -; CHECK-NEXT: addl $4, %esp +; CHECK-NEXT: addl $8, %esp %c = add i32 %a, %b call void @foo(i32 %c) call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di}"() diff --git a/test/CodeGen/X86/force-align-stack-alloca.ll b/test/CodeGen/X86/force-align-stack-alloca.ll index d0cf3417008..8d42680e199 100644 --- a/test/CodeGen/X86/force-align-stack-alloca.ll +++ b/test/CodeGen/X86/force-align-stack-alloca.ll @@ -32,15 +32,21 @@ define i64 @g(i32 %i) nounwind { ; CHECK: movl %{{...}}, %esp ; CHECK-NOT: {{[^ ,]*}}, %esp ; -; Next we set up the memset call, and then undo it. +; Next we set up the memset call. ; CHECK: subl $20, %esp ; CHECK-NOT: {{[^ ,]*}}, %esp +; CHECK: pushl +; CHECK: pushl +; CHECK: pushl ; CHECK: calll memset -; CHECK-NEXT: addl $32, %esp +; +; Deallocating 32 bytes of outgoing call frame for memset and +; allocating 28 bytes for calling f yields a 4-byte adjustment: +; CHECK-NEXT: addl $4, %esp ; CHECK-NOT: {{[^ ,]*}}, %esp ; -; Next we set up the call to 'f'. -; CHECK: subl $28, %esp +; And move on to call 'f', and then restore the stack. +; CHECK: pushl ; CHECK-NOT: {{[^ ,]*}}, %esp ; CHECK: calll f ; CHECK-NEXT: addl $32, %esp diff --git a/test/CodeGen/X86/localescape.ll b/test/CodeGen/X86/localescape.ll index 07c3b7f4a35..10ab8dd9672 100644 --- a/test/CodeGen/X86/localescape.ll +++ b/test/CodeGen/X86/localescape.ll @@ -137,6 +137,5 @@ define void @alloc_func_no_frameaddr() { ; X86: movl $13, (%esp) ; X86: pushl $0 ; X86: calll _print_framealloc_from_fp -; X86: addl $4, %esp -; X86: addl $8, %esp +; X86: addl $12, %esp ; X86: retl diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll index 7f37b62a28b..e9a7b566b1d 100644 --- a/test/CodeGen/X86/memset-2.ll +++ b/test/CodeGen/X86/memset-2.ll @@ -5,7 +5,7 @@ declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind define fastcc void @t1() nounwind { ; CHECK-LABEL: t1: -; CHECK: subl $12, %esp +; CHECK: subl $16, %esp ; CHECK: pushl $188 ; CHECK-NEXT: pushl $0 ; CHECK-NEXT: pushl $0 diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll index 5dd465ef5a3..59bc214f66c 100644 --- a/test/CodeGen/X86/movtopush.ll +++ b/test/CodeGen/X86/movtopush.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=i686-windows -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH ; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64 ; RUN: llc < %s -mtriple=i686-windows -stackrealign -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED +; RUN: llc < %s -mtriple=i686-pc-linux | FileCheck %s -check-prefix=LINUX %class.Class = type { i32 } %struct.s = type { i64 } @@ -12,6 +13,10 @@ declare x86_thiscallcc void @thiscall(%class.Class* %class, i32 %a, i32 %b, i32 declare void @oneparam(i32 %a) declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) declare void @struct(%struct.s* byval %a, i32 %b, i32 %c, i32 %d) +declare void @inalloca(<{ %struct.s }>* inalloca) + +declare i8* @llvm.stacksave() +declare void @llvm.stackrestore(i8*) ; We should get pushes for x86, even though there is a reserved call frame. ; Make sure we don't touch x86-64, and that turning it off works. @@ -223,8 +228,7 @@ entry: ; NORMAL-NEXT: pushl $2 ; NORMAL-NEXT: pushl $1 ; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -; NORMAL-NEXT: subl $20, %esp +; NORMAL-NEXT: subl $4, %esp ; NORMAL-NEXT: movl 20(%esp), [[E1:%e..]] ; NORMAL-NEXT: movl 24(%esp), [[E2:%e..]] ; NORMAL-NEXT: movl [[E2]], 4(%esp) @@ -261,7 +265,7 @@ entry: ; NORMAL-NEXT: pushl $2 ; NORMAL-NEXT: pushl $1 ; NORMAL-NEXT: calll *16(%esp) -; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: addl $24, %esp define void @test10() optsize { %stack_fptr = alloca void (i32, i32, i32, i32)* store void (i32, i32, i32, i32)* @good, void (i32, i32, i32, i32)** %stack_fptr @@ -314,8 +318,7 @@ entry: ; NORMAL-NEXT: pushl $2 ; NORMAL-NEXT: pushl $1 ; NORMAL-NEXT: calll _good -; NORMAL-NEXT: addl $16, %esp -; NORMAL-NEXT: subl $20, %esp +; NORMAL-NEXT: subl $4, %esp ; NORMAL: movl $8, 16(%esp) ; NORMAL-NEXT: movl $7, 12(%esp) ; NORMAL-NEXT: movl $6, 8(%esp) @@ -358,3 +361,54 @@ entry: call void @good(i32 %val1, i32 %val2, i32 %val3, i32 %add) ret i32* %ptr3 } + +; Make sure to fold adjacent stack adjustments. +; LINUX-LABEL: pr27140: +; LINUX: subl $12, %esp +; LINUX: .cfi_def_cfa_offset 16 +; LINUX-NOT: sub +; LINUX: pushl $4 +; LINUX: .cfi_adjust_cfa_offset 4 +; LINUX: pushl $3 +; LINUX: .cfi_adjust_cfa_offset 4 +; LINUX: pushl $2 +; LINUX: .cfi_adjust_cfa_offset 4 +; LINUX: pushl $1 +; LINUX: .cfi_adjust_cfa_offset 4 +; LINUX: calll good +; LINUX: addl $28, %esp +; LINUX: .cfi_adjust_cfa_offset -28 +; LINUX-NOT: add +; LINUX: retl +define void @pr27140() optsize { +entry: + tail call void @good(i32 1, i32 2, i32 3, i32 4) + ret void +} + +; Check that a stack restore (leal -4(%ebp), %esp) doesn't get merged with a +; stack adjustment (addl $12, %esp). Just because it's a lea doesn't mean it's +; simply decreasing the stack pointer. +; NORMAL-LABEL: test14: +; NORMAL: calll _B_func +; NORMAL: leal -4(%ebp), %esp +; NORMAL-NOT: %esp +; NORMAL: retl +%struct.A = type { i32, i32 } +%struct.B = type { i8 } +declare x86_thiscallcc %struct.B* @B_ctor(%struct.B* returned, %struct.A* byval) +declare void @B_func(%struct.B* sret, %struct.B*, i32) +define void @test14(%struct.A* %a) { +entry: + %ref.tmp = alloca %struct.B, align 1 + %agg.tmp = alloca i64, align 4 + %tmpcast = bitcast i64* %agg.tmp to %struct.A* + %tmp = alloca %struct.B, align 1 + %0 = bitcast %struct.A* %a to i64* + %1 = load i64, i64* %0, align 4 + store i64 %1, i64* %agg.tmp, align 4 + %call = call x86_thiscallcc %struct.B* @B_ctor(%struct.B* %ref.tmp, %struct.A* byval %tmpcast) + %2 = getelementptr inbounds %struct.B, %struct.B* %tmp, i32 0, i32 0 + call void @B_func(%struct.B* sret %tmp, %struct.B* %ref.tmp, i32 1) + ret void +} diff --git a/test/CodeGen/X86/push-cfi-debug.ll b/test/CodeGen/X86/push-cfi-debug.ll index 1dfe64e6980..217898fffb1 100644 --- a/test/CodeGen/X86/push-cfi-debug.ll +++ b/test/CodeGen/X86/push-cfi-debug.ll @@ -23,8 +23,8 @@ declare x86_stdcallcc void @stdfoo(i32, i32) #0 ; CHECK: .cfi_adjust_cfa_offset 4 ; CHECK: calll stdfoo ; CHECK: .cfi_adjust_cfa_offset -8 -; CHECK: addl $8, %esp -; CHECK: .cfi_adjust_cfa_offset -8 +; CHECK: addl $20, %esp +; CHECK: .cfi_adjust_cfa_offset -20 define void @test1() #0 !dbg !4 { entry: tail call void @foo(i32 1, i32 2) #1, !dbg !10 diff --git a/test/CodeGen/X86/push-cfi.ll b/test/CodeGen/X86/push-cfi.ll index 6389708f42c..5498af51f23 100644 --- a/test/CodeGen/X86/push-cfi.ll +++ b/test/CodeGen/X86/push-cfi.ll @@ -82,8 +82,8 @@ cleanup: ; LINUX-NEXT: Ltmp{{[0-9]+}}: ; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: call -; LINUX-NEXT: addl $16, %esp -; LINUX: .cfi_adjust_cfa_offset -16 +; LINUX-NEXT: addl $28, %esp +; LINUX: .cfi_adjust_cfa_offset -28 ; DARWIN-NOT: .cfi_escape ; DARWIN-NOT: pushl define void @test2_nofp() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {