From 2028b793e1fd1a8dd4d99b0b7c9972865d5e806a Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Wed, 11 Jan 2012 19:00:37 +0000
Subject: [PATCH] Support segmented stacks on mac. This uses TLS slot 90, which
 actually belongs to JavaScriptCore. We only support frames with static size
 Patch by Brian Anderson.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@147960 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FrameLowering.cpp  |  83 ++++++--
 lib/Target/X86/X86TargetMachine.cpp  |   3 -
 test/CodeGen/X86/segmented-stacks.ll | 279 ++++++++++++++++++++-------
 3 files changed, 273 insertions(+), 92 deletions(-)
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 4386762c853..4cda76c0a41 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1298,10 +1298,15 @@ HasNestArgument(const MachineFunction *MF) {
   return false;
 }
 
+
+/// GetScratchRegister - Get a register for performing work in the segmented
+/// stack prologue. Depending on platform and the properties of the function
+/// either one or two registers will be needed. Set primary to true for
+/// the first register, false for the second.
 static unsigned
-GetScratchRegister(bool Is64Bit, const MachineFunction &MF) {
+GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) {
   if (Is64Bit) {
-    return X86::R11;
+    return Primary ? X86::R11 : X86::R12;
   } else {
     CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
     bool IsNested = HasNestArgument(&MF);
@@ -1313,13 +1318,13 @@ GetScratchRegister(bool Is64Bit, const MachineFunction &MF) {
                            "nested function.");
         return -1;
       } else {
-        return X86::EAX;
+        return Primary ? X86::EAX : X86::ECX;
       }
     } else {
       if (IsNested)
-        return X86::EDX;
+        return Primary ? X86::EDX : X86::EAX;
       else
-        return X86::ECX;
+        return Primary ? X86::ECX : X86::EAX;
     }
   }
 }
@@ -1339,14 +1344,14 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   DebugLoc DL;
   const X86Subtarget *ST = &MF.getTarget().getSubtarget<X86Subtarget>();
 
-  unsigned ScratchReg = GetScratchRegister(Is64Bit, MF);
+  unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true);
   assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
          "Scratch register is live-in");
 
   if (MF.getFunction()->isVarArg())
     report_fatal_error("Segmented stacks do not support vararg functions.");
-  if (!ST->isTargetLinux())
-    report_fatal_error("Segmented stacks supported only on linux.");
+  if (!ST->isTargetLinux() && !ST->isTargetDarwin())
+    report_fatal_error("Segmented stacks supported only on linux and darwin.");
 
   MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
   MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
@@ -1377,12 +1382,21 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   // prologue.
   StackSize = MFI->getStackSize();
 
+  // When the frame size is less than 256 we just compare the stack
+  // boundary directly to the value of the stack pointer, per gcc.
+  bool CompareStackPointer = StackSize < kSplitStackAvailable;
+
   // Read the limit off the current stacklet off the stack_guard location.
   if (Is64Bit) {
-    TlsReg = X86::FS;
-    TlsOffset = 0x70;
+    if (ST->isTargetLinux()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x70;
+    } else if (ST->isTargetDarwin()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
+    }
 
-    if (StackSize < kSplitStackAvailable)
+    if (CompareStackPointer)
       ScratchReg = X86::RSP;
     else
       BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP)
@@ -1392,16 +1406,55 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
       .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
   } else {
     TlsReg = X86::GS;
-    TlsOffset = 0x30;
 
-    if (StackSize < kSplitStackAvailable)
+    if (CompareStackPointer)
       ScratchReg = X86::ESP;
     else
       BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
         .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
 
-    BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
-      .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+    if (ST->isTargetLinux()) {
+      TlsOffset = 0x30;
+
+      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
+        .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+    } else if (ST->isTargetDarwin()) {
+      TlsOffset = 0x48 + 90*4;
+
+      // TlsOffset doesn't fit into a mod r/m byte so we need an extra register
+      unsigned ScratchReg2;
+      bool SaveScratch2;
+      if (CompareStackPointer) {
+        // The primary scratch register is available for holding the TLS offset
+        ScratchReg2 = GetScratchRegister(Is64Bit, MF, true);
+        SaveScratch2 = false;
+      } else {
+        // Need to use a second register to hold the TLS offset
+        ScratchReg2 = GetScratchRegister(Is64Bit, MF, false);
+
+        // Unfortunately, with fastcc the second scratch register may hold an arg
+        SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
+      }
+
+      // If Scratch2 is live-in then it needs to be saved
+      assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
+             "Scratch register is live-in and not saved");
+
+      if (SaveScratch2)
+        BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
+          .addReg(ScratchReg2, RegState::Kill);
+
+      BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
+        .addImm(TlsOffset);
+      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
+        .addReg(ScratchReg)
+        .addReg(ScratchReg2).addImm(1).addReg(0)
+        .addImm(0)
+        .addReg(TlsReg);
+
+      if (SaveScratch2)
+        BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
+    }
   }
 
   // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index d73a3dd7f3c..b8002d57ebf 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -102,9 +102,6 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
   // default to hard float ABI
   if (Options.FloatABIType == FloatABI::Default)
     this->Options.FloatABIType = FloatABI::Hard;   
-
-  if (Options.EnableSegmentedStacks && !Subtarget.isTargetELF())
-    report_fatal_error("Segmented stacks are only implemented on ELF.");
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 3ba18cffccf..6e91d00ac6d 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -1,9 +1,13 @@
-; RUN: llc < %s -mtriple=i686-linux -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-linux  -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-linux -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux
+; RUN: llc < %s -mtriple=x86_64-linux  -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux
+; RUN: llc < %s -mtriple=i686-darwin -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32-Darwin
+; RUN: llc < %s -mtriple=x86_64-darwin -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin
 
 ; We used to crash with filetype=obj
 ; RUN: llc < %s -mtriple=i686-linux -segmented-stacks -filetype=obj
 ; RUN: llc < %s -mtriple=x86_64-linux -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=i686-darwin -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=x86_64-darwin -segmented-stacks -filetype=obj
 
 ; Just to prevent the alloca from being optimized away
 declare void @dummy_use(i32*, i32)
@@ -13,25 +17,46 @@ define void @test_basic() {
         call void @dummy_use (i32* %mem, i32 10)
 	ret void
 
-; X32:      test_basic:
+; X32-Linux:       test_basic:
 
-; X32:      cmpl %gs:48, %esp
-; X32-NEXT: ja      .LBB0_2
+; X32-Linux:       cmpl %gs:48, %esp
+; X32-Linux-NEXT:  ja      .LBB0_2
 
-; X32:      pushl $0
-; X32-NEXT: pushl $60
-; X32-NEXT: calll __morestack
-; X32-NEXT: ret 
+; X32-Linux:       pushl $0
+; X32-Linux-NEXT:  pushl $60
+; X32-Linux-NEXT:  calll __morestack
+; X32-Linux-NEXT:  ret
 
-; X64:      test_basic:
+; X64-Linux:       test_basic:
 
-; X64:      cmpq %fs:112, %rsp
-; X64-NEXT: ja      .LBB0_2
+; X64-Linux:       cmpq %fs:112, %rsp
+; X64-Linux-NEXT:  ja      .LBB0_2
 
-; X64:      movabsq $40, %r10
-; X64-NEXT: movabsq $0, %r11
-; X64-NEXT: callq __morestack
-; X64-NEXT: ret
+; X64-Linux:       movabsq $40, %r10
+; X64-Linux-NEXT:  movabsq $0, %r11
+; X64-Linux-NEXT:  callq __morestack
+; X64-Linux-NEXT:  ret
+
+; X32-Darwin:      test_basic:
+
+; X32-Darwin:      movl $432, %ecx
+; X32-Darwin-NEXT: cmpl %gs:(%ecx), %esp
+; X32-Darwin-NEXT: ja      LBB0_2
+
+; X32-Darwin:      pushl $0
+; X32-Darwin-NEXT: pushl $60
+; X32-Darwin-NEXT: calll ___morestack
+; X32-Darwin-NEXT: ret
+
+; X64-Darwin:      test_basic:
+
+; X64-Darwin:      cmpq %gs:816, %rsp
+; X64-Darwin-NEXT: ja      LBB0_2
+
+; X64-Darwin:      movabsq $40, %r10
+; X64-Darwin-NEXT: movabsq $0, %r11
+; X64-Darwin-NEXT: callq ___morestack
+; X64-Darwin-NEXT: ret
 
 }
 
@@ -40,23 +65,42 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
        %result = add i32 %other, %addend
        ret i32 %result
 
-; X32:      cmpl %gs:48, %esp
-; X32-NEXT: ja      .LBB1_2
+; X32-Linux:       cmpl %gs:48, %esp
+; X32-Linux-NEXT:  ja      .LBB1_2
 
-; X32:      pushl $4
-; X32-NEXT: pushl $0
-; X32-NEXT: calll __morestack
-; X32-NEXT: ret
+; X32-Linux:       pushl $4
+; X32-Linux-NEXT:  pushl $0
+; X32-Linux-NEXT:  calll __morestack
+; X32-Linux-NEXT:  ret
 
-; X64:      cmpq %fs:112, %rsp
-; X64-NEXT: ja      .LBB1_2
+; X64-Linux:       cmpq %fs:112, %rsp
+; X64-Linux-NEXT:  ja      .LBB1_2
 
-; X64:      movq %r10, %rax
-; X64-NEXT: movabsq $0, %r10
-; X64-NEXT: movabsq $0, %r11
-; X64-NEXT: callq __morestack
-; X64-NEXT: ret
-; X64-NEXT: movq %rax, %r10
+; X64-Linux:       movq %r10, %rax
+; X64-Linux-NEXT:  movabsq $0, %r10
+; X64-Linux-NEXT:  movabsq $0, %r11
+; X64-Linux-NEXT:  callq __morestack
+; X64-Linux-NEXT:  ret
+; X64-Linux-NEXT:  movq %rax, %r10
+
+; X32-Darwin:      movl $432, %edx
+; X32-Darwin-NEXT: cmpl %gs:(%edx), %esp
+; X32-Darwin-NEXT: ja      LBB1_2
+
+; X32-Darwin:      pushl $4
+; X32-Darwin-NEXT: pushl $0
+; X32-Darwin-NEXT: calll ___morestack
+; X32-Darwin-NEXT: ret
+
+; X64-Darwin:      cmpq %gs:816, %rsp
+; X64-Darwin-NEXT: ja      LBB1_2
+
+; X64-Darwin:      movq %r10, %rax
+; X64-Darwin-NEXT: movabsq $0, %r10
+; X64-Darwin-NEXT: movabsq $0, %r11
+; X64-Darwin-NEXT: callq ___morestack
+; X64-Darwin-NEXT: ret
+; X64-Darwin-NEXT: movq %rax, %r10
 
 }
 
@@ -65,23 +109,42 @@ define void @test_large() {
         call void @dummy_use (i32* %mem, i32 0)
         ret void
 
-; X32:      leal -40012(%esp), %ecx
-; X32-NEXT: cmpl %gs:48, %ecx
-; X32-NEXT: ja      .LBB2_2
+; X32-Linux:       leal -40012(%esp), %ecx
+; X32-Linux-NEXT:  cmpl %gs:48, %ecx
+; X32-Linux-NEXT:  ja      .LBB2_2
 
-; X32:      pushl $0
-; X32-NEXT: pushl $40012
-; X32-NEXT: calll __morestack
-; X32-NEXT: ret
+; X32-Linux:       pushl $0
+; X32-Linux-NEXT:  pushl $40012
+; X32-Linux-NEXT:  calll __morestack
+; X32-Linux-NEXT:  ret
 
-; X64:      leaq -40008(%rsp), %r11
-; X64-NEXT: cmpq %fs:112, %r11
-; X64-NEXT: ja      .LBB2_2
+; X64-Linux:       leaq -40008(%rsp), %r11
+; X64-Linux-NEXT:  cmpq %fs:112, %r11
+; X64-Linux-NEXT:  ja      .LBB2_2
 
-; X64:      movabsq $40008, %r10
-; X64-NEXT: movabsq $0, %r11
-; X64-NEXT: callq __morestack
-; X64-NEXT: ret
+; X64-Linux:       movabsq $40008, %r10
+; X64-Linux-NEXT:  movabsq $0, %r11
+; X64-Linux-NEXT:  callq __morestack
+; X64-Linux-NEXT:  ret
+
+; X32-Darwin:      leal -40012(%esp), %ecx
+; X32-Darwin-NEXT: movl $432, %eax
+; X32-Darwin-NEXT: cmpl %gs:(%eax), %ecx
+; X32-Darwin-NEXT: ja      LBB2_2
+
+; X32-Darwin:      pushl $0
+; X32-Darwin-NEXT: pushl $40012
+; X32-Darwin-NEXT: calll ___morestack
+; X32-Darwin-NEXT: ret
+
+; X64-Darwin:      leaq -40008(%rsp), %r11
+; X64-Darwin-NEXT: cmpq %gs:816, %r11
+; X64-Darwin-NEXT: ja      LBB2_2
+
+; X64-Darwin:      movabsq $40008, %r10
+; X64-Darwin-NEXT: movabsq $0, %r11
+; X64-Darwin-NEXT: callq ___morestack
+; X64-Darwin-NEXT: ret
 
 }
 
@@ -90,25 +153,46 @@ define fastcc void @test_fastcc() {
         call void @dummy_use (i32* %mem, i32 10)
         ret void
 
-; X32:      test_fastcc:
+; X32-Linux:       test_fastcc:
 
-; X32:      cmpl %gs:48, %esp
-; X32-NEXT: ja      .LBB3_2
+; X32-Linux:       cmpl %gs:48, %esp
+; X32-Linux-NEXT:  ja      .LBB3_2
 
-; X32:      pushl $0
-; X32-NEXT: pushl $60
-; X32-NEXT: calll __morestack
-; X32-NEXT: ret
+; X32-Linux:       pushl $0
+; X32-Linux-NEXT:  pushl $60
+; X32-Linux-NEXT:  calll __morestack
+; X32-Linux-NEXT:  ret
 
-; X64:      test_fastcc:
+; X64-Linux:       test_fastcc:
 
-; X64:      cmpq %fs:112, %rsp
-; X64-NEXT: ja      .LBB3_2
+; X64-Linux:       cmpq %fs:112, %rsp
+; X64-Linux-NEXT:  ja      .LBB3_2
 
-; X64:      movabsq $40, %r10
-; X64-NEXT: movabsq $0, %r11
-; X64-NEXT: callq __morestack
-; X64-NEXT: ret
+; X64-Linux:       movabsq $40, %r10
+; X64-Linux-NEXT:  movabsq $0, %r11
+; X64-Linux-NEXT:  callq __morestack
+; X64-Linux-NEXT:  ret
+
+; X32-Darwin:      test_fastcc:
+
+; X32-Darwin:      movl $432, %eax
+; X32-Darwin-NEXT: cmpl %gs:(%eax), %esp
+; X32-Darwin-NEXT: ja      LBB3_2
+
+; X32-Darwin:      pushl $0
+; X32-Darwin-NEXT: pushl $60
+; X32-Darwin-NEXT: calll ___morestack
+; X32-Darwin-NEXT: ret
+
+; X64-Darwin:      test_fastcc:
+
+; X64-Darwin:      cmpq %gs:816, %rsp
+; X64-Darwin-NEXT: ja      LBB3_2
+
+; X64-Darwin:      movabsq $40, %r10
+; X64-Darwin-NEXT: movabsq $0, %r11
+; X64-Darwin-NEXT: callq ___morestack
+; X64-Darwin-NEXT: ret
 
 }
 
@@ -117,25 +201,72 @@ define fastcc void @test_fastcc_large() {
         call void @dummy_use (i32* %mem, i32 0)
         ret void
 
-; X32:      test_fastcc_large:
+; X32-Linux:       test_fastcc_large:
 
-; X32:      leal -40012(%esp), %eax
-; X32-NEXT: cmpl %gs:48, %eax
-; X32-NEXT: ja      .LBB4_2
+; X32-Linux:       leal -40012(%esp), %eax
+; X32-Linux-NEXT:  cmpl %gs:48, %eax
+; X32-Linux-NEXT:  ja      .LBB4_2
 
-; X32:      pushl $0
-; X32-NEXT: pushl $40012
-; X32-NEXT: calll __morestack
-; X32-NEXT: ret
+; X32-Linux:       pushl $0
+; X32-Linux-NEXT:  pushl $40012
+; X32-Linux-NEXT:  calll __morestack
+; X32-Linux-NEXT:  ret
 
-; X64:      test_fastcc_large:
+; X64-Linux:       test_fastcc_large:
 
-; X64:      leaq -40008(%rsp), %r11
-; X64-NEXT: cmpq %fs:112, %r11
-; X64-NEXT: ja      .LBB4_2
+; X64-Linux:       leaq -40008(%rsp), %r11
+; X64-Linux-NEXT:  cmpq %fs:112, %r11
+; X64-Linux-NEXT:  ja      .LBB4_2
+
+; X64-Linux:       movabsq $40008, %r10
+; X64-Linux-NEXT:  movabsq $0, %r11
+; X64-Linux-NEXT:  callq __morestack
+; X64-Linux-NEXT:  ret
+
+; X32-Darwin:      test_fastcc_large:
+
+; X32-Darwin:      leal -40012(%esp), %eax
+; X32-Darwin-NEXT: movl $432, %ecx
+; X32-Darwin-NEXT: cmpl %gs:(%ecx), %eax
+; X32-Darwin-NEXT: ja      LBB4_2
+
+; X32-Darwin:      pushl $0
+; X32-Darwin-NEXT: pushl $40012
+; X32-Darwin-NEXT: calll ___morestack
+; X32-Darwin-NEXT: ret
+
+; X64-Darwin:      test_fastcc_large:
+
+; X64-Darwin:      leaq -40008(%rsp), %r11
+; X64-Darwin-NEXT: cmpq %gs:816, %r11
+; X64-Darwin-NEXT: ja      LBB4_2
+
+; X64-Darwin:      movabsq $40008, %r10
+; X64-Darwin-NEXT: movabsq $0, %r11
+; X64-Darwin-NEXT: callq ___morestack
+; X64-Darwin-NEXT: ret
+
+}
+
+define fastcc void @test_fastcc_large_with_ecx_arg(i32 %a) {
+        %mem = alloca i32, i32 10000
+        call void @dummy_use (i32* %mem, i32 %a)
+        ret void
+
+; This is testing that the Mac implementation preserves ecx
+
+; X32-Darwin:      test_fastcc_large_with_ecx_arg:
+
+; X32-Darwin:      leal -40012(%esp), %eax
+; X32-Darwin-NEXT: pushl %ecx
+; X32-Darwin-NEXT: movl $432, %ecx
+; X32-Darwin-NEXT: cmpl %gs:(%ecx), %eax
+; X32-Darwin-NEXT: popl %ecx
+; X32-Darwin-NEXT: ja      LBB5_2
+
+; X32-Darwin:      pushl $0
+; X32-Darwin-NEXT: pushl $40012
+; X32-Darwin-NEXT: calll ___morestack
+; X32-Darwin-NEXT: ret
 
-; X64:      movabsq $40008, %r10
-; X64-NEXT: movabsq $0, %r11
-; X64-NEXT: callq __morestack
-; X64-NEXT: ret
 }