diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp index 2af2be621ff..d7d0d3cb561 100644 --- a/lib/Target/X86/X86ExpandPseudo.cpp +++ b/lib/Target/X86/X86ExpandPseudo.cpp @@ -186,6 +186,38 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MBBI->eraseFromParent(); return true; } + case X86::LCMPXCHG8B_SAVE_EBX: + case X86::LCMPXCHG16B_SAVE_RBX: { + // Perform the following transformation. + // SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx + // => + // [E|R]BX = InArg + // actualcmpxchg Addr + // [E|R]BX = SaveRbx + const MachineOperand &InArg = MBBI->getOperand(6); + unsigned SaveRbx = MBBI->getOperand(7).getReg(); + + unsigned ActualInArg = + Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX; + // Copy the input argument of the pseudo into the argument of the + // actual instruction. + TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, InArg.getReg(), + InArg.isKill()); + // Create the actual instruction. + unsigned ActualOpc = + Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::LCMPXCHG8B : X86::LCMPXCHG16B; + MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(ActualOpc)); + // Copy the operands related to the address. + for (unsigned Idx = 1; Idx < 6; ++Idx) + NewInstr->addOperand(MBBI->getOperand(Idx)); + // Finally, restore the value of RBX. + TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, SaveRbx, + /*SrcIsKill*/ true); + + // Delete the pseudo. + MBBI->eraseFromParent(); + return true; + } } llvm_unreachable("Previous switch has a fallthrough?"); } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index eac67d1e67d..bbbbf3e6537 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -21254,20 +21254,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, DAG.getConstant(0, dl, HalfT)); swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(1, dl, HalfT)); - swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, - Regs64bit ? X86::RBX : X86::EBX, - swapInL, cpInH.getValue(1)); - swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, - Regs64bit ? X86::RCX : X86::ECX, - swapInH, swapInL.getValue(1)); - SDValue Ops[] = { swapInH.getValue(0), - N->getOperand(1), - swapInH.getValue(1) }; + swapInH = + DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, + swapInH, cpInH.getValue(1)); + // If the current function needs the base pointer, RBX, + // we shouldn't use cmpxchg directly. + // Indeed the lowering of that instruction will clobber + // that register and since RBX will be a reserved register + // the register allocator will not make sure its value will + // be properly saved and restored around this live-range. + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + unsigned BasePtr = TRI->getBaseRegister(); MachineMemOperand *MMO = cast(N)->getMemOperand(); - unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : - X86ISD::LCMPXCHG8_DAG; - SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); + if (TRI->hasBasePointer(DAG.getMachineFunction()) && + (BasePtr == X86::RBX || BasePtr == X86::EBX)) { + // ISel prefers the LCMPXCHG64 variant. + // If that assert breaks, that means it is not the case anymore, + // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, + // not just EBX. This is a matter of accepting i64 input for that + // pseudo, and restoring into the register of the right wide + // in expand pseudo. Everything else should just work. + assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) && + "Saving only half of the RBX"); + unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG + : X86ISD::LCMPXCHG8_SAVE_EBX_DAG; + SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl, + Regs64bit ? X86::RBX : X86::EBX, + HalfT, swapInH.getValue(1)); + SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL, + RBXSave, + /*Glue*/ RBXSave.getValue(2)}; + Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); + } else { + unsigned Opcode = + Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; + swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, + Regs64bit ? X86::RBX : X86::EBX, swapInL, + swapInH.getValue(1)); + SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), + swapInL.getValue(1)}; + Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); + } SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); @@ -21422,6 +21451,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; + case X86ISD::LCMPXCHG8_SAVE_EBX_DAG: + return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG"; + case X86ISD::LCMPXCHG16_SAVE_RBX_DAG: + return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG"; case X86ISD::LADD: return "X86ISD::LADD"; case X86ISD::LSUB: return "X86ISD::LSUB"; case X86ISD::LOR: return "X86ISD::LOR"; @@ -23569,6 +23602,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPSr213rY: return emitFMA3Instr(MI, BB); + case X86::LCMPXCHG8B_SAVE_EBX: + case X86::LCMPXCHG16B_SAVE_RBX: { + unsigned BasePtr = + MI->getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX; + if (!BB->isLiveIn(BasePtr)) + BB->addLiveIn(BasePtr); + return BB; + } } } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 57ba49b5035..94e2f54d6b2 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -523,6 +523,8 @@ namespace llvm { LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, LCMPXCHG16_DAG, + LCMPXCHG8_SAVE_EBX_DAG, + LCMPXCHG16_SAVE_RBX_DAG, /// LOCK-prefixed arithmetic read-modify-write instructions. /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS) diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 6ed62a00451..2c92cc370b5 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -737,6 +737,38 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", IIC_CMPX_LOCK_8B>; } +// This pseudo must be used when the frame uses RBX as +// the base pointer. Indeed, in such situation RBX is a reserved +// register and the register allocator will ignore any use/def of +// it. In other words, the register will not fix the clobbering of +// RBX that will happen when setting the arguments for the instrucion. +// +// Unlike the actual related instuction, we mark that this one +// defines EBX (instead of using EBX). +// The rationale is that we will define RBX during the expansion of +// the pseudo. The argument feeding EBX is ebx_input. +// +// The additional argument, $ebx_save, is a temporary register used to +// save the value of RBX accross the actual instruction. +// +// To make sure the register assigned to $ebx_save does not interfere with +// the definition of the actual instruction, we use a definition $dst which +// is tied to $rbx_save. That way, the live-range of $rbx_save spans accross +// the instruction and we are sure we will have a valid register to restore +// the value of RBX. +let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX], + SchedRW = [WriteALULd, WriteRMW], isCodeGenOnly = 1, isPseudo = 1, + Constraints = "$ebx_save = $dst", usesCustomInserter = 1 in { +def LCMPXCHG8B_SAVE_EBX : + I<0, Pseudo, (outs GR32:$dst), + (ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save), + !strconcat("cmpxchg8b", "\t$ptr"), + [(set GR32:$dst, (X86cas8save_ebx addr:$ptr, GR32:$ebx_input, + GR32:$ebx_save))], + IIC_CMPX_LOCK_8B>; +} + + let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in { defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", @@ -744,6 +776,20 @@ defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", IIC_CMPX_LOCK_16B>, REX_W; } +// Same as LCMPXCHG8B_SAVE_RBX but for the 16 Bytes variant. +let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX], + Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW], + isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst", + usesCustomInserter = 1 in { +def LCMPXCHG16B_SAVE_RBX : + I<0, Pseudo, (outs GR64:$dst), + (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), + !strconcat("cmpxchg16b", "\t$ptr"), + [(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input, + GR64:$rbx_save))], + IIC_CMPX_LOCK_16B>; +} + defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>; diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index aaf87a7b42a..873a44cb499 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -71,6 +71,12 @@ def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>; def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, SDTCisVT<2, i8>]>; def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDTX86caspairSaveEbx8 : SDTypeProfile<1, 3, + [SDTCisVT<0, i32>, SDTCisPtrTy<1>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; +def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3, + [SDTCisVT<0, i64>, SDTCisPtrTy<1>, + SDTCisVT<2, i64>, SDTCisVT<3, i64>]>; def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, @@ -155,6 +161,14 @@ def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair, def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +def X86cas8save_ebx : SDNode<"X86ISD::LCMPXCHG8_SAVE_EBX_DAG", + SDTX86caspairSaveEbx8, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG", + SDTX86caspairSaveRbx16, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; diff --git a/test/CodeGen/X86/atomic128.ll b/test/CodeGen/X86/atomic128.ll index c41269b0b60..1bf7bfbfa26 100644 --- a/test/CodeGen/X86/atomic128.ll +++ b/test/CodeGen/X86/atomic128.ll @@ -4,9 +4,14 @@ define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) { ; CHECK-LABEL: val_compare_and_swap: +; Due to the scheduling right after isel for cmpxchg and given the +; machine scheduler and copy coalescer do not mess up with physical +; register live-ranges, we end up with a useless copy. +; +; CHECK: movq %rcx, [[TMP:%r[0-9a-z]+]] ; CHECK: movq %rsi, %rax -; CHECK: movq %rcx, %rbx ; CHECK: movq %r8, %rcx +; CHECK: movq [[TMP]], %rbx ; CHECK: lock ; CHECK: cmpxchg16b (%rdi) @@ -216,8 +221,8 @@ define i128 @atomic_load_seq_cst(i128* %p) { ; CHECK-LABEL: atomic_load_seq_cst: ; CHECK: xorl %eax, %eax ; CHECK: xorl %edx, %edx -; CHECK: xorl %ebx, %ebx ; CHECK: xorl %ecx, %ecx +; CHECK: xorl %ebx, %ebx ; CHECK: lock ; CHECK: cmpxchg16b (%rdi) @@ -229,8 +234,8 @@ define i128 @atomic_load_relaxed(i128* %p) { ; CHECK: atomic_load_relaxed: ; CHECK: xorl %eax, %eax ; CHECK: xorl %edx, %edx -; CHECK: xorl %ebx, %ebx ; CHECK: xorl %ecx, %ecx +; CHECK: xorl %ebx, %ebx ; CHECK: lock ; CHECK: cmpxchg16b (%rdi) diff --git a/test/CodeGen/X86/base-pointer-and-cmpxchg.ll b/test/CodeGen/X86/base-pointer-and-cmpxchg.ll new file mode 100644 index 00000000000..8de6d64428e --- /dev/null +++ b/test/CodeGen/X86/base-pointer-and-cmpxchg.ll @@ -0,0 +1,51 @@ +; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+cx16 -x86-use-base-pointer=true -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=USE_BASE --check-prefix=USE_BASE_64 %s +; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+cx16 -x86-use-base-pointer=false -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=DONT_USE_BASE %s +; RUN: llc -mtriple=x86_64-linux-gnux32 -mattr=+cx16 -x86-use-base-pointer=true -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=USE_BASE --check-prefix=USE_BASE_32 %s +; RUN: llc -mtriple=x86_64-linux-gnux32 -mattr=+cx16 -x86-use-base-pointer=false -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=DONT_USE_BASE %s + +; This function uses dynamic allocated stack to force the use +; of a frame pointer. +; The inline asm clobbers a bunch of registers to make sure +; the frame pointer will need to be used (for spilling in that case). +; +; Then, we check that when we use rbx as the base pointer, +; we do not use cmpxchg, since using that instruction requires +; to clobbers rbx to set the arguments of the instruction and when +; rbx is used as the base pointer, RA cannot fix the code for us. +; +; CHECK-LABEL: cmp_and_swap16: +; Check that we actually use rbx. +; gnux32 use the 32bit variant of the registers. +; USE_BASE_64: movq %rsp, %rbx +; USE_BASE_32: movl %esp, %ebx +; +; Make sure the base pointer is saved before the RBX argument for +; cmpxchg16b is set. +; +; Because of how the test is written, we spill SAVE_RBX. +; However, it would have been perfectly fine to just keep it in register. +; USE_BASE: movq %rbx, [[SAVE_RBX_SLOT:[0-9]*\(%[er]bx\)]] +; +; SAVE_RBX must be in register before we clobber rbx. +; It is fine to use any register but rbx and the ones defined and use +; by cmpxchg. Since such regex would be complicated to write, just stick +; to the numbered registers. The bottom line is: if this test case fails +; because of that regex, this is likely just the regex being too conservative. +; USE_BASE: movq [[SAVE_RBX_SLOT]], [[SAVE_RBX:%r[0-9]+]] +; +; USE_BASE: movq {{[^ ]+}}, %rbx +; USE_BASE-NEXT: cmpxchg16b +; USE_BASE-NEXT: movq [[SAVE_RBX]], %rbx +; +; DONT_USE_BASE-NOT: movq %rsp, %rbx +; DONT_USE_BASE-NOT: movl %esp, %ebx +; DONT_USE_BASE: cmpxchg +define i1 @cmp_and_swap16(i128 %a, i128 %b, i128* %addr, i32 %n) { + %dummy = alloca i32, i32 %n +tail call void asm sideeffect "nop", "~{rax},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + %cmp = cmpxchg i128* %addr, i128 %a, i128 %b seq_cst seq_cst + %res = extractvalue { i128, i1 } %cmp, 1 + %idx = getelementptr i32, i32* %dummy, i32 5 + store i32 %n, i32* %idx + ret i1 %res +}