From 8d30502e60f412de082c1c774728e90aa5243e6c Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Fri, 17 Jun 2016 18:11:48 +0000 Subject: [PATCH] Support expanding partial-word cmpxchg to full-word cmpxchg in AtomicExpandPass. Many CPUs only have the ability to do a 4-byte cmpxchg (or ll/sc), not 1 or 2-byte. For those, you need to mask and shift the 1 or 2 byte values appropriately to use the 4-byte instruction. This change adds support for cmpxchg-based instruction sets (only SPARC, in LLVM). The support can be extended for LL/SC-based PPC and MIPS in the future, supplanting the ISel expansions those architectures currently use. Tests added for the IR transform and SPARCv9. Differential Revision: http://reviews.llvm.org/D21029 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@273025 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 18 + lib/CodeGen/AtomicExpandPass.cpp | 459 +++++++++++++++--- lib/CodeGen/TargetLoweringBase.cpp | 2 + lib/Target/Sparc/SparcISelLowering.cpp | 2 + test/CodeGen/SPARC/atomics.ll | 132 ++++- .../Transforms/AtomicExpand/SPARC/partword.ll | 166 +++++++ 6 files changed, 723 insertions(+), 56 deletions(-) create mode 100644 test/Transforms/AtomicExpand/SPARC/partword.ll diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 5dfe7dc5bde..14bf0e7ab59 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -1136,6 +1136,15 @@ public: return MaxAtomicSizeInBitsSupported; } + /// Returns the size of the smallest cmpxchg or ll/sc instruction + /// the backend supports. Any smaller operations are widened in + /// AtomicExpandPass. + /// + /// Note that *unlike* operations above the maximum size, atomic ops + /// are still natively supported below the minimum; they just + /// require a more complex expansion. + unsigned getMinCmpXchgSizeInBits() const { return MinCmpXchgSizeInBits; } + /// Whether AtomicExpandPass should automatically insert fences and reduce /// ordering for this atomic. This should be true for most architectures with /// weak memory ordering. Defaults to false. @@ -1552,6 +1561,11 @@ protected: MaxAtomicSizeInBitsSupported = SizeInBits; } + // Sets the minimum cmpxchg or ll/sc size supported by the backend. + void setMinCmpXchgSizeInBits(unsigned SizeInBits) { + MinCmpXchgSizeInBits = SizeInBits; + } + public: //===--------------------------------------------------------------------===// // Addressing mode description hooks (used by LSR etc). @@ -1965,6 +1979,10 @@ private: /// Accesses larger than this will be expanded by AtomicExpandPass. unsigned MaxAtomicSizeInBitsSupported; + /// Size in bits of the minimum cmpxchg or ll/sc operation the + /// backend supports. + unsigned MinCmpXchgSizeInBits; + /// If set to a physical register, this specifies the register that /// llvm.savestack/llvm.restorestack should save and restore. 
unsigned StackPointerRegisterToSaveRestore; diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index 4b26b643619..bf5cf105a8f 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -57,10 +57,25 @@ namespace { StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI); bool expandAtomicStore(StoreInst *SI); bool tryExpandAtomicRMW(AtomicRMWInst *AI); - bool expandAtomicOpToLLSC( - Instruction *I, Value *Addr, AtomicOrdering MemOpOrder, + Value * + insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr, + AtomicOrdering MemOpOrder, + function_ref &, Value *)> PerformOp); + void expandAtomicOpToLLSC( + Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder, function_ref &, Value *)> PerformOp); + void expandPartwordAtomicRMW( + AtomicRMWInst *I, + TargetLoweringBase::AtomicExpansionKind ExpansionKind); + void expandPartwordCmpXchg(AtomicCmpXchgInst *I); + AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI); + static Value *insertRMWCmpXchgLoop( + IRBuilder<> &Builder, Type *ResultType, Value *Addr, + AtomicOrdering MemOpOrder, + function_ref &, Value *)> PerformOp, + CreateCmpXchgInstFun CreateCmpXchg); + bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); bool isIdempotentRMW(AtomicRMWInst *AI); bool simplifyIdempotentRMW(AtomicRMWInst *AI); @@ -74,6 +89,10 @@ namespace { void expandAtomicStoreToLibcall(StoreInst *LI); void expandAtomicRMWToLibcall(AtomicRMWInst *I); void expandAtomicCASToLibcall(AtomicCmpXchgInst *I); + + friend bool + llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, + CreateCmpXchgInstFun CreateCmpXchg); }; } @@ -285,9 +304,17 @@ bool AtomicExpand::runOnFunction(Function &F) { "invariant broken"); MadeChange = true; } - - if (TLI->shouldExpandAtomicCmpXchgInIR(CASI)) - MadeChange |= expandAtomicCmpXchg(CASI); + + unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; + unsigned ValueSize = getAtomicOpSize(CASI); + if (ValueSize < MinCASSize) { + assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) && + "MinCmpXchgSizeInBits not yet supported for LL/SC expansions."); + expandPartwordCmpXchg(CASI); + } else { + if (TLI->shouldExpandAtomicCmpXchgInIR(CASI)) + MadeChange |= expandAtomicCmpXchg(CASI); + } } } return MadeChange; @@ -355,9 +382,10 @@ bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) { case TargetLoweringBase::AtomicExpansionKind::None: return false; case TargetLoweringBase::AtomicExpansionKind::LLSC: - return expandAtomicOpToLLSC( - LI, LI->getPointerOperand(), LI->getOrdering(), + expandAtomicOpToLLSC( + LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(), [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; }); + return true; case TargetLoweringBase::AtomicExpansionKind::LLOnly: return expandAtomicLoadToLL(LI); case TargetLoweringBase::AtomicExpansionKind::CmpXChg: @@ -498,32 +526,353 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { switch (TLI->shouldExpandAtomicRMWInIR(AI)) { case TargetLoweringBase::AtomicExpansionKind::None: return false; - case TargetLoweringBase::AtomicExpansionKind::LLSC: - return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(), - [&](IRBuilder<> &Builder, Value *Loaded) { - return performAtomicOp(AI->getOperation(), - Builder, Loaded, - AI->getValOperand()); - }); - case TargetLoweringBase::AtomicExpansionKind::CmpXChg: - return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); + case TargetLoweringBase::AtomicExpansionKind::LLSC: { + unsigned MinCASSize = 
TLI->getMinCmpXchgSizeInBits() / 8; + unsigned ValueSize = getAtomicOpSize(AI); + if (ValueSize < MinCASSize) { + llvm_unreachable( + "MinCmpXchgSizeInBits not yet supported for LL/SC architectures."); + } else { + auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) { + return performAtomicOp(AI->getOperation(), Builder, Loaded, + AI->getValOperand()); + }; + expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(), + AI->getOrdering(), PerformOp); + } + return true; + } + case TargetLoweringBase::AtomicExpansionKind::CmpXChg: { + unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; + unsigned ValueSize = getAtomicOpSize(AI); + if (ValueSize < MinCASSize) { + expandPartwordAtomicRMW(AI, + TargetLoweringBase::AtomicExpansionKind::CmpXChg); + } else { + expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); + } + return true; + } default: llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); } } -bool AtomicExpand::expandAtomicOpToLLSC( - Instruction *I, Value *Addr, AtomicOrdering MemOpOrder, - function_ref &, Value *)> PerformOp) { +namespace { + +/// Result values from createMaskInstrs helper. +struct PartwordMaskValues { + Type *WordType; + Type *ValueType; + Value *AlignedAddr; + Value *ShiftAmt; + Value *Mask; + Value *Inv_Mask; +}; +} // end anonymous namespace + +/// This is a helper function which builds instructions to provide +/// values necessary for partword atomic operations. It takes an +/// incoming address, Addr, and ValueType, and constructs the address, +/// shift-amounts and masks needed to work with a larger value of size +/// WordSize. +/// +/// AlignedAddr: Addr rounded down to a multiple of WordSize +/// +/// ShiftAmt: Number of bits to right-shift a WordSize value loaded +/// from AlignAddr for it to have the same value as if +/// ValueType was loaded from Addr. +/// +/// Mask: Value to mask with the value loaded from AlignAddr to +/// include only the part that would've been loaded from Addr. +/// +/// Inv_Mask: The inverse of Mask. + +static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I, + Type *ValueType, Value *Addr, + unsigned WordSize) { + PartwordMaskValues Ret; + BasicBlock *BB = I->getParent(); Function *F = BB->getParent(); + Module *M = I->getModule(); + LLVMContext &Ctx = F->getContext(); + const DataLayout &DL = M->getDataLayout(); + + unsigned ValueSize = DL.getTypeStoreSize(ValueType); + + assert(ValueSize < WordSize); + + Ret.ValueType = ValueType; + Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8); + + Type *WordPtrType = + Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace()); + + Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx)); + Ret.AlignedAddr = Builder.CreateIntToPtr( + Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType, + "AlignedAddr"); + + Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB"); + if (DL.isLittleEndian()) { + // turn bytes into bits + Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3); + } else { + // turn bytes into bits, and count from the other side. + Ret.ShiftAmt = + Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3); + } + + Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt"); + Ret.Mask = Builder.CreateShl( + ConstantInt::get(Ret.WordType, (1 << ValueSize * 8) - 1), Ret.ShiftAmt, + "Mask"); + Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask"); + + return Ret; +} + +/// Emit IR to implement a masked version of a given atomicrmw +/// operation. 
(That is, only the bits under the Mask should be +/// affected by the operation) +static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op, + IRBuilder<> &Builder, Value *Loaded, + Value *Shifted_Inc, Value *Inc, + const PartwordMaskValues &PMV) { + switch (Op) { + case AtomicRMWInst::Xchg: { + Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); + Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc); + return FinalVal; + } + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: + // Or/Xor won't affect any other bits, so can just be done + // directly. + return performAtomicOp(Op, Builder, Loaded, Shifted_Inc); + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + case AtomicRMWInst::And: + case AtomicRMWInst::Nand: { + // The other arithmetic ops need to be masked into place. + Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc); + Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask); + Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); + Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked); + return FinalVal; + } + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: { + // Finally, comparison ops will operate on the full value, so + // truncate down to the original size, and expand out again after + // doing the operation. + Value *Loaded_Shiftdown = Builder.CreateTrunc( + Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType); + Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc); + Value *NewVal_Shiftup = Builder.CreateShl( + Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt); + Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask); + Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup); + return FinalVal; + } + default: + llvm_unreachable("Unknown atomic op"); + } +} + +/// Expand a sub-word atomicrmw operation into an appropriate +/// word-sized operation. +/// +/// It will create an LL/SC or cmpxchg loop, as appropriate, the same +/// way as a typical atomicrmw expansion. The only difference here is +/// that the operation inside of the loop must operate only upon a +/// part of the value. +void AtomicExpand::expandPartwordAtomicRMW( + AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) { + + assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg); + + AtomicOrdering MemOpOrder = AI->getOrdering(); + + IRBuilder<> Builder(AI); + + PartwordMaskValues PMV = + createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(), + TLI->getMinCmpXchgSizeInBits() / 8); + + Value *ValOperand_Shifted = + Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType), + PMV.ShiftAmt, "ValOperand_Shifted"); + + auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) { + return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded, + ValOperand_Shifted, AI->getValOperand(), PMV); + }; + + // TODO: When we're ready to support LLSC conversions too, use + // insertRMWLLSCLoop here for ExpansionKind==LLSC. 
+ Value *OldResult = + insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder, + PerformPartwordOp, createCmpXchgInstFun); + Value *FinalOldResult = Builder.CreateTrunc( + Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType); + AI->replaceAllUsesWith(FinalOldResult); + AI->eraseFromParent(); +} + +void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) { + // The basic idea here is that we're expanding a cmpxchg of a + // smaller memory size up to a word-sized cmpxchg. To do this, we + // need to add a retry-loop for strong cmpxchg, so that + // modifications to other parts of the word don't cause a spurious + // failure. + + // This generates code like the following: + // [[Setup mask values PMV.*]] + // %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt + // %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt + // %InitLoaded = load i32* %addr + // %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask + // br partword.cmpxchg.loop + // partword.cmpxchg.loop: + // %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ], + // [ %OldVal_MaskOut, %partword.cmpxchg.failure ] + // %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted + // %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted + // %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp, + // i32 %FullWord_NewVal success_ordering failure_ordering + // %OldVal = extractvalue { i32, i1 } %NewCI, 0 + // %Success = extractvalue { i32, i1 } %NewCI, 1 + // br i1 %Success, label %partword.cmpxchg.end, + // label %partword.cmpxchg.failure + // partword.cmpxchg.failure: + // %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask + // %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut + // br i1 %ShouldContinue, label %partword.cmpxchg.loop, + // label %partword.cmpxchg.end + // partword.cmpxchg.end: + // %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt + // %FinalOldVal = trunc i32 %tmp1 to i8 + // %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0 + // %Res = insertvalue { i8, i1 } %25, i1 %Success, 1 + + Value *Addr = CI->getPointerOperand(); + Value *Cmp = CI->getCompareOperand(); + Value *NewVal = CI->getNewValOperand(); + + BasicBlock *BB = CI->getParent(); + Function *F = BB->getParent(); + IRBuilder<> Builder(CI); + LLVMContext &Ctx = Builder.getContext(); + + const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8; + + BasicBlock *EndBB = + BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end"); + auto FailureBB = + BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB); + auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB); + + // The split call above "helpfully" added a branch at the end of BB + // (to the wrong place). + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + + PartwordMaskValues PMV = createMaskInstrs( + Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize); + + // Shift the incoming values over, into the right location in the word. 
+ Value *NewVal_Shifted = + Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt); + Value *Cmp_Shifted = + Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt); + + // Load the entire current word, and mask into place the expected and new + // values + LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr); + InitLoaded->setVolatile(CI->isVolatile()); + Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask); + Builder.CreateBr(LoopBB); + + // partword.cmpxchg.loop: + Builder.SetInsertPoint(LoopBB); + PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2); + Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB); + + // Mask/Or the expected and new values into place in the loaded word. + Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted); + Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted); + AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg( + PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(), + CI->getFailureOrdering(), CI->getSynchScope()); + NewCI->setVolatile(CI->isVolatile()); + // When we're building a strong cmpxchg, we need a loop, so you + // might think we could use a weak cmpxchg inside. But, using strong + // allows the below comparison for ShouldContinue, and we're + // expecting the underlying cmpxchg to be a machine instruction, + // which is strong anyways. + NewCI->setWeak(CI->isWeak()); + + Value *OldVal = Builder.CreateExtractValue(NewCI, 0); + Value *Success = Builder.CreateExtractValue(NewCI, 1); + + if (CI->isWeak()) + Builder.CreateBr(EndBB); + else + Builder.CreateCondBr(Success, EndBB, FailureBB); + + // partword.cmpxchg.failure: + Builder.SetInsertPoint(FailureBB); + // Upon failure, verify that the masked-out part of the loaded value + // has been modified. If it didn't, abort the cmpxchg, since the + // masked-in part must've. + Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask); + Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut); + Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB); + + // Add the second value to the phi from above + Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB); + + // partword.cmpxchg.end: + Builder.SetInsertPoint(CI); + + Value *FinalOldVal = Builder.CreateTrunc( + Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType); + Value *Res = UndefValue::get(CI->getType()); + Res = Builder.CreateInsertValue(Res, FinalOldVal, 0); + Res = Builder.CreateInsertValue(Res, Success, 1); + + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); +} + +void AtomicExpand::expandAtomicOpToLLSC( + Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder, + function_ref &, Value *)> PerformOp) { + IRBuilder<> Builder(I); + Value *Loaded = + insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp); + + I->replaceAllUsesWith(Loaded); + I->eraseFromParent(); +} + +Value *AtomicExpand::insertRMWLLSCLoop( + IRBuilder<> &Builder, Type *ResultTy, Value *Addr, + AtomicOrdering MemOpOrder, + function_ref &, Value *)> PerformOp) { + LLVMContext &Ctx = Builder.getContext(); + BasicBlock *BB = Builder.GetInsertBlock(); + Function *F = BB->getParent(); // Given: atomicrmw some_op iN* %addr, iN %incr ordering // // The standard expansion we produce is: // [...] - // fence? 
// atomicrmw.start: // %loaded = @load.linked(%addr) // %new = some_op iN %loaded, %incr @@ -531,17 +880,13 @@ bool AtomicExpand::expandAtomicOpToLLSC( // %try_again = icmp i32 ne %stored, 0 // br i1 %try_again, label %loop, label %atomicrmw.end // atomicrmw.end: - // fence? // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end"); + BasicBlock *ExitBB = + BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - // This grabs the DebugLoc from I. - IRBuilder<> Builder(I); - // The split call above "helpfully" added a branch at the end of BB (to the - // wrong place), but we might want a fence too. It's easiest to just remove - // the branch entirely. + // wrong place). std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); Builder.CreateBr(LoopBB); @@ -559,11 +904,7 @@ bool AtomicExpand::expandAtomicOpToLLSC( Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - - I->replaceAllUsesWith(Loaded); - I->eraseFromParent(); - - return true; + return Loaded; } /// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of @@ -867,17 +1208,14 @@ bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) { return false; } -bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, - CreateCmpXchgInstFun CreateCmpXchg) { - assert(AI); - - AtomicOrdering MemOpOrder = AI->getOrdering() == AtomicOrdering::Unordered - ? AtomicOrdering::Monotonic - : AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); +Value *AtomicExpand::insertRMWCmpXchgLoop( + IRBuilder<> &Builder, Type *ResultTy, Value *Addr, + AtomicOrdering MemOpOrder, + function_ref &, Value *)> PerformOp, + CreateCmpXchgInstFun CreateCmpXchg) { + LLVMContext &Ctx = Builder.getContext(); + BasicBlock *BB = Builder.GetInsertBlock(); Function *F = BB->getParent(); - LLVMContext &Ctx = F->getContext(); // Given: atomicrmw some_op iN* %addr, iN %incr ordering // @@ -894,34 +1232,34 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, // br i1 %success, label %atomicrmw.end, label %loop // atomicrmw.end: // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end"); + BasicBlock *ExitBB = + BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); - // The split call above "helpfully" added a branch at the end of BB (to the // wrong place), but we want a load. It's easiest to just remove // the branch entirely. std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); - LoadInst *InitLoaded = Builder.CreateLoad(Addr); + LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr); // Atomics require at least natural alignment. - InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8); + InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8); Builder.CreateBr(LoopBB); // Start the main loop block now that we've taken care of the preliminaries. 
Builder.SetInsertPoint(LoopBB); - PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); + PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded"); Loaded->addIncoming(InitLoaded, BB); - Value *NewVal = - performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + Value *NewVal = PerformOp(Builder, Loaded); Value *NewLoaded = nullptr; Value *Success = nullptr; - CreateCmpXchg(Builder, Addr, Loaded, NewVal, MemOpOrder, + CreateCmpXchg(Builder, Addr, Loaded, NewVal, + MemOpOrder == AtomicOrdering::Unordered + ? AtomicOrdering::Monotonic + : MemOpOrder, Success, NewLoaded); assert(Success && NewLoaded); @@ -930,10 +1268,23 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, Builder.CreateCondBr(Success, ExitBB, LoopBB); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + return NewLoaded; +} - AI->replaceAllUsesWith(NewLoaded); +// Note: This function is exposed externally by AtomicExpandUtils.h +bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, + CreateCmpXchgInstFun CreateCmpXchg) { + IRBuilder<> Builder(AI); + Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop( + Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(), + [&](IRBuilder<> &Builder, Value *Loaded) { + return performAtomicOp(AI->getOperation(), Builder, Loaded, + AI->getValOperand()); + }, + CreateCmpXchg); + + AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); - return true; } diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 52ab7cf3763..aa7c4a67209 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -830,6 +830,8 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { // with the Target-specific changes necessary. MaxAtomicSizeInBitsSupported = 1024; + MinCmpXchgSizeInBits = 0; + std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames), nullptr); InitLibcallNames(LibcallRoutineNames, TM.getTargetTriple()); diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 895593528ae..605acd6df05 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -1647,6 +1647,8 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, else setMaxAtomicSizeInBitsSupported(0); + setMinCmpXchgSizeInBits(32); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Legal); setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Legal); diff --git a/test/CodeGen/SPARC/atomics.ll b/test/CodeGen/SPARC/atomics.ll index 6fe333a4be8..5e608e728c3 100644 --- a/test/CodeGen/SPARC/atomics.ll +++ b/test/CodeGen/SPARC/atomics.ll @@ -64,6 +64,90 @@ entry: ret i64 %2 } +;; TODO: the "move %icc" and related instructions are totally +;; redundant here. There's something weird happening in optimization +;; of the success value of cmpxchg. 
+ +; CHECK-LABEL: test_cmpxchg_i8 +; CHECK: and %o1, -4, %o2 +; CHECK: mov 3, %o3 +; CHECK: andn %o3, %o1, %o1 +; CHECK: sll %o1, 3, %o1 +; CHECK: mov 255, %o3 +; CHECK: sll %o3, %o1, %o5 +; CHECK: xor %o5, -1, %o3 +; CHECK: mov 123, %o4 +; CHECK: ld [%o2], %g2 +; CHECK: sll %o4, %o1, %o4 +; CHECK: and %o0, 255, %o0 +; CHECK: sll %o0, %o1, %o0 +; CHECK: andn %g2, %o5, %g2 +; CHECK: sethi 0, %o5 +; CHECK: [[LABEL1:\.L.*]]: +; CHECK: or %g2, %o4, %g3 +; CHECK: or %g2, %o0, %g4 +; CHECK: cas [%o2], %g4, %g3 +; CHECK: cmp %g3, %g4 +; CHECK: mov %o5, %g4 +; CHECK: move %icc, 1, %g4 +; CHECK: cmp %g4, 0 +; CHECK: bne [[LABEL2:\.L.*]] +; CHECK: nop +; CHECK: and %g3, %o3, %g4 +; CHECK: cmp %g2, %g4 +; CHECK: bne [[LABEL1]] +; CHECK: mov %g4, %g2 +; CHECK: [[LABEL2]]: +; CHECK: retl +; CHECK: srl %g3, %o1, %o0 +define i8 @test_cmpxchg_i8(i8 %a, i8* %ptr) { +entry: + %pair = cmpxchg i8* %ptr, i8 %a, i8 123 monotonic monotonic + %b = extractvalue { i8, i1 } %pair, 0 + ret i8 %b +} + +; CHECK-LABEL: test_cmpxchg_i16 + +; CHECK: and %o1, -4, %o2 +; CHECK: and %o1, 3, %o1 +; CHECK: xor %o1, 2, %o1 +; CHECK: sll %o1, 3, %o1 +; CHECK: sethi 63, %o3 +; CHECK: or %o3, 1023, %o4 +; CHECK: sll %o4, %o1, %o5 +; CHECK: xor %o5, -1, %o3 +; CHECK: and %o0, %o4, %o4 +; CHECK: ld [%o2], %g2 +; CHECK: mov 123, %o0 +; CHECK: sll %o0, %o1, %o0 +; CHECK: sll %o4, %o1, %o4 +; CHECK: andn %g2, %o5, %g2 +; CHECK: sethi 0, %o5 +; CHECK: [[LABEL1:\.L.*]]: +; CHECK: or %g2, %o0, %g3 +; CHECK: or %g2, %o4, %g4 +; CHECK: cas [%o2], %g4, %g3 +; CHECK: cmp %g3, %g4 +; CHECK: mov %o5, %g4 +; CHECK: move %icc, 1, %g4 +; CHECK: cmp %g4, 0 +; CHECK: bne [[LABEL2:\.L.*]] +; CHECK: nop +; CHECK: and %g3, %o3, %g4 +; CHECK: cmp %g2, %g4 +; CHECK: bne [[LABEL1]] +; CHECK: mov %g4, %g2 +; CHECK: [[LABEL2]]: +; CHECK: retl +; CHECK: srl %g3, %o1, %o0 +define i16 @test_cmpxchg_i16(i16 %a, i16* %ptr) { +entry: + %pair = cmpxchg i16* %ptr, i16 %a, i16 123 monotonic monotonic + %b = extractvalue { i16, i1 } %pair, 0 + ret i16 %b +} + ; CHECK-LABEL: test_cmpxchg_i32 ; CHECK: mov 123, [[R:%[gilo][0-7]]] ; CHECK: cas [%o1], %o0, [[R]] @@ -86,6 +170,26 @@ entry: ret i64 %b } +; CHECK-LABEL: test_swap_i8 +; CHECK: mov 42, [[R:%[gilo][0-7]]] +; CHECK: cas + +define i8 @test_swap_i8(i8 %a, i8* %ptr) { +entry: + %b = atomicrmw xchg i8* %ptr, i8 42 monotonic + ret i8 %b +} + +; CHECK-LABEL: test_swap_i16 +; CHECK: mov 42, [[R:%[gilo][0-7]]] +; CHECK: cas + +define i16 @test_swap_i16(i16 %a, i16* %ptr) { +entry: + %b = atomicrmw xchg i16* %ptr, i16 42 monotonic + ret i16 %b +} + ; CHECK-LABEL: test_swap_i32 ; CHECK: mov 42, [[R:%[gilo][0-7]]] ; CHECK: swap [%o1], [[R]] @@ -105,12 +209,36 @@ entry: ret i64 %b } -; CHECK-LABEL: test_load_add_32 +; CHECK-LABEL: test_load_sub_i8 +; CHECK: membar +; CHECK: .L{{.*}}: +; CHECK: sub +; CHECK: cas [{{%[gilo][0-7]}}] +; CHECK: membar +define zeroext i8 @test_load_sub_i8(i8* %p, i8 zeroext %v) { +entry: + %0 = atomicrmw sub i8* %p, i8 %v seq_cst + ret i8 %0 +} + +; CHECK-LABEL: test_load_sub_i16 +; CHECK: membar +; CHECK: .L{{.*}}: +; CHECK: sub +; CHECK: cas [{{%[gilo][0-7]}}] +; CHECK: membar +define zeroext i16 @test_load_sub_i16(i16* %p, i16 zeroext %v) { +entry: + %0 = atomicrmw sub i16* %p, i16 %v seq_cst + ret i16 %0 +} + +; CHECK-LABEL: test_load_add_i32 ; CHECK: membar ; CHECK: add [[V:%[gilo][0-7]]], %o1, [[U:%[gilo][0-7]]] ; CHECK: cas [%o0], [[V]], [[U]] ; CHECK: membar -define zeroext i32 @test_load_add_32(i32* %p, i32 zeroext %v) { +define zeroext i32 @test_load_add_i32(i32* %p, i32 zeroext %v) { 
entry: %0 = atomicrmw add i32* %p, i32 %v seq_cst ret i32 %0 diff --git a/test/Transforms/AtomicExpand/SPARC/partword.ll b/test/Transforms/AtomicExpand/SPARC/partword.ll new file mode 100644 index 00000000000..9963d17c242 --- /dev/null +++ b/test/Transforms/AtomicExpand/SPARC/partword.ll @@ -0,0 +1,166 @@ +; RUN: opt -S %s -atomic-expand | FileCheck %s + +;; Verify the cmpxchg and atomicrmw expansions where sub-word-size +;; instructions are not available. + +;;; NOTE: this test is mostly target-independent -- any target which +;;; doesn't support cmpxchg of sub-word sizes would do. +target datalayout = "E-m:e-i64:64-n32:64-S128" +target triple = "sparcv9-unknown-unknown" + +; CHECK-LABEL: @test_cmpxchg_i8( +; CHECK: fence seq_cst +; CHECK: %0 = ptrtoint i8* %arg to i64 +; CHECK: %1 = and i64 %0, -4 +; CHECK: %AlignedAddr = inttoptr i64 %1 to i32* +; CHECK: %PtrLSB = and i64 %0, 3 +; CHECK: %2 = xor i64 %PtrLSB, 3 +; CHECK: %3 = shl i64 %2, 3 +; CHECK: %ShiftAmt = trunc i64 %3 to i32 +; CHECK: %Mask = shl i32 255, %ShiftAmt +; CHECK: %Inv_Mask = xor i32 %Mask, -1 +; CHECK: %4 = zext i8 %new to i32 +; CHECK: %5 = shl i32 %4, %ShiftAmt +; CHECK: %6 = zext i8 %old to i32 +; CHECK: %7 = shl i32 %6, %ShiftAmt +; CHECK: %8 = load i32, i32* %AlignedAddr +; CHECK: %9 = and i32 %8, %Inv_Mask +; CHECK: br label %partword.cmpxchg.loop +; CHECK:partword.cmpxchg.loop: +; CHECK: %10 = phi i32 [ %9, %entry ], [ %16, %partword.cmpxchg.failure ] +; CHECK: %11 = or i32 %10, %5 +; CHECK: %12 = or i32 %10, %7 +; CHECK: %13 = cmpxchg i32* %AlignedAddr, i32 %12, i32 %11 monotonic monotonic +; CHECK: %14 = extractvalue { i32, i1 } %13, 0 +; CHECK: %15 = extractvalue { i32, i1 } %13, 1 +; CHECK: br i1 %15, label %partword.cmpxchg.end, label %partword.cmpxchg.failure +; CHECK:partword.cmpxchg.failure: +; CHECK: %16 = and i32 %14, %Inv_Mask +; CHECK: %17 = icmp ne i32 %10, %16 +; CHECK: br i1 %17, label %partword.cmpxchg.loop, label %partword.cmpxchg.end +; CHECK:partword.cmpxchg.end: +; CHECK: %18 = lshr i32 %14, %ShiftAmt +; CHECK: %19 = trunc i32 %18 to i8 +; CHECK: %20 = insertvalue { i8, i1 } undef, i8 %19, 0 +; CHECK: %21 = insertvalue { i8, i1 } %20, i1 %15, 1 +; CHECK: fence seq_cst +; CHECK: %ret = extractvalue { i8, i1 } %21, 0 +; CHECK: ret i8 %ret +define i8 @test_cmpxchg_i8(i8* %arg, i8 %old, i8 %new) { +entry: + %ret_succ = cmpxchg i8* %arg, i8 %old, i8 %new seq_cst monotonic + %ret = extractvalue { i8, i1 } %ret_succ, 0 + ret i8 %ret +} + +; CHECK-LABEL: @test_cmpxchg_i16( +; CHECK: fence seq_cst +; CHECK: %0 = ptrtoint i16* %arg to i64 +; CHECK: %1 = and i64 %0, -4 +; CHECK: %AlignedAddr = inttoptr i64 %1 to i32* +; CHECK: %PtrLSB = and i64 %0, 3 +; CHECK: %2 = xor i64 %PtrLSB, 2 +; CHECK: %3 = shl i64 %2, 3 +; CHECK: %ShiftAmt = trunc i64 %3 to i32 +; CHECK: %Mask = shl i32 65535, %ShiftAmt +; CHECK: %Inv_Mask = xor i32 %Mask, -1 +; CHECK: %4 = zext i16 %new to i32 +; CHECK: %5 = shl i32 %4, %ShiftAmt +; CHECK: %6 = zext i16 %old to i32 +; CHECK: %7 = shl i32 %6, %ShiftAmt +; CHECK: %8 = load i32, i32* %AlignedAddr +; CHECK: %9 = and i32 %8, %Inv_Mask +; CHECK: br label %partword.cmpxchg.loop +; CHECK:partword.cmpxchg.loop: +; CHECK: %10 = phi i32 [ %9, %entry ], [ %16, %partword.cmpxchg.failure ] +; CHECK: %11 = or i32 %10, %5 +; CHECK: %12 = or i32 %10, %7 +; CHECK: %13 = cmpxchg i32* %AlignedAddr, i32 %12, i32 %11 monotonic monotonic +; CHECK: %14 = extractvalue { i32, i1 } %13, 0 +; CHECK: %15 = extractvalue { i32, i1 } %13, 1 +; CHECK: br i1 %15, label %partword.cmpxchg.end, label 
%partword.cmpxchg.failure +; CHECK:partword.cmpxchg.failure: +; CHECK: %16 = and i32 %14, %Inv_Mask +; CHECK: %17 = icmp ne i32 %10, %16 +; CHECK: br i1 %17, label %partword.cmpxchg.loop, label %partword.cmpxchg.end +; CHECK:partword.cmpxchg.end: +; CHECK: %18 = lshr i32 %14, %ShiftAmt +; CHECK: %19 = trunc i32 %18 to i16 +; CHECK: %20 = insertvalue { i16, i1 } undef, i16 %19, 0 +; CHECK: %21 = insertvalue { i16, i1 } %20, i1 %15, 1 +; CHECK: fence seq_cst +; CHECK: %ret = extractvalue { i16, i1 } %21, 0 +; CHECK: ret i16 %ret +define i16 @test_cmpxchg_i16(i16* %arg, i16 %old, i16 %new) { +entry: + %ret_succ = cmpxchg i16* %arg, i16 %old, i16 %new seq_cst monotonic + %ret = extractvalue { i16, i1 } %ret_succ, 0 + ret i16 %ret +} + + +; CHECK-LABEL: @test_add_i16( +; CHECK: fence seq_cst +; CHECK: %0 = ptrtoint i16* %arg to i64 +; CHECK: %1 = and i64 %0, -4 +; CHECK: %AlignedAddr = inttoptr i64 %1 to i32* +; CHECK: %PtrLSB = and i64 %0, 3 +; CHECK: %2 = xor i64 %PtrLSB, 2 +; CHECK: %3 = shl i64 %2, 3 +; CHECK: %ShiftAmt = trunc i64 %3 to i32 +; CHECK: %Mask = shl i32 65535, %ShiftAmt +; CHECK: %Inv_Mask = xor i32 %Mask, -1 +; CHECK: %4 = zext i16 %val to i32 +; CHECK: %ValOperand_Shifted = shl i32 %4, %ShiftAmt +; CHECK: %5 = load i32, i32* %AlignedAddr, align 4 +; CHECK: br label %atomicrmw.start +; CHECK:atomicrmw.start: +; CHECK: %loaded = phi i32 [ %5, %entry ], [ %newloaded, %atomicrmw.start ] +; CHECK: %new = add i32 %loaded, %ValOperand_Shifted +; CHECK: %6 = and i32 %new, %Mask +; CHECK: %7 = and i32 %loaded, %Inv_Mask +; CHECK: %8 = or i32 %7, %6 +; CHECK: %9 = cmpxchg i32* %AlignedAddr, i32 %loaded, i32 %8 monotonic monotonic +; CHECK: %success = extractvalue { i32, i1 } %9, 1 +; CHECK: %newloaded = extractvalue { i32, i1 } %9, 0 +; CHECK: br i1 %success, label %atomicrmw.end, label %atomicrmw.start +; CHECK:atomicrmw.end: +; CHECK: %10 = lshr i32 %newloaded, %ShiftAmt +; CHECK: %11 = trunc i32 %10 to i16 +; CHECK: fence seq_cst +; CHECK: ret i16 %11 +define i16 @test_add_i16(i16* %arg, i16 %val) { +entry: + %ret = atomicrmw add i16* %arg, i16 %val seq_cst + ret i16 %ret +} + +; CHECK-LABEL: @test_xor_i16( +; (I'm going to just assert on the bits that differ from add, above.) +; CHECK:atomicrmw.start: +; CHECK: %new = xor i32 %loaded, %ValOperand_Shifted +; CHECK: %6 = cmpxchg i32* %AlignedAddr, i32 %loaded, i32 %new monotonic monotonic +; CHECK:atomicrmw.end: +define i16 @test_xor_i16(i16* %arg, i16 %val) { +entry: + %ret = atomicrmw xor i16* %arg, i16 %val seq_cst + ret i16 %ret +} + +; CHECK-LABEL: @test_min_i16( +; CHECK:atomicrmw.start: +; CHECK: %6 = lshr i32 %loaded, %ShiftAmt +; CHECK: %7 = trunc i32 %6 to i16 +; CHECK: %8 = icmp sle i16 %7, %val +; CHECK: %new = select i1 %8, i16 %7, i16 %val +; CHECK: %9 = zext i16 %new to i32 +; CHECK: %10 = shl i32 %9, %ShiftAmt +; CHECK: %11 = and i32 %loaded, %Inv_Mask +; CHECK: %12 = or i32 %11, %10 +; CHECK: %13 = cmpxchg i32* %AlignedAddr, i32 %loaded, i32 %12 monotonic monotonic +; CHECK:atomicrmw.end: +define i16 @test_min_i16(i16* %arg, i16 %val) { +entry: + %ret = atomicrmw min i16* %arg, i16 %val seq_cst + ret i16 %ret +}
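
For readers who want the shape of the expansion outside of LLVM IR, below is a standalone C++ sketch (illustrative only, not part of the patch, and not strictly portable C++ since it type-puns raw storage as std::atomic<uint32_t>). It re-derives the AlignedAddr/ShiftAmt/Mask/Inv_Mask values the way createMaskInstrs does, then shows (a) a widened cmpxchg with the retry loop from expandPartwordCmpXchg that filters out spurious failures caused by stores to the neighbouring bytes, and (b) a partword fetch_add in the style of expandPartwordAtomicRMW/performMaskedAtomicOp. The helper names and the std::atomic<uint32_t> harness are assumptions of the sketch, and it is written for a little-endian host; a big-endian target such as SPARC computes the shift from (lsb ^ (4 - sizeof(T))) instead, as the pass does.

#include <atomic>
#include <cstdint>
#include <cstdio>

// Widen a 1- or 2-byte compare-and-swap to a 4-byte CAS.  Returns true on
// success; on failure, 'expected' is updated with the current partword.
template <typename T>
static bool partword_cmpxchg(T *addr, T &expected, T desired) {
  static_assert(sizeof(T) < 4, "only sub-word sizes get widened");
  uintptr_t ip = reinterpret_cast<uintptr_t>(addr);
  auto *aligned = reinterpret_cast<std::atomic<uint32_t> *>(ip & ~uintptr_t(3));
  unsigned shift = unsigned(ip & 3) * 8;                           // ShiftAmt
  uint32_t mask = ((uint32_t(1) << (8 * sizeof(T))) - 1) << shift; // Mask
  uint32_t inv_mask = ~mask;                                       // Inv_Mask
  uint32_t cmp_shifted = uint32_t(expected) << shift;
  uint32_t new_shifted = uint32_t(desired) << shift;

  // Initial guess at the bytes we are *not* operating on.
  uint32_t loaded_maskout = aligned->load(std::memory_order_relaxed) & inv_mask;
  for (;;) {
    // Splice the expected/new partword into those surrounding bytes.
    uint32_t full_cmp = loaded_maskout | cmp_shifted;
    uint32_t full_new = loaded_maskout | new_shifted;
    uint32_t old_val = full_cmp;
    if (aligned->compare_exchange_strong(old_val, full_new,
                                         std::memory_order_seq_cst))
      return true;                    // the partword matched and was stored
    uint32_t old_maskout = old_val & inv_mask;
    if (old_maskout != loaded_maskout) {
      // Only the neighbouring bytes changed: retry, so that a store to the
      // other part of the word cannot cause a spurious partword failure.
      loaded_maskout = old_maskout;
      continue;
    }
    // The partword itself mismatched: report the value actually seen.
    expected = T((old_val & mask) >> shift);
    return false;
  }
}

// Partword atomicrmw add: do the add on the shifted value, mask the result
// back into the word, and loop on an ordinary full-word CAS.
template <typename T>
static T partword_fetch_add(T *addr, T inc) {
  static_assert(sizeof(T) < 4, "only sub-word sizes get widened");
  uintptr_t ip = reinterpret_cast<uintptr_t>(addr);
  auto *aligned = reinterpret_cast<std::atomic<uint32_t> *>(ip & ~uintptr_t(3));
  unsigned shift = unsigned(ip & 3) * 8;
  uint32_t mask = ((uint32_t(1) << (8 * sizeof(T))) - 1) << shift;
  uint32_t inc_shifted = uint32_t(inc) << shift;

  uint32_t loaded = aligned->load(std::memory_order_relaxed);
  for (;;) {
    // Add in place, then keep only the partword bits of the sum (any carry
    // out of the partword is discarded by the mask, as in the IR).
    uint32_t full_new = (loaded & ~mask) | ((loaded + inc_shifted) & mask);
    if (aligned->compare_exchange_strong(loaded, full_new,
                                         std::memory_order_seq_cst))
      return T((loaded & mask) >> shift);  // old partword value
    // On failure compare_exchange_strong refreshed 'loaded'; just retry.
  }
}

int main() {
  alignas(4) uint16_t buf[2] = {0x1234, 0xBEEF};
  uint16_t expected = 0xBEEF;
  bool ok = partword_cmpxchg(&buf[1], expected, uint16_t(0xCAFE));
  uint16_t old = partword_fetch_add(&buf[0], uint16_t(1));
  std::printf("cas ok=%d  buf = {0x%04x, 0x%04x}  old = 0x%04x\n",
              ok, unsigned(buf[0]), unsigned(buf[1]), unsigned(old));
}

Using a strong inner cmpxchg rather than a weak one is the same trade-off the pass makes: the masked-out comparison is what distinguishes a spurious failure (neighbouring bytes changed) from a real partword mismatch.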