[AArch64] Emit clrex in the expanded cmpxchg fail block.

In the comparison failure block of a cmpxchg expansion, the initial
ldrex/ldxr will not be followed by a matching strex/stxr.
On ARM/AArch64, this unnecessarily ties up the exclusive monitor,
which might have a negative performance impact on some uarchs.

Instead, release the monitor in the failure block.
The clrex instruction was designed for this: use it.
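
Schematically, the expanded sequence for an i32 acquire cmpxchg then becomes
(a sketch; register and label numbers are illustrative, per the updated tests):

  .LBB0_1:                       ; loop: cmpxchg.start / trystore
    ldaxr   w8, [x0]             ; load-exclusive: sets the exclusive monitor
    cmp     w8, w1
    b.ne    .LBB0_3              ; comparison failed: no stxr will execute
    stxr    w9, w2, [x0]         ; store-exclusive: clears the monitor
    cbnz    w9, .LBB0_1
    b       .LBB0_4
  .LBB0_3:                       ; cmpxchg.nostore
    clrex                        ; release the monitor held by the ldaxr
  .LBB0_4:                       ; cmpxchg.end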

Also see ARMARM v8-A B2.10.2:
"Exclusive access instructions and Shareable memory locations".

Differential Revision: http://reviews.llvm.org/D13033

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@248291 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Ahmed Bougacha
Date:   2015-09-22 17:21:44 +00:00
Commit: ee629d8395 (parent: c8a4540af1)
6 changed files with 84 additions and 36 deletions

--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h

@@ -1097,6 +1097,14 @@ public:
   }
   /// @}
 
+  // Emits code that executes when the comparison result in the ll/sc
+  // expansion of a cmpxchg instruction is such that the store-conditional will
+  // not execute. This makes it possible to balance out the load-linked with
+  // a dedicated instruction, if desired.
+  // E.g., on ARM, if ldrex isn't followed by strex, the exclusive monitor would
+  // be unnecessarily held, except if clrex, inserted by this hook, is executed.
+  virtual void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const {}
+
   /// Returns true if the given (atomic) store should be expanded by the
   /// IR-level AtomicExpand pass into an "atomic xchg" which ignores its input.
   virtual bool shouldExpandAtomicStoreInIR(StoreInst *SI) const {
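
For comparison, a 32-bit ARM target could implement the same hook with the
pre-existing llvm.arm.clrex intrinsic; a minimal sketch, not part of this
patch (this commit only wires up AArch64):

  void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
      IRBuilder<> &Builder) const {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    // Release the exclusive monitor held by the preceding ldrex.
    Builder.CreateCall(
        llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
  }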

--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp

@@ -374,7 +374,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
   //     %loaded = @load.linked(%addr)
   //     %should_store = icmp eq %loaded, %desired
   //     br i1 %should_store, label %cmpxchg.trystore,
-  //                          label %cmpxchg.failure
+  //                          label %cmpxchg.nostore
   // cmpxchg.trystore:
   //     %stored = @store_conditional(%new, %addr)
   //     %success = icmp eq i32 %stored, 0
@@ -382,6 +382,9 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
   // cmpxchg.success:
   //     fence?
   //     br label %cmpxchg.end
+  // cmpxchg.nostore:
+  //     @load_linked_fail_balance()?
+  //     br label %cmpxchg.failure
   // cmpxchg.failure:
   //     fence?
   //     br label %cmpxchg.end
@@ -392,7 +395,8 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
   // [...]
   BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end");
   auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
-  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, FailureBB);
+  auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
+  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
   auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB);
   auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB);
@@ -416,7 +420,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
 
   // If the cmpxchg doesn't actually need any ordering when it fails, we can
   // jump straight past that fence instruction (if it exists).
-  Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB);
+  Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
 
   Builder.SetInsertPoint(TryStoreBB);
   Value *StoreSuccess = TLI->emitStoreConditional(
@@ -432,6 +436,13 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
                           /*IsLoad=*/true);
   Builder.CreateBr(ExitBB);
 
+  Builder.SetInsertPoint(NoStoreBB);
+  // In the failing case, where we don't execute the store-conditional, the
+  // target might want to balance out the load-linked with a dedicated
+  // instruction (e.g., on ARM, clearing the exclusive monitor).
+  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
+  Builder.CreateBr(FailureBB);
+
   Builder.SetInsertPoint(FailureBB);
   TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
                          /*IsLoad=*/true);

--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp

@@ -9682,6 +9682,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
       cast<PointerType>(Addr->getType())->getElementType());
 }
 
+void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
+    IRBuilder<> &Builder) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Builder.CreateCall(
+      llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
+}
+
 Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                                                    Value *Val, Value *Addr,
                                                    AtomicOrdering Ord) const {
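
With that override in place, the IR left in the no-store block after
expansion is roughly the following (a sketch; block names follow the
expansion pass):

  cmpxchg.nostore:
    call void @llvm.aarch64.clrex()
    br label %cmpxchg.failure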

--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h

@@ -348,6 +348,8 @@ public:
   Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                               Value *Addr, AtomicOrdering Ord) const override;
 
+  void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
+
   TargetLoweringBase::AtomicExpansionKind
   shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
   bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;

--- a/test/CodeGen/AArch64/arm64-atomic.ll
+++ b/test/CodeGen/AArch64/arm64-atomic.ll

@@ -2,13 +2,17 @@
 define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 {
 ; CHECK-LABEL: val_compare_and_swap:
-; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0]
+; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0
+; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
+; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: cmp [[RESULT]], w1
-; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x0]
-; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]]
-; CHECK-NEXT: [[LABEL2]]:
+; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
+; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
+; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: [[FAILBB]]:
+; CHECK-NEXT: clrex
+; CHECK-NEXT: [[EXITBB]]:
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -17,13 +21,16 @@ define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 {
 define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 {
 ; CHECK-LABEL: val_compare_and_swap_from_load:
 ; CHECK-NEXT: ldr [[NEW:w[0-9]+]], [x2]
-; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0]
 ; CHECK-NEXT: cmp [[RESULT]], w1
-; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0]
-; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]]
-; CHECK-NEXT: [[LABEL2]]:
+; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
+; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: [[FAILBB]]:
+; CHECK-NEXT: clrex
+; CHECK-NEXT: [[EXITBB]]:
   %new = load i32, i32* %pnew
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
@@ -32,13 +39,17 @@ define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 {
 define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 {
 ; CHECK-LABEL: val_compare_and_swap_rel:
-; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x0]
+; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0
+; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
+; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: cmp [[RESULT]], w1
-; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x0]
-; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]]
-; CHECK-NEXT: [[LABEL2]]:
+; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
+; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
+; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: [[FAILBB]]:
+; CHECK-NEXT: clrex
+; CHECK-NEXT: [[EXITBB]]:
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -47,13 +58,16 @@ define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 {
 define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 {
 ; CHECK-LABEL: val_compare_and_swap_64:
 ; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0
-; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK-NEXT: ldxr [[RESULT:x[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: cmp [[RESULT]], x1
-; CHECK-NEXT: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]]
-; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[LABEL]]
-; CHECK-NEXT: [[LABEL2]]:
+; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
+; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: [[FAILBB]]:
+; CHECK-NEXT: clrex
+; CHECK-NEXT: [[EXITBB]]:
   %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic
   %val = extractvalue { i64, i1 } %pair, 0
   ret i64 %val
@@ -61,13 +75,13 @@ define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 {
 define i32 @fetch_and_nand(i32* %p) #0 {
 ; CHECK-LABEL: fetch_and_nand:
-; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0]
 ; CHECK: mvn [[TMP_REG:w[0-9]+]], w[[DEST_REG]]
 ; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], [[TMP_REG]], #0xfffffff8
 ; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
 ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
-; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]]
 ; CHECK: mov x0, x[[DEST_REG]]
   %val = atomicrmw nand i32* %p, i32 7 release
   ret i32 %val
@@ -76,12 +90,12 @@ define i32 @fetch_and_nand(i32* %p) #0 {
 define i64 @fetch_and_nand_64(i64* %p) #0 {
 ; CHECK-LABEL: fetch_and_nand_64:
 ; CHECK: mov x[[ADDR:[0-9]+]], x0
-; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK: ldaxr x[[DEST_REG:[0-9]+]], [x[[ADDR]]]
 ; CHECK: mvn w[[TMP_REG:[0-9]+]], w[[DEST_REG]]
 ; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], x[[TMP_REG]], #0xfffffffffffffff8
 ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]]
-; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]]
   %val = atomicrmw nand i64* %p, i64 7 acq_rel
   ret i64 %val
@@ -90,12 +104,12 @@ define i64 @fetch_and_nand_64(i64* %p) #0 {
 define i32 @fetch_and_or(i32* %p) #0 {
 ; CHECK-LABEL: fetch_and_or:
 ; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #0x5
-; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0]
 ; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]]
 ; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
 ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
-; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]]
 ; CHECK: mov x0, x[[DEST_REG]]
   %val = atomicrmw or i32* %p, i32 5 seq_cst
   ret i32 %val
@@ -104,11 +118,11 @@ define i32 @fetch_and_or(i32* %p) #0 {
 define i64 @fetch_and_or_64(i64* %p) #0 {
 ; CHECK: fetch_and_or_64:
 ; CHECK: mov x[[ADDR:[0-9]+]], x0
-; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]]
 ; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0x7
 ; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]]
-; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: cbnz [[SCRATCH_REG]], [[TRYBB]]
   %val = atomicrmw or i64* %p, i64 7 monotonic
   ret i64 %val
 }

--- a/test/CodeGen/AArch64/atomic-ops.ll
+++ b/test/CodeGen/AArch64/atomic-ops.ll

@@ -893,6 +893,8 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
 ; CHECK: stxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
+; CHECK: [[GET_OUT]]:
+; CHECK: clrex
 ; CHECK-NOT: dmb
 ; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
@@ -916,6 +918,8 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
 ; CHECK: stlxrh [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
+; CHECK: [[GET_OUT]]:
+; CHECK: clrex
 ; CHECK-NOT: dmb
 ; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
@@ -927,21 +931,21 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
   %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
   %old = extractvalue { i32, i1 } %pair, 0
 
+; CHECK: mov {{[xw]}}[[WANTED:[0-9]+]], {{[xw]}}0
 ; CHECK-NOT: dmb
 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
 
 ; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
 ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-; w0 below is a reasonable guess but could change: it certainly comes into the
-; function there.
-; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: cmp w[[OLD]], w[[WANTED]]
 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
 ; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
+; CHECK: [[GET_OUT]]:
+; CHECK: clrex
 ; CHECK-NOT: dmb
 ; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
 
   ret i32 %old
 }
@@ -963,6 +967,8 @@ define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
 ; As above, w1 is a reasonable guess.
 ; CHECK: stxr [[STATUS:w[0-9]+]], x1, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
+; CHECK: [[GET_OUT]]:
+; CHECK: clrex
 ; CHECK-NOT: dmb
 ; CHECK: str x[[OLD]],