[X86] Make wide loads be managed by AtomicExpand

Summary: AtomicExpand already had logic for expanding wide loads and stores on LL/SC architectures, and for expanding wide stores on CmpXchg architectures, but not for wide loads on CmpXchg architectures. This patch fills this hole, and makes use of this new feature in the X86 backend. Only one functionnal change: we now lose the SynchScope attribute. It is regrettable, but I have another patch that I will submit soon that will solve this for all of AtomicExpand (it seemed better to split it apart as it is a different concern). Test Plan: make check-all (lots of tests for this functionality already exist) Reviewers: jfb Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D5404 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218332 91177308-0d34-0410-b5e6-96231b3b80d8
2024-12-02 00:37:09 +00:00 · 2014-09-23 20:59:25 +00:00 · 2014-09-23 20:59:25 +00:00 · 30e7514d01
commit 30e7514d01
parent 58bca6e8ec
3 changed files with 41 additions and 32 deletions
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@ -44,6 +44,8 @@ namespace {
    bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
                               bool IsStore, bool IsLoad);
    bool expandAtomicLoad(LoadInst *LI);
+    bool expandAtomicLoadToLL(LoadInst *LI);
+    bool expandAtomicLoadToCmpXchg(LoadInst *LI);
    bool expandAtomicStore(StoreInst *SI);
    bool expandAtomicRMW(AtomicRMWInst *AI);
    bool expandAtomicRMWToLLSC(AtomicRMWInst *AI);
@ -160,6 +162,15 @@ bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
 }

 bool AtomicExpand::expandAtomicLoad(LoadInst *LI) {
+   if (TM->getSubtargetImpl()
+          ->getTargetLowering()
+          ->hasLoadLinkedStoreConditional())
+    return expandAtomicLoadToLL(LI);
+  else
+    return expandAtomicLoadToCmpXchg(LI);
+}
+
+bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
  IRBuilder<> Builder(LI);

@ -175,6 +186,24 @@ bool AtomicExpand::expandAtomicLoad(LoadInst *LI) {
  return true;
 }

+bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
+  IRBuilder<> Builder(LI);
+  AtomicOrdering Order = LI->getOrdering();
+  Value *Addr = LI->getPointerOperand();
+  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
+  Constant *DummyVal = Constant::getNullValue(Ty);
+
+  Value *Pair = Builder.CreateAtomicCmpXchg(
+      Addr, DummyVal, DummyVal, Order,
+      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
+  Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");
+
+  LI->replaceAllUsesWith(Loaded);
+  LI->eraseFromParent();
+
+  return true;
+}
+
 bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
  // This function is only called on atomic stores that are too large to be
  // atomic if implemented as a native store. So we replace them by an
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -17477,8 +17477,11 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return needsCmpXchgNb(SI->getValueOperand()->getType());
 }

-bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *SI) const {
-  return false; // FIXME, currently these are expanded separately in this file.
+// Note: this turns large loads into lock cmpxchg8b/16b.
+// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
+bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
+  return needsCmpXchgNb(PTy->getElementType());
 }

 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
@ -17855,29 +17858,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  }
 }

-static void ReplaceATOMIC_LOAD(SDNode *Node,
-                               SmallVectorImpl<SDValue> &Results,
-                               SelectionDAG &DAG) {
-  SDLoc dl(Node);
-  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
-
-  // Convert wide load -> cmpxchg8b/cmpxchg16b
-  // FIXME: On 32-bit, load -> fild or movq would be more efficient
-  //        (The only way to get a 16-byte load is cmpxchg16b)
-  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
-  SDValue Zero = DAG.getConstant(0, VT);
-  SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
-  SDValue Swap =
-      DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs,
-                           Node->getOperand(0), Node->getOperand(1), Zero, Zero,
-                           cast<AtomicSDNode>(Node)->getMemOperand(),
-                           cast<AtomicSDNode>(Node)->getOrdering(),
-                           cast<AtomicSDNode>(Node)->getOrdering(),
-                           cast<AtomicSDNode>(Node)->getSynchScope());
-  Results.push_back(Swap.getValue(0));
-  Results.push_back(Swap.getValue(2));
-}
-
 /// ReplaceNodeResults - Replace a node with an illegal result type
 /// with a new node built out of custom code.
 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
@ -18036,12 +18016,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_LOAD: {
    // Delegate to generic TypeLegalization. Situations we can really handle
    // should have already been dealt with by AtomicExpandPass.cpp.
    break;
-  case ISD::ATOMIC_LOAD: {
-    ReplaceATOMIC_LOAD(N, Results, DAG);
-    return;
  }
  case ISD::BITCAST: {
    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
--- a/test/CodeGen/X86/atomic-load-store-wide.ll
+++ b/test/CodeGen/X86/atomic-load-store-wide.ll
@ -4,16 +4,18 @@
 ; FIXME: The generated code can be substantially improved.

 define void @test1(i64* %ptr, i64 %val1) {
-; CHECK: test1
-; CHECK: cmpxchg8b
+; CHECK-LABEL: test1
+; CHECK: lock
+; CHECK-NEXT: cmpxchg8b
 ; CHECK-NEXT: jne
  store atomic i64 %val1, i64* %ptr seq_cst, align 8
  ret void
 }

 define i64 @test2(i64* %ptr) {
-; CHECK: test2
-; CHECK: cmpxchg8b
+; CHECK-LABEL: test2
+; CHECK: lock
+; CHECK-NEXT: cmpxchg8b
  %val = load atomic i64* %ptr seq_cst, align 8
  ret i64 %val
 }