From f9eea5b8c638adba5f6692683cc9b995952cabb8 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Thu, 17 Sep 2020 15:39:50 -0700
Subject: [PATCH] [AArch64] Enable implicit null check transformation

This change enables the generic implicit null check transformation for the
AArch64 target.

As background for those unfamiliar with our implicit null check support: an
implicit null check uses a signal handler to catch a faulting access through
a null pointer and redirect execution to a handler block. Specifically, it
replaces an explicit conditional branch with such a redirect. This is only
done for very cold branches, under frontend control via appropriate metadata.

FAULTING_OP is used to wrap the faulting instruction. It is modelled as a
conditional branch to reflect the fact that it can transfer control in the
CFG. FAULTING_OP does not need to be an analyzable branch to achieve its
purpose. (Or at least, that's the x86 model. I find this slightly
questionable.) When lowering to MC, we convert the FAULTING_OP back into the
actual instruction, record the labels, and lower the original instruction.
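For readers new to the mechanism, here is a minimal sketch of the shape being
transformed, distilled from the tests added in this patch (the function name
and constants are illustrative only). The frontend emits an explicit, cold
null check tagged with !make.implicit metadata:

  define i32 @example(i32* %x) {
  entry:
    %c = icmp eq i32* %x, null
    br i1 %c, label %is_null, label %not_null, !make.implicit !0

  not_null:
    %t = load i32, i32* %x
    ret i32 %t

  is_null:
    ret i32 42
  }

  !0 = !{}

After the ImplicitNullChecks pass wraps the load in a FAULTING_OP and the
AsmPrinter lowers it back, the load itself becomes the null check and the
handler label is recorded in the fault map; on AArch64 this now emits roughly:

  .Ltmp0:
    ldr w0, [x0]   // on-fault: .LBB0_2
    b   .LBB0_1    // branch to the fallthrough block; not yet folded away,
                   // as discussed below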
As can be seen in the test changes, currently the AArch64 backend does not
eliminate the unconditional branch to the fallthrough block. I've tried two
approaches, neither of which worked. I plan to return to this in a separate
change set once I've wrapped my head around the interactions a bit better.
(X86 handles this via AllowModify on analyzeBranch, but adding the obvious
code causes BranchFolding to crash. I haven't yet figured out if it's a
latent bug in BranchFolding, or something I'm doing wrong.)

Differential Revision: https://reviews.llvm.org/D87851
---
 lib/CodeGen/BranchRelaxation.cpp            | 36 +++++-----
 lib/CodeGen/ImplicitNullChecks.cpp          | 66 ++++++++++--------
 lib/Target/AArch64/AArch64AsmPrinter.cpp    | 48 ++++++++++++-
 lib/Target/AArch64/AArch64InstrInfo.cpp     | 50 ++++++++++++++
 lib/Target/AArch64/AArch64InstrInfo.h       |  3 +
 test/CodeGen/AArch64/implicit-null-check.ll | 75 +++++++++++++--------
 6 files changed, 204 insertions(+), 74 deletions(-)

diff --git a/lib/CodeGen/BranchRelaxation.cpp b/lib/CodeGen/BranchRelaxation.cpp
index 5a3ec1a36f9..366c303614d 100644
--- a/lib/CodeGen/BranchRelaxation.cpp
+++ b/lib/CodeGen/BranchRelaxation.cpp
@@ -507,25 +507,31 @@ bool BranchRelaxation::relaxBranchInstructions() {
       Next = std::next(J);
       MachineInstr &MI = *J;
 
-      if (MI.isConditionalBranch()) {
-        MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI);
-        if (!isBlockInRange(MI, *DestBB)) {
-          if (Next != MBB.end() && Next->isConditionalBranch()) {
-            // If there are multiple conditional branches, this isn't an
-            // analyzable block. Split later terminators into a new block so
-            // each one will be analyzable.
+      if (!MI.isConditionalBranch())
+        continue;
 
-            splitBlockBeforeInstr(*Next, DestBB);
-          } else {
-            fixupConditionalBranch(MI);
-            ++NumConditionalRelaxed;
-          }
+      if (MI.getOpcode() == TargetOpcode::FAULTING_OP)
+        // FAULTING_OP's destination is not encoded in the instruction stream
+        // and thus never needs relaxed.
+        continue;
 
-          Changed = true;
+      MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI);
+      if (!isBlockInRange(MI, *DestBB)) {
+        if (Next != MBB.end() && Next->isConditionalBranch()) {
+          // If there are multiple conditional branches, this isn't an
+          // analyzable block. Split later terminators into a new block so
+          // each one will be analyzable.
 
-          // This may have modified all of the terminators, so start over.
-          Next = MBB.getFirstTerminator();
+          splitBlockBeforeInstr(*Next, DestBB);
+        } else {
+          fixupConditionalBranch(MI);
+          ++NumConditionalRelaxed;
         }
+
+        Changed = true;
+
+        // This may have modified all of the terminators, so start over.
+        Next = MBB.getFirstTerminator();
       }
     }
   }
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index 9030f322683..c2b764e5580 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -373,10 +373,14 @@ ImplicitNullChecks::isSuitableMemoryOp(const MachineInstr &MI,
   bool OffsetIsScalable;
   const MachineOperand *BaseOp;
 
+  // Implementation restriction for faulting_op insertion
+  // TODO: This could be relaxed if we find a test case which warrants it.
+  if (MI.getDesc().getNumDefs() > 1)
+    return SR_Unsuitable;
   // FIXME: This handles only simple addressing mode.
   if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
-    return SR_Unsuitable;
+    return SR_Unsuitable;
 
   // We need the base of the memory instruction to be same as the register
   // where the null check is performed (i.e. PointerReg).
@@ -502,9 +506,9 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
         MBP.Predicate == MachineBranchPredicate::PRED_EQ)))
     return false;
 
-  // If we cannot erase the test instruction itself, then making the null check
-  // implicit does not buy us much.
-  if (!MBP.SingleUseCondition)
+  // If there is a separate condition generation instruction, we choose not to
+  // transform unless we can remove both condition and consuming branch.
+  if (MBP.ConditionDef && !MBP.SingleUseCondition)
     return false;
 
   MachineBasicBlock *NotNullSucc, *NullSucc;
@@ -522,32 +526,34 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
   if (NotNullSucc->pred_size() != 1)
     return false;
 
-  // To prevent the invalid transformation of the following code:
-  //
-  //   mov %rax, %rcx
-  //   test %rax, %rax
-  //   %rax = ...
-  //   je throw_npe
-  //   mov(%rcx), %r9
-  //   mov(%rax), %r10
-  //
-  // into:
-  //
-  //   mov %rax, %rcx
-  //   %rax = ....
-  //   faulting_load_op("movl (%rax), %r10", throw_npe)
-  //   mov(%rcx), %r9
-  //
-  // we must ensure that there are no instructions between the 'test' and
-  // conditional jump that modify %rax.
   const Register PointerReg = MBP.LHS.getReg();
 
-  assert(MBP.ConditionDef->getParent() == &MBB && "Should be in basic block");
-
-  for (auto I = MBB.rbegin(); MBP.ConditionDef != &*I; ++I)
-    if (I->modifiesRegister(PointerReg, TRI))
-      return false;
+  if (MBP.ConditionDef) {
+    // To prevent the invalid transformation of the following code:
+    //
+    //   mov %rax, %rcx
+    //   test %rax, %rax
+    //   %rax = ...
+    //   je throw_npe
+    //   mov(%rcx), %r9
+    //   mov(%rax), %r10
+    //
+    // into:
+    //
+    //   mov %rax, %rcx
+    //   %rax = ....
+    //   faulting_load_op("movl (%rax), %r10", throw_npe)
+    //   mov(%rcx), %r9
+    //
+    // we must ensure that there are no instructions between the 'test' and
+    // conditional jump that modify %rax.
+    assert(MBP.ConditionDef->getParent() == &MBB &&
+           "Should be in basic block");
+    for (auto I = MBB.rbegin(); MBP.ConditionDef != &*I; ++I)
+      if (I->modifiesRegister(PointerReg, TRI))
+        return false;
+  }
 
   // Starting with a code fragment like:
   //
   //   test %rax, %rax
@@ -726,9 +732,11 @@ void ImplicitNullChecks::rewriteNullChecks(
     }
 
     NC.getMemOperation()->eraseFromParent();
-    NC.getCheckOperation()->eraseFromParent();
+    if (auto *CheckOp = NC.getCheckOperation())
+      CheckOp->eraseFromParent();
 
-    // Insert an *unconditional* branch to not-null successor.
+    // Insert an *unconditional* branch to not-null successor - we expect
+    // block placement to remove fallthroughs later.
     TII->insertBranch(*NC.getCheckBlock(), NC.getNotNullSucc(), nullptr,
                       /*Cond=*/None, DL);
 
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 30ac7f4c0d2..da8447f91f3 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -32,6 +32,7 @@
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/FaultMaps.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -69,12 +70,13 @@ namespace {
 class AArch64AsmPrinter : public AsmPrinter {
   AArch64MCInstLower MCInstLowering;
   StackMaps SM;
+  FaultMaps FM;
   const AArch64Subtarget *STI;
 
 public:
   AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
       : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this),
-        SM(*this) {}
+        SM(*this), FM(*this) {}
 
   StringRef getPassName() const override { return "AArch64 Assembly Printer"; }
 
@@ -97,6 +99,7 @@ public:
                              const MachineInstr &MI);
   void LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM,
                        const MachineInstr &MI);
+  void LowerFAULTING_OP(const MachineInstr &MI);
 
   void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
   void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
@@ -524,7 +527,11 @@ void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) {
     // generates code that does this, it is always safe to set.
     OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
   }
+
+  // Emit stack and fault map information.
   emitStackMaps(SM);
+  FM.serializeToFaultMapSection();
+
 }
 
 void AArch64AsmPrinter::EmitLOHs() {
@@ -970,6 +977,42 @@ void AArch64AsmPrinter::LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM,
   SM.recordStatepoint(*MILabel, MI);
 }
 
+void AArch64AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI) {
+  // FAULTING_LOAD_OP <def>, <faulting type>, <MBB handler>,
+  //                  <opcode>, <operands>
+
+  Register DefRegister = FaultingMI.getOperand(0).getReg();
+  FaultMaps::FaultKind FK =
+      static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
+  MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol();
+  unsigned Opcode = FaultingMI.getOperand(3).getImm();
+  unsigned OperandsBeginIdx = 4;
+
+  auto &Ctx = OutStreamer->getContext();
+  MCSymbol *FaultingLabel = Ctx.createTempSymbol();
+  OutStreamer->emitLabel(FaultingLabel);
+
+  assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
+  FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel);
+
+  MCInst MI;
+  MI.setOpcode(Opcode);
+
+  if (DefRegister != (Register)0)
+    MI.addOperand(MCOperand::createReg(DefRegister));
+
+  for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx,
+            E = FaultingMI.operands_end();
+       I != E; ++I) {
+    MCOperand Dest;
+    lowerOperand(*I, Dest);
+    MI.addOperand(Dest);
+  }
+
+  OutStreamer->AddComment("on-fault: " + HandlerLabel->getName());
+  OutStreamer->emitInstruction(MI, getSubtargetInfo());
+}
+
 void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
   Register DestReg = MI.getOperand(0).getReg();
   if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
@@ -1254,6 +1297,9 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case TargetOpcode::STATEPOINT:
     return LowerSTATEPOINT(*OutStreamer, SM, *MI);
 
+  case TargetOpcode::FAULTING_OP:
+    return LowerFAULTING_OP(*MI);
+
   case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
     LowerPATCHABLE_FUNCTION_ENTER(*MI);
     return;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index fb26b2430bf..3d1cf767cfc 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -328,6 +328,56 @@ bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
   return true;
 }
 
+bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
+                                              MachineBranchPredicate &MBP,
+                                              bool AllowModify) const {
+  // For the moment, handle only a block which ends with a cb(n)zx followed by
+  // a fallthrough.  Why this?  Because it is a common form.
+  // TODO: Should we handle b.cc?
+
+  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+  if (I == MBB.end())
+    return true;
+
+  // Skip over SpeculationBarrierEndBB terminators
+  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
+      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
+    --I;
+  }
+
+  if (!isUnpredicatedTerminator(*I))
+    return true;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = &*I;
+  unsigned LastOpc = LastInst->getOpcode();
+  if (!isCondBranchOpcode(LastOpc))
+    return true;
+
+  switch (LastOpc) {
+  default:
+    return true;
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    break;
+  };
+
+  MBP.TrueDest = LastInst->getOperand(1).getMBB();
+  assert(MBP.TrueDest && "expected!");
+  MBP.FalseDest = MBB.getNextNode();
+
+  MBP.ConditionDef = nullptr;
+  MBP.SingleUseCondition = false;
+
+  MBP.LHS = LastInst->getOperand(0);
+  MBP.RHS = MachineOperand::CreateImm(0);
+  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
+                                            : MachineBranchPredicate::PRED_EQ;
+  return false;
+}
+
 bool AArch64InstrInfo::reverseBranchCondition(
     SmallVectorImpl<MachineOperand> &Cond) const {
   if (Cond[0].getImm() != -1) {
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 298c04d8170..1a21d8474e0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -188,6 +188,9 @@ public:
                      MachineBasicBlock *&FBB,
                      SmallVectorImpl<MachineOperand> &Cond,
                      bool AllowModify = false) const override;
+  bool analyzeBranchPredicate(MachineBasicBlock &MBB,
+                              MachineBranchPredicate &MBP,
+                              bool AllowModify) const override;
   unsigned removeBranch(MachineBasicBlock &MBB,
                         int *BytesRemoved = nullptr) const override;
   unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
diff --git a/test/CodeGen/AArch64/implicit-null-check.ll b/test/CodeGen/AArch64/implicit-null-check.ll
index 5e7bb6f5bba..ab9b8dd3488 100644
--- a/test/CodeGen/AArch64/implicit-null-check.ll
+++ b/test/CodeGen/AArch64/implicit-null-check.ll
@@ -5,15 +5,14 @@
 ; file with the same name in the X86 tree, but adjusted to remove patterns
 ; related to memory folding of arithmetic (since aarch64 doesn't), and add
 ; a couple of aarch64 specific tests.
-; NOTE: Currently negative tests as these are being precommitted before
-; the changes to enable.
 
 define i32 @imp_null_check_load_fallthrough(i32* %x) {
 ; CHECK-LABEL: imp_null_check_load_fallthrough:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cbz x0, .LBB0_2
-; CHECK-NEXT:  // %bb.1: // %not_null
-; CHECK-NEXT:    ldr w0, [x0]
+; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:    ldr w0, [x0] // on-fault: .LBB0_2
+; CHECK-NEXT:    b .LBB0_1
+; CHECK-NEXT:  .LBB0_1: // %not_null
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_2: // %is_null
 ; CHECK-NEXT:    mov w0, #42
@@ -34,9 +33,10 @@
 define i32 @imp_null_check_load_reorder(i32* %x) {
 ; CHECK-LABEL: imp_null_check_load_reorder:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cbz x0, .LBB1_2
-; CHECK-NEXT:  // %bb.1: // %not_null
-; CHECK-NEXT:    ldr w0, [x0]
+; CHECK-NEXT:  .Ltmp1:
+; CHECK-NEXT:    ldr w0, [x0] // on-fault: .LBB1_2
+; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:  .LBB1_1: // %not_null
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB1_2: // %is_null
 ; CHECK-NEXT:    mov w0, #42
@@ -56,9 +56,10 @@
 define i32 @imp_null_check_unordered_load(i32* %x) {
 ; CHECK-LABEL: imp_null_check_unordered_load:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cbz x0, .LBB2_2
-; CHECK-NEXT:  // %bb.1: // %not_null
-; CHECK-NEXT:    ldr w0, [x0]
+; CHECK-NEXT:  .Ltmp2:
+; CHECK-NEXT:    ldr w0, [x0] // on-fault: .LBB2_2
+; CHECK-NEXT:    b .LBB2_1
+; CHECK-NEXT:  .LBB2_1: // %not_null
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB2_2: // %is_null
 ; CHECK-NEXT:    mov w0, #42
@@ -76,6 +77,8 @@ define i32 @imp_null_check_unordered_load(i32* %x) {
 }
 
+; TODO: Can be converted into implicit check.
+;; Probably could be implicit, but we're conservative for now
 define i32 @imp_null_check_seq_cst_load(i32* %x) {
 ; CHECK-LABEL: imp_null_check_seq_cst_load:
 ; CHECK:       // %bb.0: // %entry
@@ -125,9 +128,10 @@ define i32 @imp_null_check_volatile_load(i32* %x) {
 define i8 @imp_null_check_load_i8(i8* %x) {
 ; CHECK-LABEL: imp_null_check_load_i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cbz x0, .LBB5_2
-; CHECK-NEXT:  // %bb.1: // %not_null
-; CHECK-NEXT:    ldrb w0, [x0]
+; CHECK-NEXT:  .Ltmp3:
+; CHECK-NEXT:    ldrb w0, [x0] // on-fault: .LBB5_2
+; CHECK-NEXT:    b .LBB5_1
+; CHECK-NEXT:  .LBB5_1: // %not_null
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB5_2: // %is_null
 ; CHECK-NEXT:    mov w0, #42
@@ -176,9 +180,10 @@ define i256 @imp_null_check_load_i256(i256* %x) {
 define i32 @imp_null_check_gep_load(i32* %x) {
 ; CHECK-LABEL: imp_null_check_gep_load:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cbz x0, .LBB7_2
-; CHECK-NEXT:  // %bb.1: // %not_null
-; CHECK-NEXT:    ldr w0, [x0, #128]
+; CHECK-NEXT:  .Ltmp4:
+; CHECK-NEXT:    ldr w0, [x0, #128] // on-fault: .LBB7_2
+; CHECK-NEXT:    b .LBB7_1
+; CHECK-NEXT:  .LBB7_1: // %not_null
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB7_2: // %is_null
 ; CHECK-NEXT:    mov w0, #42
@@ -199,9 +204,10 @@ define i32 @imp_null_check_gep_load(i32* %x) {
 define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
 ; CHECK-LABEL: imp_null_check_add_result:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cbz x0, .LBB8_2
-; CHECK-NEXT:  // %bb.1: // %not_null
-; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:  .Ltmp5:
+; CHECK-NEXT:    ldr w8, [x0] // on-fault: .LBB8_2
+; CHECK-NEXT:    b .LBB8_1
+; CHECK-NEXT:  .LBB8_1: // %not_null
 ; CHECK-NEXT:    add w0, w8, w1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB8_2: // %is_null
@@ -225,9 +231,10 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
 define i32 @imp_null_check_hoist_over_udiv(i32* %x, i32 %a, i32 %b) {
 ; CHECK-LABEL: imp_null_check_hoist_over_udiv:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cbz x0, .LBB9_2
-; CHECK-NEXT:  // %bb.1: // %not_null
-; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:  .Ltmp6:
+; CHECK-NEXT:    ldr w8, [x0] // on-fault: .LBB9_2
+; CHECK-NEXT:    b .LBB9_1
+; CHECK-NEXT:  .LBB9_1: // %not_null
 ; CHECK-NEXT:    udiv w9, w1, w2
 ; CHECK-NEXT:    add w0, w8, w9
 ; CHECK-NEXT:    ret
@@ -249,6 +256,8 @@ define i32 @imp_null_check_hoist_over_udiv(i32* %x, i32 %a, i32 %b) {
 }
 
+; TODO: We should be able to hoist this - we can on x86, why isn't this
+; working for aarch64?  Aliasing?
 define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z) {
 ; CHECK-LABEL: imp_null_check_hoist_over_unrelated_load:
 ; CHECK:       // %bb.0: // %entry
@@ -278,9 +287,10 @@ define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z)
 define i32 @imp_null_check_gep_load_with_use_dep(i32* %x, i32 %a) {
 ; CHECK-LABEL: imp_null_check_gep_load_with_use_dep:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cbz x0, .LBB11_2
-; CHECK-NEXT:  // %bb.1: // %not_null
-; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:  .Ltmp7:
+; CHECK-NEXT:    ldr w8, [x0] // on-fault: .LBB11_2
+; CHECK-NEXT:    b .LBB11_1
+; CHECK-NEXT:  .LBB11_1: // %not_null
 ; CHECK-NEXT:    add w9, w0, w1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w0, w8, #4 // =4
@@ -304,6 +314,8 @@ define i32 @imp_null_check_gep_load_with_use_dep(i32* %x, i32 %a) {
   ret i32 %z
 }
 
+;; TODO: We could handle this case as we can lift the fence into the
+;; previous block before the conditional without changing behavior.
 define i32 @imp_null_check_load_fence1(i32* %x) {
 ; CHECK-LABEL: imp_null_check_load_fence1:
 ; CHECK:       // %bb.0: // %entry
@@ -328,6 +340,8 @@ not_null:
   ret i32 %t
 }
 
+;; TODO: We could handle this case as we can lift the fence into the
+;; previous block before the conditional without changing behavior.
 define i32 @imp_null_check_load_fence2(i32* %x) {
 ; CHECK-LABEL: imp_null_check_load_fence2:
 ; CHECK:       // %bb.0: // %entry
@@ -352,6 +366,7 @@ not_null:
   ret i32 %t
 }
 
+; TODO: We can fold to implicit null here, not sure why this isn't working
 define void @imp_null_check_store(i32* %x) {
 ; CHECK-LABEL: imp_null_check_store:
 ; CHECK:       // %bb.0: // %entry
@@ -374,6 +389,7 @@ define void @imp_null_check_store(i32* %x) {
   ret void
 }
 
+;; TODO: can be implicit
 define void @imp_null_check_unordered_store(i32* %x) {
 ; CHECK-LABEL: imp_null_check_unordered_store:
 ; CHECK:       // %bb.0: // %entry
@@ -399,9 +415,10 @@ define void @imp_null_check_unordered_store(i32* %x) {
 define i32 @imp_null_check_neg_gep_load(i32* %x) {
 ; CHECK-LABEL: imp_null_check_neg_gep_load:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cbz x0, .LBB16_2
-; CHECK-NEXT:  // %bb.1: // %not_null
-; CHECK-NEXT:    ldur w0, [x0, #-128]
+; CHECK-NEXT:  .Ltmp8:
+; CHECK-NEXT:    ldur w0, [x0, #-128] // on-fault: .LBB16_2
+; CHECK-NEXT:    b .LBB16_1
+; CHECK-NEXT:  .LBB16_1: // %not_null
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB16_2: // %is_null
 ; CHECK-NEXT:    mov w0, #42