diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index ae8d8b4f5df..e157fd37c6e 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -41,6 +41,7 @@ namespace llvm { FunctionPass *createPPCVSXCopyPass(); FunctionPass *createPPCVSXFMAMutatePass(); FunctionPass *createPPCVSXSwapRemovalPass(); + FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM); FunctionPass *createPPCTLSDynamicCallPass(); diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp new file mode 100644 index 00000000000..fe339d70d7d --- /dev/null +++ b/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -0,0 +1,230 @@ +//===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This pass performs peephole optimizations to clean up ugly code +// sequences at the MachineInstruction layer. It runs at the end of +// the SSA phases, following VSX swap removal. A pass of dead code +// elimination follows this one for quick clean-up of any dead +// instructions introduced here. Although we could do this as callbacks +// from the generic peephole pass, this would have a couple of bad +// effects: it might remove optimization opportunities for VSX swap +// removal, and it would miss cleanups made possible following VSX +// swap removal. +// +//===---------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-mi-peepholes" + +namespace llvm { + void initializePPCMIPeepholePass(PassRegistry&); +} + +namespace { + +struct PPCMIPeephole : public MachineFunctionPass { + + static char ID; + const PPCInstrInfo *TII; + MachineFunction *MF; + MachineRegisterInfo *MRI; + + PPCMIPeephole() : MachineFunctionPass(ID) { + initializePPCMIPeepholePass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize class variables. + void initialize(MachineFunction &MFParm); + + // Perform peepholes. + bool simplifyCode(void); + + // Find the "true" register represented by SrcReg (following chains + // of copies and subreg_to_reg operations). + unsigned lookThruCopyLike(unsigned SrcReg); + +public: + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + initialize(MF); + return simplifyCode(); + } +}; + +// Initialize class variables. +void PPCMIPeephole::initialize(MachineFunction &MFParm) { + MF = &MFParm; + MRI = &MF->getRegInfo(); + TII = MF->getSubtarget().getInstrInfo(); + DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n"); + DEBUG(MF->dump()); +} + +// Perform peephole optimizations. +bool PPCMIPeephole::simplifyCode(void) { + bool Simplified = false; + MachineInstr* ToErase = nullptr; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + + // If the previous instruction was marked for elimination, + // remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + + // Ignore debug instructions. + if (MI.isDebugValue()) + continue; + + // Per-opcode peepholes. + switch (MI.getOpcode()) { + + default: + break; + + case PPC::XXPERMDI: { + // Perform simplifications of 2x64 vector swaps and splats. + // A swap is identified by an immediate value of 2, and a splat + // is identified by an immediate value of 0 or 3. + int Immed = MI.getOperand(3).getImm(); + + if (Immed != 1) { + + // For each of these simplifications, we need the two source + // regs to match. Unfortunately, MachineCSE ignores COPY and + // SUBREG_TO_REG, so for example we can see + // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed. + // We have to look through chains of COPY and SUBREG_TO_REG + // to find the real source values for comparison. + unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg()); + unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg()); + + if (TrueReg1 == TrueReg2 + && TargetRegisterInfo::isVirtualRegister(TrueReg1)) { + MachineInstr *DefMI = MRI->getVRegDef(TrueReg1); + + // If this is a splat or a swap fed by another splat, we + // can replace it with a copy. + if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) { + unsigned FeedImmed = DefMI->getOperand(3).getImm(); + unsigned FeedReg1 + = lookThruCopyLike(DefMI->getOperand(1).getReg()); + unsigned FeedReg2 + = lookThruCopyLike(DefMI->getOperand(2).getReg()); + + if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) { + DEBUG(dbgs() + << "Optimizing splat/swap or splat/splat " + "to splat/copy: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), + TII->get(PPC::COPY), MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)); + ToErase = &MI; + Simplified = true; + } + + // If this is a splat fed by a swap, we can simplify modify + // the splat to splat the other value from the swap's input + // parameter. + else if ((Immed == 0 || Immed == 3) + && FeedImmed == 2 && FeedReg1 == FeedReg2) { + DEBUG(dbgs() << "Optimizing swap/splat => splat: "); + DEBUG(MI.dump()); + MI.getOperand(1).setReg(DefMI->getOperand(1).getReg()); + MI.getOperand(2).setReg(DefMI->getOperand(2).getReg()); + MI.getOperand(3).setImm(3 - Immed); + Simplified = true; + } + + // If this is a swap fed by a swap, we can replace it + // with a copy from the first swap's input. + else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) { + DEBUG(dbgs() << "Optimizing swap/swap => copy: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), + TII->get(PPC::COPY), MI.getOperand(0).getReg()) + .addOperand(DefMI->getOperand(1)); + ToErase = &MI; + Simplified = true; + } + } + } + } + break; + } + } + } + + // If the last instruction was marked for elimination, + // remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + } + + return Simplified; +} + +// This is used to find the "true" source register for an +// XXPERMDI instruction, since MachineCSE does not handle the +// "copy-like" operations (Copy and SubregToReg). Returns +// the original SrcReg unless it is the target of a copy-like +// operation, in which case we chain backwards through all +// such operations to the ultimate source register. If a +// physical register is encountered, we stop the search. +unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) { + + while (true) { + + MachineInstr *MI = MRI->getVRegDef(SrcReg); + if (!MI->isCopyLike()) + return SrcReg; + + unsigned CopySrcReg; + if (MI->isCopy()) + CopySrcReg = MI->getOperand(1).getReg(); + else { + assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike"); + CopySrcReg = MI->getOperand(2).getReg(); + } + + if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) + return CopySrcReg; + + SrcReg = CopySrcReg; + } +} + +} // end default namespace + +INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE, + "PowerPC MI Peephole Optimization", false, false) +INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE, + "PowerPC MI Peephole Optimization", false, false) + +char PPCMIPeephole::ID = 0; +FunctionPass* +llvm::createPPCMIPeepholePass() { return new PPCMIPeephole(); } + diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index df687b2cade..24a9ef0ef07 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -42,6 +42,10 @@ static cl:: opt DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden, cl::desc("Disable VSX Swap Removal for PPC")); +static cl:: +opt DisableMIPeephole("disable-ppc-peephole", cl::Hidden, + cl::desc("Disable machine peepholes for PPC")); + static cl::opt EnableGEPOpt("ppc-gep-opt", cl::Hidden, cl::desc("Enable optimizations on complex GEPs"), @@ -348,6 +352,12 @@ void PPCPassConfig::addMachineSSAOptimization() { if (TM->getTargetTriple().getArch() == Triple::ppc64le && !DisableVSXSwapRemoval) addPass(createPPCVSXSwapRemovalPass()); + // Target-specific peephole cleanups performed after instruction + // selection. + if (!DisableMIPeephole) { + addPass(createPPCMIPeepholePass()); + addPass(&DeadMachineInstructionElimID); + } } void PPCPassConfig::addPreRegAlloc() { diff --git a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll index 35d501e40cb..7e8991647ae 100644 --- a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll +++ b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll @@ -63,7 +63,7 @@ entry: ret <2 x i64> %splat.splat ; CHECK: mtvsrd {{[0-9]+}}, 3 ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3 -; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]] +; CHECK-LE: xxspltd [[REG1]], [[REG1]], 0 } ; Function Attrs: nounwind diff --git a/test/CodeGen/PowerPC/pr25157-peephole.ll b/test/CodeGen/PowerPC/pr25157-peephole.ll new file mode 100644 index 00000000000..c5bd49b492c --- /dev/null +++ b/test/CodeGen/PowerPC/pr25157-peephole.ll @@ -0,0 +1,61 @@ +; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s + +; Verify peephole simplification of splats and swaps. Bugpoint-reduced +; test from Eric Schweitz. + +%struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625 = type <{ [28 x i8] }> +%struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626 = type <{ [64 x i8] }> + +@.BSS38 = external global %struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625, align 32 +@_main1_2_ = external global %struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626, section ".comm", align 16 + +define void @aercalc_() { +L.entry: + br i1 undef, label %L.LB38_2426, label %L.LB38_2911 + +L.LB38_2911: + br i1 undef, label %L.LB38_2140, label %L.LB38_2640 + +L.LB38_2640: + unreachable + +L.LB38_2426: + br i1 undef, label %L.LB38_2438, label %L.LB38_2920 + +L.LB38_2920: + br i1 undef, label %L.LB38_2438, label %L.LB38_2921 + +L.LB38_2921: + br label %L.LB38_2140 + +L.LB38_2140: + ret void + +L.LB38_2438: + br i1 undef, label %L.LB38_2451, label %L.LB38_2935 + +L.LB38_2935: + br i1 undef, label %L.LB38_2451, label %L.LB38_2936 + +L.LB38_2936: + unreachable + +L.LB38_2451: + br i1 undef, label %L.LB38_2452, label %L.LB38_2937 + +L.LB38_2937: + unreachable + +L.LB38_2452: + %0 = load float, float* bitcast (i8* getelementptr inbounds (%struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625, %struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625* @.BSS38, i64 0, i32 0, i64 16) to float*), align 16 + %1 = fpext float %0 to double + %2 = insertelement <2 x double> undef, double %1, i32 1 + store <2 x double> %2, <2 x double>* bitcast (i8* getelementptr inbounds (%struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626, %struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626* @_main1_2_, i64 0, i32 0, i64 32) to <2 x double>*), align 16 + unreachable +} + +; CHECK-LABEL: @aercalc_ +; CHECK: lxsspx +; CHECK: xxspltd +; CHECK: stxvd2x +; CHECK-NOT: xxswapd diff --git a/test/CodeGen/PowerPC/swaps-le-5.ll b/test/CodeGen/PowerPC/swaps-le-5.ll index 5cd739a0efa..3e13bd16c23 100644 --- a/test/CodeGen/PowerPC/swaps-le-5.ll +++ b/test/CodeGen/PowerPC/swaps-le-5.ll @@ -15,11 +15,11 @@ entry: } ; CHECK-LABEL: @bar0 -; CHECK-DAG: xxswapd {{[0-9]+}}, 1 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] ; CHECK-DAG: xxspltd [[REG2:[0-9]+]] ; CHECK: xxpermdi [[REG3:[0-9]+]], [[REG2]], [[REG1]], 1 ; CHECK: stxvd2x [[REG3]] +; CHECK-NOT: xxswapd define void @bar1(double %y) { entry: @@ -30,11 +30,11 @@ entry: } ; CHECK-LABEL: @bar1 -; CHECK-DAG: xxswapd {{[0-9]+}}, 1 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] ; CHECK-DAG: xxspltd [[REG2:[0-9]+]] ; CHECK: xxmrghd [[REG3:[0-9]+]], [[REG1]], [[REG2]] ; CHECK: stxvd2x [[REG3]] +; CHECK-NOT: xxswapd define void @baz0() { entry: diff --git a/test/CodeGen/PowerPC/swaps-le-6.ll b/test/CodeGen/PowerPC/swaps-le-6.ll index 365aeee2d8f..df88322e4fd 100644 --- a/test/CodeGen/PowerPC/swaps-le-6.ll +++ b/test/CodeGen/PowerPC/swaps-le-6.ll @@ -20,8 +20,7 @@ entry: ; CHECK-LABEL: @bar0 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] ; CHECK-DAG: lxsdx [[REG2:[0-9]+]] -; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]] -; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1 +; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK: xxpermdi [[REG5:[0-9]+]], [[REG4]], [[REG1]], 1 ; CHECK: stxvd2x [[REG5]] @@ -37,8 +36,7 @@ entry: ; CHECK-LABEL: @bar1 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]] ; CHECK-DAG: lxsdx [[REG2:[0-9]+]] -; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]] -; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1 +; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK: xxmrghd [[REG5:[0-9]+]], [[REG1]], [[REG4]] ; CHECK: stxvd2x [[REG5]] diff --git a/test/CodeGen/PowerPC/vsx.ll b/test/CodeGen/PowerPC/vsx.ll index b4b1d248d1a..b2eefb66676 100644 --- a/test/CodeGen/PowerPC/vsx.ll +++ b/test/CodeGen/PowerPC/vsx.ll @@ -1228,9 +1228,8 @@ define <2 x i32> @test80(i32 %v) { ; CHECK-LE-LABEL: @test80 ; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3 ; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI -; CHECK-LE-DAG: xxswapd [[V1:[0-9]+]], [[R1]] ; CHECK-LE-DAG: lxvd2x [[V2:[0-9]+]], 0, [[R2]] -; CHECK-LE-DAG: xxspltd 34, [[V1]] +; CHECK-LE-DAG: xxspltd 34, [[R1]] ; CHECK-LE-DAG: xxswapd 35, [[V2]] ; CHECK-LE: vaddudm 2, 2, 3 ; CHECK-LE: blr diff --git a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll index 6c89b1092bd..97e1548f965 100644 --- a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll +++ b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll @@ -33,12 +33,8 @@ define double @teste0(<2 x double>* %p1) { %r = extractelement <2 x double> %v, i32 0 ret double %r -; FIXME: Swap optimization will collapse this into lxvd2x 1, 0, 3. - ; CHECK-LABEL: teste0 -; CHECK: lxvd2x 0, 0, 3 -; CHECK: xxswapd 0, 0 -; CHECK: xxswapd 1, 0 +; CHECK: lxvd2x 1, 0, 3 } define double @teste1(<2 x double>* %p1) { diff --git a/test/CodeGen/PowerPC/vsx_shuffle_le.ll b/test/CodeGen/PowerPC/vsx_shuffle_le.ll index dcfa0e78886..4f767c7ca78 100644 --- a/test/CodeGen/PowerPC/vsx_shuffle_le.ll +++ b/test/CodeGen/PowerPC/vsx_shuffle_le.ll @@ -8,8 +8,7 @@ define <2 x double> @test00(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK-LABEL: test00 ; CHECK: lxvd2x 0, 0, 3 -; CHECK: xxswapd 0, 0 -; CHECK: xxspltd 34, 0, 1 +; CHECK: xxspltd 34, 0, 0 } define <2 x double> @test01(<2 x double>* %p1, <2 x double>* %p2) { @@ -58,9 +57,7 @@ define <2 x double> @test10(<2 x double>* %p1, <2 x double>* %p2) { ret <2 x double> %v3 ; CHECK-LABEL: @test10 -; CHECK: lxvd2x 0, 0, 3 -; CHECK: xxswapd 0, 0 -; CHECK: xxswapd 34, 0 +; CHECK: lxvd2x 34, 0, 3 } define <2 x double> @test11(<2 x double>* %p1, <2 x double>* %p2) { @@ -71,8 +68,7 @@ define <2 x double> @test11(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK-LABEL: @test11 ; CHECK: lxvd2x 0, 0, 3 -; CHECK: xxswapd 0, 0 -; CHECK: xxspltd 34, 0, 0 +; CHECK: xxspltd 34, 0, 1 } define <2 x double> @test12(<2 x double>* %p1, <2 x double>* %p2) { @@ -139,8 +135,7 @@ define <2 x double> @test22(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK-LABEL: @test22 ; CHECK: lxvd2x 0, 0, 4 -; CHECK: xxswapd 0, 0 -; CHECK: xxspltd 34, 0, 1 +; CHECK: xxspltd 34, 0, 0 } define <2 x double> @test23(<2 x double>* %p1, <2 x double>* %p2) { @@ -189,9 +184,7 @@ define <2 x double> @test32(<2 x double>* %p1, <2 x double>* %p2) { ret <2 x double> %v3 ; CHECK-LABEL: @test32 -; CHECK: lxvd2x 0, 0, 4 -; CHECK: xxswapd 0, 0 -; CHECK: xxswapd 34, 0 +; CHECK: lxvd2x 34, 0, 4 } define <2 x double> @test33(<2 x double>* %p1, <2 x double>* %p2) { @@ -202,6 +195,5 @@ define <2 x double> @test33(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK-LABEL: @test33 ; CHECK: lxvd2x 0, 0, 4 -; CHECK: xxswapd 0, 0 -; CHECK: xxspltd 34, 0, 0 +; CHECK: xxspltd 34, 0, 1 } diff --git a/test/Transforms/PlaceSafepoints/finite-loops.ll b/test/Transforms/PlaceSafepoints/finite-loops.ll index 9121e92896c..b98073d6a6e 100644 --- a/test/Transforms/PlaceSafepoints/finite-loops.ll +++ b/test/Transforms/PlaceSafepoints/finite-loops.ll @@ -11,6 +11,7 @@ define void @test1(i32) gc "statepoint-example" { ; CHECK: statepoint ; CHECK-LABEL: loop ; CHECK-NOT: statepoint +; CHECK-LABEL: exit entry: br label %loop @@ -32,6 +33,7 @@ define void @test2(i32) gc "statepoint-example" { ; CHECK: statepoint ; CHECK-LABEL: loop ; CHECK-NOT: statepoint +; CHECK-LABEL: exit entry: br label %loop @@ -56,6 +58,7 @@ define void @test3(i8 %upper) gc "statepoint-example" { ; CHECK: statepoint ; CHECK-LABEL: loop ; CHECK-NOT: statepoint +; CHECK-LABEL: exit entry: br label %loop @@ -77,12 +80,14 @@ define void @test4(i64 %upper) gc "statepoint-example" { ; CHECK: statepoint ; CHECK-LABEL: loop ; CHECK: statepoint +; CHECK-LABEL: exit ; COUNTED-64-LABEL: test4 ; COUNTED-64-LABEL: entry ; COUNTED-64: statepoint ; COUNTED-64-LABEL: loop ; COUNTED-64-NOT: statepoint +; COUNTED-64-LABEL: exit entry: br label %loop @@ -105,12 +110,14 @@ define void @test5(i64 %upper) gc "statepoint-example" { ; CHECK: statepoint ; CHECK-LABEL: loop ; CHECK: statepoint +; CHECK-LABEL: exit ; COUNTED-64-LABEL: test5 ; COUNTED-64-LABEL: entry ; COUNTED-64: statepoint ; COUNTED-64-LABEL: loop ; COUNTED-64: statepoint +; COUNTED-64-LABEL: exit entry: br label %loop