From 276b77e66c538264d79b78c00bbad9f890f58011 Mon Sep 17 00:00:00 2001
From: Evan Cheng <evan.cheng@apple.com>
Date: Fri, 17 Apr 2009 01:29:40 +0000
Subject: [PATCH] Teach spiller to unfold instructions which modref spill slot
 when a scratch register is available and when it's profitable.

e.g.
     xorq  %r12<kill>, %r13
     addq  %rax, -184(%rbp)
     addq  %r13, -184(%rbp)
==>
     xorq  %r12<kill>, %r13
     movq  -184(%rbp), %r12
     addq  %rax, %r12
     addq  %r13, %r12
     movq  %r12, -184(%rbp)

Two more instructions, but fewer memory accesses. It can also open up
opportunities for more optimizations.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@69341 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/Spiller.cpp                      | 219 ++++++++++++++++++-
 lib/CodeGen/Spiller.h                        |  19 +-
 test/CodeGen/X86/2009-04-16-SpillerUnfold.ll | 139 ++++++++++++
 3 files changed, 366 insertions(+), 11 deletions(-)
 create mode 100644 test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp
index 5edde381706..92bb785de60 100644
--- a/lib/CodeGen/Spiller.cpp
+++ b/lib/CodeGen/Spiller.cpp
@@ -29,6 +29,7 @@ STATISTIC(NumLoads   , "Number of loads added");
 STATISTIC(NumReused  , "Number of values reused");
 STATISTIC(NumDCE     , "Number of copies elided");
 STATISTIC(NumSUnfold , "Number of stores unfolded");
+STATISTIC(NumModRefUnfold, "Number of modref unfolded");
 
 namespace {
   enum SpillerName { simple, local };
@@ -524,6 +525,7 @@ bool LocalSpiller::runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM) {
   RegInfo = &MF.getRegInfo(); 
   TRI = MF.getTarget().getRegisterInfo();
   TII = MF.getTarget().getInstrInfo();
+  AllocatableRegs = TRI->getAllocatableSet(MF);
   DOUT << "\n**** Local spiller rewriting function '"
        << MF.getFunction()->getName() << "':\n";
   DOUT << "**** Machine Instrs (NOTE! Does not include spills and reloads!)"
@@ -595,7 +597,201 @@ bool LocalSpiller::runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM) {
 }
 
 
-/// PrepForUnfoldOpti - Turn a store folding instruction into a load folding
+/// FoldsStackSlotModRef - Return true if the specified MI folds the specified
+/// stack slot mod/ref. It also checks if it's possible to unfold the
+/// instruction by having it define a specified physical register instead.
+static bool FoldsStackSlotModRef(MachineInstr &MI, int SS, unsigned PhysReg,
+                                 const TargetInstrInfo *TII,
+                                 const TargetRegisterInfo *TRI,
+                                 VirtRegMap &VRM) {
+  if (VRM.hasEmergencySpills(&MI) || VRM.isSpillPt(&MI))
+    return false;
+
+  bool Found = false;
+  VirtRegMap::MI2VirtMapTy::const_iterator I, End;
+  for (tie(I, End) = VRM.getFoldedVirts(&MI); I != End; ++I) {
+    unsigned VirtReg = I->second.first;
+    VirtRegMap::ModRef MR = I->second.second;
+    if (MR & VirtRegMap::isModRef)
+      if (VRM.getStackSlot(VirtReg) == SS) {
+        Found= TII->getOpcodeAfterMemoryUnfold(MI.getOpcode(), true, true) != 0;
+        break;
+      }
+  }
+  if (!Found)
+    return false;
+
+  // Does the instruction uses a register that overlaps the scratch register?
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.getReg() == 0)
+      continue;
+    unsigned Reg = MO.getReg();
+    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+      if (!VRM.hasPhys(Reg))
+        continue;
+      Reg = VRM.getPhys(Reg);
+    }
+    if (TRI->regsOverlap(PhysReg, Reg))
+      return false;
+  }
+  return true;
+}
+
+/// FindFreeRegister - Find a free register of a given register class by looking
+/// at (at most) the last two machine instructions.
+static unsigned FindFreeRegister(MachineBasicBlock::iterator MII,
+                                 MachineBasicBlock &MBB,
+                                 const TargetRegisterClass *RC,
+                                 const TargetRegisterInfo *TRI,
+                                 BitVector &AllocatableRegs) {
+  BitVector Defs(TRI->getNumRegs());
+  BitVector Uses(TRI->getNumRegs());
+  SmallVector<unsigned, 4> LocalUses;
+  SmallVector<unsigned, 4> Kills;
+
+  // Take a look at 2 instructions at most.
+  for (unsigned Count = 0; Count < 2; ++Count) {
+    if (MII == MBB.begin())
+      break;
+    MachineInstr *PrevMI = prior(MII);
+    for (unsigned i = 0, e = PrevMI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = PrevMI->getOperand(i);
+      if (!MO.isReg() || MO.getReg() == 0)
+        continue;
+      unsigned Reg = MO.getReg();
+      if (MO.isDef()) {
+        Defs.set(Reg);
+        for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+          Defs.set(*AS);
+      } else  {
+        LocalUses.push_back(Reg);
+        if (MO.isKill() && AllocatableRegs[Reg])
+          Kills.push_back(Reg);
+      }
+    }
+
+    for (unsigned i = 0, e = Kills.size(); i != e; ++i) {
+      unsigned Kill = Kills[i];
+      if (!Defs[Kill] && !Uses[Kill] &&
+          TRI->getPhysicalRegisterRegClass(Kill) == RC)
+        return Kill;
+    }
+    for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) {
+      unsigned Reg = LocalUses[i];
+      Uses.set(Reg);
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+        Uses.set(*AS);
+    }
+
+    MII = PrevMI;
+  }
+
+  return 0;
+}
+
+static
+void AssignPhysToVirtReg(MachineInstr *MI, unsigned VirtReg, unsigned PhysReg) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.getReg() == VirtReg)
+      MO.setReg(PhysReg);
+  }
+}
+
+/// OptimizeByUnfold2 - Unfold a series of load / store folding instructions if
+/// a scratch register is available.
+///     xorq  %r12<kill>, %r13
+///     addq  %rax, -184(%rbp)
+///     addq  %r13, -184(%rbp)
+/// ==>
+///     xorq  %r12<kill>, %r13
+///     movq  -184(%rbp), %r12
+///     addq  %rax, %r12
+///     addq  %r13, %r12
+///     movq  %r12, -184(%rbp)
+bool LocalSpiller::OptimizeByUnfold2(unsigned VirtReg, int SS,
+                                    MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator &MII,
+                                    std::vector<MachineInstr*> &MaybeDeadStores,
+                                    AvailableSpills &Spills,
+                                    BitVector &RegKills,
+                                    std::vector<MachineOperand*> &KillOps,
+                                    VirtRegMap &VRM) {
+  MachineBasicBlock::iterator NextMII = next(MII);
+  if (NextMII == MBB.end())
+    return false;
+
+  if (TII->getOpcodeAfterMemoryUnfold(MII->getOpcode(), true, true) == 0)
+    return false;
+
+  // Now let's see if the last couple of instructions happens to have freed up
+  // a register.
+  const TargetRegisterClass* RC = RegInfo->getRegClass(VirtReg);
+  unsigned PhysReg = FindFreeRegister(MII, MBB, RC, TRI, AllocatableRegs);
+  if (!PhysReg)
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  TRI = MF.getTarget().getRegisterInfo();
+  MachineInstr &MI = *MII;
+  if (!FoldsStackSlotModRef(MI, SS, PhysReg, TII, TRI, VRM))
+    return false;
+
+  // If the next instruction also folds the same SS modref and can be unfoled,
+  // then it's worthwhile to issue a load from SS into the free register and
+  // then unfold these instructions.
+  if (!FoldsStackSlotModRef(*NextMII, SS, PhysReg, TII, TRI, VRM))
+    return false;
+
+  // Load from SS to the spare physical register.
+  TII->loadRegFromStackSlot(MBB, MII, PhysReg, SS, RC);
+  // This invalidates Phys.
+  Spills.ClobberPhysReg(PhysReg);
+  // Remember it's available.
+  Spills.addAvailable(SS, PhysReg);
+  MaybeDeadStores[SS] = NULL;
+
+  // Unfold current MI.
+  SmallVector<MachineInstr*, 4> NewMIs;
+  if (!TII->unfoldMemoryOperand(MF, &MI, VirtReg, false, false, NewMIs))
+    assert(0 && "Unable unfold the load / store folding instruction!");
+  assert(NewMIs.size() == 1);
+  AssignPhysToVirtReg(NewMIs[0], VirtReg, PhysReg);
+  VRM.transferRestorePts(&MI, NewMIs[0]);
+  MII = MBB.insert(MII, NewMIs[0]);
+  InvalidateKills(MI, RegKills, KillOps);
+  VRM.RemoveMachineInstrFromMaps(&MI);
+  MBB.erase(&MI);
+  ++NumModRefUnfold;
+
+  // Unfold next instructions that fold the same SS.
+  do {
+    MachineInstr &NextMI = *NextMII;
+    NextMII = next(NextMII);
+    NewMIs.clear();
+    if (!TII->unfoldMemoryOperand(MF, &NextMI, VirtReg, false, false, NewMIs))
+      assert(0 && "Unable unfold the load / store folding instruction!");
+    assert(NewMIs.size() == 1);
+    AssignPhysToVirtReg(NewMIs[0], VirtReg, PhysReg);
+    VRM.transferRestorePts(&NextMI, NewMIs[0]);
+    MBB.insert(NextMII, NewMIs[0]);
+    InvalidateKills(NextMI, RegKills, KillOps);
+    VRM.RemoveMachineInstrFromMaps(&NextMI);
+    MBB.erase(&NextMI);
+    ++NumModRefUnfold;
+  } while (FoldsStackSlotModRef(*NextMII, SS, PhysReg, TII, TRI, VRM));
+
+  // Store the value back into SS.
+  TII->storeRegToStackSlot(MBB, NextMII, PhysReg, true, SS, RC);
+  MachineInstr *StoreMI = prior(NextMII);
+  VRM.addSpillSlotUse(SS, StoreMI);
+  VRM.virtFolded(VirtReg, StoreMI, VirtRegMap::isMod);
+
+  return true;
+}
+
+/// OptimizeByUnfold - Turn a store folding instruction into a load folding
 /// instruction. e.g.
 ///     xorl  %edi, %eax
 ///     movl  %eax, -32(%ebp)
@@ -607,7 +803,7 @@ bool LocalSpiller::runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM) {
 ///     mov   %eax, -32(%ebp)
 /// This enables unfolding optimization for a subsequent instruction which will
 /// also eliminate the newly introduced store instruction.
-bool LocalSpiller::PrepForUnfoldOpti(MachineBasicBlock &MBB,
+bool LocalSpiller::OptimizeByUnfold(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator &MII,
                                     std::vector<MachineInstr*> &MaybeDeadStores,
                                     AvailableSpills &Spills,
@@ -646,8 +842,14 @@ bool LocalSpiller::PrepForUnfoldOpti(MachineBasicBlock &MBB,
     }
   }
 
-  if (!UnfoldedOpc)
-    return false;
+  if (!UnfoldedOpc) {
+    if (!UnfoldVR)
+      return false;
+
+    // Look for other unfolding opportunities.
+    return OptimizeByUnfold2(UnfoldVR, FoldedSS, MBB, MII,
+                             MaybeDeadStores, Spills, RegKills, KillOps, VRM);
+  }
 
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     MachineOperand &MO = MI.getOperand(i);
@@ -705,6 +907,7 @@ bool LocalSpiller::PrepForUnfoldOpti(MachineBasicBlock &MBB,
       MF.DeleteMachineInstr(NewMI);
     }
   }
+
   return false;
 }
 
@@ -770,7 +973,7 @@ bool LocalSpiller::CommuteToFoldReload(MachineBasicBlock &MBB,
     VRM.addSpillSlotUse(SS, FoldedMI);
     VRM.virtFolded(VirtReg, FoldedMI, VirtRegMap::isRef);
     // Insert new def MI and spill MI.
-    const TargetRegisterClass* RC = MF.getRegInfo().getRegClass(VirtReg);
+    const TargetRegisterClass* RC = RegInfo->getRegClass(VirtReg);
     TII->storeRegToStackSlot(MBB, &MI, NewReg, true, SS, RC);
     MII = prior(MII);
     MachineInstr *StoreMI = MII;
@@ -935,13 +1138,13 @@ void LocalSpiller::RewriteMBB(MachineBasicBlock &MBB, VirtRegMap &VRM,
   DistanceMap.clear();
   for (MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
        MII != E; ) {
-    MachineBasicBlock::iterator NextMII = MII; ++NextMII;
+    MachineBasicBlock::iterator NextMII = next(MII);
 
     VirtRegMap::MI2VirtMapTy::const_iterator I, End;
     bool Erased = false;
     bool BackTracked = false;
-    if (PrepForUnfoldOpti(MBB, MII,
-                          MaybeDeadStores, Spills, RegKills, KillOps, VRM))
+    if (OptimizeByUnfold(MBB, MII,
+                         MaybeDeadStores, Spills, RegKills, KillOps, VRM))
       NextMII = next(MII);
 
     MachineInstr &MI = *MII;
diff --git a/lib/CodeGen/Spiller.h b/lib/CodeGen/Spiller.h
index 5a42a8279db..c0d08379604 100644
--- a/lib/CodeGen/Spiller.h
+++ b/lib/CodeGen/Spiller.h
@@ -97,7 +97,7 @@ namespace llvm {
     const TargetRegisterInfo *getRegInfo() const { return TRI; }
 
     /// getSpillSlotOrReMatPhysReg - If the specified stack slot or remat is
-    /// available in a  physical register, return that PhysReg, otherwise
+    /// available in a physical register, return that PhysReg, otherwise
     /// return 0.
     unsigned getSpillSlotOrReMatPhysReg(int Slot) const {
       std::map<int, unsigned>::const_iterator I =
@@ -284,6 +284,7 @@ namespace llvm {
     MachineRegisterInfo *RegInfo;
     const TargetRegisterInfo *TRI;
     const TargetInstrInfo *TII;
+    BitVector AllocatableRegs;
     DenseMap<MachineInstr*, unsigned> DistanceMap;
   public:
     bool runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM);
@@ -291,12 +292,22 @@ namespace llvm {
     void TransferDeadness(MachineBasicBlock *MBB, unsigned CurDist,
                           unsigned Reg, BitVector &RegKills,
                           std::vector<MachineOperand*> &KillOps);
-    bool PrepForUnfoldOpti(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator &MII,
+
+    bool OptimizeByUnfold(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &MII,
+                          std::vector<MachineInstr*> &MaybeDeadStores,
+                          AvailableSpills &Spills, BitVector &RegKills,
+                          std::vector<MachineOperand*> &KillOps,
+                          VirtRegMap &VRM);
+
+    bool OptimizeByUnfold2(unsigned VirtReg, int SS,
+                           MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator &MII, 
                            std::vector<MachineInstr*> &MaybeDeadStores,
                            AvailableSpills &Spills, BitVector &RegKills,
                            std::vector<MachineOperand*> &KillOps,
                            VirtRegMap &VRM);
+
     bool CommuteToFoldReload(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MII,
                              unsigned VirtReg, unsigned SrcReg, int SS,
@@ -305,6 +316,7 @@ namespace llvm {
                              std::vector<MachineOperand*> &KillOps,
                              const TargetRegisterInfo *TRI,
                              VirtRegMap &VRM);
+
     void SpillRegToStackSlot(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MII,
                              int Idx, unsigned PhysReg, int StackSlot,
@@ -315,6 +327,7 @@ namespace llvm {
                              BitVector &RegKills,
                              std::vector<MachineOperand*> &KillOps,
                              VirtRegMap &VRM);
+
     void RewriteMBB(MachineBasicBlock &MBB, VirtRegMap &VRM,
                     AvailableSpills &Spills,
                     BitVector &RegKills, std::vector<MachineOperand*> &KillOps);
diff --git a/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll b/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
new file mode 100644
index 00000000000..e9e2e4aaa1b
--- /dev/null
+++ b/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
@@ -0,0 +1,139 @@
+; RUN: llvm-as < %s | llc -mtriple=x86_64-apple-darwin10.0 -relocation-model=pic -disable-fp-elim -stats |& grep {Number of modref unfolded}
+
+	%struct.SHA512_CTX = type { [8 x i64], i64, i64, %struct.anon, i32, i32 }
+	%struct.anon = type { [16 x i64] }
+@K512 = external constant [80 x i64], align 32		; <[80 x i64]*> [#uses=2]
+
+define fastcc void @sha512_block_data_order(%struct.SHA512_CTX* nocapture %ctx, i8* nocapture %in, i64 %num) nounwind ssp {
+entry:
+	br label %bb349
+
+bb349:		; preds = %bb349, %entry
+	%e.0489 = phi i64 [ 0, %entry ], [ %e.0, %bb349 ]		; <i64> [#uses=3]
+	%b.0472 = phi i64 [ 0, %entry ], [ %87, %bb349 ]		; <i64> [#uses=2]
+	%asmtmp356 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 41, i64 %e.0489) nounwind		; <i64> [#uses=1]
+	%0 = xor i64 0, %asmtmp356		; <i64> [#uses=1]
+	%1 = add i64 0, %0		; <i64> [#uses=1]
+	%2 = add i64 %1, 0		; <i64> [#uses=1]
+	%3 = add i64 %2, 0		; <i64> [#uses=1]
+	%4 = add i64 %3, 0		; <i64> [#uses=5]
+	%asmtmp372 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 34, i64 %4) nounwind		; <i64> [#uses=1]
+	%asmtmp373 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 39, i64 %4) nounwind		; <i64> [#uses=0]
+	%5 = xor i64 %asmtmp372, 0		; <i64> [#uses=0]
+	%6 = xor i64 0, %b.0472		; <i64> [#uses=1]
+	%7 = and i64 %4, %6		; <i64> [#uses=1]
+	%8 = xor i64 %7, 0		; <i64> [#uses=1]
+	%9 = add i64 0, %8		; <i64> [#uses=1]
+	%10 = add i64 %9, 0		; <i64> [#uses=2]
+	%asmtmp377 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 61, i64 0) nounwind		; <i64> [#uses=1]
+	%11 = xor i64 0, %asmtmp377		; <i64> [#uses=1]
+	%12 = add i64 0, %11		; <i64> [#uses=1]
+	%13 = add i64 %12, 0		; <i64> [#uses=1]
+	%not381 = xor i64 0, -1		; <i64> [#uses=1]
+	%14 = and i64 %e.0489, %not381		; <i64> [#uses=1]
+	%15 = xor i64 0, %14		; <i64> [#uses=1]
+	%16 = add i64 %15, 0		; <i64> [#uses=1]
+	%17 = add i64 %16, %13		; <i64> [#uses=1]
+	%18 = add i64 %17, 0		; <i64> [#uses=1]
+	%19 = add i64 %18, 0		; <i64> [#uses=2]
+	%20 = add i64 %19, %b.0472		; <i64> [#uses=3]
+	%21 = add i64 %19, 0		; <i64> [#uses=1]
+	%22 = add i64 %21, 0		; <i64> [#uses=1]
+	%23 = add i32 0, 12		; <i32> [#uses=1]
+	%24 = and i32 %23, 12		; <i32> [#uses=1]
+	%25 = zext i32 %24 to i64		; <i64> [#uses=1]
+	%26 = getelementptr [16 x i64]* null, i64 0, i64 %25		; <i64*> [#uses=0]
+	%27 = add i64 0, %e.0489		; <i64> [#uses=1]
+	%28 = add i64 %27, 0		; <i64> [#uses=1]
+	%29 = add i64 %28, 0		; <i64> [#uses=1]
+	%30 = add i64 %29, 0		; <i64> [#uses=2]
+	%31 = and i64 %10, %4		; <i64> [#uses=1]
+	%32 = xor i64 0, %31		; <i64> [#uses=1]
+	%33 = add i64 %30, 0		; <i64> [#uses=3]
+	%34 = add i64 %30, %32		; <i64> [#uses=1]
+	%35 = add i64 %34, 0		; <i64> [#uses=1]
+	%36 = and i64 %33, %20		; <i64> [#uses=1]
+	%37 = xor i64 %36, 0		; <i64> [#uses=1]
+	%38 = add i64 %37, 0		; <i64> [#uses=1]
+	%39 = add i64 %38, 0		; <i64> [#uses=1]
+	%40 = add i64 %39, 0		; <i64> [#uses=1]
+	%41 = add i64 %40, 0		; <i64> [#uses=1]
+	%42 = add i64 %41, %4		; <i64> [#uses=3]
+	%43 = or i32 0, 6		; <i32> [#uses=1]
+	%44 = and i32 %43, 14		; <i32> [#uses=1]
+	%45 = zext i32 %44 to i64		; <i64> [#uses=1]
+	%46 = getelementptr [16 x i64]* null, i64 0, i64 %45		; <i64*> [#uses=1]
+	%not417 = xor i64 %42, -1		; <i64> [#uses=1]
+	%47 = and i64 %20, %not417		; <i64> [#uses=1]
+	%48 = xor i64 0, %47		; <i64> [#uses=1]
+	%49 = getelementptr [80 x i64]* @K512, i64 0, i64 0		; <i64*> [#uses=1]
+	%50 = load i64* %49, align 8		; <i64> [#uses=1]
+	%51 = add i64 %48, 0		; <i64> [#uses=1]
+	%52 = add i64 %51, 0		; <i64> [#uses=1]
+	%53 = add i64 %52, 0		; <i64> [#uses=1]
+	%54 = add i64 %53, %50		; <i64> [#uses=2]
+	%asmtmp420 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 34, i64 0) nounwind		; <i64> [#uses=1]
+	%asmtmp421 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 39, i64 0) nounwind		; <i64> [#uses=1]
+	%55 = xor i64 %asmtmp420, 0		; <i64> [#uses=1]
+	%56 = xor i64 %55, %asmtmp421		; <i64> [#uses=1]
+	%57 = add i64 %54, %10		; <i64> [#uses=5]
+	%58 = add i64 %54, 0		; <i64> [#uses=1]
+	%59 = add i64 %58, %56		; <i64> [#uses=2]
+	%60 = or i32 0, 7		; <i32> [#uses=1]
+	%61 = and i32 %60, 15		; <i32> [#uses=1]
+	%62 = zext i32 %61 to i64		; <i64> [#uses=1]
+	%63 = getelementptr [16 x i64]* null, i64 0, i64 %62		; <i64*> [#uses=2]
+	%64 = load i64* null, align 8		; <i64> [#uses=1]
+	%65 = lshr i64 %64, 6		; <i64> [#uses=1]
+	%66 = xor i64 0, %65		; <i64> [#uses=1]
+	%67 = xor i64 %66, 0		; <i64> [#uses=1]
+	%68 = load i64* %46, align 8		; <i64> [#uses=1]
+	%69 = load i64* null, align 8		; <i64> [#uses=1]
+	%70 = add i64 %68, 0		; <i64> [#uses=1]
+	%71 = add i64 %70, %67		; <i64> [#uses=1]
+	%72 = add i64 %71, %69		; <i64> [#uses=1]
+	%asmtmp427 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 18, i64 %57) nounwind		; <i64> [#uses=1]
+	%asmtmp428 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 41, i64 %57) nounwind		; <i64> [#uses=1]
+	%73 = xor i64 %asmtmp427, 0		; <i64> [#uses=1]
+	%74 = xor i64 %73, %asmtmp428		; <i64> [#uses=1]
+	%75 = and i64 %57, %42		; <i64> [#uses=1]
+	%not429 = xor i64 %57, -1		; <i64> [#uses=1]
+	%76 = and i64 %33, %not429		; <i64> [#uses=1]
+	%77 = xor i64 %75, %76		; <i64> [#uses=1]
+	%78 = getelementptr [80 x i64]* @K512, i64 0, i64 0		; <i64*> [#uses=1]
+	%79 = load i64* %78, align 16		; <i64> [#uses=1]
+	%80 = add i64 %77, %20		; <i64> [#uses=1]
+	%81 = add i64 %80, %72		; <i64> [#uses=1]
+	%82 = add i64 %81, %74		; <i64> [#uses=1]
+	%83 = add i64 %82, %79		; <i64> [#uses=1]
+	%asmtmp432 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 34, i64 %59) nounwind		; <i64> [#uses=1]
+	%asmtmp433 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 39, i64 %59) nounwind		; <i64> [#uses=1]
+	%84 = xor i64 %asmtmp432, 0		; <i64> [#uses=1]
+	%85 = xor i64 %84, %asmtmp433		; <i64> [#uses=1]
+	%86 = add i64 %83, %22		; <i64> [#uses=2]
+	%87 = add i64 0, %85		; <i64> [#uses=1]
+	%asmtmp435 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 8, i64 0) nounwind		; <i64> [#uses=1]
+	%88 = xor i64 0, %asmtmp435		; <i64> [#uses=1]
+	%89 = load i64* null, align 8		; <i64> [#uses=3]
+	%asmtmp436 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 19, i64 %89) nounwind		; <i64> [#uses=1]
+	%asmtmp437 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 61, i64 %89) nounwind		; <i64> [#uses=1]
+	%90 = lshr i64 %89, 6		; <i64> [#uses=1]
+	%91 = xor i64 %asmtmp436, %90		; <i64> [#uses=1]
+	%92 = xor i64 %91, %asmtmp437		; <i64> [#uses=1]
+	%93 = load i64* %63, align 8		; <i64> [#uses=1]
+	%94 = load i64* null, align 8		; <i64> [#uses=1]
+	%95 = add i64 %93, %88		; <i64> [#uses=1]
+	%96 = add i64 %95, %92		; <i64> [#uses=1]
+	%97 = add i64 %96, %94		; <i64> [#uses=2]
+	store i64 %97, i64* %63, align 8
+	%98 = and i64 %86, %57		; <i64> [#uses=1]
+	%not441 = xor i64 %86, -1		; <i64> [#uses=1]
+	%99 = and i64 %42, %not441		; <i64> [#uses=1]
+	%100 = xor i64 %98, %99		; <i64> [#uses=1]
+	%101 = add i64 %100, %33		; <i64> [#uses=1]
+	%102 = add i64 %101, %97		; <i64> [#uses=1]
+	%103 = add i64 %102, 0		; <i64> [#uses=1]
+	%104 = add i64 %103, 0		; <i64> [#uses=1]
+	%e.0 = add i64 %104, %35		; <i64> [#uses=1]
+	br label %bb349
+}