diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index b4163c54502..7e6f2ad5958 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
+#include <utility>
 using namespace llvm;
 
 STATISTIC(NumTailCalls, "Number of tail calls");
@@ -7245,8 +7246,430 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
   llvm_unreachable("Expecting a BB with two successors!");
 }
 
-MachineBasicBlock *ARMTargetLowering::
-EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
+namespace {
+// This class is a helper for lowering the COPY_STRUCT_BYVAL_I32 instruction.
+// It defines the operations needed to lower the byval copy. We use a helper
+// class because the opcodes and machine instructions are different for each
+// subtarget, but the overall algorithm for the lowering is the same.  The
+// implementation of each operation will be defined separately for arm, thumb1,
+// and thumb2 targets by subclassing this base class. See
+// ARMTargetLowering::EmitStructByval() for how these operations are used.
+class TargetStructByvalEmitter {
+public:
+  TargetStructByvalEmitter(const TargetInstrInfo *TII_,
+                           MachineRegisterInfo &MRI_,
+                           const TargetRegisterClass *TRC_)
+      : TII(TII_), MRI(MRI_), TRC(TRC_) {}
+
+  // Emit a post-increment load of "unit" size. The unit size is based on the
+  // alignment of the struct being copied (4, 2, or 1 bytes). Alignments higher
+  // than 4 are handled separately by using NEON instructions.
+  //
+  // \param baseReg the register holding the address to load from.
+  // \param baseOut the register to receive the incremented address.
+  // \returns the register holding the loaded value.
+  virtual unsigned emitUnitLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                                DebugLoc &dl, unsigned baseReg,
+                                unsigned baseOut) = 0;
+
+  // Emit a post-increment store of "unit" size. The unit size is based on the
+  // alignment of the struct being copied (4, 2, or 1 bytes). Alignments higher
+  // than 4 are handled separately by using NEON instructions.
+  //
+  // \param baseReg the register holding the address to store to.
+  // \param storeReg the register holding the value to store.
+  // \param baseOut the register to receive the incremented address.
+  virtual void emitUnitStore(MachineBasicBlock *BB, MachineInstr *MI,
+                             DebugLoc &dl, unsigned baseReg, unsigned storeReg,
+                             unsigned baseOut) = 0;
+
+  // Emit a post-increment load of one byte.
+  //
+  // \param baseReg the register holding the address to load from.
+  // \param baseOut the register to receive the incremented address.
+  // \returns the register holding the loaded value.
+  virtual unsigned emitByteLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                                DebugLoc &dl, unsigned baseReg,
+                                unsigned baseOut) = 0;
+
+  // Emit a post-increment store of one byte.
+  //
+  // \param baseReg the register holding the address to store to.
+  // \param storeReg the register holding the value to store.
+  // \param baseOut the register to receive the incremented address.
+  virtual void emitByteStore(MachineBasicBlock *BB, MachineInstr *MI,
+                             DebugLoc &dl, unsigned baseReg, unsigned storeReg,
+                             unsigned baseOut) = 0;
+
+  // Emit a load of a constant value.
+  //
+  // \param Constant the constant value to load into a register.
+  // \returns the register holding the loaded value.
+  virtual unsigned emitConstantLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                                    DebugLoc &dl, unsigned Constant,
+                                    const DataLayout *DL) = 0;
+
+  // Emit a subtract of a register minus immediate, with the immediate equal to
+  // the "unit" size. The unit size is based on the alignment of the struct
+  // being copied (16, 8, 4, 2, or 1 bytes).
+  //
+  // \param InReg the register holding the initial value.
+  // \param OutReg the register to receive the subtracted value.
+  virtual void emitSubImm(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                          unsigned InReg, unsigned OutReg) = 0;
+
+  // Emit a branch based on a condition code of not equal.
+  //
+  // \param TargetBB the destination of the branch.
+  virtual void emitBranchNE(MachineBasicBlock *BB, MachineInstr *MI,
+                            DebugLoc &dl, MachineBasicBlock *TargetBB) = 0;
+
+  // Find the constant pool index for the given constant. This method is
+  // implemented in the base class because it is the same for all subtargets.
+  //
+  // \param LoopSize the constant value for which the index should be returned.
+  // \returns the constant pool index for the constant.
+  unsigned getConstantPoolIndex(MachineFunction *MF, const DataLayout *DL,
+                                unsigned LoopSize) {
+    MachineConstantPool *ConstantPool = MF->getConstantPool();
+    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
+
+    // MachineConstantPool wants an explicit alignment.
+    unsigned Align = DL->getPrefTypeAlignment(Int32Ty);
+    if (Align == 0)
+      Align = DL->getTypeAllocSize(C->getType());
+    return ConstantPool->getConstantPoolIndex(C, Align);
+  }
+
+  // Return the register class used by the subtarget.
+  //
+  // \returns the target register class.
+  const TargetRegisterClass *getTRC() const { return TRC; }
+
+  virtual ~TargetStructByvalEmitter() {};
+
+protected:
+  const TargetInstrInfo *TII;
+  MachineRegisterInfo &MRI;
+  const TargetRegisterClass *TRC;
+};
+
+class ARMStructByvalEmitter : public TargetStructByvalEmitter {
+public:
+  ARMStructByvalEmitter(const TargetInstrInfo *TII, MachineRegisterInfo &MRI,
+                        unsigned LoadStoreSize)
+      : TargetStructByvalEmitter(
+            TII, MRI, (const TargetRegisterClass *)&ARM::GPRRegClass),
+        UnitSize(LoadStoreSize),
+        UnitLdOpc(LoadStoreSize == 4
+                      ? ARM::LDR_POST_IMM
+                      : LoadStoreSize == 2
+                            ? ARM::LDRH_POST
+                            : LoadStoreSize == 1 ? ARM::LDRB_POST_IMM : 0),
+        UnitStOpc(LoadStoreSize == 4
+                      ? ARM::STR_POST_IMM
+                      : LoadStoreSize == 2
+                            ? ARM::STRH_POST
+                            : LoadStoreSize == 1 ? ARM::STRB_POST_IMM : 0) {}
+
+  unsigned emitUnitLoad(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                        unsigned baseReg, unsigned baseOut) {
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(UnitLdOpc), scratch).addReg(
+        baseOut, RegState::Define).addReg(baseReg).addReg(0).addImm(UnitSize));
+    return scratch;
+  }
+
+  void emitUnitStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                     unsigned baseReg, unsigned storeReg, unsigned baseOut) {
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(UnitStOpc), baseOut).addReg(
+        storeReg).addReg(baseReg).addReg(0).addImm(UnitSize));
+  }
+
+  unsigned emitByteLoad(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                        unsigned baseReg, unsigned baseOut) {
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRB_POST_IMM), scratch)
+                       .addReg(baseOut, RegState::Define).addReg(baseReg)
+                       .addReg(0).addImm(1));
+    return scratch;
+  }
+
+  void emitByteStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                     unsigned baseReg, unsigned storeReg, unsigned baseOut) {
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::STRB_POST_IMM), baseOut)
+                       .addReg(storeReg).addReg(baseReg).addReg(0).addImm(1));
+  }
+
+  unsigned emitConstantLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                            DebugLoc &dl, unsigned Constant,
+                            const DataLayout *DL) {
+    unsigned constReg = MRI.createVirtualRegister(TRC);
+    unsigned Idx = getConstantPoolIndex(BB->getParent(), DL, Constant);
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
+        constReg, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
+    return constReg;
+  }
+
+  void emitSubImm(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                  unsigned InReg, unsigned OutReg) {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, MI, dl, TII->get(ARM::SUBri), OutReg);
+    AddDefaultCC(AddDefaultPred(MIB.addReg(InReg).addImm(UnitSize)));
+    MIB->getOperand(5).setReg(ARM::CPSR);
+    MIB->getOperand(5).setIsDef(true);
+  }
+
+  void emitBranchNE(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                    MachineBasicBlock *TargetBB) {
+    BuildMI(*BB, MI, dl, TII->get(ARM::Bcc)).addMBB(TargetBB).addImm(ARMCC::NE)
+        .addReg(ARM::CPSR);
+  }
+
+private:
+  const unsigned UnitSize;
+  const unsigned UnitLdOpc;
+  const unsigned UnitStOpc;
+};
+
+class Thumb2StructByvalEmitter : public TargetStructByvalEmitter {
+public:
+  Thumb2StructByvalEmitter(const TargetInstrInfo *TII, MachineRegisterInfo &MRI,
+                           unsigned LoadStoreSize)
+      : TargetStructByvalEmitter(
+            TII, MRI, (const TargetRegisterClass *)&ARM::tGPRRegClass),
+        UnitSize(LoadStoreSize),
+        UnitLdOpc(LoadStoreSize == 4
+                      ? ARM::t2LDR_POST
+                      : LoadStoreSize == 2
+                            ? ARM::t2LDRH_POST
+                            : LoadStoreSize == 1 ? ARM::t2LDRB_POST : 0),
+        UnitStOpc(LoadStoreSize == 4
+                      ? ARM::t2STR_POST
+                      : LoadStoreSize == 2
+                            ? ARM::t2STRH_POST
+                            : LoadStoreSize == 1 ? ARM::t2STRB_POST : 0) {}
+
+  unsigned emitUnitLoad(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                        unsigned baseReg, unsigned baseOut) {
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(UnitLdOpc), scratch).addReg(
+        baseOut, RegState::Define).addReg(baseReg).addImm(UnitSize));
+    return scratch;
+  }
+
+  void emitUnitStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                     unsigned baseReg, unsigned storeReg, unsigned baseOut) {
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(UnitStOpc), baseOut)
+                       .addReg(storeReg).addReg(baseReg).addImm(UnitSize));
+  }
+
+  unsigned emitByteLoad(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                        unsigned baseReg, unsigned baseOut) {
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::t2LDRB_POST), scratch)
+                       .addReg(baseOut, RegState::Define).addReg(baseReg)
+                       .addImm(1));
+    return scratch;
+  }
+
+  void emitByteStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                     unsigned baseReg, unsigned storeReg, unsigned baseOut) {
+    AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::t2STRB_POST), baseOut)
+                       .addReg(storeReg).addReg(baseReg).addImm(1));
+  }
+
+  unsigned emitConstantLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                            DebugLoc &dl, unsigned Constant,
+                            const DataLayout *DL) {
+    unsigned VConst = MRI.createVirtualRegister(TRC);
+    unsigned Vtmp = VConst;
+    if ((Constant & 0xFFFF0000) != 0)
+      Vtmp = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp)
+                       .addImm(Constant & 0xFFFF));
+
+    if ((Constant & 0xFFFF0000) != 0)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), VConst)
+                         .addReg(Vtmp).addImm(Constant >> 16));
+    return VConst;
+  }
+
+  void emitSubImm(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                  unsigned InReg, unsigned OutReg) {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, MI, dl, TII->get(ARM::t2SUBri), OutReg);
+    AddDefaultCC(AddDefaultPred(MIB.addReg(InReg).addImm(UnitSize)));
+    MIB->getOperand(5).setReg(ARM::CPSR);
+    MIB->getOperand(5).setIsDef(true);
+  }
+
+  void emitBranchNE(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                    MachineBasicBlock *TargetBB) {
+    BuildMI(BB, dl, TII->get(ARM::t2Bcc)).addMBB(TargetBB).addImm(ARMCC::NE)
+        .addReg(ARM::CPSR);
+  }
+
+private:
+  const unsigned UnitSize;
+  const unsigned UnitLdOpc;
+  const unsigned UnitStOpc;
+};
+
+// This class is a thin wrapper that delegates most of the work to the correct
+// TargetStructByvalEmitter implementation. It also handles the lowering for
+// targets that support neon because the neon implementation is the same for all
+// targets that support it.
+class StructByvalEmitter {
+public:
+  StructByvalEmitter(unsigned LoadStoreSize, const ARMSubtarget *Subtarget,
+                     const TargetInstrInfo *TII_, MachineRegisterInfo &MRI_,
+                     const DataLayout *DL_)
+      : UnitSize(LoadStoreSize),
+        TargetEmitter(
+          Subtarget->isThumb2()
+              ? static_cast<TargetStructByvalEmitter *>(
+                    new Thumb2StructByvalEmitter(TII_, MRI_,
+                                                 LoadStoreSize))
+              : static_cast<TargetStructByvalEmitter *>(
+                    new ARMStructByvalEmitter(TII_, MRI_,
+                                              LoadStoreSize))),
+        TII(TII_), MRI(MRI_), DL(DL_),
+        VecTRC(UnitSize == 16
+                   ? (const TargetRegisterClass *)&ARM::DPairRegClass
+                   : UnitSize == 8
+                         ? (const TargetRegisterClass *)&ARM::DPRRegClass
+                         : 0),
+        VecLdOpc(UnitSize == 16 ? ARM::VLD1q32wb_fixed
+                                : UnitSize == 8 ? ARM::VLD1d32wb_fixed : 0),
+        VecStOpc(UnitSize == 16 ? ARM::VST1q32wb_fixed
+                                : UnitSize == 8 ? ARM::VST1d32wb_fixed : 0) {}
+
+  // Emit a post-increment load of "unit" size. The unit size is based on the
+  // alignment of the struct being copied (16, 8, 4, 2, or 1 bytes). Loads of 16
+  // or 8 bytes use NEON instructions to load the value.
+  //
+  // \param baseReg the register holding the address to load from.
+  // \param baseOut the register to receive the incremented address. If baseOut
+  // is 0 then a new register is created to hold the incremented address.
+  // \returns a pair of registers holding the loaded value and the updated
+  // address.
+  std::pair<unsigned, unsigned> emitUnitLoad(MachineBasicBlock *BB,
+                                             MachineInstr *MI, DebugLoc &dl,
+                                             unsigned baseReg,
+                                             unsigned baseOut = 0) {
+    unsigned scratch = 0;
+    if (baseOut == 0)
+      baseOut = MRI.createVirtualRegister(TargetEmitter->getTRC());
+    if (UnitSize >= 8) { // neon
+      scratch = MRI.createVirtualRegister(VecTRC);
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(VecLdOpc), scratch).addReg(
+          baseOut, RegState::Define).addReg(baseReg).addImm(0));
+    } else {
+      scratch = TargetEmitter->emitUnitLoad(BB, MI, dl, baseReg, baseOut);
+    }
+    return std::make_pair(scratch, baseOut);
+  }
+
+  // Emit a post-increment store of "unit" size. The unit size is based on the
+  // alignment of the struct being copied (16, 8, 4, 2, or 1 bytes). Stores of
+  // 16 or 8 bytes use NEON instructions to store the value.
+  //
+  // \param baseReg the register holding the address to store to.
+  // \param storeReg the register holding the value to store.
+  // \param baseOut the register to receive the incremented address. If baseOut
+  // is 0 then a new register is created to hold the incremented address.
+  // \returns the register holding the updated address.
+  unsigned emitUnitStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                         unsigned baseReg, unsigned storeReg,
+                         unsigned baseOut = 0) {
+    if (baseOut == 0)
+      baseOut = MRI.createVirtualRegister(TargetEmitter->getTRC());
+    if (UnitSize >= 8) { // neon
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(VecStOpc), baseOut)
+                         .addReg(baseReg).addImm(0).addReg(storeReg));
+    } else {
+      TargetEmitter->emitUnitStore(BB, MI, dl, baseReg, storeReg, baseOut);
+    }
+    return baseOut;
+  }
+
+  // Emit a post-increment load of one byte.
+  //
+  // \param baseReg the register holding the address to load from.
+  // \returns a pair of registers holding the loaded value and the updated
+  // address.
+  std::pair<unsigned, unsigned> emitByteLoad(MachineBasicBlock *BB,
+                                             MachineInstr *MI, DebugLoc &dl,
+                                             unsigned baseReg) {
+    unsigned baseOut = MRI.createVirtualRegister(TargetEmitter->getTRC());
+    unsigned scratch =
+        TargetEmitter->emitByteLoad(BB, MI, dl, baseReg, baseOut);
+    return std::make_pair(scratch, baseOut);
+  }
+
+  // Emit a post-increment store of one byte.
+  //
+  // \param baseReg the register holding the address to store to.
+  // \param storeReg the register holding the value to store.
+  // \returns the register holding the updated address.
+  unsigned emitByteStore(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                         unsigned baseReg, unsigned storeReg) {
+    unsigned baseOut = MRI.createVirtualRegister(TargetEmitter->getTRC());
+    TargetEmitter->emitByteStore(BB, MI, dl, baseReg, storeReg, baseOut);
+    return baseOut;
+  }
+
+  // Emit a load of the constant LoopSize.
+  //
+  // \param LoopSize the constant to load.
+  // \returns the register holding the loaded constant.
+  unsigned emitConstantLoad(MachineBasicBlock *BB, MachineInstr *MI,
+                            DebugLoc &dl, unsigned LoopSize) {
+    return TargetEmitter->emitConstantLoad(BB, MI, dl, LoopSize, DL);
+  }
+
+  // Emit a subtract of a register minus immediate, with the immediate equal to
+  // the "unit" size. The unit size is based on the alignment of the struct
+  // being copied (16, 8, 4, 2, or 1 bytes).
+  //
+  // \param InReg the register holding the initial value.
+  // \param OutReg the register to receive the subtracted value.
+  void emitSubImm(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                  unsigned InReg, unsigned OutReg) {
+    TargetEmitter->emitSubImm(BB, MI, dl, InReg, OutReg);
+  }
+
+  // Emit a branch based on a condition code of not equal.
+  //
+  // \param TargetBB the destination of the branch.
+  void emitBranchNE(MachineBasicBlock *BB, MachineInstr *MI, DebugLoc &dl,
+                    MachineBasicBlock *TargetBB) {
+    TargetEmitter->emitBranchNE(BB, MI, dl, TargetBB);
+  }
+
+  // Return the register class used by the subtarget.
+  //
+  // \returns the target register class.
+  const TargetRegisterClass *getTRC() const { return TargetEmitter->getTRC(); }
+
+private:
+  const unsigned UnitSize;
+  OwningPtr<TargetStructByvalEmitter> TargetEmitter;
+  const TargetInstrInfo *TII;
+  MachineRegisterInfo &MRI;
+  const DataLayout *DL;
+
+  const TargetRegisterClass *VecTRC;
+  const unsigned VecLdOpc;
+  const unsigned VecStOpc;
+};
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitStructByval(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const {
   // This pseudo instruction has 3 operands: dst, src, size
   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
   // Otherwise, we will generate unrolled scalar copies.
@@ -7261,23 +7684,13 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
   unsigned Align = MI->getOperand(3).getImm();
   DebugLoc dl = MI->getDebugLoc();
 
-  bool isThumb2 = Subtarget->isThumb2();
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
-  unsigned ldrOpc, strOpc, UnitSize = 0;
-
-  const TargetRegisterClass *TRC = isThumb2 ?
-    (const TargetRegisterClass*)&ARM::tGPRRegClass :
-    (const TargetRegisterClass*)&ARM::GPRRegClass;
-  const TargetRegisterClass *TRC_Vec = 0;
+  unsigned UnitSize = 0;
 
   if (Align & 1) {
-    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
     UnitSize = 1;
   } else if (Align & 2) {
-    ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
-    strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
     UnitSize = 2;
   } else {
     // Check whether we can use NEON instructions.
@@ -7285,27 +7698,18 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
           hasAttribute(AttributeSet::FunctionIndex,
                        Attribute::NoImplicitFloat) &&
         Subtarget->hasNEON()) {
-      if ((Align % 16 == 0) && SizeVal >= 16) {
-        ldrOpc = ARM::VLD1q32wb_fixed;
-        strOpc = ARM::VST1q32wb_fixed;
+      if ((Align % 16 == 0) && SizeVal >= 16)
         UnitSize = 16;
-        TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
-      }
-      else if ((Align % 8 == 0) && SizeVal >= 8) {
-        ldrOpc = ARM::VLD1d32wb_fixed;
-        strOpc = ARM::VST1d32wb_fixed;
+      else if ((Align % 8 == 0) && SizeVal >= 8)
         UnitSize = 8;
-        TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
-      }
     }
     // Can't use NEON instructions.
-    if (UnitSize == 0) {
-      ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
-      strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
+    if (UnitSize == 0)
       UnitSize = 4;
-    }
   }
 
+  StructByvalEmitter ByvalEmitter(UnitSize, Subtarget, TII, MRI,
+                                  getDataLayout());
   unsigned BytesLeft = SizeVal % UnitSize;
   unsigned LoopSize = SizeVal - BytesLeft;
 
@@ -7316,67 +7720,22 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
     unsigned srcIn = src;
     unsigned destIn = dest;
     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
-      unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
-      unsigned srcOut = MRI.createVirtualRegister(TRC);
-      unsigned destOut = MRI.createVirtualRegister(TRC);
-      if (UnitSize >= 8) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(destIn).addImm(0).addReg(scratch));
-      } else if (isThumb2) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addImm(UnitSize));
-      } else {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
-          .addImm(UnitSize));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addReg(0).addImm(UnitSize));
-      }
-      srcIn = srcOut;
-      destIn = destOut;
+      std::pair<unsigned, unsigned> res =
+          ByvalEmitter.emitUnitLoad(BB, MI, dl, srcIn);
+      unsigned scratch = res.first;
+      srcIn = res.second;
+      destIn = ByvalEmitter.emitUnitStore(BB, MI, dl, destIn, scratch);
     }
 
     // Handle the leftover bytes with LDRB and STRB.
     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
     // [destOut] = STRB_POST(scratch, destIn, 1)
-    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
     for (unsigned i = 0; i < BytesLeft; i++) {
-      unsigned scratch = MRI.createVirtualRegister(TRC);
-      unsigned srcOut = MRI.createVirtualRegister(TRC);
-      unsigned destOut = MRI.createVirtualRegister(TRC);
-      if (isThumb2) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc),scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addImm(1));
-      } else {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc),scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn)
-          .addReg(0).addImm(1));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addReg(0).addImm(1));
-      }
-      srcIn = srcOut;
-      destIn = destOut;
+      std::pair<unsigned, unsigned> res =
+          ByvalEmitter.emitByteLoad(BB, MI, dl, srcIn);
+      unsigned scratch = res.first;
+      srcIn = res.second;
+      destIn = ByvalEmitter.emitByteStore(BB, MI, dl, destIn, scratch);
     }
     MI->eraseFromParent();   // The instruction is gone now.
     return BB;
@@ -7414,34 +7773,7 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
   // Load an immediate to varEnd.
-  unsigned varEnd = MRI.createVirtualRegister(TRC);
-  if (isThumb2) {
-    unsigned VReg1 = varEnd;
-    if ((LoopSize & 0xFFFF0000) != 0)
-      VReg1 = MRI.createVirtualRegister(TRC);
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
-                   .addImm(LoopSize & 0xFFFF));
-
-    if ((LoopSize & 0xFFFF0000) != 0)
-      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
-                     .addReg(VReg1)
-                     .addImm(LoopSize >> 16));
-  } else {
-    MachineConstantPool *ConstantPool = MF->getConstantPool();
-    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
-    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
-
-    // MachineConstantPool wants an explicit alignment.
-    unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
-    if (Align == 0)
-      Align = getDataLayout()->getTypeAllocSize(C->getType());
-    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
-                   .addReg(varEnd, RegState::Define)
-                   .addConstantPoolIndex(Idx)
-                   .addImm(0));
-  }
+  unsigned varEnd = ByvalEmitter.emitConstantLoad(BB, MI, dl, LoopSize);
   BB->addSuccessor(loopMBB);
 
   // Generate the loop body:
@@ -7450,12 +7782,12 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
   //   destPhi = PHI(destLoop, dst)
   MachineBasicBlock *entryBB = BB;
   BB = loopMBB;
-  unsigned varLoop = MRI.createVirtualRegister(TRC);
-  unsigned varPhi = MRI.createVirtualRegister(TRC);
-  unsigned srcLoop = MRI.createVirtualRegister(TRC);
-  unsigned srcPhi = MRI.createVirtualRegister(TRC);
-  unsigned destLoop = MRI.createVirtualRegister(TRC);
-  unsigned destPhi = MRI.createVirtualRegister(TRC);
+  unsigned varLoop = MRI.createVirtualRegister(ByvalEmitter.getTRC());
+  unsigned varPhi = MRI.createVirtualRegister(ByvalEmitter.getTRC());
+  unsigned srcLoop = MRI.createVirtualRegister(ByvalEmitter.getTRC());
+  unsigned srcPhi = MRI.createVirtualRegister(ByvalEmitter.getTRC());
+  unsigned destLoop = MRI.createVirtualRegister(ByvalEmitter.getTRC());
+  unsigned destPhi = MRI.createVirtualRegister(ByvalEmitter.getTRC());
 
   BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
     .addReg(varLoop).addMBB(loopMBB)
@@ -7469,39 +7801,16 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
 
   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
   //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
-  unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
-  if (UnitSize >= 8) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(destPhi).addImm(0).addReg(scratch));
-  } else if (isThumb2) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(scratch).addReg(destPhi)
-      .addImm(UnitSize));
-  } else {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
-      .addImm(UnitSize));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(scratch).addReg(destPhi)
-      .addReg(0).addImm(UnitSize));
+  {
+    std::pair<unsigned, unsigned> res =
+        ByvalEmitter.emitUnitLoad(BB, BB->end(), dl, srcPhi, srcLoop);
+    unsigned scratch = res.first;
+    ByvalEmitter.emitUnitStore(BB, BB->end(), dl, destPhi, scratch, destLoop);
   }
 
   // Decrement loop variable by UnitSize.
-  MachineInstrBuilder MIB = BuildMI(BB, dl,
-    TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
-  AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
-  MIB->getOperand(5).setReg(ARM::CPSR);
-  MIB->getOperand(5).setIsDef(true);
-
-  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
-    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  ByvalEmitter.emitSubImm(BB, BB->end(), dl, varPhi, varLoop);
+  ByvalEmitter.emitBranchNE(BB, BB->end(), dl, loopMBB);
 
   // loopMBB can loop back to loopMBB or fall through to exitMBB.
   BB->addSuccessor(loopMBB);
@@ -7510,36 +7819,17 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
   // Add epilogue to handle BytesLeft.
   BB = exitMBB;
   MachineInstr *StartOfExit = exitMBB->begin();
-  ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-  strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
 
   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
   //   [destOut] = STRB_POST(scratch, destLoop, 1)
   unsigned srcIn = srcLoop;
   unsigned destIn = destLoop;
   for (unsigned i = 0; i < BytesLeft; i++) {
-    unsigned scratch = MRI.createVirtualRegister(TRC);
-    unsigned srcOut = MRI.createVirtualRegister(TRC);
-    unsigned destOut = MRI.createVirtualRegister(TRC);
-    if (isThumb2) {
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
-        TII->get(ldrOpc),scratch)
-        .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
-
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
-        .addReg(scratch).addReg(destIn)
-        .addImm(1));
-    } else {
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
-        TII->get(ldrOpc),scratch)
-        .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
-
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
-        .addReg(scratch).addReg(destIn)
-        .addReg(0).addImm(1));
-    }
-    srcIn = srcOut;
-    destIn = destOut;
+    std::pair<unsigned, unsigned> res =
+        ByvalEmitter.emitByteLoad(BB, StartOfExit, dl, srcIn);
+    unsigned scratch = res.first;
+    srcIn = res.second;
+    destIn = ByvalEmitter.emitByteStore(BB, StartOfExit, dl, destIn, scratch);
   }
 
   MI->eraseFromParent();   // The instruction is gone now.