diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 3ed8dfa325f..c8a52971efa 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -496,6 +496,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
   case AMDGPU::COPY: return AMDGPU::COPY;
   case AMDGPU::PHI: return AMDGPU::PHI;
+  case AMDGPU::S_MOV_B32:
+    return MI.getOperand(1).isReg() ?
+           TargetOpcode::COPY : AMDGPU::V_MOV_B32_e32;
   case AMDGPU::S_ADD_I32: return AMDGPU::V_ADD_I32_e32;
   case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
   case AMDGPU::S_SUB_I32: return AMDGPU::V_SUB_I32_e32;
@@ -680,12 +683,57 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
 
   while (!Worklist.empty()) {
     MachineInstr *Inst = Worklist.pop_back_val();
+    MachineBasicBlock *MBB = Inst->getParent();
+    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+    // Handle opcodes that need a custom expansion; they skip the code below.
+    switch(Inst->getOpcode()) {
+      case AMDGPU::S_MOV_B64: {
+        DebugLoc DL = Inst->getDebugLoc();
+
+        // If the source operand is a register, a plain COPY is enough. Queue
+        // it so that a later iteration of this loop processes the COPY.
+        if (Inst->getOperand(1).isReg()) {
+          MachineInstr *Copy = BuildMI(*MBB, Inst, DL,
+                                       get(TargetOpcode::COPY))
+                                       .addOperand(Inst->getOperand(0))
+                                       .addOperand(Inst->getOperand(1));
+          Worklist.push_back(Copy);
+        } else {
+          // Otherwise, we need to split this into two movs, because there is
+          // no 64-bit VALU move instruction.
+          unsigned LoDst, HiDst, Dst;
+          LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+          HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+          Dst = MRI.createVirtualRegister(
+              MRI.getRegClass(Inst->getOperand(0).getReg()));
+          // Lo = imm bits 31:0, Hi = imm >> 32 (NOTE(review): sign-extends?).
+          MachineInstr *Lo = BuildMI(*MBB, Inst, DL, get(AMDGPU::S_MOV_B32),
+                                     LoDst)
+                             .addImm(Inst->getOperand(1).getImm() & 0xFFFFFFFF);
+          MachineInstr *Hi = BuildMI(*MBB, Inst, DL, get(AMDGPU::S_MOV_B32),
+                                     HiDst)
+                                    .addImm(Inst->getOperand(1).getImm() >> 32);
+          // Recombine the halves: Lo becomes sub0 and Hi becomes sub1 of Dst.
+          BuildMI(*MBB, Inst, DL, get(TargetOpcode::REG_SEQUENCE), Dst)
+                  .addReg(LoDst)
+                  .addImm(AMDGPU::sub0)
+                  .addReg(HiDst)
+                  .addImm(AMDGPU::sub1);
+          // Make references to the old destination register refer to Dst.
+          MRI.replaceRegWith(Inst->getOperand(0).getReg(), Dst);
+          Worklist.push_back(Lo); // Revisit both halves on later iterations.
+          Worklist.push_back(Hi);
+        }
+        Inst->eraseFromParent();
+        continue; // Skip the generic opcode rewrite below.
+      }
+    }
+
     unsigned NewOpcode = getVALUOp(*Inst);
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
       continue;
 
-    MachineRegisterInfo &MRI = Inst->getParent()->getParent()->getRegInfo();
-
     // Use the new VALU Opcode.
     const MCInstrDesc &NewDesc = get(NewOpcode);
     Inst->setDesc(NewDesc);
diff --git a/test/CodeGen/R600/salu-to-valu.ll b/test/CodeGen/R600/salu-to-valu.ll
new file mode 100644
index 00000000000..c989c9d6722
--- /dev/null
+++ b/test/CodeGen/R600/salu-to-valu.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -march=r600 -mcpu=SI  | FileCheck %s
+
+; In this test both the pointer and the offset operands to the
+; BUFFER_LOAD instructions end up being stored in VGPRs.  This
+; requires us to add the pointer and offset together, store the
+; result in the offset operand (vaddr), and then store 0 in an
+; SGPR register pair and use that for the pointer operand
+; (low 64 bits of srsrc).
+
+; CHECK-LABEL: @mubuf
+; Make sure we aren't using VGPRs for the source operand of S_MOV_B64
+; CHECK-NOT: S_MOV_B64 s[{{[0-9]+:[0-9]+}}], v
+define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+  %0 = call i32 @llvm.r600.read.tidig.x() #1 ; presumably the work-item id in x
+  %1 = call i32 @llvm.r600.read.tidig.y() #1
+  %2 = sext i32 %0 to i64
+  %3 = sext i32 %1 to i64 ; %3 is not referenced again in this function
+  br label %loop
+
+loop:
+  %4 = phi i64 [0, %entry], [%5, %loop] ; 0 on entry, then %5 from the back edge
+  %5 = add i64 %2, %4
+  %6 = getelementptr i8 addrspace(1)* %in, i64 %5
+  %7 = load i8 addrspace(1)* %6, align 1 ; first byte load, from %in + %5
+  %8 = or i64 %5, 1
+  %9 = getelementptr i8 addrspace(1)* %in, i64 %8
+  %10 = load i8 addrspace(1)* %9, align 1 ; second byte load, from %in + (%5 | 1)
+  %11 = add i8 %7, %10
+  %12 = sext i8 %11 to i32
+  store i32 %12, i32 addrspace(1)* %out
+  %13 = icmp slt i64 %5, 10
+  br i1 %13, label %loop, label %done ; keep looping while %5 < 10 (signed)
+
+done:
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.r600.read.tidig.y() #1
+
+attributes #1 = { nounwind readnone }