diff --git a/Core/MIPS/ARM/ArmAsm.cpp b/Core/MIPS/ARM/ArmAsm.cpp
index 12cf33676..d089410cd 100644
--- a/Core/MIPS/ARM/ArmAsm.cpp
+++ b/Core/MIPS/ARM/ArmAsm.cpp
@@ -65,13 +65,22 @@ void JitAt()
 	MIPSComp::jit->Compile(currentMIPS->pc);
 }
 
+/*
+double testD;
+
+u32 DoubleTest(u32 sp) {
+	volatile double local = 1.0;
+	testD += local;
+	return (u32)(&local);
+}
+
 void ShowPC(u32 sp) {
 	if (currentMIPS) {
-		WARN_LOG(HLE, "PC : %08x  ArmSP : %08x", currentMIPS->pc, sp);
+		ERROR_LOG(HLE, "ShowPC : %08x  ArmSP : %08x", currentMIPS->pc, sp);
 	} else {
 		ERROR_LOG(HLE, "Universe corrupt?");
 	}
-}
+}*/
 
 void DisassembleArm(const u8 *data, int size);
 
@@ -90,6 +99,13 @@ void Jit::GenerateFixedCode()
 	SetCC(CC_AL);
 
 	PUSH(9, R4, R5, R6, R7, R8, R9, R10, R11, _LR);
+	// Take care to 8-byte align stack for function calls: the 9 registers pushed above take
+	// 36 bytes, so pad with 4 more. This actually misaligns the stack within the JIT itself
+	// but that doesn't really matter as the JIT does not use the stack at all.
+	SUB(_SP, _SP, 4);
+
+	// QuickCallFunction(R3, (void *)&DoubleTest);
+	// QuickCallFunction(R3, (void *)&ShowPC);
 
 	// Fixed registers, these are always kept when in Jit context.
 	// R13 cannot be used as it's the stack pointer.
@@ -132,7 +148,9 @@ void Jit::GenerateFixedCode()
 			BIC(R0, R0, Operand2(0xFC, 4));
 			CMP(R1, Operand2(MIPS_EMUHACK_OPCODE >> 24, 4));
 			SetCC(CC_EQ);
-				// IDEA - we have 24 bits, why not just use offsets from base of code?
+				// IDEA - we have 26 bits, why not just use offsets from base of code?
+				// Another idea: Shift the block number left by two in the op, this would let us do
+				// LDR(R0, R9, R0, true, true); here, replacing the two next instructions.
 				ADD(R0, R9, Operand2(2, ST_LSL, R0));
 				LDR(R0, R0);
 				B(R0);
@@ -154,6 +172,8 @@ void Jit::GenerateFixedCode()
 
 	breakpointBailout = GetCodePtr();
 
+	ADD(_SP, _SP, 4);
+
 	POP(9, R4, R5, R6, R7, R8, R9, R10, R11, _PC);  // Returns
 
 	// Uncomment if you want to see the output...
diff --git a/Core/MIPS/ARM/ArmCompLoadStore.cpp b/Core/MIPS/ARM/ArmCompLoadStore.cpp
index 5479580c9..b310b8936 100644
--- a/Core/MIPS/ARM/ArmCompLoadStore.cpp
+++ b/Core/MIPS/ARM/ArmCompLoadStore.cpp
@@ -14,13 +14,35 @@
 
 // Official git repository and contact information can be found at
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+
+// Optimization ideas:
+//
+// It's common to see sequences of loads or stores accessing a contiguous set of
+// addresses in function prologues/epilogues:
+//  sw s5, 104(sp)
+//  sw s4, 100(sp)
+//  sw s3, 96(sp)
+//  sw s2, 92(sp)
+//  sw s1, 88(sp)
+//  sw s0, 84(sp)
+//  sw ra, 108(sp)
+//  mov s4, a0
+//  mov s3, a1
+//  ...
+// Such sequences could easily be detected and turned into nice contiguous
+// sequences of ARM loads/stores instead of the current 3 instructions per sw/lw.
+//
+// Also, if we kept track of the likely register content of a cached register,
+// (pointer or data), we could avoid many BIC instructions.
+
+
 #include "../../MemMap.h"
 #include "../MIPSAnalyst.h"
 #include "../../Config.h"
 #include "ArmJit.h"
 #include "ArmRegCache.h"
 
-
 #define _RS ((op>>21) & 0x1F)
 #define _RT ((op>>16) & 0x1F)
 #define _RD ((op>>11) & 0x1F)
@@ -44,7 +66,7 @@ namespace MIPSComp
 			// Don't load anything into $zr
 			return;
 		}
-		switch (o) 
+		switch (o)
 		{
 		case 37: //R(rt) = ReadMem16(addr); break; //lhu
 			Comp_Generic(op);