diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 38be9f129..73a276145 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -673,6 +673,8 @@ private:
 	void Jit_WriteMorphColor(int outOff, bool checkAlpha = true);
 	void Jit_AnyS8ToFloat(int srcoff);
 	void Jit_AnyS16ToFloat(int srcoff);
+	void Jit_AnyU8ToFloat(int srcoff);
+	void Jit_AnyU16ToFloat(int srcoff);
 	void Jit_AnyS8Morph(int srcoff, int dstoff);
 	void Jit_AnyS16Morph(int srcoff, int dstoff);
 	void Jit_AnyFloatMorph(int srcoff, int dstoff);
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index f85c732ff..54a2c53ea 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -360,9 +360,28 @@ void VertexDecoderJitCache::Jit_WeightsU16() {
 }
 
 void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
+	int j = 0;
+
+	switch (dec_->nweights) {
+	case 4:
+		// We'll at least do the first 4 fast.
+	case 5:
+	case 6:
+	case 7:
+		j = 4;
+		Jit_AnyU8ToFloat(dec_->weightoff);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		break;
+	case 8:
+		Jit_AnyU8ToFloat(dec_->weightoff);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		Jit_AnyU8ToFloat(dec_->weightoff + 4);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
+		return;
+	}
+
 	// Basic implementation - a byte at a time. TODO: Optimize
-	int j;
-	for (j = 0; j < dec_->nweights; j++) {
+	for (; j < dec_->nweights; j++) {
 		MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j));
 		CVTSI2SS(fpScratchReg, R(tempReg1));
 		MULSS(fpScratchReg, M(&by128));
@@ -375,9 +394,28 @@ void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
 }
 
 void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {
+	int j = 0;
+
+	switch (dec_->nweights) {
+	case 4:
+		// We'll at least do the first 4 fast.
+	case 5:
+	case 6:
+	case 7:
+		j = 4;
+		Jit_AnyU16ToFloat(dec_->weightoff);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		break;
+	case 8:
+		Jit_AnyU16ToFloat(dec_->weightoff);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
+		return;
+	}
+
 	// Basic implementation - a short at a time. TODO: Optimize
-	int j;
-	for (j = 0; j < dec_->nweights; j++) {
+	for (; j < dec_->nweights; j++) {
 		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2));
 		CVTSI2SS(fpScratchReg, R(tempReg1));
 		MULSS(fpScratchReg, M(&by32768));
@@ -1145,6 +1183,35 @@ void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
 	MULPS(XMM3, M(&by32768));
 }
 
+void VertexDecoderJitCache::Jit_AnyU8ToFloat(int srcoff) {
+	if (!cpu_info.bSSE4_1) {
+		XORPS(XMM3, R(XMM3));
+	}
+	MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
+	if (cpu_info.bSSE4_1) {
+		PMOVZXBD(XMM1, R(XMM1));
+	} else {
+		PUNPCKLBW(XMM1, R(XMM3));
+		PUNPCKLWD(XMM1, R(XMM3));
+	}
+	CVTDQ2PS(XMM3, R(XMM1));
+	MULPS(XMM3, M(&by128));
+}
+
+void VertexDecoderJitCache::Jit_AnyU16ToFloat(int srcoff) {
+	if (!cpu_info.bSSE4_1) {
+		XORPS(XMM3, R(XMM3));
+	}
+	MOVQ_xmm(XMM1, MDisp(srcReg, srcoff));
+	if (cpu_info.bSSE4_1) {
+		PMOVZXWD(XMM1, R(XMM1));
+	} else {
+		PUNPCKLWD(XMM1, R(XMM3));
+	}
+	CVTDQ2PS(XMM3, R(XMM1));
+	MULPS(XMM3, M(&by32768));
+}
+
 void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
 	PXOR(fpScratchReg4, R(fpScratchReg4));