diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
index e10666dc63..7f141d634d 100644
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@@ -658,9 +658,9 @@ void VertexDecoder::Step_NormalS8ToFloat() const
 {
 	float *normal = (float *)(decoded_ + decFmt.nrmoff);
 	const s8 *sv = (const s8*)(ptr_ + nrmoff);
-	normal[0] = sv[0] * (1.0f / 128.0f);
-	normal[1] = sv[1] * (1.0f / 128.0f);
-	normal[2] = sv[2] * (1.0f / 128.0f);
+	normal[0] = (float)sv[0] * (1.0f / 128.0f);
+	normal[1] = (float)sv[1] * (1.0f / 128.0f);
+	normal[2] = (float)sv[2] * (1.0f / 128.0f);
 }
 
 void VertexDecoder::Step_NormalS16() const
diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index b9af0a7e44..0fa5ed78af 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -912,6 +912,7 @@ float MATH3D_CALL vectorGetByIndex(__m128 v) {
 
 #if defined(_M_SSE)
 // x, y, and z should be broadcast.  Should only be used through Vec3f version.
+// Note that this will read one extra float beyond the matrix, so the matrix had better not be at the end of an allocation!
 inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
 	__m128 col0 = _mm_loadu_ps(m);
 	__m128 col1 = _mm_loadu_ps(m + 3);