From e5c6cf965b90ece9e06a7ea2295e00343d8f212b Mon Sep 17 00:00:00 2001
From: Henrik Rydgard <hrydgard@gmail.com>
Date: Fri, 16 Nov 2012 15:16:14 +0100
Subject: [PATCH] Fixes and optimizations to vertex decoding and lighting.
 Motorcycles are now visible in MotoGP.

---
 GPU/GLES/FragmentShaderGenerator.cpp |   1 -
 GPU/GLES/TextureCache.cpp            |  12 --
 GPU/GLES/TransformPipeline.cpp       | 213 +++++++++++++++++----------
 GPU/GLES/TransformPipeline.h         |   1 +
 GPU/GLES/VertexDecoder.cpp           | 213 +++++++++++++++++----------
 GPU/GLES/VertexDecoder.h             |  28 ++--
 GPU/GLES/VertexShaderGenerator.cpp   |   1 -
 Globals.h                            |  18 +++
 Windows/PPSSPP.sln                   |  34 -----
 Windows/PPSSPP.vcxproj               | 110 +-------------
 android/jni/Android.mk               |   4 +-
 11 files changed, 308 insertions(+), 327 deletions(-)

diff --git a/GPU/GLES/FragmentShaderGenerator.cpp b/GPU/GLES/FragmentShaderGenerator.cpp
index efa8425de..8b1e0b257 100644
--- a/GPU/GLES/FragmentShaderGenerator.cpp
+++ b/GPU/GLES/FragmentShaderGenerator.cpp
@@ -74,7 +74,6 @@ char *GenerateFragmentShader()
 #endif
 
 	int lmode = gstate.lmode & 1;
-	lmode = 0;  /// for now
 
 	if (gstate.textureMapEnable & 1)
 		WRITE(p, "uniform sampler2D tex;\n");
diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp
index 6066af4fb..6cc852b6c 100644
--- a/GPU/GLES/TextureCache.cpp
+++ b/GPU/GLES/TextureCache.cpp
@@ -400,18 +400,6 @@ struct DXT1Block
 	u16 color2;
 };
 
-inline u8 Convert5To8(u8 v)
-{
-	// Swizzle bits: 00012345 -> 12345123
-	return (v << 3) | (v >> 2);
-}
-
-inline u8 Convert6To8(u8 v)
-{
-	// Swizzle bits: 00123456 -> 12345612
-	return (v << 2) | (v >> 4);
-}
-
 inline u32 makecol(int r, int g, int b, int a)
 {
 	return (a << 24)|(r << 16)|(g << 8)|b;
diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp
index 15a28e08e..1e749e251 100644
--- a/GPU/GLES/TransformPipeline.cpp
+++ b/GPU/GLES/TransformPipeline.cpp
@@ -57,72 +57,86 @@ uint16_t indexBuffer[65536];	// Unused
 
 // TODO: This should really return 2 colors, one for specular and one for diffuse.
 
-void Light(float colorOut[4], const float colorIn[4], Vec3 pos, Vec3 normal, float dots[4])
-{
-	// could cache a lot of stuff, such as ambient, across vertices...
+// Precomputes the parts of the lighting calculation that are common to all
+// vertices of a draw call, so they are evaluated once instead of per vertex.
+class Lighter {
+public:
+	Lighter();
+	void Light(float colorOut0[4], float colorOut1[4], const float colorIn[4], Vec3 pos, Vec3 normal, float dots[4]);
 
-	bool doShadeMapping = (gstate.texmapmode & 0x3) == 2;
-	if (!doShadeMapping && !(gstate.lightEnable[0]&1) && !(gstate.lightEnable[1]&1) && !(gstate.lightEnable[2]&1) && !(gstate.lightEnable[3]&1))
-	{
-		memcpy(colorOut, colorIn, sizeof(float) * 4);
-		return;
-	}
-
-	Color4 emissive;
-	emissive.GetFromRGB(gstate.materialemissive);
+private:
+	bool disabled_;
 	Color4 globalAmbient;
+	Color4 materialEmissive;
+	Color4 materialAmbient;
+	Color4 materialDiffuse;
+	Color4 materialSpecular;
+	float specCoef_;
+	Vec3 viewer_;
+	bool doShadeMapping_;
+	int materialUpdate_;
+};
+
+Lighter::Lighter() {
+	disabled_ = false;
+	doShadeMapping_ = (gstate.texmapmode & 0x3) == 2;
+	if (!doShadeMapping_ && !(gstate.lightEnable[0]&1) && !(gstate.lightEnable[1]&1) && !(gstate.lightEnable[2]&1) && !(gstate.lightEnable[3]&1))
+	{
+		disabled_ = true;
+	}
+	materialEmissive.GetFromRGB(gstate.materialemissive);
+	materialEmissive.a = 0.0f;
 	globalAmbient.GetFromRGB(gstate.ambientcolor);
 	globalAmbient.GetFromA(gstate.ambientalpha);
+	materialAmbient.GetFromRGB(gstate.materialambient);
+	materialAmbient.a = 1.0f;
+	materialDiffuse.GetFromRGB(gstate.materialdiffuse);
+	materialDiffuse.a = 1.0f;
+	materialSpecular.GetFromRGB(gstate.materialspecular);
+	materialSpecular.a = 1.0f;
+	specCoef_ = getFloat24(gstate.materialspecularcoef);
+	viewer_ = Vec3(-gstate.viewMatrix[9], -gstate.viewMatrix[10], -gstate.viewMatrix[11]);
+	materialUpdate_ = gstate.materialupdate & 7;
+}
+
+void Lighter::Light(float colorOut0[4], float colorOut1[4], const float colorIn[4], Vec3 pos, Vec3 normal, float dots[4])
+{
+	if (disabled_) {
+		memcpy(colorOut0, colorIn, sizeof(float) * 4);
+		memset(colorOut1, 0, sizeof(float) * 4);
+		return;
+	}
 
 	Vec3 norm = normal.Normalized();
 	Color4 in(colorIn);
 
-	Color4 ambient;
-	if (gstate.materialupdate & 1)
-	{
-		ambient = in;
-	}
+	const Color4 *ambient;
+	if (materialUpdate_ & 1)
+		ambient = &in;
 	else
-	{
-		ambient.GetFromRGB(gstate.materialambient);
-		ambient.a=1.0f;
-	}
+		ambient = &materialAmbient;
 
-	Color4 diffuse;
-	if (gstate.materialupdate & 2)
-	{
-		diffuse = in;
-	}
+	const Color4 *diffuse;
+	if (materialUpdate_ & 2)
+		diffuse = &in;
 	else
-	{
-		diffuse.GetFromRGB(gstate.materialdiffuse);
-		diffuse.a=1.0f;
-	}
+		diffuse = &materialDiffuse;
 
-	Color4 specular;
-	if (gstate.materialupdate & 4)
-	{
-		specular = in;
-	}
+	const Color4 *specular;
+	if (materialUpdate_ & 4)
+		specular = &in;
 	else
-	{
-		specular.GetFromRGB(gstate.materialspecular);
-		specular.a=1.0f;
-	}
-
-	float specCoef = getFloat24(gstate.materialspecularcoef);
-	
-	Vec3 viewer(-gstate.viewMatrix[9], -gstate.viewMatrix[10], -gstate.viewMatrix[11]);
-
-	Color4 lightSum = globalAmbient * ambient + emissive;
-
+		specular = &materialSpecular;
+		
+	Color4 lightSum0 = globalAmbient * *ambient + materialEmissive;
+	Color4 lightSum1(0,0,0,0);
 
 	// Try lights.elf - there's something wrong with the lighting
 
 	for (int l = 0; l < 4; l++)
 	{
 		// can we skip this light?
-		if ((gstate.lightEnable[l] & 1) == 0 && !doShadeMapping)
+		if ((gstate.lightEnable[l] & 1) == 0 && !doShadeMapping_)
 			continue;
 
 		GELightComputation comp = (GELightComputation)(gstate.ltype[l]&3);
@@ -151,10 +165,9 @@ void Light(float colorOut[4], const float colorIn[4], Vec3 pos, Vec3 normal, flo
 		if (dot < 0.0f) dot = 0.0f;
 
 		if (poweredDiffuse)
-			dot = powf(dot, specCoef);
+			dot = powf(dot, specCoef_);
 
-		Color4 diff = (gstate.lightColor[1][l] * diffuse) * (dot * lightScale);	
-		Color4 spec(0,0,0,0);
+		Color4 diff = (gstate.lightColor[1][l] * *diffuse) * (dot * lightScale);	
 
 		// Real PSP specular
 		Vec3 toViewer(0,0,1);
@@ -170,20 +183,27 @@ void Light(float colorOut[4], const float colorIn[4], Vec3 pos, Vec3 normal, flo
 			dot = halfVec * norm;
 			if (dot >= 0)
 			{
-				spec += (gstate.lightColor[2][l] * specular * (powf(dot, specCoef)*lightScale));
+				lightSum1 += (gstate.lightColor[2][l] * *specular * (powf(dot, specCoef_)*lightScale));
 			}	
 		}
 		dots[l] = dot;
 		if (gstate.lightEnable[l] & 1)
 		{
-			lightSum += gstate.lightColor[0][l]*ambient + diff + spec;
+			lightSum0 += gstate.lightColor[0][l] * *ambient + diff;
 		}
 	}
 
-	for (int i = 0; i < 3; i++)
-		colorOut[i] = lightSum[i];
+	// Copy all four channels; TODO confirm alpha belongs here (the old code wrote RGB only).
+	for (int i = 0; i < 4; i++) {
+		colorOut0[i] = lightSum0[i];
+		colorOut1[i] = lightSum1[i];
+	}
 }
 
+// This is the software transform pipeline, which is necessary for supporting RECT
+// primitives correctly. Other primitives could be transformed and lit in hardware
+// using a vertex shader, which would be far faster, especially on mobile. That has
+// not been implemented yet, though.
 void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, LinkedShader *program, float *customUV, int forceIndexType)
 {
 	// First, decode the verts and apply morphing
@@ -234,6 +254,8 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li
 		vertexCount = 0x10000/3;
 #endif
 
+	Lighter lighter;
+
 	for (int i = 0; i < vertexCount; i++)
 	{	
 		int indexType = (gstate.vertType & GE_VTYPE_IDX_MASK);
@@ -255,9 +277,10 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li
 			index = i;
 		}
 
-		float v[3] = {0,0,0};
-		float c[4] = {1,1,1,1};
-		float uv[2] = {0,0};
+		float v[3] = {0, 0, 0};
+		float c0[4] = {1, 1, 1, 1};
+		float c1[4] = {0, 0, 0, 0};
+		float uv[2] = {0, 0};
 
 		if (gstate.vertType & GE_VTYPE_THROUGH_MASK)
 		{
@@ -265,8 +288,11 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li
 			for (int j=0; j<3; j++)
 				v[j] = decoded[index].pos[j];
 			// TODO : check if has color
-			for (int j=0; j<4; j++)
-				c[j] = decoded[index].color[j];
+			for (int j=0; j<4; j++) {
+				c0[j] = decoded[index].color[j] / 255.0f;
+				c1[j] = 0.0f;
+			}
+
 			// TODO : check if has uv
 			for (int j=0; j<2; j++)
 				uv[j] = decoded[index].uv[j];
@@ -304,30 +330,51 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li
 				Norm3ByMatrix43(norm, nsum.v, gstate.worldMatrix);
 			}
 
-
 			// Perform lighting here if enabled. don't need to check through, it's checked above.
 			float dots[4] = {0,0,0,0};
 			if (program->a_color0 != -1)
 			{
-				//c[1] = norm[1];
-				float litColor[4] = {0,0,0,0};
-				Light(litColor, decoded[index].color, out, norm, dots);
+				float unlitColor[4];
+				for (int j = 0; j < 4; j++) {
+					unlitColor[j] = decoded[index].color[j] / 255.0f;
+				}
+				float litColor0[4];
+				float litColor1[4];
+				lighter.Light(litColor0, litColor1, unlitColor, out, norm, dots);
+				
 				if (gstate.lightingEnable & 1)
 				{
-					memcpy(c, litColor, sizeof(litColor));
+					// gstate.lmode selects separate primary/secondary colors vs. a single summed color.
+					if (gstate.lmode & 1) {
+						// Separate colors
+						for (int j = 0; j < 4; j++) {
+							c0[j] = litColor0[j];
+							c1[j] = litColor1[j];
+						}
+					} else {
+						// Summed color into c0
+						for (int j = 0; j < 4; j++) {
+							c0[j] = litColor0[j] + litColor1[j];
+							c1[j] = 0.0f;
+						}
+					}
 				}
 				else
 				{
 					// no lighting? copy the color.
-					for (int j = 0; j < 4; j++)
-						c[j] = decoded[index].color[j];
+					for (int j = 0; j < 4; j++) {
+						c0[j] = unlitColor[j];
+						c1[j] = 0.0f;
+					}
 				}
 			}
 			else
 			{
 				// no color in the fragment program???
-				for (int j = 0; j < 4; j++)
-					c[j] = decoded[index].color[j];
+				for (int j = 0; j < 4; j++) {
+					c0[j] = decoded[index].color[j] / 255.0f;
+					c1[j] = 0.0f;
+				}
 			}
 
 			if (customUV) {
@@ -382,11 +429,13 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li
 				}
 			}
 
-			// Transform the coord by the view matrix. Should this be done before or after texcoord generation?
+			// Transform the coord by the view matrix.
+			// We only really need to do it here for RECTANGLES drawing. However,
+			// there's no point in optimizing it out because all other primitives
+			// will be moved to hardware transform anyway.
 			Vec3ByMatrix43(v, out, gstate.viewMatrix);
 		}
 
-
 		// We need to tesselate axis-aligned rectangles, as they're only specified by two coordinates.
 		if (prim == GE_PRIM_RECTANGLES)
 		{
@@ -404,42 +453,48 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li
 				trans->x = v[0]; trans->y = v[1];
 				trans->z = v[2]; 
 				trans->uv[0] = uv[0]; trans->uv[1] = uv[1];
-				memcpy(trans->color, c, 4*sizeof(float));
+				memcpy(trans->color0, c0, 4*sizeof(float));
+				memcpy(trans->color1, c1, 4*sizeof(float));
 				trans++;
 
 				// top right
 				trans->x = v2[0]; trans->y = v[1];
 				trans->z = v[2]; 
 				trans->uv[0] = uv2[0]; trans->uv[1] = uv[1];
-				memcpy(trans->color, c, 4*sizeof(float));
+				memcpy(trans->color0, c0, 4*sizeof(float));
+				memcpy(trans->color1, c1, 4*sizeof(float));
 				trans++;
 
 				// bottom right
 				trans->x = v2[0]; trans->y = v2[1];
 				trans->z = v[2]; 
 				trans->uv[0] = uv2[0]; trans->uv[1] = uv2[1];
-				memcpy(trans->color, c, 4*sizeof(float));
+				memcpy(trans->color0, c0, 4*sizeof(float));
+				memcpy(trans->color1, c1, 4*sizeof(float));
 				trans++;
 
 				// bottom left
 				trans->x = v[0]; trans->y = v2[1];
 				trans->z = v[2]; 
 				trans->uv[0] = uv[0]; trans->uv[1] = uv2[1];
-				memcpy(trans->color, c, 4*sizeof(float));
+				memcpy(trans->color0, c0, 4*sizeof(float));
+				memcpy(trans->color1, c1, 4*sizeof(float));
 				trans++;
 
 				// top left
 				trans->x = v[0]; trans->y = v[1];
 				trans->z = v[2]; 
 				trans->uv[0] = uv[0]; trans->uv[1] = uv[1];
-				memcpy(trans->color, c, 4*sizeof(float));
+				memcpy(trans->color0, c0, 4*sizeof(float));
+				memcpy(trans->color1, c1, 4*sizeof(float));
 				trans++;
 
 				// bottom right
 				trans->x = v2[0]; trans->y = v2[1];
 				trans->z = v[2]; 
 				trans->uv[0] = uv2[0]; trans->uv[1] = uv2[1];
-				memcpy(trans->color, c, 4*sizeof(float));
+				memcpy(trans->color0, c0, 4*sizeof(float));
+				memcpy(trans->color1, c1, 4*sizeof(float));
 				trans++;
 
 				numTrans += 6;
@@ -448,7 +503,8 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li
 		else
 		{
 			memcpy(&trans->x, v, 3*sizeof(float));
-			memcpy(trans->color, c, 4*sizeof(float));
+			memcpy(trans->color0, c0, 4*sizeof(float));
+			memcpy(trans->color1, c1, 4*sizeof(float));
 			memcpy(trans->uv, uv, 2*sizeof(float));
 			trans++;
 			numTrans++;
@@ -458,15 +514,18 @@ void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, Li
 	glEnableVertexAttribArray(program->a_position);
 	if (useTexCoord && program->a_texcoord != -1) glEnableVertexAttribArray(program->a_texcoord);
 	if (program->a_color0 != -1) glEnableVertexAttribArray(program->a_color0);
+	if (program->a_color1 != -1) glEnableVertexAttribArray(program->a_color1);
 	const int vertexSize = sizeof(*trans);
 	glVertexAttribPointer(program->a_position, 3, GL_FLOAT, GL_FALSE, vertexSize, transformed);
 	if (useTexCoord && program->a_texcoord != -1) glVertexAttribPointer(program->a_texcoord, 2, GL_FLOAT, GL_FALSE, vertexSize, ((uint8_t*)transformed) + 3 * 4);	
 	if (program->a_color0 != -1) glVertexAttribPointer(program->a_color0, 4, GL_FLOAT, GL_FALSE, vertexSize, ((uint8_t*)transformed) + 5 * 4);
+	if (program->a_color1 != -1) glVertexAttribPointer(program->a_color1, 4, GL_FLOAT, GL_FALSE, vertexSize, ((uint8_t*)transformed) + 9 * 4);
 	// NOTICE_LOG(G3D,"DrawPrimitive: %i", numTrans);
 	glDrawArrays(glprim[prim], 0, numTrans);
 	glDisableVertexAttribArray(program->a_position);
 	if (useTexCoord && program->a_texcoord != -1) glDisableVertexAttribArray(program->a_texcoord);
 	if (program->a_color0 != -1) glDisableVertexAttribArray(program->a_color0);
+	if (program->a_color1 != -1) glDisableVertexAttribArray(program->a_color1);
 
 	/*
 	if (((gstate.vertType ) & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_8BIT)
diff --git a/GPU/GLES/TransformPipeline.h b/GPU/GLES/TransformPipeline.h
index 7d1c24a22..330921778 100644
--- a/GPU/GLES/TransformPipeline.h
+++ b/GPU/GLES/TransformPipeline.h
@@ -19,4 +19,5 @@
 
 struct LinkedShader;
 
+
 void TransformAndDrawPrim(void *verts, void *inds, int prim, int count, LinkedShader *shader, float *customUV = 0, int forceIndexType = -1);
diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp
index 163fb373d..44bc7bb9c 100644
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@@ -46,8 +46,6 @@ inline int align(int n, int align)
 	return (n+(align-1)) & ~(align-1);
 }
 
-static int onesize;
-
 void VertexDecoder::SetVertexType(u32 fmt)
 {
 	fmt = fmt;
@@ -80,13 +78,13 @@ void VertexDecoder::SetVertexType(u32 fmt)
 		size = align(size, tcalign[tc]);
 		tcoff = size;
 		size += tcsize[tc];
-		if (tcalign[tc]>biggest)
-			biggest=tcalign[tc];
+		if (tcalign[tc] > biggest)
+			biggest = tcalign[tc];
 	}
 
 	if (col)
 	{
-		size = align(size,colalign[col]);
+		size = align(size, colalign[col]);
 		coloff = size;
 		size += colsize[col];
 		if (colalign[col] > biggest)
@@ -96,26 +94,27 @@ void VertexDecoder::SetVertexType(u32 fmt)
 	{
 		coloff = 0;
 	}
+
 	if (nrm)
 	{
-		size = align(size,nrmalign[nrm]);
+		size = align(size, nrmalign[nrm]);
 		nrmoff = size;
 		size += nrmsize[nrm];
 		if (nrmalign[nrm] > biggest)
 			biggest = nrmalign[nrm]; 
 	}
 
-	//if (pos)
+	//if (pos)  - there's always a position
 	{
-		size = align(size,posalign[pos]);
+		size = align(size, posalign[pos]);
 		posoff = size;
 		size += possize[pos];
 		if (posalign[pos] > biggest)
 			biggest = posalign[pos];
 	}
 
-	size = align(size,biggest);
-	onesize = size;
+	size = align(size, biggest);
+	onesize_ = size;
 	size *= morphcount;
 	DEBUG_LOG(G3D,"SVT : size = %i, aligned to biggest %i", size, biggest);
 }
@@ -127,24 +126,36 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 
 	char *ptr = (char *)verts;
 
-	for (int i = 0; i < count; i++)
-	{
-		int index;
-		if (idx == (GE_VTYPE_IDX_8BIT >> 11))
-		{
-			index = ((u8*)inds)[i];
-		} 
-		else if (idx == (GE_VTYPE_IDX_16BIT >> 11))
-		{
-			index = ((u16*)inds)[i];
+	// Find index bounds. Could cache this in display lists.
+	int lowerBound = 0x7FFFFFFF;
+	int upperBound = 0;
+	if (idx == (GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT)) {
+		const u8 *ind8 = (const u8 *)inds;
+		for (int i = 0; i < count; i++) {
+			if (ind8[i] < lowerBound)
+				lowerBound = ind8[i];
+			if (ind8[i] > upperBound)
+				upperBound = ind8[i];
 		}
-		else
-		{
-			index = i;
+	} else if (idx == (GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT)) {
+		const u16 *ind16 = (const u16*)inds;
+		for (int i = 0; i < count; i++) {
+			if (ind16[i] < lowerBound)
+				lowerBound = ind16[i];
+			if (ind16[i] > upperBound)
+				upperBound = ind16[i];
 		}
+	} else {
+		lowerBound = 0;
+		upperBound = count - 1;
+	}
 
+	// Decode the vertices within the found bounds, once each (unlike the previous way..)
+	for (int index = lowerBound; index <= upperBound; index++)
+	{
 		ptr = (char*)verts + (index * size);
 
+		// TODO: Should weights be morphed?
 		float *wt = decoded[index].weights;
 		switch (weighttype)
 		{
@@ -153,29 +164,30 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 
 		case GE_VTYPE_WEIGHT_8BIT >> 9:
 			{
-				u8 *wdata = (u8*)(ptr);
-				for (int j=0; j<nweights; j++)
+				const u8 *wdata = (const u8*)(ptr);
+				for (int j = 0; j < nweights; j++)
 					wt[j] = (float)wdata[j] / 255.0f;
 			}
 			break;
 
 		case GE_VTYPE_WEIGHT_16BIT >> 9:
 			{
-				u16 *wdata = (u16*)(ptr);
-				for (int j=0; j<nweights; j++)
+				const u16 *wdata = (const u16*)(ptr);
+				for (int j = 0; j < nweights; j++)
 					wt[j] = (float)wdata[j] / 65535.0f;
 			}
 			break;
 
 		case GE_VTYPE_WEIGHT_FLOAT >> 9:
 			{
-				float *wdata = (float*)(ptr+0);
-				for (int j=0; j<nweights; j++)
+				const float *wdata = (const float*)(ptr+0);
+				for (int j = 0; j < nweights; j++)
 					wt[j] = wdata[j];
 			}
 			break;
 		}
 
+		// TODO: Not morphing UV yet
 		float *uv = decoded[index].uv;
 		switch (tc)
 		{
@@ -186,15 +198,15 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 
 		case GE_VTYPE_TC_8BIT:
 			{
-				u8 *uvdata = (u8*)(ptr + tcoff);
-				for (int j=0; j<2; j++)
+				const u8 *uvdata = (const u8*)(ptr + tcoff);
+				for (int j = 0; j < 2; j++)
 					uv[j] = (float)uvdata[j]/255.0f;
 				break;
 			}
 
 		case GE_VTYPE_TC_16BIT:
 			{
-				u16 *uvdata = (u16*)(ptr + tcoff);
+				const u16 *uvdata = (const u16*)(ptr + tcoff);
 				if (throughmode)
 				{
 					uv[0] = (float)uvdata[0] / (float)(gstate.curTextureWidth);
@@ -210,81 +222,86 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 
 		case GE_VTYPE_TC_FLOAT:
 			{
-				float *uvdata = (float*)(ptr + tcoff);
-				for (int j=0; j<2; j++)
+				const float *uvdata = (const float*)(ptr + tcoff);
+				for (int j = 0; j < 2; j++)
 					uv[j] = uvdata[j];
 			}
 			break;
 		}
 
-		float *c = decoded[index].color;
+		// TODO: Not morphing color yet
+		u8 *c = decoded[index].color;
 		switch (col)
 		{
-		case GE_VTYPE_COL_4444>>2:
+		case GE_VTYPE_COL_4444 >> 2:
 			{
 				u16 cdata = *(u16*)(ptr + coloff);
 				for (int j = 0; j < 4; j++)
-					c[j] = (float)(cdata>>(j * 4) & 0xF) / 15.0f;
+					c[j] = Convert4To8((cdata >> (j * 4)) & 0xF);
 			}
 			break;
 
-		case GE_VTYPE_COL_565>>2:
+		case GE_VTYPE_COL_565 >> 2:
 			{
 				u16 cdata = *(u16*)(ptr + coloff);
-				c[0] = (float)(cdata & 0x1f) / 31.0f;
-				c[1] = (float)((cdata>>5) & 0x3f) / 63.0f;
-				c[2] = (float)((cdata>>11) & 0x1f) / 31.0f;
-				c[3] = 1.0f;
+				c[0] = Convert5To8(cdata & 0x1f);
+				c[1] = Convert6To8((cdata>>5) & 0x3f);
+				c[2] = Convert5To8((cdata>>11) & 0x1f);
+				c[3] = 255;  // 565 has no alpha; color is u8 now, so opaque is 255 (1.0f would store 1)
 			}
 			break;
 
-		case GE_VTYPE_COL_5551>>2:
+		case GE_VTYPE_COL_5551 >> 2:
 			{
 				u16 cdata = *(u16*)(ptr + coloff);
-				c[0] = (float)(cdata & 0x1f) / 31.0f;
-				c[1] = (float)((cdata>>5) & 0x1f) / 31.0f;
-				c[2] = (float)((cdata>>10) & 0x1f) / 31.0f;
-				c[3] = (float)(cdata>>15);
+				c[0] = Convert5To8(cdata & 0x1f);
+				c[1] = Convert5To8((cdata>>5) & 0x1f);
+				c[2] = Convert5To8((cdata>>10) & 0x1f);
+				c[3] = (cdata>>15) ? 255 : 0;
 			}
 			break;
 
-		case GE_VTYPE_COL_8888>>2:
+		case GE_VTYPE_COL_8888 >> 2:
 			{
+				// TODO: speedup
 				u8 *cdata = (u8*)(ptr + coloff);
-				for (int j=0; j<4; j++)
-					c[j] = (float)cdata[j] / 255.0f;
+				for (int j = 0; j < 4; j++)
+					c[j] = cdata[j];
 			}
 			break;
 
 		default:
-			c[0]=1.0f; c[1]=1.0f; c[2]=1.0f; c[3]=1.0f;
+			c[0]=255; c[1]=255; c[2]=255; c[3]=255;
 			break;
 		}
 
-
 		float *normal = decoded[index].normal;
-		memset(normal,0,sizeof(float)*3);
-		for (int n=0; n<morphcount; n++)
+		memset(normal, 0, sizeof(float)*3);
+		for (int n = 0; n < morphcount; n++)
 		{
+			float multiplier = gstate.morphWeights[n];
+			if (gstate.reversenormals & 0xFFFFFF) {
+				multiplier = -multiplier;
+			}
 			switch (nrm)
 			{
 			case 0:
 				//no normals
 				break;
 
-			case GE_VTYPE_NRM_FLOAT>>5:
+			case GE_VTYPE_NRM_FLOAT >> 5:
 				{
-					float *fv = (float*)(ptr + onesize*n + nrmoff);
-					for (int j=0; j<3; j++)
-						normal[j] += fv[j] * gstate.morphWeights[n];
+					const float *fv = (const float*)(ptr + onesize_*n + nrmoff);
+					for (int j = 0; j < 3; j++)
+						normal[j] += fv[j] * multiplier;
 				}
 				break;
 
-			case GE_VTYPE_NRM_16BIT>>5:
+			case GE_VTYPE_NRM_16BIT >> 5:
 				{
-					short *sv = (short*)(ptr + onesize*n + nrmoff);
-					for (int j=0; j<3; j++)
-						normal[j] += (sv[j]/32767.0f) * gstate.morphWeights[n];
+					const short *sv = (const short*)(ptr + onesize_*n + nrmoff);
+					for (int j = 0; j < 3; j++)
+						normal[j] += (sv[j]/32767.0f) * multiplier;
 				}
 				break;
 
@@ -294,38 +311,78 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 			}
 		}
 
-		if (gstate.reversenormals & 0xFFFFFF)
-		{
-			for (int j = 0; j < 3; j++)
-				normal[j] = -normal[j];
-		}
-
 		float *v = decoded[index].pos;
-		memset(v, 0, sizeof(float)*3);
-		for (int n = 0; n < morphcount; n++)
-		{
+
+		if (morphcount == 1) {
 			switch (pos)
 			{
-			case GE_VTYPE_POS_FLOAT>>7:
+			case GE_VTYPE_POS_FLOAT >> 7:
 				{
-					float *fv = (float*)(ptr + onesize*n + posoff);
-					for (int j=0; j<3; j++)
-						v[j] += fv[j] * gstate.morphWeights[n];
+					const float *fv = (const float*)(ptr + posoff);
+					for (int j = 0; j < 3; j++)
+						v[j] = fv[j];
 				}
 				break;
 
-			case GE_VTYPE_POS_16BIT>>7:
+			case GE_VTYPE_POS_16BIT >> 7:
 				{
-					short *sv = (short*)(ptr + onesize*n + posoff);
+					float multiplier = 1.0f / 32767.0f;
+					if (throughmode) multiplier = 1.0f;
+					const short *sv = (const short*)(ptr + posoff);
 					for (int j = 0; j < 3; j++)
-						v[j] += sv[j] * gstate.morphWeights[n];
+						v[j] = sv[j] * multiplier;
+				}
+				break;
+
+			case GE_VTYPE_POS_8BIT >> 7:
+				{
+					const s8 *sv = (const s8*)(ptr + posoff);
+					for (int j = 0; j < 3; j++)
+						v[j] = sv[j] / 127.f;
 				}
 				break;
 
 			default:
-				DEBUG_LOG(G3D,"Unknown position format %i",pos);
+				ERROR_LOG(G3D,"Unknown position format %i",pos);
 				break;
 			}
+		} else {
+			memset(v, 0, sizeof(float) * 3);
+			for (int n = 0; n < morphcount; n++)  // blend morph frame n into v, weighted by morphWeights[n]
+			{
+				switch (pos)
+				{
+				case GE_VTYPE_POS_FLOAT >> 7:
+					{
+						const float *fv = (const float*)(ptr + onesize_*n + posoff);  // frame n is onesize_*n bytes in
+						for (int j = 0; j < 3; j++)
+							v[j] += fv[j] * gstate.morphWeights[n];
+					}
+					break;
+
+				case GE_VTYPE_POS_16BIT >> 7:
+					{
+						float multiplier = 1.0f / 32767.0f;
+						if (throughmode) multiplier = 1.0f;
+						const short *sv = (const short*)(ptr + onesize_*n + posoff);
+						for (int j = 0; j < 3; j++)
+							v[j] += (sv[j] * multiplier) * gstate.morphWeights[n];
+					}
+					break;
+
+				case GE_VTYPE_POS_8BIT >> 7:
+					{
+						const s8 *sv = (const s8*)(ptr + onesize_*n + posoff);
+						for (int j = 0; j < 3; j++)
+							v[j] += (sv[j] / 127.f) * gstate.morphWeights[n];
+					}
+					break;
+
+				default:
+					ERROR_LOG(G3D,"Unknown position format %i",pos);
+					break;
+				}
+			}
 		}
 	}
 }
diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h
index ae77d1c8e..5ac8c379b 100644
--- a/GPU/GLES/VertexDecoder.h
+++ b/GPU/GLES/VertexDecoder.h
@@ -24,15 +24,16 @@ struct DecodedVertex
 	float pos[3];     // in case of morph, preblend during decode
 	float normal[3];  // in case of morph, preblend during decode
 	float uv[2];      // scaled by uscale, vscale, if there
-	float color[4];   // unlit
-	float weights[8];
+	u8 color[4];   // unlit
+	float weights[8];  // ugh, expensive
 };
 
 struct TransformedVertex
 {
 	float x, y, z;     // in case of morph, preblend during decode
 	float uv[2];      // scaled by uscale, vscale, if there
-	float color[4];   // prelit
+	float color0[4];   // prelit
+	float color1[4];   // prelit
 };
 
 
@@ -44,22 +45,30 @@ struct TransformedVertex
 //   - will compile into lighting fast specialized x86 
 //   - will not bother translating components that can be read directly
 //     by OpenGL ES. Will still have to translate 565 colors, and things
-//     like that. DecodedVertex will not be a fixed struct.
+//     like that. DecodedVertex will not be a fixed struct. Will have to
+//     do morphing here.
 //
 // We want 100% perf on 1Ghz even in vertex complex games!
 class VertexDecoder 
 {
+public:
+	VertexDecoder() : coloff(0), nrmoff(0), posoff(0) {}
+	~VertexDecoder() {}
+	void SetVertexType(u32 fmt);
+	void DecodeVerts(DecodedVertex *decoded, const void *verts, const void *inds, int prim, int count) const;
+
+private:
 	u32 fmt;
 	bool throughmode;
 	int biggest;
+	int size;
+	int onesize_;
 
 	int weightoff;
 	int tcoff;
 	int coloff;
 	int nrmoff;
 	int posoff;
-	int size;
-	int oneSize;
 
 	int tc;
 	int col;
@@ -70,11 +79,4 @@ class VertexDecoder
 	int morphcount;
 	int nweights;
 
-public:
-	VertexDecoder() : coloff(0), nrmoff(0), posoff(0) {}
-	~VertexDecoder() {}
-	void SetVertexType(u32 fmt);
-	void DecodeVerts(DecodedVertex *decoded, const void *verts, const void *inds, int prim, int count) const;
-
-	// void DoGLVertexAttribPointer()
 };
diff --git a/GPU/GLES/VertexShaderGenerator.cpp b/GPU/GLES/VertexShaderGenerator.cpp
index ecac3fe32..d48536f6a 100644
--- a/GPU/GLES/VertexShaderGenerator.cpp
+++ b/GPU/GLES/VertexShaderGenerator.cpp
@@ -60,7 +60,6 @@ char *GenerateVertexShader()
 #endif
 
 	int lmode = gstate.lmode & 1;
-	lmode = 0;    // TODO: support separate specular
 
 	WRITE("attribute vec4 a_position;");
 	WRITE("attribute vec2 a_texcoord;");
diff --git a/Globals.h b/Globals.h
index 557360114..a07fb6655 100644
--- a/Globals.h
+++ b/Globals.h
@@ -35,6 +35,24 @@ inline u32 _byteswap_ulong(u32 data)
 
 #endif
 
+inline u8 Convert4To8(u8 v)
+{
+	// Replicate the nibble: 00001234 -> 12341234
+	return (v << 4) | (v);
+}
+
+inline u8 Convert5To8(u8 v)
+{
+	// Swizzle bits: 00012345 -> 12345123
+	return (v << 3) | (v >> 2);
+}
+
+inline u8 Convert6To8(u8 v)
+{
+	// Swizzle bits: 00123456 -> 12345612
+	return (v << 2) | (v >> 4);
+}
+
 #ifndef DISALLOW_COPY_AND_ASSIGN
 #define DISALLOW_COPY_AND_ASSIGN(t) \
  private: \
diff --git a/Windows/PPSSPP.sln b/Windows/PPSSPP.sln
index 2647de4ef..0ee6604d9 100644
--- a/Windows/PPSSPP.sln
+++ b/Windows/PPSSPP.sln
@@ -35,8 +35,6 @@ Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
 		Debug|x64 = Debug|x64
-		DebugFast|Win32 = DebugFast|Win32
-		DebugFast|x64 = DebugFast|x64
 		Release|Win32 = Release|Win32
 		Release|x64 = Release|x64
 	EndGlobalSection
@@ -45,10 +43,6 @@ Global
 		{567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Debug|Win32.Build.0 = Debug|Win32
 		{567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Debug|x64.ActiveCfg = Debug|x64
 		{567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Debug|x64.Build.0 = Debug|x64
-		{567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.DebugFast|Win32.ActiveCfg = DebugFast|Win32
-		{567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.DebugFast|Win32.Build.0 = DebugFast|Win32
-		{567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.DebugFast|x64.ActiveCfg = Debug|x64
-		{567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.DebugFast|x64.Build.0 = Debug|x64
 		{567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Release|Win32.ActiveCfg = Release|Win32
 		{567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Release|Win32.Build.0 = Release|Win32
 		{567AF8DB-42C1-4D08-96CD-D70A2DFEFC6B}.Release|x64.ActiveCfg = Release|x64
@@ -57,10 +51,6 @@ Global
 		{3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Debug|Win32.Build.0 = Debug|Win32
 		{3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Debug|x64.ActiveCfg = Debug|x64
 		{3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Debug|x64.Build.0 = Debug|x64
-		{3FCDBAE2-5103-4350-9A8E-848CE9C73195}.DebugFast|Win32.ActiveCfg = Debug|Win32
-		{3FCDBAE2-5103-4350-9A8E-848CE9C73195}.DebugFast|Win32.Build.0 = Debug|Win32
-		{3FCDBAE2-5103-4350-9A8E-848CE9C73195}.DebugFast|x64.ActiveCfg = Debug|x64
-		{3FCDBAE2-5103-4350-9A8E-848CE9C73195}.DebugFast|x64.Build.0 = Debug|x64
 		{3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Release|Win32.ActiveCfg = Release|Win32
 		{3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Release|Win32.Build.0 = Release|Win32
 		{3FCDBAE2-5103-4350-9A8E-848CE9C73195}.Release|x64.ActiveCfg = Release|x64
@@ -69,10 +59,6 @@ Global
 		{F761046E-6C38-4428-A5F1-38391A37BB34}.Debug|Win32.Build.0 = Debug|Win32
 		{F761046E-6C38-4428-A5F1-38391A37BB34}.Debug|x64.ActiveCfg = Debug|x64
 		{F761046E-6C38-4428-A5F1-38391A37BB34}.Debug|x64.Build.0 = Debug|x64
-		{F761046E-6C38-4428-A5F1-38391A37BB34}.DebugFast|Win32.ActiveCfg = Debug|Win32
-		{F761046E-6C38-4428-A5F1-38391A37BB34}.DebugFast|Win32.Build.0 = Debug|Win32
-		{F761046E-6C38-4428-A5F1-38391A37BB34}.DebugFast|x64.ActiveCfg = Debug|x64
-		{F761046E-6C38-4428-A5F1-38391A37BB34}.DebugFast|x64.Build.0 = Debug|x64
 		{F761046E-6C38-4428-A5F1-38391A37BB34}.Release|Win32.ActiveCfg = Release|Win32
 		{F761046E-6C38-4428-A5F1-38391A37BB34}.Release|Win32.Build.0 = Release|Win32
 		{F761046E-6C38-4428-A5F1-38391A37BB34}.Release|x64.ActiveCfg = Release|x64
@@ -81,10 +67,6 @@ Global
 		{457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Debug|Win32.Build.0 = Debug|Win32
 		{457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Debug|x64.ActiveCfg = Debug|x64
 		{457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Debug|x64.Build.0 = Debug|x64
-		{457F45D2-556F-47BC-A31D-AFF0D15BEAED}.DebugFast|Win32.ActiveCfg = Debug|Win32
-		{457F45D2-556F-47BC-A31D-AFF0D15BEAED}.DebugFast|Win32.Build.0 = Debug|Win32
-		{457F45D2-556F-47BC-A31D-AFF0D15BEAED}.DebugFast|x64.ActiveCfg = Debug|x64
-		{457F45D2-556F-47BC-A31D-AFF0D15BEAED}.DebugFast|x64.Build.0 = Debug|x64
 		{457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Release|Win32.ActiveCfg = Release|Win32
 		{457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Release|Win32.Build.0 = Release|Win32
 		{457F45D2-556F-47BC-A31D-AFF0D15BEAED}.Release|x64.ActiveCfg = Release|x64
@@ -93,10 +75,6 @@ Global
 		{533F1D30-D04D-47CC-AD71-20F658907E36}.Debug|Win32.Build.0 = Debug|Win32
 		{533F1D30-D04D-47CC-AD71-20F658907E36}.Debug|x64.ActiveCfg = Debug|x64
 		{533F1D30-D04D-47CC-AD71-20F658907E36}.Debug|x64.Build.0 = Debug|x64
-		{533F1D30-D04D-47CC-AD71-20F658907E36}.DebugFast|Win32.ActiveCfg = Debug|Win32
-		{533F1D30-D04D-47CC-AD71-20F658907E36}.DebugFast|Win32.Build.0 = Debug|Win32
-		{533F1D30-D04D-47CC-AD71-20F658907E36}.DebugFast|x64.ActiveCfg = Debug|x64
-		{533F1D30-D04D-47CC-AD71-20F658907E36}.DebugFast|x64.Build.0 = Debug|x64
 		{533F1D30-D04D-47CC-AD71-20F658907E36}.Release|Win32.ActiveCfg = Release|Win32
 		{533F1D30-D04D-47CC-AD71-20F658907E36}.Release|Win32.Build.0 = Release|Win32
 		{533F1D30-D04D-47CC-AD71-20F658907E36}.Release|x64.ActiveCfg = Release|x64
@@ -105,10 +83,6 @@ Global
 		{E8B58922-9827-493D-81E0-4B6E6BD77171}.Debug|Win32.Build.0 = Debug|Win32
 		{E8B58922-9827-493D-81E0-4B6E6BD77171}.Debug|x64.ActiveCfg = Debug|x64
 		{E8B58922-9827-493D-81E0-4B6E6BD77171}.Debug|x64.Build.0 = Debug|x64
-		{E8B58922-9827-493D-81E0-4B6E6BD77171}.DebugFast|Win32.ActiveCfg = Debug|Win32
-		{E8B58922-9827-493D-81E0-4B6E6BD77171}.DebugFast|Win32.Build.0 = Debug|Win32
-		{E8B58922-9827-493D-81E0-4B6E6BD77171}.DebugFast|x64.ActiveCfg = Debug|x64
-		{E8B58922-9827-493D-81E0-4B6E6BD77171}.DebugFast|x64.Build.0 = Debug|x64
 		{E8B58922-9827-493D-81E0-4B6E6BD77171}.Release|Win32.ActiveCfg = Release|Win32
 		{E8B58922-9827-493D-81E0-4B6E6BD77171}.Release|Win32.Build.0 = Release|Win32
 		{E8B58922-9827-493D-81E0-4B6E6BD77171}.Release|x64.ActiveCfg = Release|x64
@@ -117,10 +91,6 @@ Global
 		{EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Debug|Win32.Build.0 = Debug|Win32
 		{EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Debug|x64.ActiveCfg = Debug|x64
 		{EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Debug|x64.Build.0 = Debug|x64
-		{EE9BD869-CAA3-447D-8328-294D90DE2C1F}.DebugFast|Win32.ActiveCfg = Debug|Win32
-		{EE9BD869-CAA3-447D-8328-294D90DE2C1F}.DebugFast|Win32.Build.0 = Debug|Win32
-		{EE9BD869-CAA3-447D-8328-294D90DE2C1F}.DebugFast|x64.ActiveCfg = Debug|x64
-		{EE9BD869-CAA3-447D-8328-294D90DE2C1F}.DebugFast|x64.Build.0 = Debug|x64
 		{EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Release|Win32.ActiveCfg = Release|Win32
 		{EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Release|Win32.Build.0 = Release|Win32
 		{EE9BD869-CAA3-447D-8328-294D90DE2C1F}.Release|x64.ActiveCfg = Release|x64
@@ -129,10 +99,6 @@ Global
 		{3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Debug|Win32.Build.0 = Debug|Win32
 		{3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Debug|x64.ActiveCfg = Debug|x64
 		{3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Debug|x64.Build.0 = Debug|x64
-		{3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.DebugFast|Win32.ActiveCfg = Debug|Win32
-		{3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.DebugFast|Win32.Build.0 = Debug|Win32
-		{3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.DebugFast|x64.ActiveCfg = Debug|x64
-		{3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.DebugFast|x64.Build.0 = Debug|x64
 		{3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Release|Win32.ActiveCfg = Release|Win32
 		{3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Release|Win32.Build.0 = Release|Win32
 		{3BAAE095-E0AB-4B0E-B5DF-CE39C8AE31DE}.Release|x64.ActiveCfg = Release|x64
diff --git a/Windows/PPSSPP.vcxproj b/Windows/PPSSPP.vcxproj
index 8d486faac..b5a069f27 100644
--- a/Windows/PPSSPP.vcxproj
+++ b/Windows/PPSSPP.vcxproj
@@ -1,14 +1,6 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="DebugFast|Win32">
-      <Configuration>DebugFast</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="DebugFast|x64">
-      <Configuration>DebugFast</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
     <ProjectConfiguration Include="Debug|Win32">
       <Configuration>Debug</Configuration>
       <Platform>Win32</Platform>
@@ -33,11 +25,6 @@
     <ProjectName>PPSSPPWindows</ProjectName>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <CharacterSet>MultiByte</CharacterSet>
-    <WholeProgramOptimization>false</WholeProgramOptimization>
-  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <CharacterSet>MultiByte</CharacterSet>
@@ -47,11 +34,6 @@
     <ConfigurationType>Application</ConfigurationType>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <CharacterSet>MultiByte</CharacterSet>
-    <WholeProgramOptimization>false</WholeProgramOptimization>
-  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <CharacterSet>MultiByte</CharacterSet>
@@ -64,10 +46,6 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
   </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
-  </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
     <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
@@ -76,10 +54,6 @@
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
     <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
   </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
-  </ImportGroup>
   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
     <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
@@ -102,18 +76,6 @@
     <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Platform)\$(Configuration)\</OutDir>
     <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Platform)\$(Configuration)\</IntDir>
     <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
-    <OutDir Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">..\</OutDir>
-    <IntDir Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">$(Configuration)\</IntDir>
-    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">false</LinkIncremental>
-    <OutDir Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">$(Platform)\$(Configuration)\</OutDir>
-    <IntDir Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">$(Platform)\$(Configuration)\</IntDir>
-    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">false</LinkIncremental>
-    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
-    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'" />
-    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'" />
-    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
-    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'" />
-    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'" />
     <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
     <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
     <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
@@ -246,94 +208,34 @@
       <FixedBaseAddress>true</FixedBaseAddress>
     </Link>
   </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">
-    <ClCompile>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;LOGGING;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <ExceptionHandling>Sync</ExceptionHandling>
-      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
-      <BufferSecurityCheck>false</BufferSecurityCheck>
-      <PrecompiledHeader>Use</PrecompiledHeader>
-      <AssemblerOutput>
-      </AssemblerOutput>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
-      <AdditionalIncludeDirectories>../common;..;../native;../native/ext/glew;../ext/zlib</AdditionalIncludeDirectories>
-      <ForcedIncludeFiles>stdafx.h</ForcedIncludeFiles>
-    </ClCompile>
-    <Link>
-      <AdditionalDependencies>XInput.lib;Winmm.lib;Ws2_32.lib;opengl32.lib;dsound.lib;glu32.lib;comctl32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
-      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <SubSystem>Windows</SubSystem>
-      <OptimizeReferences>true</OptimizeReferences>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <TargetMachine>MachineX86</TargetMachine>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">
-    <Midl>
-      <TargetEnvironment>X64</TargetEnvironment>
-    </Midl>
-    <ClCompile>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;LOGGING;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <ExceptionHandling>Sync</ExceptionHandling>
-      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
-      <BufferSecurityCheck>false</BufferSecurityCheck>
-      <PrecompiledHeader>Use</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
-      <AdditionalIncludeDirectories>../common;..;../native;../native/ext/glew;../ext/zlib</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <AdditionalDependencies>imgdecoder.lib;opengl32.lib;dsound.lib;glu32.lib;comctl32.lib;XInput.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <OutputFile>$(OutDir)DaShDebugFast.exe</OutputFile>
-      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <SubSystem>Windows</SubSystem>
-      <OptimizeReferences>true</OptimizeReferences>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <TargetMachine>MachineX64</TargetMachine>
-    </Link>
-  </ItemDefinitionGroup>
   <ItemGroup>
     <ClCompile Include="..\android\jni\EmuScreen.cpp">
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </ClCompile>
     <ClCompile Include="..\android\jni\GamepadEmu.cpp">
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </ClCompile>
     <ClCompile Include="..\android\jni\MenuScreens.cpp">
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </ClCompile>
     <ClCompile Include="..\android\jni\NativeApp.cpp">
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </ClCompile>
     <ClCompile Include="..\android\jni\UIShader.cpp">
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </ClCompile>
@@ -344,12 +246,6 @@
       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
       </ForcedIncludeFiles>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">NotUsing</PrecompiledHeader>
-      <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">
-      </ForcedIncludeFiles>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">NotUsing</PrecompiledHeader>
-      <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">
-      </ForcedIncludeFiles>
       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
       <ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
       </ForcedIncludeFiles>
@@ -368,8 +264,6 @@
     <ClCompile Include="KeyboardDevice.cpp" />
     <ClCompile Include="W32Util\DialogManager.cpp" />
     <ClCompile Include="W32Util\Misc.cpp">
-      <ObjectFileName Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">$(IntDir)%(Filename)2.obj</ObjectFileName>
-      <ObjectFileName Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">$(IntDir)%(Filename)2.obj</ObjectFileName>
       <ObjectFileName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename)2.obj</ObjectFileName>
       <ObjectFileName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename)2.obj</ObjectFileName>
       <ObjectFileName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename)2.obj</ObjectFileName>
@@ -385,8 +279,6 @@
     <ClCompile Include="..\Globals.cpp" />
     <ClCompile Include="..\main.cpp" />
     <ClCompile Include="..\stdafx.cpp">
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='DebugFast|Win32'">Create</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='DebugFast|x64'">Create</PrecompiledHeader>
       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
@@ -464,4 +356,4 @@
       <UserProperties RESOURCE_FILE="DaSh.rc" />
     </VisualStudio>
   </ProjectExtensions>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/android/jni/Android.mk b/android/jni/Android.mk
index 5b0680bcc..025c73a38 100644
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@@ -5,7 +5,7 @@ LOCAL_PATH := $(call my-dir)
 include $(CLEAR_VARS)
 
 LOCAL_MODULE := native_audio
-LOCAL_CFLAGS := -O2 -fsigned-char -Wall -Wno-multichar -Wno-psabi -std=gnu++0x
+LOCAL_CFLAGS := -O2 -fsigned-char -ffast-math -Wall -Wno-multichar -Wno-psabi -std=gnu++0x
 NATIVE := ../../native
 LOCAL_SRC_FILES := \
 		$(NATIVE)/android/native-audio-so.cpp
@@ -24,7 +24,7 @@ LOCAL_MODULE := ppsspp_jni
 NATIVE := ../../native
 SRC := ../..
 
-LOCAL_CFLAGS := -DUSE_PROFILER -DGL_GLEXT_PROTOTYPES -O2 -fsigned-char -Wall -Wno-multichar -Wno-psabi -std=gnu++0x -Wno-unused-variable -fno-strict-aliasing
+LOCAL_CFLAGS := -DUSE_PROFILER -DGL_GLEXT_PROTOTYPES -O2 -fsigned-char -Wall -Wno-multichar -Wno-psabi -std=gnu++0x -Wno-unused-variable -fno-strict-aliasing -ffast-math
 LOCAL_CPPFLAGS := 
 LOCAL_C_INCLUDES := \
   $(LOCAL_PATH)/../../Common \