Cleanup and reorganize gpu code a little

2024-11-23 05:19:56 +00:00 · 2012-12-21 16:49:42 +01:00 · 2012-12-21 16:49:42 +01:00 · 4c23d668e1
commit 4c23d668e1
parent 91744a03b0
9 changed files with 288 additions and 270 deletions
--- a/GPU/GLES/DisplayListInterpreter.cpp
+++ b/GPU/GLES/DisplayListInterpreter.cpp
@ -948,9 +948,9 @@ void GLES_GPU::ExecuteOp(u32 op, u32 diff)

 			int l = (cmd - GE_CMD_LAC0) / 3;
 			int t = (cmd - GE_CMD_LAC0) % 3;
-			gstate_c.lightColor[t][l].r = r;
-			gstate_c.lightColor[t][l].g = g;
-			gstate_c.lightColor[t][l].b = b;
+			gstate_c.lightColor[t][l][0] = r;
+			gstate_c.lightColor[t][l][1] = g;
+			gstate_c.lightColor[t][l][2] = b;
 			if (diff)
 				shaderManager.DirtyUniform(DIRTY_LIGHT0 << l);
 		}
@ -1236,7 +1236,7 @@ bool GLES_GPU::InterpretList()
 		op = Memory::ReadUnchecked_U32(dcontext.pc); //read from memory
 		u32 cmd = op >> 24;
 		u32 diff = op ^ gstate.cmdmem[cmd];
-		gstate.cmdmem[cmd] = op;	 // crashes if I try to put the whole op there??
+		gstate.cmdmem[cmd] = op;

 		ExecuteOp(op, diff);

@ -1257,6 +1257,15 @@ void GLES_GPU::UpdateStats()

 void GLES_GPU::DoBlockTransfer()
 {
+	// TODO: This is used a lot to copy data around between render targets and textures,
+	// and also to quickly load textures from RAM to VRAM. So we should do checks like the following:
+	//  * Does dstBasePtr point to an existing texture? If so invalidate it and reload it immediately.
+	//
+	//  * Does srcBasePtr point to a render target, and dstBasePtr to a texture? If so
+	//    either copy between rt and texture or reassign the texture to point to the render target
+	//
+	// etc....
+
 	u32 srcBasePtr = (gstate.transfersrc & 0xFFFFFF) | ((gstate.transfersrcw & 0xFF0000) << 8);
 	u32 srcStride = gstate.transfersrcw & 0x3FF;

--- a/GPU/GLES/DisplayListInterpreter.h
+++ b/GPU/GLES/DisplayListInterpreter.h
@ -25,6 +25,7 @@
 #include "gfx_es2/fbo.h"

 class ShaderManager;
+class LinkedShader;
 struct DecVtxFormat;

 class GLES_GPU : public GPUInterface
@ -52,7 +53,7 @@ public:
 private:
 	// TransformPipeline.cpp
 	void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, float *customUV, int forceIndexType, int *bytesRead = 0);
-	void SoftwareTransformAndDraw(int prim, LinkedShader *program, int forceIndexType, int vertexCount, void *inds, const DecVtxFormat &decVtxFormat, int indexLowerBound, int indexUpperBound, float *customUV);
+	//void SoftwareTransformAndDraw(int prim, LinkedShader *program, int forceIndexType, int vertexCount, void *inds, const DecVtxFormat &decVtxFormat, int indexLowerBound, int indexUpperBound, float *customUV);
 	void ApplyDrawState();
 	void Flush(int prim);
 	void UpdateViewportAndProjection();
--- a/GPU/GLES/ShaderManager.cpp
+++ b/GPU/GLES/ShaderManager.cpp
@ -260,9 +260,9 @@ void LinkedShader::use() {
 			glUniform3fv(u_lightpos[i], 1, gstate_c.lightpos[i]);
 			glUniform3fv(u_lightdir[i], 1, gstate_c.lightdir[i]);
 			glUniform3fv(u_lightatt[i], 1, gstate_c.lightatt[i]);
-			glUniform3fv(u_lightambient[i], 1, &gstate_c.lightColor[0][i].r);
-			glUniform3fv(u_lightdiffuse[i], 1, &gstate_c.lightColor[1][i].r);
-			glUniform3fv(u_lightspecular[i], 1, &gstate_c.lightColor[2][i].r);
+			glUniform3fv(u_lightambient[i], 1, gstate_c.lightColor[0][i]);
+			glUniform3fv(u_lightdiffuse[i], 1, gstate_c.lightColor[1][i]);
+			glUniform3fv(u_lightspecular[i], 1, gstate_c.lightColor[2][i]);
 		}
 	}

--- a/GPU/GLES/ShaderManager.h
+++ b/GPU/GLES/ShaderManager.h
@ -23,10 +23,11 @@
 #include "VertexShaderGenerator.h"
 #include "FragmentShaderGenerator.h"

-struct Shader;
+class Shader;

-struct LinkedShader
+class LinkedShader
 {
+public:
 	LinkedShader(Shader *vs, Shader *fs);
 	~LinkedShader();

@ -116,11 +117,12 @@ enum

 // Real public interface

-struct Shader
-{
+class Shader {
+public:
 	Shader(const char *code, uint32_t shaderType);
 	uint32_t shader;
 	const std::string &source() const { return source_; }
+
 private:
 	std::string source_;
 };
--- a/GPU/GLES/StateMapping.cpp
+++ b/GPU/GLES/StateMapping.cpp
@ -1,4 +1,11 @@
 #include "StateMapping.h"
+#include "../../native/gfx_es2/gl_state.h"
+
+#include "../Math3D.h"
+#include "../GPUState.h"
+#include "../ge_constants.h"
+#include "DisplayListInterpreter.h"
+#include "ShaderManager.h"

 const GLint aLookup[] = {
 	GL_DST_COLOR,
@ -51,3 +58,165 @@ const GLuint ztests[] =
 	GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, 
 	GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL,
 };
+
+void GLES_GPU::ApplyDrawState()
+{
+
+	// TODO: All this setup is soon so expensive that we'll need dirty flags, or simply do it in the command writes where we detect dirty by xoring. Silly to do all this work on every drawcall.
+
+	// TODO: The top bit of the alpha channel should be written to the stencil bit somehow. This appears to require very expensive multipass rendering :( Alternatively, one could do a
+	// single fullscreen pass that converts alpha to stencil (or 2 passes, to set both the 0 and 1 values) very easily.
+
+	// Set cull
+	bool wantCull = !gstate.isModeClear() && !gstate.isModeThrough() && gstate.isCullEnabled();
+	glstate.cullFace.set(wantCull);
+
+	if(wantCull) {
+		u8 cullMode = gstate.getCullMode();
+		glstate.cullFaceMode.set(cullingMode[cullMode]);
+	}
+
+	// Set blend
+	bool wantBlend = !gstate.isModeClear() && (gstate.alphaBlendEnable & 1);
+	glstate.blend.set(wantBlend);
+	if(wantBlend) {
+		// This can't be done exactly as there are several PSP blend modes that are impossible to do on OpenGL ES 2.0, and some even on regular OpenGL for desktop.
+		// HOWEVER - we should be able to approximate the 2x modes in the shader, although they will clip wrongly.
+
+		// Examples of seen unimplementable blend states:
+		// Mortal Kombat Unchained: FixA=0000ff FixB=000080 FuncA=10 FuncB=10
+
+		int blendFuncA  = gstate.getBlendFuncA();
+		int blendFuncB  = gstate.getBlendFuncB();
+		int blendFuncEq = gstate.getBlendEq();
+
+		glstate.blendEquation.set(eqLookup[blendFuncEq]);
+
+		if (blendFuncA != GE_SRCBLEND_FIXA && blendFuncB != GE_DSTBLEND_FIXB) {
+			// All is valid, no blendcolor needed
+			glstate.blendFunc.set(aLookup[blendFuncA], bLookup[blendFuncB]);
+		} else {
+			GLuint glBlendFuncA = blendFuncA == GE_SRCBLEND_FIXA ? GL_INVALID_ENUM : aLookup[blendFuncA];
+			GLuint glBlendFuncB = blendFuncB == GE_DSTBLEND_FIXB ? GL_INVALID_ENUM : bLookup[blendFuncB];
+			u32 fixA = gstate.getFixA();
+			u32 fixB = gstate.getFixB();
+			// Shortcut by using GL_ONE where possible, no need to set blendcolor
+			if (glBlendFuncA == GL_INVALID_ENUM && blendFuncA == GE_SRCBLEND_FIXA) {
+				if (fixA == 0xFFFFFF)
+					glBlendFuncA = GL_ONE;
+				else if (fixA == 0)
+					glBlendFuncA = GL_ZERO;
+			} 
+			if (glBlendFuncB == GL_INVALID_ENUM && blendFuncB == GE_DSTBLEND_FIXB) {
+				if (fixB == 0xFFFFFF)
+					glBlendFuncB = GL_ONE;
+				else if (fixB == 0)
+					glBlendFuncB = GL_ZERO;
+			}
+			if (glBlendFuncA == GL_INVALID_ENUM && glBlendFuncB != GL_INVALID_ENUM) {
+				// Can use blendcolor trivially.
+				const float blendColor[4] = {(fixA & 0xFF)/255.0f, ((fixA >> 8) & 0xFF)/255.0f, ((fixA >> 16) & 0xFF)/255.0f, 1.0f};
+				glstate.blendColor.set(blendColor);
+				glBlendFuncA = GL_CONSTANT_COLOR;
+			} else if (glBlendFuncA != GL_INVALID_ENUM && glBlendFuncB == GL_INVALID_ENUM) {
+				// Can use blendcolor trivially.
+				const float blendColor[4] = {(fixB & 0xFF)/255.0f, ((fixB >> 8) & 0xFF)/255.0f, ((fixB >> 16) & 0xFF)/255.0f, 1.0f};
+				glstate.blendColor.set(blendColor);
+				glBlendFuncB = GL_CONSTANT_COLOR;
+			} else if (glBlendFuncA == GL_INVALID_ENUM && glBlendFuncB == GL_INVALID_ENUM) {  // Should also check for approximate equality
+				if (fixA == (fixB ^ 0xFFFFFF)) {
+					glBlendFuncA = GL_CONSTANT_COLOR;
+					glBlendFuncB = GL_ONE_MINUS_CONSTANT_COLOR;
+					const float blendColor[4] = {(fixA & 0xFF)/255.0f, ((fixA >> 8) & 0xFF)/255.0f, ((fixA >> 16) & 0xFF)/255.0f, 1.0f};
+					glstate.blendColor.set(blendColor);
+				} else if (fixA == fixB) {
+					glBlendFuncA = GL_CONSTANT_COLOR;
+					glBlendFuncB = GL_CONSTANT_COLOR;
+					const float blendColor[4] = {(fixA & 0xFF)/255.0f, ((fixA >> 8) & 0xFF)/255.0f, ((fixA >> 16) & 0xFF)/255.0f, 1.0f};
+					glstate.blendColor.set(blendColor);
+				} else {
+					DEBUG_LOG(HLE, "ERROR INVALID blendcolorstate: FixA=%06x FixB=%06x FuncA=%i FuncB=%i", gstate.getFixA(), gstate.getFixB(), gstate.getBlendFuncA(), gstate.getBlendFuncB());
+					glBlendFuncA = GL_ONE;
+					glBlendFuncB = GL_ONE;
+				}
+			}
+			// At this point, through all paths above, glBlendFuncA and glBlendFuncB will be set somehow.
+
+			glstate.blendFunc.set(glBlendFuncA, glBlendFuncB);
+		}
+	}
+
+	bool wantDepthTest = gstate.isModeClear() || gstate.isDepthTestEnabled();
+	glstate.depthTest.set(wantDepthTest);
+	if(wantDepthTest) {
+		// Force GL_ALWAYS if mode clear
+		int depthTestFunc = gstate.isModeClear() ? 1 : gstate.getDepthTestFunc();
+		glstate.depthFunc.set(ztests[depthTestFunc]);
+	}
+
+	bool wantDepthWrite = gstate.isModeClear() || gstate.isDepthWriteEnabled();
+	glstate.depthWrite.set(wantDepthWrite ? GL_TRUE : GL_FALSE);
+
+	float depthRangeMin = gstate_c.zOff - gstate_c.zScale;
+	float depthRangeMax = gstate_c.zOff + gstate_c.zScale;
+	glstate.depthRange.set(depthRangeMin, depthRangeMax);
+}
+
+void GLES_GPU::UpdateViewportAndProjection()
+{
+	bool throughmode = (gstate.vertType & GE_VTYPE_THROUGH_MASK) != 0;
+
+	// We can probably use these to simply set scissors? Maybe we need to offset by regionX1/Y1
+	int regionX1 = gstate.region1 & 0x3FF;
+	int regionY1 = (gstate.region1 >> 10) & 0x3FF;
+	int regionX2 = (gstate.region2 & 0x3FF) + 1;
+	int regionY2 = ((gstate.region2 >> 10) & 0x3FF) + 1;
+
+	float offsetX = (float)(gstate.offsetx & 0xFFFF) / 16.0f;
+	float offsetY = (float)(gstate.offsety & 0xFFFF) / 16.0f;
+
+	if (throughmode) {
+		// No viewport transform here. Let's experiment with using region.
+		return;
+		glViewport((0 + regionX1) * renderWidthFactor_, (0 - regionY1) * renderHeightFactor_, (regionX2 - regionX1) * renderWidthFactor_, (regionY2 - regionY1) * renderHeightFactor_);
+	} else {
+		// These we can turn into a glViewport call, offset by offsetX and offsetY. Math after.
+		float vpXa = getFloat24(gstate.viewportx1);
+		float vpXb = getFloat24(gstate.viewportx2);
+		float vpYa = getFloat24(gstate.viewporty1);
+		float vpYb = getFloat24(gstate.viewporty2);
+		float vpZa = getFloat24(gstate.viewportz1);  //  / 65536.0f   should map it to OpenGL's 0.0-1.0 Z range
+		float vpZb = getFloat24(gstate.viewportz2);  //  / 65536.0f
+
+		// The viewport transform appears to go like this: 
+		// Xscreen = -offsetX + vpXb + vpXa * Xview
+		// Yscreen = -offsetY + vpYb + vpYa * Yview
+		// Zscreen = vpZb + vpZa * Zview
+
+		// This means that to get the analogue glViewport we must:
+		float vpX0 = vpXb - offsetX - vpXa;
+		float vpY0 = vpYb - offsetY + vpYa;   // Need to account for sign of Y
+		gstate_c.vpWidth = vpXa * 2;
+		gstate_c.vpHeight = -vpYa * 2;
+
+		return;
+
+		float vpWidth = fabsf(gstate_c.vpWidth);
+		float vpHeight = fabsf(gstate_c.vpHeight);
+
+		// TODO: These two should feed into glDepthRange somehow.
+		float vpZ0 = (vpZb - vpZa) / 65536.0f;
+		float vpZ1 = (vpZa * 2) / 65536.0f;
+
+		vpX0 *= renderWidthFactor_;
+		vpY0 *= renderHeightFactor_;
+		vpWidth *= renderWidthFactor_;
+		vpHeight *= renderHeightFactor_;
+
+		// Flip vpY0 to match the OpenGL coordinate system.
+		vpY0 = renderHeight_ - (vpY0 + vpHeight);
+		glViewport(vpX0, vpY0, vpWidth, vpHeight);
+		// Sadly, as glViewport takes integers, we will not be able to support sub pixel offsets this way. But meh.
+		shaderManager_->DirtyUniform(DIRTY_PROJMATRIX);
+	}
+}
--- a/GPU/GLES/TransformPipeline.cpp
+++ b/GPU/GLES/TransformPipeline.cpp
@ -31,8 +31,7 @@
 #include "ShaderManager.h"
 #include "DisplayListInterpreter.h"

-GLuint glprim[8] =
-{
+const GLuint glprim[8] = {
 	GL_POINTS,
 	GL_LINES,
 	GL_LINE_STRIP,
@ -43,9 +42,11 @@ GLuint glprim[8] =
 };

 u8 decoded[65536 * 32];
+// uint16_t decIndex[65536];	// Unused
+
 TransformedVertex transformed[65536];
 TransformedVertex transformedExpanded[65536];
-uint16_t indexBuffer[65536];	// Unused
+

 // TODO: This should really return 2 colors, one for specular and one for diffuse.

@ -159,7 +160,8 @@ void Lighter::Light(float colorOut0[4], float colorOut1[4], const float colorIn[
 			if (lightScale > 1.0f) lightScale = 1.0f;
 		}

-		Color4 diff = (gstate_c.lightColor[1][l] * *diffuse) * (dot * lightScale);
+		Color4 lightDiff(gstate_c.lightColor[1][l], 0.0f);
+		Color4 diff = (lightDiff * *diffuse) * (dot * lightScale);

 		// Real PSP specular
 		Vec3 toViewer(0,0,1);
@ -175,13 +177,15 @@ void Lighter::Light(float colorOut0[4], float colorOut1[4], const float colorIn[
 			dot = halfVec * norm;
 			if (dot >= 0)
 			{
-				lightSum1 += (gstate_c.lightColor[2][l] * *specular * (powf(dot, specCoef_)*lightScale));
+				Color4 lightSpec(gstate_c.lightColor[2][l], 0.0f);
+				lightSum1 += (lightSpec * *specular * (powf(dot, specCoef_)*lightScale));
 			}
 		}
 		dots[l] = dot;
 		if (gstate.lightEnable[l] & 1)
 		{
-			lightSum0 += gstate_c.lightColor[0][l] * *ambient + diff;
+			Color4 lightAmbient(gstate_c.lightColor[2][l], 1.0f);
+			lightSum0 += lightAmbient * *ambient + diff;
 		}
 	}

@ -243,7 +247,22 @@ static void DesetupDecFmtForDraw(LinkedShader *program, const DecVtxFormat &decF
 	VertexAttribDisable(program->a_position, decFmt.posfmt);
 }

-void GLES_GPU::SoftwareTransformAndDraw(int prim, LinkedShader *program, int forceIndexType, int vertexCount, void *inds, const DecVtxFormat &decVtxFormat, int indexLowerBound, int indexUpperBound, float *customUV)
+// This is the software transform pipeline, which is necessary for supporting RECT
+// primitives correctly, and may be easier to use for debugging than the hardware
+// transform pipeline.
+
+// There's code here that simply expands transformed RECTANGLES into plain triangles.
+
+// We're gonna have to keep software transforming RECTANGLES, unless we use a geom shader which we can't on OpenGL ES 2.0.
+// Usually, though, these primitives don't use lighting etc so it's no biggie performance wise, but it would be nice to get rid of
+// this code.
+
+// Actually, if we find the camera-relative right and down vectors, it might even be possible to add the extra points in pre-transformed
+// space and thus make decent use of hardware transform.
+
+// Actually again, single quads could be drawn more efficiently using GL_TRIANGLE_STRIP, no need to duplicate verts as for
+// GL_TRIANGLES. Still need to sw transform to compute the extra two corners though.
+void SoftwareTransformAndDraw(int prim, LinkedShader *program, int forceIndexType, int vertexCount, void *inds, const DecVtxFormat &decVtxFormat, int indexLowerBound, int indexUpperBound, float *customUV)
 {
 	/*
 	DEBUG_LOG(G3D, "View matrix:");
@ -254,28 +273,12 @@ void GLES_GPU::SoftwareTransformAndDraw(int prim, LinkedShader *program, int for
 	DEBUG_LOG(G3D, "%f %f %f", m[9], m[10], m[11]);
 	*/

-	// Then, transform and draw in one big swoop (urgh!)
-	// need to move this to the shader.
-
-	// We're gonna have to keep software transforming RECTANGLES, unless we use a geom shader which we can't on OpenGL ES 2.0.
-	// Usually, though, these primitives don't use lighting etc so it's no biggie performance wise, but it would be nice to get rid of
-	// this code.
-
-	// Actually, if we find the camera-relative right and down vectors, it might even be possible to add the extra points in pre-transformed
-	// space and thus make decent use of hardware transform.
-
-	// Actually again, single quads could be drawn more efficiently using GL_TRIANGLE_STRIP, no need to duplicate verts as for
-	// GL_TRIANGLES. Still need to sw transform to compute the extra two corners though.
-	
 	// Temporary storage for RECTANGLES emulation
 	float v2[3] = {0};
 	float uv2[2] = {0};

 	bool throughmode = (gstate.vertType & GE_VTYPE_THROUGH_MASK) != 0;

-
-	// TODO: Could use glDrawElements in some cases, see below.
-
 	// TODO: Split up into multiple draw calls for GLES 2.0 where you can't guarantee support for more than 0x10000 verts.

 #if defined(USING_GLES2)
@ -453,7 +456,7 @@ void GLES_GPU::SoftwareTransformAndDraw(int prim, LinkedShader *program, int for
 					}
 					break;
 				case 2:
-					// Shade mapping - use dots from light sources to generate U and V.
+					// Shade mapping - use dot products from light sources to generate U and V.
 					{
 						uv[0] = dots[gstate.getUVLS0()];
 						uv[1] = dots[gstate.getUVLS1()];
@ -466,21 +469,17 @@ void GLES_GPU::SoftwareTransformAndDraw(int prim, LinkedShader *program, int for
 			}

 			// Transform the coord by the view matrix.
-			// We only really need to do it here for RECTANGLES drawing. However,
-			// there's no point in optimizing it out because all other primitives
-			// will be moved to hardware transform anyway.
 			Vec3ByMatrix43(v, out, gstate.viewMatrix);
 		}

-		// TODO: Write to a flexible buffer.
+		// TODO: Write to a flexible buffer, we don't always need all four components.
 		memcpy(&transformed[index].x, v, 3 * sizeof(float));
 		memcpy(&transformed[index].uv, uv, 2 * sizeof(float));
 		memcpy(&transformed[index].color0, c0, 4 * sizeof(float));
 		memcpy(&transformed[index].color1, c1, 3 * sizeof(float));
 	}

-	// Step 2: Expand using the index buffer, and expand rectangles.
-
+	// Step 2: expand rectangles.
 	const TransformedVertex *drawBuffer = transformed;
 	int numTrans = 0;

@ -488,7 +487,6 @@ void GLES_GPU::SoftwareTransformAndDraw(int prim, LinkedShader *program, int for
 	if (forceIndexType != -1) {
 		indexType = forceIndexType;
 	}
-
 	bool drawIndexed = false;
 	GLuint glIndexType = 0;

@ -581,7 +579,8 @@ void GLES_GPU::SoftwareTransformAndDraw(int prim, LinkedShader *program, int for
 		}
 	}

-	// TODO: Make a cache for glEnableVertexAttribArray and glVertexAttribPtr states, these spam the gDebugger log.
+	// TODO: Make a cache for glEnableVertexAttribArray and glVertexAttribPtr states,
+	// these spam the gDebugger log.
 	glEnableVertexAttribArray(program->a_position);
 	if (program->a_texcoord != -1) glEnableVertexAttribArray(program->a_texcoord);
 	if (program->a_color0 != -1) glEnableVertexAttribArray(program->a_color0);
@ -591,7 +590,6 @@ void GLES_GPU::SoftwareTransformAndDraw(int prim, LinkedShader *program, int for
 	if (program->a_texcoord != -1) glVertexAttribPointer(program->a_texcoord, 2, GL_FLOAT, GL_FALSE, vertexSize, ((uint8_t*)drawBuffer) + 3 * 4);
 	if (program->a_color0 != -1) glVertexAttribPointer(program->a_color0, 4, GL_FLOAT, GL_FALSE, vertexSize, ((uint8_t*)drawBuffer) + 5 * 4);
 	if (program->a_color1 != -1) glVertexAttribPointer(program->a_color1, 3, GL_FLOAT, GL_FALSE, vertexSize, ((uint8_t*)drawBuffer) + 9 * 4);
-	// NOTICE_LOG(G3D,"DrawPrimitive: %i", numTrans);
 	if (drawIndexed) {
 		glDrawElements(glprim[prim], numTrans, glIndexType, (GLvoid *)inds);
 	} else {
@ -603,10 +601,6 @@ void GLES_GPU::SoftwareTransformAndDraw(int prim, LinkedShader *program, int for
 	if (program->a_color1 != -1) glDisableVertexAttribArray(program->a_color1);
 }

-// This is the software transform pipeline, which is necessary for supporting RECT
-// primitives correctly. Other primitives are possible to transform and light in hardware
-// using vertex shader, which will be way, way faster, especially on mobile. This has
-// not yet been implemented though.
 void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, float *customUV, int forceIndexType, int *bytesRead)
 {
 	int indexLowerBound, indexUpperBound;
@ -653,7 +647,6 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte

 	if (CanUseHardwareTransform(prim)) {
 		SetupDecFmtForDraw(program, dec.GetDecVtxFmt(), decoded);
-
 		bool drawIndexed;
 		GLuint glIndexType;
 		int indexType = (gstate.vertType & GE_VTYPE_IDX_MASK);
@ -680,7 +673,6 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 		} else {
 			glDrawArrays(glprim[prim], 0, numTrans);
 		}
-
 		DesetupDecFmtForDraw(program, dec.GetDecVtxFmt());
 	} else {
 		SoftwareTransformAndDraw(prim, program, forceIndexType, vertexCount, inds, dec.GetDecVtxFmt(), indexLowerBound, indexUpperBound, customUV);
@ -690,161 +682,3 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 void GLES_GPU::Flush(int prim) {
 	// TODO
 }
-
-void GLES_GPU::ApplyDrawState()
-{
-
-	// TODO: All this setup is soon so expensive that we'll need dirty flags, or simply do it in the command writes where we detect dirty by xoring. Silly to do all this work on every drawcall.
-
-	// TODO: The top bit of the alpha channel should be written to the stencil bit somehow. This appears to require very expensive multipass rendering :( Alternatively, one could do a
-	// single fullscreen pass that converts alpha to stencil (or 2 passes, to set both the 0 and 1 values) very easily.
-
-	// Set cull
-	bool wantCull = !gstate.isModeClear() && !gstate.isModeThrough() && gstate.isCullEnabled();
-	glstate.cullFace.set(wantCull);
-
-	if(wantCull) {
-		u8 cullMode = gstate.getCullMode();
-		glstate.cullFaceMode.set(cullingMode[cullMode]);
-	}
-
-	// Set blend
-	bool wantBlend = !gstate.isModeClear() && (gstate.alphaBlendEnable & 1);
-	glstate.blend.set(wantBlend);
-	if(wantBlend) {
-		// This can't be done exactly as there are several PSP blend modes that are impossible to do on OpenGL ES 2.0, and some even on regular OpenGL for desktop.
-		// HOWEVER - we should be able to approximate the 2x modes in the shader, although they will clip wrongly.
-		int blendFuncA  = gstate.getBlendFuncA();
-		int blendFuncB  = gstate.getBlendFuncB();
-		int blendFuncEq = gstate.getBlendEq();
-
-		glstate.blendEquation.set(eqLookup[blendFuncEq]);
-
-		if (blendFuncA != GE_SRCBLEND_FIXA && blendFuncB != GE_DSTBLEND_FIXB) {
-			// All is valid, no blendcolor needed
-			glstate.blendFunc.set(aLookup[blendFuncA], bLookup[blendFuncB]);
-		} else {
-			GLuint glBlendFuncA = blendFuncA == GE_SRCBLEND_FIXA ? GL_INVALID_ENUM : aLookup[blendFuncA];
-			GLuint glBlendFuncB = blendFuncB == GE_DSTBLEND_FIXB ? GL_INVALID_ENUM : bLookup[blendFuncB];
-			u32 fixA = gstate.getFixA();
-			u32 fixB = gstate.getFixB();
-			// Shortcut by using GL_ONE where possible, no need to set blendcolor
-			if (glBlendFuncA == GL_INVALID_ENUM && blendFuncA == GE_SRCBLEND_FIXA) {
-				if (fixA == 0xFFFFFF)
-					glBlendFuncA = GL_ONE;
-				else if (fixA == 0)
-					glBlendFuncA = GL_ZERO;
-			} 
-			if (glBlendFuncB == GL_INVALID_ENUM && blendFuncB == GE_DSTBLEND_FIXB) {
-				if (fixB == 0xFFFFFF)
-					glBlendFuncB = GL_ONE;
-				else if (fixB == 0)
-					glBlendFuncB = GL_ZERO;
-			}
-			if (glBlendFuncA == GL_INVALID_ENUM && glBlendFuncB != GL_INVALID_ENUM) {
-				// Can use blendcolor trivially.
-				const float blendColor[4] = {(fixA & 0xFF)/255.0f, ((fixA >> 8) & 0xFF)/255.0f, ((fixA >> 16) & 0xFF)/255.0f, 1.0f};
-				glstate.blendColor.set(blendColor);
-				glBlendFuncA = GL_CONSTANT_COLOR;
-			} else if (glBlendFuncA != GL_INVALID_ENUM && glBlendFuncB == GL_INVALID_ENUM) {
-				// Can use blendcolor trivially.
-				const float blendColor[4] = {(fixB & 0xFF)/255.0f, ((fixB >> 8) & 0xFF)/255.0f, ((fixB >> 16) & 0xFF)/255.0f, 1.0f};
-				glstate.blendColor.set(blendColor);
-				glBlendFuncB = GL_CONSTANT_COLOR;
-			} else if (glBlendFuncA == GL_INVALID_ENUM && glBlendFuncB == GL_INVALID_ENUM) {  // Should also check for approximate equality
-				if (fixA == (fixB ^ 0xFFFFFF)) {
-					glBlendFuncA = GL_CONSTANT_COLOR;
-					glBlendFuncB = GL_ONE_MINUS_CONSTANT_COLOR;
-					const float blendColor[4] = {(fixA & 0xFF)/255.0f, ((fixA >> 8) & 0xFF)/255.0f, ((fixA >> 16) & 0xFF)/255.0f, 1.0f};
-					glstate.blendColor.set(blendColor);
-				} else if (fixA == fixB) {
-					glBlendFuncA = GL_CONSTANT_COLOR;
-					glBlendFuncB = GL_CONSTANT_COLOR;
-					const float blendColor[4] = {(fixA & 0xFF)/255.0f, ((fixA >> 8) & 0xFF)/255.0f, ((fixA >> 16) & 0xFF)/255.0f, 1.0f};
-					glstate.blendColor.set(blendColor);
-				} else {
-					NOTICE_LOG(HLE, "ERROR INVALID blendcolorstate: FixA=%06x FixB=%06x FuncA=%i FuncB=%i", gstate.getFixA(), gstate.getFixB(), gstate.getBlendFuncA(), gstate.getBlendFuncB());
-					glBlendFuncA = GL_ONE;
-					glBlendFuncB = GL_ONE;
-				}
-			}
-			// At this point, through all paths above, glBlendFuncA and glBlendFuncB will be set somehow.
-
-			glstate.blendFunc.set(glBlendFuncA, glBlendFuncB);
-		}
-	}
-
-	bool wantDepthTest = gstate.isModeClear() || gstate.isDepthTestEnabled();
-	glstate.depthTest.set(wantDepthTest);
-	if(wantDepthTest) {
-		// Force GL_ALWAYS if mode clear
-		int depthTestFunc = gstate.isModeClear() ? 1 : gstate.getDepthTestFunc();
-		glstate.depthFunc.set(ztests[depthTestFunc]);
-	}
-
-	bool wantDepthWrite = gstate.isModeClear() || gstate.isDepthWriteEnabled();
-	glstate.depthWrite.set(wantDepthWrite ? GL_TRUE : GL_FALSE);
-
-	float depthRangeMin = gstate_c.zOff - gstate_c.zScale;
-	float depthRangeMax = gstate_c.zOff + gstate_c.zScale;
-	glstate.depthRange.set(depthRangeMin, depthRangeMax);
-}
-
-void GLES_GPU::UpdateViewportAndProjection()
-{
-	bool throughmode = (gstate.vertType & GE_VTYPE_THROUGH_MASK) != 0;
-
-	// We can probably use these to simply set scissors? Maybe we need to offset by regionX1/Y1
-	int regionX1 = gstate.region1 & 0x3FF;
-	int regionY1 = (gstate.region1 >> 10) & 0x3FF;
-	int regionX2 = (gstate.region2 & 0x3FF) + 1;
-	int regionY2 = ((gstate.region2 >> 10) & 0x3FF) + 1;
-
-	float offsetX = (float)(gstate.offsetx & 0xFFFF) / 16.0f;
-	float offsetY = (float)(gstate.offsety & 0xFFFF) / 16.0f;
-
-	if (throughmode) {
-		// No viewport transform here. Let's experiment with using region.
-		return;
-		glViewport((0 + regionX1) * renderWidthFactor_, (0 - regionY1) * renderHeightFactor_, (regionX2 - regionX1) * renderWidthFactor_, (regionY2 - regionY1) * renderHeightFactor_);
-	} else {
-		// These we can turn into a glViewport call, offset by offsetX and offsetY. Math after.
-		float vpXa = getFloat24(gstate.viewportx1);
-		float vpXb = getFloat24(gstate.viewportx2);
-		float vpYa = getFloat24(gstate.viewporty1);
-		float vpYb = getFloat24(gstate.viewporty2);
-		float vpZa = getFloat24(gstate.viewportz1);  //  / 65536.0f   should map it to OpenGL's 0.0-1.0 Z range
-		float vpZb = getFloat24(gstate.viewportz2);  //  / 65536.0f
-
-		// The viewport transform appears to go like this: 
-		// Xscreen = -offsetX + vpXb + vpXa * Xview
-		// Yscreen = -offsetY + vpYb + vpYa * Yview
-		// Zscreen = vpZb + vpZa * Zview
-
-		// This means that to get the analogue glViewport we must:
-		float vpX0 = vpXb - offsetX - vpXa;
-		float vpY0 = vpYb - offsetY + vpYa;   // Need to account for sign of Y
-		gstate_c.vpWidth = vpXa * 2;
-		gstate_c.vpHeight = -vpYa * 2;
-
-		return;
-
-		float vpWidth = fabsf(gstate_c.vpWidth);
-		float vpHeight = fabsf(gstate_c.vpHeight);
-
-		// TODO: These two should feed into glDepthRange somehow.
-		float vpZ0 = (vpZb - vpZa) / 65536.0f;
-		float vpZ1 = (vpZa * 2) / 65536.0f;
-
-		vpX0 *= renderWidthFactor_;
-		vpY0 *= renderHeightFactor_;
-		vpWidth *= renderWidthFactor_;
-		vpHeight *= renderHeightFactor_;
-
-		// Flip vpY0 to match the OpenGL coordinate system.
-		vpY0 = renderHeight_ - (vpY0 + vpHeight);
-		glViewport(vpX0, vpY0, vpWidth, vpHeight);
-		// Sadly, as glViewport takes integers, we will not be able to support sub pixel offsets this way. But meh.
-		shaderManager_->DirtyUniform(DIRTY_PROJMATRIX);
-	}
-}
--- a/GPU/GLES/TransformPipeline.h
+++ b/GPU/GLES/TransformPipeline.h
@ -17,4 +17,52 @@

 #pragma once

-struct LinkedShader;
+class LinkedShader;
+struct DecVtxFormat;
+
+// Only used by SW transform
+struct Color4
+{
+	float r,g,b,a;
+	Color4() : r(0), g(0), b(0), a(0) { }
+	Color4(float _r, float _g, float _b, float _a=1.0f)
+	{
+		r=_r; g=_g; b=_b; a=_a;
+	}
+	Color4(const float in[4]) {r=in[0];g=in[1];b=in[2];a=in[3];}
+	Color4(const float in[3], float alpha) {r=in[0];g=in[1];b=in[2];a=alpha;}
+
+	const float &operator [](int i) const {return *(&r + i);}
+
+	Color4 operator *(float f) const
+	{
+		return Color4(f*r,f*g,f*b,f*a);
+	}
+	Color4 operator *(const Color4 &c) const
+	{
+		return Color4(r*c.r,g*c.g,b*c.b,a*c.a);
+	}
+	Color4 operator +(const Color4 &c) const
+	{
+		return Color4(r+c.r,g+c.g,b+c.b,a+c.a);
+	}
+	void operator +=(const Color4 &c)
+	{
+		r+=c.r;
+		g+=c.g;
+		b+=c.b;
+		a+=c.a;
+	}
+	void GetFromRGB(u32 col)
+	{
+		r = ((col>>16) & 0xff)/255.0f;
+		g = ((col>>8) & 0xff)/255.0f;
+		b = ((col>>0) & 0xff)/255.0f;
+	}
+	void GetFromA(u32 col)
+	{
+		a = (col&0xff)/255.0f;
+	}
+};
+
+void SoftwareTransformAndDraw(int prim, LinkedShader *program, int forceIndexType, int vertexCount, void *inds, const DecVtxFormat &decVtxFormat, int indexLowerBound, int indexUpperBound, float *customUV);
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@ -22,58 +22,10 @@
 #include "ge_constants.h"
 #include <cstring>

-// TODO: this doesn't belong here
-struct Color4
-{
-	float r,g,b,a;
-	Color4() : r(0), g(0), b(0), a(0) { }
-	Color4(float _r, float _g, float _b, float _a=1.0f)
-	{
-		r=_r; g=_g; b=_b; a=_a;
-	}
-	Color4(const float in[4]) {r=in[0];g=in[1];b=in[2];a=in[3];}
-
-	float &operator [](int i) {return *(&r + i);}
-	const float &operator [](int i) const {return *(&r + i);}
-
-	Color4 operator *(float f) const
-	{
-		return Color4(f*r,f*g,f*b,f*a);
-	}
-	Color4 operator *(const Color4 &c) const
-	{
-		return Color4(r*c.r,g*c.g,b*c.b,a*c.a);
-	}
-	void operator *=(const Color4 &c)
-	{
-		r*=c.r,g*=c.g,b*=c.b,a*=c.a;
-	}
-	Color4 operator +(const Color4 &c) const
-	{
-		return Color4(r+c.r,g+c.g,b+c.b,a+c.a);
-	}
-	void operator +=(const Color4 &c)
-	{
-		r+=c.r;
-		g+=c.g;
-		b+=c.b;
-		a+=c.a;
-	}
-	void GetFromRGB(u32 col)
-	{
-		r = ((col>>16)&0xff)/255.0f;
-		g = ((col>>8)&0xff)/255.0f;
-		b = ((col>>0)&0xff)/255.0f;
-	}
-	void GetFromA(u32 col)
-	{
-		a = (col&0xff)/255.0f;
-	}
-};
-
-
 struct GPUgstate
 {
+	// Getting rid of this ugly union in favor of the accessor functions
+	// might be a good idea....
 	union
 	{
 		u32 cmdmem[256];
@ -239,7 +191,7 @@ struct GPUgstate
 	float tgenMatrix[12];
 	float boneMatrix[12 * 8];  // Eight bone matrices.

-	bool isModeThrough() const { return (vertType & GE_VTYPE_THROUGH) != 0; }
+	// Pixel Pipeline
 	bool isModeClear()   const { return clearmode & 1; }
 	bool isCullEnabled() const { return cullfaceEnable & 1; }
 	int getCullMode()   const { return cullmode & 1; }
@ -253,11 +205,14 @@ struct GPUgstate
 	int getDepthTestFunc() const { return ztestfunc & 0x7; }
 	bool isFogEnabled() const { return fogEnable & 1; }

+	// UV gen
 	int getUVGenMode() const { return texmapmode & 3;}   // 2 bits
 	int getUVProjMode() const { return (texmapmode >> 8) & 3;}   // 2 bits
 	int getUVLS0() const { return texshade & 0x3; }  // 2 bits
 	int getUVLS1() const { return (texshade >> 8) & 0x3; }  // 2 bits

+	// Vertex type
+	bool isModeThrough() const { return (vertType & GE_VTYPE_THROUGH) != 0; }
 	int getNumBoneWeights() const {
 		return 1 + ((vertType & GE_VTYPE_WEIGHTCOUNT_MASK) >> GE_VTYPE_WEIGHTCOUNT_SHIFT);
 	}
@ -279,7 +234,7 @@ struct GPUStateCache
 	float lightpos[4][3];
 	float lightdir[4][3];
 	float lightatt[4][3];
-	Color4 lightColor[3][4]; //Amtient Diffuse Specular
+	float lightColor[3][4][3]; //Amtient Diffuse Specular
 	float morphWeights[8];

 	// bezier patch subdivision
--- a/GPU/Null/NullGpu.cpp
+++ b/GPU/Null/NullGpu.cpp
@ -596,9 +596,9 @@ void NullGPU::ExecuteOp(u32 op, u32 diff)

 			int l = (cmd - GE_CMD_LAC0) / 3;
 			int t = (cmd - GE_CMD_LAC0) % 3;
-			gstate_c.lightColor[t][l].r = r;
-			gstate_c.lightColor[t][l].g = g;
-			gstate_c.lightColor[t][l].b = b;
+			gstate_c.lightColor[t][l][0] = r;
+			gstate_c.lightColor[t][l][1] = g;
+			gstate_c.lightColor[t][l][2] = b;
 		}
 		break;