Assorted GPU fixes: Advance the vertex pointer, which fixes missing triangles in SPT. More logging. Separate projection matrices for through mode and normal mode (the through-mode matrix doesn't need to be updated as often). Some cleanup.

2025-01-21 08:14:48 +00:00 · 2012-11-28 13:45:22 +01:00 · 2012-11-28 13:45:22 +01:00 · 980d13fe50
commit 980d13fe50
parent 111f52d67f
15 changed files with 204 additions and 109 deletions
--- a/Core/HLE/sceGe.cpp
+++ b/Core/HLE/sceGe.cpp
@ -115,17 +115,18 @@ u32 sceGeDrawSync(u32 mode)
 	return 0;
 }

+void sceGeContinue()
+{
+	ERROR_LOG(HLE,"UNIMPL sceGeContinue");
+	// no arguments
+}
+
 void sceGeBreak()
 {
 	u32 mode = PARAM(0); //0 : current dlist 1: all drawing
 	ERROR_LOG(HLE,"UNIMPL sceGeBreak(mode=%d)",mode);
 }

-void sceGeContinue()
-{
-	ERROR_LOG(HLE,"UNIMPL sceGeContinue");
-	// no arguments
-}

 u32 sceGeSetCallback(u32 structAddr)
 {
--- a/Core/HLE/sceKernelMutex.cpp
+++ b/Core/HLE/sceKernelMutex.cpp
@ -827,7 +827,7 @@ void sceKernelUnlockLwMutex(u32 workareaPtr, int count)
 	{
 		__KernelUnlockLwMutex(workarea, error);
 		Memory::WriteStruct(workareaPtr, &workarea);
-		__KernelReSchedule("mutex unlocked");
+		__KernelReSchedule("lwmutex unlocked");
 	}
 	else
 		Memory::WriteStruct(workareaPtr, &workarea);
--- a/GPU/GLES/DisplayListInterpreter.cpp
+++ b/GPU/GLES/DisplayListInterpreter.cpp
@ -276,6 +276,16 @@ void GLES_GPU::DrawSync(int mode)
 	
 }

+void GLES_GPU::Continue()
+{
+
+}
+
+void GLES_GPU::Break()
+{
+
+}
+
 // Just to get something on the screen, we'll just not subdivide correctly.
 void GLES_GPU::DrawBezier(int ucount, int vcount)
 {
@ -310,7 +320,7 @@ void EnterClearMode(u32 data)
 	bool alphaMask = (data >> 9) & 1;
 	bool updateZ = (data >> 10) & 1;
 	glColorMask(colMask, colMask, colMask, alphaMask);
-	glDepthMask(updateZ); // Update Z or not
+	glstate.depthWrite.set(updateZ ? GL_TRUE : GL_FALSE);
 }

 void LeaveClearMode()
@ -321,8 +331,8 @@ void LeaveClearMode()
 	// Fogging
 	// Antialiasing
 	// Alpha test
-	glDepthMask(1);
 	glColorMask(1,1,1,1);
+	glstate.depthWrite.set(!(gstate.zmsk & 1) ? GL_TRUE : GL_FALSE);
 	// dirtyshader?
 }

@ -371,7 +381,12 @@ void GLES_GPU::ExecuteOp(u32 op, u32 diff)
 			void *inds = 0;
 			if ((gstate.vertType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE)
 				inds = Memory::GetPointer(gstate_c.indexAddr);
-			TransformAndDrawPrim(verts, inds, type, count, 0, -1);
+
+			// Seems we have to advance the vertex addr, at least in some cases. 
+			// Question: Should we also advance the index addr?
+			int bytesRead;
+			TransformAndDrawPrim(verts, inds, type, count, 0, -1, &bytesRead);
+			gstate_c.vertexAddr += bytesRead;
 		}
 		break;

@ -407,17 +422,21 @@ void GLES_GPU::ExecuteOp(u32 op, u32 diff)
 	case GE_CMD_CALL: 
 		{
 			u32 retval = dcontext.pc + 4;
-			stack[stackptr++] = retval; 
-			u32 target = (((gstate.base & 0x00FF0000) << 8) | (op & 0xFFFFFC)) & 0xFFFFFFF;
-			DEBUG_LOG(G3D,"DL CMD CALL - %08x to %08x, ret=%08x", dcontext.pc, target, retval);
-			dcontext.pc = target - 4;	// pc will be increased after we return, counteract that
+			if (stackptr == ARRAY_SIZE(stack)) {
+				ERROR_LOG(G3D, "CALL: Stack full!");
+			} else {
+				stack[stackptr++] = retval;
+				u32 target = (((gstate.base & 0x00FF0000) << 8) | (op & 0xFFFFFC)) & 0xFFFFFFF;
+				DEBUG_LOG(G3D,"DL CMD CALL - %08x to %08x, ret=%08x", dcontext.pc, target, retval);
+				dcontext.pc = target - 4;	// pc will be increased after we return, counteract that
+			}
 		}
 		break;

 	case GE_CMD_RET: 
 		//TODO : debug!
 		{
-			u32 target = stack[--stackptr] & 0xFFFFFFF; 
+			u32 target = dcontext.pc & 0xF0000000 | (stack[--stackptr] & 0x0FFFFFFF); 
 			DEBUG_LOG(G3D,"DL CMD RET - from %08x to %08x", dcontext.pc, target);
 			dcontext.pc = target - 4;
 		}
@ -426,11 +445,55 @@ void GLES_GPU::ExecuteOp(u32 op, u32 diff)
 	case GE_CMD_SIGNAL:
 		{
 			ERROR_LOG(G3D, "DL GE_CMD_SIGNAL %08x", data & 0xFFFFFF);
-			int behaviour = (data >> 16) & 0xFF;
-			int signal = data & 0xFFFF;
+			// Processed in GE_END.
+		}
+		break;

-			if (interruptsEnabled_)
-				__TriggerInterruptWithArg(PSP_GE_INTR, PSP_GE_SUBINTR_SIGNAL, signal);
+	case GE_CMD_FINISH:
+		DEBUG_LOG(G3D,"DL CMD FINISH");
+		if (interruptsEnabled_)
+			__TriggerInterruptWithArg(PSP_GE_INTR, PSP_GE_SUBINTR_FINISH, 0);
+		break;
+
+	case GE_CMD_END: 
+		DEBUG_LOG(G3D,"DL CMD END");
+		switch (prev >> 24)
+		{
+		case GE_CMD_SIGNAL:
+			{
+				int behaviour = (data >> 16) & 0xFF;
+				int signal = data & 0xFFFF;
+				// We should probably defer to sceGe here, no sense in implementing this stuff in every GPU
+				switch (behaviour) {
+				case 1:  // Signal with Wait
+					ERROR_LOG(G3D, "Signal with Wait UNIMPLEMENTED!");
+					break;
+				case 2:
+					DEBUG_LOG(G3D, "Signal without wait");
+					break;
+				case 3:
+					ERROR_LOG(G3D, "Signal with Pause UNIMPLEMENTED");
+					break;
+				case 0x10:
+					ERROR_LOG(G3D, "Signal with Jump UNIMPLEMENTED");
+					break;
+				case 0x11:
+					ERROR_LOG(G3D, "Signal with Jump UNIMPLEMENTED");
+					break;
+				case 0x12:
+					ERROR_LOG(G3D, "Signal with Return UNIMPLEMENTED");
+					break;
+				}
+				if (interruptsEnabled_)
+					__TriggerInterruptWithArg(PSP_GE_INTR, PSP_GE_SUBINTR_SIGNAL, signal);
+			}
+			break;
+		case GE_CMD_FINISH:
+			finished = true;
+			break;
+		default:
+			DEBUG_LOG(G3D,"Ah, not finished: %06x", prev & 0xFFFFFF);
+			break;
 		}
 		break;

@ -461,32 +524,6 @@ void GLES_GPU::ExecuteOp(u32 op, u32 diff)
 		//			offsetAddr = data<<8;
 		break;

-	case GE_CMD_FINISH:
-		DEBUG_LOG(G3D,"DL CMD FINISH");
-		if (interruptsEnabled_)
-			__TriggerInterruptWithArg(PSP_GE_INTR, PSP_GE_SUBINTR_FINISH, 0);
-		break;
-
-	case GE_CMD_END: 
-		DEBUG_LOG(G3D,"DL CMD END");
-		{
-			switch (prev >> 24)
-			{
-			case GE_CMD_FINISH:
-				finished = true;
-				break;
-			default:
-				DEBUG_LOG(G3D,"Ah, not finished: %06x", prev & 0xFFFFFF);
-				break;
-			}
-		}
-			
-		// This should generate a Reading Ended interrupt
-		// if (interruptsEnabled_)
-		//   __TriggerInterrupt(PSP_GE_INTR);
-
-		break;
-
 	case GE_CMD_REGION1:
 		{
 			int x1 = data & 0x3ff;
@ -515,7 +552,6 @@ void GLES_GPU::ExecuteOp(u32 op, u32 diff)

 	case GE_CMD_TEXTUREMAPENABLE: 
 		DEBUG_LOG(G3D, "DL Texture map enable: %i", data);
-		glEnDis(GL_TEXTURE_2D, data&1); 
 		break;

 	case GE_CMD_LIGHTINGENABLE:
@ -951,7 +987,19 @@ void GLES_GPU::ExecuteOp(u32 op, u32 diff)
 			int mag = (data >> 8) & 1;
 			DEBUG_LOG(G3D,"DL TexFilter min: %i mag: %i", min, mag);
 		}
+		break;

+	case GE_CMD_TEXMODE:
+		DEBUG_LOG(G3D,"DL TexMode %08x", data);
+		break;
+	case GE_CMD_TEXFORMAT:
+		DEBUG_LOG(G3D,"DL TexFormat %08x", data);
+		break;
+	case GE_CMD_TEXFLUSH:
+		DEBUG_LOG(G3D,"DL TexFlush");
+		break;
+	case GE_CMD_TEXWRAP:
+		DEBUG_LOG(G3D,"DL TexWrap %08x", data);
 		break;
 	//////////////////////////////////////////////////////////////////
 	//	Z/STENCIL TESTING
--- a/GPU/GLES/DisplayListInterpreter.h
+++ b/GPU/GLES/DisplayListInterpreter.h
@ -37,6 +37,8 @@ public:
 	virtual void ExecuteOp(u32 op, u32 diff);
 	virtual bool InterpretList();
 	virtual void DrawSync(int mode);
+	virtual void Continue();
+	virtual void Break();
 	virtual void EnableInterrupts(bool enable) {
 		interruptsEnabled_ = enable;
 	}
@ -48,7 +50,7 @@ public:

 private:
 	// TransformPipeline.cpp
-	void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, float *customUV, int forceIndexType);
+	void TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, float *customUV, int forceIndexType, int *bytesRead = 0);
 	void UpdateViewportAndProjection();
 	void DrawBezier(int ucount, int vcount);
 	void DoBlockTransfer();
--- a/GPU/GLES/FragmentShaderGenerator.cpp
+++ b/GPU/GLES/FragmentShaderGenerator.cpp
@ -58,12 +58,14 @@ void ComputeFragmentShaderID(FragmentShaderID *id)
 		id->d[0] |= (gstate.textureMapEnable & 1) << 7;
 		id->d[0] |= (gstate.alphaTestEnable & 1) << 8;
 		id->d[0] |= (gstate.alphatest & 0x7) << 9;	 // alpha test func
-		//id->d[0] |= (gstate.fogEnable & 1) << 9;
+		id->d[0] |= (gstate.fogEnable & 1) << 9;
 	}
 }

 // Missing: Alpha test, color test, Z depth range, fog
 // Also, logic ops etc, of course. Urgh.
+// We could do all this with booleans, but I don't trust the shader compilers on
+// Android devices to be anything but stupid.
 char *GenerateFragmentShader()
 {
 	char *p = buffer;
@ -79,11 +81,17 @@ char *GenerateFragmentShader()
 		WRITE(p, "uniform sampler2D tex;\n");
 	if (gstate.alphaTestEnable & 1)
 		WRITE(p, "uniform vec4 u_alpharef;\n");
+	if (gstate.fogEnable & 1) {
+		WRITE(p, "uniform vec3 u_fogcolor;\n");
+		WRITE(p, "uniform vec2 u_fogcoef;\n");
+	}
 	WRITE(p, "uniform vec4 u_texenv;\n");
 	WRITE(p, "varying vec4 v_color0;\n");
 	if (lmode)
 		WRITE(p, "varying vec4 v_color1;\n");
 	WRITE(p, "varying vec2 v_texcoord;\n");
+	if (gstate.isFogEnabled())
+		WRITE(p, "varying float v_depth;\n");

 	WRITE(p, "void main() {\n");
 	WRITE(p, "  vec4 v;\n");
@ -106,12 +114,11 @@ char *GenerateFragmentShader()

 		if (gstate.textureMapEnable & 1) {
 			WRITE(p, "	vec4 t = texture2D(tex, v_texcoord);\n");
-			// WRITE(p, "	vec4 t = vec4(1,0,1,1);");
 			WRITE(p, "	vec4 p = clamp(v_color0, 0.0, 1.0);\n");
 		} else {
 			// No texture mapping
-			WRITE(p, "	vec4 t = vec4(1.0, 1.0, 1.0, 1.0);\n"); //, secondary);
-			WRITE(p, "	vec4 p = clamp(v_color0, 0.0, 1.0);\n"); // , secondary);
+			WRITE(p, "	vec4 t = vec4(1.0, 1.0, 1.0, 1.0);\n");
+			WRITE(p, "	vec4 p = clamp(v_color0, 0.0, 1.0);\n");
 		}

 		if (gstate.texfunc & 0x100) { // texfmt == RGBA
@ -150,15 +157,19 @@ char *GenerateFragmentShader()
 			WRITE(p, "  v = v * vec4(2.0, 2.0, 2.0, 1.0);");
 		}
 		
-		
 		if (gstate.alphaTestEnable & 1) {
 			int alphaTestFunc = gstate.alphatest & 7;
 			const char *alphaTestFuncs[] = { "#", "#", " == ", " != ", " < ", " <= ", " > ", " >= " };	// never/always don't make sense
 			if (alphaTestFuncs[alphaTestFunc][0] != '#')
 				WRITE(p, "if (!(v.a %s u_alpharef.x)) discard;", alphaTestFuncs[alphaTestFunc]);
 		}
-		// Fogging should be added here - and disabled during clear mode

+		if (gstate.isFogEnabled()) {
+			// Haven't figured out how to adjust the depth range yet.
+			// WRITE(p, "  v = mix(v, u_fogcolor, u_fogcoef.x + u_fogcoef.y * v_depth;\n");
+			// WRITE(p, "  v.x = v_depth;\n");
+		}
+		// Fogging should be added here - and disabled during clear mode
 	}

 	WRITE(p, "  gl_FragColor = v;\n");
--- a/GPU/GLES/ShaderManager.cpp
+++ b/GPU/GLES/ShaderManager.cpp
@ -77,9 +77,10 @@ LinkedShader::LinkedShader(Shader *vs, Shader *fs)

 	u_tex		= glGetUniformLocation(program, "tex");
 	u_proj	 = glGetUniformLocation(program, "u_proj");
+	u_proj_through = glGetUniformLocation(program, "u_proj_through");
 	u_texenv = glGetUniformLocation(program, "u_texenv");
 	u_fogcolor = glGetUniformLocation(program, "u_fogcolor");
-	u_fogparam = glGetUniformLocation(program, "u_fogparam");
+	u_fogcoef = glGetUniformLocation(program, "u_fogcoef");
 	u_alpharef = glGetUniformLocation(program, "u_alpharef");

 	a_position	= glGetAttribLocation(program, "a_position");
@ -102,35 +103,40 @@ void LinkedShader::use() {
 	glUseProgram(program);	
 	glUniform1i(u_tex, 0);
 	// Update any dirty uniforms before we draw
-	if (dirtyUniforms & DIRTY_PROJMATRIX) {
-		if (gstate.vertType & GE_VTYPE_THROUGH_MASK) {
-			Matrix4x4 proj;
-			proj.setOrtho(0.0f, 480, 272, 0, -1, 0);	// TODO: Store this somewhere instead of regenerating! And not in each LinkedShader object!
-			glUniformMatrix4fv(u_proj, 1, GL_FALSE, proj.getReadPtr());
+	if (u_proj != -1 && (dirtyUniforms & DIRTY_PROJMATRIX)) {
+		glUniformMatrix4fv(u_proj, 1, GL_FALSE, gstate.projMatrix);
+		float flippedMatrix[16];
+		memcpy(flippedMatrix, gstate.projMatrix, 16 * sizeof(float));
+		if (gstate_c.vpHeight < 0) {
+			flippedMatrix[5] = -flippedMatrix[5];
+			flippedMatrix[13] = -flippedMatrix[13];
 		}
-		else
-		{
-			glUniformMatrix4fv(u_proj, 1, GL_FALSE, gstate.projMatrix);
-			float flippedMatrix[16];
-			memcpy(flippedMatrix, gstate.projMatrix, 16 * sizeof(float));
-			if (gstate_c.vpHeight < 0) {
-				flippedMatrix[5] = -flippedMatrix[5];
-				flippedMatrix[13] = -flippedMatrix[13];
-			}
-			if (gstate_c.vpWidth < 0) {
-				flippedMatrix[0] = -flippedMatrix[0];
-				flippedMatrix[12] = -flippedMatrix[12];
-			}
-
-			glUniformMatrix4fv(u_proj, 1, GL_FALSE, flippedMatrix);
+		if (gstate_c.vpWidth < 0) {
+			flippedMatrix[0] = -flippedMatrix[0];
+			flippedMatrix[12] = -flippedMatrix[12];
 		}
+		glUniformMatrix4fv(u_proj, 1, GL_FALSE, flippedMatrix);
 	}
-	if (u_texenv != -1 && dirtyUniforms & DIRTY_TEXENV) {
+	if (u_proj_through != -1 && (dirtyUniforms & DIRTY_PROJTHROUGHMATRIX))
+	{
+		Matrix4x4 proj_through;
+		proj_through.setOrtho(0.0f, 480, 272, 0, -1, 0);	// TODO: Store this somewhere instead of regenerating! And not in each LinkedShader object!
+		glUniformMatrix4fv(u_proj_through, 1, GL_FALSE, proj_through.getReadPtr());
+	}
+	if (u_texenv != -1 && (dirtyUniforms & DIRTY_TEXENV)) {
 		glUniform4f(u_texenv, 1.0, 1.0, 1.0, 1.0);	// TODO
 	}
-	if (u_alpharef != -1 && dirtyUniforms & DIRTY_ALPHAREF) {
+	if (u_alpharef != -1 && (dirtyUniforms & DIRTY_ALPHAREF)) {
 		glUniform4f(u_alpharef, ((float)((gstate.alphatest >> 8) & 0xFF)) / 255.0f, 0.0f, 0.0f, 0.0f);
 	}
+	if (u_fogcolor != -1 && (dirtyUniforms & DIRTY_FOGCOLOR)) {
+		const float fogc[3] = { ((gstate.fogcolor & 0xFF0000) >> 16) / 255.0f, ((gstate.fogcolor & 0xFF00) >> 8) / 255.0f, ((gstate.fogcolor & 0xFF)) / 255.0f};
+		glUniform3fv(u_fogcolor, 1, fogc);
+	}
+	if (u_fogcoef != -1 && (dirtyUniforms & DIRTY_FOGCOEF)) {
+		const float fogcoef[2] = { getFloat24(gstate.fog1), getFloat24(gstate.fog2) };
+		glUniform2fv(u_fogcoef, 1, fogcoef);
+	}

 	dirtyUniforms = 0;
 }
--- a/GPU/GLES/ShaderManager.h
+++ b/GPU/GLES/ShaderManager.h
@ -46,25 +46,25 @@ struct LinkedShader

 	int u_tex;
 	int u_proj;
+	int u_proj_through;
 	int u_texenv;

 	// Fragment processing inputs
 	int u_alpharef;
-
-	// unused
 	int u_fogcolor;
-	int u_fogparam;
+	int u_fogcoef;
 };

 enum
 {
 	DIRTY_PROJMATRIX = (1 << 0),
-	DIRTY_FOGCOLOR	 = (1 << 1),
-	DIRTY_FOGPARAM	 = (1 << 2),
-	DIRTY_TEXENV		 = (1 << 3),
-	DIRTY_ALPHAREF	 = (1 << 4),
+	DIRTY_PROJTHROUGHMATRIX = (1 << 1),
+	DIRTY_FOGCOLOR	 = (1 << 2),
+	DIRTY_FOGCOEF    = (1 << 3),
+	DIRTY_TEXENV		 = (1 << 4),
+	DIRTY_ALPHAREF	 = (1 << 5),

-	DIRTY_ALL = (1 << 5) - 1
+	DIRTY_ALL = (1 << 6) - 1
 };

 // Real public interface
--- a/GPU/GLES/TransformPipeline.cpp
+++ b/GPU/GLES/TransformPipeline.cpp
@ -196,7 +196,7 @@ void Lighter::Light(float colorOut0[4], float colorOut1[4], const float colorIn[
 // primitives correctly. Other primitives are possible to transform and light in hardware
 // using vertex shader, which will be way, way faster, especially on mobile. This has
 // not yet been implemented though.
-void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, float *customUV, int forceIndexType)
+void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int vertexCount, float *customUV, int forceIndexType, int *bytesRead)
 {
 	int indexLowerBound, indexUpperBound;
 	// First, decode the verts and apply morphing
@ -218,6 +218,9 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 	gpuStats.numDrawCalls++;
 	gpuStats.numVertsTransformed += vertexCount;
 	
+	if (bytesRead)
+		*bytesRead = vertexCount * dec.VertexSize();
+
 	bool throughmode = (gstate.vertType & GE_VTYPE_THROUGH_MASK) != 0;
 	// Then, transform and draw in one big swoop (urgh!)
 	// need to move this to the shader.
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@ -156,7 +156,7 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 			{
 				const u8 *wdata = (const u8*)(ptr);
 				for (int j = 0; j < nweights; j++)
-					wt[j] = (float)wdata[j] / 255.0f;
+					wt[j] = (float)wdata[j] / 128.0f;
 			}
 			break;

@ -164,7 +164,7 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 			{
 				const u16 *wdata = (const u16*)(ptr);
 				for (int j = 0; j < nweights; j++)
-					wt[j] = (float)wdata[j] / 65535.0f;
+					wt[j] = (float)wdata[j] / 32768.0f;
 			}
 			break;

@ -190,7 +190,7 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 			{
 				const u8 *uvdata = (const u8*)(ptr + tcoff);
 				for (int j = 0; j < 2; j++)
-					uv[j] = (float)uvdata[j] / 255.0f;
+					uv[j] = (float)uvdata[j] / 128.0f;
 				break;
 			}

--- a/GPU/GLES/VertexDecoder.h
+++ b/GPU/GLES/VertexDecoder.h
@ -59,7 +59,7 @@ public:
 	void SetVertexType(u32 fmt);
 	void DecodeVerts(DecodedVertex *decoded, const void *verts, const void *inds, int prim, int count, int *indexLowerBound, int *indexUpperBound) const;
 	bool hasColor() const { return col != 0; }
-
+	int VertexSize() const { return size; }
 private:
 	u32 fmt;
 	bool throughmode;
--- a/GPU/GLES/VertexShaderGenerator.cpp
+++ b/GPU/GLES/VertexShaderGenerator.cpp
@ -40,10 +40,10 @@ static char buffer[16384];

 void ComputeVertexShaderID(VertexShaderID *id)
 {
-	// There's currently only one vertex shader
-	// as we do the transform in software.
 	memset(id->d, 0, sizeof(id->d));
 	id->d[0] = gstate.lmode & 1;
+	id->d[0] |= ((int)gstate.isModeThrough()) << 1;
+	id->d[0] |= ((int)gstate.isFogEnabled()) << 2;
 }

 void WriteLight(char *p, int l) {
@ -67,19 +67,32 @@ char *GenerateVertexShader()
 	if (lmode)
 		WRITE("attribute vec4 a_color1;");

-	WRITE("uniform mat4 u_proj;");
+	if (gstate.isModeThrough())	{
+		WRITE("uniform mat4 u_proj_through;");
+	} else {
+		WRITE("uniform mat4 u_proj;");
+		// Add all the uniforms we'll need to transform properly.
+	}

 	WRITE("varying vec4 v_color0;");
 	if (lmode)
 		WRITE("varying vec4 v_color1;");
 	WRITE("varying vec2 v_texcoord;");
-
+	if (gstate.isFogEnabled())
+		WRITE("varying float v_depth;");
 	WRITE("void main() {");
-	WRITE("v_color0 = a_color0;");
+	WRITE("  v_color0 = a_color0;");
 	if (lmode)
-		WRITE("v_color1 = a_color1;");
-	WRITE("v_texcoord = a_texcoord;");
-	WRITE("gl_Position = u_proj * a_position;");
+		WRITE("  v_color1 = a_color1;");
+	WRITE("  v_texcoord = a_texcoord;");
+	if (gstate.isModeThrough())	{
+		WRITE("  gl_Position = u_proj_through * a_position;");
+	} else {
+		WRITE("  gl_Position = u_proj * a_position;");
+	}
+	if (gstate.isFogEnabled()) {
+		WRITE("  v_depth = gl_Position.z;");
+	}
 	WRITE("}");

 	return buffer;
--- a/GPU/GPUInterface.h
+++ b/GPU/GPUInterface.h
@ -28,11 +28,14 @@ public:
 	virtual void InitClear() = 0;

 	// Draw queue management
+	// TODO: Much of this should probably be shared between the different GPU implementations.
 	virtual u32 EnqueueList(u32 listpc, u32 stall) = 0;
 	virtual void UpdateStall(int listid, u32 newstall) = 0;
+	virtual void DrawSync(int mode) = 0;
+	virtual void Continue() = 0;
+	
 	virtual void ExecuteOp(u32 op, u32 diff) = 0;
 	virtual bool InterpretList() = 0;
-	virtual void DrawSync(int mode) = 0;

 	// Framebuffer management
 	virtual void SetDisplayFramebuffer(u32 framebuf, u32 stride, int format) = 0;
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@ -240,18 +240,19 @@ struct GPUgstate
 	float tgenMatrix[12];
 	float boneMatrix[12 * 8];  // Eight bone matrices.

-	inline bool isModeThrough() const { return (vertType & GE_VTYPE_THROUGH) != 0; }
-	inline bool isModeClear()   const { return clearmode & 1; }
-	inline bool isCullEnabled() const { return cullfaceEnable & 1; }
-	inline int  getCullMode()   const { return cullmode & 1; }
-	inline int  getBlendFuncA() const { return blend & 0xF; }
-	inline u32 getFixA() const { return blendfixa & 0xFFFFFF; }
-	inline u32 getFixB() const { return blendfixb & 0xFFFFFF; }
-	inline int  getBlendFuncB() const { return (blend >> 4) & 0xF; }
-	inline int  getBlendEq()    const { return (blend >> 8) & 0x7; }
-	inline bool isDepthTestEnabled() const { return zTestEnable & 1; }
-	inline bool isDepthWriteEnabled() const { return !(zmsk & 1); }
-	inline int  getDepthTestFunc() const { return ztestfunc & 0x7; }
+	bool isModeThrough() const { return (vertType & GE_VTYPE_THROUGH) != 0; }
+	bool isModeClear()   const { return clearmode & 1; }
+	bool isCullEnabled() const { return cullfaceEnable & 1; }
+	int  getCullMode()   const { return cullmode & 1; }
+	int  getBlendFuncA() const { return blend & 0xF; }
+	u32 getFixA() const { return blendfixa & 0xFFFFFF; }
+	u32 getFixB() const { return blendfixb & 0xFFFFFF; }
+	int  getBlendFuncB() const { return (blend >> 4) & 0xF; }
+	int  getBlendEq()    const { return (blend >> 8) & 0x7; }
+	bool isDepthTestEnabled() const { return zTestEnable & 1; }
+	bool isDepthWriteEnabled() const { return !(zmsk & 1); }
+	int  getDepthTestFunc() const { return ztestfunc & 0x7; }
+	bool isFogEnabled() const { return fogEnable & 1; }
 };
 // Real data in the context ends here

--- a/GPU/Null/NullGpu.cpp
+++ b/GPU/Null/NullGpu.cpp
@ -108,6 +108,12 @@ void NullGPU::DrawSync(int mode)
 	}
 }

+void NullGPU::Continue()
+{
+
+}
+
+
 void NullGPU::ExecuteOp(u32 op, u32 diff)
 {
 	u32 cmd = op >> 24;
--- a/GPU/Null/NullGpu.h
+++ b/GPU/Null/NullGpu.h
@ -30,6 +30,7 @@ public:
 	virtual void UpdateStall(int listid, u32 newstall);
 	virtual void ExecuteOp(u32 op, u32 diff);
 	virtual bool InterpretList();
+	virtual void Continue();
 	virtual void DrawSync(int mode);
 	virtual void EnableInterrupts(bool enable) {
 		interruptsEnabled_ = enable;