Experiment: Generate "Ubershaders" that can handle all lighting configurations

This drastically reduces the shader compile stutter that happens when a lot of new light setups are created, like on the first punch in Tekken 6. There's more stuff that might benefit from being made dynamic like this. These branches are very cheap on modern GPUs since they branch on a uniform variable, so there is no divergence. Only tested on Vulkan. I think we'll need to keep the old path too for GPUs like Mali-450...
2025-02-06 13:38:56 +00:00 · 2022-09-25 13:59:52 +02:00 · 2022-09-25 13:59:52 +02:00 · 7adba20fac
commit 7adba20fac
parent b1afeeaf43
9 changed files with 218 additions and 86 deletions
--- a/GPU/Common/ShaderCommon.h
+++ b/GPU/Common/ShaderCommon.h
@ -88,12 +88,13 @@ enum : uint64_t {
 	DIRTY_COLORWRITEMASK = 1ULL << 36,

 	DIRTY_MIPBIAS = 1ULL << 37,
+	DIRTY_LIGHT_CONTROL = 1ULL << 38,

-	// space for 4 more uniform dirty flags. Remember to update DIRTY_ALL_UNIFORMS.
+	// space for 1 more uniform dirty flag. Remember to update DIRTY_ALL_UNIFORMS.

 	DIRTY_BONE_UNIFORMS = 0xFF000000ULL,

-	DIRTY_ALL_UNIFORMS = 0x3FFFFFFFFFULL,
+	DIRTY_ALL_UNIFORMS = 0x7FFFFFFFFFULL,
 	DIRTY_ALL_LIGHTS = DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3,

 	// Other dirty elements that aren't uniforms!
@ -113,6 +114,8 @@ enum : uint64_t {
 	// TODO: Should we also add DIRTY_FRAMEBUF here? It kinda generally takes care of itself.
 	DIRTY_ALL_RENDER_STATE = DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS,

+	// Note that the top 8 bits (56-63) cannot be dirtied through the commonCommandTable due to packing of other flags.
+
 	DIRTY_ALL = 0xFFFFFFFFFFFFFFFF
 };

--- a/GPU/Common/ShaderId.cpp
+++ b/GPU/Common/ShaderId.cpp
@ -41,6 +41,9 @@ std::string VertexShaderDesc(const VShaderID &id) {
 	if (id.Bit(VS_BIT_LIGHTING_ENABLE)) {
 		desc << "Light: ";
 	}
+	if (id.Bit(VS_BIT_LIGHT_UBERSHADER)) {
+		desc << "LightUberShader ";
+	}
 	for (int i = 0; i < 4; i++) {
 		bool enabled = id.Bit(VS_BIT_LIGHT0_ENABLE + i) && id.Bit(VS_BIT_LIGHTING_ENABLE);
 		if (enabled || (uvgMode == GE_TEXMAP_ENVIRONMENT_MAP && (ls0 == i || ls1 == i))) {
@ -125,13 +128,17 @@ void ComputeVertexShaderID(VShaderID *id_out, u32 vertType, bool useHWTransform,
 			// doShadeMapping is stored as UVGenMode, and light type doesn't matter for shade mapping.
 			id.SetBits(VS_BIT_MATERIAL_UPDATE, 3, gstate.getMaterialUpdate());
 			id.SetBit(VS_BIT_LIGHTING_ENABLE);
-			// Light bits
-			for (int i = 0; i < 4; i++) {
-				bool chanEnabled = gstate.isLightChanEnabled(i) != 0;
-				id.SetBit(VS_BIT_LIGHT0_ENABLE + i, chanEnabled);
-				if (chanEnabled) {
-					id.SetBits(VS_BIT_LIGHT0_COMP + 4 * i, 2, gstate.getLightComputation(i));
-					id.SetBits(VS_BIT_LIGHT0_TYPE + 4 * i, 2, gstate.getLightType(i));
+			if (gstate_c.Supports(GPU_USE_LIGHT_UBERSHADER)) {
+				id.SetBit(VS_BIT_LIGHT_UBERSHADER);
+			} else {
+				// Light bits
+				for (int i = 0; i < 4; i++) {
+					bool chanEnabled = gstate.isLightChanEnabled(i) != 0;
+					id.SetBit(VS_BIT_LIGHT0_ENABLE + i, chanEnabled);
+					if (chanEnabled) {
+						id.SetBits(VS_BIT_LIGHT0_COMP + 4 * i, 2, gstate.getLightComputation(i));
+						id.SetBits(VS_BIT_LIGHT0_TYPE + 4 * i, 2, gstate.getLightType(i));
+					}
 				}
 			}
 		}
--- a/GPU/Common/ShaderId.h
+++ b/GPU/Common/ShaderId.h
@ -33,7 +33,11 @@ enum VShaderBit : uint8_t {
 	VS_BIT_BONES = 22,  // 3 should be enough, not 8
 	// 25 - 29 are free.
 	VS_BIT_ENABLE_BONES = 30,
-	// 31 is free.
+
+	// If this is set along with LIGHTING_ENABLE, the per-light enable/comp/type bits below
+	// are left out of the ID and passed to the shader at runtime (u_lightControl uniform) instead.
+	VS_BIT_LIGHT_UBERSHADER = 31,
+
 	VS_BIT_LIGHT0_COMP = 32,  // 2 bits
 	VS_BIT_LIGHT0_TYPE = 34,  // 2 bits
 	VS_BIT_LIGHT1_COMP = 36,  // 2 bits
--- a/GPU/Common/ShaderUniforms.cpp
+++ b/GPU/Common/ShaderUniforms.cpp
@ -279,8 +279,30 @@ void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms) {
 		Uint8x3ToFloat4_Alpha(ub->materialSpecular, gstate.materialspecular, std::max(0.0f, getFloat24(gstate.materialspecularcoef)));
 	}
 	if (dirtyUniforms & DIRTY_MATEMISSIVE) {
-		Uint8x3ToFloat4(ub->materialEmissive, gstate.materialemissive);
+		// We're not touching the fourth f32 here, because we store an u32 of control bits in it.
+		float temp[4];
+		Uint8x3ToFloat4(temp, gstate.materialemissive);
+		memcpy(ub->materialEmissive, temp, 12);
 	}
+
+	if (dirtyUniforms & DIRTY_LIGHT_CONTROL) {
+		// Bit organization
+		// Bottom 4 bits are enable bits for each light.
+		// Then, for each light, comes 2 bits for "comp" and 2 bits for "type".
+		uint32_t lightControl = 0;
+		for (int i = 0; i < 4; i++) {
+			if (gstate.isLightChanEnabled(i)) {
+				lightControl |= 1 << i;
+			}
+
+			u32 computation = (u32)gstate.getLightComputation(i);  // 2 bits
+			u32 type = (u32)gstate.getLightType(i);  // 2 bits
+			lightControl |= computation << (4 + i * 4);
+			lightControl |= type << (4 + i * 4 + 2);
+		}
+		ub->lightControl = lightControl;
+	}
+
 	for (int i = 0; i < 4; i++) {
 		if (dirtyUniforms & (DIRTY_LIGHT0 << i)) {
 			if (gstate.isDirectionalLight(i)) {
--- a/GPU/Common/ShaderUniforms.h
+++ b/GPU/Common/ShaderUniforms.h
@ -80,7 +80,8 @@ struct UB_VS_Lights {
 	float ambientColor[4];
 	float materialDiffuse[4];
 	float materialSpecular[4];
-	float materialEmissive[4];
+	float materialEmissive[3];
+	uint32_t lightControl;
 	float lpos[4][4];
 	float ldir[4][4];
 	float latt[4][4];
@ -95,6 +96,7 @@ R"(	vec4 u_ambient;
 	vec3 u_matdiffuse;
 	vec4 u_matspecular;
 	vec3 u_matemissive;
+    uint u_lightControl;  // light ubershader
 	vec3 u_lightpos0;
 	vec3 u_lightpos1;
 	vec3 u_lightpos2;
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@ -185,6 +185,11 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 	bool enableLighting = id.Bit(VS_BIT_LIGHTING_ENABLE);
 	int matUpdate = id.Bits(VS_BIT_MATERIAL_UPDATE, 3);

+	bool lightUberShader = id.Bit(VS_BIT_LIGHT_UBERSHADER);
+	if (lightUberShader) {
+		_dbg_assert_(compat.bitwiseOps);
+	}
+
 	// Apparently we don't support bezier/spline together with bones.
 	bool doBezier = id.Bit(VS_BIT_BEZIER) && !enableBones && useHWTransform;
 	bool doSpline = id.Bit(VS_BIT_SPLINE) && !enableBones && useHWTransform;
@ -524,12 +529,12 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 				*uniformMask |= DIRTY_UVSCALEOFFSET;
 			}
 			for (int i = 0; i < 4; i++) {
-				if (doLight[i] != LIGHT_OFF) {
+				if (lightUberShader || doLight[i] != LIGHT_OFF) {
 					// This is needed for shade mapping
 					WRITE(p, "uniform vec3 u_lightpos%i;\n", i);
 					*uniformMask |= DIRTY_LIGHT0 << i;
 				}
-				if (doLight[i] == LIGHT_FULL) {
+				if (lightUberShader || doLight[i] == LIGHT_FULL) {
 					*uniformMask |= DIRTY_LIGHT0 << i;
 					GELightType type = static_cast<GELightType>(id.Bits(VS_BIT_LIGHT0_TYPE + 4 * i, 2));
 					GELightComputation comp = static_cast<GELightComputation>(id.Bits(VS_BIT_LIGHT0_COMP + 4 * i, 2));
@ -728,7 +733,6 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 			WRITE(p, "  vec4 basis_u = tess_weights_u[weight_idx.x].basis;\n");
 			WRITE(p, "  vec4 basis_v = tess_weights_v[weight_idx.y].basis;\n");
 			WRITE(p, "  mat4 basis = outerProduct(basis_u, basis_v);\n");
-
 		} else {
 			WRITE(p, "  int index_u, index_v;\n");
 			for (int i = 0; i < 4; i++) {
@ -987,6 +991,13 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 					anySpots = true;
 			}

+			if (lightUberShader) {
+				anySpots = true;
+				diffuseIsZero = false;
+				specularIsZero = false;
+				distanceNeeded = true;
+			}
+
 			if (!specularIsZero) {
 				WRITE(p, "  lowp vec3 lightSum1 = splat3(0.0);\n");
 			}
@ -1004,76 +1015,131 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 			}
 		}

-		// Calculate lights if needed. If shade mapping is enabled, lights may need to be
-		// at least partially calculated.
-		for (int i = 0; i < 4; i++) {
-			if (doLight[i] != LIGHT_FULL)
-				continue;
-
-			GELightType type = static_cast<GELightType>(id.Bits(VS_BIT_LIGHT0_TYPE + 4*i, 2));
-			GELightComputation comp = static_cast<GELightComputation>(id.Bits(VS_BIT_LIGHT0_COMP + 4*i, 2));
-
-			if (type == GE_LIGHTTYPE_DIRECTIONAL) {
-				// We prenormalize light positions for directional lights.
-				WRITE(p, "  toLight = u_lightpos%i;\n", i);
-			} else {
-				WRITE(p, "  toLight = u_lightpos%i - worldpos;\n", i);
-				WRITE(p, "  distance = length(toLight);\n");
-				WRITE(p, "  toLight /= distance;\n");
+		if (lightUberShader) {
+			// One runtime-branching block per light, decoded from u_lightControl. TODO: Actually loop in the shader. For now, we write it all out.
+			for (int i = 0; i < 4; i++) {
+				p.F("if ((u_lightControl & %d) != 0) {\n", 1 << i);  // Bits 0-3: per-light enable.
+				p.F("   uint type = (u_lightControl >> %d) & 3;\n", 4 + 4 * i + 2);  // Upper 2 bits of this light's nibble, as packed by LightUpdateUniforms.
+				p.F("   uint comp = (u_lightControl >> %d) & 3;\n", 4 + 4 * i);  // Lower 2 bits of this light's nibble, as packed by LightUpdateUniforms.
+				p.C("   if (type == 0) {\n");  // GE_LIGHTTYPE_DIRECTIONAL
+				p.F("     toLight = u_lightpos%d;\n", i);  // Light positions are prenormalized for directional lights.
+				p.C("   } else {\n");
+				p.F("     toLight = u_lightpos%d - worldpos;\n", i);
+				p.C("     distance = length(toLight);\n");  // No format specifiers here, so C() instead of F() with a stray argument.
+				p.C("     toLight /= distance;\n");
+				p.C("   }\n");
+				p.C("   ldot = dot(toLight, worldnormal);\n");
+				p.C("   if (comp == 2) {\n");  // GE_LIGHTCOMP_ONLYPOWDIFFUSE
+				p.C("     if (u_matspecular.a <= 0.0) {\n");  // pow(0.0, 0.0) may be undefined, but the PSP seems to treat it as 1.0.
+				p.C("       ldot = 1.0;\n");
+				p.C("     } else {\n");
+				p.C("       ldot = pow(max(ldot, 0.0), u_matspecular.a);\n");
+				p.C("     }\n");
+				p.C("   }\n");
+				p.C("   switch (type) {\n");// Attenuation
+				p.C("   case 0:\n");  // GE_LIGHTTYPE_DIRECTIONAL: no attenuation.
+				p.C("     lightScale = 1.0;\n");
+				p.C("     break;\n");
+				p.C("   case 1:\n");  // GE_LIGHTTYPE_POINT
+				p.F("     lightScale = clamp(1.0 / dot(u_lightatt%i, vec3(1.0, distance, distance*distance)), 0.0, 1.0);\n", i);
+				p.C("     break;\n");
+				p.C("   default:\n");  // GE_LIGHTTYPE_SPOT and GE_LIGHTTYPE_UNKNOWN, matching the non-ubershader path.
+				p.F("     angle = length(u_lightdir%i) == 0.0 ? 0.0 : dot(normalize(u_lightdir%i), toLight);\n", i, i);
+				p.F("     if (angle >= u_lightangle_spotCoef%i.x) {\n", i);
+				p.F("       lightScale = clamp(1.0 / dot(u_lightatt%i, vec3(1.0, distance, distance*distance)), 0.0, 1.0) * (u_lightangle_spotCoef%i.y <= 0.0 ? 1.0 : pow(angle, u_lightangle_spotCoef%i.y));\n", i, i, i);
+				p.C("     } else {\n");
+				p.C("       lightScale = 0.0;\n");
+				p.C("     }\n");
+				p.C("     break;\n");
+				p.C("   }\n");
+				p.F("   diffuse = (u_lightdiffuse%i * %s) * max(ldot, 0.0);\n", i, diffuseStr);
+				p.C("   if (comp == 1) {\n");  // do specular
+				p.C("     if (ldot >= 0.0) {\n");
+				p.C("       ldot = dot(normalize(toLight + vec3(0.0, 0.0, 1.0)), worldnormal);\n");
+				p.C("       if (u_matspecular.a <= 0.0) {\n");
+				p.C("         ldot = 1.0;\n");
+				p.C("       } else {\n");
+				p.C("         ldot = pow(max(ldot, 0.0), u_matspecular.a);\n");
+				p.C("       }\n");
+				p.C("       if (ldot > 0.0)\n");
+				p.F("         lightSum1 += u_lightspecular%i * %s * ldot * lightScale;\n", i, specularStr);
+				p.C("     }\n");
+				p.C("   }\n");
+				p.F("   lightSum0.rgb += (u_lightambient%i * %s.rgb + diffuse) * lightScale;\n", i, ambientStr);
+				p.C(" }\n");
 			}
+		} else {
+			// Calculate lights if needed. If shade mapping is enabled, lights may need to be
+			// at least partially calculated.
+			for (int i = 0; i < 4; i++) {
+				if (doLight[i] != LIGHT_FULL)
+					continue;

-			bool doSpecular = comp == GE_LIGHTCOMP_BOTH;
-			bool poweredDiffuse = comp == GE_LIGHTCOMP_ONLYPOWDIFFUSE;
+				GELightType type = static_cast<GELightType>(id.Bits(VS_BIT_LIGHT0_TYPE + 4 * i, 2));
+				GELightComputation comp = static_cast<GELightComputation>(id.Bits(VS_BIT_LIGHT0_COMP + 4 * i, 2));

-			WRITE(p, "  ldot = dot(toLight, worldnormal);\n");
-			if (poweredDiffuse) {
-				// pow(0.0, 0.0) may be undefined, but the PSP seems to treat it as 1.0.
-				// Seen in Tales of the World: Radiant Mythology (#2424.)
-				WRITE(p, "  if (u_matspecular.a <= 0.0) {\n");
-				WRITE(p, "    ldot = 1.0;\n");
-				WRITE(p, "  } else {\n");
-				WRITE(p, "    ldot = pow(max(ldot, 0.0), u_matspecular.a);\n");
-				WRITE(p, "  }\n");
+				if (type == GE_LIGHTTYPE_DIRECTIONAL) {
+					// We prenormalize light positions for directional lights.
+					p.F("  toLight = u_lightpos%i;\n", i);
+				} else {
+					p.F("  toLight = u_lightpos%i - worldpos;\n", i);
+					p.C("  distance = length(toLight);\n");
+					p.C("  toLight /= distance;\n");
+				}
+
+				bool doSpecular = comp == GE_LIGHTCOMP_BOTH;
+				bool poweredDiffuse = comp == GE_LIGHTCOMP_ONLYPOWDIFFUSE;
+
+				p.C("  ldot = dot(toLight, worldnormal);\n");
+				if (poweredDiffuse) {
+					// pow(0.0, 0.0) may be undefined, but the PSP seems to treat it as 1.0.
+					// Seen in Tales of the World: Radiant Mythology (#2424.)
+					p.C("  if (u_matspecular.a <= 0.0) {\n");
+					p.C("    ldot = 1.0;\n");
+					p.C("  } else {\n");
+					p.C("    ldot = pow(max(ldot, 0.0), u_matspecular.a);\n");
+					p.C("  }\n");
+				}
+
+				const char *timesLightScale = " * lightScale";
+
+				// Attenuation
+				switch (type) {
+				case GE_LIGHTTYPE_DIRECTIONAL:
+					timesLightScale = "";
+					break;
+				case GE_LIGHTTYPE_POINT:
+					p.F("  lightScale = clamp(1.0 / dot(u_lightatt%i, vec3(1.0, distance, distance*distance)), 0.0, 1.0);\n", i);
+					break;
+				case GE_LIGHTTYPE_SPOT:
+				case GE_LIGHTTYPE_UNKNOWN:
+					p.F("  angle = length(u_lightdir%i) == 0.0 ? 0.0 : dot(normalize(u_lightdir%i), toLight);\n", i, i);
+					p.F("  if (angle >= u_lightangle_spotCoef%i.x) {\n", i);
+					p.F("    lightScale = clamp(1.0 / dot(u_lightatt%i, vec3(1.0, distance, distance*distance)), 0.0, 1.0) * (u_lightangle_spotCoef%i.y <= 0.0 ? 1.0 : pow(angle, u_lightangle_spotCoef%i.y));\n", i, i, i);
+					p.C("  } else {\n");
+					p.C("    lightScale = 0.0;\n");
+					p.C("  }\n");
+					break;
+				default:
+					// ILLEGAL
+					break;
+				}
+
+				p.F("  diffuse = (u_lightdiffuse%i * %s) * max(ldot, 0.0);\n", i, diffuseStr);
+				if (doSpecular) {
+					p.C("  if (ldot >= 0.0) {\n");
+					p.C("    ldot = dot(normalize(toLight + vec3(0.0, 0.0, 1.0)), worldnormal);\n");
+					p.C("    if (u_matspecular.a <= 0.0) {\n");
+					p.C("      ldot = 1.0;\n");
+					p.C("    } else {\n");
+					p.C("      ldot = pow(max(ldot, 0.0), u_matspecular.a);\n");
+					p.C("    }\n");
+					p.C("    if (ldot > 0.0)\n");
+					p.F("      lightSum1 += u_lightspecular%i * %s * ldot %s;\n", i, specularStr, timesLightScale);
+					p.C("  }\n");
+				}
+				p.F("  lightSum0.rgb += (u_lightambient%i * %s.rgb + diffuse)%s;\n", i, ambientStr, timesLightScale);
 			}
-
-			const char *timesLightScale = " * lightScale";
-
-			// Attenuation
-			switch (type) {
-			case GE_LIGHTTYPE_DIRECTIONAL:
-				timesLightScale = "";
-				break;
-			case GE_LIGHTTYPE_POINT:
-				WRITE(p, "  lightScale = clamp(1.0 / dot(u_lightatt%i, vec3(1.0, distance, distance*distance)), 0.0, 1.0);\n", i);
-				break;
-			case GE_LIGHTTYPE_SPOT:
-			case GE_LIGHTTYPE_UNKNOWN:
-				WRITE(p, "  angle = length(u_lightdir%i) == 0.0 ? 0.0 : dot(normalize(u_lightdir%i), toLight);\n", i, i);
-				WRITE(p, "  if (angle >= u_lightangle_spotCoef%i.x) {\n", i);
-				WRITE(p, "    lightScale = clamp(1.0 / dot(u_lightatt%i, vec3(1.0, distance, distance*distance)), 0.0, 1.0) * (u_lightangle_spotCoef%i.y <= 0.0 ? 1.0 : pow(angle, u_lightangle_spotCoef%i.y));\n", i, i, i);
-				WRITE(p, "  } else {\n");
-				WRITE(p, "    lightScale = 0.0;\n");
-				WRITE(p, "  }\n");
-				break;
-			default:
-				// ILLEGAL
-				break;
-			}
-
-			WRITE(p, "  diffuse = (u_lightdiffuse%i * %s) * max(ldot, 0.0);\n", i, diffuseStr);
-			if (doSpecular) {
-				WRITE(p, "  if (ldot >= 0.0) {\n");
-				WRITE(p, "    ldot = dot(normalize(toLight + vec3(0.0, 0.0, 1.0)), worldnormal);\n");
-				WRITE(p, "    if (u_matspecular.a <= 0.0) {\n");
-				WRITE(p, "      ldot = 1.0;\n");
-				WRITE(p, "    } else {\n");
-				WRITE(p, "      ldot = pow(max(ldot, 0.0), u_matspecular.a);\n");
-				WRITE(p, "    }\n");
-				WRITE(p, "    if (ldot > 0.0)\n");
-				WRITE(p, "      lightSum1 += u_lightspecular%i * %s * ldot %s;\n", i, specularStr, timesLightScale);
-				WRITE(p, "  }\n");
-			}
-			WRITE(p, "  lightSum0.rgb += (u_lightambient%i * %s.rgb + diffuse)%s;\n", i, ambientStr, timesLightScale);
 		}

 		if (enableLighting) {
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@ -103,10 +103,10 @@ const CommonCommandTableEntry commonCommandTable[] = {
 	// These change the vertex shader so need flushing.
 	{ GE_CMD_REVERSENORMAL, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE },
 	{ GE_CMD_LIGHTINGENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE },
-	{ GE_CMD_LIGHTENABLE0, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE  },
-	{ GE_CMD_LIGHTENABLE1, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE  },
-	{ GE_CMD_LIGHTENABLE2, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE  },
-	{ GE_CMD_LIGHTENABLE3, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE  },
+	{ GE_CMD_LIGHTENABLE0, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE },
+	{ GE_CMD_LIGHTENABLE1, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE },
+	{ GE_CMD_LIGHTENABLE2, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE },
+	{ GE_CMD_LIGHTENABLE3, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE },
 	{ GE_CMD_LIGHTTYPE0, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE | DIRTY_LIGHT0 },
 	{ GE_CMD_LIGHTTYPE1, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE | DIRTY_LIGHT1 },
 	{ GE_CMD_LIGHTTYPE2, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE | DIRTY_LIGHT2 },
@ -450,6 +450,21 @@ void GPUCommon::UpdateCmdInfo() {
 		cmdInfo_[GE_CMD_JUMP].func = &GPUCommon::Execute_Jump;
 		cmdInfo_[GE_CMD_CALL].func = &GPUCommon::Execute_Call;
 	}
+
+	// Reconfigure for light ubershader or not.
+	for (int i = 0; i < 4; i++) {
+		if (gstate_c.Supports(GPU_USE_LIGHT_UBERSHADER)) {
+			cmdInfo_[GE_CMD_LIGHTENABLE0 + i].RemoveDirty(DIRTY_VERTEXSHADER_STATE);
+			cmdInfo_[GE_CMD_LIGHTENABLE0 + i].AddDirty(DIRTY_LIGHT_CONTROL);
+			cmdInfo_[GE_CMD_LIGHTTYPE0 + i].RemoveDirty(DIRTY_VERTEXSHADER_STATE);
+			cmdInfo_[GE_CMD_LIGHTTYPE0 + i].AddDirty(DIRTY_LIGHT_CONTROL);
+		} else {
+			cmdInfo_[GE_CMD_LIGHTENABLE0 + i].RemoveDirty(DIRTY_LIGHT_CONTROL);
+			cmdInfo_[GE_CMD_LIGHTENABLE0 + i].AddDirty(DIRTY_VERTEXSHADER_STATE);
+			cmdInfo_[GE_CMD_LIGHTTYPE0 + i].RemoveDirty(DIRTY_LIGHT_CONTROL);
+			cmdInfo_[GE_CMD_LIGHTTYPE0 + i].AddDirty(DIRTY_VERTEXSHADER_STATE);
+		}
+	}
 }

 void GPUCommon::BeginHostFrame() {
@ -3202,6 +3217,10 @@ u32 GPUCommon::CheckGPUFeatures() const {
 		features |= GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH;
 	}

+	if (draw_->GetDeviceCaps().fragmentShaderInt32Supported) {
+		features |= GPU_USE_LIGHT_UBERSHADER;
+	}
+
 	if (PSP_CoreParameter().compat.flags().ClearToRAM) {
 		features |= GPU_USE_CLEAR_RAM_HACK;
 	}
--- a/GPU/GPUCommon.h
+++ b/GPU/GPUCommon.h
@ -314,6 +314,14 @@ protected:
 	struct CommandInfo {
 		uint64_t flags;  // Combined word: dirty flags live here shifted left by 8 (presumably above the per-command FLAG_* bits - TODO confirm).
 		GPUCommon::CmdFunc func;
+
+		// Dirty flags are mashed into the regular flags by a left shift of 8, so the top 8 dirty bits (56-63) are shifted out and cannot be stored.
+		void AddDirty(u64 dirty) {
+			flags |= dirty << 8;  // Set the given dirty bits; other bits in flags are left as-is.
+		}
+		void RemoveDirty(u64 dirty) {
+			flags &= ~(dirty << 8);  // Clear only the given dirty bits; other bits in flags are left as-is.
+		}
 	};

 	static CommandInfo cmdInfo_[256];
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@ -469,7 +469,8 @@ struct UVScale {
 // Might want to move this mechanism into the backend later.
 enum {
 	GPU_SUPPORTS_DUALSOURCE_BLEND = FLAG_BIT(0),
-	// Free bits: 1-2
+	GPU_USE_LIGHT_UBERSHADER = FLAG_BIT(1),
+	// Free bit: 2
 	GPU_SUPPORTS_VS_RANGE_CULLING = FLAG_BIT(3),
 	GPU_SUPPORTS_BLEND_MINMAX = FLAG_BIT(4),
 	GPU_SUPPORTS_LOGIC_OP = FLAG_BIT(5),