Merge branch 'master' into platform_openxr_pico

2024-11-28 07:50:49 +00:00 · 2022-09-28 00:11:02 +02:00 · 2022-09-28 00:11:02 +02:00 · 0c2120d596
commit 0c2120d596
parent 464fc446e3 afe3ba01fc
22 changed files with 429 additions and 246 deletions
--- a/Common/GPU/Vulkan/VulkanFrameData.h
+++ b/Common/GPU/Vulkan/VulkanFrameData.h
@ -48,16 +48,16 @@ struct FrameData {
 	std::condition_variable fenceCondVar;
 	bool readyForFence = true;

-	VkFence fence;
-	VkFence readbackFence;  // Strictly speaking we might only need one global of these.
+	VkFence fence = VK_NULL_HANDLE;
+	VkFence readbackFence = VK_NULL_HANDLE;  // Strictly speaking we might only need one global of these.

 	// These are on different threads so need separate pools.
-	VkCommandPool cmdPoolInit;  // Written to from main thread
-	VkCommandPool cmdPoolMain;  // Written to from render thread, which also submits
+	VkCommandPool cmdPoolInit = VK_NULL_HANDLE;  // Written to from main thread
+	VkCommandPool cmdPoolMain = VK_NULL_HANDLE;  // Written to from render thread, which also submits

-	VkCommandBuffer initCmd;
-	VkCommandBuffer mainCmd;
-	VkCommandBuffer presentCmd;
+	VkCommandBuffer initCmd = VK_NULL_HANDLE;
+	VkCommandBuffer mainCmd = VK_NULL_HANDLE;
+	VkCommandBuffer presentCmd = VK_NULL_HANDLE;

 	bool hasInitCommands = false;
 	bool hasMainCommands = false;
@ -73,7 +73,7 @@ struct FrameData {

 	// Profiling.
 	QueueProfileContext profile;
-	bool profilingEnabled_;
+	bool profilingEnabled_ = false;

 	void Init(VulkanContext *vulkan, int index);
 	void Destroy(VulkanContext *vulkan);
--- a/Core/Compatibility.cpp
+++ b/Core/Compatibility.cpp
@ -109,6 +109,7 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) {
 	CheckSetting(iniFile, gameID, "SplitFramebufferMargin", &flags_.SplitFramebufferMargin);
 	CheckSetting(iniFile, gameID, "ForceLowerResolutionForEffectsOn", &flags_.ForceLowerResolutionForEffectsOn);
 	CheckSetting(iniFile, gameID, "AllowDownloadCLUT", &flags_.AllowDownloadCLUT);
+	CheckSetting(iniFile, gameID, "NearestFilteringOnFramebufferCreate", &flags_.NearestFilteringOnFramebufferCreate);
 }

 void Compatibility::CheckSetting(IniFile &iniFile, const std::string &gameID, const char *option, bool *flag) {
--- a/Core/Compatibility.h
+++ b/Core/Compatibility.h
@ -89,6 +89,7 @@ struct CompatFlags {
 	bool SplitFramebufferMargin;
 	bool ForceLowerResolutionForEffectsOn;
 	bool AllowDownloadCLUT;
+	bool NearestFilteringOnFramebufferCreate;
 };

 struct VRCompat {
--- a/Core/HLE/sceMpeg.cpp
+++ b/Core/HLE/sceMpeg.cpp
@ -125,7 +125,7 @@ struct SceMpegLLI
 };

 void SceMpegAu::read(u32 addr) {
-	Memory::Memcpy(this, addr, sizeof(this), "SceMpegAu");
+	Memory::Memcpy(this, addr, sizeof(*this), "SceMpegAu");
 	pts = (pts & 0xFFFFFFFFULL) << 32 | (((u64)pts) >> 32);
 	dts = (dts & 0xFFFFFFFFULL) << 32 | (((u64)dts) >> 32);
 }
@ -133,7 +133,7 @@ void SceMpegAu::read(u32 addr) {
 void SceMpegAu::write(u32 addr) {
 	pts = (pts & 0xFFFFFFFFULL) << 32 | (((u64)pts) >> 32);
 	dts = (dts & 0xFFFFFFFFULL) << 32 | (((u64)dts) >> 32);
-	Memory::Memcpy(addr, this, sizeof(this), "SceMpegAu");
+	Memory::Memcpy(addr, this, sizeof(*this), "SceMpegAu");
 }

 /*
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@ -717,6 +717,16 @@ uint64_t DrawEngineCommon::ComputeHash() {
 	return fullhash;
 }

+// Cheap bit scrambler from https://nullprogram.com/blog/2018/07/31/
+inline uint32_t lowbias32_r(uint32_t x) {
+	x ^= x >> 16;
+	x *= 0x43021123U;
+	x ^= x >> 15 ^ x >> 30;
+	x *= 0x1d69e2a5U;
+	x ^= x >> 16;
+	return x;
+}
+
 // vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
 void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead) {
 	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
@ -745,10 +755,10 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti
 	if (g_Config.bVertexCache) {
 		u32 dhash = dcid_;
 		dhash = __rotl(dhash ^ (u32)(uintptr_t)verts, 13);
-		dhash = __rotl(dhash ^ (u32)(uintptr_t)inds, 13);
-		dhash = __rotl(dhash ^ (u32)vertTypeID, 13);
-		dhash = __rotl(dhash ^ (u32)vertexCount, 13);
-		dcid_ = dhash ^ (u32)prim;
+		dhash = __rotl(dhash ^ (u32)(uintptr_t)inds, 19);
+		dhash = __rotl(dhash ^ (u32)vertTypeID, 7);
+		dhash = __rotl(dhash ^ (u32)vertexCount, 11);
+		dcid_ = lowbias32_r(dhash ^ (u32)prim);
 	}

 	DeferredDrawCall &dc = drawCalls[numDrawCalls];
--- a/GPU/Common/FragmentShaderGenerator.cpp
+++ b/GPU/Common/FragmentShaderGenerator.cpp
@ -183,7 +183,8 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
 			WRITE(p, "int roundAndScaleTo255i(in highp float x) { return int(floor(x * 255.0 + 0.5)); }\n");
 		}
 		if (enableColorTest && !colorTestAgainstZero) {
-			WRITE(p, "ivec3 roundAndScaleTo255iv(in highp vec3 x) { return ivec3(floor(x * 255.0 + 0.5)); }\n");
+			WRITE(p, "uint roundAndScaleTo8x4(in highp vec3 x) { uvec3 u = uvec3(floor(x * 255.0 + 0.5)); return u.r | (u.g << 8) | (u.b << 16); }\n");
+			WRITE(p, "uint packFloatsTo8x4(in vec3 x) { uvec3 u = uvec3(x); return u.r | (u.g << 8) | (u.b << 16); }\n");
 		}

 		WRITE(p, "layout (location = 0, index = 0) out vec4 fragColor0;\n");
@ -262,7 +263,8 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
 		}
 		if (enableColorTest) {
 			if (compat.shaderLanguage == HLSL_D3D11) {
-				WRITE(p, "uvec3 roundAndScaleTo255iv(float3 x) { return (floor(x * 255.0f + 0.5f)); }\n");
+				WRITE(p, "uint roundAndScaleTo8x4(float3 x) { uvec3 u = (floor(x * 255.0f + 0.5f)); return u.r | (u.g << 8) | (u.b << 16); }\n");
+				WRITE(p, "uint packFloatsTo8x4(in vec3 x) { uvec3 u = uvec3(x); return u.r | (u.g << 8) | (u.b << 16); }\n");
 			} else {
 				WRITE(p, "vec3 roundAndScaleTo255v(float3 x) { return floor(x * 255.0f + 0.5f); }\n");
 			}
@ -354,7 +356,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
 				WRITE(p, "uniform vec4 u_alphacolorref;\n");
 				if (compat.bitwiseOps && ((enableColorTest && !colorTestAgainstZero) || (enableAlphaTest && !alphaTestAgainstZero))) {
 					*uniformMask |= DIRTY_ALPHACOLORMASK;
-					WRITE(p, "uniform ivec4 u_alphacolormask;\n");
+					WRITE(p, "uniform uint u_alphacolormask;\n");
 				}
 			}
 		}
@ -408,7 +410,8 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
 			}
 			if (enableColorTest && !colorTestAgainstZero) {
 				if (compat.bitwiseOps) {
-					WRITE(p, "ivec3 roundAndScaleTo255iv(in vec3 x) { return ivec3(floor(x * 255.0 + 0.5)); }\n");
+					WRITE(p, "uint roundAndScaleTo8x4(in vec3 x) { uvec3 u = uvec3(floor(x * 255.99)); return u.r | (u.g << 8) | (u.b << 16); }\n");
+					WRITE(p, "uint packFloatsTo8x4(in vec3 x) { uvec3 u = uvec3(x); return u.r | (u.g << 8) | (u.b << 16); }\n");
 				} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
 					WRITE(p, "vec3 roundTo255thv(in vec3 x) { vec3 y = x + (0.5/255.0); return y - fract(y * 255.0) * (1.0 / 255.0); }\n");
 				} else {
@ -458,6 +461,12 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
 		WRITE(p, "}\n");
 	}

+	if (compat.bitwiseOps && enableColorTest) {
+		p.C("uvec3 unpackUVec3(highp uint x) {\n");
+		p.C("  return uvec3(x & 0xFF, (x >> 8) & 0xFF, (x >> 16) & 0xFF);\n");
+		p.C("}\n");
+	}
+
 	// PowerVR needs a custom modulo function. For some reason, this has far higher precision than the builtin one.
 	if ((gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) && needShaderTexClamp) {
 		WRITE(p, "float mymod(float a, float b) { return a - b * floor(a / b); }\n");
@ -873,7 +882,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
 				const char *alphaTestFuncs[] = { "#", "#", " != ", " == ", " >= ", " > ", " <= ", " < " };
 				if (alphaTestFuncs[alphaTestFunc][0] != '#') {
 					if (compat.bitwiseOps) {
-						WRITE(p, "  if ((roundAndScaleTo255i(v.a) & u_alphacolormask.a) %s int(u_alphacolorref.a)) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);
+						WRITE(p, "  if ((roundAndScaleTo255i(v.a) & int(u_alphacolormask >> 24)) %s int(u_alphacolorref.a)) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);
 					} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
 						// Work around bad PVR driver problem where equality check + discard just doesn't work.
 						if (alphaTestFunc != GE_COMP_NOTEQUAL) {
@ -927,34 +936,22 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
 				}
 			} else {
 				const char *colorTestFuncs[] = { "#", "#", " != ", " == " };
-				if (colorTestFuncs[colorTestFunc][0] != '#') {
+				const char *test = colorTestFuncs[colorTestFunc];
+				if (test[0] != '#') {
 					// TODO: Unify these paths better.
-					if (compat.shaderLanguage == HLSL_D3D11) {
-						const char *test = colorTestFuncs[colorTestFunc];
-						WRITE(p, "  uvec3 v_scaled = roundAndScaleTo255iv(v.rgb);\n");
-						WRITE(p, "  uvec3 v_masked = v_scaled & u_alphacolormask.rgb;\n");
-						WRITE(p, "  uvec3 colorTestRef = u_alphacolorref.rgb & u_alphacolormask.rgb;\n");
-						// We have to test the components separately, or we get incorrect results.  See #10629.
-						WRITE(p, "  if (v_masked.r %s colorTestRef.r && v_masked.g %s colorTestRef.g && v_masked.b %s colorTestRef.b) %s\n", test, test, test, discardStatement);
-					} else if (compat.shaderLanguage == HLSL_D3D9) {
-						const char *test = colorTestFuncs[colorTestFunc];
+					if (compat.shaderLanguage == HLSL_D3D9) {
 						// TODO: Use a texture to lookup bitwise ops instead?
 						WRITE(p, "  vec3 colortest = roundAndScaleTo255v(v.rgb);\n");
 						WRITE(p, "  if ((colortest.r %s u_alphacolorref.r) && (colortest.g %s u_alphacolorref.g) && (colortest.b %s u_alphacolorref.b)) %s\n", test, test, test, discardStatement);
 					} else if (compat.bitwiseOps) {
-						WRITE(p, "  ivec3 v_scaled = roundAndScaleTo255iv(v.rgb);\n");
-						if (compat.shaderLanguage == GLSL_VULKAN) {
-							// Apparently GLES3 does not support vector bitwise ops, but Vulkan does?
-							WRITE(p, "  if ((v_scaled & u_alphacolormask.rgb) %s (u_alphacolorref.rgb & u_alphacolormask.rgb)) %s\n", colorTestFuncs[colorTestFunc], discardStatement);
-						} else {
-							const char *maskedFragColor = "ivec3(v_scaled.r & u_alphacolormask.r, v_scaled.g & u_alphacolormask.g, v_scaled.b & u_alphacolormask.b)";
-							const char *maskedColorRef = "ivec3(int(u_alphacolorref.r) & u_alphacolormask.r, int(u_alphacolorref.g) & u_alphacolormask.g, int(u_alphacolorref.b) & u_alphacolormask.b)";
-							WRITE(p, "  if (%s %s %s) %s\n", maskedFragColor, colorTestFuncs[colorTestFunc], maskedColorRef, discardStatement);
-						}
+						WRITE(p, "  uint v_uint = roundAndScaleTo8x4(v.rgb);\n");
+						WRITE(p, "  uint v_masked = v_uint & u_alphacolormask;\n");
+						WRITE(p, "  uint colorTestRef = packFloatsTo8x4(u_alphacolorref.rgb) & u_alphacolormask;\n");
+						WRITE(p, "  if (v_masked %s colorTestRef) %s\n", test, discardStatement);
 					} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
-						WRITE(p, "  if (roundTo255thv(v.rgb) %s u_alphacolorref.rgb) %s\n", colorTestFuncs[colorTestFunc], discardStatement);
+						WRITE(p, "  if (roundTo255thv(v.rgb) %s u_alphacolorref.rgb) %s\n", test, discardStatement);
 					} else {
-						WRITE(p, "  if (roundAndScaleTo255v(v.rgb) %s u_alphacolorref.rgb) %s\n", colorTestFuncs[colorTestFunc], discardStatement);
+						WRITE(p, "  if (roundAndScaleTo255v(v.rgb) %s u_alphacolorref.rgb) %s\n", test, discardStatement);
 					}
 				} else {
 					WRITE(p, "  %s\n", discardStatement);
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@ -1061,7 +1061,11 @@ void FramebufferManagerCommon::DrawPixels(VirtualFramebuffer *vfb, int dstX, int

 	DrawTextureFlags flags;
 	if (useBufferedRendering_ && vfb && vfb->fbo) {
-		flags = channel == RASTER_COLOR ? DRAWTEX_LINEAR : DRAWTEX_NEAREST;
+		if (channel == RASTER_DEPTH || PSP_CoreParameter().compat.flags().NearestFilteringOnFramebufferCreate) {
+			flags = DRAWTEX_NEAREST;
+		} else {
+			flags = DRAWTEX_LINEAR;
+		}
 		draw_->BindFramebufferAsRenderTarget(vfb->fbo, { Draw::RPAction::KEEP, Draw::RPAction::KEEP, Draw::RPAction::KEEP }, tag);
 		SetViewport2D(0, 0, vfb->renderWidth, vfb->renderHeight);
 		draw_->SetScissorRect(0, 0, vfb->renderWidth, vfb->renderHeight);
--- a/GPU/Common/ShaderUniforms.cpp
+++ b/GPU/Common/ShaderUniforms.cpp
@ -80,7 +80,7 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView
 		Uint8x3ToInt4_Alpha(ub->alphaColorRef, gstate.getColorTestRef(), gstate.getAlphaTestRef() & gstate.getAlphaTestMask());
 	}
 	if (dirtyUniforms & DIRTY_ALPHACOLORMASK) {
-		Uint8x3ToInt4_Alpha(ub->colorTestMask, gstate.getColorTestMask(), gstate.getAlphaTestMask());
+		ub->colorTestMask = gstate.getColorTestMask() | (gstate.getAlphaTestMask() << 24);
 	}
 	if (dirtyUniforms & DIRTY_FOGCOLOR) {
 		Uint8x3ToFloat4(ub->fogColor, gstate.fogcolor);
--- a/GPU/Common/ShaderUniforms.h
+++ b/GPU/Common/ShaderUniforms.h
@ -17,10 +17,9 @@ enum : uint64_t {
 	DIRTY_MATDIFFUSE | DIRTY_MATSPECULAR | DIRTY_MATEMISSIVE | DIRTY_AMBIENT,
 };

-// TODO: Split into two structs, one for software transform and one for hardware transform, to save space.
-// Currently 512 bytes. Probably can't get to 256 (nVidia's UBO alignment).
+// Currently 480 bytes. Probably can't get to 256 (nVidia's UBO alignment, also common in other vendors).
 // Every line here is a 4-float.
-struct UB_VS_FS_Base {
+struct alignas(16) UB_VS_FS_Base {
 	float proj[16];
 	float proj_through[16];
 	float view[12];
@ -29,21 +28,19 @@ struct UB_VS_FS_Base {
 	float uvScaleOffset[4];
 	float depthRange[4];
 	// Rotation is used only for software transform.
-	float fogCoef[2]; float stencil; float rotation;
 	float matAmbient[4];
 	float cullRangeMin[4];
 	float cullRangeMax[4];
 	uint32_t spline_counts; uint32_t depal_mask_shift_off_fmt;  // 4 params packed into one.
 	uint32_t colorWriteMask; float mipBias;
 	// Fragment data
-	float fogColor[4];
-	float texEnvColor[4];  // .w is unused
+	float fogColor[4];     // .w is unused
+	float texEnvColor[3]; uint32_t colorTestMask;
 	int alphaColorRef[4];
-	int colorTestMask[4];
-	float blendFixA[4];  // .w is unused
-	float blendFixB[4];  // .w is unused
+	float blendFixA[3]; float stencil;
+	float blendFixB[3]; float rotation;
 	float texClamp[4];
-	float texClampOffset[4];  // .zw are unused
+	float texClampOffset[2]; float fogCoef[2];
 };

 static const char * const ub_baseStr =
@ -54,9 +51,6 @@ R"(  mat4 u_proj;
  mat3x4 u_texmtx;
  vec4 u_uvscaleoffset;
  vec4 u_depthRange;
-  vec2 u_fogcoef;
-  float u_stencilReplaceValue;
-  float u_rotation;
  vec4 u_matambientalpha;
  vec4 u_cullRangeMin;
  vec4 u_cullRangeMax;
@ -66,17 +60,18 @@ R"(  mat4 u_proj;
  float u_mipBias;
  vec3 u_fogcolor;
  vec3 u_texenv;
+  uint u_alphacolormask;
  ivec4 u_alphacolorref;
-  ivec4 u_alphacolormask;
-  vec3 u_blendFixA;
-  vec3 u_blendFixB;
+  vec3 u_blendFixA; float u_stencilReplaceValue;
+  vec3 u_blendFixB; float u_rotation;
  vec4 u_texclamp;
  vec2 u_texclampoff;
+  vec2 u_fogcoef;
 )";

 // 512 bytes. Would like to shrink more. Some colors only have 8-bit precision and we expand
 // them to float unnecessarily, could just as well expand in the shader.
-struct UB_VS_Lights {
+struct alignas(16) UB_VS_Lights {
 	float ambientColor[4];
 	float materialDiffuse[4];
 	float materialSpecular[4];
@ -129,7 +124,7 @@ R"(	vec4 u_ambient;

 // With some cleverness, we could get away with uploading just half this when only the four or five first
 // bones are being used. This is 384b.
-struct UB_VS_Bones {
+struct alignas(16) UB_VS_Bones {
 	float bones[8][12];
 };

--- a/GPU/Common/SoftwareTransformCommon.cpp
+++ b/GPU/Common/SoftwareTransformCommon.cpp
@ -168,6 +168,29 @@ void SoftwareTransform::SetProjMatrix(float mtx[14], bool invertedX, bool invert
 	projMatrix_.translateAndScale(trans, scale);
 }

+static void ReadWeightedNormal(Vec3f &source, VertexReader &reader, u32 vertType, bool skinningEnabled) {
+	if (reader.hasNormal())
+		reader.ReadNrm(source.AsArray());
+	if (skinningEnabled) {
+		float weights[8];
+		reader.ReadWeights(weights);
+
+		// Have to recalculate this, unfortunately.  Please use software skinning...
+		Vec3f nsum(0, 0, 0);
+		for (int i = 0; i < vertTypeGetNumBoneWeights(vertType); i++) {
+			if (weights[i] != 0.0f) {
+				Vec3f norm;
+				Norm3ByMatrix43(norm.AsArray(), source.AsArray(), gstate.boneMatrix + i * 12);
+				nsum += norm * weights[i];
+			}
+		}
+
+		source = nsum;
+	}
+	if (gstate.areNormalsReversed())
+		source = -source;
+}
+
 void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVtxFormat, int maxIndex, SoftwareTransformResult *result) {
 	u8 *decoded = params_.decoded;
 	TransformedVertex *transformed = params_.transformed;
@ -284,7 +307,7 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt
 				}
 			} else {
 				float weights[8];
-				// TODO: For flat, are weights from the provoking used for color/normal?
+				// For flat, we need the vertex weights.
 				reader.Goto(index);
 				reader.ReadWeights(weights);

@ -358,10 +381,8 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt

 			case GE_TEXMAP_TEXTURE_MATRIX:
 				{
-					// TODO: What's the correct behavior with flat shading?  Provoked normal or real normal?
-
 					// Projection mapping
-					Vec3f source;
+					Vec3f source(0.0f, 0.0f, 1.0f);
 					switch (gstate.getUVProjMode())	{
 					case GE_PROJMAP_POSITION: // Use model space XYZ as source
 						source = pos;
@ -372,14 +393,28 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt
 						break;

 					case GE_PROJMAP_NORMALIZED_NORMAL: // Use normalized normal as source
-						source = normal.NormalizedOr001(cpu_info.bSSE4_1);
+						// Flat uses the vertex normal, not provoking.
+						if (provokeIndOffset == 0) {
+							source = normal.Normalized(cpu_info.bSSE4_1);
+						} else {
+							reader.Goto(index);
+							ReadWeightedNormal(source, reader, vertType, skinningEnabled);
+							source.Normalize();
+						}
 						if (!reader.hasNormal()) {
 							ERROR_LOG_REPORT(G3D, "Normal projection mapping without normal?");
 						}
 						break;

 					case GE_PROJMAP_NORMAL: // Use non-normalized normal as source!
-						source = normal;
+						// Flat uses the vertex normal, not provoking.
+						if (provokeIndOffset == 0) {
+							source = normal;
+						} else {
+							// Need to read the normal for this vertex and weight it again.
+							reader.Goto(index);
+							ReadWeightedNormal(source, reader, vertType, skinningEnabled);
+						}
 						if (!reader.hasNormal()) {
 							ERROR_LOG_REPORT(G3D, "Normal projection mapping without normal?");
 						}
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@ -1231,9 +1231,9 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 						break;
 					case GE_PROJMAP_NORMALIZED_NORMAL:  // Use normalized transformed normal as source
 						if ((doBezier || doSpline) && hasNormalTess)
-							temp_tc = StringFromFormat("length(tess.nrm) == 0.0 ? vec4(0.0, 0.0, 1.0, 1.0) : vec4(normalize(%stess.nrm), 1.0)", flipNormalTess ? "-" : "");
+							temp_tc = StringFromFormat("length(tess.nrm) == 0.0 ? vec4(0.0, 0.0, 0.0, 1.0) : vec4(normalize(%stess.nrm), 1.0)", flipNormalTess ? "-" : "");
 						else if (hasNormal)
-							temp_tc = StringFromFormat("length(normal) == 0.0 ? vec4(0.0, 0.0, 1.0, 1.0) : vec4(normalize(%snormal), 1.0)", flipNormal ? "-" : "");
+							temp_tc = StringFromFormat("length(normal) == 0.0 ? vec4(0.0, 0.0, 0.0, 1.0) : vec4(normalize(%snormal), 1.0)", flipNormal ? "-" : "");
 						else
 							temp_tc = "vec4(0.0, 0.0, 1.0, 1.0)";
 						break;
--- a/GPU/GLES/ShaderManagerGLES.cpp
+++ b/GPU/GLES/ShaderManagerGLES.cpp
@ -448,7 +448,7 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid, bool useBu
 		SetColorUniform3Alpha255(render_, &u_alphacolorref, gstate.getColorTestRef(), gstate.getAlphaTestRef() & gstate.getAlphaTestMask());
 	}
 	if (dirty & DIRTY_ALPHACOLORMASK) {
-		SetColorUniform3iAlpha(render_, &u_alphacolormask, gstate.colortestmask, gstate.getAlphaTestMask());
+		render_->SetUniformUI1(&u_alphacolormask, gstate.getColorTestMask() | (gstate.getAlphaTestMask() << 24));
 	}
 	if (dirty & DIRTY_COLORWRITEMASK) {
 		render_->SetUniformUI1(&u_colorWriteMask, ~((gstate.pmska << 24) | (gstate.pmskc & 0xFFFFFF)));
--- a/GPU/Software/Clipper.cpp
+++ b/GPU/Software/Clipper.cpp
@ -45,7 +45,7 @@ inline bool different_signs(float x, float y) {
 	return ((x <= 0 && y > 0) || (x > 0 && y <= 0));
 }

-inline float clip_dotprod(const VertexData &vert, float A, float B, float C, float D) {
+inline float clip_dotprod(const ClipVertexData &vert, float A, float B, float C, float D) {
 	return (vert.clippos.x * A + vert.clippos.y * B + vert.clippos.z * C + vert.clippos.w * D);
 }

@ -131,7 +131,61 @@ static inline bool CheckOutsideZ(ClipCoords p, int &pos, int &neg) {
 	return false;
 }

-void ProcessRect(const VertexData &v0, const VertexData &v1, BinManager &binner) {
+static void RotateUV(const VertexData &tl, const VertexData &br, VertexData &tr, VertexData &bl) {
+	const int x1 = tl.screenpos.x;
+	const int x2 = br.screenpos.x;
+	const int y1 = tl.screenpos.y;
+	const int y2 = br.screenpos.y;
+
+	if ((x1 < x2 && y1 > y2) || (x1 > x2 && y1 < y2)) {
+		std::swap(bl.texturecoords, tr.texturecoords);
+	}
+}
+
+// This is used for rectangle texture projection, which is very uncommon.
+// To avoid complicating the common rectangle path, this just uses triangles.
+static void AddTriangleRect(const VertexData &v0, const VertexData &v1, BinManager &binner) {
+	VertexData buf[4];
+	buf[0] = v1;
+	buf[0].screenpos = ScreenCoords(v0.screenpos.x, v0.screenpos.y, v1.screenpos.z);
+	buf[0].texturecoords = v0.texturecoords;
+
+	buf[1] = v1;
+	buf[1].screenpos = ScreenCoords(v0.screenpos.x, v1.screenpos.y, v1.screenpos.z);
+	buf[1].texturecoords = Vec3Packed<float>(v0.texturecoords.x, v1.texturecoords.y, v0.texturecoords.z);
+
+	buf[2] = v1;
+	buf[2].screenpos = ScreenCoords(v1.screenpos.x, v0.screenpos.y, v1.screenpos.z);
+	buf[2].texturecoords = Vec3Packed<float>(v1.texturecoords.x, v0.texturecoords.y, v1.texturecoords.z);
+
+	buf[3] = v1;
+
+	VertexData *topleft = &buf[0];
+	VertexData *topright = &buf[1];
+	VertexData *bottomleft = &buf[2];
+	VertexData *bottomright = &buf[3];
+
+	// DrawTriangle always culls, so sort out the drawing order.
+	for (int i = 0; i < 4; ++i) {
+		if (buf[i].screenpos.x < topleft->screenpos.x && buf[i].screenpos.y < topleft->screenpos.y)
+			topleft = &buf[i];
+		if (buf[i].screenpos.x > topright->screenpos.x && buf[i].screenpos.y < topright->screenpos.y)
+			topright = &buf[i];
+		if (buf[i].screenpos.x < bottomleft->screenpos.x && buf[i].screenpos.y > bottomleft->screenpos.y)
+			bottomleft = &buf[i];
+		if (buf[i].screenpos.x > bottomright->screenpos.x && buf[i].screenpos.y > bottomright->screenpos.y)
+			bottomright = &buf[i];
+	}
+
+	RotateUV(v0, v1, *topright, *bottomleft);
+
+	binner.AddTriangle(*topleft, *topright, *bottomleft);
+	binner.AddTriangle(*bottomleft, *topright, *topleft);
+	binner.AddTriangle(*topright, *bottomright, *bottomleft);
+	binner.AddTriangle(*bottomleft, *bottomright, *topright);
+}
+
+void ProcessRect(const ClipVertexData &v0, const ClipVertexData &v1, BinManager &binner) {
 	if (!binner.State().throughMode) {
 		// If any verts were outside range, throw the entire prim away.
 		if (v0.OutsideRange() || v1.OutsideRange())
@ -149,37 +203,44 @@ void ProcessRect(const VertexData &v0, const VertexData &v1, BinManager &binner)
 		else if (outsidePos >= 2 || outsideNeg >= 2)
 			return;

-		if (v0.fogdepth != v1.fogdepth) {
+		if (v0.v.fogdepth != v1.v.fogdepth) {
 			// Rectangles seem to always use nearest along X for fog depth, but reversed.
 			// TODO: Check exactness of middle.
-			VertexData vhalf0 = v1;
-			vhalf0.screenpos.x = v0.screenpos.x + (v1.screenpos.x - v0.screenpos.x) / 2;
+			VertexData vhalf0 = v1.v;
+			vhalf0.screenpos.x = v0.v.screenpos.x + (v1.v.screenpos.x - v0.v.screenpos.x) / 2;

-			VertexData vhalf1 = v1;
-			vhalf1.screenpos.x = v0.screenpos.x + (v1.screenpos.x - v0.screenpos.x) / 2;
-			vhalf1.screenpos.y = v0.screenpos.y;
+			VertexData vhalf1 = v1.v;
+			vhalf1.screenpos.x = v0.v.screenpos.x + (v1.v.screenpos.x - v0.v.screenpos.x) / 2;
+			vhalf1.screenpos.y = v0.v.screenpos.y;

-			VertexData vrev1 = v1;
-			vrev1.fogdepth = v0.fogdepth;
+			VertexData vrev1 = v1.v;
+			vrev1.fogdepth = v0.v.fogdepth;

-			binner.AddRect(v0, vhalf0);
-			binner.AddRect(vhalf1, vrev1);
+			if (binner.State().textureProj) {
+				AddTriangleRect(v0.v, vhalf0, binner);
+				AddTriangleRect(vhalf1, vrev1, binner);
+			} else {
+				binner.AddRect(v0.v, vhalf0);
+				binner.AddRect(vhalf1, vrev1);
+			}
+		} else if (binner.State().textureProj) {
+			AddTriangleRect(v0.v, v1.v, binner);
 		} else {
-			binner.AddRect(v0, v1);
+			binner.AddRect(v0.v, v1.v);
 		}
 	} else {
 		// through mode handling
-		if (Rasterizer::RectangleFastPath(v0, v1, binner)) {
+		if (Rasterizer::RectangleFastPath(v0.v, v1.v, binner)) {
 			return;
 		} else if (gstate.isModeClear() && !gstate.isDitherEnabled()) {
-			binner.AddClearRect(v0, v1);
+			binner.AddClearRect(v0.v, v1.v);
 		} else {
-			binner.AddRect(v0, v1);
+			binner.AddRect(v0.v, v1.v);
 		}
 	}
 }

-void ProcessPoint(const VertexData &v0, BinManager &binner) {
+void ProcessPoint(const ClipVertexData &v0, BinManager &binner) {
 	// If any verts were outside range, throw the entire prim away.
 	if (!binner.State().throughMode) {
 		if (v0.OutsideRange())
@ -187,13 +248,13 @@ void ProcessPoint(const VertexData &v0, BinManager &binner) {
 	}

 	// Points need no clipping. Will be bounds checked in the rasterizer (which seems backwards?)
-	binner.AddPoint(v0);
+	binner.AddPoint(v0.v);
 }

-void ProcessLine(const VertexData &v0, const VertexData &v1, BinManager &binner) {
+void ProcessLine(const ClipVertexData &v0, const ClipVertexData &v1, BinManager &binner) {
 	if (binner.State().throughMode) {
 		// Actually, should clip this one too so we don't need to do bounds checks in the rasterizer.
-		binner.AddLine(v0, v1);
+		binner.AddLine(v0.v, v1.v);
 		return;
 	}

@ -216,24 +277,26 @@ void ProcessLine(const VertexData &v0, const VertexData &v1, BinManager &binner)
 	int mask1 = CalcClipMask(v1.clippos);
 	int mask = mask0 | mask1;
 	if ((mask & CLIP_NEG_Z_BIT) == 0) {
-		binner.AddLine(v0, v1);
+		binner.AddLine(v0.v, v1.v);
 		return;
 	}

-	VertexData ClippedVertices[2] = { v0, v1 };
-	VertexData *Vertices[2] = { &ClippedVertices[0], &ClippedVertices[1] };
+	ClipVertexData ClippedVertices[2] = { v0, v1 };
+	ClipVertexData *Vertices[2] = { &ClippedVertices[0], &ClippedVertices[1] };
 	bool clipped = false;
 	CLIP_LINE(CLIP_NEG_Z_BIT,  0,  0,  1, 1);

-	VertexData data[2] = { *Vertices[0], *Vertices[1] };
+	ClipVertexData data[2] = { *Vertices[0], *Vertices[1] };
 	if (clipped) {
-		data[0].screenpos = TransformUnit::ClipToScreen(data[0].clippos);
-		data[1].screenpos = TransformUnit::ClipToScreen(data[1].clippos);
+		data[0].v.screenpos = TransformUnit::ClipToScreen(data[0].clippos);
+		data[1].v.screenpos = TransformUnit::ClipToScreen(data[1].clippos);
+		data[0].v.clipw = data[0].clippos.w;
+		data[1].v.clipw = data[1].clippos.w;
 	}
-	binner.AddLine(data[0], data[1]);
+	binner.AddLine(data[0].v, data[1].v);
 }

-void ProcessTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2, const VertexData &provoking, BinManager &binner) {
+void ProcessTriangle(const ClipVertexData &v0, const ClipVertexData &v1, const ClipVertexData &v2, const ClipVertexData &provoking, BinManager &binner) {
 	int mask = 0;
 	if (!binner.State().throughMode) {
 		// If any verts were outside range, throw the entire prim away.
@ -262,20 +325,20 @@ void ProcessTriangle(const VertexData &v0, const VertexData &v1, const VertexDat
 	if ((mask & CLIP_NEG_Z_BIT) == 0) {
 		if (gstate.getShadeMode() == GE_SHADE_FLAT) {
 			// So that the order of clipping doesn't matter...
-			VertexData corrected2 = v2;
-			corrected2.color0 = provoking.color0;
-			corrected2.color1 = provoking.color1;
-			binner.AddTriangle(v0, v1, corrected2);
+			VertexData corrected2 = v2.v;
+			corrected2.color0 = provoking.v.color0;
+			corrected2.color1 = provoking.v.color1;
+			binner.AddTriangle(v0.v, v1.v, corrected2);
 		} else {
-			binner.AddTriangle(v0, v1, v2);
+			binner.AddTriangle(v0.v, v1.v, v2.v);
 		}
 		return;
 	}

 	enum { NUM_CLIPPED_VERTICES = 3, NUM_INDICES = NUM_CLIPPED_VERTICES + 3 };

-	VertexData* Vertices[NUM_INDICES];
-	VertexData ClippedVertices[NUM_INDICES];
+	ClipVertexData* Vertices[NUM_INDICES];
+	ClipVertexData ClippedVertices[NUM_INDICES];
 	for (int i = 0; i < NUM_INDICES; ++i)
 		Vertices[i] = &ClippedVertices[i];

@ -319,22 +382,25 @@ void ProcessTriangle(const VertexData &v0, const VertexData &v1, const VertexDat

 	for (int i = 0; i + 3 <= numIndices; i += 3) {
 		if (indices[i] != SKIP_FLAG) {
-			VertexData &subv0 = *Vertices[indices[i + 0]];
-			VertexData &subv1 = *Vertices[indices[i + 1]];
-			VertexData &subv2 = *Vertices[indices[i + 2]];
+			ClipVertexData &subv0 = *Vertices[indices[i + 0]];
+			ClipVertexData &subv1 = *Vertices[indices[i + 1]];
+			ClipVertexData &subv2 = *Vertices[indices[i + 2]];
 			if (clipped) {
-				subv0.screenpos = TransformUnit::ClipToScreen(subv0.clippos);
-				subv1.screenpos = TransformUnit::ClipToScreen(subv1.clippos);
-				subv2.screenpos = TransformUnit::ClipToScreen(subv2.clippos);
+				subv0.v.screenpos = TransformUnit::ClipToScreen(subv0.clippos);
+				subv1.v.screenpos = TransformUnit::ClipToScreen(subv1.clippos);
+				subv2.v.screenpos = TransformUnit::ClipToScreen(subv2.clippos);
+				subv0.v.clipw = subv0.clippos.w;
+				subv1.v.clipw = subv1.clippos.w;
+				subv2.v.clipw = subv2.clippos.w;
 			}

 			if (gstate.getShadeMode() == GE_SHADE_FLAT) {
 				// So that the order of clipping doesn't matter...
-				subv2.color0 = provoking.color0;
-				subv2.color1 = provoking.color1;
+				subv2.v.color0 = provoking.v.color0;
+				subv2.v.color1 = provoking.v.color1;
 			}

-			binner.AddTriangle(subv0, subv1, subv2);
+			binner.AddTriangle(subv0.v, subv1.v, subv2.v);
 		}
 	}
 }
--- a/GPU/Software/Clipper.h
+++ b/GPU/Software/Clipper.h
@ -26,9 +26,9 @@ class BinManager;

 namespace Clipper {

-void ProcessPoint(const VertexData &v0, BinManager &binner);
-void ProcessLine(const VertexData &v0, const VertexData &v1, BinManager &binner);
-void ProcessTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2, const VertexData &provoking, BinManager &binner);
-void ProcessRect(const VertexData &v0, const VertexData &v1, BinManager &binner);
+void ProcessPoint(const ClipVertexData &v0, BinManager &binner);
+void ProcessLine(const ClipVertexData &v0, const ClipVertexData &v1, BinManager &binner);
+void ProcessTriangle(const ClipVertexData &v0, const ClipVertexData &v1, const ClipVertexData &v2, const ClipVertexData &provoking, BinManager &binner);
+void ProcessRect(const ClipVertexData &v0, const ClipVertexData &v1, BinManager &binner);

 }
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@ -129,6 +129,7 @@ void ComputeRasterizerState(RasterizerState *state) {
 		state->mipFilt = gstate.isMipmapFilteringEnabled();
 		state->minFilt = gstate.isMinifyFilteringEnabled();
 		state->magFilt = gstate.isMagnifyFilteringEnabled();
+		state->textureProj = gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_MATRIX;
 	}

 	state->shadeGouraud = gstate.getShadeMode() == GE_SHADE_GOURAUD;
@ -224,12 +225,9 @@ static inline u8 ClampFogDepth(float fogdepth) {
 }

 static inline void GetTextureCoordinates(const VertexData& v0, const VertexData& v1, const float p, float &s, float &t) {
-	// All UV gen modes, by the time they get here, behave the same.
-
-	// TODO: What happens if vertex has no texture coordinates?
 	// Note that for environment mapping, texture coordinates have been calculated during lighting
-	float q0 = 1.f / v0.clippos.w;
-	float q1 = 1.f / v1.clippos.w;
+	float q0 = 1.f / v0.clipw;
+	float q1 = 1.f / v1.clipw;
 	float wq0 = p * q0;
 	float wq1 = (1.0f - p) * q1;

@ -238,14 +236,26 @@ static inline void GetTextureCoordinates(const VertexData& v0, const VertexData&
 	t = (v0.texturecoords.t() * wq0 + v1.texturecoords.t() * wq1) * q_recip;
 }

-static inline void GetTextureCoordinates(const VertexData &v0, const VertexData &v1, const VertexData &v2, const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2, const Vec4<float> &wsum_recip, Vec4<float> &s, Vec4<float> &t) {
-	// All UV gen modes, by the time they get here, behave the same.
+static inline void GetTextureCoordinatesProj(const VertexData& v0, const VertexData& v1, const float p, float &s, float &t) {
+	// This is for texture matrix projection.
+	float q0 = 1.f / v0.clipw;
+	float q1 = 1.f / v1.clipw;
+	float wq0 = p * q0;
+	float wq1 = (1.0f - p) * q1;

-	// TODO: What happens if vertex has no texture coordinates?
+	float q_recip = 1.0f / (wq0 + wq1);
+	float q = (v0.texturecoords.q() * wq0 + v1.texturecoords.q() * wq1) * q_recip;
+	q_recip *= 1.0f / q;
+
+	s = (v0.texturecoords.s() * wq0 + v1.texturecoords.s() * wq1) * q_recip;
+	t = (v0.texturecoords.t() * wq0 + v1.texturecoords.t() * wq1) * q_recip;
+}
+
+static inline void GetTextureCoordinates(const VertexData &v0, const VertexData &v1, const VertexData &v2, const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2, const Vec4<float> &wsum_recip, Vec4<float> &s, Vec4<float> &t) {
 	// Note that for environment mapping, texture coordinates have been calculated during lighting.
-	float q0 = 1.f / v0.clippos.w;
-	float q1 = 1.f / v1.clippos.w;
-	float q2 = 1.f / v2.clippos.w;
+	float q0 = 1.f / v0.clipw;
+	float q1 = 1.f / v1.clipw;
+	float q2 = 1.f / v2.clipw;
 	Vec4<float> wq0 = w0.Cast<float>() * q0;
 	Vec4<float> wq1 = w1.Cast<float>() * q1;
 	Vec4<float> wq2 = w2.Cast<float>() * q2;
@ -255,6 +265,23 @@ static inline void GetTextureCoordinates(const VertexData &v0, const VertexData
 	t = Interpolate(v0.texturecoords.t(), v1.texturecoords.t(), v2.texturecoords.t(), wq0, wq1, wq2, q_recip);
 }

+static inline void GetTextureCoordinatesProj(const VertexData &v0, const VertexData &v1, const VertexData &v2, const Vec4<int> &w0, const Vec4<int> &w1, const Vec4<int> &w2, const Vec4<float> &wsum_recip, Vec4<float> &s, Vec4<float> &t) {
+	// This is for texture matrix projection.
+	float q0 = 1.f / v0.clipw;
+	float q1 = 1.f / v1.clipw;
+	float q2 = 1.f / v2.clipw;
+	Vec4<float> wq0 = w0.Cast<float>() * q0;
+	Vec4<float> wq1 = w1.Cast<float>() * q1;
+	Vec4<float> wq2 = w2.Cast<float>() * q2;
+
+	Vec4<float> q_recip = (wq0 + wq1 + wq2).Reciprocal();
+	Vec4<float> q = Interpolate(v0.texturecoords.q(), v1.texturecoords.q(), v2.texturecoords.q(), wq0, wq1, wq2, q_recip);
+	q_recip = q_recip * q.Reciprocal();
+
+	s = Interpolate(v0.texturecoords.s(), v1.texturecoords.s(), v2.texturecoords.s(), wq0, wq1, wq2, q_recip);
+	t = Interpolate(v0.texturecoords.t(), v1.texturecoords.t(), v2.texturecoords.t(), wq0, wq1, wq2, q_recip);
+}
+
 static inline void SetPixelDepth(int x, int y, int stride, u16 value) {
 	depthbuf.Set16(x, y, stride, value);
 }
@ -676,6 +703,9 @@ void DrawTriangleSlice(
 						// For levels > 0, mipmapping is always based on level 0.  Simpler to scale first.
 						s *= 1.0f / (float)(1 << state.samplerID.width0Shift);
 						t *= 1.0f / (float)(1 << state.samplerID.height0Shift);
+					} else if (state.textureProj) {
+						// Texture coordinate interpolation must definitely be perspective-correct.
+						GetTextureCoordinatesProj(v0, v1, v2, w0, w1, w2, wsum_recip, s, t);
 					} else {
 						// Texture coordinate interpolation must definitely be perspective-correct.
 						GetTextureCoordinates(v0, v1, v2, w0, w1, w2, wsum_recip, s, t);
@ -772,8 +802,9 @@ void DrawRectangle(const VertexData &v0, const VertexData &v1, const BinCoords &
 	Vec2f stx(0.0f, 0.0f);
 	Vec2f sty(0.0f, 0.0f);
 	if (state.enableTextures) {
-		Vec2f tc0 = v0.texturecoords;
-		Vec2f tc1 = v1.texturecoords;
+		// Note: texture projection is not handled here, those always turn into triangles.
+		Vec2f tc0 = v0.texturecoords.uv();
+		Vec2f tc1 = v1.texturecoords.uv();
 		if (state.throughMode) {
 			// For levels > 0, mipmapping is always based on level 0.  Simpler to scale first.
 			tc0.s() *= 1.0f / (float)(1 << state.samplerID.width0Shift);
@ -960,6 +991,8 @@ void DrawPoint(const VertexData &v0, const BinCoords &range, const RasterizerSta
 		if (state.throughMode) {
 			s *= 1.0f / (float)(1 << state.samplerID.width0Shift);
 			t *= 1.0f / (float)(1 << state.samplerID.height0Shift);
+		} else if (state.textureProj) {
+			GetTextureCoordinatesProj(v0, v0, 0.0f, s, t);
 		} else {
 			// Texture coordinate interpolation must definitely be perspective-correct.
 			GetTextureCoordinates(v0, v0, 0.0f, s, t);
@ -1270,13 +1303,16 @@ void DrawLine(const VertexData &v0, const VertexData &v1, const BinCoords &range
 				float s, s1;
 				float t, t1;
 				if (state.throughMode) {
-					Vec2<float> tc = (v0.texturecoords * (float)(steps - i) + v1.texturecoords * (float)i) / steps1;
-					Vec2<float> tc1 = (v0.texturecoords * (float)(steps - i - 1) + v1.texturecoords * (float)(i + 1)) / steps1;
+					Vec2<float> tc = (v0.texturecoords.uv() * (float)(steps - i) + v1.texturecoords.uv() * (float)i) / steps1;
+					Vec2<float> tc1 = (v0.texturecoords.uv() * (float)(steps - i - 1) + v1.texturecoords.uv() * (float)(i + 1)) / steps1;

 					s = tc.s() * (1.0f / (float)(1 << state.samplerID.width0Shift));
 					s1 = tc1.s() * (1.0f / (float)(1 << state.samplerID.width0Shift));
 					t = tc.t() * (1.0f / (float)(1 << state.samplerID.height0Shift));
 					t1 = tc1.t() * (1.0f / (float)(1 << state.samplerID.height0Shift));
+				} else if (state.textureProj) {
+					GetTextureCoordinatesProj(v0, v1, (float)(steps - i) / steps1, s, t);
+					GetTextureCoordinatesProj(v0, v1, (float)(steps - i - 1) / steps1, s1, t1);
 				} else {
 					// Texture coordinate interpolation must definitely be perspective-correct.
 					GetTextureCoordinates(v0, v1, (float)(steps - i) / steps1, s, t);
--- a/GPU/Software/Rasterizer.h
+++ b/GPU/Software/Rasterizer.h
@ -54,6 +54,7 @@ struct RasterizerState {
 		bool minFilt : 1;
 		bool magFilt : 1;
 		bool antialiasLines : 1;
+		bool textureProj : 1;
 	};

 #if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
--- a/GPU/Software/RasterizerRectangle.cpp
+++ b/GPU/Software/RasterizerRectangle.cpp
@ -356,7 +356,7 @@ bool RectangleFastPath(const VertexData &v0, const VertexData &v1, BinManager &b
 	// Currently only works for TL/BR, which is the most common but not required.
 	bool orient_check = xdiff >= 0 && ydiff >= 0;
 	// We already have a fast path for clear in ClearRectangle.
-	bool state_check = state.throughMode && !state.pixelID.clearMode && !state.samplerID.hasAnyMips && NoClampOrWrap(state, v0.texturecoords) && NoClampOrWrap(state, v1.texturecoords);
+	bool state_check = state.throughMode && !state.pixelID.clearMode && !state.samplerID.hasAnyMips && NoClampOrWrap(state, v0.texturecoords.uv()) && NoClampOrWrap(state, v1.texturecoords.uv());
 	// This doesn't work well with offset drawing, see #15876.  Through never has a subpixel offset.
 	bool subpixel_check = ((v0.screenpos.x | v0.screenpos.y | v1.screenpos.x | v1.screenpos.y) & 0xF) == 0;
 	if ((coord_check || !state.enableTextures) && orient_check && state_check && subpixel_check) {
@ -393,16 +393,16 @@ bool RectangleFastPath(const VertexData &v0, const VertexData &v1, BinManager &b
 	return false;
 }

-static bool AreCoordsRectangleCompatible(const RasterizerState &state, const VertexData &data0, const VertexData &data1) {
-	if (data1.color0 != data0.color0)
+static bool AreCoordsRectangleCompatible(const RasterizerState &state, const ClipVertexData &data0, const ClipVertexData &data1) {
+	if (data1.v.color0 != data0.v.color0)
 		return false;
-	if (data1.screenpos.z != data0.screenpos.z) {
+	if (data1.v.screenpos.z != data0.v.screenpos.z) {
 		// Sometimes, we don't actually care about z.
 		if (state.pixelID.depthWrite || state.pixelID.DepthTestFunc() != GE_COMP_ALWAYS)
 			return false;
 	}
 	if (!state.throughMode) {
-		if (data1.color1 != data0.color1)
+		if (data1.v.color1 != data0.v.color1)
 			return false;
 		// This means it should be culled, outside range.
 		if (data1.OutsideRange() || data0.OutsideRange())
@ -414,26 +414,29 @@ static bool AreCoordsRectangleCompatible(const RasterizerState &state, const Ver
 			if (data1.clippos.w - halftexel > data0.clippos.w || data1.clippos.w + halftexel < data0.clippos.w)
 				return false;
 		}
-		if (state.pixelID.applyFog && data1.fogdepth != data0.fogdepth) {
+		// If we're projecting textures, only allow an exact match for simplicity.
+		if (state.enableTextures && data1.v.texturecoords.q() != data0.v.texturecoords.q())
+			return false;
+		if (state.pixelID.applyFog && data1.v.fogdepth != data0.v.fogdepth) {
 			// Similar to w, this only matters if they're farther apart than 1/255.
 			static constexpr float foghalfstep = 0.5f / 255.0f;
-			if (data1.fogdepth - foghalfstep > data0.fogdepth || data1.fogdepth + foghalfstep < data0.fogdepth)
+			if (data1.v.fogdepth - foghalfstep > data0.v.fogdepth || data1.v.fogdepth + foghalfstep < data0.v.fogdepth)
 				return false;
 		}
 	}
 	return true;
 }

-bool DetectRectangleFromStrip(const RasterizerState &state, const VertexData data[4], int *tlIndex, int *brIndex) {
+bool DetectRectangleFromStrip(const RasterizerState &state, const ClipVertexData data[4], int *tlIndex, int *brIndex) {
 	// Color and Z must be flat.  Also find the TL and BR meanwhile.
 	int tl = 0, br = 0;
 	for (int i = 1; i < 4; ++i) {
 		if (!AreCoordsRectangleCompatible(state, data[i], data[0]))
 			return false;

-		if (data[i].screenpos.x <= data[tl].screenpos.x && data[i].screenpos.y <= data[tl].screenpos.y)
+		if (data[i].v.screenpos.x <= data[tl].v.screenpos.x && data[i].v.screenpos.y <= data[tl].v.screenpos.y)
 			tl = i;
-		if (data[i].screenpos.x >= data[br].screenpos.x && data[i].screenpos.y >= data[br].screenpos.y)
+		if (data[i].v.screenpos.x >= data[br].v.screenpos.x && data[i].v.screenpos.y >= data[br].v.screenpos.y)
 			br = i;
 	}

@ -442,36 +445,36 @@ bool DetectRectangleFromStrip(const RasterizerState &state, const VertexData dat

 	// OK, now let's look at data to detect rectangles. There are a few possibilities
 	// but we focus on Darkstalkers for now.
-	if (data[0].screenpos.x == data[1].screenpos.x &&
-		data[0].screenpos.y == data[2].screenpos.y &&
-		data[2].screenpos.x == data[3].screenpos.x &&
-		data[1].screenpos.y == data[3].screenpos.y) {
+	if (data[0].v.screenpos.x == data[1].v.screenpos.x &&
+		data[0].v.screenpos.y == data[2].v.screenpos.y &&
+		data[2].v.screenpos.x == data[3].v.screenpos.x &&
+		data[1].v.screenpos.y == data[3].v.screenpos.y) {
 		// Okay, this is in the shape of a rectangle, but what about texture?
 		if (!state.enableTextures)
 			return true;

-		if (data[0].texturecoords.x == data[1].texturecoords.x &&
-			data[0].texturecoords.y == data[2].texturecoords.y &&
-			data[2].texturecoords.x == data[3].texturecoords.x &&
-			data[1].texturecoords.y == data[3].texturecoords.y) {
+		if (data[0].v.texturecoords.x == data[1].v.texturecoords.x &&
+			data[0].v.texturecoords.y == data[2].v.texturecoords.y &&
+			data[2].v.texturecoords.x == data[3].v.texturecoords.x &&
+			data[1].v.texturecoords.y == data[3].v.texturecoords.y) {
 			// It's a rectangle!
 			return true;
 		}
 		return false;
 	}
 	// There's the other vertex order too...
-	if (data[0].screenpos.x == data[2].screenpos.x &&
-		data[0].screenpos.y == data[1].screenpos.y &&
-		data[1].screenpos.x == data[3].screenpos.x &&
-		data[2].screenpos.y == data[3].screenpos.y) {
+	if (data[0].v.screenpos.x == data[2].v.screenpos.x &&
+		data[0].v.screenpos.y == data[1].v.screenpos.y &&
+		data[1].v.screenpos.x == data[3].v.screenpos.x &&
+		data[2].v.screenpos.y == data[3].v.screenpos.y) {
 		// Okay, this is in the shape of a rectangle, but what about texture?
 		if (!state.enableTextures)
 			return true;

-		if (data[0].texturecoords.x == data[2].texturecoords.x &&
-			data[0].texturecoords.y == data[1].texturecoords.y &&
-			data[1].texturecoords.x == data[3].texturecoords.x &&
-			data[2].texturecoords.y == data[3].texturecoords.y) {
+		if (data[0].v.texturecoords.x == data[2].v.texturecoords.x &&
+			data[0].v.texturecoords.y == data[1].v.texturecoords.y &&
+			data[1].v.texturecoords.x == data[3].v.texturecoords.x &&
+			data[2].v.texturecoords.y == data[3].v.texturecoords.y) {
 			// It's a rectangle!
 			return true;
 		}
@ -480,7 +483,7 @@ bool DetectRectangleFromStrip(const RasterizerState &state, const VertexData dat
 	return false;
 }

-bool DetectRectangleFromFan(const RasterizerState &state, const VertexData *data, int c, int *tlIndex, int *brIndex) {
+bool DetectRectangleFromFan(const RasterizerState &state, const ClipVertexData *data, int c, int *tlIndex, int *brIndex) {
 	// Color and Z must be flat.
 	for (int i = 1; i < c; ++i) {
 		if (!AreCoordsRectangleCompatible(state, data[i], data[0]))
@ -489,8 +492,8 @@ bool DetectRectangleFromFan(const RasterizerState &state, const VertexData *data

 	// Check for the common case: a single TL-TR-BR-BL.
 	if (c == 4) {
-		const auto &pos0 = data[0].screenpos, &pos1 = data[1].screenpos;
-		const auto &pos2 = data[2].screenpos, &pos3 = data[3].screenpos;
+		const auto &pos0 = data[0].v.screenpos, &pos1 = data[1].v.screenpos;
+		const auto &pos2 = data[2].v.screenpos, &pos3 = data[3].v.screenpos;
 		if (pos0.x == pos3.x && pos1.x == pos2.x && pos0.y == pos1.y && pos3.y == pos2.y) {
 			// Looking like yes.  Set TL/BR based on y order first...
 			*tlIndex = pos0.y > pos3.y ? 2 : 0;
@ -505,13 +508,13 @@ bool DetectRectangleFromFan(const RasterizerState &state, const VertexData *data
 			if (!state.enableTextures)
 				return true;

-			const auto &textl = data[*tlIndex].texturecoords, &textr = data[*tlIndex ^ 1].texturecoords;
-			const auto &texbl = data[*brIndex ^ 1].texturecoords, &texbr = data[*brIndex].texturecoords;
+			const auto &textl = data[*tlIndex].v.texturecoords, &textr = data[*tlIndex ^ 1].v.texturecoords;
+			const auto &texbl = data[*brIndex ^ 1].v.texturecoords, &texbr = data[*brIndex].v.texturecoords;

 			if (textl.x == texbl.x && textr.x == texbr.x && textl.y == textr.y && texbl.y == texbr.y) {
 				// Okay, the texture is also good, but let's avoid rotation issues.
-				const auto &postl = data[*tlIndex].screenpos;
-				const auto &posbr = data[*brIndex].screenpos;
+				const auto &postl = data[*tlIndex].v.screenpos;
+				const auto &posbr = data[*brIndex].v.screenpos;
 				return textl.y < texbr.y && postl.y < posbr.y && textl.x < texbr.x && postl.x < posbr.x;
 			}
 		}
@ -520,26 +523,26 @@ bool DetectRectangleFromFan(const RasterizerState &state, const VertexData *data
 	return false;
 }

-bool DetectRectangleFromPair(const RasterizerState &state, const VertexData data[6], int *tlIndex, int *brIndex) {
+bool DetectRectangleFromPair(const RasterizerState &state, const ClipVertexData data[6], int *tlIndex, int *brIndex) {
 	// Color and Z must be flat.  Also find the TL and BR meanwhile.
 	int tl = 0, br = 0;
 	for (int i = 1; i < 6; ++i) {
 		if (!AreCoordsRectangleCompatible(state, data[i], data[0]))
 			return false;

-		if (data[i].screenpos.x <= data[tl].screenpos.x && data[i].screenpos.y <= data[tl].screenpos.y)
+		if (data[i].v.screenpos.x <= data[tl].v.screenpos.x && data[i].v.screenpos.y <= data[tl].v.screenpos.y)
 			tl = i;
-		if (data[i].screenpos.x >= data[br].screenpos.x && data[i].screenpos.y >= data[br].screenpos.y)
+		if (data[i].v.screenpos.x >= data[br].v.screenpos.x && data[i].v.screenpos.y >= data[br].v.screenpos.y)
 			br = i;
 	}

 	*tlIndex = tl;
 	*brIndex = br;

-	auto xat = [&](int i) { return data[i].screenpos.x; };
-	auto yat = [&](int i) { return data[i].screenpos.y; };
-	auto uat = [&](int i) { return data[i].texturecoords.x; };
-	auto vat = [&](int i) { return data[i].texturecoords.y; };
+	auto xat = [&](int i) { return data[i].v.screenpos.x; };
+	auto yat = [&](int i) { return data[i].v.screenpos.y; };
+	auto uat = [&](int i) { return data[i].v.texturecoords.x; };
+	auto vat = [&](int i) { return data[i].v.texturecoords.y; };

 	// A likely order would be: TL, TR, BR, TL, BR, BL.  We'd have the last index of each.
 	// TODO: Make more generic.
@ -567,12 +570,12 @@ bool DetectRectangleFromPair(const RasterizerState &state, const VertexData data
 	return false;
 }

-bool DetectRectangleThroughModeSlices(const RasterizerState &state, const VertexData data[4]) {
+bool DetectRectangleThroughModeSlices(const RasterizerState &state, const ClipVertexData data[4]) {
 	// Color and Z must be flat.
 	for (int i = 1; i < 4; ++i) {
-		if (!(data[i].color0 == data[0].color0))
+		if (!(data[i].v.color0 == data[0].v.color0))
 			return false;
-		if (!(data[i].screenpos.z == data[0].screenpos.z)) {
+		if (!(data[i].v.screenpos.z == data[0].v.screenpos.z)) {
 			// Sometimes, we don't actually care about z.
 			if (state.pixelID.depthWrite || state.pixelID.DepthTestFunc() != GE_COMP_ALWAYS)
 				return false;
@ -580,15 +583,15 @@ bool DetectRectangleThroughModeSlices(const RasterizerState &state, const Vertex
 	}

 	// Games very commonly use vertical strips of rectangles.  Detect and combine.
-	const auto &tl1 = data[0].screenpos, &br1 = data[1].screenpos;
-	const auto &tl2 = data[2].screenpos, &br2 = data[3].screenpos;
+	const auto &tl1 = data[0].v.screenpos, &br1 = data[1].v.screenpos;
+	const auto &tl2 = data[2].v.screenpos, &br2 = data[3].v.screenpos;
 	if (tl1.y == tl2.y && br1.y == br2.y && br1.y > tl1.y) {
 		if (br1.x == tl2.x && tl1.x < br1.x && tl2.x < br2.x) {
 			if (!state.enableTextures)
 				return true;

-			const auto &textl1 = data[0].texturecoords, &texbr1 = data[1].texturecoords;
-			const auto &textl2 = data[2].texturecoords, &texbr2 = data[3].texturecoords;
+			const auto &textl1 = data[0].v.texturecoords, &texbr1 = data[1].v.texturecoords;
+			const auto &textl2 = data[2].v.texturecoords, &texbr2 = data[3].v.texturecoords;
 			if (textl1.y != textl2.y || texbr1.y != texbr2.y || textl1.y > texbr1.y)
 				return false;
 			if (texbr1.x != textl2.x || textl1.x > texbr1.x || textl2.x > texbr2.x)
--- a/GPU/Software/RasterizerRectangle.h
+++ b/GPU/Software/RasterizerRectangle.h
@ -20,8 +20,8 @@ namespace Rasterizer {
 	bool RectangleFastPath(const VertexData &v0, const VertexData &v1, BinManager &binner);
 	void DrawSprite(const VertexData &v0, const VertexData &v1, const BinCoords &range, const RasterizerState &state);

-	bool DetectRectangleFromStrip(const RasterizerState &state, const VertexData data[4], int *tlIndex, int *brIndex);
-	bool DetectRectangleFromFan(const RasterizerState &state, const VertexData *data, int c, int *tlIndex, int *brIndex);
-	bool DetectRectangleFromPair(const RasterizerState &state, const VertexData data[6], int *tlIndex, int *brIndex);
-	bool DetectRectangleThroughModeSlices(const RasterizerState &state, const VertexData data[4]);
+	bool DetectRectangleFromStrip(const RasterizerState &state, const ClipVertexData data[4], int *tlIndex, int *brIndex);
+	bool DetectRectangleFromFan(const RasterizerState &state, const ClipVertexData *data, int c, int *tlIndex, int *brIndex);
+	bool DetectRectangleFromPair(const RasterizerState &state, const ClipVertexData data[6], int *tlIndex, int *brIndex);
+	bool DetectRectangleThroughModeSlices(const RasterizerState &state, const ClipVertexData data[4]);
 }
--- a/GPU/Software/SoftGpu.cpp
+++ b/GPU/Software/SoftGpu.cpp
@ -160,7 +160,7 @@ const SoftwareCommandTableEntry softgpuCommandTable[] = {
 	{ GE_CMD_LOGICOP, 0, SoftDirty::PIXEL_BASIC | SoftDirty::PIXEL_CACHED },
 	{ GE_CMD_LOGICOPENABLE, 0, SoftDirty::PIXEL_BASIC | SoftDirty::PIXEL_CACHED },

-	{ GE_CMD_TEXMAPMODE, 0, SoftDirty::TRANSFORM_BASIC },
+	{ GE_CMD_TEXMAPMODE, 0, SoftDirty::TRANSFORM_BASIC | SoftDirty::RAST_TEX },

 	// These are read on every SubmitPrim, no need for dirtying or flushing.
 	{ GE_CMD_TEXSCALEU },
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@ -102,22 +102,24 @@ void SoftwareDrawEngine::DispatchSubmitImm(GEPrimitiveType prim, TransformedVert
 		transformUnit.SubmitPrimitive(nullptr, nullptr, prim, 0, vertTypeID, nullptr, this);

 	for (int i = 0; i < vertexCount; i++) {
-		VertexData vert;
+		ClipVertexData vert;
 		vert.clippos = ClipCoords(buffer[i].pos);
-		vert.texturecoords.x = buffer[i].u;
-		vert.texturecoords.y = buffer[i].v;
+		vert.v.texturecoords.x = buffer[i].u;
+		vert.v.texturecoords.y = buffer[i].v;
+		vert.v.texturecoords.z = buffer[i].uv_w;
 		if (gstate.isModeThrough()) {
-			vert.texturecoords.x *= gstate.getTextureWidth(0);
-			vert.texturecoords.y *= gstate.getTextureHeight(0);
+			vert.v.texturecoords.x *= gstate.getTextureWidth(0);
+			vert.v.texturecoords.y *= gstate.getTextureHeight(0);
 		} else {
 			vert.clippos.z *= 1.0f / 65535.0f;
 		}
-		vert.color0 = buffer[i].color0_32;
-		vert.color1 = gstate.isUsingSecondaryColor() && !gstate.isModeThrough() ? buffer[i].color1_32 : 0;
-		vert.fogdepth = buffer[i].fog;
-		vert.screenpos.x = (int)(buffer[i].x * 16.0f);
-		vert.screenpos.y = (int)(buffer[i].y * 16.0f);
-		vert.screenpos.z = (u16)(u32)buffer[i].z;
+		vert.v.clipw = buffer[i].pos_w;
+		vert.v.color0 = buffer[i].color0_32;
+		vert.v.color1 = gstate.isUsingSecondaryColor() && !gstate.isModeThrough() ? buffer[i].color1_32 : 0;
+		vert.v.fogdepth = buffer[i].fog;
+		vert.v.screenpos.x = (int)(buffer[i].x * 16.0f);
+		vert.v.screenpos.y = (int)(buffer[i].y * 16.0f);
+		vert.v.screenpos.z = (u16)(u32)buffer[i].z;

 		transformUnit.SubmitImmVertex(vert, this);
 	}
@ -259,6 +261,8 @@ void ComputeTransformState(TransformState *state, const VertexReader &vreader) {
 	state->negateNormals = gstate.areNormalsReversed();

 	state->uvGenMode = gstate.getUVGenMode();
+	if (state->uvGenMode == GE_TEXMAP_UNKNOWN)
+		state->uvGenMode = GE_TEXMAP_TEXTURE_COORDS;

 	if (state->enableTransform) {
 		bool canSkipWorldPos = true;
@ -315,26 +319,34 @@ void ComputeTransformState(TransformState *state, const VertexReader &vreader) {
 		state->roundToScreen = &ClipToScreenInternal<false, true>;
 }

-VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState &state) {
+ClipVertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState &state) {
 	PROFILE_THIS_SCOPE("read_vert");
-	VertexData vertex;
+	// If we ever thread this, we'll have to change this.
+	ClipVertexData vertex;

 	ModelCoords pos;
 	// VertexDecoder normally scales z, but we want it unscaled.
 	vreader.ReadPosThroughZ16(pos.AsArray());

+	static Vec3Packedf lastTC;
 	if (state.readUV) {
-		vreader.ReadUV(vertex.texturecoords.AsArray());
+		vreader.ReadUV(vertex.v.texturecoords.AsArray());
+		vertex.v.texturecoords.q() = 0.0f;
+		lastTC = vertex.v.texturecoords;
 	} else {
-		vertex.texturecoords.SetZero();
+		vertex.v.texturecoords = lastTC;
 	}

-	Vec3<float> normal;
+	Vec3f normal;
+	static Vec3f lastnormal;
 	if (vreader.hasNormal()) {
 		vreader.ReadNrm(normal.AsArray());
+		lastnormal = normal;

 		if (state.negateNormals)
 			normal = -normal;
+	} else {
+		normal = lastnormal;
 	}

 	if (state.readWeights) {
@ -359,12 +371,12 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState
 	}

 	if (vreader.hasColor0()) {
-		vreader.ReadColor0_8888((u8 *)&vertex.color0);
+		vreader.ReadColor0_8888((u8 *)&vertex.v.color0);
 	} else {
-		vertex.color0 = gstate.getMaterialAmbientRGBA();
+		vertex.v.color0 = gstate.getMaterialAmbientRGBA();
 	}

-	vertex.color1 = 0;
+	vertex.v.color1 = 0;

 	if (state.enableTransform) {
 		WorldCoords worldpos;
@ -389,18 +401,19 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState
 		screenScaled = vertex.clippos.xyz() * state.screenScale / vertex.clippos.w + state.screenAdd;
 #endif
 		bool outside_range_flag = false;
-		vertex.screenpos = state.roundToScreen(screenScaled, vertex.clippos, &outside_range_flag);
+		vertex.v.screenpos = state.roundToScreen(screenScaled, vertex.clippos, &outside_range_flag);
 		if (outside_range_flag) {
 			// We use this, essentially, as the flag.
-			vertex.screenpos.x = 0x7FFFFFFF;
+			vertex.v.screenpos.x = 0x7FFFFFFF;
 			return vertex;
 		}

 		if (state.enableFog) {
-			vertex.fogdepth = Dot(state.posToFog, Vec4f(pos, 1.0f));
+			vertex.v.fogdepth = Dot(state.posToFog, Vec4f(pos, 1.0f));
 		} else {
-			vertex.fogdepth = 1.0f;
+			vertex.v.fogdepth = 1.0f;
 		}
+		vertex.v.clipw = vertex.clippos.w;

 		Vec3<float> worldnormal;
 		if (vreader.hasNormal()) {
@ -419,40 +432,35 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState
 				break;

 			case GE_PROJMAP_UV:
-				source = Vec3f(vertex.texturecoords, 0.0f);
+				source = Vec3f(vertex.v.texturecoords.uv(), 0.0f);
 				break;

 			case GE_PROJMAP_NORMALIZED_NORMAL:
-				source = normal.NormalizedOr001(cpu_info.bSSE4_1);
+				// This does not use 0, 0, 1 if length is zero.
+				source = normal.Normalized(cpu_info.bSSE4_1);
 				break;

 			case GE_PROJMAP_NORMAL:
 				source = normal;
 				break;
-
-			default:
-				source = Vec3f::AssignToAll(0.0f);
-				ERROR_LOG_REPORT(G3D, "Software: Unsupported UV projection mode %x", gstate.getUVProjMode());
-				break;
 			}

-			// TODO: What about uv scale and offset?
+			// Note that UV scale/offset are not used in this mode.
 			Vec3<float> stq = Vec3ByMatrix43(source, gstate.tgenMatrix);
-			float z_recip = 1.0f / stq.z;
-			vertex.texturecoords = Vec2f(stq.x * z_recip, stq.y * z_recip);
+			vertex.v.texturecoords = Vec3Packedf(stq.x, stq.y, stq.z);
 		} else if (state.uvGenMode == GE_TEXMAP_ENVIRONMENT_MAP) {
-			Lighting::GenerateLightST(vertex, worldnormal);
+			Lighting::GenerateLightST(vertex.v, worldnormal);
 		}

 		PROFILE_THIS_SCOPE("light");
 		if (state.enableLighting)
-			Lighting::Process(vertex, worldpos, worldnormal, state.lightingState);
+			Lighting::Process(vertex.v, worldpos, worldnormal, state.lightingState);
 	} else {
-		vertex.screenpos.x = (int)(pos[0] * SCREEN_SCALE_FACTOR);
-		vertex.screenpos.y = (int)(pos[1] * SCREEN_SCALE_FACTOR);
-		vertex.screenpos.z = pos[2];
-		vertex.clippos.w = 1.f;
-		vertex.fogdepth = 1.f;
+		vertex.v.screenpos.x = (int)(pos[0] * SCREEN_SCALE_FACTOR);
+		vertex.v.screenpos.y = (int)(pos[1] * SCREEN_SCALE_FACTOR);
+		vertex.v.screenpos.z = pos[2];
+		vertex.v.clipw = 1.0f;
+		vertex.v.fogdepth = 1.0f;
 	}

 	return vertex;
@ -503,7 +511,7 @@ public:
 		}
 	}

-	inline VertexData Read(int vtx) {
+	inline ClipVertexData Read(int vtx) {
 		if (useIndices_) {
 			if (useCache_) {
 				return cached_[conv_(vtx) - lowerBound_];
@ -523,13 +531,13 @@ protected:
 	TransformUnit &transform_;
 	uint16_t lowerBound_;
 	uint16_t upperBound_;
-	static std::vector<VertexData> cached_;
+	static std::vector<ClipVertexData> cached_;
 	bool useIndices_ = false;
 	bool useCache_ = false;
 };

 // Static to reduce allocations mid-frame.
-std::vector<VertexData> SoftwareVertexReader::cached_;
+std::vector<ClipVertexData> SoftwareVertexReader::cached_;

 void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, GEPrimitiveType prim_type, int vertex_count, u32 vertex_type, int *bytesRead, SoftwareDrawEngine *drawEngine)
 {
@ -572,7 +580,7 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
 	if (vreader.IsThrough() && cullType == CullType::OFF && prim_type == GE_PRIM_TRIANGLES && data_index_ == 0 && vertex_count >= 6 && ((vertex_count) % 6) == 0) {
 		// Some games send rectangles as a series of regular triangles.
 		// We look for this, but only in throughmode.
-		VertexData buf[6];
+		ClipVertexData buf[6];
 		int buf_index = data_index_;
 		for (int i = 0; i < data_index_; ++i) {
 			buf[i] = data_[i];
@ -823,7 +831,7 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
 	}
 }

-void TransformUnit::SubmitImmVertex(const VertexData &vert, SoftwareDrawEngine *drawEngine) {
+void TransformUnit::SubmitImmVertex(const ClipVertexData &vert, SoftwareDrawEngine *drawEngine) {
 	// Where we put it is different for STRIP/FAN types.
 	switch (prev_prim_) {
 	case GE_PRIM_POINTS:
@ -864,7 +872,7 @@ void TransformUnit::SubmitImmVertex(const VertexData &vert, SoftwareDrawEngine *
 	isImmDraw_ = false;
 }

-void TransformUnit::SendTriangle(CullType cullType, const VertexData *verts, int provoking) {
+void TransformUnit::SendTriangle(CullType cullType, const ClipVertexData *verts, int provoking) {
 	if (cullType == CullType::OFF) {
 		Clipper::ProcessTriangle(verts[0], verts[1], verts[2], verts[provoking], *binner_);
 		Clipper::ProcessTriangle(verts[2], verts[1], verts[0], verts[provoking], *binner_);
--- a/GPU/Software/TransformUnit.h
+++ b/GPU/Software/TransformUnit.h
@ -78,28 +78,33 @@ struct DrawingCoords {
 	s16 y;
 };

-struct VertexData {
-	void Lerp(float t, const VertexData &a, const VertexData &b) {
+struct alignas(16) VertexData {
+	Vec3Packedf texturecoords;
+	float clipw;
+	uint32_t color0;
+	uint32_t color1;
+	ScreenCoords screenpos;
+	float fogdepth;
+};
+
+struct ClipVertexData {
+	void Lerp(float t, const ClipVertexData &a, const ClipVertexData &b) {
 		clippos = ::Lerp(a.clippos, b.clippos, t);
 		// Ignore screenpos because Lerp() is only used pre-calculation of screenpos.
-		texturecoords = ::Lerp(a.texturecoords, b.texturecoords, t);
-		fogdepth = ::Lerp(a.fogdepth, b.fogdepth, t);
+		v.texturecoords = ::Lerp(a.v.texturecoords, b.v.texturecoords, t);
+		v.fogdepth = ::Lerp(a.v.fogdepth, b.v.fogdepth, t);

 		u16 t_int = (u16)(t * 256);
-		color0 = LerpInt<Vec4<int>, 256>(Vec4<int>::FromRGBA(a.color0), Vec4<int>::FromRGBA(b.color0), t_int).ToRGBA();
-		color1 = LerpInt<Vec3<int>, 256>(Vec3<int>::FromRGB(a.color1), Vec3<int>::FromRGB(b.color1), t_int).ToRGB();
+		v.color0 = LerpInt<Vec4<int>, 256>(Vec4<int>::FromRGBA(a.v.color0), Vec4<int>::FromRGBA(b.v.color0), t_int).ToRGBA();
+		v.color1 = LerpInt<Vec3<int>, 256>(Vec3<int>::FromRGB(a.v.color1), Vec3<int>::FromRGB(b.v.color1), t_int).ToRGB();
 	}

 	bool OutsideRange() const {
-		return screenpos.x == 0x7FFFFFFF;
+		return v.screenpos.x == 0x7FFFFFFF;
 	}

 	ClipCoords clippos;
-	Vec2<float> texturecoords;
-	uint32_t color0;
-	uint32_t color1;
-	ScreenCoords screenpos; // TODO: Shouldn't store this ?
-	float fogdepth;
+	VertexData v;
 };

 class VertexReader;
@ -130,7 +135,7 @@ public:
 	static ScreenCoords DrawingToScreen(const DrawingCoords &coords, u16 z);

 	void SubmitPrimitive(const void* vertices, const void* indices, GEPrimitiveType prim_type, int vertex_count, u32 vertex_type, int *bytesRead, SoftwareDrawEngine *drawEngine);
-	void SubmitImmVertex(const VertexData &vert, SoftwareDrawEngine *drawEngine);
+	void SubmitImmVertex(const ClipVertexData &vert, SoftwareDrawEngine *drawEngine);

 	bool GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices);

@ -144,14 +149,14 @@ public:
 	SoftDirty GetDirty();

 private:
-	VertexData ReadVertex(VertexReader &vreader, const TransformState &state);
-	void SendTriangle(CullType cullType, const VertexData *verts, int provoking = 2);
+	ClipVertexData ReadVertex(VertexReader &vreader, const TransformState &state);
+	void SendTriangle(CullType cullType, const ClipVertexData *verts, int provoking = 2);

 	u8 *decoded_ = nullptr;
 	BinManager *binner_ = nullptr;

 	// Normally max verts per prim is 3, but we temporarily need 4 to detect rectangles from strips.
-	VertexData data_[4];
+	ClipVertexData data_[4];
 	// This is the index of the next vert in data (or higher, may need modulus.)
 	int data_index_ = 0;
 	GEPrimitiveType prev_prim_ = GE_PRIM_POINTS;
--- a/assets/compat.ini
+++ b/assets/compat.ini
@ -1283,6 +1283,27 @@ ULJM05494 = true
 NPJH50143 = true
 ULJM05738 = true

+[NearestFilteringOnFramebufferCreate]
+# Ridge Racer speedometer dynamic CLUT problem - they rely on some palette entries
+# from memory, and render to the rest of the palette. The palette entries loaded from memory
+# must not be blurred by filtering, so nearest it is. See issue #8509
+
+# Ridge Racer
+ULJS00001 = true
+ULUS10001 = true
+UCKS45002 = true
+UCES00002 = true
+ULJS19002 = true
+UCKS45053 = true
+NPJH50140 = true
+
+# Ridge Racer 2
+ULJS00080 = true
+UCKS45032 = true
+UCES00422 = true
+UCAS40273 = true
+NPJH50366 = true
+
 [AllowDownloadCLUT]
 # Temporary compatibility option, while working on the GPU CLUT-from-framebuffer path.
 # Not required for any games now that it works, but might be useful for development.