Merge branch 'master' into compat_openxr_gta

2025-01-27 16:24:54 +00:00 · 2022-09-23 14:16:58 +02:00 · 2022-09-23 14:16:58 +02:00 · adffbb2ea7
commit adffbb2ea7
parent 209b5b3079 d79828270a
36 changed files with 402 additions and 261 deletions
--- a/Common/GPU/Vulkan/VulkanQueueRunner.cpp
+++ b/Common/GPU/Vulkan/VulkanQueueRunner.cpp
@ -30,16 +30,14 @@ static void MergeRenderAreaRectInto(VkRect2D *dest, VkRect2D &src) {
 // We need to take the "max" of the features used in the two render passes.
 RenderPassType MergeRPTypes(RenderPassType a, RenderPassType b) {
 	// Either both are backbuffer type, or neither are.
-	_dbg_assert_((a == RP_TYPE_BACKBUFFER) == (b == RP_TYPE_BACKBUFFER));
-	if (a == b) {
-		// Trivial merging case.
+	// The backbuffer type can't merge with other render pass types.
+	if (a == RP_TYPE_BACKBUFFER || b == RP_TYPE_BACKBUFFER) {
+		_dbg_assert_(a == b);
 		return a;
-	} else if (a == RP_TYPE_COLOR_DEPTH && b == RP_TYPE_COLOR_DEPTH_INPUT) {
-		return RP_TYPE_COLOR_DEPTH_INPUT;
-	} else if (a == RP_TYPE_COLOR_DEPTH_INPUT && b == RP_TYPE_COLOR_DEPTH) {
-		return RP_TYPE_COLOR_DEPTH_INPUT;
 	}
-	return a;
+
+	// The rest we can just OR together to get the maximum feature set.
+	return (RenderPassType)((u32)a | (u32)b);
 }

 void VulkanQueueRunner::CreateDeviceObjects() {
@ -326,29 +324,33 @@ static VkAttachmentStoreOp ConvertStoreAction(VKRRenderPassStoreAction action) {
 // Self-dependency: https://github.com/gpuweb/gpuweb/issues/442#issuecomment-547604827
 // Also see https://www.khronos.org/registry/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-pipeline-barriers-subpass-self-dependencies

-VkRenderPass CreateRP(VulkanContext *vulkan, const RPKey &key, RenderPassType rpType) {
-	bool selfDependency = rpType == RP_TYPE_COLOR_DEPTH_INPUT;
+VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPassType rpType) {
+	bool selfDependency = rpType == RP_TYPE_COLOR_INPUT || rpType == RP_TYPE_COLOR_DEPTH_INPUT;
+	bool isBackbuffer = rpType == RP_TYPE_BACKBUFFER;
+	bool hasDepth = rpType == RP_TYPE_BACKBUFFER || rpType == RP_TYPE_COLOR_DEPTH || rpType == RP_TYPE_COLOR_DEPTH_INPUT;

 	VkAttachmentDescription attachments[2] = {};
-	attachments[0].format = rpType == RP_TYPE_BACKBUFFER ? vulkan->GetSwapchainFormat() : VK_FORMAT_R8G8B8A8_UNORM;
+	attachments[0].format = isBackbuffer ? vulkan->GetSwapchainFormat() : VK_FORMAT_R8G8B8A8_UNORM;
 	attachments[0].samples = VK_SAMPLE_COUNT_1_BIT;
 	attachments[0].loadOp = ConvertLoadAction(key.colorLoadAction);
 	attachments[0].storeOp = ConvertStoreAction(key.colorStoreAction);
 	attachments[0].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
 	attachments[0].stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE;
-	attachments[0].initialLayout = rpType == RP_TYPE_BACKBUFFER ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
-	attachments[0].finalLayout = rpType == RP_TYPE_BACKBUFFER ? VK_IMAGE_LAYOUT_PRESENT_SRC_KHR : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
+	attachments[0].initialLayout = isBackbuffer ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
+	attachments[0].finalLayout = isBackbuffer ? VK_IMAGE_LAYOUT_PRESENT_SRC_KHR : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
 	attachments[0].flags = 0;

-	attachments[1].format = vulkan->GetDeviceInfo().preferredDepthStencilFormat;
-	attachments[1].samples = VK_SAMPLE_COUNT_1_BIT;
-	attachments[1].loadOp = ConvertLoadAction(key.depthLoadAction);
-	attachments[1].storeOp = ConvertStoreAction(key.depthStoreAction);
-	attachments[1].stencilLoadOp = ConvertLoadAction(key.stencilLoadAction);
-	attachments[1].stencilStoreOp = ConvertStoreAction(key.stencilStoreAction);
-	attachments[1].initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
-	attachments[1].finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
-	attachments[1].flags = 0;
+	if (hasDepth) {
+		attachments[1].format = vulkan->GetDeviceInfo().preferredDepthStencilFormat;
+		attachments[1].samples = VK_SAMPLE_COUNT_1_BIT;
+		attachments[1].loadOp = ConvertLoadAction(key.depthLoadAction);
+		attachments[1].storeOp = ConvertStoreAction(key.depthStoreAction);
+		attachments[1].stencilLoadOp = ConvertLoadAction(key.stencilLoadAction);
+		attachments[1].stencilStoreOp = ConvertStoreAction(key.stencilStoreAction);
+		attachments[1].initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
+		attachments[1].finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
+		attachments[1].flags = 0;
+	}

 	VkAttachmentReference color_reference{};
 	color_reference.attachment = 0;
@ -371,7 +373,9 @@ VkRenderPass CreateRP(VulkanContext *vulkan, const RPKey &key, RenderPassType rp
 	subpass.colorAttachmentCount = 1;
 	subpass.pColorAttachments = &color_reference;
 	subpass.pResolveAttachments = nullptr;
-	subpass.pDepthStencilAttachment = &depth_reference;
+	if (hasDepth) {
+		subpass.pDepthStencilAttachment = &depth_reference;
+	}
 	subpass.preserveAttachmentCount = 0;
 	subpass.pPreserveAttachments = nullptr;

@ -380,12 +384,12 @@ VkRenderPass CreateRP(VulkanContext *vulkan, const RPKey &key, RenderPassType rp
 	size_t numDeps = 0;

 	VkRenderPassCreateInfo rp{ VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO };
-	rp.attachmentCount = 2;
+	rp.attachmentCount = hasDepth ? 2 : 1;
 	rp.pAttachments = attachments;
 	rp.subpassCount = 1;
 	rp.pSubpasses = &subpass;

-	if (rpType == RP_TYPE_BACKBUFFER) {
+	if (isBackbuffer) {
 		deps[numDeps].srcSubpass = VK_SUBPASS_EXTERNAL;
 		deps[numDeps].dstSubpass = 0;
 		deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
@ -393,7 +397,6 @@ VkRenderPass CreateRP(VulkanContext *vulkan, const RPKey &key, RenderPassType rp
 		deps[numDeps].srcAccessMask = 0;
 		deps[numDeps].dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
 		numDeps++;
-		rp.dependencyCount = 1;
 	}

 	if (selfDependency) {
@ -424,7 +427,7 @@ VkRenderPass VKRRenderPass::Get(VulkanContext *vulkan, RenderPassType rpType) {
 	// practical later when referring to it. Could change to on-demand if it feels motivated
 	// but I think the render pass objects are cheap.
 	if (!pass[(int)rpType]) {
-		pass[(int)rpType] = CreateRP(vulkan, key_, (RenderPassType)rpType);
+		pass[(int)rpType] = CreateRenderPass(vulkan, key_, (RenderPassType)rpType);
 	}
 	return pass[(int)rpType];
 }
@ -873,8 +876,10 @@ std::string VulkanQueueRunner::StepToString(const VKRStep &step) const {
 		const char *renderCmd;
 		switch (step.render.renderPassType) {
 		case RP_TYPE_BACKBUFFER: renderCmd = "BACKBUF"; break;
-		case RP_TYPE_COLOR_DEPTH: renderCmd = "RENDER"; break;
-		case RP_TYPE_COLOR_DEPTH_INPUT: renderCmd = "RENDER_INPUT"; break;
+		case RP_TYPE_COLOR: renderCmd = "RENDER"; break;
+		case RP_TYPE_COLOR_DEPTH: renderCmd = "RENDER_DEPTH"; break;
+		case RP_TYPE_COLOR_INPUT: renderCmd = "RENDER_INPUT"; break;
+		case RP_TYPE_COLOR_DEPTH_INPUT: renderCmd = "RENDER_DEPTH_INPUT"; break;
 		default: renderCmd = "N/A";
 		}
 		snprintf(buffer, sizeof(buffer), "%s %s (draws: %d, %dx%d/%dx%d, fb: %p, )", renderCmd, step.tag, step.render.numDraws, actual_w, actual_h, w, h, step.render.framebuffer);
@ -1153,7 +1158,7 @@ void TransitionToOptimal(VkCommandBuffer cmd, VkImage colorImage, VkImageLayout
 			srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
 			break;
 		default:
-			_dbg_assert_msg_(false, "GetRenderPass: Unexpected color layout %d", (int)colorLayout);
+			_dbg_assert_msg_(false, "TransitionToOptimal: Unexpected color layout %d", (int)colorLayout);
 			break;
 		}
 		recordBarrier->TransitionImage(
@ -1189,7 +1194,7 @@ void TransitionToOptimal(VkCommandBuffer cmd, VkImage colorImage, VkImageLayout
 			srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
 			break;
 		default:
-			_dbg_assert_msg_(false, "GetRenderPass: Unexpected depth layout %d", (int)depthStencilLayout);
+			_dbg_assert_msg_(false, "TransitionToOptimal: Unexpected depth layout %d", (int)depthStencilLayout);
 			break;
 		}
 		recordBarrier->TransitionImage(
@ -1236,7 +1241,7 @@ void TransitionFromOptimal(VkCommandBuffer cmd, VkImage colorImage, VkImageLayou
 			// Nothing to do.
 			break;
 		default:
-			_dbg_assert_msg_(false, "GetRenderPass: Unexpected final color layout %d", (int)colorLayout);
+			_dbg_assert_msg_(false, "TransitionFromOptimal: Unexpected final color layout %d", (int)colorLayout);
 			break;
 		}
 		barrier[0].oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
@ -1275,7 +1280,7 @@ void TransitionFromOptimal(VkCommandBuffer cmd, VkImage colorImage, VkImageLayou
 			// Nothing to do.
 			break;
 		default:
-			_dbg_assert_msg_(false, "GetRenderPass: Unexpected final depth layout %d", (int)depthStencilLayout);
+			_dbg_assert_msg_(false, "TransitionFromOptimal: Unexpected final depth layout %d", (int)depthStencilLayout);
 			break;
 		}
 		barrier[barrierCount].oldLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
--- a/Common/GPU/Vulkan/VulkanQueueRunner.h
+++ b/Common/GPU/Vulkan/VulkanQueueRunner.h
@ -43,18 +43,24 @@ enum class VKRRenderCommand : uint8_t {

 enum class PipelineFlags {
 	NONE = 0,
-	USES_LINES = (1 << 2),
 	USES_BLEND_CONSTANT = (1 << 3),
-	USES_DEPTH_STENCIL = (1 << 4),  // Reads or writes the depth buffer.
+	USES_DEPTH_STENCIL = (1 << 4),  // Reads or writes the depth or stencil buffers.
 	USES_INPUT_ATTACHMENT = (1 << 5),
 };
 ENUM_CLASS_BITOPS(PipelineFlags);

 // Pipelines need to be created for the right type of render pass.
 enum RenderPassType {
-	RP_TYPE_BACKBUFFER,
+	// These four are organized so that bit 0 is DEPTH and bit 1 is INPUT, so
+	// they can be OR-ed together in MergeRPTypes.
+	RP_TYPE_COLOR,
 	RP_TYPE_COLOR_DEPTH,
+	RP_TYPE_COLOR_INPUT,
 	RP_TYPE_COLOR_DEPTH_INPUT,
+
+	// This is the odd one out, and gets special handling in MergeRPTypes.
+	RP_TYPE_BACKBUFFER,  // For the backbuffer we can always use CLEAR/DONT_CARE, so bandwidth cost for a depth channel is negligible.
+
 	// Number of render pass types. Pure-color passes are RP_TYPE_COLOR and RP_TYPE_COLOR_INPUT above.
 	RP_TYPE_COUNT,
 };
--- a/Common/GPU/Vulkan/VulkanRenderManager.cpp
+++ b/Common/GPU/Vulkan/VulkanRenderManager.cpp
@ -158,33 +158,37 @@ VKRFramebuffer::VKRFramebuffer(VulkanContext *vk, VkCommandBuffer initCmd, VKRRe
 	// We create the actual framebuffer objects on demand, because some combinations might not make sense.
 }

-VkFramebuffer VKRFramebuffer::Get(VKRRenderPass *compatibleRenderPass, RenderPassType renderPassType) {
-	if (framebuf[(int)renderPassType]) {
-		return framebuf[(int)renderPassType];
+VkFramebuffer VKRFramebuffer::Get(VKRRenderPass *compatibleRenderPass, RenderPassType rpType) {
+	if (framebuf[(int)rpType]) {
+		return framebuf[(int)rpType];
 	}

 	VkFramebufferCreateInfo fbci{ VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO };
 	VkImageView views[2]{};

-	fbci.renderPass = compatibleRenderPass->Get(vulkan_, renderPassType);
-	fbci.attachmentCount = 2;
-	fbci.pAttachments = views;
+	bool hasDepth = rpType == RP_TYPE_BACKBUFFER || rpType == RP_TYPE_COLOR_DEPTH || rpType == RP_TYPE_COLOR_DEPTH_INPUT;
+
 	views[0] = color.imageView;
-	views[1] = depth.imageView;
+	if (hasDepth) {
+		views[1] = depth.imageView;
+	}
+	fbci.renderPass = compatibleRenderPass->Get(vulkan_, rpType);
+	fbci.attachmentCount = hasDepth ? 2 : 1;
+	fbci.pAttachments = views;
 	fbci.width = width;
 	fbci.height = height;
 	fbci.layers = 1;

-	VkResult res = vkCreateFramebuffer(vulkan_->GetDevice(), &fbci, nullptr, &framebuf[(int)renderPassType]);
+	VkResult res = vkCreateFramebuffer(vulkan_->GetDevice(), &fbci, nullptr, &framebuf[(int)rpType]);
 	_assert_(res == VK_SUCCESS);

 	if (!tag_.empty() && vulkan_->Extensions().EXT_debug_utils) {
 		vulkan_->SetDebugName(color.image, VK_OBJECT_TYPE_IMAGE, StringFromFormat("fb_color_%s", tag_.c_str()).c_str());
 		vulkan_->SetDebugName(depth.image, VK_OBJECT_TYPE_IMAGE, StringFromFormat("fb_depth_%s", tag_.c_str()).c_str());
-		vulkan_->SetDebugName(framebuf[(int)renderPassType], VK_OBJECT_TYPE_FRAMEBUFFER, StringFromFormat("fb_%s", tag_.c_str()).c_str());
+		vulkan_->SetDebugName(framebuf[(int)rpType], VK_OBJECT_TYPE_FRAMEBUFFER, StringFromFormat("fb_%s", tag_.c_str()).c_str());
 	}

-	return framebuf[(int)renderPassType];
+	return framebuf[(int)rpType];
 }

 VKRFramebuffer::~VKRFramebuffer() {
@ -656,15 +660,16 @@ void VulkanRenderManager::EndCurRenderStep() {
 		curRenderStep_->render.colorLoad, curRenderStep_->render.depthLoad, curRenderStep_->render.stencilLoad,
 		curRenderStep_->render.colorStore, curRenderStep_->render.depthStore, curRenderStep_->render.stencilStore,
 	};
-	RenderPassType rpType = RP_TYPE_COLOR_DEPTH;
 	// Save the accumulated pipeline flags so we can use that to configure the render pass.
 	// We'll often be able to avoid loading/saving the depth/stencil buffer.
 	curRenderStep_->render.pipelineFlags = curPipelineFlags_;
+	bool depthStencil = (curPipelineFlags_ & PipelineFlags::USES_DEPTH_STENCIL) != 0;
+	RenderPassType rpType = depthStencil ? RP_TYPE_COLOR_DEPTH : RP_TYPE_COLOR;
 	if (!curRenderStep_->render.framebuffer) {
 		rpType = RP_TYPE_BACKBUFFER;
 	} else if (curPipelineFlags_ & PipelineFlags::USES_INPUT_ATTACHMENT) {
 		// Not allowed on backbuffers.
-		rpType = RP_TYPE_COLOR_DEPTH_INPUT;
+		rpType = depthStencil ? RP_TYPE_COLOR_DEPTH_INPUT : RP_TYPE_COLOR_INPUT;
 	}
 	// For non-backbuffer targets, the depth/stencil-less types (RP_TYPE_COLOR, RP_TYPE_COLOR_INPUT) are chosen above when USES_DEPTH_STENCIL is not set.

@ -714,9 +719,11 @@ void VulkanRenderManager::BindFramebufferAsRenderTarget(VKRFramebuffer *fb, VKRR
 		}
 		if (depth == VKRRenderPassLoadAction::CLEAR) {
 			clearMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
+			curPipelineFlags_ |= PipelineFlags::USES_DEPTH_STENCIL;
 		}
 		if (stencil == VKRRenderPassLoadAction::CLEAR) {
 			clearMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
+			curPipelineFlags_ |= PipelineFlags::USES_DEPTH_STENCIL;
 		}

 		// If we need a clear and the previous step has commands already, it's best to just add a clear and keep going.
@ -997,6 +1004,10 @@ void VulkanRenderManager::Clear(uint32_t clearColor, float clearZ, int clearSten
 		curRenderStep_->render.depthLoad = (clearMask & VK_IMAGE_ASPECT_DEPTH_BIT) ? VKRRenderPassLoadAction::CLEAR : VKRRenderPassLoadAction::KEEP;
 		curRenderStep_->render.stencilLoad = (clearMask & VK_IMAGE_ASPECT_STENCIL_BIT) ? VKRRenderPassLoadAction::CLEAR : VKRRenderPassLoadAction::KEEP;

+		if (clearMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
+			curPipelineFlags_ |= PipelineFlags::USES_DEPTH_STENCIL;
+		}
+
 		// In case there were commands already.
 		curRenderStep_->render.numDraws = 0;
 		RemoveDrawCommands(&curRenderStep_->commands);
@ -1269,7 +1280,10 @@ void VulkanRenderManager::Run(int frame) {
 	BeginSubmitFrame(frame);

 	FrameData &frameData = frameData_[frame];
-	queueRunner_.PreprocessSteps(frameData_[frame].steps);
+	queueRunner_.PreprocessSteps(frameData.steps);
+	// Likely during shutdown, happens in headless.
+	if (frameData.steps.empty() && !frameData.hasAcquired)
+		frameData.skipSwap = true;
 	//queueRunner_.LogSteps(stepsOnThread, false);
 	queueRunner_.RunSteps(frameData, frameDataShared_);

--- a/Common/GPU/Vulkan/thin3d_vulkan.cpp
+++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@ -1056,6 +1056,7 @@ Pipeline *VKContext::CreateGraphicsPipeline(const PipelineDesc &desc, const char
 	if (depth->info.depthTestEnable || depth->info.stencilTestEnable) {
 		pipelineFlags |= PipelineFlags::USES_DEPTH_STENCIL;
 	}
+	// TODO: We need code to set USES_BLEND_CONSTANT here too, if we're ever gonna use those in thin3d code.

 	VKPipeline *pipeline = new VKPipeline(vulkan_, desc.uniformDesc ? desc.uniformDesc->uniformBufferSize : 16 * sizeof(float), pipelineFlags, tag);

--- a/Common/VR/PPSSPPVR.cpp
+++ b/Common/VR/PPSSPPVR.cpp
@ -294,7 +294,8 @@ bool StartVRRender() {

 		// Decide if the scene is 3D or not
 		if (g_Config.bEnableVR && !VR_GetConfig(VR_CONFIG_FORCE_2D) && (VR_GetConfig(VR_CONFIG_3D_GEOMETRY_COUNT) > 15)) {
-			VR_SetConfig(VR_CONFIG_MODE, g_Config.bEnableStereo ? VR_MODE_STEREO_6DOF : VR_MODE_MONO_6DOF);
+			bool stereo = VR_GetConfig(VR_CONFIG_6DOF_PRECISE) && g_Config.bEnableStereo;
+			VR_SetConfig(VR_CONFIG_MODE, stereo ? VR_MODE_STEREO_6DOF : VR_MODE_MONO_6DOF);
 		} else {
 			VR_SetConfig(VR_CONFIG_MODE, VR_MODE_FLAT_SCREEN);
 		}
--- a/Common/VR/VRRenderer.cpp
+++ b/Common/VR/VRRenderer.cpp
@ -359,13 +359,17 @@ void VR_FinishFrame( engine_t* engine ) {
 		for (int eye = 0; eye < ovrMaxNumEyes; eye++) {
 			int imageLayer = engine->appState.Renderer.Multiview ? eye : 0;
 			ovrFramebuffer* frameBuffer = &engine->appState.Renderer.FrameBuffer[0];
-			if ((vrMode != VR_MODE_MONO_6DOF) && !engine->appState.Renderer.Multiview) {
-				frameBuffer = &engine->appState.Renderer.FrameBuffer[eye];
+			XrPosef pose = invViewTransform[0];
+			if (vrMode != VR_MODE_MONO_6DOF) {
+				if (!engine->appState.Renderer.Multiview) {
+					frameBuffer = &engine->appState.Renderer.FrameBuffer[eye];
+				}
+				pose = invViewTransform[eye];
 			}

 			memset(&projection_layer_elements[eye], 0, sizeof(XrCompositionLayerProjectionView));
 			projection_layer_elements[eye].type = XR_TYPE_COMPOSITION_LAYER_PROJECTION_VIEW;
-			projection_layer_elements[eye].pose = invViewTransform[eye];
+			projection_layer_elements[eye].pose = pose;
 			projection_layer_elements[eye].fov = fov;

 			memset(&projection_layer_elements[eye].subImage, 0, sizeof(XrSwapchainSubImage));
@ -502,9 +506,16 @@ ovrMatrix4f VR_GetMatrix( VRMatrix matrix ) {
 			output.M[2][3] -= hmdposition.z * (vrConfig[VR_CONFIG_MIRROR_AXIS_Z] ? -1.0f : 1.0f) * scale;
 		}
 		if (vrConfig[VR_CONFIG_6DOF_PRECISE] && (matrix == VR_VIEW_MATRIX_RIGHT_EYE)) {
-			output.M[0][3] += (invViewTransform[1].position.x - invViewTransform[0].position.x) * scale;
-			output.M[1][3] += (invViewTransform[1].position.y - invViewTransform[0].position.y) * scale;
-			output.M[2][3] += (invViewTransform[1].position.z - invViewTransform[0].position.z) * scale;
+			float dx = fabs(invViewTransform[1].position.x - invViewTransform[0].position.x);
+			float dy = fabs(invViewTransform[1].position.y - invViewTransform[0].position.y);
+			float dz = fabs(invViewTransform[1].position.z - invViewTransform[0].position.z);
+			float ipd = sqrt(dx * dx + dy * dy + dz * dz);
+			XrVector3f separation = {ipd * scale, 0.0f, 0.0f};
+			separation = XrQuaternionf_Rotate(invView.orientation, separation);
+			separation = XrVector3f_ScalarMultiply(separation, vrConfig[VR_CONFIG_MIRROR_AXIS_Z] ? -1.0f : 1.0f);
+			output.M[0][3] -= separation.x;
+			output.M[1][3] -= separation.y;
+			output.M[2][3] -= separation.z;
 		}
 	} else {
 		assert(false);
--- a/Core/Compatibility.cpp
+++ b/Core/Compatibility.cpp
@ -109,7 +109,6 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) {
 	CheckSetting(iniFile, gameID, "SplitFramebufferMargin", &flags_.SplitFramebufferMargin);
 	CheckSetting(iniFile, gameID, "ForceLowerResolutionForEffectsOn", &flags_.ForceLowerResolutionForEffectsOn);
 	CheckSetting(iniFile, gameID, "AllowDownloadCLUT", &flags_.AllowDownloadCLUT);
-	CheckSetting(iniFile, gameID, "UploadDepthForCLUTTextures", &flags_.UploadDepthForCLUTTextures);
 }

 void Compatibility::CheckSetting(IniFile &iniFile, const std::string &gameID, const char *option, bool *flag) {
--- a/Core/Compatibility.h
+++ b/Core/Compatibility.h
@ -89,7 +89,6 @@ struct CompatFlags {
 	bool SplitFramebufferMargin;
 	bool ForceLowerResolutionForEffectsOn;
 	bool AllowDownloadCLUT;
-	bool UploadDepthForCLUTTextures;
 };

 struct VRCompat {
--- a/Core/HLE/sceKernelMemory.cpp
+++ b/Core/HLE/sceKernelMemory.cpp
@ -2174,68 +2174,89 @@ int sceKernelDeleteTlspl(SceUID uid)
 	return error;
 }

-int sceKernelGetTlsAddr(SceUID uid)
-{
-	// TODO: Allocate downward if PSP_TLSPL_ATTR_HIGHMEM?
-	DEBUG_LOG(SCEKERNEL, "sceKernelGetTlsAddr(%08x)", uid);
+struct FindTLSByIndexArg {
+	int index;
+	TLSPL *result = nullptr;
+};

+static bool FindTLSByIndex(TLSPL *possible, FindTLSByIndexArg *state) {
+	if (possible->ntls.index == state->index) {
+		state->result = possible;
+		return false;
+	}
+	return true;
+}
+
+int sceKernelGetTlsAddr(SceUID uid) {
 	if (!__KernelIsDispatchEnabled() || __IsInInterrupt())
-		return 0;
+		return hleLogWarning(SCEKERNEL, 0, "dispatch disabled");

 	u32 error;
 	TLSPL *tls = kernelObjects.Get<TLSPL>(uid, error);
-	if (tls)
-	{
-		SceUID threadID = __KernelGetCurThread();
-		int allocBlock = -1;
-		bool needsClear = false;
+	if (!tls) {
+		if (uid < 0)
+			return hleLogError(SCEKERNEL, 0, "tlspl not found");

-		// If the thread already has one, return it.
+		// There's this weird behavior where it looks up by index (taken from bits 3-6 of the uid) when the uid itself isn't found.  Maybe we shouldn't use uids...
+		if (!tlsplUsedIndexes[(uid >> 3) & 15])
+			return hleLogError(SCEKERNEL, 0, "tlspl not found");
+
+		FindTLSByIndexArg state;
+		state.index = (uid >> 3) & 15;
+		kernelObjects.Iterate<TLSPL>(&FindTLSByIndex, &state);
+		if (!state.result)
+			return hleLogError(SCEKERNEL, 0, "tlspl not found");
+
+		tls = state.result;
+	}
+
+	SceUID threadID = __KernelGetCurThread();
+	int allocBlock = -1;
+	bool needsClear = false;
+
+	// If the thread already has one, return it.
+	for (size_t i = 0; i < tls->ntls.totalBlocks && allocBlock == -1; ++i)
+	{
+		if (tls->usage[i] == threadID)
+			allocBlock = (int) i;
+	}
+
+	if (allocBlock == -1)
+	{
 		for (size_t i = 0; i < tls->ntls.totalBlocks && allocBlock == -1; ++i)
 		{
-			if (tls->usage[i] == threadID)
-				allocBlock = (int) i;
+			// The PSP doesn't give the same block out twice in a row, even if freed.
+			if (tls->usage[tls->next] == 0)
+				allocBlock = tls->next;
+			tls->next = (tls->next + 1) % tls->ntls.totalBlocks;
 		}

-		if (allocBlock == -1)
+		if (allocBlock != -1)
 		{
-			for (size_t i = 0; i < tls->ntls.totalBlocks && allocBlock == -1; ++i)
-			{
-				// The PSP doesn't give the same block out twice in a row, even if freed.
-				if (tls->usage[tls->next] == 0)
-					allocBlock = tls->next;
-				tls->next = (tls->next + 1) % tls->ntls.totalBlocks;
-			}
-
-			if (allocBlock != -1)
-			{
-				tls->usage[allocBlock] = threadID;
-				tlsplThreadEndChecks.insert(std::make_pair(threadID, uid));
-				--tls->ntls.freeBlocks;
-				needsClear = true;
-			}
+			tls->usage[allocBlock] = threadID;
+			tlsplThreadEndChecks.insert(std::make_pair(threadID, uid));
+			--tls->ntls.freeBlocks;
+			needsClear = true;
 		}
-
-		if (allocBlock == -1)
-		{
-			tls->waitingThreads.push_back(threadID);
-			__KernelWaitCurThread(WAITTYPE_TLSPL, uid, 1, 0, false, "allocate tls");
-			return 0;
-		}
-
-		u32 alignedSize = (tls->ntls.blockSize + tls->alignment - 1) & ~(tls->alignment - 1);
-		u32 allocAddress = tls->address + allocBlock * alignedSize;
-		NotifyMemInfo(MemBlockFlags::SUB_ALLOC, allocAddress, tls->ntls.blockSize, "TlsAddr");
-
-		// We clear the blocks upon first allocation (and also when they are freed, both are necessary.)
-		if (needsClear) {
-			Memory::Memset(allocAddress, 0, tls->ntls.blockSize, "TlsAddr");
-		}
-
-		return allocAddress;
 	}
-	else
-		return 0;
+
+	if (allocBlock == -1)
+	{
+		tls->waitingThreads.push_back(threadID);
+		__KernelWaitCurThread(WAITTYPE_TLSPL, uid, 1, 0, false, "allocate tls");
+		return hleLogDebug(SCEKERNEL, 0, "waiting for tls alloc");
+	}
+
+	u32 alignedSize = (tls->ntls.blockSize + tls->alignment - 1) & ~(tls->alignment - 1);
+	u32 allocAddress = tls->address + allocBlock * alignedSize;
+	NotifyMemInfo(MemBlockFlags::SUB_ALLOC, allocAddress, tls->ntls.blockSize, "TlsAddr");
+
+	// We clear the blocks upon first allocation, and also when they are freed; both are necessary.
+	if (needsClear) {
+		Memory::Memset(allocAddress, 0, tls->ntls.blockSize, "TlsAddr");
+	}
+
+	return hleLogDebug(SCEKERNEL, allocAddress);
 }

 // Parameters are an educated guess.
--- a/GPU/Common/Draw2D.cpp
+++ b/GPU/Common/Draw2D.cpp
@ -70,6 +70,23 @@ Draw2DPipelineInfo GenerateDraw2DCopyColorFs(ShaderWriter &writer) {
 	};
 }

+Draw2DPipelineInfo GenerateDraw2DCopyColorRect2LinFs(ShaderWriter &writer) {
+	writer.DeclareSamplers(samplers);
+	writer.BeginFSMain(g_draw2Duniforms, varyings, FSFLAG_NONE);
+	writer.C("  vec2 tSize = texSize / scaleFactor;\n");
+	writer.C("  vec2 pixels = v_texcoord * tSize;\n");
+	writer.C("  float u = mod(pixels.x, tSize.x);\n");
+	writer.C("  float v = floor(pixels.x / tSize.x);\n");
+	writer.C("  vec4 outColor = ").SampleTexture2D("tex", "vec2(u, v) / tSize").C(";\n");
+	writer.EndFSMain("outColor", FSFLAG_NONE);
+
+	return Draw2DPipelineInfo{
+		"draw2d_copy_color_rect2lin",
+		RASTER_COLOR,
+		RASTER_COLOR,
+	};
+}
+
 Draw2DPipelineInfo GenerateDraw2DCopyDepthFs(ShaderWriter &writer) {
 	writer.DeclareSamplers(samplers);
 	writer.BeginFSMain(Slice<UniformDef>::empty(), varyings, FSFLAG_WRITEDEPTH);
@ -318,6 +335,13 @@ Draw2DPipeline *FramebufferManagerCommon::Get2DPipeline(Draw2DShader shader) {
 		pipeline = draw2DPipelineColor_;
 		break;

+	case DRAW2D_COPY_COLOR_RECT2LIN:
+		if (!draw2DPipelineColorRect2Lin_) {
+			draw2DPipelineColorRect2Lin_ = draw2D_.Create2DPipeline(&GenerateDraw2DCopyColorRect2LinFs);
+		}
+		pipeline = draw2DPipelineColorRect2Lin_;
+		break;
+
 	case DRAW2D_COPY_DEPTH:
 		if (!draw_->GetDeviceCaps().fragmentShaderDepthWriteSupported) {
 			// Can't do it
--- a/GPU/Common/Draw2D.h
+++ b/GPU/Common/Draw2D.h
@ -16,6 +16,7 @@ enum Draw2DShader {
 	DRAW2D_COPY_DEPTH,
 	DRAW2D_565_TO_DEPTH,
 	DRAW2D_565_TO_DEPTH_DESWIZZLE,
+	DRAW2D_COPY_COLOR_RECT2LIN,
 };

 inline RasterChannel Draw2DSourceChannel(Draw2DShader shader) {
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@ -147,6 +147,8 @@ protected:

 	bool useHWTransform_ = false;
 	bool useHWTessellation_ = false;
+	// Used to prevent unnecessary flushing in softgpu.
+	bool flushOnParams_ = true;

 	// Vertex collector buffers
 	u8 *decoded = nullptr;
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@ -547,27 +547,26 @@ void FramebufferManagerCommon::SetDepthFrameBuffer(bool isClearingDepth) {
 		return;
 	}

+	// First time use of this framebuffer's depth buffer.
+	bool newlyUsingDepth = (currentRenderVfb_->usageFlags & FB_USAGE_RENDER_DEPTH) == 0;
+	currentRenderVfb_->usageFlags |= FB_USAGE_RENDER_DEPTH;
+
 	// If this first draw call is anything other than a clear, "resolve" the depth buffer,
 	// by copying from any overlapping buffers with fresher content.
-	if (!isClearingDepth) {
+	if (!isClearingDepth && useBufferedRendering_) {
 		CopyToDepthFromOverlappingFramebuffers(currentRenderVfb_);

-		// Special compatibility trick for Burnout Dominator lens flares. Not sure how to best generalize this. See issue #11100
-		if (PSP_CoreParameter().compat.flags().UploadDepthForCLUTTextures && (currentRenderVfb_->usageFlags & FB_USAGE_CLUT) != 0) {
-			// Set the flag, then upload memory contents to depth channel.
+		// Need to upload the first line of depth buffers, for Burnout Dominator lens flares. See issue #11100 and comments to #16081.
+		// Might make this more generic and upload the whole depth buffer if we find it's needed for something.
+		if (newlyUsingDepth) {
 			// Sanity check the depth buffer pointer.
-			if (currentRenderVfb_->z_address != 0 && currentRenderVfb_->z_address != currentRenderVfb_->fb_address) {
-				if (Memory::IsValidRange(currentRenderVfb_->z_address, currentRenderVfb_->width * 2)) {
-					const u16 *src = (const u16 *)Memory::GetPointerUnchecked(currentRenderVfb_->z_address);
-					DrawPixels(currentRenderVfb_, 0, 0, (const u8 *)src, GE_FORMAT_DEPTH16, currentRenderVfb_->z_stride, currentRenderVfb_->width, currentRenderVfb_->height, RASTER_DEPTH, "Depth Upload");
-				}
+			if (Memory::IsValidRange(currentRenderVfb_->z_address, currentRenderVfb_->width * 2)) {
+				const u16 *src = (const u16 *)Memory::GetPointerUnchecked(currentRenderVfb_->z_address);
+				DrawPixels(currentRenderVfb_, 0, 0, (const u8 *)src, GE_FORMAT_DEPTH16, currentRenderVfb_->z_stride, currentRenderVfb_->width, currentRenderVfb_->height, RASTER_DEPTH, "Depth Upload");
 			}
 		}
 	}

-	// First time use of this framebuffer's depth buffer.
-	currentRenderVfb_->usageFlags |= FB_USAGE_RENDER_DEPTH;
-
 	currentRenderVfb_->depthBindSeq = GetBindSeqCount();
 }

@ -647,7 +646,7 @@ void FramebufferManagerCommon::CopyToDepthFromOverlappingFramebuffers(VirtualFra
 		}
 	}

-	gstate_c.Dirty(DIRTY_TEXTURE_IMAGE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_BLEND_STATE);
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }

 // Can't easily dynamically create these strings, we just pass along the pointer.
@ -915,7 +914,7 @@ void FramebufferManagerCommon::BlitFramebufferDepth(VirtualFramebuffer *src, Vir

 	// Some GPUs can copy depth but only if stencil gets to come along for the ride. We only want to use this if there is no blit functionality.
 	if (useCopy) {
-		draw_->CopyFramebufferImage(src->fbo, 0, 0, 0, 0, dst->fbo, 0, 0, 0, 0, w, h, 1, Draw::FB_DEPTH_BIT, "BlitFramebufferDepth");
+		draw_->CopyFramebufferImage(src->fbo, 0, 0, 0, 0, dst->fbo, 0, 0, 0, 0, w, h, 1, Draw::FB_DEPTH_BIT, "CopyFramebufferDepth");
 		RebindFramebuffer("After BlitFramebufferDepth");
 	} else if (useBlit) {
 		// We'll accept whether we get a separate depth blit or not...
@ -1021,7 +1020,7 @@ void FramebufferManagerCommon::UpdateFromMemory(u32 addr, int size) {
 	// TODO: Could go through all FBOs, but probably not important?
 	// TODO: Could also check for inner changes, but video is most important.
 	// TODO: This shouldn't care if it's a display framebuf or not, should work exactly the same.
-	bool isDisplayBuf = addr == DisplayFramebufAddr() || addr == PrevDisplayFramebufAddr();
+	bool isDisplayBuf = addr == CurrentDisplayFramebufAddr() || addr == PrevDisplayFramebufAddr();
 	// TODO: Deleting the FBO is a heavy hammer solution, so let's only do it if it'd help.
 	if (!Memory::IsValidAddress(displayFramebufPtr_))
 		return;
@ -1097,7 +1096,7 @@ void FramebufferManagerCommon::DrawPixels(VirtualFramebuffer *vfb, int dstX, int
 		pixelsTex->Release();
 		draw_->InvalidateCachedState();

-		gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS);
+		gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 	}
 }

@ -1540,7 +1539,7 @@ void FramebufferManagerCommon::ResizeFramebufFBO(VirtualFramebuffer *vfb, int w,
 	if (creating) {
 		WARN_LOG(FRAMEBUF, "Creating %s FBO at %08x/%d %dx%d (force=%d)", GeBufferFormatToString(vfb->fb_format), vfb->fb_address, vfb->fb_stride, vfb->bufferWidth, vfb->bufferHeight, (int)force);
 	} else {
-		WARN_LOG(FRAMEBUF, "Resizing %s FBO at %08x/%d from %dx%d to %dx%d (force=%d)", GeBufferFormatToString(vfb->fb_format), vfb->fb_address, vfb->fb_stride, old.bufferWidth, old.bufferHeight, vfb->bufferWidth, vfb->bufferHeight, (int)force);
+		WARN_LOG(FRAMEBUF, "Resizing %s FBO at %08x/%d from %dx%d to %dx%d (force=%d, skipCopy=%d)", GeBufferFormatToString(vfb->fb_format), vfb->fb_address, vfb->fb_stride, old.bufferWidth, old.bufferHeight, vfb->bufferWidth, vfb->bufferHeight, (int)force, (int)skipCopy);
 	}

 	// During hardware rendering, we always render at full color depth even if the game wouldn't on real hardware.
@ -1578,8 +1577,10 @@ void FramebufferManagerCommon::ResizeFramebufFBO(VirtualFramebuffer *vfb, int w,
 		if (vfb->fbo) {
 			draw_->BindFramebufferAsRenderTarget(vfb->fbo, { Draw::RPAction::CLEAR, Draw::RPAction::CLEAR, Draw::RPAction::CLEAR }, "ResizeFramebufFBO");
 			if (!skipCopy) {
-				BlitFramebuffer(vfb, 0, 0, &old, 0, 0, std::min((u16)oldWidth, std::min(vfb->bufferWidth, vfb->width)), std::min((u16)oldHeight, std::min(vfb->height, vfb->bufferHeight)), 0, RASTER_COLOR, "Blit_ResizeFramebufFBO");
-				// Depth copying is handled by deferred copies later.
+				BlitFramebuffer(vfb, 0, 0, &old, 0, 0, std::min((u16)oldWidth, std::min(vfb->bufferWidth, vfb->width)), std::min((u16)oldHeight, std::min(vfb->height, vfb->bufferHeight)), 0, RASTER_COLOR, "BlitColor_ResizeFramebufFBO");
+			}
+			if (vfb->usageFlags & FB_USAGE_RENDER_DEPTH) {
+				BlitFramebuffer(vfb, 0, 0, &old, 0, 0, std::min((u16)oldWidth, std::min(vfb->bufferWidth, vfb->width)), std::min((u16)oldHeight, std::min(vfb->height, vfb->bufferHeight)), 0, RASTER_DEPTH, "BlitDepth_ResizeFramebufFBO");
 			}
 		}
 		fbosToDelete_.push_back(old.fbo);
@ -2182,7 +2183,7 @@ void FramebufferManagerCommon::NotifyBlockTransferAfter(u32 dstBasePtr, int dstS
 	// We may still do a partial block draw below if this doesn't pass.
 	if (!useBufferedRendering_ && dstStride >= 480 && width >= 480 && height == 272) {
 		bool isPrevDisplayBuffer = PrevDisplayFramebufAddr() == dstBasePtr;
-		bool isDisplayBuffer = DisplayFramebufAddr() == dstBasePtr;
+		bool isDisplayBuffer = CurrentDisplayFramebufAddr() == dstBasePtr;
 		if (isPrevDisplayBuffer || isDisplayBuffer) {
 			FlushBeforeCopy();
 			DrawFramebufferToOutput(Memory::GetPointerUnchecked(dstBasePtr), dstStride, displayFormat_);
@ -2214,8 +2215,9 @@ void FramebufferManagerCommon::NotifyBlockTransferAfter(u32 dstBasePtr, int dstS
 			int dstBpp = BufferFormatBytesPerPixel(dstRect.vfb->fb_format);
 			float dstXFactor = (float)bpp / dstBpp;
 			if (dstRect.w_bytes / bpp > dstRect.vfb->width || dstRect.h > dstRect.vfb->height) {
-				// The buffer isn't big enough, and we have a clear hint of size.  Resize.
+				// The buffer isn't big enough, and we have a clear hint of size. Resize.
 				// This happens in Valkyrie Profile when uploading video at the ending.
+				// Also happens to the CLUT framebuffer in the Burnout Dominator lens flare effect. See #16075
 				ResizeFramebufFBO(dstRect.vfb, dstRect.w_bytes / bpp, dstRect.h, false, true);
 				// Make sure we don't flop back and forth.
 				dstRect.vfb->newWidth = std::max(dstRect.w_bytes / bpp, (int)dstRect.vfb->width);
@ -2357,8 +2359,8 @@ void FramebufferManagerCommon::ShowScreenResolution() {
 // * Save state screenshots(could probably be async but need to manage the stall.)
 bool FramebufferManagerCommon::GetFramebuffer(u32 fb_address, int fb_stride, GEBufferFormat format, GPUDebugBuffer &buffer, int maxScaleFactor) {
 	VirtualFramebuffer *vfb = currentRenderVfb_;
-	if (!vfb) {
-		vfb = GetVFBAt(fb_address);
+	if (!vfb || vfb->fb_address != fb_address) {
+		vfb = ResolveVFB(fb_address, fb_stride, format);
 	}

 	if (!vfb) {
@ -2701,6 +2703,7 @@ void FramebufferManagerCommon::DeviceLost() {
 	DoRelease(stencilUploadSampler_);
 	DoRelease(stencilUploadPipeline_);
 	DoRelease(draw2DPipelineColor_);
+	DoRelease(draw2DPipelineColorRect2Lin_);
 	DoRelease(draw2DPipelineDepth_);
 	DoRelease(draw2DPipeline565ToDepth_);
 	DoRelease(draw2DPipeline565ToDepthDeswizzle_);
@ -2766,7 +2769,7 @@ void FramebufferManagerCommon::DrawActiveTexture(float x, float y, float w, floa

 	draw2D_.DrawStrip2D(nullptr, coord, 4, (flags & DRAWTEX_LINEAR) != 0, Get2DPipeline((flags & DRAWTEX_DEPTH) ? DRAW2D_COPY_DEPTH : DRAW2D_COPY_COLOR));

-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }

 void FramebufferManagerCommon::BlitFramebuffer(VirtualFramebuffer *dst, int dstX, int dstY, VirtualFramebuffer *src, int srcX, int srcY, int w, int h, int bpp, RasterChannel channel, const char *tag) {
@ -2779,6 +2782,11 @@ void FramebufferManagerCommon::BlitFramebuffer(VirtualFramebuffer *dst, int dstX
 		return;
 	}

+	if (channel == RASTER_DEPTH && !draw_->GetDeviceCaps().fragmentShaderDepthWriteSupported) {
+		// Can't do anything :(
+		return;
+	}
+
 	// Perform a little bit of clipping first.
 	// Block transfer coords are unsigned so I don't think we need to clip on the left side.. Although there are
 	// other uses for BlitFramebuffer.
@ -2870,7 +2878,7 @@ void FramebufferManagerCommon::BlitFramebuffer(VirtualFramebuffer *dst, int dstX

 	draw_->InvalidateCachedState();

-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }

 // The input is raw pixel coordinates, scale not taken into account.
@ -2906,7 +2914,7 @@ void FramebufferManagerCommon::BlitUsingRaster(

 	draw2D_.Blit(pipeline, srcX1, srcY1, srcX2, srcY2, destX1, destY1, destX2, destY2, (float)srcW, (float)srcH, (float)destW, (float)destH, linearFilter, scaleFactor);

-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }

 VirtualFramebuffer *FramebufferManagerCommon::ResolveFramebufferColorToFormat(VirtualFramebuffer *src, GEBufferFormat newFormat) {
--- a/GPU/Common/FramebufferManagerCommon.h
+++ b/GPU/Common/FramebufferManagerCommon.h
@ -335,15 +335,18 @@ public:
 	u32 PrevDisplayFramebufAddr() const {
 		return prevDisplayFramebuf_ ? prevDisplayFramebuf_->fb_address : 0;
 	}
-	u32 DisplayFramebufAddr() const {
+	u32 CurrentDisplayFramebufAddr() const {
 		return displayFramebuf_ ? displayFramebuf_->fb_address : 0;
 	}

+	u32 DisplayFramebufAddr() const {
+		return displayFramebufPtr_;
+	}
 	u32 DisplayFramebufStride() const {
-		return displayFramebuf_ ? displayStride_ : 0;
+		return displayStride_;
 	}
 	GEBufferFormat DisplayFramebufFormat() const {
-		return displayFramebuf_ ? displayFormat_ : GE_FORMAT_INVALID;
+		return displayFormat_;
 	}

 	bool UseBufferedRendering() const {
@ -566,6 +569,7 @@ protected:

 	// Draw2D pipelines
 	Draw2DPipeline *draw2DPipelineColor_ = nullptr;
+	Draw2DPipeline *draw2DPipelineColorRect2Lin_ = nullptr;
 	Draw2DPipeline *draw2DPipelineDepth_ = nullptr;
 	Draw2DPipeline *draw2DPipeline565ToDepth_ = nullptr;
 	Draw2DPipeline *draw2DPipeline565ToDepthDeswizzle_ = nullptr;
--- a/GPU/Common/ShaderCommon.h
+++ b/GPU/Common/ShaderCommon.h
@ -109,6 +109,10 @@ enum : uint64_t {
 	DIRTY_VERTEXSHADER_STATE = 1ULL << 47,
 	DIRTY_FRAGMENTSHADER_STATE = 1ULL << 48,

+	// Everything that's not uniforms. Use this after using thin3d.
+	// TODO: Should we also add DIRTY_FRAMEBUF here? It kinda generally takes care of itself.
+	DIRTY_ALL_RENDER_STATE = DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS,
+
 	DIRTY_ALL = 0xFFFFFFFFFFFFFFFF
 };

--- a/GPU/Common/SoftwareTransformCommon.cpp
+++ b/GPU/Common/SoftwareTransformCommon.cpp
@ -790,13 +790,13 @@ void SoftwareTransform::ExpandLines(int vertexCount, int &maxIndex, u16 *&inds,
 			float yoff = addWidth.y * dy;

 			// bottom right
-			trans[0].CopyFromWithOffset(transVtx2, xoff, yoff);
+			trans[0].CopyFromWithOffset(transVtx2, xoff * transVtx2.pos_w, yoff * transVtx2.pos_w);
 			// top right
-			trans[1].CopyFromWithOffset(transVtx1, xoff, yoff);
+			trans[1].CopyFromWithOffset(transVtx1, xoff * transVtx1.pos_w, yoff * transVtx1.pos_w);
 			// top left
-			trans[2].CopyFromWithOffset(transVtx1, -xoff, -yoff);
+			trans[2].CopyFromWithOffset(transVtx1, -xoff * transVtx1.pos_w, -yoff * transVtx1.pos_w);
 			// bottom left
-			trans[3].CopyFromWithOffset(transVtx2, -xoff, -yoff);
+			trans[3].CopyFromWithOffset(transVtx2, -xoff * transVtx2.pos_w, -yoff * transVtx2.pos_w);

 			// Triangle: BR-TR-TL
 			indsOut[0] = i * 2 + 0;
@ -835,17 +835,17 @@ void SoftwareTransform::ExpandLines(int vertexCount, int &maxIndex, u16 *&inds,

 			// bottom right
 			trans[0] = transVtxBL;
-			trans[0].x += addWidth.x * dx;
-			trans[0].y += addWidth.y * dy;
-			trans[0].u += addWidth.x * du;
-			trans[0].v += addWidth.y * dv;
+			trans[0].x += addWidth.x * dx * trans[0].pos_w;
+			trans[0].y += addWidth.y * dy * trans[0].pos_w;
+			trans[0].u += addWidth.x * du * trans[0].uv_w;
+			trans[0].v += addWidth.y * dv * trans[0].uv_w;

 			// top right
 			trans[1] = transVtxTL;
-			trans[1].x += addWidth.x * dx;
-			trans[1].y += addWidth.y * dy;
-			trans[1].u += addWidth.x * du;
-			trans[1].v += addWidth.y * dv;
+			trans[1].x += addWidth.x * dx * trans[1].pos_w;
+			trans[1].y += addWidth.y * dy * trans[1].pos_w;
+			trans[1].u += addWidth.x * du * trans[1].uv_w;
+			trans[1].v += addWidth.y * dv * trans[1].uv_w;

 			// top left
 			trans[2] = transVtxTL;
--- a/GPU/Common/SplineCommon.cpp
+++ b/GPU/Common/SplineCommon.cpp
@ -577,7 +577,8 @@ void DrawEngineCommon::SubmitCurve(const void *control_points, const void *indic
 	if (output.count)
 		DispatchSubmitPrim(output.vertices, output.indices, PatchPrimToPrim(surface.primType), output.count, vertTypeID, gstate.getCullMode(), &generatedBytesRead);

-	DispatchFlush();
+	if (flushOnParams_)
+		DispatchFlush();

 	if (origVertType & GE_VTYPE_TC_MASK) {
 		gstate_c.uv = prevUVScale;
--- a/GPU/Common/StencilCommon.cpp
+++ b/GPU/Common/StencilCommon.cpp
@ -186,13 +186,9 @@ bool FramebufferManagerCommon::PerformStencilUpload(u32 addr, int size, StencilU

 		// Otherwise, we can skip alpha in many cases, in which case we don't even use a shader.
 		if (flags & StencilUpload::IGNORE_ALPHA) {
-			shaderManager_->DirtyLastShader();
-
 			if (dstBuffer->fbo) {
 				draw_->BindFramebufferAsRenderTarget(dstBuffer->fbo, { Draw::RPAction::KEEP, Draw::RPAction::KEEP, Draw::RPAction::CLEAR }, "PerformStencilUpload_Clear");
 			}
-
-			gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_DEPTHSTENCIL_STATE);
 			return true;
 		}
 	}
@ -333,6 +329,6 @@ bool FramebufferManagerCommon::PerformStencilUpload(u32 addr, int size, StencilU
 	tex->Release();

 	draw_->InvalidateCachedState();
-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 	return true;
 }
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@ -418,10 +418,13 @@ TexCacheEntry *TextureCacheCommon::SetTexture() {
 	// Should probably revisit how this works..
 	gstate_c.SetNeedShaderTexclamp(false);
 	gstate_c.skipDrawReason &= ~SKIPDRAW_BAD_FB_TEXTURE;
-	if (gstate_c.bgraTexture != isBgraBackend_) {
+
+	bool isBgraTexture = isBgraBackend_ && !hasClutGPU;
+
+	if (gstate_c.bgraTexture != isBgraTexture) {
 		gstate_c.Dirty(DIRTY_FRAGMENTSHADER_STATE);
 	}
-	gstate_c.bgraTexture = isBgraBackend_;
+	gstate_c.bgraTexture = isBgraTexture;

 	if (entryIter != cache_.end()) {
 		entry = entryIter->second.get();
@ -1015,7 +1018,8 @@ bool TextureCacheCommon::MatchFramebuffer(
 			return false;
 		}

-		if (fb_stride_in_bytes != tex_stride_in_bytes) {
+		// Note the check for texHeight - we really don't care about a stride mismatch if texHeight == 1.
+		if (fb_stride_in_bytes != tex_stride_in_bytes && texHeight > 1) {
 			// Probably irrelevant. Although, as we shall see soon, there are exceptions.
 			// Burnout Dominator lens flare trick special case.
 			if (fb_format == GE_FORMAT_8888 && entry.format == GE_TFMT_CLUT8 && texWidth == 4 && texHeight == 1) {
@ -1205,6 +1209,8 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
 			clutRenderOffset_ = MAX_CLUT_OFFSET;
 			const std::vector<VirtualFramebuffer *> &framebuffers = framebufferManager_->Framebuffers();

+			u32 bestClutAddress = 0xFFFFFFFF;
+
 			VirtualFramebuffer *chosenFramebuffer = nullptr;
 			for (VirtualFramebuffer *framebuffer : framebuffers) {
 				const u32 fb_address = framebuffer->fb_address & 0x3FFFFFFF;
@ -1231,7 +1237,7 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
 						WARN_LOG_N_TIMES(clutfb, 5, G3D, "Detected LoadCLUT(%d bytes) from framebuffer %08x (%s), byte offset %d", loadBytes, fb_address, GeBufferFormatToString(framebuffer->fb_format), offset);
 						framebuffer->last_frame_clut = gpuStats.numFlips;
 						framebuffer->usageFlags |= FB_USAGE_CLUT;
-						clutRenderAddress_ = framebuffer->fb_address;
+						bestClutAddress = framebuffer->fb_address;
 						clutRenderOffset_ = (u32)offset;
 						chosenFramebuffer = framebuffer;
 						if (offset == 0) {
@ -1242,7 +1248,9 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
 				}
 			}

-			if (chosenFramebuffer) {
+			if (chosenFramebuffer && chosenFramebuffer->fbo) {
+				clutRenderAddress_ = bestClutAddress;
+
 				if (!dynamicClutTemp_) {
 					Draw::FramebufferDesc desc{};
 					desc.width = 512;
@ -1256,11 +1264,12 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
 					dynamicClutTemp_ = draw_->CreateFramebuffer(desc);
 				}

-				// Download the pixels to our temp clut, scaling down if needed.
+				// Copy the pixels to our temp clut, scaling down if needed and wrapping.
+				// TODO: Take the clutRenderOffset_ into account here.
 				framebufferManager_->BlitUsingRaster(
 					chosenFramebuffer->fbo, 0.0f, 0.0f, 512.0f * chosenFramebuffer->renderScaleFactor, 1.0f, 
 					dynamicClutTemp_, 0.0f, 0.0f, 512.0f, 1.0f, 
-					false, 1.0f, framebufferManager_->Get2DPipeline(DRAW2D_COPY_COLOR), "copy_clut_to_temp");
+					false, chosenFramebuffer->renderScaleFactor, framebufferManager_->Get2DPipeline(DRAW2D_COPY_COLOR_RECT2LIN), "copy_clut_to_temp");
 				clutRenderFormat_ = chosenFramebuffer->fb_format;
 			}
 			NotifyMemInfo(MemBlockFlags::ALLOC, clutAddr, loadBytes, "CLUT");
@ -2091,7 +2100,6 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 				mode = ShaderDepalMode::SMOOTHED;
 			}

-			// Since we started/ended render passes, might need these.
 			gstate_c.Dirty(DIRTY_DEPAL);
 			gstate_c.SetUseShaderDepal(mode);
 			gstate_c.depalFramebufferFormat = framebuffer->fb_format;
@ -2189,8 +2197,8 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 	SamplerCacheKey samplerKey = GetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight);
 	ApplySamplingParams(samplerKey);

-	// Since we started/ended render passes, might need these.
-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE);
+	// Since we've drawn using thin3d, might need these.
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }

 // Applies depal to a normal (non-framebuffer) texture, pre-decoded to CLUT8 format.
@ -2281,8 +2289,8 @@ void TextureCacheCommon::ApplyTextureDepal(TexCacheEntry *entry) {
 	SamplerCacheKey samplerKey = GetFramebufferSamplingParams(texWidth, texHeight);
 	ApplySamplingParams(samplerKey);

-	// Since we started/ended render passes, might need these.
-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE);
+	// Since we've drawn using thin3d, might need these.
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }

 void TextureCacheCommon::Clear(bool delete_them) {
@ -2630,14 +2638,32 @@ bool TextureCacheCommon::PrepareBuildTexture(BuildTexturePlan &plan, TexCacheEnt
 		}
 	}

-	if (isPPGETexture) {
-		plan.replaced = &replacer_.FindNone();
-		plan.replaceValid = false;
+	bool canReplace = !isPPGETexture;
+	if (entry->status & TexCacheEntry::TexStatus::STATUS_CLUT_GPU) {
+		_dbg_assert_(entry->format == GE_TFMT_CLUT4 || entry->format == GE_TFMT_CLUT8);
+		plan.decodeToClut8 = true;
+		// We only support 1 mip level when doing CLUT on GPU for now.
+		// Supporting more would be possible, just not very interesting until we need it.
+		plan.levelsToCreate = 1;
+		plan.levelsToLoad = 1;
+		plan.maxPossibleLevels = 1;
+		plan.scaleFactor = 1;
+		plan.saveTexture = false;  // Can't yet save these properly.
+		canReplace = false;
 	} else {
+		plan.decodeToClut8 = false;
+	}
+
+	if (canReplace) {
 		plan.replaced = &FindReplacement(entry, plan.w, plan.h, plan.depth);
 		plan.replaceValid = plan.replaced->Valid();
+	} else {
+		plan.replaced = &replacer_.FindNone();
+		plan.replaceValid = false;
 	}

+	// NOTE! Last chance to change scale factor here!
+
 	plan.saveTexture = false;
 	if (plan.replaceValid) {
 		// We're replacing, so we won't scale.
@ -2648,7 +2674,7 @@ bool TextureCacheCommon::PrepareBuildTexture(BuildTexturePlan &plan, TexCacheEnt
 		// But, we still need to create the texture at a larger size.
 		plan.replaced->GetSize(0, plan.createW, plan.createH);
 	} else {
-		if (replacer_.Enabled() && !plan.replaceValid && plan.depth == 1) {
+		if (replacer_.Enabled() && !plan.replaceValid && plan.depth == 1 && canReplace) {
 			ReplacedTextureDecodeInfo replacedInfo;
 			// TODO: Do we handle the race where a replacement becomes valid AFTER this but before we save?
 			replacedInfo.cachekey = entry->CacheKey();
@ -2673,27 +2699,12 @@ bool TextureCacheCommon::PrepareBuildTexture(BuildTexturePlan &plan, TexCacheEnt
 		plan.levelsToLoad = 1;
 	}

-	if (plan.isVideo || plan.depth != 1) {
+	if (plan.isVideo || plan.depth != 1 || plan.decodeToClut8) {
 		plan.maxPossibleLevels = 1;
 	} else {
 		plan.maxPossibleLevels = log2i(std::min(plan.createW, plan.createH)) + 1;
 	}

-	if (entry->status & TexCacheEntry::TexStatus::STATUS_CLUT_GPU) {
-		_dbg_assert_(entry->format == GE_TFMT_CLUT4 || entry->format == GE_TFMT_CLUT8);
-		plan.decodeToClut8 = true;
-		// We only support 1 mip level when doing CLUT on GPU for now.
-		// Supporting more would be possible, just not very interesting until we need it.
-		plan.levelsToCreate = 1;
-		plan.levelsToLoad = 1;
-		plan.maxPossibleLevels = 1;
-		plan.scaleFactor = 1;
-		plan.saveTexture = false;  // Can't yet save these properly.
-		// TODO: Also forcibly disable replacement, or check that the replacement is a 8-bit paletted texture.
-	} else {
-		plan.decodeToClut8 = false;
-	}
-
 	if (plan.levelsToCreate == 1) {
 		entry->status |= TexCacheEntry::STATUS_NO_MIPS;
 	} else {
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@ -1128,7 +1128,10 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 					}
 				} else {
 					if (hasTexcoord) {
-						WRITE(p, "  %sv_texcoord = vec3(texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n", compat.vsOutPrefix);
+						if (doBezier || doSpline)
+							WRITE(p, "  %sv_texcoord = vec3(tess.tex.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n", compat.vsOutPrefix);
+						else
+							WRITE(p, "  %sv_texcoord = vec3(texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n", compat.vsOutPrefix);
 					} else {
 						WRITE(p, "  %sv_texcoord = vec3(u_uvscaleoffset.zw, 0.0);\n", compat.vsOutPrefix);
 					}
@ -1140,26 +1143,36 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 					std::string temp_tc;
 					switch (uvProjMode) {
 					case GE_PROJMAP_POSITION:  // Use model space XYZ as source
-						temp_tc = "vec4(position, 1.0)";
+						if (doBezier || doSpline)
+							temp_tc = "vec4(tess.pos, 1.0)";
+						else
+							temp_tc = "vec4(position, 1.0)";
 						break;
 					case GE_PROJMAP_UV:  // Use unscaled UV as source
 						{
 							// prescale is false here.
 							if (hasTexcoord) {
-								temp_tc = "vec4(texcoord.xy, 0.0, 1.0)";
+								if (doBezier || doSpline)
+									temp_tc = "vec4(tess.tex.xy, 0.0, 1.0)";
+								else
+									temp_tc = "vec4(texcoord.xy, 0.0, 1.0)";
 							} else {
 								temp_tc = "vec4(0.0, 0.0, 0.0, 1.0)";
 							}
 						}
 						break;
 					case GE_PROJMAP_NORMALIZED_NORMAL:  // Use normalized transformed normal as source
-						if (hasNormal)
+						if ((doBezier || doSpline) && hasNormalTess)
+							temp_tc = StringFromFormat("length(tess.nrm) == 0.0 ? vec4(0.0, 0.0, 1.0, 1.0) : vec4(normalize(%stess.nrm), 1.0)", flipNormalTess ? "-" : "");
+						else if (hasNormal)
 							temp_tc = StringFromFormat("length(normal) == 0.0 ? vec4(0.0, 0.0, 1.0, 1.0) : vec4(normalize(%snormal), 1.0)", flipNormal ? "-" : "");
 						else
 							temp_tc = "vec4(0.0, 0.0, 1.0, 1.0)";
 						break;
 					case GE_PROJMAP_NORMAL:  // Use non-normalized transformed normal as source
-						if (hasNormal)
+						if ((doBezier || doSpline) && hasNormalTess)
+							temp_tc = flipNormalTess ? "vec4(-tess.nrm, 1.0)" : "vec4(tess.nrm, 1.0)";
+						else if (hasNormal)
 							temp_tc = flipNormal ? "vec4(-normal, 1.0)" : "vec4(normal, 1.0)";
 						else
 							temp_tc = "vec4(0.0, 0.0, 1.0, 1.0)";
@ -1189,37 +1202,34 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 			WRITE(p, "  %sv_fogdepth = (viewPos.z + u_fogcoef.x) * u_fogcoef.y;\n", compat.vsOutPrefix);
 	}

-	if (clipClampedDepth || (vertexRangeCulling && !IsVRBuild())) {
-		WRITE(p, "  vec3 projPos = outPos.xyz / outPos.w;\n");
-	}
-
 	if (clipClampedDepth) {
 		const char *clip0 = compat.shaderLanguage == HLSL_D3D11 ? ".x" : "[0]";
 		const char *clip1 = compat.shaderLanguage == HLSL_D3D11 ? ".y" : "[1]";
-		WRITE(p, "  mediump float integerZ = projPos.z * u_depthRange.x + u_depthRange.y;\n");

 		// This should clip against minz, but only when it's above zero.
 		if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {
 			// On OpenGL/GLES, these values account for the -1 -> 1 range.
 			WRITE(p, "  if (u_depthRange.y - u_depthRange.x >= 1.0) {\n");
+			WRITE(p, "    %sgl_ClipDistance%s = outPos.w + outPos.z;\n", compat.vsOutPrefix, clip0);
 		} else {
 			// Everywhere else, it's 0 -> 1, simpler.
 			WRITE(p, "  if (u_depthRange.y >= 1.0) {\n");
+			WRITE(p, "    %sgl_ClipDistance%s = outPos.z;\n", compat.vsOutPrefix, clip0);
 		}
-		WRITE(p, "    %sgl_ClipDistance%s = integerZ;\n", compat.vsOutPrefix, clip0);
 		WRITE(p, "  } else {\n");
 		WRITE(p, "    %sgl_ClipDistance%s = 0.0;\n", compat.vsOutPrefix, clip0);
 		WRITE(p, "  }\n");

 		// This is similar, but for maxz when it's below 65535.0.  -1/0 don't matter here.
 		WRITE(p, "  if (u_depthRange.x + u_depthRange.y <= 65534.0) {\n");
-		WRITE(p, "    %sgl_ClipDistance%s = 65535.0 - integerZ;\n", compat.vsOutPrefix, clip1);
+		WRITE(p, "    %sgl_ClipDistance%s = outPos.w - outPos.z;\n", compat.vsOutPrefix, clip1);
 		WRITE(p, "  } else {\n");
 		WRITE(p, "    %sgl_ClipDistance%s = 0.0;\n", compat.vsOutPrefix, clip1);
 		WRITE(p, "  }\n");
 	}

 	if (vertexRangeCulling && !IsVRBuild()) {
+		WRITE(p, "  vec3 projPos = outPos.xyz / outPos.w;\n");
 		WRITE(p, "  float projZ = (projPos.z - u_depthRange.z) * u_depthRange.w;\n");
 		// Vertex range culling doesn't happen when Z clips, note sign of w is important.
 		WRITE(p, "  if (u_cullRangeMin.w <= 0.0 || projZ * outPos.w > -outPos.w) {\n");
--- a/GPU/GLES/DrawEngineGLES.cpp
+++ b/GPU/GLES/DrawEngineGLES.cpp
@ -500,7 +500,8 @@ void TessellationDataTransferGLES::SendDataToShader(const SimpleVertex *const *p
 		prevSizeU = size_u;
 		prevSizeV = size_v;
 		if (!data_tex[0])
-			data_tex[0] = renderManager_->CreateTexture(GL_TEXTURE_2D, size_u * 3, size_v, 1, 1);
+			renderManager_->DeleteTexture(data_tex[0]);
+		data_tex[0] = renderManager_->CreateTexture(GL_TEXTURE_2D, size_u * 3, size_v, 1, 1);
 		renderManager_->TextureImage(data_tex[0], 0, size_u * 3, size_v, 1, Draw::DataFormat::R32G32B32A32_FLOAT, nullptr, GLRAllocType::NONE, false);
 		renderManager_->FinalizeTexture(data_tex[0], 0, false);
 	}
@ -518,7 +519,8 @@ void TessellationDataTransferGLES::SendDataToShader(const SimpleVertex *const *p
 	if (prevSizeWU < weights.size_u) {
 		prevSizeWU = weights.size_u;
 		if (!data_tex[1])
-			data_tex[1] = renderManager_->CreateTexture(GL_TEXTURE_2D, weights.size_u * 2, 1, 1, 1);
+			renderManager_->DeleteTexture(data_tex[1]);
+		data_tex[1] = renderManager_->CreateTexture(GL_TEXTURE_2D, weights.size_u * 2, 1, 1, 1);
 		renderManager_->TextureImage(data_tex[1], 0, weights.size_u * 2, 1, 1, Draw::DataFormat::R32G32B32A32_FLOAT, nullptr, GLRAllocType::NONE, false);
 		renderManager_->FinalizeTexture(data_tex[1], 0, false);
 	}
@ -529,7 +531,8 @@ void TessellationDataTransferGLES::SendDataToShader(const SimpleVertex *const *p
 	if (prevSizeWV < weights.size_v) {
 		prevSizeWV = weights.size_v;
 		if (!data_tex[2])
-			data_tex[2] = renderManager_->CreateTexture(GL_TEXTURE_2D, weights.size_v * 2, 1, 1, 1);
+			renderManager_->DeleteTexture(data_tex[2]);
+		data_tex[2] = renderManager_->CreateTexture(GL_TEXTURE_2D, weights.size_v * 2, 1, 1, 1);
 		renderManager_->TextureImage(data_tex[2], 0, weights.size_v * 2, 1, 1, Draw::DataFormat::R32G32B32A32_FLOAT, nullptr, GLRAllocType::NONE, false);
 		renderManager_->FinalizeTexture(data_tex[2], 0, false);
 	}
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@ -1942,7 +1942,8 @@ void GPUCommon::Execute_Bezier(u32 op, u32 diff) {
 	}

 	// Can't flush after setting gstate_c.submitType below since it'll be a mess - it must be done already.
-	drawEngineCommon_->DispatchFlush();
+	if (flushOnParams_)
+		drawEngineCommon_->DispatchFlush();

 	Spline::BezierSurface surface;
 	surface.tess_u = gstate.getPatchDivisionU();
@ -2014,7 +2015,8 @@ void GPUCommon::Execute_Spline(u32 op, u32 diff) {
 	}

 	// Can't flush after setting gstate_c.submitType below since it'll be a mess - it must be done already.
-	drawEngineCommon_->DispatchFlush();
+	if (flushOnParams_)
+		drawEngineCommon_->DispatchFlush();

 	Spline::SplineSurface surface;
 	surface.tess_u = gstate.getPatchDivisionU();
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@ -469,12 +469,12 @@ struct UVScale {
 // Might want to move this mechanism into the backend later.
 enum {
 	GPU_SUPPORTS_DUALSOURCE_BLEND = FLAG_BIT(0),
-	// Free bit: 1
-	GPU_SUPPORTS_GLSL_330 = FLAG_BIT(2),
+	// Free bits: 1-2
 	GPU_SUPPORTS_VS_RANGE_CULLING = FLAG_BIT(3),
 	GPU_SUPPORTS_BLEND_MINMAX = FLAG_BIT(4),
 	GPU_SUPPORTS_LOGIC_OP = FLAG_BIT(5),
 	GPU_USE_DEPTH_RANGE_HACK = FLAG_BIT(6),
+	// Free bit: 7
 	GPU_SUPPORTS_ANISOTROPY = FLAG_BIT(8),
 	GPU_USE_CLEAR_RAM_HACK = FLAG_BIT(9),
 	GPU_SUPPORTS_INSTANCE_RENDERING = FLAG_BIT(10),
@ -485,8 +485,7 @@ enum {
 	// Free bit: 15
 	GPU_SUPPORTS_DEPTH_TEXTURE = FLAG_BIT(16),
 	GPU_SUPPORTS_ACCURATE_DEPTH = FLAG_BIT(17),
-	GPU_SUPPORTS_FRAGMENT_SHADER_INTERLOCK = FLAG_BIT(18),
-	// Free bits: 19
+	// Free bits: 18-19
 	GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH = FLAG_BIT(20),
 	GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT = FLAG_BIT(21),
 	GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT = FLAG_BIT(22),
--- a/GPU/Software/BinManager.cpp
+++ b/GPU/Software/BinManager.cpp
@ -197,16 +197,16 @@ void BinManager::UpdateState(bool throughMode) {
 			Flush("tex");

 		// Okay, now update what's pending.
-		constexpr uint32_t mirrorMask = 0x0FFFFFFF & ~0x00600000;
-		const uint32_t bpp = state.pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
-		pendingWrites_[0].Expand(gstate.getFrameBufAddress() & mirrorMask, bpp, gstate.FrameBufStride(), scissorTL, scissorBR);
-		if (state.pixelID.depthWrite)
-			pendingWrites_[1].Expand(gstate.getDepthBufAddress() & mirrorMask, 2, gstate.DepthBufStride(), scissorTL, scissorBR);
+		MarkPendingWrites(state);

 		ClearDirty(SoftDirty::BINNER_RANGE);
 	} else if (pendingOverlap_) {
-		if (HasTextureWrite(state))
+		if (HasTextureWrite(state)) {
 			Flush("tex");
+
+			// We need the pending writes set, which flushing cleared.  Set them again.
+			MarkPendingWrites(state);
+		}
 	}

 	if (HasDirty(SoftDirty::BINNER_OVERLAP)) {
@ -282,6 +282,17 @@ void BinManager::MarkPendingReads(const Rasterizer::RasterizerState &state) {
 	}
 }

+void BinManager::MarkPendingWrites(const Rasterizer::RasterizerState &state) {
+	DrawingCoords scissorTL(gstate.getScissorX1(), gstate.getScissorY1());
+	DrawingCoords scissorBR(std::min(gstate.getScissorX2(), gstate.getRegionX2()), std::min(gstate.getScissorY2(), gstate.getRegionY2()));
+
+	constexpr uint32_t mirrorMask = 0x0FFFFFFF & ~0x00600000;
+	const uint32_t bpp = state.pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
+	pendingWrites_[0].Expand(gstate.getFrameBufAddress() & mirrorMask, bpp, gstate.FrameBufStride(), scissorTL, scissorBR);
+	if (state.pixelID.depthWrite)
+		pendingWrites_[1].Expand(gstate.getDepthBufAddress() & mirrorMask, 2, gstate.DepthBufStride(), scissorTL, scissorBR);
+}
+
 inline void BinDirtyRange::Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, DrawingCoords &tl, DrawingCoords &br) {
 	const uint32_t w = br.x - tl.x + 1;
 	const uint32_t h = br.y - tl.y + 1;
--- a/GPU/Software/BinManager.h
+++ b/GPU/Software/BinManager.h
@ -267,6 +267,7 @@ private:
 	int mostThreads_ = 0;

 	void MarkPendingReads(const Rasterizer::RasterizerState &state);
+	void MarkPendingWrites(const Rasterizer::RasterizerState &state);
 	bool HasTextureWrite(const Rasterizer::RasterizerState &state);
 	BinCoords Scissor(BinCoords range);
 	BinCoords Range(const VertexData &v0, const VertexData &v1, const VertexData &v2);
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@ -1136,13 +1136,20 @@ void DrawPoint(const VertexData &v0, const BinCoords &range, const RasterizerSta
 }

 void ClearRectangle(const VertexData &v0, const VertexData &v1, const BinCoords &range, const RasterizerState &state) {
-	DrawingCoords pprime = TransformUnit::ScreenToDrawing(range.x1, range.y1);
-	DrawingCoords pend = TransformUnit::ScreenToDrawing(range.x2, range.y2);
+	int entireX1 = std::min(v0.screenpos.x, v1.screenpos.x);
+	int entireY1 = std::min(v0.screenpos.y, v1.screenpos.y);
+	int entireX2 = std::max(v0.screenpos.x, v1.screenpos.x) - 1;
+	int entireY2 = std::max(v0.screenpos.y, v1.screenpos.y) - 1;
+	int minX = std::max(entireX1, range.x1) | (SCREEN_SCALE_FACTOR / 2 - 1);
+	int minY = std::max(entireY1, range.y1) | (SCREEN_SCALE_FACTOR / 2 - 1);
+	int maxX = std::min(entireX2, range.x2);
+	int maxY = std::min(entireY2, range.y2);
+	const DrawingCoords pprime = TransformUnit::ScreenToDrawing(minX, minY);
+	const DrawingCoords pend = TransformUnit::ScreenToDrawing(maxX, maxY);
 	auto &pixelID = state.pixelID;
 	auto &samplerID = state.samplerID;

-	// Min and max are in PSP fixed point screen coordinates, 16 here is for the 4 subpixel bits.
-	const int w = (range.x2 - range.x1 + 1) / SCREEN_SCALE_FACTOR;
+	const int w = pend.x - pprime.x + 1;
 	if (w <= 0)
 		return;

--- a/GPU/Software/RasterizerRectangle.cpp
+++ b/GPU/Software/RasterizerRectangle.cpp
@ -93,7 +93,10 @@ static inline bool AlphaTestIsNeedless(const PixelFuncID &pixelID) {
 	case GE_COMP_NOTEQUAL:
 	case GE_COMP_GREATER:
 	case GE_COMP_GEQUAL:
-		return pixelID.alphaBlend && pixelID.alphaTestRef == 0 && !pixelID.hasAlphaTestMask;
+		if (pixelID.alphaTestRef != 0 || pixelID.hasAlphaTestMask)
+			return false;
+		// DrawSinglePixel5551 assumes it can take the src color directly if full alpha.
+		return pixelID.alphaBlend && pixelID.AlphaBlendSrc() == PixelBlendFactor::SRCALPHA && pixelID.AlphaBlendDst() == PixelBlendFactor::INVSRCALPHA;
 	}

 	return false;
--- a/GPU/Software/SoftGpu.cpp
+++ b/GPU/Software/SoftGpu.cpp
@ -490,14 +490,16 @@ void SoftGPU::SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat for

 DSStretch g_DarkStalkerStretch;

-void SoftGPU::ConvertTextureDescFrom16(Draw::TextureDesc &desc, int srcwidth, int srcheight, u8 *overrideData) {
+void SoftGPU::ConvertTextureDescFrom16(Draw::TextureDesc &desc, int srcwidth, int srcheight, const uint16_t *overrideData) {
 	// TODO: This should probably be converted in a shader instead..
 	fbTexBuffer_.resize(srcwidth * srcheight);
-	FormatBuffer displayBuffer;
-	displayBuffer.data = overrideData ? overrideData : Memory::GetPointerWrite(displayFramebuf_);
+	const uint16_t *displayBuffer = overrideData;
+	if (!displayBuffer)
+		displayBuffer = (const uint16_t *)Memory::GetPointer(displayFramebuf_);
+
 	for (int y = 0; y < srcheight; ++y) {
 		u32 *buf_line = &fbTexBuffer_[y * srcwidth];
-		const u16 *fb_line = &displayBuffer.as16[y * displayStride_];
+		const u16 *fb_line = &displayBuffer[y * displayStride_];

 		switch (displayFormat_) {
 		case GE_FORMAT_565:
@ -557,7 +559,7 @@ void SoftGPU::CopyToCurrentFboFromDisplayRam(int srcwidth, int srcheight) {
 	bool hasPostShader = presentation_ && presentation_->HasPostShader();

 	if (PSP_CoreParameter().compat.flags().DarkStalkersPresentHack && displayFormat_ == GE_FORMAT_5551 && g_DarkStalkerStretch != DSStretch::Off) {
-		u8 *data = Memory::GetPointerWrite(0x04088000);
+		const u8 *data = Memory::GetPointer(0x04088000);  // Read-only access, like the other display reads here.
 		bool fillDesc = true;
 		if (draw_->GetDataFormatSupport(Draw::DataFormat::A1B5G5R5_UNORM_PACK16) & Draw::FMT_TEXTURE) {
 			// The perfect one.
@ -567,7 +569,7 @@ void SoftGPU::CopyToCurrentFboFromDisplayRam(int srcwidth, int srcheight) {
 			desc.format = Draw::DataFormat::A1R5G5B5_UNORM_PACK16;
 			outputFlags |= OutputFlags::RB_SWIZZLE;
 		} else {
-			ConvertTextureDescFrom16(desc, srcwidth, srcheight, data);
+			ConvertTextureDescFrom16(desc, srcwidth, srcheight, (const uint16_t *)data);
 			fillDesc = false;
 		}
 		if (fillDesc) {
@ -586,13 +588,13 @@ void SoftGPU::CopyToCurrentFboFromDisplayRam(int srcwidth, int srcheight) {
 		hasImage = false;
 		u1 = 1.0f;
 	} else if (displayFormat_ == GE_FORMAT_8888) {
-		u8 *data = Memory::GetPointerWrite(displayFramebuf_);
+		const u8 *data = Memory::GetPointer(displayFramebuf_);
 		desc.width = displayStride_ == 0 ? srcwidth : displayStride_;
 		desc.height = srcheight;
 		desc.initData.push_back(data);
 		desc.format = Draw::DataFormat::R8G8B8A8_UNORM;
 	} else if (displayFormat_ == GE_FORMAT_5551) {
-		const u8 *data = Memory::GetPointerWrite(displayFramebuf_);
+		const u8 *data = Memory::GetPointer(displayFramebuf_);
 		bool fillDesc = true;
 		if (draw_->GetDataFormatSupport(Draw::DataFormat::A1B5G5R5_UNORM_PACK16) & Draw::FMT_TEXTURE) {
 			// The perfect one.
@ -1247,18 +1249,19 @@ bool SoftGPU::GetCurrentFramebuffer(GPUDebugBuffer &buffer, GPUDebugFramebufferT
 	int stride = gstate.FrameBufStride();
 	DrawingCoords size = GetTargetSize(stride);
 	GEBufferFormat fmt = gstate.FrameBufFormat();
+	const u8 *src = fb.data;

 	if (type == GPU_DBG_FRAMEBUF_DISPLAY) {
 		size.x = 480;
 		size.y = 272;
 		stride = displayStride_;
 		fmt = displayFormat_;
+		src = Memory::GetPointer(displayFramebuf_);
 	}

 	buffer.Allocate(size.x, size.y, fmt);

 	const int depth = fmt == GE_FORMAT_8888 ? 4 : 2;
-	const u8 *src = fb.data;
 	u8 *dst = buffer.GetData();
 	const int byteWidth = size.x * depth;
 	for (int16_t y = 0; y < size.y; ++y) {
--- a/GPU/Software/SoftGpu.h
+++ b/GPU/Software/SoftGpu.h
@ -64,17 +64,17 @@ enum class SoftDirty : uint64_t {
 	PIXEL_DITHER = 1ULL << 3,
 	PIXEL_WRITEMASK = 1ULL << 4,
 	PIXEL_CACHED = 1ULL << 5,
-	PIXEL_ALL = 63ULL << 0,
+	PIXEL_ALL = 0b111111ULL << 0,

 	SAMPLER_BASIC = 1ULL << 6,
 	SAMPLER_TEXLIST = 1ULL << 7,
 	SAMPLER_CLUT = 1ULL << 8,
-	SAMPLER_ALL = 7ULL << 6,
+	SAMPLER_ALL = 0b111ULL << 6,

 	RAST_BASIC = 1ULL << 9,
 	RAST_TEX = 1ULL << 10,
 	RAST_OFFSET = 1ULL << 11,
-	RAST_ALL = 7ULL << 9,
+	RAST_ALL = 0b111ULL << 9,

 	LIGHT_BASIC = 1ULL << 12,
 	LIGHT_MATERIAL = 1ULL << 13,
@ -82,13 +82,13 @@ enum class SoftDirty : uint64_t {
 	LIGHT_1 = 1ULL << 15,
 	LIGHT_2 = 1ULL << 16,
 	LIGHT_3 = 1ULL << 17,
-	LIGHT_ALL = 63ULL << 12,
+	LIGHT_ALL = 0b111111ULL << 12,

 	TRANSFORM_BASIC = 1ULL << 18,
 	TRANSFORM_MATRIX = 1ULL << 19,
 	TRANSFORM_VIEWPORT = 1ULL << 20,
 	TRANSFORM_FOG = 1ULL << 21,
-	TRANSFORM_ALL = 31ULL << 18,
+	TRANSFORM_ALL = 0b1111ULL << 18,

 	BINNER_RANGE = 1ULL << 22,
 	BINNER_OVERLAP = 1ULL << 23,
@ -194,7 +194,7 @@ public:
 protected:
 	void FastRunLoop(DisplayList &list) override;
 	void CopyToCurrentFboFromDisplayRam(int srcwidth, int srcheight);
-	void ConvertTextureDescFrom16(Draw::TextureDesc &desc, int srcwidth, int srcheight, u8 *overrideData = nullptr);
+	void ConvertTextureDescFrom16(Draw::TextureDesc &desc, int srcwidth, int srcheight, const uint16_t *overrideData = nullptr);

 private:
 	void MarkDirty(uint32_t addr, uint32_t stride, uint32_t height, GEBufferFormat fmt, SoftGPUVRAMDirty value);
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@ -54,6 +54,7 @@ SoftwareDrawEngine::SoftwareDrawEngine() {
 	// All this is a LOT of memory, need to see if we can cut down somehow.  Used for splines.
 	decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
+	flushOnParams_ = false;
 }

 SoftwareDrawEngine::~SoftwareDrawEngine() {
--- a/GPU/Vulkan/PipelineManagerVulkan.cpp
+++ b/GPU/Vulkan/PipelineManagerVulkan.cpp
@ -170,8 +170,8 @@ static std::string CutFromMain(std::string str) {
 }

 static VulkanPipeline *CreateVulkanPipeline(VulkanRenderManager *renderManager, VkPipelineCache pipelineCache,
-		VkPipelineLayout layout, PipelineFlags pipelineFlags, const VulkanPipelineRasterStateKey &key,
-		const DecVtxFormat *decFmt, VulkanVertexShader *vs, VulkanFragmentShader *fs, bool useHwTransform, u32 variantBitmask) {
+	VkPipelineLayout layout, PipelineFlags pipelineFlags, const VulkanPipelineRasterStateKey &key,
+	const DecVtxFormat *decFmt, VulkanVertexShader *vs, VulkanFragmentShader *fs, bool useHwTransform, u32 variantBitmask) {
 	VulkanPipeline *vulkanPipeline = new VulkanPipeline();
 	VKRGraphicsPipelineDesc *desc = &vulkanPipeline->desc;
 	desc->pipelineCache = pipelineCache;
@ -221,7 +221,7 @@ static VulkanPipeline *CreateVulkanPipeline(VulkanRenderManager *renderManager,
 	VkDynamicState *dynamicStates = &desc->dynamicStates[0];
 	int numDyn = 0;
 	if (key.blendEnable &&
-		  (UsesBlendConstant(key.srcAlpha) || UsesBlendConstant(key.srcColor) || UsesBlendConstant(key.destAlpha) || UsesBlendConstant(key.destColor))) {
+		(UsesBlendConstant(key.srcAlpha) || UsesBlendConstant(key.srcColor) || UsesBlendConstant(key.destAlpha) || UsesBlendConstant(key.destColor))) {
 		dynamicStates[numDyn++] = VK_DYNAMIC_STATE_BLEND_CONSTANTS;
 		useBlendConstant = true;
 	}
@ -232,12 +232,12 @@ static VulkanPipeline *CreateVulkanPipeline(VulkanRenderManager *renderManager,
 		dynamicStates[numDyn++] = VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK;
 		dynamicStates[numDyn++] = VK_DYNAMIC_STATE_STENCIL_REFERENCE;
 	}
-	
+
 	VkPipelineDynamicStateCreateInfo &ds = desc->ds;
 	ds.flags = 0;
 	ds.pDynamicStates = dynamicStates;
 	ds.dynamicStateCount = numDyn;
-	
+
 	VkPipelineRasterizationStateCreateInfo &rs = desc->rs;
 	rs.flags = 0;
 	rs.depthBiasEnable = false;
@ -299,10 +299,9 @@ static VulkanPipeline *CreateVulkanPipeline(VulkanRenderManager *renderManager,
 	VKRGraphicsPipeline *pipeline = renderManager->CreateGraphicsPipeline(desc, variantBitmask, "game");

 	vulkanPipeline->pipeline = pipeline;
-	if (useBlendConstant)
+	if (useBlendConstant) {
 		pipelineFlags |= PipelineFlags::USES_BLEND_CONSTANT;
-	if (key.topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST || key.topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP)
-		pipelineFlags |= PipelineFlags::USES_LINES;
+	}
 	if (dss.depthTestEnable || dss.stencilTestEnable) {
 		pipelineFlags |= PipelineFlags::USES_DEPTH_STENCIL;
 	}
--- a/GPU/Vulkan/PipelineManagerVulkan.h
+++ b/GPU/Vulkan/PipelineManagerVulkan.h
@ -58,7 +58,6 @@ struct VulkanPipeline {
 	PipelineFlags pipelineFlags;  // PipelineFlags enum above.

 	bool UsesBlendConstant() const { return (pipelineFlags & PipelineFlags::USES_BLEND_CONSTANT) != 0; }
-	bool UsesLines() const { return (pipelineFlags & PipelineFlags::USES_LINES) != 0; }
 	bool UsesDepthStencil() const { return (pipelineFlags & PipelineFlags::USES_DEPTH_STENCIL) != 0; }
 	bool UsesInputAttachment() const { return (pipelineFlags & PipelineFlags::USES_INPUT_ATTACHMENT) != 0; }

--- a/Windows/GEDebugger/GEDebugger.h
+++ b/Windows/GEDebugger/GEDebugger.h
@ -134,7 +134,7 @@ private:
 	int textureLevel_ = 0;
 	bool showClut_ = false;
 	bool forceOpaque_ = false;
-	bool autoFlush_ = false;
+	bool autoFlush_ = true;
 	// The most recent primary/framebuffer and texture buffers.
 	const GPUDebugBuffer *primaryBuffer_ = nullptr;
 	const GPUDebugBuffer *secondBuffer_ = nullptr;
--- a/assets/compat.ini
+++ b/assets/compat.ini
@ -520,6 +520,14 @@ ULES01086 = true
 # LEGO Batman: The Videogame
 ULUS10380 = true
 ULES01151 = true
+# Burnout Dominator
+ULUS10236 = true
+ULES00750 = true
+ULJM05242 = true
+ULJM05371 = true
+NPJH50304 = true
+ULES00703 = true
+
 # TODO: There are many more.

 [RequireBlockTransfer]
@ -1278,16 +1286,3 @@ ULJM05738 = true
 [AllowDownloadCLUT]
 # Temporary compatibility option, while working on the GPU CLUT-from-framebuffer path.
 # Not required for any games now that it works, but might be useful for development.
-
-[UploadDepthForCLUTTextures]
-# Burnout Dominator - lens flare effect (issue #11100)
-# We need a preinitialized depth buffer
-ULUS10236 = true
-ULES00703 = true
-
-# Need for Speed - Shift (same as Burnout Dominator)
-ULUS10462 = true
-ULES01275 = true
-ULJM05494 = true
-NPJH50143 = true
-ULJM05738 = true
--- a/headless/StubHost.cpp
+++ b/headless/StubHost.cpp
@ -46,7 +46,7 @@ void HeadlessHost::SendDebugScreenshot(const u8 *pixbuf, u32 w, u32 h) {
 	const static u32 FRAME_HEIGHT = 272;

 	GPUDebugBuffer buffer;
-	gpuDebug->GetCurrentFramebuffer(buffer, GPU_DBG_FRAMEBUF_RENDER);
+	gpuDebug->GetCurrentFramebuffer(buffer, GPU_DBG_FRAMEBUF_DISPLAY);
 	const std::vector<u32> pixels = TranslateDebugBufferToCompare(&buffer, 512, 272);

 	ScreenshotComparer comparer(pixels, FRAME_STRIDE, FRAME_WIDTH, FRAME_HEIGHT);
--- a/test.py
+++ b/test.py
@ -147,8 +147,10 @@ tests_good = [
  "gpu/commands/blend",
  "gpu/commands/blend565",
  "gpu/commands/blocktransfer",
+  "gpu/commands/cull",
  "gpu/commands/fog",
  "gpu/commands/material",
+  "gpu/complex/complex",
  "gpu/displaylist/alignment",
  "gpu/dither/dither",
  "gpu/filtering/mipmaplinear",
@ -159,6 +161,7 @@ tests_good = [
  "gpu/ge/queue",
  "gpu/primitives/indices",
  "gpu/primitives/invalidprim",
+  "gpu/primitives/points",
  "gpu/primitives/trianglefan",
  "gpu/primitives/trianglestrip",
  "gpu/primitives/triangles",
@ -181,6 +184,7 @@ tests_good = [
  "gpu/texfunc/replace",
  "gpu/textures/mipmap",
  "gpu/textures/rotate",
+  "gpu/vertices/colors",
  "hash/hash",
  "hle/check_not_used_uids",
  "intr/intr",
@ -387,9 +391,7 @@ tests_next = [
  "font/shadowglyphimageclip",
  "font/shadowinfo",
  "gpu/clipping/guardband",
-  "gpu/commands/cull",
  "gpu/commands/light",
-  "gpu/complex/complex",
  "gpu/depth/precision",
  "gpu/displaylist/state",
  "gpu/filtering/linear",
@ -404,7 +406,6 @@ tests_next = [
  "gpu/primitives/immediate",
  "gpu/primitives/lines",
  "gpu/primitives/linestrip",
-  "gpu/primitives/points",
  "gpu/primitives/rectangles",
  "gpu/primitives/spline",
  "gpu/reflection/reflection",
@ -415,7 +416,6 @@ tests_next = [
  "gpu/simple/simple",
  "gpu/textures/size",
  "gpu/triangle/triangle",
-  "gpu/vertices/colors",
  "gpu/vertices/texcoords",
  "intr/registersub",
  "intr/releasesub",