Merge branch 'master' into compat_openxr_gta

This commit is contained in:
Lubos 2022-09-20 21:46:33 +02:00
commit c4dbd6d045
141 changed files with 2419 additions and 1686 deletions

View File

@ -623,6 +623,8 @@ add_library(Common STATIC
Common/GPU/Vulkan/VulkanRenderManager.h
Common/GPU/Vulkan/VulkanQueueRunner.cpp
Common/GPU/Vulkan/VulkanQueueRunner.h
Common/GPU/Vulkan/VulkanFrameData.cpp
Common/GPU/Vulkan/VulkanFrameData.h
Common/Input/GestureDetector.cpp
Common/Input/GestureDetector.h
Common/Input/KeyCodes.h

View File

@ -441,6 +441,7 @@
<ClInclude Include="GPU\Vulkan\VulkanBarrier.h" />
<ClInclude Include="GPU\Vulkan\VulkanContext.h" />
<ClInclude Include="GPU\Vulkan\VulkanDebug.h" />
<ClInclude Include="GPU\Vulkan\VulkanFrameData.h" />
<ClInclude Include="GPU\Vulkan\VulkanImage.h" />
<ClInclude Include="GPU\Vulkan\VulkanLoader.h" />
<ClInclude Include="GPU\Vulkan\VulkanMemory.h" />
@ -861,6 +862,7 @@
<ClCompile Include="GPU\Vulkan\VulkanBarrier.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanContext.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanDebug.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanFrameData.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanImage.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanLoader.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanMemory.cpp" />

View File

@ -419,6 +419,9 @@
<Filter>GPU\Vulkan</Filter>
</ClInclude>
<ClInclude Include="RiscVEmitter.h" />
<ClInclude Include="GPU\Vulkan\VulkanFrameData.h">
<Filter>GPU\Vulkan</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="ABI.cpp" />
@ -791,6 +794,9 @@
<Filter>GPU\Vulkan</Filter>
</ClCompile>
<ClCompile Include="RiscVEmitter.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanFrameData.cpp">
<Filter>GPU\Vulkan</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Crypto">

View File

@ -270,6 +270,7 @@ D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *de
caps_.anisoSupported = true;
caps_.textureNPOTFullySupported = true;
caps_.fragmentShaderDepthWriteSupported = true;
caps_.blendMinMaxSupported = true;
D3D11_FEATURE_DATA_D3D11_OPTIONS options{};
HRESULT result = device_->CheckFeatureSupport(D3D11_FEATURE_D3D11_OPTIONS, &options, sizeof(options));

View File

@ -646,6 +646,63 @@ void D3D9Context::InvalidateCachedState() {
curPipeline_ = nullptr;
}
// TODO: Move this detection elsewhere when it's needed elsewhere, not before. It's ugly.
// Source: https://envytools.readthedocs.io/en/latest/hw/pciid.html#gf100
enum NVIDIAGeneration {
NV_PRE_KEPLER,
NV_KEPLER,
NV_MAXWELL,
NV_PASCAL,
NV_VOLTA,
NV_TURING, // or later
};
static NVIDIAGeneration NVIDIAGetDeviceGeneration(int deviceID) {
if (deviceID >= 0x1180 && deviceID <= 0x11bf)
return NV_KEPLER; // GK104
if (deviceID >= 0x11c0 && deviceID <= 0x11fa)
return NV_KEPLER; // GK106
if (deviceID >= 0x0fc0 && deviceID <= 0x0fff)
return NV_KEPLER; // GK107
if (deviceID >= 0x1003 && deviceID <= 0x1028)
return NV_KEPLER; // GK110(B)
if (deviceID >= 0x1280 && deviceID <= 0x12ba)
return NV_KEPLER; // GK208
if (deviceID >= 0x1381 && deviceID <= 0x13b0)
return NV_MAXWELL; // GM107
if (deviceID >= 0x1340 && deviceID <= 0x134d)
return NV_MAXWELL; // GM108
if (deviceID >= 0x13c0 && deviceID <= 0x13d9)
return NV_MAXWELL; // GM204
if (deviceID >= 0x1401 && deviceID <= 0x1427)
return NV_MAXWELL; // GM206
if (deviceID >= 0x15f7 && deviceID <= 0x15f9)
return NV_PASCAL; // GP100
if (deviceID >= 0x1b00 && deviceID <= 0x1b38)
return NV_PASCAL; // GP102
if (deviceID >= 0x1b80 && deviceID <= 0x1be1)
return NV_PASCAL; // GP104
if (deviceID >= 0x1c02 && deviceID <= 0x1c62)
return NV_PASCAL; // GP106
if (deviceID >= 0x1c81 && deviceID <= 0x1c92)
return NV_PASCAL; // GP107
if (deviceID >= 0x1d01 && deviceID <= 0x1d12)
return NV_PASCAL; // GP108
if (deviceID >= 0x1d81 && deviceID <= 0x1dba)
return NV_VOLTA; // GV100
if (deviceID >= 0x1e02 && deviceID <= 0x1e3c)
return NV_TURING; // TU102
if (deviceID >= 0x1e82 && deviceID <= 0x1ed0)
return NV_TURING; // TU104
if (deviceID >= 0x1f02 && deviceID <= 0x1f51)
return NV_TURING; // TU106
if (deviceID >= 0x1e02)
return NV_TURING; // More TU models or later, probably.
return NV_PRE_KEPLER;
}
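A quick spot-check of the mapping above, using IDs from the envytools table it cites (hypothetical asserts, not part of the commit):
// Assumes <cassert>. IDs chosen to hit a listed range, the pre-Kepler fallthrough,
// and the ">= 0x1e02 means Turing or later" tail case.
assert(NVIDIAGetDeviceGeneration(0x0fc0) == NV_KEPLER);     // GK107, low end of its range
assert(NVIDIAGetDeviceGeneration(0x1c02) == NV_PASCAL);     // GP106
assert(NVIDIAGetDeviceGeneration(0x0de0) == NV_PRE_KEPLER); // GF108 (Fermi), below every range
assert(NVIDIAGetDeviceGeneration(0x2204) == NV_TURING);     // "more TU models or later"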
#define FB_DIV 1
#define FOURCC_INTZ ((D3DFORMAT)(MAKEFOURCC('I', 'N', 'T', 'Z')))
@ -665,14 +722,24 @@ D3D9Context::D3D9Context(IDirect3D9 *d3d, IDirect3D9Ex *d3dEx, int adapterId, ID
caps_.vendor = GPUVendor::VENDOR_UNKNOWN;
}
if (!FAILED(device->GetDeviceCaps(&d3dCaps_))) {
D3DCAPS9 caps;
ZeroMemory(&caps, sizeof(caps));
HRESULT result = 0;
if (deviceEx_) {
result = deviceEx_->GetDeviceCaps(&caps);
} else {
result = device_->GetDeviceCaps(&caps);
}
if (SUCCEEDED(result)) {
sprintf(shadeLangVersion_, "PS: %04x VS: %04x", d3dCaps_.PixelShaderVersion & 0xFFFF, d3dCaps_.VertexShaderVersion & 0xFFFF);
} else {
WARN_LOG(G3D, "Direct3D9: Failed to get the device caps!");
strcpy(shadeLangVersion_, "N/A");
}
caps_.deviceID = identifier_.DeviceId;
caps_.multiViewport = false;
caps_.anisoSupported = true;
caps_.depthRangeMinusOneToOne = false;
caps_.preferredDepthBufferFormat = DataFormat::D24_S8;
caps_.dualSourceBlend = false;
@ -684,8 +751,30 @@ D3D9Context::D3D9Context(IDirect3D9 *d3d, IDirect3D9Ex *d3dEx, int adapterId, ID
caps_.framebufferDepthCopySupported = false;
caps_.framebufferSeparateDepthCopySupported = false;
caps_.texture3DSupported = true;
caps_.textureNPOTFullySupported = true;
caps_.fragmentShaderDepthWriteSupported = true;
caps_.blendMinMaxSupported = true;
if ((caps.RasterCaps & D3DPRASTERCAPS_ANISOTROPY) != 0 && caps.MaxAnisotropy > 1) {
caps_.anisoSupported = true;
}
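// Full NPOT support: neither the conditional-NPOT restriction nor the pow2-only caps bit is set.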
if ((caps.TextureCaps & (D3DPTEXTURECAPS_NONPOW2CONDITIONAL | D3DPTEXTURECAPS_POW2)) == 0) {
caps_.textureNPOTFullySupported = true;
}
// VS range culling (killing triangles in the vertex shader using NaN) causes problems on Intel.
// Also causes problems on old NVIDIA.
switch (caps_.vendor) {
case Draw::GPUVendor::VENDOR_INTEL:
bugs_.Infest(Bugs::BROKEN_NAN_IN_CONDITIONAL);
break;
case Draw::GPUVendor::VENDOR_NVIDIA:
// Older NVIDIAs don't seem to like NaNs in their DX9 vertex shaders.
// No idea if KEPLER is the right cutoff, but let's go with it.
if (NVIDIAGetDeviceGeneration(caps_.deviceID) < NV_KEPLER) {
bugs_.Infest(Bugs::BROKEN_NAN_IN_CONDITIONAL);
}
break;
}
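Downstream, the shader generators are expected to consult this flag before emitting NaN-based culling; a minimal sketch of such a check (hypothetical call site, the real ones are not in this diff):
// Hypothetical consumer: fall back to clip/scissor-based culling instead of
// writing NaN to the output position in the vertex shader.
bool useNanCulling = !draw->GetBugs().Has(Draw::Bugs::BROKEN_NAN_IN_CONDITIONAL);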
if (d3d) {
D3DDISPLAYMODE displayMode;

View File

@ -814,7 +814,7 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
int logicOp = -1;
bool logicEnabled = false;
#endif
bool clipDistance0Enabled = false;
bool clipDistanceEnabled[8]{};
GLuint blendEqColor = (GLuint)-1;
GLuint blendEqAlpha = (GLuint)-1;
@ -1123,14 +1123,18 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
{
if (curProgram != c.program.program) {
glUseProgram(c.program.program->program);
if (c.program.program->use_clip_distance0 != clipDistance0Enabled) {
if (c.program.program->use_clip_distance0)
glEnable(GL_CLIP_DISTANCE0);
else
glDisable(GL_CLIP_DISTANCE0);
clipDistance0Enabled = c.program.program->use_clip_distance0;
}
curProgram = c.program.program;
for (size_t i = 0; i < ARRAY_SIZE(clipDistanceEnabled); ++i) {
if (c.program.program->use_clip_distance[i] == clipDistanceEnabled[i])
continue;
if (c.program.program->use_clip_distance[i])
glEnable(GL_CLIP_DISTANCE0 + (GLenum)i);
else
glDisable(GL_CLIP_DISTANCE0 + (GLenum)i);
clipDistanceEnabled[i] = c.program.program->use_clip_distance[i];
}
}
CHECK_GL_ERROR_IF_DEBUG();
break;
@ -1371,8 +1375,10 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
glDisable(GL_COLOR_LOGIC_OP);
}
#endif
if (clipDistance0Enabled)
glDisable(GL_CLIP_DISTANCE0);
for (size_t i = 0; i < ARRAY_SIZE(clipDistanceEnabled); ++i) {
if (clipDistanceEnabled[i])
glDisable(GL_CLIP_DISTANCE0 + (GLenum)i);
}
if ((colorMask & 15) != 15)
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
CHECK_GL_ERROR_IF_DEBUG();
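Both loops above are instances of the shadow-state idiom: mirror what GL currently has enabled and only call into the driver on transitions. The same pattern, distilled into a hypothetical helper:
template <size_t N>
static void SyncClipDistances(const bool (&wanted)[N], bool (&current)[N]) {
	for (size_t i = 0; i < N; ++i) {
		if (wanted[i] == current[i])
			continue;  // state already matches; skip the redundant GL call
		if (wanted[i])
			glEnable(GL_CLIP_DISTANCE0 + (GLenum)i);
		else
			glDisable(GL_CLIP_DISTANCE0 + (GLenum)i);
		current[i] = wanted[i];
	}
}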

View File

@ -579,7 +579,6 @@ void GLRenderManager::EndSubmitFrame(int frame) {
void GLRenderManager::Run(int frame) {
BeginSubmitFrame(frame);
FrameData &frameData = frameData_[frame];
auto &stepsOnThread = frameData_[frame].steps;

View File

@ -91,6 +91,13 @@ public:
std::string error;
};
struct GLRProgramFlags {
bool supportDualSource : 1;
bool useClipDistance0 : 1;
bool useClipDistance1 : 1;
bool useClipDistance2 : 1;
};
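The flags struct replaces the growing tail of bool parameters on CreateProgram; a typical call site then reads (cf. OpenGLPipeline::LinkShaders further down):
GLRProgramFlags flags{};        // all flags default to false
flags.supportDualSource = true; // hypothetical: dual-source blending is available
flags.useClipDistance0 = true;  // hypothetical: the shader writes gl_ClipDistance[0]
program_ = render_->CreateProgram(shaders, semantics, queries, initializers, flags);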
class GLRProgram {
public:
~GLRProgram() {
@ -119,7 +126,7 @@ public:
std::vector<Semantic> semantics_;
std::vector<UniformLocQuery> queries_;
std::vector<Initializer> initialize_;
bool use_clip_distance0 = false;
bool use_clip_distance[8]{};
struct UniformInfo {
int loc_;
@ -427,15 +434,17 @@ public:
// not be an active render pass.
GLRProgram *CreateProgram(
std::vector<GLRShader *> shaders, std::vector<GLRProgram::Semantic> semantics, std::vector<GLRProgram::UniformLocQuery> queries,
std::vector<GLRProgram::Initializer> initializers, bool supportDualSource, bool useClipDistance0) {
std::vector<GLRProgram::Initializer> initializers, const GLRProgramFlags &flags) {
GLRInitStep step{ GLRInitStepType::CREATE_PROGRAM };
_assert_(shaders.size() <= ARRAY_SIZE(step.create_program.shaders));
step.create_program.program = new GLRProgram();
step.create_program.program->semantics_ = semantics;
step.create_program.program->queries_ = queries;
step.create_program.program->initialize_ = initializers;
step.create_program.program->use_clip_distance0 = useClipDistance0;
step.create_program.support_dual_source = supportDualSource;
step.create_program.program->use_clip_distance[0] = flags.useClipDistance0;
step.create_program.program->use_clip_distance[1] = flags.useClipDistance1;
step.create_program.program->use_clip_distance[2] = flags.useClipDistance2;
step.create_program.support_dual_source = flags.supportDualSource;
_assert_msg_(shaders.size() > 0, "Can't create a program with zero shaders");
for (size_t i = 0; i < shaders.size(); i++) {
step.create_program.shaders[i] = shaders[i];
@ -1003,6 +1012,7 @@ private:
bool readyForFence = true;
bool readyForRun = false;
bool readyForSubmit = false;
bool skipSwap = false;
GLRRunType type = GLRRunType::END;

View File

@ -552,6 +552,8 @@ OpenGLContext::OpenGLContext() {
caps_.framebufferDepthBlitSupported = caps_.framebufferBlitSupported;
caps_.framebufferStencilBlitSupported = caps_.framebufferBlitSupported;
caps_.depthClampSupported = gl_extensions.ARB_depth_clamp;
caps_.blendMinMaxSupported = gl_extensions.EXT_blend_minmax;
if (gl_extensions.IsGLES) {
caps_.clipDistanceSupported = gl_extensions.EXT_clip_cull_distance || gl_extensions.APPLE_clip_distance;
caps_.cullDistanceSupported = gl_extensions.EXT_clip_cull_distance;
@ -711,8 +713,10 @@ OpenGLContext::OpenGLContext() {
}
}
if (gl_extensions.IsGLES) {
// NOTE: We only support framebuffer fetch on ES3 due to past issues.
if (gl_extensions.IsGLES && gl_extensions.GLES3) {
caps_.framebufferFetchSupported = (gl_extensions.EXT_shader_framebuffer_fetch || gl_extensions.ARM_shader_framebuffer_fetch);
if (gl_extensions.EXT_shader_framebuffer_fetch) {
shaderLanguageDesc_.framebufferFetchExtension = "#extension GL_EXT_shader_framebuffer_fetch : require";
shaderLanguageDesc_.lastFragData = gl_extensions.GLES3 ? "fragColor0" : "gl_LastFragData[0]";
@ -1234,7 +1238,8 @@ bool OpenGLPipeline::LinkShaders() {
}
}
program_ = render_->CreateProgram(linkShaders, semantics, queries, initialize, false, false);
GLRProgramFlags flags{};
program_ = render_->CreateProgram(linkShaders, semantics, queries, initialize, flags);
return true;
}

View File

@ -4,7 +4,7 @@
void VulkanBarrier::Flush(VkCommandBuffer cmd) {
if (!imageBarriers_.empty()) {
vkCmdPipelineBarrier(cmd, srcStageMask_, dstStageMask_, 0, 0, nullptr, 0, nullptr, (uint32_t)imageBarriers_.size(), imageBarriers_.data());
vkCmdPipelineBarrier(cmd, srcStageMask_, dstStageMask_, dependencyFlags_, 0, nullptr, 0, nullptr, (uint32_t)imageBarriers_.size(), imageBarriers_.data());
}
imageBarriers_.clear();
srcStageMask_ = 0;

View File

@ -21,6 +21,7 @@ public:
) {
srcStageMask_ |= srcStageMask;
dstStageMask_ |= dstStageMask;
dependencyFlags_ |= VK_DEPENDENCY_BY_REGION_BIT;
VkImageMemoryBarrier imageBarrier;
imageBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
@ -112,4 +113,5 @@ private:
VkPipelineStageFlags srcStageMask_ = 0;
VkPipelineStageFlags dstStageMask_ = 0;
std::vector<VkImageMemoryBarrier> imageBarriers_;
VkDependencyFlags dependencyFlags_ = 0;
};
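The new dependencyFlags_ matters when a barrier is recorded inside a render pass: Vulkan requires it to match a declared subpass self-dependency, including VK_DEPENDENCY_BY_REGION_BIT. Roughly what Flush() now issues for that case (a sketch; the masks mirror SelfDependencyBarrier further down):
vkCmdPipelineBarrier(cmd,
	VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,  // accumulated srcStageMask_
	VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,          // accumulated dstStageMask_
	VK_DEPENDENCY_BY_REGION_BIT,                    // dependencyFlags_, required in-renderpass
	0, nullptr, 0, nullptr,
	(uint32_t)imageBarriers_.size(), imageBarriers_.data());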

View File

@ -667,7 +667,10 @@ VkResult VulkanContext::CreateDevice() {
extensionsLookup_.KHR_create_renderpass2 = true;
extensionsLookup_.KHR_depth_stencil_resolve = EnableDeviceExtension(VK_KHR_DEPTH_STENCIL_RESOLVE_EXTENSION_NAME);
}
extensionsLookup_.EXT_shader_stencil_export = EnableDeviceExtension(VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME);
extensionsLookup_.EXT_fragment_shader_interlock = EnableDeviceExtension(VK_EXT_FRAGMENT_SHADER_INTERLOCK_EXTENSION_NAME);
extensionsLookup_.ARM_rasterization_order_attachment_access = EnableDeviceExtension(VK_ARM_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_EXTENSION_NAME);
VkDeviceCreateInfo device_info{ VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO };
device_info.queueCreateInfoCount = 1;

View File

@ -86,7 +86,6 @@ VKAPI_ATTR VkBool32 VKAPI_CALL VulkanDebugUtilsCallback(
} else {
WARN_LOG(G3D, "VKDEBUG: %s", msg.c_str());
}
// false indicates that the layer should not bail out of an
// API call that had validation failures. This may mean that the
// app dies inside the driver due to invalid parameter(s).
@ -94,3 +93,4 @@ VKAPI_ATTR VkBool32 VKAPI_CALL VulkanDebugUtilsCallback(
// keep that behavior here.
return false;
}

View File

@ -0,0 +1,207 @@
#include "VulkanFrameData.h"
#include "Common/Log.h"
void FrameData::Init(VulkanContext *vulkan, int index) {
this->index = index;
VkDevice device = vulkan->GetDevice();
VkCommandPoolCreateInfo cmd_pool_info = { VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO };
cmd_pool_info.queueFamilyIndex = vulkan->GetGraphicsQueueFamilyIndex();
cmd_pool_info.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
VkResult res = vkCreateCommandPool(device, &cmd_pool_info, nullptr, &cmdPoolInit);
_dbg_assert_(res == VK_SUCCESS);
res = vkCreateCommandPool(device, &cmd_pool_info, nullptr, &cmdPoolMain);
_dbg_assert_(res == VK_SUCCESS);
VkCommandBufferAllocateInfo cmd_alloc = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
cmd_alloc.commandPool = cmdPoolInit;
cmd_alloc.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
cmd_alloc.commandBufferCount = 1;
res = vkAllocateCommandBuffers(device, &cmd_alloc, &initCmd);
_dbg_assert_(res == VK_SUCCESS);
cmd_alloc.commandPool = cmdPoolMain;
res = vkAllocateCommandBuffers(device, &cmd_alloc, &mainCmd);
_dbg_assert_(res == VK_SUCCESS);
res = vkAllocateCommandBuffers(device, &cmd_alloc, &presentCmd);
_dbg_assert_(res == VK_SUCCESS);
// Create the frame fence pre-signaled (true) so it can be waited on immediately, even on the first frame.
fence = vulkan->CreateFence(true);
// This fence is used for synchronizing readbacks. It does not need to be created signaled.
readbackFence = vulkan->CreateFence(false);
VkQueryPoolCreateInfo query_ci{ VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
query_ci.queryCount = MAX_TIMESTAMP_QUERIES;
query_ci.queryType = VK_QUERY_TYPE_TIMESTAMP;
res = vkCreateQueryPool(device, &query_ci, nullptr, &profile.queryPool);
}
void FrameData::Destroy(VulkanContext *vulkan) {
VkDevice device = vulkan->GetDevice();
// TODO: I don't think freeing command buffers is necessary before destroying a pool.
vkFreeCommandBuffers(device, cmdPoolInit, 1, &initCmd);
vkFreeCommandBuffers(device, cmdPoolMain, 1, &mainCmd);
vkDestroyCommandPool(device, cmdPoolInit, nullptr);
vkDestroyCommandPool(device, cmdPoolMain, nullptr);
vkDestroyFence(device, fence, nullptr);
vkDestroyFence(device, readbackFence, nullptr);
vkDestroyQueryPool(device, profile.queryPool, nullptr);
}
void FrameData::AcquireNextImage(VulkanContext *vulkan, FrameDataShared &shared) {
_dbg_assert_(!hasAcquired);
// Get the index of the next available swapchain image, and a semaphore to block command buffer execution on.
VkResult res = vkAcquireNextImageKHR(vulkan->GetDevice(), vulkan->GetSwapchain(), UINT64_MAX, shared.acquireSemaphore, (VkFence)VK_NULL_HANDLE, &curSwapchainImage);
switch (res) {
case VK_SUCCESS:
hasAcquired = true;
break;
case VK_SUBOPTIMAL_KHR:
hasAcquired = true;
// Hopefully the resize will happen shortly. Ignore - one frame might look bad or something.
WARN_LOG(G3D, "VK_SUBOPTIMAL_KHR returned - ignoring");
break;
case VK_ERROR_OUT_OF_DATE_KHR:
// We do not set hasAcquired here!
WARN_LOG(G3D, "VK_ERROR_OUT_OF_DATE_KHR returned from AcquireNextImage - processing the frame, but not presenting");
skipSwap = true;
break;
default:
// Weird, shouldn't get any other values. Maybe lost device?
_assert_msg_(false, "vkAcquireNextImageKHR failed! result=%s", VulkanResultToString(res));
break;
}
}
VkResult FrameData::QueuePresent(VulkanContext *vulkan, FrameDataShared &shared) {
_dbg_assert_(hasAcquired);
hasAcquired = false;
_dbg_assert_(!skipSwap);
VkSwapchainKHR swapchain = vulkan->GetSwapchain();
VkPresentInfoKHR present = { VK_STRUCTURE_TYPE_PRESENT_INFO_KHR };
present.swapchainCount = 1;
present.pSwapchains = &swapchain;
present.pImageIndices = &curSwapchainImage;
present.pWaitSemaphores = &shared.renderingCompleteSemaphore;
present.waitSemaphoreCount = 1;
return vkQueuePresentKHR(vulkan->GetGraphicsQueue(), &present);
}
VkCommandBuffer FrameData::GetInitCmd(VulkanContext *vulkan) {
if (!hasInitCommands) {
VkCommandBufferBeginInfo begin = {
VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
nullptr,
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT
};
vkResetCommandPool(vulkan->GetDevice(), cmdPoolInit, 0);
VkResult res = vkBeginCommandBuffer(initCmd, &begin);
if (res != VK_SUCCESS) {
return VK_NULL_HANDLE;
}
hasInitCommands = true;
}
return initCmd;
}
void FrameData::SubmitPending(VulkanContext *vulkan, FrameSubmitType type, FrameDataShared &sharedData) {
VkCommandBuffer cmdBufs[2];
int numCmdBufs = 0;
VkFence fenceToTrigger = VK_NULL_HANDLE;
if (hasInitCommands) {
if (profilingEnabled_) {
// Pre-allocated query ID 1 - end of init cmdbuf.
vkCmdWriteTimestamp(initCmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, profile.queryPool, 1);
}
VkResult res = vkEndCommandBuffer(initCmd);
cmdBufs[numCmdBufs++] = initCmd;
_assert_msg_(res == VK_SUCCESS, "vkEndCommandBuffer failed (init)! result=%s", VulkanResultToString(res));
hasInitCommands = false;
}
if ((hasMainCommands || hasPresentCommands) && type == FrameSubmitType::Sync) {
fenceToTrigger = readbackFence;
}
if (hasMainCommands) {
VkResult res = vkEndCommandBuffer(mainCmd);
_assert_msg_(res == VK_SUCCESS, "vkEndCommandBuffer failed (main)! result=%s", VulkanResultToString(res));
cmdBufs[numCmdBufs++] = mainCmd;
hasMainCommands = false;
}
if (hasPresentCommands) {
VkResult res = vkEndCommandBuffer(presentCmd);
_assert_msg_(res == VK_SUCCESS, "vkEndCommandBuffer failed (present)! result=%s", VulkanResultToString(res));
cmdBufs[numCmdBufs++] = presentCmd;
hasPresentCommands = false;
if (type == FrameSubmitType::Present) {
fenceToTrigger = fence;
}
}
if (!numCmdBufs && fenceToTrigger == VK_NULL_HANDLE) {
// Nothing to do.
return;
}
VkSubmitInfo submit_info{ VK_STRUCTURE_TYPE_SUBMIT_INFO };
VkPipelineStageFlags waitStage[1]{ VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT };
if (type == FrameSubmitType::Present && !skipSwap) {
_dbg_assert_(hasAcquired);
submit_info.waitSemaphoreCount = 1;
submit_info.pWaitSemaphores = &sharedData.acquireSemaphore;
submit_info.pWaitDstStageMask = waitStage;
}
submit_info.commandBufferCount = (uint32_t)numCmdBufs;
submit_info.pCommandBuffers = cmdBufs;
if (type == FrameSubmitType::Present && !skipSwap) {
submit_info.signalSemaphoreCount = 1;
submit_info.pSignalSemaphores = &sharedData.renderingCompleteSemaphore;
}
VkResult res = vkQueueSubmit(vulkan->GetGraphicsQueue(), 1, &submit_info, fenceToTrigger);
if (res == VK_ERROR_DEVICE_LOST) {
_assert_msg_(false, "Lost the Vulkan device in vkQueueSubmit! If this happens again, switch Graphics Backend away from Vulkan");
} else {
_assert_msg_(res == VK_SUCCESS, "vkQueueSubmit failed (main)! result=%s", VulkanResultToString(res));
}
if (type == FrameSubmitType::Sync) {
// Hard stall of the GPU, not ideal, but necessary so the CPU has the contents of the readback.
vkWaitForFences(vulkan->GetDevice(), 1, &readbackFence, true, UINT64_MAX);
vkResetFences(vulkan->GetDevice(), 1, &readbackFence);
}
// Notify anyone waiting on this frame; in the sync case this only happens after the wait above.
if (type == FrameSubmitType::Present || type == FrameSubmitType::Sync) {
VERBOSE_LOG(G3D, "PULL: Frame %d.readyForFence = true", index);
std::unique_lock<std::mutex> lock(push_mutex);
readyForFence = true; // misnomer in sync mode!
push_condVar.notify_all();
}
}
void FrameDataShared::Init(VulkanContext *vulkan) {
VkSemaphoreCreateInfo semaphoreCreateInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO };
semaphoreCreateInfo.flags = 0;
VkResult res = vkCreateSemaphore(vulkan->GetDevice(), &semaphoreCreateInfo, nullptr, &acquireSemaphore);
_dbg_assert_(res == VK_SUCCESS);
res = vkCreateSemaphore(vulkan->GetDevice(), &semaphoreCreateInfo, nullptr, &renderingCompleteSemaphore);
_dbg_assert_(res == VK_SUCCESS);
}
void FrameDataShared::Destroy(VulkanContext *vulkan) {
VkDevice device = vulkan->GetDevice();
vkDestroySemaphore(device, acquireSemaphore, nullptr);
vkDestroySemaphore(device, renderingCompleteSemaphore, nullptr);
}
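Taken together, these helpers carve one frame's lifecycle out of VulkanRenderManager; the expected per-frame sequence is roughly (a simplified sketch; the real sequencing lives in VulkanRenderManager and VulkanQueueRunner below):
// Assuming `frame` is a FrameData and `shared` the FrameDataShared:
frame.AcquireNextImage(vulkan, shared);            // swapchain image + acquire semaphore
VkCommandBuffer init = frame.GetInitCmd(vulkan);   // lazily begins the init cmdbuf
// ... record uploads into init, rendering into mainCmd / presentCmd ...
frame.SubmitPending(vulkan, FrameSubmitType::Present, shared);
if (!frame.skipSwap)
	frame.QueuePresent(vulkan, shared);            // waits on renderingCompleteSemaphore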

View File

@ -0,0 +1,101 @@
#pragma once
#include <cstdint>
#include <mutex>
#include <condition_variable>
#include "Common/GPU/Vulkan/VulkanContext.h"
struct VKRStep;
enum {
MAX_TIMESTAMP_QUERIES = 128,
};
enum class VKRRunType {
END,
SYNC,
};
struct QueueProfileContext {
VkQueryPool queryPool;
std::vector<std::string> timestampDescriptions;
std::string profileSummary;
double cpuStartTime;
double cpuEndTime;
};
struct FrameDataShared {
// Permanent objects
VkSemaphore acquireSemaphore = VK_NULL_HANDLE;
VkSemaphore renderingCompleteSemaphore = VK_NULL_HANDLE;
void Init(VulkanContext *vulkan);
void Destroy(VulkanContext *vulkan);
};
enum class FrameSubmitType {
Pending,
Sync,
Present,
};
// Per-frame data, round-robin so we can overlap submission with execution of the previous frame.
struct FrameData {
std::mutex push_mutex;
std::condition_variable push_condVar;
std::mutex pull_mutex;
std::condition_variable pull_condVar;
bool readyForFence = true;
bool readyForRun = false; // protected by pull_mutex
bool skipSwap = false;
VkFence fence;
VkFence readbackFence; // Strictly speaking we might only need a single one of these, globally.
// These are on different threads so need separate pools.
VkCommandPool cmdPoolInit; // Written to from main thread
VkCommandPool cmdPoolMain; // Written to from render thread, which also submits
VkCommandBuffer initCmd;
VkCommandBuffer mainCmd;
VkCommandBuffer presentCmd;
bool hasInitCommands = false;
bool hasMainCommands = false;
bool hasPresentCommands = false;
bool hasAcquired = false;
std::vector<VKRStep *> steps;
// Swapchain.
uint32_t curSwapchainImage = -1;
// Profiling.
QueueProfileContext profile;
bool profilingEnabled_;
void Init(VulkanContext *vulkan, int index);
void Destroy(VulkanContext *vulkan);
void AcquireNextImage(VulkanContext *vulkan, FrameDataShared &shared);
VkResult QueuePresent(VulkanContext *vulkan, FrameDataShared &shared);
VkCommandBuffer GetInitCmd(VulkanContext *vulkan);
// This will only submit if we are actually recording init commands.
void SubmitPending(VulkanContext *vulkan, FrameSubmitType type, FrameDataShared &shared);
VKRRunType RunType() const {
return runType_;
}
VKRRunType runType_ = VKRRunType::END;
private:
// Metadata for logging etc
int index;
};
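The two mutex/condvar pairs implement the handoff between the emu thread and the render thread: push_mutex guards readyForFence (the emu thread waits until the frame slot can be reused), pull_mutex guards readyForRun (the render thread waits for queued work). Condensed from BeginFrame and ThreadFunc further down:
// Emu thread, start of frame (cf. VulkanRenderManager::BeginFrame):
{
	std::unique_lock<std::mutex> lock(frame.push_mutex);
	while (!frame.readyForFence)
		frame.push_condVar.wait(lock);  // render thread not done with this slot yet
	frame.readyForFence = false;
}
// Render thread (cf. VulkanRenderManager::ThreadFunc):
{
	std::unique_lock<std::mutex> lock(frame.pull_mutex);
	while (!frame.readyForRun && run_)
		frame.pull_condVar.wait(lock);  // nothing queued for this slot yet
	frame.readyForRun = false;
}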

View File

@ -241,6 +241,8 @@ struct VulkanExtensions {
bool KHR_depth_stencil_resolve;
bool EXT_shader_stencil_export;
bool EXT_swapchain_colorspace;
bool ARM_rasterization_order_attachment_access;
bool EXT_fragment_shader_interlock;
// bool EXT_depth_range_unrestricted; // Allows depth outside [0.0, 1.0] in 32-bit float depth buffers.
};

View File

@ -34,8 +34,11 @@ RenderPassType MergeRPTypes(RenderPassType a, RenderPassType b) {
if (a == b) {
// Trivial merging case.
return a;
} else if (a == RP_TYPE_COLOR_DEPTH && b == RP_TYPE_COLOR_DEPTH_INPUT) {
return RP_TYPE_COLOR_DEPTH_INPUT;
} else if (a == RP_TYPE_COLOR_DEPTH_INPUT && b == RP_TYPE_COLOR_DEPTH) {
return RP_TYPE_COLOR_DEPTH_INPUT;
}
// More cases to be added later.
return a;
}
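In other words, the input-attachment variant is sticky under merging:
// Merging a plain color/depth pass with an input-attachment pass promotes the pair:
// MergeRPTypes(RP_TYPE_COLOR_DEPTH, RP_TYPE_COLOR_DEPTH_INPUT) == RP_TYPE_COLOR_DEPTH_INPUT
// MergeRPTypes(RP_TYPE_COLOR_DEPTH_INPUT, RP_TYPE_COLOR_DEPTH) == RP_TYPE_COLOR_DEPTH_INPUT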
@ -138,6 +141,171 @@ void VulkanQueueRunner::DestroyDeviceObjects() {
renderPasses_.Clear();
}
bool VulkanQueueRunner::CreateSwapchain(VkCommandBuffer cmdInit) {
VkResult res = vkGetSwapchainImagesKHR(vulkan_->GetDevice(), vulkan_->GetSwapchain(), &swapchainImageCount_, nullptr);
_dbg_assert_(res == VK_SUCCESS);
VkImage *swapchainImages = new VkImage[swapchainImageCount_];
res = vkGetSwapchainImagesKHR(vulkan_->GetDevice(), vulkan_->GetSwapchain(), &swapchainImageCount_, swapchainImages);
if (res != VK_SUCCESS) {
ERROR_LOG(G3D, "vkGetSwapchainImagesKHR failed");
delete[] swapchainImages;
return false;
}
for (uint32_t i = 0; i < swapchainImageCount_; i++) {
SwapchainImageData sc_buffer{};
sc_buffer.image = swapchainImages[i];
VkImageViewCreateInfo color_image_view = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
color_image_view.format = vulkan_->GetSwapchainFormat();
color_image_view.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
color_image_view.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
color_image_view.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
color_image_view.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
color_image_view.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
color_image_view.subresourceRange.baseMipLevel = 0;
color_image_view.subresourceRange.levelCount = 1;
color_image_view.subresourceRange.baseArrayLayer = 0;
color_image_view.subresourceRange.layerCount = 1;
color_image_view.viewType = VK_IMAGE_VIEW_TYPE_2D;
color_image_view.flags = 0;
color_image_view.image = sc_buffer.image;
// We leave the images as UNDEFINED; there's no need to pre-transition them, as
// the backbuffer renderpass starts out with them being auto-transitioned from UNDEFINED anyway.
// Also, turns out it's illegal to transition un-acquired images, thanks Hans-Kristian. See #11417.
res = vkCreateImageView(vulkan_->GetDevice(), &color_image_view, nullptr, &sc_buffer.view);
swapchainImages_.push_back(sc_buffer);
_dbg_assert_(res == VK_SUCCESS);
}
delete[] swapchainImages;
// Must be before InitBackbufferRenderPass.
if (InitDepthStencilBuffer(cmdInit)) {
InitBackbufferFramebuffers(vulkan_->GetBackbufferWidth(), vulkan_->GetBackbufferHeight());
}
return true;
}
bool VulkanQueueRunner::InitBackbufferFramebuffers(int width, int height) {
VkResult res;
// We share the same depth buffer but have multiple color buffers, see the loop below.
VkImageView attachments[2] = { VK_NULL_HANDLE, depth_.view };
VkFramebufferCreateInfo fb_info = { VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO };
fb_info.renderPass = GetCompatibleRenderPass()->Get(vulkan_, RP_TYPE_BACKBUFFER);
fb_info.attachmentCount = 2;
fb_info.pAttachments = attachments;
fb_info.width = width;
fb_info.height = height;
fb_info.layers = 1;
framebuffers_.resize(swapchainImageCount_);
for (uint32_t i = 0; i < swapchainImageCount_; i++) {
attachments[0] = swapchainImages_[i].view;
res = vkCreateFramebuffer(vulkan_->GetDevice(), &fb_info, nullptr, &framebuffers_[i]);
_dbg_assert_(res == VK_SUCCESS);
if (res != VK_SUCCESS) {
framebuffers_.clear();
return false;
}
}
return true;
}
bool VulkanQueueRunner::InitDepthStencilBuffer(VkCommandBuffer cmd) {
const VkFormat depth_format = vulkan_->GetDeviceInfo().preferredDepthStencilFormat;
int aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
VkImageCreateInfo image_info = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO };
image_info.imageType = VK_IMAGE_TYPE_2D;
image_info.format = depth_format;
image_info.extent.width = vulkan_->GetBackbufferWidth();
image_info.extent.height = vulkan_->GetBackbufferHeight();
image_info.extent.depth = 1;
image_info.mipLevels = 1;
image_info.arrayLayers = 1;
image_info.samples = VK_SAMPLE_COUNT_1_BIT;
image_info.queueFamilyIndexCount = 0;
image_info.pQueueFamilyIndices = nullptr;
image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
image_info.usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
image_info.flags = 0;
depth_.format = depth_format;
VmaAllocationCreateInfo allocCreateInfo{};
VmaAllocationInfo allocInfo{};
allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY;
VkResult res = vmaCreateImage(vulkan_->Allocator(), &image_info, &allocCreateInfo, &depth_.image, &depth_.alloc, &allocInfo);
_dbg_assert_(res == VK_SUCCESS);
if (res != VK_SUCCESS)
return false;
vulkan_->SetDebugName(depth_.image, VK_OBJECT_TYPE_IMAGE, "BackbufferDepth");
TransitionImageLayout2(cmd, depth_.image, 0, 1,
aspectMask,
VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
0, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT);
VkImageViewCreateInfo depth_view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
depth_view_info.image = depth_.image;
depth_view_info.format = depth_format;
depth_view_info.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
depth_view_info.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
depth_view_info.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
depth_view_info.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
depth_view_info.subresourceRange.aspectMask = aspectMask;
depth_view_info.subresourceRange.baseMipLevel = 0;
depth_view_info.subresourceRange.levelCount = 1;
depth_view_info.subresourceRange.baseArrayLayer = 0;
depth_view_info.subresourceRange.layerCount = 1;
depth_view_info.viewType = VK_IMAGE_VIEW_TYPE_2D;
depth_view_info.flags = 0;
VkDevice device = vulkan_->GetDevice();
res = vkCreateImageView(device, &depth_view_info, NULL, &depth_.view);
_dbg_assert_(res == VK_SUCCESS);
if (res != VK_SUCCESS)
return false;
return true;
}
void VulkanQueueRunner::DestroyBackBuffers() {
for (auto &image : swapchainImages_) {
vulkan_->Delete().QueueDeleteImageView(image.view);
}
swapchainImages_.clear();
if (depth_.view) {
vulkan_->Delete().QueueDeleteImageView(depth_.view);
}
if (depth_.image) {
_dbg_assert_(depth_.alloc);
vulkan_->Delete().QueueDeleteImageAllocation(depth_.image, depth_.alloc);
}
depth_ = {};
for (uint32_t i = 0; i < framebuffers_.size(); i++) {
_dbg_assert_(framebuffers_[i] != VK_NULL_HANDLE);
vulkan_->Delete().QueueDeleteFramebuffer(framebuffers_[i]);
}
framebuffers_.clear();
INFO_LOG(G3D, "Backbuffers destroyed");
}
static VkAttachmentLoadOp ConvertLoadAction(VKRRenderPassLoadAction action) {
switch (action) {
case VKRRenderPassLoadAction::CLEAR: return VK_ATTACHMENT_LOAD_OP_CLEAR;
@ -155,7 +323,12 @@ static VkAttachmentStoreOp ConvertStoreAction(VKRRenderPassStoreAction action) {
return VK_ATTACHMENT_STORE_OP_DONT_CARE; // avoid compiler warning
}
// Self-dependency: https://github.com/gpuweb/gpuweb/issues/442#issuecomment-547604827
// Also see https://www.khronos.org/registry/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-pipeline-barriers-subpass-self-dependencies
VkRenderPass CreateRP(VulkanContext *vulkan, const RPKey &key, RenderPassType rpType) {
bool selfDependency = rpType == RP_TYPE_COLOR_DEPTH_INPUT;
VkAttachmentDescription attachments[2] = {};
attachments[0].format = rpType == RP_TYPE_BACKBUFFER ? vulkan->GetSwapchainFormat() : VK_FORMAT_R8G8B8A8_UNORM;
attachments[0].samples = VK_SAMPLE_COUNT_1_BIT;
@ -179,7 +352,7 @@ VkRenderPass CreateRP(VulkanContext *vulkan, const RPKey &key, RenderPassType rp
VkAttachmentReference color_reference{};
color_reference.attachment = 0;
color_reference.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
color_reference.layout = selfDependency ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
VkAttachmentReference depth_reference{};
depth_reference.attachment = 1;
@ -188,8 +361,13 @@ VkRenderPass CreateRP(VulkanContext *vulkan, const RPKey &key, RenderPassType rp
VkSubpassDescription subpass{};
subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
subpass.flags = 0;
subpass.inputAttachmentCount = 0;
subpass.pInputAttachments = nullptr;
if (selfDependency) {
subpass.inputAttachmentCount = 1;
subpass.pInputAttachments = &color_reference;
} else {
subpass.inputAttachmentCount = 0;
subpass.pInputAttachments = nullptr;
}
subpass.colorAttachmentCount = 1;
subpass.pColorAttachments = &color_reference;
subpass.pResolveAttachments = nullptr;
@ -198,22 +376,40 @@ VkRenderPass CreateRP(VulkanContext *vulkan, const RPKey &key, RenderPassType rp
subpass.pPreserveAttachments = nullptr;
// Not sure if this is really necessary.
VkSubpassDependency dep{};
dep.srcSubpass = VK_SUBPASS_EXTERNAL;
dep.dstSubpass = 0;
dep.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
dep.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
dep.srcAccessMask = 0;
dep.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
VkSubpassDependency deps[2]{};
size_t numDeps = 0;
VkRenderPassCreateInfo rp{ VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO };
rp.attachmentCount = 2;
rp.pAttachments = attachments;
rp.subpassCount = 1;
rp.pSubpasses = &subpass;
if (rpType == RP_TYPE_BACKBUFFER) {
deps[numDeps].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[numDeps].dstSubpass = 0;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
deps[numDeps].srcAccessMask = 0;
deps[numDeps].dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
numDeps++;
rp.dependencyCount = 1;
rp.pDependencies = &dep;
}
if (selfDependency) {
deps[numDeps].dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT;
deps[numDeps].srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
deps[numDeps].dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
deps[numDeps].srcSubpass = 0;
deps[numDeps].dstSubpass = 0;
numDeps++;
}
if (numDeps > 0) {
rp.dependencyCount = (u32)numDeps;
rp.pDependencies = deps;
}
VkRenderPass pass;
@ -246,6 +442,30 @@ VKRRenderPass *VulkanQueueRunner::GetRenderPass(const RPKey &key) {
return pass;
}
// Must match the subpass self-dependency declared above.
void VulkanQueueRunner::SelfDependencyBarrier(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier) {
if (aspect & VK_IMAGE_ASPECT_COLOR_BIT) {
VkAccessFlags srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
VkAccessFlags dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
recordBarrier->TransitionImage(
img.image,
0,
1,
aspect,
VK_IMAGE_LAYOUT_GENERAL,
VK_IMAGE_LAYOUT_GENERAL,
srcAccessMask,
dstAccessMask,
srcStageMask,
dstStageMask
);
} else {
_assert_msg_(false, "Depth self-dependencies not yet supported");
}
}
void VulkanQueueRunner::PreprocessSteps(std::vector<VKRStep *> &steps) {
// Optimizes renderpasses, then sequences them.
// Planned optimizations:
@ -321,23 +541,47 @@ void VulkanQueueRunner::PreprocessSteps(std::vector<VKRStep *> &steps) {
}
}
void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &steps, QueueProfileContext *profile) {
void VulkanQueueRunner::RunSteps(FrameData &frameData, FrameDataShared &frameDataShared) {
QueueProfileContext *profile = frameData.profilingEnabled_ ? &frameData.profile : nullptr;
if (profile)
profile->cpuStartTime = time_now_d();
bool emitLabels = vulkan_->Extensions().EXT_debug_utils;
for (size_t i = 0; i < steps.size(); i++) {
const VKRStep &step = *steps[i];
VkCommandBuffer cmd = frameData.hasPresentCommands ? frameData.presentCmd : frameData.mainCmd;
for (size_t i = 0; i < frameData.steps.size(); i++) {
const VKRStep &step = *frameData.steps[i];
if (emitLabels) {
VkDebugUtilsLabelEXT labelInfo{ VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT };
labelInfo.pLabelName = step.tag;
vkCmdBeginDebugUtilsLabelEXT(cmd, &labelInfo);
vkCmdBeginDebugUtilsLabelEXT(frameData.mainCmd, &labelInfo);
}
switch (step.stepType) {
case VKRStepType::RENDER:
if (!step.render.framebuffer) {
frameData.SubmitPending(vulkan_, FrameSubmitType::Pending, frameDataShared);
// When stepping in the GE debugger, we can end up here multiple times in a "frame".
// So only acquire once.
if (!frameData.hasAcquired) {
frameData.AcquireNextImage(vulkan_, frameDataShared);
SetBackbuffer(framebuffers_[frameData.curSwapchainImage], swapchainImages_[frameData.curSwapchainImage].image);
}
_dbg_assert_(!frameData.hasPresentCommands);
// A RENDER step rendering to the backbuffer is normally the last step that happens in a frame,
// unless taking a screenshot, in which case there might be a READBACK_IMAGE after it.
// This is why we have to switch cmd to presentCmd in this case.
VkCommandBufferBeginInfo begin{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
vkBeginCommandBuffer(frameData.presentCmd, &begin);
frameData.hasPresentCommands = true;
cmd = frameData.presentCmd;
}
PerformRenderPass(step, cmd);
break;
case VKRStepType::COPY:
@ -368,10 +612,12 @@ void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &st
// Deleting all in one go should be easier on the instruction cache than deleting
// them as we go - and easier to debug because we can look backwards in the frame.
for (size_t i = 0; i < steps.size(); i++) {
delete steps[i];
for (auto step : frameData.steps) {
delete step;
}
frameData.steps.clear();
if (profile)
profile->cpuEndTime = time_now_d();
}
@ -628,6 +874,7 @@ std::string VulkanQueueRunner::StepToString(const VKRStep &step) const {
switch (step.render.renderPassType) {
case RP_TYPE_BACKBUFFER: renderCmd = "BACKBUF"; break;
case RP_TYPE_COLOR_DEPTH: renderCmd = "RENDER"; break;
case RP_TYPE_COLOR_DEPTH_INPUT: renderCmd = "RENDER_INPUT"; break;
default: renderCmd = "N/A";
}
snprintf(buffer, sizeof(buffer), "%s %s (draws: %d, %dx%d/%dx%d, fb: %p, )", renderCmd, step.tag, step.render.numDraws, actual_w, actual_h, w, h, step.render.framebuffer);
@ -817,6 +1064,9 @@ void VulkanQueueRunner::LogRenderPass(const VKRStep &pass, bool verbose) {
case VKRRenderCommand::REMOVED:
INFO_LOG(G3D, " (Removed)");
break;
case VKRRenderCommand::SELF_DEPENDENCY_BARRIER:
INFO_LOG(G3D, " SelfBarrier()");
break;
case VKRRenderCommand::BIND_GRAPHICS_PIPELINE:
INFO_LOG(G3D, " BindGraphicsPipeline(%x)", (int)(intptr_t)cmd.graphics_pipeline.pipeline);
break;
@ -1070,7 +1320,6 @@ void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer c
}
}
// Don't execute empty renderpasses that keep the contents.
if (step.commands.empty() && step.render.colorLoad == VKRRenderPassLoadAction::KEEP && step.render.depthLoad == VKRRenderPassLoadAction::KEEP && step.render.stencilLoad == VKRRenderPassLoadAction::KEEP) {
// Flush the pending barrier
@ -1120,6 +1369,7 @@ void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer c
// This reads the layout of the color and depth images, and chooses a render pass using them that
// will transition to the desired final layout.
//
// NOTE: Flushes recordBarrier_.
VKRRenderPass *renderPass = PerformBindFramebufferAsRenderTarget(step, cmd);
@ -1235,6 +1485,15 @@ void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer c
break;
}
case VKRRenderCommand::SELF_DEPENDENCY_BARRIER:
{
_assert_(step.render.pipelineFlags & PipelineFlags::USES_INPUT_ATTACHMENT);
VulkanBarrier barrier;
SelfDependencyBarrier(step.render.framebuffer->color, VK_IMAGE_ASPECT_COLOR_BIT, &barrier);
barrier.Flush(cmd);
break;
}
case VKRRenderCommand::PUSH_CONSTANTS:
vkCmdPushConstants(cmd, pipelineLayout, c.push.stages, c.push.offset, c.push.size, c.push.data);
break;

View File

@ -8,6 +8,7 @@
#include "Common/Data/Collections/Hashmaps.h"
#include "Common/GPU/Vulkan/VulkanContext.h"
#include "Common/GPU/Vulkan/VulkanBarrier.h"
#include "Common/GPU/Vulkan/VulkanFrameData.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Data/Collections/TinySet.h"
#include "Common/GPU/DataFormat.h"
@ -16,11 +17,11 @@ class VKRFramebuffer;
struct VKRGraphicsPipeline;
struct VKRComputePipeline;
struct VKRImage;
struct FrameData;
enum {
QUEUE_HACK_MGS2_ACID = 1,
QUEUE_HACK_SONIC = 2,
// Killzone PR = 4.
QUEUE_HACK_RENDERPASS_MERGE = 8,
};
@ -36,20 +37,24 @@ enum class VKRRenderCommand : uint8_t {
DRAW,
DRAW_INDEXED,
PUSH_CONSTANTS,
SELF_DEPENDENCY_BARRIER,
NUM_RENDER_COMMANDS,
};
enum PipelineFlags {
PIPELINE_FLAG_NONE = 0,
PIPELINE_FLAG_USES_LINES = (1 << 2),
PIPELINE_FLAG_USES_BLEND_CONSTANT = (1 << 3),
PIPELINE_FLAG_USES_DEPTH_STENCIL = (1 << 4), // Reads or writes the depth buffer.
enum class PipelineFlags {
NONE = 0,
USES_LINES = (1 << 2),
USES_BLEND_CONSTANT = (1 << 3),
USES_DEPTH_STENCIL = (1 << 4), // Reads or writes the depth buffer.
USES_INPUT_ATTACHMENT = (1 << 5),
};
ENUM_CLASS_BITOPS(PipelineFlags);
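Assuming ENUM_CLASS_BITOPS generates the usual bitwise operator overloads for the scoped enum, call sites keep the plain-enum ergonomics while gaining type safety:
PipelineFlags f = PipelineFlags::USES_LINES | PipelineFlags::USES_DEPTH_STENCIL;
if (f & PipelineFlags::USES_DEPTH_STENCIL) {
	// e.g. request depth/stencil load/store for the render pass
}
f |= PipelineFlags::USES_INPUT_ATTACHMENT;  // accumulates per pipeline, per step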
// Pipelines need to be created for the right type of render pass.
enum RenderPassType {
RP_TYPE_BACKBUFFER,
RP_TYPE_COLOR_DEPTH,
RP_TYPE_COLOR_DEPTH_INPUT,
// Later will add pure-color render passes.
RP_TYPE_COUNT,
};
@ -146,14 +151,6 @@ struct TransitionRequest {
VkImageLayout targetLayout;
};
struct QueueProfileContext {
VkQueryPool queryPool;
std::vector<std::string> timestampDescriptions;
std::string profileSummary;
double cpuStartTime;
double cpuEndTime;
};
class VKRRenderPass;
struct VKRStep {
@ -168,7 +165,6 @@ struct VKRStep {
union {
struct {
VKRFramebuffer *framebuffer;
// TODO: Look these up through renderPass?
VKRRenderPassLoadAction colorLoad;
VKRRenderPassLoadAction depthLoad;
VKRRenderPassLoadAction stencilLoad;
@ -183,7 +179,7 @@ struct VKRStep {
int numReads;
VkImageLayout finalColorLayout;
VkImageLayout finalDepthStencilLayout;
u32 pipelineFlags;
PipelineFlags pipelineFlags; // contains the self-dependency flag, in the form of USES_INPUT_ATTACHMENT
VkRect2D renderArea;
// Render pass type. Deduced after finishing recording the pass, from the used pipelines.
// NOTE: Storing the render pass here doesn't do much good, we change the compatible parameters (load/store ops) during step optimization.
@ -255,7 +251,7 @@ public:
}
void PreprocessSteps(std::vector<VKRStep *> &steps);
void RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &steps, QueueProfileContext *profile);
void RunSteps(FrameData &frameData, FrameDataShared &frameDataShared);
void LogSteps(const std::vector<VKRStep *> &steps, bool verbose);
std::string StepToString(const VKRStep &step) const;
@ -263,6 +259,14 @@ public:
void CreateDeviceObjects();
void DestroyDeviceObjects();
// Swapchain
void DestroyBackBuffers();
bool CreateSwapchain(VkCommandBuffer cmdInit);
bool HasBackbuffers() const {
return !framebuffers_.empty();
}
// Get a render pass that's compatible with all our framebuffers.
// Note that it's precached; we can't look it up in the map as this might be called on another thread.
VKRRenderPass *GetCompatibleRenderPass() const {
@ -302,6 +306,9 @@ public:
}
private:
bool InitBackbufferFramebuffers(int width, int height);
bool InitDepthStencilBuffer(VkCommandBuffer cmd); // Used for non-buffered rendering.
VKRRenderPass *PerformBindFramebufferAsRenderTarget(const VKRStep &pass, VkCommandBuffer cmd);
void PerformRenderPass(const VKRStep &pass, VkCommandBuffer cmd);
void PerformCopy(const VKRStep &pass, VkCommandBuffer cmd);
@ -324,6 +331,8 @@ private:
static void SetupTransitionToTransferSrc(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier);
static void SetupTransitionToTransferDst(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier);
static void SelfDependencyBarrier(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier);
VulkanContext *vulkan_;
VkFramebuffer backbuffer_ = VK_NULL_HANDLE;
@ -354,4 +363,20 @@ private:
// Stored here to help reuse the allocation.
VulkanBarrier recordBarrier_;
// Swap chain management
struct SwapchainImageData {
VkImage image;
VkImageView view;
};
std::vector<VkFramebuffer> framebuffers_;
std::vector<SwapchainImageData> swapchainImages_;
uint32_t swapchainImageCount_ = 0;
struct DepthBufferInfo {
VkFormat format = VK_FORMAT_UNDEFINED;
VkImage image = VK_NULL_HANDLE;
VmaAllocation alloc = VK_NULL_HANDLE;
VkImageView view = VK_NULL_HANDLE;
};
DepthBufferInfo depth_;
};

View File

@ -223,7 +223,7 @@ void CreateImage(VulkanContext *vulkan, VkCommandBuffer cmd, VKRImage &img, int
// Strictly speaking we don't yet need VK_IMAGE_USAGE_SAMPLED_BIT for depth buffers since we do not yet sample depth buffers.
ici.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
if (color) {
ici.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
ici.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT;
} else {
ici.usage |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
}
@ -288,53 +288,15 @@ void CreateImage(VulkanContext *vulkan, VkCommandBuffer cmd, VKRImage &img, int
}
VulkanRenderManager::VulkanRenderManager(VulkanContext *vulkan) : vulkan_(vulkan), queueRunner_(vulkan) {
VkSemaphoreCreateInfo semaphoreCreateInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO };
semaphoreCreateInfo.flags = 0;
VkResult res = vkCreateSemaphore(vulkan_->GetDevice(), &semaphoreCreateInfo, nullptr, &acquireSemaphore_);
_dbg_assert_(res == VK_SUCCESS);
res = vkCreateSemaphore(vulkan_->GetDevice(), &semaphoreCreateInfo, nullptr, &renderingCompleteSemaphore_);
_dbg_assert_(res == VK_SUCCESS);
inflightFramesAtStart_ = vulkan_->GetInflightFrames();
frameDataShared_.Init(vulkan);
for (int i = 0; i < inflightFramesAtStart_; i++) {
VkCommandPoolCreateInfo cmd_pool_info = { VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO };
cmd_pool_info.queueFamilyIndex = vulkan_->GetGraphicsQueueFamilyIndex();
cmd_pool_info.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
VkResult res = vkCreateCommandPool(vulkan_->GetDevice(), &cmd_pool_info, nullptr, &frameData_[i].cmdPoolInit);
_dbg_assert_(res == VK_SUCCESS);
res = vkCreateCommandPool(vulkan_->GetDevice(), &cmd_pool_info, nullptr, &frameData_[i].cmdPoolMain);
_dbg_assert_(res == VK_SUCCESS);
VkCommandBufferAllocateInfo cmd_alloc = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
cmd_alloc.commandPool = frameData_[i].cmdPoolInit;
cmd_alloc.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
cmd_alloc.commandBufferCount = 1;
res = vkAllocateCommandBuffers(vulkan_->GetDevice(), &cmd_alloc, &frameData_[i].initCmd);
_dbg_assert_(res == VK_SUCCESS);
cmd_alloc.commandPool = frameData_[i].cmdPoolMain;
res = vkAllocateCommandBuffers(vulkan_->GetDevice(), &cmd_alloc, &frameData_[i].mainCmd);
_dbg_assert_(res == VK_SUCCESS);
// Creating the frame fence with true so they can be instantly waited on the first frame
frameData_[i].fence = vulkan_->CreateFence(true);
// This fence one is used for synchronizing readbacks. Does not need preinitialization.
frameData_[i].readbackFence = vulkan_->CreateFence(false);
VkQueryPoolCreateInfo query_ci{ VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
query_ci.queryCount = MAX_TIMESTAMP_QUERIES;
query_ci.queryType = VK_QUERY_TYPE_TIMESTAMP;
res = vkCreateQueryPool(vulkan_->GetDevice(), &query_ci, nullptr, &frameData_[i].profile.queryPool);
frameData_[i].Init(vulkan, i);
}
queueRunner_.CreateDeviceObjects();
// AMD hack for issue #10097 (older drivers only.)
const auto &props = vulkan_->GetPhysicalDeviceProperties().properties;
if (props.vendorID == VULKAN_VENDOR_AMD && props.apiVersion < VK_API_VERSION_1_1) {
useThread_ = false;
}
}
bool VulkanRenderManager::CreateBackbuffers() {
@ -342,52 +304,14 @@ bool VulkanRenderManager::CreateBackbuffers() {
ERROR_LOG(G3D, "No swapchain - can't create backbuffers");
return false;
}
VkResult res = vkGetSwapchainImagesKHR(vulkan_->GetDevice(), vulkan_->GetSwapchain(), &swapchainImageCount_, nullptr);
_dbg_assert_(res == VK_SUCCESS);
VkImage *swapchainImages = new VkImage[swapchainImageCount_];
res = vkGetSwapchainImagesKHR(vulkan_->GetDevice(), vulkan_->GetSwapchain(), &swapchainImageCount_, swapchainImages);
if (res != VK_SUCCESS) {
ERROR_LOG(G3D, "vkGetSwapchainImagesKHR failed");
delete[] swapchainImages;
return false;
}
VkCommandBuffer cmdInit = GetInitCmd();
for (uint32_t i = 0; i < swapchainImageCount_; i++) {
SwapchainImageData sc_buffer{};
sc_buffer.image = swapchainImages[i];
VkImageViewCreateInfo color_image_view = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
color_image_view.format = vulkan_->GetSwapchainFormat();
color_image_view.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
color_image_view.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
color_image_view.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
color_image_view.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
color_image_view.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
color_image_view.subresourceRange.baseMipLevel = 0;
color_image_view.subresourceRange.levelCount = 1;
color_image_view.subresourceRange.baseArrayLayer = 0;
color_image_view.subresourceRange.layerCount = 1;
color_image_view.viewType = VK_IMAGE_VIEW_TYPE_2D;
color_image_view.flags = 0;
color_image_view.image = sc_buffer.image;
// We leave the images as UNDEFINED, there's no need to pre-transition them as
// the backbuffer renderpass starts out with them being auto-transitioned from UNDEFINED anyway.
// Also, turns out it's illegal to transition un-acquired images, thanks Hans-Kristian. See #11417.
res = vkCreateImageView(vulkan_->GetDevice(), &color_image_view, nullptr, &sc_buffer.view);
swapchainImages_.push_back(sc_buffer);
_dbg_assert_(res == VK_SUCCESS);
if (!queueRunner_.CreateSwapchain(cmdInit)) {
return false;
}
delete[] swapchainImages;
// Must be before InitBackbufferRenderPass.
if (InitDepthStencilBuffer(cmdInit)) {
InitBackbufferFramebuffers(vulkan_->GetBackbufferWidth(), vulkan_->GetBackbufferHeight());
}
curWidthRaw_ = -1;
curHeightRaw_ = -1;
@ -404,7 +328,7 @@ bool VulkanRenderManager::CreateBackbuffers() {
outOfDateFrames_ = 0;
// Start the thread.
if (useThread_ && HasBackbuffers()) {
if (HasBackbuffers()) {
run_ = true;
// Won't necessarily be 0.
threadInitFrame_ = vulkan_->GetCurFrame();
@ -417,57 +341,58 @@ bool VulkanRenderManager::CreateBackbuffers() {
}
void VulkanRenderManager::StopThread() {
if (useThread_ && run_) {
run_ = false;
// Stop the thread.
for (int i = 0; i < vulkan_->GetInflightFrames(); i++) {
auto &frameData = frameData_[i];
{
std::unique_lock<std::mutex> lock(frameData.push_mutex);
frameData.push_condVar.notify_all();
}
{
std::unique_lock<std::mutex> lock(frameData.pull_mutex);
frameData.pull_condVar.notify_all();
}
// Zero the queries so we don't try to pull them later.
frameData.profile.timestampDescriptions.clear();
}
thread_.join();
INFO_LOG(G3D, "Vulkan submission thread joined. Frame=%d", vulkan_->GetCurFrame());
compileCond_.notify_all();
compileThread_.join();
INFO_LOG(G3D, "Vulkan compiler thread joined.");
// Eat whatever has been queued up for this frame if anything.
Wipe();
// Wait for any fences to finish and be resignaled, so we don't have sync issues.
// Also clean out any queued data, which might refer to things that might not be valid
// when we restart...
for (int i = 0; i < vulkan_->GetInflightFrames(); i++) {
auto &frameData = frameData_[i];
_assert_(!frameData.readyForRun);
_assert_(frameData.steps.empty());
if (frameData.hasInitCommands) {
// Clear 'em out. This can happen on restart sometimes.
vkEndCommandBuffer(frameData.initCmd);
frameData.hasInitCommands = false;
}
frameData.readyForRun = false;
for (size_t i = 0; i < frameData.steps.size(); i++) {
delete frameData.steps[i];
}
frameData.steps.clear();
std::unique_lock<std::mutex> lock(frameData.push_mutex);
while (!frameData.readyForFence) {
VLOG("PUSH: Waiting for frame[%d].readyForFence = 1 (stop)", i);
frameData.push_condVar.wait(lock);
}
}
} else {
if (!run_) {
INFO_LOG(G3D, "Vulkan submission thread was already stopped.");
return;
}
run_ = false;
// Stop the thread.
for (int i = 0; i < vulkan_->GetInflightFrames(); i++) {
auto &frameData = frameData_[i];
{
std::unique_lock<std::mutex> lock(frameData.push_mutex);
frameData.push_condVar.notify_all();
}
{
std::unique_lock<std::mutex> lock(frameData.pull_mutex);
frameData.pull_condVar.notify_all();
}
// Zero the queries so we don't try to pull them later.
frameData.profile.timestampDescriptions.clear();
}
thread_.join();
INFO_LOG(G3D, "Vulkan submission thread joined. Frame=%d", vulkan_->GetCurFrame());
compileCond_.notify_all();
compileThread_.join();
INFO_LOG(G3D, "Vulkan compiler thread joined.");
// Eat whatever has been queued up for this frame if anything.
Wipe();
// Wait for any fences to finish and be resignaled, so we don't have sync issues.
// Also clean out any queued data, which might refer to things that might not be valid
// when we restart...
for (int i = 0; i < vulkan_->GetInflightFrames(); i++) {
auto &frameData = frameData_[i];
_assert_(!frameData.readyForRun);
_assert_(frameData.steps.empty());
if (frameData.hasInitCommands) {
// Clear 'em out. This can happen on restart sometimes.
vkEndCommandBuffer(frameData.initCmd);
frameData.hasInitCommands = false;
}
frameData.readyForRun = false;
for (size_t i = 0; i < frameData.steps.size(); i++) {
delete frameData.steps[i];
}
frameData.steps.clear();
std::unique_lock<std::mutex> lock(frameData.push_mutex);
while (!frameData.readyForFence) {
VLOG("PUSH: Waiting for frame[%d].readyForFence = 1 (stop)", i);
frameData.push_condVar.wait(lock);
}
}
}
@ -475,26 +400,7 @@ void VulkanRenderManager::DestroyBackbuffers() {
StopThread();
vulkan_->WaitUntilQueueIdle();
for (auto &image : swapchainImages_) {
vulkan_->Delete().QueueDeleteImageView(image.view);
}
swapchainImages_.clear();
if (depth_.view) {
vulkan_->Delete().QueueDeleteImageView(depth_.view);
}
if (depth_.image) {
_dbg_assert_(depth_.alloc);
vulkan_->Delete().QueueDeleteImageAllocation(depth_.image, depth_.alloc);
}
depth_ = {};
for (uint32_t i = 0; i < framebuffers_.size(); i++) {
_dbg_assert_(framebuffers_[i] != VK_NULL_HANDLE);
vulkan_->Delete().QueueDeleteFramebuffer(framebuffers_[i]);
}
framebuffers_.clear();
INFO_LOG(G3D, "Backbuffers destroyed");
queueRunner_.DestroyBackBuffers();
}
VulkanRenderManager::~VulkanRenderManager() {
@ -504,16 +410,9 @@ VulkanRenderManager::~VulkanRenderManager() {
DrainCompileQueue();
VkDevice device = vulkan_->GetDevice();
vkDestroySemaphore(device, acquireSemaphore_, nullptr);
vkDestroySemaphore(device, renderingCompleteSemaphore_, nullptr);
frameDataShared_.Destroy(vulkan_);
for (int i = 0; i < inflightFramesAtStart_; i++) {
vkFreeCommandBuffers(device, frameData_[i].cmdPoolInit, 1, &frameData_[i].initCmd);
vkFreeCommandBuffers(device, frameData_[i].cmdPoolMain, 1, &frameData_[i].mainCmd);
vkDestroyCommandPool(device, frameData_[i].cmdPoolInit, nullptr);
vkDestroyCommandPool(device, frameData_[i].cmdPoolMain, nullptr);
vkDestroyFence(device, frameData_[i].fence, nullptr);
vkDestroyFence(device, frameData_[i].readbackFence, nullptr);
vkDestroyQueryPool(device, frameData_[i].profile.queryPool, nullptr);
frameData_[i].Destroy(vulkan_);
}
queueRunner_.DestroyDeviceObjects();
}
@ -534,7 +433,9 @@ void VulkanRenderManager::CompileThreadFunc() {
break;
}
INFO_LOG(G3D, "Compilation thread has %d pipelines to create", (int)toCompile.size());
if (!toCompile.empty()) {
INFO_LOG(G3D, "Compilation thread has %d pipelines to create", (int)toCompile.size());
}
// TODO: Here we can sort the pending pipelines by vertex and fragment shaders,
// and split up further.
@ -574,6 +475,7 @@ void VulkanRenderManager::ThreadFunc() {
threadFrame = 0;
}
FrameData &frameData = frameData_[threadFrame];
std::unique_lock<std::mutex> lock(frameData.pull_mutex);
while (!frameData.readyForRun && run_) {
VLOG("PULL: Waiting for frame[%d].readyForRun", threadFrame);
@ -589,8 +491,7 @@ void VulkanRenderManager::ThreadFunc() {
// but that created a race condition where frames could end up not finished properly on resize etc.
// Only increment next time if we're done.
nextFrame = frameData.type == VKRRunType::END;
_dbg_assert_(frameData.type == VKRRunType::END || frameData.type == VKRRunType::SYNC);
nextFrame = frameData.RunType() == VKRRunType::END;
}
VLOG("PULL: Running frame %d", threadFrame);
if (firstFrame) {
@ -615,7 +516,7 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfile
FrameData &frameData = frameData_[curFrame];
// Make sure the very last command buffer from the frame before the previous has been fully executed.
if (useThread_) {
{
std::unique_lock<std::mutex> lock(frameData.push_mutex);
while (!frameData.readyForFence) {
VLOG("PUSH: Waiting for frame[%d].readyForFence = 1", curFrame);
@ -633,7 +534,6 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfile
// Can't set this until after the fence.
frameData.profilingEnabled_ = enableProfiling;
frameData.readbackFenceUsed = false;
uint64_t queryResults[MAX_TIMESTAMP_QUERIES];
@ -698,21 +598,7 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfile
VkCommandBuffer VulkanRenderManager::GetInitCmd() {
int curFrame = vulkan_->GetCurFrame();
FrameData &frameData = frameData_[curFrame];
if (!frameData.hasInitCommands) {
VkCommandBufferBeginInfo begin = {
VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
nullptr,
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT
};
vkResetCommandPool(vulkan_->GetDevice(), frameData.cmdPoolInit, 0);
VkResult res = vkBeginCommandBuffer(frameData.initCmd, &begin);
if (res != VK_SUCCESS) {
return VK_NULL_HANDLE;
}
frameData.hasInitCommands = true;
}
return frameData_[curFrame].initCmd;
return frameData_[curFrame].GetInitCmd(vulkan_);
}
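// A plausible sketch of the new FrameData::GetInitCmd, assuming it simply
// wraps the begin-on-demand logic removed above (the real code lives in the
// new Common/GPU/Vulkan/VulkanFrameData.cpp, which is not shown here):
//
// VkCommandBuffer FrameData::GetInitCmd(VulkanContext *vulkan) {
//     if (!hasInitCommands) {
//         VkCommandBufferBeginInfo begin{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
//         begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
//         vkResetCommandPool(vulkan->GetDevice(), cmdPoolInit, 0);
//         if (vkBeginCommandBuffer(initCmd, &begin) != VK_SUCCESS)
//             return VK_NULL_HANDLE;  // caller treats this as "no init cmd available"
//         hasInitCommands = true;
//     }
//     return initCmd;
// }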
VKRGraphicsPipeline *VulkanRenderManager::CreateGraphicsPipeline(VKRGraphicsPipelineDesc *desc, uint32_t variantBitmask, const char *tag) {
@ -771,16 +657,20 @@ void VulkanRenderManager::EndCurRenderStep() {
curRenderStep_->render.colorStore, curRenderStep_->render.depthStore, curRenderStep_->render.stencilStore,
};
RenderPassType rpType = RP_TYPE_COLOR_DEPTH;
// Save the accumulated pipeline flags so we can use that to configure the render pass.
// We'll often be able to avoid loading/saving the depth/stencil buffer.
curRenderStep_->render.pipelineFlags = curPipelineFlags_;
if (!curRenderStep_->render.framebuffer) {
rpType = RP_TYPE_BACKBUFFER;
} else if (curPipelineFlags_ & PipelineFlags::USES_INPUT_ATTACHMENT) {
// Not allowed on backbuffers.
rpType = RP_TYPE_COLOR_DEPTH_INPUT;
}
// TODO: Also add render pass types for depth/stencil-less.
VKRRenderPass *renderPass = queueRunner_.GetRenderPass(key);
curRenderStep_->render.renderPassType = rpType;
// Save the accumulated pipeline flags so we can use that to configure the render pass.
// We'll often be able to avoid loading/saving the depth/stencil buffer.
compileMutex_.lock();
bool needsCompile = false;
for (VKRGraphicsPipeline *pipeline : pipelinesToCheck_) {
@ -806,7 +696,12 @@ void VulkanRenderManager::EndCurRenderStep() {
// We no longer have a current render step.
curRenderStep_ = nullptr;
curPipelineFlags_ = 0;
curPipelineFlags_ = (PipelineFlags)0;
}
void VulkanRenderManager::BindCurrentFramebufferAsInputAttachment0(VkImageAspectFlags aspectBits) {
_dbg_assert_(curRenderStep_);
curRenderStep_->commands.push_back(VkRenderData{ VKRRenderCommand::SELF_DEPENDENCY_BARRIER });
}
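// Illustrative only: when the queue runner later executes SELF_DEPENDENCY_BARRIER,
// the natural implementation (assuming the render pass declares a by-region
// self-dependency on the subpass) is a vkCmdPipelineBarrier along these lines:
//
// VkMemoryBarrier barrier{ VK_STRUCTURE_TYPE_MEMORY_BARRIER };
// barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
// barrier.dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
// vkCmdPipelineBarrier(cmd,
//     VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
//     VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
//     VK_DEPENDENCY_BY_REGION_BIT,
//     1, &barrier, 0, nullptr, 0, nullptr);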
void VulkanRenderManager::BindFramebufferAsRenderTarget(VKRFramebuffer *fb, VKRRenderPassLoadAction color, VKRRenderPassLoadAction depth, VKRRenderPassLoadAction stencil, uint32_t clearColor, float clearDepth, uint8_t clearStencil, const char *tag) {
@ -1028,98 +923,6 @@ void VulkanRenderManager::CopyImageToMemorySync(VkImage image, int mipLevel, int
queueRunner_.CopyReadbackBuffer(w, h, destFormat, destFormat, pixelStride, pixels);
}
bool VulkanRenderManager::InitBackbufferFramebuffers(int width, int height) {
VkResult res;
// We share the same depth buffer but have multiple color buffers; see the loop below.
VkImageView attachments[2] = { VK_NULL_HANDLE, depth_.view };
VkFramebufferCreateInfo fb_info = { VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO };
fb_info.renderPass = queueRunner_.GetCompatibleRenderPass()->Get(vulkan_, RP_TYPE_BACKBUFFER);
fb_info.attachmentCount = 2;
fb_info.pAttachments = attachments;
fb_info.width = width;
fb_info.height = height;
fb_info.layers = 1;
framebuffers_.resize(swapchainImageCount_);
for (uint32_t i = 0; i < swapchainImageCount_; i++) {
attachments[0] = swapchainImages_[i].view;
res = vkCreateFramebuffer(vulkan_->GetDevice(), &fb_info, nullptr, &framebuffers_[i]);
_dbg_assert_(res == VK_SUCCESS);
if (res != VK_SUCCESS) {
framebuffers_.clear();
return false;
}
}
return true;
}
bool VulkanRenderManager::InitDepthStencilBuffer(VkCommandBuffer cmd) {
const VkFormat depth_format = vulkan_->GetDeviceInfo().preferredDepthStencilFormat;
int aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
VkImageCreateInfo image_info = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO };
image_info.imageType = VK_IMAGE_TYPE_2D;
image_info.format = depth_format;
image_info.extent.width = vulkan_->GetBackbufferWidth();
image_info.extent.height = vulkan_->GetBackbufferHeight();
image_info.extent.depth = 1;
image_info.mipLevels = 1;
image_info.arrayLayers = 1;
image_info.samples = VK_SAMPLE_COUNT_1_BIT;
image_info.queueFamilyIndexCount = 0;
image_info.pQueueFamilyIndices = nullptr;
image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
image_info.usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
image_info.flags = 0;
depth_.format = depth_format;
VmaAllocationCreateInfo allocCreateInfo{};
VmaAllocationInfo allocInfo{};
allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY;
VkResult res = vmaCreateImage(vulkan_->Allocator(), &image_info, &allocCreateInfo, &depth_.image, &depth_.alloc, &allocInfo);
_dbg_assert_(res == VK_SUCCESS);
if (res != VK_SUCCESS)
return false;
vulkan_->SetDebugName(depth_.image, VK_OBJECT_TYPE_IMAGE, "BackbufferDepth");
TransitionImageLayout2(cmd, depth_.image, 0, 1,
aspectMask,
VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
0, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT);
VkImageViewCreateInfo depth_view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
depth_view_info.image = depth_.image;
depth_view_info.format = depth_format;
depth_view_info.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
depth_view_info.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
depth_view_info.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
depth_view_info.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
depth_view_info.subresourceRange.aspectMask = aspectMask;
depth_view_info.subresourceRange.baseMipLevel = 0;
depth_view_info.subresourceRange.levelCount = 1;
depth_view_info.subresourceRange.baseArrayLayer = 0;
depth_view_info.subresourceRange.layerCount = 1;
depth_view_info.viewType = VK_IMAGE_VIEW_TYPE_2D;
depth_view_info.flags = 0;
VkDevice device = vulkan_->GetDevice();
res = vkCreateImageView(device, &depth_view_info, NULL, &depth_.view);
_dbg_assert_(res == VK_SUCCESS);
if (res != VK_SUCCESS)
return false;
return true;
}
static void RemoveDrawCommands(std::vector<VkRenderData> *cmds) {
// Here we remove any DRAW type commands when we hit a CLEAR.
for (auto &c : *cmds) {
@ -1359,6 +1162,9 @@ VkImageView VulkanRenderManager::BindFramebufferAsTexture(VKRFramebuffer *fb, in
}
}
// Called on main thread.
// Sends the collected commands to the render thread. Submit-latency should be
// measured from here, probably.
void VulkanRenderManager::Finish() {
EndCurRenderStep();
@ -1371,18 +1177,14 @@ void VulkanRenderManager::Finish() {
int curFrame = vulkan_->GetCurFrame();
FrameData &frameData = frameData_[curFrame];
if (!useThread_) {
frameData.steps = std::move(steps_);
steps_.clear();
frameData.type = VKRRunType::END;
Run(curFrame);
} else {
{
std::unique_lock<std::mutex> lock(frameData.pull_mutex);
VLOG("PUSH: Frame[%d].readyForRun = true", curFrame);
frameData.steps = std::move(steps_);
steps_.clear();
frameData.readyForRun = true;
frameData.type = VKRRunType::END;
frameData.runType_ = VKRRunType::END;
frameData.pull_condVar.notify_all();
}
vulkan_->EndFrame();
@ -1397,118 +1199,39 @@ void VulkanRenderManager::Wipe() {
steps_.clear();
}
// Called on the render thread.
//
// Can be called multiple times with no bad side effects. This is so that we can either begin a frame the normal way,
// or stop it in the middle for a synchronous readback, then start over again mostly normally but without repeating
// the backbuffer image acquisition.
void VulkanRenderManager::BeginSubmitFrame(int frame) {
FrameData &frameData = frameData_[frame];
if (!frameData.hasBegun) {
// Get the index of the next available swapchain image, and a semaphore to block command buffer execution on.
VkResult res = vkAcquireNextImageKHR(vulkan_->GetDevice(), vulkan_->GetSwapchain(), UINT64_MAX, acquireSemaphore_, (VkFence)VK_NULL_HANDLE, &frameData.curSwapchainImage);
if (res == VK_SUBOPTIMAL_KHR) {
// Hopefully the resize will happen shortly. Ignore - at worst, one frame will look wrong.
WARN_LOG(G3D, "VK_SUBOPTIMAL_KHR returned - ignoring");
} else if (res == VK_ERROR_OUT_OF_DATE_KHR) {
WARN_LOG(G3D, "VK_ERROR_OUT_OF_DATE_KHR returned - processing the frame, but not presenting");
frameData.skipSwap = true;
} else {
_assert_msg_(res == VK_SUCCESS, "vkAcquireNextImageKHR failed! result=%s", VulkanResultToString(res));
}
// At most the init command buffer should be pending here (that one came from the other thread).
_dbg_assert_(!frameData.hasPresentCommands);
frameData.SubmitPending(vulkan_, FrameSubmitType::Pending, frameDataShared_);
if (!frameData.hasMainCommands) {
// Effectively resets both main and present command buffers, since they both live in this pool.
// We always record main commands first, so we don't need to reset the present command buffer separately.
vkResetCommandPool(vulkan_->GetDevice(), frameData.cmdPoolMain, 0);
VkCommandBufferBeginInfo begin{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
res = vkBeginCommandBuffer(frameData.mainCmd, &begin);
VkResult res = vkBeginCommandBuffer(frameData.mainCmd, &begin);
frameData.hasMainCommands = true;
_assert_msg_(res == VK_SUCCESS, "vkBeginCommandBuffer failed! result=%s", VulkanResultToString(res));
queueRunner_.SetBackbuffer(framebuffers_[frameData.curSwapchainImage], swapchainImages_[frameData.curSwapchainImage].image);
frameData.hasBegun = true;
}
}
void VulkanRenderManager::Submit(int frame, bool triggerFrameFence) {
FrameData &frameData = frameData_[frame];
if (frameData.hasInitCommands) {
if (frameData.profilingEnabled_ && triggerFrameFence) {
// Pre-allocated query ID 1.
vkCmdWriteTimestamp(frameData.initCmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, frameData.profile.queryPool, 1);
}
VkResult res = vkEndCommandBuffer(frameData.initCmd);
_assert_msg_(res == VK_SUCCESS, "vkEndCommandBuffer failed (init)! result=%s", VulkanResultToString(res));
}
VkResult res = vkEndCommandBuffer(frameData.mainCmd);
_assert_msg_(res == VK_SUCCESS, "vkEndCommandBuffer failed (main)! result=%s", VulkanResultToString(res));
VkCommandBuffer cmdBufs[2];
int numCmdBufs = 0;
if (frameData.hasInitCommands) {
cmdBufs[numCmdBufs++] = frameData.initCmd;
if (splitSubmit_) {
// Send the init commands off separately. We used this once to confirm that the cause of a device loss was in the init cmdbuf.
VkSubmitInfo submit_info{ VK_STRUCTURE_TYPE_SUBMIT_INFO };
submit_info.commandBufferCount = (uint32_t)numCmdBufs;
submit_info.pCommandBuffers = cmdBufs;
res = vkQueueSubmit(vulkan_->GetGraphicsQueue(), 1, &submit_info, VK_NULL_HANDLE);
if (res == VK_ERROR_DEVICE_LOST) {
_assert_msg_(false, "Lost the Vulkan device in split submit! If this happens again, switch Graphics Backend away from Vulkan");
} else {
_assert_msg_(res == VK_SUCCESS, "vkQueueSubmit failed (init)! result=%s", VulkanResultToString(res));
}
numCmdBufs = 0;
}
}
cmdBufs[numCmdBufs++] = frameData.mainCmd;
VkSubmitInfo submit_info{ VK_STRUCTURE_TYPE_SUBMIT_INFO };
VkPipelineStageFlags waitStage[1]{ VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT };
if (triggerFrameFence && !frameData.skipSwap) {
submit_info.waitSemaphoreCount = 1;
submit_info.pWaitSemaphores = &acquireSemaphore_;
submit_info.pWaitDstStageMask = waitStage;
}
submit_info.commandBufferCount = (uint32_t)numCmdBufs;
submit_info.pCommandBuffers = cmdBufs;
if (triggerFrameFence && !frameData.skipSwap) {
submit_info.signalSemaphoreCount = 1;
submit_info.pSignalSemaphores = &renderingCompleteSemaphore_;
}
res = vkQueueSubmit(vulkan_->GetGraphicsQueue(), 1, &submit_info, triggerFrameFence ? frameData.fence : frameData.readbackFence);
if (res == VK_ERROR_DEVICE_LOST) {
_assert_msg_(false, "Lost the Vulkan device in vkQueueSubmit! If this happens again, switch Graphics Backend away from Vulkan");
} else {
_assert_msg_(res == VK_SUCCESS, "vkQueueSubmit failed (main, split=%d)! result=%s", (int)splitSubmit_, VulkanResultToString(res));
}
// When !triggerFrameFence, we notify after syncing with Vulkan.
if (useThread_ && triggerFrameFence) {
VLOG("PULL: Frame %d.readyForFence = true", frame);
std::unique_lock<std::mutex> lock(frameData.push_mutex);
frameData.readyForFence = true;
frameData.push_condVar.notify_all();
}
frameData.hasInitCommands = false;
}
// Called on the render thread.
void VulkanRenderManager::EndSubmitFrame(int frame) {
FrameData &frameData = frameData_[frame];
frameData.hasBegun = false;
Submit(frame, true);
frameData.SubmitPending(vulkan_, FrameSubmitType::Present, frameDataShared_);
if (!frameData.skipSwap) {
VkSwapchainKHR swapchain = vulkan_->GetSwapchain();
VkPresentInfoKHR present = { VK_STRUCTURE_TYPE_PRESENT_INFO_KHR };
present.swapchainCount = 1;
present.pSwapchains = &swapchain;
present.pImageIndices = &frameData.curSwapchainImage;
present.pWaitSemaphores = &renderingCompleteSemaphore_;
present.waitSemaphoreCount = 1;
VkResult res = vkQueuePresentKHR(vulkan_->GetGraphicsQueue(), &present);
VkResult res = frameData.QueuePresent(vulkan_, frameDataShared_);
if (res == VK_ERROR_OUT_OF_DATE_KHR) {
// We clearly didn't get this in vkAcquireNextImageKHR because of the skipSwap check above.
// Do the increment.
@ -1528,18 +1251,29 @@ void VulkanRenderManager::EndSubmitFrame(int frame) {
}
}
void VulkanRenderManager::EndSyncFrame(int frame) {
FrameData &frameData = frameData_[frame];
// The submit will trigger the readbackFence, and also do the wait for it.
frameData.SubmitPending(vulkan_, FrameSubmitType::Sync, frameDataShared_);
// At this point we can resume filling the command buffers for the current frame since
// we know the device is idle - and thus all previously enqueued command buffers have been processed.
// No need to switch to the next frame number; that would just be confusing.
std::unique_lock<std::mutex> lock(frameData.push_mutex);
frameData.readyForFence = true;
frameData.push_condVar.notify_all();
}
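// For reference, a sketch of what SubmitPending(..., FrameSubmitType::Sync, ...)
// presumably does internally, matching the inline code it replaces further down:
//
// vkQueueSubmit(vulkan_->GetGraphicsQueue(), 1, &submit_info, frameData.readbackFence);
// vkWaitForFences(vulkan_->GetDevice(), 1, &frameData.readbackFence, true, UINT64_MAX);
// vkResetFences(vulkan_->GetDevice(), 1, &frameData.readbackFence);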
void VulkanRenderManager::Run(int frame) {
BeginSubmitFrame(frame);
FrameData &frameData = frameData_[frame];
auto &stepsOnThread = frameData_[frame].steps;
VkCommandBuffer cmd = frameData.mainCmd;
queueRunner_.PreprocessSteps(stepsOnThread);
queueRunner_.PreprocessSteps(frameData_[frame].steps);
//queueRunner_.LogSteps(stepsOnThread, false);
queueRunner_.RunSteps(cmd, stepsOnThread, frameData.profilingEnabled_ ? &frameData.profile : nullptr);
stepsOnThread.clear();
queueRunner_.RunSteps(frameData, frameDataShared_);
switch (frameData.type) {
switch (frameData.runType_) {
case VKRRunType::END:
EndSubmitFrame(frame);
break;
@ -1555,59 +1289,24 @@ void VulkanRenderManager::Run(int frame) {
VLOG("PULL: Finished running frame %d", frame);
}
void VulkanRenderManager::EndSyncFrame(int frame) {
FrameData &frameData = frameData_[frame];
frameData.readbackFenceUsed = true;
// The submit will trigger the readbackFence.
Submit(frame, false);
// Hard stall of the GPU, not ideal, but necessary so the CPU has the contents of the readback.
vkWaitForFences(vulkan_->GetDevice(), 1, &frameData.readbackFence, true, UINT64_MAX);
vkResetFences(vulkan_->GetDevice(), 1, &frameData.readbackFence);
// At this point we can resume filling the command buffers for the current frame since
// we know the device is idle - and thus all previously enqueued command buffers have been processed.
// No need to switch to the next frame number.
VkCommandBufferBeginInfo begin{
VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
nullptr,
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT
};
vkResetCommandPool(vulkan_->GetDevice(), frameData.cmdPoolMain, 0);
VkResult res = vkBeginCommandBuffer(frameData.mainCmd, &begin);
_assert_(res == VK_SUCCESS);
if (useThread_) {
std::unique_lock<std::mutex> lock(frameData.push_mutex);
frameData.readyForFence = true;
frameData.push_condVar.notify_all();
}
}
void VulkanRenderManager::FlushSync() {
renderStepOffset_ += (int)steps_.size();
int curFrame = vulkan_->GetCurFrame();
FrameData &frameData = frameData_[curFrame];
if (!useThread_) {
frameData.steps = std::move(steps_);
steps_.clear();
frameData.type = VKRRunType::SYNC;
Run(curFrame);
} else {
{
std::unique_lock<std::mutex> lock(frameData.pull_mutex);
VLOG("PUSH: Frame[%d].readyForRun = true (sync)", curFrame);
frameData.steps = std::move(steps_);
steps_.clear();
frameData.readyForRun = true;
_dbg_assert_(!frameData.readyForFence);
frameData.type = VKRRunType::SYNC;
frameData.runType_ = VKRRunType::SYNC;
frameData.pull_condVar.notify_all();
}
if (useThread_) {
{
std::unique_lock<std::mutex> lock(frameData.push_mutex);
// Wait for the flush to be hit, since we're syncing.
while (!frameData.readyForFence) {

View File

@ -65,15 +65,6 @@ private:
std::string tag_;
};
enum class VKRRunType {
END,
SYNC,
};
enum {
MAX_TIMESTAMP_QUERIES = 128,
};
struct BoundingRect {
int x1;
int y1;
@ -236,6 +227,8 @@ public:
// as the other backends, even though there's no actual binding happening here.
VkImageView BindFramebufferAsTexture(VKRFramebuffer *fb, int binding, VkImageAspectFlags aspectBits, int attachment);
void BindCurrentFramebufferAsInputAttachment0(VkImageAspectFlags aspectBits);
bool CopyFramebufferToMemorySync(VKRFramebuffer *src, VkImageAspectFlags aspectBits, int x, int y, int w, int h, Draw::DataFormat destFormat, uint8_t *pixels, int pixelStride, const char *tag);
void CopyImageToMemorySync(VkImage image, int mipLevel, int x, int y, int w, int h, Draw::DataFormat destFormat, uint8_t *pixels, int pixelStride, const char *tag);
@ -440,11 +433,7 @@ public:
void DestroyBackbuffers();
bool HasBackbuffers() {
return !framebuffers_.empty();
}
void SetSplitSubmit(bool split) {
splitSubmit_ = split;
return queueRunner_.HasBackbuffers();
}
void SetInflightFrames(int f) {
@ -470,13 +459,10 @@ public:
}
private:
bool InitBackbufferFramebuffers(int width, int height);
bool InitDepthStencilBuffer(VkCommandBuffer cmd); // Used for non-buffered rendering.
void EndCurRenderStep();
void BeginSubmitFrame(int frame);
void EndSubmitFrame(int frame);
void Submit(int frame, bool triggerFence);
// Bad for performance but sometimes necessary for synchronous CPU readbacks (screenshots and whatnot).
void FlushSync();
@ -484,43 +470,7 @@ private:
void StopThread();
// Permanent objects
VkSemaphore acquireSemaphore_;
VkSemaphore renderingCompleteSemaphore_;
// Per-frame data, round-robin so we can overlap submission with execution of the previous frame.
struct FrameData {
std::mutex push_mutex;
std::condition_variable push_condVar;
std::mutex pull_mutex;
std::condition_variable pull_condVar;
bool readyForFence = true;
bool readyForRun = false;
bool skipSwap = false;
VKRRunType type = VKRRunType::END;
VkFence fence;
VkFence readbackFence; // Strictly speaking we might only need one of these.
bool readbackFenceUsed = false;
// These are on different threads so need separate pools.
VkCommandPool cmdPoolInit;
VkCommandPool cmdPoolMain;
VkCommandBuffer initCmd;
VkCommandBuffer mainCmd;
bool hasInitCommands = false;
std::vector<VKRStep *> steps;
// Swapchain.
bool hasBegun = false;
uint32_t curSwapchainImage = -1;
// Profiling.
QueueProfileContext profile;
bool profilingEnabled_;
};
FrameDataShared frameDataShared_;
FrameData frameData_[VulkanContext::MAX_INFLIGHT_FRAMES];
int newInflightFrames_ = -1;
@ -544,11 +494,10 @@ private:
VKRStep *curRenderStep_ = nullptr;
bool curStepHasViewport_ = false;
bool curStepHasScissor_ = false;
u32 curPipelineFlags_ = 0;
PipelineFlags curPipelineFlags_{};
BoundingRect curRenderArea_;
std::vector<VKRStep *> steps_;
bool splitSubmit_ = false;
// Execution time state
bool run_ = true;
@ -568,23 +517,4 @@ private:
// pipelines to check and possibly create at the end of the current render pass.
std::vector<VKRGraphicsPipeline *> pipelinesToCheck_;
// Swap chain management
struct SwapchainImageData {
VkImage image;
VkImageView view;
};
std::vector<VkFramebuffer> framebuffers_;
std::vector<SwapchainImageData> swapchainImages_;
uint32_t swapchainImageCount_ = 0;
struct DepthBufferInfo {
VkFormat format = VK_FORMAT_UNDEFINED;
VkImage image = VK_NULL_HANDLE;
VmaAllocation alloc = VK_NULL_HANDLE;
VkImageView view = VK_NULL_HANDLE;
};
DepthBufferInfo depth_;
// This works great - except see issue #10097. WTF?
bool useThread_ = true;
};

View File

@ -361,7 +361,7 @@ class VKFramebuffer;
class VKContext : public DrawContext {
public:
VKContext(VulkanContext *vulkan, bool splitSubmit);
VKContext(VulkanContext *vulkan);
virtual ~VKContext();
const DeviceCaps &GetDeviceCaps() const override {
@ -401,9 +401,10 @@ public:
// These functions should be self explanatory.
void BindFramebufferAsRenderTarget(Framebuffer *fbo, const RenderPassInfo &rp, const char *tag) override;
Framebuffer *GetCurrentRenderTarget() override {
return curFramebuffer_;
return (Framebuffer *)curFramebuffer_.ptr;
}
void BindFramebufferAsTexture(Framebuffer *fbo, int binding, FBChannel channelBit, int attachment) override;
void BindCurrentFramebufferForColorInput() override;
void GetFramebufferDimensions(Framebuffer *fbo, int *w, int *h) override;
@ -473,27 +474,7 @@ public:
std::vector<std::string> GetFeatureList() const override;
std::vector<std::string> GetExtensionList() const override;
uint64_t GetNativeObject(NativeObject obj, void *srcObject) override {
switch (obj) {
case NativeObject::CONTEXT:
return (uint64_t)vulkan_;
case NativeObject::INIT_COMMANDBUFFER:
return (uint64_t)renderManager_.GetInitCmd();
case NativeObject::BOUND_TEXTURE0_IMAGEVIEW:
return (uint64_t)boundImageView_[0];
case NativeObject::BOUND_TEXTURE1_IMAGEVIEW:
return (uint64_t)boundImageView_[1];
case NativeObject::RENDER_MANAGER:
return (uint64_t)(uintptr_t)&renderManager_;
case NativeObject::NULL_IMAGEVIEW:
return (uint64_t)GetNullTexture()->GetImageView();
case NativeObject::TEXTURE_VIEW:
return (uint64_t)(((VKTexture *)srcObject)->GetImageView());
default:
Crash();
return 0;
}
}
uint64_t GetNativeObject(NativeObject obj, void *srcObject) override;
void HandleEvent(Event ev, int width, int height, void *param1, void *param2) override;
@ -522,7 +503,7 @@ private:
VkDescriptorSetLayout descriptorSetLayout_ = VK_NULL_HANDLE;
VkPipelineLayout pipelineLayout_ = VK_NULL_HANDLE;
VkPipelineCache pipelineCache_ = VK_NULL_HANDLE;
AutoRef<Framebuffer> curFramebuffer_;
AutoRef<VKFramebuffer> curFramebuffer_;
VkDevice device_;
VkQueue queue_;
@ -781,7 +762,7 @@ bool VKTexture::Create(VkCommandBuffer cmd, VulkanPushBuffer *push, const Textur
return true;
}
VKContext::VKContext(VulkanContext *vulkan, bool splitSubmit)
VKContext::VKContext(VulkanContext *vulkan)
: vulkan_(vulkan), renderManager_(vulkan) {
shaderLanguageDesc_.Init(GLSL_VULKAN);
@ -807,9 +788,11 @@ VKContext::VKContext(VulkanContext *vulkan, bool splitSubmit)
caps_.fragmentShaderInt32Supported = true;
caps_.textureNPOTFullySupported = true;
caps_.fragmentShaderDepthWriteSupported = true;
caps_.blendMinMaxSupported = true;
caps_.logicOpSupported = vulkan->GetDeviceFeatures().enabled.logicOp != 0;
auto deviceProps = vulkan->GetPhysicalDeviceProperties(vulkan_->GetCurrentPhysicalDeviceIndex()).properties;
switch (deviceProps.vendorID) {
case VULKAN_VENDOR_AMD: caps_.vendor = GPUVendor::VENDOR_AMD; break;
case VULKAN_VENDOR_ARM: caps_.vendor = GPUVendor::VENDOR_ARM; break;
@ -831,6 +814,11 @@ VKContext::VKContext(VulkanContext *vulkan, bool splitSubmit)
// Color write mask not masking write in certain scenarios with a depth test, see #10421.
// Known still present on driver 0x80180000 and Adreno 5xx (possibly more.)
bugs_.Infest(Bugs::COLORWRITEMASK_BROKEN_WITH_DEPTHTEST);
// Trying to follow all the rules in https://registry.khronos.org/vulkan/specs/1.3/html/vkspec.html#synchronization-pipeline-barriers-subpass-self-dependencies
// and https://registry.khronos.org/vulkan/specs/1.3/html/vkspec.html#renderpass-feedbackloop, but still it doesn't
// quite work - artifacts on triangle boundaries on Adreno.
bugs_.Infest(Bugs::SUBPASS_FEEDBACK_BROKEN);
} else if (caps_.vendor == GPUVendor::VENDOR_AMD) {
// See issue #10074, and also #10065 (AMD) and #10109 for the choice of the driver version to check for.
if (deviceProps.driverVersion < 0x00407000) {
@ -840,19 +828,27 @@ VKContext::VKContext(VulkanContext *vulkan, bool splitSubmit)
// Workaround for Intel driver bug. TODO: Re-enable after some driver version
bugs_.Infest(Bugs::DUAL_SOURCE_BLENDING_BROKEN);
} else if (caps_.vendor == GPUVendor::VENDOR_ARM) {
int majorVersion = VK_API_VERSION_MAJOR(deviceProps.driverVersion);
// These GPUs (up to a certain hardware version?) have a bug where draws in which gl_Position.w == gl_Position.z
// corrupt the depth buffer. This is easily worked around by simply scaling Z down a tiny bit when this case
// is detected. See: https://github.com/hrydgard/ppsspp/issues/11937
bugs_.Infest(Bugs::EQUAL_WZ_CORRUPTS_DEPTH);
// At least one driver at the upper end of the range is likely to suffer from the bug causing issue #13833 (Midnight Club map broken).
bugs_.Infest(Bugs::MALI_STENCIL_DISCARD_BUG);
// This started in driver 31 or 32.
if (VK_API_VERSION_MAJOR(deviceProps.driverVersion) >= 32) {
// Nearly identical to the Adreno bug, see #13833 (Midnight Club map broken) and other issues.
// Reported fixed in major version 40 - let's add a check once confirmed.
bugs_.Infest(Bugs::NO_DEPTH_CANNOT_DISCARD_STENCIL);
// This started in driver 31 or 32, fixed in 40 - let's add a check once confirmed.
if (majorVersion >= 32) {
bugs_.Infest(Bugs::MALI_CONSTANT_LOAD_BUG); // See issue #15661
}
}
// Limited, through input attachments and self-dependencies.
// We turn it off here already if buggy.
caps_.framebufferFetchSupported = !bugs_.Has(Bugs::SUBPASS_FEEDBACK_BROKEN);
caps_.deviceID = deviceProps.deviceID;
device_ = vulkan->GetDevice();
@ -920,8 +916,6 @@ VKContext::VKContext(VulkanContext *vulkan, bool splitSubmit)
VkPipelineCacheCreateInfo pc{ VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO };
res = vkCreatePipelineCache(vulkan_->GetDevice(), &pc, nullptr, &pipelineCache_);
_assert_(VK_SUCCESS == res);
renderManager_.SetSplitSubmit(splitSubmit);
}
VKContext::~VKContext() {
@ -1058,12 +1052,12 @@ Pipeline *VKContext::CreateGraphicsPipeline(const PipelineDesc &desc, const char
VKDepthStencilState *depth = (VKDepthStencilState *)desc.depthStencil;
VKRasterState *raster = (VKRasterState *)desc.raster;
u32 pipelineFlags = 0;
PipelineFlags pipelineFlags = (PipelineFlags)0;
if (depth->info.depthTestEnable || depth->info.stencilTestEnable) {
pipelineFlags |= PIPELINE_FLAG_USES_DEPTH_STENCIL;
pipelineFlags |= PipelineFlags::USES_DEPTH_STENCIL;
}
VKPipeline *pipeline = new VKPipeline(vulkan_, desc.uniformDesc ? desc.uniformDesc->uniformBufferSize : 16 * sizeof(float), (PipelineFlags)pipelineFlags, tag);
VKPipeline *pipeline = new VKPipeline(vulkan_, desc.uniformDesc ? desc.uniformDesc->uniformBufferSize : 16 * sizeof(float), pipelineFlags, tag);
VKRGraphicsPipelineDesc &gDesc = pipeline->vkrDesc;
@ -1401,8 +1395,8 @@ void VKContext::Clear(int clearMask, uint32_t colorval, float depthVal, int sten
renderManager_.Clear(colorval, depthVal, stencilVal, mask);
}
DrawContext *T3DCreateVulkanContext(VulkanContext *vulkan, bool split) {
return new VKContext(vulkan, split);
DrawContext *T3DCreateVulkanContext(VulkanContext *vulkan) {
return new VKContext(vulkan);
}
void AddFeature(std::vector<std::string> &features, const char *name, VkBool32 available, VkBool32 enabled) {
@ -1584,6 +1578,10 @@ void VKContext::BindFramebufferAsTexture(Framebuffer *fbo, int binding, FBChanne
boundImageView_[binding] = renderManager_.BindFramebufferAsTexture(fb->GetFB(), binding, aspect, attachment);
}
void VKContext::BindCurrentFramebufferForColorInput() {
renderManager_.BindCurrentFramebufferAsInputAttachment0(VK_IMAGE_ASPECT_COLOR_BIT);
}
void VKContext::GetFramebufferDimensions(Framebuffer *fbo, int *w, int *h) {
VKFramebuffer *fb = (VKFramebuffer *)fbo;
if (fb) {
@ -1624,4 +1622,28 @@ void VKContext::InvalidateFramebuffer(FBInvalidationStage stage, uint32_t channe
}
}
uint64_t VKContext::GetNativeObject(NativeObject obj, void *srcObject) {
switch (obj) {
case NativeObject::CONTEXT:
return (uint64_t)vulkan_;
case NativeObject::INIT_COMMANDBUFFER:
return (uint64_t)renderManager_.GetInitCmd();
case NativeObject::BOUND_TEXTURE0_IMAGEVIEW:
return (uint64_t)boundImageView_[0];
case NativeObject::BOUND_TEXTURE1_IMAGEVIEW:
return (uint64_t)boundImageView_[1];
case NativeObject::RENDER_MANAGER:
return (uint64_t)(uintptr_t)&renderManager_;
case NativeObject::NULL_IMAGEVIEW:
return (uint64_t)GetNullTexture()->GetImageView();
case NativeObject::TEXTURE_VIEW:
return (uint64_t)(((VKTexture *)srcObject)->GetImageView());
case NativeObject::BOUND_FRAMEBUFFER_COLOR_IMAGEVIEW:
return (uint64_t)curFramebuffer_->GetFB()->color.imageView;
default:
Crash();
return 0;
}
}
} // namespace Draw

View File

@ -678,9 +678,9 @@ const char *Bugs::GetBugName(uint32_t bug) {
case COLORWRITEMASK_BROKEN_WITH_DEPTHTEST: return "COLORWRITEMASK_BROKEN_WITH_DEPTHTEST";
case BROKEN_FLAT_IN_SHADER: return "BROKEN_FLAT_IN_SHADER";
case EQUAL_WZ_CORRUPTS_DEPTH: return "EQUAL_WZ_CORRUPTS_DEPTH";
case MALI_STENCIL_DISCARD_BUG: return "MALI_STENCIL_DISCARD_BUG";
case RASPBERRY_SHADER_COMP_HANG: return "RASPBERRY_SHADER_COMP_HANG";
case MALI_CONSTANT_LOAD_BUG: return "MALI_CONSTANT_LOAD_BUG";
case SUBPASS_FEEDBACK_BROKEN: return "SUBPASS_FEEDBACK_BROKEN";
default: return "(N/A)";
}
}

View File

@ -242,6 +242,7 @@ enum class NativeObject {
INIT_COMMANDBUFFER,
BOUND_TEXTURE0_IMAGEVIEW,
BOUND_TEXTURE1_IMAGEVIEW,
BOUND_FRAMEBUFFER_COLOR_IMAGEVIEW,
RENDER_MANAGER,
TEXTURE_VIEW,
NULL_IMAGEVIEW,
@ -328,9 +329,9 @@ public:
COLORWRITEMASK_BROKEN_WITH_DEPTHTEST = 5,
BROKEN_FLAT_IN_SHADER = 6,
EQUAL_WZ_CORRUPTS_DEPTH = 7,
MALI_STENCIL_DISCARD_BUG = 8,
RASPBERRY_SHADER_COMP_HANG = 9,
MALI_CONSTANT_LOAD_BUG = 10,
RASPBERRY_SHADER_COMP_HANG = 8,
MALI_CONSTANT_LOAD_BUG = 9,
SUBPASS_FEEDBACK_BROKEN = 10,
MAX_BUG,
};
@ -546,6 +547,7 @@ struct DeviceCaps {
bool textureNPOTFullySupported;
bool fragmentShaderDepthWriteSupported;
bool textureDepthSupported;
bool blendMinMaxSupported;
std::string deviceName; // The device name to use when creating the thin3d context, to get the same one.
};
@ -651,6 +653,9 @@ public:
// binding must be < MAX_TEXTURE_SLOTS (0, 1 are okay if it's 2).
virtual void BindFramebufferAsTexture(Framebuffer *fbo, int binding, FBChannel channelBit, int attachment) = 0;
// Framebuffer fetch / input attachment support, which needs to be explicit in Vulkan.
virtual void BindCurrentFramebufferForColorInput() {}
// deprecated, only used by D3D9
virtual uintptr_t GetFramebufferAPITexture(Framebuffer *fbo, int channelBits, int attachment) {
return 0;

View File

@ -31,6 +31,6 @@ DrawContext *T3DCreateDX9Context(IDirect3D9 *d3d, IDirect3D9Ex *d3dEx, int adapt
DrawContext *T3DCreateD3D11Context(ID3D11Device *device, ID3D11DeviceContext *context, ID3D11Device1 *device1, ID3D11DeviceContext1 *context1, D3D_FEATURE_LEVEL featureLevel, HWND hWnd, std::vector<std::string> adapterNames);
#endif
DrawContext *T3DCreateVulkanContext(VulkanContext *context, bool splitSubmit);
DrawContext *T3DCreateVulkanContext(VulkanContext *context);
} // namespace Draw

View File

@ -170,7 +170,7 @@ void UIContext::ActivateTopScissor() {
int h = std::max(0.0f, ceilf(scale_y * bounds.h));
if (x < 0 || y < 0 || x + w > pixel_xres || y + h > pixel_yres) {
// This won't actually report outside a game, but we can try.
ERROR_LOG_REPORT(G3D, "UI scissor out of bounds: %d,%d-%d,%d / %d,%d", x, y, w, h, pixel_xres, pixel_yres);
ERROR_LOG_REPORT(G3D, "UI scissor out of bounds in %sScreen: %d,%d-%d,%d / %d,%d", screenTag_ ? screenTag_ : "N/A", x, y, w, h, pixel_xres, pixel_yres);
x = std::max(0, x);
y = std::max(0, y);
w = std::min(w, pixel_xres - x);

View File

@ -74,7 +74,6 @@ public:
const UI::Theme *theme;
// Utility methods
TextDrawer *Text() const { return textDrawer_; }
void SetFontStyle(const UI::FontStyle &style);
@ -103,6 +102,10 @@ public:
void setUIAtlas(const std::string &name);
void SetScreenTag(const char *tag) {
screenTag_ = tag;
}
private:
Draw::DrawContext *draw_ = nullptr;
Bounds bounds_;
@ -126,4 +129,6 @@ private:
std::string lastUIAtlas_;
std::string UIAtlas_ = "ui_atlas.zim";
const char *screenTag_ = nullptr;
};

View File

@ -71,7 +71,7 @@ public:
// what screen it is.
virtual void *dialogData() { return 0; }
virtual std::string tag() const { return std::string(""); }
virtual const char *tag() const = 0;
virtual bool isTransparent() const { return false; }
virtual bool isTopLevel() const { return false; }

View File

@ -117,6 +117,9 @@ void UIScreen::render() {
if (root_) {
UIContext *uiContext = screenManager()->getUIContext();
uiContext->SetScreenTag(tag());
UI::LayoutViewHierarchy(*uiContext, root_, ignoreInsets_);
uiContext->PushTransform({translation_, scale_, alpha_});

View File

@ -136,7 +136,7 @@ public:
void SetHiddenChoices(std::set<int> hidden) {
hidden_ = hidden;
}
virtual std::string tag() const override { return std::string("listpopup"); }
const char *tag() const override { return "listpopup"; }
UI::Event OnChoice;
@ -187,6 +187,8 @@ public:
disabled_ = *value_ < 0;
}
const char *tag() const override { return "SliderPopup"; }
Event OnChange;
private:
@ -214,6 +216,8 @@ public:
: PopupScreen(title, "OK", "Cancel"), units_(units), value_(value), originalValue_(*value), minValue_(minValue), maxValue_(maxValue), step_(step), changing_(false), liveUpdate_(liveUpdate) {}
void CreatePopupContents(UI::ViewGroup *parent) override;
const char *tag() const override { return "SliderFloatPopup"; }
Event OnChange;
private:
@ -241,6 +245,8 @@ public:
: PopupScreen(title, "OK", "Cancel"), value_(value), placeholder_(placeholder), maxLen_(maxLen) {}
virtual void CreatePopupContents(ViewGroup *parent) override;
const char *tag() const override { return "TextEditPopup"; }
Event OnChange;
private:

View File

@ -9,6 +9,7 @@
#include <GLES3/gl3.h>
#include <GLES3/gl3ext.h>
XrFovf fov;
XrView* projections;
XrPosef invViewTransform[2];
XrFrameState frameState = {};
@ -293,7 +294,12 @@ bool VR_InitFrame( engine_t* engine ) {
projections));
//
fov = {};
for (int eye = 0; eye < ovrMaxNumEyes; eye++) {
fov.angleLeft += projections[eye].fov.angleLeft / 2.0f;
fov.angleRight += projections[eye].fov.angleRight / 2.0f;
fov.angleUp += projections[eye].fov.angleUp / 2.0f;
fov.angleDown += projections[eye].fov.angleDown / 2.0f;
invViewTransform[eye] = projections[eye].pose;
}
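// Summing half of each eye's angles gives the average of the two eyes' FOVs;
// this combined fov is what VR_GetMatrix() uses for the projection further down.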
@ -353,10 +359,7 @@ void VR_FinishFrame( engine_t* engine ) {
for (int eye = 0; eye < ovrMaxNumEyes; eye++) {
int imageLayer = engine->appState.Renderer.Multiview ? eye : 0;
ovrFramebuffer* frameBuffer = &engine->appState.Renderer.FrameBuffer[0];
XrFovf fov = projections[eye].fov;
if (vrMode == VR_MODE_MONO_6DOF) {
fov = projections[0].fov;
} else if (!engine->appState.Renderer.Multiview) {
if ((vrMode != VR_MODE_MONO_6DOF) && !engine->appState.Renderer.Multiview) {
frameBuffer = &engine->appState.Renderer.FrameBuffer[eye];
}
@ -463,7 +466,6 @@ void VR_BindFramebuffer(engine_t *engine) {
ovrMatrix4f VR_GetMatrix( VRMatrix matrix ) {
ovrMatrix4f output;
if ((matrix == VR_PROJECTION_MATRIX_LEFT_EYE) || (matrix == VR_PROJECTION_MATRIX_RIGHT_EYE)) {
XrFovf fov = matrix == VR_PROJECTION_MATRIX_LEFT_EYE ? projections[0].fov : projections[1].fov;
float near = (float)vrConfig[VR_CONFIG_FOV_SCALE] / 200.0f;
output = ovrMatrix4f_CreateProjectionFov(fov.angleLeft, fov.angleRight, fov.angleUp, fov.angleDown, near, 0.0f );
} else if ((matrix == VR_VIEW_MATRIX_LEFT_EYE) || (matrix == VR_VIEW_MATRIX_RIGHT_EYE)) {

View File

@ -938,7 +938,6 @@ static ConfigSetting graphicsSettings[] = {
ReportedConfigSetting("FragmentTestCache", &g_Config.bFragmentTestCache, true, true, true),
ConfigSetting("GfxDebugOutput", &g_Config.bGfxDebugOutput, false, false, false),
ConfigSetting("GfxDebugSplitSubmit", &g_Config.bGfxDebugSplitSubmit, false, false, false),
ConfigSetting("LogFrameDrops", &g_Config.bLogFrameDrops, false, true, false),
ConfigSetting("InflightFrames", &g_Config.iInflightFrames, 3, true, false),

View File

@ -176,6 +176,7 @@ public:
bool bSustainedPerformanceMode; // Android: Slows clocks down to avoid overheating/speed fluctuations.
bool bIgnoreScreenInsets; // Android: Center screen disregarding insets if this is enabled.
bool bVSync;
int iFrameSkip;
int iFrameSkipType;
int iFastForwardMode; // See FastForwardMode in ConfigValues.h.
@ -242,7 +243,6 @@ public:
bool bShaderChainRequires60FPS;
std::string sTextureShaderName;
bool bGfxDebugOutput;
bool bGfxDebugSplitSubmit;
int iInflightFrames;
bool bRenderDuplicateFrames;

View File

@ -36,8 +36,9 @@ const static u32 GAMEDATA_BYTES_PER_READ = 32768;
// If this is too high, some games (e.g. Senjou no Valkyria 3) will lag.
const static u32 GAMEDATA_READS_PER_UPDATE = 20;
const u32 ERROR_UTILITY_GAMEDATA_MEMSTRICK_WRITE_PROTECTED = 0x80111903;
const u32 ERROR_UTILITY_GAMEDATA_MEMSTRICK_REMOVED = 0x80111901;
const u32 ERROR_UTILITY_GAMEDATA_MEMSTRICK_WRITE_PROTECTED = 0x80111903;
const u32 ERROR_UTILITY_GAMEDATA_INVALID_MODE = 0x80111908;
static const std::string SFO_FILENAME = "PARAM.SFO";
@ -88,9 +89,14 @@ int PSPGamedataInstallDialog::Init(u32 paramAddr) {
}
int size = Memory::Read_U32(paramAddr);
if (size != 1424 && size != 1432) {
ERROR_LOG_REPORT(SCEUTILITY, "sceGamedataInstallInitStart: invalid param size %d", size);
return SCE_ERROR_UTILITY_INVALID_PARAM_SIZE;
}
memset(&request, 0, sizeof(request));
// Only copy the right size to support the different request formats.
Memory::Memcpy(&request, paramAddr, size);
Memory::Memcpy(&request, paramAddr, size, "sceGamedataInstallInitStart");
ChangeStatusInit(GAMEDATA_INIT_DELAY_US);
return 0;
@ -100,6 +106,17 @@ int PSPGamedataInstallDialog::Update(int animSpeed) {
if (GetStatus() != SCE_UTILITY_STATUS_RUNNING)
return SCE_ERROR_UTILITY_INVALID_STATUS;
if (param->mode >= 2) {
param->common.result = ERROR_UTILITY_GAMEDATA_INVALID_MODE;
param.NotifyWrite("DialogResult");
ChangeStatus(SCE_UTILITY_STATUS_FINISHED, 0);
WARN_LOG_REPORT(SCEUTILITY, "sceUtilityGamedataInstallUpdate: invalid mode %d", param->mode);
return 0;
}
// TODO: param->mode == 1 should show a prompt to confirm, then a progress bar.
// Any other mode (i.e. 0 or negative) should proceed and show no UI.
// TODO: This should return error codes in some cases, like write failure.
// request.common.result must be updated for errors as well.
@ -222,6 +239,9 @@ void PSPGamedataInstallDialog::WriteSfoFile() {
}
int PSPGamedataInstallDialog::Abort() {
param->common.result = 1;
param.NotifyWrite("DialogResult");
// TODO: Delete the files or anything?
return PSPDialog::Shutdown();
}

View File

@ -22,7 +22,7 @@
struct SceUtilityGamedataInstallParam {
pspUtilityDialogCommon common;
u32_le unknown1;
s32_le mode;
char gameName[13];
char ignore1[3];
char dataName[20];

View File

@ -820,7 +820,7 @@ const HLEFunction ThreadManForUser[] =
{0X87D4DD36, &WrapI_IU<sceKernelCancelReceiveMbx>, "sceKernelCancelReceiveMbx", 'i', "ix" },
{0XA8E8C846, &WrapI_IU<sceKernelReferMbxStatus>, "sceKernelReferMbxStatus", 'i', "ip" },
{0X7C0DC2A0, &WrapI_CIUUU<sceKernelCreateMsgPipe>, "sceKernelCreateMsgPipe", 'i', "sixxx" },
{0X7C0DC2A0, &WrapI_CIUUU<sceKernelCreateMsgPipe>, "sceKernelCreateMsgPipe", 'i', "sixxp" },
{0XF0B7DA1C, &WrapI_I<sceKernelDeleteMsgPipe>, "sceKernelDeleteMsgPipe", 'i', "i" },
{0X876DBFAD, &WrapI_IUUUUU<sceKernelSendMsgPipe>, "sceKernelSendMsgPipe", 'i', "ixxxxx" },
{0X7C41F2C2, &WrapI_IUUUUU<sceKernelSendMsgPipeCB>, "sceKernelSendMsgPipeCB", 'i', "ixxxxx" },
@ -831,7 +831,7 @@ const HLEFunction ThreadManForUser[] =
{0X349B864D, &WrapI_IUU<sceKernelCancelMsgPipe>, "sceKernelCancelMsgPipe", 'i', "ixx" },
{0X33BE4024, &WrapI_IU<sceKernelReferMsgPipeStatus>, "sceKernelReferMsgPipeStatus", 'i', "ip" },
{0X56C039B5, &WrapI_CIUUU<sceKernelCreateVpl>, "sceKernelCreateVpl", 'i', "sixxx" },
{0X56C039B5, &WrapI_CIUUU<sceKernelCreateVpl>, "sceKernelCreateVpl", 'i', "sixxp" },
{0X89B3D48C, &WrapI_I<sceKernelDeleteVpl>, "sceKernelDeleteVpl", 'i', "i" },
{0XBED27435, &WrapI_IUUU<sceKernelAllocateVpl>, "sceKernelAllocateVpl", 'i', "ixxx", HLE_NOT_IN_INTERRUPT | HLE_NOT_DISPATCH_SUSPENDED },
{0XEC0A693F, &WrapI_IUUU<sceKernelAllocateVplCB>, "sceKernelAllocateVplCB", 'i', "ixxx", HLE_NOT_IN_INTERRUPT | HLE_NOT_DISPATCH_SUSPENDED },
@ -840,7 +840,7 @@ const HLEFunction ThreadManForUser[] =
{0X1D371B8A, &WrapI_IU<sceKernelCancelVpl>, "sceKernelCancelVpl", 'i', "ix" },
{0X39810265, &WrapI_IU<sceKernelReferVplStatus>, "sceKernelReferVplStatus", 'i', "ip" },
{0XC07BB470, &WrapI_CUUUUU<sceKernelCreateFpl>, "sceKernelCreateFpl", 'i', "sxxxxx" },
{0XC07BB470, &WrapI_CUUUUU<sceKernelCreateFpl>, "sceKernelCreateFpl", 'i', "sixxxp" },
{0XED1410E0, &WrapI_I<sceKernelDeleteFpl>, "sceKernelDeleteFpl", 'i', "i" },
{0XD979E9BF, &WrapI_IUU<sceKernelAllocateFpl>, "sceKernelAllocateFpl", 'i', "ixx", HLE_NOT_IN_INTERRUPT | HLE_NOT_DISPATCH_SUSPENDED },
{0XE7282CB6, &WrapI_IUU<sceKernelAllocateFplCB>, "sceKernelAllocateFplCB", 'i', "ixx", HLE_NOT_IN_INTERRUPT | HLE_NOT_DISPATCH_SUSPENDED },
@ -864,7 +864,7 @@ const HLEFunction ThreadManForUser[] =
{0XD8B299AE, &WrapU_IUUU<sceKernelSetVTimerHandler>, "sceKernelSetVTimerHandler", 'x', "ixxx" },
{0X53B00E9A, &WrapU_IU64UU<sceKernelSetVTimerHandlerWide>, "sceKernelSetVTimerHandlerWide", 'x', "iXxx" },
{0X8DAFF657, &WrapI_CUUUUU<sceKernelCreateTlspl>, "sceKernelCreateTlspl", 'i', "sxxxxx" },
{0X8DAFF657, &WrapI_CUUUUU<sceKernelCreateTlspl>, "sceKernelCreateTlspl", 'i', "sixxxp" },
{0X32BF938E, &WrapI_I<sceKernelDeleteTlspl>, "sceKernelDeleteTlspl", 'i', "i" },
{0X721067F3, &WrapI_IU<sceKernelReferTlsplStatus>, "sceKernelReferTlsplStatus", 'i', "xp" },
// Not completely certain about args.
@ -908,7 +908,7 @@ const HLEFunction ThreadManForKernel[] =
{0x1fb15a32, &WrapU_IU<sceKernelSetEventFlag>, "sceKernelSetEventFlag", 'x', "ix", HLE_KERNEL_SYSCALL },
{0x812346e4, &WrapU_IU<sceKernelClearEventFlag>, "sceKernelClearEventFlag", 'x', "ix", HLE_KERNEL_SYSCALL },
{0x402fcf22, &WrapI_IUUUU<sceKernelWaitEventFlag>, "sceKernelWaitEventFlag", 'i', "ixxpp", HLE_NOT_IN_INTERRUPT | HLE_KERNEL_SYSCALL},
{0xc07bb470, &WrapI_CUUUUU<sceKernelCreateFpl>, "sceKernelCreateFpl", 'i', "sxxxxx" ,HLE_KERNEL_SYSCALL },
{0xc07bb470, &WrapI_CUUUUU<sceKernelCreateFpl>, "sceKernelCreateFpl", 'i', "sixxxp" ,HLE_KERNEL_SYSCALL },
{0xed1410e0, &WrapI_I<sceKernelDeleteFpl>, "sceKernelDeleteFpl", 'i', "i" ,HLE_KERNEL_SYSCALL },
{0x623ae665, &WrapI_IU<sceKernelTryAllocateFpl>, "sceKernelTryAllocateFpl", 'i', "ix" ,HLE_KERNEL_SYSCALL },
{0x616403ba, &WrapI_I<sceKernelTerminateThread>, "sceKernelTerminateThread", 'i', "i" ,HLE_KERNEL_SYSCALL },
@ -932,7 +932,7 @@ const HLEFunction ThreadManForKernel[] =
{0x0D81716A, &WrapI_IU<sceKernelPollMbx>, "sceKernelPollMbx", 'i', "ix", HLE_KERNEL_SYSCALL },
{0x87D4DD36, &WrapI_IU<sceKernelCancelReceiveMbx>, "sceKernelCancelReceiveMbx", 'i', "ix", HLE_KERNEL_SYSCALL },
{0xA8E8C846, &WrapI_IU<sceKernelReferMbxStatus>, "sceKernelReferMbxStatus", 'i', "ip", HLE_KERNEL_SYSCALL },
{0x56C039B5, &WrapI_CIUUU<sceKernelCreateVpl>, "sceKernelCreateVpl", 'i', "sixxx", HLE_KERNEL_SYSCALL },
{0x56C039B5, &WrapI_CIUUU<sceKernelCreateVpl>, "sceKernelCreateVpl", 'i', "sixxp", HLE_KERNEL_SYSCALL },
{0x89B3D48C, &WrapI_I<sceKernelDeleteVpl>, "sceKernelDeleteVpl", 'i', "i", HLE_KERNEL_SYSCALL },
{0xBED27435, &WrapI_IUUU<sceKernelAllocateVpl>, "sceKernelAllocateVpl", 'i', "ixxx", HLE_KERNEL_SYSCALL | HLE_NOT_IN_INTERRUPT | HLE_NOT_DISPATCH_SUSPENDED },
{0xEC0A693F, &WrapI_IUUU<sceKernelAllocateVplCB>, "sceKernelAllocateVplCB", 'i', "ixxx", HLE_KERNEL_SYSCALL | HLE_NOT_IN_INTERRUPT | HLE_NOT_DISPATCH_SUSPENDED },

View File

@ -46,6 +46,7 @@ const int TLSPL_NUM_INDEXES = 16;
// STATE BEGIN
BlockAllocator userMemory(256);
BlockAllocator kernelMemory(256);
BlockAllocator volatileMemory(256);
static int vplWaitTimer = -1;
static int fplWaitTimer = -1;
@ -432,6 +433,7 @@ void __KernelMemoryInit()
MemBlockInfoInit();
kernelMemory.Init(PSP_GetKernelMemoryBase(), PSP_GetKernelMemoryEnd() - PSP_GetKernelMemoryBase(), false);
userMemory.Init(PSP_GetUserMemoryBase(), PSP_GetUserMemoryEnd() - PSP_GetUserMemoryBase(), false);
volatileMemory.Init(PSP_GetVolatileMemoryStart(), PSP_GetVolatileMemoryEnd() - PSP_GetVolatileMemoryStart(), false);
ParallelMemset(&g_threadManager, Memory::GetPointerWrite(PSP_GetKernelMemoryBase()), 0, PSP_GetUserMemoryEnd() - PSP_GetKernelMemoryBase());
NotifyMemInfo(MemBlockFlags::WRITE, PSP_GetKernelMemoryBase(), PSP_GetUserMemoryEnd() - PSP_GetKernelMemoryBase(), "MemInit");
INFO_LOG(SCEKERNEL, "Kernel and user memory pools initialized");
@ -457,12 +459,14 @@ void __KernelMemoryInit()
void __KernelMemoryDoState(PointerWrap &p)
{
auto s = p.Section("sceKernelMemory", 1, 2);
auto s = p.Section("sceKernelMemory", 1, 3);
if (!s)
return;
kernelMemory.DoState(p);
userMemory.DoState(p);
if (s >= 3)
volatileMemory.DoState(p);
Do(p, vplWaitTimer);
CoreTiming::RestoreRegisterEvent(vplWaitTimer, "VplTimeout", __KernelVplTimeout);
@ -481,6 +485,11 @@ void __KernelMemoryDoState(PointerWrap &p)
void __KernelMemoryShutdown()
{
#ifdef _DEBUG
INFO_LOG(SCEKERNEL, "Shutting down volatile memory pool: ");
volatileMemory.ListBlocks();
#endif
volatileMemory.Shutdown();
#ifdef _DEBUG
INFO_LOG(SCEKERNEL,"Shutting down user memory pool: ");
userMemory.ListBlocks();
@ -495,6 +504,56 @@ void __KernelMemoryShutdown()
MemBlockInfoShutdown();
}
BlockAllocator *BlockAllocatorFromID(int id) {
switch (id) {
case 1:
case 3:
case 4:
if (hleIsKernelMode())
return &kernelMemory;
return nullptr;
case 2:
case 6:
return &userMemory;
case 8:
case 10:
if (hleIsKernelMode())
return &userMemory;
return nullptr;
case 5:
return &volatileMemory;
default:
break;
}
return nullptr;
}
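// Summary of the mapping above (PSP memory partition IDs):
//   1, 3, 4 -> kernelMemory (kernel mode only)
//   2, 6    -> userMemory
//   8, 10   -> userMemory (kernel mode only)
//   5       -> volatileMemory
// e.g. BlockAllocatorFromID(2) returns &userMemory in any mode, while
// BlockAllocatorFromID(1) returns nullptr unless hleIsKernelMode().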
int BlockAllocatorToID(const BlockAllocator *alloc) {
if (alloc == &kernelMemory)
return 1;
if (alloc == &userMemory)
return 2;
if (alloc == &volatileMemory)
return 5;
return 0;
}
BlockAllocator *BlockAllocatorFromAddr(u32 addr) {
addr &= 0x3FFFFFFF;
if (Memory::IsKernelAndNotVolatileAddress(addr))
return &kernelMemory;
if (Memory::IsKernelAddress(addr))
return &volatileMemory;
if (Memory::IsRAMAddress(addr))
return &userMemory;
return nullptr;
}
enum SceKernelFplAttr
{
PSP_FPL_ATTR_FIFO = 0x0000,
@ -580,29 +639,18 @@ static void __KernelSortFplThreads(FPL *fpl)
std::stable_sort(fpl->waitingThreads.begin(), fpl->waitingThreads.end(), __FplThreadSortPriority);
}
int sceKernelCreateFpl(const char *name, u32 mpid, u32 attr, u32 blockSize, u32 numBlocks, u32 optPtr)
{
int sceKernelCreateFpl(const char *name, u32 mpid, u32 attr, u32 blockSize, u32 numBlocks, u32 optPtr) {
if (!name)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateFpl(): invalid name", SCE_KERNEL_ERROR_NO_MEMORY);
return SCE_KERNEL_ERROR_NO_MEMORY;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_NO_MEMORY, "invalid name");
if (mpid < 1 || mpid > 9 || mpid == 7)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateFpl(): invalid partition %d", SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, mpid);
return SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT;
}
// We only support user right now.
if (mpid != 2 && mpid != 6)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateFpl(): invalid partition %d", SCE_KERNEL_ERROR_ILLEGAL_PERM, mpid);
return SCE_KERNEL_ERROR_ILLEGAL_PERM;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, "invalid partition %d", mpid);
BlockAllocator *allocator = BlockAllocatorFromID(mpid);
if (allocator == nullptr)
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_PERM, "invalid partition %d", mpid);
if (((attr & ~PSP_FPL_ATTR_KNOWN) & ~0xFF) != 0)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateFpl(): invalid attr parameter: %08x", SCE_KERNEL_ERROR_ILLEGAL_ATTR, attr);
return SCE_KERNEL_ERROR_ILLEGAL_ATTR;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ATTR, "invalid attr parameter: %08x", attr);
// There's probably a simpler way to get this same basic formula...
// This is based on results from a PSP.
bool illegalMemSize = blockSize == 0 || numBlocks == 0;
@ -611,25 +659,16 @@ int sceKernelCreateFpl(const char *name, u32 mpid, u32 attr, u32 blockSize, u32
if (!illegalMemSize && (u64) numBlocks >= 0x100000000ULL / (((u64) blockSize + 3ULL) & ~3ULL))
illegalMemSize = true;
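// That is: reject any numBlocks whose product with the 4-byte-aligned
// blockSize would no longer fit in 32 bits.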
if (illegalMemSize)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateFpl(): invalid blockSize/count", SCE_KERNEL_ERROR_ILLEGAL_MEMSIZE);
return SCE_KERNEL_ERROR_ILLEGAL_MEMSIZE;
}
return hleReportWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_MEMSIZE, "invalid blockSize/count");
int alignment = 4;
if (optPtr != 0)
{
u32 size = Memory::Read_U32(optPtr);
if (size > 8)
WARN_LOG_REPORT(SCEKERNEL, "sceKernelCreateFpl(): unsupported extra options, size = %d", size);
if (Memory::IsValidRange(optPtr, 4)) {
u32 size = Memory::ReadUnchecked_U32(optPtr);
if (size >= 4)
alignment = Memory::Read_U32(optPtr + 4);
// Must be a power of 2 to be valid.
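// ((x & (x - 1)) == 0 exactly when x has a single bit set: 8 & 7 == 0 passes,
// while 6 & 5 == 4 fails.)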
if ((alignment & (alignment - 1)) != 0)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateFpl(): invalid alignment %d", SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, alignment);
return SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, "invalid alignment %d", alignment);
}
if (alignment < 4)
@ -638,9 +677,8 @@ int sceKernelCreateFpl(const char *name, u32 mpid, u32 attr, u32 blockSize, u32
int alignedSize = ((int)blockSize + alignment - 1) & ~(alignment - 1);
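// Standard round-up-to-alignment: e.g. blockSize 18 with alignment 8 gives (18 + 7) & ~7 = 24.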
u32 totalSize = alignedSize * numBlocks;
bool atEnd = (attr & PSP_FPL_ATTR_HIGHMEM) != 0;
u32 address = userMemory.Alloc(totalSize, atEnd, "FPL");
if (address == (u32)-1)
{
u32 address = allocator->Alloc(totalSize, atEnd, "FPL");
if (address == (u32)-1) {
DEBUG_LOG(SCEKERNEL, "sceKernelCreateFpl(\"%s\", partition=%i, attr=%08x, bsize=%i, nb=%i) FAILED - out of ram",
name, mpid, attr, blockSize, numBlocks);
return SCE_KERNEL_ERROR_NO_MEMORY;
@ -682,7 +720,10 @@ int sceKernelDeleteFpl(SceUID uid)
if (wokeThreads)
hleReSchedule("fpl deleted");
userMemory.Free(fpl->address);
BlockAllocator *alloc = BlockAllocatorFromAddr(fpl->address);
_assert_msg_(alloc != nullptr, "Should always have a valid allocator/address");
if (alloc)
alloc->Free(fpl->address);
return kernelObjects.Destroy<FPL>(uid);
}
else
@ -955,18 +996,23 @@ public:
alloc->Free(address);
}
bool IsValid() {return address != (u32)-1;}
BlockAllocator *alloc;
void DoState(PointerWrap &p) override
{
auto s = p.Section("PMB", 1);
auto s = p.Section("PMB", 1, 2);
if (!s)
return;
Do(p, address);
DoArray(p, name, sizeof(name));
if (s >= 2) {
int allocType = BlockAllocatorToID(alloc);
Do(p, allocType);
alloc = BlockAllocatorFromID(allocType);
}
}
BlockAllocator *alloc;
u32 address;
char name[32];
};
@ -986,44 +1032,28 @@ static u32 sceKernelTotalFreeMemSize()
return retVal;
}
int sceKernelAllocPartitionMemory(int partition, const char *name, int type, u32 size, u32 addr)
{
if (name == NULL)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelAllocPartitionMemory(): invalid name", SCE_KERNEL_ERROR_ERROR);
return SCE_KERNEL_ERROR_ERROR;
}
if (size == 0)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelAllocPartitionMemory(): invalid size %x", SCE_KERNEL_ERROR_MEMBLOCK_ALLOC_FAILED, size);
return SCE_KERNEL_ERROR_MEMBLOCK_ALLOC_FAILED;
int sceKernelAllocPartitionMemory(int partition, const char *name, int type, u32 size, u32 addr) {
if (type < PSP_SMEM_Low || type > PSP_SMEM_HighAligned)
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_MEMBLOCKTYPE, "invalid type %x", type);
// Alignment is only allowed for powers of 2.
if (type == PSP_SMEM_LowAligned || type == PSP_SMEM_HighAligned) {
if ((addr & (addr - 1)) != 0 || addr == 0)
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ALIGNMENT_SIZE, "invalid alignment %x", addr);
}
if (partition < 1 || partition > 9 || partition == 7)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelAllocPartitionMemory(): invalid partition %x", SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, partition);
return SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT;
}
// We only support user right now.
if (partition != 2 && partition != 5 && partition != 6)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelAllocPartitionMemory(): invalid partition %x", SCE_KERNEL_ERROR_ILLEGAL_PARTITION, partition);
return SCE_KERNEL_ERROR_ILLEGAL_PARTITION;
}
if (type < PSP_SMEM_Low || type > PSP_SMEM_HighAligned)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelAllocPartitionMemory(): invalid type %x", SCE_KERNEL_ERROR_ILLEGAL_MEMBLOCKTYPE, type);
return SCE_KERNEL_ERROR_ILLEGAL_MEMBLOCKTYPE;
}
// Alignment is only allowed for powers of 2.
if ((type == PSP_SMEM_LowAligned || type == PSP_SMEM_HighAligned) && ((addr & (addr - 1)) != 0 || addr == 0))
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelAllocPartitionMemory(): invalid alignment %x", SCE_KERNEL_ERROR_ILLEGAL_ALIGNMENT_SIZE, addr);
return SCE_KERNEL_ERROR_ILLEGAL_ALIGNMENT_SIZE;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, "invalid partition %x", partition);
PartitionMemoryBlock *block = new PartitionMemoryBlock(&userMemory, name, size, (MemblockType)type, addr);
if (!block->IsValid())
{
BlockAllocator *allocator = BlockAllocatorFromID(partition);
if (allocator == nullptr)
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_PARTITION, "invalid partition %x", partition);
if (name == nullptr)
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ERROR, "invalid name");
if (size == 0)
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_MEMBLOCK_ALLOC_FAILED, "invalid size %x", size);
PartitionMemoryBlock *block = new PartitionMemoryBlock(allocator, name, size, (MemblockType)type, addr);
if (!block->IsValid()) {
delete block;
ERROR_LOG(SCEKERNEL, "sceKernelAllocPartitionMemory(partition = %i, %s, type= %i, size= %i, addr= %08x): allocation failed", partition, name, type, size, addr);
return SCE_KERNEL_ERROR_MEMBLOCK_ALLOC_FAILED;
@ -1451,40 +1481,23 @@ static void __KernelSortVplThreads(VPL *vpl)
std::stable_sort(vpl->waitingThreads.begin(), vpl->waitingThreads.end(), __VplThreadSortPriority);
}
SceUID sceKernelCreateVpl(const char *name, int partition, u32 attr, u32 vplSize, u32 optPtr)
{
SceUID sceKernelCreateVpl(const char *name, int partition, u32 attr, u32 vplSize, u32 optPtr) {
if (!name)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateVpl(): invalid name", SCE_KERNEL_ERROR_ERROR);
return SCE_KERNEL_ERROR_ERROR;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ERROR, "invalid name");
if (partition < 1 || partition > 9 || partition == 7)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateVpl(): invalid partition %d", SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, partition);
return SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT;
}
// We only support user right now.
if (partition != 2 && partition != 6)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateVpl(): invalid partition %d", SCE_KERNEL_ERROR_ILLEGAL_PERM, partition);
return SCE_KERNEL_ERROR_ILLEGAL_PERM;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, "invalid partition %d", partition);
BlockAllocator *allocator = BlockAllocatorFromID(partition);
if (allocator == nullptr)
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_PERM, "invalid partition %d", partition);
if (((attr & ~PSP_VPL_ATTR_KNOWN) & ~0xFF) != 0)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateVpl(): invalid attr parameter: %08x", SCE_KERNEL_ERROR_ILLEGAL_ATTR, attr);
return SCE_KERNEL_ERROR_ILLEGAL_ATTR;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ATTR, "invalid attr parameter: %08x", attr);
if (vplSize == 0)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateVpl(): invalid size", SCE_KERNEL_ERROR_ILLEGAL_MEMSIZE);
return SCE_KERNEL_ERROR_ILLEGAL_MEMSIZE;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_MEMSIZE, "invalid size");
// The block allocator seems to A-OK this; let's stop it here.
if (vplSize >= 0x80000000)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateVpl(): way too big size", SCE_KERNEL_ERROR_NO_MEMORY);
return SCE_KERNEL_ERROR_NO_MEMORY;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_NO_MEMORY, "way too big size");
// Can't have that little space in a Vpl, sorry.
if (vplSize <= 0x30)
@ -1493,12 +1506,9 @@ SceUID sceKernelCreateVpl(const char *name, int partition, u32 attr, u32 vplSize
// We ignore the upalign to 256 and do it ourselves by 8.
u32 allocSize = vplSize;
u32 memBlockPtr = userMemory.Alloc(allocSize, (attr & PSP_VPL_ATTR_HIGHMEM) != 0, "VPL");
u32 memBlockPtr = allocator->Alloc(allocSize, (attr & PSP_VPL_ATTR_HIGHMEM) != 0, "VPL");
if (memBlockPtr == (u32)-1)
{
ERROR_LOG(SCEKERNEL, "sceKernelCreateVpl(): Failed to allocate %i bytes of pool data", vplSize);
return SCE_KERNEL_ERROR_NO_MEMORY;
}
return hleLogError(SCEKERNEL, SCE_KERNEL_ERROR_NO_MEMORY, "failed to allocate %i bytes of pool data", vplSize);
VPL *vpl = new VPL;
SceUID id = kernelObjects.Create(vpl);
@ -1542,7 +1552,10 @@ int sceKernelDeleteVpl(SceUID uid)
if (wokeThreads)
hleReSchedule("vpl deleted");
userMemory.Free(vpl->address);
BlockAllocator *alloc = BlockAllocatorFromAddr(vpl->address);
_assert_msg_(alloc != nullptr, "Should always have a valid allocator/address");
if (alloc)
alloc->Free(vpl->address);
kernelObjects.Destroy<VPL>(uid);
return 0;
}
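The deletion path above now recovers the owning allocator from the block's address instead of assuming userMemory. A sketch of what an address-based lookup like BlockAllocatorFromAddr might do; the ranges and IDs below are purely illustrative, not PPSSPP's actual partition map:

#include <cstdint>

// Hypothetical address-based allocator lookup: pick the partition whose
// address range contains addr. Ranges/IDs here are illustrative only.
struct AllocatorRange {
    uint32_t start, end;
    int allocatorId;
};

inline int AllocatorIdFromAddr(uint32_t addr) {
    static const AllocatorRange kRanges[] = {
        { 0x08800000, 0x0A000000, 2 },  // user memory (illustrative)
        { 0x04000000, 0x04200000, 5 },  // VRAM (illustrative)
    };
    for (const AllocatorRange &r : kRanges) {
        if (addr >= r.start && addr < r.end)
            return r.allocatorId;
    }
    return -1;  // mirrors the nullptr case the _assert_msg_ above guards
}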
@ -2044,29 +2057,17 @@ void __KernelTlsplThreadEnd(SceUID threadID)
tlsplThreadEndChecks.erase(locked.first, locked.second);
}
SceUID sceKernelCreateTlspl(const char *name, u32 partition, u32 attr, u32 blockSize, u32 count, u32 optionsPtr)
{
SceUID sceKernelCreateTlspl(const char *name, u32 partition, u32 attr, u32 blockSize, u32 count, u32 optionsPtr) {
if (!name)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateTlspl(): invalid name", SCE_KERNEL_ERROR_NO_MEMORY);
return SCE_KERNEL_ERROR_NO_MEMORY;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_NO_MEMORY, "invalid name");
if ((attr & ~PSP_TLSPL_ATTR_KNOWN) >= 0x100)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateTlspl(): invalid attr parameter: %08x", SCE_KERNEL_ERROR_ILLEGAL_ATTR, attr);
return SCE_KERNEL_ERROR_ILLEGAL_ATTR;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ATTR, "invalid attr parameter: %08x", attr);
if (partition < 1 || partition > 9 || partition == 7)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateTlspl(): invalid partition %d", SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, partition);
return SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT;
}
// We only support user right now.
if (partition != 2 && partition != 6)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateTlspl(): invalid partition %d", SCE_KERNEL_ERROR_ILLEGAL_PERM, partition);
return SCE_KERNEL_ERROR_ILLEGAL_PERM;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, "invalid partition %d", partition);
BlockAllocator *allocator = BlockAllocatorFromID(partition);
if (allocator == nullptr)
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_PERM, "invalid partition %x", partition);
// There's probably a simpler way to get this same basic formula...
// This is based on results from a PSP.
@ -2076,41 +2077,29 @@ SceUID sceKernelCreateTlspl(const char *name, u32 partition, u32 attr, u32 block
if (!illegalMemSize && (u64) count >= 0x100000000ULL / (((u64) blockSize + 3ULL) & ~3ULL))
illegalMemSize = true;
if (illegalMemSize)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateTlspl(): invalid blockSize/count", SCE_KERNEL_ERROR_ILLEGAL_MEMSIZE);
return SCE_KERNEL_ERROR_ILLEGAL_MEMSIZE;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_MEMSIZE, "invalid blockSize/count");
int index = -1;
for (int i = 0; i < TLSPL_NUM_INDEXES; ++i)
if (tlsplUsedIndexes[i] == false)
{
for (int i = 0; i < TLSPL_NUM_INDEXES; ++i) {
if (tlsplUsedIndexes[i] == false) {
index = i;
break;
}
}
if (index == -1)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateTlspl(): ran out of indexes for TLS pools", PSP_ERROR_TOO_MANY_TLSPL);
return PSP_ERROR_TOO_MANY_TLSPL;
}
return hleLogWarning(SCEKERNEL, PSP_ERROR_TOO_MANY_TLSPL, "ran out of indexes for TLS pools");
// Unless otherwise specified, we align to 4 bytes (a MIPS word).
u32 alignment = 4;
if (optionsPtr != 0)
{
u32 size = Memory::Read_U32(optionsPtr);
if (size > 8)
WARN_LOG_REPORT(SCEKERNEL, "sceKernelCreateTlspl(%s) unsupported options parameter, size = %d", name, size);
if (Memory::IsValidRange(optionsPtr, 4)) {
u32 size = Memory::ReadUnchecked_U32(optionsPtr);
if (size >= 8)
alignment = Memory::Read_U32(optionsPtr + 4);
// Note that 0 is intentionally allowed.
if ((alignment & (alignment - 1)) != 0)
{
ERROR_LOG_REPORT(SCEKERNEL, "sceKernelCreateTlspl(%s): alignment is not a power of 2: %d", name, alignment);
return SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT;
}
return hleLogError(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, "alignment is not a power of 2: %d", alignment);
// This goes for 0, 1, and 2. Can't have less than 4 byte alignment.
if (alignment < 4)
alignment = 4;
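Two bit tricks carry this function: the blockSize/count guard near the top of the hunk, and the power-of-two alignment handling just above. A standalone restatement of both (a sketch; blockSize == 0 is presumably caught by the earlier illegalMemSize setup outside this hunk):

#include <cassert>
#include <cstdint>

// The blockSize/count guard: with the block size rounded up to the minimum
// 4-byte alignment, count must stay below 2^32 / alignedBlock or the later
// alignedSize * count computation would wrap around 32 bits.
static bool TlsplSizeIsIllegal(uint32_t blockSize, uint32_t count) {
    uint64_t alignedBlock = (uint64_t(blockSize) + 3ull) & ~3ull;
    return alignedBlock == 0 || uint64_t(count) >= 0x100000000ull / alignedBlock;
}

// (x & (x - 1)) == 0 clears the lowest set bit, so it holds only for powers
// of two (and zero, which the code above deliberately allows, then clamps
// to 4 along with 1 and 2).
static bool IsPow2OrZero(uint32_t x) {
    return (x & (x - 1)) == 0;
}

// Rounds size up to a multiple of a power-of-two alignment, mirroring the
// alignedSize computation in the next hunk.
static uint32_t AlignUp(uint32_t size, uint32_t alignment) {
    assert(IsPow2OrZero(alignment) && alignment >= 4);
    return (size + alignment - 1) & ~(alignment - 1);
}

Example: blockSize 16 allows counts up to 0x0FFFFFFF, since 16 * 0x10000000 is exactly 2^32.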
@ -2120,16 +2109,13 @@ SceUID sceKernelCreateTlspl(const char *name, u32 partition, u32 attr, u32 block
u32 alignedSize = (blockSize + alignment - 1) & ~(alignment - 1);
u32 totalSize = alignedSize * count;
u32 blockPtr = userMemory.Alloc(totalSize, (attr & PSP_TLSPL_ATTR_HIGHMEM) != 0, name);
u32 blockPtr = allocator->Alloc(totalSize, (attr & PSP_TLSPL_ATTR_HIGHMEM) != 0, name);
#ifdef _DEBUG
userMemory.ListBlocks();
allocator->ListBlocks();
#endif
if (blockPtr == (u32) -1)
{
ERROR_LOG(SCEKERNEL, "%08x=sceKernelCreateTlspl(%s, %d, %08x, %d, %d, %08x): failed to allocate memory", SCE_KERNEL_ERROR_NO_MEMORY, name, partition, attr, blockSize, count, optionsPtr);
return SCE_KERNEL_ERROR_NO_MEMORY;
}
if (blockPtr == (u32)-1)
return hleLogError(SCEKERNEL, SCE_KERNEL_ERROR_NO_MEMORY, "failed to allocate memory");
TLSPL *tls = new TLSPL();
SceUID id = kernelObjects.Create(tls);
@ -2148,9 +2134,7 @@ SceUID sceKernelCreateTlspl(const char *name, u32 partition, u32 attr, u32 block
tls->alignment = alignment;
tls->usage.resize(count, 0);
WARN_LOG(SCEKERNEL, "%08x=sceKernelCreateTlspl(%s, %d, %08x, %d, %d, %08x)", id, name, partition, attr, blockSize, count, optionsPtr);
return id;
return hleLogSuccessInfoI(SCEKERNEL, id);
}
int sceKernelDeleteTlspl(SceUID uid)
@ -2178,7 +2162,10 @@ int sceKernelDeleteTlspl(SceUID uid)
HLEKernel::ResumeFromWait(threadID, WAITTYPE_TLSPL, uid, 0);
hleReSchedule("deleted tlspl");
userMemory.Free(tls->address);
BlockAllocator *allocator = BlockAllocatorFromAddr(tls->address);
_assert_msg_(allocator != nullptr, "Should always have a valid allocator/address");
if (allocator)
allocator->Free(tls->address);
tlsplUsedIndexes[tls->ntls.index] = false;
kernelObjects.Destroy<TLSPL>(uid);
}

View File

@ -40,6 +40,10 @@ KernelObject *__KernelMemoryVPLObject();
KernelObject *__KernelMemoryPMBObject();
KernelObject *__KernelTlsplObject();
BlockAllocator *BlockAllocatorFromID(int id);
int BlockAllocatorToID(const BlockAllocator *alloc);
BlockAllocator *BlockAllocatorFromAddr(u32 addr);
SceUID sceKernelCreateVpl(const char *name, int partition, u32 attr, u32 vplSize, u32 optPtr);
int sceKernelDeleteVpl(SceUID uid);
int sceKernelAllocateVpl(SceUID uid, u32 size, u32 addrPtr, u32 timeoutPtr);

View File

@ -140,10 +140,13 @@ struct MsgPipe : public KernelObject
int GetIDType() const override { return SCE_KERNEL_TMID_Mpipe; }
MsgPipe() : buffer(0) {}
~MsgPipe()
{
if (buffer != 0)
userMemory.Free(buffer);
~MsgPipe() {
if (buffer != 0) {
BlockAllocator *alloc = BlockAllocatorFromAddr(buffer);
_assert_msg_(alloc != nullptr, "Should always have a valid allocator/address");
if (alloc)
alloc->Free(buffer);
}
}
u32 GetUsedSize()
@ -667,41 +670,26 @@ void __KernelMsgPipeDoState(PointerWrap &p)
CoreTiming::RestoreRegisterEvent(waitTimer, "MsgPipeTimeout", __KernelMsgPipeTimeout);
}
int sceKernelCreateMsgPipe(const char *name, int partition, u32 attr, u32 size, u32 optionsPtr)
{
int sceKernelCreateMsgPipe(const char *name, int partition, u32 attr, u32 size, u32 optionsPtr) {
if (!name)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateMsgPipe(): invalid name", SCE_KERNEL_ERROR_NO_MEMORY);
return SCE_KERNEL_ERROR_NO_MEMORY;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_NO_MEMORY, "invalid name");
if (partition < 1 || partition > 9 || partition == 7)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateMsgPipe(): invalid partition %d", SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, partition);
return SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT;
}
// We only support user right now.
if (partition != 2 && partition != 6)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateMsgPipe(): invalid partition %d", SCE_KERNEL_ERROR_ILLEGAL_PERM, partition);
return SCE_KERNEL_ERROR_ILLEGAL_PERM;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ARGUMENT, "invalid partition %d", partition);
BlockAllocator *allocator = BlockAllocatorFromID(partition);
if (allocator == nullptr)
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_PERM, "invalid partition %d", partition);
if ((attr & ~SCE_KERNEL_MPA_KNOWN) >= 0x100)
{
WARN_LOG_REPORT(SCEKERNEL, "%08x=sceKernelCreateEventFlag(%s): invalid attr parameter: %08x", SCE_KERNEL_ERROR_ILLEGAL_ATTR, name, attr);
return SCE_KERNEL_ERROR_ILLEGAL_ATTR;
}
return hleLogWarning(SCEKERNEL, SCE_KERNEL_ERROR_ILLEGAL_ATTR, "invalid attr parameter: %08x", attr);
u32 memBlockPtr = 0;
if (size != 0)
{
if (size != 0) {
// We ignore the upalign to 256.
u32 allocSize = size;
memBlockPtr = userMemory.Alloc(allocSize, (attr & SCE_KERNEL_MPA_HIGHMEM) != 0, "MsgPipe");
memBlockPtr = allocator->Alloc(allocSize, (attr & SCE_KERNEL_MPA_HIGHMEM) != 0, "MsgPipe");
if (memBlockPtr == (u32)-1)
{
ERROR_LOG(SCEKERNEL, "%08x=sceKernelCreateEventFlag(%s): Failed to allocate %i bytes for buffer", SCE_KERNEL_ERROR_NO_MEMORY, name, size);
return SCE_KERNEL_ERROR_NO_MEMORY;
}
return hleLogError(SCEKERNEL, SCE_KERNEL_ERROR_NO_MEMORY, "failed to allocate %i bytes for buffer", size);
}
MsgPipe *m = new MsgPipe();

View File

@ -744,11 +744,14 @@ static int sceUtilityGamedataInstallInitStart(u32 paramsAddr) {
}
ActivateDialog(UtilityDialogType::GAMEDATAINSTALL);
return hleLogSuccessInfoX(SCEUTILITY, gamedataInstallDialog->Init(paramsAddr));
int result = gamedataInstallDialog->Init(paramsAddr);
if (result < 0)
DeactivateDialog();
return hleLogSuccessInfoX(SCEUTILITY, result);
}
static int sceUtilityGamedataInstallShutdownStart() {
if (currentDialogType != UtilityDialogType::GAMEDATAINSTALL) {
if (!currentDialogActive || currentDialogType != UtilityDialogType::GAMEDATAINSTALL) {
return hleLogWarning(SCEUTILITY, SCE_ERROR_UTILITY_WRONG_TYPE, "wrong dialog type");
}
@ -757,7 +760,7 @@ static int sceUtilityGamedataInstallShutdownStart() {
}
static int sceUtilityGamedataInstallUpdate(int animSpeed) {
if (currentDialogType != UtilityDialogType::GAMEDATAINSTALL) {
if (!currentDialogActive || currentDialogType != UtilityDialogType::GAMEDATAINSTALL) {
return hleLogWarning(SCEUTILITY, SCE_ERROR_UTILITY_WRONG_TYPE, "wrong dialog type");
}
@ -765,8 +768,9 @@ static int sceUtilityGamedataInstallUpdate(int animSpeed) {
}
static int sceUtilityGamedataInstallGetStatus() {
if (currentDialogType != UtilityDialogType::GAMEDATAINSTALL) {
if (!currentDialogActive || currentDialogType != UtilityDialogType::GAMEDATAINSTALL) {
// This is called incorrectly all the time by some games. So let's not bother warning.
hleEatCycles(200);
return hleLogDebug(SCEUTILITY, SCE_ERROR_UTILITY_WRONG_TYPE, "wrong dialog type");
}
@ -776,7 +780,7 @@ static int sceUtilityGamedataInstallGetStatus() {
}
static int sceUtilityGamedataInstallAbort() {
if (currentDialogType != UtilityDialogType::GAMEDATAINSTALL) {
if (!currentDialogActive || currentDialogType != UtilityDialogType::GAMEDATAINSTALL) {
return hleLogWarning(SCEUTILITY, SCE_ERROR_UTILITY_WRONG_TYPE, "wrong dialog type");
}

View File

@ -413,11 +413,11 @@ bool PSP_InitStart(const CoreParameter &coreParam, std::string *error_string) {
}
#if defined(_WIN32) && PPSSPP_ARCH(AMD64)
INFO_LOG(BOOT, "PPSSPP %s Windows 64 bit", PPSSPP_GIT_VERSION);
NOTICE_LOG(BOOT, "PPSSPP %s Windows 64 bit", PPSSPP_GIT_VERSION);
#elif defined(_WIN32) && !PPSSPP_ARCH(AMD64)
INFO_LOG(BOOT, "PPSSPP %s Windows 32 bit", PPSSPP_GIT_VERSION);
NOTICE_LOG(BOOT, "PPSSPP %s Windows 32 bit", PPSSPP_GIT_VERSION);
#else
INFO_LOG(BOOT, "PPSSPP %s", PPSSPP_GIT_VERSION);
NOTICE_LOG(BOOT, "PPSSPP %s", PPSSPP_GIT_VERSION);
#endif
Core_NotifyLifecycle(CoreLifecycle::STARTING);

View File

@ -19,6 +19,7 @@
#include "Common/Data/Convert/ColorConv.h"
#include "Common/Profiler/Profiler.h"
#include "Common/LogReporting.h"
#include "Core/Config.h"
#include "GPU/Common/DrawEngineCommon.h"
#include "GPU/Common/SplineCommon.h"
@ -188,6 +189,57 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
return DrawEngineCommon::NormalizeVertices(outPtr, bufPtr, inPtr, dec, lowerBound, upperBound, vertType);
}
void DrawEngineCommon::DispatchSubmitImm(GEPrimitiveType prim, TransformedVertex *buffer, int vertexCount, int cullMode, bool continuation) {
// Instead of plumbing through properly (we'd need to inject these pretransformed vertices in the middle
// of SoftwareTransform(), which would take a lot of refactoring), we'll cheat and just turn these into
// through vertices.
// Since the only known use is Thrillville and it only uses it to clear, we just use color and pos.
struct ImmVertex {
float uv[2];
uint32_t color;
float xyz[3];
};
std::vector<ImmVertex> temp;
temp.resize(vertexCount);
uint32_t color1Used = 0;
for (int i = 0; i < vertexCount; i++) {
// Since we're sending through, scale back up to w/h.
temp[i].uv[0] = buffer[i].u * gstate.getTextureWidth(0);
temp[i].uv[1] = buffer[i].v * gstate.getTextureHeight(0);
temp[i].color = buffer[i].color0_32;
temp[i].xyz[0] = buffer[i].pos[0];
temp[i].xyz[1] = buffer[i].pos[1];
temp[i].xyz[2] = buffer[i].pos[2];
color1Used |= buffer[i].color1_32;
}
int vtype = GE_VTYPE_TC_FLOAT | GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_8888 | GE_VTYPE_THROUGH;
// TODO: Handle fog and secondary color somehow?
if (gstate.isFogEnabled() && !gstate.isModeThrough()) {
WARN_LOG_REPORT_ONCE(geimmfog, G3D, "Imm vertex used fog");
}
if (color1Used != 0 && gstate.isUsingSecondaryColor() && !gstate.isModeThrough()) {
WARN_LOG_REPORT_ONCE(geimmcolor1, G3D, "Imm vertex used secondary color");
}
bool prevThrough = gstate.isModeThrough();
// Code checks this reg directly, not just the vtype ID.
if (!prevThrough) {
gstate.vertType |= GE_VTYPE_THROUGH;
gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_UVSCALEOFFSET | DIRTY_CULLRANGE);
}
int bytesRead;
uint32_t vertTypeID = GetVertTypeID(vtype, 0);
SubmitPrim(&temp[0], nullptr, prim, vertexCount, vertTypeID, cullMode, &bytesRead);
DispatchFlush();
if (!prevThrough) {
gstate.vertType &= ~GE_VTYPE_THROUGH;
gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_UVSCALEOFFSET | DIRTY_CULLRANGE);
}
}
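Worth noting: the temporary layout above has to match the vtype declared inside the function (GE_VTYPE_TC_FLOAT | GE_VTYPE_COL_8888 | GE_VTYPE_POS_FLOAT), i.e. texcoord, then color, then position, tightly packed. A standalone sketch of that dependency (PPSSPP does not currently assert this; illustrative only):

#include <cstdint>

// Mirrors the ImmVertex struct above: TC_FLOAT (8 bytes) + COL_8888 (4) +
// POS_FLOAT (12) must pack to exactly 24 bytes with no padding for the
// through-mode vertex decoder to read it back correctly.
struct ImmVertexSketch {
    float uv[2];
    uint32_t color;
    float xyz[3];
};
static_assert(sizeof(ImmVertexSketch) == 24,
              "vtype TC_FLOAT|COL_8888|POS_FLOAT expects tight packing");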
// This code has plenty of potential for optimization.
//
// It does the simplest and safest test possible: If all points of a bbox are outside a single of
@ -484,12 +536,12 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
return GE_VTYPE_TC_FLOAT | GE_VTYPE_COL_8888 | GE_VTYPE_NRM_FLOAT | GE_VTYPE_POS_FLOAT | (vertType & (GE_VTYPE_IDX_MASK | GE_VTYPE_THROUGH));
}
void DrawEngineCommon::ApplyFramebufferRead(bool *fboTexNeedsBind) {
void DrawEngineCommon::ApplyFramebufferRead(FBOTexState *fboTexState) {
if (gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH)) {
*fboTexNeedsBind = false;
*fboTexState = FBO_TEX_READ_FRAMEBUFFER;
} else {
gpuStats.numCopiesForShaderBlend++;
*fboTexNeedsBind = true;
*fboTexState = FBO_TEX_COPY_BIND_TEX;
}
gstate_c.Dirty(DIRTY_SHADERBLEND);

View File

@ -46,6 +46,12 @@ enum {
TEX_SLOT_SPLINE_WEIGHTS_V = 6,
};
enum FBOTexState {
FBO_TEX_NONE,
FBO_TEX_COPY_BIND_TEX,
FBO_TEX_READ_FRAMEBUFFER,
};
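A sketch of the consumer side of this enum, mirroring the D3D11/D3D9 ApplyDrawState hunks later in this commit: only the copy-and-bind state needs backend work, and the backends reset the state to FBO_TEX_NONE once it has been handled.

#include <cstdio>

// Self-contained copy of the enum above, for illustration only.
enum FBOTexState { FBO_TEX_NONE, FBO_TEX_COPY_BIND_TEX, FBO_TEX_READ_FRAMEBUFFER };

static void HandleFBOTexState(FBOTexState &state) {
    switch (state) {
    case FBO_TEX_NONE:
        break;  // no framebuffer read this draw
    case FBO_TEX_COPY_BIND_TEX:
        printf("copy current render target, bind it as a texture (slot 1)\n");
        state = FBO_TEX_NONE;  // handled; reset like the backends do
        break;
    case FBO_TEX_READ_FRAMEBUFFER:
        printf("shader fetches the framebuffer directly; nothing to bind\n");
        state = FBO_TEX_NONE;
        break;
    }
}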
inline uint32_t GetVertTypeID(uint32_t vertType, int uvGenMode) {
// As the decoder depends on the UVGenMode when we use UV prescale, we simply mash it
// into the top of the verttype where there are unused bits.
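The function body is cut off by the hunk here; presumably (an assumption, sketched for clarity, not copied from the source) the packing the comment describes looks something like:

#include <cstdint>

// Hypothetical sketch: keep the low 24 bits of the vertex type and stash
// uvGenMode in the otherwise-unused top byte, so decoders keyed by this ID
// differ whenever UV prescale behavior differs.
inline uint32_t GetVertTypeIDSketch(uint32_t vertType, int uvGenMode) {
    return (vertType & 0x00FFFFFF) | (uint32_t(uvGenMode) << 24);
}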
@ -84,10 +90,7 @@ public:
SubmitPrim(verts, inds, prim, vertexCount, vertTypeID, cullMode, bytesRead);
}
virtual void DispatchSubmitImm(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead) {
SubmitPrim(verts, inds, prim, vertexCount, vertTypeID, cullMode, bytesRead);
DispatchFlush();
}
virtual void DispatchSubmitImm(GEPrimitiveType prim, TransformedVertex *buffer, int vertexCount, int cullMode, bool continuation);
bool TestBoundingBox(const void* control_points, int vertexCount, u32 vertType, int *bytesRead);
@ -130,7 +133,7 @@ protected:
// Vertex decoding
void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts);
void ApplyFramebufferRead(bool *fboTexNeedsBind);
void ApplyFramebufferRead(FBOTexState *fboTexState);
inline int IndexSize(u32 vtype) const {
const u32 indexType = (vtype & GE_VTYPE_IDX_MASK);

View File

@ -134,10 +134,12 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
GELogicOp replaceLogicOpType = isModeClear ? GE_LOGIC_COPY : (GELogicOp)id.Bits(FS_BIT_REPLACE_LOGIC_OP, 4);
bool replaceLogicOp = replaceLogicOpType != GE_LOGIC_COPY && compat.bitwiseOps;
bool readFramebuffer = replaceBlend == REPLACE_BLEND_READ_FRAMEBUFFER || colorWriteMask || replaceLogicOp;
bool readFramebufferTex = readFramebuffer && !gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH);
bool needFramebufferRead = replaceBlend == REPLACE_BLEND_READ_FRAMEBUFFER || colorWriteMask || replaceLogicOp;
bool needFragCoord = readFramebuffer || gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT);
bool fetchFramebuffer = needFramebufferRead && gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH);
bool readFramebufferTex = needFramebufferRead && !gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH);
bool needFragCoord = readFramebufferTex || gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT);
bool writeDepth = gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT);
if (shaderDepalMode != ShaderDepalMode::OFF && !doTexture) {
@ -157,6 +159,11 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
if (readFramebufferTex) {
WRITE(p, "layout (binding = 1) uniform sampler2D fbotex;\n");
} else if (fetchFramebuffer) {
WRITE(p, "layout (input_attachment_index = 0, binding = 9) uniform subpassInput inputColor;\n");
if (fragmentShaderFlags) {
*fragmentShaderFlags |= FragmentShaderFlags::INPUT_ATTACHMENT;
}
}
if (shaderDepalMode != ShaderDepalMode::OFF) {
@ -416,7 +423,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
if (!strcmp(compat.fragColor0, "fragColor0")) {
const char *qualifierColor0 = "out";
if (readFramebuffer && compat.lastFragData && !strcmp(compat.lastFragData, compat.fragColor0)) {
if (fetchFramebuffer && compat.lastFragData && !strcmp(compat.lastFragData, compat.fragColor0)) {
qualifierColor0 = "inout";
}
// Output the output color definitions.
@ -492,20 +499,26 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
}
// Two things read from the old framebuffer - shader replacement blending and bit-level masking.
if (readFramebuffer) {
if (readFramebufferTex) {
if (compat.shaderLanguage == HLSL_D3D11) {
WRITE(p, " vec4 destColor = fbotex.Load(int3((int)gl_FragCoord.x, (int)gl_FragCoord.y, 0));\n");
} else if (compat.shaderLanguage == HLSL_D3D9) {
WRITE(p, " vec4 destColor = tex2D(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture);
} else if (gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH)) {
// If we have EXT_shader_framebuffer_fetch / ARM_shader_framebuffer_fetch, we skip the blit.
// We can just read the prev value more directly.
WRITE(p, " lowp vec4 destColor = %s;\n", compat.lastFragData);
} else if (!compat.texelFetch) {
WRITE(p, " lowp vec4 destColor = %s(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture);
} else {
WRITE(p, " lowp vec4 destColor = %s(fbotex, ivec2(gl_FragCoord.x, gl_FragCoord.y), 0);\n", compat.texelFetch);
}
} else if (fetchFramebuffer) {
// If we have EXT_shader_framebuffer_fetch / ARM_shader_framebuffer_fetch, we skip the blit.
// We can just read the prev value more directly.
if (compat.shaderLanguage == GLSL_3xx) {
WRITE(p, " lowp vec4 destColor = %s;\n", compat.lastFragData);
} else if (compat.shaderLanguage == GLSL_VULKAN) {
WRITE(p, " lowp vec4 destColor = subpassLoad(inputColor);\n", compat.lastFragData);
} else {
_assert_msg_(false, "Need fetch destColor, but not a compatible language");
}
}
if (isModeClear) {

View File

@ -42,7 +42,7 @@ struct FShaderID;
// Can technically be deduced from the fragment shader ID, but this is safer.
enum class FragmentShaderFlags : u32 {
FS_FLAG_INPUT_ATTACHMENT = 1,
INPUT_ATTACHMENT = 1,
};
ENUM_CLASS_BITOPS(FragmentShaderFlags);

View File

@ -231,6 +231,7 @@ StencilValueType ReplaceAlphaWithStencilType() {
case GE_FORMAT_8888:
case GE_FORMAT_INVALID:
case GE_FORMAT_DEPTH16:
case GE_FORMAT_CLUT8:
switch (gstate.getStencilOpZPass()) {
case GE_STENCILOP_REPLACE:
// TODO: Could detect zero here and force ZERO - less uniform updates?
@ -859,66 +860,63 @@ static inline bool blendColorSimilar(uint32_t a, uint32_t b, int margin = 25) {
static bool SimulateLogicOpIfNeeded(BlendFactor &srcBlend, BlendFactor &dstBlend, BlendEq &blendEq) {
// Note: our shader solution applies logic ops BEFORE blending, not correctly after.
// This is however fine for the most common ones, like CLEAR/NOOP/SET, etc.
if (!gstate_c.Supports(GPU_SUPPORTS_LOGIC_OP)) {
if (gstate.isLogicOpEnabled()) {
switch (gstate.getLogicOp()) {
case GE_LOGIC_CLEAR:
srcBlend = BlendFactor::ZERO;
dstBlend = BlendFactor::ZERO;
blendEq = BlendEq::ADD;
return true;
case GE_LOGIC_AND:
case GE_LOGIC_AND_REVERSE:
WARN_LOG_REPORT_ONCE(d3dLogicOpAnd, G3D, "Unsupported AND logic op: %x", gstate.getLogicOp());
break;
case GE_LOGIC_COPY:
// This is the same as off.
break;
case GE_LOGIC_COPY_INVERTED:
// Handled in the shader.
break;
case GE_LOGIC_AND_INVERTED:
case GE_LOGIC_NOR:
case GE_LOGIC_NAND:
case GE_LOGIC_EQUIV:
// Handled in the shader.
WARN_LOG_REPORT_ONCE(d3dLogicOpAndInverted, G3D, "Attempted invert for logic op: %x", gstate.getLogicOp());
break;
case GE_LOGIC_INVERTED:
srcBlend = BlendFactor::ONE;
dstBlend = BlendFactor::ONE;
blendEq = BlendEq::SUBTRACT;
WARN_LOG_REPORT_ONCE(d3dLogicOpInverted, G3D, "Attempted inverse for logic op: %x", gstate.getLogicOp());
return true;
case GE_LOGIC_NOOP:
srcBlend = BlendFactor::ZERO;
dstBlend = BlendFactor::ONE;
blendEq = BlendEq::ADD;
return true;
case GE_LOGIC_XOR:
WARN_LOG_REPORT_ONCE(d3dLogicOpOrXor, G3D, "Unsupported XOR logic op: %x", gstate.getLogicOp());
break;
case GE_LOGIC_OR:
case GE_LOGIC_OR_INVERTED:
// Inverted in shader.
srcBlend = BlendFactor::ONE;
dstBlend = BlendFactor::ONE;
blendEq = BlendEq::ADD;
WARN_LOG_REPORT_ONCE(d3dLogicOpOr, G3D, "Attempted or for logic op: %x", gstate.getLogicOp());
return true;
case GE_LOGIC_OR_REVERSE:
WARN_LOG_REPORT_ONCE(d3dLogicOpOrReverse, G3D, "Unsupported OR REVERSE logic op: %x", gstate.getLogicOp());
break;
case GE_LOGIC_SET:
srcBlend = BlendFactor::ONE;
dstBlend = BlendFactor::ONE;
blendEq = BlendEq::ADD;
WARN_LOG_REPORT_ONCE(d3dLogicOpSet, G3D, "Attempted set for logic op: %x", gstate.getLogicOp());
return true;
}
if (!gstate_c.Supports(GPU_SUPPORTS_LOGIC_OP) && gstate.isLogicOpEnabled()) {
switch (gstate.getLogicOp()) {
case GE_LOGIC_CLEAR:
srcBlend = BlendFactor::ZERO;
dstBlend = BlendFactor::ZERO;
blendEq = BlendEq::ADD;
return true;
case GE_LOGIC_AND:
case GE_LOGIC_AND_REVERSE:
WARN_LOG_REPORT_ONCE(d3dLogicOpAnd, G3D, "Unsupported AND logic op: %x", gstate.getLogicOp());
break;
case GE_LOGIC_COPY:
// This is the same as off.
break;
case GE_LOGIC_COPY_INVERTED:
// Handled in the shader.
break;
case GE_LOGIC_AND_INVERTED:
case GE_LOGIC_NOR:
case GE_LOGIC_NAND:
case GE_LOGIC_EQUIV:
// Handled in the shader.
WARN_LOG_REPORT_ONCE(d3dLogicOpAndInverted, G3D, "Attempted invert for logic op: %x", gstate.getLogicOp());
break;
case GE_LOGIC_INVERTED:
srcBlend = BlendFactor::ONE;
dstBlend = BlendFactor::ONE;
blendEq = BlendEq::SUBTRACT;
WARN_LOG_REPORT_ONCE(d3dLogicOpInverted, G3D, "Attempted inverse for logic op: %x", gstate.getLogicOp());
return true;
case GE_LOGIC_NOOP:
srcBlend = BlendFactor::ZERO;
dstBlend = BlendFactor::ONE;
blendEq = BlendEq::ADD;
return true;
case GE_LOGIC_XOR:
WARN_LOG_REPORT_ONCE(d3dLogicOpOrXor, G3D, "Unsupported XOR logic op: %x", gstate.getLogicOp());
break;
case GE_LOGIC_OR:
case GE_LOGIC_OR_INVERTED:
// Inverted in shader.
srcBlend = BlendFactor::ONE;
dstBlend = BlendFactor::ONE;
blendEq = BlendEq::ADD;
WARN_LOG_REPORT_ONCE(d3dLogicOpOr, G3D, "Attempted or for logic op: %x", gstate.getLogicOp());
return true;
case GE_LOGIC_OR_REVERSE:
WARN_LOG_REPORT_ONCE(d3dLogicOpOrReverse, G3D, "Unsupported OR REVERSE logic op: %x", gstate.getLogicOp());
break;
case GE_LOGIC_SET:
srcBlend = BlendFactor::ONE;
dstBlend = BlendFactor::ONE;
blendEq = BlendEq::ADD;
WARN_LOG_REPORT_ONCE(d3dLogicOpSet, G3D, "Attempted set for logic op: %x", gstate.getLogicOp());
return true;
}
}
return false;
}
@ -1080,6 +1078,12 @@ static void ConvertBlendState(GenericBlendState &blendState, bool forceReplaceBl
case REPLACE_BLEND_NO:
// We may still want to do something about stencil -> alpha.
ApplyStencilReplaceAndLogicOpIgnoreBlend(replaceAlphaWithStencil, blendState);
if (forceReplaceBlend) {
// If this is true, the logic and mask replacements will be applied, at least. In that case,
// we should not apply any logic op simulation.
blendState.simulateLogicOpType = LOGICOPTYPE_NORMAL;
}
return;
case REPLACE_BLEND_BLUE_TO_ALPHA:

View File

@ -349,13 +349,10 @@ void ComputeFragmentShaderID(FShaderID *id_out, const ComputedPipelineState &pip
id.SetBit(FS_BIT_COLOR_WRITEMASK, colorWriteMask);
if (g_Config.bVendorBugChecksEnabled) {
if (bugs.Has(Draw::Bugs::NO_DEPTH_CANNOT_DISCARD_STENCIL)) {
id.SetBit(FS_BIT_NO_DEPTH_CANNOT_DISCARD_STENCIL, !IsStencilTestOutputDisabled() && !gstate.isDepthWriteEnabled());
} else if (bugs.Has(Draw::Bugs::MALI_STENCIL_DISCARD_BUG) && PSP_CoreParameter().compat.flags().MaliDepthStencilBugWorkaround) {
// Very similar driver bug to the Adreno one, with the same workaround (though might look into if there are cheaper ones!)
// Keeping the conditions separate since it can probably be made tighter.
id.SetBit(FS_BIT_NO_DEPTH_CANNOT_DISCARD_STENCIL, !IsStencilTestOutputDisabled() && (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled()));
if (g_Config.bVendorBugChecksEnabled && bugs.Has(Draw::Bugs::NO_DEPTH_CANNOT_DISCARD_STENCIL)) {
bool stencilWithoutDepth = !IsStencilTestOutputDisabled() && (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled());
if (stencilWithoutDepth) {
id.SetBit(FS_BIT_NO_DEPTH_CANNOT_DISCARD_STENCIL, stencilWithoutDepth);
}
}
}

View File

@ -171,6 +171,7 @@ bool FramebufferManagerCommon::PerformStencilUpload(u32 addr, int size, StencilU
break;
case GE_FORMAT_INVALID:
case GE_FORMAT_DEPTH16:
case GE_FORMAT_CLUT8:
// Inconceivable.
_assert_(false);
break;

View File

@ -37,6 +37,7 @@
#include "GPU/Common/ShaderId.h"
#include "GPU/Common/GPUStateUtils.h"
#include "GPU/Debugger/Debugger.h"
#include "GPU/Debugger/Record.h"
#include "GPU/GPUCommon.h"
#include "GPU/GPUInterface.h"
#include "GPU/GPUState.h"
@ -292,11 +293,18 @@ SamplerCacheKey TextureCacheCommon::GetSamplingParams(int maxLevel, const TexCac
SamplerCacheKey TextureCacheCommon::GetFramebufferSamplingParams(u16 bufferWidth, u16 bufferHeight) {
SamplerCacheKey key = GetSamplingParams(0, nullptr);
// In case auto max quality was on, restore min filt. Another fix for water in Outrun.
if (g_Config.iTexFiltering == TEX_FILTER_AUTO_MAX_QUALITY) {
int minFilt = gstate.texfilter & 0x7;
key.minFilt = minFilt & 1;
}
// Kill any mipmapping settings.
key.mipEnable = false;
key.mipFilt = false;
key.aniso = 0.0;
key.maxLevel = 0.0f;
key.lodBias = 0.0f;
// Often the framebuffer will not match the texture size. We'll wrap/clamp in the shader in that case.
int w = gstate.getTextureWidth(0);
@ -1260,14 +1268,17 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
// It's possible for a game to load CLUT outside valid memory without crashing, should result in zeroes.
u32 bytes = Memory::ValidSize(clutAddr, loadBytes);
if (clutRenderAddress_ != 0xFFFFFFFF && PSP_CoreParameter().compat.flags().AllowDownloadCLUT) {
bool performDownload = PSP_CoreParameter().compat.flags().AllowDownloadCLUT;
if (GPURecord::IsActive())
performDownload = true;
if (clutRenderAddress_ != 0xFFFFFFFF && performDownload) {
framebufferManager_->DownloadFramebufferForClut(clutRenderAddress_, clutRenderOffset_ + bytes);
Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
if (bytes < loadBytes) {
memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
}
} else {
// Here we could check for clutRenderAddres_ != 0xFFFFFFFF and zero the CLUT or something,
// Here we could check for clutRenderAddress_ != 0xFFFFFFFF and zero the CLUT or something,
// but choosing not to for now. Though the results of loading the CLUT from RAM here is
// almost certainly going to be bogus.
#ifdef _M_SSE
@ -1986,6 +1997,9 @@ static bool CanDepalettize(GETextureFormat texFormat, GEBufferFormat bufferForma
return true;
}
break;
case GE_FORMAT_CLUT8:
// Shouldn't happen here.
return false;
}
WARN_LOG(G3D, "Invalid CLUT/framebuffer combination: %s vs %s", GeTextureFormatToString(texFormat), GeBufferFormatToString(bufferFormat));
return false;

View File

@ -872,22 +872,14 @@ void VertexDecoderJitCache::Jit_NormalFloat() {
STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
}
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS8Through() {
DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
_dbg_assert_msg_(fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
_dbg_assert_msg_(fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
// TODO: SIMD
LDRSB(tempReg1, srcReg, dec_->posoff);
LDRSB(tempReg2, srcReg, dec_->posoff + 1);
LDRB(tempReg3, srcReg, dec_->posoff + 2);
static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
static const ARMReg fr[3] = { fpScratchReg, fpScratchReg2, fpScratchReg3 };
// 8-bit positions in throughmode always decode to 0, depth included.
VEOR(neonScratchReg, neonScratchReg, neonScratchReg);
VEOR(neonScratchReg2, neonScratchReg, neonScratchReg);
ADD(scratchReg, dstReg, dec_->decFmt.posoff);
VMOV(neonScratchReg, tempReg1, tempReg2);
VMOV(neonScratchReg2, tempReg3, tempReg3);
VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
VST1(F_32, neonScratchReg, scratchReg, 2, ALIGN_NONE);
}

View File

@ -668,15 +668,11 @@ void VertexDecoderJitCache::Jit_PosFloat() {
}
void VertexDecoderJitCache::Jit_PosS8Through() {
LDRSB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff);
LDRSB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 1);
LDRB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 2);
fp.SCVTF(fpScratchReg, tempReg1);
fp.SCVTF(fpScratchReg2, tempReg2);
fp.SCVTF(fpScratchReg3, tempReg3);
// 8-bit positions in throughmode always decode to 0, depth included.
fp.EOR(fpScratchReg, fpScratchReg, fpScratchReg);
STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff);
STR(INDEX_UNSIGNED, fpScratchReg2, dstReg, dec_->decFmt.posoff + 4);
STR(INDEX_UNSIGNED, fpScratchReg3, dstReg, dec_->decFmt.posoff + 8);
STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff + 4);
STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff + 8);
}
void VertexDecoderJitCache::Jit_PosS16Through() {

View File

@ -773,14 +773,20 @@ void VertexDecoder::Step_PosFloatSkin() const
Vec3ByMatrix43(pos, fn, skinMatrix);
}
void VertexDecoder::Step_PosS8Through() const
{
void VertexDecoder::Step_PosInvalid() const {
// Invalid positions are just culled. Simulate by forcing invalid values.
float *v = (float *)(decoded_ + decFmt.posoff);
const s8 *sv = (const s8 *)(ptr_ + posoff);
const u8 *uv = (const u8 *)(ptr_ + posoff);
v[0] = sv[0];
v[1] = sv[1];
v[2] = uv[2];
v[0] = std::numeric_limits<float>::infinity();
v[1] = std::numeric_limits<float>::infinity();
v[2] = std::numeric_limits<float>::infinity();
}
void VertexDecoder::Step_PosS8Through() const {
// 8-bit positions in throughmode always decode to 0, depth included.
float *v = (float *)(decoded_ + decFmt.posoff);
v[0] = 0;
v[1] = 0;
v[2] = 0;
}
void VertexDecoder::Step_PosS16Through() const
@ -1023,35 +1029,35 @@ static const StepFunction nrmstep_morphskin[4] = {
};
static const StepFunction posstep[4] = {
&VertexDecoder::Step_PosS8,
&VertexDecoder::Step_PosInvalid,
&VertexDecoder::Step_PosS8,
&VertexDecoder::Step_PosS16,
&VertexDecoder::Step_PosFloat,
};
static const StepFunction posstep_skin[4] = {
&VertexDecoder::Step_PosS8Skin,
&VertexDecoder::Step_PosInvalid,
&VertexDecoder::Step_PosS8Skin,
&VertexDecoder::Step_PosS16Skin,
&VertexDecoder::Step_PosFloatSkin,
};
static const StepFunction posstep_morph[4] = {
&VertexDecoder::Step_PosS8Morph,
&VertexDecoder::Step_PosInvalid,
&VertexDecoder::Step_PosS8Morph,
&VertexDecoder::Step_PosS16Morph,
&VertexDecoder::Step_PosFloatMorph,
};
static const StepFunction posstep_morph_skin[4] = {
&VertexDecoder::Step_PosS8MorphSkin,
&VertexDecoder::Step_PosInvalid,
&VertexDecoder::Step_PosS8MorphSkin,
&VertexDecoder::Step_PosS16MorphSkin,
&VertexDecoder::Step_PosFloatMorphSkin,
};
static const StepFunction posstep_through[4] = {
&VertexDecoder::Step_PosS8Through,
&VertexDecoder::Step_PosInvalid,
&VertexDecoder::Step_PosS8Through,
&VertexDecoder::Step_PosS16Through,
&VertexDecoder::Step_PosFloatThrough,
@ -1224,9 +1230,8 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
bool reportNoPos = false;
if (!pos) {
reportNoPos = true;
pos = 1;
}
if (pos) { // there's always a position
if (pos >= 0) { // there's always a position
size = align(size, posalign[pos]);
posoff = size;
size += possize[pos];

View File

@ -433,6 +433,7 @@ public:
void Step_PosS16MorphSkin() const;
void Step_PosFloatMorphSkin() const;
void Step_PosInvalid() const;
void Step_PosS8Through() const;
void Step_PosS16Through() const;
void Step_PosFloatThrough() const;

View File

@ -1345,14 +1345,9 @@ void VertexDecoderJitCache::Jit_NormalFloatSkin() {
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS8Through() {
DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
// SIMD doesn't really matter since this isn't useful on hardware.
XORPS(fpScratchReg, R(fpScratchReg));
for (int i = 0; i < 3; i++) {
if (i == 2)
MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i));
else
MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i));
CVTSI2SS(fpScratchReg, R(tempReg1));
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), fpScratchReg);
}
}

View File

@ -142,10 +142,11 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
if (gl_extensions.EXT_gpu_shader4) {
gl_exts.push_back("#extension GL_EXT_gpu_shader4 : enable");
}
if (gl_extensions.EXT_clip_cull_distance && id.Bit(VS_BIT_VERTEX_RANGE_CULLING)) {
bool useClamp = gstate_c.Supports(GPU_SUPPORTS_DEPTH_CLAMP) && !id.Bit(VS_BIT_IS_THROUGH);
if (gl_extensions.EXT_clip_cull_distance && (id.Bit(VS_BIT_VERTEX_RANGE_CULLING) || useClamp)) {
gl_exts.push_back("#extension GL_EXT_clip_cull_distance : enable");
}
if (gl_extensions.APPLE_clip_distance && id.Bit(VS_BIT_VERTEX_RANGE_CULLING)) {
if (gl_extensions.APPLE_clip_distance && (id.Bit(VS_BIT_VERTEX_RANGE_CULLING) || useClamp)) {
gl_exts.push_back("#extension GL_APPLE_clip_distance : enable");
}
if (gl_extensions.ARB_cull_distance && id.Bit(VS_BIT_VERTEX_RANGE_CULLING)) {
@ -227,6 +228,10 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
bool texCoordInVec3 = false;
bool vertexRangeCulling = id.Bit(VS_BIT_VERTEX_RANGE_CULLING) && !isModeThrough;
bool clipClampedDepth = !isModeThrough && gstate_c.Supports(GPU_SUPPORTS_DEPTH_CLAMP) && gstate_c.Supports(GPU_SUPPORTS_CLIP_DISTANCE);
const char *vertexRangeClipSuffix = "[0]";
if (vertexRangeCulling && clipClampedDepth)
vertexRangeClipSuffix = "[2]";
if (compat.shaderLanguage == GLSL_VULKAN) {
WRITE(p, "\n");
@ -419,8 +424,15 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
WRITE(p, " vec4 gl_Position : POSITION;\n");
} else {
WRITE(p, " vec4 gl_Position : SV_Position;\n");
if (vertexRangeCulling && gstate_c.Supports(GPU_SUPPORTS_CLIP_DISTANCE)) {
WRITE(p, " float gl_ClipDistance : SV_ClipDistance0;\n");
bool clipRange = vertexRangeCulling && gstate_c.Supports(GPU_SUPPORTS_CLIP_DISTANCE);
if (clipClampedDepth && clipRange) {
WRITE(p, " float3 gl_ClipDistance : SV_ClipDistance;\n");
vertexRangeClipSuffix = ".z";
} else if (clipClampedDepth) {
WRITE(p, " float2 gl_ClipDistance : SV_ClipDistance;\n");
} else if (clipRange) {
WRITE(p, " float gl_ClipDistance : SV_ClipDistance;\n");
vertexRangeClipSuffix = "";
}
if (vertexRangeCulling && gstate_c.Supports(GPU_SUPPORTS_CULL_DISTANCE)) {
WRITE(p, " float2 gl_CullDistance : SV_CullDistance0;\n");
@ -1177,8 +1189,37 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
WRITE(p, " %sv_fogdepth = (viewPos.z + u_fogcoef.x) * u_fogcoef.y;\n", compat.vsOutPrefix);
}
if (vertexRangeCulling && !IsVRBuild()) {
if (clipClampedDepth || (vertexRangeCulling && !IsVRBuild())) {
WRITE(p, " vec3 projPos = outPos.xyz / outPos.w;\n");
}
if (clipClampedDepth) {
const char *clip0 = compat.shaderLanguage == HLSL_D3D11 ? ".x" : "[0]";
const char *clip1 = compat.shaderLanguage == HLSL_D3D11 ? ".y" : "[1]";
WRITE(p, " mediump float integerZ = projPos.z * u_depthRange.x + u_depthRange.y;\n");
// This should clip against minz, but only when it's above zero.
if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {
// On OpenGL/GLES, these values account for the -1 -> 1 range.
WRITE(p, " if (u_depthRange.y - u_depthRange.x >= 1.0) {\n");
} else {
// Everywhere else, it's 0 -> 1, simpler.
WRITE(p, " if (u_depthRange.y >= 1.0) {\n");
}
WRITE(p, " %sgl_ClipDistance%s = integerZ;\n", compat.vsOutPrefix, clip0);
WRITE(p, " } else {\n");
WRITE(p, " %sgl_ClipDistance%s = 0.0;\n", compat.vsOutPrefix, clip0);
WRITE(p, " }\n");
// This is similar, but for maxz when it's below 65535.0. -1/0 don't matter here.
WRITE(p, " if (u_depthRange.x + u_depthRange.y <= 65534.0) {\n");
WRITE(p, " %sgl_ClipDistance%s = 65535.0 - integerZ;\n", compat.vsOutPrefix, clip1);
WRITE(p, " } else {\n");
WRITE(p, " %sgl_ClipDistance%s = 0.0;\n", compat.vsOutPrefix, clip1);
WRITE(p, " }\n");
}
if (vertexRangeCulling && !IsVRBuild()) {
WRITE(p, " float projZ = (projPos.z - u_depthRange.z) * u_depthRange.w;\n");
// Vertex range culling doesn't happen when Z clips, note sign of w is important.
WRITE(p, " if (u_cullRangeMin.w <= 0.0 || projZ * outPos.w > -outPos.w) {\n");
@ -1194,12 +1235,11 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
WRITE(p, " }\n");
WRITE(p, " }\n");
const char *clip0 = compat.shaderLanguage == HLSL_D3D11 ? "" : "[0]";
const char *cull0 = compat.shaderLanguage == HLSL_D3D11 ? ".x" : "[0]";
const char *cull1 = compat.shaderLanguage == HLSL_D3D11 ? ".y" : "[1]";
if (gstate_c.Supports(GPU_SUPPORTS_CLIP_DISTANCE)) {
// TODO: Not rectangles...
WRITE(p, " %sgl_ClipDistance%s = projZ * outPos.w + outPos.w;\n", compat.vsOutPrefix, clip0);
WRITE(p, " %sgl_ClipDistance%s = projZ * outPos.w + outPos.w;\n", compat.vsOutPrefix, vertexRangeClipSuffix);
}
if (gstate_c.Supports(GPU_SUPPORTS_CULL_DISTANCE)) {
// Cull any triangle fully outside in the same direction when depth clamp enabled.

View File

@ -82,7 +82,7 @@ GPU_D3D11::GPU_D3D11(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
// No need to flush before the tex scale/offset commands if we are baking
// the tex scale/offset into the vertices anyway.
UpdateCmdInfo();
CheckGPUFeatures();
gstate_c.featureFlags = CheckGPUFeatures();
BuildReportingInfo();
@ -100,40 +100,16 @@ GPU_D3D11::~GPU_D3D11() {
stockD3D11.Destroy();
}
void GPU_D3D11::CheckGPUFeatures() {
u32 features = 0;
features |= GPU_SUPPORTS_BLEND_MINMAX;
u32 GPU_D3D11::CheckGPUFeatures() const {
u32 features = GPUCommon::CheckGPUFeatures();
// Accurate depth is required because the Direct3D API does not support inverse Z.
// So we cannot incorrectly use the viewport transform as the depth range on Direct3D.
// TODO: Breaks text in PaRappa for some reason?
features |= GPU_SUPPORTS_ACCURATE_DEPTH;
#ifndef _M_ARM
// TODO: Do proper feature detection
features |= GPU_SUPPORTS_ANISOTROPY;
#endif
features |= GPU_SUPPORTS_DEPTH_TEXTURE;
features |= GPU_SUPPORTS_TEXTURE_NPOT;
if (draw_->GetDeviceCaps().dualSourceBlend)
features |= GPU_SUPPORTS_DUALSOURCE_BLEND;
if (draw_->GetDeviceCaps().depthClampSupported)
features |= GPU_SUPPORTS_DEPTH_CLAMP;
if (draw_->GetDeviceCaps().clipDistanceSupported)
features |= GPU_SUPPORTS_CLIP_DISTANCE;
if (draw_->GetDeviceCaps().cullDistanceSupported)
features |= GPU_SUPPORTS_CULL_DISTANCE;
if (!draw_->GetBugs().Has(Draw::Bugs::BROKEN_NAN_IN_CONDITIONAL)) {
// Ignore the compat setting if clip and cull are both enabled.
// When supported, we can do the depth side of range culling more correctly.
const bool supported = draw_->GetDeviceCaps().clipDistanceSupported && draw_->GetDeviceCaps().cullDistanceSupported;
const bool disabled = PSP_CoreParameter().compat.flags().DisableRangeCulling;
if (supported || !disabled) {
features |= GPU_SUPPORTS_VS_RANGE_CULLING;
}
}
features |= GPU_SUPPORTS_TEXTURE_FLOAT;
features |= GPU_SUPPORTS_INSTANCE_RENDERING;
@ -146,10 +122,6 @@ void GPU_D3D11::CheckGPUFeatures() {
features |= GPU_SUPPORTS_16BIT_FORMATS;
}
if (draw_->GetDeviceCaps().logicOpSupported) {
features |= GPU_SUPPORTS_LOGIC_OP;
}
if (!g_Config.bHighQualityDepth && (features & GPU_SUPPORTS_ACCURATE_DEPTH) != 0) {
features |= GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT;
} else if (PSP_CoreParameter().compat.flags().PixelDepthRounding) {
@ -164,11 +136,7 @@ void GPU_D3D11::CheckGPUFeatures() {
features |= GPU_USE_DEPTH_RANGE_HACK;
}
if (PSP_CoreParameter().compat.flags().ClearToRAM) {
features |= GPU_USE_CLEAR_RAM_HACK;
}
gstate_c.featureFlags = features;
return features;
}
// Needs to be called on GPU thread, not reporting thread.
@ -206,7 +174,7 @@ void GPU_D3D11::BeginHostFrame() {
GPUCommon::BeginHostFrame();
UpdateCmdInfo();
if (resized_) {
CheckGPUFeatures();
gstate_c.featureFlags = CheckGPUFeatures();
framebufferManager_->Resized();
drawEngine_.Resized();
textureCache_->NotifyConfigChanged();

View File

@ -36,7 +36,7 @@ public:
GPU_D3D11(GraphicsContext *gfxCtx, Draw::DrawContext *draw);
~GPU_D3D11();
void CheckGPUFeatures() override;
u32 CheckGPUFeatures() const override;
void PreExecuteOp(u32 op, u32 diff) override;
void ExecuteOp(u32 op, u32 diff) override;

View File

@ -153,15 +153,16 @@ void DrawEngineD3D11::ApplyDrawState(int prim) {
// We ignore the logicState on D3D since there's no support, the emulation of it is blend-and-shader only.
if (pipelineState_.FramebufferRead()) {
bool fboTexNeedsBind = false;
ApplyFramebufferRead(&fboTexNeedsBind);
FBOTexState fboTexBindState = FBO_TEX_NONE;
ApplyFramebufferRead(&fboTexBindState);
// The shader takes over the responsibility for blending, so recompute.
ApplyStencilReplaceAndLogicOpIgnoreBlend(blendState.replaceAlphaWithStencil, blendState);
if (fboTexNeedsBind) {
if (fboTexBindState == FBO_TEX_COPY_BIND_TEX) {
framebufferManager_->BindFramebufferAsColorTexture(1, framebufferManager_->GetCurrentRenderVFB(), BINDFBCOLOR_MAY_COPY);
// No sampler required, we do a plain Load in the pixel shader.
fboTexBound_ = true;
fboTexBindState = FBO_TEX_NONE;
framebufferManager_->RebindFramebuffer("RebindFramebuffer - ApplyDrawState");
// Must dirty blend state here so we re-copy next time. Example: Lunar's spell effects.

View File

@ -18,6 +18,7 @@
#include <vector>
#include "Common/Log.h"
#include "Common/StringUtils.h"
#include "Common/TimeUtil.h"
#include "GPU/GPU.h"
#include "GPU/Debugger/Breakpoints.h"
#include "GPU/Debugger/Debugger.h"
@ -35,6 +36,8 @@ static int primsLastFrame = 0;
static int primsThisFrame = 0;
static int thisFlipNum = 0;
static double lastStepTime = -1.0;
static std::vector<std::pair<int, int>> restrictPrimRanges;
static std::string restrictPrimRule;
@ -56,6 +59,7 @@ void SetActive(bool flag) {
breakNext = BreakNext::NONE;
breakAtCount = -1;
GPUStepping::ResumeFromStepping();
lastStepTime = -1.0;
}
}
@ -79,6 +83,7 @@ void SetBreakNext(BreakNext next) {
GPUBreakpoints::AddCmdBreakpoint(GE_CMD_SPLINE, true);
}
GPUStepping::ResumeFromStepping();
lastStepTime = next == BreakNext::NONE ? -1.0 : time_now_d();
}
void SetBreakCount(int c, bool relative) {
@ -130,7 +135,12 @@ bool NotifyCommand(u32 pc) {
GPUBreakpoints::ClearTempBreakpoints();
auto info = gpuDebug->DissassembleOp(pc);
NOTICE_LOG(G3D, "Waiting at %08x, %s", pc, info.desc.c_str());
if (lastStepTime >= 0.0) {
NOTICE_LOG(G3D, "Waiting at %08x, %s (%fms)", pc, info.desc.c_str(), (time_now_d() - lastStepTime) * 1000.0);
lastStepTime = -1.0;
} else {
NOTICE_LOG(G3D, "Waiting at %08x, %s", pc, info.desc.c_str());
}
GPUStepping::EnterStepping();
}
@ -141,7 +151,12 @@ void NotifyDraw() {
if (!active)
return;
if (breakNext == BreakNext::DRAW && !GPUStepping::IsStepping()) {
NOTICE_LOG(G3D, "Waiting at a draw");
if (lastStepTime >= 0.0) {
NOTICE_LOG(G3D, "Waiting at a draw (%fms)", (time_now_d() - lastStepTime) * 1000.0);
lastStepTime = -1.0;
} else {
NOTICE_LOG(G3D, "Waiting at a draw");
}
GPUStepping::EnterStepping();
}
}

View File

@ -298,6 +298,7 @@ private:
void Registers(u32 ptr, u32 sz);
void Vertices(u32 ptr, u32 sz);
void Indices(u32 ptr, u32 sz);
void ClutAddr(u32 ptr, u32 sz);
void Clut(u32 ptr, u32 sz);
void TransferSrc(u32 ptr, u32 sz);
void Memset(u32 ptr, u32 sz);
@ -308,6 +309,8 @@ private:
void Display(u32 ptr, u32 sz);
u32 execMemcpyDest = 0;
u32 execClutAddr = 0;
u32 execClutFlags = 0;
u32 execListBuf = 0;
u32 execListPos = 0;
u32 execListID = 0;
@ -472,15 +475,40 @@ void DumpExecute::Indices(u32 ptr, u32 sz) {
execListQueue.push_back((GE_CMD_IADDR << 24) | (psp & 0x00FFFFFF));
}
void DumpExecute::Clut(u32 ptr, u32 sz) {
u32 psp = mapping_.Map(ptr, sz, std::bind(&DumpExecute::SyncStall, this));
if (psp == 0) {
ERROR_LOG(SYSTEM, "Unable to allocate for clut");
return;
}
void DumpExecute::ClutAddr(u32 ptr, u32 sz) {
struct ClutAddrData {
u32 addr;
u32 flags;
};
const ClutAddrData *data = (const ClutAddrData *)(pushbuf_.data() + ptr);
execClutAddr = data->addr;
execClutFlags = data->flags;
}
execListQueue.push_back((GE_CMD_CLUTADDRUPPER << 24) | ((psp >> 8) & 0x00FF0000));
execListQueue.push_back((GE_CMD_CLUTADDR << 24) | (psp & 0x00FFFFFF));
void DumpExecute::Clut(u32 ptr, u32 sz) {
// This is always run when we have the actual address set.
if (execClutAddr != 0) {
const bool isTarget = (execClutFlags & 1) != 0;
const bool unchangedVRAM = (execClutFlags & 2) != 0;
// TODO: Could use drawnVRAM flag, but it can be wrong.
// Could potentially always skip if !isTarget, but playing it safe for offset texture behavior.
if (Memory::IsValidRange(execClutAddr, sz) && !unchangedVRAM && (!isTarget || !g_Config.bSoftwareRendering)) {
// Intentionally don't trigger an upload here.
Memory::MemcpyUnchecked(execClutAddr, pushbuf_.data() + ptr, sz);
}
execClutAddr = 0;
} else {
u32 psp = mapping_.Map(ptr, sz, std::bind(&DumpExecute::SyncStall, this));
if (psp == 0) {
ERROR_LOG(SYSTEM, "Unable to allocate for clut");
return;
}
execListQueue.push_back((GE_CMD_CLUTADDRUPPER << 24) | ((psp >> 8) & 0x00FF0000));
execListQueue.push_back((GE_CMD_CLUTADDR << 24) | (psp & 0x00FFFFFF));
}
}
void DumpExecute::TransferSrc(u32 ptr, u32 sz) {
@ -619,6 +647,10 @@ bool DumpExecute::Run() {
Indices(cmd.ptr, cmd.sz);
break;
case CommandType::CLUTADDR:
ClutAddr(cmd.ptr, cmd.sz);
break;
case CommandType::CLUT:
Clut(cmd.ptr, cmd.sz);
break;

View File

@ -37,6 +37,7 @@
#include "Core/MemMap.h"
#include "Core/System.h"
#include "Core/ThreadPools.h"
#include "GPU/Common/GPUDebugInterface.h"
#include "GPU/GPUInterface.h"
#include "GPU/GPUState.h"
#include "GPU/ge_constants.h"
@ -152,8 +153,19 @@ static void BeginRecording() {
u32 sz = 512 * 4;
pushbuf.resize(pushbuf.size() + sz);
gstate.Save((u32_le *)(pushbuf.data() + ptr));
commands.push_back({CommandType::INIT, sz, ptr});
// Also save the initial CLUT.
GPUDebugBuffer clut;
if (gpuDebug->GetCurrentClut(clut)) {
sz = clut.GetStride() * clut.PixelSize();
_assert_msg_(sz == 1024, "CLUT should be 1024 bytes");
ptr = (u32)pushbuf.size();
pushbuf.resize(pushbuf.size() + sz);
memcpy(pushbuf.data() + ptr, clut.GetData(), sz);
commands.push_back({ CommandType::CLUT, sz, ptr });
}
DirtyAllVRAM(DirtyVRAMFlag::DIRTY);
}
@ -308,6 +320,34 @@ static Command EmitCommandWithRAM(CommandType t, const void *p, u32 sz, u32 alig
return cmd;
}
static u32 GetTargetFlags(u32 addr, u32 sizeInRAM) {
const bool isTarget = lastRenderTargets.find(addr) != lastRenderTargets.end();
bool isDirtyVRAM = false;
bool isDrawnVRAM = false;
uint32_t start = (addr >> DIRTY_VRAM_SHIFT) & DIRTY_VRAM_MASK;
for (uint32_t i = 0; i < (sizeInRAM + DIRTY_VRAM_ROUND) >> DIRTY_VRAM_SHIFT; ++i) {
DirtyVRAMFlag flag = dirtyVRAM[start + i];
isDirtyVRAM = isDirtyVRAM || flag != DirtyVRAMFlag::CLEAN;
isDrawnVRAM = isDrawnVRAM || flag == DirtyVRAMFlag::DRAWN;
// Mark the VRAM clean now that it's been copied into the dump.
if (flag == DirtyVRAMFlag::DIRTY)
dirtyVRAM[start + i] = DirtyVRAMFlag::CLEAN;
}
// The isTarget flag is mostly used for replay of dumps on a PSP.
u32 flags = isTarget ? 1 : 0;
// The unchangedVRAM flag tells us we can skip recopying.
if (!isDirtyVRAM)
flags |= 2;
// And the drawn flag tells us this data was potentially drawn to.
if (isDrawnVRAM)
flags |= 4;
return flags;
}
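For reference, the bit layout produced here, as consumed on the replay side (see DumpExecute::Clut earlier in this commit): bit 0 = was a render target, bit 1 = VRAM unchanged since the last copy, bit 2 = potentially drawn to.

#include <cstdint>

// Decodes the flags emitted by GetTargetFlags, matching the comments above.
struct TargetFlags {
    bool isTarget;
    bool unchangedVRAM;
    bool drawnVRAM;
};

static TargetFlags DecodeTargetFlags(uint32_t flags) {
    return { (flags & 1) != 0, (flags & 2) != 0, (flags & 4) != 0 };
}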
static void EmitTextureData(int level, u32 texaddr) {
GETextureFormat format = gstate.getTextureFormat();
int w = gstate.getTextureWidth(level);
@ -315,7 +355,6 @@ static void EmitTextureData(int level, u32 texaddr) {
int bufw = GetTextureBufw(level, texaddr, format);
int extraw = w > bufw ? w - bufw : 0;
u32 sizeInRAM = (textureBitsPerPixel[format] * (bufw * h + extraw)) / 8;
const bool isTarget = lastRenderTargets.find(texaddr) != lastRenderTargets.end();
CommandType type = CommandType((int)CommandType::TEXTURE0 + level);
const u8 *p = Memory::GetPointerUnchecked(texaddr);
@ -330,27 +369,7 @@ static void EmitTextureData(int level, u32 texaddr) {
u32 pad;
};
bool isDirtyVRAM = false;
bool isDrawnVRAM = false;
uint32_t start = (texaddr >> DIRTY_VRAM_SHIFT) & DIRTY_VRAM_MASK;
for (uint32_t i = 0; i < (sizeInRAM + DIRTY_VRAM_ROUND) >> DIRTY_VRAM_SHIFT; ++i) {
DirtyVRAMFlag flag = dirtyVRAM[start + i];
isDirtyVRAM = isDirtyVRAM || flag != DirtyVRAMFlag::CLEAN;
isDrawnVRAM = isDrawnVRAM || flag == DirtyVRAMFlag::DRAWN;
// Mark the VRAM clean now that it's been copied to VRAM.
if (flag == DirtyVRAMFlag::DIRTY)
dirtyVRAM[start + i] = DirtyVRAMFlag::CLEAN;
}
// The isTarget flag is mostly used for replay of dumps on a PSP.
u32 flags = isTarget ? 1 : 0;
// The unchangedVRAM flag tells us we can skip recopying.
if (!isDirtyVRAM)
flags |= 2;
// And the drawn flag tells us this data was potentially drawn to.
if (isDrawnVRAM)
flags |= 4;
u32 flags = GetTargetFlags(texaddr, sizeInRAM);
FramebufData framebuf{ texaddr, bufw, flags };
framebufData.resize(sizeof(framebuf) + bytes);
memcpy(&framebufData[0], &framebuf, sizeof(framebuf));
@ -456,12 +475,33 @@ static void EmitTransfer(u32 op) {
static void EmitClut(u32 op) {
u32 addr = gstate.getClutAddress();
// Hardware rendering may be using a framebuffer as CLUT.
// To get at this, we first run the command (normally we're called right before it has run.)
if (Memory::IsVRAMAddress(addr))
gpuDebug->SetCmdValue(op);
// Actually should only be 0x3F, but we allow enhanced CLUTs. See #15727.
u32 blocks = (op & 0x7F) == 0x40 ? 0x40 : (op & 0x3F);
u32 bytes = blocks * 32;
bytes = Memory::ValidSize(addr, bytes);
if (bytes != 0) {
// Send the original address so VRAM can be reasoned about.
if (Memory::IsVRAMAddress(addr)) {
struct ClutAddrData {
u32 addr;
u32 flags;
};
u32 flags = GetTargetFlags(addr, bytes);
ClutAddrData data{ addr, flags };
FlushRegisters();
Command cmd{CommandType::CLUTADDR, sizeof(data), (u32)pushbuf.size()};
pushbuf.resize(pushbuf.size() + sizeof(data));
memcpy(pushbuf.data() + cmd.ptr, &data, sizeof(data));
commands.push_back(cmd);
}
EmitCommandWithRAM(CommandType::CLUT, Memory::GetPointerUnchecked(addr), bytes, 16);
}

View File

@ -49,6 +49,7 @@ enum class CommandType : u8 {
MEMCPYDEST = 7,
MEMCPYDATA = 8,
DISPLAY = 9,
CLUTADDR = 10,
TEXTURE0 = 0x10,
TEXTURE1 = 0x11,

View File

@ -170,6 +170,8 @@ private:
// Hardware tessellation
TessellationDataTransferDX9 *tessDataTransferDX9;
FBOTexState fboTexBindState_ = FBO_TEX_NONE;
int lastRenderStepId_ = -1;
bool fboTexNeedsBind_ = false;

View File

@ -81,7 +81,7 @@ GPU_DX9::GPU_DX9(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
// No need to flush before the tex scale/offset commands if we are baking
// the tex scale/offset into the vertices anyway.
UpdateCmdInfo();
CheckGPUFeatures();
gstate_c.featureFlags = CheckGPUFeatures();
BuildReportingInfo();
@ -98,68 +98,9 @@ GPU_DX9::GPU_DX9(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
}
}
// TODO: Move this detection elsewhere when it's needed elsewhere, not before. It's ugly.
// Source: https://envytools.readthedocs.io/en/latest/hw/pciid.html#gf100
enum NVIDIAGeneration {
NV_PRE_KEPLER,
NV_KEPLER,
NV_MAXWELL,
NV_PASCAL,
NV_VOLTA,
NV_TURING, // or later
};
static NVIDIAGeneration NVIDIAGetDeviceGeneration(int deviceID) {
if (deviceID >= 0x1180 && deviceID <= 0x11bf)
return NV_KEPLER; // GK104
if (deviceID >= 0x11c0 && deviceID <= 0x11fa)
return NV_KEPLER; // GK106
if (deviceID >= 0x0fc0 && deviceID <= 0x0fff)
return NV_KEPLER; // GK107
if (deviceID >= 0x1003 && deviceID <= 0x1028)
return NV_KEPLER; // GK110(B)
if (deviceID >= 0x1280 && deviceID <= 0x12ba)
return NV_KEPLER; // GK208
if (deviceID >= 0x1381 && deviceID <= 0x13b0)
return NV_MAXWELL; // GM107
if (deviceID >= 0x1340 && deviceID <= 0x134d)
return NV_MAXWELL; // GM108
if (deviceID >= 0x13c0 && deviceID <= 0x13d9)
return NV_MAXWELL; // GM204
if (deviceID >= 0x1401 && deviceID <= 0x1427)
return NV_MAXWELL; // GM206
if (deviceID >= 0x15f7 && deviceID <= 0x15f9)
return NV_PASCAL; // GP100
if (deviceID >= 0x15f7 && deviceID <= 0x15f9)
return NV_PASCAL; // GP100
if (deviceID >= 0x1b00 && deviceID <= 0x1b38)
return NV_PASCAL; // GP102
if (deviceID >= 0x1b80 && deviceID <= 0x1be1)
return NV_PASCAL; // GP104
if (deviceID >= 0x1c02 && deviceID <= 0x1c62)
return NV_PASCAL; // GP106
if (deviceID >= 0x1c81 && deviceID <= 0x1c92)
return NV_PASCAL; // GP107
if (deviceID >= 0x1d01 && deviceID <= 0x1d12)
return NV_PASCAL; // GP108
if (deviceID >= 0x1d81 && deviceID <= 0x1dba)
return NV_VOLTA; // GV100
if (deviceID >= 0x1e02 && deviceID <= 0x1e3c)
return NV_TURING; // TU102
if (deviceID >= 0x1e82 && deviceID <= 0x1ed0)
return NV_TURING; // TU104
if (deviceID >= 0x1f02 && deviceID <= 0x1f51)
return NV_TURING; // TU104
if (deviceID >= 0x1e02)
return NV_TURING; // More TU models or later, probably.
return NV_PRE_KEPLER;
}
void GPU_DX9::CheckGPUFeatures() {
u32 features = 0;
u32 GPU_DX9::CheckGPUFeatures() const {
u32 features = GPUCommon::CheckGPUFeatures();
features |= GPU_SUPPORTS_16BIT_FORMATS;
features |= GPU_SUPPORTS_BLEND_MINMAX;
features |= GPU_SUPPORTS_DEPTH_TEXTURE;
features |= GPU_SUPPORTS_TEXTURE_LOD_CONTROL;
// Accurate depth is required because the Direct3D API does not support inverse Z.
@ -168,41 +109,6 @@ void GPU_DX9::CheckGPUFeatures() {
features |= GPU_SUPPORTS_ACCURATE_DEPTH;
auto vendor = draw_->GetDeviceCaps().vendor;
if (!PSP_CoreParameter().compat.flags().DisableRangeCulling) {
// VS range culling (killing triangles in the vertex shader using NaN) causes problems on Intel.
// Also causes problems on old NVIDIA.
switch (vendor) {
case Draw::GPUVendor::VENDOR_INTEL:
break;
case Draw::GPUVendor::VENDOR_NVIDIA:
// Older NVIDIAs don't seem to like NaNs in their DX9 vertex shaders.
// No idea if KEPLER is the right cutoff, but let's go with it.
if (NVIDIAGetDeviceGeneration(draw_->GetDeviceCaps().deviceID) >= NV_KEPLER) {
features |= GPU_SUPPORTS_VS_RANGE_CULLING;
}
break;
default:
features |= GPU_SUPPORTS_VS_RANGE_CULLING;
break;
}
}
D3DCAPS9 caps;
ZeroMemory(&caps, sizeof(caps));
HRESULT result = 0;
if (deviceEx_) {
result = deviceEx_->GetDeviceCaps(&caps);
} else {
result = device_->GetDeviceCaps(&caps);
}
if (FAILED(result)) {
WARN_LOG_REPORT(G3D, "Direct3D9: Failed to get the device caps!");
} else {
if ((caps.RasterCaps & D3DPRASTERCAPS_ANISOTROPY) != 0 && caps.MaxAnisotropy > 1)
features |= GPU_SUPPORTS_ANISOTROPY;
if ((caps.TextureCaps & (D3DPTEXTURECAPS_NONPOW2CONDITIONAL | D3DPTEXTURECAPS_POW2)) == 0)
features |= GPU_SUPPORTS_TEXTURE_NPOT;
}
if (!g_Config.bHighQualityDepth) {
features |= GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT;
@ -213,11 +119,7 @@ void GPU_DX9::CheckGPUFeatures() {
features |= GPU_ROUND_DEPTH_TO_16BIT;
}
if (PSP_CoreParameter().compat.flags().ClearToRAM) {
features |= GPU_USE_CLEAR_RAM_HACK;
}
gstate_c.featureFlags = features;
return features;
}
GPU_DX9::~GPU_DX9() {
@ -261,7 +163,7 @@ void GPU_DX9::BeginHostFrame() {
GPUCommon::BeginHostFrame();
UpdateCmdInfo();
if (resized_) {
CheckGPUFeatures();
gstate_c.featureFlags = CheckGPUFeatures();
framebufferManager_->Resized();
drawEngine_.Resized();
shaderManagerDX9_->DirtyShader();

View File

@ -35,7 +35,7 @@ public:
GPU_DX9(GraphicsContext *gfxCtx, Draw::DrawContext *draw);
~GPU_DX9();
void CheckGPUFeatures() override;
u32 CheckGPUFeatures() const override;
void PreExecuteOp(u32 op, u32 diff) override;
void ExecuteOp(u32 op, u32 diff) override;

View File

@ -99,14 +99,14 @@ void DrawEngineDX9::ApplyDrawState(int prim) {
if (!gstate.isModeClear()) {
textureCache_->ApplyTexture();
if (fboTexNeedsBind_) {
if (fboTexBindState_ == FBO_TEX_COPY_BIND_TEX) {
// Note that this is positions, not UVs, that we need the copy from.
framebufferManager_->BindFramebufferAsColorTexture(1, framebufferManager_->GetCurrentRenderVFB(), BINDFBCOLOR_MAY_COPY);
// If we are rendering at a higher resolution, linear is probably best for the dest color.
device_->SetSamplerState(1, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR);
device_->SetSamplerState(1, D3DSAMP_MINFILTER, D3DTEXF_LINEAR);
fboTexBound_ = true;
fboTexNeedsBind_ = false;
fboTexBindState_ = FBO_TEX_NONE;
}
// TODO: Test texture?
@ -133,20 +133,23 @@ void DrawEngineDX9::ApplyDrawState(int prim) {
// We ignore the logicState on D3D since there's no support, the emulation of it is blend-and-shader only.
if (pipelineState_.FramebufferRead()) {
bool fboTexNeedsBind = false;
ApplyFramebufferRead(&fboTexNeedsBind);
ApplyFramebufferRead(&fboTexBindState_);
// The shader takes over the responsibility for blending, so recompute.
ApplyStencilReplaceAndLogicOpIgnoreBlend(blendState.replaceAlphaWithStencil, blendState);
if (fboTexNeedsBind) {
if (fboTexBindState_ == FBO_TEX_COPY_BIND_TEX) {
// Note that this is positions, not UVs, that we need the copy from.
framebufferManager_->BindFramebufferAsColorTexture(1, framebufferManager_->GetCurrentRenderVFB(), BINDFBCOLOR_MAY_COPY);
// If we are rendering at a higher resolution, linear is probably best for the dest color.
device_->SetSamplerState(1, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR);
device_->SetSamplerState(1, D3DSAMP_MINFILTER, D3DTEXF_LINEAR);
fboTexBound_ = true;
fboTexBindState_ = FBO_TEX_NONE;
dirtyRequiresRecheck_ |= DIRTY_BLEND_STATE;
gstate_c.Dirty(DIRTY_BLEND_STATE);
} else if (fboTexBindState_ == FBO_TEX_READ_FRAMEBUFFER) {
// Not supported.
fboTexBindState_ = FBO_TEX_NONE;
}
dirtyRequiresRecheck_ |= DIRTY_FRAGMENTSHADER_STATE;
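For reference, a minimal sketch of the tri-state that replaces the old fboTexNeedsBind_ bool (the exact declaration is an assumption, inferred from how the three values are used above):
enum FBOTexState {
	FBO_TEX_NONE,              // nothing pending
	FBO_TEX_COPY_BIND_TEX,     // copy the framebuffer, then bind the copy as a texture
	FBO_TEX_READ_FRAMEBUFFER,  // read the target directly, no copy (not supported on D3D9)
};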

View File

@ -118,7 +118,8 @@ void FramebufferManagerGLES::PackDepthbuffer(VirtualFramebuffer *vfb, int x, int
queries.push_back({ &u_depthDownloadTo8, "u_depthTo8" });
std::vector<GLRProgram::Initializer> inits;
inits.push_back({ &u_depthDownloadTex, 0, TEX_SLOT_PSP_TEXTURE });
depthDownloadProgram_ = render->CreateProgram(shaders, semantics, queries, inits, false, false);
GLRProgramFlags flags{};
depthDownloadProgram_ = render->CreateProgram(shaders, semantics, queries, inits, flags);
for (auto iter : shaders) {
render->DeleteShader(iter);
}

View File

@ -54,7 +54,7 @@
GPU_GLES::GPU_GLES(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
: GPUCommon(gfxCtx, draw), drawEngine_(draw), fragmentTestCache_(draw) {
UpdateVsyncInterval(true);
CheckGPUFeatures();
gstate_c.featureFlags = CheckGPUFeatures();
shaderManagerGL_ = new ShaderManagerGLES(draw);
framebufferManagerGL_ = new FramebufferManagerGLES(draw);
@ -148,42 +148,17 @@ GPU_GLES::~GPU_GLES() {
// Take the raw GL extension and versioning data and turn into feature flags.
// TODO: This should use DrawContext::GetDeviceCaps() more and more, and eventually
// this can be shared between all the backends.
void GPU_GLES::CheckGPUFeatures() {
u32 features = 0;
u32 GPU_GLES::CheckGPUFeatures() const {
u32 features = GPUCommon::CheckGPUFeatures();
features |= GPU_SUPPORTS_16BIT_FORMATS;
if (draw_->GetDeviceCaps().dualSourceBlend) {
if (!g_Config.bVendorBugChecksEnabled || !draw_->GetBugs().Has(Draw::Bugs::DUAL_SOURCE_BLENDING_BROKEN)) {
features |= GPU_SUPPORTS_DUALSOURCE_BLEND;
}
}
if (gl_extensions.EXT_shader_framebuffer_fetch || gl_extensions.ARM_shader_framebuffer_fetch) {
// This has caused problems in the past. Let's only enable on GLES3.
if (gl_extensions.GLES3) {
features |= GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH;
}
}
if ((gl_extensions.gpuVendor == GPU_VENDOR_NVIDIA) || (gl_extensions.gpuVendor == GPU_VENDOR_AMD))
features |= GPU_PREFER_REVERSE_COLOR_ORDER;
if (draw_->GetDeviceCaps().textureNPOTFullySupported)
features |= GPU_SUPPORTS_TEXTURE_NPOT;
if (gl_extensions.EXT_blend_minmax)
features |= GPU_SUPPORTS_BLEND_MINMAX;
if (draw_->GetDeviceCaps().logicOpSupported)
features |= GPU_SUPPORTS_LOGIC_OP;
if (gl_extensions.GLES3 || !gl_extensions.IsGLES)
features |= GPU_SUPPORTS_TEXTURE_LOD_CONTROL;
if (draw_->GetDeviceCaps().anisoSupported)
features |= GPU_SUPPORTS_ANISOTROPY;
bool canUseInstanceID = gl_extensions.EXT_draw_instanced || gl_extensions.ARB_draw_instanced;
bool canDefInstanceID = gl_extensions.IsGLES || gl_extensions.EXT_gpu_shader4 || gl_extensions.VersionGEThan(3, 1);
bool instanceRendering = gl_extensions.GLES3 || (canUseInstanceID && canDefInstanceID);
@ -202,21 +177,6 @@ void GPU_GLES::CheckGPUFeatures() {
// Our implementation of depth texturing needs simple Z range, so can't
// use the extension hacks (yet).
}
if (draw_->GetDeviceCaps().textureDepthSupported)
features |= GPU_SUPPORTS_DEPTH_TEXTURE;
if (draw_->GetDeviceCaps().clipDistanceSupported)
features |= GPU_SUPPORTS_CLIP_DISTANCE;
if (draw_->GetDeviceCaps().cullDistanceSupported)
features |= GPU_SUPPORTS_CULL_DISTANCE;
if (!draw_->GetBugs().Has(Draw::Bugs::BROKEN_NAN_IN_CONDITIONAL)) {
// Ignore the compat setting if clip and cull are both enabled.
// When supported, we can do the depth side of range culling more correctly.
const bool supported = draw_->GetDeviceCaps().clipDistanceSupported && draw_->GetDeviceCaps().cullDistanceSupported;
const bool disabled = PSP_CoreParameter().compat.flags().DisableRangeCulling;
if (supported || !disabled) {
features |= GPU_SUPPORTS_VS_RANGE_CULLING;
}
}
// If we already have a 16-bit depth buffer, we don't need to round.
bool prefer24 = draw_->GetDeviceCaps().preferredDepthBufferFormat == Draw::DataFormat::D24_S8;
@ -245,11 +205,7 @@ void GPU_GLES::CheckGPUFeatures() {
features |= GPU_USE_DEPTH_RANGE_HACK;
}
if (PSP_CoreParameter().compat.flags().ClearToRAM) {
features |= GPU_USE_CLEAR_RAM_HACK;
}
gstate_c.featureFlags = features;
return features;
}
bool GPU_GLES::IsReady() {
@ -321,7 +277,7 @@ void GPU_GLES::BeginHostFrame() {
GPUCommon::BeginHostFrame();
UpdateCmdInfo();
if (resized_) {
CheckGPUFeatures();
gstate_c.featureFlags = CheckGPUFeatures();
framebufferManager_->Resized();
drawEngine_.Resized();
shaderManagerGL_->DirtyShader();

View File

@ -38,7 +38,7 @@ public:
~GPU_GLES();
// This gets called on startup and when we get back from settings.
void CheckGPUFeatures() override;
u32 CheckGPUFeatures() const override;
bool IsReady() override;
void CancelReady() override;

View File

@ -192,9 +192,18 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
initialize.push_back({ &u_tess_weights_u, 0, TEX_SLOT_SPLINE_WEIGHTS_U });
initialize.push_back({ &u_tess_weights_v, 0, TEX_SLOT_SPLINE_WEIGHTS_V });
bool useDualSource = (gstate_c.featureFlags & GPU_SUPPORTS_DUALSOURCE_BLEND) != 0;
bool useClip0 = VSID.Bit(VS_BIT_VERTEX_RANGE_CULLING) && gstate_c.Supports(GPU_SUPPORTS_CLIP_DISTANCE);
program = render->CreateProgram(shaders, semantics, queries, initialize, useDualSource, useClip0);
GLRProgramFlags flags{};
flags.supportDualSource = (gstate_c.featureFlags & GPU_SUPPORTS_DUALSOURCE_BLEND) != 0;
if (!VSID.Bit(VS_BIT_IS_THROUGH) && gstate_c.Supports(GPU_SUPPORTS_DEPTH_CLAMP)) {
flags.useClipDistance0 = true;
flags.useClipDistance1 = true;
if (VSID.Bit(VS_BIT_VERTEX_RANGE_CULLING) && gstate_c.Supports(GPU_SUPPORTS_CLIP_DISTANCE))
flags.useClipDistance2 = true;
} else if (VSID.Bit(VS_BIT_VERTEX_RANGE_CULLING) && gstate_c.Supports(GPU_SUPPORTS_CLIP_DISTANCE)) {
flags.useClipDistance0 = true;
}
program = render->CreateProgram(shaders, semantics, queries, initialize, flags);
// The rest, use the "dirty" mechanism.
dirtyUniforms = DIRTY_ALL_UNIFORMS;
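A sketch of the flags struct that replaces the old (useDualSource, useClip0) bool parameters; the exact definition is an assumption, inferred from the fields set above:
struct GLRProgramFlags {
	bool supportDualSource = false;
	bool useClipDistance0 = false;
	bool useClipDistance1 = false;
	bool useClipDistance2 = false;
};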

View File

@ -149,13 +149,14 @@ void DrawEngineGLES::ApplyDrawState(int prim) {
GenericLogicState &logicState = pipelineState_.logicState;
if (pipelineState_.FramebufferRead()) {
bool fboTexNeedsBind = false;
ApplyFramebufferRead(&fboTexNeedsBind);
FBOTexState fboTexBindState = FBO_TEX_NONE;
ApplyFramebufferRead(&fboTexBindState);
// The shader takes over the responsibility for blending, so recompute.
ApplyStencilReplaceAndLogicOpIgnoreBlend(blendState.replaceAlphaWithStencil, blendState);
// We copy the framebuffer here, as doing so will wipe any blend state if we do it later.
if (fboTexNeedsBind) {
// fboTexNeedsBind_ won't be set if we can read directly from the target.
if (fboTexBindState == FBO_TEX_COPY_BIND_TEX) {
// Note that this is positions, not UVs, that we need the copy from.
framebufferManager_->BindFramebufferAsColorTexture(1, framebufferManager_->GetCurrentRenderVFB(), BINDFBCOLOR_MAY_COPY);
// If we are rendering at a higher resolution, linear is probably best for the dest color.
@ -166,6 +167,9 @@ void DrawEngineGLES::ApplyDrawState(int prim) {
// Must dirty blend state here so we re-copy next time. Example: Lunar's spell effects.
dirtyRequiresRecheck_ |= DIRTY_BLEND_STATE;
gstate_c.Dirty(DIRTY_BLEND_STATE);
} else if (fboTexBindState == FBO_TEX_READ_FRAMEBUFFER) {
// No action needed here.
fboTexBindState = FBO_TEX_NONE;
}
dirtyRequiresRecheck_ |= DIRTY_FRAGMENTSHADER_STATE;
gstate_c.Dirty(DIRTY_FRAGMENTSHADER_STATE);

View File

@ -89,7 +89,7 @@ const CommonCommandTableEntry commonCommandTable[] = {
// These affect the fragment shader so need flushing.
{ GE_CMD_CLEARMODE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE },
{ GE_CMD_TEXTUREMAPENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE },
{ GE_CMD_FOGENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE},
{ GE_CMD_FOGENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE },
{ GE_CMD_TEXMODE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_TEXTURE_PARAMS | DIRTY_FRAGMENTSHADER_STATE },
{ GE_CMD_TEXSHADELS, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VERTEXSHADER_STATE },
// Raster state for Direct3D 9, uncommon.
@ -2414,10 +2414,10 @@ void GPUCommon::Execute_ImmVertexAlphaPrim(u32 op, u32 diff) {
immPrim_ = (GEPrimitiveType)prim;
// Flags seem to only be respected from the first prim.
immFlags_ = op & 0x00FFF800;
immFirstSent_ = false;
} else if (prim == GE_PRIM_KEEP_PREVIOUS && immPrim_ != GE_PRIM_INVALID) {
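// Indexed by GEPrimitiveType: POINTS=1, LINES=2, TRIANGLES=3, RECTANGLES=2
// verts per prim; strips and fans stay 0, since they are flushed elsewhere.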
static constexpr int flushPrimCount[] = { 1, 2, 0, 3, 0, 0, 2, 0 };
// Instead of finding a proper point to flush, we just emit a full rectangle every time one
// is finished.
// Instead of finding a proper point to flush, we just emit prims when we can.
if (immCount_ == flushPrimCount[immPrim_ & 7])
FlushImm();
} else {
@ -2439,31 +2439,6 @@ void GPUCommon::FlushImm() {
}
UpdateUVScaleOffset();
// Instead of plumbing through properly (we'd need to inject these pretransformed vertices in the middle
// of SoftwareTransform(), which would take a lot of refactoring), we'll cheat and just turn these into
// through vertices.
// Since the only known use is Thrillville and it only uses it to clear, we just use color and pos.
struct ImmVertex {
float uv[2];
uint32_t color;
float xyz[3];
};
ImmVertex temp[MAX_IMMBUFFER_SIZE];
uint32_t color1Used = 0;
for (int i = 0; i < immCount_; i++) {
// Since we're sending through, scale back up to w/h.
temp[i].uv[0] = immBuffer_[i].u * gstate.getTextureWidth(0);
temp[i].uv[1] = immBuffer_[i].v * gstate.getTextureHeight(0);
temp[i].color = immBuffer_[i].color0_32;
temp[i].xyz[0] = immBuffer_[i].pos[0];
temp[i].xyz[1] = immBuffer_[i].pos[1];
temp[i].xyz[2] = immBuffer_[i].pos[2];
color1Used |= immBuffer_[i].color1_32;
}
int vtype = GE_VTYPE_TC_FLOAT | GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_8888 | GE_VTYPE_THROUGH;
// TODO: Handle fog and secondary color somehow?
bool antialias = (immFlags_ & GE_IMM_ANTIALIAS) != 0;
bool prevAntialias = gstate.isAntiAliasEnabled();
bool shading = (immFlags_ & GE_IMM_SHADING) != 0;
@ -2473,40 +2448,42 @@ void GPUCommon::FlushImm() {
int cullMode = (immFlags_ & GE_IMM_CULLFACE) != 0 ? 1 : 0;
bool texturing = (immFlags_ & GE_IMM_TEXTURE) != 0;
bool prevTexturing = gstate.isTextureMapEnabled();
bool fog = (immFlags_ & GE_IMM_FOG) != 0;
bool prevFog = gstate.isFogEnabled();
bool dither = (immFlags_ & GE_IMM_DITHER) != 0;
bool prevDither = gstate.isDitherEnabled();
if ((immFlags_ & GE_IMM_CLIPMASK) != 0) {
WARN_LOG_REPORT_ONCE(geimmclipvalue, G3D, "Imm vertex used clip value, flags=%06x", immFlags_);
} else if ((immFlags_ & GE_IMM_FOG) != 0) {
WARN_LOG_REPORT_ONCE(geimmfog, G3D, "Imm vertex used fog, flags=%06x", immFlags_);
} else if (color1Used != 0 && gstate.isUsingSecondaryColor()) {
WARN_LOG_REPORT_ONCE(geimmcolor1, G3D, "Imm vertex used secondary color, flags=%06x", immFlags_);
}
if (texturing != prevTexturing || cullEnable != prevCullEnable || dither != prevDither || prevShading != shading) {
bool changed = texturing != prevTexturing || cullEnable != prevCullEnable || dither != prevDither;
changed = changed || prevShading != shading || prevFog != fog;
if (changed) {
DispatchFlush();
gstate.antiAliasEnable = (GE_CMD_ANTIALIASENABLE << 24) | (int)antialias;
gstate.shademodel = (GE_CMD_SHADEMODE << 24) | (int)shading;
gstate.cullfaceEnable = (GE_CMD_CULLFACEENABLE << 24) | (int)cullEnable;
gstate.textureMapEnable = (GE_CMD_TEXTUREMAPENABLE << 24) | (int)texturing;
gstate.fogEnable = (GE_CMD_FOGENABLE << 24) | (int)fog;
gstate.ditherEnable = (GE_CMD_DITHERENABLE << 24) | (int)dither;
gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_RASTER_STATE);
gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_UVSCALEOFFSET | DIRTY_CULLRANGE);
}
int bytesRead;
uint32_t vertTypeID = GetVertTypeID(vtype, 0);
drawEngineCommon_->DispatchSubmitImm(temp, nullptr, immPrim_, immCount_, vertTypeID, cullMode, &bytesRead);
// TODO: In the future, make a special path for these.
// drawEngineCommon_->DispatchSubmitImm(immBuffer_, immCount_);
drawEngineCommon_->DispatchSubmitImm(immPrim_, immBuffer_, immCount_, cullMode, immFirstSent_);
immCount_ = 0;
immFirstSent_ = true;
gstate.antiAliasEnable = (GE_CMD_ANTIALIASENABLE << 24) | (int)prevAntialias;
gstate.shademodel = (GE_CMD_SHADEMODE << 24) | (int)prevShading;
gstate.cullfaceEnable = (GE_CMD_CULLFACEENABLE << 24) | (int)prevCullEnable;
gstate.textureMapEnable = (GE_CMD_TEXTUREMAPENABLE << 24) | (int)prevTexturing;
gstate.ditherEnable = (GE_CMD_DITHERENABLE << 24) | (int)prevDither;
gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_RASTER_STATE);
if (changed) {
DispatchFlush();
gstate.antiAliasEnable = (GE_CMD_ANTIALIASENABLE << 24) | (int)prevAntialias;
gstate.shademodel = (GE_CMD_SHADEMODE << 24) | (int)prevShading;
gstate.cullfaceEnable = (GE_CMD_CULLFACEENABLE << 24) | (int)prevCullEnable;
gstate.textureMapEnable = (GE_CMD_TEXTUREMAPENABLE << 24) | (int)prevTexturing;
gstate.fogEnable = (GE_CMD_FOGENABLE << 24) | (int)prevFog;
gstate.ditherEnable = (GE_CMD_DITHERENABLE << 24) | (int)prevDither;
gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_UVSCALEOFFSET | DIRTY_CULLRANGE);
}
}
void GPUCommon::ExecuteOp(u32 op, u32 diff) {
@ -3176,3 +3153,56 @@ size_t GPUCommon::FormatGPUStatsCommon(char *buffer, size_t size) {
vertexAverageCycles
);
}
u32 GPUCommon::CheckGPUFeatures() const {
u32 features = 0;
if (draw_->GetDeviceCaps().logicOpSupported) {
features |= GPU_SUPPORTS_LOGIC_OP;
}
if (draw_->GetDeviceCaps().anisoSupported) {
features |= GPU_SUPPORTS_ANISOTROPY;
}
if (draw_->GetDeviceCaps().textureNPOTFullySupported) {
features |= GPU_SUPPORTS_TEXTURE_NPOT;
}
if (draw_->GetDeviceCaps().dualSourceBlend) {
if (!g_Config.bVendorBugChecksEnabled || !draw_->GetBugs().Has(Draw::Bugs::DUAL_SOURCE_BLENDING_BROKEN)) {
features |= GPU_SUPPORTS_DUALSOURCE_BLEND;
}
}
if (draw_->GetDeviceCaps().blendMinMaxSupported) {
features |= GPU_SUPPORTS_BLEND_MINMAX;
}
if (draw_->GetDeviceCaps().clipDistanceSupported) {
features |= GPU_SUPPORTS_CLIP_DISTANCE;
}
if (draw_->GetDeviceCaps().cullDistanceSupported) {
features |= GPU_SUPPORTS_CULL_DISTANCE;
}
if (draw_->GetDeviceCaps().textureDepthSupported) {
features |= GPU_SUPPORTS_DEPTH_TEXTURE;
}
if (!draw_->GetBugs().Has(Draw::Bugs::BROKEN_NAN_IN_CONDITIONAL)) {
// Ignore the compat setting if clip and cull are both enabled.
// When supported, we can do the depth side of range culling more correctly.
const bool supported = draw_->GetDeviceCaps().clipDistanceSupported && draw_->GetDeviceCaps().cullDistanceSupported;
const bool disabled = PSP_CoreParameter().compat.flags().DisableRangeCulling;
if (supported || !disabled) {
features |= GPU_SUPPORTS_VS_RANGE_CULLING;
}
}
if (draw_->GetDeviceCaps().framebufferFetchSupported) {
features |= GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH;
}
if (PSP_CoreParameter().compat.flags().ClearToRAM) {
features |= GPU_USE_CLEAR_RAM_HACK;
}
return features;
}
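Sketched, the new contract: each backend override is a const query that starts from this shared mask and ORs in API-specific bits, and the caller (not the check itself) writes gstate_c.featureFlags. The backend name below is hypothetical:
u32 GPU_SomeBackend::CheckGPUFeatures() const {
	u32 features = GPUCommon::CheckGPUFeatures();  // shared, caps-driven bits
	features |= GPU_SUPPORTS_16BIT_FORMATS;        // backend-specific additions
	return features;                               // caller assigns gstate_c.featureFlags
}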

View File

@ -76,7 +76,7 @@ public:
Draw::DrawContext *GetDrawContext() override {
return draw_;
}
virtual void CheckGPUFeatures() = 0;
virtual u32 CheckGPUFeatures() const;
void UpdateCmdInfo();
@ -103,7 +103,7 @@ public:
void ExecuteOp(u32 op, u32 diff) override;
void PreExecuteOp(u32 op, u32 diff) override;
bool InterpretList(DisplayList &list) override;
bool InterpretList(DisplayList &list);
void ProcessDLQueue();
u32 UpdateStall(int listid, u32 newstall) override;
u32 EnqueueList(u32 listpc, u32 stall, int subIntrBase, PSPPointer<PspGeListArgs> args, bool head) override;
@ -355,6 +355,7 @@ protected:
int immCount_ = 0;
GEPrimitiveType immPrim_ = GE_PRIM_INVALID;
uint32_t immFlags_ = 0;
bool immFirstSent_ = false;
std::string reportingPrimaryInfo_;
std::string reportingFullInfo_;

View File

@ -205,7 +205,6 @@ public:
virtual void PreExecuteOp(u32 op, u32 diff) = 0;
virtual void ExecuteOp(u32 op, u32 diff) = 0;
virtual bool InterpretList(DisplayList& list) = 0;
// Framebuffer management
virtual void SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat format) = 0;

View File

@ -485,7 +485,8 @@ enum {
// Free bit: 15
GPU_SUPPORTS_DEPTH_TEXTURE = FLAG_BIT(16),
GPU_SUPPORTS_ACCURATE_DEPTH = FLAG_BIT(17),
// Free bits: 18-19
GPU_SUPPORTS_FRAGMENT_SHADER_INTERLOCK = FLAG_BIT(18),
// Free bits: 19
GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH = FLAG_BIT(20),
GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT = FLAG_BIT(21),
GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT = FLAG_BIT(22),
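For reference, FLAG_BIT is assumed to be the usual single-bit macro, which is why taking bit 18 leaves only bit 19 free in this range:
#define FLAG_BIT(x) (1 << (x))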

View File

@ -167,7 +167,7 @@ void BinManager::UpdateState(bool throughMode) {
if (states_.Full())
Flush("states");
stateIndex_ = (uint16_t)states_.Push(RasterizerState());
ComputeRasterizerState(&states_[stateIndex_], throughMode);
ComputeRasterizerState(&states_[stateIndex_]);
states_[stateIndex_].samplerID.cached.clut = cluts_[clutIndex_].readable;
ClearDirty(SoftDirty::PIXEL_ALL | SoftDirty::SAMPLER_ALL | SoftDirty::RAST_ALL);
@ -326,7 +326,7 @@ void BinManager::AddTriangle(const VertexData &v0, const VertexData &v1, const V
if (d01.x * d02.y - d01.y * d02.x < 0)
return;
// If all points have identical coords, we'll have 0 weights and not skip properly, so skip here.
if (d01.x == 0 && d01.y == 0 && d02.x == 0 && d02.y == 0)
if ((d01.x == 0 && d02.x == 0) || (d01.y == 0 && d02.y == 0))
return;
// Was it fully outside the scissor?
@ -474,6 +474,9 @@ void BinManager::Drain() {
}
void BinManager::Flush(const char *reason) {
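// Assumption: 0x7FFFFFFF is the queue range's reset value, so if it is still
// set, nothing has been binned since the last flush and there is nothing to do.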
if (queueRange_.x1 == 0x7FFFFFFF)
return;
double st;
if (coreCollectDebugStats)
st = time_now_d();

View File

@ -133,6 +133,10 @@ static inline bool CheckOutsideZ(ClipCoords p, int &pos, int &neg) {
void ProcessRect(const VertexData &v0, const VertexData &v1, BinManager &binner) {
if (!binner.State().throughMode) {
// If any verts were outside range, throw the entire prim away.
if (v0.OutsideRange() || v1.OutsideRange())
return;
// We may discard the entire rect based on depth values.
int outsidePos = 0, outsideNeg = 0;
CheckOutsideZ(v0.clippos, outsidePos, outsideNeg);
@ -176,6 +180,12 @@ void ProcessRect(const VertexData &v0, const VertexData &v1, BinManager &binner)
}
void ProcessPoint(const VertexData &v0, BinManager &binner) {
// If any verts were outside range, throw the entire prim away.
if (!binner.State().throughMode) {
if (v0.OutsideRange())
return;
}
// Points need no clipping. Will be bounds checked in the rasterizer (which seems backwards?)
binner.AddPoint(v0);
}
@ -187,6 +197,10 @@ void ProcessLine(const VertexData &v0, const VertexData &v1, BinManager &binner)
return;
}
// If any verts were outside range, throw the entire prim away.
if (v0.OutsideRange() || v1.OutsideRange())
return;
int outsidePos = 0, outsideNeg = 0;
CheckOutsideZ(v0.clippos, outsidePos, outsideNeg);
CheckOutsideZ(v1.clippos, outsidePos, outsideNeg);
@ -222,6 +236,10 @@ void ProcessLine(const VertexData &v0, const VertexData &v1, BinManager &binner)
void ProcessTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2, const VertexData &provoking, BinManager &binner) {
int mask = 0;
if (!binner.State().throughMode) {
// If any verts were outside range, throw the entire prim away.
if (v0.OutsideRange() || v1.OutsideRange() || v2.OutsideRange())
return;
mask |= CalcClipMask(v0.clippos);
mask |= CalcClipMask(v1.clippos);
mask |= CalcClipMask(v2.clippos);

View File

@ -48,11 +48,11 @@ static inline PixelBlendFactor OptimizeAlphaFactor(uint32_t color) {
return PixelBlendFactor::FIX;
}
void ComputePixelFuncID(PixelFuncID *id, bool throughMode) {
void ComputePixelFuncID(PixelFuncID *id) {
id->fullKey = 0;
// TODO: Could this be minz > 0x0000 || maxz < 0xFFFF? Maybe unsafe, depending on verts...
id->applyDepthRange = !throughMode;
id->applyDepthRange = !gstate.isModeThrough();
// Dither happens even in clear mode.
id->dithering = gstate.isDitherEnabled();
id->fbFormat = gstate.FrameBufFormat();
@ -169,7 +169,7 @@ void ComputePixelFuncID(PixelFuncID *id, bool throughMode) {
}
id->applyLogicOp = gstate.isLogicOpEnabled() && gstate.getLogicOp() != GE_LOGIC_COPY;
id->applyFog = gstate.isFogEnabled() && !throughMode;
id->applyFog = gstate.isFogEnabled() && !gstate.isModeThrough();
id->earlyZChecks = id->DepthTestFunc() != GE_COMP_ALWAYS;
if (id->stencilTest && id->earlyZChecks) {

View File

@ -244,7 +244,7 @@ struct hash<SamplerID> {
};
void ComputePixelFuncID(PixelFuncID *id, bool throughMode);
void ComputePixelFuncID(PixelFuncID *id);
std::string DescribePixelFuncID(const PixelFuncID &id);
void ComputeSamplerID(SamplerID *id);

View File

@ -93,8 +93,8 @@ static inline Vec4<float> Interpolate(const float &c0, const float &c1, const fl
return Interpolate(c0, c1, c2, w0.Cast<float>(), w1.Cast<float>(), w2.Cast<float>(), wsum_recip);
}
void ComputeRasterizerState(RasterizerState *state, bool throughMode) {
ComputePixelFuncID(&state->pixelID, throughMode);
void ComputeRasterizerState(RasterizerState *state) {
ComputePixelFuncID(&state->pixelID);
state->drawPixel = Rasterizer::GetSingleFunc(state->pixelID);
state->enableTextures = gstate.isTextureMapEnabled() && !state->pixelID.clearMode;
@ -132,7 +132,7 @@ void ComputeRasterizerState(RasterizerState *state, bool throughMode) {
}
state->shadeGouraud = gstate.getShadeMode() == GE_SHADE_GOURAUD;
state->throughMode = throughMode;
state->throughMode = gstate.isModeThrough();
state->antialiasLines = gstate.isAntiAliasEnabled();
#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
@ -1235,6 +1235,7 @@ void ClearRectangle(const VertexData &v0, const VertexData &v1, const BinCoords
case GE_FORMAT_INVALID:
case GE_FORMAT_DEPTH16:
case GE_FORMAT_CLUT8:
_dbg_assert_msg_(false, "Software: invalid framebuf format.");
break;
}
@ -1365,7 +1366,7 @@ void DrawLine(const VertexData &v0, const VertexData &v1, const BinCoords &range
maskOK = false;
}
if (!CheckDepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
if (!CheckDepthTestPassed(pixelID.DepthTestFunc(), p.x, p.y, pixelID.cached.depthbufStride, z)) {
maskOK = false;
}
}

View File

@ -65,7 +65,7 @@ struct RasterizerState {
}
};
void ComputeRasterizerState(RasterizerState *state, bool throughMode);
void ComputeRasterizerState(RasterizerState *state);
// Draws a triangle if its vertices are specified in counter-clockwise order
void DrawTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2, const BinCoords &range, const RasterizerState &state);

View File

@ -116,8 +116,12 @@ void DrawSprite(const VertexData &v0, const VertexData &v1, const BinCoords &ran
DrawingCoords scissorTL = TransformUnit::ScreenToDrawing(range.x1, range.y1);
DrawingCoords scissorBR = TransformUnit::ScreenToDrawing(range.x2, range.y2);
int z = v1.screenpos.z;
int fog = 255;
const int z = v1.screenpos.z;
constexpr int fog = 255;
// Since it's flat, we can check depth range early. Matters for earlyZChecks.
if (pixelID.applyDepthRange && (z < pixelID.cached.minz || z > pixelID.cached.maxz))
return;
bool isWhite = v1.color0 == 0xFFFFFFFF;
@ -204,15 +208,31 @@ void DrawSprite(const VertexData &v0, const VertexData &v1, const BinCoords &ran
float t = tf_start;
const Vec4<int> c0 = Vec4<int>::FromRGBA(v1.color0);
for (int y = pos0.y; y < pos1.y; y++) {
float s = sf_start;
// Not really that fast but faster than triangle.
for (int x = pos0.x; x < pos1.x; x++) {
Vec4<int> prim_color = state.nearest(s, t, xoff, yoff, ToVec4IntArg(c0), &texptr, &texbufw, 0, 0, state.samplerID);
state.drawPixel(x, y, z, 255, ToVec4IntArg(prim_color), pixelID);
s += dsf;
if (pixelID.earlyZChecks) {
for (int y = pos0.y; y < pos1.y; y++) {
float s = sf_start;
// Not really that fast but faster than triangle.
for (int x = pos0.x; x < pos1.x; x++) {
if (CheckDepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
Vec4<int> prim_color = state.nearest(s, t, xoff, yoff, ToVec4IntArg(c0), &texptr, &texbufw, 0, 0, state.samplerID);
state.drawPixel(x, y, z, fog, ToVec4IntArg(prim_color), pixelID);
}
s += dsf;
}
t += dtf;
}
} else {
for (int y = pos0.y; y < pos1.y; y++) {
float s = sf_start;
// Not really that fast but faster than triangle.
for (int x = pos0.x; x < pos1.x; x++) {
Vec4<int> prim_color = state.nearest(s, t, xoff, yoff, ToVec4IntArg(c0), &texptr, &texbufw, 0, 0, state.samplerID);
state.drawPixel(x, y, z, fog, ToVec4IntArg(prim_color), pixelID);
s += dsf;
}
t += dtf;
}
t += dtf;
}
}
} else {
@ -239,6 +259,16 @@ void DrawSprite(const VertexData &v0, const VertexData &v1, const BinCoords &ran
pixel++;
}
}
} else if (pixelID.earlyZChecks) {
const Vec4<int> prim_color = Vec4<int>::FromRGBA(v1.color0);
for (int y = pos0.y; y < pos1.y; y++) {
for (int x = pos0.x; x < pos1.x; x++) {
if (!CheckDepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z))
continue;
state.drawPixel(x, y, z, fog, ToVec4IntArg(prim_color), pixelID);
}
}
} else {
const Vec4<int> prim_color = Vec4<int>::FromRGBA(v1.color0);
for (int y = pos0.y; y < pos1.y; y++) {
@ -325,15 +355,18 @@ bool RectangleFastPath(const VertexData &v0, const VertexData &v1, BinManager &b
}
static bool AreCoordsRectangleCompatible(const RasterizerState &state, const VertexData &data0, const VertexData &data1) {
if (!(data1.color0 == data0.color0))
if (data1.color0 != data0.color0)
return false;
if (!(data1.screenpos.z == data0.screenpos.z)) {
if (data1.screenpos.z != data0.screenpos.z) {
// Sometimes, we don't actually care about z.
if (state.pixelID.depthWrite || state.pixelID.DepthTestFunc() != GE_COMP_ALWAYS)
return false;
}
if (!state.throughMode) {
if (!state.throughMode && !(data1.color1 == data0.color1))
if (data1.color1 != data0.color1)
return false;
// If either vertex is outside range, the whole rect should be culled.
if (data1.OutsideRange() || data0.OutsideRange())
return false;
// Do we have to think about perspective correction or slope mip level?
if (state.enableTextures && data1.clippos.w != data0.clippos.w) {

View File

@ -361,7 +361,7 @@ const SoftwareCommandTableEntry softgpuCommandTable[] = {
{ GE_CMD_VTCT },
{ GE_CMD_VTCQ },
{ GE_CMD_VCV },
{ GE_CMD_VAP, FLAG_EXECUTE, SoftDirty::NONE, &GPUCommon::Execute_ImmVertexAlphaPrim },
{ GE_CMD_VAP, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_ImmVertexAlphaPrim },
{ GE_CMD_VFC },
{ GE_CMD_VSCV },
@ -639,6 +639,7 @@ void SoftGPU::CopyToCurrentFboFromDisplayRam(int srcwidth, int srcheight) {
}
void SoftGPU::CopyDisplayToOutput(bool reallyDirty) {
drawEngine_->transformUnit.Flush("output");
// The display always shows 480x272.
CopyToCurrentFboFromDisplayRam(FB_WIDTH, FB_HEIGHT);
MarkDirty(displayFramebuf_, displayStride_, 272, displayFormat_, SoftGPUVRAMDirty::CLEAR);
@ -650,7 +651,7 @@ void SoftGPU::MarkDirty(uint32_t addr, uint32_t stride, uint32_t height, GEBuffe
}
void SoftGPU::MarkDirty(uint32_t addr, uint32_t bytes, SoftGPUVRAMDirty value) {
// Don't bother tracking if frameskipping.
// Only bother tracking if frameskipping.
if (g_Config.iFrameSkip == 0)
return;
if (!Memory::IsVRAMAddress(addr) || !Memory::IsVRAMAddress(addr + bytes - 1))
@ -1005,19 +1006,24 @@ void SoftGPU::Execute_LoadClut(u32 op, u32 diff) {
void SoftGPU::Execute_FramebufPtr(u32 op, u32 diff) {
// We assume fb.data won't change while we're drawing.
drawEngine_->transformUnit.Flush("framebuf");
fb.data = Memory::GetPointerWrite(gstate.getFrameBufAddress());
if (diff) {
drawEngine_->transformUnit.Flush("framebuf");
fb.data = Memory::GetPointerWrite(gstate.getFrameBufAddress());
}
}
void SoftGPU::Execute_FramebufFormat(u32 op, u32 diff) {
// We should flush, because ranges within bins may change.
drawEngine_->transformUnit.Flush("framebuf");
if (diff)
drawEngine_->transformUnit.Flush("framebuf");
}
void SoftGPU::Execute_ZbufPtr(u32 op, u32 diff) {
// We assume depthbuf.data won't change while we're drawing.
drawEngine_->transformUnit.Flush("depthbuf");
depthbuf.data = Memory::GetPointerWrite(gstate.getDepthBufAddress());
if (diff) {
drawEngine_->transformUnit.Flush("depthbuf");
depthbuf.data = Memory::GetPointerWrite(gstate.getDepthBufAddress());
}
}
void SoftGPU::Execute_VertexType(u32 op, u32 diff) {
@ -1109,6 +1115,12 @@ void SoftGPU::Execute_BoneMtxData(u32 op, u32 diff) {
gstate.boneMatrixData = GE_CMD_BONEMATRIXDATA << 24;
}
void SoftGPU::Execute_ImmVertexAlphaPrim(u32 op, u32 diff) {
GPUCommon::Execute_ImmVertexAlphaPrim(op, diff);
// We won't flush as often as hardware renderers, so we want to flush right away.
FlushImm();
}
void SoftGPU::Execute_Call(u32 op, u32 diff) {
PROFILE_THIS_SCOPE("gpu_call");
@ -1138,6 +1150,18 @@ void SoftGPU::FinishDeferred() {
drawEngine_->transformUnit.Flush("finish");
}
int SoftGPU::ListSync(int listid, int mode) {
// Take this as a cue that we need to finish drawing.
drawEngine_->transformUnit.Flush("listsync");
return GPUCommon::ListSync(listid, mode);
}
u32 SoftGPU::DrawSync(int mode) {
// Take this as a cue that we need to finish drawing.
drawEngine_->transformUnit.Flush("drawsync");
return GPUCommon::DrawSync(mode);
}
void SoftGPU::GetStats(char *buffer, size_t bufsize) {
drawEngine_->transformUnit.GetStats(buffer, bufsize);
}

View File

@ -127,10 +127,12 @@ public:
SoftGPU(GraphicsContext *gfxCtx, Draw::DrawContext *draw);
~SoftGPU();
void CheckGPUFeatures() override {}
u32 CheckGPUFeatures() const override { return 0; }
void InitClear() override {}
void ExecuteOp(u32 op, u32 diff) override;
void FinishDeferred() override;
int ListSync(int listid, int mode) override;
u32 DrawSync(int mode) override;
void SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat format) override;
void CopyDisplayToOutput(bool reallyDirty) override;
@ -185,6 +187,8 @@ public:
void Execute_TgenMtxData(u32 op, u32 diff);
void Execute_BoneMtxData(u32 op, u32 diff);
void Execute_ImmVertexAlphaPrim(u32 op, u32 diff);
typedef void (SoftGPU::*CmdFunc)(u32 op, u32 diff);
protected:

View File

@ -70,14 +70,66 @@ void SoftwareDrawEngine::DispatchSubmitPrim(const void *verts, const void *inds,
transformUnit.SubmitPrimitive(verts, inds, prim, vertexCount, vertTypeID, bytesRead, this);
}
void SoftwareDrawEngine::DispatchSubmitImm(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead) {
void SoftwareDrawEngine::DispatchSubmitImm(GEPrimitiveType prim, TransformedVertex *buffer, int vertexCount, int cullMode, bool continuation) {
uint32_t vertTypeID = GetVertTypeID(gstate.vertType | GE_VTYPE_POS_FLOAT, gstate.getUVGenMode());
int flipCull = cullMode != gstate.getCullMode() ? 1 : 0;
// TODO: For now, just setting all dirty.
transformUnit.SetDirty(SoftDirty(-1));
gstate.cullmode ^= flipCull;
transformUnit.SubmitPrimitive(verts, inds, prim, vertexCount, vertTypeID, bytesRead, this);
// TODO: This is a bit ugly. Should bypass when clipping...
uint32_t xScale = gstate.viewportxscale;
uint32_t xCenter = gstate.viewportxcenter;
uint32_t yScale = gstate.viewportyscale;
uint32_t yCenter = gstate.viewportycenter;
uint32_t zScale = gstate.viewportzscale;
uint32_t zCenter = gstate.viewportzcenter;
// Force scale to 1 and center to zero.
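// (GE registers store the top 24 bits of an IEEE float, i.e. float bits >> 8,
// so 0x3F8000 encodes 1.0f and 0x477FFF encodes 65535.0f.)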
gstate.viewportxscale = (GE_CMD_VIEWPORTXSCALE << 24) | 0x3F8000;
gstate.viewportxcenter = (GE_CMD_VIEWPORTXCENTER << 24) | 0x000000;
gstate.viewportyscale = (GE_CMD_VIEWPORTYSCALE << 24) | 0x3F8000;
gstate.viewportycenter = (GE_CMD_VIEWPORTYCENTER << 24) | 0x000000;
// Z we scale to 65535 for neg z clipping.
gstate.viewportzscale = (GE_CMD_VIEWPORTZSCALE << 24) | 0x477FFF;
gstate.viewportzcenter = (GE_CMD_VIEWPORTZCENTER << 24) | 0x000000;
// Before we start, submit 0 prims to reset the prev prim type.
// Following submits will always be KEEP_PREVIOUS.
if (!continuation)
transformUnit.SubmitPrimitive(nullptr, nullptr, prim, 0, vertTypeID, nullptr, this);
for (int i = 0; i < vertexCount; i++) {
VertexData vert;
vert.clippos = ClipCoords(buffer[i].pos);
vert.texturecoords.x = buffer[i].u;
vert.texturecoords.y = buffer[i].v;
if (gstate.isModeThrough()) {
vert.texturecoords.x *= gstate.getTextureWidth(0);
vert.texturecoords.y *= gstate.getTextureHeight(0);
} else {
vert.clippos.z *= 1.0f / 65535.0f;
}
vert.color0 = buffer[i].color0_32;
vert.color1 = gstate.isUsingSecondaryColor() && !gstate.isModeThrough() ? buffer[i].color1_32 : 0;
vert.fogdepth = buffer[i].fog;
vert.screenpos.x = (int)(buffer[i].x * 16.0f);
vert.screenpos.y = (int)(buffer[i].y * 16.0f);
vert.screenpos.z = (u16)(u32)buffer[i].z;
transformUnit.SubmitImmVertex(vert, this);
}
gstate.viewportxscale = xScale;
gstate.viewportxcenter = xCenter;
gstate.viewportyscale = yScale;
gstate.viewportycenter = yCenter;
gstate.viewportzscale = zScale;
gstate.viewportzcenter = zCenter;
gstate.cullmode ^= flipCull;
// TODO: Should really clear, but the vertex type is faked so things might need resetting...
// TODO: Should really clear, but a bunch of values are forced so this is safest.
transformUnit.SetDirty(SoftDirty(-1));
}
@ -273,7 +325,7 @@ void ComputeTransformState(TransformState *state, const VertexReader &vreader) {
state->roundToScreen = &ClipToScreenInternal<false, true>;
}
VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState &state, bool &outside_range_flag) {
VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState &state) {
PROFILE_THIS_SCOPE("read_vert");
VertexData vertex;
@ -362,9 +414,13 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformState
#else
screenScaled = vertex.clippos.xyz() * state.screenScale / vertex.clippos.w + state.screenAdd;
#endif
bool outside_range_flag = false;
vertex.screenpos = state.roundToScreen(screenScaled, vertex.clippos, &outside_range_flag);
if (outside_range_flag)
if (outside_range_flag) {
// We use this, essentially, as the flag.
vertex.screenpos.x = 0x7FFFFFFF;
return vertex;
}
if (state.enableFog) {
vertex.fogdepth = (viewpos.z + state.fogEnd) * state.fogSlope;
@ -447,20 +503,19 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
if (gstate_c.skipDrawReason & SKIPDRAW_SKIPFRAME) {
return;
}
// Throughmode never draws 8-bit primitives, maybe because they can't fully specify the screen?
if ((vertex_type & GE_VTYPE_THROUGH_MASK) != 0 && (vertex_type & GE_VTYPE_POS_MASK) == GE_VTYPE_POS_8BIT)
return;
// Vertices without position are just entirely culled.
// Note: Throughmode does draw 8-bit primitives, but positions are always zero - handled in decode.
if ((vertex_type & GE_VTYPE_POS_MASK) == 0)
return;
u16 index_lower_bound = 0;
u16 index_upper_bound = vertex_count - 1;
u16 index_upper_bound = vertex_count == 0 ? 0 : vertex_count - 1;
IndexConverter ConvertIndex(vertex_type, indices);
if (indices)
GetIndexBounds(indices, vertex_count, vertex_type, &index_lower_bound, &index_upper_bound);
vdecoder.DecodeVerts(decoded_, vertices, index_lower_bound, index_upper_bound);
if (vertex_count != 0)
vdecoder.DecodeVerts(decoded_, vertices, index_lower_bound, index_upper_bound);
VertexReader vreader(decoded_, vtxfmt, vertex_type);
@ -471,19 +526,11 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
prim_type = prev_prim_;
}
int vtcs_per_prim;
switch (prim_type) {
case GE_PRIM_POINTS: vtcs_per_prim = 1; break;
case GE_PRIM_LINES: vtcs_per_prim = 2; break;
case GE_PRIM_TRIANGLES: vtcs_per_prim = 3; break;
case GE_PRIM_RECTANGLES: vtcs_per_prim = 2; break;
default: vtcs_per_prim = 0; break;
}
// TODO: Do this in two passes - first process the vertices (before indexing/stripping),
// then resolve the indices. This lets us avoid transforming shared vertices twice.
binner_->UpdateState(vreader.isThrough());
hasDraws_ = true;
static TransformState transformState;
if (binner_->HasDirty(SoftDirty::LIGHT_ALL | SoftDirty::TRANSFORM_ALL)) {
@ -494,9 +541,17 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
bool skipCull = !gstate.isCullEnabled() || gstate.isModeClear();
const CullType cullType = skipCull ? CullType::OFF : (gstate.getCullMode() ? CullType::CCW : CullType::CW);
bool outside_range_flag = false;
auto readVertexAt = [&](VertexReader &vreader, const TransformState &transformState, int vtx) {
if (indices) {
vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
} else {
vreader.Goto(vtx);
}
return ReadVertex(vreader, transformState);
};
if (vreader.isThrough() && cullType == CullType::OFF && prim_type == GE_PRIM_TRIANGLES && data_index_ + vertex_count >= 6 && ((data_index_ + vertex_count) % 6) == 0) {
if (vreader.isThrough() && cullType == CullType::OFF && prim_type == GE_PRIM_TRIANGLES && data_index_ == 0 && vertex_count >= 6 && ((vertex_count) % 6) == 0) {
// Some games send rectangles as a series of regular triangles.
// We look for this, but only in throughmode.
VertexData buf[6];
@ -506,20 +561,7 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
}
for (int vtx = 0; vtx < vertex_count; ++vtx) {
if (indices) {
vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
} else {
vreader.Goto(vtx);
}
buf[buf_index++] = ReadVertex(vreader, transformState, outside_range_flag);
if (buf_index >= 3 && outside_range_flag) {
// Cull, just pretend it didn't happen.
buf_index -= 3;
outside_range_flag = false;
continue;
}
buf[buf_index++] = readVertexAt(vreader, transformState, vtx);
if (buf_index < 6)
continue;
@ -552,73 +594,54 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
return;
}
// Note: intentionally, these allow for the case of vertex_count == 0, but data_index_ > 0.
// This is used for immediate-mode primitives.
switch (prim_type) {
case GE_PRIM_POINTS:
case GE_PRIM_LINES:
case GE_PRIM_TRIANGLES:
{
for (int vtx = 0; vtx < vertex_count; ++vtx) {
if (indices) {
vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
} else {
vreader.Goto(vtx);
}
data_[data_index_++] = ReadVertex(vreader, transformState, outside_range_flag);
if (data_index_ < vtcs_per_prim) {
// Keep reading. Note: an incomplete prim will stay read for GE_PRIM_KEEP_PREVIOUS.
continue;
}
// Okay, we've got enough verts. Reset the index for next time.
data_index_ = 0;
if (outside_range_flag) {
// Cull the prim if it was outside, and move to the next prim.
outside_range_flag = false;
continue;
}
switch (prim_type) {
case GE_PRIM_TRIANGLES:
SendTriangle(cullType, &data_[0]);
break;
case GE_PRIM_LINES:
Clipper::ProcessLine(data_[0], data_[1], *binner_);
break;
case GE_PRIM_POINTS:
Clipper::ProcessPoint(data_[0], *binner_);
break;
default:
_dbg_assert_msg_(false, "Unexpected prim type: %d", prim_type);
}
}
break;
for (int i = 0; i < data_index_; ++i)
Clipper::ProcessPoint(data_[i], *binner_);
data_index_ = 0;
for (int vtx = 0; vtx < vertex_count; ++vtx) {
data_[0] = readVertexAt(vreader, transformState, vtx);
Clipper::ProcessPoint(data_[0], *binner_);
}
break;
case GE_PRIM_LINES:
for (int i = 0; i < data_index_ - 1; i += 2)
Clipper::ProcessLine(data_[i + 0], data_[i + 1], *binner_);
data_index_ &= 1;
for (int vtx = 0; vtx < vertex_count; ++vtx) {
data_[data_index_++] = readVertexAt(vreader, transformState, vtx);
if (data_index_ == 2) {
Clipper::ProcessLine(data_[0], data_[1], *binner_);
data_index_ = 0;
}
}
break;
case GE_PRIM_TRIANGLES:
for (int vtx = 0; vtx < vertex_count; ++vtx) {
data_[data_index_++] = readVertexAt(vreader, transformState, vtx);
if (data_index_ < 3) {
// Keep reading. Note: an incomplete prim will stay read for GE_PRIM_KEEP_PREVIOUS.
continue;
}
// Okay, we've got enough verts. Reset the index for next time.
data_index_ = 0;
SendTriangle(cullType, &data_[0]);
}
// In case vertex_count was 0.
if (data_index_ >= 3) {
SendTriangle(cullType, &data_[0]);
data_index_ = 0;
}
break;
case GE_PRIM_RECTANGLES:
for (int vtx = 0; vtx < vertex_count; ++vtx) {
if (indices) {
vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
} else {
vreader.Goto(vtx);
}
data_[data_index_++] = ReadVertex(vreader, transformState, outside_range_flag);
if (outside_range_flag) {
outside_range_flag = false;
// Note: this is the post increment index. If odd, we set the first vert.
if (data_index_ & 1) {
// Skip the next one and forget this one.
vtx++;
data_index_--;
} else {
// Forget both of the last 2.
data_index_ -= 2;
}
}
data_[data_index_++] = readVertexAt(vreader, transformState, vtx);
if (data_index_ == 4 && vreader.isThrough() && cullType == CullType::OFF) {
if (Rasterizer::DetectRectangleThroughModeSlices(binner_->State(), data_)) {
@ -646,19 +669,7 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
// If data_index_ is 1 or 2, etc., it means we're continuing a line strip.
int skip_count = data_index_ == 0 ? 1 : 0;
for (int vtx = 0; vtx < vertex_count; ++vtx) {
if (indices) {
vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
} else {
vreader.Goto(vtx);
}
data_[(data_index_++) & 1] = ReadVertex(vreader, transformState, outside_range_flag);
if (outside_range_flag) {
// Drop all primitives containing the current vertex
skip_count = 2;
outside_range_flag = false;
continue;
}
data_[(data_index_++) & 1] = readVertexAt(vreader, transformState, vtx);
if (skip_count) {
--skip_count;
@ -667,6 +678,9 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
Clipper::ProcessLine(data_[data_index_ & 1], data_[(data_index_ & 1) ^ 1], *binner_);
}
}
// If this is from immediate-mode drawing, we always had one new vert (already in data_.)
if (isImmDraw_ && data_index_ >= 2)
Clipper::ProcessLine(data_[data_index_ & 1], data_[(data_index_ & 1) ^ 1], *binner_);
break;
}
@ -681,19 +695,15 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
if (data_index_ == 0 && vertex_count >= 4 && (vertex_count & 1) == 0 && cullType == CullType::OFF) {
for (int base = 0; base < vertex_count - 2; base += 2) {
for (int vtx = base == 0 ? 0 : 2; vtx < 4; ++vtx) {
if (indices) {
vreader.Goto(ConvertIndex(base + vtx) - index_lower_bound);
} else {
vreader.Goto(base + vtx);
}
data_[vtx] = ReadVertex(vreader, transformState, outside_range_flag);
data_[vtx] = readVertexAt(vreader, transformState, base + vtx);
}
// If a strip is effectively a rectangle, draw it as such!
int tl = -1, br = -1;
if (!outside_range_flag && Rasterizer::DetectRectangleFromStrip(binner_->State(), data_, &tl, &br)) {
if (Rasterizer::DetectRectangleFromStrip(binner_->State(), data_, &tl, &br)) {
Clipper::ProcessRect(data_[tl], data_[br], *binner_);
start_vtx += 2;
skip_count = 0;
if (base + 4 >= vertex_count) {
start_vtx = vertex_count;
break;
@ -710,32 +720,29 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
}
}
outside_range_flag = false;
for (int vtx = start_vtx; vtx < vertex_count; ++vtx) {
if (indices) {
vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
} else {
vreader.Goto(vtx);
}
for (int vtx = start_vtx; vtx < vertex_count && skip_count > 0; ++vtx) {
int provoking_index = (data_index_++) % 3;
data_[provoking_index] = ReadVertex(vreader, transformState, outside_range_flag);
if (outside_range_flag) {
// Drop all primitives containing the current vertex
skip_count = 2;
outside_range_flag = false;
continue;
}
data_[provoking_index] = readVertexAt(vreader, transformState, vtx);
--skip_count;
++start_vtx;
}
if (skip_count) {
--skip_count;
continue;
}
for (int vtx = start_vtx; vtx < vertex_count; ++vtx) {
int provoking_index = (data_index_++) % 3;
data_[provoking_index] = readVertexAt(vreader, transformState, vtx);
int wind = (data_index_ - 1) % 2;
CullType altCullType = cullType == CullType::OFF ? cullType : CullType((int)cullType ^ wind);
SendTriangle(altCullType, &data_[0], provoking_index);
}
// If this is from immediate-mode drawing, we always had one new vert (already in data_.)
if (isImmDraw_ && data_index_ >= 3) {
int provoking_index = (data_index_ - 1) % 3;
int wind = (data_index_ - 1) % 2;
CullType altCullType = cullType == CullType::OFF ? cullType : CullType((int)cullType ^ wind);
SendTriangle(altCullType, &data_[0], provoking_index);
}
break;
}
@ -747,64 +754,47 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
int start_vtx = 0;
// Only read the central vertex if we're not continuing.
if (data_index_ == 0) {
if (indices) {
vreader.Goto(ConvertIndex(0) - index_lower_bound);
} else {
vreader.Goto(0);
}
data_[0] = ReadVertex(vreader, transformState, outside_range_flag);
if (data_index_ == 0 && vertex_count > 0) {
data_[0] = readVertexAt(vreader, transformState, 0);
data_index_++;
start_vtx = 1;
// If the central vertex is outside range, all the points are toast.
if (outside_range_flag)
break;
}
if (data_index_ == 1 && vertex_count == 4 && cullType == CullType::OFF) {
for (int vtx = start_vtx; vtx < vertex_count; ++vtx) {
if (indices) {
vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
} else {
vreader.Goto(vtx);
}
data_[vtx] = ReadVertex(vreader, transformState, outside_range_flag);
data_[vtx] = readVertexAt(vreader, transformState, vtx);
}
int tl = -1, br = -1;
if (!outside_range_flag && Rasterizer::DetectRectangleFromFan(binner_->State(), data_, vertex_count, &tl, &br)) {
if (Rasterizer::DetectRectangleFromFan(binner_->State(), data_, vertex_count, &tl, &br)) {
Clipper::ProcessRect(data_[tl], data_[br], *binner_);
break;
}
}
outside_range_flag = false;
for (int vtx = start_vtx; vtx < vertex_count; ++vtx) {
if (indices) {
vreader.Goto(ConvertIndex(vtx) - index_lower_bound);
} else {
vreader.Goto(vtx);
}
for (int vtx = start_vtx; vtx < vertex_count && skip_count > 0; ++vtx) {
int provoking_index = 2 - ((data_index_++) % 2);
data_[provoking_index] = ReadVertex(vreader, transformState, outside_range_flag);
if (outside_range_flag) {
// Drop all primitives containing the current vertex
skip_count = 2;
outside_range_flag = false;
continue;
}
data_[provoking_index] = readVertexAt(vreader, transformState, vtx);
--skip_count;
++start_vtx;
}
if (skip_count) {
--skip_count;
continue;
}
for (int vtx = start_vtx; vtx < vertex_count; ++vtx) {
int provoking_index = 2 - ((data_index_++) % 2);
data_[provoking_index] = readVertexAt(vreader, transformState, vtx);
int wind = (data_index_ - 1) % 2;
CullType altCullType = cullType == CullType::OFF ? cullType : CullType((int)cullType ^ wind);
SendTriangle(altCullType, &data_[0], provoking_index);
}
// If this is from immediate-mode drawing, we always had one new vert (already in data_.)
if (isImmDraw_ && data_index_ >= 3) {
int wind = (data_index_ - 1) % 2;
int provoking_index = 2 - wind;
CullType altCullType = cullType == CullType::OFF ? cullType : CullType((int)cullType ^ wind);
SendTriangle(altCullType, &data_[0], provoking_index);
}
break;
}
@ -814,6 +804,47 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
}
}
void TransformUnit::SubmitImmVertex(const VertexData &vert, SoftwareDrawEngine *drawEngine) {
// Where we put it is different for STRIP/FAN types.
switch (prev_prim_) {
case GE_PRIM_POINTS:
case GE_PRIM_LINES:
case GE_PRIM_TRIANGLES:
case GE_PRIM_RECTANGLES:
// This is the easy one. SubmitPrimitive resets data_index_.
data_[data_index_++] = vert;
break;
case GE_PRIM_LINE_STRIP:
// This one alternates, and data_index_ > 0 means it draws a segment.
data_[(data_index_++) & 1] = vert;
break;
case GE_PRIM_TRIANGLE_STRIP:
data_[(data_index_++) % 3] = vert;
break;
case GE_PRIM_TRIANGLE_FAN:
if (data_index_ == 0) {
data_[data_index_++] = vert;
} else {
int provoking_index = 2 - ((data_index_++) % 2);
data_[provoking_index] = vert;
}
break;
default:
_assert_msg_(false, "Invalid prim type: %d", (int)prev_prim_);
break;
}
uint32_t vertTypeID = GetVertTypeID(gstate.vertType | GE_VTYPE_POS_FLOAT, gstate.getUVGenMode());
// This now processes the step with shared logic, given the existing data_.
isImmDraw_ = true;
SubmitPrimitive(nullptr, nullptr, GE_PRIM_KEEP_PREVIOUS, 0, vertTypeID, nullptr, drawEngine);
isImmDraw_ = false;
}
void TransformUnit::SendTriangle(CullType cullType, const VertexData *verts, int provoking) {
if (cullType == CullType::OFF) {
Clipper::ProcessTriangle(verts[0], verts[1], verts[2], verts[provoking], *binner_);
@ -826,8 +857,12 @@ void TransformUnit::SendTriangle(CullType cullType, const VertexData *verts, int
}
void TransformUnit::Flush(const char *reason) {
if (!hasDraws_)
return;
binner_->Flush(reason);
GPUDebug::NotifyDraw();
hasDraws_ = false;
}
void TransformUnit::GetStats(char *buffer, size_t bufsize) {
@ -836,6 +871,9 @@ void TransformUnit::GetStats(char *buffer, size_t bufsize) {
}
void TransformUnit::FlushIfOverlap(const char *reason, bool modifying, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h) {
if (!hasDraws_)
return;
if (binner_->HasPendingWrite(addr, stride, w, h))
Flush(reason);
if (modifying && binner_->HasPendingRead(addr, stride, w, h))

View File

@ -90,6 +90,10 @@ struct VertexData {
color1 = LerpInt<Vec3<int>, 256>(Vec3<int>::FromRGB(a.color1), Vec3<int>::FromRGB(b.color1), t_int).ToRGB();
}
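// Sentinel written by TransformUnit::ReadVertex() when rounding found the
// position out of range: screenpos.x gets parked at 0x7FFFFFFF.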
bool OutsideRange() const {
return screenpos.x == 0x7FFFFFFF;
}
ClipCoords clippos;
Vec2<float> texturecoords;
uint32_t color0;
@ -125,6 +129,7 @@ public:
static ScreenCoords DrawingToScreen(const DrawingCoords &coords, u16 z);
void SubmitPrimitive(const void* vertices, const void* indices, GEPrimitiveType prim_type, int vertex_count, u32 vertex_type, int *bytesRead, SoftwareDrawEngine *drawEngine);
void SubmitImmVertex(const VertexData &vert, SoftwareDrawEngine *drawEngine);
bool GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices);
@ -138,7 +143,7 @@ public:
SoftDirty GetDirty();
private:
VertexData ReadVertex(VertexReader &vreader, const TransformState &lstate, bool &outside_range_flag);
VertexData ReadVertex(VertexReader &vreader, const TransformState &state);
void SendTriangle(CullType cullType, const VertexData *verts, int provoking = 2);
u8 *decoded_ = nullptr;
@ -149,6 +154,8 @@ private:
// This is the index of the next vert in data (or higher, may need modulus.)
int data_index_ = 0;
GEPrimitiveType prev_prim_ = GE_PRIM_POINTS;
bool hasDraws_ = false;
bool isImmDraw_ = false;
};
class SoftwareDrawEngine : public DrawEngineCommon {
@ -158,7 +165,7 @@ public:
void DispatchFlush() override;
void DispatchSubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int cullMode, int *bytesRead) override;
void DispatchSubmitImm(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead) override;
void DispatchSubmitImm(GEPrimitiveType prim, TransformedVertex *buffer, int vertexCount, int cullMode, bool continuation) override;
VertexDecoder *FindVertexDecoder(u32 vtype);

View File

@ -71,6 +71,7 @@ enum {
DRAW_BINDING_TESS_STORAGE_BUF = 6,
DRAW_BINDING_TESS_STORAGE_BUF_WU = 7,
DRAW_BINDING_TESS_STORAGE_BUF_WV = 8,
DRAW_BINDING_INPUT_ATTACHMENT = 9,
};
enum {
@ -94,7 +95,10 @@ DrawEngineVulkan::DrawEngineVulkan(Draw::DrawContext *draw)
void DrawEngineVulkan::InitDeviceObjects() {
// All resources we need for PSP drawing. Usually only bindings 0 and 2-4 are populated.
VkDescriptorSetLayoutBinding bindings[9]{};
// TODO: Make things more flexible, so we at least have specialized layouts for input attachments and tess.
// Note that it becomes a support matrix..
VkDescriptorSetLayoutBinding bindings[10]{};
bindings[0].descriptorCount = 1;
bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
bindings[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
@ -132,6 +136,10 @@ void DrawEngineVulkan::InitDeviceObjects() {
bindings[8].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
bindings[8].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
bindings[8].binding = DRAW_BINDING_TESS_STORAGE_BUF_WV;
bindings[9].descriptorCount = 1;
bindings[9].descriptorType = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT;
bindings[9].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[9].binding = DRAW_BINDING_INPUT_ATTACHMENT;
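// An input attachment lets the fragment shader read the current pixel of the
// color buffer inside the render pass. This is the direct-read path for
// framebuffer read/blend effects, used instead of a copy-and-bind where supported.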
VulkanContext *vulkan = (VulkanContext *)draw_->GetNativeObject(Draw::NativeObject::CONTEXT);
VkDevice device = vulkan->GetDevice();
@ -145,13 +153,15 @@ void DrawEngineVulkan::InitDeviceObjects() {
static constexpr int DEFAULT_DESC_POOL_SIZE = 512;
std::vector<VkDescriptorPoolSize> dpTypes;
dpTypes.resize(3);
dpTypes.resize(4);
dpTypes[0].descriptorCount = DEFAULT_DESC_POOL_SIZE * 3;
dpTypes[0].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
dpTypes[1].descriptorCount = DEFAULT_DESC_POOL_SIZE * 3; // Don't use these for tess anymore, need max three per set.
dpTypes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
dpTypes[2].descriptorCount = DEFAULT_DESC_POOL_SIZE * 3; // TODO: Use a separate layout when no spline stuff is needed to reduce the need for these.
dpTypes[2].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
dpTypes[3].descriptorCount = DEFAULT_DESC_POOL_SIZE; // TODO: Use a separate layout when no spline stuff is needed to reduce the need for these.
dpTypes[3].type = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT;
VkDescriptorPoolCreateInfo dp{ VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
// Don't want to mess around with individually freeing these.
@ -379,6 +389,7 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView
key.base_ = base;
key.light_ = light;
key.bone_ = bone;
key.secondaryIsInputAttachment = boundSecondaryIsInputAttachment_;
FrameData &frame = GetCurFrame();
// See if we already have this descriptor set cached.
@ -417,15 +428,15 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView
}
if (boundSecondary_) {
tex[1].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
tex[1].imageLayout = key.secondaryIsInputAttachment ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
tex[1].imageView = boundSecondary_;
tex[1].sampler = samplerSecondaryNearest_;
writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
writes[n].pNext = nullptr;
writes[n].dstBinding = DRAW_BINDING_2ND_TEXTURE;
writes[n].dstBinding = key.secondaryIsInputAttachment ? DRAW_BINDING_INPUT_ATTACHMENT : DRAW_BINDING_2ND_TEXTURE;
writes[n].pImageInfo = &tex[1];
writes[n].descriptorCount = 1;
writes[n].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
writes[n].descriptorType = key.secondaryIsInputAttachment ? VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT : VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
writes[n].dstSet = desc;
n++;
}
@ -788,7 +799,7 @@ void DrawEngineVulkan::DoFlush() {
lastRenderStepId_ = curRenderStepId;
}
renderManager->BindPipeline(pipeline->pipeline, (PipelineFlags)pipeline->flags, pipelineLayout_);
renderManager->BindPipeline(pipeline->pipeline, pipeline->pipelineFlags, pipelineLayout_);
if (pipeline != lastPipeline_) {
if (lastPipeline_ && !(lastPipeline_->UsesBlendConstant() && pipeline->UsesBlendConstant())) {
gstate_c.Dirty(DIRTY_BLEND_STATE);
@ -916,7 +927,7 @@ void DrawEngineVulkan::DoFlush() {
lastRenderStepId_ = curRenderStepId;
}
renderManager->BindPipeline(pipeline->pipeline, (PipelineFlags)pipeline->flags, pipelineLayout_);
renderManager->BindPipeline(pipeline->pipeline, pipeline->pipelineFlags, pipelineLayout_);
if (pipeline != lastPipeline_) {
if (lastPipeline_ && !lastPipeline_->UsesBlendConstant() && pipeline->UsesBlendConstant()) {
gstate_c.Dirty(DIRTY_BLEND_STATE);

View File

@ -217,6 +217,8 @@ private:
// Secondary texture for shader blending
VkImageView boundSecondary_ = VK_NULL_HANDLE;
bool boundSecondaryIsInputAttachment_ = false;
// CLUT texture for shader depal
VkImageView boundDepal_ = VK_NULL_HANDLE;
bool boundDepalSmoothed_ = false;
@ -234,6 +236,7 @@ private:
VkSampler sampler_;
VkBuffer base_, light_, bone_; // All three UBO slots will be set to this. This will usually be identical
// for all draws in a frame, except when the buffer has to grow.
bool secondaryIsInputAttachment;
};
// We alternate between these.
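Adding the bool to DescriptorSetKey matters because a cached set that bound the secondary image as a sampled texture cannot be reused when the same image must instead be bound as an input attachment (different binding slot, descriptor type, and layout). If the cache compares keys bytewise (an assumption about the container), the new field participates automatically:
// Hypothetical comparator; relies on the key struct having no uninitialized padding.
bool KeyEquals(const DescriptorSetKey &a, const DescriptorSetKey &b) {
    return memcmp(&a, &b, sizeof(DescriptorSetKey)) == 0;
}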
@ -281,7 +284,7 @@ private:
VulkanDynamicState dynState_{};
int tessOffset_ = 0;
bool fboTexNeedsBind_ = false;
FBOTexState fboTexBindState_ = FBO_TEX_NONE;
// Hardware tessellation
TessellationDataTransferVulkan *tessDataTransferVulkan;

View File

@ -33,7 +33,7 @@ class VulkanPushBuffer;
class FramebufferManagerVulkan : public FramebufferManagerCommon {
public:
FramebufferManagerVulkan(Draw::DrawContext *draw);
explicit FramebufferManagerVulkan(Draw::DrawContext *draw);
~FramebufferManagerVulkan();
// If within a render pass, this will just issue a regular clear. If beginning a new render pass,

View File

@ -52,7 +52,7 @@
GPU_Vulkan::GPU_Vulkan(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
: GPUCommon(gfxCtx, draw), drawEngine_(draw) {
CheckGPUFeatures();
gstate_c.featureFlags = CheckGPUFeatures();
VulkanContext *vulkan = (VulkanContext *)gfxCtx->GetAPIContext();
@ -182,8 +182,8 @@ GPU_Vulkan::~GPU_Vulkan() {
delete framebufferManagerVulkan_;
}
void GPU_Vulkan::CheckGPUFeatures() {
uint32_t features = 0;
u32 GPU_Vulkan::CheckGPUFeatures() const {
uint32_t features = GPUCommon::CheckGPUFeatures();
VulkanContext *vulkan = (VulkanContext *)draw_->GetNativeObject(Draw::NativeObject::CONTEXT);
switch (vulkan->GetPhysicalDeviceProperties().properties.vendorID) {
@ -222,44 +222,14 @@ void GPU_Vulkan::CheckGPUFeatures() {
// Mandatory features on Vulkan, which may be checked in "centralized" code
features |= GPU_SUPPORTS_TEXTURE_LOD_CONTROL;
features |= GPU_SUPPORTS_BLEND_MINMAX;
features |= GPU_SUPPORTS_TEXTURE_NPOT;
features |= GPU_SUPPORTS_INSTANCE_RENDERING;
features |= GPU_SUPPORTS_VERTEX_TEXTURE_FETCH;
features |= GPU_SUPPORTS_TEXTURE_FLOAT;
features |= GPU_SUPPORTS_DEPTH_TEXTURE;
auto &enabledFeatures = vulkan->GetDeviceFeatures().enabled;
if (enabledFeatures.depthClamp) {
features |= GPU_SUPPORTS_DEPTH_CLAMP;
}
if (enabledFeatures.shaderClipDistance) {
features |= GPU_SUPPORTS_CLIP_DISTANCE;
}
if (enabledFeatures.shaderCullDistance) {
// Must support at least 8 if feature supported, so we're fine.
features |= GPU_SUPPORTS_CULL_DISTANCE;
}
if (!draw_->GetBugs().Has(Draw::Bugs::BROKEN_NAN_IN_CONDITIONAL)) {
// Ignore the compat setting if clip and cull are both enabled.
// When supported, we can do the depth side of range culling more correctly.
const bool supported = draw_->GetDeviceCaps().clipDistanceSupported && draw_->GetDeviceCaps().cullDistanceSupported;
const bool disabled = PSP_CoreParameter().compat.flags().DisableRangeCulling;
if (supported || !disabled) {
features |= GPU_SUPPORTS_VS_RANGE_CULLING;
}
}
if (enabledFeatures.dualSrcBlend) {
if (!g_Config.bVendorBugChecksEnabled || !draw_->GetBugs().Has(Draw::Bugs::DUAL_SOURCE_BLENDING_BROKEN)) {
features |= GPU_SUPPORTS_DUALSOURCE_BLEND;
}
}
if (draw_->GetDeviceCaps().logicOpSupported) {
features |= GPU_SUPPORTS_LOGIC_OP;
}
if (draw_->GetDeviceCaps().anisoSupported) {
features |= GPU_SUPPORTS_ANISOTROPY;
}
// These are VULKAN_4444_FORMAT and friends.
uint32_t fmt4444 = draw_->GetDataFormatSupport(Draw::DataFormat::B4G4R4A4_UNORM_PACK16);
@ -275,10 +245,6 @@ void GPU_Vulkan::CheckGPUFeatures() {
INFO_LOG(G3D, "Deficient texture format support: 4444: %d 1555: %d 565: %d", fmt4444, fmt1555, fmt565);
}
if (PSP_CoreParameter().compat.flags().ClearToRAM) {
features |= GPU_USE_CLEAR_RAM_HACK;
}
if (!g_Config.bHighQualityDepth && (features & GPU_SUPPORTS_ACCURATE_DEPTH) != 0) {
features |= GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT;
}
@ -290,7 +256,7 @@ void GPU_Vulkan::CheckGPUFeatures() {
features |= GPU_ROUND_DEPTH_TO_16BIT;
}
gstate_c.featureFlags = features;
return features;
}
void GPU_Vulkan::BeginHostFrame() {
@ -298,7 +264,7 @@ void GPU_Vulkan::BeginHostFrame() {
UpdateCmdInfo();
if (resized_) {
CheckGPUFeatures();
gstate_c.featureFlags = CheckGPUFeatures();
// In case the GPU changed.
BuildReportingInfo();
framebufferManager_->Resized();
@ -537,7 +503,7 @@ void GPU_Vulkan::DeviceRestore() {
GPUCommon::DeviceRestore();
InitDeviceObjects();
CheckGPUFeatures();
gstate_c.featureFlags = CheckGPUFeatures();
BuildReportingInfo();
UpdateCmdInfo();

View File

@ -38,7 +38,7 @@ public:
~GPU_Vulkan();
// This gets called on startup and when we get back from settings.
void CheckGPUFeatures() override;
u32 CheckGPUFeatures() const override;
bool IsReady() override;
void CancelReady() override;

View File

@ -170,7 +170,7 @@ static std::string CutFromMain(std::string str) {
}
static VulkanPipeline *CreateVulkanPipeline(VulkanRenderManager *renderManager, VkPipelineCache pipelineCache,
VkPipelineLayout layout, const VulkanPipelineRasterStateKey &key,
VkPipelineLayout layout, PipelineFlags pipelineFlags, const VulkanPipelineRasterStateKey &key,
const DecVtxFormat *decFmt, VulkanVertexShader *vs, VulkanFragmentShader *fs, bool useHwTransform, u32 variantBitmask) {
VulkanPipeline *vulkanPipeline = new VulkanPipeline();
VKRGraphicsPipelineDesc *desc = &vulkanPipeline->desc;
@ -299,14 +299,14 @@ static VulkanPipeline *CreateVulkanPipeline(VulkanRenderManager *renderManager,
VKRGraphicsPipeline *pipeline = renderManager->CreateGraphicsPipeline(desc, variantBitmask, "game");
vulkanPipeline->pipeline = pipeline;
vulkanPipeline->flags = 0;
if (useBlendConstant)
vulkanPipeline->flags |= PIPELINE_FLAG_USES_BLEND_CONSTANT;
pipelineFlags |= PipelineFlags::USES_BLEND_CONSTANT;
if (key.topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST || key.topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP)
vulkanPipeline->flags |= PIPELINE_FLAG_USES_LINES;
pipelineFlags |= PipelineFlags::USES_LINES;
if (dss.depthTestEnable || dss.stencilTestEnable) {
vulkanPipeline->flags |= PIPELINE_FLAG_USES_DEPTH_STENCIL;
pipelineFlags |= PipelineFlags::USES_DEPTH_STENCIL;
}
vulkanPipeline->pipelineFlags = pipelineFlags;
return vulkanPipeline;
}
@ -329,8 +329,13 @@ VulkanPipeline *PipelineManagerVulkan::GetOrCreatePipeline(VulkanRenderManager *
if (iter)
return iter;
PipelineFlags pipelineFlags = (PipelineFlags)0;
if (fs->Flags() & FragmentShaderFlags::INPUT_ATTACHMENT) {
pipelineFlags |= PipelineFlags::USES_INPUT_ATTACHMENT;
}
VulkanPipeline *pipeline = CreateVulkanPipeline(
renderManager, pipelineCache_, layout,
renderManager, pipelineCache_, layout, pipelineFlags,
rasterKey, decFmt, vs, fs, useHwTransform, variantBitmask);
pipelines_.Insert(key, pipeline);

View File

@ -55,11 +55,12 @@ struct VulkanPipelineKey {
struct VulkanPipeline {
VKRGraphicsPipeline *pipeline;
VKRGraphicsPipelineDesc desc;
int flags; // PipelineFlags enum above.
PipelineFlags pipelineFlags; // PipelineFlags enum above.
bool UsesBlendConstant() const { return (flags & PIPELINE_FLAG_USES_BLEND_CONSTANT) != 0; }
bool UsesLines() const { return (flags & PIPELINE_FLAG_USES_LINES) != 0; }
bool UsesDepthStencil() const { return (flags & PIPELINE_FLAG_USES_DEPTH_STENCIL) != 0; }
bool UsesBlendConstant() const { return (pipelineFlags & PipelineFlags::USES_BLEND_CONSTANT) != 0; }
bool UsesLines() const { return (pipelineFlags & PipelineFlags::USES_LINES) != 0; }
bool UsesDepthStencil() const { return (pipelineFlags & PipelineFlags::USES_DEPTH_STENCIL) != 0; }
bool UsesInputAttachment() const { return (pipelineFlags & PipelineFlags::USES_INPUT_ATTACHMENT) != 0; }
u32 GetVariantsBitmask() const;
};
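For the |= and & usage above to compile, PipelineFlags has presumably become a flags-style enum class with overloaded bitwise operators; a minimal sketch (the bit values are assumptions, and the repository may well generate the operators with a helper macro):
enum class PipelineFlags : u32 {
    NONE = 0,
    USES_BLEND_CONSTANT = (1 << 0),
    USES_LINES = (1 << 1),
    USES_DEPTH_STENCIL = (1 << 2),
    USES_INPUT_ATTACHMENT = (1 << 3),
};
inline PipelineFlags operator |(PipelineFlags a, PipelineFlags b) { return PipelineFlags((u32)a | (u32)b); }
inline PipelineFlags &operator |=(PipelineFlags &a, PipelineFlags b) { return a = a | b; }
inline bool operator &(PipelineFlags a, PipelineFlags b) { return ((u32)a & (u32)b) != 0; }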

View File

@ -153,7 +153,7 @@ void DrawEngineVulkan::ConvertStateToVulkanKey(FramebufferManagerVulkan &fbManag
GenericLogicState &logicState = pipelineState_.logicState;
if (pipelineState_.FramebufferRead()) {
ApplyFramebufferRead(&fboTexNeedsBind_);
ApplyFramebufferRead(&fboTexBindState_);
// The shader takes over the responsibility for blending, so recompute.
// We might still end up using blend to write something to alpha.
ApplyStencilReplaceAndLogicOpIgnoreBlend(blendState.replaceAlphaWithStencil, blendState);
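The changed call means ApplyFramebufferRead now reports how the framebuffer should be bound rather than just whether a bind is pending; its signature presumably changed along these lines (assumed):
// Before: void ApplyFramebufferRead(bool *fboTexNeedsBind);
void ApplyFramebufferRead(FBOTexState *fboTexState);
The resulting state is consumed in BindShaderBlendTex() below.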
@ -364,15 +364,23 @@ void DrawEngineVulkan::BindShaderBlendTex() {
// TODO: At this point, we know if the vertices are full alpha or not.
// Set the nearest/linear here (since we correctly know if alpha/color tests are needed)?
if (!gstate.isModeClear()) {
if (fboTexNeedsBind_) {
if (fboTexBindState_ == FBO_TEX_COPY_BIND_TEX) {
bool bindResult = framebufferManager_->BindFramebufferAsColorTexture(1, framebufferManager_->GetCurrentRenderVFB(), BINDFBCOLOR_MAY_COPY);
_dbg_assert_(bindResult);
boundSecondary_ = (VkImageView)draw_->GetNativeObject(Draw::NativeObject::BOUND_TEXTURE1_IMAGEVIEW);
boundSecondaryIsInputAttachment_ = false;
fboTexBound_ = true;
fboTexNeedsBind_ = false;
fboTexBindState_ = FBO_TEX_NONE;
// Must dirty blend state here so we re-copy next time. Example: Lunar's spell effects.
dirtyRequiresRecheck_ |= DIRTY_BLEND_STATE;
} else if (fboTexBindState_ == FBO_TEX_READ_FRAMEBUFFER) {
draw_->BindCurrentFramebufferForColorInput();
boundSecondary_ = (VkImageView)draw_->GetNativeObject(Draw::NativeObject::BOUND_FRAMEBUFFER_COLOR_IMAGEVIEW);
boundSecondaryIsInputAttachment_ = true;
fboTexBindState_ = FBO_TEX_NONE;
} else {
boundSecondary_ = VK_NULL_HANDLE;
}
}
}
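The three states driving this branch suggest an enum along these lines (only the constant names are certain from this diff; the comments are interpretation):
enum FBOTexState {
    FBO_TEX_NONE,              // no framebuffer self-read pending
    FBO_TEX_COPY_BIND_TEX,     // copy the render target, bind the copy as an ordinary texture
    FBO_TEX_READ_FRAMEBUFFER,  // bind the live render target as an input attachment
};
The copy path appears to remain as the fallback when the input-attachment route isn't taken.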

View File

@ -116,7 +116,7 @@ bool SDLVulkanGraphicsContext::Init(SDL_Window *&window, int x, int y, int mode,
return false;
}
draw_ = Draw::T3DCreateVulkanContext(vulkan_, false);
draw_ = Draw::T3DCreateVulkanContext(vulkan_);
SetGPUBackend(GPUBackend::VULKAN);
bool success = draw_->CreatePresets();
_assert_(success);

View File

@ -56,6 +56,8 @@ public:
parent->Add(scroll);
}
const char *tag() const override { return "ButtonShape"; }
private:
int *setting_;
};
@ -84,6 +86,8 @@ public:
parent->Add(scroll);
}
const char *tag() const override { return "ButtonIcon"; }
private:
int *setting_;
};

View File

@ -28,6 +28,8 @@ class ComboKeyScreen : public UIDialogScreenWithBackground {
public:
ComboKeyScreen(int id): id_(id) {}
const char *tag() const override { return "ComboKey"; }
void CreateViews() override;
void onFinish(DialogResult result) override;

View File

@ -313,7 +313,7 @@ UI::EventReturn ControlMappingScreen::OnVisualizeMapping(UI::EventParams &params
}
void ControlMappingScreen::dialogFinished(const Screen *dialog, DialogResult result) {
if (result == DR_OK && dialog->tag() == "listpopup") {
if (result == DR_OK && std::string(dialog->tag()) == "listpopup") {
ListPopupScreen *popup = (ListPopupScreen *)dialog;
KeyMap::AutoConfForPad(popup->GetChoiceString());
}
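Since tag() now returns const char * (see the ControlMappingScreen header change below), a bare == would compare pointers, not contents; wrapping one side in std::string forces a value comparison. An equivalent without the temporary:
if (result == DR_OK && !strcmp(dialog->tag(), "listpopup")) {  // strcmp from <cstring> compares contents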

View File

@ -35,7 +35,7 @@ class SingleControlMapper;
class ControlMappingScreen : public UIDialogScreenWithBackground {
public:
ControlMappingScreen() {}
std::string tag() const override { return "control mapping"; }
const char *tag() const override { return "ControlMapping"; }
protected:
void CreateViews() override;
@ -47,7 +47,7 @@ private:
UI::EventReturn OnAutoConfigure(UI::EventParams &params);
UI::EventReturn OnVisualizeMapping(UI::EventParams &params);
virtual void dialogFinished(const Screen *dialog, DialogResult result) override;
void dialogFinished(const Screen *dialog, DialogResult result) override;
UI::ScrollView *rightScroll_;
std::vector<SingleControlMapper *> mappers_;
@ -61,17 +61,19 @@ public:
pspBtn_ = btn;
}
virtual bool key(const KeyInput &key) override;
virtual bool axis(const AxisInput &axis) override;
const char *tag() const override { return "KeyMappingNewKey"; }
bool key(const KeyInput &key) override;
bool axis(const AxisInput &axis) override;
void SetDelay(float t);
protected:
void CreatePopupContents(UI::ViewGroup *parent) override;
virtual bool FillVertical() const override { return false; }
virtual bool ShowButtons() const override { return true; }
virtual void OnCompleted(DialogResult result) override {}
bool FillVertical() const override { return false; }
bool ShowButtons() const override { return true; }
void OnCompleted(DialogResult result) override {}
private:
int pspBtn_;
@ -87,6 +89,8 @@ public:
pspBtn_ = btn;
}
const char *tag() const override { return "KeyMappingNewMouseKey"; }
bool key(const KeyInput &key) override;
bool axis(const AxisInput &axis) override;
@ -114,6 +118,8 @@ public:
void update() override;
const char *tag() const override { return "AnalogSetup"; }
protected:
void CreateViews() override;
@ -144,6 +150,8 @@ public:
bool key(const KeyInput &key) override;
bool axis(const AxisInput &axis) override;
const char *tag() const override { return "TouchTest"; }
protected:
struct TrackedTouch {
int id;
@ -171,6 +179,8 @@ class VisualMappingScreen : public UIDialogScreenWithBackground {
public:
VisualMappingScreen() {}
const char *tag() const override { return "VisualMapping"; }
protected:
void CreateViews() override;

View File

@ -41,6 +41,8 @@ public:
void update() override;
void onFinish(DialogResult result) override;
const char *tag() const override { return "CwCheat"; }
protected:
void CreateViews() override;

View File

@ -85,7 +85,7 @@ static const char *logLevelList[] = {
"Verb."
};
void DevMenu::CreatePopupContents(UI::ViewGroup *parent) {
void DevMenuScreen::CreatePopupContents(UI::ViewGroup *parent) {
using namespace UI;
auto dev = GetI18NCategory("Developer");
auto sy = GetI18NCategory("System");
@ -94,25 +94,25 @@ void DevMenu::CreatePopupContents(UI::ViewGroup *parent) {
LinearLayout *items = new LinearLayout(ORIENT_VERTICAL);
#if !defined(MOBILE_DEVICE)
items->Add(new Choice(dev->T("Log View")))->OnClick.Handle(this, &DevMenu::OnLogView);
items->Add(new Choice(dev->T("Log View")))->OnClick.Handle(this, &DevMenuScreen::OnLogView);
#endif
items->Add(new Choice(dev->T("Logging Channels")))->OnClick.Handle(this, &DevMenu::OnLogConfig);
items->Add(new Choice(sy->T("Developer Tools")))->OnClick.Handle(this, &DevMenu::OnDeveloperTools);
items->Add(new Choice(dev->T("Jit Compare")))->OnClick.Handle(this, &DevMenu::OnJitCompare);
items->Add(new Choice(dev->T("Shader Viewer")))->OnClick.Handle(this, &DevMenu::OnShaderView);
items->Add(new Choice(dev->T("Logging Channels")))->OnClick.Handle(this, &DevMenuScreen::OnLogConfig);
items->Add(new Choice(sy->T("Developer Tools")))->OnClick.Handle(this, &DevMenuScreen::OnDeveloperTools);
items->Add(new Choice(dev->T("Jit Compare")))->OnClick.Handle(this, &DevMenuScreen::OnJitCompare);
items->Add(new Choice(dev->T("Shader Viewer")))->OnClick.Handle(this, &DevMenuScreen::OnShaderView);
if (g_Config.iGPUBackend == (int)GPUBackend::VULKAN) {
// TODO: Make a new allocator visualizer for VMA.
// items->Add(new CheckBox(&g_Config.bShowAllocatorDebug, dev->T("Allocator Viewer")));
items->Add(new CheckBox(&g_Config.bShowGpuProfile, dev->T("GPU Profile")));
}
items->Add(new Choice(dev->T("Toggle Freeze")))->OnClick.Handle(this, &DevMenu::OnFreezeFrame);
items->Add(new Choice(dev->T("Dump Frame GPU Commands")))->OnClick.Handle(this, &DevMenu::OnDumpFrame);
items->Add(new Choice(dev->T("Toggle Audio Debug")))->OnClick.Handle(this, &DevMenu::OnToggleAudioDebug);
items->Add(new Choice(dev->T("Toggle Freeze")))->OnClick.Handle(this, &DevMenuScreen::OnFreezeFrame);
items->Add(new Choice(dev->T("Dump Frame GPU Commands")))->OnClick.Handle(this, &DevMenuScreen::OnDumpFrame);
items->Add(new Choice(dev->T("Toggle Audio Debug")))->OnClick.Handle(this, &DevMenuScreen::OnToggleAudioDebug);
#ifdef USE_PROFILER
items->Add(new CheckBox(&g_Config.bShowFrameProfiler, dev->T("Frame Profiler"), ""));
#endif
items->Add(new CheckBox(&g_Config.bDrawFrameGraph, dev->T("Draw Frametimes Graph")));
items->Add(new Choice(dev->T("Reset limited logging")))->OnClick.Handle(this, &DevMenu::OnResetLimitedLogging);
items->Add(new Choice(dev->T("Reset limited logging")))->OnClick.Handle(this, &DevMenuScreen::OnResetLimitedLogging);
scroll->Add(items);
parent->Add(scroll);
@ -123,48 +123,48 @@ void DevMenu::CreatePopupContents(UI::ViewGroup *parent) {
}
}
UI::EventReturn DevMenu::OnToggleAudioDebug(UI::EventParams &e) {
UI::EventReturn DevMenuScreen::OnToggleAudioDebug(UI::EventParams &e) {
g_Config.bShowAudioDebug = !g_Config.bShowAudioDebug;
return UI::EVENT_DONE;
}
UI::EventReturn DevMenu::OnResetLimitedLogging(UI::EventParams &e) {
UI::EventReturn DevMenuScreen::OnResetLimitedLogging(UI::EventParams &e) {
Reporting::ResetCounts();
return UI::EVENT_DONE;
}
UI::EventReturn DevMenu::OnLogView(UI::EventParams &e) {
UI::EventReturn DevMenuScreen::OnLogView(UI::EventParams &e) {
UpdateUIState(UISTATE_PAUSEMENU);
screenManager()->push(new LogScreen());
return UI::EVENT_DONE;
}
UI::EventReturn DevMenu::OnLogConfig(UI::EventParams &e) {
UI::EventReturn DevMenuScreen::OnLogConfig(UI::EventParams &e) {
UpdateUIState(UISTATE_PAUSEMENU);
screenManager()->push(new LogConfigScreen());
return UI::EVENT_DONE;
}
UI::EventReturn DevMenu::OnDeveloperTools(UI::EventParams &e) {
UI::EventReturn DevMenuScreen::OnDeveloperTools(UI::EventParams &e) {
UpdateUIState(UISTATE_PAUSEMENU);
screenManager()->push(new DeveloperToolsScreen());
return UI::EVENT_DONE;
}
UI::EventReturn DevMenu::OnJitCompare(UI::EventParams &e) {
UI::EventReturn DevMenuScreen::OnJitCompare(UI::EventParams &e) {
UpdateUIState(UISTATE_PAUSEMENU);
screenManager()->push(new JitCompareScreen());
return UI::EVENT_DONE;
}
UI::EventReturn DevMenu::OnShaderView(UI::EventParams &e) {
UI::EventReturn DevMenuScreen::OnShaderView(UI::EventParams &e) {
UpdateUIState(UISTATE_PAUSEMENU);
if (gpu) // Avoid crashing if chosen while the game is being loaded.
screenManager()->push(new ShaderListScreen());
return UI::EVENT_DONE;
}
UI::EventReturn DevMenu::OnFreezeFrame(UI::EventParams &e) {
UI::EventReturn DevMenuScreen::OnFreezeFrame(UI::EventParams &e) {
if (PSP_CoreParameter().frozen) {
PSP_CoreParameter().frozen = false;
} else {
@ -173,12 +173,12 @@ UI::EventReturn DevMenu::OnFreezeFrame(UI::EventParams &e) {
return UI::EVENT_DONE;
}
UI::EventReturn DevMenu::OnDumpFrame(UI::EventParams &e) {
UI::EventReturn DevMenuScreen::OnDumpFrame(UI::EventParams &e) {
gpu->DumpNextFrame();
return UI::EVENT_DONE;
}
void DevMenu::dialogFinished(const Screen *dialog, DialogResult result) {
void DevMenuScreen::dialogFinished(const Screen *dialog, DialogResult result) {
UpdateUIState(UISTATE_INGAME);
// Close when a subscreen got closed.
// TODO: a bug in screenmanager causes this not to work here.
@ -514,7 +514,15 @@ void SystemInfoScreen::CreateViews() {
const std::string apiNameKey = draw->GetInfoString(InfoField::APINAME);
const char *apiName = gr->T(apiNameKey);
deviceSpecs->Add(new InfoItem(si->T("3D API"), apiName));
deviceSpecs->Add(new InfoItem(si->T("Vendor"), draw->GetInfoString(InfoField::VENDORSTRING)));
// TODO: Not really vendor, on most APIs it's a device name (GL calls it vendor though).
std::string vendorString;
if (draw->GetDeviceCaps().deviceID != 0) {
vendorString = StringFromFormat("%s (%08x)", draw->GetInfoString(InfoField::VENDORSTRING).c_str(), draw->GetDeviceCaps().deviceID);
} else {
vendorString = draw->GetInfoString(InfoField::VENDORSTRING);
}
deviceSpecs->Add(new InfoItem(si->T("Vendor"), vendorString));
std::string vendor = draw->GetInfoString(InfoField::VENDOR);
if (vendor.size())
deviceSpecs->Add(new InfoItem(si->T("Vendor (detected)"), vendor));

Some files were not shown because too many files have changed in this diff.