diff --git a/GPU/Common/FramebufferCommon.cpp b/GPU/Common/FramebufferCommon.cpp
index 94072a85b2..a54e58c61e 100644
--- a/GPU/Common/FramebufferCommon.cpp
+++ b/GPU/Common/FramebufferCommon.cpp
@@ -208,6 +208,7 @@ void FramebufferManagerCommon::SetNumExtraFBOs(int num) {
 		extraFBOs_.push_back(fbo);
 	}
 	currentRenderVfb_ = 0;
+	// TODO: Should probably not do this bind.
 	if (num != 0)
 		draw_->BindFramebufferAsRenderTarget(nullptr, { Draw::RPAction::KEEP, Draw::RPAction::KEEP, Draw::RPAction::KEEP });
 }
diff --git a/GPU/GLES/DrawEngineGLES.cpp b/GPU/GLES/DrawEngineGLES.cpp
index be77ad5362..d05489dd92 100644
--- a/GPU/GLES/DrawEngineGLES.cpp
+++ b/GPU/GLES/DrawEngineGLES.cpp
@@ -115,7 +115,8 @@ enum {
 
 enum { VAI_KILL_AGE = 120, VAI_UNRELIABLE_KILL_AGE = 240, VAI_UNRELIABLE_KILL_MAX = 4 };
 
-DrawEngineGLES::DrawEngineGLES(Draw::DrawContext *draw) : vai_(256), draw_(draw) {
+DrawEngineGLES::DrawEngineGLES(Draw::DrawContext *draw) : vai_(256), draw_(draw), inputLayoutMap_(16) {
+	render_ = (GLRenderManager *)draw_->GetNativeObject(Draw::NativeObject::RENDER_MANAGER);
 
 	decOptions_.expandAllWeightsToFloat = false;
 	decOptions_.expand8BitNormalsToFloat = false;
@@ -178,9 +179,29 @@ void DrawEngineGLES::InitDeviceObjects() {
 	} else {
 		ERROR_LOG(G3D, "Device objects already initialized!");
 	}
+
+	for (int i = 0; i < GLRenderManager::MAX_INFLIGHT_FRAMES; i++) {
+		frameData_[i].pushVertex = new GLPushBuffer(render_, 1024 * 1024);
+		frameData_[i].pushIndex = new GLPushBuffer(render_, 512 * 1024);
+	}
+
+	int vertexSize = sizeof(TransformedVertex);
+	std::vector<GLRInputLayout::Entry> entries;
+	entries.push_back({ ATTR_POSITION, 4, GL_FLOAT, GL_FALSE, vertexSize, 0 });
+	entries.push_back({ ATTR_TEXCOORD, 3, GL_FLOAT, GL_FALSE, vertexSize, offsetof(TransformedVertex, u) });
+	entries.push_back({ ATTR_COLOR0, 4, GL_UNSIGNED_BYTE, GL_TRUE, vertexSize, offsetof(TransformedVertex, color0) });
+	entries.push_back({ ATTR_COLOR1, 3, GL_UNSIGNED_BYTE, GL_TRUE, vertexSize, offsetof(TransformedVertex, color1) });
+	softwareInputLayout_ = render_->CreateInputLayout(entries);
 }
 
 void DrawEngineGLES::DestroyDeviceObjects() {
+	for (int i = 0; i < GLRenderManager::MAX_INFLIGHT_FRAMES; i++) {
+		frameData_[i].pushVertex->Destroy();
+		frameData_[i].pushIndex->Destroy();
+		delete frameData_[i].pushVertex;
+		delete frameData_[i].pushIndex;
+	}
+
 	ClearTrackedVertexArrays();
 	if (!bufferNameCache_.empty()) {
 		glstate.arrayBuffer.unbind();
@@ -194,6 +215,27 @@ void DrawEngineGLES::DestroyDeviceObjects() {
 			glDeleteVertexArrays(1, &sharedVao_);
 		}
 	}
+
+	render_->DeleteInputLayout(softwareInputLayout_);
+}
+
+void DrawEngineGLES::ClearInputLayoutMap() {
+	inputLayoutMap_.Iterate([&](const uint32_t &key, GLRInputLayout *il) {
+		render_->DeleteInputLayout(il);
+	});
+	inputLayoutMap_.Clear();
+}
+
+void DrawEngineGLES::BeginFrame() {
+	FrameData &frameData = frameData_[render_->GetCurFrame()];
+	frameData.pushIndex->Begin();
+	frameData.pushVertex->Begin();
+}
+
+void DrawEngineGLES::EndFrame() {
+	FrameData &frameData = frameData_[render_->GetCurFrame()];
+	frameData.pushIndex->End();
+	frameData.pushVertex->End();
 }
 
 struct GlTypeInfo {
@@ -220,24 +262,40 @@ static const GlTypeInfo GLComp[] = {
 	{GL_UNSIGNED_SHORT, 4, GL_TRUE},// 	DEC_U16_4,
 };
 
-static inline void VertexAttribSetup(int attrib, int fmt, int stride, u8 *ptr) {
+static inline void VertexAttribSetup(int attrib, int fmt, int stride, int offset, std::vector<GLRInputLayout::Entry> &entries) {
 	if (fmt) {
 		const GlTypeInfo &type = GLComp[fmt];
-		glVertexAttribPointer(attrib, type.count, type.type, type.normalized, stride, ptr);
+		GLRInputLayout::Entry entry;
+		entry.offset = offset;
+		entry.location = attrib;
+		entry.normalized = type.normalized;
+		entry.type = type.type;
+		entry.stride = stride;
+		entry.count = type.count;
+		entries.push_back(entry);
 	}
 }
 
 // TODO: Use VBO and get rid of the vertexData pointers - with that, we will supply only offsets
-static void SetupDecFmtForDraw(LinkedShader *program, const DecVtxFormat &decFmt, u8 *vertexData) {
-	CHECK_GL_ERROR_IF_DEBUG();
-	VertexAttribSetup(ATTR_W1, decFmt.w0fmt, decFmt.stride, vertexData + decFmt.w0off);
-	VertexAttribSetup(ATTR_W2, decFmt.w1fmt, decFmt.stride, vertexData + decFmt.w1off);
-	VertexAttribSetup(ATTR_TEXCOORD, decFmt.uvfmt, decFmt.stride, vertexData + decFmt.uvoff);
-	VertexAttribSetup(ATTR_COLOR0, decFmt.c0fmt, decFmt.stride, vertexData + decFmt.c0off);
-	VertexAttribSetup(ATTR_COLOR1, decFmt.c1fmt, decFmt.stride, vertexData + decFmt.c1off);
-	VertexAttribSetup(ATTR_NORMAL, decFmt.nrmfmt, decFmt.stride, vertexData + decFmt.nrmoff);
-	VertexAttribSetup(ATTR_POSITION, decFmt.posfmt, decFmt.stride, vertexData + decFmt.posoff);
-	CHECK_GL_ERROR_IF_DEBUG();
+GLRInputLayout *DrawEngineGLES::SetupDecFmtForDraw(LinkedShader *program, const DecVtxFormat &decFmt) {
+	uint32_t key = decFmt.id;
+	GLRInputLayout *inputLayout = inputLayoutMap_.Get(key);
+	if (inputLayout) {
+		return inputLayout;
+	}
+
+	std::vector<GLRInputLayout::Entry> entries;
+	VertexAttribSetup(ATTR_W1, decFmt.w0fmt, decFmt.stride, decFmt.w0off, entries);
+	VertexAttribSetup(ATTR_W2, decFmt.w1fmt, decFmt.stride, decFmt.w1off, entries);
+	VertexAttribSetup(ATTR_TEXCOORD, decFmt.uvfmt, decFmt.stride, decFmt.uvoff, entries);
+	VertexAttribSetup(ATTR_COLOR0, decFmt.c0fmt, decFmt.stride, decFmt.c0off, entries);
+	VertexAttribSetup(ATTR_COLOR1, decFmt.c1fmt, decFmt.stride, decFmt.c1off, entries);
+	VertexAttribSetup(ATTR_NORMAL, decFmt.nrmfmt, decFmt.stride, decFmt.nrmoff, entries);
+	VertexAttribSetup(ATTR_POSITION, decFmt.posfmt, decFmt.stride, decFmt.posoff, entries);
+
+	inputLayout = render_->CreateInputLayout(entries);
+	inputLayoutMap_.Insert(key, inputLayout);
+	return inputLayout;
 }
 
 void DrawEngineGLES::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
@@ -304,6 +362,17 @@ void DrawEngineGLES::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, i
 	}
 }
 
+void DrawEngineGLES::DecodeVertsToPushBuffer(GLPushBuffer *push, uint32_t *bindOffset, GLRBuffer **buf) {
+	u8 *dest = decoded;  // Fallback destination when no push buffer is supplied.
+	// Decodes the pending vertices via DecodeVerts(), directly into memory obtained from 'push' when it is non-null.
+	// Figure out how much pushbuffer space we need to allocate.
+	if (push) {
+		int vertsToDecode = ComputeNumVertsToDecode();
+		dest = (u8 *)push->Push(vertsToDecode * dec_->GetDecVtxFmt().stride, bindOffset, buf);
+	}
+	DecodeVerts(dest);
+}
+
 void DrawEngineGLES::MarkUnreliable(VertexArrayInfo *vai) {
 	vai->status = VertexArrayInfo::VAI_UNRELIABLE;
 	if (vai->vbo) {
@@ -430,18 +499,23 @@ void DrawEngineGLES::FreeVertexArray(VertexArrayInfo *vai) {
 
 void DrawEngineGLES::DoFlush() {
 	PROFILE_THIS_SCOPE("flush");
-	CHECK_GL_ERROR_IF_DEBUG();
 
+	FrameData &frameData = frameData_[render_->GetCurFrame()];
+	
 	gpuStats.numFlushes++;
 	gpuStats.numTrackedVertexArrays = (int)vai_.size();
 
 	GEPrimitiveType prim = prevPrim_;
 	ApplyDrawState(prim);
-	CHECK_GL_ERROR_IF_DEBUG();
 
 	VShaderID vsid;
 	Shader *vshader = shaderManager_->ApplyVertexShader(prim, lastVType_, &vsid);
 
+	GLRBuffer *vertexBuffer = nullptr;
+	GLRBuffer *indexBuffer = nullptr;
+	uint32_t vertexBufferOffset = 0;
+	uint32_t indexBufferOffset = 0;
+
 	if (vshader->UseHWTransform()) {
 		GLuint vbo = 0, ebo = 0;
 		int vertexCount = 0;
@@ -453,6 +527,9 @@ void DrawEngineGLES::DoFlush() {
 		if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK))
 			useCache = false;
 
+		// TEMPORARY
+		useCache = false;
+
 		if (useCache) {
 			u32 id = dcid_ ^ gstate.getUVGenMode();  // This can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263
 			VertexArrayInfo *vai = vai_.Get(id);
@@ -597,7 +674,7 @@ void DrawEngineGLES::DoFlush() {
 
 			vai->lastFrame = gpuStats.numFlips;
 		} else {
-			DecodeVerts(decoded);
+			DecodeVertsToPushBuffer(frameData.pushVertex, &vertexBufferOffset, &vertexBuffer);
 
 rotateVBO:
 			gpuStats.numUncachedVertsDrawn += indexGen.VertexCount();
@@ -606,9 +683,6 @@ rotateVBO:
 			if (!useElements && indexGen.PureCount()) {
 				vertexCount = indexGen.PureCount();
 			}
-			glstate.arrayBuffer.unbind();
-			glstate.elementArrayBuffer.unbind();
-
 			prim = indexGen.Prim();
 		}
 
@@ -630,16 +704,21 @@ rotateVBO:
 		}
 
 		LinkedShader *program = shaderManager_->ApplyFragmentShader(vsid, vshader, lastVType_, prim);
-		SetupDecFmtForDraw(program, dec_->GetDecVtxFmt(), vbo ? 0 : decoded);
-
+		GLRInputLayout *inputLayout = SetupDecFmtForDraw(program, dec_->GetDecVtxFmt());
+		render_->BindVertexBuffer(vertexBuffer);
+		render_->BindInputLayout(inputLayout, (void *)(uintptr_t)vertexBufferOffset);
 		if (useElements) {
+			if (!indexBuffer) {
+				indexBufferOffset = (uint32_t)frameData.pushIndex->Push(decIndex, sizeof(uint16_t) * indexGen.VertexCount(), &indexBuffer);
+				render_->BindIndexBuffer(indexBuffer);
+			}
 			if (gstate_c.bezier || gstate_c.spline)
 				// Instanced rendering for instanced tessellation
-				glDrawElementsInstanced(glprim[prim], vertexCount, GL_UNSIGNED_SHORT, ebo ? 0 : (GLvoid*)decIndex, numPatches);
+				; // glDrawElementsInstanced(glprim[prim], vertexCount, GL_UNSIGNED_SHORT, (GLvoid*)(intptr_t)indexBufferOffset, numPatches);
 			else
-				glDrawElements(glprim[prim], vertexCount, GL_UNSIGNED_SHORT, ebo ? 0 : (GLvoid*)decIndex);
+				render_->DrawIndexed(glprim[prim], vertexCount, GL_UNSIGNED_SHORT, (GLvoid*)(intptr_t)indexBufferOffset);
 		} else {
-			glDrawArrays(glprim[prim], 0, vertexCount);
+			render_->Draw(glprim[prim], 0, vertexCount);
 		}
 	} else {
 		DecodeVerts(decoded);
@@ -695,28 +774,18 @@ rotateVBO:
 
 			bool doTextureProjection = gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_MATRIX;
 
-			const uint8_t *bufferStart = (const uint8_t *)drawBuffer;
-			if (gstate_c.Supports(GPU_SUPPORTS_VAO)) {
-				bufferStart = 0;
-				BindBuffer(drawBuffer, vertexSize * maxIndex);
-				if (drawIndexed) {
-					BindElementBuffer(inds, sizeof(short) * numTrans);
-					inds = 0;
-				}
-			} else {
-				glstate.arrayBuffer.unbind();
-				glstate.elementArrayBuffer.unbind();
-			}
-
-			glVertexAttribPointer(ATTR_POSITION, 4, GL_FLOAT, GL_FALSE, vertexSize, bufferStart);
-			int attrMask = program->attrMask;
-			if (attrMask & (1 << ATTR_TEXCOORD)) glVertexAttribPointer(ATTR_TEXCOORD, doTextureProjection ? 3 : 2, GL_FLOAT, GL_FALSE, vertexSize, bufferStart + offsetof(TransformedVertex, u));
-			if (attrMask & (1 << ATTR_COLOR0)) glVertexAttribPointer(ATTR_COLOR0, 4, GL_UNSIGNED_BYTE, GL_TRUE, vertexSize, bufferStart + offsetof(TransformedVertex, color0));
-			if (attrMask & (1 << ATTR_COLOR1)) glVertexAttribPointer(ATTR_COLOR1, 3, GL_UNSIGNED_BYTE, GL_TRUE, vertexSize, bufferStart + offsetof(TransformedVertex, color1));
 			if (drawIndexed) {
-				glDrawElements(glprim[prim], numTrans, GL_UNSIGNED_SHORT, inds);
+				vertexBufferOffset = (uint32_t)frameData.pushVertex->Push(drawBuffer, maxIndex * sizeof(TransformedVertex), &vertexBuffer);
+				indexBufferOffset = (uint32_t)frameData.pushIndex->Push(inds, sizeof(uint16_t) * numTrans, &indexBuffer);  // Upload the indices actually drawn (inds/numTrans), as the old BindElementBuffer path did.
+				render_->BindIndexBuffer(indexBuffer);
+				render_->BindVertexBuffer(vertexBuffer);
+				render_->BindInputLayout(softwareInputLayout_, (void *)(intptr_t)vertexBufferOffset);
+				render_->DrawIndexed(glprim[prim], numTrans, GL_UNSIGNED_SHORT, (GLvoid *)(intptr_t)indexBufferOffset);  // With an index buffer bound this is a byte offset, not a CPU pointer.
 			} else {
-				glDrawArrays(glprim[prim], 0, numTrans);
+				vertexBufferOffset = (uint32_t)frameData.pushVertex->Push(drawBuffer, numTrans * sizeof(TransformedVertex), &vertexBuffer);
+				render_->BindVertexBuffer(vertexBuffer);
+				render_->BindInputLayout(softwareInputLayout_, (void *)(intptr_t)vertexBufferOffset);
+				render_->Draw(glprim[prim], 0, numTrans);
 			}
 		} else if (result.action == SW_CLEAR) {
 			u32 clearColor = result.color;
@@ -735,26 +804,12 @@ rotateVBO:
 				framebufferManager_->SetDepthUpdated();
 			}
 
-			// Note that scissor may still apply while clearing.  Turn off other tests for the clear.
-			glstate.stencilTest.disable();
-			glstate.stencilMask.set(0xFF);
-			glstate.depthTest.disable();
-
 			GLbitfield target = 0;
 			if (colorMask || alphaMask) target |= GL_COLOR_BUFFER_BIT;
 			if (alphaMask) target |= GL_STENCIL_BUFFER_BIT;
 			if (depthMask) target |= GL_DEPTH_BUFFER_BIT;
 
-			glstate.colorMask.set(colorMask, colorMask, colorMask, alphaMask);
-			glClearColor(col[0], col[1], col[2], col[3]);
-#ifdef USING_GLES2
-			glClearDepthf(clearDepth);
-#else
-			glClearDepth(clearDepth);
-#endif
-			// Stencil takes alpha.
-			glClearStencil(clearColor >> 24);
-			glClear(target);
+			render_->Clear(clearColor, clearDepth, clearColor >> 24, target);
 			framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason);
 
 			int scissorX1 = gstate.getScissorX1();
@@ -766,6 +821,7 @@ rotateVBO:
 			if (g_Config.bBlockTransferGPU && (gstate_c.featureFlags & GPU_USE_CLEAR_RAM_HACK) && colorMask && (alphaMask || gstate.FrameBufFormat() == GE_FORMAT_565)) {
 				framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
 			}
+			gstate_c.Dirty(DIRTY_BLEND_STATE);  // Make sure the color mask gets re-applied.
 		}
 	}
 
diff --git a/GPU/GLES/DrawEngineGLES.h b/GPU/GLES/DrawEngineGLES.h
index f50b0ffb35..ce2b51d025 100644
--- a/GPU/GLES/DrawEngineGLES.h
+++ b/GPU/GLES/DrawEngineGLES.h
@@ -28,6 +28,7 @@
 #include "GPU/Common/GPUStateUtils.h"
 #include "GPU/GLES/FragmentShaderGeneratorGLES.h"
 #include "gfx/gl_common.h"
+#include "thin3d/GLRenderManager.h"
 
 class LinkedShader;
 class ShaderManagerGLES;
@@ -126,6 +127,10 @@ public:
 	void ClearTrackedVertexArrays() override;
 	void DecimateTrackedVertexArrays();
 
+	void BeginFrame();
+	void EndFrame();
+
+
 	// So that this can be inlined
 	void Flush() {
 		if (!numDrawCalls)
@@ -151,6 +156,8 @@ public:
 	GLuint BindElementBuffer(const void *p, size_t sz);
 	void DecimateBuffers();
 
+	void ClearInputLayoutMap();
+
 private:
 	void InitDeviceObjects();
 	void DestroyDeviceObjects();
@@ -160,14 +167,28 @@ private:
 	void ApplyDrawStateLate();
 	void ResetShaderBlending();
 
+	GLRInputLayout *SetupDecFmtForDraw(LinkedShader *program, const DecVtxFormat &decFmt);
+
+	void DecodeVertsToPushBuffer(GLPushBuffer *push, uint32_t *bindOffset, GLRBuffer **buf);
+
 	GLuint AllocateBuffer(size_t sz);
 	void FreeBuffer(GLuint buf);
 	void FreeVertexArray(VertexArrayInfo *vai);
 
 	void MarkUnreliable(VertexArrayInfo *vai);
 
+	struct FrameData {
+		GLPushBuffer *pushVertex;
+		GLPushBuffer *pushIndex;
+	};
+	FrameData frameData_[GLRenderManager::MAX_INFLIGHT_FRAMES];
+
 	PrehashMap<VertexArrayInfo *, nullptr> vai_;
 
+	DenseHashMap<uint32_t, GLRInputLayout *, nullptr> inputLayoutMap_;
+
+	GLRInputLayout *softwareInputLayout_ = nullptr;
+
 	// Vertex buffer objects
 	// Element buffer objects
 	struct BufferNameInfo {
@@ -177,6 +198,7 @@ private:
 		bool used;
 		int lastFrame;
 	};
+	GLRenderManager *render_;
 	std::vector<GLuint> bufferNameCache_;
 	std::multimap<size_t, GLuint> freeSizedBuffers_;
 	std::unordered_map<GLuint, BufferNameInfo> bufferNameInfo_;
diff --git a/GPU/GLES/FramebufferManagerGLES.cpp b/GPU/GLES/FramebufferManagerGLES.cpp
index 20eb4aaf8c..bcda525e8a 100644
--- a/GPU/GLES/FramebufferManagerGLES.cpp
+++ b/GPU/GLES/FramebufferManagerGLES.cpp
@@ -82,6 +82,7 @@ const int MAX_PBO = 2;
 void ConvertFromRGBA8888(u8 *dst, const u8 *src, u32 dstStride, u32 srcStride, u32 width, u32 height, GEBufferFormat format);
 
 void FramebufferManagerGLES::DisableState() {
+	/*
 	glstate.blend.disable();
 	glstate.cullFace.disable();
 	glstate.depthTest.disable();
@@ -92,7 +93,7 @@ void FramebufferManagerGLES::DisableState() {
 #endif
 	glstate.colorMask.set(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
 	glstate.stencilMask.set(0xFF);
-
+	*/
 	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE);
 }
 
@@ -247,6 +248,7 @@ FramebufferManagerGLES::FramebufferManagerGLES(Draw::DrawContext *draw) :
 	needBackBufferYSwap_ = true;
 	needGLESRebinds_ = true;
 	CreateDeviceObjects();
+	render_ = (GLRenderManager *)draw_->GetNativeObject(Draw::NativeObject::RENDER_MANAGER);
 }
 
 void FramebufferManagerGLES::Init() {
diff --git a/GPU/GLES/FramebufferManagerGLES.h b/GPU/GLES/FramebufferManagerGLES.h
index 94c87ae982..d7686a0c96 100644
--- a/GPU/GLES/FramebufferManagerGLES.h
+++ b/GPU/GLES/FramebufferManagerGLES.h
@@ -29,6 +29,7 @@
 #include "Core/Config.h"
 #include "GPU/GPUCommon.h"
 #include "GPU/Common/FramebufferCommon.h"
+#include "thin3d/GLRenderManager.h"
 
 struct GLSLProgram;
 class TextureCacheGLES;
@@ -110,6 +111,8 @@ private:
 	void PackFramebufferSync_(VirtualFramebuffer *vfb, int x, int y, int w, int h) override;
 	void PackDepthbuffer(VirtualFramebuffer *vfb, int x, int y, int w, int h);
 
+	GLRenderManager *render_;
+
 	// Used by DrawPixels
 	unsigned int drawPixelsTex_;
 	GEBufferFormat drawPixelsTexFormat_;
diff --git a/GPU/GLES/GPU_GLES.cpp b/GPU/GLES/GPU_GLES.cpp
index 78528e7a19..beb5fbbfc4 100644
--- a/GPU/GLES/GPU_GLES.cpp
+++ b/GPU/GLES/GPU_GLES.cpp
@@ -403,6 +403,10 @@ void GPU_GLES::BuildReportingInfo() {
 void GPU_GLES::DeviceLost() {
 	ILOG("GPU_GLES: DeviceLost");
 
+	// Simply drop all caches and textures.
+	// FBOs appear to survive? Or no?
+	// TransformDraw has registered as a GfxResourceHolder.
+	drawEngine_.ClearInputLayoutMap();
 	shaderManagerGL_->ClearCache(false);
 	textureCacheGL_->Clear(false);
 	fragmentTestCache_.Clear(false);
@@ -451,6 +455,12 @@ void GPU_GLES::BeginHostFrame() {
 		shaderManagerGL_->DirtyShader();
 		textureCacheGL_->NotifyConfigChanged();
 	}
+
+	drawEngine_.BeginFrame();
+}
+
+void GPU_GLES::EndHostFrame() {
+	drawEngine_.EndFrame();
 }
 
 inline void GPU_GLES::UpdateVsyncInterval(bool force) {
diff --git a/GPU/GLES/GPU_GLES.h b/GPU/GLES/GPU_GLES.h
index f99fe6d3e3..4378f166a7 100644
--- a/GPU/GLES/GPU_GLES.h
+++ b/GPU/GLES/GPU_GLES.h
@@ -80,6 +80,7 @@ public:
 	std::string DebugGetShaderString(std::string id, DebugShaderType shader, DebugShaderStringType stringType) override;
 
 	void BeginHostFrame() override;
+	void EndHostFrame() override;
 
 protected:
 	void FastRunLoop(DisplayList &list) override;
diff --git a/GPU/GLES/ShaderManagerGLES.cpp b/GPU/GLES/ShaderManagerGLES.cpp
index b2a54a3d42..08b9072feb 100644
--- a/GPU/GLES/ShaderManagerGLES.cpp
+++ b/GPU/GLES/ShaderManagerGLES.cpp
@@ -166,21 +166,15 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
 	attrMask = vs->GetAttrMask();
 	availableUniforms = vs->GetUniformMask() | fs->GetUniformMask();
 
-	program = render->CreateProgram(shaders, semantics, queries, gstate_c.featureFlags & GPU_SUPPORTS_DUALSOURCE_BLEND);
+	std::vector<GLRProgram::Initializer> initialize;  // Entries are { uniform, type, value }; type 0 is applied with glUniform1i.
+	initialize.push_back({ &u_tex, 0, 0 });
+	initialize.push_back({ &u_fbotex, 0, 1 });
+	initialize.push_back({ &u_testtex, 0, 2 });
+	initialize.push_back({ &u_tess_pos_tex, 0, 4 }); // Texture unit 4
+	initialize.push_back({ &u_tess_tex_tex, 0, 5 }); // Texture unit 5
+	initialize.push_back({ &u_tess_col_tex, 0, 6 }); // Texture unit 6
 
-	render->BindProgram(program);
-
-	// Default uniform values
-	render->SetUniformI1(&u_tex, 0);
-	render->SetUniformI1(&u_fbotex, 1);
-	render->SetUniformI1(&u_fbotex, 2);
-	
-	if (u_tess_pos_tex != -1)
-		render->SetUniformI1(&u_tess_pos_tex, 4); // Texture unit 4
-	if (u_tess_tex_tex != -1)
-		render->SetUniformI1(&u_tess_tex_tex, 5); // Texture unit 5
-	if (u_tess_col_tex != -1)
-		render->SetUniformI1(&u_tess_col_tex, 6); // Texture unit 6
+	program = render->CreateProgram(shaders, semantics, queries, initialize, gstate_c.featureFlags & GPU_SUPPORTS_DUALSOURCE_BLEND);
 
 	// The rest, use the "dirty" mechanism.
 	dirtyUniforms = DIRTY_ALL_UNIFORMS;
diff --git a/GPU/GLES/StateMappingGLES.cpp b/GPU/GLES/StateMappingGLES.cpp
index f810087770..54acebcb1c 100644
--- a/GPU/GLES/StateMappingGLES.cpp
+++ b/GPU/GLES/StateMappingGLES.cpp
@@ -230,10 +230,14 @@ void DrawEngineGLES::ApplyDrawState(int prim) {
 			}
 #endif
 			int mask = (int)rmask | ((int)gmask << 1) | ((int)bmask << 2) | ((int)amask << 3);
-			renderManager->SetBlendAndMask(mask, blendState.enabled,
-				glBlendFactorLookup[(size_t)blendState.srcColor], glBlendFactorLookup[(size_t)blendState.dstColor],
-				glBlendFactorLookup[(size_t)blendState.srcAlpha], glBlendFactorLookup[(size_t)blendState.dstAlpha],
-				glBlendEqLookup[(size_t)blendState.eqColor], glBlendEqLookup[(size_t)blendState.eqAlpha]);
+			if (blendState.enabled) {
+				renderManager->SetBlendAndMask(mask, blendState.enabled,
+					glBlendFactorLookup[(size_t)blendState.srcColor], glBlendFactorLookup[(size_t)blendState.dstColor],
+					glBlendFactorLookup[(size_t)blendState.srcAlpha], glBlendFactorLookup[(size_t)blendState.dstAlpha],
+					glBlendEqLookup[(size_t)blendState.eqColor], glBlendEqLookup[(size_t)blendState.eqAlpha]);
+			} else {
+				renderManager->SetNoBlendAndMask(mask);
+			}
 
 #ifndef USING_GLES2
 			if (gstate_c.Supports(GPU_SUPPORTS_LOGIC_OP)) {
@@ -288,9 +292,13 @@ void DrawEngineGLES::ApplyDrawState(int prim) {
 			GenericStencilFuncState stencilState;
 			ConvertStencilFuncState(stencilState);
 			// Stencil Test
-			renderManager->SetStencil(stencilState.enabled, compareOps[stencilState.testFunc],
-				stencilOps[stencilState.sFail], stencilOps[stencilState.zFail], stencilOps[stencilState.zPass],
-				stencilState.writeMask, stencilState.testMask, stencilState.testRef);
+			if (stencilState.enabled) {
+				renderManager->SetStencil(stencilState.enabled, compareOps[stencilState.testFunc],
+					stencilOps[stencilState.sFail], stencilOps[stencilState.zFail], stencilOps[stencilState.zPass],
+					stencilState.writeMask, stencilState.testMask, stencilState.testRef);
+			} else {
+				renderManager->SetStencilDisabled();
+			}
 		}
 	}
 
diff --git a/ext/native/thin3d/GLQueueRunner.cpp b/ext/native/thin3d/GLQueueRunner.cpp
index c186ea065d..0ddc148c0d 100644
--- a/ext/native/thin3d/GLQueueRunner.cpp
+++ b/ext/native/thin3d/GLQueueRunner.cpp
@@ -70,7 +70,6 @@ void GLQueueRunner::RunInitSteps(const std::vector<GLRInitStep> &steps) {
 				}
 			}
 #endif
-
 			glLinkProgram(program->program);
 
 			GLint linkStatus = GL_FALSE;
@@ -111,8 +110,17 @@ void GLQueueRunner::RunInitSteps(const std::vector<GLRInitStep> &steps) {
 				*x.dest = glGetUniformLocation(program->program, x.name);
 			}
 
-			// Here we could (using glGetAttribLocation) save a bitmask about which pieces of vertex data are used in the shader
-			// and then AND it with the vertex format bitmask later...
+			// Run initializers. NOTE(review): glUniform1i targets the program currently in use - confirm program->program is bound at this point.
+			for (const auto &init : program->initialize_) {
+				const GLint uniform = *init.uniform;
+				if (uniform == -1)
+					continue;  // Location query found nothing; skip.
+				switch (init.type) {
+				case 0:  // Integer uniform, e.g. a sampler's texture unit.
+					glUniform1i(uniform, init.value);
+					break;
+				}
+			}
 		}
 			break;
 		case GLRInitStepType::CREATE_SHADER:
@@ -240,8 +248,11 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step) {
 			} else {
 				glDisable(GL_BLEND);
 			}
+			glColorMask(c.blend.mask & 1, (c.blend.mask >> 1) & 1, (c.blend.mask >> 2) & 1, (c.blend.mask >> 3) & 1);
 			break;
 		case GLRRenderCommand::CLEAR:
+			glDisable(GL_SCISSOR_TEST);
+			glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
 			if (c.clear.clearMask & GL_COLOR_BUFFER_BIT) {
 				float color[4];
 				Uint8x4ToFloat4(color, c.clear.clearColor);
@@ -258,6 +269,7 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step) {
 				glClearStencil(c.clear.clearStencil);
 			}
 			glClear(c.clear.clearMask);
+			glEnable(GL_SCISSOR_TEST);
 			break;
 		case GLRRenderCommand::BLENDCOLOR:
 			glBlendColor(c.blendColor.color[0], c.blendColor.color[1], c.blendColor.color[2], c.blendColor.color[3]);
@@ -376,6 +388,12 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step) {
 			glBindBuffer(GL_ARRAY_BUFFER, buf);
 			break;
 		}
+		case GLRRenderCommand::BIND_INDEX_BUFFER:
+		{
+			GLuint buf = c.bind_buffer.buffer ? c.bind_buffer.buffer->buffer : 0;
+			glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buf);
+			break;
+		}
 		case GLRRenderCommand::GENMIPS:
 			glGenerateMipmap(GL_TEXTURE_2D);
 			break;
diff --git a/ext/native/thin3d/GLQueueRunner.h b/ext/native/thin3d/GLQueueRunner.h
index f3d5c7df17..810027d461 100644
--- a/ext/native/thin3d/GLQueueRunner.h
+++ b/ext/native/thin3d/GLQueueRunner.h
@@ -44,6 +44,7 @@ enum class GLRRenderCommand : uint8_t {
 	BIND_FB_TEXTURE,
 	BIND_INPUT_LAYOUT,
 	BIND_VERTEX_BUFFER,
+	BIND_INDEX_BUFFER,
 	GENMIPS,
 	DRAW,
 	DRAW_INDEXED,
diff --git a/ext/native/thin3d/GLRenderManager.cpp b/ext/native/thin3d/GLRenderManager.cpp
index 226aed1458..1c05cc4577 100644
--- a/ext/native/thin3d/GLRenderManager.cpp
+++ b/ext/native/thin3d/GLRenderManager.cpp
@@ -352,6 +352,14 @@ GLPushBuffer::~GLPushBuffer() {
 	assert(buffers_.empty());
 }
 
+void GLPushBuffer::Map() {
+	assert(!writePtr_);
+	// TODO: Even a good old glMapBuffer could actually work well here.
+	// VkResult res = vkMapMemory(device_, buffers_[buf_].deviceMemory, 0, size_, 0, (void **)(&writePtr_));
+	writePtr_ = buffers_[buf_].deviceMemory;
+	assert(writePtr_);
+}
+
 void GLPushBuffer::Unmap() {
 	assert(writePtr_);
 	// Here we should simply upload everything to the buffers.
diff --git a/ext/native/thin3d/GLRenderManager.h b/ext/native/thin3d/GLRenderManager.h
index cd77d12152..a3ee244b9f 100644
--- a/ext/native/thin3d/GLRenderManager.h
+++ b/ext/native/thin3d/GLRenderManager.h
@@ -89,9 +89,16 @@ public:
 		const char *name;
 	};
 
+	struct Initializer {
+		GLint *uniform;
+		int type;
+		int value;
+	};
+
 	GLuint program = 0;
 	std::vector<Semantic> semantics_;
 	std::vector<UniformLocQuery> queries_;
+	std::vector<Initializer> initialize_;
 
 	struct UniformInfo {
 		int loc_;
@@ -222,12 +229,15 @@ public:
 		return step.create_shader.shader;
 	}
 
-	GLRProgram *CreateProgram(std::vector<GLRShader *> shaders, std::vector<GLRProgram::Semantic> semantics, std::vector<GLRProgram::UniformLocQuery> queries, bool supportDualSource) {
+	GLRProgram *CreateProgram(
+		std::vector<GLRShader *> shaders, std::vector<GLRProgram::Semantic> semantics, std::vector<GLRProgram::UniformLocQuery> queries,
+		std::vector<GLRProgram::Initializer> initalizers, bool supportDualSource) {
 		GLRInitStep step{ GLRInitStepType::CREATE_PROGRAM };
 		assert(shaders.size() <= ARRAY_SIZE(step.create_program.shaders));
 		step.create_program.program = new GLRProgram();
 		step.create_program.program->semantics_ = semantics;
 		step.create_program.program->queries_ = queries;
+		step.create_program.program->initialize_ = initalizers;
 		for (int i = 0; i < shaders.size(); i++) {
 			step.create_program.shaders[i] = shaders[i];
 		}
@@ -327,6 +337,13 @@ public:
 		curRenderStep_->commands.push_back(data);
 	}
 
+	void BindIndexBuffer(GLRBuffer *buffer) {  // Queues a bind of 'buffer' as the index (element array) buffer; null unbinds. Takes no offset: callers pass the byte offset to DrawIndexed() instead.
+		_dbg_assert_(G3D, curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
+		GLRRenderData data{ GLRRenderCommand::BIND_INDEX_BUFFER};
+		data.bind_buffer.buffer = buffer;
+		curRenderStep_->commands.push_back(data);
+	}
+
 	void BindInputLayout(GLRInputLayout *inputLayout, const void *offset) {
 		_dbg_assert_(G3D, curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
 		assert(inputLayout);
@@ -403,7 +420,7 @@ public:
 
 	void SetUniformM4x4(GLint *loc, const float *udata) {
 		_dbg_assert_(G3D, curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
-		GLRRenderData data{ GLRRenderCommand::UNIFORM4F };
+		GLRRenderData data{ GLRRenderCommand::UNIFORMMATRIX };
 		data.uniformMatrix4.loc = loc;
 		memcpy(data.uniformMatrix4.m, udata, sizeof(float) * 16);
 		curRenderStep_->commands.push_back(data);
@@ -442,6 +459,7 @@ public:
 	void SetStencil(bool enabled, GLenum func, GLenum sFail, GLenum zFail, GLenum pass, uint8_t writeMask, uint8_t compareMask, uint8_t refValue) {
 		_dbg_assert_(G3D, curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
 		GLRRenderData data{ GLRRenderCommand::STENCIL };
+		data.stencil.enabled = enabled;
 		data.stencil.func = func;
 		data.stencil.sFail = sFail;
 		data.stencil.zFail = zFail;
@@ -452,6 +470,14 @@ public:
 		curRenderStep_->commands.push_back(data);
 	}
 
+	void SetStencilDisabled() {
+		_dbg_assert_(G3D, curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
+		GLRRenderData data;
+		data.cmd = GLRRenderCommand::STENCIL;
+		data.stencil.enabled = false;
+		curRenderStep_->commands.push_back(data);
+	}
+
 	void SetBlendFactor(const float color[4]) {
 		_dbg_assert_(G3D, curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
 		GLRRenderData data{ GLRRenderCommand::BLENDCOLOR };
@@ -613,13 +639,7 @@ public:
 		Unmap();
 	}
 
-	void Map() {
-		assert(!writePtr_);
-		// VkResult res = vkMapMemory(device_, buffers_[buf_].deviceMemory, 0, size_, 0, (void **)(&writePtr_));
-		writePtr_ = buffers_[buf_].deviceMemory;
-		assert(writePtr_);
-	}
-
+	void Map();
 	void Unmap();
 
 	// When using the returned memory, make sure to bind the returned vkbuf.
diff --git a/ext/native/thin3d/thin3d_gl.cpp b/ext/native/thin3d/thin3d_gl.cpp
index 6ba2d06b38..1331a34f2d 100644
--- a/ext/native/thin3d/thin3d_gl.cpp
+++ b/ext/native/thin3d/thin3d_gl.cpp
@@ -1099,7 +1099,8 @@ bool OpenGLPipeline::LinkShaders() {
 	semantics.push_back({ SEM_TANGENT, "Tangent" });
 	semantics.push_back({ SEM_BINORMAL, "Binormal" });
 	std::vector<GLRProgram::UniformLocQuery> queries;
-	program_ = render_->CreateProgram(linkShaders, semantics, queries, false);
+	std::vector<GLRProgram::Initializer> initialize;
+	program_ = render_->CreateProgram(linkShaders, semantics, queries, initialize, false);
 	return true;
 }