diff --git a/CMakeLists.txt b/CMakeLists.txt
index b60032d8f6..9aeeed988f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -830,7 +830,10 @@ if(ANDROID)
 	set(nativeExtra ${nativeExtra} ${NativeAppSource})
 endif()
 
-set(THIN3D_PLATFORMS ext/native/thin3d/thin3d_gl.cpp)
+set(THIN3D_PLATFORMS ext/native/thin3d/thin3d_gl.cpp
+	ext/native/thin3d/GLRenderManager.cpp
+	ext/native/thin3d/GLQueueRunner.cpp)
+
 set(THIN3D_PLATFORMS ${THIN3D_PLATFORMS}
 	ext/native/thin3d/thin3d_vulkan.cpp
 	ext/native/thin3d/VulkanRenderManager.cpp
diff --git a/GPU/GLES/DepalettizeShaderGLES.cpp b/GPU/GLES/DepalettizeShaderGLES.cpp
index 63fc9d05bd..95f58f37c6 100644
--- a/GPU/GLES/DepalettizeShaderGLES.cpp
+++ b/GPU/GLES/DepalettizeShaderGLES.cpp
@@ -167,8 +167,8 @@ DepalShader *DepalShaderCacheGLES::GetDepalettizeShader(uint32_t clutMode, GEBuf
 	queries.push_back({ &depal->u_pal, "pal" });
 
 	std::vector<GLRProgram::Initializer> initializer;
-	initializer.push_back({ &depal->u_tex, 0 });
-	initializer.push_back({ &depal->u_pal, 3 });
+	initializer.push_back({ &depal->u_tex, 0, 0 });
+	initializer.push_back({ &depal->u_pal, 0, 3 });
 
 	std::vector<GLRShader *> shaders{ vertexShader_, fragShader };
 
@@ -177,8 +177,6 @@ DepalShader *DepalShaderCacheGLES::GetDepalettizeShader(uint32_t clutMode, GEBuf
 	depal->program = program;
 	depal->fragShader = fragShader;
 	depal->code = buffer;
-	depal->a_position = 0;
-	depal->a_texcoord0 = 1;
 	cache_[id] = depal;
 
 	delete[] buffer;
diff --git a/GPU/GLES/DepalettizeShaderGLES.h b/GPU/GLES/DepalettizeShaderGLES.h
index bc3e93facd..6fd9749fec 100644
--- a/GPU/GLES/DepalettizeShaderGLES.h
+++ b/GPU/GLES/DepalettizeShaderGLES.h
@@ -29,8 +29,6 @@ class DepalShader {
 public:
 	GLRProgram *program;
 	GLRShader *fragShader;
-	GLint a_position;
-	GLint a_texcoord0;
 	GLint u_tex;
 	GLint u_pal;
 	std::string code;
diff --git a/GPU/GLES/DrawEngineGLES.cpp b/GPU/GLES/DrawEngineGLES.cpp
index d6161e4335..166347016a 100644
--- a/GPU/GLES/DrawEngineGLES.cpp
+++ b/GPU/GLES/DrawEngineGLES.cpp
@@ -401,8 +401,14 @@ void DrawEngineGLES::DoFlush() {
 	gpuStats.numFlushes++;
 	gpuStats.numTrackedVertexArrays = (int)vai_.size();
 
+	bool textureNeedsApply = false;
+	if (gstate_c.IsDirty(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS) && !gstate.isModeClear() && gstate.isTextureMapEnabled()) {
+		textureCache_->SetTexture();
+		gstate_c.Clean(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS);
+		textureNeedsApply = true;
+	}
+
 	GEPrimitiveType prim = prevPrim_;
-	ApplyDrawState(prim);
 
 	VShaderID vsid;
 	Shader *vshader = shaderManager_->ApplyVertexShader(prim, lastVType_, &vsid);
@@ -592,12 +598,17 @@ rotateVBO:
 			gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255);
 		}
 
+		if (textureNeedsApply)
+			textureCache_->ApplyTexture();
+
+		// Need to ApplyDrawState after ApplyTexture because depal can launch a render pass and that wrecks the state.
+		ApplyDrawState(prim);
 		ApplyDrawStateLate(false, 0);
 		
 		LinkedShader *program = shaderManager_->ApplyFragmentShader(vsid, vshader, lastVType_, prim);
 		GLRInputLayout *inputLayout = SetupDecFmtForDraw(program, dec_->GetDecVtxFmt());
 		render_->BindVertexBuffer(vertexBuffer);
-		render_->BindInputLayout(inputLayout, (void *)(uintptr_t)vertexBufferOffset);
+		render_->BindInputLayout(inputLayout, vertexBufferOffset);
 		if (useElements) {
 			if (!indexBuffer) {
 				indexBufferOffset = (uint32_t)frameData.pushIndex->Push(decIndex, sizeof(uint16_t) * indexGen.VertexCount(), &indexBuffer);
@@ -653,6 +664,11 @@ rotateVBO:
 			prim, vertexCount,
 			dec_->VertexType(), inds, GE_VTYPE_IDX_16BIT, dec_->GetDecVtxFmt(),
 			maxIndex, drawBuffer, numTrans, drawIndexed, &params, &result);
+
+		if (textureNeedsApply)
+			textureCache_->ApplyTexture();
+
+		ApplyDrawState(prim);
 		ApplyDrawStateLate(result.setStencil, result.stencilValue);
 
 		LinkedShader *program = shaderManager_->ApplyFragmentShader(vsid, vshader, lastVType_, prim);
@@ -666,13 +682,13 @@ rotateVBO:
 				vertexBufferOffset = (uint32_t)frameData.pushVertex->Push(drawBuffer, maxIndex * sizeof(TransformedVertex), &vertexBuffer);
 				indexBufferOffset = (uint32_t)frameData.pushIndex->Push(inds, sizeof(uint16_t) * numTrans, &indexBuffer);
 				render_->BindVertexBuffer(vertexBuffer);
-				render_->BindInputLayout(softwareInputLayout_, (void *)(intptr_t)vertexBufferOffset);
+				render_->BindInputLayout(softwareInputLayout_, vertexBufferOffset);
 				render_->BindIndexBuffer(indexBuffer);
 				render_->DrawIndexed(glprim[prim], numTrans, GL_UNSIGNED_SHORT, (void *)(intptr_t)indexBufferOffset);
 			} else {
 				vertexBufferOffset = (uint32_t)frameData.pushVertex->Push(drawBuffer, numTrans * sizeof(TransformedVertex), &vertexBuffer);
 				render_->BindVertexBuffer(vertexBuffer);
-				render_->BindInputLayout(softwareInputLayout_, (void *)(intptr_t)vertexBufferOffset);
+				render_->BindInputLayout(softwareInputLayout_, vertexBufferOffset);
 				render_->Draw(glprim[prim], 0, numTrans);
 			}
 		} else if (result.action == SW_CLEAR) {
diff --git a/GPU/GLES/FramebufferManagerGLES.cpp b/GPU/GLES/FramebufferManagerGLES.cpp
index 2f033d736c..76acf62990 100644
--- a/GPU/GLES/FramebufferManagerGLES.cpp
+++ b/GPU/GLES/FramebufferManagerGLES.cpp
@@ -466,7 +466,7 @@ void FramebufferManagerGLES::DrawActiveTexture(float x, float y, float w, float
 	void *dest = drawEngineGL_->GetPushVertexBuffer()->Push(sizeof(verts), &bindOffset, &buffer);
 	memcpy(dest, verts, sizeof(verts));
 	render_->BindVertexBuffer(buffer);
-	render_->BindInputLayout(simple2DInputLayout_, (void *)(intptr_t)bindOffset);
+	render_->BindInputLayout(simple2DInputLayout_, bindOffset);
 	render_->Draw(GL_TRIANGLE_STRIP, 0, 4);
 }
 
diff --git a/GPU/GLES/StateMappingGLES.cpp b/GPU/GLES/StateMappingGLES.cpp
index 11b7e79576..932d148656 100644
--- a/GPU/GLES/StateMappingGLES.cpp
+++ b/GPU/GLES/StateMappingGLES.cpp
@@ -132,11 +132,6 @@ inline void DrawEngineGLES::ResetShaderBlending() {
 void DrawEngineGLES::ApplyDrawState(int prim) {
 	GLRenderManager *renderManager = (GLRenderManager *)draw_->GetNativeObject(Draw::NativeObject::RENDER_MANAGER);
 
-	if (gstate_c.IsDirty(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS) && !gstate.isModeClear() && gstate.isTextureMapEnabled()) {
-		textureCache_->SetTexture();
-		gstate_c.Clean(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS);
-	}
-
 	if (!gstate_c.IsDirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE)) {
 		// Nothing to do, let's early-out
 		return;
@@ -341,10 +336,6 @@ void DrawEngineGLES::ApplyDrawStateLate(bool setStencil, int stencilValue) {
 			fboTexNeedBind_ = false;
 		}
 
-		// Apply the texture after the FBO tex, since it might unbind the texture.
-		// TODO: Could use a separate texture unit to be safer?
-		textureCache_->ApplyTexture();
-
 		// Apply last, once we know the alpha params of the texture.
 		if (gstate.isAlphaTestEnabled() || gstate.isColorTestEnabled()) {
 			fragmentTestCache_->BindTestTexture(2);
diff --git a/GPU/GLES/TextureCacheGLES.cpp b/GPU/GLES/TextureCacheGLES.cpp
index fffdc1da9e..652082d148 100644
--- a/GPU/GLES/TextureCacheGLES.cpp
+++ b/GPU/GLES/TextureCacheGLES.cpp
@@ -53,9 +53,14 @@ TextureCacheGLES::TextureCacheGLES(Draw::DrawContext *draw)
 	SetupTextureDecoder();
 
 	nextTexture_ = nullptr;
+	std::vector<GLRInputLayout::Entry> entries;
+	entries.push_back({ 0, 3, GL_FLOAT, GL_FALSE, 20, 0 });
+	entries.push_back({ 1, 2, GL_FLOAT, GL_FALSE, 20, 12 });
+	shadeInputLayout_ = render_->CreateInputLayout(entries);
 }
 
 TextureCacheGLES::~TextureCacheGLES() {
+	render_->DeleteInputLayout(shadeInputLayout_);
 	Clear(true);
 }
 
@@ -65,7 +70,7 @@ void TextureCacheGLES::SetFramebufferManager(FramebufferManagerGLES *fbManager)
 }
 
 void TextureCacheGLES::ReleaseTexture(TexCacheEntry *entry, bool delete_them) {
-	DEBUG_LOG(G3D, "Deleting texture %i", entry->textureName);
+	DEBUG_LOG(G3D, "Deleting texture %08x", entry->addr);
 	if (delete_them) {
 		if (entry->textureName) {
 			render_->DeleteTexture(entry->textureName);
@@ -322,21 +327,11 @@ void TextureCacheGLES::Unbind() {
 class TextureShaderApplier {
 public:
 	struct Pos {
-		Pos(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {
-		}
-		Pos() {
-		}
-
 		float x;
 		float y;
 		float z;
 	};
 	struct UV {
-		UV(float u_, float v_) : u(u_), v(v_) {
-		}
-		UV() {
-		}
-
 		float u;
 		float v;
 	};
@@ -379,69 +374,45 @@ public:
 			const float top = v1 * invHalfHeight - 1.0f;
 			const float bottom = v2 * invHalfHeight - 1.0f;
 			// Points are: BL, BR, TR, TL.
-			pos_[0] = Pos(left, bottom, -1.0f);
-			pos_[1] = Pos(right, bottom, -1.0f);
-			pos_[2] = Pos(right, top, -1.0f);
-			pos_[3] = Pos(left, top, -1.0f);
+			pos_[0] = Pos{ left, bottom, -1.0f };
+			pos_[1] = Pos{ right, bottom, -1.0f };
+			pos_[2] = Pos{ right, top, -1.0f };
+			pos_[3] = Pos{ left, top, -1.0f };
 
 			// And also the UVs, same order.
 			const float uvleft = u1 * invWidth;
 			const float uvright = u2 * invWidth;
 			const float uvtop = v1 * invHeight;
 			const float uvbottom = v2 * invHeight;
-			uv_[0] = UV(uvleft, uvbottom);
-			uv_[1] = UV(uvright, uvbottom);
-			uv_[2] = UV(uvright, uvtop);
-			uv_[3] = UV(uvleft, uvtop);
+			uv_[0] = UV{ uvleft, uvbottom };
+			uv_[1] = UV{ uvright, uvbottom };
+			uv_[2] = UV{ uvright, uvtop };
+			uv_[3] = UV{ uvleft, uvtop };
 		}
 	}
 
-	void Use(GLRenderManager *render, DrawEngineGLES *transformDraw) {
+	void Use(GLRenderManager *render, DrawEngineGLES *transformDraw, GLRInputLayout *inputLayout) {
 		render->BindProgram(shader_->program);
-
-		/*
-		// Restore will rebind all of the state below.
-		if (gstate_c.Supports(GPU_SUPPORTS_VAO)) {
-			static const GLubyte indices[4] = { 0, 1, 3, 2 };
-			transformDraw->BindBuffer(pos_, sizeof(pos_), uv_, sizeof(uv_));
-			transformDraw->BindElementBuffer(indices, sizeof(indices));
-		} else {
-			glBindBuffer(GL_ARRAY_BUFFER, 0);
-			glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
+		struct SimpleVertex {
+			float pos[3];
+			float uv[2];
+		};
+		uint32_t bindOffset;
+		GLRBuffer *bindBuffer;
+		SimpleVertex *verts = (SimpleVertex *)transformDraw->GetPushVertexBuffer()->Push(sizeof(SimpleVertex) * 4, &bindOffset, &bindBuffer);
+		int order[4] = { 0 ,1, 3, 2 };
+		for (int i = 0; i < 4; i++) {
+			memcpy(verts[i].pos, &pos_[order[i]], sizeof(Pos));
+			memcpy(verts[i].uv, &uv_[order[i]], sizeof(UV));
 		}
-		glEnableVertexAttribArray(shader_->a_position);
-		glEnableVertexAttribArray(shader_->a_texcoord0);
-		*/
+		render->BindVertexBuffer(bindBuffer);
+		render->BindInputLayout(inputLayout, bindOffset);
 	}
 
-	void Shade() {
+	void Shade(GLRenderManager *render) {
 		static const GLubyte indices[4] = { 0, 1, 3, 2 };
-		/*
-		glstate.blend.force(false);
-		glstate.colorMask.force(true, true, true, true);
-		glstate.scissorTest.force(false);
-		glstate.cullFace.force(false);
-		glstate.depthTest.force(false);
-		glstate.stencilTest.force(false);
-#if !defined(USING_GLES2)
-		glstate.colorLogicOp.force(false);
-#endif
-		glViewport(0, 0, renderW_, renderH_);
-
-		if (gstate_c.Supports(GPU_SUPPORTS_VAO)) {
-			glVertexAttribPointer(shader_->a_position, 3, GL_FLOAT, GL_FALSE, 12, 0);
-			glVertexAttribPointer(shader_->a_texcoord0, 2, GL_FLOAT, GL_FALSE, 8, (void *)sizeof(pos_));
-			glDrawElements(GL_TRIANGLE_STRIP, 4, GL_UNSIGNED_BYTE, 0);
-		} else {
-			glVertexAttribPointer(shader_->a_position, 3, GL_FLOAT, GL_FALSE, 12, pos_);
-			glVertexAttribPointer(shader_->a_texcoord0, 2, GL_FLOAT, GL_FALSE, 8, uv_);
-			glDrawElements(GL_TRIANGLE_STRIP, 4, GL_UNSIGNED_BYTE, indices);
-		}
-		glDisableVertexAttribArray(shader_->a_position);
-		glDisableVertexAttribArray(shader_->a_texcoord0);
-
-		glstate.Restore();
-		*/
+		render->SetViewport(GLRViewport{ 0, 0, (float)renderW_, (float)renderH_, 0.0f, 1.0f });
+		render->Draw(GL_TRIANGLE_STRIP, 0, 4);
 	}
 
 protected:
@@ -469,14 +440,13 @@ void TextureCacheGLES::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFram
 
 		TextureShaderApplier shaderApply(depal, framebuffer->bufferWidth, framebuffer->bufferHeight, framebuffer->renderWidth, framebuffer->renderHeight);
 		shaderApply.ApplyBounds(gstate_c.vertBounds, gstate_c.curTextureXOffset, gstate_c.curTextureYOffset);
-		shaderApply.Use(render_, drawEngine_);
-
-		render_->BindTexture(3, clutTexture);
+		shaderApply.Use(render_, drawEngine_, shadeInputLayout_);
 
 		framebufferManagerGL_->BindFramebufferAsColorTexture(0, framebuffer, BINDFBCOLOR_SKIP_COPY);
+		render_->BindTexture(3, clutTexture);
 		render_->SetTextureSampler(GL_CLAMP_TO_EDGE, GL_CLAMP_TO_EDGE, GL_NEAREST, GL_NEAREST, 0.0f);
 
-		shaderApply.Shade();
+		shaderApply.Shade(render_);
 
 		draw_->BindFramebufferAsTexture(depalFBO, 0, Draw::FB_COLOR_BIT, 0);
 
@@ -497,6 +467,9 @@ void TextureCacheGLES::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFram
 	SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight);
 
 	InvalidateLastTexture();
+
+	// Render passes may have been started/ended above, which invalidates bound state; mark it all dirty so it gets re-applied.
+	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE);
 }
 
 ReplacedTextureFormat FromGLESFormat(GLenum fmt) {
diff --git a/GPU/GLES/TextureCacheGLES.h b/GPU/GLES/TextureCacheGLES.h
index dcefcfb700..2d415c599a 100644
--- a/GPU/GLES/TextureCacheGLES.h
+++ b/GPU/GLES/TextureCacheGLES.h
@@ -101,6 +101,8 @@ private:
 	ShaderManagerGLES *shaderManager_;
 	DrawEngineGLES *drawEngine_;
 
+	GLRInputLayout *shadeInputLayout_;
+
 	enum { INVALID_TEX = -1 };
 };
 
diff --git a/ext/native/thin3d/GLQueueRunner.cpp b/ext/native/thin3d/GLQueueRunner.cpp
index 7fbff23834..0f7288ea3c 100644
--- a/ext/native/thin3d/GLQueueRunner.cpp
+++ b/ext/native/thin3d/GLQueueRunner.cpp
@@ -82,11 +82,9 @@ void GLQueueRunner::RunInitSteps(const std::vector<GLRInitStep> &steps) {
 				glBindFragDataLocation(program->program, 0, "fragColor0");
 			}
 #elif !defined(IOS)
-			if (gl_extensions.GLES3) {
-				if (gstate_c.featureFlags & GPU_SUPPORTS_DUALSOURCE_BLEND) {
-					glBindFragDataLocationIndexedEXT(program->program, 0, 0, "fragColor0");
-					glBindFragDataLocationIndexedEXT(program->program, 0, 1, "fragColor1");
-				}
+			if (gl_extensions.GLES3 && step.create_program.support_dual_source) {
+				glBindFragDataLocationIndexedEXT(program->program, 0, 0, "fragColor0");
+				glBindFragDataLocationIndexedEXT(program->program, 0, 1, "fragColor1");
 			}
 #endif
 			glLinkProgram(program->program);
@@ -413,7 +411,9 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step) {
 	glActiveTexture(GL_TEXTURE0 + activeTexture);
 
 	int attrMask = 0;
-
+	int colorMask = -1;
+	int depthMask = -1;
+	int depthFunc = -1;
 	// State filtering tracking.
 	GLuint curArrayBuffer = (GLuint)-1;
 	GLuint curElemArrayBuffer = (GLuint)-1;
@@ -424,8 +424,14 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step) {
 		case GLRRenderCommand::DEPTH:
 			if (c.depth.enabled) {
 				glEnable(GL_DEPTH_TEST);
-				glDepthMask(c.depth.write);
-				glDepthFunc(c.depth.func);
+				if (c.depth.write != depthMask) {
+					glDepthMask(c.depth.write);
+					depthMask = c.depth.write;
+				}
+				if (c.depth.func != depthFunc) {
+					glDepthFunc(c.depth.func);
+					depthFunc = c.depth.func;
+				}
 			} else {
 				glDisable(GL_DEPTH_TEST);
 			}
@@ -438,11 +444,15 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step) {
 			} else {
 				glDisable(GL_BLEND);
 			}
-			glColorMask(c.blend.mask & 1, (c.blend.mask >> 1) & 1, (c.blend.mask >> 2) & 1, (c.blend.mask >> 3) & 1);
+			if (c.blend.mask != colorMask) {
+				glColorMask(c.blend.mask & 1, (c.blend.mask >> 1) & 1, (c.blend.mask >> 2) & 1, (c.blend.mask >> 3) & 1);
+				colorMask = c.blend.mask;
+			}
 			break;
 		case GLRRenderCommand::CLEAR:
 			glDisable(GL_SCISSOR_TEST);
 			glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+			colorMask = 0xF;
 			if (c.clear.clearMask & GL_COLOR_BUFFER_BIT) {
 				float color[4];
 				Uint8x4ToFloat4(color, c.clear.clearColor);
@@ -485,7 +495,11 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step) {
 
 			// TODO: Support FP viewports through glViewportArrays
 			glViewport((GLint)c.viewport.vp.x, (GLint)y, (GLsizei)c.viewport.vp.w, (GLsizei)c.viewport.vp.h);
+#if !defined(USING_GLES2)
 			glDepthRange(c.viewport.vp.minZ, c.viewport.vp.maxZ);
+#else
+			glDepthRangef(c.viewport.vp.minZ, c.viewport.vp.maxZ);
+#endif
 			break;
 		}
 		case GLRRenderCommand::SCISSOR:
@@ -597,8 +611,10 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step) {
 		}
 		case GLRRenderCommand::BINDPROGRAM:
 		{
-			glUseProgram(c.program.program->program);
-			curProgram = c.program.program;
+			if (curProgram != c.program.program) {
+				glUseProgram(c.program.program->program);
+				curProgram = c.program.program;
+			}
 			break;
 		}
 		case GLRRenderCommand::BIND_INPUT_LAYOUT:
diff --git a/ext/native/thin3d/GLQueueRunner.h b/ext/native/thin3d/GLQueueRunner.h
index 1c805f3eed..56e2d507eb 100644
--- a/ext/native/thin3d/GLQueueRunner.h
+++ b/ext/native/thin3d/GLQueueRunner.h
@@ -136,7 +136,7 @@ struct GLRRenderData {
 		} program;
 		struct {
 			GLRInputLayout *inputLayout;
-			intptr_t offset;
+			size_t offset;
 		} inputLayout;
 		struct {
 			GLenum wrapS;
diff --git a/ext/native/thin3d/GLRenderManager.h b/ext/native/thin3d/GLRenderManager.h
index 0cab4cb7ca..145e42cc00 100644
--- a/ext/native/thin3d/GLRenderManager.h
+++ b/ext/native/thin3d/GLRenderManager.h
@@ -3,7 +3,9 @@
 #include <thread>
 #include <map>
 #include <vector>
+#include <string>
 #include <mutex>
+#include <condition_variable>
 #include <cassert>
 
 #include "gfx/gl_common.h"
@@ -83,8 +85,9 @@ public:
 	};
 
 	// Must ONLY be called from GLQueueRunner!
+	// Also it's pretty slow: each call constructs a temporary std::string just to search the cache.
 	int GetUniformLoc(const char *name) {
-		auto iter = uniformCache_.find(name);
+		auto iter = uniformCache_.find(std::string(name));
 		int loc = -1;
 		if (iter != uniformCache_.end()) {
 			loc = iter->second.loc_;
@@ -232,6 +235,7 @@ public:
 		step.create_program.program->semantics_ = semantics;
 		step.create_program.program->queries_ = queries;
 		step.create_program.program->initialize_ = initalizers;
+		step.create_program.support_dual_source = supportDualSource;
 		_assert_msg_(G3D, shaders.size() > 0, "Can't create a program with zero shaders");
 		for (int i = 0; i < shaders.size(); i++) {
 			step.create_program.shaders[i] = shaders[i];
@@ -288,15 +292,15 @@ public:
 	void BlitFramebuffer(GLRFramebuffer *src, GLRect2D srcRect, GLRFramebuffer *dst, GLRect2D dstRect, int aspectMask, bool filter);
 
 	// Takes ownership of data if deleteData = true.
-	void BufferSubdata(GLRBuffer *buffer, int offset, int size, uint8_t *data, bool deleteData = true) {
+	void BufferSubdata(GLRBuffer *buffer, size_t offset, size_t size, uint8_t *data, bool deleteData = true) {
 		// TODO: Maybe should be a render command instead of an init command? When possible it's better as
 		// an init command, that's for sure.
 		GLRInitStep step{ GLRInitStepType::BUFFER_SUBDATA };
 		_dbg_assert_(G3D, offset >= 0);
 		_dbg_assert_(G3D, offset <= buffer->size_ - size);
 		step.buffer_subdata.buffer = buffer;
-		step.buffer_subdata.offset = offset;
-		step.buffer_subdata.size = size;
+		step.buffer_subdata.offset = (int)offset;
+		step.buffer_subdata.size = (int)size;
 		step.buffer_subdata.data = data;
 		step.buffer_subdata.deleteData = deleteData;
 		initSteps_.push_back(step);
@@ -366,12 +370,12 @@ public:
 		curRenderStep_->commands.push_back(data);
 	}
 
-	void BindInputLayout(GLRInputLayout *inputLayout, const void *offset) {
+	void BindInputLayout(GLRInputLayout *inputLayout, size_t offset) {
 		_dbg_assert_(G3D, curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
 		assert(inputLayout);
 		GLRRenderData data{ GLRRenderCommand::BIND_INPUT_LAYOUT };
 		data.inputLayout.inputLayout = inputLayout;
-		data.inputLayout.offset = (intptr_t)offset;
+		data.inputLayout.offset = offset;
 		curRenderStep_->commands.push_back(data);
 	}
 
diff --git a/ext/native/thin3d/thin3d_gl.cpp b/ext/native/thin3d/thin3d_gl.cpp
index 05aed08ef1..76c8d6b6bd 100644
--- a/ext/native/thin3d/thin3d_gl.cpp
+++ b/ext/native/thin3d/thin3d_gl.cpp
@@ -1115,7 +1115,6 @@ void OpenGLContext::DrawIndexed(int vertexCount, int offset) {
 }
 
 void OpenGLContext::DrawUP(const void *vdata, int vertexCount) {
-#if 1
 	int stride = curPipeline_->inputLayout->stride;
 	size_t dataSize = stride * vertexCount;
 
@@ -1127,16 +1126,9 @@ void OpenGLContext::DrawUP(const void *vdata, int vertexCount) {
 	ApplySamplers();
 
 	renderManager_.BindVertexBuffer(buf);
-	renderManager_.BindInputLayout(curPipeline_->inputLayout->inputLayout_, (void *)offset);
+	renderManager_.BindInputLayout(curPipeline_->inputLayout->inputLayout_, offset);
 
 	renderManager_.Draw(curPipeline_->prim, 0, vertexCount);
-#else
-	ApplySamplers();
-	renderManager_.BindInputLayout(curPipeline_->inputLayout->inputLayout_, (void *)vdata);
-	renderManager_.Draw(curPipeline_->prim, 0, vertexCount);
-	renderManager_.UnbindInputLayout(curPipeline_->inputLayout->inputLayout_);
-
-#endif
 }
 
 void OpenGLContext::Clear(int mask, uint32_t colorval, float depthVal, int stencilVal) {