Directx9 Gpu

2024-11-25 09:09:49 +00:00 · 2013-08-17 11:23:51 +02:00 · 2013-08-17 11:23:51 +02:00 · 3188c00629
commit 3188c00629
parent 0b2cd9ccf4
30 changed files with 11528 additions and 0 deletions
--- a/GPU/Directx9/DisplayListInterpreter.cpp
+++ b/GPU/Directx9/DisplayListInterpreter.cpp
--- a/GPU/Directx9/DisplayListInterpreter.h
+++ b/GPU/Directx9/DisplayListInterpreter.h
@ -0,0 +1,90 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include <list>
+#include <deque>
+
+#include "../GPUCommon.h"
+#include "Framebuffer.h"
+#include "VertexDecoder.h"
+#include "TransformPipeline.h"
+#include "TextureCache.h"
+#include "helper/fbo.h"
+
+class ShaderManager;
+class LinkedShader;
+
+// GPU backend class for the Direct3D 9 renderer. It derives from GPUCommon and
+// owns the framebuffer manager, texture cache and transform/draw engine by value;
+// the shader manager is held through a pointer.
+class DIRECTX9_GPU : public GPUCommon
+{
+public:
+	DIRECTX9_GPU();
+	~DIRECTX9_GPU();
+	virtual void InitClear();
+	// Command execution hooks; `op` is the raw command word and `diff` is
+	// presumably the XOR against the previous value of that command - TODO confirm.
+	virtual void PreExecuteOp(u32 op, u32 diff);
+	virtual void ExecuteOp(u32 op, u32 diff);
+	virtual u32  DrawSync(int mode);
+
+	virtual void SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat format);
+	virtual void CopyDisplayToOutput();
+	virtual void BeginFrame();
+	virtual void UpdateStats();
+	virtual void InvalidateCache(u32 addr, int size, GPUInvalidationType type);
+	virtual void UpdateMemory(u32 dest, u32 src, int size);
+	virtual void ClearCacheNextFrame();
+	virtual void DeviceLost();  // Only happens on Android. Drop all textures and shaders.
+
+	virtual void DumpNextFrame();
+	virtual void Flush();
+	virtual void DoState(PointerWrap &p);
+	
+	// Called by the window system if the window size changed. This will be reflected in PSPCoreParam.pixel*.
+	virtual void Resized();
+	// Forwards texture decoding for the given GPU state to the texture cache.
+	virtual bool DecodeTexture(u8* dest, GPUgstate state)
+	{
+		return textureCache_.DecodeTexture(dest, state);
+	}
+	virtual bool FramebufferDirty();
+
+	// Copies the cached reporting strings (filled elsewhere, presumably by
+	// BuildReportingInfo - TODO confirm) into the caller's output parameters.
+	virtual void GetReportingInfo(std::string &primaryInfo, std::string &fullInfo) {
+		primaryInfo = reportingPrimaryInfo_;
+		fullInfo = reportingFullInfo_;
+	}
+	// NOTE(review): std::vector and std::string are used here, but this header
+	// only includes <list> and <deque>; it relies on transitive includes.
+	std::vector<FramebufferInfo> GetFramebufferList();
+
+protected:
+	virtual void FastRunLoop(DisplayList &list);
+
+private:
+	void DoBlockTransfer();
+	void ApplyDrawState(int prim);
+	void CheckFlushOp(u32 op, u32 diff);
+	void BuildReportingInfo();
+
+	// Sub-systems owned by value; they are constructed in declaration order.
+	FramebufferManager framebufferManager_;
+	TextureCache textureCache_;
+	TransformDrawEngine transformDraw_;
+	ShaderManager *shaderManager_;
+
+	u8 *flushBeforeCommand_;
+	bool resized_;
+	int lastVsync_;
+
+	// Strings handed out by GetReportingInfo().
+	std::string reportingPrimaryInfo_;
+	std::string reportingFullInfo_;
+};
--- a/GPU/Directx9/FragmentShaderGenerator.cpp
+++ b/GPU/Directx9/FragmentShaderGenerator.cpp
@ -0,0 +1,315 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include "FragmentShaderGenerator.h"
+#include "../ge_constants.h"
+#include "../GPUState.h"
+#include <cstdio>
+
+#define WRITE p+=sprintf
+
+// #define DEBUG_SHADER
+
+// GL_NV_shader_framebuffer_fetch looks interesting....
+
+// Returns true when the currently configured alpha test cannot reject any
+// fragment, so the generated shader can leave the test out entirely.
+// The comparison function is read from the low 3 bits of gstate.alphatest and
+// the 8-bit reference value from bits 8-15.
+static bool IsAlphaTestTriviallyTrue() {
+	int alphaTestFunc = gstate.alphatest & 7;
+	int alphaTestRef = (gstate.alphatest >> 8) & 0xFF;
+	
+	switch (alphaTestFunc) {
+	case GE_COMP_ALWAYS:
+		return true;
+
+	case GE_COMP_GEQUAL:
+		// alpha >= 0 holds for every alpha. Return explicitly: falling through
+		// into the LEQUAL case would wrongly treat "alpha >= 255" as trivial.
+		return alphaTestRef == 0;
+
+	// This breaks the trees in MotoGP, for example.
+	// case GE_COMP_GREATER:
+	//if (alphaTestRef == 0 && (gstate.alphaBlendEnable & 1) && gstate.getBlendFuncA() == GE_SRCBLEND_SRCALPHA && gstate.getBlendFuncB() == GE_SRCBLEND_INVSRCALPHA)
+	//	return true;
+
+	case GE_COMP_LEQUAL:
+		// alpha <= 255 holds for every 8-bit alpha.
+		return alphaTestRef == 255;
+
+	default:
+		return false;
+	}
+}
+
+// Reports whether the configured color test passes for every fragment, which is
+// only the case when the comparison function is GE_COMP_ALWAYS.
+static bool IsColorTestTriviallyTrue() {
+	// The comparison function lives in the low two bits of the register.
+	const int func = gstate.colortest & 3;
+	return func == GE_COMP_ALWAYS;
+}
+
+// Decides whether the "doubled source alpha" blend factor can be emulated by
+// doubling alpha in the fragment shader. Requires blending to be enabled and
+// exactly the situation where one factor is DOUBLESRCALPHA while the other
+// factor does not itself depend on (inverse) source alpha.
+static bool CanDoubleSrcBlendMode() {
+	if (!gstate.isAlphaBlendEnabled())
+		return false;
+
+	const int factorA = gstate.getBlendFuncA();
+	const int factorB = gstate.getBlendFuncB();
+
+	// Find which side is doubled; `other` is the opposite factor.
+	int other;
+	if (factorA == GE_SRCBLEND_DOUBLESRCALPHA) {
+		other = factorB;
+	} else if (factorB == GE_SRCBLEND_DOUBLESRCALPHA) {
+		other = factorA;
+	} else {
+		return false;
+	}
+
+	// One side should be doubled.  Let's check the other side.
+	// LittleBigPlanet, for example, uses 2.0 * src, 1.0 - src, which can't double.
+	const bool otherUsesSrcAlpha =
+		other == GE_DSTBLEND_SRCALPHA || other == GE_DSTBLEND_INVSRCALPHA;
+	return !otherUsesSrcAlpha;
+}
+
+
+// Here we must take all the bits of the gstate that determine what the fragment shader will
+// look like, and concatenate them together into an ID.
+// Packs every piece of GPU state that influences the generated fragment shader
+// into id->d[0], so that identical IDs imply identical shader source.
+void ComputeFragmentShaderID(FragmentShaderID *id) {
+	memset(&id->d[0], 0, sizeof(id->d));
+
+	if (gstate.clearmode & 1) {
+		// We only need one clear shader, so let's ignore the rest of the bits.
+		id->d[0] = 1;
+		return;
+	}
+
+	const int lmode = (gstate.lmode & 1) && gstate.isLightingEnabled();
+	const bool fog = gstate.isFogEnabled() && !gstate.isModeThrough();
+	const bool alphaTest = gstate.isAlphaTestEnabled() && !IsAlphaTestTriviallyTrue();
+	const bool colorTest = gstate.isColorTestEnabled() && !IsColorTestTriviallyTrue();
+	const bool colorDoubling = (gstate.texfunc & 0x10000) != 0;
+	// This isn't really correct, but it's a hack to get doubled blend modes to work more correctly.
+	const bool alphaDoubling = CanDoubleSrcBlendMode();
+	const bool textureProjection = gstate.getUVGenMode() == 1;
+
+	// All texfuncs except replace are the same for RGB as for RGBA with full alpha.
+	bool textureAlpha = (gstate.texfunc & 0x100) != 0;
+	if (gstate_c.textureFullAlpha && (gstate.texfunc & 0x7) != GE_TEXFUNC_REPLACE)
+		textureAlpha = false;
+
+	// Bit 0 is reserved for the clear-mode shader handled above.
+	u32 bits = 0;
+	if (gstate.isTextureMapEnabled()) {
+		bits |= 1 << 1;
+		bits |= (gstate.texfunc & 0x7) << 2;
+		bits |= (textureAlpha & 1) << 5; // rgb or rgba
+	}
+	bits |= (lmode & 1) << 7;
+	bits |= gstate.isAlphaTestEnabled() << 8;
+	if (alphaTest)
+		bits |= (gstate.alphatest & 0x7) << 9;	 // alpha test func
+	bits |= gstate.isColorTestEnabled() << 12;
+	if (colorTest)
+		bits |= (gstate.colortest & 0x3) << 13;	 // color test func
+	bits |= (fog & 1) << 15;
+	bits |= (textureProjection & 1) << 16;
+	bits |= (colorDoubling & 1) << 17;
+	bits |= (alphaDoubling & 1) << 18;
+
+	id->d[0] = bits;
+}
+
+// Missing: Z depth range
+// Also, logic ops etc, of course. Urgh.
+#if 0
+// Debug variant, compiled out by the enclosing "#if 0": ignores the GPU state and
+// always copies a fixed HLSL pixel shader that returns the sampled texel.
+// `buffer` must be large enough for the whole string; strcpy does no bounds checking.
+void GenerateFragmentShader(char *buffer) {
+	//--------------------------------------------------------------------------------------
+	// Pixel shader
+	//--------------------------------------------------------------------------------------
+	const char * pscode =
+		" sampler s: register(s0);					   "
+		" struct PS_IN                                 "
+		" {                                            "
+		"		float3 Uv   : TEXCOORD0;              "
+		"		float4 C1    : COLOR0;                 "  // Vertex color
+		"		float4 C2    : COLOR1;                 "  // Vertex color                     
+		" };                                           " 
+		"                                              "
+		" float4 main( PS_IN In ) : COLOR              "
+		" {                                            "
+		//"   float4 c = In.C1;							"
+		"	float4 c = tex2D(s, In.Uv.xy);			"
+		"   return c;								   "
+		" }                                            ";
+
+	strcpy(buffer, pscode);
+}
+#else
+// Writes the HLSL source of the pixel shader matching the current GPU state into
+// `buffer` as a NUL-terminated string.
+//
+// The output is produced with sprintf (see the WRITE macro), so there is no
+// bounds checking: the caller must supply a buffer big enough for the longest
+// shader this function can emit.
+//
+// The state consulted here has to stay in sync with ComputeFragmentShaderID(),
+// since that ID is what distinguishes one generated shader from another.
+void GenerateFragmentShader(char *buffer) {
+	char *p = buffer;
+
+	int lmode = (gstate.lmode & 1) && gstate.isLightingEnabled();
+	int doTexture = gstate.isTextureMapEnabled() && !gstate.isModeClear();
+	bool enableFog = gstate.isFogEnabled() && !gstate.isModeThrough() && !gstate.isModeClear();
+	bool enableAlphaTest = gstate.isAlphaTestEnabled() && !IsAlphaTestTriviallyTrue() && !gstate.isModeClear();
+	bool enableColorTest = gstate.isColorTestEnabled() && !IsColorTestTriviallyTrue() && !gstate.isModeClear();
+	bool enableColorDoubling = (gstate.texfunc & 0x10000) != 0;
+	// This isn't really correct, but it's a hack to get doubled blend modes to work more correctly.
+	bool enableAlphaDoubling = CanDoubleSrcBlendMode();
+	bool doTextureProjection = gstate.getUVGenMode() == 1;
+	bool doTextureAlpha = (gstate.texfunc & 0x100) != 0;
+
+	// All texfuncs except replace are the same for RGB as for RGBA with full alpha.
+	if (gstate_c.textureFullAlpha && (gstate.texfunc & 0x7) != GE_TEXFUNC_REPLACE)
+		doTextureAlpha = false;
+
+	// --- Global declarations: sampler, constants and helper functions. ---
+	if (doTexture)
+		WRITE(p, "sampler tex: register(s0);\n");
+
+	if (enableAlphaTest || enableColorTest) {
+		WRITE(p, "float4 u_alphacolorref;\n");
+		WRITE(p, "float3 u_colormask;\n");
+	}
+	if (gstate.isTextureMapEnabled()) 
+		WRITE(p, "float3 u_texenv;\n");
+	if (enableFog) {
+		WRITE(p, "float3 u_fogcolor;\n");
+	}
+	
+
+	if (enableAlphaTest) {
+		WRITE(p, "float roundAndScaleTo255f(float x) { return floor(x * 255.0f + 0.5f); }\n");
+	}
+	if (enableColorTest) {
+		WRITE(p, "float3 roundAndScaleTo255v(float3 x) { return floor(x * 255.0f + 0.5f); }\n");
+	}
+
+	// --- Input structure and entry point. ---
+	WRITE(p, " struct PS_IN                               ");
+	WRITE(p, " {                                          ");
+	WRITE(p, "		float4 v_texcoord: TEXCOORD0;         ");
+	WRITE(p, "		float4 v_color0: COLOR0;              "); 
+	WRITE(p, "		float4 v_color1: COLOR1;              ");    
+	if (enableFog) {
+		WRITE(p, "float v_fogdepth:FOG;\n");
+	}
+	WRITE(p, " };                                         "); 
+	WRITE(p, "                                            ");
+	WRITE(p, " float4 main( PS_IN In ) : COLOR            ");
+	WRITE(p, " {									      ");
+
+	if (gstate.isModeClear()) {
+		// Clear mode does not allow any fancy shading.
+		WRITE(p, "  return In.v_color0;\n");
+	} else {
+		const char *secondary = "";
+		// Secondary color for specular on top of texture
+		if (lmode) {
+			WRITE(p, "  float4 s = float4(In.v_color1);\n");
+			secondary = " + s";
+		} else {
+			secondary = "";
+		}
+
+		// --- Texture environment: combine texel `t` with primary color `p` into `v`. ---
+		if (gstate.textureMapEnable & 1) {
+			if (doTextureProjection) {
+				WRITE(p, "  float4 t = tex2Dproj(tex, In.v_texcoord);\n");
+			} else {
+				WRITE(p, "  float4 t = tex2D(tex, In.v_texcoord.xy);\n");
+			}
+			WRITE(p, "  float4 p = In.v_color0;\n");
+
+			if (doTextureAlpha) { // texfmt == RGBA
+				switch (gstate.texfunc & 0x7) {
+				case GE_TEXFUNC_MODULATE:
+					WRITE(p, "  float4 v = p * t%s;\n", secondary); break;
+				case GE_TEXFUNC_DECAL:
+					WRITE(p, "  float4 v = float4(lerp(p.rgb, t.rgb, t.a), p.a)%s;\n", secondary); break;
+				case GE_TEXFUNC_BLEND:
+					WRITE(p, "  float4 v = float4(lerp(p.rgb, u_texenv.rgb, t.rgb), p.a * t.a)%s;\n", secondary); break;
+				case GE_TEXFUNC_REPLACE:
+					WRITE(p, "  float4 v = t%s;\n", secondary); break;
+				case GE_TEXFUNC_ADD:
+					WRITE(p, "  float4 v = float4(p.rgb + t.rgb, p.a * t.a)%s;\n", secondary); break;
+				default:
+					WRITE(p, "  float4 v = p;\n"); break;
+				}
+
+			} else {	// texfmt == RGB
+				switch (gstate.texfunc & 0x7) {
+				case GE_TEXFUNC_MODULATE:
+					WRITE(p, "  float4 v = float4(t.rgb * p.rgb, p.a)%s;\n", secondary); break;
+				case GE_TEXFUNC_DECAL:
+					WRITE(p, "  float4 v = float4(t.rgb, p.a)%s;\n", secondary); break;
+				case GE_TEXFUNC_BLEND:
+					WRITE(p, "  float4 v = float4(lerp(p.rgb, u_texenv.rgb, t.rgb), p.a)%s;\n", secondary); break;
+				case GE_TEXFUNC_REPLACE:
+					WRITE(p, "  float4 v = float4(t.rgb, p.a)%s;\n", secondary); break;
+				case GE_TEXFUNC_ADD:
+					WRITE(p, "  float4 v = float4(p.rgb + t.rgb, p.a)%s;\n", secondary); break;
+				default:
+					WRITE(p, "  float4 v = p;\n"); break;
+				}
+			}
+		} else {
+			// No texture mapping
+			WRITE(p, "  float4 v = In.v_color0 %s;\n", secondary);
+			// HACK ONLY DISPLAY TEXTuRE !!!
+			//WRITE(p, "  clip(-1);\n");
+		}
+
+		// --- Alpha test. The table holds the INVERTED comparison for each test
+		// function, because the shader discards the fragment when the test fails. ---
+		if (enableAlphaTest) {
+			int alphaTestFunc = gstate.alphatest & 7;
+			const char *alphaTestFuncs[] = { "#", "#", " != ", " == ", " >= ", " > ", " <= ", " < " };	// never/always don't make sense
+			if (alphaTestFuncs[alphaTestFunc][0] != '#') {
+				// WRITE(p, "  if (roundAndScaleTo255f(v.a) %s u_alphacolorref.a) discard;\n", alphaTestFuncs[alphaTestFunc]);
+				//WRITE(p, "clip((roundAndScaleTo255f(v.rgb) %s u_alphacolorref.a)? -1:1);\n", alphaTestFuncs[alphaTestFunc]);
+				WRITE(p, "  if (roundAndScaleTo255f(v.a) %s u_alphacolorref.a) clip(-1);\n", alphaTestFuncs[alphaTestFunc]);
+			}
+		}
+
+		// TODO: Before or after the color test?
+		if (enableColorDoubling && enableAlphaDoubling) {
+			WRITE(p, "  v = v * 2.0;\n");
+		} else if (enableColorDoubling) {
+			WRITE(p, "  v.rgb = v.rgb * 2.0;\n");
+		} else if (enableAlphaDoubling) {
+			WRITE(p, "  v.a = v.a * 2.0;\n");
+		}
+		
+		// --- Color test, again emitted as the inverted (failure) condition. ---
+		if (enableColorTest) {
+			int colorTestFunc = gstate.colortest & 3;
+			const char *colorTestFuncs[] = { "#", "#", " != ", " == " };	// never/always don't make sense
+			if (colorTestFuncs[colorTestFunc][0] != '#') {
+				//WRITE(p, "clip((roundAndScaleTo255v(v.rgb) %s u_alphacolorref.rgb)? -1:1);\n", colorTestFuncs[colorTestFunc]);
+				//WRITE(p, "if (roundAndScaleTo255v(v.rgb) %s u_alphacolorref.rgb)  clip(-1);\n", colorTestFuncs[colorTestFunc]);
+
+				const char * test = colorTestFuncs[colorTestFunc];
+				// The vector comparison is expanded per channel. Two colors differ
+				// when ANY channel differs, so the " != " case must be joined with
+				// "||"; they are equal only when ALL channels match, hence "&&" for
+				// the " == " case.
+				const char * join = (colorTestFunc == GE_COMP_EQUAL) ? "||" : "&&";
+				WRITE(p, "float3 colortest = roundAndScaleTo255v(v.rgb);\n");
+				WRITE(p, "if ((colortest.r %s u_alphacolorref.r) %s (colortest.g %s u_alphacolorref.g) %s (colortest.b %s u_alphacolorref.b ))  clip(-1);\n", test, join, test, join, test);
+				// NOTE(review): u_colormask is declared above but not applied here.
+			}
+		}
+
+		// --- Fog and final output. ---
+		if (enableFog) {
+			WRITE(p, "  float fogCoef = clamp(In.v_fogdepth, 0.0, 1.0);\n");
+			WRITE(p, "  return lerp(float4(u_fogcolor, v.a), v, fogCoef);\n");
+			// WRITE(p, "  v.x = v_depth;\n");
+		} else {
+			WRITE(p, "  return v;\n");
+		}
+	}
+	WRITE(p, "}\n");
+}
+#endif
--- a/GPU/Directx9/FragmentShaderGenerator.h
+++ b/GPU/Directx9/FragmentShaderGenerator.h
@ -0,0 +1,52 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include "Globals.h"
+
+// Key describing one fragment shader variant. It provides strict ordering and
+// equality over the words of `d`, so it can be used as a lookup key.
+struct FragmentShaderID
+{
+	// A fresh ID starts with all bits set (presumably an "invalid" marker that
+	// never matches a computed ID - TODO confirm).
+	FragmentShaderID() { clear(); }
+	void clear() { d[0] = 0xFFFFFFFF; }
+	u32 d[1];
+
+	// Lexicographic comparison, word by word.
+	bool operator < (const FragmentShaderID &other) const
+	{
+		const size_t wordCount = sizeof(d) / sizeof(u32);
+		for (size_t w = 0; w < wordCount; w++)
+		{
+			if (d[w] != other.d[w])
+				return d[w] < other.d[w];
+		}
+		return false;
+	}
+
+	// True only when every word matches.
+	bool operator == (const FragmentShaderID &other) const
+	{
+		const size_t wordCount = sizeof(d) / sizeof(u32);
+		size_t w = 0;
+		while (w < wordCount && d[w] == other.d[w])
+			w++;
+		return w == wordCount;
+	}
+};
+
+
+void ComputeFragmentShaderID(FragmentShaderID *id);
+
+void GenerateFragmentShader(char *buffer);
--- a/GPU/Directx9/Framebuffer.cpp
+++ b/GPU/Directx9/Framebuffer.cpp
@ -0,0 +1,968 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include "math/lin/matrix4x4.h"
+
+#include "Core/Host.h"
+#include "Core/MemMap.h"
+#include "Core/Config.h"
+#include "Core/System.h"
+#include "GPU/ge_constants.h"
+#include "GPU/GPUState.h"
+
+#include "helper/dx_state.h"
+#include "helper/fbo.h"
+
+#include "GPU/Directx9/Framebuffer.h"
+#include "GPU/Directx9/TextureCache.h"
+#include "GPU/Directx9/ShaderManager.h"
+
+// Aggressively delete unused FBO:s to save gpu memory.
+// FBO_OLD_AGE is the age threshold used for that; its use is not visible in this
+// part of the file (presumably counted in frames - TODO confirm).
+enum {
+	FBO_OLD_AGE = 5,
+};
+
+// Compares two addresses while ignoring everything above the low 26 bits.
+static bool MaskedEqual(u32 addr1, u32 addr2) {
+	// The XOR leaves a bit set wherever the two addresses differ.
+	const u32 differing = (addr1 ^ addr2) & 0x3FFFFFF;
+	return differing == 0;
+}
+
+// Packs a 32-bit pixel into 16 bits, keeping the top 5/6/5 bits of the three
+// lowest bytes (R, G, B going by the function name, lowest byte first).
+inline u16 RGBA8888toRGB565(u32 px) {
+	const u32 lo5  = (px >> 3) & 0x001F;
+	const u32 mid6 = (px >> 5) & 0x07E0;
+	const u32 hi5  = (px >> 8) & 0xF800;
+	return lo5 | mid6 | hi5;
+}
+
+// Packs a 32-bit pixel into 16 bits by keeping the top nibble of each byte,
+// lowest byte first.
+inline u16 RGBA8888toRGBA4444(u32 px) {
+	const u32 n0 = (px >> 4) & 0x000F;
+	const u32 n1 = (px >> 8) & 0x00F0;
+	const u32 n2 = (px >> 12) & 0x0F00;
+	const u32 n3 = (px >> 16) & 0xF000;
+	return n0 | n1 | n2 | n3;
+}
+
+// Packs a 32-bit pixel into 16 bits: the top 5 bits of each of the three lowest
+// bytes plus the top bit of the highest byte.
+inline u16 RGBA8888toRGBA5551(u32 px) {
+	const u32 c0 = (px >> 3) & 0x001F;
+	const u32 c1 = (px >> 6) & 0x03E0;
+	const u32 c2 = (px >> 9) & 0x7C00;
+	const u32 top = (px >> 16) & 0x8000;
+	return c0 | c1 | c2 | top;
+}
+
+void ConvertFromRGBA8888(u8 *dst, u8 *src, u32 stride, u32 height, GEBufferFormat format);
+
+// Computes the destination rectangle for showing an origW x origH image inside a
+// frameW x frameH frame.
+//
+// With g_Config.bStretchToDisplay set, the rectangle covers the whole frame.
+// Otherwise the image keeps its aspect ratio, touches the frame on its limiting
+// axis and is centered on the other one.
+//
+// x, y, w, h: outputs; all four are always written.
+void CenterRect(float *x, float *y, float *w, float *h,
+                float origW, float origH, float frameW, float frameH)
+{
+	if (g_Config.bStretchToDisplay)
+	{
+		*x = 0;
+		*y = 0;
+		*w = frameW;
+		*h = frameH;
+		return;
+	}
+
+	float origRatio = origW/origH;
+	float frameRatio = frameW/frameH;
+
+	if (origRatio > frameRatio)
+	{
+		// Image is wider than frame. Center vertically.
+		*x = 0.0f;
+		*w = frameW;
+		*h = frameW / origRatio;
+#ifdef BLACKBERRY
+		// Stretch a little bit
+		if (g_Config.bPartialStretch)
+			*h = (frameH + *h) / 2.0f; // (408 + 720) / 2 = 564
+#endif
+		*y = (frameH - *h) / 2.0f;
+	}
+	else
+	{
+		// Image is taller than frame. Center horizontally.
+		*y = 0.0f;
+		*h = frameH;
+		*w = frameH * origRatio;
+		*x = (frameW - *w) / 2.0f;
+	}
+}
+
+// Zero-initializes the display/render bookkeeping, clears the current render
+// target once, and creates the texture that DrawPixels() uploads into.
+FramebufferManager::FramebufferManager() :
+	ramDisplayFramebufPtr_(0),
+	displayFramebufPtr_(0),
+	displayStride_(0),
+	displayFormat_(GE_FORMAT_565),
+	displayFramebuf_(0),
+	prevDisplayFramebuf_(0),
+	prevPrevDisplayFramebuf_(0),
+	frameLastFramebufUsed(0),
+	currentRenderVfb_(0),
+	drawPixelsTex_(0),
+	drawPixelsTexFormat_(GE_FORMAT_INVALID),
+	convBuf(0)
+{
+	// Leftover from the OpenGL implementation, compiled out.
+#if 0
+	draw2dprogram = glsl_create_source(basic_vs, tex_fs);
+
+	glsl_bind(draw2dprogram);
+	glUniform1i(draw2dprogram->sampler0, 0);
+	glsl_unbind();
+
+#endif
+	// And an initial clear. We don't clear per frame as the games are supposed to handle that
+	// by themselves.
+	// Depth writes and all color channels are enabled first so the clear is not masked.
+	dxstate.depthWrite.set(true);
+	dxstate.colorMask.set(true, true, true, true);
+	pD3Ddevice->Clear(0, NULL, D3DCLEAR_STENCIL|D3DCLEAR_TARGET |D3DCLEAR_ZBUFFER, D3DCOLOR_XRGB(0, 0, 0), 0, 0);
+
+	// 512x272, single mip level, 32-bit ARGB.
+	// NOTE(review): the result of CreateTexture is not checked; on failure
+	// drawPixelsTex_ stays null.
+	pD3Ddevice->CreateTexture(512, 272, 1, 0, D3DFMT(D3DFMT_A8R8G8B8), NULL, &drawPixelsTex_, NULL);
+
+	// Any rendering mode other than FB_NON_BUFFERED_MODE renders through FBOs.
+	useBufferedRendering_ = g_Config.iRenderingMode != FB_NON_BUFFERED_MODE ? 1 : 0;
+}
+
+// Releases the DrawPixels upload texture and frees the conversion buffer.
+FramebufferManager::~FramebufferManager() {
+#if 0
+	if (drawPixelsTex_)
+		glDeleteTextures(1, &drawPixelsTex_);
+	glsl_destroy(draw2dprogram);
+#endif
+	// The texture may never have been created.
+	if (drawPixelsTex_ != NULL)
+		drawPixelsTex_->Release();
+
+	// delete[] on a null pointer is a no-op.
+	delete [] convBuf;
+}
+
+// Expands a 16-bit 4:4:4:4 pixel to 32 bits: each nibble of `c`, lowest first,
+// becomes the high nibble of the corresponding output byte, lowest first.
+static inline void ARGB8From4444(u16 c, u32 * dst) {
+	const int nib0 = c & 0xf;
+	const int nib1 = (c >> 4) & 0xf;
+	const int nib2 = (c >> 8) & 0xf;
+	const int nib3 = c >> 12;
+	*dst = (nib0 << 4) | (nib1 << 12) | (nib2 << 20) | (nib3 << 28);
+}
+// Expands a 16-bit 5:6:5 pixel to 32 bits with alpha forced to 0xFF.
+// Source fields, as packed by RGBA8888toRGB565: bits 0-4, bits 5-10 (6 bits) and
+// bits 11-15. They land in the high bits of output bytes 2, 1 and 0 respectively,
+// the same byte order ARGB8From5551 uses.
+static inline void ARGB8From565(u16 c, u32 * dst) {
+	const u32 field0 = c & 0x001f;          // bits 0-4
+	const u32 field1 = (c >> 5) & 0x003f;   // bits 5-10, six bits wide
+	const u32 field2 = (c >> 11) & 0x001f;  // bits 11-15
+	// The 6-bit field is shifted by 10, not 11, so that it stays inside bits 8-15
+	// and does not spill into the next byte.
+	*dst = (field0 << 19) | (field1 << 10) | (field2 << 3) | 0xFF000000;
+}
+// Expands a 16-bit 5:5:5:1 pixel to 32 bits. The three 5-bit fields go to the
+// high bits of output bytes 2, 1 and 0; the source alpha bit is ignored and the
+// output alpha is always 0xFF.
+static inline void ARGB8From5551(u16 c, u32 * dst) {
+	const int f0 = c & 0x001f;
+	const int f1 = (c >> 5) & 0x001f;
+	const int f2 = (c >> 10) & 0x001f;
+	*dst = (f0 << 19) | (f1 << 11) | (f2 << 3) | 0xFF000000;
+}
+
+// Uploads a 480x272 image in a PSP framebuffer format into drawPixelsTex_ and
+// draws it to the output, positioned by CenterRect().
+//
+// framebuf:    first pixel of the source image.
+// pixelFormat: layout of the source pixels.
+// linesize:    source row stride in pixels.
+//
+// Does nothing when the upload texture is missing or cannot be locked.
+void FramebufferManager::DrawPixels(const u8 *framebuf, GEBufferFormat pixelFormat, int linesize) {
+	// The texture is created in the constructor without an error check.
+	if (!drawPixelsTex_)
+		return;
+
+	D3DLOCKED_RECT rect;
+	// A negative HRESULT means the lock failed and `rect` is not valid.
+	if (drawPixelsTex_->LockRect(0, &rect, NULL, D3DLOCK_NOOVERWRITE) < 0)
+		return;
+
+	// Named differently from the convBuf member on purpose, to avoid shadowing it.
+	u8 *texBits = (u8*)rect.pBits;
+
+	// Final format is ARGB(directx)
+
+	// TODO: We can just change the texture format and flip some bits around instead of this.
+	if (pixelFormat == GE_FORMAT_8888 && linesize == 512 && rect.Pitch == 512 * 4) {
+		// Source and texture rows have the same layout: copy the whole 512x272
+		// texture in one go. The size must match the texture, not exceed it.
+		memcpy(texBits, framebuf, 4 * 512 * 272);
+	} else {
+		for (int y = 0; y < 272; y++) {
+			switch (pixelFormat) {
+			// not tested
+			case GE_FORMAT_565:
+				{
+					const u16 *src = (const u16 *)framebuf + linesize * y;
+					u32 *dst = (u32*)(texBits + rect.Pitch * y);
+					for (int x = 0; x < 480; x++) {
+						u16 col0 = LE_16(src[x+0]);
+						ARGB8From565(col0, &dst[x + 0]);
+					}
+				}
+				break;
+			// faster
+			case GE_FORMAT_5551:
+				{
+					const u16 *src = (const u16 *)framebuf + linesize * y;
+					u32 *dst = (u32*)(texBits + rect.Pitch * y);
+					for (int x = 0; x < 480; x++) {
+						u16 col0 = LE_16(src[x+0]);
+						ARGB8From5551(col0, &dst[x + 0]);
+					}
+				}
+				break;
+			// not tested
+			case GE_FORMAT_4444:
+				{
+					const u16 *src = (const u16 *)framebuf + linesize * y;
+					// Byte pointer: each pixel is written as four separate bytes,
+					// so `x * 4 + k` must index bytes. Indexing 32-bit words here
+					// would run far past the end of the row.
+					u8 *dst = texBits + rect.Pitch * y;
+					for (int x = 0; x < 480; x++)
+					{
+						u16 col = LE_16(src[x]);
+						dst[x * 4 + 0] = (col >> 12) << 4;
+						dst[x * 4 + 1] = ((col >> 8) & 0xf) << 4;
+						dst[x * 4 + 2] = ((col >> 4) & 0xf) << 4;
+						dst[x * 4 + 3] = (col & 0xf) << 4;
+					}
+					// NOTE(review): channel order is kept exactly as written; it
+					// has not been verified against the 565/5551 helpers.
+				}
+				break;
+
+			case GE_FORMAT_8888:
+				{
+					const u8 *src = framebuf + linesize * 4 * y;
+					u8 *dst = texBits + rect.Pitch * y;
+					memcpy(dst, src, 4 * 480);
+				}
+				break;
+			}
+		}
+	}
+
+	drawPixelsTex_->UnlockRect(0);
+	// D3DXSaveTextureToFile("game:\\cc.png", D3DXIFF_PNG, drawPixelsTex_, NULL);
+
+	pD3Ddevice->SetTexture(0, drawPixelsTex_);
+
+	float x, y, w, h;
+	CenterRect(&x, &y, &w, &h, 480.0f, 272.0f, (float)PSP_CoreParameter().pixelWidth, (float)PSP_CoreParameter().pixelHeight);
+	// Only the left 480 of the texture's 512 columns contain image data.
+	DrawActiveTexture(x, y, w, h, false, 480.0f / 512.0f);
+}
+
+// Draws a textured rectangle at (x, y) with size (w, h), in output pixel
+// coordinates, using the framebuffer vertex/pixel shaders.
+//
+// uscale / vscale: portion of the texture to show along U and V.
+// flip:            swaps the two V coordinates.
+//
+// NOTE(review): despite the name, this always binds drawPixelsTex_ to stage 0
+// just before drawing, replacing whatever texture the caller had made active.
+void FramebufferManager::DrawActiveTexture(float x, float y, float w, float h, bool flip, float uscale, float vscale) {
+	float u2 = uscale;
+	// Since we're flipping, 0 is down.  That's where the scale goes.
+	float v1 = flip ? 1.0f : 1.0f - vscale;
+	float v2 = flip ? 1.0f - vscale : 1.0f;
+
+	// Four vertices of five floats each: position x, y, z followed by u, v.
+	const float coord[] = { 
+		x,	 y,	  0,	0,	v1,
+		x+w, y,	  0,	u2, v1,
+		x+w, y+h, 0,	u2, v2,
+		x,	 y+h, 0,	0,	v2
+	}; 
+
+	// Orthographic projection over the output size, with y growing downwards.
+	Matrix4x4 ortho;
+	ortho.setOrtho(0, (float)PSP_CoreParameter().pixelWidth, (float)PSP_CoreParameter().pixelHeight, 0, -1, 1);
+
+	//pD3Ddevice->SetRenderState(D3DRS_FILLMODE, D3DFILL_WIREFRAME);
+	pD3Ddevice->SetRenderState(D3DRS_CULLMODE, D3DCULL_NONE);
+	// The matrix occupies four float4 vertex shader constants starting at c0.
+	pD3Ddevice->SetVertexShaderConstantF(0, ortho.getReadPtr(), 4);
+	
+	pD3Ddevice->SetVertexDeclaration(pFramebufferVertexDecl);
+	pD3Ddevice->SetPixelShader(pFramebufferPixelShader);
+	pD3Ddevice->SetVertexShader(pFramebufferVertexShader);
+	pD3Ddevice->SetTexture(0, drawPixelsTex_);
+	// Two triangles of a fan make up the quad.
+	pD3Ddevice->DrawPrimitiveUP(D3DPT_TRIANGLEFAN, 2, coord, 5 * sizeof(float));
+}
+
+// Looks up the virtual framebuffer that backs the current display address.
+// A candidate must match the display address (masked compare) and display
+// format and be at least 480 pixels wide; among several candidates the most
+// recently used one is chosen. Returns 0 when there is no match.
+VirtualFramebuffer *FramebufferManager::GetDisplayFBO() {
+	VirtualFramebuffer *best = NULL;
+	const size_t count = vfbs_.size();
+	for (size_t idx = 0; idx < count; ++idx) {
+		VirtualFramebuffer *candidate = vfbs_[idx];
+		if (!MaskedEqual(candidate->fb_address, displayFramebufPtr_))
+			continue;
+		// Could check w too but whatever
+		if (candidate->format != displayFormat_ || candidate->width < 480)
+			continue;
+		if (best == NULL || best->last_frame_used < candidate->last_frame_used)
+			best = candidate;
+	}
+	if (best != NULL)
+		return best;
+
+	DEBUG_LOG(HLE, "Finding no FBO matching address %08x", displayFramebufPtr_);
+#if 0  // defined(_DEBUG)
+	std::string debug = "FBOs: ";
+	for (size_t i = 0; i < vfbs_.size(); ++i) {
+		char temp[256];
+		sprintf(temp, "%08x %i %i", vfbs_[i]->fb_address, vfbs_[i]->width, vfbs_[i]->height);
+		debug += std::string(temp);
+	}
+	ERROR_LOG(HLE, "FBOs: %s", debug.c_str());
+#endif
+	return 0;
+}
+
+// Derives the viewport size from the viewport scale registers: each dimension is
+// twice the absolute value of the corresponding scale, truncated to an integer.
+void GetViewportDimensions(int &w, int &h) {
+	const float scaleX = getFloat24(gstate.viewportx1);
+	const float scaleY = getFloat24(gstate.viewporty1);
+	const float fullW = fabsf(scaleX * 2);
+	const float fullH = fabsf(scaleY * 2);
+	w = (int)fullW;
+	h = (int)fullH;
+}
+
+// Heuristics to figure out the size of FBO to create.
+// Heuristic for the size of the FBO to create for the current render target,
+// based on the viewport, the drawing region and the framebuffer stride.
+void GuessDrawingSize(int &drawing_width, int &drawing_height) {
+	const int kDefaultWidth = 480;
+	const int kDefaultHeight = 272;
+
+	const int regionWidth = gstate.getRegionX2() + 1;
+	const int regionHeight = gstate.getRegionY2() + 1;
+	const int stride = gstate.fbwidth & 0x3C0;
+
+	int vpWidth, vpHeight;
+	GetViewportDimensions(vpWidth, vpHeight);
+
+	// A generated FBO shouldn't be greater than 512x512.
+	const bool oversized = vpWidth > 512 && vpHeight > 512;
+	if (oversized) {
+		vpWidth = kDefaultWidth;
+		vpHeight = kDefaultHeight;
+	}
+
+	if (stride >= 512) {
+		// Wide stride: use at least the default screen size.
+		drawing_width = std::max(vpWidth, kDefaultWidth);
+		drawing_height = std::max(vpHeight, kDefaultHeight);
+		return;
+	}
+
+	// Narrow stride: limit the viewport to the drawing region.
+	drawing_width = std::min(vpWidth, regionWidth);
+	drawing_height = std::min(vpHeight, regionHeight);
+}
+
+// Tears down one virtual framebuffer: notifies the texture cache, destroys the
+// FBO if there is one, clears every manager pointer that refers to it, then
+// deletes the object. `v` must not be used afterwards.
+void FramebufferManager::DestroyFramebuf(VirtualFramebuffer *v) {
+	textureCache_->NotifyFramebufferDestroyed(v->fb_address, v);
+
+	if (v->fbo) {
+		fbo_destroy(v->fbo);
+		v->fbo = 0;
+	}
+
+	// Wipe some pointers
+	if (v == currentRenderVfb_) {
+		currentRenderVfb_ = 0;
+	}
+	if (v == displayFramebuf_) {
+		displayFramebuf_ = 0;
+	}
+	if (v == prevDisplayFramebuf_) {
+		prevDisplayFramebuf_ = 0;
+	}
+	if (v == prevPrevDisplayFramebuf_) {
+		prevPrevDisplayFramebuf_ = 0;
+	}
+
+	delete v;
+}
+
+// Makes the framebuffer described by the current GPU state the active render
+// target. Three cases are handled:
+//   1. nothing changed and a target is already current: only refresh its timestamp;
+//   2. no virtual framebuffer matches address and format: create one (plus an
+//      FBO when buffered rendering is on);
+//   3. a match exists but is not current: switch to it.
+// Afterwards gstate_c.curRTWidth/Height are updated and the through-mode
+// projection matrix is marked dirty if the size changed.
+void FramebufferManager::SetRenderFrameBuffer() {
+	// Fast path: same framebuffer as the previous call.
+	if (!gstate_c.framebufChanged && currentRenderVfb_) {
+		currentRenderVfb_->last_frame_used = gpuStats.numFrames;
+		return;
+	}
+	gstate_c.framebufChanged = false;
+
+	// Get parameters
+	// The address combines the low 24 bits from fbptr with the high byte taken
+	// from fbwidth; the stride is in the low bits of fbwidth.
+	u32 fb_address = (gstate.fbptr & 0xFFE000) | ((gstate.fbwidth & 0xFF0000) << 8);
+	int fb_stride = gstate.fbwidth & 0x3C0;
+
+	u32 z_address = (gstate.zbptr & 0xFFE000) | ((gstate.zbwidth & 0xFF0000) << 8);
+	int z_stride = gstate.zbwidth & 0x3C0;
+
+	// Yeah this is not completely right. but it'll do for now.
+	//int drawing_width = ((gstate.region2) & 0x3FF) + 1;
+	//int drawing_height = ((gstate.region2 >> 10) & 0x3FF) + 1;
+
+	// As there are no clear "framebuffer width" and "framebuffer height" registers,
+	// we need to infer the size of the current framebuffer somehow. Let's try the viewport.
+	
+	GEBufferFormat fmt = static_cast<GEBufferFormat>(gstate.framebufpixformat & 3);
+
+	int drawing_width, drawing_height;
+	GuessDrawingSize(drawing_width, drawing_height);
+
+	int buffer_width = drawing_width;
+	int buffer_height = drawing_height;
+
+	// Find a matching framebuffer
+	// The first entry with the same (masked) address and format wins.
+	VirtualFramebuffer *vfb = 0;
+	for (size_t i = 0; i < vfbs_.size(); ++i) {
+		VirtualFramebuffer *v = vfbs_[i];
+		if (MaskedEqual(v->fb_address, fb_address) && v->format == fmt) {
+				// Let's not be so picky for now. Let's say this is the one.
+				vfb = v;
+				// Update fb stride in case it changed
+				vfb->fb_stride = fb_stride;
+			// Only adopt the new drawing size when the existing buffer is big enough.
+			if (v->bufferWidth >= drawing_width && v->bufferHeight >= drawing_height) { 
+				v->width = drawing_width;
+				v->height = drawing_height;
+			}
+			break; 
+		}
+	}
+
+	// Scale between the PSP's 480x272 screen and the configured render resolution.
+	float renderWidthFactor = (float)PSP_CoreParameter().renderWidth / 480.0f;
+	float renderHeightFactor = (float)PSP_CoreParameter().renderHeight / 272.0f;
+
+	// None found? Create one.
+	if (!vfb) {
+		gstate_c.textureChanged = true;
+		vfb = new VirtualFramebuffer();
+		vfb->fbo = 0;
+		vfb->fb_address = fb_address;
+		vfb->fb_stride = fb_stride;
+		vfb->z_address = z_address;
+		vfb->z_stride = z_stride;
+		vfb->width = drawing_width;
+		vfb->height = drawing_height;
+		vfb->renderWidth = (u16)(drawing_width * renderWidthFactor);
+		vfb->renderHeight = (u16)(drawing_height * renderHeightFactor);
+		vfb->bufferWidth = buffer_width;
+		vfb->bufferHeight = buffer_height;
+		vfb->format = fmt;
+		vfb->usageFlags = FB_USAGE_RENDERTARGET;
+		vfb->dirtyAfterDisplay = true;
+
+		// With bTrueColor every FBO is 32-bit; otherwise it follows the PSP format.
+		if (g_Config.bTrueColor) {
+			vfb->colorDepth = FBO_8888;
+		} else { 
+			switch (fmt) {
+				case GE_FORMAT_4444: 
+					vfb->colorDepth = FBO_4444; 
+					break;
+				case GE_FORMAT_5551: 
+					vfb->colorDepth = FBO_5551; 
+					break;
+				case GE_FORMAT_565: 
+					vfb->colorDepth = FBO_565; 
+					break;
+				case GE_FORMAT_8888: 
+					vfb->colorDepth = FBO_8888; 
+					break;
+				default: 
+					vfb->colorDepth = FBO_8888; 
+					break;
+			}
+		}
+			
+		//#ifdef ANDROID
+		//	vfb->colorDepth = FBO_8888;
+		//#endif
+
+		if (useBufferedRendering_) {
+			vfb->fbo = fbo_create(vfb->renderWidth, vfb->renderHeight, 1, true, vfb->colorDepth);
+			if (vfb->fbo) {
+				fbo_bind_as_render_target(vfb->fbo);
+			} else {
+				// Creation failed: the entry is kept without an FBO.
+				ERROR_LOG(HLE, "Error creating FBO! %i x %i", vfb->renderWidth, vfb->renderHeight);
+			}
+		} else {
+			fbo_unbind();
+			// Let's ignore rendering to targets that have not (yet) been displayed.
+			gstate_c.skipDrawReason |= SKIPDRAW_NON_DISPLAYED_FB;
+		}
+
+		textureCache_->NotifyFramebuffer(vfb->fb_address, vfb);
+
+		vfb->last_frame_used = gpuStats.numFrames;
+		frameLastFramebufUsed = gpuStats.numFrames;
+		vfbs_.push_back(vfb);
+
+		// Clear the newly bound target, with depth and all color writes enabled.
+		dxstate.depthWrite.set(true);
+		dxstate.colorMask.set(true, true, true, true);
+		pD3Ddevice->Clear(0, NULL, D3DCLEAR_STENCIL|D3DCLEAR_TARGET |D3DCLEAR_ZBUFFER, D3DCOLOR_XRGB(0, 0, 0), 0, 0);
+
+		currentRenderVfb_ = vfb;
+
+		INFO_LOG(HLE, "Creating FBO for %08x : %i x %i x %i", vfb->fb_address, vfb->width, vfb->height, vfb->format);
+
+	// We already have it!
+	} else if (vfb != currentRenderVfb_) {
+		// Use it as a render target.
+		DEBUG_LOG(HLE, "Switching render target to FBO for %08x: %i x %i x %i ", vfb->fb_address, vfb->width, vfb->height, vfb->format);
+		vfb->usageFlags |= FB_USAGE_RENDERTARGET;
+		gstate_c.textureChanged = true;
+		vfb->last_frame_used = gpuStats.numFrames;
+		frameLastFramebufUsed = gpuStats.numFrames;
+		vfb->dirtyAfterDisplay = true;
+
+		if (useBufferedRendering_) {
+			if (vfb->fbo) {
+				fbo_bind_as_render_target(vfb->fbo);
+			} else {
+				// wtf? This should only happen very briefly when toggling bBufferedRendering
+				fbo_unbind();
+			}
+		} else {
+			if (vfb->fbo) {
+				// wtf? This should only happen very briefly when toggling bBufferedRendering
+				textureCache_->NotifyFramebufferDestroyed(vfb->fb_address, vfb);
+				fbo_destroy(vfb->fbo);
+				vfb->fbo = 0;
+			}
+			fbo_unbind();
+
+			// Let's ignore rendering to targets that have not (yet) been displayed.
+			if (vfb->usageFlags & FB_USAGE_DISPLAYED_FRAMEBUFFER)
+				gstate_c.skipDrawReason &= ~SKIPDRAW_NON_DISPLAYED_FB;
+			else
+				gstate_c.skipDrawReason |= SKIPDRAW_NON_DISPLAYED_FB;
+
+			/*
+			if (drawing_width == 480 && drawing_height == 272) {
+				gstate_c.skipDrawReason &= ~SKIPDRAW_SKIPNONFB;
+				// OK!
+			} else {
+				gstate_c.skipDrawReason |= ~SKIPDRAW_SKIPNONFB;
+			}*/
+		}
+		textureCache_->NotifyFramebuffer(vfb->fb_address, vfb);
+
+#if 1
+		// Some tiled mobile GPUs benefit IMMENSELY from clearing an FBO before rendering
+		// to it. This broke stuff before, so now it only clears on the first use of an
+		// FBO in a frame. This means that some games won't be able to avoid the on-some-GPUs
+		// performance-crushing framebuffer reloads from RAM, but we'll have to live with that.
+		// NOTE(review): last_frame_used was already set to gpuStats.numFrames at the
+		// top of this branch, so this condition is always false and the clear never
+		// runs. Confirm whether the clear is wanted before reordering.
+		if (vfb->last_frame_used != gpuStats.numFrames)	{
+			dxstate.depthWrite.set(true);
+			dxstate.colorMask.set(true, true, true, true);
+			pD3Ddevice->Clear(0, NULL, D3DCLEAR_STENCIL|D3DCLEAR_TARGET |D3DCLEAR_ZBUFFER, D3DCOLOR_XRGB(0, 0, 0), 0, 0);
+		}
+#endif
+		currentRenderVfb_ = vfb;
+	} else {
+		// Same target as before; only the timestamps need refreshing.
+		vfb->last_frame_used = gpuStats.numFrames;
+		frameLastFramebufUsed = gpuStats.numFrames;
+	}
+
+	// ugly...
+	// A change in render target size marks the through-mode projection matrix dirty.
+	if (gstate_c.curRTWidth != vfb->width || gstate_c.curRTHeight != vfb->height) {
+		shaderManager_->DirtyUniform(DIRTY_PROJTHROUGHMATRIX);
+		gstate_c.curRTWidth = vfb->width;
+		gstate_c.curRTHeight = vfb->height;
+	}
+}
+
+// Presents the current display framebuffer on the output surface. When no
+// virtual framebuffer matches the display address, the image is drawn straight
+// from PSP memory, or the output is cleared if no valid address is set either.
+void FramebufferManager::CopyDisplayToOutput() {
+	fbo_unbind();
+	currentRenderVfb_ = 0;
+
+	VirtualFramebuffer *shown = GetDisplayFBO();
+	if (shown == 0) {
+		// The RAM display pointer takes priority over the VRAM one.
+		const bool fromRam = Memory::IsValidAddress(ramDisplayFramebufPtr_);
+		if (fromRam || Memory::IsValidAddress(displayFramebufPtr_)) {
+			// The game is displaying something directly from memory. In GTA, it's decoded video.
+			const u32 directAddr = fromRam ? ramDisplayFramebufPtr_ : displayFramebufPtr_;
+			DrawPixels(Memory::GetPointer(directAddr), displayFormat_, displayStride_);
+		} else {
+			DEBUG_LOG(HLE, "Found no FBO to display! displayFBPtr = %08x", displayFramebufPtr_);
+			// Nothing to show at all: clear to black.
+			dxstate.depthWrite.set(true);
+			dxstate.colorMask.set(true, true, true, true);
+			pD3Ddevice->Clear(0, NULL, D3DCLEAR_STENCIL|D3DCLEAR_TARGET |D3DCLEAR_ZBUFFER, D3DCOLOR_XRGB(0, 0, 0), 0, 0);
+		}
+		return;
+	}
+
+	shown->usageFlags |= FB_USAGE_DISPLAYED_FRAMEBUFFER;
+	shown->dirtyAfterDisplay = false;
+
+	// Shift the history of displayed framebuffers (current -> prev -> prevPrev).
+	if (prevDisplayFramebuf_ != displayFramebuf_)
+		prevPrevDisplayFramebuf_ = prevDisplayFramebuf_;
+	if (displayFramebuf_ != shown)
+		prevDisplayFramebuf_ = displayFramebuf_;
+	displayFramebuf_ = shown;
+
+	if (shown->fbo) {
+		dxstate.viewport.set(0, 0, PSP_CoreParameter().pixelWidth, PSP_CoreParameter().pixelHeight);
+		DEBUG_LOG(HLE, "Displaying FBO %08x", shown->fb_address);
+		dxstate.blend.disable();
+		dxstate.cullMode.set(false, false);
+		dxstate.depthTest.disable();
+		dxstate.scissorTest.disable();
+		dxstate.stencilTest.disable();
+
+		// Resolve
+		//fbo_resolve(shown->fbo);
+
+		fbo_bind_color_as_texture(shown->fbo, 0);
+
+		// Destination rectangle, in output display coordinates.
+		float dstX, dstY, dstW, dstH;
+		CenterRect(&dstX, &dstY, &dstW, &dstH, 480.0f, 272.0f, (float)PSP_CoreParameter().pixelWidth, (float)PSP_CoreParameter().pixelHeight);
+		const float uScale = 480.0f / (float)shown->width;
+		const float vScale = 272.0f / (float)shown->height;
+		DrawActiveTexture(dstX, dstY, dstW, dstH, true, uScale, vScale);
+		pD3Ddevice->SetTexture(0, NULL);
+	}
+
+	if (resized_) {
+		dxstate.depthWrite.set(true);
+		dxstate.colorMask.set(true, true, true, true);
+		pD3Ddevice->Clear(0, NULL, D3DCLEAR_STENCIL|D3DCLEAR_TARGET |D3DCLEAR_ZBUFFER, D3DCOLOR_XRGB(0, 0, 0), 0, 0);
+	}
+}
+// Reads the contents of vfb back into emulated memory: vfb is blitted into a
+// dedicated FBO kept in bvfbs_, which is then packed to the framebuffer's
+// address by PackFramebufferDirectx9_. Does nothing unless buffered rendering
+// is active and vfb is non-null.
+void FramebufferManager::ReadFramebufferToMemory(VirtualFramebuffer *vfb) {
+	// This only works with buffered rendering
+	if (!useBufferedRendering_ || !vfb) {
+		return;
+	}
+
+	// We'll pseudo-blit framebuffers here to get a resized and flipped version of vfb.
+	// For now we'll keep these on the same struct as the ones that can get displayed
+	// (and blatantly copy work already done above while at it).
+	VirtualFramebuffer *nvfb = 0;
+
+	// We maintain a separate vector of framebuffer objects for blitting.
+	for (size_t i = 0; i < bvfbs_.size(); ++i) {
+		VirtualFramebuffer *v = bvfbs_[i];
+		if (MaskedEqual(v->fb_address, vfb->fb_address) && v->format == vfb->format) {
+			if (v->bufferWidth == vfb->bufferWidth && v->bufferHeight == vfb->bufferHeight) {
+				nvfb = v;
+				v->fb_stride = vfb->fb_stride;
+				v->width = vfb->width;
+				v->height = vfb->height;
+				break;
+			}
+		}
+	}
+
+	if (!nvfb) {
+		// Create a new fbo if none was found for the size
+		nvfb = new VirtualFramebuffer();
+		nvfb->fbo = 0;
+		nvfb->fb_address = vfb->fb_address;
+		nvfb->fb_stride = vfb->fb_stride;
+		nvfb->z_address = vfb->z_address;
+		nvfb->z_stride = vfb->z_stride;
+		nvfb->width = vfb->width;
+		nvfb->height = vfb->height;
+		nvfb->renderWidth = vfb->width;
+		nvfb->renderHeight = vfb->height;
+		nvfb->bufferWidth = vfb->bufferWidth;
+		nvfb->bufferHeight = vfb->bufferHeight;
+		nvfb->format = vfb->format;
+		nvfb->usageFlags = FB_USAGE_RENDERTARGET;
+		nvfb->dirtyAfterDisplay = true;
+
+		if (g_Config.bTrueColor) {
+			nvfb->colorDepth = FBO_8888;
+		} else {
+			switch (vfb->format) {
+				case GE_FORMAT_4444:
+					nvfb->colorDepth = FBO_4444;
+					break;
+				case GE_FORMAT_5551:
+					nvfb->colorDepth = FBO_5551;
+					break;
+				case GE_FORMAT_565:
+					nvfb->colorDepth = FBO_565;
+					break;
+				case GE_FORMAT_8888:
+				default:
+					nvfb->colorDepth = FBO_8888;
+					break;
+			}
+		}
+
+		nvfb->fbo = fbo_create(nvfb->width, nvfb->height, 1, true, nvfb->colorDepth);
+		if (!(nvfb->fbo)) {
+			ERROR_LOG(HLE, "Error creating FBO! %i x %i", nvfb->renderWidth, nvfb->renderHeight);
+			fbo_unbind();
+			// nvfb has not been stored in bvfbs_ yet, so it must be freed here
+			// or it would leak on every failed attempt.
+			delete nvfb;
+			return;
+		}
+		fbo_bind_as_render_target(nvfb->fbo);
+
+		nvfb->last_frame_used = gpuStats.numFrames;
+		bvfbs_.push_back(nvfb);
+
+		dxstate.depthWrite.set(true);
+		dxstate.colorMask.set(true, true, true, true);
+		pD3Ddevice->Clear(0, NULL, D3DCLEAR_STENCIL|D3DCLEAR_TARGET |D3DCLEAR_ZBUFFER, D3DCOLOR_XRGB(0, 0, 0), 0, 0);
+	} else {
+		nvfb->usageFlags |= FB_USAGE_RENDERTARGET;
+		nvfb->last_frame_used = gpuStats.numFrames;
+		nvfb->dirtyAfterDisplay = true;
+
+		if (!nvfb->fbo) {
+			fbo_unbind();
+			return;
+		}
+		fbo_bind_as_render_target(nvfb->fbo);
+#if 1
+		// Some tiled mobile GPUs benefit IMMENSELY from clearing an FBO before rendering
+		// to it. This broke stuff before, so now it only clears on the first use of an
+		// FBO in a frame. This means that some games won't be able to avoid the on-some-GPUs
+		// performance-crushing framebuffer reloads from RAM, but we'll have to live with that.
+		// NOTE(review): last_frame_used was just set to numFrames above, so this
+		// condition can never be true as written. Kept as-is to preserve behavior.
+		if (nvfb->last_frame_used != gpuStats.numFrames)	{
+			dxstate.depthWrite.set(true);
+			dxstate.colorMask.set(true, true, true, true);
+			pD3Ddevice->Clear(0, NULL, D3DCLEAR_STENCIL|D3DCLEAR_TARGET |D3DCLEAR_ZBUFFER, D3DCOLOR_XRGB(0, 0, 0), 0, 0);
+		}
+#endif
+	}
+
+	BlitFramebuffer_(vfb, nvfb, false);
+
+	PackFramebufferDirectx9_(nvfb);
+}
+
+// Draws the color buffer of src into dst's FBO as a textured quad, then
+// unbinds the FBO. flip/upscale/vscale are forwarded to DrawActiveTexture.
+// Does nothing unless buffered rendering is active and both framebuffers
+// have an FBO.
+void FramebufferManager::BlitFramebuffer_(VirtualFramebuffer *src, VirtualFramebuffer *dst, bool flip, float upscale, float vscale) {
+	// This only works with buffered rendering
+	if (!useBufferedRendering_ || !src || !src->fbo) {
+		return;
+	}
+	// Binding a missing target would dereference a null FBO.
+	if (!dst || !dst->fbo) {
+		ERROR_LOG(HLE, "Missing target framebuffer, aborting blit");
+		return;
+	}
+
+	fbo_bind_as_render_target(dst->fbo);
+
+	/*
+	if(glCheckFramebufferStatus(GL_DRAW_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+		ERROR_LOG(HLE, "Incomplete target framebuffer, aborting blit");
+		fbo_unbind();
+		return;
+	}
+	*/
+
+	// Plain copy: no blending, culling, depth, scissor or stencil.
+	dxstate.viewport.set(0, 0, dst->width, dst->height);
+	dxstate.depthTest.disable();
+	dxstate.blend.disable();
+	dxstate.cullMode.set(0, 0);
+	dxstate.scissorTest.disable();
+	dxstate.stencilTest.disable();
+
+	fbo_bind_color_as_texture(src->fbo, 0);
+
+	float x, y, w, h;
+	CenterRect(&x, &y, &w, &h, 480.0f, 272.0f, (float)PSP_CoreParameter().pixelWidth, (float)PSP_CoreParameter().pixelHeight);
+
+	DrawActiveTexture(x, y, w, h, flip, upscale, vscale);
+
+	pD3Ddevice->SetTexture(0, NULL);
+	fbo_unbind();
+}
+
+// TODO: SSE/NEON
+// Converts stride * height 32-bit RGBA8888 pixels from src into `format` at dst.
+// For GE_FORMAT_8888 this is a plain copy (skipped when src == dst); for the
+// 16-bit formats each pixel goes through the matching RGBA8888to* helper.
+// Unknown formats are ignored.
+void ConvertFromRGBA8888(u8 *dst, u8 *src, u32 stride, u32 height, GEBufferFormat format) {
+	// Kept unsigned so the u32 product is never reinterpreted as a negative int.
+	const u32 size = height * stride;
+
+	if (format == GE_FORMAT_8888) {
+		if (src != dst) {
+			// memmove instead of memcpy: nothing guarantees the buffers don't overlap.
+			memmove(dst, src, (size_t)size * 4);
+		}
+		return;
+	}
+
+	// In-place conversion is fine here: each 16-bit write lands at or before
+	// the 32-bit pixel that is read next.
+	const u32 *src32 = (const u32 *)src;
+	u16 *dst16 = (u16 *)dst;
+	switch (format) {
+		case GE_FORMAT_565: // BGR 565
+			for (u32 i = 0; i < size; i++) {
+				dst16[i] = RGBA8888toRGB565(src32[i]);
+			}
+			break;
+		case GE_FORMAT_5551: // ABGR 1555
+			for (u32 i = 0; i < size; i++) {
+				dst16[i] = RGBA8888toRGBA5551(src32[i]);
+			}
+			break;
+		case GE_FORMAT_4444: // ABGR 4444
+			for (u32 i = 0; i < size; i++) {
+				dst16[i] = RGBA8888toRGBA4444(src32[i]);
+			}
+			break;
+		case GE_FORMAT_8888: // Handled above.
+		default:
+			break;
+	}
+}
+
+#include <xgraphics.h>
+
+// Copies the contents of vfb's render target into `data`.
+// Only implemented for _XBOX builds; elsewhere the body is empty and `data`
+// is left untouched.
+static void Resolve(u8* data, VirtualFramebuffer *vfb) {
+#ifdef _XBOX
+	D3DTexture * rtt = (D3DTexture*)fbo_get_rtt(vfb->fbo);
+	// Resolve render target 0 into the texture returned by fbo_get_rtt.
+	pD3Ddevice->Resolve(D3DRESOLVE_RENDERTARGET0, NULL, rtt, NULL, 0, 0, NULL, 0.f, 0, NULL);
+
+	// Lock and immediately unlock just to obtain the pitch and base pointer;
+	// p.pBits is still read below after the unlock.
+	D3DLOCKED_RECT p;
+	rtt->LockRect(0, &p, NULL, 0);
+	rtt->UnlockRect(0);
+
+	// vfb->fbo->tex is tiled, so untile it while copying into data.
+	XGUntileTextureLevel(vfb->width, vfb->height, 0, D3DFMT_LIN_A8R8G8B8, XGTILE_NONPACKED, data, p.Pitch, NULL, p.pBits, NULL);
+#endif
+}
+
+// Reads vfb's FBO back as RGBA8888 and stores it at the framebuffer's address
+// in emulated memory, converting to vfb->format when that is a 16-bit format.
+// Does nothing (besides unbinding) when buffered rendering is off, vfb has no
+// FBO, or the destination address is not valid.
+void FramebufferManager::PackFramebufferDirectx9_(VirtualFramebuffer *vfb) {
+	if (!useBufferedRendering_ || !vfb->fbo) {
+		fbo_unbind();
+		return;
+	}
+	fbo_bind_for_read(vfb->fbo);
+
+	const u32 fb_address = (0x44000000) | vfb->fb_address;
+	// Never write through an address that does not map to emulated memory.
+	if (!Memory::IsValidAddress(fb_address)) {
+		ERROR_LOG(HLE, "Invalid framebuffer address for readback: %08x", fb_address);
+		fbo_unbind();
+		return;
+	}
+
+	// Pixel size always 4 here because we always request RGBA8888
+	const size_t bufSize = (size_t)vfb->fb_stride * vfb->height * 4;
+	const bool needsConversion = vfb->format != GE_FORMAT_8888;
+
+	u8 *packed = 0;
+	if (needsConversion) {
+		// End result may be 16-bit but we are reading 32-bit, so there may not be enough space at fb_address
+		packed = (u8 *)malloc(bufSize * sizeof(u8));
+	} else {
+		packed = (u8 *)Memory::GetPointer(fb_address);
+	}
+
+	if (packed) {
+		DEBUG_LOG(HLE, "Reading framebuffer to mem, bufSize = %u, packed = %p, fb_address = %08x", 
+			(u32)bufSize, packed, fb_address);
+
+		Resolve(packed, vfb);
+
+		if (needsConversion) { // If not RGBA 8888 we need to convert
+			ConvertFromRGBA8888(Memory::GetPointer(fb_address), packed, vfb->fb_stride, vfb->height, vfb->format);
+			free(packed);
+		}
+	}
+
+	fbo_unbind();
+}
+// After a pending resize, throws away all FBOs and resets the viewport to the
+// full output size. Otherwise does nothing.
+void FramebufferManager::EndFrame() {
+	if (!resized_)
+		return;
+
+	DestroyAllFBOs();
+	dxstate.viewport.set(0, 0, PSP_CoreParameter().pixelWidth, PSP_CoreParameter().pixelHeight);
+	resized_ = false;
+}
+
+// Drops every FBO and cancels any pending resize handling.
+void FramebufferManager::DeviceLost() {
+	resized_ = false;
+	DestroyAllFBOs();
+}
+
+// Per-frame setup: ages out old FBOs, forgets the current render target and
+// re-reads the rendering mode from the configuration.
+void FramebufferManager::BeginFrame() {
+	DecimateFBOs();
+	currentRenderVfb_ = 0;
+	// Every mode except the non-buffered one renders through FBOs.
+	useBufferedRendering_ = (g_Config.iRenderingMode != FB_NON_BUFFERED_MODE);
+}
+
+// Records the address, stride and format of the framebuffer the game wants
+// displayed. Addresses without the 0x04000000 bit are tracked separately as a
+// RAM display pointer and leave displayFramebufPtr_ untouched.
+void FramebufferManager::SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat format) {
+	const bool inVram = (framebuf & 0x04000000) != 0;
+
+	if (inVram) {
+		ramDisplayFramebufPtr_ = 0;
+		displayFramebufPtr_ = framebuf;
+	} else {
+		DEBUG_LOG(HLE, "Non-VRAM display framebuffer address set: %08x", framebuf);
+		ramDisplayFramebufPtr_ = framebuf;
+	}
+
+	// Stride and format are stored the same way in both cases.
+	displayStride_ = stride;
+	displayFormat_ = format;
+}
+
+// Returns a snapshot describing every tracked render framebuffer (vfbs_),
+// in the same order as they are stored.
+std::vector<FramebufferInfo> FramebufferManager::GetFramebufferList() {
+	std::vector<FramebufferInfo> result;
+
+	for (std::vector<VirtualFramebuffer *>::const_iterator it = vfbs_.begin(); it != vfbs_.end(); ++it) {
+		const VirtualFramebuffer *src = *it;
+
+		FramebufferInfo entry;
+		entry.fb_address = src->fb_address;
+		entry.z_address = src->z_address;
+		entry.format = src->format;
+		entry.width = src->width;
+		entry.height = src->height;
+		entry.fbo = src->fbo;
+		result.push_back(entry);
+	}
+
+	return result;
+}
+
+// Destroys framebuffers that have not been used for more than FBO_OLD_AGE
+// frames. In the read-back rendering modes it also periodically commits
+// recently used framebuffers to emulated memory. The three most recently
+// displayed framebuffers are never destroyed here.
+void FramebufferManager::DecimateFBOs() {
+	fbo_unbind();
+	currentRenderVfb_ = 0;
+
+	// Read back every iFrameSkip-th frame, or every 3rd when frameskip is off or set to 9.
+	int readbackInterval = 3;
+	if (g_Config.iFrameSkip > 0 && g_Config.iFrameSkip != 9)
+		readbackInterval = g_Config.iFrameSkip;
+	const bool readbackThisFrame = (gpuStats.numFrames % readbackInterval == 0);
+	const bool readbackEnabled = g_Config.iRenderingMode != FB_BUFFERED_MODE;
+
+	size_t idx = 0;
+	while (idx < vfbs_.size()) {
+		VirtualFramebuffer *cur = vfbs_[idx];
+		const int age = frameLastFramebufUsed - cur->last_frame_used;
+
+		// Commit framebuffers to memory
+		if (readbackEnabled && readbackThisFrame && age <= FBO_OLD_AGE)
+			ReadFramebufferToMemory(cur);
+
+		const bool recentlyDisplayed =
+			cur == displayFramebuf_ || cur == prevDisplayFramebuf_ || cur == prevPrevDisplayFramebuf_;
+
+		if (!recentlyDisplayed && age > FBO_OLD_AGE) {
+			INFO_LOG(HLE, "Decimating FBO for %08x (%i x %i x %i), age %i", cur->fb_address, cur->width, cur->height, cur->format, age);
+			DestroyFramebuf(cur);
+			// Erasing moves the next element into idx, so don't advance.
+			vfbs_.erase(vfbs_.begin() + idx);
+			continue;
+		}
+		++idx;
+	}
+
+	// Do the same for ReadFramebuffersToMemory's VFBs
+	idx = 0;
+	while (idx < bvfbs_.size()) {
+		VirtualFramebuffer *cur = bvfbs_[idx];
+		const int age = frameLastFramebufUsed - cur->last_frame_used;
+		if (age > FBO_OLD_AGE) {
+			INFO_LOG(HLE, "Decimating FBO for %08x (%i x %i x %i), age %i", cur->fb_address, cur->width, cur->height, cur->format, age);
+			DestroyFramebuf(cur);
+			bvfbs_.erase(bvfbs_.begin() + idx);
+			continue;
+		}
+		++idx;
+	}
+}
+
+// Destroys every framebuffer this manager tracks: the render framebuffers in
+// vfbs_ and the blit/read-back framebuffers in bvfbs_. Also clears all
+// pointers to the current and previously displayed framebuffers.
+void FramebufferManager::DestroyAllFBOs() {
+	fbo_unbind();
+	currentRenderVfb_ = 0;
+	displayFramebuf_ = 0;
+	prevDisplayFramebuf_ = 0;
+	prevPrevDisplayFramebuf_ = 0;
+
+	for (size_t i = 0; i < vfbs_.size(); ++i) {
+		VirtualFramebuffer *vfb = vfbs_[i];
+		INFO_LOG(HLE, "Destroying FBO for %08x : %i x %i x %i", vfb->fb_address, vfb->width, vfb->height, vfb->format);
+		DestroyFramebuf(vfb);
+	}
+	vfbs_.clear();
+
+	// The blit FBOs hold device resources too; without this they would survive
+	// a resize or device loss. DecimateFBOs releases them the same way.
+	for (size_t i = 0; i < bvfbs_.size(); ++i) {
+		VirtualFramebuffer *vfb = bvfbs_[i];
+		INFO_LOG(HLE, "Destroying FBO for %08x : %i x %i x %i", vfb->fb_address, vfb->width, vfb->height, vfb->format);
+		DestroyFramebuf(vfb);
+	}
+	bvfbs_.clear();
+}
+
+// Called when emulated memory at addr has been written. If addr is the
+// current or previous display framebuffer, the matching framebuffers are
+// refreshed from memory (buffered rendering) or destroyed (non-buffered).
+// `size` is currently unused.
+void FramebufferManager::UpdateFromMemory(u32 addr, int size) {
+	addr &= ~0x40000000;
+	// TODO: Could go through all FBOs, but probably not important?
+	// TODO: Could also check for inner changes, but video is most important.
+	if (addr != DisplayFramebufAddr() && addr != PrevDisplayFramebufAddr())
+		return;
+
+	// TODO: Deleting the FBO is a heavy hammer solution, so let's only do it if it'd help.
+	if (!Memory::IsValidAddress(displayFramebufPtr_))
+		return;
+
+	fbo_unbind();
+	currentRenderVfb_ = 0;
+
+	bool needUnbind = false;
+	size_t i = 0;
+	while (i < vfbs_.size()) {
+		VirtualFramebuffer *vfb = vfbs_[i];
+		if (!MaskedEqual(vfb->fb_address, addr)) {
+			++i;
+			continue;
+		}
+
+		vfb->dirtyAfterDisplay = true;
+		// TODO: This without the fbo_unbind() above would be better than destroying the FBO.
+		// However, it doesn't seem to work for Star Ocean, at least
+		if (useBufferedRendering_) {
+			// The FBO can be missing briefly when buffered rendering is toggled;
+			// binding a null FBO must be avoided.
+			if (vfb->fbo) {
+				fbo_bind_as_render_target(vfb->fbo);
+				needUnbind = true;
+				DrawPixels(Memory::GetPointer(addr), vfb->format, vfb->fb_stride);
+			}
+			++i;
+		} else {
+			INFO_LOG(HLE, "Invalidating FBO for %08x (%i x %i x %i)", vfb->fb_address, vfb->width, vfb->height, vfb->format);
+			DestroyFramebuf(vfb);
+			// Erasing moves the next element into i, so don't advance.
+			vfbs_.erase(vfbs_.begin() + i);
+		}
+	}
+
+	if (needUnbind)
+		fbo_unbind();
+}
+
+// Flags that the output was resized; the flag is consumed by EndFrame (which
+// destroys all FBOs) and by CopyDisplayToOutput (which clears the output).
+void FramebufferManager::Resized() {
+	resized_ = true;
+}
--- a/GPU/Directx9/Framebuffer.h
+++ b/GPU/Directx9/Framebuffer.h
@ -0,0 +1,164 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include <list>
+
+#include "helper/fbo.h"
+// Keeps track of allocated FBOs.
+// Also provides facilities for drawing and later converting raw
+// pixel data.
+
+
+#include "../Globals.h"
+#include "GPU/GPUCommon.h"
+
+struct GLSLProgram;
+class TextureCache;
+
+// Bit flags stored in VirtualFramebuffer::usageFlags.
+enum {
+	FB_USAGE_DISPLAYED_FRAMEBUFFER = 1,  // Set once the framebuffer has been shown by CopyDisplayToOutput.
+	FB_USAGE_RENDERTARGET = 2,  // Set when the framebuffer is used as a render target.
+	FB_USAGE_TEXTURE = 4,
+};
+
+// Values compared against g_Config.iRenderingMode.
+enum {	
+	FB_NON_BUFFERED_MODE = 0,  // The only mode in which buffered (FBO) rendering is disabled.
+	FB_BUFFERED_MODE = 1,  // Buffered rendering without periodic read-back to memory.
+	FB_READFBOMEMORY_CPU = 2,
+	FB_READFBOMEMORY_GPU = 3,
+};
+
+// Book-keeping for one emulated framebuffer and the FBO (if any) backing it.
+struct VirtualFramebuffer {
+	// Value of gpuStats.numFrames when this framebuffer was last used; drives decimation.
+	int last_frame_used;
+
+	// Color and depth buffer addresses and strides.
+	u32 fb_address;
+	u32 z_address;
+	int fb_stride;
+	int z_stride;
+
+	// There's also a top left of the drawing region, but meh...
+
+	// width/height: The detected size of the current framebuffer.
+	u16 width;
+	u16 height;
+	// renderWidth/renderHeight: The actual size we render at. May be scaled to render at higher resolutions.
+	u16 renderWidth;
+	u16 renderHeight;
+	// bufferWidth/bufferHeight: The actual (but non scaled) size of the buffer we render to. May only be bigger than width/height.
+	u16 bufferWidth;
+	u16 bufferHeight;
+
+	// Combination of FB_USAGE_* bits.
+	u16 usageFlags;
+
+	GEBufferFormat format;  // virtual, right now they are all RGBA8888
+	FBOColorDepth colorDepth;
+	// Backing FBO; may be 0 (e.g. when not using buffered rendering or creation failed).
+	FBO *fbo;
+
+	// Set when the framebuffer is rendered to or updated from memory; cleared when it is displayed.
+	bool dirtyAfterDisplay;
+};
+
+// Computes an output rectangle (x, y, w, h) for content of size origW x origH
+// inside a frame of size frameW x frameH.
+// NOTE(review): presumably centers and preserves aspect ratio, as the name
+// suggests - the definition is not in this header; confirm there.
+void CenterRect(float *x, float *y, float *w, float *h,
+								float origW, float origH, float frameW, float frameH);
+
+class ShaderManager;
+
+// Tracks the virtual framebuffers the emulated GPU renders to and displays,
+// owns their FBOs, and handles presenting them and reading them back into
+// emulated memory.
+class FramebufferManager {
+public:
+	FramebufferManager();
+	~FramebufferManager();
+
+	// Collaborators are injected after construction; neither is owned here as far as this header shows.
+	void SetTextureCache(TextureCache *tc) {
+		textureCache_ = tc;
+	}
+	void SetShaderManager(ShaderManager *sm) {
+		shaderManager_ = sm;
+	}
+
+	// Draws raw pixel data of the given format and line size.
+	void DrawPixels(const u8 *framebuf, GEBufferFormat pixelFormat, int linesize);
+	// Draws the currently bound texture into the rectangle (x, y, w, h).
+	void DrawActiveTexture(float x, float y, float w, float h, bool flip = false, float uscale = 1.0f, float vscale = 1.0f);
+
+	void DestroyAllFBOs();
+	// Destroys FBOs that have not been used for a while.
+	void DecimateFBOs();
+
+	void BeginFrame();
+	void EndFrame();
+	// Flags a pending output resize; handled in EndFrame / CopyDisplayToOutput.
+	void Resized();
+	void DeviceLost();
+	// Presents the current display framebuffer on the output.
+	void CopyDisplayToOutput();
+	void SetRenderFrameBuffer();  // Uses parameters computed from gstate
+	// Notification that emulated memory at addr changed.
+	void UpdateFromMemory(u32 addr, int size);
+
+	// Copies vfb's contents back into emulated memory (buffered rendering only).
+	void ReadFramebufferToMemory(VirtualFramebuffer *vfb);
+
+	// TODO: Break out into some form of FBO manager
+	VirtualFramebuffer *GetDisplayFBO();
+	void SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat format);
+	size_t NumVFBs() const { return vfbs_.size(); }
+
+	std::vector<FramebufferInfo> GetFramebufferList();
+
+	// Size queries fall back to 480x272 when no render target is current.
+	int GetRenderWidth() const { return currentRenderVfb_ ? currentRenderVfb_->renderWidth : 480; }
+	int GetRenderHeight() const { return currentRenderVfb_ ? currentRenderVfb_->renderHeight : 272; }
+	int GetTargetWidth() const { return currentRenderVfb_ ? currentRenderVfb_->width : 480; }
+	int GetTargetHeight() const { return currentRenderVfb_ ? currentRenderVfb_->height : 272; }
+
+	// Addresses of the (previously) displayed framebuffer with the 0x04000000 bit set, or 0 if none.
+	u32 PrevDisplayFramebufAddr() {
+		return prevDisplayFramebuf_ ? (0x04000000 | prevDisplayFramebuf_->fb_address) : 0;
+	}
+	u32 DisplayFramebufAddr() {
+		return displayFramebuf_ ? (0x04000000 | displayFramebuf_->fb_address) : 0;
+	}
+
+	void DestroyFramebuf(VirtualFramebuffer *vfb);
+
+private:
+	u32 ramDisplayFramebufPtr_;  // workaround for MotoGP insanity
+	u32 displayFramebufPtr_;
+	u32 displayStride_;
+	GEBufferFormat displayFormat_;
+
+	// The framebuffer currently displayed and the two displayed before it.
+	VirtualFramebuffer *displayFramebuf_;
+	VirtualFramebuffer *prevDisplayFramebuf_;
+	VirtualFramebuffer *prevPrevDisplayFramebuf_;
+	// gpuStats.numFrames at the time any framebuffer was last used.
+	int frameLastFramebufUsed;
+
+	// Render framebuffers.
+	std::vector<VirtualFramebuffer *> vfbs_;
+
+	VirtualFramebuffer *currentRenderVfb_;
+
+	// Used by ReadFramebufferToMemory
+	void BlitFramebuffer_(VirtualFramebuffer *src, VirtualFramebuffer *dst, bool flip = false, float upscale = 1.0f, float vscale = 1.0f);
+	void PackFramebufferDirectx9_(VirtualFramebuffer *vfb);
+	int gpuVendor;
+	std::vector<VirtualFramebuffer *> bvfbs_; // blitting FBOs
+	
+	// Used by DrawPixels
+	LPDIRECT3DTEXTURE9 drawPixelsTex_;
+	GEBufferFormat drawPixelsTexFormat_;
+
+	u8 *convBuf;
+	GLSLProgram *draw2dprogram;
+
+
+	TextureCache *textureCache_;
+	ShaderManager *shaderManager_;
+
+	// Set by Resized(), cleared by EndFrame()/DeviceLost().
+	bool resized_;
+	// Refreshed from g_Config.iRenderingMode in BeginFrame().
+	bool useBufferedRendering_;
+};
--- a/GPU/Directx9/IndexGenerator.cpp
+++ b/GPU/Directx9/IndexGenerator.cpp
@ -0,0 +1,359 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include "IndexGenerator.h"
+
+#include "Common/Common.h"
+
+// Maps each GE primitive type (used as the array index) to the primitive type
+// it has once expressed as an index list: strips and fans collapse into plain
+// lines / triangles.
+// Points don't need indexing...
+static const u8 indexedPrimitiveType[7] = {
+	GE_PRIM_POINTS,
+	GE_PRIM_LINES,
+	GE_PRIM_LINES,
+	GE_PRIM_TRIANGLES,
+	GE_PRIM_TRIANGLES,
+	GE_PRIM_TRIANGLES,
+	GE_PRIM_RECTANGLES,
+};
+
+// Returns the generator to its empty state and rewinds the write pointer to
+// the start of the index buffer given to Setup().
+void IndexGenerator::Reset() {
+	inds_ = indsBase_;
+	index_ = 0;
+	count_ = 0;
+	pureCount_ = 0;
+	seenPrims_ = 0;
+	prim_ = -1;
+}
+
+// True if the two primitive types end up as the same indexed primitive type.
+// prim1 == -1 (no primitive yet) is compatible with anything.
+bool IndexGenerator::PrimCompatible(int prim1, int prim2) {
+	return prim1 == -1 || indexedPrimitiveType[prim1] == indexedPrimitiveType[prim2];
+}
+
+// True if prim can be appended to what has been generated so far, i.e. nothing
+// was added yet or prim maps to the current indexed primitive type.
+bool IndexGenerator::PrimCompatible(int prim) {
+	return prim_ == -1 || indexedPrimitiveType[prim] == prim_;
+}
+
+// Sets the buffer that generated indices are written to and resets all state.
+void IndexGenerator::Setup(u16 *inds) {
+	indsBase_ = inds;
+	Reset();
+}
+
+// Appends indices for vertexCount non-indexed vertices of the given primitive
+// type. Unknown primitive types are ignored.
+void IndexGenerator::AddPrim(int prim, int vertexCount) {
+	if (prim == GE_PRIM_POINTS) {
+		AddPoints(vertexCount);
+	} else if (prim == GE_PRIM_LINES) {
+		AddLineList(vertexCount);
+	} else if (prim == GE_PRIM_LINE_STRIP) {
+		AddLineStrip(vertexCount);
+	} else if (prim == GE_PRIM_TRIANGLES) {
+		AddList(vertexCount);
+	} else if (prim == GE_PRIM_TRIANGLE_STRIP) {
+		AddStrip(vertexCount);
+	} else if (prim == GE_PRIM_TRIANGLE_FAN) {
+		AddFan(vertexCount);
+	} else if (prim == GE_PRIM_RECTANGLES) {
+		AddRectangles(vertexCount);  // Same
+	}
+}
+
+// Emits one index per vertex for a point list.
+void IndexGenerator::AddPoints(int numVerts) {
+	int next = index_;
+	for (int left = numVerts; left > 0; --left)
+		*inds_++ = next++;
+	// ignore overflow verts
+	seenPrims_ |= 1 << GE_PRIM_POINTS;
+	prim_ = GE_PRIM_POINTS;
+	count_ += numVerts;
+	index_ += numVerts;
+}
+
+// Emits indices for a triangle list. Vertices left over after the last
+// complete triangle get no indices but still advance the vertex position.
+void IndexGenerator::AddList(int numVerts) {
+	const int triCount = numVerts / 3;
+	int v = index_;
+	for (int t = 0; t < triCount; ++t, v += 3) {
+		*inds_++ = v;
+		*inds_++ = v + 1;
+		*inds_++ = v + 2;
+	}
+
+	// ignore overflow verts
+	seenPrims_ |= 1 << GE_PRIM_TRIANGLES;
+	prim_ = GE_PRIM_TRIANGLES;
+	count_ += triCount * 3;
+	index_ += numVerts;
+}
+
+// Expands a triangle strip of numVerts vertices into triangle-list indices,
+// alternating the winding so all triangles face the same way.
+// A strip that is the very first primitive added is remembered as "pure"
+// (prim_ stays GE_PRIM_TRIANGLE_STRIP, pureCount_ = numVerts).
+void IndexGenerator::AddStrip(int numVerts) {
+	bool wind = false;
+	// Fewer than 3 vertices form no triangle; clamp so count_ is never decreased.
+	const int numTris = std::max(numVerts - 2, 0);
+	for (int i = 0; i < numTris; i++) {
+		*inds_++ = index_ + i;
+		*inds_++ = index_ + i+(wind?2:1);
+		*inds_++ = index_ + i+(wind?1:2);
+		wind = !wind;
+	}
+	index_ += numVerts;
+	count_ += numTris * 3;
+	// This is so we can detect one single strip by just looking at seenPrims_.
+	if (!seenPrims_) {
+		seenPrims_ = 1 << GE_PRIM_TRIANGLE_STRIP;
+		prim_ = GE_PRIM_TRIANGLE_STRIP;
+		pureCount_ = numVerts;
+	} else {
+		seenPrims_ |= 1 << GE_PRIM_TRIANGLE_STRIP;
+		seenPrims_ |= 1 << GE_PRIM_TRIANGLES;
+		prim_ = GE_PRIM_TRIANGLES;
+		pureCount_ = 0;
+	}
+}
+
+// Expands a triangle fan of numVerts vertices into triangle-list indices;
+// every triangle shares the fan's first vertex.
+void IndexGenerator::AddFan(int numVerts) {
+	// Fewer than 3 vertices form no triangle; clamp so count_ is never decreased.
+	const int numTris = std::max(numVerts - 2, 0);
+	for (int i = 0; i < numTris; i++) {
+		*inds_++ = index_;
+		*inds_++ = index_ + i + 1;
+		*inds_++ = index_ + i + 2;
+	}
+	index_ += numVerts;
+	count_ += numTris * 3;
+	prim_ = GE_PRIM_TRIANGLES;
+	seenPrims_ |= 1 << GE_PRIM_TRIANGLE_FAN;
+}
+
+//Lines
+// Emits indices for a line list; an odd trailing vertex gets no indices.
+void IndexGenerator::AddLineList(int numVerts) {
+	const int lineCount = numVerts / 2;
+	int v = index_;
+	for (int l = 0; l < lineCount; ++l, v += 2) {
+		*inds_++ = v;
+		*inds_++ = v + 1;
+	}
+	prim_ = GE_PRIM_LINES;
+	seenPrims_ |= 1 << prim_;
+	count_ += lineCount * 2;
+	index_ += numVerts;
+}
+
+// Expands a line strip of numVerts vertices into line-list indices.
+void IndexGenerator::AddLineStrip(int numVerts) {
+	// Fewer than 2 vertices form no line; clamp so count_ is never decreased.
+	const int numLines = std::max(numVerts - 1, 0);
+	for (int i = 0; i < numLines; i++) {
+		*inds_++ = index_ + i;
+		*inds_++ = index_ + i + 1;
+	}
+	index_ += numVerts;
+	count_ += numLines * 2;
+	prim_ = GE_PRIM_LINES;
+	seenPrims_ |= 1 << GE_PRIM_LINE_STRIP;
+}
+
+// Emits indices for rectangles, two vertices per rectangle; an odd trailing
+// vertex gets no indices.
+void IndexGenerator::AddRectangles(int numVerts) {
+	const int rectCount = numVerts / 2;
+	int v = index_;
+	for (int r = 0; r < rectCount; ++r, v += 2) {
+		*inds_++ = v;
+		*inds_++ = v + 1;
+	}
+	seenPrims_ |= 1 << GE_PRIM_RECTANGLES;
+	prim_ = GE_PRIM_RECTANGLES;
+	count_ += rectCount * 2;
+	index_ += numVerts;
+}
+
+// Appends a primitive that already comes with 8-bit indices, rebasing them
+// onto this generator's vertex position. Unknown primitive types are ignored.
+void IndexGenerator::TranslatePrim(int prim, int numInds, const u8 *inds, int indexOffset) {
+	if (prim == GE_PRIM_POINTS) {
+		TranslatePoints(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_LINES) {
+		TranslateLineList(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_LINE_STRIP) {
+		TranslateLineStrip(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_TRIANGLES) {
+		TranslateList(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_TRIANGLE_STRIP) {
+		TranslateStrip(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_TRIANGLE_FAN) {
+		TranslateFan(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_RECTANGLES) {
+		TranslateRectangles(numInds, inds, indexOffset);  // Same
+	}
+}
+
+// Appends a primitive that already comes with 16-bit indices, rebasing them
+// onto this generator's vertex position. Unknown primitive types are ignored.
+void IndexGenerator::TranslatePrim(int prim, int numInds, const u16 *inds, int indexOffset) {
+	if (prim == GE_PRIM_POINTS) {
+		TranslatePoints(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_LINES) {
+		TranslateLineList(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_LINE_STRIP) {
+		TranslateLineStrip(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_TRIANGLES) {
+		TranslateList(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_TRIANGLE_STRIP) {
+		TranslateStrip(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_TRIANGLE_FAN) {
+		TranslateFan(numInds, inds, indexOffset);
+	} else if (prim == GE_PRIM_RECTANGLES) {
+		TranslateRectangles(numInds, inds, indexOffset);  // Same
+	}
+}
+
+// Copies 8-bit point indices, rebased by (index_ - indexOffset).
+void IndexGenerator::TranslatePoints(int numInds, const u8 *inds, int indexOffset) {
+	const int rebase = index_ - indexOffset;
+	for (int n = 0; n < numInds; ++n)
+		*inds_++ = rebase + inds[n];
+	seenPrims_ |= (1 << GE_PRIM_POINTS) | SEEN_INDEX8;
+	prim_ = GE_PRIM_POINTS;
+	count_ += numInds;
+}
+
+// Copies 16-bit point indices (read through u16_le), rebased by
+// (index_ - indexOffset).
+void IndexGenerator::TranslatePoints(int numInds, const u16 *_inds, int indexOffset) {
+	const u16_le *inds = (u16_le*)_inds;
+	const int rebase = index_ - indexOffset;
+	for (int n = 0; n < numInds; ++n)
+		*inds_++ = rebase + inds[n];
+	seenPrims_ |= (1 << GE_PRIM_POINTS) | SEEN_INDEX16;
+	prim_ = GE_PRIM_POINTS;
+	count_ += numInds;
+}
+
+// Copies an 8-bit indexed triangle list, rebased by (index_ - indexOffset).
+// Indices beyond the last complete triangle are dropped.
+void IndexGenerator::TranslateList(int numInds, const u8 *inds, int indexOffset) {
+	const int rebase = index_ - indexOffset;
+	const int triCount = numInds / 3;
+	for (int t = 0; t < triCount; ++t) {
+		const u8 *tri = inds + t * 3;
+		*inds_++ = rebase + tri[0];
+		*inds_++ = rebase + tri[1];
+		*inds_++ = rebase + tri[2];
+	}
+	seenPrims_ |= (1 << GE_PRIM_TRIANGLES) | SEEN_INDEX8;
+	prim_ = GE_PRIM_TRIANGLES;
+	count_ += triCount * 3;
+}
+
+// Expands an 8-bit indexed triangle strip into triangle-list indices, rebased
+// by (index_ - indexOffset) and with alternating winding.
+void IndexGenerator::TranslateStrip(int numInds, const u8 *inds, int indexOffset) {
+	bool wind = false;
+	// Fewer than 3 indices form no triangle; clamp so count_ is never decreased.
+	const int numTris = std::max(numInds - 2, 0);
+	for (int i = 0; i < numTris; i++) {
+		*inds_++ = index_ - indexOffset + inds[i];
+		*inds_++ = index_ - indexOffset + inds[i + (wind?2:1)];
+		*inds_++ = index_ - indexOffset + inds[i + (wind?1:2)];
+		wind = !wind;
+	}
+	count_ += numTris * 3;
+	prim_ = GE_PRIM_TRIANGLES;
+	seenPrims_ |= (1 << GE_PRIM_TRIANGLE_STRIP) | SEEN_INDEX8;
+}
+
+// Expands an 8-bit indexed triangle fan into triangle-list indices, rebased
+// by (index_ - indexOffset). Does nothing at all when numInds <= 0.
+void IndexGenerator::TranslateFan(int numInds, const u8 *inds, int indexOffset) {
+	if (numInds <= 0) return;
+	// Fewer than 3 indices form no triangle; clamp so count_ is never decreased.
+	const int numTris = std::max(numInds - 2, 0);
+	for (int i = 0; i < numTris; i++) {
+		*inds_++ = index_ - indexOffset + inds[0];
+		*inds_++ = index_ - indexOffset + inds[i + 1];
+		*inds_++ = index_ - indexOffset + inds[i + 2];
+	}
+	count_ += numTris * 3;
+	prim_ = GE_PRIM_TRIANGLES;
+	seenPrims_ |= (1 << GE_PRIM_TRIANGLE_FAN) | SEEN_INDEX8;
+}
+
+// Copies a 16-bit indexed triangle list (read through u16_le), rebased by
+// (index_ - indexOffset). Indices beyond the last complete triangle are dropped.
+void IndexGenerator::TranslateList(int numInds, const u16 *_inds, int indexOffset) {
+	const u16_le *inds = (u16_le*)_inds;
+	const int rebase = index_ - indexOffset;
+	const int triCount = numInds / 3;
+	for (int t = 0; t < triCount; ++t) {
+		const int first = t * 3;
+		*inds_++ = rebase + inds[first];
+		*inds_++ = rebase + inds[first + 1];
+		*inds_++ = rebase + inds[first + 2];
+	}
+	seenPrims_ |= (1 << GE_PRIM_TRIANGLES) | SEEN_INDEX16;
+	prim_ = GE_PRIM_TRIANGLES;
+	count_ += triCount * 3;
+}
+
+// Expands a 16-bit indexed triangle strip (read through u16_le) into
+// triangle-list indices, rebased by (index_ - indexOffset) and with
+// alternating winding.
+void IndexGenerator::TranslateStrip(int numInds, const u16 *_inds, int indexOffset) {
+	const u16_le *inds = (u16_le*)_inds;
+	bool wind = false;
+	// Fewer than 3 indices form no triangle; clamp so count_ is never decreased.
+	const int numTris = std::max(numInds - 2, 0);
+	for (int i = 0; i < numTris; i++) {
+		*inds_++ = index_ - indexOffset + inds[i];
+		*inds_++ = index_ - indexOffset + inds[i + (wind?2:1)];
+		*inds_++ = index_ - indexOffset + inds[i + (wind?1:2)];
+		wind = !wind;
+	}
+	count_ += numTris * 3;
+	prim_ = GE_PRIM_TRIANGLES;
+	seenPrims_ |= (1 << GE_PRIM_TRIANGLE_STRIP) | SEEN_INDEX16;
+}
+
+// Expands a 16-bit indexed triangle fan (read through u16_le) into
+// triangle-list indices, rebased by (index_ - indexOffset). Does nothing at
+// all when numInds <= 0.
+void IndexGenerator::TranslateFan(int numInds, const u16 *_inds, int indexOffset) {
+	const u16_le *inds = (u16_le*)_inds;
+	if (numInds <= 0) return;
+	// Fewer than 3 indices form no triangle; clamp so count_ is never decreased.
+	const int numTris = std::max(numInds - 2, 0);
+	for (int i = 0; i < numTris; i++) {
+		*inds_++ = index_ - indexOffset + inds[0];
+		*inds_++ = index_ - indexOffset + inds[i + 1];
+		*inds_++ = index_ - indexOffset + inds[i + 2];
+	}
+	count_ += numTris * 3;
+	prim_ = GE_PRIM_TRIANGLES;
+	seenPrims_ |= (1 << GE_PRIM_TRIANGLE_FAN) | SEEN_INDEX16;
+}
+
+// Copies an 8-bit indexed line list, rebased by (index_ - indexOffset).
+// An odd trailing index is dropped.
+void IndexGenerator::TranslateLineList(int numInds, const u8 *inds, int indexOffset) {
+	const int rebase = index_ - indexOffset;
+	const int lineCount = numInds / 2;
+	for (int l = 0; l < lineCount; ++l) {
+		const u8 *line = inds + l * 2;
+		*inds_++ = rebase + line[0];
+		*inds_++ = rebase + line[1];
+	}
+	seenPrims_ |= (1 << GE_PRIM_LINES) | SEEN_INDEX8;
+	prim_ = GE_PRIM_LINES;
+	count_ += lineCount * 2;
+}
+
+// Expands an 8-bit indexed line strip into line-list indices, rebased by
+// (index_ - indexOffset).
+void IndexGenerator::TranslateLineStrip(int numInds, const u8 *inds, int indexOffset) {
+	// Fewer than 2 indices form no line; clamp so count_ is never decreased.
+	const int numLines = std::max(numInds - 1, 0);
+	for (int i = 0; i < numLines; i++) {
+		*inds_++ = index_ - indexOffset + inds[i];
+		*inds_++ = index_ - indexOffset + inds[i + 1];
+	}
+	count_ += numLines * 2;
+	prim_ = GE_PRIM_LINES;
+	seenPrims_ |= (1 << GE_PRIM_LINE_STRIP) | SEEN_INDEX8;
+}
+
+// Copies a 16-bit indexed line list (read through u16_le), rebased by
+// (index_ - indexOffset). An odd trailing index is dropped.
+void IndexGenerator::TranslateLineList(int numInds, const u16 *_inds, int indexOffset) {
+	const u16_le *inds = (u16_le*)_inds;
+	const int rebase = index_ - indexOffset;
+	const int lineCount = numInds / 2;
+	for (int l = 0; l < lineCount; ++l) {
+		const int first = l * 2;
+		*inds_++ = rebase + inds[first];
+		*inds_++ = rebase + inds[first + 1];
+	}
+	seenPrims_ |= (1 << GE_PRIM_LINES) | SEEN_INDEX16;
+	prim_ = GE_PRIM_LINES;
+	count_ += lineCount * 2;
+}
+
+// Expands a 16-bit indexed line strip (read through u16_le) into line-list
+// indices, rebased by (index_ - indexOffset).
+void IndexGenerator::TranslateLineStrip(int numInds, const u16 *_inds, int indexOffset) {	
+	const u16_le *inds = (u16_le*)_inds;
+	// Fewer than 2 indices form no line; clamp so count_ is never decreased.
+	const int numLines = std::max(numInds - 1, 0);
+	for (int i = 0; i < numLines; i++) {
+		*inds_++ = index_ - indexOffset + inds[i];
+		*inds_++ = index_ - indexOffset + inds[i + 1];
+	}
+	count_ += numLines * 2;
+	prim_ = GE_PRIM_LINES;
+	seenPrims_ |= (1 << GE_PRIM_LINE_STRIP) | SEEN_INDEX16;
+}
+
+// Copies 8-bit indexed rectangles (two indices each), rebased by
+// (index_ - indexOffset). An odd trailing index is dropped.
+void IndexGenerator::TranslateRectangles(int numInds, const u8 *inds, int indexOffset) {
+	const int rebase = index_ - indexOffset;
+	const int rectCount = numInds / 2;
+	for (int r = 0; r < rectCount; ++r) {
+		const u8 *rect = inds + r * 2;
+		*inds_++ = rebase + rect[0];
+		*inds_++ = rebase + rect[1];
+	}
+	seenPrims_ |= (1 << GE_PRIM_RECTANGLES) | SEEN_INDEX8;
+	prim_ = GE_PRIM_RECTANGLES;
+	count_ += rectCount * 2;
+}
+
+// Copies 16-bit indexed rectangles (two indices each, read through u16_le),
+// rebased by (index_ - indexOffset). An odd trailing index is dropped.
+void IndexGenerator::TranslateRectangles(int numInds, const u16 *_inds, int indexOffset) {
+	const u16_le *inds = (u16_le*)_inds;
+	const int rebase = index_ - indexOffset;
+	const int rectCount = numInds / 2;
+	for (int r = 0; r < rectCount; ++r) {
+		const int first = r * 2;
+		*inds_++ = rebase + inds[first];
+		*inds_++ = rebase + inds[first + 1];
+	}
+	seenPrims_ |= (1 << GE_PRIM_RECTANGLES) | SEEN_INDEX16;
+	prim_ = GE_PRIM_RECTANGLES;
+	count_ += rectCount * 2;
+}
--- a/GPU/Directx9/IndexGenerator.h
+++ b/GPU/Directx9/IndexGenerator.h
@ -0,0 +1,99 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+
+#pragma once
+
+#include <algorithm>
+#include "CommonTypes.h"
+#include "../ge_constants.h"
+
+// Accumulates 16-bit indices for batched draws into a caller-supplied buffer.
+// The Translate* helpers write rebased indices through inds_, grow count_ and
+// record the primitive type in prim_ / seenPrims_.
+// NOTE(review): no constructor is declared, so the members are presumably
+// undefined until Setup()/Reset() is called - confirm in IndexGenerator.cpp.
+class IndexGenerator
+{
+public:
+	// Points the generator at the index buffer it should fill.
+	void Setup(u16 *indexptr);
+	void Reset();
+	static bool PrimCompatible(int prim1, int prim2);
+	bool PrimCompatible(int prim);
+	// Primitive type recorded by the most recent Add*/Translate* call.
+	int Prim() const { return prim_; }
+
+	// Appends indices for a non-indexed primitive of vertexCount vertices.
+	void AddPrim(int prim, int vertexCount);
+	// Appends indices for an already indexed primitive (8-bit / 16-bit source indices).
+	void TranslatePrim(int prim, int numInds, const u8 *inds, int indexOffset);
+	void TranslatePrim(int prim, int numInds, const u16 *inds, int indexOffset);
+
+	// Moves the base vertex index used for subsequently generated indices.
+	void Advance(int numVerts) {
+		index_ += numVerts;
+	}
+
+	void SetIndex(int ind) { index_ = ind; }
+	// Current base vertex index (sum of everything passed to Advance/SetIndex).
+	int MaxIndex() const { return index_; }
+	// Number of indices generated so far (count_), despite the name.
+	int VertexCount() const { return count_; }
+	// True while the base vertex index is still 0.
+	bool Empty() const { return index_ == 0; }
+	// Bitmask of (1 << GE_PRIM_*) for every primitive seen, plus SEEN_INDEX8/16 flags.
+	int SeenPrims() const { return seenPrims_; }
+	int PureCount() const { return pureCount_; }
+	// True when exactly one of these primitive types was seen and no
+	// SEEN_INDEX8/SEEN_INDEX16 flag is set (the comparison is for equality).
+	bool SeenOnlyPurePrims() const {
+		return seenPrims_ == (1 << GE_PRIM_TRIANGLES) ||
+			seenPrims_ == (1 << GE_PRIM_LINES) ||
+			seenPrims_ == (1 << GE_PRIM_POINTS) ||
+			seenPrims_ == (1 << GE_PRIM_TRIANGLE_STRIP);
+	}
+
+private:
+	// Points (why index these? code simplicity)
+	void AddPoints(int numVerts);
+	// Triangles
+	void AddList(int numVerts);
+	void AddStrip(int numVerts);
+	void AddFan(int numVerts);
+	// Lines
+	void AddLineList(int numVerts);
+	void AddLineStrip(int numVerts);
+	// Rectangles
+	void AddRectangles(int numVerts);
+
+	void TranslatePoints(int numVerts, const u8 *inds, int indexOffset);	
+	void TranslatePoints(int numVerts, const u16 *inds, int indexOffset);
+	// Translates already indexed lists
+	void TranslateLineList(int numVerts, const u8 *inds, int indexOffset);
+	void TranslateLineList(int numVerts, const u16 *inds, int indexOffset);
+	void TranslateLineStrip(int numVerts, const u8 *inds, int indexOffset);
+	void TranslateLineStrip(int numVerts, const u16 *inds, int indexOffset);
+
+	void TranslateRectangles(int numVerts, const u8 *inds, int indexOffset);
+	void TranslateRectangles(int numVerts, const u16 *inds, int indexOffset);
+
+	void TranslateList(int numVerts, const u8 *inds, int indexOffset);
+	void TranslateList(int numVerts, const u16 *inds, int indexOffset);
+	void TranslateStrip(int numVerts, const u8 *inds, int indexOffset);
+	void TranslateStrip(int numVerts, const u16 *inds, int indexOffset);
+	void TranslateFan(int numVerts, const u8 *inds, int indexOffset);
+	void TranslateFan(int numVerts, const u16 *inds, int indexOffset);
+
+	// Extra seenPrims_ flags, placed above the (1 << GE_PRIM_*) bits.
+	enum {
+		SEEN_INDEX8 = 1 << 16,
+		SEEN_INDEX16 = 1 << 17
+	};
+
+	u16 *indsBase_;
+	u16 *inds_;       // write cursor into the index buffer
+	int index_;       // base vertex index added to generated indices
+	int count_;       // number of indices written
+	int pureCount_;
+	int prim_;        // GE_PRIM_* of the output primitive
+	int seenPrims_;   // see SeenPrims()
+};
+
--- a/GPU/Directx9/ShaderManager.cpp
+++ b/GPU/Directx9/ShaderManager.cpp
@ -0,0 +1,604 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#ifdef _WIN32
+#define SHADERLOG
+#endif
+
+#include <map>
+#include "helper/global.h"
+#include "math/lin/matrix4x4.h"
+
+#include "Common/Common.h"
+#include "Core/Reporting.h"
+#include "GPU/GPUState.h"
+#include "GPU/ge_constants.h"
+#include "GPU/Directx9/ShaderManager.h"
+#include "GPU/Directx9/TransformPipeline.h"
+#include "UI/OnScreenDisplay.h"
+#include "Framebuffer.h"
+
+// For matrix conversions
+#include <xnamath.h>
+
+// Compiles `code` as a pixel shader and keeps a copy of the source.
+// On success `shader` and `constant` hold the compiled shader and its constant
+// table. On failure Failed() returns true and both pointers are NULL.
+PSShader::PSShader(const char *code, bool useHWTransform)
+		: shader(NULL), constant(NULL), failed_(false), useHWTransform_(useHWTransform) {
+	source_ = code;
+#ifdef SHADERLOG
+	OutputDebugString(code);
+#endif
+	const bool success = CompilePixelShader(code, &shader, &constant);
+
+	if (!success) {
+		failed_ = true;
+		// Do not trust the out-parameters after a failed compile; the destructor
+		// must never Release() an indeterminate pointer.
+		shader = NULL;
+		constant = NULL;
+	} else {
+		DEBUG_LOG(G3D, "Compiled shader:\n%s\n", (const char *)code);
+	}
+}
+
+// Releases both COM objects obtained from the compile: the constant table was
+// previously never released and leaked once per compiled shader.
+PSShader::~PSShader() {
+	if (constant)
+		constant->Release();
+	if (shader)
+		shader->Release();
+}
+
+// Compiles `code` as a vertex shader and keeps a copy of the source.
+// On success `shader` and `constant` hold the compiled shader and its constant
+// table. On failure Failed() returns true and both pointers are NULL.
+VSShader::VSShader(const char *code, bool useHWTransform)
+		: shader(NULL), constant(NULL), failed_(false), useHWTransform_(useHWTransform) {
+	source_ = code;
+#ifdef SHADERLOG
+	OutputDebugString(code);
+#endif
+	const bool success = CompileVertexShader(code, &shader, &constant);
+
+	if (!success) {
+		failed_ = true;
+		// Do not trust the out-parameters after a failed compile; the destructor
+		// must never Release() an indeterminate pointer.
+		shader = NULL;
+		constant = NULL;
+	} else {
+		DEBUG_LOG(G3D, "Compiled shader:\n%s\n", (const char *)code);
+	}
+}
+
+// Releases both COM objects obtained from the compile: the constant table was
+// previously never released and leaked once per compiled shader.
+VSShader::~VSShader() {
+	if (constant)
+		constant->Release();
+	if (shader)
+		shader->Release();
+}
+
+// Pairs a vertex shader with a pixel shader. Every uniform handle is looked up
+// by name in the owning shader's constant table, all uniforms are marked dirty
+// and the pair is bound on the device through use().
+// The GL attribute lookups (a_position, a_color0, ...) have no counterpart
+// here, so the a_* members are left unassigned.
+LinkedShader::LinkedShader(VSShader *vs, PSShader *fs, bool useHWTransform)
+		: useHWTransform_(useHWTransform), m_vs(vs), m_fs(fs), dirtyUniforms(0) {
+
+	INFO_LOG(G3D, "Linked shader: vs %i fs %i", (int)vs->shader, (int)fs->shader);
+
+	LPD3DXCONSTANTTABLE vsTable = vs->constant;
+	LPD3DXCONSTANTTABLE fsTable = fs->constant;
+
+	// Pixel shader constants.
+	u_tex = fsTable->GetConstantByName(NULL, "tex");
+	u_texenv = fsTable->GetConstantByName(NULL, "u_texenv");
+	u_fogcolor = fsTable->GetConstantByName(NULL, "u_fogcolor");
+	u_fogcoef = fsTable->GetConstantByName(NULL, "u_fogcoef");
+	u_alphacolorref = fsTable->GetConstantByName(NULL, "u_alphacolorref");
+	u_colormask = fsTable->GetConstantByName(NULL, "u_colormask");
+
+	// Vertex shader matrices.
+	u_proj = vsTable->GetConstantByName(NULL, "u_proj");
+	u_proj_through = vsTable->GetConstantByName(NULL, "u_proj_through");
+	u_view = vsTable->GetConstantByName(NULL, "u_view");
+	u_world = vsTable->GetConstantByName(NULL, "u_world");
+	u_texmtx = vsTable->GetConstantByName(NULL, "u_texmtx");
+
+	// Bone matrices: one handle per weight used by the current vertex type.
+	numBones = gstate.getNumBoneWeights();
+#ifdef USE_BONE_ARRAY
+	u_bone = glGetUniformLocation(program, "u_bone");
+#else
+	for (int bone = 0; bone < numBones; bone++) {
+		char name[10];
+		sprintf(name, "u_bone%i", bone);
+		u_bone[bone] = vsTable->GetConstantByName(NULL, name);
+	}
+#endif
+
+	// Material and texture-coordinate constants.
+	u_ambient = vsTable->GetConstantByName(NULL, "u_ambient");
+	u_matambientalpha = vsTable->GetConstantByName(NULL, "u_matambientalpha");
+	u_matdiffuse = vsTable->GetConstantByName(NULL, "u_matdiffuse");
+	u_matspecular = vsTable->GetConstantByName(NULL, "u_matspecular");
+	u_matemissive = vsTable->GetConstantByName(NULL, "u_matemissive");
+	u_uvscaleoffset = vsTable->GetConstantByName(NULL, "u_uvscaleoffset");
+
+	// Per-light constants: the name is the format string with the light index appended.
+	const struct {
+		const char *format;
+		int *handles;
+	} lightUniforms[] = {
+		{ "u_lightpos%i", u_lightpos },
+		{ "u_lightdir%i", u_lightdir },
+		{ "u_lightatt%i", u_lightatt },
+		{ "u_lightangle%i", u_lightangle },
+		{ "u_lightspotCoef%i", u_lightspotCoef },
+		{ "u_lightambient%i", u_lightambient },
+		{ "u_lightdiffuse%i", u_lightdiffuse },
+		{ "u_lightspecular%i", u_lightspecular },
+	};
+	const int numLightUniforms = (int)(sizeof(lightUniforms) / sizeof(lightUniforms[0]));
+	for (int light = 0; light < 4; light++) {
+		for (int k = 0; k < numLightUniforms; k++) {
+			char name[64];
+			sprintf(name, lightUniforms[k].format, light);
+			lightUniforms[k].handles[light] = vsTable->GetConstantByName(NULL, name);
+		}
+	}
+
+	pD3Ddevice->SetPixelShader(fs->shader);
+	pD3Ddevice->SetVertexShader(vs->shader);
+
+	// No default uniform values are set here; everything goes through the
+	// "dirty" mechanism, which use() flushes immediately.
+	dirtyUniforms = DIRTY_ALL;
+	use();
+}
+
+// Nothing to free: the shader pair is only referenced (m_vs / m_fs), and there
+// is no program object to delete as there was in the GL version.
+LinkedShader::~LinkedShader() {
+}
+
+// Utility
+// Uploads the low three bytes of `color` (byte 0 first) as three floats in 0..1.
+static void SetColorUniform3(LPD3DXCONSTANTTABLE constant, int uniform, u32 color) {
+	float rgb[3];
+	for (int channel = 0; channel < 3; channel++) {
+		rgb[channel] = ((color >> (8 * channel)) & 0xFF) / 255.0f;
+	}
+	constant->SetFloatArray(pD3Ddevice, uniform, rgb, 3);
+}
+
+// Uploads the low three bytes of `color` (byte 0 first) followed by `alpha`,
+// all scaled to 0..1, as a four-float constant.
+static void SetColorUniform3Alpha(LPD3DXCONSTANTTABLE constant, int uniform, u32 color, u8 alpha) {
+	float rgba[4];
+	for (int channel = 0; channel < 3; channel++) {
+		rgba[channel] = ((color >> (8 * channel)) & 0xFF) / 255.0f;
+	}
+	rgba[3] = alpha / 255.0f;
+	constant->SetFloatArray(pD3Ddevice, uniform, rgba, 4);
+}
+
+// Same layout as SetColorUniform3Alpha, but the four values are uploaded
+// unscaled, i.e. in the range 0..255 instead of 0..1.
+static void SetColorUniform3Alpha255(LPD3DXCONSTANTTABLE constant, int uniform, u32 color, u8 alpha) {
+	float rgba[4];
+	for (int channel = 0; channel < 3; channel++) {
+		rgba[channel] = (float)((color >> (8 * channel)) & 0xFF);
+	}
+	rgba[3] = (float)alpha;
+	constant->SetFloatArray(pD3Ddevice, uniform, rgba, 4);
+}
+
+// Uploads the low three bytes of `color` scaled to 0..1, with `extra` passed
+// through unchanged as the fourth component.
+static void SetColorUniform3ExtraFloat(LPD3DXCONSTANTTABLE constant, int uniform, u32 color, float extra) {
+	float packed[4];
+	for (int channel = 0; channel < 3; channel++) {
+		packed[channel] = ((color >> (8 * channel)) & 0xFF) / 255.0f;
+	}
+	packed[3] = extra;
+	constant->SetFloatArray(pD3Ddevice, uniform, packed, 4);
+}
+
+// Expands 12 floats (four groups of three) into 16 floats (four groups of
+// four). The added fourth element is 0 for the first three groups and 1 for
+// the last one.
+static void ConvertMatrix4x3To4x4(const float *m4x3, float *m4x4) {
+	for (int group = 0; group < 4; group++) {
+		const float *src = m4x3 + group * 3;
+		float *dst = m4x4 + group * 4;
+		dst[0] = src[0];
+		dst[1] = src[1];
+		dst[2] = src[2];
+		dst[3] = (group == 3) ? 1.0f : 0.0f;
+	}
+}
+
+// Uploads a 12-float matrix by first expanding it to 16 floats.
+static void SetMatrix4x3(LPD3DXCONSTANTTABLE constant, int uniform, const float *m4x3) {
+	float expanded[16];
+	ConvertMatrix4x3To4x4(m4x3, expanded);
+	const D3DXMATRIX *asMatrix = (D3DXMATRIX*)expanded;
+	constant->SetMatrix(pD3Ddevice, uniform, asMatrix);
+}
+
+// Flushes pending uniforms, then binds this pair's pixel and vertex shaders on
+// the device. The GL version also enabled the vertex attribute arrays
+// (a_position, a_texcoord, a_color0, a_color1, a_normal, a_weight0123,
+// a_weight4567); nothing equivalent is done here.
+void LinkedShader::use() {
+	updateUniforms();
+
+	pD3Ddevice->SetPixelShader(m_fs->shader);
+	pD3Ddevice->SetVertexShader(m_vs->shader);
+}
+
+// Intentionally a no-op. The GL version disabled the vertex attribute arrays
+// enabled by use() (a_position, a_texcoord, a_color0, a_color1, a_normal,
+// a_weight0123, a_weight4567); this backend has nothing to undo.
+void LinkedShader::stop() {
+}
+
+// OpenGL clip-space depth spans -1..1 while we need 0..1: post-multiply the
+// matrix by a 0.5 scale on Z followed by a 0.5 translation on Z.
+// (Per-element alternative that was tried: in.zz *= 0.5f; in.wz += 1.f;)
+static void ConvertMatrices(Matrix4x4 & in) {
+	Matrix4x4 halveDepth;
+	halveDepth.setScaling(Vec3(1, 1, 0.5f));
+
+	Matrix4x4 shiftDepth;
+	shiftDepth.setTranslation(Vec3(0, 0, 0.5f));
+
+	in = (in * halveDepth) * shiftDepth;
+}
+
+// Uploads every pending uniform. A uniform is sent only when its DIRTY_* bit
+// is set in dirtyUniforms and its handle is non-zero; dirtyUniforms is reset
+// to 0 at the end.
+void LinkedShader::updateUniforms() {
+	const u32 dirty = dirtyUniforms;
+	if (dirty == 0)
+		return;
+
+	LPD3DXCONSTANTTABLE vsConst = m_vs->constant;
+	LPD3DXCONSTANTTABLE fsConst = m_fs->constant;
+
+	// Projection (normal mode).
+	if ((dirty & DIRTY_PROJMATRIX) && u_proj != 0) {
+		Matrix4x4 proj;
+		memcpy(&proj, gstate.projMatrix, 16 * sizeof(float));
+		// A negative viewport extent negates the matching scale/translation entries.
+		if (gstate_c.vpHeight < 0) {
+			proj[5] = -proj[5];
+			proj[13] = -proj[13];
+		}
+		if (gstate_c.vpWidth < 0) {
+			proj[0] = -proj[0];
+			proj[12] = -proj[12];
+		}
+		// Remap depth for D3D.
+		ConvertMatrices(proj);
+
+		vsConst->SetMatrix(pD3Ddevice, u_proj, (D3DXMATRIX*)proj.getReadPtr());
+	}
+
+	// Projection (through mode): orthographic over the current render target.
+	if ((dirty & DIRTY_PROJTHROUGHMATRIX) && u_proj_through != 0) {
+		Matrix4x4 ortho;
+		ortho.setOrtho(0.0f, gstate_c.curRTWidth, gstate_c.curRTHeight, 0, 0, 1);
+
+		// Remap depth for D3D.
+		ConvertMatrices(ortho);
+
+		vsConst->SetMatrix(pD3Ddevice, u_proj_through, (D3DXMATRIX*)ortho.getReadPtr());
+	}
+
+	// Pixel shader constants.
+	if ((dirty & DIRTY_TEXENV) && u_texenv != 0) {
+		SetColorUniform3(fsConst, u_texenv, gstate.texenvcolor);
+	}
+	if ((dirty & DIRTY_ALPHACOLORREF) && u_alphacolorref != 0) {
+		SetColorUniform3Alpha255(fsConst, u_alphacolorref, gstate.getColorTestRef(), gstate.getAlphaTestRef());
+	}
+	if ((dirty & DIRTY_COLORMASK) && u_colormask != 0) {
+		SetColorUniform3(fsConst, u_colormask, gstate.colormask);
+	}
+	if ((dirty & DIRTY_FOGCOLOR) && u_fogcolor != 0) {
+		SetColorUniform3(fsConst, u_fogcolor, gstate.fogcolor);
+	}
+	if ((dirty & DIRTY_FOGCOEF) && u_fogcoef != 0) {
+		const float fogcoef[2] = {
+			getFloat24(gstate.fog1),
+			getFloat24(gstate.fog2),
+		};
+		fsConst->SetFloatArray(pD3Ddevice, u_fogcoef, fogcoef, 2);
+	}
+
+	// Texture coordinate scale (xy) and offset (zw).
+	if ((dirty & DIRTY_UVSCALEOFFSET) && u_uvscaleoffset != 0) {
+		float uvscaleoff[4];
+		if (gstate.isModeThrough()) {
+			// We never get here because we don't use HW transform with through mode.
+			// Although - why don't we?
+			uvscaleoff[0] = gstate_c.uv.uScale / gstate_c.curTextureWidth;
+			uvscaleoff[1] = gstate_c.uv.vScale / gstate_c.curTextureHeight;
+			uvscaleoff[2] = gstate_c.uv.uOff / gstate_c.curTextureWidth;
+			uvscaleoff[3] = gstate_c.uv.vOff / gstate_c.curTextureHeight;
+		} else {
+			// Ratio between the size declared in texsize[0] and the current texture size.
+			int w = 1 << (gstate.texsize[0] & 0xf);
+			int h = 1 << ((gstate.texsize[0] >> 8) & 0xf);
+			float widthFactor = (float)w / (float)gstate_c.curTextureWidth;
+			float heightFactor = (float)h / (float)gstate_c.curTextureHeight;
+			if ((gstate.texmapmode & 3) != 0) {
+				uvscaleoff[0] = widthFactor;
+				uvscaleoff[1] = heightFactor;
+				uvscaleoff[2] = 0.0f;
+				uvscaleoff[3] = 0.0f;
+			} else {
+				// Extra factor selected by the vertex type's texcoord format field.
+				static const float rescale[4] = {1.0f, 2*127.5f/128.f, 2*32767.5f/32768.f, 1.0f};
+				float factor = rescale[(gstate.vertType & GE_VTYPE_TC_MASK) >> GE_VTYPE_TC_SHIFT];
+				uvscaleoff[0] = gstate_c.uv.uScale * factor * widthFactor;
+				uvscaleoff[1] = gstate_c.uv.vScale * factor * heightFactor;
+				uvscaleoff[2] = gstate_c.uv.uOff * widthFactor;
+				uvscaleoff[3] = gstate_c.uv.vOff * heightFactor;
+			}
+		}
+		vsConst->SetFloatArray(pD3Ddevice, u_uvscaleoffset, uvscaleoff, 4);
+	}
+
+	// World / view / texture matrices.
+	if ((dirty & DIRTY_WORLDMATRIX) && u_world != 0) {
+		SetMatrix4x3(vsConst, u_world, gstate.worldMatrix);
+	}
+	if ((dirty & DIRTY_VIEWMATRIX) && u_view != 0) {
+		SetMatrix4x3(vsConst, u_view, gstate.viewMatrix);
+	}
+	if ((dirty & DIRTY_TEXMATRIX) && u_texmtx != 0) {
+		SetMatrix4x3(vsConst, u_texmtx, gstate.tgenMatrix);
+	}
+
+	// Bone matrices.
+	// TODO: Could even set all bones in one go if they're all dirty.
+#ifdef USE_BONE_ARRAY
+	if (u_bone != 0) {
+		float allBones[8 * 16];
+
+		bool allDirty = true;
+		for (int i = 0; i < numBones; i++) {
+			if (dirty & (DIRTY_BONEMATRIX0 << i)) {
+				ConvertMatrix4x3To4x4(gstate.boneMatrix + 12 * i, allBones + 16 * i);
+			} else {
+				allDirty = false;
+			}
+		}
+		if (allDirty) {
+			// Set them all with one call
+			glUniformMatrix4fv(u_bone, numBones, GL_FALSE, allBones);
+		} else {
+			// Set them one by one. Could try to coalesce two in a row etc but too lazy.
+			for (int i = 0; i < numBones; i++) {
+				if (dirty & (DIRTY_BONEMATRIX0 << i)) {
+					glUniformMatrix4fv(u_bone + i, 1, GL_FALSE, allBones + 16 * i);
+				}
+			}
+		}
+	}
+#else
+	// Dirty bone matrices are expanded but NOT uploaded: the SetMatrix call
+	// (m_vs->constant->SetMatrix(pD3Ddevice, u_bone[i], (D3DXMATRIX*)bonetemp))
+	// is still disabled in this backend.
+	float bonetemp[16];
+	for (int bone = 0; bone < numBones; bone++) {
+		if (dirty & (DIRTY_BONEMATRIX0 << bone)) {
+			ConvertMatrix4x3To4x4(gstate.boneMatrix + 12 * bone, bonetemp);
+		}
+	}
+#endif
+
+	// Material colors.
+	if ((dirty & DIRTY_AMBIENT) && u_ambient != 0) {
+		SetColorUniform3Alpha(vsConst, u_ambient, gstate.ambientcolor, gstate.getAmbientA());
+	}
+	if ((dirty & DIRTY_MATAMBIENTALPHA) && u_matambientalpha != 0) {
+		SetColorUniform3Alpha(vsConst, u_matambientalpha, gstate.materialambient, gstate.getMaterialAmbientA());
+	}
+	if ((dirty & DIRTY_MATDIFFUSE) && u_matdiffuse != 0) {
+		SetColorUniform3(vsConst, u_matdiffuse, gstate.materialdiffuse);
+	}
+	if ((dirty & DIRTY_MATEMISSIVE) && u_matemissive != 0) {
+		SetColorUniform3(vsConst, u_matemissive, gstate.materialemissive);
+	}
+	if ((dirty & DIRTY_MATSPECULAR) && u_matspecular != 0) {
+		// The specular coefficient rides along as the fourth component.
+		SetColorUniform3ExtraFloat(vsConst, u_matspecular, gstate.materialspecular, getFloat24(gstate.materialspecularcoef));
+	}
+
+	// TODO: per-light uniforms are not uploaded yet, so DIRTY_LIGHT0..3 are
+	// cleared without effect. The GL version sent, for each dirty light i:
+	// u_lightpos (gstate_c.lightpos[i], normalized first when
+	// gstate.isDirectionalLight(i), using length 1 for a zero vector),
+	// u_lightdir, u_lightatt, u_lightangle, u_lightspotCoef, and
+	// u_lightambient / u_lightdiffuse / u_lightspecular from
+	// gstate_c.lightColor[0..2][i].
+
+	dirtyUniforms = 0;
+}
+
+// Starts with no active shader and every uniform considered dirty, and
+// allocates the 16 KiB scratch buffer the shader generators write into.
+ShaderManager::ShaderManager()
+		: lastShader(NULL), globalDirty(0xFFFFFFFF), shaderSwitchDirty(0), codeBuffer_(new char[16384]) {
+}
+
+// Frees the scratch buffer only; the shader caches are emptied by Clear().
+ShaderManager::~ShaderManager() {
+	delete [] codeBuffer_;
+}
+
+
+// Adds the DIRTY_* bits in `what` to the set applied on the next ApplyShader().
+void ShaderManager::DirtyUniform(u32 what) {
+	globalDirty = globalDirty | what;
+}
+
+// Deletes every cached linked shader, pixel shader and vertex shader, empties
+// the caches and resets all "last shader" / dirty tracking.
+void ShaderManager::Clear() {
+	for (size_t i = 0; i < linkedShaderCache.size(); i++) {
+		delete linkedShaderCache[i].ls;
+	}
+	for (FSCache::iterator fsIt = fsCache.begin(); fsIt != fsCache.end(); ++fsIt) {
+		delete fsIt->second;
+	}
+	for (VSCache::iterator vsIt = vsCache.begin(); vsIt != vsCache.end(); ++vsIt) {
+		delete vsIt->second;
+	}
+
+	linkedShaderCache.clear();
+	fsCache.clear();
+	vsCache.clear();
+
+	globalDirty = 0xFFFFFFFF;
+	lastFSID.clear();
+	lastVSID.clear();
+	DirtyShader();
+}
+
+// Public wrapper around Clear(). `deleteThem` is currently ignored: the cached
+// shaders are always deleted.
+void ShaderManager::ClearCache(bool deleteThem) {
+	(void)deleteThem;
+	Clear();
+}
+
+
+// Forgets which shader was last applied so the next ApplyShader() does a full
+// lookup, and marks every uniform dirty.
+void ShaderManager::DirtyShader() {
+	lastShader = 0;
+	lastVSID.clear();
+	lastFSID.clear();
+
+	shaderSwitchDirty = 0;
+	globalDirty = 0xFFFFFFFF;
+}
+
+// Calls stop() on the active shader, if there is one, and forgets it.
+void ShaderManager::EndFrame() {
+	if (lastShader != 0) {
+		lastShader->stop();
+	}
+	lastShader = 0;
+}
+
+
+// Selects (compiling and caching on demand) the vertex/pixel shader pair that
+// matches the current GPU state for `prim`, makes it active and returns it.
+// When the shader IDs match the previous call, only pending uniforms are
+// flushed on the already active shader.
+LinkedShader *ShaderManager::ApplyShader(int prim) {
+	// Hand globally pending dirty bits to the active shader and remember them
+	// for all other linked shaders (applied lazily on the next switch).
+	if (globalDirty != 0) {
+		shaderSwitchDirty |= globalDirty;
+		if (lastShader != 0)
+			lastShader->dirtyUniforms |= globalDirty;
+		globalDirty = 0;
+	}
+
+	const bool useHWTransform = CanUseHardwareTransform(prim);
+
+	VertexShaderID VSID;
+	FragmentShaderID FSID;
+	ComputeVertexShaderID(&VSID, prim, useHWTransform);
+	ComputeFragmentShaderID(&FSID);
+
+	if (lastShader != 0) {
+		if (VSID == lastVSID && FSID == lastFSID) {
+			// Same shader as last time: just update uniforms.
+			lastShader->updateUniforms();
+			return lastShader;
+		}
+		// There was a previous shader and we're switching.
+		lastShader->stop();
+	}
+
+	lastVSID = VSID;
+	lastFSID = FSID;
+
+	// Vertex shader: cache lookup, compile on miss.
+	VSShader *vs = NULL;
+	const VSCache::iterator vsIter = vsCache.find(VSID);
+	if (vsIter != vsCache.end()) {
+		vs = vsIter->second;
+	} else {
+		GenerateVertexShader(prim, codeBuffer_, useHWTransform);
+		vs = new VSShader(codeBuffer_, useHWTransform);
+
+		if (vs->Failed()) {
+			ERROR_LOG(HLE, "Shader compilation failed, falling back to software transform");
+			osm.Show("hardware transform error - falling back to software", 2.5f, 0xFF3030FF, -1, true);
+			delete vs;
+
+			// TODO: Look for an existing shader with the appropriate ID and use that
+			// instead of generating a new one. That ID must then not be used when
+			// computing the linked shader ID below, or the IDs won't match next time
+			// and we'll do this over and over...
+
+			// Software transform can still work.
+			GenerateVertexShader(prim, codeBuffer_, false);
+			vs = new VSShader(codeBuffer_, false);
+		}
+
+		vsCache[VSID] = vs;
+	}
+
+	// Pixel shader: cache lookup, compile on miss.
+	PSShader *fs = NULL;
+	const FSCache::iterator fsIter = fsCache.find(FSID);
+	if (fsIter != fsCache.end()) {
+		fs = fsIter->second;
+	} else {
+		GenerateFragmentShader(codeBuffer_);
+		fs = new PSShader(codeBuffer_, useHWTransform);
+		fsCache[FSID] = fs;
+	}
+
+	// Find the linked shader for this pair. The whole cache is walked (no early
+	// exit) because every entry also receives the deferred dirty bits.
+	LinkedShader *ls = NULL;
+	for (LinkedShaderCache::iterator entry = linkedShaderCache.begin(); entry != linkedShaderCache.end(); ++entry) {
+		entry->ls->dirtyUniforms |= shaderSwitchDirty;
+
+		if (entry->vs == vs && entry->fs == fs) {
+			ls = entry->ls;
+		}
+	}
+	shaderSwitchDirty = 0;
+
+	if (ls != NULL) {
+		ls->use();
+	} else {
+		// The constructor calls use() itself.
+		ls = new LinkedShader(vs, fs, vs->UseHWTransform());
+		linkedShaderCache.push_back(LinkedShaderCacheEntry(vs, fs, ls));
+	}
+
+	lastShader = ls;
+	return ls;
+}
--- a/GPU/Directx9/ShaderManager.h
+++ b/GPU/Directx9/ShaderManager.h
@ -0,0 +1,215 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include "base/basictypes.h"
+#include "../../Globals.h"
+#include <map>
+#include "VertexShaderGenerator.h"
+#include "FragmentShaderGenerator.h"
+
+class PSShader;
+class VSShader;
+
+// A vertex shader / pixel shader pair plus the cached handles of every
+// uniform ("constant") the pair may use, and a bitmask of the uniforms that
+// still need uploading. Handles are stored as int; the update code treats 0
+// as "not present".
+class LinkedShader
+{
+public:
+	LinkedShader(VSShader *vs, PSShader *fs, bool useHWTransform);
+	~LinkedShader();
+
+	// Flushes dirty uniforms and binds both shaders on the device.
+	void use();
+	void stop();
+	// Uploads every uniform flagged in dirtyUniforms, then clears the mask.
+	void updateUniforms();
+
+	// Set to false if the VS failed, happens on Mali-400 a lot for complex shaders.
+	bool useHWTransform_;
+
+	// Not owned: the destructor does not delete them.
+	VSShader *m_vs;
+	PSShader *m_fs;
+
+	// Bitmask of DIRTY_* flags.
+	u32 dirtyUniforms;
+
+	// Pre-fetched attrs and uniforms
+	// NOTE(review): the a_* attribute slots are never assigned by the
+	// constructor (their lookups are commented out) - confirm they are unused.
+	int a_position;
+	int a_color0;
+	int a_color1;
+	int a_texcoord;
+	int a_normal;
+	int a_weight0123;
+	int a_weight4567;
+
+	int u_tex;
+	int u_proj;
+	int u_proj_through;
+	int u_texenv;
+	int u_view;
+	int u_texmtx;
+	int u_world;
+#ifdef USE_BONE_ARRAY
+	int u_bone;  // array, size is numBones
+#else
+	int u_bone[8];
+#endif
+	int numBones;
+	
+	// Fragment processing inputs
+	int u_alphacolorref;
+	int u_colormask;
+	int u_fogcolor;
+	int u_fogcoef;
+
+	// Texturing
+	int u_uvscaleoffset;
+
+	// Lighting
+	int u_ambient;
+	int u_matambientalpha;
+	int u_matdiffuse;
+	int u_matspecular;
+	int u_matemissive;
+	int u_lightpos[4];
+	int u_lightdir[4];
+	int u_lightatt[4];  // attenuation
+	int u_lightangle[4]; // spotlight cone angle (cosine)
+	int u_lightspotCoef[4]; // spotlight dropoff
+	int u_lightdiffuse[4];  // diffuse color, one handle per light
+	int u_lightspecular[4];  // specular color, one handle per light
+	int u_lightambient[4];  // ambient color, one handle per light
+};
+
+// Will reach 32 bits soon :P
+// One bit per uniform (or uniform group) that may need re-uploading. These are
+// combined into LinkedShader::dirtyUniforms and the ShaderManager dirty masks.
+// Bits 19 and 20 are currently unassigned.
+enum
+{
+	DIRTY_PROJMATRIX = (1 << 0),
+	DIRTY_PROJTHROUGHMATRIX = (1 << 1),
+	DIRTY_FOGCOLOR	 = (1 << 2),
+	DIRTY_FOGCOEF    = (1 << 3),
+	DIRTY_TEXENV		 = (1 << 4),
+	DIRTY_ALPHACOLORREF	 = (1 << 5),
+	DIRTY_COLORREF	 = (1 << 6),
+	DIRTY_COLORMASK	 = (1 << 7),
+	// Lights 0-3 occupy consecutive bits so code can use DIRTY_LIGHT0 << i.
+	DIRTY_LIGHT0 = (1 << 8),
+	DIRTY_LIGHT1 = (1 << 9),
+	DIRTY_LIGHT2 = (1 << 10),
+	DIRTY_LIGHT3 = (1 << 11),
+
+	DIRTY_MATDIFFUSE = (1 << 12),
+	DIRTY_MATSPECULAR = (1 << 13),
+	DIRTY_MATEMISSIVE = (1 << 14),
+	DIRTY_AMBIENT = (1 << 15),
+	DIRTY_MATAMBIENTALPHA = (1 << 16),
+	DIRTY_MATERIAL = (1 << 17),  // let's set all 4 together (emissive ambient diffuse specular). We hide specular coef in specular.a
+	DIRTY_UVSCALEOFFSET = (1 << 18),  // this will be dirtied ALL THE TIME... maybe we'll need to do "last value with this shader compares"
+
+	DIRTY_WORLDMATRIX = (1 << 21),
+	DIRTY_VIEWMATRIX = (1 << 22),  // Maybe we'll fold this into projmatrix eventually
+	DIRTY_TEXMATRIX = (1 << 23),
+	// Bones 0-7 occupy consecutive bits so code can use DIRTY_BONEMATRIX0 << i.
+	DIRTY_BONEMATRIX0 = (1 << 24),
+	DIRTY_BONEMATRIX1 = (1 << 25),
+	DIRTY_BONEMATRIX2 = (1 << 26),
+	DIRTY_BONEMATRIX3 = (1 << 27),
+	DIRTY_BONEMATRIX4 = (1 << 28),
+	DIRTY_BONEMATRIX5 = (1 << 29),
+	DIRTY_BONEMATRIX6 = (1 << 30),
+	DIRTY_BONEMATRIX7 = (1 << 31),
+
+	DIRTY_ALL = 0xFFFFFFFF
+};
+
+// Real public interface
+
+// A compiled pixel shader together with the source it was built from.
+class PSShader {
+public:
+	// Compiles `code`; check Failed() afterwards.
+	PSShader(const char *code, bool useHWTransform);
+	~PSShader();
+
+	// Source text passed to the constructor.
+	const std::string &source() const { return source_; }
+
+	// True when compilation failed; `shader` is NULL in that case.
+	bool Failed() const { return failed_; }
+	bool UseHWTransform() const { return useHWTransform_; }
+	
+	// Compiled shader object; released by the destructor when non-NULL.
+	LPDIRECT3DPIXELSHADER9 shader;
+	// Constant table filled in by the compile; used to look up and set uniforms.
+	LPD3DXCONSTANTTABLE constant;
+protected:	
+	std::string source_;
+	bool failed_;
+	bool useHWTransform_;
+};
+
+// A compiled vertex shader together with the source it was built from.
+class VSShader {
+public:
+	// Compiles `code`; check Failed() afterwards.
+	VSShader(const char *code, bool useHWTransform);
+	~VSShader();
+
+	// Source text passed to the constructor.
+	const std::string &source() const { return source_; }
+
+	// True when compilation failed; `shader` is NULL in that case.
+	bool Failed() const { return failed_; }
+	// Whether this shader was generated for hardware transform.
+	bool UseHWTransform() const { return useHWTransform_; }
+	
+	// Compiled shader object; released by the destructor when non-NULL.
+	LPDIRECT3DVERTEXSHADER9 shader;
+	// Constant table filled in by the compile; used to look up and set uniforms.
+	LPD3DXCONSTANTTABLE constant;
+protected:	
+	std::string source_;
+	bool failed_;
+	bool useHWTransform_;
+};
+
+// Generates, compiles and caches vertex/pixel shaders keyed by their shader
+// IDs, pairs them into LinkedShader objects and tracks which uniforms need
+// re-uploading.
+class ShaderManager
+{
+public:
+	ShaderManager();
+	~ShaderManager();
+
+	void ClearCache(bool deleteThem);  // TODO: deleteThem currently not respected
+	// Returns the linked shader matching the current state, made active.
+	LinkedShader *ApplyShader(int prim);
+	// Forgets the last applied shader and marks all uniforms dirty.
+	void DirtyShader();
+	// Marks the given DIRTY_* bits as needing upload.
+	void DirtyUniform(u32 what);
+	void EndFrame();  // disables vertex arrays
+
+	int NumVertexShaders() const { return (int)vsCache.size(); }
+	int NumFragmentShaders() const { return (int)fsCache.size(); }
+	int NumPrograms() const { return (int)linkedShaderCache.size(); }
+
+private:
+	// Deletes every cached shader and resets all tracking state.
+	void Clear();
+
+	// Associates a (vertex shader, pixel shader) pair with its LinkedShader.
+	struct LinkedShaderCacheEntry {
+		LinkedShaderCacheEntry(VSShader *vs_, PSShader *fs_, LinkedShader *ls_)
+			: vs(vs_), fs(fs_), ls(ls_) { }
+
+		VSShader *vs;
+		PSShader *fs;
+		LinkedShader *ls;
+	};
+	// Searched linearly by ApplyShader().
+	typedef std::vector<LinkedShaderCacheEntry> LinkedShaderCache;
+
+	LinkedShaderCache linkedShaderCache;
+	// IDs of the shaders selected by the previous ApplyShader() call.
+	FragmentShaderID lastFSID;
+	VertexShaderID lastVSID;
+
+	LinkedShader *lastShader;
+	// Dirty bits accumulated by DirtyUniform() since the last ApplyShader().
+	u32 globalDirty;
+	// Dirty bits still to be handed to the non-active linked shaders on the next switch.
+	u32 shaderSwitchDirty;
+	// Scratch buffer for generated shader source (allocated in the constructor).
+	char *codeBuffer_;
+
+	typedef std::map<FragmentShaderID, PSShader *> FSCache;
+	FSCache fsCache;
+
+	typedef std::map<VertexShaderID, VSShader *> VSCache;
+	VSCache vsCache;
+};
--- a/GPU/Directx9/StateMapping.cpp
+++ b/GPU/Directx9/StateMapping.cpp
@ -0,0 +1,370 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include "StateMapping.h"
+
+#include "GPU/Math3D.h"
+#include "GPU/GPUState.h"
+#include "GPU/ge_constants.h"
+#include "Core/System.h"
+#include "Core/Config.h"
+#include "Core/Reporting.h"
+#include "DisplayListInterpreter.h"
+#include "ShaderManager.h"
+#include "TextureCache.h"
+#include "Framebuffer.h"
+
+// Source blend factor table, indexed by the (clamped) value of gstate.getBlendFuncA().
+// The PSP "2x" factors are mapped to the plain factor here, as noted per entry.
+// The final entry corresponds to the fixed-color mode; ApplyDrawState resolves that
+// case through blendColor2Func() instead of reading this table.
+static const D3DBLEND aLookup[11] = {
+	D3DBLEND_DESTCOLOR,
+	D3DBLEND_INVDESTCOLOR,
+	D3DBLEND_SRCALPHA,
+	D3DBLEND_INVSRCALPHA,
+	D3DBLEND_DESTALPHA,
+	D3DBLEND_INVDESTALPHA,
+	D3DBLEND_SRCALPHA,	// should be 2x
+	D3DBLEND_INVSRCALPHA,	 // should be 2x
+	D3DBLEND_DESTALPHA,	 // should be 2x
+	D3DBLEND_INVDESTALPHA,	 // should be 2x	-	and COLOR?
+	D3DBLEND_BLENDFACTOR,	// FIXA
+};
+
+// Destination blend factor table, indexed by the (clamped) value of gstate.getBlendFuncB().
+// Same layout and same 2x approximation as aLookup.
+static const D3DBLEND bLookup[11] = {
+	D3DBLEND_SRCCOLOR,
+	D3DBLEND_INVSRCCOLOR,
+	D3DBLEND_SRCALPHA,
+	D3DBLEND_INVSRCALPHA,
+	D3DBLEND_DESTALPHA,
+	D3DBLEND_INVDESTALPHA,
+	D3DBLEND_SRCALPHA,	// should be 2x
+	D3DBLEND_INVSRCALPHA,	 // should be 2x
+	D3DBLEND_DESTALPHA,	 // should be 2x
+	D3DBLEND_INVDESTALPHA,	 // should be 2x
+	D3DBLEND_BLENDFACTOR,	// FIXB
+};
+
+// Blend operation table, indexed by gstate.getBlendEq(). Only six entries exist.
+static const D3DBLENDOP eqLookup[] = {
+	D3DBLENDOP_ADD,
+	D3DBLENDOP_SUBTRACT,
+	D3DBLENDOP_REVSUBTRACT,
+	D3DBLENDOP_MIN,
+	D3DBLENDOP_MAX,
+	D3DBLENDOP_ADD, // should be abs(diff)
+};
+
+// Cull mode table. Not referenced by ApplyDrawState below, which passes
+// gstate.getCullMode() straight to dxstate.cullMode.set().
+static const D3DCULL cullingMode[] = {
+	D3DCULL_CW,
+	D3DCULL_CCW,
+};
+
+// Comparison function table, indexed by gstate.getDepthTestFunc() and by
+// gstate.getStencilTestFunction().
+static const D3DCMPFUNC ztests[] = {
+	D3DCMP_NEVER, D3DCMP_ALWAYS, D3DCMP_EQUAL, D3DCMP_NOTEQUAL, 
+	D3DCMP_LESS, D3DCMP_LESSEQUAL, D3DCMP_GREATER, D3DCMP_GREATEREQUAL,
+};
+
+// Stencil operation table, indexed by gstate.getStencilOpSFail() / ZFail() / ZPass().
+// The two reserved slots fall back to KEEP.
+static const D3DSTENCILOP stencilOps[] = {
+	D3DSTENCILOP_KEEP,
+	D3DSTENCILOP_ZERO,
+	D3DSTENCILOP_REPLACE,
+	D3DSTENCILOP_INVERT,
+	D3DSTENCILOP_INCR,
+	D3DSTENCILOP_DECR,  // don't know if these should be wrap or not
+	D3DSTENCILOP_KEEP, // reserved
+	D3DSTENCILOP_KEEP, // reserved
+};
+
+// Maps a fixed blend color to D3DBLEND_ONE or D3DBLEND_ZERO when it is (nearly)
+// white or black. Returns D3DBLEND_UNK for any other color, which tells the caller
+// that a real blend constant is required.
+static u32 blendColor2Func(u32 fix) {
+	// Exact matches are answered without converting to float.
+	switch (fix) {
+	case 0xFFFFFF:
+		return D3DBLEND_ONE;
+	case 0:
+		return D3DBLEND_ZERO;
+	default:
+		break;
+	}
+
+	// Otherwise accept anything within 1% of white / black on every channel.
+	const Vec3f color = Vec3f::FromRGB(fix);
+	const bool nearlyWhite = color.x >= 0.99 && color.y >= 0.99 && color.z >= 0.99;
+	if (nearlyWhite)
+		return D3DBLEND_ONE;
+
+	const bool nearlyBlack = color.x <= 0.01 && color.y <= 0.01 && color.z <= 0.01;
+	if (nearlyBlack)
+		return D3DBLEND_ZERO;
+
+	return D3DBLEND_UNK;
+}
+
+// True when every component of a and b differs by at most `margin`.
+static bool blendColorSimilar(Vec3f a, Vec3f b, float margin = 0.1f) {
+	const Vec3f delta = a - b;
+	return fabsf(delta.x) <= margin
+		&& fabsf(delta.y) <= margin
+		&& fabsf(delta.z) <= margin;
+}
+
+// Maps the current GE register state (gstate / gstate_c) onto the Direct3D 9 state
+// wrapper (dxstate) for the draw that is about to be issued.
+//
+// prim: GE primitive type of the draw. Only consulted for the culling decision
+//       (GE_PRIM_RECTANGLES are never culled).
+//
+// Order of work: texture rebind, blending, dither, then either the clear-mode or the
+// regular cull/depth/color-mask/stencil setup, and finally scissor and viewport.
+void TransformDrawEngine::ApplyDrawState(int prim) {
+	// TODO: All this setup is soon so expensive that we'll need dirty flags, or simply do it in the command writes where we detect dirty by xoring. Silly to do all this work on every drawcall.
+
+	if (gstate_c.textureChanged) {
+		if (gstate.isTextureMapEnabled()) {
+			textureCache_->SetTexture();
+		}
+		gstate_c.textureChanged = false;
+	}
+
+	// TODO: The top bit of the alpha channel should be written to the stencil bit somehow. This appears to require very expensive multipass rendering :( Alternatively, one could do a
+	// single fullscreen pass that converts alpha to stencil (or 2 passes, to set both the 0 and 1 values) very easily.
+
+	// Set blend
+	bool wantBlend = !gstate.isModeClear() && gstate.isAlphaBlendEnabled();
+	dxstate.blend.set(wantBlend);
+	if (wantBlend) {
+		// This can't be done exactly as there are several PSP blend modes that have no fixed-function equivalent.
+		// HOWEVER - we should be able to approximate the 2x modes in the shader, although they will clip wrongly.
+
+		// Examples of seen unimplementable blend states:
+		// Mortal Kombat Unchained: FixA=0000ff FixB=000080 FuncA=10 FuncB=10
+
+		int blendFuncA  = gstate.getBlendFuncA();
+		int blendFuncB  = gstate.getBlendFuncB();
+		int blendFuncEq = gstate.getBlendEq();
+		if (blendFuncA > GE_SRCBLEND_FIXA) blendFuncA = GE_SRCBLEND_FIXA;
+		if (blendFuncB > GE_DSTBLEND_FIXB) blendFuncB = GE_DSTBLEND_FIXB;
+		// eqLookup has fewer entries than the register field may be able to express;
+		// fall back to entry 0 (ADD) rather than reading past the end of the table.
+		if (blendFuncEq < 0 || blendFuncEq >= (int)(sizeof(eqLookup) / sizeof(eqLookup[0])))
+			blendFuncEq = 0;
+
+		// Shortcut by using D3DBLEND_ONE where possible, no need to set blendcolor
+		u32 glBlendFuncA = blendFuncA == GE_SRCBLEND_FIXA ? blendColor2Func(gstate.getFixA()) : aLookup[blendFuncA];
+		u32 glBlendFuncB = blendFuncB == GE_DSTBLEND_FIXB ? blendColor2Func(gstate.getFixB()) : bLookup[blendFuncB];
+		if (blendFuncA == GE_SRCBLEND_FIXA || blendFuncB == GE_DSTBLEND_FIXB) {
+			Vec3f fixA = Vec3f::FromRGB(gstate.getFixA());
+			Vec3f fixB = Vec3f::FromRGB(gstate.getFixB());
+			if (glBlendFuncA == D3DBLEND_UNK && glBlendFuncB != D3DBLEND_UNK) {
+				// Can use blendcolor trivially.
+				const float blendColor[4] = {fixA.x, fixA.y, fixA.z, 1.0f};
+				dxstate.blendColor.set(blendColor);
+				glBlendFuncA = D3DBLEND_BLENDFACTOR;
+			} else if (glBlendFuncA != D3DBLEND_UNK && glBlendFuncB == D3DBLEND_UNK) {
+				// Can use blendcolor trivially.
+				const float blendColor[4] = {fixB.x, fixB.y, fixB.z, 1.0f};
+				dxstate.blendColor.set(blendColor);
+				glBlendFuncB = D3DBLEND_BLENDFACTOR;
+			} else if (glBlendFuncA == D3DBLEND_UNK && glBlendFuncB == D3DBLEND_UNK) {
+				// Only one blend constant is available, so both fixed colors must be
+				// expressible from it: either B ~= 1 - A, or B ~= A.
+				if (blendColorSimilar(fixA, Vec3f::AssignToAll(1.0f) - fixB)) {
+					glBlendFuncA = D3DBLEND_BLENDFACTOR;
+					glBlendFuncB = D3DBLEND_INVBLENDFACTOR;
+					const float blendColor[4] = {fixA.x, fixA.y, fixA.z, 1.0f};
+					dxstate.blendColor.set(blendColor);
+				} else if (blendColorSimilar(fixA, fixB)) {
+					glBlendFuncA = D3DBLEND_BLENDFACTOR;
+					glBlendFuncB = D3DBLEND_BLENDFACTOR;
+					const float blendColor[4] = {fixA.x, fixA.y, fixA.z, 1.0f};
+					dxstate.blendColor.set(blendColor);
+				} else {
+					// Report only once per run; the debug log still records every occurrence.
+					static bool didReportBlend = false;
+					if (!didReportBlend)
+						Reporting::ReportMessage("ERROR INVALID blendcolorstate: FixA=%06x FixB=%06x FuncA=%i FuncB=%i", gstate.getFixA(), gstate.getFixB(), gstate.getBlendFuncA(), gstate.getBlendFuncB());
+					didReportBlend = true;
+
+					DEBUG_LOG(HLE, "ERROR INVALID blendcolorstate: FixA=%06x FixB=%06x FuncA=%i FuncB=%i", gstate.getFixA(), gstate.getFixB(), gstate.getBlendFuncA(), gstate.getBlendFuncB());
+					// Let's approximate, at least.  Close is better than totally off.
+					const bool nearZeroA = blendColorSimilar(fixA, Vec3f::AssignToAll(0.0f), 0.25f);
+					const bool nearZeroB = blendColorSimilar(fixB, Vec3f::AssignToAll(0.0f), 0.25f);
+					if (nearZeroA || blendColorSimilar(fixA, Vec3f::AssignToAll(1.0f), 0.25f)) {
+						glBlendFuncA = nearZeroA ? D3DBLEND_ZERO : D3DBLEND_ONE;
+						glBlendFuncB = D3DBLEND_BLENDFACTOR;
+						const float blendColor[4] = {fixB.x, fixB.y, fixB.z, 1.0f};
+						dxstate.blendColor.set(blendColor);
+					// We need to pick something.  Let's go with A as the fixed color.
+					} else {
+						glBlendFuncA = D3DBLEND_BLENDFACTOR;
+						glBlendFuncB = nearZeroB ? D3DBLEND_ZERO : D3DBLEND_ONE;
+						const float blendColor[4] = {fixA.x, fixA.y, fixA.z, 1.0f};
+						dxstate.blendColor.set(blendColor);
+					}
+				}
+			}
+		}
+
+		// At this point, through all paths above, glBlendFuncA and glBlendFuncB will be set right somehow.
+		dxstate.blendFunc.set(glBlendFuncA, glBlendFuncB);
+		dxstate.blendEquation.set(eqLookup[blendFuncEq]);
+	}
+
+	// Set Dither
+	if (gstate.isDitherEnabled()) {
+		dxstate.dither.enable();
+		dxstate.dither.set(true);
+	} else {
+		dxstate.dither.disable();
+	}
+
+	// Set ColorMask/Stencil/Depth
+	if (gstate.isModeClear()) {
+
+		// Set Cull 
+		dxstate.cullMode.set(false, false);
+		
+		// Depth Test: always pass, write depth only if bit 10 of clearmode is set.
+		bool depthMask = (gstate.clearmode >> 10) & 1;
+		dxstate.depthTest.enable();
+		dxstate.depthFunc.set(D3DCMP_ALWAYS);
+		dxstate.depthWrite.set(depthMask);
+
+		// Color Test: bit 8 selects color writes, bit 9 selects alpha writes.
+		bool colorMask = (gstate.clearmode >> 8) & 1;
+		bool alphaMask = (gstate.clearmode >> 9) & 1;
+		dxstate.colorMask.set(colorMask, colorMask, colorMask, alphaMask);
+
+		// Stencil Test
+		if (alphaMask) {
+			dxstate.stencilTest.enable();
+			dxstate.stencilOp.set(D3DSTENCILOP_REPLACE, D3DSTENCILOP_REPLACE, D3DSTENCILOP_REPLACE);
+			dxstate.stencilFunc.set(D3DCMP_ALWAYS, 0, 0xFF);
+		} else {
+			// Turn off the stencil test, not the depth test: disabling depth here
+			// would undo the depth setup made just above.
+			dxstate.stencilTest.disable();
+		}
+
+	} else {
+		
+		// Set cull
+		bool wantCull = !gstate.isModeThrough() && prim != GE_PRIM_RECTANGLES && gstate.isCullEnabled();
+		dxstate.cullMode.set(wantCull, gstate.getCullMode());	
+
+		// Depth Test
+		if (gstate.isDepthTestEnabled()) {
+			dxstate.depthTest.enable();
+			dxstate.depthFunc.set(ztests[gstate.getDepthTestFunc()]);
+			dxstate.depthWrite.set(gstate.isDepthWriteEnabled());
+		} else {
+			dxstate.depthTest.disable();
+		}
+
+		// PSP color/alpha mask is per bit but we can only support per byte.
+		// But let's do that, at least. And let's try a threshold.
+		bool rmask = (gstate.pmskc & 0xFF) < 128;
+		bool gmask = ((gstate.pmskc >> 8) & 0xFF) < 128;
+		bool bmask = ((gstate.pmskc >> 16) & 0xFF) < 128;
+		bool amask = (gstate.pmska & 0xFF) < 128;
+		dxstate.colorMask.set(rmask, gmask, bmask, amask);
+		
+		// Stencil Test
+		if (gstate.isStencilTestEnabled()) {
+			dxstate.stencilTest.enable();
+			dxstate.stencilFunc.set(ztests[gstate.getStencilTestFunction()],
+				gstate.getStencilTestRef(),
+				gstate.getStencilTestMask());
+			dxstate.stencilOp.set(stencilOps[gstate.getStencilOpSFail()],  // stencil fail
+				stencilOps[gstate.getStencilOpZFail()],  // depth fail
+				stencilOps[gstate.getStencilOpZPass()]); // depth pass
+		} else {
+			dxstate.stencilTest.disable();
+		}
+	}
+
+	// Work out where PSP pixels land in the render target and how much they are scaled.
+	float renderWidthFactor, renderHeightFactor;
+	float renderWidth, renderHeight;
+	float renderX, renderY;
+	bool useBufferedRendering = g_Config.iRenderingMode != 0 ? 1 : 0;
+	if (useBufferedRendering) {
+		renderX = 0.0f;
+		renderY = 0.0f;
+		renderWidth = framebufferManager_->GetRenderWidth();
+		renderHeight = framebufferManager_->GetRenderHeight();
+		renderWidthFactor = (float)renderWidth / framebufferManager_->GetTargetWidth();
+		renderHeightFactor = (float)renderHeight / framebufferManager_->GetTargetHeight();
+	} else {
+		// TODO: Aspect-ratio aware and centered
+		float pixelW = PSP_CoreParameter().pixelWidth;
+		float pixelH = PSP_CoreParameter().pixelHeight;
+		CenterRect(&renderX, &renderY, &renderWidth, &renderHeight, 480, 272, pixelW, pixelH);
+		renderWidthFactor = renderWidth / 480.0f;
+		renderHeightFactor = renderHeight / 272.0f;
+	}
+
+	bool throughmode = (gstate.vertType & GE_VTYPE_THROUGH_MASK) != 0;
+
+	// Scissor
+	int scissorX1 = (gstate.getScissorX1());
+	int scissorY1 = (gstate.getScissorY1());
+	int scissorX2 = (gstate.getScissorX2());
+	int scissorY2 = (gstate.getScissorY2());
+
+	// This is a bit of a hack as the render buffer isn't always that size
+	if (scissorX1 == 0 && scissorY1 == 0 
+		&& scissorX2 >= (int) (gstate_c.curRTWidth - 1) 
+		&& scissorY2 >= (int) (gstate_c.curRTHeight - 1)) {
+		dxstate.scissorTest.disable();
+	} else {
+		dxstate.scissorTest.enable();
+		// Horizontal edges are offset by renderX, vertical edges by renderY.
+		dxstate.scissorRect.set(
+			renderX + scissorX1 * renderWidthFactor,
+			renderY + scissorY1 * renderHeightFactor,
+			renderX + scissorX2 * renderWidthFactor,
+			renderY + scissorY2 * renderHeightFactor);
+	}
+
+	/*
+	int regionX1 = gstate.region1 & 0x3FF;
+	int regionY1 = (gstate.region1 >> 10) & 0x3FF;
+	int regionX2 = (gstate.region2 & 0x3FF) + 1;
+	int regionY2 = ((gstate.region2 >> 10) & 0x3FF) + 1;
+	*/
+	int regionX1 = 0;
+	int regionY1 = 0;
+	int regionX2 = gstate_c.curRTWidth;
+	int regionY2 = gstate_c.curRTHeight;
+
+	// The offset registers are read as 12.4 fixed point (low 16 bits divided by 16).
+	float offsetX = (float)(gstate.offsetx & 0xFFFF) / 16.0f;
+	float offsetY = (float)(gstate.offsety & 0xFFFF) / 16.0f;
+
+	if (throughmode) {
+		// No viewport transform here. Let's experiment with using region.
+		dxstate.viewport.set(
+			renderX + (0 + regionX1) * renderWidthFactor, 
+			renderY + (0 - regionY1) * renderHeightFactor,
+			(regionX2 - regionX1) * renderWidthFactor,
+			(regionY2 - regionY1) * renderHeightFactor,
+			0.f, 1.f);
+	} else {
+		// These we can turn into a viewport rectangle, offset by offsetX and offsetY. Math after.
+		float vpXa = getFloat24(gstate.viewportx1);
+		float vpXb = getFloat24(gstate.viewportx2);
+		float vpYa = getFloat24(gstate.viewporty1);
+		float vpYb = getFloat24(gstate.viewporty2);
+
+		// The viewport transform appears to go like this: 
+		// Xscreen = -offsetX + vpXb + vpXa * Xview
+		// Yscreen = -offsetY + vpYb + vpYa * Yview
+		// Zscreen = vpZb + vpZa * Zview
+
+		// The signed extents are kept in gstate_c for use elsewhere; the viewport
+		// rectangle itself uses their magnitudes.
+		gstate_c.vpWidth = vpXa * 2.0f;
+		gstate_c.vpHeight = -vpYa * 2.0f;
+
+		float vpWidth = fabsf(gstate_c.vpWidth) * renderWidthFactor;
+		float vpHeight = fabsf(gstate_c.vpHeight) * renderHeightFactor;
+
+		float vpX0 = (vpXb - offsetX - fabsf(vpXa)) * renderWidthFactor;
+		// Flip vpY0 (carried over from the OpenGL coordinate convention).
+		// NOTE(review): Direct3D 9 viewports are top-left based - confirm this flip is
+		// compensated for elsewhere (e.g. in the projection matrix).
+		float vpY0 = renderHeight - (vpYb - offsetY + fabsf(vpYa)) * renderHeightFactor;
+		
+		// Sadly, as the viewport takes integers, we will not be able to support sub pixel offsets this way. But meh.
+		// shaderManager_->DirtyUniform(DIRTY_PROJMATRIX);
+
+		float zScale = getFloat24(gstate.viewportz1) / 65535.0f;
+		float zOff = getFloat24(gstate.viewportz2) / 65535.0f;
+		float depthRangeMin = zOff - zScale;
+		float depthRangeMax = zOff + zScale;
+
+		dxstate.viewport.set(vpX0 + renderX, vpY0 + renderY, vpWidth, vpHeight, depthRangeMin, depthRangeMax);
+	}
+}
--- a/GPU/Directx9/StateMapping.h
+++ b/GPU/Directx9/StateMapping.h
@ -0,0 +1,5 @@
+#pragma once
+
+#include "helper/global.h"
+#include "helper/dx_state.h"
+//#include "../native/gfx/gl_common.h"
--- a/GPU/Directx9/TextureCache.cpp
+++ b/GPU/Directx9/TextureCache.cpp
--- a/GPU/Directx9/TextureCache.h
+++ b/GPU/Directx9/TextureCache.h
@ -0,0 +1,151 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include "../Globals.h"
+#include "helper/global.h"
+#include "helper/fbo.h"
+#include "GPU/GPUInterface.h"
+#include "GPU/GPUState.h"
+#include "TextureScaler.h"
+
+struct VirtualFramebuffer;
+
+// Texture filtering mode selector values (numbering starts at 1).
+// NOTE(review): presumably mirrors a user-facing configuration setting - confirm
+// against the code that reads it.
+enum TextureFiltering {
+	AUTO = 1,
+	NEAREST = 2,
+	LINEAR = 3,   
+	LINEARFMV = 4,
+};
+// Cache of Direct3D 9 textures created from PSP texture memory, plus the CLUT
+// (palette) state and temporary decode buffers needed to build them.
+class TextureCache 
+{
+public:
+	TextureCache();
+	~TextureCache();
+
+	// NOTE(review): presumably looks up or creates the texture for the current GE
+	// state and binds it - confirm in TextureCache.cpp.
+	void SetTexture();
+
+	void Clear(bool delete_them);
+	void StartFrame();
+	void Invalidate(u32 addr, int size, GPUInvalidationType type);
+	void InvalidateAll(GPUInvalidationType type);
+	void ClearNextFrame();
+	void LoadClut();
+
+	// FramebufferManager keeps TextureCache updated about what regions of memory
+	// are being rendered to. This is barebones so far.
+	void NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer);
+	void NotifyFramebufferDestroyed(u32 address, VirtualFramebuffer *framebuffer);
+
+	// Number of entries in the primary cache (secondCache is not counted).
+	size_t NumLoadedTextures() const {
+		return cache.size();
+	}
+
+	// Only used by Qt UI?
+	bool DecodeTexture(u8 *output, GPUgstate state);
+
+private:
+	// Wow this is starting to grow big. Soon need to start looking at resizing it.
+	// Must stay a POD.
+	struct TexCacheEntry {
+		// After marking STATUS_UNRELIABLE, if it stays the same this many frames we'll trust it again.
+		const static int FRAMES_REGAIN_TRUST = 1000;
+
+		// Bit flags stored in `status`: the low two bits (STATUS_MASK) hold the
+		// hashing state, the next two bits (STATUS_ALPHA_MASK) hold the alpha state.
+		enum Status {
+			STATUS_HASHING = 0x00,
+			STATUS_RELIABLE = 0x01,  // cache, don't hash
+			STATUS_UNRELIABLE = 0x02,  // never cache
+			STATUS_MASK = 0x03,
+
+			STATUS_ALPHA_UNKNOWN = 0x04,
+			STATUS_ALPHA_FULL = 0x00,  // Has no alpha channel, or always full alpha.
+			STATUS_ALPHA_SIMPLE = 0x08,  // Like above, but also has 0 alpha (e.g. 5551.)
+			STATUS_ALPHA_MASK = 0x0c,
+		};
+
+		// Status, but int so we can zero initialize.
+		int status;
+		u32 addr;
+		u32 hash;
+		VirtualFramebuffer *framebuffer;  // if null, not sourced from an FBO.
+		u32 sizeInRAM;
+		int lastFrame;
+		int numFrames;
+		int numInvalidated;
+		u32 framesUntilNextFullHash;
+		u8 format;
+		u16 dim;
+		u16 bufw;
+		LPDIRECT3DTEXTURE9 texture;  //GLuint
+		int invalidHint;
+		u32 fullhash;
+		u32 cluthash;
+		int maxLevel;
+		float lodBias;
+
+		// Cache the current filter settings so we can avoid setting it again.
+		// (OpenGL madness where filter settings are attached to each texture).
+		u8 magFilt;
+		u8 minFilt;
+		bool sClamp;
+		bool tClamp;
+
+		// NOTE(review): presumably compares dim, format and maxLevel against this entry - confirm.
+		bool Matches(u16 dim2, u8 format2, int maxLevel2);
+	};
+
+	void Decimate();  // Run this once per frame to get rid of old textures.
+	void *UnswizzleFromMem(u32 texaddr, u32 bufw, u32 bytesPerPixel, u32 level);
+	void *readIndexedTex(int level, u32 texaddr, int bytesPerIndex, u32 dstFmt);
+	void UpdateSamplingParams(TexCacheEntry &entry, bool force);
+	void LoadTextureLevel(TexCacheEntry &entry, int level, bool replaceImages);
+	void *DecodeTextureLevel(GETextureFormat format, GEPaletteFormat clutformat, int level, u32 &texByteAlign, u32 &dstFmt);
+	void CheckAlpha(TexCacheEntry &entry, u32 *pixelData, u32 dstFmt, int w, int h);
+	template <typename T>
+	const T *GetCurrentClut();
+	u32 GetCurrentClutHash();
+	void UpdateCurrentClut();
+
+	TexCacheEntry *GetEntryAt(u32 texaddr);
+
+	// Entries keyed by a 64-bit value.
+	// NOTE(review): key composition is not visible here - confirm in TextureCache.cpp.
+	typedef std::map<u64, TexCacheEntry> TexCache;
+	TexCache cache;
+	TexCache secondCache;
+
+	bool clearCacheNextFrame_;
+	bool lowMemoryMode_;
+	TextureScaler scaler;
+
+	// Scratch buffers used while decoding textures.
+	SimpleBuf<u32> tmpTexBuf32;
+	SimpleBuf<u16> tmpTexBuf16;
+
+	SimpleBuf<u32> tmpTexBufRearrange;
+
+	// Current CLUT (palette) state.
+	u32 clutLastFormat_;
+	u32 *clutBufRaw_;
+	u32 *clutBufConverted_;
+	u32 *clutBuf_;
+	u32 clutHash_;
+	u32 clutTotalBytes_;
+	// True if the clut is just alpha values in the same order (RGBA4444-bit only.)
+	bool clutAlphaLinear_;
+	u16 clutAlphaLinearColor_;
+
+	LPDIRECT3DTEXTURE9 lastBoundTexture;
+	float maxAnisotropyLevel;
+};
+
--- a/GPU/Directx9/TextureScaler.cpp
+++ b/GPU/Directx9/TextureScaler.cpp
@ -0,0 +1,676 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include "TextureScaler.h"
+
+#include "Core/Config.h"
+#include "Common/Common.h"
+#include "Common/Log.h"
+#include "Common/MsgHandler.h"
+#include "Common/CommonFuncs.h"
+#include "Common/ThreadPools.h"
+#include "Common/CPUDetect.h"
+#include "ext/xbrz/xbrz.h"
+#include <stdlib.h>
+#include <math.h>
+
+#if _M_SSE >= 0x402
+#include <nmmintrin.h>
+#endif
+
+// Report the time and throughput for each larger scaling operation in the log
+//#define SCALING_MEASURE_TIME
+
+#ifdef SCALING_MEASURE_TIME
+#include "native/base/timeutil.h"
+#endif
+
+/////////////////////////////////////// Helper Functions (mostly math for parallelization)
+
+namespace {
+	//////////////////////////////////////////////////////////////////// Color space conversion
+
+	// Expands rows [l, u) of a 16-bit 4444 image into 32-bit 8888 pixels.
+	// Rows are independent, so different row ranges can be processed in parallel.
+	void convert4444(u16* data, u32* out, int width, int l, int u) {
+		for(int row = l; row < u; ++row) {
+			const u16* src = data + row*width;
+			u32* dst = out + row*width;
+			for(int col = 0; col < width; ++col) {
+				const u32 px = src[col];
+				// A 4-bit value n becomes n*17, i.e. 0x0..0xF maps onto 0x00..0xFF.
+				const u32 red   = ((px>>12) & 0xF) * 17;
+				const u32 green = ((px>> 8) & 0xF) * 17;
+				const u32 blue  = ((px>> 4) & 0xF) * 17;
+				const u32 alpha = ((px>> 0) & 0xF) * 17;
+				dst[col] = red | (green << 8) | (blue << 16) | (alpha << 24);
+			}
+		}
+	}
+
+	// Expands rows [l, u) of a 16-bit 565 image into opaque 32-bit 8888 pixels.
+	// Rows are independent, so different row ranges can be processed in parallel.
+	void convert565(u16* data, u32* out, int width, int l, int u) {
+		for(int row = l; row < u; ++row) {
+			const u16* src = data + row*width;
+			u32* dst = out + row*width;
+			for(int col = 0; col < width; ++col) {
+				const u32 px = src[col];
+				const u32 red   = Convert5To8((px>>11) & 0x1F);
+				const u32 green = Convert6To8((px>> 5) & 0x3F);
+				const u32 blue  = Convert5To8((px    ) & 0x1F);
+				// There is no alpha in the source; the output is fully opaque.
+				dst[col] = (0xFF << 24) | (blue << 16) | (green << 8) | red;
+			}
+		}
+	}
+
+	// Expands rows [l, u) of a 16-bit 5551 image into 32-bit 8888 pixels.
+	// Rows are independent, so different row ranges can be processed in parallel.
+	void convert5551(u16* data, u32* out, int width, int l, int u) {
+		for(int row = l; row < u; ++row) {
+			const u16* src = data + row*width;
+			u32* dst = out + row*width;
+			for(int col = 0; col < width; ++col) {
+				const u32 px = src[col];
+				const u32 red   = Convert5To8((px>>11) & 0x1F);
+				const u32 green = Convert5To8((px>> 6) & 0x1F);
+				const u32 blue  = Convert5To8((px>> 1) & 0x1F);
+				// The single alpha bit becomes either 0 or 255.
+				const u32 alpha = (px & 0x1) * 255;
+				dst[col] = red | (green << 8) | (blue << 16) | (alpha << 24);
+			}
+		}
+	}
+
+	//////////////////////////////////////////////////////////////////// Various image processing
+
+	// Channel extractors for a packed 32-bit pixel: R is the low byte, A the high byte.
+	// The argument is parenthesized so compound expressions (e.g. `a | b`) work as expected.
+	#define R(_col) (((_col)>> 0)&0xFF)
+	#define G(_col) (((_col)>> 8)&0xFF)
+	#define B(_col) (((_col)>>16)&0xFF)
+	#define A(_col) (((_col)>>24)&0xFF)
+
+	// Sum of the absolute per-channel differences between two packed pixels.
+	#define DISTANCE(_p1,_p2) ( abs(static_cast<int>(static_cast<int>(R(_p1))-R(_p2))) + abs(static_cast<int>(static_cast<int>(G(_p1))-G(_p2))) \
+							  + abs(static_cast<int>(static_cast<int>(B(_p1))-B(_p2))) + abs(static_cast<int>(static_cast<int>(A(_p1))-A(_p2))) )
+	
+	// Per-channel weighted mix of two packed pixels: (_p0*_factors[0] + _p1*_factors[1]) / 255.
+	// The whole expansion is wrapped in parentheses so it can be used inside larger
+	// expressions (comparisons, arithmetic) without precedence surprises.
+	// this is sadly much faster than an inline function with a loop, at least in VC10
+	#define MIX_PIXELS(_p0, _p1, _factors) \
+		( ( (R(_p0)*(_factors)[0] + R(_p1)*(_factors)[1])/255 <<  0 ) | \
+		  ( (G(_p0)*(_factors)[0] + G(_p1)*(_factors)[1])/255 <<  8 ) | \
+		  ( (B(_p0)*(_factors)[0] + B(_p1)*(_factors)[1])/255 << 16 ) | \
+		  ( (A(_p0)*(_factors)[0] + A(_p1)*(_factors)[1])/255 << 24 ) )
+
+	// Edge length, in pixels, of the tiles the image filters below iterate over.
+	#define BLOCK_SIZE 32
+	
+	// Applies a 3x3 integer kernel to rows [l, u) of a width x height image.
+	// Samples outside the image are clamped to the nearest edge pixel (Neumann
+	// boundary), and the absolute value of the accumulated sum is stored.
+	// The image is walked in BLOCK_SIZE x BLOCK_SIZE tiles. Rows are independent, so
+	// row ranges can be processed in parallel.
+	// Quite slow; separable kernels in particular could be handled much faster.
+	void convolve3x3(u32* data, u32* out, const int kernel[3][3], int width, int height, int l, int u) {
+		const int rowTiles = (u-l)/BLOCK_SIZE+1;
+		const int colTiles = width/BLOCK_SIZE+1;
+		for(int ty = 0; ty < rowTiles; ++ty) {
+			const int yBegin = l+ty*BLOCK_SIZE;
+			const int yEnd = std::min(yBegin+BLOCK_SIZE, u);
+			for(int tx = 0; tx < colTiles; ++tx) {
+				const int xBegin = tx*BLOCK_SIZE;
+				const int xEnd = std::min(xBegin+BLOCK_SIZE, width);
+				for(int y = yBegin; y < yEnd; ++y) {
+					for(int x = xBegin; x < xEnd; ++x) {
+						int acc = 0;
+						for(int ky = 0; ky < 3; ++ky) {
+							// clamp the sampled row into the image
+							const int sy = std::max(std::min(y+ky-1, height-1), 0);
+							for(int kx = 0; kx < 3; ++kx) {
+								// clamp the sampled column into the image
+								const int sx = std::max(std::min(x+kx-1, width-1), 0);
+								acc += data[sy*width + sx] * kernel[ky][kx];
+							}
+						}
+						out[y*width + x] = abs(acc);
+					}
+				}
+			}
+		}
+	}
+
+	// Horizontal deposterization pass over rows [l, u) of an image of width w.
+	// Smooths posterized gradients from low-color-depth (e.g. 444, 565, compressed) sources:
+	// for each channel, if the left and right neighbours differ, the centre equals one
+	// of them, and the other is within T of the centre, the channel is replaced by the
+	// average of the two neighbours. The first and last column are copied unchanged.
+	void deposterizeH(u32* data, u32* out, int w, int l, int u) {
+		static const int T = 8;
+		for(int y = l; y < u; ++y) {
+			for(int x = 0; x < w; ++x) {
+				const int pos = y*w + x;
+				const u32 center = data[pos];
+				if(x==0 || x==w-1) {
+					// no horizontal neighbour on one side
+					out[pos] = center;
+					continue;
+				}
+				const u32 left  = data[pos - 1];
+				const u32 right = data[pos + 1];
+				u32 result = 0;
+				for(int c=0; c<4; ++c) {
+					const int shift = c*8;
+					const u8 lc = ((  left>>shift)&0xFF);
+					const u8 cc = ((center>>shift)&0xFF);
+					const u8 rc = (( right>>shift)&0xFF);
+					const bool stepOnRight = (lc == cc) && abs((int)((int)rc)-cc) <= T;
+					const bool stepOnLeft  = (rc == cc) && abs((int)((int)lc)-cc) <= T;
+					if((lc != rc) && (stepOnRight || stepOnLeft)) {
+						// small step next to a flat area: blend this component
+						result |= ((rc+lc)/2) << shift;
+					} else {
+						// keep this component as it is
+						result |= cc << shift;
+					}
+				}
+				out[pos] = result;
+			}
+		}
+	}
+	// Vertical deposterization pass over rows [l, u) of a w x h image; same
+	// per-channel rule as the horizontal pass but using the pixels above and below.
+	// The first and last image row are copied unchanged. Columns are processed in
+	// strips of BLOCK_SIZE.
+	void deposterizeV(u32* data, u32* out, int w, int h, int l, int u) {
+		static const int T = 8;
+		const int strips = w/BLOCK_SIZE+1;
+		for(int strip = 0; strip < strips; ++strip) {
+			const int xBegin = strip*BLOCK_SIZE;
+			const int xEnd = std::min(xBegin+BLOCK_SIZE, w);
+			for(int y = l; y < u; ++y) {
+				for(int x = xBegin; x < xEnd; ++x) {
+					const int pos = y*w + x;
+					const u32 center = data[pos];
+					if(y==0 || y==h-1) {
+						// no vertical neighbour on one side
+						out[pos] = center;
+						continue;
+					}
+					const u32 upper = data[(y-1) * w + x];
+					const u32 lower = data[(y+1) * w + x];
+					u32 result = 0;
+					for(int c=0; c<4; ++c) {
+						const int shift = c*8;
+						const u8 uc = (( upper>>shift)&0xFF);
+						const u8 cc = ((center>>shift)&0xFF);
+						const u8 lc = (( lower>>shift)&0xFF);
+						const bool stepBelow = (uc == cc) && abs((int)((int)lc)-cc) <= T;
+						const bool stepAbove = (lc == cc) && abs((int)((int)uc)-cc) <= T;
+						if((uc != lc) && (stepBelow || stepAbove)) {
+							// small step next to a flat area: blend this component
+							result |= ((lc+uc)/2) << shift;
+						} else {
+							// keep this component as it is
+							result |= cc << shift;
+						}
+					}
+					out[pos] = result;
+				}
+			}
+		}
+	}
+
+	// Writes, for every pixel in rows [l, u), the summed DISTANCE between that pixel
+	// and its eight neighbours; higher values mean the pixel differs more from its
+	// surroundings. Neighbours outside the image contribute a fixed 400 each (a whole
+	// missing row counts as 1200), which usually gives better results at borders.
+	// The image is walked in BLOCK_SIZE x BLOCK_SIZE tiles.
+	void generateDistanceMask(u32* data, u32* out, int width, int height, int l, int u) {
+		const int rowTiles = (u-l)/BLOCK_SIZE+1;
+		const int colTiles = width/BLOCK_SIZE+1;
+		for(int ty = 0; ty < rowTiles; ++ty) {
+			const int yBegin = l+ty*BLOCK_SIZE;
+			const int yEnd = std::min(yBegin+BLOCK_SIZE, u);
+			for(int tx = 0; tx < colTiles; ++tx) {
+				const int xBegin = tx*BLOCK_SIZE;
+				const int xEnd = std::min(xBegin+BLOCK_SIZE, width);
+				for(int y = yBegin; y < yEnd; ++y) {
+					for(int x = xBegin; x < xEnd; ++x) {
+						// accumulate directly in the output slot, as the original does
+						u32& total = out[y*width + x];
+						total = 0;
+						const u32 center = data[y*width + x];
+						for(int dy = -1; dy <= 1; ++dy) {
+							const int ny = y+dy;
+							if(ny == height || ny == -1) {
+								total += 1200; // whole neighbour row is outside the image
+								continue;
+							}
+							for(int dx = -1; dx <= 1; ++dx) {
+								if(dy == 0 && dx == 0) continue; // skip the pixel itself
+								const int nx = x+dx;
+								if(nx == width || nx == -1) {
+									total += 400; // neighbour column is outside the image
+									continue;
+								}
+								total += DISTANCE(data[ny*width + nx], center);
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	// Blends `source` into `data` in place for rows [l, u), pixel by pixel.
+	// The weight given to `source` is min(mask, maskmax) / maskmax, and `data` gets
+	// the remainder. Where `source` is fully transparent, the result's alpha is
+	// forced to 0 as well.
+	void mix(u32* data, u32* source, u32* mask, u32 maskmax, int width, int l, int u) {
+		for(int y = l; y < u; ++y) {
+			for(int x = 0; x < width; ++x) {
+				const int idx = y*width + x;
+				const u8 sourceWeight = static_cast<u8>((std::min(mask[idx], maskmax)*255)/maskmax);
+				const u8 weights[2] = { static_cast<u8>(255-sourceWeight), sourceWeight };
+				data[idx] = MIX_PIXELS(data[idx], source[idx], weights);
+				// xBRZ always does a better job with hard alpha
+				if(A(source[idx]) == 0) data[idx] &= 0x00FFFFFF;
+			}
+		}
+	}
+
+	//////////////////////////////////////////////////////////////////// Bicubic scaling
+	
+	// Value of a Mitchell-Netravali reconstruction spline at distance x, with
+	// parameters B and C. The kernel is symmetric, so only |x| matters; it is zero
+	// for |x| >= 2.
+	// B=1 C=0   : cubic B spline (very smooth)
+	// B=C=1/3   : recommended for general upscaling
+	// B=0 C=1/2 : Catmull-Rom spline (sharp, ringing)
+	// see Mitchell & Netravali, "Reconstruction Filters in Computer Graphics"
+	inline float mitchell(float x, float B, float C) {
+		// Evaluate the polynomials on |x| so negative distances give the same weight
+		// as positive ones (the odd-power terms would otherwise flip sign).
+		const float ax = fabsf(x);
+		if(ax>=2.0f) return 0.0f;
+		const float ax2 = ax*ax;
+		const float ax3 = ax2*ax;
+		if(ax>=1.0f) return ((-B-6*C)*ax3 + (6*B+30*C)*ax2 + (-12*B-48*C)*ax + (8*B+24*C))/6.0f;
+		return ((12-9*B-6*C)*ax3 + (-18+12*B+6*C)*ax2 + (6-2*B))/6.0f;
+	}
+
+	// arrays for pre-calculating weights and sums (~20KB)
+	// Index order, outermost first (filled in by initBicubicWeights):
+	//   [type]        0 = BSpline, 1 = mitchell
+	//   [factor-2]    2-5x scaling
+	//   [x][y]        position of the generated pixel inside its factor x factor cell
+	//                 (sized 5x5; only the first `factor` entries of each are used)
+	//   [sx+2][sy+2]  offset (-2..2) of the source pixel sampled from (bicubicWeights only)
+	// bicubicInvSums holds 1/sum of the 25 weights for each generated pixel position.
+	float bicubicWeights[2][4][5][5][5][5];
+	float bicubicInvSums[2][4][5][5];
+
+	// Fills bicubicWeights and bicubicInvSums for both spline types and for scaling
+	// factors 2 to 5. Each weight is the spline value at the Euclidean distance
+	// between the centre of the generated pixel and the centre of the sampled source
+	// pixel, measured in source pixels.
+	void initBicubicWeights() {
+		const float splineB[2] = { 1.0f, 0.334f };
+		const float splineC[2] = { 0.0f, 0.334f };
+		for(int type=0; type<2; ++type) {
+			for(int factor=2; factor<=5; ++factor) {
+				const int slot = factor-2;
+				for(int x=0; x<factor; ++x) {
+					// centre of the generated pixel, in source pixel units
+					const float px = (x+0.5f)/factor;
+					for(int y=0; y<factor; ++y) {
+						const float py = (y+0.5f)/factor;
+						float total = 0.0f;
+						for(int sx = -2; sx <= 2; ++sx) {
+							const float dx = px - (sx+0.5f);
+							for(int sy = -2; sy <= 2; ++sy) {
+								const float dy = py - (sy+0.5f);
+								const float dist = sqrt(dx*dx + dy*dy);
+								const float weight = mitchell(dist, splineB[type], splineC[type]);
+								bicubicWeights[type][slot][x][y][sx+2][sy+2] = weight;
+								total += weight;
+							}
+						}
+						// stored inverted so the scaler can normalize with a multiply
+						bicubicInvSums[type][slot][x][y] = 1.0f/total;
+					}
+				}
+			}
+		}
+	}
+
+	// Bicubic upscaling by the compile-time factor f using the precomputed weights of
+	// spline type T (0 = BSpline, 1 = mitchell). Source rows [l, u) of a w x h image
+	// produce output rows [l*f, u*f) of an image that is w*f pixels wide.
+	// Each output pixel is the normalized weighted sum of a 5x5 source neighbourhood,
+	// with source coordinates clamped to the image. The output is walked in
+	// BLOCK_SIZE x BLOCK_SIZE tiles.
+	template<int f, int T>
+	void scaleBicubicT(u32* data, u32* out, int w, int h, int l, int u) {
+		const int outW = w*f;
+		const int outYBegin = l*f;
+		const int outYEnd = u*f;
+		const int rowTiles = (u-l)*f/BLOCK_SIZE+1;
+		const int colTiles = outW/BLOCK_SIZE+1;
+		for(int ty = 0; ty < rowTiles; ++ty) {
+			const int yBegin = outYBegin+ty*BLOCK_SIZE;
+			const int yEnd = std::min(yBegin+BLOCK_SIZE, outYEnd);
+			for(int tx = 0; tx < colTiles; ++tx) {
+				const int xBegin = tx*BLOCK_SIZE;
+				const int xEnd = std::min(xBegin+BLOCK_SIZE, outW);
+				for(int y = yBegin; y < yEnd; ++y) {
+					const int srcY = y/f;
+					const int subY = y%f;
+					for(int x = xBegin; x < xEnd; ++x) {
+						const int srcX = x/f;
+						const int subX = x%f;
+						float sumR = 0.0f, sumG = 0.0f, sumB = 0.0f, sumA = 0.0f;
+						// sample supporting pixels in original image
+						for(int sx = -2; sx <= 2; ++sx) {
+							// clamp the sampled column into the image
+							const int px = std::max(std::min(sx+srcX,w-1),0);
+							for(int sy = -2; sy <= 2; ++sy) {
+								const float weight = bicubicWeights[T][f-2][subX][subY][sx+2][sy+2];
+								if(weight == 0.0f) continue;
+								// clamp the sampled row into the image
+								const int py = std::max(std::min(sy+srcY,h-1),0);
+								// add the weighted components of this sample
+								const u32 sample = data[py*w+px];
+								sumR += weight*R(sample);
+								sumG += weight*G(sample);
+								sumB += weight*B(sample);
+								sumA += weight*A(sample);
+							}
+						}
+						// normalize, round up and clamp each channel to 0..255
+						const float invSum = bicubicInvSums[T][f-2][subX][subY];
+						const int ri = std::min(std::max(static_cast<int>(ceilf(sumR*invSum)),0),255);
+						const int gi = std::min(std::max(static_cast<int>(ceilf(sumG*invSum)),0),255);
+						const int bi = std::min(std::max(static_cast<int>(ceilf(sumB*invSum)),0),255);
+						const int ai = std::min(std::max(static_cast<int>(ceilf(sumA*invSum)),0),255);
+						out[y*outW + x] = (ai << 24) | (bi << 16) | (gi << 8) | ri;
+					}
+				}
+			}
+		}
+	}
+	// SSE4.1 variant of scaleBicubicT: same tiling, same weight tables and the same
+	// parameters, but all four channels of a pixel are accumulated in one __m128.
+	// Only compiled when _M_SSE >= 0x401.
+	// NOTE(review): the final conversion uses _mm_cvtps_epi32 (rounds per the current
+	// SSE rounding mode) whereas the scalar version uses ceilf, so results may differ
+	// slightly between the two paths.
+	#if _M_SSE >= 0x401
+	template<int f, int T>
+	void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
+		int outw = w*f;
+		for(int yb = 0; yb < (u-l)*f/BLOCK_SIZE+1; ++yb) {
+			for(int xb = 0; xb < w*f/BLOCK_SIZE+1; ++xb) {
+				for(int y = l*f+yb*BLOCK_SIZE; y < l*f+(yb+1)*BLOCK_SIZE && y < u*f; ++y) {
+					for(int x = xb*BLOCK_SIZE; x < (xb+1)*BLOCK_SIZE && x < w*f; ++x) {
+						// per-channel float accumulator for this output pixel
+						__m128 result = _mm_set1_ps(0.0f);
+						int cx = x/f, cy = y/f;
+						// sample supporting pixels in original image
+						for(int sx = -2; sx <= 2; ++sx) { 
+							for(int sy = -2; sy <= 2; ++sy) {
+								float weight = bicubicWeights[T][f-2][x%f][y%f][sx+2][sy+2];
+								if(weight != 0.0f) {
+									// clamp pixel locations
+									int csy = std::max(std::min(sy+cy,h-1),0);
+									int csx = std::max(std::min(sx+cx,w-1),0);
+									// sample & add weighted components
+									// load the packed pixel, zero-extend its four bytes to
+									// four 32-bit ints, then convert those to floats
+									__m128i sample = _mm_cvtsi32_si128(data[csy*w+csx]);
+									sample = _mm_cvtepu8_epi32(sample);
+									__m128 col = _mm_cvtepi32_ps(sample);
+									col = _mm_mul_ps(col, _mm_set1_ps(weight));
+									result = _mm_add_ps(result, col);
+								}
+							}
+						}
+						// generate and write result
+						// normalize, convert to ints, then narrow 32 -> 16 -> 8 bits with
+						// saturation (the last pack clamps each channel to 0..255)
+						__m128i pixel = _mm_cvtps_epi32(_mm_mul_ps(result, _mm_set1_ps(bicubicInvSums[T][f-2][x%f][y%f])));
+						pixel = _mm_packs_epi32(pixel, pixel);
+						pixel = _mm_packus_epi16(pixel, pixel);
+						out[y*outw + x] = _mm_cvtsi128_si32(pixel);
+					}
+				}
+			}
+		}
+	}
+	#endif
+
+	// Bicubic upscaling of source rows [l, u) with weight table 0 (the B-spline variant).
+	// Turns the runtime factor into the matching template instantiation; the SSE4.1 kernel
+	// is taken when it was compiled in and the CPU reports support, the scalar one otherwise.
+	void scaleBicubicBSpline(int factor, u32* data, u32* out, int w, int h, int l, int u) {
+		#if _M_SSE >= 0x401
+		if(cpu_info.bSSE4_1) {
+			switch(factor) {
+			case 2:
+				scaleBicubicTSSE41<2, 0>(data, out, w, h, l, u);
+				break;
+			case 3:
+				scaleBicubicTSSE41<3, 0>(data, out, w, h, l, u);
+				break;
+			case 4:
+				scaleBicubicTSSE41<4, 0>(data, out, w, h, l, u);
+				break;
+			case 5:
+				scaleBicubicTSSE41<5, 0>(data, out, w, h, l, u);
+				break;
+			default:
+				ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
+			}
+			return;
+		}
+		#endif
+		// scalar fallback
+		switch(factor) {
+		case 2:
+			scaleBicubicT<2, 0>(data, out, w, h, l, u);
+			break;
+		case 3:
+			scaleBicubicT<3, 0>(data, out, w, h, l, u);
+			break;
+		case 4:
+			scaleBicubicT<4, 0>(data, out, w, h, l, u);
+			break;
+		case 5:
+			scaleBicubicT<5, 0>(data, out, w, h, l, u);
+			break;
+		default:
+			ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
+		}
+	}
+
+	// Bicubic upscaling of source rows [l, u) with weight table 1 (the Mitchell variant).
+	// Same dispatch as the B-spline entry point: SSE4.1 kernel when available, scalar otherwise.
+	void scaleBicubicMitchell(int factor, u32* data, u32* out, int w, int h, int l, int u) {
+		#if _M_SSE >= 0x401
+		if(cpu_info.bSSE4_1) {
+			switch(factor) {
+			case 2:
+				scaleBicubicTSSE41<2, 1>(data, out, w, h, l, u);
+				break;
+			case 3:
+				scaleBicubicTSSE41<3, 1>(data, out, w, h, l, u);
+				break;
+			case 4:
+				scaleBicubicTSSE41<4, 1>(data, out, w, h, l, u);
+				break;
+			case 5:
+				scaleBicubicTSSE41<5, 1>(data, out, w, h, l, u);
+				break;
+			default:
+				ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
+			}
+			return;
+		}
+		#endif
+		// scalar fallback
+		switch(factor) {
+		case 2:
+			scaleBicubicT<2, 1>(data, out, w, h, l, u);
+			break;
+		case 3:
+			scaleBicubicT<3, 1>(data, out, w, h, l, u);
+			break;
+		case 4:
+			scaleBicubicT<4, 1>(data, out, w, h, l, u);
+			break;
+		case 5:
+			scaleBicubicT<5, 1>(data, out, w, h, l, u);
+			break;
+		default:
+			ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
+		}
+	}
+
+	//////////////////////////////////////////////////////////////////// Bilinear scaling
+
+	// Blend factor pairs handed to MIX_PIXELS by the bilinear passes.
+	// First index: scale factor - 2. Second index: position of the output pixel counted from
+	// the neighbour side (the second half of the pixels mirrors the first).
+	// Every used pair sums to 255; all-zero entries are padding for factors that need fewer pairs.
+	// NOTE(review): presumably the pair is { neighbour weight, centre weight } out of 255 --
+	// confirm against the MIX_PIXELS definition.
+	const static u8 BILINEAR_FACTORS[4][3][2] = {
+		{ { 44,211}, {  0,  0}, {  0,  0} }, // x2
+		{ { 64,191}, {  0,255}, {  0,  0} }, // x3
+		{ { 77,178}, { 26,229}, {  0,  0} }, // x4
+		{ {102,153}, { 51,204}, {  0,255} }, // x5
+	};
+	// Horizontal half of the integral bilinear upscale by factor f.
+	// Reads source rows [l, u) of `data` (row length w) and writes the same rows of `out`
+	// (row length w*f): every source pixel becomes f output pixels blended with its
+	// left or right neighbour.
+	template<int f>
+	void bilinearHt(u32* data, u32* out, int w, int l, int u) {
+		static_assert(f>1 && f<=5, "Bilinear scaling only implemented for factors 2 to 5");
+		const int outw = w*f;
+		// how many of the f output pixels blend towards the left neighbour (centre included for odd f)
+		const int leftCount = f/2+f%2;
+		for(int row = l; row < u; ++row) {
+			for(int col = 0; col < w; ++col) {
+				const int src = row*w + col;
+				// at the image border the pixel stands in for its missing neighbour
+				const int stepLeft  = (col==0)   ? 0 : 1;
+				const int stepRight = (col==w-1) ? 0 : 1;
+				u32 left   = data[src - stepLeft];
+				u32 center = data[src];
+				u32 right  = data[src + stepRight];
+				u32* dst = out + row*outw + col*f;
+				for(int i = 0; i < leftCount; ++i) {
+					dst[i] = MIX_PIXELS(left, center, BILINEAR_FACTORS[f-2][i]);
+				}
+				// the right-hand pixels use the same factor pairs in mirrored order
+				for(int i = leftCount; i < f; ++i) {
+					dst[i] = MIX_PIXELS(right, center, BILINEAR_FACTORS[f-2][f-1-i]);
+				}
+			}
+		}
+	}
+	// Maps the runtime scale factor onto the matching bilinearHt instantiation.
+	// Factors outside 2..5 only produce an error log entry.
+	void bilinearH(int factor, u32* data, u32* out, int w, int l, int u) {
+		if(factor == 2) {
+			bilinearHt<2>(data, out, w, l, u);
+		} else if(factor == 3) {
+			bilinearHt<3>(data, out, w, l, u);
+		} else if(factor == 4) {
+			bilinearHt<4>(data, out, w, l, u);
+		} else if(factor == 5) {
+			bilinearHt<5>(data, out, w, l, u);
+		} else {
+			ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5");
+		}
+	}
+	// Vertical half of the integral bilinear upscale by factor f.
+	// `data` has rows of length w*f; rows [l, u) of it are expanded into rows [l*f, u*f) of `out`.
+	// gl/gu are the bounds of the whole image: at row gl (resp. gu-1) the row itself is used
+	// in place of the missing upper (resp. lower) neighbour.
+	template<int f>
+	void bilinearVt(u32* data, u32* out, int w, int gl, int gu, int l, int u) {
+		static_assert(f>1 && f<=5, "Bilinear scaling only implemented for 2x, 3x, 4x, and 5x");
+		const int outw = w*f;
+		// how many of the f output rows blend towards the upper neighbour (centre included for odd f)
+		const int upperCount = f/2+f%2;
+		// work through the image in vertical strips of BLOCK_SIZE columns
+		for(int xb = 0; xb < outw/BLOCK_SIZE+1; ++xb) {
+			const int xFirst = xb*BLOCK_SIZE;
+			const int xLimit = (xb+1)*BLOCK_SIZE;
+			for(int y = l; y < u; ++y) {
+				u32 uy = y - (y==gl  ?0:1);
+				u32 ly = y + (y==gu-1?0:1);
+				for(int x = xFirst; x < xLimit && x < outw; ++x) {
+					u32 upper  = data[uy * outw + x];
+					u32 center = data[y * outw + x];
+					u32 lower  = data[ly * outw + x];
+					for(int i = 0; i < upperCount; ++i) {
+						out[(y*f + i)*outw + x] = MIX_PIXELS(upper, center, BILINEAR_FACTORS[f-2][i]);
+					}
+					// the lower rows use the same factor pairs in mirrored order
+					for(int i = upperCount; i < f; ++i) {
+						out[(y*f + i)*outw + x] = MIX_PIXELS(lower, center, BILINEAR_FACTORS[f-2][f-1-i]);
+					}
+				}
+			}
+		}
+	}
+	// Maps the runtime scale factor onto the matching bilinearVt instantiation.
+	// Factors outside 2..5 only produce an error log entry.
+	void bilinearV(int factor, u32* data, u32* out, int w, int gl, int gu, int l, int u) {
+		if(factor == 2) {
+			bilinearVt<2>(data, out, w, gl, gu, l, u);
+		} else if(factor == 3) {
+			bilinearVt<3>(data, out, w, gl, gu, l, u);
+		} else if(factor == 4) {
+			bilinearVt<4>(data, out, w, gl, gu, l, u);
+		} else if(factor == 5) {
+			bilinearVt<5>(data, out, w, gl, gu, l, u);
+		} else {
+			ERROR_LOG(G3D, "Bilinear upsampling only implemented for factors 2 to 5");
+		}
+	}
+
+	#undef BLOCK_SIZE
+	#undef MIX_PIXELS
+	#undef DISTANCE
+	#undef R
+	#undef G
+	#undef B
+	#undef A
+
+	// used for debugging texture scaling (writing textures to files)
+	// running number appended to the file names of the debug dumps
+	static int g_imgCount = 0;
+	// Writes the first three bytes of every 4-byte pixel of a w x h image to
+	// "<prefix>NNNN.ppm" as a binary PPM (P6); the fourth byte of each pixel is dropped.
+	// NNNN is the current dump counter, which is incremented on every call.
+	// If the file cannot be opened an error is logged and nothing is written.
+	void dbgPPM(int w, int h, u8* pixels, const char* prefix = "dbg") { // 3 component RGB
+		char fn[32];
+		snprintf(fn, sizeof(fn), "%s%04d.ppm", prefix, g_imgCount++);
+		FILE *fp = fopen(fn, "wb");
+		if(!fp) {
+			// fprintf/fwrite on a NULL stream would crash
+			ERROR_LOG(G3D, "dbgPPM: unable to open %s for writing", fn);
+			return;
+		}
+		fprintf(fp, "P6\n%d %d\n255\n", w, h);
+		for(int j = 0; j < h; ++j) {
+			for(int i = 0; i < w; ++i) {
+				const u8* px = pixels + (j*w+i)*4;
+				unsigned char color[3];
+				color[0] = px[0];  /* red */
+				color[1] = px[1];  /* green */
+				color[2] = px[2];  /* blue */
+				fwrite(color, 1, 3, fp);
+			}
+		}
+		fclose(fp);
+	}
+	// Writes a w x h single-channel image to "<prefix>NNNN.pgm" as a binary PGM (P5) with
+	// two bytes per sample: for every 32-bit element of `pixels` the first two bytes in
+	// memory are written unchanged. NNNN is the dump counter, incremented on every call.
+	// If the file cannot be opened an error is logged and nothing is written.
+	// NOTE(review): PGM stores 16-bit samples most significant byte first -- confirm the
+	// host byte order gives the intended picture.
+	void dbgPGM(int w, int h, u32* pixels, const char* prefix = "dbg") { // 1 component
+		char fn[32];
+		snprintf(fn, sizeof(fn), "%s%04d.pgm", prefix, g_imgCount++);
+		FILE *fp = fopen(fn, "wb");
+		if(!fp) {
+			// fprintf/fwrite on a NULL stream would crash
+			ERROR_LOG(G3D, "dbgPGM: unable to open %s for writing", fn);
+			return;
+		}
+		// the PGM maxval must be below 65536, so 65535 is the largest legal value
+		fprintf(fp, "P5\n%d %d\n65535\n", w, h);
+		for(int j = 0; j < h; ++j) {
+			for(int i = 0; i < w; ++i) {
+				fwrite((pixels+(j*w+i)), 1, 2, fp);
+			}
+		}
+		fclose(fp);
+	}
+}
+
+/////////////////////////////////////// Texture Scaler
+
+// The only construction work is a call to initBicubicWeights().
+// NOTE(review): presumably that fills the bicubicWeights / bicubicInvSums tables read by the
+// bicubic kernels -- its definition is not shown here.
+TextureScaler::TextureScaler() {
+	initBicubicWeights();
+}
+
+// Returns true when the texture consists of a single repeated pixel value, in which case
+// upscaling it would be wasted work.
+//   data   - texture memory, read as 32-bit words
+//   pixels - number of pixels in the texture
+//   fmt    - D3DFMT_A8R8G8B8 means one pixel per word; every other format is treated as
+//            16 bits per pixel, i.e. two pixels per word
+bool TextureScaler::IsEmptyOrFlat(u32* data, int pixels, u32 fmt) {
+	int pixelsPerWord = (fmt == D3DFMT_A8R8G8B8) ? 1 : 2;
+	u32 ref = data[0];
+	// With two pixels per word, identical words alone do not prove flatness: a repeating
+	// two-pixel pattern (e.g. 1px vertical stripes) also yields identical words. So the two
+	// halves of the reference word have to match as well. (Skipped for a single pixel,
+	// where the upper half of the word is not part of the texture.)
+	if(pixelsPerWord == 2 && pixels >= 2 && (ref & 0xFFFF) != (ref >> 16)) return false;
+	for(int i=0; i<pixels/pixelsPerWord; ++i) {
+		if(data[i]!=ref) return false;
+	}
+	return true;
+}
+
+// Upscales a texture by the integer `factor` with the algorithm chosen in
+// g_Config.iTexScalingType, optionally deposterizing it first (g_Config.bTexDeposterize).
+//
+// All of data, dstFmt, width and height are in/out parameters. On success `data` points at
+// this scaler's output buffer (bufOutput), dstFmt is D3DFMT_A8R8G8B8 and width/height have
+// been multiplied by factor. They are left untouched when the texture is empty/flat or when
+// the configured scaling type is unknown, so the caller keeps using the unscaled texture.
+// NOTE(review): the returned pointer aliases bufOutput, so it is presumably only valid until
+// the next Scale() call on this object.
+void TextureScaler::Scale(u32* &data, u32 &dstFmt, int &width, int &height, int factor) {
+	// prevent processing empty or flat textures (this happens a lot in some games)
+	// doesn't hurt the standard case, the check exits almost immediately for textures with actual content
+	if(IsEmptyOrFlat(data, width*height, dstFmt)) {
+		INFO_LOG(G3D, "TextureScaler: early exit -- empty/flat texture");
+		return;
+	}
+
+	#ifdef SCALING_MEASURE_TIME
+	double t_start = real_time_now();
+	#endif
+
+	bufInput.resize(width*height); // used to store the input image if it needs to be reformatted
+	bufOutput.resize(width*height*factor*factor); // used to store the upscaled image
+	u32 *inputBuf = bufInput.data();
+	u32 *outputBuf = bufOutput.data();
+
+	// convert texture to correct format for scaling
+	// (inputBuf is passed by reference and may be redirected to `data` itself)
+	ConvertTo8888(dstFmt, data, inputBuf, width, height);
+
+	// deposterize
+	if(g_Config.bTexDeposterize) {
+		bufDeposter.resize(width*height);
+		DePosterize(inputBuf, bufDeposter.data(), width, height);
+		inputBuf = bufDeposter.data();
+	}
+
+	// scale
+	switch(g_Config.iTexScalingType) {
+	case XBRZ:
+		ScaleXBRZ(factor, inputBuf, outputBuf, width, height);
+		break;
+	case HYBRID:
+		ScaleHybrid(factor, inputBuf, outputBuf, width, height);
+		break;
+	case BICUBIC:
+		ScaleBicubicMitchell(factor, inputBuf, outputBuf, width, height);
+		break;
+	case HYBRID_BICUBIC:
+		ScaleHybrid(factor, inputBuf, outputBuf, width, height, true);
+		break;
+	default:
+		ERROR_LOG(G3D, "Unknown scaling type: %d", g_Config.iTexScalingType);
+		// nothing was written to outputBuf: keep the caller's texture as it is instead of
+		// handing back an unfilled buffer with enlarged dimensions
+		return;
+	}
+
+	// update values accordingly
+	data = outputBuf;
+	dstFmt = D3DFMT_A8R8G8B8;
+	width *= factor;
+	height *= factor;
+
+	#ifdef SCALING_MEASURE_TIME
+	if(width*height > 64*64*factor*factor) {
+		double t = real_time_now() - t_start;
+		NOTICE_LOG(MASTER_LOG, "TextureScaler: processed %9d pixels in %6.5lf seconds. (%9.2lf Mpixels/second)", 
+			width*height, t, (width*height)/(t*1000*1000));
+	}
+	#endif
+}
+
+// xBRZ upscale of source (width x height) into dest, using a default-constructed ScalerCfg.
+// xbrz::scale is bound with everything except its last two arguments; those are filled in
+// by GlobalThreadPool::Loop, which is given the row range [0, height).
+// NOTE(review): presumably Loop splits that range into sub-ranges for worker threads.
+void TextureScaler::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
+	xbrz::ScalerCfg cfg;
+	GlobalThreadPool::Loop(std::bind(&xbrz::scale, factor, source, dest, width, height, cfg, placeholder::_1, placeholder::_2), 0, height);
+}
+
+// Two-pass bilinear upscale: bilinearH stretches every source row into bufTmp1
+// (width*factor pixels per row, height rows), then bilinearV expands that into dest.
+// Note that this overwrites bufTmp1.
+void TextureScaler::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
+	bufTmp1.resize(width*height*factor);
+	u32 *tmpBuf = bufTmp1.data();
+	GlobalThreadPool::Loop(std::bind(&bilinearH, factor, source, tmpBuf, width, placeholder::_1, placeholder::_2), 0, height);
+	// 0 and height are the global row bounds (gl/gu) used for edge handling
+	GlobalThreadPool::Loop(std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, placeholder::_1, placeholder::_2), 0, height);
+}
+
+// Runs the free function scaleBicubicBSpline over the row range [0, height) through
+// GlobalThreadPool::Loop, which supplies the lower/upper row arguments.
+void TextureScaler::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
+	GlobalThreadPool::Loop(std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, placeholder::_1, placeholder::_2), 0, height);
+}
+
+// Runs the free function scaleBicubicMitchell over the row range [0, height) through
+// GlobalThreadPool::Loop, which supplies the lower/upper row arguments.
+void TextureScaler::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
+	GlobalThreadPool::Loop(std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, placeholder::_1, placeholder::_2), 0, height);
+}
+
+// Hybrid upscale: blends a smooth upscale with an xBRZ upscale, steered by a feature mask.
+// With bicubic == true the smooth image comes from ScaleBicubicBSpline, otherwise from
+// ScaleBilinear. Uses bufTmp1, bufTmp2 and bufTmp3 as scratch space.
+void TextureScaler::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
+	// Basic algorithm:
+	// 1) determine a feature mask C based on a sobel-ish filter + splatting, and upscale that mask bilinearly
+	// 2) generate 2 scaled images: A - using Bilinear (or bicubic B-spline) filtering, B - using xBRZ
+	// 3) output = A*C + B*(1-C)
+	
+	// all-ones 3x3 kernel handed to convolve3x3 for the splatting step
+	const static int KERNEL_SPLAT[3][3] = {
+		{ 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }
+	};
+	
+	bufTmp1.resize(width*height);
+	bufTmp2.resize(width*height*factor*factor);
+	bufTmp3.resize(width*height*factor*factor);
+	// distance mask -> bufTmp1, splatted mask -> bufTmp2 (still at source resolution)
+	GlobalThreadPool::Loop(std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, placeholder::_1, placeholder::_2), 0, height);
+	GlobalThreadPool::Loop(std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, placeholder::_1, placeholder::_2), 0, height);
+	// ScaleBilinear reuses bufTmp1 as its own scratch buffer; the raw mask is no longer needed
+	ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
+	// mask C is now in bufTmp3
+
+	ScaleXBRZ(factor, source, bufTmp2.data(), width, height);
+	// xBRZ upscaled source is in bufTmp2
+
+	if(bicubic) ScaleBicubicBSpline(factor, source, dest, width, height);
+	else ScaleBilinear(factor, source, dest, width, height);
+	// Upscaled source is in dest
+
+	// Now we can mix it all together
+	// The factor 8192 was found through practical testing on a variety of textures
+	GlobalThreadPool::Loop(std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, placeholder::_1, placeholder::_2), 0, height*factor);
+}
+
+// Deposterizes source into dest (both width x height) by running the horizontal and the
+// vertical deposterize pass twice in a row. bufTmp3 holds the intermediate image after each
+// horizontal pass, so its previous contents are lost.
+void TextureScaler::DePosterize(u32* source, u32* dest, int width, int height) {
+	bufTmp3.resize(width*height);
+	// first round: source -> bufTmp3 -> dest
+	GlobalThreadPool::Loop(std::bind(&deposterizeH, source, bufTmp3.data(), width, placeholder::_1, placeholder::_2), 0, height);
+	GlobalThreadPool::Loop(std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, placeholder::_1, placeholder::_2), 0, height);
+	// second round: dest -> bufTmp3 -> dest
+	GlobalThreadPool::Loop(std::bind(&deposterizeH, dest, bufTmp3.data(), width, placeholder::_1, placeholder::_2), 0, height);
+	GlobalThreadPool::Loop(std::bind(&deposterizeV, bufTmp3.data(), dest, width, height, placeholder::_1, placeholder::_2), 0, height);
+}
+
+// Makes sure the scaler input is available as 32-bit pixels.
+// `dest` is passed by reference: for D3DFMT_A8R8G8B8 no copy is made and dest is simply
+// redirected to source; for the three 16-bit formats the pixels are converted into the
+// buffer dest points at, row range [0, height) via GlobalThreadPool::Loop.
+// An unsupported format is logged and dest is redirected to source without any conversion.
+void TextureScaler::ConvertTo8888(u32 format, u32* source, u32* &dest, int width, int height) {
+	switch(format) {
+	case D3DFMT_A8R8G8B8:
+		dest = source; // already fine
+		break;
+
+	case D3DFMT_A4R4G4B4:
+		GlobalThreadPool::Loop(std::bind(&convert4444, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height);
+		break;
+
+	case D3DFMT_R5G6B5:
+		GlobalThreadPool::Loop(std::bind(&convert565, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height);
+		break;
+
+	case D3DFMT_A1R5G5B5:
+		GlobalThreadPool::Loop(std::bind(&convert5551, (u16*)source, dest, width, placeholder::_1, placeholder::_2), 0, height);
+		break;
+
+	default:
+		// NOTE(review): the caller goes on to treat the data as 32-bit pixels -- confirm this
+		// path cannot be reached with a 16-bit format.
+		dest = source;
+		ERROR_LOG(G3D, "iXBRZTexScaling: unsupported texture format");
+	}
+}
--- a/GPU/Directx9/TextureScaler.h
+++ b/GPU/Directx9/TextureScaler.h
@ -0,0 +1,52 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include "Common/MemoryUtil.h"
+#include "../Globals.h"
+#include "helper/global.h"
+//#include "gfx/gl_common.h"
+
+#include <vector>
+
+
+// Upscales textures by an integer factor, using one of the algorithms listed in the
+// enum below. All working memory lives in the buf* members and is reused between calls.
+class TextureScaler {
+public:
+	TextureScaler();
+
+	// Scales the texture described by the four in/out parameters; they are updated to
+	// describe the scaled result.
+	void Scale(u32* &data, u32 &dstfmt, int &width, int &height, int factor);
+
+	// Scaling algorithm identifiers.
+	enum { XBRZ= 0, HYBRID = 1, BICUBIC = 2, HYBRID_BICUBIC = 3 };
+
+private:
+	// Individual scaling algorithms: source is width x height, dest receives the upscaled image.
+	void ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height);
+	void ScaleBilinear(int factor, u32* source, u32* dest, int width, int height);
+	void ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height);
+	void ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height);
+	void ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic = false);
+	// Provides the source as 32-bit pixels; dest may be redirected to source.
+	void ConvertTo8888(u32 format, u32* source, u32* &dest, int width, int height);
+
+	void DePosterize(u32* source, u32* dest, int width, int height);
+
+	// True when all pixels of the texture share one value.
+	bool IsEmptyOrFlat(u32* data, int pixels, u32 fmt);
+
+	// depending on the factor and texture sizes, these can get pretty large 
+	// maximum is (100 MB total for a 512 by 512 texture with scaling factor 5 and hybrid scaling)
+	// of course, scaling factor 5 is totally silly anyway
+	SimpleBuf<u32> bufInput, bufDeposter, bufOutput, bufTmp1, bufTmp2, bufTmp3;
+};
--- a/GPU/Directx9/TransformPipeline.cpp
+++ b/GPU/Directx9/TransformPipeline.cpp
--- a/GPU/Directx9/TransformPipeline.h
+++ b/GPU/Directx9/TransformPipeline.h
@ -0,0 +1,221 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include <map>
+
+#include "IndexGenerator.h"
+#include "VertexDecoder.h"
+
+class LinkedShader;
+class ShaderManager;
+class TextureCache;
+class FramebufferManager;
+
+struct DecVtxFormat;
+
+// States transitions:
+// On creation: DRAWN_NEW
+// DRAWN_NEW -> DRAWN_HASHING
+// DRAWN_HASHING -> DRAWN_RELIABLE
+// DRAWN_HASHING -> DRAWN_UNRELIABLE
+// DRAWN_ONCE -> UNRELIABLE
+// DRAWN_RELIABLE -> DRAWN_SAFE
+// UNRELIABLE -> death
+// DRAWN_ONCE -> death
+// DRAWN_RELIABLE -> death
+
+
+// Don't bother storing information about draws smaller than this.
+enum {
+	VERTEX_CACHE_THRESHOLD = 20,
+};
+
+// Try to keep this POD.
+// Bookkeeping record for one tracked vertex array: its hash, caching status, the
+// Direct3D buffers holding it and some usage counters.
+class VertexArrayInfo {
+public:
+	VertexArrayInfo() {
+		// start from a defined value instead of leaving the hash indeterminate
+		hash = 0;
+		status = VAI_NEW;
+		vbo = 0;
+		ebo = 0;
+		numDCs = 0;
+		prim = -1;
+		numDraws = 0;
+		numFrames = 0;
+		// remember the frame this entry was created in
+		lastFrame = gpuStats.numFrames;
+		numVerts = 0;
+		drawsUntilNextFullHash = 0;
+	}
+	~VertexArrayInfo();
+	enum Status {
+		VAI_NEW,
+		VAI_HASHING,
+		VAI_RELIABLE,  // cache, don't hash
+		VAI_UNRELIABLE,  // never cache
+	};
+
+	u32 hash;
+
+	Status status;
+
+	LPDIRECT3DVERTEXBUFFER9 vbo;
+	LPDIRECT3DINDEXBUFFER9 ebo;
+
+
+	// Precalculated parameters for the draw call
+	u16 numVerts;
+	s8 prim;
+
+	// ID information
+	u8 numDCs;
+	int numDraws;
+	int numFrames;
+	int lastFrame;  // So that we can forget.
+	u16 drawsUntilNextFullHash;
+};
+
+
+// Handles transform, lighting and drawing.
+// Submitted primitives are collected as deferred draw calls and processed later
+// (see the note on DeferredDrawCall below).
+class TransformDrawEngine {
+public:
+	TransformDrawEngine();
+	virtual ~TransformDrawEngine();
+	void SubmitPrim(void *verts, void *inds, int prim, int vertexCount, u32 vertexType, int forceIndexType, int *bytesRead);
+	void DrawBezier(int ucount, int vcount);
+	void DrawSpline(int ucount, int vcount, int utype, int vtype);
+	void DecodeVerts();
+	void Flush();
+	// Collaborators are injected through the setters below; the pointers are only stored.
+	void SetShaderManager(ShaderManager *shaderManager) {
+		shaderManager_ = shaderManager;
+	}
+	void SetTextureCache(TextureCache *textureCache) {
+		textureCache_ = textureCache;
+	}
+	void SetFramebufferManager(FramebufferManager *fbManager) {
+		framebufferManager_ = fbManager;
+	}
+	void InitDeviceObjects();
+	void DestroyDeviceObjects();
+	// Intentionally empty.
+	void GLLost() {};
+
+	void DecimateTrackedVertexArrays();
+	void ClearTrackedVertexArrays();
+
+	void SetupVertexDecoder(u32 vertType);
+
+	// This requires a SetupVertexDecoder call first.
+	int EstimatePerVertexCost();
+
+private:
+	void SoftwareTransformAndDraw(int prim, u8 *decoded, LinkedShader *program, int vertexCount, u32 vertexType, void *inds, int indexType, const DecVtxFormat &decVtxFormat, int maxIndex);
+	void ApplyDrawState(int prim);
+	bool IsReallyAClear(int numVerts) const;
+
+	// drawcall ID
+	u32 ComputeFastDCID();
+	u32 ComputeHash();  // Reads deferred vertex data.
+
+	VertexDecoder *GetVertexDecoder(u32 vtype);
+
+	// Defer all vertex decoding to a Flush, so that we can hash and cache the
+	// generated buffers without having to redecode them every time.
+	struct DeferredDrawCall {
+		void *verts;
+		void *inds;
+		u32 vertType;
+		u8 indexType;
+		u8 prim;
+		u16 vertexCount;
+		u16 indexLowerBound;
+		u16 indexUpperBound;
+	};
+
+	// Vertex collector state
+	IndexGenerator indexGen;
+	int collectedVerts;
+	int prevPrim_;
+
+	// Cached vertex decoders
+	// (keyed by a u32 -- presumably the vertex type, cf. GetVertexDecoder)
+	std::map<u32, VertexDecoder *> decoderMap_;
+	VertexDecoder *dec_;
+	u32 lastVType_;
+	
+	// Vertex collector buffers
+	u8 *decoded;
+	u16 *decIndex;
+
+	TransformedVertex *transformed;
+	TransformedVertex *transformedExpanded;
+
+	// Tracked vertex arrays (see DecimateTrackedVertexArrays / ClearTrackedVertexArrays)
+	std::map<u32, VertexArrayInfo *> vai_;
+
+	// Vertex buffer objects
+	// Element buffer objects
+	enum { NUM_VBOS = 2 };
+	LPDIRECT3DVERTEXBUFFER9 vbo_[NUM_VBOS];
+	LPDIRECT3DINDEXBUFFER9 ebo_[NUM_VBOS];
+	int curVbo_;
+
+	// Other
+	ShaderManager *shaderManager_;
+	TextureCache *textureCache_;
+	FramebufferManager *framebufferManager_;
+
+	// Fixed-size queue of deferred draw calls; numDrawCalls is the number currently stored.
+	enum { MAX_DEFERRED_DRAW_CALLS = 128 };
+	DeferredDrawCall drawCalls[MAX_DEFERRED_DRAW_CALLS];
+	int numDrawCalls;
+	UVScale *uvScale;
+};
+
+// Only used by SW transform
+// Four float colour channels. The members are stored in the order a, r, g, b, which is
+// also the order operator[] indexes them in.
+struct Color4 {
+	float a, r, g, b;
+
+	// all channels zero
+	Color4() : a(0), r(0), g(0), b(0) { }
+	Color4(float _r, float _g, float _b, float _a=1.0f)
+		: a(_a), r(_r), g(_g), b(_b) {
+	}
+	// in[] is read in the order a, r, g, b
+	Color4(const float in[4]) : a(in[0]), r(in[1]), g(in[2]), b(in[3]) { }
+	// in[] is read in the order r, g, b; alpha is passed separately
+	Color4(const float in[3], float alpha) : a(alpha), r(in[0]), g(in[1]), b(in[2]) { }
+
+	// 0 = a, 1 = r, 2 = g, 3 = b (relies on the member layout)
+	const float &operator [](int i) const {
+		const float *first = &a;
+		return first[i];
+	}
+
+	// scale every channel by f
+	Color4 operator *(float f) const {
+		Color4 scaled(f*r, f*g, f*b, f*a);
+		return scaled;
+	}
+	// channel-wise product
+	Color4 operator *(const Color4 &c) const {
+		Color4 product(r*c.r, g*c.g, b*c.b, a*c.a);
+		return product;
+	}
+	// channel-wise sum
+	Color4 operator +(const Color4 &c) const {
+		Color4 sum(r+c.r, g+c.g, b+c.b, a+c.a);
+		return sum;
+	}
+	void operator +=(const Color4 &c) {
+		r += c.r;
+		g += c.g;
+		b += c.b;
+		a += c.a;
+	}
+	// r from bits 0-7, g from bits 8-15, b from bits 16-23, each mapped to 0..1; a is untouched
+	void GetFromRGB(u32 col) {
+		r = ((col>>0) & 0xff)/255.0f;
+		g = ((col>>8) & 0xff)/255.0f;
+		b = ((col>>16) & 0xff)/255.0f;
+	}
+	// a from bits 0-7 mapped to 0..1; r, g, b are untouched
+	void GetFromA(u32 col) {
+		a = (col&0xff)/255.0f;
+	}
+};
--- a/GPU/Directx9/VertexDecoder.cpp
+++ b/GPU/Directx9/VertexDecoder.cpp
@ -0,0 +1,886 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include "math/lin/matrix4x4.h"
+
+#include "Core/Config.h"
+#include "Core/MemMap.h"
+#include "GPU/ge_constants.h"
+
+#include "VertexDecoder.h"
+#include "VertexShaderGenerator.h"
+
+// Debug helper: prints the components of a decoded vertex to stdout. Normal, texture
+// coordinates and the two colours are printed only if the reader reports them; the
+// position is always printed last.
+void PrintDecodedVertex(VertexReader &vtx) {
+	if (vtx.hasNormal()) {
+		float n[3];
+		vtx.ReadNrm(n);
+		printf("N: %f %f %f\n", n[0], n[1], n[2]);
+	}
+
+	if (vtx.hasUV()) {
+		float tc[2];
+		vtx.ReadUV(tc);
+		printf("TC: %f %f\n", tc[0], tc[1]);
+	}
+
+	if (vtx.hasColor0()) {
+		float primary[4];
+		vtx.ReadColor0(primary);
+		printf("C0: %f %f %f %f\n", primary[0], primary[1], primary[2], primary[3]);
+	}
+
+	if (vtx.hasColor1()) {
+		float secondary[3];
+		vtx.ReadColor1(secondary);
+		printf("C1: %f %f %f\n", secondary[0], secondary[1], secondary[2]);
+	}
+
+	// Etc..
+	float p[3];
+	vtx.ReadPos(p);
+	printf("P: %f %f %f\n", p[0], p[1], p[2]);
+}
+
+// Size and alignment tables (in bytes) for the vertex components: texture coordinates,
+// colour, normal, position and weights.
+// NOTE(review): presumably indexed by the component's format code taken from the vertex
+// type (0 = not present) -- confirm against the code that reads these tables.
+const u8 tcsize[4] = {0,2,4,8}, tcalign[4] = {0,1,2,4};
+const u8 colsize[8] = {0,0,0,0,2,2,2,4}, colalign[8] = {0,0,0,0,2,2,2,4};
+const u8 nrmsize[4] = {0,3,6,12}, nrmalign[4] = {0,1,2,4};
+const u8 possize[4] = {0,3,6,12}, posalign[4] = {0,1,2,4};
+const u8 wtsize[4] = {0,1,2,4}, wtalign[4] = {0,1,2,4};
+
+// Rounds n up to the next multiple of align by masking off the low bits;
+// the mask arithmetic only gives that result when align is a power of two.
+inline int align(int n, int align) {
+	const int mask = align - 1;
+	return (n + mask) & ~mask;
+}
+
+// Size in bytes that a component of the given decoded format occupies.
+// Formats narrower than four bytes are reported as 4, three 16-bit values as 8.
+// DEC_NONE and unknown formats yield 0.
+int DecFmtSize(u8 fmt) {
+	switch (fmt) {
+	case DEC_FLOAT_1:
+	case DEC_S8_3:
+	case DEC_U8_1:
+	case DEC_U8_2:
+	case DEC_U8_3:
+	case DEC_U8_4:
+	case DEC_U16_1:
+	case DEC_U16_2:
+	case DEC_U8A_2:
+	case DEC_U16A_2:
+		return 4;
+
+	case DEC_FLOAT_2:
+	case DEC_S16_3:
+	case DEC_U16_3:
+	case DEC_U16_4:
+		return 8;
+
+	case DEC_FLOAT_3:
+		return 12;
+
+	case DEC_FLOAT_4:
+		return 16;
+
+	case DEC_NONE:
+	default:
+		return 0;
+	}
+}
+#if 0
+// Builds the vertex format that the software transform outputs for a given input format.
+// NOTE(review): the original comment here was cut off mid-sentence ("... and thus w").
+// This function sits inside an #if 0 block, so it is currently not compiled.
+DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt) {
+	DecVtxFormat tfm = {0};
+	int size = 0;
+	// running byte offset of the next component
+	int offset = 0;
+	// Weights disappear during transform.
+	if (fmt.uvfmt) {
+		// UV always becomes float2.
+		tfm.uvfmt = DEC_FLOAT_2;
+		tfm.uvoff = offset;
+		offset += DecFmtSize(tfm.uvfmt);
+	}
+	// We always (?) get two colors out, they're floats (although we'd probably be fine with less precision).
+	tfm.c0fmt = DEC_FLOAT_4;
+	tfm.c0off = offset;
+	offset += DecFmtSize(tfm.c0fmt);
+	tfm.c1fmt = DEC_FLOAT_3;  // color1 (specular) doesn't have alpha.
+	tfm.c1off = offset;
+	offset += DecFmtSize(tfm.c1fmt);
+	// We never get a normal, it's gone.
+	// But we do get a position, and it's always float3.
+	tfm.posfmt = DEC_FLOAT_3;
+	tfm.posoff = offset;
+	offset += DecFmtSize(tfm.posfmt);
+	// Update stride.
+	tfm.stride = offset;
+	return tfm;
+}
+#endif
+
+// Copies the nweights 8-bit weights found at the start of the source vertex into the
+// decoded vertex and zero-fills up to the next multiple of four entries.
+void VertexDecoder::Step_WeightsU8() const
+{
+	const u8 *src = (const u8*)(ptr_);
+	u8 *dst = (u8 *)(decoded_ + decFmt.w0off);
+	int n = 0;
+	while (n < nweights) {
+		dst[n] = src[n];
+		++n;
+	}
+	// Zero additional weights rounding up to 4.
+	for (; (n & 3) != 0; ++n)
+		dst[n] = 0;
+}
+
+// Copies the nweights 16-bit weights found at the start of the source vertex into the
+// decoded vertex and zero-fills up to the next multiple of four entries.
+void VertexDecoder::Step_WeightsU16() const
+{
+	const u16_le *src = (const u16_le*)(ptr_);
+	u16 *dst = (u16 *)(decoded_  + decFmt.w0off);
+	int n = 0;
+	while (n < nweights) {
+		dst[n] = src[n];
+		++n;
+	}
+	// Zero additional weights rounding up to 4.
+	for (; (n & 3) != 0; ++n)
+		dst[n] = 0;
+}
+
+// Copies the nweights float weights found at the start of the source vertex into the
+// decoded vertex unchanged and zero-fills up to the next multiple of four entries.
+// Float weights should be uncommon. (PSP uses 0.0-2.0 fixed point numbers for weights)
+// NOTE(review): the earlier comment spoke of multiplying float weights by 2.0 to avoid
+// special checks in the vertex shader generator, but no scaling happens in this step --
+// confirm where (or whether) that factor is applied.
+void VertexDecoder::Step_WeightsFloat() const
+{
+	const float_le *src = (const float_le*)(ptr_);
+	float *dst = (float *)(decoded_ + decFmt.w0off);
+	int n = 0;
+	while (n < nweights) {
+		dst[n] = src[n];
+		++n;
+	}
+	// Zero additional weights rounding up to 4.
+	for (; (n & 3) != 0; ++n)
+		dst[n] = 0.0f;
+}
+
+// Copies the two 8-bit texture coordinates unchanged.
+void VertexDecoder::Step_TcU8() const
+{
+	const u8 *src = (const u8*)(ptr_ + tcoff);
+	u8 *dst = (u8 *)(decoded_ + decFmt.uvoff);
+	for (int k = 0; k < 2; k++)
+		dst[k] = src[k];
+}
+
+// Copies the two 16-bit texture coordinates unchanged.
+void VertexDecoder::Step_TcU16() const
+{
+	const u16_le *src = (const u16_le*)(ptr_ + tcoff);
+	u16 *dst = (u16 *)(decoded_ + decFmt.uvoff);
+	for (int k = 0; k < 2; k++)
+		dst[k] = src[k];
+}
+
+// Stores the two 16-bit texture coordinates multiplied by two (truncated back to 16 bits).
+void VertexDecoder::Step_TcU16Double() const
+{
+	u16 *uv = (u16 *)(decoded_ + decFmt.uvoff);
+	const u16_le *uvdata = (const u16_le*)(ptr_ + tcoff);
+	// (a plain copy of the first coordinate used to precede these stores; it was
+	// overwritten right away and has been dropped)
+	uv[0] = uvdata[0] * 2;
+	uv[1] = uvdata[1] * 2;
+}
+
+// Through-mode variant: copies the two 16-bit texture coordinates unchanged.
+void VertexDecoder::Step_TcU16Through() const
+{
+	const u16_le *src = (const u16_le*)(ptr_ + tcoff);
+	u16 *dst = (u16 *)(decoded_ + decFmt.uvoff);
+	for (int k = 0; k < 2; k++)
+		dst[k] = src[k];
+}
+
+// Through-mode variant: stores the two 16-bit texture coordinates multiplied by two.
+void VertexDecoder::Step_TcU16ThroughDouble() const
+{
+	const u16_le *src = (const u16_le*)(ptr_ + tcoff);
+	u16 *dst = (u16 *)(decoded_ + decFmt.uvoff);
+	for (int k = 0; k < 2; k++)
+		dst[k] = src[k] * 2;
+}
+
+// Copies the two float texture coordinates unchanged.
+void VertexDecoder::Step_TcFloat() const
+{
+	const float_le *src = (const float_le*)(ptr_ + tcoff);
+	float *dst = (float *)(decoded_ + decFmt.uvoff);
+	for (int k = 0; k < 2; k++)
+		dst[k] = src[k];
+}
+
+// Through-mode variant: copies the two float texture coordinates unchanged.
+void VertexDecoder::Step_TcFloatThrough() const
+{
+	const float_le *src = (const float_le*)(ptr_ + tcoff);
+	float *dst = (float *)(decoded_ + decFmt.uvoff);
+	for (int k = 0; k < 2; k++)
+		dst[k] = src[k];
+}
+
+// Converts the two 8-bit texture coordinates to floats: each value is multiplied by 1/128,
+// then by the current UV scale (gstate_c.uv.uScale / vScale), and the UV offset
+// (gstate_c.uv.uOff / vOff) is added.
+void VertexDecoder::Step_TcU8Prescale() const {
+	float *uv = (float *)(decoded_ + decFmt.uvoff);
+	const u8 *uvdata = (const u8 *)(ptr_ + tcoff);
+	uv[0] = (float)uvdata[0] * (1.f / 128.f) * gstate_c.uv.uScale + gstate_c.uv.uOff;
+	uv[1] = (float)uvdata[1] * (1.f / 128.f) * gstate_c.uv.vScale + gstate_c.uv.vOff;
+}
+
+// Converts the two 16-bit texture coordinates to floats: each value is multiplied by
+// 1/32768, then by the current UV scale (gstate_c.uv.uScale / vScale), and the UV offset
+// (gstate_c.uv.uOff / vOff) is added.
+void VertexDecoder::Step_TcU16Prescale() const {
+	float *uv = (float *)(decoded_ + decFmt.uvoff);
+	const u16_le *uvdata = (const u16_le *)(ptr_ + tcoff);
+	uv[0] = (float)uvdata[0] * (1.f / 32768.f) * gstate_c.uv.uScale + gstate_c.uv.uOff;
+	uv[1] = (float)uvdata[1] * (1.f / 32768.f) * gstate_c.uv.vScale + gstate_c.uv.vOff;
+}
+
+// Applies the current UV scale (gstate_c.uv.uScale / vScale) and offset
+// (gstate_c.uv.uOff / vOff) to the two float texture coordinates.
+void VertexDecoder::Step_TcFloatPrescale() const {
+	float *uv = (float *)(decoded_ + decFmt.uvoff);
+	const float_le *uvdata = (const float_le*)(ptr_ + tcoff);
+	uv[0] = uvdata[0] * gstate_c.uv.uScale + gstate_c.uv.uOff;
+	uv[1] = uvdata[1] * gstate_c.uv.vScale + gstate_c.uv.vOff;
+}
+
+// Expands a 16-bit 5-6-5 colour to four bytes. Output order is alpha first (fixed at 255),
+// followed by the fields from bits 0-4, 5-10 and 11-15.
+void VertexDecoder::Step_Color565() const
+{
+	const u16 packed = (u16)(*(u16_le*)(ptr_ + coloff));
+	u8 *dst = decoded_ + decFmt.c0off;
+
+	// the format carries no alpha, so it is fully opaque
+	dst[0] = 255;
+	dst[1] = Convert5To8(packed & 0x1f);
+	dst[2] = Convert6To8((packed>>5) & 0x3f);
+	dst[3] = Convert5To8((packed>>11) & 0x1f);
+}
+
+// Expands a 16-bit 5-5-5-1 colour to four bytes. Output order is alpha first (0 or 255,
+// from bit 15), followed by the 5-bit fields from bits 0-4, 5-9 and 10-14.
+void VertexDecoder::Step_Color5551() const
+{
+	u8 *c = decoded_ + decFmt.c0off;
+	u16 cdata = (u16)(*(u16_le*)(ptr_ + coloff));
+	// Dx want ARGB: alpha goes first, exactly as in the other colour steps
+	// (including Step_Color5551Morph). This step used to emit the colour channels first
+	// and alpha last.
+	c[0] = (cdata >> 15) ? 255 : 0;
+	c[1] = Convert5To8(cdata & 0x1f);
+	c[2] = Convert5To8((cdata>>5) & 0x1f);
+	c[3] = Convert5To8((cdata>>10) & 0x1f);
+}
+
+// Expands a 16-bit 4-4-4-4 colour to four bytes. Output order is alpha first (bits 12-15),
+// followed by the nibbles from bits 0-3, 4-7 and 8-11.
+void VertexDecoder::Step_Color4444() const
+{
+	const u16 packed = (u16)(*(u16_le*)(ptr_ + coloff));
+	u8 *dst = decoded_ + decFmt.c0off;
+	dst[0] = Convert4To8((packed >> 12) & 0xF);
+	// remaining nibbles in ascending bit order
+	for (int k = 0; k < 3; k++)
+		dst[k + 1] = Convert4To8((packed >> (k * 4)) & 0xF);
+}
+
+// Reorders a four-byte colour: the source's last byte moves to the front, the first
+// three bytes follow in their original order.
+void VertexDecoder::Step_Color8888() const
+{
+	// Directx want ARGB
+	const u8 *src = (const u8*)(ptr_ + coloff);
+	u8 *dst = (u8*)(decoded_ + decFmt.c0off);
+	dst[0] = src[3];
+	for (int k = 0; k < 3; k++)
+		dst[k + 1] = src[k];
+}
+
+// Morphing version of the 5-6-5 colour step: accumulates the colour of every morph target
+// weighted by gstate_c.morphWeights, then stores alpha (fixed at 255) followed by the
+// three accumulated channels.
+void VertexDecoder::Step_Color565Morph() const
+{
+	float acc[3] = {0};
+	int target = 0;
+	while (target < morphcount)
+	{
+		const float w = gstate_c.morphWeights[target];
+		// each morph target is a complete vertex, onesize_ bytes apart
+		const u16 packed = (u16)(*(u16_le*)(ptr_ + onesize_*target + coloff));
+
+		acc[0] += w * (packed & 0x1f) * (255.0f / 31.0f);
+		acc[1] += w * ((packed>>5) & 0x3f) * (255.0f / 63.0f);
+		acc[2] += w * ((packed>>11) & 0x1f) * (255.0f / 31.0f);
+		++target;
+	}
+	u8 *dst = decoded_ + decFmt.c0off;
+	// Dx want ARGB
+	dst[0] = 255;
+	for (int k = 0; k < 3; k++)
+		dst[k + 1] = (u8)acc[k];
+}
+
+// Morphing version of the 5-5-5-1 colour step: accumulates the colour of every morph
+// target weighted by gstate_c.morphWeights, then stores alpha followed by the three
+// accumulated colour channels.
+void VertexDecoder::Step_Color5551Morph() const
+{
+	float acc[4] = {0};
+	int target = 0;
+	while (target < morphcount)
+	{
+		const float w = gstate_c.morphWeights[target];
+		// each morph target is a complete vertex, onesize_ bytes apart
+		const u16 packed = (u16)(*(u16_le*)(ptr_ + onesize_*target + coloff));
+		acc[0] += w * (packed & 0x1f) * (255.0f / 31.0f);
+		acc[1] += w * ((packed>>5) & 0x1f) * (255.0f / 31.0f);
+		acc[2] += w * ((packed>>10) & 0x1f) * (255.0f / 31.0f);
+		acc[3] += w * ((packed>>15) ? 255.0f : 0.0f);
+		++target;
+	}
+	u8 *dst = decoded_ + decFmt.c0off;
+	// Dx want ARGB
+	dst[0] = (u8)acc[3];
+	for (int k = 0; k < 3; k++)
+		dst[k + 1] = (u8)acc[k];
+}
+
+// Morphing version of the 4-4-4-4 colour step: accumulates the four nibbles of every morph
+// target weighted by gstate_c.morphWeights, then stores the top nibble's channel (alpha)
+// first, followed by the other three.
+void VertexDecoder::Step_Color4444Morph() const
+{
+	float acc[4] = {0};
+	int target = 0;
+	while (target < morphcount)
+	{
+		const float w = gstate_c.morphWeights[target];
+		// each morph target is a complete vertex, onesize_ bytes apart
+		const u16 packed = (u16)(*(u16_le*)(ptr_ + onesize_*target + coloff));
+		for (int ch = 0, shift = 0; ch < 4; ch++, shift += 4)
+			acc[ch] += w * ((packed >> shift) & 0xF) * (255.0f / 15.0f);
+		++target;
+	}
+	u8 *dst = decoded_ + decFmt.c0off;
+	// Dx want ARGB
+	dst[0] = (u8)acc[3];
+	for (int k = 0; k < 3; k++)
+		dst[k + 1] = (u8)acc[k];
+}
+
+// Morphing version of the four-byte colour step: accumulates the four bytes of every
+// morph target weighted by gstate_c.morphWeights, then stores the last channel first,
+// followed by the other three.
+void VertexDecoder::Step_Color8888Morph() const
+{
+	float acc[4] = {0};
+	int target = 0;
+	while (target < morphcount)
+	{
+		const float w = gstate_c.morphWeights[target];
+		// each morph target is a complete vertex, onesize_ bytes apart
+		const u8 *src = (const u8*)(ptr_ + onesize_*target + coloff);
+		for (int ch = 0; ch < 4; ch++)
+			acc[ch] += w * src[ch];
+		++target;
+	}
+	u8 *dst = decoded_ + decFmt.c0off;
+
+	// Dx want ARGB
+	dst[0] = (u8)acc[3];
+	for (int k = 0; k < 3; k++)
+		dst[k + 1] = (u8)acc[k];
+}
+
+// Copies the three signed 8-bit normal components, inverting all bits of each one when
+// bit 0 of gstate.reversenormals is set, and writes a zero into the fourth byte.
+void VertexDecoder::Step_NormalS8() const
+{
+	const s8 *src = (const s8*)(ptr_ + nrmoff);
+	s8 *dst = (s8 *)(decoded_ + decFmt.nrmoff);
+	// Using xor instead of - to handle -128
+	const u8 flip = (gstate.reversenormals & 1) ? 0xFF : 0;
+	int k = 0;
+	while (k < 3) {
+		dst[k] = src[k] ^ flip;
+		++k;
+	}
+	dst[3] = 0;
+}
+
+// Copies the three signed 16-bit normal components, inverting all bits of each one when
+// bit 0 of gstate.reversenormals is set, and writes a zero into the fourth slot.
+void VertexDecoder::Step_NormalS16() const
+{
+	const s16_le *src = (const s16_le*)(ptr_ + nrmoff);
+	s16 *dst = (s16 *)(decoded_ + decFmt.nrmoff);
+	// xor rather than negation, as in the 8-bit variant
+	const u16 flip = (gstate.reversenormals & 1) ? 0xFFFF : 0;
+	int k = 0;
+	while (k < 3) {
+		dst[k] = src[k] ^ flip;
+		++k;
+	}
+	dst[3] = 0;
+}
+
+// Copies the three float normal components, negated when bit 0 of
+// gstate.reversenormals is set.
+void VertexDecoder::Step_NormalFloat() const
+{
+	const float_le *src = (const float_le*)(ptr_ + nrmoff);
+	float *dst = (float *)(decoded_ + decFmt.nrmoff);
+	const float sign = (gstate.reversenormals & 1) ? -1.0f : 1.0f;
+	int k = 0;
+	while (k < 3) {
+		dst[k] = src[k] * sign;
+		++k;
+	}
+}
+
+// Accumulates the signed 8-bit normals of all morph targets into one float
+// normal. Each target contributes component * weight / 127, with the weight
+// negated when bit 0 of gstate.reversenormals is set.
+void VertexDecoder::Step_NormalS8Morph() const
+{
+	float *out = (float *)(decoded_ + decFmt.nrmoff);
+	out[0] = 0.0f;
+	out[1] = 0.0f;
+	out[2] = 0.0f;
+
+	const bool flip = (gstate.reversenormals & 1) != 0;
+	for (int target = 0; target < morphcount; target++)
+	{
+		float scale = gstate_c.morphWeights[target];
+		if (flip)
+			scale = -scale;
+		scale *= (1.0f/127.0f);
+
+		const s8 *src = (const s8*)(ptr_ + onesize_*target + nrmoff);
+		for (int axis = 0; axis < 3; axis++)
+			out[axis] += src[axis] * scale;
+	}
+}
+
+// Accumulates the signed 16-bit normals of all morph targets into one float
+// normal. Each target contributes component * weight / 32767, with the weight
+// negated when bit 0 of gstate.reversenormals is set.
+void VertexDecoder::Step_NormalS16Morph() const
+{
+	float *out = (float *)(decoded_ + decFmt.nrmoff);
+	out[0] = 0.0f;
+	out[1] = 0.0f;
+	out[2] = 0.0f;
+
+	const bool flip = (gstate.reversenormals & 1) != 0;
+	for (int target = 0; target < morphcount; target++)
+	{
+		float scale = gstate_c.morphWeights[target];
+		if (flip)
+			scale = -scale;
+		scale *= (1.0f/32767.0f);
+
+		const s16_le *src = (const s16_le *)(ptr_ + onesize_*target + nrmoff);
+		for (int axis = 0; axis < 3; axis++)
+			out[axis] += src[axis] * scale;
+	}
+}
+
+// Accumulates the float normals of all morph targets into one float normal,
+// weighting each target by its morph weight (negated when bit 0 of
+// gstate.reversenormals is set).
+void VertexDecoder::Step_NormalFloatMorph() const
+{
+	float *out = (float *)(decoded_ + decFmt.nrmoff);
+	out[0] = 0.0f;
+	out[1] = 0.0f;
+	out[2] = 0.0f;
+
+	const bool flip = (gstate.reversenormals & 1) != 0;
+	for (int target = 0; target < morphcount; target++)
+	{
+		float scale = gstate_c.morphWeights[target];
+		if (flip)
+			scale = -scale;
+
+		const float_le *src = (const float_le*)(ptr_ + onesize_*target + nrmoff);
+		for (int axis = 0; axis < 3; axis++)
+			out[axis] += src[axis] * scale;
+	}
+}
+
+// Copies a 3-component signed 8-bit position unchanged into the decoded vertex
+// and zeroes the fourth byte.
+void VertexDecoder::Step_PosS8() const
+{
+	const s8 *src = (const s8*)(ptr_ + posoff);
+	s8 *dst = (s8 *)(decoded_ + decFmt.posoff);
+	dst[0] = src[0];
+	dst[1] = src[1];
+	dst[2] = src[2];
+	dst[3] = 0;
+}
+
+// Copies a 3-component signed 16-bit position unchanged into the decoded
+// vertex and zeroes the fourth element.
+void VertexDecoder::Step_PosS16() const
+{
+	const s16_le *src = (const s16_le*)(ptr_ + posoff);
+	s16 *dst = (s16 *)(decoded_ + decFmt.posoff);
+	dst[0] = src[0];
+	dst[1] = src[1];
+	dst[2] = src[2];
+	dst[3] = 0;
+}
+
+// Copies a 3-component float position into the decoded vertex.
+void VertexDecoder::Step_PosFloat() const
+{
+	const float_le *src = (const float_le*)(ptr_ + posoff);
+	float *dst = (float *)(decoded_ + decFmt.posoff);
+	for (int axis = 0; axis < 3; axis++)
+		dst[axis] = src[axis];
+}
+
+// Through-mode position: widens a 3-component signed 8-bit position to floats
+// without scaling, then writes a zero as a fourth float.
+// NOTE(review): SetVertexType declares through-mode positions as DEC_FLOAT_3,
+// yet a fourth float is stored here - confirm the decoded buffer has room.
+void VertexDecoder::Step_PosS8Through() const
+{
+	const s8 *src = (const s8*)(ptr_ + posoff);
+	float *dst = (float *)(decoded_ + decFmt.posoff);
+	for (int axis = 0; axis < 3; axis++)
+		dst[axis] = src[axis];
+	dst[3] = 0;
+}
+
+// Through-mode position: widens a 3-component signed 16-bit position to floats
+// without scaling, then writes a zero as a fourth float.
+// NOTE(review): SetVertexType declares through-mode positions as DEC_FLOAT_3,
+// yet a fourth float is stored here - confirm the decoded buffer has room.
+void VertexDecoder::Step_PosS16Through() const
+{
+	const s16_le *src = (const s16_le*)(ptr_ + posoff);
+	float *dst = (float *)(decoded_ + decFmt.posoff);
+	for (int axis = 0; axis < 3; axis++)
+		dst[axis] = src[axis];
+	dst[3] = 0;
+}
+
+// Through-mode position: copies a 3-component float position, then writes a
+// zero as a fourth float.
+// NOTE(review): SetVertexType declares through-mode positions as DEC_FLOAT_3,
+// yet a fourth float is stored here - confirm the decoded buffer has room.
+void VertexDecoder::Step_PosFloatThrough() const
+{
+	const float_le *src = (const float_le*)(ptr_ + posoff);
+	float *dst = (float *)(decoded_ + decFmt.posoff);
+	for (int axis = 0; axis < 3; axis++)
+		dst[axis] = src[axis];
+	dst[3] = 0;
+}
+
+// Accumulates the signed 8-bit positions of all morph targets into one float
+// position; each target contributes component * (weight / 127).
+void VertexDecoder::Step_PosS8Morph() const
+{
+	float *out = (float *)(decoded_ + decFmt.posoff);
+	out[0] = 0.0f;
+	out[1] = 0.0f;
+	out[2] = 0.0f;
+
+	const float unit = 1.0f / 127.0f;
+	for (int target = 0; target < morphcount; target++) {
+		const s8 *src = (const s8*)(ptr_ + onesize_*target + posoff);
+		for (int axis = 0; axis < 3; axis++)
+			out[axis] += (float)src[axis] * (unit * gstate_c.morphWeights[target]);
+	}
+}
+
+// Accumulates the signed 16-bit positions of all morph targets into one float
+// position; each target contributes component * (weight / 32767).
+void VertexDecoder::Step_PosS16Morph() const
+{
+	float *out = (float *)(decoded_ + decFmt.posoff);
+	out[0] = 0.0f;
+	out[1] = 0.0f;
+	out[2] = 0.0f;
+
+	const float unit = 1.0f / 32767.0f;
+	for (int target = 0; target < morphcount; target++) {
+		const s16_le *src = (const s16_le*)(ptr_ + onesize_*target + posoff);
+		for (int axis = 0; axis < 3; axis++)
+			out[axis] += (float)src[axis] * (unit * gstate_c.morphWeights[target]);
+	}
+}
+
+// Accumulates the float positions of all morph targets into one float
+// position, weighting each target by its morph weight.
+void VertexDecoder::Step_PosFloatMorph() const
+{
+	float *out = (float *)(decoded_ + decFmt.posoff);
+	out[0] = 0.0f;
+	out[1] = 0.0f;
+	out[2] = 0.0f;
+
+	for (int target = 0; target < morphcount; target++) {
+		const float_le *src = (const float_le*)(ptr_ + onesize_*target + posoff);
+		for (int axis = 0; axis < 3; axis++)
+			out[axis] += src[axis] * gstate_c.morphWeights[target];
+	}
+}
+
+static const StepFunction wtstep[4] = {  // weight decode step, indexed by the vertex type's weight format field
+	0,  // format 0 has no step; SetVertexType only indexes this table when the field is non-zero
+	&VertexDecoder::Step_WeightsU8,
+	&VertexDecoder::Step_WeightsU16,
+	&VertexDecoder::Step_WeightsFloat,
+};
+
+static const StepFunction tcstep[4] = {  // texcoord step for non-through vertices, indexed by the tc format field
+	0,  // no texcoords
+	&VertexDecoder::Step_TcU8,
+	&VertexDecoder::Step_TcU16,
+	&VertexDecoder::Step_TcFloat,
+};
+
+static const StepFunction tcstep_prescale[4] = {  // picked by SetVertexType when g_Config.bPrescaleUV applies; output is DEC_FLOAT_2
+	0,  // no texcoords
+	&VertexDecoder::Step_TcU8Prescale,
+	&VertexDecoder::Step_TcU16Prescale,
+	&VertexDecoder::Step_TcFloatPrescale,
+};
+
+static const StepFunction tcstep_through[4] = {  // texcoord step for through-mode vertices
+	0,  // no texcoords
+	&VertexDecoder::Step_TcU8,  // same step as the non-through table
+	&VertexDecoder::Step_TcU16Through,
+	&VertexDecoder::Step_TcFloatThrough,
+};
+
+// Some HD Remaster games double the u16 texture coordinates; SetVertexType uses the two tables below when g_DoubleTextureCoordinates is set.
+static const StepFunction tcstep_Remaster[4] = {
+	0,  // no texcoords
+	&VertexDecoder::Step_TcU8,
+	&VertexDecoder::Step_TcU16Double,  // only the u16 entry differs from tcstep
+	&VertexDecoder::Step_TcFloat,
+};
+
+static const StepFunction tcstep_through_Remaster[4] = {
+	0,  // no texcoords
+	&VertexDecoder::Step_TcU8,
+	&VertexDecoder::Step_TcU16ThroughDouble,  // only the u16 entry differs from tcstep_through
+	&VertexDecoder::Step_TcFloatThrough,
+};
+
+// TODO: Tc Morph (there are no morphing texcoord tables yet)
+
+static const StepFunction colstep[8] = {  // color step, indexed by the 3-bit color format field
+	0, 0, 0, 0,  // formats 0-3 have no step
+	&VertexDecoder::Step_Color565,
+	&VertexDecoder::Step_Color5551,
+	&VertexDecoder::Step_Color4444,
+	&VertexDecoder::Step_Color8888,
+};
+
+static const StepFunction colstep_morph[8] = {  // used instead of colstep when morphcount != 1
+	0, 0, 0, 0,  // formats 0-3 have no step
+	&VertexDecoder::Step_Color565Morph,
+	&VertexDecoder::Step_Color5551Morph,
+	&VertexDecoder::Step_Color4444Morph,
+	&VertexDecoder::Step_Color8888Morph,
+};
+
+static const StepFunction nrmstep[4] = {  // normal step, indexed by the normal format field
+	0,  // no normal
+	&VertexDecoder::Step_NormalS8,
+	&VertexDecoder::Step_NormalS16,
+	&VertexDecoder::Step_NormalFloat,
+};
+
+static const StepFunction nrmstep_morph[4] = {  // used instead of nrmstep when morphcount != 1; these steps write floats
+	0,  // no normal
+	&VertexDecoder::Step_NormalS8Morph,
+	&VertexDecoder::Step_NormalS16Morph,
+	&VertexDecoder::Step_NormalFloatMorph,
+};
+
+static const StepFunction posstep[4] = {  // position step for non-through, non-morphed vertices
+	0,  // NOTE(review): SetVertexType indexes the position tables unconditionally, so format 0 would store this null entry
+	&VertexDecoder::Step_PosS8,
+	&VertexDecoder::Step_PosS16,
+	&VertexDecoder::Step_PosFloat,
+};
+
+static const StepFunction posstep_morph[4] = {  // used instead of posstep when morphcount != 1; these steps write floats
+	0,
+	&VertexDecoder::Step_PosS8Morph,
+	&VertexDecoder::Step_PosS16Morph,
+	&VertexDecoder::Step_PosFloatMorph,
+};
+
+static const StepFunction posstep_through[4] = {  // position step for through-mode vertices; these steps write floats
+	0,
+	&VertexDecoder::Step_PosS8Through,
+	&VertexDecoder::Step_PosS16Through,
+	&VertexDecoder::Step_PosFloatThrough,
+};
+
+
+// Rounds x up to the next multiple of four; multiples of four are returned unchanged.
+int RoundUp4(int x) {
+	const int mask = 4 - 1;
+	const int bumped = x + mask;
+	return bumped & ~mask;
+}
+
+void VertexDecoder::SetVertexType(u32 fmt) {  // Builds the decode step list (steps_) and the decoded layout (decFmt) for the vertex type word fmt.
+	fmt_ = fmt;
+	throughmode = (fmt & GE_VTYPE_THROUGH) != 0;
+	numSteps_ = 0;
+
+	int biggest = 0;  // largest component alignment seen so far; NOTE(review): a local, not the class member of the same name
+	size = 0;  // running byte size of one source vertex
+
+	tc = fmt & 0x3;  // bits 0-1: texcoord format
+	col = (fmt >> 2) & 0x7;  // bits 2-4: color format
+	nrm = (fmt >> 5) & 0x3;  // bits 5-6: normal format
+	pos = (fmt >> 7) & 0x3;  // bits 7-8: position format
+	weighttype = (fmt >> 9) & 0x3;  // bits 9-10: weight format
+	idx = (fmt >> 11) & 0x3;  // bits 11-12: index format
+	morphcount = ((fmt >> 18) & 0x7)+1;  // bits 18-20 hold count - 1
+	nweights = ((fmt >> 14) & 0x7)+1;  // bits 14-16 hold count - 1
+
+	int decOff = 0;  // running byte offset inside one decoded vertex
+	memset(&decFmt, 0, sizeof(decFmt));
+
+	DEBUG_LOG(G3D,"VTYPE: THRU=%i TC=%i COL=%i POS=%i NRM=%i WT=%i NW=%i IDX=%i MC=%i", (int)throughmode, tc,col,pos,nrm,weighttype,nweights,idx,morphcount);
+
+	if (weighttype) { // && nweights?
+		//size = align(size, wtalign[weighttype]);	unnecessary
+		size += wtsize[weighttype] * nweights;  // weights are the first component, so size is still 0 before this
+		if (wtalign[weighttype] > biggest)
+			biggest = wtalign[weighttype];
+
+		steps_[numSteps_++] = wtstep[weighttype];
+
+		int fmtBase = DEC_FLOAT_1;  // one-component decoded format matching the weight type
+		if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) {
+			fmtBase = DEC_U8_1;
+		} else if (weighttype == GE_VTYPE_WEIGHT_16BIT >> GE_VTYPE_WEIGHT_SHIFT) {
+			fmtBase = DEC_U16_1;
+		} else if (weighttype == GE_VTYPE_WEIGHT_FLOAT >> GE_VTYPE_WEIGHT_SHIFT) {
+			fmtBase = DEC_FLOAT_1;
+		}
+
+		int numWeights = TranslateNumBones(nweights);  // decoded weight count may differ from the source count
+
+		if (numWeights <= 4) {  // everything fits in the first weight attribute
+			decFmt.w0off = decOff;
+			decFmt.w0fmt = fmtBase + numWeights - 1;  // relies on the _1.._4 enum values being consecutive
+			decOff += DecFmtSize(decFmt.w0fmt);
+		} else {  // four weights in w0, the remainder in w1
+			decFmt.w0off = decOff;
+			decFmt.w0fmt = fmtBase + 3;
+			decOff += DecFmtSize(decFmt.w0fmt);
+			decFmt.w1off = decOff;
+			decFmt.w1fmt = fmtBase + numWeights - 5;
+			decOff += DecFmtSize(decFmt.w1fmt);
+		}
+	}
+
+	if (tc) {
+		size = align(size, tcalign[tc]);
+		tcoff = size;  // source offset of the texcoords
+		size += tcsize[tc];
+		if (tcalign[tc] > biggest)
+			biggest = tcalign[tc];
+
+		if (g_Config.bPrescaleUV && !throughmode && gstate.getTextureFunction() == 0) {  // prescale steps always output floats
+			steps_[numSteps_++] = tcstep_prescale[tc];
+			decFmt.uvfmt = DEC_FLOAT_2;
+		} else {
+			if (g_DoubleTextureCoordinates)
+				steps_[numSteps_++] = throughmode ? tcstep_through_Remaster[tc] : tcstep_Remaster[tc];
+			else
+		steps_[numSteps_++] = throughmode ? tcstep_through[tc] : tcstep[tc];  // body of the else above, despite the indentation
+
+		switch (tc) {  // still inside the non-prescale branch
+		case GE_VTYPE_TC_8BIT >> GE_VTYPE_TC_SHIFT:
+			decFmt.uvfmt = throughmode ? DEC_U8A_2 : DEC_U8_2;
+			break;
+		case GE_VTYPE_TC_16BIT >> GE_VTYPE_TC_SHIFT:
+			decFmt.uvfmt = throughmode ? DEC_U16A_2 : DEC_U16_2;
+			break;
+		case GE_VTYPE_TC_FLOAT >> GE_VTYPE_TC_SHIFT:
+			decFmt.uvfmt = DEC_FLOAT_2;
+			break;
+		}
+		}  // closes the non-prescale else
+
+		decFmt.uvoff = decOff;
+		decOff += DecFmtSize(decFmt.uvfmt);
+	}
+
+	if (col) {
+		size = align(size, colalign[col]);
+		coloff = size;  // source offset of the color
+		size += colsize[col];
+		if (colalign[col] > biggest)
+			biggest = colalign[col]; 
+
+		steps_[numSteps_++] = morphcount == 1 ? colstep[col] : colstep_morph[col];
+
+		// All color formats decode to DEC_U8_4 currently.
+		// They can become floats later during transform though.
+		decFmt.c0fmt = DEC_U8_4;
+		decFmt.c0off = decOff;
+		decOff += DecFmtSize(decFmt.c0fmt);
+	} else {
+		coloff = 0;
+	}
+
+	if (nrm) {
+		size = align(size, nrmalign[nrm]);
+		nrmoff = size;  // source offset of the normal
+		size += nrmsize[nrm];
+		if (nrmalign[nrm] > biggest)
+			biggest = nrmalign[nrm]; 
+
+		steps_[numSteps_++] = morphcount == 1 ? nrmstep[nrm] : nrmstep_morph[nrm];
+
+		if (morphcount == 1) {
+			// The normal formats match the gl formats perfectly, let's use 'em.
+			switch (nrm) {
+			case GE_VTYPE_NRM_8BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S8_3; break;
+			case GE_VTYPE_NRM_16BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S16_3; break;
+			case GE_VTYPE_NRM_FLOAT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_FLOAT_3; break;
+			}
+		} else {
+			decFmt.nrmfmt = DEC_FLOAT_3;  // the morph steps blend into floats
+		}
+
+		// Record where the normal lands in the decoded vertex and advance past it.
+		decFmt.nrmoff = decOff;
+		decOff += DecFmtSize(decFmt.nrmfmt);
+	}
+
+	//if (pos)  - there's always a position
+	{
+		size = align(size, posalign[pos]);
+		posoff = size;  // source offset of the position
+		size += possize[pos];
+		if (posalign[pos] > biggest)
+			biggest = posalign[pos];
+
+		if (throughmode) {  // through-mode steps always write floats
+			steps_[numSteps_++] = posstep_through[pos];
+			decFmt.posfmt = DEC_FLOAT_3;
+		} else {
+			steps_[numSteps_++] = morphcount == 1 ? posstep[pos] : posstep_morph[pos];
+
+			if (morphcount == 1) {
+				// The non-through-mode position formats match the gl formats perfectly, let's use 'em.
+				switch (pos) {
+				case GE_VTYPE_POS_8BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S8_3; break;
+				case GE_VTYPE_POS_16BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S16_3; break;
+				case GE_VTYPE_POS_FLOAT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_FLOAT_3; break;
+				}
+			} else {
+				// Morphed positions are blended into floats by the *Morph steps.
+				decFmt.posfmt = DEC_FLOAT_3;
+			}
+		}
+		decFmt.posoff = decOff;
+		decOff += DecFmtSize(decFmt.posfmt);
+	}
+	decFmt.stride = decOff;  // total bytes of one decoded vertex
+
+	size = align(size, biggest);
+	onesize_ = size;  // bytes of a single morph target in the source data
+	size *= morphcount;  // a full source vertex holds every morph target back to back
+	DEBUG_LOG(G3D,"SVT : size = %i, aligned to biggest %i", size, biggest);
+}
+
+// Finds the smallest and largest vertex index referenced by a draw, so that
+// only that range of vertices needs to be decoded.
+//   inds     - index buffer; only read when vertType selects 8-bit or 16-bit indices.
+//   count    - number of indices (or of vertices when the draw is not indexed).
+//   vertType - vertex type word; only the GE_VTYPE_IDX_MASK bits are examined.
+//   indexLowerBound / indexUpperBound - receive the inclusive bounds.
+// Non-indexed draws report [0, count - 1]. The bounds are truncated to u16 on
+// output, so count == 0 does not yield a meaningful range.
+// Could cache this in display lists. It could also be greatly sped up with
+// SSE2/NEON, although it is rarely a bottleneck.
+void GetIndexBounds(void *inds, int count, u32 vertType, u16 *indexLowerBound, u16 *indexUpperBound) {
+	int lowerBound = 0x7FFFFFFF;
+	int upperBound = 0;
+	u32 idx = vertType & GE_VTYPE_IDX_MASK;
+	if (idx == GE_VTYPE_IDX_8BIT) {
+		const u8 *ind8 = (const u8 *)inds;
+		for (int i = 0; i < count; i++) {
+			const int value = ind8[i];
+			if (value > upperBound)
+				upperBound = value;
+			if (value < lowerBound)
+				lowerBound = value;
+		}
+	} else if (idx == GE_VTYPE_IDX_16BIT) {
+		// Read through u16_le, like the vertex data steps in this file do, so
+		// 16-bit indices get the same little-endian handling as the vertices.
+		const u16_le *ind16 = (const u16_le *)inds;
+		for (int i = 0; i < count; i++) {
+			const int value = (u16)ind16[i];
+			if (value > upperBound)
+				upperBound = value;
+			if (value < lowerBound)
+				lowerBound = value;
+		}
+	} else {
+		lowerBound = 0;
+		upperBound = count - 1;
+	}
+	*indexLowerBound = (u16)lowerBound;
+	*indexUpperBound = (u16)upperBound;
+}
+
+// Decodes the vertices with indices indexLowerBound..indexUpperBound
+// (inclusive), once each, from verts into decodedptr by running every
+// configured step on each vertex.
+void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowerBound, int indexUpperBound) const {
+	// The steps read ptr_ and write decoded_, so these can't be turned into locals for speed.
+	ptr_ = (const u8*)verts + indexLowerBound * size;
+	decoded_ = decodedptr;
+
+	const int outStride = decFmt.stride;
+	const StepFunction *const stepsEnd = steps_ + numSteps_;
+	int index = indexLowerBound;
+	while (index <= indexUpperBound) {
+		for (const StepFunction *step = steps_; step != stepsEnd; ++step)
+			(this->*(*step))();
+		// Advance to the next source vertex and the next decoded slot.
+		ptr_ += size;
+		decoded_ += outStride;
+		++index;
+	}
+}
+
+// Rewrites count source vertices into decoded using the current vertex type
+// with its texcoord format replaced by float, taking the texcoords from
+// customuv (two floats per vertex). Only position, normal and color are
+// carried over. Returns the vertex type word of the rewritten data.
+// TODO: Does not support morphs, skinning etc.
+u32 VertexDecoder::InjectUVs(u8 *decoded, const void *verts, float *customuv, int count) const {
+	const u32 customVertType = (gstate.vertType & ~GE_VTYPE_TC_MASK) | GE_VTYPE_TC_FLOAT;
+	VertexDecoder target;
+	target.SetVertexType(customVertType);
+
+	const u8 *src = (const u8 *)verts;
+	u8 *dst = decoded;
+	const float *uv = customuv;
+	for (int remaining = count; remaining > 0; remaining--) {
+		if (pos)
+			memcpy(dst + target.posoff, src + posoff, possize[pos]);
+		if (nrm)
+			memcpy(dst + target.nrmoff, src + nrmoff, nrmsize[nrm]);
+		if (col)
+			memcpy(dst + target.coloff, src + coloff, colsize[col]);
+		// Ignore others for now, this is all we need for puzbob.
+		// Inject!
+		memcpy(dst + target.tcoff, uv, tcsize[target.tc]);
+
+		uv += 2;
+		src += this->onesize_;
+		dst += target.onesize_;
+	}
+	return customVertType;
+}
+
+// Writes a one-line description of this decoder's vertex format into output
+// (no bounds checking - the caller must supply a large enough buffer) and
+// returns the number of characters written.
+int VertexDecoder::ToString(char *output) const {
+	char *cursor = output;
+
+	// Position is always listed; the other components only when present.
+	cursor += sprintf(cursor, "P: %i ", pos);
+	if (nrm) {
+		cursor += sprintf(cursor, "N: %i ", nrm);
+	}
+	if (col) {
+		cursor += sprintf(cursor, "C: %i ", col);
+	}
+	if (tc) {
+		cursor += sprintf(cursor, "T: %i ", tc);
+	}
+	if (weighttype) {
+		cursor += sprintf(cursor, "W: %i ", weighttype);
+	}
+	if (idx) {
+		cursor += sprintf(cursor, "I: %i ", idx);
+	}
+	if (morphcount > 1) {
+		cursor += sprintf(cursor, "Morph: %i ", morphcount);
+	}
+
+	cursor += sprintf(cursor, "Verts: %i ", stats_[STAT_VERTSSUBMITTED]);
+	if (throughmode) {
+		cursor += sprintf(cursor, " (through)");
+	}
+	cursor += sprintf(cursor, " (size: %i)", VertexSize());
+
+	return cursor - output;
+}
--- a/GPU/Directx9/VertexDecoder.h
+++ b/GPU/Directx9/VertexDecoder.h
@ -0,0 +1,437 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include "../GPUState.h"
+#include "../Globals.h"
+#include "base/basictypes.h"
+#include "Core/Reporting.h"
+
+// DecVtxFormat - vertex formats for PC
+// Kind of like a D3D VertexDeclaration.
+// Can write code to easily bind these using OpenGL, or read these manually.
+// No morph support, that is taken care of by the VertexDecoder.
+
+enum {  // decoded component formats; stored in the *fmt fields of DecVtxFormat
+	DEC_NONE,  // 0: component absent (VertexReader::hasColor0() etc. test for non-zero)
+	DEC_FLOAT_1,  // the _1.._4 runs are consecutive; SetVertexType adds a count to the _1 value
+	DEC_FLOAT_2,
+	DEC_FLOAT_3,
+	DEC_FLOAT_4,
+	DEC_S8_3,
+	DEC_S16_3,
+	DEC_U8_1,
+	DEC_U8_2,
+	DEC_U8_3,
+	DEC_U8_4,
+	DEC_U16_1,
+	DEC_U16_2,
+	DEC_U16_3,
+	DEC_U16_4,
+	DEC_U8A_2,  // chosen by SetVertexType for through-mode 8-bit texcoords
+	DEC_U16A_2,  // chosen by SetVertexType for through-mode 16-bit texcoords; VertexReader returns these unscaled
+};
+
+int DecFmtSize(u8 fmt);
+
+struct DecVtxFormat {  // per component: a DEC_* format code and its byte offset inside one decoded vertex
+	u8 w0fmt; u8 w0off;  // first 4 weights
+	u8 w1fmt; u8 w1off;  // second 4 weights
+	u8 uvfmt; u8 uvoff;  // texture coordinates
+	u8 c0fmt; u8 c0off;  // First color
+	u8 c1fmt; u8 c1off;  // second color
+	u8 nrmfmt; u8 nrmoff;  // normal
+	u8 posfmt; u8 posoff;  // position
+	short stride;  // bytes from one decoded vertex to the next
+};
+
+// Fixed vertex layout: position plus fog, texture coordinate, and two colors.
+struct TransformedVertex
+{
+	float x, y, z, fog;     // in case of morph, preblend during decode
+	float u; float v; float w;   // scaled by uscale, vscale, if there
+	u8 color0[4];   // prelit
+	u8 color1[4];   // prelit
+};
+
+DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt);
+
+class VertexDecoder;
+
+typedef void (VertexDecoder::*StepFunction)() const;
+
+void GetIndexBounds(void *inds, int count, u32 vertType, u16 *indexLowerBound, u16 *indexUpperBound);
+
+enum {  // indices into VertexDecoder::stats_
+	STAT_VERTSSUBMITTED = 0,  // printed as "Verts:" by VertexDecoder::ToString
+	NUM_VERTEX_DECODER_STATS = 1  // number of counters, used as the size of stats_
+};
+
+// Right now
+//   - compiles into list of called functions
+// Future TODO
+//   - will compile into lightning-fast specialized x86 and ARM code
+class VertexDecoder  // Decodes source vertices of one vertex type by running a list of per-component step functions.
+{
+public:
+	VertexDecoder() : coloff(0), nrmoff(0), posoff(0) {}  // every other member stays uninitialized until SetVertexType runs
+	~VertexDecoder() {}
+
+	// Selects the decode steps and the decoded layout (decFmt) for a vertex type word.
+	void SetVertexType(u32 vtype);
+	u32 VertexType() const { return fmt_; }
+	const DecVtxFormat &GetDecVtxFmt() { return decFmt; }
+
+	void DecodeVerts(u8 *decoded, const void *verts, int indexLowerBound, int indexUpperBound) const;  // both bounds are inclusive
+
+	// This could be easily generalized to inject any one component. Don't know another use for it though.
+	u32 InjectUVs(u8 *decoded, const void *verts, float *customuv, int count) const;
+
+	bool hasColor() const { return col != 0; }
+	int VertexSize() const { return size; }  // source bytes per vertex, covering all morph targets
+
+	void Step_WeightsU8() const;  // the Step_* functions are public so the file-level step tables can take their addresses
+	void Step_WeightsU16() const;
+	void Step_WeightsFloat() const;
+
+	void Step_TcU8() const;
+	void Step_TcU16() const;
+	void Step_TcFloat() const;
+
+	void Step_TcU8Prescale() const;
+	void Step_TcU16Prescale() const;
+	void Step_TcFloatPrescale() const;
+
+	void Step_TcU16Double() const;
+	void Step_TcU16Through() const;
+	void Step_TcU16ThroughDouble() const;
+	void Step_TcFloatThrough() const;
+
+	// TODO: tcmorph
+
+	void Step_Color4444() const;
+	void Step_Color565() const;
+	void Step_Color5551() const;
+	void Step_Color8888() const;
+
+	void Step_Color4444Morph() const;
+	void Step_Color565Morph() const;
+	void Step_Color5551Morph() const;
+	void Step_Color8888Morph() const;
+
+	void Step_NormalS8() const;
+	void Step_NormalS16() const;
+	void Step_NormalFloat() const;
+
+	void Step_NormalS8Morph() const;
+	void Step_NormalS16Morph() const;
+	void Step_NormalFloatMorph() const;
+
+	void Step_PosS8() const;
+	void Step_PosS16() const;
+	void Step_PosFloat() const;
+
+	void Step_PosS8Morph() const;
+	void Step_PosS16Morph() const;
+	void Step_PosFloatMorph() const;
+
+	void Step_PosS8Through() const;
+	void Step_PosS16Through() const;
+	void Step_PosFloatThrough() const;
+
+	void ResetStats() {
+		memset(stats_, 0, sizeof(stats_));
+	}
+
+	void IncrementStat(int stat, int amount) {  // stat is an index such as STAT_VERTSSUBMITTED; it is not range-checked
+		stats_[stat] += amount;
+	}
+
+	// output must be big for safety.
+	// Returns number of chars written.
+	// Ugly for speed.
+	int ToString(char *output) const;
+
+	// Mutable decoder state
+	mutable u8 *decoded_;  // where the current vertex is written; advanced by DecodeVerts
+	mutable const u8 *ptr_;  // the current source vertex; advanced by DecodeVerts
+
+	// "Immutable" state, set at startup
+
+	// The decoding steps
+	StepFunction steps_[5];  // SetVertexType adds at most one step each for weights, texcoord, color, normal, position
+	int numSteps_;
+
+	u32 fmt_;  // the vertex type word passed to SetVertexType
+	DecVtxFormat decFmt;  // layout of one decoded vertex
+
+	bool throughmode;
+	int biggest;  // NOTE(review): SetVertexType works on a local of the same name and does not write this member
+	int size;  // source bytes per vertex, onesize_ * morphcount
+	int onesize_;  // source bytes of a single morph target
+
+	int weightoff;  // NOTE(review): not assigned by SetVertexType or the constructor
+	int tcoff;  // source byte offsets of each component within one morph target
+	int coloff;
+	int nrmoff;
+	int posoff;
+
+	int tc;  // format fields extracted from the vertex type word
+	int col;
+	int nrm;
+	int pos;
+	int weighttype;
+	int idx;
+	int morphcount;  // number of morph targets, at least 1
+	int nweights;  // number of weights, at least 1
+
+	int stats_[NUM_VERTEX_DECODER_STATS];
+};
+
+// Reads decoded vertex formats in a convenient way. For software transform and debugging.
+// Every Read* method converts one component of the current vertex (selected
+// with Goto) to floats according to the format codes in the DecVtxFormat.
+// A format the method does not handle is reported through ERROR_LOG and the
+// output array is left untouched.
+class VertexReader
+{
+public:
+	// base points at the first decoded vertex, decFmt describes its layout and
+	// vtype is the vertex type word (only its through-mode bit is used here).
+	VertexReader(u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}
+
+	// Reads the position. Non-through integer positions are scaled by 1/127 or
+	// 1/32767; in through mode X/Y are passed on unscaled and Z is treated as
+	// unsigned and scaled to 0..1.
+	void ReadPos(float pos[3]) const {
+		switch (decFmt_.posfmt) {
+		case DEC_FLOAT_3:
+			{
+				const float *f = (const float *)(data_ + decFmt_.posoff);
+				memcpy(pos, f, 12);
+				if (isThrough()) {
+					// Integer value passed in a float. Wraps and all, required for Monster Hunter.
+					pos[2] = (float)((u16)(s32)pos[2]) * (1.0f / 65535.0f);
+				}
+			}
+			break;
+		case DEC_S16_3:
+			{
+				// X and Y are signed 16 bit, Z is unsigned 16 bit
+				const s16 *s = (const s16 *)(data_ + decFmt_.posoff);
+				const u16 *u = (const u16 *)(data_ + decFmt_.posoff);
+				if (isThrough()) {
+					for (int i = 0; i < 2; i++)
+						pos[i] = s[i];
+					pos[2] = u[2] * (1.0f / 65535.0f);
+				} else {
+					for (int i = 0; i < 3; i++)
+						pos[i] = s[i] * (1.f / 32767.f);
+				}
+			}
+			break;
+		case DEC_S8_3:
+			{
+				// X and Y are signed 8 bit, Z is unsigned 8 bit
+				const s8 *b = (const s8 *)(data_ + decFmt_.posoff);
+				const u8 *u = (const u8 *)(data_ + decFmt_.posoff);
+				if (isThrough()) {
+					for (int i = 0; i < 2; i++)
+						pos[i] = b[i];
+					pos[2] = u[2] / 255.0f;
+				} else {
+					for (int i = 0; i < 3; i++)
+						pos[i] = b[i] * (1.f / 127.f);
+				}
+			}
+			break;
+		default:
+			ERROR_LOG(G3D, "Reader: Unsupported Pos Format");
+			break;
+		}
+	}
+
+	// Reads the normal; integer normals are scaled by 1/127 or 1/32767.
+	void ReadNrm(float nrm[3]) const {
+		switch (decFmt_.nrmfmt) {
+		case DEC_FLOAT_3:
+			//memcpy(nrm, data_ + decFmt_.nrmoff, 12);
+			{
+				const float *f = (const float *)(data_ + decFmt_.nrmoff);
+				for (int i = 0; i < 3; i++)
+					nrm[i] = f[i] ;
+			}
+			break;
+		case DEC_S16_3:
+			{
+				const s16 *s = (const s16 *)(data_ + decFmt_.nrmoff);
+				for (int i = 0; i < 3; i++)
+					nrm[i] = s[i] * (1.f / 32767.f);
+			}
+			break;
+		case DEC_S8_3:
+			{
+				const s8 *b = (const s8 *)(data_ + decFmt_.nrmoff);
+				for (int i = 0; i < 3; i++)
+					nrm[i] = b[i] * (1.f / 127.f);
+			}
+			break;
+		default:
+			ERROR_LOG(G3D, "Reader: Unsupported Nrm Format");
+			break;
+		}
+	}
+
+	// Reads the texture coordinate. DEC_U8_2 / DEC_U16_2 are scaled by 1/128
+	// and 1/32768; the through-mode formats DEC_U8A_2 / DEC_U16A_2 are
+	// returned as the raw integer values.
+	void ReadUV(float uv[2]) const {
+		switch (decFmt_.uvfmt) {
+		case DEC_U8_2:
+			{
+				const u8 *b = (const u8 *)(data_ + decFmt_.uvoff);
+				uv[0] = b[0] * (1.f / 128.f);
+				uv[1] = b[1] * (1.f / 128.f);
+			}
+			break;
+
+		case DEC_U16_2:
+			{
+				const u16 *s = (const u16 *)(data_ + decFmt_.uvoff);
+				uv[0] = s[0] * (1.f / 32768.f);
+				uv[1] = s[1] * (1.f / 32768.f);
+			}
+			break;
+
+		case DEC_FLOAT_2:
+			{
+				const float *f = (const float *)(data_ + decFmt_.uvoff);
+				uv[0] = f[0];
+				uv[1] = f[1];
+			}
+			break;
+
+		case DEC_U8A_2:
+			{
+				// SetVertexType emits this format for through-mode 8-bit
+				// texcoords; read it unscaled, mirroring DEC_U16A_2 below.
+				const u8 *p = (const u8 *)(data_ + decFmt_.uvoff);
+				uv[0] = (float)p[0];
+				uv[1] = (float)p[1];
+			}
+			break;
+
+		case DEC_U16A_2:
+			{
+				const u16 *p = (const u16 *)(data_ + decFmt_.uvoff);
+				uv[0] = (float)p[0];
+				uv[1] = (float)p[1];
+			}
+			break;
+		default:
+			ERROR_LOG(G3D, "Reader: Unsupported UV Format");
+			break;
+		}
+	}
+
+	// Reads the first color as four floats; byte colors are scaled by 1/255.
+	void ReadColor0(float color[4]) const {
+		switch (decFmt_.c0fmt) {
+		case DEC_U8_4:
+			{
+				const u8 *b = (const u8 *)(data_ + decFmt_.c0off);
+				for (int i = 0; i < 4; i++)
+					color[i] = b[i] * (1.f / 255.f);
+			}
+			break;
+		case DEC_FLOAT_4:
+			memcpy(color, data_ + decFmt_.c0off, 16); 
+			break;
+		default:
+			ERROR_LOG(G3D, "Reader: Unsupported C0 Format");
+			break;
+		}
+	}
+
+	// Reads the first three components of the second color; byte colors are
+	// scaled by 1/255.
+	void ReadColor1(float color[3]) const {
+		switch (decFmt_.c1fmt) {
+		case DEC_U8_4:
+			{
+				const u8 *b = (const u8 *)(data_ + decFmt_.c1off);
+				for (int i = 0; i < 3; i++)
+					color[i] = b[i] * (1.f / 255.f);
+			}
+			break;
+		case DEC_FLOAT_4:
+			memcpy(color, data_ + decFmt_.c1off, 12); 
+			break;
+		default:
+			ERROR_LOG(G3D, "Reader: Unsupported C1 Format");
+			break;
+		}
+	}
+
+	// Reads up to eight weights: the w0 group fills weights[0..3] and the
+	// optional w1 group fills weights[4..7]. Only as many entries as the format
+	// codes describe are written. Integer weights are scaled by 1/128 or 1/32768.
+	void ReadWeights(float weights[8]) const {
+		const float *f = (const float *)(data_ + decFmt_.w0off);
+		const u8 *b = (const u8 *)(data_ + decFmt_.w0off);
+		const u16 *s = (const u16 *)(data_ + decFmt_.w0off);
+		switch (decFmt_.w0fmt) {
+		case DEC_FLOAT_1:
+		case DEC_FLOAT_2:
+		case DEC_FLOAT_3:
+		case DEC_FLOAT_4:
+			// The distance from DEC_FLOAT_1 gives the component count minus one.
+			for (int i = 0; i <= decFmt_.w0fmt - DEC_FLOAT_1; i++)
+				weights[i] = f[i];
+			break;
+		case DEC_U8_1: weights[0] = b[0] * (1.f / 128.f); break;
+		case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i] = b[i] * (1.f / 128.f); break;
+		case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i] = b[i] * (1.f / 128.f); break;
+		case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i] = b[i] * (1.f / 128.f); break;
+		case DEC_U16_1: weights[0] = s[0] * (1.f / 32768.f); break;
+		case DEC_U16_2: for (int i = 0; i < 2; i++) weights[i] = s[i] * (1.f / 32768.f); break;
+		case DEC_U16_3: for (int i = 0; i < 3; i++) weights[i] = s[i] * (1.f / 32768.f); break;
+		case DEC_U16_4: for (int i = 0; i < 4; i++) weights[i] = s[i] * (1.f / 32768.f); break;
+		default:
+			ERROR_LOG(G3D, "Reader: Unsupported W0 Format");
+			break;
+		}
+
+		f = (const float *)(data_ + decFmt_.w1off);
+		b = (const u8 *)(data_ + decFmt_.w1off);
+		s = (const u16 *)(data_ + decFmt_.w1off);
+		switch (decFmt_.w1fmt) {
+		case 0:
+			// It's fine for there to be w0 weights but not w1.
+			break;
+		case DEC_FLOAT_1:
+		case DEC_FLOAT_2:
+		case DEC_FLOAT_3:
+		case DEC_FLOAT_4:
+			for (int i = 0; i <= decFmt_.w1fmt - DEC_FLOAT_1; i++)
+				weights[i+4] = f[i];
+			break;
+		case DEC_U8_1: weights[4] = b[0] * (1.f / 128.f); break;
+		case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i+4] = b[i] * (1.f / 128.f); break;
+		case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i+4] = b[i] * (1.f / 128.f); break;
+		case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i+4] = b[i] * (1.f / 128.f); break;
+		case DEC_U16_1: weights[4] = s[0] * (1.f / 32768.f); break;
+		case DEC_U16_2: for (int i = 0; i < 2; i++) weights[i+4] = s[i] * (1.f / 32768.f); break;
+		case DEC_U16_3: for (int i = 0; i < 3; i++) weights[i+4] = s[i] * (1.f / 32768.f); break;
+		case DEC_U16_4: for (int i = 0; i < 4; i++) weights[i+4] = s[i]  * (1.f / 32768.f); break;
+		default:
+			ERROR_LOG(G3D, "Reader: Unsupported W1 Format");
+			break;
+		}
+	}
+
+	// A component is present when its format code is not DEC_NONE (0).
+	bool hasColor0() const { return decFmt_.c0fmt != 0; }
+	bool hasColor1() const { return decFmt_.c1fmt != 0; }
+	bool hasNormal() const { return decFmt_.nrmfmt != 0; }
+	bool hasUV() const { return decFmt_.uvfmt != 0; }
+	bool isThrough() const { return (vtype_ & GE_VTYPE_THROUGH) != 0; }
+	// Selects the vertex that subsequent Read* calls operate on.
+	void Goto(int index) {
+		data_ = base_ + index * decFmt_.stride;
+	}
+
+private:
+	u8 *base_;   // first decoded vertex
+	u8 *data_;   // currently selected vertex
+	DecVtxFormat decFmt_;
+	int vtype_;
+};
+
+// Debugging utilities
+void PrintDecodedVertex(VertexReader &vtx);
+
+
--- a/GPU/Directx9/VertexShaderGenerator.cpp
+++ b/GPU/Directx9/VertexShaderGenerator.cpp
@ -0,0 +1,251 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include <stdio.h>
+#include <locale.h>
+
+#if defined(_WIN32) && defined(_DEBUG)
+#include "Common/CommonWindows.h"
+#endif
+
+#include "base/stringutil.h"
+#include "GPU/ge_constants.h"
+#include "GPU/GPUState.h"
+#include "Core/Config.h"
+
+#include "GPU/Directx9/VertexShaderGenerator.h"
+
+#undef WRITE
+
+#define WRITE p+=sprintf
+
+// Hardware transform is usable only when it is enabled in the config, the GE
+// is not in through mode, and the primitive is not GE_PRIM_RECTANGLES.
+bool CanUseHardwareTransform(int prim) {
+	if (g_Config.bHardwareTransform) {
+		if (gstate.isModeThrough())
+			return false;
+		return prim != GE_PRIM_RECTANGLES;
+	}
+	return false;
+}
+
+// Maps a bone count to the count actually used: zero stays zero, any other
+// count below four becomes four, and everything else is returned unchanged.
+int TranslateNumBones(int bones) {
+	if (bones == 0)
+		return 0;
+	// Rounding counts below eight up to eight (if (bones < 8) return 8;) is
+	// left disabled: I get drawing problems in FF:CC with this!
+	return bones < 4 ? 4 : bones;
+}
+
+// Packs the GE state that vertex shader generation depends on into id->d[0]
+// and id->d[1].
+// prim so we can special case for RECTANGLES :( - it is not read here.
+void ComputeVertexShaderID(VertexShaderID *id, int prim, bool useHWTransform) {
+	const u32 vertType = gstate.vertType;
+
+	int texturing = gstate.isTextureMapEnabled() && !gstate.isModeClear();
+	bool projectTexture = gstate.getUVGenMode() == 1;
+	bool vertexHasColor = (vertType & GE_VTYPE_COL_MASK) != 0;
+	bool vertexHasNormal = (vertType & GE_VTYPE_NRM_MASK) != 0;
+	bool vertexHasBones = (vertType & GE_VTYPE_WEIGHT_MASK) != 0;
+	bool fog = gstate.isFogEnabled() && !gstate.isModeThrough() && !gstate.isModeClear();
+	bool secondaryColor = gstate.isUsingSecondaryColor() && gstate.isLightingEnabled();
+
+	memset(id->d, 0, sizeof(id->d));
+
+	// d[0] bits 0-4: secondary color, through mode, fog, texturing, vertex color.
+	id->d[0] = (secondaryColor & 1)
+		| (((int)gstate.isModeThrough()) << 1)
+		| (((int)fog) << 2)
+		| (texturing << 3)
+		| ((vertexHasColor & 1) << 4);
+	// d[0] bits 5-6 only matter while texturing.
+	if (texturing) {
+		id->d[0] |= ((gstate_c.flipTexture & 1) << 5) | ((projectTexture & 1) << 6);
+	}
+
+	// Everything below describes hardware transform state only.
+	if (!useHWTransform)
+		return;
+
+	id->d[0] |= 1 << 8;
+	id->d[0] |= (vertexHasNormal & 1) << 9;
+
+	// UV generation mode
+	id->d[0] |= gstate.getUVGenMode() << 16;
+
+	// The next bits are used differently depending on UVgen mode
+	if (gstate.getUVGenMode() == 1) {
+		id->d[0] |= gstate.getUVProjMode() << 18;
+	} else if (gstate.getUVGenMode() == 2) {
+		id->d[0] |= gstate.getUVLS0() << 18;
+		id->d[0] |= gstate.getUVLS1() << 20;
+	}
+
+	// Bones
+	if (vertexHasBones)
+		id->d[0] |= (TranslateNumBones(gstate.getNumBoneWeights()) - 1) << 22;
+
+	// Okay, d[1] coming up. ==============
+
+	if (gstate.isLightingEnabled() || gstate.getUVGenMode() == 2) {
+		// Light bits: four bits per light from bit 0, channel enables from bit 20.
+		for (int light = 0; light < 4; light++) {
+			id->d[1] |= gstate.getLightComputation(light) << (light * 4);
+			id->d[1] |= gstate.getLightType(light) << (light * 4 + 2);
+			id->d[1] |= (gstate.isLightChanEnabled(light) & 1) << (20 + light);
+		}
+		id->d[1] |= (gstate.materialupdate & 7) << 16;
+	}
+	id->d[1] |= gstate.isLightingEnabled() << 24;
+	id->d[1] |= ((vertType & GE_VTYPE_WEIGHT_MASK) >> GE_VTYPE_WEIGHT_SHIFT) << 25;
+}
+
+static const char * const boneWeightAttrDecl[8] = {  // weight attribute declarations; entry i declares i + 1 weight components
+	"attribute mediump float a_w1;\n",
+	"attribute mediump vec2 a_w1;\n",
+	"attribute mediump vec3 a_w1;\n",
+	"attribute mediump vec4 a_w1;\n",
+	"attribute mediump vec4 a_w1;\nattribute mediump float a_w2;\n",  // from five weights on, a_w1 is full and a_w2 holds the rest
+	"attribute mediump vec4 a_w1;\nattribute mediump vec2 a_w2;\n",
+	"attribute mediump vec4 a_w1;\nattribute mediump vec3 a_w2;\n",
+	"attribute mediump vec4 a_w1;\nattribute mediump vec4 a_w2;\n",
+};
+
+enum DoLightComputation {  // per-light state chosen in GenerateVertexShader
+	LIGHT_OFF,  // initial value for all four lights
+	LIGHT_SHADE,  // light is one of the two returned by getUVLS0()/getUVLS1() while the UV gen mode is 2
+	LIGHT_FULL,  // lighting and this light's channel are enabled; overrides LIGHT_SHADE
+};
+
+#if 0 // used for debugging
+// Debug-only variant (compiled out by the surrounding #if 0): ignores the GE
+// state and always copies one fixed pass-through HLSL vertex shader into
+// `buffer`. `prim` and `useHWTransform` are unused here.
+// The caller must supply a buffer large enough for the whole string; strcpy
+// performs no bounds checking.
+void GenerateVertexShader(int prim, char *buffer, bool useHWTransform) {
+	const char * vscode =
+    " float4x4 u_proj : register(c0);              "
+    "                                              "
+    " struct VS_IN                                 "
+    "                                              "
+    " {                                            "
+    "		float4 ObjPos   : POSITION;            "                 
+	"		float3 Uv   : TEXCOORD0;               "
+	"		float4 C1    : COLOR0;                 "  // Vertex color
+	"		float4 C2    : COLOR1;                 "  // Second vertex color
+    " };                                           "
+    "                                              "
+    " struct VS_OUT                                "
+    " {                                            "
+    "		float4 ObjPos   : POSITION;            "                 
+	"		float4 Uv   : TEXCOORD0;               "
+	"		float4 C1    : COLOR0;                 "  // Vertex color
+	"		float4 C2    : COLOR1;                 "  // Second vertex color
+    " };                                           "
+    "                                              "
+    " VS_OUT main( VS_IN In )                      "
+    " {                                            "
+    "		VS_OUT Out;                              "
+	"       Out.ObjPos = mul( float4(In.ObjPos.xyz, 1), u_proj );  "  // Transform the position by u_proj (w forced to 1)
+	"		Out.Uv = float4(In.Uv.xy, 0, In.Uv.z);			"
+	"		Out.C1 = In.C1;			"
+	"		Out.C2 = In.C2;			"
+    "		return Out;                              "
+    " }                                            ";
+
+	strcpy(buffer, vscode);
+}
+#else
+
+// Writes the HLSL source of a vertex shader for the current GE state into
+// `buffer` (the caller owns the buffer and must make it large enough; nothing
+// here checks its size).
+//
+// The emitted shader transforms the position by u_proj or u_proj_through,
+// repacks the texture coordinate as (u, v, 0, In.Uv.z), forwards or synthesizes
+// the two colors, and optionally outputs a fog depth.
+//
+// prim            - unused by this implementation.
+// useHWTransform  - influences whether vertex colors are assumed present and
+//                   whether u_matambientalpha is declared.
+void GenerateVertexShader(int prim, char *buffer, bool useHWTransform) {
+	char *p = buffer;
+	const u32 vertType = gstate.vertType;
+
+	// NOTE(review): lmode, doTexture, hasNormal, throughmode, flipV,
+	// doTextureProjection and doLight are computed but not consumed by the
+	// shader text emitted below.
+	int lmode = gstate.isUsingSecondaryColor() && gstate.isLightingEnabled();
+	int doTexture = gstate.isTextureMapEnabled() && !gstate.isModeClear();
+
+	bool hasColor = (vertType & GE_VTYPE_COL_MASK) != 0 || !useHWTransform;
+	bool hasNormal = (vertType & GE_VTYPE_NRM_MASK) != 0 && useHWTransform;
+	bool enableFog = gstate.isFogEnabled() && !gstate.isModeThrough() && !gstate.isModeClear();
+	bool throughmode = (vertType & GE_VTYPE_THROUGH_MASK) != 0;
+	bool flipV = gstate_c.flipTexture;
+	bool doTextureProjection = gstate.getUVGenMode() == 1;
+
+	// Classify each light: LIGHT_FULL wins over LIGHT_SHADE when both apply.
+	DoLightComputation doLight[4] = {LIGHT_OFF, LIGHT_OFF, LIGHT_OFF, LIGHT_OFF};
+	if (useHWTransform) {
+		int shadeLight0 = gstate.getUVGenMode() == 2 ? gstate.getUVLS0() : -1;
+		int shadeLight1 = gstate.getUVGenMode() == 2 ? gstate.getUVLS1() : -1;
+		for (int light = 0; light < 4; ++light) {
+			const bool usedForShade = (light == shadeLight0) || (light == shadeLight1);
+			if (usedForShade) {
+				doLight[light] = LIGHT_SHADE;
+			}
+			if (gstate.isLightingEnabled() && gstate.isLightChanEnabled(light)) {
+				doLight[light] = LIGHT_FULL;
+			}
+		}
+	}
+
+	// Uniforms.
+	if (!gstate.isModeThrough()) {
+		WRITE(p, "float4x4 u_proj;\n");
+	} else {
+		WRITE(p, "float4x4 u_proj_through;\n");
+	}
+	if (useHWTransform || !hasColor) {
+		WRITE(p, "float4 u_matambientalpha;\n");  // matambient + matalpha
+	}
+
+	// Input and output structures.
+	WRITE(p, " struct VS_IN                                ");
+	WRITE(p, "                                             ");
+	WRITE(p,  " {                                          ");
+	WRITE(p, "		float4 ObjPos   : POSITION;            ");
+	WRITE(p, "		float3 Uv   : TEXCOORD0;               ");
+	WRITE(p, "		float4 C1    : COLOR0;                 ");
+	WRITE(p, "		float4 C2    : COLOR1;                 ");
+	WRITE(p, " };                                          ");
+	WRITE(p, "                                             ");
+	WRITE(p, " struct VS_OUT                               ");
+	WRITE(p, " {                                           ");
+	WRITE(p, "		float4 ObjPos   : POSITION;            ");
+	WRITE(p, "		float4 Uv   : TEXCOORD0;               ");
+	WRITE(p, "		float4 C1    : COLOR0;                 ");
+	WRITE(p, "		float4 C2    : COLOR1;                 ");
+	if (enableFog) {
+		WRITE(p, "float v_fogdepth:FOG;\n");
+	}
+	WRITE(p, " };                                          ");
+	WRITE(p, "                                             ");
+
+	// Entry point: simple pass-through of vertex data to the fragment shader.
+	WRITE(p, " VS_OUT main( VS_IN In )                     ");
+	WRITE(p, " {                                           ");
+	WRITE(p, "		VS_OUT Out;							   ");
+	if (!gstate.isModeThrough()) {
+		WRITE(p, "Out.ObjPos = mul( float4(In.ObjPos.xyz, 1), u_proj );");
+	} else {
+		WRITE(p, "Out.ObjPos = mul( float4(In.ObjPos.xyz, 1), u_proj_through );");
+	}
+	WRITE(p, "Out.Uv = float4(In.Uv.xy, 0, In.Uv.z);");
+	if (!hasColor) {
+		// No per-vertex color: fall back to the material ambient/alpha uniform.
+		WRITE(p, "  Out.C1 = u_matambientalpha;\n");
+		WRITE(p, "  Out.C2 = float4(0,0,0,0);\n");
+	} else {
+		WRITE(p, "Out.C1 = In.C1;");
+		WRITE(p, "Out.C2 = In.C2;");
+	}
+	if (enableFog) {
+		WRITE(p, "  Out.v_fogdepth = In.ObjPos.w;\n");
+	}
+	WRITE(p, "	return Out;             ");
+	WRITE(p, "}\n");
+}
+
+
+
+#endif
--- a/GPU/Directx9/VertexShaderGenerator.h
+++ b/GPU/Directx9/VertexShaderGenerator.h
@ -0,0 +1,57 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include "Globals.h"
+
+// #define USE_BONE_ARRAY
+
+// Key identifying one generated vertex shader variant. It is a small array of
+// 32-bit words that is ordered and compared word by word, so it can be used as
+// an ordered-container key.
+struct VertexShaderID
+{
+	// A new ID starts in the cleared state.
+	VertexShaderID() {clear();}
+
+	// Resets every word to 0xFFFFFFFF. All words are written (not just d[0])
+	// because operator< and operator== read the whole array; leaving d[1]
+	// untouched made comparisons depend on indeterminate or stale data.
+	void clear()
+	{
+		for (size_t i = 0; i < sizeof(d) / sizeof(u32); i++)
+			d[i] = 0xFFFFFFFF;
+	}
+
+	u32 d[2];
+
+	// Lexicographic ordering over the words of d.
+	bool operator < (const VertexShaderID &other) const
+	{
+		for (size_t i = 0; i < sizeof(d) / sizeof(u32); i++)
+		{
+			if (d[i] < other.d[i])
+				return true;
+			if (d[i] > other.d[i])
+				return false;
+		}
+		return false;
+	}
+
+	// True when every word of d matches.
+	bool operator == (const VertexShaderID &other) const
+	{
+		for (size_t i = 0; i < sizeof(d) / sizeof(u32); i++)
+		{
+			if (d[i] != other.d[i])
+				return false;
+		}
+		return true;
+	}
+};
+
+bool CanUseHardwareTransform(int prim);
+
+void ComputeVertexShaderID(VertexShaderID *id, int prim, bool useHWTransform);
+void GenerateVertexShader(int prim, char *buffer, bool useHWTransform);
+
+// Collapse to less skinning shaders to reduce shader switching, which is expensive.
+int TranslateNumBones(int bones);
--- a/GPU/Directx9/helper/dx_state.cpp
+++ b/GPU/Directx9/helper/dx_state.cpp
@ -0,0 +1,79 @@
+#include "dx_state.h"
+#include <assert.h>
+
+
+// Global render state cache and the (OpenGL-derived) extension flags.
+DirectxState dxstate;
+GLExtensions gl_extensions;
+
+// Number of state objects that registered themselves from their constructors;
+// DirectxState::Restore() asserts that it restored exactly this many.
+int DirectxState::state_count = 0;
+
+// Pushes the cached state to the device the first time it is called; every
+// later call is a no-op.
+void DirectxState::Initialize() {
+	if (!initialized) {
+		Restore();
+		initialized = true;
+	}
+}
+
+// Re-sends every cached state to the device. A running tally of restored
+// states is compared against DirectxState::state_count so that a state object
+// added to the class but forgotten here trips the assert.
+void DirectxState::Restore() {
+	int restored = 0;
+
+	// Blending.
+	blend.restore();         ++restored;
+	blendEquation.restore(); ++restored;
+	blendFunc.restore();     ++restored;
+	blendColor.restore();    ++restored;
+
+	// Scissor.
+	scissorTest.restore();   ++restored;
+	scissorRect.restore();   ++restored;
+
+	// Culling (cullFace / cullFaceMode from the GL version are folded into cullMode).
+	cullMode.restore();      ++restored;
+
+	// Depth (there is no depthRange state in this backend).
+	depthTest.restore();     ++restored;
+	depthFunc.restore();     ++restored;
+	depthWrite.restore();    ++restored;
+
+	// Color mask and viewport.
+	colorMask.restore();     ++restored;
+	viewport.restore();      ++restored;
+
+	// Stencil.
+	stencilTest.restore();   ++restored;
+	stencilOp.restore();     ++restored;
+	stencilFunc.restore();   ++restored;
+
+	dither.restore();        ++restored;
+
+	assert(restored == state_count && "DirectxState::Restore is missing some states");
+}
+
+// Clears every flag in gl_extensions the first time it is called; later calls
+// do nothing. The OpenGL extension-string probing is kept below, commented
+// out, so all flags stay false.
+void CheckGLExtensions() {
+	static bool alreadyChecked = false;
+	if (!alreadyChecked) {
+		alreadyChecked = true;
+		memset(&gl_extensions, 0, sizeof(gl_extensions));
+	}
+
+/*
+	gl_extensions.OES_packed_depth_stencil = strstr(extString, "GL_OES_packed_depth_stencil") != 0;
+	gl_extensions.OES_depth24 = strstr(extString, "GL_OES_depth24") != 0;
+	gl_extensions.OES_depth_texture = strstr(extString, "GL_OES_depth_texture") != 0;
+	gl_extensions.EXT_discard_framebuffer = strstr(extString, "GL_EXT_discard_framebuffer") != 0;
+#ifdef USING_GLES2
+	gl_extensions.FBO_ARB = true;
+	gl_extensions.FBO_EXT = false;
+#else
+	gl_extensions.FBO_ARB = strstr(extString, "GL_ARB_framebuffer_object") != 0;
+	gl_extensions.FBO_EXT = strstr(extString, "GL_EXT_framebuffer_object") != 0;
+#endif
+*/
+}
+
+// Currently a no-op: `interval` is ignored because the only implementation
+// (the WGL swap-interval call from the OpenGL backend) is commented out.
+void DirectxState::SetVSyncInterval(int interval) {
+	/*
+#ifdef _WIN32
+	if( wglSwapIntervalEXT )
+		wglSwapIntervalEXT(interval);
+#endif
+	*/
+}
--- a/GPU/Directx9/helper/dx_state.h
+++ b/GPU/Directx9/helper/dx_state.h
@ -0,0 +1,332 @@
+#pragma once
+
+#include <functional>
+#include <string.h>
+#include "global.h"
+
+// Direct3D 9 render state cache. Each member remembers the last value set for
+// one piece of device state so that DirectxState::Restore() can push all of
+// them to the device again. Code should go through this class instead of
+// calling SetRenderState directly.
+//
+// Every state object increments DirectxState::state_count from its
+// constructor; Restore() asserts that it restored exactly that many states.
+class DirectxState
+{
+private:
+	// One boolean render state `cap`; the cached value starts as `init`.
+	template<D3DRENDERSTATETYPE cap, bool init>
+	class BoolState {
+		bool _value;
+	public:
+		BoolState() : _value(init) {
+			DirectxState::state_count++;
+		}
+
+		inline void set(bool value) {
+			_value = value;
+			pD3Ddevice->SetRenderState(cap, value);
+		}
+		inline void enable() {
+			set(true);
+		}
+		inline void disable() {
+			set(false);
+		}
+		operator bool() const {
+			return isset();
+		}
+		// Must be const: operator bool() const above calls it.
+		inline bool isset() const {
+			return _value;
+		}
+		void restore() {
+			pD3Ddevice->SetRenderState(cap, _value);
+		}
+	};
+
+	// One DWORD render state with default value `p1def`.
+	template<D3DRENDERSTATETYPE state1, DWORD p1def>
+	class DxState1 {
+		D3DRENDERSTATETYPE _state1;
+		DWORD p1;
+	public:
+		DxState1() : _state1(state1), p1(p1def) {
+			DirectxState::state_count++;
+		}
+
+		inline void set(DWORD newp1) {
+			p1 = newp1;
+			pD3Ddevice->SetRenderState(_state1, p1);
+		}
+		void restore() {
+			pD3Ddevice->SetRenderState(_state1, p1);
+		}
+	};
+
+	// Two DWORD render states that are always set together.
+	template<D3DRENDERSTATETYPE state1, DWORD p1def, D3DRENDERSTATETYPE state2, DWORD p2def>
+	class DxState2 {
+		D3DRENDERSTATETYPE _state1;
+		D3DRENDERSTATETYPE _state2;
+		DWORD p1;
+		DWORD p2;
+	public:
+		DxState2() : _state1(state1),_state2(state2), p1(p1def), p2(p2def) {
+			DirectxState::state_count++;
+		}
+
+		inline void set(DWORD newp1, DWORD newp2) {
+			p1 = newp1;
+			p2 = newp2;
+			pD3Ddevice->SetRenderState(_state1, p1);
+			pD3Ddevice->SetRenderState(_state2, p2);
+		}
+		void restore() {
+			pD3Ddevice->SetRenderState(_state1, p1);
+			pD3Ddevice->SetRenderState(_state2, p2);
+		}
+	};
+
+	// Three DWORD render states that are always set together.
+	template<D3DRENDERSTATETYPE state1, DWORD p1def, D3DRENDERSTATETYPE state2, DWORD p2def, D3DRENDERSTATETYPE state3, DWORD p3def>
+	class DxState3 {
+		D3DRENDERSTATETYPE _state1;
+		D3DRENDERSTATETYPE _state2;
+		D3DRENDERSTATETYPE _state3;
+		DWORD p1;
+		DWORD p2;
+		DWORD p3;
+	public:
+		DxState3() : _state1(state1),_state2(state2), _state3(state3),
+			p1(p1def), p2(p2def), p3(p3def) {
+			DirectxState::state_count++;
+		}
+
+		inline void set(DWORD newp1, DWORD newp2, DWORD newp3) {
+			p1 = newp1;
+			p2 = newp2;
+			p3 = newp3;
+			pD3Ddevice->SetRenderState(_state1, p1);
+			pD3Ddevice->SetRenderState(_state2, p2);
+			// The third state takes the third value (this used to send p2).
+			pD3Ddevice->SetRenderState(_state3, p3);
+		}
+		void restore() {
+			pD3Ddevice->SetRenderState(_state1, p1);
+			pD3Ddevice->SetRenderState(_state2, p2);
+			pD3Ddevice->SetRenderState(_state3, p3);
+		}
+	};
+	
+	// Function-call style state helpers; no member below instantiates them.
+	#define STATE4(func, p1type, p2type, p3type, p4type, p1def, p2def, p3def, p4def) \
+	class SavedState4_##func { \
+		p1type p1; \
+		p2type p2; \
+		p3type p3; \
+		p4type p4; \
+	public: \
+		SavedState4_##func() : p1(p1def), p2(p2def), p3(p3def), p4(p4def) { \
+			DirectxState::state_count++; \
+        }; \
+		inline void set(p1type newp1, p2type newp2, p3type newp3, p4type newp4) { \
+			p1 = newp1; \
+			p2 = newp2; \
+			p3 = newp3; \
+			p4 = newp4; \
+			func(p1, p2, p3, p4); \
+		} \
+		inline void restore() { \
+			func(p1, p2, p3, p4); \
+		} \
+	}
+
+#define STATEFLOAT4(func, def) \
+	class SavedState4_##func { \
+		float p[4]; \
+	public: \
+		SavedState4_##func() { \
+			for (int i = 0; i < 4; i++) {p[i] = def;} \
+			DirectxState::state_count++; \
+		}; \
+		inline void set(const float v[4]) { \
+			if(memcmp(p,v,sizeof(float)*4)) { \
+				memcpy(p,v,sizeof(float)*4); \
+				func(p[0], p[1], p[2], p[3]); \
+			} \
+		} \
+		inline void restore() { \
+			func(p[0], p[1], p[2], p[3]); \
+		} \
+	}
+
+	// Blend factor, stored as a packed D3DCOLOR built from four floats.
+	class SavedBlendFactor {
+		DWORD c;
+	public:
+		SavedBlendFactor() {
+			c = 0xFFFFFFFF;
+			DirectxState::state_count++;
+		}
+		inline void set(const float v[4]) {
+			c = D3DCOLOR_COLORVALUE(v[0], v[1], v[2], v[3]);			
+			pD3Ddevice->SetRenderState(D3DRS_BLENDFACTOR, c);
+		}
+		inline void restore() {
+			pD3Ddevice->SetRenderState(D3DRS_BLENDFACTOR, c);
+		}
+	};
+
+	// Color write mask, built from one bool per channel; defaults to all channels.
+	class SavedColorMask {
+		DWORD mask;
+	public:
+		SavedColorMask() {
+			mask = D3DCOLORWRITEENABLE_ALL;
+			DirectxState::state_count++;
+		}
+
+		inline void set(bool r, bool g, bool b, bool a) {
+			mask = 0;
+			if (r) {
+				mask |=D3DCOLORWRITEENABLE_RED;
+			}
+			if (g) {
+				mask |=D3DCOLORWRITEENABLE_GREEN;
+			}
+			if (b) {
+				mask |=D3DCOLORWRITEENABLE_BLUE;
+			}
+			if (a) {
+				mask |=D3DCOLORWRITEENABLE_ALPHA;
+			}
+			pD3Ddevice->SetRenderState(D3DRS_COLORWRITEENABLE, mask);
+			
+		}
+		inline void restore() {
+			pD3Ddevice->SetRenderState(D3DRS_COLORWRITEENABLE, mask);
+		}
+	};
+
+
+	// Placeholder for a boolean state with no device call behind it;
+	// set() and restore() intentionally do nothing.
+	class BoolUnused {
+	public:
+		BoolUnused() {
+			DirectxState::state_count++;
+		}
+		inline void set(bool) {
+			
+		}
+		inline void restore() {
+			
+		}
+
+		inline void enable() {
+			set(true);
+		}
+		inline void disable() {
+			set(false);
+		}
+	};
+
+	// Viewport rectangle plus depth range.
+	class StateVp {
+	D3DVIEWPORT9 viewport;
+	public:
+		// Registers the state (Restore() counts it) and gives the cached
+		// viewport a defined, all-zero value.
+		// NOTE(review): restore() before the first set() therefore sends an
+		// all-zero viewport to the device - confirm that is acceptable.
+		StateVp() {
+			memset(&viewport, 0, sizeof(viewport));
+			DirectxState::state_count++;
+		}
+
+		inline void set(int x, int y, int w, int h,  float n = 0.f, float f = 1.f) {
+			viewport.X=x;
+			viewport.Y=y;
+			viewport.Width=w;
+			viewport.Height=h;	
+			/*
+			if (f > n) {
+				viewport.MinZ=n;
+				viewport.MaxZ=f;
+			} else {
+				viewport.MinZ=f;
+				viewport.MaxZ=n;
+			}
+			*/
+			viewport.MinZ=n;
+			viewport.MaxZ=f;
+
+			pD3Ddevice->SetViewport(&viewport);
+		}
+
+		inline void restore() {
+			pD3Ddevice->SetViewport(&viewport);
+		}
+	};
+
+	// Scissor rectangle. The device call is commented out, so set() and
+	// restore() currently have no effect on the device.
+	class StateScissor {
+		
+	public:
+		// Registers the state so that Restore()'s tally matches state_count.
+		StateScissor() {
+			DirectxState::state_count++;
+		}
+
+		inline void set(int x1, int y1, int x2, int y2)  {
+			RECT rect = {x1, y1, x2, y2};
+			//pD3Ddevice->SetScissorRect(&rect);
+		}
+
+		inline void restore() {
+		}
+	};
+
+	// Cull mode, derived from a "want culling" flag and a face selector.
+	class CullMode {
+		DWORD cull;
+	public:
+		// Registers the state and starts with culling disabled, so restore()
+		// never sends an indeterminate value.
+		CullMode() : cull(D3DCULL_NONE) {
+			DirectxState::state_count++;
+		}
+
+		inline void set(int wantcull, int cullmode) {
+			if (!wantcull) {
+				// disable
+				cull = D3DCULL_NONE;
+			} else {
+				// add front face ...
+				cull = cullmode==0?D3DCULL_CW:D3DCULL_CCW;
+			}
+			
+			pD3Ddevice->SetRenderState(D3DRS_CULLMODE, cull);
+		}
+
+		inline void restore() {
+			pD3Ddevice->SetRenderState(D3DRS_CULLMODE, cull);
+		}
+	};
+
+	bool initialized;
+
+public:
+	static int state_count;
+	DirectxState() : initialized(false) {}
+	void Initialize();
+	void Restore();
+
+	// When adding a state here, don't forget to add it to DirectxState::Restore() too
+	BoolState<D3DRS_ALPHABLENDENABLE, false> blend;
+	DxState2<D3DRS_SRCBLEND, D3DBLEND_SRCALPHA, D3DRS_DESTBLEND, D3DBLEND_INVSRCALPHA> blendFunc;
+	DxState1<D3DRS_BLENDOP, D3DBLENDOP_ADD> blendEquation;
+	SavedBlendFactor blendColor;
+
+	BoolState<D3DRS_SCISSORTESTENABLE, false> scissorTest;
+
+	BoolUnused dither;
+
+	CullMode cullMode;
+
+	BoolState<D3DRS_ZENABLE, false> depthTest;
+
+	DxState1<D3DRS_ZFUNC, D3DCMP_LESSEQUAL> depthFunc;
+	DxState1<D3DRS_ZWRITEENABLE, TRUE> depthWrite;
+
+	SavedColorMask colorMask;
+
+	StateVp viewport;
+	StateScissor scissorRect;
+
+	BoolState<D3DRS_STENCILENABLE, false> stencilTest;
+	DxState3<D3DRS_STENCILFAIL, D3DSTENCILOP_KEEP, D3DRS_STENCILZFAIL, D3DSTENCILOP_KEEP, D3DRS_STENCILPASS, D3DSTENCILOP_KEEP> stencilOp;
+	DxState3<D3DRS_STENCILFUNC, D3DCMP_ALWAYS, D3DRS_STENCILREF, 0, D3DRS_STENCILMASK, 0xFFFFFFFF> stencilFunc;
+
+	// Only works on Win32, all other platforms are "force-vsync"
+	void SetVSyncInterval(int interval);  // one of the above VSYNC, or a higher number for multi-frame waits (could be useful for 30hz games)
+};
+
+#undef STATE1
+#undef STATE2
+
+extern DirectxState dxstate;
+
+// Extension flags carried over from the OpenGL backend. CheckGLExtensions()
+// zeroes the whole struct and its detection code is commented out, so every
+// flag is false after that call.
+struct GLExtensions {
+	bool OES_depth24;
+	bool OES_packed_depth_stencil;
+	bool OES_depth_texture;
+	bool EXT_discard_framebuffer;
+	bool FBO_ARB;
+	bool FBO_EXT;
+};
+
+extern GLExtensions gl_extensions;
+
+void CheckGLExtensions();
--- a/GPU/Directx9/helper/fbo.cpp
+++ b/GPU/Directx9/helper/fbo.cpp
@ -0,0 +1,107 @@
+#include "global.h"
+#include <stdint.h>
+#include <string.h>
+#include "fbo.h"
+
+static LPDIRECT3DSURFACE9 deviceRTsurf;
+static LPDIRECT3DSURFACE9 deviceDSsurf;
+
+
+// Render-to-texture target. Only `tex`, `width`, `height`, `colorDepth`,
+// `z_buffer` and `stencil_buffer` are written by fbo_create(); the code that
+// would create `surf` and `depthstencil` is commented out there.
+struct FBO {
+	LPDIRECT3DSURFACE9 surf;          // Not created at present (see fbo_create).
+	LPDIRECT3DSURFACE9 depthstencil;  // Not created at present (see fbo_create).
+	LPDIRECT3DTEXTURE9 tex;           // Texture returned by fbo_get_rtt() and used as the resolve destination.
+	uint32_t color_texture;
+	uint32_t z_stencil_buffer;  // Either this is set, or the two below.
+	uint32_t z_buffer;          // fbo_create() stores 24 here.
+	uint32_t stencil_buffer;    // fbo_create() stores 8 here.
+
+	int width;
+	int height;
+	FBOColorDepth colorDepth;   // Recorded only; the texture is always created as D3DFMT_A8R8G8B8.
+};
+
+// Captures the device's current render target 0 and depth-stencil surface so
+// they can serve as the default targets.
+// GetRenderTarget/GetDepthStencilSurface add a reference to the returned
+// surface, so references from an earlier call are released first instead of
+// being overwritten and leaked.
+void fbo_init() {
+	if (deviceRTsurf != NULL) {
+		deviceRTsurf->Release();
+		deviceRTsurf = NULL;
+	}
+	if (deviceDSsurf != NULL) {
+		deviceDSsurf->Release();
+		deviceDSsurf = NULL;
+	}
+
+	pD3Ddevice->GetRenderTarget(0, &deviceRTsurf);
+	pD3Ddevice->GetDepthStencilSurface(&deviceDSsurf);
+}
+
+FBO * current_fbo = NULL;
+
+
+// Allocates an FBO record and its backing texture.
+//
+// width, height       - texture size in pixels.
+// num_color_textures  - currently ignored.
+// z_stencil           - currently ignored; no depth/stencil surface is created.
+// colorDepth          - recorded in the FBO, but the texture is always
+//                       created as D3DFMT_A8R8G8B8.
+//
+// Always returns a non-NULL FBO. If texture creation fails the failure is
+// logged and fbo->tex is NULL, so callers can detect it via fbo_get_rtt().
+FBO *fbo_create(int width, int height, int num_color_textures, bool z_stencil, FBOColorDepth colorDepth) {
+	FBO *fbo = new FBO();  // value-initialized: all members start at zero/NULL
+	fbo->width = width;
+	fbo->height = height;
+	fbo->colorDepth = colorDepth;
+
+	// only support 32bit surfaces
+	//pD3Ddevice->CreateRenderTarget(fbo->width, fbo->height, D3DFMT_A8R8G8B8, D3DMULTISAMPLE_NONE, 0, FALSE, &fbo->surf, NULL);
+	
+	/*
+	// Create depth + stencil target | forced to 24-bit Z, 8-bit stencil
+	pD3Ddevice->CreateDepthStencilSurface(fbo->width, fbo->height, D3DFMT_D24S8, D3DMULTISAMPLE_NONE, 0, FALSE, &fbo->depthstencil, NULL);
+	*/
+	// Only needed on xbox :s
+	HRESULT hr = pD3Ddevice->CreateTexture(fbo->width, fbo->height, 1, 0, D3DFMT_A8R8G8B8, 0, &fbo->tex, NULL);
+	if (FAILED(hr)) {
+		// Do not leave a stale pointer behind and make the failure visible.
+		fbo->tex = NULL;
+		OutputDebugStringA("fbo_create: CreateTexture failed\r\n");
+	}
+
+	fbo->stencil_buffer = 8;
+	fbo->z_buffer = 24;
+	return fbo;
+}
+
+// Returns the FBO's backing texture as an untyped pointer.
+void * fbo_get_rtt(FBO *fbo) {
+	LPDIRECT3DTEXTURE9 texture = fbo->tex;
+	return texture;
+}
+
+// Forgets the currently bound FBO. The resolve of the outgoing FBO and the
+// switch back to the default surfaces are both commented out, so this only
+// clears the bookkeeping pointer.
+void fbo_unbind() {
+	// Disabled: resolve the outgoing target into its texture.
+	//if (current_fbo != NULL)
+	//	pD3Ddevice->Resolve( D3DRESOLVE_RENDERTARGET0, NULL, current_fbo->tex, NULL, 0, 0, NULL, 0.0f, 0, NULL );
+	current_fbo = NULL;
+
+	// Disabled: rebind the default render target and depth-stencil surface.
+	//pD3Ddevice->SetRenderTarget(0, deviceRTsurf);
+	//pD3Ddevice->SetDepthStencilSurface(deviceDSsurf);
+}
+
+// Resolves render target 0 into the FBO's texture, requesting that the render
+// target and depth-stencil be cleared as part of the resolve (per the flags).
+void fbo_resolve(FBO *fbo) {
+	pD3Ddevice->Resolve(
+		D3DRESOLVE_RENDERTARGET0 | D3DRESOLVE_ALLFRAGMENTS | D3DRESOLVE_CLEARRENDERTARGET | D3DRESOLVE_CLEARDEPTHSTENCIL,
+		NULL,      // source rectangle
+		fbo->tex,  // destination texture
+		NULL, 0, 0, NULL, 0.0f, 0, NULL );
+}
+
+// Records `fbo` as the current FBO. The device calls that would switch the
+// render target and depth-stencil surface are commented out, so no device
+// state changes here.
+void fbo_bind_as_render_target(FBO *fbo) {
+	current_fbo = fbo;
+
+	//pD3Ddevice->SetRenderTarget(0, fbo->surf);
+	//pD3Ddevice->SetDepthStencilSurface(fbo->depthstencil);
+}
+
+// Not implemented: only emits a debug-output reminder; `fbo` is ignored.
+void fbo_bind_for_read(FBO *fbo) {
+	OutputDebugStringA("fbo_bind_for_read: Fix me\r\n");
+}
+
+// NOTE(review): despite the name, this clears texture stage 0 (binds NULL)
+// rather than binding fbo->tex; the real bind is commented out. Both `fbo`
+// and `color` are ignored - confirm this is the intended stop-gap.
+void fbo_bind_color_as_texture(FBO *fbo, int color) {
+	//OutputDebugStringA("fbo_bind_color_as_texture: Fix me\r\n");
+	//pD3Ddevice->SetTexture(0, fbo->tex);
+	pD3Ddevice->SetTexture(0, NULL);
+}
+
+// Releases the FBO's texture and frees the FBO record.
+// Accepts NULL (no-op) and an FBO whose texture was never created. If the FBO
+// is the currently bound one, the binding is dropped so current_fbo never
+// points at freed memory.
+void fbo_destroy(FBO *fbo) {
+	if (fbo == NULL)
+		return;
+
+	if (current_fbo == fbo)
+		current_fbo = NULL;
+
+	/*
+	fbo->depthstencil->Release();
+	*/
+	//fbo->surf->Release();
+	if (fbo->tex != NULL)
+		fbo->tex->Release();
+	delete fbo;
+}
+
+// Writes the FBO's width to *w and its height to *h.
+void fbo_get_dimensions(FBO *fbo, int *w, int *h) {
+	const int fboWidth = fbo->width;
+	const int fboHeight = fbo->height;
+	*w = fboWidth;
+	*h = fboHeight;
+}
+
+// Presents the back buffer, then clears color, depth and stencil for the next
+// frame (color 0xFFFFFFFF, depth 0, stencil 0).
+void SwapBuffer() {
+	pD3Ddevice->Present(0, 0, 0, 0);
+
+	// :s
+	pD3Ddevice->Clear(
+		0, NULL,
+		D3DCLEAR_STENCIL|D3DCLEAR_TARGET |D3DCLEAR_ZBUFFER,
+		0xFFFFFFFF,  // color
+		0,           // depth
+		0);          // stencil
+}
--- a/GPU/Directx9/helper/fbo.h
+++ b/GPU/Directx9/helper/fbo.h
@ -0,0 +1,39 @@
+#pragma once
+
+// Simple wrapper around FBO functionality.
+// Very C-ish API because that's what I felt like, and it's cool to completely
+// hide the data from callers...
+
+struct FBO;
+
+// Color format a caller can request from fbo_create().
+// NOTE(review): the Direct3D fbo_create() in fbo.cpp records this value but
+// always creates a D3DFMT_A8R8G8B8 texture.
+enum FBOColorDepth {
+	FBO_8888,
+	FBO_565,
+	FBO_4444,
+	FBO_5551,
+};
+
+
+// Creates a simple FBO with a RGBA32 color buffer stored in a texture, and
+// optionally an accompanying Z/stencil buffer.
+// No mipmap support.
+// num_color_textures must be 1 for now.
+// you lose bound texture state.
+
+// On some hardware, you might get a 24-bit depth buffer even though you only wanted a 16-bit one.
+FBO *fbo_create(int width, int height, int num_color_textures, bool z_stencil, FBOColorDepth colorDepth = FBO_8888);
+
+// These functions should be self explanatory.
+void fbo_bind_as_render_target(FBO *fbo);
+// color must be 0, for now.
+void fbo_bind_color_as_texture(FBO *fbo, int color);
+void fbo_bind_for_read(FBO *fbo);
+void fbo_unbind();
+void fbo_destroy(FBO *fbo);
+void fbo_get_dimensions(FBO *fbo, int *w, int *h);
+void fbo_resolve(FBO *fbo);
+
+void * fbo_get_rtt(FBO *fbo);
+
+// To get default depth and rt surface
+void fbo_init();
--- a/GPU/Directx9/helper/global.cpp
+++ b/GPU/Directx9/helper/global.cpp
@ -0,0 +1,229 @@
+#include "global.h"
+#include "fbo.h"
+
+LPDIRECT3DDEVICE9 pD3Ddevice = NULL;
+LPDIRECT3D9 pD3D = NULL;
+
+
+// HLSL vertex shader compiled by CompileShaders() into pFramebufferVertexShader:
+// multiplies the position by matWVP (constant register c0) and passes the
+// texture coordinate through unchanged.
+static const char * vscode =
+    " float4x4 matWVP : register(c0);              "
+    "                                              "
+    " struct VS_IN                                 "
+    "                                              "
+    " {                                            "
+    "		float4 ObjPos   : POSITION;              "                 
+	"		float2 Uv    : TEXCOORD0;                 "  // Texture coordinate
+    " };                                           "
+    "                                              "
+    " struct VS_OUT                                "
+    " {                                            "
+    "		float4 ProjPos  : POSITION;              " 
+	"		float2 Uv    : TEXCOORD0;                 "  // Texture coordinate
+    " };                                           "
+    "                                              "
+    " VS_OUT main( VS_IN In )                      "
+    " {                                            "
+    "		VS_OUT Out;                              "
+	"     Out.ProjPos = mul( matWVP, In.ObjPos );  "  // Transform the position by matWVP
+	"		Out.Uv = In.Uv;			"
+    "		return Out;                              "
+    " }                                            ";
+
+//--------------------------------------------------------------------------------------
+// Pixel shader
+//--------------------------------------------------------------------------------------
+// HLSL pixel shader compiled by CompileShaders() into pFramebufferPixelShader:
+// samples sampler s0 at the interpolated texture coordinate and forces the
+// output alpha to 1.
+static const char * pscode =
+	" sampler s: register(s0);					   "
+    " struct PS_IN                                 "
+    " {                                            "
+    "     float2 Uv : TEXCOORD0;                   "                     
+    " };                                           " 
+    "                                              "
+    " float4 main( PS_IN In ) : COLOR              "
+    " {                                            "
+    "   float4 c =  tex2D(s, In.Uv)  ;           "
+	"	c.a = 1.0f;"
+	"   return c;								   "
+    " }                                            ";
+
+IDirect3DVertexDeclaration9* pFramebufferVertexDecl = NULL;
+
+// Layout behind pFramebufferVertexDecl (single stream 0):
+// float3 position at byte 0, float2 texture coordinate at byte 12.
+static const D3DVERTEXELEMENT9  VertexElements[] =
+{
+    { 0,  0, D3DDECLTYPE_FLOAT3, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_POSITION, 0 },
+	{ 0, 12, D3DDECLTYPE_FLOAT2, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_TEXCOORD, 0 },
+    D3DDECL_END()
+};
+
+IDirect3DVertexDeclaration9* pSoftVertexDecl = NULL;
+
+// Layout behind pSoftVertexDecl (single stream 0):
+// float4 position at byte 0, float3 texture coordinate at byte 16,
+// packed D3DCOLOR color 0 at byte 28 and color 1 at byte 32.
+static const D3DVERTEXELEMENT9  SoftTransVertexElements[] =
+{
+    { 0,  0, D3DDECLTYPE_FLOAT4, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_POSITION, 0 },
+	{ 0, 16, D3DDECLTYPE_FLOAT3, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_TEXCOORD, 0 },
+    { 0, 28, D3DDECLTYPE_D3DCOLOR, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_COLOR,  0 },
+	{ 0, 32, D3DDECLTYPE_D3DCOLOR, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_COLOR,  1 },
+    D3DDECL_END()
+};
+
+LPDIRECT3DVERTEXSHADER9      pFramebufferVertexShader = NULL; // Vertex Shader
+LPDIRECT3DPIXELSHADER9       pFramebufferPixelShader = NULL;  // Pixel Shader
+
+// Compiles HLSL `code` (entry point "main", profile ps_3_0) and creates a
+// pixel shader from the resulting bytecode.
+//
+// code          - NUL-terminated HLSL source.
+// pShader       - receives the created shader on success.
+// pShaderTable  - forwarded to D3DXCompileShader to receive the constant
+//                 table; it is no longer dereferenced here.
+//
+// Returns true only if both compilation and shader creation succeeded. On a
+// compile failure the compiler message (when one exists) goes to the debug
+// output and DebugBreak() is raised, as before.
+bool CompilePixelShader(const char * code, LPDIRECT3DPIXELSHADER9 * pShader, LPD3DXCONSTANTTABLE * pShaderTable) {
+	ID3DXBuffer* pShaderCode = NULL;
+	ID3DXBuffer* pErrorMsg = NULL;
+
+	// Compile pixel shader.
+	HRESULT hr = D3DXCompileShader( code, 
+		(UINT)strlen( code ),
+		NULL, 
+		NULL, 
+		"main", 
+		"ps_3_0", 
+		0, 
+		&pShaderCode, 
+		&pErrorMsg,
+		pShaderTable );
+
+	// The message buffer may be absent on failure and may be present (for
+	// warnings) on success; it must be released either way.
+	if( pErrorMsg != NULL )
+	{
+		if( FAILED(hr) )
+			OutputDebugStringA((CHAR*)pErrorMsg->GetBufferPointer());
+		pErrorMsg->Release();
+	}
+
+	if( FAILED(hr) || pShaderCode == NULL )
+	{
+		if( pShaderCode != NULL )
+			pShaderCode->Release();
+		DebugBreak();
+		return false;
+	}
+
+	// Create pixel shader.
+	hr = pD3Ddevice->CreatePixelShader( (DWORD*)pShaderCode->GetBufferPointer(), 
+		pShader );
+
+	pShaderCode->Release();
+
+	return SUCCEEDED(hr);
+}
+
+// Compiles HLSL `code` (entry point "main", profile vs_3_0) and creates a
+// vertex shader from the resulting bytecode.
+//
+// code          - NUL-terminated HLSL source.
+// pShader       - receives the created shader on success.
+// pShaderTable  - forwarded to D3DXCompileShader to receive the constant
+//                 table; it is no longer dereferenced here.
+//
+// Returns true only if both compilation and shader creation succeeded. On a
+// compile failure the compiler message (when one exists) goes to the debug
+// output and DebugBreak() is raised, as before.
+bool CompileVertexShader(const char * code, LPDIRECT3DVERTEXSHADER9 * pShader, LPD3DXCONSTANTTABLE * pShaderTable) {
+	ID3DXBuffer* pShaderCode = NULL;
+	ID3DXBuffer* pErrorMsg = NULL;
+
+	// Compile vertex shader.
+	HRESULT hr = D3DXCompileShader( code, 
+		(UINT)strlen( code ),
+		NULL, 
+		NULL, 
+		"main", 
+		"vs_3_0", 
+		0, 
+		&pShaderCode, 
+		&pErrorMsg,
+		pShaderTable );
+
+	// The message buffer may be absent on failure and may be present (for
+	// warnings) on success; it must be released either way.
+	if( pErrorMsg != NULL )
+	{
+		if( FAILED(hr) )
+			OutputDebugStringA((CHAR*)pErrorMsg->GetBufferPointer());
+		pErrorMsg->Release();
+	}
+
+	if( FAILED(hr) || pShaderCode == NULL )
+	{
+		if( pShaderCode != NULL )
+			pShaderCode->Release();
+		DebugBreak();
+		return false;
+	}
+
+	// Create vertex shader.
+	hr = pD3Ddevice->CreateVertexShader( (DWORD*)pShaderCode->GetBufferPointer(), 
+		pShader );
+
+	pShaderCode->Release();
+
+	return SUCCEEDED(hr);
+}
+
+// Compiles one built-in HLSL source (entry point "main") for `profile`.
+// Returns the bytecode buffer, which the caller must Release(), or NULL after
+// logging the compiler message and raising DebugBreak() on failure.
+static ID3DXBuffer *CompileBuiltinShaderCode(const char *code, const char *profile) {
+	ID3DXBuffer* pShaderCode = NULL;
+	ID3DXBuffer* pErrorMsg = NULL;
+
+	HRESULT hr = D3DXCompileShader( code, 
+		(UINT)strlen( code ),
+		NULL, 
+		NULL, 
+		"main", 
+		profile, 
+		0, 
+		&pShaderCode, 
+		&pErrorMsg,
+		NULL );
+
+	// The message buffer may be absent on failure and present on success.
+	if( pErrorMsg != NULL )
+	{
+		if( FAILED(hr) )
+			OutputDebugStringA((CHAR*)pErrorMsg->GetBufferPointer());
+		pErrorMsg->Release();
+	}
+
+	if( FAILED(hr) || pShaderCode == NULL )
+	{
+		if( pShaderCode != NULL )
+			pShaderCode->Release();
+		DebugBreak();
+		return NULL;
+	}
+	return pShaderCode;
+}
+
+// Builds the built-in framebuffer shaders (vs_2_0 / ps_2_0) and the two
+// vertex declarations, and makes the framebuffer declaration current.
+// A shader that fails to compile is skipped (its global stays NULL) instead
+// of dereferencing a missing bytecode buffer.
+void CompileShaders() {
+	// Vertex shader.
+	ID3DXBuffer* pShaderCode = CompileBuiltinShaderCode( vscode, "vs_2_0" );
+	if( pShaderCode != NULL )
+	{
+		pD3Ddevice->CreateVertexShader( (DWORD*)pShaderCode->GetBufferPointer(), 
+			&pFramebufferVertexShader );
+		pShaderCode->Release();
+	}
+
+	// Pixel shader.
+	pShaderCode = CompileBuiltinShaderCode( pscode, "ps_2_0" );
+	if( pShaderCode != NULL )
+	{
+		pD3Ddevice->CreatePixelShader( (DWORD*)pShaderCode->GetBufferPointer(), 
+			&pFramebufferPixelShader );
+		pShaderCode->Release();
+	}
+
+	pD3Ddevice->CreateVertexDeclaration( VertexElements, &pFramebufferVertexDecl );
+	pD3Ddevice->SetVertexDeclaration( pFramebufferVertexDecl );
+
+	pD3Ddevice->CreateVertexDeclaration( SoftTransVertexElements, &pSoftVertexDecl );
+}
+
+// Creates the Direct3D object and device, then builds the built-in shaders
+// and captures the default render surfaces.
+// If the Direct3D object or the device cannot be created, the failure is
+// logged, DebugBreak() is raised and the function returns without touching
+// the (NULL) device.
+void DirectxInit() {
+
+	pD3D = Direct3DCreate9( D3D_SDK_VERSION );
+	if( pD3D == NULL )
+	{
+		OutputDebugStringA("DirectxInit: Direct3DCreate9 failed\r\n");
+		DebugBreak();
+		return;
+	}
+
+    // Set up the structure used to create the D3DDevice. Parameters not
+    // listed below stay zeroed. We ask for a fixed 1280x720 back buffer in
+    // A8R8G8B8, an automatic D24S8 depth-stencil buffer, the "discard" swap
+    // effect and immediate presentation.
+    D3DPRESENT_PARAMETERS d3dpp;
+    ZeroMemory( &d3dpp, sizeof( d3dpp ) );
+    d3dpp.BackBufferWidth = 1280;
+    d3dpp.BackBufferHeight = 720;
+    d3dpp.BackBufferFormat =  ( D3DFORMAT )( D3DFMT_A8R8G8B8 );
+    d3dpp.FrontBufferFormat = ( D3DFORMAT )( D3DFMT_LE_A8R8G8B8 );
+    d3dpp.MultiSampleType = D3DMULTISAMPLE_NONE;
+    d3dpp.MultiSampleQuality = 0;
+    d3dpp.BackBufferCount = 1;
+    d3dpp.EnableAutoDepthStencil = TRUE;
+    d3dpp.AutoDepthStencilFormat = D3DFMT_D24S8;
+    d3dpp.SwapEffect = D3DSWAPEFFECT_DISCARD;
+    d3dpp.PresentationInterval = D3DPRESENT_INTERVAL_IMMEDIATE;
+	
+	HRESULT hr = pD3D->CreateDevice( D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, NULL,
+                                      D3DCREATE_HARDWARE_VERTEXPROCESSING,
+                                      &d3dpp, &pD3Ddevice);
+	if( FAILED(hr) || pD3Ddevice == NULL )
+	{
+		// CompileShaders() and fbo_init() both dereference the device.
+		pD3Ddevice = NULL;
+		OutputDebugStringA("DirectxInit: CreateDevice failed\r\n");
+		DebugBreak();
+		return;
+	}
+
+	CompileShaders();
+
+	fbo_init();
+}
--- a/GPU/Directx9/helper/global.h
+++ b/GPU/Directx9/helper/global.h
@ -0,0 +1,21 @@
+#pragma once
+#include <xtl.h>
+#include <d3d9.h>
+#include <d3dx9.h>
+
+// Used on xbox to create a linear format
+#define D3DFMT(x)	(D3DFORMAT)MAKELINFMT(x)
+
+extern LPDIRECT3DDEVICE9 pD3Ddevice;
+
+extern LPDIRECT3DVERTEXSHADER9      pFramebufferVertexShader; // Vertex Shader
+extern LPDIRECT3DPIXELSHADER9       pFramebufferPixelShader;  // Pixel Shader
+
+extern IDirect3DVertexDeclaration9* pFramebufferVertexDecl;
+extern IDirect3DVertexDeclaration9* pSoftVertexDecl;
+
+bool CompilePixelShader(const char * code, LPDIRECT3DPIXELSHADER9 * pShader, LPD3DXCONSTANTTABLE * pShaderTable);
+bool CompileVertexShader(const char * code, LPDIRECT3DVERTEXSHADER9 * pShader, LPD3DXCONSTANTTABLE * pShaderTable);
+
+
+#define D3DBLEND_UNK	D3DSTENCILOP_FORCE_DWORD
--- a/GPU/GPUXbox.vcxproj
+++ b/GPU/GPUXbox.vcxproj
@ -0,0 +1,464 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug_Optimised|Win32">
+      <Configuration>Debug_Optimised</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug_Optimised|x64">
+      <Configuration>Debug_Optimised</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug_Optimised|Xbox 360">
+      <Configuration>Debug_Optimised</Configuration>
+      <Platform>Xbox 360</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|Xbox 360">
+      <Configuration>Debug</Configuration>
+      <Platform>Xbox 360</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Xbox 360">
+      <Configuration>Release</Configuration>
+      <Platform>Xbox 360</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{DCC4F772-A6E5-4F54-9ACA-BD090CC971C5}</ProjectGuid>
+    <RootNamespace>GPU</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">
+    <OutputFile>$(OutDir)GPU$(TargetExt)</OutputFile>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>../common;..;../native;../native/ext/glew;</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <MinimalRebuild>false</MinimalRebuild>
+      <RuntimeTypeInfo>false</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>../common;..;../native;../native/ext/glew;</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <MinimalRebuild>false</MinimalRebuild>
+      <RuntimeTypeInfo>false</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>../common;..;../native;../native/ext/glew;</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>USE_DIRECTX;BIG_ENDIAN;PPC;_XBOX;_MBCS;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <MinimalRebuild>false</MinimalRebuild>
+      <RuntimeTypeInfo>false</RuntimeTypeInfo>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <ForcedIncludeFiles>core/x360_compat.h</ForcedIncludeFiles>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Full</Optimization>
+      <AdditionalIncludeDirectories>../common;..;../native;../native/ext/glew;</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>USE_DIRECTX;BIG_ENDIAN;PPC;_XBOX;_MBCS;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <MinimalRebuild>false</MinimalRebuild>
+      <RuntimeTypeInfo>false</RuntimeTypeInfo>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <ForcedIncludeFiles>core/x360_compat.h</ForcedIncludeFiles>
+      <BasicRuntimeChecks>Default</BasicRuntimeChecks>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>../common;..;../native;../native/ext/glew;</AdditionalIncludeDirectories>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <OmitFramePointers>false</OmitFramePointers>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <MinimalRebuild>false</MinimalRebuild>
+      <RuntimeTypeInfo>false</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>../common;..;../native;../native/ext/glew;</AdditionalIncludeDirectories>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <OmitFramePointers>false</OmitFramePointers>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <MinimalRebuild>false</MinimalRebuild>
+      <RuntimeTypeInfo>false</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <AdditionalIncludeDirectories>../common;..;../native;../native/ext/glew;</AdditionalIncludeDirectories>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <RuntimeTypeInfo>false</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">
+    <!-- Compiler and linker settings for the Release build on Xbox 360: full optimisation, the MultiThreaded runtime library, and core/x360_compat.h force-included into every source file. -->
+    <!-- NOTE(review): this group defines WIN32 and NO_JIT, while the Debug and Debug_Optimised Xbox 360 groups define _MBCS instead; confirm the difference is intended. -->
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Full</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <AdditionalIncludeDirectories>../common;..;../native;../native/ext/glew;</AdditionalIncludeDirectories>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <PreprocessorDefinitions>USE_DIRECTX;WIN32;_XBOX;PPC;BIG_ENDIAN;NO_JIT;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <RuntimeTypeInfo>false</RuntimeTypeInfo>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <ForcedIncludeFiles>core/x360_compat.h</ForcedIncludeFiles>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <AdditionalIncludeDirectories>../common;..;../native;../native/ext/glew;</AdditionalIncludeDirectories>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <OmitFramePointers>false</OmitFramePointers>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <RuntimeTypeInfo>false</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClInclude Include="..\ext\xbrz\xbrz.h" />
+    <ClInclude Include="Directx9\DisplayListInterpreter.h" />
+    <ClInclude Include="Directx9\FragmentShaderGenerator.h" />
+    <ClInclude Include="Directx9\Framebuffer.h" />
+    <ClInclude Include="Directx9\helper\dx_state.h" />
+    <ClInclude Include="Directx9\helper\fbo.h" />
+    <ClInclude Include="Directx9\helper\global.h" />
+    <ClInclude Include="Directx9\IndexGenerator.h" />
+    <ClInclude Include="Directx9\ShaderManager.h" />
+    <ClInclude Include="Directx9\StateMapping.h" />
+    <ClInclude Include="Directx9\TextureCache.h" />
+    <ClInclude Include="Directx9\TextureScaler.h" />
+    <ClInclude Include="Directx9\TransformPipeline.h" />
+    <ClInclude Include="Directx9\VertexDecoder.h" />
+    <ClInclude Include="Directx9\VertexShaderGenerator.h" />
+    <ClInclude Include="ge_constants.h" />
+    <ClInclude Include="GLES\DisplayListInterpreter.h">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClInclude>
+    <ClInclude Include="GLES\FragmentShaderGenerator.h">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClInclude>
+    <ClInclude Include="GLES\Framebuffer.h">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClInclude>
+    <ClInclude Include="GLES\IndexGenerator.h">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClInclude>
+    <ClInclude Include="GLES\ShaderManager.h">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClInclude>
+    <ClInclude Include="GLES\StateMapping.h">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClInclude>
+    <ClInclude Include="GLES\TextureCache.h">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClInclude>
+    <ClInclude Include="GLES\TextureScaler.h">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClInclude>
+    <ClInclude Include="GLES\TransformPipeline.h">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClInclude>
+    <ClInclude Include="GLES\VertexDecoder.h">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClInclude>
+    <ClInclude Include="GLES\VertexShaderGenerator.h">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClInclude>
+    <ClInclude Include="GeDisasm.h" />
+    <ClInclude Include="GPUCommon.h" />
+    <ClInclude Include="GPUInterface.h" />
+    <ClInclude Include="GPUState.h" />
+    <ClInclude Include="Math3D.h" />
+    <ClInclude Include="Null\NullGpu.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\ext\xbrz\xbrz.cpp" />
+    <ClCompile Include="Directx9\DisplayListInterpreter.cpp" />
+    <ClCompile Include="Directx9\FragmentShaderGenerator.cpp" />
+    <ClCompile Include="Directx9\Framebuffer.cpp" />
+    <ClCompile Include="Directx9\helper\dx_state.cpp" />
+    <ClCompile Include="Directx9\helper\fbo.cpp" />
+    <ClCompile Include="Directx9\helper\global.cpp" />
+    <ClCompile Include="Directx9\IndexGenerator.cpp" />
+    <ClCompile Include="Directx9\ShaderManager.cpp" />
+    <ClCompile Include="Directx9\StateMapping.cpp" />
+    <ClCompile Include="Directx9\TextureCache.cpp" />
+    <ClCompile Include="Directx9\TextureScaler.cpp" />
+    <ClCompile Include="Directx9\TransformPipeline.cpp" />
+    <ClCompile Include="Directx9\VertexDecoder.cpp" />
+    <ClCompile Include="Directx9\VertexShaderGenerator.cpp" />
+    <ClCompile Include="GLES\DisplayListInterpreter.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GLES\FragmentShaderGenerator.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GLES\Framebuffer.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GLES\IndexGenerator.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GLES\ShaderManager.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GLES\StateMapping.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GLES\TextureCache.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GLES\TextureScaler.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GLES\TransformPipeline.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GLES\VertexDecoder.cpp">
+      <!-- An assembly listing is requested only for Release|Win32. The file is excluded from every Xbox 360 configuration below, so it carries no Xbox-specific compiler metadata. -->
+      <AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AssemblyAndSourceCode</AssemblerOutput>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GLES\VertexShaderGenerator.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_Optimised|Xbox 360'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Xbox 360'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GeDisasm.cpp" />
+    <ClCompile Include="GPUCommon.cpp" />
+    <ClCompile Include="GPUState.cpp" />
+    <ClCompile Include="Math3D.cpp" />
+    <ClCompile Include="Null\NullGpu.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="..\Common\Common.vcxproj">
+      <Project>{3fcdbae2-5103-4350-9a8e-848ce9c73195}</Project>
+    </ProjectReference>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="CMakeLists.txt" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/GPU/GPUXbox.vcxproj.filters
+++ b/GPU/GPUXbox.vcxproj.filters
@ -0,0 +1,214 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="GLES">
+      <UniqueIdentifier>{f7563dba-8146-4c21-a092-e864ff145d79}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Software">
+      <UniqueIdentifier>{4f6d1284-2c23-4ebc-842c-666a1305bfed}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Common">
+      <UniqueIdentifier>{21783292-4dd7-447b-af93-356cd2eaa4d6}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Null">
+      <UniqueIdentifier>{b31aa5a1-da08-47e6-9467-ab1d547b6ff3}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Directx9">
+      <UniqueIdentifier>{eb2a1d3d-24c7-4df8-b3cb-79a4b9734d70}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Directx9\helper">
+      <UniqueIdentifier>{862f23b4-2c1b-4d16-9450-caecbb77f276}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="ge_constants.h">
+      <Filter>Common</Filter>
+    </ClInclude>
+    <ClInclude Include="Math3D.h">
+      <Filter>Common</Filter>
+    </ClInclude>
+    <ClInclude Include="GLES\DisplayListInterpreter.h">
+      <Filter>GLES</Filter>
+    </ClInclude>
+    <ClInclude Include="GLES\FragmentShaderGenerator.h">
+      <Filter>GLES</Filter>
+    </ClInclude>
+    <ClInclude Include="GLES\Framebuffer.h">
+      <Filter>GLES</Filter>
+    </ClInclude>
+    <ClInclude Include="GLES\ShaderManager.h">
+      <Filter>GLES</Filter>
+    </ClInclude>
+    <ClInclude Include="GLES\TextureCache.h">
+      <Filter>GLES</Filter>
+    </ClInclude>
+    <ClInclude Include="GLES\TransformPipeline.h">
+      <Filter>GLES</Filter>
+    </ClInclude>
+    <ClInclude Include="GLES\VertexDecoder.h">
+      <Filter>GLES</Filter>
+    </ClInclude>
+    <ClInclude Include="GLES\VertexShaderGenerator.h">
+      <Filter>GLES</Filter>
+    </ClInclude>
+    <ClInclude Include="GPUState.h">
+      <Filter>Common</Filter>
+    </ClInclude>
+    <ClInclude Include="GPUInterface.h">
+      <Filter>Common</Filter>
+    </ClInclude>
+    <ClInclude Include="Null\NullGpu.h">
+      <Filter>Null</Filter>
+    </ClInclude>
+    <ClInclude Include="GLES\StateMapping.h">
+      <Filter>GLES</Filter>
+    </ClInclude>
+    <ClInclude Include="GLES\IndexGenerator.h">
+      <Filter>GLES</Filter>
+    </ClInclude>
+    <ClInclude Include="GeDisasm.h" />
+    <ClInclude Include="GPUCommon.h">
+      <Filter>Common</Filter>
+    </ClInclude>
+    <ClInclude Include="..\ext\xbrz\xbrz.h" />
+    <ClInclude Include="GLES\TextureScaler.h">
+      <Filter>GLES</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\DisplayListInterpreter.h">
+      <Filter>Directx9</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\FragmentShaderGenerator.h">
+      <Filter>Directx9</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\Framebuffer.h">
+      <Filter>Directx9</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\IndexGenerator.h">
+      <Filter>Directx9</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\ShaderManager.h">
+      <Filter>Directx9</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\StateMapping.h">
+      <Filter>Directx9</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\TextureCache.h">
+      <Filter>Directx9</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\TextureScaler.h">
+      <Filter>Directx9</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\TransformPipeline.h">
+      <Filter>Directx9</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\VertexDecoder.h">
+      <Filter>Directx9</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\VertexShaderGenerator.h">
+      <Filter>Directx9</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\helper\fbo.h">
+      <Filter>Directx9\helper</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\helper\global.h">
+      <Filter>Directx9\helper</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\helper\dx_state.h">
+      <Filter>Directx9\helper</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="Math3D.cpp">
+      <Filter>Common</Filter>
+    </ClCompile>
+    <ClCompile Include="GLES\DisplayListInterpreter.cpp">
+      <Filter>GLES</Filter>
+    </ClCompile>
+    <ClCompile Include="GLES\FragmentShaderGenerator.cpp">
+      <Filter>GLES</Filter>
+    </ClCompile>
+    <ClCompile Include="GLES\Framebuffer.cpp">
+      <Filter>GLES</Filter>
+    </ClCompile>
+    <ClCompile Include="GLES\ShaderManager.cpp">
+      <Filter>GLES</Filter>
+    </ClCompile>
+    <ClCompile Include="GLES\TextureCache.cpp">
+      <Filter>GLES</Filter>
+    </ClCompile>
+    <ClCompile Include="GLES\TransformPipeline.cpp">
+      <Filter>GLES</Filter>
+    </ClCompile>
+    <ClCompile Include="GLES\VertexDecoder.cpp">
+      <Filter>GLES</Filter>
+    </ClCompile>
+    <ClCompile Include="GLES\VertexShaderGenerator.cpp">
+      <Filter>GLES</Filter>
+    </ClCompile>
+    <ClCompile Include="GPUState.cpp">
+      <Filter>Common</Filter>
+    </ClCompile>
+    <ClCompile Include="Null\NullGpu.cpp">
+      <Filter>Null</Filter>
+    </ClCompile>
+    <ClCompile Include="GLES\StateMapping.cpp">
+      <Filter>GLES</Filter>
+    </ClCompile>
+    <ClCompile Include="GLES\IndexGenerator.cpp">
+      <Filter>GLES</Filter>
+    </ClCompile>
+    <ClCompile Include="GeDisasm.cpp" />
+    <ClCompile Include="GPUCommon.cpp">
+      <Filter>Common</Filter>
+    </ClCompile>
+    <ClCompile Include="..\ext\xbrz\xbrz.cpp" />
+    <ClCompile Include="GLES\TextureScaler.cpp">
+      <Filter>GLES</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\DisplayListInterpreter.cpp">
+      <Filter>Directx9</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\FragmentShaderGenerator.cpp">
+      <Filter>Directx9</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\Framebuffer.cpp">
+      <Filter>Directx9</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\IndexGenerator.cpp">
+      <Filter>Directx9</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\ShaderManager.cpp">
+      <Filter>Directx9</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\StateMapping.cpp">
+      <Filter>Directx9</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\TextureCache.cpp">
+      <Filter>Directx9</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\TextureScaler.cpp">
+      <Filter>Directx9</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\TransformPipeline.cpp">
+      <Filter>Directx9</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\VertexDecoder.cpp">
+      <Filter>Directx9</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\VertexShaderGenerator.cpp">
+      <Filter>Directx9</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\helper\fbo.cpp">
+      <Filter>Directx9\helper</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\helper\dx_state.cpp">
+      <Filter>Directx9\helper</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\helper\global.cpp">
+      <Filter>Directx9\helper</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="CMakeLists.txt" />
+  </ItemGroup>
+</Project>