diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp
index 06d5d14126..30207f1c90 100644
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@@ -15,17 +15,42 @@
 // Official git repository and contact information can be found at
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
+#include <algorithm>
+#include "Common/MemoryUtil.h"
 #include "Core/Config.h"
+#include "Core/Reporting.h"
 #include "GPU/Common/FramebufferCommon.h"
 #include "GPU/Common/TextureCacheCommon.h"
+#include "GPU/Common/TextureDecoder.h"
 #include "GPU/Common/ShaderId.h"
 #include "GPU/Common/GPUStateUtils.h"
 #include "GPU/GPUState.h"
+#include "GPU/GPUInterface.h"
 
 // Ugly.
 extern int g_iNumVideos;
 
-TextureCacheCommon::~TextureCacheCommon() {}
+TextureCacheCommon::TextureCacheCommon()
+	: nextTexture_(nullptr), clutLastFormat_(0xFFFFFFFF),
+	clutTotalBytes_(0), clutMaxBytes_(0), clutRenderAddress_(0xFFFFFFFF) {
+	// TODO: Clamp down to 256/1KB?  Need to check mipmapShareClut and clamp loadclut.
+	const size_t clutBufBytes = 1024 * sizeof(u32);  // 4KB per CLUT buffer
+	clutBufRaw_ = (u32 *)AllocateAlignedMemory(clutBufBytes, 16);
+	clutBufConverted_ = (u32 *)AllocateAlignedMemory(clutBufBytes, 16);
+	// Zero both so behavior is consistent if the game fails to load some of the CLUT.
+	memset(clutBufRaw_, 0, clutBufBytes);
+	memset(clutBufConverted_, 0, clutBufBytes);
+	// 5MB of temporary storage in total (2MB + 1MB + 2MB). Might be possible to shrink it.
+	const size_t tmpBufElems = 1024 * 512;
+	tmpTexBuf32.resize(tmpBufElems);
+	tmpTexBuf16.resize(tmpBufElems);
+	tmpTexBufRearrange.resize(tmpBufElems);
+}
+
+TextureCacheCommon::~TextureCacheCommon() {
+	FreeAlignedMemory(clutBufRaw_);        // original CLUT bytes
+	FreeAlignedMemory(clutBufConverted_);  // color-swapped copy of the CLUT
+}
 
 bool TextureCacheCommon::SetOffsetTexture(u32 offset) {
 	return false;
@@ -88,3 +113,182 @@ void TextureCacheCommon::GetSamplingParams(int &minFilt, int &magFilt, bool &sCl
 		minFilt &= 1;
 	}
 }
+
+// Updates fbCache_ and offers `framebuffer` to Attach/DetachFramebuffer for cached textures in its key range.
+void TextureCacheCommon::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg) {
+	// Must be in VRAM so | 0x04000000 it is.  Also, ignore memory mirrors.  Mainly to reduce scanning all textures.
+	const u32 addr = (address | 0x04000000) & 0x3F9FFFFF;
+	const u32 bpp = framebuffer->format == GE_FORMAT_8888 ? 4 : 2;  // bytes per pixel
+	const u64 cacheKey = (u64)addr << 32;  // the address occupies the high 32 bits of a cache key
+	// If it has a clut, those are the low 32 bits, so it'll be inside this range.
+	// Also, if it's a subsample of the buffer, it'll also be within the FBO.
+	const u64 cacheKeyEnd = cacheKey + ((u64)(framebuffer->fb_stride * framebuffer->height * bpp) << 32);
+
+	// The first mirror starts at 0x04200000 and there are 3.  We search all for framebuffers.
+	const u64 mirrorCacheKey = (u64)0x04200000 << 32;
+	const u64 mirrorCacheKeyEnd = (u64)0x04800000 << 32;
+
+	switch (msg) {
+	case NOTIFY_FB_CREATED:
+	case NOTIFY_FB_UPDATED:  // created and updated are handled identically
+		// Ensure it's in the framebuffer cache.
+		if (std::find(fbCache_.begin(), fbCache_.end(), framebuffer) == fbCache_.end()) {
+			fbCache_.push_back(framebuffer);
+		}
+		for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) {
+			AttachFramebuffer(&it->second, addr, framebuffer);  // the subclass decides whether it really matches
+		}
+		// Let's assume anything in mirrors is fair game to check.
+		for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) {
+			const u64 mirrorlessKey = it->first & ~0x0060000000000000ULL;  // drop address bits 0x00600000 (mirror select)
+			// Let's still make sure it's in the cache range.
+			if (mirrorlessKey >= cacheKey && mirrorlessKey <= cacheKeyEnd) {
+				AttachFramebuffer(&it->second, addr, framebuffer);
+			}
+		}
+		break;
+
+	case NOTIFY_FB_DESTROYED:
+		fbCache_.erase(std::remove(fbCache_.begin(), fbCache_.end(),  framebuffer), fbCache_.end());  // erase-remove: forget it
+		for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) {
+			DetachFramebuffer(&it->second, addr, framebuffer);
+		}
+		for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) {
+			const u64 mirrorlessKey = it->first & ~0x0060000000000000ULL;  // same mirror-bit masking as the attach path
+			// Let's still make sure it's in the cache range.
+			if (mirrorlessKey >= cacheKey && mirrorlessKey <= cacheKeyEnd) {
+				DetachFramebuffer(&it->second, addr, framebuffer);
+			}
+		}
+		break;
+	}
+}
+
+// Loads loadBytes bytes of CLUT data from clutAddr into clutBufRaw_, zero-filling whatever cannot be
+// read, and records in clutRenderAddress_ the framebuffer (if any in fbCache_) that the CLUT lives in.
+void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
+	clutTotalBytes_ = loadBytes;
+	clutRenderAddress_ = 0xFFFFFFFF;  // 0xFFFFFFFF means "CLUT is not in a tracked framebuffer"
+
+	if (Memory::IsValidAddress(clutAddr)) {
+		if (Memory::IsVRAMAddress(clutAddr)) {
+			// Clear the uncached bit, etc. to match framebuffers.
+			const u32 clutFramebufAddr = clutAddr & 0x3FFFFFFF;
+			for (size_t i = 0, n = fbCache_.size(); i < n; ++i) {
+				auto framebuffer = fbCache_[i];
+				if ((framebuffer->fb_address | 0x04000000) == clutFramebufAddr) {
+					framebuffer->last_frame_clut = gpuStats.numFlips;
+					framebuffer->usageFlags |= FB_USAGE_CLUT;
+					clutRenderAddress_ = framebuffer->fb_address;
+				}
+			}
+		}
+
+		// It's possible for a game to (successfully) access outside valid memory.
+		u32 bytes = Memory::ValidSize(clutAddr, loadBytes);
+		if (clutRenderAddress_ != 0xFFFFFFFF && !g_Config.bDisableSlowFramebufEffects) {
+			gpu->PerformMemoryDownload(clutAddr, bytes);
+		}
+
+#ifdef _M_SSE
+		// Each iteration copies 32 bytes (two 16-byte vectors), so count 32-byte blocks; odd sizes use memcpy.
+		int numBlocks = bytes / 32;
+		if (bytes == loadBytes && (bytes & 31) == 0) {
+			const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr);
+			__m128i *dest = (__m128i *)clutBufRaw_;
+			for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) {
+				_mm_store_si128(dest, _mm_loadu_si128(source));          // unaligned load, aligned store
+				_mm_store_si128(dest + 1, _mm_loadu_si128(source + 1));
+			}
+		} else {
+			Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
+			if (bytes < loadBytes) {
+				memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
+			}
+		}
+#else
+		Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
+		if (bytes < clutTotalBytes_) {
+			memset((u8 *)clutBufRaw_ + bytes, 0x00, clutTotalBytes_ - bytes);
+		}
+#endif
+	} else {
+		memset(clutBufRaw_, 0x00, loadBytes);  // invalid address: the whole load reads as zeroes
+	}
+	// Reload the clut next time.
+	clutLastFormat_ = 0xFFFFFFFF;
+	clutMaxBytes_ = std::max(clutMaxBytes_, loadBytes);  // largest load size seen so far
+}
+
+// Unswizzles the texture data at texptr into tmpTexBuf32 and returns that buffer's data pointer.
+void *TextureCacheCommon::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel) {
+	const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2);  // bytes per row; 0 bpp presumably means 4-bit texels (confirm)
+	const u32 pitch = rowWidth / 4;  // row width in u32 units
+	const int bxc = rowWidth / 16;  // 16-byte blocks per row
+	int byc = (height + 7) / 8;  // groups of 8 rows, rounded up
+	if (byc == 0)
+		byc = 1;
+	u32 ydest = 0;  // write index into tmpTexBuf32, used only by the narrow-row paths
+	if (rowWidth >= 16) {
+		u32 *ydestp = tmpTexBuf32.data();
+		// The most common one, so it gets an optimized implementation.
+		DoUnswizzleTex16(texptr, ydestp, bxc, byc, pitch, rowWidth);
+	} else if (rowWidth == 8) {
+		const u32 *src = (const u32 *) texptr;
+		for (int by = 0; by < byc; by++) {
+			for (int n = 0; n < 8; n++, ydest += 2) {  // 8 rows; each source row sits in a 16-byte slot
+				tmpTexBuf32[ydest + 0] = *src++;
+				tmpTexBuf32[ydest + 1] = *src++;
+				src += 2; // skip two u32
+			}
+		}
+	} else if (rowWidth == 4) {
+		const u32 *src = (const u32 *) texptr;
+		for (int by = 0; by < byc; by++) {
+			for (int n = 0; n < 8; n++, ydest++) {
+				tmpTexBuf32[ydest] = *src++;
+				src += 3;  // skip the other 12 bytes of the 16-byte slot
+			}
+		}
+	} else if (rowWidth == 2) {
+		const u16 *src = (const u16 *) texptr;
+		for (int by = 0; by < byc; by++) {
+			for (int n = 0; n < 4; n++, ydest++) {  // two 2-byte rows are packed into each output u32
+				u16 n1 = src[0];
+				u16 n2 = src[8];  // 16 bytes after n1
+				tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 16);
+				src += 16;  // advance 32 bytes
+			}
+		}
+	} else if (rowWidth == 1) {
+		const u8 *src = (const u8 *) texptr;
+		for (int by = 0; by < byc; by++) {
+			for (int n = 0; n < 2; n++, ydest++) {  // four 1-byte rows are packed into each output u32
+				u8 n1 = src[ 0];
+				u8 n2 = src[16];
+				u8 n3 = src[32];
+				u8 n4 = src[48];
+				tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 8) | ((u32)n3 << 16) | ((u32)n4 << 24);
+				src += 64;
+			}
+		}
+	}  // any other rowWidth below 16 writes nothing and leaves tmpTexBuf32 as it was
+	return tmpTexBuf32.data();
+}
+
+// Repacks h rows from a stride of inRowBytes to a stride of outRowBytes, copying outRowBytes per row.
+// The copy happens inside inBuf unless rows grow or allowInPlace is false; in those cases the rows are
+// written to tmpTexBufRearrange instead.  Returns whichever buffer holds the result.
+void *TextureCacheCommon::RearrangeBuf(void *inBuf, u32 inRowBytes, u32 outRowBytes, int h, bool allowInPlace) {
+	const bool inPlace = allowInPlace && outRowBytes <= inRowBytes;
+	void *const result = inPlace ? inBuf : (void *)tmpTexBufRearrange.data();
+
+	const u8 *src = (const u8 *)inBuf;
+	u8 *dst = (u8 *)result;
+	for (int row = 0; row < h; ++row, src += inRowBytes, dst += outRowBytes) {
+		// memmove, since source and destination rows can overlap when packing in place.
+		memmove(dst, src, outRowBytes);
+	}
+
+	return result;
+}
diff --git a/GPU/Common/TextureCacheCommon.h b/GPU/Common/TextureCacheCommon.h
index cfa9738b91..847a1c2917 100644
--- a/GPU/Common/TextureCacheCommon.h
+++ b/GPU/Common/TextureCacheCommon.h
@@ -26,14 +26,26 @@ enum TextureFiltering {
 	TEX_FILTER_LINEAR_VIDEO = 4,
 };
 
+enum FramebufferNotification {  // message kinds accepted by TextureCacheCommon::NotifyFramebuffer
+	NOTIFY_FB_CREATED,    // framebuffer gets tracked in fbCache_ and offered to cached textures in its range
+	NOTIFY_FB_UPDATED,    // handled exactly like NOTIFY_FB_CREATED
+	NOTIFY_FB_DESTROYED,  // framebuffer is dropped from fbCache_ and detached from cached textures in its range
+};
+
 struct VirtualFramebuffer;
 
 class TextureCacheCommon {
 public:
+	TextureCacheCommon();
 	virtual ~TextureCacheCommon();
 
+	void LoadClut(u32 clutAddr, u32 loadBytes);
+
 	virtual bool SetOffsetTexture(u32 offset);
 
+	// FramebufferManager keeps TextureCache updated about what regions of memory are being rendered to.
+	void NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg);
+
 	int AttachedDrawingHeight();
 
 	// Wow this is starting to grow big. Soon need to start looking at resizing it.
@@ -115,9 +127,33 @@ public:
 	};
 
 protected:
+	// Can't be unordered_map, we use lower_bound ... although for some reason that compiles on MSVC.
+	typedef std::map<u64, TexCacheEntry> TexCache;
+
+	void *UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel);
+	void *RearrangeBuf(void *inBuf, u32 inRowBytes, u32 outRowBytes, int h, bool allowInPlace = true);
+
 	void GetSamplingParams(int &minFilt, int &magFilt, bool &sClamp, bool &tClamp, float &lodBias, u8 maxLevel);
 
+	virtual bool AttachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0) = 0;
+	virtual void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer) = 0;
+
+	TexCache cache;
+	std::vector<VirtualFramebuffer *> fbCache_;
+
+	SimpleBuf<u32> tmpTexBuf32;
+	SimpleBuf<u16> tmpTexBuf16;
+	SimpleBuf<u32> tmpTexBufRearrange;
+
 	TexCacheEntry *nextTexture_;
+
+	// Raw is where we keep the original bytes.  Converted is where we swap colors if necessary.
+	u32 *clutBufRaw_;
+	u32 *clutBufConverted_;
+	u32 clutLastFormat_;
+	u32 clutTotalBytes_;
+	u32 clutMaxBytes_;
+	u32 clutRenderAddress_;
 };
 
 inline bool TextureCacheCommon::TexCacheEntry::Matches(u16 dim2, u8 format2, u8 maxLevel2) {
diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h
index f10e35f5c0..84ba1f6878 100644
--- a/GPU/Common/TextureDecoder.h
+++ b/GPU/Common/TextureDecoder.h
@@ -28,6 +28,7 @@ enum CheckAlphaResult {
 #include "Core/MemMap.h"
 #include "GPU/ge_constants.h"
 #include "GPU/Common/TextureDecoderNEON.h"
+#include "GPU/GPUState.h"
 
 void SetupTextureDecoder();
 
diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp
index ebda4de03d..6f828f4742 100644
--- a/GPU/Directx9/TextureCacheDX9.cpp
+++ b/GPU/Directx9/TextureCacheDX9.cpp
@@ -63,23 +63,10 @@ namespace DX9 {
 #define TEXCACHE_MIN_PRESSURE 16 * 1024 * 1024  // Total in VRAM
 #define TEXCACHE_SECOND_MIN_PRESSURE 4 * 1024 * 1024
 
-TextureCacheDX9::TextureCacheDX9() : cacheSizeEstimate_(0), secondCacheSizeEstimate_(0), clearCacheNextFrame_(false), lowMemoryMode_(false), clutBuf_(NULL), clutMaxBytes_(0), clutRenderAddress_(0), texelsScaledThisFrame_(0) {
+TextureCacheDX9::TextureCacheDX9() : cacheSizeEstimate_(0), secondCacheSizeEstimate_(0), clearCacheNextFrame_(false), lowMemoryMode_(false), clutBuf_(NULL), texelsScaledThisFrame_(0) {
 	timesInvalidatedAllThisFrame_ = 0;
 	lastBoundTexture = INVALID_TEX;
 	decimationCounter_ = TEXCACHE_DECIMATION_INTERVAL;
-	// This is 5MB of temporary storage. Might be possible to shrink it.
-	tmpTexBuf32.resize(1024 * 512);  // 2MB
-	tmpTexBuf16.resize(1024 * 512);  // 1MB
-	tmpTexBufRearrange.resize(1024 * 512);   // 2MB
-
-	// TODO: Clamp down to 256/1KB?  Need to check mipmapShareClut and clamp loadclut.
-	clutBufConverted_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16);  // 4KB
-	clutBufRaw_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16);  // 4KB
-
-	// Zap these so that reads from uninitialized parts of the CLUT look the same in
-	// release and debug
-	memset(clutBufConverted_, 0, 1024 * sizeof(u32));
-	memset(clutBufRaw_, 0, 1024 * sizeof(u32));
 
 	D3DCAPS9 pCaps;
 	ZeroMemory(&pCaps, sizeof(pCaps));
@@ -102,8 +89,6 @@ TextureCacheDX9::TextureCacheDX9() : cacheSizeEstimate_(0), secondCacheSizeEstim
 
 TextureCacheDX9::~TextureCacheDX9() {
 	Clear(true);
-	FreeAlignedMemory(clutBufConverted_);
-	FreeAlignedMemory(clutBufRaw_);
 }
 
 static u32 EstimateTexMemoryUsage(const TextureCacheDX9::TexCacheEntry *entry) {
@@ -443,104 +428,6 @@ inline void TextureCacheDX9::DetachFramebuffer(TexCacheEntry *entry, u32 address
 	}
 }
 
-void TextureCacheDX9::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg) {
-	// Must be in VRAM so | 0x04000000 it is.  Also, ignore memory mirrors.
-	// These checks are mainly to reduce scanning all textures.
-	const u32 addr = (address | 0x04000000) & 0x3F9FFFFF;
-	const u32 bpp = framebuffer->format == GE_FORMAT_8888 ? 4 : 2;
-	const u64 cacheKey = (u64)addr << 32;
-	// If it has a clut, those are the low 32 bits, so it'll be inside this range.
-	// Also, if it's a subsample of the buffer, it'll also be within the FBO.
-	const u64 cacheKeyEnd = cacheKey + ((u64)(framebuffer->fb_stride * framebuffer->height * bpp) << 32);
-
-	// The first mirror starts at 0x04200000 and there are 3.  We search all for framebuffers.
-	const u64 mirrorCacheKey = (u64)0x04200000 << 32;
-	const u64 mirrorCacheKeyEnd = (u64)0x04800000 << 32;
-
-	switch (msg) {
-	case NOTIFY_FB_CREATED:
-	case NOTIFY_FB_UPDATED:
-		// Ensure it's in the framebuffer cache.
-		if (std::find(fbCache_.begin(), fbCache_.end(), framebuffer) == fbCache_.end()) {
-			fbCache_.push_back(framebuffer);
-		}
-		for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) {
-			AttachFramebuffer(&it->second, addr, framebuffer);
-		}
-		// Let's assume anything in mirrors is fair game to check.
-		for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) {
-			AttachFramebuffer(&it->second, addr, framebuffer);
-		}
-		break;
-
-	case NOTIFY_FB_DESTROYED:
-		fbCache_.erase(std::remove(fbCache_.begin(), fbCache_.end(),  framebuffer), fbCache_.end());
-		for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) {
-			DetachFramebuffer(&it->second, addr, framebuffer);
-		}
-		for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) {
-			DetachFramebuffer(&it->second, addr, framebuffer);
-		}
-		break;
-	}
-}
-
-void *TextureCacheDX9::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel) {
-	const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2);
-	const u32 pitch = rowWidth / 4;
-	const int bxc = rowWidth / 16;
-	int byc = (height + 7) / 8;
-	if (byc == 0)
-		byc = 1;
-
-	u32 ydest = 0;
-	if (rowWidth >= 16) {
-		u32 *ydestp = tmpTexBuf32.data();
-		// The most common one, so it gets an optimized implementation.
-		DoUnswizzleTex16(texptr, ydestp, bxc, byc, pitch, rowWidth);
-	} else if (rowWidth == 8) {
-		const u32 *src = (const u32 *) texptr;
-		for (int by = 0; by < byc; by++) {
-			for (int n = 0; n < 8; n++, ydest += 2) {
-				tmpTexBuf32[ydest + 0] = *src++;
-				tmpTexBuf32[ydest + 1] = *src++;
-				src += 2; // skip two u32
-			}
-		}
-	} else if (rowWidth == 4) {
-		const u32 *src = (const u32 *) texptr;
-		for (int by = 0; by < byc; by++) {
-			for (int n = 0; n < 8; n++, ydest++) {
-				tmpTexBuf32[ydest] = *src++;
-				src += 3;
-			}
-		}
-	} else if (rowWidth == 2) {
-		const u16 *src = (const u16 *) texptr;
-		for (int by = 0; by < byc; by++) {
-			for (int n = 0; n < 4; n++, ydest++) {
-				u16 n1 = src[0];
-				u16 n2 = src[8];
-				tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 16);
-				src += 16;
-			}
-		}
-	} else if (rowWidth == 1) {
-		const u8 *src = (const u8 *) texptr;
-		for (int by = 0; by < byc; by++) {
-			for (int n = 0; n < 2; n++, ydest++) {
-				u8 n1 = src[ 0];
-				u8 n2 = src[16];
-				u8 n3 = src[32];
-				u8 n4 = src[48];
-				tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 8) | ((u32)n3 << 16) | ((u32)n4 << 24);
-				src += 64;
-			}
-		}
-	}
-	return tmpTexBuf32.data();
-}
-
 void *TextureCacheDX9::ReadIndexedTex(int level, const u8 *texptr, int bytesPerIndex, u32 dstFmt, int bufw) {
 	int w = gstate.getTextureWidth(level);
 	int h = gstate.getTextureHeight(level);
@@ -786,62 +673,6 @@ static inline u32 QuickTexHash(u32 addr, int bufw, int w, int h, GETextureFormat
 	return DoQuickTexHash(checkp, sizeInRAM);
 }
 
-void TextureCacheDX9::LoadClut(u32 clutAddr, u32 loadBytes) {
-	// Clear the uncached bit, etc. to match framebuffers.
-	clutAddr = clutAddr & 0x3FFFFFFF;
-	bool foundFramebuffer = false;
-
-	clutRenderAddress_ = 0;
-	for (size_t i = 0, n = fbCache_.size(); i < n; ++i) {
-		auto framebuffer = fbCache_[i];
-		if ((framebuffer->fb_address | 0x04000000) == clutAddr) {
-			framebuffer->last_frame_clut = gpuStats.numFlips;
-			framebuffer->usageFlags |= FB_USAGE_CLUT;
-			foundFramebuffer = true;
-			WARN_LOG_REPORT_ONCE(clutrenderdx9, G3D, "Using rendered CLUT for texture decode at %08x (%dx%dx%d)", clutAddr, framebuffer->width, framebuffer->height, framebuffer->colorDepth);
-			clutRenderAddress_ = framebuffer->fb_address;
-		}
-	}
-
-	clutTotalBytes_ = loadBytes;
-	if (Memory::IsValidAddress(clutAddr)) {
-		// It's possible for a game to (successfully) access outside valid memory.
-		u32 bytes = Memory::ValidSize(clutAddr, loadBytes);
-		if (foundFramebuffer && !g_Config.bDisableSlowFramebufEffects) {
-			gpu->PerformMemoryDownload(clutAddr, bytes);
-		}
-
-#ifdef _M_SSE
-		int numBlocks = bytes / 16;
-		if (bytes == loadBytes) {
-			const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr);
-			__m128i *dest = (__m128i *)clutBufRaw_;
-			for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) {
-				__m128i data1 = _mm_loadu_si128(source);
-				__m128i data2 = _mm_loadu_si128(source + 1);
-				_mm_store_si128(dest, data1);
-				_mm_store_si128(dest + 1, data2);
-			}
-		} else {
-			Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
-			if (bytes < loadBytes) {
-				memset(clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
-			}
-		}
-#else
-		Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
-		if (bytes < clutTotalBytes_) {
-			memset(clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
-		}
-#endif
-	} else {
-		memset(clutBufRaw_, 0x00, loadBytes);
-	}
-	// Reload the clut next time.
-	clutLastFormat_ = 0xFFFFFFFF;
-	clutMaxBytes_ = std::max(clutMaxBytes_, loadBytes);
-}
-
 void TextureCacheDX9::UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) {
 	const u32 clutBaseBytes = clutBase * (clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16));
 	// Technically, these extra bytes weren't loaded, but hopefully it was loaded earlier.
@@ -1202,7 +1033,7 @@ void TextureCacheDX9::SetTexture(bool force) {
 		// Check for FBO - slow!
 		if (entry->framebuffer) {
 			if (match) {
-				if (hasClut && clutRenderAddress_ != 0) {
+				if (hasClut && clutRenderAddress_ != 0xFFFFFFFF) {
 					WARN_LOG_REPORT_ONCE(clutAndTexRender, G3D, "Using rendered texture with rendered CLUT: texfmt=%d, clutfmt=%d", gstate.getTextureFormat(), gstate.getClutPaletteFormat());
 				}
 
@@ -1378,7 +1209,7 @@ void TextureCacheDX9::SetTexture(bool force) {
 		TexCacheEntry entryNew = {0};
 		cache[cachekey] = entryNew;
 
-		if (hasClut && clutRenderAddress_ != 0) {
+		if (hasClut && clutRenderAddress_ != 0xFFFFFFFF) {
 			WARN_LOG_REPORT_ONCE(clutUseRender, G3D, "Using texture with rendered CLUT: texfmt=%d, clutfmt=%d", gstate.getTextureFormat(), gstate.getClutPaletteFormat());
 		}
 
@@ -1759,7 +1590,7 @@ void *TextureCacheDX9::DecodeTextureLevel(GETextureFormat format, GEPaletteForma
 		ERROR_LOG_REPORT(G3D, "NO finalbuf! Will crash!");
 	}
 
-	if (w != bufw) {
+	if (!(g_Config.iTexScalingLevel == 1 && gstate_c.Supports(GPU_SUPPORTS_UNPACK_SUBIMAGE)) && w != bufw) {
 		int pixelSize;
 		switch (dstFmt) {
 		case D3DFMT_A4R4G4B4:
@@ -1772,21 +1603,7 @@ void *TextureCacheDX9::DecodeTextureLevel(GETextureFormat format, GEPaletteForma
 			break;
 		}
 		// Need to rearrange the buffer to simulate GL_UNPACK_ROW_LENGTH etc.
-		int inRowBytes = bufw * pixelSize;
-		int outRowBytes = w * pixelSize;
-		const u8 *read = (const u8 *)finalBuf;
-		u8 *write = 0;
-		if (w > bufw) {
-			write = (u8 *)tmpTexBufRearrange.data();
-			finalBuf = tmpTexBufRearrange.data();
-		} else {
-			write = (u8 *)finalBuf;
-		}
-		for (int y = 0; y < h; y++) {
-			memmove(write, read, outRowBytes);
-			read += inRowBytes;
-			write += outRowBytes;
-		}
+		finalBuf = RearrangeBuf(finalBuf, bufw * pixelSize, w * pixelSize, h);
 	}
 
 	return finalBuf;
diff --git a/GPU/Directx9/TextureCacheDX9.h b/GPU/Directx9/TextureCacheDX9.h
index df99d26930..c3ce5d7bc0 100644
--- a/GPU/Directx9/TextureCacheDX9.h
+++ b/GPU/Directx9/TextureCacheDX9.h
@@ -35,12 +35,6 @@ class FramebufferManagerDX9;
 class DepalShaderCacheDX9;
 class ShaderManagerDX9;
 
-enum FramebufferNotification {
-	NOTIFY_FB_CREATED,
-	NOTIFY_FB_UPDATED,
-	NOTIFY_FB_DESTROYED,
-};
-
 class TextureCacheDX9 : public TextureCacheCommon {
 public:
 	TextureCacheDX9();
@@ -54,11 +48,6 @@ public:
 	void Invalidate(u32 addr, int size, GPUInvalidationType type);
 	void InvalidateAll(GPUInvalidationType type);
 	void ClearNextFrame();
-	void LoadClut(u32 clutAddr, u32 loadBytes);
-
-	// FramebufferManager keeps TextureCache updated about what regions of memory
-	// are being rendered to. This is barebones so far.
-	void NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg);
 
 	void SetFramebufferManager(FramebufferManagerDX9 *fbManager) {
 		framebufferManager_ = fbManager;
@@ -84,12 +73,8 @@ public:
 	void ApplyTexture();
 
 private:
-	// Can't be unordered_map, we use lower_bound ... although for some reason that compiles on MSVC.
-	typedef std::map<u64, TexCacheEntry> TexCache;
-
 	void Decimate();  // Run this once per frame to get rid of old textures.
 	void DeleteTexture(TexCache::iterator it);
-	void *UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel);
 	void *ReadIndexedTex(int level, const u8 *texptr, int bytesPerIndex, u32 dstFmt, int bufw);
 	void UpdateSamplingParams(TexCacheEntry &entry, bool force);
 	void LoadTextureLevel(TexCacheEntry &entry, int level, int maxLevel, bool replaceImages, int scaleFactor, u32 dstFmt);
@@ -100,8 +85,8 @@ private:
 	const T *GetCurrentClut();
 	u32 GetCurrentClutHash();
 	void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple);
-	bool AttachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0);
-	void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer);
+	bool AttachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0) override;
+	void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer) override;
 	void SetTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer);
 	void ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer);
 
@@ -116,9 +101,7 @@ private:
 		}
 	}
 
-	TexCache cache;
 	TexCache secondCache;
-	std::vector<VirtualFramebuffer *> fbCache_;
 	u32 cacheSizeEstimate_;
 	u32 secondCacheSizeEstimate_;
 
@@ -135,22 +118,11 @@ private:
 	bool lowMemoryMode_;
 	TextureScalerDX9 scaler;
 
-	SimpleBuf<u32> tmpTexBuf32;
-	SimpleBuf<u16> tmpTexBuf16;
-
-	SimpleBuf<u32> tmpTexBufRearrange;
-
-	u32 clutLastFormat_;
-	u32 *clutBufRaw_;
-	u32 *clutBufConverted_;
 	u32 *clutBuf_;
 	u32 clutHash_;
-	u32 clutTotalBytes_;
-	u32 clutMaxBytes_;
 	// True if the clut is just alpha values in the same order (RGBA4444-bit only.)
 	bool clutAlphaLinear_;
 	u16 clutAlphaLinearColor_;
-	u32 clutRenderAddress_;
 
 	LPDIRECT3DTEXTURE9 lastBoundTexture;
 	float maxAnisotropyLevel;
diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp
index 02255aa46b..4ce09e2980 100644
--- a/GPU/GLES/TextureCache.cpp
+++ b/GPU/GLES/TextureCache.cpp
@@ -69,26 +69,15 @@
 #define GL_UNPACK_ROW_LENGTH 0x0CF2
 #endif
 
+#define INVALID_TEX -1  // stored in lastBoundTexture when no texture is tracked as bound
+
 // Hack!
 extern int g_iNumVideos;
 
-TextureCache::TextureCache() : cacheSizeEstimate_(0), secondCacheSizeEstimate_(0), clearCacheNextFrame_(false), lowMemoryMode_(false), clutBuf_(NULL), clutMaxBytes_(0), clutRenderAddress_(0), texelsScaledThisFrame_(0) {
+TextureCache::TextureCache() : cacheSizeEstimate_(0), secondCacheSizeEstimate_(0), clearCacheNextFrame_(false), lowMemoryMode_(false), clutBuf_(NULL), texelsScaledThisFrame_(0) {
 	timesInvalidatedAllThisFrame_ = 0;
-	lastBoundTexture = -1;
+	lastBoundTexture = INVALID_TEX;
 	decimationCounter_ = TEXCACHE_DECIMATION_INTERVAL;
-	// This is 5MB of temporary storage. Might be possible to shrink it.
-	tmpTexBuf32.resize(1024 * 512);  // 2MB
-	tmpTexBuf16.resize(1024 * 512);  // 1MB
-	tmpTexBufRearrange.resize(1024 * 512);   // 2MB
-
-	// TODO: Clamp down to 256/1KB?  Need to check mipmapShareClut and clamp loadclut.
-	clutBufConverted_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16);  // 4KB
-	clutBufRaw_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16);  // 4KB
-
-	// Zap these so that reads from uninitialized parts of the CLUT look the same in
-	// release and debug
-	memset(clutBufConverted_, 0, 1024 * sizeof(u32));
-	memset(clutBufRaw_, 0, 1024 * sizeof(u32));
 
 	glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropyLevel);
 	SetupTextureDecoder();
@@ -98,8 +87,6 @@ TextureCache::TextureCache() : cacheSizeEstimate_(0), secondCacheSizeEstimate_(0
 
 TextureCache::~TextureCache() {
 	Clear(true);
-	FreeAlignedMemory(clutBufConverted_);
-	FreeAlignedMemory(clutBufRaw_);
 }
 
 static u32 EstimateTexMemoryUsage(const TextureCache::TexCacheEntry *entry) {
@@ -136,7 +123,7 @@ static u32 EstimateTexMemoryUsage(const TextureCache::TexCacheEntry *entry) {
 
 void TextureCache::Clear(bool delete_them) {
 	glBindTexture(GL_TEXTURE_2D, 0);
-	lastBoundTexture = -1;
+	lastBoundTexture = INVALID_TEX;
 	if (delete_them) {
 		for (TexCache::iterator iter = cache.begin(); iter != cache.end(); ++iter) {
 			DEBUG_LOG(G3D, "Deleting texture %i", iter->second.textureName);
@@ -184,7 +171,7 @@ void TextureCache::Decimate() {
 		const u32 had = cacheSizeEstimate_;
 
 		glBindTexture(GL_TEXTURE_2D, 0);
-		lastBoundTexture = -1;
+		lastBoundTexture = INVALID_TEX;
 		int killAge = lowMemoryMode_ ? TEXTURE_KILL_AGE_LOWMEM : TEXTURE_KILL_AGE;
 		for (TexCache::iterator iter = cache.begin(); iter != cache.end(); ) {
 			if (iter->second.lastFrame + killAge < gpuStats.numFlips) {
@@ -441,104 +428,6 @@ inline void TextureCache::DetachFramebuffer(TexCacheEntry *entry, u32 address, V
 	}
 }
 
-void TextureCache::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg) {
-	// Must be in VRAM so | 0x04000000 it is.  Also, ignore memory mirrors.
-	// These checks are mainly to reduce scanning all textures.
-	const u32 addr = (address | 0x04000000) & 0x3F9FFFFF;
-	const u32 bpp = framebuffer->format == GE_FORMAT_8888 ? 4 : 2;
-	const u64 cacheKey = (u64)addr << 32;
-	// If it has a clut, those are the low 32 bits, so it'll be inside this range.
-	// Also, if it's a subsample of the buffer, it'll also be within the FBO.
-	const u64 cacheKeyEnd = cacheKey + ((u64)(framebuffer->fb_stride * framebuffer->height * bpp) << 32);
-
-	// The first mirror starts at 0x04200000 and there are 3.  We search all for framebuffers.
-	const u64 mirrorCacheKey = (u64)0x04200000 << 32;
-	const u64 mirrorCacheKeyEnd = (u64)0x04800000 << 32;
-
-	switch (msg) {
-	case NOTIFY_FB_CREATED:
-	case NOTIFY_FB_UPDATED:
-		// Ensure it's in the framebuffer cache.
-		if (std::find(fbCache_.begin(), fbCache_.end(), framebuffer) == fbCache_.end()) {
-			fbCache_.push_back(framebuffer);
-		}
-		for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) {
-			AttachFramebuffer(&it->second, addr, framebuffer);
-		}
-		// Let's assume anything in mirrors is fair game to check.
-		for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) {
-			AttachFramebuffer(&it->second, addr, framebuffer);
-		}
-		break;
-
-	case NOTIFY_FB_DESTROYED:
-		fbCache_.erase(std::remove(fbCache_.begin(), fbCache_.end(),  framebuffer), fbCache_.end());
-		for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) {
-			DetachFramebuffer(&it->second, addr, framebuffer);
-		}
-		for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) {
-			DetachFramebuffer(&it->second, addr, framebuffer);
-		}
-		break;
-	}
-}
-
-void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel) {
-	const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2);
-	const u32 pitch = rowWidth / 4;
-	const int bxc = rowWidth / 16;
-	int byc = (height + 7) / 8;
-	if (byc == 0)
-		byc = 1;
-
-	u32 ydest = 0;
-	if (rowWidth >= 16) {
-		u32 *ydestp = tmpTexBuf32.data();
-		// The most common one, so it gets an optimized implementation.
-		DoUnswizzleTex16(texptr, ydestp, bxc, byc, pitch, rowWidth);
-	} else if (rowWidth == 8) {
-		const u32 *src = (const u32 *) texptr;
-		for (int by = 0; by < byc; by++) {
-			for (int n = 0; n < 8; n++, ydest += 2) {
-				tmpTexBuf32[ydest + 0] = *src++;
-				tmpTexBuf32[ydest + 1] = *src++;
-				src += 2; // skip two u32
-			}
-		}
-	} else if (rowWidth == 4) {
-		const u32 *src = (const u32 *) texptr;
-		for (int by = 0; by < byc; by++) {
-			for (int n = 0; n < 8; n++, ydest++) {
-				tmpTexBuf32[ydest] = *src++;
-				src += 3;
-			}
-		}
-	} else if (rowWidth == 2) {
-		const u16 *src = (const u16 *) texptr;
-		for (int by = 0; by < byc; by++) {
-			for (int n = 0; n < 4; n++, ydest++) {
-				u16 n1 = src[0];
-				u16 n2 = src[8];
-				tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 16);
-				src += 16;
-			}
-		}
-	} else if (rowWidth == 1) {
-		const u8 *src = (const u8 *) texptr;
-		for (int by = 0; by < byc; by++) {
-			for (int n = 0; n < 2; n++, ydest++) {
-				u8 n1 = src[ 0];
-				u8 n2 = src[16];
-				u8 n3 = src[32];
-				u8 n4 = src[48];
-				tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 8) | ((u32)n3 << 16) | ((u32)n4 << 24);
-				src += 64;
-			}
-		}
-	}
-	return tmpTexBuf32.data();
-}
-
 void *TextureCache::ReadIndexedTex(int level, const u8 *texptr, int bytesPerIndex, GLuint dstFmt, int bufw) {
 	int w = gstate.getTextureWidth(level);
 	int h = gstate.getTextureHeight(level);
@@ -780,7 +669,7 @@ static void ConvertColors(void *dstBuf, const void *srcBuf, GLuint dstFmt, int n
 }
 
 void TextureCache::StartFrame() {
-	lastBoundTexture = -1;
+	lastBoundTexture = INVALID_TEX;
 	timesInvalidatedAllThisFrame_ = 0;
 
 	if (texelsScaledThisFrame_) {
@@ -810,62 +699,6 @@ static inline u32 QuickTexHash(u32 addr, int bufw, int w, int h, GETextureFormat
 	return DoQuickTexHash(checkp, sizeInRAM);
 }
 
-void TextureCache::LoadClut(u32 clutAddr, u32 loadBytes) {
-	// Clear the uncached bit, etc. to match framebuffers.
-	clutAddr = clutAddr & 0x3FFFFFFF;
-	bool foundFramebuffer = false;
-
-	clutRenderAddress_ = 0;
-	for (size_t i = 0, n = fbCache_.size(); i < n; ++i) {
-		auto framebuffer = fbCache_[i];
-		if ((framebuffer->fb_address | 0x04000000) == clutAddr) {
-			framebuffer->last_frame_clut = gpuStats.numFlips;
-			framebuffer->usageFlags |= FB_USAGE_CLUT;
-			foundFramebuffer = true;
-			WARN_LOG_REPORT_ONCE(clutrender, G3D, "Using rendered CLUT for texture decode at %08x (%dx%dx%d)", clutAddr, framebuffer->width, framebuffer->height, framebuffer->colorDepth);
-			clutRenderAddress_ = framebuffer->fb_address;
-		}
-	}
-
-	clutTotalBytes_ = loadBytes;
-	if (Memory::IsValidAddress(clutAddr)) {
-		// It's possible for a game to (successfully) access outside valid memory.
-		u32 bytes = Memory::ValidSize(clutAddr, loadBytes);
-		if (foundFramebuffer && !g_Config.bDisableSlowFramebufEffects) {
-			gpu->PerformMemoryDownload(clutAddr, bytes);
-		}
-
-#ifdef _M_SSE
-		int numBlocks = bytes / 16;
-		if (bytes == loadBytes) {
-			const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr);
-			__m128i *dest = (__m128i *)clutBufRaw_;
-			for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) {
-				__m128i data1 = _mm_loadu_si128(source);
-				__m128i data2 = _mm_loadu_si128(source + 1);
-				_mm_store_si128(dest, data1);
-				_mm_store_si128(dest + 1, data2);
-			}
-		} else {
-			Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
-			if (bytes < loadBytes) {
-				memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
-			}
-		}
-#else
-		Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
-		if (bytes < clutTotalBytes_) {
-			memset((u8 *)clutBufRaw_ + bytes, 0x00, clutTotalBytes_ - bytes);
-		}
-#endif
-	} else {
-		memset(clutBufRaw_, 0x00, loadBytes);
-	}
-	// Reload the clut next time.
-	clutLastFormat_ = 0xFFFFFFFF;
-	clutMaxBytes_ = std::max(clutMaxBytes_, loadBytes);
-}
-
 void TextureCache::UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) {
 	const u32 clutBaseBytes = clutFormat == GE_CMODE_32BIT_ABGR8888 ? (clutBase * sizeof(u32)) : (clutBase * sizeof(u16));
 	// Technically, these extra bytes weren't loaded, but hopefully it was loaded earlier.
@@ -1179,7 +1012,7 @@ void TextureCache::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuf
 	framebufferManager_->RebindFramebuffer();
 	SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight);
 
-	lastBoundTexture = -1;
+	lastBoundTexture = INVALID_TEX;
 }
 
 bool TextureCache::SetOffsetTexture(u32 offset) {
@@ -1221,20 +1054,20 @@ void TextureCache::SetTexture(bool force) {
 #ifdef DEBUG_TEXTURES
 	if (SetDebugTexture()) {
 		// A different texture was bound, let's rebind next time.
-		lastBoundTexture = -1;
+		lastBoundTexture = INVALID_TEX;
 		return;
 	}
 #endif
 
 	if (force) {
-		lastBoundTexture = -1;
+		lastBoundTexture = INVALID_TEX;
 	}
 
 	u32 texaddr = gstate.getTextureAddress(0);
 	if (!Memory::IsValidAddress(texaddr)) {
 		// Bind a null texture and return.
 		glBindTexture(GL_TEXTURE_2D, 0);
-		lastBoundTexture = -1;
+		lastBoundTexture = INVALID_TEX;
 		return;
 	}
 
@@ -1285,7 +1118,7 @@ void TextureCache::SetTexture(bool force) {
 		// Check for FBO - slow!
 		if (entry->framebuffer) {
 			if (match) {
-				if (hasClut && clutRenderAddress_ != 0) {
+				if (hasClut && clutRenderAddress_ != 0xFFFFFFFF) {
 					WARN_LOG_REPORT_ONCE(clutAndTexRender, G3D, "Using rendered texture with rendered CLUT: texfmt=%d, clutfmt=%d", gstate.getTextureFormat(), gstate.getClutPaletteFormat());
 				}
 
@@ -1433,7 +1266,7 @@ void TextureCache::SetTexture(bool force) {
 					replaceImages = true;
 				} else {
 					if (entry->textureName == lastBoundTexture) {
-						lastBoundTexture = -1;
+						lastBoundTexture = INVALID_TEX;
 					}
 					glDeleteTextures(1, &entry->textureName);
 				}
@@ -1459,7 +1292,7 @@ void TextureCache::SetTexture(bool force) {
 		TexCacheEntry entryNew = {0};
 		cache[cachekey] = entryNew;
 
-		if (hasClut && clutRenderAddress_ != 0) {
+		if (hasClut && clutRenderAddress_ != 0xFFFFFFFF) {
 			WARN_LOG_REPORT_ONCE(clutUseRender, G3D, "Using texture with rendered CLUT: texfmt=%d, clutfmt=%d", gstate.getTextureFormat(), gstate.getClutPaletteFormat());
 		}
 
@@ -1927,21 +1760,7 @@ void *TextureCache::DecodeTextureLevel(GETextureFormat format, GEPaletteFormat c
 		}
 
 		// Need to rearrange the buffer to simulate GL_UNPACK_ROW_LENGTH etc.
-		int inRowBytes = bufw * pixelSize;
-		int outRowBytes = w * pixelSize;
-		const u8 *read = (const u8 *)finalBuf;
-		u8 *write = 0;
-		if (w > bufw) {
-			write = (u8 *)tmpTexBufRearrange.data();
-			finalBuf = tmpTexBufRearrange.data();
-		} else {
-			write = (u8 *)finalBuf;
-		}
-		for (int y = 0; y < h; y++) {
-			memmove(write, read, outRowBytes);
-			read += inRowBytes;
-			write += outRowBytes;
-		}
+		finalBuf = RearrangeBuf(finalBuf, bufw * pixelSize, w * pixelSize, h);
 	}
 
 	return finalBuf;
diff --git a/GPU/GLES/TextureCache.h b/GPU/GLES/TextureCache.h
index e26dcbcd82..903edb184a 100644
--- a/GPU/GLES/TextureCache.h
+++ b/GPU/GLES/TextureCache.h
@@ -34,12 +34,6 @@ class DepalShaderCache;
 class ShaderManager;
 class TransformDrawEngine;
 
-enum FramebufferNotification {
-	NOTIFY_FB_CREATED,
-	NOTIFY_FB_UPDATED,
-	NOTIFY_FB_DESTROYED,
-};
-
 inline bool UseBGRA8888() {
 	// TODO: Other platforms?  May depend on vendor which is faster?
 #ifdef _WIN32
@@ -61,11 +55,6 @@ public:
 	void Invalidate(u32 addr, int size, GPUInvalidationType type);
 	void InvalidateAll(GPUInvalidationType type);
 	void ClearNextFrame();
-	void LoadClut(u32 clutAddr, u32 loadBytes);
-
-	// FramebufferManager keeps TextureCache updated about what regions of memory
-	// are being rendered to. This is barebones so far.
-	void NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg);
 
 	void SetFramebufferManager(FramebufferManager *fbManager) {
 		framebufferManager_ = fbManager;
@@ -99,12 +88,8 @@ public:
 	void ApplyTexture();
 
 private:
-	// Can't be unordered_map, we use lower_bound ... although for some reason that compiles on MSVC.
-	typedef std::map<u64, TexCacheEntry> TexCache;
-
 	void Decimate();  // Run this once per frame to get rid of old textures.
 	void DeleteTexture(TexCache::iterator it);
-	void *UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel);
 	void *ReadIndexedTex(int level, const u8 *texptr, int bytesPerIndex, GLuint dstFmt, int bufw);
 	void UpdateSamplingParams(TexCacheEntry &entry, bool force);
 	void LoadTextureLevel(TexCacheEntry &entry, int level, bool replaceImages, int scaleFactor, GLenum dstFmt);
@@ -115,14 +100,12 @@ private:
 	const T *GetCurrentClut();
 	u32 GetCurrentClutHash();
 	void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple);
-	bool AttachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0);
-	void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer);
+	bool AttachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0) override;
+	void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer) override;
 	void SetTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer);
 	void ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer);
 
-	TexCache cache;
 	TexCache secondCache;
-	std::vector<VirtualFramebuffer *> fbCache_;
 	std::vector<u32> nameCache_;
 	u32 cacheSizeEstimate_;
 	u32 secondCacheSizeEstimate_;
@@ -141,22 +124,11 @@ private:
 
 	TextureScalerGL scaler;
 
-	SimpleBuf<u32> tmpTexBuf32;
-	SimpleBuf<u16> tmpTexBuf16;
-
-	SimpleBuf<u32> tmpTexBufRearrange;
-
-	u32 clutLastFormat_;
-	u32 *clutBufRaw_;
-	u32 *clutBufConverted_;
 	u32 *clutBuf_;
 	u32 clutHash_;
-	u32 clutTotalBytes_;
-	u32 clutMaxBytes_;
 	// True if the clut is just alpha values in the same order (RGBA4444-bit only.)
 	bool clutAlphaLinear_;
 	u16 clutAlphaLinearColor_;
-	u32 clutRenderAddress_;
 
 	u32 lastBoundTexture;
 	float maxAnisotropyLevel;