diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index 06d5d14126..30207f1c90 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -15,17 +15,42 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. +#include <algorithm> +#include "Common/MemoryUtil.h" #include "Core/Config.h" +#include "Core/Reporting.h" #include "GPU/Common/FramebufferCommon.h" #include "GPU/Common/TextureCacheCommon.h" +#include "GPU/Common/TextureDecoder.h" #include "GPU/Common/ShaderId.h" #include "GPU/Common/GPUStateUtils.h" #include "GPU/GPUState.h" +#include "GPU/GPUInterface.h" // Ugly. extern int g_iNumVideos; -TextureCacheCommon::~TextureCacheCommon() {} +TextureCacheCommon::TextureCacheCommon() + : nextTexture_(nullptr), + clutLastFormat_(0xFFFFFFFF), clutTotalBytes_(0), clutMaxBytes_(0), clutRenderAddress_(0xFFFFFFFF) { + // TODO: Clamp down to 256/1KB? Need to check mipmapShareClut and clamp loadclut. + clutBufRaw_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16); // 4KB + clutBufConverted_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16); // 4KB + + // Zap so we get consistent behavior if the game fails to load some of the CLUT. + memset(clutBufRaw_, 0, 1024 * sizeof(u32)); + memset(clutBufConverted_, 0, 1024 * sizeof(u32)); + + // This is 5MB of temporary storage. Might be possible to shrink it. 
+ tmpTexBuf32.resize(1024 * 512); // 2MB + tmpTexBuf16.resize(1024 * 512); // 1MB + tmpTexBufRearrange.resize(1024 * 512); // 2MB +} + +TextureCacheCommon::~TextureCacheCommon() { + FreeAlignedMemory(clutBufConverted_); + FreeAlignedMemory(clutBufRaw_); +} bool TextureCacheCommon::SetOffsetTexture(u32 offset) { return false; @@ -88,3 +113,182 @@ void TextureCacheCommon::GetSamplingParams(int &minFilt, int &magFilt, bool &sCl minFilt &= 1; } } + +void TextureCacheCommon::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg) { + // Must be in VRAM so | 0x04000000 it is. Also, ignore memory mirrors. + // These checks are mainly to reduce scanning all textures. + const u32 addr = (address | 0x04000000) & 0x3F9FFFFF; + const u32 bpp = framebuffer->format == GE_FORMAT_8888 ? 4 : 2; + const u64 cacheKey = (u64)addr << 32; + // If it has a clut, those are the low 32 bits, so it'll be inside this range. + // Also, if it's a subsample of the buffer, it'll also be within the FBO. + const u64 cacheKeyEnd = cacheKey + ((u64)(framebuffer->fb_stride * framebuffer->height * bpp) << 32); + + // The first mirror starts at 0x04200000 and there are 3. We search all for framebuffers. + const u64 mirrorCacheKey = (u64)0x04200000 << 32; + const u64 mirrorCacheKeyEnd = (u64)0x04800000 << 32; + + switch (msg) { + case NOTIFY_FB_CREATED: + case NOTIFY_FB_UPDATED: + // Ensure it's in the framebuffer cache. + if (std::find(fbCache_.begin(), fbCache_.end(), framebuffer) == fbCache_.end()) { + fbCache_.push_back(framebuffer); + } + for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) { + AttachFramebuffer(&it->second, addr, framebuffer); + } + // Let's assume anything in mirrors is fair game to check. 
+ for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) { + const u64 mirrorlessKey = it->first & ~0x0060000000000000ULL; + // Let's still make sure it's in the cache range. + if (mirrorlessKey >= cacheKey && mirrorlessKey <= cacheKeyEnd) { + AttachFramebuffer(&it->second, addr, framebuffer); + } + } + break; + + case NOTIFY_FB_DESTROYED: + fbCache_.erase(std::remove(fbCache_.begin(), fbCache_.end(), framebuffer), fbCache_.end()); + for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) { + DetachFramebuffer(&it->second, addr, framebuffer); + } + for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) { + const u64 mirrorlessKey = it->first & ~0x0060000000000000ULL; + // Let's still make sure it's in the cache range. + if (mirrorlessKey >= cacheKey && mirrorlessKey <= cacheKeyEnd) { + DetachFramebuffer(&it->second, addr, framebuffer); + } + } + break; + } +} + +void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) { + clutTotalBytes_ = loadBytes; + clutRenderAddress_ = 0xFFFFFFFF; + + if (Memory::IsValidAddress(clutAddr)) { + if (Memory::IsVRAMAddress(clutAddr)) { + // Clear the uncached bit, etc. to match framebuffers. + const u32 clutFramebufAddr = clutAddr & 0x3FFFFFFF; + + for (size_t i = 0, n = fbCache_.size(); i < n; ++i) { + auto framebuffer = fbCache_[i]; + if ((framebuffer->fb_address | 0x04000000) == clutFramebufAddr) { + framebuffer->last_frame_clut = gpuStats.numFlips; + framebuffer->usageFlags |= FB_USAGE_CLUT; + clutRenderAddress_ = framebuffer->fb_address; + } + } + } + + // It's possible for a game to (successfully) access outside valid memory. 
+ u32 bytes = Memory::ValidSize(clutAddr, loadBytes); + if (clutRenderAddress_ != 0xFFFFFFFF && !g_Config.bDisableSlowFramebufEffects) { + gpu->PerformMemoryDownload(clutAddr, bytes); + } + +#ifdef _M_SSE + int numBlocks = bytes / 32; // Each iteration below copies two 16-byte vectors (32 bytes); bytes / 16 would copy twice the validated size. + if (bytes == loadBytes) { + const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr); + __m128i *dest = (__m128i *)clutBufRaw_; + for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) { + __m128i data1 = _mm_loadu_si128(source); + __m128i data2 = _mm_loadu_si128(source + 1); + _mm_store_si128(dest, data1); + _mm_store_si128(dest + 1, data2); + } + } else { + Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); + if (bytes < loadBytes) { + memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); + } + } +#else + Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); + if (bytes < clutTotalBytes_) { + memset((u8 *)clutBufRaw_ + bytes, 0x00, clutTotalBytes_ - bytes); + } +#endif + } else { + memset(clutBufRaw_, 0x00, loadBytes); + } + // Reload the clut next time. + clutLastFormat_ = 0xFFFFFFFF; + clutMaxBytes_ = std::max(clutMaxBytes_, loadBytes); +} + +void *TextureCacheCommon::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel) { + const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2); + const u32 pitch = rowWidth / 4; + const int bxc = rowWidth / 16; + int byc = (height + 7) / 8; + if (byc == 0) + byc = 1; + + u32 ydest = 0; + if (rowWidth >= 16) { + u32 *ydestp = tmpTexBuf32.data(); + // The most common one, so it gets an optimized implementation. 
+ DoUnswizzleTex16(texptr, ydestp, bxc, byc, pitch, rowWidth); + } else if (rowWidth == 8) { + const u32 *src = (const u32 *) texptr; + for (int by = 0; by < byc; by++) { + for (int n = 0; n < 8; n++, ydest += 2) { + tmpTexBuf32[ydest + 0] = *src++; + tmpTexBuf32[ydest + 1] = *src++; + src += 2; // skip two u32 + } + } + } else if (rowWidth == 4) { + const u32 *src = (const u32 *) texptr; + for (int by = 0; by < byc; by++) { + for (int n = 0; n < 8; n++, ydest++) { + tmpTexBuf32[ydest] = *src++; + src += 3; + } + } + } else if (rowWidth == 2) { + const u16 *src = (const u16 *) texptr; + for (int by = 0; by < byc; by++) { + for (int n = 0; n < 4; n++, ydest++) { + u16 n1 = src[0]; + u16 n2 = src[8]; + tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 16); + src += 16; + } + } + } else if (rowWidth == 1) { + const u8 *src = (const u8 *) texptr; + for (int by = 0; by < byc; by++) { + for (int n = 0; n < 2; n++, ydest++) { + u8 n1 = src[ 0]; + u8 n2 = src[16]; + u8 n3 = src[32]; + u8 n4 = src[48]; + tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 8) | ((u32)n3 << 16) | ((u32)n4 << 24); + src += 64; + } + } + } + return tmpTexBuf32.data(); +} + +void *TextureCacheCommon::RearrangeBuf(void *inBuf, u32 inRowBytes, u32 outRowBytes, int h, bool allowInPlace) { + const u8 *read = (const u8 *)inBuf; + void *outBuf = inBuf; + u8 *write = (u8 *)inBuf; + if (outRowBytes > inRowBytes || !allowInPlace) { + write = (u8 *)tmpTexBufRearrange.data(); + outBuf = tmpTexBufRearrange.data(); + } + for (int y = 0; y < h; y++) { + memmove(write, read, outRowBytes); + read += inRowBytes; + write += outRowBytes; + } + + return outBuf; +} diff --git a/GPU/Common/TextureCacheCommon.h b/GPU/Common/TextureCacheCommon.h index cfa9738b91..847a1c2917 100644 --- a/GPU/Common/TextureCacheCommon.h +++ b/GPU/Common/TextureCacheCommon.h @@ -26,14 +26,26 @@ enum TextureFiltering { TEX_FILTER_LINEAR_VIDEO = 4, }; +enum FramebufferNotification { + NOTIFY_FB_CREATED, + NOTIFY_FB_UPDATED, + NOTIFY_FB_DESTROYED, +}; + 
struct VirtualFramebuffer; class TextureCacheCommon { public: + TextureCacheCommon(); virtual ~TextureCacheCommon(); + void LoadClut(u32 clutAddr, u32 loadBytes); + virtual bool SetOffsetTexture(u32 offset); + // FramebufferManager keeps TextureCache updated about what regions of memory are being rendered to. + void NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg); + int AttachedDrawingHeight(); // Wow this is starting to grow big. Soon need to start looking at resizing it. @@ -115,9 +127,33 @@ public: }; protected: + // Can't be unordered_map, we use lower_bound ... although for some reason that compiles on MSVC. + typedef std::map TexCache; + + void *UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel); + void *RearrangeBuf(void *inBuf, u32 inRowBytes, u32 outRowBytes, int h, bool allowInPlace = true); + void GetSamplingParams(int &minFilt, int &magFilt, bool &sClamp, bool &tClamp, float &lodBias, u8 maxLevel); + virtual bool AttachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0) = 0; + virtual void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer) = 0; + + TexCache cache; + std::vector fbCache_; + + SimpleBuf tmpTexBuf32; + SimpleBuf tmpTexBuf16; + SimpleBuf tmpTexBufRearrange; + TexCacheEntry *nextTexture_; + + // Raw is where we keep the original bytes. Converted is where we swap colors if necessary. 
+ u32 *clutBufRaw_; + u32 *clutBufConverted_; + u32 clutLastFormat_; + u32 clutTotalBytes_; + u32 clutMaxBytes_; + u32 clutRenderAddress_; }; inline bool TextureCacheCommon::TexCacheEntry::Matches(u16 dim2, u8 format2, u8 maxLevel2) { diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h index f10e35f5c0..84ba1f6878 100644 --- a/GPU/Common/TextureDecoder.h +++ b/GPU/Common/TextureDecoder.h @@ -28,6 +28,7 @@ enum CheckAlphaResult { #include "Core/MemMap.h" #include "GPU/ge_constants.h" #include "GPU/Common/TextureDecoderNEON.h" +#include "GPU/GPUState.h" void SetupTextureDecoder(); diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp index ebda4de03d..6f828f4742 100644 --- a/GPU/Directx9/TextureCacheDX9.cpp +++ b/GPU/Directx9/TextureCacheDX9.cpp @@ -63,23 +63,10 @@ namespace DX9 { #define TEXCACHE_MIN_PRESSURE 16 * 1024 * 1024 // Total in VRAM #define TEXCACHE_SECOND_MIN_PRESSURE 4 * 1024 * 1024 -TextureCacheDX9::TextureCacheDX9() : cacheSizeEstimate_(0), secondCacheSizeEstimate_(0), clearCacheNextFrame_(false), lowMemoryMode_(false), clutBuf_(NULL), clutMaxBytes_(0), clutRenderAddress_(0), texelsScaledThisFrame_(0) { +TextureCacheDX9::TextureCacheDX9() : cacheSizeEstimate_(0), secondCacheSizeEstimate_(0), clearCacheNextFrame_(false), lowMemoryMode_(false), clutBuf_(NULL), texelsScaledThisFrame_(0) { timesInvalidatedAllThisFrame_ = 0; lastBoundTexture = INVALID_TEX; decimationCounter_ = TEXCACHE_DECIMATION_INTERVAL; - // This is 5MB of temporary storage. Might be possible to shrink it. - tmpTexBuf32.resize(1024 * 512); // 2MB - tmpTexBuf16.resize(1024 * 512); // 1MB - tmpTexBufRearrange.resize(1024 * 512); // 2MB - - // TODO: Clamp down to 256/1KB? Need to check mipmapShareClut and clamp loadclut. 
- clutBufConverted_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16); // 4KB - clutBufRaw_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16); // 4KB - - // Zap these so that reads from uninitialized parts of the CLUT look the same in - // release and debug - memset(clutBufConverted_, 0, 1024 * sizeof(u32)); - memset(clutBufRaw_, 0, 1024 * sizeof(u32)); D3DCAPS9 pCaps; ZeroMemory(&pCaps, sizeof(pCaps)); @@ -102,8 +89,6 @@ TextureCacheDX9::TextureCacheDX9() : cacheSizeEstimate_(0), secondCacheSizeEstim TextureCacheDX9::~TextureCacheDX9() { Clear(true); - FreeAlignedMemory(clutBufConverted_); - FreeAlignedMemory(clutBufRaw_); } static u32 EstimateTexMemoryUsage(const TextureCacheDX9::TexCacheEntry *entry) { @@ -443,104 +428,6 @@ inline void TextureCacheDX9::DetachFramebuffer(TexCacheEntry *entry, u32 address } } -void TextureCacheDX9::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg) { - // Must be in VRAM so | 0x04000000 it is. Also, ignore memory mirrors. - // These checks are mainly to reduce scanning all textures. - const u32 addr = (address | 0x04000000) & 0x3F9FFFFF; - const u32 bpp = framebuffer->format == GE_FORMAT_8888 ? 4 : 2; - const u64 cacheKey = (u64)addr << 32; - // If it has a clut, those are the low 32 bits, so it'll be inside this range. - // Also, if it's a subsample of the buffer, it'll also be within the FBO. - const u64 cacheKeyEnd = cacheKey + ((u64)(framebuffer->fb_stride * framebuffer->height * bpp) << 32); - - // The first mirror starts at 0x04200000 and there are 3. We search all for framebuffers. - const u64 mirrorCacheKey = (u64)0x04200000 << 32; - const u64 mirrorCacheKeyEnd = (u64)0x04800000 << 32; - - switch (msg) { - case NOTIFY_FB_CREATED: - case NOTIFY_FB_UPDATED: - // Ensure it's in the framebuffer cache. 
- if (std::find(fbCache_.begin(), fbCache_.end(), framebuffer) == fbCache_.end()) { - fbCache_.push_back(framebuffer); - } - for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) { - AttachFramebuffer(&it->second, addr, framebuffer); - } - // Let's assume anything in mirrors is fair game to check. - for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) { - AttachFramebuffer(&it->second, addr, framebuffer); - } - break; - - case NOTIFY_FB_DESTROYED: - fbCache_.erase(std::remove(fbCache_.begin(), fbCache_.end(), framebuffer), fbCache_.end()); - for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) { - DetachFramebuffer(&it->second, addr, framebuffer); - } - for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) { - DetachFramebuffer(&it->second, addr, framebuffer); - } - break; - } -} - -void *TextureCacheDX9::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel) { - const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2); - const u32 pitch = rowWidth / 4; - const int bxc = rowWidth / 16; - int byc = (height + 7) / 8; - if (byc == 0) - byc = 1; - - u32 ydest = 0; - if (rowWidth >= 16) { - u32 *ydestp = tmpTexBuf32.data(); - // The most common one, so it gets an optimized implementation. 
- DoUnswizzleTex16(texptr, ydestp, bxc, byc, pitch, rowWidth); - } else if (rowWidth == 8) { - const u32 *src = (const u32 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 8; n++, ydest += 2) { - tmpTexBuf32[ydest + 0] = *src++; - tmpTexBuf32[ydest + 1] = *src++; - src += 2; // skip two u32 - } - } - } else if (rowWidth == 4) { - const u32 *src = (const u32 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 8; n++, ydest++) { - tmpTexBuf32[ydest] = *src++; - src += 3; - } - } - } else if (rowWidth == 2) { - const u16 *src = (const u16 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 4; n++, ydest++) { - u16 n1 = src[0]; - u16 n2 = src[8]; - tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 16); - src += 16; - } - } - } else if (rowWidth == 1) { - const u8 *src = (const u8 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 2; n++, ydest++) { - u8 n1 = src[ 0]; - u8 n2 = src[16]; - u8 n3 = src[32]; - u8 n4 = src[48]; - tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 8) | ((u32)n3 << 16) | ((u32)n4 << 24); - src += 64; - } - } - } - return tmpTexBuf32.data(); -} - void *TextureCacheDX9::ReadIndexedTex(int level, const u8 *texptr, int bytesPerIndex, u32 dstFmt, int bufw) { int w = gstate.getTextureWidth(level); int h = gstate.getTextureHeight(level); @@ -786,62 +673,6 @@ static inline u32 QuickTexHash(u32 addr, int bufw, int w, int h, GETextureFormat return DoQuickTexHash(checkp, sizeInRAM); } -void TextureCacheDX9::LoadClut(u32 clutAddr, u32 loadBytes) { - // Clear the uncached bit, etc. to match framebuffers. 
- clutAddr = clutAddr & 0x3FFFFFFF; - bool foundFramebuffer = false; - - clutRenderAddress_ = 0; - for (size_t i = 0, n = fbCache_.size(); i < n; ++i) { - auto framebuffer = fbCache_[i]; - if ((framebuffer->fb_address | 0x04000000) == clutAddr) { - framebuffer->last_frame_clut = gpuStats.numFlips; - framebuffer->usageFlags |= FB_USAGE_CLUT; - foundFramebuffer = true; - WARN_LOG_REPORT_ONCE(clutrenderdx9, G3D, "Using rendered CLUT for texture decode at %08x (%dx%dx%d)", clutAddr, framebuffer->width, framebuffer->height, framebuffer->colorDepth); - clutRenderAddress_ = framebuffer->fb_address; - } - } - - clutTotalBytes_ = loadBytes; - if (Memory::IsValidAddress(clutAddr)) { - // It's possible for a game to (successfully) access outside valid memory. - u32 bytes = Memory::ValidSize(clutAddr, loadBytes); - if (foundFramebuffer && !g_Config.bDisableSlowFramebufEffects) { - gpu->PerformMemoryDownload(clutAddr, bytes); - } - -#ifdef _M_SSE - int numBlocks = bytes / 16; - if (bytes == loadBytes) { - const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr); - __m128i *dest = (__m128i *)clutBufRaw_; - for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) { - __m128i data1 = _mm_loadu_si128(source); - __m128i data2 = _mm_loadu_si128(source + 1); - _mm_store_si128(dest, data1); - _mm_store_si128(dest + 1, data2); - } - } else { - Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); - if (bytes < loadBytes) { - memset(clutBufRaw_ + bytes, 0x00, loadBytes - bytes); - } - } -#else - Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); - if (bytes < clutTotalBytes_) { - memset(clutBufRaw_ + bytes, 0x00, loadBytes - bytes); - } -#endif - } else { - memset(clutBufRaw_, 0x00, loadBytes); - } - // Reload the clut next time. 
- clutLastFormat_ = 0xFFFFFFFF; - clutMaxBytes_ = std::max(clutMaxBytes_, loadBytes); -} - void TextureCacheDX9::UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) { const u32 clutBaseBytes = clutBase * (clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16)); // Technically, these extra bytes weren't loaded, but hopefully it was loaded earlier. @@ -1202,7 +1033,7 @@ void TextureCacheDX9::SetTexture(bool force) { // Check for FBO - slow! if (entry->framebuffer) { if (match) { - if (hasClut && clutRenderAddress_ != 0) { + if (hasClut && clutRenderAddress_ != 0xFFFFFFFF) { WARN_LOG_REPORT_ONCE(clutAndTexRender, G3D, "Using rendered texture with rendered CLUT: texfmt=%d, clutfmt=%d", gstate.getTextureFormat(), gstate.getClutPaletteFormat()); } @@ -1378,7 +1209,7 @@ void TextureCacheDX9::SetTexture(bool force) { TexCacheEntry entryNew = {0}; cache[cachekey] = entryNew; - if (hasClut && clutRenderAddress_ != 0) { + if (hasClut && clutRenderAddress_ != 0xFFFFFFFF) { WARN_LOG_REPORT_ONCE(clutUseRender, G3D, "Using texture with rendered CLUT: texfmt=%d, clutfmt=%d", gstate.getTextureFormat(), gstate.getClutPaletteFormat()); } @@ -1759,7 +1590,7 @@ void *TextureCacheDX9::DecodeTextureLevel(GETextureFormat format, GEPaletteForma ERROR_LOG_REPORT(G3D, "NO finalbuf! Will crash!"); } - if (w != bufw) { + if (!(g_Config.iTexScalingLevel == 1 && gstate_c.Supports(GPU_SUPPORTS_UNPACK_SUBIMAGE)) && w != bufw) { int pixelSize; switch (dstFmt) { case D3DFMT_A4R4G4B4: @@ -1772,21 +1603,7 @@ void *TextureCacheDX9::DecodeTextureLevel(GETextureFormat format, GEPaletteForma break; } // Need to rearrange the buffer to simulate GL_UNPACK_ROW_LENGTH etc. 
- int inRowBytes = bufw * pixelSize; - int outRowBytes = w * pixelSize; - const u8 *read = (const u8 *)finalBuf; - u8 *write = 0; - if (w > bufw) { - write = (u8 *)tmpTexBufRearrange.data(); - finalBuf = tmpTexBufRearrange.data(); - } else { - write = (u8 *)finalBuf; - } - for (int y = 0; y < h; y++) { - memmove(write, read, outRowBytes); - read += inRowBytes; - write += outRowBytes; - } + finalBuf = RearrangeBuf(finalBuf, bufw * pixelSize, w * pixelSize, h); } return finalBuf; diff --git a/GPU/Directx9/TextureCacheDX9.h b/GPU/Directx9/TextureCacheDX9.h index df99d26930..c3ce5d7bc0 100644 --- a/GPU/Directx9/TextureCacheDX9.h +++ b/GPU/Directx9/TextureCacheDX9.h @@ -35,12 +35,6 @@ class FramebufferManagerDX9; class DepalShaderCacheDX9; class ShaderManagerDX9; -enum FramebufferNotification { - NOTIFY_FB_CREATED, - NOTIFY_FB_UPDATED, - NOTIFY_FB_DESTROYED, -}; - class TextureCacheDX9 : public TextureCacheCommon { public: TextureCacheDX9(); @@ -54,11 +48,6 @@ public: void Invalidate(u32 addr, int size, GPUInvalidationType type); void InvalidateAll(GPUInvalidationType type); void ClearNextFrame(); - void LoadClut(u32 clutAddr, u32 loadBytes); - - // FramebufferManager keeps TextureCache updated about what regions of memory - // are being rendered to. This is barebones so far. - void NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg); void SetFramebufferManager(FramebufferManagerDX9 *fbManager) { framebufferManager_ = fbManager; @@ -84,12 +73,8 @@ public: void ApplyTexture(); private: - // Can't be unordered_map, we use lower_bound ... although for some reason that compiles on MSVC. - typedef std::map TexCache; - void Decimate(); // Run this once per frame to get rid of old textures. 
void DeleteTexture(TexCache::iterator it); - void *UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel); void *ReadIndexedTex(int level, const u8 *texptr, int bytesPerIndex, u32 dstFmt, int bufw); void UpdateSamplingParams(TexCacheEntry &entry, bool force); void LoadTextureLevel(TexCacheEntry &entry, int level, int maxLevel, bool replaceImages, int scaleFactor, u32 dstFmt); @@ -100,8 +85,8 @@ private: const T *GetCurrentClut(); u32 GetCurrentClutHash(); void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple); - bool AttachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0); - void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer); + bool AttachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0) override; + void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer) override; void SetTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer); void ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer); @@ -116,9 +101,7 @@ private: } } - TexCache cache; TexCache secondCache; - std::vector fbCache_; u32 cacheSizeEstimate_; u32 secondCacheSizeEstimate_; @@ -135,22 +118,11 @@ private: bool lowMemoryMode_; TextureScalerDX9 scaler; - SimpleBuf tmpTexBuf32; - SimpleBuf tmpTexBuf16; - - SimpleBuf tmpTexBufRearrange; - - u32 clutLastFormat_; - u32 *clutBufRaw_; - u32 *clutBufConverted_; u32 *clutBuf_; u32 clutHash_; - u32 clutTotalBytes_; - u32 clutMaxBytes_; // True if the clut is just alpha values in the same order (RGBA4444-bit only.) 
bool clutAlphaLinear_; u16 clutAlphaLinearColor_; - u32 clutRenderAddress_; LPDIRECT3DTEXTURE9 lastBoundTexture; float maxAnisotropyLevel; diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp index 02255aa46b..4ce09e2980 100644 --- a/GPU/GLES/TextureCache.cpp +++ b/GPU/GLES/TextureCache.cpp @@ -69,26 +69,15 @@ #define GL_UNPACK_ROW_LENGTH 0x0CF2 #endif +#define INVALID_TEX -1 + // Hack! extern int g_iNumVideos; -TextureCache::TextureCache() : cacheSizeEstimate_(0), secondCacheSizeEstimate_(0), clearCacheNextFrame_(false), lowMemoryMode_(false), clutBuf_(NULL), clutMaxBytes_(0), clutRenderAddress_(0), texelsScaledThisFrame_(0) { +TextureCache::TextureCache() : cacheSizeEstimate_(0), secondCacheSizeEstimate_(0), clearCacheNextFrame_(false), lowMemoryMode_(false), clutBuf_(NULL), texelsScaledThisFrame_(0) { timesInvalidatedAllThisFrame_ = 0; - lastBoundTexture = -1; + lastBoundTexture = INVALID_TEX; decimationCounter_ = TEXCACHE_DECIMATION_INTERVAL; - // This is 5MB of temporary storage. Might be possible to shrink it. - tmpTexBuf32.resize(1024 * 512); // 2MB - tmpTexBuf16.resize(1024 * 512); // 1MB - tmpTexBufRearrange.resize(1024 * 512); // 2MB - - // TODO: Clamp down to 256/1KB? Need to check mipmapShareClut and clamp loadclut. 
- clutBufConverted_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16); // 4KB - clutBufRaw_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16); // 4KB - - // Zap these so that reads from uninitialized parts of the CLUT look the same in - // release and debug - memset(clutBufConverted_, 0, 1024 * sizeof(u32)); - memset(clutBufRaw_, 0, 1024 * sizeof(u32)); glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropyLevel); SetupTextureDecoder(); @@ -98,8 +87,6 @@ TextureCache::TextureCache() : cacheSizeEstimate_(0), secondCacheSizeEstimate_(0 TextureCache::~TextureCache() { Clear(true); - FreeAlignedMemory(clutBufConverted_); - FreeAlignedMemory(clutBufRaw_); } static u32 EstimateTexMemoryUsage(const TextureCache::TexCacheEntry *entry) { @@ -136,7 +123,7 @@ static u32 EstimateTexMemoryUsage(const TextureCache::TexCacheEntry *entry) { void TextureCache::Clear(bool delete_them) { glBindTexture(GL_TEXTURE_2D, 0); - lastBoundTexture = -1; + lastBoundTexture = INVALID_TEX; if (delete_them) { for (TexCache::iterator iter = cache.begin(); iter != cache.end(); ++iter) { DEBUG_LOG(G3D, "Deleting texture %i", iter->second.textureName); @@ -184,7 +171,7 @@ void TextureCache::Decimate() { const u32 had = cacheSizeEstimate_; glBindTexture(GL_TEXTURE_2D, 0); - lastBoundTexture = -1; + lastBoundTexture = INVALID_TEX; int killAge = lowMemoryMode_ ? TEXTURE_KILL_AGE_LOWMEM : TEXTURE_KILL_AGE; for (TexCache::iterator iter = cache.begin(); iter != cache.end(); ) { if (iter->second.lastFrame + killAge < gpuStats.numFlips) { @@ -441,104 +428,6 @@ inline void TextureCache::DetachFramebuffer(TexCacheEntry *entry, u32 address, V } } -void TextureCache::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg) { - // Must be in VRAM so | 0x04000000 it is. Also, ignore memory mirrors. - // These checks are mainly to reduce scanning all textures. 
- const u32 addr = (address | 0x04000000) & 0x3F9FFFFF; - const u32 bpp = framebuffer->format == GE_FORMAT_8888 ? 4 : 2; - const u64 cacheKey = (u64)addr << 32; - // If it has a clut, those are the low 32 bits, so it'll be inside this range. - // Also, if it's a subsample of the buffer, it'll also be within the FBO. - const u64 cacheKeyEnd = cacheKey + ((u64)(framebuffer->fb_stride * framebuffer->height * bpp) << 32); - - // The first mirror starts at 0x04200000 and there are 3. We search all for framebuffers. - const u64 mirrorCacheKey = (u64)0x04200000 << 32; - const u64 mirrorCacheKeyEnd = (u64)0x04800000 << 32; - - switch (msg) { - case NOTIFY_FB_CREATED: - case NOTIFY_FB_UPDATED: - // Ensure it's in the framebuffer cache. - if (std::find(fbCache_.begin(), fbCache_.end(), framebuffer) == fbCache_.end()) { - fbCache_.push_back(framebuffer); - } - for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) { - AttachFramebuffer(&it->second, addr, framebuffer); - } - // Let's assume anything in mirrors is fair game to check. - for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) { - AttachFramebuffer(&it->second, addr, framebuffer); - } - break; - - case NOTIFY_FB_DESTROYED: - fbCache_.erase(std::remove(fbCache_.begin(), fbCache_.end(), framebuffer), fbCache_.end()); - for (auto it = cache.lower_bound(cacheKey), end = cache.upper_bound(cacheKeyEnd); it != end; ++it) { - DetachFramebuffer(&it->second, addr, framebuffer); - } - for (auto it = cache.lower_bound(mirrorCacheKey), end = cache.upper_bound(mirrorCacheKeyEnd); it != end; ++it) { - DetachFramebuffer(&it->second, addr, framebuffer); - } - break; - } -} - -void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel) { - const u32 rowWidth = (bytesPerPixel > 0) ? 
(bufw * bytesPerPixel) : (bufw / 2); - const u32 pitch = rowWidth / 4; - const int bxc = rowWidth / 16; - int byc = (height + 7) / 8; - if (byc == 0) - byc = 1; - - u32 ydest = 0; - if (rowWidth >= 16) { - u32 *ydestp = tmpTexBuf32.data(); - // The most common one, so it gets an optimized implementation. - DoUnswizzleTex16(texptr, ydestp, bxc, byc, pitch, rowWidth); - } else if (rowWidth == 8) { - const u32 *src = (const u32 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 8; n++, ydest += 2) { - tmpTexBuf32[ydest + 0] = *src++; - tmpTexBuf32[ydest + 1] = *src++; - src += 2; // skip two u32 - } - } - } else if (rowWidth == 4) { - const u32 *src = (const u32 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 8; n++, ydest++) { - tmpTexBuf32[ydest] = *src++; - src += 3; - } - } - } else if (rowWidth == 2) { - const u16 *src = (const u16 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 4; n++, ydest++) { - u16 n1 = src[0]; - u16 n2 = src[8]; - tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 16); - src += 16; - } - } - } else if (rowWidth == 1) { - const u8 *src = (const u8 *) texptr; - for (int by = 0; by < byc; by++) { - for (int n = 0; n < 2; n++, ydest++) { - u8 n1 = src[ 0]; - u8 n2 = src[16]; - u8 n3 = src[32]; - u8 n4 = src[48]; - tmpTexBuf32[ydest] = (u32)n1 | ((u32)n2 << 8) | ((u32)n3 << 16) | ((u32)n4 << 24); - src += 64; - } - } - } - return tmpTexBuf32.data(); -} - void *TextureCache::ReadIndexedTex(int level, const u8 *texptr, int bytesPerIndex, GLuint dstFmt, int bufw) { int w = gstate.getTextureWidth(level); int h = gstate.getTextureHeight(level); @@ -780,7 +669,7 @@ static void ConvertColors(void *dstBuf, const void *srcBuf, GLuint dstFmt, int n } void TextureCache::StartFrame() { - lastBoundTexture = -1; + lastBoundTexture = INVALID_TEX; timesInvalidatedAllThisFrame_ = 0; if (texelsScaledThisFrame_) { @@ -810,62 +699,6 @@ static inline u32 QuickTexHash(u32 addr, int bufw, int w, int h, 
GETextureFormat return DoQuickTexHash(checkp, sizeInRAM); } -void TextureCache::LoadClut(u32 clutAddr, u32 loadBytes) { - // Clear the uncached bit, etc. to match framebuffers. - clutAddr = clutAddr & 0x3FFFFFFF; - bool foundFramebuffer = false; - - clutRenderAddress_ = 0; - for (size_t i = 0, n = fbCache_.size(); i < n; ++i) { - auto framebuffer = fbCache_[i]; - if ((framebuffer->fb_address | 0x04000000) == clutAddr) { - framebuffer->last_frame_clut = gpuStats.numFlips; - framebuffer->usageFlags |= FB_USAGE_CLUT; - foundFramebuffer = true; - WARN_LOG_REPORT_ONCE(clutrender, G3D, "Using rendered CLUT for texture decode at %08x (%dx%dx%d)", clutAddr, framebuffer->width, framebuffer->height, framebuffer->colorDepth); - clutRenderAddress_ = framebuffer->fb_address; - } - } - - clutTotalBytes_ = loadBytes; - if (Memory::IsValidAddress(clutAddr)) { - // It's possible for a game to (successfully) access outside valid memory. - u32 bytes = Memory::ValidSize(clutAddr, loadBytes); - if (foundFramebuffer && !g_Config.bDisableSlowFramebufEffects) { - gpu->PerformMemoryDownload(clutAddr, bytes); - } - -#ifdef _M_SSE - int numBlocks = bytes / 16; - if (bytes == loadBytes) { - const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr); - __m128i *dest = (__m128i *)clutBufRaw_; - for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) { - __m128i data1 = _mm_loadu_si128(source); - __m128i data2 = _mm_loadu_si128(source + 1); - _mm_store_si128(dest, data1); - _mm_store_si128(dest + 1, data2); - } - } else { - Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); - if (bytes < loadBytes) { - memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); - } - } -#else - Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); - if (bytes < clutTotalBytes_) { - memset((u8 *)clutBufRaw_ + bytes, 0x00, clutTotalBytes_ - bytes); - } -#endif - } else { - memset(clutBufRaw_, 0x00, loadBytes); - } - // Reload the clut next time. 
- clutLastFormat_ = 0xFFFFFFFF; - clutMaxBytes_ = std::max(clutMaxBytes_, loadBytes); -} - void TextureCache::UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) { const u32 clutBaseBytes = clutFormat == GE_CMODE_32BIT_ABGR8888 ? (clutBase * sizeof(u32)) : (clutBase * sizeof(u16)); // Technically, these extra bytes weren't loaded, but hopefully it was loaded earlier. @@ -1179,7 +1012,7 @@ void TextureCache::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuf framebufferManager_->RebindFramebuffer(); SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight); - lastBoundTexture = -1; + lastBoundTexture = INVALID_TEX; } bool TextureCache::SetOffsetTexture(u32 offset) { @@ -1221,20 +1054,20 @@ void TextureCache::SetTexture(bool force) { #ifdef DEBUG_TEXTURES if (SetDebugTexture()) { // A different texture was bound, let's rebind next time. - lastBoundTexture = -1; + lastBoundTexture = INVALID_TEX; return; } #endif if (force) { - lastBoundTexture = -1; + lastBoundTexture = INVALID_TEX; } u32 texaddr = gstate.getTextureAddress(0); if (!Memory::IsValidAddress(texaddr)) { // Bind a null texture and return. glBindTexture(GL_TEXTURE_2D, 0); - lastBoundTexture = -1; + lastBoundTexture = INVALID_TEX; return; } @@ -1285,7 +1118,7 @@ void TextureCache::SetTexture(bool force) { // Check for FBO - slow! 
if (entry->framebuffer) { if (match) { - if (hasClut && clutRenderAddress_ != 0) { + if (hasClut && clutRenderAddress_ != 0xFFFFFFFF) { WARN_LOG_REPORT_ONCE(clutAndTexRender, G3D, "Using rendered texture with rendered CLUT: texfmt=%d, clutfmt=%d", gstate.getTextureFormat(), gstate.getClutPaletteFormat()); } @@ -1433,7 +1266,7 @@ void TextureCache::SetTexture(bool force) { replaceImages = true; } else { if (entry->textureName == lastBoundTexture) { - lastBoundTexture = -1; + lastBoundTexture = INVALID_TEX; } glDeleteTextures(1, &entry->textureName); } @@ -1459,7 +1292,7 @@ void TextureCache::SetTexture(bool force) { TexCacheEntry entryNew = {0}; cache[cachekey] = entryNew; - if (hasClut && clutRenderAddress_ != 0) { + if (hasClut && clutRenderAddress_ != 0xFFFFFFFF) { WARN_LOG_REPORT_ONCE(clutUseRender, G3D, "Using texture with rendered CLUT: texfmt=%d, clutfmt=%d", gstate.getTextureFormat(), gstate.getClutPaletteFormat()); } @@ -1927,21 +1760,7 @@ void *TextureCache::DecodeTextureLevel(GETextureFormat format, GEPaletteFormat c } // Need to rearrange the buffer to simulate GL_UNPACK_ROW_LENGTH etc. 
- int inRowBytes = bufw * pixelSize; - int outRowBytes = w * pixelSize; - const u8 *read = (const u8 *)finalBuf; - u8 *write = 0; - if (w > bufw) { - write = (u8 *)tmpTexBufRearrange.data(); - finalBuf = tmpTexBufRearrange.data(); - } else { - write = (u8 *)finalBuf; - } - for (int y = 0; y < h; y++) { - memmove(write, read, outRowBytes); - read += inRowBytes; - write += outRowBytes; - } + finalBuf = RearrangeBuf(finalBuf, bufw * pixelSize, w * pixelSize, h); } return finalBuf; diff --git a/GPU/GLES/TextureCache.h b/GPU/GLES/TextureCache.h index e26dcbcd82..903edb184a 100644 --- a/GPU/GLES/TextureCache.h +++ b/GPU/GLES/TextureCache.h @@ -34,12 +34,6 @@ class DepalShaderCache; class ShaderManager; class TransformDrawEngine; -enum FramebufferNotification { - NOTIFY_FB_CREATED, - NOTIFY_FB_UPDATED, - NOTIFY_FB_DESTROYED, -}; - inline bool UseBGRA8888() { // TODO: Other platforms? May depend on vendor which is faster? #ifdef _WIN32 @@ -61,11 +55,6 @@ public: void Invalidate(u32 addr, int size, GPUInvalidationType type); void InvalidateAll(GPUInvalidationType type); void ClearNextFrame(); - void LoadClut(u32 clutAddr, u32 loadBytes); - - // FramebufferManager keeps TextureCache updated about what regions of memory - // are being rendered to. This is barebones so far. - void NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffer, FramebufferNotification msg); void SetFramebufferManager(FramebufferManager *fbManager) { framebufferManager_ = fbManager; @@ -99,12 +88,8 @@ public: void ApplyTexture(); private: - // Can't be unordered_map, we use lower_bound ... although for some reason that compiles on MSVC. - typedef std::map TexCache; - void Decimate(); // Run this once per frame to get rid of old textures. 
void DeleteTexture(TexCache::iterator it); - void *UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel); void *ReadIndexedTex(int level, const u8 *texptr, int bytesPerIndex, GLuint dstFmt, int bufw); void UpdateSamplingParams(TexCacheEntry &entry, bool force); void LoadTextureLevel(TexCacheEntry &entry, int level, bool replaceImages, int scaleFactor, GLenum dstFmt); @@ -115,14 +100,12 @@ private: const T *GetCurrentClut(); u32 GetCurrentClutHash(); void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple); - bool AttachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0); - void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer); + bool AttachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0) override; + void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer) override; void SetTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer); void ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer); - TexCache cache; TexCache secondCache; - std::vector fbCache_; std::vector nameCache_; u32 cacheSizeEstimate_; u32 secondCacheSizeEstimate_; @@ -141,22 +124,11 @@ private: TextureScalerGL scaler; - SimpleBuf tmpTexBuf32; - SimpleBuf tmpTexBuf16; - - SimpleBuf tmpTexBufRearrange; - - u32 clutLastFormat_; - u32 *clutBufRaw_; - u32 *clutBufConverted_; u32 *clutBuf_; u32 clutHash_; - u32 clutTotalBytes_; - u32 clutMaxBytes_; // True if the clut is just alpha values in the same order (RGBA4444-bit only.) bool clutAlphaLinear_; u16 clutAlphaLinearColor_; - u32 clutRenderAddress_; u32 lastBoundTexture; float maxAnisotropyLevel;