mirror of
https://github.com/hrydgard/ppsspp.git
synced 2024-11-23 13:30:02 +00:00
Track flags to reduce unnecessary VRAM zeroing.
If we haven't downloaded to RAM since the last zero, no need to zero again. This is the most common case.
This commit is contained in:
parent
19bf222ea4
commit
70d17d1bc7
@ -131,72 +131,6 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
|
||||
return DrawEngineCommon::NormalizeVertices(outPtr, bufPtr, inPtr, dec, lowerBound, upperBound, vertType);
|
||||
}
|
||||
|
||||
void DrawEngineCommon::ApplyClearToMemory(int x1, int y1, int x2, int y2, u32 clearColor) {
|
||||
u8 *addr = Memory::GetPointer(gstate.getFrameBufAddress());
|
||||
const bool singleByteClear = (clearColor >> 16) == (clearColor & 0xFFFF) && (clearColor >> 24) == (clearColor & 0xFF);
|
||||
const int bpp = gstate.FrameBufFormat() == GE_FORMAT_8888 ? 4 : 2;
|
||||
const int stride = gstate.FrameBufStride();
|
||||
const int width = x2 - x1;
|
||||
|
||||
// Can use memset for simple cases. Often alpha is different and gums up the works.
|
||||
// The check for bpp==4 etc is because we don't properly convert the clear color to the correct
|
||||
// 16-bit format before computing the singleByteClear value. That could be done, but it was easier
|
||||
// to just fall back to the generic case.
|
||||
if (singleByteClear && (bpp == 4 || clearColor == 0)) {
|
||||
const int byteStride = stride * bpp;
|
||||
const int byteWidth = width * bpp;
|
||||
addr += x1 * bpp;
|
||||
for (int y = y1; y < y2; ++y) {
|
||||
memset(addr + y * byteStride, clearColor, byteWidth);
|
||||
}
|
||||
} else {
|
||||
u16 clear16 = 0;
|
||||
switch (gstate.FrameBufFormat()) {
|
||||
case GE_FORMAT_565: ConvertRGBA8888ToRGB565(&clear16, &clearColor, 1); break;
|
||||
case GE_FORMAT_5551: ConvertRGBA8888ToRGBA5551(&clear16, &clearColor, 1); break;
|
||||
case GE_FORMAT_4444: ConvertRGBA8888ToRGBA4444(&clear16, &clearColor, 1); break;
|
||||
}
|
||||
|
||||
// This will most often be true - rarely is the width not aligned.
|
||||
// TODO: We should really use non-temporal stores here to avoid the cache,
|
||||
// as it's unlikely that these bytes will be read.
|
||||
if ((width & 3) == 0 && (x1 & 3) == 0) {
|
||||
u64 val64 = clearColor | ((u64)clearColor << 32);
|
||||
int xstride = 2;
|
||||
if (bpp == 2) {
|
||||
// Spread to all eight bytes.
|
||||
u64 c2 = clear16 | (clear16 << 16);
|
||||
val64 = c2 | (c2 << 32);
|
||||
xstride = 4;
|
||||
}
|
||||
|
||||
u64 *addr64 = (u64 *)addr;
|
||||
const int stride64 = stride / xstride;
|
||||
const int x1_64 = x1 / xstride;
|
||||
const int x2_64 = x2 / xstride;
|
||||
for (int y = y1; y < y2; ++y) {
|
||||
for (int x = x1_64; x < x2_64; ++x) {
|
||||
addr64[y * stride64 + x] = val64;
|
||||
}
|
||||
}
|
||||
} else if (bpp == 4) {
|
||||
u32 *addr32 = (u32 *)addr;
|
||||
for (int y = y1; y < y2; ++y) {
|
||||
for (int x = x1; x < x2; ++x) {
|
||||
addr32[y * stride + x] = clearColor;
|
||||
}
|
||||
}
|
||||
} else if (bpp == 2) {
|
||||
u16 *addr16 = (u16 *)addr;
|
||||
for (int y = y1; y < y2; ++y) {
|
||||
for (int x = x1; x < x2; ++x) {
|
||||
addr16[y * stride + x] = clear16;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This code is HIGHLY unoptimized!
|
||||
//
|
||||
// It does the simplest and safest test possible: If all points of a bbox are outside a single of
|
||||
|
@ -66,7 +66,6 @@ protected:
|
||||
// Preprocessing for spline/bezier
|
||||
u32 NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, int lowerBound, int upperBound, u32 vertType);
|
||||
|
||||
void ApplyClearToMemory(int x1, int y1, int x2, int y2, u32 clearColor);
|
||||
bool ApplyShaderBlending();
|
||||
|
||||
VertexDecoder *GetVertexDecoder(u32 vtype);
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include "gfx_es2/gpu_features.h"
|
||||
|
||||
#include "i18n/i18n.h"
|
||||
#include "Common/ColorConv.h"
|
||||
#include "Common/Common.h"
|
||||
#include "Core/Config.h"
|
||||
#include "Core/CoreParameter.h"
|
||||
@ -564,6 +565,7 @@ void FramebufferManagerCommon::NotifyRenderFramebufferUpdated(VirtualFramebuffer
|
||||
void FramebufferManagerCommon::NotifyRenderFramebufferSwitched(VirtualFramebuffer *prevVfb, VirtualFramebuffer *vfb, bool isClearingDepth) {
|
||||
if (ShouldDownloadFramebuffer(vfb) && !vfb->memoryUpdated) {
|
||||
ReadFramebufferToMemory(vfb, true, 0, 0, vfb->width, vfb->height);
|
||||
vfb->usageFlags = (vfb->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
|
||||
} else {
|
||||
DownloadFramebufferOnSwitch(prevVfb);
|
||||
}
|
||||
@ -776,6 +778,7 @@ void FramebufferManagerCommon::DownloadFramebufferOnSwitch(VirtualFramebuffer *v
|
||||
// Saving each frame would be slow.
|
||||
if (!g_Config.bDisableSlowFramebufEffects) {
|
||||
ReadFramebufferToMemory(vfb, true, 0, 0, vfb->safeWidth, vfb->safeHeight);
|
||||
vfb->usageFlags = (vfb->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
|
||||
vfb->firstFrameSaved = true;
|
||||
vfb->safeWidth = 0;
|
||||
vfb->safeHeight = 0;
|
||||
@ -1021,6 +1024,7 @@ void FramebufferManagerCommon::DecimateFBOs() {
|
||||
if (ShouldDownloadFramebuffer(vfb) && age == 0 && !vfb->memoryUpdated) {
|
||||
bool sync = gl_extensions.IsGLES;
|
||||
ReadFramebufferToMemory(vfb, sync, 0, 0, vfb->width, vfb->height);
|
||||
vfb->usageFlags = (vfb->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
|
||||
}
|
||||
|
||||
// Let's also "decimate" the usageFlags.
|
||||
@ -1239,6 +1243,7 @@ bool FramebufferManagerCommon::NotifyFramebufferCopy(u32 src, u32 dst, int size,
|
||||
WARN_LOG_REPORT_ONCE(btdcpyheight, G3D, "Memcpy fbo download %08x -> %08x skipped, %d+%d is taller than %d", src, dst, srcY, srcH, srcBuffer->bufferHeight);
|
||||
} else if (g_Config.bBlockTransferGPU && !srcBuffer->memoryUpdated) {
|
||||
ReadFramebufferToMemory(srcBuffer, true, 0, srcY, srcBuffer->width, srcH);
|
||||
srcBuffer->usageFlags = (srcBuffer->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
|
||||
}
|
||||
return false;
|
||||
} else {
|
||||
@ -1393,6 +1398,86 @@ VirtualFramebuffer *FramebufferManagerCommon::FindDownloadTempBuffer(VirtualFram
|
||||
return nvfb;
|
||||
}
|
||||
|
||||
void FramebufferManagerCommon::ApplyClearToMemory(int x1, int y1, int x2, int y2, u32 clearColor) {
|
||||
if (currentRenderVfb_) {
|
||||
if ((currentRenderVfb_->usageFlags & FB_USAGE_DOWNLOAD_CLEAR) != 0) {
|
||||
// Already zeroed in memory.
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
u8 *addr = Memory::GetPointer(gstate.getFrameBufAddress());
|
||||
const bool singleByteClear = (clearColor >> 16) == (clearColor & 0xFFFF) && (clearColor >> 24) == (clearColor & 0xFF);
|
||||
const int bpp = gstate.FrameBufFormat() == GE_FORMAT_8888 ? 4 : 2;
|
||||
const int stride = gstate.FrameBufStride();
|
||||
const int width = x2 - x1;
|
||||
|
||||
// Can use memset for simple cases. Often alpha is different and gums up the works.
|
||||
// The check for bpp==4 etc is because we don't properly convert the clear color to the correct
|
||||
// 16-bit format before computing the singleByteClear value. That could be done, but it was easier
|
||||
// to just fall back to the generic case.
|
||||
if (singleByteClear && (bpp == 4 || clearColor == 0)) {
|
||||
const int byteStride = stride * bpp;
|
||||
const int byteWidth = width * bpp;
|
||||
addr += x1 * bpp;
|
||||
for (int y = y1; y < y2; ++y) {
|
||||
memset(addr + y * byteStride, clearColor, byteWidth);
|
||||
}
|
||||
} else {
|
||||
u16 clear16 = 0;
|
||||
switch (gstate.FrameBufFormat()) {
|
||||
case GE_FORMAT_565: ConvertRGBA8888ToRGB565(&clear16, &clearColor, 1); break;
|
||||
case GE_FORMAT_5551: ConvertRGBA8888ToRGBA5551(&clear16, &clearColor, 1); break;
|
||||
case GE_FORMAT_4444: ConvertRGBA8888ToRGBA4444(&clear16, &clearColor, 1); break;
|
||||
}
|
||||
|
||||
// This will most often be true - rarely is the width not aligned.
|
||||
// TODO: We should really use non-temporal stores here to avoid the cache,
|
||||
// as it's unlikely that these bytes will be read.
|
||||
if ((width & 3) == 0 && (x1 & 3) == 0) {
|
||||
u64 val64 = clearColor | ((u64)clearColor << 32);
|
||||
int xstride = 2;
|
||||
if (bpp == 2) {
|
||||
// Spread to all eight bytes.
|
||||
u64 c2 = clear16 | (clear16 << 16);
|
||||
val64 = c2 | (c2 << 32);
|
||||
xstride = 4;
|
||||
}
|
||||
|
||||
u64 *addr64 = (u64 *)addr;
|
||||
const int stride64 = stride / xstride;
|
||||
const int x1_64 = x1 / xstride;
|
||||
const int x2_64 = x2 / xstride;
|
||||
for (int y = y1; y < y2; ++y) {
|
||||
for (int x = x1_64; x < x2_64; ++x) {
|
||||
addr64[y * stride64 + x] = val64;
|
||||
}
|
||||
}
|
||||
} else if (bpp == 4) {
|
||||
u32 *addr32 = (u32 *)addr;
|
||||
for (int y = y1; y < y2; ++y) {
|
||||
for (int x = x1; x < x2; ++x) {
|
||||
addr32[y * stride + x] = clearColor;
|
||||
}
|
||||
}
|
||||
} else if (bpp == 2) {
|
||||
u16 *addr16 = (u16 *)addr;
|
||||
for (int y = y1; y < y2; ++y) {
|
||||
for (int x = x1; x < x2; ++x) {
|
||||
addr16[y * stride + x] = clear16;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (currentRenderVfb_) {
|
||||
// The current content is in memory now, so update the flag.
|
||||
if (x1 == 0 && y1 == 0 && x2 >= currentRenderVfb_->width && y2 >= currentRenderVfb_->height) {
|
||||
currentRenderVfb_->usageFlags |= FB_USAGE_DOWNLOAD_CLEAR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void FramebufferManagerCommon::OptimizeDownloadRange(VirtualFramebuffer * vfb, int & x, int & y, int & w, int & h) {
|
||||
if (gameUsesSequentialCopies_) {
|
||||
// Ignore the x/y/etc., read the entire thing.
|
||||
@ -1404,6 +1489,7 @@ void FramebufferManagerCommon::OptimizeDownloadRange(VirtualFramebuffer * vfb, i
|
||||
if (x == 0 && y == 0 && w == vfb->width && h == vfb->height) {
|
||||
// Mark it as fully downloaded until next render to it.
|
||||
vfb->memoryUpdated = true;
|
||||
vfb->usageFlags |= FB_USAGE_DOWNLOAD;
|
||||
} else {
|
||||
// Let's try to set the flag eventually, if the game copies a lot.
|
||||
// Some games copy subranges very frequently.
|
||||
@ -1485,6 +1571,7 @@ bool FramebufferManagerCommon::NotifyBlockTransferBefore(u32 dstBasePtr, int dst
|
||||
if (tooTall)
|
||||
WARN_LOG_ONCE(btdheight, G3D, "Block transfer download %08x -> %08x dangerous, %d+%d is taller than %d", srcBasePtr, dstBasePtr, srcY, srcHeight, srcBuffer->bufferHeight);
|
||||
ReadFramebufferToMemory(srcBuffer, true, static_cast<int>(srcX * srcXFactor), srcY, static_cast<int>(srcWidth * srcXFactor), srcHeight);
|
||||
srcBuffer->usageFlags = (srcBuffer->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
|
||||
}
|
||||
}
|
||||
return false; // Let the bit copy happen
|
||||
|
@ -31,6 +31,8 @@ enum {
|
||||
FB_USAGE_RENDERTARGET = 2,
|
||||
FB_USAGE_TEXTURE = 4,
|
||||
FB_USAGE_CLUT = 8,
|
||||
FB_USAGE_DOWNLOAD = 16,
|
||||
FB_USAGE_DOWNLOAD_CLEAR = 32,
|
||||
};
|
||||
|
||||
enum {
|
||||
@ -191,6 +193,7 @@ public:
|
||||
bool NotifyFramebufferCopy(u32 src, u32 dest, int size, bool isMemset, u32 skipDrawReason);
|
||||
void NotifyVideoUpload(u32 addr, int size, int width, GEBufferFormat fmt);
|
||||
void UpdateFromMemory(u32 addr, int size, bool safe);
|
||||
void ApplyClearToMemory(int x1, int y1, int x2, int y2, u32 clearColor);
|
||||
virtual bool NotifyStencilUpload(u32 addr, int size, bool skipZero = false) = 0;
|
||||
// Returns true if it's sure this is a direct FBO->FBO transfer and it has already handled it.
|
||||
// In that case we hardly need to actually copy the bytes in VRAM, they will be wrong anyway (unless
|
||||
|
@ -935,7 +935,7 @@ rotateVBO:
|
||||
if (g_Config.bBlockTransferGPU && (gstate_c.featureFlags & GPU_USE_CLEAR_RAM_HACK) && gstate.isClearModeColorMask() && (gstate.isClearModeAlphaMask() || gstate.FrameBufFormat() == GE_FORMAT_565)) {
|
||||
int scissorX1 = gstate.getScissorX1();
|
||||
int scissorY1 = gstate.getScissorY1();
|
||||
ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
|
||||
framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -874,7 +874,7 @@ rotateVBO:
|
||||
if (g_Config.bBlockTransferGPU && (gstate_c.featureFlags & GPU_USE_CLEAR_RAM_HACK) && gstate.isClearModeColorMask() && (gstate.isClearModeAlphaMask() || gstate.FrameBufFormat() == GE_FORMAT_565)) {
|
||||
int scissorX1 = gstate.getScissorX1();
|
||||
int scissorY1 = gstate.getScissorY1();
|
||||
ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
|
||||
framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -976,7 +976,7 @@ rotateVBO:
|
||||
framebufferManager_->SetSafeSize(scissorX2, scissorY2);
|
||||
|
||||
if (g_Config.bBlockTransferGPU && (gstate_c.featureFlags & GPU_USE_CLEAR_RAM_HACK) && colorMask && (alphaMask || gstate.FrameBufFormat() == GE_FORMAT_565)) {
|
||||
ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
|
||||
framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -892,7 +892,7 @@ void DrawEngineVulkan::DoFlush(VkCommandBuffer cmd) {
|
||||
framebufferManager_->SetSafeSize(scissorX2, scissorY2);
|
||||
|
||||
if (g_Config.bBlockTransferGPU && (gstate_c.featureFlags & GPU_USE_CLEAR_RAM_HACK) && gstate.isClearModeColorMask() && (gstate.isClearModeAlphaMask() || gstate.FrameBufFormat() == GE_FORMAT_565)) {
|
||||
ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, result.color);
|
||||
framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, result.color);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user