mirror of
https://github.com/hrydgard/ppsspp.git
synced 2024-11-27 07:20:49 +00:00
softgpu: Avoid flush texturing from stride.
This generally detects overlap more accurately using a dirty rectangles approach. Also detects render to self much more accurately, including with depth.
This commit is contained in:
parent
dec0ba7b79
commit
c0c3f7284a
@ -22,6 +22,7 @@
|
||||
#include "Common/Thread/ThreadManager.h"
|
||||
#include "Common/TimeUtil.h"
|
||||
#include "Core/System.h"
|
||||
#include "GPU/Common/TextureDecoder.h"
|
||||
#include "GPU/Software/BinManager.h"
|
||||
#include "GPU/Software/Rasterizer.h"
|
||||
#include "GPU/Software/RasterizerRectangle.h"
|
||||
@ -172,23 +173,6 @@ void BinManager::UpdateState() {
|
||||
scissor_.x2 = screenScissorBR.x + 15;
|
||||
scissor_.y2 = screenScissorBR.y + 15;
|
||||
|
||||
// Disallow threads when rendering to target.
|
||||
const uint32_t renderTarget = gstate.getFrameBufAddress() & 0x0FFFFFFF;
|
||||
bool selfRender = (gstate.getTextureAddress(0) & 0x0FFFFFFF) == renderTarget;
|
||||
if (gstate.isMipmapEnabled()) {
|
||||
for (int i = 0; i <= gstate.getTextureMaxLevel(); ++i)
|
||||
selfRender = selfRender || (gstate.getTextureAddress(i) & 0x0FFFFFFF) == renderTarget;
|
||||
}
|
||||
|
||||
int newMaxTasks = selfRender ? 1 : g_threadManager.GetNumLooperThreads();
|
||||
if (newMaxTasks > MAX_POSSIBLE_TASKS)
|
||||
newMaxTasks = MAX_POSSIBLE_TASKS;
|
||||
// We don't want to overlap wrong, so flush any pending.
|
||||
if (maxTasks_ != newMaxTasks) {
|
||||
maxTasks_ = newMaxTasks;
|
||||
Flush("selfrender");
|
||||
}
|
||||
|
||||
// Our bin sizes are based on offset, so if that changes we have to flush.
|
||||
if (queueOffsetX_ != gstate.getOffsetX16() || queueOffsetY_ != gstate.getOffsetY16()) {
|
||||
Flush("offset");
|
||||
@ -200,6 +184,81 @@ void BinManager::UpdateState() {
|
||||
lastFlipstats_ = gpuStats.numFlips;
|
||||
ResetStats();
|
||||
}
|
||||
|
||||
// If we're about to texture from something still pending (i.e. depth), flush.
|
||||
const auto &state = State();
|
||||
const bool hadDepth = pendingWrites_[1].base != 0;
|
||||
if (HasTextureWrite(state))
|
||||
Flush("tex");
|
||||
|
||||
// Okay, now update what's pending.
|
||||
constexpr uint32_t mirrorMask = 0x0FFFFFFF & ~0x00600000;
|
||||
const uint32_t bpp = state.pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
|
||||
pendingWrites_[0].Expand(gstate.getFrameBufAddress() & mirrorMask, bpp, gstate.FrameBufStride(), scissorTL, scissorBR);
|
||||
if (state.pixelID.depthWrite)
|
||||
pendingWrites_[1].Expand(gstate.getDepthBufAddress() & mirrorMask, 2, gstate.DepthBufStride(), scissorTL, scissorBR);
|
||||
|
||||
// Disallow threads when rendering to the target, even offset.
|
||||
bool selfRender = HasTextureWrite(state);
|
||||
int newMaxTasks = selfRender ? 1 : g_threadManager.GetNumLooperThreads();
|
||||
if (newMaxTasks > MAX_POSSIBLE_TASKS)
|
||||
newMaxTasks = MAX_POSSIBLE_TASKS;
|
||||
// We don't want to overlap wrong, so flush any pending.
|
||||
if (maxTasks_ != newMaxTasks) {
|
||||
maxTasks_ = newMaxTasks;
|
||||
Flush("selfrender");
|
||||
}
|
||||
|
||||
// Lastly, we have to check if we're newly writing depth we were texturing before.
|
||||
// This happens in Call of Duty (depth clear after depth texture), for example.
|
||||
if (!hadDepth && state.pixelID.depthWrite) {
|
||||
for (size_t i = 0; i < states_.Size(); ++i) {
|
||||
if (HasTextureWrite(states_.Peek(i)))
|
||||
Flush("selfdepth");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true if any enabled texture level samples from memory that still has
// pending (unflushed) bin writes — i.e. we would be texturing from our own
// recent rendering (color or depth) before it has actually been rasterized.
bool BinManager::HasTextureWrite(const RasterizerState &state) {
	if (!state.enableTextures)
		return false;

	// Bits per texel for the current texture format.
	const int textureBits = textureBitsPerPixel[state.samplerID.texfmt];
	for (int i = 0; i <= state.maxTexLevel; ++i) {
		// Convert the level's buffer stride and width from texels to bytes
		// so they can be compared against the byte-based dirty ranges.
		int byteStride = (state.texbufw[i] * textureBits) / 8;
		int byteWidth = (state.samplerID.cached.sizes[i].w * textureBits) / 8;
		int h = state.samplerID.cached.sizes[i].h;
		if (HasPendingWrite(state.texaddr[i], byteStride, byteWidth, h))
			return true;
	}

	return false;
}
|
||||
|
||||
// Merges a newly-written rectangle (scissored drawing coords tl..br, inclusive,
// on a target described by newBase/bpp/stride) into this dirty range.
// All tracked quantities are in bytes; base == 0 means "no pending writes yet"
// (the range is reset to base = 0 on flush).
inline void BinDirtyRange::Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, DrawingCoords &tl, DrawingCoords &br) {
	const uint32_t w = br.x - tl.x + 1;
	const uint32_t h = br.y - tl.y + 1;

	// Advance base to the first actually-written pixel, not the buffer start.
	newBase += tl.y * stride * bpp + tl.x * bpp;
	if (base == 0) {
		// Empty range: simply adopt the new rectangle.
		base = newBase;
		strideBytes = stride * bpp;
		widthBytes = w * bpp;
		height = h;
		return;
	}

	height = std::max(height, h);
	if (base == newBase && strideBytes == stride * bpp) {
		// Same origin and stride: only the row width can grow.
		widthBytes = std::max(widthBytes, w * bpp);
		return;
	}

	// Different origin: extend height by the whole rows between the two bases,
	// so the merged range (measured from the lower base) still spans both.
	// NOTE(review): when newBase > base this difference is negative and shrinks
	// height — looks intended only for upward expansion; confirm upstream intent.
	// Signed division here truncates toward zero (C++11 rules).
	if (stride != 0)
		height += ((int)base - (int)newBase) / (stride * bpp);
	base = std::min(base, newBase);
	strideBytes = std::max(strideBytes, stride * bpp);
	// Width can't be tracked precisely across differing strides; be conservative
	// and treat every row as fully dirty.
	widthBytes = strideBytes;
}
|
||||
|
||||
void BinManager::UpdateClut(const void *src) {
|
||||
@ -377,6 +436,9 @@ void BinManager::Flush(const char *reason) {
|
||||
queueOffsetX_ = -1;
|
||||
queueOffsetY_ = -1;
|
||||
|
||||
for (auto &pending : pendingWrites_)
|
||||
pending.base = 0;
|
||||
|
||||
if (coreCollectDebugStats) {
|
||||
double et = time_now_d();
|
||||
flushReasonTimes_[reason] += et - st;
|
||||
@ -387,6 +449,39 @@ void BinManager::Flush(const char *reason) {
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true if the byte rectangle (start, stride, w, h) overlaps any pending
// (unflushed) write range — pendingWrites_[0] is color, [1] is depth.
// All parameters are in bytes; h is the row count.
bool BinManager::HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, uint32_t h) {
	// We can only write to VRAM.
	if (!Memory::IsVRAMAddress(start))
		return false;
	// Ignore mirrors for overlap detection.
	start &= 0x0FFFFFFF & ~0x00600000;

	uint32_t size = stride * h;
	for (const auto &range : pendingWrites_) {
		// base == 0 marks an empty range (reset on flush); skip degenerate strides.
		if (range.base == 0 || range.strideBytes == 0)
			continue;
		// Cheap reject: entirely after the range's end or before its start.
		if (start >= range.base + range.height * range.strideBytes || start + size <= range.base)
			continue;

		// Let's simply go through each line. Might be in the stride gap.
		uint32_t row = start;
		for (uint32_t y = 0; y < h; ++y) {
			// Signed offset: rows above range.base produce a negative rangeY.
			int32_t offset = row - range.base;
			int32_t rangeY = offset / (int32_t)range.strideBytes;
			uint32_t rangeX = offset % (int32_t)range.strideBytes;
			if (rangeY >= 0 && (uint32_t)rangeY < range.height) {
				// If this row is either within width, or extends beyond stride, overlap.
				if (rangeX < range.widthBytes || rangeX + w >= range.strideBytes)
					return true;
			}

			row += stride;
		}
	}

	return false;
}
|
||||
|
||||
void BinManager::GetStats(char *buffer, size_t bufsize) {
|
||||
double allTotal = 0.0;
|
||||
double slowestTotalTime = 0.0;
|
||||
|
@ -104,6 +104,14 @@ struct BinQueue {
|
||||
size_--;
|
||||
}
|
||||
|
||||
// Only safe if you're the only one reading.
|
||||
const T &Peek(size_t offset) const {
	// Ring buffer: the logical index wraps once past the backing array's end.
	const size_t pos = head_ + offset;
	return items_[pos >= N ? pos - N : pos];
}
|
||||
|
||||
// Only safe if you're the only one writing.
|
||||
T &PeekPush() {
|
||||
return items_[tail_];
|
||||
@ -159,6 +167,15 @@ struct BinTaskList {
|
||||
}
|
||||
};
|
||||
|
||||
// Tracks a rectangle of pending (unflushed) writes, all measured in bytes.
struct BinDirtyRange {
	uint32_t base;         // Byte address of the first written pixel; 0 = empty range.
	uint32_t strideBytes;  // Bytes per row of the written target.
	uint32_t widthBytes;   // Dirty bytes per row (at most strideBytes.)
	uint32_t height;       // Number of dirty rows.

	// Merges the write rect tl..br (on a target of newBase/bpp/stride) into this range.
	void Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, DrawingCoords &tl, DrawingCoords &br);
};
|
||||
|
||||
class BinManager {
|
||||
public:
|
||||
BinManager();
|
||||
@ -179,6 +196,7 @@ public:
|
||||
|
||||
void Drain();
|
||||
void Flush(const char *reason);
|
||||
bool HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, uint32_t h);
|
||||
|
||||
void GetStats(char *buffer, size_t bufsize);
|
||||
void ResetStats();
|
||||
@ -215,6 +233,8 @@ private:
|
||||
std::atomic<bool> taskStatus_[MAX_POSSIBLE_TASKS];
|
||||
BinWaitable *waitable_ = nullptr;
|
||||
|
||||
BinDirtyRange pendingWrites_[2]{};
|
||||
|
||||
std::unordered_map<const char *, double> flushReasonTimes_;
|
||||
std::unordered_map<const char *, double> lastFlushReasonTimes_;
|
||||
const char *slowestFlushReason_ = nullptr;
|
||||
@ -223,6 +243,7 @@ private:
|
||||
int enqueues_ = 0;
|
||||
int mostThreads_ = 0;
|
||||
|
||||
bool HasTextureWrite(const Rasterizer::RasterizerState &state);
|
||||
BinCoords Scissor(BinCoords range);
|
||||
BinCoords Range(const VertexData &v0, const VertexData &v1, const VertexData &v2);
|
||||
BinCoords Range(const VertexData &v0, const VertexData &v1);
|
||||
|
@ -115,12 +115,13 @@ void ComputeRasterizerState(RasterizerState *state) {
|
||||
}
|
||||
|
||||
state->maxTexLevel = state->samplerID.hasAnyMips ? gstate.getTextureMaxLevel() : 0;
|
||||
state->enableTextures = gstate.isTextureMapEnabled();
|
||||
state->enableTextures = gstate.isTextureMapEnabled() && !state->pixelID.clearMode;
|
||||
|
||||
if (state->enableTextures && !state->pixelID.clearMode) {
|
||||
if (state->enableTextures) {
|
||||
GETextureFormat texfmt = state->samplerID.TexFmt();
|
||||
for (uint8_t i = 0; i <= state->maxTexLevel; i++) {
|
||||
u32 texaddr = gstate.getTextureAddress(i);
|
||||
state->texaddr[i] = texaddr;
|
||||
state->texbufw[i] = GetTextureBufw(i, texaddr, texfmt);
|
||||
if (Memory::IsValidAddress(texaddr))
|
||||
state->texptr[i] = Memory::GetPointerUnchecked(texaddr);
|
||||
|
@ -39,6 +39,7 @@ struct RasterizerState {
|
||||
SingleFunc drawPixel;
|
||||
Sampler::LinearFunc linear;
|
||||
Sampler::NearestFunc nearest;
|
||||
uint32_t texaddr[8]{};
|
||||
int texbufw[8]{};
|
||||
u8 *texptr[8]{};
|
||||
float textureLodSlope;
|
||||
|
@ -548,9 +548,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
|
||||
|
||||
case GE_CMD_SCISSOR1:
|
||||
case GE_CMD_SCISSOR2:
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
drawEngine_->transformUnit.FlushIfOverlap("scissor", gstate.getTextureAddress(i), 4 * gstate.getTextureWidth(i) * gstate.getTextureHeight(i));
|
||||
}
|
||||
break;
|
||||
|
||||
case GE_CMD_MINZ:
|
||||
@ -583,7 +580,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
|
||||
case GE_CMD_TEXADDR5:
|
||||
case GE_CMD_TEXADDR6:
|
||||
case GE_CMD_TEXADDR7:
|
||||
drawEngine_->transformUnit.FlushIfOverlap("texaddr", gstate.getTextureAddress(cmd - GE_CMD_TEXADDR0), 4 * gstate.getTextureWidth(cmd - GE_CMD_TEXADDR0) * gstate.getTextureHeight(cmd - GE_CMD_TEXADDR0));
|
||||
break;
|
||||
|
||||
case GE_CMD_TEXBUFWIDTH0:
|
||||
@ -594,7 +590,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
|
||||
case GE_CMD_TEXBUFWIDTH5:
|
||||
case GE_CMD_TEXBUFWIDTH6:
|
||||
case GE_CMD_TEXBUFWIDTH7:
|
||||
drawEngine_->transformUnit.FlushIfOverlap("texbufw", gstate.getTextureAddress(cmd - GE_CMD_TEXBUFWIDTH0), 4 * gstate.getTextureWidth(cmd - GE_CMD_TEXBUFWIDTH0) * gstate.getTextureHeight(cmd - GE_CMD_TEXBUFWIDTH0));
|
||||
break;
|
||||
|
||||
case GE_CMD_CLUTADDR:
|
||||
@ -607,7 +602,7 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
|
||||
u32 clutTotalBytes = gstate.getClutLoadBytes();
|
||||
|
||||
// Might be copying drawing into the CLUT, so flush.
|
||||
drawEngine_->transformUnit.FlushIfOverlap("loadclut", clutAddr, clutTotalBytes);
|
||||
drawEngine_->transformUnit.FlushIfOverlap("loadclut", clutAddr, clutTotalBytes, clutTotalBytes, 1);
|
||||
|
||||
if (Memory::IsValidAddress(clutAddr)) {
|
||||
u32 validSize = Memory::ValidSize(clutAddr, clutTotalBytes);
|
||||
@ -662,8 +657,9 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
|
||||
const uint32_t dst = dstBasePtr + (dstY * dstStride + dstX) * bpp;
|
||||
const uint32_t dstSize = height * dstStride * bpp;
|
||||
|
||||
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", src, srcSize);
|
||||
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", dst, dstSize);
|
||||
// Need to flush both source and target, so we overwrite properly.
|
||||
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", src, srcStride, width * bpp, height);
|
||||
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", dst, dstStride, width * bpp, height);
|
||||
|
||||
DEBUG_LOG(G3D, "Block transfer: %08x/%x -> %08x/%x, %ix%ix%i (%i,%i)->(%i,%i)", srcBasePtr, srcStride, dstBasePtr, dstStride, width, height, bpp, srcX, srcY, dstX, dstY);
|
||||
|
||||
@ -693,7 +689,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
|
||||
case GE_CMD_TEXSIZE5:
|
||||
case GE_CMD_TEXSIZE6:
|
||||
case GE_CMD_TEXSIZE7:
|
||||
drawEngine_->transformUnit.FlushIfOverlap("texsize", gstate.getTextureAddress(cmd - GE_CMD_TEXSIZE0), 4 * gstate.getTextureWidth(cmd - GE_CMD_TEXSIZE0) * gstate.getTextureHeight(cmd - GE_CMD_TEXSIZE0));
|
||||
break;
|
||||
|
||||
case GE_CMD_ZBUFPTR:
|
||||
|
@ -767,21 +767,8 @@ void TransformUnit::GetStats(char *buffer, size_t bufsize) {
|
||||
binner_->GetStats(buffer, bufsize);
|
||||
}
|
||||
|
||||
void TransformUnit::FlushIfOverlap(const char *reason, uint32_t addr, uint32_t sz) {
|
||||
if (!Memory::IsVRAMAddress(addr))
|
||||
return;
|
||||
addr &= 0x0FFFFFFF;
|
||||
|
||||
uint32_t targetHeight = gstate.getScissorY2() + 1;
|
||||
uint32_t target = gstate.getFrameBufAddress() & 0x0FFFFFFF;
|
||||
uint32_t targetStride = gstate.FrameBufStride() * (gstate.FrameBufFormat() == GE_FORMAT_8888 ? 4 : 2);
|
||||
uint32_t ztarget = gstate.getDepthBufAddress() & 0x0FFFFFFF;
|
||||
uint32_t ztargetStride = gstate.DepthBufStride() * 2;
|
||||
|
||||
// TODO: Skip if the texture is between width and stride?
|
||||
if (addr < target + targetHeight * targetStride && addr + sz >= target)
|
||||
Flush(reason);
|
||||
else if (addr < ztarget + targetHeight * ztargetStride && addr + sz >= ztarget)
|
||||
void TransformUnit::FlushIfOverlap(const char *reason, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h) {
|
||||
if (binner_->HasPendingWrite(addr, stride, w, h))
|
||||
Flush(reason);
|
||||
}
|
||||
|
||||
|
@ -121,7 +121,7 @@ public:
|
||||
bool GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices);
|
||||
|
||||
void Flush(const char *reason);
|
||||
void FlushIfOverlap(const char *reason, uint32_t addr, uint32_t sz);
|
||||
void FlushIfOverlap(const char *reason, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h);
|
||||
void NotifyClutUpdate(const void *src);
|
||||
|
||||
void GetStats(char *buffer, size_t bufsize);
|
||||
|
Loading…
Reference in New Issue
Block a user