softgpu: Avoid flush when texturing from stride.

This generally detects overlap more accurately using a dirty-rectangles
approach. It also detects render-to-self much more accurately, including
with depth.
Unknown W. Brackets 2022-01-20 18:17:23 -08:00
parent dec0ba7b79
commit c0c3f7284a
7 changed files with 144 additions and 44 deletions
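As a rough illustration of the dirty-rectangles idea (not part of the commit): each pending render-target write is tracked as a base/stride/width/height rectangle in bytes, and a texture region only forces a flush if one of its rows actually lands inside the written width, or is wide enough to spill past the stride into the next row. The DirtyRect and Overlaps names below are made up for this sketch; the real code is BinDirtyRange and BinManager::HasPendingWrite() in the diff further down.

#include <cstdint>

// Illustrative sketch only -- not part of this commit. Mirrors the row-by-row
// overlap test that BinManager::HasPendingWrite() performs in the diff below.
struct DirtyRect {
	uint32_t base = 0;         // address of the first written pixel row
	uint32_t strideBytes = 0;  // distance between rows in memory
	uint32_t widthBytes = 0;   // bytes actually written per row
	uint32_t height = 0;       // number of rows written
};

static bool Overlaps(const DirtyRect &r, uint32_t start, uint32_t stride, uint32_t w, uint32_t h) {
	if (r.base == 0 || r.strideBytes == 0)
		return false;
	// Quick reject: the two regions don't even share an address range.
	if (start >= r.base + r.height * r.strideBytes || start + stride * h <= r.base)
		return false;
	// Walk the texture's rows; a row hiding in the stride gap is harmless.
	uint32_t row = start;
	for (uint32_t y = 0; y < h; ++y) {
		int32_t offset = (int32_t)(row - r.base);
		int32_t rangeY = offset / (int32_t)r.strideBytes;
		uint32_t rangeX = (uint32_t)(offset % (int32_t)r.strideBytes);
		if (rangeY >= 0 && (uint32_t)rangeY < r.height) {
			// Inside the written width, or wide enough to cross into the next row.
			if (rangeX < r.widthBytes || rangeX + w >= r.strideBytes)
				return true;
		}
		row += stride;
	}
	return false;
}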

View File

@@ -22,6 +22,7 @@
#include "Common/Thread/ThreadManager.h"
#include "Common/TimeUtil.h"
#include "Core/System.h"
#include "GPU/Common/TextureDecoder.h"
#include "GPU/Software/BinManager.h"
#include "GPU/Software/Rasterizer.h"
#include "GPU/Software/RasterizerRectangle.h"
@@ -172,23 +173,6 @@ void BinManager::UpdateState() {
scissor_.x2 = screenScissorBR.x + 15;
scissor_.y2 = screenScissorBR.y + 15;
// Disallow threads when rendering to target.
const uint32_t renderTarget = gstate.getFrameBufAddress() & 0x0FFFFFFF;
bool selfRender = (gstate.getTextureAddress(0) & 0x0FFFFFFF) == renderTarget;
if (gstate.isMipmapEnabled()) {
for (int i = 0; i <= gstate.getTextureMaxLevel(); ++i)
selfRender = selfRender || (gstate.getTextureAddress(i) & 0x0FFFFFFF) == renderTarget;
}
int newMaxTasks = selfRender ? 1 : g_threadManager.GetNumLooperThreads();
if (newMaxTasks > MAX_POSSIBLE_TASKS)
newMaxTasks = MAX_POSSIBLE_TASKS;
// We don't want to overlap wrong, so flush any pending.
if (maxTasks_ != newMaxTasks) {
maxTasks_ = newMaxTasks;
Flush("selfrender");
}
// Our bin sizes are based on offset, so if that changes we have to flush.
if (queueOffsetX_ != gstate.getOffsetX16() || queueOffsetY_ != gstate.getOffsetY16()) {
Flush("offset");
@@ -200,6 +184,81 @@ void BinManager::UpdateState() {
lastFlipstats_ = gpuStats.numFlips;
ResetStats();
}
// If we're about to texture from something still pending (i.e. depth), flush.
const auto &state = State();
const bool hadDepth = pendingWrites_[1].base != 0;
if (HasTextureWrite(state))
Flush("tex");
// Okay, now update what's pending.
constexpr uint32_t mirrorMask = 0x0FFFFFFF & ~0x00600000;
const uint32_t bpp = state.pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
pendingWrites_[0].Expand(gstate.getFrameBufAddress() & mirrorMask, bpp, gstate.FrameBufStride(), scissorTL, scissorBR);
if (state.pixelID.depthWrite)
pendingWrites_[1].Expand(gstate.getDepthBufAddress() & mirrorMask, 2, gstate.DepthBufStride(), scissorTL, scissorBR);
// Disallow threads when rendering to the target, even offset.
bool selfRender = HasTextureWrite(state);
int newMaxTasks = selfRender ? 1 : g_threadManager.GetNumLooperThreads();
if (newMaxTasks > MAX_POSSIBLE_TASKS)
newMaxTasks = MAX_POSSIBLE_TASKS;
// We don't want to overlap wrong, so flush any pending.
if (maxTasks_ != newMaxTasks) {
maxTasks_ = newMaxTasks;
Flush("selfrender");
}
// Lastly, we have to check if we're newly writing depth we were texturing before.
// This happens in Call of Duty (depth clear after depth texture), for example.
if (!hadDepth && state.pixelID.depthWrite) {
for (size_t i = 0; i < states_.Size(); ++i) {
if (HasTextureWrite(states_.Peek(i)))
Flush("selfdepth");
}
}
}
bool BinManager::HasTextureWrite(const RasterizerState &state) {
if (!state.enableTextures)
return false;
const int textureBits = textureBitsPerPixel[state.samplerID.texfmt];
for (int i = 0; i <= state.maxTexLevel; ++i) {
int byteStride = (state.texbufw[i] * textureBits) / 8;
int byteWidth = (state.samplerID.cached.sizes[i].w * textureBits) / 8;
int h = state.samplerID.cached.sizes[i].h;
if (HasPendingWrite(state.texaddr[i], byteStride, byteWidth, h))
return true;
}
return false;
}
inline void BinDirtyRange::Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, DrawingCoords &tl, DrawingCoords &br) {
const uint32_t w = br.x - tl.x + 1;
const uint32_t h = br.y - tl.y + 1;
newBase += tl.y * stride * bpp + tl.x * bpp;
if (base == 0) {
base = newBase;
strideBytes = stride * bpp;
widthBytes = w * bpp;
height = h;
return;
}
height = std::max(height, h);
if (base == newBase && strideBytes == stride * bpp) {
widthBytes = std::max(widthBytes, w * bpp);
return;
}
if (stride != 0)
height += ((int)base - (int)newBase) / (stride * bpp);
base = std::min(base, newBase);
strideBytes = std::max(strideBytes, stride * bpp);
widthBytes = strideBytes;
}
void BinManager::UpdateClut(const void *src) {
@@ -377,6 +436,9 @@ void BinManager::Flush(const char *reason) {
queueOffsetX_ = -1;
queueOffsetY_ = -1;
for (auto &pending : pendingWrites_)
pending.base = 0;
if (coreCollectDebugStats) {
double et = time_now_d();
flushReasonTimes_[reason] += et - st;
@@ -387,6 +449,39 @@
}
}
bool BinManager::HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, uint32_t h) {
// We can only write to VRAM.
if (!Memory::IsVRAMAddress(start))
return false;
// Ignore mirrors for overlap detection.
start &= 0x0FFFFFFF & ~0x00600000;
uint32_t size = stride * h;
for (const auto &range : pendingWrites_) {
if (range.base == 0 || range.strideBytes == 0)
continue;
if (start >= range.base + range.height * range.strideBytes || start + size <= range.base)
continue;
// Let's simply go through each line. Might be in the stride gap.
uint32_t row = start;
for (uint32_t y = 0; y < h; ++y) {
int32_t offset = row - range.base;
int32_t rangeY = offset / (int32_t)range.strideBytes;
uint32_t rangeX = offset % (int32_t)range.strideBytes;
if (rangeY >= 0 && (uint32_t)rangeY < range.height) {
// If this row is either within width, or extends beyond stride, overlap.
if (rangeX < range.widthBytes || rangeX + w >= range.strideBytes)
return true;
}
row += stride;
}
}
return false;
}
void BinManager::GetStats(char *buffer, size_t bufsize) {
double allTotal = 0.0;
double slowestTotalTime = 0.0;

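To make the stride-gap case from the commit title concrete (illustrative numbers only, reusing the DirtyRect/Overlaps sketch above, not PPSSPP code): a 16-bit framebuffer with a 512-pixel stride but a 480-pixel drawing width leaves a 64-byte gap at the end of every row. A texture packed entirely into that gap no longer triggers a flush, while anything that touches the written area or crosses a row boundary still does.

#include <cassert>

int main() {
	// Pending framebuffer write: 480x272 pixels at 16bpp, stride 512 pixels.
	DirtyRect fb;
	fb.base = 0x04000000;
	fb.strideBytes = 512 * 2;  // 1024 bytes per row
	fb.widthBytes = 480 * 2;   //  960 bytes written per row
	fb.height = 272;

	// Texture rows of 32 bytes living entirely in the 64-byte stride gap: no flush.
	assert(!Overlaps(fb, 0x04000000 + 960, 1024, 32, 272));

	// Widened so each row spills past the stride into the next framebuffer row: flush.
	assert(Overlaps(fb, 0x04000000 + 960, 1024, 96, 272));

	// Texturing straight from the rendered area obviously overlaps: flush.
	assert(Overlaps(fb, 0x04000000, 1024, 960, 272));
	return 0;
}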
View File

@@ -104,6 +104,14 @@ struct BinQueue {
size_--;
}
// Only safe if you're the only one reading.
const T &Peek(size_t offset) const {
size_t i = head_ + offset;
if (i >= N)
i -= N;
return items_[i];
}
// Only safe if you're the only one writing.
T &PeekPush() {
return items_[tail_];
@@ -159,6 +167,15 @@ struct BinTaskList {
}
};
struct BinDirtyRange {
uint32_t base;
uint32_t strideBytes;
uint32_t widthBytes;
uint32_t height;
void Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, DrawingCoords &tl, DrawingCoords &br);
};
class BinManager {
public:
BinManager();
@@ -179,6 +196,7 @@ public:
void Drain();
void Flush(const char *reason);
bool HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, uint32_t h);
void GetStats(char *buffer, size_t bufsize);
void ResetStats();
@@ -215,6 +233,8 @@ private:
std::atomic<bool> taskStatus_[MAX_POSSIBLE_TASKS];
BinWaitable *waitable_ = nullptr;
BinDirtyRange pendingWrites_[2]{};
std::unordered_map<const char *, double> flushReasonTimes_;
std::unordered_map<const char *, double> lastFlushReasonTimes_;
const char *slowestFlushReason_ = nullptr;
@@ -223,6 +243,7 @@ private:
int enqueues_ = 0;
int mostThreads_ = 0;
bool HasTextureWrite(const Rasterizer::RasterizerState &state);
BinCoords Scissor(BinCoords range);
BinCoords Range(const VertexData &v0, const VertexData &v1, const VertexData &v2);
BinCoords Range(const VertexData &v0, const VertexData &v1);

View File

@@ -115,12 +115,13 @@ void ComputeRasterizerState(RasterizerState *state) {
}
state->maxTexLevel = state->samplerID.hasAnyMips ? gstate.getTextureMaxLevel() : 0;
state->enableTextures = gstate.isTextureMapEnabled();
state->enableTextures = gstate.isTextureMapEnabled() && !state->pixelID.clearMode;
if (state->enableTextures && !state->pixelID.clearMode) {
if (state->enableTextures) {
GETextureFormat texfmt = state->samplerID.TexFmt();
for (uint8_t i = 0; i <= state->maxTexLevel; i++) {
u32 texaddr = gstate.getTextureAddress(i);
state->texaddr[i] = texaddr;
state->texbufw[i] = GetTextureBufw(i, texaddr, texfmt);
if (Memory::IsValidAddress(texaddr))
state->texptr[i] = Memory::GetPointerUnchecked(texaddr);

View File

@@ -39,6 +39,7 @@ struct RasterizerState {
SingleFunc drawPixel;
Sampler::LinearFunc linear;
Sampler::NearestFunc nearest;
uint32_t texaddr[8]{};
int texbufw[8]{};
u8 *texptr[8]{};
float textureLodSlope;

View File

@@ -548,9 +548,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
case GE_CMD_SCISSOR1:
case GE_CMD_SCISSOR2:
for (int i = 0; i < 8; ++i) {
drawEngine_->transformUnit.FlushIfOverlap("scissor", gstate.getTextureAddress(i), 4 * gstate.getTextureWidth(i) * gstate.getTextureHeight(i));
}
break;
case GE_CMD_MINZ:
@@ -583,7 +580,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
case GE_CMD_TEXADDR5:
case GE_CMD_TEXADDR6:
case GE_CMD_TEXADDR7:
drawEngine_->transformUnit.FlushIfOverlap("texaddr", gstate.getTextureAddress(cmd - GE_CMD_TEXADDR0), 4 * gstate.getTextureWidth(cmd - GE_CMD_TEXADDR0) * gstate.getTextureHeight(cmd - GE_CMD_TEXADDR0));
break;
case GE_CMD_TEXBUFWIDTH0:
@@ -594,7 +590,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
case GE_CMD_TEXBUFWIDTH5:
case GE_CMD_TEXBUFWIDTH6:
case GE_CMD_TEXBUFWIDTH7:
drawEngine_->transformUnit.FlushIfOverlap("texbufw", gstate.getTextureAddress(cmd - GE_CMD_TEXBUFWIDTH0), 4 * gstate.getTextureWidth(cmd - GE_CMD_TEXBUFWIDTH0) * gstate.getTextureHeight(cmd - GE_CMD_TEXBUFWIDTH0));
break;
case GE_CMD_CLUTADDR:
@@ -607,7 +602,7 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
u32 clutTotalBytes = gstate.getClutLoadBytes();
// Might be copying drawing into the CLUT, so flush.
drawEngine_->transformUnit.FlushIfOverlap("loadclut", clutAddr, clutTotalBytes);
drawEngine_->transformUnit.FlushIfOverlap("loadclut", clutAddr, clutTotalBytes, clutTotalBytes, 1);
if (Memory::IsValidAddress(clutAddr)) {
u32 validSize = Memory::ValidSize(clutAddr, clutTotalBytes);
@@ -662,8 +657,9 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
const uint32_t dst = dstBasePtr + (dstY * dstStride + dstX) * bpp;
const uint32_t dstSize = height * dstStride * bpp;
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", src, srcSize);
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", dst, dstSize);
// Need to flush both source and target, so we overwrite properly.
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", src, srcStride, width * bpp, height);
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", dst, dstStride, width * bpp, height);
DEBUG_LOG(G3D, "Block transfer: %08x/%x -> %08x/%x, %ix%ix%i (%i,%i)->(%i,%i)", srcBasePtr, srcStride, dstBasePtr, dstStride, width, height, bpp, srcX, srcY, dstX, dstY);
@@ -693,7 +689,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
case GE_CMD_TEXSIZE5:
case GE_CMD_TEXSIZE6:
case GE_CMD_TEXSIZE7:
drawEngine_->transformUnit.FlushIfOverlap("texsize", gstate.getTextureAddress(cmd - GE_CMD_TEXSIZE0), 4 * gstate.getTextureWidth(cmd - GE_CMD_TEXSIZE0) * gstate.getTextureHeight(cmd - GE_CMD_TEXSIZE0));
break;
case GE_CMD_ZBUFPTR:

View File

@@ -767,21 +767,8 @@ void TransformUnit::GetStats(char *buffer, size_t bufsize) {
binner_->GetStats(buffer, bufsize);
}
void TransformUnit::FlushIfOverlap(const char *reason, uint32_t addr, uint32_t sz) {
if (!Memory::IsVRAMAddress(addr))
return;
addr &= 0x0FFFFFFF;
uint32_t targetHeight = gstate.getScissorY2() + 1;
uint32_t target = gstate.getFrameBufAddress() & 0x0FFFFFFF;
uint32_t targetStride = gstate.FrameBufStride() * (gstate.FrameBufFormat() == GE_FORMAT_8888 ? 4 : 2);
uint32_t ztarget = gstate.getDepthBufAddress() & 0x0FFFFFFF;
uint32_t ztargetStride = gstate.DepthBufStride() * 2;
// TODO: Skip if the texture is between width and stride?
if (addr < target + targetHeight * targetStride && addr + sz >= target)
Flush(reason);
else if (addr < ztarget + targetHeight * ztargetStride && addr + sz >= ztarget)
void TransformUnit::FlushIfOverlap(const char *reason, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h) {
if (binner_->HasPendingWrite(addr, stride, w, h))
Flush(reason);
}

View File

@@ -121,7 +121,7 @@ public:
bool GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices);
void Flush(const char *reason);
void FlushIfOverlap(const char *reason, uint32_t addr, uint32_t sz);
void FlushIfOverlap(const char *reason, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h);
void NotifyClutUpdate(const void *src);
void GetStats(char *buffer, size_t bufsize);