softgpu: Avoid flush when texturing from stride.

This generally detects overlap more accurately using a dirty-rectangles
approach. It also detects render-to-self much more accurately, including
with depth.
Unknown W. Brackets 2022-01-20 18:17:23 -08:00
parent dec0ba7b79
commit c0c3f7284a
7 changed files with 144 additions and 44 deletions
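As a rough illustration of the dirty-rectangles idea (not part of the commit): each pending render-target write is tracked as a base/stride/width/height rectangle in bytes, and a texture region only forces a flush if one of its rows actually lands inside the written width, or is wide enough to spill past the stride into the next row. The DirtyRect and Overlaps names below are made up for this sketch; the real code is BinDirtyRange and BinManager::HasPendingWrite() in the diff further down.

#include <cstdint>

// Illustrative sketch only -- not part of this commit. Mirrors the row-by-row
// overlap test that BinManager::HasPendingWrite() performs in the diff below.
struct DirtyRect {
	uint32_t base = 0;         // address of the first written pixel row
	uint32_t strideBytes = 0;  // distance between rows in memory
	uint32_t widthBytes = 0;   // bytes actually written per row
	uint32_t height = 0;       // number of rows written
};

static bool Overlaps(const DirtyRect &r, uint32_t start, uint32_t stride, uint32_t w, uint32_t h) {
	if (r.base == 0 || r.strideBytes == 0)
		return false;
	// Quick reject: the two regions don't even share an address range.
	if (start >= r.base + r.height * r.strideBytes || start + stride * h <= r.base)
		return false;
	// Walk the texture's rows; a row hiding in the stride gap is harmless.
	uint32_t row = start;
	for (uint32_t y = 0; y < h; ++y) {
		int32_t offset = (int32_t)(row - r.base);
		int32_t rangeY = offset / (int32_t)r.strideBytes;
		uint32_t rangeX = (uint32_t)(offset % (int32_t)r.strideBytes);
		if (rangeY >= 0 && (uint32_t)rangeY < r.height) {
			// Inside the written width, or wide enough to cross into the next row.
			if (rangeX < r.widthBytes || rangeX + w >= r.strideBytes)
				return true;
		}
		row += stride;
	}
	return false;
}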

View File

@@ -22,6 +22,7 @@
#include "Common/Thread/ThreadManager.h"
#include "Common/TimeUtil.h"
#include "Core/System.h"
#include "GPU/Common/TextureDecoder.h"
#include "GPU/Software/BinManager.h"
#include "GPU/Software/Rasterizer.h"
#include "GPU/Software/RasterizerRectangle.h"
@@ -172,23 +173,6 @@ void BinManager::UpdateState() {
scissor_.x2 = screenScissorBR.x + 15;
scissor_.y2 = screenScissorBR.y + 15;
// Disallow threads when rendering to target.
const uint32_t renderTarget = gstate.getFrameBufAddress() & 0x0FFFFFFF;
bool selfRender = (gstate.getTextureAddress(0) & 0x0FFFFFFF) == renderTarget;
if (gstate.isMipmapEnabled()) {
for (int i = 0; i <= gstate.getTextureMaxLevel(); ++i)
selfRender = selfRender || (gstate.getTextureAddress(i) & 0x0FFFFFFF) == renderTarget;
}
int newMaxTasks = selfRender ? 1 : g_threadManager.GetNumLooperThreads();
if (newMaxTasks > MAX_POSSIBLE_TASKS)
newMaxTasks = MAX_POSSIBLE_TASKS;
// We don't want to overlap wrong, so flush any pending.
if (maxTasks_ != newMaxTasks) {
maxTasks_ = newMaxTasks;
Flush("selfrender");
}
// Our bin sizes are based on offset, so if that changes we have to flush.
if (queueOffsetX_ != gstate.getOffsetX16() || queueOffsetY_ != gstate.getOffsetY16()) {
Flush("offset");
@@ -200,6 +184,81 @@ void BinManager::UpdateState() {
lastFlipstats_ = gpuStats.numFlips;
ResetStats();
}
// If we're about to texture from something still pending (i.e. depth), flush.
const auto &state = State();
const bool hadDepth = pendingWrites_[1].base != 0;
if (HasTextureWrite(state))
Flush("tex");
// Okay, now update what's pending.
constexpr uint32_t mirrorMask = 0x0FFFFFFF & ~0x00600000;
const uint32_t bpp = state.pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
pendingWrites_[0].Expand(gstate.getFrameBufAddress() & mirrorMask, bpp, gstate.FrameBufStride(), scissorTL, scissorBR);
if (state.pixelID.depthWrite)
pendingWrites_[1].Expand(gstate.getDepthBufAddress() & mirrorMask, 2, gstate.DepthBufStride(), scissorTL, scissorBR);
// Disallow threads when rendering to the target, even offset.
bool selfRender = HasTextureWrite(state);
int newMaxTasks = selfRender ? 1 : g_threadManager.GetNumLooperThreads();
if (newMaxTasks > MAX_POSSIBLE_TASKS)
newMaxTasks = MAX_POSSIBLE_TASKS;
// We don't want to overlap wrong, so flush any pending.
if (maxTasks_ != newMaxTasks) {
maxTasks_ = newMaxTasks;
Flush("selfrender");
}
// Lastly, we have to check if we're newly writing depth we were texturing before.
// This happens in Call of Duty (depth clear after depth texture), for example.
if (!hadDepth && state.pixelID.depthWrite) {
for (size_t i = 0; i < states_.Size(); ++i) {
if (HasTextureWrite(states_.Peek(i)))
Flush("selfdepth");
}
}
}
bool BinManager::HasTextureWrite(const RasterizerState &state) {
if (!state.enableTextures)
return false;
const int textureBits = textureBitsPerPixel[state.samplerID.texfmt];
for (int i = 0; i <= state.maxTexLevel; ++i) {
int byteStride = (state.texbufw[i] * textureBits) / 8;
int byteWidth = (state.samplerID.cached.sizes[i].w * textureBits) / 8;
int h = state.samplerID.cached.sizes[i].h;
if (HasPendingWrite(state.texaddr[i], byteStride, byteWidth, h))
return true;
}
return false;
}
inline void BinDirtyRange::Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, DrawingCoords &tl, DrawingCoords &br) {
const uint32_t w = br.x - tl.x + 1;
const uint32_t h = br.y - tl.y + 1;
newBase += tl.y * stride * bpp + tl.x * bpp;
if (base == 0) {
base = newBase;
strideBytes = stride * bpp;
widthBytes = w * bpp;
height = h;
return;
}
height = std::max(height, h);
if (base == newBase && strideBytes == stride * bpp) {
widthBytes = std::max(widthBytes, w * bpp);
return;
}
if (stride != 0)
height += ((int)base - (int)newBase) / (stride * bpp);
base = std::min(base, newBase);
strideBytes = std::max(strideBytes, stride * bpp);
widthBytes = strideBytes;
}
void BinManager::UpdateClut(const void *src) {
@@ -377,6 +436,9 @@ void BinManager::Flush(const char *reason) {
queueOffsetX_ = -1;
queueOffsetY_ = -1;
for (auto &pending : pendingWrites_)
pending.base = 0;
if (coreCollectDebugStats) {
double et = time_now_d();
flushReasonTimes_[reason] += et - st;
@@ -387,6 +449,39 @@
}
}
bool BinManager::HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, uint32_t h) {
// We can only write to VRAM.
if (!Memory::IsVRAMAddress(start))
return false;
// Ignore mirrors for overlap detection.
start &= 0x0FFFFFFF & ~0x00600000;
uint32_t size = stride * h;
for (const auto &range : pendingWrites_) {
if (range.base == 0 || range.strideBytes == 0)
continue;
if (start >= range.base + range.height * range.strideBytes || start + size <= range.base)
continue;
// Let's simply go through each line. Might be in the stride gap.
uint32_t row = start;
for (uint32_t y = 0; y < h; ++y) {
int32_t offset = row - range.base;
int32_t rangeY = offset / (int32_t)range.strideBytes;
uint32_t rangeX = offset % (int32_t)range.strideBytes;
if (rangeY >= 0 && (uint32_t)rangeY < range.height) {
// If this row is either within width, or extends beyond stride, overlap.
if (rangeX < range.widthBytes || rangeX + w >= range.strideBytes)
return true;
}
row += stride;
}
}
return false;
}
void BinManager::GetStats(char *buffer, size_t bufsize) {
double allTotal = 0.0;
double slowestTotalTime = 0.0;

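To make the stride-gap case from the commit title concrete (illustrative numbers only, reusing the DirtyRect/Overlaps sketch above, not PPSSPP code): a 16-bit framebuffer with a 512-pixel stride but a 480-pixel drawing width leaves a 64-byte gap at the end of every row. A texture packed entirely into that gap no longer triggers a flush, while anything that touches the written area or crosses a row boundary still does.

#include <cassert>

int main() {
	// Pending framebuffer write: 480x272 pixels at 16bpp, stride 512 pixels.
	DirtyRect fb;
	fb.base = 0x04000000;
	fb.strideBytes = 512 * 2;  // 1024 bytes per row
	fb.widthBytes = 480 * 2;   //  960 bytes written per row
	fb.height = 272;

	// Texture rows of 32 bytes living entirely in the 64-byte stride gap: no flush.
	assert(!Overlaps(fb, 0x04000000 + 960, 1024, 32, 272));

	// Widened so each row spills past the stride into the next framebuffer row: flush.
	assert(Overlaps(fb, 0x04000000 + 960, 1024, 96, 272));

	// Texturing straight from the rendered area obviously overlaps: flush.
	assert(Overlaps(fb, 0x04000000, 1024, 960, 272));
	return 0;
}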
View File

@@ -104,6 +104,14 @@ struct BinQueue {
size_--;
}
// Only safe if you're the only one reading.
const T &Peek(size_t offset) const {
size_t i = head_ + offset;
if (i >= N)
i -= N;
return items_[i];
}
// Only safe if you're the only one writing.
T &PeekPush() {
return items_[tail_];
@@ -159,6 +167,15 @@ struct BinTaskList {
}
};
struct BinDirtyRange {
uint32_t base;
uint32_t strideBytes;
uint32_t widthBytes;
uint32_t height;
void Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, DrawingCoords &tl, DrawingCoords &br);
};
class BinManager {
public:
BinManager();
@@ -179,6 +196,7 @@ public:
void Drain();
void Flush(const char *reason);
bool HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, uint32_t h);
void GetStats(char *buffer, size_t bufsize);
void ResetStats();
@@ -215,6 +233,8 @@ private:
std::atomic<bool> taskStatus_[MAX_POSSIBLE_TASKS];
BinWaitable *waitable_ = nullptr;
BinDirtyRange pendingWrites_[2]{};
std::unordered_map<const char *, double> flushReasonTimes_;
std::unordered_map<const char *, double> lastFlushReasonTimes_;
const char *slowestFlushReason_ = nullptr;
@@ -223,6 +243,7 @@ private:
int enqueues_ = 0;
int mostThreads_ = 0;
bool HasTextureWrite(const Rasterizer::RasterizerState &state);
BinCoords Scissor(BinCoords range);
BinCoords Range(const VertexData &v0, const VertexData &v1, const VertexData &v2);
BinCoords Range(const VertexData &v0, const VertexData &v1);

View File

@@ -115,12 +115,13 @@ void ComputeRasterizerState(RasterizerState *state) {
}
state->maxTexLevel = state->samplerID.hasAnyMips ? gstate.getTextureMaxLevel() : 0;
state->enableTextures = gstate.isTextureMapEnabled();
state->enableTextures = gstate.isTextureMapEnabled() && !state->pixelID.clearMode;
if (state->enableTextures && !state->pixelID.clearMode) {
if (state->enableTextures) {
GETextureFormat texfmt = state->samplerID.TexFmt();
for (uint8_t i = 0; i <= state->maxTexLevel; i++) {
u32 texaddr = gstate.getTextureAddress(i);
state->texaddr[i] = texaddr;
state->texbufw[i] = GetTextureBufw(i, texaddr, texfmt);
if (Memory::IsValidAddress(texaddr))
state->texptr[i] = Memory::GetPointerUnchecked(texaddr);

View File

@@ -39,6 +39,7 @@ struct RasterizerState {
SingleFunc drawPixel;
Sampler::LinearFunc linear;
Sampler::NearestFunc nearest;
uint32_t texaddr[8]{};
int texbufw[8]{};
u8 *texptr[8]{};
float textureLodSlope;

View File

@@ -548,9 +548,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
case GE_CMD_SCISSOR1:
case GE_CMD_SCISSOR2:
for (int i = 0; i < 8; ++i) {
drawEngine_->transformUnit.FlushIfOverlap("scissor", gstate.getTextureAddress(i), 4 * gstate.getTextureWidth(i) * gstate.getTextureHeight(i));
}
break;
case GE_CMD_MINZ:
@@ -583,7 +580,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
case GE_CMD_TEXADDR5:
case GE_CMD_TEXADDR6:
case GE_CMD_TEXADDR7:
drawEngine_->transformUnit.FlushIfOverlap("texaddr", gstate.getTextureAddress(cmd - GE_CMD_TEXADDR0), 4 * gstate.getTextureWidth(cmd - GE_CMD_TEXADDR0) * gstate.getTextureHeight(cmd - GE_CMD_TEXADDR0));
break;
case GE_CMD_TEXBUFWIDTH0:
@@ -594,7 +590,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
case GE_CMD_TEXBUFWIDTH5:
case GE_CMD_TEXBUFWIDTH6:
case GE_CMD_TEXBUFWIDTH7:
drawEngine_->transformUnit.FlushIfOverlap("texbufw", gstate.getTextureAddress(cmd - GE_CMD_TEXBUFWIDTH0), 4 * gstate.getTextureWidth(cmd - GE_CMD_TEXBUFWIDTH0) * gstate.getTextureHeight(cmd - GE_CMD_TEXBUFWIDTH0));
break;
case GE_CMD_CLUTADDR:
@@ -607,7 +602,7 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
u32 clutTotalBytes = gstate.getClutLoadBytes();
// Might be copying drawing into the CLUT, so flush.
drawEngine_->transformUnit.FlushIfOverlap("loadclut", clutAddr, clutTotalBytes);
drawEngine_->transformUnit.FlushIfOverlap("loadclut", clutAddr, clutTotalBytes, clutTotalBytes, 1);
if (Memory::IsValidAddress(clutAddr)) {
u32 validSize = Memory::ValidSize(clutAddr, clutTotalBytes);
@@ -662,8 +657,9 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
const uint32_t dst = dstBasePtr + (dstY * dstStride + dstX) * bpp;
const uint32_t dstSize = height * dstStride * bpp;
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", src, srcSize);
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", dst, dstSize);
// Need to flush both source and target, so we overwrite properly.
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", src, srcStride, width * bpp, height);
drawEngine_->transformUnit.FlushIfOverlap("blockxfer", dst, dstStride, width * bpp, height);
DEBUG_LOG(G3D, "Block transfer: %08x/%x -> %08x/%x, %ix%ix%i (%i,%i)->(%i,%i)", srcBasePtr, srcStride, dstBasePtr, dstStride, width, height, bpp, srcX, srcY, dstX, dstY);
@@ -693,7 +689,6 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
case GE_CMD_TEXSIZE5:
case GE_CMD_TEXSIZE6:
case GE_CMD_TEXSIZE7:
drawEngine_->transformUnit.FlushIfOverlap("texsize", gstate.getTextureAddress(cmd - GE_CMD_TEXSIZE0), 4 * gstate.getTextureWidth(cmd - GE_CMD_TEXSIZE0) * gstate.getTextureHeight(cmd - GE_CMD_TEXSIZE0));
break;
case GE_CMD_ZBUFPTR:

View File

@@ -767,21 +767,8 @@ void TransformUnit::GetStats(char *buffer, size_t bufsize) {
binner_->GetStats(buffer, bufsize);
}
void TransformUnit::FlushIfOverlap(const char *reason, uint32_t addr, uint32_t sz) {
if (!Memory::IsVRAMAddress(addr))
return;
addr &= 0x0FFFFFFF;
uint32_t targetHeight = gstate.getScissorY2() + 1;
uint32_t target = gstate.getFrameBufAddress() & 0x0FFFFFFF;
uint32_t targetStride = gstate.FrameBufStride() * (gstate.FrameBufFormat() == GE_FORMAT_8888 ? 4 : 2);
uint32_t ztarget = gstate.getDepthBufAddress() & 0x0FFFFFFF;
uint32_t ztargetStride = gstate.DepthBufStride() * 2;
// TODO: Skip if the texture is between width and stride?
if (addr < target + targetHeight * targetStride && addr + sz >= target)
Flush(reason);
else if (addr < ztarget + targetHeight * ztargetStride && addr + sz >= ztarget)
void TransformUnit::FlushIfOverlap(const char *reason, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h) {
if (binner_->HasPendingWrite(addr, stride, w, h))
Flush(reason);
}

View File

@@ -121,7 +121,7 @@ public:
bool GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices);
void Flush(const char *reason);
void FlushIfOverlap(const char *reason, uint32_t addr, uint32_t sz);
void FlushIfOverlap(const char *reason, uint32_t addr, uint32_t stride, uint32_t w, uint32_t h);
void NotifyClutUpdate(const void *src);
void GetStats(char *buffer, size_t bufsize);