softgpu: Correct matrix value update wrapping.

The values read back when saving a context or getting matrix data are set differently than the actual values used for rendering. This implements the wrapping and bleeding between matrices within softgpu, but leaves hardware rendering to only use the rendering registers for speed.
2024-11-23 05:19:56 +00:00 · 2022-09-27 22:29:55 -07:00 · 2022-09-27 22:29:55 -07:00 · 6b20c0318d
commit 6b20c0318d
parent 95d2083f04
7 changed files with 233 additions and 92 deletions
--- a/Core/HLE/sceGe.cpp
+++ b/Core/HLE/sceGe.cpp
@ -524,8 +524,10 @@ static int sceGeGetMtx(int type, u32 matrixPtr) {
 		return hleLogError(SCEGE, -1, "bad matrix ptr");
 	}

-	u32 *dest = (u32 *)Memory::GetPointerWriteUnchecked(matrixPtr);
-	if (!gpu || !gpu->GetMatrix24(GEMatrixType(type), dest))
+	u32_le *dest = (u32_le *)Memory::GetPointerWriteUnchecked(matrixPtr);
+	// Note: this reads the CPU-visible matrix values, which may differ from the actual used values.
+	// They only differ when more DATA commands are sent than are valid for a matrix.
+	if (!gpu || !gpu->GetMatrix24(GEMatrixType(type), dest, 0))
 		return hleLogError(SCEGE, SCE_KERNEL_ERROR_INVALID_INDEX, "invalid matrix");

 	return hleLogSuccessInfoI(SCEGE, 0);
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@ -425,6 +425,7 @@ GPUCommon::GPUCommon(GraphicsContext *gfxCtx, Draw::DrawContext *draw) :

 	UpdateCmdInfo();
 	UpdateVsyncInterval(true);
+	ResetMatrices();

 	PPGeSetDrawContext(draw);
 }
@ -731,13 +732,13 @@ int GPUCommon::GetStack(int index, u32 stackPtr) {
 	return currentList->stackptr;
 }

-static void CopyMatrix24(u32 *result, float *mtx, u32 count) {
+static void CopyMatrix24(u32_le *result, const float *mtx, u32 count, u32 cmdbits) {
 	for (u32 i = 0; i < count; ++i) {
-		result[i] = toFloat24(mtx[i]);
+		result[i] = toFloat24(mtx[i]) | cmdbits;
 	}
 }

-bool GPUCommon::GetMatrix24(GEMatrixType type, u32 *result) {
+bool GPUCommon::GetMatrix24(GEMatrixType type, u32_le *result, u32 cmdbits) {
 	switch (type) {
 	case GE_MTX_BONE0:
 	case GE_MTX_BONE1:
@ -747,19 +748,19 @@ bool GPUCommon::GetMatrix24(GEMatrixType type, u32 *result) {
 	case GE_MTX_BONE5:
 	case GE_MTX_BONE6:
 	case GE_MTX_BONE7:
-		CopyMatrix24(result, gstate.boneMatrix + (type - GE_MTX_BONE0) * 12, 12);
+		CopyMatrix24(result, gstate.boneMatrix + (type - GE_MTX_BONE0) * 12, 12, cmdbits);
 		break;
 	case GE_MTX_TEXGEN:
-		CopyMatrix24(result, gstate.tgenMatrix, 12);
+		CopyMatrix24(result, gstate.tgenMatrix, 12, cmdbits);
 		break;
 	case GE_MTX_WORLD:
-		CopyMatrix24(result, gstate.worldMatrix, 12);
+		CopyMatrix24(result, gstate.worldMatrix, 12, cmdbits);
 		break;
 	case GE_MTX_VIEW:
-		CopyMatrix24(result, gstate.viewMatrix, 12);
+		CopyMatrix24(result, gstate.viewMatrix, 12, cmdbits);
 		break;
 	case GE_MTX_PROJECTION:
-		CopyMatrix24(result, gstate.projMatrix, 16);
+		CopyMatrix24(result, gstate.projMatrix, 16, cmdbits);
 		break;
 	default:
 		return false;
@ -767,6 +768,20 @@ bool GPUCommon::GetMatrix24(GEMatrixType type, u32 *result) {
 	return true;
 }

+void GPUCommon::ResetMatrices() {
+	// This means we restored a context, so update the visible matrix data.
+	for (size_t i = 0; i < ARRAY_SIZE(gstate.boneMatrix); ++i)
+		matrixVisible.bone[i] = toFloat24(gstate.boneMatrix[i]);
+	for (size_t i = 0; i < ARRAY_SIZE(gstate.worldMatrix); ++i)
+		matrixVisible.world[i] = toFloat24(gstate.worldMatrix[i]);
+	for (size_t i = 0; i < ARRAY_SIZE(gstate.viewMatrix); ++i)
+		matrixVisible.view[i] = toFloat24(gstate.viewMatrix[i]);
+	for (size_t i = 0; i < ARRAY_SIZE(gstate.projMatrix); ++i)
+		matrixVisible.proj[i] = toFloat24(gstate.projMatrix[i]);
+	for (size_t i = 0; i < ARRAY_SIZE(gstate.tgenMatrix); ++i)
+		matrixVisible.tgen[i] = toFloat24(gstate.tgenMatrix[i]);
+}
+
 u32 GPUCommon::EnqueueList(u32 listpc, u32 stall, int subIntrBase, PSPPointer<PspGeListArgs> args, bool head) {
 	// TODO Check the stack values in missing arg and ajust the stack depth

@ -1389,7 +1404,7 @@ void GPUCommon::DoExecuteCall(u32 target) {
 		// Check for the end
 		if ((Memory::ReadUnchecked_U32(target + 11 * 4) >> 24) == GE_CMD_BONEMATRIXDATA &&
 				(Memory::ReadUnchecked_U32(target + 12 * 4) >> 24) == GE_CMD_RET &&
-				(gstate.boneMatrixNumber & 0x7F) <= 96 - 12) {
+				(gstate.boneMatrixNumber & 0x00FFFFFF) <= 96 - 12) {
 			// Yep, pretty sure this is a bone matrix call.  Double check stall first.
 			if (target > currentList->stall || target + 12 * 4 < currentList->stall) {
 				FastLoadBoneMatrix(target);
@ -1923,7 +1938,7 @@ void GPUCommon::Execute_Prim(u32 op, u32 diff) {
 				(Memory::ReadUnchecked_U32(target + 11 * 4) >> 24) == GE_CMD_BONEMATRIXDATA &&
 				(Memory::ReadUnchecked_U32(target + 12 * 4) >> 24) == GE_CMD_RET &&
 				(target > currentList->stall || target + 12 * 4 < currentList->stall) &&
-				(gstate.boneMatrixNumber & 0x7F) <= 96 - 12) {
+				(gstate.boneMatrixNumber & 0x00FFFFFF) <= 96 - 12) {
 				FastLoadBoneMatrix(target);
 			} else {
 				goto bail;
@ -2167,7 +2182,7 @@ void GPUCommon::Execute_WorldMtxNum(u32 op, u32 diff) {
 	int i = 0;

 	// We must record the individual data commands while debugRecording_.
-	bool fastLoad = !debugRecording_;
+	bool fastLoad = !debugRecording_ && end > 0;
 	// Stalling in the middle of a matrix would be stupid, I doubt this check is necessary.
 	if (currentList->pc < currentList->stall && currentList->pc + end * 4 >= currentList->stall) {
 		fastLoad = false;
@ -2188,7 +2203,7 @@ void GPUCommon::Execute_WorldMtxNum(u32 op, u32 diff) {
 	}

 	const int count = i;
-	gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | ((op + count) & 0xF);
+	gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | ((op & 0xF) + count);

 	// Skip over the loaded data, it's done now.
 	UpdatePC(currentList->pc, currentList->pc + count * 4);
@ -2197,7 +2212,7 @@ void GPUCommon::Execute_WorldMtxNum(u32 op, u32 diff) {

 void GPUCommon::Execute_WorldMtxData(u32 op, u32 diff) {
 	// Note: it's uncommon to get here now, see above.
-	int num = gstate.worldmtxnum & 0xF;
+	int num = gstate.worldmtxnum & 0x00FFFFFF;
 	u32 newVal = op << 8;
 	if (num < 12 && newVal != ((const u32 *)gstate.worldMatrix)[num]) {
 		Flush();
@ -2205,7 +2220,7 @@ void GPUCommon::Execute_WorldMtxData(u32 op, u32 diff) {
 		gstate_c.Dirty(DIRTY_WORLDMATRIX);
 	}
 	num++;
-	gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | (num & 0xF);
+	gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
 	gstate.worldmtxdata = GE_CMD_WORLDMATRIXDATA << 24;
 }

@ -2216,7 +2231,7 @@ void GPUCommon::Execute_ViewMtxNum(u32 op, u32 diff) {
 	const int end = 12 - (op & 0xF);
 	int i = 0;

-	bool fastLoad = !debugRecording_;
+	bool fastLoad = !debugRecording_ && end > 0;
 	if (currentList->pc < currentList->stall && currentList->pc + end * 4 >= currentList->stall) {
 		fastLoad = false;
 	}
@ -2236,7 +2251,7 @@ void GPUCommon::Execute_ViewMtxNum(u32 op, u32 diff) {
 	}

 	const int count = i;
-	gstate.viewmtxnum = (GE_CMD_VIEWMATRIXNUMBER << 24) | ((op + count) & 0xF);
+	gstate.viewmtxnum = (GE_CMD_VIEWMATRIXNUMBER << 24) | ((op & 0xF) + count);

 	// Skip over the loaded data, it's done now.
 	UpdatePC(currentList->pc, currentList->pc + count * 4);
@ -2245,7 +2260,7 @@ void GPUCommon::Execute_ViewMtxNum(u32 op, u32 diff) {

 void GPUCommon::Execute_ViewMtxData(u32 op, u32 diff) {
 	// Note: it's uncommon to get here now, see above.
-	int num = gstate.viewmtxnum & 0xF;
+	int num = gstate.viewmtxnum & 0x00FFFFFF;
 	u32 newVal = op << 8;
 	if (num < 12 && newVal != ((const u32 *)gstate.viewMatrix)[num]) {
 		Flush();
@ -2253,7 +2268,7 @@ void GPUCommon::Execute_ViewMtxData(u32 op, u32 diff) {
 		gstate_c.Dirty(DIRTY_VIEWMATRIX);
 	}
 	num++;
-	gstate.viewmtxnum = (GE_CMD_VIEWMATRIXNUMBER << 24) | (num & 0xF);
+	gstate.viewmtxnum = (GE_CMD_VIEWMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
 	gstate.viewmtxdata = GE_CMD_VIEWMATRIXDATA << 24;
 }

@ -2284,7 +2299,7 @@ void GPUCommon::Execute_ProjMtxNum(u32 op, u32 diff) {
 	}

 	const int count = i;
-	gstate.projmtxnum = (GE_CMD_PROJMATRIXNUMBER << 24) | ((op + count) & 0x1F);
+	gstate.projmtxnum = (GE_CMD_PROJMATRIXNUMBER << 24) | ((op & 0xF) + count);

 	// Skip over the loaded data, it's done now.
 	UpdatePC(currentList->pc, currentList->pc + count * 4);
@ -2293,16 +2308,16 @@ void GPUCommon::Execute_ProjMtxNum(u32 op, u32 diff) {

 void GPUCommon::Execute_ProjMtxData(u32 op, u32 diff) {
 	// Note: it's uncommon to get here now, see above.
-	int num = gstate.projmtxnum & 0x1F;    // NOTE: Changed from 0xF to catch overflows
+	int num = gstate.projmtxnum & 0x00FFFFFF;
 	u32 newVal = op << 8;
-	if (num < 0x10 && newVal != ((const u32 *)gstate.projMatrix)[num]) {
+	if (num < 16 && newVal != ((const u32 *)gstate.projMatrix)[num]) {
 		Flush();
 		((u32 *)gstate.projMatrix)[num] = newVal;
 		gstate_c.Dirty(DIRTY_PROJMATRIX);
 	}
 	num++;
 	if (num <= 16)
-		gstate.projmtxnum = (GE_CMD_PROJMATRIXNUMBER << 24) | (num & 0xF);
+		gstate.projmtxnum = (GE_CMD_PROJMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
 	gstate.projmtxdata = GE_CMD_PROJMATRIXDATA << 24;
 }

@ -2313,7 +2328,7 @@ void GPUCommon::Execute_TgenMtxNum(u32 op, u32 diff) {
 	const int end = 12 - (op & 0xF);
 	int i = 0;

-	bool fastLoad = !debugRecording_;
+	bool fastLoad = !debugRecording_ && end > 0;
 	if (currentList->pc < currentList->stall && currentList->pc + end * 4 >= currentList->stall) {
 		fastLoad = false;
 	}
@ -2333,7 +2348,7 @@ void GPUCommon::Execute_TgenMtxNum(u32 op, u32 diff) {
 	}

 	const int count = i;
-	gstate.texmtxnum = (GE_CMD_TGENMATRIXNUMBER << 24) | ((op + count) & 0xF);
+	gstate.texmtxnum = (GE_CMD_TGENMATRIXNUMBER << 24) | ((op & 0xF) + count);

 	// Skip over the loaded data, it's done now.
 	UpdatePC(currentList->pc, currentList->pc + count * 4);
@ -2342,7 +2357,7 @@ void GPUCommon::Execute_TgenMtxNum(u32 op, u32 diff) {

 void GPUCommon::Execute_TgenMtxData(u32 op, u32 diff) {
 	// Note: it's uncommon to get here now, see above.
-	int num = gstate.texmtxnum & 0xF;
+	int num = gstate.texmtxnum & 0x00FFFFFF;
 	u32 newVal = op << 8;
 	if (num < 12 && newVal != ((const u32 *)gstate.tgenMatrix)[num]) {
 		Flush();
@ -2350,7 +2365,7 @@ void GPUCommon::Execute_TgenMtxData(u32 op, u32 diff) {
 		gstate_c.Dirty(DIRTY_TEXMATRIX | DIRTY_FRAGMENTSHADER_STATE);  // We check the matrix to see if we need projection
 	}
 	num++;
-	gstate.texmtxnum = (GE_CMD_TGENMATRIXNUMBER << 24) | (num & 0xF);
+	gstate.texmtxnum = (GE_CMD_TGENMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
 	gstate.texmtxdata = GE_CMD_TGENMATRIXDATA << 24;
 }

@ -2400,7 +2415,7 @@ void GPUCommon::Execute_BoneMtxNum(u32 op, u32 diff) {
 	}

 	const int count = i;
-	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | ((op + count) & 0x7F);
+	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | ((op & 0x7F) + count);

 	// Skip over the loaded data, it's done now.
 	UpdatePC(currentList->pc, currentList->pc + count * 4);
@ -2409,7 +2424,7 @@ void GPUCommon::Execute_BoneMtxNum(u32 op, u32 diff) {

 void GPUCommon::Execute_BoneMtxData(u32 op, u32 diff) {
 	// Note: it's uncommon to get here now, see above.
-	int num = gstate.boneMatrixNumber & 0x7F;
+	int num = gstate.boneMatrixNumber & 0x00FFFFFF;
 	u32 newVal = op << 8;
 	if (num < 96 && newVal != ((const u32 *)gstate.boneMatrix)[num]) {
 		// Bone matrices should NOT flush when software skinning is enabled!
@ -2422,7 +2437,7 @@ void GPUCommon::Execute_BoneMtxData(u32 op, u32 diff) {
 		((u32 *)gstate.boneMatrix)[num] = newVal;
 	}
 	num++;
-	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x7F);
+	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
 	gstate.boneMatrixData = GE_CMD_BONEMATRIXDATA << 24;
 }

@ -2661,7 +2676,7 @@ struct DisplayList_v2 {
 };

 void GPUCommon::DoState(PointerWrap &p) {
-	auto s = p.Section("GPUCommon", 1, 4);
+	auto s = p.Section("GPUCommon", 1, 5);
 	if (!s)
 		return;

@ -2733,6 +2748,10 @@ void GPUCommon::DoState(PointerWrap &p) {
 	Do(p, isbreak);
 	Do(p, drawCompleteTicks);
 	Do(p, busyTicks);
+
+	if (s >= 5) {
+		Do(p, matrixVisible.all);
+	}
 }

 void GPUCommon::InterruptStart(int listid) {
--- a/GPU/GPUCommon.h
+++ b/GPU/GPUCommon.h
@ -111,7 +111,8 @@ public:
 	int  ListSync(int listid, int mode) override;
 	u32  DrawSync(int mode) override;
 	int  GetStack(int index, u32 stackPtr) override;
-	bool GetMatrix24(GEMatrixType type, u32 *result) override;
+	bool GetMatrix24(GEMatrixType type, u32_le *result, u32 cmdbits) override;
+	void ResetMatrices() override;
 	void DoState(PointerWrap &p) override;
 	bool BusyDrawing() override;
 	u32  Continue() override;
@ -366,6 +367,21 @@ protected:
 	uint32_t immFlags_ = 0;
 	bool immFirstSent_ = false;

+	// When matrix data overflows, the CPU visible values wrap and bleed between matrices.
+	// But this doesn't actually change the values used by rendering.
+	// The CPU visible values affect the GPU when list contexts are restored.
+	// Note: not maintained by all backends, here for save stating.
+	union {
+		struct {
+			u32 bone[12 * 8];
+			u32 world[12];
+			u32 view[12];
+			u32 proj[16];
+			u32 tgen[12];
+		};
+		u32 all[12 * 8 + 12 + 12 + 16 + 12];
+	} matrixVisible;
+
 	std::string reportingPrimaryInfo_;
 	std::string reportingFullInfo_;

--- a/GPU/GPUInterface.h
+++ b/GPU/GPUInterface.h
@ -198,7 +198,8 @@ public:
 	virtual u32  Continue() = 0;
 	virtual u32  Break(int mode) = 0;
 	virtual int  GetStack(int index, u32 stackPtr) = 0;
-	virtual bool GetMatrix24(GEMatrixType type, u32 *result) = 0;
+	virtual bool GetMatrix24(GEMatrixType type, u32_le *result, u32 cmdbits) = 0;
+	virtual void ResetMatrices() = 0;

 	virtual void InterruptStart(int listid) = 0;
 	virtual void InterruptEnd(int listid) = 0;
--- a/GPU/GPUState.cpp
+++ b/GPU/GPUState.cpp
@ -24,6 +24,7 @@
 #include "Core/System.h"
 #include "Core/MemMap.h"
 #include "GPU/ge_constants.h"
+#include "GPU/GPUInterface.h"
 #include "GPU/GPUState.h"

 #ifdef _M_SSE
@ -85,11 +86,20 @@ static const CmdRange contextCmdRanges[] = {
 	// Skip: {0xFA, 0xFF},
 };

-static u32_le *SaveMatrix(u32_le *cmds, const float *mtx, int sz, int numcmd, int datacmd) {
+static u32_le *SaveMatrix(u32_le *cmds, GEMatrixType type, int sz, int numcmd, int datacmd) {
+	if (!gpu)
+		return cmds;
+
 	*cmds++ = numcmd << 24;
-	for (int i = 0; i < sz; ++i) {
-		*cmds++ = (datacmd << 24) | toFloat24(mtx[i]);
+	// This saves the CPU-visible values, not the actual used ones, which may differ.
+	// Note that Restore overwrites both values.
+	if (type == GE_MTX_BONE0) {
+		for (int i = 0; i < 8; ++i)
+			gpu->GetMatrix24(GEMatrixType(GE_MTX_BONE0 + i), cmds + i * 12, datacmd << 24);
+	} else {
+		gpu->GetMatrix24(type, cmds, datacmd << 24);
 	}
+	cmds += sz;

 	return cmds;
 }
@ -117,6 +127,9 @@ void GPUgstate::Reset() {
 	memset(gstate.tgenMatrix, 0, sizeof(gstate.tgenMatrix));
 	memset(gstate.boneMatrix, 0, sizeof(gstate.boneMatrix));

+	if (gpu)
+		gpu->ResetMatrices();
+
 	savedContextVersion = 1;
 }

@ -152,11 +165,11 @@ void GPUgstate::Save(u32_le *ptr) {
 		memcpy(matrices, projMatrix, sizeof(projMatrix)); matrices += sizeof(projMatrix);
 		memcpy(matrices, tgenMatrix, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix);
 	} else {
-		cmds = SaveMatrix(cmds, boneMatrix, ARRAY_SIZE(boneMatrix), GE_CMD_BONEMATRIXNUMBER, GE_CMD_BONEMATRIXDATA);
-		cmds = SaveMatrix(cmds, worldMatrix, ARRAY_SIZE(worldMatrix), GE_CMD_WORLDMATRIXNUMBER, GE_CMD_WORLDMATRIXDATA);
-		cmds = SaveMatrix(cmds, viewMatrix, ARRAY_SIZE(viewMatrix), GE_CMD_VIEWMATRIXNUMBER, GE_CMD_VIEWMATRIXDATA);
-		cmds = SaveMatrix(cmds, projMatrix, ARRAY_SIZE(projMatrix), GE_CMD_PROJMATRIXNUMBER, GE_CMD_PROJMATRIXDATA);
-		cmds = SaveMatrix(cmds, tgenMatrix, ARRAY_SIZE(tgenMatrix), GE_CMD_TGENMATRIXNUMBER, GE_CMD_TGENMATRIXDATA);
+		cmds = SaveMatrix(cmds, GE_MTX_BONE0, ARRAY_SIZE(boneMatrix), GE_CMD_BONEMATRIXNUMBER, GE_CMD_BONEMATRIXDATA);
+		cmds = SaveMatrix(cmds, GE_MTX_WORLD, ARRAY_SIZE(worldMatrix), GE_CMD_WORLDMATRIXNUMBER, GE_CMD_WORLDMATRIXDATA);
+		cmds = SaveMatrix(cmds, GE_MTX_VIEW, ARRAY_SIZE(viewMatrix), GE_CMD_VIEWMATRIXNUMBER, GE_CMD_VIEWMATRIXDATA);
+		cmds = SaveMatrix(cmds, GE_MTX_PROJECTION, ARRAY_SIZE(projMatrix), GE_CMD_PROJMATRIXNUMBER, GE_CMD_PROJMATRIXDATA);
+		cmds = SaveMatrix(cmds, GE_MTX_TEXGEN, ARRAY_SIZE(tgenMatrix), GE_CMD_TGENMATRIXNUMBER, GE_CMD_TGENMATRIXDATA);

 		*cmds++ = boneMatrixNumber;
 		*cmds++ = worldmtxnum;
@ -199,7 +212,7 @@ void GPUgstate::FastLoadBoneMatrix(u32 addr) {
 #endif

 	num += 12;
-	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x7F);
+	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
 }

 void GPUgstate::Restore(u32_le *ptr) {
@ -244,6 +257,9 @@ void GPUgstate::Restore(u32_le *ptr) {
 		projmtxnum = *cmds++;
 		texmtxnum = *cmds++;
 	}
+
+	if (gpu)
+		gpu->ResetMatrices();
 }

 bool vertTypeIsSkinningEnabled(u32 vertType) {
--- a/GPU/Software/SoftGpu.cpp
+++ b/GPU/Software/SoftGpu.cpp
@ -341,16 +341,16 @@ const SoftwareCommandTableEntry softgpuCommandTable[] = {
 	{ GE_CMD_DITH2, 0, SoftDirty::PIXEL_DITHER },
 	{ GE_CMD_DITH3, 0, SoftDirty::PIXEL_DITHER },

-	{ GE_CMD_WORLDMATRIXNUMBER },
+	{ GE_CMD_WORLDMATRIXNUMBER, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_WorldMtxNum },
 	{ GE_CMD_WORLDMATRIXDATA, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_WorldMtxData },
-	{ GE_CMD_VIEWMATRIXNUMBER },
+	{ GE_CMD_VIEWMATRIXNUMBER, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_ViewMtxNum },
 	{ GE_CMD_VIEWMATRIXDATA, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_ViewMtxData },
-	{ GE_CMD_PROJMATRIXNUMBER },
+	{ GE_CMD_PROJMATRIXNUMBER, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_ProjMtxNum },
 	{ GE_CMD_PROJMATRIXDATA, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_ProjMtxData },
 	// Currently not state.
-	{ GE_CMD_TGENMATRIXNUMBER },
+	{ GE_CMD_TGENMATRIXNUMBER, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_TgenMtxNum },
 	{ GE_CMD_TGENMATRIXDATA, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_TgenMtxData },
-	{ GE_CMD_BONEMATRIXNUMBER },
+	{ GE_CMD_BONEMATRIXNUMBER, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_BoneMtxNum },
 	{ GE_CMD_BONEMATRIXDATA, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_BoneMtxData },

 	// Vertex Screen/Texture/Color
@ -1040,83 +1040,162 @@ void SoftGPU::Execute_VertexType(u32 op, u32 diff) {
 	}
 }

+void SoftGPU::Execute_WorldMtxNum(u32 op, u32 diff) {
+	// Setting 0xFFFFF0 will reset to 0.
+	gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | (op & 0xF);
+}
+
+void SoftGPU::Execute_ViewMtxNum(u32 op, u32 diff) {
+	gstate.viewmtxnum = (GE_CMD_VIEWMATRIXNUMBER << 24) | (op & 0xF);
+}
+
+void SoftGPU::Execute_ProjMtxNum(u32 op, u32 diff) {
+	gstate.projmtxnum = (GE_CMD_PROJMATRIXNUMBER << 24) | (op & 0xF);
+}
+
+void SoftGPU::Execute_TgenMtxNum(u32 op, u32 diff) {
+	gstate.texmtxnum = (GE_CMD_TGENMATRIXNUMBER << 24) | (op & 0xF);
+}
+
+void SoftGPU::Execute_BoneMtxNum(u32 op, u32 diff) {
+	// Bits outside 0x7F are ignored, and writing the number resets the internal counter.
+	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (op & 0x7F);
+}
+
 void SoftGPU::Execute_WorldMtxData(u32 op, u32 diff) {
-	int num = gstate.worldmtxnum & 0xF;
-	u32 *target = num < 12 ? (u32 *)&gstate.worldMatrix[num] : (u32 *)&gstate.viewMatrix[num - 12];
-	u32 newVal = op << 8;
-	if (newVal != *target) {
-		*target = newVal;
-		dirtyFlags_ |= SoftDirty::TRANSFORM_MATRIX;
+	int num = gstate.worldmtxnum & 0x00FFFFFF;
+	if (num < 12) {
+		u32 *target = (u32 *)&gstate.worldMatrix[num];
+		u32 newVal = op << 8;
+		if (newVal != *target) {
+			*target = newVal;
+			dirtyFlags_ |= SoftDirty::TRANSFORM_MATRIX;
+		}
 	}
+
+	// Also update the CPU visible values, which update differently.
+	u32 *target = &matrixVisible.all[12 * 8 + (num & 0xF)];
+	*target = op & 0x00FFFFFF;
+
 	num++;
-	gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | (num & 0xF);
+	gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
 	gstate.worldmtxdata = GE_CMD_WORLDMATRIXDATA << 24;
 }

 void SoftGPU::Execute_ViewMtxData(u32 op, u32 diff) {
-	int num = gstate.viewmtxnum & 0xF;
-	u32 *target = num < 12 ? (u32 *)&gstate.viewMatrix[num] : (u32 *)&gstate.projMatrix[num - 12];
-	u32 newVal = op << 8;
-	if (newVal != *target) {
-		*target = newVal;
-		dirtyFlags_ |= SoftDirty::TRANSFORM_MATRIX;
+	int num = gstate.viewmtxnum & 0x00FFFFFF;
+	if (num < 12) {
+		u32 *target = (u32 *)&gstate.viewMatrix[num];
+		u32 newVal = op << 8;
+		if (newVal != *target) {
+			*target = newVal;
+			dirtyFlags_ |= SoftDirty::TRANSFORM_MATRIX;
+		}
 	}
+
+	// Also update the CPU visible values, which update differently.
+	u32 *target = &matrixVisible.all[12 * 8 + 12 + (num & 0xF)];
+	*target = op & 0x00FFFFFF;
+
 	num++;
-	gstate.viewmtxnum = (GE_CMD_VIEWMATRIXNUMBER << 24) | (num & 0xF);
+	gstate.viewmtxnum = (GE_CMD_VIEWMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
 	gstate.viewmtxdata = GE_CMD_VIEWMATRIXDATA << 24;
 }

 void SoftGPU::Execute_ProjMtxData(u32 op, u32 diff) {
-	int num = gstate.projmtxnum & 0xF;
-	u32 *target = (u32 *)&gstate.projMatrix[num];
-	u32 newVal = op << 8;
-	if (newVal != *target) {
-		*target = newVal;
-		dirtyFlags_ |= SoftDirty::TRANSFORM_MATRIX;
+	int num = gstate.projmtxnum & 0x00FFFFFF;
+	if (num < 16) {
+		u32 *target = (u32 *)&gstate.projMatrix[num];
+		u32 newVal = op << 8;
+		if (newVal != *target) {
+			*target = newVal;
+			dirtyFlags_ |= SoftDirty::TRANSFORM_MATRIX;
+		}
 	}
+
+	// Also update the CPU visible values, which update differently.
+	u32 *target = &matrixVisible.all[12 * 8 + 12 + 12 + (num & 0xF)];
+	*target = op & 0x00FFFFFF;
+
 	num++;
-	gstate.projmtxnum = (GE_CMD_PROJMATRIXNUMBER << 24) | (num & 0xF);
+	gstate.projmtxnum = (GE_CMD_PROJMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
 	gstate.projmtxdata = GE_CMD_PROJMATRIXDATA << 24;
 }

 void SoftGPU::Execute_TgenMtxData(u32 op, u32 diff) {
-	int num = gstate.texmtxnum & 0xF;
-	u32 newVal = op << 8;
-	// Doesn't wrap to any other matrix.
-	if (num < 12 && newVal != ((const u32 *)gstate.tgenMatrix)[num]) {
-		((u32 *)gstate.tgenMatrix)[num] = newVal;
+	int num = gstate.texmtxnum & 0x00FFFFFF;
+	if (num < 12) {
+		u32 *target = (u32 *)&gstate.tgenMatrix[num];
+		u32 newVal = op << 8;
 		// No dirtying, read during vertex read.
+		*target = newVal;
 	}
+
+	// Doesn't wrap to any other matrix.
+	if ((num & 0xF) < 12) {
+		matrixVisible.tgen[num & 0xF] = op & 0x00FFFFFF;
+	}
+
 	num++;
-	gstate.texmtxnum = (GE_CMD_TGENMATRIXNUMBER << 24) | (num & 0xF);
+	gstate.texmtxnum = (GE_CMD_TGENMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
 	gstate.texmtxdata = GE_CMD_TGENMATRIXDATA << 24;
 }

 void SoftGPU::Execute_BoneMtxData(u32 op, u32 diff) {
-	int num = gstate.boneMatrixNumber & 0x7F;
-	u32 *target;
+	int num = gstate.boneMatrixNumber & 0x00FFFFFF;
+
 	if (num < 96) {
-		target = (u32 *)&gstate.boneMatrix[num];
-	} else if (num < 96 + 12) {
-		target = (u32 *)&gstate.worldMatrix[num - 96];
-	} else if (num < 96 + 12 + 12) {
-		target = (u32 *)&gstate.viewMatrix[num - 96 - 12];
-	} else {
-		target = (u32 *)&gstate.projMatrix[num - 96 - 12 - 12];
+		u32 *target = (u32 *)&gstate.boneMatrix[num];
+		u32 newVal = op << 8;
+		// No dirtying, we read bone data during vertex read.
+		*target = newVal;
 	}

-	u32 newVal = op << 8;
-	if (newVal != *target) {
-		*target = newVal;
-		// Dirty if it overflowed.  We read bone data during vertex read.
-		if (num >= 96)
-			dirtyFlags_ |= SoftDirty::TRANSFORM_MATRIX;
-	}
+	// Also update the CPU visible values, which update differently.
+	u32 *target = &matrixVisible.all[(num & 0x7F)];
+	*target = op & 0x00FFFFFF;
+
 	num++;
-	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x7F);
+	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
 	gstate.boneMatrixData  = GE_CMD_BONEMATRIXDATA << 24;
 }

+static void CopyMatrix24(u32_le *result, const u32 *mtx, u32 count, u32 cmdbits) {
+	for (u32 i = 0; i < count; ++i) {
+		result[i] = mtx[i] | cmdbits;
+	}
+}
+
+bool SoftGPU::GetMatrix24(GEMatrixType type, u32_le *result, u32 cmdbits) {
+	switch (type) {
+	case GE_MTX_BONE0:
+	case GE_MTX_BONE1:
+	case GE_MTX_BONE2:
+	case GE_MTX_BONE3:
+	case GE_MTX_BONE4:
+	case GE_MTX_BONE5:
+	case GE_MTX_BONE6:
+	case GE_MTX_BONE7:
+		CopyMatrix24(result, matrixVisible.bone + (type - GE_MTX_BONE0) * 12, 12, cmdbits);
+		break;
+	case GE_MTX_TEXGEN:
+		CopyMatrix24(result, matrixVisible.tgen, 12, cmdbits);
+		break;
+	case GE_MTX_WORLD:
+		CopyMatrix24(result, matrixVisible.world, 12, cmdbits);
+		break;
+	case GE_MTX_VIEW:
+		CopyMatrix24(result, matrixVisible.view, 12, cmdbits);
+		break;
+	case GE_MTX_PROJECTION:
+		CopyMatrix24(result, matrixVisible.proj, 16, cmdbits);
+		break;
+	default:
+		return false;
+	}
+	return true;
+}
+
 void SoftGPU::Execute_ImmVertexAlphaPrim(u32 op, u32 diff) {
 	GPUCommon::Execute_ImmVertexAlphaPrim(op, diff);
 	// We won't flush as often as hardware renderers, so we want to flush right away.
--- a/GPU/Software/SoftGpu.h
+++ b/GPU/Software/SoftGpu.h
@ -181,12 +181,20 @@ public:
 	// Overridden to change flushing behavior.
 	void Execute_Call(u32 op, u32 diff);

+	void Execute_WorldMtxNum(u32 op, u32 diff);
+	void Execute_ViewMtxNum(u32 op, u32 diff);
+	void Execute_ProjMtxNum(u32 op, u32 diff);
+	void Execute_TgenMtxNum(u32 op, u32 diff);
+	void Execute_BoneMtxNum(u32 op, u32 diff);
+
 	void Execute_WorldMtxData(u32 op, u32 diff);
 	void Execute_ViewMtxData(u32 op, u32 diff);
 	void Execute_ProjMtxData(u32 op, u32 diff);
 	void Execute_TgenMtxData(u32 op, u32 diff);
 	void Execute_BoneMtxData(u32 op, u32 diff);

+	bool GetMatrix24(GEMatrixType type, u32_le *result, u32 cmdbits) override;
+
 	void Execute_ImmVertexAlphaPrim(u32 op, u32 diff);

 	typedef void (SoftGPU::*CmdFunc)(u32 op, u32 diff);