From e1c391b1c80a9ad0410187e4281c37a28335213a Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Mon, 13 Feb 2023 17:15:49 -0800
Subject: [PATCH] riscv: Use vertexjit for hardware skinned verts.

---
 GPU/Common/VertexDecoderRiscV.cpp | 43 +++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/GPU/Common/VertexDecoderRiscV.cpp b/GPU/Common/VertexDecoderRiscV.cpp
index 165dbab2e7..27f3a13cec 100644
--- a/GPU/Common/VertexDecoderRiscV.cpp
+++ b/GPU/Common/VertexDecoderRiscV.cpp
@@ -71,6 +71,10 @@ static const RiscVReg const65535Reg = F6;
 // TODO: Use vector, where supported.
 
 static const JitLookup jitLookup[] = {
+	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
+	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
+	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
+
 	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
 	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
@@ -234,6 +238,45 @@ bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
 	return false;
 }
 
+void VertexDecoderJitCache::Jit_WeightsU8() {
+	// Emits one LB/SB pair per weight, since we can't tell here whether misaligned access is
+	// fast.  If it's not, it can crash or hit a software trap (100x slower.)
+	const int count = dec_->nweights;
+	int i = 0;
+	while (i < count) {
+		LB(tempReg1, srcReg, dec_->weightoff + i);
+		SB(tempReg1, dstReg, dec_->decFmt.w0off + i);
+		++i;
+	}
+	// Zero-fill the remaining output weights up to the next multiple of 4.
+	for (; (i & 3) != 0; ++i)
+		SB(R_ZERO, dstReg, dec_->decFmt.w0off + i);
+}
+
+void VertexDecoderJitCache::Jit_WeightsU16() {
+	// Copies each 16-bit weight unchanged, then zero-fills up to a multiple of 4 weights.
+	const int count = dec_->nweights;
+	int i = 0;
+	for (; i < count; ++i) {
+		LH(tempReg1, srcReg, dec_->weightoff + i * 2);
+		SH(tempReg1, dstReg, dec_->decFmt.w0off + i * 2);
+	}
+	for (; (i & 3) != 0; ++i)
+		SH(R_ZERO, dstReg, dec_->decFmt.w0off + i * 2);
+}
+
+void VertexDecoderJitCache::Jit_WeightsFloat() {
+	// Copies each 32-bit weight unchanged via LW/SW, then zero-fills up to a multiple of 4.
+	const int count = dec_->nweights;
+	int i = 0;
+	for (; i < count; ++i) {
+		LW(tempReg1, srcReg, dec_->weightoff + i * 4);
+		SW(tempReg1, dstReg, dec_->decFmt.w0off + i * 4);
+	}
+	for (; (i & 3) != 0; ++i)
+		SW(R_ZERO, dstReg, dec_->decFmt.w0off + i * 4);
+}
+
 void VertexDecoderJitCache::Jit_TcU8ToFloat() {
 	Jit_AnyU8ToFloat(dec_->tcoff, 16);
 	FS(32, fpSrc[0], dstReg, dec_->decFmt.uvoff);