From e1c391b1c80a9ad0410187e4281c37a28335213a Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Mon, 13 Feb 2023 17:15:49 -0800
Subject: [PATCH] riscv: Use vertexjit for hardware skinned verts.

---
 GPU/Common/VertexDecoderRiscV.cpp | 43 +++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/GPU/Common/VertexDecoderRiscV.cpp b/GPU/Common/VertexDecoderRiscV.cpp
index 165dbab2e7..27f3a13cec 100644
--- a/GPU/Common/VertexDecoderRiscV.cpp
+++ b/GPU/Common/VertexDecoderRiscV.cpp
@@ -71,6 +71,10 @@ static const RiscVReg const65535Reg = F6;
 // TODO: Use vector, where supported.
 
 static const JitLookup jitLookup[] = {
+	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
+	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
+	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
+
 	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
 	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
@@ -234,6 +238,45 @@ bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
 	return false;
 }
 
+void VertexDecoderJitCache::Jit_WeightsU8() {
+	// Emits a copy of each weight one byte at a time, since we don't know if misaligned access is fast.
+	// If it's not fast, it can crash or hit a software trap (100x slower.)
+	int j; // Declared outside the loop: the padding loop below continues from nweights.
+	for (j = 0; j < dec_->nweights; j++) {
+		LB(tempReg1, srcReg, dec_->weightoff + j);
+		SB(tempReg1, dstReg, dec_->decFmt.w0off + j);
+	}
+	// Zero-fill the remaining output slots so the weight count is padded to a multiple of 4.
+	while (j & 3) {
+		SB(R_ZERO, dstReg, dec_->decFmt.w0off + j);
+		j++;
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsU16() { // Halfword variant: each weight occupies 2 bytes.
+	int j; // Shared by both loops so the padding continues from nweights.
+	for (j = 0; j < dec_->nweights; j++) { // Copy each weight from source to output via tempReg1.
+		LH(tempReg1, srcReg, dec_->weightoff + j * 2);
+		SH(tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
+	}
+	while (j & 3) { // Zero-fill output slots up to a multiple of 4 weights.
+		SH(R_ZERO, dstReg, dec_->decFmt.w0off + j * 2);
+		j++;
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsFloat() { // Word variant: each weight occupies 4 bytes.
+	int j; // Shared by both loops so the padding continues from nweights.
+	for (j = 0; j < dec_->nweights; j++) { // Weights are copied as-is through tempReg1, with no conversion.
+		LW(tempReg1, srcReg, dec_->weightoff + j * 4);
+		SW(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
+	}
+	while (j & 3) { // Zero-fill output slots up to a multiple of 4 weights.
+		SW(R_ZERO, dstReg, dec_->decFmt.w0off + j * 4);
+		j++;
+	}
+}
+
 void VertexDecoderJitCache::Jit_TcU8ToFloat() {
 	Jit_AnyU8ToFloat(dec_->tcoff, 16);
 	FS(32, fpSrc[0], dstReg, dec_->decFmt.uvoff);