Make handwritten vertex decoders work with non-compiled vertex decoding

2025-02-20 06:00:58 +00:00 · 2024-05-10 18:59:37 +02:00 · 2024-05-10 18:59:37 +02:00 · 81f1b3fd95
commit 81f1b3fd95
parent afca3717ba
11 changed files with 149 additions and 109 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1858,6 +1858,8 @@ set(GPU_SOURCES
 	GPU/Common/SoftwareTransformCommon.h
 	GPU/Common/VertexDecoderCommon.cpp
 	GPU/Common/VertexDecoderCommon.h
 	GPU/Common/VertexDecoderHandwritten.cpp
 	GPU/Common/VertexDecoderHandwritten.h
 	GPU/Common/TransformCommon.cpp
 	GPU/Common/TransformCommon.h
 	GPU/Common/IndexGenerator.cpp
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@ -36,6 +36,7 @@
 #include "GPU/ge_constants.h"
 #include "GPU/Math3D.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 #include "GPU/Common/VertexDecoderHandwritten.h"
 static const u8 tcsize[4] = { 0, 2, 4, 8 }, tcalign[4] = { 0, 1, 2, 4 };
 static const u8 colsize[8] = { 0, 0, 0, 0, 2, 2, 2, 4 }, colalign[8] = { 0, 0, 0, 0, 2, 2, 2, 4 };
@ -1282,6 +1283,14 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 	_assert_msg_(decFmt.uvfmt == DEC_FLOAT_2 || decFmt.uvfmt == DEC_NONE, "Reader only supports float UV");
 	// See GetVertTypeID
 	uint32_t fmtWithoutSkinFlag = (fmt_ & ~0x04000000);
 	if (fmtWithoutSkinFlag == (GE_VTYPE_TC_8BIT | GE_VTYPE_COL_5551 | GE_VTYPE_POS_16BIT)) {
 		// Can skip looking up in the JIT.
 		jitted_ = &VtxDec_Tu8_C5551_Ps16;
 		return;
 	}
 	// Attempt to JIT as well. But only do that if the main CPU JIT is enabled, in order to aid
 	// debugging attempts - if the main JIT doesn't work, this one won't do any better, probably.
 	if (jitCache) {
--- a/GPU/Common/VertexDecoderHandwritten.cpp
+++ b/GPU/Common/VertexDecoderHandwritten.cpp
@ -0,0 +1,115 @@
 #include "Common/CommonTypes.h"
 #include "Common/Data/Convert/ColorConv.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 #include "GPU/GPUState.h"
 // Candidates for hand-writing
 // (found using our custom Very Sleepy).
 // GPU::P:_f_N:_s8_C:_8888_T:_u16__(24b)_040001BE  (5%+ of God of War execution)
 // GPU::P:_f_N:_s8_C:_8888_T:_u16_W:_f_(1x)__(28b)_040007BE (1%+ of God of War execution)
 // Hand-written decoder for the TC_8BIT | COL_5551 | POS_16BIT vertex format.
 //
 // srcp:  `count` packed input vertices (u8 u, u8 v, u16 color, s16 x/y/z).
 // dstp:  receives `count` output vertices (float u/v, RGBA8888 color, float x/y/z).
 // uvScaleOffset: scale and offset applied to the texture coordinates; the 8-bit
 //        input is additionally divided by 128.
 //
 // Side effect: sets gstate_c.vertexFullAlpha to 1 only if every one of the `count`
 // vertices has its 5551 alpha bit set (it is 1 for count == 0).
 //
 // The SSE2 path handles two vertices per iteration, so for an odd count it reads and
 // writes one vertex beyond the requested range, and each position store spills one
 // float past the vertex it belongs to.
 // NOTE(review): presumably callers leave slack at the end of both buffers - confirm.
 void VtxDec_Tu8_C5551_Ps16(const u8 *srcp, u8 *dstp, int count, const UVScale *uvScaleOffset) {
 	// Packed input vertex layout.
 	struct GTAVTX {
 		union {
 			struct {
 				u8 u;
 				u8 v;
 			};
 			u16 uv;
 		};
 		u16 col;
 		s16 x;
 		s16 y;
 		s16 z;
 	};
 	// NOTE: This might be different for different vertex formats.
 	struct OutVTX {
 		float u;
 		float v;
 		uint32_t col;
 		float x;
 		float y;
 		float z;
 	};
 	const GTAVTX *src = (const GTAVTX *)srcp;
 	OutVTX *dst = (OutVTX *)dstp;
 	// Fold the /128 normalization of the 8-bit texcoords into the scale factors.
 	float uscale = uvScaleOffset->uScale * (1.0f / 128.0f);
 	float vscale = uvScaleOffset->vScale * (1.0f / 128.0f);
 	float uoff = uvScaleOffset->uOff;
 	float voff = uvScaleOffset->vOff;
 	// Running AND of all vertex colors. Only bit 15 (the 5551 alpha bit) is read at the end.
 	u32 alpha = 0xFFFFFFFF;
 #if PPSSPP_ARCH(SSE2)
 	__m128 uvOff = _mm_setr_ps(uoff, voff, uoff, voff);
 	__m128 uvScale = _mm_setr_ps(uscale, vscale, uscale, vscale);
 	__m128 posScale = _mm_set1_ps(1.0f / 32768.0f);
 	__m128i rmask = _mm_set1_epi32(0x001F);
 	__m128i gmask = _mm_set1_epi32(0x03E0);
 	__m128i bmask = _mm_set1_epi32(0x7c00);
 	__m128i amask = _mm_set1_epi32(0x8000);
 	__m128i lowbits = _mm_set1_epi32(0x00070707);
 	// Two vertices at a time, we can share some calculations.
 	// It's OK to accidentally decode an extra vertex.
 	for (int i = 0; i < count; i += 2) {
 		__m128i pos0 = _mm_loadl_epi64((const __m128i *) & src[i].x);
 		__m128i pos1 = _mm_loadl_epi64((const __m128i *) & src[i + 1].x);
 		// Translate UV, combined. TODO: Can possibly shuffle UV and col together here
 		// Bytes end up ordered [u0, v0, u1, v1].
 		uint32_t uv0 = (uint32_t)src[i].uv | ((uint32_t)src[i + 1].uv << 16);
 		// One 16-bit color per 32-bit lane, hence the shift by 32 rather than 16.
 		uint64_t col0 = (uint64_t)src[i].col | ((uint64_t)src[i + 1].col << 32);
 		// Duplicate each s16 into a 32-bit lane, then shift arithmetically to sign extend.
 		__m128i pos0_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos0, pos0), 16);
 		__m128i pos1_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos1, pos1), 16);
 		__m128 pos0_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos0_32), posScale);
 		__m128 pos1_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos1_32), posScale);
 		// Widen the four UV bytes to one zero-extended 32-bit lane each.
 		__m128i uv8 = _mm_set1_epi32(uv0);
 		__m128i uv16 = _mm_unpacklo_epi8(uv8, uv8);
 		__m128i uv32 = _mm_srli_epi32(_mm_unpacklo_epi16(uv16, uv16), 24);
 		__m128d uvf = _mm_castps_pd(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(uv32), uvScale), uvOff));
 		// Accumulate straight from the 16-bit colors. col0 cannot be used for this: narrowing
 		// it to the 32-bit accumulator would drop the second color (bits 32-47).
 		alpha &= src[i].col;
 		if (i + 1 < count) {
 			// Keep the extra vertex decoded for an odd count out of the alpha result.
 			alpha &= src[i + 1].col;
 		}
 		// Combined RGBA
 		__m128i col = _mm_set1_epi64x(col0);
 		__m128i r = _mm_slli_epi32(_mm_and_si128(col, rmask), 8 - 5);
 		__m128i g = _mm_slli_epi32(_mm_and_si128(col, gmask), 16 - 10);
 		__m128i b = _mm_slli_epi32(_mm_and_si128(col, bmask), 24 - 15);
 		// Move the alpha bit to bit 31, then smear it across the top byte with an arithmetic shift.
 		__m128i a = _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(col, amask), 16), 7);
 		col = _mm_or_si128(_mm_or_si128(r, g), b);
 		// Replicate the top three bits of each 5-bit channel into the low three bits of its byte.
 		col = _mm_or_si128(col, _mm_and_si128(_mm_srli_epi32(col, 5), lowbits));
 		col = _mm_or_si128(col, a);
 		// TODO: Mix into fewer stores.
 		// Each position store writes four floats (x, y, z plus one float of the following
 		// vertex), so the UV stores must come afterwards to overwrite the spill.
 		_mm_storeu_ps(&dst[i].x, pos0_ext);
 		_mm_storeu_ps(&dst[i + 1].x, pos1_ext);
 		_mm_storel_pd((double *)&dst[i].u, uvf);
 		_mm_storeh_pd((double *)&dst[i + 1].u, uvf);
 		dst[i].col = _mm_cvtsi128_si32(col);
 		dst[i + 1].col = _mm_cvtsi128_si32(_mm_shuffle_epi32(col, _MM_SHUFFLE(1, 1, 1, 1)));
 	}
 #else
 	for (int i = 0; i < count; i++) {
 		float u = src[i].u * uscale + uoff;
 		float v = src[i].v * vscale + voff;
 		alpha &= src[i].col;
 		uint32_t color = RGBA5551ToRGBA8888(src[i].col);
 		float x = src[i].x * (1.0f / 32768.0f);
 		float y = src[i].y * (1.0f / 32768.0f);
 		float z = src[i].z * (1.0f / 32768.0f);
 		dst[i].col = color;
 		dst[i].u = u;
 		dst[i].v = v;
 		dst[i].x = x;
 		dst[i].y = y;
 		dst[i].z = z;
 	}
 #endif
 	// Bit 15 survives only if every accumulated color had its alpha bit set.
 	gstate_c.vertexFullAlpha = (alpha >> 15) & 1;
 }
--- a/GPU/Common/VertexDecoderHandwritten.h
+++ b/GPU/Common/VertexDecoderHandwritten.h
@ -0,0 +1,6 @@
 #pragma once
 // These are useful on JIT-less platforms - they don't beat the jitted vertex decoders by much, but they
 // will beat the function-call-stitched ones by a lot.
 // Decoder for the GE_VTYPE_TC_8BIT | GE_VTYPE_COL_5551 | GE_VTYPE_POS_16BIT vertex format.
 // Reads `count` vertices from srcp, writes the decoded vertices to dstp, applying
 // uvScaleOffset to the texture coordinates.
 // NOTE(review): this header relies on u8 and UVScale already being declared by the includer
 // (both current users include VertexDecoderCommon.h first) - confirm before including elsewhere.
 void VtxDec_Tu8_C5551_Ps16(const u8 *srcp, u8 *dstp, int count, const UVScale *uvScaleOffset);
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@ -25,6 +25,7 @@
 #include "Core/Config.h"
 #include "GPU/GPUState.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 #include "GPU/Common/VertexDecoderHandwritten.h"
 // We start out by converting the active matrices into 4x4 which are easier to multiply with
 // using SSE / NEON and store them here.
@ -167,114 +168,7 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_Color5551Morph, &VertexDecoderJitCache::Jit_Color5551Morph},
 };
 // Hand-written decoder for the TC_8BIT | COL_5551 | POS_16BIT vertex format.
 //
 // srcp:  `count` packed input vertices (u8 u, u8 v, u16 color, s16 x/y/z).
 // dstp:  receives `count` output vertices (float u/v, RGBA8888 color, float x/y/z).
 // uvScaleOffset: scale and offset applied to the texture coordinates; the 8-bit
 //        input is additionally divided by 128.
 //
 // Side effect: sets gstate_c.vertexFullAlpha to 1 only if every one of the `count`
 // vertices has its 5551 alpha bit set (it is 1 for count == 0).
 //
 // The SSE2 path handles two vertices per iteration, so for an odd count it reads and
 // writes one vertex beyond the requested range, and each position store spills one
 // float past the vertex it belongs to.
 // NOTE(review): presumably callers leave slack at the end of both buffers - confirm.
 void GTA_Tu8_C5551_Ps16(const u8 *srcp, u8 *dstp, int count, const UVScale *uvScaleOffset) {
 	// Packed input vertex layout.
 	struct GTAVTX {
 		union {
 			struct {
 				u8 u;
 				u8 v;
 			};
 			u16 uv;
 		};
 		u16 col;
 		s16 x;
 		s16 y;
 		s16 z;
 	};
 	// NOTE: This might be different for different vertex formats.
 	struct OutVTX {
 		float u;
 		float v;
 		uint32_t col;
 		float x;
 		float y;
 		float z;
 	};
 	const GTAVTX *src = (const GTAVTX *)srcp;
 	OutVTX *dst = (OutVTX *)dstp;
 	// Fold the /128 normalization of the 8-bit texcoords into the scale factors.
 	float uscale = uvScaleOffset->uScale * (1.0f / 128.0f);
 	float vscale = uvScaleOffset->vScale * (1.0f / 128.0f);
 	float uoff = uvScaleOffset->uOff;
 	float voff = uvScaleOffset->vOff;
 	// Running AND of all vertex colors. Only bit 15 (the 5551 alpha bit) is read at the end.
 	u32 alpha = 0xFFFFFFFF;
 #if PPSSPP_ARCH(SSE2)
 	__m128 uvOff = _mm_setr_ps(uoff, voff, uoff, voff);
 	__m128 uvScale = _mm_setr_ps(uscale, vscale, uscale, vscale);
 	__m128 posScale = _mm_set1_ps(1.0f / 32768.0f);
 	__m128i rmask = _mm_set1_epi32(0x001F);
 	__m128i gmask = _mm_set1_epi32(0x03E0);
 	__m128i bmask = _mm_set1_epi32(0x7c00);
 	__m128i amask = _mm_set1_epi32(0x8000);
 	__m128i lowbits = _mm_set1_epi32(0x00070707);
 	// Two vertices at a time, we can share some calculations.
 	for (int i = 0; i < count; i += 2) {
 		__m128i pos0 = _mm_loadl_epi64((const __m128i *) & src[i].x);
 		__m128i pos1 = _mm_loadl_epi64((const __m128i *) & src[i + 1].x);
 		// Translate UV, combined. TODO: Can possibly shuffle UV and col together here
 		// Bytes end up ordered [u0, v0, u1, v1].
 		uint32_t uv0 = (uint32_t)src[i].uv | ((uint32_t)src[i + 1].uv << 16);
 		// One 16-bit color per 32-bit lane, hence the shift by 32 rather than 16.
 		uint64_t col0 = (uint64_t)src[i].col | ((uint64_t)src[i + 1].col << 32);
 		// Duplicate each s16 into a 32-bit lane, then shift arithmetically to sign extend.
 		__m128i pos0_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos0, pos0), 16);
 		__m128i pos1_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos1, pos1), 16);
 		__m128 pos0_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos0_32), posScale);
 		__m128 pos1_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos1_32), posScale);
 		// Widen the four UV bytes to one zero-extended 32-bit lane each.
 		__m128i uv8 = _mm_set1_epi32(uv0);
 		__m128i uv16 = _mm_unpacklo_epi8(uv8, uv8);
 		__m128i uv32 = _mm_srli_epi32(_mm_unpacklo_epi16(uv16, uv16), 24);
 		__m128d uvf = _mm_castps_pd(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(uv32), uvScale), uvOff));
 		// Accumulate straight from the 16-bit colors. col0 cannot be used for this: narrowing
 		// it to the 32-bit accumulator would drop the second color (bits 32-47).
 		alpha &= src[i].col;
 		if (i + 1 < count) {
 			// Keep the extra vertex decoded for an odd count out of the alpha result.
 			alpha &= src[i + 1].col;
 		}
 		// Combined RGBA
 		__m128i col = _mm_set1_epi64x(col0);
 		__m128i r = _mm_slli_epi32(_mm_and_si128(col, rmask), 8 - 5);
 		__m128i g = _mm_slli_epi32(_mm_and_si128(col, gmask), 16 - 10);
 		__m128i b = _mm_slli_epi32(_mm_and_si128(col, bmask), 24 - 15);
 		// Move the alpha bit to bit 31, then smear it across the top byte with an arithmetic shift.
 		__m128i a = _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(col, amask), 16), 7);
 		col = _mm_or_si128(_mm_or_si128(r, g), b);
 		// Replicate the top three bits of each 5-bit channel into the low three bits of its byte.
 		col = _mm_or_si128(col, _mm_and_si128(_mm_srli_epi32(col, 5), lowbits));
 		col = _mm_or_si128(col, a);
 		// TODO: Mix into fewer stores.
 		// Each position store writes four floats (x, y, z plus one float of the following
 		// vertex), so the UV stores must come afterwards to overwrite the spill.
 		_mm_storeu_ps(&dst[i].x, pos0_ext);
 		_mm_storeu_ps(&dst[i + 1].x, pos1_ext);
 		_mm_storel_pd((double *)&dst[i].u, uvf);
 		_mm_storeh_pd((double *)&dst[i + 1].u, uvf);
 		dst[i].col = _mm_cvtsi128_si32(col);
 		dst[i + 1].col = _mm_cvtsi128_si32(_mm_shuffle_epi32(col, _MM_SHUFFLE(1, 1, 1, 1)));
 	}
 #else
 	for (int i = 0; i < count; i++) {
 		float u = src[i].u * uscale + uoff;
 		float v = src[i].v * vscale + voff;
 		alpha &= src[i].col;
 		uint32_t color = RGBA5551ToRGBA8888(src[i].col);
 		float x = src[i].x * (1.0f / 32768.0f);
 		float y = src[i].y * (1.0f / 32768.0f);
 		float z = src[i].z * (1.0f / 32768.0f);
 		dst[i].col = color;
 		dst[i].u = u;
 		dst[i].v = v;
 		dst[i].x = x;
 		dst[i].y = y;
 		dst[i].z = z;
 	}
 #endif
 	// Bit 15 survives only if every accumulated color had its alpha bit set.
 	gstate_c.vertexFullAlpha = (alpha >> 15) & 1;
 }
 JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int32_t *jittedSize) {
 	if ((dec.fmt_ & ~0x04000000) == (GE_VTYPE_TC_8BIT | GE_VTYPE_COL_5551 | GE_VTYPE_POS_16BIT)) {
 		return &GTA_Tu8_C5551_Ps16;
 	}
 	dec_ = &dec;
 	BeginWrite(4096);
 	const u8 *start = this->AlignCode16();
--- a/GPU/GPU.vcxproj
+++ b/GPU/GPU.vcxproj
@ -371,6 +371,7 @@
    <ClInclude Include="Common\TextureScalerCommon.h" />
    <ClInclude Include="Common\TransformCommon.h" />
    <ClInclude Include="Common\VertexDecoderCommon.h" />
    <ClInclude Include="Common\VertexDecoderHandwritten.h" />
    <ClInclude Include="Common\VertexShaderGenerator.h" />
    <ClInclude Include="D3D11\D3D11Util.h" />
    <ClInclude Include="D3D11\DrawEngineD3D11.h" />
@ -511,6 +512,7 @@
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</ExcludedFromBuild>
    </ClCompile>
    <ClCompile Include="Common\VertexDecoderCommon.cpp" />
    <ClCompile Include="Common\VertexDecoderHandwritten.cpp" />
    <ClCompile Include="Common\VertexDecoderRiscV.cpp" />
    <ClCompile Include="Common\VertexDecoderX86.cpp">
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">true</ExcludedFromBuild>
--- a/GPU/GPU.vcxproj.filters
+++ b/GPU/GPU.vcxproj.filters
@ -273,6 +273,9 @@
    <ClInclude Include="Common\TextureReplacer.h">
      <Filter>Common</Filter>
    </ClInclude>
    <ClInclude Include="Common\VertexDecoderHandwritten.h">
      <Filter>Common</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="Math3D.cpp">
@ -542,6 +545,9 @@
    <ClCompile Include="Common\TextureReplacer.cpp">
      <Filter>Common</Filter>
    </ClCompile>
    <ClCompile Include="Common\VertexDecoderHandwritten.cpp">
      <Filter>Common</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <FxCompile Include="..\assets\shaders\tex_4xbrz.csh">
--- a/UWP/GPU_UWP/GPU_UWP.vcxproj
+++ b/UWP/GPU_UWP/GPU_UWP.vcxproj
@ -136,6 +136,7 @@
    <ClInclude Include="..\..\GPU\Common\TextureScalerCommon.h" />
    <ClInclude Include="..\..\GPU\Common\TransformCommon.h" />
    <ClInclude Include="..\..\GPU\Common\VertexDecoderCommon.h" />
    <ClInclude Include="..\..\GPU\Common\VertexDecoderHandwritten.h" />
    <ClInclude Include="..\..\GPU\Common\VertexShaderGenerator.h" />
    <ClInclude Include="..\..\GPU\D3D11\D3D11Util.h" />
    <ClInclude Include="..\..\GPU\D3D11\DrawEngineD3D11.h" />
@ -203,6 +204,7 @@
    <ClCompile Include="..\..\GPU\Common\VertexDecoderArm.cpp" />
    <ClCompile Include="..\..\GPU\Common\VertexDecoderArm64.cpp" />
    <ClCompile Include="..\..\GPU\Common\VertexDecoderCommon.cpp" />
    <ClCompile Include="..\..\GPU\Common\VertexDecoderHandwritten.cpp" />
    <ClCompile Include="..\..\GPU\Common\VertexDecoderX86.cpp" />
    <ClCompile Include="..\..\GPU\Common\VertexShaderGenerator.cpp" />
    <ClCompile Include="..\..\GPU\D3D11\D3D11Util.cpp" />
--- a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters
+++ b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters
@ -22,6 +22,7 @@
    <ClCompile Include="..\..\GPU\Common\VertexDecoderArm.cpp" />
    <ClCompile Include="..\..\GPU\Common\VertexDecoderArm64.cpp" />
    <ClCompile Include="..\..\GPU\Common\VertexDecoderCommon.cpp" />
    <ClCompile Include="..\..\GPU\Common\VertexDecoderHandwritten.cpp" />
    <ClCompile Include="..\..\GPU\Common\VertexDecoderX86.cpp" />
    <ClCompile Include="..\..\GPU\D3D11\D3D11Util.cpp" />
    <ClCompile Include="..\..\GPU\D3D11\DrawEngineD3D11.cpp" />
@ -86,6 +87,7 @@
    <ClInclude Include="..\..\GPU\Common\TextureScalerCommon.h" />
    <ClInclude Include="..\..\GPU\Common\TransformCommon.h" />
    <ClInclude Include="..\..\GPU\Common\VertexDecoderCommon.h" />
    <ClInclude Include="..\..\GPU\Common\VertexDecoderHandwritten.h" />
    <ClInclude Include="..\..\GPU\D3D11\D3D11Util.h" />
    <ClInclude Include="..\..\GPU\D3D11\DrawEngineD3D11.h" />
    <ClInclude Include="..\..\GPU\D3D11\FramebufferManagerD3D11.h" />
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@ -481,6 +481,7 @@ EXEC_AND_LIB_FILES := \
  $(SRC)/GPU/Common/ReinterpretFramebuffer.cpp \
  $(SRC)/GPU/Common/DepthBufferCommon.cpp \
  $(SRC)/GPU/Common/VertexDecoderCommon.cpp.arm \
  $(SRC)/GPU/Common/VertexDecoderHandwritten.cpp.arm \
  $(SRC)/GPU/Common/TextureCacheCommon.cpp.arm \
  $(SRC)/GPU/Common/TextureScalerCommon.cpp.arm \
  $(SRC)/GPU/Common/ShaderCommon.cpp \
--- a/libretro/Makefile.common
+++ b/libretro/Makefile.common
@ -463,6 +463,7 @@ SOURCES_C +=\
 SOURCES_CXX += \
 	$(GPUCOMMONDIR)/Draw2D.cpp \
 	$(GPUCOMMONDIR)/VertexDecoderCommon.cpp \
 	$(GPUCOMMONDIR)/VertexDecoderHandwritten.cpp \
 	$(GPUCOMMONDIR)/GPUStateUtils.cpp \
 	$(GPUCOMMONDIR)/DrawEngineCommon.cpp \
 	$(GPUCOMMONDIR)/SplineCommon.cpp \