diff --git a/CMakeLists.txt b/CMakeLists.txt index 4da7206ba2..4d1207d5c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1858,6 +1858,8 @@ set(GPU_SOURCES GPU/Common/SoftwareTransformCommon.h GPU/Common/VertexDecoderCommon.cpp GPU/Common/VertexDecoderCommon.h + GPU/Common/VertexDecoderHandwritten.cpp + GPU/Common/VertexDecoderHandwritten.h GPU/Common/TransformCommon.cpp GPU/Common/TransformCommon.h GPU/Common/IndexGenerator.cpp diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp index ee31129140..01916b99a3 100644 --- a/GPU/Common/VertexDecoderCommon.cpp +++ b/GPU/Common/VertexDecoderCommon.cpp @@ -36,6 +36,7 @@ #include "GPU/ge_constants.h" #include "GPU/Math3D.h" #include "GPU/Common/VertexDecoderCommon.h" +#include "GPU/Common/VertexDecoderHandwritten.h" static const u8 tcsize[4] = { 0, 2, 4, 8 }, tcalign[4] = { 0, 1, 2, 4 }; static const u8 colsize[8] = { 0, 0, 0, 0, 2, 2, 2, 4 }, colalign[8] = { 0, 0, 0, 0, 2, 2, 2, 4 }; @@ -1282,6 +1283,14 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options, _assert_msg_(decFmt.uvfmt == DEC_FLOAT_2 || decFmt.uvfmt == DEC_NONE, "Reader only supports float UV"); + // See GetVertTypeID + uint32_t fmtWithoutSkinFlag = (fmt_ & ~0x04000000); + if (fmtWithoutSkinFlag == (GE_VTYPE_TC_8BIT | GE_VTYPE_COL_5551 | GE_VTYPE_POS_16BIT)) { + // Can skip looking up in the JIT. + jitted_ = &VtxDec_Tu8_C5551_Ps16; + return; + } + // Attempt to JIT as well. But only do that if the main CPU JIT is enabled, in order to aid // debugging attempts - if the main JIT doesn't work, this one won't do any better, probably. 
if (jitCache) {
diff --git a/GPU/Common/VertexDecoderHandwritten.cpp b/GPU/Common/VertexDecoderHandwritten.cpp
new file mode 100644
index 0000000000..cb139dd1f5
--- /dev/null
+++ b/GPU/Common/VertexDecoderHandwritten.cpp
@@ -0,0 +1,137 @@
+#include "Common/CommonTypes.h"
+#include "Common/Data/Convert/ColorConv.h"
+#include "GPU/Common/VertexDecoderCommon.h"
+#include "GPU/GPUState.h"
+
+
+// Candidates for hand-writing
+// (found using our custom Very Sleepy).
+// GPU::P:_f_N:_s8_C:_8888_T:_u16__(24b)_040001BE (5%+ of God of War execution)
+// GPU::P:_f_N:_s8_C:_8888_T:_u16_W:_f_(1x)__(28b)_040007BE (1%+ of God of War execution)
+
+
+// Hand-written vertex decoder for: 8-bit texcoords, 5551 color, 16-bit position.
+//
+// srcp:          count packed input vertices, laid out as GTAVTX below.
+// dstp:          receives the decoded vertices, laid out as OutVTX below.
+// count:         number of vertices to decode.
+// uvScaleOffset: each raw texcoord becomes raw * (scale / 128) + offset.
+//
+// Positions are scaled by 1 / 32768. As a side effect, gstate_c.vertexFullAlpha is set to 1
+// if the alpha bit (bit 15) is set in the color of every one of the count vertices, else 0.
+//
+// NOTE: The SSE2 path works on pairs, so with an odd count it reads and decodes one vertex
+// past the end. Its position stores also spill one float past each output vertex.
+void VtxDec_Tu8_C5551_Ps16(const u8 *srcp, u8 *dstp, int count, const UVScale *uvScaleOffset) {
+	struct GTAVTX {
+		union {
+			struct {
+				u8 u;
+				u8 v;
+			};
+			u16 uv;
+		};
+		u16 col;
+		s16 x;
+		s16 y;
+		s16 z;
+	};
+	// NOTE: This might be different for different vertex format.
+	struct OutVTX {
+		float u;
+		float v;
+		uint32_t col;
+		float x;
+		float y;
+		float z;
+	};
+	const GTAVTX *src = (const GTAVTX *)srcp;
+	OutVTX *dst = (OutVTX *)dstp;
+	float uscale = uvScaleOffset->uScale * (1.0f / 128.0f);
+	float vscale = uvScaleOffset->vScale * (1.0f / 128.0f);
+	float uoff = uvScaleOffset->uOff;
+	float voff = uvScaleOffset->vOff;
+
+	// Running AND of every vertex color. Only bit 15 (alpha) is looked at in the end.
+	u32 alpha = 0xFFFFFFFF;
+
+#if PPSSPP_ARCH(SSE2)
+	__m128 uvOff = _mm_setr_ps(uoff, voff, uoff, voff);
+	__m128 uvScale = _mm_setr_ps(uscale, vscale, uscale, vscale);
+	__m128 posScale = _mm_set1_ps(1.0f / 32768.0f);
+	__m128i rmask = _mm_set1_epi32(0x001F);
+	__m128i gmask = _mm_set1_epi32(0x03E0);
+	__m128i bmask = _mm_set1_epi32(0x7c00);
+	__m128i amask = _mm_set1_epi32(0x8000);
+	__m128i lowbits = _mm_set1_epi32(0x00070707);
+
+	// Two vertices at a time, we can share some calculations.
+	// It's OK to accidentally decode an extra vertex.
+	for (int i = 0; i < count; i += 2) {
+		__m128i pos0 = _mm_loadl_epi64((const __m128i *) & src[i].x);
+		__m128i pos1 = _mm_loadl_epi64((const __m128i *) & src[i + 1].x);
+		// Translate UV, combined. TODO: Can possibly shuffle UV and col together here
+		uint32_t uv0 = (uint32_t)src[i].uv | ((uint32_t)src[i + 1].uv << 16);
+		uint64_t col0 = (uint64_t)src[i].col | ((uint64_t)src[i + 1].col << 32);
+		// Sign-extend the 16-bit positions to 32 bits, then convert and scale.
+		__m128i pos0_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos0, pos0), 16);
+		__m128i pos1_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos1, pos1), 16);
+		__m128 pos0_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos0_32), posScale);
+		__m128 pos1_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos1_32), posScale);
+
+		// Zero-extend the four texcoord bytes (u0, v0, u1, v1) into one 32-bit lane each.
+		__m128i uv8 = _mm_set1_epi32(uv0);
+		__m128i uv16 = _mm_unpacklo_epi8(uv8, uv8);
+		__m128i uv32 = _mm_srli_epi32(_mm_unpacklo_epi16(uv16, uv16), 24);
+		__m128d uvf = _mm_castps_pd(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(uv32), uvScale), uvOff));
+
+		// Fold both colors into the alpha accumulator. col0 can't be ANDed in directly: the
+		// second color lives in bits 32+ and would be truncated away. With an odd count, the
+		// last pair's second vertex is not real input, so it must not affect the result.
+		const u32 nextCol = (i + 1 < count) ? (u32)src[i + 1].col : 0xFFFFu;
+		alpha &= (u32)src[i].col & nextCol;
+
+		// Combined RGBA
+		__m128i col = _mm_set1_epi64x(col0);
+		__m128i r = _mm_slli_epi32(_mm_and_si128(col, rmask), 8 - 5);
+		__m128i g = _mm_slli_epi32(_mm_and_si128(col, gmask), 16 - 10);
+		__m128i b = _mm_slli_epi32(_mm_and_si128(col, bmask), 24 - 15);
+		// Move the alpha bit to bit 31, then arithmetic-shift to replicate it over bits 24-31.
+		__m128i a = _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(col, amask), 16), 7);
+		col = _mm_or_si128(_mm_or_si128(r, g), b);
+		// Copy the top 3 bits of each 5-bit channel into its low 3 bits.
+		col = _mm_or_si128(col, _mm_and_si128(_mm_srli_epi32(col, 5), lowbits));
+		col = _mm_or_si128(col, a);
+
+		// TODO: Mix into fewer stores.
+		// Each position store writes 4 floats, so it clobbers the next vertex's u. Do them
+		// before the UV stores so dst[i + 1].u ends up correct.
+		_mm_storeu_ps(&dst[i].x, pos0_ext);
+		_mm_storeu_ps(&dst[i + 1].x, pos1_ext);
+		_mm_storel_pd((double *)&dst[i].u, uvf);
+		_mm_storeh_pd((double *)&dst[i + 1].u, uvf);
+		dst[i].col = _mm_cvtsi128_si32(col);
+		dst[i + 1].col = _mm_cvtsi128_si32(_mm_shuffle_epi32(col, _MM_SHUFFLE(1, 1, 1, 1)));
+	}
+
+#else
+
+	for (int i = 0; i < count; i++) {
+		float u = src[i].u * uscale + uoff;
+		float v = src[i].v * vscale + voff;
+		alpha &= src[i].col;
+		uint32_t color = RGBA5551ToRGBA8888(src[i].col);
+		float x = src[i].x * (1.0f / 32768.0f);
+		float y = src[i].y * (1.0f / 32768.0f);
+		float z = src[i].z * (1.0f / 32768.0f);
+		dst[i].col = color;
+		dst[i].u = u;
+		dst[i].v = v;
+		dst[i].x = x;
+		dst[i].y = y;
+		dst[i].z = z;
+	}
+
+#endif
+	gstate_c.vertexFullAlpha = (alpha >> 15) & 1;
+}
diff --git a/GPU/Common/VertexDecoderHandwritten.h b/GPU/Common/VertexDecoderHandwritten.h
new file mode 100644
index 0000000000..4ea698a380
--- /dev/null
+++ b/GPU/Common/VertexDecoderHandwritten.h
@@ -0,0 +1,6 @@
+#pragma once
+
+// These are useful on JIT-less platforms - they don't beat the jitted vertex decoders by much, but they
+// will beat the function-call-stitched ones by a lot.
+
+void VtxDec_Tu8_C5551_Ps16(const u8 *srcp, u8 *dstp, int count, const UVScale *uvScaleOffset);
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index 3871792cd0..1df3be8388 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -25,6 +25,7 @@
 #include "Core/Config.h"
 #include "GPU/GPUState.h"
 #include "GPU/Common/VertexDecoderCommon.h"
+#include "GPU/Common/VertexDecoderHandwritten.h"
 
 // We start out by converting the active matrices into 4x4 which are easier to multiply with
 // using SSE / NEON and store them here.
@@ -167,114 +168,7 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_Color5551Morph, &VertexDecoderJitCache::Jit_Color5551Morph}, }; -void GTA_Tu8_C5551_Ps16(const u8 *srcp, u8 *dstp, int count, const UVScale *uvScaleOffset) { - struct GTAVTX { - union { - struct { - u8 u; - u8 v; - }; - u16 uv; - }; - u16 col; - s16 x; - s16 y; - s16 z; - }; - // NOTE: This might be different for different vertex format. - struct OutVTX { - float u; - float v; - uint32_t col; - float x; - float y; - float z; - }; - const GTAVTX *src = (const GTAVTX *)srcp; - OutVTX *dst = (OutVTX *)dstp; - float uscale = uvScaleOffset->uScale * (1.0f / 128.0f); - float vscale = uvScaleOffset->vScale * (1.0f / 128.0f); - float uoff = uvScaleOffset->uOff; - float voff = uvScaleOffset->vOff; - - u32 alpha = 0xFFFFFFFF; - -#if PPSSPP_ARCH(SSE2) - __m128 uvOff = _mm_setr_ps(uoff, voff, uoff, voff); - __m128 uvScale = _mm_setr_ps(uscale, vscale, uscale, vscale); - __m128 posScale = _mm_set1_ps(1.0f / 32768.0f); - __m128i rmask = _mm_set1_epi32(0x001F); - __m128i gmask = _mm_set1_epi32(0x03E0); - __m128i bmask = _mm_set1_epi32(0x7c00); - __m128i amask = _mm_set1_epi32(0x8000); - __m128i lowbits = _mm_set1_epi32(0x00070707); - - // Two vertices at a time, we can share some calculations. - for (int i = 0; i < count; i += 2) { - __m128i pos0 = _mm_loadl_epi64((const __m128i *) & src[i].x); - __m128i pos1 = _mm_loadl_epi64((const __m128i *) & src[i + 1].x); - // Translate UV, combined. 
TODO: Can possibly shuffle UV and col together here - uint32_t uv0 = (uint32_t)src[i].uv | ((uint32_t)src[i + 1].uv << 16); - uint64_t col0 = (uint64_t)src[i].col | ((uint64_t)src[i + 1].col << 32); - __m128i pos0_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos0, pos0), 16); - __m128i pos1_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos1, pos1), 16); - __m128 pos0_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos0_32), posScale); - __m128 pos1_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos1_32), posScale); - - __m128i uv8 = _mm_set1_epi32(uv0); - __m128i uv16 = _mm_unpacklo_epi8(uv8, uv8); - __m128i uv32 = _mm_srli_epi32(_mm_unpacklo_epi16(uv16, uv16), 24); - __m128d uvf = _mm_castps_pd(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(uv32), uvScale), uvOff)); - alpha &= col0; - - // Combined RGBA - __m128i col = _mm_set1_epi64x(col0); - __m128i r = _mm_slli_epi32(_mm_and_si128(col, rmask), 8 - 5); - __m128i g = _mm_slli_epi32(_mm_and_si128(col, gmask), 16 - 10); - __m128i b = _mm_slli_epi32(_mm_and_si128(col, bmask), 24 - 15); - __m128i a = _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(col, amask), 16), 7); - col = _mm_or_si128(_mm_or_si128(r, g), b); - col = _mm_or_si128(col, _mm_and_si128(_mm_srli_epi32(col, 5), lowbits)); - col = _mm_or_si128(col, a); - - // TODO: Mix into fewer stores. 
- _mm_storeu_ps(&dst[i].x, pos0_ext); - _mm_storeu_ps(&dst[i + 1].x, pos1_ext); - _mm_storel_pd((double *)&dst[i].u, uvf); - _mm_storeh_pd((double *)&dst[i + 1].u, uvf); - dst[i].col = _mm_cvtsi128_si32(col); - dst[i + 1].col = _mm_cvtsi128_si32(_mm_shuffle_epi32(col, _MM_SHUFFLE(1, 1, 1, 1))); - } - - alpha = alpha & (alpha >> 16); - -#else - - for (int i = 0; i < count; i++) { - float u = src[i].u * uscale + uoff; - float v = src[i].v * vscale + voff; - alpha &= src[i].col; - uint32_t color = RGBA5551ToRGBA8888(src[i].col); - float x = src[i].x * (1.0f / 32768.0f); - float y = src[i].y * (1.0f / 32768.0f); - float z = src[i].z * (1.0f / 32768.0f); - dst[i].col = color; - dst[i].u = u; - dst[i].v = v; - dst[i].x = x; - dst[i].y = y; - dst[i].z = z; - } - -#endif - gstate_c.vertexFullAlpha = (alpha >> 15) & 1; -} - JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int32_t *jittedSize) { - if ((dec.fmt_ & ~0x04000000) == (GE_VTYPE_TC_8BIT | GE_VTYPE_COL_5551 | GE_VTYPE_POS_16BIT)) { - return &GTA_Tu8_C5551_Ps16; - } - dec_ = &dec; BeginWrite(4096); const u8 *start = this->AlignCode16(); diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj index 87c79f3462..147e31c724 100644 --- a/GPU/GPU.vcxproj +++ b/GPU/GPU.vcxproj @@ -371,6 +371,7 @@ + @@ -511,6 +512,7 @@ true + true diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters index 6b35f8f33a..c59e28ed7b 100644 --- a/GPU/GPU.vcxproj.filters +++ b/GPU/GPU.vcxproj.filters @@ -273,6 +273,9 @@ Common + + Common + @@ -542,6 +545,9 @@ Common + + Common + diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj b/UWP/GPU_UWP/GPU_UWP.vcxproj index ffc2d6f0c0..9e22d246ef 100644 --- a/UWP/GPU_UWP/GPU_UWP.vcxproj +++ b/UWP/GPU_UWP/GPU_UWP.vcxproj @@ -136,6 +136,7 @@ + @@ -203,6 +204,7 @@ + @@ -257,4 +259,4 @@ - \ No newline at end of file + diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters index 314574f863..804554a2c5 100644 --- a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters +++
b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters @@ -22,6 +22,7 @@ + @@ -86,6 +87,7 @@ + @@ -130,4 +132,4 @@ - \ No newline at end of file + diff --git a/android/jni/Android.mk b/android/jni/Android.mk index ac949a57ef..c49ac79c4e 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -481,6 +481,7 @@ EXEC_AND_LIB_FILES := \ $(SRC)/GPU/Common/ReinterpretFramebuffer.cpp \ $(SRC)/GPU/Common/DepthBufferCommon.cpp \ $(SRC)/GPU/Common/VertexDecoderCommon.cpp.arm \ + $(SRC)/GPU/Common/VertexDecoderHandwritten.cpp.arm \ $(SRC)/GPU/Common/TextureCacheCommon.cpp.arm \ $(SRC)/GPU/Common/TextureScalerCommon.cpp.arm \ $(SRC)/GPU/Common/ShaderCommon.cpp \ diff --git a/libretro/Makefile.common b/libretro/Makefile.common index e91b78e722..8a5b0b9214 100644 --- a/libretro/Makefile.common +++ b/libretro/Makefile.common @@ -463,6 +463,7 @@ SOURCES_C +=\ SOURCES_CXX += \ $(GPUCOMMONDIR)/Draw2D.cpp \ $(GPUCOMMONDIR)/VertexDecoderCommon.cpp \ + $(GPUCOMMONDIR)/VertexDecoderHandwritten.cpp \ $(GPUCOMMONDIR)/GPUStateUtils.cpp \ $(GPUCOMMONDIR)/DrawEngineCommon.cpp \ $(GPUCOMMONDIR)/SplineCommon.cpp \