diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4da7206ba2..4d1207d5c1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1858,6 +1858,8 @@ set(GPU_SOURCES
GPU/Common/SoftwareTransformCommon.h
GPU/Common/VertexDecoderCommon.cpp
GPU/Common/VertexDecoderCommon.h
+ GPU/Common/VertexDecoderHandwritten.cpp
+ GPU/Common/VertexDecoderHandwritten.h
GPU/Common/TransformCommon.cpp
GPU/Common/TransformCommon.h
GPU/Common/IndexGenerator.cpp
diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
index ee31129140..01916b99a3 100644
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@@ -36,6 +36,7 @@
#include "GPU/ge_constants.h"
#include "GPU/Math3D.h"
#include "GPU/Common/VertexDecoderCommon.h"
+#include "GPU/Common/VertexDecoderHandwritten.h"
static const u8 tcsize[4] = { 0, 2, 4, 8 }, tcalign[4] = { 0, 1, 2, 4 };
static const u8 colsize[8] = { 0, 0, 0, 0, 2, 2, 2, 4 }, colalign[8] = { 0, 0, 0, 0, 2, 2, 2, 4 };
@@ -1282,6 +1283,14 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
_assert_msg_(decFmt.uvfmt == DEC_FLOAT_2 || decFmt.uvfmt == DEC_NONE, "Reader only supports float UV");
+ // See GetVertTypeID
+ uint32_t fmtWithoutSkinFlag = (fmt_ & ~0x04000000);
+ if (fmtWithoutSkinFlag == (GE_VTYPE_TC_8BIT | GE_VTYPE_COL_5551 | GE_VTYPE_POS_16BIT)) {
+ // Can skip looking up in the JIT.
+ jitted_ = &VtxDec_Tu8_C5551_Ps16;
+ return;
+ }
+
// Attempt to JIT as well. But only do that if the main CPU JIT is enabled, in order to aid
// debugging attempts - if the main JIT doesn't work, this one won't do any better, probably.
if (jitCache) {
diff --git a/GPU/Common/VertexDecoderHandwritten.cpp b/GPU/Common/VertexDecoderHandwritten.cpp
new file mode 100644
index 0000000000..cb139dd1f5
--- /dev/null
+++ b/GPU/Common/VertexDecoderHandwritten.cpp
@@ -0,0 +1,115 @@
+#include "Common/CommonTypes.h"
+#include "Common/Data/Convert/ColorConv.h"
+#include "GPU/Common/VertexDecoderCommon.h"
+#include "GPU/GPUState.h"
+
+
+// Candidates for hand-writing
+// (found using our custom Very Sleepy).
+// GPU::P:_f_N:_s8_C:_8888_T:_u16__(24b)_040001BE (5%+ of God of War execution)
+// GPU::P:_f_N:_s8_C:_8888_T:_u16_W:_f_(1x)__(28b)_040007BE (1%+ of God of War execution)
+
+
+void VtxDec_Tu8_C5551_Ps16(const u8 *srcp, u8 *dstp, int count, const UVScale *uvScaleOffset) {
+ struct GTAVTX {
+ union {
+ struct {
+ u8 u;
+ u8 v;
+ };
+ u16 uv;
+ };
+ u16 col;
+ s16 x;
+ s16 y;
+ s16 z;
+ };
+ // NOTE: This might be different for different vertex format.
+ struct OutVTX {
+ float u;
+ float v;
+ uint32_t col;
+ float x;
+ float y;
+ float z;
+ };
+ const GTAVTX *src = (const GTAVTX *)srcp;
+ OutVTX *dst = (OutVTX *)dstp;
+ float uscale = uvScaleOffset->uScale * (1.0f / 128.0f);
+ float vscale = uvScaleOffset->vScale * (1.0f / 128.0f);
+ float uoff = uvScaleOffset->uOff;
+ float voff = uvScaleOffset->vOff;
+
+ u32 alpha = 0xFFFFFFFF;
+
+#if PPSSPP_ARCH(SSE2)
+ __m128 uvOff = _mm_setr_ps(uoff, voff, uoff, voff);
+ __m128 uvScale = _mm_setr_ps(uscale, vscale, uscale, vscale);
+ __m128 posScale = _mm_set1_ps(1.0f / 32768.0f);
+ __m128i rmask = _mm_set1_epi32(0x001F);
+ __m128i gmask = _mm_set1_epi32(0x03E0);
+ __m128i bmask = _mm_set1_epi32(0x7c00);
+ __m128i amask = _mm_set1_epi32(0x8000);
+ __m128i lowbits = _mm_set1_epi32(0x00070707);
+
+ // Two vertices at a time, we can share some calculations.
+ // It's OK to accidentally decode an extra vertex.
+ for (int i = 0; i < count; i += 2) {
+ __m128i pos0 = _mm_loadl_epi64((const __m128i *) & src[i].x);
+ __m128i pos1 = _mm_loadl_epi64((const __m128i *) & src[i + 1].x);
+ // Translate UV, combined. TODO: Can possibly shuffle UV and col together here
+ uint32_t uv0 = (uint32_t)src[i].uv | ((uint32_t)src[i + 1].uv << 16);
+ uint64_t col0 = (uint64_t)src[i].col | ((uint64_t)src[i + 1].col << 32);
+ __m128i pos0_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos0, pos0), 16);
+ __m128i pos1_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos1, pos1), 16);
+ __m128 pos0_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos0_32), posScale);
+ __m128 pos1_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos1_32), posScale);
+
+ __m128i uv8 = _mm_set1_epi32(uv0);
+ __m128i uv16 = _mm_unpacklo_epi8(uv8, uv8);
+ __m128i uv32 = _mm_srli_epi32(_mm_unpacklo_epi16(uv16, uv16), 24);
+ __m128d uvf = _mm_castps_pd(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(uv32), uvScale), uvOff));
+ alpha &= col0;
+
+ // Combined RGBA
+ __m128i col = _mm_set1_epi64x(col0);
+ __m128i r = _mm_slli_epi32(_mm_and_si128(col, rmask), 8 - 5);
+ __m128i g = _mm_slli_epi32(_mm_and_si128(col, gmask), 16 - 10);
+ __m128i b = _mm_slli_epi32(_mm_and_si128(col, bmask), 24 - 15);
+ __m128i a = _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(col, amask), 16), 7);
+ col = _mm_or_si128(_mm_or_si128(r, g), b);
+ col = _mm_or_si128(col, _mm_and_si128(_mm_srli_epi32(col, 5), lowbits));
+ col = _mm_or_si128(col, a);
+
+ // TODO: Mix into fewer stores.
+ _mm_storeu_ps(&dst[i].x, pos0_ext);
+ _mm_storeu_ps(&dst[i + 1].x, pos1_ext);
+ _mm_storel_pd((double *)&dst[i].u, uvf);
+ _mm_storeh_pd((double *)&dst[i + 1].u, uvf);
+ dst[i].col = _mm_cvtsi128_si32(col);
+ dst[i + 1].col = _mm_cvtsi128_si32(_mm_shuffle_epi32(col, _MM_SHUFFLE(1, 1, 1, 1)));
+ }
+
+ alpha = alpha & (alpha >> 16);
+
+#else
+
+ for (int i = 0; i < count; i++) {
+ float u = src[i].u * uscale + uoff;
+ float v = src[i].v * vscale + voff;
+ alpha &= src[i].col;
+ uint32_t color = RGBA5551ToRGBA8888(src[i].col);
+ float x = src[i].x * (1.0f / 32768.0f);
+ float y = src[i].y * (1.0f / 32768.0f);
+ float z = src[i].z * (1.0f / 32768.0f);
+ dst[i].col = color;
+ dst[i].u = u;
+ dst[i].v = v;
+ dst[i].x = x;
+ dst[i].y = y;
+ dst[i].z = z;
+ }
+
+#endif
+ gstate_c.vertexFullAlpha = (alpha >> 15) & 1;
+}
diff --git a/GPU/Common/VertexDecoderHandwritten.h b/GPU/Common/VertexDecoderHandwritten.h
new file mode 100644
index 0000000000..4ea698a380
--- /dev/null
+++ b/GPU/Common/VertexDecoderHandwritten.h
@@ -0,0 +1,6 @@
+#pragma once
+
+// These are useful on JIT-less platforms - they don't beat the jitted vertex decoders by much, but they
+// will beat the function-call-stitched ones by a lot.
+
+void VtxDec_Tu8_C5551_Ps16(const u8 *srcp, u8 *dstp, int count, const UVScale *uvScaleOffset);
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index 3871792cd0..1df3be8388 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -25,6 +25,7 @@
#include "Core/Config.h"
#include "GPU/GPUState.h"
#include "GPU/Common/VertexDecoderCommon.h"
+#include "GPU/Common/VertexDecoderHandwritten.h"
// We start out by converting the active matrices into 4x4 which are easier to multiply with
// using SSE / NEON and store them here.
@@ -167,114 +168,7 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_Color5551Morph, &VertexDecoderJitCache::Jit_Color5551Morph},
};
-void GTA_Tu8_C5551_Ps16(const u8 *srcp, u8 *dstp, int count, const UVScale *uvScaleOffset) {
- struct GTAVTX {
- union {
- struct {
- u8 u;
- u8 v;
- };
- u16 uv;
- };
- u16 col;
- s16 x;
- s16 y;
- s16 z;
- };
- // NOTE: This might be different for different vertex format.
- struct OutVTX {
- float u;
- float v;
- uint32_t col;
- float x;
- float y;
- float z;
- };
- const GTAVTX *src = (const GTAVTX *)srcp;
- OutVTX *dst = (OutVTX *)dstp;
- float uscale = uvScaleOffset->uScale * (1.0f / 128.0f);
- float vscale = uvScaleOffset->vScale * (1.0f / 128.0f);
- float uoff = uvScaleOffset->uOff;
- float voff = uvScaleOffset->vOff;
-
- u32 alpha = 0xFFFFFFFF;
-
-#if PPSSPP_ARCH(SSE2)
- __m128 uvOff = _mm_setr_ps(uoff, voff, uoff, voff);
- __m128 uvScale = _mm_setr_ps(uscale, vscale, uscale, vscale);
- __m128 posScale = _mm_set1_ps(1.0f / 32768.0f);
- __m128i rmask = _mm_set1_epi32(0x001F);
- __m128i gmask = _mm_set1_epi32(0x03E0);
- __m128i bmask = _mm_set1_epi32(0x7c00);
- __m128i amask = _mm_set1_epi32(0x8000);
- __m128i lowbits = _mm_set1_epi32(0x00070707);
-
- // Two vertices at a time, we can share some calculations.
- for (int i = 0; i < count; i += 2) {
- __m128i pos0 = _mm_loadl_epi64((const __m128i *) & src[i].x);
- __m128i pos1 = _mm_loadl_epi64((const __m128i *) & src[i + 1].x);
- // Translate UV, combined. TODO: Can possibly shuffle UV and col together here
- uint32_t uv0 = (uint32_t)src[i].uv | ((uint32_t)src[i + 1].uv << 16);
- uint64_t col0 = (uint64_t)src[i].col | ((uint64_t)src[i + 1].col << 32);
- __m128i pos0_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos0, pos0), 16);
- __m128i pos1_32 = _mm_srai_epi32(_mm_unpacklo_epi16(pos1, pos1), 16);
- __m128 pos0_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos0_32), posScale);
- __m128 pos1_ext = _mm_mul_ps(_mm_cvtepi32_ps(pos1_32), posScale);
-
- __m128i uv8 = _mm_set1_epi32(uv0);
- __m128i uv16 = _mm_unpacklo_epi8(uv8, uv8);
- __m128i uv32 = _mm_srli_epi32(_mm_unpacklo_epi16(uv16, uv16), 24);
- __m128d uvf = _mm_castps_pd(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(uv32), uvScale), uvOff));
- alpha &= col0;
-
- // Combined RGBA
- __m128i col = _mm_set1_epi64x(col0);
- __m128i r = _mm_slli_epi32(_mm_and_si128(col, rmask), 8 - 5);
- __m128i g = _mm_slli_epi32(_mm_and_si128(col, gmask), 16 - 10);
- __m128i b = _mm_slli_epi32(_mm_and_si128(col, bmask), 24 - 15);
- __m128i a = _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(col, amask), 16), 7);
- col = _mm_or_si128(_mm_or_si128(r, g), b);
- col = _mm_or_si128(col, _mm_and_si128(_mm_srli_epi32(col, 5), lowbits));
- col = _mm_or_si128(col, a);
-
- // TODO: Mix into fewer stores.
- _mm_storeu_ps(&dst[i].x, pos0_ext);
- _mm_storeu_ps(&dst[i + 1].x, pos1_ext);
- _mm_storel_pd((double *)&dst[i].u, uvf);
- _mm_storeh_pd((double *)&dst[i + 1].u, uvf);
- dst[i].col = _mm_cvtsi128_si32(col);
- dst[i + 1].col = _mm_cvtsi128_si32(_mm_shuffle_epi32(col, _MM_SHUFFLE(1, 1, 1, 1)));
- }
-
- alpha = alpha & (alpha >> 16);
-
-#else
-
- for (int i = 0; i < count; i++) {
- float u = src[i].u * uscale + uoff;
- float v = src[i].v * vscale + voff;
- alpha &= src[i].col;
- uint32_t color = RGBA5551ToRGBA8888(src[i].col);
- float x = src[i].x * (1.0f / 32768.0f);
- float y = src[i].y * (1.0f / 32768.0f);
- float z = src[i].z * (1.0f / 32768.0f);
- dst[i].col = color;
- dst[i].u = u;
- dst[i].v = v;
- dst[i].x = x;
- dst[i].y = y;
- dst[i].z = z;
- }
-
-#endif
- gstate_c.vertexFullAlpha = (alpha >> 15) & 1;
-}
-
JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int32_t *jittedSize) {
- if ((dec.fmt_ & ~0x04000000) == (GE_VTYPE_TC_8BIT | GE_VTYPE_COL_5551 | GE_VTYPE_POS_16BIT)) {
- return &GTA_Tu8_C5551_Ps16;
- }
-
dec_ = &dec;
BeginWrite(4096);
const u8 *start = this->AlignCode16();
diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj
index 87c79f3462..147e31c724 100644
--- a/GPU/GPU.vcxproj
+++ b/GPU/GPU.vcxproj
@@ -371,6 +371,7 @@
+
@@ -511,6 +512,7 @@
true
+
true
diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters
index 6b35f8f33a..c59e28ed7b 100644
--- a/GPU/GPU.vcxproj.filters
+++ b/GPU/GPU.vcxproj.filters
@@ -273,6 +273,9 @@
Common
+
+ Common
+
@@ -542,6 +545,9 @@
Common
+
+ Common
+
diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj b/UWP/GPU_UWP/GPU_UWP.vcxproj
index ffc2d6f0c0..9e22d246ef 100644
--- a/UWP/GPU_UWP/GPU_UWP.vcxproj
+++ b/UWP/GPU_UWP/GPU_UWP.vcxproj
@@ -136,6 +136,7 @@
+
@@ -203,6 +204,7 @@
+
@@ -257,4 +259,4 @@
-
\ No newline at end of file
+
diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters
index 314574f863..804554a2c5 100644
--- a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters
+++ b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters
@@ -22,6 +22,7 @@
+
@@ -86,6 +87,7 @@
+
@@ -130,4 +132,4 @@
-
\ No newline at end of file
+
diff --git a/android/jni/Android.mk b/android/jni/Android.mk
index ac949a57ef..c49ac79c4e 100644
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@@ -481,6 +481,7 @@ EXEC_AND_LIB_FILES := \
$(SRC)/GPU/Common/ReinterpretFramebuffer.cpp \
$(SRC)/GPU/Common/DepthBufferCommon.cpp \
$(SRC)/GPU/Common/VertexDecoderCommon.cpp.arm \
+ $(SRC)/GPU/Common/VertexDecoderHandwritten.cpp.arm \
$(SRC)/GPU/Common/TextureCacheCommon.cpp.arm \
$(SRC)/GPU/Common/TextureScalerCommon.cpp.arm \
$(SRC)/GPU/Common/ShaderCommon.cpp \
diff --git a/libretro/Makefile.common b/libretro/Makefile.common
index e91b78e722..8a5b0b9214 100644
--- a/libretro/Makefile.common
+++ b/libretro/Makefile.common
@@ -463,6 +463,7 @@ SOURCES_C +=\
SOURCES_CXX += \
$(GPUCOMMONDIR)/Draw2D.cpp \
$(GPUCOMMONDIR)/VertexDecoderCommon.cpp \
+ $(GPUCOMMONDIR)/VertexDecoderHandwritten.cpp \
$(GPUCOMMONDIR)/GPUStateUtils.cpp \
$(GPUCOMMONDIR)/DrawEngineCommon.cpp \
$(GPUCOMMONDIR)/SplineCommon.cpp \