Merge pull request #10899 from hrydgard/restore-hw-skinning-part-2

Restore hw skinning part 2
2025-03-03 03:27:19 +00:00 · 2018-04-10 14:39:23 +02:00 · 2018-04-10 14:39:23 +02:00 · 993423f82d
commit 993423f82d
parent b58bc7c799 02ea4b5efa
46 changed files with 1444 additions and 368 deletions
--- a/Core/Config.cpp
+++ b/Core/Config.cpp
@ -510,6 +510,7 @@ static ConfigSetting graphicsSettings[] = {
 	ReportedConfigSetting("RenderingMode", &g_Config.iRenderingMode, &DefaultRenderingMode, true, true),
 	ConfigSetting("SoftwareRenderer", &g_Config.bSoftwareRendering, false, true, true),
 	ReportedConfigSetting("HardwareTransform", &g_Config.bHardwareTransform, true, true, true),
+	ReportedConfigSetting("SoftwareSkinning", &g_Config.bSoftwareSkinning, true, true, true),
 	ReportedConfigSetting("TextureFiltering", &g_Config.iTexFiltering, 1, true, true),
 	ReportedConfigSetting("BufferFiltering", &g_Config.iBufFilter, 1, true, true),
 	ReportedConfigSetting("InternalResolution", &g_Config.iInternalResolution, &DefaultInternalResolution, true, true),
--- a/Core/Config.h
+++ b/Core/Config.h
@ -155,6 +155,7 @@ public:
 	int iGPUBackend;
 	bool bSoftwareRendering;
 	bool bHardwareTransform; // only used in the GLES backend
+	bool bSoftwareSkinning;  // may speed up some games

 	int iRenderingMode; // 0 = non-buffered rendering 1 = buffered rendering
 	int iTexFiltering; // 1 = off , 2 = nearest , 3 = linear , 4 = linear(CG)
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@ -379,9 +379,13 @@ bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vector<GPUDebugV
 // The implementation is initially a bit inefficient but shouldn't be a big deal.
 // An intermediate buffer of not-easy-to-predict size is stored at bufPtr.
 u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, VertexDecoder *dec, int lowerBound, int upperBound, u32 vertType) {
-	// First, decode the vertices into a GPU compatible format.
+	// First, decode the vertices into a GPU compatible format. This step can be eliminated but will need a separate
+	// implementation of the vertex decoder.
 	dec->DecodeVerts(bufPtr, inPtr, lowerBound, upperBound);

+	// OK, morphing eliminated but bones still remain to be taken care of.
+	// Let's do a partial software transform where we only do skinning.
+
 	VertexReader reader(bufPtr, dec->GetDecVtxFmt(), vertType);

 	SimpleVertex *sverts = (SimpleVertex *)outPtr;
@ -393,7 +397,56 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
 		(u8)gstate.getMaterialAmbientA(),
 	};

-	{
+	// Let's have two separate loops, one for non skinning and one for skinning.
+	if (!g_Config.bSoftwareSkinning && (vertType & GE_VTYPE_WEIGHT_MASK) != GE_VTYPE_WEIGHT_NONE) {
+		int numBoneWeights = vertTypeGetNumBoneWeights(vertType);
+		for (int i = lowerBound; i <= upperBound; i++) {
+			reader.Goto(i - lowerBound);
+			SimpleVertex &sv = sverts[i];
+			if (vertType & GE_VTYPE_TC_MASK) {
+				reader.ReadUV(sv.uv);
+			}
+
+			if (vertType & GE_VTYPE_COL_MASK) {
+				reader.ReadColor0_8888(sv.color);
+			} else {
+				memcpy(sv.color, defaultColor, 4);
+			}
+
+			float nrm[3], pos[3];
+			float bnrm[3], bpos[3];
+
+			if (vertType & GE_VTYPE_NRM_MASK) {
+				// Normals are generated during tessellation anyway, not sure if any need to supply
+				reader.ReadNrm(nrm);
+			} else {
+				nrm[0] = 0;
+				nrm[1] = 0;
+				nrm[2] = 1.0f;
+			}
+			reader.ReadPos(pos);
+
+			// Apply skinning transform directly
+			float weights[8];
+			reader.ReadWeights(weights);
+			// Skinning
+			Vec3Packedf psum(0, 0, 0);
+			Vec3Packedf nsum(0, 0, 0);
+			for (int w = 0; w < numBoneWeights; w++) {
+				if (weights[w] != 0.0f) {
+					Vec3ByMatrix43(bpos, pos, gstate.boneMatrix + w * 12);
+					Vec3Packedf tpos(bpos);
+					psum += tpos * weights[w];
+
+					Norm3ByMatrix43(bnrm, nrm, gstate.boneMatrix + w * 12);
+					Vec3Packedf tnorm(bnrm);
+					nsum += tnorm * weights[w];
+				}
+			}
+			sv.pos = psum;
+			sv.nrm = nsum;
+		}
+	} else {
 		for (int i = lowerBound; i <= upperBound; i++) {
 			reader.Goto(i - lowerBound);
 			SimpleVertex &sv = sverts[i];
@ -655,7 +708,7 @@ void DrawEngineCommon::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim,
 	numDrawCalls++;
 	vertexCountInDrawCalls_ += vertexCount;

-	if (vertTypeID & GE_VTYPE_WEIGHT_MASK) {
+	if (g_Config.bSoftwareSkinning && (vertTypeID & GE_VTYPE_WEIGHT_MASK)) {
 		DecodeVertsStep(decoded, decodeCounter_, decodedVerts_);
 		decodeCounter_++;
 	}
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@ -133,7 +133,7 @@ protected:
 	TransformedVertex *transformed = nullptr;
 	TransformedVertex *transformedExpanded = nullptr;

-	// Defer all vertex decoding to a "Flush" (except when skinning)
+	// Defer all vertex decoding to a "Flush" (except when software skinning)
 	struct DeferredDrawCall {
 		void *verts;
 		void *inds;
--- a/GPU/Common/ShaderCommon.h
+++ b/GPU/Common/ShaderCommon.h
@ -76,8 +76,14 @@ enum : uint64_t {
 	DIRTY_WORLDMATRIX = 1ULL << 21,
 	DIRTY_VIEWMATRIX = 1ULL << 22,
 	DIRTY_TEXMATRIX = 1ULL << 23,
-
-	// 8 free bits here where bones used to be!
+	DIRTY_BONEMATRIX0 = 1ULL << 24,  // NOTE: These must be under 32
+	DIRTY_BONEMATRIX1 = 1ULL << 25,
+	DIRTY_BONEMATRIX2 = 1ULL << 26,
+	DIRTY_BONEMATRIX3 = 1ULL << 27,
+	DIRTY_BONEMATRIX4 = 1ULL << 28,
+	DIRTY_BONEMATRIX5 = 1ULL << 29,
+	DIRTY_BONEMATRIX6 = 1ULL << 30,
+	DIRTY_BONEMATRIX7 = 1ULL << 31,

 	// These are for hardware tessellation
 	DIRTY_BEZIERSPLINE = 1ULL << 32,
@ -85,6 +91,8 @@ enum : uint64_t {

 	// space for 7 more uniforms.

+	DIRTY_BONE_UNIFORMS = 0xFF000000ULL,
+
 	DIRTY_ALL_UNIFORMS = 0x3FFFFFFFFULL,
 	DIRTY_ALL_LIGHTS = DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3,

--- a/GPU/Common/ShaderId.cpp
+++ b/GPU/Common/ShaderId.cpp
@ -33,6 +33,7 @@ std::string VertexShaderDesc(const ShaderID &id) {
 	int ls1 = id.Bits(VS_BIT_LS1, 2);

 	if (uvgMode) desc << uvgModes[uvgMode];
+	if (id.Bit(VS_BIT_ENABLE_BONES)) desc << "Bones:" << (id.Bits(VS_BIT_BONES, 3) + 1) << " ";
 	// Lights
 	if (id.Bit(VS_BIT_LIGHTING_ENABLE)) {
 		desc << "Light: ";
@ -102,6 +103,16 @@ void ComputeVertexShaderID(ShaderID *id_out, u32 vertType, bool useHWTransform)
 			id.SetBits(VS_BIT_LS1, 2, gstate.getUVLS1());
 		}

+		// Bones.
+		bool enableBones = vertTypeIsSkinningEnabled(vertType);
+		id.SetBit(VS_BIT_ENABLE_BONES, enableBones);
+		if (enableBones) {
+			id.SetBits(VS_BIT_BONES, 3, TranslateNumBones(vertTypeGetNumBoneWeights(vertType)) - 1);
+			// 2 bits. We should probably send in the weight scalefactor as a uniform instead,
+			// or simply preconvert all weights to floats.
+			id.SetBits(VS_BIT_WEIGHT_FMTSCALE, 2, (vertType & GE_VTYPE_WEIGHT_MASK) >> GE_VTYPE_WEIGHT_SHIFT);
+		}
+
 		// Okay, d[1] coming up. ==============
 		if (gstate.isLightingEnabled() || doShadeMapping) {
 			// doShadeMapping is stored as UVGenMode, so this is enough for isLightingEnabled.
--- a/GPU/Common/ShaderId.h
+++ b/GPU/Common/ShaderId.h
@ -7,6 +7,7 @@

 // TODO: There will be additional bits, indicating that groups of these will be
 // sent to the shader and processed there. This will cut down the number of shaders ("ubershader approach")
+// This is probably only really worth doing for lighting and bones.
 enum {
 	VS_BIT_LMODE = 0,
 	VS_BIT_IS_THROUGH = 1,
@ -28,7 +29,10 @@ enum {
 	VS_BIT_UVPROJ_MODE = 18,  // 2, can overlap with LS0
 	VS_BIT_LS0 = 18,  // 2
 	VS_BIT_LS1 = 20,  // 2
-	// 22 - 31 are free.
+	VS_BIT_BONES = 22,  // 3 should be enough, not 8
+	// 25 - 29 are free.
+	VS_BIT_ENABLE_BONES = 30,
+	// 31 is free.
 	VS_BIT_LIGHT0_COMP = 32,  // 2 bits
 	VS_BIT_LIGHT0_TYPE = 34,  // 2 bits
 	VS_BIT_LIGHT1_COMP = 36,  // 2 bits
--- a/GPU/Common/ShaderUniforms.cpp
+++ b/GPU/Common/ShaderUniforms.cpp
@ -246,3 +246,11 @@ void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms) {
 		}
 	}
 }
+
+// Refreshes the bone matrices held in *ub. Only bones whose DIRTY_BONEMATRIXn bit is set in
+// dirtyUniforms are converted from gstate.boneMatrix; the remaining entries are left as they were.
+void BoneUpdateUniforms(UB_VS_Bones *ub, uint64_t dirtyUniforms) {
+	for (int bone = 0; bone < 8; ++bone) {
+		const bool isDirty = (dirtyUniforms & (DIRTY_BONEMATRIX0 << bone)) != 0;
+		if (!isDirty)
+			continue;
+		// Each source bone matrix occupies 12 consecutive floats.
+		ConvertMatrix4x3To3x4Transposed(ub->bones[bone], gstate.boneMatrix + 12 * bone);
+	}
+}
--- a/GPU/Common/ShaderUniforms.h
+++ b/GPU/Common/ShaderUniforms.h
@ -159,5 +159,22 @@ R"(	float4 u_ambient;
 	float3 u_lightspecular3;
 )";

+// CPU-side copy of the bone matrix uniform data: 8 bones of 12 floats each (96 floats, i.e. 384 bytes with 4-byte floats, not 512).
+// With some cleverness, we could get away with uploading just half of this when only the first four or five bones are being used.
+struct UB_VS_Bones {
+	float bones[8][12];  // bones[i] is filled by BoneUpdateUniforms() from gstate.boneMatrix + 12 * i.
+};
+
+static const char *ub_vs_bonesStr =  // Shader declaration text: an array of 8 mat3x4 matrices named m. Presumably spliced into a uniform block - confirm at the use site.
+R"(	mat3x4 m[8];
+)";
+
+// HLSL code is shared so these names are changed to match those in DX9.
+static const char *cb_vs_bonesStr =  // Shader declaration text: an array of 8 float4x3 matrices named u_bone.
+R"(	float4x3 u_bone[8];
+)";
+
 void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipViewport);
 void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms);
+void BoneUpdateUniforms(UB_VS_Bones *ub, uint64_t dirtyUniforms);
+
--- a/GPU/Common/SoftwareTransformCommon.cpp
+++ b/GPU/Common/SoftwareTransformCommon.cpp
@ -144,6 +144,8 @@ void SoftwareTransform(
 		vscale /= gstate_c.curTextureHeight;
 	}

+	bool skinningEnabled = vertTypeIsSkinningEnabled(vertType);
+
 	const int w = gstate.getTextureWidth(0);
 	const int h = gstate.getTextureHeight(0);
 	float widthFactor = (float) w / (float) gstate_c.curTextureWidth;
@ -211,14 +213,48 @@ void SoftwareTransform(
 			Vec3f worldnormal(0, 0, 1);
 			reader.ReadPos(pos);

-			Vec3ByMatrix43(out, pos, gstate.worldMatrix);
-			if (reader.hasNormal()) {
-				reader.ReadNrm(normal.AsArray());
-				if (gstate.areNormalsReversed()) {
-					normal = -normal;
+			if (!skinningEnabled) {
+				Vec3ByMatrix43(out, pos, gstate.worldMatrix);
+				if (reader.hasNormal()) {
+					reader.ReadNrm(normal.AsArray());
+					if (gstate.areNormalsReversed()) {
+						normal = -normal;
+					}
+					Norm3ByMatrix43(worldnormal.AsArray(), normal.AsArray(), gstate.worldMatrix);
+					worldnormal = worldnormal.Normalized();
+				}
+			} else {
+				float weights[8];
+				reader.ReadWeights(weights);
+				if (reader.hasNormal())
+					reader.ReadNrm(normal.AsArray());
+
+				// Skinning
+				Vec3f psum(0, 0, 0);
+				Vec3f nsum(0, 0, 0);
+				for (int i = 0; i < vertTypeGetNumBoneWeights(vertType); i++) {
+					if (weights[i] != 0.0f) {
+						Vec3ByMatrix43(out, pos, gstate.boneMatrix+i*12);
+						Vec3f tpos(out);
+						psum += tpos * weights[i];
+						if (reader.hasNormal()) {
+							Vec3f norm;
+							Norm3ByMatrix43(norm.AsArray(), normal.AsArray(), gstate.boneMatrix+i*12);
+							nsum += norm * weights[i];
+						}
+					}
+				}
+
+				// Yes, we really must multiply by the world matrix too.
+				Vec3ByMatrix43(out, psum.AsArray(), gstate.worldMatrix);
+				if (reader.hasNormal()) {
+					normal = nsum;
+					if (gstate.areNormalsReversed()) {
+						normal = -normal;
+					}
+					Norm3ByMatrix43(worldnormal.AsArray(), normal.AsArray(), gstate.worldMatrix);
+					worldnormal = worldnormal.Normalized();
 				}
-				Norm3ByMatrix43(worldnormal.AsArray(), normal.AsArray(), gstate.worldMatrix);
-				worldnormal = worldnormal.Normalized();
 			}

 			// Perform lighting here if enabled. don't need to check through, it's checked above.
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
@ -111,6 +111,9 @@ static const ARMReg srcNEON = Q2;
 static const ARMReg accNEON = Q3;

 static const JitLookup jitLookup[] = {
+	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
+	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
+	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
 	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
 	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
 	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
@ -229,7 +232,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	// Add code to convert matrices to 4x4.
 	// Later we might want to do this when the matrices are loaded instead.
 	int boneCount = 0;
-	if (NEONSkinning && dec.weighttype) {
+	if (NEONSkinning && dec.weighttype && g_Config.bSoftwareSkinning) {
 		// Copying from R3 to R4
 		MOVP2R(R3, gstate.boneMatrix);
 		MOVP2R(R4, bones);
@ -323,6 +326,55 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	return (JittedVertexDecoder)start;
 }

+void VertexDecoderJitCache::Jit_WeightsU8() {  // Emits code that copies the u8 weights unconverted and zero-pads them to a multiple of 4.
+	// Basic implementation - a byte at a time. TODO: Optimize
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		LDRB(tempReg1, srcReg, dec_->weightoff + j);  // Load weight j from the source vertex.
+		STRB(tempReg1, dstReg, dec_->decFmt.w0off + j);  // Store it at the same index in the decoded weights.
+	}
+	if (j & 3) {  // A zero is only needed when nweights is not a multiple of 4.
+		// Create a zero register. Might want to make a fixed one.
+		EOR(scratchReg, scratchReg, scratchReg);
+	}
+	while (j & 3) {  // Zero additional weights rounding up to 4.
+		STRB(scratchReg, dstReg, dec_->decFmt.w0off + j);
+		j++;
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsU16() {  // Emits code that copies the u16 weights unconverted and zero-pads them to a multiple of 4.
+	// Basic implementation - a short at a time. TODO: Optimize
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);  // Load weight j (2 bytes each) from the source vertex.
+		STRH(tempReg1, dstReg, dec_->decFmt.w0off + j * 2);  // Store it at the same index in the decoded weights.
+	}
+	if (j & 3) {  // A zero is only needed when nweights is not a multiple of 4.
+		// Create a zero register. Might want to make a fixed one.
+		EOR(scratchReg, scratchReg, scratchReg);
+	}
+	while (j & 3) {  // Zero additional weights rounding up to 4.
+		STRH(scratchReg, dstReg, dec_->decFmt.w0off + j * 2);
+		j++;
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsFloat() {  // Emits code that copies the float weights bit-for-bit and zero-pads them to a multiple of 4.
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		LDR(tempReg1, srcReg, dec_->weightoff + j * 4);  // Moved as raw 32-bit words through an integer register; no float math is done.
+		STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
+	}
+	if (j & 3) {  // A zero is only needed when nweights is not a multiple of 4.
+		EOR(tempReg1, tempReg1, tempReg1);  // tempReg1 is free again here, so it doubles as the zero source.
+	}
+	while (j & 3) {  // Zero additional weights rounding up to 4.
+		STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
+		j++;
+	}
+}
+
 static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 };
 static const ARMReg neonWeightRegsD[4] = { D4, D5, D6, D7 };
 static const ARMReg neonWeightRegsQ[2] = { Q2, Q3 };
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@ -85,6 +85,9 @@ static const ARM64Reg neonWeightRegsQ[2] = { Q3, Q2 };  // reverse order to prev
 // Q16+ are free-for-all for matrices. In 16 registers, we can fit 4 4x4 matrices.

 static const JitLookup jitLookup[] = {
+	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
+	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
+	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
 	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
 	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
 	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
@ -193,7 +196,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	// Add code to convert matrices to 4x4.
 	// Later we might want to do this when the matrices are loaded instead.
 	int boneCount = 0;
-	if (dec.weighttype) {
+	if (dec.weighttype && g_Config.bSoftwareSkinning) {
 		// Copying from R3 to R4
 		MOVP2R(X3, gstate.boneMatrix);
 		MOVP2R(X4, bones);
@ -353,6 +356,44 @@ void VertexDecoderJitCache::Jit_ApplyWeights() {
 	}
 }

+void VertexDecoderJitCache::Jit_WeightsU8() {  // Emits code that copies the u8 weights unconverted and zero-pads them to a multiple of 4.
+	// Basic implementation - a byte at a time. TODO: Optimize
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j);  // Load weight j from the source vertex.
+		STRB(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j);  // Store it at the same index in the decoded weights.
+	}
+	while (j & 3) {  // Zero additional weights rounding up to 4, storing straight from the zero register WZR.
+		STRB(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j);
+		j++;
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsU16() {  // Emits code that copies the u16 weights unconverted and zero-pads them to a multiple of 4.
+	// Basic implementation - a short at a time. TODO: Optimize
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 2);  // Load weight j (2 bytes each) from the source vertex.
+		STRH(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 2);  // Store it at the same index in the decoded weights.
+	}
+	while (j & 3) {  // Zero additional weights rounding up to 4, storing straight from the zero register WZR.
+		STRH(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 2);
+		j++;
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsFloat() {  // Emits code that copies the float weights bit-for-bit and zero-pads them to a multiple of 4.
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 4);  // Moved as raw words through tempReg1; no float math is done.
+		STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
+	}
+	while (j & 3) {  // Zero additional weights rounding up to 4.
+		STR(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 4);
+		j++;
+	}
+}
+
 void VertexDecoderJitCache::Jit_WeightsU8Skin() {
 	// Weight is first so srcReg is correct.
 	switch (dec_->nweights) {
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@ -41,7 +41,7 @@ static const u8 nrmsize[4] = { 0, 3, 6, 12 }, nrmalign[4] = { 0, 1, 2, 4 };
 static const u8 possize[4] = { 3, 3, 6, 12 }, posalign[4] = { 1, 1, 2, 4 };
 static const u8 wtsize[4] = { 0, 1, 2, 4 }, wtalign[4] = { 0, 1, 2, 4 };

-// This array is only used when non-jitted - when jitted, the matrix
+// Used when software skinning. This array is only used when non-jitted - when jitted, the matrix
 // is kept in registers.
 alignas(16) static float skinMatrix[12];

@ -49,6 +49,13 @@ inline int align(int n, int align) {
 	return (n + (align - 1)) & ~(align - 1);
 }

+// Maps a bone count to the count actually used: 0 stays 0, anything below 4 becomes 4,
+// and counts of 4 or more are returned unchanged.
+int TranslateNumBones(int bones) {
+	if (bones == 0)
+		return 0;
+	// Counts of 5-7 are deliberately not rounded up to 8: doing so caused drawing problems in FF:CC.
+	return bones < 4 ? 4 : bones;
+}
+
 int DecFmtSize(u8 fmt) {
 	switch (fmt) {
 	case DEC_NONE: return 0;
@ -170,6 +177,67 @@ void PrintDecodedVertex(VertexReader &vtx) {
 VertexDecoder::VertexDecoder() : decoded_(nullptr), ptr_(nullptr), jitted_(0), jittedSize_(0) {
 }

+// Copies the 8-bit weights unconverted into the decoded vertex at decFmt.w0off,
+// then zero-fills until the weight count is a multiple of 4.
+void VertexDecoder::Step_WeightsU8() const
+{
+	const u8 *src = (const u8 *)(ptr_);
+	u8 *dst = (u8 *)(decoded_ + decFmt.w0off);
+	int n = 0;
+	while (n < nweights) {
+		dst[n] = src[n];
+		n++;
+	}
+	// Pad with zero weights up to the next multiple of 4.
+	for (; (n & 3) != 0; n++)
+		dst[n] = 0;
+}
+
+// Copies the 16-bit weights unconverted into the decoded vertex at decFmt.w0off,
+// then zero-fills until the weight count is a multiple of 4.
+void VertexDecoder::Step_WeightsU16() const
+{
+	const u16 *src = (const u16 *)(ptr_);
+	u16 *dst = (u16 *)(decoded_ + decFmt.w0off);
+	int n = 0;
+	while (n < nweights) {
+		dst[n] = src[n];
+		n++;
+	}
+	// Pad with zero weights up to the next multiple of 4.
+	for (; (n & 3) != 0; n++)
+		dst[n] = 0;
+}
+
+// Expands the 8-bit weights to floats, scaling by 1/128 (so a raw 128 becomes 1.0f),
+// then zero-fills until the weight count is a multiple of 4.
+void VertexDecoder::Step_WeightsU8ToFloat() const
+{
+	const u8 *src = (const u8 *)(ptr_);
+	float *dst = (float *)(decoded_ + decFmt.w0off);
+	int n = 0;
+	for (; n < nweights; n++)
+		dst[n] = (float)src[n] * (1.0f / 128.0f);
+	// Pad with zero weights up to the next multiple of 4.
+	for (; (n & 3) != 0; n++)
+		dst[n] = 0;
+}
+
+// Expands the 16-bit weights to floats, scaling by 1/32768 (so a raw 32768 becomes 1.0f),
+// then zero-fills until the weight count is a multiple of 4.
+void VertexDecoder::Step_WeightsU16ToFloat() const
+{
+	const u16 *src = (const u16 *)(ptr_);
+	float *dst = (float *)(decoded_ + decFmt.w0off);
+	int n = 0;
+	for (; n < nweights; n++)
+		dst[n] = (float)src[n] * (1.0f / 32768.0f);
+	// Pad with zero weights up to the next multiple of 4.
+	for (; (n & 3) != 0; n++)
+		dst[n] = 0;
+}
+
+// Copies float weights into the decoded vertex unchanged, then zero-fills until the
+// weight count is a multiple of 4. Float weights should be uncommon.
+// NOTE(review): no scaling happens in this step. Any factor needed to line these up with
+// the integer weight formats (an earlier note here mentioned multiplying by 2.0 and a
+// 0.0-2.0 fixed point range on the PSP) must be applied by the consumer - confirm there.
+void VertexDecoder::Step_WeightsFloat() const
+{
+	const float *src = (const float *)(ptr_);
+	float *dst = (float *)(decoded_ + decFmt.w0off);
+	int n = 0;
+	while (n < nweights) {
+		dst[n] = src[n];
+		n++;
+	}
+	// Pad with zero weights up to the next multiple of 4.
+	for (; (n & 3) != 0; n++)
+		dst[n] = 0.0f;
+}
+
 void VertexDecoder::ComputeSkinMatrix(const float weights[8]) const {
 	memset(skinMatrix, 0, sizeof(skinMatrix));
 	for (int j = 0; j < nweights; j++) {
@ -802,6 +870,20 @@ void VertexDecoder::Step_PosFloatMorphSkin() const {
 	Vec3ByMatrix43(v, pos, skinMatrix);
 }

+// Weight steps that keep the source format, indexed by weighttype.
+static const StepFunction wtstep[4] = {
+	nullptr,  // Slot 0 has no step.
+	&VertexDecoder::Step_WeightsU8,
+	&VertexDecoder::Step_WeightsU16,
+	&VertexDecoder::Step_WeightsFloat,
+};
+
+// Weight steps that always produce float weights, indexed by weighttype.
+static const StepFunction wtstepToFloat[4] = {
+	nullptr,  // Slot 0 has no step.
+	&VertexDecoder::Step_WeightsU8ToFloat,
+	&VertexDecoder::Step_WeightsU16ToFloat,
+	&VertexDecoder::Step_WeightsFloat,  // Already float, so the plain copy step is reused.
+};
+
 // TODO: Morph weights correctly! This is missing. Not sure if any game actually
 // use this functionality at all.

@ -1000,7 +1082,7 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 		DEBUG_LOG(G3D, "VTYPE: THRU=%i TC=%i COL=%i POS=%i NRM=%i WT=%i NW=%i IDX=%i MC=%i", (int)throughmode, tc, col, pos, nrm, weighttype, nweights, idx, morphcount);
 	}

-	bool skinning = weighttype != 0;
+	bool skinInDecode = weighttype != 0 && g_Config.bSoftwareSkinning;

 	if (weighttype) { // && nweights?
 		weightoff = size;
@ -1009,11 +1091,43 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 		if (wtalign[weighttype] > biggest)
 			biggest = wtalign[weighttype];

-		// No visible output, computes a matrix that is passed through the skinMatrix variable
-		// to the "nrm" and "pos" steps.
-		// Technically we should support morphing the weights too, but I have a hard time
-		// imagining that any game would use that.. but you never know.
-		steps_[numSteps_++] = wtstep_skin[weighttype];
+		if (skinInDecode) {
+			// No visible output, computes a matrix that is passed through the skinMatrix variable
+			// to the "nrm" and "pos" steps.
+			// Technically we should support morphing the weights too, but I have a hard time
+			// imagining that any game would use that.. but you never know.
+			steps_[numSteps_++] = wtstep_skin[weighttype];
+		} else {
+			int fmtBase = DEC_FLOAT_1;
+			if (options.expandAllWeightsToFloat) {
+				steps_[numSteps_++] = wtstepToFloat[weighttype];
+				fmtBase = DEC_FLOAT_1;
+			} else {
+				steps_[numSteps_++] = wtstep[weighttype];
+				if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) {
+					fmtBase = DEC_U8_1;
+				} else if (weighttype == GE_VTYPE_WEIGHT_16BIT >> GE_VTYPE_WEIGHT_SHIFT) {
+					fmtBase = DEC_U16_1;
+				} else if (weighttype == GE_VTYPE_WEIGHT_FLOAT >> GE_VTYPE_WEIGHT_SHIFT) {
+					fmtBase = DEC_FLOAT_1;
+				}
+			}
+
+			int numWeights = TranslateNumBones(nweights);
+
+			if (numWeights <= 4) {
+				decFmt.w0off = decOff;
+				decFmt.w0fmt = fmtBase + numWeights - 1;
+				decOff += DecFmtSize(decFmt.w0fmt);
+			} else {
+				decFmt.w0off = decOff;
+				decFmt.w0fmt = fmtBase + 3;
+				decOff += DecFmtSize(decFmt.w0fmt);
+				decFmt.w1off = decOff;
+				decFmt.w1fmt = fmtBase + numWeights - 5;
+				decOff += DecFmtSize(decFmt.w1fmt);
+			}
+		}
 	}

 	if (tc) {
@ -1071,7 +1185,7 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 		if (nrmalign[nrm] > biggest)
 			biggest = nrmalign[nrm];

-		if (skinning) {
+		if (skinInDecode) {
 			steps_[numSteps_++] = morphcount == 1 ? nrmstep_skin[nrm] : nrmstep_morphskin[nrm];
 			// After skinning, we always have three floats.
 			decFmt.nrmfmt = DEC_FLOAT_3;
@ -1122,7 +1236,7 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 			steps_[numSteps_++] = posstep_through[pos];
 			decFmt.posfmt = DEC_FLOAT_3;
 		} else {
-			if (skinning) {
+			if (skinInDecode) {
 				steps_[numSteps_++] = morphcount == 1 ? posstep_skin[pos] : posstep_morph_skin[pos];
 				decFmt.posfmt = DEC_FLOAT_3;
 			} else {
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@ -432,6 +432,9 @@ struct JitLookup {
 	JitStepFunction jitFunc;
 };

+// Collapse to fewer skinning shaders to reduce shader switching, which is expensive.
+int TranslateNumBones(int bones);
+
 typedef void(*JittedVertexDecoder)(const u8 *src, u8 *dst, int count);

 struct VertexDecoderOptions {
@ -458,6 +461,12 @@ public:

 	std::string GetString(DebugShaderStringType stringType);

+	void Step_WeightsU8() const;
+	void Step_WeightsU16() const;
+	void Step_WeightsU8ToFloat() const;
+	void Step_WeightsU16ToFloat() const;
+	void Step_WeightsFloat() const;
+
 	void ComputeSkinMatrix(const float weights[8]) const;

 	void Step_WeightsU8Skin() const;
@ -610,6 +619,12 @@ public:
 	JittedVertexDecoder Compile(const VertexDecoder &dec, int32_t *jittedSize);
 	void Clear();

+	void Jit_WeightsU8();
+	void Jit_WeightsU16();
+	void Jit_WeightsU8ToFloat();
+	void Jit_WeightsU16ToFloat();
+	void Jit_WeightsFloat();
+
 	void Jit_WeightsU8Skin();
 	void Jit_WeightsU16Skin();
 	void Jit_WeightsFloatSkin();
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@ -94,10 +94,16 @@ static const X64Reg fpScratchReg4 = XMM4;
 // on the interpreter if the compiler fails.

 static const JitLookup jitLookup[] = {
+	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
+	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
+	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
 	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
 	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
 	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},

+	{&VertexDecoder::Step_WeightsU8ToFloat, &VertexDecoderJitCache::Jit_WeightsU8ToFloat},
+	{&VertexDecoder::Step_WeightsU16ToFloat, &VertexDecoderJitCache::Jit_WeightsU16ToFloat},
+
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
 	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
 	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
@ -202,7 +208,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	// Add code to convert matrices to 4x4.
 	// Later we might want to do this when the matrices are loaded instead.
 	int boneCount = 0;
-	if (dec.weighttype) {
+	if (dec.weighttype && g_Config.bSoftwareSkinning) {
 		MOV(PTRBITS, R(tempReg1), ImmPtr(&threeMasks));
 		MOVAPS(XMM4, MatR(tempReg1));
 		MOV(PTRBITS, R(tempReg1), ImmPtr(&aOne));
@ -276,6 +282,175 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	return (JittedVertexDecoder)start;
 }

+void VertexDecoderJitCache::Jit_WeightsU8() {  // Emits code that copies the u8 weights unconverted, one 32-bit word per group of four weights.
+	switch (dec_->nweights) {  // Weights 0-3 are gathered in tempReg1, weights 4-7 (if any) in tempReg2.
+	case 1:
+		MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff));  // Zero-extending load, so the unused weight bytes end up 0.
+		break;
+	case 2:
+		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));  // Two weights at once; the upper two bytes end up 0.
+		break;
+	case 3:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));  // Reads four bytes although only three are weights.
+		AND(32, R(tempReg1), Imm32(0x00FFFFFF));  // Clear the top byte, which is not a weight.
+		break;
+	case 4:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		break;
+	case 5:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->weightoff + 4));  // Fifth weight, zero-extended.
+		break;
+	case 6:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));  // Fifth and sixth weights, zero-extended.
+		break;
+	case 7:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
+		AND(32, R(tempReg2), Imm32(0x00FFFFFF));  // Clear the top byte, which is not a weight.
+		break;
+	case 8:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
+		break;
+	}
+
+	if (dec_->nweights <= 4) {  // A single group: only the w0 slot is written.
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+	} else {  // Two groups: the second word goes to the w1 slot.
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w1off), R(tempReg2));
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsU16() {  // Emits code that copies the u16 weights unconverted and zero-pads them to a multiple of 4.
+	switch (dec_->nweights) {  // Counts 1-3 are handled completely inside the switch and return early.
+	case 1:
+		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));  // Weight 0, zero-extended so weight 1 becomes 0.
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));  // Weights 2 and 3 are zeroed.
+		return;
+
+	case 2:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));  // Weights 0 and 1 in one 32-bit move.
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));  // Weights 2 and 3 are zeroed.
+		return;
+
+	case 3:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));  // Weight 2, zero-extended so weight 3 becomes 0.
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
+		return;
+
+	case 4:
+		// Anything above 4 will do 4 here, and then the rest after.
+	case 5:
+	case 6:
+	case 7:
+	case 8:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
+		break;
+	}
+
+	// Basic implementation - a short at a time. TODO: Optimize
+	int j;
+	for (j = 4; j < dec_->nweights; j++) {  // NOTE(review): weights 4+ are addressed relative to w0off, not w1off; assumes the second group directly follows the first - confirm.
+		MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2));
+		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1));
+	}
+	while (j & 3) {  // Zero additional weights rounding up to 4.
+		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm16(0));
+		j++;
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {  // Emits code that converts the u8 weights to floats, writing one 16-byte store per group of four.
+	if (dec_->nweights >= 4) {
+		Jit_AnyU8ToFloat(dec_->weightoff, 32);  // First four weights (32 source bits); presumably leaves the floats in XMM3 - confirm in the helper.
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		if (dec_->nweights > 4) {
+			Jit_AnyU8ToFloat(dec_->weightoff + 4, (dec_->nweights - 4) * 8);  // Remaining 1-4 weights, 8 source bits each.
+			MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);  // Always writes 16 bytes at w1off, whatever the remaining count.
+		}
+	} else {
+		Jit_AnyU8ToFloat(dec_->weightoff, dec_->nweights * 8);  // Fewer than four weights: only nweights * 8 source bits are requested.
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {  // Emits code that converts the u16 weights to floats, writing one 16-byte store per group of four.
+	if (dec_->nweights >= 4) {
+		Jit_AnyU16ToFloat(dec_->weightoff, 64);  // First four weights (64 source bits); presumably leaves the floats in XMM3 - confirm in the helper.
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		if (dec_->nweights > 4) {
+			Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2, (dec_->nweights - 4) * 16);  // Remaining 1-4 weights, starting 8 bytes in, 16 source bits each.
+			MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);  // Always writes 16 bytes at w1off, whatever the remaining count.
+		}
+	} else {
+		Jit_AnyU16ToFloat(dec_->weightoff, dec_->nweights * 16);  // Fewer than four weights: only nweights * 16 source bits are requested.
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsFloat() {  // Emits code that copies the float weights unchanged; counts without a dedicated case use the scalar loop under default.
+	int j;
+	switch (dec_->nweights) {
+	case 1:
+		// MOVSS: When the source operand is a memory location and destination operand is an XMM register, the three high-order doublewords of the destination operand are cleared to all 0s.
+		MOVSS(XMM3, MDisp(srcReg, dec_->weightoff));
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);  // Full 16-byte store: the weight plus three zeros.
+		break;
+
+	case 2:
+		MOVQ_xmm(XMM3, MDisp(srcReg, dec_->weightoff));  // 64-bit load of two weights; presumably clears the upper half like MOVSS - confirm.
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		break;
+
+	case 4:
+		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		break;
+
+	case 5:
+		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
+		MOVSS(XMM4, MDisp(srcReg, dec_->weightoff + 16));  // Fifth weight only.
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);  // NOTE(review): second group is addressed as w0off + 16 rather than w1off - confirm they coincide.
+		break;
+
+	case 6:
+		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
+		MOVQ_xmm(XMM4, MDisp(srcReg, dec_->weightoff + 16));  // Fifth and sixth weights.
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
+		break;
+
+	case 8:
+		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
+		MOVUPS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
+		break;
+
+	default:
+		for (j = 0; j < dec_->nweights; j++) {  // Scalar fallback: one 32-bit move per weight through tempReg1.
+			MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
+			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
+		}
+		while (j & 3) {  // Zero additional weights rounding up to 4.
+			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0));
+			j++;
+		}
+		break;
+	}
+}
+
 void VertexDecoderJitCache::Jit_WeightsU8Skin() {
 	MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));

--- a/GPU/D3D11/DrawEngineD3D11.cpp
+++ b/GPU/D3D11/DrawEngineD3D11.cpp
@ -343,8 +343,8 @@ void DrawEngineD3D11::DoFlush() {

 		// Cannot cache vertex data with morph enabled.
 		bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK);
-		// Also avoid caching when skinning.
-		if (lastVType_ & GE_VTYPE_WEIGHT_MASK)
+		// Also avoid caching when software skinning.
+		if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK))
 			useCache = false;

 		if (useCache) {
--- a/GPU/D3D11/GPU_D3D11.cpp
+++ b/GPU/D3D11/GPU_D3D11.cpp
@ -103,6 +103,7 @@ GPU_D3D11::GPU_D3D11(GraphicsContext *gfxCtx, Draw::DrawContext *draw)

 	// No need to flush before the tex scale/offset commands if we are baking
 	// the tex scale/offset into the vertices anyway.
+	UpdateCmdInfo();
 	CheckGPUFeatures();

 	BuildReportingInfo();
@ -214,6 +215,7 @@ void GPU_D3D11::InitClear() {

 void GPU_D3D11::BeginHostFrame() {
 	GPUCommon::BeginHostFrame();
+	UpdateCmdInfo();
 	if (resized_) {
 		CheckGPUFeatures();
 		framebufferManager_->Resized();
--- a/GPU/D3D11/ShaderManagerD3D11.cpp
+++ b/GPU/D3D11/ShaderManagerD3D11.cpp
@ -93,19 +93,24 @@ ShaderManagerD3D11::ShaderManagerD3D11(ID3D11Device *device, ID3D11DeviceContext
 	codeBuffer_ = new char[16384];
 	memset(&ub_base, 0, sizeof(ub_base));
 	memset(&ub_lights, 0, sizeof(ub_lights));
+	memset(&ub_bones, 0, sizeof(ub_bones));

 	INFO_LOG(G3D, "sizeof(ub_base): %d", (int)sizeof(ub_base));
 	INFO_LOG(G3D, "sizeof(ub_lights): %d", (int)sizeof(ub_lights));
+	INFO_LOG(G3D, "sizeof(ub_bones): %d", (int)sizeof(ub_bones));

 	D3D11_BUFFER_DESC desc{sizeof(ub_base), D3D11_USAGE_DYNAMIC, D3D11_BIND_CONSTANT_BUFFER, D3D11_CPU_ACCESS_WRITE };
 	ASSERT_SUCCESS(device_->CreateBuffer(&desc, nullptr, &push_base));
 	desc.ByteWidth = sizeof(ub_lights);
 	ASSERT_SUCCESS(device_->CreateBuffer(&desc, nullptr, &push_lights));
+	desc.ByteWidth = sizeof(ub_bones);
+	ASSERT_SUCCESS(device_->CreateBuffer(&desc, nullptr, &push_bones));
 }

 ShaderManagerD3D11::~ShaderManagerD3D11() {
 	push_base->Release();
 	push_lights->Release();
+	push_bones->Release();
 	ClearShaders();
 	delete[] codeBuffer_;
 }
@ -154,15 +159,21 @@ uint64_t ShaderManagerD3D11::UpdateUniforms() {
 			memcpy(map.pData, &ub_lights, sizeof(ub_lights));
 			context_->Unmap(push_lights, 0);
 		}
+		if (dirty & DIRTY_BONE_UNIFORMS) {
+			BoneUpdateUniforms(&ub_bones, dirty);
+			context_->Map(push_bones, 0, D3D11_MAP_WRITE_DISCARD, 0, &map);
+			memcpy(map.pData, &ub_bones, sizeof(ub_bones));
+			context_->Unmap(push_bones, 0);
+		}
 	}
 	gstate_c.CleanUniforms();
 	return dirty;
 }

 void ShaderManagerD3D11::BindUniforms() {
-	ID3D11Buffer *vs_cbs[2] = { push_base, push_lights };
+	ID3D11Buffer *vs_cbs[3] = { push_base, push_lights, push_bones };
 	ID3D11Buffer *ps_cbs[1] = { push_base };
-	context_->VSSetConstantBuffers(0, 2, vs_cbs);
+	context_->VSSetConstantBuffers(0, 3, vs_cbs);
 	context_->PSSetConstantBuffers(0, 1, ps_cbs);
 }

--- a/GPU/D3D11/ShaderManagerD3D11.h
+++ b/GPU/D3D11/ShaderManagerD3D11.h
@ -121,10 +121,12 @@ private:
 	// Uniform block scratchpad. These (the relevant ones) are copied to the current pushbuffer at draw time.
 	UB_VS_FS_Base ub_base;
 	UB_VS_Lights ub_lights;
+	UB_VS_Bones ub_bones;

 	// Not actual pushbuffers, requires D3D11.1, let's try to live without that first.
 	ID3D11Buffer *push_base;
 	ID3D11Buffer *push_lights;
+	ID3D11Buffer *push_bones;

 	D3D11FragmentShader *lastFShader_;
 	D3D11VertexShader *lastVShader_;
--- a/GPU/Directx9/DrawEngineDX9.cpp
+++ b/GPU/Directx9/DrawEngineDX9.cpp
@ -324,8 +324,8 @@ void DrawEngineDX9::DoFlush() {

 		// Cannot cache vertex data with morph enabled.
 		bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK);
-		// Also avoid caching when skinning.
-		if (lastVType_ & GE_VTYPE_WEIGHT_MASK)
+		// Also avoid caching when software skinning.
+		if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK))
 			useCache = false;

 		if (useCache) {
--- a/GPU/Directx9/GPU_DX9.cpp
+++ b/GPU/Directx9/GPU_DX9.cpp
@ -83,6 +83,9 @@ GPU_DX9::GPU_DX9(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 		ERROR_LOG(G3D, "gstate has drifted out of sync!");
 	}

+	// No need to flush before the tex scale/offset commands if we are baking
+	// the tex scale/offset into the vertices anyway.
+	UpdateCmdInfo();
 	CheckGPUFeatures();

 	BuildReportingInfo();
@ -188,6 +191,7 @@ void GPU_DX9::InitClear() {

 void GPU_DX9::BeginHostFrame() {
 	GPUCommon::BeginHostFrame();
+	UpdateCmdInfo();
 	if (resized_) {
 		CheckGPUFeatures();
 		framebufferManager_->Resized();
--- a/GPU/Directx9/ShaderManagerDX9.cpp
+++ b/GPU/Directx9/ShaderManagerDX9.cpp
@ -313,7 +313,7 @@ void ShaderManagerDX9::PSUpdateUniforms(u64 dirtyUniforms) {
 }

 const uint64_t vsUniforms = DIRTY_PROJMATRIX | DIRTY_PROJTHROUGHMATRIX | DIRTY_WORLDMATRIX | DIRTY_VIEWMATRIX | DIRTY_TEXMATRIX |
-DIRTY_FOGCOEF | DIRTY_UVSCALEOFFSET | DIRTY_DEPTHRANGE |
+DIRTY_FOGCOEF | DIRTY_BONE_UNIFORMS | DIRTY_UVSCALEOFFSET | DIRTY_DEPTHRANGE |
 DIRTY_AMBIENT | DIRTY_MATAMBIENTALPHA | DIRTY_MATSPECULAR | DIRTY_MATDIFFUSE | DIRTY_MATEMISSIVE | DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3;

 void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
@ -382,6 +382,38 @@ void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
 #endif
 		VSSetFloatArray(CONST_VS_FOGCOEF, fogcoef, 2);
 	}
+	// TODO: Could even set all bones in one go if they're all dirty.
+#ifdef USE_BONE_ARRAY
+	if (u_bone != 0) {
+		float allBones[8 * 16];
+
+		bool allDirty = true;
+		for (int i = 0; i < numBones; i++) {
+			if (dirtyUniforms & (DIRTY_BONEMATRIX0 << i)) {
+				ConvertMatrix4x3To4x4(allBones + 16 * i, gstate.boneMatrix + 12 * i);
+			} else {
+				allDirty = false;
+			}
+		}
+		if (allDirty) {
+			// Set them all with one call
+			//glUniformMatrix4fv(u_bone, numBones, GL_FALSE, allBones);
+		} else {
+			// Set them one by one. Could try to coalesce two in a row etc but too lazy.
+			for (int i = 0; i < numBones; i++) {
+				if (dirtyUniforms & (DIRTY_BONEMATRIX0 << i)) {
+					//glUniformMatrix4fv(u_bone + i, 1, GL_FALSE, allBones + 16 * i);
+				}
+			}
+		}
+	}
+#else
+	for (int i = 0; i < 8; i++) {
+		if (dirtyUniforms & (DIRTY_BONEMATRIX0 << i)) {
+			VSSetMatrix4x3_3(CONST_VS_BONE0 + 3 * i, gstate.boneMatrix + 12 * i);
+		}
+	}
+#endif

 	// Texturing
 	if (dirtyUniforms & DIRTY_UVSCALEOFFSET) {
--- a/GPU/Directx9/VertexShaderGeneratorDX9.cpp
+++ b/GPU/Directx9/VertexShaderGeneratorDX9.cpp
@ -37,6 +37,18 @@

 namespace DX9 {

+static const char * const boneWeightAttrDecl[9] = {  // HLSL vertex-input declarations for bone weights, indexed by weight count.
+	"#ERROR#",  // Index 0 placeholder; the lookup added in this diff uses 1 + a 3-bit field (1..8).
+	"float  a_w1:TEXCOORD1;\n",
+	"float2 a_w1:TEXCOORD1;\n",
+	"float3 a_w1:TEXCOORD1;\n",
+	"float4 a_w1:TEXCOORD1;\n",
+	"float4 a_w1:TEXCOORD1;\n  float  a_w2:TEXCOORD2;\n",  // From five weights on, the extras go in a second input, a_w2.
+	"float4 a_w1:TEXCOORD1;\n  float2 a_w2:TEXCOORD2;\n",
+	"float4 a_w1:TEXCOORD1;\n  float3 a_w2:TEXCOORD2;\n",
+	"float4 a_w1:TEXCOORD1;\n  float4 a_w2:TEXCOORD2;\n",
+};
+
 enum DoLightComputation {
 	LIGHT_OFF,
 	LIGHT_SHADE,
@ -68,6 +80,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 	bool flipNormal = id.Bit(VS_BIT_NORM_REVERSE);
 	int ls0 = id.Bits(VS_BIT_LS0, 2);
 	int ls1 = id.Bits(VS_BIT_LS1, 2);
+	bool enableBones = id.Bit(VS_BIT_ENABLE_BONES);
 	bool enableLighting = id.Bit(VS_BIT_LIGHTING_ENABLE);
 	int matUpdate = id.Bits(VS_BIT_MATERIAL_UPDATE, 3);

@ -91,6 +104,9 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage

 	int numBoneWeights = 0;
 	int boneWeightScale = id.Bits(VS_BIT_WEIGHT_FMTSCALE, 2);
+	if (enableBones) {
+		numBoneWeights = 1 + id.Bits(VS_BIT_BONES, 3);
+	}

 	if (lang == HLSL_DX9) {
 		WRITE(p, "#pragma warning( disable : 3571 )\n");
@ -113,6 +129,15 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 			WRITE(p, "float4x3 u_view : register(c%i);\n", CONST_VS_VIEW);
 			if (doTextureTransform)
 				WRITE(p, "float4x3 u_tex : register(c%i);\n", CONST_VS_TEXMTX);
+			if (enableBones) {
+#ifdef USE_BONE_ARRAY
+				WRITE(p, "float4x3 u_bone[%i] : register(c%i);\n", numBones, CONST_VS_BONE0);
+#else
+				for (int i = 0; i < numBoneWeights; i++) {
+					WRITE(p, "float4x3 u_bone%i : register(c%i);\n", i, CONST_VS_BONE0 + i * 3);
+				}
+#endif
+			}
 			if (doTexture) {
 				WRITE(p, "float4 u_uvscaleoffset : register(c%i);\n", CONST_VS_UVSCALEOFFSET);
 			}
@ -156,6 +181,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 	} else {
 		WRITE(p, "cbuffer base : register(b0) {\n%s};\n", cb_baseStr);
 		WRITE(p, "cbuffer lights: register(b1) {\n%s};\n", cb_vs_lightsStr);
+		WRITE(p, "cbuffer bones : register(b2) {\n%s};\n", cb_vs_bonesStr);
 	}

 	// And the "varyings".
@ -165,6 +191,9 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 		if ((doSpline || doBezier) && lang == HLSL_D3D11) {
 			WRITE(p, "  uint instanceId : SV_InstanceID;\n");
 		}
+		if (enableBones) {
+			WRITE(p, "  %s", boneWeightAttrDecl[numBoneWeights]);
+		}
 		if (doTexture && hasTexcoord) {
 			WRITE(p, "  float2 texcoord : TEXCOORD0;\n");
 		}
@ -357,113 +386,184 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 			}
 		}
 	}  else {
-		// Step 1: World Transform
-		// Hardware tessellation
-		if (doSpline || doBezier) {
-			WRITE(p, "  uint num_patches_u = %s;\n", doBezier ? "(u_spline_count_u - 1) / 3u" : "u_spline_count_u - 3");
-			WRITE(p, "  float2 tess_pos = In.position.xy;\n");
-			WRITE(p, "  int u = In.instanceId %% num_patches_u;\n");
-			WRITE(p, "  int v = In.instanceId / num_patches_u;\n");
-			WRITE(p, "  int2 patch_pos = int2(u, v);\n");
-			WRITE(p, "  float3 _pos[16];\n");
-			WRITE(p, "  float2 _tex[16];\n");
-			WRITE(p, "  float4 _col[16];\n");
-			WRITE(p, "  int idx;\n");
-			WRITE(p, "  int2 index;\n");
-			for (int i = 0; i < 4; i++) {
-				for (int j = 0; j < 4; j++) {
-					WRITE(p, "  idx = (%i + v%s) * u_spline_count_u + (%i + u%s);\n", i, doBezier ? " * 3" : "", j, doBezier ? " * 3" : "");
-					WRITE(p, "  index = int2(idx, 0);\n");
-					WRITE(p, "  _pos[%i] = u_tess_pos_tex.Load(index).xyz;\n", i * 4 + j);
-					if (doTexture && hasTexcoord && hasTexcoordTess)
-						WRITE(p, "  _tex[%i] = u_tess_tex_tex.Load(index).xy;\n", i * 4 + j);
-					if (hasColor && hasColorTess)
-						WRITE(p, "  _col[%i] = u_tess_col_tex.Load(index).rgba;\n", i * 4 + j);
+		// Step 1: World Transform / Skinning
+		if (!enableBones) {
+			// Hardware tessellation
+			if (doSpline || doBezier) {
+				WRITE(p, "  uint num_patches_u = %s;\n", doBezier ? "(u_spline_count_u - 1) / 3u" : "u_spline_count_u - 3");
+				WRITE(p, "  float2 tess_pos = In.position.xy;\n");
+				WRITE(p, "  int u = In.instanceId %% num_patches_u;\n");
+				WRITE(p, "  int v = In.instanceId / num_patches_u;\n");
+				WRITE(p, "  int2 patch_pos = int2(u, v);\n");
+				WRITE(p, "  float3 _pos[16];\n");
+				WRITE(p, "  float2 _tex[16];\n");
+				WRITE(p, "  float4 _col[16];\n");
+				WRITE(p, "  int idx;\n");
+				WRITE(p, "  int2 index;\n");
+				for (int i = 0; i < 4; i++) {
+					for (int j = 0; j < 4; j++) {
+						WRITE(p, "  idx = (%i + v%s) * u_spline_count_u + (%i + u%s);\n", i, doBezier ? " * 3" : "", j, doBezier ? " * 3" : "");
+						WRITE(p, "  index = int2(idx, 0);\n");
+						WRITE(p, "  _pos[%i] = u_tess_pos_tex.Load(index).xyz;\n", i * 4 + j);
+						if (doTexture && hasTexcoord && hasTexcoordTess)
+							WRITE(p, "  _tex[%i] = u_tess_tex_tex.Load(index).xy;\n", i * 4 + j);
+						if (hasColor && hasColorTess)
+							WRITE(p, "  _col[%i] = u_tess_col_tex.Load(index).rgba;\n", i * 4 + j);
+					}
 				}
-			}
-			WRITE(p, "  float2 weights[4];\n");
-			if (doBezier) {
-				// Bernstein 3D
-				WRITE(p, "  weights[0] = (1.0 - tess_pos) * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
-				WRITE(p, "  weights[1] = 3.0 * tess_pos * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
-				WRITE(p, "  weights[2] = 3.0 * tess_pos * tess_pos * (1.0 - tess_pos);\n");
-				WRITE(p, "  weights[3] = tess_pos * tess_pos * tess_pos;\n");
-			} else if (doSpline) {
-				WRITE(p, "  int2 spline_num_patches = int2(u_spline_count_u - 3, u_spline_count_v - 3);\n");
-				WRITE(p, "  int2 spline_type = int2(u_spline_type_u, u_spline_type_v);\n");
-				WRITE(p, "  float2 knots[6];\n");
-				WRITE(p, "  spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n");
-				WRITE(p, "  spline_weight(tess_pos + patch_pos, knots, weights);\n");
-			}
-			WRITE(p, "  float3 pos = tess_sample(_pos, weights);\n");
-			if (doTexture && hasTexcoord) {
-				if (hasTexcoordTess)
-					WRITE(p, "  float2 tex = tess_sample(_tex, weights);\n");
-				else
-					WRITE(p, "  float2 tex = tess_pos + patch_pos;\n");
-			}
-			if (hasColor) {
-				if (hasColorTess)
-					WRITE(p, "  float4 col = tess_sample(_col, weights);\n");
-				else
-					WRITE(p, "  float4 col = u_tess_col_tex.Load(int2(0, 0)).rgba;\n");
-			}
-			if (hasNormal) {
-				// Curved surface is probably always need to compute normal(not sampling from control points)
+				WRITE(p, "  float2 weights[4];\n");
 				if (doBezier) {
-					// Bernstein derivative
-					WRITE(p, "  float2 bernderiv[4];\n");
-					WRITE(p, "  bernderiv[0] = -3.0 * (tess_pos - 1.0) * (tess_pos - 1.0); \n");
-					WRITE(p, "  bernderiv[1] = 9.0 * tess_pos * tess_pos - 12.0 * tess_pos + 3.0; \n");
-					WRITE(p, "  bernderiv[2] = 3.0 * (2.0 - 3.0 * tess_pos) * tess_pos; \n");
-					WRITE(p, "  bernderiv[3] = 3.0 * tess_pos * tess_pos; \n");
-
-					WRITE(p, "  float2 bernderiv_u[4];\n");
-					WRITE(p, "  float2 bernderiv_v[4];\n");
-					WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
-					WRITE(p, "    bernderiv_u[i] = float2(bernderiv[i].x, weights[i].y);\n");
-					WRITE(p, "    bernderiv_v[i] = float2(weights[i].x, bernderiv[i].y);\n");
-					WRITE(p, "  }\n");
-
-					WRITE(p, "  float3 du = tess_sample(_pos, bernderiv_u);\n");
-					WRITE(p, "  float3 dv = tess_sample(_pos, bernderiv_v);\n");
+					// Bernstein 3D
+					WRITE(p, "  weights[0] = (1.0 - tess_pos) * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
+					WRITE(p, "  weights[1] = 3.0 * tess_pos * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
+					WRITE(p, "  weights[2] = 3.0 * tess_pos * tess_pos * (1.0 - tess_pos);\n");
+					WRITE(p, "  weights[3] = tess_pos * tess_pos * tess_pos;\n");
 				} else if (doSpline) {
-					WRITE(p, "  float2 tess_next_u = float2(In.normal.x, 0.0);\n");
-					WRITE(p, "  float2 tess_next_v = float2(0.0, In.normal.y);\n");
-					// Right
-					WRITE(p, "  float2 tess_pos_r = tess_pos + tess_next_u;\n");
-					WRITE(p, "  spline_weight(tess_pos_r + patch_pos, knots, weights);\n");
-					WRITE(p, "  float3 pos_r = tess_sample(_pos, weights);\n");
-					// Left
-					WRITE(p, "  float2 tess_pos_l = tess_pos - tess_next_u;\n");
-					WRITE(p, "  spline_weight(tess_pos_l + patch_pos, knots, weights);\n");
-					WRITE(p, "  float3 pos_l = tess_sample(_pos, weights);\n");
-					// Down
-					WRITE(p, "  float2 tess_pos_d = tess_pos + tess_next_v;\n");
-					WRITE(p, "  spline_weight(tess_pos_d + patch_pos, knots, weights);\n");
-					WRITE(p, "  float3 pos_d = tess_sample(_pos, weights);\n");
-					// Up
-					WRITE(p, "  float2 tess_pos_u = tess_pos - tess_next_v;\n");
-					WRITE(p, "  spline_weight(tess_pos_u + patch_pos, knots, weights);\n");
-					WRITE(p, "  float3 pos_u = tess_sample(_pos, weights);\n");
-
-					WRITE(p, "  float3 du = pos_r - pos_l;\n");
-					WRITE(p, "  float3 dv = pos_d - pos_u;\n");
+					WRITE(p, "  int2 spline_num_patches = int2(u_spline_count_u - 3, u_spline_count_v - 3);\n");
+					WRITE(p, "  int2 spline_type = int2(u_spline_type_u, u_spline_type_v);\n");
+					WRITE(p, "  float2 knots[6];\n");
+					WRITE(p, "  spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n");
+					WRITE(p, "  spline_weight(tess_pos + patch_pos, knots, weights);\n");
 				}
-				WRITE(p, "  float3 nrm = cross(du, dv);\n");
-				WRITE(p, "  nrm = normalize(nrm);\n");
+				WRITE(p, "  float3 pos = tess_sample(_pos, weights);\n");
+				if (doTexture && hasTexcoord) {
+					if (hasTexcoordTess)
+						WRITE(p, "  float2 tex = tess_sample(_tex, weights);\n");
+					else
+						WRITE(p, "  float2 tex = tess_pos + patch_pos;\n");
+				}
+				if (hasColor) {
+					if (hasColorTess)
+						WRITE(p, "  float4 col = tess_sample(_col, weights);\n");
+					else
+						WRITE(p, "  float4 col = u_tess_col_tex.Load(int2(0, 0)).rgba;\n");
+				}
+				if (hasNormal) {
+					// Curved surfaces probably always need their normals computed (not sampled from the control points).
+					if (doBezier) {
+						// Bernstein derivative
+						WRITE(p, "  float2 bernderiv[4];\n");
+						WRITE(p, "  bernderiv[0] = -3.0 * (tess_pos - 1.0) * (tess_pos - 1.0); \n");
+						WRITE(p, "  bernderiv[1] = 9.0 * tess_pos * tess_pos - 12.0 * tess_pos + 3.0; \n");
+						WRITE(p, "  bernderiv[2] = 3.0 * (2.0 - 3.0 * tess_pos) * tess_pos; \n");
+						WRITE(p, "  bernderiv[3] = 3.0 * tess_pos * tess_pos; \n");
+
+						WRITE(p, "  float2 bernderiv_u[4];\n");
+						WRITE(p, "  float2 bernderiv_v[4];\n");
+						WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
+						WRITE(p, "    bernderiv_u[i] = float2(bernderiv[i].x, weights[i].y);\n");
+						WRITE(p, "    bernderiv_v[i] = float2(weights[i].x, bernderiv[i].y);\n");
+						WRITE(p, "  }\n");
+
+						WRITE(p, "  float3 du = tess_sample(_pos, bernderiv_u);\n");
+						WRITE(p, "  float3 dv = tess_sample(_pos, bernderiv_v);\n");
+					} else if (doSpline) {
+						WRITE(p, "  float2 tess_next_u = float2(In.normal.x, 0.0);\n");
+						WRITE(p, "  float2 tess_next_v = float2(0.0, In.normal.y);\n");
+						// Right
+						WRITE(p, "  float2 tess_pos_r = tess_pos + tess_next_u;\n");
+						WRITE(p, "  spline_weight(tess_pos_r + patch_pos, knots, weights);\n");
+						WRITE(p, "  float3 pos_r = tess_sample(_pos, weights);\n");
+						// Left
+						WRITE(p, "  float2 tess_pos_l = tess_pos - tess_next_u;\n");
+						WRITE(p, "  spline_weight(tess_pos_l + patch_pos, knots, weights);\n");
+						WRITE(p, "  float3 pos_l = tess_sample(_pos, weights);\n");
+						// Down
+						WRITE(p, "  float2 tess_pos_d = tess_pos + tess_next_v;\n");
+						WRITE(p, "  spline_weight(tess_pos_d + patch_pos, knots, weights);\n");
+						WRITE(p, "  float3 pos_d = tess_sample(_pos, weights);\n");
+						// Up
+						WRITE(p, "  float2 tess_pos_u = tess_pos - tess_next_v;\n");
+						WRITE(p, "  spline_weight(tess_pos_u + patch_pos, knots, weights);\n");
+						WRITE(p, "  float3 pos_u = tess_sample(_pos, weights);\n");
+
+						WRITE(p, "  float3 du = pos_r - pos_l;\n");
+						WRITE(p, "  float3 dv = pos_d - pos_u;\n");
+					}
+					WRITE(p, "  float3 nrm = cross(du, dv);\n");
+					WRITE(p, "  nrm = normalize(nrm);\n");
+				}
+				WRITE(p, "  float3 worldpos = mul(float4(pos.xyz, 1.0), u_world);\n");
+				if (hasNormal)
+					WRITE(p, "  float3 worldnormal = normalize(mul(float4(%snrm, 0.0), u_world));\n", flipNormalTess ? "-" : "");
+				else
+					WRITE(p, "  float3 worldnormal = float3(0.0, 0.0, 1.0);\n");
+			} else {
+				// No skinning, just standard T&L.
+				WRITE(p, "  float3 worldpos = mul(float4(In.position.xyz, 1.0), u_world);\n");
+				if (hasNormal)
+					WRITE(p, "  float3 worldnormal = normalize(mul(float4(%sIn.normal, 0.0), u_world));\n", flipNormal ? "-" : "");
+				else
+					WRITE(p, "  float3 worldnormal = float3(0.0, 0.0, 1.0);\n");
 			}
-			WRITE(p, "  float3 worldpos = mul(float4(pos.xyz, 1.0), u_world);\n");
-			if (hasNormal)
-				WRITE(p, "  float3 worldnormal = normalize(mul(float4(%snrm, 0.0), u_world));\n", flipNormalTess ? "-" : "");
-			else
-				WRITE(p, "  float3 worldnormal = float3(0.0, 0.0, 1.0);\n");
 		} else {
-			WRITE(p, "  float3 worldpos = mul(float4(In.position.xyz, 1.0), u_world);\n");
-			if (hasNormal)
-				WRITE(p, "  float3 worldnormal = normalize(mul(float4(%sIn.normal, 0.0), u_world));\n", flipNormal ? "-" : "");
-			else
-				WRITE(p, "  float3 worldnormal = float3(0.0, 0.0, 1.0);\n");
+			static const char * const boneWeightAttr[8] = {
+				"a_w1.x", "a_w1.y", "a_w1.z", "a_w1.w",
+				"a_w2.x", "a_w2.y", "a_w2.z", "a_w2.w",
+			};
+
+#if defined(USE_FOR_LOOP) && defined(USE_BONE_ARRAY)
+
+			// To loop through the weights, we unfortunately need to put them in a float array.
+			// GLSL ES sucks - no way to directly initialize an array!
+			switch (numBoneWeights) {
+			case 1: WRITE(p, "  float w[1]; w[0] = a_w1;\n"); break;
+			case 2: WRITE(p, "  float w[2]; w[0] = a_w1.x; w[1] = a_w1.y;\n"); break;
+			case 3: WRITE(p, "  float w[3]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z;\n"); break;
+			case 4: WRITE(p, "  float w[4]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z; w[3] = a_w1.w;\n"); break;
+			case 5: WRITE(p, "  float w[5]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z; w[3] = a_w1.w; w[4] = a_w2;\n"); break;
+			case 6: WRITE(p, "  float w[6]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z; w[3] = a_w1.w; w[4] = a_w2.x; w[5] = a_w2.y;\n"); break;
+			case 7: WRITE(p, "  float w[7]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z; w[3] = a_w1.w; w[4] = a_w2.x; w[5] = a_w2.y; w[6] = a_w2.z;\n"); break;
+			case 8: WRITE(p, "  float w[8]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z; w[3] = a_w1.w; w[4] = a_w2.x; w[5] = a_w2.y; w[6] = a_w2.z; w[7] = a_w2.w;\n"); break;
+			}
+
+			WRITE(p, "  mat4 skinMatrix = w[0] * u_bone[0];\n");
+			if (numBoneWeights > 1) {
+				WRITE(p, "  for (int i = 1; i < %i; i++) {\n", numBoneWeights);
+				WRITE(p, "    skinMatrix += w[i] * u_bone[i];\n");
+				WRITE(p, "  }\n");
+			}
+
+#else
+			if (lang == HLSL_D3D11 || lang == HLSL_D3D11_LEVEL9) {
+				if (numBoneWeights == 1)
+					WRITE(p, "  float4x3 skinMatrix = mul(In.a_w1, u_bone[0])");
+				else
+					WRITE(p, "  float4x3 skinMatrix = mul(In.a_w1.x, u_bone[0])");
+				for (int i = 1; i < numBoneWeights; i++) {
+					const char *weightAttr = boneWeightAttr[i];
+					// Workaround for the "can't do .x of scalar" issue.
+					if (numBoneWeights == 1 && i == 0) weightAttr = "a_w1";
+					if (numBoneWeights == 5 && i == 4) weightAttr = "a_w2";
+					WRITE(p, " + mul(In.%s, u_bone[%i])", weightAttr, i);
+				}
+			} else {
+				if (numBoneWeights == 1)
+					WRITE(p, "  float4x3 skinMatrix = mul(In.a_w1, u_bone0)");
+				else
+					WRITE(p, "  float4x3 skinMatrix = mul(In.a_w1.x, u_bone0)");
+				for (int i = 1; i < numBoneWeights; i++) {
+					const char *weightAttr = boneWeightAttr[i];
+					// Workaround for the "can't do .x of scalar" issue.
+					if (numBoneWeights == 1 && i == 0) weightAttr = "a_w1";
+					if (numBoneWeights == 5 && i == 4) weightAttr = "a_w2";
+					WRITE(p, " + mul(In.%s, u_bone%i)", weightAttr, i);
+				}
+			}
+#endif
+
+			WRITE(p, ";\n");
+
+			// Trying to simplify this results in bugs in LBP...
+			WRITE(p, "  float3 skinnedpos = mul(float4(In.position.xyz, 1.0), skinMatrix);\n");
+			WRITE(p, "  float3 worldpos = mul(float4(skinnedpos, 1.0), u_world);\n");
+
+			if (hasNormal) {
+				WRITE(p, "  float3 skinnednormal = mul(float4(%sIn.normal, 0.0), skinMatrix);\n", flipNormal ? "-" : "");
+			} else {
+				WRITE(p, "  float3 skinnednormal = mul(float4(0.0, 0.0, %s1.0, 0.0), skinMatrix);\n", flipNormal ? "-" : "");
+			}
+			WRITE(p, "  float3 worldnormal = normalize(mul(float4(skinnednormal, 0.0), u_world));\n");
 		}

 		WRITE(p, "  float4 viewPos = float4(mul(float4(worldpos, 1.0), u_view), 1.0);\n");
--- a/GPU/Directx9/VertexShaderGeneratorDX9.h
+++ b/GPU/Directx9/VertexShaderGeneratorDX9.h
@ -44,6 +44,15 @@ namespace DX9 {
 		CONST_VS_LIGHTSPECULAR = 44,
 		CONST_VS_LIGHTAMBIENT = 48,
 		CONST_VS_DEPTHRANGE = 52,
+		CONST_VS_BONE0 = 53,
+		CONST_VS_BONE1 = 56,
+		CONST_VS_BONE2 = 59,
+		CONST_VS_BONE3 = 62,
+		CONST_VS_BONE4 = 65,
+		CONST_VS_BONE5 = 68,
+		CONST_VS_BONE6 = 71,
+		CONST_VS_BONE7 = 74,
+		CONST_VS_BONE8 = 77,
 	};

 };
--- a/GPU/GLES/DrawEngineGLES.cpp
+++ b/GPU/GLES/DrawEngineGLES.cpp
@ -324,8 +324,8 @@ void DrawEngineGLES::DoFlush() {

 		// Cannot cache vertex data with morph enabled.
 		bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK);
-		// Also avoid caching when skinning.
-		if (lastVType_ & GE_VTYPE_WEIGHT_MASK)
+		// Also avoid caching when software skinning.
+		if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK))
 			useCache = false;

 		// TEMPORARY
@ -469,8 +469,8 @@ void DrawEngineGLES::DoFlush() {

 			vai->lastFrame = gpuStats.numFlips;
 		} else {
-			if (lastVType_ & GE_VTYPE_WEIGHT_MASK) {
-				// If skinning, we've already predecoded into "decoded". So push that content.
+			if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) {
+				// If software skinning, we've already predecoded into "decoded". So push that content.
 				size_t size = decodedVerts_ * dec_->GetDecVtxFmt().stride;
 				u8 *dest = (u8 *)frameData.pushVertex->Push(size, &vertexBufferOffset, &vertexBuffer);
 				memcpy(dest, decoded, size);
--- a/GPU/GLES/GPU_GLES.cpp
+++ b/GPU/GLES/GPU_GLES.cpp
@ -89,6 +89,8 @@ GPU_GLES::GPU_GLES(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 	// No need to flush before the tex scale/offset commands if we are baking
 	// the tex scale/offset into the vertices anyway.

+	UpdateCmdInfo();
+
 	BuildReportingInfo();
 	// Update again after init to be sure of any silly driver problems.
 	UpdateVsyncInterval(true);
@ -344,6 +346,7 @@ void GPU_GLES::DeviceRestore() {
 	draw_ = (Draw::DrawContext *)PSP_CoreParameter().graphicsContext->GetDrawContext();
 	ILOG("GPU_GLES: DeviceRestore");

+	UpdateCmdInfo();
 	UpdateVsyncInterval(true);

 	textureCacheGL_->DeviceRestore(draw_);
@ -363,6 +366,7 @@ void GPU_GLES::InitClear() {

 void GPU_GLES::BeginHostFrame() {
 	GPUCommon::BeginHostFrame();
+	UpdateCmdInfo();
 	if (resized_) {
 		CheckGPUFeatures();
 		framebufferManager_->Resized();
--- a/GPU/GLES/ShaderManagerGLES.cpp
+++ b/GPU/GLES/ShaderManagerGLES.cpp
@ -109,8 +109,21 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
 	queries.push_back({ &u_world, "u_world" });
 	queries.push_back({ &u_texmtx, "u_texmtx" });

+	if (VSID.Bit(VS_BIT_ENABLE_BONES))
+		numBones = TranslateNumBones(VSID.Bits(VS_BIT_BONES, 3) + 1);
+	else
+		numBones = 0;
 	queries.push_back({ &u_depthRange, "u_depthRange" });

+#ifdef USE_BONE_ARRAY
+	queries.push_back({ &u_bone, "u_bone" });
+#else
+	static const char * const boneNames[8] = { "u_bone0", "u_bone1", "u_bone2", "u_bone3", "u_bone4", "u_bone5", "u_bone6", "u_bone7", };
+	for (int i = 0; i < 8; i++) {
+		queries.push_back({ &u_bone[i], boneNames[i] });
+	}
+#endif
+
 	// Lighting, texturing
 	queries.push_back({ &u_ambient, "u_ambient" });
 	queries.push_back({ &u_matambientalpha, "u_matambientalpha" });
@ -465,6 +478,13 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) {
 		float f = (float)gstate.getStencilTestRef() * (1.0f / 255.0f);
 		render_->SetUniformF(&u_stencilReplaceValue, 1, &f);
 	}
+	float bonetemp[16];
+	for (int i = 0; i < numBones; i++) {
+		if (dirty & (DIRTY_BONEMATRIX0 << i)) {
+			ConvertMatrix4x3To4x4(bonetemp, gstate.boneMatrix + 12 * i);
+			render_->SetUniformM4x4(&u_bone[i], bonetemp);
+		}
+	}

 	if (dirty & DIRTY_SHADERBLEND) {
 		if (u_blendFixA != -1) {
@ -790,7 +810,7 @@ std::string ShaderManagerGLES::DebugGetShaderString(std::string id, DebugShaderT
 // as sometimes these features might have an effect on the ID bits.

 #define CACHE_HEADER_MAGIC 0x83277592
-#define CACHE_VERSION 10
+#define CACHE_VERSION 11
 struct CacheHeader {
 	uint32_t magic;
 	uint32_t version;
--- a/GPU/GLES/ShaderManagerGLES.h
+++ b/GPU/GLES/ShaderManagerGLES.h
@ -72,6 +72,13 @@ public:
 	int u_world;
 	int u_depthRange;   // x,y = viewport xscale/xcenter. z,w=clipping minz/maxz (?)

+#ifdef USE_BONE_ARRAY
+	int u_bone;  // array, size is numBones
+#else
+	int u_bone[8];
+#endif
+	int numBones;
+
 	// Shader blending.
 	int u_fbotex;
 	int u_blendFixA;
--- a/GPU/GLES/VertexShaderGeneratorGLES.cpp
+++ b/GPU/GLES/VertexShaderGeneratorGLES.cpp
@ -38,6 +38,30 @@

 #define WRITE p+=sprintf

+static const char * const boneWeightAttrDecl[9] = {  // Bone weight declarations using the "attribute" keyword, indexed by weight count.
+	"#ERROR#",  // Index 0 placeholder; the lookup added in this diff uses 1 + a 3-bit field (1..8).
+	"attribute mediump float w1;\n",
+	"attribute mediump vec2 w1;\n",
+	"attribute mediump vec3 w1;\n",
+	"attribute mediump vec4 w1;\n",
+	"attribute mediump vec4 w1;\nattribute mediump float w2;\n",  // From five weights on, the extras go in a second attribute, w2.
+	"attribute mediump vec4 w1;\nattribute mediump vec2 w2;\n",
+	"attribute mediump vec4 w1;\nattribute mediump vec3 w2;\n",
+	"attribute mediump vec4 w1, w2;\n",
+};
+
+static const char * const boneWeightInDecl[9] = {  // Bone weight declarations using the "in" keyword (picked for GLSL ES 3.0 / core contexts), indexed by weight count.
+	"#ERROR#",  // Index 0 placeholder; the lookup added in this diff uses 1 + a 3-bit field (1..8).
+	"in mediump float w1;\n",
+	"in mediump vec2 w1;\n",
+	"in mediump vec3 w1;\n",
+	"in mediump vec4 w1;\n",
+	"in mediump vec4 w1;\nin mediump float w2;\n",  // From five weights on, the extras go in a second input, w2.
+	"in mediump vec4 w1;\nin mediump vec2 w2;\n",
+	"in mediump vec4 w1;\nin mediump vec3 w2;\n",
+	"in mediump vec4 w1, w2;\n",
+};
+
 enum DoLightComputation {
 	LIGHT_OFF,
 	LIGHT_SHADE,
@ -81,6 +105,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 	bool glslES30 = false;
 	const char *varying = "varying";
 	const char *attribute = "attribute";
+	const char * const * boneWeightDecl = boneWeightAttrDecl;
 	const char *texelFetch = NULL;
 	bool highpFog = false;
 	bool highpTexcoord = false;
@ -133,6 +158,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 	if (glslES30 || gl_extensions.IsCoreContext) {
 		attribute = "in";
 		varying = "out";
+		boneWeightDecl = boneWeightInDecl;
 	}

 	bool isModeThrough = id.Bit(VS_BIT_IS_THROUGH);
@ -156,6 +182,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 	bool flipNormal = id.Bit(VS_BIT_NORM_REVERSE);
 	int ls0 = id.Bits(VS_BIT_LS0, 2);
 	int ls1 = id.Bits(VS_BIT_LS1, 2);
+	bool enableBones = id.Bit(VS_BIT_ENABLE_BONES);
 	bool enableLighting = id.Bit(VS_BIT_LIGHTING_ENABLE);
 	int matUpdate = id.Bits(VS_BIT_MATERIAL_UPDATE, 3);

@ -181,6 +208,16 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 		}
 	}

+	int numBoneWeights = 0;
+	int boneWeightScale = id.Bits(VS_BIT_WEIGHT_FMTSCALE, 2);
+	if (enableBones) {
+		numBoneWeights = 1 + id.Bits(VS_BIT_BONES, 3);
+		WRITE(p, "%s", boneWeightDecl[numBoneWeights]);
+		*attrMask |= 1 << ATTR_W1;
+		if (numBoneWeights >= 5)
+			*attrMask |= 1 << ATTR_W2;
+	}
+
 	if (useHWTransform)
 		WRITE(p, "%s vec3 position;\n", attribute);
 	else
@ -231,6 +268,17 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 			WRITE(p, "uniform mediump mat4 u_texmtx;\n");
 			*uniformMask |= DIRTY_TEXMATRIX;
 		}
+		if (enableBones) {
+#ifdef USE_BONE_ARRAY
+			WRITE(p, "uniform mediump mat4 u_bone[%i];\n", numBoneWeights);
+			*uniformMask |= DIRTY_BONE_UNIFORMS;
+#else
+			for (int i = 0; i < numBoneWeights; i++) {
+				WRITE(p, "uniform mat4 u_bone%i;\n", i);
+				*uniformMask |= DIRTY_BONEMATRIX0 << i;
+			}
+#endif
+		}
 		if (doTexture) {
 			WRITE(p, "uniform vec4 u_uvscaleoffset;\n");
 			*uniformMask |= DIRTY_UVSCALEOFFSET;
@ -435,111 +483,189 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 			}
 		}
 	} else {
-		// Step 1: World Transform
-		// Hardware tessellation
-		if (doBezier || doSpline) {
-			WRITE(p, "  vec3 _pos[16];\n");
-			WRITE(p, "  vec2 _tex[16];\n");
-			WRITE(p, "  vec4 _col[16];\n");
-			WRITE(p, "  int num_patches_u = %s;\n", doBezier ? "(u_spline_count_u - 1) / 3" : "u_spline_count_u - 3");
-			WRITE(p, "  int u = int(mod(float(gl_InstanceID), float(num_patches_u)));\n");
-			WRITE(p, "  int v = gl_InstanceID / num_patches_u;\n");
-			WRITE(p, "  ivec2 patch_pos = ivec2(u, v);\n");
-			WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
-			WRITE(p, "    for (int j = 0; j < 4; j++) {\n");
-			WRITE(p, "      int index = (i + v%s) * u_spline_count_u + (j + u%s);\n", doBezier ? " * 3" : "", doBezier ? " * 3" : "");
-			WRITE(p, "      _pos[i * 4 + j] = %s(u_tess_pos_tex, ivec2(index, 0), 0).xyz;\n", texelFetch);
-			if (doTexture && hasTexcoord && hasTexcoordTess)
-				WRITE(p, "      _tex[i * 4 + j] = %s(u_tess_tex_tex, ivec2(index, 0), 0).xy;\n", texelFetch);
-			if (hasColor && hasColorTess)
-				WRITE(p, "      _col[i * 4 + j] = %s(u_tess_col_tex, ivec2(index, 0), 0).rgba;\n", texelFetch);
-			WRITE(p, "    }\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "  vec2 tess_pos = position.xy;\n");
-			WRITE(p, "  vec2 weights[4];\n");
-			if (doBezier) {
-				// Bernstein 3D
-				WRITE(p, "  weights[0] = (1.0 - tess_pos) * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
-				WRITE(p, "  weights[1] = 3.0 * tess_pos * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
-				WRITE(p, "  weights[2] = 3.0 * tess_pos * tess_pos * (1.0 - tess_pos);\n");
-				WRITE(p, "  weights[3] = tess_pos * tess_pos * tess_pos;\n");
-			} else { // Spline
-				WRITE(p, "  ivec2 spline_num_patches = ivec2(u_spline_count_u - 3, u_spline_count_v - 3);\n");
-				WRITE(p, "  ivec2 spline_type = ivec2(u_spline_type_u, u_spline_type_v);\n");
-				WRITE(p, "  vec2 knots[6];\n");
-				WRITE(p, "  spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n");
-				WRITE(p, "  spline_weight(tess_pos + vec2(patch_pos), knots, weights);\n");
-			}
-			WRITE(p, "  vec3 pos = tess_sample(_pos, weights);\n");
-			if (doTexture && hasTexcoord) {
-				if (hasTexcoordTess)
-					WRITE(p, "  vec2 tex = tess_sample(_tex, weights);\n");
-				else
-					WRITE(p, "  vec2 tex = tess_pos + vec2(patch_pos);\n");
-			}
-			if (hasColor) {
-				if (hasColorTess)
-					WRITE(p, "  vec4 col = tess_sample(_col, weights);\n");
-				else
-					WRITE(p, "  vec4 col = %s(u_tess_col_tex, ivec2(0, 0), 0).rgba;\n", texelFetch);
-			}
-			if (hasNormal) {
-				// Curved surface is probably always need to compute normal(not sampling from control points)
+		// Step 1: World Transform / Skinning
+		if (!enableBones) {
+			// Hardware tessellation
+			if (doBezier || doSpline) {
+				WRITE(p, "  vec3 _pos[16];\n");
+				WRITE(p, "  vec2 _tex[16];\n");
+				WRITE(p, "  vec4 _col[16];\n");
+				WRITE(p, "  int num_patches_u = %s;\n", doBezier ? "(u_spline_count_u - 1) / 3" : "u_spline_count_u - 3");
+				WRITE(p, "  int u = int(mod(float(gl_InstanceID), float(num_patches_u)));\n");
+				WRITE(p, "  int v = gl_InstanceID / num_patches_u;\n");
+				WRITE(p, "  ivec2 patch_pos = ivec2(u, v);\n");
+				WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
+				WRITE(p, "    for (int j = 0; j < 4; j++) {\n");
+				WRITE(p, "      int index = (i + v%s) * u_spline_count_u + (j + u%s);\n", doBezier ? " * 3" : "", doBezier ? " * 3" : "");
+				WRITE(p, "      _pos[i * 4 + j] = %s(u_tess_pos_tex, ivec2(index, 0), 0).xyz;\n", texelFetch);
+				if (doTexture && hasTexcoord && hasTexcoordTess)
+					WRITE(p, "      _tex[i * 4 + j] = %s(u_tess_tex_tex, ivec2(index, 0), 0).xy;\n", texelFetch);
+				if (hasColor && hasColorTess)
+					WRITE(p, "      _col[i * 4 + j] = %s(u_tess_col_tex, ivec2(index, 0), 0).rgba;\n", texelFetch);
+				WRITE(p, "    }\n");
+				WRITE(p, "  }\n");
+				WRITE(p, "  vec2 tess_pos = position.xy;\n");
+				WRITE(p, "  vec2 weights[4];\n");
 				if (doBezier) {
-					// Bernstein derivative
-					WRITE(p, "  vec2 bernderiv[4];\n");
-					WRITE(p, "  bernderiv[0] = -3.0 * (tess_pos - 1.0) * (tess_pos - 1.0); \n");
-					WRITE(p, "  bernderiv[1] = 9.0 * tess_pos * tess_pos - 12.0 * tess_pos + 3.0; \n");
-					WRITE(p, "  bernderiv[2] = 3.0 * (2.0 - 3.0 * tess_pos) * tess_pos; \n");
-					WRITE(p, "  bernderiv[3] = 3.0 * tess_pos * tess_pos; \n");
-
-					WRITE(p, "  vec2 bernderiv_u[4];\n");
-					WRITE(p, "  vec2 bernderiv_v[4];\n");
-					WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
-					WRITE(p, "    bernderiv_u[i] = vec2(bernderiv[i].x, weights[i].y);\n");
-					WRITE(p, "    bernderiv_v[i] = vec2(weights[i].x, bernderiv[i].y);\n");
-					WRITE(p, "  }\n");
-
-					WRITE(p, "  vec3 du = tess_sample(_pos, bernderiv_u);\n");
-					WRITE(p, "  vec3 dv = tess_sample(_pos, bernderiv_v);\n");
+					// Bernstein 3D
+					WRITE(p, "  weights[0] = (1.0 - tess_pos) * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
+					WRITE(p, "  weights[1] = 3.0 * tess_pos * (1.0 - tess_pos) * (1.0 - tess_pos);\n");
+					WRITE(p, "  weights[2] = 3.0 * tess_pos * tess_pos * (1.0 - tess_pos);\n");
+					WRITE(p, "  weights[3] = tess_pos * tess_pos * tess_pos;\n");
 				} else { // Spline
-					WRITE(p, "  vec2 tess_next_u = vec2(normal.x, 0.0);\n");
-					WRITE(p, "  vec2 tess_next_v = vec2(0.0, normal.y);\n");
-					// Right
-					WRITE(p, "  vec2 tess_pos_r = tess_pos + tess_next_u;\n");
-					WRITE(p, "  spline_weight(tess_pos_r + vec2(patch_pos), knots, weights);\n");
-					WRITE(p, "  vec3 pos_r = tess_sample(_pos, weights);\n");
-					// Left
-					WRITE(p, "  vec2 tess_pos_l = tess_pos - tess_next_u;\n");
-					WRITE(p, "  spline_weight(tess_pos_l + vec2(patch_pos), knots, weights);\n");
-					WRITE(p, "  vec3 pos_l = tess_sample(_pos, weights);\n");
-					// Down
-					WRITE(p, "  vec2 tess_pos_d = tess_pos + tess_next_v;\n");
-					WRITE(p, "  spline_weight(tess_pos_d + vec2(patch_pos), knots, weights);\n");
-					WRITE(p, "  vec3 pos_d = tess_sample(_pos, weights);\n");
-					// Up
-					WRITE(p, "  vec2 tess_pos_u = tess_pos - tess_next_v;\n");
-					WRITE(p, "  spline_weight(tess_pos_u + vec2(patch_pos), knots, weights);\n");
-					WRITE(p, "  vec3 pos_u = tess_sample(_pos, weights);\n");
-
-					WRITE(p, "  vec3 du = pos_r - pos_l;\n");
-					WRITE(p, "  vec3 dv = pos_d - pos_u;\n");
+					WRITE(p, "  ivec2 spline_num_patches = ivec2(u_spline_count_u - 3, u_spline_count_v - 3);\n");
+					WRITE(p, "  ivec2 spline_type = ivec2(u_spline_type_u, u_spline_type_v);\n");
+					WRITE(p, "  vec2 knots[6];\n");
+					WRITE(p, "  spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n");
+					WRITE(p, "  spline_weight(tess_pos + vec2(patch_pos), knots, weights);\n");
+				}
+				WRITE(p, "  vec3 pos = tess_sample(_pos, weights);\n");
+				if (doTexture && hasTexcoord) {
+					if (hasTexcoordTess)
+						WRITE(p, "  vec2 tex = tess_sample(_tex, weights);\n");
+					else
+						WRITE(p, "  vec2 tex = tess_pos + vec2(patch_pos);\n");
+				}
+				if (hasColor) {
+					if (hasColorTess)
+						WRITE(p, "  vec4 col = tess_sample(_col, weights);\n");
+					else
+						WRITE(p, "  vec4 col = %s(u_tess_col_tex, ivec2(0, 0), 0).rgba;\n", texelFetch);
+				}
+				if (hasNormal) {
+					// A curved surface probably always needs its normal computed (rather than sampled from the control points).
+					if (doBezier) {
+						// Bernstein derivative
+						WRITE(p, "  vec2 bernderiv[4];\n");
+						WRITE(p, "  bernderiv[0] = -3.0 * (tess_pos - 1.0) * (tess_pos - 1.0); \n");
+						WRITE(p, "  bernderiv[1] = 9.0 * tess_pos * tess_pos - 12.0 * tess_pos + 3.0; \n");
+						WRITE(p, "  bernderiv[2] = 3.0 * (2.0 - 3.0 * tess_pos) * tess_pos; \n");
+						WRITE(p, "  bernderiv[3] = 3.0 * tess_pos * tess_pos; \n");
+
+						WRITE(p, "  vec2 bernderiv_u[4];\n");
+						WRITE(p, "  vec2 bernderiv_v[4];\n");
+						WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
+						WRITE(p, "    bernderiv_u[i] = vec2(bernderiv[i].x, weights[i].y);\n");
+						WRITE(p, "    bernderiv_v[i] = vec2(weights[i].x, bernderiv[i].y);\n");
+						WRITE(p, "  }\n");
+
+						WRITE(p, "  vec3 du = tess_sample(_pos, bernderiv_u);\n");
+						WRITE(p, "  vec3 dv = tess_sample(_pos, bernderiv_v);\n");
+					} else { // Spline
+						WRITE(p, "  vec2 tess_next_u = vec2(normal.x, 0.0);\n");
+						WRITE(p, "  vec2 tess_next_v = vec2(0.0, normal.y);\n");
+						// Right
+						WRITE(p, "  vec2 tess_pos_r = tess_pos + tess_next_u;\n");
+						WRITE(p, "  spline_weight(tess_pos_r + vec2(patch_pos), knots, weights);\n");
+						WRITE(p, "  vec3 pos_r = tess_sample(_pos, weights);\n");
+						// Left
+						WRITE(p, "  vec2 tess_pos_l = tess_pos - tess_next_u;\n");
+						WRITE(p, "  spline_weight(tess_pos_l + vec2(patch_pos), knots, weights);\n");
+						WRITE(p, "  vec3 pos_l = tess_sample(_pos, weights);\n");
+						// Down
+						WRITE(p, "  vec2 tess_pos_d = tess_pos + tess_next_v;\n");
+						WRITE(p, "  spline_weight(tess_pos_d + vec2(patch_pos), knots, weights);\n");
+						WRITE(p, "  vec3 pos_d = tess_sample(_pos, weights);\n");
+						// Up
+						WRITE(p, "  vec2 tess_pos_u = tess_pos - tess_next_v;\n");
+						WRITE(p, "  spline_weight(tess_pos_u + vec2(patch_pos), knots, weights);\n");
+						WRITE(p, "  vec3 pos_u = tess_sample(_pos, weights);\n");
+
+						WRITE(p, "  vec3 du = pos_r - pos_l;\n");
+						WRITE(p, "  vec3 dv = pos_d - pos_u;\n");
+					}
+					WRITE(p, "  vec3 nrm = cross(du, dv);\n");
+					WRITE(p, "  nrm = normalize(nrm);\n");
+				}
+				WRITE(p, "  vec3 worldpos = (u_world * vec4(pos.xyz, 1.0)).xyz;\n");
+				if (hasNormal) {
+					WRITE(p, "  mediump vec3 worldnormal = normalize((u_world * vec4(%snrm, 0.0)).xyz);\n", flipNormalTess ? "-" : "");
+				} else {
+					WRITE(p, "  mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n");
 				}
-				WRITE(p, "  vec3 nrm = cross(du, dv);\n");
-				WRITE(p, "  nrm = normalize(nrm);\n");
-			}
-			WRITE(p, "  vec3 worldpos = (u_world * vec4(pos.xyz, 1.0)).xyz;\n");
-			if (hasNormal) {
-				WRITE(p, "  mediump vec3 worldnormal = normalize((u_world * vec4(%snrm, 0.0)).xyz);\n", flipNormalTess ? "-" : "");
 			} else {
-				WRITE(p, "  mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n");
+				// No skinning, just standard T&L.
+				WRITE(p, "  vec3 worldpos = (u_world * vec4(position.xyz, 1.0)).xyz;\n");
+				if (hasNormal)
+					WRITE(p, "  mediump vec3 worldnormal = normalize((u_world * vec4(%snormal, 0.0)).xyz);\n", flipNormal ? "-" : "");
+				else
+					WRITE(p, "  mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n");
 			}
 		} else {
-			WRITE(p, "  vec3 worldpos = (u_world * vec4(position.xyz, 1.0)).xyz;\n");
-			if (hasNormal)
-				WRITE(p, "  mediump vec3 worldnormal = normalize((u_world * vec4(%snormal, 0.0)).xyz);\n", flipNormal ? "-" : "");
+			static const char *rescale[4] = {"", " * 1.9921875", " * 1.999969482421875", ""}; // 2*127.5f/128.f, 2*32767.5f/32768.f, 1.0f};
+			const char *factor = rescale[boneWeightScale];
+
+			static const char * const boneWeightAttr[8] = {
+				"w1.x", "w1.y", "w1.z", "w1.w",
+				"w2.x", "w2.y", "w2.z", "w2.w",
+			};
+
+#if defined(USE_FOR_LOOP) && defined(USE_BONE_ARRAY)
+
+			// To loop through the weights, we unfortunately need to put them in a float array.
+			// GLSL ES sucks - no way to directly initialize an array!
+			switch (numBoneWeights) {
+			case 1: WRITE(p, "  float w[1]; w[0] = w1;\n"); break;
+			case 2: WRITE(p, "  float w[2]; w[0] = w1.x; w[1] = w1.y;\n"); break;
+			case 3: WRITE(p, "  float w[3]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z;\n"); break;
+			case 4: WRITE(p, "  float w[4]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z; w[3] = w1.w;\n"); break;
+			case 5: WRITE(p, "  float w[5]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z; w[3] = w1.w; w[4] = w2;\n"); break;
+			case 6: WRITE(p, "  float w[6]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z; w[3] = w1.w; w[4] = w2.x; w[5] = w2.y;\n"); break;
+			case 7: WRITE(p, "  float w[7]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z; w[3] = w1.w; w[4] = w2.x; w[5] = w2.y; w[6] = w2.z;\n"); break;
+			case 8: WRITE(p, "  float w[8]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z; w[3] = w1.w; w[4] = w2.x; w[5] = w2.y; w[6] = w2.z; w[7] = w2.w;\n"); break;
+			}
+
+			WRITE(p, "  mat4 skinMatrix = w[0] * u_bone[0];\n");
+			if (numBoneWeights > 1) {
+				WRITE(p, "  for (int i = 1; i < %i; i++) {\n", numBoneWeights);
+				WRITE(p, "    skinMatrix += w[i] * u_bone[i];\n");
+				WRITE(p, "  }\n");
+			}
+
+#else
+
+#ifdef USE_BONE_ARRAY
+			if (numBoneWeights == 1)
+				WRITE(p, "  mat4 skinMatrix = w1 * u_bone[0]");
 			else
-				WRITE(p, "  mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n");
+				WRITE(p, "  mat4 skinMatrix = w1.x * u_bone[0]");
+			for (int i = 1; i < numBoneWeights; i++) {
+				const char *weightAttr = boneWeightAttr[i];
+				// Workaround for the "can't do .x of scalar" issue.
+				if (numBoneWeights == 1 && i == 0) weightAttr = "w1";
+				if (numBoneWeights == 5 && i == 4) weightAttr = "w2";
+				WRITE(p, " + %s * u_bone[%i]", weightAttr, i);
+			}
+#else
+			// Uncomment this to screw up bone shaders to check the vertex shader software fallback
+			// WRITE(p, "THIS SHOULD ERROR! #error");
+			if (numBoneWeights == 1)
+				WRITE(p, "  mat4 skinMatrix = w1 * u_bone0");
+			else
+				WRITE(p, "  mat4 skinMatrix = w1.x * u_bone0");
+			for (int i = 1; i < numBoneWeights; i++) {
+				const char *weightAttr = boneWeightAttr[i];
+				// Workaround for the "can't do .x of scalar" issue.
+				if (numBoneWeights == 1 && i == 0) weightAttr = "w1";
+				if (numBoneWeights == 5 && i == 4) weightAttr = "w2";
+				WRITE(p, " + %s * u_bone%i", weightAttr, i);
+			}
+#endif
+
+#endif
+
+			WRITE(p, ";\n");
+
+			// Trying to simplify this results in bugs in LBP...
+			WRITE(p, "  vec3 skinnedpos = (skinMatrix * vec4(position, 1.0)).xyz %s;\n", factor);
+			WRITE(p, "  vec3 worldpos = (u_world * vec4(skinnedpos, 1.0)).xyz;\n");
+
+			if (hasNormal) {
+				WRITE(p, "  mediump vec3 skinnednormal = (skinMatrix * vec4(%snormal, 0.0)).xyz %s;\n", flipNormal ? "-" : "", factor);
+			} else {
+				WRITE(p, "  mediump vec3 skinnednormal = (skinMatrix * vec4(0.0, 0.0, %s1.0, 0.0)).xyz %s;\n", flipNormal ? "-" : "", factor);
+			}
+			WRITE(p, "  mediump vec3 worldnormal = normalize((u_world * vec4(skinnednormal, 0.0)).xyz);\n");
 		}

 		WRITE(p, "  vec4 viewPos = u_view * vec4(worldpos, 1.0);\n");
--- a/GPU/GLES/VertexShaderGeneratorGLES.h
+++ b/GPU/GLES/VertexShaderGeneratorGLES.h
@ -19,6 +19,8 @@

 #include "Common/CommonTypes.h"

+// #define USE_BONE_ARRAY
+
 struct VShaderID;

 void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, uint64_t *uniformMask);
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@ -46,8 +46,8 @@ const CommonCommandTableEntry commonCommandTable[] = {
 	{ GE_CMD_BEZIER, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Bezier },
 	{ GE_CMD_SPLINE, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Spline },

-	// Changing the vertex type does not always require us to flush so handle that in Execute_VertexType.
-	{ GE_CMD_VERTEXTYPE, FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_VertexType },
+	// Changing the vertex type requires us to flush by default; UpdateCmdInfo() clears the flush flag when software skinning is enabled.
+	{ GE_CMD_VERTEXTYPE, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_VertexType },

 	{ GE_CMD_LOADCLUT, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPUCommon::Execute_LoadClut },

@ -403,11 +403,23 @@ GPUCommon::GPUCommon(GraphicsContext *gfxCtx, Draw::DrawContext *draw) :
 			ERROR_LOG(G3D, "Command missing from table: %02x (%i)", i, i);
 		}
 	}
+
+	UpdateCmdInfo();
 }

 GPUCommon::~GPUCommon() {
 }

+void GPUCommon::UpdateCmdInfo() {
+	if (g_Config.bSoftwareSkinning) {
+		cmdInfo_[GE_CMD_VERTEXTYPE].flags &= ~FLAG_FLUSHBEFOREONCHANGE;
+		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexTypeSkinning;
+	} else {
+		cmdInfo_[GE_CMD_VERTEXTYPE].flags |= FLAG_FLUSHBEFOREONCHANGE;
+		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexType;
+	}
+}
+
 void GPUCommon::BeginHostFrame() {
 	ReapplyGfxState();

@ -1414,12 +1426,22 @@ void GPUCommon::Execute_TexSize0(u32 op, u32 diff) {
 	}
 }

+void GPUCommon::Execute_VertexType(u32 op, u32 diff) {
+	if (diff)
+		gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE);
+	if (diff & (GE_VTYPE_TC_MASK | GE_VTYPE_THROUGH_MASK)) {
+		gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
+		if (diff & GE_VTYPE_THROUGH_MASK)
+			gstate_c.Dirty(DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_FRAGMENTSHADER_STATE);
+	}
+}
+
 void GPUCommon::Execute_LoadClut(u32 op, u32 diff) {
 	gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
 	textureCache_->LoadClut(gstate.getClutAddress(), gstate.getClutLoadBytes());
 }

-void GPUCommon::Execute_VertexType(u32 op, u32 diff) {
+void GPUCommon::Execute_VertexTypeSkinning(u32 op, u32 diff) {
 	// Don't flush when weight count changes.
 	if (diff & ~GE_VTYPE_WEIGHTCOUNT_MASK) {
 		// Restore and flush
@ -1428,6 +1450,12 @@ void GPUCommon::Execute_VertexType(u32 op, u32 diff) {
 		gstate.vertType ^= diff;
 		if (diff & (GE_VTYPE_TC_MASK | GE_VTYPE_THROUGH_MASK))
 			gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
+		// In this case, we may be doing weights and morphs.
+		// Update any bone matrix uniforms so it uses them correctly.
+		if ((op & GE_VTYPE_MORPHCOUNT_MASK) != 0) {
+			gstate_c.Dirty(gstate_c.deferredVertTypeDirty);
+			gstate_c.deferredVertTypeDirty = 0;
+		}
 		gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE);
 	}
 	if (diff & GE_VTYPE_THROUGH_MASK)
@ -1525,6 +1553,10 @@ void GPUCommon::Execute_Prim(u32 op, u32 diff) {
 		goto bail;
 #endif

+	uint32_t vtypeCheckMask = ~GE_VTYPE_WEIGHTCOUNT_MASK;
+	if (!g_Config.bSoftwareSkinning)
+		vtypeCheckMask = 0xFFFFFFFF;
+
 	while (src != stall) {
 		uint32_t data = *src;
 		switch (data >> 24) {
@ -1553,7 +1585,7 @@ void GPUCommon::Execute_Prim(u32 op, u32 diff) {
 		{
 			uint32_t diff = data ^ vertexType;
 			// don't mask upper bits, vertexType is unmasked
-			if (diff & ~GE_VTYPE_WEIGHTCOUNT_MASK) {
+			if (diff & vtypeCheckMask) {
 				goto bail;
 			} else {
 				vertexType = data;
@ -1655,6 +1687,10 @@ void GPUCommon::Execute_Bezier(u32 op, u32 diff) {
 		indices = Memory::GetPointerUnchecked(gstate_c.indexAddr);
 	}

+	if (vertTypeIsSkinningEnabled(gstate.vertType)) {
+		DEBUG_LOG_REPORT(G3D, "Unusual bezier/spline vtype: %08x, morph: %d, bones: %d", gstate.vertType, (gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) >> GE_VTYPE_MORPHCOUNT_SHIFT, vertTypeGetNumBoneWeights(gstate.vertType));
+	}
+
 	GEPatchPrimType patchPrim = gstate.getPatchPrimitiveType();
 	SetDrawType(DRAW_BEZIER, PatchPrimToPrim(patchPrim));

@ -1713,6 +1749,10 @@ void GPUCommon::Execute_Spline(u32 op, u32 diff) {
 		indices = Memory::GetPointerUnchecked(gstate_c.indexAddr);
 	}

+	if (vertTypeIsSkinningEnabled(gstate.vertType)) {
+		DEBUG_LOG_REPORT(G3D, "Unusual bezier/spline vtype: %08x, morph: %d, bones: %d", gstate.vertType, (gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) >> GE_VTYPE_MORPHCOUNT_SHIFT, vertTypeGetNumBoneWeights(gstate.vertType));
+	}
+
 	int sp_ucount = op & 0xFF;
 	int sp_vcount = (op >> 8) & 0xFF;
 	int sp_utype = (op >> 16) & 0x3;
@ -1993,10 +2033,34 @@ void GPUCommon::Execute_BoneMtxNum(u32 op, u32 diff) {
 	}

 	if (fastLoad) {
-		while ((src[i] >> 24) == GE_CMD_BONEMATRIXDATA) {
-			dst[i] = src[i] << 8;
-			if (++i >= end) {
-				break;
+		// If we can't use software skinning, we have to flush and dirty.
+		if (!g_Config.bSoftwareSkinning) {
+			while ((src[i] >> 24) == GE_CMD_BONEMATRIXDATA) {
+				const u32 newVal = src[i] << 8;
+				if (dst[i] != newVal) {
+					Flush();
+					dst[i] = newVal;
+				}
+				if (++i >= end) {
+					break;
+				}
+			}
+
+			const unsigned int numPlusCount = (op & 0x7F) + i;
+			for (unsigned int num = op & 0x7F; num < numPlusCount; num += 12) {
+				gstate_c.Dirty(DIRTY_BONEMATRIX0 << (num / 12));
+			}
+		} else {
+			while ((src[i] >> 24) == GE_CMD_BONEMATRIXDATA) {
+				dst[i] = src[i] << 8;
+				if (++i >= end) {
+					break;
+				}
+			}
+
+			const unsigned int numPlusCount = (op & 0x7F) + i;
+			for (unsigned int num = op & 0x7F; num < numPlusCount; num += 12) {
+				gstate_c.deferredVertTypeDirty |= DIRTY_BONEMATRIX0 << (num / 12);
 			}
 		}
 	}
@ -2014,6 +2078,13 @@ void GPUCommon::Execute_BoneMtxData(u32 op, u32 diff) {
 	int num = gstate.boneMatrixNumber & 0x7F;
 	u32 newVal = op << 8;
 	if (num < 96 && newVal != ((const u32 *)gstate.boneMatrix)[num]) {
+		// Bone matrices should NOT flush when software skinning is enabled!
+		if (!g_Config.bSoftwareSkinning) {
+			Flush();
+			gstate_c.Dirty(DIRTY_BONEMATRIX0 << (num / 12));
+		} else {
+			gstate_c.deferredVertTypeDirty |= DIRTY_BONEMATRIX0 << (num / 12);
+		}
 		((u32 *)gstate.boneMatrix)[num] = newVal;
 	}
 	num++;
@ -2154,6 +2225,17 @@ void GPUCommon::Execute_Unknown(u32 op, u32 diff) {
 void GPUCommon::FastLoadBoneMatrix(u32 target) {
 	const int num = gstate.boneMatrixNumber & 0x7F;
 	const int mtxNum = num / 12;
+	uint32_t uniformsToDirty = DIRTY_BONEMATRIX0 << mtxNum;
+	if ((num - 12 * mtxNum) != 0) {
+		uniformsToDirty |= DIRTY_BONEMATRIX0 << ((mtxNum + 1) & 7);
+	}
+
+	if (!g_Config.bSoftwareSkinning) {
+		Flush();
+		gstate_c.Dirty(uniformsToDirty);
+	} else {
+		gstate_c.deferredVertTypeDirty |= uniformsToDirty;
+	}
 	gstate.FastLoadBoneMatrix(target);
 }

--- a/GPU/GPUCommon.h
+++ b/GPU/GPUCommon.h
@ -72,6 +72,8 @@ public:
 	}
 	virtual void CheckGPUFeatures() = 0;

+	void UpdateCmdInfo();
+
 	bool IsReady() override {
 		return true;
 	}
@ -129,6 +131,7 @@ public:
 	void Execute_End(u32 op, u32 diff);

 	void Execute_VertexType(u32 op, u32 diff);
+	void Execute_VertexTypeSkinning(u32 op, u32 diff);

 	void Execute_Prim(u32 op, u32 diff);
 	void Execute_Bezier(u32 op, u32 diff);
--- a/GPU/GPUState.cpp
+++ b/GPU/GPUState.cpp
@ -240,6 +240,13 @@ void GPUgstate::Restore(u32_le *ptr) {
 	}
 }

+bool vertTypeIsSkinningEnabled(u32 vertType) {
+	if (g_Config.bSoftwareSkinning)
+		return false;
+	else
+		return ((vertType & GE_VTYPE_WEIGHT_MASK) != GE_VTYPE_WEIGHT_NONE);
+}
+
 struct GPUStateCache_v0 {
 	u32 vertexAddr;
 	u32 indexAddr;
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@ -441,6 +441,11 @@ struct GPUgstate {
 	void Restore(u32_le *ptr);
 };

+bool vertTypeIsSkinningEnabled(u32 vertType);
+
+inline int vertTypeGetNumBoneWeights(u32 vertType) { return 1 + ((vertType & GE_VTYPE_WEIGHTCOUNT_MASK) >> GE_VTYPE_WEIGHTCOUNT_SHIFT); }
+inline int vertTypeGetWeightMask(u32 vertType) { return vertType & GE_VTYPE_WEIGHT_MASK; }
+
 // The rest is cached simplified/converted data for fast access.
 // Does not need to be saved when saving/restoring context.
 //
@ -553,6 +558,7 @@ struct GPUStateCache {
 	bool allowShaderBlend;

 	float morphWeights[8];
+	u32 deferredVertTypeDirty;

 	u32 curTextureWidth;
 	u32 curTextureHeight;
--- a/GPU/Software/SoftGpu.cpp
+++ b/GPU/Software/SoftGpu.cpp
@ -396,6 +396,10 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
 				indices = Memory::GetPointerUnchecked(gstate_c.indexAddr);
 			}

+			if ((gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) || vertTypeIsSkinningEnabled(gstate.vertType)) {
+				DEBUG_LOG_REPORT(G3D, "Unusual bezier/spline vtype: %08x, morph: %d, bones: %d", gstate.vertType, (gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) >> GE_VTYPE_MORPHCOUNT_SHIFT, vertTypeGetNumBoneWeights(gstate.vertType));
+			}
+
 			GEPatchPrimType patchPrim = gstate.getPatchPrimitiveType();
 			SetDrawType(DRAW_BEZIER, PatchPrimToPrim(patchPrim));

@ -440,6 +444,10 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) {
 				indices = Memory::GetPointerUnchecked(gstate_c.indexAddr);
 			}

+			if ((gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) || vertTypeIsSkinningEnabled(gstate.vertType)) {
+				DEBUG_LOG_REPORT(G3D, "Unusual bezier/spline vtype: %08x, morph: %d, bones: %d", gstate.vertType, (gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) >> GE_VTYPE_MORPHCOUNT_SHIFT, vertTypeGetNumBoneWeights(gstate.vertType));
+			}
+
 			int sp_ucount = op & 0xFF;
 			int sp_vcount = (op >> 8) & 0xFF;
 			int sp_utype = (op >> 16) & 0x3;
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@ -167,6 +167,27 @@ VertexData TransformUnit::ReadVertex(VertexReader& vreader)
 			vertex.normal = -vertex.normal;
 	}

+	if (vertTypeIsSkinningEnabled(gstate.vertType) && !gstate.isModeThrough()) {
+		float W[8] = { 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f };
+		vreader.ReadWeights(W);
+
+		Vec3<float> tmppos(0.f, 0.f, 0.f);
+		Vec3<float> tmpnrm(0.f, 0.f, 0.f);
+
+		for (int i = 0; i < vertTypeGetNumBoneWeights(gstate.vertType); ++i) {
+			Mat3x3<float> bone(&gstate.boneMatrix[12*i]);
+			tmppos += (bone * ModelCoords(pos[0], pos[1], pos[2]) + Vec3<float>(gstate.boneMatrix[12*i+9], gstate.boneMatrix[12*i+10], gstate.boneMatrix[12*i+11])) * W[i];
+			if (vreader.hasNormal())
+				tmpnrm += (bone * vertex.normal) * W[i];
+		}
+
+		pos[0] = tmppos.x;
+		pos[1] = tmppos.y;
+		pos[2] = tmppos.z;
+		if (vreader.hasNormal())
+			vertex.normal = tmpnrm;
+	}
+
 	if (vreader.hasColor0()) {
 		float col[4];
 		vreader.ReadColor0(col);
--- a/GPU/Vulkan/DrawEngineVulkan.cpp
+++ b/GPU/Vulkan/DrawEngineVulkan.cpp
@ -66,7 +66,8 @@ enum {
 	DRAW_BINDING_2ND_TEXTURE = 1,
 	DRAW_BINDING_DYNUBO_BASE = 2,
 	DRAW_BINDING_DYNUBO_LIGHT = 3,
-	DRAW_BINDING_TESS_STORAGE_BUF = 4,
+	DRAW_BINDING_DYNUBO_BONE = 4,
+	DRAW_BINDING_TESS_STORAGE_BUF = 5,
 };

 enum {
@ -94,7 +95,7 @@ DrawEngineVulkan::DrawEngineVulkan(VulkanContext *vulkan, Draw::DrawContext *dra

 void DrawEngineVulkan::InitDeviceObjects() {
 	// All resources we need for PSP drawing. Usually only bindings 0 and 2-4 are populated.
-	VkDescriptorSetLayoutBinding bindings[5]{};
+	VkDescriptorSetLayoutBinding bindings[6]{};
 	bindings[0].descriptorCount = 1;
 	bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
 	bindings[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
@ -111,11 +112,15 @@ void DrawEngineVulkan::InitDeviceObjects() {
 	bindings[3].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
 	bindings[3].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
 	bindings[3].binding = DRAW_BINDING_DYNUBO_LIGHT;
-	// Used only for hardware tessellation.
 	bindings[4].descriptorCount = 1;
-	bindings[4].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+	bindings[4].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
 	bindings[4].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
-	bindings[4].binding = DRAW_BINDING_TESS_STORAGE_BUF;
+	bindings[4].binding = DRAW_BINDING_DYNUBO_BONE;
+	// Used only for hardware tessellation.
+	bindings[5].descriptorCount = 1;
+	bindings[5].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+	bindings[5].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
+	bindings[5].binding = DRAW_BINDING_TESS_STORAGE_BUF;

 	VkDevice device = vulkan_->GetDevice();

@ -129,7 +134,7 @@ void DrawEngineVulkan::InitDeviceObjects() {
 	// if creating and updating them turns out to be expensive.
 	for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) {
 		// We now create descriptor pools on demand, so removed from here.
-		frame_[i].pushUBO = new VulkanPushBuffer(vulkan_, 4 * 1024 * 1024);
+		frame_[i].pushUBO = new VulkanPushBuffer(vulkan_, 8 * 1024 * 1024);
 		frame_[i].pushVertex = new VulkanPushBuffer(vulkan_, 2 * 1024 * 1024);
 		frame_[i].pushIndex = new VulkanPushBuffer(vulkan_, 1 * 1024 * 1024);
 	}
@ -360,7 +365,7 @@ VkResult DrawEngineVulkan::RecreateDescriptorPool(FrameData &frame, int newSize)
 	frame.descPoolSize = newSize;

 	VkDescriptorPoolSize dpTypes[3];
-	dpTypes[0].descriptorCount = frame.descPoolSize * 2;
+	dpTypes[0].descriptorCount = frame.descPoolSize * 3;
 	dpTypes[0].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
 	dpTypes[1].descriptorCount = frame.descPoolSize * 2;  // Don't use these for tess anymore, need max two per set.
 	dpTypes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
@ -378,15 +383,17 @@ VkResult DrawEngineVulkan::RecreateDescriptorPool(FrameData &frame, int newSize)
 	return res;
 }

-VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, bool tess) {
+VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, VkBuffer bone, bool tess) {
 	DescriptorSetKey key;
 	key.imageView_ = imageView;
 	key.sampler_ = sampler;
 	key.secondaryImageView_ = boundSecondary_;
 	key.base_ = base;
 	key.light_ = light;
+	key.bone_ = bone;
 	_dbg_assert_(G3D, base != VK_NULL_HANDLE);
 	_dbg_assert_(G3D, light != VK_NULL_HANDLE);
+	_dbg_assert_(G3D, bone != VK_NULL_HANDLE);

 	FrameData &frame = frame_[vulkan_->GetCurFrame()];
 	// See if we already have this descriptor set cached.
@ -494,7 +501,7 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView
 	}

 	// Uniform buffer objects
-	VkDescriptorBufferInfo buf[2]{};
+	VkDescriptorBufferInfo buf[3]{};
 	int count = 0;
 	buf[count].buffer = base;
 	buf[count].offset = 0;
@ -504,6 +511,10 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView
 	buf[count].offset = 0;
 	buf[count].range = sizeof(UB_VS_Lights);
 	count++;
+	buf[count].buffer = bone;
+	buf[count].offset = 0;
+	buf[count].range = sizeof(UB_VS_Bones);
+	count++;
 	for (int i = 0; i < count; i++) {
 		writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
 		writes[n].pNext = nullptr;
@ -527,9 +538,11 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView
 void DrawEngineVulkan::DirtyAllUBOs() {
 	baseUBOOffset = 0;
 	lightUBOOffset = 0;
+	boneUBOOffset = 0;
 	baseBuf = VK_NULL_HANDLE;
 	lightBuf = VK_NULL_HANDLE;
-	dirtyUniforms_ = DIRTY_BASE_UNIFORMS | DIRTY_LIGHT_UNIFORMS;
+	boneBuf = VK_NULL_HANDLE;
+	dirtyUniforms_ = DIRTY_BASE_UNIFORMS | DIRTY_LIGHT_UNIFORMS | DIRTY_BONE_UNIFORMS;
 	imageView = VK_NULL_HANDLE;
 	sampler = VK_NULL_HANDLE;
 	gstate_c.Dirty(DIRTY_TEXTURE_IMAGE);
@ -585,10 +598,10 @@ void DrawEngineVulkan::DoFlush() {

 		// Cannot cache vertex data with morph enabled.
 		bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK);
-		// Also avoid caching when skinning.
+		// Also avoid caching when software skinning.
 		VkBuffer vbuf = VK_NULL_HANDLE;
 		VkBuffer ibuf = VK_NULL_HANDLE;
-		if (lastVType_ & GE_VTYPE_WEIGHT_MASK) {
+		if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) {
 			useCache = false;
 		}

@ -730,8 +743,8 @@ void DrawEngineVulkan::DoFlush() {
 				break;
 			}
 		} else {
-			if (lastVType_ & GE_VTYPE_WEIGHT_MASK) {
-				// If skinning, we've already predecoded into "decoded". So push that content.
+			if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) {
+				// If software skinning, we've already predecoded into "decoded". So push that content.
 				VkDeviceSize size = decodedVerts_ * dec_->GetDecVtxFmt().stride;
 				u8 *dest = (u8 *)frame->pushVertex->Push(size, &vbOffset, &vbuf);
 				memcpy(dest, decoded, size);
@ -802,12 +815,12 @@ void DrawEngineVulkan::DoFlush() {
 		dirtyUniforms_ |= shaderManager_->UpdateUniforms();
 		UpdateUBOs(frame);

-		VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, tess);
+		VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, boneBuf, tess);
 		{
 		PROFILE_THIS_SCOPE("renderman_q");

-		const uint32_t dynamicUBOOffsets[2] = {
-			baseUBOOffset, lightUBOOffset,
+		const uint32_t dynamicUBOOffsets[3] = {
+			baseUBOOffset, lightUBOOffset, boneUBOOffset,
 		};

 		int stride = dec_->GetDecVtxFmt().stride;
@ -908,9 +921,9 @@ void DrawEngineVulkan::DoFlush() {
 			// Even if the first draw is through-mode, make sure we at least have one copy of these uniforms buffered
 			UpdateUBOs(frame);

-			VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, tess);
-			const uint32_t dynamicUBOOffsets[2] = {
-				baseUBOOffset, lightUBOOffset,
+			VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, boneBuf, tess);
+			const uint32_t dynamicUBOOffsets[3] = {
+				baseUBOOffset, lightUBOOffset, boneUBOOffset,
 			};

 			PROFILE_THIS_SCOPE("renderman_q");
@ -977,6 +990,10 @@ void DrawEngineVulkan::UpdateUBOs(FrameData *frame) {
 		lightUBOOffset = shaderManager_->PushLightBuffer(frame->pushUBO, &lightBuf);
 		dirtyUniforms_ &= ~DIRTY_LIGHT_UNIFORMS;
 	}
+	if ((dirtyUniforms_ & DIRTY_BONE_UNIFORMS) || boneBuf == VK_NULL_HANDLE) {
+		boneUBOOffset = shaderManager_->PushBoneBuffer(frame->pushUBO, &boneBuf);
+		dirtyUniforms_ &= ~DIRTY_BONE_UNIFORMS;
+	}
 }

 DrawEngineVulkan::TessellationDataTransferVulkan::TessellationDataTransferVulkan(VulkanContext *vulkan, Draw::DrawContext *draw)
--- a/GPU/Vulkan/DrawEngineVulkan.h
+++ b/GPU/Vulkan/DrawEngineVulkan.h
@ -23,7 +23,7 @@
 // * binding 1: Secondary texture sampler for shader blending or depal palettes
 // * binding 2: Base Uniform Buffer (includes fragment state)
 // * binding 3: Light uniform buffer
-// * binding 4: Shader buffer storage for tesselation
+// * binding 4: Bone uniform buffer (the tessellation storage buffer is now binding 5)
 //
 // All shaders conform to this layout, so they are all compatible with the same descriptor set.
 // The format of the various uniform buffers may vary though - vertex shaders that don't skin
@ -194,7 +194,7 @@ private:
 	void DoFlush();
 	void UpdateUBOs(FrameData *frame);

-	VkDescriptorSet GetOrCreateDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, bool tess);
+	VkDescriptorSet GetOrCreateDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, VkBuffer bone, bool tess);

 	VulkanContext *vulkan_;
 	Draw::DrawContext *draw_;
@ -218,7 +218,7 @@ private:
 		VkImageView imageView_;
 		VkImageView secondaryImageView_;
 		VkSampler sampler_;
-		VkBuffer base_, light_;  // All three UBO slots will be set to this. This will usually be identical
+		VkBuffer base_, light_, bone_;  // All three UBO slots will be set to this. This will usually be identical
 		// for all draws in a frame, except when the buffer has to grow.
 	};

@ -252,7 +252,8 @@ private:
 	uint64_t dirtyUniforms_;
 	uint32_t baseUBOOffset;
 	uint32_t lightUBOOffset;
-	VkBuffer baseBuf, lightBuf;
+	uint32_t boneUBOOffset;
+	VkBuffer baseBuf, lightBuf, boneBuf;
 	VkImageView imageView = VK_NULL_HANDLE;
 	VkSampler sampler = VK_NULL_HANDLE;

--- a/GPU/Vulkan/GPU_Vulkan.cpp
+++ b/GPU/Vulkan/GPU_Vulkan.cpp
@ -252,6 +252,7 @@ void GPU_Vulkan::CheckGPUFeatures() {

 void GPU_Vulkan::BeginHostFrame() {
 	drawEngine_.BeginFrame();
+	UpdateCmdInfo();

 	if (resized_) {
 		CheckGPUFeatures();
@ -490,6 +491,7 @@ void GPU_Vulkan::DeviceRestore() {

 	CheckGPUFeatures();
 	BuildReportingInfo();
+	UpdateCmdInfo();

 	framebufferManagerVulkan_->DeviceRestore(vulkan_, draw_);
 	vulkan2D_.DeviceRestore(vulkan_);
--- a/GPU/Vulkan/ShaderManagerVulkan.cpp
+++ b/GPU/Vulkan/ShaderManagerVulkan.cpp
@ -158,9 +158,11 @@ ShaderManagerVulkan::ShaderManagerVulkan(VulkanContext *vulkan)
 	uboAlignment_ = vulkan_->GetPhysicalDeviceProperties().limits.minUniformBufferOffsetAlignment;
 	memset(&ub_base, 0, sizeof(ub_base));
 	memset(&ub_lights, 0, sizeof(ub_lights));
+	memset(&ub_bones, 0, sizeof(ub_bones));

 	ILOG("sizeof(ub_base): %d", (int)sizeof(ub_base));
 	ILOG("sizeof(ub_lights): %d", (int)sizeof(ub_lights));
+	ILOG("sizeof(ub_bones): %d", (int)sizeof(ub_bones));
 }

 ShaderManagerVulkan::~ShaderManagerVulkan() {
@ -213,6 +215,8 @@ uint64_t ShaderManagerVulkan::UpdateUniforms() {
 			BaseUpdateUniforms(&ub_base, dirty, false);
 		if (dirty & DIRTY_LIGHT_UNIFORMS)
 			LightUpdateUniforms(&ub_lights, dirty);
+		if (dirty & DIRTY_BONE_UNIFORMS)
+			BoneUpdateUniforms(&ub_bones, dirty);
 	}
 	gstate_c.CleanUniforms();
 	return dirty;
--- a/GPU/Vulkan/ShaderManagerVulkan.h
+++ b/GPU/Vulkan/ShaderManagerVulkan.h
@ -111,6 +111,7 @@ public:
 	// Applies dirty changes and copies the buffer.
 	bool IsBaseDirty() { return true; }
 	bool IsLightDirty() { return true; }
+	bool IsBoneDirty() { return true; }

 	uint32_t PushBaseBuffer(VulkanPushBuffer *dest, VkBuffer *buf) {
 		return dest->PushAligned(&ub_base, sizeof(ub_base), uboAlignment_, buf);
@ -118,6 +119,10 @@ public:
 	uint32_t PushLightBuffer(VulkanPushBuffer *dest, VkBuffer *buf) {
 		return dest->PushAligned(&ub_lights, sizeof(ub_lights), uboAlignment_, buf);
 	}
+	// TODO: Only push half the bone buffer if we only have four bones.
+	uint32_t PushBoneBuffer(VulkanPushBuffer *dest, VkBuffer *buf) {
+		return dest->PushAligned(&ub_bones, sizeof(ub_bones), uboAlignment_, buf);
+	}

 	bool LoadCache(FILE *f);
 	void SaveCache(FILE *f);
@ -139,6 +144,7 @@ private:
 	// Uniform block scratchpad. These (the relevant ones) are copied to the current pushbuffer at draw time.
 	UB_VS_FS_Base ub_base;
 	UB_VS_Lights ub_lights;
+	UB_VS_Bones ub_bones;

 	VulkanFragmentShader *lastFShader_;
 	VulkanVertexShader *lastVShader_;
--- a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp
+++ b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp
@ -54,6 +54,18 @@ static const char *vulkan_glsl_preamble =

 #define WRITE p+=sprintf

+static const char * const boneWeightDecl[9] = {
+	"#ERROR#",
+	"layout(location = 3) in float w1;\n",
+	"layout(location = 3) in vec2 w1;\n",
+	"layout(location = 3) in vec3 w1;\n",
+	"layout(location = 3) in vec4 w1;\n",
+	"layout(location = 3) in vec4 w1;\nlayout(location = 4) in float w2;\n",
+	"layout(location = 3) in vec4 w1;\nlayout(location = 4) in vec2 w2;\n",
+	"layout(location = 3) in vec4 w1;\nlayout(location = 4) in vec3 w2;\n",
+	"layout(location = 3) in vec4 w1;\nlayout(location = 4) in vec4 w2;\n",
+};
+
 enum DoLightComputation {
 	LIGHT_OFF,
 	LIGHT_SHADE,
@ -114,6 +126,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 	bool flipNormal = id.Bit(VS_BIT_NORM_REVERSE);
 	int ls0 = id.Bits(VS_BIT_LS0, 2);
 	int ls1 = id.Bits(VS_BIT_LS1, 2);
+	bool enableBones = id.Bit(VS_BIT_ENABLE_BONES);
 	bool enableLighting = id.Bit(VS_BIT_LIGHTING_ENABLE);
 	int matUpdate = id.Bits(VS_BIT_MATERIAL_UPDATE, 3);

@ -127,6 +140,8 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 	WRITE(p, "layout (std140, set = 0, binding = 2) uniform baseVars {\n%s} base;\n", ub_baseStr);
 	if (enableLighting || doShadeMapping)
 		WRITE(p, "layout (std140, set = 0, binding = 3) uniform lightVars {\n%s} light;\n", ub_vs_lightsStr);
+	if (enableBones)
+		WRITE(p, "layout (std140, set = 0, binding = 4) uniform boneVars {\n%s} bone;\n", ub_vs_bonesStr);

 	const char *shading = doFlatShading ? "flat " : "";

@ -142,6 +157,13 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 		}
 	}

+	int numBoneWeights = 0;
+	int boneWeightScale = id.Bits(VS_BIT_WEIGHT_FMTSCALE, 2);
+	if (enableBones) {
+		numBoneWeights = 1 + id.Bits(VS_BIT_BONES, 3);
+		WRITE(p, "%s", boneWeightDecl[numBoneWeights]);
+	}
+
 	if (useHWTransform)
 		WRITE(p, "layout (location = %d) in vec3 position;\n", (int)PspAttributeLocation::POSITION);
 	else
@ -199,7 +221,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 		WRITE(p, "  vec4 uv;\n");
 		WRITE(p, "  vec4 color;\n");
 		WRITE(p, "};");
-		WRITE(p, "layout (std430, set = 0, binding = 4) buffer s_tess_data {\n");
+		WRITE(p, "layout (std430, set = 0, binding = 5) buffer s_tess_data {\n");
 		WRITE(p, "  TessData data[];");
 		WRITE(p, "} tess_data;");

@ -307,109 +329,140 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 		}
 	} else {
 		// Step 1: World Transform / Skinning
-		if (doBezier || doSpline) {
-			WRITE(p, "  vec3 _pos[16];\n");
-			WRITE(p, "  vec2 _tex[16];\n");
-			WRITE(p, "  vec4 _col[16];\n");
-			WRITE(p, "  int num_patches_u = %s;\n", doBezier ? "(base.spline_count_u - 1) / 3" : "base.spline_count_u - 3");
-			WRITE(p, "  int u = int(mod(gl_InstanceIndex, num_patches_u));\n");
-			WRITE(p, "  int v = gl_InstanceIndex / num_patches_u;\n");
-			WRITE(p, "  ivec2 patch_pos = ivec2(u, v);\n");
-			WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
-			WRITE(p, "    for (int j = 0; j < 4; j++) {\n");
-			WRITE(p, "      int idx = (i + v%s) * base.spline_count_u + (j + u%s);\n", doBezier ? " * 3" : "", doBezier ? " * 3" : "");
-			WRITE(p, "      _pos[i * 4 + j] = tess_data.data[idx].pos.xyz;\n");
-			if (doTexture && hasTexcoord && hasTexcoordTess)
-				WRITE(p, "      _tex[i * 4 + j] = tess_data.data[idx].uv.xy;\n");
-			if (hasColor && hasColorTess)
-				WRITE(p, "      _col[i * 4 + j] = tess_data.data[idx].color;\n");
-			WRITE(p, "    }\n");
-			WRITE(p, "  }\n");
-			WRITE(p, "  vec2 tess_pos = position.xy;\n");
-			WRITE(p, "  vec2 weights[4];\n");
-			if (doBezier) {
-				// Bernstein 3D
-				WRITE(p, "  weights[0] = (1 - tess_pos) * (1 - tess_pos) * (1 - tess_pos);\n");
-				WRITE(p, "  weights[1] = 3 * tess_pos * (1 - tess_pos) * (1 - tess_pos);\n");
-				WRITE(p, "  weights[2] = 3 * tess_pos * tess_pos * (1 - tess_pos);\n");
-				WRITE(p, "  weights[3] = tess_pos * tess_pos * tess_pos;\n");
-			} else { // Spline
-				WRITE(p, "  ivec2 spline_num_patches = ivec2(base.spline_count_u - 3, base.spline_count_v - 3);\n");
-				WRITE(p, "  ivec2 spline_type = ivec2(base.spline_type_u, base.spline_type_v);\n");
-				WRITE(p, "  vec2 knots[6];\n");
-				WRITE(p, "  spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n");
-				WRITE(p, "  spline_weight(tess_pos + patch_pos, knots, weights);\n");
-			}
-			WRITE(p, "  vec3 pos = tess_sample(_pos, weights);\n");
-			if (doTexture && hasTexcoord) {
-				if (hasTexcoordTess)
-					WRITE(p, "  vec2 tex = tess_sample(_tex, weights);\n");
-				else
-					WRITE(p, "  vec2 tex = tess_pos + patch_pos;\n");
-			}
-			if (hasColor) {
-				if (hasColorTess)
-					WRITE(p, "  vec4 col = tess_sample(_col, weights);\n");
-				else
-					WRITE(p, "  vec4 col = tess_data.data[0].color;\n");
-			}
-			if (hasNormal) {
-				// Curved surface is probably always need to compute normal(not sampling from control points)
+		if (!enableBones) {
+			if (doBezier || doSpline) {
+				WRITE(p, "  vec3 _pos[16];\n");
+				WRITE(p, "  vec2 _tex[16];\n");
+				WRITE(p, "  vec4 _col[16];\n");
+				WRITE(p, "  int num_patches_u = %s;\n", doBezier ? "(base.spline_count_u - 1) / 3" : "base.spline_count_u - 3");
+				WRITE(p, "  int u = int(mod(gl_InstanceIndex, num_patches_u));\n");
+				WRITE(p, "  int v = gl_InstanceIndex / num_patches_u;\n");
+				WRITE(p, "  ivec2 patch_pos = ivec2(u, v);\n");
+				WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
+				WRITE(p, "    for (int j = 0; j < 4; j++) {\n");
+				WRITE(p, "      int idx = (i + v%s) * base.spline_count_u + (j + u%s);\n", doBezier ? " * 3" : "", doBezier ? " * 3" : "");
+				WRITE(p, "      _pos[i * 4 + j] = tess_data.data[idx].pos.xyz;\n");
+				if (doTexture && hasTexcoord && hasTexcoordTess)
+					WRITE(p, "      _tex[i * 4 + j] = tess_data.data[idx].uv.xy;\n");
+				if (hasColor && hasColorTess)
+					WRITE(p, "      _col[i * 4 + j] = tess_data.data[idx].color;\n");
+				WRITE(p, "    }\n");
+				WRITE(p, "  }\n");
+				WRITE(p, "  vec2 tess_pos = position.xy;\n");
+				WRITE(p, "  vec2 weights[4];\n");
 				if (doBezier) {
-					// Bernstein derivative
-					WRITE(p, "  vec2 bernderiv[4];\n");
-					WRITE(p, "  bernderiv[0] = -3 * (tess_pos - 1) * (tess_pos - 1); \n");
-					WRITE(p, "  bernderiv[1] = 9 * tess_pos * tess_pos - 12 * tess_pos + 3; \n");
-					WRITE(p, "  bernderiv[2] = 3 * (2 - 3 * tess_pos) * tess_pos; \n");
-					WRITE(p, "  bernderiv[3] = 3 * tess_pos * tess_pos; \n");
-
-					WRITE(p, "  vec2 bernderiv_u[4];\n");
-					WRITE(p, "  vec2 bernderiv_v[4];\n");
-					WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
-					WRITE(p, "    bernderiv_u[i] = vec2(bernderiv[i].x, weights[i].y);\n");
-					WRITE(p, "    bernderiv_v[i] = vec2(weights[i].x, bernderiv[i].y);\n");
-					WRITE(p, "  }\n");
-
-					WRITE(p, "  vec3 du = tess_sample(_pos, bernderiv_u);\n");
-					WRITE(p, "  vec3 dv = tess_sample(_pos, bernderiv_v);\n");
+					// Bernstein 3D
+					WRITE(p, "  weights[0] = (1 - tess_pos) * (1 - tess_pos) * (1 - tess_pos);\n");
+					WRITE(p, "  weights[1] = 3 * tess_pos * (1 - tess_pos) * (1 - tess_pos);\n");
+					WRITE(p, "  weights[2] = 3 * tess_pos * tess_pos * (1 - tess_pos);\n");
+					WRITE(p, "  weights[3] = tess_pos * tess_pos * tess_pos;\n");
 				} else { // Spline
-					WRITE(p, "  vec2 tess_next_u = vec2(normal.x, 0);\n");
-					WRITE(p, "  vec2 tess_next_v = vec2(0, normal.y);\n");
-					// Right
-					WRITE(p, "  vec2 tess_pos_r = tess_pos + tess_next_u;\n");
-					WRITE(p, "  spline_weight(tess_pos_r + patch_pos, knots, weights);\n");
-					WRITE(p, "  vec3 pos_r = tess_sample(_pos, weights);\n");
-					// Left
-					WRITE(p, "  vec2 tess_pos_l = tess_pos - tess_next_u;\n");
-					WRITE(p, "  spline_weight(tess_pos_l + patch_pos, knots, weights);\n");
-					WRITE(p, "  vec3 pos_l = tess_sample(_pos, weights);\n");
-					// Down
-					WRITE(p, "  vec2 tess_pos_d = tess_pos + tess_next_v;\n");
-					WRITE(p, "  spline_weight(tess_pos_d + patch_pos, knots, weights);\n");
-					WRITE(p, "  vec3 pos_d = tess_sample(_pos, weights);\n");
-					// Up
-					WRITE(p, "  vec2 tess_pos_u = tess_pos - tess_next_v;\n");
-					WRITE(p, "  spline_weight(tess_pos_u + patch_pos, knots, weights);\n");
-					WRITE(p, "  vec3 pos_u = tess_sample(_pos, weights);\n");
-
-					WRITE(p, "  vec3 du = pos_r - pos_l;\n");
-					WRITE(p, "  vec3 dv = pos_d - pos_u;\n");
+					WRITE(p, "  ivec2 spline_num_patches = ivec2(base.spline_count_u - 3, base.spline_count_v - 3);\n");
+					WRITE(p, "  ivec2 spline_type = ivec2(base.spline_type_u, base.spline_type_v);\n");
+					WRITE(p, "  vec2 knots[6];\n");
+					WRITE(p, "  spline_knot(spline_num_patches, spline_type, knots, patch_pos);\n");
+					WRITE(p, "  spline_weight(tess_pos + patch_pos, knots, weights);\n");
+				}
+				WRITE(p, "  vec3 pos = tess_sample(_pos, weights);\n");
+				if (doTexture && hasTexcoord) {
+					if (hasTexcoordTess)
+						WRITE(p, "  vec2 tex = tess_sample(_tex, weights);\n");
+					else
+						WRITE(p, "  vec2 tex = tess_pos + patch_pos;\n");
+				}
+				if (hasColor) {
+					if (hasColorTess)
+						WRITE(p, "  vec4 col = tess_sample(_col, weights);\n");
+					else
+						WRITE(p, "  vec4 col = tess_data.data[0].color;\n");
+				}
+				if (hasNormal) {
+					// Curved surfaces probably always need their normal computed (rather than sampled from the control points)
+					if (doBezier) {
+						// Bernstein derivative
+						WRITE(p, "  vec2 bernderiv[4];\n");
+						WRITE(p, "  bernderiv[0] = -3 * (tess_pos - 1) * (tess_pos - 1); \n");
+						WRITE(p, "  bernderiv[1] = 9 * tess_pos * tess_pos - 12 * tess_pos + 3; \n");
+						WRITE(p, "  bernderiv[2] = 3 * (2 - 3 * tess_pos) * tess_pos; \n");
+						WRITE(p, "  bernderiv[3] = 3 * tess_pos * tess_pos; \n");
+
+						WRITE(p, "  vec2 bernderiv_u[4];\n");
+						WRITE(p, "  vec2 bernderiv_v[4];\n");
+						WRITE(p, "  for (int i = 0; i < 4; i++) {\n");
+						WRITE(p, "    bernderiv_u[i] = vec2(bernderiv[i].x, weights[i].y);\n");
+						WRITE(p, "    bernderiv_v[i] = vec2(weights[i].x, bernderiv[i].y);\n");
+						WRITE(p, "  }\n");
+
+						WRITE(p, "  vec3 du = tess_sample(_pos, bernderiv_u);\n");
+						WRITE(p, "  vec3 dv = tess_sample(_pos, bernderiv_v);\n");
+					} else { // Spline
+						WRITE(p, "  vec2 tess_next_u = vec2(normal.x, 0);\n");
+						WRITE(p, "  vec2 tess_next_v = vec2(0, normal.y);\n");
+						// Right
+						WRITE(p, "  vec2 tess_pos_r = tess_pos + tess_next_u;\n");
+						WRITE(p, "  spline_weight(tess_pos_r + patch_pos, knots, weights);\n");
+						WRITE(p, "  vec3 pos_r = tess_sample(_pos, weights);\n");
+						// Left
+						WRITE(p, "  vec2 tess_pos_l = tess_pos - tess_next_u;\n");
+						WRITE(p, "  spline_weight(tess_pos_l + patch_pos, knots, weights);\n");
+						WRITE(p, "  vec3 pos_l = tess_sample(_pos, weights);\n");
+						// Down
+						WRITE(p, "  vec2 tess_pos_d = tess_pos + tess_next_v;\n");
+						WRITE(p, "  spline_weight(tess_pos_d + patch_pos, knots, weights);\n");
+						WRITE(p, "  vec3 pos_d = tess_sample(_pos, weights);\n");
+						// Up
+						WRITE(p, "  vec2 tess_pos_u = tess_pos - tess_next_v;\n");
+						WRITE(p, "  spline_weight(tess_pos_u + patch_pos, knots, weights);\n");
+						WRITE(p, "  vec3 pos_u = tess_sample(_pos, weights);\n");
+
+						WRITE(p, "  vec3 du = pos_r - pos_l;\n");
+						WRITE(p, "  vec3 dv = pos_d - pos_u;\n");
+					}
+					WRITE(p, "  vec3 nrm = cross(du, dv);\n");
+					WRITE(p, "  nrm = normalize(nrm);\n");
+				}
+				WRITE(p, "  vec3 worldpos = vec4(pos.xyz, 1.0) * base.world_mtx;\n");
+				if (hasNormal) {
+					WRITE(p, "  mediump vec3 worldnormal = normalize(vec4(%snrm, 0.0) * base.world_mtx);\n", flipNormalTess ? "-" : "");
+				} else {
+					WRITE(p, "  mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n");
 				}
-				WRITE(p, "  vec3 nrm = cross(du, dv);\n");
-				WRITE(p, "  nrm = normalize(nrm);\n");
-			}
-			WRITE(p, "  vec3 worldpos = vec4(pos.xyz, 1.0) * base.world_mtx;\n");
-			if (hasNormal) {
-				WRITE(p, "  mediump vec3 worldnormal = normalize(vec4(%snrm, 0.0) * base.world_mtx);\n", flipNormalTess ? "-" : "");
 			} else {
-				WRITE(p, "  mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n");
+				// No skinning, just standard T&L.
+				WRITE(p, "  vec3 worldpos = vec4(position.xyz, 1.0) * base.world_mtx;\n");
+				if (hasNormal)
+					WRITE(p, "  mediump vec3 worldnormal = normalize(vec4(%snormal, 0.0) * base.world_mtx);\n", flipNormal ? "-" : "");
+				else
+					WRITE(p, "  mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n");
 			}
 		} else {
-			WRITE(p, "  vec3 worldpos = vec4(position.xyz, 1.0) * base.world_mtx;\n");
-			if (hasNormal)
-				WRITE(p, "  mediump vec3 worldnormal = normalize(vec4(%snormal, 0.0) * base.world_mtx);\n", flipNormal ? "-" : "");
-			else
-				WRITE(p, "  mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n");
+			static const char *rescale[4] = { "", " * 1.9921875", " * 1.999969482421875", "" }; // [1] = 2*127.5f/128.f, [2] = 2*32767.5f/32768.f; [0] and [3] add no factor
+			const char *factor = rescale[boneWeightScale];
+
+			static const char * const boneWeightAttr[8] = {
+				"w1.x", "w1.y", "w1.z", "w1.w",
+				"w2.x", "w2.y", "w2.z", "w2.w",
+			};
+
+			WRITE(p, "  mat3x4 skinMatrix = w1.x * bone.m[0];\n");
+			if (numBoneWeights > 1) {
+				for (int i = 1; i < numBoneWeights; i++) {
+					WRITE(p, "    skinMatrix += %s * bone.m[%i];\n", boneWeightAttr[i], i);
+				}
+			}
+
+			WRITE(p, ";\n");
+
+			// Trying to simplify this results in bugs in LBP...
+			WRITE(p, "  vec3 skinnedpos = (vec4(position, 1.0) * skinMatrix) %s;\n", factor);
+			WRITE(p, "  vec3 worldpos = vec4(skinnedpos, 1.0) * base.world_mtx;\n");
+
+			if (hasNormal) {
+				WRITE(p, "  mediump vec3 skinnednormal = vec4(%snormal, 0.0) * skinMatrix %s;\n", flipNormal ? "-" : "", factor);
+			} else {
+				WRITE(p, "  mediump vec3 skinnednormal = vec4(0.0, 0.0, %s1.0, 0.0) * skinMatrix %s;\n", flipNormal ? "-" : "", factor);
+			}
+			WRITE(p, "  mediump vec3 worldnormal = normalize(vec4(skinnednormal, 0.0) * base.world_mtx);\n");
 		}

 		WRITE(p, "  vec4 viewPos = vec4(vec4(worldpos, 1.0) * base.view_mtx, 1.0);\n");
--- a/UI/GameSettingsScreen.cpp
+++ b/UI/GameSettingsScreen.cpp
@ -312,6 +312,13 @@ void GameSettingsScreen::CreateViews() {
 	hwTransform->OnClick.Handle(this, &GameSettingsScreen::OnHardwareTransform);
 	hwTransform->SetDisabledPtr(&g_Config.bSoftwareRendering);

+	CheckBox *swSkin = graphicsSettings->Add(new CheckBox(&g_Config.bSoftwareSkinning, gr->T("Software Skinning")));
+	swSkin->OnClick.Add([=](EventParams &e) {
+		settingInfo_->Show(gr->T("SoftwareSkinning Tip", "Combine skinned model draws on the CPU, faster in most games"), e.v);
+		return UI::EVENT_CONTINUE;
+	});
+	swSkin->SetDisabledPtr(&g_Config.bSoftwareRendering);
+
 	CheckBox *vtxCache = graphicsSettings->Add(new CheckBox(&g_Config.bVertexCache, gr->T("Vertex Cache")));
 	vtxCache->OnClick.Add([=](EventParams &e) {
 		settingInfo_->Show(gr->T("VertexCache Tip", "Faster, but may cause temporary flicker"), e.v);
--- a/ext/native/thin3d/VulkanQueueRunner.h
+++ b/ext/native/thin3d/VulkanQueueRunner.h
@ -41,7 +41,7 @@ struct VkRenderData {
 			VkPipelineLayout pipelineLayout;
 			VkDescriptorSet ds;
 			int numUboOffsets;
-			uint32_t uboOffsets[2];
+			uint32_t uboOffsets[3];
 			VkBuffer vbuffer;  // might need to increase at some point
 			VkDeviceSize voffset;
 			VkBuffer ibuffer;
--- a/headless/Headless.cpp
+++ b/headless/Headless.cpp
@ -369,6 +369,7 @@ int main(int argc, const char* argv[])
 	g_Config.bFrameSkipUnthrottle = false;
 	g_Config.bEnableLogging = fullLog;
 	g_Config.iNumWorkerThreads = 1;
+	g_Config.bSoftwareSkinning = true;
 	g_Config.bVertexDecoderJit = true;
 	g_Config.bBlockTransferGPU = true;
 	g_Config.iSplineBezierQuality = 2;
--- a/unittest/TestVertexJit.cpp
+++ b/unittest/TestVertexJit.cpp
@ -543,6 +543,7 @@ static bool TestVertexColor565() {
 static bool TestVertex8Skin() {
 	VertexDecoderTestHarness dec;

+	g_Config.bSoftwareSkinning = true;
 	for (int i = 0; i < 8 * 12; ++i) {
 		gstate.boneMatrix[i] = 0.0f;
 	}
@ -572,6 +573,7 @@ static bool TestVertex8Skin() {
 static bool TestVertex16Skin() {
 	VertexDecoderTestHarness dec;

+	g_Config.bSoftwareSkinning = true;
 	for (int i = 0; i < 8 * 12; ++i) {
 		gstate.boneMatrix[i] = 0.0f;
 	}
@ -601,6 +603,7 @@ static bool TestVertex16Skin() {
 static bool TestVertexFloatSkin() {
 	VertexDecoderTestHarness dec;

+	g_Config.bSoftwareSkinning = true;
 	for (int i = 0; i < 8 * 12; ++i) {
 		gstate.boneMatrix[i] = 0.0f;
 	}