diff --git a/Core/Config.cpp b/Core/Config.cpp index 623637999a..e2bde296d9 100644 --- a/Core/Config.cpp +++ b/Core/Config.cpp @@ -510,6 +510,7 @@ static ConfigSetting graphicsSettings[] = { ReportedConfigSetting("RenderingMode", &g_Config.iRenderingMode, &DefaultRenderingMode, true, true), ConfigSetting("SoftwareRenderer", &g_Config.bSoftwareRendering, false, true, true), ReportedConfigSetting("HardwareTransform", &g_Config.bHardwareTransform, true, true, true), + ReportedConfigSetting("SoftwareSkinning", &g_Config.bSoftwareSkinning, true, true, true), ReportedConfigSetting("TextureFiltering", &g_Config.iTexFiltering, 1, true, true), ReportedConfigSetting("BufferFiltering", &g_Config.iBufFilter, 1, true, true), ReportedConfigSetting("InternalResolution", &g_Config.iInternalResolution, &DefaultInternalResolution, true, true), diff --git a/Core/Config.h b/Core/Config.h index e22e78f501..466624ce67 100644 --- a/Core/Config.h +++ b/Core/Config.h @@ -155,6 +155,7 @@ public: int iGPUBackend; bool bSoftwareRendering; bool bHardwareTransform; // only used in the GLES backend + bool bSoftwareSkinning; // may speed up some games int iRenderingMode; // 0 = non-buffered rendering 1 = buffered rendering int iTexFiltering; // 1 = off , 2 = nearest , 3 = linear , 4 = linear(CG) diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 663c782dfd..8903137d0e 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -379,9 +379,13 @@ bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vectorDecodeVerts(bufPtr, inPtr, lowerBound, upperBound); + // OK, morphing eliminated but bones still remain to be taken care of. + // Let's do a partial software transform where we only do skinning. + VertexReader reader(bufPtr, dec->GetDecVtxFmt(), vertType); SimpleVertex *sverts = (SimpleVertex *)outPtr; @@ -393,7 +397,56 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, (u8)gstate.getMaterialAmbientA(), }; - { + // Let's have two separate loops, one for non skinning and one for skinning. + if (!g_Config.bSoftwareSkinning && (vertType & GE_VTYPE_WEIGHT_MASK) != GE_VTYPE_WEIGHT_NONE) { + int numBoneWeights = vertTypeGetNumBoneWeights(vertType); + for (int i = lowerBound; i <= upperBound; i++) { + reader.Goto(i - lowerBound); + SimpleVertex &sv = sverts[i]; + if (vertType & GE_VTYPE_TC_MASK) { + reader.ReadUV(sv.uv); + } + + if (vertType & GE_VTYPE_COL_MASK) { + reader.ReadColor0_8888(sv.color); + } else { + memcpy(sv.color, defaultColor, 4); + } + + float nrm[3], pos[3]; + float bnrm[3], bpos[3]; + + if (vertType & GE_VTYPE_NRM_MASK) { + // Normals are generated during tessellation anyway, not sure if any need to supply + reader.ReadNrm(nrm); + } else { + nrm[0] = 0; + nrm[1] = 0; + nrm[2] = 1.0f; + } + reader.ReadPos(pos); + + // Apply skinning transform directly + float weights[8]; + reader.ReadWeights(weights); + // Skinning + Vec3Packedf psum(0, 0, 0); + Vec3Packedf nsum(0, 0, 0); + for (int w = 0; w < numBoneWeights; w++) { + if (weights[w] != 0.0f) { + Vec3ByMatrix43(bpos, pos, gstate.boneMatrix + w * 12); + Vec3Packedf tpos(bpos); + psum += tpos * weights[w]; + + Norm3ByMatrix43(bnrm, nrm, gstate.boneMatrix + w * 12); + Vec3Packedf tnorm(bnrm); + nsum += tnorm * weights[w]; + } + } + sv.pos = psum; + sv.nrm = nsum; + } + } else { for (int i = lowerBound; i <= upperBound; i++) { reader.Goto(i - lowerBound); SimpleVertex &sv = sverts[i]; @@ -655,7 +708,7 @@ void DrawEngineCommon::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, numDrawCalls++; vertexCountInDrawCalls_ += vertexCount; - if (vertTypeID & GE_VTYPE_WEIGHT_MASK) { + if (g_Config.bSoftwareSkinning && (vertTypeID & GE_VTYPE_WEIGHT_MASK)) { DecodeVertsStep(decoded, decodeCounter_, decodedVerts_); decodeCounter_++; } diff --git a/GPU/Common/ShaderCommon.h b/GPU/Common/ShaderCommon.h index 8de756237c..6e4bfa8c81 100644 --- a/GPU/Common/ShaderCommon.h +++ b/GPU/Common/ShaderCommon.h @@ -76,8 +76,14 @@ enum : uint64_t { DIRTY_WORLDMATRIX = 1ULL << 21, DIRTY_VIEWMATRIX = 1ULL << 22, DIRTY_TEXMATRIX = 1ULL << 23, - - // 8 free bits here where bones used to be! + DIRTY_BONEMATRIX0 = 1ULL << 24, // NOTE: These must be under 32 + DIRTY_BONEMATRIX1 = 1ULL << 25, + DIRTY_BONEMATRIX2 = 1ULL << 26, + DIRTY_BONEMATRIX3 = 1ULL << 27, + DIRTY_BONEMATRIX4 = 1ULL << 28, + DIRTY_BONEMATRIX5 = 1ULL << 29, + DIRTY_BONEMATRIX6 = 1ULL << 30, + DIRTY_BONEMATRIX7 = 1ULL << 31, // These are for hardware tessellation DIRTY_BEZIERSPLINE = 1ULL << 32, @@ -85,6 +91,8 @@ enum : uint64_t { // space for 7 more uniforms. + DIRTY_BONE_UNIFORMS = 0xFF000000ULL, + DIRTY_ALL_UNIFORMS = 0x3FFFFFFFFULL, DIRTY_ALL_LIGHTS = DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3, diff --git a/GPU/Common/ShaderId.cpp b/GPU/Common/ShaderId.cpp index 8a1667f181..3a8f0de37e 100644 --- a/GPU/Common/ShaderId.cpp +++ b/GPU/Common/ShaderId.cpp @@ -33,6 +33,7 @@ std::string VertexShaderDesc(const ShaderID &id) { int ls1 = id.Bits(VS_BIT_LS1, 2); if (uvgMode) desc << uvgModes[uvgMode]; + if (id.Bit(VS_BIT_ENABLE_BONES)) desc << "Bones:" << (id.Bits(VS_BIT_BONES, 3) + 1) << " "; // Lights if (id.Bit(VS_BIT_LIGHTING_ENABLE)) { desc << "Light: "; @@ -102,6 +103,16 @@ void ComputeVertexShaderID(ShaderID *id_out, u32 vertType, bool useHWTransform) id.SetBits(VS_BIT_LS1, 2, gstate.getUVLS1()); } + // Bones. + bool enableBones = vertTypeIsSkinningEnabled(vertType); + id.SetBit(VS_BIT_ENABLE_BONES, enableBones); + if (enableBones) { + id.SetBits(VS_BIT_BONES, 3, TranslateNumBones(vertTypeGetNumBoneWeights(vertType)) - 1); + // 2 bits. We should probably send in the weight scalefactor as a uniform instead, + // or simply preconvert all weights to floats. + id.SetBits(VS_BIT_WEIGHT_FMTSCALE, 2, (vertType & GE_VTYPE_WEIGHT_MASK) >> GE_VTYPE_WEIGHT_SHIFT); + } + // Okay, d[1] coming up. ============== if (gstate.isLightingEnabled() || doShadeMapping) { // doShadeMapping is stored as UVGenMode, so this is enough for isLightingEnabled. diff --git a/GPU/Common/ShaderId.h b/GPU/Common/ShaderId.h index 8ef501a571..afc7c51313 100644 --- a/GPU/Common/ShaderId.h +++ b/GPU/Common/ShaderId.h @@ -7,6 +7,7 @@ // TODO: There will be additional bits, indicating that groups of these will be // sent to the shader and processed there. This will cut down the number of shaders ("ubershader approach") +// This is probably only really worth doing for lighting and bones. enum { VS_BIT_LMODE = 0, VS_BIT_IS_THROUGH = 1, @@ -28,7 +29,10 @@ enum { VS_BIT_UVPROJ_MODE = 18, // 2, can overlap with LS0 VS_BIT_LS0 = 18, // 2 VS_BIT_LS1 = 20, // 2 - // 22 - 31 are free. + VS_BIT_BONES = 22, // 3 should be enough, not 8 + // 25 - 29 are free. + VS_BIT_ENABLE_BONES = 30, + // 31 is free. VS_BIT_LIGHT0_COMP = 32, // 2 bits VS_BIT_LIGHT0_TYPE = 34, // 2 bits VS_BIT_LIGHT1_COMP = 36, // 2 bits diff --git a/GPU/Common/ShaderUniforms.cpp b/GPU/Common/ShaderUniforms.cpp index 5e0927410d..bc02f60912 100644 --- a/GPU/Common/ShaderUniforms.cpp +++ b/GPU/Common/ShaderUniforms.cpp @@ -246,3 +246,11 @@ void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms) { } } } + +void BoneUpdateUniforms(UB_VS_Bones *ub, uint64_t dirtyUniforms) { + for (int i = 0; i < 8; i++) { + if (dirtyUniforms & (DIRTY_BONEMATRIX0 << i)) { + ConvertMatrix4x3To3x4Transposed(ub->bones[i], gstate.boneMatrix + 12 * i); + } + } +} diff --git a/GPU/Common/ShaderUniforms.h b/GPU/Common/ShaderUniforms.h index 9e7d028c7a..55cdaf6efd 100644 --- a/GPU/Common/ShaderUniforms.h +++ b/GPU/Common/ShaderUniforms.h @@ -159,5 +159,22 @@ R"( float4 u_ambient; float3 u_lightspecular3; )"; +// With some cleverness, we could get away with uploading just half this when only the four or five first +// bones are being used. This is 512b, 256b would be great. +struct UB_VS_Bones { + float bones[8][12]; +}; + +static const char *ub_vs_bonesStr = +R"( mat3x4 m[8]; +)"; + +// HLSL code is shared so these names are changed to match those in DX9. +static const char *cb_vs_bonesStr = +R"( float4x3 u_bone[8]; +)"; + void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipViewport); void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms); +void BoneUpdateUniforms(UB_VS_Bones *ub, uint64_t dirtyUniforms); + diff --git a/GPU/Common/SoftwareTransformCommon.cpp b/GPU/Common/SoftwareTransformCommon.cpp index 5ebf13570c..a29c987c11 100644 --- a/GPU/Common/SoftwareTransformCommon.cpp +++ b/GPU/Common/SoftwareTransformCommon.cpp @@ -144,6 +144,8 @@ void SoftwareTransform( vscale /= gstate_c.curTextureHeight; } + bool skinningEnabled = vertTypeIsSkinningEnabled(vertType); + const int w = gstate.getTextureWidth(0); const int h = gstate.getTextureHeight(0); float widthFactor = (float) w / (float) gstate_c.curTextureWidth; @@ -211,14 +213,48 @@ void SoftwareTransform( Vec3f worldnormal(0, 0, 1); reader.ReadPos(pos); - Vec3ByMatrix43(out, pos, gstate.worldMatrix); - if (reader.hasNormal()) { - reader.ReadNrm(normal.AsArray()); - if (gstate.areNormalsReversed()) { - normal = -normal; + if (!skinningEnabled) { + Vec3ByMatrix43(out, pos, gstate.worldMatrix); + if (reader.hasNormal()) { + reader.ReadNrm(normal.AsArray()); + if (gstate.areNormalsReversed()) { + normal = -normal; + } + Norm3ByMatrix43(worldnormal.AsArray(), normal.AsArray(), gstate.worldMatrix); + worldnormal = worldnormal.Normalized(); + } + } else { + float weights[8]; + reader.ReadWeights(weights); + if (reader.hasNormal()) + reader.ReadNrm(normal.AsArray()); + + // Skinning + Vec3f psum(0, 0, 0); + Vec3f nsum(0, 0, 0); + for (int i = 0; i < vertTypeGetNumBoneWeights(vertType); i++) { + if (weights[i] != 0.0f) { + Vec3ByMatrix43(out, pos, gstate.boneMatrix+i*12); + Vec3f tpos(out); + psum += tpos * weights[i]; + if (reader.hasNormal()) { + Vec3f norm; + Norm3ByMatrix43(norm.AsArray(), normal.AsArray(), gstate.boneMatrix+i*12); + nsum += norm * weights[i]; + } + } + } + + // Yes, we really must multiply by the world matrix too. + Vec3ByMatrix43(out, psum.AsArray(), gstate.worldMatrix); + if (reader.hasNormal()) { + normal = nsum; + if (gstate.areNormalsReversed()) { + normal = -normal; + } + Norm3ByMatrix43(worldnormal.AsArray(), normal.AsArray(), gstate.worldMatrix); + worldnormal = worldnormal.Normalized(); } - Norm3ByMatrix43(worldnormal.AsArray(), normal.AsArray(), gstate.worldMatrix); - worldnormal = worldnormal.Normalized(); } // Perform lighting here if enabled. don't need to check through, it's checked above. diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp index c7cddcb4de..79e6584ae2 100644 --- a/GPU/Common/VertexDecoderArm.cpp +++ b/GPU/Common/VertexDecoderArm.cpp @@ -229,7 +229,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // Add code to convert matrices to 4x4. // Later we might want to do this when the matrices are loaded instead. int boneCount = 0; - if (NEONSkinning && dec.weighttype) { + if (NEONSkinning && dec.weighttype && g_Config.bSoftwareSkinning && dec.morphcount == 1) { // Copying from R3 to R4 MOVP2R(R3, gstate.boneMatrix); MOVP2R(R4, bones); diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index 3c87a6b677..acfa230ea9 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -193,7 +193,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // Add code to convert matrices to 4x4. // Later we might want to do this when the matrices are loaded instead. int boneCount = 0; - if (dec.weighttype) { + if (dec.weighttype && g_Config.bSoftwareSkinning && dec.morphcount == 1) { // Copying from R3 to R4 MOVP2R(X3, gstate.boneMatrix); MOVP2R(X4, bones); diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp index f18d24a93d..3b6b22ae16 100644 --- a/GPU/Common/VertexDecoderX86.cpp +++ b/GPU/Common/VertexDecoderX86.cpp @@ -202,7 +202,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // Add code to convert matrices to 4x4. // Later we might want to do this when the matrices are loaded instead. int boneCount = 0; - if (dec.weighttype) { + if (dec.weighttype && g_Config.bSoftwareSkinning && dec.morphcount == 1) { MOV(PTRBITS, R(tempReg1), ImmPtr(&threeMasks)); MOVAPS(XMM4, MatR(tempReg1)); MOV(PTRBITS, R(tempReg1), ImmPtr(&aOne)); diff --git a/GPU/D3D11/DrawEngineD3D11.cpp b/GPU/D3D11/DrawEngineD3D11.cpp index 55006baca7..2b49ae2d6e 100644 --- a/GPU/D3D11/DrawEngineD3D11.cpp +++ b/GPU/D3D11/DrawEngineD3D11.cpp @@ -344,7 +344,7 @@ void DrawEngineD3D11::DoFlush() { // Cannot cache vertex data with morph enabled. bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK); // Also avoid caching when software skinning. - if (lastVType_ & GE_VTYPE_WEIGHT_MASK) + if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) useCache = false; if (useCache) { diff --git a/GPU/D3D11/GPU_D3D11.cpp b/GPU/D3D11/GPU_D3D11.cpp index 2b5d760302..8ad9ebd11a 100644 --- a/GPU/D3D11/GPU_D3D11.cpp +++ b/GPU/D3D11/GPU_D3D11.cpp @@ -103,6 +103,7 @@ GPU_D3D11::GPU_D3D11(GraphicsContext *gfxCtx, Draw::DrawContext *draw) // No need to flush before the tex scale/offset commands if we are baking // the tex scale/offset into the vertices anyway. + UpdateCmdInfo(); CheckGPUFeatures(); BuildReportingInfo(); @@ -214,6 +215,7 @@ void GPU_D3D11::InitClear() { void GPU_D3D11::BeginHostFrame() { GPUCommon::BeginHostFrame(); + UpdateCmdInfo(); if (resized_) { CheckGPUFeatures(); framebufferManager_->Resized(); diff --git a/GPU/D3D11/ShaderManagerD3D11.cpp b/GPU/D3D11/ShaderManagerD3D11.cpp index 7e79d57224..c5c498f5f5 100644 --- a/GPU/D3D11/ShaderManagerD3D11.cpp +++ b/GPU/D3D11/ShaderManagerD3D11.cpp @@ -93,19 +93,24 @@ ShaderManagerD3D11::ShaderManagerD3D11(ID3D11Device *device, ID3D11DeviceContext codeBuffer_ = new char[16384]; memset(&ub_base, 0, sizeof(ub_base)); memset(&ub_lights, 0, sizeof(ub_lights)); + memset(&ub_bones, 0, sizeof(ub_bones)); INFO_LOG(G3D, "sizeof(ub_base): %d", (int)sizeof(ub_base)); INFO_LOG(G3D, "sizeof(ub_lights): %d", (int)sizeof(ub_lights)); + INFO_LOG(G3D, "sizeof(ub_bones): %d", (int)sizeof(ub_bones)); D3D11_BUFFER_DESC desc{sizeof(ub_base), D3D11_USAGE_DYNAMIC, D3D11_BIND_CONSTANT_BUFFER, D3D11_CPU_ACCESS_WRITE }; ASSERT_SUCCESS(device_->CreateBuffer(&desc, nullptr, &push_base)); desc.ByteWidth = sizeof(ub_lights); ASSERT_SUCCESS(device_->CreateBuffer(&desc, nullptr, &push_lights)); + desc.ByteWidth = sizeof(ub_bones); + ASSERT_SUCCESS(device_->CreateBuffer(&desc, nullptr, &push_bones)); } ShaderManagerD3D11::~ShaderManagerD3D11() { push_base->Release(); push_lights->Release(); + push_bones->Release(); ClearShaders(); delete[] codeBuffer_; } @@ -154,15 +159,21 @@ uint64_t ShaderManagerD3D11::UpdateUniforms() { memcpy(map.pData, &ub_lights, sizeof(ub_lights)); context_->Unmap(push_lights, 0); } + if (dirty & DIRTY_BONE_UNIFORMS) { + BoneUpdateUniforms(&ub_bones, dirty); + context_->Map(push_bones, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); + memcpy(map.pData, &ub_bones, sizeof(ub_bones)); + context_->Unmap(push_bones, 0); + } } gstate_c.CleanUniforms(); return dirty; } void ShaderManagerD3D11::BindUniforms() { - ID3D11Buffer *vs_cbs[2] = { push_base, push_lights }; + ID3D11Buffer *vs_cbs[3] = { push_base, push_lights, push_bones }; ID3D11Buffer *ps_cbs[1] = { push_base }; - context_->VSSetConstantBuffers(0, 2, vs_cbs); + context_->VSSetConstantBuffers(0, 3, vs_cbs); context_->PSSetConstantBuffers(0, 1, ps_cbs); } diff --git a/GPU/D3D11/ShaderManagerD3D11.h b/GPU/D3D11/ShaderManagerD3D11.h index 4d08c35dc4..23ff3be8a5 100644 --- a/GPU/D3D11/ShaderManagerD3D11.h +++ b/GPU/D3D11/ShaderManagerD3D11.h @@ -121,10 +121,12 @@ private: // Uniform block scratchpad. These (the relevant ones) are copied to the current pushbuffer at draw time. UB_VS_FS_Base ub_base; UB_VS_Lights ub_lights; + UB_VS_Bones ub_bones; // Not actual pushbuffers, requires D3D11.1, let's try to live without that first. ID3D11Buffer *push_base; ID3D11Buffer *push_lights; + ID3D11Buffer *push_bones; D3D11FragmentShader *lastFShader_; D3D11VertexShader *lastVShader_; diff --git a/GPU/Directx9/DrawEngineDX9.cpp b/GPU/Directx9/DrawEngineDX9.cpp index 9330893a97..c72d256727 100644 --- a/GPU/Directx9/DrawEngineDX9.cpp +++ b/GPU/Directx9/DrawEngineDX9.cpp @@ -325,7 +325,7 @@ void DrawEngineDX9::DoFlush() { // Cannot cache vertex data with morph enabled. bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK); // Also avoid caching when software skinning. - if (lastVType_ & GE_VTYPE_WEIGHT_MASK) + if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) useCache = false; if (useCache) { diff --git a/GPU/Directx9/GPU_DX9.cpp b/GPU/Directx9/GPU_DX9.cpp index 52f1a6e2d6..adb50ecbf0 100644 --- a/GPU/Directx9/GPU_DX9.cpp +++ b/GPU/Directx9/GPU_DX9.cpp @@ -83,6 +83,9 @@ GPU_DX9::GPU_DX9(GraphicsContext *gfxCtx, Draw::DrawContext *draw) ERROR_LOG(G3D, "gstate has drifted out of sync!"); } + // No need to flush before the tex scale/offset commands if we are baking + // the tex scale/offset into the vertices anyway. + UpdateCmdInfo(); CheckGPUFeatures(); BuildReportingInfo(); @@ -188,6 +191,7 @@ void GPU_DX9::InitClear() { void GPU_DX9::BeginHostFrame() { GPUCommon::BeginHostFrame(); + UpdateCmdInfo(); if (resized_) { CheckGPUFeatures(); framebufferManager_->Resized(); diff --git a/GPU/Directx9/ShaderManagerDX9.cpp b/GPU/Directx9/ShaderManagerDX9.cpp index 10e128d620..70d4cb7693 100644 --- a/GPU/Directx9/ShaderManagerDX9.cpp +++ b/GPU/Directx9/ShaderManagerDX9.cpp @@ -313,7 +313,7 @@ void ShaderManagerDX9::PSUpdateUniforms(u64 dirtyUniforms) { } const uint64_t vsUniforms = DIRTY_PROJMATRIX | DIRTY_PROJTHROUGHMATRIX | DIRTY_WORLDMATRIX | DIRTY_VIEWMATRIX | DIRTY_TEXMATRIX | -DIRTY_FOGCOEF | DIRTY_UVSCALEOFFSET | DIRTY_DEPTHRANGE | +DIRTY_FOGCOEF | DIRTY_BONE_UNIFORMS | DIRTY_UVSCALEOFFSET | DIRTY_DEPTHRANGE | DIRTY_AMBIENT | DIRTY_MATAMBIENTALPHA | DIRTY_MATSPECULAR | DIRTY_MATDIFFUSE | DIRTY_MATEMISSIVE | DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3; void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) { @@ -382,6 +382,38 @@ void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) { #endif VSSetFloatArray(CONST_VS_FOGCOEF, fogcoef, 2); } + // TODO: Could even set all bones in one go if they're all dirty. +#ifdef USE_BONE_ARRAY + if (u_bone != 0) { + float allBones[8 * 16]; + + bool allDirty = true; + for (int i = 0; i < numBones; i++) { + if (dirtyUniforms & (DIRTY_BONEMATRIX0 << i)) { + ConvertMatrix4x3To4x4(allBones + 16 * i, gstate.boneMatrix + 12 * i); + } else { + allDirty = false; + } + } + if (allDirty) { + // Set them all with one call + //glUniformMatrix4fv(u_bone, numBones, GL_FALSE, allBones); + } else { + // Set them one by one. Could try to coalesce two in a row etc but too lazy. + for (int i = 0; i < numBones; i++) { + if (dirtyUniforms & (DIRTY_BONEMATRIX0 << i)) { + //glUniformMatrix4fv(u_bone + i, 1, GL_FALSE, allBones + 16 * i); + } + } + } + } +#else + for (int i = 0; i < 8; i++) { + if (dirtyUniforms & (DIRTY_BONEMATRIX0 << i)) { + VSSetMatrix4x3_3(CONST_VS_BONE0 + 3 * i, gstate.boneMatrix + 12 * i); + } + } +#endif // Texturing if (dirtyUniforms & DIRTY_UVSCALEOFFSET) { diff --git a/GPU/Directx9/VertexShaderGeneratorDX9.cpp b/GPU/Directx9/VertexShaderGeneratorDX9.cpp index 3a9fe06465..ed397500b4 100644 --- a/GPU/Directx9/VertexShaderGeneratorDX9.cpp +++ b/GPU/Directx9/VertexShaderGeneratorDX9.cpp @@ -37,6 +37,18 @@ namespace DX9 { +static const char * const boneWeightAttrDecl[9] = { + "#ERROR#", + "float a_w1:TEXCOORD1;\n", + "float2 a_w1:TEXCOORD1;\n", + "float3 a_w1:TEXCOORD1;\n", + "float4 a_w1:TEXCOORD1;\n", + "float4 a_w1:TEXCOORD1;\n float a_w2:TEXCOORD2;\n", + "float4 a_w1:TEXCOORD1;\n float2 a_w2:TEXCOORD2;\n", + "float4 a_w1:TEXCOORD1;\n float3 a_w2:TEXCOORD2;\n", + "float4 a_w1:TEXCOORD1;\n float4 a_w2:TEXCOORD2;\n", +}; + enum DoLightComputation { LIGHT_OFF, LIGHT_SHADE, @@ -68,6 +80,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage bool flipNormal = id.Bit(VS_BIT_NORM_REVERSE); int ls0 = id.Bits(VS_BIT_LS0, 2); int ls1 = id.Bits(VS_BIT_LS1, 2); + bool enableBones = id.Bit(VS_BIT_ENABLE_BONES); bool enableLighting = id.Bit(VS_BIT_LIGHTING_ENABLE); int matUpdate = id.Bits(VS_BIT_MATERIAL_UPDATE, 3); @@ -91,6 +104,9 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage int numBoneWeights = 0; int boneWeightScale = id.Bits(VS_BIT_WEIGHT_FMTSCALE, 2); + if (enableBones) { + numBoneWeights = 1 + id.Bits(VS_BIT_BONES, 3); + } if (lang == HLSL_DX9) { WRITE(p, "#pragma warning( disable : 3571 )\n"); @@ -113,6 +129,15 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage WRITE(p, "float4x3 u_view : register(c%i);\n", CONST_VS_VIEW); if (doTextureTransform) WRITE(p, "float4x3 u_tex : register(c%i);\n", CONST_VS_TEXMTX); + if (enableBones) { +#ifdef USE_BONE_ARRAY + WRITE(p, "float4x3 u_bone[%i] : register(c%i);\n", numBones, CONST_VS_BONE0); +#else + for (int i = 0; i < numBoneWeights; i++) { + WRITE(p, "float4x3 u_bone%i : register(c%i);\n", i, CONST_VS_BONE0 + i * 3); + } +#endif + } if (doTexture) { WRITE(p, "float4 u_uvscaleoffset : register(c%i);\n", CONST_VS_UVSCALEOFFSET); } @@ -156,6 +181,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage } else { WRITE(p, "cbuffer base : register(b0) {\n%s};\n", cb_baseStr); WRITE(p, "cbuffer lights: register(b1) {\n%s};\n", cb_vs_lightsStr); + WRITE(p, "cbuffer bones : register(b2) {\n%s};\n", cb_vs_bonesStr); } // And the "varyings". @@ -165,6 +191,9 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage if ((doSpline || doBezier) && lang == HLSL_D3D11) { WRITE(p, " uint instanceId : SV_InstanceID;\n"); } + if (enableBones) { + WRITE(p, " %s", boneWeightAttrDecl[numBoneWeights]); + } if (doTexture && hasTexcoord) { WRITE(p, " float2 texcoord : TEXCOORD0;\n"); } @@ -358,7 +387,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage } } else { // Step 1: World Transform / Skinning - if (true) { + if (!enableBones) { // Hardware tessellation if (doSpline || doBezier) { WRITE(p, " uint num_patches_u = %s;\n", doBezier ? "(u_spline_count_u - 1) / 3u" : "u_spline_count_u - 3"); @@ -467,6 +496,74 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage else WRITE(p, " float3 worldnormal = float3(0.0, 0.0, 1.0);\n"); } + } else { + static const char * const boneWeightAttr[8] = { + "a_w1.x", "a_w1.y", "a_w1.z", "a_w1.w", + "a_w2.x", "a_w2.y", "a_w2.z", "a_w2.w", + }; + +#if defined(USE_FOR_LOOP) && defined(USE_BONE_ARRAY) + + // To loop through the weights, we unfortunately need to put them in a float array. + // GLSL ES sucks - no way to directly initialize an array! + switch (numBoneWeights) { + case 1: WRITE(p, " float w[1]; w[0] = a_w1;\n"); break; + case 2: WRITE(p, " float w[2]; w[0] = a_w1.x; w[1] = a_w1.y;\n"); break; + case 3: WRITE(p, " float w[3]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z;\n"); break; + case 4: WRITE(p, " float w[4]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z; w[3] = a_w1.w;\n"); break; + case 5: WRITE(p, " float w[5]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z; w[3] = a_w1.w; w[4] = a_w2;\n"); break; + case 6: WRITE(p, " float w[6]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z; w[3] = a_w1.w; w[4] = a_w2.x; w[5] = a_w2.y;\n"); break; + case 7: WRITE(p, " float w[7]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z; w[3] = a_w1.w; w[4] = a_w2.x; w[5] = a_w2.y; w[6] = a_w2.z;\n"); break; + case 8: WRITE(p, " float w[8]; w[0] = a_w1.x; w[1] = a_w1.y; w[2] = a_w1.z; w[3] = a_w1.w; w[4] = a_w2.x; w[5] = a_w2.y; w[6] = a_w2.z; w[7] = a_w2.w;\n"); break; + } + + WRITE(p, " mat4 skinMatrix = w[0] * u_bone[0];\n"); + if (numBoneWeights > 1) { + WRITE(p, " for (int i = 1; i < %i; i++) {\n", numBoneWeights); + WRITE(p, " skinMatrix += w[i] * u_bone[i];\n"); + WRITE(p, " }\n"); + } + +#else + if (lang == HLSL_D3D11 || lang == HLSL_D3D11_LEVEL9) { + if (numBoneWeights == 1) + WRITE(p, " float4x3 skinMatrix = mul(In.a_w1, u_bone[0])"); + else + WRITE(p, " float4x3 skinMatrix = mul(In.a_w1.x, u_bone[0])"); + for (int i = 1; i < numBoneWeights; i++) { + const char *weightAttr = boneWeightAttr[i]; + // workaround for "cant do .x of scalar" issue + if (numBoneWeights == 1 && i == 0) weightAttr = "a_w1"; + if (numBoneWeights == 5 && i == 4) weightAttr = "a_w2"; + WRITE(p, " + mul(In.%s, u_bone[%i])", weightAttr, i); + } + } else { + if (numBoneWeights == 1) + WRITE(p, " float4x3 skinMatrix = mul(In.a_w1, u_bone0)"); + else + WRITE(p, " float4x3 skinMatrix = mul(In.a_w1.x, u_bone0)"); + for (int i = 1; i < numBoneWeights; i++) { + const char *weightAttr = boneWeightAttr[i]; + // workaround for "cant do .x of scalar" issue + if (numBoneWeights == 1 && i == 0) weightAttr = "a_w1"; + if (numBoneWeights == 5 && i == 4) weightAttr = "a_w2"; + WRITE(p, " + mul(In.%s, u_bone%i)", weightAttr, i); + } + } +#endif + + WRITE(p, ";\n"); + + // Trying to simplify this results in bugs in LBP... + WRITE(p, " float3 skinnedpos = mul(float4(In.position.xyz, 1.0), skinMatrix);\n"); + WRITE(p, " float3 worldpos = mul(float4(skinnedpos, 1.0), u_world);\n"); + + if (hasNormal) { + WRITE(p, " float3 skinnednormal = mul(float4(%sIn.normal, 0.0), skinMatrix);\n", flipNormal ? "-" : ""); + } else { + WRITE(p, " float3 skinnednormal = mul(float4(0.0, 0.0, %s1.0, 0.0), skinMatrix);\n", flipNormal ? "-" : ""); + } + WRITE(p, " float3 worldnormal = normalize(mul(float4(skinnednormal, 0.0), u_world));\n"); } WRITE(p, " float4 viewPos = float4(mul(float4(worldpos, 1.0), u_view), 1.0);\n"); diff --git a/GPU/Directx9/VertexShaderGeneratorDX9.h b/GPU/Directx9/VertexShaderGeneratorDX9.h index a76a5ec5ed..e33567992c 100644 --- a/GPU/Directx9/VertexShaderGeneratorDX9.h +++ b/GPU/Directx9/VertexShaderGeneratorDX9.h @@ -44,6 +44,15 @@ namespace DX9 { CONST_VS_LIGHTSPECULAR = 44, CONST_VS_LIGHTAMBIENT = 48, CONST_VS_DEPTHRANGE = 52, + CONST_VS_BONE0 = 53, + CONST_VS_BONE1 = 56, + CONST_VS_BONE2 = 59, + CONST_VS_BONE3 = 62, + CONST_VS_BONE4 = 65, + CONST_VS_BONE5 = 68, + CONST_VS_BONE6 = 71, + CONST_VS_BONE7 = 74, + CONST_VS_BONE8 = 77, }; }; diff --git a/GPU/GLES/DrawEngineGLES.cpp b/GPU/GLES/DrawEngineGLES.cpp index c304d5d236..25822b470d 100644 --- a/GPU/GLES/DrawEngineGLES.cpp +++ b/GPU/GLES/DrawEngineGLES.cpp @@ -325,7 +325,7 @@ void DrawEngineGLES::DoFlush() { // Cannot cache vertex data with morph enabled. bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK); // Also avoid caching when software skinning. - if (lastVType_ & GE_VTYPE_WEIGHT_MASK) + if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) useCache = false; // TEMPORARY @@ -469,7 +469,7 @@ void DrawEngineGLES::DoFlush() { vai->lastFrame = gpuStats.numFlips; } else { - if (lastVType_ & GE_VTYPE_WEIGHT_MASK) { + if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) { // If software skinning, we've already predecoded into "decoded". So push that content. size_t size = decodedVerts_ * dec_->GetDecVtxFmt().stride; u8 *dest = (u8 *)frameData.pushVertex->Push(size, &vertexBufferOffset, &vertexBuffer); diff --git a/GPU/GLES/GPU_GLES.cpp b/GPU/GLES/GPU_GLES.cpp index 7b64f05d32..b1505cfacd 100644 --- a/GPU/GLES/GPU_GLES.cpp +++ b/GPU/GLES/GPU_GLES.cpp @@ -89,6 +89,8 @@ GPU_GLES::GPU_GLES(GraphicsContext *gfxCtx, Draw::DrawContext *draw) // No need to flush before the tex scale/offset commands if we are baking // the tex scale/offset into the vertices anyway. + UpdateCmdInfo(); + BuildReportingInfo(); // Update again after init to be sure of any silly driver problems. UpdateVsyncInterval(true); @@ -344,6 +346,7 @@ void GPU_GLES::DeviceRestore() { draw_ = (Draw::DrawContext *)PSP_CoreParameter().graphicsContext->GetDrawContext(); ILOG("GPU_GLES: DeviceRestore"); + UpdateCmdInfo(); UpdateVsyncInterval(true); textureCacheGL_->DeviceRestore(draw_); @@ -363,6 +366,7 @@ void GPU_GLES::InitClear() { void GPU_GLES::BeginHostFrame() { GPUCommon::BeginHostFrame(); + UpdateCmdInfo(); if (resized_) { CheckGPUFeatures(); framebufferManager_->Resized(); diff --git a/GPU/GLES/ShaderManagerGLES.cpp b/GPU/GLES/ShaderManagerGLES.cpp index 9425589665..0fee3e6500 100644 --- a/GPU/GLES/ShaderManagerGLES.cpp +++ b/GPU/GLES/ShaderManagerGLES.cpp @@ -109,8 +109,21 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs, queries.push_back({ &u_world, "u_world" }); queries.push_back({ &u_texmtx, "u_texmtx" }); + if (VSID.Bit(VS_BIT_ENABLE_BONES)) + numBones = TranslateNumBones(VSID.Bits(VS_BIT_BONES, 3) + 1); + else + numBones = 0; queries.push_back({ &u_depthRange, "u_depthRange" }); +#ifdef USE_BONE_ARRAY + queries.push_back({ &u_bone, "u_bone" }); +#else + static const char * const boneNames[8] = { "u_bone0", "u_bone1", "u_bone2", "u_bone3", "u_bone4", "u_bone5", "u_bone6", "u_bone7", }; + for (int i = 0; i < 8; i++) { + queries.push_back({ &u_bone[i], boneNames[i] }); + } +#endif + // Lighting, texturing queries.push_back({ &u_ambient, "u_ambient" }); queries.push_back({ &u_matambientalpha, "u_matambientalpha" }); @@ -465,6 +478,13 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) { float f = (float)gstate.getStencilTestRef() * (1.0f / 255.0f); render_->SetUniformF(&u_stencilReplaceValue, 1, &f); } + float bonetemp[16]; + for (int i = 0; i < numBones; i++) { + if (dirty & (DIRTY_BONEMATRIX0 << i)) { + ConvertMatrix4x3To4x4(bonetemp, gstate.boneMatrix + 12 * i); + render_->SetUniformM4x4(&u_bone[i], bonetemp); + } + } if (dirty & DIRTY_SHADERBLEND) { if (u_blendFixA != -1) { @@ -790,7 +810,7 @@ std::string ShaderManagerGLES::DebugGetShaderString(std::string id, DebugShaderT // as sometimes these features might have an effect on the ID bits. #define CACHE_HEADER_MAGIC 0x83277592 -#define CACHE_VERSION 10 +#define CACHE_VERSION 11 struct CacheHeader { uint32_t magic; uint32_t version; diff --git a/GPU/GLES/ShaderManagerGLES.h b/GPU/GLES/ShaderManagerGLES.h index 7f19a77e28..e04a9a8c8b 100644 --- a/GPU/GLES/ShaderManagerGLES.h +++ b/GPU/GLES/ShaderManagerGLES.h @@ -72,6 +72,13 @@ public: int u_world; int u_depthRange; // x,y = viewport xscale/xcenter. z,w=clipping minz/maxz (?) +#ifdef USE_BONE_ARRAY + int u_bone; // array, size is numBones +#else + int u_bone[8]; +#endif + int numBones; + // Shader blending. int u_fbotex; int u_blendFixA; diff --git a/GPU/GLES/VertexShaderGeneratorGLES.cpp b/GPU/GLES/VertexShaderGeneratorGLES.cpp index 97e7ab3ef1..edb1828429 100644 --- a/GPU/GLES/VertexShaderGeneratorGLES.cpp +++ b/GPU/GLES/VertexShaderGeneratorGLES.cpp @@ -38,6 +38,30 @@ #define WRITE p+=sprintf +static const char * const boneWeightAttrDecl[9] = { + "#ERROR#", + "attribute mediump float w1;\n", + "attribute mediump vec2 w1;\n", + "attribute mediump vec3 w1;\n", + "attribute mediump vec4 w1;\n", + "attribute mediump vec4 w1;\nattribute mediump float w2;\n", + "attribute mediump vec4 w1;\nattribute mediump vec2 w2;\n", + "attribute mediump vec4 w1;\nattribute mediump vec3 w2;\n", + "attribute mediump vec4 w1, w2;\n", +}; + +static const char * const boneWeightInDecl[9] = { + "#ERROR#", + "in mediump float w1;\n", + "in mediump vec2 w1;\n", + "in mediump vec3 w1;\n", + "in mediump vec4 w1;\n", + "in mediump vec4 w1;\nin mediump float w2;\n", + "in mediump vec4 w1;\nin mediump vec2 w2;\n", + "in mediump vec4 w1;\nin mediump vec3 w2;\n", + "in mediump vec4 w1, w2;\n", +}; + enum DoLightComputation { LIGHT_OFF, LIGHT_SHADE, @@ -81,6 +105,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, bool glslES30 = false; const char *varying = "varying"; const char *attribute = "attribute"; + const char * const * boneWeightDecl = boneWeightAttrDecl; const char *texelFetch = NULL; bool highpFog = false; bool highpTexcoord = false; @@ -133,6 +158,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, if (glslES30 || gl_extensions.IsCoreContext) { attribute = "in"; varying = "out"; + boneWeightDecl = boneWeightInDecl; } bool isModeThrough = id.Bit(VS_BIT_IS_THROUGH); @@ -156,6 +182,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, bool flipNormal = id.Bit(VS_BIT_NORM_REVERSE); int ls0 = id.Bits(VS_BIT_LS0, 2); int ls1 = id.Bits(VS_BIT_LS1, 2); + bool enableBones = id.Bit(VS_BIT_ENABLE_BONES); bool enableLighting = id.Bit(VS_BIT_LIGHTING_ENABLE); int matUpdate = id.Bits(VS_BIT_MATERIAL_UPDATE, 3); @@ -181,6 +208,16 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, } } + int numBoneWeights = 0; + int boneWeightScale = id.Bits(VS_BIT_WEIGHT_FMTSCALE, 2); + if (enableBones) { + numBoneWeights = 1 + id.Bits(VS_BIT_BONES, 3); + WRITE(p, "%s", boneWeightDecl[numBoneWeights]); + *attrMask |= 1 << ATTR_W1; + if (numBoneWeights >= 5) + *attrMask |= 1 << ATTR_W2; + } + if (useHWTransform) WRITE(p, "%s vec3 position;\n", attribute); else @@ -231,6 +268,17 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, WRITE(p, "uniform mediump mat4 u_texmtx;\n"); *uniformMask |= DIRTY_TEXMATRIX; } + if (enableBones) { +#ifdef USE_BONE_ARRAY + WRITE(p, "uniform mediump mat4 u_bone[%i];\n", numBoneWeights); + *uniformMask |= DIRTY_BONE_UNIFORMS; +#else + for (int i = 0; i < numBoneWeights; i++) { + WRITE(p, "uniform mat4 u_bone%i;\n", i); + *uniformMask |= DIRTY_BONEMATRIX0 << i; + } +#endif + } if (doTexture) { WRITE(p, "uniform vec4 u_uvscaleoffset;\n"); *uniformMask |= DIRTY_UVSCALEOFFSET; @@ -436,7 +484,7 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, } } else { // Step 1: World Transform / Skinning - if (true) { + if (!enableBones) { // Hardware tessellation if (doBezier || doSpline) { WRITE(p, " vec3 _pos[16];\n"); @@ -543,6 +591,81 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, else WRITE(p, " mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n"); } + } else { + static const char *rescale[4] = {"", " * 1.9921875", " * 1.999969482421875", ""}; // 2*127.5f/128.f, 2*32767.5f/32768.f, 1.0f}; + const char *factor = rescale[boneWeightScale]; + + static const char * const boneWeightAttr[8] = { + "w1.x", "w1.y", "w1.z", "w1.w", + "w2.x", "w2.y", "w2.z", "w2.w", + }; + +#if defined(USE_FOR_LOOP) && defined(USE_BONE_ARRAY) + + // To loop through the weights, we unfortunately need to put them in a float array. + // GLSL ES sucks - no way to directly initialize an array! + switch (numBoneWeights) { + case 1: WRITE(p, " float w[1]; w[0] = w1;\n"); break; + case 2: WRITE(p, " float w[2]; w[0] = w1.x; w[1] = w1.y;\n"); break; + case 3: WRITE(p, " float w[3]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z;\n"); break; + case 4: WRITE(p, " float w[4]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z; w[3] = w1.w;\n"); break; + case 5: WRITE(p, " float w[5]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z; w[3] = w1.w; w[4] = w2;\n"); break; + case 6: WRITE(p, " float w[6]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z; w[3] = w1.w; w[4] = w2.x; w[5] = w2.y;\n"); break; + case 7: WRITE(p, " float w[7]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z; w[3] = w1.w; w[4] = w2.x; w[5] = w2.y; w[6] = w2.z;\n"); break; + case 8: WRITE(p, " float w[8]; w[0] = w1.x; w[1] = w1.y; w[2] = w1.z; w[3] = w1.w; w[4] = w2.x; w[5] = w2.y; w[6] = w2.z; w[7] = w2.w;\n"); break; + } + + WRITE(p, " mat4 skinMatrix = w[0] * u_bone[0];\n"); + if (numBoneWeights > 1) { + WRITE(p, " for (int i = 1; i < %i; i++) {\n", numBoneWeights); + WRITE(p, " skinMatrix += w[i] * u_bone[i];\n"); + WRITE(p, " }\n"); + } + +#else + +#ifdef USE_BONE_ARRAY + if (numBoneWeights == 1) + WRITE(p, " mat4 skinMatrix = w1 * u_bone[0]"); + else + WRITE(p, " mat4 skinMatrix = w1.x * u_bone[0]"); + for (int i = 1; i < numBoneWeights; i++) { + const char *weightAttr = boneWeightAttr[i]; + // workaround for "cant do .x of scalar" issue + if (numBoneWeights == 1 && i == 0) weightAttr = "w1"; + if (numBoneWeights == 5 && i == 4) weightAttr = "w2"; + WRITE(p, " + %s * u_bone[%i]", weightAttr, i); + } +#else + // Uncomment this to screw up bone shaders to check the vertex shader software fallback + // WRITE(p, "THIS SHOULD ERROR! #error"); + if (numBoneWeights == 1) + WRITE(p, " mat4 skinMatrix = w1 * u_bone0"); + else + WRITE(p, " mat4 skinMatrix = w1.x * u_bone0"); + for (int i = 1; i < numBoneWeights; i++) { + const char *weightAttr = boneWeightAttr[i]; + // workaround for "cant do .x of scalar" issue + if (numBoneWeights == 1 && i == 0) weightAttr = "w1"; + if (numBoneWeights == 5 && i == 4) weightAttr = "w2"; + WRITE(p, " + %s * u_bone%i", weightAttr, i); + } +#endif + +#endif + + WRITE(p, ";\n"); + + // Trying to simplify this results in bugs in LBP... + WRITE(p, " vec3 skinnedpos = (skinMatrix * vec4(position, 1.0)).xyz %s;\n", factor); + WRITE(p, " vec3 worldpos = (u_world * vec4(skinnedpos, 1.0)).xyz;\n"); + + if (hasNormal) { + WRITE(p, " mediump vec3 skinnednormal = (skinMatrix * vec4(%snormal, 0.0)).xyz %s;\n", flipNormal ? "-" : "", factor); + } else { + WRITE(p, " mediump vec3 skinnednormal = (skinMatrix * vec4(0.0, 0.0, %s1.0, 0.0)).xyz %s;\n", flipNormal ? "-" : "", factor); + } + WRITE(p, " mediump vec3 worldnormal = normalize((u_world * vec4(skinnednormal, 0.0)).xyz);\n"); } WRITE(p, " vec4 viewPos = u_view * vec4(worldpos, 1.0);\n"); diff --git a/GPU/GLES/VertexShaderGeneratorGLES.h b/GPU/GLES/VertexShaderGeneratorGLES.h index ae78cad767..7801b2cd6f 100644 --- a/GPU/GLES/VertexShaderGeneratorGLES.h +++ b/GPU/GLES/VertexShaderGeneratorGLES.h @@ -19,6 +19,8 @@ #include "Common/CommonTypes.h" +// #define USE_BONE_ARRAY + struct VShaderID; void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask, uint64_t *uniformMask); diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp index 83afe4b40c..0fb4318eb0 100644 --- a/GPU/GPUCommon.cpp +++ b/GPU/GPUCommon.cpp @@ -46,8 +46,8 @@ const CommonCommandTableEntry commonCommandTable[] = { { GE_CMD_BEZIER, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Bezier }, { GE_CMD_SPLINE, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Spline }, - // Changing the vertex type does not always require us to flush so handle that in Execute_VertexType. - { GE_CMD_VERTEXTYPE, FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_VertexType }, + // Changing the vertex type requires us to flush. + { GE_CMD_VERTEXTYPE, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_VertexType }, { GE_CMD_LOADCLUT, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPUCommon::Execute_LoadClut }, @@ -403,11 +403,23 @@ GPUCommon::GPUCommon(GraphicsContext *gfxCtx, Draw::DrawContext *draw) : ERROR_LOG(G3D, "Command missing from table: %02x (%i)", i, i); } } + + UpdateCmdInfo(); } GPUCommon::~GPUCommon() { } +void GPUCommon::UpdateCmdInfo() { + if (g_Config.bSoftwareSkinning) { + cmdInfo_[GE_CMD_VERTEXTYPE].flags &= ~FLAG_FLUSHBEFOREONCHANGE; + cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexTypeSkinning; + } else { + cmdInfo_[GE_CMD_VERTEXTYPE].flags |= FLAG_FLUSHBEFOREONCHANGE; + cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexType; + } +} + void GPUCommon::BeginHostFrame() { ReapplyGfxState(); @@ -1414,12 +1426,22 @@ void GPUCommon::Execute_TexSize0(u32 op, u32 diff) { } } +void GPUCommon::Execute_VertexType(u32 op, u32 diff) { + if (diff) + gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE); + if (diff & (GE_VTYPE_TC_MASK | GE_VTYPE_THROUGH_MASK)) { + gstate_c.Dirty(DIRTY_UVSCALEOFFSET); + if (diff & GE_VTYPE_THROUGH_MASK) + gstate_c.Dirty(DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_FRAGMENTSHADER_STATE); + } +} + void GPUCommon::Execute_LoadClut(u32 op, u32 diff) { gstate_c.Dirty(DIRTY_TEXTURE_PARAMS); textureCache_->LoadClut(gstate.getClutAddress(), gstate.getClutLoadBytes()); } -void GPUCommon::Execute_VertexType(u32 op, u32 diff) { +void GPUCommon::Execute_VertexTypeSkinning(u32 op, u32 diff) { // Don't flush when weight count changes. if (diff & ~GE_VTYPE_WEIGHTCOUNT_MASK) { // Restore and flush @@ -1661,6 +1683,10 @@ void GPUCommon::Execute_Bezier(u32 op, u32 diff) { indices = Memory::GetPointerUnchecked(gstate_c.indexAddr); } + if (vertTypeIsSkinningEnabled(gstate.vertType)) { + DEBUG_LOG_REPORT(G3D, "Unusual bezier/spline vtype: %08x, morph: %d, bones: %d", gstate.vertType, (gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) >> GE_VTYPE_MORPHCOUNT_SHIFT, vertTypeGetNumBoneWeights(gstate.vertType)); + } + GEPatchPrimType patchPrim = gstate.getPatchPrimitiveType(); SetDrawType(DRAW_BEZIER, PatchPrimToPrim(patchPrim)); @@ -1719,6 +1745,10 @@ void GPUCommon::Execute_Spline(u32 op, u32 diff) { indices = Memory::GetPointerUnchecked(gstate_c.indexAddr); } + if (vertTypeIsSkinningEnabled(gstate.vertType)) { + DEBUG_LOG_REPORT(G3D, "Unusual bezier/spline vtype: %08x, morph: %d, bones: %d", gstate.vertType, (gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) >> GE_VTYPE_MORPHCOUNT_SHIFT, vertTypeGetNumBoneWeights(gstate.vertType)); + } + int sp_ucount = op & 0xFF; int sp_vcount = (op >> 8) & 0xFF; int sp_utype = (op >> 16) & 0x3; @@ -1999,10 +2029,34 @@ void GPUCommon::Execute_BoneMtxNum(u32 op, u32 diff) { } if (fastLoad) { - while ((src[i] >> 24) == GE_CMD_BONEMATRIXDATA) { - dst[i] = src[i] << 8; - if (++i >= end) { - break; + // If we can't use software skinning, we have to flush and dirty. + if (!g_Config.bSoftwareSkinning) { + while ((src[i] >> 24) == GE_CMD_BONEMATRIXDATA) { + const u32 newVal = src[i] << 8; + if (dst[i] != newVal) { + Flush(); + dst[i] = newVal; + } + if (++i >= end) { + break; + } + } + + const unsigned int numPlusCount = (op & 0x7F) + i; + for (unsigned int num = op & 0x7F; num < numPlusCount; num += 12) { + gstate_c.Dirty(DIRTY_BONEMATRIX0 << (num / 12)); + } + } else { + while ((src[i] >> 24) == GE_CMD_BONEMATRIXDATA) { + dst[i] = src[i] << 8; + if (++i >= end) { + break; + } + } + + const unsigned int numPlusCount = (op & 0x7F) + i; + for (unsigned int num = op & 0x7F; num < numPlusCount; num += 12) { + gstate_c.deferredVertTypeDirty |= DIRTY_BONEMATRIX0 << (num / 12); } } } @@ -2020,6 +2074,13 @@ void GPUCommon::Execute_BoneMtxData(u32 op, u32 diff) { int num = gstate.boneMatrixNumber & 0x7F; u32 newVal = op << 8; if (num < 96 && newVal != ((const u32 *)gstate.boneMatrix)[num]) { + // Bone matrices should NOT flush when software skinning is enabled! + if (!g_Config.bSoftwareSkinning) { + Flush(); + gstate_c.Dirty(DIRTY_BONEMATRIX0 << (num / 12)); + } else { + gstate_c.deferredVertTypeDirty |= DIRTY_BONEMATRIX0 << (num / 12); + } ((u32 *)gstate.boneMatrix)[num] = newVal; } num++; @@ -2160,6 +2221,17 @@ void GPUCommon::Execute_Unknown(u32 op, u32 diff) { void GPUCommon::FastLoadBoneMatrix(u32 target) { const int num = gstate.boneMatrixNumber & 0x7F; const int mtxNum = num / 12; + uint32_t uniformsToDirty = DIRTY_BONEMATRIX0 << mtxNum; + if ((num - 12 * mtxNum) != 0) { + uniformsToDirty |= DIRTY_BONEMATRIX0 << ((mtxNum + 1) & 7); + } + + if (!g_Config.bSoftwareSkinning) { + Flush(); + gstate_c.Dirty(uniformsToDirty); + } else { + gstate_c.deferredVertTypeDirty |= uniformsToDirty; + } gstate.FastLoadBoneMatrix(target); } diff --git a/GPU/GPUCommon.h b/GPU/GPUCommon.h index 44bb9ab6d2..9d379000e1 100644 --- a/GPU/GPUCommon.h +++ b/GPU/GPUCommon.h @@ -72,6 +72,8 @@ public: } virtual void CheckGPUFeatures() = 0; + void UpdateCmdInfo(); + bool IsReady() override { return true; } @@ -129,6 +131,7 @@ public: void Execute_End(u32 op, u32 diff); void Execute_VertexType(u32 op, u32 diff); + void Execute_VertexTypeSkinning(u32 op, u32 diff); void Execute_Prim(u32 op, u32 diff); void Execute_Bezier(u32 op, u32 diff); diff --git a/GPU/GPUState.cpp b/GPU/GPUState.cpp index 19ae21706d..7e3f3812ef 100644 --- a/GPU/GPUState.cpp +++ b/GPU/GPUState.cpp @@ -240,6 +240,13 @@ void GPUgstate::Restore(u32_le *ptr) { } } +bool vertTypeIsSkinningEnabled(u32 vertType) { + if (g_Config.bSoftwareSkinning) + return false; + else + return ((vertType & GE_VTYPE_WEIGHT_MASK) != GE_VTYPE_WEIGHT_NONE); +} + struct GPUStateCache_v0 { u32 vertexAddr; u32 indexAddr; diff --git a/GPU/GPUState.h b/GPU/GPUState.h index 18d8cdd21b..58bc5e3859 100644 --- a/GPU/GPUState.h +++ b/GPU/GPUState.h @@ -441,6 +441,11 @@ struct GPUgstate { void Restore(u32_le *ptr); }; +bool vertTypeIsSkinningEnabled(u32 vertType); + +inline int vertTypeGetNumBoneWeights(u32 vertType) { return 1 + ((vertType & GE_VTYPE_WEIGHTCOUNT_MASK) >> GE_VTYPE_WEIGHTCOUNT_SHIFT); } +inline int vertTypeGetWeightMask(u32 vertType) { return vertType & GE_VTYPE_WEIGHT_MASK; } + // The rest is cached simplified/converted data for fast access. // Does not need to be saved when saving/restoring context. // diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp index da74874c7c..b24d286940 100644 --- a/GPU/Software/SoftGpu.cpp +++ b/GPU/Software/SoftGpu.cpp @@ -396,6 +396,10 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) { indices = Memory::GetPointerUnchecked(gstate_c.indexAddr); } + if ((gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) || vertTypeIsSkinningEnabled(gstate.vertType)) { + DEBUG_LOG_REPORT(G3D, "Unusual bezier/spline vtype: %08x, morph: %d, bones: %d", gstate.vertType, (gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) >> GE_VTYPE_MORPHCOUNT_SHIFT, vertTypeGetNumBoneWeights(gstate.vertType)); + } + GEPatchPrimType patchPrim = gstate.getPatchPrimitiveType(); SetDrawType(DRAW_BEZIER, PatchPrimToPrim(patchPrim)); @@ -440,6 +444,10 @@ void SoftGPU::ExecuteOp(u32 op, u32 diff) { indices = Memory::GetPointerUnchecked(gstate_c.indexAddr); } + if ((gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) || vertTypeIsSkinningEnabled(gstate.vertType)) { + DEBUG_LOG_REPORT(G3D, "Unusual bezier/spline vtype: %08x, morph: %d, bones: %d", gstate.vertType, (gstate.vertType & GE_VTYPE_MORPHCOUNT_MASK) >> GE_VTYPE_MORPHCOUNT_SHIFT, vertTypeGetNumBoneWeights(gstate.vertType)); + } + int sp_ucount = op & 0xFF; int sp_vcount = (op >> 8) & 0xFF; int sp_utype = (op >> 16) & 0x3; diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index e6bc3fbba0..f1e7e9581b 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -167,6 +167,27 @@ VertexData TransformUnit::ReadVertex(VertexReader& vreader) vertex.normal = -vertex.normal; } + if (vertTypeIsSkinningEnabled(gstate.vertType) && !gstate.isModeThrough()) { + float W[8] = { 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f }; + vreader.ReadWeights(W); + + Vec3 tmppos(0.f, 0.f, 0.f); + Vec3 tmpnrm(0.f, 0.f, 0.f); + + for (int i = 0; i < vertTypeGetNumBoneWeights(gstate.vertType); ++i) { + Mat3x3 bone(&gstate.boneMatrix[12*i]); + tmppos += (bone * ModelCoords(pos[0], pos[1], pos[2]) + Vec3(gstate.boneMatrix[12*i+9], gstate.boneMatrix[12*i+10], gstate.boneMatrix[12*i+11])) * W[i]; + if (vreader.hasNormal()) + tmpnrm += (bone * vertex.normal) * W[i]; + } + + pos[0] = tmppos.x; + pos[1] = tmppos.y; + pos[2] = tmppos.z; + if (vreader.hasNormal()) + vertex.normal = tmpnrm; + } + if (vreader.hasColor0()) { float col[4]; vreader.ReadColor0(col); diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index b261c1cc4c..5b5275d77e 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -66,7 +66,8 @@ enum { DRAW_BINDING_2ND_TEXTURE = 1, DRAW_BINDING_DYNUBO_BASE = 2, DRAW_BINDING_DYNUBO_LIGHT = 3, - DRAW_BINDING_TESS_STORAGE_BUF = 4, + DRAW_BINDING_DYNUBO_BONE = 4, + DRAW_BINDING_TESS_STORAGE_BUF = 5, }; enum { @@ -94,7 +95,7 @@ DrawEngineVulkan::DrawEngineVulkan(VulkanContext *vulkan, Draw::DrawContext *dra void DrawEngineVulkan::InitDeviceObjects() { // All resources we need for PSP drawing. Usually only bindings 0 and 2-4 are populated. - VkDescriptorSetLayoutBinding bindings[5]{}; + VkDescriptorSetLayoutBinding bindings[6]{}; bindings[0].descriptorCount = 1; bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; bindings[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; @@ -111,11 +112,15 @@ void DrawEngineVulkan::InitDeviceObjects() { bindings[3].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; bindings[3].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; bindings[3].binding = DRAW_BINDING_DYNUBO_LIGHT; - // Used only for hardware tessellation. bindings[4].descriptorCount = 1; - bindings[4].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[4].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; bindings[4].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - bindings[4].binding = DRAW_BINDING_TESS_STORAGE_BUF; + bindings[4].binding = DRAW_BINDING_DYNUBO_BONE; + // Used only for hardware tessellation. + bindings[5].descriptorCount = 1; + bindings[5].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[5].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + bindings[5].binding = DRAW_BINDING_TESS_STORAGE_BUF; VkDevice device = vulkan_->GetDevice(); @@ -129,7 +134,7 @@ void DrawEngineVulkan::InitDeviceObjects() { // if creating and updating them turns out to be expensive. for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) { // We now create descriptor pools on demand, so removed from here. - frame_[i].pushUBO = new VulkanPushBuffer(vulkan_, 4 * 1024 * 1024); + frame_[i].pushUBO = new VulkanPushBuffer(vulkan_, 8 * 1024 * 1024); frame_[i].pushVertex = new VulkanPushBuffer(vulkan_, 2 * 1024 * 1024); frame_[i].pushIndex = new VulkanPushBuffer(vulkan_, 1 * 1024 * 1024); } @@ -360,7 +365,7 @@ VkResult DrawEngineVulkan::RecreateDescriptorPool(FrameData &frame, int newSize) frame.descPoolSize = newSize; VkDescriptorPoolSize dpTypes[3]; - dpTypes[0].descriptorCount = frame.descPoolSize * 2; + dpTypes[0].descriptorCount = frame.descPoolSize * 3; dpTypes[0].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; dpTypes[1].descriptorCount = frame.descPoolSize * 2; // Don't use these for tess anymore, need max two per set. dpTypes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; @@ -378,15 +383,17 @@ VkResult DrawEngineVulkan::RecreateDescriptorPool(FrameData &frame, int newSize) return res; } -VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, bool tess) { +VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, VkBuffer bone, bool tess) { DescriptorSetKey key; key.imageView_ = imageView; key.sampler_ = sampler; key.secondaryImageView_ = boundSecondary_; key.base_ = base; key.light_ = light; + key.bone_ = bone; _dbg_assert_(G3D, base != VK_NULL_HANDLE); _dbg_assert_(G3D, light != VK_NULL_HANDLE); + _dbg_assert_(G3D, bone != VK_NULL_HANDLE); FrameData &frame = frame_[vulkan_->GetCurFrame()]; // See if we already have this descriptor set cached. @@ -494,7 +501,7 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView } // Uniform buffer objects - VkDescriptorBufferInfo buf[2]{}; + VkDescriptorBufferInfo buf[3]{}; int count = 0; buf[count].buffer = base; buf[count].offset = 0; @@ -504,6 +511,10 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView buf[count].offset = 0; buf[count].range = sizeof(UB_VS_Lights); count++; + buf[count].buffer = bone; + buf[count].offset = 0; + buf[count].range = sizeof(UB_VS_Bones); + count++; for (int i = 0; i < count; i++) { writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; writes[n].pNext = nullptr; @@ -527,9 +538,11 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView void DrawEngineVulkan::DirtyAllUBOs() { baseUBOOffset = 0; lightUBOOffset = 0; + boneUBOOffset = 0; baseBuf = VK_NULL_HANDLE; lightBuf = VK_NULL_HANDLE; - dirtyUniforms_ = DIRTY_BASE_UNIFORMS | DIRTY_LIGHT_UNIFORMS; + boneBuf = VK_NULL_HANDLE; + dirtyUniforms_ = DIRTY_BASE_UNIFORMS | DIRTY_LIGHT_UNIFORMS | DIRTY_BONE_UNIFORMS; imageView = VK_NULL_HANDLE; sampler = VK_NULL_HANDLE; gstate_c.Dirty(DIRTY_TEXTURE_IMAGE); @@ -588,7 +601,7 @@ void DrawEngineVulkan::DoFlush() { // Also avoid caching when software skinning. VkBuffer vbuf = VK_NULL_HANDLE; VkBuffer ibuf = VK_NULL_HANDLE; - if (lastVType_ & GE_VTYPE_WEIGHT_MASK) { + if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) { useCache = false; } @@ -730,7 +743,7 @@ void DrawEngineVulkan::DoFlush() { break; } } else { - if (lastVType_ & GE_VTYPE_WEIGHT_MASK) { + if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) { // If software skinning, we've already predecoded into "decoded". So push that content. VkDeviceSize size = decodedVerts_ * dec_->GetDecVtxFmt().stride; u8 *dest = (u8 *)frame->pushVertex->Push(size, &vbOffset, &vbuf); @@ -802,12 +815,12 @@ void DrawEngineVulkan::DoFlush() { dirtyUniforms_ |= shaderManager_->UpdateUniforms(); UpdateUBOs(frame); - VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, tess); + VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, boneBuf, tess); { PROFILE_THIS_SCOPE("renderman_q"); - const uint32_t dynamicUBOOffsets[2] = { - baseUBOOffset, lightUBOOffset, + const uint32_t dynamicUBOOffsets[3] = { + baseUBOOffset, lightUBOOffset, boneUBOOffset, }; int stride = dec_->GetDecVtxFmt().stride; @@ -908,9 +921,9 @@ void DrawEngineVulkan::DoFlush() { // Even if the first draw is through-mode, make sure we at least have one copy of these uniforms buffered UpdateUBOs(frame); - VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, tess); - const uint32_t dynamicUBOOffsets[2] = { - baseUBOOffset, lightUBOOffset, + VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, boneBuf, tess); + const uint32_t dynamicUBOOffsets[3] = { + baseUBOOffset, lightUBOOffset, boneUBOOffset, }; PROFILE_THIS_SCOPE("renderman_q"); @@ -977,6 +990,10 @@ void DrawEngineVulkan::UpdateUBOs(FrameData *frame) { lightUBOOffset = shaderManager_->PushLightBuffer(frame->pushUBO, &lightBuf); dirtyUniforms_ &= ~DIRTY_LIGHT_UNIFORMS; } + if ((dirtyUniforms_ & DIRTY_BONE_UNIFORMS) || boneBuf == VK_NULL_HANDLE) { + boneUBOOffset = shaderManager_->PushBoneBuffer(frame->pushUBO, &boneBuf); + dirtyUniforms_ &= ~DIRTY_BONE_UNIFORMS; + } } DrawEngineVulkan::TessellationDataTransferVulkan::TessellationDataTransferVulkan(VulkanContext *vulkan, Draw::DrawContext *draw) diff --git a/GPU/Vulkan/DrawEngineVulkan.h b/GPU/Vulkan/DrawEngineVulkan.h index 1cc4976bc3..74cf5a0655 100644 --- a/GPU/Vulkan/DrawEngineVulkan.h +++ b/GPU/Vulkan/DrawEngineVulkan.h @@ -23,7 +23,7 @@ // * binding 1: Secondary texture sampler for shader blending or depal palettes // * binding 2: Base Uniform Buffer (includes fragment state) // * binding 3: Light uniform buffer -// * binding 4: Shader buffer storage for tesselation +// * binding 4: Bone uniform buffer // // All shaders conform to this layout, so they are all compatible with the same descriptor set. // The format of the various uniform buffers may vary though - vertex shaders that don't skin @@ -194,7 +194,7 @@ private: void DoFlush(); void UpdateUBOs(FrameData *frame); - VkDescriptorSet GetOrCreateDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, bool tess); + VkDescriptorSet GetOrCreateDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, VkBuffer bone, bool tess); VulkanContext *vulkan_; Draw::DrawContext *draw_; @@ -218,7 +218,7 @@ private: VkImageView imageView_; VkImageView secondaryImageView_; VkSampler sampler_; - VkBuffer base_, light_; // All three UBO slots will be set to this. This will usually be identical + VkBuffer base_, light_, bone_; // All three UBO slots will be set to this. This will usually be identical // for all draws in a frame, except when the buffer has to grow. }; @@ -252,7 +252,8 @@ private: uint64_t dirtyUniforms_; uint32_t baseUBOOffset; uint32_t lightUBOOffset; - VkBuffer baseBuf, lightBuf; + uint32_t boneUBOOffset; + VkBuffer baseBuf, lightBuf, boneBuf; VkImageView imageView = VK_NULL_HANDLE; VkSampler sampler = VK_NULL_HANDLE; diff --git a/GPU/Vulkan/GPU_Vulkan.cpp b/GPU/Vulkan/GPU_Vulkan.cpp index 621b7f6f7f..b5d78c9595 100644 --- a/GPU/Vulkan/GPU_Vulkan.cpp +++ b/GPU/Vulkan/GPU_Vulkan.cpp @@ -252,6 +252,7 @@ void GPU_Vulkan::CheckGPUFeatures() { void GPU_Vulkan::BeginHostFrame() { drawEngine_.BeginFrame(); + UpdateCmdInfo(); if (resized_) { CheckGPUFeatures(); @@ -490,6 +491,7 @@ void GPU_Vulkan::DeviceRestore() { CheckGPUFeatures(); BuildReportingInfo(); + UpdateCmdInfo(); framebufferManagerVulkan_->DeviceRestore(vulkan_, draw_); vulkan2D_.DeviceRestore(vulkan_); diff --git a/GPU/Vulkan/ShaderManagerVulkan.cpp b/GPU/Vulkan/ShaderManagerVulkan.cpp index bf2e425b7f..17106e0951 100644 --- a/GPU/Vulkan/ShaderManagerVulkan.cpp +++ b/GPU/Vulkan/ShaderManagerVulkan.cpp @@ -158,9 +158,11 @@ ShaderManagerVulkan::ShaderManagerVulkan(VulkanContext *vulkan) uboAlignment_ = vulkan_->GetPhysicalDeviceProperties().limits.minUniformBufferOffsetAlignment; memset(&ub_base, 0, sizeof(ub_base)); memset(&ub_lights, 0, sizeof(ub_lights)); + memset(&ub_bones, 0, sizeof(ub_bones)); ILOG("sizeof(ub_base): %d", (int)sizeof(ub_base)); ILOG("sizeof(ub_lights): %d", (int)sizeof(ub_lights)); + ILOG("sizeof(ub_bones): %d", (int)sizeof(ub_bones)); } ShaderManagerVulkan::~ShaderManagerVulkan() { @@ -213,6 +215,8 @@ uint64_t ShaderManagerVulkan::UpdateUniforms() { BaseUpdateUniforms(&ub_base, dirty, false); if (dirty & DIRTY_LIGHT_UNIFORMS) LightUpdateUniforms(&ub_lights, dirty); + if (dirty & DIRTY_BONE_UNIFORMS) + BoneUpdateUniforms(&ub_bones, dirty); } gstate_c.CleanUniforms(); return dirty; diff --git a/GPU/Vulkan/ShaderManagerVulkan.h b/GPU/Vulkan/ShaderManagerVulkan.h index 281d4768e0..2bcce9332b 100644 --- a/GPU/Vulkan/ShaderManagerVulkan.h +++ b/GPU/Vulkan/ShaderManagerVulkan.h @@ -111,6 +111,7 @@ public: // Applies dirty changes and copies the buffer. bool IsBaseDirty() { return true; } bool IsLightDirty() { return true; } + bool IsBoneDirty() { return true; } uint32_t PushBaseBuffer(VulkanPushBuffer *dest, VkBuffer *buf) { return dest->PushAligned(&ub_base, sizeof(ub_base), uboAlignment_, buf); @@ -118,6 +119,10 @@ public: uint32_t PushLightBuffer(VulkanPushBuffer *dest, VkBuffer *buf) { return dest->PushAligned(&ub_lights, sizeof(ub_lights), uboAlignment_, buf); } + // TODO: Only push half the bone buffer if we only have four bones. + uint32_t PushBoneBuffer(VulkanPushBuffer *dest, VkBuffer *buf) { + return dest->PushAligned(&ub_bones, sizeof(ub_bones), uboAlignment_, buf); + } bool LoadCache(FILE *f); void SaveCache(FILE *f); @@ -139,6 +144,7 @@ private: // Uniform block scratchpad. These (the relevant ones) are copied to the current pushbuffer at draw time. UB_VS_FS_Base ub_base; UB_VS_Lights ub_lights; + UB_VS_Bones ub_bones; VulkanFragmentShader *lastFShader_; VulkanVertexShader *lastVShader_; diff --git a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp index 0b39ef5085..0dfcf57d11 100644 --- a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp +++ b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp @@ -54,6 +54,18 @@ static const char *vulkan_glsl_preamble = #define WRITE p+=sprintf +static const char * const boneWeightDecl[9] = { + "#ERROR#", + "layout(location = 3) in float w1;\n", + "layout(location = 3) in vec2 w1;\n", + "layout(location = 3) in vec3 w1;\n", + "layout(location = 3) in vec4 w1;\n", + "layout(location = 3) in vec4 w1;\nlayout(location = 4) in float w2;\n", + "layout(location = 3) in vec4 w1;\nlayout(location = 4) in vec2 w2;\n", + "layout(location = 3) in vec4 w1;\nlayout(location = 4) in vec3 w2;\n", + "layout(location = 3) in vec4 w1;\nlayout(location = 4) in vec4 w2;\n", +}; + enum DoLightComputation { LIGHT_OFF, LIGHT_SHADE, @@ -114,6 +126,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { bool flipNormal = id.Bit(VS_BIT_NORM_REVERSE); int ls0 = id.Bits(VS_BIT_LS0, 2); int ls1 = id.Bits(VS_BIT_LS1, 2); + bool enableBones = id.Bit(VS_BIT_ENABLE_BONES); bool enableLighting = id.Bit(VS_BIT_LIGHTING_ENABLE); int matUpdate = id.Bits(VS_BIT_MATERIAL_UPDATE, 3); @@ -127,6 +140,8 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { WRITE(p, "layout (std140, set = 0, binding = 2) uniform baseVars {\n%s} base;\n", ub_baseStr); if (enableLighting || doShadeMapping) WRITE(p, "layout (std140, set = 0, binding = 3) uniform lightVars {\n%s} light;\n", ub_vs_lightsStr); + if (enableBones) + WRITE(p, "layout (std140, set = 0, binding = 4) uniform boneVars {\n%s} bone;\n", ub_vs_bonesStr); const char *shading = doFlatShading ? "flat " : ""; @@ -142,6 +157,13 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { } } + int numBoneWeights = 0; + int boneWeightScale = id.Bits(VS_BIT_WEIGHT_FMTSCALE, 2); + if (enableBones) { + numBoneWeights = 1 + id.Bits(VS_BIT_BONES, 3); + WRITE(p, "%s", boneWeightDecl[numBoneWeights]); + } + if (useHWTransform) WRITE(p, "layout (location = %d) in vec3 position;\n", (int)PspAttributeLocation::POSITION); else @@ -307,7 +329,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { } } else { // Step 1: World Transform / Skinning - if (true) { + if (!enableBones) { if (doBezier || doSpline) { WRITE(p, " vec3 _pos[16];\n"); WRITE(p, " vec2 _tex[16];\n"); @@ -413,6 +435,34 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { else WRITE(p, " mediump vec3 worldnormal = vec3(0.0, 0.0, 1.0);\n"); } + } else { + static const char *rescale[4] = { "", " * 1.9921875", " * 1.999969482421875", "" }; // 2*127.5f/128.f, 2*32767.5f/32768.f, 1.0f}; + const char *factor = rescale[boneWeightScale]; + + static const char * const boneWeightAttr[8] = { + "w1.x", "w1.y", "w1.z", "w1.w", + "w2.x", "w2.y", "w2.z", "w2.w", + }; + + WRITE(p, " mat3x4 skinMatrix = w1.x * bone.m[0];\n"); + if (numBoneWeights > 1) { + for (int i = 1; i < numBoneWeights; i++) { + WRITE(p, " skinMatrix += %s * bone.m[%i];\n", boneWeightAttr[i], i); + } + } + + WRITE(p, ";\n"); + + // Trying to simplify this results in bugs in LBP... + WRITE(p, " vec3 skinnedpos = (vec4(position, 1.0) * skinMatrix) %s;\n", factor); + WRITE(p, " vec3 worldpos = vec4(skinnedpos, 1.0) * base.world_mtx;\n"); + + if (hasNormal) { + WRITE(p, " mediump vec3 skinnednormal = vec4(%snormal, 0.0) * skinMatrix %s;\n", flipNormal ? "-" : "", factor); + } else { + WRITE(p, " mediump vec3 skinnednormal = vec4(0.0, 0.0, %s1.0, 0.0) * skinMatrix %s;\n", flipNormal ? "-" : "", factor); + } + WRITE(p, " mediump vec3 worldnormal = normalize(vec4(skinnednormal, 0.0) * base.world_mtx);\n"); } WRITE(p, " vec4 viewPos = vec4(vec4(worldpos, 1.0) * base.view_mtx, 1.0);\n"); diff --git a/UI/GameSettingsScreen.cpp b/UI/GameSettingsScreen.cpp index 7b485d4b68..904477697e 100644 --- a/UI/GameSettingsScreen.cpp +++ b/UI/GameSettingsScreen.cpp @@ -312,6 +312,13 @@ void GameSettingsScreen::CreateViews() { hwTransform->OnClick.Handle(this, &GameSettingsScreen::OnHardwareTransform); hwTransform->SetDisabledPtr(&g_Config.bSoftwareRendering); + CheckBox *swSkin = graphicsSettings->Add(new CheckBox(&g_Config.bSoftwareSkinning, gr->T("Software Skinning"))); + swSkin->OnClick.Add([=](EventParams &e) { + settingInfo_->Show(gr->T("SoftwareSkinning Tip", "Combine skinned model draws on the CPU, faster in most games"), e.v); + return UI::EVENT_CONTINUE; + }); + swSkin->SetDisabledPtr(&g_Config.bSoftwareRendering); + CheckBox *vtxCache = graphicsSettings->Add(new CheckBox(&g_Config.bVertexCache, gr->T("Vertex Cache"))); vtxCache->OnClick.Add([=](EventParams &e) { settingInfo_->Show(gr->T("VertexCache Tip", "Faster, but may cause temporary flicker"), e.v); diff --git a/ext/native/thin3d/VulkanQueueRunner.h b/ext/native/thin3d/VulkanQueueRunner.h index e9ad43fad7..0dd3ba8597 100644 --- a/ext/native/thin3d/VulkanQueueRunner.h +++ b/ext/native/thin3d/VulkanQueueRunner.h @@ -41,7 +41,7 @@ struct VkRenderData { VkPipelineLayout pipelineLayout; VkDescriptorSet ds; int numUboOffsets; - uint32_t uboOffsets[2]; + uint32_t uboOffsets[3]; VkBuffer vbuffer; // might need to increase at some point VkDeviceSize voffset; VkBuffer ibuffer; diff --git a/headless/Headless.cpp b/headless/Headless.cpp index 883f608537..661211d1d0 100644 --- a/headless/Headless.cpp +++ b/headless/Headless.cpp @@ -369,6 +369,7 @@ int main(int argc, const char* argv[]) g_Config.bFrameSkipUnthrottle = false; g_Config.bEnableLogging = fullLog; g_Config.iNumWorkerThreads = 1; + g_Config.bSoftwareSkinning = true; g_Config.bVertexDecoderJit = true; g_Config.bBlockTransferGPU = true; g_Config.iSplineBezierQuality = 2; diff --git a/unittest/TestVertexJit.cpp b/unittest/TestVertexJit.cpp index dc964c4564..40dd640c8b 100644 --- a/unittest/TestVertexJit.cpp +++ b/unittest/TestVertexJit.cpp @@ -543,6 +543,7 @@ static bool TestVertexColor565() { static bool TestVertex8Skin() { VertexDecoderTestHarness dec; + g_Config.bSoftwareSkinning = true; for (int i = 0; i < 8 * 12; ++i) { gstate.boneMatrix[i] = 0.0f; } @@ -572,6 +573,7 @@ static bool TestVertex8Skin() { static bool TestVertex16Skin() { VertexDecoderTestHarness dec; + g_Config.bSoftwareSkinning = true; for (int i = 0; i < 8 * 12; ++i) { gstate.boneMatrix[i] = 0.0f; } @@ -601,6 +603,7 @@ static bool TestVertex16Skin() { static bool TestVertexFloatSkin() { VertexDecoderTestHarness dec; + g_Config.bSoftwareSkinning = true; for (int i = 0; i < 8 * 12; ++i) { gstate.boneMatrix[i] = 0.0f; }