diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index 08f534c7..ee37bc57 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -308,6 +308,7 @@ struct AscQueueInfo { }; static Common::SlotVector asc_queues{}; static constexpr VAddr tessellation_factors_ring_addr = Core::SYSTEM_RESERVED_MAX - 0xFFFFFFF; +static constexpr u32 tessellation_offchip_buffer_size = 0x800000u; static void ResetSubmissionLock(Platform::InterruptId irq) { std::unique_lock lock{m_submission}; @@ -672,18 +673,50 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset, return -1; } -int PS4_SYSV_ABI sceGnmDrawIndexIndirectCountMulti() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); - return ORBIS_OK; +s32 PS4_SYSV_ABI sceGnmDrawIndexIndirectCountMulti(u32* cmdbuf, u32 size, u32 data_offset, + u32 max_count, u64 count_addr, u32 shader_stage, + u32 vertex_sgpr_offset, u32 instance_sgpr_offset, + u32 flags) { + LOG_TRACE(Lib_GnmDriver, "called"); + + if (cmdbuf && (size == 16) && (shader_stage < ShaderStages::Max) && + (vertex_sgpr_offset < 0x10u) && (instance_sgpr_offset < 0x10u)) { + + cmdbuf = WriteHeader(cmdbuf, 2); + cmdbuf = WriteBody(cmdbuf, 0u); + cmdbuf += 1; + + const auto predicate = flags & 1 ? PM4Predicate::PredEnable : PM4Predicate::PredDisable; + cmdbuf = WriteHeader( + cmdbuf, 9, PM4ShaderType::ShaderGraphics, predicate); + + const auto sgpr_offset = indirect_sgpr_offsets[shader_stage]; + + cmdbuf[0] = data_offset; + cmdbuf[1] = vertex_sgpr_offset == 0 ? 0 : (vertex_sgpr_offset & 0xffffu) + sgpr_offset; + cmdbuf[2] = instance_sgpr_offset == 0 ? 0 : (instance_sgpr_offset & 0xffffu) + sgpr_offset; + cmdbuf[3] = (count_addr != 0 ? 1u : 0u) << 0x1e; + cmdbuf[4] = max_count; + *(u64*)(&cmdbuf[5]) = count_addr; + cmdbuf[7] = AmdGpu::Liverpool::DrawIndexedIndirectArgsSize; + cmdbuf[8] = 0; + + cmdbuf += 9; + WriteTrailingNop<2>(cmdbuf); + return ORBIS_OK; + } + return -1; } int PS4_SYSV_ABI sceGnmDrawIndexIndirectMulti() { LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); + UNREACHABLE(); return ORBIS_OK; } int PS4_SYSV_ABI sceGnmDrawIndexMultiInstanced() { LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); + UNREACHABLE(); return ORBIS_OK; } @@ -730,11 +763,13 @@ s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 int PS4_SYSV_ABI sceGnmDrawIndirectCountMulti() { LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); + UNREACHABLE(); return ORBIS_OK; } int PS4_SYSV_ABI sceGnmDrawIndirectMulti() { LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); + UNREACHABLE(); return ORBIS_OK; } @@ -992,8 +1027,8 @@ int PS4_SYSV_ABI sceGnmGetNumTcaUnits() { } int PS4_SYSV_ABI sceGnmGetOffChipTessellationBufferSize() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); - return ORBIS_OK; + LOG_TRACE(Lib_GnmDriver, "called"); + return tessellation_offchip_buffer_size; } int PS4_SYSV_ABI sceGnmGetOwnerName() { @@ -2438,8 +2473,8 @@ int PS4_SYSV_ABI sceGnmValidateGetVersion() { } int PS4_SYSV_ABI sceGnmValidateOnSubmitEnabled() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); - return ORBIS_OK; + LOG_TRACE(Lib_GnmDriver, "called"); + return 0; } int PS4_SYSV_ABI sceGnmValidateResetState() { diff --git a/src/core/libraries/gnmdriver/gnmdriver.h b/src/core/libraries/gnmdriver/gnmdriver.h index a95daa90..115268ea 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.h +++ b/src/core/libraries/gnmdriver/gnmdriver.h @@ -47,7 +47,10 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexAuto(u32* cmdbuf, u32 size, u32 index_count, u32 s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage, u32 vertex_sgpr_offset, u32 instance_sgpr_offset, u32 flags); -int PS4_SYSV_ABI sceGnmDrawIndexIndirectCountMulti(); +s32 PS4_SYSV_ABI sceGnmDrawIndexIndirectCountMulti(u32* cmdbuf, u32 size, u32 data_offset, + u32 max_count, u64 count_addr, u32 shader_stage, + u32 vertex_sgpr_offset, u32 instance_sgpr_offset, + u32 flags); int PS4_SYSV_ABI sceGnmDrawIndexIndirectMulti(); int PS4_SYSV_ABI sceGnmDrawIndexMultiInstanced(); s32 PS4_SYSV_ABI sceGnmDrawIndexOffset(u32* cmdbuf, u32 size, u32 index_offset, u32 index_count, diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 53aab630..3c359b8d 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -417,7 +417,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndirect", cmd_address)); - rasterizer->DrawIndirect(false, ib_address, offset, size); + rasterizer->DrawIndirect(false, ib_address, offset, size, 1, 0); rasterizer->ScopeMarkerEnd(); } break; @@ -435,7 +435,27 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); rasterizer->ScopeMarkerBegin( fmt::format("dcb:{}:DrawIndexIndirect", cmd_address)); - rasterizer->DrawIndirect(true, ib_address, offset, size); + rasterizer->DrawIndirect(true, ib_address, offset, size, 1, 0); + rasterizer->ScopeMarkerEnd(); + } + break; + } + case PM4ItOpcode::DrawIndexIndirectCountMulti: { + const auto* draw_index_indirect = + reinterpret_cast(header); + const auto offset = draw_index_indirect->data_offset; + const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr; + const auto size = sizeof(PM4CmdDrawIndexIndirect::DrawIndexInstancedArgs); + if (DebugState.DumpingCurrentReg()) { + DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs); + } + if (rasterizer) { + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin( + fmt::format("dcb:{}:DrawIndexIndirectCountMulti", cmd_address)); + rasterizer->DrawIndirect(true, ib_address, offset, size, + draw_index_indirect->count, + draw_index_indirect->countAddr); rasterizer->ScopeMarkerEnd(); } break; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index a4cf7933..d94e4329 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -57,6 +57,8 @@ struct Liverpool { static constexpr u32 ConfigRegWordOffset = 0x2000; static constexpr u32 ShRegWordOffset = 0x2C00; static constexpr u32 NumRegs = 0xD000; + static constexpr u32 DrawIndirectArgsSize = 0x10u; + static constexpr u32 DrawIndexedIndirectArgsSize = 0x14u; using UserData = std::array; diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index a956b030..d6cab23d 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -817,11 +817,25 @@ struct PM4CmdDrawIndexIndirect { BitField<0, 16, u32> base_vtx_loc; ///< Offset where the CP will write the ///< BaseVertexLocation it fetched from memory }; - union { // NOTE: this one is undocumented in AMD spec, but Gnm driver writes this field + union { u32 dw3; BitField<0, 16, u32> start_inst_loc; ///< Offset where the CP will write the ///< StartInstanceLocation it fetched from memory }; + + union { + u32 dw4; + struct { + BitField<0, 16, u32> drawIndexLoc; ///< register offset to write the Draw Index count + BitField<30, 1, u32> + countIndirectEnable; ///< Indicates the data structure count is in memory + BitField<31, 1, u32> + drawIndexEnable; ///< Enables writing of Draw Index count to DRAW_INDEX_LOC + }; + }; + u32 count; ///< Count of data structures to loop through before going to next packet + u64 countAddr; ///< DWord aligned Address[31:2]; Valid if countIndirectEnable is set + u32 stride; ///< Stride in memory from one data structure to the next u32 draw_initiator; ///< Draw Initiator Register }; diff --git a/src/video_core/amdgpu/pm4_opcodes.h b/src/video_core/amdgpu/pm4_opcodes.h index 4b853138..ce388d1b 100644 --- a/src/video_core/amdgpu/pm4_opcodes.h +++ b/src/video_core/amdgpu/pm4_opcodes.h @@ -71,6 +71,7 @@ enum class PM4ItOpcode : u32 { IncrementDeCounter = 0x85, WaitOnCeCounter = 0x86, WaitOnDeCounterDiff = 0x88, + DrawIndexIndirectCountMulti = 0x9d, }; } // namespace AmdGpu diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 0edc4228..a16f35ed 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -345,6 +345,7 @@ bool Instance::CreateDevice() { }, vk::PhysicalDeviceVulkan12Features{ .samplerMirrorClampToEdge = vk12_features.samplerMirrorClampToEdge, + .drawIndirectCount = vk12_features.drawIndirectCount, .shaderFloat16 = vk12_features.shaderFloat16, .scalarBlockLayout = vk12_features.scalarBlockLayout, .uniformBufferStandardLayout = vk12_features.uniformBufferStandardLayout, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 6d214a18..27120323 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -115,14 +115,14 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) { } } -void Rasterizer::DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 size) { +void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u32 size, + u32 max_count, VAddr count_address) { RENDERER_TRACE; if (!FilterDraw()) { return; } - const auto cmdbuf = scheduler.CommandBuffer(); const auto& regs = liverpool->regs; const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline(); if (!pipeline) { @@ -142,7 +142,13 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 si buffer_cache.BindVertexBuffers(vs_info); buffer_cache.BindIndexBuffer(is_indexed, 0); - const auto [buffer, base] = buffer_cache.ObtainBuffer(address + offset, size, false); + const auto [buffer, base] = buffer_cache.ObtainBuffer(arg_address + offset, size, false); + + VideoCore::Buffer* count_buffer{}; + u32 count_base{}; + if (count_address != 0) { + std::tie(count_buffer, count_base) = buffer_cache.ObtainBuffer(count_address, 4, false); + } BeginRendering(*pipeline); UpdateDynamicState(*pipeline); @@ -150,10 +156,29 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 si // We can safely ignore both SGPR UD indices and results of fetch shader parsing, as vertex and // instance offsets will be automatically applied by Vulkan from indirect args buffer. + const auto cmdbuf = scheduler.CommandBuffer(); if (is_indexed) { - cmdbuf.drawIndexedIndirect(buffer->Handle(), base, 1, 0); + static_assert(sizeof(VkDrawIndexedIndirectCommand) == + AmdGpu::Liverpool::DrawIndexedIndirectArgsSize); + + if (count_address != 0) { + cmdbuf.drawIndexedIndirectCount(buffer->Handle(), base, count_buffer->Handle(), + count_base, max_count, + AmdGpu::Liverpool::DrawIndexedIndirectArgsSize); + } else { + cmdbuf.drawIndexedIndirect(buffer->Handle(), base, max_count, + AmdGpu::Liverpool::DrawIndexedIndirectArgsSize); + } } else { - cmdbuf.drawIndirect(buffer->Handle(), base, 1, 0); + static_assert(sizeof(VkDrawIndirectCommand) == AmdGpu::Liverpool::DrawIndirectArgsSize); + + if (count_address != 0) { + cmdbuf.drawIndirectCount(buffer->Handle(), base, count_buffer->Handle(), count_base, + max_count, AmdGpu::Liverpool::DrawIndirectArgsSize); + } else { + cmdbuf.drawIndirect(buffer->Handle(), base, max_count, + AmdGpu::Liverpool::DrawIndirectArgsSize); + } } } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 9035ed9d..b6813aec 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -32,7 +32,8 @@ public: } void Draw(bool is_indexed, u32 index_offset = 0); - void DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 size); + void DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u32 size, u32 max_count, + VAddr count_address); void DispatchDirect(); void DispatchIndirect(VAddr address, u32 offset, u32 size);