From 0d5c5f81a60701f15af32953b8aedeb201bc3f05 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Wed, 21 Jan 2026 22:49:35 +0200 Subject: [PATCH] video_core: Small readback optimization (#3941) * pm4_cmds: Handle nop packet overflow * liverpool: Detect DispatchDirect patches and promote to DispatchIndirect * clang.. * log removed --------- Co-authored-by: georgemoralis --- src/video_core/amdgpu/liverpool.cpp | 27 +++++++++++++++++++++++++-- src/video_core/amdgpu/pm4_cmds.h | 2 +- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 3f307c51b..32ea1e8ed 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -830,7 +830,14 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, u32 vqid) { FIBER_ENTER(acb_task_name[vqid]); auto& queue = asc_queues[{vqid}]; + struct IndirectPatch { + const PM4Header* header; + VAddr indirect_addr; + }; + boost::container::small_vector indirect_patches; + auto base_addr = reinterpret_cast(acb.data()); + size_t acb_size = acb.size_bytes(); while (!acb.empty()) { ProcessCommands(); @@ -919,8 +926,18 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, u32 vqid) { dma_data->src_sel == DmaDataSrc::MemoryUsingL2) && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { - rasterizer->CopyBuffer(dma_data->DstAddress(), dma_data->SrcAddress(), - dma_data->NumBytes(), false, false); + const u32 num_bytes = dma_data->NumBytes(); + const VAddr src_addr = dma_data->SrcAddress(); + const VAddr dst_addr = dma_data->DstAddress(); + const PM4Header* header = + reinterpret_cast(dst_addr - sizeof(PM4Header)); + if (dst_addr >= base_addr && dst_addr < base_addr + acb_size && + num_bytes == sizeof(PM4CmdDispatchIndirect::GroupDimensions) && + header->type == 3 && header->type3.opcode == PM4ItOpcode::DispatchDirect) { + indirect_patches.emplace_back(header, src_addr); + } else { + rasterizer->CopyBuffer(dst_addr, src_addr, num_bytes, false, false); + } } else { UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}", u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value())); @@ -964,6 +981,12 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, u32 vqid) { } case PM4ItOpcode::DispatchDirect: { const auto* dispatch_direct = reinterpret_cast(header); + if (auto it = std::ranges::find(indirect_patches, header, &IndirectPatch::header); + it != indirect_patches.end()) { + const auto size = sizeof(PM4CmdDispatchIndirect::GroupDimensions); + rasterizer->DispatchIndirect(it->indirect_addr, 0, size); + break; + } auto& cs_program = GetCsRegs(); cs_program.dim_x = dispatch_direct->dim_x; cs_program.dim_y = dispatch_direct->dim_y; diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index eb48f3568..46ecb09d6 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -50,7 +50,7 @@ union PM4Type3Header { } u32 NumWords() const { - return count + 1; + return (count + 1) & 0x3fff; } u32 raw;