tu: Add support for suspending and resuming renderpasses

This is unfortunately very complicated because we have to stitch
together the state of the suspended passes after the fact, using
primary command buffers that the driver inserts at submit time.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17378>
Connor Abbott 2022-06-30 17:36:05 +02:00 committed by Marge Bot
parent 0a4c86fc44
commit cb0f414b2a
9 changed files with 895 additions and 107 deletions
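
For context, the Vulkan 1.3 dynamic rendering feature implemented here lets an application split a single render pass across command buffers via the suspend/resume flags. A minimal client-side sketch (the function and its surrounding setup are illustrative, not part of this commit; only the flags and the submission-order requirement matter):

```c
#include <vulkan/vulkan.h>

/* Record one logical render pass split across two command buffers. The
 * attachments etc. in *info are assumed to be identical for both halves,
 * as the spec requires; a middle piece would set both flags at once. */
static void
record_split_render_pass(VkCommandBuffer first, VkCommandBuffer second,
                         VkRenderingInfo *info)
{
   /* First half: suspend instead of ending the pass for real. */
   info->flags = VK_RENDERING_SUSPENDING_BIT;
   vkCmdBeginRendering(first, info);
   /* ... draws ... */
   vkCmdEndRendering(first);

   /* Second half: resume the suspended pass and finish it. */
   info->flags = VK_RENDERING_RESUMING_BIT;
   vkCmdBeginRendering(second, info);
   /* ... more draws ... */
   vkCmdEndRendering(second);

   /* "first" and "second" must later be submitted together and in order,
    * e.g. in a single VkSubmitInfo2; that is the point where turnip has
    * to stitch the two halves back into one actual render pass. */
}
```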


@ -474,7 +474,7 @@ Vulkan 1.2 -- all DONE: anv, vn
Vulkan 1.3 -- all DONE: anv, radv, lvp
VK_KHR_copy_commands2 DONE (anv, lvp, radv, tu, v3dv)
VK_KHR_dynamic_rendering DONE (anv, lvp, radv)
VK_KHR_dynamic_rendering DONE (anv, lvp, radv, tu)
VK_KHR_format_feature_flags2 DONE (anv, radv, tu, v3dv)
VK_KHR_maintenance4 DONE (anv, radv, tu)
VK_KHR_shader_non_semantic_info DONE (anv, radv, tu, v3dv)


@ -39,6 +39,7 @@ libtu_files = files(
'tu_device.c',
'tu_descriptor_set.c',
'tu_descriptor_set.h',
'tu_dynamic_rendering.c',
'tu_formats.c',
'tu_image.c',
'tu_lrz.c',


@ -1526,6 +1526,46 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer);
}
void
tu_cmd_render(struct tu_cmd_buffer *cmd_buffer)
{
if (cmd_buffer->state.rp.has_tess)
tu6_lazy_emit_tessfactor_addr(cmd_buffer);
struct tu_renderpass_result *autotune_result = NULL;
if (use_sysmem_rendering(cmd_buffer, &autotune_result))
tu_cmd_render_sysmem(cmd_buffer, autotune_result);
else
tu_cmd_render_tiles(cmd_buffer, autotune_result);
/* Outside of renderpasses we assume all draw states are disabled. We do
* this outside the draw CS for the normal case where 3d gmem stores aren't
* used.
*/
tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
}
static void
tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
{
/* discard draw_cs and draw_epilogue_cs entries now that the tiles are
rendered */
tu_cs_discard_entries(&cmd_buffer->draw_cs);
tu_cs_begin(&cmd_buffer->draw_cs);
tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
cmd_buffer->state.pass = NULL;
cmd_buffer->state.subpass = NULL;
cmd_buffer->state.framebuffer = NULL;
cmd_buffer->state.attachments = NULL;
memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
/* LRZ is not valid next time we use it */
cmd_buffer->state.lrz.valid = false;
cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ;
}
static VkResult
tu_create_cmd_buffer(struct tu_device *device,
struct tu_cmd_pool *pool,
@ -1570,6 +1610,8 @@ tu_create_cmd_buffer(struct tu_device *device,
tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048);
tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
tu_cs_init(&cmd_buffer->pre_chain.draw_cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->pre_chain.draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
*pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
@ -1586,6 +1628,8 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
tu_cs_finish(&cmd_buffer->tile_store_cs);
tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
tu_cs_finish(&cmd_buffer->sub_cs);
tu_cs_finish(&cmd_buffer->pre_chain.draw_cs);
tu_cs_finish(&cmd_buffer->pre_chain.draw_epilogue_cs);
u_trace_fini(&cmd_buffer->trace);
@ -1614,6 +1658,8 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
tu_cs_reset(&cmd_buffer->tile_store_cs);
tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
tu_cs_reset(&cmd_buffer->sub_cs);
tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
@ -1728,13 +1774,15 @@ tu_cache_init(struct tu_cache_state *cache)
cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
}
VKAPI_ATTR VkResult VKAPI_CALL
tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
const VkCommandBufferBeginInfo *pBeginInfo)
/* Unlike the public entrypoint, this doesn't handle cache tracking or
 * tracking of the CCU state. It's used by the driver to insert its own
 * command buffers in the middle of a submit.
 */
VkResult
tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
VkCommandBufferUsageFlags usage_flags)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
VkResult result = VK_SUCCESS;
if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
/* If the command buffer has already been reset with
* vkResetCommandBuffer, no need to do it again.
@ -1750,12 +1798,25 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
tu_cache_init(&cmd_buffer->state.cache);
tu_cache_init(&cmd_buffer->state.renderpass_cache);
cmd_buffer->usage_flags = pBeginInfo->flags;
cmd_buffer->usage_flags = usage_flags;
tu_cs_begin(&cmd_buffer->cs);
tu_cs_begin(&cmd_buffer->draw_cs);
tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
return VK_SUCCESS;
}
VKAPI_ATTR VkResult VKAPI_CALL
tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
const VkCommandBufferBeginInfo *pBeginInfo)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
VkResult result = tu_cmd_buffer_begin(cmd_buffer, pBeginInfo->flags);
if (result != VK_SUCCESS)
return result;
/* setup initial configuration into command buffer */
if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
switch (cmd_buffer->queue_family_index) {
@ -1805,8 +1866,6 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
}
}
cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
return VK_SUCCESS;
}
@ -3331,7 +3390,7 @@ tu_flush_for_stage(struct tu_cache_state *cache,
}
}
static void
void
tu_render_pass_state_merge(struct tu_render_pass_state *dst,
const struct tu_render_pass_state *src)
{
@ -3346,6 +3405,103 @@ tu_render_pass_state_merge(struct tu_render_pass_state *dst,
src->drawcall_bandwidth_per_sample_sum;
}
void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *suspended)
{
cmd->state.pass = suspended->state.suspended_pass.pass;
cmd->state.subpass = suspended->state.suspended_pass.subpass;
cmd->state.framebuffer = suspended->state.suspended_pass.framebuffer;
cmd->state.attachments = suspended->state.suspended_pass.attachments;
cmd->state.render_area = suspended->state.suspended_pass.render_area;
cmd->state.lrz = suspended->state.suspended_pass.lrz;
}
/* Take the saved pre-chain in "secondary" and copy its commands to "cmd",
* appending it after any saved-up commands in "cmd".
*/
void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary)
{
tu_cs_add_entries(&cmd->draw_cs, &secondary->pre_chain.draw_cs);
tu_cs_add_entries(&cmd->draw_epilogue_cs,
&secondary->pre_chain.draw_epilogue_cs);
tu_render_pass_state_merge(&cmd->state.rp,
&secondary->pre_chain.state);
if (!u_trace_iterator_equal(secondary->pre_chain.trace_renderpass_start,
secondary->pre_chain.trace_renderpass_end)) {
tu_cs_emit_wfi(&cmd->draw_cs);
tu_cs_emit_pkt7(&cmd->draw_cs, CP_WAIT_FOR_ME, 0);
u_trace_clone_append(secondary->pre_chain.trace_renderpass_start,
secondary->pre_chain.trace_renderpass_end,
&cmd->trace, &cmd->draw_cs,
tu_copy_timestamp_buffer);
}
}
/* Take the saved post-chain in "secondary" and copy it to "cmd".
*/
void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary)
{
tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
tu_cs_add_entries(&cmd->draw_epilogue_cs, &secondary->draw_epilogue_cs);
if (!u_trace_iterator_equal(secondary->trace_renderpass_start,
secondary->trace_renderpass_end)) {
tu_cs_emit_wfi(&cmd->draw_cs);
tu_cs_emit_pkt7(&cmd->draw_cs, CP_WAIT_FOR_ME, 0);
u_trace_clone_append(secondary->trace_renderpass_start,
secondary->trace_renderpass_end,
&cmd->trace, &cmd->draw_cs,
tu_copy_timestamp_buffer);
}
cmd->state.rp = secondary->state.rp;
}
/* Assuming "secondary" is just a sequence of suspended and resuming passes,
* copy its state to "cmd". This also works instead of tu_append_post_chain(),
* but it's a bit slower because we don't assume that the chain begins in
* "secondary" and therefore have to care about the command buffer's
* renderpass state.
*/
void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary)
{
tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
tu_cs_add_entries(&cmd->draw_epilogue_cs, &secondary->draw_epilogue_cs);
if (!u_trace_iterator_equal(secondary->trace_renderpass_start,
secondary->trace_renderpass_end)) {
tu_cs_emit_wfi(&cmd->draw_cs);
tu_cs_emit_pkt7(&cmd->draw_cs, CP_WAIT_FOR_ME, 0);
u_trace_clone_append(secondary->trace_renderpass_start,
secondary->trace_renderpass_end,
&cmd->trace, &cmd->draw_cs,
tu_copy_timestamp_buffer);
}
tu_render_pass_state_merge(&cmd->state.rp,
&secondary->state.rp);
}
/* Take the current render pass state and save it to "pre_chain" to be
* combined later.
*/
static void
tu_save_pre_chain(struct tu_cmd_buffer *cmd)
{
tu_cs_add_entries(&cmd->pre_chain.draw_cs,
&cmd->draw_cs);
tu_cs_add_entries(&cmd->pre_chain.draw_epilogue_cs,
&cmd->draw_epilogue_cs);
cmd->pre_chain.trace_renderpass_start =
cmd->trace_renderpass_start;
cmd->pre_chain.trace_renderpass_end =
cmd->trace_renderpass_end;
cmd->pre_chain.state = cmd->state.rp;
}
VKAPI_ATTR void VKAPI_CALL
tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
uint32_t commandBufferCount,
@ -3393,10 +3549,110 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
tu_render_pass_state_merge(&cmd->state.rp, &secondary->state.rp);
} else {
assert(tu_cs_is_empty(&secondary->draw_cs));
assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
switch (secondary->state.suspend_resume) {
case SR_NONE:
assert(tu_cs_is_empty(&secondary->draw_cs));
assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
tu_cs_add_entries(&cmd->cs, &secondary->cs);
break;
tu_cs_add_entries(&cmd->cs, &secondary->cs);
case SR_IN_PRE_CHAIN:
/* cmd may be empty, which means that the chain begins before cmd,
* in which case we have to update its state.
*/
if (cmd->state.suspend_resume == SR_NONE) {
cmd->state.suspend_resume = SR_IN_PRE_CHAIN;
cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
}
/* The secondary is just a continuous suspend/resume chain, so we
* just have to append it to the command buffer.
*/
assert(tu_cs_is_empty(&secondary->cs));
tu_append_pre_post_chain(cmd, secondary);
break;
case SR_AFTER_PRE_CHAIN:
case SR_IN_CHAIN:
case SR_IN_CHAIN_AFTER_PRE_CHAIN:
if (secondary->state.suspend_resume == SR_AFTER_PRE_CHAIN ||
secondary->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN) {
/* In these cases there is a `pre_chain` in the secondary, i.e. a
* chain that ends there, which we need to append to the primary.
*/
if (cmd->state.suspend_resume == SR_NONE)
cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
tu_append_pre_chain(cmd, secondary);
cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
/* We're about to render, so we need to end the command stream
* in case there were any extra commands generated by copying
* the trace.
*/
tu_cs_end(&cmd->draw_cs);
tu_cs_end(&cmd->draw_epilogue_cs);
switch (cmd->state.suspend_resume) {
case SR_NONE:
case SR_IN_PRE_CHAIN:
/* The renderpass chain ends in the secondary but isn't
* started in the primary, so we have to move the state to
* `pre_chain`.
*/
tu_save_pre_chain(cmd);
cmd->state.suspend_resume = SR_AFTER_PRE_CHAIN;
break;
case SR_IN_CHAIN:
case SR_IN_CHAIN_AFTER_PRE_CHAIN:
/* The renderpass ends in the secondary and starts somewhere
* earlier in this primary. Since the last render pass in
* the chain is in the secondary, we are technically outside
* of a render pass. Fix that here by reusing the dynamic
* render pass that was set up for the last suspended render
* pass before the secondary.
*/
tu_restore_suspended_pass(cmd, cmd);
tu_cmd_render(cmd);
if (cmd->state.suspend_resume == SR_IN_CHAIN)
cmd->state.suspend_resume = SR_NONE;
else
cmd->state.suspend_resume = SR_AFTER_PRE_CHAIN;
break;
case SR_AFTER_PRE_CHAIN:
unreachable("resuming render pass is not preceded by suspending one");
}
tu_reset_render_pass(cmd);
}
tu_cs_add_entries(&cmd->cs, &secondary->cs);
if (secondary->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN ||
secondary->state.suspend_resume == SR_IN_CHAIN) {
/* The secondary ends in a "post-chain" (the opposite of a
* pre-chain) that we need to copy into the current command
* buffer.
*/
cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
tu_append_post_chain(cmd, secondary);
cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
cmd->state.suspended_pass = secondary->state.suspended_pass;
switch (cmd->state.suspend_resume) {
case SR_NONE:
cmd->state.suspend_resume = SR_IN_CHAIN;
break;
case SR_AFTER_PRE_CHAIN:
cmd->state.suspend_resume = SR_IN_CHAIN_AFTER_PRE_CHAIN;
break;
default:
unreachable("suspending render pass is followed by a not resuming one");
}
}
}
}
cmd->state.index_size = secondary->state.index_size; /* for restart index update */
@ -3685,12 +3941,65 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
cmd->state.cache.pending_flush_bits;
cmd->state.renderpass_cache.flush_bits = 0;
trace_start_render_pass(&cmd->trace, &cmd->cs);
bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT;
bool suspending = pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT;
cmd->state.suspending = suspending;
cmd->state.resuming = resuming;
cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
/* We can't track LRZ across command buffer boundaries, so we have to
* disable LRZ when resuming/suspending unless we can track it on the GPU.
*/
if ((resuming || suspending) &&
!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
cmd->state.lrz.valid = false;
} else {
if (resuming)
tu_lrz_begin_resumed_renderpass(cmd, clear_values);
else
tu_lrz_begin_renderpass(cmd, clear_values);
}
tu_emit_renderpass_begin(cmd, clear_values);
tu_emit_subpass_begin(cmd);
if (suspending) {
cmd->state.suspended_pass.pass = cmd->state.pass;
cmd->state.suspended_pass.subpass = cmd->state.subpass;
cmd->state.suspended_pass.framebuffer = cmd->state.framebuffer;
cmd->state.suspended_pass.render_area = cmd->state.render_area;
cmd->state.suspended_pass.attachments = cmd->state.attachments;
}
if (!resuming) {
trace_start_render_pass(&cmd->trace, &cmd->cs);
}
if (!resuming || cmd->state.suspend_resume == SR_NONE) {
cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
}
if (!resuming) {
tu_emit_renderpass_begin(cmd, clear_values);
tu_emit_subpass_begin(cmd);
}
if (suspending && !resuming) {
/* entering a chain */
switch (cmd->state.suspend_resume) {
case SR_NONE:
cmd->state.suspend_resume = SR_IN_CHAIN;
break;
case SR_AFTER_PRE_CHAIN:
cmd->state.suspend_resume = SR_IN_CHAIN_AFTER_PRE_CHAIN;
break;
case SR_IN_PRE_CHAIN:
case SR_IN_CHAIN:
case SR_IN_CHAIN_AFTER_PRE_CHAIN:
unreachable("suspending render pass not followed by resuming pass");
break;
}
}
if (resuming && cmd->state.suspend_resume == SR_NONE)
cmd->state.suspend_resume = SR_IN_PRE_CHAIN;
}
VKAPI_ATTR void VKAPI_CALL
@ -4801,60 +5110,25 @@ tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
tu_dispatch(cmd_buffer, &info);
}
static void
tu_end_rendering(struct tu_cmd_buffer *cmd_buffer)
{
tu_cs_end(&cmd_buffer->draw_cs);
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
if (cmd_buffer->state.rp.has_tess)
tu6_lazy_emit_tessfactor_addr(cmd_buffer);
struct tu_renderpass_result *autotune_result = NULL;
if (use_sysmem_rendering(cmd_buffer, &autotune_result))
tu_cmd_render_sysmem(cmd_buffer, autotune_result);
else
tu_cmd_render_tiles(cmd_buffer, autotune_result);
/* Outside of renderpasses we assume all draw states are disabled. We do
* this outside the draw CS for the normal case where 3d gmem stores aren't
* used.
*/
tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
/* discard draw_cs and draw_epilogue_cs entries now that the tiles are
rendered */
tu_cs_discard_entries(&cmd_buffer->draw_cs);
tu_cs_begin(&cmd_buffer->draw_cs);
tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
cmd_buffer->state.pass = NULL;
cmd_buffer->state.subpass = NULL;
cmd_buffer->state.framebuffer = NULL;
cmd_buffer->state.attachments = NULL;
memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
/* LRZ is not valid next time we use it */
cmd_buffer->state.lrz.valid = false;
cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ;
}
VKAPI_ATTR void VKAPI_CALL
tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
const VkSubpassEndInfo *pSubpassEndInfo)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
tu_end_rendering(cmd_buffer);
cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
tu_cs_end(&cmd_buffer->draw_cs);
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
tu_cmd_render(cmd_buffer);
cmd_buffer->state.cache.pending_flush_bits |=
cmd_buffer->state.renderpass_cache.pending_flush_bits;
tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments);
tu_reset_render_pass(cmd_buffer);
}
VKAPI_ATTR void VKAPI_CALL
@ -4862,7 +5136,38 @@ tu_CmdEndRendering(VkCommandBuffer commandBuffer)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
tu_end_rendering(cmd_buffer);
cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
if (cmd_buffer->state.suspending)
cmd_buffer->state.suspended_pass.lrz = cmd_buffer->state.lrz;
if (!cmd_buffer->state.suspending) {
tu_cs_end(&cmd_buffer->draw_cs);
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
if (cmd_buffer->state.suspend_resume == SR_IN_PRE_CHAIN) {
tu_save_pre_chain(cmd_buffer);
} else {
tu_cmd_render(cmd_buffer);
}
tu_reset_render_pass(cmd_buffer);
}
if (cmd_buffer->state.resuming && !cmd_buffer->state.suspending) {
/* exiting suspend/resume chain */
switch (cmd_buffer->state.suspend_resume) {
case SR_IN_CHAIN:
cmd_buffer->state.suspend_resume = SR_NONE;
break;
case SR_IN_PRE_CHAIN:
case SR_IN_CHAIN_AFTER_PRE_CHAIN:
cmd_buffer->state.suspend_resume = SR_AFTER_PRE_CHAIN;
break;
default:
unreachable("suspending render pass not followed by resuming pass");
}
}
}
static void
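
The suspend/resume bookkeeping above amounts to a small state machine over the enum documented in tu_private.h at the end of this commit. A hypothetical standalone model of the transitions applied by tu_CmdBeginRendering()/tu_CmdEndRendering() (sr_begin/sr_end are illustrative names, not driver functions):

```c
#include <assert.h>
#include <stdbool.h>

enum sr_state {
   SR_NONE = 0,
   SR_IN_PRE_CHAIN,
   SR_AFTER_PRE_CHAIN,
   SR_IN_CHAIN,
   SR_IN_CHAIN_AFTER_PRE_CHAIN,
};

/* Transition applied when a dynamic render pass begins. */
static enum sr_state
sr_begin(enum sr_state s, bool suspending, bool resuming)
{
   if (suspending && !resuming) {
      /* Entering a chain: we cannot already be inside one. */
      assert(s == SR_NONE || s == SR_AFTER_PRE_CHAIN);
      return s == SR_NONE ? SR_IN_CHAIN : SR_IN_CHAIN_AFTER_PRE_CHAIN;
   }
   if (resuming && s == SR_NONE)
      return SR_IN_PRE_CHAIN; /* the chain began before this command buffer */
   return s;
}

/* Transition applied when a dynamic render pass ends. */
static enum sr_state
sr_end(enum sr_state s, bool suspending, bool resuming)
{
   if (!resuming || suspending)
      return s; /* still inside the chain, or never in one */

   /* Exiting a chain. */
   switch (s) {
   case SR_IN_CHAIN:
      return SR_NONE;
   case SR_IN_PRE_CHAIN:
   case SR_IN_CHAIN_AFTER_PRE_CHAIN:
      return SR_AFTER_PRE_CHAIN;
   default:
      assert(!"resuming render pass was not preceded by a suspending one");
      return s;
   }
}
```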


@ -180,6 +180,7 @@ get_device_extensions(const struct tu_physical_device *device,
.KHR_zero_initialize_workgroup_memory = true,
.KHR_shader_non_semantic_info = true,
.KHR_synchronization2 = true,
.KHR_dynamic_rendering = true,
#ifndef TU_USE_KGSL
.KHR_timeline_semaphore = true,
#endif
@ -237,6 +238,7 @@ get_device_extensions(const struct tu_physical_device *device,
.VALVE_mutable_descriptor_type = true,
.EXT_image_2d_view_of_3d = true,
.EXT_color_write_enable = true,
.EXT_load_store_op_none = true,
};
}
@ -640,7 +642,7 @@ tu_get_physical_device_features_1_3(struct tu_physical_device *pdevice,
features->synchronization2 = true;
features->textureCompressionASTC_HDR = false;
features->shaderZeroInitializeWorkgroupMemory = true;
features->dynamicRendering = false;
features->dynamicRendering = true;
features->shaderIntegerDotProduct = true;
features->maintenance4 = true;
}
@ -1611,6 +1613,37 @@ tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
tu_cs_emit_qw(cs, bo_to->iova + to_offset * sizeof(uint64_t));
}
/* Special helpers instead of u_trace_begin_iterator()/u_trace_end_iterator()
* that ignore tracepoints at the beginning/end that are part of a
* suspend/resume chain.
*/
static struct u_trace_iterator
tu_cmd_begin_iterator(struct tu_cmd_buffer *cmdbuf)
{
switch (cmdbuf->state.suspend_resume) {
case SR_IN_PRE_CHAIN:
return cmdbuf->trace_renderpass_end;
case SR_AFTER_PRE_CHAIN:
case SR_IN_CHAIN_AFTER_PRE_CHAIN:
return cmdbuf->pre_chain.trace_renderpass_end;
default:
return u_trace_begin_iterator(&cmdbuf->trace);
}
}
static struct u_trace_iterator
tu_cmd_end_iterator(struct tu_cmd_buffer *cmdbuf)
{
switch (cmdbuf->state.suspend_resume) {
case SR_IN_PRE_CHAIN:
return cmdbuf->trace_renderpass_end;
case SR_IN_CHAIN:
case SR_IN_CHAIN_AFTER_PRE_CHAIN:
return cmdbuf->trace_renderpass_start;
default:
return u_trace_end_iterator(&cmdbuf->trace);
}
}
VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
struct u_trace **trace_copy)
@ -1638,8 +1671,8 @@ tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
}
u_trace_init(*trace_copy, cmdbuf->trace.utctx);
u_trace_clone_append(u_trace_begin_iterator(&cmdbuf->trace),
u_trace_end_iterator(&cmdbuf->trace),
u_trace_clone_append(tu_cmd_begin_iterator(cmdbuf),
tu_cmd_end_iterator(cmdbuf),
*trace_copy, *cs,
tu_copy_timestamp_buffer);
@ -1900,6 +1933,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
/* initialize to ones so ffs can be used to find unused slots */
BITSET_ONES(device->custom_border_color);
result = tu_init_dynamic_rendering(device);
if (result != VK_SUCCESS) {
vk_startup_errorf(device->instance, result, "dynamic rendering");
goto fail_dynamic_rendering;
}
struct vk_pipeline_cache_create_info pcc_info = { };
device->mem_cache = vk_pipeline_cache_create(&device->vk, &pcc_info,
false);
@ -2009,6 +2048,8 @@ fail_perfcntrs_pass_entries_alloc:
fail_perfcntrs_pass_alloc:
vk_pipeline_cache_destroy(device->mem_cache, &device->vk.alloc);
fail_pipeline_cache:
tu_destroy_dynamic_rendering(device);
fail_dynamic_rendering:
tu_destroy_clear_blit_shaders(device);
fail_global_bo_map:
tu_bo_finish(device, device->global_bo);
@ -2055,6 +2096,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
tu_destroy_clear_blit_shaders(device);
tu_destroy_dynamic_rendering(device);
ir3_compiler_destroy(device->compiler);
vk_pipeline_cache_destroy(device->mem_cache, &device->vk.alloc);


@ -53,10 +53,12 @@ struct tu_queue_submit
struct vk_queue_submit *vk_submit;
struct tu_u_trace_submission_data *u_trace_submission_data;
struct tu_cmd_buffer **cmd_buffers;
struct drm_msm_gem_submit_cmd *cmds;
struct drm_msm_gem_submit_syncobj *in_syncobjs;
struct drm_msm_gem_submit_syncobj *out_syncobjs;
uint32_t nr_cmd_buffers;
uint32_t nr_in_syncobjs;
uint32_t nr_out_syncobjs;
uint32_t entry_count;
@ -833,11 +835,17 @@ tu_queue_submit_create_locked(struct tu_queue *queue,
bool has_trace_points = false;
struct vk_command_buffer **vk_cmd_buffers = vk_submit->command_buffers;
struct tu_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers;
memset(new_submit, 0, sizeof(struct tu_queue_submit));
new_submit->cmd_buffers = (void *)vk_cmd_buffers;
new_submit->nr_cmd_buffers = vk_submit->command_buffer_count;
tu_insert_dynamic_cmdbufs(queue->device, &new_submit->cmd_buffers,
&new_submit->nr_cmd_buffers);
uint32_t entry_count = 0;
for (uint32_t j = 0; j < vk_submit->command_buffer_count; ++j) {
struct tu_cmd_buffer *cmdbuf = cmd_buffers[j];
for (uint32_t j = 0; j < new_submit->nr_cmd_buffers; ++j) {
struct tu_cmd_buffer *cmdbuf = new_submit->cmd_buffers[j];
if (perf_pass_index != ~0)
entry_count++;
@ -852,11 +860,8 @@ tu_queue_submit_create_locked(struct tu_queue *queue,
}
}
memset(new_submit, 0, sizeof(struct tu_queue_submit));
new_submit->autotune_fence =
tu_autotune_submit_requires_fence(cmd_buffers, vk_submit->command_buffer_count);
tu_autotune_submit_requires_fence(new_submit->cmd_buffers, new_submit->nr_cmd_buffers);
if (new_submit->autotune_fence)
entry_count++;
@ -872,8 +877,8 @@ tu_queue_submit_create_locked(struct tu_queue *queue,
if (has_trace_points) {
result =
tu_u_trace_submission_data_create(
queue->device, cmd_buffers,
vk_submit->command_buffer_count,
queue->device, new_submit->cmd_buffers,
new_submit->nr_cmd_buffers,
&new_submit->u_trace_submission_data);
if (result != VK_SUCCESS) {
@ -927,6 +932,8 @@ tu_queue_submit_finish(struct tu_queue *queue, struct tu_queue_submit *submit)
vk_free(&queue->device->vk.alloc, submit->cmds);
vk_free(&queue->device->vk.alloc, submit->in_syncobjs);
vk_free(&queue->device->vk.alloc, submit->out_syncobjs);
if (submit->cmd_buffers != (void *) submit->vk_submit->command_buffers)
vk_free(&queue->device->vk.alloc, submit->cmd_buffers);
}
static void
@ -951,13 +958,10 @@ tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
struct tu_device *dev = queue->device;
struct drm_msm_gem_submit_cmd *cmds = submit->cmds;
struct vk_command_buffer **vk_cmd_buffers = submit->vk_submit->command_buffers;
struct tu_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers;
uint32_t entry_idx = 0;
for (uint32_t j = 0; j < submit->vk_submit->command_buffer_count; ++j) {
for (uint32_t j = 0; j < submit->nr_cmd_buffers; ++j) {
struct tu_device *dev = queue->device;
struct tu_cmd_buffer *cmdbuf = cmd_buffers[j];
struct tu_cmd_buffer *cmdbuf = submit->cmd_buffers[j];
struct tu_cs *cs = &cmdbuf->cs;
if (submit->perf_pass_index != ~0) {
@ -996,11 +1000,10 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
struct tu_cs *autotune_cs = NULL;
if (submit->autotune_fence) {
struct tu_cmd_buffer **cmd_buffers = (void *)submit->vk_submit->command_buffers;
autotune_cs = tu_autotune_on_submit(queue->device,
&queue->device->autotune,
cmd_buffers,
submit->vk_submit->command_buffer_count);
submit->cmd_buffers,
submit->nr_cmd_buffers);
}
uint32_t flags = MSM_PIPE_3D0;
@ -1062,7 +1065,7 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
submit->u_trace_submission_data = NULL;
for (uint32_t i = 0; i < submit->vk_submit->command_buffer_count; i++) {
for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) {
bool free_data = i == submission_data->last_buffer_with_tracepoints;
if (submission_data->cmd_trace_data[i].trace)
u_trace_flush(submission_data->cmd_trace_data[i].trace,


@ -0,0 +1,237 @@
/*
* Copyright © 2022 Valve Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/* When using dynamic rendering with the suspend/resume functionality, we
* sometimes need to merge together multiple suspended render passes
* dynamically at submit time. This involves combining all the saved-up IBs,
* emitting the rendering commands usually emitted by
* CmdEndRenderPass()/CmdEndRendering(), and inserting them in between the
* user command buffers. This gets tricky, because the same command buffer can
* be submitted multiple times, each time with a different set of other
* command buffers, and with VK_COMMAND_BUFFER_SIMULTANEOUS_USE_BIT, this can
* happen before the previous submission of the same command buffer has
* finished. At some point we have to free these commands and the BOs they are
* contained in, and we can't do that when resubmitting the last command
* buffer in the sequence because it may still be in use. This means we have
* to make the commands owned by the device and roll our own memory tracking.
*/
#include "tu_private.h"
#include "tu_cs.h"
struct dynamic_rendering_entry {
struct tu_cmd_buffer *cmd_buffer;
uint32_t fence; /* The fence value when cmd_buffer becomes available */
};
static VkResult
get_cmd_buffer(struct tu_device *dev, struct tu_cmd_buffer **cmd_buffer_out)
{
struct tu6_global *global = dev->global_bo->map;
/* Note: because QueueSubmit is serialized, we don't need any locks here.
*/
uint32_t fence = global->dynamic_rendering_fence;
/* Go through the entries and return the finished ones to the pool,
* shrinking the array of pending entries.
*/
struct dynamic_rendering_entry *new_entry =
util_dynarray_begin(&dev->dynamic_rendering_pending);
uint32_t entries = 0;
util_dynarray_foreach(&dev->dynamic_rendering_pending,
struct dynamic_rendering_entry, entry) {
if (entry->fence <= fence) {
VkCommandBuffer vk_buf = tu_cmd_buffer_to_handle(entry->cmd_buffer);
tu_FreeCommandBuffers(tu_device_to_handle(dev),
dev->dynamic_rendering_pool, 1, &vk_buf);
} else {
*new_entry = *entry;
new_entry++;
entries++;
}
}
UNUSED void *dummy =
util_dynarray_resize(&dev->dynamic_rendering_pending,
struct dynamic_rendering_entry, entries);
VkCommandBuffer vk_buf;
const VkCommandBufferAllocateInfo info = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
.pNext = NULL,
.commandPool = dev->dynamic_rendering_pool,
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
.commandBufferCount = 1,
};
VkResult result =
tu_AllocateCommandBuffers(tu_device_to_handle(dev), &info, &vk_buf);
if (result != VK_SUCCESS)
return result;
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, vk_buf);
struct dynamic_rendering_entry entry = {
.cmd_buffer = cmd_buffer,
.fence = ++dev->dynamic_rendering_fence,
};
util_dynarray_append(&dev->dynamic_rendering_pending,
struct dynamic_rendering_entry, entry);
*cmd_buffer_out = cmd_buffer;
return VK_SUCCESS;
}
VkResult
tu_init_dynamic_rendering(struct tu_device *dev)
{
util_dynarray_init(&dev->dynamic_rendering_pending, NULL);
dev->dynamic_rendering_fence = 0;
return tu_CreateCommandPool(tu_device_to_handle(dev),
&(VkCommandPoolCreateInfo) {
.pNext = NULL,
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.flags = 0,
.queueFamilyIndex = 0,
}, &dev->vk.alloc, &dev->dynamic_rendering_pool);
}
void
tu_destroy_dynamic_rendering(struct tu_device *dev)
{
tu_DestroyCommandPool(tu_device_to_handle(dev),
dev->dynamic_rendering_pool,
&dev->vk.alloc);
util_dynarray_fini(&dev->dynamic_rendering_pending);
}
VkResult
tu_insert_dynamic_cmdbufs(struct tu_device *dev,
struct tu_cmd_buffer ***cmds_ptr,
uint32_t *size)
{
struct tu_cmd_buffer **old_cmds = *cmds_ptr;
bool has_dynamic = false;
for (unsigned i = 0; i < *size; i++) {
if (old_cmds[i]->state.suspend_resume != SR_NONE) {
has_dynamic = true;
break;
}
}
if (!has_dynamic)
return VK_SUCCESS;
struct util_dynarray cmds = {0};
struct tu_cmd_buffer *cmd_buffer = NULL;
for (unsigned i = 0; i < *size; i++) {
switch (old_cmds[i]->state.suspend_resume) {
case SR_NONE:
case SR_IN_CHAIN:
case SR_IN_PRE_CHAIN:
break;
case SR_AFTER_PRE_CHAIN:
case SR_IN_CHAIN_AFTER_PRE_CHAIN:
tu_append_pre_chain(cmd_buffer, old_cmds[i]);
if (!(old_cmds[i]->usage_flags &
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) {
u_trace_disable_event_range(old_cmds[i]->pre_chain.trace_renderpass_start,
old_cmds[i]->pre_chain.trace_renderpass_end);
}
tu_cmd_render(cmd_buffer);
tu_cs_emit_pkt7(&cmd_buffer->cs, CP_MEM_WRITE, 3);
tu_cs_emit_qw(&cmd_buffer->cs,
global_iova(cmd_buffer, dynamic_rendering_fence));
tu_cs_emit(&cmd_buffer->cs, dev->dynamic_rendering_fence);
tu_EndCommandBuffer(tu_cmd_buffer_to_handle(cmd_buffer));
util_dynarray_append(&cmds, struct tu_cmd_buffer *, cmd_buffer);
cmd_buffer = NULL;
break;
}
util_dynarray_append(&cmds, struct tu_cmd_buffer *, old_cmds[i]);
switch (old_cmds[i]->state.suspend_resume) {
case SR_NONE:
case SR_AFTER_PRE_CHAIN:
break;
case SR_IN_CHAIN:
case SR_IN_CHAIN_AFTER_PRE_CHAIN: {
assert(!cmd_buffer);
VkResult result = get_cmd_buffer(dev, &cmd_buffer);
if (result != VK_SUCCESS)
return result;
tu_cmd_buffer_begin(cmd_buffer,
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
/* Set up the render pass using the first command buffer involved in
* the chain, so that it will look like we're inside a render pass
* for tu_cmd_render().
*/
tu_restore_suspended_pass(cmd_buffer, old_cmds[i]);
FALLTHROUGH;
}
case SR_IN_PRE_CHAIN:
assert(cmd_buffer);
tu_append_pre_post_chain(cmd_buffer, old_cmds[i]);
if (old_cmds[i]->usage_flags &
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
u_trace_disable_event_range(old_cmds[i]->trace_renderpass_start,
old_cmds[i]->trace_renderpass_end);
}
/* When the command buffer is finally recorded, we need its state
* to be the state of the command buffer before it. We need this
* because we skip tu6_emit_hw().
*/
cmd_buffer->state.ccu_state = old_cmds[i]->state.ccu_state;
cmd_buffer->vsc_draw_strm_pitch = old_cmds[i]->vsc_draw_strm_pitch;
cmd_buffer->vsc_prim_strm_pitch = old_cmds[i]->vsc_prim_strm_pitch;
break;
}
}
struct tu_cmd_buffer **new_cmds =
vk_alloc(&dev->vk.alloc, cmds.size, alignof(struct tu_cmd_buffer *),
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!new_cmds)
return VK_ERROR_OUT_OF_HOST_MEMORY;
memcpy(new_cmds, cmds.data, cmds.size);
*cmds_ptr = new_cmds;
*size = util_dynarray_num_elements(&cmds, struct tu_cmd_buffer *);
util_dynarray_fini(&cmds);
return VK_SUCCESS;
}
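
The recycling scheme in get_cmd_buffer() above reduces to one fence comparison: the GPU bumps dynamic_rendering_fence in the global BO via the CP_MEM_WRITE emitted in tu_insert_dynamic_cmdbufs(), and the CPU reclaims a pending command buffer once the fence shows its submission has completed. A hypothetical standalone sketch (pending_entry and entry_is_idle are illustrative names; the real list lives in dev->dynamic_rendering_pending):

```c
#include <stdbool.h>
#include <stdint.h>

/* "cmd" stands in for struct tu_cmd_buffer; only the fence matters here. */
struct pending_entry {
   void    *cmd;
   uint32_t fence;  /* value of the GPU fence once this submission is done */
};

/* Because QueueSubmit is serialized, reading the volatile fence value from
 * the global BO needs no extra locking. */
static bool
entry_is_idle(const struct pending_entry *e,
              const volatile uint32_t *dynamic_rendering_fence)
{
   return e->fence <= *dynamic_rendering_fence;
}
```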


@ -367,6 +367,9 @@ tu_QueueSubmit2(VkQueue _queue,
tu_dbg_log_gmem_load_store_skips(queue->device);
}
struct tu_cmd_buffer **submit_cmd_buffers[submitCount];
uint32_t submit_cmd_buffer_count[submitCount];
uint32_t max_entry_count = 0;
for (uint32_t i = 0; i < submitCount; ++i) {
const VkSubmitInfo2 *submit = pSubmits + i;
@ -375,17 +378,34 @@ tu_QueueSubmit2(VkQueue _queue,
vk_find_struct_const(pSubmits[i].pNext,
PERFORMANCE_QUERY_SUBMIT_INFO_KHR);
uint32_t entry_count = 0;
struct tu_cmd_buffer *cmd_buffers[submit->commandBufferInfoCount];
for (uint32_t j = 0; j < submit->commandBufferInfoCount; ++j) {
struct tu_cmd_buffer *old_cmd_buffers[submit->commandBufferInfoCount];
uint32_t cmdbuf_count = submit->commandBufferInfoCount;
for (uint32_t j = 0; j < cmdbuf_count; ++j) {
TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBufferInfos[j].commandBuffer);
cmd_buffers[j] = cmdbuf;
entry_count += cmdbuf->cs.entry_count;
old_cmd_buffers[j] = cmdbuf;
}
struct tu_cmd_buffer **cmd_buffers = old_cmd_buffers;
tu_insert_dynamic_cmdbufs(queue->device, &cmd_buffers, &cmdbuf_count);
if (cmd_buffers == old_cmd_buffers) {
cmd_buffers =
vk_alloc(&queue->device->vk.alloc,
sizeof(*cmd_buffers) * cmdbuf_count, 8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
memcpy(cmd_buffers, old_cmd_buffers,
sizeof(*cmd_buffers) * cmdbuf_count);
}
submit_cmd_buffers[i] = cmd_buffers;
submit_cmd_buffer_count[i] = cmdbuf_count;
uint32_t entry_count = 0;
for (uint32_t j = 0; j < cmdbuf_count; ++j) {
entry_count += cmd_buffers[j]->cs.entry_count;
if (perf_info)
entry_count++;
}
if (tu_autotune_submit_requires_fence(cmd_buffers, submit->commandBufferInfoCount))
if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count))
entry_count++;
max_entry_count = MAX2(max_entry_count, entry_count);
@ -406,10 +426,10 @@ tu_QueueSubmit2(VkQueue _queue,
PERFORMANCE_QUERY_SUBMIT_INFO_KHR);
struct tu_cmd_buffer *cmd_buffers[submit->commandBufferInfoCount];
for (uint32_t j = 0; j < submit->commandBufferInfoCount; j++) {
TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBufferInfos[j].commandBuffer);
cmd_buffers[j] = cmdbuf;
struct tu_cmd_buffer **cmd_buffers = submit_cmd_buffers[i];
uint32_t cmdbuf_count = submit_cmd_buffer_count[i];
for (uint32_t j = 0; j < cmdbuf_count; j++) {
struct tu_cmd_buffer *cmdbuf = cmd_buffers[j];
struct tu_cs *cs = &cmdbuf->cs;
if (perf_info) {
@ -436,12 +456,12 @@ tu_QueueSubmit2(VkQueue _queue,
}
}
if (tu_autotune_submit_requires_fence(cmd_buffers, submit->commandBufferInfoCount)) {
if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
struct tu_cs *autotune_cs =
tu_autotune_on_submit(queue->device,
&queue->device->autotune,
cmd_buffers,
submit->commandBufferInfoCount);
cmdbuf_count);
cmds[entry_idx++] = (struct kgsl_command_object) {
.offset = autotune_cs->entries[0].offset,
.gpuaddr = autotune_cs->entries[0].bo->iova,


@ -275,6 +275,32 @@ tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
cmd->state.lrz.reuse_previous_state = false;
}
/* This is generally the same as tu_lrz_begin_renderpass(), but we skip
* actually emitting anything. The LRZ state needs to be consistent between
* renderpasses, but only the first one should actually emit commands to
* disable LRZ, etc.
*/
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd,
const VkClearValue *clear_values)
{
/* Track LRZ valid state */
memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
if (a != VK_ATTACHMENT_UNUSED) {
const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
VkClearValue clear = clear_values[a];
cmd->state.lrz.depth_clear_value = clear;
cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
(clear.depthStencil.depth == 0.f ||
clear.depthStencil.depth == 1.f);
}
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
}
}
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
const VkClearValue *clear_values)
@ -304,20 +330,7 @@ tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
}
/* Track LRZ valid state */
memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
if (a != VK_ATTACHMENT_UNUSED) {
const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
VkClearValue clear = clear_values[a];
cmd->state.lrz.depth_clear_value = clear;
cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
(clear.depthStencil.depth == 0.f ||
clear.depthStencil.depth == 1.f);
}
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
}
tu_lrz_begin_resumed_renderpass(cmd, clear_values);
if (!cmd->state.lrz.valid) {
tu6_emit_lrz_buffer(&cmd->cs, NULL);
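
The clear-value gating in the clear_mask branch above is worth stating on its own; a minimal sketch (lrz_fast_clear_allowed is an illustrative name, not driver code):

```c
#include <stdbool.h>

/* The LRZ fast-clear path survives only when the depth attachment is
 * cleared to exactly 0.0 or 1.0; any other clear value forces the slow
 * path, mirroring the fast_clear update above. */
static bool
lrz_fast_clear_allowed(bool fast_clear, float clear_depth)
{
   return fast_clear && (clear_depth == 0.0f || clear_depth == 1.0f);
}
```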


@ -489,6 +489,9 @@ struct tu6_global
/* To know when renderpass stats for autotune are valid */
volatile uint32_t autotune_fence;
/* For recycling command buffers for dynamic suspend/resume commands */
volatile uint32_t dynamic_rendering_fence;
volatile uint32_t dbg_one;
volatile uint32_t dbg_gmem_total_loads;
volatile uint32_t dbg_gmem_taken_loads;
@ -593,6 +596,10 @@ struct tu_device
struct tu_cs *perfcntrs_pass_cs;
struct tu_cs_entry *perfcntrs_pass_cs_entries;
struct util_dynarray dynamic_rendering_pending;
VkCommandPool dynamic_rendering_pool;
uint32_t dynamic_rendering_fence;
/* Condition variable for timeline semaphore to notify waiters when a
* new submit is executed. */
pthread_cond_t timeline_cond;
@ -624,6 +631,14 @@ void tu_init_clear_blit_shaders(struct tu_device *dev);
void tu_destroy_clear_blit_shaders(struct tu_device *dev);
VkResult tu_init_dynamic_rendering(struct tu_device *dev);
void tu_destroy_dynamic_rendering(struct tu_device *dev);
VkResult tu_insert_dynamic_cmdbufs(struct tu_device *dev,
struct tu_cmd_buffer ***cmds_ptr,
uint32_t *size);
VkResult
tu_device_submit_deferred_locked(struct tu_device *dev);
@ -1327,6 +1342,8 @@ struct tu_render_pass_state
uint32_t drawcall_bandwidth_per_sample_sum;
};
void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
const struct tu_render_pass_state *src);
struct tu_cmd_state
{
uint32_t dirty;
@ -1403,6 +1420,22 @@ struct tu_cmd_state
const struct tu_image_view **attachments;
/* State that in the dynamic case comes from VkRenderingInfo and needs to
* be saved/restored when suspending. This holds the state for the last
* suspended renderpass, which may point to this command buffer's dynamic_*
* state or to another command buffer's if the pass was recorded in a secondary.
*/
struct {
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
VkRect2D render_area;
const struct tu_image_view **attachments;
struct tu_lrz_state lrz;
} suspended_pass;
bool tessfactor_addr_set;
bool predication_active;
enum a5xx_line_mode line_mode;
@ -1416,6 +1449,97 @@ struct tu_cmd_state
bool prim_generated_query_running_before_rp;
/* These are the states of the suspend/resume state machine. In addition to
* tracking whether we're in the middle of a chain of suspending and
* resuming passes that will be merged, we need to track whether the
* command buffer begins in the middle of such a chain, for when it gets
* merged with other command buffers. We call such a chain that begins
* before the command buffer starts a "pre-chain".
*
* Note that when this command buffer is finished, this state is untouched
* but it gains a different meaning. For example, if we finish in state
* SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
* there's a suspend/resume chain that extends past the end of the command
* buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
* means that there's a suspend/resume chain that extends before the
* beginning.
*/
enum {
/* Either there are no suspend/resume chains, or they are entirely
* contained in the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* // we are here
*/
SR_NONE = 0,
/* We are in the middle of a suspend/resume chain that starts before the
* current command buffer. This happens when the command buffer begins
* with a resuming render pass and all of the passes up to the current
* one are suspending. In this state, our part of the chain is not saved
* and is in the current draw_cs/state.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_PRE_CHAIN,
/* We are currently outside of any suspend/resume chains, but there is a
* chain starting before the current command buffer. It is saved in
* pre_chain.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* // we are here
*/
SR_AFTER_PRE_CHAIN,
/* We are in the middle of a suspend/resume chain and there is no chain
* starting before the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN,
/* We are in the middle of a suspend/resume chain and there is another,
* separate, chain starting before the current command buffer.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN_AFTER_PRE_CHAIN,
} suspend_resume;
bool suspending, resuming;
struct tu_lrz_state lrz;
struct tu_draw_state lrz_and_depth_plane_state;
@ -1487,6 +1611,24 @@ struct tu_cmd_buffer
struct tu_cs draw_epilogue_cs;
struct tu_cs sub_cs;
/* If the first render pass in the command buffer is resuming, then it is
* part of a suspend/resume chain that starts before the current command
* buffer and needs to be merged later. In this case, its incomplete state
* is stored in pre_chain. In the symmetric case where the last render pass
* is suspending, we just skip ending the render pass and its state is
* stored in draw_cs/the current state. The first and last render pass
* might be part of different chains, which is why all the state may need
* to be saved separately here.
*/
struct {
struct tu_cs draw_cs;
struct tu_cs draw_epilogue_cs;
struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;
struct tu_render_pass_state state;
} pre_chain;
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
};
@ -1504,6 +1646,8 @@ struct tu_reg_value {
uint32_t bo_shift;
};
VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
VkCommandBufferUsageFlags usage_flags);
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs);
@ -1521,6 +1665,24 @@ void tu_setup_dynamic_inheritance(struct tu_cmd_buffer *cmd_buffer,
void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
const VkRenderingInfo *pRenderingInfo);
void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *suspended);
void tu_cmd_render(struct tu_cmd_buffer *cmd);
void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
@ -1756,6 +1918,10 @@ void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
const VkClearValue *clear_values);
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd,
const VkClearValue *clear_values);
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd);