From 29f264f25804eeea962f21c29c39050c3fc1663d Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Thu, 1 Jul 2021 13:32:05 +0200
Subject: [PATCH] ac,radv: implement the cs_regalloc_hang HW bug workaround
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Might fix spurious failures on GFX6 and some GFX7 chips.

Signed-off-by: Samuel Pitoiset
Reviewed-by: Marek Olšák
Reviewed-by: Bas Nieuwenhuizen
Part-of:
---
 src/amd/common/ac_gpu_info.c              | 12 ++++++++++++
 src/amd/common/ac_gpu_info.h              |  1 +
 src/amd/vulkan/radv_cmd_buffer.c          |  9 +++++++++
 src/gallium/drivers/radeonsi/si_compute.c | 12 +++---------
 4 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index c0c03de389b..63c2058862b 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -906,6 +906,18 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
    info->has_vgt_flush_ngg_legacy_bug = info->chip_class == GFX10 ||
                                         info->family == CHIP_SIENNA_CICHLID;
 
+   /* HW bug workaround when CS threadgroups > 256 threads and async compute
+    * isn't used, i.e. only one compute job can run at a time. If async
+    * compute is possible, the threadgroup size must be limited to 256 threads
+    * on all queues to avoid the bug.
+    * Only GFX6 and certain GFX7 chips are affected.
+    *
+    * FIXME: RADV doesn't limit the number of threads for async compute.
+    */
+   info->has_cs_regalloc_hang_bug = info->chip_class == GFX6 ||
+                                    info->family == CHIP_BONAIRE ||
+                                    info->family == CHIP_KABINI;
+
    /* Support for GFX10.3 was added with F32_ME_FEATURE_VERSION_31 but the
     * feature version wasn't bumped.
     */
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index e487ff040c1..543051fb8c7 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -79,6 +79,7 @@ struct radeon_info {
    bool has_image_load_dcc_bug;
    bool has_two_planes_iterate256_bug;
    bool has_vgt_flush_ngg_legacy_bug;
+   bool has_cs_regalloc_hang_bug;
    bool has_32bit_predication;
    bool has_3d_cube_border_color_mipmap;
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 5f28c6f85f5..be9ccaa14da 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -6134,6 +6134,12 @@ radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_inf
 {
    bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
    bool pipeline_is_dirty = pipeline && pipeline != cmd_buffer->state.emitted_compute_pipeline;
+   bool cs_regalloc_hang = cmd_buffer->device->physical_device->rad_info.has_cs_regalloc_hang_bug &&
+                           info->blocks[0] * info->blocks[1] * info->blocks[2] > 256;
+
+   if (cs_regalloc_hang)
+      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+                                      RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
 
    if (cmd_buffer->state.flush_bits &
        (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
@@ -6190,6 +6196,9 @@ radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_inf
                                               : VK_PIPELINE_BIND_POINT_COMPUTE);
    }
 
+   if (cs_regalloc_hang)
+      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
+
    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 136674e2d11..4a4e40333e0 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -890,18 +890,12 @@ static bool si_check_needs_implicit_sync(struct si_context *sctx)
 static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info)
 {
    struct si_context *sctx = (struct si_context *)ctx;
+   struct si_screen *sscreen = sctx->screen;
    struct si_compute *program = sctx->cs_shader_state.program;
    const amd_kernel_code_t *code_object = si_compute_get_code_object(program, info->pc);
    int i;
-   /* HW bug workaround when CS threadgroups > 256 threads and async
-    * compute isn't used, i.e. only one compute job can run at a time.
-    * If async compute is possible, the threadgroup size must be limited
-    * to 256 threads on all queues to avoid the bug.
-    * Only GFX6 and certain GFX7 chips are affected.
-    */
-   bool cs_regalloc_hang =
-      (sctx->chip_class == GFX6 || sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KABINI) &&
-      info->block[0] * info->block[1] * info->block[2] > 256;
+   bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug &&
+                           info->block[0] * info->block[1] * info->block[2] > 256;
 
    if (cs_regalloc_hang)
       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
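
Note (not part of the patch): the standalone sketch below restates the gating
logic the patch centralizes in ac_gpu_info and then reuses per dispatch in both
RADV and radeonsi. The enum members, struct layout and helper names here
(radeon_info_sketch, set_cs_regalloc_hang_bug, needs_cs_regalloc_workaround)
are illustrative stand-ins, not Mesa code.

/* Sketch only: mirrors the workaround gating in the patch above. */
#include <stdbool.h>

enum chip_class { GFX6, GFX7, GFX_OTHER };
enum radeon_family { CHIP_BONAIRE, CHIP_KABINI, CHIP_OTHER };

struct radeon_info_sketch {
   enum chip_class chip_class;
   enum radeon_family family;
   bool has_cs_regalloc_hang_bug; /* computed once when the device info is queried */
};

/* Done once (the patch does this in ac_query_gpu_info()): only GFX6 and
 * certain GFX7 chips (Bonaire, Kabini) are affected. */
static void set_cs_regalloc_hang_bug(struct radeon_info_sketch *info)
{
   info->has_cs_regalloc_hang_bug = info->chip_class == GFX6 ||
                                    info->family == CHIP_BONAIRE ||
                                    info->family == CHIP_KABINI;
}

/* Per dispatch: the workaround is only needed when the threadgroup exceeds
 * 256 threads. */
static bool needs_cs_regalloc_workaround(const struct radeon_info_sketch *info,
                                         const unsigned block[3])
{
   return info->has_cs_regalloc_hang_bug &&
          block[0] * block[1] * block[2] > 256;
}

When the predicate is true, both drivers wrap the dispatch in partial flushes
(PS+CS partial flush before, CS partial flush after), so the oversized
threadgroup never runs concurrently with other work.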