From e0353296daa4e7f65ccb1f97b6baa16755422c16 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Thu, 19 Aug 2021 09:04:46 +0200 Subject: [PATCH] radv: allocate shaders to 32-bit address to skip PGM_HI This reduces the number of emitted registers. Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen Part-of: --- src/amd/vulkan/radv_pipeline.c | 29 ++++++++--------------------- src/amd/vulkan/radv_shader.c | 2 +- src/amd/vulkan/si_cmd_buffer.c | 20 ++++++++++++++++++++ 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 4bf7a02b243..ce211086d9d 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -4420,9 +4420,7 @@ radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs, const struct radv_pipelin uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; uint32_t rsrc2 = shader->config.rsrc2; - radeon_set_sh_reg_seq(cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B524_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); rsrc2 |= S_00B52C_LDS_SIZE(num_lds_blocks); if (pipeline->device->physical_device->rad_info.chip_class == GFX7 && @@ -4447,9 +4445,8 @@ radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf : pipeline->shaders[MESA_SHADER_VERTEX]; const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info; - radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B324_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2); radeon_emit(cs, shader->config.rsrc1); radeon_emit(cs, shader->config.rsrc2); @@ -4592,13 +4589,9 @@ radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs, const struct radv_pipelin if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) { if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { - radeon_set_sh_reg_seq(cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B524_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); } else { - radeon_set_sh_reg_seq(cs, R_00B410_SPI_SHADER_PGM_LO_LS, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B414_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); } radeon_set_sh_reg_seq(cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, 2); @@ -4793,13 +4786,9 @@ radv_pipeline_generate_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) { if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { - radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B324_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); } else { - radeon_set_sh_reg_seq(cs, R_00B210_SPI_SHADER_PGM_LO_ES, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B214_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); } radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2); @@ -5576,9 +5565,7 @@ radv_pipeline_generate_hw_cs(struct radeon_cmdbuf *cs, const struct radv_pipelin uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; struct radv_device *device = pipeline->device; - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B834_DATA(va >> 40)); + radeon_set_sh_reg(cs, R_00B830_COMPUTE_PGM_LO, va >> 8); radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); radeon_emit(cs, shader->config.rsrc1); diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 82dade3dee4..f38ca7c8f45 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -1059,7 +1059,7 @@ radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant slab->size = MAX2(256 * 1024, shader->code_size); VkResult result = device->ws->buffer_create( device->ws, slab->size, 256, RADEON_DOMAIN_VRAM, - RADEON_FLAG_NO_INTERPROCESS_SHARING | + RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT | (device->physical_device->rad_info.cpdma_prefetch_writes_memory ? 0 : RADEON_FLAG_READ_ONLY), RADV_BO_PRIORITY_SHADER, 0, &slab->bo); diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c index a4471d87910..b1c4f25fe1f 100644 --- a/src/amd/vulkan/si_cmd_buffer.c +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -79,6 +79,9 @@ si_emit_compute(struct radv_device *device, struct radeon_cmdbuf *cs) radeon_emit(cs, 0); radeon_emit(cs, 0); + radeon_set_sh_reg(cs, R_00B834_COMPUTE_PGM_HI, + S_00B834_DATA(device->physical_device->rad_info.address32_hi >> 8)); + radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1, * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */ @@ -291,6 +294,23 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs) radeon_set_context_reg(cs, R_028408_VGT_INDX_OFFSET, 0); } + if (device->physical_device->rad_info.chip_class >= GFX10) { + radeon_set_sh_reg(cs, R_00B524_SPI_SHADER_PGM_HI_LS, + S_00B524_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + radeon_set_sh_reg(cs, R_00B324_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + } else if (device->physical_device->rad_info.chip_class == GFX9) { + radeon_set_sh_reg(cs, R_00B414_SPI_SHADER_PGM_HI_LS, + S_00B414_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + radeon_set_sh_reg(cs, R_00B214_SPI_SHADER_PGM_HI_ES, + S_00B214_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + } else { + radeon_set_sh_reg(cs, R_00B524_SPI_SHADER_PGM_HI_LS, + S_00B524_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + radeon_set_sh_reg(cs, R_00B324_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + } + unsigned cu_mask_ps = 0xffffffff; /* It's wasteful to enable all CUs for PS if shader arrays have a