intel/rt: Add a helper to create the raygen trampoline shader

Unlike graphics and compute pipelines, Vulkan ray-tracing pipelines do
not have a single entrypoint.  Instead, the raygen shader is specified
as a one-element shader binding table in the vkCmdTraceRay call.  This
means that raygen shaders have to be bindless shaders just like any
other ray tracing shader.  To launch them, we have a tiny compute shader
that acts as a trampoline and sets up the hotzone and uses btd_spawn to
fire off the raygen shader.

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7356>
This commit is contained in:
Jason Ekstrand 2020-08-06 16:31:05 -05:00 committed by Marge Bot
parent 303378e1dd
commit 96fde5518b
3 changed files with 159 additions and 0 deletions

View File

@ -368,3 +368,126 @@ brw_nir_lower_combined_intersection_any_hit(nir_shader *intersection,
NIR_PASS_V(intersection, lower_ray_walk_intrinsics, devinfo);
lower_rt_io_and_scratch(intersection);
}
static nir_ssa_def *
build_load_uniform(nir_builder *b, unsigned offset,
unsigned num_components, unsigned bit_size)
{
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
load->num_components = num_components;
load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
nir_intrinsic_set_base(load, offset);
nir_intrinsic_set_range(load, num_components * bit_size / 8);
nir_ssa_dest_init(&load->instr, &load->dest,
num_components, bit_size, NULL);
nir_builder_instr_insert(b, &load->instr);
return &load->dest.ssa;
}
#define load_trampoline_param(b, name, num_components, bit_size) \
build_load_uniform((b), offsetof(struct brw_rt_raygen_trampoline_params, name), \
(num_components), (bit_size))
nir_shader *
brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
void *mem_ctx)
{
const struct gen_device_info *devinfo = compiler->devinfo;
const nir_shader_compiler_options *nir_options =
compiler->glsl_compiler_options[MESA_SHADER_COMPUTE].NirOptions;
STATIC_ASSERT(sizeof(struct brw_rt_raygen_trampoline_params) == 32);
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
nir_options,
"RT Ray-Gen Trampoline");
ralloc_steal(mem_ctx, b.shader);
b.shader->info.cs.local_size_variable = true;
/* The RT global data and raygen BINDLESS_SHADER_RECORD addresses are
* passed in as push constants in the first register. We deal with the
* raygen BSR address here; the global data we'll deal with later.
*/
b.shader->num_uniforms = 32;
nir_ssa_def *raygen_bsr_addr =
load_trampoline_param(&b, raygen_bsr_addr, 1, 64);
nir_ssa_def *local_shift =
nir_u2u32(&b, load_trampoline_param(&b, local_group_size_log2, 3, 8));
nir_ssa_def *global_id = nir_load_work_group_id(&b, 32);
nir_ssa_def *simd_channel = nir_load_subgroup_invocation(&b);
nir_ssa_def *local_x =
nir_ubfe(&b, simd_channel, nir_imm_int(&b, 0),
nir_channel(&b, local_shift, 0));
nir_ssa_def *local_y =
nir_ubfe(&b, simd_channel, nir_channel(&b, local_shift, 0),
nir_channel(&b, local_shift, 1));
nir_ssa_def *local_z =
nir_ubfe(&b, simd_channel,
nir_iadd(&b, nir_channel(&b, local_shift, 0),
nir_channel(&b, local_shift, 1)),
nir_channel(&b, local_shift, 2));
nir_ssa_def *launch_id =
nir_iadd(&b, nir_ishl(&b, global_id, local_shift),
nir_vec3(&b, local_x, local_y, local_z));
nir_ssa_def *launch_size = nir_load_ray_launch_size(&b);
nir_push_if(&b, nir_ball(&b, nir_ult(&b, launch_id, launch_size)));
{
nir_store_global(&b, brw_nir_rt_sw_hotzone_addr(&b, devinfo), 16,
nir_vec4(&b, nir_imm_int(&b, 0), /* Stack ptr */
nir_channel(&b, launch_id, 0),
nir_channel(&b, launch_id, 1),
nir_channel(&b, launch_id, 2)),
0xf /* write mask */);
brw_nir_btd_spawn(&b, raygen_bsr_addr);
}
nir_push_else(&b, NULL);
{
/* Even though these invocations aren't being used for anything, the
* hardware allocated stack IDs for them. They need to retire them.
*/
brw_nir_btd_retire(&b);
}
nir_pop_if(&b, NULL);
nir_shader *nir = b.shader;
nir->info.name = ralloc_strdup(nir, "RT: TraceRay trampoline");
nir_validate_shader(nir, "in brw_nir_create_raygen_trampoline");
brw_preprocess_nir(compiler, nir, NULL);
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
/* brw_nir_lower_rt_intrinsics will leave us with a btd_global_arg_addr
* intrinsic which doesn't exist in compute shaders. We also created one
* above when we generated the BTD spawn intrinsic. Now we go through and
* replace them with a uniform load.
*/
nir_foreach_block(block, b.impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_btd_global_arg_addr_intel)
continue;
b.cursor = nir_before_instr(&intrin->instr);
nir_ssa_def *global_arg_addr =
load_trampoline_param(&b, rt_disp_globals_addr, 1, 64);
assert(intrin->dest.is_ssa);
nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
nir_src_for_ssa(global_arg_addr));
nir_instr_remove(instr);
}
}
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics);
brw_nir_optimize(nir, compiler, true, false);
return nir;
}

View File

@ -64,6 +64,9 @@ void brw_nir_lower_intersection_shader(nir_shader *intersection,
const struct gen_device_info *devinfo);
nir_shader *
brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
void *mem_ctx);
nir_shader *
brw_nir_create_trivial_return_shader(const struct brw_compiler *compiler,
void *mem_ctx);

View File

@ -96,6 +96,39 @@ struct brw_rt_scratch_layout {
uint64_t total_size;
};
/** Parameters passed to the raygen trampoline shader
*
* This struct is carefully construected to be 32B and must be passed to the
* raygen trampoline shader as as inline constant data.
*/
struct brw_rt_raygen_trampoline_params {
/** The GPU address of the RT_DISPATCH_GLOBALS */
uint64_t rt_disp_globals_addr;
/** The GPU address of the BINDLESS_SHADER_RECORD for the raygen shader */
uint64_t raygen_bsr_addr;
/** 1 if this is an indirect dispatch, 0 otherwise */
uint8_t is_indirect;
/** The integer log2 of the local group size
*
* Ray-tracing shaders don't have a concept of local vs. global workgroup
* size. They only have a single 3D launch size. The raygen trampoline
* shader is always dispatched with a local workgroup size equal to the
* SIMD width but the shape of the local workgroup is determined at
* dispatch time based on the shape of the launch and passed to the
* trampoline via this field. (There's no sense having a Z dimension on
* the local workgroup if the launch is 2D.)
*
* We use the integer log2 of the size because there's no point in
* non-power-of-two sizes and shifts are cheaper than division.
*/
uint8_t local_group_size_log2[3];
uint32_t pad[3];
};
/** Size of the "hot zone" in bytes
*
* The hot zone is a SW-defined data structure which is a single uvec4