mirror of
https://gitee.com/openharmony/third_party_mesa3d
synced 2024-11-26 17:12:07 +00:00
i965: Select ranges of UBO data to be uploaded as push constants.
This adds a NIR pass that decides which portions of UBOs we should upload as push constants, rather than pull constants. v2: Switch to uint16_t for the UBO block number, because we may have a lot of them in Vulkan (suggested by Jason). Add more comments about bitfield trickery (requested by Matt). v3: Skip vec4 stages for now...I haven't finished wiring up support in the vec4 backend, and so pushing the data but not using it will just be wasteful. Reviewed-by: Matt Turner <mattst88@gmail.com>
This commit is contained in:
parent
2a5e4f15ef
commit
6d28c6e52c
@ -74,6 +74,7 @@ COMPILER_FILES = \
|
||||
compiler/brw_nir.h \
|
||||
compiler/brw_nir.c \
|
||||
compiler/brw_nir_analyze_boolean_resolves.c \
|
||||
compiler/brw_nir_analyze_ubo_ranges.c \
|
||||
compiler/brw_nir_attribute_workarounds.c \
|
||||
compiler/brw_nir_intrinsics.c \
|
||||
compiler/brw_nir_opt_peephole_ffma.c \
|
||||
|
@ -468,6 +468,13 @@ struct brw_image_param {
|
||||
*/
|
||||
#define BRW_SHADER_TIME_STRIDE 64
|
||||
|
||||
/**
 * A contiguous slice of a single UBO block.
 *
 * brw_nir_analyze_ubo_ranges() fills these in with \c start and \c length
 * counted in 32-byte sections; a \c length of zero marks an unused slot.
 */
struct brw_ubo_range
{
   /* Index of the UBO block this range refers to. */
   uint16_t block;

   /* First section of the range within the block. */
   uint8_t start;

   /* Number of consecutive sections covered by the range. */
   uint8_t length;
};
|
||||
|
||||
struct brw_stage_prog_data {
|
||||
struct {
|
||||
/** size of our binding table. */
|
||||
@ -488,6 +495,8 @@ struct brw_stage_prog_data {
|
||||
/** @} */
|
||||
} binding_table;
|
||||
|
||||
struct brw_ubo_range ubo_ranges[4];
|
||||
|
||||
GLuint nr_params; /**< number of float params/constants */
|
||||
GLuint nr_pull_params;
|
||||
unsigned nr_image_params;
|
||||
|
@ -142,6 +142,10 @@ void brw_nir_setup_glsl_uniforms(nir_shader *shader,
|
||||
void brw_nir_setup_arb_uniforms(nir_shader *shader, struct gl_program *prog,
|
||||
struct brw_stage_prog_data *stage_prog_data);
|
||||
|
||||
void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
|
||||
nir_shader *nir,
|
||||
struct brw_ubo_range out_ranges[4]);
|
||||
|
||||
bool brw_nir_opt_peephole_ffma(nir_shader *shader);
|
||||
|
||||
#define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
|
||||
|
298
src/intel/compiler/brw_nir_analyze_ubo_ranges.c
Normal file
298
src/intel/compiler/brw_nir_analyze_ubo_ranges.c
Normal file
@ -0,0 +1,298 @@
|
||||
/*
|
||||
* Copyright © 2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_nir.h"
|
||||
#include "compiler/nir/nir.h"
|
||||
#include "util/u_dynarray.h"
|
||||
|
||||
/**
|
||||
* \file brw_nir_analyze_ubo_ranges.c
|
||||
*
|
||||
* This pass decides which portions of UBOs to upload as push constants,
|
||||
* so shaders can access them as part of the thread payload, rather than
|
||||
* having to issue expensive memory reads to pull the data.
|
||||
*
|
||||
* The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
|
||||
* buffers, in GRF (256-bit/32-byte) units.
|
||||
*
|
||||
* To do this, we examine NIR load_ubo intrinsics, recording the number of
|
||||
* loads at each offset. We track offsets at a 32-byte granularity, so even
|
||||
* fields with a bit of padding between them tend to fall into contiguous
|
||||
* ranges. We build a list of these ranges, tracking their "cost" (number
|
||||
* of registers required) and "benefit" (number of pull loads eliminated
|
||||
* by pushing the range). We then sort the list to obtain the four best
|
||||
* ranges (most benefit for the least cost).
|
||||
*/
|
||||
|
||||
struct ubo_range_entry
|
||||
{
|
||||
struct brw_ubo_range range;
|
||||
int benefit;
|
||||
};
|
||||
|
||||
static int
|
||||
score(const struct ubo_range_entry *entry)
|
||||
{
|
||||
return 2 * entry->benefit - entry->range.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares score for two UBO range entries.
|
||||
*
|
||||
* For a descending qsort().
|
||||
*/
|
||||
static int
|
||||
cmp_ubo_range_entry(const void *va, const void *vb)
|
||||
{
|
||||
const struct ubo_range_entry *a = va;
|
||||
const struct ubo_range_entry *b = vb;
|
||||
|
||||
/* Rank based on scores */
|
||||
int delta = score(b) - score(a);
|
||||
|
||||
/* Then use the UBO block index as a tie-breaker */
|
||||
if (delta == 0)
|
||||
delta = b->range.block - a->range.block;
|
||||
|
||||
/* Finally use the UBO offset as a second tie-breaker */
|
||||
if (delta == 0)
|
||||
delta = b->range.block - a->range.block;
|
||||
|
||||
return delta;
|
||||
}
|
||||
|
||||
/**
 * Usage record for one UBO block, filled in while walking the shader.
 */
struct ubo_block_info
{
   /* One bit per 32-byte section of the block.  A set bit means some load
    * touched data at that section; a clear bit is a "hole" (padding between
    * data) or simply unused space.
    */
   uint64_t offsets;

   /* Load count for each section, indexed by the same section number as
    * the bits of \c offsets.
    */
   uint8_t uses[64];
};
|
||||
|
||||
/**
 * Working state shared across the whole-shader walk.
 */
struct ubo_analysis_state
{
   /* Table of struct ubo_block_info records, keyed by UBO block index
    * (see get_block_info()).
    */
   struct hash_table *blocks;

   /* Set once any load_uniform intrinsic has been seen. */
   bool uses_regular_uniforms;
};
|
||||
|
||||
static struct ubo_block_info *
|
||||
get_block_info(struct ubo_analysis_state *state, int block)
|
||||
{
|
||||
uint32_t hash = block + 1;
|
||||
void *key = (void *) (uintptr_t) hash;
|
||||
|
||||
struct hash_entry *entry =
|
||||
_mesa_hash_table_search_pre_hashed(state->blocks, hash, key);
|
||||
|
||||
if (entry)
|
||||
return (struct ubo_block_info *) entry->data;
|
||||
|
||||
struct ubo_block_info *info =
|
||||
rzalloc(state->blocks, struct ubo_block_info);
|
||||
_mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
static void
|
||||
analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
|
||||
{
|
||||
nir_foreach_instr(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
if (intrin->intrinsic == nir_intrinsic_load_uniform)
|
||||
state->uses_regular_uniforms = true;
|
||||
|
||||
if (intrin->intrinsic != nir_intrinsic_load_ubo)
|
||||
continue;
|
||||
|
||||
nir_const_value *block_const = nir_src_as_const_value(intrin->src[0]);
|
||||
nir_const_value *offset_const = nir_src_as_const_value(intrin->src[1]);
|
||||
|
||||
if (block_const && offset_const) {
|
||||
const int block = block_const->u32[0];
|
||||
const int offset = offset_const->u32[0] / 32;
|
||||
|
||||
/* Won't fit in our bitfield */
|
||||
if (offset >= 64)
|
||||
continue;
|
||||
|
||||
/* TODO: should we count uses in loops as higher benefit? */
|
||||
|
||||
struct ubo_block_info *info = get_block_info(state, block);
|
||||
info->offsets |= 1ull << offset;
|
||||
info->uses[offset]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
print_ubo_entry(FILE *file,
|
||||
const struct ubo_range_entry *entry,
|
||||
struct ubo_analysis_state *state)
|
||||
{
|
||||
struct ubo_block_info *info = get_block_info(state, entry->range.block);
|
||||
|
||||
fprintf(file,
|
||||
"block %2d, start %2d, length %2d, bits = %zx, "
|
||||
"benefit %2d, cost %2d, score = %2d\n",
|
||||
entry->range.block, entry->range.start, entry->range.length,
|
||||
info->offsets, entry->benefit, entry->range.length, score(entry));
|
||||
}
|
||||
|
||||
void
|
||||
brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
|
||||
nir_shader *nir,
|
||||
struct brw_ubo_range out_ranges[4])
|
||||
{
|
||||
const struct gen_device_info *devinfo = compiler->devinfo;
|
||||
|
||||
if ((devinfo->gen <= 7 && !devinfo->is_haswell) ||
|
||||
!compiler->scalar_stage[nir->stage]) {
|
||||
memset(out_ranges, 0, 4 * sizeof(struct brw_ubo_range));
|
||||
return;
|
||||
}
|
||||
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
|
||||
struct ubo_analysis_state state = {
|
||||
.uses_regular_uniforms = false,
|
||||
.blocks =
|
||||
_mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
|
||||
};
|
||||
|
||||
/* Walk the IR, recording how many times each UBO block/offset is used. */
|
||||
nir_foreach_function(function, nir) {
|
||||
if (function->impl) {
|
||||
nir_foreach_block(block, function->impl) {
|
||||
analyze_ubos_block(&state, block);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Find ranges: a block, starting 32-byte offset, and length. */
|
||||
struct util_dynarray ranges;
|
||||
util_dynarray_init(&ranges, mem_ctx);
|
||||
|
||||
struct hash_entry *entry;
|
||||
hash_table_foreach(state.blocks, entry) {
|
||||
const int b = entry->hash - 1;
|
||||
const struct ubo_block_info *info = entry->data;
|
||||
uint64_t offsets = info->offsets;
|
||||
|
||||
/* Walk through the offsets bitfield, finding contiguous regions of
|
||||
* set bits:
|
||||
*
|
||||
* 0000000001111111111111000000000000111111111111110000000011111100
|
||||
* ^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ ^^^^^^
|
||||
*
|
||||
* Each of these will become a UBO range.
|
||||
*/
|
||||
while (offsets != 0) {
|
||||
/* Find the first 1 in the offsets bitfield. This represents the
|
||||
* start of a range of interesting UBO data. Make it zero-indexed.
|
||||
*/
|
||||
int first_bit = ffsll(offsets) - 1;
|
||||
|
||||
/* Find the first 0 bit in offsets beyond first_bit. To find the
|
||||
* first zero bit, we find the first 1 bit in the complement. In
|
||||
* order to ignore bits before first_bit, we mask off those bits.
|
||||
*/
|
||||
int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;
|
||||
|
||||
if (first_hole == -1) {
|
||||
/* If we didn't find a hole, then set it to the end of the
|
||||
* bitfield. There are no more ranges to process.
|
||||
*/
|
||||
first_hole = 64;
|
||||
offsets = 0;
|
||||
} else {
|
||||
/* We've processed all bits before first_hole. Mask them off. */
|
||||
offsets &= ~((1ull << first_hole) - 1);
|
||||
}
|
||||
|
||||
struct ubo_range_entry *entry =
|
||||
util_dynarray_grow(&ranges, sizeof(struct ubo_range_entry));
|
||||
|
||||
entry->range.block = b;
|
||||
entry->range.start = first_bit;
|
||||
/* first_hole is one beyond the end, so we don't need to add 1 */
|
||||
entry->range.length = first_hole - first_bit;
|
||||
entry->benefit = 0;
|
||||
|
||||
for (int i = 0; i < entry->range.length; i++)
|
||||
entry->benefit += info->uses[first_bit + i];
|
||||
}
|
||||
}
|
||||
|
||||
int nr_entries = ranges.size / sizeof(struct ubo_range_entry);
|
||||
|
||||
if (0) {
|
||||
util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
|
||||
print_ubo_entry(stderr, entry, &state);
|
||||
}
|
||||
}
|
||||
|
||||
/* TODO: Consider combining ranges.
|
||||
*
|
||||
* We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS. If there are
|
||||
* more ranges, and two are close by with only a small hole, it may be
|
||||
* worth combining them. The holes will waste register space, but the
|
||||
* benefit of removing pulls may outweigh that cost.
|
||||
*/
|
||||
|
||||
/* Sort the list so the most beneficial ranges are at the front. */
|
||||
qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
|
||||
cmp_ubo_range_entry);
|
||||
|
||||
struct ubo_range_entry *entries = ranges.data;
|
||||
|
||||
/* Return the top 4 or so. We drop by one if regular uniforms are in
|
||||
* use, assuming one push buffer will be dedicated to those. We may
|
||||
* also only get 3 on Haswell if we can't write INSTPM.
|
||||
*
|
||||
* The backend may need to shrink these ranges to ensure that they
|
||||
* don't exceed the maximum push constant limits. It can simply drop
|
||||
* the tail of the list, as that's the least valuable portion. We
|
||||
* unfortunately can't truncate it here, because we don't know what
|
||||
* the backend is planning to do with regular uniforms.
|
||||
*/
|
||||
const int max_ubos = (compiler->constant_buffer_0_is_relative ? 3 : 4) -
|
||||
state.uses_regular_uniforms;
|
||||
nr_entries = MIN2(nr_entries, max_ubos);
|
||||
|
||||
for (int i = 0; i < nr_entries; i++) {
|
||||
out_ranges[i] = entries[i].range;
|
||||
}
|
||||
for (int i = nr_entries; i < 4; i++) {
|
||||
out_ranges[i].block = 0;
|
||||
out_ranges[i].start = 0;
|
||||
out_ranges[i].length = 0;
|
||||
}
|
||||
|
||||
ralloc_free(ranges.mem_ctx);
|
||||
}
|
@ -112,6 +112,8 @@ brw_codegen_gs_prog(struct brw_context *brw,
|
||||
brw_nir_setup_glsl_uniforms(gp->program.nir, &gp->program,
|
||||
&prog_data.base.base,
|
||||
compiler->scalar_stage[MESA_SHADER_GEOMETRY]);
|
||||
brw_nir_analyze_ubo_ranges(compiler, gp->program.nir,
|
||||
prog_data.base.base.ubo_ranges);
|
||||
|
||||
uint64_t outputs_written = gp->program.info.outputs_written;
|
||||
|
||||
|
@ -205,6 +205,8 @@ brw_codegen_tcs_prog(struct brw_context *brw, struct brw_program *tcp,
|
||||
|
||||
brw_nir_setup_glsl_uniforms(nir, &tcp->program, &prog_data.base.base,
|
||||
compiler->scalar_stage[MESA_SHADER_TESS_CTRL]);
|
||||
brw_nir_analyze_ubo_ranges(compiler, tcp->program.nir,
|
||||
prog_data.base.base.ubo_ranges);
|
||||
} else {
|
||||
/* Upload the Patch URB Header as the first two uniforms.
|
||||
* Do the annoying scrambling so the shader doesn't have to.
|
||||
|
@ -102,6 +102,8 @@ brw_codegen_tes_prog(struct brw_context *brw,
|
||||
|
||||
brw_nir_setup_glsl_uniforms(nir, &tep->program, &prog_data.base.base,
|
||||
compiler->scalar_stage[MESA_SHADER_TESS_EVAL]);
|
||||
brw_nir_analyze_ubo_ranges(compiler, tep->program.nir,
|
||||
prog_data.base.base.ubo_ranges);
|
||||
|
||||
int st_index = -1;
|
||||
if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
|
||||
|
@ -203,6 +203,8 @@ brw_codegen_vs_prog(struct brw_context *brw,
|
||||
brw_nir_setup_glsl_uniforms(vp->program.nir, &vp->program,
|
||||
&prog_data.base.base,
|
||||
compiler->scalar_stage[MESA_SHADER_VERTEX]);
|
||||
brw_nir_analyze_ubo_ranges(compiler, vp->program.nir,
|
||||
prog_data.base.base.ubo_ranges);
|
||||
} else {
|
||||
brw_nir_setup_arb_uniforms(vp->program.nir, &vp->program,
|
||||
&prog_data.base.base);
|
||||
|
@ -165,6 +165,8 @@ brw_codegen_wm_prog(struct brw_context *brw,
|
||||
if (!fp->program.is_arb_asm) {
|
||||
brw_nir_setup_glsl_uniforms(fp->program.nir, &fp->program,
|
||||
&prog_data.base, true);
|
||||
brw_nir_analyze_ubo_ranges(brw->screen->compiler, fp->program.nir,
|
||||
prog_data.base.ubo_ranges);
|
||||
} else {
|
||||
brw_nir_setup_arb_uniforms(fp->program.nir, &fp->program,
|
||||
&prog_data.base);
|
||||
|
Loading…
Reference in New Issue
Block a user