mirror of
https://gitee.com/openharmony/third_party_mesa3d
synced 2024-12-13 19:32:15 +00:00
i965: Move VUE map computation to once at VS compile time.
With this and the previous patch, 640x480 nexuiz runs 0.169118% +/- 0.0863696% faster (n=121). On a VS state change microbenchmark, performance increases by 8.28645% +/- 0.460478% (n=52). v2: Fix CACHE_NEW_VS comment. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
parent
9f3d3216cf
commit
f0cecd43d6
@ -69,7 +69,7 @@ static void compile_clip_prog( struct brw_context *brw,
|
||||
c.func.single_program_flow = 1;
|
||||
|
||||
c.key = *key;
|
||||
brw_compute_vue_map(&c.vue_map, intel, brw->vs.prog_data);
|
||||
c.vue_map = brw->vs.prog_data->vue_map;
|
||||
|
||||
/* nr_regs is the number of registers filled by reading data from the VUE.
|
||||
* This program accesses the entire VUE, so nr_regs needs to be the size of
|
||||
|
@ -381,6 +381,8 @@ struct brw_gs_prog_data {
|
||||
};
|
||||
|
||||
struct brw_vs_prog_data {
|
||||
struct brw_vue_map vue_map;
|
||||
|
||||
GLuint curb_read_length;
|
||||
GLuint urb_read_length;
|
||||
GLuint total_grf;
|
||||
@ -1045,9 +1047,6 @@ void brw_upload_cs_urb_state(struct brw_context *brw);
|
||||
int brw_disasm (FILE *file, struct brw_instruction *inst, int gen);
|
||||
|
||||
/* brw_vs.c */
|
||||
void brw_compute_vue_map(struct brw_vue_map *vue_map,
|
||||
const struct intel_context *intel,
|
||||
const struct brw_vs_prog_data *prog_data);
|
||||
gl_clip_plane *brw_select_clip_planes(struct gl_context *ctx);
|
||||
|
||||
/* brw_wm.c */
|
||||
|
@ -56,7 +56,7 @@ static void compile_gs_prog( struct brw_context *brw,
|
||||
memset(&c, 0, sizeof(c));
|
||||
|
||||
c.key = *key;
|
||||
brw_compute_vue_map(&c.vue_map, intel, brw->vs.prog_data);
|
||||
c.vue_map = brw->vs.prog_data->vue_map;
|
||||
c.nr_regs = (c.vue_map.num_slots + 1)/2;
|
||||
|
||||
mem_ctx = NULL;
|
||||
|
@ -63,7 +63,7 @@ static void compile_sf_prog( struct brw_context *brw,
|
||||
brw_init_compile(brw, &c.func, mem_ctx);
|
||||
|
||||
c.key = *key;
|
||||
brw_compute_vue_map(&c.vue_map, intel, brw->vs.prog_data);
|
||||
c.vue_map = brw->vs.prog_data->vue_map;
|
||||
c.urb_entry_read_offset = brw_sf_compute_urb_entry_read_offset(intel);
|
||||
c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset;
|
||||
c.nr_setup_regs = c.nr_attr_regs;
|
||||
|
@ -96,7 +96,7 @@ vec4_visitor::setup_attributes(int payload_reg)
|
||||
|
||||
prog_data->urb_read_length = (nr_attributes + 1) / 2;
|
||||
|
||||
unsigned vue_entries = MAX2(nr_attributes, c->vue_map.num_slots);
|
||||
unsigned vue_entries = MAX2(nr_attributes, c->prog_data.vue_map.num_slots);
|
||||
|
||||
if (intel->gen == 6)
|
||||
c->prog_data.urb_entry_size = ALIGN(vue_entries, 8) / 8;
|
||||
|
@ -2252,8 +2252,6 @@ vec4_visitor::emit_urb_writes()
|
||||
|
||||
/* FINISHME: edgeflag */
|
||||
|
||||
brw_compute_vue_map(&c->vue_map, intel, &c->prog_data);
|
||||
|
||||
/* First mrf is the g0-based message header containing URB handles and such,
|
||||
* which is implied in VS_OPCODE_URB_WRITE.
|
||||
*/
|
||||
@ -2265,8 +2263,8 @@ vec4_visitor::emit_urb_writes()
|
||||
|
||||
/* Set up the VUE data for the first URB write */
|
||||
int slot;
|
||||
for (slot = 0; slot < c->vue_map.num_slots; ++slot) {
|
||||
emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
|
||||
for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
|
||||
emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
|
||||
|
||||
/* If this was max_usable_mrf, we can't fit anything more into this URB
|
||||
* WRITE.
|
||||
@ -2281,16 +2279,16 @@ vec4_visitor::emit_urb_writes()
|
||||
vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
|
||||
inst->base_mrf = base_mrf;
|
||||
inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
|
||||
inst->eot = (slot >= c->vue_map.num_slots);
|
||||
inst->eot = (slot >= c->prog_data.vue_map.num_slots);
|
||||
|
||||
/* Optional second URB write */
|
||||
if (!inst->eot) {
|
||||
mrf = base_mrf + 1;
|
||||
|
||||
for (; slot < c->vue_map.num_slots; ++slot) {
|
||||
for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
|
||||
assert(mrf < max_usable_mrf);
|
||||
|
||||
emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
|
||||
emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
|
||||
}
|
||||
|
||||
current_annotation = "URB write";
|
||||
|
@ -57,13 +57,13 @@ static inline void assign_vue_slot(struct brw_vue_map *vue_map,
|
||||
* prog_data->userclip and prog_data->outputs_written in their key
|
||||
* (generated by CACHE_NEW_VS_PROG).
|
||||
*/
|
||||
void
|
||||
brw_compute_vue_map(struct brw_vue_map *vue_map,
|
||||
const struct intel_context *intel,
|
||||
const struct brw_vs_prog_data *prog_data)
|
||||
static void
|
||||
brw_compute_vue_map(struct brw_vs_compile *c)
|
||||
{
|
||||
bool userclip_active = prog_data->userclip;
|
||||
GLbitfield64 outputs_written = prog_data->outputs_written;
|
||||
struct brw_context *brw = c->func.brw;
|
||||
const struct intel_context *intel = &brw->intel;
|
||||
struct brw_vue_map *vue_map = &c->prog_data.vue_map;
|
||||
GLbitfield64 outputs_written = c->prog_data.outputs_written;
|
||||
int i;
|
||||
|
||||
vue_map->num_slots = 0;
|
||||
@ -118,7 +118,7 @@ brw_compute_vue_map(struct brw_vue_map *vue_map,
|
||||
*/
|
||||
assign_vue_slot(vue_map, VERT_RESULT_PSIZ);
|
||||
assign_vue_slot(vue_map, VERT_RESULT_HPOS);
|
||||
if (userclip_active) {
|
||||
if (c->key.userclip_active) {
|
||||
assign_vue_slot(vue_map, VERT_RESULT_CLIP_DIST0);
|
||||
assign_vue_slot(vue_map, VERT_RESULT_CLIP_DIST1);
|
||||
}
|
||||
@ -218,6 +218,8 @@ do_vs_prog(struct brw_context *brw,
|
||||
c.prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
|
||||
}
|
||||
|
||||
brw_compute_vue_map(&c);
|
||||
|
||||
/* Put dummy slots into the VUE for the SF to put the replaced
|
||||
* point sprite coords in. We shouldn't need these dummy slots,
|
||||
* which take up precious URB space, but it would mean that the SF
|
||||
|
@ -92,7 +92,6 @@ struct brw_vs_compile {
|
||||
|
||||
GLuint nr_inputs;
|
||||
|
||||
struct brw_vue_map vue_map;
|
||||
GLuint first_output;
|
||||
GLuint last_scratch;
|
||||
|
||||
|
@ -173,7 +173,6 @@ static inline bool can_use_direct_mrf(int vert_result,
|
||||
*/
|
||||
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
|
||||
{
|
||||
struct brw_context *brw = c->func.brw;
|
||||
struct intel_context *intel = &c->func.brw->intel;
|
||||
GLuint i, reg = 0, slot;
|
||||
int attributes_in_vue;
|
||||
@ -326,13 +325,12 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
|
||||
|
||||
/* Allocate outputs. The non-position outputs go straight into message regs.
|
||||
*/
|
||||
brw_compute_vue_map(&c->vue_map, intel, &c->prog_data);
|
||||
c->first_output = reg;
|
||||
|
||||
first_reladdr_output = get_first_reladdr_output(&c->vp->program);
|
||||
|
||||
for (slot = 0; slot < c->vue_map.num_slots; slot++) {
|
||||
int vert_result = c->vue_map.slot_to_vert_result[slot];
|
||||
for (slot = 0; slot < c->prog_data.vue_map.num_slots; slot++) {
|
||||
int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot];
|
||||
assert(vert_result < Elements(c->regs[PROGRAM_OUTPUT]));
|
||||
if (can_use_direct_mrf(vert_result, first_reladdr_output, slot)) {
|
||||
c->regs[PROGRAM_OUTPUT][vert_result] = brw_message_reg(slot + 1);
|
||||
@ -405,7 +403,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
|
||||
/* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
|
||||
* them to fit the biggest thing they need to.
|
||||
*/
|
||||
attributes_in_vue = MAX2(c->vue_map.num_slots, c->nr_inputs);
|
||||
attributes_in_vue = MAX2(c->prog_data.vue_map.num_slots, c->nr_inputs);
|
||||
|
||||
if (intel->gen == 6) {
|
||||
/* Each attribute is 32 bytes (2 vec4s), so dividing by 8 gives us the
|
||||
@ -1678,12 +1676,12 @@ static void emit_vertex_write( struct brw_vs_compile *c)
|
||||
}
|
||||
|
||||
/* Move variable-addressed, non-overflow outputs to their MRFs. */
|
||||
for (slot = len_vertex_header; slot < c->vue_map.num_slots; ++slot) {
|
||||
for (slot = len_vertex_header; slot < c->prog_data.vue_map.num_slots; ++slot) {
|
||||
if (slot >= MAX_SLOTS_IN_FIRST_URB_WRITE)
|
||||
break;
|
||||
|
||||
int mrf = slot + 1;
|
||||
int vert_result = c->vue_map.slot_to_vert_result[slot];
|
||||
int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot];
|
||||
if (c->regs[PROGRAM_OUTPUT][vert_result].file ==
|
||||
BRW_GENERAL_REGISTER_FILE) {
|
||||
brw_MOV(p, brw_message_reg(mrf),
|
||||
@ -1691,7 +1689,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
|
||||
}
|
||||
}
|
||||
|
||||
eot = (slot >= c->vue_map.num_slots);
|
||||
eot = (slot >= c->prog_data.vue_map.num_slots);
|
||||
|
||||
/* Message header, plus the (first part of the) VUE. */
|
||||
msg_len = 1 + slot;
|
||||
@ -1712,14 +1710,14 @@ static void emit_vertex_write( struct brw_vs_compile *c)
|
||||
0, /* urb destination offset */
|
||||
BRW_URB_SWIZZLE_INTERLEAVE);
|
||||
|
||||
if (slot < c->vue_map.num_slots) {
|
||||
if (slot < c->prog_data.vue_map.num_slots) {
|
||||
/* Not all of the vertex outputs/results fit into the MRF.
|
||||
* Move the overflowed attributes from the GRF to the MRF and
|
||||
* issue another brw_urb_WRITE().
|
||||
*/
|
||||
GLuint mrf = 1;
|
||||
for (; slot < c->vue_map.num_slots; ++slot) {
|
||||
int vert_result = c->vue_map.slot_to_vert_result[slot];
|
||||
for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
|
||||
int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot];
|
||||
/* move from GRF to MRF */
|
||||
brw_MOV(p, brw_message_reg(mrf),
|
||||
c->regs[PROGRAM_OUTPUT][vert_result]);
|
||||
|
@ -112,7 +112,6 @@ upload_sf_state(struct brw_context *brw)
|
||||
{
|
||||
struct intel_context *intel = &brw->intel;
|
||||
struct gl_context *ctx = &intel->ctx;
|
||||
struct brw_vue_map vue_map;
|
||||
uint32_t urb_entry_read_length;
|
||||
/* BRW_NEW_FRAGMENT_PROGRAM */
|
||||
uint32_t num_outputs = _mesa_bitcount_64(brw->fragment_program->Base.InputsRead);
|
||||
@ -129,8 +128,8 @@ upload_sf_state(struct brw_context *brw)
|
||||
uint32_t point_sprite_origin;
|
||||
|
||||
/* CACHE_NEW_VS_PROG */
|
||||
brw_compute_vue_map(&vue_map, intel, brw->vs.prog_data);
|
||||
urb_entry_read_length = (vue_map.num_slots + 1)/2 - urb_entry_read_offset;
|
||||
urb_entry_read_length = ((brw->vs.prog_data->vue_map.num_slots + 1) / 2 -
|
||||
urb_entry_read_offset);
|
||||
if (urb_entry_read_length == 0) {
|
||||
/* Setting the URB entry read length to 0 causes undefined behavior, so
|
||||
* if we have no URB data to read, set it to 1.
|
||||
@ -301,9 +300,10 @@ upload_sf_state(struct brw_context *brw)
|
||||
*/
|
||||
assert(input_index < 16 || attr == input_index);
|
||||
|
||||
/* _NEW_LIGHT | _NEW_PROGRAM */
|
||||
/* CACHE_NEW_VS_PROG | _NEW_LIGHT | _NEW_PROGRAM */
|
||||
attr_overrides[input_index++] =
|
||||
get_attr_override(&vue_map, urb_entry_read_offset, attr,
|
||||
get_attr_override(&brw->vs.prog_data->vue_map,
|
||||
urb_entry_read_offset, attr,
|
||||
ctx->VertexProgram._TwoSideEnabled);
|
||||
}
|
||||
|
||||
|
@ -33,7 +33,6 @@ upload_sbe_state(struct brw_context *brw)
|
||||
{
|
||||
struct intel_context *intel = &brw->intel;
|
||||
struct gl_context *ctx = &intel->ctx;
|
||||
struct brw_vue_map vue_map;
|
||||
uint32_t urb_entry_read_length;
|
||||
/* BRW_NEW_FRAGMENT_PROGRAM */
|
||||
uint32_t num_outputs = _mesa_bitcount_64(brw->fragment_program->Base.InputsRead);
|
||||
@ -49,8 +48,8 @@ upload_sbe_state(struct brw_context *brw)
|
||||
uint32_t point_sprite_origin;
|
||||
|
||||
/* CACHE_NEW_VS_PROG */
|
||||
brw_compute_vue_map(&vue_map, intel, brw->vs.prog_data);
|
||||
urb_entry_read_length = (vue_map.num_slots + 1)/2 - urb_entry_read_offset;
|
||||
urb_entry_read_length = ((brw->vs.prog_data->vue_map.num_slots + 1) / 2 -
|
||||
urb_entry_read_offset);
|
||||
if (urb_entry_read_length == 0) {
|
||||
/* Setting the URB entry read length to 0 causes undefined behavior, so
|
||||
* if we have no URB data to read, set it to 1.
|
||||
@ -114,9 +113,10 @@ upload_sbe_state(struct brw_context *brw)
|
||||
*/
|
||||
assert(input_index < 16 || attr == input_index);
|
||||
|
||||
/* _NEW_LIGHT | _NEW_PROGRAM */
|
||||
/* CACHE_NEW_VS_PROG | _NEW_LIGHT | _NEW_PROGRAM */
|
||||
attr_overrides[input_index++] =
|
||||
get_attr_override(&vue_map, urb_entry_read_offset, attr,
|
||||
get_attr_override(&brw->vs.prog_data->vue_map,
|
||||
urb_entry_read_offset, attr,
|
||||
ctx->VertexProgram._TwoSideEnabled);
|
||||
}
|
||||
|
||||
|
@ -239,14 +239,11 @@ upload_sol_state(struct brw_context *brw)
|
||||
struct gl_transform_feedback_object *xfb_obj =
|
||||
ctx->TransformFeedback.CurrentObject;
|
||||
bool active = xfb_obj->Active && !xfb_obj->Paused;
|
||||
struct brw_vue_map vue_map;
|
||||
|
||||
/* CACHE_NEW_VS_PROG */
|
||||
brw_compute_vue_map(&vue_map, intel, brw->vs.prog_data);
|
||||
|
||||
if (active) {
|
||||
upload_3dstate_so_buffers(brw);
|
||||
upload_3dstate_so_decl_list(brw, &vue_map);
|
||||
/* CACHE_NEW_VS_PROG */
|
||||
upload_3dstate_so_decl_list(brw, &brw->vs.prog_data->vue_map);
|
||||
|
||||
intel->batch.needs_sol_reset = true;
|
||||
}
|
||||
@ -256,7 +253,7 @@ upload_sol_state(struct brw_context *brw)
|
||||
* MMIO register updates (current performed by the kernel at each batch
|
||||
* emit).
|
||||
*/
|
||||
upload_3dstate_streamout(brw, active, &vue_map);
|
||||
upload_3dstate_streamout(brw, active, &brw->vs.prog_data->vue_map);
|
||||
}
|
||||
|
||||
const struct brw_tracked_state gen7_sol_state = {
|
||||
|
Loading…
Reference in New Issue
Block a user