[Mesa-dev] [PATCH 2/2] i965: Move VUE map computation to once at VS compile time.

Tue Feb 21 11:23:56 PST 2012

On 02/14/2012 08:46 PM, Eric Anholt wrote:
> With this and the previous patch, 640x480 nexuiz is running 0.169118%
> +/- 0.0863696% faster (n=121).  On a VS state change microbenchmark,
> performance is increased 8.28645% +/- 0.460478% (n=52).
>
> ---
>
> To those who saw me throwing my hands in the air saying "why does this
> patch not show up in the microbenchmark!?" today, it turns out I was
> smashing the environment in the test run so I was testing the old mesa
> for both configurations.
>
>   src/mesa/drivers/dri/i965/brw_clip.c           |    2 +-
>   src/mesa/drivers/dri/i965/brw_context.h        |    5 ++---
>   src/mesa/drivers/dri/i965/brw_gs.c             |    2 +-
>   src/mesa/drivers/dri/i965/brw_sf.c             |    2 +-
>   src/mesa/drivers/dri/i965/brw_vec4_emit.cpp    |    2 +-
>   src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp |   12 +++++-------
>   src/mesa/drivers/dri/i965/brw_vs.c             |   16 +++++++++-------
>   src/mesa/drivers/dri/i965/brw_vs.h             |    1 -
>   src/mesa/drivers/dri/i965/brw_vs_emit.c        |   20 +++++++++-----------
>   src/mesa/drivers/dri/i965/gen6_sf_state.c      |   10 +++++-----
>   src/mesa/drivers/dri/i965/gen7_sf_state.c      |   10 +++++-----
>   src/mesa/drivers/dri/i965/gen7_sol_state.c     |    9 +++------
>   12 files changed, 42 insertions(+), 49 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
> index 5b5f551..d411208 100644
> --- a/src/mesa/drivers/dri/i965/brw_clip.c
> +++ b/src/mesa/drivers/dri/i965/brw_clip.c
> @@ -69,7 +69,7 @@ static void compile_clip_prog( struct brw_context *brw,
>      c.func.single_program_flow = 1;
>
>      c.key = *key;
> -   brw_compute_vue_map(&c.vue_map, intel, brw->vs.prog_data);
> +   c.vue_map = brw->vs.prog_data->vue_map;
>
>      /* nr_regs is the number of registers filled by reading data from the VUE.
>       * This program accesses the entire VUE, so nr_regs needs to be the size of
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index c6860a7..503585c 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -381,6 +381,8 @@ struct brw_gs_prog_data {
>   };
>
>   struct brw_vs_prog_data {
> +   struct brw_vue_map vue_map;
> +
>      GLuint curb_read_length;
>      GLuint urb_read_length;
>      GLuint total_grf;
> @@ -1045,9 +1047,6 @@ void brw_upload_cs_urb_state(struct brw_context *brw);
>   int brw_disasm (FILE *file, struct brw_instruction *inst, int gen);
>
>   /* brw_vs.c */
> -void brw_compute_vue_map(struct brw_vue_map *vue_map,
> -                         const struct intel_context *intel,
> -                         const struct brw_vs_prog_data *prog_data);
>   gl_clip_plane *brw_select_clip_planes(struct gl_context *ctx);
>
>   /* brw_wm.c */
> diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
> index c6132df..bfca169 100644
> --- a/src/mesa/drivers/dri/i965/brw_gs.c
> +++ b/src/mesa/drivers/dri/i965/brw_gs.c
> @@ -56,7 +56,7 @@ static void compile_gs_prog( struct brw_context *brw,
>      memset(&c, 0, sizeof(c));
>
>      c.key = *key;
> -   brw_compute_vue_map(&c.vue_map, intel, brw->vs.prog_data);
> +   c.vue_map = brw->vs.prog_data->vue_map;
>      c.nr_regs = (c.vue_map.num_slots + 1)/2;
>
>      mem_ctx = NULL;
> diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
> index 6c28d77..6e63583 100644
> --- a/src/mesa/drivers/dri/i965/brw_sf.c
> +++ b/src/mesa/drivers/dri/i965/brw_sf.c
> @@ -63,7 +63,7 @@ static void compile_sf_prog( struct brw_context *brw,
>      brw_init_compile(brw,&c.func, mem_ctx);
>
>      c.key = *key;
> -   brw_compute_vue_map(&c.vue_map, intel, brw->vs.prog_data);
> +   c.vue_map = brw->vs.prog_data->vue_map;
>      c.urb_entry_read_offset = brw_sf_compute_urb_entry_read_offset(intel);
>      c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset;
>      c.nr_setup_regs = c.nr_attr_regs;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
> index 917c927..f9eed61 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
> @@ -96,7 +96,7 @@ vec4_visitor::setup_attributes(int payload_reg)
>
>      prog_data->urb_read_length = (nr_attributes + 1) / 2;
>
> -   unsigned vue_entries = MAX2(nr_attributes, c->vue_map.num_slots);
> +   unsigned vue_entries = MAX2(nr_attributes, c->prog_data.vue_map.num_slots);
>
>      if (intel->gen == 6)
>         c->prog_data.urb_entry_size = ALIGN(vue_entries, 8) / 8;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> index 5dfe1c1..f9a08a0 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> @@ -2252,8 +2252,6 @@ vec4_visitor::emit_urb_writes()
>
>      /* FINISHME: edgeflag */
>
> -   brw_compute_vue_map(&c->vue_map, intel,&c->prog_data);
> -
>      /* First mrf is the g0-based message header containing URB handles and such,
>       * which is implied in VS_OPCODE_URB_WRITE.
>       */
> @@ -2265,8 +2263,8 @@ vec4_visitor::emit_urb_writes()
>
>      /* Set up the VUE data for the first URB write */
>      int slot;
> -   for (slot = 0; slot<  c->vue_map.num_slots; ++slot) {
> -      emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
> +   for (slot = 0; slot<  c->prog_data.vue_map.num_slots; ++slot) {
> +      emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
>
>         /* If this was max_usable_mrf, we can't fit anything more into this URB
>          * WRITE.
> @@ -2281,16 +2279,16 @@ vec4_visitor::emit_urb_writes()
>      vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
>      inst->base_mrf = base_mrf;
>      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
> -   inst->eot = (slot>= c->vue_map.num_slots);
> +   inst->eot = (slot>= c->prog_data.vue_map.num_slots);
>
>      /* Optional second URB write */
>      if (!inst->eot) {
>         mrf = base_mrf + 1;
>
> -      for (; slot<  c->vue_map.num_slots; ++slot) {
> +      for (; slot<  c->prog_data.vue_map.num_slots; ++slot) {
>   	 assert(mrf<  max_usable_mrf);
>
> -         emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]);
> +         emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
>         }
>
>         current_annotation = "URB write";
> diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
> index be82177..ca205cd 100644
> --- a/src/mesa/drivers/dri/i965/brw_vs.c
> +++ b/src/mesa/drivers/dri/i965/brw_vs.c
> @@ -57,13 +57,13 @@ static inline void assign_vue_slot(struct brw_vue_map *vue_map,
>    * prog_data->userclip and prog_data->outputs_written in their key
>    * (generated by CACHE_NEW_VS_PROG).
>    */
> -void
> -brw_compute_vue_map(struct brw_vue_map *vue_map,
> -                    const struct intel_context *intel,
> -		    const struct brw_vs_prog_data *prog_data)
> +static void
> +brw_compute_vue_map(struct brw_vs_compile *c)
>   {
> -   bool userclip_active = prog_data->userclip;
> -   GLbitfield64 outputs_written = prog_data->outputs_written;
> +   struct brw_context *brw = c->func.brw;
> +   const struct intel_context *intel =&brw->intel;
> +   struct brw_vue_map *vue_map =&c->prog_data.vue_map;
> +   GLbitfield64 outputs_written = c->prog_data.outputs_written;
>      int i;
>
>      vue_map->num_slots = 0;
> @@ -118,7 +118,7 @@ brw_compute_vue_map(struct brw_vue_map *vue_map,
>          */
>         assign_vue_slot(vue_map, VERT_RESULT_PSIZ);
>         assign_vue_slot(vue_map, VERT_RESULT_HPOS);
> -      if (userclip_active) {
> +      if (c->key.userclip_active) {
>            assign_vue_slot(vue_map, VERT_RESULT_CLIP_DIST0);
>            assign_vue_slot(vue_map, VERT_RESULT_CLIP_DIST1);
>         }
> @@ -218,6 +218,8 @@ do_vs_prog(struct brw_context *brw,
>         c.prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
>      }
>
> +   brw_compute_vue_map(&c);
> +
>      /* Put dummy slots into the VUE for the SF to put the replaced
>       * point sprite coords in.  We shouldn't need these dummy slots,
>       * which take up precious URB space, but it would mean that the SF
> diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
> index 8814251..490fcc0 100644
> --- a/src/mesa/drivers/dri/i965/brw_vs.h
> +++ b/src/mesa/drivers/dri/i965/brw_vs.h
> @@ -92,7 +92,6 @@ struct brw_vs_compile {
>
>      GLuint nr_inputs;
>
> -   struct brw_vue_map vue_map;
>      GLuint first_output;
>      GLuint last_scratch;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
> index 07fc0af..4bdd366 100644
> --- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
> +++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
> @@ -173,7 +173,6 @@ static inline bool can_use_direct_mrf(int vert_result,
>    */
>   static void brw_vs_alloc_regs( struct brw_vs_compile *c )
>   {
> -   struct brw_context *brw = c->func.brw;
>      struct intel_context *intel =&c->func.brw->intel;
>      GLuint i, reg = 0, slot;
>      int attributes_in_vue;
> @@ -326,13 +325,12 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
>
>      /* Allocate outputs.  The non-position outputs go straight into message regs.
>       */
> -   brw_compute_vue_map(&c->vue_map, intel,&c->prog_data);
>      c->first_output = reg;
>
>      first_reladdr_output = get_first_reladdr_output(&c->vp->program);
>
> -   for (slot = 0; slot<  c->vue_map.num_slots; slot++) {
> -      int vert_result = c->vue_map.slot_to_vert_result[slot];
> +   for (slot = 0; slot<  c->prog_data.vue_map.num_slots; slot++) {
> +      int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot];
>         assert(vert_result<  Elements(c->regs[PROGRAM_OUTPUT]));
>         if (can_use_direct_mrf(vert_result, first_reladdr_output, slot)) {
>            c->regs[PROGRAM_OUTPUT][vert_result] = brw_message_reg(slot + 1);
> @@ -405,7 +403,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
>      /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
>       * them to fit the biggest thing they need to.
>       */
> -   attributes_in_vue = MAX2(c->vue_map.num_slots, c->nr_inputs);
> +   attributes_in_vue = MAX2(c->prog_data.vue_map.num_slots, c->nr_inputs);
>
>      if (intel->gen == 6) {
>         /* Each attribute is 32 bytes (2 vec4s), so dividing by 8 gives us the
> @@ -1678,12 +1676,12 @@ static void emit_vertex_write( struct brw_vs_compile *c)
>      }
>
>      /* Move variable-addressed, non-overflow outputs to their MRFs. */
> -   for (slot = len_vertex_header; slot<  c->vue_map.num_slots; ++slot) {
> +   for (slot = len_vertex_header; slot<  c->prog_data.vue_map.num_slots; ++slot) {
>         if (slot>= MAX_SLOTS_IN_FIRST_URB_WRITE)
>            break;
>
>         int mrf = slot + 1;
> -      int vert_result = c->vue_map.slot_to_vert_result[slot];
> +      int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot];
>         if (c->regs[PROGRAM_OUTPUT][vert_result].file ==
>             BRW_GENERAL_REGISTER_FILE) {
>            brw_MOV(p, brw_message_reg(mrf),
> @@ -1691,7 +1689,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
>         }
>      }
>
> -   eot = (slot>= c->vue_map.num_slots);
> +   eot = (slot>= c->prog_data.vue_map.num_slots);
>
>      /* Message header, plus the (first part of the) VUE. */
>      msg_len = 1 + slot;
> @@ -1712,14 +1710,14 @@ static void emit_vertex_write( struct brw_vs_compile *c)
>   		 0, 		/* urb destination offset */
>   		 BRW_URB_SWIZZLE_INTERLEAVE);
>
> -   if (slot<  c->vue_map.num_slots) {
> +   if (slot<  c->prog_data.vue_map.num_slots) {
>         /* Not all of the vertex outputs/results fit into the MRF.
>          * Move the overflowed attributes from the GRF to the MRF and
>          * issue another brw_urb_WRITE().
>          */
>         GLuint mrf = 1;
> -      for (; slot<  c->vue_map.num_slots; ++slot) {
> -         int vert_result = c->vue_map.slot_to_vert_result[slot];
> +      for (; slot<  c->prog_data.vue_map.num_slots; ++slot) {
> +         int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot];
>            /* move from GRF to MRF */
>            brw_MOV(p, brw_message_reg(mrf),
>                    c->regs[PROGRAM_OUTPUT][vert_result]);
> diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
> index c4e7c4c..95ed1f7 100644
> --- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
> @@ -112,7 +112,6 @@ upload_sf_state(struct brw_context *brw)
>   {
>      struct intel_context *intel =&brw->intel;
>      struct gl_context *ctx =&intel->ctx;
> -   struct brw_vue_map vue_map;
>      uint32_t urb_entry_read_length;
>      /* BRW_NEW_FRAGMENT_PROGRAM */
>      uint32_t num_outputs = _mesa_bitcount_64(brw->fragment_program->Base.InputsRead);
> @@ -129,8 +128,8 @@ upload_sf_state(struct brw_context *brw)
>      uint32_t point_sprite_origin;
>
>      /* CACHE_NEW_VS_PROG */
> -   brw_compute_vue_map(&vue_map, intel, brw->vs.prog_data);
> -   urb_entry_read_length = (vue_map.num_slots + 1)/2 - urb_entry_read_offset;
> +   urb_entry_read_length = ((brw->vs.prog_data->vue_map.num_slots + 1) / 2 -
> +			    urb_entry_read_offset);
>      if (urb_entry_read_length == 0) {
>         /* Setting the URB entry read length to 0 causes undefined behavior, so
>          * if we have no URB data to read, set it to 1.
> @@ -301,9 +300,10 @@ upload_sf_state(struct brw_context *brw)
>          */
>         assert(input_index<  16 || attr == input_index);
>
> -      /* _NEW_LIGHT | _NEW_PROGRAM */
> +      /* CACHE_NEW_VS_PROG | _NEW_LIGHT | _NEW_PROGRAM */
>         attr_overrides[input_index++] =
> -         get_attr_override(&vue_map, urb_entry_read_offset, attr,
> +         get_attr_override(&brw->vs.prog_data->vue_map,
> +			   urb_entry_read_offset, attr,
>                              ctx->VertexProgram._TwoSideEnabled);
>      }
>
> diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
> index 49460b2..aa77584 100644
> --- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
> @@ -33,7 +33,6 @@ upload_sbe_state(struct brw_context *brw)
>   {
>      struct intel_context *intel =&brw->intel;
>      struct gl_context *ctx =&intel->ctx;
> -   struct brw_vue_map vue_map;
>      uint32_t urb_entry_read_length;
>      /* BRW_NEW_FRAGMENT_PROGRAM */
>      uint32_t num_outputs = _mesa_bitcount_64(brw->fragment_program->Base.InputsRead);
> @@ -49,8 +48,8 @@ upload_sbe_state(struct brw_context *brw)
>      uint32_t point_sprite_origin;
>
>      /* CACHE_NEW_VS_PROG */
> -   brw_compute_vue_map(&vue_map, intel, brw->vs.prog_data);
> -   urb_entry_read_length = (vue_map.num_slots + 1)/2 - urb_entry_read_offset;
> +   urb_entry_read_length = ((brw->vs.prog_data->vue_map.num_slots + 1) / 2 -
> +			    urb_entry_read_offset);
>      if (urb_entry_read_length == 0) {
>         /* Setting the URB entry read length to 0 causes undefined behavior, so
>          * if we have no URB data to read, set it to 1.
> @@ -114,9 +113,10 @@ upload_sbe_state(struct brw_context *brw)
>          */
>         assert(input_index<  16 || attr == input_index);
>
> -      /* _NEW_LIGHT | _NEW_PROGRAM */
> +      /* CACHE_NEW_VS | _NEW_LIGHT | _NEW_PROGRAM */

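Typo in the flag name above: it should be CACHE_NEW_VS_PROG, as in the matching gen6_sf_state.c hunk:

/* CACHE_NEW_VS_PROG | _NEW_LIGHT | _NEW_PROGRAM */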

Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>

>         attr_overrides[input_index++] =
> -         get_attr_override(&vue_map, urb_entry_read_offset, attr,
> +         get_attr_override(&brw->vs.prog_data->vue_map,
> +			   urb_entry_read_offset, attr,
>                              ctx->VertexProgram._TwoSideEnabled);
>      }
>
> diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
> index 134153e..1a89503 100644
> --- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
> @@ -239,14 +239,11 @@ upload_sol_state(struct brw_context *brw)
>      struct gl_transform_feedback_object *xfb_obj =
>         ctx->TransformFeedback.CurrentObject;
>      bool active = xfb_obj->Active&&  !xfb_obj->Paused;
> -   struct brw_vue_map vue_map;
> -
> -   /* CACHE_NEW_VS_PROG */
> -   brw_compute_vue_map(&vue_map, intel, brw->vs.prog_data);
>
>      if (active) {
>         upload_3dstate_so_buffers(brw);
> -      upload_3dstate_so_decl_list(brw,&vue_map);
> +      /* CACHE_NEW_VS_PROG */
> +      upload_3dstate_so_decl_list(brw,&brw->vs.prog_data->vue_map);
>
>         intel->batch.needs_sol_reset = true;
>      }
> @@ -256,7 +253,7 @@ upload_sol_state(struct brw_context *brw)
>       * MMIO register updates (current performed by the kernel at each batch
>       * emit).
>       */
> -   upload_3dstate_streamout(brw, active,&vue_map);
> +   upload_3dstate_streamout(brw, active,&brw->vs.prog_data->vue_map);
>   }
>
>   const struct brw_tracked_state gen7_sol_state = {