[Mesa-dev] [PATCH v2 14/16] i965: Add fs_visitor::run_vs() to generate scalar vertex shader code

Fri Nov 14 16:08:02 PST 2014

On Thursday, November 13, 2014 04:28:20 PM Kristian Høgsberg wrote:
> This patch uses the previous refactoring to add a new run_vs() method
> that generates vertex shader code using the scalar visitor and
> optimizer.
> 
> Signed-off-by: Kristian Høgsberg <krh at bitplanet.net>
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp         |  99 ++++++++-
>  src/mesa/drivers/dri/i965/brw_fs.h           |  21 +-
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 303 ++++++++++++++++++++++++++-
>  3 files changed, 412 insertions(+), 11 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 4dce0a2..8007977 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -1828,6 +1828,56 @@ fs_visitor::assign_urb_setup()
>        urb_start + prog_data->num_varying_inputs * 2;
>  }
>  
> +void
> +fs_visitor::assign_vs_urb_setup()
> +{
> +   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
> +   int grf, count, slot, channel, attr;
> +
> +   assert(stage == MESA_SHADER_VERTEX);
> +   count = _mesa_bitcount_64(vs_prog_data->inputs_read);
> +   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
> +      count++;
> +
> +   /* Each attribute is 4 regs. */
> +   this->first_non_payload_grf =
> +      payload.num_regs + prog_data->curb_read_length + count * 4;
> +
> +   unsigned vue_entries =
> +      MAX2(count, vs_prog_data->base.vue_map.num_slots);
> +
> +   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
> +   vs_prog_data->base.urb_read_length = (count + 1) / 2;
> +
> +   assert(vs_prog_data->base.urb_read_length <= 15);
> +
> +   /* Rewrite all ATTR file references to the hw grf that they land in. */
> +   foreach_block_and_inst(block, fs_inst, inst, cfg) {
> +      for (int i = 0; i < inst->sources; i++) {
> +         if (inst->src[i].file == ATTR) {
> +
> +            if (inst->src[i].reg == VERT_ATTRIB_MAX) {
> +               slot = count - 1;
> +            } else {
> +               attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
> +               slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
> +                                        BITFIELD64_MASK(attr));

I'm having trouble understanding this code - can you explain?

Reading ir_set_program_inouts.cpp:98 I see that incoming vertex attributes
are always vec4 slots, except for matrices and arrays, which use multiple
vec4 slots.

I expected your ATTR registers to always be size 4, so reg_offset would have
valid values of 0..3.  But I must be mistaken, since you're doing
reg_offset / 4, which would always be 0.  Are ATTRs 4*N where N == the # of
matrix columns or array length?

Even so, I don't see how applying BITFIELD64_MASK to a potentially
non-power-of-two number and then doing a bitcount will give you a single
accurate slot value.

Adding a comment would also be nice for future maintainers.

This was the main spot where I got confused - otherwise most of the code
looks good to me.

> +            }
> +
> +            channel = inst->src[i].reg_offset & 3;
> +
> +            grf = payload.num_regs +
> +               prog_data->curb_read_length +
> +               slot * 4 + channel;
> +
> +            inst->src[i].file = HW_REG;
> +            inst->src[i].fixed_hw_reg =
> +               retype(brw_vec8_grf(grf, 0), inst->src[i].type);
> +         }
> +      }
> +   }
> +}
> +
>  /**
>   * Split large virtual GRFs into separate components if we can.
>   *
> @@ -3405,6 +3455,13 @@ fs_visitor::setup_payload_gen6()
>  }
>  
>  void
> +fs_visitor::setup_vs_payload()
> +{
> +   /* R0: thread header, R1: urb handles */
> +   payload.num_regs = 2;
> +}
> +
> +void
>  fs_visitor::assign_binding_table_offsets()
>  {
>     assert(stage == MESA_SHADER_FRAGMENT);
> @@ -3471,6 +3528,8 @@ fs_visitor::opt_drop_redundant_mov_to_flags()
>  void
>  fs_visitor::optimize()
>  {
> +   const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
> +
>     calculate_cfg();
>  
>     split_virtual_grfs();
> @@ -3487,8 +3546,8 @@ fs_visitor::optimize()
>                                                                          \
>        if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
>           char filename[64];                                             \
> -         snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass,           \
> -                  dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
> +         snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
> +                  stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
>                                                                          \
>           backend_visitor::dump_instructions(filename);                  \
>        }                                                                 \
> @@ -3498,8 +3557,8 @@ fs_visitor::optimize()
>  
>     if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
>        char filename[64];
> -      snprintf(filename, 64, "fs%d-%04d-00-start",
> -               dispatch_width, shader_prog ? shader_prog->Name : 0);
> +      snprintf(filename, 64, "%s%d-%04d-00-start",
> +               stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
>  
>        backend_visitor::dump_instructions(filename);
>     }
> @@ -3608,6 +3667,38 @@ fs_visitor::allocate_registers()
>  }
>  
>  bool
> +fs_visitor::run_vs()
> +{
> +   assert(stage == MESA_SHADER_VERTEX);
> +
> +   assign_common_binding_table_offsets(0);
> +   setup_vs_payload();
> +
> +   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> +      emit_shader_time_begin();
> +
> +   foreach_in_list(ir_instruction, ir, shader->base.ir) {
> +      base_ir = ir;
> +      this->result = reg_undef;
> +      ir->accept(this);
> +   }
> +   base_ir = NULL;
> +   if (failed)
> +      return false;
> +
> +   emit_urb_writes();
> +
> +   optimize();
> +
> +   assign_curb_setup();
> +   assign_vs_urb_setup();
> +
> +   allocate_registers();
> +
> +   return !failed;
> +}
> +
> +bool
>  fs_visitor::run()
>  {
>     sanity_param_count = prog->Parameters->NumParameters;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index bb6f767..6888cdd 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -310,12 +310,23 @@ public:
>                struct gl_shader_program *shader_prog,
>                struct gl_fragment_program *fp,
>                unsigned dispatch_width);
> +
> +   fs_visitor(struct brw_context *brw,
> +              void *mem_ctx,
> +              const struct brw_vs_prog_key *key,
> +              struct brw_vs_prog_data *prog_data,
> +              struct gl_shader_program *shader_prog,
> +              struct gl_vertex_program *cp,
> +              unsigned dispatch_width);
> +
>     ~fs_visitor();
>     void init();
>  
>     fs_reg *variable_storage(ir_variable *var);
>     int virtual_grf_alloc(int size);
>     void import_uniforms(fs_visitor *v);
> +   void setup_uniform_clipplane_values();
> +   void compute_clip_distance();
>  
>     void visit(ir_variable *ir);
>     void visit(ir_assignment *ir);
> @@ -406,14 +417,17 @@ public:
>                                          uint32_t const_offset);
>  
>     bool run();
> +   bool run_vs();
>     void optimize();
>     void allocate_registers();
>     void assign_binding_table_offsets();
>     void setup_payload_gen4();
>     void setup_payload_gen6();
> +   void setup_vs_payload();
>     void assign_curb_setup();
>     void calculate_urb_setup();
>     void assign_urb_setup();
> +   void assign_vs_urb_setup();
>     bool assign_regs(bool allow_spilling);
>     void assign_regs_trivial();
>     void get_used_mrfs(bool *mrf_used);
> @@ -471,6 +485,7 @@ public:
>     fs_reg *emit_samplepos_setup();
>     fs_reg *emit_sampleid_setup();
>     fs_reg *emit_general_interpolation(ir_variable *ir);
> +   fs_reg *emit_vs_system_value(enum brw_reg_type type, int location);
>     void emit_interpolation_setup_gen4();
>     void emit_interpolation_setup_gen6();
>     void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
> @@ -557,6 +572,7 @@ public:
>     fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2,
>                                   fs_reg src0_alpha, unsigned components);
>     void emit_fb_writes();
> +   void emit_urb_writes();
>  
>     void emit_shader_time_begin();
>     void emit_shader_time_end();
> @@ -632,8 +648,8 @@ public:
>     struct hash_table *variable_ht;
>     fs_reg frag_depth;
>     fs_reg sample_mask;
> -   fs_reg outputs[BRW_MAX_DRAW_BUFFERS];
> -   unsigned output_components[BRW_MAX_DRAW_BUFFERS];
> +   fs_reg outputs[VARYING_SLOT_MAX];
> +   unsigned output_components[VARYING_SLOT_MAX];
>     fs_reg dual_src_output;
>     bool do_dual_src;
>     int first_non_payload_grf;
> @@ -680,6 +696,7 @@ public:
>     fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
>     fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
>     fs_reg shader_start_time;
> +   fs_reg userplane[MAX_CLIP_PLANES];
>  
>     int grf_used;
>     bool spilled_any_registers;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 0cc51f3..df70340 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -43,11 +43,40 @@ extern "C" {
>  #include "brw_eu.h"
>  #include "brw_wm.h"
>  }
> +#include "brw_vec4.h"
>  #include "brw_fs.h"
>  #include "main/uniforms.h"
>  #include "glsl/glsl_types.h"
>  #include "glsl/ir_optimization.h"
>  
> +fs_reg *
> +fs_visitor::emit_vs_system_value(enum brw_reg_type type, int location)
> +{
> +   fs_reg *reg = new(this->mem_ctx)
> +      fs_reg(ATTR, VERT_ATTRIB_MAX, type);
> +   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
> +
> +   switch (location) {
> +   case SYSTEM_VALUE_BASE_VERTEX:
> +      reg->reg_offset = 0;
> +      vs_prog_data->uses_vertexid = true;
> +      break;
> +   case SYSTEM_VALUE_VERTEX_ID:
> +   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> +      reg->reg_offset = 2;
> +      vs_prog_data->uses_vertexid = true;
> +      break;
> +   case SYSTEM_VALUE_INSTANCE_ID:
> +      reg->reg_offset = 3;
> +      vs_prog_data->uses_instanceid = true;
> +      break;
> +   default:
> +      unreachable("not reached");
> +   }
> +
> +   return reg;
> +}
> +
>  void
>  fs_visitor::visit(ir_variable *ir)
>  {
> @@ -57,7 +86,11 @@ fs_visitor::visit(ir_variable *ir)
>        return;
>  
>     if (ir->data.mode == ir_var_shader_in) {
> -      if (!strcmp(ir->name, "gl_FragCoord")) {
> +      if (stage == MESA_SHADER_VERTEX) {
> +         reg = new(this->mem_ctx)
> +            fs_reg(ATTR, ir->data.location,
> +                   brw_type_for_base_type(ir->type->get_scalar_type()));
> +      } else if (!strcmp(ir->name, "gl_FragCoord")) {
>  	 reg = emit_fragcoord_interpolation(ir);
>        } else if (!strcmp(ir->name, "gl_FrontFacing")) {
>  	 reg = emit_frontfacing_interpolation();
> @@ -70,7 +103,19 @@ fs_visitor::visit(ir_variable *ir)
>     } else if (ir->data.mode == ir_var_shader_out) {
>        reg = new(this->mem_ctx) fs_reg(this, ir->type);
>  
> -      if (ir->data.index > 0) {
> +      if (stage == MESA_SHADER_VERTEX) {
> +	 int vector_elements =
> +	    ir->type->is_array() ? ir->type->fields.array->vector_elements
> +				 : ir->type->vector_elements;
> +
> +	 for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
> +	    int output = ir->data.location + i;
> +	    this->outputs[output] = *reg;
> +	    this->outputs[output].reg_offset = i * 4;
> +	    this->output_components[output] = vector_elements;
> +	 }
> +
> +      } else if (ir->data.index > 0) {
>  	 assert(ir->data.location == FRAG_RESULT_DATA0);
>  	 assert(ir->data.index == 1);
>  	 this->dual_src_output = *reg;
> @@ -134,15 +179,26 @@ fs_visitor::visit(ir_variable *ir)
>        reg->type = brw_type_for_base_type(ir->type);
>  
>     } else if (ir->data.mode == ir_var_system_value) {
> -      if (ir->data.location == SYSTEM_VALUE_SAMPLE_POS) {
> +      switch (ir->data.location) {
> +      case SYSTEM_VALUE_BASE_VERTEX:
> +      case SYSTEM_VALUE_VERTEX_ID:
> +      case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> +      case SYSTEM_VALUE_INSTANCE_ID:
> +         reg = emit_vs_system_value(brw_type_for_base_type(ir->type),
> +                                    ir->data.location);
> +         break;
> +      case SYSTEM_VALUE_SAMPLE_POS:
>  	 reg = emit_samplepos_setup();
> -      } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_ID) {
> +         break;
> +      case SYSTEM_VALUE_SAMPLE_ID:
>  	 reg = emit_sampleid_setup();
> -      } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_MASK_IN) {
> +         break;
> +      case SYSTEM_VALUE_SAMPLE_MASK_IN:
>           assert(brw->gen >= 7);
>           reg = new(mem_ctx)
>              fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
>                            BRW_REGISTER_TYPE_D));
> +         break;
>        }
>     }
>  
> @@ -1709,6 +1765,8 @@ get_tex(gl_shader_stage stage, const void *key)
>     switch (stage) {
>     case MESA_SHADER_FRAGMENT:
>        return &((brw_wm_prog_key*) key)->tex;
> +   case MESA_SHADER_VERTEX:
> +      return &((brw_vec4_prog_key*) key)->tex;

Doesn't compile.  This should be: &((brw_vue_prog_key *) key)->tex;

>     default:
>        unreachable("unhandled shader stage");
>     }
> @@ -3394,6 +3452,222 @@ fs_visitor::emit_fb_writes()
>  }
>  
>  void
> +fs_visitor::setup_uniform_clipplane_values()
> +{
> +   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
> +   const struct brw_vec4_prog_key *key =
> +      (const struct brw_vec4_prog_key *) this->key;
> +
> +   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
> +      this->userplane[i] = fs_reg(UNIFORM, uniforms);
> +      for (int j = 0; j < 4; ++j) {
> +         stage_prog_data->param[uniforms + j] =
> +            (gl_constant_value *) &clip_planes[i][j];
> +      }
> +      uniforms += 4;
> +   }
> +}
> +
> +void fs_visitor::compute_clip_distance()
> +{
> +   struct brw_vue_prog_data *vue_prog_data =
> +      (struct brw_vue_prog_data *) prog_data;
> +   const struct brw_vec4_prog_key *key =
> +      (const struct brw_vec4_prog_key *) this->key;
> +

This should be brw_vue_prog_key, not brw_vec4_prog_key.

Please also copy and paste Paul's comment:

   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
    *
    *     "If a linked set of shaders forming the vertex stage contains no
    *     static write to gl_ClipVertex or gl_ClipDistance, but the
    *     application has requested clipping against user clip planes through
    *     the API, then the coordinate written to gl_Position is used for
    *     comparison against the user clip planes."
    *
    * This function is only called if the shader didn't write to
    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
    * if the user wrote to it; otherwise we use gl_Position.
    */

> +   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
> +   if (!(vec4_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
> +      clip_vertex = VARYING_SLOT_POS;
> +
> +   /* If the clip vertex isn't written, skip this.  Typically this means
> +    * the GS will set up clipping. */

*/ goes on its own line (same feedback applies in many places).

> +   if (outputs[clip_vertex].file == BAD_FILE)
> +      return;
> +
> +   setup_uniform_clipplane_values();
> +
> +   current_annotation = "user clip distances";
> +
> +   this->outputs[VARYING_SLOT_CLIP_DIST0] = fs_reg(this, glsl_type::vec4_type);
> +   this->outputs[VARYING_SLOT_CLIP_DIST1] = fs_reg(this, glsl_type::vec4_type);
> +
> +   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
> +      fs_reg u = userplane[i];
> +      fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
> +      output.reg_offset = i & 3;
> +
> +      emit(MUL(output, outputs[clip_vertex], u));
> +      for (int j = 1; j < 4; j++) {
> +         u.reg = userplane[i].reg + j;
> +         emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
> +      }
> +   }
> +}
> +
> +void
> +fs_visitor::emit_urb_writes()
> +{
> +   int slot, urb_offset, length;
> +   struct brw_vue_prog_data *vue_prog_data =
> +      (struct brw_vue_prog_data *) prog_data;
> +   const struct brw_vec4_prog_key *key =
> +      (const struct brw_vec4_prog_key *) this->key;
> +   const GLbitfield64 psiz_mask =
> +      VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
> +   bool flush;
> +   fs_reg sources[8];
> +
> +   /* Lower legacy ff and ClipVertex clipping to clip distances */
> +   if (key->userclip_active && !prog->UsesClipDistanceOut)
> +      compute_clip_distance();
> +
> +   /* If we don't have any valid slots to write, just do a minimal urb write
> +    * send to terminate the shader. */
> +   if (vec4_prog_data->vue_map.slots_valid == 0) {
> +
> +      fs_reg payload = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
> +      fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
> +                                                      BRW_REGISTER_TYPE_UD))));
> +      inst->force_writemask_all = true;
> +
> +      inst = emit(VS_OPCODE_URB_WRITE, reg_undef, payload);
> +      inst->eot = true;
> +      inst->mlen = 1;
> +      inst->offset = 1;
> +      return;
> +   }
> +
> +   length = 0;
> +   urb_offset = 0;
> +   flush = false;
> +   for (slot = 0; slot < vec4_prog_data->vue_map.num_slots; slot++) {
> +      fs_reg reg, src, zero;
> +
> +      int varying = vec4_prog_data->vue_map.slot_to_varying[slot];
> +      switch (varying) {
> +      case VARYING_SLOT_PSIZ:
> +
> +         /* The point size varying slot is the vue header and is always in the
> +          * vue map.  But often none of the special varyings that live there
> +          * are written and in that case we can skip writing to the vue
> +          * header, provided the corresponding state properly clamps the
> +          * values further down the pipeline. */
> +         if ((vec4_prog_data->vue_map.slots_valid & psiz_mask) == 0) {
> +            assert(length == 0);
> +            urb_offset++;
> +            break;
> +         }
> +
> +         zero = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
> +         emit(MOV(zero, fs_reg(0u)));
> +
> +         sources[length++] = zero;
> +         if (vec4_prog_data->vue_map.slots_valid & VARYING_BIT_LAYER)
> +            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
> +         else
> +            sources[length++] = zero;
> +
> +         if (vec4_prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT)
> +            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
> +         else
> +            sources[length++] = zero;
> +
> +         if (vec4_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ)
> +            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
> +         else
> +            sources[length++] = zero;
> +         break;
> +
> +      case BRW_VARYING_SLOT_NDC:
> +      case VARYING_SLOT_EDGE:
> +         unreachable("unexpected scalar vs output");
> +         break;
> +
> +      case BRW_VARYING_SLOT_PAD:
> +         break;
> +
> +      default:
> +         /* gl_Position is always in the vue map, but isn't always written by
> +          * the shader.  Other varyings (clip distances) get added to the vue

Really?  I guess that's true, but it's a link error to not write it prior to
GLSL 1.40, so virtually all shaders do write it...

I suppose you're right, though, and we need this code for clip distance
regardless, so my point is moot.  Never mind.

> +          * map but doesn't always get written.  In those cases, the

"don't always get written"

> +          * corresponding this->output slot will be invalid we can skip the

                                                      ", and" ^

> +          * urb write for the varying.  If we've already queued up a vue slot
> +          * for writing we flush a mlen 5 urb write, otherwise we just advance
> +          * the urb_offset.
> +          */
> +         if (this->outputs[varying].file == BAD_FILE) {
> +            if (length > 0)
> +               flush = true;
> +            else
> +               urb_offset++;
> +            break;
> +         }
> +
> +         for (int i = 0; i < 4; i++) {
> +            if ((varying == VARYING_SLOT_COL0 ||
> +                 varying == VARYING_SLOT_COL1 ||
> +                 varying == VARYING_SLOT_BFC0 ||
> +                 varying == VARYING_SLOT_BFC1) &&
> +                key->clamp_vertex_color) {
> +               /* We need to clamp these guys, so do a saturating MOV into a
> +                * temp register and use that for the payload.
> +                */
> +               reg = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_F);
> +               reg.type = this->outputs[varying].type;

Could be written more succinctly as:

reg = fs_reg(GRF, virtual_grf_alloc(1), outputs[varying].type);

> +               src = offset(this->outputs[varying], i);
> +               fs_inst *inst = emit(MOV(reg, src));
> +               inst->saturate = true;
> +               sources[length++] = reg;
> +            } else {
> +               sources[length++] = offset(this->outputs[varying], i);
> +            }
> +         }

I'd push the loop into the then/else blocks, i.e.

if (key->clamp_vertex_color && (COL0 || COL1 || BFC0 || BFC1)) {
   for (int i = 0; i < 4; i++) {
      ...
   }
} else {
   for (int i = 0; i < 4; i++)
      sources[length++] = offset(this->outputs[varying], i);
}

> +         break;
> +      }
> +
> +      current_annotation = "URB write";
> +
> +      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
> +       * the last slot or if we need to flush (see BAD_FILE varying case
> +       * above), emit a URB write send now to flush out the data.
> +       */
> +      int last = slot == vec4_prog_data->vue_map.num_slots - 1;
> +      if (length == 8 || last)
> +         flush = true;
> +      if (flush) {
> +         if (last && (INTEL_DEBUG & DEBUG_SHADER_TIME))
> +            emit_shader_time_end();
> +
> +         fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
> +         fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length + 1),
> +                                 BRW_REGISTER_TYPE_F);
> +
> +         /* We need WE_all on the MOV for the message header (the URB handles)
> +          * so do a MOV to a dummy register and set force_writemask_all on the
> +          * MOV.  LOAD_PAYLOAD will preserve that.
> +          */
> +         fs_reg dummy = fs_reg(GRF, virtual_grf_alloc(1),
> +                               BRW_REGISTER_TYPE_UD);
> +         fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
> +                                                       BRW_REGISTER_TYPE_UD))));
> +         inst->force_writemask_all = true;
> +         payload_sources[0] = dummy;
> +
> +         memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
> +         emit(LOAD_PAYLOAD(payload, payload_sources, length + 1));
> +
> +         inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
> +         inst->eot = last;
> +         inst->mlen = length + 1;
> +         inst->offset = urb_offset;
> +         urb_offset = slot + 1;
> +         length = 0;
> +         flush = false;
> +      }
> +   }
> +}
> +
> +void
>  fs_visitor::resolve_ud_negate(fs_reg *reg)
>  {
>     if (reg->type != BRW_REGISTER_TYPE_UD ||
> @@ -3437,6 +3711,25 @@ fs_visitor::fs_visitor(struct brw_context *brw,
>     init();
>  }
>  
> +fs_visitor::fs_visitor(struct brw_context *brw,
> +                       void *mem_ctx,
> +                       const struct brw_vs_prog_key *key,
> +                       struct brw_vs_prog_data *prog_data,
> +                       struct gl_shader_program *shader_prog,
> +                       struct gl_vertex_program *cp,
> +                       unsigned dispatch_width)
> +   : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
> +                     MESA_SHADER_VERTEX),
> +     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
> +     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
> +     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
> +     key(key), prog_data(&prog_data->base.base),
> +     dispatch_width(dispatch_width)
> +{
> +   this->mem_ctx = mem_ctx;
> +   init();
> +}
> +
>  void
>  fs_visitor::init()
>  {
> 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 819 bytes
Desc: This is a digitally signed message part.
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20141114/8b2c2fed/attachment-0001.sig>