[Mesa-dev] [PATCH v2 12/11] i965: Add scalar geometry shader support.

Kristian Høgsberg krh at bitplanet.net
Wed Oct 28 16:21:46 PDT 2015


On Mon, Oct 12, 2015 at 02:55:32PM -0700, Kenneth Graunke wrote:
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>

A few comments below, but

Reviewed-by: Kristian Høgsberg <krh at bitplanet.net>

> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp              | 174 ++++++++++
>  src/mesa/drivers/dri/i965/brw_fs.h                |  16 +-
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp          | 378 ++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp      |  49 ++-
>  src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp |  21 ++
>  5 files changed, 628 insertions(+), 10 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index dde8c45..778237a 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -43,6 +43,7 @@
>  #include "brw_wm.h"
>  #include "brw_fs.h"
>  #include "brw_cs.h"
> +#include "brw_vec4_gs_visitor.h"
>  #include "brw_cfg.h"
>  #include "brw_dead_control_flow.h"
>  #include "main/uniforms.h"
> @@ -1347,6 +1348,47 @@ fs_visitor::emit_discard_jump()
>  }
>  
>  void
> +fs_visitor::emit_gs_thread_end()
> +{
> +   assert(stage == MESA_SHADER_GEOMETRY);
> +
> +   if (gs_compile->control_data_header_size_bits > 0) {
> +      emit_gs_control_data_bits(this->final_gs_vertex_count);
> +   }
> +
> +   const fs_builder abld = bld.annotate("thread end");
> +   fs_inst *inst;
> +
> +   if (gs_compile->prog_data.static_vertex_count != -1) {
> +      foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
> +         if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
> +             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
> +             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
> +             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
> +            prev->eot = true;
> +            return;
> +         } else if (prev->is_control_flow() || prev->has_side_effects()) {
> +            break;
> +         }
> +      }
> +      fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +      abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
> +      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
> +      inst->mlen = 1;
> +   } else {
> +      fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
> +      fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
> +      sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
> +      sources[1] = this->final_gs_vertex_count;
> +      abld.LOAD_PAYLOAD(payload, sources, 2, 2);
> +      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
> +      inst->mlen = 2;
> +   }
> +   inst->eot = true;
> +   inst->offset = 0;
> +}
> +
> +void
>  fs_visitor::assign_curb_setup()
>  {
>     if (dispatch_width == 8) {
> @@ -1550,6 +1592,53 @@ fs_visitor::assign_vs_urb_setup()
>     }
>  }
>  
> +void
> +fs_visitor::assign_gs_urb_setup()
> +{
> +   assert(stage == MESA_SHADER_GEOMETRY);
> +
> +   const gl_geometry_program *gp = &gs_compile->gp->program;
> +   brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
> +
> +   first_non_payload_grf +=
> +      8 * vue_prog_data->urb_read_length * gp->VerticesIn;
> +
> +   const unsigned first_icp_handle = payload.num_regs -
> +      (vue_prog_data->include_vue_handles ? gp->VerticesIn : 0);
> +
> +   foreach_block_and_inst(block, fs_inst, inst, cfg) {
> +      /* Lower URB_READ_SIMD8 opcodes into real messages. */
> +      if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) {
> +         assert(inst->src[0].file == IMM);
> +         inst->src[0] = retype(brw_vec8_grf(first_icp_handle +
> +                                            inst->src[0].fixed_hw_reg.dw1.ud,
> +                                            0), BRW_REGISTER_TYPE_UD);
> +         /* for now, assume constant - we can do per-slot offsets later */
> +         assert(inst->src[1].file == IMM);
> +         inst->offset = inst->src[1].fixed_hw_reg.dw1.ud;
> +         inst->src[1] = fs_reg();
> +         inst->mlen = 1;
> +         inst->base_mrf = -1;
> +      }
> +
> +      /* Rewrite all ATTR file references to a real HW_REG. */
> +      for (int i = 0; i < inst->sources; i++) {
> +         if (inst->src[i].file != ATTR)
> +            continue;
> +
> +         int grf = payload.num_regs +
> +                   prog_data->curb_read_length +
> +                   inst->src[i].reg +
> +                   inst->src[i].reg_offset;
> +
> +         inst->src[i].file = HW_REG;
> +         inst->src[i].fixed_hw_reg =
> +            retype(brw_vec8_grf(grf, 0), inst->src[i].type);

We need to do what assign_vs_urb_setup() does when it lowers ATTR file
references, that is, account for stride and subreg_offset:

            inst->src[i].fixed_hw_reg =
               stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                                  inst->src[i].subreg_offset),
                      inst->exec_size * inst->src[i].stride,
                      inst->exec_size, inst->src[i].stride);

in case we end up with an input as a mul src on BSW:

  https://bugs.freedesktop.org/show_bug.cgi?id=91970

In fact, it looks like we could move the ATTR-lowering loop into a
helper and share it between GS and VS.
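
Something like this (untested sketch; the helper name here is just made
up) would let assign_vs_urb_setup() and assign_gs_urb_setup() share it:

   /* Hypothetical shared helper: lower ATTR file references to HW_REGs,
    * accounting for stride and subreg_offset like the VS path does.
    */
   void
   fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
   {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = payload.num_regs +
                   prog_data->curb_read_length +
                   inst->src[i].reg +
                   inst->src[i].reg_offset;

         inst->src[i].file = HW_REG;
         inst->src[i].fixed_hw_reg =
            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                               inst->src[i].subreg_offset),
                   inst->exec_size * inst->src[i].stride,
                   inst->exec_size, inst->src[i].stride);
      }
   }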

> +      }
> +   }
> +}
> +
> +
>  /**
>   * Split large virtual GRFs into separate components if we can.
>   *
> @@ -4733,6 +4822,46 @@ fs_visitor::setup_vs_payload()
>   *
>   */
>  void
> +fs_visitor::setup_gs_payload()
> +{
> +   assert(stage == MESA_SHADER_GEOMETRY);
> +
> +   const gl_geometry_program *gp = &gs_compile->gp->program;
> +   struct brw_gs_prog_data *gs_prog_data =
> +      (struct brw_gs_prog_data *) prog_data;
> +   struct brw_vue_prog_data *vue_prog_data =
> +      (struct brw_vue_prog_data *) prog_data;
> +
> +   /* R0: thread header, R1: output URB handles */
> +   payload.num_regs = 2;
> +
> +   if (gs_prog_data->include_primitive_id) {
> +      /* R2: Primitive ID 0..7 */
> +      payload.num_regs++;
> +   }
> +
> +   /* Use a maximum of 32 registers for push-model inputs. */
> +   const unsigned max_push_components = 32;
> +
> +   /* If pushing our inputs would take too many registers, reduce the URB read
> +    * length (which is in HWords, or 8 registers), and resort to pulling.
> +    *
> +    * Note that the GS reads <URB Read Length> HWords for every vertex - so we
> +    * have to multiply by VerticesIn to obtain the total storage requirement.
> +    */
> +   if (8 * vue_prog_data->urb_read_length * gp->VerticesIn >
> +       max_push_components) {
> +      gs_prog_data->base.include_vue_handles = true;
> +
> +      /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
> +      payload.num_regs += gp->VerticesIn;
> +
> +      vue_prog_data->urb_read_length =
> +         ROUND_DOWN_TO(max_push_components / gp->VerticesIn, 8) / 8;
> +   }
> +}
> +
> +void
>  fs_visitor::setup_cs_payload()
>  {
>     assert(devinfo->gen >= 7);
> @@ -4990,6 +5119,51 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
>  }
>  
>  bool
> +fs_visitor::run_gs()
> +{
> +   assert(stage == MESA_SHADER_GEOMETRY);
> +
> +   setup_gs_payload();
> +
> +   this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
> +
> +   if (gs_compile->control_data_header_size_bits > 0) {
> +      /* Create a VGRF to store accumulated control data bits. */
> +      this->control_data_bits = vgrf(glsl_type::uint_type);
> +
> +      /* If we're outputting more than 32 control data bits, then EmitVertex()
> +       * will set control_data_bits to 0 after emitting the first vertex.
> +       * Otherwise, we need to initialize it to 0 here.
> +       */
> +      if (gs_compile->control_data_header_size_bits <= 32) {
> +         const fs_builder abld = bld.annotate("initialize control data bits");
> +         abld.MOV(this->control_data_bits, fs_reg(0u));
> +      }
> +   }
> +
> +   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> +      emit_shader_time_begin();
> +
> +   emit_nir_code();

We're missing

   if (shader_time_index >= 0)
      emit_shader_time_end();

here, right?
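
i.e. mirror what the other run_*() functions do -- a rough sketch,
assuming shader_time_index is set up for the GS stage like it is for
VS/FS/CS:

   if (shader_time_index >= 0)
      emit_shader_time_begin();

   emit_nir_code();

   if (shader_time_index >= 0)
      emit_shader_time_end();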

> +   emit_gs_thread_end();
> +
> +   if (failed)
> +      return false;
> +
> +   calculate_cfg();
> +
> +   optimize();
> +
> +   assign_curb_setup();
> +   assign_gs_urb_setup();
> +
> +   fixup_3src_null_dest();
> +   allocate_registers();
> +
> +   return !failed;
> +}
> +
> +bool
>  fs_visitor::run_fs(bool do_rep_send)
>  {
>     brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index e049608..aa5ff70 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -130,18 +130,21 @@ public:
>  
>     bool run_fs(bool do_rep_send);
>     bool run_vs(gl_clip_plane *clip_planes);
> +   bool run_gs();
>     bool run_cs();
>     void optimize();
>     void allocate_registers();
>     void setup_payload_gen4();
>     void setup_payload_gen6();
>     void setup_vs_payload();
> +   void setup_gs_payload();
>     void setup_cs_payload();
>     void fixup_3src_null_dest();
>     void assign_curb_setup();
>     void calculate_urb_setup();
>     void assign_urb_setup();
>     void assign_vs_urb_setup();
> +   void assign_gs_urb_setup();
>     bool assign_regs(bool allow_spilling);
>     void assign_regs_trivial();
>     void setup_payload_interference(struct ra_graph *g, int payload_reg_count,
> @@ -277,7 +280,16 @@ public:
>                                   fs_reg color1, fs_reg color2,
>                                   fs_reg src0_alpha, unsigned components);
>     void emit_fb_writes();
> -   void emit_urb_writes();
> +   void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
> +   void set_gs_stream_control_data_bits(const fs_reg &vertex_count,
> +                                        unsigned stream_id);
> +   void emit_gs_control_data_bits(const fs_reg &vertex_count);
> +   void emit_gs_end_primitive(const nir_src &vertex_count_nir_src);
> +   void emit_gs_vertex(const nir_src &vertex_count_nir_src,
> +                       unsigned stream_id);
> +   void emit_gs_thread_end();
> +   void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
> +                           unsigned offset, unsigned num_components);
>     void emit_cs_terminate();
>     fs_reg *emit_cs_local_invocation_id_setup();
>     fs_reg *emit_cs_work_group_id_setup();
> @@ -384,6 +396,8 @@ public:
>     fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
>     fs_reg shader_start_time;
>     fs_reg userplane[MAX_CLIP_PLANES];
> +   fs_reg final_gs_vertex_count;
> +   fs_reg control_data_bits;
>  
>     unsigned grf_used;
>     bool spilled_any_registers;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index 70ddf59..f86645b 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -28,6 +28,7 @@
>  #include "program/prog_to_nir.h"
>  #include "brw_fs.h"
>  #include "brw_fs_surface_builder.h"
> +#include "brw_vec4_gs_visitor.h"
>  #include "brw_nir.h"
>  #include "brw_fs_surface_builder.h"
>  #include "brw_vec4_gs_visitor.h"
> @@ -96,6 +97,7 @@ fs_visitor::nir_setup_outputs()
>  
>        switch (stage) {
>        case MESA_SHADER_VERTEX:
> +      case MESA_SHADER_GEOMETRY:
>           for (unsigned int i = 0; i < ALIGN(type_size_scalar(var->type), 4) / 4; i++) {
>              int output = var->data.location + i;
>              this->outputs[output] = offset(reg, bld, 4 * i);
> @@ -1187,6 +1189,362 @@ emit_pixel_interpolater_send(const fs_builder &bld,
>     return inst;
>  }
>  
> +/**
> + * Computes 1 << x, given a D/UD register containing some value x.
> + */
> +static fs_reg
> +intexp2(const fs_builder &bld, const fs_reg &x)
> +{
> +   assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
> +
> +   fs_reg result = bld.vgrf(x.type, 1);
> +   fs_reg one = bld.vgrf(x.type, 1);
> +
> +   bld.MOV(one, fs_reg(1u));

Do we need to use fs_reg(1) when x.type == BRW_REGISTER_TYPE_D, to
avoid confusing constant propagation? Or perhaps just drop D support,
since we only ever use this with UD.
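
A rough sketch of the first option (keep D support, but pick an
immediate whose type matches x.type):

   /* Match the immediate type to x.type so constant propagation doesn't
    * see a UD constant feeding a D computation.
    */
   if (x.type == BRW_REGISTER_TYPE_D)
      bld.MOV(one, fs_reg(1));
   else
      bld.MOV(one, fs_reg(1u));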

> +   bld.SHL(result, one, x);
> +   return result;
> +}
> +
> +void
> +fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
> +{
> +   assert(stage == MESA_SHADER_GEOMETRY);
> +
> +   /* We can only do EndPrimitive() functionality when the control data
> +    * consists of cut bits.  Fortunately, the only time it isn't is when the
> +    * output type is points, in which case EndPrimitive() is a no-op.
> +    */
> +   if (gs_compile->prog_data.control_data_format !=
> +       GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
> +      return;
> +   }
> +
> +   /* Cut bits use one bit per vertex. */
> +   assert(gs_compile->control_data_bits_per_vertex == 1);
> +
> +   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
> +   vertex_count.type = BRW_REGISTER_TYPE_UD;
> +
> +   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
> +    * vertex n, 0 otherwise.  So all we need to do here is mark bit
> +    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
> +    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
> +    * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
> +    *
> +    * Note that if EndPrimitve() is called before emitting any vertices, this

EndPrimitve -> EndPrimitive

> +    * will cause us to set bit 31 of the control_data_bits register to 1.
> +    * That's fine because:
> +    *
> +    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
> +    *   output, so the hardware will ignore cut bit 31.
> +    *
> +    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
> +    *   last vertex, so setting cut bit 31 has no effect (since the primitive
> +    *   is automatically ended when the GS terminates).
> +    *
> +    * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
> +    *   control_data_bits register to 0 when the first vertex is emitted.
> +    */
> +
> +   const fs_builder abld = bld.annotate("end primitive");
> +
> +   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
> +   fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +   abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
> +   fs_reg mask = intexp2(abld, prev_count);
> +   /* Note: we're relying on the fact that the GEN SHL instruction only pays
> +    * attention to the lower 5 bits of its second source argument, so on this
> +    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
> +    * ((vertex_count - 1) % 32).
> +    */
> +   abld.OR(this->control_data_bits, this->control_data_bits, mask);
> +}
> +
> +void
> +fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
> +{
> +   assert(stage == MESA_SHADER_GEOMETRY);
> +   assert(gs_compile->control_data_bits_per_vertex != 0);
> +
> +   const fs_builder abld = bld.annotate("emit control data bits");
> +   const fs_builder fwa_bld = bld.exec_all();
> +
> +   /* We use a single UD register to accumulate control data bits (32 bits
> +    * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
> +    * at a time.
> +    *
> +    * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
> +    * We have to select a 128-bit group via the Global and Per-Slot Offsets,
> +    * then
> +    * use the Channel Mask phase to enable/disable which DWord within that
> +    * group to write.  (Remember, different SIMD8 channels may have emitted
> +    * different numbers of vertices, so we may need per-slot offsets.)
> +    *
> +    * Channel masking presents an annoying problem: we may have to replicate
> +    * the data up to 4 times:
> +    *
> +    * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
> +    *
> +    * To avoid penalizing shaders that emit a small number of vertices, we
> +    * can avoid these sometimes: if the size of the control data header is
> +    * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
> +    * in the same 128-bit group, so we can skip per-slot offsets.
> +    *
> +    * Similarly, if the control data header is <= 32 bits, there is only one
> +    * DWord, so we can skip channel masks.
> +    */
> +   enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
> +
> +   fs_reg channel_mask, per_slot_offset;
> +
> +   if (gs_compile->control_data_header_size_bits > 32) {
> +      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
> +      channel_mask = vgrf(glsl_type::uint_type);
> +   }
> +
> +   if (gs_compile->control_data_header_size_bits > 128) {
> +      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
> +      per_slot_offset = vgrf(glsl_type::uint_type);
> +   }
> +
> +   /* Figure out which DWord we're trying to write to using the formula:
> +    *
> +    *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
> +    *
> +    * Since bits_per_vertex is a power of two, and is known at compile
> +    * time, this can be optimized to:
> +    *
> +    *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
> +    */
> +   if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
> +      fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +      fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +      abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
> +      unsigned log2_bits_per_vertex =
> +         _mesa_fls(gs_compile->control_data_bits_per_vertex);
> +      abld.SHR(dword_index, prev_count, fs_reg(6u - log2_bits_per_vertex));
> +
> +      if (per_slot_offset.file != BAD_FILE) {
> +         /* Set the per-slot offset to dword_index / 4, to that we'll write to

to that -> so that

> +          * the appropriate OWord within the control data header.
> +          */
> +         abld.SHR(per_slot_offset, dword_index, fs_reg(2u));
> +      }
> +
> +      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
> +       * write to the appropriate DWORD within the OWORD.
> +       */
> +      fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +      fwa_bld.AND(channel, dword_index, fs_reg(3u));
> +      channel_mask = intexp2(fwa_bld, channel);
> +      /* Then the channel masks need to be in bits 23:16. */
> +      fwa_bld.SHL(channel_mask, channel_mask, fs_reg(16u));
> +   }
> +
> +   /* Store the control data bits in the message payload and send it. */
> +   int mlen = 2;
> +   if (channel_mask.file != BAD_FILE)
> +      mlen += 4; /* channel masks, plus 3 extra copies of the data */
> +   if (per_slot_offset.file != BAD_FILE)
> +      mlen++;
> +
> +   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
> +   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
> +   int i = 0;
> +   sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
> +   if (per_slot_offset.file != BAD_FILE)
> +      sources[i++] = per_slot_offset;
> +   if (channel_mask.file != BAD_FILE)
> +      sources[i++] = channel_mask;
> +   while (i < mlen) {
> +      sources[i++] = this->control_data_bits;
> +   }
> +
> +   abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
> +   fs_inst *inst = abld.emit(opcode, reg_undef, payload);
> +   inst->mlen = mlen;
> +   /* We need to increment Global Offset by 256-bits to make room for
> +    * Broadwell's extra "Vertex Count" payload at the beginning of the
> +    * URB entry.  Since this is an OWord message, Global Offset is counted
> +    * in 128-bit units, so we must set it to 2.
> +    */
> +   if (gs_compile->prog_data.static_vertex_count == -1)
> +      inst->offset = 2;
> +}
> +
> +void
> +fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
> +                                            unsigned stream_id)
> +{
> +   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
> +
> +   /* Note: we are calling this *before* increasing vertex_count, so
> +    * this->vertex_count == vertex_count - 1 in the formula above.
> +    */
> +
> +   /* Stream mode uses 2 bits per vertex */
> +   assert(gs_compile->control_data_bits_per_vertex == 2);
> +
> +   /* Must be a valid stream */
> +   assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
> +
> +   /* Control data bits are initialized to 0 so we don't have to set any
> +    * bits when sending vertices to stream 0.
> +    */
> +   if (stream_id == 0)
> +      return;
> +
> +   const fs_builder abld = bld.annotate("set stream control data bits", NULL);
> +
> +   /* reg::sid = stream_id */
> +   fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +   abld.MOV(sid, fs_reg(stream_id));
> +
> +   /* reg:shift_count = 2 * (vertex_count - 1) */
> +   fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +   abld.SHL(shift_count, vertex_count, fs_reg(1u));
> +
> +   /* Note: we're relying on the fact that the GEN SHL instruction only pays
> +    * attention to the lower 5 bits of its second source argument, so on this
> +    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
> +    * stream_id << ((2 * (vertex_count - 1)) % 32).
> +    */
> +   fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +   abld.SHL(mask, sid, shift_count);
> +   abld.OR(this->control_data_bits, this->control_data_bits, mask);
> +}
> +
> +void
> +fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
> +                           unsigned stream_id)
> +{
> +   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
> +   vertex_count.type = BRW_REGISTER_TYPE_UD;
> +
> +   /* Haswell and later hardware ignores the "Render Stream Select" bits
> +    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
> +    * and instead sends all primitives down the pipeline for rasterization.
> +    * If the SOL stage is enabled, "Render Stream Select" is honored and
> +    * primitives bound to non-zero streams are discarded after stream output.
> +    *
> +    * Since the only purpose of primitives sent to non-zero streams is to
> +    * be recorded by transform feedback, we can simply discard all geometry
> +    * bound to these streams when transform feedback is disabled.
> +    */
> +   if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
> +      return;
> +
> +   /* If we're outputting 32 control data bits or less, then we can wait
> +    * until the shader is over to output them all.  Otherwise we need to
> +    * output them as we go.  Now is the time to do it, since we're about to
> +    * output the vertex_count'th vertex, so it's guaranteed that the
> +    * control data bits associated with the (vertex_count - 1)th vertex are
> +    * correct.
> +    */
> +   if (gs_compile->control_data_header_size_bits > 32) {
> +      const fs_builder abld =
> +         bld.annotate("emit vertex: emit control data bits");
> +
> +      /* Only emit control data bits if we've finished accumulating a batch
> +       * of 32 bits.  This is the case when:
> +       *
> +       *     (vertex_count * bits_per_vertex) % 32 == 0
> +       *
> +       * (in other words, when the last 5 bits of vertex_count *
> +       * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
> +       * integer n (which is always the case, since bits_per_vertex is
> +       * always 1 or 2), this is equivalent to requiring that the last 5-n
> +       * bits of vertex_count are 0:
> +       *
> +       *     vertex_count & (2^(5-n) - 1) == 0
> +       *
> +       * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
> +       * equivalent to:
> +       *
> +       *     vertex_count & (32 / bits_per_vertex - 1) == 0
> +       */
> +      // XXX: check immediates.
> +      fs_inst *inst =
> +         abld.AND(bld.null_reg_d(), vertex_count,
> +                  fs_reg(32u / gs_compile->control_data_bits_per_vertex - 1u));
> +      inst->conditional_mod = BRW_CONDITIONAL_Z;
> +
> +      abld.IF(BRW_PREDICATE_NORMAL);
> +      /* If vertex_count is 0, then no control data bits have been
> +       * accumulated yet, so we can skip emitting them.
> +       */
> +      abld.CMP(bld.null_reg_d(), vertex_count, fs_reg(0u),
> +               BRW_CONDITIONAL_NEQ);
> +      abld.IF(BRW_PREDICATE_NORMAL);
> +      emit_gs_control_data_bits(vertex_count);
> +      abld.emit(BRW_OPCODE_ENDIF);
> +
> +      /* Reset control_data_bits to 0 so we can start accumulating a new
> +       * batch.
> +       *
> +       * Note: in the case where vertex_count == 0, this neutralizes the
> +       * effect of any call to EndPrimitive() that the shader may have
> +       * made before outputting its first vertex.
> +       */
> +      inst = abld.MOV(this->control_data_bits, fs_reg(0u));
> +      inst->force_writemask_all = true;
> +      abld.emit(BRW_OPCODE_ENDIF);
> +   }
> +
> +   emit_urb_writes(vertex_count);
> +
> +   /* In stream mode we have to set control data bits for all vertices
> +    * unless we have disabled control data bits completely (which we do
> +    * for GL_POINTS outputs that don't use streams).
> +    */
> +   if (gs_compile->control_data_header_size_bits > 0 &&
> +       gs_compile->prog_data.control_data_format ==
> +          GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
> +      set_gs_stream_control_data_bits(vertex_count, stream_id);
> +   }
> +}
> +
> +void
> +fs_visitor::emit_gs_input_load(const fs_reg &dst,
> +                               const nir_src &vertex_src,
> +                               unsigned input_offset,
> +                               unsigned num_components)
> +{
> +   const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) prog_data;
> +   const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0];
> +
> +   const unsigned array_stride = vue_prog_data->urb_read_length * 8;
> +
> +   const bool pushed = 4 * input_offset < array_stride;
> +
> +   if (input_offset == 0) {
> +      /* This is the VUE header, containing VARYING_SLOT_LAYER [.y],
> +       * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].
> +       * Only gl_PointSize is available as a GS input, so they must
> +       * be asking for that input.
> +       */
> +      if (pushed) {
> +         bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type));
> +      } else {
> +         fs_reg tmp = bld.vgrf(dst.type, 4);
> +         fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
> +                                  fs_reg(vertex), fs_reg(0));
> +         inst->regs_written = 4;
> +         bld.MOV(dst, offset(tmp, bld, 3));
> +      }
> +   } else {
> +      if (pushed) {
> +         int index = vertex * array_stride + 4 * input_offset;
> +         for (unsigned i = 0; i < num_components; i++) {
> +            bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type));
> +         }
> +      } else {
> +         fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
> +                                  fs_reg(vertex), fs_reg(input_offset));
> +         inst->regs_written = num_components;
> +      }
> +   }
> +}
> +
>  void
>  fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
>  {
> @@ -1577,6 +1935,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
>        break;
>     }
>  
> +   case nir_intrinsic_load_per_vertex_input_indirect:
> +      assert(!"Not allowed");
> +      /* fallthrough */

Heh, assert(false) and fallthrough? Maybe just unreachable()?
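
i.e. just:

   case nir_intrinsic_load_per_vertex_input_indirect:
      unreachable("Not allowed");

and then the /* fallthrough */ comment can go away too.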

> +   case nir_intrinsic_load_per_vertex_input:
> +      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
> +                         instr->num_components);
> +      break;
> +
>     /* Handle ARB_gpu_shader5 interpolation intrinsics
>      *
>      * It's worth a quick word of explanation as to why we handle the full
> @@ -1933,6 +2299,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
>        break;
>     }
>  
> +   case nir_intrinsic_emit_vertex_with_counter:
> +      emit_gs_vertex(instr->src[0], instr->const_index[0]);
> +      break;
> +
> +   case nir_intrinsic_end_primitive_with_counter:
> +      emit_gs_end_primitive(instr->src[0]);
> +      break;
> +
> +   case nir_intrinsic_set_vertex_count:
> +      bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
> +      break;
> +
>     default:
>        unreachable("unknown intrinsic");
>     }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index dc7fa9d..76f592f 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -866,7 +866,7 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
>  }
>  
>  void
> -fs_visitor::emit_urb_writes()
> +fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
>  {
>     int slot, urb_offset, length;
>     int starting_urb_offset = 0;
> @@ -902,9 +902,13 @@ fs_visitor::emit_urb_writes()
>        return;
>     }
>  
> +   opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
> +   int header_size = 1;
> +   fs_reg per_slot_offsets;
> +
>     if (stage == MESA_SHADER_GEOMETRY) {
>        const struct brw_gs_prog_data *gs_prog_data =
> -         (const struct brw_gs_prog_data *) prog_data;
> +         (const struct brw_gs_prog_data *) this->prog_data;
>  
>        /* We need to increment the Global Offset to skip over the control data
>         * header and the extra "Vertex Count" field (1 HWord) at the beginning
> @@ -913,6 +917,27 @@ fs_visitor::emit_urb_writes()
>        starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
>        if (gs_prog_data->static_vertex_count == -1)
>           starting_urb_offset += 2;
> +
> +      /* We also need to use per-slot offsets.  The per-slot offset is the
> +       * Vertex Count.  SIMD8 mode processes 8 different primitives at a
> +       * time; each may output a different number of vertices.
> +       */
> +      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT;
> +      header_size++;
> +
> +      /* The URB offset is in 128-bit units, so we need to multiply by 2 */
> +      const int output_vertex_size_owords =
> +         gs_prog_data->output_vertex_size_hwords * 2;
> +
> +      fs_reg offset;
> +      if (gs_vertex_count.file == IMM) {
> +         per_slot_offsets = fs_reg(output_vertex_size_owords *
> +                                   gs_vertex_count.fixed_hw_reg.dw1.ud);
> +      } else {
> +         per_slot_offsets = vgrf(glsl_type::int_type);
> +         bld.MUL(per_slot_offsets, gs_vertex_count,
> +                 fs_reg(output_vertex_size_owords));
> +      }
>     }
>  
>     length = 0;
> @@ -1012,19 +1037,25 @@ fs_visitor::emit_urb_writes()
>        if (length == 8 || last)
>           flush = true;
>        if (flush) {
> -         fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
> -         fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
> +         fs_reg *payload_sources =
> +            ralloc_array(mem_ctx, fs_reg, length + header_size);
> +         fs_reg payload = fs_reg(GRF, alloc.allocate(length + header_size),
>                                   BRW_REGISTER_TYPE_F);
>           payload_sources[0] =
>              fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
>  
> -         memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
> -         abld.LOAD_PAYLOAD(payload, payload_sources, length + 1, 1);
> +         if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT)
> +            payload_sources[1] = per_slot_offsets;
> +
> +         memcpy(&payload_sources[header_size], sources,
> +                length * sizeof sources[0]);
> +
> +         abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
> +                           header_size);
>  
> -         fs_inst *inst =
> -            abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
> +         fs_inst *inst = abld.emit(opcode, reg_undef, payload);
>           inst->eot = last && stage == MESA_SHADER_VERTEX;
> -         inst->mlen = length + 1;
> +         inst->mlen = length + header_size;
>           inst->offset = urb_offset;
>           urb_offset = starting_urb_offset + slot + 1;
>           length = 0;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> index 775f64d..246ecff 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> @@ -29,6 +29,7 @@
>  
>  #include "brw_vec4_gs_visitor.h"
>  #include "gen6_gs_visitor.h"
> +#include "brw_fs.h"
>  
>  namespace brw {
>  
> @@ -620,6 +621,26 @@ brw_gs_emit(struct brw_context *brw,
>              unsigned *final_assembly_size)
>  {
>     struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
> +   const struct brw_compiler *compiler = brw->intelScreen->compiler;
> +
> +   if (compiler->scalar_gs) {

Make this

      if (compiler->scalar_gs && prog_data->invocations <= 1) {

for now? Or assert(prog_data->invocations == 1) inside the if to fail
more loudly.
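
The assert variant would look something like this (assuming
c->prog_data.invocations is the right field to check here):

      if (compiler->scalar_gs) {
         /* The scalar GS path doesn't handle GS invocations yet, so fail
          * loudly instead of silently generating broken code.
          */
         assert(c->prog_data.invocations == 1);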

> +      fs_visitor v(compiler, brw, mem_ctx, c, shader->Program->nir);
> +      if (v.run_gs()) {
> +         c->prog_data.base.dispatch_mode = DISPATCH_MODE_SIMD8;
> +
> +         fs_generator g(compiler, brw, mem_ctx, &c->key,
> +                        &c->prog_data.base.base, &c->gp->program.Base,
> +                        v.promoted_constants, false, "GS");
> +         if (INTEL_DEBUG & DEBUG_GS) {
> +            char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %d",
> +                                         prog->Label ? prog->Label : "unnamed",
> +                                         prog->Name);
> +            g.enable_debug(name);
> +         }
> +         g.generate_code(v.cfg, 8);
> +         return g.get_assembly(final_assembly_size);
> +      }
> +   }
>  
>     if (brw->gen >= 7) {
>        /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
> -- 
> 2.6.1
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev

