[Mesa-dev] [RFC PATCH] i965/gs: add snb support

Tue Feb 25 13:03:40 PST 2014

Ok, further comments, as promised.  I think you're completely on the right
track, Ilia.  My comments below are almost all nit-picks.  I also think
that, in spite of all of my gloom and doom about transform feedback
yesterday, nothing about transform feedback is going to require you to
change your approach.  My advice would be: go ahead and proceed with your
implementation and get it working without transform feedback.  Transform
feedback can be addressed in a later patch series once the rest of the
geometry shader support is working properly.

On 24 February 2014 00:36, Ilia Mirkin <imirkin at alum.mit.edu> wrote:

> Before you read any further, this is nowhere close to working. However
> it's in
> a state where I think most of the structure is there, albeit with a lot of
> XXX
> comments. And I haven't actually implemented the new opcodes I've added.
>
> I was hoping one or two Intel people could take a look at this and let me
> know
> of any pitfalls I'm likely to run into. I've already gotten a lot of help
> and
> advice from Ken, but wanted to put something out publicly.
>
> Any and all feedback much appreciated!
>
> Not-Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
>
> ---
>  src/mesa/drivers/dri/i965/Makefile.sources        |   1 +
>  src/mesa/drivers/dri/i965/brw_defines.h           |  13 +
>  src/mesa/drivers/dri/i965/brw_vec4.h              |   1 +
>  src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp |  36 ++-
>  src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h   |   5 +-
>  src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp    |  14 ++
>  src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp     | 285
> ++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/gen6_gs_visitor.h       |  63 +++++
>  8 files changed, 405 insertions(+), 13 deletions(-)
>  create mode 100644 src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
>  create mode 100644 src/mesa/drivers/dri/i965/gen6_gs_visitor.h
>
> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources
> b/src/mesa/drivers/dri/i965/Makefile.sources
> index dc30eb3..96b637f 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -119,6 +119,7 @@ i965_FILES = \
>         gen6_clip_state.c \
>         gen6_depthstencil.c \
>         gen6_gs_state.c \
> +       gen6_gs_visitor.cpp \
>          gen6_multisample_state.c \
>         gen6_queryobj.c \
>         gen6_sampler_state.c \
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h
> b/src/mesa/drivers/dri/i965/brw_defines.h
> index 01d3cb6..a24919c 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -909,6 +909,19 @@ enum opcode {
>      * - dst is the GRF for gl_InvocationID.
>      */
>     GS_OPCODE_GET_INSTANCE_ID,
> +
> +   /**
> +    * Sets DWORD 2 of dst to the value in src DWORD 0. Used by geometry
> +    * shaders to initialize DWORD 2 of the message header, which contains
> +    * primitive start/end flags.
>

Let's make this "Used by geometry shaders to initialize DWORD 2 of the URB
write message header...", so that it's clear what message you're talking
about.

> +    */
> +   GS_OPCODE_SET_DWORD_2,
> +
> +   /**
> +    * Emits a FF_SYNC, which on Gen6 returns a VUE handle, which is
> needed to
> +    * emit state in GS.
> +    */
> +   GS_OPCODE_FF_SYNC,
>  };
>
>  enum brw_urb_write_flags {
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h
> b/src/mesa/drivers/dri/i965/brw_vec4.h
> index 6bd8b80..14d67b7 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.h
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.h
> @@ -130,6 +130,7 @@ public:
>     bool is_one() const;
>
>     src_reg(class vec4_visitor *v, const struct glsl_type *type);
> +   src_reg(class vec4_visitor *v, const struct glsl_type *type, int size);
>
>     explicit src_reg(dst_reg reg);
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> index 0a2d8ff..ede9002 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> @@ -28,6 +28,7 @@
>   */
>
>  #include "brw_vec4_gs_visitor.h"
> +#include "gen6_gs_visitor.h"
>
>  const unsigned MAX_GS_INPUT_VERTICES = 6;
>
> @@ -587,6 +588,8 @@ brw_gs_emit(struct brw_context *brw,
>  {
>     struct brw_shader *shader =
>        (brw_shader *) prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
> +   vec4_gs_visitor *gs;
>

Nit pick: we typically use "gs" as a variable name when we're referring to
the user-provided geometry shader program, and "v" when referring to an
instance of the visitor class.  To stay consistent with that, let's keep
the name "v" for this variable.

Also, to avoid future headaches, let's initialize this value to NULL.  That
way, if in the future we ever add a code path that goes to the "done" label
before this variable is initialized, deleting it won't corrupt memory.

> +   const unsigned *ret = NULL;
>
>     if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
>        printf("GLSL IR for native geometry shader %d:\n", prog->Name);
> @@ -602,12 +605,17 @@ brw_gs_emit(struct brw_context *brw,
>         likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
>        c->prog_data.dual_instanced_dispatch = false;
>
> -      vec4_gs_visitor v(brw, c, prog, shader, mem_ctx, true /* no_spills
> */);
> -      if (v.run()) {
> -         return generate_assembly(brw, prog, &c->gp->program.Base,
> -                                  &c->prog_data.base, mem_ctx,
> &v.instructions,
> -                                  final_assembly_size);
> +      if (brw->gen >= 7)
> +         gs = new vec4_gs_visitor(brw, c, prog, shader, mem_ctx, true /*
> no_spills */);
> +      else
> +         gs = new gen6_gs_visitor(brw, c, prog, shader, mem_ctx, true /*
> no_spills */);
> +      if (gs->run()) {
> +         ret = generate_assembly(brw, prog, &c->gp->program.Base,
> +                                 &c->prog_data.base, mem_ctx,
> &gs->instructions,
> +                                 final_assembly_size);
> +         goto done;
>        }
> +      delete gs;
>     }
>
>     /* Either we failed to compile in DUAL_OBJECT mode (probably because it
> @@ -622,15 +630,21 @@ brw_gs_emit(struct brw_context *brw,
>      */
>     c->prog_data.dual_instanced_dispatch = true;
>
> -   vec4_gs_visitor v(brw, c, prog, shader, mem_ctx, false /* no_spills
> */);
> -   if (!v.run()) {
> +   if (brw->gen >= 7)
> +      gs = new vec4_gs_visitor(brw, c, prog, shader, mem_ctx, false /*
> no_spills */);
> +   else
> +      gs = new gen6_gs_visitor(brw, c, prog, shader, mem_ctx, false /*
> no_spills */);
> +   if (!gs->run()) {
>        prog->LinkStatus = false;
> -      ralloc_strcat(&prog->InfoLog, v.fail_msg);
> -      return NULL;
> +      ralloc_strcat(&prog->InfoLog, gs->fail_msg);
> +      goto done;
>     }
>
> -   return generate_assembly(brw, prog, &c->gp->program.Base,
> &c->prog_data.base,
> -                            mem_ctx, &v.instructions,
> final_assembly_size);
> +   ret = generate_assembly(brw, prog, &c->gp->program.Base,
> &c->prog_data.base,
> +                           mem_ctx, &gs->instructions,
> final_assembly_size);
> +done:
> +   delete gs;
> +   return ret;
>  }
>
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
> b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
> index 68756f7..7a4a262 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
> @@ -96,14 +96,15 @@ protected:
>     virtual void visit(ir_emit_vertex *);
>     virtual void visit(ir_end_primitive *);
>
> +   src_reg vertex_count;
> +   const struct brw_gs_compile * const c;
> +
>  private:
>     int setup_varying_inputs(int payload_reg, int *attribute_map,
>                              int attributes_per_reg);
>     void emit_control_data_bits();
>
> -   src_reg vertex_count;
>     src_reg control_data_bits;
> -   const struct brw_gs_compile * const c;
>  };
>
>  } /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> index 601b364..0ebf118 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> @@ -616,6 +616,20 @@ src_reg::src_reg(class vec4_visitor *v, const struct
> glsl_type *type)
>     this->type = brw_type_for_base_type(type);
>  }
>
> +src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int
> size)
>

It looks like the only caller of this constructor passes a type of
glsl_type::uint_type, and that doesn't really make sense (since vec4-sized
values are actually being stored in the resulting virtual register).  How
about if we drop the "type" argument from this constructor entirely?  After
all the point of this constructor is simply to allocate "size" contiguous
registers, not to allocate registers corresponding to any particular GLSL
type.

> +{
> +   assert(size > 0);
> +
> +   init();
> +
> +   this->file = GRF;
> +   this->reg = v->virtual_grf_alloc(type_size(type) * size);
> +
> +   this->swizzle = BRW_SWIZZLE_NOOP;
>
+
> +   this->type = brw_type_for_base_type(type);
>

With the change I recommended above, this line would simply become:

this->type = BRW_REGISTER_TYPE_UD;

> +}
> +
>  dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
>  {
>     init();
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
> b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
> new file mode 100644
> index 0000000..7be04d0
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
> @@ -0,0 +1,285 @@
> +/*
> + * Copyright © 2014 Ilia Mirkin
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without
> limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the
> next
> + * paragraph) shall be included in all copies or substantial portions of
> the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file gen6_gs_visitor.cpp
> + *
> + * Gen6 geometry-shader-specific code, derived from the Gen7+
> vec4_gs_visitor.
> + */
> +
> +#include "gen6_gs_visitor.h"
> +
> +namespace brw {
> +
> +void
> +gen6_gs_visitor::emit_prolog()
> +{
> +   vec4_gs_visitor::emit_prolog();
> +
> +   /* vertex_output layout:
> +    *
> +    * This is an array that contains all the data emitted during the
> runtime
> +    * of this GP. For each emitted vertex, there are vue_map.num_slots
> data
> +    * items, and an extra register used to store flags. This register
> comes
> +    * after the first num_slots. To match the URB_WRITE message header,
> bit 0
> +    * is PrimEnd, and bit 1 is PrimStart. The next vertex then continues
> +    * afterwards.
> +    */
>

This is good documentation, but it's not in a place where people are likely
to look for it (if I'm looking for documentation of this->vertex_output I
will usually look at the class definition in the header file).  I'd suggest
moving this comment to just above the declaration of vertex_output in
gen6_gs_visitor.h.

> +   this->vertex_output = src_reg(this, glsl_type::uint_type,
> +                                 (prog_data->vue_map.num_slots + 1) *
> +                                 c->gp->program.VerticesOut);
> +   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
> +   emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
> +
> +   this->first_vertex = src_reg(this, glsl_type::uint_type);
> +   emit(MOV(dst_reg(this->first_vertex), src_reg(2u)));
>

For code clarity, please use the #defines URB_WRITE_PRIM_END and
URB_WRITE_PRIM_START (from brw_defines.h) rather than hardcoding 1 and 2.

> +}
> +
> +void
> +gen6_gs_visitor::visit(ir_emit_vertex *)
> +{
> +   /* copy all of the outputs into the temporary area */
> +
> +   unsigned num_output_vertices = c->gp->program.VerticesOut;
> +   emit(CMP(dst_null_d(), this->vertex_count,
> +            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
> +   emit(IF(BRW_PREDICATE_NORMAL));
> +   {
> +      for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
> +         dst_reg dst(this->vertex_output);
> +         dst.reladdr = ralloc(mem_ctx, src_reg);
> +         memcpy(dst.reladdr, &this->vertex_output_offset,
> sizeof(src_reg));
> +         /* xxx generalize emit_urb_slot and use that, otherwise
> gl_Position &
> +          * co don't work */
>

Yup, agreed.  I think it shouldn't be too difficult--all that needs to be
done is to move the declaration of the destination registers from
vec4_visitor::emit_urb_slot() to its caller, so that we can pass a dst_reg
into it rather than an int representing the mrf.

> +         emit_generic_urb_slot(dst,
> prog_data->vue_map.slot_to_varying[slot]);
> +         emit(ADD(dst_reg(this->vertex_output_offset),
> +                  this->vertex_output_offset, src_reg(1u)));
> +      }
> +
> +      /* XXX write the flags in a format that will make implementation of
> the
> +       * "write flags" opcode easy.
> +       */
> +      dst_reg dst(this->vertex_output);
> +      dst.reladdr = ralloc(mem_ctx, src_reg);
> +      memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> +      if (c->gp->program.OutputType == GL_POINTS) {
> +         /* Each point starts and ends the vertex */
> +         emit(MOV(dst, src_reg(3u)));
> +      } else {
> +         /* Set the flags to first_vertex, which will be set to 2 when
> it's
> +          * the first, or 0 if it's not the first. Zero out first_vertex
> so
> +          * that future runs will work correctly as well.
> +          */
>

Some explanatory text like this would really be better in the
gen6_gs_visitor.h file, next to the definition of first_vertex.  It took me
some hunting to find this comment before I really understood how
first_vertex worked.

> +         emit(MOV(dst, this->first_vertex));
> +         emit(MOV(dst_reg(this->first_vertex), src_reg(0u)));
> +      }
> +
> +      emit(ADD(dst_reg(this->vertex_output_offset),
> +               this->vertex_output_offset, src_reg(1u)));
> +
> +      emit(ADD(dst_reg(this->vertex_count),
> +               this->vertex_count, src_reg(1u)));
> +   }
> +   emit(BRW_OPCODE_ENDIF);
> +}
> +
> +void
> +gen6_gs_visitor::visit(ir_end_primitive *)
> +{
> +   /* This has no effect for GL_POINTS */
> +   if (c->gp->program.OutputType == GL_POINTS)
> +      return;
> +
> +   /* mark the current vertex as ending the primitive */
> +   unsigned num_output_vertices = c->gp->program.VerticesOut;
> +   emit(CMP(dst_null_d(), this->vertex_count,
> +            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
>

We also need a check to make sure that we skip the code below when
num_output_vertices == 0.  Otherwise when we subtract 1 from
vertex_output_offset, we'll wind up stomping on data that belongs to some
other variable in the shader.

I believe we can fix this by just inserting the following code here:

vec4_instruction *inst = emit(CMP(dst_null_d(), this->vertex_count, 0u,
BRW_CONDITIONAL_NEQ));
inst->predicate = BRW_PREDICATE_NORMAL;

(setting the predicate means that the second comparison will only run if
the first one passes, so in effect they are ANDed).

> +   emit(IF(BRW_PREDICATE_NORMAL));
> +   {
> +      /* vertex_output_offset is already pointing at the first entry of
> the
> +       * next vertex. So subtract 1 to modify the flags for the previous
> +       * vertex.
> +       */
> +      src_reg offset(this, glsl_type::uint_type);
> +      emit(MOV(dst_reg(offset), this->vertex_output_offset));
> +      emit(ADD(dst_reg(offset), offset, src_reg(~0u)));
>

This should be possible to do in a single instruction:

emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(~0u));

> +
> +      src_reg dst(this->vertex_output);
> +      dst.reladdr = ralloc(mem_ctx, src_reg);
> +      memcpy(dst.reladdr, &offset, sizeof(src_reg));
> +
> +      /* Set the 0 bit to indicate that the primitive is ending */
> +      emit(OR(dst_reg(dst), dst, src_reg(1u)));
> +
> +      /* Set the first vertex flag to indicate that the next vertex will
> start
> +       * a primitive
> +       */
> +      emit(MOV(dst_reg(this->first_vertex), src_reg(2u)));
> +   }
> +   emit(BRW_OPCODE_ENDIF);
> +}
> +
> +void
> +gen6_gs_visitor::emit_thread_end()
> +{
>

Before doing anything else, we should ensure that any primitive that is in
progress gets ended.  (Gen7+ does this automatically, but on Gen6 we have
to do it ourselves, because we have to emit correct start/end flags with
each vertex).  I think we can accomplish this by simply inserting a call to
the end_primitive visitor here.

> +   /* Now we take all of the vertex data previously written into
> +    * vertex_output, and emit it for real. This involves looping through
> all
> +    * the vertices, loading the relevant data back into the MRFs, and
> emitting
> +    * write opcodes.
> +    *
> +    * Note that this has to be done before
> move_grf_array_access_to_scratch
> +    * runs and does its magic, otherwise we have to manually deal with
> scratch
> +    * space.
> +    */
> +
> +   src_reg i(this, glsl_type::uint_type);
> +   emit(MOV(dst_reg(i), src_reg(0u)));
>

Is "i" counting vertices?  If so it would be nice to have a comment here to
clarify that.

> +
> +   src_reg offset(this, glsl_type::uint_type);
> +   emit(MOV(dst_reg(offset), src_reg(0u)));
>

Similarly, it would be nice to have a comment explaining what "offset" is
indexing into (vertex_data I presume?)

> +
> +   src_reg vue_handle(this, glsl_type::uint_type);
> +   src_reg prim_flags(this, glsl_type::uint_type);
> +
> +   /* XXX check if FF_SYNC really needs the # of primitives generated, if
> it
> +    * does, figure out exactly what it wants a counter of, and count it.
> +    */
>

IIRC, the reason FF_SYNC needs the # of primitives generated is so that it
can update statistics counters correctly.  I'm not sure exactly which
counters are affected, but the ones we should double check are the ones in
AMD_performance_monitor and the ones counted by transform feedback queries
(GL_PRIMITIVES_GENERATED and GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN).
Note that the GL_PRIMITIVES_GENERATED needs to produce a correct result
even when transform feedback is not in use.

Fortunately, the number of primitives generated is pretty easy to compute:
- If the output type is GL_POINTS, it's equal to vertex_count.
- If the output type is GL_LINE_STRIP, it's equal to the number of vertices
for which the URB_WRITE_PRIM_START bit is clear.
- If the output type is GL_TRIANGLE_STRIP, it's equal to the number of
vertices for which both the URB_WRITE_PRIM_START and URB_WRITE_PRIM_END
bits are clear.

The GL_POINTS case is obviously trivial.  The other two cases should be
pretty easy to compute by looping through the vertex_output array.

>
> +   /* XXX look into SO and whether it interacts with this GS logic in any
> way.
> +    */
>

As I said before, I think your best bet is to get geometry shaders working
without worrying about transform feedback, and then address transform
feedback in a separate patch series.  However, since I'm thinking about it,
let me just give some pseudocode of what I think will be necessary to do
once we add transform feedback support.  Feel free to disregard this until
you're ready to tackle it.

Pseudocode for transform feedback support:
FF_SYNC.
Let svbi0 = the value returned in W0.1 of the FF_SYNC reply (see the Sandy
Bridge PRM, vol 4 part 2, p30).
Let max_svbi0 = the value passed in r1.4 of the thread payload (see the
Sandy Bridge PRM, vol 2 part 1, p169).
Let output_verts_per_prim = 1, 2, or 3 depending on the output type.
Let reverse_triangle = 0 (needed only for GL_TRIANGLE_STRIP).
for (int n = 0; n < num_verts; n++) {
  if (svbi0 + output_verts_per_prim >= max_svbi0) {
    break; // transform feedback buffer exhausted
  }
  switch (output type) { // Note: output_type is known at compile time
  case GL_POINTS:
    Output vertex_data[n] to transform_feedback_buffer[svbi0].
    svbi0 += 1.
    break;
  case GL_LINE_STRIP:
    if (the URB_WRITE_PRIM_START bit is clear in vertex_data[n]) {
      Output vertex_data[n-1] to transform_feedback_buffer[svbi0].
      Output vertex_data[n] to transform_feedback_buffer[svbi0 + 1].
      svbi0 += 2.
    }
    break;
  case GL_TRIANGLE_STRIP:
    if (both the URB_WRITE_PRIM_START and URB_WRITE_PRIM_END bits are clear
in vertex_data[n]) {
      Output vertex_data[n-1] to transform_feedback_buffer[svbi0 +
reverse_triangle].
      Output vertex_data[n] to transform_feedback_buffer[svbi0 + 1 -
reverse_triangle].
      Output vertex_data[n+1] to transform_feedback_buffer[svbi0 + 2].
      reverse_triangle = 1 - reverse_triangle.
      svbi0 += 3.
    } else {
      reverse_triangle = 0.
    }
    break;
  }
}

(Note: for the first attempt, we should probably use a state-based
recompile to ensure that the code above is only generated when transform
feedback is in use.  I *think* it would be possible to avoid the
state-based recompile by being sufficiently clever (basically ensure that
max_svbi0 == 0 whenever transform feedback is disabled), but it may not be
worth the effort).

> +   emit(GS_OPCODE_FF_SYNC, dst_reg(vue_handle));
> +
> +   /* XXX double-check loop logic -- can this be done using a regular
> +    * condition instead of a nested if inside an infinite do/while?
> +    */
> +   emit(BRW_OPCODE_DO);
> +   {
> +      emit(CMP(dst_null_d(), this->vertex_count, i, BRW_CONDITIONAL_L));
> +      emit(IF(BRW_PREDICATE_NORMAL));
> +      {
> +         emit(BRW_OPCODE_BREAK);
> +      }
> +      emit(BRW_OPCODE_ENDIF);
>

I believe what you can do instead is just a conditional break, i.e.:

emit(BRW_OPCODE_DO);
{
   emit(CMP(...));
   vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
   inst->predicate = BRW_PREDICATE_NORMAL;

> +
> +      /* Writes out a single vertex's worth data. This is basically the
> same
> +       * as vec4_visitor::emit_vertex, except that it's using
> vertex_output as
> +       * its source rather than the varying slots
> +       */
> +
> +      int base_mrf = 1;
> +      int mrf = base_mrf;
> +      /* MRF 14/15 are used for spill handling */
> +      int max_usable_mrf = 13;
> +
> +      dst_reg base_mrf_reg = dst_reg(MRF, base_mrf);
> +      base_mrf_reg.type = BRW_REGISTER_TYPE_UD;
> +
> +      emit_urb_write_header(mrf++);
> +
> +      /* XXX double-check that these really are the right primitive ids */
> +      unsigned prim_type;
> +      switch (c->gp->program.OutputType) {
> +      case GL_POINTS:
> +         prim_type = _3DPRIM_POINTLIST << 2;
> +         break;
> +      case GL_LINE_STRIP:
> +         prim_type = _3DPRIM_LINESTRIP << 2;
> +         break;
> +      case GL_TRIANGLE_STRIP:
> +         prim_type = _3DPRIM_TRISTRIP << 2;
> +         break;
> +      default:
> +         assert(!"Unexpected output type");
> +         prim_type = 0;
> +         break;
> +      }
>

Instead of the hardcoded 2 please use URB_WRITE_PRIM_TYPE_SHIFT from
brw_defines.h.  Also, you can use the prim_to_hw_prim array (from
brw_draw.c) instead of a switch statement.  So I think this all reduces
down to:

unsigned prim_type = prim_to_hw_prim[c->gp->program.OutputType] <<
URB_WRITE_PRIM_TYPE_SHIFT;

> +
> +      /* Location of the primitive flags for the current vertex */
> +      src_reg flag_offset(this, glsl_type::uint_type);
> +      emit(ADD(dst_reg(flag_offset),
> +               offset, src_reg(prog_data->vue_map.num_slots)));
> +
> +      /* Pointer to the primitive flag data for the current vertex */
> +      src_reg prim_data(this->vertex_output);
> +      prim_data.reladdr = ralloc(mem_ctx, src_reg);
> +      memcpy(&prim_data.reladdr, &flag_offset, sizeof(src_reg));
> +
> +      /* Combine the current vertex prim start/end flags with the
> primitive
> +       * type
> +       */
> +      emit(OR(dst_reg(prim_flags), prim_data, src_reg(prim_type)));
> +
> +      emit(GS_OPCODE_SET_DWORD_2, base_mrf_reg, prim_flags);
> +
> +      src_reg data(this->vertex_output);
> +      data.reladdr = ralloc(mem_ctx, src_reg);
> +      memcpy(data.reladdr, &offset, sizeof(src_reg));
> +
> +      int slot = 0;
> +      bool complete = false;
> +      do {
> +         /* URB offset is in URB row increments, and each of our MRFs is
> half of
> +          * one of those, since we're doing interleaved writes.
> +          */
> +         int urb_offset = slot / 2;
> +
> +         mrf = base_mrf + 1;
> +         for (; slot < prog_data->vue_map.num_slots; ++slot) {
> +            dst_reg reg = dst_reg(MRF, mrf++);
> +            reg.type = BRW_REGISTER_TYPE_F;
> +
> +            emit(MOV(reg, data));
> +
> +            /* XXX double-check that adding to offset will also alter the
> +             * reladdr above, or if we need to create a fresh offset for
> each
> +             * slot. */
> +            emit(ADD(dst_reg(offset), offset, src_reg(1u)));
> +
> +            /* If this was max_usable_mrf, we can't fit anything more
> into this
> +             * URB WRITE.
> +             */
> +            if (mrf > max_usable_mrf) {
> +               slot++;
> +               break;
> +            }
> +         }
> +
> +         complete = slot >= prog_data->vue_map.num_slots;
> +         current_annotation = "URB write";
> +         vec4_instruction *inst = emit_urb_write_opcode(complete);
> +         inst->base_mrf = base_mrf;
> +         inst->mlen = mrf - base_mrf;
> +         if ((inst->mlen % 2) != 1)
> +            inst->mlen++;
> +         inst->offset += urb_offset;
> +      } while (!complete);
> +
> +      emit(ADD(dst_reg(i), i, src_reg(1u)));
> +   }
> +   emit(BRW_OPCODE_WHILE);
> +
> +   /* XXX more thread ending logic here, perhaps similar to what
> +    * vec4_gs_visitor::emit_thread_end does. Need to check the docs.
> +    */
> +}
> +
> +} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
> b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
> new file mode 100644
> index 0000000..b519438
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
> @@ -0,0 +1,63 @@
> +/*
> + * Copyright © 2014 Ilia Mirkin
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without
> limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the
> next
> + * paragraph) shall be included in all copies or substantial portions of
> the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file gen6_gs_visitor.h
> + *
> + * Gen6 geometry-shader-specific code, derived from the Gen7+
> vec4_gs_visitor.
> + */
> +
> +#ifndef GEN6_GS_VISITOR_H
> +#define GEN6_GS_VISITOR_H
> +
> +#include "brw_vec4.h"
> +#include "brw_vec4_gs_visitor.h"
> +
> +#ifdef __cplusplus
> +namespace brw {
> +
> +class gen6_gs_visitor : public vec4_gs_visitor {
> +  public:
> +   gen6_gs_visitor(struct brw_context *brw,
> +                   struct brw_gs_compile *c,
> +                   struct gl_shader_program *prog,
> +                   struct brw_shader *shader,
> +                   void *mem_ctx,
> +                   bool no_spills) :
> +         vec4_gs_visitor(brw, c, prog, shader, mem_ctx, no_spills) {}
> +  protected:
> +   virtual void emit_prolog();
> +   virtual void emit_thread_end();
> +   virtual void visit(ir_emit_vertex *);
> +   virtual void visit(ir_end_primitive *);
> +
> +  private:
> +   src_reg vertex_output;
> +   src_reg vertex_output_offset;
> +   src_reg first_vertex;
> +};
> +
> +} /* namespace brw */
> +#endif /* __cplusplus */
> +
> +#endif /* GEN6_VS_VISITOR_H */
> --
> 1.8.3.2
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20140225/68b6ed93/attachment-0001.html>