[Mesa-dev] [PATCH 10/10] i965 gen6: Implement pass-through GS for transform feedback.

Kenneth Graunke kenneth at whitecape.org
Tue Dec 6 23:54:30 PST 2011


On 12/05/2011 09:40 AM, Paul Berry wrote:
> In Gen6, transform feedback is accomplished by having the geometry
> shader send vertex data to the data port using "Streamed Vertex Buffer
> Write" messages, while simultaneously passing vertices through to the
> rest of the graphics pipeline (if rendering is enabled).
> 
> This patch adds a geometry shader program that simply passes vertices
> through to the rest of the graphics pipeline.  The rest of transform
> feedback functionality will be added in future patches.
> 
> To make the new geometry shader easier to test, I've added an
> environment variable "INTEL_FORCE_GS".  If this environment variable
> is enabled, then the pass-through geometry shader will always be used,
> regardless of whether transform feedback is in effect.
> 
> On my Sandy Bridge laptop, I'm able to enable INTEL_FORCE_GS with no
> Piglit regressions.

Looks good, Paul.  One minor nit in the middle, and then a question at
the very end...

> ---
>  src/mesa/drivers/dri/i965/brw_defines.h   |    3 +
>  src/mesa/drivers/dri/i965/brw_eu.h        |    5 ++
>  src/mesa/drivers/dri/i965/brw_gs.c        |  105 ++++++++++++++++++++--------
>  src/mesa/drivers/dri/i965/brw_gs.h        |    2 +
>  src/mesa/drivers/dri/i965/brw_gs_emit.c   |   94 ++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/gen6_gs_state.c |   46 ++++++++-----
>  6 files changed, 209 insertions(+), 46 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 2990c90..ee7ec87 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -1075,6 +1075,9 @@ enum brw_message_target {
>  # define GEN6_GS_SVBI_POSTINCREMENT_VALUE_MASK		INTEL_MASK(25, 16)
>  # define GEN6_GS_ENABLE					(1 << 15)
>  
> +# define BRW_GS_EDGE_INDICATOR_0			(1 << 8)
> +# define BRW_GS_EDGE_INDICATOR_1			(1 << 9)
> +
>  #define _3DSTATE_HS                             0x781B /* GEN7+ */
>  #define _3DSTATE_TE                             0x781C /* GEN7+ */
>  #define _3DSTATE_DS                             0x781D /* GEN7+ */
> diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
> index dcb1fc9..596be02 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu.h
> +++ b/src/mesa/drivers/dri/i965/brw_eu.h
> @@ -650,6 +650,11 @@ static INLINE struct brw_reg get_element_ud( struct brw_reg reg, GLuint elt )
>     return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_UD), elt));
>  }
>  
> +static INLINE struct brw_reg get_element_d( struct brw_reg reg, GLuint elt )
> +{
> +   return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_D), elt));
> +}
> +
>  
>  static INLINE struct brw_reg brw_swizzle( struct brw_reg reg,
>  					    GLuint x,
> diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
> index 804ea84..2e729aa 100644
> --- a/src/mesa/drivers/dri/i965/brw_gs.c
> +++ b/src/mesa/drivers/dri/i965/brw_gs.c
> @@ -53,12 +53,6 @@ static void compile_gs_prog( struct brw_context *brw,
>     void *mem_ctx;
>     GLuint program_size;
>  
> -   /* Gen6: VF has already converted into polygon, and LINELOOP is
> -    * converted to LINESTRIP at the beginning of the 3D pipeline.
> -    */
> -   if (intel->gen >= 6)
> -      return;
> -
>     memset(&c, 0, sizeof(c));
>     
>     c.key = *key;
> @@ -76,24 +70,60 @@ static void compile_gs_prog( struct brw_context *brw,
>      */
>     brw_set_mask_control(&c.func, BRW_MASK_DISABLE);
>  
> -
> -   /* Note that primitives which don't require a GS program have
> -    * already been weeded out by this stage:
> -    */
> -
> -   switch (key->primitive) {
> -   case _3DPRIM_QUADLIST:
> -      brw_gs_quads( &c, key );
> -      break;
> -   case _3DPRIM_QUADSTRIP:
> -      brw_gs_quad_strip( &c, key );
> -      break;
> -   case _3DPRIM_LINELOOP:
> -      brw_gs_lines( &c );
> -      break;
> -   default:
> -      ralloc_free(mem_ctx);
> -      return;
> +   if (intel->gen >= 6) {
> +      unsigned num_verts;
> +      bool check_edge_flag;
> +      /* On Sandybridge, we use the GS for implementing transform feedback
> +       * (called "Stream Out" in the PRM).
> +       */
> +      switch (key->primitive) {
> +      case _3DPRIM_POINTLIST:
> +         num_verts = 1;
> +         check_edge_flag = false;
> +	 break;
> +      case _3DPRIM_LINELIST:
> +      case _3DPRIM_LINESTRIP:
> +      case _3DPRIM_LINELOOP:
> +         num_verts = 2;
> +         check_edge_flag = false;
> +	 break;
> +      case _3DPRIM_TRILIST:
> +      case _3DPRIM_TRIFAN:
> +      case _3DPRIM_TRISTRIP:
> +      case _3DPRIM_RECTLIST:
> +	 num_verts = 3;
> +         check_edge_flag = false;
> +         break;
> +      case _3DPRIM_QUADLIST:
> +      case _3DPRIM_QUADSTRIP:
> +      case _3DPRIM_POLYGON:
> +         num_verts = 3;
> +         check_edge_flag = true;
> +         break;
> +      default:
> +	 assert(!"Unexpected primitive type in Gen6 SOL program.");
> +	 return;
> +      }
> +      gen6_sol_program(&c, key, num_verts, check_edge_flag);
> +   } else {
> +      /* On Gen4-5, we use the GS to decompose certain types of primitives.
> +       * Note that primitives which don't require a GS program have already
> +       * been weeded out by now.
> +       */
> +      switch (key->primitive) {
> +      case _3DPRIM_QUADLIST:
> +	 brw_gs_quads( &c, key );
> +	 break;
> +      case _3DPRIM_QUADSTRIP:
> +	 brw_gs_quad_strip( &c, key );
> +	 break;
> +      case _3DPRIM_LINELOOP:
> +	 brw_gs_lines( &c );
> +	 break;
> +      default:
> +	 ralloc_free(mem_ctx);
> +	 return;
> +      }
>     }
>  
>     /* get the program
> @@ -147,11 +177,25 @@ static void populate_key( struct brw_context *brw,
>        key->pv_first = true;
>     }
>  
> -   key->need_gs_prog = (intel->gen >= 6)
> -      ? 0
> -      : (brw->primitive == _3DPRIM_QUADLIST ||
> -	 brw->primitive == _3DPRIM_QUADSTRIP ||
> -	 brw->primitive == _3DPRIM_LINELOOP);
> +   if (intel->gen == 6) {
> +      /* On Gen6, GS is used for transform feedback. */
> +      key->need_gs_prog = ctx->TransformFeedback.CurrentObject->Active;
> +   } else if (intel->gen >= 7) {
> +      /* On Gen7 and later, we don't use GS (yet). */
> +      key->need_gs_prog = false;

Could you please put these generation checks in order?  The current
sequence (6, then 7+, then 4-5) is just asking for OCD issues. :)  I'd
probably move the >= 7 check to the top.

> +   } else {
> +      /* Pre-gen6, GS is used to transform QUADLIST, QUADSTRIP, and LINELOOP
> +       * into simpler primitives.
> +       */
> +      key->need_gs_prog = (brw->primitive == _3DPRIM_QUADLIST ||
> +                           brw->primitive == _3DPRIM_QUADSTRIP ||
> +                           brw->primitive == _3DPRIM_LINELOOP);
> +   }
> +   /* For testing, the environment variable INTEL_FORCE_GS can be used to
> +    * force a GS program to be used, even if it's not necessary.
> +    */
> +   if (getenv("INTEL_FORCE_GS"))
> +      key->need_gs_prog = true;
>  }
>  
>  /* Calculate interpolants for triangle and line rasterization.
> @@ -182,7 +226,8 @@ brw_upload_gs_prog(struct brw_context *brw)
>  const struct brw_tracked_state brw_gs_prog = {
>     .dirty = {
>        .mesa  = (_NEW_LIGHT |
> -                _NEW_TRANSFORM),
> +                _NEW_TRANSFORM |
> +                _NEW_TRANSFORM_FEEDBACK),
>        .brw   = BRW_NEW_PRIMITIVE,
>        .cache = CACHE_NEW_VS_PROG
>     },
> diff --git a/src/mesa/drivers/dri/i965/brw_gs.h b/src/mesa/drivers/dri/i965/brw_gs.h
> index d71609f..bade3f6 100644
> --- a/src/mesa/drivers/dri/i965/brw_gs.h
> +++ b/src/mesa/drivers/dri/i965/brw_gs.h
> @@ -68,5 +68,7 @@ struct brw_gs_compile {
>  void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key );
>  void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key );
>  void brw_gs_lines( struct brw_gs_compile *c );
> +void gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key,
> +                      unsigned num_verts, bool check_edge_flag);
>  
>  #endif
> diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
> index 3d332c4..a6e9f50 100644
> --- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
> +++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
> @@ -101,6 +101,37 @@ static void brw_gs_overwrite_header_dw2(struct brw_gs_compile *c,
>  }
>  
>  /**
> + * Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0.
> + *
> + * When the thread is spawned, GRF 0 contains the primitive type in bits 4:0
> + * of DWORD 2.  URB_WRITE messages need the primitive type in bits 6:2 of
> + * DWORD 2.  So this function extracts the primitive type field, bitshifts it
> + * appropriately, and stores it in c->reg.header.
> + */
> +static void brw_gs_overwrite_header_dw2_from_r0(struct brw_gs_compile *c)
> +{
> +   struct brw_compile *p = &c->func;
> +   brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2),
> +           brw_imm_ud(0x1f));
> +   brw_SHL(p, get_element_ud(c->reg.header, 2),
> +           get_element_ud(c->reg.header, 2), brw_imm_ud(2));
> +}
> +
> +/**
> + * Apply an additive offset to DWORD 2 of c->reg.header.
> + *
> + * This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately
> + * for each vertex.
> + */
> +static void brw_gs_offset_header_dw2(struct brw_gs_compile *c, int offset)
> +{
> +   struct brw_compile *p = &c->func;
> +   brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2),
> +           brw_imm_d(offset));
> +}
> +
> +
> +/**
>   * Emit a vertex using the URB_WRITE message.  Use the contents of
>   * c->reg.header for the message header, and the registers starting at \c vert
>   * for the vertex data.
> @@ -269,3 +300,66 @@ void brw_gs_lines( struct brw_gs_compile *c )
>            | URB_WRITE_M02_PRIM_END));
>     brw_gs_emit_vue(c, c->reg.vertex[1], 1);
>  }
> +
> +/**
> + * Generate the geometry shader program used on Gen6 to perform stream output
> + * (transform feedback).
> + */
> +void
> +gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key,
> +	         unsigned num_verts, bool check_edge_flags)
> +{
> +   struct brw_compile *p = &c->func;
> +
> +   brw_gs_alloc_regs(c, num_verts);
> +   brw_gs_initialize_header(c);
> +
> +   brw_gs_ff_sync(c, 1);
> +
> +   brw_gs_overwrite_header_dw2_from_r0(c);
> +   switch (num_verts) {
> +   case 1:
> +      brw_gs_offset_header_dw2(c, (URB_WRITE_M02_PRIM_START
> +                                   | URB_WRITE_M02_PRIM_END));
> +      brw_gs_emit_vue(c, c->reg.vertex[0], true);
> +      break;
> +   case 2:
> +      brw_gs_offset_header_dw2(c, URB_WRITE_M02_PRIM_START);
> +      brw_gs_emit_vue(c, c->reg.vertex[0], false);
> +      brw_gs_offset_header_dw2(c, (URB_WRITE_M02_PRIM_END
> +                                   - URB_WRITE_M02_PRIM_START));
> +      brw_gs_emit_vue(c, c->reg.vertex[1], true);
> +      break;
> +   case 3:
> +      if (check_edge_flags) {
> +         /* Only emit vertices 0 and 1 if this is the first triangle of the
> +          * polygon.  Otherwise they are redundant.
> +          */
> +         brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
> +         brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
> +                 get_element_ud(c->reg.R0, 2),
> +                 brw_imm_ud(BRW_GS_EDGE_INDICATOR_0));
> +         brw_IF(p, BRW_EXECUTE_1);
> +      }
> +      brw_gs_offset_header_dw2(c, URB_WRITE_M02_PRIM_START);
> +      brw_gs_emit_vue(c, c->reg.vertex[0], false);
> +      brw_gs_offset_header_dw2(c, -URB_WRITE_M02_PRIM_START);
> +      brw_gs_emit_vue(c, c->reg.vertex[1], false);
> +      if (check_edge_flags) {
> +         brw_ENDIF(p);
> +         /* Only emit vertex 2 in PRIM_END mode if this is the last triangle
> +          * of the polygon.  Otherwise leave the primitive incomplete because
> +          * there are more polygon vertices coming.
> +          */
> +         brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
> +         brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
> +                 get_element_ud(c->reg.R0, 2),
> +                 brw_imm_ud(BRW_GS_EDGE_INDICATOR_1));
> +         brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
> +      }
> +      brw_gs_offset_header_dw2(c, URB_WRITE_M02_PRIM_END);
> +      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
> +      brw_gs_emit_vue(c, c->reg.vertex[2], true);
> +      break;
> +   }
> +}
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
> index d29f029..b041140 100644
> --- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
> @@ -44,22 +44,36 @@ upload_gs_state(struct brw_context *brw)
>     OUT_BATCH(0);
>     ADVANCE_BATCH();
>  
> -   // GS should never be used on Gen6.  Disable it.
> -   assert(!brw->gs.prog_active);
> -   BEGIN_BATCH(7);
> -   OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
> -   OUT_BATCH(0); /* prog_bo */
> -   OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) |
> -	     (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
> -   OUT_BATCH(0); /* scratch space base offset */
> -   OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
> -	     (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) |
> -	     (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT));
> -   OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) |
> -	     GEN6_GS_STATISTICS_ENABLE |
> -	     GEN6_GS_RENDERING_ENABLE);
> -   OUT_BATCH(0);
> -   ADVANCE_BATCH();
> +   if (brw->gs.prog_active) {
> +      BEGIN_BATCH(7);
> +      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
> +      OUT_BATCH(brw->gs.prog_offset);
> +      OUT_BATCH(GEN6_GS_SPF_MODE | GEN6_GS_VECTOR_MASK_ENABLE);
> +      OUT_BATCH(0); /* no scratch space */
> +      OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
> +	        (brw->gs.prog_data->urb_read_length << GEN6_GS_URB_READ_LENGTH_SHIFT));
> +      OUT_BATCH(((brw->max_gs_threads - 1) << GEN6_GS_MAX_THREADS_SHIFT) |
> +	        GEN6_GS_STATISTICS_ENABLE |
> +		GEN6_GS_SO_STATISTICS_ENABLE |
> +		0); //GEN6_GS_RENDERING_ENABLE);

I'm rather surprised this works.  I thought you needed the
GEN6_GS_RENDERING_ENABLE bit set in order to draw anything at all.

The commented-out code looks like it came from a half-baked patch of
mine, so I'm guessing it's unintentional.  Still, do you have any idea
why rendering works without that bit set?

> +      OUT_BATCH(GEN6_GS_ENABLE);
> +      ADVANCE_BATCH();
> +   } else {
> +      BEGIN_BATCH(7);
> +      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
> +      OUT_BATCH(0); /* prog_bo */
> +      OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) |
> +		(0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
> +      OUT_BATCH(0); /* scratch space base offset */
> +      OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
> +		(0 << GEN6_GS_URB_READ_LENGTH_SHIFT) |
> +		(0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT));
> +      OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) |
> +		GEN6_GS_STATISTICS_ENABLE |
> +		GEN6_GS_RENDERING_ENABLE);
> +      OUT_BATCH(0);
> +      ADVANCE_BATCH();
> +   }
>  }
>  
>  const struct brw_tracked_state gen6_gs_state = {


More information about the mesa-dev mailing list