[Mesa-dev] [PATCH 10/10] i965 gen6: Implement pass-through GS for transform feedback.
Kenneth Graunke
kenneth at whitecape.org
Tue Dec 6 23:54:30 PST 2011
On 12/05/2011 09:40 AM, Paul Berry wrote:
> In Gen6, transform feedback is accomplished by having the geometry
> shader send vertex data to the data port using "Streamed Vertex Buffer
> Write" messages, while simultaneously passing vertices through to the
> rest of the graphics pipeline (if rendering is enabled).
>
> This patch adds a geometry shader program that simply passes vertices
> through to the rest of the graphics pipeline. The rest of transform
> feedback functionality will be added in future patches.
>
> To make the new geometry shader easier to test, I've added an
> environment variable "INTEL_FORCE_GS". If this environment variable
> is enabled, then the pass-through geometry shader will always be used,
> regardless of whether transform feedback is in effect.
>
> On my Sandy Bridge laptop, I'm able to enable INTEL_FORCE_GS with no
> Piglit regressions.
Looks good, Paul. One minor nit in the middle, and then a question at
the very end...
> ---
> src/mesa/drivers/dri/i965/brw_defines.h | 3 +
> src/mesa/drivers/dri/i965/brw_eu.h | 5 ++
> src/mesa/drivers/dri/i965/brw_gs.c | 105 ++++++++++++++++++++--------
> src/mesa/drivers/dri/i965/brw_gs.h | 2 +
> src/mesa/drivers/dri/i965/brw_gs_emit.c | 94 ++++++++++++++++++++++++++
> src/mesa/drivers/dri/i965/gen6_gs_state.c | 46 ++++++++-----
> 6 files changed, 209 insertions(+), 46 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 2990c90..ee7ec87 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -1075,6 +1075,9 @@ enum brw_message_target {
> # define GEN6_GS_SVBI_POSTINCREMENT_VALUE_MASK INTEL_MASK(25, 16)
> # define GEN6_GS_ENABLE (1 << 15)
>
> +# define BRW_GS_EDGE_INDICATOR_0 (1 << 8)
> +# define BRW_GS_EDGE_INDICATOR_1 (1 << 9)
> +
> #define _3DSTATE_HS 0x781B /* GEN7+ */
> #define _3DSTATE_TE 0x781C /* GEN7+ */
> #define _3DSTATE_DS 0x781D /* GEN7+ */
> diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
> index dcb1fc9..596be02 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu.h
> +++ b/src/mesa/drivers/dri/i965/brw_eu.h
> @@ -650,6 +650,11 @@ static INLINE struct brw_reg get_element_ud( struct brw_reg reg, GLuint elt )
> return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_UD), elt));
> }
>
> +static INLINE struct brw_reg get_element_d( struct brw_reg reg, GLuint elt )
> +{
> + return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_D), elt));
> +}
> +
>
> static INLINE struct brw_reg brw_swizzle( struct brw_reg reg,
> GLuint x,
> diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
> index 804ea84..2e729aa 100644
> --- a/src/mesa/drivers/dri/i965/brw_gs.c
> +++ b/src/mesa/drivers/dri/i965/brw_gs.c
> @@ -53,12 +53,6 @@ static void compile_gs_prog( struct brw_context *brw,
> void *mem_ctx;
> GLuint program_size;
>
> - /* Gen6: VF has already converted into polygon, and LINELOOP is
> - * converted to LINESTRIP at the beginning of the 3D pipeline.
> - */
> - if (intel->gen >= 6)
> - return;
> -
> memset(&c, 0, sizeof(c));
>
> c.key = *key;
> @@ -76,24 +70,60 @@ static void compile_gs_prog( struct brw_context *brw,
> */
> brw_set_mask_control(&c.func, BRW_MASK_DISABLE);
>
> -
> - /* Note that primitives which don't require a GS program have
> - * already been weeded out by this stage:
> - */
> -
> - switch (key->primitive) {
> - case _3DPRIM_QUADLIST:
> - brw_gs_quads( &c, key );
> - break;
> - case _3DPRIM_QUADSTRIP:
> - brw_gs_quad_strip( &c, key );
> - break;
> - case _3DPRIM_LINELOOP:
> - brw_gs_lines( &c );
> - break;
> - default:
> - ralloc_free(mem_ctx);
> - return;
> + if (intel->gen >= 6) {
> + unsigned num_verts;
> + bool check_edge_flag;
> + /* On Sandybridge, we use the GS for implementing transform feedback
> + * (called "Stream Out" in the PRM).
> + */
> + switch (key->primitive) {
> + case _3DPRIM_POINTLIST:
> + num_verts = 1;
> + check_edge_flag = false;
> + break;
> + case _3DPRIM_LINELIST:
> + case _3DPRIM_LINESTRIP:
> + case _3DPRIM_LINELOOP:
> + num_verts = 2;
> + check_edge_flag = false;
> + break;
> + case _3DPRIM_TRILIST:
> + case _3DPRIM_TRIFAN:
> + case _3DPRIM_TRISTRIP:
> + case _3DPRIM_RECTLIST:
> + num_verts = 3;
> + check_edge_flag = false;
> + break;
> + case _3DPRIM_QUADLIST:
> + case _3DPRIM_QUADSTRIP:
> + case _3DPRIM_POLYGON:
> + num_verts = 3;
> + check_edge_flag = true;
> + break;
> + default:
> + assert(!"Unexpected primitive type in Gen6 SOL program.");
> + return;
> + }
> + gen6_sol_program(&c, key, num_verts, check_edge_flag);
> + } else {
> + /* On Gen4-5, we use the GS to decompose certain types of primitives.
> + * Note that primitives which don't require a GS program have already
> + * been weeded out by now.
> + */
> + switch (key->primitive) {
> + case _3DPRIM_QUADLIST:
> + brw_gs_quads( &c, key );
> + break;
> + case _3DPRIM_QUADSTRIP:
> + brw_gs_quad_strip( &c, key );
> + break;
> + case _3DPRIM_LINELOOP:
> + brw_gs_lines( &c );
> + break;
> + default:
> + ralloc_free(mem_ctx);
> + return;
> + }
> }
>
> /* get the program
> @@ -147,11 +177,25 @@ static void populate_key( struct brw_context *brw,
> key->pv_first = true;
> }
>
> - key->need_gs_prog = (intel->gen >= 6)
> - ? 0
> - : (brw->primitive == _3DPRIM_QUADLIST ||
> - brw->primitive == _3DPRIM_QUADSTRIP ||
> - brw->primitive == _3DPRIM_LINELOOP);
> + if (intel->gen == 6) {
> + /* On Gen6, GS is used for transform feedback. */
> + key->need_gs_prog = ctx->TransformFeedback.CurrentObject->Active;
> + } else if (intel->gen >= 7) {
> + /* On Gen7 and later, we don't use GS (yet). */
> + key->need_gs_prog = false;
Could you please put these in generation order? 6, 7+, 4-5 is just asking
for OCD issues. :) I'd probably move the >= 7 check to the top.
> + } else {
> + /* Pre-gen6, GS is used to transform QUADLIST, QUADSTRIP, and LINELOOP
> + * into simpler primitives.
> + */
> + key->need_gs_prog = (brw->primitive == _3DPRIM_QUADLIST ||
> + brw->primitive == _3DPRIM_QUADSTRIP ||
> + brw->primitive == _3DPRIM_LINELOOP);
> + }
> + /* For testing, the environment variable INTEL_FORCE_GS can be used to
> + * force a GS program to be used, even if it's not necessary.
> + */
> + if (getenv("INTEL_FORCE_GS"))
> + key->need_gs_prog = true;
> }
>
> /* Calculate interpolants for triangle and line rasterization.
> @@ -182,7 +226,8 @@ brw_upload_gs_prog(struct brw_context *brw)
> const struct brw_tracked_state brw_gs_prog = {
> .dirty = {
> .mesa = (_NEW_LIGHT |
> - _NEW_TRANSFORM),
> + _NEW_TRANSFORM |
> + _NEW_TRANSFORM_FEEDBACK),
> .brw = BRW_NEW_PRIMITIVE,
> .cache = CACHE_NEW_VS_PROG
> },
> diff --git a/src/mesa/drivers/dri/i965/brw_gs.h b/src/mesa/drivers/dri/i965/brw_gs.h
> index d71609f..bade3f6 100644
> --- a/src/mesa/drivers/dri/i965/brw_gs.h
> +++ b/src/mesa/drivers/dri/i965/brw_gs.h
> @@ -68,5 +68,7 @@ struct brw_gs_compile {
> void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key );
> void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key );
> void brw_gs_lines( struct brw_gs_compile *c );
> +void gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key,
> + unsigned num_verts, bool check_edge_flag);
>
> #endif
> diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
> index 3d332c4..a6e9f50 100644
> --- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
> +++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
> @@ -101,6 +101,37 @@ static void brw_gs_overwrite_header_dw2(struct brw_gs_compile *c,
> }
>
> /**
> + * Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0.
> + *
> + * When the thread is spawned, GRF 0 contains the primitive type in bits 4:0
> + * of DWORD 2. URB_WRITE messages need the primitive type in bits 6:2 of
> + * DWORD 2. So this function extracts the primitive type field, bitshifts it
> + * appropriately, and stores it in c->reg.header.
> + */
> +static void brw_gs_overwrite_header_dw2_from_r0(struct brw_gs_compile *c)
> +{
> + struct brw_compile *p = &c->func;
> + brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2),
> + brw_imm_ud(0x1f));
> + brw_SHL(p, get_element_ud(c->reg.header, 2),
> + get_element_ud(c->reg.header, 2), brw_imm_ud(2));
> +}
> +
> +/**
> + * Apply an additive offset to DWORD 2 of c->reg.header.
> + *
> + * This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately
> + * for each vertex.
> + */
> +static void brw_gs_offset_header_dw2(struct brw_gs_compile *c, int offset)
> +{
> + struct brw_compile *p = &c->func;
> + brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2),
> + brw_imm_d(offset));
> +}
> +
> +
> +/**
> * Emit a vertex using the URB_WRITE message. Use the contents of
> * c->reg.header for the message header, and the registers starting at \c vert
> * for the vertex data.
> @@ -269,3 +300,66 @@ void brw_gs_lines( struct brw_gs_compile *c )
> | URB_WRITE_M02_PRIM_END));
> brw_gs_emit_vue(c, c->reg.vertex[1], 1);
> }
> +
> +/**
> + * Generate the geometry shader program used on Gen6 to perform stream output
> + * (transform feedback).
> + */
> +void
> +gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key,
> + unsigned num_verts, bool check_edge_flags)
> +{
> + struct brw_compile *p = &c->func;
> +
> + brw_gs_alloc_regs(c, num_verts);
> + brw_gs_initialize_header(c);
> +
> + brw_gs_ff_sync(c, 1);
> +
> + brw_gs_overwrite_header_dw2_from_r0(c);
> + switch (num_verts) {
> + case 1:
> + brw_gs_offset_header_dw2(c, (URB_WRITE_M02_PRIM_START
> + | URB_WRITE_M02_PRIM_END));
> + brw_gs_emit_vue(c, c->reg.vertex[0], true);
> + break;
> + case 2:
> + brw_gs_offset_header_dw2(c, URB_WRITE_M02_PRIM_START);
> + brw_gs_emit_vue(c, c->reg.vertex[0], false);
> + brw_gs_offset_header_dw2(c, (URB_WRITE_M02_PRIM_END
> + - URB_WRITE_M02_PRIM_START));
> + brw_gs_emit_vue(c, c->reg.vertex[1], true);
> + break;
> + case 3:
> + if (check_edge_flags) {
> + /* Only emit vertices 0 and 1 if this is the first triangle of the
> + * polygon. Otherwise they are redundant.
> + */
> + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
> + brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
> + get_element_ud(c->reg.R0, 2),
> + brw_imm_ud(BRW_GS_EDGE_INDICATOR_0));
> + brw_IF(p, BRW_EXECUTE_1);
> + }
> + brw_gs_offset_header_dw2(c, URB_WRITE_M02_PRIM_START);
> + brw_gs_emit_vue(c, c->reg.vertex[0], false);
> + brw_gs_offset_header_dw2(c, -URB_WRITE_M02_PRIM_START);
> + brw_gs_emit_vue(c, c->reg.vertex[1], false);
> + if (check_edge_flags) {
> + brw_ENDIF(p);
> + /* Only emit vertex 2 in PRIM_END mode if this is the last triangle
> + * of the polygon. Otherwise leave the primitive incomplete because
> + * there are more polygon vertices coming.
> + */
> + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
> + brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
> + get_element_ud(c->reg.R0, 2),
> + brw_imm_ud(BRW_GS_EDGE_INDICATOR_1));
> + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
> + }
> + brw_gs_offset_header_dw2(c, URB_WRITE_M02_PRIM_END);
> + brw_set_predicate_control(p, BRW_PREDICATE_NONE);
> + brw_gs_emit_vue(c, c->reg.vertex[2], true);
> + break;
> + }
> +}
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
> index d29f029..b041140 100644
> --- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
> @@ -44,22 +44,36 @@ upload_gs_state(struct brw_context *brw)
> OUT_BATCH(0);
> ADVANCE_BATCH();
>
> - // GS should never be used on Gen6. Disable it.
> - assert(!brw->gs.prog_active);
> - BEGIN_BATCH(7);
> - OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
> - OUT_BATCH(0); /* prog_bo */
> - OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) |
> - (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
> - OUT_BATCH(0); /* scratch space base offset */
> - OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
> - (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) |
> - (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT));
> - OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) |
> - GEN6_GS_STATISTICS_ENABLE |
> - GEN6_GS_RENDERING_ENABLE);
> - OUT_BATCH(0);
> - ADVANCE_BATCH();
> + if (brw->gs.prog_active) {
> + BEGIN_BATCH(7);
> + OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
> + OUT_BATCH(brw->gs.prog_offset);
> + OUT_BATCH(GEN6_GS_SPF_MODE | GEN6_GS_VECTOR_MASK_ENABLE);
> + OUT_BATCH(0); /* no scratch space */
> + OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
> + (brw->gs.prog_data->urb_read_length << GEN6_GS_URB_READ_LENGTH_SHIFT));
> + OUT_BATCH(((brw->max_gs_threads - 1) << GEN6_GS_MAX_THREADS_SHIFT) |
> + GEN6_GS_STATISTICS_ENABLE |
> + GEN6_GS_SO_STATISTICS_ENABLE |
> + 0); //GEN6_GS_RENDERING_ENABLE);
I'm rather surprised this works. I thought you needed the
GEN6_GS_RENDERING_ENABLE bit set in order to draw anything at all.
The commented-out code looks like it came from a half-baked patch of
mine, so I'm guessing leaving the bit unset is unintentional. Still, do
you have any idea why it would work?
> + OUT_BATCH(GEN6_GS_ENABLE);
> + ADVANCE_BATCH();
> + } else {
> + BEGIN_BATCH(7);
> + OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
> + OUT_BATCH(0); /* prog_bo */
> + OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) |
> + (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
> + OUT_BATCH(0); /* scratch space base offset */
> + OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
> + (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) |
> + (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT));
> + OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) |
> + GEN6_GS_STATISTICS_ENABLE |
> + GEN6_GS_RENDERING_ENABLE);
> + OUT_BATCH(0);
> + ADVANCE_BATCH();
> + }
> }
>
> const struct brw_tracked_state gen6_gs_state = {
More information about the mesa-dev
mailing list