[Mesa-dev] [PATCH] i965/gen7: Emit workaround flush when changing GS enable state.

Kenneth Graunke kenneth at whitecape.org
Thu Nov 14 13:54:13 PST 2013


On 11/04/2013 08:10 PM, Paul Berry wrote:
> ---
>  src/mesa/drivers/dri/i965/brw_context.h       | 44 +++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/gen7_blorp.cpp      |  8 +++++
>  src/mesa/drivers/dri/i965/gen7_gs_state.c     | 18 +++++++++++
>  src/mesa/drivers/dri/i965/gen7_urb.c          | 24 ++-------------
>  src/mesa/drivers/dri/i965/intel_batchbuffer.c | 38 +++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/intel_batchbuffer.h |  1 +
>  6 files changed, 111 insertions(+), 22 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index bec4d6b..9e96e1d 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -862,6 +862,44 @@ struct intel_sync_object {
>     drm_intel_bo *bo;
>  };

This seems like a lot of complexity for one workaround flush.

I think a simpler approach would be:

1. Create a brw_context::gs::enabled boolean.
   (It should be 'false' at context creation time.)
2. When enabling or disabling GS, check if brw->gs.enabled != the new
   enable value.  If so, flush.
3. Update brw->gs.enabled with the new value.

Your code here has the added advantage that it avoids doing the CS stall
if we've already done a CS stall for some other reason.  However, we
make no attempt to avoid duplicate stalls for any of other other
workaround flushes.  Perhaps we should someday, but I'd like to start
with the simple solution and justify any added complexity with
performance data.

My intuition is that the first CS stall will be a performance hit, but
subsequent CS stalls should be fairly cheap - you've likely only
processed a couple 3DSTATE packets since the first stall.

> +
> +/**
> + * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages > Geometry
> + * > Geometry Shader > State:
> + *
> + *     "Note: Because of corruption in IVB:GT2, software needs to flush the
> + *     whole fixed function pipeline when the GS enable changes value in the
> + *     3DSTATE_GS."
> + *
> + * The hardware architects have clarified that in this context "flush the
> + * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
> + * Stall" bit set.
> + *
> + * This enum keeps track of the information we need to determine when a flush
> + * needs to happen.
> + */
> +enum ivbgt2_gs_flush_workaround_state {
> +
> +   /**
> +    * The pipeline has been flushed since the last 3DSTATE_GS, so no further
> +    * flushing is needed right now.
> +    */
> +   IVBGT2_GS_FLUSH_UNNECESSARY,
> +
> +   /**
> +    * Since the last flush, primitives have been rendered with GS enabled.  So
> +    * a flush will be needed next time we want to disable the GS.
> +    */
> +   IVBGT2_GS_FLUSH_NEEDED_BEFORE_DISABLE,
> +
> +   /**
> +    * Since the last flush, primitives have been rendered with GS disabled.
> +    * So a flush will be needed next time we want to enable the GS.
> +    */
> +   IVBGT2_GS_FLUSH_NEEDED_BEFORE_ENABLE,
> +};
> +
> +
>  struct intel_batchbuffer {
>     /** Current batchbuffer being queued up. */
>     drm_intel_bo *bo;
> @@ -871,6 +909,12 @@ struct intel_batchbuffer {
>     drm_intel_bo *workaround_bo;
>     bool need_workaround_flush;
>  
> +   /**
> +    * For Ivy Bridge GT2, state machine to determine whether a flush is needed
> +    * before changing GS enable.  Undefined for all other platforms.
> +    */
> +   enum ivbgt2_gs_flush_workaround_state ivbgt2_gs_flush_needed;
> +
>     struct cached_batch_item *cached_items;
>  
>     uint16_t emit, total;
> diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> index 71f31b7..accb93e 100644
> --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> @@ -402,6 +402,14 @@ gen7_blorp_emit_gs_disable(struct brw_context *brw,
>     OUT_BATCH(0);
>     ADVANCE_BATCH();
>  
> +   if (!brw->is_haswell && brw->gt == 2) {
> +      if (brw->batch.ivbgt2_gs_flush_needed ==
> +          IVBGT2_GS_FLUSH_NEEDED_BEFORE_DISABLE) {
> +         gen7_emit_cs_stall_flush(brw);
> +      }
> +      brw->batch.ivbgt2_gs_flush_needed = IVBGT2_GS_FLUSH_NEEDED_BEFORE_ENABLE;
> +   }
> +
>     BEGIN_BATCH(7);
>     OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
>     OUT_BATCH(0);
> diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
> index 2602200..c2f036e 100644
> --- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
> @@ -81,6 +81,15 @@ upload_gs_state(struct brw_context *brw)
>     gen7_upload_constant_state(brw, stage_state, active, _3DSTATE_CONSTANT_GS);
>  
>     if (active) {
> +      if (!brw->is_haswell && brw->gt == 2) {
> +         if (brw->batch.ivbgt2_gs_flush_needed ==
> +             IVBGT2_GS_FLUSH_NEEDED_BEFORE_ENABLE) {
> +            gen7_emit_cs_stall_flush(brw);
> +         }
> +         brw->batch.ivbgt2_gs_flush_needed =
> +            IVBGT2_GS_FLUSH_NEEDED_BEFORE_DISABLE;
> +      }
> +
>        BEGIN_BATCH(7);
>        OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
>        OUT_BATCH(stage_state->prog_offset);
> @@ -159,6 +168,15 @@ upload_gs_state(struct brw_context *brw)
>        OUT_BATCH(dw6);
>        ADVANCE_BATCH();
>     } else {
> +      if (!brw->is_haswell && brw->gt == 2) {
> +         if (brw->batch.ivbgt2_gs_flush_needed ==
> +             IVBGT2_GS_FLUSH_NEEDED_BEFORE_DISABLE) {
> +            gen7_emit_cs_stall_flush(brw);
> +         }
> +         brw->batch.ivbgt2_gs_flush_needed =
> +            IVBGT2_GS_FLUSH_NEEDED_BEFORE_ENABLE;
> +      }
> +
>        BEGIN_BATCH(7);
>        OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
>        OUT_BATCH(0); /* prog_bo */
> diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
> index 6dcdfe4..c638586 100644
> --- a/src/mesa/drivers/dri/i965/gen7_urb.c
> +++ b/src/mesa/drivers/dri/i965/gen7_urb.c
> @@ -122,28 +122,8 @@ gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
>      *
>      * No such restriction exists for Haswell.
>      */
> -   if (!brw->is_haswell) {
> -      BEGIN_BATCH(4);
> -      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
> -      /* From p61 of the Ivy Bridge PRM (1.10.4 PIPE_CONTROL Command: DW1[20]
> -       * CS Stall):
> -       *
> -       *     One of the following must also be set:
> -       *     - Render Target Cache Flush Enable ([12] of DW1)
> -       *     - Depth Cache Flush Enable ([0] of DW1)
> -       *     - Stall at Pixel Scoreboard ([1] of DW1)
> -       *     - Depth Stall ([13] of DW1)
> -       *     - Post-Sync Operation ([13] of DW1)
> -       *
> -       * We choose to do a Post-Sync Operation (Write Immediate Data), since
> -       * it seems like it will incur the least additional performance penalty.
> -       */
> -      OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_WRITE_IMMEDIATE);
> -      OUT_RELOC(brw->batch.workaround_bo,
> -                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0);
> -      OUT_BATCH(0);
> -      ADVANCE_BATCH();
> -   }
> +   if (!brw->is_haswell)
> +      gen7_emit_cs_stall_flush(brw);
>  }
>  
>  const struct brw_tracked_state gen7_push_constant_space = {
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index 6d1ae79..66d0c96 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -194,6 +194,11 @@ brw_new_batch(struct brw_context *brw)
>      */
>     brw->batch.need_workaround_flush = true;
>  
> +   /* The kernel flushes the pipeline between batches, so no GS workaround
> +    * flush is necessary.
> +    */
> +   brw->batch.ivbgt2_gs_flush_needed = IVBGT2_GS_FLUSH_UNNECESSARY;
> +
>     brw->state_batch_count = 0;
>  
>     brw->ib.type = -1;
> @@ -510,6 +515,37 @@ gen7_emit_vs_workaround_flush(struct brw_context *brw)
>     ADVANCE_BATCH();
>  }
>  
> +
> +/**
> + * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
> + */
> +void
> +gen7_emit_cs_stall_flush(struct brw_context *brw)
> +{
> +   BEGIN_BATCH(4);
> +   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
> +   /* From p61 of the Ivy Bridge PRM (1.10.4 PIPE_CONTROL Command: DW1[20]
> +    * CS Stall):
> +    *
> +    *     One of the following must also be set:
> +    *     - Render Target Cache Flush Enable ([12] of DW1)
> +    *     - Depth Cache Flush Enable ([0] of DW1)
> +    *     - Stall at Pixel Scoreboard ([1] of DW1)
> +    *     - Depth Stall ([13] of DW1)
> +    *     - Post-Sync Operation ([13] of DW1)
> +    *
> +    * We choose to do a Post-Sync Operation (Write Immediate Data), since
> +    * it seems like it will incur the least additional performance penalty.
> +    */
> +   OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_WRITE_IMMEDIATE);
> +   OUT_RELOC(brw->batch.workaround_bo,
> +             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0);
> +   OUT_BATCH(0);
> +   ADVANCE_BATCH();
> +   brw->batch.ivbgt2_gs_flush_needed = IVBGT2_GS_FLUSH_UNNECESSARY;
> +}

If you instead used PIPE_CONTROL_STALL_AT_SCOREBOARD, then this would
work on Sandybridge as well.  That would let us reuse it someday.

But it's not that important, so I'm fine if you want to leave it as is.

> +
> +
>  /**
>   * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
>   * implementing two workarounds on gen6.  From section 1.4.7.1
> @@ -560,6 +596,7 @@ intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
>     OUT_BATCH(0); /* address */
>     OUT_BATCH(0); /* write data */
>     ADVANCE_BATCH();
> +   brw->batch.ivbgt2_gs_flush_needed = IVBGT2_GS_FLUSH_UNNECESSARY;
>  
>     BEGIN_BATCH(4);
>     OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
> @@ -612,6 +649,7 @@ intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
>  	 OUT_BATCH(0); /* write address */
>  	 OUT_BATCH(0); /* write data */
>  	 ADVANCE_BATCH();
> +         brw->batch.ivbgt2_gs_flush_needed = IVBGT2_GS_FLUSH_UNNECESSARY;
>        }
>     } else {
>        BEGIN_BATCH(4);
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> index d46f48e..cabbb69 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> @@ -59,6 +59,7 @@ void intel_batchbuffer_emit_mi_flush(struct brw_context *brw);
>  void intel_emit_post_sync_nonzero_flush(struct brw_context *brw);
>  void intel_emit_depth_stall_flushes(struct brw_context *brw);
>  void gen7_emit_vs_workaround_flush(struct brw_context *brw);
> +void gen7_emit_cs_stall_flush(struct brw_context *brw);
>  
>  static INLINE uint32_t float_as_int(float f)
>  {
> 



More information about the mesa-dev mailing list