[Mesa-dev] [PATCH 2/3] i965: Implement the PMA stall fix.

Kristian Høgsberg krh at bitplanet.net
Mon Nov 3 22:58:42 PST 2014


On Wed, Oct 22, 2014 at 8:58 AM, Kenneth Graunke <kenneth at whitecape.org> wrote:
> Certain non-promoted depth cases typically incur stalls.  In very
> specific cases, we can enable a workaround which improves performance.
>
> Improves performance in GLBenchmark 2.7 TRex by 1.17762% +/- 0.448765%
> (n=75) at 1280x720 on Broadwell GT3.
>
> Haswell has this feature as well, but we can't currently write registers
> from userspace batches (and we'd incur additional software batch
> scanning overhead as well), so we haven't enabled it.  Broadwell allows
> us to write CACHE_MODE_1.  Backporters beware: the formula and flushing
> incantation differs between Haswell and Broadwell.
>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/brw_context.h      |   1 +
>  src/mesa/drivers/dri/i965/brw_state.h        |   1 +
>  src/mesa/drivers/dri/i965/brw_state_upload.c |   6 +
>  src/mesa/drivers/dri/i965/gen8_depth_state.c | 170 +++++++++++++++++++++++++++
>  4 files changed, 178 insertions(+)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 45d72d2..7877aa1 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1079,6 +1079,7 @@ struct brw_context
>     GLuint NewGLState;
>     struct {
>        struct brw_state_flags dirty;
> +      uint32_t pma_stall_bits;

I don't think I'd put it here, this looks like it's part of the atom
tracking... maybe in depthstencil or just not in a sub-struct?
Anyway, nothing to hold up the patches for.

>     } state;
>
>     struct brw_cache cache;
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
> index 2efe56e..209fab1 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -137,6 +137,7 @@ extern const struct brw_tracked_state gen8_disable_stages;
>  extern const struct brw_tracked_state gen8_gs_state;
>  extern const struct brw_tracked_state gen8_index_buffer;
>  extern const struct brw_tracked_state gen8_multisample_state;
> +extern const struct brw_tracked_state gen8_pma_fix;
>  extern const struct brw_tracked_state gen8_ps_blend;
>  extern const struct brw_tracked_state gen8_ps_extra;
>  extern const struct brw_tracked_state gen8_ps_state;
> diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
> index a691319..efa870c 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
> @@ -333,6 +333,7 @@ static const struct brw_tracked_state *gen8_atoms[] =
>     &gen8_vertices,
>
>     &haswell_cut_index,
> +   &gen8_pma_fix,
>  };
>
>  static void
> @@ -390,6 +391,11 @@ void brw_init_state( struct brw_context *brw )
>     brw->state.dirty.mesa = ~0;
>     brw->state.dirty.brw = ~0ull;
>
> +   /* ~0 is a nonsensical value which won't match anything we program, so
> +    * the programming will take effect on the first time around.
> +    */
> +   brw->state.pma_stall_bits = ~0;
> +
>     /* Make sure that brw->state.dirty.brw has enough bits to hold all possible
>      * dirty flags.
>      */
> diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
> index 7c3bfe0..4284a62 100644
> --- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
> @@ -28,6 +28,7 @@
>  #include "brw_context.h"
>  #include "brw_state.h"
>  #include "brw_defines.h"
> +#include "brw_wm.h"
>
>  /**
>   * Helper function to emit depth related command packets.
> @@ -210,6 +211,172 @@ gen8_emit_depth_stencil_hiz(struct brw_context *brw,
>  }
>
>  /**
> + * Should we set the PMA FIX ENABLE bit?
> + *
> + * To avoid unnecessary depth related stalls, we need to set this bit.
> + * However, there is a very complicated formula which governs when it
> + * is legal to do so.  This function computes that.
> + *
> + * See the documenation for the CACHE_MODE_1 register, bit 11.
> + */
> +static bool
> +pma_fix_enable(const struct brw_context *brw)
> +{
> +   const struct gl_context *ctx = &brw->ctx;
> +   /* BRW_NEW_FRAGMENT_PROGRAM */
> +   const struct gl_fragment_program *fp = brw->fragment_program;
> +   /* _NEW_BUFFERS */
> +   struct intel_renderbuffer *depth_irb =
> +      intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
> +
> +   /* 3DSTATE_WM::ForceThreadDispatch is never used. */
> +   const bool wm_force_thread_dispatch = false;
> +
> +   /* 3DSTATE_RASTER::ForceSampleCount is never used. */
> +   const bool raster_force_sample_count_nonzero = false;
> +
> +   /* _NEW_BUFFERS:
> +    * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
> +    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
> +    */
> +   const bool hiz_enabled = depth_irb && intel_renderbuffer_has_hiz(depth_irb);
> +
> +   /* 3DSTATE_WM::Early Depth/Stencil Control != EDSC_PREPS (2).
> +    * We always leave this set to EDSC_NORMAL (0).
> +    */
> +   const bool edsc_not_preps = true;
> +
> +   /* 3DSTATE_PS_EXTRA::PixelShaderValid is always true. */
> +   const bool pixel_shader_valid = true;
> +
> +   /* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> +    *   3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> +    *   3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
> +    *   3DSTATE_WM_HZ_OP::StencilBufferClear)
> +    *
> +    * HiZ operations are done outside of the normal state upload, so they're
> +    * definitely not happening now.
> +    */
> +   const bool in_hiz_op = false;
> +
> +   /* _NEW_DEPTH:
> +    * DEPTH_STENCIL_STATE::DepthTestEnable
> +    */
> +   const bool depth_test_enabled = depth_irb && ctx->Depth.Test;
> +
> +   /* _NEW_DEPTH:
> +    * 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
> +    * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE.
> +    */
> +   const bool depth_writes_enabled = ctx->Depth.Mask;
> +
> +   /* _NEW_STENCIL:
> +    * !DEPTH_STENCIL_STATE::Stencil Buffer Write Enable ||
> +    * !3DSTATE_DEPTH_BUFFER::Stencil Buffer Enable ||
> +    * !3DSTATE_STENCIL_BUFFER::Stencil Buffer Enable
> +    */
> +   const bool stencil_writes_enabled = ctx->Stencil._WriteEnabled;
> +
> +   /* BRW_NEW_FRAGMENT_PROGRAM:
> +    * 3DSTATE_PS_EXTRA::Pixel Shader Computed Depth Mode == PSCDEPTH_OFF
> +    */
> +   const bool ps_computes_depth =
> +      (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) &&
> +      fp->FragDepthLayout != FRAG_DEPTH_LAYOUT_UNCHANGED;
> +
> +   /* BRW_NEW_FRAGMENT_PROGRAM: 3DSTATE_PS_EXTRA::PixelShaderKillsPixels
> +    * CACHE_NEW_WM_PROG:        3DSTATE_PS_EXTRA::oMask Present to RenderTarget
> +    * _NEW_MULTISAMPLE:         3DSTATE_PS_BLEND::AlphaToCoverageEnable
> +    * _NEW_COLOR:               3DSTATE_PS_BLEND::AlphaTestEnable
> +    *
> +    * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false.
> +    * 3DSTATE_WM::ForceKillPix != ForceOff is always true.
> +    */
> +   const bool kill_pixel =
> +      fp->UsesKill ||
> +      brw->wm.prog_data->uses_omask ||
> +      (ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage) ||
> +      ctx->Color.AlphaEnabled;
> +
> +   /* The big formula in CACHE_MODE_1::NP PMA FIX ENABLE. */
> +   return !wm_force_thread_dispatch &&
> +          !raster_force_sample_count_nonzero &&
> +          hiz_enabled &&
> +          edsc_not_preps &&
> +          pixel_shader_valid &&
> +          !in_hiz_op &&
> +          depth_test_enabled &&
> +          (ps_computes_depth ||
> +           (kill_pixel && (depth_writes_enabled || stencil_writes_enabled)));
> +}

That matches my understanding of the condition.

> +static void
> +write_pma_stall_bits(struct brw_context *brw, uint32_t pma_stall_bits)
> +{
> +   struct gl_context *ctx = &brw->ctx;
> +
> +   /* If we haven't actually changed the value, bail now to avoid unnecessary
> +    * pipeline stalls and register writes.
> +    */
> +   if (brw->state.pma_stall_bits == pma_stall_bits)
> +      return;
> +
> +   brw->state.pma_stall_bits = pma_stall_bits;
> +
> +   /* According to the PIPE_CONTROL documentation, software should emit a
> +    * PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set prior
> +    * to the LRI.  If stencil buffer writes are enabled, then a Render Cache
> +    * Flush is also necessary.
> +    */
> +   const uint32_t render_cache_flush =
> +      ctx->Stencil._WriteEnabled ? PIPE_CONTROL_WRITE_FLUSH : 0;
> +   brw_emit_pipe_control_flush(brw,
> +                               PIPE_CONTROL_CS_STALL |
> +                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +                               render_cache_flush);
> +
> +   /* CACHE_MODE_1 is a non-privileged register. */
> +   BEGIN_BATCH(3);
> +   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
> +   OUT_BATCH(GEN7_CACHE_MODE_1);
> +   OUT_BATCH(GEN8_HIZ_PMA_MASK_BITS | pma_stall_bits);
> +   ADVANCE_BATCH();
> +
> +   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
> +    * Flush bits is often necessary.  We do it regardless because it's easier.
> +    * The render cache flush is also necessary if stencil writes are enabled.
> +    */
> +   brw_emit_pipe_control_flush(brw,
> +                               PIPE_CONTROL_DEPTH_STALL |
> +                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +                               render_cache_flush);
> +
> +}
> +
> +static void
> +gen8_emit_pma_stall_workaround(struct brw_context *brw)
> +{
> +   uint32_t bits = 0;
> +   if (pma_fix_enable(brw))
> +      bits |= GEN8_HIZ_NP_PMA_FIX_ENABLE | GEN8_HIZ_NP_EARLY_Z_FAILS_DISABLE;
> +
> +   write_pma_stall_bits(brw, bits);
> +}
> +
> +const struct brw_tracked_state gen8_pma_fix = {
> +   .dirty = {
> +      .mesa = _NEW_BUFFERS |
> +              _NEW_COLOR |
> +              _NEW_DEPTH |
> +              _NEW_MULTISAMPLE |
> +              _NEW_STENCIL,
> +      .brw = BRW_NEW_FRAGMENT_PROGRAM,
> +      .cache = CACHE_NEW_WM_PROG,
> +   },
> +   .emit = gen8_emit_pma_stall_workaround
> +};
> +
> +/**
>   * Emit packets to perform a depth/HiZ resolve or fast depth/stencil clear.
>   *
>   * See the "Optimized Depth Buffer Clear and/or Stencil Buffer Clear" section
> @@ -222,6 +389,9 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
>     if (op == GEN6_HIZ_OP_NONE)
>        return;
>
> +   /* Disable the PMA stall fix since we're about to do a HiZ operation. */
> +   write_pma_stall_bits(brw, 0);
> +
>     assert(mt->first_level == 0);
>     assert(mt->logical_depth0 >= 1);

It's a nasty workaround but this patch makes it easier to follow and
validate against the spec.  Nicely done.

Reviewed-by: Kristian Høgsberg <krh at bitplanet.net>

> --
> 2.1.2
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list