[Mesa-dev] [PATCH 2/3] i965: Implement the PMA stall fix.

Kenneth Graunke kenneth at whitecape.org
Wed Oct 22 08:58:58 PDT 2014


Certain non-promoted depth cases typically incur stalls.  In very
specific cases, we can enable a workaround which improves performance.

Improves performance in GLBenchmark 2.7 TRex by 1.17762% +/- 0.448765%
(n=75) at 1280x720 on Broadwell GT3.

Haswell has this feature as well, but we can't currently write registers
from userspace batches (and we'd incur additional software batch
scanning overhead as well), so we haven't enabled it.  Broadwell allows
us to write CACHE_MODE_1.  Backporters beware: the formula and flushing
incantation differs between Haswell and Broadwell.

Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_context.h      |   1 +
 src/mesa/drivers/dri/i965/brw_state.h        |   1 +
 src/mesa/drivers/dri/i965/brw_state_upload.c |   6 +
 src/mesa/drivers/dri/i965/gen8_depth_state.c | 170 +++++++++++++++++++++++++++
 4 files changed, 178 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 45d72d2..7877aa1 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1079,6 +1079,7 @@ struct brw_context
    GLuint NewGLState;
    struct {
       struct brw_state_flags dirty;
+      uint32_t pma_stall_bits;
    } state;
 
    struct brw_cache cache;
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 2efe56e..209fab1 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -137,6 +137,7 @@ extern const struct brw_tracked_state gen8_disable_stages;
 extern const struct brw_tracked_state gen8_gs_state;
 extern const struct brw_tracked_state gen8_index_buffer;
 extern const struct brw_tracked_state gen8_multisample_state;
+extern const struct brw_tracked_state gen8_pma_fix;
 extern const struct brw_tracked_state gen8_ps_blend;
 extern const struct brw_tracked_state gen8_ps_extra;
 extern const struct brw_tracked_state gen8_ps_state;
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index a691319..efa870c 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -333,6 +333,7 @@ static const struct brw_tracked_state *gen8_atoms[] =
    &gen8_vertices,
 
    &haswell_cut_index,
+   &gen8_pma_fix,
 };
 
 static void
@@ -390,6 +391,11 @@ void brw_init_state( struct brw_context *brw )
    brw->state.dirty.mesa = ~0;
    brw->state.dirty.brw = ~0ull;
 
+   /* ~0 is a nonsensical value which won't match anything we program, so
+    * the programming will take effect on the first time around.
+    */
+   brw->state.pma_stall_bits = ~0;
+
    /* Make sure that brw->state.dirty.brw has enough bits to hold all possible
     * dirty flags.
     */
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index 7c3bfe0..4284a62 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -28,6 +28,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_wm.h"
 
 /**
  * Helper function to emit depth related command packets.
@@ -210,6 +211,172 @@ gen8_emit_depth_stencil_hiz(struct brw_context *brw,
 }
 
 /**
+ * Should we set the PMA FIX ENABLE bit?
+ *
+ * To avoid unnecessary depth related stalls, we need to set this bit.
+ * However, there is a very complicated formula which governs when it
+ * is legal to do so.  This function computes that.
+ *
+ * See the documenation for the CACHE_MODE_1 register, bit 11.
+ */
+static bool
+pma_fix_enable(const struct brw_context *brw)
+{
+   const struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   const struct gl_fragment_program *fp = brw->fragment_program;
+   /* _NEW_BUFFERS */
+   struct intel_renderbuffer *depth_irb =
+      intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
+
+   /* 3DSTATE_WM::ForceThreadDispatch is never used. */
+   const bool wm_force_thread_dispatch = false;
+
+   /* 3DSTATE_RASTER::ForceSampleCount is never used. */
+   const bool raster_force_sample_count_nonzero = false;
+
+   /* _NEW_BUFFERS:
+    * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
+    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
+    */
+   const bool hiz_enabled = depth_irb && intel_renderbuffer_has_hiz(depth_irb);
+
+   /* 3DSTATE_WM::Early Depth/Stencil Control != EDSC_PREPS (2).
+    * We always leave this set to EDSC_NORMAL (0).
+    */
+   const bool edsc_not_preps = true;
+
+   /* 3DSTATE_PS_EXTRA::PixelShaderValid is always true. */
+   const bool pixel_shader_valid = true;
+
+   /* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+    *   3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+    *   3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+    *   3DSTATE_WM_HZ_OP::StencilBufferClear)
+    *
+    * HiZ operations are done outside of the normal state upload, so they're
+    * definitely not happening now.
+    */
+   const bool in_hiz_op = false;
+
+   /* _NEW_DEPTH:
+    * DEPTH_STENCIL_STATE::DepthTestEnable
+    */
+   const bool depth_test_enabled = depth_irb && ctx->Depth.Test;
+
+   /* _NEW_DEPTH:
+    * 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
+    * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE.
+    */
+   const bool depth_writes_enabled = ctx->Depth.Mask;
+
+   /* _NEW_STENCIL:
+    * !DEPTH_STENCIL_STATE::Stencil Buffer Write Enable ||
+    * !3DSTATE_DEPTH_BUFFER::Stencil Buffer Enable ||
+    * !3DSTATE_STENCIL_BUFFER::Stencil Buffer Enable
+    */
+   const bool stencil_writes_enabled = ctx->Stencil._WriteEnabled;
+
+   /* BRW_NEW_FRAGMENT_PROGRAM:
+    * 3DSTATE_PS_EXTRA::Pixel Shader Computed Depth Mode == PSCDEPTH_OFF
+    */
+   const bool ps_computes_depth =
+      (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) &&
+      fp->FragDepthLayout != FRAG_DEPTH_LAYOUT_UNCHANGED;
+
+   /* BRW_NEW_FRAGMENT_PROGRAM: 3DSTATE_PS_EXTRA::PixelShaderKillsPixels
+    * CACHE_NEW_WM_PROG:        3DSTATE_PS_EXTRA::oMask Present to RenderTarget
+    * _NEW_MULTISAMPLE:         3DSTATE_PS_BLEND::AlphaToCoverageEnable
+    * _NEW_COLOR:               3DSTATE_PS_BLEND::AlphaTestEnable
+    *
+    * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false.
+    * 3DSTATE_WM::ForceKillPix != ForceOff is always true.
+    */
+   const bool kill_pixel =
+      fp->UsesKill ||
+      brw->wm.prog_data->uses_omask ||
+      (ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage) ||
+      ctx->Color.AlphaEnabled;
+
+   /* The big formula in CACHE_MODE_1::NP PMA FIX ENABLE. */
+   return !wm_force_thread_dispatch &&
+          !raster_force_sample_count_nonzero &&
+          hiz_enabled &&
+          edsc_not_preps &&
+          pixel_shader_valid &&
+          !in_hiz_op &&
+          depth_test_enabled &&
+          (ps_computes_depth ||
+           (kill_pixel && (depth_writes_enabled || stencil_writes_enabled)));
+}
+
+static void
+write_pma_stall_bits(struct brw_context *brw, uint32_t pma_stall_bits)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   /* If we haven't actually changed the value, bail now to avoid unnecessary
+    * pipeline stalls and register writes.
+    */
+   if (brw->state.pma_stall_bits == pma_stall_bits)
+      return;
+
+   brw->state.pma_stall_bits = pma_stall_bits;
+
+   /* According to the PIPE_CONTROL documentation, software should emit a
+    * PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set prior
+    * to the LRI.  If stencil buffer writes are enabled, then a Render Cache
+    * Flush is also necessary.
+    */
+   const uint32_t render_cache_flush =
+      ctx->Stencil._WriteEnabled ? PIPE_CONTROL_WRITE_FLUSH : 0;
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_CS_STALL |
+                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                               render_cache_flush);
+
+   /* CACHE_MODE_1 is a non-privileged register. */
+   BEGIN_BATCH(3);
+   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
+   OUT_BATCH(GEN7_CACHE_MODE_1);
+   OUT_BATCH(GEN8_HIZ_PMA_MASK_BITS | pma_stall_bits);
+   ADVANCE_BATCH();
+
+   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
+    * Flush bits is often necessary.  We do it regardless because it's easier.
+    * The render cache flush is also necessary if stencil writes are enabled.
+    */
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_DEPTH_STALL |
+                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                               render_cache_flush);
+
+}
+
+static void
+gen8_emit_pma_stall_workaround(struct brw_context *brw)
+{
+   uint32_t bits = 0;
+   if (pma_fix_enable(brw))
+      bits |= GEN8_HIZ_NP_PMA_FIX_ENABLE | GEN8_HIZ_NP_EARLY_Z_FAILS_DISABLE;
+
+   write_pma_stall_bits(brw, bits);
+}
+
+const struct brw_tracked_state gen8_pma_fix = {
+   .dirty = {
+      .mesa = _NEW_BUFFERS |
+              _NEW_COLOR |
+              _NEW_DEPTH |
+              _NEW_MULTISAMPLE |
+              _NEW_STENCIL,
+      .brw = BRW_NEW_FRAGMENT_PROGRAM,
+      .cache = CACHE_NEW_WM_PROG,
+   },
+   .emit = gen8_emit_pma_stall_workaround
+};
+
+/**
  * Emit packets to perform a depth/HiZ resolve or fast depth/stencil clear.
  *
  * See the "Optimized Depth Buffer Clear and/or Stencil Buffer Clear" section
@@ -222,6 +389,9 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
    if (op == GEN6_HIZ_OP_NONE)
       return;
 
+   /* Disable the PMA stall fix since we're about to do a HiZ operation. */
+   write_pma_stall_bits(brw, 0);
+
    assert(mt->first_level == 0);
    assert(mt->logical_depth0 >= 1);
 
-- 
2.1.2



More information about the mesa-dev mailing list