[Mesa-dev] [PATCH 2/3] i965: Implement the PMA stall fix.
Kristian Høgsberg
krh at bitplanet.net
Mon Nov 3 22:58:42 PST 2014
On Wed, Oct 22, 2014 at 8:58 AM, Kenneth Graunke <kenneth at whitecape.org> wrote:
> Certain non-promoted depth cases typically incur stalls. In very
> specific cases, we can enable a workaround which improves performance.
>
> Improves performance in GLBenchmark 2.7 TRex by 1.17762% +/- 0.448765%
> (n=75) at 1280x720 on Broadwell GT3.
>
> Haswell has this feature as well, but we can't currently write registers
> from userspace batches (and we'd incur additional software batch
> scanning overhead as well), so we haven't enabled it. Broadwell allows
> us to write CACHE_MODE_1. Backporters beware: the formula and flushing
> incantation differs between Haswell and Broadwell.
>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
> src/mesa/drivers/dri/i965/brw_context.h | 1 +
> src/mesa/drivers/dri/i965/brw_state.h | 1 +
> src/mesa/drivers/dri/i965/brw_state_upload.c | 6 +
> src/mesa/drivers/dri/i965/gen8_depth_state.c | 170 +++++++++++++++++++++++++++
> 4 files changed, 178 insertions(+)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 45d72d2..7877aa1 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1079,6 +1079,7 @@ struct brw_context
> GLuint NewGLState;
> struct {
> struct brw_state_flags dirty;
> + uint32_t pma_stall_bits;
I don't think I'd put it here, this looks like it's part of the atom
tracking... maybe in depthstencil or just not in a sub-struct?
Anyway, nothing to hold up the patches for.
> } state;
>
> struct brw_cache cache;
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
> index 2efe56e..209fab1 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -137,6 +137,7 @@ extern const struct brw_tracked_state gen8_disable_stages;
> extern const struct brw_tracked_state gen8_gs_state;
> extern const struct brw_tracked_state gen8_index_buffer;
> extern const struct brw_tracked_state gen8_multisample_state;
> +extern const struct brw_tracked_state gen8_pma_fix;
> extern const struct brw_tracked_state gen8_ps_blend;
> extern const struct brw_tracked_state gen8_ps_extra;
> extern const struct brw_tracked_state gen8_ps_state;
> diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
> index a691319..efa870c 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
> @@ -333,6 +333,7 @@ static const struct brw_tracked_state *gen8_atoms[] =
> &gen8_vertices,
>
> &haswell_cut_index,
> + &gen8_pma_fix,
> };
>
> static void
> @@ -390,6 +391,11 @@ void brw_init_state( struct brw_context *brw )
> brw->state.dirty.mesa = ~0;
> brw->state.dirty.brw = ~0ull;
>
> + /* ~0 is a nonsensical value which won't match anything we program, so
> + * the programming will take effect on the first time around.
> + */
> + brw->state.pma_stall_bits = ~0;
> +
> /* Make sure that brw->state.dirty.brw has enough bits to hold all possible
> * dirty flags.
> */
> diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
> index 7c3bfe0..4284a62 100644
> --- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
> @@ -28,6 +28,7 @@
> #include "brw_context.h"
> #include "brw_state.h"
> #include "brw_defines.h"
> +#include "brw_wm.h"
>
> /**
> * Helper function to emit depth related command packets.
> @@ -210,6 +211,172 @@ gen8_emit_depth_stencil_hiz(struct brw_context *brw,
> }
>
> /**
> + * Should we set the PMA FIX ENABLE bit?
> + *
> + * To avoid unnecessary depth related stalls, we need to set this bit.
> + * However, there is a very complicated formula which governs when it
> + * is legal to do so. This function computes that.
> + *
> + * See the documenation for the CACHE_MODE_1 register, bit 11.
> + */
> +static bool
> +pma_fix_enable(const struct brw_context *brw)
> +{
> + const struct gl_context *ctx = &brw->ctx;
> + /* BRW_NEW_FRAGMENT_PROGRAM */
> + const struct gl_fragment_program *fp = brw->fragment_program;
> + /* _NEW_BUFFERS */
> + struct intel_renderbuffer *depth_irb =
> + intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
> +
> + /* 3DSTATE_WM::ForceThreadDispatch is never used. */
> + const bool wm_force_thread_dispatch = false;
> +
> + /* 3DSTATE_RASTER::ForceSampleCount is never used. */
> + const bool raster_force_sample_count_nonzero = false;
> +
> + /* _NEW_BUFFERS:
> + * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
> + * 3DSTATE_DEPTH_BUFFER::HIZ Enable
> + */
> + const bool hiz_enabled = depth_irb && intel_renderbuffer_has_hiz(depth_irb);
> +
> + /* 3DSTATE_WM::Early Depth/Stencil Control != EDSC_PREPS (2).
> + * We always leave this set to EDSC_NORMAL (0).
> + */
> + const bool edsc_not_preps = true;
> +
> + /* 3DSTATE_PS_EXTRA::PixelShaderValid is always true. */
> + const bool pixel_shader_valid = true;
> +
> + /* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> + * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
> + * 3DSTATE_WM_HZ_OP::StencilBufferClear)
> + *
> + * HiZ operations are done outside of the normal state upload, so they're
> + * definitely not happening now.
> + */
> + const bool in_hiz_op = false;
> +
> + /* _NEW_DEPTH:
> + * DEPTH_STENCIL_STATE::DepthTestEnable
> + */
> + const bool depth_test_enabled = depth_irb && ctx->Depth.Test;
> +
> + /* _NEW_DEPTH:
> + * 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
> + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE.
> + */
> + const bool depth_writes_enabled = ctx->Depth.Mask;
> +
> + /* _NEW_STENCIL:
> + * !DEPTH_STENCIL_STATE::Stencil Buffer Write Enable ||
> + * !3DSTATE_DEPTH_BUFFER::Stencil Buffer Enable ||
> + * !3DSTATE_STENCIL_BUFFER::Stencil Buffer Enable
> + */
> + const bool stencil_writes_enabled = ctx->Stencil._WriteEnabled;
> +
> + /* BRW_NEW_FRAGMENT_PROGRAM:
> + * 3DSTATE_PS_EXTRA::Pixel Shader Computed Depth Mode == PSCDEPTH_OFF
> + */
> + const bool ps_computes_depth =
> + (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) &&
> + fp->FragDepthLayout != FRAG_DEPTH_LAYOUT_UNCHANGED;
> +
> + /* BRW_NEW_FRAGMENT_PROGRAM: 3DSTATE_PS_EXTRA::PixelShaderKillsPixels
> + * CACHE_NEW_WM_PROG: 3DSTATE_PS_EXTRA::oMask Present to RenderTarget
> + * _NEW_MULTISAMPLE: 3DSTATE_PS_BLEND::AlphaToCoverageEnable
> + * _NEW_COLOR: 3DSTATE_PS_BLEND::AlphaTestEnable
> + *
> + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false.
> + * 3DSTATE_WM::ForceKillPix != ForceOff is always true.
> + */
> + const bool kill_pixel =
> + fp->UsesKill ||
> + brw->wm.prog_data->uses_omask ||
> + (ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage) ||
> + ctx->Color.AlphaEnabled;
> +
> + /* The big formula in CACHE_MODE_1::NP PMA FIX ENABLE. */
> + return !wm_force_thread_dispatch &&
> + !raster_force_sample_count_nonzero &&
> + hiz_enabled &&
> + edsc_not_preps &&
> + pixel_shader_valid &&
> + !in_hiz_op &&
> + depth_test_enabled &&
> + (ps_computes_depth ||
> + (kill_pixel && (depth_writes_enabled || stencil_writes_enabled)));
> +}
That matches my understanding of the condition.
> +static void
> +write_pma_stall_bits(struct brw_context *brw, uint32_t pma_stall_bits)
> +{
> + struct gl_context *ctx = &brw->ctx;
> +
> + /* If we haven't actually changed the value, bail now to avoid unnecessary
> + * pipeline stalls and register writes.
> + */
> + if (brw->state.pma_stall_bits == pma_stall_bits)
> + return;
> +
> + brw->state.pma_stall_bits = pma_stall_bits;
> +
> + /* According to the PIPE_CONTROL documentation, software should emit a
> + * PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set prior
> + * to the LRI. If stencil buffer writes are enabled, then a Render Cache
> + * Flush is also necessary.
> + */
> + const uint32_t render_cache_flush =
> + ctx->Stencil._WriteEnabled ? PIPE_CONTROL_WRITE_FLUSH : 0;
> + brw_emit_pipe_control_flush(brw,
> + PIPE_CONTROL_CS_STALL |
> + PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> + render_cache_flush);
> +
> + /* CACHE_MODE_1 is a non-privileged register. */
> + BEGIN_BATCH(3);
> + OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
> + OUT_BATCH(GEN7_CACHE_MODE_1);
> + OUT_BATCH(GEN8_HIZ_PMA_MASK_BITS | pma_stall_bits);
> + ADVANCE_BATCH();
> +
> + /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
> + * Flush bits is often necessary. We do it regardless because it's easier.
> + * The render cache flush is also necessary if stencil writes are enabled.
> + */
> + brw_emit_pipe_control_flush(brw,
> + PIPE_CONTROL_DEPTH_STALL |
> + PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> + render_cache_flush);
> +
> +}
> +
> +static void
> +gen8_emit_pma_stall_workaround(struct brw_context *brw)
> +{
> + uint32_t bits = 0;
> + if (pma_fix_enable(brw))
> + bits |= GEN8_HIZ_NP_PMA_FIX_ENABLE | GEN8_HIZ_NP_EARLY_Z_FAILS_DISABLE;
> +
> + write_pma_stall_bits(brw, bits);
> +}
> +
> +const struct brw_tracked_state gen8_pma_fix = {
> + .dirty = {
> + .mesa = _NEW_BUFFERS |
> + _NEW_COLOR |
> + _NEW_DEPTH |
> + _NEW_MULTISAMPLE |
> + _NEW_STENCIL,
> + .brw = BRW_NEW_FRAGMENT_PROGRAM,
> + .cache = CACHE_NEW_WM_PROG,
> + },
> + .emit = gen8_emit_pma_stall_workaround
> +};
> +
> +/**
> * Emit packets to perform a depth/HiZ resolve or fast depth/stencil clear.
> *
> * See the "Optimized Depth Buffer Clear and/or Stencil Buffer Clear" section
> @@ -222,6 +389,9 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
> if (op == GEN6_HIZ_OP_NONE)
> return;
>
> + /* Disable the PMA stall fix since we're about to do a HiZ operation. */
> + write_pma_stall_bits(brw, 0);
> +
> assert(mt->first_level == 0);
> assert(mt->logical_depth0 >= 1);
It's a nasty workaround but this patch makes it easier to follow and
validate against the spec. Nicely done.
Reviewed-by: Kristian Høgsberg <krh at bitplanet.net>
> --
> 2.1.2
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list