[Mesa-dev] [PATCH v2 3/5] anv: Add support for the PMA fix on Broadwell
Nanley Chery
nanleychery at gmail.com
Thu Feb 9 01:34:14 UTC 2017
On Thu, Feb 02, 2017 at 01:26:05PM -0800, Jason Ekstrand wrote:
> In order to get good performance numbers for this, I had to hack up the
> driver to whack wm_prog_data::uses_kill to true to emulate a discard and
> used the Sascha "shadowmapping" demo. Setting uses_kill to true dropped
> the framerate on the demo by 25-30%. Enabling the PMA fix brought it
> back up to around 90% of the original framerate. This doesn't seem to
> really impact Dota 2; probably because it doesn't use 16-bit depth.
>
> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> ---
> src/intel/vulkan/TODO | 1 -
> src/intel/vulkan/anv_cmd_buffer.c | 2 +
> src/intel/vulkan/anv_genX.h | 3 +
> src/intel/vulkan/anv_private.h | 17 +++++
> src/intel/vulkan/gen7_cmd_buffer.c | 7 ++
> src/intel/vulkan/gen8_cmd_buffer.c | 133 +++++++++++++++++++++++++++++++++++++
> src/intel/vulkan/genX_blorp_exec.c | 5 ++
> src/intel/vulkan/genX_cmd_buffer.c | 15 ++++-
> src/intel/vulkan/genX_pipeline.c | 38 +++++++++++
> 9 files changed, 219 insertions(+), 2 deletions(-)
>
> diff --git a/src/intel/vulkan/TODO b/src/intel/vulkan/TODO
> index 38acc0d..f8b73a1 100644
> --- a/src/intel/vulkan/TODO
> +++ b/src/intel/vulkan/TODO
> @@ -12,5 +12,4 @@ Performance:
> - Compressed multisample support
> - Pushing pieces of UBOs?
> - Enable guardband clipping
> - - pma stall workaround
> - Use soft-pin to avoid relocations
> diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
> index 5886fa6..8c08f8d 100644
> --- a/src/intel/vulkan/anv_cmd_buffer.c
> +++ b/src/intel/vulkan/anv_cmd_buffer.c
> @@ -135,6 +135,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
> state->restart_index = UINT32_MAX;
> state->dynamic = default_dynamic_state;
> state->need_query_wa = true;
> + state->pma_fix_enabled = false;
> + state->hiz_enabled = false;
>
> if (state->attachments != NULL) {
> vk_free(&cmd_buffer->pool->alloc, state->attachments);
> diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
> index d04fe38..67147b0 100644
> --- a/src/intel/vulkan/anv_genX.h
> +++ b/src/intel/vulkan/anv_genX.h
> @@ -55,6 +55,9 @@ void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
>
> void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer);
>
> +void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
> + bool enable);
> +
> void
> genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
> const struct gen_l3_config *l3_config,
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> index 4fe3ebc..6efe4ea 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1163,6 +1163,20 @@ struct anv_cmd_state {
> bool need_query_wa;
>
> /**
> + * Whether or not the gen8 PMA fix is enabled. We ensure that, at the top
> + * of any command buffer it disabled by disabling it in EndCommandBuffer
^
is?
> + * and before invoking the secondary in ExecuteCommands.
> + */
> + bool pma_fix_enabled;
> +
> + /**
> + * Whether or not we now for certain that HiZ is enabled for the current
^
know
> + * subpass. If, for whatever reason, we are unsure as to whether HiZ is
> + * enabled or not, this will be false.
> + */
> + bool hiz_enabled;
> +
> + /**
> * Array length is anv_cmd_state::pass::attachment_count. Array content is
> * valid only when recording a render pass instance.
> */
> @@ -1465,8 +1479,11 @@ struct anv_pipeline {
>
> uint32_t cs_right_mask;
>
> + bool writes_depth;
> + bool depth_test_enable;
> bool writes_stencil;
> bool depth_clamp_enable;
> + bool kill_pixel;
>
> struct {
> uint32_t sf[7];
> diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c
> index 013ed87..c1a25e8 100644
> --- a/src/intel/vulkan/gen7_cmd_buffer.c
> +++ b/src/intel/vulkan/gen7_cmd_buffer.c
> @@ -260,6 +260,13 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
> cmd_buffer->state.dirty = 0;
> }
>
> +void
> +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
> + bool enable)
> +{
> + /* The NP PMA fix doesn't exist on gen7 */
> +}
> +
> void genX(CmdSetEvent)(
> VkCommandBuffer commandBuffer,
> VkEvent event,
> diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c
> index 8c8de62..271ab3f 100644
> --- a/src/intel/vulkan/gen8_cmd_buffer.c
> +++ b/src/intel/vulkan/gen8_cmd_buffer.c
> @@ -155,6 +155,135 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
> #endif
>
> void
> +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
> +{
> +#if GEN_GEN == 8
> + if (cmd_buffer->state.pma_fix_enabled == enable)
> + return;
> +
> + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> + pc.DepthCacheFlushEnable = true;
> + pc.CommandStreamerStallEnable = true;
> + pc.RenderTargetCacheFlushEnable = true;
Instead of flushing the depth and RT caches, what do you think about
implementing a stall that writes 0 to the workaround bo? That should
consume less bandwidth.
To make the pipeline idle, flushing the caches shouldn't be necessary,
we should be able to do it with a valid pipecontrol packet that has the
CS stall bit set.
The BDW PRM states:
If the stall bit is set, the command streamer waits until the pipe is
completely flushed.
> + }
> +
> + uint32_t cache_mode;
> + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
> + .NPPMAFixEnable = enable,
> + .NPEarlyZFailsDisableMask = enable,
^
I don't think you intended to set the mask here. It should be:
.NPEarlyZFailsDisable = enable,
Does fixing this impact your FPS measurements significantly?
> + .NPPMAFixEnableMask = true,
> + .NPEarlyZFailsDisableMask = true);
> + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
> + lri.RegisterOffset = GENX(CACHE_MODE_1_num);
> + lri.DataDWord = cache_mode;
> + }
> +
> + /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
> + * Flush bits is often necessary. We do it regardless because it's easier.
> + * The render cache flush is also necessary if stencil writes are enabled.
> + */
I couldn't find a requirement to flush these specific caches in the
docs, but I did find:
To ensure this command gets executed before upcoming commands in the
ring, either a stalling pipecontrol should be sent after this
command, or MMIO 0x20C0 bit 7 should be set to 1.
Perhaps we could use the flush I suggested earlier?
> + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> + pc.DepthStallEnable = true;
> + pc.DepthCacheFlushEnable = true;
> + pc.RenderTargetCacheFlushEnable = true;
> + }
> +
> + cmd_buffer->state.pma_fix_enabled = enable;
> +#endif /* GEN_GEN == 8 */
> +}
> +
> +static inline bool
> +want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)
> +{
> + assert(GEN_GEN == 8);
> +
> + /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE:
> + *
> + * SW must set this bit in order to enable this fix when following
> + * expression is TRUE.
> + *
> + * 3DSTATE_WM::ForceThreadDispatch != 1 &&
> + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
> + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
> + * (3DSTATE_DEPTH_BUFFER::HIZ Enable) &&
> + * !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) &&
> + * (3DSTATE_PS_EXTRA::PixelShaderValid) &&
> + * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> + * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
> + * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
> + * (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) &&
> + * (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> + * 3DSTATE_PS_BLEND::AlphaTestEnable ||
> + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
> + * 3DSTATE_WM::ForceKillPix != ForceOff &&
> + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
> + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
> + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
> + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
> + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
> + *
> + * This function only takes care of the pipeline parts of the equation.
Which parts are the pipeline parts? It looks like you take care of the
entire equation in this function.
> + */
> +
> + /* These are always true:
> + * 3DSTATE_WM::ForceThreadDispatch != 1 &&
> + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
> + */
I couldn't quite understand what ForcedSampleCount does from looking at
the HW docs. If you know, could you tell me what this does and why it
wouldn't change under any circumstance in the future? Otherwise, I
think we should put an assertion where we assign ForcedSampleCount and
reference this PMA optimization.
> +
> + /* We only enable the PMA fix if we know for certain that HiZ is enabled.
> + * If we don't know whether HiZ is enabled or not, we disable the PMA fix
> + * and there is no harm.
> + *
> + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
> + * 3DSTATE_DEPTH_BUFFER::HIZ Enable
> + */
> + if (!cmd_buffer->state.hiz_enabled)
> + return false;
> +
> + /* 3DSTATE_PS_EXTRA::PixelShaderValid */
> + struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
> + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
> + return false;
> +
> + /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */
> + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
> + if (wm_prog_data->early_fragment_tests)
> + return false;
In keeping with your top-to-bottom evaluation of the logical statement,
could you move this part to be above the PixelShaderValid part?
> +
> + /* We never use anv_pipeline for HiZ ops so this is trivially true:
> + * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
^
misaligned asterisk
> + * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
> + * 3DSTATE_WM_HZ_OP::StencilBufferClear)
> + */
> +
> + /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */
> + if (!pipeline->depth_test_enable)
> + return false;
> +
> + /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> + * 3DSTATE_PS_BLEND::AlphaTestEnable ||
> + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
> + * 3DSTATE_WM::ForceKillPix != ForceOff &&
> + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
> + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
> + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
> + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
> + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
> + */
> + return (pipeline->kill_pixel && (pipeline->writes_depth ||
> + pipeline->writes_stencil)) ||
anv_pipeline::writes_stencil !=
(3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE);
The last two lines are not accounted for - we don't detect if the
currently bound depth_stencil attachment lacks or possesses a stencil
buffer.
This is a valid scenario according to section 25.9. Stencil Test of the
Vulkan spec, which states:
If there is no stencil framebuffer attachment, stencil modification
cannot occur, and it is as if the stencil tests always pass.
anv_pipeline::writes_depth should be correct because hiz_enabled is true
by the time we reach it. For hiz_enabled to be true, there must be a
depth buffer present.
> + wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
> +}
> +
> +void
> genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
> {
> struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
> @@ -211,6 +340,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
> }
>
> if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
> + ANV_CMD_DIRTY_RENDER_TARGETS |
Why is this necessary?
> ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
> ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK)) {
> uint32_t wm_depth_stencil_dw[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
> @@ -234,6 +364,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
>
> anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
> pipeline->gen8.wm_depth_stencil);
> +
> + genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
> + want_depth_pma_fix(cmd_buffer));
> }
> #else
> if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
> diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
> index 663e6c9..6f0b063 100644
> --- a/src/intel/vulkan/genX_blorp_exec.c
> +++ b/src/intel/vulkan/genX_blorp_exec.c
> @@ -154,6 +154,11 @@ genX(blorp_exec)(struct blorp_batch *batch,
>
> genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
>
> + /* BLORP doesn't do anything fancy with depth such as discards, so we want
> + * the PMA fix off. Also, off is always the safe option.
> + */
> + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
> +
> blorp_exec(batch, params);
>
> cmd_buffer->state.vb_dirty = ~0;
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
> index b6b7f74..66de93a 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -627,6 +627,11 @@ genX(EndCommandBuffer)(
> {
> ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
>
> + /* We want every command buffer to start with the PMA fix in a known state,
> + * so we disable it at the end of the command buffer.
> + */
> + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
> +
> genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
>
> anv_cmd_buffer_end_batch_buffer(cmd_buffer);
> @@ -644,6 +649,11 @@ genX(CmdExecuteCommands)(
>
> assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
>
> + /* The secondary command buffers will assume that the PMA fix is disabled
> + * when they begin executing. Make sure this is true.
> + */
> + genX(cmd_buffer_enable_pma_fix)(primary, false);
> +
> for (uint32_t i = 0; i < commandBufferCount; i++) {
> ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
>
> @@ -2181,7 +2191,8 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
> const bool has_stencil =
> image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
>
> - /* FIXME: Implement the PMA stall W/A */
> + cmd_buffer->state.hiz_enabled = has_hiz;
> +
> /* FIXME: Width and Height are wrong */
>
> genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
> @@ -2419,6 +2430,8 @@ void genX(CmdEndRenderPass)(
>
> anv_cmd_buffer_resolve_subpass(cmd_buffer);
>
> + cmd_buffer->state.hiz_enabled = false;
> +
> #ifndef NDEBUG
> anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
> #endif
> diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
> index 18fe48c..0588d01 100644
> --- a/src/intel/vulkan/genX_pipeline.c
> +++ b/src/intel/vulkan/genX_pipeline.c
> @@ -653,6 +653,8 @@ emit_ds_state(struct anv_pipeline *pipeline,
> * to make sure it's initialized to something useful.
> */
> pipeline->writes_stencil = false;
> + pipeline->writes_depth = false;
> + pipeline->depth_test_enable = false;
> memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
> return;
> }
> @@ -711,6 +713,9 @@ emit_ds_state(struct anv_pipeline *pipeline,
> if (info->depthTestEnable && info->depthCompareOp == VK_COMPARE_OP_EQUAL)
> depth_stencil.DepthBufferWriteEnable = false;
>
> + pipeline->writes_depth = depth_stencil.DepthBufferWriteEnable;
> + pipeline->depth_test_enable = depth_stencil.DepthTestEnable;
> +
> #if GEN_GEN <= 7
> GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
> #else
> @@ -1429,6 +1434,38 @@ emit_3dstate_vf_topology(struct anv_pipeline *pipeline)
> }
> #endif
>
> +static void
> +compute_kill_pixel(struct anv_pipeline *pipeline,
> + const VkPipelineMultisampleStateCreateInfo *ms_info,
> + const struct anv_subpass *subpass)
> +{
> + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
> + pipeline->kill_pixel = false;
> + return;
> + }
> +
> + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
> +
> + /* This computes the KillPixel portion of the computation for whether or
> + * not we want to enable the PMA fix on gen8. It's given by this chunk of
> + * the giant formula:
> + *
> + * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> + * 3DSTATE_PS_BLEND::AlphaTestEnable ||
> + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
> + *
> + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
> + * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
> + * of an alpha test.
> + */
> + pipeline->kill_pixel =
> + subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
Why do we set kill_pixel in the presence of has_ds_self_dep?
Section 7.1 of the Vulkan spec states:
If a subpass uses the same attachment as both an input attachment and
either a color attachment or a depth/stencil attachment, writes via
the color or depth/stencil attachment are not automatically made
visible to reads via the input attachment, causing a feedback loop,
except in any of the following conditions:
[...]
I don't see how any of the conditions are satisfied through the use
of kill_pixel.
> + wm_prog_data->uses_omask ||
> + (ms_info && ms_info->alphaToCoverageEnable);
> +}
> +
> static VkResult
> genX(graphics_pipeline_create)(
> VkDevice _device,
> @@ -1466,6 +1503,7 @@ genX(graphics_pipeline_create)(
> emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass);
> emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
> pCreateInfo->pMultisampleState);
> + compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState, subpass);
>
> emit_urb_setup(pipeline);
>
> --
> 2.5.0.400.gff86faf
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list