[Mesa-dev] [PATCH 3/5] anv: Add support for the PMA fix on Broadwell
Lionel Landwerlin
lionel.g.landwerlin at intel.com
Thu Feb 2 10:43:49 UTC 2017
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
On 02/02/17 01:43, Jason Ekstrand wrote:
> In order to get good performance numbers for this, I had to hack up the
> driver to whack wm_prog_data::uses_kill to true to emulate a discard and
> used the Sascha "shadowmapping" demo. Setting uses_kill to true dropped
> the framerate on the demo by 25-30%. Enabling the PMA fix brought it
> back up to around 90% of the original framerate. This doesn't seem to
> really impact Dota 2; probably because it doesn't use 16-bit depth.
> ---
> src/intel/vulkan/TODO | 1 -
> src/intel/vulkan/anv_cmd_buffer.c | 1 +
> src/intel/vulkan/anv_genX.h | 3 +
> src/intel/vulkan/anv_private.h | 10 +++
> src/intel/vulkan/gen7_cmd_buffer.c | 7 ++
> src/intel/vulkan/gen8_cmd_buffer.c | 139 +++++++++++++++++++++++++++++++++++++
> src/intel/vulkan/genX_blorp_exec.c | 5 ++
> src/intel/vulkan/genX_cmd_buffer.c | 10 +++
> src/intel/vulkan/genX_pipeline.c | 36 ++++++++++
> 9 files changed, 211 insertions(+), 1 deletion(-)
>
> diff --git a/src/intel/vulkan/TODO b/src/intel/vulkan/TODO
> index 38acc0d..f8b73a1 100644
> --- a/src/intel/vulkan/TODO
> +++ b/src/intel/vulkan/TODO
> @@ -12,5 +12,4 @@ Performance:
> - Compressed multisample support
> - Pushing pieces of UBOs?
> - Enable guardband clipping
> - - pma stall workaround
> - Use soft-pin to avoid relocations
> diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
> index 5886fa6..a762476 100644
> --- a/src/intel/vulkan/anv_cmd_buffer.c
> +++ b/src/intel/vulkan/anv_cmd_buffer.c
> @@ -135,6 +135,7 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
> state->restart_index = UINT32_MAX;
> state->dynamic = default_dynamic_state;
> state->need_query_wa = true;
> + state->pma_fix_enabled = false;
>
> if (state->attachments != NULL) {
> vk_free(&cmd_buffer->pool->alloc, state->attachments);
> diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
> index d04fe38..67147b0 100644
> --- a/src/intel/vulkan/anv_genX.h
> +++ b/src/intel/vulkan/anv_genX.h
> @@ -55,6 +55,9 @@ void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
>
> void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer);
>
> +void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
> + bool enable);
> +
> void
> genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
> const struct gen_l3_config *l3_config,
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> index 4fe3ebc..5fe4dd8 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1163,6 +1163,13 @@ struct anv_cmd_state {
> bool need_query_wa;
>
> /**
> + * Whether or not the gen8 PMA fix is enabled. We ensure that, at the top
> + * of any command buffer it disabled by disabling it in EndCommandBuffer
> + * and before invoking the secondary in ExecuteCommands.
> + */
> + bool pma_fix_enabled;
> +
> + /**
> * Array length is anv_cmd_state::pass::attachment_count. Array content is
> * valid only when recording a render pass instance.
> */
> @@ -1465,8 +1472,11 @@ struct anv_pipeline {
>
> uint32_t cs_right_mask;
>
> + bool writes_depth;
> + bool depth_test_enable;
> bool writes_stencil;
> bool depth_clamp_enable;
> + bool kill_pixel;
>
> struct {
> uint32_t sf[7];
> diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c
> index 013ed87..c1a25e8 100644
> --- a/src/intel/vulkan/gen7_cmd_buffer.c
> +++ b/src/intel/vulkan/gen7_cmd_buffer.c
> @@ -260,6 +260,13 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
> cmd_buffer->state.dirty = 0;
> }
>
> +void
> +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
> + bool enable)
> +{
> + /* The NP PMA fix doesn't exist on gen7 */
> +}
> +
> void genX(CmdSetEvent)(
> VkCommandBuffer commandBuffer,
> VkEvent event,
> diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c
> index 8c8de62..b877e27 100644
> --- a/src/intel/vulkan/gen8_cmd_buffer.c
> +++ b/src/intel/vulkan/gen8_cmd_buffer.c
> @@ -155,6 +155,141 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
> #endif
>
> void
> +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
> +{
> +#if GEN_GEN == 8
> + if (cmd_buffer->state.pma_fix_enabled == enable)
> + return;
> +
> + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> + pc.DepthCacheFlushEnable = true;
> + pc.CommandStreamerStallEnable = true;
> + pc.RenderTargetCacheFlushEnable = true;
> + }
> +
> + uint32_t cache_mode;
> + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
> + .NPPMAFixEnable = enable,
> + .NPEarlyZFailsDisableMask = enable,
> + .NPPMAFixEnableMask = true,
> + .NPEarlyZFailsDisableMask = true);
> + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
> + lri.RegisterOffset = GENX(CACHE_MODE_1_num);
> + lri.DataDWord = cache_mode;
> + }
> +
> + /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
> + * Flush bits is often necessary. We do it regardless because it's easier.
> + * The render cache flush is also necessary if stencil writes are enabled.
> + */
> + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> + pc.DepthStallEnable = true;
> + pc.DepthCacheFlushEnable = true;
> + pc.RenderTargetCacheFlushEnable = true;
> + }
> +
> + cmd_buffer->state.pma_fix_enabled = enable;
> +#endif /* GEN_GEN == 8 */
> +}
> +
> +static inline bool
> +want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)
> +{
> + assert(GEN_GEN == 8);
> +
> + /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE:
> + *
> + * SW must set this bit in order to enable this fix when following
> + * expression is TRUE.
> + *
> + * 3DSTATE_WM::ForceThreadDispatch != 1 &&
> + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
> + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
> + * (3DSTATE_DEPTH_BUFFER::HIZ Enable) &&
> + * !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) &&
> + * (3DSTATE_PS_EXTRA::PixelShaderValid) &&
> + * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> + * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
> + * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
> + * (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) &&
> + * (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> + * 3DSTATE_PS_BLEND::AlphaTestEnable ||
> + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
> + * 3DSTATE_WM::ForceKillPix != ForceOff &&
> + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
> + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
> + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
> + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
> + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
> + *
> + * This function only takes care of the pipeline parts of the equation.
> + */
> +
> + /* These are always true:
> + * 3DSTATE_WM::ForceThreadDispatch != 1 &&
> + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
> + */
> +
> + /* We only enable the PMA fix if HiZ is enabled. However, if HiZ is
> + * *not* enabled and we don't set this bit there is no harm. Therefore,
> + * we can just treat the NULL framebuffer case as has_hiz == false and
> + * everything will work just fine.
> + */
> + if (cmd_buffer->state.framebuffer == NULL)
> + return false;
> +
> + /* (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
> + * 3DSTATE_DEPTH_BUFFER::HIZ Enable
> + */
> + const struct anv_image_view *ds_iview =
> + anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
> + if (!ds_iview || ds_iview->image->aux_usage != ISL_AUX_USAGE_HIZ)
> + return false;
> +
> + /* 3DSTATE_PS_EXTRA::PixelShaderValid */
> + struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
> + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
> + return false;
> +
> + /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */
> + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
> + if (wm_prog_data->early_fragment_tests)
> + return false;
> +
> + /* We never use anv_pipeline for HiZ ops so this is trivially true:
> + * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> + * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
> + * 3DSTATE_WM_HZ_OP::StencilBufferClear)
> + */
> +
> + /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */
> + if (!pipeline->depth_test_enable)
> + return false;
> +
> + /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> + * 3DSTATE_PS_BLEND::AlphaTestEnable ||
> + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
> + * 3DSTATE_WM::ForceKillPix != ForceOff &&
> + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
> + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
> + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
> + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
> + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
> + */
> + return (pipeline->kill_pixel && (pipeline->writes_depth ||
> + pipeline->writes_stencil)) ||
> + wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
> +}
> +
> +void
> genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
> {
> struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
> @@ -211,6 +346,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
> }
>
> if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
> + ANV_CMD_DIRTY_RENDER_TARGETS |
> ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
> ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK)) {
> uint32_t wm_depth_stencil_dw[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
> @@ -234,6 +370,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
>
> anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
> pipeline->gen8.wm_depth_stencil);
> +
> + genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
> + want_depth_pma_fix(cmd_buffer));
> }
> #else
> if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
> diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
> index 663e6c9..6f0b063 100644
> --- a/src/intel/vulkan/genX_blorp_exec.c
> +++ b/src/intel/vulkan/genX_blorp_exec.c
> @@ -154,6 +154,11 @@ genX(blorp_exec)(struct blorp_batch *batch,
>
> genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
>
> + /* BLORP doesn't do anything fancy with depth such as discards, so we want
> + * the PMA fix off. Also, off is always the safe option.
> + */
> + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
> +
> blorp_exec(batch, params);
>
> cmd_buffer->state.vb_dirty = ~0;
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
> index b6b7f74..9b54e80 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -627,6 +627,11 @@ genX(EndCommandBuffer)(
> {
> ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
>
> + /* We want every command buffer to start with the PMA fix in a known state,
> + * so we disable it at the end of the command buffer.
> + */
> + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
> +
> genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
>
> anv_cmd_buffer_end_batch_buffer(cmd_buffer);
> @@ -644,6 +649,11 @@ genX(CmdExecuteCommands)(
>
> assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
>
> + /* The secondary command buffers will assume that the PMA fix is disabled
> + * when they begin executing. Make sure this is true.
> + */
> + genX(cmd_buffer_enable_pma_fix)(primary, false);
> +
> for (uint32_t i = 0; i < commandBufferCount; i++) {
> ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
>
> diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
> index 32be151..a7b294f 100644
> --- a/src/intel/vulkan/genX_pipeline.c
> +++ b/src/intel/vulkan/genX_pipeline.c
> @@ -710,6 +710,9 @@ emit_ds_state(struct anv_pipeline *pipeline,
> if (info->depthTestEnable && info->depthCompareOp == VK_COMPARE_OP_EQUAL)
> depth_stencil.DepthBufferWriteEnable = false;
>
> + pipeline->writes_depth = depth_stencil.DepthBufferWriteEnable;
> + pipeline->depth_test_enable = depth_stencil.DepthTestEnable;
> +
> #if GEN_GEN <= 7
> GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
> #else
> @@ -1428,6 +1431,38 @@ emit_3dstate_vf_topology(struct anv_pipeline *pipeline)
> }
> #endif
>
> +static void
> +compute_kill_pixel(struct anv_pipeline *pipeline,
> + const VkPipelineMultisampleStateCreateInfo *ms_info,
> + const struct anv_subpass *subpass)
> +{
> + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
> + pipeline->kill_pixel = false;
> + return;
> + }
> +
> + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
> +
> + /* This computes the KillPixel portion of the computation for whether or
> + * not we want to enable the PMA fix on gen8. It's given by this chunk of
> + * the giant formula:
> + *
> + * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> + * 3DSTATE_PS_BLEND::AlphaTestEnable ||
> + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
> + *
> + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
> + * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
> + * of an alpha test.
> + */
> + pipeline->kill_pixel =
> + subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
> + wm_prog_data->uses_omask ||
> + (ms_info && ms_info->alphaToCoverageEnable);
> +}
> +
> static VkResult
> genX(graphics_pipeline_create)(
> VkDevice _device,
> @@ -1465,6 +1500,7 @@ genX(graphics_pipeline_create)(
> emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass);
> emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
> pCreateInfo->pMultisampleState);
> + compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState, subpass);
>
> emit_urb_setup(pipeline);
>
More information about the mesa-dev
mailing list