[Mesa-dev] [PATCH v2 3/7] anv: Add support for the PMA fix on Broadwell

Nanley Chery nanleychery at gmail.com
Fri Feb 10 19:37:32 UTC 2017


On Fri, Feb 10, 2017 at 11:02:17AM -0800, Jason Ekstrand wrote:
> This helps Dota 2 on Broadwell by 8-9%.  I also hacked up the driver and
> used the Sascha "shadowmapping" demo to get some results.  Setting
> uses_kill to true dropped the framerate on the demo by 25-30%.  Enabling
> the PMA fix brought it back up to around 90% of the original framerate.
> 
> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> ---
>  src/intel/vulkan/TODO              |   1 -
>  src/intel/vulkan/anv_cmd_buffer.c  |   2 +
>  src/intel/vulkan/anv_genX.h        |   3 +
>  src/intel/vulkan/anv_private.h     |  17 +++++
>  src/intel/vulkan/gen7_cmd_buffer.c |   7 ++
>  src/intel/vulkan/gen8_cmd_buffer.c | 131 +++++++++++++++++++++++++++++++++++++
>  src/intel/vulkan/genX_blorp_exec.c |   5 ++
>  src/intel/vulkan/genX_cmd_buffer.c |  15 ++++-
>  src/intel/vulkan/genX_pipeline.c   |  42 ++++++++++++
>  9 files changed, 221 insertions(+), 2 deletions(-)

This patch is,
Reviewed-by: Nanley Chery <nanley.g.chery at intel.com>

> 
> diff --git a/src/intel/vulkan/TODO b/src/intel/vulkan/TODO
> index 38acc0d..f8b73a1 100644
> --- a/src/intel/vulkan/TODO
> +++ b/src/intel/vulkan/TODO
> @@ -12,5 +12,4 @@ Performance:
>   - Compressed multisample support
>   - Pushing pieces of UBOs?
>   - Enable guardband clipping
> - - pma stall workaround
>   - Use soft-pin to avoid relocations
> diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
> index 5886fa6..8c08f8d 100644
> --- a/src/intel/vulkan/anv_cmd_buffer.c
> +++ b/src/intel/vulkan/anv_cmd_buffer.c
> @@ -135,6 +135,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
>     state->restart_index = UINT32_MAX;
>     state->dynamic = default_dynamic_state;
>     state->need_query_wa = true;
> +   state->pma_fix_enabled = false;
> +   state->hiz_enabled = false;
>  
>     if (state->attachments != NULL) {
>        vk_free(&cmd_buffer->pool->alloc, state->attachments);
> diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
> index d04fe38..67147b0 100644
> --- a/src/intel/vulkan/anv_genX.h
> +++ b/src/intel/vulkan/anv_genX.h
> @@ -55,6 +55,9 @@ void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
>  
>  void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer);
>  
> +void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
> +                                     bool enable);
> +
>  void
>  genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
>                       const struct gen_l3_config *l3_config,
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> index 4fe3ebc..fa6032e 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1163,6 +1163,20 @@ struct anv_cmd_state {
>     bool                                         need_query_wa;
>  
>     /**
> +    * Whether or not the gen8 PMA fix is enabled.  We ensure that, at the top
> +    * of any command buffer it is disabled by disabling it in EndCommandBuffer
> +    * and before invoking the secondary in ExecuteCommands.
> +    */
> +   bool                                         pma_fix_enabled;
> +
> +   /**
> +    * Whether or not we know for certain that HiZ is enabled for the current
> +    * subpass.  If, for whatever reason, we are unsure as to whether HiZ is
> +    * enabled or not, this will be false.
> +    */
> +   bool                                         hiz_enabled;
> +
> +   /**
>      * Array length is anv_cmd_state::pass::attachment_count. Array content is
>      * valid only when recording a render pass instance.
>      */
> @@ -1465,8 +1479,11 @@ struct anv_pipeline {
>  
>     uint32_t                                     cs_right_mask;
>  
> +   bool                                         writes_depth;
> +   bool                                         depth_test_enable;
>     bool                                         writes_stencil;
>     bool                                         depth_clamp_enable;
> +   bool                                         kill_pixel;
>  
>     struct {
>        uint32_t                                  sf[7];
> diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c
> index 013ed87..c1a25e8 100644
> --- a/src/intel/vulkan/gen7_cmd_buffer.c
> +++ b/src/intel/vulkan/gen7_cmd_buffer.c
> @@ -260,6 +260,13 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
>     cmd_buffer->state.dirty = 0;
>  }
>  
> +void
> +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
> +                                bool enable)
> +{
> +   /* The NP PMA fix doesn't exist on gen7 */
> +}
> +
>  void genX(CmdSetEvent)(
>      VkCommandBuffer                             commandBuffer,
>      VkEvent                                     event,
> diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c
> index 8c8de62..0628f3a 100644
> --- a/src/intel/vulkan/gen8_cmd_buffer.c
> +++ b/src/intel/vulkan/gen8_cmd_buffer.c
> @@ -155,6 +155,133 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
>  #endif
>  
>  void
> +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
> +{
> +#if GEN_GEN == 8
> +   if (cmd_buffer->state.pma_fix_enabled == enable)
> +      return;
> +
> +   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> +      pc.DepthCacheFlushEnable = true;
> +      pc.CommandStreamerStallEnable = true;
> +      pc.RenderTargetCacheFlushEnable = true;
> +   }
> +
> +   uint32_t cache_mode;
> +   anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
> +                   .NPPMAFixEnable = enable,
> +                   .NPEarlyZFailsDisable = enable,
> +                   .NPPMAFixEnableMask = true,
> +                   .NPEarlyZFailsDisableMask = true);
> +   anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
> +      lri.RegisterOffset   = GENX(CACHE_MODE_1_num);
> +      lri.DataDWord        = cache_mode;
> +   }
> +
> +   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
> +    * Flush bits is often necessary.  We do it regardless because it's easier.
> +    * The render cache flush is also necessary if stencil writes are enabled.
> +    */
> +   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> +      pc.DepthStallEnable = true;
> +      pc.DepthCacheFlushEnable = true;
> +      pc.RenderTargetCacheFlushEnable = true;
> +   }
> +
> +   cmd_buffer->state.pma_fix_enabled = enable;
> +#endif /* GEN_GEN == 8 */
> +}
> +
> +static inline bool
> +want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)
> +{
> +   assert(GEN_GEN == 8);
> +
> +   /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE:
> +    *
> +    *    SW must set this bit in order to enable this fix when following
> +    *    expression is TRUE.
> +    *
> +    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
> +    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
> +    *    (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
> +    *    (3DSTATE_DEPTH_BUFFER::HIZ Enable) &&
> +    *    !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) &&
> +    *    (3DSTATE_PS_EXTRA::PixelShaderValid) &&
> +    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> +    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> +    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
> +    *      3DSTATE_WM_HZ_OP::StencilBufferClear) &&
> +    *    (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) &&
> +    *    (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> +    *       3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> +    *       3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> +    *       3DSTATE_PS_BLEND::AlphaTestEnable ||
> +    *       3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
> +    *      3DSTATE_WM::ForceKillPix != ForceOff &&
> +    *      ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
> +    *        3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
> +    *       (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> +    *        3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
> +    *        3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
> +    *     (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
> +    */
> +
> +   /* These are always true:
> +    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
> +    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
> +    */
> +
> +   /* We only enable the PMA fix if we know for certain that HiZ is enabled.
> +    * If we don't know whether HiZ is enabled or not, we disable the PMA fix
> +    * and there is no harm.
> +    *
> +    * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
> +    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
> +    */
> +   if (!cmd_buffer->state.hiz_enabled)
> +      return false;
> +
> +   /* 3DSTATE_PS_EXTRA::PixelShaderValid */
> +   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
> +   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
> +      return false;
> +
> +   /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */
> +   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
> +   if (wm_prog_data->early_fragment_tests)
> +      return false;
> +
> +   /* We never use anv_pipeline for HiZ ops so this is trivially true:
> +    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> +    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> +    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
> +    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
> +    */
> +
> +   /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */
> +   if (!pipeline->depth_test_enable)
> +      return false;
> +
> +   /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> +    *    3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> +    *    3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> +    *    3DSTATE_PS_BLEND::AlphaTestEnable ||
> +    *    3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
> +    *   3DSTATE_WM::ForceKillPix != ForceOff &&
> +    *   ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
> +    *     3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
> +    *    (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> +    *     3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
> +    *     3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
> +    *  (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
> +    */
> +   return (pipeline->kill_pixel && (pipeline->writes_depth ||
> +                                    pipeline->writes_stencil)) ||
> +          wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
> +}
> +
> +void
>  genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
>  {
>     struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
> @@ -211,6 +338,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
>     }
>  
>     if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
> +                                  ANV_CMD_DIRTY_RENDER_TARGETS |
>                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
>                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK)) {
>        uint32_t wm_depth_stencil_dw[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
> @@ -234,6 +362,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
>  
>        anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
>                             pipeline->gen8.wm_depth_stencil);
> +
> +      genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
> +                                      want_depth_pma_fix(cmd_buffer));
>     }
>  #else
>     if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
> diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
> index 663e6c9..6f0b063 100644
> --- a/src/intel/vulkan/genX_blorp_exec.c
> +++ b/src/intel/vulkan/genX_blorp_exec.c
> @@ -154,6 +154,11 @@ genX(blorp_exec)(struct blorp_batch *batch,
>  
>     genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
>  
> +   /* BLORP doesn't do anything fancy with depth such as discards, so we want
> +    * the PMA fix off.  Also, off is always the safe option.
> +    */
> +   genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
> +
>     blorp_exec(batch, params);
>  
>     cmd_buffer->state.vb_dirty = ~0;
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
> index b6b7f74..66de93a 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -627,6 +627,11 @@ genX(EndCommandBuffer)(
>  {
>     ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
>  
> +   /* We want every command buffer to start with the PMA fix in a known state,
> +    * so we disable it at the end of the command buffer.
> +    */
> +   genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
> +
>     genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
>  
>     anv_cmd_buffer_end_batch_buffer(cmd_buffer);
> @@ -644,6 +649,11 @@ genX(CmdExecuteCommands)(
>  
>     assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
>  
> +   /* The secondary command buffers will assume that the PMA fix is disabled
> +    * when they begin executing.  Make sure this is true.
> +    */
> +   genX(cmd_buffer_enable_pma_fix)(primary, false);
> +
>     for (uint32_t i = 0; i < commandBufferCount; i++) {
>        ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
>  
> @@ -2181,7 +2191,8 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
>     const bool has_stencil =
>        image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
>  
> -   /* FIXME: Implement the PMA stall W/A */
> +   cmd_buffer->state.hiz_enabled = has_hiz;
> +
>     /* FIXME: Width and Height are wrong */
>  
>     genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
> @@ -2419,6 +2430,8 @@ void genX(CmdEndRenderPass)(
>  
>     anv_cmd_buffer_resolve_subpass(cmd_buffer);
>  
> +   cmd_buffer->state.hiz_enabled = false;
> +
>  #ifndef NDEBUG
>     anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
>  #endif
> diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
> index 102b75b..5fe829b 100644
> --- a/src/intel/vulkan/genX_pipeline.c
> +++ b/src/intel/vulkan/genX_pipeline.c
> @@ -444,6 +444,10 @@ emit_rs_state(struct anv_pipeline *pipeline,
>      */
>  #if GEN_GEN >= 8
>     raster.DXMultisampleRasterizationEnable = true;
> +   /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
> +    * computations.  If we ever set this bit to a different value, they will
> +    * need to be updated accordingly.
> +    */
>     raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
>     raster.ForceMultisampling = false;
>  #else
> @@ -653,6 +657,8 @@ emit_ds_state(struct anv_pipeline *pipeline,
>         * to make sure it's initialized to something useful.
>         */
>        pipeline->writes_stencil = false;
> +      pipeline->writes_depth = false;
> +      pipeline->depth_test_enable = false;
>        memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
>        return;
>     }
> @@ -711,6 +717,9 @@ emit_ds_state(struct anv_pipeline *pipeline,
>     if (info->depthTestEnable && info->depthCompareOp == VK_COMPARE_OP_EQUAL)
>        depth_stencil.DepthBufferWriteEnable = false;
>  
> +   pipeline->writes_depth = depth_stencil.DepthBufferWriteEnable;
> +   pipeline->depth_test_enable = depth_stencil.DepthTestEnable;
> +
>  #if GEN_GEN <= 7
>     GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
>  #else
> @@ -1429,6 +1438,38 @@ emit_3dstate_vf_topology(struct anv_pipeline *pipeline)
>  }
>  #endif
>  
> +static void
> +compute_kill_pixel(struct anv_pipeline *pipeline,
> +                   const VkPipelineMultisampleStateCreateInfo *ms_info,
> +                   const struct anv_subpass *subpass)
> +{
> +   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
> +      pipeline->kill_pixel = false;
> +      return;
> +   }
> +
> +   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
> +
> +   /* This computes the KillPixel portion of the computation for whether or
> +    * not we want to enable the PMA fix on gen8.  It's given by this chunk of
> +    * the giant formula:
> +    *
> +    *    (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> +    *     3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> +    *     3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> +    *     3DSTATE_PS_BLEND::AlphaTestEnable ||
> +    *     3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
> +    *
> +    * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
> +    * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
> +    * of an alpha test.
> +    */
> +   pipeline->kill_pixel =
> +      subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
> +      wm_prog_data->uses_omask ||
> +      (ms_info && ms_info->alphaToCoverageEnable);
> +}
> +
>  static VkResult
>  genX(graphics_pipeline_create)(
>      VkDevice                                    _device,
> @@ -1466,6 +1507,7 @@ genX(graphics_pipeline_create)(
>     emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass);
>     emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
>                             pCreateInfo->pMultisampleState);
> +   compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState, subpass);
>  
>     emit_urb_setup(pipeline);
>  
> -- 
> 2.5.0.400.gff86faf
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list