[Mesa-dev] [PATCH 2/2] anv: Implement the Sky Lake stencil PMA optimization

Jason Ekstrand jason at jlekstrand.net
Thu Feb 2 20:53:33 UTC 2017


On Thu, Feb 2, 2017 at 10:55 AM, Nanley Chery <nanleychery at gmail.com> wrote:

> On Wed, Feb 01, 2017 at 10:11:42PM -0800, Jason Ekstrand wrote:
> > This improves the performance of Dota 2 on my Sky Lake Skull Canyon
> > machine by about 2-3%.
> > ---
> >  src/intel/vulkan/anv_private.h     |   1 +
> >  src/intel/vulkan/gen8_cmd_buffer.c | 155 ++++++++++++++++++++++++++++++
> ++++++-
> >  src/intel/vulkan/genX_pipeline.c   |   6 +-
> >  3 files changed, 156 insertions(+), 6 deletions(-)
> >
> > diff --git a/src/intel/vulkan/anv_private.h
> b/src/intel/vulkan/anv_private.h
> > index 5fe4dd8..e7ad351 100644
> > --- a/src/intel/vulkan/anv_private.h
> > +++ b/src/intel/vulkan/anv_private.h
> > @@ -1475,6 +1475,7 @@ struct anv_pipeline {
> >     bool                                         writes_depth;
> >     bool                                         depth_test_enable;
> >     bool                                         writes_stencil;
> > +   bool                                         stencil_test_enable;
> >     bool                                         depth_clamp_enable;
> >     bool                                         kill_pixel;
> >
> > diff --git a/src/intel/vulkan/gen8_cmd_buffer.c
> b/src/intel/vulkan/gen8_cmd_buffer.c
> > index b877e27..553f0c3 100644
> > --- a/src/intel/vulkan/gen8_cmd_buffer.c
> > +++ b/src/intel/vulkan/gen8_cmd_buffer.c
> > @@ -157,16 +157,39 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
> >  void
> >  genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
> bool enable)
> >  {
> > -#if GEN_GEN == 8
> >     if (cmd_buffer->state.pma_fix_enabled == enable)
> >        return;
> >
> > +   cmd_buffer->state.pma_fix_enabled = enable;
> > +
> > +   /* According to the Broadwell PIPE_CONTROL documentation, software
> should
> > +    * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits
> set
> > +    * prior to the LRI.  If stencil buffer writes are enabled, then a
> Render
> > +    * Cache Flush is also necessary.
> > +    *
> > +    * The Sky Lake docs say to use a depth stall rather than a command
> > +    * streamer stall.  However, the hardware seems to violently
> disagree.
> > +    * A full command streamer stall seems to be needed in both cases.
> > +    */
> >     anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> >        pc.DepthCacheFlushEnable = true;
> >        pc.CommandStreamerStallEnable = true;
> >        pc.RenderTargetCacheFlushEnable = true;
> >     }
> >
> > +#if GEN_GEN == 9
> > +
> > +   uint32_t cache_mode;
> > +   anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
> > +                   .STCPMAOptimizationEnable = enable,
> > +                   .STCPMAOptimizationEnableMask = true);
> > +   anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri)
> {
> > +      lri.RegisterOffset   = GENX(CACHE_MODE_0_num);
> > +      lri.DataDWord        = cache_mode;
> > +   }
> > +
> > +#elif GEN_GEN == 8
> > +
> >     uint32_t cache_mode;
> >     anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
> >                     .NPPMAFixEnable = enable,
> > @@ -178,18 +201,20 @@ genX(cmd_buffer_enable_pma_fix)(struct
> anv_cmd_buffer *cmd_buffer, bool enable)
> >        lri.DataDWord        = cache_mode;
> >     }
> >
> > +#endif /* GEN_GEN == 8 */
> > +
> >     /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth
> Cache
> >      * Flush bits is often necessary.  We do it regardless because it's
> easier.
> >      * The render cache flush is also necessary if stencil writes are
> enabled.
> > +    *
> > +    * Again, the Sky Lake docs give a different set of flushes but the
> BDW
> > +    * flushes seem to work just as well.
> >      */
> >     anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> >        pc.DepthStallEnable = true;
> >        pc.DepthCacheFlushEnable = true;
> >        pc.RenderTargetCacheFlushEnable = true;
> >     }
> > -
> > -   cmd_buffer->state.pma_fix_enabled = enable;
> > -#endif /* GEN_GEN == 8 */
> >  }
> >
> >  static inline bool
> > @@ -289,6 +314,124 @@ want_depth_pma_fix(struct anv_cmd_buffer
> *cmd_buffer)
> >            wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
> >  }
> >
> > +static inline bool
> > +want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer)
> > +{
> > +   assert(GEN_GEN == 9);
> > +
> > +   /* From the Sky Lake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization
> Enable:
> > +    *
> > +    *    Clearing this bit will force the STC cache to wait for pending
> > +    *    retirement of pixels at the HZ-read stage and do the STC-test
> for
> > +    *    Non-promoted, R-computed and Computed depth modes instead of
> > +    *    postponing the STC-test to RCPFE.
> > +    *
> > +    *    STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
> > +    *                  3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
> > +    *
> > +    *    STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
> &&
> > +    *                   (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer
> Write Enable &&
> > +    *                    3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
> > +    *
> > +    *    COMP_STC_EN = STC_TEST_EN &&
> > +    *                  3DSTATE_PS_EXTRA::PixelShaderComputesStencil
> > +    *
> > +    *    SW parses the pipeline states to generate the following logical
> > +    *    signal indicating if PMA FIX can be enabled.
> > +    *
> > +    *    STC_PMA_OPT =
> > +    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
> > +    *       !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
> > +    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
> > +    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
> > +    *       !(3DSTATE_WM::EDSC_Mode == 2) &&
> > +    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
> > +    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> > +    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> > +    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve
> Enable ||
> > +    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
> > +    *       (COMP_STC_EN || STC_WRITE_EN) &&
> > +    *       ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> > +    *         3DSTATE_WM::ForceKillPix == ON ||
> > +    *         3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> > +    *         3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> > +    *         3DSTATE_PS_BLEND::AlphaTestEnable ||
> > +    *         3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
> > +    *        (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode !=
> PSCDEPTH_OFF))
> > +    */
> > +
> > +   /* These are always true:
> > +    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
> > +    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
> > +    */
> > +
> > +   /* We only enable the PMA fix if HiZ is enabled.  However, if HiZ is
> > +    * *not* enabled and we don't set this bit there is no harm.
> Therefore,
> > +    * we can just treat the NULL framebuffer case as has_hiz == false
> and
> > +    * everything will work just fine.
> > +    */
> > +   if (cmd_buffer->state.framebuffer == NULL)
> > +      return false;
> > +
> > +   /* (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
> > +    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
> > +    */
> > +   const struct anv_image_view *ds_iview =
> > +      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
> > +   if (!ds_iview || ds_iview->image->aux_usage != ISL_AUX_USAGE_HIZ)
>
> I think there's an error here. For depth buffers, anv_image::aux_usage
> only indicates if there is an auxiliary HiZ buffer present. It does not
> indicate that the HiZ buffer is being used within a render pass. For
> that you'd have to check anv_attachment_state::aux_usage.
>

So things just got more interesting... The Vulkan spec only requires that
the render pass provided to pInheritanceInfo for the vkBeginCommandBuffer
of a secondary command buffer to be "compatible" with the render pass
provided to vkCmdBeginRenderPass.  The rules for compatibility explicitly
allow the initial and final layouts of the attachments to differ.  This
means that, if HiZ usage within a render pass is determined by the initial
layout, we can never know if HiZ is enabled in a secondary command buffer
and we always have to leave the PMA stall off.  That's fairly brutal to
apps.

How bad would it be to enable HiZ within the render pass for the case where
they use the GENERAL layout and then resolve at the end?  They used the
GENERAL layout so they know they won't get fantastic performance so maybe
that's ok?

Either way, I'll respin and make it correct. We can sort this issue out and
hopefully give secondary command buffer users their 2-3% later.

--Jason


> -Nanley
>
> > +      return false;
> > +
> > +   /* 3DSTATE_PS_EXTRA::PixelShaderValid */
> > +   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
> > +   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
> > +      return false;
> > +
> > +   /* !(3DSTATE_WM::EDSC_Mode == 2) */
> > +   const struct brw_wm_prog_data *wm_prog_data =
> get_wm_prog_data(pipeline);
> > +   if (wm_prog_data->early_fragment_tests)
> > +      return false;
> > +
> > +   /* We never use anv_pipeline for HiZ ops so this is trivially true:
> > +   *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> > +    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> > +    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
> > +    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
> > +    */
> > +
> > +   /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
> > +    * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
> > +    */
> > +   const bool stc_test_en =
> > +      (ds_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
> > +      pipeline->stencil_test_enable;
> > +
> > +   /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
> > +    * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> > +    *  3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
> > +    */
> > +   const bool stc_write_en =
> > +      (ds_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
> > +      pipeline->writes_stencil;
> > +
> > +   /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */
> > +   const bool comp_stc_en = stc_test_en &&
> wm_prog_data->computed_stencil;
> > +
> > +   /* COMP_STC_EN || STC_WRITE_EN */
> > +   if (!(comp_stc_en || stc_write_en))
> > +      return false;
> > +
> > +   /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> > +    *  3DSTATE_WM::ForceKillPix == ON ||
> > +    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> > +    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> > +    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
> > +    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
> > +    * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode !=
> PSCDEPTH_OFF)
> > +    */
> > +   return pipeline->kill_pixel ||
> > +          wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
> > +}
> > +
> >  void
> >  genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
> >  {
> > @@ -398,6 +541,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct
> anv_cmd_buffer *cmd_buffer)
> >     }
> >
> >     if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
> > +                                  ANV_CMD_DIRTY_RENDER_TARGETS |
> >                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK
> |
> >                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK
> |
> >                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE))
> {
> > @@ -423,6 +567,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct
> anv_cmd_buffer *cmd_buffer)
> >
> >        anv_batch_emit_merge(&cmd_buffer->batch, dwords,
> >                             pipeline->gen9.wm_depth_stencil);
> > +
> > +      genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
> > +                                      want_stencil_pma_fix(cmd_buffe
> r));
> >     }
> >  #endif
> >
> > diff --git a/src/intel/vulkan/genX_pipeline.c
> b/src/intel/vulkan/genX_pipeline.c
> > index 3f701f3..3087037 100644
> > --- a/src/intel/vulkan/genX_pipeline.c
> > +++ b/src/intel/vulkan/genX_pipeline.c
> > @@ -742,6 +742,7 @@ emit_ds_state(struct anv_pipeline *pipeline,
> >         * to make sure it's initialized to something useful.
> >         */
> >        pipeline->writes_stencil = false;
> > +      pipeline->stencil_test_enable = false;
> >        pipeline->writes_depth = false;
> >        pipeline->depth_test_enable = false;
> >        memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
> > @@ -757,6 +758,7 @@ emit_ds_state(struct anv_pipeline *pipeline,
> >
> >     VkPipelineDepthStencilStateCreateInfo info = *pCreateInfo;
> >     sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);
> > +   pipeline->stencil_test_enable = info.stencilTestEnable;
> >     pipeline->writes_depth = info.depthWriteEnable;
> >     pipeline->depth_test_enable = info.depthTestEnable;
> >
> > @@ -1514,8 +1516,8 @@ compute_kill_pixel(struct anv_pipeline *pipeline,
> >     const struct brw_wm_prog_data *wm_prog_data =
> get_wm_prog_data(pipeline);
> >
> >     /* This computes the KillPixel portion of the computation for
> whether or
> > -    * not we want to enable the PMA fix on gen8.  It's given by this
> chunk of
> > -    * the giant formula:
> > +    * not we want to enable the PMA fix on gen8 or gen9.  It's given by
> this
> > +    * chunk of the giant formula:
> >      *
> >      *    (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> >      *     3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> > --
> > 2.5.0.400.gff86faf
> >
> > _______________________________________________
> > mesa-dev mailing list
> > mesa-dev at lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170202/7cc0e571/attachment-0001.html>


More information about the mesa-dev mailing list