[Mesa-dev] [PATCH 2/2] anv: Implement the Sky Lake stencil PMA optimization

Nanley Chery nanleychery at gmail.com
Thu Feb 2 23:01:11 UTC 2017


On Thu, Feb 02, 2017 at 01:55:30PM -0800, Jason Ekstrand wrote:
> On Thu, Feb 2, 2017 at 1:45 PM, Nanley Chery <nanleychery at gmail.com> wrote:
> 
> > On Thu, Feb 02, 2017 at 01:37:56PM -0800, Nanley Chery wrote:
> > > On Thu, Feb 02, 2017 at 12:53:33PM -0800, Jason Ekstrand wrote:
> > > > On Thu, Feb 2, 2017 at 10:55 AM, Nanley Chery <nanleychery at gmail.com>
> > wrote:
> > > >
> > > > > On Wed, Feb 01, 2017 at 10:11:42PM -0800, Jason Ekstrand wrote:
> > > > > > This improves the performance of Dota 2 on my Sky Lake Skull Canyon
> > > > > > machine by about 2-3%.
> > > > > > ---
> > > > > >  src/intel/vulkan/anv_private.h     |   1 +
> > > > > >  src/intel/vulkan/gen8_cmd_buffer.c | 155
> > ++++++++++++++++++++++++++++++
> > > > > ++++++-
> > > > > >  src/intel/vulkan/genX_pipeline.c   |   6 +-
> > > > > >  3 files changed, 156 insertions(+), 6 deletions(-)
> > > > > >
> > > > > > diff --git a/src/intel/vulkan/anv_private.h
> > > > > b/src/intel/vulkan/anv_private.h
> > > > > > index 5fe4dd8..e7ad351 100644
> > > > > > --- a/src/intel/vulkan/anv_private.h
> > > > > > +++ b/src/intel/vulkan/anv_private.h
> > > > > > @@ -1475,6 +1475,7 @@ struct anv_pipeline {
> > > > > >     bool                                         writes_depth;
> > > > > >     bool                                         depth_test_enable;
> > > > > >     bool                                         writes_stencil;
> > > > > > +   bool
> >  stencil_test_enable;
> > > > > >     bool
> >  depth_clamp_enable;
> > > > > >     bool                                         kill_pixel;
> > > > > >
> > > > > > diff --git a/src/intel/vulkan/gen8_cmd_buffer.c
> > > > > b/src/intel/vulkan/gen8_cmd_buffer.c
> > > > > > index b877e27..553f0c3 100644
> > > > > > --- a/src/intel/vulkan/gen8_cmd_buffer.c
> > > > > > +++ b/src/intel/vulkan/gen8_cmd_buffer.c
> > > > > > @@ -157,16 +157,39 @@ __emit_sf_state(struct anv_cmd_buffer
> > *cmd_buffer)
> > > > > >  void
> > > > > >  genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer
> > *cmd_buffer,
> > > > > bool enable)
> > > > > >  {
> > > > > > -#if GEN_GEN == 8
> > > > > >     if (cmd_buffer->state.pma_fix_enabled == enable)
> > > > > >        return;
> > > > > >
> > > > > > +   cmd_buffer->state.pma_fix_enabled = enable;
> > > > > > +
> > > > > > +   /* According to the Broadwell PIPE_CONTROL documentation,
> > software
> > > > > should
> > > > > > +    * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush
> > bits
> > > > > set
> > > > > > +    * prior to the LRI.  If stencil buffer writes are enabled,
> > then a
> > > > > Render
> > > > > > +    * Cache Flush is also necessary.
> > > > > > +    *
> > > > > > +    * The Sky Lake docs say to use a depth stall rather than a
> > command
> > > > > > +    * streamer stall.  However, the hardware seems to violently
> > > > > disagree.
> > > > > > +    * A full command streamer stall seems to be needed in both
> > cases.
> > > > > > +    */
> > > > > >     anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> > > > > >        pc.DepthCacheFlushEnable = true;
> > > > > >        pc.CommandStreamerStallEnable = true;
> > > > > >        pc.RenderTargetCacheFlushEnable = true;
> > > > > >     }
> > > > > >
> > > > > > +#if GEN_GEN == 9
> > > > > > +
> > > > > > +   uint32_t cache_mode;
> > > > > > +   anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
> > > > > > +                   .STCPMAOptimizationEnable = enable,
> > > > > > +                   .STCPMAOptimizationEnableMask = true);
> > > > > > +   anv_batch_emit(&cmd_buffer->batch,
> > GENX(MI_LOAD_REGISTER_IMM), lri)
> > > > > {
> > > > > > +      lri.RegisterOffset   = GENX(CACHE_MODE_0_num);
> > > > > > +      lri.DataDWord        = cache_mode;
> > > > > > +   }
> > > > > > +
> > > > > > +#elif GEN_GEN == 8
> > > > > > +
> > > > > >     uint32_t cache_mode;
> > > > > >     anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
> > > > > >                     .NPPMAFixEnable = enable,
> > > > > > @@ -178,18 +201,20 @@ genX(cmd_buffer_enable_pma_fix)(struct
> > > > > anv_cmd_buffer *cmd_buffer, bool enable)
> > > > > >        lri.DataDWord        = cache_mode;
> > > > > >     }
> > > > > >
> > > > > > +#endif /* GEN_GEN == 8 */
> > > > > > +
> > > > > >     /* After the LRI, a PIPE_CONTROL with both the Depth Stall and
> > Depth
> > > > > Cache
> > > > > >      * Flush bits is often necessary.  We do it regardless because
> > it's
> > > > > easier.
> > > > > >      * The render cache flush is also necessary if stencil writes
> > are
> > > > > enabled.
> > > > > > +    *
> > > > > > +    * Again, the Sky Lake docs give a different set of flushes
> > but the
> > > > > BDW
> > > > > > +    * flushes seem to work just as well.
> > > > > >      */
> > > > > >     anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> > > > > >        pc.DepthStallEnable = true;
> > > > > >        pc.DepthCacheFlushEnable = true;
> > > > > >        pc.RenderTargetCacheFlushEnable = true;
> > > > > >     }
> > > > > > -
> > > > > > -   cmd_buffer->state.pma_fix_enabled = enable;
> > > > > > -#endif /* GEN_GEN == 8 */
> > > > > >  }
> > > > > >
> > > > > >  static inline bool
> > > > > > @@ -289,6 +314,124 @@ want_depth_pma_fix(struct anv_cmd_buffer
> > > > > *cmd_buffer)
> > > > > >            wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
> > > > > >  }
> > > > > >
> > > > > > +static inline bool
> > > > > > +want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer)
> > > > > > +{
> > > > > > +   assert(GEN_GEN == 9);
> > > > > > +
> > > > > > +   /* From the Sky Lake PRM Vol. 2c CACHE_MODE_1::STC PMA
> > Optimization
> > > > > Enable:
> > > > > > +    *
> > > > > > +    *    Clearing this bit will force the STC cache to wait for
> > pending
> > > > > > +    *    retirement of pixels at the HZ-read stage and do the
> > STC-test
> > > > > for
> > > > > > +    *    Non-promoted, R-computed and Computed depth modes
> > instead of
> > > > > > +    *    postponing the STC-test to RCPFE.
> > > > > > +    *
> > > > > > +    *    STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
> > &&
> > > > > > +    *                  3DSTATE_WM_DEPTH_STENCIL::
> > StencilTestEnable
> > > > > > +    *
> > > > > > +    *    STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::
> > STENCIL_BUFFER_ENABLE
> > > > > &&
> > > > > > +    *                   (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer
> > > > > Write Enable &&
> > > > > > +    *                    3DSTATE_DEPTH_BUFFER::STENCIL_
> > WRITE_ENABLE)
> > > > > > +    *
> > > > > > +    *    COMP_STC_EN = STC_TEST_EN &&
> > > > > > +    *                  3DSTATE_PS_EXTRA::
> > PixelShaderComputesStencil
> > > > > > +    *
> > > > > > +    *    SW parses the pipeline states to generate the following
> > logical
> > > > > > +    *    signal indicating if PMA FIX can be enabled.
> > > > > > +    *
> > > > > > +    *    STC_PMA_OPT =
> > > > > > +    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
> > > > > > +    *       !(3DSTATE_RASTER::ForceSampleCount !=
> > NUMRASTSAMPLES_0) &&
> > > > > > +    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
> > > > > > +    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
> > > > > > +    *       !(3DSTATE_WM::EDSC_Mode == 2) &&
> > > > > > +    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
> > > > > > +    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> > > > > > +    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> > > > > > +    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve
> > > > > Enable ||
> > > > > > +    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
> > > > > > +    *       (COMP_STC_EN || STC_WRITE_EN) &&
> > > > > > +    *       ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> > > > > > +    *         3DSTATE_WM::ForceKillPix == ON ||
> > > > > > +    *         3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> > > > > > +    *         3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> > > > > > +    *         3DSTATE_PS_BLEND::AlphaTestEnable ||
> > > > > > +    *         3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
> > > > > > +    *        (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode
> > !=
> > > > > PSCDEPTH_OFF))
> > > > > > +    */
> > > > > > +
> > > > > > +   /* These are always true:
> > > > > > +    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
> > > > > > +    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
> > > > > > +    */
> > > > > > +
> > > > > > +   /* We only enable the PMA fix if HiZ is enabled.  However, if
> > HiZ is
> > > > > > +    * *not* enabled and we don't set this bit there is no harm.
> > > > > Therefore,
> > > > > > +    * we can just treat the NULL framebuffer case as has_hiz ==
> > false
> > > > > and
> > > > > > +    * everything will work just fine.
> > > > > > +    */
> > > > > > +   if (cmd_buffer->state.framebuffer == NULL)
> > > > > > +      return false;
> > > > > > +
> > > > > > +   /* (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
> > > > > > +    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
> > > > > > +    */
> > > > > > +   const struct anv_image_view *ds_iview =
> > > > > > +      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
> > > > > > +   if (!ds_iview || ds_iview->image->aux_usage !=
> > ISL_AUX_USAGE_HIZ)
> > > > >
> > > > > I think there's an error here. For depth buffers,
> > anv_image::aux_usage
> > > > > only indicates if there is an auxiliary HiZ buffer present. It does
> > not
> > > > > indicate that the HiZ buffer is being used within a render pass. For
> > > > > that you'd have to check anv_attachment_state::aux_usage.
> > > > >
> > > >
> > > > So things just got more interesting... The Vulkan spec only requires
> > that
> > > > the render pass provided to pInheritanceInfo for the
> > vkBeginCommandBuffer
> > > > of a secondary command buffer to be "compatible" with the render pass
> > > > provided to vkCmdBeginRenderPass.  The rules for compatibility
> > explicitly
> > > > allow the initial and final layouts of the attachments to differ.  This
> > > > means that, if HiZ usage within a render pass is determined by the
> > initial
> > > > layout, we can never know if HiZ is enabled in a secondary command
> > buffer
> > > > and we always have to leave the PMA stall off.  That's fairly brutal to
> > > > apps.
> > >
> > > That is interesting. I'm glad you found this. Perhaps we can get around
> > > this by computing part of the PMA fix at the start of the subpass, and
> > > the latter portion at the draw call?
> > >
> >
> > That seems to be what your v2 patches are doing, so the problem is
> > solved right?
> >
> 
> Basically, yes in the sense that it's correct.  However, the new
> hiz_enabled bit is always false inside secondaries so they take a perf hit.
> 

Right, I was thinking about secondaries incorrectly.

> 
> > -Nanley
> >
> > > >
> > > > How bad would it be to enable HiZ within the render pass for the case
> > where
> > > > they use the GENERAL layout and then resolve at the end?  They used the
> > > > GENERAL layout so they know they won't get fantastic performance so
> > maybe
> > > > that's ok?
> > > >
> > >
> > > Now that we've seen a benchmark out in the wild that uses depth input
> > > attachments, I think we'll want the GENERAL layout to be fast and for
> > > our software handling it to be straight-forward. I think we can
> > > accomplish this by adding HiZ R/W support to BLORP. A local series of
> > > mine that enables HiZ for input attachments takes a step towards this by
> > > getting rid of the read-depth-through-HiZ situation we currently have
> > > for the GENERAL layout.
> >
> 
> We can do whatever we want inside the render pass.  The initial and final
> layouts are just that, "initial" and "final".  We can do an implicit layout
> transition from initial to DEPTH_STENCIL_ATTACHMENT_OPTIMAL at the top of
> the render pass and then transition to final at the end.  It just clients
> which use GENERAL when they don't need to will get resolves.  If you don't
> want resolves, use layout transitions properly and you won't get them.

Thanks for expounding, that makes sense. In order for this to work,
you'd also have to force all depth textures to be sampled through HiZ
(as is currently the case).

> 
> Yes, adding depth/hiz support to BLORP will let us leave HiZ on for all
> GENERAL layouts but I don't think it's needed to solve the current problem
> or to get fast depth input attachments.
> 

Sorry if I gave the impression that I thought this patch depended on
BLORP having HiZ support, that wasn't my intention. I was replying to
how we should handle the GENERAL layout in the future.

I also don't think it's the only way to solve this problem, but I do
think it's the most straight-forward. I've been trying to share code
between depth and color in the process of making CCS use layouts.
Having such support in BLORP would avoid having to special-case depth.

-Nanley

> --Jason
> 
> 
> > > -Nanley
> > >
> > > > Either way, I'll respin and make it correct. We can sort this issue
> > out and
> > > > hopefully give secondary command buffer users their 2-3% later.
> > > >
> > > > --Jason
> > > >
> > > >
> > > > > -Nanley
> > > > >
> > > > > > +      return false;
> > > > > > +
> > > > > > +   /* 3DSTATE_PS_EXTRA::PixelShaderValid */
> > > > > > +   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
> > > > > > +   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
> > > > > > +      return false;
> > > > > > +
> > > > > > +   /* !(3DSTATE_WM::EDSC_Mode == 2) */
> > > > > > +   const struct brw_wm_prog_data *wm_prog_data =
> > > > > get_wm_prog_data(pipeline);
> > > > > > +   if (wm_prog_data->early_fragment_tests)
> > > > > > +      return false;
> > > > > > +
> > > > > > +   /* We never use anv_pipeline for HiZ ops so this is trivially
> > true:
> > > > > > +   *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> > > > > > +    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> > > > > > +    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve
> > Enable ||
> > > > > > +    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
> > > > > > +    */
> > > > > > +
> > > > > > +   /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
> > > > > > +    * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
> > > > > > +    */
> > > > > > +   const bool stc_test_en =
> > > > > > +      (ds_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
> > > > > > +      pipeline->stencil_test_enable;
> > > > > > +
> > > > > > +   /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
> > > > > > +    * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> > > > > > +    *  3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
> > > > > > +    */
> > > > > > +   const bool stc_write_en =
> > > > > > +      (ds_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
> > > > > > +      pipeline->writes_stencil;
> > > > > > +
> > > > > > +   /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil
> > */
> > > > > > +   const bool comp_stc_en = stc_test_en &&
> > > > > wm_prog_data->computed_stencil;
> > > > > > +
> > > > > > +   /* COMP_STC_EN || STC_WRITE_EN */
> > > > > > +   if (!(comp_stc_en || stc_write_en))
> > > > > > +      return false;
> > > > > > +
> > > > > > +   /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> > > > > > +    *  3DSTATE_WM::ForceKillPix == ON ||
> > > > > > +    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> > > > > > +    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> > > > > > +    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
> > > > > > +    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
> > > > > > +    * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode !=
> > > > > PSCDEPTH_OFF)
> > > > > > +    */
> > > > > > +   return pipeline->kill_pixel ||
> > > > > > +          wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
> > > > > > +}
> > > > > > +
> > > > > >  void
> > > > > >  genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer
> > *cmd_buffer)
> > > > > >  {
> > > > > > @@ -398,6 +541,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct
> > > > > anv_cmd_buffer *cmd_buffer)
> > > > > >     }
> > > > > >
> > > > > >     if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
> > > > > > +                                  ANV_CMD_DIRTY_RENDER_TARGETS |
> > > > > >                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_
> > COMPARE_MASK
> > > > > |
> > > > > >                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_
> > WRITE_MASK
> > > > > |
> > > > > >                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_
> > REFERENCE))
> > > > > {
> > > > > > @@ -423,6 +567,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct
> > > > > anv_cmd_buffer *cmd_buffer)
> > > > > >
> > > > > >        anv_batch_emit_merge(&cmd_buffer->batch, dwords,
> > > > > >                             pipeline->gen9.wm_depth_stencil);
> > > > > > +
> > > > > > +      genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
> > > > > > +
> > want_stencil_pma_fix(cmd_buffe
> > > > > r));
> > > > > >     }
> > > > > >  #endif
> > > > > >
> > > > > > diff --git a/src/intel/vulkan/genX_pipeline.c
> > > > > b/src/intel/vulkan/genX_pipeline.c
> > > > > > index 3f701f3..3087037 100644
> > > > > > --- a/src/intel/vulkan/genX_pipeline.c
> > > > > > +++ b/src/intel/vulkan/genX_pipeline.c
> > > > > > @@ -742,6 +742,7 @@ emit_ds_state(struct anv_pipeline *pipeline,
> > > > > >         * to make sure it's initialized to something useful.
> > > > > >         */
> > > > > >        pipeline->writes_stencil = false;
> > > > > > +      pipeline->stencil_test_enable = false;
> > > > > >        pipeline->writes_depth = false;
> > > > > >        pipeline->depth_test_enable = false;
> > > > > >        memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
> > > > > > @@ -757,6 +758,7 @@ emit_ds_state(struct anv_pipeline *pipeline,
> > > > > >
> > > > > >     VkPipelineDepthStencilStateCreateInfo info = *pCreateInfo;
> > > > > >     sanitize_ds_state(&info, &pipeline->writes_stencil,
> > ds_aspects);
> > > > > > +   pipeline->stencil_test_enable = info.stencilTestEnable;
> > > > > >     pipeline->writes_depth = info.depthWriteEnable;
> > > > > >     pipeline->depth_test_enable = info.depthTestEnable;
> > > > > >
> > > > > > @@ -1514,8 +1516,8 @@ compute_kill_pixel(struct anv_pipeline
> > *pipeline,
> > > > > >     const struct brw_wm_prog_data *wm_prog_data =
> > > > > get_wm_prog_data(pipeline);
> > > > > >
> > > > > >     /* This computes the KillPixel portion of the computation for
> > > > > whether or
> > > > > > -    * not we want to enable the PMA fix on gen8.  It's given by
> > this
> > > > > chunk of
> > > > > > -    * the giant formula:
> > > > > > +    * not we want to enable the PMA fix on gen8 or gen9.  It's
> > given by
> > > > > this
> > > > > > +    * chunk of the giant formula:
> > > > > >      *
> > > > > >      *    (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> > > > > >      *     3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> > > > > > --
> > > > > > 2.5.0.400.gff86faf
> > > > > >
> > > > > > _______________________________________________
> > > > > > mesa-dev mailing list
> > > > > > mesa-dev at lists.freedesktop.org
> > > > > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> > > > >
> >


More information about the mesa-dev mailing list