[Mesa-dev] [PATCH 2/2] anv: Implement the Sky Lake stencil PMA optimization
Nanley Chery
nanleychery at gmail.com
Thu Feb 2 21:37:56 UTC 2017
On Thu, Feb 02, 2017 at 12:53:33PM -0800, Jason Ekstrand wrote:
> On Thu, Feb 2, 2017 at 10:55 AM, Nanley Chery <nanleychery at gmail.com> wrote:
>
> > On Wed, Feb 01, 2017 at 10:11:42PM -0800, Jason Ekstrand wrote:
> > > This improves the performance of Dota 2 on my Sky Lake Skull Canyon
> > > machine by about 2-3%.
> > > ---
> > > src/intel/vulkan/anv_private.h | 1 +
> > > src/intel/vulkan/gen8_cmd_buffer.c | 155 ++++++++++++++++++++++++++++++
> > ++++++-
> > > src/intel/vulkan/genX_pipeline.c | 6 +-
> > > 3 files changed, 156 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/src/intel/vulkan/anv_private.h
> > b/src/intel/vulkan/anv_private.h
> > > index 5fe4dd8..e7ad351 100644
> > > --- a/src/intel/vulkan/anv_private.h
> > > +++ b/src/intel/vulkan/anv_private.h
> > > @@ -1475,6 +1475,7 @@ struct anv_pipeline {
> > > bool writes_depth;
> > > bool depth_test_enable;
> > > bool writes_stencil;
> > > + bool stencil_test_enable;
> > > bool depth_clamp_enable;
> > > bool kill_pixel;
> > >
> > > diff --git a/src/intel/vulkan/gen8_cmd_buffer.c
> > b/src/intel/vulkan/gen8_cmd_buffer.c
> > > index b877e27..553f0c3 100644
> > > --- a/src/intel/vulkan/gen8_cmd_buffer.c
> > > +++ b/src/intel/vulkan/gen8_cmd_buffer.c
> > > @@ -157,16 +157,39 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
> > > void
> > > genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
> > bool enable)
> > > {
> > > -#if GEN_GEN == 8
> > > if (cmd_buffer->state.pma_fix_enabled == enable)
> > > return;
> > >
> > > + cmd_buffer->state.pma_fix_enabled = enable;
> > > +
> > > + /* According to the Broadwell PIPE_CONTROL documentation, software
> > should
> > > + * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits
> > set
> > > + * prior to the LRI. If stencil buffer writes are enabled, then a
> > Render
> > > + * Cache Flush is also necessary.
> > > + *
> > > + * The Sky Lake docs say to use a depth stall rather than a command
> > > + * streamer stall. However, the hardware seems to violently
> > disagree.
> > > + * A full command streamer stall seems to be needed in both cases.
> > > + */
> > > anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> > > pc.DepthCacheFlushEnable = true;
> > > pc.CommandStreamerStallEnable = true;
> > > pc.RenderTargetCacheFlushEnable = true;
> > > }
> > >
> > > +#if GEN_GEN == 9
> > > +
> > > + uint32_t cache_mode;
> > > + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
> > > + .STCPMAOptimizationEnable = enable,
> > > + .STCPMAOptimizationEnableMask = true);
> > > + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri)
> > {
> > > + lri.RegisterOffset = GENX(CACHE_MODE_0_num);
> > > + lri.DataDWord = cache_mode;
> > > + }
> > > +
> > > +#elif GEN_GEN == 8
> > > +
> > > uint32_t cache_mode;
> > > anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
> > > .NPPMAFixEnable = enable,
> > > @@ -178,18 +201,20 @@ genX(cmd_buffer_enable_pma_fix)(struct
> > anv_cmd_buffer *cmd_buffer, bool enable)
> > > lri.DataDWord = cache_mode;
> > > }
> > >
> > > +#endif /* GEN_GEN == 8 */
> > > +
> > > /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth
> > Cache
> > > * Flush bits is often necessary. We do it regardless because it's
> > easier.
> > > * The render cache flush is also necessary if stencil writes are
> > enabled.
> > > + *
> > > + * Again, the Sky Lake docs give a different set of flushes but the
> > BDW
> > > + * flushes seem to work just as well.
> > > */
> > > anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> > > pc.DepthStallEnable = true;
> > > pc.DepthCacheFlushEnable = true;
> > > pc.RenderTargetCacheFlushEnable = true;
> > > }
> > > -
> > > - cmd_buffer->state.pma_fix_enabled = enable;
> > > -#endif /* GEN_GEN == 8 */
> > > }
> > >
> > > static inline bool
> > > @@ -289,6 +314,124 @@ want_depth_pma_fix(struct anv_cmd_buffer
> > *cmd_buffer)
> > > wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
> > > }
> > >
> > > +static inline bool
> > > +want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer)
> > > +{
> > > + assert(GEN_GEN == 9);
> > > +
> > > + /* From the Sky Lake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization
> > Enable:
> > > + *
> > > + * Clearing this bit will force the STC cache to wait for pending
> > > + * retirement of pixels at the HZ-read stage and do the STC-test
> > for
> > > + * Non-promoted, R-computed and Computed depth modes instead of
> > > + * postponing the STC-test to RCPFE.
> > > + *
> > > + * STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
> > > + * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
> > > + *
> > > + * STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
> > &&
> > > + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer
> > Write Enable &&
> > > + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
> > > + *
> > > + * COMP_STC_EN = STC_TEST_EN &&
> > > + * 3DSTATE_PS_EXTRA::PixelShaderComputesStencil
> > > + *
> > > + * SW parses the pipeline states to generate the following logical
> > > + * signal indicating if PMA FIX can be enabled.
> > > + *
> > > + * STC_PMA_OPT =
> > > + * 3DSTATE_WM::ForceThreadDispatch != 1 &&
> > > + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
> > > + * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
> > > + * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
> > > + * !(3DSTATE_WM::EDSC_Mode == 2) &&
> > > + * 3DSTATE_PS_EXTRA::PixelShaderValid &&
> > > + * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> > > + * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> > > + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve
> > Enable ||
> > > + * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
> > > + * (COMP_STC_EN || STC_WRITE_EN) &&
> > > + * ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> > > + * 3DSTATE_WM::ForceKillPix == ON ||
> > > + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> > > + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> > > + * 3DSTATE_PS_BLEND::AlphaTestEnable ||
> > > + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
> > > + * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode !=
> > PSCDEPTH_OFF))
> > > + */
> > > +
> > > + /* These are always true:
> > > + * 3DSTATE_WM::ForceThreadDispatch != 1 &&
> > > + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
> > > + */
> > > +
> > > + /* We only enable the PMA fix if HiZ is enabled. However, if HiZ is
> > > + * *not* enabled and we don't set this bit there is no harm.
> > Therefore,
> > > + * we can just treat the NULL framebuffer case as has_hiz == false
> > and
> > > + * everything will work just fine.
> > > + */
> > > + if (cmd_buffer->state.framebuffer == NULL)
> > > + return false;
> > > +
> > > + /* (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
> > > + * 3DSTATE_DEPTH_BUFFER::HIZ Enable
> > > + */
> > > + const struct anv_image_view *ds_iview =
> > > + anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
> > > + if (!ds_iview || ds_iview->image->aux_usage != ISL_AUX_USAGE_HIZ)
> >
> > I think there's an error here. For depth buffers, anv_image::aux_usage
> > only indicates if there is an auxiliary HiZ buffer present. It does not
> > indicate that the HiZ buffer is being used within a render pass. For
> > that you'd have to check anv_attachment_state::aux_usage.
> >
>
> So things just got more interesting... The Vulkan spec only requires that
> the render pass provided to pInheritanceInfo for the vkBeginCommandBuffer
> of a secondary command buffer to be "compatible" with the render pass
> provided to vkCmdBeginRenderPass. The rules for compatibility explicitly
> allow the initial and final layouts of the attachments to differ. This
> means that, if HiZ usage within a render pass is determined by the initial
> layout, we can never know if HiZ is enabled in a secondary command buffer
> and we always have to leave the PMA stall off. That's fairly brutal to
> apps.
That is interesting. I'm glad you found this. Perhaps we can get around
this by computing part of the PMA fix at the start of the subpass, and
the latter portion at the draw call?
>
> How bad would it be to enable HiZ within the render pass for the case where
> they use the GENERAL layout and then resolve at the end? They used the
> GENERAL layout so they know they won't get fantastic performance so maybe
> that's ok?
>
Now that we've seen a benchmark out in the wild that uses depth input
attachments, I think we'll want the GENERAL layout to be fast and for
our software handling it to be straight-forward. I think we can
accomplish this by adding HiZ R/W support to BLORP. A local series of
mine that enables HiZ for input attachments takes a step towards this by
getting rid of the read-depth-through-HiZ situation we currently have
for the GENERAL layout.
-Nanley
> Either way, I'll respin and make it correct. We can sort this issue out and
> hopefully give secondary command buffer users their 2-3% later.
>
> --Jason
>
>
> > -Nanley
> >
> > > + return false;
> > > +
> > > + /* 3DSTATE_PS_EXTRA::PixelShaderValid */
> > > + struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
> > > + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
> > > + return false;
> > > +
> > > + /* !(3DSTATE_WM::EDSC_Mode == 2) */
> > > + const struct brw_wm_prog_data *wm_prog_data =
> > get_wm_prog_data(pipeline);
> > > + if (wm_prog_data->early_fragment_tests)
> > > + return false;
> > > +
> > > + /* We never use anv_pipeline for HiZ ops so this is trivially true:
> > > + * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> > > + * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> > > + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
> > > + * 3DSTATE_WM_HZ_OP::StencilBufferClear)
> > > + */
> > > +
> > > + /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
> > > + * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
> > > + */
> > > + const bool stc_test_en =
> > > + (ds_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
> > > + pipeline->stencil_test_enable;
> > > +
> > > + /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
> > > + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> > > + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
> > > + */
> > > + const bool stc_write_en =
> > > + (ds_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
> > > + pipeline->writes_stencil;
> > > +
> > > + /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */
> > > + const bool comp_stc_en = stc_test_en &&
> > wm_prog_data->computed_stencil;
> > > +
> > > + /* COMP_STC_EN || STC_WRITE_EN */
> > > + if (!(comp_stc_en || stc_write_en))
> > > + return false;
> > > +
> > > + /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> > > + * 3DSTATE_WM::ForceKillPix == ON ||
> > > + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> > > + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> > > + * 3DSTATE_PS_BLEND::AlphaTestEnable ||
> > > + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
> > > + * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode !=
> > PSCDEPTH_OFF)
> > > + */
> > > + return pipeline->kill_pixel ||
> > > + wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
> > > +}
> > > +
> > > void
> > > genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
> > > {
> > > @@ -398,6 +541,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct
> > anv_cmd_buffer *cmd_buffer)
> > > }
> > >
> > > if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
> > > + ANV_CMD_DIRTY_RENDER_TARGETS |
> > > ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK
> > |
> > > ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK
> > |
> > > ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE))
> > {
> > > @@ -423,6 +567,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct
> > anv_cmd_buffer *cmd_buffer)
> > >
> > > anv_batch_emit_merge(&cmd_buffer->batch, dwords,
> > > pipeline->gen9.wm_depth_stencil);
> > > +
> > > + genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
> > > + want_stencil_pma_fix(cmd_buffe
> > r));
> > > }
> > > #endif
> > >
> > > diff --git a/src/intel/vulkan/genX_pipeline.c
> > b/src/intel/vulkan/genX_pipeline.c
> > > index 3f701f3..3087037 100644
> > > --- a/src/intel/vulkan/genX_pipeline.c
> > > +++ b/src/intel/vulkan/genX_pipeline.c
> > > @@ -742,6 +742,7 @@ emit_ds_state(struct anv_pipeline *pipeline,
> > > * to make sure it's initialized to something useful.
> > > */
> > > pipeline->writes_stencil = false;
> > > + pipeline->stencil_test_enable = false;
> > > pipeline->writes_depth = false;
> > > pipeline->depth_test_enable = false;
> > > memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
> > > @@ -757,6 +758,7 @@ emit_ds_state(struct anv_pipeline *pipeline,
> > >
> > > VkPipelineDepthStencilStateCreateInfo info = *pCreateInfo;
> > > sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);
> > > + pipeline->stencil_test_enable = info.stencilTestEnable;
> > > pipeline->writes_depth = info.depthWriteEnable;
> > > pipeline->depth_test_enable = info.depthTestEnable;
> > >
> > > @@ -1514,8 +1516,8 @@ compute_kill_pixel(struct anv_pipeline *pipeline,
> > > const struct brw_wm_prog_data *wm_prog_data =
> > get_wm_prog_data(pipeline);
> > >
> > > /* This computes the KillPixel portion of the computation for
> > whether or
> > > - * not we want to enable the PMA fix on gen8. It's given by this
> > chunk of
> > > - * the giant formula:
> > > + * not we want to enable the PMA fix on gen8 or gen9. It's given by
> > this
> > > + * chunk of the giant formula:
> > > *
> > > * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> > > * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> > > --
> > > 2.5.0.400.gff86faf
> > >
> > > _______________________________________________
> > > mesa-dev mailing list
> > > mesa-dev at lists.freedesktop.org
> > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> >
More information about the mesa-dev
mailing list