<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Thu, Feb 2, 2017 at 1:45 PM, Nanley Chery <span dir="ltr"><<a href="mailto:nanleychery@gmail.com" target="_blank">nanleychery@gmail.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="HOEnZb"><div class="h5">On Thu, Feb 02, 2017 at 01:37:56PM -0800, Nanley Chery wrote:<br>
> On Thu, Feb 02, 2017 at 12:53:33PM -0800, Jason Ekstrand wrote:<br>
> > On Thu, Feb 2, 2017 at 10:55 AM, Nanley Chery <<a href="mailto:nanleychery@gmail.com">nanleychery@gmail.com</a>> wrote:<br>
> ><br>
> > > On Wed, Feb 01, 2017 at 10:11:42PM -0800, Jason Ekstrand wrote:<br>
> > > > This improves the performance of Dota 2 on my Sky Lake Skull Canyon<br>
> > > > machine by about 2-3%.<br>
> > > > ---<br>
> > > > src/intel/vulkan/anv_private.h | 1 +<br>
> > > > src/intel/vulkan/gen8_cmd_<wbr>buffer.c | 155 ++++++++++++++++++++++++++++++<br>
> > > ++++++-<br>
> > > > src/intel/vulkan/genX_<wbr>pipeline.c | 6 +-<br>
> > > > 3 files changed, 156 insertions(+), 6 deletions(-)<br>
> > > ><br>
> > > > diff --git a/src/intel/vulkan/anv_<wbr>private.h<br>
> > > b/src/intel/vulkan/anv_<wbr>private.h<br>
> > > > index 5fe4dd8..e7ad351 100644<br>
> > > > --- a/src/intel/vulkan/anv_<wbr>private.h<br>
> > > > +++ b/src/intel/vulkan/anv_<wbr>private.h<br>
> > > > @@ -1475,6 +1475,7 @@ struct anv_pipeline {<br>
> > > > bool writes_depth;<br>
> > > > bool depth_test_enable;<br>
> > > > bool writes_stencil;<br>
> > > > + bool stencil_test_enable;<br>
> > > > bool depth_clamp_enable;<br>
> > > > bool kill_pixel;<br>
> > > ><br>
> > > > diff --git a/src/intel/vulkan/gen8_cmd_<wbr>buffer.c<br>
> > > b/src/intel/vulkan/gen8_cmd_<wbr>buffer.c<br>
> > > > index b877e27..553f0c3 100644<br>
> > > > --- a/src/intel/vulkan/gen8_cmd_<wbr>buffer.c<br>
> > > > +++ b/src/intel/vulkan/gen8_cmd_<wbr>buffer.c<br>
> > > > @@ -157,16 +157,39 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)<br>
> > > > void<br>
> > > > genX(cmd_buffer_enable_pma_<wbr>fix)(struct anv_cmd_buffer *cmd_buffer,<br>
> > > bool enable)<br>
> > > > {<br>
> > > > -#if GEN_GEN == 8<br>
> > > > if (cmd_buffer->state.pma_fix_<wbr>enabled == enable)<br>
> > > > return;<br>
> > > ><br>
> > > > + cmd_buffer->state.pma_fix_<wbr>enabled = enable;<br>
> > > > +<br>
> > > > + /* According to the Broadwell PIPE_CONTROL documentation, software<br>
> > > should<br>
> > > > + * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits<br>
> > > set<br>
> > > > + * prior to the LRI. If stencil buffer writes are enabled, then a<br>
> > > Render<br>
> > > > + * Cache Flush is also necessary.<br>
> > > > + *<br>
> > > > + * The Sky Lake docs say to use a depth stall rather than a command<br>
> > > > + * streamer stall. However, the hardware seems to violently<br>
> > > disagree.<br>
> > > > + * A full command streamer stall seems to be needed in both cases.<br>
> > > > + */<br>
> > > > anv_batch_emit(&cmd_buffer-><wbr>batch, GENX(PIPE_CONTROL), pc) {<br>
> > > > pc.DepthCacheFlushEnable = true;<br>
> > > > pc.CommandStreamerStallEnable = true;<br>
> > > > pc.<wbr>RenderTargetCacheFlushEnable = true;<br>
> > > > }<br>
> > > ><br>
> > > > +#if GEN_GEN == 9<br>
> > > > +<br>
> > > > + uint32_t cache_mode;<br>
> > > > + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),<br>
> > > > + .STCPMAOptimizationEnable = enable,<br>
> > > > + .STCPMAOptimizationEnableMask = true);<br>
> > > > + anv_batch_emit(&cmd_buffer-><wbr>batch, GENX(MI_LOAD_REGISTER_IMM), lri)<br>
> > > {<br>
> > > > + lri.RegisterOffset = GENX(CACHE_MODE_0_num);<br>
> > > > + lri.DataDWord = cache_mode;<br>
> > > > + }<br>
> > > > +<br>
> > > > +#elif GEN_GEN == 8<br>
> > > > +<br>
> > > > uint32_t cache_mode;<br>
> > > > anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),<br>
> > > > .NPPMAFixEnable = enable,<br>
> > > > @@ -178,18 +201,20 @@ genX(cmd_buffer_enable_pma_<wbr>fix)(struct<br>
> > > anv_cmd_buffer *cmd_buffer, bool enable)<br>
> > > > lri.DataDWord = cache_mode;<br>
> > > > }<br>
> > > ><br>
> > > > +#endif /* GEN_GEN == 8 */<br>
> > > > +<br>
> > > > /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth<br>
> > > Cache<br>
> > > > * Flush bits is often necessary. We do it regardless because it's<br>
> > > easier.<br>
> > > > * The render cache flush is also necessary if stencil writes are<br>
> > > enabled.<br>
> > > > + *<br>
> > > > + * Again, the Sky Lake docs give a different set of flushes but the<br>
> > > BDW<br>
> > > > + * flushes seem to work just as well.<br>
> > > > */<br>
> > > > anv_batch_emit(&cmd_buffer-><wbr>batch, GENX(PIPE_CONTROL), pc) {<br>
> > > > pc.DepthStallEnable = true;<br>
> > > > pc.DepthCacheFlushEnable = true;<br>
> > > > pc.<wbr>RenderTargetCacheFlushEnable = true;<br>
> > > > }<br>
> > > > -<br>
> > > > - cmd_buffer->state.pma_fix_<wbr>enabled = enable;<br>
> > > > -#endif /* GEN_GEN == 8 */<br>
> > > > }<br>
> > > ><br>
> > > > static inline bool<br>
> > > > @@ -289,6 +314,124 @@ want_depth_pma_fix(struct anv_cmd_buffer<br>
> > > *cmd_buffer)<br>
> > > > wm_prog_data->computed_depth_<wbr>mode != PSCDEPTH_OFF;<br>
> > > > }<br>
> > > ><br>
> > > > +static inline bool<br>
> > > > +want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer)<br>
> > > > +{<br>
> > > > + assert(GEN_GEN == 9);<br>
> > > > +<br>
> > > > + /* From the Sky Lake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization<br>
> > > Enable:<br>
> > > > + *<br>
> > > > + * Clearing this bit will force the STC cache to wait for pending<br>
> > > > + * retirement of pixels at the HZ-read stage and do the STC-test<br>
> > > for<br>
> > > > + * Non-promoted, R-computed and Computed depth modes instead of<br>
> > > > + * postponing the STC-test to RCPFE.<br>
> > > > + *<br>
> > > > + * STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::<wbr>STENCIL_BUFFER_ENABLE &&<br>
> > > > + * 3DSTATE_WM_DEPTH_STENCIL::<wbr>StencilTestEnable<br>
> > > > + *<br>
> > > > + * STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::<wbr>STENCIL_BUFFER_ENABLE<br>
> > > &&<br>
> > > > + * (3DSTATE_WM_DEPTH_STENCIL::<wbr>Stencil Buffer<br>
> > > Write Enable &&<br>
> > > > + * 3DSTATE_DEPTH_BUFFER::STENCIL_<wbr>WRITE_ENABLE)<br>
> > > > + *<br>
> > > > + * COMP_STC_EN = STC_TEST_EN &&<br>
> > > > + * 3DSTATE_PS_EXTRA::<wbr>PixelShaderComputesStencil<br>
> > > > + *<br>
> > > > + * SW parses the pipeline states to generate the following logical<br>
> > > > + * signal indicating if PMA FIX can be enabled.<br>
> > > > + *<br>
> > > > + * STC_PMA_OPT =<br>
> > > > + * 3DSTATE_WM::<wbr>ForceThreadDispatch != 1 &&<br>
> > > > + * !(3DSTATE_RASTER::<wbr>ForceSampleCount != NUMRASTSAMPLES_0) &&<br>
> > > > + * 3DSTATE_DEPTH_BUFFER::SURFACE_<wbr>TYPE != NULL &&<br>
> > > > + * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&<br>
> > > > + * !(3DSTATE_WM::EDSC_Mode == 2) &&<br>
> > > > + * 3DSTATE_PS_EXTRA::<wbr>PixelShaderValid &&<br>
> > > > + * !(3DSTATE_WM_HZ_OP::<wbr>DepthBufferClear ||<br>
> > > > + * 3DSTATE_WM_HZ_OP::<wbr>DepthBufferResolve ||<br>
> > > > + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve<br>
> > > Enable ||<br>
> > > > + * 3DSTATE_WM_HZ_OP::<wbr>StencilBufferClear) &&<br>
> > > > + * (COMP_STC_EN || STC_WRITE_EN) &&<br>
> > > > + * ((3DSTATE_PS_EXTRA::<wbr>PixelShaderKillsPixels ||<br>
> > > > + * 3DSTATE_WM::ForceKillPix == ON ||<br>
> > > > + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||<br>
> > > > + * 3DSTATE_PS_BLEND::<wbr>AlphaToCoverageEnable ||<br>
> > > > + * 3DSTATE_PS_BLEND::<wbr>AlphaTestEnable ||<br>
> > > > + * 3DSTATE_WM_CHROMAKEY::<wbr>ChromaKeyKillEnable) ||<br>
> > > > + * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode !=<br>
> > > PSCDEPTH_OFF))<br>
> > > > + */<br>
> > > > +<br>
> > > > + /* These are always true:<br>
> > > > + * 3DSTATE_WM::<wbr>ForceThreadDispatch != 1 &&<br>
> > > > + * !(3DSTATE_RASTER::<wbr>ForceSampleCount != NUMRASTSAMPLES_0)<br>
> > > > + */<br>
> > > > +<br>
> > > > + /* We only enable the PMA fix if HiZ is enabled. However, if HiZ is<br>
> > > > + * *not* enabled and we don't set this bit there is no harm.<br>
> > > Therefore,<br>
> > > > + * we can just treat the NULL framebuffer case as has_hiz == false<br>
> > > and<br>
> > > > + * everything will work just fine.<br>
> > > > + */<br>
> > > > + if (cmd_buffer->state.framebuffer == NULL)<br>
> > > > + return false;<br>
> > > > +<br>
> > > > + /* (3DSTATE_DEPTH_BUFFER::<wbr>SURFACE_TYPE != NULL) &&<br>
> > > > + * 3DSTATE_DEPTH_BUFFER::HIZ Enable<br>
> > > > + */<br>
> > > > + const struct anv_image_view *ds_iview =<br>
> > > > + anv_cmd_buffer_get_depth_<wbr>stencil_view(cmd_buffer);<br>
> > > > + if (!ds_iview || ds_iview->image->aux_usage != ISL_AUX_USAGE_HIZ)<br>
> > ><br>
> > > I think there's an error here. For depth buffers, anv_image::aux_usage<br>
> > > only indicates if there is an auxiliary HiZ buffer present. It does not<br>
> > > indicate that the HiZ buffer is being used within a render pass. For<br>
> > > that you'd have to check anv_attachment_state::aux_<wbr>usage.<br>
> > ><br>
> ><br>
> > So things just got more interesting... The Vulkan spec only requires that<br>
> > the render pass provided to pInheritanceInfo for the vkBeginCommandBuffer<br>
> > of a secondary command buffer to be "compatible" with the render pass<br>
> > provided to vkCmdBeginRenderPass. The rules for compatibility explicitly<br>
> > allow the initial and final layouts of the attachments to differ. This<br>
> > means that, if HiZ usage within a render pass is determined by the initial<br>
> > layout, we can never know if HiZ is enabled in a secondary command buffer<br>
> > and we always have to leave the PMA stall off. That's fairly brutal to<br>
> > apps.<br>
><br>
> That is interesting. I'm glad you found this. Perhaps we can get around<br>
> this by computing part of the PMA fix at the start of the subpass, and<br>
> the latter portion at the draw call?<br>
><br>
<br>
</div></div>That seems to be what your v2 patches are doing, so the problem is<br>
solved right?<span class="HOEnZb"><font color="#888888"><br></font></span></blockquote><div><br></div><div>Basically, yes in the sense that it's correct. However, the new hiz_enabled bit is always false inside secondaries so they take a perf hit.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span class="HOEnZb"><font color="#888888">
-Nanley<br>
</font></span><div class="HOEnZb"><div class="h5"><br>
> ><br>
> > How bad would it be to enable HiZ within the render pass for the case where<br>
> > they use the GENERAL layout and then resolve at the end? They used the<br>
> > GENERAL layout so they know they won't get fantastic performance so maybe<br>
> > that's ok?<br>
> ><br>
><br>
> Now that we've seen a benchmark out in the wild that uses depth input<br>
> attachments, I think we'll want the GENERAL layout to be fast and for<br>
> our software handling it to be straight-forward. I think we can<br>
> accomplish this by adding HiZ R/W support to BLORP. A local series of<br>
> mine that enables HiZ for input attachments takes a step towards this by<br>
> getting rid of the read-depth-through-HiZ situation we currently have<br>
> for the GENERAL layout.<br></div></div></blockquote><div><br></div><div>We can do whatever we want inside the render pass. The initial and final layouts are just that, "initial" and "final". We can do an implicit layout transition from initial to DEPTH_STENCIL_ATTACHMENT_OPTIMAL at the top of the render pass and then transition to final at the end. It just clients which use GENERAL when they don't need to will get resolves. If you don't want resolves, use layout transitions properly and you won't get them.<br><br></div><div>Yes, adding depth/hiz support to BLORP will let us leave HiZ on for all GENERAL layouts but I don't think it's needed to solve the current problem or to get fast depth input attachments.<br></div><div><br></div><div>--Jason<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="HOEnZb"><div class="h5">
> -Nanley<br>
><br>
> > Either way, I'll respin and make it correct. We can sort this issue out and<br>
> > hopefully give secondary command buffer users their 2-3% later.<br>
> ><br>
> > --Jason<br>
> ><br>
> ><br>
> > > -Nanley<br>
> > ><br>
> > > > + return false;<br>
> > > > +<br>
> > > > + /* 3DSTATE_PS_EXTRA::<wbr>PixelShaderValid */<br>
> > > > + struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;<br>
> > > > + if (!anv_pipeline_has_stage(<wbr>pipeline, MESA_SHADER_FRAGMENT))<br>
> > > > + return false;<br>
> > > > +<br>
> > > > + /* !(3DSTATE_WM::EDSC_Mode == 2) */<br>
> > > > + const struct brw_wm_prog_data *wm_prog_data =<br>
> > > get_wm_prog_data(pipeline);<br>
> > > > + if (wm_prog_data->early_fragment_<wbr>tests)<br>
> > > > + return false;<br>
> > > > +<br>
> > > > + /* We never use anv_pipeline for HiZ ops so this is trivially true:<br>
> > > > + * !(3DSTATE_WM_HZ_OP::<wbr>DepthBufferClear ||<br>
> > > > + * 3DSTATE_WM_HZ_OP::<wbr>DepthBufferResolve ||<br>
> > > > + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||<br>
> > > > + * 3DSTATE_WM_HZ_OP::<wbr>StencilBufferClear)<br>
> > > > + */<br>
> > > > +<br>
> > > > + /* 3DSTATE_STENCIL_BUFFER::<wbr>STENCIL_BUFFER_ENABLE &&<br>
> > > > + * 3DSTATE_WM_DEPTH_STENCIL::<wbr>StencilTestEnable<br>
> > > > + */<br>
> > > > + const bool stc_test_en =<br>
> > > > + (ds_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&<br>
> > > > + pipeline->stencil_test_enable;<br>
> > > > +<br>
> > > > + /* 3DSTATE_STENCIL_BUFFER::<wbr>STENCIL_BUFFER_ENABLE &&<br>
> > > > + * (3DSTATE_WM_DEPTH_STENCIL::<wbr>Stencil Buffer Write Enable &&<br>
> > > > + * 3DSTATE_DEPTH_BUFFER::STENCIL_<wbr>WRITE_ENABLE)<br>
> > > > + */<br>
> > > > + const bool stc_write_en =<br>
> > > > + (ds_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&<br>
> > > > + pipeline->writes_stencil;<br>
> > > > +<br>
> > > > + /* STC_TEST_EN && 3DSTATE_PS_EXTRA::<wbr>PixelShaderComputesStencil */<br>
> > > > + const bool comp_stc_en = stc_test_en &&<br>
> > > wm_prog_data->computed_<wbr>stencil;<br>
> > > > +<br>
> > > > + /* COMP_STC_EN || STC_WRITE_EN */<br>
> > > > + if (!(comp_stc_en || stc_write_en))<br>
> > > > + return false;<br>
> > > > +<br>
> > > > + /* (3DSTATE_PS_EXTRA::<wbr>PixelShaderKillsPixels ||<br>
> > > > + * 3DSTATE_WM::ForceKillPix == ON ||<br>
> > > > + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||<br>
> > > > + * 3DSTATE_PS_BLEND::<wbr>AlphaToCoverageEnable ||<br>
> > > > + * 3DSTATE_PS_BLEND::<wbr>AlphaTestEnable ||<br>
> > > > + * 3DSTATE_WM_CHROMAKEY::<wbr>ChromaKeyKillEnable) ||<br>
> > > > + * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode !=<br>
> > > PSCDEPTH_OFF)<br>
> > > > + */<br>
> > > > + return pipeline->kill_pixel ||<br>
> > > > + wm_prog_data->computed_depth_<wbr>mode != PSCDEPTH_OFF;<br>
> > > > +}<br>
> > > > +<br>
> > > > void<br>
> > > > genX(cmd_buffer_flush_dynamic_<wbr>state)(struct anv_cmd_buffer *cmd_buffer)<br>
> > > > {<br>
> > > > @@ -398,6 +541,7 @@ genX(cmd_buffer_flush_dynamic_<wbr>state)(struct<br>
> > > anv_cmd_buffer *cmd_buffer)<br>
> > > > }<br>
> > > ><br>
> > > > if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |<br>
> > > > + ANV_CMD_DIRTY_RENDER_TARGETS |<br>
> > > > ANV_CMD_DIRTY_DYNAMIC_STENCIL_<wbr>COMPARE_MASK<br>
> > > |<br>
> > > > ANV_CMD_DIRTY_DYNAMIC_STENCIL_<wbr>WRITE_MASK<br>
> > > |<br>
> > > > ANV_CMD_DIRTY_DYNAMIC_STENCIL_<wbr>REFERENCE))<br>
> > > {<br>
> > > > @@ -423,6 +567,9 @@ genX(cmd_buffer_flush_dynamic_<wbr>state)(struct<br>
> > > anv_cmd_buffer *cmd_buffer)<br>
> > > ><br>
> > > > anv_batch_emit_merge(&cmd_<wbr>buffer->batch, dwords,<br>
> > > > pipeline->gen9.wm_depth_<wbr>stencil);<br>
> > > > +<br>
> > > > + genX(cmd_buffer_enable_pma_<wbr>fix)(cmd_buffer,<br>
> > > > + want_stencil_pma_fix(cmd_buffe<br>
> > > r));<br>
> > > > }<br>
> > > > #endif<br>
> > > ><br>
> > > > diff --git a/src/intel/vulkan/genX_<wbr>pipeline.c<br>
> > > b/src/intel/vulkan/genX_<wbr>pipeline.c<br>
> > > > index 3f701f3..3087037 100644<br>
> > > > --- a/src/intel/vulkan/genX_<wbr>pipeline.c<br>
> > > > +++ b/src/intel/vulkan/genX_<wbr>pipeline.c<br>
> > > > @@ -742,6 +742,7 @@ emit_ds_state(struct anv_pipeline *pipeline,<br>
> > > > * to make sure it's initialized to something useful.<br>
> > > > */<br>
> > > > pipeline->writes_stencil = false;<br>
> > > > + pipeline->stencil_test_enable = false;<br>
> > > > pipeline->writes_depth = false;<br>
> > > > pipeline->depth_test_enable = false;<br>
> > > > memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));<br>
> > > > @@ -757,6 +758,7 @@ emit_ds_state(struct anv_pipeline *pipeline,<br>
> > > ><br>
> > > > VkPipelineDepthStencilStateCre<wbr>ateInfo info = *pCreateInfo;<br>
> > > > sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);<br>
> > > > + pipeline->stencil_test_enable = info.stencilTestEnable;<br>
> > > > pipeline->writes_depth = info.depthWriteEnable;<br>
> > > > pipeline->depth_test_enable = info.depthTestEnable;<br>
> > > ><br>
> > > > @@ -1514,8 +1516,8 @@ compute_kill_pixel(struct anv_pipeline *pipeline,<br>
> > > > const struct brw_wm_prog_data *wm_prog_data =<br>
> > > get_wm_prog_data(pipeline);<br>
> > > ><br>
> > > > /* This computes the KillPixel portion of the computation for<br>
> > > whether or<br>
> > > > - * not we want to enable the PMA fix on gen8. It's given by this<br>
> > > chunk of<br>
> > > > - * the giant formula:<br>
> > > > + * not we want to enable the PMA fix on gen8 or gen9. It's given by<br>
> > > this<br>
> > > > + * chunk of the giant formula:<br>
> > > > *<br>
> > > > * (3DSTATE_PS_EXTRA::<wbr>PixelShaderKillsPixels ||<br>
> > > > * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||<br>
> > > > --<br>
> > > > 2.5.0.400.gff86faf<br>
> > > ><br>
> > > > ______________________________<wbr>_________________<br>
> > > > mesa-dev mailing list<br>
> > > > <a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
> > > > <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
> > ><br>
</div></div></blockquote></div><br></div></div>