<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Thu, Feb 2, 2017 at 10:55 AM, Nanley Chery <span dir="ltr"><<a href="mailto:nanleychery@gmail.com" target="_blank">nanleychery@gmail.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="m_-49594673675494927HOEnZb"><div class="m_-49594673675494927h5">On Wed, Feb 01, 2017 at 10:11:42PM -0800, Jason Ekstrand wrote:<br>
> This improves the performance of Dota 2 on my Sky Lake Skull Canyon<br>
> machine by about 2-3%.<br>
> ---<br>
> src/intel/vulkan/anv_private.<wbr>h | 1 +<br>
> src/intel/vulkan/gen8_cmd_buff<wbr>er.c | 155 ++++++++++++++++++++++++++++++<wbr>++++++-<br>
> src/intel/vulkan/genX_pipeline<wbr>.c | 6 +-<br>
> 3 files changed, 156 insertions(+), 6 deletions(-)<br>
><br>
> diff --git a/src/intel/vulkan/anv_private<wbr>.h b/src/intel/vulkan/anv_private<wbr>.h<br>
> index 5fe4dd8..e7ad351 100644<br>
> --- a/src/intel/vulkan/anv_private<wbr>.h<br>
> +++ b/src/intel/vulkan/anv_private<wbr>.h<br>
> @@ -1475,6 +1475,7 @@ struct anv_pipeline {<br>
> bool writes_depth;<br>
> bool depth_test_enable;<br>
> bool writes_stencil;<br>
> + bool stencil_test_enable;<br>
> bool depth_clamp_enable;<br>
> bool kill_pixel;<br>
><br>
> diff --git a/src/intel/vulkan/gen8_cmd_bu<wbr>ffer.c b/src/intel/vulkan/gen8_cmd_bu<wbr>ffer.c<br>
> index b877e27..553f0c3 100644<br>
> --- a/src/intel/vulkan/gen8_cmd_bu<wbr>ffer.c<br>
> +++ b/src/intel/vulkan/gen8_cmd_bu<wbr>ffer.c<br>
> @@ -157,16 +157,39 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)<br>
> void<br>
> genX(cmd_buffer_enable_pma_fix<wbr>)(struct anv_cmd_buffer *cmd_buffer, bool enable)<br>
> {<br>
> -#if GEN_GEN == 8<br>
> if (cmd_buffer->state.pma_fix_ena<wbr>bled == enable)<br>
> return;<br>
><br>
> + cmd_buffer->state.pma_fix_ena<wbr>bled = enable;<br>
> +<br>
> + /* According to the Broadwell PIPE_CONTROL documentation, software should<br>
> + * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set<br>
> + * prior to the LRI. If stencil buffer writes are enabled, then a Render<br>
> + * Cache Flush is also necessary.<br>
> + *<br>
> + * The Sky Lake docs say to use a depth stall rather than a command<br>
> + * streamer stall. However, the hardware seems to violently disagree.<br>
> + * A full command streamer stall seems to be needed in both cases.<br>
> + */<br>
> anv_batch_emit(&cmd_buffer->b<wbr>atch, GENX(PIPE_CONTROL), pc) {<br>
> pc.DepthCacheFlushEnable = true;<br>
> pc.CommandStreamerStallEnable = true;<br>
> pc.RenderTargetCacheFlushEnabl<wbr>e = true;<br>
> }<br>
><br>
> +#if GEN_GEN == 9<br>
> +<br>
> + uint32_t cache_mode;<br>
> + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),<br>
> + .STCPMAOptimizationEnable = enable,<br>
> + .STCPMAOptimizationEnableMask = true);<br>
> + anv_batch_emit(&cmd_buffer->b<wbr>atch, GENX(MI_LOAD_REGISTER_IMM), lri) {<br>
> + lri.RegisterOffset = GENX(CACHE_MODE_0_num);<br>
> + lri.DataDWord = cache_mode;<br>
> + }<br>
> +<br>
> +#elif GEN_GEN == 8<br>
> +<br>
> uint32_t cache_mode;<br>
> anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),<br>
> .NPPMAFixEnable = enable,<br>
> @@ -178,18 +201,20 @@ genX(cmd_buffer_enable_pma_fix<wbr>)(struct anv_cmd_buffer *cmd_buffer, bool enable)<br>
> lri.DataDWord = cache_mode;<br>
> }<br>
><br>
> +#endif /* GEN_GEN == 8 */<br>
> +<br>
> /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache<br>
> * Flush bits is often necessary. We do it regardless because it's easier.<br>
> * The render cache flush is also necessary if stencil writes are enabled.<br>
> + *<br>
> + * Again, the Sky Lake docs give a different set of flushes but the BDW<br>
> + * flushes seem to work just as well.<br>
> */<br>
> anv_batch_emit(&cmd_buffer->b<wbr>atch, GENX(PIPE_CONTROL), pc) {<br>
> pc.DepthStallEnable = true;<br>
> pc.DepthCacheFlushEnable = true;<br>
> pc.RenderTargetCacheFlushEnabl<wbr>e = true;<br>
> }<br>
> -<br>
> - cmd_buffer->state.pma_fix_ena<wbr>bled = enable;<br>
> -#endif /* GEN_GEN == 8 */<br>
> }<br>
><br>
> static inline bool<br>
> @@ -289,6 +314,124 @@ want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)<br>
> wm_prog_data->computed_depth_m<wbr>ode != PSCDEPTH_OFF;<br>
> }<br>
><br>
> +static inline bool<br>
> +want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer)<br>
> +{<br>
> + assert(GEN_GEN == 9);<br>
> +<br>
> + /* From the Sky Lake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable:<br>
> + *<br>
> + * Clearing this bit will force the STC cache to wait for pending<br>
> + * retirement of pixels at the HZ-read stage and do the STC-test for<br>
> + * Non-promoted, R-computed and Computed depth modes instead of<br>
> + * postponing the STC-test to RCPFE.<br>
> + *<br>
> + * STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCI<wbr>L_BUFFER_ENABLE &&<br>
> + * 3DSTATE_WM_DEPTH_STENCIL::Sten<wbr>cilTestEnable<br>
> + *<br>
> + * STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCI<wbr>L_BUFFER_ENABLE &&<br>
> + * (3DSTATE_WM_DEPTH_STENCIL::St<wbr>encil Buffer Write Enable &&<br>
> + * 3DSTATE_DEPTH_BUFFER::STENCIL_<wbr>WRITE_ENABLE)<br>
> + *<br>
> + * COMP_STC_EN = STC_TEST_EN &&<br>
> + * 3DSTATE_PS_EXTRA::PixelShaderC<wbr>omputesStencil<br>
> + *<br>
> + * SW parses the pipeline states to generate the following logical<br>
> + * signal indicating if PMA FIX can be enabled.<br>
> + *<br>
> + * STC_PMA_OPT =<br>
> + * 3DSTATE_WM::ForceThreadDispat<wbr>ch != 1 &&<br>
> + * !(3DSTATE_RASTER::ForceSample<wbr>Count != NUMRASTSAMPLES_0) &&<br>
> + * 3DSTATE_DEPTH_BUFFER::<wbr>SURFACE_TYPE != NULL &&<br>
> + * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&<br>
> + * !(3DSTATE_WM::EDSC_Mode == 2) &&<br>
> + * 3DSTATE_PS_EXTRA::PixelShader<wbr>Valid &&<br>
> + * !(3DSTATE_WM_HZ_OP::DepthBuff<wbr>erClear ||<br>
> + * 3DSTATE_WM_HZ_OP::DepthBuffer<wbr>Resolve ||<br>
> + * 3DSTATE_WM_HZ_OP::<wbr>Hierarchical Depth Buffer Resolve Enable ||<br>
> + * 3DSTATE_WM_HZ_OP::StencilBuff<wbr>erClear) &&<br>
> + * (COMP_STC_EN || STC_WRITE_EN) &&<br>
> + * ((3DSTATE_PS_EXTRA::PixelShad<wbr>erKillsPixels ||<br>
> + * 3DSTATE_WM::ForceKillPix == ON ||<br>
> + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||<br>
> + * 3DSTATE_PS_BLEND::AlphaToCove<wbr>rageEnable ||<br>
> + * 3DSTATE_PS_BLEND::AlphaTestEn<wbr>able ||<br>
> + * 3DSTATE_WM_CHROMAKEY::ChromaK<wbr>eyKillEnable) ||<br>
> + * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF))<br>
> + */<br>
> +<br>
> + /* These are always true:<br>
> + * 3DSTATE_WM::ForceThreadDispatc<wbr>h != 1 &&<br>
> + * !(3DSTATE_RASTER::ForceSampleC<wbr>ount != NUMRASTSAMPLES_0)<br>
> + */<br>
> +<br>
> + /* We only enable the PMA fix if HiZ is enabled. However, if HiZ is<br>
> + * *not* enabled and we don't set this bit there is no harm. Therefore,<br>
> + * we can just treat the NULL framebuffer case as has_hiz == false and<br>
> + * everything will work just fine.<br>
> + */<br>
> + if (cmd_buffer->state.framebuffer == NULL)<br>
> + return false;<br>
> +<br>
> + /* (3DSTATE_DEPTH_BUFFER::SURFACE<wbr>_TYPE != NULL) &&<br>
> + * 3DSTATE_DEPTH_BUFFER::HIZ Enable<br>
> + */<br>
> + const struct anv_image_view *ds_iview =<br>
> + anv_cmd_buffer_get_depth_stenc<wbr>il_view(cmd_buffer);<br>
> + if (!ds_iview || ds_iview->image->aux_usage != ISL_AUX_USAGE_HIZ)<br>
<br>
</div></div>I think there's an error here. For depth buffers, anv_image::aux_usage<br>
only indicates if there is an auxiliary HiZ buffer present. It does not<br>
indicate that the HiZ buffer is being used within a render pass. For<br>
that you'd have to check anv_attachment_state::aux_usag<wbr>e.<br></blockquote><div><br></div><div>So things just got more interesting... The Vulkan spec only requires that the render pass provided to pInheritanceInfo for the vkBeginCommandBuffer of a secondary command buffer to be "compatible" with the render pass provided to vkCmdBeginRenderPass. The rules for compatibility explicitly allow the initial and final layouts of the attachments to differ. This means that, if HiZ usage within a render pass is determined by the initial layout, we can never know if HiZ is enabled in a secondary command buffer and we always have to leave the PMA stall off. That's fairly brutal to apps.<br><br></div><div>How bad would it be to enable HiZ within the render pass for the case where they use the GENERAL layout and then resolve at the end? They used the GENERAL layout so they know they won't get fantastic performance so maybe that's ok?<br><br></div><div>Either way, I'll respin and make it correct. We can sort this issue out and hopefully give secondary command buffer users their 2-3% later.<br><br></div><div>--Jason<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
-Nanley<br>
<div><div class="m_-49594673675494927h5"><br>
> + return false;<br>
> +<br>
> + /* 3DSTATE_PS_EXTRA::PixelShaderV<wbr>alid */<br>
> + struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;<br>
> + if (!anv_pipeline_has_stage(pipel<wbr>ine, MESA_SHADER_FRAGMENT))<br>
> + return false;<br>
> +<br>
> + /* !(3DSTATE_WM::EDSC_Mode == 2) */<br>
> + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);<br>
> + if (wm_prog_data->early_fragment_<wbr>tests)<br>
> + return false;<br>
> +<br>
> + /* We never use anv_pipeline for HiZ ops so this is trivially true:<br>
> + * !(3DSTATE_WM_HZ_OP::DepthBuffe<wbr>rClear ||<br>
> + * 3DSTATE_WM_HZ_OP::DepthBufferR<wbr>esolve ||<br>
> + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||<br>
> + * 3DSTATE_WM_HZ_OP::StencilBuffe<wbr>rClear)<br>
> + */<br>
> +<br>
> + /* 3DSTATE_STENCIL_BUFFER::STENCI<wbr>L_BUFFER_ENABLE &&<br>
> + * 3DSTATE_WM_DEPTH_STENCIL::Sten<wbr>cilTestEnable<br>
> + */<br>
> + const bool stc_test_en =<br>
> + (ds_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&<br>
> + pipeline->stencil_test_enable;<br>
> +<br>
> + /* 3DSTATE_STENCIL_BUFFER::STENCI<wbr>L_BUFFER_ENABLE &&<br>
> + * (3DSTATE_WM_DEPTH_STENCIL::Ste<wbr>ncil Buffer Write Enable &&<br>
> + * 3DSTATE_DEPTH_BUFFER::STENCIL_<wbr>WRITE_ENABLE)<br>
> + */<br>
> + const bool stc_write_en =<br>
> + (ds_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&<br>
> + pipeline->writes_stencil;<br>
> +<br>
> + /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderC<wbr>omputesStencil */<br>
> + const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil<wbr>;<br>
> +<br>
> + /* COMP_STC_EN || STC_WRITE_EN */<br>
> + if (!(comp_stc_en || stc_write_en))<br>
> + return false;<br>
> +<br>
> + /* (3DSTATE_PS_EXTRA::PixelShader<wbr>KillsPixels ||<br>
> + * 3DSTATE_WM::ForceKillPix == ON ||<br>
> + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||<br>
> + * 3DSTATE_PS_BLEND::AlphaToCover<wbr>ageEnable ||<br>
> + * 3DSTATE_PS_BLEND::AlphaTestEna<wbr>ble ||<br>
> + * 3DSTATE_WM_CHROMAKEY::ChromaKe<wbr>yKillEnable) ||<br>
> + * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)<br>
> + */<br>
> + return pipeline->kill_pixel ||<br>
> + wm_prog_data->computed_depth_m<wbr>ode != PSCDEPTH_OFF;<br>
> +}<br>
> +<br>
> void<br>
> genX(cmd_buffer_flush_dynamic_<wbr>state)(struct anv_cmd_buffer *cmd_buffer)<br>
> {<br>
> @@ -398,6 +541,7 @@ genX(cmd_buffer_flush_dynamic_<wbr>state)(struct anv_cmd_buffer *cmd_buffer)<br>
> }<br>
><br>
> if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |<br>
> + ANV_CMD_DIRTY_RENDER_TARGETS |<br>
> ANV_CMD_DIRTY_DYNAMIC_STENCIL_<wbr>COMPARE_MASK |<br>
> ANV_CMD_DIRTY_DYNAMIC_STENCIL_<wbr>WRITE_MASK |<br>
> ANV_CMD_DIRTY_DYNAMIC_STENCIL_<wbr>REFERENCE)) {<br>
> @@ -423,6 +567,9 @@ genX(cmd_buffer_flush_dynamic_<wbr>state)(struct anv_cmd_buffer *cmd_buffer)<br>
><br>
> anv_batch_emit_merge(&cmd_buff<wbr>er->batch, dwords,<br>
> pipeline->gen9.wm_depth_stenc<wbr>il);<br>
> +<br>
> + genX(cmd_buffer_enable_pma_fix<wbr>)(cmd_buffer,<br>
> + want_stencil_pma_fix(cmd_buffe<wbr>r));<br>
> }<br>
> #endif<br>
><br>
> diff --git a/src/intel/vulkan/genX_pipeli<wbr>ne.c b/src/intel/vulkan/genX_pipeli<wbr>ne.c<br>
> index 3f701f3..3087037 100644<br>
> --- a/src/intel/vulkan/genX_pipeli<wbr>ne.c<br>
> +++ b/src/intel/vulkan/genX_pipeli<wbr>ne.c<br>
> @@ -742,6 +742,7 @@ emit_ds_state(struct anv_pipeline *pipeline,<br>
> * to make sure it's initialized to something useful.<br>
> */<br>
> pipeline->writes_stencil = false;<br>
> + pipeline->stencil_test_enable = false;<br>
> pipeline->writes_depth = false;<br>
> pipeline->depth_test_enable = false;<br>
> memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));<br>
> @@ -757,6 +758,7 @@ emit_ds_state(struct anv_pipeline *pipeline,<br>
><br>
> <wbr>VkPipelineDepthStencilStateCre<wbr>ateInfo info = *pCreateInfo;<br>
> sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);<br>
> + pipeline->stencil_test_enable = info.stencilTestEnable;<br>
> pipeline->writes_depth = info.depthWriteEnable;<br>
> pipeline->depth_test_enable = info.depthTestEnable;<br>
><br>
> @@ -1514,8 +1516,8 @@ compute_kill_pixel(struct anv_pipeline *pipeline,<br>
> const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);<br>
><br>
> /* This computes the KillPixel portion of the computation for whether or<br>
> - * not we want to enable the PMA fix on gen8. It's given by this chunk of<br>
> - * the giant formula:<br>
> + * not we want to enable the PMA fix on gen8 or gen9. It's given by this<br>
> + * chunk of the giant formula:<br>
> *<br>
> * (3DSTATE_PS_EXTRA::PixelShader<wbr>KillsPixels ||<br>
> * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||<br>
> --<br>
> 2.5.0.400.gff86faf<br>
><br>
</div></div>> ______________________________<wbr>_________________<br>
> mesa-dev mailing list<br>
> <a href="mailto:mesa-dev@lists.freedesktop.org" target="_blank">mesa-dev@lists.freedesktop.org</a><br>
> <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
</blockquote></div><br></div></div>