<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Wed, Feb 8, 2017 at 8:11 PM, Jason Ekstrand <span dir="ltr"><<a href="mailto:jason@jlekstrand.net" target="_blank">jason@jlekstrand.net</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div dir="ltr"><div class="gmail_extra"><div class="gmail_quote"><div><div class="h5">On Wed, Feb 8, 2017 at 6:27 PM, Jason Ekstrand <span dir="ltr"><<a href="mailto:jason@jlekstrand.net" target="_blank">jason@jlekstrand.net</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div dir="ltr"><div class="gmail_extra"><div class="gmail_quote"><div><div class="m_-5995240216295546008h5">On Wed, Feb 8, 2017 at 5:34 PM, Nanley Chery <span dir="ltr"><<a href="mailto:nanleychery@gmail.com" target="_blank">nanleychery@gmail.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="m_-5995240216295546008m_-3958707379954501137HOEnZb"><div class="m_-5995240216295546008m_-3958707379954501137h5">On Thu, Feb 02, 2017 at 01:26:05PM -0800, Jason Ekstrand wrote:<br>
> In order to get good performance numbers for this, I had to hack up the<br>
> driver to whack wm_prog_data::uses_kill to true to emulate a discard and<br>
> used the Sascha "shadowmapping" demo.  Setting uses_kill to true dropped<br>
> the framerate on the demo by 25-30%.  Enabling the PMA fix brought it<br>
> back up to around 90% of the original framerate.  This doesn't seem to<br>
> really impact Dota 2;  probably because it doesn't use 16-bit depth.<br>
><br>
> Reviewed-by: Lionel Landwerlin <<a href="mailto:lionel.g.landwerlin@intel.com" target="_blank">lionel.g.landwerlin@intel.com</a><wbr>><br>
> ---<br>
>  src/intel/vulkan/TODO              |   1 -<br>
>  src/intel/vulkan/anv_cmd_buffe<wbr>r.c  |   2 +<br>
>  src/intel/vulkan/anv_genX.h        |   3 +<br>
>  src/intel/vulkan/anv_private.h<wbr>     |  17 +++++<br>
>  src/intel/vulkan/gen7_cmd_buff<wbr>er.c |   7 ++<br>
>  src/intel/vulkan/gen8_cmd_buff<wbr>er.c | 133 ++++++++++++++++++++++++++++++<wbr>+++++++<br>
>  src/intel/vulkan/genX_blorp_ex<wbr>ec.c |   5 ++<br>
>  src/intel/vulkan/genX_cmd_buff<wbr>er.c |  15 ++++-<br>
>  src/intel/vulkan/genX_pipeline<wbr>.c   |  38 +++++++++++<br>
>  9 files changed, 219 insertions(+), 2 deletions(-)<br>
><br>
> diff --git a/src/intel/vulkan/TODO b/src/intel/vulkan/TODO<br>
> index 38acc0d..f8b73a1 100644<br>
> --- a/src/intel/vulkan/TODO<br>
> +++ b/src/intel/vulkan/TODO<br>
> @@ -12,5 +12,4 @@ Performance:<br>
>   - Compressed multisample support<br>
>   - Pushing pieces of UBOs?<br>
>   - Enable guardband clipping<br>
> - - pma stall workaround<br>
>   - Use soft-pin to avoid relocations<br>
> diff --git a/src/intel/vulkan/anv_cmd_buf<wbr>fer.c b/src/intel/vulkan/anv_cmd_buf<wbr>fer.c<br>
> index 5886fa6..8c08f8d 100644<br>
> --- a/src/intel/vulkan/anv_cmd_buf<wbr>fer.c<br>
> +++ b/src/intel/vulkan/anv_cmd_buf<wbr>fer.c<br>
> @@ -135,6 +135,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)<br>
>     state->restart_index = UINT32_MAX;<br>
>     state->dynamic = default_dynamic_state;<br>
>     state->need_query_wa = true;<br>
> +   state->pma_fix_enabled = false;<br>
> +   state->hiz_enabled = false;<br>
><br>
>     if (state->attachments != NULL) {<br>
>        vk_free(&cmd_buffer->pool->all<wbr>oc, state->attachments);<br>
> diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h<br>
> index d04fe38..67147b0 100644<br>
> --- a/src/intel/vulkan/anv_genX.h<br>
> +++ b/src/intel/vulkan/anv_genX.h<br>
> @@ -55,6 +55,9 @@ void genX(cmd_buffer_flush_dynamic_<wbr>state)(struct anv_cmd_buffer *cmd_buffer);<br>
><br>
>  void genX(cmd_buffer_flush_compute_<wbr>state)(struct anv_cmd_buffer *cmd_buffer);<br>
><br>
> +void genX(cmd_buffer_enable_pma_fix<wbr>)(struct anv_cmd_buffer *cmd_buffer,<br>
> +                                     bool enable);<br>
> +<br>
>  void<br>
>  genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,<br>
>                       const struct gen_l3_config *l3_config,<br>
> diff --git a/src/intel/vulkan/anv_private<wbr>.h b/src/intel/vulkan/anv_private<wbr>.h<br>
> index 4fe3ebc..6efe4ea 100644<br>
> --- a/src/intel/vulkan/anv_private<wbr>.h<br>
> +++ b/src/intel/vulkan/anv_private<wbr>.h<br>
> @@ -1163,6 +1163,20 @@ struct anv_cmd_state {<br>
>     bool                                         need_query_wa;<br>
><br>
>     /**<br>
> +    * Whether or not the gen8 PMA fix is enabled.  We ensure that, at the top<br>
> +    * of any command buffer it disabled by disabling it in EndCommandBuffer<br>
</div></div>                                 ^<br>
                                 is?<br>
<span><br>
> +    * and before invoking the secondary in ExecuteCommands.<br>
> +    */<br>
> +   bool                                         pma_fix_enabled;<br>
> +<br>
> +   /**<br>
> +    * Whether or not we now for certain that HiZ is enabled for the current<br>
</span>                           ^<br>
                           know<br>
<div><div class="m_-5995240216295546008m_-3958707379954501137h5"><br>
> +    * subpass.  If, for whatever reason, we are unsure as to whether HiZ is<br>
> +    * enabled or not, this will be false.<br>
> +    */<br>
> +   bool                                         hiz_enabled;<br>
> +<br>
> +   /**<br>
>      * Array length is anv_cmd_state::pass::attachmen<wbr>t_count. Array content is<br>
>      * valid only when recording a render pass instance.<br>
>      */<br>
> @@ -1465,8 +1479,11 @@ struct anv_pipeline {<br>
><br>
>     uint32_t                                     cs_right_mask;<br>
><br>
> +   bool                                         writes_depth;<br>
> +   bool                                         depth_test_enable;<br>
>     bool                                         writes_stencil;<br>
>     bool                                         depth_clamp_enable;<br>
> +   bool                                         kill_pixel;<br>
><br>
>     struct {<br>
>        uint32_t                                  sf[7];<br>
> diff --git a/src/intel/vulkan/gen7_cmd_bu<wbr>ffer.c b/src/intel/vulkan/gen7_cmd_bu<wbr>ffer.c<br>
> index 013ed87..c1a25e8 100644<br>
> --- a/src/intel/vulkan/gen7_cmd_bu<wbr>ffer.c<br>
> +++ b/src/intel/vulkan/gen7_cmd_bu<wbr>ffer.c<br>
> @@ -260,6 +260,13 @@ genX(cmd_buffer_flush_dynamic_<wbr>state)(struct anv_cmd_buffer *cmd_buffer)<br>
>     cmd_buffer->state.dirty = 0;<br>
>  }<br>
><br>
> +void<br>
> +genX(cmd_buffer_enable_pma_fi<wbr>x)(struct anv_cmd_buffer *cmd_buffer,<br>
> +                                bool enable)<br>
> +{<br>
> +   /* The NP PMA fix doesn't exist on gen7 */<br>
> +}<br>
> +<br>
>  void genX(CmdSetEvent)(<br>
>      VkCommandBuffer                             commandBuffer,<br>
>      VkEvent                                     event,<br>
> diff --git a/src/intel/vulkan/gen8_cmd_bu<wbr>ffer.c b/src/intel/vulkan/gen8_cmd_bu<wbr>ffer.c<br>
> index 8c8de62..271ab3f 100644<br>
> --- a/src/intel/vulkan/gen8_cmd_bu<wbr>ffer.c<br>
> +++ b/src/intel/vulkan/gen8_cmd_bu<wbr>ffer.c<br>
> @@ -155,6 +155,135 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)<br>
>  #endif<br>
><br>
>  void<br>
> +genX(cmd_buffer_enable_pma_fi<wbr>x)(struct anv_cmd_buffer *cmd_buffer, bool enable)<br>
> +{<br>
> +#if GEN_GEN == 8<br>
> +   if (cmd_buffer->state.pma_fix_ena<wbr>bled == enable)<br>
> +      return;<br>
> +<br>
> +   anv_batch_emit(&cmd_buffer->b<wbr>atch, GENX(PIPE_CONTROL), pc) {<br>
> +      pc.DepthCacheFlushEnable = true;<br>
> +      pc.CommandStreamerStallEnable = true;<br>
> +      pc.RenderTargetCacheFlushEnabl<wbr>e = true;<br>
<br>
</div></div>Instead of flushing the depth and RT caches, what do you think about<br>
implementing a stall that writes 0 to the workaround bo? That should<br>
consume less bandwidth.<br>
<br>
To make the pipeline idle, flushing the caches shouldn't be necessary,<br>
we should be able to do it with a valid pipecontrol packet that has the<br>
CS stall bit set.<br></blockquote><div><br></div></div></div><div>Without having an extremely good idea of how these bits interact with caching, I'm very reluctant to try too hard to over-optimize the pipe control.<br></div><span><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
The BDW PRM states:<br>
<br>
   If the stall bit is set, the command streamer waits until the pipe is<br>
   completely flushed.<span><br></span></blockquote><div><br></div></span><div>It waits until whatever flush/stall operation you gave it is finished.  You also need to specify a stall/flush operation.  If you just do a depth stall for instance and no depth reads/writes are in-flight, I don't think it actually stalls at all.  We may be able to use StallAtPixelScoreboard instead but, again, I don't know for sure how this interacts with caches. <br></div><span><div> <br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span>
> +   }<br>
> +<br>
> +   uint32_t cache_mode;<br>
> +   anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),<br>
> +                   .NPPMAFixEnable = enable,<br>
> +                   .NPEarlyZFailsDisableMask = enable,<br>
</span>                                           ^<br>
I don't think you intended to set the mask here. It should be:<br>
<br>
                      .NPEarlyZFailsDisable = enable,<br>
<br>
Does fixing this impact your FPS measurements significantly?<span><br></span></blockquote><div><br></div></span><div>Thanks for catching that!  I'm a bit surprised that it worked at all.  I'll definitely re-measure.<br></div></div></div></div></blockquote><div><br></div></div></div><div>I just ran Dota2 and fixing this seems to help by around 4%!  I'll do a full re-run and get fresh numbers.<br></div></div></div></div></blockquote><div><br></div><div>I did a full re-benchmark of the series with the fixes from your review.  Now Dota 2 is helped by 8-9% by the PMA fix and none of the other patches seem to help.  I'm very confused.<br><br></div><div>When I say "none of the other patches help" that's not entirely true.  I'm seeing occational hangs (1 out of 4 runs) and they they seem to substantially decrease with the patch to disable stencil writes when the writemasks are zero.  I'm not sure what's going on there.  Maybe we're missing a workaround somewhere?<br><br></div><div>--Jason<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div dir="ltr"><div class="gmail_extra"><div class="gmail_quote"><div></div><div><div class="h5"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div dir="ltr"><div class="gmail_extra"><div class="gmail_quote"><div></div><span><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span>
> +                   .NPPMAFixEnableMask = true,<br>
> +                   .NPEarlyZFailsDisableMask = true);<br>
> +   anv_batch_emit(&cmd_buffer->b<wbr>atch, GENX(MI_LOAD_REGISTER_IMM), lri) {<br>
> +      lri.RegisterOffset   = GENX(CACHE_MODE_1_num);<br>
> +      lri.DataDWord        = cache_mode;<br>
> +   }<br>
> +<br>
> +   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache<br>
> +    * Flush bits is often necessary.  We do it regardless because it's easier.<br>
> +    * The render cache flush is also necessary if stencil writes are enabled.<br>
> +    */<br>
<br>
</span>I couldn't find a requirement to flush these specific caches in the<br>
docs, but I did find:<br></blockquote><div><br></div></span><div>Both of these flushes are documented in the bspec (not PRM) docs for PIPE_CONTROL.<br></div><div><div class="m_-5995240216295546008h5"><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
   To ensure this command gets executed before upcoming commands in the<br>
   ring, either a stalling pipecontrol should be sent after this<br>
   command, or MMIO 0x20C0 bit 7 should be set to 1.<br>
<br>
Perhaps we could use the flush I suggested earlier?<br>
<div><div class="m_-5995240216295546008m_-3958707379954501137h5"><br>
> +   anv_batch_emit(&cmd_buffer->b<wbr>atch, GENX(PIPE_CONTROL), pc) {<br>
> +      pc.DepthStallEnable = true;<br>
> +      pc.DepthCacheFlushEnable = true;<br>
> +      pc.RenderTargetCacheFlushEnabl<wbr>e = true;<br>
> +   }<br>
> +<br>
> +   cmd_buffer->state.pma_fix_ena<wbr>bled = enable;<br>
> +#endif /* GEN_GEN == 8 */<br>
> +}<br>
> +<br>
> +static inline bool<br>
> +want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)<br>
> +{<br>
> +   assert(GEN_GEN == 8);<br>
> +<br>
> +   /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABL<wbr>E:<br>
> +    *<br>
> +    *    SW must set this bit in order to enable this fix when following<br>
> +    *    expression is TRUE.<br>
> +    *<br>
> +    *    3DSTATE_WM::ForceThreadDispatc<wbr>h != 1 &&<br>
> +    *    !(3DSTATE_RASTER::ForceSampleC<wbr>ount != NUMRASTSAMPLES_0) &&<br>
> +    *    (3DSTATE_DEPTH_BUFFER::SURFACE<wbr>_TYPE != NULL) &&<br>
> +    *    (3DSTATE_DEPTH_BUFFER::HIZ Enable) &&<br>
> +    *    !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) &&<br>
> +    *    (3DSTATE_PS_EXTRA::PixelShader<wbr>Valid) &&<br>
> +    *    !(3DSTATE_WM_HZ_OP::DepthBuffe<wbr>rClear ||<br>
> +    *      3DSTATE_WM_HZ_OP::DepthBufferR<wbr>esolve ||<br>
> +    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||<br>
> +    *      3DSTATE_WM_HZ_OP::StencilBuffe<wbr>rClear) &&<br>
> +    *    (3DSTATE_WM_DEPTH_STENCIL::Dep<wbr>thTestEnable) &&<br>
> +    *    (((3DSTATE_PS_EXTRA::PixelShad<wbr>erKillsPixels ||<br>
> +    *       3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||<br>
> +    *       3DSTATE_PS_BLEND::AlphaToCove<wbr>rageEnable ||<br>
> +    *       3DSTATE_PS_BLEND::AlphaTestEn<wbr>able ||<br>
> +    *       3DSTATE_WM_CHROMAKEY::ChromaK<wbr>eyKillEnable) &&<br>
> +    *      3DSTATE_WM::ForceKillPix != ForceOff &&<br>
> +    *      ((3DSTATE_WM_DEPTH_STENCIL::De<wbr>pthWriteEnable &&<br>
> +    *        3DSTATE_DEPTH_BUFFER::DEPTH_WR<wbr>ITE_ENABLE) ||<br>
> +    *       (3DSTATE_WM_DEPTH_STENCIL::St<wbr>encil Buffer Write Enable &&<br>
> +    *        3DSTATE_DEPTH_BUFFER::STENCIL_<wbr>WRITE_ENABLE &&<br>
> +    *        3DSTATE_STENCIL_BUFFER::STENCI<wbr>L_BUFFER_ENABLE))) ||<br>
> +    *     (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))<br>
> +    *<br>
> +    * This function only takes care of the pipeline parts of the equation.<br>
<br>
</div></div>Which parts are the pipeline parts? It looks like you take care of the<br>
entire equation in this function.<span><br></span></blockquote><div><br></div></div></div><div>Yes, I do.<br></div><span><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span>
> +    */<br>
> +<br>
> +   /* These are always true:<br>
> +    *    3DSTATE_WM::ForceThreadDispatc<wbr>h != 1 &&<br>
> +    *    !(3DSTATE_RASTER::ForceSampleC<wbr>ount != NUMRASTSAMPLES_0)<br>
> +    */<br>
<br>
</span>I couldn't quite understand what ForcedSampleCount does from looking at<br>
the HW docs. If you know, could you tell me what this does and why it<br>
wouldn't change under any circumstance in the future?  Otherwise, I<br>
think we should put an assertion where we assign ForcedSampleCount and<br>
reference this PMA optimization.<span><br></span></blockquote><div><br></div></span><div>Honestly, I'm not 100% sure what it's for.  I think it lets you make the rasterizer run at a different number of samples than the underlying surface.  We've never used this bit in Vulkan or GL.  Also, there's no flag for us to assert on.<br></div><span><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span>
> +<br>
> +   /* We only enable the PMA fix if we know for certain that HiZ is enabled.<br>
> +    * If we don't know whether HiZ is enabled or not, we disable the PMA fix<br>
> +    * and there is no harm.<br>
> +    *<br>
> +    * (3DSTATE_DEPTH_BUFFER::SURFACE<wbr>_TYPE != NULL) &&<br>
> +    * 3DSTATE_DEPTH_BUFFER::HIZ Enable<br>
> +    */<br>
> +   if (!cmd_buffer->state.hiz_enable<wbr>d)<br>
> +      return false;<br>
> +<br>
> +   /* 3DSTATE_PS_EXTRA::PixelShaderV<wbr>alid */<br>
> +   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;<br>
> +   if (!anv_pipeline_has_stage(pipel<wbr>ine, MESA_SHADER_FRAGMENT))<br>
> +      return false;<br>
> +<br>
> +   /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */<br>
> +   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);<br>
> +   if (wm_prog_data->early_fragment_<wbr>tests)<br>
> +      return false;<br>
<br>
</span>In keeping with your top-to-bottom evaluation of the logical statement,<br>
could you move this part to be above the PixelShaderValid part?<span><br></span></blockquote><div><br></div></span><div>Not really.  If we don't' have a fragment shader, we wm_prog_data is NULL<br></div><span><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span>
> +<br>
> +   /* We never use anv_pipeline for HiZ ops so this is trivially true:<br>
> +   *    !(3DSTATE_WM_HZ_OP::DepthBuffe<wbr>rClear ||<br>
</span>      ^<br>
      misaligned asterisk<span><br></span></blockquote><div><br></div></span><div>Fixed.<br></div><div><div class="m_-5995240216295546008h5"><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span>
> +    *      3DSTATE_WM_HZ_OP::DepthBufferR<wbr>esolve ||<br>
> +    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||<br>
> +    *      3DSTATE_WM_HZ_OP::StencilBuffe<wbr>rClear)<br>
> +    */<br>
> +<br>
> +   /* 3DSTATE_WM_DEPTH_STENCIL::Dept<wbr>hTestEnable */<br>
> +   if (!pipeline->depth_test_enable)<br>
> +      return false;<br>
> +<br>
> +   /* (((3DSTATE_PS_EXTRA::PixelShad<wbr>erKillsPixels ||<br>
> +    *    3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||<br>
> +    *    3DSTATE_PS_BLEND::AlphaToCover<wbr>ageEnable ||<br>
> +    *    3DSTATE_PS_BLEND::AlphaTestEna<wbr>ble ||<br>
> +    *    3DSTATE_WM_CHROMAKEY::ChromaKe<wbr>yKillEnable) &&<br>
> +    *   3DSTATE_WM::ForceKillPix != ForceOff &&<br>
> +    *   ((3DSTATE_WM_DEPTH_STENCIL::D<wbr>epthWriteEnable &&<br>
> +    *     3DSTATE_DEPTH_BUFFER::DEPTH_W<wbr>RITE_ENABLE) ||<br>
> +    *    (3DSTATE_WM_DEPTH_STENCIL::Ste<wbr>ncil Buffer Write Enable &&<br>
> +    *     3DSTATE_DEPTH_BUFFER::STENCIL<wbr>_WRITE_ENABLE &&<br>
> +    *     3DSTATE_STENCIL_BUFFER::STENC<wbr>IL_BUFFER_ENABLE))) ||<br>
> +    *  (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))<br>
> +    */<br>
> +   return (pipeline->kill_pixel && (pipeline->writes_depth ||<br>
> +                                    pipeline->writes_stencil)) ||<br>
<br>
</span>anv_pipeline::writes_stencil !=<br>
<span>(3DSTATE_WM_DEPTH_STENCIL::Ste<wbr>ncil Buffer Write Enable &&<br>
</span> 3DSTATE_DEPTH_BUFFER::STENCIL<wbr>_WRITE_ENABLE &&<br>
 3DSTATE_STENCIL_BUFFER::STENC<wbr>IL_BUFFER_ENABLE);<br>
<br>
The last two lines are not accounted for - we don't detect if the<br>
currently bound depth_stencil attachment lacks or possesses a stencil<br>
buffer.<br></blockquote><div><br></div></div></div>We should be checking for the presence of a stencil aspect in the pipeline.  I think that got missed when I added the writes_stencil bit.  We do by the end of the series, but I'll patch up the disable stencil writes patch to make it check.<span><br><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
This is a valid scenario according to section 25.9. Stencil Test of the<br>
Vulkan spec, which states:<br>
<br>
   If there is no stencil framebuffer attachment, stencil modification<br>
   cannot occur, and it is as if the stencil tests always pass.<br>
<br>
anv_pipeline::writes_depth should be correct because hiz_enabled is true<br>
by the time we reach it. For hiz_enabled to be true, there must be a<br>
depth buffer present.<span><br></span></blockquote><div><br></div></span><div>Yup<br></div><span><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span>
> +          wm_prog_data->computed_depth_m<wbr>ode != PSCDEPTH_OFF;<br>
> +}<br>
> +<br>
> +void<br>
>  genX(cmd_buffer_flush_dynamic_<wbr>state)(struct anv_cmd_buffer *cmd_buffer)<br>
>  {<br>
>     struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;<br>
> @@ -211,6 +340,7 @@ genX(cmd_buffer_flush_dynamic_<wbr>state)(struct anv_cmd_buffer *cmd_buffer)<br>
>     }<br>
><br>
>     if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |<br>
> +                                  ANV_CMD_DIRTY_RENDER_TARGETS |<br>
<br>
</span>Why is this necessary?<br><div><div class="m_-5995240216295546008m_-3958707379954501137h5"></div></div></blockquote><div><br></div></span><div>We set RENDER_TARGETS dirty when the depth buffer changes due to NextSubpass or BeginCommandBuffer<br></div><div><div class="m_-5995240216295546008h5"><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div><div class="m_-5995240216295546008m_-3958707379954501137h5">
>                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_<wbr>COMPARE_MASK |<br>
>                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_<wbr>WRITE_MASK)) {<br>
>        uint32_t wm_depth_stencil_dw[GENX(3DSTA<wbr>TE_WM_DEPTH_STENCIL_length)];<br>
> @@ -234,6 +364,9 @@ genX(cmd_buffer_flush_dynamic_<wbr>state)(struct anv_cmd_buffer *cmd_buffer)<br>
><br>
>        anv_batch_emit_merge(&cmd_buff<wbr>er->batch, wm_depth_stencil_dw,<br>
>                             pipeline->gen8.wm_depth_stenc<wbr>il);<br>
> +<br>
> +      genX(cmd_buffer_enable_pma_fix<wbr>)(cmd_buffer,<br>
> +                                      want_depth_pma_fix(cmd_buffer)<wbr>);<br>
>     }<br>
>  #else<br>
>     if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CO<wbr>NSTANTS) {<br>
> diff --git a/src/intel/vulkan/genX_blorp_<wbr>exec.c b/src/intel/vulkan/genX_blorp_<wbr>exec.c<br>
> index 663e6c9..6f0b063 100644<br>
> --- a/src/intel/vulkan/genX_blorp_<wbr>exec.c<br>
> +++ b/src/intel/vulkan/genX_blorp_<wbr>exec.c<br>
> @@ -154,6 +154,11 @@ genX(blorp_exec)(struct blorp_batch *batch,<br>
><br>
>     genX(cmd_buffer_emit_gen7_dep<wbr>th_flush)(cmd_buffer);<br>
><br>
> +   /* BLORP doesn't do anything fancy with depth such as discards, so we want<br>
> +    * the PMA fix off.  Also, off is always the safe option.<br>
> +    */<br>
> +   genX(cmd_buffer_enable_pma_fi<wbr>x)(cmd_buffer, false);<br>
> +<br>
>     blorp_exec(batch, params);<br>
><br>
>     cmd_buffer->state.vb_dirty = ~0;<br>
> diff --git a/src/intel/vulkan/genX_cmd_bu<wbr>ffer.c b/src/intel/vulkan/genX_cmd_bu<wbr>ffer.c<br>
> index b6b7f74..66de93a 100644<br>
> --- a/src/intel/vulkan/genX_cmd_bu<wbr>ffer.c<br>
> +++ b/src/intel/vulkan/genX_cmd_bu<wbr>ffer.c<br>
> @@ -627,6 +627,11 @@ genX(EndCommandBuffer)(<br>
>  {<br>
>     ANV_FROM_HANDLE(anv_cmd_buffe<wbr>r, cmd_buffer, commandBuffer);<br>
><br>
> +   /* We want every command buffer to start with the PMA fix in a known state,<br>
> +    * so we disable it at the end of the command buffer.<br>
> +    */<br>
> +   genX(cmd_buffer_enable_pma_fi<wbr>x)(cmd_buffer, false);<br>
> +<br>
>     genX(cmd_buffer_apply_pipe_fl<wbr>ushes)(cmd_buffer);<br>
><br>
>     anv_cmd_buffer_end_batch_buff<wbr>er(cmd_buffer);<br>
> @@ -644,6 +649,11 @@ genX(CmdExecuteCommands)(<br>
><br>
>     assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMAR<wbr>Y);<br>
><br>
> +   /* The secondary command buffers will assume that the PMA fix is disabled<br>
> +    * when they begin executing.  Make sure this is true.<br>
> +    */<br>
> +   genX(cmd_buffer_enable_pma_fi<wbr>x)(primary, false);<br>
> +<br>
>     for (uint32_t i = 0; i < commandBufferCount; i++) {<br>
>        ANV_FROM_HANDLE(anv_cmd_buffer<wbr>, secondary, pCmdBuffers[i]);<br>
><br>
> @@ -2181,7 +2191,8 @@ cmd_buffer_emit_depth_stencil(<wbr>struct anv_cmd_buffer *cmd_buffer)<br>
>     const bool has_stencil =<br>
>        image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);<br>
><br>
> -   /* FIXME: Implement the PMA stall W/A */<br>
> +   cmd_buffer->state.hiz_enabled = has_hiz;<br>
> +<br>
>     /* FIXME: Width and Height are wrong */<br>
><br>
>     genX(cmd_buffer_emit_gen7_dep<wbr>th_flush)(cmd_buffer);<br>
> @@ -2419,6 +2430,8 @@ void genX(CmdEndRenderPass)(<br>
><br>
>     anv_cmd_buffer_resolve_subpas<wbr>s(cmd_buffer);<br>
><br>
> +   cmd_buffer->state.hiz_enabled = false;<br>
> +<br>
>  #ifndef NDEBUG<br>
>     anv_dump_add_framebuffer(cmd_<wbr>buffer, cmd_buffer->state.framebuffer)<wbr>;<br>
>  #endif<br>
> diff --git a/src/intel/vulkan/genX_pipeli<wbr>ne.c b/src/intel/vulkan/genX_pipeli<wbr>ne.c<br>
> index 18fe48c..0588d01 100644<br>
> --- a/src/intel/vulkan/genX_pipeli<wbr>ne.c<br>
> +++ b/src/intel/vulkan/genX_pipeli<wbr>ne.c<br>
> @@ -653,6 +653,8 @@ emit_ds_state(struct anv_pipeline *pipeline,<br>
>         * to make sure it's initialized to something useful.<br>
>         */<br>
>        pipeline->writes_stencil = false;<br>
> +      pipeline->writes_depth = false;<br>
> +      pipeline->depth_test_enable = false;<br>
>        memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));<br>
>        return;<br>
>     }<br>
> @@ -711,6 +713,9 @@ emit_ds_state(struct anv_pipeline *pipeline,<br>
>     if (info->depthTestEnable && info->depthCompareOp == VK_COMPARE_OP_EQUAL)<br>
>        depth_stencil.DepthBufferWrite<wbr>Enable = false;<br>
><br>
> +   pipeline->writes_depth = depth_stencil.DepthBufferWrite<wbr>Enable;<br>
> +   pipeline->depth_test_enable = depth_stencil.DepthTestEnable;<br>
> +<br>
>  #if GEN_GEN <= 7<br>
>     GENX(DEPTH_STENCIL_STATE_pack<wbr>)(NULL, depth_stencil_dw, &depth_stencil);<br>
>  #else<br>
> @@ -1429,6 +1434,38 @@ emit_3dstate_vf_topology(struc<wbr>t anv_pipeline *pipeline)<br>
>  }<br>
>  #endif<br>
><br>
> +static void<br>
> +compute_kill_pixel(struct anv_pipeline *pipeline,<br>
> +                   const VkPipelineMultisampleStateCrea<wbr>teInfo *ms_info,<br>
> +                   const struct anv_subpass *subpass)<br>
> +{<br>
> +   if (!anv_pipeline_has_stage(pipel<wbr>ine, MESA_SHADER_FRAGMENT)) {<br>
> +      pipeline->kill_pixel = false;<br>
> +      return;<br>
> +   }<br>
> +<br>
> +   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);<br>
> +<br>
> +   /* This computes the KillPixel portion of the computation for whether or<br>
> +    * not we want to enable the PMA fix on gen8.  It's given by this chunk of<br>
> +    * the giant formula:<br>
> +    *<br>
> +    *    (3DSTATE_PS_EXTRA::PixelShader<wbr>KillsPixels ||<br>
> +    *     3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||<br>
> +    *     3DSTATE_PS_BLEND::AlphaToCove<wbr>rageEnable ||<br>
> +    *     3DSTATE_PS_BLEND::AlphaTestEn<wbr>able ||<br>
> +    *     3DSTATE_WM_CHROMAKEY::ChromaK<wbr>eyKillEnable)<br>
> +    *<br>
> +    * 3DSTATE_WM_CHROMAKEY::ChromaKe<wbr>yKillEnable is always false and so is<br>
> +    * 3DSTATE_PS_BLEND::AlphaTestEna<wbr>ble since Vulkan doesn't have a concept<br>
> +    * of an alpha test.<br>
> +    */<br>
> +   pipeline->kill_pixel =<br>
> +      subpass->has_ds_self_dep || wm_prog_data->uses_kill ||<br>
<br>
</div></div>Why do we set kill_pixel in the presence of has_ds_self_dep?<br>
<br>
Section 7.1 of the Vulkan spec states:<br>
<br>
   If a subpass uses the same attachment as both an input attachment and<br>
   either a color attachment or a depth/stencil attachment, writes via<br>
   the color or depth/stencil attachment are not automatically made<br>
   visible to reads via the input attachment, causing a feedback loop,<br>
   except in any of the following conditions:<br>
<br>
   [...]<br>
<br>
I don't see how any of the conditions are satisfied through the use<br>
of kill_pixel.<span class="m_-5995240216295546008m_-3958707379954501137im m_-5995240216295546008m_-3958707379954501137HOEnZb"><br></span></blockquote><div><br></div></div></div><div>There's a really good comment where we set 3DSTATE_PS::PixelShaderKillsPi<wbr>xel and 3DSTATE_PS_EXTRA::PixelShaderK<wbr>illsPixel<br></div><span><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span class="m_-5995240216295546008m_-3958707379954501137im m_-5995240216295546008m_-3958707379954501137HOEnZb">
> +      wm_prog_data->uses_omask ||<br>
> +      (ms_info && ms_info->alphaToCoverageEnable<wbr>);<br>
> +}<br>
> +<br>
>  static VkResult<br>
>  genX(graphics_pipeline_create)<wbr>(<br>
>      VkDevice                                    _device,<br>
> @@ -1466,6 +1503,7 @@ genX(graphics_pipeline_create)<wbr>(<br>
>     emit_ds_state(pipeline, pCreateInfo->pDepthStencilStat<wbr>e, pass, subpass);<br>
>     emit_cb_state(pipeline, pCreateInfo->pColorBlendState,<br>
>                             pCreateInfo->pMultisampleStat<wbr>e);<br>
> +   compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState<wbr>, subpass);<br>
><br>
>     emit_urb_setup(pipeline);<br>
><br>
> --<br>
> 2.5.0.400.gff86faf<br>
><br>
</span><div class="m_-5995240216295546008m_-3958707379954501137HOEnZb"><div class="m_-5995240216295546008m_-3958707379954501137h5">> ______________________________<wbr>_________________<br>
> mesa-dev mailing list<br>
> <a href="mailto:mesa-dev@lists.freedesktop.org" target="_blank">mesa-dev@lists.freedesktop.org</a><br>
> <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
</div></div></blockquote></span></div><br></div></div>
</blockquote></div></div></div><br></div></div>
</blockquote></div><br></div></div>