[Mesa-dev] [PATCH v2 3/5] anv: Add support for the PMA fix on Broadwell

Thu Feb 9 18:47:43 UTC 2017

On Wed, Feb 08, 2017 at 09:38:52PM -0800, Jason Ekstrand wrote:
> On Wed, Feb 8, 2017 at 8:11 PM, Jason Ekstrand <jason at jlekstrand.net> wrote:
> 
> > On Wed, Feb 8, 2017 at 6:27 PM, Jason Ekstrand <jason at jlekstrand.net>
> > wrote:
> >
> >> On Wed, Feb 8, 2017 at 5:34 PM, Nanley Chery <nanleychery at gmail.com>
> >> wrote:
> >>
> >>> On Thu, Feb 02, 2017 at 01:26:05PM -0800, Jason Ekstrand wrote:
> >>> > In order to get good performance numbers for this, I had to hack up the
> >>> > driver to whack wm_prog_data::uses_kill to true to emulate a discard
> >>> and
> >>> > used the Sascha "shadowmapping" demo.  Setting uses_kill to true
> >>> dropped
> >>> > the framerate on the demo by 25-30%.  Enabling the PMA fix brought it
> >>> > back up to around 90% of the original framerate.  This doesn't seem to
> >>> > really impact Dota 2;  probably because it doesn't use 16-bit depth.
> >>> >
> >>> > Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> >>> > ---
> >>> >  src/intel/vulkan/TODO              |   1 -
> >>> >  src/intel/vulkan/anv_cmd_buffer.c  |   2 +
> >>> >  src/intel/vulkan/anv_genX.h        |   3 +
> >>> >  src/intel/vulkan/anv_private.h     |  17 +++++
> >>> >  src/intel/vulkan/gen7_cmd_buffer.c |   7 ++
> >>> >  src/intel/vulkan/gen8_cmd_buffer.c | 133
> >>> +++++++++++++++++++++++++++++++++++++
> >>> >  src/intel/vulkan/genX_blorp_exec.c |   5 ++
> >>> >  src/intel/vulkan/genX_cmd_buffer.c |  15 ++++-
> >>> >  src/intel/vulkan/genX_pipeline.c   |  38 +++++++++++
> >>> >  9 files changed, 219 insertions(+), 2 deletions(-)
> >>> >
> >>> > diff --git a/src/intel/vulkan/TODO b/src/intel/vulkan/TODO
> >>> > index 38acc0d..f8b73a1 100644
> >>> > --- a/src/intel/vulkan/TODO
> >>> > +++ b/src/intel/vulkan/TODO
> >>> > @@ -12,5 +12,4 @@ Performance:
> >>> >   - Compressed multisample support
> >>> >   - Pushing pieces of UBOs?
> >>> >   - Enable guardband clipping
> >>> > - - pma stall workaround
> >>> >   - Use soft-pin to avoid relocations
> >>> > diff --git a/src/intel/vulkan/anv_cmd_buffer.c
> >>> b/src/intel/vulkan/anv_cmd_buffer.c
> >>> > index 5886fa6..8c08f8d 100644
> >>> > --- a/src/intel/vulkan/anv_cmd_buffer.c
> >>> > +++ b/src/intel/vulkan/anv_cmd_buffer.c
> >>> > @@ -135,6 +135,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer
> >>> *cmd_buffer)
> >>> >     state->restart_index = UINT32_MAX;
> >>> >     state->dynamic = default_dynamic_state;
> >>> >     state->need_query_wa = true;
> >>> > +   state->pma_fix_enabled = false;
> >>> > +   state->hiz_enabled = false;
> >>> >
> >>> >     if (state->attachments != NULL) {
> >>> >        vk_free(&cmd_buffer->pool->alloc, state->attachments);
> >>> > diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
> >>> > index d04fe38..67147b0 100644
> >>> > --- a/src/intel/vulkan/anv_genX.h
> >>> > +++ b/src/intel/vulkan/anv_genX.h
> >>> > @@ -55,6 +55,9 @@ void genX(cmd_buffer_flush_dynamic_state)(struct
> >>> anv_cmd_buffer *cmd_buffer);
> >>> >
> >>> >  void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer
> >>> *cmd_buffer);
> >>> >
> >>> > +void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer
> >>> *cmd_buffer,
> >>> > +                                     bool enable);
> >>> > +
> >>> >  void
> >>> >  genX(emit_urb_setup)(struct anv_device *device, struct anv_batch
> >>> *batch,
> >>> >                       const struct gen_l3_config *l3_config,
> >>> > diff --git a/src/intel/vulkan/anv_private.h
> >>> b/src/intel/vulkan/anv_private.h
> >>> > index 4fe3ebc..6efe4ea 100644
> >>> > --- a/src/intel/vulkan/anv_private.h
> >>> > +++ b/src/intel/vulkan/anv_private.h
> >>> > @@ -1163,6 +1163,20 @@ struct anv_cmd_state {
> >>> >     bool                                         need_query_wa;
> >>> >
> >>> >     /**
> >>> > +    * Whether or not the gen8 PMA fix is enabled.  We ensure that, at
> >>> the top
> >>> > +    * of any command buffer it disabled by disabling it in
> >>> EndCommandBuffer
> >>>                                  ^
> >>>                                  is?
> >>>
> >>> > +    * and before invoking the secondary in ExecuteCommands.
> >>> > +    */
> >>> > +   bool                                         pma_fix_enabled;
> >>> > +
> >>> > +   /**
> >>> > +    * Whether or not we now for certain that HiZ is enabled for the
> >>> current
> >>>                            ^
> >>>                            know
> >>>
> >>> > +    * subpass.  If, for whatever reason, we are unsure as to whether
> >>> HiZ is
> >>> > +    * enabled or not, this will be false.
> >>> > +    */
> >>> > +   bool                                         hiz_enabled;
> >>> > +
> >>> > +   /**
> >>> >      * Array length is anv_cmd_state::pass::attachment_count. Array
> >>> content is
> >>> >      * valid only when recording a render pass instance.
> >>> >      */
> >>> > @@ -1465,8 +1479,11 @@ struct anv_pipeline {
> >>> >
> >>> >     uint32_t                                     cs_right_mask;
> >>> >
> >>> > +   bool                                         writes_depth;
> >>> > +   bool                                         depth_test_enable;
> >>> >     bool                                         writes_stencil;
> >>> >     bool                                         depth_clamp_enable;
> >>> > +   bool                                         kill_pixel;
> >>> >
> >>> >     struct {
> >>> >        uint32_t                                  sf[7];
> >>> > diff --git a/src/intel/vulkan/gen7_cmd_buffer.c
> >>> b/src/intel/vulkan/gen7_cmd_buffer.c
> >>> > index 013ed87..c1a25e8 100644
> >>> > --- a/src/intel/vulkan/gen7_cmd_buffer.c
> >>> > +++ b/src/intel/vulkan/gen7_cmd_buffer.c
> >>> > @@ -260,6 +260,13 @@ genX(cmd_buffer_flush_dynamic_state)(struct
> >>> anv_cmd_buffer *cmd_buffer)
> >>> >     cmd_buffer->state.dirty = 0;
> >>> >  }
> >>> >
> >>> > +void
> >>> > +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
> >>> > +                                bool enable)
> >>> > +{
> >>> > +   /* The NP PMA fix doesn't exist on gen7 */
> >>> > +}
> >>> > +
> >>> >  void genX(CmdSetEvent)(
> >>> >      VkCommandBuffer                             commandBuffer,
> >>> >      VkEvent                                     event,
> >>> > diff --git a/src/intel/vulkan/gen8_cmd_buffer.c
> >>> b/src/intel/vulkan/gen8_cmd_buffer.c
> >>> > index 8c8de62..271ab3f 100644
> >>> > --- a/src/intel/vulkan/gen8_cmd_buffer.c
> >>> > +++ b/src/intel/vulkan/gen8_cmd_buffer.c
> >>> > @@ -155,6 +155,135 @@ __emit_sf_state(struct anv_cmd_buffer
> >>> *cmd_buffer)
> >>> >  #endif
> >>> >
> >>> >  void
> >>> > +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
> >>> bool enable)
> >>> > +{
> >>> > +#if GEN_GEN == 8
> >>> > +   if (cmd_buffer->state.pma_fix_enabled == enable)
> >>> > +      return;
> >>> > +
> >>> > +   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> >>> > +      pc.DepthCacheFlushEnable = true;
> >>> > +      pc.CommandStreamerStallEnable = true;
> >>> > +      pc.RenderTargetCacheFlushEnable = true;
> >>>
> >>> Instead of flushing the depth and RT caches, what do you think about
> >>> implementing a stall that writes 0 to the workaround bo? That should
> >>> consume less bandwidth.
> >>>
> >>> To make the pipeline idle, flushing the caches shouldn't be necessary,
> >>> we should be able to do it with a valid pipecontrol packet that has the
> >>> CS stall bit set.
> >>>
> >>
> >> Without having an extremely good idea of how these bits interact with
> >> caching, I'm very reluctant to try too hard to over-optimize the pipe
> >> control.
> >>
> >>
> >>> The BDW PRM states:
> >>>
> >>>    If the stall bit is set, the command streamer waits until the pipe is
> >>>    completely flushed.
> >>>
> >>
> >> It waits until whatever flush/stall operation you gave it is finished.
> >> You also need to specify a stall/flush operation.  If you just do a depth
> >> stall for instance and no depth reads/writes are in-flight, I don't think
> >> it actually stalls at all.  We may be able to use StallAtPixelScoreboard
> >> instead but, again, I don't know for sure how this interacts with caches.
> >>
> >>
> >>> > +   }
> >>> > +
> >>> > +   uint32_t cache_mode;
> >>> > +   anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
> >>> > +                   .NPPMAFixEnable = enable,
> >>> > +                   .NPEarlyZFailsDisableMask = enable,
> >>>                                            ^
> >>> I don't think you intended to set the mask here. It should be:
> >>>
> >>>                       .NPEarlyZFailsDisable = enable,
> >>>
> >>> Does fixing this impact your FPS measurements significantly?
> >>>
> >>
> >> Thanks for catching that!  I'm a bit surprised that it worked at all.
> >> I'll definitely re-measure.
> >>
> >
> > I just ran Dota2 and fixing this seems to help by around 4%!  I'll do a
> > full re-run and get fresh numbers.
> >
> 
> I did a full re-benchmark of the series with the fixes from your review.
> Now Dota 2 is helped by 8-9% by the PMA fix and none of the other patches
> seem to help.  I'm very confused.
> 

Those are pretty good results! I'll probably use some printfs to see if
the ds sanitation function catches anything major.

-Nanley

> When I say "none of the other patches help" that's not entirely true.  I'm
> seeing occational hangs (1 out of 4 runs) and they they seem to
> substantially decrease with the patch to disable stencil writes when the
> writemasks are zero.  I'm not sure what's going on there.  Maybe we're
> missing a workaround somewhere?
> 
> --Jason
> 
> 
> >
> >>
> >>> > +                   .NPPMAFixEnableMask = true,
> >>> > +                   .NPEarlyZFailsDisableMask = true);
> >>> > +   anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM),
> >>> lri) {
> >>> > +      lri.RegisterOffset   = GENX(CACHE_MODE_1_num);
> >>> > +      lri.DataDWord        = cache_mode;
> >>> > +   }
> >>> > +
> >>> > +   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and
> >>> Depth Cache
> >>> > +    * Flush bits is often necessary.  We do it regardless because
> >>> it's easier.
> >>> > +    * The render cache flush is also necessary if stencil writes are
> >>> enabled.
> >>> > +    */
> >>>
> >>> I couldn't find a requirement to flush these specific caches in the
> >>> docs, but I did find:
> >>>
> >>
> >> Both of these flushes are documented in the bspec (not PRM) docs for
> >> PIPE_CONTROL.
> >>
> >>
> >>>    To ensure this command gets executed before upcoming commands in the
> >>>    ring, either a stalling pipecontrol should be sent after this
> >>>    command, or MMIO 0x20C0 bit 7 should be set to 1.
> >>>
> >>> Perhaps we could use the flush I suggested earlier?
> >>>
> >>> > +   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> >>> > +      pc.DepthStallEnable = true;
> >>> > +      pc.DepthCacheFlushEnable = true;
> >>> > +      pc.RenderTargetCacheFlushEnable = true;
> >>> > +   }
> >>> > +
> >>> > +   cmd_buffer->state.pma_fix_enabled = enable;
> >>> > +#endif /* GEN_GEN == 8 */
> >>> > +}
> >>> > +
> >>> > +static inline bool
> >>> > +want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)
> >>> > +{
> >>> > +   assert(GEN_GEN == 8);
> >>> > +
> >>> > +   /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE:
> >>> > +    *
> >>> > +    *    SW must set this bit in order to enable this fix when
> >>> following
> >>> > +    *    expression is TRUE.
> >>> > +    *
> >>> > +    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
> >>> > +    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
> >>> > +    *    (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
> >>> > +    *    (3DSTATE_DEPTH_BUFFER::HIZ Enable) &&
> >>> > +    *    !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) &&
> >>> > +    *    (3DSTATE_PS_EXTRA::PixelShaderValid) &&
> >>> > +    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> >>> > +    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> >>> > +    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable
> >>> ||
> >>> > +    *      3DSTATE_WM_HZ_OP::StencilBufferClear) &&
> >>> > +    *    (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) &&
> >>> > +    *    (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> >>> > +    *       3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> >>> > +    *       3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> >>> > +    *       3DSTATE_PS_BLEND::AlphaTestEnable ||
> >>> > +    *       3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
> >>> > +    *      3DSTATE_WM::ForceKillPix != ForceOff &&
> >>> > +    *      ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
> >>> > +    *        3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
> >>> > +    *       (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> >>> > +    *        3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
> >>> > +    *        3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
> >>> > +    *     (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode !=
> >>> PSCDEPTH_OFF))
> >>> > +    *
> >>> > +    * This function only takes care of the pipeline parts of the
> >>> equation.
> >>>
> >>> Which parts are the pipeline parts? It looks like you take care of the
> >>> entire equation in this function.
> >>>
> >>
> >> Yes, I do.
> >>
> >>
> >>> > +    */
> >>> > +
> >>> > +   /* These are always true:
> >>> > +    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
> >>> > +    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
> >>> > +    */
> >>>
> >>> I couldn't quite understand what ForcedSampleCount does from looking at
> >>> the HW docs. If you know, could you tell me what this does and why it
> >>> wouldn't change under any circumstance in the future?  Otherwise, I
> >>> think we should put an assertion where we assign ForcedSampleCount and
> >>> reference this PMA optimization.
> >>>
> >>
> >> Honestly, I'm not 100% sure what it's for.  I think it lets you make the
> >> rasterizer run at a different number of samples than the underlying
> >> surface.  We've never used this bit in Vulkan or GL.  Also, there's no flag
> >> for us to assert on.
> >>
> >>
> >>> > +
> >>> > +   /* We only enable the PMA fix if we know for certain that HiZ is
> >>> enabled.
> >>> > +    * If we don't know whether HiZ is enabled or not, we disable the
> >>> PMA fix
> >>> > +    * and there is no harm.
> >>> > +    *
> >>> > +    * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
> >>> > +    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
> >>> > +    */
> >>> > +   if (!cmd_buffer->state.hiz_enabled)
> >>> > +      return false;
> >>> > +
> >>> > +   /* 3DSTATE_PS_EXTRA::PixelShaderValid */
> >>> > +   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
> >>> > +   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
> >>> > +      return false;
> >>> > +
> >>> > +   /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */
> >>> > +   const struct brw_wm_prog_data *wm_prog_data =
> >>> get_wm_prog_data(pipeline);
> >>> > +   if (wm_prog_data->early_fragment_tests)
> >>> > +      return false;
> >>>
> >>> In keeping with your top-to-bottom evaluation of the logical statement,
> >>> could you move this part to be above the PixelShaderValid part?
> >>>
> >>
> >> Not really.  If we don't' have a fragment shader, we wm_prog_data is NULL
> >>
> >>
> >>> > +
> >>> > +   /* We never use anv_pipeline for HiZ ops so this is trivially true:
> >>> > +   *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
> >>>       ^
> >>>       misaligned asterisk
> >>>
> >>
> >> Fixed.
> >>
> >>
> >>> > +    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
> >>> > +    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable
> >>> ||
> >>> > +    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
> >>> > +    */
> >>> > +
> >>> > +   /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */
> >>> > +   if (!pipeline->depth_test_enable)
> >>> > +      return false;
> >>> > +
> >>> > +   /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> >>> > +    *    3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> >>> > +    *    3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> >>> > +    *    3DSTATE_PS_BLEND::AlphaTestEnable ||
> >>> > +    *    3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
> >>> > +    *   3DSTATE_WM::ForceKillPix != ForceOff &&
> >>> > +    *   ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
> >>> > +    *     3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
> >>> > +    *    (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> >>> > +    *     3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
> >>> > +    *     3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
> >>> > +    *  (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode !=
> >>> PSCDEPTH_OFF))
> >>> > +    */
> >>> > +   return (pipeline->kill_pixel && (pipeline->writes_depth ||
> >>> > +                                    pipeline->writes_stencil)) ||
> >>>
> >>> anv_pipeline::writes_stencil !=
> >>> (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
> >>>  3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
> >>>  3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE);
> >>>
> >>> The last two lines are not accounted for - we don't detect if the
> >>> currently bound depth_stencil attachment lacks or possesses a stencil
> >>> buffer.
> >>>
> >>
> >> We should be checking for the presence of a stencil aspect in the
> >> pipeline.  I think that got missed when I added the writes_stencil bit.  We
> >> do by the end of the series, but I'll patch up the disable stencil writes
> >> patch to make it check.
> >>
> >>
> >>> This is a valid scenario according to section 25.9. Stencil Test of the
> >>> Vulkan spec, which states:
> >>>
> >>>    If there is no stencil framebuffer attachment, stencil modification
> >>>    cannot occur, and it is as if the stencil tests always pass.
> >>>
> >>> anv_pipeline::writes_depth should be correct because hiz_enabled is true
> >>> by the time we reach it. For hiz_enabled to be true, there must be a
> >>> depth buffer present.
> >>>
> >>
> >> Yup
> >>
> >>
> >>> > +          wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
> >>> > +}
> >>> > +
> >>> > +void
> >>> >  genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer
> >>> *cmd_buffer)
> >>> >  {
> >>> >     struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
> >>> > @@ -211,6 +340,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct
> >>> anv_cmd_buffer *cmd_buffer)
> >>> >     }
> >>> >
> >>> >     if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
> >>> > +                                  ANV_CMD_DIRTY_RENDER_TARGETS |
> >>>
> >>> Why is this necessary?
> >>>
> >>
> >> We set RENDER_TARGETS dirty when the depth buffer changes due to
> >> NextSubpass or BeginCommandBuffer
> >>
> >>
> >>> >                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK
> >>> |
> >>> >                                    ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK))
> >>> {
> >>> >        uint32_t wm_depth_stencil_dw[GENX(3DSTA
> >>> TE_WM_DEPTH_STENCIL_length)];
> >>> > @@ -234,6 +364,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct
> >>> anv_cmd_buffer *cmd_buffer)
> >>> >
> >>> >        anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
> >>> >                             pipeline->gen8.wm_depth_stencil);
> >>> > +
> >>> > +      genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
> >>> > +                                      want_depth_pma_fix(cmd_buffer)
> >>> );
> >>> >     }
> >>> >  #else
> >>> >     if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
> >>> {
> >>> > diff --git a/src/intel/vulkan/genX_blorp_exec.c
> >>> b/src/intel/vulkan/genX_blorp_exec.c
> >>> > index 663e6c9..6f0b063 100644
> >>> > --- a/src/intel/vulkan/genX_blorp_exec.c
> >>> > +++ b/src/intel/vulkan/genX_blorp_exec.c
> >>> > @@ -154,6 +154,11 @@ genX(blorp_exec)(struct blorp_batch *batch,
> >>> >
> >>> >     genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
> >>> >
> >>> > +   /* BLORP doesn't do anything fancy with depth such as discards, so
> >>> we want
> >>> > +    * the PMA fix off.  Also, off is always the safe option.
> >>> > +    */
> >>> > +   genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
> >>> > +
> >>> >     blorp_exec(batch, params);
> >>> >
> >>> >     cmd_buffer->state.vb_dirty = ~0;
> >>> > diff --git a/src/intel/vulkan/genX_cmd_buffer.c
> >>> b/src/intel/vulkan/genX_cmd_buffer.c
> >>> > index b6b7f74..66de93a 100644
> >>> > --- a/src/intel/vulkan/genX_cmd_buffer.c
> >>> > +++ b/src/intel/vulkan/genX_cmd_buffer.c
> >>> > @@ -627,6 +627,11 @@ genX(EndCommandBuffer)(
> >>> >  {
> >>> >     ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
> >>> >
> >>> > +   /* We want every command buffer to start with the PMA fix in a
> >>> known state,
> >>> > +    * so we disable it at the end of the command buffer.
> >>> > +    */
> >>> > +   genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
> >>> > +
> >>> >     genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
> >>> >
> >>> >     anv_cmd_buffer_end_batch_buffer(cmd_buffer);
> >>> > @@ -644,6 +649,11 @@ genX(CmdExecuteCommands)(
> >>> >
> >>> >     assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
> >>> >
> >>> > +   /* The secondary command buffers will assume that the PMA fix is
> >>> disabled
> >>> > +    * when they begin executing.  Make sure this is true.
> >>> > +    */
> >>> > +   genX(cmd_buffer_enable_pma_fix)(primary, false);
> >>> > +
> >>> >     for (uint32_t i = 0; i < commandBufferCount; i++) {
> >>> >        ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
> >>> >
> >>> > @@ -2181,7 +2191,8 @@ cmd_buffer_emit_depth_stencil(struct
> >>> anv_cmd_buffer *cmd_buffer)
> >>> >     const bool has_stencil =
> >>> >        image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
> >>> >
> >>> > -   /* FIXME: Implement the PMA stall W/A */
> >>> > +   cmd_buffer->state.hiz_enabled = has_hiz;
> >>> > +
> >>> >     /* FIXME: Width and Height are wrong */
> >>> >
> >>> >     genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
> >>> > @@ -2419,6 +2430,8 @@ void genX(CmdEndRenderPass)(
> >>> >
> >>> >     anv_cmd_buffer_resolve_subpass(cmd_buffer);
> >>> >
> >>> > +   cmd_buffer->state.hiz_enabled = false;
> >>> > +
> >>> >  #ifndef NDEBUG
> >>> >     anv_dump_add_framebuffer(cmd_buffer,
> >>> cmd_buffer->state.framebuffer);
> >>> >  #endif
> >>> > diff --git a/src/intel/vulkan/genX_pipeline.c
> >>> b/src/intel/vulkan/genX_pipeline.c
> >>> > index 18fe48c..0588d01 100644
> >>> > --- a/src/intel/vulkan/genX_pipeline.c
> >>> > +++ b/src/intel/vulkan/genX_pipeline.c
> >>> > @@ -653,6 +653,8 @@ emit_ds_state(struct anv_pipeline *pipeline,
> >>> >         * to make sure it's initialized to something useful.
> >>> >         */
> >>> >        pipeline->writes_stencil = false;
> >>> > +      pipeline->writes_depth = false;
> >>> > +      pipeline->depth_test_enable = false;
> >>> >        memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
> >>> >        return;
> >>> >     }
> >>> > @@ -711,6 +713,9 @@ emit_ds_state(struct anv_pipeline *pipeline,
> >>> >     if (info->depthTestEnable && info->depthCompareOp ==
> >>> VK_COMPARE_OP_EQUAL)
> >>> >        depth_stencil.DepthBufferWriteEnable = false;
> >>> >
> >>> > +   pipeline->writes_depth = depth_stencil.DepthBufferWriteEnable;
> >>> > +   pipeline->depth_test_enable = depth_stencil.DepthTestEnable;
> >>> > +
> >>> >  #if GEN_GEN <= 7
> >>> >     GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw,
> >>> &depth_stencil);
> >>> >  #else
> >>> > @@ -1429,6 +1434,38 @@ emit_3dstate_vf_topology(struct anv_pipeline
> >>> *pipeline)
> >>> >  }
> >>> >  #endif
> >>> >
> >>> > +static void
> >>> > +compute_kill_pixel(struct anv_pipeline *pipeline,
> >>> > +                   const VkPipelineMultisampleStateCreateInfo
> >>> *ms_info,
> >>> > +                   const struct anv_subpass *subpass)
> >>> > +{
> >>> > +   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
> >>> > +      pipeline->kill_pixel = false;
> >>> > +      return;
> >>> > +   }
> >>> > +
> >>> > +   const struct brw_wm_prog_data *wm_prog_data =
> >>> get_wm_prog_data(pipeline);
> >>> > +
> >>> > +   /* This computes the KillPixel portion of the computation for
> >>> whether or
> >>> > +    * not we want to enable the PMA fix on gen8.  It's given by this
> >>> chunk of
> >>> > +    * the giant formula:
> >>> > +    *
> >>> > +    *    (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
> >>> > +    *     3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
> >>> > +    *     3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
> >>> > +    *     3DSTATE_PS_BLEND::AlphaTestEnable ||
> >>> > +    *     3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
> >>> > +    *
> >>> > +    * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and
> >>> so is
> >>> > +    * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a
> >>> concept
> >>> > +    * of an alpha test.
> >>> > +    */
> >>> > +   pipeline->kill_pixel =
> >>> > +      subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
> >>>
> >>> Why do we set kill_pixel in the presence of has_ds_self_dep?
> >>>
> >>> Section 7.1 of the Vulkan spec states:
> >>>
> >>>    If a subpass uses the same attachment as both an input attachment and
> >>>    either a color attachment or a depth/stencil attachment, writes via
> >>>    the color or depth/stencil attachment are not automatically made
> >>>    visible to reads via the input attachment, causing a feedback loop,
> >>>    except in any of the following conditions:
> >>>
> >>>    [...]
> >>>
> >>> I don't see how any of the conditions are satisfied through the use
> >>> of kill_pixel.
> >>>
> >>
> >> There's a really good comment where we set 3DSTATE_PS::PixelShaderKillsPixel
> >> and 3DSTATE_PS_EXTRA::PixelShaderKillsPixel
> >>
> >>
> >>> > +      wm_prog_data->uses_omask ||
> >>> > +      (ms_info && ms_info->alphaToCoverageEnable);
> >>> > +}
> >>> > +
> >>> >  static VkResult
> >>> >  genX(graphics_pipeline_create)(
> >>> >      VkDevice                                    _device,
> >>> > @@ -1466,6 +1503,7 @@ genX(graphics_pipeline_create)(
> >>> >     emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass,
> >>> subpass);
> >>> >     emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
> >>> >                             pCreateInfo->pMultisampleState);
> >>> > +   compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState,
> >>> subpass);
> >>> >
> >>> >     emit_urb_setup(pipeline);
> >>> >
> >>> > --
> >>> > 2.5.0.400.gff86faf
> >>> >
> >>> > _______________________________________________
> >>> > mesa-dev mailing list
> >>> > mesa-dev at lists.freedesktop.org
> >>> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> >>>
> >>
> >>
> >