[Mesa-dev] [PATCH 2/2] anv/pipeline: Use ForceThreadDispatchEnable instead of AccessUAV.

Jason Ekstrand jason at jlekstrand.net
Tue Feb 14 01:28:31 UTC 2017


The AccessUAV bit is not quite what we want because it's more about
coherency between storage operations than it is about dispatch.  Also,
the 3DSTATE_PS_EXTRA::PixelShaderHasUAV bit seems to cause hangs on
Broadwell for unknown reasons so it's best to just leave it off.  The
3DSTATE_WM::ForceThreadDispatchEnable bit, on the other hand, does
exactly what we want and seems to work fine.

This fixes GPU hangs with Dota 2 on Broadwell
---
 src/intel/vulkan/genX_pipeline.c | 86 ++++++++++++++++------------------------
 1 file changed, 35 insertions(+), 51 deletions(-)

diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 752661e..ffb6e38 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -1285,6 +1285,26 @@ emit_3dstate_gs(struct anv_pipeline *pipeline)
    }
 }
 
+static inline bool
+has_color_buffer_write_enabled(const struct anv_pipeline *pipeline)
+{
+   const struct anv_shader_bin *shader_bin =
+      pipeline->shaders[MESA_SHADER_FRAGMENT];
+   if (!shader_bin)
+      return false;
+
+   const struct anv_pipeline_bind_map *bind_map = &shader_bin->bind_map;
+   for (int i = 0; i < bind_map->surface_count; i++) {
+      if (bind_map->surface_to_descriptor[i].set !=
+          ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
+         continue;
+      if (bind_map->surface_to_descriptor[i].index != UINT8_MAX)
+         return true;
+   }
+
+   return false;
+}
+
 static void
 emit_3dstate_wm(struct anv_pipeline *pipeline, struct anv_subpass *subpass,
                 const VkPipelineMultisampleStateCreateInfo *multisample)
@@ -1312,6 +1332,21 @@ emit_3dstate_wm(struct anv_pipeline *pipeline, struct anv_subpass *subpass,
          wm.BarycentricInterpolationMode =
             wm_prog_data->barycentric_interp_modes;
 
+#if GEN_GEN >= 8
+      /* In most cases, the computation for the WM_INT::ThreadDispatchEnable
+       * does exactly what you want.  However, when you don't have any color
+       * buffers or when depth/stencil writes are disabled, it misses a few
+       * cases.  In those cases, we force it to dispatch the PS by using the
+       * 3DSTATE_WM::ForceThreadDispatchEnable bit.  According to the docs,
+       * this bit is not validated and shouldn't be used.  However, it seems
+       * to work fine and this is exactly the type of thing you would use such
+       * a bit for.
+       */
+      if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
+          !has_color_buffer_write_enabled(pipeline))
+         wm.ForceThreadDispatchEnable = ForceON;
+#endif
+
 #if GEN_GEN < 8
          /* FIXME: This needs a lot more work, cf gen7 upload_wm_state(). */
          wm.ThreadDispatchEnable          = true;
@@ -1449,26 +1484,6 @@ emit_3dstate_ps(struct anv_pipeline *pipeline,
    }
 }
 
-static inline bool
-has_color_buffer_write_enabled(const struct anv_pipeline *pipeline)
-{
-   const struct anv_shader_bin *shader_bin =
-      pipeline->shaders[MESA_SHADER_FRAGMENT];
-   if (!shader_bin)
-      return false;
-
-   const struct anv_pipeline_bind_map *bind_map = &shader_bin->bind_map;
-   for (int i = 0; i < bind_map->surface_count; i++) {
-      if (bind_map->surface_to_descriptor[i].set !=
-          ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
-         continue;
-      if (bind_map->surface_to_descriptor[i].index != UINT8_MAX)
-         return true;
-   }
-
-   return false;
-}
-
 #if GEN_GEN >= 8
 static void
 emit_3dstate_ps_extra(struct anv_pipeline *pipeline,
@@ -1499,37 +1514,6 @@ emit_3dstate_ps_extra(struct anv_pipeline *pipeline,
       ps.PixelShaderKillsPixel         = subpass->has_ds_self_dep ||
                                          wm_prog_data->uses_kill;
 
-      /* The stricter cross-primitive coherency guarantees that the hardware
-       * gives us with the "Accesses UAV" bit set for at least one shader stage
-       * and the "UAV coherency required" bit set on the 3DPRIMITIVE command are
-       * redundant within the current image, atomic counter and SSBO GL APIs,
-       * which all have very loose ordering and coherency requirements and
-       * generally rely on the application to insert explicit barriers when a
-       * shader invocation is expected to see the memory writes performed by the
-       * invocations of some previous primitive.  Regardless of the value of
-       * "UAV coherency required", the "Accesses UAV" bits will implicitly cause
-       * an in most cases useless DC flush when the lowermost stage with the bit
-       * set finishes execution.
-       *
-       * It would be nice to disable it, but in some cases we can't because on
-       * Gen8+ it also has an influence on rasterization via the PS UAV-only
-       * signal (which could be set independently from the coherency mechanism
-       * in the 3DSTATE_WM command on Gen7), and because in some cases it will
-       * determine whether the hardware skips execution of the fragment shader
-       * or not via the ThreadDispatchEnable signal.  However if we know that
-       * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
-       * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
-       * difference so we may just disable it here.
-       *
-       * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
-       * take into account KillPixels when no depth or stencil writes are
-       * enabled. In order for occlusion queries to work correctly with no
-       * attachments, we need to force-enable here.
-       */
-      if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
-          !has_color_buffer_write_enabled(pipeline))
-         ps.PixelShaderHasUAV = true;
-
 #if GEN_GEN >= 9
       ps.PixelShaderPullsBary    = wm_prog_data->pulls_bary;
       ps.InputCoverageMaskState  = wm_prog_data->uses_sample_mask ?
-- 
2.5.0.400.gff86faf



More information about the mesa-dev mailing list