[Mesa-dev] [PATCH 03/16] i965: Hook end-of-pipe-sync after texture resolves

Fri Feb 17 19:32:06 UTC 2017

There are three functional changes in this patch:

1) Currently the iteration over textures would flush after each
   resolve: brw_render_cache_set_check_flush() would fire
   every time as the resolved surface would be found in the render
   cache. Now the iteration records whether a flush is needed and
   performs it only once after all resolves are pipelined.

2) Make a distinction between resolves and other rendering. In the
   former case issue an end-of-pipe sync and in the latter keep on
   emitting just the flush.

3) The current logic calls brw_render_cache_set_check_flush(), which
   also does the top-of-pipe flushing. In the case of texture resolves
   this is now also done once in intel_update_state(). Ideally
   it would be done conditionally by the 3D draw and compute
   dispatch paths. The former would need plumbing from the core
   state update to the driver, as glDrawElements() updates the
   state without the driver necessarily knowing.

Signed-off-by: Topi Pohjolainen <topi.pohjolainen at intel.com>
---
 src/mesa/drivers/dri/i965/brw_context.c | 113 ++++++++++++++++++++++++++++----
 1 file changed, 99 insertions(+), 14 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 7240b1f..9ca1ac1 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -222,6 +222,86 @@ intel_texture_view_requires_resolve(struct brw_context *brw,
    return true;
 }
 
+enum intel_write_cache_flush_type {
+   INTEL_WRITE_CACHE_NO_FLUSH = 0,
+   INTEL_WRITE_CACHE_FLUSH    = 1 << 0,
+   INTEL_WRITE_CACHE_SYNC     = 1 << 1,
+};
+
+static enum intel_write_cache_flush_type 
+brw_prepare_textures(struct gl_context *ctx)
+{
+   struct brw_context *brw = brw_context(ctx);
+   enum intel_write_cache_flush_type flush = INTEL_WRITE_CACHE_NO_FLUSH;
+   bool resolved = false;
+
+   memset(brw->draw_aux_buffer_disabled, 0,
+          sizeof(brw->draw_aux_buffer_disabled));
+
+   for (int i = 0; i <= ctx->Texture._MaxEnabledTexImageUnit; i++) {
+      if (!ctx->Texture.Unit[i]._Current)
+	 continue;
+
+      struct intel_texture_object * const tex_obj =
+         intel_texture_object(ctx->Texture.Unit[i]._Current);
+      if (!tex_obj || !tex_obj->mt)
+	 continue;
+
+      if (intel_miptree_sample_with_hiz(brw, tex_obj->mt))
+         resolved |= intel_miptree_all_slices_resolve_hiz(brw, tex_obj->mt);
+      else
+         resolved |= intel_miptree_all_slices_resolve_depth(brw, tex_obj->mt);
+
+      /* Sampling engine understands lossless compression and resolving
+       * those surfaces should be skipped for performance reasons.
+       */
+      const int flags = intel_texture_view_requires_resolve(brw, tex_obj) ?
+                           0 : INTEL_MIPTREE_IGNORE_CCS_E;
+      resolved |= intel_miptree_all_slices_resolve_color(brw, tex_obj->mt,
+                                                         flags);
+
+      /*
+       * Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
+       *
+       *  Any transition from any value in {Clear, Render, Resolve} to a
+       *  different value in {Clear, Render, Resolve} requires end of pipe
+       *  synchronization.
+       */
+      if (resolved)
+         flush |= INTEL_WRITE_CACHE_SYNC;
+
+      if (_mesa_set_search(brw->render_cache, tex_obj->mt->bo) != NULL)
+         flush |= INTEL_WRITE_CACHE_FLUSH;
+   }
+
+   return flush;
+}
+
+static bool
+intel_resolve_and_sync_surfaces(struct gl_context *ctx)
+{
+   struct brw_context *brw = brw_context(ctx);
+   const int flags = (brw->gen >= 6) ? PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                       PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                       PIPE_CONTROL_CS_STALL :
+                                       PIPE_CONTROL_RENDER_TARGET_FLUSH;
+
+   const enum intel_write_cache_flush_type flush =
+      brw_prepare_textures(ctx);
+
+   if (flush == INTEL_WRITE_CACHE_NO_FLUSH)
+      return false;
+
+   if (flush & INTEL_WRITE_CACHE_SYNC)
+      brw_end_of_pipe_sync(brw);
+   else if (flush & INTEL_WRITE_CACHE_FLUSH)
+      brw_emit_pipe_control_flush(brw, flags);
+
+   brw_render_cache_set_clear(brw);
+
+   return true;
+}
+
 static void
 intel_update_state(struct gl_context * ctx, GLuint new_state)
 {
@@ -242,10 +322,26 @@ intel_update_state(struct gl_context * ctx, GLuint new_state)
    if (depth_irb)
       intel_renderbuffer_resolve_hiz(brw, depth_irb);
 
-   memset(brw->draw_aux_buffer_disabled, 0,
-          sizeof(brw->draw_aux_buffer_disabled));
+   if (intel_resolve_and_sync_surfaces(ctx)) {
+      /* Perform top-of-pipe flush.
+       *
+       * TODO: Consider flushing only in brw_dispatch_compute_common() and
+       *       brw_try_draw_prims(). Other callers of _mesa_update_state() are
+       *       not going to be using gpu and hence flushing the gpu read-only
+       *       caches (texture and data port constant cache) are unnecessary.
+       *       Simply moving the call from here is not enough, however. In
+       *       case of glDrawElements() brw_try_draw_prims() gets clean
+       *       gl-state as vbo_exec_DrawElements() calls _mesa_update_state()
+       *       before consulting driver.
+       *       Unconditional flushing in turn decreases performance
+       *       significantly in various benchmarks.
+       */
+      if (brw->gen >= 6)
+         brw_emit_pipe_control_flush(brw,
+                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                                     PIPE_CONTROL_CONST_CACHE_INVALIDATE);
+   }
 
-   /* Resolve depth buffer and render cache of each enabled texture. */
    int maxEnabledUnit = ctx->Texture._MaxEnabledTexImageUnit;
    for (int i = 0; i <= maxEnabledUnit; i++) {
       if (!ctx->Texture.Unit[i]._Current)
@@ -253,17 +349,6 @@ intel_update_state(struct gl_context * ctx, GLuint new_state)
       tex_obj = intel_texture_object(ctx->Texture.Unit[i]._Current);
       if (!tex_obj || !tex_obj->mt)
 	 continue;
-      if (intel_miptree_sample_with_hiz(brw, tex_obj->mt))
-         intel_miptree_all_slices_resolve_hiz(brw, tex_obj->mt);
-      else
-         intel_miptree_all_slices_resolve_depth(brw, tex_obj->mt);
-      /* Sampling engine understands lossless compression and resolving
-       * those surfaces should be skipped for performance reasons.
-       */
-      const int flags = intel_texture_view_requires_resolve(brw, tex_obj) ?
-                           0 : INTEL_MIPTREE_IGNORE_CCS_E;
-      intel_miptree_all_slices_resolve_color(brw, tex_obj->mt, flags);
-      brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
 
       if (tex_obj->base.StencilSampling ||
           tex_obj->mt->format == MESA_FORMAT_S_UINT8) {
-- 
2.5.5