[Mesa-dev] [PATCH 2/3] i965: Keep track of the per-thread scratch allocation in brw_stage_state.

Francisco Jerez currojerez at riseup.net
Sat Jun 11 21:50:27 UTC 2016


This will be used to find out which per-thread slot size a previously
allocated scratch BO was used with, in order to fix a hardware race
condition without introducing additional stalls or memory allocations.
Instead of calling brw_get_scratch_bo() manually from the various
codegen functions, call a new helper function that keeps track of the
per-thread scratch size and conditionally allocates a larger scratch
BO.
---
This patch and the next one apply on top of Ken's compute shader
scratch fixes from:
 https://lists.freedesktop.org/archives/mesa-dev/2016-June/120084.html
 
 src/mesa/drivers/dri/i965/brw_context.h | 10 +++++++
 src/mesa/drivers/dri/i965/brw_cs.c      | 48 ++++++++++++++++-----------------
 src/mesa/drivers/dri/i965/brw_gs.c      |  8 +++---
 src/mesa/drivers/dri/i965/brw_program.c | 17 ++++++++++++
 src/mesa/drivers/dri/i965/brw_tcs.c     |  8 +++---
 src/mesa/drivers/dri/i965/brw_tes.c     |  8 +++---
 src/mesa/drivers/dri/i965/brw_vs.c      |  8 +++---
 src/mesa/drivers/dri/i965/brw_wm.c      |  7 +++--
 8 files changed, 65 insertions(+), 49 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index daa9ed2..9618b4a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -677,6 +677,12 @@ struct brw_stage_state
     */
    drm_intel_bo *scratch_bo;
 
+   /**
+    * Scratch slot size allocated for each thread in the buffer object given
+    * by \c scratch_bo.
+    */
+   uint32_t per_thread_scratch;
+
    /** Offset in the program cache to the program */
    uint32_t prog_offset;
 
@@ -1481,6 +1487,10 @@ brw_get_scratch_size(int size)
 }
 void brw_get_scratch_bo(struct brw_context *brw,
 			drm_intel_bo **scratch_bo, int size);
+void brw_alloc_stage_scratch(struct brw_context *brw,
+                             struct brw_stage_state *stage_state,
+                             unsigned per_thread_size,
+                             unsigned thread_count);
 void brw_init_shader_time(struct brw_context *brw);
 int brw_get_shader_time_index(struct brw_context *brw,
                               struct gl_shader_program *shader_prog,
diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c
index 329adff..5c89d42 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.c
+++ b/src/mesa/drivers/dri/i965/brw_cs.c
@@ -148,31 +148,29 @@ brw_codegen_cs_prog(struct brw_context *brw,
       }
    }
 
-   if (prog_data.base.total_scratch) {
-      const unsigned subslices = MAX2(brw->intelScreen->subslice_total, 1);
-
-      /* WaCSScratchSize:hsw
-       *
-       * Haswell's scratch space address calculation appears to be sparse
-       * rather than tightly packed.  The Thread ID has bits indicating
-       * which subslice, EU within a subslice, and thread within an EU
-       * it is.  There's a maximum of two slices and two subslices, so these
-       * can be stored with a single bit.  Even though there are only 10 EUs
-       * per subslice, this is stored in 4 bits, so there's an effective
-       * maximum value of 16 EUs.  Similarly, although there are only 7
-       * threads per EU, this is stored in a 3 bit number, giving an effective
-       * maximum value of 8 threads per EU.
-       *
-       * This means that we need to use 16 * 8 instead of 10 * 7 for the
-       * number of threads per subslice.
-       */
-      const unsigned threads_per_subslice =
-         brw->is_haswell ? 16 * 8 : brw->max_cs_threads;
-
-      brw_get_scratch_bo(brw, &brw->cs.base.scratch_bo,
-                         prog_data.base.total_scratch *
-                         threads_per_subslice * subslices);
-   }
+   const unsigned subslices = MAX2(brw->intelScreen->subslice_total, 1);
+
+   /* WaCSScratchSize:hsw
+    *
+    * Haswell's scratch space address calculation appears to be sparse
+    * rather than tightly packed.  The Thread ID has bits indicating
+    * which subslice, EU within a subslice, and thread within an EU
+    * it is.  There's a maximum of two slices and two subslices, so these
+    * can be stored with a single bit.  Even though there are only 10 EUs
+    * per subslice, this is stored in 4 bits, so there's an effective
+    * maximum value of 16 EUs.  Similarly, although there are only 7
+    * threads per EU, this is stored in a 3 bit number, giving an effective
+    * maximum value of 8 threads per EU.
+    *
+    * This means that we need to use 16 * 8 instead of 10 * 7 for the
+    * number of threads per subslice.
+    */
+   const unsigned threads_per_subslice =
+      brw->is_haswell ? 16 * 8 : brw->max_cs_threads;
+
+   brw_alloc_stage_scratch(brw, &brw->cs.base,
+                           prog_data.base.total_scratch,
+                           threads_per_subslice * subslices);
 
    if (unlikely(INTEL_DEBUG & DEBUG_CS))
       fprintf(stderr, "\n");
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 7ead182..4ac1009 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -180,11 +180,9 @@ brw_codegen_gs_prog(struct brw_context *brw,
    }
 
    /* Scratch space is used for register spilling */
-   if (prog_data.base.base.total_scratch) {
-      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
-			 prog_data.base.base.total_scratch *
-                         brw->max_gs_threads);
-   }
+   brw_alloc_stage_scratch(brw, stage_state,
+                           prog_data.base.base.total_scratch,
+                           brw->max_gs_threads);
 
    brw_upload_cache(&brw->cache, BRW_CACHE_GS_PROG,
                     key, sizeof(*key),
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 792f81b..9f822d2 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -345,6 +345,23 @@ brw_get_scratch_bo(struct brw_context *brw,
    }
 }
 
+/**
+ * If \p per_thread_size exceeds the stage's recorded slot size, record it and
+ * call brw_get_scratch_bo() for \p per_thread_size * \p thread_count bytes.
+ */
+void
+brw_alloc_stage_scratch(struct brw_context *brw,
+                        struct brw_stage_state *stage_state,
+                        unsigned per_thread_size,
+                        unsigned thread_count)
+{
+   if (stage_state->per_thread_scratch < per_thread_size) {
+      stage_state->per_thread_scratch = per_thread_size;
+      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
+                         per_thread_size * thread_count);
+   }
+}
+
 void brwInitFragProgFuncs( struct dd_function_table *functions )
 {
    assert(functions->ProgramStringNotify == _tnl_program_string);
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 83fc157..7fc8eb4 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -294,11 +294,9 @@ brw_codegen_tcs_prog(struct brw_context *brw,
    }
 
    /* Scratch space is used for register spilling */
-   if (prog_data.base.base.total_scratch) {
-      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
-			 prog_data.base.base.total_scratch *
-                         brw->max_hs_threads);
-   }
+   brw_alloc_stage_scratch(brw, stage_state,
+                           prog_data.base.base.total_scratch,
+                           brw->max_hs_threads);
 
    brw_upload_cache(&brw->cache, BRW_CACHE_TCS_PROG,
                     key, sizeof(*key),
diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c
index a4cd4da..d7b3e69 100644
--- a/src/mesa/drivers/dri/i965/brw_tes.c
+++ b/src/mesa/drivers/dri/i965/brw_tes.c
@@ -214,11 +214,9 @@ brw_codegen_tes_prog(struct brw_context *brw,
    }
 
    /* Scratch space is used for register spilling */
-   if (prog_data.base.base.total_scratch) {
-      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
-			 prog_data.base.base.total_scratch *
-                         brw->max_ds_threads);
-   }
+   brw_alloc_stage_scratch(brw, stage_state,
+                           prog_data.base.base.total_scratch,
+                           brw->max_ds_threads);
 
    brw_upload_cache(&brw->cache, BRW_CACHE_TES_PROG,
                     key, sizeof(*key),
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index abf03b1..d929f9b 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -208,11 +208,9 @@ brw_codegen_vs_prog(struct brw_context *brw,
    }
 
    /* Scratch space is used for register spilling */
-   if (prog_data.base.base.total_scratch) {
-      brw_get_scratch_bo(brw, &brw->vs.base.scratch_bo,
-			 prog_data.base.base.total_scratch *
-                         brw->max_vs_threads);
-   }
+   brw_alloc_stage_scratch(brw, &brw->vs.base,
+                           prog_data.base.base.total_scratch,
+                           brw->max_vs_threads);
 
    brw_upload_cache(&brw->cache, BRW_CACHE_VS_PROG,
 		    key, sizeof(struct brw_vs_prog_key),
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index c9c5d5e..46839bc 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -163,10 +163,9 @@ brw_codegen_wm_prog(struct brw_context *brw,
       }
    }
 
-   if (prog_data.base.total_scratch) {
-      brw_get_scratch_bo(brw, &brw->wm.base.scratch_bo,
-			 prog_data.base.total_scratch * brw->max_wm_threads);
-   }
+   brw_alloc_stage_scratch(brw, &brw->wm.base,
+                           prog_data.base.total_scratch,
+                           brw->max_wm_threads);
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM))
       fprintf(stderr, "\n");
-- 
2.7.3



More information about the mesa-dev mailing list