[Mesa-dev] [PATCH 3/3] i965: Fix cross-primitive scratch corruption when changing the per-thread allocation.

Francisco Jerez currojerez at riseup.net
Sat Jun 11 21:50:28 UTC 2016


I haven't found any mention of this in the hardware docs, but
experimentally what seems to be going on is that when the per-thread
scratch slot size is changed between two pipelined draw calls, shader
invocations using the old and new scratch size setting may end up
being executed in parallel, causing their scratch offset calculations
to be based in a different partitioning of the scratch space, which
can cause their thread-local scratch space to overlap leading to
cross-thread scratch corruption.

I've been experimenting with alternative workarounds, like emitting a
PIPE_CONTROL with DC flush and CS stall between draw (or dispatch
compute) calls using different per-thread scratch allocation settings,
or avoiding reuse of the scratch BO if the per-thread scratch
allocation doesn't exactly match the original.  Both seem to be as
effective as this workaround, but they have potential performance
implications, while this should be basically for free.

Fixes over 40 failures in our CI system with spilling forced on
(including CTS, dEQP and Piglit failures) on a number of different
platforms from Gen4 to Gen9.  The 'glsl-max-varyings' piglit test
seems to be able to reproduce this bug consistently in the vertex
shader on at least Gen4, Gen8 and Gen9 with spilling forced on.
---
 src/mesa/drivers/dri/i965/brw_context.h   | 13 +++++++++++++
 src/mesa/drivers/dri/i965/brw_vs_state.c  |  2 +-
 src/mesa/drivers/dri/i965/brw_wm_state.c  |  2 +-
 src/mesa/drivers/dri/i965/gen6_gs_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen6_vs_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen6_wm_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen7_cs_state.c |  6 +++---
 src/mesa/drivers/dri/i965/gen7_ds_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen7_gs_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen7_hs_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen7_vs_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen7_wm_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen8_ds_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen8_gs_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen8_hs_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen8_ps_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen8_vs_state.c |  2 +-
 17 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 9618b4a..6e84506 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -674,6 +674,19 @@ struct brw_stage_state
    /**
     * Optional scratch buffer used to store spilled register values and
     * variably-indexed GRF arrays.
+    *
+    * The contents of this buffer are short-lived so the same memory can be
+    * re-used at will for multiple shader programs (executed by the same fixed
+    * function).  However reusing a scratch BO for which shader invocations
+    * are still in flight with a per-thread scratch slot size other than the
+    * original can cause threads with different scratch slot size and FFTID
+    * (which may be executed in parallel depending on the shader stage and
+    * hardware generation) to map to an overlapping region of the scratch
+    * space, which can potentially lead to mutual scratch space corruption.
+    * For that reason if you borrow this scratch buffer you should only be
+    * using the slot size given by the \c per_thread_scratch member below,
+    * unless you're taking additional measures to synchronize thread execution
+    * across slot size changes.
     */
    drm_intel_bo *scratch_bo;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index c728f09..331949a 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -83,7 +83,7 @@ brw_upload_vs_unit(struct brw_context *brw)
       vs->thread2.scratch_space_base_pointer =
 	 stage_state->scratch_bo->offset64 >> 10; /* reloc */
       vs->thread2.per_thread_scratch_space =
-	 ffs(brw->vs.prog_data->base.base.total_scratch) - 11;
+	 ffs(stage_state->per_thread_scratch) - 11;
    } else {
       vs->thread2.scratch_space_base_pointer = 0;
       vs->thread2.per_thread_scratch_space = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index bf1bdc9..dda4f23 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -133,7 +133,7 @@ brw_upload_wm_unit(struct brw_context *brw)
       wm->thread2.scratch_space_base_pointer =
 	 brw->wm.base.scratch_bo->offset64 >> 10; /* reloc */
       wm->thread2.per_thread_scratch_space =
-	 ffs(prog_data->base.total_scratch) - 11;
+	 ffs(brw->wm.base.per_thread_scratch) - 11;
    } else {
       wm->thread2.scratch_space_base_pointer = 0;
       wm->thread2.per_thread_scratch_space = 0;
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
index 4e4b9463..da7322e 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
@@ -140,7 +140,7 @@ upload_gs_state(struct brw_context *brw)
       if (prog_data->base.total_scratch) {
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->base.total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0); /* no scratch space */
       }
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index 3ae00ec..0ad74c4 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -126,7 +126,7 @@ upload_vs_state(struct brw_context *brw)
    if (brw->vs.prog_data->base.base.total_scratch) {
       OUT_RELOC(stage_state->scratch_bo,
 		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(brw->vs.prog_data->base.base.total_scratch) - 11);
+		ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
    }
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 3e872af..34aa121 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -220,7 +220,7 @@ gen6_upload_wm_state(struct brw_context *brw,
    if (prog_data->base.total_scratch) {
       OUT_RELOC(stage_state->scratch_bo,
                 I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(prog_data->base.total_scratch) - 11);
+		ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
    }
diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c
index ba558a6..abaa2ce 100644
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
@@ -70,21 +70,21 @@ brw_upload_cs_state(struct brw_context *brw)
           */
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(prog_data->total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
       } else if (brw->is_haswell) {
          /* Haswell's Per Thread Scratch Space is in the range [0, 10]
           * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
           */
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->total_scratch) - 12);
+                   ffs(stage_state->per_thread_scratch) - 12);
       } else {
          /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
           * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
           */
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   prog_data->total_scratch / 1024 - 1);
+                   stage_state->per_thread_scratch / 1024 - 1);
       }
    } else {
       OUT_BATCH(0);
diff --git a/src/mesa/drivers/dri/i965/gen7_ds_state.c b/src/mesa/drivers/dri/i965/gen7_ds_state.c
index 2fe0d88..3e1a03b 100644
--- a/src/mesa/drivers/dri/i965/gen7_ds_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_ds_state.c
@@ -82,7 +82,7 @@ gen7_upload_ds_state(struct brw_context *brw)
       if (prog_data->total_scratch) {
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
       }
diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
index 6b126fe..7d73393 100644
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
@@ -64,7 +64,7 @@ upload_gs_state(struct brw_context *brw)
       if (brw->gs.prog_data->base.base.total_scratch) {
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(brw->gs.prog_data->base.base.total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
       }
diff --git a/src/mesa/drivers/dri/i965/gen7_hs_state.c b/src/mesa/drivers/dri/i965/gen7_hs_state.c
index 4f948dc..297d61b 100644
--- a/src/mesa/drivers/dri/i965/gen7_hs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_hs_state.c
@@ -83,7 +83,7 @@ gen7_upload_hs_state(struct brw_context *brw)
       if (prog_data->base.total_scratch) {
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->base.total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
       }
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index bb3179d..2715b37 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -56,7 +56,7 @@ upload_vs_state(struct brw_context *brw)
    if (prog_data->base.total_scratch) {
       OUT_RELOC(stage_state->scratch_bo,
 		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(prog_data->base.total_scratch) - 11);
+		ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
    }
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index 8d4d4fc..8243905 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -235,7 +235,7 @@ gen7_upload_ps_state(struct brw_context *brw,
    if (prog_data->base.total_scratch) {
       OUT_RELOC(brw->wm.base.scratch_bo,
 		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(prog_data->base.total_scratch) - 11);
+		ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
    }
diff --git a/src/mesa/drivers/dri/i965/gen8_ds_state.c b/src/mesa/drivers/dri/i965/gen8_ds_state.c
index 0219d07..6f01abb 100644
--- a/src/mesa/drivers/dri/i965/gen8_ds_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ds_state.c
@@ -52,7 +52,7 @@ gen8_upload_ds_state(struct brw_context *brw)
       if (prog_data->total_scratch) {
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(prog_data->total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
          OUT_BATCH(0);
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
index 2741330..eb0c3df 100644
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -57,7 +57,7 @@ gen8_upload_gs_state(struct brw_context *brw)
       if (brw->gs.prog_data->base.base.total_scratch) {
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(brw->gs.prog_data->base.base.total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
          OUT_BATCH(0);
diff --git a/src/mesa/drivers/dri/i965/gen8_hs_state.c b/src/mesa/drivers/dri/i965/gen8_hs_state.c
index 4f8eba6..e759a64 100644
--- a/src/mesa/drivers/dri/i965/gen8_hs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_hs_state.c
@@ -52,7 +52,7 @@ gen8_upload_hs_state(struct brw_context *brw)
       if (prog_data->base.total_scratch) {
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(prog_data->base.total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
          OUT_BATCH(0);
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index 51a3121..0e7d258 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -255,7 +255,7 @@ gen8_upload_ps_state(struct brw_context *brw,
    if (prog_data->base.total_scratch) {
       OUT_RELOC64(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                  ffs(prog_data->base.total_scratch) - 11);
+                  ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
       OUT_BATCH(0);
diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c
index d4a3455..b7682b5 100644
--- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c
@@ -58,7 +58,7 @@ upload_vs_state(struct brw_context *brw)
    if (prog_data->base.total_scratch) {
       OUT_RELOC64(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                  ffs(prog_data->base.total_scratch) - 11);
+                  ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
       OUT_BATCH(0);
-- 
2.7.3



More information about the mesa-dev mailing list