[Mesa-dev] [PATCH] mesa: Fix pixel shader scratch space allocation on Gen9+ platforms.

Tue Nov 8 18:25:45 UTC 2016

We had missed an erratum: pixel shader (PS) scratch space needs to be
computed as if there were 4 subslices per slice, rather than 3.

                          Skylake      Broxton        Kabylake
                      GT1 GT2 GT3 GT4  2x6 3x6  GT1 GT1.5 GT2 GT3 GT4
Actual Slices          1   1   2   3    1   1    1    1    1   2   3
Total Subslices        3   3   6   9    2   3    2    3    3   6   9
Subsl. for PS Scratch  4   4   8   12   4   4    4    4    4   8   12

Note that Skylake GT1-3 already worked because we allocated 64 * 9
(trying to use a value that would work on GT4, with 9 subslices),
and the actual required values were 64 * 4 or 64 * 8.  However, all
others (Skylake GT4, Broxton, and Kabylake GT1-4) were underallocated,
which can lead to scratch writes trashing random process memory,
rendering corruption, or GPU hangs.

Fixes GPU hangs and rendering corruption on Skylake GT4 in shaders that
spill.  Particularly, dEQP-GLES31.functional.ubo.all_per_block_buffers.*
now runs successfully with no hangs and renders correctly.  This may
fix problems on Broxton and Kabylake as well.

Cc: "13.0" <mesa-stable at lists.freedesktop.org>
Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/intel/common/gen_device_info.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/intel/common/gen_device_info.c b/src/intel/common/gen_device_info.c
index 30df0b2..1dc1769 100644
--- a/src/intel/common/gen_device_info.c
+++ b/src/intel/common/gen_device_info.c
@@ -335,7 +335,6 @@ static const struct gen_device_info gen_device_info_chv = {
    .max_gs_threads = 336,                           \
    .max_tcs_threads = 336,                          \
    .max_tes_threads = 336,                          \
-   .max_wm_threads = 64 * 9,                        \
    .max_cs_threads = 56,                            \
    .urb = {                                         \
       .size = 384,                                  \
@@ -388,7 +387,6 @@ static const struct gen_device_info gen_device_info_bxt = {
    .max_tcs_threads = 112,
    .max_tes_threads = 112,
    .max_gs_threads = 112,
-   .max_wm_threads = 64 * 3,
    .max_cs_threads = 6 * 6,
    .urb = {
       .size = 192,
@@ -411,7 +409,6 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = {
    .max_tcs_threads = 56, /* XXX: guess */
    .max_tes_threads = 56,
    .max_gs_threads = 56,
-   .max_wm_threads = 64 * 2,
    .max_cs_threads = 6 * 6,
    .urb = {
       .size = 128,
@@ -427,18 +424,11 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = {
  * There's no KBL entry. Using the default SKL (GEN9) GS entries value.
  */
 
-/*
- * Both SKL and KBL support a maximum of 64 threads per
- * Pixel Shader Dispatch (PSD) unit.
- */
-#define  KBL_MAX_THREADS_PER_PSD 64
-
 static const struct gen_device_info gen_device_info_kbl_gt1 = {
    GEN9_FEATURES,
    .gt = 1,
 
    .max_cs_threads = 7 * 6,
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 2,
    .urb.size = 192,
    .num_slices = 1,
 };
@@ -448,7 +438,6 @@ static const struct gen_device_info gen_device_info_kbl_gt1_5 = {
    .gt = 1,
 
    .max_cs_threads = 7 * 6,
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
    .num_slices = 1,
 };
 
@@ -456,7 +445,6 @@ static const struct gen_device_info gen_device_info_kbl_gt2 = {
    GEN9_FEATURES,
    .gt = 2,
 
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 3,
    .num_slices = 1,
 };
 
@@ -464,7 +452,6 @@ static const struct gen_device_info gen_device_info_kbl_gt3 = {
    GEN9_FEATURES,
    .gt = 3,
 
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 6,
    .num_slices = 2,
 };
 
@@ -472,7 +459,6 @@ static const struct gen_device_info gen_device_info_kbl_gt4 = {
    GEN9_FEATURES,
    .gt = 4,
 
-   .max_wm_threads = KBL_MAX_THREADS_PER_PSD * 9,
    /*
     * From the "L3 Allocation and Programming" documentation:
     *
@@ -500,6 +486,25 @@ gen_get_device_info(int devid, struct gen_device_info *devinfo)
       return false;
    }
 
+   /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
+    *
+    * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
+    *  allocate scratch space enough so that each slice has 4 slices allowed."
+    *
+    * The equivalent internal documentation says that this programming note
+    * applies to all Gen9+ platforms.
+    *
+    * The hardware typically calculates the scratch space pointer by taking
+    * the base address, and adding per-thread-scratch-space * thread ID.
+    * Extra padding can be necessary depending on how the thread IDs are
+    * calculated for a particular shader stage.
+    */
+   if (devinfo->gen >= 9) {
+      devinfo->max_wm_threads = 64 /* threads-per-PSD */
+                              * devinfo->num_slices
+                              * 4; /* effective subslices per slice */
+   }
+
    return true;
 }
 
-- 
2.10.2