[Mesa-dev] [PATCH 2/2] i965: Support "unlimited" compute shader scratch space.

Fri Jun 17 02:58:17 UTC 2016

Ivybridge and Baytrail have a pretty harsh limit of 12kB scratch space
per thread.  However, we can exceed this limit with some clever tricks
and gain effectively unlimited scratch space.

Later platforms have a 2MB limit, which is much more reasonable, but
we may as well apply the same trick there.

We can probably extend this trick to other stages, but would need to
adjust the shader code for the different thread payload layouts.

Cc: "12.0" <mesa-stable at lists.freedesktop.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96505
Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp      | 50 +++++++++++++++++++++++--------
 src/mesa/drivers/dri/i965/gen7_cs_state.c |  6 ++--
 2 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index cadf905..9075918 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5993,19 +5993,45 @@ fs_visitor::allocate_registers(bool allow_spilling)
             prog_data->total_scratch = ALIGN(last_scratch, 1024);
             max_scratch_size = 12 * 1024;
          }
-      }
 
-      /* We currently only support up to 2MB of scratch space.  If we
-       * need to support more eventually, the documentation suggests
-       * that we could allocate a larger buffer, and partition it out
-       * ourselves.  We'd just have to undo the hardware's address
-       * calculation by subtracting (FFTID * Per Thread Scratch Space)
-       * and then add FFTID * (Larger Per Thread Scratch Space).
-       *
-       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
-       * Thread Group Tracking > Local Memory/Scratch Space.
-       */
-      assert(prog_data->total_scratch < max_scratch_size);
+         if (prog_data->total_scratch >= max_scratch_size) {
+            /* Normally, the hardware computes a pointer to the scratch
+             * space region for our thread for us.  This comes with a
+             * limitation - each thread can only use a limited amount
+             * of scratch space.  To support more than that limit, we
+             * can subtract the hardware's offset and add our own:
+             *
+             * Scratch Space Pointer +=
+             *    FFTID * (prog_data->total_scratch - max_scratch_size);
+             *
+             * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
+             * Thread Group Tracking > Local Memory/Scratch Space.
+             */
+            const fs_builder bld =
+               fs_builder(this, cfg->blocks[0], (fs_inst *)
+                          cfg->blocks[0]->start()).exec_all();
+            struct brw_reg g0_5 =
+               retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_D);
+            struct brw_reg g127 =
+               retype(brw_vec1_grf(127, 0), BRW_REGISTER_TYPE_D);
+
+            /* Multiply FFTID by max_scratch_size and shift the result
+             * left by 10.  Then subtract that from the scratch pointer.
+             *
+             * We need a register for temporary storage, but can't allocate
+             * one post-register-allocation.  However, since our code will
+             * be emitted at the top of the program, we can safely use g127,
+             * as it isn't part of the thread payload and can't be in use.
+             */
+            bld.AND(g127, g0_5, brw_imm_d(0x3ff));
+            bld.MUL(g127, g127,
+                    brw_imm_d(prog_data->total_scratch - max_scratch_size));
+            bld.ADD(g0_5, g0_5, g127);
+         }
+      } else {
+         /* We should extend the above hack for other stages someday. */
+         assert(prog_data->total_scratch < max_scratch_size);
+      }
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c
index 5fb8829..b89a186 100644
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
@@ -70,21 +70,21 @@ brw_upload_cs_state(struct brw_context *brw)
           */
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(stage_state->per_thread_scratch) - 11);
+                     MIN2(ffs(stage_state->per_thread_scratch) - 11, 11));
       } else if (brw->is_haswell) {
          /* Haswell's Per Thread Scratch Space is in the range [0, 10]
           * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
           */
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(stage_state->per_thread_scratch) - 12);
+                   MIN2(ffs(stage_state->per_thread_scratch) - 12, 10));
       } else {
          /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
           * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
           */
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   stage_state->per_thread_scratch / 1024 - 1);
+                   MIN2(stage_state->per_thread_scratch / 1024, 12) - 1);
       }
    } else {
       OUT_BATCH(0);
-- 
2.8.3