[Mesa-dev] [PATCH v2] i965: Support larger scratch space sizes for compute shaders.

Kenneth Graunke kenneth@whitecape.org
Tue Jun 21 23:35:54 UTC 2016


Ivybridge and Baytrail have a pretty harsh limit of 12kB of scratch
space per thread.  However, we can exceed this limit with a clever trick.

Later platforms have a 2MB limit, which is much more reasonable, but
we may as well apply the same trick there.
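
The trick, as a rough C sketch (a hypothetical helper for illustration
only; the actual patch emits equivalent AND/SHL/ADD instructions at the
top of the shader):

   #include <stdint.h>

   /* g0.5 holds the scratch space pointer in its upper bits and the
    * FFTID (fixed function thread ID) in bits 9:0.  The hardware bases
    * the pointer on FFTID * 2^hw_size_log2, so we subtract that term
    * and add FFTID * 2^our_size_log2 in its place.
    */
   static uint32_t
   fix_scratch_pointer(uint32_t g0_5, unsigned hw_size_log2,
                       unsigned our_size_log2)
   {
      const uint32_t fftid = g0_5 & 0x3ff;
      g0_5 -= fftid << hw_size_log2;   /* 13 (8kB) on IVB/BYT, else 21 (2MB) */
      g0_5 += fftid << our_size_log2;  /* log2 of our per-thread scratch size */
      return g0_5;
   }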

We can probably extend this trick to other stages, but would need to
adjust the shader code for the different thread payload layouts.

Fixes Piglit's spec/arb_compute_shader/linker/bug-93840 on Ivybridge GT1
and Baytrail.

v2:
- Fix builder execution flags (caught by Curro)
- Don't overflow integer immediate ranges (caught by Curro)
- Don't use MUL as this is post-lowering, and we'd need to handle
  Cherryview retyping nonsense, which would be painful.  Use shifts.
- Fail the compile if the scratch space is too large.
- Put all this nonsense into a helper function.

Cc: "12.0" <mesa-stable@lists.freedesktop.org>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96505
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp      | 91 +++++++++++++++++++++++++++----
 src/mesa/drivers/dri/i965/brw_fs.h        |  1 +
 src/mesa/drivers/dri/i965/gen7_cs_state.c | 11 +++-
 3 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 09b0431..c2dc2ac 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5975,6 +5975,12 @@ fs_visitor::allocate_registers(bool allow_spilling)
 
    schedule_instructions(SCHEDULE_POST);
 
+   setup_scratch_space();
+}
+
+void
+fs_visitor::setup_scratch_space()
+{
    if (last_scratch > 0) {
       unsigned max_scratch_size = 2 * 1024 * 1024;
 
@@ -5993,22 +5999,83 @@ fs_visitor::allocate_registers(bool allow_spilling)
              * field documentation, platforms prior to Haswell measure scratch
              * size linearly with a range of [1kB, 12kB] and 1kB granularity.
              */
-            prog_data->total_scratch = ALIGN(last_scratch, 1024);
             max_scratch_size = 12 * 1024;
+
+            const unsigned linear_size = ALIGN(last_scratch, 1024);
+            if (linear_size < max_scratch_size)
+               prog_data->total_scratch = linear_size;
+
+            /* If it exceeds the maximum, keep the power-of-two size
+             * computed earlier. */
+         }
+
+         if (prog_data->total_scratch >= max_scratch_size) {
+            const bool ivb_byt = devinfo->gen == 7 && !devinfo->is_haswell;
+
+            /* Normally, the hardware computes a pointer to the scratch
+             * space region for our thread for us.  This comes with a
+             * limitation - each thread can only use <max_scratch_size>
+             * bytes of scratch space.  This is particularly small on Gen7.
+             * To support more than this limit, we can subtract the
+             * hardware-computed offset and add our own.
+             *
+             * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
+             * Thread Group Tracking > Local Memory/Scratch Space.
+             */
+            const fs_builder bld = fs_builder(this, 1)
+               .at(cfg->blocks[0], cfg->blocks[0]->start()).exec_all();
+
+            /* We need a register for temporary storage, but can't allocate
+             * one post-register-allocation.  However, since our code will
+             * be emitted at the top of the program, we can safely use g127,
+             * as it isn't part of the thread payload and can't be in use.
+             */
+            struct brw_reg g0_5 =
+               retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_D);
+            struct brw_reg fftid =
+               retype(brw_vec1_grf(127, 0), BRW_REGISTER_TYPE_D);
+            struct brw_reg tmp =
+               retype(brw_vec1_grf(127, 1), BRW_REGISTER_TYPE_D);
+
+            /* We program MEDIA_VFE_STATE to use 8kB (2^13) on
+             * Ivybridge/Baytrail and 2MB (2^21) elsewhere.  Subtract
+             * FFTID << hw_supplied_max_power to undo the hardware's offset.
+             */
+            const int hw_supplied_max_power = ivb_byt ? 13 : 21;
+
+            bld.AND(fftid, g0_5, brw_imm_d(0x3ff));
+
+            /* Subtract the hardware-supplied offset */
+            bld.SHL(tmp, fftid, brw_imm_d(hw_supplied_max_power));
+            bld.ADD(g0_5, g0_5, negate(tmp));
+
+            /* Add a new offset */
+            bld.SHL(tmp, fftid, brw_imm_d(ffs(prog_data->total_scratch) - 1));
+            bld.ADD(g0_5, g0_5, tmp);
+
+            /* Our new offset still needs to fit in 32 bits, which imposes
+             * a new maximum per-thread scratch space size.  Compute that
+             * so we can still fail the compile if necessary.
+             *
+             * We use the number of FFTID bits to bound the number of
+             * threads.  FFTID is a 10-bit field, but on some platforms
+             * the high bits are "reserved, must be zero".
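+             *
+             * For example, with 8 FFTID bits there can be at most 2^8
+             * threads, so per-thread offsets stay below 2^32 as long as
+             * each thread uses no more than 2^(32 - 8) = 16MB of scratch.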
+             */
+            int fftid_bits;
+            if (ivb_byt)
+               fftid_bits = 8;
+            else if (devinfo->gen <= 9)
+               fftid_bits = 9;
+            else
+               fftid_bits = 10;
+
+            max_scratch_size = 1 << (32 - fftid_bits);
          }
       }
 
-      /* We currently only support up to 2MB of scratch space.  If we
-       * need to support more eventually, the documentation suggests
-       * that we could allocate a larger buffer, and partition it out
-       * ourselves.  We'd just have to undo the hardware's address
-       * calculation by subtracting (FFTID * Per Thread Scratch Space)
-       * and then add FFTID * (Larger Per Thread Scratch Space).
-       *
-       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
-       * Thread Group Tracking > Local Memory/Scratch Space.
-       */
-      assert(prog_data->total_scratch < max_scratch_size);
+      if (prog_data->total_scratch > max_scratch_size) {
+         fail("Shader exceeded maximum per-thread scratch space limit: "
+              "%u bytes > %u bytes\n",
+              prog_data->total_scratch, max_scratch_size);
+      }
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 4237197..3a7ff1a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -99,6 +99,7 @@ public:
    bool run_cs();
    void optimize();
    void allocate_registers(bool allow_spilling);
+   void setup_scratch_space();
    void setup_fs_payload_gen4();
    void setup_fs_payload_gen6();
    void setup_vs_payload();
diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c
index 5fb8829..34c3f09 100644
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
@@ -70,21 +70,26 @@ brw_upload_cs_state(struct brw_context *brw)
           */
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(stage_state->per_thread_scratch) - 11);
+                     MIN2(ffs(stage_state->per_thread_scratch) - 11, 11));
       } else if (brw->is_haswell) {
          /* Haswell's Per Thread Scratch Space is in the range [0, 10]
           * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
           */
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(stage_state->per_thread_scratch) - 12);
+                   MIN2(ffs(stage_state->per_thread_scratch) - 12, 10));
       } else {
          /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
           * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
+          *
+          * brw_fs.cpp has a hack to use more than 12kB of scratch space;
+          * it relies on us programming the hardware to 8kB (the largest
+          * power of two that fits in the 12kB limit).
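+          *
+          * For example, 32kB of per-thread scratch gets programmed below
+          * as encoding 7 (8kB); the shader code then adjusts its scratch
+          * addresses to make use of the full 32kB.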
           */
+         unsigned scratch_kb = stage_state->per_thread_scratch / 1024;
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   stage_state->per_thread_scratch / 1024 - 1);
+                   (scratch_kb > 12 ? 8 : scratch_kb) - 1);
       }
    } else {
       OUT_BATCH(0);
-- 
2.8.3


