[Beignet] [PATCH 8/8] HSW: Correct the scratch buffer size calculation and set the correct index in the VFE state.

Mon May 12 08:45:23 PDT 2014

On HSW, the scratch buffer alignment and the scratch index set in the VFE state differ from IVB.
Also, the per-thread stack offset is calculated from R0.0's FFTID, and the definition of
FFTID has changed in HSW as well.
With this patch, all utests pass.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/context.cpp       | 14 ++--------
 backend/src/backend/context.hpp       |  4 +++
 backend/src/backend/gen75_context.cpp | 52 +++++++++++++++++++++++++++++++++++
 backend/src/backend/gen75_context.hpp | 11 ++++++++
 backend/src/backend/gen_context.cpp   |  6 ++++
 backend/src/backend/gen_context.hpp   |  8 +++++-
 src/cl_device_id.c                    | 12 ++++----
 src/intel/intel_gpgpu.c               | 27 +++++++++++++++---
 8 files changed, 111 insertions(+), 23 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 6a0bca2..db968c3 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -313,16 +313,6 @@ namespace gbe
     allocatedBlocks.insert(std::make_pair(offset + subOffset, size - subOffset));
   }
 
-  static int
-  alignScratchSize(int size){
-    int i = 0;
-
-    for(; i < size; i+=1024)
-      ;
-
-    return i;
-  }
-
   ///////////////////////////////////////////////////////////////////////////
   // Generic Context (shared by the simulator and the HW context)
   ///////////////////////////////////////////////////////////////////////////
@@ -355,7 +345,7 @@ namespace gbe
     GBE_SAFE_DELETE(this->scratchAllocator);
     GBE_ASSERT(dag != NULL && liveness != NULL);
     this->registerAllocator = GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
-    this->scratchAllocator = GBE_NEW(ScratchAllocator, 12*KB);
+    this->scratchAllocator = GBE_NEW(ScratchAllocator, this->getScratchSize());
     this->curbeRegs.clear();
     this->JIPs.clear();
   }
@@ -375,7 +365,7 @@ namespace gbe
       this->kernel = NULL;
     }
     if(this->kernel != NULL) {
-      this->kernel->scratchSize = alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
+      this->kernel->scratchSize = this->alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
       this->kernel->ctx = this;
     }
     return this->kernel;
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index d4dcfca..2a37a0e 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -103,6 +103,10 @@ namespace gbe
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t) = 0;
+    /*! Get the device's max scratch size */
+    virtual uint32_t getScratchSize(void) = 0;
     /*! Allocate a new empty kernel (to be implemented) */
     virtual Kernel *allocateKernel(void) = 0;
     /*! Look if a stack is needed and allocate it */
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index f22a6ab..aedd4d3 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -54,6 +54,58 @@ namespace gbe
       allocCurbeReg(ir::ocl::slmoffset, GBE_CURBE_SLM_OFFSET);
   }
 
+  uint32_t Gen75Context::alignScratchSize(uint32_t size){
+    // 0 stays 0; otherwise double from 2KB until the request is covered.
+    if(size == 0) return 0;
+    uint32_t aligned = 2048;
+    for(; aligned < size; aligned <<= 1) {}
+    return aligned;
+  }
+
+  void Gen75Context::emitStackPointer(void) {
+    using namespace ir;
+
+    // Nothing to emit when the kernel has no stack pointer entry in its curbe
+    if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+      return;
+
+    // Sanity checks: both sizes must be non-zero powers of two so they can be applied as shifts
+    const uint32_t perLaneSize = kernel->getStackSize();
+    const uint32_t perThreadSize = perLaneSize * this->simdWidth;
+    GBE_ASSERT(perLaneSize > 0);
+    GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
+    GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
+
+    // Use shifts rather than muls, which are limited to 32x16 bit sources
+    const uint32_t perLaneShift = logi2(perLaneSize);
+    const uint32_t perThreadShift = logi2(perThreadSize);
+    const GenRegister selStatckPtr = this->simdWidth == 8 ?
+      GenRegister::ud8grf(ir::ocl::stackptr) :
+      GenRegister::ud16grf(ir::ocl::stackptr);
+    const GenRegister stackptr = ra->genReg(selStatckPtr);
+    const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
+    const GenRegister bufferptr = ra->genReg(selStackBuffer);
+
+    // stackptr = (stackptr << perLaneShift) + bufferptr + (id << perThreadShift), id built from ud1grf(0,5) below
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+      p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f)); // lo = bits [6:0]
+      p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x80)); // hi = bit 7 ...
+      p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7)); // ... moved down to bit 0
+      p->curr.execWidth = this->simdWidth;
+      p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift)); // every lane: stackptr *= perLaneSize
+      p->curr.execWidth = 1;
+      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(1)); // id = lo << 1
+      p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4)); // id += hi
+      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift)); // id *= perThreadSize
+      p->curr.execWidth = this->simdWidth;
+      p->ADD(stackptr, stackptr, bufferptr); // every lane: add the stack buffer base
+      p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0)); // every lane: add the per-thread offset
+    p->pop();
+  }
+
   void Gen75Context::newSelection(void) {
     this->sel = GBE_NEW(Selection75, *this);
   }
diff --git a/backend/src/backend/gen75_context.hpp b/backend/src/backend/gen75_context.hpp
index bd0986c..5e710da 100644
--- a/backend/src/backend/gen75_context.hpp
+++ b/backend/src/backend/gen75_context.hpp
@@ -35,6 +35,17 @@ namespace gbe
     Gen75Context(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
             : GenContext(unit, name, deviceID, relaxMath) {
     };
+    /*! device's max scratch buffer size */
+    const int GEN75_SCRATCH_SIZE = 2 * KB * KB;
+    /*! Emit the per-lane stack pointer computation */
+    virtual void emitStackPointer(void);
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t size);
+    /*! Get the device's max scratch size */
+    virtual uint32_t getScratchSize(void) {
+      // The allocator uses uint16_t, so clamp the size for now; needs refinement
+      return std::min(GEN75_SCRATCH_SIZE, 0x7fff);
+    }
 
   protected:
     virtual GenEncoder* generateEncoder(void) {
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index adeb852..e5ddb74 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -79,6 +79,12 @@ namespace gbe
     this->sel = GBE_NEW(Selection, *this);
   }
 
+  uint32_t GenContext::alignScratchSize(uint32_t size){
+    // Round size up to the next multiple of 1KB (0 stays 0). Closed form rather than
+    // a counting loop, which never terminated once its counter wrapped past 2^32.
+    return (size + 1023u) & ~1023u;
+  }
+
   void GenContext::emitInstructionStream(void) {
     // Emit Gen ISA
     for (auto &block : *sel->blockList)
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index ba4cdc6..be85edc 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -64,12 +64,18 @@ namespace gbe
                bool relaxMath = false);
     /*! Release everything needed */
     ~GenContext(void);
+    /*! device's max scratch buffer size */
+    const int GEN7_SCRATCH_SIZE = 12 * KB;
     /*! Start new code generation with specific parameters */
     void startNewCG(uint32_t simdWidth, uint32_t reservedSpillRegs, bool limitRegisterPressure);
     /*! Target device ID*/
     uint32_t deviceID;
     /*! Implements base class */
     virtual bool emitCode(void);
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t size);
+    /*! Get the device's max scratch size */
+    virtual uint32_t getScratchSize(void) { return GEN7_SCRATCH_SIZE; }
     /*! Function we emit code for */
     INLINE const ir::Function &getFunction(void) const { return fn; }
     /*! Simd width chosen for the current function */
@@ -78,7 +84,7 @@ namespace gbe
     /*! check the flag reg, if is grf, use f0.1 instead */
     GenRegister checkFlagRegister(GenRegister flagReg);
     /*! Emit the per-lane stack pointer computation */
-    void emitStackPointer(void);
+    virtual void emitStackPointer(void);
     /*! Emit the instructions */
     void emitInstructionStream(void);
     /*! Set the correct target values for the branches */
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 74c6ab8..01f672c 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -71,8 +71,8 @@ static struct _cl_device_id intel_baytrail_t_device = {
 /* XXX we clone IVB for HSW now */
 static struct _cl_device_id intel_hsw_gt1_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 64,
-  .max_thread_per_unit = 8,
+  .max_compute_unit = 70,
+  .max_thread_per_unit = 7,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
@@ -82,8 +82,8 @@ static struct _cl_device_id intel_hsw_gt1_device = {
 
 static struct _cl_device_id intel_hsw_gt2_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 128,
-  .max_thread_per_unit = 8,
+  .max_compute_unit = 140,
+  .max_thread_per_unit = 7,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
@@ -93,8 +93,8 @@ static struct _cl_device_id intel_hsw_gt2_device = {
 
 static struct _cl_device_id intel_hsw_gt3_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 256,
-  .max_thread_per_unit = 8,
+  .max_compute_unit = 280,
+  .max_thread_per_unit = 7,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 103a4b2..26d27b7 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -121,6 +121,9 @@ typedef struct intel_gpgpu intel_gpgpu_t;
 typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm);
 intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL;
 
+typedef uint32_t (get_scratch_index_t)(uint32_t size);
+get_scratch_index_t *get_scratch_index = NULL;
+
 static void
 intel_gpgpu_sync(void *buf)
 {
@@ -230,17 +233,34 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
   ADVANCE_BATCH(gpgpu->batch);
 }
 
+uint32_t get_scratch_index_gen7(uint32_t size) {
+  return (size >> 10) - 1;   /* number of 1KB units, minus one: 1KB -> 0 */
+}
+
+uint32_t get_scratch_index_gen75(uint32_t size) {
+    /* HSW per-thread scratch is a power of two with 0 = 2KB: index = ceil(log2(size >> 11)).
+     * The round-up test must run before the loop shifts the value down to 0. */
+    const uint32_t units = size >> 11;
+    const uint32_t round_up = (units & (units - 1)) != 0;
+    uint32_t index = 0;
+    for(uint32_t rest = units >> 1; rest > 0; rest >>= 1)
+      index++;   /* position of the leading one */
+    return index + round_up;
+}
+
 static void
 intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
 {
+  int32_t scratch_index;
   BEGIN_BATCH(gpgpu->batch, 8);
   OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
 
   if(gpgpu->per_thread_scratch > 0) {
+    scratch_index = get_scratch_index(gpgpu->per_thread_scratch);
     OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
               I915_GEM_DOMAIN_RENDER,
               I915_GEM_DOMAIN_RENDER,
-              gpgpu->per_thread_scratch/1024 - 1);
+              scratch_index);
   }
   else {
     OUT_BATCH(gpgpu->batch, 0);
@@ -351,9 +371,6 @@ intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
     OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
     ADVANCE_BATCH(gpgpu->batch);
 
-  //To set L3 in HSW, enable the flag I915_EXEC_ENABLE_SLM flag when exec
-  if(use_slm)
-    gpgpu->batch->enable_slm = 1;
   intel_gpgpu_pipe_control(gpgpu);
 }
 
@@ -1150,10 +1167,12 @@ intel_set_gpgpu_callbacks(int device_id)
   if (IS_HASWELL(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
     intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75;
+    get_scratch_index = get_scratch_index_gen75;
   }
   else if (IS_IVYBRIDGE(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
     intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen7;
+    get_scratch_index = get_scratch_index_gen7;
   }
   else
     assert(0);
-- 
1.8.3.2