[Beignet] [PATCH V2 1/2] enable scratch memory allocation and read/write

Wed Aug 7 00:07:40 PDT 2013

v2: refine function naming.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/context.cpp           |   18 ++++++++++
 backend/src/backend/context.hpp           |    3 ++
 backend/src/backend/gen/gen_mesa_disasm.c |   43 ++++++++++++++++++++----
 backend/src/backend/gen_context.cpp       |   31 +++++++++++++++++
 backend/src/backend/gen_context.hpp       |    4 ++-
 backend/src/backend/gen_defs.hpp          |   25 ++++++++++++++
 backend/src/backend/gen_encoder.cpp       |   43 ++++++++++++++++++++++++
 backend/src/backend/gen_encoder.hpp       |    4 +++
 backend/src/backend/program.cpp           |    8 +++++
 backend/src/backend/program.h             |    4 +++
 backend/src/backend/program.hpp           |    3 ++
 src/cl_command_queue_gen7.c               |    9 +++++
 src/cl_driver.h                           |    4 +++
 src/cl_driver_defs.c                      |    1 +
 src/intel/intel_gpgpu.c                   |   52 ++++++++++++++++++++++-------
 15 files changed, 233 insertions(+), 19 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 48160de..5484869 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -268,6 +268,15 @@ namespace gbe
     }
   }
 
+  static int
+  alignScratchSize(int size){  // Round size up to the next multiple of 1024; yields 0 when size <= 0.
+    int i = 0;
+    // Advance in 1024 steps until i reaches or passes size (the loop body is intentionally empty).
+    for(; i < size; i+=1024)
+      ;
+    // NOTE(review): this value ends up as per_thread_scratch/1024 - 1 in the VFE state; confirm the HW encoding accepts non-power-of-two sizes.
+    return i;
+  }
   ///////////////////////////////////////////////////////////////////////////
   // Generic Context (shared by the simulator and the HW context)
   ///////////////////////////////////////////////////////////////////////////
@@ -284,6 +293,7 @@ namespace gbe
       this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
     else
       this->simdWidth = fn.getSimdWidth();
+    this->scratchOffset = 0;
   }
 
   Context::~Context(void) {
@@ -306,6 +316,8 @@ namespace gbe
       this->kernel = NULL;
     }
     if(this->kernel != NULL)
+      this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
+    if(this->kernel != NULL)
       this->kernel->ctx = this;
     return this->kernel;
   }
@@ -337,6 +349,12 @@ namespace gbe
     return offset + GEN_REG_SIZE;
   }
 
+  uint32_t Context::allocateScratchMem(uint32_t size) {  // Reserve size units (presumably bytes - TODO confirm) of scratch space.
+    uint32_t offset = scratchOffset;  // the reservation starts at the current high-water mark
+    scratchOffset += size;            // no alignment is applied here and nothing is ever released
+    return offset;                    // start offset of the newly reserved range
+  }
+
   void Context::buildStack(void) {
     const auto &stackUse = dag->getUse(ir::ocl::stackptr);
     if (stackUse.size() == 0)  // no stack is used if stackptr is unused
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index c205388..50c0e70 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -91,6 +91,8 @@ namespace gbe
     /* allocate a new entry for a specific image's information */
     /*! Get (search or allocate if fail to find one) image info curbeOffset.*/
     uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
+    /*! Reserve size units (presumably bytes - TODO confirm) of scratch memory and return the offset at which the reserved range starts */
+    uint32_t allocateScratchMem(uint32_t size);
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
@@ -126,6 +128,7 @@ namespace gbe
     set<ir::LabelIndex> usedLabels;       //!< Set of all used labels
     JIPMap JIPs;                          //!< Where to jump all labels/branches
     uint32_t simdWidth;                   //!< Number of lanes per HW threads
+    uint32_t scratchOffset;               //!< scratch slot for next scratch memory request
     GBE_CLASS(Context);                   //!< Use custom allocators
   };
 
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index ca8ca37..bfb865a 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -373,6 +373,28 @@ static const char *data_port_data_cache_category[] = {
   "scratch",
 };
 
+static const char *data_port_scratch_block_size[] = {  /* indexed by gen7_scratch_rw.block_size (2-bit field) */
+  "1 register",   /* 0: GEN_SCRATCH_BLOCK_SIZE_1 */
+  "2 registers",  /* 1: GEN_SCRATCH_BLOCK_SIZE_2 */
+  "Reserve",      /* 2: no GEN_SCRATCH_BLOCK_SIZE_* value is defined for this encoding */
+  "4 registers",  /* 3: GEN_SCRATCH_BLOCK_SIZE_4 */
+};
+
+static const char *data_port_scratch_invalidate[] = {  /* indexed by gen7_scratch_rw.invalidate_after_read (1-bit field) */
+  "no invalidate",
+  "invalidate cache line",
+};
+
+static const char *data_port_scratch_channel_mode[] = {  /* indexed by gen7_scratch_rw.channel_mode (1-bit field) */
+  "Oword",  /* 0: GEN_SCRATCH_CHANNEL_MODE_OWORD */
+  "Dword",  /* 1: GEN_SCRATCH_CHANNEL_MODE_DWORD */
+};
+
+static const char *data_port_scratch_msg_type[] = {  /* indexed by gen7_scratch_rw.msg_type (1-bit field) */
+  "Scratch Read",   /* 0: GEN_SCRATCH_READ */
+  "Scratch Write",  /* 1: GEN_SCRATCH_WRITE */
+};
+
 static const char *data_port_data_cache_msg_type[] = {
   [0] = "OWord Block Read",
   [1] = "Unaligned OWord Block Read",
@@ -1155,12 +1177,21 @@ int gen_disasm (FILE *file, const void *opaque_insn)
                 inst->bits3.sampler_gen7.simd_mode);
         break;
       case GEN_SFID_DATAPORT_DATA_CACHE:
-        format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
-                inst->bits3.gen7_untyped_rw.bti,
-                inst->bits3.gen7_untyped_rw.rgba,
-                data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
-                data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
-                data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+        if(inst->bits3.gen7_untyped_rw.category == 0) {
+          format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
+                  inst->bits3.gen7_untyped_rw.bti,
+                  inst->bits3.gen7_untyped_rw.rgba,
+                  data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
+                  data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
+                  data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+        } else {
+          format (file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
+                  inst->bits3.gen7_scratch_rw.offset,
+                  data_port_scratch_block_size[inst->bits3.gen7_scratch_rw.block_size],
+                  data_port_scratch_invalidate[inst->bits3.gen7_scratch_rw.invalidate_after_read],
+                  data_port_scratch_channel_mode[inst->bits3.gen7_scratch_rw.channel_mode],
+                  data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]);
+        }
         break;
       case GEN_SFID_MESSAGE_GATEWAY:
         format (file, " (subfunc: %s, notify: %d, ackreq: %d)",
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 0c29beb..29fa1c3 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -620,6 +620,37 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {  // Emit a header MOV followed by a scratch-write SEND.
+    p->push();
+    uint32_t simdWidth = p->curr.execWidth;  // remember the caller's execution width before forcing it to 8
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    // Fill the message header by copying ud8grf(0,0) (presumably r0 - TODO confirm) with an 8-wide, unpredicated, no-mask MOV.
+    p->curr.execWidth = 8;
+    p->MOV(header, GenRegister::ud8grf(0,0));
+    p->pop();
+    // size = element size of reg_type times the saved width; the SCRATCH_WRITE added by this patch accepts it but does not read it.
+    int size = typeSize(reg_type)*simdWidth;
+    p->push();
+    p->SCRATCH_WRITE(header, offset/32, size, reg_num, channel_mode);  // offset/32: presumably bytes -> 32-byte register units (TODO confirm)
+    p->pop();
+  }
+
+  void GenContext::scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {  // Emit a header MOV followed by a scratch-read SEND into dst.
+    p->push();
+    uint32_t simdWidth = p->curr.execWidth;  // remember the caller's execution width before forcing it to 8
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.execWidth = 8;
+    p->MOV(header, GenRegister::ud8grf(0,0));  // fill the header from ud8grf(0,0) (presumably r0 - TODO confirm)
+    p->pop();
+    // size = element size of reg_type times the saved width; the SCRATCH_READ added by this patch accepts it but does not read it.
+    int size = typeSize(reg_type)*simdWidth;
+    p->push();
+    p->SCRATCH_READ(dst, header, offset/32, size, reg_num, channel_mode);  // offset/32: presumably bytes -> 32-byte register units (TODO confirm)
+    p->pop();
+  }
+
   void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
     const GenRegister ucoord = ra->genReg(insn.src(insn.extra.elem));
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index dc5dc45..bcf0dc4 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -41,6 +41,7 @@ namespace gbe
   class Selection;            // Performs instruction selection
   class SelectionInstruction; // Pre-RA Gen instruction
   class SelectionReg;         // Pre-RA Gen register
+  class GenRegister;
 
   /*! Context is the helper structure to build the Gen ISA or simulation code
    *  from GenIR
@@ -108,7 +109,8 @@ namespace gbe
     void emitSampleInstruction(const SelectionInstruction &insn);
     void emitTypedWriteInstruction(const SelectionInstruction &insn);
     void emitGetImageInfoInstruction(const SelectionInstruction &insn);
-
+    void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);  //!< Emit header setup plus a scratch write of reg_num registers at offset
+    void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);  //!< Emit header setup plus a scratch read of reg_num registers from offset into dst
     /*! Implements base class */
     virtual Kernel *allocateKernel(void);
     /*! Store the position of each label instruction in the Gen ISA stream */
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 5b15e30..e3959ff 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -319,6 +319,15 @@ enum GenMessageTarget {
 #define GEN_BYTE_SCATTER          12//1100: Byte Scattered Write
 #define GEN_UNTYPED_WRITE         13//1101: Untyped Surface Write
 
+/* Data port data cache scratch messages: values for the gen7_scratch_rw descriptor fields */
+#define GEN_SCRATCH_READ                  0 // msg_type: scratch read
+#define GEN_SCRATCH_WRITE                 1 // msg_type: scratch write
+#define GEN_SCRATCH_CHANNEL_MODE_OWORD    0 // channel_mode: OWord
+#define GEN_SCRATCH_CHANNEL_MODE_DWORD    1 // channel_mode: DWord
+#define GEN_SCRATCH_BLOCK_SIZE_1          0 // block_size: 1 register
+#define GEN_SCRATCH_BLOCK_SIZE_2          1 // block_size: 2 registers
+#define GEN_SCRATCH_BLOCK_SIZE_4          3 // block_size: 4 registers (encoding 2 has no define)
+
 /* Data port render cache Message Type*/
 #define GEN_MBLOCK_READ           4  //0100: Media Block Read
 #define GEN_TYPED_READ            5  //0101: Typed Surface Read
@@ -765,6 +774,22 @@ struct GenInstruction
       uint32_t end_of_thread:1;
     } gen7_byte_rw;
 
+    /*! Data port scratch read / write descriptor; widths sum to 32 bits (bit positions below assume LSB-first packing - TODO confirm) */
+    struct {
+      uint32_t offset:12;                //!< bits 0-11: scratch offset (callers in this patch pass offset/32)
+      uint32_t block_size:2;             //!< bits 12-13: one of GEN_SCRATCH_BLOCK_SIZE_*
+      uint32_t ignored0:1;               //!< bit 14: not written by the encoder in this patch
+      uint32_t invalidate_after_read:1;  //!< bit 15: printed by the disassembler as no invalidate / invalidate cache line
+      uint32_t channel_mode:1;           //!< bit 16: one of GEN_SCRATCH_CHANNEL_MODE_*
+      uint32_t msg_type:1;               //!< bit 17: GEN_SCRATCH_READ or GEN_SCRATCH_WRITE
+      uint32_t category:1;               //!< bit 18: set to 1 for scratch messages; the disassembler treats 0 as untyped r/w
+      uint32_t header_present:1;         //!< bit 19
+      uint32_t response_length:5;        //!< bits 20-24
+      uint32_t msg_length:4;             //!< bits 25-28
+      uint32_t pad2:2;                   //!< bits 29-30
+      uint32_t end_of_thread:1;          //!< bit 31
+    } gen7_scratch_rw;
+
     /*! Data port OBlock read / write */
     struct {
       uint32_t bti:8;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 3793d8b..b27ea91 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1150,6 +1150,49 @@ namespace gbe
      this->setSrc0(insn, msg);
      setTypedWriteMessage(this, insn, bti, msg_type, msg_length, header_present);
   }
+  static void setScratchMessage(GenEncoder *p,  // Fill the SEND descriptor of insn for a data-cache scratch read or write.
+                                   GenInstruction *insn,
+                                   uint32_t offset,          // stored in the 12-bit gen7_scratch_rw.offset field
+                                   uint32_t block_size,      // one of GEN_SCRATCH_BLOCK_SIZE_*
+                                   uint32_t channel_mode,    // one of GEN_SCRATCH_CHANNEL_MODE_*
+                                   uint32_t msg_type,        // GEN_SCRATCH_READ or GEN_SCRATCH_WRITE
+                                   uint32_t msg_length,      // forwarded to setMessageDescriptor
+                                   uint32_t response_length) // forwarded to setMessageDescriptor
+  {
+     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+     setMessageDescriptor(p, insn, sfid, msg_length, response_length, true);  // trailing true: presumably header_present - TODO confirm
+     insn->bits3.gen7_scratch_rw.block_size = block_size;
+     insn->bits3.gen7_scratch_rw.msg_type = msg_type;
+     insn->bits3.gen7_scratch_rw.channel_mode = channel_mode;
+     insn->bits3.gen7_scratch_rw.offset = offset;  // values wider than 12 bits are truncated by the bitfield
+     insn->bits3.gen7_scratch_rw.category = 1;     // non-zero category is what the disassembler decodes as a scratch message
+  }
+
+  void GenEncoder::SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode)  // Encode a scratch-write SEND; size is accepted but not used here.
+  {
+     assert(src_num == 1 || src_num ==2);  // only the 1- and 2-register block sizes are supported
+     uint32_t block_size = src_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
+     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+     this->setHeader(insn);
+     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));  // a write returns nothing: null destination
+     this->setSrc0(insn, msg);
+     this->setSrc1(insn, GenRegister::immud(0));
+     // src_num is the number of 32-byte registers written out; msg_length is src_num+1 (presumably payload plus the header register)
+     setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_WRITE, src_num+1, 0);
+  }
+
+  void GenEncoder::SCRATCH_READ(GenRegister dst, GenRegister src, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode)  // Encode a scratch-read SEND into dst; size is accepted but not used here.
+  {
+     assert(dst_num == 1 || dst_num ==2);  // only the 1- and 2-register block sizes are supported
+     uint32_t block_size = dst_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
+     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+     this->setHeader(insn);
+     this->setDst(insn, dst);
+     this->setSrc0(insn, src);  // src is the message register (callers in this patch pass the header)
+     this->setSrc1(insn, GenRegister::immud(0));
+     // dst_num is the number of 32-byte registers written back; msg_length is 1 (presumably the header register only)
+     setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
+  }
 
   void GenEncoder::EOT(uint32_t msg) {
     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 54674d3..bbf240c 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -156,6 +156,10 @@ namespace gbe
     void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
     void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
+    /*! Scratch memory read: loads dst_num (1 or 2) registers from scratch offset into dst; src is the message register. size is currently unused */
+    void SCRATCH_READ(GenRegister dst, GenRegister src, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode);
+    /*! Scratch memory write: stores src_num (1 or 2) registers to scratch offset; msg is the message register. size is currently unused */
+    void SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode);
     /*! Send instruction for the sampler */
     void SAMPLE(GenRegister dest,
                 GenRegister msg,
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 26c22f3..35d3a7c 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -374,6 +374,12 @@ namespace gbe {
     return kernel->getStackSize();
   }
 
+  static int32_t kernelGetScratchSize(gbe_kernel genKernel) {  // C-callback wrapper: scratch size of the kernel, or 0 for a NULL handle.
+    if (genKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;  // the opaque handle is a gbe::Kernel pointer
+    return kernel->getScratchSize();  // getScratchSize() returns uint32_t; it is narrowed to int32_t here
+  }
+
   static int32_t kernelUseSLM(gbe_kernel genKernel) {
     if (genKernel == NULL) return 0;
     const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
@@ -443,6 +449,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL
 GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
@@ -476,6 +483,7 @@ namespace gbe
       gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
       gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
       gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
+      gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
       gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
       gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
       gbe_kernel_use_slm = gbe::kernelUseSLM;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index f36bfbf..d20e7af 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -198,6 +198,10 @@ extern gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size;
 typedef int32_t (gbe_kernel_get_stack_size_cb)(gbe_kernel);
 extern gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size;
 
+/*! Get the scratch size (zero if no scratch is required) */
+typedef int32_t (gbe_kernel_get_scratch_size_cb)(gbe_kernel);
+extern gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size;
+
 /*! Get the curbe offset where to put the data. Returns -1 if not required */
 typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum gbe_curbe_type type, uint32_t sub_type);
 extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 2d67310..83aaab8 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -96,6 +96,8 @@ namespace gbe {
     INLINE uint32_t getCurbeSize(void) const { return this->curbeSize; }
     /*! Return the size of the stack (zero if none) */
     INLINE uint32_t getStackSize(void) const { return this->stackSize; }
+    /*! Return the size of the scratch memory needed (zero if none) */
+    INLINE uint32_t getScratchSize(void) const { return this->scratchSize; }
     /*! Get the SIMD width for the kernel */
     INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; }
     /*! Says if SLM is needed for it */
@@ -135,6 +137,7 @@ namespace gbe {
     uint32_t curbeSize;        //!< Size of the data to push
     uint32_t simdWidth;        //!< SIMD size for the kernel (lane number)
     uint32_t stackSize;        //!< Stack size (may be 0 if unused)
+    uint32_t scratchSize;      //!< Scratch memory size (may be 0 if unused)
     bool useSLM;               //!< SLM requires a special HW config
     Context *ctx;              //!< Save context after compiler to alloc constant buffer curbe
     ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 8933213..e58433f 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -183,6 +183,14 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
 }
 
+static void
+cl_setup_scratch(cl_gpgpu gpgpu, cl_kernel ker)  /* Query the kernel's scratch size and hand it to the GPGPU driver. */
+{
+  int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque);
+  /* Forwarded unconditionally, even when 0; the int32_t converts to the callback's uint32_t per_thread_size. */
+  cl_gpgpu_set_scratch(gpgpu, scratch_sz);
+}
+
 LOCAL cl_int
 cl_command_queue_ND_range_gen7(cl_command_queue queue,
                                cl_kernel ker,
@@ -231,6 +239,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind all samplers */
   cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);
 
+  cl_setup_scratch(gpgpu, ker);
   /* Bind a stack if needed */
   cl_bind_stack(gpgpu, ker);
   cl_gpgpu_states_setup(gpgpu, &kernel);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 212beb3..673985d 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -135,6 +135,10 @@ extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
 typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
 extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
 
+/* Set the per-thread scratch size; the Intel implementation in this patch also (re)allocates the scratch buffer */
+typedef void (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size);
+extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch;
+
 /* Configure internal state */
 typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry);
 extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 4952288..9aa926e 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -50,6 +50,7 @@ LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
 LOCAL cl_gpgpu_sync_cb *cl_gpgpu_sync = NULL;
 LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
 LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
+LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
 LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
 LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
 LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 2791fbe..e553a55 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -89,7 +89,9 @@ struct intel_gpgpu
   struct { drm_intel_bo *bo; } curbe_b;
   struct { drm_intel_bo *bo; } sampler_state_b;
   struct { drm_intel_bo *bo; } perf_b;
+  struct { drm_intel_bo *bo; } scratch_b;
 
+  uint32_t per_thread_scratch;
   struct {
     uint32_t num_cs_entries;
     uint32_t size_cs_entry;  /* size of one entry in 512bit elements */
@@ -127,6 +129,9 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
     drm_intel_bo_unreference(gpgpu->perf_b.bo);
   if (gpgpu->stack_b.bo)
     drm_intel_bo_unreference(gpgpu->stack_b.bo);
+  if (gpgpu->scratch_b.bo)
+    drm_intel_bo_unreference(gpgpu->scratch_b.bo);
+
   intel_batchbuffer_delete(gpgpu->batch);
   cl_free(gpgpu);
 }
@@ -199,18 +204,23 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
   BEGIN_BATCH(gpgpu->batch, 8);
   OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
 
-  gen6_vfe_state_inline_t* vfe = (gen6_vfe_state_inline_t*)
-    intel_batchbuffer_alloc_space(gpgpu->batch,0);
-
-  memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
-  vfe->vfe1.gpgpu_mode = 1;
-  vfe->vfe1.bypass_gateway_ctl = 1;
-  vfe->vfe1.reset_gateway_timer = 1;
-  vfe->vfe1.max_threads = gpgpu->max_threads - 1;
-  vfe->vfe1.urb_entries = 64;
-  vfe->vfe3.curbe_size = 480;
-  vfe->vfe4.scoreboard_mask = 0;
-  intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_vfe_state_inline_t));
+  if(gpgpu->per_thread_scratch > 0) {
+    OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
+              I915_GEM_DOMAIN_RENDER,
+              I915_GEM_DOMAIN_RENDER,
+              gpgpu->per_thread_scratch/1024 - 1);
+  }
+  else {
+    OUT_BATCH(gpgpu->batch, 0);
+  }
+  /* max_thread | urb entries | (reset_gateway|bypass_gate_way | gpgpu_mode) */
+  OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (64 << 8) | 0xc4);
+  OUT_BATCH(gpgpu->batch, 0);
+  /* curbe_size */
+  OUT_BATCH(gpgpu->batch, 480);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
   ADVANCE_BATCH(gpgpu->batch);
 }
 
@@ -537,6 +547,23 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset, u
 }
 
 static void
+intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)  /* Record the per-thread scratch size and make sure the scratch buffer is big enough. */
+{
+  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+  drm_intel_bo* old = gpgpu->scratch_b.bo;
+  uint32_t total = per_thread_size * gpgpu->max_threads;  /* NOTE(review): 32-bit product, may wrap for large inputs */
+  /* per_thread_scratch is read later by intel_gpgpu_load_vfe_state when emitting the VFE state. */
+  gpgpu->per_thread_scratch = per_thread_size;
+  /* Drop an existing buffer only when it is too small; a large-enough buffer is kept and reused. */
+  if(old && old->size < total) {
+    drm_intel_bo_unreference(old);
+    old = NULL;
+  }
+  /* NOTE(review): also reached with total == 0 when no buffer exists yet; the allocation result is not checked. */
+  if(!old)
+    gpgpu->scratch_b.bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", total, 4096);
+}
+static void
 intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint32_t cchint)
 {
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
@@ -823,5 +850,6 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
   cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
   cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
+  cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
 }
 
-- 
1.7.9.5