[Beignet] [PATCH v2 6/7] GBE: concentrate all samplers' allocation at compile time.

Zhigang Gong zhigang.gong at linux.intel.com
Sun May 12 20:32:23 PDT 2013


This is the first step towards doing image/sampler allocation fully
at compile time, so that all sampler ids and image bti indices can be
determined at compile time. This makes the following things
easier or faster:

1. Once both images and samplers are done, we can treat all image bti and sampler
   ids as constants and get their values when we encode the Sampler and TypedWrite
   instructions. Then we no longer need to compute the message header at runtime,
   which costs 3 instructions per call.

2. Getting image width/height/depth. As we know the surface bti at compile time,
   we can put that data at a specified curbe entry and generate the correct indirect
   register access to get that information at compile time.

This patch is the first step and only finishes the sampler part. Now all
samplers, including those defined in kernel arguments, are allocated
at compile time. At runtime, we just need to fill the sampler value
into the proper slot, which maps to the specified input argument. Then the
driver will create and bind the sampler to the correct slot.

Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
---
 backend/src/ir/function.hpp           | 17 +++++++++--
 backend/src/ir/sampler.cpp            | 53 ++++++++++++++++++++++++++++-------
 backend/src/ir/sampler.hpp            | 25 ++++++++++++-----
 backend/src/llvm/llvm_gen_backend.cpp |  7 +----
 backend/src/ocl_common_defines.h      |  9 ++++--
 src/cl_command_queue.c                |  2 --
 src/cl_command_queue_gen7.c           |  2 +-
 src/cl_kernel.c                       | 14 ++++-----
 src/cl_kernel.h                       |  5 ++--
 src/cl_sampler.c                      | 22 +++++++++------
 src/cl_sampler.h                      |  4 +--
 src/intel/intel_gpgpu.c               |  2 +-
 12 files changed, 106 insertions(+), 56 deletions(-)

diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index ae49eba..1c02678 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -197,6 +197,18 @@ namespace ir {
       GBE_ASSERT(args[ID] != NULL);
       return *args[ID];
     }
+
+    /*! Get arg ID. */
+    INLINE int32_t getArgID(FunctionArgument *requestArg) {
+      for (uint32_t ID = 0; ID < args.size(); ID++)
+      {
+        if ( args[ID] == requestArg )
+          return ID;
+      }
+      GBE_ASSERTM(0, "Failed to get a valid argument ID.");
+      return -1;
+    }
+
     /*! Get the number of pushed registers */
     INLINE uint32_t pushedNum(void) const { return pushMap.size(); }
     /*! Get the pushed data location for the given register */
@@ -289,8 +301,7 @@ namespace ir {
     /*! Change the SLM config for the function */
     INLINE bool setUseSLM(bool useSLM) { return this->useSLM = useSLM; }
     /*! Get sampler set in this function */
-    SamplerSet* getSamplerSet(void) {return samplerSet; }
-    //const SamplerSet& getSamplerSet(void) const {return samplerSet; }
+    SamplerSet* getSamplerSet(void) const {return samplerSet; }
   private:
     friend class Context;           //!< Can freely modify a function
     std::string name;               //!< Function name
@@ -306,7 +317,7 @@ namespace ir {
     LocationMap locationMap;        //!< Pushed function arguments (loc->reg)
     uint32_t simdWidth;             //!< 8 or 16 if forced, 0 otherwise
     bool useSLM;                    //!< Is SLM required?
-    SamplerSet *samplerSet;
+    SamplerSet *samplerSet;          //!< samplers used in this function.
     GBE_CLASS(Function);            //!< Use custom allocator
   };
 
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
index d7a8463..f030c6f 100644
--- a/backend/src/ir/sampler.cpp
+++ b/backend/src/ir/sampler.cpp
@@ -22,25 +22,58 @@
  */
 #include "sampler.hpp"
 #include "context.hpp"
+#include "ocl_common_defines.h"
 
 namespace gbe {
 namespace ir {
 
-  Register SamplerSet::append(uint32_t samplerValue, Context *ctx)
+  const uint32_t SamplerSet::getIdx(const Register reg) const
   {
-    int i = 0;
+    auto it = regMap.find(reg);
+    GBE_ASSERT(it != regMap.end());
+    return it->second.slot;
+  }
 
-    for(auto it = regMap.begin();
-        it != regMap.end(); ++it, ++i)
-    {
-      if (it->first == samplerValue)
-        return it->second;
-    }
+  void SamplerSet::appendReg(const Register reg, uint32_t key, Context *ctx) {
+    struct SamplerRegSlot samplerSlot;
+    // This register is just used as a key.
+    samplerSlot.reg = reg;
+    samplerSlot.slot = samplerMap.size();
+    samplerMap.insert(std::make_pair(key, samplerSlot));
+    regMap.insert(std::make_pair(samplerSlot.reg, samplerSlot));
+    ctx->LOADI(ir::TYPE_S32, samplerSlot.reg, ctx->newIntegerImmediate(samplerSlot.slot, ir::TYPE_S32));
+  }
+
+  Register SamplerSet::append(uint32_t samplerValue, Context *ctx)
+  {
+    auto it = samplerMap.find(samplerValue);
+    if (it != samplerMap.end())
+        return it->second.reg;
     Register reg = ctx->reg(FAMILY_DWORD);
-    ctx->LOADI(ir::TYPE_S32, reg, ctx->newIntegerImmediate(i, ir::TYPE_S32));
-    regMap.insert(std::make_pair(samplerValue, reg));
+    appendReg(reg, samplerValue, ctx);
     return reg;
   }
 
+#define SAMPLER_ID(id) ((id << __CLK_SAMPLER_ARG_BASE) | __CLK_SAMPLER_ARG_KEY_BIT)
+  void SamplerSet::append(Register samplerReg, Context *ctx)
+  {
+    ir::FunctionArgument *arg =  ctx->getFunction().getArg(samplerReg);
+    GBE_ASSERT(arg != NULL);
+
+    // XXX As LLVM 3.2/3.1 doesn't have a new data type for the sampler_t, we have to fix up the argument
+    // type here. Once we switch to the LLVM and use the new data type sampler_t, we can remove this
+    // work around.
+    arg->type = ir::FunctionArgument::SAMPLER;
+    int32_t id = ctx->getFunction().getArgID(arg);
+    GBE_ASSERT(id < (1 << __CLK_SAMPLER_ARG_BITS));
+
+    auto it = samplerMap.find(SAMPLER_ID(id));
+    if (it != samplerMap.end()) {
+      GBE_ASSERT(it->second.reg == samplerReg);
+      return;
+    }
+    appendReg(samplerReg, SAMPLER_ID(id), ctx);
+  }
+
 } /* namespace ir */
 } /* namespace gbe */
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
index 75c4753..f968299 100644
--- a/backend/src/ir/sampler.hpp
+++ b/backend/src/ir/sampler.hpp
@@ -27,7 +27,6 @@
 #include "ir/register.hpp"
 #include "sys/map.hpp"
 
-
 namespace gbe {
 namespace ir {
 
@@ -37,6 +36,11 @@ namespace ir {
    */
   class Context;
 
+  struct SamplerRegSlot {
+    Register reg;
+    uint32_t slot;
+  };
+
   class SamplerSet
   {
   public:
@@ -44,21 +48,28 @@ namespace ir {
      *  If the speficied sampler is exist, only return the previous offset and
      *  don't append it again. Return -1, if failed.*/
     Register append(uint32_t clkSamplerValue, Context *ctx);
-    size_t getDataSize(void) { return regMap.size(); }
-    size_t getDataSize(void) const { return regMap.size(); }
+    /*! Append a sampler defined in kernel args. */
+    void append(Register samplerArg, Context *ctx);
+    /*! Get the sampler idx (actual location) */
+    const uint32_t getIdx(const Register reg) const;
+    size_t getDataSize(void) { return samplerMap.size(); }
+    size_t getDataSize(void) const { return samplerMap.size(); }
     void getData(uint32_t *samplers) const {
-      for ( auto &it : regMap)
-        *samplers++ = it.first;
+      for(auto &it : samplerMap)
+        samplers[it.second.slot] = it.first;
     }
 
     void operator = (const SamplerSet& other) {
       regMap.insert(other.regMap.begin(), other.regMap.end());
+      samplerMap.insert(other.samplerMap.begin(), other.samplerMap.end());
     }
 
-    SamplerSet(const SamplerSet& other) : regMap(other.regMap.begin(), other.regMap.end()) { }
+    SamplerSet(const SamplerSet& other) : samplerMap(other.samplerMap.begin(), other.samplerMap.end()) { }
     SamplerSet() {}
   private:
-    map<uint32_t, Register> regMap;
+    void appendReg(const Register reg, uint32_t key, Context *ctx);
+    map<uint32_t, SamplerRegSlot> samplerMap;
+    map<Register, SamplerRegSlot> regMap;
     GBE_CLASS(SamplerSet);
   };
 } /* namespace ir */
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index e7ddcf0..c17a40b 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1998,13 +1998,8 @@ namespace gbe
                GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
                sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
             } else {
-              // XXX As LLVM 3.2/3.1 doesn't have a new data type for the sampler_t, we have to fix up the argument
-              // type here. Once we switch to the LLVM and use the new data type sampler_t, we can remove this
-              // work around.
               sampler = this->getRegister(*AI);
-              ir::FunctionArgument *arg =  ctx.getFunction().getArg(sampler);
-              GBE_ASSERT(arg != NULL);
-              arg->type = ir::FunctionArgument::SAMPLER;
+              ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
             }
             ++AI;
 
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
index d4b1b6c..1ea150b 100644
--- a/backend/src/ocl_common_defines.h
+++ b/backend/src/ocl_common_defines.h
@@ -111,9 +111,12 @@ typedef enum clk_sampler_type {
     __CLK_SAMPLER_MASK             = __CLK_MIP_MASK | __CLK_FILTER_MASK |
                                      __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
 
-    __CLK_ANISOTROPIC_RATIO_BITS   = 5,
-    __CLK_ANISOTROPIC_RATIO_MASK   = (int) 0x80000000 >>
-                                      (__CLK_ANISOTROPIC_RATIO_BITS-1)
+    __CLK_SAMPLER_ARG_BASE         = __CLK_MIP_BASE + __CLK_SAMPLER_BITS,
+    __CLK_SAMPLER_ARG_BITS         = 8,
+    __CLK_SAMPLER_ARG_MASK         = ((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE,
+    __CLK_SAMPLER_ARG_KEY_BIT      = (1 << (__CLK_SAMPLER_ARG_BASE + __CLK_SAMPLER_ARG_BITS)),
+    __CLK_SAMPLER_ARG_KEY_BITS     = 1,
+
 } clk_sampler_type;
 
 // Memory synchronization
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 7e720cf..4d19fc8 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -377,8 +377,6 @@ cl_command_queue_ND_range(cl_command_queue queue,
   else
     FATAL ("Unknown Gen Device");
 
-  k->arg_sampler_sz = 0;
-
 #if USE_FULSIM
   if (run_it != NULL && strcmp(run_it, "1") == 0) {
     TRY (cl_fulsim_dump_all_surfaces, queue, k);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index bc648a5..c93241c 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -225,7 +225,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind user buffers */
   cl_command_queue_bind_surface(queue, ker);
   /* Bind all samplers */
-  cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->arg_sampler_sz + ker->sampler_sz);
+  cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);
 
   /* Bind a stack if needed */
   cl_bind_stack(gpgpu, ker);
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index d2231ea..80215b3 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -136,15 +136,11 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     memcpy(&sampler, value, sz);
     if (UNLIKELY(sampler->magic != CL_MAGIC_SAMPLER_HEADER))
       return CL_INVALID_KERNEL_ARGS;
-    uint32_t slot;
     k->args[index].local_sz = 0;
     k->args[index].is_set = 1;
     k->args[index].mem = NULL;
     k->args[index].sampler = sampler;
-    slot = cl_arg_sampler_insert(k, sampler);
-    offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-    assert(offset + sz <= k->curbe_sz);
-    memcpy(k->curbe + offset, &slot, sizeof(slot));
+    cl_set_sampler_arg_slot(k, index, sampler);
     return CL_SUCCESS;
   }
 
@@ -209,9 +205,9 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
 
   /* Get sampler data & size */
   k->sampler_sz = gbe_kernel_get_sampler_size(k->opaque);
-  k->arg_sampler_sz = 0;
   assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
-  gbe_kernel_get_sampler_data(k->opaque, k->samplers);
+  if (k->sampler_sz > 0)
+    gbe_kernel_get_sampler_data(k->opaque, k->samplers);
 }
 
 LOCAL cl_kernel
@@ -231,8 +227,8 @@ cl_kernel_dup(cl_kernel from)
   to->arg_n = from->arg_n;
   to->curbe_sz = from->curbe_sz;
   to->sampler_sz = from->sampler_sz;
-  to->arg_sampler_sz = from->arg_sampler_sz;
-  memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
+  if (to->sampler_sz)
+    memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
   TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
   if (to->curbe_sz) TRY_ALLOC_NO_ERR(to->curbe, cl_calloc(1, to->curbe_sz));
 
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 01810ba..d569531 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -52,10 +52,9 @@ struct _cl_kernel {
   gbe_kernel opaque;          /* (Opaque) compiler structure for the OCL kernel */
   char *curbe;                /* One curbe per kernel */
   size_t curbe_sz;            /* Size of it */
-  uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel */
-  size_t sampler_sz;          /* sampler size defined in kernel */
+  uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel & kernel args */
+  size_t sampler_sz;          /* sampler size defined in kernel & kernel args. */
   cl_argument *args;          /* To track argument setting */
-  size_t arg_sampler_sz;      /* sampler size defined in kernel args */
   uint32_t arg_n:31;          /* Number of arguments */
   uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
 };
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
index b8711ae..7e0b7b0 100644
--- a/src/cl_sampler.c
+++ b/src/cl_sampler.c
@@ -52,18 +52,22 @@ uint32_t cl_to_clk(cl_bool normalized_coords,
          | (clk_filter << __CLK_FILTER_BASE);
 }
 
-int cl_arg_sampler_insert(cl_kernel k, cl_sampler sampler)
+#define IS_SAMPLER_ARG(v) (v & __CLK_SAMPLER_ARG_KEY_BIT)
+#define SAMPLER_ARG_ID(v) ((v & __CLK_SAMPLER_ARG_MASK) >> __CLK_SAMPLER_ARG_BASE)
+int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler)
 {
-  int i, slot_id;
-  for(i = 0; i < k->sampler_sz; i++)
+  int slot_id;
+  for(slot_id = 0; slot_id < k->sampler_sz; slot_id++)
   {
-    if (k->samplers[i] == sampler->clkSamplerValue)
-      return i;
+    if (IS_SAMPLER_ARG(k->samplers[slot_id])) {
+     if (SAMPLER_ARG_ID(k->samplers[slot_id]) == index) {
+       k->samplers[slot_id] = (k->samplers[slot_id] & (~__CLK_SAMPLER_MASK))
+                              | sampler->clkSamplerValue;
+       return slot_id;
+     }
+    }
   }
-  slot_id = k->sampler_sz + k->arg_sampler_sz;
-  k->samplers[slot_id] = sampler->clkSamplerValue;
-  k->arg_sampler_sz++;
-  return slot_id;
+  assert(0);
 }
 
 LOCAL cl_sampler
diff --git a/src/cl_sampler.h b/src/cl_sampler.h
index d5042e5..4785928 100644
--- a/src/cl_sampler.h
+++ b/src/cl_sampler.h
@@ -50,8 +50,8 @@ extern void cl_sampler_delete(cl_sampler);
 /* Add one more reference to this object */
 extern void cl_sampler_add_ref(cl_sampler);
 
-/* insert a new argument sampler */
-int cl_arg_sampler_insert(cl_kernel k, cl_sampler sampler);
+/* set a sampler kernel argument */
+int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler);
 
 #endif /* __CL_SAMPLER_H__ */
 
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 4341d09..2f34ce0 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -734,7 +734,7 @@ intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sample
   int index;
   assert(sampler_sz <= GEN_MAX_SAMPLERS);
   for(index = 0; index < sampler_sz; index++)
-    intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]);
+    intel_gpgpu_insert_sampler(gpgpu, index, samplers[index] & __CLK_SAMPLER_MASK);
 }
 
 static void
-- 
1.7.11.7



More information about the Beignet mailing list