[Beignet] [PATCH 2/3] CL: Support kernel side defined samplers.

Zhigang Gong zhigang.gong at linux.intel.com
Mon May 6 23:45:59 PDT 2013


This patch changes the way samplers are handled. All samplers defined on
the kernel side, together with those passed as kernel arguments, are now
gathered into one sampler array instead of being allocated one at a time.

Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
---
 src/cl_command_queue.c      |  4 ++--
 src/cl_command_queue_gen7.c |  2 ++
 src/cl_driver.h             |  6 +++---
 src/cl_driver_defs.c        |  2 +-
 src/cl_kernel.c             | 36 +++++++++++++++++++++++------------
 src/cl_kernel.h             |  3 +++
 src/cl_sampler.c            | 43 ++++++++++++++++++++++++++++++++++++++++++
 src/cl_sampler.h            |  5 +++++
 src/intel/intel_gpgpu.c     | 46 +++++++++++++++++++++------------------------
 9 files changed, 104 insertions(+), 43 deletions(-)

diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 7d604c3..7e720cf 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -119,8 +119,6 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
                           k->args[i].mem->w, k->args[i].mem->h,
                           k->args[i].mem->pitch, k->args[i].mem->tiling);
     } else if (arg_type == GBE_ARG_SAMPLER) {
-      uint32_t *curbe_index = (uint32_t*)(k->curbe + offset);
-      cl_gpgpu_insert_sampler(queue->gpgpu, curbe_index, k->args[i].sampler);
     } else
       cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, cc_llc_l3);
   }
@@ -379,6 +377,8 @@ cl_command_queue_ND_range(cl_command_queue queue,
   else
     FATAL ("Unknown Gen Device");
 
+  k->arg_sampler_sz = 0;
+
 #if USE_FULSIM
   if (run_it != NULL && strcmp(run_it, "1") == 0) {
     TRY (cl_fulsim_dump_all_surfaces, queue, k);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 108684f..bc648a5 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -224,6 +224,8 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
 
   /* Bind user buffers */
   cl_command_queue_bind_surface(queue, ker);
+  /* Bind all samplers */
+  cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->arg_sampler_sz + ker->sampler_sz);
 
   /* Bind a stack if needed */
   cl_bind_stack(gpgpu, ker);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index f1e1454..089167a 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -110,9 +110,9 @@ extern cl_gpgpu_delete_cb *cl_gpgpu_delete;
 typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t cchint);
 extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 
-/* Insert a sampler */
-typedef void (cl_gpgpu_insert_sampler_cb)(cl_gpgpu, uint32_t *curbe_index, cl_sampler sampler);
-extern cl_gpgpu_insert_sampler_cb *cl_gpgpu_insert_sampler;
+/* bind samplers defined in both kernel and kernel args. */
+typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
+extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
 
 /* Set a 2d texture */
 typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 5acc6a5..c7dc59b 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -59,5 +59,5 @@ LOCAL cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start = NULL;
 LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
 LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
 LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
-LOCAL cl_gpgpu_insert_sampler_cb *cl_gpgpu_insert_sampler = NULL;
+LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
 
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index ec0e2e8..d2231ea 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -110,6 +110,7 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   if (arg_type == GBE_ARG_VALUE) {
     if (UNLIKELY(value == NULL))
       return CL_INVALID_KERNEL_ARGS;
+
     offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
     assert(offset + sz <= k->curbe_sz);
     memcpy(k->curbe + offset, value, sz);
@@ -129,20 +130,22 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     return CL_SUCCESS;
   }
 
-  /* For a sampler*/
+  /* Is it a sampler*/
   if (arg_type == GBE_ARG_SAMPLER) {
-     cl_sampler sampler;
-     if (UNLIKELY(value == NULL))
+    cl_sampler sampler;
+    memcpy(&sampler, value, sz);
+    if (UNLIKELY(sampler->magic != CL_MAGIC_SAMPLER_HEADER))
       return CL_INVALID_KERNEL_ARGS;
-     sampler = *(cl_sampler*)value;
-
-     if (UNLIKELY(sampler->magic != CL_MAGIC_SAMPLER_HEADER))
-       return CL_INVALID_ARG_VALUE;
-     k->args[index].local_sz = 0;
-     k->args[index].is_set = 1;
-     k->args[index].mem = NULL;
-     k->args[index].sampler = sampler;
-     return CL_SUCCESS;
+    uint32_t slot;
+    k->args[index].local_sz = 0;
+    k->args[index].is_set = 1;
+    k->args[index].mem = NULL;
+    k->args[index].sampler = sampler;
+    slot = cl_arg_sampler_insert(k, sampler);
+    offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+    assert(offset + sz <= k->curbe_sz);
+    memcpy(k->curbe + offset, &slot, sizeof(slot));
+    return CL_SUCCESS;
   }
 
   /* Otherwise, we just need to check that this is a buffer */
@@ -203,6 +206,12 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
 
   /* Create the curbe */
   k->curbe_sz = gbe_kernel_get_curbe_size(k->opaque);
+
+  /* Get sampler data & size */
+  k->sampler_sz = gbe_kernel_get_sampler_size(k->opaque);
+  k->arg_sampler_sz = 0;
+  assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
+  gbe_kernel_get_sampler_data(k->opaque, k->samplers);
 }
 
 LOCAL cl_kernel
@@ -221,6 +230,9 @@ cl_kernel_dup(cl_kernel from)
   to->program = from->program;
   to->arg_n = from->arg_n;
   to->curbe_sz = from->curbe_sz;
+  to->sampler_sz = from->sampler_sz;
+  to->arg_sampler_sz = from->arg_sampler_sz;
+  memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
   TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
   if (to->curbe_sz) TRY_ALLOC_NO_ERR(to->curbe, cl_calloc(1, to->curbe_sz));
 
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index dd98fb3..01810ba 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -52,7 +52,10 @@ struct _cl_kernel {
   gbe_kernel opaque;          /* (Opaque) compiler structure for the OCL kernel */
   char *curbe;                /* One curbe per kernel */
   size_t curbe_sz;            /* Size of it */
+  uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel */
+  size_t sampler_sz;          /* sampler size defined in kernel */
   cl_argument *args;          /* To track argument setting */
+  size_t arg_sampler_sz;      /* sampler size defined in kernel args */
   uint32_t arg_n:31;          /* Number of arguments */
   uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
 };
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
index d3e61da..b8711ae 100644
--- a/src/cl_sampler.c
+++ b/src/cl_sampler.c
@@ -22,9 +22,50 @@
 #include "cl_utils.h"
 #include "cl_alloc.h"
 #include "cl_khr_icd.h"
+#include "cl_kernel.h"
 
 #include <assert.h>
 
+/* Pack host-side sampler properties into one CLK_* sampler word. Unknown enums
+ * assert in debug builds; with NDEBUG they fall back to ADDRESS_NONE / FILTER_NEAREST. */
+uint32_t cl_to_clk(cl_bool normalized_coords,
+                   cl_addressing_mode address,
+                   cl_filter_mode filter)
+{
+  int clk_address = CLK_ADDRESS_NONE;  /* defined value even if assert() is compiled out */
+  int clk_filter = CLK_FILTER_NEAREST; /* defined value even if assert() is compiled out */
+  switch (address) {
+  case CL_ADDRESS_NONE: clk_address = CLK_ADDRESS_NONE; break;
+  case CL_ADDRESS_CLAMP: clk_address = CLK_ADDRESS_CLAMP; break;
+  case CL_ADDRESS_CLAMP_TO_EDGE: clk_address = CLK_ADDRESS_CLAMP_TO_EDGE; break;
+  case CL_ADDRESS_REPEAT: clk_address = CLK_ADDRESS_REPEAT; break;
+  case CL_ADDRESS_MIRRORED_REPEAT: clk_address = CLK_ADDRESS_MIRRORED_REPEAT; break;
+  default: assert(0);
+  }
+  switch (filter) {
+  case CL_FILTER_NEAREST: clk_filter = CLK_FILTER_NEAREST; break;
+  case CL_FILTER_LINEAR: clk_filter = CLK_FILTER_LINEAR; break;
+  default: assert(0);
+  }
+  return (clk_address << __CLK_ADDRESS_BASE)
+         | (normalized_coords << __CLK_NORMALIZED_BASE)
+         | (clk_filter << __CLK_FILTER_BASE);
+}
+
+/* Return the slot in k->samplers holding sampler->clkSamplerValue, appending it if absent. */
+int cl_arg_sampler_insert(cl_kernel k, cl_sampler sampler)
+{
+  size_t i, used = k->sampler_sz + k->arg_sampler_sz;
+  for (i = 0; i < used; i++) { /* reuse a kernel-defined or earlier argument slot */
+    if (k->samplers[i] == sampler->clkSamplerValue)
+      return (int)i;
+  }
+  assert(used < GEN_MAX_SAMPLERS); /* k->samplers holds GEN_MAX_SAMPLERS entries */
+  k->samplers[used] = sampler->clkSamplerValue;
+  k->arg_sampler_sz++;
+  return (int)used;
+}
+
 LOCAL cl_sampler
 cl_sampler_new(cl_context ctx,
                cl_bool normalized_coords,
@@ -54,6 +95,8 @@ cl_sampler_new(cl_context ctx,
   sampler->ctx = ctx;
   cl_context_add_ref(ctx);
 
+  sampler->clkSamplerValue = cl_to_clk(normalized_coords, address, filter);
+
 exit:
   if (errcode_ret)
     *errcode_ret = err;
diff --git a/src/cl_sampler.h b/src/cl_sampler.h
index da9a488..d5042e5 100644
--- a/src/cl_sampler.h
+++ b/src/cl_sampler.h
@@ -21,6 +21,7 @@
 #define __CL_SAMPLER_H__
 
 #include "CL/cl.h"
+#include "../backend/src/ocl_common_defines.h"
 #include <stdint.h>
 
 /* How to access images */
@@ -33,6 +34,7 @@ struct _cl_sampler {
   cl_bool normalized_coords; /* Are coordinates normalized? */
   cl_addressing_mode address;/* CLAMP / REPEAT and so on... */
   cl_filter_mode filter;     /* LINEAR / NEAREST mostly */
+  uint32_t clkSamplerValue;
 };
 
 /* Create a new sampler object */
@@ -48,5 +50,8 @@ extern void cl_sampler_delete(cl_sampler);
 /* Add one more reference to this object */
 extern void cl_sampler_add_ref(cl_sampler);
 
+/* insert a new argument sampler */
+int cl_arg_sampler_insert(cl_kernel k, cl_sampler sampler);
+
 #endif /* __CL_SAMPLER_H__ */
 
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index d90368c..4341d09 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -471,16 +471,6 @@ intel_gpgpu_get_free_img_index(intel_gpgpu_t *gpgpu)
 }
 
 static int
-intel_gpgpu_get_free_sampler_index(intel_gpgpu_t *gpgpu)
-{
-  int slot;
-  assert(~gpgpu->sampler_bitmap != 0);
-  slot = __fls(~gpgpu->sampler_bitmap);
-  gpgpu->sampler_bitmap |= (1 << slot);
-  return slot;
-}
-
-static int
 intel_get_surface_type(cl_mem_object_type type)
 {
   switch (type) {
@@ -662,10 +652,10 @@ intel_gpgpu_upload_samplers(intel_gpgpu_t *gpgpu, const void *data, uint32_t n)
 int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
 {
    switch( cl_address_mode ) {
-   case CL_ADDRESS_NONE:
-   case CL_ADDRESS_REPEAT:
+   case CLK_ADDRESS_NONE:
+   case CLK_ADDRESS_REPEAT:
       return GEN_TEXCOORDMODE_WRAP;
-   case CL_ADDRESS_CLAMP:
+   case CLK_ADDRESS_CLAMP:
       /* GL_CLAMP is the weird mode where coordinates are clamped to
        * [0.0, 1.0], so linear filtering of coordinates outside of
        * [0.0, 1.0] give you half edge texel value and half border
@@ -679,9 +669,9 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
          return GEN_TEXCOORDMODE_CLAMP;
       else
          return GEN_TEXCOORDMODE_CLAMP_BORDER;
-   case CL_ADDRESS_CLAMP_TO_EDGE:
+   case CLK_ADDRESS_CLAMP_TO_EDGE:
       return GEN_TEXCOORDMODE_CLAMP;
-   case CL_ADDRESS_MIRRORED_REPEAT:
+   case CLK_ADDRESS_MIRRORED_REPEAT:
       return GEN_TEXCOORDMODE_MIRROR;
    default:
       return GEN_TEXCOORDMODE_WRAP;
@@ -689,35 +679,33 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
 }
 
 static void
-intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t *curbe_index, cl_sampler cl_sampler)
+intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
 {
-  int index;
   int using_nearest = 0;
   uint32_t wrap_mode;
   gen7_sampler_state_t *sampler;
 
-  index = intel_gpgpu_get_free_sampler_index(gpgpu);
   sampler = (gen7_sampler_state_t *)gpgpu->sampler_state_b.bo->virtual + index;
-  if (!cl_sampler->normalized_coords)
+  if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
     sampler->ss3.non_normalized_coord = 1;
   else
     sampler->ss3.non_normalized_coord = 0;
 
-  switch (cl_sampler->filter) {
-  case CL_FILTER_NEAREST:
+  switch (clk_sampler & __CLK_FILTER_MASK) {
+  case CLK_FILTER_NEAREST:
     sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST;
     sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
     sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST;
     using_nearest = 1;
     break;
-  case CL_FILTER_LINEAR:
+  case CLK_FILTER_LINEAR:
     sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR;
     sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
     sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR;
     break;
   }
 
-  wrap_mode = translate_wrap_mode(cl_sampler->address, using_nearest);
+  wrap_mode = translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest);
   sampler->ss3.r_wrap_mode = wrap_mode;
   sampler->ss3.s_wrap_mode = wrap_mode;
   sampler->ss3.t_wrap_mode = wrap_mode;
@@ -738,7 +726,15 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t *curbe_index, cl_sampl
      sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
                                    GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
                                    GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
-  *curbe_index = index;
+}
+
+static void
+intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
+{
+  uint32_t index; /* unsigned: matches sampler_sz and the callee's slot parameter */
+  assert(sampler_sz <= GEN_MAX_SAMPLERS);
+  for (index = 0; index < sampler_sz; index++) /* slot index <- samplers[index] */
+    intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]);
+}
 
 static void
@@ -815,6 +811,6 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
   cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
   cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
-  cl_gpgpu_insert_sampler = (cl_gpgpu_insert_sampler_cb *) intel_gpgpu_insert_sampler;
+  cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
 }
 
-- 
1.7.11.7



More information about the Beignet mailing list