[Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect performance in some cases

Luo, Xionghu xionghu.luo at intel.com
Mon May 19 00:35:22 PDT 2014


Could you please move the kernel source code to file instead of staying in host code?
You can refer to my pending patch "move enqueue_copy_image kernels outside of runtime code", thanks.

Luo Xionghu
Best Regards

-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Yang, Rong R
Sent: Monday, May 19, 2014 3:14 PM
To: Lv, Meng; beignet at lists.freedesktop.org
Cc: Lv, Meng
Subject: Re: [Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect performance in some cases

The one index indicate to one kernel string, because you add a new kernel for CL_ENQUEUE_COPY_BUFFER_RECT, you should also add a new index for it.

And the file mode change 100644 => 100755, I think it is not necessary.

-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Lv Meng
Sent: Monday, May 05, 2014 10:50 AM
To: beignet at lists.freedesktop.org
Cc: Lv, Meng
Subject: [Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect performance in some cases

Signed-off-by: Lv Meng <meng.lv at intel.com>
---
 src/cl_mem.c | 80 ++++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 59 insertions(+), 21 deletions(-)  mode change 100644 => 100755 src/cl_mem.c

diff --git a/src/cl_mem.c b/src/cl_mem.c old mode 100644 new mode 100755 index 44482f7..92f51d0
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -911,6 +911,17 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
   size_t global_off[] = {0,0,0};
   size_t global_sz[] = {1,1,1};
   size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1};
+
+  // the src and dst mem rect is continuous, the copy is degraded to 
+ buf copy  if((region[0] == dst_row_pitch) && (region[0] ==
+ src_row_pitch)  && (region[1] * src_row_pitch == src_slice_pitch) && (region[1] * dst_row_pitch == dst_slice_pitch)){
+    cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
+    cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
+    cl_int size = region[0]*region[1]*region[2];
+    ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size);
+    return ret;
+  }
+
   if(region[1] == 1) local_sz[1] = 1;
   if(region[2] == 1) local_sz[2] = 1;
   global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; @@ -919,30 +930,57 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
   cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT;
   cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
   cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
-
-  static const char *str_kernel =
-      "kernel void __cl_cpy_buffer_rect ( \n"
-      "       global char* src, global char* dst, \n"
-      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
-      "       unsigned int src_offset, unsigned int dst_offset, \n"
-      "       unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
-      "       unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
-      "  int i = get_global_id(0); \n"
-      "  int j = get_global_id(1); \n"
-      "  int k = get_global_id(2); \n"
-      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
-      "    return; \n"
-      "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
-      "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
-      "  dst[dst_offset] = src[src_offset]; \n"
-      "}";
-
-
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(src_buf->ctx == dst_buf->ctx);
+  if( (src_offset % 4== 0) && (dst_offset % 4== 0) && (src_row_pitch % 4== 0) && (dst_row_pitch % 4== 0)
+  && (src_slice_pitch % 4== 0) && (dst_slice_pitch % 4== 0) && (global_sz[0] % 4 == 0) ){	
+    global_sz[0] /= 4;
+    src_offset /= 4;
+    dst_offset /= 4;
+    src_row_pitch /= 4;
+    dst_row_pitch /= 4;
+    src_slice_pitch /= 4;
+    dst_slice_pitch /= 4;
+    static const char *str_intkernel =
+        "kernel void __cl_cpy_buffer_rect ( \n"
+        "		global int* src, global int* dst, \n"
+        "		unsigned int region0, unsigned int region1, unsigned int region2, \n"
+        "		unsigned int src_offset, unsigned int dst_offset, \n"
+        "		unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
+        "		unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
+        "  int i = get_global_id(0); \n"
+        "  int j = get_global_id(1); \n"
+        "  int k = get_global_id(2); \n"
+        "  region0 >>= 2; \n"
+        "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+        "	 return; \n"
+        "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
+        "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
+        "  dst[dst_offset] = src[src_offset]; \n"
+        "}";
+    /* setup the kernel and run. */
+    ker = cl_context_get_static_kernel(queue->ctx, index, 
+ str_intkernel, NULL);  } else {
+    static const char *str_kernel =
+        "kernel void __cl_cpy_buffer_rect ( \n"
+        "       global char* src, global char* dst, \n"
+        "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
+        "       unsigned int src_offset, unsigned int dst_offset, \n"
+        "       unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
+        "       unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
+        "  int i = get_global_id(0); \n"
+        "  int j = get_global_id(1); \n"
+        "  int k = get_global_id(2); \n"
+        "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+        "    return; \n"
+        "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
+        "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
+        "  dst[dst_offset] = src[src_offset]; \n"
+        "}";
+    /* setup the kernel and run. */
+    ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, 
+ NULL);  }
 
-  /* setup the kernel and run. */
-  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, NULL);
   if (!ker)
     return CL_OUT_OF_RESOURCES;
 
--
1.8.3.2

_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet
_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list