[Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect performance in some cases

Lv, Meng meng.lv at intel.com
Mon May 19 00:44:32 PDT 2014


RONG, 麻烦你先CHECK下 LUO的PATCH,我V2的PATCH会按照他的格式来做
-----Original Message-----
From: Luo, Xionghu 
Sent: Monday, May 19, 2014 3:35 PM
To: Lv, Meng
Cc: beignet at lists.freedesktop.org
Subject: RE: [Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect performance in some cases

Could you please move the kernel source code to file instead of staying in host code?
You can refer to my pending patch "move enqueue_copy_image kernels outside of runtime code", thanks.

Luo Xionghu
Best Regards

-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Yang, Rong R
Sent: Monday, May 19, 2014 3:14 PM
To: Lv, Meng; beignet at lists.freedesktop.org
Cc: Lv, Meng
Subject: Re: [Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect performance in some cases

The one index indicate to one kernel string, because you add a new kernel for CL_ENQUEUE_COPY_BUFFER_RECT, you should also add a new index for it.

And the file mode change 100644 => 100755, I think it is not necessary.

-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Lv Meng
Sent: Monday, May 05, 2014 10:50 AM
To: beignet at lists.freedesktop.org
Cc: Lv, Meng
Subject: [Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect performance in some cases

Signed-off-by: Lv Meng <meng.lv at intel.com>
---
 src/cl_mem.c | 80 ++++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 59 insertions(+), 21 deletions(-)  mode change 100644 => 100755 src/cl_mem.c

diff --git a/src/cl_mem.c b/src/cl_mem.c old mode 100644 new mode 100755 index 44482f7..92f51d0
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -911,6 +911,17 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
   size_t global_off[] = {0,0,0};
   size_t global_sz[] = {1,1,1};
   size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1};
+
+  // the src and dst mem rect is continuous, the copy is degraded to 
+ buf copy  if((region[0] == dst_row_pitch) && (region[0] ==
+ src_row_pitch)  && (region[1] * src_row_pitch == src_slice_pitch) && (region[1] * dst_row_pitch == dst_slice_pitch)){
+    cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
+    cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
+    cl_int size = region[0]*region[1]*region[2];
+    ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size);
+    return ret;
+  }
+
   if(region[1] == 1) local_sz[1] = 1;
   if(region[2] == 1) local_sz[2] = 1;
   global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; @@ -919,30 +930,57 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
   cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT;
   cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
   cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
-
-  static const char *str_kernel =
-      "kernel void __cl_cpy_buffer_rect ( \n"
-      "       global char* src, global char* dst, \n"
-      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
-      "       unsigned int src_offset, unsigned int dst_offset, \n"
-      "       unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
-      "       unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
-      "  int i = get_global_id(0); \n"
-      "  int j = get_global_id(1); \n"
-      "  int k = get_global_id(2); \n"
-      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
-      "    return; \n"
-      "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
-      "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
-      "  dst[dst_offset] = src[src_offset]; \n"
-      "}";
-
-
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(src_buf->ctx == dst_buf->ctx);
+  if( (src_offset % 4== 0) && (dst_offset % 4== 0) && (src_row_pitch % 4== 0) && (dst_row_pitch % 4== 0)
+  && (src_slice_pitch % 4== 0) && (dst_slice_pitch % 4== 0) && (global_sz[0] % 4 == 0) ){	
+    global_sz[0] /= 4;
+    src_offset /= 4;
+    dst_offset /= 4;
+    src_row_pitch /= 4;
+    dst_row_pitch /= 4;
+    src_slice_pitch /= 4;
+    dst_slice_pitch /= 4;
+    static const char *str_intkernel =
+        "kernel void __cl_cpy_buffer_rect ( \n"
+        "		global int* src, global int* dst, \n"
+        "		unsigned int region0, unsigned int region1, unsigned int region2, \n"
+        "		unsigned int src_offset, unsigned int dst_offset, \n"
+        "		unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
+        "		unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
+        "  int i = get_global_id(0); \n"
+        "  int j = get_global_id(1); \n"
+        "  int k = get_global_id(2); \n"
+        "  region0 >>= 2; \n"
+        "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+        "	 return; \n"
+        "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
+        "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
+        "  dst[dst_offset] = src[src_offset]; \n"
+        "}";
+    /* setup the kernel and run. */
+    ker = cl_context_get_static_kernel(queue->ctx, index, 
+ str_intkernel, NULL);  } else {
+    static const char *str_kernel =
+        "kernel void __cl_cpy_buffer_rect ( \n"
+        "       global char* src, global char* dst, \n"
+        "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
+        "       unsigned int src_offset, unsigned int dst_offset, \n"
+        "       unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
+        "       unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
+        "  int i = get_global_id(0); \n"
+        "  int j = get_global_id(1); \n"
+        "  int k = get_global_id(2); \n"
+        "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+        "    return; \n"
+        "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
+        "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
+        "  dst[dst_offset] = src[src_offset]; \n"
+        "}";
+    /* setup the kernel and run. */
+    ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, 
+ NULL);  }
 
-  /* setup the kernel and run. */
-  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, NULL);
   if (!ker)
     return CL_OUT_OF_RESOURCES;
 
--
1.8.3.2

_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet
_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list