[Beignet] [Patch V3] Fix memcpy and memset bug.

Zhigang Gong zhigang.gong at linux.intel.com
Mon Oct 13 22:25:36 PDT 2014


This patch LGTM, but it lacks the following change. As these functions may be
used after the bitcode link, we have to force the module not to drop these
unused functions. Will push later, thanks.

diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 7ef6a8a..fa09703 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -157,6 +157,20 @@ namespace gbe
     builtinFuncs.push_back("__gen_memset_g");
     builtinFuncs.push_back("__gen_memset_l");

+    builtinFuncs.push_back("__gen_memcpy_gg_align");
+    builtinFuncs.push_back("__gen_memcpy_gp_align");
+    builtinFuncs.push_back("__gen_memcpy_gl_align");
+    builtinFuncs.push_back("__gen_memcpy_pg_align");
+    builtinFuncs.push_back("__gen_memcpy_pp_align");
+    builtinFuncs.push_back("__gen_memcpy_pl_align");
+    builtinFuncs.push_back("__gen_memcpy_lg_align");
+    builtinFuncs.push_back("__gen_memcpy_lp_align");
+    builtinFuncs.push_back("__gen_memcpy_ll_align");
+    builtinFuncs.push_back("__gen_memset_p_align");
+    builtinFuncs.push_back("__gen_memset_g_align");
+    builtinFuncs.push_back("__gen_memset_l_align");
+
+
     for (Module::iterator SF = mod->begin(), E = mod->end(); SF != E; ++SF) {
       if (SF->isDeclaration()) continue;
       if (!isKernelFunction(*SF)) continue;


On Tue, Oct 14, 2014 at 11:48:20AM +0800, Yang Rong wrote:
> In ocl_memcpy.ll and ocl_memset.ll, index+4 must not exceed size when using int
> accesses in memcpy and memset, and alignment needs to be considered.
> 
> V3: For performance, provide two versions of memcpy and memset, and decide which one to call when lowering the intrinsic.
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
>  backend/src/libocl/src/ocl_memcpy.ll         | 256 ++++++++++++++++++++++++---
>  backend/src/libocl/src/ocl_memset.ll         |  80 ++++++++-
>  backend/src/llvm/llvm_intrinsic_lowering.cpp |  14 +-
>  3 files changed, 321 insertions(+), 29 deletions(-)
> 
> diff --git a/backend/src/libocl/src/ocl_memcpy.ll b/backend/src/libocl/src/ocl_memcpy.ll
> index 476033e..fbc44d1 100644
> --- a/backend/src/libocl/src/ocl_memcpy.ll
> +++ b/backend/src/libocl/src/ocl_memcpy.ll
> @@ -1,7 +1,7 @@
>  ;The memcpy's source code.
> -; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
> +; INLINE_OVERLOADABLE void __gen_memcpy_align(uchar* dst, uchar* src, size_t size) {
>  ;   size_t index = 0;
> -;   while((index + 4) >= size) {
> +;   while((index + 4) <= size) {
>  ;     *((uint *)(dst + index)) = *((uint *)(src + index));
>  ;     index += 4;
>  ;   }
> @@ -11,14 +11,14 @@
>  ;   }
>  ; }
>  
> -define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_gg_align(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
>  entry:
>    br label %while.cond
>  
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond3, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -47,14 +47,14 @@ while.end7:                                       ; preds = %while.cond3
>    ret void
>  }
>  
> -define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_gp_align(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
>  entry:
>    br label %while.cond
>  
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond3, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -83,14 +83,14 @@ while.end7:                                       ; preds = %while.cond3
>    ret void
>  }
>  
> -define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_gl_align(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
>  entry:
>    br label %while.cond
>  
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond3, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -119,14 +119,14 @@ while.end7:                                       ; preds = %while.cond3
>    ret void
>  }
>  
> -define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_pg_align(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
>  entry:
>    br label %while.cond
>  
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond3, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -155,14 +155,14 @@ while.end7:                                       ; preds = %while.cond3
>    ret void
>  }
>  
> -define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_pp_align(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
>  entry:
>    br label %while.cond
>  
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond3, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -191,14 +191,14 @@ while.end7:                                       ; preds = %while.cond3
>    ret void
>  }
>  
> -define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_pl_align(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
>  entry:
>    br label %while.cond
>  
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond3, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -227,14 +227,14 @@ while.end7:                                       ; preds = %while.cond3
>    ret void
>  }
>  
> -define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_lg_align(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
>  entry:
>    br label %while.cond
>  
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond3, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -263,14 +263,14 @@ while.end7:                                       ; preds = %while.cond3
>    ret void
>  }
>  
> -define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_lp_align(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
>  entry:
>    br label %while.cond
>  
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond3, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -299,14 +299,14 @@ while.end7:                                       ; preds = %while.cond3
>    ret void
>  }
>  
> -define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_ll_align(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
>  entry:
>    br label %while.cond
>  
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond3, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -334,3 +334,219 @@ while.body5:                                      ; preds = %while.cond3
>  while.end7:                                       ; preds = %while.cond3
>    ret void
>  }
> +
> +;The memcpy's source code.
> +; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
> +;   size_t index = 0;
> +;   while(index < size) {
> +;     dst[index] = src[index];
> +;     index++;
> +;   }
> +; }
> +
> +define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp4 = icmp eq i32 %size, 0
> +  br i1 %cmp4, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(1)* %src to i32
> +  %1 = add i32 %0, %index.05
> +  %2 = inttoptr i32 %1 to i8 addrspace(1)*
> +  %3 = load i8 addrspace(1)* %2, align 1
> +  %4 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %5 = add i32 %4, %index.05
> +  %6 = inttoptr i32 %5 to i8 addrspace(1)*
> +  store i8 %3, i8 addrspace(1)* %6, align 1
> +  %inc = add i32 %index.05, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> +
> +define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp4 = icmp eq i32 %size, 0
> +  br i1 %cmp4, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(0)* %src to i32
> +  %1 = add i32 %0, %index.05
> +  %2 = inttoptr i32 %1 to i8 addrspace(0)*
> +  %3 = load i8 addrspace(0)* %2, align 1
> +  %4 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %5 = add i32 %4, %index.05
> +  %6 = inttoptr i32 %5 to i8 addrspace(1)*
> +  store i8 %3, i8 addrspace(1)* %6, align 1
> +  %inc = add i32 %index.05, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> +
> +define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp4 = icmp eq i32 %size, 0
> +  br i1 %cmp4, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(3)* %src to i32
> +  %1 = add i32 %0, %index.05
> +  %2 = inttoptr i32 %1 to i8 addrspace(3)*
> +  %3 = load i8 addrspace(3)* %2, align 1
> +  %4 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %5 = add i32 %4, %index.05
> +  %6 = inttoptr i32 %5 to i8 addrspace(1)*
> +  store i8 %3, i8 addrspace(1)* %6, align 1
> +  %inc = add i32 %index.05, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> +
> +define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp4 = icmp eq i32 %size, 0
> +  br i1 %cmp4, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(1)* %src to i32
> +  %1 = add i32 %0, %index.05
> +  %2 = inttoptr i32 %1 to i8 addrspace(1)*
> +  %3 = load i8 addrspace(1)* %2, align 1
> +  %4 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %5 = add i32 %4, %index.05
> +  %6 = inttoptr i32 %5 to i8 addrspace(0)*
> +  store i8 %3, i8 addrspace(0)* %6, align 1
> +  %inc = add i32 %index.05, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> +
> +define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp4 = icmp eq i32 %size, 0
> +  br i1 %cmp4, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(0)* %src to i32
> +  %1 = add i32 %0, %index.05
> +  %2 = inttoptr i32 %1 to i8 addrspace(0)*
> +  %3 = load i8 addrspace(0)* %2, align 1
> +  %4 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %5 = add i32 %4, %index.05
> +  %6 = inttoptr i32 %5 to i8 addrspace(0)*
> +  store i8 %3, i8 addrspace(0)* %6, align 1
> +  %inc = add i32 %index.05, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> +
> +define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp4 = icmp eq i32 %size, 0
> +  br i1 %cmp4, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(3)* %src to i32
> +  %1 = add i32 %0, %index.05
> +  %2 = inttoptr i32 %1 to i8 addrspace(3)*
> +  %3 = load i8 addrspace(3)* %2, align 1
> +  %4 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %5 = add i32 %4, %index.05
> +  %6 = inttoptr i32 %5 to i8 addrspace(0)*
> +  store i8 %3, i8 addrspace(0)* %6, align 1
> +  %inc = add i32 %index.05, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> +
> +define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp4 = icmp eq i32 %size, 0
> +  br i1 %cmp4, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(1)* %src to i32
> +  %1 = add i32 %0, %index.05
> +  %2 = inttoptr i32 %1 to i8 addrspace(1)*
> +  %3 = load i8 addrspace(1)* %2, align 1
> +  %4 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %5 = add i32 %4, %index.05
> +  %6 = inttoptr i32 %5 to i8 addrspace(3)*
> +  store i8 %3, i8 addrspace(3)* %6, align 1
> +  %inc = add i32 %index.05, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> +
> +define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp4 = icmp eq i32 %size, 0
> +  br i1 %cmp4, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(0)* %src to i32
> +  %1 = add i32 %0, %index.05
> +  %2 = inttoptr i32 %1 to i8 addrspace(0)*
> +  %3 = load i8 addrspace(0)* %2, align 1
> +  %4 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %5 = add i32 %4, %index.05
> +  %6 = inttoptr i32 %5 to i8 addrspace(3)*
> +  store i8 %3, i8 addrspace(3)* %6, align 1
> +  %inc = add i32 %index.05, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> +
> +define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp4 = icmp eq i32 %size, 0
> +  br i1 %cmp4, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(3)* %src to i32
> +  %1 = add i32 %0, %index.05
> +  %2 = inttoptr i32 %1 to i8 addrspace(3)*
> +  %3 = load i8 addrspace(3)* %2, align 1
> +  %4 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %5 = add i32 %4, %index.05
> +  %6 = inttoptr i32 %5 to i8 addrspace(3)*
> +  store i8 %3, i8 addrspace(3)* %6, align 1
> +  %inc = add i32 %index.05, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> diff --git a/backend/src/libocl/src/ocl_memset.ll b/backend/src/libocl/src/ocl_memset.ll
> index addf9f5..665eac4 100644
> --- a/backend/src/libocl/src/ocl_memset.ll
> +++ b/backend/src/libocl/src/ocl_memset.ll
> @@ -1,5 +1,5 @@
>  ;The memset's source code.
> -; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
> +; INLINE_OVERLOADABLE void __gen_memset_align(uchar* dst, uchar val, size_t size) {
>  ;   size_t index = 0;
>  ;   uint v = (val << 24) | (val << 16) | (val << 8) | val;
>  ;   while((index + 4) >= size) {
> @@ -12,7 +12,7 @@
>  ;  }
>  ; }
>  
> -define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +define void @__gen_memset_p_align(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
>  entry:
>    %conv = zext i8 %val to i32
>    %shl = shl nuw i32 %conv, 24
> @@ -26,7 +26,7 @@ entry:
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond10, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -50,7 +50,7 @@ while.end14:                                      ; preds = %while.cond10
>    ret void
>  }
>  
> -define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +define void @__gen_memset_g_align(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
>  entry:
>    %conv = zext i8 %val to i32
>    %shl = shl nuw i32 %conv, 24
> @@ -64,7 +64,7 @@ entry:
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond10, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -88,7 +88,7 @@ while.end14:                                      ; preds = %while.cond10
>    ret void
>  }
>  
> -define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +define void @__gen_memset_l_align(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
>  entry:
>    %conv = zext i8 %val to i32
>    %shl = shl nuw i32 %conv, 24
> @@ -102,7 +102,7 @@ entry:
>  while.cond:                                       ; preds = %while.body, %entry
>    %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
>    %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> +  %cmp = icmp ugt i32 %add, %size
>    br i1 %cmp, label %while.cond10, label %while.body
>  
>  while.body:                                       ; preds = %while.cond
> @@ -125,3 +125,69 @@ while.body13:                                     ; preds = %while.cond10
>  while.end14:                                      ; preds = %while.cond10
>    ret void
>  }
> +
> +;The memset's source code.
> +; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
> +;   size_t index = 0;
> +;   while(index < size) {
> +;     dst[index] = val;
> +;     index++;
> +;  }
> +; }
> +
> +define void @__gen_memset_p(i8 addrspace(0)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp3 = icmp eq i32 %size, 0
> +  br i1 %cmp3, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %1 = add i32 %0, %index.04
> +  %2 = inttoptr i32 %1 to i8 addrspace(0)*
> +  store i8 %val, i8 addrspace(0)* %2, align 1
> +  %inc = add i32 %index.04, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> +
> +define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp3 = icmp eq i32 %size, 0
> +  br i1 %cmp3, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %1 = add i32 %0, %index.04
> +  %2 = inttoptr i32 %1 to i8 addrspace(1)*
> +  store i8 %val, i8 addrspace(1)* %2, align 1
> +  %inc = add i32 %index.04, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> +
> +define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +entry:
> +  %cmp3 = icmp eq i32 %size, 0
> +  br i1 %cmp3, label %while.end, label %while.body
> +
> +while.body:                                       ; preds = %entry, %while.body
> +  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %1 = add i32 %0, %index.04
> +  %2 = inttoptr i32 %1 to i8 addrspace(3)*
> +  store i8 %val, i8 addrspace(3)* %2, align 1
> +  %inc = add i32 %index.04, 1
> +  %cmp = icmp ult i32 %inc, %size
> +  br i1 %cmp, label %while.body, label %while.end
> +
> +while.end:                                        ; preds = %while.body, %entry
> +  ret void
> +}
> diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
> index 7d04318..cfb18ab 100644
> --- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
> +++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
> @@ -126,13 +126,18 @@ namespace gbe {
>                  Type *IntPtr = TD.getIntPtrType(Context);
>                  Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
>                                                      /* isSigned */ false);
> +                Value *align = Builder.CreateIntCast(CI->getArgOperand(3), IntPtr,
> +                                                    /* isSigned */ false);
> +                ConstantInt *ci = dyn_cast<ConstantInt>(align);
>                  Value *Ops[3];
>                  Ops[0] = CI->getArgOperand(0);
>                  Ops[1] = CI->getArgOperand(1);
>                  Ops[2] = Size;
> -                char name[16] = "__gen_memcpy_xx";
> +                char name[24] = "__gen_memcpy_xx";
>                  name[13] = convertSpaceToName(Ops[0]);
>                  name[14] = convertSpaceToName(Ops[1]);
> +                if(ci && (ci->getZExtValue() % 4 == 0)) //alignment is constant and 4 byte align
> +                  strcat(name, "_align");
>                  replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
>                  break;
>                }
> @@ -143,13 +148,18 @@ namespace gbe {
>                  Type *IntPtr = TD.getIntPtrType(Op0->getType());
>                  Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
>                                                      /* isSigned */ false);
> +                Value *align = Builder.CreateIntCast(CI->getArgOperand(3), IntPtr,
> +                                                    /* isSigned */ false);
> +                ConstantInt *ci = dyn_cast<ConstantInt>(align);
>                  Value *Ops[3];
>                  Ops[0] = Op0;
>                  // Extend the amount to i32.
>                  Ops[1] = val;
>                  Ops[2] = Size;
> -                char name[16] = "__gen_memset_x";
> +                char name[24] = "__gen_memset_x";
>                  name[13] = convertSpaceToName(Ops[0]);
> +                if(ci && (ci->getZExtValue() % 4 == 0)) //alignment is constant and 4 byte align
> +                  strcat(name, "_align");
>                  replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
>                  break;
>                }
> -- 
> 1.8.3.2
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list