[Beignet] [Patch V3] Fix memcpy and memset bug.

Yang Rong rong.r.yang at intel.com
Mon Oct 13 20:48:20 PDT 2014


In ocl_memcpy.ll and ocl_memset.ll, index+4 must not exceed size when memcpy and memset
access memory 4 bytes (one int) at a time, and alignment also needs to be considered.

V3: For performance, provide two versions of memcpy and memset, and decide which one to call when lowering the intrinsic.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/libocl/src/ocl_memcpy.ll         | 256 ++++++++++++++++++++++++---
 backend/src/libocl/src/ocl_memset.ll         |  80 ++++++++-
 backend/src/llvm/llvm_intrinsic_lowering.cpp |  14 +-
 3 files changed, 321 insertions(+), 29 deletions(-)

diff --git a/backend/src/libocl/src/ocl_memcpy.ll b/backend/src/libocl/src/ocl_memcpy.ll
index 476033e..fbc44d1 100644
--- a/backend/src/libocl/src/ocl_memcpy.ll
+++ b/backend/src/libocl/src/ocl_memcpy.ll
@@ -1,7 +1,7 @@
 ;The memcpy's source code.
-; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
+; INLINE_OVERLOADABLE void __gen_memcpy_align(uchar* dst, uchar* src, size_t size) {
 ;   size_t index = 0;
-;   while((index + 4) >= size) {
+;   while((index + 4) <= size) {
 ;     *((uint *)(dst + index)) = *((uint *)(src + index));
 ;     index += 4;
 ;   }
@@ -11,14 +11,14 @@
 ;   }
 ; }
 
-define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_gg_align(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -47,14 +47,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_gp_align(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -83,14 +83,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_gl_align(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -119,14 +119,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_pg_align(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -155,14 +155,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_pp_align(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -191,14 +191,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_pl_align(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -227,14 +227,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_lg_align(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -263,14 +263,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_lp_align(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -299,14 +299,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_ll_align(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -334,3 +334,219 @@ while.body5:                                      ; preds = %while.cond3
 while.end7:                                       ; preds = %while.cond3
   ret void
 }
+
+;The memcpy's source code.
+; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
+;   size_t index = 0;
+;   while(index < size) {
+;     dst[index] = src[index];
+;     index++;
+;   }
+; }
+
+define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(1)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  %3 = load i8 addrspace(1)* %2, align 1
+  %4 = ptrtoint i8 addrspace(1)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(1)*
+  store i8 %3, i8 addrspace(1)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(0)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  %3 = load i8 addrspace(0)* %2, align 1
+  %4 = ptrtoint i8 addrspace(1)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(1)*
+  store i8 %3, i8 addrspace(1)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(3)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  %3 = load i8 addrspace(3)* %2, align 1
+  %4 = ptrtoint i8 addrspace(1)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(1)*
+  store i8 %3, i8 addrspace(1)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(1)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  %3 = load i8 addrspace(1)* %2, align 1
+  %4 = ptrtoint i8 addrspace(0)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(0)*
+  store i8 %3, i8 addrspace(0)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(0)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  %3 = load i8 addrspace(0)* %2, align 1
+  %4 = ptrtoint i8 addrspace(0)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(0)*
+  store i8 %3, i8 addrspace(0)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(3)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  %3 = load i8 addrspace(3)* %2, align 1
+  %4 = ptrtoint i8 addrspace(0)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(0)*
+  store i8 %3, i8 addrspace(0)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(1)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  %3 = load i8 addrspace(1)* %2, align 1
+  %4 = ptrtoint i8 addrspace(3)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(3)*
+  store i8 %3, i8 addrspace(3)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(0)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  %3 = load i8 addrspace(0)* %2, align 1
+  %4 = ptrtoint i8 addrspace(3)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(3)*
+  store i8 %3, i8 addrspace(3)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(3)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  %3 = load i8 addrspace(3)* %2, align 1
+  %4 = ptrtoint i8 addrspace(3)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(3)*
+  store i8 %3, i8 addrspace(3)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
diff --git a/backend/src/libocl/src/ocl_memset.ll b/backend/src/libocl/src/ocl_memset.ll
index addf9f5..665eac4 100644
--- a/backend/src/libocl/src/ocl_memset.ll
+++ b/backend/src/libocl/src/ocl_memset.ll
@@ -1,5 +1,5 @@
 ;The memset's source code.
-; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
+; INLINE_OVERLOADABLE void __gen_memset_align(uchar* dst, uchar val, size_t size) {
 ;   size_t index = 0;
 ;   uint v = (val << 24) | (val << 16) | (val << 8) | val;
 ;   while((index + 4) >= size) {
@@ -12,7 +12,7 @@
 ;  }
 ; }
 
-define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+define void @__gen_memset_p_align(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
 entry:
   %conv = zext i8 %val to i32
   %shl = shl nuw i32 %conv, 24
@@ -26,7 +26,7 @@ entry:
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond10, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -50,7 +50,7 @@ while.end14:                                      ; preds = %while.cond10
   ret void
 }
 
-define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+define void @__gen_memset_g_align(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
 entry:
   %conv = zext i8 %val to i32
   %shl = shl nuw i32 %conv, 24
@@ -64,7 +64,7 @@ entry:
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond10, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -88,7 +88,7 @@ while.end14:                                      ; preds = %while.cond10
   ret void
 }
 
-define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+define void @__gen_memset_l_align(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
 entry:
   %conv = zext i8 %val to i32
   %shl = shl nuw i32 %conv, 24
@@ -102,7 +102,7 @@ entry:
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond10, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -125,3 +125,69 @@ while.body13:                                     ; preds = %while.cond10
 while.end14:                                      ; preds = %while.cond10
   ret void
 }
+
+;The memset's source code.
+; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
+;   size_t index = 0;
+;   while(index < size) {
+;     dst[index] = val;
+;     index++;
+;  }
+; }
+
+define void @__gen_memset_p(i8 addrspace(0)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp3 = icmp eq i32 %size, 0
+  br i1 %cmp3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(0)* %dst to i32
+  %1 = add i32 %0, %index.04
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  store i8 %val, i8 addrspace(0)* %2, align 1
+  %inc = add i32 %index.04, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp3 = icmp eq i32 %size, 0
+  br i1 %cmp3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(1)* %dst to i32
+  %1 = add i32 %0, %index.04
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  store i8 %val, i8 addrspace(1)* %2, align 1
+  %inc = add i32 %index.04, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp3 = icmp eq i32 %size, 0
+  br i1 %cmp3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(3)* %dst to i32
+  %1 = add i32 %0, %index.04
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  store i8 %val, i8 addrspace(3)* %2, align 1
+  %inc = add i32 %index.04, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
index 7d04318..cfb18ab 100644
--- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -126,13 +126,18 @@ namespace gbe {
                 Type *IntPtr = TD.getIntPtrType(Context);
                 Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
                                                     /* isSigned */ false);
+                Value *align = Builder.CreateIntCast(CI->getArgOperand(3), IntPtr,
+                                                    /* isSigned */ false);
+                ConstantInt *ci = dyn_cast<ConstantInt>(align);
                 Value *Ops[3];
                 Ops[0] = CI->getArgOperand(0);
                 Ops[1] = CI->getArgOperand(1);
                 Ops[2] = Size;
-                char name[16] = "__gen_memcpy_xx";
+                char name[24] = "__gen_memcpy_xx";
                 name[13] = convertSpaceToName(Ops[0]);
                 name[14] = convertSpaceToName(Ops[1]);
+                if(ci && (ci->getZExtValue() % 4 == 0)) //alignment is constant and 4 byte align
+                  strcat(name, "_align");
                 replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
                 break;
               }
@@ -143,13 +148,18 @@ namespace gbe {
                 Type *IntPtr = TD.getIntPtrType(Op0->getType());
                 Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
                                                     /* isSigned */ false);
+                Value *align = Builder.CreateIntCast(CI->getArgOperand(3), IntPtr,
+                                                    /* isSigned */ false);
+                ConstantInt *ci = dyn_cast<ConstantInt>(align);
                 Value *Ops[3];
                 Ops[0] = Op0;
                 // Extend the amount to i32.
                 Ops[1] = val;
                 Ops[2] = Size;
-                char name[16] = "__gen_memset_x";
+                char name[24] = "__gen_memset_x";
                 name[13] = convertSpaceToName(Ops[0]);
+                if(ci && (ci->getZExtValue() % 4 == 0)) //alignment is constant and 4 byte align
+                  strcat(name, "_align");
                 replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
                 break;
               }
-- 
1.8.3.2



More information about the Beignet mailing list