[Beignet] [PATCH] Fix memcpy and memset bug.

Yang Rong rong.r.yang at intel.com
Tue Oct 7 23:36:56 PDT 2014


In ocl_memcpy.ll and ocl_memset.ll, the int-wide path of memcpy and memset must
only be taken while index + 4 is less than size, and it also needs to consider
alignment. Because the GBE backend already optimizes contiguous char loads,
remove the int memcpy and memset optimization.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/libocl/src/ocl_memcpy.ll | 445 +++++++++++++----------------------
 backend/src/libocl/src/ocl_memset.ll | 148 ++++--------
 2 files changed, 205 insertions(+), 388 deletions(-)

diff --git a/backend/src/libocl/src/ocl_memcpy.ll b/backend/src/libocl/src/ocl_memcpy.ll
index 476033e..24b210f 100644
--- a/backend/src/libocl/src/ocl_memcpy.ll
+++ b/backend/src/libocl/src/ocl_memcpy.ll
@@ -1,10 +1,6 @@
 ;The memcpy's source code.
 ; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
 ;   size_t index = 0;
-;   while((index + 4) >= size) {
-;     *((uint *)(dst + index)) = *((uint *)(src + index));
-;     index += 4;
-;   }
 ;   while(index < size) {
 ;     dst[index] = src[index];
 ;     index++;
@@ -13,324 +9,207 @@
 
 define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
 entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %1 = load i32 addrspace(1)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  store i32 %1, i32 addrspace(1)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
-  %3 = load i8 addrspace(1)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(1)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  %3 = load i8 addrspace(1)* %2, align 1
+  %4 = ptrtoint i8 addrspace(1)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(1)*
+  store i8 %3, i8 addrspace(1)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
 
 define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
 entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
-  %1 = load i32 addrspace(0)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  store i32 %1, i32 addrspace(1)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
-  %3 = load i8 addrspace(0)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(0)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  %3 = load i8 addrspace(0)* %2, align 1
+  %4 = ptrtoint i8 addrspace(1)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(1)*
+  store i8 %3, i8 addrspace(1)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
 
 define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
 entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
-  %1 = load i32 addrspace(3)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  store i32 %1, i32 addrspace(1)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
-  %3 = load i8 addrspace(3)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(3)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  %3 = load i8 addrspace(3)* %2, align 1
+  %4 = ptrtoint i8 addrspace(1)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(1)*
+  store i8 %3, i8 addrspace(1)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
 
 define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
 entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %1 = load i32 addrspace(1)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
-  store i32 %1, i32 addrspace(0)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
-  %3 = load i8 addrspace(1)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(1)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  %3 = load i8 addrspace(1)* %2, align 1
+  %4 = ptrtoint i8 addrspace(0)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(0)*
+  store i8 %3, i8 addrspace(0)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
 
 define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
 entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
-  %1 = load i32 addrspace(0)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
-  store i32 %1, i32 addrspace(0)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
-  %3 = load i8 addrspace(0)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(0)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  %3 = load i8 addrspace(0)* %2, align 1
+  %4 = ptrtoint i8 addrspace(0)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(0)*
+  store i8 %3, i8 addrspace(0)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
 
 define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
 entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
-  %1 = load i32 addrspace(3)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
-  store i32 %1, i32 addrspace(0)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
-  %3 = load i8 addrspace(3)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(3)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  %3 = load i8 addrspace(3)* %2, align 1
+  %4 = ptrtoint i8 addrspace(0)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(0)*
+  store i8 %3, i8 addrspace(0)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
 
 define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
 entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %1 = load i32 addrspace(1)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
-  store i32 %1, i32 addrspace(3)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
-  %3 = load i8 addrspace(1)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(1)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  %3 = load i8 addrspace(1)* %2, align 1
+  %4 = ptrtoint i8 addrspace(3)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(3)*
+  store i8 %3, i8 addrspace(3)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
 
 define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
 entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
-  %1 = load i32 addrspace(0)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
-  store i32 %1, i32 addrspace(3)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
-  %3 = load i8 addrspace(0)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(0)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  %3 = load i8 addrspace(0)* %2, align 1
+  %4 = ptrtoint i8 addrspace(3)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(3)*
+  store i8 %3, i8 addrspace(3)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
 
 define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
 entry:
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond3, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
-  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
-  %1 = load i32 addrspace(3)* %0, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
-  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
-  store i32 %1, i32 addrspace(3)* %2, align 4
-  br label %while.cond
-
-while.cond3:                                      ; preds = %while.cond, %while.body5
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
-  %cmp4 = icmp ult i32 %index.1, %size
-  br i1 %cmp4, label %while.body5, label %while.end7
-
-while.body5:                                      ; preds = %while.cond3
-  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
-  %3 = load i8 addrspace(3)* %arrayidx, align 1
-  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
-  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond3
-
-while.end7:                                       ; preds = %while.cond3
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(3)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  %3 = load i8 addrspace(3)* %2, align 1
+  %4 = ptrtoint i8 addrspace(3)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(3)*
+  store i8 %3, i8 addrspace(3)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
diff --git a/backend/src/libocl/src/ocl_memset.ll b/backend/src/libocl/src/ocl_memset.ll
index addf9f5..7d3f03d 100644
--- a/backend/src/libocl/src/ocl_memset.ll
+++ b/backend/src/libocl/src/ocl_memset.ll
@@ -1,127 +1,65 @@
 ;The memset's source code.
 ; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
 ;   size_t index = 0;
-;   uint v = (val << 24) | (val << 16) | (val << 8) | val;
-;   while((index + 4) >= size) {
-;     *((uint *)(dst + index)) = v;
-;     index += 4;
-;   }
 ;   while(index < size) {
 ;     dst[index] = val;
 ;     index++;
 ;  }
 ; }
 
-define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+define void @__gen_memset_p(i8 addrspace(0)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
 entry:
-  %conv = zext i8 %val to i32
-  %shl = shl nuw i32 %conv, 24
-  %shl2 = shl nuw nsw i32 %conv, 16
-  %or = or i32 %shl, %shl2
-  %shl4 = shl nuw nsw i32 %conv, 8
-  %or5 = or i32 %or, %shl4
-  %or7 = or i32 %or5, %conv
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond10, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8* %dst, i32 %index.0
-  %0 = bitcast i8* %add.ptr to i32*
-  store i32 %or7, i32* %0, align 4
-  br label %while.cond
-
-while.cond10:                                     ; preds = %while.cond, %while.body13
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
-  %cmp11 = icmp ult i32 %index.1, %size
-  br i1 %cmp11, label %while.body13, label %while.end14
-
-while.body13:                                     ; preds = %while.cond10
-  %arrayidx = getelementptr inbounds i8* %dst, i32 %index.1
-  store i8 %val, i8* %arrayidx, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond10
-
-while.end14:                                      ; preds = %while.cond10
+  %cmp3 = icmp eq i32 %size, 0
+  br i1 %cmp3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(0)* %dst to i32
+  %1 = add i32 %0, %index.04
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  store i8 %val, i8 addrspace(0)* %2, align 1
+  %inc = add i32 %index.04, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
 
 define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
 entry:
-  %conv = zext i8 %val to i32
-  %shl = shl nuw i32 %conv, 24
-  %shl2 = shl nuw nsw i32 %conv, 16
-  %or = or i32 %shl, %shl2
-  %shl4 = shl nuw nsw i32 %conv, 8
-  %or5 = or i32 %or, %shl4
-  %or7 = or i32 %or5, %conv
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond10, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
-  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  store i32 %or7, i32 addrspace(1)* %0, align 4
-  br label %while.cond
-
-while.cond10:                                     ; preds = %while.cond, %while.body13
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
-  %cmp11 = icmp ult i32 %index.1, %size
-  br i1 %cmp11, label %while.body13, label %while.end14
-
-while.body13:                                     ; preds = %while.cond10
-  %arrayidx = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
-  store i8 %val, i8 addrspace(1)* %arrayidx, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond10
-
-while.end14:                                      ; preds = %while.cond10
+  %cmp3 = icmp eq i32 %size, 0
+  br i1 %cmp3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(1)* %dst to i32
+  %1 = add i32 %0, %index.04
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  store i8 %val, i8 addrspace(1)* %2, align 1
+  %inc = add i32 %index.04, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
 
 define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
 entry:
-  %conv = zext i8 %val to i32
-  %shl = shl nuw i32 %conv, 24
-  %shl2 = shl nuw nsw i32 %conv, 16
-  %or = or i32 %shl, %shl2
-  %shl4 = shl nuw nsw i32 %conv, 8
-  %or5 = or i32 %or, %shl4
-  %or7 = or i32 %or5, %conv
-  br label %while.cond
-
-while.cond:                                       ; preds = %while.body, %entry
-  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
-  %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
-  br i1 %cmp, label %while.cond10, label %while.body
-
-while.body:                                       ; preds = %while.cond
-  %add.ptr = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
-  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
-  store i32 %or7, i32 addrspace(3)* %0, align 4
-  br label %while.cond
-
-while.cond10:                                     ; preds = %while.cond, %while.body13
-  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
-  %cmp11 = icmp ult i32 %index.1, %size
-  br i1 %cmp11, label %while.body13, label %while.end14
-
-while.body13:                                     ; preds = %while.cond10
-  %arrayidx = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
-  store i8 %val, i8 addrspace(3)* %arrayidx, align 1
-  %inc = add i32 %index.1, 1
-  br label %while.cond10
-
-while.end14:                                      ; preds = %while.cond10
+  %cmp3 = icmp eq i32 %size, 0
+  br i1 %cmp3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(3)* %dst to i32
+  %1 = add i32 %0, %index.04
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  store i8 %val, i8 addrspace(3)* %2, align 1
+  %inc = add i32 %index.04, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
   ret void
 }
-- 
1.8.3.2



More information about the Beignet mailing list