[Beignet] [Patch V2] Fix memcpy and memset bug.

Song, Ruiling ruiling.song at intel.com
Fri Oct 10 01:08:35 PDT 2014


The patch LGTM

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Yang Rong
> Sent: Wednesday, October 08, 2014 3:56 PM
> To: beignet at lists.freedesktop.org
> Cc: Yang, Rong R
> Subject: [Beignet] [Patch V2] Fix memcpy and memset bug.
> 
> In ocl_memcpy.ll and ocl_memset.ll, index + 4 must be less than or equal to
> size when using int-sized accesses in memcpy and memset, and the alignment
> also needs to be taken into account.
> 
> V2: Add the alignment argument, fix the condition.
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
>  backend/src/libocl/src/ocl_memcpy.ll         | 704
> ++++++++++++++++-----------
>  backend/src/libocl/src/ocl_memset.ll         | 225 +++++----
>  backend/src/llvm/llvm_intrinsic_lowering.cpp |  14 +-
>  3 files changed, 546 insertions(+), 397 deletions(-)
> 
> diff --git a/backend/src/libocl/src/ocl_memcpy.ll
> b/backend/src/libocl/src/ocl_memcpy.ll
> index 476033e..64c68bb 100644
> --- a/backend/src/libocl/src/ocl_memcpy.ll
> +++ b/backend/src/libocl/src/ocl_memcpy.ll
> @@ -1,336 +1,446 @@
>  ;The memcpy's source code.
> -; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
> +; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size, uint alignment) {
>  ;   size_t index = 0;
> -;   while((index + 4) >= size) {
> -;     *((uint *)(dst + index)) = *((uint *)(src + index));
> -;     index += 4;
> -;   }
> -;   while(index < size) {
> -;     dst[index] = src[index];
> -;     index++;
> -;   }
> +;  	if(alignment % 4 == 0) {
> +;      while((index + 4) <= size) {
> +;        *((__global  uint *)(dst + index)) = *((__global uint *)(src + index));
> +;        index += 4;
> +;      }
> +;    }
> +;    while(index < size) {
> +;      dst[index] = src[index];
> +;      index++;
> +;    }
>  ; }
> 
> -define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8
> addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)*
> +%src, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond3, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
> -  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
> -  %1 = load i32 addrspace(1)* %0, align 4
> -  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
> -  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
> -  store i32 %1, i32 addrspace(1)* %2, align 4
> -  br label %while.cond
> -
> -while.cond3:                                      ; preds
> = %while.cond, %while.body5
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> -  %cmp4 = icmp ult i32 %index.1, %size
> -  br i1 %cmp4, label %while.body5, label %while.end7
> -
> -while.body5:                                      ; preds
> = %while.cond3
> -  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
> -  %3 = load i8 addrspace(1)* %arrayidx, align 1
> -  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
> -  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond3
> -
> -while.end7:                                       ; preds
> = %while.cond3
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp ne i32 %rem, 0
> +  %cmp113 = icmp ult i32 %size, 4
> +  %or.cond = or i1 %cmp, %cmp113
> +  br i1 %or.cond, label %while.cond4.preheader, label %while.body
> +
> +while.cond4.preheader:                            ; preds
> = %entry, %while.body
> +  %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ]
> +  %cmp511 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp511, label %while.body6, label %while.end8
> +
> +while.body:                                       ; preds
> = %entry, %while.body
> +  %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ]
> +  %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(1)* %src to i32
> +  %1 = add i32 %0, %index.014
> +  %2 = inttoptr i32 %1 to i8 addrspace(1)*
> +  %3 = bitcast i8 addrspace(1)* %2 to i32 addrspace(1)*
> +  %4 = load i32 addrspace(1)* %3, align 4
> +  %5 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %6 = add i32 %5, %index.014
> +  %7 = inttoptr i32 %6 to i8 addrspace(1)*
> +  %8 = bitcast i8 addrspace(1)* %7 to i32 addrspace(1)*
> +  store i32 %4, i32 addrspace(1)* %8, align 4
> +  %add = add i32 %add15, 4
> +  %cmp1 = icmp ugt i32 %add, %size
> +  br i1 %cmp1, label %while.cond4.preheader, label %while.body
> +
> +while.body6:                                      ; preds
> = %while.cond4.preheader, %while.body6
> +  %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph,
> +%while.cond4.preheader ]
> +  %9 = ptrtoint i8 addrspace(1)* %src to i32
> +  %10 = add i32 %9, %index.112
> +  %11 = inttoptr i32 %10 to i8 addrspace(1)*
> +  %12 = load i8 addrspace(1)* %11, align 1
> +  %13 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %14 = add i32 %13, %index.112
> +  %15 = inttoptr i32 %14 to i8 addrspace(1)*
> +  store i8 %12, i8 addrspace(1)* %15, align 1
> +  %inc = add i32 %index.112, 1
> +  %cmp5 = icmp ult i32 %inc, %size
> +  br i1 %cmp5, label %while.body6, label %while.end8
> +
> +while.end8:                                       ; preds
> = %while.body6, %while.cond4.preheader
>    ret void
>  }
> 
> -define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8
> addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)*
> +%src, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond3, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
> -  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
> -  %1 = load i32 addrspace(0)* %0, align 4
> -  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
> -  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
> -  store i32 %1, i32 addrspace(1)* %2, align 4
> -  br label %while.cond
> -
> -while.cond3:                                      ; preds
> = %while.cond, %while.body5
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> -  %cmp4 = icmp ult i32 %index.1, %size
> -  br i1 %cmp4, label %while.body5, label %while.end7
> -
> -while.body5:                                      ; preds
> = %while.cond3
> -  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
> -  %3 = load i8 addrspace(0)* %arrayidx, align 1
> -  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
> -  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond3
> -
> -while.end7:                                       ; preds
> = %while.cond3
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp ne i32 %rem, 0
> +  %cmp113 = icmp ult i32 %size, 4
> +  %or.cond = or i1 %cmp, %cmp113
> +  br i1 %or.cond, label %while.cond4.preheader, label %while.body
> +
> +while.cond4.preheader:                            ; preds
> = %entry, %while.body
> +  %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ]
> +  %cmp511 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp511, label %while.body6, label %while.end8
> +
> +while.body:                                       ; preds
> = %entry, %while.body
> +  %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ]
> +  %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(0)* %src to i32
> +  %1 = add i32 %0, %index.014
> +  %2 = inttoptr i32 %1 to i8 addrspace(0)*
> +  %3 = bitcast i8 addrspace(0)* %2 to i32 addrspace(0)*
> +  %4 = load i32 addrspace(0)* %3, align 4
> +  %5 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %6 = add i32 %5, %index.014
> +  %7 = inttoptr i32 %6 to i8 addrspace(1)*
> +  %8 = bitcast i8 addrspace(1)* %7 to i32 addrspace(1)*
> +  store i32 %4, i32 addrspace(1)* %8, align 4
> +  %add = add i32 %add15, 4
> +  %cmp1 = icmp ugt i32 %add, %size
> +  br i1 %cmp1, label %while.cond4.preheader, label %while.body
> +
> +while.body6:                                      ; preds
> = %while.cond4.preheader, %while.body6
> +  %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph,
> +%while.cond4.preheader ]
> +  %9 = ptrtoint i8 addrspace(0)* %src to i32
> +  %10 = add i32 %9, %index.112
> +  %11 = inttoptr i32 %10 to i8 addrspace(0)*
> +  %12 = load i8 addrspace(0)* %11, align 1
> +  %13 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %14 = add i32 %13, %index.112
> +  %15 = inttoptr i32 %14 to i8 addrspace(1)*
> +  store i8 %12, i8 addrspace(1)* %15, align 1
> +  %inc = add i32 %index.112, 1
> +  %cmp5 = icmp ult i32 %inc, %size
> +  br i1 %cmp5, label %while.body6, label %while.end8
> +
> +while.end8:                                       ; preds
> = %while.body6, %while.cond4.preheader
>    ret void
>  }
> 
> -define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8
> addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)*
> +%src, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond3, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
> -  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
> -  %1 = load i32 addrspace(3)* %0, align 4
> -  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
> -  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
> -  store i32 %1, i32 addrspace(1)* %2, align 4
> -  br label %while.cond
> -
> -while.cond3:                                      ; preds
> = %while.cond, %while.body5
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> -  %cmp4 = icmp ult i32 %index.1, %size
> -  br i1 %cmp4, label %while.body5, label %while.end7
> -
> -while.body5:                                      ; preds
> = %while.cond3
> -  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
> -  %3 = load i8 addrspace(3)* %arrayidx, align 1
> -  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
> -  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond3
> -
> -while.end7:                                       ; preds
> = %while.cond3
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp ne i32 %rem, 0
> +  %cmp113 = icmp ult i32 %size, 4
> +  %or.cond = or i1 %cmp, %cmp113
> +  br i1 %or.cond, label %while.cond4.preheader, label %while.body
> +
> +while.cond4.preheader:                            ; preds
> = %entry, %while.body
> +  %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ]
> +  %cmp511 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp511, label %while.body6, label %while.end8
> +
> +while.body:                                       ; preds
> = %entry, %while.body
> +  %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ]
> +  %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(3)* %src to i32
> +  %1 = add i32 %0, %index.014
> +  %2 = inttoptr i32 %1 to i8 addrspace(3)*
> +  %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)*
> +  %4 = load i32 addrspace(3)* %3, align 4
> +  %5 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %6 = add i32 %5, %index.014
> +  %7 = inttoptr i32 %6 to i8 addrspace(1)*
> +  %8 = bitcast i8 addrspace(1)* %7 to i32 addrspace(1)*
> +  store i32 %4, i32 addrspace(1)* %8, align 4
> +  %add = add i32 %add15, 4
> +  %cmp1 = icmp ugt i32 %add, %size
> +  br i1 %cmp1, label %while.cond4.preheader, label %while.body
> +
> +while.body6:                                      ; preds
> = %while.cond4.preheader, %while.body6
> +  %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph,
> +%while.cond4.preheader ]
> +  %9 = ptrtoint i8 addrspace(3)* %src to i32
> +  %10 = add i32 %9, %index.112
> +  %11 = inttoptr i32 %10 to i8 addrspace(3)*
> +  %12 = load i8 addrspace(3)* %11, align 1
> +  %13 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %14 = add i32 %13, %index.112
> +  %15 = inttoptr i32 %14 to i8 addrspace(1)*
> +  store i8 %12, i8 addrspace(1)* %15, align 1
> +  %inc = add i32 %index.112, 1
> +  %cmp5 = icmp ult i32 %inc, %size
> +  br i1 %cmp5, label %while.body6, label %while.end8
> +
> +while.end8:                                       ; preds
> = %while.body6, %while.cond4.preheader
>    ret void
>  }
> 
> -define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8
> addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)*
> +%src, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond3, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
> -  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
> -  %1 = load i32 addrspace(1)* %0, align 4
> -  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
> -  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
> -  store i32 %1, i32 addrspace(0)* %2, align 4
> -  br label %while.cond
> -
> -while.cond3:                                      ; preds
> = %while.cond, %while.body5
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> -  %cmp4 = icmp ult i32 %index.1, %size
> -  br i1 %cmp4, label %while.body5, label %while.end7
> -
> -while.body5:                                      ; preds
> = %while.cond3
> -  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
> -  %3 = load i8 addrspace(1)* %arrayidx, align 1
> -  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
> -  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond3
> -
> -while.end7:                                       ; preds
> = %while.cond3
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp ne i32 %rem, 0
> +  %cmp113 = icmp ult i32 %size, 4
> +  %or.cond = or i1 %cmp, %cmp113
> +  br i1 %or.cond, label %while.cond4.preheader, label %while.body
> +
> +while.cond4.preheader:                            ; preds
> = %entry, %while.body
> +  %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ]
> +  %cmp511 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp511, label %while.body6, label %while.end8
> +
> +while.body:                                       ; preds
> = %entry, %while.body
> +  %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ]
> +  %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(1)* %src to i32
> +  %1 = add i32 %0, %index.014
> +  %2 = inttoptr i32 %1 to i8 addrspace(1)*
> +  %3 = bitcast i8 addrspace(1)* %2 to i32 addrspace(1)*
> +  %4 = load i32 addrspace(1)* %3, align 4
> +  %5 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %6 = add i32 %5, %index.014
> +  %7 = inttoptr i32 %6 to i8 addrspace(0)*
> +  %8 = bitcast i8 addrspace(0)* %7 to i32 addrspace(0)*
> +  store i32 %4, i32 addrspace(0)* %8, align 4
> +  %add = add i32 %add15, 4
> +  %cmp1 = icmp ugt i32 %add, %size
> +  br i1 %cmp1, label %while.cond4.preheader, label %while.body
> +
> +while.body6:                                      ; preds
> = %while.cond4.preheader, %while.body6
> +  %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph,
> +%while.cond4.preheader ]
> +  %9 = ptrtoint i8 addrspace(1)* %src to i32
> +  %10 = add i32 %9, %index.112
> +  %11 = inttoptr i32 %10 to i8 addrspace(1)*
> +  %12 = load i8 addrspace(1)* %11, align 1
> +  %13 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %14 = add i32 %13, %index.112
> +  %15 = inttoptr i32 %14 to i8 addrspace(0)*
> +  store i8 %12, i8 addrspace(0)* %15, align 1
> +  %inc = add i32 %index.112, 1
> +  %cmp5 = icmp ult i32 %inc, %size
> +  br i1 %cmp5, label %while.body6, label %while.end8
> +
> +while.end8:                                       ; preds
> = %while.body6, %while.cond4.preheader
>    ret void
>  }
> 
> -define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8
> addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)*
> +%src, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond3, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
> -  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
> -  %1 = load i32 addrspace(0)* %0, align 4
> -  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
> -  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
> -  store i32 %1, i32 addrspace(0)* %2, align 4
> -  br label %while.cond
> -
> -while.cond3:                                      ; preds
> = %while.cond, %while.body5
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> -  %cmp4 = icmp ult i32 %index.1, %size
> -  br i1 %cmp4, label %while.body5, label %while.end7
> -
> -while.body5:                                      ; preds
> = %while.cond3
> -  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
> -  %3 = load i8 addrspace(0)* %arrayidx, align 1
> -  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
> -  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond3
> -
> -while.end7:                                       ; preds
> = %while.cond3
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp ne i32 %rem, 0
> +  %cmp113 = icmp ult i32 %size, 4
> +  %or.cond = or i1 %cmp, %cmp113
> +  br i1 %or.cond, label %while.cond4.preheader, label %while.body
> +
> +while.cond4.preheader:                            ; preds
> = %entry, %while.body
> +  %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ]
> +  %cmp511 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp511, label %while.body6, label %while.end8
> +
> +while.body:                                       ; preds
> = %entry, %while.body
> +  %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ]
> +  %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(0)* %src to i32
> +  %1 = add i32 %0, %index.014
> +  %2 = inttoptr i32 %1 to i8 addrspace(0)*
> +  %3 = bitcast i8 addrspace(0)* %2 to i32 addrspace(0)*
> +  %4 = load i32 addrspace(0)* %3, align 4
> +  %5 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %6 = add i32 %5, %index.014
> +  %7 = inttoptr i32 %6 to i8 addrspace(0)*
> +  %8 = bitcast i8 addrspace(0)* %7 to i32 addrspace(0)*
> +  store i32 %4, i32 addrspace(0)* %8, align 4
> +  %add = add i32 %add15, 4
> +  %cmp1 = icmp ugt i32 %add, %size
> +  br i1 %cmp1, label %while.cond4.preheader, label %while.body
> +
> +while.body6:                                      ; preds
> = %while.cond4.preheader, %while.body6
> +  %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph,
> +%while.cond4.preheader ]
> +  %9 = ptrtoint i8 addrspace(0)* %src to i32
> +  %10 = add i32 %9, %index.112
> +  %11 = inttoptr i32 %10 to i8 addrspace(0)*
> +  %12 = load i8 addrspace(0)* %11, align 1
> +  %13 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %14 = add i32 %13, %index.112
> +  %15 = inttoptr i32 %14 to i8 addrspace(0)*
> +  store i8 %12, i8 addrspace(0)* %15, align 1
> +  %inc = add i32 %index.112, 1
> +  %cmp5 = icmp ult i32 %inc, %size
> +  br i1 %cmp5, label %while.body6, label %while.end8
> +
> +while.end8:                                       ; preds
> = %while.body6, %while.cond4.preheader
>    ret void
>  }
> 
> -define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8
> addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)*
> +%src, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond3, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
> -  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
> -  %1 = load i32 addrspace(3)* %0, align 4
> -  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
> -  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
> -  store i32 %1, i32 addrspace(0)* %2, align 4
> -  br label %while.cond
> -
> -while.cond3:                                      ; preds
> = %while.cond, %while.body5
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> -  %cmp4 = icmp ult i32 %index.1, %size
> -  br i1 %cmp4, label %while.body5, label %while.end7
> -
> -while.body5:                                      ; preds
> = %while.cond3
> -  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
> -  %3 = load i8 addrspace(3)* %arrayidx, align 1
> -  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
> -  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond3
> -
> -while.end7:                                       ; preds
> = %while.cond3
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp ne i32 %rem, 0
> +  %cmp113 = icmp ult i32 %size, 4
> +  %or.cond = or i1 %cmp, %cmp113
> +  br i1 %or.cond, label %while.cond4.preheader, label %while.body
> +
> +while.cond4.preheader:                            ; preds
> = %entry, %while.body
> +  %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ]
> +  %cmp511 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp511, label %while.body6, label %while.end8
> +
> +while.body:                                       ; preds
> = %entry, %while.body
> +  %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ]
> +  %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(3)* %src to i32
> +  %1 = add i32 %0, %index.014
> +  %2 = inttoptr i32 %1 to i8 addrspace(3)*
> +  %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)*
> +  %4 = load i32 addrspace(3)* %3, align 4
> +  %5 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %6 = add i32 %5, %index.014
> +  %7 = inttoptr i32 %6 to i8 addrspace(0)*
> +  %8 = bitcast i8 addrspace(0)* %7 to i32 addrspace(0)*
> +  store i32 %4, i32 addrspace(0)* %8, align 4
> +  %add = add i32 %add15, 4
> +  %cmp1 = icmp ugt i32 %add, %size
> +  br i1 %cmp1, label %while.cond4.preheader, label %while.body
> +
> +while.body6:                                      ; preds
> = %while.cond4.preheader, %while.body6
> +  %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph,
> +%while.cond4.preheader ]
> +  %9 = ptrtoint i8 addrspace(3)* %src to i32
> +  %10 = add i32 %9, %index.112
> +  %11 = inttoptr i32 %10 to i8 addrspace(3)*
> +  %12 = load i8 addrspace(3)* %11, align 1
> +  %13 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %14 = add i32 %13, %index.112
> +  %15 = inttoptr i32 %14 to i8 addrspace(0)*
> +  store i8 %12, i8 addrspace(0)* %15, align 1
> +  %inc = add i32 %index.112, 1
> +  %cmp5 = icmp ult i32 %inc, %size
> +  br i1 %cmp5, label %while.body6, label %while.end8
> +
> +while.end8:                                       ; preds
> = %while.body6, %while.cond4.preheader
>    ret void
>  }
> 
> -define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8
> addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)*
> +%src, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond3, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
> -  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
> -  %1 = load i32 addrspace(1)* %0, align 4
> -  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
> -  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
> -  store i32 %1, i32 addrspace(3)* %2, align 4
> -  br label %while.cond
> -
> -while.cond3:                                      ; preds
> = %while.cond, %while.body5
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> -  %cmp4 = icmp ult i32 %index.1, %size
> -  br i1 %cmp4, label %while.body5, label %while.end7
> -
> -while.body5:                                      ; preds
> = %while.cond3
> -  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
> -  %3 = load i8 addrspace(1)* %arrayidx, align 1
> -  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
> -  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond3
> -
> -while.end7:                                       ; preds
> = %while.cond3
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp ne i32 %rem, 0
> +  %cmp113 = icmp ult i32 %size, 4
> +  %or.cond = or i1 %cmp, %cmp113
> +  br i1 %or.cond, label %while.cond4.preheader, label %while.body
> +
> +while.cond4.preheader:                            ; preds
> = %entry, %while.body
> +  %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ]
> +  %cmp511 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp511, label %while.body6, label %while.end8
> +
> +while.body:                                       ; preds
> = %entry, %while.body
> +  %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ]
> +  %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(1)* %src to i32
> +  %1 = add i32 %0, %index.014
> +  %2 = inttoptr i32 %1 to i8 addrspace(1)*
> +  %3 = bitcast i8 addrspace(1)* %2 to i32 addrspace(1)*
> +  %4 = load i32 addrspace(1)* %3, align 4
> +  %5 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %6 = add i32 %5, %index.014
> +  %7 = inttoptr i32 %6 to i8 addrspace(3)*
> +  %8 = bitcast i8 addrspace(3)* %7 to i32 addrspace(3)*
> +  store i32 %4, i32 addrspace(3)* %8, align 4
> +  %add = add i32 %add15, 4
> +  %cmp1 = icmp ugt i32 %add, %size
> +  br i1 %cmp1, label %while.cond4.preheader, label %while.body
> +
> +while.body6:                                      ; preds
> = %while.cond4.preheader, %while.body6
> +  %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph,
> +%while.cond4.preheader ]
> +  %9 = ptrtoint i8 addrspace(1)* %src to i32
> +  %10 = add i32 %9, %index.112
> +  %11 = inttoptr i32 %10 to i8 addrspace(1)*
> +  %12 = load i8 addrspace(1)* %11, align 1
> +  %13 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %14 = add i32 %13, %index.112
> +  %15 = inttoptr i32 %14 to i8 addrspace(3)*
> +  store i8 %12, i8 addrspace(3)* %15, align 1
> +  %inc = add i32 %index.112, 1
> +  %cmp5 = icmp ult i32 %inc, %size
> +  br i1 %cmp5, label %while.body6, label %while.end8
> +
> +while.end8:                                       ; preds
> = %while.body6, %while.cond4.preheader
>    ret void
>  }
> 
> -define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8
> addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)*
> +%src, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond3, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
> -  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
> -  %1 = load i32 addrspace(0)* %0, align 4
> -  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
> -  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
> -  store i32 %1, i32 addrspace(3)* %2, align 4
> -  br label %while.cond
> -
> -while.cond3:                                      ; preds
> = %while.cond, %while.body5
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> -  %cmp4 = icmp ult i32 %index.1, %size
> -  br i1 %cmp4, label %while.body5, label %while.end7
> -
> -while.body5:                                      ; preds
> = %while.cond3
> -  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
> -  %3 = load i8 addrspace(0)* %arrayidx, align 1
> -  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
> -  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond3
> -
> -while.end7:                                       ; preds
> = %while.cond3
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp ne i32 %rem, 0
> +  %cmp113 = icmp ult i32 %size, 4
> +  %or.cond = or i1 %cmp, %cmp113
> +  br i1 %or.cond, label %while.cond4.preheader, label %while.body
> +
> +while.cond4.preheader:                            ; preds
> = %entry, %while.body
> +  %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ]
> +  %cmp511 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp511, label %while.body6, label %while.end8
> +
> +while.body:                                       ; preds
> = %entry, %while.body
> +  %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ]
> +  %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(0)* %src to i32
> +  %1 = add i32 %0, %index.014
> +  %2 = inttoptr i32 %1 to i8 addrspace(0)*
> +  %3 = bitcast i8 addrspace(0)* %2 to i32 addrspace(0)*
> +  %4 = load i32 addrspace(0)* %3, align 4
> +  %5 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %6 = add i32 %5, %index.014
> +  %7 = inttoptr i32 %6 to i8 addrspace(3)*
> +  %8 = bitcast i8 addrspace(3)* %7 to i32 addrspace(3)*
> +  store i32 %4, i32 addrspace(3)* %8, align 4
> +  %add = add i32 %add15, 4
> +  %cmp1 = icmp ugt i32 %add, %size
> +  br i1 %cmp1, label %while.cond4.preheader, label %while.body
> +
> +while.body6:                                      ; preds
> = %while.cond4.preheader, %while.body6
> +  %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph,
> +%while.cond4.preheader ]
> +  %9 = ptrtoint i8 addrspace(0)* %src to i32
> +  %10 = add i32 %9, %index.112
> +  %11 = inttoptr i32 %10 to i8 addrspace(0)*
> +  %12 = load i8 addrspace(0)* %11, align 1
> +  %13 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %14 = add i32 %13, %index.112
> +  %15 = inttoptr i32 %14 to i8 addrspace(3)*
> +  store i8 %12, i8 addrspace(3)* %15, align 1
> +  %inc = add i32 %index.112, 1
> +  %cmp5 = icmp ult i32 %inc, %size
> +  br i1 %cmp5, label %while.body6, label %while.end8
> +
> +while.end8:                                       ; preds
> = %while.body6, %while.cond4.preheader
>    ret void
>  }
> 
> -define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond3, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
> -  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
> -  %1 = load i32 addrspace(3)* %0, align 4
> -  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
> -  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
> -  store i32 %1, i32 addrspace(3)* %2, align 4
> -  br label %while.cond
> -
> -while.cond3:                                      ; preds
> = %while.cond, %while.body5
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> -  %cmp4 = icmp ult i32 %index.1, %size
> -  br i1 %cmp4, label %while.body5, label %while.end7
> -
> -while.body5:                                      ; preds
> = %while.cond3
> -  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
> -  %3 = load i8 addrspace(3)* %arrayidx, align 1
> -  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
> -  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond3
> -
> -while.end7:                                       ; preds
> = %while.cond3
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp ne i32 %rem, 0
> +  %cmp113 = icmp ult i32 %size, 4
> +  %or.cond = or i1 %cmp, %cmp113
> +  br i1 %or.cond, label %while.cond4.preheader, label %while.body
> +
> +while.cond4.preheader:                            ; preds
> = %entry, %while.body
> +  %index.1.ph = phi i32 [ 0, %entry ], [ %add15, %while.body ]
> +  %cmp511 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp511, label %while.body6, label %while.end8
> +
> +while.body:                                       ; preds
> = %entry, %while.body
> +  %add15 = phi i32 [ %add, %while.body ], [ 4, %entry ]
> +  %index.014 = phi i32 [ %add15, %while.body ], [ 0, %entry ]
> +  %0 = ptrtoint i8 addrspace(3)* %src to i32
> +  %1 = add i32 %0, %index.014
> +  %2 = inttoptr i32 %1 to i8 addrspace(3)*
> +  %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)*
> +  %4 = load i32 addrspace(3)* %3, align 4
> +  %5 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %6 = add i32 %5, %index.014
> +  %7 = inttoptr i32 %6 to i8 addrspace(3)*
> +  %8 = bitcast i8 addrspace(3)* %7 to i32 addrspace(3)*
> +  store i32 %4, i32 addrspace(3)* %8, align 4
> +  %add = add i32 %add15, 4
> +  %cmp1 = icmp ugt i32 %add, %size
> +  br i1 %cmp1, label %while.cond4.preheader, label %while.body
> +
> +while.body6:                                      ; preds
> = %while.cond4.preheader, %while.body6
> +  %index.112 = phi i32 [ %inc, %while.body6 ], [ %index.1.ph,
> +%while.cond4.preheader ]
> +  %9 = ptrtoint i8 addrspace(3)* %src to i32
> +  %10 = add i32 %9, %index.112
> +  %11 = inttoptr i32 %10 to i8 addrspace(3)*
> +  %12 = load i8 addrspace(3)* %11, align 1
> +  %13 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %14 = add i32 %13, %index.112
> +  %15 = inttoptr i32 %14 to i8 addrspace(3)*
> +  store i8 %12, i8 addrspace(3)* %15, align 1
> +  %inc = add i32 %index.112, 1
> +  %cmp5 = icmp ult i32 %inc, %size
> +  br i1 %cmp5, label %while.body6, label %while.end8
> +
> +while.end8:                                       ; preds
> = %while.body6, %while.cond4.preheader
>    ret void
>  }
> diff --git a/backend/src/libocl/src/ocl_memset.ll
> b/backend/src/libocl/src/ocl_memset.ll
> index addf9f5..661520d 100644
> --- a/backend/src/libocl/src/ocl_memset.ll
> +++ b/backend/src/libocl/src/ocl_memset.ll
> @@ -1,127 +1,160 @@
>  ;The memset's source code.
> -; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
> +; INLINE_OVERLOADABLE void __gen_memset(__global uchar* dst, uchar val, size_t size, uint alignment) {
>  ;   size_t index = 0;
> -;   uint v = (val << 24) | (val << 16) | (val << 8) | val;
> -;   while((index + 4) >= size) {
> -;     *((uint *)(dst + index)) = v;
> -;     index += 4;
> -;   }
> +;   uint v;
> +; 	if(alignment % 4 == 0) {
> +;     v = (val << 24) | (val << 16) | (val << 8) | val;
> +;     while((index + 4) <= size) {
> +;       *((__global uint *)(dst + index)) = v;
> +;       index += 4;
> +;     }
> +;	  }
>  ;   while(index < size) {
>  ;     dst[index] = val;
>  ;     index++;
>  ;  }
>  ; }
> 
> -define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +define void @__gen_memset_p(i8 addrspace(0)* %dst, i8 zeroext %val, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp eq i32 %rem, 0
> +  br i1 %cmp, label %if.then, label %while.cond11.preheader
> +
> +if.then:                                          ; preds = %entry
>    %conv = zext i8 %val to i32
>    %shl = shl nuw i32 %conv, 24
>    %shl2 = shl nuw nsw i32 %conv, 16
> -  %or = or i32 %shl, %shl2
>    %shl4 = shl nuw nsw i32 %conv, 8
> -  %or5 = or i32 %or, %shl4
> -  %or7 = or i32 %or5, %conv
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond10, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8* %dst, i32 %index.0
> -  %0 = bitcast i8* %add.ptr to i32*
> -  store i32 %or7, i32* %0, align 4
> -  br label %while.cond
> -
> -while.cond10:                                     ; preds
> = %while.cond, %while.body13
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
> -  %cmp11 = icmp ult i32 %index.1, %size
> -  br i1 %cmp11, label %while.body13, label %while.end14
> -
> -while.body13:                                     ; preds
> = %while.cond10
> -  %arrayidx = getelementptr inbounds i8* %dst, i32 %index.1
> -  store i8 %val, i8* %arrayidx, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond10
> -
> -while.end14:                                      ; preds
> = %while.cond10
> +  %or = or i32 %shl2, %conv
> +  %or5 = or i32 %or, %shl
> +  %or7 = or i32 %or5, %shl4
> +  %cmp814 = icmp ult i32 %size, 4
> +  br i1 %cmp814, label %while.cond11.preheader, label %while.body
> +
> +while.cond11.preheader:                           ; preds
> = %if.then, %while.body, %entry
> +  %index.1.ph = phi i32 [ 0, %entry ], [ 0, %if.then ], [ %add16,
> +%while.body ]
> +  %cmp1212 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp1212, label %while.body14, label %while.end15
> +
> +while.body:                                       ; preds
> = %if.then, %while.body
> +  %add16 = phi i32 [ %add, %while.body ], [ 4, %if.then ]
> +  %index.015 = phi i32 [ %add16, %while.body ], [ 0, %if.then ]
> +  %0 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %1 = add i32 %0, %index.015
> +  %2 = inttoptr i32 %1 to i8 addrspace(0)*
> +  %3 = bitcast i8 addrspace(0)* %2 to i32 addrspace(0)*
> +  store i32 %or7, i32 addrspace(0)* %3, align 4
> +  %add = add i32 %add16, 4
> +  %cmp8 = icmp ugt i32 %add, %size
> +  br i1 %cmp8, label %while.cond11.preheader, label %while.body
> +
> +while.body14:                                     ; preds
> = %while.cond11.preheader, %while.body14
> +  %index.113 = phi i32 [ %inc, %while.body14 ], [ %index.1.ph,
> +%while.cond11.preheader ]
> +  %4 = ptrtoint i8 addrspace(0)* %dst to i32
> +  %5 = add i32 %4, %index.113
> +  %6 = inttoptr i32 %5 to i8 addrspace(0)*
> +  store i8 %val, i8 addrspace(0)* %6, align 1
> +  %inc = add i32 %index.113, 1
> +  %cmp12 = icmp ult i32 %inc, %size
> +  br i1 %cmp12, label %while.body14, label %while.end15
> +
> +while.end15:                                      ; preds
> = %while.body14, %while.cond11.preheader
>    ret void
>  }
> 
> -define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp eq i32 %rem, 0
> +  br i1 %cmp, label %if.then, label %while.cond11.preheader
> +
> +if.then:                                          ; preds = %entry
>    %conv = zext i8 %val to i32
>    %shl = shl nuw i32 %conv, 24
>    %shl2 = shl nuw nsw i32 %conv, 16
> -  %or = or i32 %shl, %shl2
>    %shl4 = shl nuw nsw i32 %conv, 8
> -  %or5 = or i32 %or, %shl4
> -  %or7 = or i32 %or5, %conv
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond10, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
> -  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
> -  store i32 %or7, i32 addrspace(1)* %0, align 4
> -  br label %while.cond
> -
> -while.cond10:                                     ; preds
> = %while.cond, %while.body13
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
> -  %cmp11 = icmp ult i32 %index.1, %size
> -  br i1 %cmp11, label %while.body13, label %while.end14
> -
> -while.body13:                                     ; preds
> = %while.cond10
> -  %arrayidx = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
> -  store i8 %val, i8 addrspace(1)* %arrayidx, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond10
> -
> -while.end14:                                      ; preds
> = %while.cond10
> +  %or = or i32 %shl2, %conv
> +  %or5 = or i32 %or, %shl
> +  %or7 = or i32 %or5, %shl4
> +  %cmp814 = icmp ult i32 %size, 4
> +  br i1 %cmp814, label %while.cond11.preheader, label %while.body
> +
> +while.cond11.preheader:                           ; preds
> = %if.then, %while.body, %entry
> +  %index.1.ph = phi i32 [ 0, %entry ], [ 0, %if.then ], [ %add16,
> +%while.body ]
> +  %cmp1212 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp1212, label %while.body14, label %while.end15
> +
> +while.body:                                       ; preds
> = %if.then, %while.body
> +  %add16 = phi i32 [ %add, %while.body ], [ 4, %if.then ]
> +  %index.015 = phi i32 [ %add16, %while.body ], [ 0, %if.then ]
> +  %0 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %1 = add i32 %0, %index.015
> +  %2 = inttoptr i32 %1 to i8 addrspace(1)*
> +  %3 = bitcast i8 addrspace(1)* %2 to i32 addrspace(1)*
> +  store i32 %or7, i32 addrspace(1)* %3, align 4
> +  %add = add i32 %add16, 4
> +  %cmp8 = icmp ugt i32 %add, %size
> +  br i1 %cmp8, label %while.cond11.preheader, label %while.body
> +
> +while.body14:                                     ; preds
> = %while.cond11.preheader, %while.body14
> +  %index.113 = phi i32 [ %inc, %while.body14 ], [ %index.1.ph,
> +%while.cond11.preheader ]
> +  %4 = ptrtoint i8 addrspace(1)* %dst to i32
> +  %5 = add i32 %4, %index.113
> +  %6 = inttoptr i32 %5 to i8 addrspace(1)*
> +  store i8 %val, i8 addrspace(1)* %6, align 1
> +  %inc = add i32 %index.113, 1
> +  %cmp12 = icmp ult i32 %inc, %size
> +  br i1 %cmp12, label %while.body14, label %while.end15
> +
> +while.end15:                                      ; preds
> = %while.body14, %while.cond11.preheader
>    ret void
>  }
> 
> -define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size, i32 %alignment) nounwind alwaysinline {
>  entry:
> +  %rem = and i32 %alignment, 3
> +  %cmp = icmp eq i32 %rem, 0
> +  br i1 %cmp, label %if.then, label %while.cond11.preheader
> +
> +if.then:                                          ; preds = %entry
>    %conv = zext i8 %val to i32
>    %shl = shl nuw i32 %conv, 24
>    %shl2 = shl nuw nsw i32 %conv, 16
> -  %or = or i32 %shl, %shl2
>    %shl4 = shl nuw nsw i32 %conv, 8
> -  %or5 = or i32 %or, %shl4
> -  %or7 = or i32 %or5, %conv
> -  br label %while.cond
> -
> -while.cond:                                       ; preds
> = %while.body, %entry
> -  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> -  %add = add i32 %index.0, 4
> -  %cmp = icmp ult i32 %add, %size
> -  br i1 %cmp, label %while.cond10, label %while.body
> -
> -while.body:                                       ; preds
> = %while.cond
> -  %add.ptr = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
> -  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
> -  store i32 %or7, i32 addrspace(3)* %0, align 4
> -  br label %while.cond
> -
> -while.cond10:                                     ; preds
> = %while.cond, %while.body13
> -  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
> -  %cmp11 = icmp ult i32 %index.1, %size
> -  br i1 %cmp11, label %while.body13, label %while.end14
> -
> -while.body13:                                     ; preds
> = %while.cond10
> -  %arrayidx = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
> -  store i8 %val, i8 addrspace(3)* %arrayidx, align 1
> -  %inc = add i32 %index.1, 1
> -  br label %while.cond10
> -
> -while.end14:                                      ; preds
> = %while.cond10
> +  %or = or i32 %shl2, %conv
> +  %or5 = or i32 %or, %shl
> +  %or7 = or i32 %or5, %shl4
> +  %cmp814 = icmp ult i32 %size, 4
> +  br i1 %cmp814, label %while.cond11.preheader, label %while.body
> +
> +while.cond11.preheader:                           ; preds
> = %if.then, %while.body, %entry
> +  %index.1.ph = phi i32 [ 0, %entry ], [ 0, %if.then ], [ %add16,
> +%while.body ]
> +  %cmp1212 = icmp ult i32 %index.1.ph, %size
> +  br i1 %cmp1212, label %while.body14, label %while.end15
> +
> +while.body:                                       ; preds
> = %if.then, %while.body
> +  %add16 = phi i32 [ %add, %while.body ], [ 4, %if.then ]
> +  %index.015 = phi i32 [ %add16, %while.body ], [ 0, %if.then ]
> +  %0 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %1 = add i32 %0, %index.015
> +  %2 = inttoptr i32 %1 to i8 addrspace(3)*
> +  %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)*
> +  store i32 %or7, i32 addrspace(3)* %3, align 4
> +  %add = add i32 %add16, 4
> +  %cmp8 = icmp ugt i32 %add, %size
> +  br i1 %cmp8, label %while.cond11.preheader, label %while.body
> +
> +while.body14:                                     ; preds
> = %while.cond11.preheader, %while.body14
> +  %index.113 = phi i32 [ %inc, %while.body14 ], [ %index.1.ph,
> +%while.cond11.preheader ]
> +  %4 = ptrtoint i8 addrspace(3)* %dst to i32
> +  %5 = add i32 %4, %index.113
> +  %6 = inttoptr i32 %5 to i8 addrspace(3)*
> +  store i8 %val, i8 addrspace(3)* %6, align 1
> +  %inc = add i32 %index.113, 1
> +  %cmp12 = icmp ult i32 %inc, %size
> +  br i1 %cmp12, label %while.body14, label %while.end15
> +
> +while.end15:                                      ; preds
> = %while.body14, %while.cond11.preheader
>    ret void
>  }
> diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp
> b/backend/src/llvm/llvm_intrinsic_lowering.cpp
> index 7d04318..1466de0 100644
> --- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
> +++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
> @@ -126,14 +126,17 @@ namespace gbe {
>                  Type *IntPtr = TD.getIntPtrType(Context);
>                  Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
>                                                      /* isSigned */ false);
> -                Value *Ops[3];
> +                Value *alignment = Builder.CreateIntCast(CI->getArgOperand(3), IntPtr,
> +                                                        /* isSigned */ false);
> +                Value *Ops[4];
>                  Ops[0] = CI->getArgOperand(0);
>                  Ops[1] = CI->getArgOperand(1);
>                  Ops[2] = Size;
> +                Ops[3] = alignment;
>                  char name[16] = "__gen_memcpy_xx";
>                  name[13] = convertSpaceToName(Ops[0]);
>                  name[14] = convertSpaceToName(Ops[1]);
> -                replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
> +                replaceCallWith(name, CI, Ops, Ops+4, Type::getVoidTy(Context));
>                  break;
>                }
>                case Intrinsic::memset: {
> @@ -143,14 +146,17 @@ namespace gbe {
>                  Type *IntPtr = TD.getIntPtrType(Op0->getType());
>                  Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
>                                                      /* isSigned */ false);
> -                Value *Ops[3];
> +                Value *alignment = Builder.CreateIntCast(CI->getArgOperand(3), IntPtr,
> +                                                         /* isSigned */ false);
> +                Value *Ops[4];
>                  Ops[0] = Op0;
>                  // Extend the amount to i32.
>                  Ops[1] = val;
>                  Ops[2] = Size;
> +                Ops[3] = alignment;
>                  char name[16] = "__gen_memset_x";
>                  name[13] = convertSpaceToName(Ops[0]);
> -                replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
> +                replaceCallWith(name, CI, Ops, Ops+4, Type::getVoidTy(Context));
>                  break;
>                }
>                default:
> --
> 1.8.3.2
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list