[Beignet] [PATCH V2] Add llvm intrinsic function llvm.memset and llvm.memcpy support.
Zhigang Gong
zhigang.gong at linux.intel.com
Wed Jan 15 22:57:53 PST 2014
LGTM, pushed with the other two patches, thanks.
On Thu, Jan 16, 2014 at 03:38:30PM +0800, Yang Rong wrote:
> SPIR 1.2 requires llvm.memcpy support, and LLVM sometimes emits llvm.memset as well.
> So add a pass to lower these two intrinsic functions, and then inline the replacements.
>
> The intrinsic lowering pass finds all llvm.memset and llvm.memcpy calls and replaces
> them with calls to __gen_memset_x and __gen_memcpy_xx, where x and xx encode the address space(s).
>
> This pass runs after clang, and clang seems to strip unused functions, so the
> __gen_memset_x and __gen_memcpy_xx functions are implemented in the pre-compiled module and
> then linked in.
>
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
> backend/src/CMakeLists.txt | 3 +-
> backend/src/llvm/llvm_gen_backend.hpp | 4 +
> backend/src/llvm/llvm_intrinsic_lowering.cpp | 172 ++++++++++++++
> backend/src/llvm/llvm_passes.cpp | 2 +-
> backend/src/llvm/llvm_to_gen.cpp | 2 +
> backend/src/ocl_memcpy.ll | 336 +++++++++++++++++++++++++++
> backend/src/ocl_memset.ll | 127 ++++++++++
> 7 files changed, 644 insertions(+), 2 deletions(-)
> create mode 100644 backend/src/llvm/llvm_intrinsic_lowering.cpp
> create mode 100644 backend/src/ocl_memcpy.ll
> create mode 100644 backend/src/ocl_memset.ll
>
> diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
> index b93133f..10bf67b 100644
> --- a/backend/src/CMakeLists.txt
> +++ b/backend/src/CMakeLists.txt
> @@ -136,6 +136,7 @@ else (GBE_USE_BLOB)
> llvm/llvm_gen_backend.cpp
> llvm/llvm_passes.cpp
> llvm/llvm_scalarize.cpp
> + llvm/llvm_intrinsic_lowering.cpp
> llvm/llvm_to_gen.cpp
> llvm/llvm_gen_backend.hpp
> llvm/llvm_gen_ocl_function.hxx
> @@ -165,7 +166,7 @@ add_library (gbe STATIC ${GBE_SRC})
>
> # for pre compiled module library.
> set (pcm_lib "beignet.bc")
> -set (pcm_sources ocl_barrier.ll)
> +set (pcm_sources ocl_barrier.ll ocl_memset.ll ocl_memcpy.ll)
> ll_add_library (${pcm_lib} pcm_sources)
>
> ADD_DEPENDENCIES (gbe pch_object ${pcm_lib})
> diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
> index 55079f5..389d5f3 100644
> --- a/backend/src/llvm/llvm_gen_backend.hpp
> +++ b/backend/src/llvm/llvm_gen_backend.hpp
> @@ -84,8 +84,12 @@ namespace gbe
> /*! Remove the GEP instructions */
> llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
>
> + /*! Scalarize all vector op instructions */
> llvm::FunctionPass* createScalarizePass();
>
> + /*! Convert the Intrinsic call to gen function */
> + llvm::BasicBlockPass *createIntrinsicLoweringPass();
> +
> } /* namespace gbe */
>
> #endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
> diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
> new file mode 100644
> index 0000000..1942860
> --- /dev/null
> +++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
> @@ -0,0 +1,172 @@
> +/*
> + * Copyright © 2012 Intel Corporation
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +/**
> + * \file llvm_intrinsic_lowering.cpp
> + * \author Yang Rong <rong.r.yang at intel.com>
> + */
> +
> +#include "llvm/Config/config.h"
> +#if LLVM_VERSION_MINOR <= 2
> +#include "llvm/Function.h"
> +#include "llvm/InstrTypes.h"
> +#include "llvm/Instructions.h"
> +#include "llvm/IntrinsicInst.h"
> +#include "llvm/Module.h"
> +#else
> +#include "llvm/IR/Function.h"
> +#include "llvm/IR/InstrTypes.h"
> +#include "llvm/IR/Instructions.h"
> +#include "llvm/IR/IntrinsicInst.h"
> +#include "llvm/IR/Module.h"
> +#endif /* LLVM_VERSION_MINOR <= 2 */
> +#include "llvm/Pass.h"
> +#if LLVM_VERSION_MINOR <= 1
> +#include "llvm/Support/IRBuilder.h"
> +#elif LLVM_VERSION_MINOR == 2
> +#include "llvm/IRBuilder.h"
> +#else
> +#include "llvm/IR/IRBuilder.h"
> +#endif /* LLVM_VERSION_MINOR <= 1 */
> +#include "llvm/Support/CallSite.h"
> +#include "llvm/Support/CFG.h"
> +#include "llvm/Support/raw_ostream.h"
> +
> +#include "llvm/llvm_gen_backend.hpp"
> +#include "sys/map.hpp"
> +
> +
> +using namespace llvm;
> +
> +namespace gbe {
> + /*! Rewrites llvm.memcpy / llvm.memset calls into __gen_memcpy_xx / __gen_memset_x calls */
> + class InstrinsicLowering : public BasicBlockPass
> + {
> + public:
> + static char ID;
> + InstrinsicLowering() : BasicBlockPass(ID) {}
> +
> + void getAnalysisUsage(AnalysisUsage &AU) const {} // nothing to declare
> +
> + virtual const char *getPassName() const {
> + return "SPIR backend: lowering instrinsics";
> + }
> +
> + /* Map the address space of pointer val to the suffix letter used in the
> + * helper names: 0 -> 'p', 1 -> 'g', 3 -> 'l'. Any other address space
> + * trips the assert; with asserts compiled out '\0' is returned. */
> + static char convertSpaceToName(Value *val) {
> + const uint32_t space = val->getType()->getPointerAddressSpace();
> + switch(space) {
> + case 0: return 'p';
> + case 1: return 'g';
> + case 3: return 'l';
> + default:
> + // A bare string literal is always true; "0 &&" makes the assert fire.
> + assert(0 && "Non support address space");
> + return '\0';
> + }
> + }
> +
> + /* Insert a call to NewFn(ArgBegin .. ArgEnd) returning RetTy right before CI
> + * (getOrInsertFunction declares NewFn if the module lacks it), give the new
> + * call CI's name and uses, erase CI, and return the new call. */
> + static CallInst *replaceCallWith(const char *NewFn, CallInst *CI,
> + Value **ArgBegin, Value **ArgEnd,
> + Type *RetTy)
> + {
> + Module *M = CI->getParent()->getParent()->getParent();
> + // Build the callee signature from the actual argument types.
> + std::vector<Type *> ParamTys;
> + for (Value** I = ArgBegin; I != ArgEnd; ++I)
> + ParamTys.push_back((*I)->getType());
> + Constant* FCache = M->getOrInsertFunction(NewFn,
> + FunctionType::get(RetTy, ParamTys, false));
> +
> + IRBuilder<> Builder(CI->getParent(), CI);
> + SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
> + CallInst *NewCI = Builder.CreateCall(FCache, Args);
> + NewCI->setName(CI->getName());
> + if (!CI->use_empty())
> + CI->replaceAllUsesWith(NewCI);
> + CI->eraseFromParent();
> + return NewCI;
> + }
> +
> + /* Lower every llvm.memcpy / llvm.memset call in BB. Only operands 0..2
> + * (dst, src or fill value, size) are forwarded to the helper; the other
> + * intrinsic operands are dropped. Returns true if any call was replaced. */
> + virtual bool runOnBasicBlock(BasicBlock &BB)
> + {
> + bool changedBlock = false;
> + Module *M = BB.getParent()->getParent();
> + DataLayout TD(M);
> + LLVMContext &Context = BB.getContext();
> + for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
> + // Advance first: the current instruction may be erased below.
> + Instruction *Inst = DI++;
> + CallInst* CI = dyn_cast<CallInst>(Inst);
> + if(CI == NULL)
> + continue;
> + Function *F = CI->getCalledFunction();
> + if (F == NULL)
> + continue;
> +
> + IRBuilder<> Builder(&BB, CI);
> + // only support memcpy and memset
> + switch ((Intrinsic::ID) F->getIntrinsicID()) {
> + case Intrinsic::memcpy: {
> + Type *IntPtr = TD.getIntPtrType(Context);
> + Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
> + /* isSigned */ false);
> + Value *Ops[3] = { CI->getArgOperand(0), CI->getArgOperand(1), Size };
> + char name[16] = "__gen_memcpy_xx";
> + name[13] = convertSpaceToName(Ops[0]);
> + name[14] = convertSpaceToName(Ops[1]);
> + replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
> + changedBlock = true;
> + break;
> + }
> + case Intrinsic::memset: {
> + Value *Op0 = CI->getArgOperand(0);
> + // The fill value is handed to the helper as an i8.
> + Value *val = Builder.CreateIntCast(CI->getArgOperand(1), IntegerType::getInt8Ty(Context),
> + /* isSigned */ false);
> + Type *IntPtr = TD.getIntPtrType(Op0->getType());
> + Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
> + /* isSigned */ false);
> + Value *Ops[3] = { Op0, val, Size };
> + char name[16] = "__gen_memset_x";
> + name[13] = convertSpaceToName(Ops[0]);
> + replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
> + changedBlock = true;
> + break;
> + }
> + default:
> + break;
> + }
> + }
> + return changedBlock;
> + }
> + };
> +
> + char InstrinsicLowering::ID = 0;
> + /*! Build a new pass object that rewrites memcpy/memset intrinsic calls into gen helper calls */
> + BasicBlockPass *createIntrinsicLoweringPass() {
> + return new InstrinsicLowering;
> + }
> +} // end namespace
> diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
> index 3bb6f71..1091dae 100644
> --- a/backend/src/llvm/llvm_passes.cpp
> +++ b/backend/src/llvm/llvm_passes.cpp
> @@ -232,7 +232,7 @@ namespace gbe
> }
>
> virtual const char *getPassName() const {
> - return "PTX backend: insert special ptx instructions";
> + return "SPIR backend: insert special spir instructions";
> }
>
> bool simplifyGEPInstructions(GetElementPtrInst* GEPInst);
> diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
> index a9f70d9..b227912 100644
> --- a/backend/src/llvm/llvm_to_gen.cpp
> +++ b/backend/src/llvm/llvm_to_gen.cpp
> @@ -175,6 +175,8 @@ namespace gbe
> // Print the code before further optimizations
> if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
> passes.add(createPrintModulePass(&*o));
> + passes.add(createIntrinsicLoweringPass());
> + passes.add(createFunctionInliningPass(200000));
> passes.add(createScalarReplAggregatesPass()); // Break up allocas
> passes.add(createRemoveGEPPass(unit));
> passes.add(createConstantPropagationPass());
> diff --git a/backend/src/ocl_memcpy.ll b/backend/src/ocl_memcpy.ll
> new file mode 100644
> index 0000000..476033e
> --- /dev/null
> +++ b/backend/src/ocl_memcpy.ll
> @@ -0,0 +1,336 @@
> +;The memcpy's source code.
> +; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
> +; size_t index = 0;
> +; while((index + 4) <= size) {
> +; *((uint *)(dst + index)) = *((uint *)(src + index));
> +; index += 4;
> +; }
> +; while(index < size) {
> +; dst[index] = src[index];
> +; index++;
> +; }
> +; }
> +; Copies %size bytes from %src (addrspace 1) to %dst (addrspace 1): i32 words first, then single bytes.
> +define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond3, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
> + %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
> + %1 = load i32 addrspace(1)* %0, align 4
> + %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
> + %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
> + store i32 %1, i32 addrspace(1)* %2, align 4
> + br label %while.cond
> +
> +while.cond3: ; preds = %while.cond, %while.body5
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> + %cmp4 = icmp ult i32 %index.1, %size
> + br i1 %cmp4, label %while.body5, label %while.end7
> +
> +while.body5: ; preds = %while.cond3
> + %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
> + %3 = load i8 addrspace(1)* %arrayidx, align 1
> + %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
> + store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond3
> +
> +while.end7: ; preds = %while.cond3
> + ret void
> +}
> +; Copies %size bytes from %src (addrspace 0) to %dst (addrspace 1): i32 words first, then single bytes.
> +define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond3, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
> + %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
> + %1 = load i32 addrspace(0)* %0, align 4
> + %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
> + %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
> + store i32 %1, i32 addrspace(1)* %2, align 4
> + br label %while.cond
> +
> +while.cond3: ; preds = %while.cond, %while.body5
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> + %cmp4 = icmp ult i32 %index.1, %size
> + br i1 %cmp4, label %while.body5, label %while.end7
> +
> +while.body5: ; preds = %while.cond3
> + %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
> + %3 = load i8 addrspace(0)* %arrayidx, align 1
> + %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
> + store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond3
> +
> +while.end7: ; preds = %while.cond3
> + ret void
> +}
> +; Copies %size bytes from %src (addrspace 3) to %dst (addrspace 1): i32 words first, then single bytes.
> +define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond3, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
> + %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
> + %1 = load i32 addrspace(3)* %0, align 4
> + %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
> + %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
> + store i32 %1, i32 addrspace(1)* %2, align 4
> + br label %while.cond
> +
> +while.cond3: ; preds = %while.cond, %while.body5
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> + %cmp4 = icmp ult i32 %index.1, %size
> + br i1 %cmp4, label %while.body5, label %while.end7
> +
> +while.body5: ; preds = %while.cond3
> + %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
> + %3 = load i8 addrspace(3)* %arrayidx, align 1
> + %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
> + store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond3
> +
> +while.end7: ; preds = %while.cond3
> + ret void
> +}
> +; Copies %size bytes from %src (addrspace 1) to %dst (addrspace 0): i32 words first, then single bytes.
> +define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond3, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
> + %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
> + %1 = load i32 addrspace(1)* %0, align 4
> + %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
> + %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
> + store i32 %1, i32 addrspace(0)* %2, align 4
> + br label %while.cond
> +
> +while.cond3: ; preds = %while.cond, %while.body5
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> + %cmp4 = icmp ult i32 %index.1, %size
> + br i1 %cmp4, label %while.body5, label %while.end7
> +
> +while.body5: ; preds = %while.cond3
> + %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
> + %3 = load i8 addrspace(1)* %arrayidx, align 1
> + %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
> + store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond3
> +
> +while.end7: ; preds = %while.cond3
> + ret void
> +}
> +; Copies %size bytes from %src (addrspace 0) to %dst (addrspace 0): i32 words first, then single bytes.
> +define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond3, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
> + %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
> + %1 = load i32 addrspace(0)* %0, align 4
> + %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
> + %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
> + store i32 %1, i32 addrspace(0)* %2, align 4
> + br label %while.cond
> +
> +while.cond3: ; preds = %while.cond, %while.body5
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> + %cmp4 = icmp ult i32 %index.1, %size
> + br i1 %cmp4, label %while.body5, label %while.end7
> +
> +while.body5: ; preds = %while.cond3
> + %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
> + %3 = load i8 addrspace(0)* %arrayidx, align 1
> + %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
> + store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond3
> +
> +while.end7: ; preds = %while.cond3
> + ret void
> +}
> +; Copies %size bytes from %src (addrspace 3) to %dst (addrspace 0): i32 words first, then single bytes.
> +define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond3, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
> + %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
> + %1 = load i32 addrspace(3)* %0, align 4
> + %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
> + %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
> + store i32 %1, i32 addrspace(0)* %2, align 4
> + br label %while.cond
> +
> +while.cond3: ; preds = %while.cond, %while.body5
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> + %cmp4 = icmp ult i32 %index.1, %size
> + br i1 %cmp4, label %while.body5, label %while.end7
> +
> +while.body5: ; preds = %while.cond3
> + %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
> + %3 = load i8 addrspace(3)* %arrayidx, align 1
> + %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
> + store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond3
> +
> +while.end7: ; preds = %while.cond3
> + ret void
> +}
> +; Copies %size bytes from %src (addrspace 1) to %dst (addrspace 3): i32 words first, then single bytes.
> +define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond3, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
> + %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
> + %1 = load i32 addrspace(1)* %0, align 4
> + %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
> + %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
> + store i32 %1, i32 addrspace(3)* %2, align 4
> + br label %while.cond
> +
> +while.cond3: ; preds = %while.cond, %while.body5
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> + %cmp4 = icmp ult i32 %index.1, %size
> + br i1 %cmp4, label %while.body5, label %while.end7
> +
> +while.body5: ; preds = %while.cond3
> + %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
> + %3 = load i8 addrspace(1)* %arrayidx, align 1
> + %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
> + store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond3
> +
> +while.end7: ; preds = %while.cond3
> + ret void
> +}
> +; Copies %size bytes from %src (addrspace 0) to %dst (addrspace 3): i32 words first, then single bytes.
> +define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond3, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
> + %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
> + %1 = load i32 addrspace(0)* %0, align 4
> + %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
> + %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
> + store i32 %1, i32 addrspace(3)* %2, align 4
> + br label %while.cond
> +
> +while.cond3: ; preds = %while.cond, %while.body5
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> + %cmp4 = icmp ult i32 %index.1, %size
> + br i1 %cmp4, label %while.body5, label %while.end7
> +
> +while.body5: ; preds = %while.cond3
> + %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
> + %3 = load i8 addrspace(0)* %arrayidx, align 1
> + %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
> + store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond3
> +
> +while.end7: ; preds = %while.cond3
> + ret void
> +}
> +; Copies %size bytes from %src (addrspace 3) to %dst (addrspace 3): i32 words first, then single bytes.
> +define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond3, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
> + %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
> + %1 = load i32 addrspace(3)* %0, align 4
> + %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
> + %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
> + store i32 %1, i32 addrspace(3)* %2, align 4
> + br label %while.cond
> +
> +while.cond3: ; preds = %while.cond, %while.body5
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> + %cmp4 = icmp ult i32 %index.1, %size
> + br i1 %cmp4, label %while.body5, label %while.end7
> +
> +while.body5: ; preds = %while.cond3
> + %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
> + %3 = load i8 addrspace(3)* %arrayidx, align 1
> + %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
> + store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond3
> +
> +while.end7: ; preds = %while.cond3
> + ret void
> +}
> diff --git a/backend/src/ocl_memset.ll b/backend/src/ocl_memset.ll
> new file mode 100644
> index 0000000..addf9f5
> --- /dev/null
> +++ b/backend/src/ocl_memset.ll
> @@ -0,0 +1,127 @@
> +;The memset's source code.
> +; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
> +; size_t index = 0;
> +; uint v = (val << 24) | (val << 16) | (val << 8) | val;
> +; while((index + 4) <= size) {
> +; *((uint *)(dst + index)) = v;
> +; index += 4;
> +; }
> +; while(index < size) {
> +; dst[index] = val;
> +; index++;
> +; }
> +; }
> +; Fills %size bytes at %dst (default address space) with %val: i32 words first, then single bytes.
> +define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +entry:
> + %conv = zext i8 %val to i32
> + %shl = shl nuw i32 %conv, 24
> + %shl2 = shl nuw nsw i32 %conv, 16
> + %or = or i32 %shl, %shl2
> + %shl4 = shl nuw nsw i32 %conv, 8
> + %or5 = or i32 %or, %shl4
> + %or7 = or i32 %or5, %conv
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond10, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8* %dst, i32 %index.0
> + %0 = bitcast i8* %add.ptr to i32*
> + store i32 %or7, i32* %0, align 4
> + br label %while.cond
> +
> +while.cond10: ; preds = %while.cond, %while.body13
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
> + %cmp11 = icmp ult i32 %index.1, %size
> + br i1 %cmp11, label %while.body13, label %while.end14
> +
> +while.body13: ; preds = %while.cond10
> + %arrayidx = getelementptr inbounds i8* %dst, i32 %index.1
> + store i8 %val, i8* %arrayidx, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond10
> +
> +while.end14: ; preds = %while.cond10
> + ret void
> +}
> +; Fills %size bytes at %dst (addrspace 1) with %val: i32 words first, then single bytes.
> +define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +entry:
> + %conv = zext i8 %val to i32
> + %shl = shl nuw i32 %conv, 24
> + %shl2 = shl nuw nsw i32 %conv, 16
> + %or = or i32 %shl, %shl2
> + %shl4 = shl nuw nsw i32 %conv, 8
> + %or5 = or i32 %or, %shl4
> + %or7 = or i32 %or5, %conv
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond10, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
> + %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
> + store i32 %or7, i32 addrspace(1)* %0, align 4
> + br label %while.cond
> +
> +while.cond10: ; preds = %while.cond, %while.body13
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
> + %cmp11 = icmp ult i32 %index.1, %size
> + br i1 %cmp11, label %while.body13, label %while.end14
> +
> +while.body13: ; preds = %while.cond10
> + %arrayidx = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
> + store i8 %val, i8 addrspace(1)* %arrayidx, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond10
> +
> +while.end14: ; preds = %while.cond10
> + ret void
> +}
> +; Fills %size bytes at %dst (addrspace 3) with %val: i32 words first, then single bytes.
> +define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
> +entry:
> + %conv = zext i8 %val to i32
> + %shl = shl nuw i32 %conv, 24
> + %shl2 = shl nuw nsw i32 %conv, 16
> + %or = or i32 %shl, %shl2
> + %shl4 = shl nuw nsw i32 %conv, 8
> + %or5 = or i32 %or, %shl4
> + %or7 = or i32 %or5, %conv
> + br label %while.cond
> +
> +while.cond: ; preds = %while.body, %entry
> + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> + %add = add i32 %index.0, 4
> + %cmp = icmp ugt i32 %add, %size ; index + 4 > size: fewer than 4 bytes left, finish bytewise
> + br i1 %cmp, label %while.cond10, label %while.body
> +
> +while.body: ; preds = %while.cond
> + %add.ptr = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
> + %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
> + store i32 %or7, i32 addrspace(3)* %0, align 4
> + br label %while.cond
> +
> +while.cond10: ; preds = %while.cond, %while.body13
> + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
> + %cmp11 = icmp ult i32 %index.1, %size
> + br i1 %cmp11, label %while.body13, label %while.end14
> +
> +while.body13: ; preds = %while.cond10
> + %arrayidx = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
> + store i8 %val, i8 addrspace(3)* %arrayidx, align 1
> + %inc = add i32 %index.1, 1
> + br label %while.cond10
> +
> +while.end14: ; preds = %while.cond10
> + ret void
> +}
> --
> 1.8.3.2
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list