[Beignet] [PATCH 2/3] Add llvm intrinsic function llvm.memset and llvm.memcpy support.
Yang Rong
rong.r.yang at intel.com
Wed Jan 15 00:31:05 PST 2014
SPIR 1.2 requires llvm.memcpy support, and LLVM sometimes emits llvm.memset on its own.
So add a pass that lowers these two intrinsic functions, and then inline the lowered calls.
The intrinsic lowering pass finds every llvm.memset and llvm.memcpy call and replaces it
with a call to __gen_memset_x or __gen_memcpy_xx, where x and xx encode the address space(s) of the pointer operand(s).
This pass runs after clang, and clang appears to strip unused functions, so the
__gen_memset_x and __gen_memcpy_xx functions are implemented in the pre-compiled module and
linked in from there.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
backend/src/CMakeLists.txt | 3 +-
backend/src/llvm/llvm_gen_backend.hpp | 4 +
backend/src/llvm/llvm_intrinsic_lowering.cpp | 171 ++++++++++++++
backend/src/llvm/llvm_to_gen.cpp | 2 +
backend/src/ocl_memcpy.ll | 323 +++++++++++++++++++++++++++
backend/src/ocl_memset.ll | 113 ++++++++++
6 files changed, 615 insertions(+), 1 deletion(-)
create mode 100644 backend/src/llvm/llvm_intrinsic_lowering.cpp
create mode 100644 backend/src/ocl_memcpy.ll
create mode 100644 backend/src/ocl_memset.ll
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index b93133f..10bf67b 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -136,6 +136,7 @@ else (GBE_USE_BLOB)
llvm/llvm_gen_backend.cpp
llvm/llvm_passes.cpp
llvm/llvm_scalarize.cpp
+ llvm/llvm_intrinsic_lowering.cpp
llvm/llvm_to_gen.cpp
llvm/llvm_gen_backend.hpp
llvm/llvm_gen_ocl_function.hxx
@@ -165,7 +166,7 @@ add_library (gbe STATIC ${GBE_SRC})
# for pre compiled module library.
set (pcm_lib "beignet.bc")
-set (pcm_sources ocl_barrier.ll)
+set (pcm_sources ocl_barrier.ll ocl_memset.ll ocl_memcpy.ll)
ll_add_library (${pcm_lib} pcm_sources)
ADD_DEPENDENCIES (gbe pch_object ${pcm_lib})
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 55079f5..389d5f3 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -84,8 +84,12 @@ namespace gbe
/*! Remove the GEP instructions */
llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
+ /*! Scalarize all vector op instructions */
llvm::FunctionPass* createScalarizePass();
+ /*! Convert the Intrinsic call to gen function */
+ llvm::BasicBlockPass *createIntrinsicLoweringPass();
+
} /* namespace gbe */
#endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
new file mode 100644
index 0000000..b6f874b
--- /dev/null
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_intrinsic_lowering.cpp
+ * \author Yang Rong <rong.r.yang at intel.com>
+ */
+
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+ /*!
+  * Basic-block pass that replaces calls to the llvm.memcpy and llvm.memset
+  * intrinsics with calls to the __gen_memcpy_xx / __gen_memset_x helper
+  * functions. Each suffix letter encodes the address space of one pointer
+  * operand (see convertSpaceToName).
+  */
+ class InstrinsicLowering : public BasicBlockPass
+ {
+ public:
+   static char ID;
+   InstrinsicLowering() :
+     BasicBlockPass(ID) {}
+
+   /*! No analyses are requested and none are declared preserved. */
+   void getAnalysisUsage(AnalysisUsage &AU) const {
+
+   }
+
+   virtual const char *getPassName() const {
+     return "PTX backend: lowering instrinsics";
+   }
+
+   /*!
+    * Return the helper-name suffix letter for the address space of pointer
+    * \p val: 0 -> 'p', 1 -> 'g', 3 -> 'l' (presumably private, global and
+    * local). Any other address space yields '\0'.
+    * NOTE(review): a '\0' written into the helper name truncates it, so the
+    * call then targets a name that none of the helpers in this patch define.
+    */
+   static char convertSpaceToName(Value *val) {
+     const uint32_t space = val->getType()->getPointerAddressSpace();
+     switch(space) {
+       case 0:
+         return 'p';
+       case 1:
+         return 'g';
+       case 3:
+         return 'l';
+       default:
+         return '\0';
+     }
+   }
+
+   /*!
+    * Replace call instruction \p CI with a call to the function named
+    * \p NewFn, passing the arguments in [\p ArgBegin, \p ArgEnd).
+    *
+    * The callee is looked up in (or declared in) the module that contains
+    * \p CI, with return type \p RetTy and parameter types taken from the
+    * arguments. The new call is inserted before \p CI, inherits its name and
+    * its uses, and \p CI is erased.
+    *
+    * \return the newly created call instruction.
+    */
+   static CallInst *replaceCallWith(const char *NewFn, CallInst *CI,
+                                    Value **ArgBegin, Value **ArgEnd,
+                                    Type *RetTy)
+   {
+     // Module that owns the call: block -> function -> module.
+     Module *M = CI->getParent()->getParent()->getParent();
+     // Get or insert the declaration now.
+     std::vector<Type *> ParamTys;
+     for (Value** I = ArgBegin; I != ArgEnd; ++I)
+       ParamTys.push_back((*I)->getType());
+     Constant* FCache = M->getOrInsertFunction(NewFn,
+                                     FunctionType::get(RetTy, ParamTys, false));
+
+     IRBuilder<> Builder(CI->getParent(), CI);
+     SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
+     CallInst *NewCI = Builder.CreateCall(FCache, Args);
+     NewCI->setName(CI->getName());
+     if (!CI->use_empty())
+       CI->replaceAllUsesWith(NewCI);
+     CI->eraseFromParent();
+     return NewCI;
+   }
+
+   /*!
+    * Lower every llvm.memcpy / llvm.memset call in \p BB.
+    *
+    * \return true when at least one call was rewritten, so that the pass
+    *         manager knows the block was modified; false otherwise.
+    */
+   virtual bool runOnBasicBlock(BasicBlock &BB)
+   {
+     bool changedBlock = false;
+     Module *M = BB.getParent()->getParent();
+
+     DataLayout TD(M);
+     LLVMContext &Context = BB.getContext();
+     for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
+       // Advance first: replaceCallWith erases the instruction being visited.
+       Instruction *Inst = DI++;
+       CallInst* CI = dyn_cast<CallInst>(Inst);
+       if(CI == NULL)
+         continue;
+
+       // Calls without a statically known callee cannot be intrinsics.
+       Function *F = CI->getCalledFunction();
+       if (F == NULL)
+         continue;
+
+       IRBuilder<> Builder(&BB, CI);
+       // only support memcpy and memset
+       const Intrinsic::ID intrinsicID = (Intrinsic::ID) F->getIntrinsicID();
+       switch (intrinsicID) {
+         case Intrinsic::memcpy: {
+           Type *IntPtr = TD.getIntPtrType(Context);
+           // Bring the byte count to pointer width.
+           Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+                                               /* isSigned */ false);
+           Value *Ops[3];
+           Ops[0] = CI->getArgOperand(0);
+           Ops[1] = CI->getArgOperand(1);
+           Ops[2] = Size;
+           // name[13] / name[14] are the two 'x' placeholders: dst, then src.
+           char name[16] = "__gen_memcpy_xx";
+           name[13] = convertSpaceToName(Ops[0]);
+           name[14] = convertSpaceToName(Ops[1]);
+           replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
+           changedBlock = true;
+           break;
+         }
+         case Intrinsic::memset: {
+           Value *Op0 = CI->getArgOperand(0);
+           // The helper takes the fill value as an i8.
+           Value *val = Builder.CreateIntCast(CI->getArgOperand(1), IntegerType::getInt8Ty(Context),
+                                               /* isSigned */ false);
+           Type *IntPtr = TD.getIntPtrType(Op0->getType());
+           // Bring the byte count to the width of the destination pointer.
+           Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+                                               /* isSigned */ false);
+           Value *Ops[3];
+           Ops[0] = Op0;
+           Ops[1] = val;
+           Ops[2] = Size;
+           // name[13] is the 'x' placeholder for the destination space.
+           char name[16] = "__gen_memset_x";
+           name[13] = convertSpaceToName(Ops[0]);
+           replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
+           changedBlock = true;
+           break;
+         }
+         default:
+           // Not an intrinsic, or one this pass does not lower.
+           continue;
+       }
+     }
+     return changedBlock;
+   }
+ };
+
+ char InstrinsicLowering::ID = 0; // Storage for the static ID member that the constructor passes to BasicBlockPass(ID).
+
+ BasicBlockPass *createIntrinsicLoweringPass() { // Factory declared in llvm_gen_backend.hpp.
+ return new InstrinsicLowering(); // Heap-allocated; this patch hands the result straight to passes.add() in llvm_to_gen.cpp.
+ }
+} // end namespace
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index a9f70d9..b227912 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -175,6 +175,8 @@ namespace gbe
// Print the code before further optimizations
if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
passes.add(createPrintModulePass(&*o));
+ passes.add(createIntrinsicLoweringPass());
+ passes.add(createFunctionInliningPass(200000));
passes.add(createScalarReplAggregatesPass()); // Break up allocas
passes.add(createRemoveGEPPass(unit));
passes.add(createConstantPropagationPass());
diff --git a/backend/src/ocl_memcpy.ll b/backend/src/ocl_memcpy.ll
new file mode 100644
index 0000000..c0783bf
--- /dev/null
+++ b/backend/src/ocl_memcpy.ll
@@ -0,0 +1,323 @@
+; Copy %size bytes from %src (addrspace 1) to %dst (addrspace 1): whole
+; 32-bit words while at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word accesses are marked align 4 -- confirm callers pass
+; 4-byte-aligned pointers.
+define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %1 = load i32 addrspace(1)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ store i32 %1, i32 addrspace(1)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+ %3 = load i8 addrspace(1)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+; Copy %size bytes from %src (addrspace 0) to %dst (addrspace 1): whole
+; 32-bit words while at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word accesses are marked align 4 -- confirm callers pass
+; 4-byte-aligned pointers.
+define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+ %1 = load i32 addrspace(0)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ store i32 %1, i32 addrspace(1)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+ %3 = load i8 addrspace(0)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+; Copy %size bytes from %src (addrspace 3) to %dst (addrspace 1): whole
+; 32-bit words while at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word accesses are marked align 4 -- confirm callers pass
+; 4-byte-aligned pointers.
+define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+ %1 = load i32 addrspace(3)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ store i32 %1, i32 addrspace(1)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+ %3 = load i8 addrspace(3)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+; Copy %size bytes from %src (addrspace 1) to %dst (addrspace 0): whole
+; 32-bit words while at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word accesses are marked align 4 -- confirm callers pass
+; 4-byte-aligned pointers.
+define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %1 = load i32 addrspace(1)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+ store i32 %1, i32 addrspace(0)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+ %3 = load i8 addrspace(1)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+; Copy %size bytes from %src (addrspace 0) to %dst (addrspace 0): whole
+; 32-bit words while at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word accesses are marked align 4 -- confirm callers pass
+; 4-byte-aligned pointers.
+define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+ %1 = load i32 addrspace(0)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+ store i32 %1, i32 addrspace(0)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+ %3 = load i8 addrspace(0)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+; Copy %size bytes from %src (addrspace 3) to %dst (addrspace 0): whole
+; 32-bit words while at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word accesses are marked align 4 -- confirm callers pass
+; 4-byte-aligned pointers.
+define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+ %1 = load i32 addrspace(3)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+ store i32 %1, i32 addrspace(0)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+ %3 = load i8 addrspace(3)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+; Copy %size bytes from %src (addrspace 1) to %dst (addrspace 3): whole
+; 32-bit words while at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word accesses are marked align 4 -- confirm callers pass
+; 4-byte-aligned pointers.
+define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %1 = load i32 addrspace(1)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+ store i32 %1, i32 addrspace(3)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+ %3 = load i8 addrspace(1)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+; Copy %size bytes from %src (addrspace 0) to %dst (addrspace 3): whole
+; 32-bit words while at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word accesses are marked align 4 -- confirm callers pass
+; 4-byte-aligned pointers.
+define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+ %1 = load i32 addrspace(0)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+ store i32 %1, i32 addrspace(3)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+ %3 = load i8 addrspace(0)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+; Copy %size bytes from %src (addrspace 3) to %dst (addrspace 3): whole
+; 32-bit words while at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word accesses are marked align 4 -- confirm callers pass
+; 4-byte-aligned pointers.
+define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+ %1 = load i32 addrspace(3)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+ store i32 %1, i32 addrspace(3)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+ %3 = load i8 addrspace(3)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
diff --git a/backend/src/ocl_memset.ll b/backend/src/ocl_memset.ll
new file mode 100644
index 0000000..fa8203b
--- /dev/null
+++ b/backend/src/ocl_memset.ll
@@ -0,0 +1,113 @@
+; Fill %size bytes at %dst (default address space) with %val: whole 32-bit
+; words while at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word stores are marked align 4 -- confirm callers pass a
+; 4-byte-aligned pointer.
+define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+ ; Replicate %val into all four bytes of a 32-bit word (%or7).
+ %conv = zext i8 %val to i32
+ %shl = shl nuw i32 %conv, 24
+ %shl2 = shl nuw nsw i32 %conv, 16
+ %or = or i32 %shl, %shl2
+ %shl4 = shl nuw nsw i32 %conv, 8
+ %or5 = or i32 %or, %shl4
+ %or7 = or i32 %or5, %conv
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond10, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8* %dst, i32 %index.0
+ %0 = bitcast i8* %add.ptr to i32*
+ store i32 %or7, i32* %0, align 4
+ br label %while.cond
+
+while.cond10: ; preds = %while.cond, %while.body13
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+ %cmp11 = icmp ult i32 %index.1, %size
+ br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13: ; preds = %while.cond10
+ %arrayidx = getelementptr inbounds i8* %dst, i32 %index.1
+ store i8 %val, i8* %arrayidx, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond10
+
+while.end14: ; preds = %while.cond10
+ ret void
+}
+
+; Fill %size bytes at %dst (addrspace 1) with %val: whole 32-bit words while
+; at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word stores are marked align 4 -- confirm callers pass a
+; 4-byte-aligned pointer.
+define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+ ; Replicate %val into all four bytes of a 32-bit word (%or7).
+ %conv = zext i8 %val to i32
+ %shl = shl nuw i32 %conv, 24
+ %shl2 = shl nuw nsw i32 %conv, 16
+ %or = or i32 %shl, %shl2
+ %shl4 = shl nuw nsw i32 %conv, 8
+ %or5 = or i32 %or, %shl4
+ %or7 = or i32 %or5, %conv
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond10, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+ %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ store i32 %or7, i32 addrspace(1)* %0, align 4
+ br label %while.cond
+
+while.cond10: ; preds = %while.cond, %while.body13
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+ %cmp11 = icmp ult i32 %index.1, %size
+ br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13: ; preds = %while.cond10
+ %arrayidx = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+ store i8 %val, i8 addrspace(1)* %arrayidx, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond10
+
+while.end14: ; preds = %while.cond10
+ ret void
+}
+
+; Fill %size bytes at %dst (addrspace 3) with %val: whole 32-bit words while
+; at least four bytes remain, then the tail byte by byte.
+; NOTE(review): word stores are marked align 4 -- confirm callers pass a
+; 4-byte-aligned pointer.
+define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+ ; Replicate %val into all four bytes of a 32-bit word (%or7).
+ %conv = zext i8 %val to i32
+ %shl = shl nuw i32 %conv, 24
+ %shl2 = shl nuw nsw i32 %conv, 16
+ %or = or i32 %shl, %shl2
+ %shl4 = shl nuw nsw i32 %conv, 8
+ %or5 = or i32 %or, %shl4
+ %or7 = or i32 %or5, %conv
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ ; Switch to the byte loop as soon as a full word no longer fits.
+ %cmp = icmp ugt i32 %add, %size
+ br i1 %cmp, label %while.cond10, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+ %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+ store i32 %or7, i32 addrspace(3)* %0, align 4
+ br label %while.cond
+
+while.cond10: ; preds = %while.cond, %while.body13
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+ %cmp11 = icmp ult i32 %index.1, %size
+ br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13: ; preds = %while.cond10
+ %arrayidx = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+ store i8 %val, i8 addrspace(3)* %arrayidx, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond10
+
+while.end14: ; preds = %while.cond10
+ ret void
+}
--
1.8.3.2
More information about the Beignet
mailing list