[Beignet] [PATCH 1/4] GBE: Implement new constant solution for ocl2
xionghu.luo at intel.com
xionghu.luo at intel.com
Mon Jan 18 19:29:06 PST 2016
From: Ruiling Song <ruiling.song at intel.com>
Different from ocl 1.2, which gather all constant into one surface.
ocl2 only gather program global/constant into one surface. But
keep other constant passed through kernel argument in separate buffer.
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
backend/src/backend/gen_insn_selection.cpp | 15 ++++--
backend/src/backend/program.cpp | 21 +++++++-
backend/src/backend/program.h | 6 +++
backend/src/backend/program.hpp | 4 ++
backend/src/gbe_bin_interpreter.cpp | 2 +
backend/src/ir/profile.cpp | 4 +-
backend/src/ir/profile.hpp | 3 +-
backend/src/ir/unit.hpp | 50 ++++++++++++++++++
backend/src/llvm/llvm_gen_backend.cpp | 84 ++++++++++++++++++------------
kernels/compiler_program_global.cl | 68 ++++++++++++++++++++++++
src/cl_command_queue.c | 2 +-
src/cl_command_queue_gen7.c | 16 ++++++
src/cl_gbe_loader.cpp | 10 ++++
src/cl_gbe_loader.h | 2 +
src/cl_program.c | 46 ++++++++++++++++
src/cl_program.h | 2 +
utests/CMakeLists.txt | 1 +
utests/compiler_program_global.cpp | 80 ++++++++++++++++++++++++++++
18 files changed, 374 insertions(+), 42 deletions(-)
create mode 100644 kernels/compiler_program_global.cl
create mode 100644 utests/compiler_program_global.cpp
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d19f985..6ef077b 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -3596,6 +3596,13 @@ namespace gbe
LoadInstructionPattern(void) : SelectionPattern(1, 1) {
this->opcodes.push_back(ir::OP_LOAD);
}
+ bool isReadConstantLegacy(const ir::LoadInstruction &load) const {
+ ir::AddressMode AM = load.getAddressMode();
+ ir::AddressSpace AS = load.getAddressSpace();
+ if (AM != ir::AM_Stateless && AS == ir::MEM_CONSTANT)
+ return true;
+ return false;
+ }
void untypedReadStateless(Selection::Opaque &sel,
GenRegister addr,
vector<GenRegister> &dst
@@ -3678,7 +3685,7 @@ namespace gbe
unsigned SI = insn.getSurfaceIndex();
sel.UNTYPED_READ(addr, dst.data(), valueNum, GenRegister::immud(SI), btiTemp);
}
- } else if (addrSpace == ir::MEM_LOCAL || addrSpace == ir::MEM_CONSTANT ) {
+ } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn) ) {
// stateless mode, local/constant still use bti access
unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
GenRegister addrDW = addr;
@@ -3842,7 +3849,7 @@ namespace gbe
b = GenRegister::immud(insn.getSurfaceIndex());
}
read64Legacy(sel, addr, dst, b, btiTemp);
- } else if (addrSpace == MEM_LOCAL || addrSpace == MEM_CONSTANT) {
+ } else if (addrSpace == MEM_LOCAL || isReadConstantLegacy(insn)) {
GenRegister b = GenRegister::immud(addrSpace == MEM_LOCAL? 0xfe : BTI_CONSTANT);
GenRegister addrDW = addr;
if (addrBytes == 8)
@@ -4063,7 +4070,7 @@ namespace gbe
unsigned SI = insn.getSurfaceIndex();
sel.BYTE_GATHER(dst, addr, elemSize, GenRegister::immud(SI), btiTemp);
}
- } else if (addrSpace == ir::MEM_LOCAL || addrSpace == ir::MEM_CONSTANT) {
+ } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn)) {
unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
GenRegister addrDW = addr;
if (addrBytes == 8) {
@@ -4207,7 +4214,7 @@ namespace gbe
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(sel, type);
- if (addrSpace == MEM_CONSTANT) {
+ if (isReadConstantLegacy(insn)) {
// XXX TODO read 64bit constant through constant cache
// Per HW Spec, constant cache messages can read at least DWORD data.
// So, byte/short data type, we have to read through data cache.
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 36af95f..ce4f927 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -104,11 +104,13 @@ namespace gbe {
return it->offset; // we found it!
}
- Program::Program(void) : constantSet(NULL) {}
+ Program::Program(void) : constantSet(NULL),
+ relocTable(NULL) {}
Program::~Program(void) {
for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it)
GBE_DELETE(it->second);
if (constantSet) delete constantSet;
+ if (relocTable) delete relocTable;
}
#ifdef GBE_COMPILER_AVAILABLE
@@ -151,6 +153,7 @@ namespace gbe {
bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
constantSet = new ir::ConstantSet(unit.getConstantSet());
+ relocTable = new ir::RelocTable(unit.getRelocTable());
const auto &set = unit.getFunctionSet();
const uint32_t kernelNum = set.size();
if (OCL_OUTPUT_GEN_IR) std::cout << unit;
@@ -978,6 +981,18 @@ namespace gbe {
program->getGlobalConstantData(mem);
}
+ static size_t programGetGlobalRelocCount(gbe_program gbeProgram) {
+ if (gbeProgram == NULL) return 0;
+ const gbe::Program *program = (const gbe::Program*) gbeProgram;
+ return program->getGlobalRelocCount();
+ }
+
+ static void programGetGlobalRelocTable(gbe_program gbeProgram, char *mem) {
+ if (gbeProgram == NULL) return;
+ const gbe::Program *program = (const gbe::Program*) gbeProgram;
+ program->getGlobalRelocTable(mem);
+ }
+
static uint32_t programGetKernelNum(gbe_program gbeProgram) {
if (gbeProgram == NULL) return 0;
const gbe::Program *program = (const gbe::Program*) gbeProgram;
@@ -1220,6 +1235,8 @@ GBE_EXPORT_SYMBOL gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm = NU
GBE_EXPORT_SYMBOL gbe_program_build_from_llvm_cb *gbe_program_build_from_llvm = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_count_cb *gbe_program_get_global_reloc_count = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_table_cb *gbe_program_get_global_reloc_table = NULL;
GBE_EXPORT_SYMBOL gbe_program_clean_llvm_resource_cb *gbe_program_clean_llvm_resource = NULL;
GBE_EXPORT_SYMBOL gbe_program_delete_cb *gbe_program_delete = NULL;
GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num = NULL;
@@ -1269,6 +1286,8 @@ namespace gbe
gbe_program_check_opt = gbe::programCheckOption;
gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+ gbe_program_get_global_reloc_count = gbe::programGetGlobalRelocCount;
+ gbe_program_get_global_reloc_table = gbe::programGetGlobalRelocTable;
gbe_program_clean_llvm_resource = gbe::programCleanLlvmResource;
gbe_program_delete = gbe::programDelete;
gbe_program_get_kernel_num = gbe::programGetKernelNum;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 86b3177..03150bc 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -99,6 +99,7 @@ enum gbe_curbe_type {
GBE_CURBE_BLOCK_IP,
GBE_CURBE_DW_BLOCK_IP,
GBE_CURBE_THREAD_NUM,
+ GBE_CURBE_CONSTANT_ADDRSPACE,
GBE_GEN_REG,
};
@@ -243,6 +244,11 @@ extern gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_
typedef void (gbe_program_get_global_constant_data_cb)(gbe_program gbeProgram, char *mem);
extern gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data;
+typedef size_t (gbe_program_get_global_reloc_count_cb)(gbe_program gbeProgram);
+extern gbe_program_get_global_reloc_count_cb *gbe_program_get_global_reloc_count;
+
+typedef void (gbe_program_get_global_reloc_table_cb)(gbe_program gbeProgram, char *mem);
+extern gbe_program_get_global_reloc_table_cb *gbe_program_get_global_reloc_table;
/*! Get the size of defined samplers */
typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel);
extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index efe192f..e58ddf0 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -280,6 +280,8 @@ namespace gbe {
/*! Get the content of global constant arrays */
void getGlobalConstantData(char *mem) const { constantSet->getData(mem); }
+ uint32_t getGlobalRelocCount(void) const { return relocTable->getCount(); }
+ void getGlobalRelocTable(char *p) const { relocTable->getData(p); }
static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G');
static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P');
@@ -309,6 +311,8 @@ namespace gbe {
map<std::string, Kernel*> kernels;
/*! Global (constants) outside any kernel */
ir::ConstantSet *constantSet;
+ /*! relocation table */
+ ir::RelocTable *relocTable;
/*! Use custom allocators */
GBE_CLASS(Program);
};
diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
index 4594a0a..0957092 100644
--- a/backend/src/gbe_bin_interpreter.cpp
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -61,6 +61,8 @@ struct BinInterpCallBackInitializer
gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
gbe_program_delete = gbe::programDelete;
gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+ gbe_program_get_global_reloc_count = gbe::programGetGlobalRelocCount;
+ gbe_program_get_global_reloc_table = gbe::programGetGlobalRelocTable;
gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
gbe_kernel_get_image_data = gbe::kernelGetImageData;
gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 3ead8a7..0699167 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -43,7 +43,8 @@ namespace ir {
"zero", "one",
"retVal",
"printf_buffer_pointer", "printf_index_buffer_pointer",
- "dwblockip"
+ "dwblockip",
+ "constant_addrspace_start"
};
#if GBE_DEBUG
@@ -86,6 +87,7 @@ namespace ir {
DECL_NEW_REG(FAMILY_QWORD, printfbptr, 1, GBE_CURBE_PRINTF_BUF_POINTER);
DECL_NEW_REG(FAMILY_QWORD, printfiptr, 1, GBE_CURBE_PRINTF_INDEX_POINTER);
DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0, GBE_CURBE_DW_BLOCK_IP);
+ DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1, GBE_CURBE_CONSTANT_ADDRSPACE);
}
#undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index a8445c4..79761d4 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -71,7 +71,8 @@ namespace ir {
static const Register printfbptr = Register(27); // printf buffer address .
static const Register printfiptr = Register(28); // printf index buffer address.
static const Register dwblockip = Register(29); // blockip
- static const uint32_t regNum = 30; // number of special registers
+ static const Register constant_addrspace = Register(30); // starting address of program-scope constant
+ static const uint32_t regNum = 31; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 8ff858d..b8df145 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -27,6 +27,7 @@
#include "ir/constant.hpp"
#include "ir/register.hpp"
#include "sys/map.hpp"
+#include <string.h>
namespace gbe {
namespace ir {
@@ -37,6 +38,52 @@ namespace ir {
/*! Complete unit of compilation. It contains a set of functions and a set of
* constant the functions may refer to.
*/
+ struct RelocEntry {
+ RelocEntry(unsigned int rO, unsigned int dO):
+ refOffset(rO),
+ defOffset(dO) {}
+
+ unsigned int refOffset;
+ unsigned int defOffset;
+ };
+
+ class RelocTable : public NonCopyable, public Serializable
+ {
+ public:
+ void addEntry(unsigned refOffset, unsigned defOffset) {
+ entries.push_back(RelocEntry(refOffset, defOffset));
+ }
+ RelocTable() {}
+ RelocTable(const RelocTable& other) : Serializable(other),
+ entries(other.entries) {}
+ uint32_t getCount() { return entries.size(); }
+ void getData(char *p) {
+ if (entries.size() > 1 && p)
+ memcpy(p, entries.data(), entries.size()*sizeof(RelocEntry));
+ }
+ static const uint32_t magic_begin = TO_MAGIC('R', 'E', 'L', 'C');
+ static const uint32_t magic_end = TO_MAGIC('C', 'L', 'E', 'R');
+
+ /* format:
+ magic_begin |
+ const_data_size |
+ const_data |
+ constant_1_size |
+ constant_1 |
+ ........ |
+ constant_n_size |
+ constant_n |
+ magic_end |
+ total_size
+ */
+
+ /*! Implements the serialization. */
+ virtual size_t serializeToBin(std::ostream& outs) { return 0;}
+ virtual size_t deserializeFromBin(std::istream& ins) { return 0; }
+ private:
+ vector<RelocEntry> entries;
+ GBE_CLASS(RelocTable);
+ };
class Unit : public NonCopyable
{
public:
@@ -70,6 +117,8 @@ namespace ir {
}
/*! Return the constant set */
ConstantSet& getConstantSet(void) { return constantSet; }
+ const RelocTable& getRelocTable(void) const { return relocTable; }
+ RelocTable& getRelocTable(void) { return relocTable; }
/*! Return the constant set */
const ConstantSet& getConstantSet(void) const { return constantSet; }
void setValid(bool value) { valid = value; }
@@ -78,6 +127,7 @@ namespace ir {
friend class ContextInterface; //!< Can free modify the unit
FunctionSet functions; //!< All the defined functions
ConstantSet constantSet; //!< All the constants defined in the unit
+ RelocTable relocTable;
PointerSize pointerSize; //!< Size shared by all pointers
GBE_CLASS(Unit);
bool valid;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index cb47097..d23a598 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -506,7 +506,7 @@ namespace gbe
virtual bool doInitialization(Module &M);
/*! helper function for parsing global constant data */
- void getConstantData(const Constant * c, void* mem, uint32_t& offset) const;
+ void getConstantData(const Constant * c, void* mem, uint32_t& offset, vector<ir::RelocEntry> &) const;
void collectGlobalConstant(void) const;
ir::ImmediateIndex processConstantImmIndex(Constant *CPV, int32_t index = 0u);
const ir::Immediate &processConstantImm(Constant *CPV, int32_t index = 0u);
@@ -1111,8 +1111,9 @@ namespace gbe
break;
}
case 2:
- new_bti = BTI_CONSTANT;
-
+ // ocl 2.0, constant pointer use separate bti
+ new_bti = btiBase;
+ incBtiBase();
break;
case 3:
new_bti = BTI_LOCAL;
@@ -1349,22 +1350,34 @@ namespace gbe
return;
}
- void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset) const {
+ void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset, vector<ir::RelocEntry> &relocs) const {
Type * type = c->getType();
Type::TypeID id = type->getTypeID();
GBE_ASSERT(c);
+ if (isa<GlobalVariable>(c)) {
+ const GlobalVariable *GV = cast<GlobalVariable>(c);
+
+ unsigned valueAddrSpace = GV->getType()->getAddressSpace();
+ ir::Constant cc = unit.getConstantSet().getConstant(c->getName());
+ unsigned int defOffset = cc.getOffset();
+
+ relocs.push_back(ir::RelocEntry(offset, defOffset));
+ uint32_t size = getTypeByteSize(unit, type);
+ memset((char*)mem+offset, 0, size);
+ offset += size;
+ return;
+ }
if(isa<UndefValue>(c)) {
uint32_t size = getTypeByteSize(unit, type);
offset += size;
return;
- } else if(isa<ConstantAggregateZero>(c)) {
+ } else if(isa<ConstantAggregateZero>(c) || isa<ConstantPointerNull>(c)) {
uint32_t size = getTypeByteSize(unit, type);
memset((char*)mem+offset, 0, size);
offset += size;
return;
}
-
switch(id) {
case Type::TypeID::StructTyID:
{
@@ -1382,7 +1395,7 @@ namespace gbe
offset += padding/8;
const Constant* sub = cast<Constant>(c->getOperand(op));
GBE_ASSERT(sub);
- getConstantData(sub, mem, offset);
+ getConstantData(sub, mem, offset, relocs);
}
break;
}
@@ -1401,7 +1414,7 @@ namespace gbe
uint32_t ops = c->getNumOperands();
for(uint32_t op = 0; op < ops; ++op) {
Constant * ca = dyn_cast<Constant>(c->getOperand(op));
- getConstantData(ca, mem, offset);
+ getConstantData(ca, mem, offset, relocs);
offset += padding;
}
}
@@ -1449,21 +1462,34 @@ namespace gbe
const Module::GlobalListType &globalList = TheModule->getGlobalList();
for(auto i = globalList.begin(); i != globalList.end(); i ++) {
const GlobalVariable &v = *i;
- if(!v.isConstantUsed()) continue;
const char *name = v.getName().data();
unsigned addrSpace = v.getType()->getAddressSpace();
- if(addrSpace == ir::AddressSpace::MEM_CONSTANT || v.isConstant()) {
- GBE_ASSERT(v.hasInitializer());
- const Constant *c = v.getInitializer();
- Type * type = c->getType();
+
+ vector<ir::RelocEntry> relocs;
+ if(addrSpace == 2 /* __constant */
+ || addrSpace == 1
+ || addrSpace == 0) {
+ Type * type = v.getValueType();
uint32_t size = getTypeByteSize(unit, type);
void* mem = malloc(size);
uint32_t offset = 0;
- getConstantData(c, mem, offset);
+ if (v.hasInitializer()) {
+ const Constant *c = v.getInitializer();
+ getConstantData(c, mem, offset, relocs);
+ } else {
+ memset(mem, 0, size);
+ }
uint32_t alignment = getAlignmentByte(unit, type);
unit.newConstant((char *)mem, name, size, alignment);
free(mem);
+ uint32_t refOffset = unit.getConstantSet().getConstant(name).getOffset();
+ for (uint32_t k = 0; k < relocs.size(); k++) {
+ unit.getRelocTable().addEntry(
+ refOffset + relocs[k].refOffset,
+ relocs[k].defOffset
+ );
+ }
}
}
}
@@ -2562,33 +2588,23 @@ namespace gbe
this->newRegister(const_cast<GlobalVariable*>(&v));
ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(oldSlm + padding/8, getType(ctx, v.getType())));
- } else if(addrSpace == ir::MEM_CONSTANT || v.isConstant()) {
- GBE_ASSERT(v.hasInitializer());
- this->newRegister(const_cast<GlobalVariable*>(&v));
- ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
- ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
- ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
- } else {
+ } else if(addrSpace == ir::MEM_CONSTANT
+ || addrSpace == ir::MEM_GLOBAL
+ || v.isConstant()) {
if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
ctx.getFunction().getPrintfSet()->setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v));
} else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
ctx.getFunction().getPrintfSet()->setIndexBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
- } else if(v.getName().str().substr(0, 4) == ".str") {
- /* When there are multi printf statements in multi kernel fucntions within the same
- translate unit, if they have the same sting parameter, such as
- kernel_func1 () {
- printf("Line is %d\n", line_num1);
- }
- kernel_func2 () {
- printf("Line is %d\n", line_num2);
- }
- The Clang will just generate one global string named .strXXX to represent "Line is %d\n"
- So when translating the kernel_func1, we can not unref that global var, so we will
- get here. Just ignore it to avoid assert. */
} else {
- GBE_ASSERT(0 && "Unsupported private memory access pattern");
+ this->newRegister(const_cast<GlobalVariable*>(&v));
+ ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
+ ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
+ ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
+ if (!legacyMode) {
+ ctx.ADD(getType(ctx, v.getType()), reg, ir::ocl::constant_addrspace, reg);
+ }
}
}
}
diff --git a/kernels/compiler_program_global.cl b/kernels/compiler_program_global.cl
new file mode 100644
index 0000000..405c53f
--- /dev/null
+++ b/kernels/compiler_program_global.cl
@@ -0,0 +1,68 @@
+struct config{
+ int s0;
+ global short *s1;
+};
+
+global int i = 5;
+global int bb = 4;
+global int *global p;
+
+/* array */
+global int ba[12];
+
+/* short/long data type */
+global short s;
+global short s2;
+global long l;
+
+/* pointer in constant AS to global */
+global int * constant px =&i;
+
+/* constant pointer relocation */
+constant int x = 2;
+constant int y =1;
+constant int *constant z[2] = {&x, &y};
+
+/* structure with pointer field */
+global struct config c[2] = {{1, &s}, {2, &s2} };
+
+
+global int a = 1;
+global int b = 2;
+global int * constant gArr[2]= {&a, &b};
+
+__kernel void compiler_program_global0(const global int *src, int dynamic) {
+ size_t gid = get_global_id(0);
+ /* global read/write */
+ p = &i;
+ *p += 1;
+
+ /* pointer in struct memory access */
+ *c[gid&1].s1 += 2;
+
+ s = 2;
+ l = 3;
+
+ /* constant AS pointer (points to global) memory access */
+ *px += *z[dynamic];
+
+ p = &bb;
+ /* array */
+ if (gid < 11)
+ ba[gid] = src[gid];
+}
+
+__kernel void compiler_program_global1(global int *dst, int dynamic) {
+ size_t gid = get_global_id(0);
+// static global sg;
+
+ dst[11] = i;
+ dst[12] = *p;
+ dst[13] = s;
+ dst[14] = l;
+ dst[15] = *gArr[dynamic];
+
+ if (gid < 11)
+ dst[gid] = ba[gid];
+}
+
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 9dc3fe6..442c6a2 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -161,7 +161,7 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
for (i = 0; i < k->arg_n; ++i) {
int32_t offset; // location of the address in the curbe
arg_type = interp_kernel_get_arg_type(k->opaque, i);
- if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
+ if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR) || !k->args[i].mem)
continue;
offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
if (offset < 0)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 2edc3be..61ffe7e 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -29,6 +29,7 @@
#include <assert.h>
#include <stdio.h>
#include <string.h>
+#include <unistd.h>
#define MAX_GROUP_SIZE_IN_HALFSLICE 512
static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+256; }
@@ -117,6 +118,11 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
* we need raw_size & aligned_size
*/
GET_QUEUE_THREAD_GPGPU(queue);
+ // TODO this is only valid for OpenCL 1.2,
+ // under ocl1.2 we gather all constant into one dedicated surface.
+ // but in 2.0 we put program global into one surface, but constants
+ // pass through kernel argument in each separate buffer
+#if 0
int32_t arg;
size_t offset = 0;
uint32_t raw_size = 0, aligned_size =0;
@@ -185,6 +191,16 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
}
}
cl_buffer_unmap(bo);
+#endif
+ // pass the starting of constant address space
+ int32_t constant_addrspace = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_CONSTANT_ADDRSPACE, 0);
+ if (constant_addrspace >= 0) {
+ size_t global_const_size = interp_program_get_global_constant_size(ker->program->opaque);
+ if (global_const_size > 0) {
+ *(uint64_t*)(ker->curbe + constant_addrspace) = (uint64_t)ker->program->global_data_ptr;
+ cl_gpgpu_bind_buf(gpgpu, ker->program->global_data, constant_addrspace, 0, ALIGN(global_const_size, getpagesize()), BTI_CONSTANT);
+ }
+ }
return 0;
}
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
index e832a53..d75c92c 100644
--- a/src/cl_gbe_loader.cpp
+++ b/src/cl_gbe_loader.cpp
@@ -38,6 +38,8 @@ gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource = NULL;
gbe_program_new_from_binary_cb *interp_program_new_from_binary = NULL;
gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size = NULL;
gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data = NULL;
+gbe_program_get_global_reloc_count_cb *interp_program_get_global_reloc_count = NULL;
+gbe_program_get_global_reloc_table_cb *interp_program_get_global_reloc_table = NULL;
gbe_program_delete_cb *interp_program_delete = NULL;
gbe_program_get_kernel_num_cb *interp_program_get_kernel_num = NULL;
gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name = NULL;
@@ -109,6 +111,14 @@ struct GbeLoaderInitializer
if (interp_program_get_global_constant_data == NULL)
return false;
+ interp_program_get_global_reloc_count = *(gbe_program_get_global_reloc_count_cb**)dlsym(dlhInterp, "gbe_program_get_global_reloc_count");
+ if (interp_program_get_global_reloc_count == NULL)
+ return false;
+
+ interp_program_get_global_reloc_table = *(gbe_program_get_global_reloc_table_cb**)dlsym(dlhInterp, "gbe_program_get_global_reloc_table");
+ if (interp_program_get_global_reloc_table == NULL)
+ return false;
+
interp_program_delete = *(gbe_program_delete_cb**)dlsym(dlhInterp, "gbe_program_delete");
if (interp_program_delete == NULL)
return false;
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
index de91c85..28741ff 100644
--- a/src/cl_gbe_loader.h
+++ b/src/cl_gbe_loader.h
@@ -38,6 +38,8 @@ extern gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource;
extern gbe_program_new_from_binary_cb *interp_program_new_from_binary;
extern gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size;
extern gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data;
+extern gbe_program_get_global_reloc_count_cb *interp_program_get_global_reloc_count;
+extern gbe_program_get_global_reloc_table_cb *interp_program_get_global_reloc_table;
extern gbe_program_delete_cb *interp_program_delete;
extern gbe_program_get_kernel_num_cb *interp_program_get_kernel_num;
extern gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name;
diff --git a/src/cl_program.c b/src/cl_program.c
index 98b6d51..ffdb2a1 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -97,6 +97,9 @@ cl_program_delete(cl_program p)
cl_kernel_delete(p->ker[i]);
cl_free(p->ker);
+ cl_free(p->global_data_ptr);
+ if (p->global_data_ptr)
+ cl_buffer_unreference(p->global_data);
/* Program belongs to their parent context */
cl_context_delete(p->ctx);
@@ -191,6 +194,42 @@ LOCAL cl_bool headerCompare(const unsigned char *BufPtr, BINARY_HEADER_INDEX ind
#define isLLVM_LIB(BufPtr) headerCompare(BufPtr, BHI_LIBRARY)
#define isGenBinary(BufPtr) headerCompare(BufPtr, BHI_GEN_BINARY)
+static cl_int get_program_global_data(cl_program prog) {
+ cl_buffer_mgr bufmgr = NULL;
+ bufmgr = cl_context_get_bufmgr(prog->ctx);
+ assert(bufmgr);
+ size_t const_size = interp_program_get_global_constant_size(prog->opaque);
+ if (const_size == 0) return CL_SUCCESS;
+
+ int page_size = getpagesize();
+ size_t alignedSz = ALIGN(const_size, page_size);
+ char * p = (char*)cl_aligned_malloc(alignedSz, page_size);
+ prog->global_data_ptr = p;
+ interp_program_get_global_constant_data(prog->opaque, (char*)p);
+
+ prog->global_data = cl_buffer_alloc_userptr(bufmgr, "program global data", p, alignedSz, 0);
+ cl_buffer_set_softpin_offset(prog->global_data, (size_t)p);
+
+ uint32_t reloc_count = interp_program_get_global_reloc_count(prog->opaque);
+ if (reloc_count > 0) {
+ uint32_t x;
+ struct RelocEntry {int refOffset; int defOffset;};
+ char *temp = (char*) malloc(reloc_count *sizeof(int)*2);
+ interp_program_get_global_reloc_table(prog->opaque, temp);
+ for (x = 0; x < reloc_count; x++) {
+ int ref_offset = ((struct RelocEntry *)temp)[x].refOffset;
+ *(uint64_t*)&(p[ref_offset]) = ((struct RelocEntry *)temp)[x].defOffset + (uint64_t)p;
+ }
+ free(temp);
+ }
+#if 0
+ int x = 0;
+ for (x = 0; x < const_size; x++) {
+ printf("offset %d data: %x\n", x, (unsigned)p[x]);
+ }
+#endif
+ return CL_SUCCESS;
+}
LOCAL cl_program
cl_program_create_from_binary(cl_context ctx,
cl_uint num_devices,
@@ -603,6 +642,9 @@ cl_program_build(cl_program p, const char *options)
memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
copyed += sz;
}
+ if ((err = get_program_global_data(p)) != CL_SUCCESS)
+ goto error;
+
p->is_built = 1;
p->build_status = CL_BUILD_SUCCESS;
return CL_SUCCESS;
@@ -697,6 +739,10 @@ cl_program_link(cl_context context,
memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
copyed += sz;
}
+
+ if ((err = get_program_global_data(p)) != CL_SUCCESS)
+ goto error;
+
done:
if(p) p->is_built = 1;
if(p) p->build_status = CL_BUILD_SUCCESS;
diff --git a/src/cl_program.h b/src/cl_program.h
index 63ad16d..083d66a 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -54,6 +54,8 @@ struct _cl_program {
cl_kernel *ker; /* All kernels included by the OCL file */
cl_program prev, next; /* We chain the programs together */
cl_context ctx; /* Its parent context */
+ cl_buffer global_data;
+ char * global_data_ptr;
char *bin; /* The program copied verbatim */
size_t bin_sz; /* Its size in memory */
char *source; /* Program sources */
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 78442cb..0fca450 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -191,6 +191,7 @@ set (utests_sources
compiler_bool_cross_basic_block.cpp
compiler_private_const.cpp
compiler_private_data_overflow.cpp
+ compiler_program_global.cpp
compiler_getelementptr_bitcast.cpp
compiler_sub_group_any.cpp
compiler_sub_group_all.cpp
diff --git a/utests/compiler_program_global.cpp b/utests/compiler_program_global.cpp
new file mode 100644
index 0000000..ef7c655
--- /dev/null
+++ b/utests/compiler_program_global.cpp
@@ -0,0 +1,80 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+
+static int init_program(const char* name, cl_context ctx, cl_program *pg )
+{
+ cl_int err;
+ char* ker_path = cl_do_kiss_path(name, device);
+
+ cl_file_map_t *fm = cl_file_map_new();
+ err = cl_file_map_open(fm, ker_path);
+ if(err != CL_FILE_MAP_SUCCESS)
+ OCL_ASSERT(0);
+ const char *src = cl_file_map_begin(fm);
+
+ *pg = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
+ free(ker_path);
+ cl_file_map_delete(fm);
+ return 0;
+
+}
+
+void compiler_program_global()
+{
+ const int n = 16;
+ int cpu_src[16];
+ cl_int err;
+
+ // Setup kernel and buffers
+ cl_program program;
+ init_program("compiler_program_global.cl", ctx, &program);
+ OCL_CALL (clBuildProgram, program, 1, &device, "-cl-std=CL2.0", NULL, NULL);
+
+ cl_kernel k0 = clCreateKernel(program, "compiler_program_global0", &err);
+ assert(err == CL_SUCCESS);
+ cl_kernel k1 = clCreateKernel(program, "compiler_program_global1", &err);
+ assert(err == CL_SUCCESS);
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+
+ OCL_CALL (clSetKernelArg, k0, 0, sizeof(cl_mem), &buf[0]);
+ OCL_CALL (clSetKernelArg, k1, 0, sizeof(cl_mem), &buf[1]);
+
+ int dynamic = 1;
+ OCL_CALL (clSetKernelArg, k0, 1, sizeof(cl_int), &dynamic);
+ OCL_CALL (clSetKernelArg, k1, 1, sizeof(cl_int), &dynamic);
+
+ globals[0] = 16;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i)
+ cpu_src[i] = ((int*)buf_data[0])[i] = i;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_CALL (clEnqueueNDRangeKernel, queue, k0, 1, NULL, globals, locals, 0, NULL, NULL);
+ OCL_CALL (clEnqueueNDRangeKernel, queue, k1, 1, NULL, globals, locals, 0, NULL, NULL);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < n; ++i) {
+// printf("i=%d dst=%d\n", i, ((int*)buf_data[1])[i]);
+ switch(i) {
+ default: OCL_ASSERT(((int*)buf_data[1])[i] == i); break;
+ case 11: OCL_ASSERT(((int*)buf_data[1])[i] == 7); break;
+ case 12: OCL_ASSERT(((int*)buf_data[1])[i] == 4); break;
+ case 13: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
+ case 14: OCL_ASSERT(((int*)buf_data[1])[i] == 3); break;
+ case 15: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
+ }
+ }
+ OCL_UNMAP_BUFFER(1);
+ clReleaseKernel(k0);
+ clReleaseKernel(k1);
+ clReleaseProgram(program);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_program_global);
+
--
2.4.1
More information about the Beignet
mailing list