[Beignet] [PATCH 1/4] GBE: Implement new constant solution for ocl2
Song, Ruiling
ruiling.song at intel.com
Mon Jan 18 19:38:42 PST 2016
This patchset is for ocl2.0 branch only.
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> xionghu.luo at intel.com
> Sent: Tuesday, January 19, 2016 11:29 AM
> To: beignet at lists.freedesktop.org
> Cc: Luo, Xionghu <xionghu.luo at intel.com>; Song, Ruiling
> <ruiling.song at intel.com>
> Subject: [Beignet] [PATCH 1/4] GBE: Implement new constant solution for
> ocl2
>
> From: Ruiling Song <ruiling.song at intel.com>
>
> Different from ocl 1.2, which gather all constant into one surface.
> ocl2 only gather program global/constant into one surface. But
> keep other constant passed through kernel argument in separate buffer.
>
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
> backend/src/backend/gen_insn_selection.cpp | 15 ++++--
> backend/src/backend/program.cpp | 21 +++++++-
> backend/src/backend/program.h | 6 +++
> backend/src/backend/program.hpp | 4 ++
> backend/src/gbe_bin_interpreter.cpp | 2 +
> backend/src/ir/profile.cpp | 4 +-
> backend/src/ir/profile.hpp | 3 +-
> backend/src/ir/unit.hpp | 50 ++++++++++++++++++
> backend/src/llvm/llvm_gen_backend.cpp | 84 ++++++++++++++++++----
> --------
> kernels/compiler_program_global.cl | 68 ++++++++++++++++++++++++
> src/cl_command_queue.c | 2 +-
> src/cl_command_queue_gen7.c | 16 ++++++
> src/cl_gbe_loader.cpp | 10 ++++
> src/cl_gbe_loader.h | 2 +
> src/cl_program.c | 46 ++++++++++++++++
> src/cl_program.h | 2 +
> utests/CMakeLists.txt | 1 +
> utests/compiler_program_global.cpp | 80
> ++++++++++++++++++++++++++++
> 18 files changed, 374 insertions(+), 42 deletions(-)
> create mode 100644 kernels/compiler_program_global.cl
> create mode 100644 utests/compiler_program_global.cpp
>
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index d19f985..6ef077b 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -3596,6 +3596,13 @@ namespace gbe
> LoadInstructionPattern(void) : SelectionPattern(1, 1) {
> this->opcodes.push_back(ir::OP_LOAD);
> }
> + bool isReadConstantLegacy(const ir::LoadInstruction &load) const {
> + ir::AddressMode AM = load.getAddressMode();
> + ir::AddressSpace AS = load.getAddressSpace();
> + if (AM != ir::AM_Stateless && AS == ir::MEM_CONSTANT)
> + return true;
> + return false;
> + }
> void untypedReadStateless(Selection::Opaque &sel,
> GenRegister addr,
> vector<GenRegister> &dst
> @@ -3678,7 +3685,7 @@ namespace gbe
> unsigned SI = insn.getSurfaceIndex();
> sel.UNTYPED_READ(addr, dst.data(), valueNum,
> GenRegister::immud(SI), btiTemp);
> }
> - } else if (addrSpace == ir::MEM_LOCAL || addrSpace ==
> ir::MEM_CONSTANT ) {
> + } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn) )
> {
> // stateless mode, local/constant still use bti access
> unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT :
> 0xfe;
> GenRegister addrDW = addr;
> @@ -3842,7 +3849,7 @@ namespace gbe
> b = GenRegister::immud(insn.getSurfaceIndex());
> }
> read64Legacy(sel, addr, dst, b, btiTemp);
> - } else if (addrSpace == MEM_LOCAL || addrSpace == MEM_CONSTANT)
> {
> + } else if (addrSpace == MEM_LOCAL || isReadConstantLegacy(insn)) {
> GenRegister b = GenRegister::immud(addrSpace == MEM_LOCAL?
> 0xfe : BTI_CONSTANT);
> GenRegister addrDW = addr;
> if (addrBytes == 8)
> @@ -4063,7 +4070,7 @@ namespace gbe
> unsigned SI = insn.getSurfaceIndex();
> sel.BYTE_GATHER(dst, addr, elemSize, GenRegister::immud(SI),
> btiTemp);
> }
> - } else if (addrSpace == ir::MEM_LOCAL || addrSpace ==
> ir::MEM_CONSTANT) {
> + } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn)) {
> unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT :
> 0xfe;
> GenRegister addrDW = addr;
> if (addrBytes == 8) {
> @@ -4207,7 +4214,7 @@ namespace gbe
> const Type type = insn.getValueType();
> const uint32_t elemSize = getByteScatterGatherSize(sel, type);
>
> - if (addrSpace == MEM_CONSTANT) {
> + if (isReadConstantLegacy(insn)) {
> // XXX TODO read 64bit constant through constant cache
> // Per HW Spec, constant cache messages can read at least DWORD data.
> // So, byte/short data type, we have to read through data cache.
> diff --git a/backend/src/backend/program.cpp
> b/backend/src/backend/program.cpp
> index 36af95f..ce4f927 100644
> --- a/backend/src/backend/program.cpp
> +++ b/backend/src/backend/program.cpp
> @@ -104,11 +104,13 @@ namespace gbe {
> return it->offset; // we found it!
> }
>
> - Program::Program(void) : constantSet(NULL) {}
> + Program::Program(void) : constantSet(NULL),
> + relocTable(NULL) {}
> Program::~Program(void) {
> for (map<std::string, Kernel*>::iterator it = kernels.begin(); it !=
> kernels.end(); ++it)
> GBE_DELETE(it->second);
> if (constantSet) delete constantSet;
> + if (relocTable) delete relocTable;
> }
>
> #ifdef GBE_COMPILER_AVAILABLE
> @@ -151,6 +153,7 @@ namespace gbe {
>
> bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
> constantSet = new ir::ConstantSet(unit.getConstantSet());
> + relocTable = new ir::RelocTable(unit.getRelocTable());
> const auto &set = unit.getFunctionSet();
> const uint32_t kernelNum = set.size();
> if (OCL_OUTPUT_GEN_IR) std::cout << unit;
> @@ -978,6 +981,18 @@ namespace gbe {
> program->getGlobalConstantData(mem);
> }
>
> + static size_t programGetGlobalRelocCount(gbe_program gbeProgram) {
> + if (gbeProgram == NULL) return 0;
> + const gbe::Program *program = (const gbe::Program*) gbeProgram;
> + return program->getGlobalRelocCount();
> + }
> +
> + static void programGetGlobalRelocTable(gbe_program gbeProgram, char
> *mem) {
> + if (gbeProgram == NULL) return;
> + const gbe::Program *program = (const gbe::Program*) gbeProgram;
> + program->getGlobalRelocTable(mem);
> + }
> +
> static uint32_t programGetKernelNum(gbe_program gbeProgram) {
> if (gbeProgram == NULL) return 0;
> const gbe::Program *program = (const gbe::Program*) gbeProgram;
> @@ -1220,6 +1235,8 @@ GBE_EXPORT_SYMBOL
> gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm = NU
> GBE_EXPORT_SYMBOL gbe_program_build_from_llvm_cb
> *gbe_program_build_from_llvm = NULL;
> GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb
> *gbe_program_get_global_constant_size = NULL;
> GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb
> *gbe_program_get_global_constant_data = NULL;
> +GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_count_cb
> *gbe_program_get_global_reloc_count = NULL;
> +GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_table_cb
> *gbe_program_get_global_reloc_table = NULL;
> GBE_EXPORT_SYMBOL gbe_program_clean_llvm_resource_cb
> *gbe_program_clean_llvm_resource = NULL;
> GBE_EXPORT_SYMBOL gbe_program_delete_cb *gbe_program_delete =
> NULL;
> GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb
> *gbe_program_get_kernel_num = NULL;
> @@ -1269,6 +1286,8 @@ namespace gbe
> gbe_program_check_opt = gbe::programCheckOption;
> gbe_program_get_global_constant_size =
> gbe::programGetGlobalConstantSize;
> gbe_program_get_global_constant_data =
> gbe::programGetGlobalConstantData;
> + gbe_program_get_global_reloc_count =
> gbe::programGetGlobalRelocCount;
> + gbe_program_get_global_reloc_table =
> gbe::programGetGlobalRelocTable;
> gbe_program_clean_llvm_resource = gbe::programCleanLlvmResource;
> gbe_program_delete = gbe::programDelete;
> gbe_program_get_kernel_num = gbe::programGetKernelNum;
> diff --git a/backend/src/backend/program.h
> b/backend/src/backend/program.h
> index 86b3177..03150bc 100644
> --- a/backend/src/backend/program.h
> +++ b/backend/src/backend/program.h
> @@ -99,6 +99,7 @@ enum gbe_curbe_type {
> GBE_CURBE_BLOCK_IP,
> GBE_CURBE_DW_BLOCK_IP,
> GBE_CURBE_THREAD_NUM,
> + GBE_CURBE_CONSTANT_ADDRSPACE,
> GBE_GEN_REG,
> };
>
> @@ -243,6 +244,11 @@ extern gbe_program_get_global_constant_size_cb
> *gbe_program_get_global_constant_
> typedef void (gbe_program_get_global_constant_data_cb)(gbe_program
> gbeProgram, char *mem);
> extern gbe_program_get_global_constant_data_cb
> *gbe_program_get_global_constant_data;
>
> +typedef size_t (gbe_program_get_global_reloc_count_cb)(gbe_program
> gbeProgram);
> +extern gbe_program_get_global_reloc_count_cb
> *gbe_program_get_global_reloc_count;
> +
> +typedef void (gbe_program_get_global_reloc_table_cb)(gbe_program
> gbeProgram, char *mem);
> +extern gbe_program_get_global_reloc_table_cb
> *gbe_program_get_global_reloc_table;
> /*! Get the size of defined samplers */
> typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel);
> extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
> diff --git a/backend/src/backend/program.hpp
> b/backend/src/backend/program.hpp
> index efe192f..e58ddf0 100644
> --- a/backend/src/backend/program.hpp
> +++ b/backend/src/backend/program.hpp
> @@ -280,6 +280,8 @@ namespace gbe {
> /*! Get the content of global constant arrays */
> void getGlobalConstantData(char *mem) const { constantSet-
> >getData(mem); }
>
> + uint32_t getGlobalRelocCount(void) const { return relocTable-
> >getCount(); }
> + void getGlobalRelocTable(char *p) const { relocTable->getData(p); }
> static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G');
> static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P');
>
> @@ -309,6 +311,8 @@ namespace gbe {
> map<std::string, Kernel*> kernels;
> /*! Global (constants) outside any kernel */
> ir::ConstantSet *constantSet;
> + /*! relocation table */
> + ir::RelocTable *relocTable;
> /*! Use custom allocators */
> GBE_CLASS(Program);
> };
> diff --git a/backend/src/gbe_bin_interpreter.cpp
> b/backend/src/gbe_bin_interpreter.cpp
> index 4594a0a..0957092 100644
> --- a/backend/src/gbe_bin_interpreter.cpp
> +++ b/backend/src/gbe_bin_interpreter.cpp
> @@ -61,6 +61,8 @@ struct BinInterpCallBackInitializer
> gbe_program_get_global_constant_size =
> gbe::programGetGlobalConstantSize;
> gbe_program_delete = gbe::programDelete;
> gbe_program_get_global_constant_data =
> gbe::programGetGlobalConstantData;
> + gbe_program_get_global_reloc_count =
> gbe::programGetGlobalRelocCount;
> + gbe_program_get_global_reloc_table =
> gbe::programGetGlobalRelocTable;
> gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
> gbe_kernel_get_image_data = gbe::kernelGetImageData;
> gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
> diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
> index 3ead8a7..0699167 100644
> --- a/backend/src/ir/profile.cpp
> +++ b/backend/src/ir/profile.cpp
> @@ -43,7 +43,8 @@ namespace ir {
> "zero", "one",
> "retVal",
> "printf_buffer_pointer", "printf_index_buffer_pointer",
> - "dwblockip"
> + "dwblockip",
> + "constant_addrspace_start"
> };
>
> #if GBE_DEBUG
> @@ -86,6 +87,7 @@ namespace ir {
> DECL_NEW_REG(FAMILY_QWORD, printfbptr, 1,
> GBE_CURBE_PRINTF_BUF_POINTER);
> DECL_NEW_REG(FAMILY_QWORD, printfiptr, 1,
> GBE_CURBE_PRINTF_INDEX_POINTER);
> DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0,
> GBE_CURBE_DW_BLOCK_IP);
> + DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1,
> GBE_CURBE_CONSTANT_ADDRSPACE);
> }
> #undef DECL_NEW_REG
>
> diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
> index a8445c4..79761d4 100644
> --- a/backend/src/ir/profile.hpp
> +++ b/backend/src/ir/profile.hpp
> @@ -71,7 +71,8 @@ namespace ir {
> static const Register printfbptr = Register(27); // printf buffer address .
> static const Register printfiptr = Register(28); // printf index buffer address.
> static const Register dwblockip = Register(29); // blockip
> - static const uint32_t regNum = 30; // number of special registers
> + static const Register constant_addrspace = Register(30); // starting
> address of program-scope constant
> + static const uint32_t regNum = 31; // number of special registers
> extern const char *specialRegMean[]; // special register name.
> } /* namespace ocl */
>
> diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
> index 8ff858d..b8df145 100644
> --- a/backend/src/ir/unit.hpp
> +++ b/backend/src/ir/unit.hpp
> @@ -27,6 +27,7 @@
> #include "ir/constant.hpp"
> #include "ir/register.hpp"
> #include "sys/map.hpp"
> +#include <string.h>
>
> namespace gbe {
> namespace ir {
> @@ -37,6 +38,52 @@ namespace ir {
> /*! Complete unit of compilation. It contains a set of functions and a set of
> * constant the functions may refer to.
> */
> + struct RelocEntry {
> + RelocEntry(unsigned int rO, unsigned int dO):
> + refOffset(rO),
> + defOffset(dO) {}
> +
> + unsigned int refOffset;
> + unsigned int defOffset;
> + };
> +
> + class RelocTable : public NonCopyable, public Serializable
> + {
> + public:
> + void addEntry(unsigned refOffset, unsigned defOffset) {
> + entries.push_back(RelocEntry(refOffset, defOffset));
> + }
> + RelocTable() {}
> + RelocTable(const RelocTable& other) : Serializable(other),
> + entries(other.entries) {}
> + uint32_t getCount() { return entries.size(); }
> + void getData(char *p) {
> + if (entries.size() > 1 && p)
> + memcpy(p, entries.data(), entries.size()*sizeof(RelocEntry));
> + }
> + static const uint32_t magic_begin = TO_MAGIC('R', 'E', 'L', 'C');
> + static const uint32_t magic_end = TO_MAGIC('C', 'L', 'E', 'R');
> +
> + /* format:
> + magic_begin |
> + const_data_size |
> + const_data |
> + constant_1_size |
> + constant_1 |
> + ........ |
> + constant_n_size |
> + constant_n |
> + magic_end |
> + total_size
> + */
> +
> + /*! Implements the serialization. */
> + virtual size_t serializeToBin(std::ostream& outs) { return 0;}
> + virtual size_t deserializeFromBin(std::istream& ins) { return 0; }
> + private:
> + vector<RelocEntry> entries;
> + GBE_CLASS(RelocTable);
> + };
> class Unit : public NonCopyable
> {
> public:
> @@ -70,6 +117,8 @@ namespace ir {
> }
> /*! Return the constant set */
> ConstantSet& getConstantSet(void) { return constantSet; }
> + const RelocTable& getRelocTable(void) const { return relocTable; }
> + RelocTable& getRelocTable(void) { return relocTable; }
> /*! Return the constant set */
> const ConstantSet& getConstantSet(void) const { return constantSet; }
> void setValid(bool value) { valid = value; }
> @@ -78,6 +127,7 @@ namespace ir {
> friend class ContextInterface; //!< Can free modify the unit
> FunctionSet functions; //!< All the defined functions
> ConstantSet constantSet; //!< All the constants defined in the unit
> + RelocTable relocTable;
> PointerSize pointerSize; //!< Size shared by all pointers
> GBE_CLASS(Unit);
> bool valid;
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index cb47097..d23a598 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -506,7 +506,7 @@ namespace gbe
>
> virtual bool doInitialization(Module &M);
> /*! helper function for parsing global constant data */
> - void getConstantData(const Constant * c, void* mem, uint32_t& offset)
> const;
> + void getConstantData(const Constant * c, void* mem, uint32_t& offset,
> vector<ir::RelocEntry> &) const;
> void collectGlobalConstant(void) const;
> ir::ImmediateIndex processConstantImmIndex(Constant *CPV, int32_t
> index = 0u);
> const ir::Immediate &processConstantImm(Constant *CPV, int32_t index
> = 0u);
> @@ -1111,8 +1111,9 @@ namespace gbe
> break;
> }
> case 2:
> - new_bti = BTI_CONSTANT;
> -
> + // ocl 2.0, constant pointer use separate bti
> + new_bti = btiBase;
> + incBtiBase();
> break;
> case 3:
> new_bti = BTI_LOCAL;
> @@ -1349,22 +1350,34 @@ namespace gbe
> return;
> }
>
> - void GenWriter::getConstantData(const Constant * c, void* mem,
> uint32_t& offset) const {
> + void GenWriter::getConstantData(const Constant * c, void* mem,
> uint32_t& offset, vector<ir::RelocEntry> &relocs) const {
> Type * type = c->getType();
> Type::TypeID id = type->getTypeID();
>
> GBE_ASSERT(c);
> + if (isa<GlobalVariable>(c)) {
> + const GlobalVariable *GV = cast<GlobalVariable>(c);
> +
> + unsigned valueAddrSpace = GV->getType()->getAddressSpace();
> + ir::Constant cc = unit.getConstantSet().getConstant(c->getName());
> + unsigned int defOffset = cc.getOffset();
> +
> + relocs.push_back(ir::RelocEntry(offset, defOffset));
> + uint32_t size = getTypeByteSize(unit, type);
> + memset((char*)mem+offset, 0, size);
> + offset += size;
> + return;
> + }
> if(isa<UndefValue>(c)) {
> uint32_t size = getTypeByteSize(unit, type);
> offset += size;
> return;
> - } else if(isa<ConstantAggregateZero>(c)) {
> + } else if(isa<ConstantAggregateZero>(c) || isa<ConstantPointerNull>(c)) {
> uint32_t size = getTypeByteSize(unit, type);
> memset((char*)mem+offset, 0, size);
> offset += size;
> return;
> }
> -
> switch(id) {
> case Type::TypeID::StructTyID:
> {
> @@ -1382,7 +1395,7 @@ namespace gbe
> offset += padding/8;
> const Constant* sub = cast<Constant>(c->getOperand(op));
> GBE_ASSERT(sub);
> - getConstantData(sub, mem, offset);
> + getConstantData(sub, mem, offset, relocs);
> }
> break;
> }
> @@ -1401,7 +1414,7 @@ namespace gbe
> uint32_t ops = c->getNumOperands();
> for(uint32_t op = 0; op < ops; ++op) {
> Constant * ca = dyn_cast<Constant>(c->getOperand(op));
> - getConstantData(ca, mem, offset);
> + getConstantData(ca, mem, offset, relocs);
> offset += padding;
> }
> }
> @@ -1449,21 +1462,34 @@ namespace gbe
> const Module::GlobalListType &globalList = TheModule->getGlobalList();
> for(auto i = globalList.begin(); i != globalList.end(); i ++) {
> const GlobalVariable &v = *i;
> - if(!v.isConstantUsed()) continue;
> const char *name = v.getName().data();
> unsigned addrSpace = v.getType()->getAddressSpace();
> - if(addrSpace == ir::AddressSpace::MEM_CONSTANT || v.isConstant()) {
> - GBE_ASSERT(v.hasInitializer());
> - const Constant *c = v.getInitializer();
> - Type * type = c->getType();
> +
> + vector<ir::RelocEntry> relocs;
> + if(addrSpace == 2 /* __constant */
> + || addrSpace == 1
> + || addrSpace == 0) {
> + Type * type = v.getValueType();
>
> uint32_t size = getTypeByteSize(unit, type);
> void* mem = malloc(size);
> uint32_t offset = 0;
> - getConstantData(c, mem, offset);
> + if (v.hasInitializer()) {
> + const Constant *c = v.getInitializer();
> + getConstantData(c, mem, offset, relocs);
> + } else {
> + memset(mem, 0, size);
> + }
> uint32_t alignment = getAlignmentByte(unit, type);
> unit.newConstant((char *)mem, name, size, alignment);
> free(mem);
> + uint32_t refOffset =
> unit.getConstantSet().getConstant(name).getOffset();
> + for (uint32_t k = 0; k < relocs.size(); k++) {
> + unit.getRelocTable().addEntry(
> + refOffset + relocs[k].refOffset,
> + relocs[k].defOffset
> + );
> + }
> }
> }
> }
> @@ -2562,33 +2588,23 @@ namespace gbe
> this->newRegister(const_cast<GlobalVariable*>(&v));
> ir::Register reg =
> regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
> ctx.LOADI(getType(ctx, v.getType()), reg,
> ctx.newIntegerImmediate(oldSlm + padding/8, getType(ctx, v.getType())));
> - } else if(addrSpace == ir::MEM_CONSTANT || v.isConstant()) {
> - GBE_ASSERT(v.hasInitializer());
> - this->newRegister(const_cast<GlobalVariable*>(&v));
> - ir::Register reg =
> regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
> - ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
> - ctx.LOADI(getType(ctx, v.getType()), reg,
> ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
> - } else {
> + } else if(addrSpace == ir::MEM_CONSTANT
> + || addrSpace == ir::MEM_GLOBAL
> + || v.isConstant()) {
> if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
> ctx.getFunction().getPrintfSet()-
> >setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
> regTranslator.newScalarProxy(ir::ocl::printfbptr,
> const_cast<GlobalVariable*>(&v));
> } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf")))
> {
> ctx.getFunction().getPrintfSet()-
> >setIndexBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
> regTranslator.newScalarProxy(ir::ocl::printfiptr,
> const_cast<GlobalVariable*>(&v));
> - } else if(v.getName().str().substr(0, 4) == ".str") {
> - /* When there are multi printf statements in multi kernel fucntions
> within the same
> - translate unit, if they have the same sting parameter, such as
> - kernel_func1 () {
> - printf("Line is %d\n", line_num1);
> - }
> - kernel_func2 () {
> - printf("Line is %d\n", line_num2);
> - }
> - The Clang will just generate one global string named .strXXX to
> represent "Line is %d\n"
> - So when translating the kernel_func1, we can not unref that global
> var, so we will
> - get here. Just ignore it to avoid assert. */
> } else {
> - GBE_ASSERT(0 && "Unsupported private memory access pattern");
> + this->newRegister(const_cast<GlobalVariable*>(&v));
> + ir::Register reg =
> regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
> + ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
> + ctx.LOADI(getType(ctx, v.getType()), reg,
> ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
> + if (!legacyMode) {
> + ctx.ADD(getType(ctx, v.getType()), reg, ir::ocl::constant_addrspace,
> reg);
> + }
> }
> }
> }
> diff --git a/kernels/compiler_program_global.cl
> b/kernels/compiler_program_global.cl
> new file mode 100644
> index 0000000..405c53f
> --- /dev/null
> +++ b/kernels/compiler_program_global.cl
> @@ -0,0 +1,68 @@
> +struct config{
> + int s0;
> + global short *s1;
> +};
> +
> +global int i = 5;
> +global int bb = 4;
> +global int *global p;
> +
> +/* array */
> +global int ba[12];
> +
> +/* short/long data type */
> +global short s;
> +global short s2;
> +global long l;
> +
> +/* pointer in constant AS to global */
> +global int * constant px =&i;
> +
> +/* constant pointer relocation */
> +constant int x = 2;
> +constant int y =1;
> +constant int *constant z[2] = {&x, &y};
> +
> +/* structure with pointer field */
> +global struct config c[2] = {{1, &s}, {2, &s2} };
> +
> +
> +global int a = 1;
> +global int b = 2;
> +global int * constant gArr[2]= {&a, &b};
> +
> +__kernel void compiler_program_global0(const global int *src, int dynamic)
> {
> + size_t gid = get_global_id(0);
> + /* global read/write */
> + p = &i;
> + *p += 1;
> +
> + /* pointer in struct memory access */
> + *c[gid&1].s1 += 2;
> +
> + s = 2;
> + l = 3;
> +
> + /* constant AS pointer (points to global) memory access */
> + *px += *z[dynamic];
> +
> + p = &bb;
> + /* array */
> + if (gid < 11)
> + ba[gid] = src[gid];
> +}
> +
> +__kernel void compiler_program_global1(global int *dst, int dynamic) {
> + size_t gid = get_global_id(0);
> +// static global sg;
> +
> + dst[11] = i;
> + dst[12] = *p;
> + dst[13] = s;
> + dst[14] = l;
> + dst[15] = *gArr[dynamic];
> +
> + if (gid < 11)
> + dst[gid] = ba[gid];
> +}
> +
> diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
> index 9dc3fe6..442c6a2 100644
> --- a/src/cl_command_queue.c
> +++ b/src/cl_command_queue.c
> @@ -161,7 +161,7 @@
> cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
> for (i = 0; i < k->arg_n; ++i) {
> int32_t offset; // location of the address in the curbe
> arg_type = interp_kernel_get_arg_type(k->opaque, i);
> - if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
> + if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type ==
> GBE_ARG_CONSTANT_PTR) || !k->args[i].mem)
> continue;
> offset = interp_kernel_get_curbe_offset(k->opaque,
> GBE_CURBE_KERNEL_ARGUMENT, i);
> if (offset < 0)
> diff --git a/src/cl_command_queue_gen7.c
> b/src/cl_command_queue_gen7.c
> index 2edc3be..61ffe7e 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -29,6 +29,7 @@
> #include <assert.h>
> #include <stdio.h>
> #include <string.h>
> +#include <unistd.h>
>
> #define MAX_GROUP_SIZE_IN_HALFSLICE 512
> static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return
> 256+256; }
> @@ -117,6 +118,11 @@ cl_upload_constant_buffer(cl_command_queue
> queue, cl_kernel ker)
> * we need raw_size & aligned_size
> */
> GET_QUEUE_THREAD_GPGPU(queue);
> + // TODO this is only valid for OpenCL 1.2,
> + // under ocl1.2 we gather all constant into one dedicated surface.
> + // but in 2.0 we put program global into one surface, but constants
> + // pass through kernel argument in each separate buffer
> +#if 0
> int32_t arg;
> size_t offset = 0;
> uint32_t raw_size = 0, aligned_size =0;
> @@ -185,6 +191,16 @@ cl_upload_constant_buffer(cl_command_queue
> queue, cl_kernel ker)
> }
> }
> cl_buffer_unmap(bo);
> +#endif
> + // pass the starting of constant address space
> + int32_t constant_addrspace = interp_kernel_get_curbe_offset(ker-
> >opaque, GBE_CURBE_CONSTANT_ADDRSPACE, 0);
> + if (constant_addrspace >= 0) {
> + size_t global_const_size =
> interp_program_get_global_constant_size(ker->program->opaque);
> + if (global_const_size > 0) {
> + *(uint64_t*)(ker->curbe + constant_addrspace) = (uint64_t)ker-
> >program->global_data_ptr;
> + cl_gpgpu_bind_buf(gpgpu, ker->program->global_data,
> constant_addrspace, 0, ALIGN(global_const_size, getpagesize()),
> BTI_CONSTANT);
> + }
> + }
> return 0;
> }
>
> diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
> index e832a53..d75c92c 100644
> --- a/src/cl_gbe_loader.cpp
> +++ b/src/cl_gbe_loader.cpp
> @@ -38,6 +38,8 @@ gbe_program_clean_llvm_resource_cb
> *compiler_program_clean_llvm_resource = NULL;
> gbe_program_new_from_binary_cb *interp_program_new_from_binary =
> NULL;
> gbe_program_get_global_constant_size_cb
> *interp_program_get_global_constant_size = NULL;
> gbe_program_get_global_constant_data_cb
> *interp_program_get_global_constant_data = NULL;
> +gbe_program_get_global_reloc_count_cb
> *interp_program_get_global_reloc_count = NULL;
> +gbe_program_get_global_reloc_table_cb
> *interp_program_get_global_reloc_table = NULL;
> gbe_program_delete_cb *interp_program_delete = NULL;
> gbe_program_get_kernel_num_cb *interp_program_get_kernel_num =
> NULL;
> gbe_program_get_kernel_by_name_cb
> *interp_program_get_kernel_by_name = NULL;
> @@ -109,6 +111,14 @@ struct GbeLoaderInitializer
> if (interp_program_get_global_constant_data == NULL)
> return false;
>
> + interp_program_get_global_reloc_count =
> *(gbe_program_get_global_reloc_count_cb**)dlsym(dlhInterp,
> "gbe_program_get_global_reloc_count");
> + if (interp_program_get_global_reloc_count == NULL)
> + return false;
> +
> + interp_program_get_global_reloc_table =
> *(gbe_program_get_global_reloc_table_cb**)dlsym(dlhInterp,
> "gbe_program_get_global_reloc_table");
> + if (interp_program_get_global_reloc_table == NULL)
> + return false;
> +
> interp_program_delete = *(gbe_program_delete_cb**)dlsym(dlhInterp,
> "gbe_program_delete");
> if (interp_program_delete == NULL)
> return false;
> diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
> index de91c85..28741ff 100644
> --- a/src/cl_gbe_loader.h
> +++ b/src/cl_gbe_loader.h
> @@ -38,6 +38,8 @@ extern gbe_program_clean_llvm_resource_cb
> *compiler_program_clean_llvm_resource;
> extern gbe_program_new_from_binary_cb
> *interp_program_new_from_binary;
> extern gbe_program_get_global_constant_size_cb
> *interp_program_get_global_constant_size;
> extern gbe_program_get_global_constant_data_cb
> *interp_program_get_global_constant_data;
> +extern gbe_program_get_global_reloc_count_cb
> *interp_program_get_global_reloc_count;
> +extern gbe_program_get_global_reloc_table_cb
> *interp_program_get_global_reloc_table;
> extern gbe_program_delete_cb *interp_program_delete;
> extern gbe_program_get_kernel_num_cb
> *interp_program_get_kernel_num;
> extern gbe_program_get_kernel_by_name_cb
> *interp_program_get_kernel_by_name;
> diff --git a/src/cl_program.c b/src/cl_program.c
> index 98b6d51..ffdb2a1 100644
> --- a/src/cl_program.c
> +++ b/src/cl_program.c
> @@ -97,6 +97,9 @@ cl_program_delete(cl_program p)
> cl_kernel_delete(p->ker[i]);
> cl_free(p->ker);
>
> + cl_free(p->global_data_ptr);
> + if (p->global_data_ptr)
> + cl_buffer_unreference(p->global_data);
> /* Program belongs to their parent context */
> cl_context_delete(p->ctx);
>
> @@ -191,6 +194,42 @@ LOCAL cl_bool headerCompare(const unsigned char
> *BufPtr, BINARY_HEADER_INDEX ind
> #define isLLVM_LIB(BufPtr) headerCompare(BufPtr, BHI_LIBRARY)
> #define isGenBinary(BufPtr) headerCompare(BufPtr, BHI_GEN_BINARY)
>
> +static cl_int get_program_global_data(cl_program prog) {
> + cl_buffer_mgr bufmgr = NULL;
> + bufmgr = cl_context_get_bufmgr(prog->ctx);
> + assert(bufmgr);
> + size_t const_size = interp_program_get_global_constant_size(prog-
> >opaque);
> + if (const_size == 0) return CL_SUCCESS;
> +
> + int page_size = getpagesize();
> + size_t alignedSz = ALIGN(const_size, page_size);
> + char * p = (char*)cl_aligned_malloc(alignedSz, page_size);
> + prog->global_data_ptr = p;
> + interp_program_get_global_constant_data(prog->opaque, (char*)p);
> +
> + prog->global_data = cl_buffer_alloc_userptr(bufmgr, "program global
> data", p, alignedSz, 0);
> + cl_buffer_set_softpin_offset(prog->global_data, (size_t)p);
> +
> + uint32_t reloc_count = interp_program_get_global_reloc_count(prog-
> >opaque);
> + if (reloc_count > 0) {
> + uint32_t x;
> + struct RelocEntry {int refOffset; int defOffset;};
> + char *temp = (char*) malloc(reloc_count *sizeof(int)*2);
> + interp_program_get_global_reloc_table(prog->opaque, temp);
> + for (x = 0; x < reloc_count; x++) {
> + int ref_offset = ((struct RelocEntry *)temp)[x].refOffset;
> + *(uint64_t*)&(p[ref_offset]) = ((struct RelocEntry *)temp)[x].defOffset
> + (uint64_t)p;
> + }
> + free(temp);
> + }
> +#if 0
> + int x = 0;
> + for (x = 0; x < const_size; x++) {
> + printf("offset %d data: %x\n", x, (unsigned)p[x]);
> + }
> +#endif
> + return CL_SUCCESS;
> +}
> LOCAL cl_program
> cl_program_create_from_binary(cl_context ctx,
> cl_uint num_devices,
> @@ -603,6 +642,9 @@ cl_program_build(cl_program p, const char *options)
> memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
> copyed += sz;
> }
> + if ((err = get_program_global_data(p)) != CL_SUCCESS)
> + goto error;
> +
> p->is_built = 1;
> p->build_status = CL_BUILD_SUCCESS;
> return CL_SUCCESS;
> @@ -697,6 +739,10 @@ cl_program_link(cl_context context,
> memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
> copyed += sz;
> }
> +
> + if ((err = get_program_global_data(p)) != CL_SUCCESS)
> + goto error;
> +
> done:
> if(p) p->is_built = 1;
> if(p) p->build_status = CL_BUILD_SUCCESS;
> diff --git a/src/cl_program.h b/src/cl_program.h
> index 63ad16d..083d66a 100644
> --- a/src/cl_program.h
> +++ b/src/cl_program.h
> @@ -54,6 +54,8 @@ struct _cl_program {
> cl_kernel *ker; /* All kernels included by the OCL file */
> cl_program prev, next; /* We chain the programs together */
> cl_context ctx; /* Its parent context */
> + cl_buffer global_data;
> + char * global_data_ptr;
> char *bin; /* The program copied verbatim */
> size_t bin_sz; /* Its size in memory */
> char *source; /* Program sources */
> diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
> index 78442cb..0fca450 100644
> --- a/utests/CMakeLists.txt
> +++ b/utests/CMakeLists.txt
> @@ -191,6 +191,7 @@ set (utests_sources
> compiler_bool_cross_basic_block.cpp
> compiler_private_const.cpp
> compiler_private_data_overflow.cpp
> + compiler_program_global.cpp
> compiler_getelementptr_bitcast.cpp
> compiler_sub_group_any.cpp
> compiler_sub_group_all.cpp
> diff --git a/utests/compiler_program_global.cpp
> b/utests/compiler_program_global.cpp
> new file mode 100644
> index 0000000..ef7c655
> --- /dev/null
> +++ b/utests/compiler_program_global.cpp
> @@ -0,0 +1,80 @@
> +#include "utest_helper.hpp"
> +#include "utest_file_map.hpp"
> +
> +static int init_program(const char* name, cl_context ctx, cl_program *pg )
> +{
> + cl_int err;
> + char* ker_path = cl_do_kiss_path(name, device);
> +
> + cl_file_map_t *fm = cl_file_map_new();
> + err = cl_file_map_open(fm, ker_path);
> + if(err != CL_FILE_MAP_SUCCESS)
> + OCL_ASSERT(0);
> + const char *src = cl_file_map_begin(fm);
> +
> + *pg = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
> + free(ker_path);
> + cl_file_map_delete(fm);
> + return 0;
> +
> +}
> +
> +void compiler_program_global()
> +{
> + const int n = 16;
> + int cpu_src[16];
> + cl_int err;
> +
> + // Setup kernel and buffers
> + cl_program program;
> + init_program("compiler_program_global.cl", ctx, &program);
> + OCL_CALL (clBuildProgram, program, 1, &device, "-cl-std=CL2.0", NULL,
> NULL);
> +
> + cl_kernel k0 = clCreateKernel(program, "compiler_program_global0",
> &err);
> + assert(err == CL_SUCCESS);
> + cl_kernel k1 = clCreateKernel(program, "compiler_program_global1",
> &err);
> + assert(err == CL_SUCCESS);
> +
> + OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
> + OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
> +
> + OCL_CALL (clSetKernelArg, k0, 0, sizeof(cl_mem), &buf[0]);
> + OCL_CALL (clSetKernelArg, k1, 0, sizeof(cl_mem), &buf[1]);
> +
> + int dynamic = 1;
> + OCL_CALL (clSetKernelArg, k0, 1, sizeof(cl_int), &dynamic);
> + OCL_CALL (clSetKernelArg, k1, 1, sizeof(cl_int), &dynamic);
> +
> + globals[0] = 16;
> + locals[0] = 16;
> +
> + OCL_MAP_BUFFER(0);
> + for (int i = 0; i < n; ++i)
> + cpu_src[i] = ((int*)buf_data[0])[i] = i;
> + OCL_UNMAP_BUFFER(0);
> +
> + // Run the kernel on GPU
> + OCL_CALL (clEnqueueNDRangeKernel, queue, k0, 1, NULL, globals, locals, 0,
> NULL, NULL);
> + OCL_CALL (clEnqueueNDRangeKernel, queue, k1, 1, NULL, globals, locals, 0,
> NULL, NULL);
> +
> + // Compare
> + OCL_MAP_BUFFER(1);
> + for (int32_t i = 0; i < n; ++i) {
> +// printf("i=%d dst=%d\n", i, ((int*)buf_data[1])[i]);
> + switch(i) {
> + default: OCL_ASSERT(((int*)buf_data[1])[i] == i); break;
> + case 11: OCL_ASSERT(((int*)buf_data[1])[i] == 7); break;
> + case 12: OCL_ASSERT(((int*)buf_data[1])[i] == 4); break;
> + case 13: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
> + case 14: OCL_ASSERT(((int*)buf_data[1])[i] == 3); break;
> + case 15: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
> + }
> + }
> + OCL_UNMAP_BUFFER(1);
> + clReleaseKernel(k0);
> + clReleaseKernel(k1);
> + clReleaseProgram(program);
> +}
> +
> +MAKE_UTEST_FROM_FUNCTION(compiler_program_global);
> +
> --
> 2.4.1
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list