[Beignet] [PATCH 1/4] GBE: Implement new constant solution for ocl2

xionghu.luo at intel.com xionghu.luo at intel.com
Mon Jan 18 19:29:06 PST 2016


From: Ruiling Song <ruiling.song at intel.com>

Different from ocl 1.2, which gather all constant into one surface.
ocl2 only gather program global/constant into one surface. But
keep other constant passed through kernel argument in separate buffer.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 15 ++++--
 backend/src/backend/program.cpp            | 21 +++++++-
 backend/src/backend/program.h              |  6 +++
 backend/src/backend/program.hpp            |  4 ++
 backend/src/gbe_bin_interpreter.cpp        |  2 +
 backend/src/ir/profile.cpp                 |  4 +-
 backend/src/ir/profile.hpp                 |  3 +-
 backend/src/ir/unit.hpp                    | 50 ++++++++++++++++++
 backend/src/llvm/llvm_gen_backend.cpp      | 84 ++++++++++++++++++------------
 kernels/compiler_program_global.cl         | 68 ++++++++++++++++++++++++
 src/cl_command_queue.c                     |  2 +-
 src/cl_command_queue_gen7.c                | 16 ++++++
 src/cl_gbe_loader.cpp                      | 10 ++++
 src/cl_gbe_loader.h                        |  2 +
 src/cl_program.c                           | 46 ++++++++++++++++
 src/cl_program.h                           |  2 +
 utests/CMakeLists.txt                      |  1 +
 utests/compiler_program_global.cpp         | 80 ++++++++++++++++++++++++++++
 18 files changed, 374 insertions(+), 42 deletions(-)
 create mode 100644 kernels/compiler_program_global.cl
 create mode 100644 utests/compiler_program_global.cpp

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d19f985..6ef077b 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -3596,6 +3596,13 @@ namespace gbe
     LoadInstructionPattern(void) : SelectionPattern(1, 1) {
        this->opcodes.push_back(ir::OP_LOAD);
     }
+    bool isReadConstantLegacy(const ir::LoadInstruction &load) const {
+      ir::AddressMode AM = load.getAddressMode();
+      ir::AddressSpace AS = load.getAddressSpace();
+      if (AM != ir::AM_Stateless && AS == ir::MEM_CONSTANT)
+        return true;
+      return false;
+    }
     void untypedReadStateless(Selection::Opaque &sel,
                               GenRegister addr,
                               vector<GenRegister> &dst
@@ -3678,7 +3685,7 @@ namespace gbe
             unsigned SI = insn.getSurfaceIndex();
             sel.UNTYPED_READ(addr, dst.data(), valueNum, GenRegister::immud(SI), btiTemp);
           }
-        } else if (addrSpace == ir::MEM_LOCAL || addrSpace == ir::MEM_CONSTANT ) {
+        } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn) ) {
           // stateless mode, local/constant still use bti access
           unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
           GenRegister addrDW = addr;
@@ -3842,7 +3849,7 @@ namespace gbe
             b = GenRegister::immud(insn.getSurfaceIndex());
           }
           read64Legacy(sel, addr, dst, b, btiTemp);
-        } else if (addrSpace == MEM_LOCAL || addrSpace == MEM_CONSTANT) {
+        } else if (addrSpace == MEM_LOCAL || isReadConstantLegacy(insn)) {
           GenRegister b = GenRegister::immud(addrSpace == MEM_LOCAL? 0xfe : BTI_CONSTANT);
           GenRegister addrDW = addr;
           if (addrBytes == 8)
@@ -4063,7 +4070,7 @@ namespace gbe
           unsigned SI = insn.getSurfaceIndex();
           sel.BYTE_GATHER(dst, addr, elemSize, GenRegister::immud(SI), btiTemp);
         }
-      } else if (addrSpace == ir::MEM_LOCAL || addrSpace == ir::MEM_CONSTANT) {
+      } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn)) {
         unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
         GenRegister addrDW = addr;
         if (addrBytes == 8) {
@@ -4207,7 +4214,7 @@ namespace gbe
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(sel, type);
 
-      if (addrSpace == MEM_CONSTANT) {
+      if (isReadConstantLegacy(insn)) {
         // XXX TODO read 64bit constant through constant cache
         // Per HW Spec, constant cache messages can read at least DWORD data.
         // So, byte/short data type, we have to read through data cache.
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 36af95f..ce4f927 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -104,11 +104,13 @@ namespace gbe {
     return it->offset; // we found it!
   }
 
-  Program::Program(void) : constantSet(NULL) {}
+  Program::Program(void) : constantSet(NULL),
+                           relocTable(NULL) {}
   Program::~Program(void) {
     for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it)
       GBE_DELETE(it->second);
     if (constantSet) delete constantSet;
+    if (relocTable) delete relocTable;
   }
 
 #ifdef GBE_COMPILER_AVAILABLE
@@ -151,6 +153,7 @@ namespace gbe {
 
   bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
     constantSet = new ir::ConstantSet(unit.getConstantSet());
+    relocTable = new ir::RelocTable(unit.getRelocTable());
     const auto &set = unit.getFunctionSet();
     const uint32_t kernelNum = set.size();
     if (OCL_OUTPUT_GEN_IR) std::cout << unit;
@@ -978,6 +981,18 @@ namespace gbe {
     program->getGlobalConstantData(mem);
   }
 
+  static size_t programGetGlobalRelocCount(gbe_program gbeProgram) {
+    if (gbeProgram == NULL) return 0;
+    const gbe::Program *program = (const gbe::Program*) gbeProgram;
+    return program->getGlobalRelocCount();
+  }
+
+  static void programGetGlobalRelocTable(gbe_program gbeProgram, char *mem) {
+    if (gbeProgram == NULL) return;
+    const gbe::Program *program = (const gbe::Program*) gbeProgram;
+    program->getGlobalRelocTable(mem);
+  }
+
   static uint32_t programGetKernelNum(gbe_program gbeProgram) {
     if (gbeProgram == NULL) return 0;
     const gbe::Program *program = (const gbe::Program*) gbeProgram;
@@ -1220,6 +1235,8 @@ GBE_EXPORT_SYMBOL gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm = NU
 GBE_EXPORT_SYMBOL gbe_program_build_from_llvm_cb *gbe_program_build_from_llvm = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_count_cb *gbe_program_get_global_reloc_count = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_table_cb *gbe_program_get_global_reloc_table = NULL;
 GBE_EXPORT_SYMBOL gbe_program_clean_llvm_resource_cb *gbe_program_clean_llvm_resource = NULL;
 GBE_EXPORT_SYMBOL gbe_program_delete_cb *gbe_program_delete = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num = NULL;
@@ -1269,6 +1286,8 @@ namespace gbe
       gbe_program_check_opt = gbe::programCheckOption;
       gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
       gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+      gbe_program_get_global_reloc_count = gbe::programGetGlobalRelocCount;
+      gbe_program_get_global_reloc_table = gbe::programGetGlobalRelocTable;
       gbe_program_clean_llvm_resource = gbe::programCleanLlvmResource;
       gbe_program_delete = gbe::programDelete;
       gbe_program_get_kernel_num = gbe::programGetKernelNum;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 86b3177..03150bc 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -99,6 +99,7 @@ enum gbe_curbe_type {
   GBE_CURBE_BLOCK_IP,
   GBE_CURBE_DW_BLOCK_IP,
   GBE_CURBE_THREAD_NUM,
+  GBE_CURBE_CONSTANT_ADDRSPACE,
   GBE_GEN_REG,
 };
 
@@ -243,6 +244,11 @@ extern gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_
 typedef void (gbe_program_get_global_constant_data_cb)(gbe_program gbeProgram, char *mem);
 extern gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data;
 
+typedef size_t (gbe_program_get_global_reloc_count_cb)(gbe_program gbeProgram);
+extern gbe_program_get_global_reloc_count_cb *gbe_program_get_global_reloc_count;
+
+typedef void (gbe_program_get_global_reloc_table_cb)(gbe_program gbeProgram, char *mem);
+extern gbe_program_get_global_reloc_table_cb *gbe_program_get_global_reloc_table;
 /*! Get the size of defined samplers */
 typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel);
 extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index efe192f..e58ddf0 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -280,6 +280,8 @@ namespace gbe {
     /*! Get the content of global constant arrays */
     void getGlobalConstantData(char *mem) const { constantSet->getData(mem); }
 
+    uint32_t getGlobalRelocCount(void) const { return relocTable->getCount(); }
+    void getGlobalRelocTable(char *p) const { relocTable->getData(p); }
     static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G');
     static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P');
 
@@ -309,6 +311,8 @@ namespace gbe {
     map<std::string, Kernel*> kernels;
     /*! Global (constants) outside any kernel */
     ir::ConstantSet *constantSet;
+    /*! relocation table */
+    ir::RelocTable *relocTable;
     /*! Use custom allocators */
     GBE_CLASS(Program);
   };
diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
index 4594a0a..0957092 100644
--- a/backend/src/gbe_bin_interpreter.cpp
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -61,6 +61,8 @@ struct BinInterpCallBackInitializer
     gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
     gbe_program_delete = gbe::programDelete;
     gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+    gbe_program_get_global_reloc_count = gbe::programGetGlobalRelocCount;
+    gbe_program_get_global_reloc_table = gbe::programGetGlobalRelocTable;
     gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
     gbe_kernel_get_image_data = gbe::kernelGetImageData;
     gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 3ead8a7..0699167 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -43,7 +43,8 @@ namespace ir {
         "zero", "one",
         "retVal",
         "printf_buffer_pointer", "printf_index_buffer_pointer",
-        "dwblockip"
+        "dwblockip",
+        "constant_addrspace_start"
     };
 
 #if GBE_DEBUG
@@ -86,6 +87,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_QWORD, printfbptr, 1, GBE_CURBE_PRINTF_BUF_POINTER);
       DECL_NEW_REG(FAMILY_QWORD, printfiptr, 1, GBE_CURBE_PRINTF_INDEX_POINTER);
       DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0, GBE_CURBE_DW_BLOCK_IP);
+      DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1, GBE_CURBE_CONSTANT_ADDRSPACE);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index a8445c4..79761d4 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -71,7 +71,8 @@ namespace ir {
     static const Register printfbptr = Register(27); // printf buffer address .
     static const Register printfiptr = Register(28); // printf index buffer address.
     static const Register dwblockip = Register(29);  // blockip
-    static const uint32_t regNum = 30;             // number of special registers
+    static const Register constant_addrspace = Register(30);  // starting address of program-scope constant
+    static const uint32_t regNum = 31;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 8ff858d..b8df145 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -27,6 +27,7 @@
 #include "ir/constant.hpp"
 #include "ir/register.hpp"
 #include "sys/map.hpp"
+#include <string.h>
 
 namespace gbe {
 namespace ir {
@@ -37,6 +38,52 @@ namespace ir {
   /*! Complete unit of compilation. It contains a set of functions and a set of
    *  constant the functions may refer to.
    */
+  struct RelocEntry {
+    RelocEntry(unsigned int rO, unsigned int dO):
+      refOffset(rO),
+      defOffset(dO) {}
+
+    unsigned int refOffset;
+    unsigned int defOffset;
+  };
+
+  class RelocTable : public NonCopyable, public Serializable
+  {
+    public:
+      void addEntry(unsigned refOffset, unsigned defOffset) {
+        entries.push_back(RelocEntry(refOffset, defOffset));
+      }
+      RelocTable() {}
+      RelocTable(const RelocTable& other) : Serializable(other),
+                                            entries(other.entries) {}
+      uint32_t getCount() { return entries.size(); }
+      void getData(char *p) {
+        if (entries.size() > 1 && p)
+          memcpy(p, entries.data(), entries.size()*sizeof(RelocEntry));
+      }
+    static const uint32_t magic_begin = TO_MAGIC('R', 'E', 'L', 'C');
+    static const uint32_t magic_end = TO_MAGIC('C', 'L', 'E', 'R');
+
+    /* format:
+       magic_begin     |
+       const_data_size |
+       const_data      |
+       constant_1_size |
+       constant_1      |
+       ........        |
+       constant_n_size |
+       constant_n      |
+       magic_end       |
+       total_size
+    */
+
+    /*! Implements the serialization. */
+    virtual size_t serializeToBin(std::ostream& outs) { return 0;}
+    virtual size_t deserializeFromBin(std::istream& ins) { return 0; }
+    private:
+      vector<RelocEntry> entries;
+      GBE_CLASS(RelocTable);
+  };
   class Unit : public NonCopyable
   {
   public:
@@ -70,6 +117,8 @@ namespace ir {
     }
     /*! Return the constant set */
     ConstantSet& getConstantSet(void) { return constantSet; }
+    const RelocTable& getRelocTable(void) const  { return relocTable; }
+    RelocTable& getRelocTable(void)   { return relocTable; }
     /*! Return the constant set */
     const ConstantSet& getConstantSet(void) const { return constantSet; }
     void setValid(bool value) { valid = value; }
@@ -78,6 +127,7 @@ namespace ir {
     friend class ContextInterface; //!< Can free modify the unit
     FunctionSet functions; //!< All the defined functions
     ConstantSet constantSet; //!< All the constants defined in the unit
+    RelocTable relocTable;
     PointerSize pointerSize; //!< Size shared by all pointers
     GBE_CLASS(Unit);
     bool valid;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index cb47097..d23a598 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -506,7 +506,7 @@ namespace gbe
 
     virtual bool doInitialization(Module &M);
     /*! helper function for parsing global constant data */
-    void getConstantData(const Constant * c, void* mem, uint32_t& offset) const;
+    void getConstantData(const Constant * c, void* mem, uint32_t& offset, vector<ir::RelocEntry> &) const;
     void collectGlobalConstant(void) const;
     ir::ImmediateIndex processConstantImmIndex(Constant *CPV, int32_t index = 0u);
     const ir::Immediate &processConstantImm(Constant *CPV, int32_t index = 0u);
@@ -1111,8 +1111,9 @@ namespace gbe
           break;
         }
         case 2:
-          new_bti = BTI_CONSTANT;
-
+          // ocl 2.0, constant pointer use separate bti
+          new_bti = btiBase;
+          incBtiBase();
           break;
         case 3:
           new_bti = BTI_LOCAL;
@@ -1349,22 +1350,34 @@ namespace gbe
     return;
   }
 
-  void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset) const {
+  void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset, vector<ir::RelocEntry> &relocs) const {
     Type * type = c->getType();
     Type::TypeID id = type->getTypeID();
 
     GBE_ASSERT(c);
+    if (isa<GlobalVariable>(c)) {
+      const GlobalVariable *GV = cast<GlobalVariable>(c);
+
+      unsigned valueAddrSpace = GV->getType()->getAddressSpace();
+      ir::Constant cc = unit.getConstantSet().getConstant(c->getName());
+      unsigned int defOffset = cc.getOffset();
+
+      relocs.push_back(ir::RelocEntry(offset, defOffset));
+      uint32_t size = getTypeByteSize(unit, type);
+      memset((char*)mem+offset, 0, size);
+      offset += size;
+      return;
+    }
     if(isa<UndefValue>(c)) {
       uint32_t size = getTypeByteSize(unit, type);
       offset += size;
       return;
-    } else if(isa<ConstantAggregateZero>(c)) {
+    } else if(isa<ConstantAggregateZero>(c) || isa<ConstantPointerNull>(c)) {
       uint32_t size = getTypeByteSize(unit, type);
       memset((char*)mem+offset, 0, size);
       offset += size;
       return;
     }
-
     switch(id) {
       case Type::TypeID::StructTyID:
         {
@@ -1382,7 +1395,7 @@ namespace gbe
             offset += padding/8;
             const Constant* sub = cast<Constant>(c->getOperand(op));
             GBE_ASSERT(sub);
-            getConstantData(sub, mem, offset);
+            getConstantData(sub, mem, offset, relocs);
           }
           break;
         }
@@ -1401,7 +1414,7 @@ namespace gbe
             uint32_t ops = c->getNumOperands();
             for(uint32_t op = 0; op < ops; ++op) {
               Constant * ca = dyn_cast<Constant>(c->getOperand(op));
-              getConstantData(ca, mem, offset);
+              getConstantData(ca, mem, offset, relocs);
               offset += padding;
             }
           }
@@ -1449,21 +1462,34 @@ namespace gbe
     const Module::GlobalListType &globalList = TheModule->getGlobalList();
     for(auto i = globalList.begin(); i != globalList.end(); i ++) {
       const GlobalVariable &v = *i;
-      if(!v.isConstantUsed()) continue;
       const char *name = v.getName().data();
       unsigned addrSpace = v.getType()->getAddressSpace();
-      if(addrSpace == ir::AddressSpace::MEM_CONSTANT || v.isConstant()) {
-        GBE_ASSERT(v.hasInitializer());
-        const Constant *c = v.getInitializer();
-        Type * type = c->getType();
+
+      vector<ir::RelocEntry> relocs;
+      if(addrSpace == 2 /* __constant */
+          || addrSpace == 1
+          || addrSpace == 0) {
+        Type * type = v.getValueType();
 
         uint32_t size = getTypeByteSize(unit, type);
         void* mem = malloc(size);
         uint32_t offset = 0;
-        getConstantData(c, mem, offset);
+        if (v.hasInitializer()) {
+          const Constant *c = v.getInitializer();
+          getConstantData(c, mem, offset, relocs);
+        } else {
+          memset(mem, 0, size);
+        }
         uint32_t alignment = getAlignmentByte(unit, type);
         unit.newConstant((char *)mem, name, size, alignment);
         free(mem);
+        uint32_t refOffset = unit.getConstantSet().getConstant(name).getOffset();
+        for (uint32_t k = 0; k < relocs.size(); k++) {
+          unit.getRelocTable().addEntry(
+                               refOffset + relocs[k].refOffset,
+                               relocs[k].defOffset
+                               );
+        }
       }
     }
   }
@@ -2562,33 +2588,23 @@ namespace gbe
         this->newRegister(const_cast<GlobalVariable*>(&v));
         ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
         ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(oldSlm + padding/8, getType(ctx, v.getType())));
-      } else if(addrSpace == ir::MEM_CONSTANT || v.isConstant()) {
-        GBE_ASSERT(v.hasInitializer());
-        this->newRegister(const_cast<GlobalVariable*>(&v));
-        ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
-        ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
-        ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
-      } else {
+      } else if(addrSpace == ir::MEM_CONSTANT
+             || addrSpace == ir::MEM_GLOBAL
+             || v.isConstant()) {
         if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
           ctx.getFunction().getPrintfSet()->setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
           regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v));
         } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
           ctx.getFunction().getPrintfSet()->setIndexBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
           regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
-        } else if(v.getName().str().substr(0, 4) == ".str") {
-          /* When there are multi printf statements in multi kernel fucntions within the same
-             translate unit, if they have the same sting parameter, such as
-             kernel_func1 () {
-               printf("Line is %d\n", line_num1);
-             }
-             kernel_func2 () {
-               printf("Line is %d\n", line_num2);
-             }
-             The Clang will just generate one global string named .strXXX to represent "Line is %d\n"
-             So when translating the kernel_func1, we can not unref that global var, so we will
-             get here. Just ignore it to avoid assert. */
         } else {
-          GBE_ASSERT(0 && "Unsupported private memory access pattern");
+          this->newRegister(const_cast<GlobalVariable*>(&v));
+          ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
+          ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
+          ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
+          if (!legacyMode) {
+            ctx.ADD(getType(ctx, v.getType()), reg, ir::ocl::constant_addrspace, reg);
+          }
         }
       }
     }
diff --git a/kernels/compiler_program_global.cl b/kernels/compiler_program_global.cl
new file mode 100644
index 0000000..405c53f
--- /dev/null
+++ b/kernels/compiler_program_global.cl
@@ -0,0 +1,68 @@
+struct config{
+  int s0;
+  global short *s1;
+};
+
+global int i = 5;
+global int bb = 4;
+global int *global p;
+
+/* array */
+global int ba[12];
+
+/* short/long data type */
+global short s;
+global short s2;
+global long l;
+
+/* pointer in constant AS to global */
+global int * constant px =&i;
+
+/* constant pointer relocation */
+constant int x = 2;
+constant int y =1;
+constant int *constant z[2] = {&x, &y};
+
+/* structure with pointer field */
+global struct config c[2] = {{1, &s}, {2, &s2} };
+
+
+global int a = 1;
+global int b = 2;
+global int * constant gArr[2]= {&a, &b};
+
+__kernel void compiler_program_global0(const global int *src, int dynamic) {
+  size_t gid = get_global_id(0);
+  /* global read/write */
+  p = &i;
+  *p += 1;
+
+  /* pointer in struct memory access */
+  *c[gid&1].s1 += 2;
+
+  s = 2;
+  l = 3;
+
+  /* constant AS pointer (points to global) memory access */
+  *px += *z[dynamic];
+
+  p = &bb;
+  /* array */
+  if (gid < 11)
+    ba[gid] = src[gid];
+}
+
+__kernel void compiler_program_global1(global int *dst, int dynamic) {
+  size_t gid = get_global_id(0);
+//  static global sg;
+
+  dst[11] = i;
+  dst[12] = *p;
+  dst[13] = s;
+  dst[14] = l;
+  dst[15] = *gArr[dynamic];
+
+  if (gid < 11)
+    dst[gid] = ba[gid];
+}
+
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 9dc3fe6..442c6a2 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -161,7 +161,7 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   for (i = 0; i < k->arg_n; ++i) {
     int32_t offset; // location of the address in the curbe
     arg_type = interp_kernel_get_arg_type(k->opaque, i);
-    if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
+    if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR) || !k->args[i].mem)
       continue;
     offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
     if (offset < 0)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 2edc3be..61ffe7e 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -29,6 +29,7 @@
 #include <assert.h>
 #include <stdio.h>
 #include <string.h>
+#include <unistd.h>
 
 #define MAX_GROUP_SIZE_IN_HALFSLICE   512
 static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+256; }
@@ -117,6 +118,11 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
    * we need raw_size & aligned_size
    */
   GET_QUEUE_THREAD_GPGPU(queue);
+  // TODO this is only valid for OpenCL 1.2,
+  // under ocl1.2 we gather all constant into one dedicated surface.
+  // but in 2.0 we put program global into one surface, but constants
+  // pass through kernel argument in each separate buffer
+#if 0
   int32_t arg;
   size_t offset = 0;
   uint32_t raw_size = 0, aligned_size =0;
@@ -185,6 +191,16 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
     }
   }
   cl_buffer_unmap(bo);
+#endif
+  // pass the starting of constant address space
+  int32_t constant_addrspace = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_CONSTANT_ADDRSPACE, 0);
+  if (constant_addrspace >= 0) {
+    size_t global_const_size = interp_program_get_global_constant_size(ker->program->opaque);
+    if (global_const_size > 0) {
+      *(uint64_t*)(ker->curbe + constant_addrspace) = (uint64_t)ker->program->global_data_ptr;
+      cl_gpgpu_bind_buf(gpgpu, ker->program->global_data, constant_addrspace, 0, ALIGN(global_const_size, getpagesize()), BTI_CONSTANT);
+    }
+  }
   return 0;
 }
 
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
index e832a53..d75c92c 100644
--- a/src/cl_gbe_loader.cpp
+++ b/src/cl_gbe_loader.cpp
@@ -38,6 +38,8 @@ gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource = NULL;
 gbe_program_new_from_binary_cb *interp_program_new_from_binary = NULL;
 gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size = NULL;
 gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data = NULL;
+gbe_program_get_global_reloc_count_cb *interp_program_get_global_reloc_count = NULL;
+gbe_program_get_global_reloc_table_cb *interp_program_get_global_reloc_table = NULL;
 gbe_program_delete_cb *interp_program_delete = NULL;
 gbe_program_get_kernel_num_cb *interp_program_get_kernel_num = NULL;
 gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name = NULL;
@@ -109,6 +111,14 @@ struct GbeLoaderInitializer
     if (interp_program_get_global_constant_data == NULL)
       return false;
 
+    interp_program_get_global_reloc_count = *(gbe_program_get_global_reloc_count_cb**)dlsym(dlhInterp, "gbe_program_get_global_reloc_count");
+    if (interp_program_get_global_reloc_count == NULL)
+      return false;
+
+    interp_program_get_global_reloc_table = *(gbe_program_get_global_reloc_table_cb**)dlsym(dlhInterp, "gbe_program_get_global_reloc_table");
+    if (interp_program_get_global_reloc_table == NULL)
+      return false;
+
     interp_program_delete = *(gbe_program_delete_cb**)dlsym(dlhInterp, "gbe_program_delete");
     if (interp_program_delete == NULL)
       return false;
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
index de91c85..28741ff 100644
--- a/src/cl_gbe_loader.h
+++ b/src/cl_gbe_loader.h
@@ -38,6 +38,8 @@ extern gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource;
 extern gbe_program_new_from_binary_cb *interp_program_new_from_binary;
 extern gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size;
 extern gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data;
+extern gbe_program_get_global_reloc_count_cb *interp_program_get_global_reloc_count;
+extern gbe_program_get_global_reloc_table_cb *interp_program_get_global_reloc_table;
 extern gbe_program_delete_cb *interp_program_delete;
 extern gbe_program_get_kernel_num_cb *interp_program_get_kernel_num;
 extern gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name;
diff --git a/src/cl_program.c b/src/cl_program.c
index 98b6d51..ffdb2a1 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -97,6 +97,9 @@ cl_program_delete(cl_program p)
     cl_kernel_delete(p->ker[i]);
   cl_free(p->ker);
 
+  cl_free(p->global_data_ptr);
+  if (p->global_data_ptr)
+    cl_buffer_unreference(p->global_data);
   /* Program belongs to their parent context */
   cl_context_delete(p->ctx);
 
@@ -191,6 +194,42 @@ LOCAL cl_bool headerCompare(const unsigned char *BufPtr, BINARY_HEADER_INDEX ind
 #define isLLVM_LIB(BufPtr)  headerCompare(BufPtr, BHI_LIBRARY)
 #define isGenBinary(BufPtr) headerCompare(BufPtr, BHI_GEN_BINARY)
 
+static cl_int get_program_global_data(cl_program prog) {
+  cl_buffer_mgr bufmgr = NULL;
+  bufmgr = cl_context_get_bufmgr(prog->ctx);
+  assert(bufmgr);
+  size_t const_size = interp_program_get_global_constant_size(prog->opaque);
+  if (const_size == 0) return CL_SUCCESS;
+
+  int page_size = getpagesize();
+  size_t alignedSz = ALIGN(const_size, page_size);
+  char * p = (char*)cl_aligned_malloc(alignedSz, page_size);
+  prog->global_data_ptr = p;
+  interp_program_get_global_constant_data(prog->opaque, (char*)p);
+
+  prog->global_data = cl_buffer_alloc_userptr(bufmgr, "program global data", p, alignedSz, 0);
+  cl_buffer_set_softpin_offset(prog->global_data, (size_t)p);
+
+  uint32_t reloc_count = interp_program_get_global_reloc_count(prog->opaque);
+  if (reloc_count > 0) {
+    uint32_t x;
+    struct RelocEntry {int refOffset; int defOffset;};
+    char *temp = (char*) malloc(reloc_count *sizeof(int)*2);
+    interp_program_get_global_reloc_table(prog->opaque, temp);
+    for (x = 0; x < reloc_count; x++) {
+      int ref_offset = ((struct RelocEntry *)temp)[x].refOffset;
+      *(uint64_t*)&(p[ref_offset]) = ((struct RelocEntry *)temp)[x].defOffset + (uint64_t)p;
+    }
+    free(temp);
+  }
+#if 0
+  int x = 0;
+  for (x = 0; x < const_size; x++) {
+    printf("offset %d data: %x\n", x, (unsigned)p[x]);
+  }
+#endif
+  return CL_SUCCESS;
+}
 LOCAL cl_program
 cl_program_create_from_binary(cl_context             ctx,
                               cl_uint                num_devices,
@@ -603,6 +642,9 @@ cl_program_build(cl_program p, const char *options)
     memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
     copyed += sz;
   }
+  if ((err = get_program_global_data(p)) != CL_SUCCESS)
+    goto error;
+
   p->is_built = 1;
   p->build_status = CL_BUILD_SUCCESS;
   return CL_SUCCESS;
@@ -697,6 +739,10 @@ cl_program_link(cl_context            context,
     memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
     copyed += sz;
   }
+
+  if ((err = get_program_global_data(p)) != CL_SUCCESS)
+    goto error;
+
 done:
   if(p) p->is_built = 1;
   if(p) p->build_status = CL_BUILD_SUCCESS;
diff --git a/src/cl_program.h b/src/cl_program.h
index 63ad16d..083d66a 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -54,6 +54,8 @@ struct _cl_program {
   cl_kernel *ker;         /* All kernels included by the OCL file */
   cl_program prev, next;  /* We chain the programs together */
   cl_context ctx;         /* Its parent context */
+  cl_buffer  global_data;
+  char * global_data_ptr;
   char *bin;              /* The program copied verbatim */
   size_t bin_sz;          /* Its size in memory */
   char *source;           /* Program sources */
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 78442cb..0fca450 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -191,6 +191,7 @@ set (utests_sources
   compiler_bool_cross_basic_block.cpp
   compiler_private_const.cpp
   compiler_private_data_overflow.cpp
+  compiler_program_global.cpp
   compiler_getelementptr_bitcast.cpp
   compiler_sub_group_any.cpp
   compiler_sub_group_all.cpp
diff --git a/utests/compiler_program_global.cpp b/utests/compiler_program_global.cpp
new file mode 100644
index 0000000..ef7c655
--- /dev/null
+++ b/utests/compiler_program_global.cpp
@@ -0,0 +1,80 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+
+static int init_program(const char* name, cl_context ctx, cl_program *pg )
+{
+  cl_int err;
+  char* ker_path = cl_do_kiss_path(name, device);
+
+  cl_file_map_t *fm = cl_file_map_new();
+  err = cl_file_map_open(fm, ker_path);
+  if(err != CL_FILE_MAP_SUCCESS)
+    OCL_ASSERT(0);
+  const char *src = cl_file_map_begin(fm);
+
+  *pg = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
+  free(ker_path);
+  cl_file_map_delete(fm);
+  return 0;
+
+}
+
+void compiler_program_global()
+{
+  const int n = 16;
+  int cpu_src[16];
+  cl_int err;
+
+  // Setup kernel and buffers
+  cl_program program;
+  init_program("compiler_program_global.cl", ctx, &program);
+  OCL_CALL (clBuildProgram, program, 1, &device, "-cl-std=CL2.0", NULL, NULL);
+
+  cl_kernel k0 = clCreateKernel(program, "compiler_program_global0", &err);
+  assert(err == CL_SUCCESS);
+  cl_kernel k1 = clCreateKernel(program, "compiler_program_global1", &err);
+  assert(err == CL_SUCCESS);
+
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+
+  OCL_CALL (clSetKernelArg, k0, 0, sizeof(cl_mem), &buf[0]);
+  OCL_CALL (clSetKernelArg, k1, 0, sizeof(cl_mem), &buf[1]);
+
+  int dynamic = 1;
+  OCL_CALL (clSetKernelArg, k0, 1, sizeof(cl_int), &dynamic);
+  OCL_CALL (clSetKernelArg, k1, 1, sizeof(cl_int), &dynamic);
+
+  globals[0] = 16;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int i = 0; i <  n; ++i)
+    cpu_src[i] = ((int*)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_CALL (clEnqueueNDRangeKernel, queue, k0, 1, NULL, globals, locals, 0, NULL, NULL);
+  OCL_CALL (clEnqueueNDRangeKernel, queue, k1, 1, NULL, globals, locals, 0, NULL, NULL);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < n; ++i) {
+//    printf("i=%d dst=%d\n", i, ((int*)buf_data[1])[i]);
+    switch(i) {
+      default: OCL_ASSERT(((int*)buf_data[1])[i] == i); break;
+      case 11: OCL_ASSERT(((int*)buf_data[1])[i] == 7); break;
+      case 12: OCL_ASSERT(((int*)buf_data[1])[i] == 4); break;
+      case 13: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
+      case 14: OCL_ASSERT(((int*)buf_data[1])[i] == 3); break;
+      case 15: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
+    }
+  }
+  OCL_UNMAP_BUFFER(1);
+  clReleaseKernel(k0);
+  clReleaseKernel(k1);
+  clReleaseProgram(program);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_program_global);
+
-- 
2.4.1



More information about the Beignet mailing list