[Beignet] [PATCH 1/4] GBE: Implement new constant solution for ocl2

Song, Ruiling ruiling.song at intel.com
Mon Jan 18 19:38:42 PST 2016


This patchset is for ocl2.0 branch only.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> xionghu.luo at intel.com
> Sent: Tuesday, January 19, 2016 11:29 AM
> To: beignet at lists.freedesktop.org
> Cc: Luo, Xionghu <xionghu.luo at intel.com>; Song, Ruiling
> <ruiling.song at intel.com>
> Subject: [Beignet] [PATCH 1/4] GBE: Implement new constant solution for
> ocl2
> 
> From: Ruiling Song <ruiling.song at intel.com>
> 
> Different from ocl 1.2, which gather all constant into one surface.
> ocl2 only gather program global/constant into one surface. But
> keep other constant passed through kernel argument in separate buffer.
> 
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
>  backend/src/backend/gen_insn_selection.cpp | 15 ++++--
>  backend/src/backend/program.cpp            | 21 +++++++-
>  backend/src/backend/program.h              |  6 +++
>  backend/src/backend/program.hpp            |  4 ++
>  backend/src/gbe_bin_interpreter.cpp        |  2 +
>  backend/src/ir/profile.cpp                 |  4 +-
>  backend/src/ir/profile.hpp                 |  3 +-
>  backend/src/ir/unit.hpp                    | 50 ++++++++++++++++++
>  backend/src/llvm/llvm_gen_backend.cpp      | 84 ++++++++++++++++++----
> --------
>  kernels/compiler_program_global.cl         | 68 ++++++++++++++++++++++++
>  src/cl_command_queue.c                     |  2 +-
>  src/cl_command_queue_gen7.c                | 16 ++++++
>  src/cl_gbe_loader.cpp                      | 10 ++++
>  src/cl_gbe_loader.h                        |  2 +
>  src/cl_program.c                           | 46 ++++++++++++++++
>  src/cl_program.h                           |  2 +
>  utests/CMakeLists.txt                      |  1 +
>  utests/compiler_program_global.cpp         | 80
> ++++++++++++++++++++++++++++
>  18 files changed, 374 insertions(+), 42 deletions(-)
>  create mode 100644 kernels/compiler_program_global.cl
>  create mode 100644 utests/compiler_program_global.cpp
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index d19f985..6ef077b 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -3596,6 +3596,13 @@ namespace gbe
>      LoadInstructionPattern(void) : SelectionPattern(1, 1) {
>         this->opcodes.push_back(ir::OP_LOAD);
>      }
> +    bool isReadConstantLegacy(const ir::LoadInstruction &load) const {
> +      ir::AddressMode AM = load.getAddressMode();
> +      ir::AddressSpace AS = load.getAddressSpace();
> +      if (AM != ir::AM_Stateless && AS == ir::MEM_CONSTANT)
> +        return true;
> +      return false;
> +    }
>      void untypedReadStateless(Selection::Opaque &sel,
>                                GenRegister addr,
>                                vector<GenRegister> &dst
> @@ -3678,7 +3685,7 @@ namespace gbe
>              unsigned SI = insn.getSurfaceIndex();
>              sel.UNTYPED_READ(addr, dst.data(), valueNum,
> GenRegister::immud(SI), btiTemp);
>            }
> -        } else if (addrSpace == ir::MEM_LOCAL || addrSpace ==
> ir::MEM_CONSTANT ) {
> +        } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn) )
> {
>            // stateless mode, local/constant still use bti access
>            unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT :
> 0xfe;
>            GenRegister addrDW = addr;
> @@ -3842,7 +3849,7 @@ namespace gbe
>              b = GenRegister::immud(insn.getSurfaceIndex());
>            }
>            read64Legacy(sel, addr, dst, b, btiTemp);
> -        } else if (addrSpace == MEM_LOCAL || addrSpace == MEM_CONSTANT)
> {
> +        } else if (addrSpace == MEM_LOCAL || isReadConstantLegacy(insn)) {
>            GenRegister b = GenRegister::immud(addrSpace == MEM_LOCAL?
> 0xfe : BTI_CONSTANT);
>            GenRegister addrDW = addr;
>            if (addrBytes == 8)
> @@ -4063,7 +4070,7 @@ namespace gbe
>            unsigned SI = insn.getSurfaceIndex();
>            sel.BYTE_GATHER(dst, addr, elemSize, GenRegister::immud(SI),
> btiTemp);
>          }
> -      } else if (addrSpace == ir::MEM_LOCAL || addrSpace ==
> ir::MEM_CONSTANT) {
> +      } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn)) {
>          unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT :
> 0xfe;
>          GenRegister addrDW = addr;
>          if (addrBytes == 8) {
> @@ -4207,7 +4214,7 @@ namespace gbe
>        const Type type = insn.getValueType();
>        const uint32_t elemSize = getByteScatterGatherSize(sel, type);
> 
> -      if (addrSpace == MEM_CONSTANT) {
> +      if (isReadConstantLegacy(insn)) {
>          // XXX TODO read 64bit constant through constant cache
>          // Per HW Spec, constant cache messages can read at least DWORD data.
>          // So, byte/short data type, we have to read through data cache.
> diff --git a/backend/src/backend/program.cpp
> b/backend/src/backend/program.cpp
> index 36af95f..ce4f927 100644
> --- a/backend/src/backend/program.cpp
> +++ b/backend/src/backend/program.cpp
> @@ -104,11 +104,13 @@ namespace gbe {
>      return it->offset; // we found it!
>    }
> 
> -  Program::Program(void) : constantSet(NULL) {}
> +  Program::Program(void) : constantSet(NULL),
> +                           relocTable(NULL) {}
>    Program::~Program(void) {
>      for (map<std::string, Kernel*>::iterator it = kernels.begin(); it !=
> kernels.end(); ++it)
>        GBE_DELETE(it->second);
>      if (constantSet) delete constantSet;
> +    if (relocTable) delete relocTable;
>    }
> 
>  #ifdef GBE_COMPILER_AVAILABLE
> @@ -151,6 +153,7 @@ namespace gbe {
> 
>    bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
>      constantSet = new ir::ConstantSet(unit.getConstantSet());
> +    relocTable = new ir::RelocTable(unit.getRelocTable());
>      const auto &set = unit.getFunctionSet();
>      const uint32_t kernelNum = set.size();
>      if (OCL_OUTPUT_GEN_IR) std::cout << unit;
> @@ -978,6 +981,18 @@ namespace gbe {
>      program->getGlobalConstantData(mem);
>    }
> 
> +  static size_t programGetGlobalRelocCount(gbe_program gbeProgram) {
> +    if (gbeProgram == NULL) return 0;
> +    const gbe::Program *program = (const gbe::Program*) gbeProgram;
> +    return program->getGlobalRelocCount();
> +  }
> +
> +  static void programGetGlobalRelocTable(gbe_program gbeProgram, char
> *mem) {
> +    if (gbeProgram == NULL) return;
> +    const gbe::Program *program = (const gbe::Program*) gbeProgram;
> +    program->getGlobalRelocTable(mem);
> +  }
> +
>    static uint32_t programGetKernelNum(gbe_program gbeProgram) {
>      if (gbeProgram == NULL) return 0;
>      const gbe::Program *program = (const gbe::Program*) gbeProgram;
> @@ -1220,6 +1235,8 @@ GBE_EXPORT_SYMBOL
> gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm = NU
>  GBE_EXPORT_SYMBOL gbe_program_build_from_llvm_cb
> *gbe_program_build_from_llvm = NULL;
>  GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb
> *gbe_program_get_global_constant_size = NULL;
>  GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb
> *gbe_program_get_global_constant_data = NULL;
> +GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_count_cb
> *gbe_program_get_global_reloc_count = NULL;
> +GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_table_cb
> *gbe_program_get_global_reloc_table = NULL;
>  GBE_EXPORT_SYMBOL gbe_program_clean_llvm_resource_cb
> *gbe_program_clean_llvm_resource = NULL;
>  GBE_EXPORT_SYMBOL gbe_program_delete_cb *gbe_program_delete =
> NULL;
>  GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb
> *gbe_program_get_kernel_num = NULL;
> @@ -1269,6 +1286,8 @@ namespace gbe
>        gbe_program_check_opt = gbe::programCheckOption;
>        gbe_program_get_global_constant_size =
> gbe::programGetGlobalConstantSize;
>        gbe_program_get_global_constant_data =
> gbe::programGetGlobalConstantData;
> +      gbe_program_get_global_reloc_count =
> gbe::programGetGlobalRelocCount;
> +      gbe_program_get_global_reloc_table =
> gbe::programGetGlobalRelocTable;
>        gbe_program_clean_llvm_resource = gbe::programCleanLlvmResource;
>        gbe_program_delete = gbe::programDelete;
>        gbe_program_get_kernel_num = gbe::programGetKernelNum;
> diff --git a/backend/src/backend/program.h
> b/backend/src/backend/program.h
> index 86b3177..03150bc 100644
> --- a/backend/src/backend/program.h
> +++ b/backend/src/backend/program.h
> @@ -99,6 +99,7 @@ enum gbe_curbe_type {
>    GBE_CURBE_BLOCK_IP,
>    GBE_CURBE_DW_BLOCK_IP,
>    GBE_CURBE_THREAD_NUM,
> +  GBE_CURBE_CONSTANT_ADDRSPACE,
>    GBE_GEN_REG,
>  };
> 
> @@ -243,6 +244,11 @@ extern gbe_program_get_global_constant_size_cb
> *gbe_program_get_global_constant_
>  typedef void (gbe_program_get_global_constant_data_cb)(gbe_program
> gbeProgram, char *mem);
>  extern gbe_program_get_global_constant_data_cb
> *gbe_program_get_global_constant_data;
> 
> +typedef size_t (gbe_program_get_global_reloc_count_cb)(gbe_program
> gbeProgram);
> +extern gbe_program_get_global_reloc_count_cb
> *gbe_program_get_global_reloc_count;
> +
> +typedef void (gbe_program_get_global_reloc_table_cb)(gbe_program
> gbeProgram, char *mem);
> +extern gbe_program_get_global_reloc_table_cb
> *gbe_program_get_global_reloc_table;
>  /*! Get the size of defined samplers */
>  typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel);
>  extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
> diff --git a/backend/src/backend/program.hpp
> b/backend/src/backend/program.hpp
> index efe192f..e58ddf0 100644
> --- a/backend/src/backend/program.hpp
> +++ b/backend/src/backend/program.hpp
> @@ -280,6 +280,8 @@ namespace gbe {
>      /*! Get the content of global constant arrays */
>      void getGlobalConstantData(char *mem) const { constantSet-
> >getData(mem); }
> 
> +    uint32_t getGlobalRelocCount(void) const { return relocTable-
> >getCount(); }
> +    void getGlobalRelocTable(char *p) const { relocTable->getData(p); }
>      static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G');
>      static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P');
> 
> @@ -309,6 +311,8 @@ namespace gbe {
>      map<std::string, Kernel*> kernels;
>      /*! Global (constants) outside any kernel */
>      ir::ConstantSet *constantSet;
> +    /*! relocation table */
> +    ir::RelocTable *relocTable;
>      /*! Use custom allocators */
>      GBE_CLASS(Program);
>    };
> diff --git a/backend/src/gbe_bin_interpreter.cpp
> b/backend/src/gbe_bin_interpreter.cpp
> index 4594a0a..0957092 100644
> --- a/backend/src/gbe_bin_interpreter.cpp
> +++ b/backend/src/gbe_bin_interpreter.cpp
> @@ -61,6 +61,8 @@ struct BinInterpCallBackInitializer
>      gbe_program_get_global_constant_size =
> gbe::programGetGlobalConstantSize;
>      gbe_program_delete = gbe::programDelete;
>      gbe_program_get_global_constant_data =
> gbe::programGetGlobalConstantData;
> +    gbe_program_get_global_reloc_count =
> gbe::programGetGlobalRelocCount;
> +    gbe_program_get_global_reloc_table =
> gbe::programGetGlobalRelocTable;
>      gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
>      gbe_kernel_get_image_data = gbe::kernelGetImageData;
>      gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
> diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
> index 3ead8a7..0699167 100644
> --- a/backend/src/ir/profile.cpp
> +++ b/backend/src/ir/profile.cpp
> @@ -43,7 +43,8 @@ namespace ir {
>          "zero", "one",
>          "retVal",
>          "printf_buffer_pointer", "printf_index_buffer_pointer",
> -        "dwblockip"
> +        "dwblockip",
> +        "constant_addrspace_start"
>      };
> 
>  #if GBE_DEBUG
> @@ -86,6 +87,7 @@ namespace ir {
>        DECL_NEW_REG(FAMILY_QWORD, printfbptr, 1,
> GBE_CURBE_PRINTF_BUF_POINTER);
>        DECL_NEW_REG(FAMILY_QWORD, printfiptr, 1,
> GBE_CURBE_PRINTF_INDEX_POINTER);
>        DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0,
> GBE_CURBE_DW_BLOCK_IP);
> +      DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1,
> GBE_CURBE_CONSTANT_ADDRSPACE);
>      }
>  #undef DECL_NEW_REG
> 
> diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
> index a8445c4..79761d4 100644
> --- a/backend/src/ir/profile.hpp
> +++ b/backend/src/ir/profile.hpp
> @@ -71,7 +71,8 @@ namespace ir {
>      static const Register printfbptr = Register(27); // printf buffer address .
>      static const Register printfiptr = Register(28); // printf index buffer address.
>      static const Register dwblockip = Register(29);  // blockip
> -    static const uint32_t regNum = 30;             // number of special registers
> +    static const Register constant_addrspace = Register(30);  // starting
> address of program-scope constant
> +    static const uint32_t regNum = 31;             // number of special registers
>      extern const char *specialRegMean[];           // special register name.
>    } /* namespace ocl */
> 
> diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
> index 8ff858d..b8df145 100644
> --- a/backend/src/ir/unit.hpp
> +++ b/backend/src/ir/unit.hpp
> @@ -27,6 +27,7 @@
>  #include "ir/constant.hpp"
>  #include "ir/register.hpp"
>  #include "sys/map.hpp"
> +#include <string.h>
> 
>  namespace gbe {
>  namespace ir {
> @@ -37,6 +38,52 @@ namespace ir {
>    /*! Complete unit of compilation. It contains a set of functions and a set of
>     *  constant the functions may refer to.
>     */
> +  struct RelocEntry {
> +    RelocEntry(unsigned int rO, unsigned int dO):
> +      refOffset(rO),
> +      defOffset(dO) {}
> +
> +    unsigned int refOffset;
> +    unsigned int defOffset;
> +  };
> +
> +  class RelocTable : public NonCopyable, public Serializable
> +  {
> +    public:
> +      void addEntry(unsigned refOffset, unsigned defOffset) {
> +        entries.push_back(RelocEntry(refOffset, defOffset));
> +      }
> +      RelocTable() {}
> +      RelocTable(const RelocTable& other) : Serializable(other),
> +                                            entries(other.entries) {}
> +      uint32_t getCount() { return entries.size(); }
> +      void getData(char *p) {
> +        if (entries.size() > 1 && p)
> +          memcpy(p, entries.data(), entries.size()*sizeof(RelocEntry));
> +      }
> +    static const uint32_t magic_begin = TO_MAGIC('R', 'E', 'L', 'C');
> +    static const uint32_t magic_end = TO_MAGIC('C', 'L', 'E', 'R');
> +
> +    /* format:
> +       magic_begin     |
> +       const_data_size |
> +       const_data      |
> +       constant_1_size |
> +       constant_1      |
> +       ........        |
> +       constant_n_size |
> +       constant_n      |
> +       magic_end       |
> +       total_size
> +    */
> +
> +    /*! Implements the serialization. */
> +    virtual size_t serializeToBin(std::ostream& outs) { return 0;}
> +    virtual size_t deserializeFromBin(std::istream& ins) { return 0; }
> +    private:
> +      vector<RelocEntry> entries;
> +      GBE_CLASS(RelocTable);
> +  };
>    class Unit : public NonCopyable
>    {
>    public:
> @@ -70,6 +117,8 @@ namespace ir {
>      }
>      /*! Return the constant set */
>      ConstantSet& getConstantSet(void) { return constantSet; }
> +    const RelocTable& getRelocTable(void) const  { return relocTable; }
> +    RelocTable& getRelocTable(void)   { return relocTable; }
>      /*! Return the constant set */
>      const ConstantSet& getConstantSet(void) const { return constantSet; }
>      void setValid(bool value) { valid = value; }
> @@ -78,6 +127,7 @@ namespace ir {
>      friend class ContextInterface; //!< Can free modify the unit
>      FunctionSet functions; //!< All the defined functions
>      ConstantSet constantSet; //!< All the constants defined in the unit
> +    RelocTable relocTable;
>      PointerSize pointerSize; //!< Size shared by all pointers
>      GBE_CLASS(Unit);
>      bool valid;
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index cb47097..d23a598 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -506,7 +506,7 @@ namespace gbe
> 
>      virtual bool doInitialization(Module &M);
>      /*! helper function for parsing global constant data */
> -    void getConstantData(const Constant * c, void* mem, uint32_t& offset)
> const;
> +    void getConstantData(const Constant * c, void* mem, uint32_t& offset,
> vector<ir::RelocEntry> &) const;
>      void collectGlobalConstant(void) const;
>      ir::ImmediateIndex processConstantImmIndex(Constant *CPV, int32_t
> index = 0u);
>      const ir::Immediate &processConstantImm(Constant *CPV, int32_t index
> = 0u);
> @@ -1111,8 +1111,9 @@ namespace gbe
>            break;
>          }
>          case 2:
> -          new_bti = BTI_CONSTANT;
> -
> +          // ocl 2.0, constant pointer use separate bti
> +          new_bti = btiBase;
> +          incBtiBase();
>            break;
>          case 3:
>            new_bti = BTI_LOCAL;
> @@ -1349,22 +1350,34 @@ namespace gbe
>      return;
>    }
> 
> -  void GenWriter::getConstantData(const Constant * c, void* mem,
> uint32_t& offset) const {
> +  void GenWriter::getConstantData(const Constant * c, void* mem,
> uint32_t& offset, vector<ir::RelocEntry> &relocs) const {
>      Type * type = c->getType();
>      Type::TypeID id = type->getTypeID();
> 
>      GBE_ASSERT(c);
> +    if (isa<GlobalVariable>(c)) {
> +      const GlobalVariable *GV = cast<GlobalVariable>(c);
> +
> +      unsigned valueAddrSpace = GV->getType()->getAddressSpace();
> +      ir::Constant cc = unit.getConstantSet().getConstant(c->getName());
> +      unsigned int defOffset = cc.getOffset();
> +
> +      relocs.push_back(ir::RelocEntry(offset, defOffset));
> +      uint32_t size = getTypeByteSize(unit, type);
> +      memset((char*)mem+offset, 0, size);
> +      offset += size;
> +      return;
> +    }
>      if(isa<UndefValue>(c)) {
>        uint32_t size = getTypeByteSize(unit, type);
>        offset += size;
>        return;
> -    } else if(isa<ConstantAggregateZero>(c)) {
> +    } else if(isa<ConstantAggregateZero>(c) || isa<ConstantPointerNull>(c)) {
>        uint32_t size = getTypeByteSize(unit, type);
>        memset((char*)mem+offset, 0, size);
>        offset += size;
>        return;
>      }
> -
>      switch(id) {
>        case Type::TypeID::StructTyID:
>          {
> @@ -1382,7 +1395,7 @@ namespace gbe
>              offset += padding/8;
>              const Constant* sub = cast<Constant>(c->getOperand(op));
>              GBE_ASSERT(sub);
> -            getConstantData(sub, mem, offset);
> +            getConstantData(sub, mem, offset, relocs);
>            }
>            break;
>          }
> @@ -1401,7 +1414,7 @@ namespace gbe
>              uint32_t ops = c->getNumOperands();
>              for(uint32_t op = 0; op < ops; ++op) {
>                Constant * ca = dyn_cast<Constant>(c->getOperand(op));
> -              getConstantData(ca, mem, offset);
> +              getConstantData(ca, mem, offset, relocs);
>                offset += padding;
>              }
>            }
> @@ -1449,21 +1462,34 @@ namespace gbe
>      const Module::GlobalListType &globalList = TheModule->getGlobalList();
>      for(auto i = globalList.begin(); i != globalList.end(); i ++) {
>        const GlobalVariable &v = *i;
> -      if(!v.isConstantUsed()) continue;
>        const char *name = v.getName().data();
>        unsigned addrSpace = v.getType()->getAddressSpace();
> -      if(addrSpace == ir::AddressSpace::MEM_CONSTANT || v.isConstant()) {
> -        GBE_ASSERT(v.hasInitializer());
> -        const Constant *c = v.getInitializer();
> -        Type * type = c->getType();
> +
> +      vector<ir::RelocEntry> relocs;
> +      if(addrSpace == 2 /* __constant */
> +          || addrSpace == 1
> +          || addrSpace == 0) {
> +        Type * type = v.getValueType();
> 
>          uint32_t size = getTypeByteSize(unit, type);
>          void* mem = malloc(size);
>          uint32_t offset = 0;
> -        getConstantData(c, mem, offset);
> +        if (v.hasInitializer()) {
> +          const Constant *c = v.getInitializer();
> +          getConstantData(c, mem, offset, relocs);
> +        } else {
> +          memset(mem, 0, size);
> +        }
>          uint32_t alignment = getAlignmentByte(unit, type);
>          unit.newConstant((char *)mem, name, size, alignment);
>          free(mem);
> +        uint32_t refOffset =
> unit.getConstantSet().getConstant(name).getOffset();
> +        for (uint32_t k = 0; k < relocs.size(); k++) {
> +          unit.getRelocTable().addEntry(
> +                               refOffset + relocs[k].refOffset,
> +                               relocs[k].defOffset
> +                               );
> +        }
>        }
>      }
>    }
> @@ -2562,33 +2588,23 @@ namespace gbe
>          this->newRegister(const_cast<GlobalVariable*>(&v));
>          ir::Register reg =
> regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
>          ctx.LOADI(getType(ctx, v.getType()), reg,
> ctx.newIntegerImmediate(oldSlm + padding/8, getType(ctx, v.getType())));
> -      } else if(addrSpace == ir::MEM_CONSTANT || v.isConstant()) {
> -        GBE_ASSERT(v.hasInitializer());
> -        this->newRegister(const_cast<GlobalVariable*>(&v));
> -        ir::Register reg =
> regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
> -        ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
> -        ctx.LOADI(getType(ctx, v.getType()), reg,
> ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
> -      } else {
> +      } else if(addrSpace == ir::MEM_CONSTANT
> +             || addrSpace == ir::MEM_GLOBAL
> +             || v.isConstant()) {
>          if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
>            ctx.getFunction().getPrintfSet()-
> >setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
>            regTranslator.newScalarProxy(ir::ocl::printfbptr,
> const_cast<GlobalVariable*>(&v));
>          } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf")))
> {
>            ctx.getFunction().getPrintfSet()-
> >setIndexBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
>            regTranslator.newScalarProxy(ir::ocl::printfiptr,
> const_cast<GlobalVariable*>(&v));
> -        } else if(v.getName().str().substr(0, 4) == ".str") {
> -          /* When there are multi printf statements in multi kernel fucntions
> within the same
> -             translate unit, if they have the same sting parameter, such as
> -             kernel_func1 () {
> -               printf("Line is %d\n", line_num1);
> -             }
> -             kernel_func2 () {
> -               printf("Line is %d\n", line_num2);
> -             }
> -             The Clang will just generate one global string named .strXXX to
> represent "Line is %d\n"
> -             So when translating the kernel_func1, we can not unref that global
> var, so we will
> -             get here. Just ignore it to avoid assert. */
>          } else {
> -          GBE_ASSERT(0 && "Unsupported private memory access pattern");
> +          this->newRegister(const_cast<GlobalVariable*>(&v));
> +          ir::Register reg =
> regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
> +          ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
> +          ctx.LOADI(getType(ctx, v.getType()), reg,
> ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
> +          if (!legacyMode) {
> +            ctx.ADD(getType(ctx, v.getType()), reg, ir::ocl::constant_addrspace,
> reg);
> +          }
>          }
>        }
>      }
> diff --git a/kernels/compiler_program_global.cl
> b/kernels/compiler_program_global.cl
> new file mode 100644
> index 0000000..405c53f
> --- /dev/null
> +++ b/kernels/compiler_program_global.cl
> @@ -0,0 +1,68 @@
> +struct config{
> +  int s0;
> +  global short *s1;
> +};
> +
> +global int i = 5;
> +global int bb = 4;
> +global int *global p;
> +
> +/* array */
> +global int ba[12];
> +
> +/* short/long data type */
> +global short s;
> +global short s2;
> +global long l;
> +
> +/* pointer in constant AS to global */
> +global int * constant px =&i;
> +
> +/* constant pointer relocation */
> +constant int x = 2;
> +constant int y =1;
> +constant int *constant z[2] = {&x, &y};
> +
> +/* structure with pointer field */
> +global struct config c[2] = {{1, &s}, {2, &s2} };
> +
> +
> +global int a = 1;
> +global int b = 2;
> +global int * constant gArr[2]= {&a, &b};
> +
> +__kernel void compiler_program_global0(const global int *src, int dynamic)
> {
> +  size_t gid = get_global_id(0);
> +  /* global read/write */
> +  p = &i;
> +  *p += 1;
> +
> +  /* pointer in struct memory access */
> +  *c[gid&1].s1 += 2;
> +
> +  s = 2;
> +  l = 3;
> +
> +  /* constant AS pointer (points to global) memory access */
> +  *px += *z[dynamic];
> +
> +  p = &bb;
> +  /* array */
> +  if (gid < 11)
> +    ba[gid] = src[gid];
> +}
> +
> +__kernel void compiler_program_global1(global int *dst, int dynamic) {
> +  size_t gid = get_global_id(0);
> +//  static global sg;
> +
> +  dst[11] = i;
> +  dst[12] = *p;
> +  dst[13] = s;
> +  dst[14] = l;
> +  dst[15] = *gArr[dynamic];
> +
> +  if (gid < 11)
> +    dst[gid] = ba[gid];
> +}
> +
> diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
> index 9dc3fe6..442c6a2 100644
> --- a/src/cl_command_queue.c
> +++ b/src/cl_command_queue.c
> @@ -161,7 +161,7 @@
> cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
>    for (i = 0; i < k->arg_n; ++i) {
>      int32_t offset; // location of the address in the curbe
>      arg_type = interp_kernel_get_arg_type(k->opaque, i);
> -    if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
> +    if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type ==
> GBE_ARG_CONSTANT_PTR) || !k->args[i].mem)
>        continue;
>      offset = interp_kernel_get_curbe_offset(k->opaque,
> GBE_CURBE_KERNEL_ARGUMENT, i);
>      if (offset < 0)
> diff --git a/src/cl_command_queue_gen7.c
> b/src/cl_command_queue_gen7.c
> index 2edc3be..61ffe7e 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -29,6 +29,7 @@
>  #include <assert.h>
>  #include <stdio.h>
>  #include <string.h>
> +#include <unistd.h>
> 
>  #define MAX_GROUP_SIZE_IN_HALFSLICE   512
>  static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return
> 256+256; }
> @@ -117,6 +118,11 @@ cl_upload_constant_buffer(cl_command_queue
> queue, cl_kernel ker)
>     * we need raw_size & aligned_size
>     */
>    GET_QUEUE_THREAD_GPGPU(queue);
> +  // TODO this is only valid for OpenCL 1.2,
> +  // under ocl1.2 we gather all constant into one dedicated surface.
> +  // but in 2.0 we put program global into one surface, but constants
> +  // pass through kernel argument in each separate buffer
> +#if 0
>    int32_t arg;
>    size_t offset = 0;
>    uint32_t raw_size = 0, aligned_size =0;
> @@ -185,6 +191,16 @@ cl_upload_constant_buffer(cl_command_queue
> queue, cl_kernel ker)
>      }
>    }
>    cl_buffer_unmap(bo);
> +#endif
> +  // pass the starting of constant address space
> +  int32_t constant_addrspace = interp_kernel_get_curbe_offset(ker-
> >opaque, GBE_CURBE_CONSTANT_ADDRSPACE, 0);
> +  if (constant_addrspace >= 0) {
> +    size_t global_const_size =
> interp_program_get_global_constant_size(ker->program->opaque);
> +    if (global_const_size > 0) {
> +      *(uint64_t*)(ker->curbe + constant_addrspace) = (uint64_t)ker-
> >program->global_data_ptr;
> +      cl_gpgpu_bind_buf(gpgpu, ker->program->global_data,
> constant_addrspace, 0, ALIGN(global_const_size, getpagesize()),
> BTI_CONSTANT);
> +    }
> +  }
>    return 0;
>  }
> 
> diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
> index e832a53..d75c92c 100644
> --- a/src/cl_gbe_loader.cpp
> +++ b/src/cl_gbe_loader.cpp
> @@ -38,6 +38,8 @@ gbe_program_clean_llvm_resource_cb
> *compiler_program_clean_llvm_resource = NULL;
>  gbe_program_new_from_binary_cb *interp_program_new_from_binary =
> NULL;
>  gbe_program_get_global_constant_size_cb
> *interp_program_get_global_constant_size = NULL;
>  gbe_program_get_global_constant_data_cb
> *interp_program_get_global_constant_data = NULL;
> +gbe_program_get_global_reloc_count_cb
> *interp_program_get_global_reloc_count = NULL;
> +gbe_program_get_global_reloc_table_cb
> *interp_program_get_global_reloc_table = NULL;
>  gbe_program_delete_cb *interp_program_delete = NULL;
>  gbe_program_get_kernel_num_cb *interp_program_get_kernel_num =
> NULL;
>  gbe_program_get_kernel_by_name_cb
> *interp_program_get_kernel_by_name = NULL;
> @@ -109,6 +111,14 @@ struct GbeLoaderInitializer
>      if (interp_program_get_global_constant_data == NULL)
>        return false;
> 
> +    interp_program_get_global_reloc_count =
> *(gbe_program_get_global_reloc_count_cb**)dlsym(dlhInterp,
> "gbe_program_get_global_reloc_count");
> +    if (interp_program_get_global_reloc_count == NULL)
> +      return false;
> +
> +    interp_program_get_global_reloc_table =
> *(gbe_program_get_global_reloc_table_cb**)dlsym(dlhInterp,
> "gbe_program_get_global_reloc_table");
> +    if (interp_program_get_global_reloc_table == NULL)
> +      return false;
> +
>      interp_program_delete = *(gbe_program_delete_cb**)dlsym(dlhInterp,
> "gbe_program_delete");
>      if (interp_program_delete == NULL)
>        return false;
> diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
> index de91c85..28741ff 100644
> --- a/src/cl_gbe_loader.h
> +++ b/src/cl_gbe_loader.h
> @@ -38,6 +38,8 @@ extern gbe_program_clean_llvm_resource_cb
> *compiler_program_clean_llvm_resource;
>  extern gbe_program_new_from_binary_cb
> *interp_program_new_from_binary;
>  extern gbe_program_get_global_constant_size_cb
> *interp_program_get_global_constant_size;
>  extern gbe_program_get_global_constant_data_cb
> *interp_program_get_global_constant_data;
> +extern gbe_program_get_global_reloc_count_cb
> *interp_program_get_global_reloc_count;
> +extern gbe_program_get_global_reloc_table_cb
> *interp_program_get_global_reloc_table;
>  extern gbe_program_delete_cb *interp_program_delete;
>  extern gbe_program_get_kernel_num_cb
> *interp_program_get_kernel_num;
>  extern gbe_program_get_kernel_by_name_cb
> *interp_program_get_kernel_by_name;
> diff --git a/src/cl_program.c b/src/cl_program.c
> index 98b6d51..ffdb2a1 100644
> --- a/src/cl_program.c
> +++ b/src/cl_program.c
> @@ -97,6 +97,9 @@ cl_program_delete(cl_program p)
>      cl_kernel_delete(p->ker[i]);
>    cl_free(p->ker);
> 
> +  cl_free(p->global_data_ptr);
> +  if (p->global_data_ptr)
> +    cl_buffer_unreference(p->global_data);
>    /* Program belongs to their parent context */
>    cl_context_delete(p->ctx);
> 
> @@ -191,6 +194,42 @@ LOCAL cl_bool headerCompare(const unsigned char
> *BufPtr, BINARY_HEADER_INDEX ind
>  #define isLLVM_LIB(BufPtr)  headerCompare(BufPtr, BHI_LIBRARY)
>  #define isGenBinary(BufPtr) headerCompare(BufPtr, BHI_GEN_BINARY)
> 
> +static cl_int get_program_global_data(cl_program prog) {
> +  cl_buffer_mgr bufmgr = NULL;
> +  bufmgr = cl_context_get_bufmgr(prog->ctx);
> +  assert(bufmgr);
> +  size_t const_size = interp_program_get_global_constant_size(prog-
> >opaque);
> +  if (const_size == 0) return CL_SUCCESS;
> +
> +  int page_size = getpagesize();
> +  size_t alignedSz = ALIGN(const_size, page_size);
> +  char * p = (char*)cl_aligned_malloc(alignedSz, page_size);
> +  prog->global_data_ptr = p;
> +  interp_program_get_global_constant_data(prog->opaque, (char*)p);
> +
> +  prog->global_data = cl_buffer_alloc_userptr(bufmgr, "program global
> data", p, alignedSz, 0);
> +  cl_buffer_set_softpin_offset(prog->global_data, (size_t)p);
> +
> +  uint32_t reloc_count = interp_program_get_global_reloc_count(prog-
> >opaque);
> +  if (reloc_count > 0) {
> +    uint32_t x;
> +    struct RelocEntry {int refOffset; int defOffset;};
> +    char *temp = (char*) malloc(reloc_count *sizeof(int)*2);
> +    interp_program_get_global_reloc_table(prog->opaque, temp);
> +    for (x = 0; x < reloc_count; x++) {
> +      int ref_offset = ((struct RelocEntry *)temp)[x].refOffset;
> +      *(uint64_t*)&(p[ref_offset]) = ((struct RelocEntry *)temp)[x].defOffset
> + (uint64_t)p;
> +    }
> +    free(temp);
> +  }
> +#if 0
> +  int x = 0;
> +  for (x = 0; x < const_size; x++) {
> +    printf("offset %d data: %x\n", x, (unsigned)p[x]);
> +  }
> +#endif
> +  return CL_SUCCESS;
> +}
>  LOCAL cl_program
>  cl_program_create_from_binary(cl_context             ctx,
>                                cl_uint                num_devices,
> @@ -603,6 +642,9 @@ cl_program_build(cl_program p, const char *options)
>      memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
>      copyed += sz;
>    }
> +  if ((err = get_program_global_data(p)) != CL_SUCCESS)
> +    goto error;
> +
>    p->is_built = 1;
>    p->build_status = CL_BUILD_SUCCESS;
>    return CL_SUCCESS;
> @@ -697,6 +739,10 @@ cl_program_link(cl_context            context,
>      memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
>      copyed += sz;
>    }
> +
> +  if ((err = get_program_global_data(p)) != CL_SUCCESS)
> +    goto error;
> +
>  done:
>    if(p) p->is_built = 1;
>    if(p) p->build_status = CL_BUILD_SUCCESS;
> diff --git a/src/cl_program.h b/src/cl_program.h
> index 63ad16d..083d66a 100644
> --- a/src/cl_program.h
> +++ b/src/cl_program.h
> @@ -54,6 +54,8 @@ struct _cl_program {
>    cl_kernel *ker;         /* All kernels included by the OCL file */
>    cl_program prev, next;  /* We chain the programs together */
>    cl_context ctx;         /* Its parent context */
> +  cl_buffer  global_data;
> +  char * global_data_ptr;
>    char *bin;              /* The program copied verbatim */
>    size_t bin_sz;          /* Its size in memory */
>    char *source;           /* Program sources */
> diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
> index 78442cb..0fca450 100644
> --- a/utests/CMakeLists.txt
> +++ b/utests/CMakeLists.txt
> @@ -191,6 +191,7 @@ set (utests_sources
>    compiler_bool_cross_basic_block.cpp
>    compiler_private_const.cpp
>    compiler_private_data_overflow.cpp
> +  compiler_program_global.cpp
>    compiler_getelementptr_bitcast.cpp
>    compiler_sub_group_any.cpp
>    compiler_sub_group_all.cpp
> diff --git a/utests/compiler_program_global.cpp
> b/utests/compiler_program_global.cpp
> new file mode 100644
> index 0000000..ef7c655
> --- /dev/null
> +++ b/utests/compiler_program_global.cpp
> @@ -0,0 +1,80 @@
> +#include "utest_helper.hpp"
> +#include "utest_file_map.hpp"
> +
> +static int init_program(const char* name, cl_context ctx, cl_program *pg )
> +{
> +  cl_int err;
> +  char* ker_path = cl_do_kiss_path(name, device);
> +
> +  cl_file_map_t *fm = cl_file_map_new();
> +  err = cl_file_map_open(fm, ker_path);
> +  if(err != CL_FILE_MAP_SUCCESS)
> +    OCL_ASSERT(0);
> +  const char *src = cl_file_map_begin(fm);
> +
> +  *pg = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
> +  free(ker_path);
> +  cl_file_map_delete(fm);
> +  return 0;
> +
> +}
> +
> +void compiler_program_global()
> +{
> +  const int n = 16;
> +  int cpu_src[16];
> +  cl_int err;
> +
> +  // Setup kernel and buffers
> +  cl_program program;
> +  init_program("compiler_program_global.cl", ctx, &program);
> +  OCL_CALL (clBuildProgram, program, 1, &device, "-cl-std=CL2.0", NULL,
> NULL);
> +
> +  cl_kernel k0 = clCreateKernel(program, "compiler_program_global0",
> &err);
> +  assert(err == CL_SUCCESS);
> +  cl_kernel k1 = clCreateKernel(program, "compiler_program_global1",
> &err);
> +  assert(err == CL_SUCCESS);
> +
> +  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
> +  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
> +
> +  OCL_CALL (clSetKernelArg, k0, 0, sizeof(cl_mem), &buf[0]);
> +  OCL_CALL (clSetKernelArg, k1, 0, sizeof(cl_mem), &buf[1]);
> +
> +  int dynamic = 1;
> +  OCL_CALL (clSetKernelArg, k0, 1, sizeof(cl_int), &dynamic);
> +  OCL_CALL (clSetKernelArg, k1, 1, sizeof(cl_int), &dynamic);
> +
> +  globals[0] = 16;
> +  locals[0] = 16;
> +
> +  OCL_MAP_BUFFER(0);
> +  for (int i = 0; i <  n; ++i)
> +    cpu_src[i] = ((int*)buf_data[0])[i] = i;
> +  OCL_UNMAP_BUFFER(0);
> +
> +  // Run the kernel on GPU
> +  OCL_CALL (clEnqueueNDRangeKernel, queue, k0, 1, NULL, globals, locals, 0,
> NULL, NULL);
> +  OCL_CALL (clEnqueueNDRangeKernel, queue, k1, 1, NULL, globals, locals, 0,
> NULL, NULL);
> +
> +  // Compare
> +  OCL_MAP_BUFFER(1);
> +  for (int32_t i = 0; i < n; ++i) {
> +//    printf("i=%d dst=%d\n", i, ((int*)buf_data[1])[i]);
> +    switch(i) {
> +      default: OCL_ASSERT(((int*)buf_data[1])[i] == i); break;
> +      case 11: OCL_ASSERT(((int*)buf_data[1])[i] == 7); break;
> +      case 12: OCL_ASSERT(((int*)buf_data[1])[i] == 4); break;
> +      case 13: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
> +      case 14: OCL_ASSERT(((int*)buf_data[1])[i] == 3); break;
> +      case 15: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
> +    }
> +  }
> +  OCL_UNMAP_BUFFER(1);
> +  clReleaseKernel(k0);
> +  clReleaseKernel(k1);
> +  clReleaseProgram(program);
> +}
> +
> +MAKE_UTEST_FROM_FUNCTION(compiler_program_global);
> +
> --
> 2.4.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list