[Beignet] [PATCH newRT] Add relocation table for ELF file to support 2.0

junyan.he at inbox.com junyan.he at inbox.com
Mon Apr 10 08:21:10 UTC 2017


From: Junyan He <junyan.he at intel.com>

The OpenCL 2.0 spec requires a global memory, and a global pointer can
point to any global variable. We add a relocation section for .rodata
(named .rel.rodata, of type SHT_RELA) to the ELF file to support this
relocation. The global memory is only available for OpenCL 2.0 and later.

Signed-off-by: Junyan He <junyan.he at intel.com>
---
 CMakeLists.txt                          |   2 +-
 backend/src/backend/gen_program_elf.cpp |  54 ++++++++--
 backend/src/ir/reloc.cpp                |   2 +-
 backend/src/ir/reloc.hpp                |  22 +++-
 backend/src/llvm/llvm_gen_backend.cpp   |   3 +-
 src/cl_gen7_device.h                    |   2 +-
 src/gen/cl_command_queue_gen.c          |  52 +++++++++-
 src/gen/cl_gen.h                        |  25 +++++
 src/gen/cl_gen75_device.h               |   2 +-
 src/gen/cl_gen7_device.h                |   2 +-
 src/gen/cl_gen8_device.h                |   2 +-
 src/gen/cl_gen9_device.h                |   5 +-
 src/gen/cl_gen_device_common.h          |   4 -
 src/gen/cl_kernel_gen.c                 |   8 +-
 src/gen/cl_program_gen.c                | 178 ++++++++++++++++++++++++++++++--
 src/gen/intel_driver.c                  |  11 +-
 src/gen/intel_driver.h                  |   2 +
 17 files changed, 327 insertions(+), 49 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe895d0..e6babe4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -239,7 +239,7 @@ IF (EXPERIMENTAL_DOUBLE)
   ADD_DEFINITIONS(-DENABLE_FP64)
 ENDIF(EXPERIMENTAL_DOUBLE)
 
-SET(CAN_OPENCL_20 OFF)
+SET(CAN_OPENCL_20 ON)
 IF (CMAKE_SIZEOF_VOID_P EQUAL 4)
   SET(CAN_OPENCL_20 OFF)
 ENDIF (CMAKE_SIZEOF_VOID_P EQUAL 4)
diff --git a/backend/src/backend/gen_program_elf.cpp b/backend/src/backend/gen_program_elf.cpp
index feea392..0c78964 100644
--- a/backend/src/backend/gen_program_elf.cpp
+++ b/backend/src/backend/gen_program_elf.cpp
@@ -262,10 +262,12 @@ public:
   section *ker_info_sec;
   section *cl_info_sec;
   section *rodata_sec;
+  section *reloc_rodata_sec;
   symbol_section_accessor *syma;
   string_section_accessor *stra;
   note_section_accessor *note_writer;
   note_section_accessor *cl_note_writer;
+  relocation_section_accessor *rela;
   Elf32_Word sym_num;
   uint64_t bitcode_offset;
 
@@ -280,8 +282,8 @@ public:
 
 GenProgramElfContext::GenProgramElfContext(GenProgram &prog)
   : genProg(prog), text_sec(NULL), sym_sec(NULL), strtab_sec(NULL), ker_info_sec(NULL),
-    cl_info_sec(NULL), rodata_sec(NULL), syma(NULL), stra(NULL), note_writer(NULL),
-    cl_note_writer(NULL), sym_num(0), bitcode_offset(0)
+    cl_info_sec(NULL), rodata_sec(NULL), reloc_rodata_sec(NULL), syma(NULL), stra(NULL),
+    note_writer(NULL), cl_note_writer(NULL), rela(NULL), sym_num(0), bitcode_offset(0)
 {
   writer.create(ELFCLASS64, ELFDATA2LSB);
   writer.set_os_abi(ELFOSABI_LINUX);
@@ -339,6 +341,8 @@ GenProgramElfContext::~GenProgramElfContext(void)
     GBE_DELETE(note_writer);
   if (cl_note_writer)
     GBE_DELETE(cl_note_writer);
+  if (rela)
+    GBE_DELETE(rela);
 }
 
 /*Store the special vitrual register map */
@@ -653,6 +657,41 @@ GenProgram::toBinaryFormat(size_t &ret_size)
     getGlobalConstantData(const_data);
     elf_ctx->rodata_sec->set_data(const_data, getGlobalConstantSize());
     GBE_FREE(const_data);
+
+    if (getGlobalRelocCount() > 0) {
+      elf_ctx->reloc_rodata_sec = elf_ctx->writer.sections.add(".rel.rodata");
+      elf_ctx->reloc_rodata_sec->set_type(SHT_RELA);
+      elf_ctx->reloc_rodata_sec->set_info(elf_ctx->rodata_sec->get_index());
+      elf_ctx->reloc_rodata_sec->set_addr_align(0x4);
+      elf_ctx->reloc_rodata_sec->set_entry_size(elf_ctx->writer.get_default_entry_size(SHT_RELA));
+      elf_ctx->reloc_rodata_sec->set_link(elf_ctx->sym_sec->get_index());
+      elf_ctx->rela = GBE_NEW(relocation_section_accessor, elf_ctx->writer, elf_ctx->reloc_rodata_sec);
+
+      char *reloc_data = static_cast<char *>(GBE_MALLOC(getGlobalRelocCount() * sizeof(ir::RelocEntry)));
+      getGlobalRelocTable(reloc_data);
+      ir::RelocEntry *rel_entry = reinterpret_cast<ir::RelocEntry *>(reloc_data);
+      std::sort(rel_entry, rel_entry + getGlobalRelocCount(),
+                [](ir::RelocEntry &a, ir::RelocEntry &b) { return a.defOffset < b.defOffset; });
+
+      std::string last_name;
+      unsigned int var_defOffset;
+      Elf_Word var_symbol;
+      for (uint32_t e = 0; e < getGlobalRelocCount(); e++) {
+        if (last_name != relocTable->getEntryName(rel_entry[e])) {
+          // Add a global symbol
+          var_defOffset = rel_entry[e].defOffset;
+          last_name = relocTable->getEntryName(rel_entry[e]);
+          assert(last_name != ""); // Must have a name
+          var_symbol = elf_ctx->syma->add_symbol(*elf_ctx->stra, last_name.c_str(), var_defOffset,
+                                                 this->constantSet->getConstant(last_name).getSize(),
+                                                 STB_GLOBAL, STT_OBJECT, 0, elf_ctx->rodata_sec->get_index());
+        }
+        elf_ctx->rela->add_entry(rel_entry[e].refOffset, var_symbol, (unsigned char)R_386_32,
+                                 rel_entry[e].defOffset - var_defOffset);
+      }
+
+      GBE_FREE(reloc_data);
+    }
   }
 
   /* Add the note about GPU info */
@@ -707,22 +746,17 @@ GenProgram::toBinaryFormat(size_t &ret_size)
 
     if (write_cl_version == false) {
       std::string ocl_version_str;
-      Elf32_Word cl_version[2]; // major and minor
-
-      oclVersion = k->getOclVersion();
+      oclVersion = k->getOclVersion(); // major and minor
       if (oclVersion == 120) {
         ocl_version_str = "OpenCL 1.2";
-        cl_version[0] = 1;
-        cl_version[1] = 2;
       } else if (oclVersion == 200) {
         ocl_version_str = "OpenCL 2.0";
-        cl_version[0] = 2;
-        cl_version[1] = 0;
       } else
         assert(0);
 
       elf_ctx->cl_note_writer->add_note(GenProgramElfContext::GEN_NOTE_TYPE_CL_VERSION,
-                                        ocl_version_str, cl_version, sizeof(cl_version));
+                                        ocl_version_str, &oclVersion, sizeof(oclVersion));
+      write_cl_version = true;
     } else {
       assert(oclVersion == k->getOclVersion());
     }
diff --git a/backend/src/ir/reloc.cpp b/backend/src/ir/reloc.cpp
index 4884610..70dc0f6 100644
--- a/backend/src/ir/reloc.cpp
+++ b/backend/src/ir/reloc.cpp
@@ -67,7 +67,7 @@ namespace ir {
     for (uint32_t i = 0; i < sz; i++) {
       IN_UPDATE_SZ(refOffset);
       IN_UPDATE_SZ(defOffset);
-      addEntry(refOffset, defOffset);
+      addEntry(refOffset, defOffset, NULL);
     }
 
     IN_UPDATE_SZ(magic);
diff --git a/backend/src/ir/reloc.hpp b/backend/src/ir/reloc.hpp
index de33a8a..27cc943 100644
--- a/backend/src/ir/reloc.hpp
+++ b/backend/src/ir/reloc.hpp
@@ -27,6 +27,7 @@
 
 #include "sys/vector.hpp"
 #include <string.h>
+#include <map>
 
 namespace gbe {
 namespace ir {
@@ -42,17 +43,31 @@ namespace ir {
 
     unsigned int refOffset;
     unsigned int defOffset;
+    friend bool operator< (const RelocEntry& a, const RelocEntry& b) { // lexicographic order on (defOffset, refOffset)
+      if (a.defOffset < b.defOffset)
+        return true;
+      if (a.defOffset > b.defOffset) // must not fall through to refOffset: that would break strict weak ordering for std::map keys
+        return false;
+      return a.refOffset < b.refOffset; // defOffset equal: refOffset decides
+    }
   };
 
   class RelocTable : public NonCopyable, public Serializable
   {
     public:
-      void addEntry(unsigned refOffset, unsigned defOffset) {
+      void addEntry(unsigned refOffset, unsigned defOffset, const char *name) {
         entries.push_back(RelocEntry(refOffset, defOffset));
+        RelocEntry& re = entries.back();
+        entryNames[re] = name;
+      }
+      std::string getEntryName(RelocEntry& re) {
+        if (entryNames.find(re) == entryNames.end())
+          return std::string();
+        return entryNames[re];
       }
       RelocTable() : Serializable() {}
-      RelocTable(const RelocTable& other) : Serializable(other),
-                                            entries(other.entries) {}
+      RelocTable(const RelocTable& other) :
+        Serializable(other), entries(other.entries), entryNames(other.entryNames) {}
       uint32_t getCount() { return entries.size(); }
       void getData(char *p) {
         if (entries.size() > 0 && p)
@@ -80,6 +95,7 @@ namespace ir {
     virtual uint32_t deserializeFromBin(std::istream& ins);
     private:
       vector<RelocEntry> entries;
+      std::map<RelocEntry, std::string> entryNames;
       GBE_CLASS(RelocTable);
   };
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3fefa92..7b07d8d 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1691,8 +1691,7 @@ namespace gbe
             for (uint32_t k = 0; k < relocs.size(); k++) {
               unit.getRelocTable().addEntry(
                   refOffset + relocs[k].refOffset,
-                  relocs[k].defOffset
-                  );
+                  relocs[k].defOffset, name);
             }
           }
         }
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
index 01aa0f3..8dfa52f 100644
--- a/src/cl_gen7_device.h
+++ b/src/cl_gen7_device.h
@@ -30,5 +30,5 @@
 //temporarily define to only export builtin kernel block_motion_estimate_intel only for Gen7
 //will remove after HSW+ also support
 #define GEN7_DEVICE
-#include "cl_gt_device.h"
+#include "cl_gen_device_common.h"
 #undef GEN7_DEVICE
diff --git a/src/gen/cl_command_queue_gen.c b/src/gen/cl_command_queue_gen.c
index 8bbfe2c..1f3e1c1 100644
--- a/src/gen/cl_command_queue_gen.c
+++ b/src/gen/cl_command_queue_gen.c
@@ -67,7 +67,6 @@ typedef struct gen_gpgpu {
     drm_intel_bo *scratch_bo; /* Scratch buffer */
 
     drm_intel_bo *const_bo; /* Constant buffer */
-
     drm_intel_bo *stack_bo; /* stack buffer */
 
     drm_intel_bo *time_stamp_bo; /* The buffer to record exec timestamps */
@@ -267,12 +266,18 @@ gen_gpgpu_setup_global_mem(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu
   int32_t offset = 0;
   cl_mem mem;
   uint32_t bti;
+  cl_program_gen prog_gen;
+  DEV_PRIVATE_DATA(kernel->program, gpu->device, prog_gen);
 
   for (i = 0; i < kernel->arg_n; i++) {
     if (kernel->args[i].arg_type != ArgTypePointer)
       continue;
 
-    if (kernel->args[i].arg_addrspace != AddressSpaceGlobal)
+    if (kernel->args[i].arg_addrspace != AddressSpaceGlobal &&
+        kernel->args[i].arg_addrspace != AddressSpaceConstant)
+      continue;
+
+    if (prog_gen->cl_version < 200 && kernel->args[i].arg_addrspace == AddressSpaceConstant)
       continue;
 
     mem = NULL;
@@ -372,6 +377,44 @@ gen_gpgpu_setup_scratch(gen_gpgpu *gpu)
 }
 
 static cl_int
+gen_setup_constant_buffer_for_20(cl_kernel kernel, cl_kernel_gen kernel_gen,
+                                 cl_program_gen prog_gen, gen_gpgpu *gpu)
+{
+#ifndef HAS_BO_SET_SOFTPIN
+  return CL_OUT_OF_RESOURCES;
+#else
+  int i;
+  cl_bool need_const_buf = CL_FALSE;
+  cl_int const_addr_curbe_offset = -1;
+  cl_gen_virt_phy_offset map = kernel_gen->virt_reg_phy_offset;
+
+  for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) {
+    if (map[i].virt_reg == GBE_CURBE_CONSTANT_ADDRSPACE) {
+      need_const_buf = CL_TRUE;
+      const_addr_curbe_offset = map[i].phy_offset;
+      assert(map[i].size == 8);
+      break;
+    }
+  }
+
+  if (need_const_buf == CL_FALSE)
+    return CL_SUCCESS;
+
+  assert(prog_gen->global_mem_data); // Should always have something
+  assert(const_addr_curbe_offset >= 0);
+
+  gpu->mem.const_bo = intel_buffer_alloc_userptr(gpu->bufmgr, "program global data",
+                                                 prog_gen->global_mem_data, prog_gen->global_mem_data_size, 0);
+  drm_intel_bo_set_softpin_offset(gpu->mem.const_bo, (size_t)prog_gen->global_mem_data);
+  drm_intel_bo_use_48b_address_range(gpu->mem.const_bo, 1);
+  *(char **)(gpu->thread.curbe + const_addr_curbe_offset) = prog_gen->global_mem_data;
+  gen_gpgpu_bind_one_bo(gpu, gpu->mem.const_bo, const_addr_curbe_offset, 0,
+                        prog_gen->global_mem_data_size, BTI_CONSTANT);
+  return CL_SUCCESS;
+#endif
+}
+
+static cl_int
 gen_setup_constant_buffer(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu)
 {
   cl_program_gen prog_gen;
@@ -383,6 +426,11 @@ gen_setup_constant_buffer(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu
   int i;
   DEV_PRIVATE_DATA(kernel->program, gpu->device, prog_gen);
 
+  /* 2.0 is different from before */
+  if (prog_gen->cl_version >= 200) {
+    return gen_setup_constant_buffer_for_20(kernel, kernel_gen, prog_gen, gpu);
+  }
+
   if (prog_gen->rodata) {
     const_buf_size = prog_gen->rodata_data->d_size;
     aligned_const_buf_size = ALIGN(const_buf_size, 8);
diff --git a/src/gen/cl_gen.h b/src/gen/cl_gen.h
index d04a644..0f50e37 100644
--- a/src/gen/cl_gen.h
+++ b/src/gen/cl_gen.h
@@ -182,6 +182,14 @@ extern cl_int cl_kernel_get_info_gen(cl_device_id device, cl_kernel kernel,
 extern cl_int cl_kernel_create_gen(cl_device_id device, cl_kernel kernel);
 
 /*********************************** Program *****************************************/
+enum cl_gen_program_note_type {
+  GEN_NOTE_TYPE_CL_VERSION = 1,
+  GEN_NOTE_TYPE_GPU_VERSION = 2,
+  GEN_NOTE_TYPE_GPU_INFO = 3,
+  GEN_NOTE_TYPE_CL_INFO = 4,
+  GEN_NOTE_TYPE_COMPILER_INFO = 5,
+};
+
 typedef struct _cl_program_gen {
   _cl_program_for_device prog_base;
   Elf *elf;
@@ -205,9 +213,26 @@ typedef struct _cl_program_gen {
   Elf_Scn *func_cl_info;
   cl_int func_cl_info_sec_index;
   Elf_Data *func_cl_info_data;
+  Elf_Scn *ro_reloc;
+  cl_int ro_reloc_index;
+  Elf_Data *ro_reloc_data;
+  char *global_mem_data;
+  cl_uint global_mem_data_size;
+  char *gpu_name;
+  cl_uint gpu_version_major;
+  cl_uint gpu_version_minor;
+  char *compiler_name;
+  cl_uint compiler_version_major;
+  cl_uint compiler_version_minor;
+  char *cl_version_str;
+  cl_uint cl_version;
 } _cl_program_gen;
 typedef _cl_program_gen *cl_program_gen;
 
+#define GEN_ELF_RELOC_GET_SYM(PROG_GEN, RELOC_ENTRY) \
+  gelf_getclass(PROG_GEN->elf) == ELFCLASS64 ? ELF64_R_SYM(RELOC_ENTRY->r_info) : ELF32_R_SYM(RELOC_ENTRY->r_info)
+#define GEN_ELF_RELOC_GET_TYPE(PROG_GEN, RELOC_ENTRY) \
+  gelf_getclass(PROG_GEN->elf) == ELFCLASS64 ? ELF64_R_TYPE(RELOC_ENTRY->r_info) : ELF32_R_TYPE(RELOC_ENTRY->r_info)
 extern void *cl_program_new_gen(cl_device_id device, cl_program p);
 extern void cl_program_delete_gen(cl_device_id device, cl_program p);
 extern cl_int cl_program_load_binary_gen(cl_device_id device, cl_program prog);
diff --git a/src/gen/cl_gen75_device.h b/src/gen/cl_gen75_device.h
index 99b76bf..0d6c812 100644
--- a/src/gen/cl_gen75_device.h
+++ b/src/gen/cl_gen75_device.h
@@ -21,7 +21,7 @@
 .max_parameter_size = 1024,
 .global_mem_cache_line_size = 64, /* XXX */
 .global_mem_cache_size = 8 << 10, /* XXX */
-.local_mem_type = CL_GLOBAL,
+.local_mem_type = CL_LOCAL,
 .local_mem_size = 64 << 10,
 .scratch_mem_size = 2 << 20,
 .max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul,
diff --git a/src/gen/cl_gen7_device.h b/src/gen/cl_gen7_device.h
index 7bf1202..8dfa52f 100644
--- a/src/gen/cl_gen7_device.h
+++ b/src/gen/cl_gen7_device.h
@@ -21,7 +21,7 @@
 .max_parameter_size = 1024,
 .global_mem_cache_line_size = 64, /* XXX */
 .global_mem_cache_size = 8 << 10, /* XXX */
-.local_mem_type = CL_GLOBAL,
+.local_mem_type = CL_LOCAL,
 .local_mem_size = 64 << 10,
 .scratch_mem_size = 12 << 10,
 .max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul,
diff --git a/src/gen/cl_gen8_device.h b/src/gen/cl_gen8_device.h
index b807272..c8b7754 100644
--- a/src/gen/cl_gen8_device.h
+++ b/src/gen/cl_gen8_device.h
@@ -21,7 +21,7 @@
 .max_parameter_size = 1024,
 .global_mem_cache_line_size = 64, /* XXX */
 .global_mem_cache_size = 8 << 10, /* XXX */
-.local_mem_type = CL_GLOBAL,
+.local_mem_type = CL_LOCAL,
 .local_mem_size = 64 << 10,
 .scratch_mem_size = 2 << 20,
 .max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul,
diff --git a/src/gen/cl_gen9_device.h b/src/gen/cl_gen9_device.h
index d069332..7412e98 100644
--- a/src/gen/cl_gen9_device.h
+++ b/src/gen/cl_gen9_device.h
@@ -21,10 +21,13 @@
 .max_parameter_size = 1024,
 .global_mem_cache_line_size = 64, /* XXX */
 .global_mem_cache_size = 8 << 10, /* XXX */
-.local_mem_type = CL_GLOBAL,
+.local_mem_type = CL_LOCAL,
 .local_mem_size = 64 << 10,
 .scratch_mem_size = 2 << 20,
 .max_mem_alloc_size = 4 * 1024 * 1024 * 1024ul,
 .global_mem_size = 4 * 1024 * 1024 * 1024ul,
 
+#define GEN9_DEVICE 1
 #include "cl_gen_device_common.h"
+#undef GEN9_DEVICE
+
diff --git a/src/gen/cl_gen_device_common.h b/src/gen/cl_gen_device_common.h
index 9fef422..16b4811 100644
--- a/src/gen/cl_gen_device_common.h
+++ b/src/gen/cl_gen_device_common.h
@@ -49,11 +49,7 @@
 .native_vector_width_float = 4,
 .native_vector_width_double = 2,
 .native_vector_width_half = 8,
-#ifdef ENABLE_OPENCL_20
-.address_bits = 64,
-#else
 .address_bits = 32,
-#endif
 .svm_capabilities = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER,
 .preferred_platform_atomic_alignment = 0,
 .preferred_global_atomic_alignment = 0,
diff --git a/src/gen/cl_kernel_gen.c b/src/gen/cl_kernel_gen.c
index 4e85c1d..0fd5809 100644
--- a/src/gen/cl_kernel_gen.c
+++ b/src/gen/cl_kernel_gen.c
@@ -199,7 +199,7 @@ cl_program_gen_get_kernel_func_cl_info(cl_device_id device, cl_kernel kernel)
     desc_type = *(cl_uint *)(prog_gen->func_cl_info_data->d_buf + offset + 2 * sizeof(cl_uint));
     name = prog_gen->func_cl_info_data->d_buf + offset + sizeof(cl_uint) * 3;
 
-    if (desc_type != 0x04) {
+    if (desc_type != GEN_NOTE_TYPE_CL_INFO) {
       offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
       continue;
     }
@@ -355,7 +355,7 @@ cl_program_gen_get_one_kernel_func(cl_device_id device, cl_kernel kernel, GElf_S
     desc_size = *(cl_uint *)(prog_gen->func_gpu_info_data->d_buf + offset + sizeof(cl_uint));
     desc_type = *(cl_uint *)(prog_gen->func_gpu_info_data->d_buf + offset + 2 * sizeof(cl_uint));
     name = prog_gen->func_gpu_info_data->d_buf + offset + sizeof(cl_uint) * 3;
-    if (desc_type != 0x03) {
+    if (desc_type != GEN_NOTE_TYPE_GPU_INFO) {
       offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
       continue;
     }
@@ -535,9 +535,9 @@ cl_kernel_create_gen(cl_device_id device, cl_kernel kernel)
   for (i = 0; i < (int)(prog_gen->symtab_entry_num); i++) {
     p_sym_entry = gelf_getsym(prog_gen->symtab_data, i, &sym_entry);
     assert(p_sym_entry == &sym_entry);
-    if ((p_sym_entry->st_info & 0x0f) != STT_FUNC)
+    if (ELF32_ST_TYPE(p_sym_entry->st_info) != STT_FUNC)
       continue;
-    if (((p_sym_entry->st_info & 0x0f0) >> 4) != STB_GLOBAL)
+    if (ELF32_ST_BIND(p_sym_entry->st_info) != STB_GLOBAL)
       continue;
 
     name = p_sym_entry->st_name + prog_gen->strtab_data->d_buf;
diff --git a/src/gen/cl_program_gen.c b/src/gen/cl_program_gen.c
index 3c0b796..3b2f4b5 100644
--- a/src/gen/cl_program_gen.c
+++ b/src/gen/cl_program_gen.c
@@ -17,6 +17,7 @@
  */
 
 #include "cl_gen.h"
+#include <unistd.h>
 
 struct binary_type_header_info {
   unsigned char header[7];
@@ -94,6 +95,24 @@ cl_program_delete_gen(cl_device_id device, cl_program p)
   }
   pd->kernel_names = NULL;
 
+  if (gen_elf->compiler_name)
+    CL_FREE(gen_elf->compiler_name);
+  gen_elf->compiler_name = NULL;
+
+  if (gen_elf->gpu_name)
+    CL_FREE(gen_elf->gpu_name);
+  gen_elf->gpu_name = NULL;
+
+  if (gen_elf->cl_version_str)
+    CL_FREE(gen_elf->cl_version_str);
+  gen_elf->cl_version_str = NULL;
+
+  if (gen_elf->global_mem_data) {
+    CL_FREE(gen_elf->global_mem_data);
+    assert(gen_elf->global_mem_data_size > 0);
+  }
+  gen_elf->global_mem_data = NULL;
+
   if (gen_elf->elf)
     elf_end(gen_elf->elf);
   gen_elf->elf = NULL;
@@ -102,6 +121,69 @@ cl_program_delete_gen(cl_device_id device, cl_program p)
 }
 
 static cl_int
+cl_program_gen_alloc_global_mem(cl_device_id device, cl_program prog, cl_program_gen prog_gen)
+{
+  int i;
+  cl_uint const_buf_size = 0;
+  cl_uint aligned_const_buf_size = 0;
+
+  if (prog_gen->cl_version < 200 && prog_gen->rodata_data != NULL)
+    return CL_INVALID_PROGRAM;
+
+  if (prog_gen->cl_version < 200 || prog_gen->rodata_data == NULL)
+    return CL_SUCCESS;
+
+  const_buf_size = prog_gen->rodata_data->d_size;
+  aligned_const_buf_size = ALIGN(const_buf_size, getpagesize());
+  prog_gen->global_mem_data = CL_MEMALIGN(getpagesize(), aligned_const_buf_size);
+  if (prog_gen->global_mem_data == NULL)
+    return CL_OUT_OF_RESOURCES;
+
+  prog_gen->global_mem_data_size = aligned_const_buf_size;
+  memset(prog_gen->global_mem_data, 0, aligned_const_buf_size);
+  memcpy(prog_gen->global_mem_data, prog_gen->rodata_data->d_buf, prog_gen->rodata_data->d_size);
+
+  /* Do some reloc setting */
+  if (prog_gen->ro_reloc) {
+    GElf_Rela entry;
+    GElf_Rela *p_entry;
+    cl_int ro_reloc_num;
+    GElf_Shdr *p_sec_header = NULL;
+    GElf_Shdr sec_header;
+    GElf_Sym *p_sym_entry;
+    GElf_Sym sym_entry;
+    char *const_buf_addr = prog_gen->global_mem_data;
+    assert(prog_gen->ro_reloc_data);
+
+    p_sec_header = gelf_getshdr(prog_gen->ro_reloc, &sec_header);
+    ro_reloc_num = p_sec_header->sh_size / p_sec_header->sh_entsize;
+    for (i = 0; i < ro_reloc_num; i++) {
+      p_entry = gelf_getrela(prog_gen->ro_reloc_data, i, &entry);
+      if (p_entry == NULL) {
+        return CL_INVALID_PROGRAM;
+      }
+
+      if ((cl_uint)(GEN_ELF_RELOC_GET_TYPE(prog_gen, p_entry)) != R_386_32) {
+        return CL_INVALID_PROGRAM;
+      }
+
+      p_sym_entry = gelf_getsym(prog_gen->symtab_data,
+                                GEN_ELF_RELOC_GET_SYM(prog_gen, p_entry), &sym_entry);
+      if (p_sym_entry == NULL) {
+        return CL_INVALID_PROGRAM;
+      }
+
+      assert(p_entry->r_offset > 0);
+      assert(sizeof(void *) == 8); // Must be 64 bits
+      *(char **)(const_buf_addr + p_entry->r_offset) =
+        (char *)(const_buf_addr + p_sym_entry->st_value + p_entry->r_addend);
+    }
+  }
+
+  return CL_SUCCESS;
+}
+
+static cl_int
 cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
 {
   cl_program_for_device pd;
@@ -115,9 +197,13 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
   GElf_Sym *p_sym_entry;
   GElf_Sym sym_entry;
   char *name;
-  int ret;
   size_t val = 0;
   int i, j;
+  cl_int offset;
+  cl_uint name_size;
+  cl_uint desc_size;
+  cl_uint desc_type;
+  cl_int ret;
 
   DEV_PRIVATE_DATA(prog, device, elf);
   pd = &elf->prog_base;
@@ -191,11 +277,14 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
     } else if (strcmp(sh_strtab_data->d_buf + p_sec_header->sh_name, ".rodata") == 0) {
       elf->rodata = elf_sec;
       elf->rodata_sec_index = i;
+    } else if (strcmp(sh_strtab_data->d_buf + p_sec_header->sh_name, ".rel.rodata") == 0) {
+      elf->ro_reloc = elf_sec;
+      elf->ro_reloc_index = i;
     }
   }
 
-  if (elf->text == NULL || elf->symtab == NULL ||
-      elf->strtab == NULL || elf->func_gpu_info == NULL) {
+  if (elf->text == NULL || elf->symtab == NULL || elf->strtab == NULL ||
+      elf->func_gpu_info == NULL || elf->func_cl_info == NULL) {
     elf_end(elf_p);
     elf->elf = NULL;
     return CL_INVALID_PROGRAM;
@@ -213,13 +302,15 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
   assert(p_sec_header->sh_size % p_sec_header->sh_entsize == 0);
   elf->func_gpu_info_data = elf_getdata(elf->func_gpu_info, NULL);
   assert(elf->func_gpu_info_data);
+  elf->func_cl_info_data = elf_getdata(elf->func_cl_info, NULL);
+  assert(elf->func_cl_info_data);
   if (elf->rodata) {
     elf->rodata_data = elf_getdata(elf->rodata, NULL);
     assert(elf->rodata_data);
   }
-  if (elf->func_cl_info) {
-    elf->func_cl_info_data = elf_getdata(elf->func_cl_info, NULL);
-    assert(elf->func_cl_info_data);
+  if (elf->ro_reloc) {
+    elf->ro_reloc_data = elf_getdata(elf->ro_reloc, NULL);
+    assert(elf->ro_reloc_data);
   }
 
   /* Add all kernel names */
@@ -228,9 +319,9 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
   for (i = 0; i < (int)(elf->symtab_entry_num); i++) {
     p_sym_entry = gelf_getsym(elf->symtab_data, i, &sym_entry);
     assert(p_sym_entry == &sym_entry);
-    if ((p_sym_entry->st_info & 0x0f) != STT_FUNC)
+    if (ELF32_ST_TYPE(p_sym_entry->st_info) != STT_FUNC)
       continue;
-    if (((p_sym_entry->st_info & 0x0f0) >> 4) != STB_GLOBAL)
+    if (ELF32_ST_BIND(p_sym_entry->st_info) != STB_GLOBAL)
       continue;
 
     name = p_sym_entry->st_name + elf->strtab_data->d_buf;
@@ -254,9 +345,9 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
   for (i = 0; i < (int)(elf->symtab_entry_num); i++) {
     p_sym_entry = gelf_getsym(elf->symtab_data, i, &sym_entry);
     assert(p_sym_entry == &sym_entry);
-    if ((p_sym_entry->st_info & 0x0f) != STT_FUNC)
+    if (ELF32_ST_TYPE(p_sym_entry->st_info) != STT_FUNC)
       continue;
-    if (((p_sym_entry->st_info & 0x0f0) >> 4) != STB_GLOBAL)
+    if (ELF32_ST_BIND(p_sym_entry->st_info) != STB_GLOBAL)
       continue;
 
     pd->kernel_names[j] =
@@ -273,7 +364,72 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
   }
   assert(j == pd->kernel_num);
 
-  return CL_SUCCESS;
+  /* Get the compiler name and gpu version */
+  offset = 0;
+  while (offset < elf->func_gpu_info_data->d_size) {
+    name_size = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset);
+    desc_size = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset + sizeof(cl_uint));
+    desc_type = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset + 2 * sizeof(cl_uint));
+    if (desc_type == GEN_NOTE_TYPE_COMPILER_INFO) {
+      elf->compiler_name = CL_CALLOC(name_size + 1, sizeof(char));
+      if (elf->compiler_name == NULL) {
+        elf_end(elf_p);
+        elf->elf = NULL;
+        return CL_OUT_OF_HOST_MEMORY;
+      }
+      memcpy(elf->compiler_name, elf->func_gpu_info_data->d_buf + offset + sizeof(cl_uint) * 3, name_size);
+      elf->compiler_name[name_size] = 0;
+      elf->compiler_version_major = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset +
+                                                 3 * sizeof(cl_uint) + ALIGN(name_size, 4));
+      elf->compiler_version_minor = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset +
+                                                 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + sizeof(cl_uint));
+    } else if (desc_type == GEN_NOTE_TYPE_GPU_VERSION) {
+      elf->gpu_name = CL_CALLOC(name_size + 1, sizeof(char));
+      if (elf->gpu_name == NULL) {
+        elf_end(elf_p);
+        elf->elf = NULL;
+        return CL_OUT_OF_HOST_MEMORY;
+      }
+      memcpy(elf->gpu_name, elf->func_gpu_info_data->d_buf + offset + sizeof(cl_uint) * 3, name_size);
+      elf->gpu_name[name_size] = 0;
+      elf->gpu_version_major = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset +
+                                            3 * sizeof(cl_uint) + ALIGN(name_size, 4));
+      elf->gpu_version_minor = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset +
+                                            3 * sizeof(cl_uint) + ALIGN(name_size, 4) + sizeof(cl_uint));
+    }
+
+    offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
+  }
+
+  /* Get the OpenCL version */
+  offset = 0;
+  while (offset < elf->func_cl_info_data->d_size) {
+    name_size = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset);
+    desc_size = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset + sizeof(cl_uint));
+    desc_type = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset + 2 * sizeof(cl_uint));
+    if (desc_type == GEN_NOTE_TYPE_CL_VERSION) {
+      elf->cl_version_str = CL_CALLOC(name_size + 1, sizeof(char));
+      if (elf->cl_version_str == NULL) {
+        elf_end(elf_p);
+        elf->elf = NULL;
+        return CL_OUT_OF_HOST_MEMORY;
+      }
+      memcpy(elf->cl_version_str, elf->func_cl_info_data->d_buf + offset + sizeof(cl_uint) * 3, name_size);
+      elf->cl_version_str[name_size] = 0;
+      elf->cl_version = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset +
+                                     3 * sizeof(cl_uint) + ALIGN(name_size, 4));
+    }
+
+    offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
+  }
+
+  ret = cl_program_gen_alloc_global_mem(device, prog, elf);
+  if (ret != CL_SUCCESS) {
+    elf_end(elf_p);
+    elf->elf = NULL;
+  }
+
+  return ret;
 }
 
 LOCAL cl_int
diff --git a/src/gen/intel_driver.c b/src/gen/intel_driver.c
index 2f62b22..eac6366 100644
--- a/src/gen/intel_driver.c
+++ b/src/gen/intel_driver.c
@@ -560,8 +560,8 @@ intel_buffer_set_tiling(cl_buffer bo, cl_image_tiling_t tiling, size_t stride)
   return ret;
 }
 
-static cl_buffer
-intel_buffer_alloc_userptr(cl_buffer_mgr bufmgr, const char *name,
+LOCAL drm_intel_bo*
+intel_buffer_alloc_userptr(dri_bufmgr *bufmgr, const char *name,
                            void *data, size_t size, unsigned long flags)
 {
 #ifdef HAS_USERPTR
@@ -572,7 +572,7 @@ intel_buffer_alloc_userptr(cl_buffer_mgr bufmgr, const char *name,
   if (bo == NULL)
     bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data,
                                     I915_TILING_NONE, 0, size, flags | I915_USERPTR_UNSYNCHRONIZED);
-  return (cl_buffer)bo;
+  return bo;
 #else
   return NULL;
 #endif
@@ -912,12 +912,11 @@ intel_update_device_info(cl_device_id device)
 
   host_ptr = CL_MEMALIGN(sz, 4096);
   if (host_ptr != NULL) {
-    cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
-                                              "CL memory object", host_ptr, sz, 0);
+    drm_intel_bo* bo = intel_buffer_alloc_userptr(driver->bufmgr, "CL memory object", host_ptr, sz, 0);
     if (bo == NULL)
       device->host_unified_memory = CL_FALSE;
     else
-      drm_intel_bo_unreference((drm_intel_bo *)bo);
+      drm_intel_bo_unreference(bo);
     CL_FREE(host_ptr);
   } else
     device->host_unified_memory = CL_FALSE;
diff --git a/src/gen/intel_driver.h b/src/gen/intel_driver.h
index 825eebf..d01cd55 100644
--- a/src/gen/intel_driver.h
+++ b/src/gen/intel_driver.h
@@ -132,6 +132,8 @@ extern int intel_get_device_id(void);
 /* methods working in shared mode */
 extern dri_bo *intel_driver_share_buffer(intel_driver_t *, const char *sname, uint32_t name);
 extern uint32_t intel_driver_shared_name(intel_driver_t *, dri_bo *);
+extern dri_bo* intel_buffer_alloc_userptr(dri_bufmgr *bufmgr, const char *name,
+                           void *data, size_t size, unsigned long flags);
 
 /* init the call backs used by the ocl driver */
 extern void intel_setup_callbacks(void);
-- 
2.7.4



More information about the Beignet mailing list