[Beignet] [PATCH newRT] Add relocation table for ELF file to support 2.0
junyan.he at inbox.com
junyan.he at inbox.com
Mon Apr 10 08:21:10 UTC 2017
From: Junyan He <junyan.he at intel.com>
The OpenCL 2.0 spec requires a global memory, and a global pointer
can point to any global variable. We add a .rel.rodata relocation
section (type SHT_RELA) to the ELF file to support this relocation.
The global memory is only available for OpenCL 2.0 and later.
Signed-off-by: Junyan He <junyan.he at intel.com>
---
CMakeLists.txt | 2 +-
backend/src/backend/gen_program_elf.cpp | 54 ++++++++--
backend/src/ir/reloc.cpp | 2 +-
backend/src/ir/reloc.hpp | 22 +++-
backend/src/llvm/llvm_gen_backend.cpp | 3 +-
src/cl_gen7_device.h | 2 +-
src/gen/cl_command_queue_gen.c | 52 +++++++++-
src/gen/cl_gen.h | 25 +++++
src/gen/cl_gen75_device.h | 2 +-
src/gen/cl_gen7_device.h | 2 +-
src/gen/cl_gen8_device.h | 2 +-
src/gen/cl_gen9_device.h | 5 +-
src/gen/cl_gen_device_common.h | 4 -
src/gen/cl_kernel_gen.c | 8 +-
src/gen/cl_program_gen.c | 178 ++++++++++++++++++++++++++++++--
src/gen/intel_driver.c | 11 +-
src/gen/intel_driver.h | 2 +
17 files changed, 327 insertions(+), 49 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe895d0..e6babe4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -239,7 +239,7 @@ IF (EXPERIMENTAL_DOUBLE)
ADD_DEFINITIONS(-DENABLE_FP64)
ENDIF(EXPERIMENTAL_DOUBLE)
-SET(CAN_OPENCL_20 OFF)
+SET(CAN_OPENCL_20 ON)
IF (CMAKE_SIZEOF_VOID_P EQUAL 4)
SET(CAN_OPENCL_20 OFF)
ENDIF (CMAKE_SIZEOF_VOID_P EQUAL 4)
diff --git a/backend/src/backend/gen_program_elf.cpp b/backend/src/backend/gen_program_elf.cpp
index feea392..0c78964 100644
--- a/backend/src/backend/gen_program_elf.cpp
+++ b/backend/src/backend/gen_program_elf.cpp
@@ -262,10 +262,12 @@ public:
section *ker_info_sec;
section *cl_info_sec;
section *rodata_sec;
+ section *reloc_rodata_sec;
symbol_section_accessor *syma;
string_section_accessor *stra;
note_section_accessor *note_writer;
note_section_accessor *cl_note_writer;
+ relocation_section_accessor *rela;
Elf32_Word sym_num;
uint64_t bitcode_offset;
@@ -280,8 +282,8 @@ public:
GenProgramElfContext::GenProgramElfContext(GenProgram &prog)
: genProg(prog), text_sec(NULL), sym_sec(NULL), strtab_sec(NULL), ker_info_sec(NULL),
- cl_info_sec(NULL), rodata_sec(NULL), syma(NULL), stra(NULL), note_writer(NULL),
- cl_note_writer(NULL), sym_num(0), bitcode_offset(0)
+ cl_info_sec(NULL), rodata_sec(NULL), reloc_rodata_sec(NULL), syma(NULL), stra(NULL),
+ note_writer(NULL), cl_note_writer(NULL), rela(NULL), sym_num(0), bitcode_offset(0)
{
writer.create(ELFCLASS64, ELFDATA2LSB);
writer.set_os_abi(ELFOSABI_LINUX);
@@ -339,6 +341,8 @@ GenProgramElfContext::~GenProgramElfContext(void)
GBE_DELETE(note_writer);
if (cl_note_writer)
GBE_DELETE(cl_note_writer);
+ if (rela)
+ GBE_DELETE(rela);
}
/*Store the special vitrual register map */
@@ -653,6 +657,41 @@ GenProgram::toBinaryFormat(size_t &ret_size)
getGlobalConstantData(const_data);
elf_ctx->rodata_sec->set_data(const_data, getGlobalConstantSize());
GBE_FREE(const_data);
+
+ if (getGlobalRelocCount() > 0) {
+ elf_ctx->reloc_rodata_sec = elf_ctx->writer.sections.add(".rel.rodata");
+ elf_ctx->reloc_rodata_sec->set_type(SHT_RELA);
+ elf_ctx->reloc_rodata_sec->set_info(elf_ctx->rodata_sec->get_index());
+ elf_ctx->reloc_rodata_sec->set_addr_align(0x4);
+ elf_ctx->reloc_rodata_sec->set_entry_size(elf_ctx->writer.get_default_entry_size(SHT_RELA));
+ elf_ctx->reloc_rodata_sec->set_link(elf_ctx->sym_sec->get_index());
+ elf_ctx->rela = GBE_NEW(relocation_section_accessor, elf_ctx->writer, elf_ctx->reloc_rodata_sec);
+
+ char *reloc_data = static_cast<char *>(GBE_MALLOC(getGlobalRelocCount() * sizeof(ir::RelocEntry)));
+ getGlobalRelocTable(reloc_data);
+ ir::RelocEntry *rel_entry = reinterpret_cast<ir::RelocEntry *>(reloc_data);
+ std::sort(rel_entry, rel_entry + getGlobalRelocCount(),
+ [](ir::RelocEntry &a, ir::RelocEntry &b) { return a.defOffset < b.defOffset; });
+
+ std::string last_name;
+ unsigned int var_defOffset;
+ Elf_Word var_symbol;
+ for (uint32_t e = 0; e < getGlobalRelocCount(); e++) {
+ if (last_name != relocTable->getEntryName(rel_entry[e])) {
+ // Add a global symbol
+ var_defOffset = rel_entry[e].defOffset;
+ last_name = relocTable->getEntryName(rel_entry[e]);
+ assert(last_name != ""); // Must have a name
+ var_symbol = elf_ctx->syma->add_symbol(*elf_ctx->stra, last_name.c_str(), var_defOffset,
+ this->constantSet->getConstant(last_name).getSize(),
+ STB_GLOBAL, STT_OBJECT, 0, elf_ctx->rodata_sec->get_index());
+ }
+ elf_ctx->rela->add_entry(rel_entry[e].refOffset, var_symbol, (unsigned char)R_386_32,
+ rel_entry[e].defOffset - var_defOffset);
+ }
+
+ GBE_FREE(reloc_data);
+ }
}
/* Add the note about GPU info */
@@ -707,22 +746,17 @@ GenProgram::toBinaryFormat(size_t &ret_size)
if (write_cl_version == false) {
std::string ocl_version_str;
- Elf32_Word cl_version[2]; // major and minor
-
- oclVersion = k->getOclVersion();
+ oclVersion = k->getOclVersion(); // major and minor
if (oclVersion == 120) {
ocl_version_str = "OpenCL 1.2";
- cl_version[0] = 1;
- cl_version[1] = 2;
} else if (oclVersion == 200) {
ocl_version_str = "OpenCL 2.0";
- cl_version[0] = 2;
- cl_version[1] = 0;
} else
assert(0);
elf_ctx->cl_note_writer->add_note(GenProgramElfContext::GEN_NOTE_TYPE_CL_VERSION,
- ocl_version_str, cl_version, sizeof(cl_version));
+ ocl_version_str, &oclVersion, sizeof(oclVersion));
+ write_cl_version = true;
} else {
assert(oclVersion == k->getOclVersion());
}
diff --git a/backend/src/ir/reloc.cpp b/backend/src/ir/reloc.cpp
index 4884610..70dc0f6 100644
--- a/backend/src/ir/reloc.cpp
+++ b/backend/src/ir/reloc.cpp
@@ -67,7 +67,7 @@ namespace ir {
for (uint32_t i = 0; i < sz; i++) {
IN_UPDATE_SZ(refOffset);
IN_UPDATE_SZ(defOffset);
- addEntry(refOffset, defOffset);
+ addEntry(refOffset, defOffset, NULL);
}
IN_UPDATE_SZ(magic);
diff --git a/backend/src/ir/reloc.hpp b/backend/src/ir/reloc.hpp
index de33a8a..27cc943 100644
--- a/backend/src/ir/reloc.hpp
+++ b/backend/src/ir/reloc.hpp
@@ -27,6 +27,7 @@
#include "sys/vector.hpp"
#include <string.h>
+#include <map>
namespace gbe {
namespace ir {
@@ -42,17 +43,31 @@ namespace ir {
unsigned int refOffset;
unsigned int defOffset;
+ friend bool operator< (const RelocEntry& a, const RelocEntry& b) {
+ if (a.defOffset < b.defOffset)
+ return true;
+ if (a.refOffset < b.refOffset)
+ return true;
+ return false;
+ }
};
class RelocTable : public NonCopyable, public Serializable
{
public:
- void addEntry(unsigned refOffset, unsigned defOffset) {
+ void addEntry(unsigned refOffset, unsigned defOffset, const char *name) {
entries.push_back(RelocEntry(refOffset, defOffset));
+ RelocEntry& re = entries.back();
+ entryNames[re] = name;
+ }
+ std::string getEntryName(RelocEntry& re) {
+ if (entryNames.find(re) == entryNames.end())
+ return std::string();
+ return entryNames[re];
}
RelocTable() : Serializable() {}
- RelocTable(const RelocTable& other) : Serializable(other),
- entries(other.entries) {}
+ RelocTable(const RelocTable& other) :
+ Serializable(other), entries(other.entries), entryNames(other.entryNames) {}
uint32_t getCount() { return entries.size(); }
void getData(char *p) {
if (entries.size() > 0 && p)
@@ -80,6 +95,7 @@ namespace ir {
virtual uint32_t deserializeFromBin(std::istream& ins);
private:
vector<RelocEntry> entries;
+ std::map<RelocEntry, std::string> entryNames;
GBE_CLASS(RelocTable);
};
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3fefa92..7b07d8d 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1691,8 +1691,7 @@ namespace gbe
for (uint32_t k = 0; k < relocs.size(); k++) {
unit.getRelocTable().addEntry(
refOffset + relocs[k].refOffset,
- relocs[k].defOffset
- );
+ relocs[k].defOffset, name);
}
}
}
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
index 01aa0f3..8dfa52f 100644
--- a/src/cl_gen7_device.h
+++ b/src/cl_gen7_device.h
@@ -30,5 +30,5 @@
//temporarily define to only export builtin kernel block_motion_estimate_intel only for Gen7
//will remove after HSW+ also support
#define GEN7_DEVICE
-#include "cl_gt_device.h"
+#include "cl_gen_device_common.h"
#undef GEN7_DEVICE
diff --git a/src/gen/cl_command_queue_gen.c b/src/gen/cl_command_queue_gen.c
index 8bbfe2c..1f3e1c1 100644
--- a/src/gen/cl_command_queue_gen.c
+++ b/src/gen/cl_command_queue_gen.c
@@ -67,7 +67,6 @@ typedef struct gen_gpgpu {
drm_intel_bo *scratch_bo; /* Scratch buffer */
drm_intel_bo *const_bo; /* Constant buffer */
-
drm_intel_bo *stack_bo; /* stack buffer */
drm_intel_bo *time_stamp_bo; /* The buffer to record exec timestamps */
@@ -267,12 +266,18 @@ gen_gpgpu_setup_global_mem(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu
int32_t offset = 0;
cl_mem mem;
uint32_t bti;
+ cl_program_gen prog_gen;
+ DEV_PRIVATE_DATA(kernel->program, gpu->device, prog_gen);
for (i = 0; i < kernel->arg_n; i++) {
if (kernel->args[i].arg_type != ArgTypePointer)
continue;
- if (kernel->args[i].arg_addrspace != AddressSpaceGlobal)
+ if (kernel->args[i].arg_addrspace != AddressSpaceGlobal &&
+ kernel->args[i].arg_addrspace != AddressSpaceConstant)
+ continue;
+
+ if (prog_gen->cl_version < 200 && kernel->args[i].arg_addrspace == AddressSpaceConstant)
continue;
mem = NULL;
@@ -372,6 +377,44 @@ gen_gpgpu_setup_scratch(gen_gpgpu *gpu)
}
static cl_int
+gen_setup_constant_buffer_for_20(cl_kernel kernel, cl_kernel_gen kernel_gen,
+ cl_program_gen prog_gen, gen_gpgpu *gpu)
+{
+#ifndef HAS_BO_SET_SOFTPIN
+ return CL_OUT_OF_RESOURCES;
+#else
+ int i;
+ cl_bool need_const_buf = CL_FALSE;
+ cl_int const_addr_curbe_offset = -1;
+ cl_gen_virt_phy_offset map = kernel_gen->virt_reg_phy_offset;
+
+ for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) {
+ if (map[i].virt_reg == GBE_CURBE_CONSTANT_ADDRSPACE) {
+ need_const_buf = CL_TRUE;
+ const_addr_curbe_offset = map[i].phy_offset;
+ assert(map[i].size == 8);
+ break;
+ }
+ }
+
+ if (need_const_buf == CL_FALSE)
+ return CL_SUCCESS;
+
+ assert(prog_gen->global_mem_data); // Should always have something
+ assert(const_addr_curbe_offset >= 0);
+
+ gpu->mem.const_bo = intel_buffer_alloc_userptr(gpu->bufmgr, "program global data",
+ prog_gen->global_mem_data, prog_gen->global_mem_data_size, 0);
+ drm_intel_bo_set_softpin_offset(gpu->mem.const_bo, (size_t)prog_gen->global_mem_data);
+ drm_intel_bo_use_48b_address_range(gpu->mem.const_bo, 1);
+ *(char **)(gpu->thread.curbe + const_addr_curbe_offset) = prog_gen->global_mem_data;
+ gen_gpgpu_bind_one_bo(gpu, gpu->mem.const_bo, const_addr_curbe_offset, 0,
+ prog_gen->global_mem_data_size, BTI_CONSTANT);
+ return CL_SUCCESS;
+#endif
+}
+
+static cl_int
gen_setup_constant_buffer(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu)
{
cl_program_gen prog_gen;
@@ -383,6 +426,11 @@ gen_setup_constant_buffer(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu
int i;
DEV_PRIVATE_DATA(kernel->program, gpu->device, prog_gen);
+ /* 2.0 is different from before */
+ if (prog_gen->cl_version >= 200) {
+ return gen_setup_constant_buffer_for_20(kernel, kernel_gen, prog_gen, gpu);
+ }
+
if (prog_gen->rodata) {
const_buf_size = prog_gen->rodata_data->d_size;
aligned_const_buf_size = ALIGN(const_buf_size, 8);
diff --git a/src/gen/cl_gen.h b/src/gen/cl_gen.h
index d04a644..0f50e37 100644
--- a/src/gen/cl_gen.h
+++ b/src/gen/cl_gen.h
@@ -182,6 +182,14 @@ extern cl_int cl_kernel_get_info_gen(cl_device_id device, cl_kernel kernel,
extern cl_int cl_kernel_create_gen(cl_device_id device, cl_kernel kernel);
/*********************************** Program *****************************************/
+enum cl_gen_program_note_type {
+ GEN_NOTE_TYPE_CL_VERSION = 1,
+ GEN_NOTE_TYPE_GPU_VERSION = 2,
+ GEN_NOTE_TYPE_GPU_INFO = 3,
+ GEN_NOTE_TYPE_CL_INFO = 4,
+ GEN_NOTE_TYPE_COMPILER_INFO = 5,
+};
+
typedef struct _cl_program_gen {
_cl_program_for_device prog_base;
Elf *elf;
@@ -205,9 +213,26 @@ typedef struct _cl_program_gen {
Elf_Scn *func_cl_info;
cl_int func_cl_info_sec_index;
Elf_Data *func_cl_info_data;
+ Elf_Scn *ro_reloc;
+ cl_int ro_reloc_index;
+ Elf_Data *ro_reloc_data;
+ char *global_mem_data;
+ cl_uint global_mem_data_size;
+ char *gpu_name;
+ cl_uint gpu_version_major;
+ cl_uint gpu_version_minor;
+ char *compiler_name;
+ cl_uint compiler_version_major;
+ cl_uint compiler_version_minor;
+ char *cl_version_str;
+ cl_uint cl_version;
} _cl_program_gen;
typedef _cl_program_gen *cl_program_gen;
+#define GEN_ELF_RELOC_GET_SYM(PROG_GEN, RELOC_ENTRY) \
+ gelf_getclass(PROG_GEN->elf) == ELFCLASS64 ? ELF64_R_SYM(RELOC_ENTRY->r_info) : ELF32_R_SYM(RELOC_ENTRY->r_info)
+#define GEN_ELF_RELOC_GET_TYPE(PROG_GEN, RELOC_ENTRY) \
+ gelf_getclass(PROG_GEN->elf) == ELFCLASS64 ? ELF64_R_TYPE(RELOC_ENTRY->r_info) : ELF32_R_TYPE(RELOC_ENTRY->r_info)
extern void *cl_program_new_gen(cl_device_id device, cl_program p);
extern void cl_program_delete_gen(cl_device_id device, cl_program p);
extern cl_int cl_program_load_binary_gen(cl_device_id device, cl_program prog);
diff --git a/src/gen/cl_gen75_device.h b/src/gen/cl_gen75_device.h
index 99b76bf..0d6c812 100644
--- a/src/gen/cl_gen75_device.h
+++ b/src/gen/cl_gen75_device.h
@@ -21,7 +21,7 @@
.max_parameter_size = 1024,
.global_mem_cache_line_size = 64, /* XXX */
.global_mem_cache_size = 8 << 10, /* XXX */
-.local_mem_type = CL_GLOBAL,
+.local_mem_type = CL_LOCAL,
.local_mem_size = 64 << 10,
.scratch_mem_size = 2 << 20,
.max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul,
diff --git a/src/gen/cl_gen7_device.h b/src/gen/cl_gen7_device.h
index 7bf1202..8dfa52f 100644
--- a/src/gen/cl_gen7_device.h
+++ b/src/gen/cl_gen7_device.h
@@ -21,7 +21,7 @@
.max_parameter_size = 1024,
.global_mem_cache_line_size = 64, /* XXX */
.global_mem_cache_size = 8 << 10, /* XXX */
-.local_mem_type = CL_GLOBAL,
+.local_mem_type = CL_LOCAL,
.local_mem_size = 64 << 10,
.scratch_mem_size = 12 << 10,
.max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul,
diff --git a/src/gen/cl_gen8_device.h b/src/gen/cl_gen8_device.h
index b807272..c8b7754 100644
--- a/src/gen/cl_gen8_device.h
+++ b/src/gen/cl_gen8_device.h
@@ -21,7 +21,7 @@
.max_parameter_size = 1024,
.global_mem_cache_line_size = 64, /* XXX */
.global_mem_cache_size = 8 << 10, /* XXX */
-.local_mem_type = CL_GLOBAL,
+.local_mem_type = CL_LOCAL,
.local_mem_size = 64 << 10,
.scratch_mem_size = 2 << 20,
.max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul,
diff --git a/src/gen/cl_gen9_device.h b/src/gen/cl_gen9_device.h
index d069332..7412e98 100644
--- a/src/gen/cl_gen9_device.h
+++ b/src/gen/cl_gen9_device.h
@@ -21,10 +21,13 @@
.max_parameter_size = 1024,
.global_mem_cache_line_size = 64, /* XXX */
.global_mem_cache_size = 8 << 10, /* XXX */
-.local_mem_type = CL_GLOBAL,
+.local_mem_type = CL_LOCAL,
.local_mem_size = 64 << 10,
.scratch_mem_size = 2 << 20,
.max_mem_alloc_size = 4 * 1024 * 1024 * 1024ul,
.global_mem_size = 4 * 1024 * 1024 * 1024ul,
+#define GEN9_DEVICE 1
#include "cl_gen_device_common.h"
+#undef GEN9_DEVICE
+
diff --git a/src/gen/cl_gen_device_common.h b/src/gen/cl_gen_device_common.h
index 9fef422..16b4811 100644
--- a/src/gen/cl_gen_device_common.h
+++ b/src/gen/cl_gen_device_common.h
@@ -49,11 +49,7 @@
.native_vector_width_float = 4,
.native_vector_width_double = 2,
.native_vector_width_half = 8,
-#ifdef ENABLE_OPENCL_20
-.address_bits = 64,
-#else
.address_bits = 32,
-#endif
.svm_capabilities = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER,
.preferred_platform_atomic_alignment = 0,
.preferred_global_atomic_alignment = 0,
diff --git a/src/gen/cl_kernel_gen.c b/src/gen/cl_kernel_gen.c
index 4e85c1d..0fd5809 100644
--- a/src/gen/cl_kernel_gen.c
+++ b/src/gen/cl_kernel_gen.c
@@ -199,7 +199,7 @@ cl_program_gen_get_kernel_func_cl_info(cl_device_id device, cl_kernel kernel)
desc_type = *(cl_uint *)(prog_gen->func_cl_info_data->d_buf + offset + 2 * sizeof(cl_uint));
name = prog_gen->func_cl_info_data->d_buf + offset + sizeof(cl_uint) * 3;
- if (desc_type != 0x04) {
+ if (desc_type != GEN_NOTE_TYPE_CL_INFO) {
offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
continue;
}
@@ -355,7 +355,7 @@ cl_program_gen_get_one_kernel_func(cl_device_id device, cl_kernel kernel, GElf_S
desc_size = *(cl_uint *)(prog_gen->func_gpu_info_data->d_buf + offset + sizeof(cl_uint));
desc_type = *(cl_uint *)(prog_gen->func_gpu_info_data->d_buf + offset + 2 * sizeof(cl_uint));
name = prog_gen->func_gpu_info_data->d_buf + offset + sizeof(cl_uint) * 3;
- if (desc_type != 0x03) {
+ if (desc_type != GEN_NOTE_TYPE_GPU_INFO) {
offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
continue;
}
@@ -535,9 +535,9 @@ cl_kernel_create_gen(cl_device_id device, cl_kernel kernel)
for (i = 0; i < (int)(prog_gen->symtab_entry_num); i++) {
p_sym_entry = gelf_getsym(prog_gen->symtab_data, i, &sym_entry);
assert(p_sym_entry == &sym_entry);
- if ((p_sym_entry->st_info & 0x0f) != STT_FUNC)
+ if (ELF32_ST_TYPE(p_sym_entry->st_info) != STT_FUNC)
continue;
- if (((p_sym_entry->st_info & 0x0f0) >> 4) != STB_GLOBAL)
+ if (ELF32_ST_BIND(p_sym_entry->st_info) != STB_GLOBAL)
continue;
name = p_sym_entry->st_name + prog_gen->strtab_data->d_buf;
diff --git a/src/gen/cl_program_gen.c b/src/gen/cl_program_gen.c
index 3c0b796..3b2f4b5 100644
--- a/src/gen/cl_program_gen.c
+++ b/src/gen/cl_program_gen.c
@@ -17,6 +17,7 @@
*/
#include "cl_gen.h"
+#include <unistd.h>
struct binary_type_header_info {
unsigned char header[7];
@@ -94,6 +95,24 @@ cl_program_delete_gen(cl_device_id device, cl_program p)
}
pd->kernel_names = NULL;
+ if (gen_elf->compiler_name)
+ CL_FREE(gen_elf->compiler_name);
+ gen_elf->compiler_name = NULL;
+
+ if (gen_elf->gpu_name)
+ CL_FREE(gen_elf->gpu_name);
+ gen_elf->gpu_name = NULL;
+
+ if (gen_elf->cl_version_str)
+ CL_FREE(gen_elf->cl_version_str);
+ gen_elf->cl_version_str = NULL;
+
+ if (gen_elf->global_mem_data) {
+ CL_FREE(gen_elf->global_mem_data);
+ assert(gen_elf->global_mem_data_size > 0);
+ }
+ gen_elf->global_mem_data = NULL;
+
if (gen_elf->elf)
elf_end(gen_elf->elf);
gen_elf->elf = NULL;
@@ -102,6 +121,69 @@ cl_program_delete_gen(cl_device_id device, cl_program p)
}
static cl_int
+cl_program_gen_alloc_global_mem(cl_device_id device, cl_program prog, cl_program_gen prog_gen)
+{
+ int i;
+ cl_uint const_buf_size = 0;
+ cl_uint aligned_const_buf_size = 0;
+
+ if (prog_gen->cl_version < 200 && prog_gen->rodata_data != NULL)
+ return CL_INVALID_PROGRAM;
+
+ if (prog_gen->cl_version < 200 || prog_gen->rodata_data == NULL)
+ return CL_SUCCESS;
+
+ const_buf_size = prog_gen->rodata_data->d_size;
+ aligned_const_buf_size = ALIGN(const_buf_size, getpagesize());
+ prog_gen->global_mem_data = CL_MEMALIGN(getpagesize(), aligned_const_buf_size);
+ if (prog_gen->global_mem_data == NULL)
+ return CL_OUT_OF_RESOURCES;
+
+ prog_gen->global_mem_data_size = aligned_const_buf_size;
+ memset(prog_gen->global_mem_data, 0, aligned_const_buf_size);
+ memcpy(prog_gen->global_mem_data, prog_gen->rodata_data->d_buf, prog_gen->rodata_data->d_size);
+
+ /* Do some reloc setting */
+ if (prog_gen->ro_reloc) {
+ GElf_Rela entry;
+ GElf_Rela *p_entry;
+ cl_int ro_reloc_num;
+ GElf_Shdr *p_sec_header = NULL;
+ GElf_Shdr sec_header;
+ GElf_Sym *p_sym_entry;
+ GElf_Sym sym_entry;
+ char *const_buf_addr = prog_gen->global_mem_data;
+ assert(prog_gen->ro_reloc_data);
+
+ p_sec_header = gelf_getshdr(prog_gen->ro_reloc, &sec_header);
+ ro_reloc_num = p_sec_header->sh_size / p_sec_header->sh_entsize;
+ for (i = 0; i < ro_reloc_num; i++) {
+ p_entry = gelf_getrela(prog_gen->ro_reloc_data, i, &entry);
+ if (p_entry == NULL) {
+ return CL_INVALID_PROGRAM;
+ }
+
+ if ((cl_uint)(GEN_ELF_RELOC_GET_TYPE(prog_gen, p_entry)) != R_386_32) {
+ return CL_INVALID_PROGRAM;
+ }
+
+ p_sym_entry = gelf_getsym(prog_gen->symtab_data,
+ GEN_ELF_RELOC_GET_SYM(prog_gen, p_entry), &sym_entry);
+ if (p_sym_entry == NULL) {
+ return CL_INVALID_PROGRAM;
+ }
+
+ assert(p_entry->r_offset > 0);
+ assert(sizeof(void *) == 8); // Must be 64 bits
+ *(char **)(const_buf_addr + p_entry->r_offset) =
+ (char *)(const_buf_addr + p_sym_entry->st_value + p_entry->r_addend);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+static cl_int
cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
{
cl_program_for_device pd;
@@ -115,9 +197,13 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
GElf_Sym *p_sym_entry;
GElf_Sym sym_entry;
char *name;
- int ret;
size_t val = 0;
int i, j;
+ cl_int offset;
+ cl_uint name_size;
+ cl_uint desc_size;
+ cl_uint desc_type;
+ cl_int ret;
DEV_PRIVATE_DATA(prog, device, elf);
pd = &elf->prog_base;
@@ -191,11 +277,14 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
} else if (strcmp(sh_strtab_data->d_buf + p_sec_header->sh_name, ".rodata") == 0) {
elf->rodata = elf_sec;
elf->rodata_sec_index = i;
+ } else if (strcmp(sh_strtab_data->d_buf + p_sec_header->sh_name, ".rel.rodata") == 0) {
+ elf->ro_reloc = elf_sec;
+ elf->ro_reloc_index = i;
}
}
- if (elf->text == NULL || elf->symtab == NULL ||
- elf->strtab == NULL || elf->func_gpu_info == NULL) {
+ if (elf->text == NULL || elf->symtab == NULL || elf->strtab == NULL ||
+ elf->func_gpu_info == NULL || elf->func_cl_info == NULL) {
elf_end(elf_p);
elf->elf = NULL;
return CL_INVALID_PROGRAM;
@@ -213,13 +302,15 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
assert(p_sec_header->sh_size % p_sec_header->sh_entsize == 0);
elf->func_gpu_info_data = elf_getdata(elf->func_gpu_info, NULL);
assert(elf->func_gpu_info_data);
+ elf->func_cl_info_data = elf_getdata(elf->func_cl_info, NULL);
+ assert(elf->func_cl_info_data);
if (elf->rodata) {
elf->rodata_data = elf_getdata(elf->rodata, NULL);
assert(elf->rodata_data);
}
- if (elf->func_cl_info) {
- elf->func_cl_info_data = elf_getdata(elf->func_cl_info, NULL);
- assert(elf->func_cl_info_data);
+ if (elf->ro_reloc) {
+ elf->ro_reloc_data = elf_getdata(elf->ro_reloc, NULL);
+ assert(elf->ro_reloc_data);
}
/* Add all kernel names */
@@ -228,9 +319,9 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
for (i = 0; i < (int)(elf->symtab_entry_num); i++) {
p_sym_entry = gelf_getsym(elf->symtab_data, i, &sym_entry);
assert(p_sym_entry == &sym_entry);
- if ((p_sym_entry->st_info & 0x0f) != STT_FUNC)
+ if (ELF32_ST_TYPE(p_sym_entry->st_info) != STT_FUNC)
continue;
- if (((p_sym_entry->st_info & 0x0f0) >> 4) != STB_GLOBAL)
+ if (ELF32_ST_BIND(p_sym_entry->st_info) != STB_GLOBAL)
continue;
name = p_sym_entry->st_name + elf->strtab_data->d_buf;
@@ -254,9 +345,9 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
for (i = 0; i < (int)(elf->symtab_entry_num); i++) {
p_sym_entry = gelf_getsym(elf->symtab_data, i, &sym_entry);
assert(p_sym_entry == &sym_entry);
- if ((p_sym_entry->st_info & 0x0f) != STT_FUNC)
+ if (ELF32_ST_TYPE(p_sym_entry->st_info) != STT_FUNC)
continue;
- if (((p_sym_entry->st_info & 0x0f0) >> 4) != STB_GLOBAL)
+ if (ELF32_ST_BIND(p_sym_entry->st_info) != STB_GLOBAL)
continue;
pd->kernel_names[j] =
@@ -273,7 +364,72 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog)
}
assert(j == pd->kernel_num);
- return CL_SUCCESS;
+ /* Get the compiler name and gpu version */
+ offset = 0;
+ while (offset < elf->func_gpu_info_data->d_size) {
+ name_size = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset);
+ desc_size = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset + sizeof(cl_uint));
+ desc_type = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset + 2 * sizeof(cl_uint));
+ if (desc_type == GEN_NOTE_TYPE_COMPILER_INFO) {
+ elf->compiler_name = CL_CALLOC(name_size + 1, sizeof(char));
+ if (elf->compiler_name == NULL) {
+ elf_end(elf_p);
+ elf->elf = NULL;
+ return CL_OUT_OF_HOST_MEMORY;
+ }
+ memcpy(elf->compiler_name, elf->func_gpu_info_data->d_buf + offset + sizeof(cl_uint) * 3, name_size);
+ elf->compiler_name[name_size] = 0;
+ elf->compiler_version_major = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset +
+ 3 * sizeof(cl_uint) + ALIGN(name_size, 4));
+ elf->compiler_version_minor = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset +
+ 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + sizeof(cl_uint));
+ } else if (desc_type == GEN_NOTE_TYPE_GPU_VERSION) {
+ elf->gpu_name = CL_CALLOC(name_size + 1, sizeof(char));
+ if (elf->gpu_name == NULL) {
+ elf_end(elf_p);
+ elf->elf = NULL;
+ return CL_OUT_OF_HOST_MEMORY;
+ }
+ memcpy(elf->gpu_name, elf->func_gpu_info_data->d_buf + offset + sizeof(cl_uint) * 3, name_size);
+ elf->gpu_name[name_size] = 0;
+ elf->gpu_version_major = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset +
+ 3 * sizeof(cl_uint) + ALIGN(name_size, 4));
+ elf->gpu_version_minor = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset +
+ 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + sizeof(cl_uint));
+ }
+
+ offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
+ }
+
+ /* Get the OpenCL version */
+ offset = 0;
+ while (offset < elf->func_cl_info_data->d_size) {
+ name_size = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset);
+ desc_size = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset + sizeof(cl_uint));
+ desc_type = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset + 2 * sizeof(cl_uint));
+ if (desc_type == GEN_NOTE_TYPE_CL_VERSION) {
+ elf->cl_version_str = CL_CALLOC(name_size + 1, sizeof(char));
+ if (elf->cl_version_str == NULL) {
+ elf_end(elf_p);
+ elf->elf = NULL;
+ return CL_OUT_OF_HOST_MEMORY;
+ }
+ memcpy(elf->cl_version_str, elf->func_cl_info_data->d_buf + offset + sizeof(cl_uint) * 3, name_size);
+ elf->cl_version_str[name_size] = 0;
+ elf->cl_version = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset +
+ 3 * sizeof(cl_uint) + ALIGN(name_size, 4));
+ }
+
+ offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
+ }
+
+ ret = cl_program_gen_alloc_global_mem(device, prog, elf);
+ if (ret != CL_SUCCESS) {
+ elf_end(elf_p);
+ elf->elf = NULL;
+ }
+
+ return ret;
}
LOCAL cl_int
diff --git a/src/gen/intel_driver.c b/src/gen/intel_driver.c
index 2f62b22..eac6366 100644
--- a/src/gen/intel_driver.c
+++ b/src/gen/intel_driver.c
@@ -560,8 +560,8 @@ intel_buffer_set_tiling(cl_buffer bo, cl_image_tiling_t tiling, size_t stride)
return ret;
}
-static cl_buffer
-intel_buffer_alloc_userptr(cl_buffer_mgr bufmgr, const char *name,
+LOCAL drm_intel_bo*
+intel_buffer_alloc_userptr(dri_bufmgr *bufmgr, const char *name,
void *data, size_t size, unsigned long flags)
{
#ifdef HAS_USERPTR
@@ -572,7 +572,7 @@ intel_buffer_alloc_userptr(cl_buffer_mgr bufmgr, const char *name,
if (bo == NULL)
bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data,
I915_TILING_NONE, 0, size, flags | I915_USERPTR_UNSYNCHRONIZED);
- return (cl_buffer)bo;
+ return bo;
#else
return NULL;
#endif
@@ -912,12 +912,11 @@ intel_update_device_info(cl_device_id device)
host_ptr = CL_MEMALIGN(sz, 4096);
if (host_ptr != NULL) {
- cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
- "CL memory object", host_ptr, sz, 0);
+ drm_intel_bo* bo = intel_buffer_alloc_userptr(driver->bufmgr, "CL memory object", host_ptr, sz, 0);
if (bo == NULL)
device->host_unified_memory = CL_FALSE;
else
- drm_intel_bo_unreference((drm_intel_bo *)bo);
+ drm_intel_bo_unreference(bo);
CL_FREE(host_ptr);
} else
device->host_unified_memory = CL_FALSE;
diff --git a/src/gen/intel_driver.h b/src/gen/intel_driver.h
index 825eebf..d01cd55 100644
--- a/src/gen/intel_driver.h
+++ b/src/gen/intel_driver.h
@@ -132,6 +132,8 @@ extern int intel_get_device_id(void);
/* methods working in shared mode */
extern dri_bo *intel_driver_share_buffer(intel_driver_t *, const char *sname, uint32_t name);
extern uint32_t intel_driver_shared_name(intel_driver_t *, dri_bo *);
+extern dri_bo* intel_buffer_alloc_userptr(dri_bufmgr *bufmgr, const char *name,
+ void *data, size_t size, unsigned long flags);
/* init the call backs used by the ocl driver */
extern void intel_setup_callbacks(void);
--
2.7.4
More information about the Beignet
mailing list