[Beignet] [PATCH V3 newRT] Add GenProgram::toBinaryFormat to generate ELF format binary.

junyan.he at inbox.com junyan.he at inbox.com
Thu Mar 2 08:54:17 UTC 2017


From: Junyan He <junyan.he at intel.com>

Add this function to generate a standard ELF-format binary.
All the detailed information the runtime needs is stored in
note sections (.note.gpu_info and .note.cl_info), so the runtime
and the compiler can be cleanly separated.

V2:
Add OpenCL info such as Argument name, workgroup size, etc.
Add GPU version and OpenCL version info.
Use struct and template to clear up the code.

V3:
Fix some bugs.

Signed-off-by: Junyan He <junyan.he at intel.com>
---
 backend/src/backend/gen_program_elf.cpp | 672 ++++++++++++++++++++++----------
 1 file changed, 471 insertions(+), 201 deletions(-)

diff --git a/backend/src/backend/gen_program_elf.cpp b/backend/src/backend/gen_program_elf.cpp
index 0440e81..c750ca8 100644
--- a/backend/src/backend/gen_program_elf.cpp
+++ b/backend/src/backend/gen_program_elf.cpp
@@ -19,6 +19,7 @@
 #include "elfio/elfio.hpp"
 #include "backend/gen_program.hpp"
 #include "backend/gen_program.hpp"
+#include "sys/cvar.hpp"
 #include <algorithm>
 #include <sstream>
 #include <streambuf>
@@ -26,6 +27,9 @@ using namespace std;
 
 namespace gbe
 {
+
+BVAR(OCL_DUMP_ELF_FILE, false);
+
 /* The elf writer need to make sure seekp function work, so sstream
    can not work, and we do not want the fostream to generate the real
    file. We just want to keep the elf image in the memory. Implement
@@ -125,82 +129,338 @@ protected:
 
 using namespace ELFIO;
 
+/* The format for one Gen Kernel function is following note section format
+ --------------------------
+ | GEN_NOTE_TYPE_GPU_INFO |
+ --------------------------
+ | Function Name size:4 |
+ ------------------------
+ | Desc size:4  |
+ ---------------------------
+ | The kernel name(strlen) |
+ -----------------------------------------------------------------------------------------------
+ | SIMD:4 | Local Mem Size:4 | Scratch Size:4 | Stack Size :4 | Barrier/SLM Used:4 | Arg Num:4 |
+ -----------------------------------------------------------------------------------------------
+   Then the format for each argument is
+ --------------------------------------------------------------------------------------------------------------------------
+ | Index:4 | Size:4 | Type:4 | Offset:4 | Addr Space:4 | Align(if is ptr) | BTI(if buffer):4 / Index(sampler and image):4 |
+ --------------------------------------------------------------------------------------------------------------------------
+   Then all sampler info
+ ------------------------------------
+ | Number:4 | SamplerInfo:4 | ......|
+ ------------------------------------
+   Then all image info
+ --------------------------------------------------------------------------------------------
+ | Number:4 | BTI:4 | Width:4 | Height:4 | Depth:4 | Data Type:4 | Channel Order:4 | .......|
+ --------------------------------------------------------------------------------------------
+   Last is the map table of special virtual register and phy register
+ --------------------------------------------------------
+ | Number:4 | Virt Reg:4 | Phy Reg:4 | Size:4 |.........|
+ --------------------------------------------------------  */
+
+/* The format for one Gen Kernel function's OpenCL info is following note section format
+ --------------------------
+ | GEN_NOTE_TYPE_CL_INFO  |
+ ----------------------------------------
+ | The kernel function's name: (strlen) |
+ ----------------------------------------
+ | Function's attribute string: (strlen)|
+ ----------------------------------------
+ | Work Group size: sizeof(size_t) * 3  |
+ ----------------------------------------
+ | Argument TypeName: (strlen) |
+ ---------------------------------
+ | Argument AccessQual: (strlen) |
+ ---------------------------------
+ | Argument Name: (strlen) |
+ ---------------------------  */
+
+/* The format for GPU version is:
+ ----------------------------
+ | GEN_NOTE_TYPE_GPU_VERSION |
+ -----------------------------
+ | GEN string (HasWell e.g.) |
+ -----------------------------
+ | GEN pci id |
+ --------------
+ | GEN version major:4 |
+ -----------------------
+ | GEN version minor:4 |
+ -----------------------  */
+
+/* The format for CL version is:
+ ----------------------------
+ | GEN_NOTE_TYPE_CL_VERSION |
+ ----------------------------------------
+ | CL version string (OpenCL 2.0  e.g.) |
+ ----------------------------------------
+ | CL version major:4 |
+ ----------------------
+ | CL version minor:4 |
+ ----------------------  */
+
+/* The format for Compiler info is:
+ -------------------------------
+ | GEN_NOTE_TYPE_COMPILER_INFO |
+ ----------------------------------------
+ | Compiler name (GBE_Compiler  e.g.) |
+ ----------------------------------------
+ | LLVM version major:4 |
+ ------------------------
+ | LLVM version minor:4 |
+ ------------------------ */
+
 class GenProgramElfContext
 {
 public:
-  enum { // 0, 1, 2 already have meanings
+  enum {
+    GEN_NOTE_TYPE_CL_VERSION = 1,
+    GEN_NOTE_TYPE_GPU_VERSION = 2,
     GEN_NOTE_TYPE_GPU_INFO = 3,
-    GEN_NOTE_TYPE_CL_ARG_INFO = 4,
-    GEN_NOTE_TYPE_CL_WORKGROUP_SIZE = 5,
+    GEN_NOTE_TYPE_CL_INFO = 4,
+    GEN_NOTE_TYPE_COMPILER_INFO = 5,
+  };
+
+  struct KernelInfoHelper {
+    Elf32_Word simd;
+    Elf32_Word local_mem_size;
+    Elf32_Word scratch_size;
+    Elf32_Word stack_size;
+    Elf32_Word barrier_slm_used;
+    Elf32_Word arg_num;
+  };
+  struct ArgInfoHelper {
+    Elf32_Word index;
+    Elf32_Word size;
+    Elf32_Word type;
+    Elf32_Word offset;
+    Elf32_Word addr_space;
+    Elf32_Word align;
+    Elf32_Word extra;
+  };
+  struct ImageInfoHelper {
+    Elf32_Word bti;
+    Elf32_Word width;
+    Elf32_Word height;
+    Elf32_Word depth;
+    Elf32_Word data_type;
+    Elf32_Word channel_order;
+  };
+  struct VirtRegMapHelper {
+    Elf32_Word virt_reg;
+    Elf32_Word phy_reg;
+    Elf32_Word size;
   };
 
   GenProgram &genProg;
-  void emitOneKernel(GenKernel &kernel);
+
   elfio writer;
   section *text_sec;
   section *sym_sec;
   section *strtab_sec;
   section *ker_info_sec;
+  section *cl_info_sec;
   section *rodata_sec;
   symbol_section_accessor *syma;
   string_section_accessor *stra;
   note_section_accessor *note_writer;
+  note_section_accessor *cl_note_writer;
   Elf32_Word sym_num;
-  Elf64_Word bitcode_offset;
+  uint64_t bitcode_offset;
 
-  ~GenProgramElfContext(void)
-  {
-    if (syma)
-      GBE_DELETE(syma);
-    if (stra)
-      GBE_DELETE(stra);
-    if (note_writer)
-      GBE_DELETE(note_writer);
+  GenProgramElfContext(GenProgram &prog);
+  ~GenProgramElfContext(void);
+
+  template <gbe_curbe_type curbe_enum, typename TYPE, int UNIFORM>
+  void emitOneCurbeReg(unsigned int &total_num, char *&ptr, GenKernel &kernel);
+  void emitOneKernel(GenKernel &kernel);
+  void emitOneKernelCLInfo(GenKernel &kernel);
+};
+
+GenProgramElfContext::GenProgramElfContext(GenProgram &prog)
+  : genProg(prog), text_sec(NULL), sym_sec(NULL), strtab_sec(NULL), ker_info_sec(NULL),
+    cl_info_sec(NULL), rodata_sec(NULL), syma(NULL), stra(NULL), note_writer(NULL),
+    cl_note_writer(NULL), sym_num(0), bitcode_offset(0)
+{
+  writer.create(ELFCLASS64, ELFDATA2LSB);
+  writer.set_os_abi(ELFOSABI_LINUX);
+  writer.set_type(ET_REL);
+  writer.set_machine(EM_INTEL205); // TODO: Some value of Intel GPU;
+
+  // Create code section
+  text_sec = writer.sections.add(".text");
+  text_sec->set_type(SHT_PROGBITS);
+  text_sec->set_flags(SHF_ALLOC | SHF_EXECINSTR);
+  text_sec->set_addr_align(4);
+
+  // Create string table section
+  strtab_sec = writer.sections.add(".strtab");
+  strtab_sec->set_type(SHT_STRTAB);
+  strtab_sec->set_addr_align(1);
+
+  // Create symbol table section
+  sym_sec = writer.sections.add(".symtab");
+  sym_sec->set_type(SHT_SYMTAB);
+  sym_sec->set_addr_align(0x4);
+  sym_sec->set_entry_size(writer.get_default_entry_size(SHT_SYMTAB));
+  sym_sec->set_link(strtab_sec->get_index());
+  sym_sec->set_info(0x01);
+
+  // Create kernel info section
+  ker_info_sec = writer.sections.add(".note.gpu_info");
+  ker_info_sec->set_type(SHT_NOTE);
+  ker_info_sec->set_flags(SHF_ALLOC);
+  ker_info_sec->set_addr_align(0x04);
+
+  // Create cl info section
+  cl_info_sec = writer.sections.add(".note.cl_info");
+  cl_info_sec->set_type(SHT_NOTE);
+  cl_info_sec->set_flags(SHF_ALLOC);
+  cl_info_sec->set_addr_align(0x04);
+
+  // Create string table writer
+  stra = GBE_NEW(string_section_accessor, strtab_sec);
+  // Create symbol table writer
+  syma = GBE_NEW(symbol_section_accessor, writer, sym_sec);
+  // Create note writer
+  note_writer = GBE_NEW(note_section_accessor, writer, ker_info_sec);
+  // Create CL note writer
+  cl_note_writer = GBE_NEW(note_section_accessor, writer, cl_info_sec);
+}
+
+GenProgramElfContext::~GenProgramElfContext(void)
+{
+  if (syma)
+    GBE_DELETE(syma);
+  if (stra)
+    GBE_DELETE(stra);
+  if (note_writer)
+    GBE_DELETE(note_writer);
+  if (cl_note_writer)
+    GBE_DELETE(cl_note_writer);
+}
+
+/* Store one entry of the special virtual register map: when the kernel has a curbe offset for curbe_enum, write a VirtRegMapHelper at ptr, advance ptr past it and increment total_num. */
+template <gbe_curbe_type curbe_enum, typename TYPE, int UNIFORM>
+void GenProgramElfContext::emitOneCurbeReg(unsigned int &total_num, char *&ptr, GenKernel &kernel)
+{
+  int32_t offset = kernel.getCurbeOffset(curbe_enum, 0);
+  if (offset >= 0) { // a negative offset writes nothing and leaves ptr/total_num untouched
+    VirtRegMapHelper *vri = reinterpret_cast<VirtRegMapHelper *>(ptr);
+    vri->virt_reg = curbe_enum;
+    vri->phy_reg = offset;
+    vri->size = UNIFORM ? sizeof(TYPE) : sizeof(TYPE) * kernel.getSIMDWidth(); // non-uniform: sizeof(TYPE) times the SIMD width
+    ptr += sizeof(VirtRegMapHelper);
+    total_num++;
+  }
+}
+template <> // explicit specialization for GBE_CURBE_EXTRA_ARGUMENT
+void GenProgramElfContext::emitOneCurbeReg<GBE_CURBE_EXTRA_ARGUMENT, uint64_t, 0>(
+  unsigned int &total_num, char *&ptr, GenKernel &kernel)
+{
+  int32_t offset = kernel.getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER); // sub-index is GBE_STACK_BUFFER rather than 0
+  if (offset >= 0) {
+    VirtRegMapHelper *vri = reinterpret_cast<VirtRegMapHelper *>(ptr);
+    vri->virt_reg = GBE_CURBE_EXTRA_ARGUMENT;
+    vri->phy_reg = offset;
+    vri->size = sizeof(uint64_t); // fixed size; the SIMD width is not applied here
+    ptr += sizeof(VirtRegMapHelper);
+    total_num++;
   }
+}
 
-  GenProgramElfContext(GenProgram &prog) : genProg(prog), text_sec(NULL), sym_sec(NULL),
-                                           strtab_sec(NULL), ker_info_sec(NULL), rodata_sec(NULL),
-                                           syma(NULL), stra(NULL), note_writer(NULL), sym_num(0),
-                                           bitcode_offset(0)
-  {
-    writer.create(ELFCLASS64, ELFDATA2LSB);
-    writer.set_os_abi(ELFOSABI_LINUX);
-    writer.set_type(ET_REL);
-    writer.set_machine(EM_INTEL205); // TODO: Some value of Intel GPU;
-
-    // Create code section
-    text_sec = writer.sections.add(".text");
-    text_sec->set_type(SHT_PROGBITS);
-    text_sec->set_flags(SHF_ALLOC | SHF_EXECINSTR);
-    text_sec->set_addr_align(4);
-
-    // Create string table section
-    strtab_sec = writer.sections.add(".strtab");
-    strtab_sec->set_type(SHT_STRTAB);
-    strtab_sec->set_addr_align(1);
-
-    // Create symbol table section
-    sym_sec = writer.sections.add(".symtab");
-    sym_sec->set_type(SHT_SYMTAB);
-    sym_sec->set_addr_align(0x4);
-    sym_sec->set_entry_size(writer.get_default_entry_size(SHT_SYMTAB));
-    sym_sec->set_link(strtab_sec->get_index());
-    sym_sec->set_info(0x01);
-
-    // Create kernel info section
-    ker_info_sec = writer.sections.add(".note.gpu_info");
-    ker_info_sec->set_type(SHT_NOTE);
-    text_sec->set_flags(SHF_ALLOC);
-    ker_info_sec->set_addr_align(0x04);
-
-    // Create string table writer
-    stra = GBE_NEW(string_section_accessor, strtab_sec);
-    // Create symbol table writer
-    syma = GBE_NEW(symbol_section_accessor, writer, sym_sec);
-    // Create note writer
-    note_writer = GBE_NEW(note_section_accessor, writer, ker_info_sec);
+void GenProgramElfContext::emitOneKernelCLInfo(GenKernel &kernel)
+{
+  uint32_t all_str_len = 0;
+  uint32_t attr_size = 0;
+  size_t wg_sz[3];
+  uint32_t wg_sz_size = 0;
+  uint32_t arg_info_size = 0;
+
+  if ((kernel.getFunctionAttributes())[0] != 0)
+    attr_size = ::strlen(kernel.getFunctionAttributes()) + 1;
+  all_str_len = ALIGN(attr_size, 4);
+
+  kernel.getCompileWorkGroupSize(wg_sz);
+  if (wg_sz[0] > 0 || wg_sz[1] > 0 || wg_sz[2] > 0) {
+    wg_sz_size = sizeof(size_t) * 3;
   }
-};
+  all_str_len = all_str_len + wg_sz_size;
+
+  for (unsigned int i = 0; i < kernel.getArgNum(); i++) {
+    KernelArgument::ArgInfo *arg_info = kernel.getArgInfo(i);
+    if (arg_info == NULL) {
+      assert(i == 0); // All have info or none has info
+      break;
+    }
+    arg_info_size += arg_info->typeName.length() + 1;
+    arg_info_size += arg_info->accessQual.length() + 1;
+    arg_info_size += arg_info->typeQual.length() + 1;
+    arg_info_size += arg_info->argName.length() + 1;
+    arg_info_size = ALIGN(arg_info_size, 4);
+  }
+  all_str_len = all_str_len + arg_info_size;
+
+  if (all_str_len == 0)
+    return;
+
+  all_str_len += 3 * sizeof(uint32_t); // The length themselves
+  char *cl_info = static_cast<char *>(GBE_MALLOC(all_str_len));
+  *reinterpret_cast<uint32_t *>(cl_info) = attr_size;
+  *reinterpret_cast<uint32_t *>(cl_info + sizeof(uint32_t)) = wg_sz_size;
+  *reinterpret_cast<uint32_t *>(cl_info + 2 * sizeof(uint32_t)) = arg_info_size;
+
+  size_t offset = 3 * sizeof(uint32_t);
+
+  if (attr_size > 0) {
+    ::memcpy(cl_info + offset, kernel.getFunctionAttributes(),
+             ::strlen(kernel.getFunctionAttributes()) + 1);
+    offset += attr_size;
+    offset = ALIGN(offset, 4);
+  }
+
+  if (wg_sz_size > 0) {
+    ::memcpy(cl_info + offset, wg_sz, sizeof(size_t) * 3);
+    offset += wg_sz_size;
+  }
+
+  if (arg_info_size) {
+    for (unsigned int i = 0; i < kernel.getArgNum(); i++) {
+      KernelArgument::ArgInfo *arg_info = kernel.getArgInfo(i);
+      assert(arg_info != NULL);
+      if (arg_info->typeName.length() > 0)
+        ::memcpy(cl_info + offset, arg_info->typeName.c_str(), arg_info->typeName.length() + 1);
+      else
+        *(cl_info + offset) = 0;
+      offset += (arg_info->typeName.length() + 1);
+
+      if (arg_info->accessQual.length() > 0)
+        ::memcpy(cl_info + offset, arg_info->accessQual.c_str(), arg_info->accessQual.length() + 1);
+      else
+        *(cl_info + offset) = 0;
+      offset += (arg_info->accessQual.length() + 1);
+
+      if (arg_info->typeQual.length() > 0)
+        ::memcpy(cl_info + offset, arg_info->typeQual.c_str(), arg_info->typeQual.length() + 1);
+      else
+        *(cl_info + offset) = 0;
+      offset += (arg_info->typeQual.length() + 1);
+
+      if (arg_info->argName.length() > 0)
+        ::memcpy(cl_info + offset, arg_info->argName.c_str(), arg_info->argName.length() + 1);
+      else
+        *(cl_info + offset) = 0;
+      offset += (arg_info->argName.length() + 1);
+
+      offset = ALIGN(offset, 4);
+    }
+  }
+
+  assert(offset == all_str_len);
+
+  cl_note_writer->add_note(GEN_NOTE_TYPE_CL_INFO, kernel.getName(), cl_info, all_str_len);
+  GBE_FREE(cl_info);
+}
 
 void GenProgramElfContext::emitOneKernel(GenKernel &kernel)
 {
@@ -219,24 +479,9 @@ void GenProgramElfContext::emitOneKernel(GenKernel &kernel)
                    STB_GLOBAL, STT_FUNC, 0, text_sec->get_index());
   bitcode_offset += kernel.getCodeSize();
 
-  void *kernel_info = GBE_MALLOC(8 * 1024);
-  void *ptr = kernel_info;
-  Elf32_Word *p_simd = static_cast<Elf32_Word *>(ptr);
-  Elf32_Word *p_slm_sz = static_cast<Elf32_Word *>(ptr) + 1;
-  Elf32_Word *p_scratch_sz = static_cast<Elf32_Word *>(ptr) + 2;
-  Elf32_Word *p_stack_sz = static_cast<Elf32_Word *>(ptr) + 3;
-  Elf32_Word *p_barrier_slm_used = static_cast<Elf32_Word *>(ptr) + 4;
-  Elf32_Word *p_arg_num = static_cast<Elf32_Word *>(ptr) + 5;
-  *p_simd = kernel.getSIMDWidth();
-  *p_slm_sz = kernel.getSLMSize();
-  *p_scratch_sz = kernel.getScratchSize();
-  *p_stack_sz = kernel.getStackSize();
-  *p_barrier_slm_used = kernel.getUseSLM();
-  *p_arg_num = kernel.getArgNum();
-
-  ptr = static_cast<char *>(ptr) + 6 * sizeof(Elf32_Word);
-
-  size_t sampler_data_sz = kernel.getSamplerSize();
+  uint32_t arg_num = kernel.getArgNum();
+
+  size_t sampler_data_sz = kernel.getSamplerSize() * sizeof(uint32_t);
   uint32_t *sampler_data = NULL;
   if (sampler_data_sz) {
     sampler_data = static_cast<uint32_t *>(GBE_MALLOC(sampler_data_sz));
@@ -244,149 +489,128 @@ void GenProgramElfContext::emitOneKernel(GenKernel &kernel)
     kernel.getSamplerData(sampler_data);
   }
 
-  size_t image_data_sz = kernel.getImageSize();
+  size_t image_data_sz = kernel.getImageSize() * sizeof(ImageInfo);
   ImageInfo *image_data = NULL;
   if (image_data_sz) {
     image_data = static_cast<ImageInfo *>(GBE_MALLOC(image_data_sz));
     ::memset(image_data, 0, image_data_sz);
     kernel.getImageData(image_data);
+    std::sort(image_data, image_data + image_data_sz / sizeof(ImageInfo),
+              [](ImageInfo &a, ImageInfo &b) { return a.idx < b.idx; });
   }
 
-  for (unsigned int i = 0; i < *p_arg_num; i++) {
-    Elf32_Word *arg_index = static_cast<Elf32_Word *>(ptr);
-    Elf32_Word *arg_size = static_cast<Elf32_Word *>(ptr) + 1;
-    Elf32_Word *arg_type = static_cast<Elf32_Word *>(ptr) + 2;
-    Elf32_Word *arg_offset = static_cast<Elf32_Word *>(ptr) + 3;
-    Elf32_Word *arg_space = static_cast<Elf32_Word *>(ptr) + 4;
-    Elf32_Word *arg_align = static_cast<Elf32_Word *>(ptr) + 5;
-    Elf32_Word *arg_info = static_cast<Elf32_Word *>(ptr) + 6;
-
-    *arg_index = i;
-    *arg_size = kernel.getArgSize(i);
-    *arg_type = kernel.getArgType(i);
-    *arg_align = kernel.getArgAlign(i);
-#if 0
-    *arg_space = kernel.getArgAddressSpace(i);
-
-    if (*arg_type == GBE_ARG_TYPE_POINTER && *arg_space == GBE_ADDRESS_SPACE_GLOBAL) {
-      *arg_info = kernel.getArgBTI(i);
-    } else if (*arg_type == GBE_ARG_TYPE_IMAGE) {
+  void *kernel_info = GBE_MALLOC(4 /* For align */ +
+                                 sizeof(KernelInfoHelper) + arg_num * sizeof(ArgInfoHelper) +
+                                 sizeof(Elf32_Word) /* For sampler num */ + sampler_data_sz /* sampler words copied below */ +
+                                 sizeof(Elf32_Word) /* For image num */ +
+                                 ((image_data_sz / sizeof(ImageInfo)) * sizeof(ImageInfoHelper)) +
+                                 sizeof(Elf32_Word) /* For virt/phy num */ +
+                                 GBE_GEN_REG * sizeof(VirtRegMapHelper));
+  char *ptr = reinterpret_cast<char *>(ALIGN(reinterpret_cast<long>(kernel_info), 4));
+  KernelInfoHelper *ki = reinterpret_cast<KernelInfoHelper *>(ptr);
+  ki->simd = kernel.getSIMDWidth();
+  ki->local_mem_size = kernel.getSLMSize();
+  ki->scratch_size = kernel.getScratchSize();
+  ki->stack_size = kernel.getStackSize();
+  ki->barrier_slm_used = kernel.getUseSLM();
+  ki->arg_num = kernel.getArgNum();
+  ptr += sizeof(KernelInfoHelper);
+
+  for (unsigned int i = 0; i < arg_num; i++) {
+    ArgInfoHelper *argi = reinterpret_cast<ArgInfoHelper *>(ptr);
+    argi->index = i;
+    argi->size = kernel.getArgSize(i);
+    argi->type = kernel.getArgType(i);
+    argi->addr_space = kernel.getArgAddressSpace(i);
+    argi->align = kernel.getArgAlign(i);
+
+    if (argi->type == GBE_ARG_TYPE_POINTER && argi->addr_space == GBE_ADDRESS_SPACE_GLOBAL) {
+      argi->extra = kernel.getArgBTI(i);
+    } else if (argi->type == GBE_ARG_TYPE_IMAGE) {
       assert(image_data_sz > 0);
       for (size_t j = 0; j < image_data_sz / sizeof(ImageInfo); j++) {
         if (image_data[j].arg_idx == static_cast<int32_t>(i)) {
-          *arg_info = static_cast<Elf32_Word>(image_data[j].idx);
+          argi->extra = static_cast<Elf32_Word>(j);
           break;
         }
       }
-    } else if (*arg_type == GBE_ARG_TYPE_SAMPLER) {
+    } else if (argi->type == GBE_ARG_TYPE_SAMPLER) {
       assert(sampler_data_sz > 0);
       for (size_t j = 0; j < sampler_data_sz / sizeof(uint32_t); j++) {
         if (((sampler_data[i] & __CLK_SAMPLER_ARG_MASK) >> __CLK_SAMPLER_ARG_BASE) ==
             static_cast<uint32_t>(i)) {
-          *arg_info = static_cast<Elf32_Word>(j);
+          argi->extra = static_cast<Elf32_Word>(j);
           break;
         }
       }
+    } else {
+      argi->extra = 0;
     }
-#endif
-    *arg_offset = kernel.getCurbeOffset(GBE_CURBE_KERNEL_ARGUMENT, i);
-    ptr = static_cast<char *>(ptr) + 7 * sizeof(Elf32_Word);
+
+    argi->offset = kernel.getCurbeOffset(GBE_CURBE_KERNEL_ARGUMENT, i);
+    ptr += sizeof(ArgInfoHelper);
   }
 
   /* Store all the sampler info */
-  *(static_cast<Elf32_Word *>(ptr)) =
+  *(reinterpret_cast<Elf32_Word *>(ptr)) =
     static_cast<Elf32_Word>(sampler_data_sz / sizeof(uint32_t)); // Samper number
-  ptr = static_cast<char *>(ptr) + sizeof(Elf32_Word);
+  ptr = ptr + sizeof(Elf32_Word);
   if (sampler_data_sz > 0) {
     ::memcpy(ptr, sampler_data, sampler_data_sz);
     GBE_FREE(sampler_data);
-    ptr = static_cast<char *>(ptr) + sampler_data_sz;
+    ptr = ptr + sampler_data_sz;
   }
 
   /* Store all the Image info */
-  *(static_cast<Elf32_Word *>(ptr)) =
+  *(reinterpret_cast<Elf32_Word *>(ptr)) =
     static_cast<Elf32_Word>(image_data_sz / sizeof(ImageInfo)); // Image number
   ptr = static_cast<char *>(ptr) + sizeof(Elf32_Word);
+  /* Store all the image info by index */
   if (image_data_sz > 0) {
-    std::sort(image_data, image_data + image_data_sz / sizeof(ImageInfo),
-              [](ImageInfo &a, ImageInfo &b) { return a.idx < b.idx; });
-
-    /* Store all the image info by index */
     for (size_t i = 0; i < image_data_sz / sizeof(ImageInfo); i++) {
-      Elf32_Word *bti = static_cast<Elf32_Word *>(ptr);
-      Elf32_Word *width = static_cast<Elf32_Word *>(ptr) + 1;
-      Elf32_Word *height = static_cast<Elf32_Word *>(ptr) + 2;
-      Elf32_Word *depth = static_cast<Elf32_Word *>(ptr) + 3;
-      Elf32_Word *data_type = static_cast<Elf32_Word *>(ptr) + 4;
-      Elf32_Word *channel_order = static_cast<Elf32_Word *>(ptr) + 5;
-      Elf32_Word *dim_order = static_cast<Elf32_Word *>(ptr) + 6;
-
-      *bti = image_data[i].idx;
-      *width = image_data[i].wSlot;
-      *height = image_data[i].hSlot;
-      *depth = image_data[i].depthSlot;
-      *data_type = image_data[i].dataTypeSlot;
-      *channel_order = image_data[i].channelOrderSlot;
-      *dim_order = image_data[i].dimOrderSlot;
-
-      ptr = static_cast<char *>(ptr) + 7 * sizeof(Elf32_Word);
+      ImageInfoHelper *imgi = reinterpret_cast<ImageInfoHelper *>(ptr);
+      imgi->bti = image_data[i].idx;
+      imgi->width = image_data[i].wSlot;
+      imgi->height = image_data[i].hSlot;
+      imgi->depth = image_data[i].depthSlot;
+      imgi->data_type = image_data[i].dataTypeSlot;
+      imgi->channel_order = image_data[i].channelOrderSlot;
+      ptr = ptr + sizeof(ImageInfoHelper);
     }
 
     GBE_FREE(image_data);
   }
 
-  Elf32_Word *p_virt_phy_num = static_cast<Elf32_Word *>(ptr);
+  Elf32_Word *p_virt_phy_num = reinterpret_cast<Elf32_Word *>(ptr);
   ptr = static_cast<char *>(ptr) + sizeof(Elf32_Word);
   unsigned int virt_phy_num = 0;
 
-/*Store the special vitrual register map */
-#define STORE_CURB_MAP(curbe_enum, data_type, uniform)                         \
-  if (kernel.getCurbeOffset(curbe_enum, 0) >= 0) {                             \
-    *static_cast<Elf32_Word *>(ptr) = curbe_enum;                              \
-    ptr = static_cast<char *>(ptr) + sizeof(Elf32_Word);                       \
-    *static_cast<Elf32_Word *>(ptr) = kernel.getCurbeOffset(curbe_enum, 0);    \
-    ptr = static_cast<char *>(ptr) + sizeof(Elf32_Word);                       \
-    *static_cast<Elf32_Word *>(ptr) =                                          \
-      uniform ? sizeof(data_type) : sizeof(data_type) * kernel.getSIMDWidth(); \
-    ptr = static_cast<char *>(ptr) + sizeof(Elf32_Word);                       \
-    virt_phy_num++;                                                            \
-  }
-  STORE_CURB_MAP(GBE_CURBE_LOCAL_ID_X, Elf32_Word, 0);
-  STORE_CURB_MAP(GBE_CURBE_LOCAL_ID_Y, Elf32_Word, 0);
-  STORE_CURB_MAP(GBE_CURBE_LOCAL_ID_Z, Elf32_Word, 0);
-  STORE_CURB_MAP(GBE_CURBE_LOCAL_SIZE_X, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_LOCAL_SIZE_Y, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_LOCAL_SIZE_Z, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_ENQUEUED_LOCAL_SIZE_X, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_GLOBAL_SIZE_X, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_GLOBAL_SIZE_Y, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_GLOBAL_SIZE_Z, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_GLOBAL_OFFSET_X, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_GLOBAL_OFFSET_Y, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_GLOBAL_OFFSET_Z, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_GROUP_NUM_X, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_GROUP_NUM_Y, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_GROUP_NUM_Z, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_WORK_DIM, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_BLOCK_IP, Elf32_Half, 0);
-  STORE_CURB_MAP(GBE_CURBE_DW_BLOCK_IP, Elf32_Word, 0);
-  STORE_CURB_MAP(GBE_CURBE_THREAD_NUM, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_THREAD_ID, Elf32_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_CONSTANT_ADDRSPACE, Elf64_Word, 1);
-  STORE_CURB_MAP(GBE_CURBE_STACK_SIZE, Elf64_Word, 1);
-#undef STORE_CURB_MAP
-  if (kernel.getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER) >= 0) {
-    *static_cast<Elf32_Word *>(ptr) = GBE_CURBE_EXTRA_ARGUMENT;
-    ptr = static_cast<char *>(ptr) + sizeof(Elf32_Word);
-    *static_cast<Elf32_Word *>(ptr) =
-      kernel.getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
-    ptr = static_cast<char *>(ptr) + sizeof(Elf32_Word);
-    *static_cast<Elf32_Word *>(ptr) = sizeof(Elf64_Word);
-    ptr = static_cast<char *>(ptr) + sizeof(Elf32_Word);
-    virt_phy_num++;
-  }
+  emitOneCurbeReg<GBE_CURBE_LOCAL_ID_X, Elf32_Word, 0>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_LOCAL_ID_Y, Elf32_Word, 0>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_LOCAL_ID_Z, Elf32_Word, 0>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_LOCAL_SIZE_X, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_LOCAL_SIZE_Y, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_LOCAL_SIZE_Z, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_ENQUEUED_LOCAL_SIZE_X, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_GLOBAL_SIZE_X, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_GLOBAL_SIZE_Y, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_GLOBAL_SIZE_Z, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_GLOBAL_OFFSET_X, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_GLOBAL_OFFSET_Y, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_GLOBAL_OFFSET_Z, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_GROUP_NUM_X, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_GROUP_NUM_Y, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_GROUP_NUM_Z, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_WORK_DIM, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_BLOCK_IP, Elf32_Half, 0>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_DW_BLOCK_IP, Elf32_Word, 0>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_THREAD_NUM, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_THREAD_ID, Elf32_Word, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_CONSTANT_ADDRSPACE, uint64_t, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_STACK_SIZE, uint64_t, 1>(virt_phy_num, ptr, kernel);
+  emitOneCurbeReg<GBE_CURBE_EXTRA_ARGUMENT, uint64_t, 0>(virt_phy_num, ptr, kernel);
   *p_virt_phy_num = virt_phy_num;
 
   Elf_Word total_sz = static_cast<char *>(ptr) - static_cast<char *>(kernel_info);
@@ -407,36 +631,10 @@ void GenProgramElfContext::emitOneKernel(GenKernel &kernel)
 #endif
 
   GBE_FREE(kernel_info);
+
+  emitOneKernelCLInfo(kernel);
 }
 
-/* The format for one Gen Kernel function is following note section format
-   ------------------------
-   | Function Name size:4 |
-   ------------------------
-   | Desc size:4  |
-   ----------------
-   | Type:4       |
-   --------------------
-   | The name(strlen) |
-   -----------------------------------------------------------------------------------------------
-   | SIMD:4 | Local Mem Size:4 | Scratch Size:4 | Stack Size :4 | Barrier/SLM Used:4 | Arg Num:4 |
-   -----------------------------------------------------------------------------------------------
-     Then the format for each argument is
-   --------------------------------------------------------------------------------------------------------------------------
-   | Index:4 | Size:4 | Type:4 | Offset:4 | Addr Space:4 | Align(if is ptr) | BTI(if buffer):4 / Index(sampler and image):4 |
-   --------------------------------------------------------------------------------------------------------------------------
-     Then all sampler info
-   -----------------------------------
-   | Number:4 | SamperInfo:4 | ......|
-   -----------------------------------
-     Then all image info
-   ----------------------------------------------------------------------------------------------------------
-   | Number:4 | BTI:4 | Width:4 | Height:4 | Depth:4 | Data Type:4 | Channel Order:4 | Dim Order:4 | .......|
-   ----------------------------------------------------------------------------------------------------------
-     Last is the map table of special virtual register and phy register
-   --------------------------------------------------------
-   | Number:4 | Virt Reg:4 | Phy Reg:4 | Size:4 |.........|
-   --------------------------------------------------------       */
 void *
 GenProgram::toBinaryFormat(size_t &ret_size)
 {
@@ -456,14 +654,86 @@ GenProgram::toBinaryFormat(size_t &ret_size)
     GBE_FREE(const_data);
   }
 
+  /* Add the note about GPU info */
+  std::string gpu_name;
+  Elf32_Word gpu_version[3] = {0, 0, 0}; // major, minor and pci-id; stays zeroed for an unknown device
+  if (IS_IVYBRIDGE(deviceID)) {
+    gpu_name = "IVYBridge";
+    gpu_version[0] = 7;
+    gpu_version[1] = 0;
+  } else if (IS_BAYTRAIL_T(deviceID)) {
+    gpu_name = "BayTrail";
+    gpu_version[0] = 7;
+    gpu_version[1] = 0;
+  } else if (IS_HASWELL(deviceID)) {
+    gpu_name = "HasWell";
+    gpu_version[0] = 7;
+    gpu_version[1] = 5;
+  } else if (IS_BROADWELL(deviceID)) {
+    gpu_name = "BroadWell";
+    gpu_version[0] = 8;
+    gpu_version[1] = 0;
+  } else if (IS_CHERRYVIEW(deviceID)) {
+    gpu_name = "CherryView";
+    gpu_version[0] = 8;
+    gpu_version[1] = 0;
+  } else if (IS_SKYLAKE(deviceID)) {
+    gpu_name = "SkyLake";
+    gpu_version[0] = 9;
+    gpu_version[1] = 0;
+  } else if (IS_BROXTON(deviceID)) {
+    gpu_name = "BroxTon";
+    gpu_version[0] = 9;
+    gpu_version[1] = 0;
+  }
+  gpu_version[2] = deviceID; // last valid slot; index 3 would write past the 3-element array
+  elf_ctx->note_writer->add_note(GenProgramElfContext::GEN_NOTE_TYPE_GPU_VERSION,
+                                 gpu_name, gpu_version, sizeof(gpu_version));
+
+  /* Add note info about compiler */
+  std::string compiler_name("GBE Compiler");
+  Elf32_Word compiler_version[2]; // major and minor
+  compiler_version[0] = LLVM_VERSION_MAJOR;
+  compiler_version[1] = LLVM_VERSION_MINOR;
+  elf_ctx->note_writer->add_note(GenProgramElfContext::GEN_NOTE_TYPE_COMPILER_INFO,
+                                 compiler_name, compiler_version, sizeof(compiler_version));
+
+  bool write_cl_version = false;
+  uint32_t oclVersion = 0;
   for (map<std::string, Kernel *>::const_iterator it = kernels.begin();
        it != kernels.end(); ++it) {
     GenKernel *k = static_cast<GenKernel *>(it->second);
+
+    if (write_cl_version == false) {
+      std::string ocl_version_str;
+      Elf32_Word cl_version[2] = {0, 0}; // major and minor; zeroed so a build without asserts never emits garbage
+
+      oclVersion = k->getOclVersion();
+      if (oclVersion == 120) {
+        ocl_version_str = "OpenCL 1.2";
+        cl_version[0] = 1;
+        cl_version[1] = 2;
+      } else if (oclVersion == 200) {
+        ocl_version_str = "OpenCL 2.0";
+        cl_version[0] = 2;
+        cl_version[1] = 0;
+      } else
+        assert(0);
+      write_cl_version = true; // emit the CL version note once; later kernels are only checked for consistency
+      elf_ctx->cl_note_writer->add_note(GenProgramElfContext::GEN_NOTE_TYPE_CL_VERSION,
+                                        ocl_version_str, cl_version, sizeof(cl_version));
+    } else {
+      assert(oclVersion == k->getOclVersion());
+    }
+
     elf_ctx->emitOneKernel(*k);
   }
 
   wmemstreambuf membuf(4096);
   std::ostream oss(&membuf);
+  if (OCL_DUMP_ELF_FILE) {
+    elf_ctx->writer.save("gbe_program_elf_dump.o");
+  }
   elf_ctx->writer.save(oss);
   GBE_DELETE(elf_ctx);
 
-- 
2.7.4



More information about the Beignet mailing list