[Beignet] [PATCH V2 1/2] make Beignet as intermedia layer of CMRT

Guo, Yejun yejun.guo at intel.com
Thu Dec 3 00:52:41 PST 2015


Ping for review, thanks.

-----Original Message-----
From: Guo, Yejun 
Sent: Thursday, November 19, 2015 2:03 AM
To: beignet at lists.freedesktop.org
Cc: Guo, Yejun
Subject: [PATCH V2 1/2] make Beignet as intermedia layer of CMRT

CMRT is C for Media Runtime on Intel GPU, see https://github.com/01org/cmrt.
There is a request to make Beignet as intermedia layer of CMRT, in other words, application programer write OpenCL APIs to execute the CM kernel on GPU, the following shows the key code, and please refer to the next patch of unit test for detail.

prog = clCreateProgramWithBinary("cm kernel"); clBuildProgram(prog); kernel = clCreateKernel(prog, "kernel name"); image = clCreateImage(); clSetKernelArg(kernel, image); clEnqueueNDRangeKernel(kernel);

Inside Beignet, once cm kernel is invoked, the following relative APIs will be directly passed to CMRT library (libcmrt.so) which is loaded via dlopen only when necessary. Since we use this simple method to keep the code clean, OpenCL spec is not strictly followed, and cl_event is not supported for this case.

v2: add comments about the cm queue in fuction cmrt_enqueue
Signed-off-by: Guo Yejun <yejun.guo at intel.com>
---
 CMakeLists.txt         |   6 +
 src/CMakeLists.txt     |   7 ++
 src/cl_api.c           |  39 ++++++-
 src/cl_cmrt.cpp        | 311 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/cl_cmrt.h          |  45 +++++++
 src/cl_command_queue.c |   7 ++
 src/cl_command_queue.h |   2 +
 src/cl_device_id.h     |   3 +
 src/cl_gt_device.h     |   1 +
 src/cl_kernel.c        |  11 ++
 src/cl_kernel.h        |   2 +
 src/cl_mem.c           |   7 ++
 src/cl_mem.h           |   3 +
 src/cl_program.c       |  57 +++++++--
 src/cl_program.h       |  10 +-
 15 files changed, 497 insertions(+), 14 deletions(-)  create mode 100644 src/cl_cmrt.cpp  create mode 100644 src/cl_cmrt.h

diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c6c373..3411b6a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -150,6 +150,12 @@ ELSE(DRM_INTEL_FOUND)
   MESSAGE(FATAL_ERROR "Looking for DRM Intel (>= 2.4.52) - not found")
 ENDIF(DRM_INTEL_FOUND)
 
+# CMRT
+pkg_check_modules(CMRT libcmrt)
+IF(CMRT_FOUND)
+INCLUDE_DIRECTORIES(${CMRT_INCLUDE_DIRS})
+ENDIF(CMRT_FOUND)
+
 # Threads
 Find_Package(Threads)
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c917e76..5f1872d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -101,6 +101,13 @@ if (X11_FOUND)
       x11/va_dri2.c)
 endif (X11_FOUND)
 
+if (CMRT_FOUND)
+  set(CMAKE_CXX_FLAGS "-DHAS_CMRT ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_FLAGS "-DCMRT_PATH=${CMRT_LIBRARY_DIRS}/libcmrt.so 
+${CMAKE_CXX_FLAGS}")
+  set(CMAKE_C_FLAGS "-DHAS_CMRT ${CMAKE_C_FLAGS}")
+  set(OPENCL_SRC ${OPENCL_SRC} cl_cmrt.cpp) endif (CMRT_FOUND)
+
 if (EGL_FOUND AND MESA_SOURCE_FOUND)
 set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
 SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}") diff --git a/src/cl_api.c b/src/cl_api.c index ddd39cf..7cedf4b 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -31,6 +31,7 @@
 #include "cl_accelerator_intel.h"
 #include "cl_alloc.h"
 #include "cl_utils.h"
+#include "cl_cmrt.h"
 
 #include "CL/cl.h"
 #include "CL/cl_ext.h"
@@ -276,6 +277,10 @@ clRetainDevice(cl_device_id device)  cl_int  clReleaseDevice(cl_device_id device)  {
+#ifdef HAS_CMRT
+  cmrt_destroy_device(device);
+#endif
+
   // XXX stub for C++ Bindings
   return CL_SUCCESS;
 }
@@ -941,11 +946,11 @@ clBuildProgram(cl_program            program,
     INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
   }
 
-  /* TODO support create program from binary */
   assert(program->source_type == FROM_LLVM ||
          program->source_type == FROM_SOURCE ||
          program->source_type == FROM_LLVM_SPIR ||
-         program->source_type == FROM_BINARY);
+         program->source_type == FROM_BINARY ||
+         program->source_type == FROM_CMRT);
   if((err = cl_program_build(program, options)) != CL_SUCCESS) {
     goto error;
   }
@@ -1244,7 +1249,13 @@ clSetKernelArg(cl_kernel     kernel,
 {
   cl_int err = CL_SUCCESS;
   CHECK_KERNEL(kernel);
-  err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
+
+#ifdef HAS_CMRT
+  if (kernel->cmrt_kernel != NULL)
+    err = cmrt_set_kernel_arg(kernel, arg_index, arg_size, arg_value);
+  else
+#endif
+    err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
 error:
   return err;
 }
@@ -1532,6 +1543,12 @@ clFinish(cl_command_queue command_queue)
   cl_int err = CL_SUCCESS;
 
   CHECK_QUEUE (command_queue);
+
+#ifdef HAS_CMRT
+  if (command_queue->cmrt_event != NULL)
+    return cmrt_wait_for_task_finished(command_queue);
+#endif
+
   err = cl_command_queue_finish(command_queue);
 
 error:
@@ -2655,6 +2672,11 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
     goto error;
   }
 
+#ifdef HAS_CMRT
+  if (command_queue->cmrt_event != NULL)
+    cmrt_wait_for_task_finished(command_queue);
+#endif
+
   TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
 
   data = &no_wait_data;
@@ -2743,6 +2765,11 @@ clEnqueueMapImage(cl_command_queue   command_queue,
     goto error;
   }
 
+#ifdef HAS_CMRT
+  if (command_queue->cmrt_event != NULL)
+    cmrt_wait_for_task_finished(command_queue);
+#endif
+
   TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
 
   data = &no_wait_data;
@@ -2948,6 +2975,12 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     goto error;
   }
 
+#ifdef HAS_CMRT
+  if (kernel->cmrt_kernel != NULL) {
+    err = cmrt_enqueue(command_queue, kernel, global_work_size, local_work_size);
+    goto error;
+  }
+#endif
 
   /* XXX No event right now */
   //FATAL_IF(num_events_in_wait_list > 0, "Events are not supported"); diff --git a/src/cl_cmrt.cpp b/src/cl_cmrt.cpp new file mode 100644 index 0000000..25e4d82
--- /dev/null
+++ b/src/cl_cmrt.cpp
@@ -0,0 +1,311 @@
+#include "cl_cmrt.h"
+#include "cl_device_id.h"
+#include "intel/intel_defines.h"
+#include "cl_command_queue.h"
+
+#include "cm_rt.h"      //header file of libcmrt.so
+typedef INT (*CreateCmDeviceFunc)(CmDevice * &pDevice, UINT & version,
+			    CmDriverContext * drivercontext, UINT DevCreateOption); typedef 
+INT (*DestroyCmDeviceFunc)(CmDevice * &pDevice);
+
+#include <dlfcn.h>
+
+static void* dlhCMRT = NULL;
+static CreateCmDeviceFunc pfnCreateCmDevice = NULL; static 
+DestroyCmDeviceFunc pfnDestroyCmDevice = NULL;
+
+#define XSTR(x) #x
+#define STR(x) XSTR(x)
+
+class CmrtCleanup
+{
+public:
+  CmrtCleanup(){}
+  ~CmrtCleanup()
+  {
+    if (dlhCMRT != NULL)
+      dlclose(dlhCMRT);
+  }
+};
+
+enum CMRT_MEM_TYPE
+{
+    CMRT_BUFFER,
+    CMRT_SURFACE2D,
+};
+
+static CmrtCleanup cmrtCleanup;
+
+static bool LoadCmrtLibrary()
+{
+  if (dlhCMRT == NULL) {
+    dlhCMRT = dlopen(STR(CMRT_PATH), RTLD_LAZY | RTLD_LOCAL);
+
+    if (dlhCMRT == NULL)
+      return false;
+
+    pfnCreateCmDevice = (CreateCmDeviceFunc)dlsym(dlhCMRT, "CreateCmDevice");
+    if (pfnCreateCmDevice == NULL)
+      return false;
+
+    pfnDestroyCmDevice = (DestroyCmDeviceFunc)dlsym(dlhCMRT, "DestroyCmDevice");
+    if (pfnDestroyCmDevice == NULL)
+      return false;
+  }
+  return true;
+}
+
+cl_int cmrt_build_program(cl_program p, const char *options) {
+  CmDevice*& cmrt_device = (CmDevice*&)(p->ctx->device->cmrt_device);
+  int result;
+  if (cmrt_device == NULL)
+  {
+    if (!LoadCmrtLibrary())
+      return CL_DEVICE_NOT_AVAILABLE;   //yes, the error is not accurate, but i do not find a bettere one
+
+    CmDriverContext ctx;
+    ctx.shared_bufmgr = 1;
+    ctx.bufmgr = (drm_intel_bufmgr*)cl_context_get_bufmgr(p->ctx);
+    ctx.userptr_enabled = 0;
+    ctx.deviceid = p->ctx->device->device_id;
+    ctx.device_rev = -1;
+    UINT version = 0;
+    result = (*pfnCreateCmDevice)(cmrt_device, version, &ctx, CM_DEVICE_CREATE_OPTION_DEFAULT);
+    if (result != CM_SUCCESS)
+      return CL_DEVICE_NOT_AVAILABLE;
+  }
+
+  CmProgram* cmrt_program = NULL;
+  result = cmrt_device->LoadProgram(p->binary, p->binary_sz, 
+ cmrt_program, options);  if (result != CM_SUCCESS)
+    return CL_COMPILE_PROGRAM_FAILURE;
+
+  p->cmrt_program = cmrt_program;
+  cmrt_program->GetKernelCount(p->ker_n);
+  return CL_SUCCESS;
+}
+
+cl_int cmrt_destroy_program(cl_program p) {
+  CmDevice* cmrt_device = (CmDevice*)(p->ctx->device->cmrt_device);
+  CmProgram*& cmrt_program = (CmProgram*&)(p->cmrt_program);
+  if (cmrt_device->DestroyProgram(cmrt_program) != CM_SUCCESS)
+    return CL_INVALID_PROGRAM;
+  return CL_SUCCESS;
+}
+
+cl_int cmrt_destroy_device(cl_device_id device) {
+  CmDevice*& cmrt_device = (CmDevice*&)(device->cmrt_device);
+  if ((*pfnDestroyCmDevice)(cmrt_device) != CM_SUCCESS)
+    return CL_INVALID_DEVICE;
+  return CL_SUCCESS;
+}
+
+void* cmrt_create_kernel(cl_program p, const char *name) {
+  CmDevice* cmrt_device = (CmDevice*)(p->ctx->device->cmrt_device);
+  CmKernel* cmrt_kernel = NULL;
+  int result = cmrt_device->CreateKernel((CmProgram*)(p->cmrt_program), 
+name, cmrt_kernel);
+  if (result != CM_SUCCESS)
+    return NULL;
+
+  return cmrt_kernel;
+}
+
+cl_int cmrt_destroy_kernel(cl_kernel k) {
+  CmDevice* cmrt_device = 
+(CmDevice*)(k->program->ctx->device->cmrt_device);
+  CmKernel*& cmrt_kernel = (CmKernel*&)(k->cmrt_kernel);
+  if (cmrt_device->DestroyKernel(cmrt_kernel) != CM_SUCCESS)
+    return CL_INVALID_KERNEL;
+  return CL_SUCCESS;
+}
+
+cl_int cmrt_enqueue(cl_command_queue cq, cl_kernel k, const size_t* 
+global_work_size, const size_t* local_work_size) {
+  CmDevice* cmrt_device = 
+(CmDevice*)(k->program->ctx->device->cmrt_device);
+  CmKernel* cmrt_kernel = (CmKernel*)(k->cmrt_kernel);
+
+  int result = 0;
+
+  cmrt_kernel->SetThreadCount(global_work_size[0]*global_work_size[1]);
+
+  //no need to destory queue explicitly,  //and there is only one queue 
+ instance within each device,  //CreateQueue always returns the same 
+ instance
+  CmQueue* pCmQueue = NULL;
+  cmrt_device->CreateQueue(pCmQueue);
+
+  CmTask *pKernelArray = NULL;
+  cmrt_device->CreateTask(pKernelArray);
+
+  pKernelArray->AddKernel(cmrt_kernel);
+
+  CmEvent* e = NULL;
+
+  if (local_work_size == NULL) {
+    CmThreadSpace* pTS = NULL;
+    cmrt_device->CreateThreadSpace(global_work_size[0], global_work_size[1], pTS);
+    result = pCmQueue->Enqueue(pKernelArray, e, pTS);
+  } else {
+    CmThreadGroupSpace* pTGS = NULL;
+	cmrt_device->CreateThreadGroupSpace(global_work_size[0], global_work_size[1], local_work_size[0], local_work_size[1], pTGS);
+    result = pCmQueue->EnqueueWithGroup(pKernelArray, e, pTGS);
+    cmrt_device->DestroyThreadGroupSpace(pTGS);
+  }
+
+  if (result != CM_SUCCESS)
+    return CL_INVALID_OPERATION;
+
+  cmrt_device->DestroyTask(pKernelArray);
+
+  CmEvent*& olde = (CmEvent*&)cq->cmrt_event;  if (olde != NULL)
+    pCmQueue->DestroyEvent(e);
+
+  cq->cmrt_event = e;
+
+  return CL_SUCCESS;
+}
+
+static VA_CM_FORMAT GetCmrtFormat(_cl_mem_image* image) {
+    switch (image->intel_fmt)
+    {
+    case I965_SURFACEFORMAT_B8G8R8A8_UNORM:
+      return VA_CM_FMT_A8R8G8B8;
+    case I965_SURFACEFORMAT_B8G8R8X8_UNORM:
+      return VA_CM_FMT_X8R8G8B8;
+    case I965_SURFACEFORMAT_A8_UNORM:
+      return VA_CM_FMT_A8;
+    case I965_SURFACEFORMAT_R10G10B10A2_UNORM:
+      return VA_CM_FMT_A2B10G10R10;
+    case I965_SURFACEFORMAT_R16G16B16A16_UNORM:
+      return VA_CM_FMT_A16B16G16R16;
+    case I965_SURFACEFORMAT_L8_UNORM:
+      return VA_CM_FMT_L8;
+    case I965_SURFACEFORMAT_R16_UINT:
+      return VA_CM_FMT_R16U;
+    case I965_SURFACEFORMAT_R8_UNORM:
+      return VA_CM_FMT_R8U;
+    case I965_SURFACEFORMAT_L16_UNORM:
+      return VA_CM_FMT_L16;
+    case I965_SURFACEFORMAT_R32_FLOAT:
+      return VA_CM_FMT_R32F;
+    default:
+      return VA_CM_FMT_UNKNOWN;
+    }
+}
+
+static bool CreateCmrtMemory(cl_mem mem) {
+  if (mem->cmrt_mem != NULL)
+    return true;
+
+  CmDevice* cmrt_device = (CmDevice*)(mem->ctx->device->cmrt_device);
+  int result;
+  CmOsResource osResource;
+  osResource.bo_size = mem->size;
+  osResource.bo_flags = DRM_BO_HANDLE;
+  osResource.bo = (drm_intel_bo*)mem->bo;  if (IS_IMAGE(mem)) {
+    _cl_mem_image* image = cl_mem_image(mem);
+    if (CL_MEM_OBJECT_IMAGE2D != image->image_type)
+      return CL_INVALID_ARG_VALUE;
+    osResource.format = GetCmrtFormat(image);
+    if (osResource.format == VA_CM_FMT_UNKNOWN)
+      return false;
+    osResource.aligned_width = image->row_pitch;
+    osResource.aligned_height = mem->size / image->row_pitch;
+    osResource.pitch = image->row_pitch;
+    osResource.tile_type = image->tiling;
+    osResource.orig_width = image->w;
+    osResource.orig_height = image->h;
+    CmSurface2D*& cmrt_surface2d = (CmSurface2D*&)(mem->cmrt_mem);
+    result = cmrt_device->CreateSurface2D(&osResource, cmrt_surface2d);
+    mem->cmrt_mem_type = CMRT_SURFACE2D;  } else {
+    osResource.format = VA_CM_FMT_BUFFER;
+    osResource.buf_bytes = mem->size;
+    CmBuffer*& cmrt_buffer = (CmBuffer*&)(mem->cmrt_mem);
+    result = cmrt_device->CreateBuffer(&osResource, cmrt_buffer);
+    mem->cmrt_mem_type = CMRT_BUFFER;
+  }
+
+  if (result != CM_SUCCESS)
+    return false;
+
+  return true;
+}
+
+cl_int cmrt_set_kernel_arg(cl_kernel k, cl_uint index, size_t sz, const 
+void *value) {
+  if(value == NULL)
+    return CL_INVALID_ARG_VALUE;
+
+  CmKernel* cmrt_kernel = (CmKernel*)(k->cmrt_kernel);
+
+  WORD argKind = -1;
+  if (cmrt_kernel->GetArgKind(index, argKind) != CM_SUCCESS)
+    return CL_INVALID_ARG_INDEX;
+
+  int result;
+  if (argKind == ARG_KIND_GENERAL)
+    result = cmrt_kernel->SetKernelArg(index, sz, value);  else {
+    cl_mem mem = *(cl_mem*)value;
+    if (mem->magic == CL_MAGIC_MEM_HEADER) {
+      if (!CreateCmrtMemory(mem))
+        return CL_INVALID_ARG_VALUE;
+
+      SurfaceIndex * memIndex = NULL;
+      if (mem->cmrt_mem_type == CMRT_BUFFER) {
+        CmBuffer* cmrt_buffer = (CmBuffer*)(mem->cmrt_mem);
+        cmrt_buffer->GetIndex(memIndex);
+      } else {
+        CmSurface2D* cmrt_surface2d = (CmSurface2D*)(mem->cmrt_mem);
+        cmrt_surface2d->GetIndex(memIndex);
+      }
+      result = cmrt_kernel->SetKernelArg(index, sizeof(SurfaceIndex), memIndex);
+    } else
+      return CL_INVALID_ARG_VALUE;
+  }
+
+  if (result != CM_SUCCESS)
+    return CL_INVALID_KERNEL_ARGS;
+
+  return CL_SUCCESS;
+}
+
+cl_int cmrt_destroy_memory(cl_mem mem)
+{
+  CmDevice* cmrt_device = (CmDevice*)(mem->ctx->device->cmrt_device);
+  if (mem->cmrt_mem_type == CMRT_BUFFER) {
+    CmBuffer*& cmrt_buffer = (CmBuffer*&)(mem->cmrt_mem);
+    cmrt_device->DestroySurface(cmrt_buffer);
+  } else {
+    CmSurface2D*& cmrt_surface2d = (CmSurface2D*&)(mem->cmrt_mem);
+    cmrt_device->DestroySurface(cmrt_surface2d);
+  }
+  return CL_SUCCESS;
+}
+
+cl_int cmrt_destroy_event(cl_command_queue cq) {
+  CmEvent*& cmrt_event = (CmEvent*&)(cq->cmrt_event);
+  CmDevice* cmrt_device = (CmDevice*)(cq->ctx->device->cmrt_device);
+  CmQueue* pCmQueue = NULL;
+  cmrt_event->WaitForTaskFinished();
+  cmrt_device->CreateQueue(pCmQueue);
+  pCmQueue->DestroyEvent(cmrt_event);
+  return CL_SUCCESS;
+}
+
+cl_int cmrt_wait_for_task_finished(cl_command_queue cq) {
+  CmEvent* cmrt_event = (CmEvent*)(cq->cmrt_event);
+  cmrt_event->WaitForTaskFinished();
+  return CL_SUCCESS;
+}
diff --git a/src/cl_cmrt.h b/src/cl_cmrt.h new file mode 100644 index 0000000..316095c
--- /dev/null
+++ b/src/cl_cmrt.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright @2015 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Guo Yejun <yejun.guo at intel.com>  */
+
+#ifndef __CL_CMRT_H__
+#define __CL_CMRT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "cl_kernel.h"
+#include "cl_program.h"
+
+cl_int cmrt_build_program(cl_program p, const char *options); cl_int 
+cmrt_destroy_program(cl_program p); cl_int 
+cmrt_destroy_device(cl_device_id device);
+void* cmrt_create_kernel(cl_program p, const char *name); cl_int 
+cmrt_destroy_kernel(cl_kernel k); cl_int cmrt_enqueue(cl_command_queue 
+cq, cl_kernel k, const size_t* global_work_size, const size_t* 
+local_work_size); cl_int cmrt_set_kernel_arg(cl_kernel k, cl_uint 
+index, size_t sz, const void *value); cl_int cmrt_destroy_memory(cl_mem 
+mem); cl_int cmrt_destroy_event(cl_command_queue cq); cl_int 
+cmrt_wait_for_task_finished(cl_command_queue cq);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 033e7df..549f648 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -31,6 +31,7 @@
 #include "cl_khr_icd.h"
 #include "cl_event.h"
 #include "performance.h"
+#include "cl_cmrt.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -47,6 +48,7 @@ cl_command_queue_new(cl_context ctx)
   queue->magic = CL_MAGIC_QUEUE_HEADER;
   queue->ref_n = 1;
   queue->ctx = ctx;
+  queue->cmrt_event = NULL;
   if ((queue->thread_data = cl_thread_data_create()) == NULL) {
     goto error;
   }
@@ -76,6 +78,11 @@ cl_command_queue_delete(cl_command_queue queue)
   assert(queue);
   if (atomic_dec(&queue->ref_n) != 1) return;
 
+#ifdef HAS_CMRT
+  if (queue->cmrt_event != NULL)
+    cmrt_destroy_event(queue);
+#endif
+
   // If there is a list of valid events, we need to give them
   // a chance to call the call-back function.
   cl_event_update_last_events(queue,1);
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h index 2cd6739..d1b8c44 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -44,6 +44,8 @@ struct _cl_command_queue {
   cl_command_queue prev, next;         /* We chain the command queues together */
   void *thread_data;                   /* Used to store thread context data */
   cl_mem perf;                         /* Where to put the perf counters */
+
+  void* cmrt_event;                    /* the latest CmEvent* of the command queue */
 };
 
 /* The macro to get the thread specified gpgpu struct. */ diff --git a/src/cl_device_id.h b/src/cl_device_id.h index e971735..619fa0a 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -123,6 +123,9 @@ struct _cl_device_id {
   uint32_t atomic_test_result;
   uint32_t image_pitch_alignment;
   uint32_t image_base_address_alignment;
+
+  //inited as NULL, created only when cmrt kernel is used
+  void* cmrt_device;  //realtype: CmDevice*
 };
 
 /* Get a device from the given platform */ diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h index d8089c2..b4c610e 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -131,3 +131,4 @@ DECL_INFO_STRING(spir_versions, "1.2")  .device_reference_count = 1,  .image_pitch_alignment = 1,  .image_base_address_alignment = 4096,
+.cmrt_device = NULL
diff --git a/src/cl_kernel.c b/src/cl_kernel.c index b2d1955..b380abe 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -28,6 +28,7 @@
 #include "CL/cl.h"
 #include "cl_sampler.h"
 #include "cl_accelerator_intel.h"
+#include "cl_cmrt.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -41,6 +42,15 @@ cl_kernel_delete(cl_kernel k)
   uint32_t i;
   if (k == NULL) return;
 
+#ifdef HAS_CMRT
+  if (k->cmrt_kernel != NULL) {
+    cmrt_destroy_kernel(k);
+    k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+    cl_free(k);
+    return;
+  }
+#endif
+
   /* We are not done with the kernel */
   if (atomic_dec(&k->ref_n) > 1) return;
   /* Release one reference on all bos we own */ @@ -71,6 +81,7 @@ cl_kernel_new(cl_program p)
   k->ref_n = 1;
   k->magic = CL_MAGIC_KERNEL_HEADER;
   k->program = p;
+  k->cmrt_kernel = NULL;
 
 exit:
   return k;
diff --git a/src/cl_kernel.h b/src/cl_kernel.h index 7f59162..05a882e 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -69,6 +69,8 @@ struct _cl_kernel {
   uint32_t arg_n:30;          /* Number of arguments */
   uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
   uint32_t vme:1;             /* True only if it is a built-in kernel for VME */
+
+  void* cmrt_kernel;          /* CmKernel* */
 };
 
 /* Allocate an empty kernel */
diff --git a/src/cl_mem.c b/src/cl_mem.c index 9a6bb83..1f03c69 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -27,6 +27,7 @@
 #include "cl_khr_icd.h"
 #include "cl_kernel.h"
 #include "cl_command_queue.h"
+#include "cl_cmrt.h"
 
 #include "CL/cl.h"
 #include "CL/cl_intel.h"
@@ -268,6 +269,7 @@ cl_mem_allocate(enum cl_mem_type type,
   mem->flags = flags;
   mem->is_userptr = 0;
   mem->offset = 0;
+  mem->cmrt_mem = NULL;
   if (mem->type == CL_MEM_IMAGE_TYPE) {
     cl_mem_image(mem)->is_image_from_buffer = 0;
   }
@@ -1166,6 +1168,11 @@ cl_mem_delete(cl_mem mem)
   }
 #endif
 
+#ifdef HAS_CMRT
+  if (mem->cmrt_mem != NULL)
+    cmrt_destroy_memory(mem);
+#endif
+
   /* iff we are a image, delete the 1d buffer if has. */
   if (IS_IMAGE(mem)) {
     if (cl_mem_image(mem)->buffer_1d) { diff --git a/src/cl_mem.h b/src/cl_mem.h index fb24115..c8f256d 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -95,6 +95,9 @@ typedef  struct _cl_mem {
   cl_mem_dstr_cb *dstr_cb;  /* The destroy callback. */
   uint8_t is_userptr;       /* CL_MEM_USE_HOST_PTR is enabled*/
   size_t offset;            /* offset of host_ptr to the page beginning, only for CL_MEM_USE_HOST_PTR*/
+
+  uint8_t cmrt_mem_type;    /* CmBuffer, CmSurface2D, ... */
+  void* cmrt_mem;
 } _cl_mem;
 
 struct _cl_mem_image {
diff --git a/src/cl_program.c b/src/cl_program.c index 98b6d51..1dca673 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -25,6 +25,7 @@
 #include "cl_utils.h"
 #include "cl_khr_icd.h"
 #include "cl_gbe_loader.h"
+#include "cl_cmrt.h"
 #include "CL/cl.h"
 #include "CL/cl_intel.h"
 
@@ -92,10 +93,17 @@ cl_program_delete(cl_program p)
       p->ctx->programs = p->next;
   pthread_mutex_unlock(&p->ctx->program_lock);
 
-  cl_free(p->bin);               /* Free the blob */
-  for (i = 0; i < p->ker_n; ++i) /* Free the kernels */
-    cl_kernel_delete(p->ker[i]);
-  cl_free(p->ker);
+#ifdef HAS_CMRT
+  if (p->cmrt_program != NULL)
+    cmrt_destroy_program(p);
+  else
+#endif
+  {
+    cl_free(p->bin);               /* Free the blob */
+    for (i = 0; i < p->ker_n; ++i) /* Free the kernels */
+      cl_kernel_delete(p->ker[i]);
+    cl_free(p->ker);
+  }
 
   /* Program belongs to their parent context */
   cl_context_delete(p->ctx);
@@ -123,6 +131,7 @@ cl_program_new(cl_context ctx)
   p->ref_n = 1;
   p->magic = CL_MAGIC_PROGRAM_HEADER;
   p->ctx = ctx;
+  p->cmrt_program = NULL;
   p->build_log = calloc(1000, sizeof(char));
   if (p->build_log)
     p->build_log_max_sz = 1000;
@@ -172,12 +181,14 @@ static const unsigned char binary_type_header[BHI_MAX][BINARY_HEADER_LENGTH]=  \
                                               {{'B','C', 0xC0, 0xDE},
                                                {1, 'B', 'C', 0xC0, 0xDE},
                                                {2, 'B', 'C', 0xC0, 0xDE},
-                                               {0, 'G','E', 'N', 'C'}};
+                                               {0, 'G','E', 'N', 'C'},
+                                               {'C','I', 'S', 'A'},
+                                               };
 
 LOCAL cl_bool headerCompare(const unsigned char *BufPtr, BINARY_HEADER_INDEX index)  {
   bool matched = true;
-  int length = index == BHI_SPIR ? BINARY_HEADER_LENGTH -1 :BINARY_HEADER_LENGTH;
+  int length = (index == BHI_SPIR || index == BHI_CMRT) ? 
+ BINARY_HEADER_LENGTH -1 :BINARY_HEADER_LENGTH;
   int i = 0;
   for (i = 0; i < length; ++i)
   {
@@ -190,6 +201,7 @@ LOCAL cl_bool headerCompare(const unsigned char *BufPtr, BINARY_HEADER_INDEX ind  #define isLLVM_C_O(BufPtr)  headerCompare(BufPtr, BHI_COMPIRED_OBJECT)  #define isLLVM_LIB(BufPtr)  headerCompare(BufPtr, BHI_LIBRARY)  #define isGenBinary(BufPtr) headerCompare(BufPtr, BHI_GEN_BINARY)
+#define isCMRT(BufPtr)      headerCompare(BufPtr, BHI_CMRT)
 
 LOCAL cl_program
 cl_program_create_from_binary(cl_context             ctx,
@@ -236,8 +248,9 @@ cl_program_create_from_binary(cl_context             ctx,
   program->binary_sz = lengths[0];
   program->source_type = FROM_BINARY;
 
-  if(isSPIR((unsigned char*)program->binary)) {
-
+  if (isCMRT((unsigned char*)program->binary)) {
+    program->source_type = FROM_CMRT;
+  }else if(isSPIR((unsigned char*)program->binary)) {
     char* typed_binary;
     TRY_ALLOC(typed_binary, cl_calloc(lengths[0]+1, sizeof(char)));
     memcpy(typed_binary+1, binaries[0], lengths[0]); @@ -518,6 +531,20 @@ cl_program_build(cl_program p, const char *options)
     goto error;
   }
 
+#if HAS_CMRT
+  if (p->source_type == FROM_CMRT) {
+    //only here we begins to invoke cmrt
+    //break spec to return other errors such as CL_DEVICE_NOT_FOUND
+    err = cmrt_build_program(p, options);
+    if (err == CL_SUCCESS) {
+      p->build_status = CL_BUILD_SUCCESS;
+      p->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+      return CL_SUCCESS;
+    } else
+      goto error;
+  }
+#endif
+
   if (!check_cl_version_option(p, options)) {
     err = CL_BUILD_PROGRAM_FAILURE;
     goto error;
@@ -833,6 +860,20 @@ cl_program_create_kernel(cl_program p, const char *name, cl_int *errcode_ret)
   cl_int err = CL_SUCCESS;
   uint32_t i = 0;
 
+#ifdef HAS_CMRT
+  if (p->cmrt_program != NULL) {
+    void* cmrt_kernel = cmrt_create_kernel(p, name);
+    if (cmrt_kernel != NULL) {
+      to = cl_kernel_new(p);
+      to->cmrt_kernel = cmrt_kernel;
+      goto exit;
+    } else {
+      err = CL_INVALID_KERNEL_NAME;
+      goto error;
+    }
+  }
+#endif
+
   /* Find the program first */
   for (i = 0; i < p->ker_n; ++i) {
     assert(p->ker[i]);
diff --git a/src/cl_program.h b/src/cl_program.h index 63ad16d..899a31a 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -34,14 +34,16 @@ enum {
   FROM_SOURCE = 0,
   FROM_LLVM = 1,
   FROM_BINARY = 2,
-  FROM_LLVM_SPIR = 3
+  FROM_LLVM_SPIR = 3,
+  FROM_CMRT = 4,
 };
 
 typedef enum _BINARY_HEADER_INDEX {
   BHI_SPIR = 0,
   BHI_COMPIRED_OBJECT = 1,
   BHI_LIBRARY = 2,
-  BHI_GEN_BINARY = 3, /*remember update BHI_MAX if add option.*/
+  BHI_GEN_BINARY = 3,
+  BHI_CMRT = 4,
   BHI_MAX,
 }BINARY_HEADER_INDEX;
 
@@ -61,13 +63,15 @@ struct _cl_program {
   size_t binary_sz;       /* The binary size. */
   uint32_t binary_type;   /* binary type: COMPILED_OBJECT(LLVM IR), LIBRARY(LLVM IR with option "-create-library"), or EXECUTABLE(GEN binary). */
   uint32_t ker_n;         /* Number of declared kernels */
-  uint32_t source_type:2; /* Built from binary, source or LLVM */
+  uint32_t source_type:3; /* Built from binary, source, CMRT or LLVM*/
   uint32_t is_built:1;    /* Did we call clBuildProgram on it? */
   int32_t build_status;   /* build status. */
   char *build_opts;       /* The build options for this program */
   size_t build_log_max_sz; /*build log maximum size in byte.*/
   char *build_log;         /* The build log for this program. */
   size_t build_log_sz;    /* The actual build log size.*/
+
+  void* cmrt_program;      /* real type: CmProgram* */
 };
 
 /* Create a empty program */
--
1.9.1



More information about the Beignet mailing list