[Beignet] [PATCH 5/5] runtime: support for the debug system routine, surface and MMIO registers

Mircea Gherzan mircea.gherzan at intel.com
Fri Jul 8 12:39:39 UTC 2016


Shader debugging has to be enabled in the hardware via MMIO registers.
The address of the debug system routine has to be programmed via a
STATE_SIP command in the batch buffer, and the debug surface (where the
system routine dumps the registers) has to be bound at the binding
table index (BTI) used by the system routine.
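
INSTPM and CS_DEBUG_MODE1 are masked registers: bits 31:16 act as a
per-bit write-enable mask for bits 15:0, which is why the debug bit is
replicated into the upper half before the MI_LOAD_REGISTER_IMM is
emitted. A minimal sketch of that idiom, reusing the batch helpers
already present in intel_gpgpu.c (illustration only, not part of the
patch):

  static void
  emit_masked_debug_bit(intel_gpgpu_t *gpgpu, uint32_t reg, uint32_t bit)
  {
    /* Value in the low half, matching write-enable bit in the high half. */
    const uint32_t val = bit | (bit << 16);

    BEGIN_BATCH(gpgpu->batch, 3);
    OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | (3 - 2));
    OUT_BATCH(gpgpu->batch, reg);
    OUT_BATCH(gpgpu->batch, val);
    ADVANCE_BATCH(gpgpu->batch);
  }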

The system routine binary is provided by the debugger's interchange
library.
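
For reference, the expected flow against the interchange library, with
the prototypes assumed to come from intel/intel_debugger.h exactly as
this patch calls them (a sketch, not a definitive description of that
API; kernel_name, kernel_code and kernel_code_size are placeholders):

  const unsigned char *sys_routine_buf;
  unsigned sys_routine_size, debug_surface_size;

  /* Fetch the per-generation system routine binary plus the size of the
   * surface it dumps into; a return value of 0 means success. */
  if (dbg_get_sys_routine_binary(/* gen */ 9, /* bti */ 127,
                                 &sys_routine_buf, &sys_routine_size,
                                 &debug_surface_size) == 0) {
    /* Upload sys_routine_buf into a BO, point STATE_SIP at that BO and
     * bind a debug_surface_size buffer at BTI 127. */
  }

  /* Once the kernel ISA is known, hand it to the debugger so it can
   * resolve breakpoints against it. */
  dbg_notify_kernel_debug_data(kernel_name, kernel_code, kernel_code_size);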

Signed-off-by: Mircea Gherzan <mircea.gherzan at intel.com>
---
 src/cl_command_queue_gen7.c |   4 +
 src/cl_driver.h             |   4 +
 src/cl_driver_defs.c        |   2 +-
 src/intel/intel_defines.h   |  11 +++
 src/intel/intel_gpgpu.c     | 192 +++++++++++++++++++++++++++++++++++++++++---
 src/intel/intel_gpgpu.h     |   7 ++
 6 files changed, 209 insertions(+), 11 deletions(-)

diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 648ac62..7977f92 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -381,6 +381,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   printf_info = interp_dup_printfset(ker->opaque);
   cl_gpgpu_set_printf_info(gpgpu, printf_info);
 
+  /* We do not want to debug the self-test kernel. */
+  if (queue->ctx->kernel_debug && strcmp(kernel.name, "self_test"))
+    cl_gpgpu_enable_kernel_debug(gpgpu);
+
   /* Setup the kernel */
   if (queue->props & CL_QUEUE_PROFILING_ENABLE)
     err = cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 1);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 16730db..494c972 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -326,6 +326,10 @@ typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
                                   const size_t local_wk_sz[3]);
 extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
 
+/* Configure kernel debugging */
+typedef void (cl_gpgpu_enable_kernel_debug_cb)(cl_gpgpu);
+extern cl_gpgpu_enable_kernel_debug_cb *cl_gpgpu_enable_kernel_debug;
+
 /**************************************************************************
  * Buffer
  **************************************************************************/
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 31176a4..7197e40 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -106,4 +106,4 @@ LOCAL cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer = NULL;
 LOCAL cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info = NULL;
 LOCAL cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info = NULL;
 LOCAL cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer = NULL;
-
+LOCAL cl_gpgpu_enable_kernel_debug_cb *cl_gpgpu_enable_kernel_debug = NULL;
diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
index 6ada30c..d3ad07e 100644
--- a/src/intel/intel_defines.h
+++ b/src/intel/intel_defines.h
@@ -315,6 +315,17 @@
 
 #define GEN8_L3_CNTL_REG_ADDRESS_OFFSET          (0x7034)
 
+// Kernel debug configuration
+#define GEN8_TD_CTL                               0xe400
+#define GEN8_TD_CTL_FORCE_BKPT_ENABLE             (1 << 4)
+#define GEN8_TD_CTL_FORCE_EXCEPTION_ENABLE        (1 << 7)
+
+#define GEN8_INSTPM                               0x20c0
+#define GEN8_INSTPM_GLOBAL_DEBUG                  (1 << 4)
+
+#define GEN9_CS_DEBUG_MODE1                       0x20ec
+#define GEN9_CS_DEBUG_MODE1_GLOBAL_DEBUG          (1 << 6)
+
 // To issue pipe controls (reset L3 / SLM or stall)
 #define GEN7_PIPE_CONTROL_MEDIA 0x2
 #define GEN7_PIPE_CONTROL_3D 0x3
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index db967e8..966fa2e 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -31,6 +31,7 @@
 #include <stddef.h>
 #include <errno.h>
 
+#include "intel/intel_debugger.h"
 #include "intel/intel_gpgpu.h"
 #include "intel/intel_defines.h"
 #include "intel/intel_structs.h"
@@ -55,6 +56,9 @@
 
 #define TIMESTAMP_ADDR        0x2358
 
+#define SYS_ROUTINE_SIZE      4096
+#define DEBUG_SURFACE_BTI     127
+
 /* Stores both binding tables and surface states */
 typedef struct surface_heap {
   uint32_t binding_table[256];
@@ -113,6 +117,9 @@ typedef void (intel_gpgpu_select_pipeline_t)(intel_gpgpu_t *gpgpu);
 intel_gpgpu_select_pipeline_t *intel_gpgpu_select_pipeline = NULL;
 
 static void
+intel_gpgpu_debug_surface_setup(intel_gpgpu_t *gpgpu);
+
+static void
 intel_gpgpu_sync(void *buf)
 {
   if (buf)
@@ -152,6 +159,8 @@ intel_gpgpu_delete_finished(intel_gpgpu_t *gpgpu)
     drm_intel_bo_unreference(gpgpu->scratch_b.bo);
   if (gpgpu->profiling_b.bo)
     drm_intel_bo_unreference(gpgpu->profiling_b.bo);
+  if (gpgpu->sys_routine_bo)
+    drm_intel_bo_unreference(gpgpu->sys_routine_bo);
 
   if(gpgpu->constant_b.bo)
     drm_intel_bo_unreference(gpgpu->constant_b.bo);
@@ -365,11 +374,7 @@ intel_gpgpu_set_base_address_gen8(intel_gpgpu_t *gpgpu)
     OUT_BATCH(gpgpu->batch, 0);
     OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
     OUT_BATCH(gpgpu->batch, 0);
-    //OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
-    OUT_RELOC(gpgpu->batch, (drm_intel_bo *)gpgpu->ker->bo,
-              I915_GEM_DOMAIN_INSTRUCTION,
-              I915_GEM_DOMAIN_INSTRUCTION,
-              0 + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY));
+    OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
     OUT_BATCH(gpgpu->batch, 0);
 
     OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
@@ -410,11 +415,7 @@ intel_gpgpu_set_base_address_gen9(intel_gpgpu_t *gpgpu)
     OUT_BATCH(gpgpu->batch, 0);
     OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
     OUT_BATCH(gpgpu->batch, 0);
-    //OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
-    OUT_RELOC(gpgpu->batch, (drm_intel_bo *)gpgpu->ker->bo,
-              I915_GEM_DOMAIN_INSTRUCTION,
-              I915_GEM_DOMAIN_INSTRUCTION,
-              0 + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY));
+    OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
     OUT_BATCH(gpgpu->batch, 0);
 
     OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
@@ -796,6 +797,101 @@ intel_gpgpu_set_L3_gen8(intel_gpgpu_t *gpgpu, uint32_t use_slm)
 }
 
 static void
+intel_gpgpu_send_kernel_debug_state(intel_gpgpu_t *gpgpu)
+{
+  const int dev = gpgpu->drv->device_id;
+  uint32_t reg = 0, val = 0;
+
+  if (!(IS_HASWELL(dev) || IS_BROADWELL(dev) || IS_SKYLAKE(dev)))
+    return;
+
+  /* Set the System Instruction Pointer */
+  if (IS_HASWELL(dev)) {
+    BEGIN_BATCH(gpgpu->batch, 2);
+    OUT_BATCH(gpgpu->batch, CMD_STATE_SIP | (2 - 2));
+    OUT_RELOC(gpgpu->batch, gpgpu->sys_routine_bo, I915_GEM_DOMAIN_INSTRUCTION,
+              I915_GEM_DOMAIN_INSTRUCTION, 0);
+    ADVANCE_BATCH(gpgpu->batch);
+  } else {
+    BEGIN_BATCH(gpgpu->batch, 3);
+    OUT_BATCH(gpgpu->batch, CMD_STATE_SIP | (3 - 2));
+    OUT_RELOC(gpgpu->batch, gpgpu->sys_routine_bo, I915_GEM_DOMAIN_INSTRUCTION,
+              I915_GEM_DOMAIN_INSTRUCTION, 0);
+    OUT_BATCH(gpgpu->batch, 0);
+    ADVANCE_BATCH(gpgpu->batch);
+  }
+
+  if (IS_HASWELL(dev))
+    return;
+
+  /* Enable HW shader debug via MMIO */
+  if (IS_BROADWELL(dev)) {
+    reg = GEN8_INSTPM;
+    val = GEN8_INSTPM_GLOBAL_DEBUG;
+    val |= (val << 16);
+  } else {
+    reg = GEN9_CS_DEBUG_MODE1;
+    val = GEN9_CS_DEBUG_MODE1_GLOBAL_DEBUG;
+    val |= (val << 16);
+  }
+
+  BEGIN_BATCH(gpgpu->batch, 3);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | (3 - 2));
+  OUT_BATCH(gpgpu->batch, reg);
+  OUT_BATCH(gpgpu->batch, val);
+  ADVANCE_BATCH(gpgpu->batch);
+
+  BEGIN_BATCH(gpgpu->batch, 3);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | (3 - 2));
+  OUT_BATCH(gpgpu->batch, GEN8_TD_CTL);
+  OUT_BATCH(gpgpu->batch, GEN8_TD_CTL_FORCE_BKPT_ENABLE |
+                          GEN8_TD_CTL_FORCE_EXCEPTION_ENABLE);
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+static inline unsigned
+get_generation(intel_gpgpu_t *gpgpu)
+{
+  const int dev = gpgpu->drv->device_id;
+
+  if (IS_HASWELL(dev))
+    return 7;
+  else if (IS_BROADWELL(dev))
+    return 8;
+  else if (IS_SKYLAKE(dev))
+    return 9;
+  else {
+    assert("Unsupported generation" && 0);
+    return 0;
+  }
+}
+
+static void
+intel_gpgpu_sys_routine_setup(intel_gpgpu_t *gpgpu)
+{
+  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+
+  assert(gpgpu->debug_surface_bo);
+
+  gpgpu->sys_routine_bo = drm_intel_bo_alloc(bufmgr, "system_routine",
+                                             gpgpu->sys_routine_size,
+                                             INTEL_SYS_ROUTINE_ALIGN);
+  assert(gpgpu->sys_routine_bo);
+  if (!gpgpu->sys_routine_bo)
+    goto error;
+
+  // Copy the system routine binary to the BO
+  drm_intel_bo_subdata(gpgpu->sys_routine_bo, 0, gpgpu->sys_routine_size,
+                       gpgpu->sys_routine_buf);
+  return;
+
+error:
+  gpgpu->kernel_debug = false;
+  drm_intel_bo_unreference(gpgpu->debug_surface_bo);
+  gpgpu->debug_surface_bo = NULL;
+}
+
+static void
 intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
 {
   intel_batchbuffer_start_atomic(gpgpu->batch, 256);
@@ -807,6 +903,8 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
   intel_gpgpu_load_vfe_state(gpgpu);
   intel_gpgpu_load_curbe_buffer(gpgpu);
   intel_gpgpu_load_idrt(gpgpu);
+  if (gpgpu->kernel_debug)
+    intel_gpgpu_sys_routine_setup(gpgpu);
 
   if (gpgpu->perf_b.bo) {
     BEGIN_BATCH(gpgpu->batch, 3);
@@ -825,6 +923,9 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
   /* Insert PIPE_CONTROL for time stamp of start*/
   if (gpgpu->time_stamp_b.bo)
     intel_gpgpu_write_timestamp(gpgpu, 0);
+
+  if (gpgpu->kernel_debug)
+    intel_gpgpu_send_kernel_debug_state(gpgpu);
 }
 
 static void
@@ -1531,6 +1632,37 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
   intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti, I965_SURFACEFORMAT_RAW);
 }
 
+static void
+intel_gpgpu_debug_surface_setup(intel_gpgpu_t *gpgpu)
+{
+  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+  const unsigned bti = DEBUG_SURFACE_BTI;
+  unsigned debug_surface_size;
+  int ret;
+
+  ret = dbg_get_sys_routine_binary(get_generation(gpgpu), bti,
+                                   &gpgpu->sys_routine_buf, &gpgpu->sys_routine_size,
+                                   &debug_surface_size);
+  assert(ret == 0);
+  if (ret) {
+    gpgpu->kernel_debug = false;
+    return;
+  }
+
+  gpgpu->debug_surface_bo = drm_intel_bo_alloc(bufmgr, "debug_surface",
+                                               debug_surface_size,
+                                               INTEL_SYS_ROUTINE_ALIGN);
+  assert(gpgpu->debug_surface_bo);
+  if (!gpgpu->debug_surface_bo) {
+    perror("drm_intel_bo_alloc debug_surface");
+    gpgpu->kernel_debug = false;
+    return;
+  }
+
+  intel_gpgpu_bind_buf(gpgpu, gpgpu->debug_surface_bo, 0, 0,
+                       debug_surface_size, bti);
+}
+
 static int
 intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
 {
@@ -1659,6 +1791,13 @@ intel_gpgpu_build_idrt_gen8(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
     slm_sz = 64*KB;
   slm_sz = slm_sz >> 12;
   desc->desc6.slm_sz = slm_sz;
+
+  /* Set the kernel start pointer. */
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                    I915_GEM_DOMAIN_INSTRUCTION, 0,
+                    0,
+                    gpgpu->aux_offset.idrt_offset + offsetof(gen8_interface_descriptor_t, desc0),
+                    (drm_intel_bo *)kernel->bo);
 }
 
 static void
@@ -1703,6 +1842,13 @@ intel_gpgpu_build_idrt_gen9(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
   else
     slm_sz = 7;
   desc->desc6.slm_sz = slm_sz;
+
+  /* Set the kernel start pointer. */
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                    I915_GEM_DOMAIN_INSTRUCTION, 0,
+                    0,
+                    gpgpu->aux_offset.idrt_offset + offsetof(gen8_interface_descriptor_t, desc0),
+                    (drm_intel_bo *)kernel->bo);
 }
 
 static int
@@ -2046,6 +2192,24 @@ static void
 intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
 {
   gpgpu->ker = kernel;
+
+  if (gpgpu->kernel_debug)
+    intel_gpgpu_debug_surface_setup(gpgpu);
+
+  /* The kernel debug might have been disabled by a failure in the
+   * debug surface allocation.
+   */
+  if (gpgpu->kernel_debug) {
+    drm_intel_bo *bo = (drm_intel_bo *)kernel->bo;
+    assert(bo != NULL);
+
+    drm_intel_bo_map(bo, 1);
+    assert(bo->virtual != NULL);
+
+    dbg_notify_kernel_debug_data(kernel->name, bo->virtual, bo->size);
+    drm_intel_bo_unmap(bo);
+  }
+
   intel_gpgpu_build_idrt(gpgpu, kernel);
   dri_bo_unmap(gpgpu->aux_buf.bo);
 }
@@ -2405,6 +2569,13 @@ intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu)
   return gpgpu->printf_info;
 }
 
+
+static void
+intel_gpgpu_enable_kernel_debug(intel_gpgpu_t *gpgpu)
+{
+  gpgpu->kernel_debug = true;
+}
+
 LOCAL void
 intel_set_gpgpu_callbacks(int device_id)
 {
@@ -2445,6 +2616,7 @@ intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_release_printf_buffer = (cl_gpgpu_release_printf_buffer_cb *)intel_gpgpu_release_printf_buf;
   cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
   cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
+  cl_gpgpu_enable_kernel_debug = (cl_gpgpu_enable_kernel_debug_cb *)intel_gpgpu_enable_kernel_debug;
 
   if (IS_BROADWELL(device_id) || IS_CHERRYVIEW(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen8;
diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h
index 904f9e0..762e43f 100644
--- a/src/intel/intel_gpgpu.h
+++ b/src/intel/intel_gpgpu.h
@@ -28,6 +28,7 @@
 
 #include <stdlib.h>
 #include <stdint.h>
+#include <stdbool.h>
 
 
 /* We can bind only a limited number of buffers */
@@ -82,6 +83,12 @@ struct intel_gpgpu
   } curb;
 
   uint32_t max_threads;      /* max threads requested by the user */
+
+  bool kernel_debug;
+  const unsigned char *sys_routine_buf;
+  unsigned int sys_routine_size;
+  drm_intel_bo *sys_routine_bo;
+  drm_intel_bo *debug_surface_bo;
 };
 
 struct intel_gpgpu_node {
-- 
1.8.3.1


