[Intel-gfx] [PATCH v2] drm/i915: Record more information about the hanging contexts
Mika Kuoppala
mika.kuoppala at linux.intel.com
Mon Jan 30 15:24:52 UTC 2017
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Include extra information such as the user_handle and hw_id so that
> userspace can identify which of their contexts hung, useful if they are
> performing self-diagnostics.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> ---
> drivers/gpu/drm/i915/i915_drv.h | 14 +++++--
> drivers/gpu/drm/i915/i915_gpu_error.c | 77 ++++++++++++++++++++++-------------
> 2 files changed, 59 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c1fde816db63..7e7bc4504c94 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -970,6 +970,16 @@ struct drm_i915_error_state {
> u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
> struct intel_instdone instdone;
>
> + struct drm_i915_error_context {
> + char comm[TASK_COMM_LEN];
> + int pid;
s/int/pid_t/
Reviewed-by: Mika Kuoppala <mika.kuoppala at intel.com>
> + u32 handle;
> + u32 hw_id;
> + int ban_score;
> + int active;
> + int guilty;
> + } context;
> +
> struct drm_i915_error_object {
> u64 gtt_offset;
> u64 gtt_size;
> @@ -1003,10 +1013,6 @@ struct drm_i915_error_state {
> u32 pp_dir_base;
> };
> } vm_info;
> -
> - pid_t pid;
> - char comm[TASK_COMM_LEN];
> - int context_bans;
> } engine[I915_NUM_ENGINES];
>
> struct drm_i915_error_buffer {
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index e5375323eb06..5283fe815a4d 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -384,6 +384,15 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
> erq->head, erq->tail);
> }
>
> +static void error_print_context(struct drm_i915_error_state_buf *m,
> + const char *header,
> + struct drm_i915_error_context *ctx)
> +{
> + err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
> + header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
> + ctx->ban_score, ctx->guilty, ctx->active);
> +}
> +
> static void error_print_engine(struct drm_i915_error_state_buf *m,
> struct drm_i915_error_engine *ee)
> {
> @@ -457,6 +466,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
>
> error_print_request(m, " ELSP[0]: ", &ee->execlist[0]);
> error_print_request(m, " ELSP[1]: ", &ee->execlist[1]);
> + error_print_context(m, " Active context: ", &ee->context);
> }
>
> void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
> @@ -562,12 +572,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>
> for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
> if (error->engine[i].hangcheck_stalled &&
> - error->engine[i].pid != -1) {
> - err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n",
> + error->engine[i].context.pid) {
> + err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
> engine_str(i),
> - error->engine[i].comm,
> - error->engine[i].pid,
> - error->engine[i].context_bans);
> + error->engine[i].context.comm,
> + error->engine[i].context.pid,
> + error->engine[i].context.ban_score);
> }
> }
> err_printf(m, "Reset count: %u\n", error->reset_count);
> @@ -658,11 +668,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
> obj = ee->batchbuffer;
> if (obj) {
> err_puts(m, dev_priv->engine[i]->name);
> - if (ee->pid != -1)
> - err_printf(m, " (submitted by %s [%d], bans %d)",
> - ee->comm,
> - ee->pid,
> - ee->context_bans);
> + if (ee->context.pid)
> + err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
> + ee->context.comm,
> + ee->context.pid,
> + ee->context.handle,
> + ee->context.hw_id,
> + ee->context.ban_score);
> err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
> upper_32_bits(obj->gtt_offset),
> lower_32_bits(obj->gtt_offset));
> @@ -1267,6 +1279,28 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine,
> &ee->execlist[n]);
> }
>
> +static void record_context(struct drm_i915_error_context *e,
> + struct i915_gem_context *ctx)
> +{
> + if (ctx->pid) {
> + struct task_struct *task;
> +
> + rcu_read_lock();
> + task = pid_task(ctx->pid, PIDTYPE_PID);
> + if (task) {
> + strcpy(e->comm, task->comm);
> + e->pid = task->pid;
> + }
> + rcu_read_unlock();
> + }
> +
> + e->handle = ctx->user_handle;
> + e->hw_id = ctx->hw_id;
> + e->ban_score = ctx->ban_score;
> + e->guilty = ctx->guilty_count;
> + e->active = ctx->active_count;
> +}
> +
> static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
> struct drm_i915_error_state *error)
> {
> @@ -1281,7 +1315,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
> struct drm_i915_error_engine *ee = &error->engine[i];
> struct drm_i915_gem_request *request;
>
> - ee->pid = -1;
> ee->engine_id = -1;
>
> if (!engine)
> @@ -1296,11 +1329,12 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
> request = i915_gem_find_active_request(engine);
> if (request) {
> struct intel_ring *ring;
> - struct pid *pid;
>
> ee->vm = request->ctx->ppgtt ?
> &request->ctx->ppgtt->base : &ggtt->base;
>
> + record_context(&ee->context, request->ctx);
> +
> /* We need to copy these to an anonymous buffer
> * as the simplest method to avoid being overwritten
> * by userspace.
> @@ -1318,19 +1352,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
> i915_error_object_create(dev_priv,
> request->ctx->engine[i].state);
>
> - pid = request->ctx->pid;
> - if (pid) {
> - struct task_struct *task;
> -
> - rcu_read_lock();
> - task = pid_task(pid, PIDTYPE_PID);
> - if (task) {
> - strcpy(ee->comm, task->comm);
> - ee->pid = task->pid;
> - }
> - rcu_read_unlock();
> - }
> -
> error->simulated |=
> i915_gem_context_no_error_capture(request->ctx);
>
> @@ -1534,12 +1555,12 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
> "GPU HANG: ecode %d:%d:0x%08x",
> INTEL_GEN(dev_priv), engine_id, ecode);
>
> - if (engine_id != -1 && error->engine[engine_id].pid != -1)
> + if (engine_id != -1 && error->engine[engine_id].context.pid)
> len += scnprintf(error->error_msg + len,
> sizeof(error->error_msg) - len,
> ", in %s [%d]",
> - error->engine[engine_id].comm,
> - error->engine[engine_id].pid);
> + error->engine[engine_id].context.comm,
> + error->engine[engine_id].context.pid);
>
> scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
> ", reason: %s, action: %s",
> --
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
More information about the Intel-gfx
mailing list