[Intel-gfx] [PATCH 2/2] drm/i915: Use time based guilty context banning
Mika Kuoppala
mika.kuoppala at linux.intel.com
Tue Feb 19 11:22:32 UTC 2019
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Currently, we accumulate each time a context hangs the GPU, offset
> against the number of requests it submits, and if that score exceeds a
> certain threshold, we ban that context from submitting any more requests
> (cancelling any work in flight). In contrast, we use a simple timer on
> the file, that if we see more than a 9 hangs faster than 60s apart in
> total across all of its contexts, we will ban the client from creating
> any more contexts. This leads to a confusing situation where the file
> may be banned before the context, so lets use a simple timer scheme for
> each.
>
> If the context submits 3 hanging requests within a 120s period, declare
> it forbidden to ever send more requests.
>
> This has the advantage of not being easy to repair by simply sending
> empty requests, but has the disadvantage that if the context is idle
> then it is forgiven. However, if the context is idle, it is not
> disrupting the system, but a hog can evade the request counting and
> cause much more severe disruption to the system.
>
> Updating ban_score from request retirement is dubious as the retirement
> is purposely not in sync with request submission (i.e. we try and batch
> retirement to reduce overhead and avoid latency on submission), which
> leads to surprising situations where we can forgive a hang immediately
> due to a backlog of requests from before the hang being retired
> afterwards.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> ---
> drivers/gpu/drm/i915/i915_gem_context.c | 4 ++++
> drivers/gpu/drm/i915/i915_gem_context.h | 9 +++----
> drivers/gpu/drm/i915/i915_gpu_error.c | 31 +++++++------------------
> drivers/gpu/drm/i915/i915_gpu_error.h | 3 ---
> drivers/gpu/drm/i915/i915_request.c | 2 --
> drivers/gpu/drm/i915/i915_reset.c | 27 ++++++++++++---------
> 6 files changed, 33 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index da21c843fed8..7541c6f961b3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -355,6 +355,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
> struct i915_gem_context *ctx;
> unsigned int n;
> int ret;
> + int i;
>
> ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> if (ctx == NULL)
> @@ -407,6 +408,9 @@ __create_hw_context(struct drm_i915_private *dev_priv,
> ctx->desc_template =
> default_desc_template(dev_priv, dev_priv->mm.aliasing_ppgtt);
>
> + for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
> + ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
> +
> return ctx;
>
> err_pid:
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 071108d34ae0..dc6c58f38cfa 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -209,10 +209,11 @@ struct i915_gem_context {
> */
> atomic_t active_count;
>
> -#define CONTEXT_SCORE_GUILTY 10
> -#define CONTEXT_SCORE_BAN_THRESHOLD 40
> - /** ban_score: Accumulated score of all hangs caused by this context. */
> - atomic_t ban_score;
> + /**
> + * @hang_timestamp: The last time(s) this context caused a GPU hang
> + */
> + unsigned long hang_timestamp[2];
> +#define CONTEXT_FAST_HANG_JIFFIES (120 * HZ) /* 3 hangs within 120s? Banned! */
>
> /** remap_slice: Bitmask of cache lines that need remapping */
> u8 remap_slice;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 9a65341fec09..3f6eddb6f6de 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -434,11 +434,6 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
> ee->instdone.row[slice][subslice]);
> }
>
> -static const char *bannable(const struct drm_i915_error_context *ctx)
> -{
> - return ctx->bannable ? "" : " (unbannable)";
> -}
> -
> static void error_print_request(struct drm_i915_error_state_buf *m,
> const char *prefix,
> const struct drm_i915_error_request *erq,
> @@ -447,9 +442,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
> if (!erq->seqno)
> return;
>
> - err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
> - prefix, erq->pid, erq->ban_score,
> - erq->context, erq->seqno,
> + err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
> + prefix, erq->pid, erq->context, erq->seqno,
> test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> &erq->flags) ? "!" : "",
> test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> @@ -463,10 +457,9 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
> const char *header,
> const struct drm_i915_error_context *ctx)
> {
> - err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
> + err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, guilty %d active %d\n",
> header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
> - ctx->sched_attr.priority, ctx->ban_score, bannable(ctx),
> - ctx->guilty, ctx->active);
> + ctx->sched_attr.priority, ctx->guilty, ctx->active);
> }
>
> static void error_print_engine(struct drm_i915_error_state_buf *m,
> @@ -688,12 +681,10 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
> if (!error->engine[i].context.pid)
> continue;
>
> - err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
> + err_printf(m, "Active process (on ring %s): %s [%d]\n",
> engine_name(m->i915, i),
> error->engine[i].context.comm,
> - error->engine[i].context.pid,
> - error->engine[i].context.ban_score,
> - bannable(&error->engine[i].context));
> + error->engine[i].context.pid);
> }
> err_printf(m, "Reset count: %u\n", error->reset_count);
> err_printf(m, "Suspend count: %u\n", error->suspend_count);
> @@ -779,13 +770,11 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
> if (obj) {
> err_puts(m, m->i915->engine[i]->name);
> if (ee->context.pid)
> - err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
> + err_printf(m, " (submitted by %s [%d], ctx %d [%d])",
> ee->context.comm,
> ee->context.pid,
> ee->context.handle,
> - ee->context.hw_id,
> - ee->context.ban_score,
> - bannable(&ee->context));
> + ee->context.hw_id);
> err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
> upper_32_bits(obj->gtt_offset),
> lower_32_bits(obj->gtt_offset));
> @@ -1301,8 +1290,6 @@ static void record_request(struct i915_request *request,
> erq->flags = request->fence.flags;
> erq->context = ctx->hw_id;
> erq->sched_attr = request->sched.attr;
> - erq->ban_score = atomic_read(&ctx->ban_score);
> - erq->seqno = request->global_seqno;
> erq->jiffies = request->emitted_jiffies;
> erq->start = i915_ggtt_offset(request->ring->vma);
> erq->head = request->head;
> @@ -1396,8 +1383,6 @@ static void record_context(struct drm_i915_error_context *e,
> e->handle = ctx->user_handle;
> e->hw_id = ctx->hw_id;
> e->sched_attr = ctx->sched;
> - e->ban_score = atomic_read(&ctx->ban_score);
> - e->bannable = i915_gem_context_is_bannable(ctx);
> e->guilty = atomic_read(&ctx->guilty_count);
> e->active = atomic_read(&ctx->active_count);
> }
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index afa3adb28f02..94eaf8ab9051 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -122,10 +122,8 @@ struct i915_gpu_state {
> pid_t pid;
> u32 handle;
> u32 hw_id;
> - int ban_score;
> int active;
> int guilty;
> - bool bannable;
> struct i915_sched_attr sched_attr;
> } context;
>
> @@ -149,7 +147,6 @@ struct i915_gpu_state {
> long jiffies;
> pid_t pid;
> u32 context;
> - int ban_score;
> u32 seqno;
> u32 start;
> u32 head;
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 5ab4e1c01618..a61e3a4fc9dc 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -290,8 +290,6 @@ static void i915_request_retire(struct i915_request *request)
>
> i915_request_remove_from_client(request);
>
> - /* Retirement decays the ban score as it is a sign of ctx progress */
> - atomic_dec_if_positive(&request->gem_context->ban_score);
> intel_context_unpin(request->hw_context);
>
> __retire_engine_upto(request->engine, request);
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index 1911e00d2581..bae88a4ea924 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -59,24 +59,29 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv,
>
> static bool context_mark_guilty(struct i915_gem_context *ctx)
> {
> - unsigned int score;
> - bool banned, bannable;
> + unsigned long prev_hang;
> + bool banned;
> + int i;
>
> atomic_inc(&ctx->guilty_count);
>
> - bannable = i915_gem_context_is_bannable(ctx);
> - score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> - banned = (!i915_gem_context_is_recoverable(ctx) ||
> - score >= CONTEXT_SCORE_BAN_THRESHOLD);
> -
> /* Cool contexts don't accumulate client ban score */
This comment becomes misleading and can be removed.
> - if (!bannable)
> + if (!i915_gem_context_is_bannable(ctx))
> return false;
>
> + /* Record the timestamp for the last N hangs */
> + prev_hang = ctx->hang_timestamp[0];
> + for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
> + ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
> + ctx->hang_timestamp[i] = jiffies;
> +
> + /* If we have hung N+1 times in rapid succession, we ban the context! */
> + banned = !i915_gem_context_is_recoverable(ctx);
> + if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
> + banned = true;
Ok, the initialization to jiffies - CONTEXT_FAST_HANG_JIFFIES guarantees
that it cant be banned even if it manages to immediately hang. Which
in itself is difficult feat to accomplish.
> if (banned) {
> - DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
> - ctx->name, atomic_read(&ctx->guilty_count),
> - score);
> + DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n",
> + ctx->name,
> atomic_read(&ctx->guilty_count));
Now when we know when it previously hanged, we could improve the
logging a bit by saying how long ago. But not sure
about if it worth the trouble.
With the misleading comment removed/corrected,
Reviewed-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> i915_gem_context_set_banned(ctx);
> }
>
> --
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
More information about the Intel-gfx
mailing list