[Intel-gfx] [PATCH v2] drm/i915/guc: Force a reset on internal GuC error
Ceraolo Spurio, Daniele
daniele.ceraolospurio at intel.com
Thu Aug 17 21:25:12 UTC 2023
On 8/15/2023 5:39 PM, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> If GuC hits an internal error (and survives long enough to report it
> to the KMD), it is basically toast and will stop until a GT reset and
> subsequent GuC reload is performed. Previously, the KMD just printed
> an error message and then waited for the heartbeat to eventually kick
> in and trigger a reset (assuming the heartbeat had not been disabled).
> Instead, force the reset immediately to guarantee that it happens and
> to eliminate the very long heartbeat delay. The captured error state
> is also more likely to be useful if captured at the time of the error
> rather than many seconds later.
>
> Note that it is not possible to trigger a reset from with the G2H
> handler itself. The reset prepare process involves flushing
> outstanding G2H contents. So a deadlock could result. Instead, the G2H
> handler queues a worker thread to do the reset asynchronously.
>
> v2: Flush the worker on suspend and shutdown. Add rate limiting to
> prevent spam from a totally dead system (review feedback from Daniele).
>
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Daniele
> ---
> drivers/gpu/drm/i915/gt/uc/intel_guc.c | 38 +++++++++++++++++++++++
> drivers/gpu/drm/i915/gt/uc/intel_guc.h | 15 +++++++++
> drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 6 +---
> 3 files changed, 54 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> index 569b5fe94c416..12a817b762334 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> @@ -159,6 +159,21 @@ static void gen11_disable_guc_interrupts(struct intel_guc *guc)
> gen11_reset_guc_interrupts(guc);
> }
>
> +static void guc_dead_worker_func(struct work_struct *w)
> +{
> + struct intel_guc *guc = container_of(w, struct intel_guc, dead_guc_worker);
> + struct intel_gt *gt = guc_to_gt(guc);
> + unsigned long last = guc->last_dead_guc_jiffies;
> + unsigned long delta = jiffies_to_msecs(jiffies - last);
> +
> + if (delta < 500) {
> + intel_gt_set_wedged(gt);
> + } else {
> + intel_gt_handle_error(gt, ALL_ENGINES, I915_ERROR_CAPTURE, "dead GuC");
> + guc->last_dead_guc_jiffies = jiffies;
> + }
> +}
> +
> void intel_guc_init_early(struct intel_guc *guc)
> {
> struct intel_gt *gt = guc_to_gt(guc);
> @@ -171,6 +186,8 @@ void intel_guc_init_early(struct intel_guc *guc)
> intel_guc_slpc_init_early(&guc->slpc);
> intel_guc_rc_init_early(guc);
>
> + INIT_WORK(&guc->dead_guc_worker, guc_dead_worker_func);
> +
> mutex_init(&guc->send_mutex);
> spin_lock_init(&guc->irq_lock);
> if (GRAPHICS_VER(i915) >= 11) {
> @@ -461,6 +478,8 @@ void intel_guc_fini(struct intel_guc *guc)
> if (!intel_uc_fw_is_loadable(&guc->fw))
> return;
>
> + flush_work(&guc->dead_guc_worker);
> +
> if (intel_guc_slpc_is_used(guc))
> intel_guc_slpc_fini(&guc->slpc);
>
> @@ -585,6 +604,20 @@ int intel_guc_send_mmio(struct intel_guc *guc, const u32 *request, u32 len,
> return ret;
> }
>
> +int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action)
> +{
> + if (action == INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED)
> + guc_err(guc, "Crash dump notification\n");
> + else if (action == INTEL_GUC_ACTION_NOTIFY_EXCEPTION)
> + guc_err(guc, "Exception notification\n");
> + else
> + guc_err(guc, "Unknown crash notification: 0x%04X\n", action);
> +
> + queue_work(system_unbound_wq, &guc->dead_guc_worker);
> +
> + return 0;
> +}
> +
> int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
> const u32 *payload, u32 len)
> {
> @@ -601,6 +634,9 @@ int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
> if (msg & INTEL_GUC_RECV_MSG_EXCEPTION)
> guc_err(guc, "Received early exception notification!\n");
>
> + if (msg & (INTEL_GUC_RECV_MSG_CRASH_DUMP_POSTED | INTEL_GUC_RECV_MSG_EXCEPTION))
> + queue_work(system_unbound_wq, &guc->dead_guc_worker);
> +
> return 0;
> }
>
> @@ -640,6 +676,8 @@ int intel_guc_suspend(struct intel_guc *guc)
> return 0;
>
> if (intel_guc_submission_is_used(guc)) {
> + flush_work(&guc->dead_guc_worker);
> +
> /*
> * This H2G MMIO command tears down the GuC in two steps. First it will
> * generate a G2H CTB for every active context indicating a reset. In
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index 8dc291ff00935..6c392bad29c19 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -266,6 +266,20 @@ struct intel_guc {
> unsigned long last_stat_jiffies;
> } timestamp;
>
> + /**
> + * @dead_guc_worker: Asynchronous worker thread for forcing a GuC reset.
> + * Specifically used when the G2H handler wants to issue a reset. Resets
> + * require flushing the G2H queue. So, the G2H processing itself must not
> + * trigger a reset directly. Instead, go via this worker.
> + */
> + struct work_struct dead_guc_worker;
> + /**
> + * @last_dead_guc_jiffies: timestamp of previous 'dead guc' occurrance
> + * used to prevent a fundamentally broken system from continuously
> + * reloading the GuC.
> + */
> + unsigned long last_dead_guc_jiffies;
> +
> #ifdef CONFIG_DRM_I915_SELFTEST
> /**
> * @number_guc_id_stolen: The number of guc_ids that have been stolen
> @@ -476,6 +490,7 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
> const u32 *msg, u32 len);
> int intel_guc_error_capture_process_msg(struct intel_guc *guc,
> const u32 *msg, u32 len);
> +int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action);
>
> struct intel_engine_cs *
> intel_guc_lookup_engine(struct intel_guc *guc, u8 guc_class, u8 instance);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index 97eadd08181d6..6e22af31513a5 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -1112,12 +1112,8 @@ static int ct_process_request(struct intel_guc_ct *ct, struct ct_incoming_msg *r
> ret = 0;
> break;
> case INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED:
> - CT_ERROR(ct, "Received GuC crash dump notification!\n");
> - ret = 0;
> - break;
> case INTEL_GUC_ACTION_NOTIFY_EXCEPTION:
> - CT_ERROR(ct, "Received GuC exception notification!\n");
> - ret = 0;
> + ret = intel_guc_crash_process_msg(guc, action);
> break;
> default:
> ret = -EOPNOTSUPP;
More information about the Intel-gfx
mailing list