[Intel-xe] [PATCH v2 1/1] drm/xe: Add a debugfs for faking gt reset failure.
Ghimiray, Himal Prasad
himal.prasad.ghimiray at intel.com
Thu Jun 15 09:00:34 UTC 2023
Please Ignore.
> -----Original Message-----
> From: Ghimiray, Himal Prasad <himal.prasad.ghimiray at intel.com>
> Sent: 15 June 2023 14:29
> To: intel-xe at lists.freedesktop.org
> Cc: Ghimiray, Himal Prasad <himal.prasad.ghimiray at intel.com>; Vivi,
> Rodrigo <rodrigo.vivi at intel.com>; Dugast, Francois
> <francois.dugast at intel.com>; Mauro Carvalho Chehab
> <mchehab at kernel.org>; Vivekanandan, Balasubramani
> <balasubramani.vivekanandan at intel.com>
> Subject: [PATCH v2 1/1] drm/xe: Add a debugfs for faking gt reset failure.
>
> In case of gt reset failure, KMD notifies userspace about failure via uevent. To
> validate this notification we need to ensure gt reset fails and there is no
> mechanism to cause failure from hardware.
> Hence added a debugfs which will cause fake reset failure.
>
> v1(Rodrigo)
> - Change the variable to fake_reset_failure_in_progress.
> - Drop usage of READ_ONCE and WRITE_ONCE.
> - Follow consistency for variable assignment. Either use
> functions for all the assignments or don't use for any.
>
> v2
> - Add description for variable.
> - Define xe_fake_reset(gt) function in xe_gt.c to set the
> fake_reset_failure_in_progress.
> - No need to explicitly initialize the fake_reset_failure_in_progress
> as false. (Rodrigo)
> - Return proper error code in case of fake reset. (Bala)
>
> v3
> - Move gt reset worker call to xe_fake_reset(gt) which ensures fake reset
> goes through even in GT suspend state.
>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> cc: Francois Dugast <francois.dugast at intel.com>
> Cc: Mauro Carvalho Chehab <mchehab at kernel.org>
> Cc: Balasubramani Vivekanandan
> <balasubramani.vivekanandan at intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
> drivers/gpu/drm/xe/xe_gt.c | 19 +++++++++++++++++++
> drivers/gpu/drm/xe/xe_gt.h | 2 ++
> drivers/gpu/drm/xe/xe_gt_debugfs.c | 9 +++++++++
> drivers/gpu/drm/xe/xe_gt_types.h | 5 +++++
> 4 files changed, 35 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index
> 2458397ce8af..e70af81f2db5 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -497,10 +497,27 @@ static int do_gt_restart(struct xe_gt *gt)
> return 0;
> }
>
> +void xe_gt_fake_reset(struct xe_gt *gt) {
> + gt->reset.fake_reset_failure_in_progress = true;
> +
> + xe_gt_info(gt, "Fake reset queued\n");
> + queue_work(gt->ordered_wq, &gt->reset.worker);
> + n
> +
> +}
> +
> static int gt_reset(struct xe_gt *gt)
> {
> int err;
>
> + if (gt->reset.fake_reset_failure_in_progress) {
> + err = -ECANCELED;
> + xe_gt_info(gt, "Fake GT reset failure is in progress\n");
> + gt->reset.fake_reset_failure_in_progress = false;
> + goto err_cancelled;
> + }
> +
> /* We only support GT resets with GuC submission */
> if (!xe_device_guc_submission_enabled(gt_to_xe(gt)))
> return -ENODEV;
> @@ -510,6 +527,7 @@ static int gt_reset(struct xe_gt *gt)
> xe_gt_sanitize(gt);
>
> xe_device_mem_access_get(gt_to_xe(gt));
> +
> err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
> if (err)
> goto err_msg;
> @@ -543,6 +561,7 @@ static int gt_reset(struct xe_gt *gt)
> err_msg:
> XE_WARN_ON(xe_uc_start(&gt->uc));
> xe_device_mem_access_put(gt_to_xe(gt));
> +err_cancelled:
> xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
>
> return err;
> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h index
> 21d9044088de..22afae04fa3c 100644
> --- a/drivers/gpu/drm/xe/xe_gt.h
> +++ b/drivers/gpu/drm/xe/xe_gt.h
> @@ -24,6 +24,8 @@ void xe_gt_suspend_prepare(struct xe_gt *gt); int
> xe_gt_suspend(struct xe_gt *gt); int xe_gt_resume(struct xe_gt *gt); void
> xe_gt_reset_async(struct xe_gt *gt);
> +void xe_gt_fake_reset(struct xe_gt *gt); void xe_gt_migrate_wait(struct
> +xe_gt *gt);
> void xe_gt_sanitize(struct xe_gt *gt);
>
> /**
> diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c
> b/drivers/gpu/drm/xe/xe_gt_debugfs.c
> index b5a5538ae630..8d888153c477 100644
> --- a/drivers/gpu/drm/xe/xe_gt_debugfs.c
> +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c
> @@ -138,6 +138,14 @@ static int workarounds(struct seq_file *m, void
> *data)
> return 0;
> }
>
> +static int fake_reset_failure(struct seq_file *m, void *data) {
> + struct xe_gt *gt = node_to_gt(m->private);
> +
> + xe_gt_fake_reset(gt);
> + return 0;
> +}
> +
> static const struct drm_info_list debugfs_list[] = {
> {"hw_engines", hw_engines, 0},
> {"force_reset", force_reset, 0},
> @@ -147,6 +155,7 @@ static const struct drm_info_list debugfs_list[] = {
> {"ggtt", ggtt, 0},
> {"register-save-restore", register_save_restore, 0},
> {"workarounds", workarounds, 0},
> + {"fake_reset_failure", fake_reset_failure, 0},
> };
>
> void xe_gt_debugfs_register(struct xe_gt *gt) diff --git
> a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> index 99ab7ec99ccd..0b6e1df3ca36 100644
> --- a/drivers/gpu/drm/xe/xe_gt_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> @@ -141,6 +141,11 @@ struct xe_gt {
>
> /** @reset: state for GT resets */
> struct {
> + /**
> + * @fake_reset_failure_in_progress: A bool to indicate a fake
> reset
> + * failure has been triggered
> + */
> + bool fake_reset_failure_in_progress;
> /**
> * @worker: work so GT resets can done async allowing to
> reset
> * code to safely flush all code paths
> --
> 2.25.1
More information about the Intel-xe
mailing list