[Intel-xe] [PATCH v4 1/1] drm/xe: Add a debugfs for faking gt reset failure.

Francois Dugast francois.dugast at intel.com
Thu Jun 22 14:58:50 UTC 2023


On Thu, Jun 15, 2023 at 02:56:43PM +0530, Himal Prasad Ghimiray wrote:
> In case of gt reset failure, KMD notifies userspace about failure
> via uevent. To validate this notification we need to ensure gt
> reset fails and there is no mechanism to cause failure from hardware.
> Hence add a debugfs entry which causes a fake reset failure.
> 
> v1(Rodrigo)
> - Change the variable to fake_reset_failure_in_progress.
> - Drop usage of READ_ONCE and WRITE_ONCE.
> - Follow consistency for variable assignment. Either use
>   functions for all the assignments or don't use for any.
> 
> v2
> - Add description for variable.
> - Define xe_fake_reset(gt) function in xe_gt.c to set the
>   fake_reset_failure_in_progress.
> - No need to explicitly initialize the fake_reset_failure_in_progress
>   as false. (Rodrigo)
> - Return proper error code in case of fake reset. (Bala)
> 
> v3
> - Move gt reset worker call to xe_fake_reset(gt) which ensures
> fake reset goes through even in GT suspend state. (Aravind)
> 
> v4
> -Clear whitespaces.
> 
> Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> cc: Francois Dugast <francois.dugast at intel.com>
> Cc: Mauro Carvalho Chehab <mchehab at kernel.org>
> Cc: Balasubramani Vivekanandan <balasubramani.vivekanandan at intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_gt.c         | 15 +++++++++++++++
>  drivers/gpu/drm/xe/xe_gt.h         |  2 ++
>  drivers/gpu/drm/xe/xe_gt_debugfs.c |  9 +++++++++
>  drivers/gpu/drm/xe/xe_gt_types.h   |  5 +++++
>  4 files changed, 31 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index 2458397ce8af..086317a4d91a 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -497,10 +497,24 @@ static int do_gt_restart(struct xe_gt *gt)
>  	return 0;
>  }
>  
> +void xe_gt_fake_reset(struct xe_gt *gt)
> +{
> +	gt->reset.fake_reset_failure_in_progress = true;
> +	xe_gt_info(gt, "Fake reset queued\n");
> +	queue_work(gt->ordered_wq, &gt->reset.worker);
> +}
> +
>  static int gt_reset(struct xe_gt *gt)
>  {
>  	int err;
>  
> +	if (gt->reset.fake_reset_failure_in_progress) {
> +		err = -ECANCELED;
> +		xe_gt_info(gt, "Fake GT reset failure is in progress\n");
> +		gt->reset.fake_reset_failure_in_progress = false;
> +		goto err_cancelled;
> +	}
> +
>  	/* We only support GT resets with GuC submission */
>  	if (!xe_device_guc_submission_enabled(gt_to_xe(gt)))
>  		return -ENODEV;
> @@ -543,6 +557,7 @@ static int gt_reset(struct xe_gt *gt)
>  err_msg:
>  	XE_WARN_ON(xe_uc_start(&gt->uc));
>  	xe_device_mem_access_put(gt_to_xe(gt));
> +err_cancelled:
>  	xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));

Not a big issue, but the goto label "err_cancelled" is misleading, as cancellation != failure: this path reports a (fake) reset failure, not a cancelled reset.

>  
>  	return err;
> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
> index 21d9044088de..22afae04fa3c 100644
> --- a/drivers/gpu/drm/xe/xe_gt.h
> +++ b/drivers/gpu/drm/xe/xe_gt.h
> @@ -24,6 +24,8 @@ void xe_gt_suspend_prepare(struct xe_gt *gt);
>  int xe_gt_suspend(struct xe_gt *gt);
>  int xe_gt_resume(struct xe_gt *gt);
>  void xe_gt_reset_async(struct xe_gt *gt);
> +void xe_gt_fake_reset(struct xe_gt *gt);
> +void xe_gt_migrate_wait(struct xe_gt *gt);

The xe_gt_migrate_wait() declaration is not needed by this patch, please remove it.

Francois

>  void xe_gt_sanitize(struct xe_gt *gt);
>  
>  /**
> diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c b/drivers/gpu/drm/xe/xe_gt_debugfs.c
> index b5a5538ae630..8d888153c477 100644
> --- a/drivers/gpu/drm/xe/xe_gt_debugfs.c
> +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c
> @@ -138,6 +138,14 @@ static int workarounds(struct seq_file *m, void *data)
>  	return 0;
>  }
>  
> +static int fake_reset_failure(struct seq_file *m, void *data)
> +{
> +	struct xe_gt *gt = node_to_gt(m->private);
> +
> +	xe_gt_fake_reset(gt);
> +	return 0;
> +}
> +
>  static const struct drm_info_list debugfs_list[] = {
>  	{"hw_engines", hw_engines, 0},
>  	{"force_reset", force_reset, 0},
> @@ -147,6 +155,7 @@ static const struct drm_info_list debugfs_list[] = {
>  	{"ggtt", ggtt, 0},
>  	{"register-save-restore", register_save_restore, 0},
>  	{"workarounds", workarounds, 0},
> +	{"fake_reset_failure", fake_reset_failure, 0},
>  };
>  
>  void xe_gt_debugfs_register(struct xe_gt *gt)
> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> index 99ab7ec99ccd..0b6e1df3ca36 100644
> --- a/drivers/gpu/drm/xe/xe_gt_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> @@ -141,6 +141,11 @@ struct xe_gt {
>  
>  	/** @reset: state for GT resets */
>  	struct {
> +		/**
> +		 * @fake_reset_failure_in_progress: A bool to indicate a fake reset
> +		 * failure has been triggered
> +		 */
> +		bool fake_reset_failure_in_progress;
>  		/**
>  		 * @worker: work so GT resets can done async allowing to reset
>  		 * code to safely flush all code paths
> -- 
> 2.25.1
> 


More information about the Intel-xe mailing list