[Intel-xe] [RFC 1/1] drm/xe: Add a debugfs for faking gt reset failure.

Ghimiray, Himal Prasad himal.prasad.ghimiray at intel.com
Wed May 24 06:18:55 UTC 2023



> -----Original Message-----
> From: Rodrigo Vivi <rodrigo.vivi at kernel.org>
> Sent: 22 May 2023 21:21
> To: Ghimiray, Himal Prasad <himal.prasad.ghimiray at intel.com>
> Cc: intel-xe at lists.freedesktop.org; Vivi, Rodrigo <rodrigo.vivi at intel.com>
> Subject: Re: [RFC 1/1] drm/xe: Add a debugfs for faking gt reset failure.
> 
> On Mon, May 22, 2023 at 02:28:17PM +0530, Himal Prasad Ghimiray wrote:
> > In case of a gt reset failure, the KMD notifies userspace about the failure
> > via a uevent. To validate this notification we need to make a gt reset fail,
> > but there is no mechanism to trigger such a failure from hardware.
> > Hence add a debugfs entry which causes a fake reset failure.
> >
> > Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> > Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_gt.c         | 26 +++++++++++++++++++++++++-
> >  drivers/gpu/drm/xe/xe_gt_debugfs.c | 11 +++++++++++
> >  drivers/gpu/drm/xe/xe_gt_types.h   |  3 +++
> >  3 files changed, 39 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> > index d98761d9eeba..e05615ca13e5 100644
> > --- a/drivers/gpu/drm/xe/xe_gt.c
> > +++ b/drivers/gpu/drm/xe/xe_gt.c
> > @@ -301,6 +301,9 @@ int xe_gt_init_early(struct xe_gt *gt)  {
> >  	int err;
> >
> > +	/* Reset is supported by default */
> > +	gt->reset_enabled = true;
> > +
> 
> I'd prefer to name this variable something more specific, such as
> fake_reset_failure_in_progress, in order to avoid future abuse or misuse
> of it.
Sure. Will change it.
> 
> >  	xe_force_wake_init_gt(gt, gt_to_fw(gt));
> >
> >  	err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); @@ -605,6
> +608,16
> > @@ static void xe_uevent_gt_reset_failure(struct xe_device *xe, u8 id)
> >  	kfree(reset_event[3]);
> >  }
> >
> > +static int reset_disabled(struct xe_gt *gt) {
> > +	return !READ_ONCE(gt->reset_enabled); }
> > +
> > +static void enable_reset(struct xe_gt *gt) {
> > +	WRITE_ONCE(gt->reset_enabled, true); }
> 
> Why are you using these write/read _once variants?
> Do you need some mutex?
It seems we already ensure that a gt_reset call is dropped if one is already in progress,
so a mutex will not be needed. I will make the changes in the next patch.
> 
> > +
> >  static int gt_reset(struct xe_gt *gt)  {
> >  	struct xe_device *xe = gt_to_xe(gt); @@ -617,8 +630,19 @@ static
> int
> > gt_reset(struct xe_gt *gt)
> >  	drm_info(&xe->drm, "GT reset started\n");
> >
> >  	xe_gt_sanitize(gt);
> > -
> >  	xe_device_mem_access_get(gt_to_xe(gt));
> > +
> > +	err = reset_disabled(gt);
> > +	if (err) {
> > +		drm_info(&xe->drm, "GT reset is disabled\n");
> > +
> > +		/*Enable GT reset for next call if disabled
> > +		 * for fake reset failure.
> > +		 */
> > +		enable_reset(gt);
> > +		goto err_msg;
> > +	}
> > +
> >  	err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
> >  	if (err)
> >  		goto err_msg;
> > diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c
> > b/drivers/gpu/drm/xe/xe_gt_debugfs.c
> > index c45486c2015a..1c3e673c8c60 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_debugfs.c
> > +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c
> > @@ -127,6 +127,16 @@ static int register_save_restore(struct seq_file *m,
> void *data)
> >  	return 0;
> >  }
> >
> > +static int fake_reset_failure(struct seq_file *m, void *data) {
> > +	struct xe_gt *gt = node_to_gt(m->private);
> > +
> > +	WRITE_ONCE(gt->reset_enabled, false);
> 
> Either you create functions for everything or you don't create functions for any
> of them. But be consistent.

Will address this in the next patch.
> 
> > +	xe_gt_reset_async(gt);
> > +
> > +	return 0;
> > +}
> > +
> >  static const struct drm_info_list debugfs_list[] = {
> >  	{"hw_engines", hw_engines, 0},
> >  	{"force_reset", force_reset, 0},
> > @@ -135,6 +145,7 @@ static const struct drm_info_list debugfs_list[] = {
> >  	{"steering", steering, 0},
> >  	{"ggtt", ggtt, 0},
> >  	{"register-save-restore", register_save_restore, 0},
> > +	{"fake_reset_failure", fake_reset_failure, 0},
> >  };
> >
> >  void xe_gt_debugfs_register(struct xe_gt *gt) diff --git
> > a/drivers/gpu/drm/xe/xe_gt_types.h
> b/drivers/gpu/drm/xe/xe_gt_types.h
> > index 7c47d67aa8be..7ec19ad0365d 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_types.h
> > +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> > @@ -175,6 +175,9 @@ struct xe_gt {
> >  		struct work_struct worker;
> >  	} reset;
> >
> > +	/** @reset_enabled: GT supports reset */
> > +	bool reset_enabled;
> > +
> >  	/** @tlb_invalidation: TLB invalidation state */
> >  	struct {
> >  		/** @seqno: TLB invalidation seqno, protected by CT lock */
> > --
> > 2.25.1
> >


More information about the Intel-xe mailing list