[Intel-xe] [PATCH v5 2/2] drm/xe: Introduce fault injection for gt reset

Ghimiray, Himal Prasad himal.prasad.ghimiray at intel.com
Mon Jul 24 08:38:20 UTC 2023


Hi Rodrigo,

> -----Original Message-----
> From: Vivi, Rodrigo <rodrigo.vivi at intel.com>
> Sent: 20 July 2023 21:59
> To: Ghimiray, Himal Prasad <himal.prasad.ghimiray at intel.com>
> Cc: intel-xe at lists.freedesktop.org; De Marchi, Lucas
> <lucas.demarchi at intel.com>
> Subject: Re: [Intel-xe] [PATCH v5 2/2] drm/xe: Introduce fault injection for gt
> reset
> 
> On Tue, Jul 18, 2023 at 07:02:16PM +0530, Himal Prasad Ghimiray wrote:
> > To trigger gt reset failure:
> >  echo 100 >  /sys/kernel/debug/dri/<cardX>/fail_gt_reset/probability
> >  echo 2 >  /sys/kernel/debug/dri/<cardX>/fail_gt_reset/times
> 
> why 2 and not 1?

We rely on should_fail() returning true twice:
the first time to skip the xe_uc_reset_prepare(&gt->uc) check in xe_gt_reset_async(),
and the second time to take the failure path in gt_reset(), which sends the uevent.

> 
> anyway, neat solution!
> 
> Reviewed-by: Rodrigo Vivi <rodrigo.vivi at intel.com>

Thanks for the review.

BR
Himal 
> 
> >
> > Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> > Cc: Lucas De Marchi <lucas.demarchi at intel.com>
> > Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_debugfs.c | 10 ++++++++++
> >  drivers/gpu/drm/xe/xe_gt.c      |  8 +++++++-
> >  drivers/gpu/drm/xe/xe_gt.h      | 14 ++++++++++++++
> >  3 files changed, 31 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_debugfs.c
> > b/drivers/gpu/drm/xe/xe_debugfs.c index 7827a785b020..08d5bdf4cf61
> > 100644
> > --- a/drivers/gpu/drm/xe/xe_debugfs.c
> > +++ b/drivers/gpu/drm/xe/xe_debugfs.c
> > @@ -5,6 +5,7 @@
> >
> >  #include "xe_debugfs.h"
> >
> > +#include <linux/fault-inject.h>
> >  #include <linux/string_helpers.h>
> >
> >  #include <drm/drm_debugfs.h>
> > @@ -20,6 +21,10 @@
> >  #include "xe_vm.h"
> >  #endif
> >
> > +#ifdef CONFIG_FAULT_INJECTION
> > +DECLARE_FAULT_ATTR(gt_reset_failure);
> > +#endif
> > +
> >  static struct xe_device *node_to_xe(struct drm_info_node *node)  {
> >  	return to_xe_device(node->minor->dev); @@ -131,4 +136,9 @@
> void
> > xe_debugfs_register(struct xe_device *xe)
> >
> >  	for_each_gt(gt, xe, id)
> >  		xe_gt_debugfs_register(gt);
> > +
> > +#ifdef CONFIG_FAULT_INJECTION
> > +	fault_create_debugfs_attr("fail_gt_reset", root, &gt_reset_failure);
> > +#endif
> > +
> >  }
> > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> > index 1db4d610f2fd..370d4b96e616 100644
> > --- a/drivers/gpu/drm/xe/xe_gt.c
> > +++ b/drivers/gpu/drm/xe/xe_gt.c
> > @@ -525,6 +525,11 @@ static int gt_reset(struct xe_gt *gt)
> >
> >  	xe_gt_info(gt, "reset started\n");
> >
> > +	if (xe_fault_inject_gt_reset()) {
> > +		err = -ECANCELED;
> > +		goto err_fail;
> > +	}
> > +
> >  	xe_gt_sanitize(gt);
> >
> >  	xe_device_mem_access_get(gt_to_xe(gt));
> > @@ -562,6 +567,7 @@ static int gt_reset(struct xe_gt *gt)
> >  err_msg:
> >  	XE_WARN_ON(xe_uc_start(&gt->uc));
> >  	xe_device_mem_access_put(gt_to_xe(gt));
> > +err_fail:
> >  	xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
> >
> >  	/* Notify userspace about gt reset failure */ @@ -582,7 +588,7 @@
> > void xe_gt_reset_async(struct xe_gt *gt)
> >  	xe_gt_info(gt, "trying reset\n");
> >
> >  	/* Don't do a reset while one is already in flight */
> > -	if (xe_uc_reset_prepare(&gt->uc))
> > +	if (!xe_fault_inject_gt_reset() && xe_uc_reset_prepare(&gt->uc))
> >  		return;
> >
> >  	xe_gt_info(gt, "reset queued\n");
> > diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
> > index 7298653a73de..caded203a8a0 100644
> > --- a/drivers/gpu/drm/xe/xe_gt.h
> > +++ b/drivers/gpu/drm/xe/xe_gt.h
> > @@ -7,6 +7,7 @@
> >  #define _XE_GT_H_
> >
> >  #include <drm/drm_util.h>
> > +#include <linux/fault-inject.h>
> >
> >  #include "xe_device_types.h"
> >  #include "xe_hw_engine.h"
> > @@ -16,6 +17,19 @@
> >  		for_each_if(((hwe__) = (gt__)->hw_engines + (id__)) && \
> >  			  xe_hw_engine_is_valid((hwe__)))
> >
> > +#ifdef CONFIG_FAULT_INJECTION
> > +extern struct fault_attr gt_reset_failure; static inline bool
> > +xe_fault_inject_gt_reset(void) {
> > +	return should_fail(&gt_reset_failure, 1); } #else static inline bool
> > +xe_fault_inject_gt_reset(void) {
> > +	return false;
> > +}
> > +#endif
> > +
> >  struct xe_gt *xe_gt_alloc(struct xe_tile *tile);  int
> > xe_gt_init_early(struct xe_gt *gt);  int xe_gt_init(struct xe_gt *gt);
> > --
> > 2.25.1
> >


More information about the Intel-xe mailing list