[RFC PATCH 1/1] drm/xe: Add driver load error injection

Sun Aug 11 22:09:23 UTC 2024

On Sat, Aug 10, 2024 at 11:01:29AM -0500, Lucas De Marchi wrote:
> On Sat, Aug 10, 2024 at 01:41:51PM GMT, Matthew Brost wrote:
> > On Sat, Aug 10, 2024 at 12:16:32AM -0500, Lucas De Marchi wrote:
> > > On Fri, Aug 09, 2024 at 03:44:24PM GMT, Matthew Brost wrote:
> > > > Port over i915 driver load error injection.
> > > >
> > > 
> > > I don't like much the manual approach, but it's better to get the driver
> > > not exploding. Then we can think of replacing this.  Some comments below
> > 
> > Yep. I chatted with Rodrigo about this and we agreed their isn't a great
> > way with the existing kernel error injection to easily get coverage like
> > this plus a very simple test case [1]. Agree longterm we should not
> > invent our own things and come up with either a kernel or drm level
> > solution.
> > 
> > In the short term, yes this better than our driver exploding. View this
> > as a force probe blocker, so we need to get our driver fixed in a matter
> > of weeks and this seems like the only viable path for now.
> > 
> > [1] for i in {1..N}; do echo "Run $i"; modprobe xe inject_driver_load_error=$i; rmmod xe; done
> > 
> > > as I think some of the checks are not right.
> > > 
> > > 
> > > > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > > > ---
> > > > drivers/gpu/drm/xe/xe_device.c       | 31 ++++++++++++++++++++++++++++
> > > > drivers/gpu/drm/xe/xe_device.h       | 15 ++++++++++++++
> > > > drivers/gpu/drm/xe/xe_device_types.h |  4 ++++
> > > > drivers/gpu/drm/xe/xe_gt.c           |  5 +++++
> > > > drivers/gpu/drm/xe/xe_gt_sriov_pf.c  |  4 ++++
> > > > drivers/gpu/drm/xe/xe_guc.c          |  8 +++++++
> > > > drivers/gpu/drm/xe/xe_guc_ads.c      |  5 +++++
> > > > drivers/gpu/drm/xe/xe_guc_ct.c       |  4 ++++
> > > > drivers/gpu/drm/xe/xe_guc_log.c      |  5 +++++
> > > > drivers/gpu/drm/xe/xe_mmio.c         |  5 +++++
> > > > drivers/gpu/drm/xe/xe_module.c       |  5 +++++
> > > > drivers/gpu/drm/xe/xe_module.h       |  3 +++
> > > > drivers/gpu/drm/xe/xe_pci.c          |  9 ++++++++
> > > > drivers/gpu/drm/xe/xe_pm.c           |  8 +++++++
> > > > drivers/gpu/drm/xe/xe_sriov.c        |  8 ++++++-
> > > > drivers/gpu/drm/xe/xe_tile.c         |  4 ++++
> > > > drivers/gpu/drm/xe/xe_uc.c           |  4 ++++
> > > > drivers/gpu/drm/xe/xe_wa.c           |  5 +++++
> > > > drivers/gpu/drm/xe/xe_wopcm.c        |  4 ++++
> > > > 19 files changed, 135 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> > > > index 1aba6f9eaa19..f6cd13ed6d20 100644
> > > > --- a/drivers/gpu/drm/xe/xe_device.c
> > > > +++ b/drivers/gpu/drm/xe/xe_device.c
> > > > @@ -374,6 +374,10 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
> > > > 	if (WARN_ON(err))
> > > > 		goto err;
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		goto err;
> > > > +
> > > > 	return xe;
> > > >
> > > > err:
> > > > @@ -477,6 +481,10 @@ static int xe_set_dma_info(struct xe_device *xe)
> > > > 	if (err)
> > > > 		goto mask_err;
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		goto mask_err;
> > > > +
> > > > 	return 0;
> > > >
> > > > mask_err:
> > > > @@ -580,6 +588,10 @@ int xe_device_probe_early(struct xe_device *xe)
> > > > 	if (err)
> > > > 		return err;
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	xe->wedged.mode = xe_modparam.wedged_mode;
> > > >
> > > > 	return 0;
> > > > @@ -995,3 +1007,22 @@ void xe_device_declare_wedged(struct xe_device *xe)
> > > > 	for_each_gt(gt, xe, id)
> > > > 		xe_gt_declare_wedged(gt);
> > > > }
> > > > +
> > > > +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
> > > > +int __xe_device_inject_driver_load_error(struct xe_device *xe, int err,
> > > > +					 const char *func, int line)
> > > > +{
> > > > +        if (xe->inject_driver_load_error >= xe_modparam.inject_driver_load_error)
> > > > +                return 0;
> > > > +
> > > > +        if (++xe->inject_driver_load_error < xe_modparam.inject_driver_load_error)
> > > > +                return 0;
> > > > +
> > > > +        drm_info(&xe->drm, "Injecting failure %d at checkpoint %u [%s:%d]\n",
> > > > +                 err, xe->inject_driver_load_error, func, line);
> > > 
> > > I wonder if it'd wouldn't be sufficient, for debugging sake to use
> > > "... [%pF]", __builtin_return_address(0)
> > > 
> > 
> > I think that probably works. I their a way to decode the line number
> > though? The line number is a bit more helpful can some hex value.
> 
> the format is actually
> 
> theres addr2line, but since I may or may not have tool handy depending
> on the machine, I always use gdb
> 
> 	gdb xe.ko
> 	...
> 	l *(versatile_init+0x9)
> 
> which gives the tranlation. Looking at https://www.kernel.org/doc/Documentation/printk-formats.txt
> (Symbols/Function Pointers) it actually suggests using "%pS" as print
> format when using __builtin_return_address(0)
> 
> Anyway, I won't oppose leaving this they way you coded though.
> 

Yes, I use gdb too but typically don't build with debug symbols or if
something pops in CI unsure how to grab the build to determine the line
number.

> > 
> > > > +
> > > > +        xe_modparam.inject_driver_load_error = 0;
> > > > +        return err;
> > > > +
> > > > +}
> > > > +#endif
> > > > diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> > > > index db6cc8d0d6b8..4f7e9cdac9fe 100644
> > > > --- a/drivers/gpu/drm/xe/xe_device.h
> > > > +++ b/drivers/gpu/drm/xe/xe_device.h
> > > > @@ -179,4 +179,19 @@ void xe_device_declare_wedged(struct xe_device *xe);
> > > > struct xe_file *xe_file_get(struct xe_file *xef);
> > > > void xe_file_put(struct xe_file *xef);
> > > >
> > > > +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
> > > > +
> > > > +int __xe_device_inject_driver_load_error(struct xe_device *xe, int err,
> > > > +					 const char *func, int line);
> > > > +
> > > > +#define xe_device_inject_driver_load_error(__xe) \
> > > > +	__xe_device_inject_driver_load_error(__xe, -ENODEV, __func__, __LINE__)
> > > 
> > > is -ENODEV the only error we may want to inject?
> > > 
> > 
> > I was little lazy here, should be easy enough to add another macro (the
> > i915 does this) to control the error value. But in general I don't think
> > we really care about the error on driver load as we pretty much always
> > just fail the driver load.
> 
> ok, if we don't care about the error code in any existent check, then
> let's go with the simple approach.
>

+1

> > 
> > > > +
> > > > +#else
> > > > +
> > > > +#define xe_device_inject_driver_load_error(__xe) \
> > > > +	({ BUILD_BUG_ON_INVALID(__xe); 0; })
> > > 
> > > if the suggestion above is not feasible, then maybe better to leave
> > > this outside the ifdef and provide a dummy
> > > __xe_device_inject_driver_load_error() so we type-check?  Or we can
> > > type check manually too.
> > > 
> > 
> > You mean?
> > 
> > static inline xe_device_inject_driver_load_error(struct xe_device *xe) {}
> 
> 	       __xe_device_...
> 
> so it matches the other side of the ifdef. Then you leave the macro
> outside of the ifdef and always call the function.  The compiler should
> eliminate the function call.
> 
> > 
> > So we get type checking?
> 
> yeah, otherwise a mistake like
> xe_device_inject_driver_load_error(&xe->drm) goes unnoticed depending on
> the build config.
> 
> The other alternative I proposed is to use
> __builtin_types_compatible_p() if we keep the macro the way you coded.
> 

Agree we want something with type checking.

> > 
> > > > +
> > > > +#endif
> > > > +
> > > > #endif
> > > > diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> > > > index 5b7292a9a66d..3e620314eec2 100644
> > > > --- a/drivers/gpu/drm/xe/xe_device_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_device_types.h
> > > > @@ -484,6 +484,10 @@ struct xe_device {
> > > > 		int mode;
> > > > 	} wedged;
> > > >
> > > > +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
> > > > +	int inject_driver_load_error;
> > > > +#endif
> > > > +
> > > > #ifdef TEST_VM_OPS_ERROR
> > > > 	/**
> > > > 	 * @vm_inject_error_position: inject errors at different places in VM
> > > > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> > > > index 58895ed22f6e..8209079c0334 100644
> > > > --- a/drivers/gpu/drm/xe/xe_gt.c
> > > > +++ b/drivers/gpu/drm/xe/xe_gt.c
> > > > @@ -389,6 +389,10 @@ int xe_gt_init_early(struct xe_gt *gt)
> > > > 	xe_pcode_init(gt);
> > > > 	spin_lock_init(&gt->global_invl_lock);
> > > >
> > > > +	err = xe_device_inject_driver_load_error(gt_to_xe(gt));
> > > 
> > > for the ones I check before this, things are ok. For this and some
> > > below, I believe they are misplaced because this currently is not a
> > > failure point. Adding a return error here is wrong IMO because it
> > > changes the expectation: the function was expecting that after
> > > xe_wa_init() there are no more failure points so nothing to unwind. If
> > > you inject an error here, you may be then trying to fix a non-existent
> > > bug.
> > > 
> > 
> > I pretty much agree. I really hacked this together as fast as I could
> > without tons of deep thought as a PoC + to see what exploded (quite a
> > bit so far).
> > 
> > But our error handling should be robust enough if I coded this a bit
> > goofy, it should work.
> > 
> > > From what I can see, the a valid point to inject an error is always when
> > > we are already checking for an error (and if we aren't then we should).
> > > So they should be always in the pattern as the ones above...
> > > 
> > > 	err = foo();
> > > 	if (err)
> > > 		return err;
> > > 
> > > 	err = xe_device_inject_driver_load_error(xe);
> > > 	if (err)
> > > 		return err;
> > > 
> > > I think that also means we can do better:
> > > 
> > > 	err = foo();
> > > 	err = xe_device_inject_driver_load_error(xe, err);
> > > 	if (err)
> > > 		return err;
> > > 
> > > And in xe_device_inject_driver_load_error() we implement it
> > > to first check `err` and return it if != 0. This way we only even try to
> > > inject errors in valid points, and don't risk getting problems if a
> > > later patch mistakenly update the `return err;` with `goto fail;` in
> > > one place and forgets the other.
> > > 
> > 
> > I do like this (xe, err) semantics with if != 0, but think it should be
> > seperate macro / function than the one in place as (xe, 0) for cases
> > without an existing error is a bit goofy.
> 
> works for me
> 

+1

> > 
> > > 
> > > 
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	return 0;
> > > > }
> > > >
> > > > @@ -570,6 +574,7 @@ int xe_gt_init_hwconfig(struct xe_gt *gt)
> > > > 	xe_gt_topology_init(gt);
> > > > 	xe_gt_mcr_init(gt);
> > > >
> > > > +	err = xe_device_inject_driver_load_error(gt_to_xe(gt));
> > > > out_fw:
> > > > 	xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
> > > > out:
> > > > diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
> > > > index ef239440963c..897815ddf954 100644
> > > > --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
> > > > +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
> > > > @@ -57,6 +57,10 @@ int xe_gt_sriov_pf_init_early(struct xe_gt *gt)
> > > > 	if (err)
> > > > 		return err;
> > > >
> > > > +	err = xe_device_inject_driver_load_error(gt_to_xe(gt));
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	return 0;
> > > > }
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
> > > > index de0fe9e65746..980691c178c4 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc.c
> > > > +++ b/drivers/gpu/drm/xe/xe_guc.c
> > > > @@ -354,6 +354,10 @@ int xe_guc_init(struct xe_guc *guc)
> > > > 	if (ret)
> > > > 		goto out;
> > > >
> > > > +	ret = xe_device_inject_driver_load_error(guc_to_xe(guc));
> > > > +	if (ret)
> > > > +		goto out;
> > > > +
> > > > 	guc_init_params(guc);
> > > >
> > > > 	xe_guc_comm_init_early(guc);
> > > > @@ -411,6 +415,10 @@ int xe_guc_init_post_hwconfig(struct xe_guc *guc)
> > > > 	if (ret)
> > > > 		return ret;
> > > >
> > > > +	ret = xe_device_inject_driver_load_error(guc_to_xe(guc));
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > 	return xe_guc_ads_init_post_hwconfig(&guc->ads);
> > > > }
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> > > > index d1902a8581ca..1944912ef9b8 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> > > > +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> > > > @@ -402,6 +402,7 @@ int xe_guc_ads_init(struct xe_guc_ads *ads)
> > > > 	struct xe_gt *gt = ads_to_gt(ads);
> > > > 	struct xe_tile *tile = gt_to_tile(gt);
> > > > 	struct xe_bo *bo;
> > > > +	int err;
> > > >
> > > > 	ads->golden_lrc_size = calculate_golden_lrc_size(ads);
> > > > 	ads->regset_size = calculate_regset_size(gt);
> > > > @@ -416,6 +417,10 @@ int xe_guc_ads_init(struct xe_guc_ads *ads)
> > > >
> > > > 	ads->bo = bo;
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	return 0;
> > > > }
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> > > > index beeeb120d1fc..76a26aaabb13 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> > > > +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> > > > @@ -197,6 +197,10 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
> > > > 	if (err)
> > > > 		return err;
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	xe_gt_assert(gt, ct->state == XE_GUC_CT_STATE_NOT_INITIALIZED);
> > > > 	ct->state = XE_GUC_CT_STATE_DISABLED;
> > > > 	return 0;
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
> > > > index a37ee3419428..f26c37e3ee3a 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc_log.c
> > > > +++ b/drivers/gpu/drm/xe/xe_guc_log.c
> > > > @@ -82,6 +82,7 @@ int xe_guc_log_init(struct xe_guc_log *log)
> > > > 	struct xe_device *xe = log_to_xe(log);
> > > > 	struct xe_tile *tile = gt_to_tile(log_to_gt(log));
> > > > 	struct xe_bo *bo;
> > > > +	int err;
> > > >
> > > > 	bo = xe_managed_bo_create_pin_map(xe, tile, guc_log_size(),
> > > > 					  XE_BO_FLAG_SYSTEM |
> > > > @@ -94,5 +95,9 @@ int xe_guc_log_init(struct xe_guc_log *log)
> > > > 	log->bo = bo;
> > > > 	log->level = xe_modparam.guc_log_level;
> > > >
> > > > +	err = xe_device_inject_driver_load_error(log_to_xe(log));
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	return 0;
> > > > }
> > > > diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c
> > > > index bdcc7282385c..1d52b1f57f4c 100644
> > > > --- a/drivers/gpu/drm/xe/xe_mmio.c
> > > > +++ b/drivers/gpu/drm/xe/xe_mmio.c
> > > > @@ -136,6 +136,11 @@ int xe_mmio_probe_tiles(struct xe_device *xe)
> > > > {
> > > > 	size_t tile_mmio_size = SZ_16M;
> > > > 	size_t tile_mmio_ext_size = xe->info.tile_mmio_ext_size;
> > > > +	int err;
> > > > +
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		return err;
> > > 
> > > yeah... I think this is a valid one, because it means
> > > "handle errors from this function". As long as it's the very first thing
> > > in a function, not leaving side effects, it should be fine, And in this
> > > case I'd do:
> > > 
> > 
> > Yep, I think beginning of most functions in the driver load call stack
> > is likely the correct placement. Again didn't really have time to do
> > this perfectly. I'm hoping to find someone to own this full time next
> > week and your feedback here it helpful to get this done correctly.`
> 
> humn... to be able to set the value to an arbitrary high value,
> shouldn't we at some point force the param value to 0 so we stop
> checking for error injection after the probe part?
> 

That's a good point. Once the driver load completes we should turn off
this error injection by setting the modparam to 0. 

> > 
> > > 	err = xe_device_inject_driver_load_error(xe, 0);
> > > 
> > > to follow the logic I mentioned previously.
> > > 
> > > >
> > > > 	mmio_multi_tile_setup(xe, tile_mmio_size);
> > > > 	mmio_extension_setup(xe, tile_mmio_size, tile_mmio_ext_size);
> > > > diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
> > > > index 7bb99e451fcc..972b64a9f514 100644
> > > > --- a/drivers/gpu/drm/xe/xe_module.c
> > > > +++ b/drivers/gpu/drm/xe/xe_module.c
> > > > @@ -53,6 +53,11 @@ module_param_named_unsafe(force_probe, xe_modparam.force_probe, charp, 0400);
> > > > MODULE_PARM_DESC(force_probe,
> > > > 		 "Force probe options for specified devices. See CONFIG_DRM_XE_FORCE_PROBE for details.");
> > > >
> > > > +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
> > > > +module_param_named_unsafe(inject_driver_load_error, xe_modparam.inject_driver_load_error, int, 0600);
> > > > +MODULE_PARM_DESC(inject_driver_load_error, "Inject driver load error");
> > > 
> > > hum... are we doing this when loading the module?  I guess
> > > inject_driver_probe_error would be better as those tests are not per
> > > module but per device (and instead of modprobe, we could test by
> > > doing bind/unbind).
> > 
> > So s/inject_driver_load_error/inject_driver_probe_error/
> 
> yes
> 

+1

> > 
> > Makes sense.
> > 
> > Matt
> 
> thanks a lot for doing this and uncovering the existent bugs
> 

No problem. I have it in a working state here [1] and hoping to hand
this off, I think we may discuss this tomorrow. More error injection
points are needed and I suspect when more are added, more explosions
will be seen.

Matt

[1] https://patchwork.freedesktop.org/series/137114/

> Lucas De Marchi
> 
> > 
> > > 
> > > thanks
> > > Lucas De Marchi
> > > 
> > > > +#endif
> > > > +
> > > > #ifdef CONFIG_PCI_IOV
> > > > module_param_named(max_vfs, xe_modparam.max_vfs, uint, 0400);
> > > > MODULE_PARM_DESC(max_vfs,
> > > > diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
> > > > index 61a0d28a28c8..409ea10be942 100644
> > > > --- a/drivers/gpu/drm/xe/xe_module.h
> > > > +++ b/drivers/gpu/drm/xe/xe_module.h
> > > > @@ -20,6 +20,9 @@ struct xe_modparam {
> > > > 	char *force_probe;
> > > > #ifdef CONFIG_PCI_IOV
> > > > 	unsigned int max_vfs;
> > > > +#endif
> > > > +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
> > > > +	int inject_driver_load_error;
> > > > #endif
> > > > 	int wedged_mode;
> > > > };
> > > > diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> > > > index f818aa69f3ca..8b278c83128a 100644
> > > > --- a/drivers/gpu/drm/xe/xe_pci.c
> > > > +++ b/drivers/gpu/drm/xe/xe_pci.c
> > > > @@ -629,6 +629,10 @@ static int xe_info_init_early(struct xe_device *xe,
> > > > 	if (err)
> > > > 		return err;
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	return 0;
> > > > }
> > > >
> > > > @@ -645,6 +649,7 @@ static int xe_info_init(struct xe_device *xe,
> > > > 	u32 graphics_gmdid_revid = 0, media_gmdid_revid = 0;
> > > > 	struct xe_tile *tile;
> > > > 	struct xe_gt *gt;
> > > > +	int err;
> > > > 	u8 id;
> > > >
> > > > 	/*
> > > > @@ -745,6 +750,10 @@ static int xe_info_init(struct xe_device *xe,
> > > > 		gt->info.id = xe->info.gt_count++;
> > > > 	}
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	return 0;
> > > > }
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
> > > > index 9f3c14fd9f33..64d992c12364 100644
> > > > --- a/drivers/gpu/drm/xe/xe_pm.c
> > > > +++ b/drivers/gpu/drm/xe/xe_pm.c
> > > > @@ -231,6 +231,10 @@ int xe_pm_init_early(struct xe_device *xe)
> > > > 	if (err)
> > > > 		return err;
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	return 0;
> > > > }
> > > >
> > > > @@ -264,6 +268,10 @@ int xe_pm_init(struct xe_device *xe)
> > > >
> > > > 	xe_pm_runtime_init(xe);
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	return 0;
> > > > }
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
> > > > index 5a1d65e4f19f..1e738f1d80df 100644
> > > > --- a/drivers/gpu/drm/xe/xe_sriov.c
> > > > +++ b/drivers/gpu/drm/xe/xe_sriov.c
> > > > @@ -102,11 +102,17 @@ static void fini_sriov(struct drm_device *drm, void *arg)
> > > >  */
> > > > int xe_sriov_init(struct xe_device *xe)
> > > > {
> > > > +	int err;
> > > > +
> > > > 	if (!IS_SRIOV(xe))
> > > > 		return 0;
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	if (IS_SRIOV_PF(xe)) {
> > > > -		int err = xe_sriov_pf_init_early(xe);
> > > > +		err = xe_sriov_pf_init_early(xe);
> > > >
> > > > 		if (err)
> > > > 			return err;
> > > > diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
> > > > index 15ea0a942f67..2d25c7b59b0d 100644
> > > > --- a/drivers/gpu/drm/xe/xe_tile.c
> > > > +++ b/drivers/gpu/drm/xe/xe_tile.c
> > > > @@ -124,6 +124,10 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
> > > > 	if (IS_ERR(tile->primary_gt))
> > > > 		return PTR_ERR(tile->primary_gt);
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	return 0;
> > > > }
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c
> > > > index 0d073a9987c2..a3786020838b 100644
> > > > --- a/drivers/gpu/drm/xe/xe_uc.c
> > > > +++ b/drivers/gpu/drm/xe/xe_uc.c
> > > > @@ -62,6 +62,10 @@ int xe_uc_init(struct xe_uc *uc)
> > > > 	if (ret)
> > > > 		goto err;
> > > >
> > > > +	ret = xe_device_inject_driver_load_error(uc_to_xe(uc));
> > > > +	if (ret)
> > > > +		goto err;
> > > > +
> > > > 	return 0;
> > > >
> > > > err:
> > > > diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
> > > > index 564e32e44e3b..e558715d8027 100644
> > > > --- a/drivers/gpu/drm/xe/xe_wa.c
> > > > +++ b/drivers/gpu/drm/xe/xe_wa.c
> > > > @@ -821,6 +821,7 @@ int xe_wa_init(struct xe_gt *gt)
> > > > 	struct xe_device *xe = gt_to_xe(gt);
> > > > 	size_t n_oob, n_lrc, n_engine, n_gt, total;
> > > > 	unsigned long *p;
> > > > +	int err;
> > > >
> > > > 	n_gt = BITS_TO_LONGS(ARRAY_SIZE(gt_was));
> > > > 	n_engine = BITS_TO_LONGS(ARRAY_SIZE(engine_was));
> > > > @@ -840,6 +841,10 @@ int xe_wa_init(struct xe_gt *gt)
> > > > 	p += n_lrc;
> > > > 	gt->wa_active.oob = p;
> > > >
> > > > +	err = xe_device_inject_driver_load_error(xe);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > 	return 0;
> > > > }
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
> > > > index d3a99157e523..edaad1c93e58 100644
> > > > --- a/drivers/gpu/drm/xe/xe_wopcm.c
> > > > +++ b/drivers/gpu/drm/xe/xe_wopcm.c
> > > > @@ -263,6 +263,10 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
> > > > 		return -E2BIG;
> > > > 	}
> > > >
> > > > +	ret = xe_device_inject_driver_load_error(xe);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > 	if (!locked)
> > > > 		ret = __wopcm_init_regs(xe, gt, wopcm);
> > > >
> > > > --
> > > > 2.34.1
> > > >