[PATCH v4] drm/xe: Use fault injection infrastructure to find issues at probe time

Francois Dugast francois.dugast at intel.com
Fri Sep 27 18:57:29 UTC 2024


On Fri, Sep 27, 2024 at 01:37:52PM -0400, Rodrigo Vivi wrote:
> On Fri, Sep 27, 2024 at 05:12:06PM +0200, Francois Dugast wrote:
> > The kernel fault injection infrastructure is used to test proper error
> > handling during probe. The return code of the functions using
> > ALLOW_ERROR_INJECTION() can be conditionnally modified at runtime by
> > tuning some debugfs entries. This requires CONFIG_FUNCTION_ERROR_INJECTION
> > (among others).
> > 
> > One way to use fault injection at probe time by making each of those
> > functions fail one at a time is:
> > 
> >     FAILTYPE=fail_function
> >     DEVICE="0000:00:08.0" # depends on the system
> >     ERRNO=-12 # -ENOMEM, can depend on the function
> > 
> >     echo N > /sys/kernel/debug/$FAILTYPE/task-filter
> >     echo 100 > /sys/kernel/debug/$FAILTYPE/probability
> >     echo 0 > /sys/kernel/debug/$FAILTYPE/interval
> >     echo -1 > /sys/kernel/debug/$FAILTYPE/times
> >     echo 0 > /sys/kernel/debug/$FAILTYPE/space
> >     echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
> > 
> >     modprobe xe
> >     echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
> > 
> >     grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
> >     cut -d ' ' -f 1 | while read -r FUNCTION ; do
> >         echo "Injecting fault in $FUNCTION"
> >         echo "" > /sys/kernel/debug/$FAILTYPE/inject
> >         echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
> >         printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
> >         echo $DEVICE > /sys/bus/pci/drivers/xe/bind
> >     done
> > 
> >     rmmod xe
> > 
> > It will also be integrated into IGT for systematic execution by CI.
> > 
> > v2: Wrappers are not needed in the cases covered by this patch, so
> >     remove them and use ALLOW_ERROR_INJECTION() directly.
> > 
> > v3: Document the use of fault injection at probe time in xe_pci_probe
> >     and refer to it where ALLOW_ERROR_INJECTION() is used.
> 
> I now have a feeling that we could have a xe_fault_injection component,
> that would be the only one including linux/fault-inject.h, the only
> place where these wrappers would be declared and everything documented.

The intention here is actually to avoid adding a driver-specific layer as
was done in i915. There is no wrapper anymore with this version (and the
previous one), only direct uses of the ALLOW_ERROR_INJECTION() macro, so I
am not sure what a xe_fault_injection component would contain besides an
"#include <linux/fault-inject.h>".

> 
> I feel that /* See xe_pci_probe() */ everywhere is a bit too much as
> weel...

This is an attempt to centralize thorough explanations and refer to it as
suggested by Jani. I believe documenting it around xe_pci_probe() makes
sense for the case of the fault injections brought by this patch because
they are only relevant for the execution of that function, during probe.

The approach and documentation would likely look a bit different when we
extend the use of fault injection in the driver to test error paths and
unwinding in the IOCTLs. Also, in that case we might need wrappers as in
the first version of this patch.

Francois

> 
> > 
> > Signed-off-by: Francois Dugast <francois.dugast at intel.com>
> > Cc: Lucas De Marchi <lucas.demarchi at intel.com>
> > Cc: Matthew Brost <matthew.brost at intel.com>
> > Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> > Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> > Cc: Jani Nikula <jani.nikula at intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_device.c    |  3 +++
> >  drivers/gpu/drm/xe/xe_ggtt.c      |  2 ++
> >  drivers/gpu/drm/xe/xe_guc_ads.c   |  3 +++
> >  drivers/gpu/drm/xe/xe_guc_ct.c    |  2 ++
> >  drivers/gpu/drm/xe/xe_guc_log.c   |  3 +++
> >  drivers/gpu/drm/xe/xe_guc_relay.c |  2 ++
> >  drivers/gpu/drm/xe/xe_pci.c       | 19 +++++++++++++++++++
> >  drivers/gpu/drm/xe/xe_pm.c        |  2 ++
> >  drivers/gpu/drm/xe/xe_sriov.c     |  3 +++
> >  drivers/gpu/drm/xe/xe_tile.c      |  3 +++
> >  drivers/gpu/drm/xe/xe_uc_fw.c     |  2 ++
> >  drivers/gpu/drm/xe/xe_wa.c        |  2 ++
> >  drivers/gpu/drm/xe/xe_wopcm.c     |  3 +++
> >  13 files changed, 49 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> > index 8e9b551c7033..ad70a1bdd476 100644
> > --- a/drivers/gpu/drm/xe/xe_device.c
> > +++ b/drivers/gpu/drm/xe/xe_device.c
> > @@ -6,6 +6,7 @@
> >  #include "xe_device.h"
> >  
> >  #include <linux/delay.h>
> > +#include <linux/fault-inject.h>
> >  #include <linux/units.h>
> >  
> >  #include <drm/drm_aperture.h>
> > @@ -382,6 +383,7 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
> >  err:
> >  	return ERR_PTR(err);
> >  }
> > +ALLOW_ERROR_INJECTION(xe_device_create, ERRNO); /* See xe_pci_probe() */
> >  
> >  static bool xe_driver_flr_disabled(struct xe_device *xe)
> >  {
> > @@ -550,6 +552,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
> >  
> >  	return 0;
> >  }
> > +ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO); /* See xe_pci_probe() */
> >  
> >  static void update_device_info(struct xe_device *xe)
> >  {
> > diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
> > index f68af56c3f86..47bfd9d2635d 100644
> > --- a/drivers/gpu/drm/xe/xe_ggtt.c
> > +++ b/drivers/gpu/drm/xe/xe_ggtt.c
> > @@ -5,6 +5,7 @@
> >  
> >  #include "xe_ggtt.h"
> >  
> > +#include <linux/fault-inject.h>
> >  #include <linux/io-64-nonatomic-lo-hi.h>
> >  #include <linux/sizes.h>
> >  
> > @@ -264,6 +265,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt)
> >  
> >  	return 0;
> >  }
> > +ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO); /* See xe_pci_probe() */
> >  
> >  static void xe_ggtt_invalidate(struct xe_ggtt *ggtt);
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> > index 66d4e5e95abd..04485461aa20 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> > @@ -5,6 +5,8 @@
> >  
> >  #include "xe_guc_ads.h"
> >  
> > +#include <linux/fault-inject.h>
> > +
> >  #include <drm/drm_managed.h>
> >  
> >  #include <generated/xe_wa_oob.h>
> > @@ -418,6 +420,7 @@ int xe_guc_ads_init(struct xe_guc_ads *ads)
> >  
> >  	return 0;
> >  }
> > +ALLOW_ERROR_INJECTION(xe_guc_ads_init, ERRNO); /* See xe_pci_probe() */
> >  
> >  /**
> >   * xe_guc_ads_init_post_hwconfig - initialize ADS post hwconfig load
> > diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> > index 4b95f75b1546..816dc897e29f 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> > @@ -8,6 +8,7 @@
> >  #include <linux/bitfield.h>
> >  #include <linux/circ_buf.h>
> >  #include <linux/delay.h>
> > +#include <linux/fault-inject.h>
> >  
> >  #include <kunit/static_stub.h>
> >  
> > @@ -209,6 +210,7 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
> >  	ct->state = XE_GUC_CT_STATE_DISABLED;
> >  	return 0;
> >  }
> > +ALLOW_ERROR_INJECTION(xe_guc_ct_init, ERRNO); /* See xe_pci_probe() */
> >  
> >  #define desc_read(xe_, guc_ctb__, field_)			\
> >  	xe_map_rd_field(xe_, &guc_ctb__->desc, 0,		\
> > diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
> > index a37ee3419428..651543721ce5 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_log.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_log.c
> > @@ -5,6 +5,8 @@
> >  
> >  #include "xe_guc_log.h"
> >  
> > +#include <linux/fault-inject.h>
> > +
> >  #include <drm/drm_managed.h>
> >  
> >  #include "xe_bo.h"
> > @@ -96,3 +98,4 @@ int xe_guc_log_init(struct xe_guc_log *log)
> >  
> >  	return 0;
> >  }
> > +ALLOW_ERROR_INJECTION(xe_guc_log_init, ERRNO); /* See xe_pci_probe() */
> > diff --git a/drivers/gpu/drm/xe/xe_guc_relay.c b/drivers/gpu/drm/xe/xe_guc_relay.c
> > index ade6162dc259..8f62de026724 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_relay.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_relay.c
> > @@ -5,6 +5,7 @@
> >  
> >  #include <linux/bitfield.h>
> >  #include <linux/delay.h>
> > +#include <linux/fault-inject.h>
> >  
> >  #include <drm/drm_managed.h>
> >  
> > @@ -355,6 +356,7 @@ int xe_guc_relay_init(struct xe_guc_relay *relay)
> >  
> >  	return drmm_add_action_or_reset(&xe->drm, __fini_relay, relay);
> >  }
> > +ALLOW_ERROR_INJECTION(xe_guc_relay_init, ERRNO); /* See xe_pci_probe() */
> >  
> >  static u32 to_relay_error(int err)
> >  {
> > diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> > index 41445245a287..7ffee06fab13 100644
> > --- a/drivers/gpu/drm/xe/xe_pci.c
> > +++ b/drivers/gpu/drm/xe/xe_pci.c
> > @@ -770,6 +770,25 @@ static void xe_pci_remove(struct pci_dev *pdev)
> >  	pci_set_drvdata(pdev, NULL);
> >  }
> >  
> > +/*
> > + * Probe the PCI device, initialize various parts of the driver.
> > + *
> > + * Fault injection is used to test the error paths of some initialization
> > + * functions called either directly from xe_pci_probe() or indirectly for
> > + * example through xe_device_probe(). Those functions use the kernel fault
> > + * injection capabilities infrastructure, see
> > + * Documentation/fault-injection/fault-injection.rst for details. The macro
> > + * ALLOW_ERROR_INJECTION() is used to conditionally skip function execution
> > + * at runtime and use a provided return value. The first requirement for
> > + * error injectable functions is proper handling of the error code by the
> > + * caller for recovery, which is always the case here. The second
> > + * requirement is that no state is changed before the first error return.
> > + * It is not strictly fullfilled for all initialization functions using the
> > + * ALLOW_ERROR_INJECTION() macro but this is acceptable because for those
> > + * error cases at probe time, the error code is simply propagated up by the
> > + * caller. Therefore there is no consequence on those specific callers when
> > + * function error injection skips the whole function.
> > + */
> >  static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> >  {
> >  	const struct xe_device_desc *desc = (const void *)ent->driver_data;
> > diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
> > index 33eb039053e4..40f7c844ed44 100644
> > --- a/drivers/gpu/drm/xe/xe_pm.c
> > +++ b/drivers/gpu/drm/xe/xe_pm.c
> > @@ -5,6 +5,7 @@
> >  
> >  #include "xe_pm.h"
> >  
> > +#include <linux/fault-inject.h>
> >  #include <linux/pm_runtime.h>
> >  
> >  #include <drm/drm_managed.h>
> > @@ -263,6 +264,7 @@ int xe_pm_init_early(struct xe_device *xe)
> >  
> >  	return 0;
> >  }
> > +ALLOW_ERROR_INJECTION(xe_pm_init_early, ERRNO); /* See xe_pci_probe() */
> >  
> >  /**
> >   * xe_pm_init - Initialize Xe Power Management
> > diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
> > index 69a066ef20c0..ef10782af656 100644
> > --- a/drivers/gpu/drm/xe/xe_sriov.c
> > +++ b/drivers/gpu/drm/xe/xe_sriov.c
> > @@ -3,6 +3,8 @@
> >   * Copyright © 2023 Intel Corporation
> >   */
> >  
> > +#include <linux/fault-inject.h>
> > +
> >  #include <drm/drm_managed.h>
> >  
> >  #include "regs/xe_regs.h"
> > @@ -119,6 +121,7 @@ int xe_sriov_init(struct xe_device *xe)
> >  
> >  	return drmm_add_action_or_reset(&xe->drm, fini_sriov, xe);
> >  }
> > +ALLOW_ERROR_INJECTION(xe_sriov_init, ERRNO); /* See xe_pci_probe() */
> >  
> >  /**
> >   * xe_sriov_print_info - Print basic SR-IOV information.
> > diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
> > index dda5268507d8..07cf7cfe4abd 100644
> > --- a/drivers/gpu/drm/xe/xe_tile.c
> > +++ b/drivers/gpu/drm/xe/xe_tile.c
> > @@ -3,6 +3,8 @@
> >   * Copyright © 2023 Intel Corporation
> >   */
> >  
> > +#include <linux/fault-inject.h>
> > +
> >  #include <drm/drm_managed.h>
> >  
> >  #include "xe_device.h"
> > @@ -129,6 +131,7 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
> >  
> >  	return 0;
> >  }
> > +ALLOW_ERROR_INJECTION(xe_tile_init_early, ERRNO); /* See xe_pci_probe() */
> >  
> >  static int tile_ttm_mgr_init(struct xe_tile *tile)
> >  {
> > diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
> > index eab9456e051f..087fb96f707e 100644
> > --- a/drivers/gpu/drm/xe/xe_uc_fw.c
> > +++ b/drivers/gpu/drm/xe/xe_uc_fw.c
> > @@ -4,6 +4,7 @@
> >   */
> >  
> >  #include <linux/bitfield.h>
> > +#include <linux/fault-inject.h>
> >  #include <linux/firmware.h>
> >  
> >  #include <drm/drm_managed.h>
> > @@ -797,6 +798,7 @@ int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
> >  
> >  	return err;
> >  }
> > +ALLOW_ERROR_INJECTION(xe_uc_fw_init, ERRNO); /* See xe_pci_probe() */
> >  
> >  static u32 uc_fw_ggtt_offset(struct xe_uc_fw *uc_fw)
> >  {
> > diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
> > index 22c148b1e996..94ea76b098ed 100644
> > --- a/drivers/gpu/drm/xe/xe_wa.c
> > +++ b/drivers/gpu/drm/xe/xe_wa.c
> > @@ -8,6 +8,7 @@
> >  #include <drm/drm_managed.h>
> >  #include <kunit/visibility.h>
> >  #include <linux/compiler_types.h>
> > +#include <linux/fault-inject.h>
> >  
> >  #include <generated/xe_wa_oob.h>
> >  
> > @@ -850,6 +851,7 @@ int xe_wa_init(struct xe_gt *gt)
> >  
> >  	return 0;
> >  }
> > +ALLOW_ERROR_INJECTION(xe_wa_init, ERRNO); /* See xe_pci_probe() */
> >  
> >  void xe_wa_dump(struct xe_gt *gt, struct drm_printer *p)
> >  {
> > diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
> > index 93c82825d896..ada0d0aa6b74 100644
> > --- a/drivers/gpu/drm/xe/xe_wopcm.c
> > +++ b/drivers/gpu/drm/xe/xe_wopcm.c
> > @@ -5,6 +5,8 @@
> >  
> >  #include "xe_wopcm.h"
> >  
> > +#include <linux/fault-inject.h>
> > +
> >  #include "regs/xe_guc_regs.h"
> >  #include "xe_device.h"
> >  #include "xe_force_wake.h"
> > @@ -268,3 +270,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
> >  
> >  	return ret;
> >  }
> > +ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO); /* See xe_pci_probe() */
> > -- 
> > 2.43.0
> > 


More information about the Intel-xe mailing list