[PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time

Rodrigo Vivi rodrigo.vivi at intel.com
Mon Sep 23 18:05:01 UTC 2024


On Fri, Sep 20, 2024 at 03:18:34PM +0200, Francois Dugast wrote:
> The kernel fault injection infrastructure is used to test proper error
> handling during probe. In particular, ALLOW_ERROR_INJECTION() is added
> directly to functions which fullfill the injectable functions
> requirements:
> fault-injection.html#requirements-for-the-error-injectable-functions

Okay, I read this. This was my biggest initial question here...

> 
> Otherwise a helper function is added and called in the beginning of the
> function where the fault is to be injected.
> 
> The return code of the functions using ALLOW_ERROR_INJECTION() can be
> conditionnally modified at runtime by tuning some debugfs entries. This
> requires CONFIG_FUNCTION_ERROR_INJECTION (among others).
> 
> One way to use fault injection at probe time by making each of those
> functions fail one at a time is:
> 
>     FAILTYPE=fail_function
>     DEVICE="0000:00:08.0" # depends on the system
>     ERRNO=-12 # -ENOMEM, can depend on the function
> 
>     echo N > /sys/kernel/debug/$FAILTYPE/task-filter
>     echo 100 > /sys/kernel/debug/$FAILTYPE/probability
>     echo 0 > /sys/kernel/debug/$FAILTYPE/interval
>     echo -1 > /sys/kernel/debug/$FAILTYPE/times
>     echo 0 > /sys/kernel/debug/$FAILTYPE/space
>     echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
> 
>     modprobe xe
>     echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
> 
>     grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
>     cut -d ' ' -f 1 | while read -r FUNCTION ; do
>         echo "Injecting fault in $FUNCTION"
>         echo "" > /sys/kernel/debug/$FAILTYPE/inject
>         echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
>         printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
>         echo $DEVICE > /sys/bus/pci/drivers/xe/bind
>     done
> 
>     rmmod xe
> 
> It will also be integrated into IGT for systematic execution by CI.
> 
> Signed-off-by: Francois Dugast <francois.dugast at intel.com>
> Cc: Lucas De Marchi <lucas.demarchi at intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device.c    | 12 ++++++++++++
>  drivers/gpu/drm/xe/xe_ggtt.c      |  2 ++
>  drivers/gpu/drm/xe/xe_guc_ads.c   | 13 +++++++++++++
>  drivers/gpu/drm/xe/xe_guc_ct.c    | 11 +++++++++++
>  drivers/gpu/drm/xe/xe_guc_log.c   | 13 +++++++++++++
>  drivers/gpu/drm/xe/xe_guc_relay.c | 11 +++++++++++
>  drivers/gpu/drm/xe/xe_pm.c        | 11 +++++++++++
>  drivers/gpu/drm/xe/xe_sriov.c     | 15 ++++++++++++++-
>  drivers/gpu/drm/xe/xe_tile.c      | 12 ++++++++++++
>  drivers/gpu/drm/xe/xe_uc_fw.c     | 11 +++++++++++
>  drivers/gpu/drm/xe/xe_wa.c        | 12 ++++++++++++
>  drivers/gpu/drm/xe/xe_wopcm.c     |  3 +++
>  12 files changed, 125 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index cb5a9fd820cf..d26352ecf75e 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -6,6 +6,7 @@
>  #include "xe_device.h"
>  
>  #include <linux/delay.h>
> +#include <linux/fault-inject.h>
>  #include <linux/units.h>
>  
>  #include <drm/drm_aperture.h>
> @@ -300,12 +301,22 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
>  	ttm_device_fini(&xe->ttm);
>  }
>  
> +static noinline int fault_inject_device_create(void)
> +{
> +	return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_device_create, ERRNO);
> +
>  struct xe_device *xe_device_create(struct pci_dev *pdev,
>  				   const struct pci_device_id *ent)
>  {
>  	struct xe_device *xe;
>  	int err;
>  
> +	err = fault_inject_device_create();

But then I don't understand here. To me it looks like this function would
fulfill the criteria. It is not doing any clean up on the err goto,
nor changingn any state before the first error return.

Why can't we go directly?

Perhaps we should add a comment on each of this new fault_inject
functions to explain why that was chosen instead of the direct
macro assignment?

> +	if (err)
> +		return ERR_PTR(err);
> +
>  	xe_display_driver_set_hooks(&driver);
>  
>  	err = drm_aperture_remove_conflicting_pci_framebuffers(pdev, &driver);
> @@ -548,6 +559,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
>  
>  	return 0;
>  }
> +ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);
>  
>  static void update_device_info(struct xe_device *xe)
>  {
> diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
> index f68af56c3f86..4906e3f3150b 100644
> --- a/drivers/gpu/drm/xe/xe_ggtt.c
> +++ b/drivers/gpu/drm/xe/xe_ggtt.c
> @@ -5,6 +5,7 @@
>  
>  #include "xe_ggtt.h"
>  
> +#include <linux/fault-inject.h>
>  #include <linux/io-64-nonatomic-lo-hi.h>
>  #include <linux/sizes.h>
>  
> @@ -264,6 +265,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt)
>  
>  	return 0;
>  }
> +ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO);
>  
>  static void xe_ggtt_invalidate(struct xe_ggtt *ggtt);
>  
> diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> index 66d4e5e95abd..e366043eb4b8 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> @@ -5,6 +5,8 @@
>  
>  #include "xe_guc_ads.h"
>  
> +#include <linux/fault-inject.h>
> +
>  #include <drm/drm_managed.h>
>  
>  #include <generated/xe_wa_oob.h>
> @@ -396,12 +398,23 @@ static int calculate_waklv_size(struct xe_guc_ads *ads)
>  
>  #define MAX_GOLDEN_LRC_SIZE	(SZ_4K * 64)
>  
> +static noinline int fault_inject_guc_ads_init(void)
> +{
> +	return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_guc_ads_init, ERRNO);
> +
>  int xe_guc_ads_init(struct xe_guc_ads *ads)
>  {
>  	struct xe_device *xe = ads_to_xe(ads);
>  	struct xe_gt *gt = ads_to_gt(ads);
>  	struct xe_tile *tile = gt_to_tile(gt);
>  	struct xe_bo *bo;
> +	int ret;
> +
> +	ret = fault_inject_guc_ads_init();
> +	if (ret)
> +		return ret;
>  
>  	ads->golden_lrc_size = calculate_golden_lrc_size(ads);
>  	ads->regset_size = calculate_regset_size(gt);
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index 4b95f75b1546..61967ddd319f 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -8,6 +8,7 @@
>  #include <linux/bitfield.h>
>  #include <linux/circ_buf.h>
>  #include <linux/delay.h>
> +#include <linux/fault-inject.h>
>  
>  #include <kunit/static_stub.h>
>  
> @@ -165,6 +166,12 @@ static void primelockdep(struct xe_guc_ct *ct)
>  	fs_reclaim_release(GFP_KERNEL);
>  }
>  
> +static noinline int fault_inject_guc_ct_init(void)
> +{
> +	return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_guc_ct_init, ERRNO);
> +
>  int xe_guc_ct_init(struct xe_guc_ct *ct)
>  {
>  	struct xe_device *xe = ct_to_xe(ct);
> @@ -173,6 +180,10 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
>  	struct xe_bo *bo;
>  	int err;
>  
> +	err = fault_inject_guc_ct_init();
> +	if (err)
> +		return err;
> +
>  	xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE));
>  
>  	ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", 0);
> diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
> index a37ee3419428..a3e54f1bb0c3 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log.c
> +++ b/drivers/gpu/drm/xe/xe_guc_log.c
> @@ -5,6 +5,8 @@
>  
>  #include "xe_guc_log.h"
>  
> +#include <linux/fault-inject.h>
> +
>  #include <drm/drm_managed.h>
>  
>  #include "xe_bo.h"
> @@ -77,11 +79,22 @@ void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p)
>  	}
>  }
>  
> +static noinline int fault_inject_guc_log_init(void)
> +{
> +	return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_guc_log_init, ERRNO);
> +
>  int xe_guc_log_init(struct xe_guc_log *log)
>  {
>  	struct xe_device *xe = log_to_xe(log);
>  	struct xe_tile *tile = gt_to_tile(log_to_gt(log));
>  	struct xe_bo *bo;
> +	int err;
> +
> +	err = fault_inject_guc_log_init();
> +	if (err)
> +		return err;
>  
>  	bo = xe_managed_bo_create_pin_map(xe, tile, guc_log_size(),
>  					  XE_BO_FLAG_SYSTEM |
> diff --git a/drivers/gpu/drm/xe/xe_guc_relay.c b/drivers/gpu/drm/xe/xe_guc_relay.c
> index ade6162dc259..ede7fd3e7785 100644
> --- a/drivers/gpu/drm/xe/xe_guc_relay.c
> +++ b/drivers/gpu/drm/xe/xe_guc_relay.c
> @@ -5,6 +5,7 @@
>  
>  #include <linux/bitfield.h>
>  #include <linux/delay.h>
> +#include <linux/fault-inject.h>
>  
>  #include <drm/drm_managed.h>
>  
> @@ -320,6 +321,12 @@ static void __fini_relay(struct drm_device *drm, void *arg)
>  	mempool_exit(&relay->pool);
>  }
>  
> +static noinline int fault_inject_guc_relay_init(void)
> +{
> +	return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_guc_relay_init, ERRNO);
> +
>  /**
>   * xe_guc_relay_init - Initialize a &xe_guc_relay
>   * @relay: the &xe_guc_relay to initialize
> @@ -335,6 +342,10 @@ int xe_guc_relay_init(struct xe_guc_relay *relay)
>  	struct xe_device *xe = relay_to_xe(relay);
>  	int err;
>  
> +	err = fault_inject_guc_relay_init();
> +	if (err)
> +		return err;
> +
>  	relay_assert(relay, !relay_is_ready(relay));
>  
>  	if (!IS_SRIOV(xe))
> diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
> index 33eb039053e4..87075aed885d 100644
> --- a/drivers/gpu/drm/xe/xe_pm.c
> +++ b/drivers/gpu/drm/xe/xe_pm.c
> @@ -5,6 +5,7 @@
>  
>  #include "xe_pm.h"
>  
> +#include <linux/fault-inject.h>
>  #include <linux/pm_runtime.h>
>  
>  #include <drm/drm_managed.h>
> @@ -247,10 +248,20 @@ static void xe_pm_runtime_init(struct xe_device *xe)
>  	pm_runtime_put(dev);
>  }
>  
> +static noinline int fault_inject_pm_init_early(void)
> +{
> +	return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_pm_init_early, ERRNO);
> +
>  int xe_pm_init_early(struct xe_device *xe)
>  {
>  	int err;
>  
> +	err = fault_inject_pm_init_early();
> +	if (err)
> +		return err;
> +
>  	INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
>  
>  	err = drmm_mutex_init(&xe->drm, &xe->mem_access.vram_userfault.lock);
> diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
> index 69a066ef20c0..f1dafcfd4eae 100644
> --- a/drivers/gpu/drm/xe/xe_sriov.c
> +++ b/drivers/gpu/drm/xe/xe_sriov.c
> @@ -3,6 +3,8 @@
>   * Copyright © 2023 Intel Corporation
>   */
>  
> +#include <linux/fault-inject.h>
> +
>  #include <drm/drm_managed.h>
>  
>  #include "regs/xe_regs.h"
> @@ -91,6 +93,12 @@ static void fini_sriov(struct drm_device *drm, void *arg)
>  	xe->sriov.wq = NULL;
>  }
>  
> +static noinline int fault_inject_sriov_init(void)
> +{
> +	return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_sriov_init, ERRNO);
> +
>  /**
>   * xe_sriov_init - Initialize SR-IOV specific data.
>   * @xe: the &xe_device to initialize
> @@ -102,11 +110,16 @@ static void fini_sriov(struct drm_device *drm, void *arg)
>   */
>  int xe_sriov_init(struct xe_device *xe)
>  {
> +	int err = fault_inject_sriov_init();
> +
> +	if (err)
> +		return err;
> +
>  	if (!IS_SRIOV(xe))
>  		return 0;
>  
>  	if (IS_SRIOV_PF(xe)) {
> -		int err = xe_sriov_pf_init_early(xe);
> +		err = xe_sriov_pf_init_early(xe);
>  
>  		if (err)
>  			return err;
> diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
> index dda5268507d8..c82b4278c03e 100644
> --- a/drivers/gpu/drm/xe/xe_tile.c
> +++ b/drivers/gpu/drm/xe/xe_tile.c
> @@ -3,6 +3,8 @@
>   * Copyright © 2023 Intel Corporation
>   */
>  
> +#include <linux/fault-inject.h>
> +
>  #include <drm/drm_managed.h>
>  
>  #include "xe_device.h"
> @@ -99,6 +101,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
>  	return 0;
>  }
>  
> +static noinline int fault_inject_tile_init_early(void)
> +{
> +	return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_tile_init_early, ERRNO);
> +
>  /**
>   * xe_tile_init_early - Initialize the tile and primary GT
>   * @tile: Tile to initialize
> @@ -114,6 +122,10 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
>  {
>  	int err;
>  
> +	err = fault_inject_tile_init_early();
> +	if (err)
> +		return err;
> +
>  	tile->xe = xe;
>  	tile->id = id;
>  
> diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
> index eab9456e051f..8fff0fd7c675 100644
> --- a/drivers/gpu/drm/xe/xe_uc_fw.c
> +++ b/drivers/gpu/drm/xe/xe_uc_fw.c
> @@ -4,6 +4,7 @@
>   */
>  
>  #include <linux/bitfield.h>
> +#include <linux/fault-inject.h>
>  #include <linux/firmware.h>
>  
>  #include <drm/drm_managed.h>
> @@ -776,11 +777,21 @@ static int uc_fw_copy(struct xe_uc_fw *uc_fw, const void *data, size_t size, u32
>  	return err;
>  }
>  
> +static noinline int fault_inject_uc_fw_init(void)
> +{
> +	return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_uc_fw_init, ERRNO);
> +
>  int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
>  {
>  	const struct firmware *fw = NULL;
>  	int err;
>  
> +	err = fault_inject_uc_fw_init();
> +	if (err)
> +		return err;
> +
>  	err = uc_fw_request(uc_fw, &fw);
>  	if (err)
>  		return err;
> diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
> index 22c148b1e996..121443b790bf 100644
> --- a/drivers/gpu/drm/xe/xe_wa.c
> +++ b/drivers/gpu/drm/xe/xe_wa.c
> @@ -8,6 +8,7 @@
>  #include <drm/drm_managed.h>
>  #include <kunit/visibility.h>
>  #include <linux/compiler_types.h>
> +#include <linux/fault-inject.h>
>  
>  #include <generated/xe_wa_oob.h>
>  
> @@ -818,6 +819,12 @@ void xe_wa_process_lrc(struct xe_hw_engine *hwe)
>  	xe_rtp_process_to_sr(&ctx, lrc_was, &hwe->reg_lrc);
>  }
>  
> +static noinline int fault_inject_wa_init(void)
> +{
> +	return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_wa_init, ERRNO);
> +
>  /**
>   * xe_wa_init - initialize gt with workaround bookkeeping
>   * @gt: GT instance to initialize
> @@ -829,6 +836,11 @@ int xe_wa_init(struct xe_gt *gt)
>  	struct xe_device *xe = gt_to_xe(gt);
>  	size_t n_oob, n_lrc, n_engine, n_gt, total;
>  	unsigned long *p;
> +	int err;
> +
> +	err = fault_inject_wa_init();
> +	if (err)
> +		return err;
>  
>  	n_gt = BITS_TO_LONGS(ARRAY_SIZE(gt_was));
>  	n_engine = BITS_TO_LONGS(ARRAY_SIZE(engine_was));
> diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
> index 93c82825d896..88a201122a22 100644
> --- a/drivers/gpu/drm/xe/xe_wopcm.c
> +++ b/drivers/gpu/drm/xe/xe_wopcm.c
> @@ -5,6 +5,8 @@
>  
>  #include "xe_wopcm.h"
>  
> +#include <linux/fault-inject.h>
> +
>  #include "regs/xe_guc_regs.h"
>  #include "xe_device.h"
>  #include "xe_force_wake.h"
> @@ -268,3 +270,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
>  
>  	return ret;
>  }
> +ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
> -- 
> 2.43.0
> 


More information about the Intel-xe mailing list