[PATCH v2] drm/xe: Add driver load error injection

Lucas De Marchi lucas.demarchi at intel.com
Wed Aug 14 18:18:14 UTC 2024


On Wed, Aug 14, 2024 at 06:59:29PM GMT, Francois Dugast wrote:
>Port over i915 driver load error injection.

this sentence needs to go away, then you explain here what this is and
how it's expected to be used for tests. Also a good opportunity to
explain that this is used to fix up the current problems and avoid new
ones to be introduced, but in future it's expected to be replaced by
infra from fault-inject.h.

>
>v2: Fix style and build errors, modparam to 0 after probe, rename to
>    xe_device_inject_driver_probe_error, check type when compiled out,
>    add _return macro, move some uses to the beginning of the function
>
>Signed-off-by: Matthew Brost <matthew.brost at intel.com>
>Signed-off-by: Francois Dugast <francois.dugast at intel.com>
>Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>---
> drivers/gpu/drm/xe/xe_device.c       | 43 ++++++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_device.h       | 25 ++++++++++++++++
> drivers/gpu/drm/xe/xe_device_types.h |  5 ++++
> drivers/gpu/drm/xe/xe_gt.c           |  2 ++
> drivers/gpu/drm/xe/xe_gt_sriov_pf.c  |  2 ++
> drivers/gpu/drm/xe/xe_guc.c          |  2 ++
> drivers/gpu/drm/xe/xe_guc_ct.c       |  1 +
> drivers/gpu/drm/xe/xe_mmio.c         |  2 ++
> drivers/gpu/drm/xe/xe_module.c       |  5 ++++
> drivers/gpu/drm/xe/xe_module.h       |  3 ++
> drivers/gpu/drm/xe/xe_pci.c          |  3 ++
> drivers/gpu/drm/xe/xe_pm.c           |  3 ++
> drivers/gpu/drm/xe/xe_sriov.c        |  2 ++
> drivers/gpu/drm/xe/xe_tile.c         |  2 ++
> drivers/gpu/drm/xe/xe_uc.c           |  1 +
> drivers/gpu/drm/xe/xe_wa.c           |  2 ++
> drivers/gpu/drm/xe/xe_wopcm.c        |  2 ++
> 17 files changed, 105 insertions(+)
>
>diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>index 1aba6f9eaa19..6d3cdeeaadfd 100644
>--- a/drivers/gpu/drm/xe/xe_device.c
>+++ b/drivers/gpu/drm/xe/xe_device.c
>@@ -374,6 +374,10 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
> 	if (WARN_ON(err))
> 		goto err;
>
>+	err = xe_device_inject_driver_probe_error(xe, err);
>+	if (err)
>+		goto err;

	err = ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
			      xe->drm.anon_inode->i_mapping,
			      xe->drm.vma_offset_manager, false, false);
	err = xe_device_inject_driver_probe_error(xe, err);
	if (WARN_ON(err))
		goto err;


rather than adding another flow control

>+
> 	return xe;
>
> err:
>@@ -474,6 +478,7 @@ static int xe_set_dma_info(struct xe_device *xe)
> 		goto mask_err;
>
> 	err = dma_set_coherent_mask(xe->drm.dev, DMA_BIT_MASK(mask_size));
>+	err = xe_device_inject_driver_probe_error(xe, err);
> 	if (err)
> 		goto mask_err;
>
>@@ -577,6 +582,7 @@ int xe_device_probe_early(struct xe_device *xe)
> 		return err;
>
> 	err = wait_for_lmem_ready(xe);
>+	err = xe_device_inject_driver_probe_error(xe, err);
> 	if (err)
> 		return err;
>
>@@ -745,6 +751,10 @@ int xe_device_probe(struct xe_device *xe)
> 	for_each_gt(gt, xe, id)
> 		xe_gt_sanitize_freq(gt);
>
>+#ifdef CONFIG_DRM_XE_DEBUG

comment here:

	/* After probe finishes, stop checking for error injection */

>+	xe_modparam.inject_driver_probe_error = 0;
>+#endif
>+
> 	return devm_add_action_or_reset(xe->drm.dev, xe_device_sanitize, xe);
>
> err_fini_display:
>@@ -995,3 +1005,36 @@ void xe_device_declare_wedged(struct xe_device *xe)
> 	for_each_gt(gt, xe, id)
> 		xe_gt_declare_wedged(gt);
> }
>+
>+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>+/**
>+ * __xe_device_inject_driver_probe_error - Inject an error during device probe
>+ * @xe: xe device instance
>+ * @err_injected: the error to inject
>+ * @err_real: the error returned by the actual function
>+ * @func: the name of the function where this is called from
>+ * @line: the line where this is called from
>+ *
>+ * This is not meant to be called directly, only through xe_device_inject_driver_probe_error.
>+ *
>+ * Return: err_real if != 0, err_injected otherwise
>+ */
>+int __xe_device_inject_driver_probe_error(struct xe_device *xe, int err_injected, int err_real,
>+					  const char *func, int line)
>+{
>+	if (xe->inject_driver_probe_error >= xe_modparam.inject_driver_probe_error)
>+		return 0;
>+
>+	if (++xe->inject_driver_probe_error < xe_modparam.inject_driver_probe_error)
>+		return 0;
>+
>+	if (err_real != 0)
>+		return err_real;

why isn't this the first check? we don't want to count any error
injection if the error was real

>+
>+	drm_info(&xe->drm, "Injecting failure %d at checkpoint %u [%s:%d]\n",
>+		 err_injected, xe->inject_driver_probe_error, func, line);
>+
>+	xe_modparam.inject_driver_probe_error = 0;
>+	return err_injected;
>+}
>+#endif
>diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
>index db6cc8d0d6b8..31270a54ccf9 100644
>--- a/drivers/gpu/drm/xe/xe_device.h
>+++ b/drivers/gpu/drm/xe/xe_device.h
>@@ -179,4 +179,29 @@ void xe_device_declare_wedged(struct xe_device *xe);
> struct xe_file *xe_file_get(struct xe_file *xef);
> void xe_file_put(struct xe_file *xef);
>
>+#define XE_DEVICE_INJECTED_ERR -ENODEV
>+#define xe_device_inject_driver_probe_return(__xe) \
>+	({ if (__xe_device_inject_driver_probe_error(__xe, XE_DEVICE_INJECTED_ERR, 0, __func__, \
>+						     __LINE__) == XE_DEVICE_INJECTED_ERR) \
>+			return XE_DEVICE_INJECTED_ERR; })

#define xe_device_inject_driver_probe_error(__xe) \
	__xe_device_inject_driver_probe_error(__xe, XE_DEVICE_INJECTED_ERR, 0, __func__, __LINE__)

>+#define xe_device_inject_driver_probe_error(__xe, __err_real) \

#define xe_device_inject_driver_probe_error_override(__xe, __err_real)
	__xe_device_inject_driver_probe_error(__xe, XE_DEVICE_INJECTED_ERR, __err_real, __func__, \
					      __LINE__)

or with some other better name, but without changing the flow inside a
macro.

>+	__xe_device_inject_driver_probe_error(__xe, XE_DEVICE_INJECTED_ERR, __err_real, __func__, \
>+					      __LINE__)
>+
>+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>+
>+int __xe_device_inject_driver_probe_error(struct xe_device *xe,
>+					  int err_injected, int err_real,
>+					  const char *func, int line);
>+#else
>+
>+static inline int __xe_device_inject_driver_probe_error(struct xe_device *xe,
>+							int err_injected, int err_real,
>+							const char *func, int line)
>+{
>+	return 0;
>+}
>+
>+#endif
>+
> #endif
>diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>index 5b7292a9a66d..139336ad4194 100644
>--- a/drivers/gpu/drm/xe/xe_device_types.h
>+++ b/drivers/gpu/drm/xe/xe_device_types.h
>@@ -484,6 +484,11 @@ struct xe_device {
> 		int mode;
> 	} wedged;
>
>+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>+	/** @inject_driver_probe_error: Counter used for error injection during probe */
>+	int inject_driver_probe_error;
>+#endif
>+
> #ifdef TEST_VM_OPS_ERROR
> 	/**
> 	 * @vm_inject_error_position: inject errors at different places in VM
>diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>index 58895ed22f6e..4c73576bc5f3 100644
>--- a/drivers/gpu/drm/xe/xe_gt.c
>+++ b/drivers/gpu/drm/xe/xe_gt.c
>@@ -378,6 +378,7 @@ int xe_gt_init_early(struct xe_gt *gt)
> 	xe_reg_sr_init(&gt->reg_sr, "GT", gt_to_xe(gt));
>
> 	err = xe_wa_init(gt);
>+	err = xe_device_inject_driver_probe_error(gt_to_xe(gt), err);
> 	if (err)
> 		return err;
>
>@@ -564,6 +565,7 @@ int xe_gt_init_hwconfig(struct xe_gt *gt)
> 		goto out_fw;
>
> 	err = xe_uc_init_hwconfig(&gt->uc);
>+	err = xe_device_inject_driver_probe_error(gt_to_xe(gt), err);
> 	if (err)
> 		goto out_fw;
>
>diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
>index ef239440963c..0d50cbb496c3 100644
>--- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
>+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
>@@ -7,6 +7,7 @@
>
> #include "regs/xe_regs.h"
>
>+#include "xe_device.h"
> #include "xe_gt_sriov_pf.h"
> #include "xe_gt_sriov_pf_config.h"
> #include "xe_gt_sriov_pf_helpers.h"
>@@ -54,6 +55,7 @@ int xe_gt_sriov_pf_init_early(struct xe_gt *gt)
> 		return err;
>
> 	err = xe_gt_sriov_pf_service_init(gt);
>+	err = xe_device_inject_driver_probe_error(gt_to_xe(gt), err);
> 	if (err)
> 		return err;
>
>diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>index de0fe9e65746..61009499fddf 100644
>--- a/drivers/gpu/drm/xe/xe_guc.c
>+++ b/drivers/gpu/drm/xe/xe_guc.c
>@@ -351,6 +351,7 @@ int xe_guc_init(struct xe_guc *guc)
> 		goto out;
>
> 	ret = devm_add_action_or_reset(xe->drm.dev, guc_fini_hw, guc);
>+	ret = xe_device_inject_driver_probe_error(guc_to_xe(guc), ret);
> 	if (ret)
> 		goto out;
>
>@@ -408,6 +409,7 @@ int xe_guc_init_post_hwconfig(struct xe_guc *guc)
> 		return ret;
>
> 	ret = xe_guc_pc_init(&guc->pc);
>+	ret = xe_device_inject_driver_probe_error(guc_to_xe(guc), ret);
> 	if (ret)
> 		return ret;
>
>diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
>index beeeb120d1fc..29a8314b0ee1 100644
>--- a/drivers/gpu/drm/xe/xe_guc_ct.c
>+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
>@@ -194,6 +194,7 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
> 	ct->bo = bo;
>
> 	err = drmm_add_action_or_reset(&xe->drm, guc_ct_fini, ct);
>+	err = xe_device_inject_driver_probe_error(xe, err);
> 	if (err)
> 		return err;
>
>diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c
>index bdcc7282385c..f79ac35251c5 100644
>--- a/drivers/gpu/drm/xe/xe_mmio.c
>+++ b/drivers/gpu/drm/xe/xe_mmio.c
>@@ -137,6 +137,8 @@ int xe_mmio_probe_tiles(struct xe_device *xe)
> 	size_t tile_mmio_size = SZ_16M;
> 	size_t tile_mmio_ext_size = xe->info.tile_mmio_ext_size;
>
>+	xe_device_inject_driver_probe_return(xe);

as above, let's keep the if (...)  here rather than inside the macro.

Lucas De Marchi

>+
> 	mmio_multi_tile_setup(xe, tile_mmio_size);
> 	mmio_extension_setup(xe, tile_mmio_size, tile_mmio_ext_size);
>
>diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
>index 7bb99e451fcc..dbc737085bf1 100644
>--- a/drivers/gpu/drm/xe/xe_module.c
>+++ b/drivers/gpu/drm/xe/xe_module.c
>@@ -53,6 +53,11 @@ module_param_named_unsafe(force_probe, xe_modparam.force_probe, charp, 0400);
> MODULE_PARM_DESC(force_probe,
> 		 "Force probe options for specified devices. See CONFIG_DRM_XE_FORCE_PROBE for details.");
>
>+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>+module_param_named_unsafe(inject_driver_probe_error, xe_modparam.inject_driver_probe_error, int, 0600);
>+MODULE_PARM_DESC(inject_driver_probe_error, "Inject driver probe error");
>+#endif
>+
> #ifdef CONFIG_PCI_IOV
> module_param_named(max_vfs, xe_modparam.max_vfs, uint, 0400);
> MODULE_PARM_DESC(max_vfs,
>diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
>index 61a0d28a28c8..d612a7ae7a4c 100644
>--- a/drivers/gpu/drm/xe/xe_module.h
>+++ b/drivers/gpu/drm/xe/xe_module.h
>@@ -20,6 +20,9 @@ struct xe_modparam {
> 	char *force_probe;
> #ifdef CONFIG_PCI_IOV
> 	unsigned int max_vfs;
>+#endif
>+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>+	int inject_driver_probe_error;
> #endif
> 	int wedged_mode;
> };
>diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>index f818aa69f3ca..97b82d5366d9 100644
>--- a/drivers/gpu/drm/xe/xe_pci.c
>+++ b/drivers/gpu/drm/xe/xe_pci.c
>@@ -626,6 +626,7 @@ static int xe_info_init_early(struct xe_device *xe,
> 				  desc->has_display;
>
> 	err = xe_tile_init_early(xe_device_get_root_tile(xe), xe, 0);
>+	err = xe_device_inject_driver_probe_error(xe, err);
> 	if (err)
> 		return err;
>
>@@ -647,6 +648,8 @@ static int xe_info_init(struct xe_device *xe,
> 	struct xe_gt *gt;
> 	u8 id;
>
>+	xe_device_inject_driver_probe_return(xe);
>+
> 	/*
> 	 * If this platform supports GMD_ID, we'll detect the proper IP
> 	 * descriptor to use from hardware registers. desc->graphics will only
>diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
>index 9f3c14fd9f33..4c05352eb01a 100644
>--- a/drivers/gpu/drm/xe/xe_pm.c
>+++ b/drivers/gpu/drm/xe/xe_pm.c
>@@ -228,6 +228,7 @@ int xe_pm_init_early(struct xe_device *xe)
> 		return err;
>
> 	err = drmm_mutex_init(&xe->drm, &xe->d3cold.lock);
>+	err = xe_device_inject_driver_probe_error(xe, err);
> 	if (err)
> 		return err;
>
>@@ -246,6 +247,8 @@ int xe_pm_init(struct xe_device *xe)
> {
> 	int err;
>
>+	xe_device_inject_driver_probe_return(xe);
>+
> 	/* For now suspend/resume is only allowed with GuC */
> 	if (!xe_device_uc_enabled(xe))
> 		return 0;
>diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
>index 5a1d65e4f19f..164cc119ad96 100644
>--- a/drivers/gpu/drm/xe/xe_sriov.c
>+++ b/drivers/gpu/drm/xe/xe_sriov.c
>@@ -105,6 +105,8 @@ int xe_sriov_init(struct xe_device *xe)
> 	if (!IS_SRIOV(xe))
> 		return 0;
>
>+	xe_device_inject_driver_probe_return(xe);
>+
> 	if (IS_SRIOV_PF(xe)) {
> 		int err = xe_sriov_pf_init_early(xe);
>
>diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
>index 15ea0a942f67..bc0193bd35ed 100644
>--- a/drivers/gpu/drm/xe/xe_tile.c
>+++ b/drivers/gpu/drm/xe/xe_tile.c
>@@ -124,6 +124,8 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
> 	if (IS_ERR(tile->primary_gt))
> 		return PTR_ERR(tile->primary_gt);
>
>+	xe_device_inject_driver_probe_return(xe);
>+
> 	return 0;
> }
>
>diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c
>index 0d073a9987c2..32e0466bf0a4 100644
>--- a/drivers/gpu/drm/xe/xe_uc.c
>+++ b/drivers/gpu/drm/xe/xe_uc.c
>@@ -59,6 +59,7 @@ int xe_uc_init(struct xe_uc *uc)
> 		return 0;
>
> 	ret = xe_wopcm_init(&uc->wopcm);
>+	ret = xe_device_inject_driver_probe_error(uc_to_xe(uc), ret);
> 	if (ret)
> 		goto err;
>
>diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
>index 564e32e44e3b..1c0c1ea8058b 100644
>--- a/drivers/gpu/drm/xe/xe_wa.c
>+++ b/drivers/gpu/drm/xe/xe_wa.c
>@@ -832,6 +832,8 @@ int xe_wa_init(struct xe_gt *gt)
> 	if (!p)
> 		return -ENOMEM;
>
>+	xe_device_inject_driver_probe_return(xe);
>+
> 	gt->wa_active.gt = p;
> 	p += n_gt;
> 	gt->wa_active.engine = p;
>diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
>index d3a99157e523..eb7f1554108d 100644
>--- a/drivers/gpu/drm/xe/xe_wopcm.c
>+++ b/drivers/gpu/drm/xe/xe_wopcm.c
>@@ -263,6 +263,8 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
> 		return -E2BIG;
> 	}
>
>+	xe_device_inject_driver_probe_return(xe);
>+
> 	if (!locked)
> 		ret = __wopcm_init_regs(xe, gt, wopcm);
>
>-- 
>2.43.0
>


More information about the Intel-xe mailing list