[PATCH v2] drm/xe: Add driver load error injection

Gustavo Sousa gustavo.sousa at intel.com
Wed Aug 14 18:26:08 UTC 2024


Quoting Lucas De Marchi (2024-08-14 15:18:14-03:00)
>On Wed, Aug 14, 2024 at 06:59:29PM GMT, Francois Dugast wrote:
>>Port over i915 driver load error injection.
>
>this sentence needs to go away, then you explain here what this is and
>how it's expected to be used for tests. Also a good opportunity to

Even though this is supposed to be temporary, the explanation could be
done as documentation in the code instead of part of the commit message?

--
Gustavo Sousa

>explain that this is used to fix up the current problems and avoid new
>ones to be introduced, but in future it's expected to be replaced by
>infra from fault-inject.h.
>
>>
>>v2: Fix style and build errors, modparam to 0 after probe, rename to
>>    xe_device_inject_driver_probe_error, check type when compiled out,
>>    add _return macro, move some uses to the beginning of the function
>>
>>Signed-off-by: Matthew Brost <matthew.brost at intel.com>
>>Signed-off-by: Francois Dugast <francois.dugast at intel.com>
>>Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>>---
>> drivers/gpu/drm/xe/xe_device.c       | 43 ++++++++++++++++++++++++++++
>> drivers/gpu/drm/xe/xe_device.h       | 25 ++++++++++++++++
>> drivers/gpu/drm/xe/xe_device_types.h |  5 ++++
>> drivers/gpu/drm/xe/xe_gt.c           |  2 ++
>> drivers/gpu/drm/xe/xe_gt_sriov_pf.c  |  2 ++
>> drivers/gpu/drm/xe/xe_guc.c          |  2 ++
>> drivers/gpu/drm/xe/xe_guc_ct.c       |  1 +
>> drivers/gpu/drm/xe/xe_mmio.c         |  2 ++
>> drivers/gpu/drm/xe/xe_module.c       |  5 ++++
>> drivers/gpu/drm/xe/xe_module.h       |  3 ++
>> drivers/gpu/drm/xe/xe_pci.c          |  3 ++
>> drivers/gpu/drm/xe/xe_pm.c           |  3 ++
>> drivers/gpu/drm/xe/xe_sriov.c        |  2 ++
>> drivers/gpu/drm/xe/xe_tile.c         |  2 ++
>> drivers/gpu/drm/xe/xe_uc.c           |  1 +
>> drivers/gpu/drm/xe/xe_wa.c           |  2 ++
>> drivers/gpu/drm/xe/xe_wopcm.c        |  2 ++
>> 17 files changed, 105 insertions(+)
>>
>>diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>>index 1aba6f9eaa19..6d3cdeeaadfd 100644
>>--- a/drivers/gpu/drm/xe/xe_device.c
>>+++ b/drivers/gpu/drm/xe/xe_device.c
>>@@ -374,6 +374,10 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
>>         if (WARN_ON(err))
>>                 goto err;
>>
>>+        err = xe_device_inject_driver_probe_error(xe, err);
>>+        if (err)
>>+                goto err;
>
>        err = ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
>                              xe->drm.anon_inode->i_mapping,
>                              xe->drm.vma_offset_manager, false, false);
>        err = xe_device_inject_driver_probe_error(xe, err);
>        if (WARN_ON(err))
>                goto err;
>
>
>rather than adding another flow control
>
>>+
>>         return xe;
>>
>> err:
>>@@ -474,6 +478,7 @@ static int xe_set_dma_info(struct xe_device *xe)
>>                 goto mask_err;
>>
>>         err = dma_set_coherent_mask(xe->drm.dev, DMA_BIT_MASK(mask_size));
>>+        err = xe_device_inject_driver_probe_error(xe, err);
>>         if (err)
>>                 goto mask_err;
>>
>>@@ -577,6 +582,7 @@ int xe_device_probe_early(struct xe_device *xe)
>>                 return err;
>>
>>         err = wait_for_lmem_ready(xe);
>>+        err = xe_device_inject_driver_probe_error(xe, err);
>>         if (err)
>>                 return err;
>>
>>@@ -745,6 +751,10 @@ int xe_device_probe(struct xe_device *xe)
>>         for_each_gt(gt, xe, id)
>>                 xe_gt_sanitize_freq(gt);
>>
>>+#ifdef CONFIG_DRM_XE_DEBUG
>
>comment here:
>
>        /* After probe finishes, stop checking for error injection */
>
>>+        xe_modparam.inject_driver_probe_error = 0;
>>+#endif
>>+
>>         return devm_add_action_or_reset(xe->drm.dev, xe_device_sanitize, xe);
>>
>> err_fini_display:
>>@@ -995,3 +1005,36 @@ void xe_device_declare_wedged(struct xe_device *xe)
>>         for_each_gt(gt, xe, id)
>>                 xe_gt_declare_wedged(gt);
>> }
>>+
>>+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>>+/**
>>+ * __xe_device_inject_driver_probe_error - Inject an error during device probe
>>+ * @xe: xe device instance
>>+ * @err_injected: the error to inject
>>+ * @err_real: the error returned by the actual function
>>+ * @func: the name of the function where this is called from
>>+ * @line: the line where this is called from
>>+ *
>>+ * This is not meant to be called directly, only through xe_device_inject_driver_probe_error.
>>+ *
>>+ * Return: err_real if != 0, err_injected otherwise
>>+ */
>>+int __xe_device_inject_driver_probe_error(struct xe_device *xe, int err_injected, int err_real,
>>+                                          const char *func, int line)
>>+{
>>+        if (xe->inject_driver_probe_error >= xe_modparam.inject_driver_probe_error)
>>+                return 0;
>>+
>>+        if (++xe->inject_driver_probe_error < xe_modparam.inject_driver_probe_error)
>>+                return 0;
>>+
>>+        if (err_real != 0)
>>+                return err_real;
>
>why isn't this the first check? we don't want to count any error
>injection if the error was real
>
>>+
>>+        drm_info(&xe->drm, "Injecting failure %d at checkpoint %u [%s:%d]\n",
>>+                 err_injected, xe->inject_driver_probe_error, func, line);
>>+
>>+        xe_modparam.inject_driver_probe_error = 0;
>>+        return err_injected;
>>+}
>>+#endif
>>diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
>>index db6cc8d0d6b8..31270a54ccf9 100644
>>--- a/drivers/gpu/drm/xe/xe_device.h
>>+++ b/drivers/gpu/drm/xe/xe_device.h
>>@@ -179,4 +179,29 @@ void xe_device_declare_wedged(struct xe_device *xe);
>> struct xe_file *xe_file_get(struct xe_file *xef);
>> void xe_file_put(struct xe_file *xef);
>>
>>+#define XE_DEVICE_INJECTED_ERR -ENODEV
>>+#define xe_device_inject_driver_probe_return(__xe) \
>>+        ({ if (__xe_device_inject_driver_probe_error(__xe, XE_DEVICE_INJECTED_ERR, 0, __func__, \
>>+                                                     __LINE__) == XE_DEVICE_INJECTED_ERR) \
>>+                        return XE_DEVICE_INJECTED_ERR; })
>
>#define xe_device_inject_driver_probe_error(__xe) \
>        __xe_device_inject_driver_probe_error(__xe, XE_DEVICE_INJECTED_ERR, 0, __func__, __LINE__)
>
>>+#define xe_device_inject_driver_probe_error(__xe, __err_real) \
>
>#define xe_device_inject_driver_probe_error_override(__xe, __err_real)
>        __xe_device_inject_driver_probe_error(__xe, XE_DEVICE_INJECTED_ERR, __err_real, __func__, \
>                                              __LINE__)
>
>or with some other better name, but without changing the flow inside a
>macro.
>
>>+        __xe_device_inject_driver_probe_error(__xe, XE_DEVICE_INJECTED_ERR, __err_real, __func__, \
>>+                                              __LINE__)
>>+
>>+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>>+
>>+int __xe_device_inject_driver_probe_error(struct xe_device *xe,
>>+                                          int err_injected, int err_real,
>>+                                          const char *func, int line);
>>+#else
>>+
>>+static inline int __xe_device_inject_driver_probe_error(struct xe_device *xe,
>>+                                                        int err_injected, int err_real,
>>+                                                        const char *func, int line)
>>+{
>>+        return 0;
>>+}
>>+
>>+#endif
>>+
>> #endif
>>diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>>index 5b7292a9a66d..139336ad4194 100644
>>--- a/drivers/gpu/drm/xe/xe_device_types.h
>>+++ b/drivers/gpu/drm/xe/xe_device_types.h
>>@@ -484,6 +484,11 @@ struct xe_device {
>>                 int mode;
>>         } wedged;
>>
>>+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>>+        /** @inject_driver_probe_error: Counter used for error injection during probe */
>>+        int inject_driver_probe_error;
>>+#endif
>>+
>> #ifdef TEST_VM_OPS_ERROR
>>         /**
>>          * @vm_inject_error_position: inject errors at different places in VM
>>diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>>index 58895ed22f6e..4c73576bc5f3 100644
>>--- a/drivers/gpu/drm/xe/xe_gt.c
>>+++ b/drivers/gpu/drm/xe/xe_gt.c
>>@@ -378,6 +378,7 @@ int xe_gt_init_early(struct xe_gt *gt)
>>         xe_reg_sr_init(&gt->reg_sr, "GT", gt_to_xe(gt));
>>
>>         err = xe_wa_init(gt);
>>+        err = xe_device_inject_driver_probe_error(gt_to_xe(gt), err);
>>         if (err)
>>                 return err;
>>
>>@@ -564,6 +565,7 @@ int xe_gt_init_hwconfig(struct xe_gt *gt)
>>                 goto out_fw;
>>
>>         err = xe_uc_init_hwconfig(&gt->uc);
>>+        err = xe_device_inject_driver_probe_error(gt_to_xe(gt), err);
>>         if (err)
>>                 goto out_fw;
>>
>>diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
>>index ef239440963c..0d50cbb496c3 100644
>>--- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
>>+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c
>>@@ -7,6 +7,7 @@
>>
>> #include "regs/xe_regs.h"
>>
>>+#include "xe_device.h"
>> #include "xe_gt_sriov_pf.h"
>> #include "xe_gt_sriov_pf_config.h"
>> #include "xe_gt_sriov_pf_helpers.h"
>>@@ -54,6 +55,7 @@ int xe_gt_sriov_pf_init_early(struct xe_gt *gt)
>>                 return err;
>>
>>         err = xe_gt_sriov_pf_service_init(gt);
>>+        err = xe_device_inject_driver_probe_error(gt_to_xe(gt), err);
>>         if (err)
>>                 return err;
>>
>>diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>>index de0fe9e65746..61009499fddf 100644
>>--- a/drivers/gpu/drm/xe/xe_guc.c
>>+++ b/drivers/gpu/drm/xe/xe_guc.c
>>@@ -351,6 +351,7 @@ int xe_guc_init(struct xe_guc *guc)
>>                 goto out;
>>
>>         ret = devm_add_action_or_reset(xe->drm.dev, guc_fini_hw, guc);
>>+        ret = xe_device_inject_driver_probe_error(guc_to_xe(guc), ret);
>>         if (ret)
>>                 goto out;
>>
>>@@ -408,6 +409,7 @@ int xe_guc_init_post_hwconfig(struct xe_guc *guc)
>>                 return ret;
>>
>>         ret = xe_guc_pc_init(&guc->pc);
>>+        ret = xe_device_inject_driver_probe_error(guc_to_xe(guc), ret);
>>         if (ret)
>>                 return ret;
>>
>>diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
>>index beeeb120d1fc..29a8314b0ee1 100644
>>--- a/drivers/gpu/drm/xe/xe_guc_ct.c
>>+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
>>@@ -194,6 +194,7 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
>>         ct->bo = bo;
>>
>>         err = drmm_add_action_or_reset(&xe->drm, guc_ct_fini, ct);
>>+        err = xe_device_inject_driver_probe_error(xe, err);
>>         if (err)
>>                 return err;
>>
>>diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c
>>index bdcc7282385c..f79ac35251c5 100644
>>--- a/drivers/gpu/drm/xe/xe_mmio.c
>>+++ b/drivers/gpu/drm/xe/xe_mmio.c
>>@@ -137,6 +137,8 @@ int xe_mmio_probe_tiles(struct xe_device *xe)
>>         size_t tile_mmio_size = SZ_16M;
>>         size_t tile_mmio_ext_size = xe->info.tile_mmio_ext_size;
>>
>>+        xe_device_inject_driver_probe_return(xe);
>
>as above, let's keep the if (...)  here rather than inside the macro.
>
>Lucas De Marchi
>
>>+
>>         mmio_multi_tile_setup(xe, tile_mmio_size);
>>         mmio_extension_setup(xe, tile_mmio_size, tile_mmio_ext_size);
>>
>>diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
>>index 7bb99e451fcc..dbc737085bf1 100644
>>--- a/drivers/gpu/drm/xe/xe_module.c
>>+++ b/drivers/gpu/drm/xe/xe_module.c
>>@@ -53,6 +53,11 @@ module_param_named_unsafe(force_probe, xe_modparam.force_probe, charp, 0400);
>> MODULE_PARM_DESC(force_probe,
>>                  "Force probe options for specified devices. See CONFIG_DRM_XE_FORCE_PROBE for details.");
>>
>>+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>>+module_param_named_unsafe(inject_driver_probe_error, xe_modparam.inject_driver_probe_error, int, 0600);
>>+MODULE_PARM_DESC(inject_driver_probe_error, "Inject driver probe error");
>>+#endif
>>+
>> #ifdef CONFIG_PCI_IOV
>> module_param_named(max_vfs, xe_modparam.max_vfs, uint, 0400);
>> MODULE_PARM_DESC(max_vfs,
>>diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
>>index 61a0d28a28c8..d612a7ae7a4c 100644
>>--- a/drivers/gpu/drm/xe/xe_module.h
>>+++ b/drivers/gpu/drm/xe/xe_module.h
>>@@ -20,6 +20,9 @@ struct xe_modparam {
>>         char *force_probe;
>> #ifdef CONFIG_PCI_IOV
>>         unsigned int max_vfs;
>>+#endif
>>+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>>+        int inject_driver_probe_error;
>> #endif
>>         int wedged_mode;
>> };
>>diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>>index f818aa69f3ca..97b82d5366d9 100644
>>--- a/drivers/gpu/drm/xe/xe_pci.c
>>+++ b/drivers/gpu/drm/xe/xe_pci.c
>>@@ -626,6 +626,7 @@ static int xe_info_init_early(struct xe_device *xe,
>>                                   desc->has_display;
>>
>>         err = xe_tile_init_early(xe_device_get_root_tile(xe), xe, 0);
>>+        err = xe_device_inject_driver_probe_error(xe, err);
>>         if (err)
>>                 return err;
>>
>>@@ -647,6 +648,8 @@ static int xe_info_init(struct xe_device *xe,
>>         struct xe_gt *gt;
>>         u8 id;
>>
>>+        xe_device_inject_driver_probe_return(xe);
>>+
>>         /*
>>          * If this platform supports GMD_ID, we'll detect the proper IP
>>          * descriptor to use from hardware registers. desc->graphics will only
>>diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
>>index 9f3c14fd9f33..4c05352eb01a 100644
>>--- a/drivers/gpu/drm/xe/xe_pm.c
>>+++ b/drivers/gpu/drm/xe/xe_pm.c
>>@@ -228,6 +228,7 @@ int xe_pm_init_early(struct xe_device *xe)
>>                 return err;
>>
>>         err = drmm_mutex_init(&xe->drm, &xe->d3cold.lock);
>>+        err = xe_device_inject_driver_probe_error(xe, err);
>>         if (err)
>>                 return err;
>>
>>@@ -246,6 +247,8 @@ int xe_pm_init(struct xe_device *xe)
>> {
>>         int err;
>>
>>+        xe_device_inject_driver_probe_return(xe);
>>+
>>         /* For now suspend/resume is only allowed with GuC */
>>         if (!xe_device_uc_enabled(xe))
>>                 return 0;
>>diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
>>index 5a1d65e4f19f..164cc119ad96 100644
>>--- a/drivers/gpu/drm/xe/xe_sriov.c
>>+++ b/drivers/gpu/drm/xe/xe_sriov.c
>>@@ -105,6 +105,8 @@ int xe_sriov_init(struct xe_device *xe)
>>         if (!IS_SRIOV(xe))
>>                 return 0;
>>
>>+        xe_device_inject_driver_probe_return(xe);
>>+
>>         if (IS_SRIOV_PF(xe)) {
>>                 int err = xe_sriov_pf_init_early(xe);
>>
>>diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
>>index 15ea0a942f67..bc0193bd35ed 100644
>>--- a/drivers/gpu/drm/xe/xe_tile.c
>>+++ b/drivers/gpu/drm/xe/xe_tile.c
>>@@ -124,6 +124,8 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
>>         if (IS_ERR(tile->primary_gt))
>>                 return PTR_ERR(tile->primary_gt);
>>
>>+        xe_device_inject_driver_probe_return(xe);
>>+
>>         return 0;
>> }
>>
>>diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c
>>index 0d073a9987c2..32e0466bf0a4 100644
>>--- a/drivers/gpu/drm/xe/xe_uc.c
>>+++ b/drivers/gpu/drm/xe/xe_uc.c
>>@@ -59,6 +59,7 @@ int xe_uc_init(struct xe_uc *uc)
>>                 return 0;
>>
>>         ret = xe_wopcm_init(&uc->wopcm);
>>+        ret = xe_device_inject_driver_probe_error(uc_to_xe(uc), ret);
>>         if (ret)
>>                 goto err;
>>
>>diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
>>index 564e32e44e3b..1c0c1ea8058b 100644
>>--- a/drivers/gpu/drm/xe/xe_wa.c
>>+++ b/drivers/gpu/drm/xe/xe_wa.c
>>@@ -832,6 +832,8 @@ int xe_wa_init(struct xe_gt *gt)
>>         if (!p)
>>                 return -ENOMEM;
>>
>>+        xe_device_inject_driver_probe_return(xe);
>>+
>>         gt->wa_active.gt = p;
>>         p += n_gt;
>>         gt->wa_active.engine = p;
>>diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
>>index d3a99157e523..eb7f1554108d 100644
>>--- a/drivers/gpu/drm/xe/xe_wopcm.c
>>+++ b/drivers/gpu/drm/xe/xe_wopcm.c
>>@@ -263,6 +263,8 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
>>                 return -E2BIG;
>>         }
>>
>>+        xe_device_inject_driver_probe_return(xe);
>>+
>>         if (!locked)
>>                 ret = __wopcm_init_regs(xe, gt, wopcm);
>>
>>-- 
>>2.43.0
>>


More information about the Intel-xe mailing list