[PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time

Francois Dugast francois.dugast at intel.com
Fri Sep 20 13:18:34 UTC 2024


The kernel fault injection infrastructure is used to test proper error
handling during probe. In particular, ALLOW_ERROR_INJECTION() is added
directly to functions which fulfill the injectable functions
requirements:
fault-injection.html#requirements-for-the-error-injectable-functions

Otherwise a helper function is added and called in the beginning of the
function where the fault is to be injected.

The return code of the functions using ALLOW_ERROR_INJECTION() can be
conditionally modified at runtime by tuning some debugfs entries. This
requires CONFIG_FUNCTION_ERROR_INJECTION (among others).

One way to use fault injection at probe time by making each of those
functions fail one at a time is:

    FAILTYPE=fail_function
    DEVICE="0000:00:08.0" # depends on the system
    ERRNO=-12 # -ENOMEM, can depend on the function

    echo N > /sys/kernel/debug/$FAILTYPE/task-filter
    echo 100 > /sys/kernel/debug/$FAILTYPE/probability
    echo 0 > /sys/kernel/debug/$FAILTYPE/interval
    echo -1 > /sys/kernel/debug/$FAILTYPE/times
    echo 0 > /sys/kernel/debug/$FAILTYPE/space
    echo 1 > /sys/kernel/debug/$FAILTYPE/verbose

    modprobe xe
    echo $DEVICE > /sys/bus/pci/drivers/xe/unbind

    grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
    cut -d ' ' -f 1 | while read -r FUNCTION ; do
        echo "Injecting fault in $FUNCTION"
        echo "" > /sys/kernel/debug/$FAILTYPE/inject
        echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
        printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
        echo $DEVICE > /sys/bus/pci/drivers/xe/bind
    done

    rmmod xe

It will also be integrated into IGT for systematic execution by CI.

Signed-off-by: Francois Dugast <francois.dugast at intel.com>
Cc: Lucas De Marchi <lucas.demarchi at intel.com>
Cc: Matthew Brost <matthew.brost at intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
---
 drivers/gpu/drm/xe/xe_device.c    | 12 ++++++++++++
 drivers/gpu/drm/xe/xe_ggtt.c      |  2 ++
 drivers/gpu/drm/xe/xe_guc_ads.c   | 13 +++++++++++++
 drivers/gpu/drm/xe/xe_guc_ct.c    | 11 +++++++++++
 drivers/gpu/drm/xe/xe_guc_log.c   | 13 +++++++++++++
 drivers/gpu/drm/xe/xe_guc_relay.c | 11 +++++++++++
 drivers/gpu/drm/xe/xe_pm.c        | 11 +++++++++++
 drivers/gpu/drm/xe/xe_sriov.c     | 15 ++++++++++++++-
 drivers/gpu/drm/xe/xe_tile.c      | 12 ++++++++++++
 drivers/gpu/drm/xe/xe_uc_fw.c     | 11 +++++++++++
 drivers/gpu/drm/xe/xe_wa.c        | 12 ++++++++++++
 drivers/gpu/drm/xe/xe_wopcm.c     |  3 +++
 12 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index cb5a9fd820cf..d26352ecf75e 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -6,6 +6,7 @@
 #include "xe_device.h"
 
 #include <linux/delay.h>
+#include <linux/fault-inject.h>
 #include <linux/units.h>
 
 #include <drm/drm_aperture.h>
@@ -300,12 +301,22 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
 	ttm_device_fini(&xe->ttm);
 }
 
+static noinline int fault_inject_device_create(void)
+{
+	return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_device_create, ERRNO);
+
 struct xe_device *xe_device_create(struct pci_dev *pdev,
 				   const struct pci_device_id *ent)
 {
 	struct xe_device *xe;
 	int err;
 
+	err = fault_inject_device_create();
+	if (err)
+		return ERR_PTR(err);
+
 	xe_display_driver_set_hooks(&driver);
 
 	err = drm_aperture_remove_conflicting_pci_framebuffers(pdev, &driver);
@@ -548,6 +559,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
 
 	return 0;
 }
+ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);
 
 static void update_device_info(struct xe_device *xe)
 {
diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
index f68af56c3f86..4906e3f3150b 100644
--- a/drivers/gpu/drm/xe/xe_ggtt.c
+++ b/drivers/gpu/drm/xe/xe_ggtt.c
@@ -5,6 +5,7 @@
 
 #include "xe_ggtt.h"
 
+#include <linux/fault-inject.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/sizes.h>
 
@@ -264,6 +265,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt)
 
 	return 0;
 }
+ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO);
 
 static void xe_ggtt_invalidate(struct xe_ggtt *ggtt);
 
diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
index 66d4e5e95abd..e366043eb4b8 100644
--- a/drivers/gpu/drm/xe/xe_guc_ads.c
+++ b/drivers/gpu/drm/xe/xe_guc_ads.c
@@ -5,6 +5,8 @@
 
 #include "xe_guc_ads.h"
 
+#include <linux/fault-inject.h>
+
 #include <drm/drm_managed.h>
 
 #include <generated/xe_wa_oob.h>
@@ -396,12 +398,23 @@ static int calculate_waklv_size(struct xe_guc_ads *ads)
 
 #define MAX_GOLDEN_LRC_SIZE	(SZ_4K * 64)
 
+static noinline int fault_inject_guc_ads_init(void)
+{
+	return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_guc_ads_init, ERRNO);
+
 int xe_guc_ads_init(struct xe_guc_ads *ads)
 {
 	struct xe_device *xe = ads_to_xe(ads);
 	struct xe_gt *gt = ads_to_gt(ads);
 	struct xe_tile *tile = gt_to_tile(gt);
 	struct xe_bo *bo;
+	int ret;
+
+	ret = fault_inject_guc_ads_init();
+	if (ret)
+		return ret;
 
 	ads->golden_lrc_size = calculate_golden_lrc_size(ads);
 	ads->regset_size = calculate_regset_size(gt);
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 4b95f75b1546..61967ddd319f 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -8,6 +8,7 @@
 #include <linux/bitfield.h>
 #include <linux/circ_buf.h>
 #include <linux/delay.h>
+#include <linux/fault-inject.h>
 
 #include <kunit/static_stub.h>
 
@@ -165,6 +166,12 @@ static void primelockdep(struct xe_guc_ct *ct)
 	fs_reclaim_release(GFP_KERNEL);
 }
 
+static noinline int fault_inject_guc_ct_init(void)
+{
+	return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_guc_ct_init, ERRNO);
+
 int xe_guc_ct_init(struct xe_guc_ct *ct)
 {
 	struct xe_device *xe = ct_to_xe(ct);
@@ -173,6 +180,10 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
 	struct xe_bo *bo;
 	int err;
 
+	err = fault_inject_guc_ct_init();
+	if (err)
+		return err;
+
 	xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE));
 
 	ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", 0);
diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
index a37ee3419428..a3e54f1bb0c3 100644
--- a/drivers/gpu/drm/xe/xe_guc_log.c
+++ b/drivers/gpu/drm/xe/xe_guc_log.c
@@ -5,6 +5,8 @@
 
 #include "xe_guc_log.h"
 
+#include <linux/fault-inject.h>
+
 #include <drm/drm_managed.h>
 
 #include "xe_bo.h"
@@ -77,11 +79,22 @@ void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p)
 	}
 }
 
+static noinline int fault_inject_guc_log_init(void)
+{
+	return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_guc_log_init, ERRNO);
+
 int xe_guc_log_init(struct xe_guc_log *log)
 {
 	struct xe_device *xe = log_to_xe(log);
 	struct xe_tile *tile = gt_to_tile(log_to_gt(log));
 	struct xe_bo *bo;
+	int err;
+
+	err = fault_inject_guc_log_init();
+	if (err)
+		return err;
 
 	bo = xe_managed_bo_create_pin_map(xe, tile, guc_log_size(),
 					  XE_BO_FLAG_SYSTEM |
diff --git a/drivers/gpu/drm/xe/xe_guc_relay.c b/drivers/gpu/drm/xe/xe_guc_relay.c
index ade6162dc259..ede7fd3e7785 100644
--- a/drivers/gpu/drm/xe/xe_guc_relay.c
+++ b/drivers/gpu/drm/xe/xe_guc_relay.c
@@ -5,6 +5,7 @@
 
 #include <linux/bitfield.h>
 #include <linux/delay.h>
+#include <linux/fault-inject.h>
 
 #include <drm/drm_managed.h>
 
@@ -320,6 +321,12 @@ static void __fini_relay(struct drm_device *drm, void *arg)
 	mempool_exit(&relay->pool);
 }
 
+static noinline int fault_inject_guc_relay_init(void)
+{
+	return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_guc_relay_init, ERRNO);
+
 /**
  * xe_guc_relay_init - Initialize a &xe_guc_relay
  * @relay: the &xe_guc_relay to initialize
@@ -335,6 +342,10 @@ int xe_guc_relay_init(struct xe_guc_relay *relay)
 	struct xe_device *xe = relay_to_xe(relay);
 	int err;
 
+	err = fault_inject_guc_relay_init();
+	if (err)
+		return err;
+
 	relay_assert(relay, !relay_is_ready(relay));
 
 	if (!IS_SRIOV(xe))
diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
index 33eb039053e4..87075aed885d 100644
--- a/drivers/gpu/drm/xe/xe_pm.c
+++ b/drivers/gpu/drm/xe/xe_pm.c
@@ -5,6 +5,7 @@
 
 #include "xe_pm.h"
 
+#include <linux/fault-inject.h>
 #include <linux/pm_runtime.h>
 
 #include <drm/drm_managed.h>
@@ -247,10 +248,20 @@ static void xe_pm_runtime_init(struct xe_device *xe)
 	pm_runtime_put(dev);
 }
 
+static noinline int fault_inject_pm_init_early(void)
+{
+	return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_pm_init_early, ERRNO);
+
 int xe_pm_init_early(struct xe_device *xe)
 {
 	int err;
 
+	err = fault_inject_pm_init_early();
+	if (err)
+		return err;
+
 	INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
 
 	err = drmm_mutex_init(&xe->drm, &xe->mem_access.vram_userfault.lock);
diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
index 69a066ef20c0..f1dafcfd4eae 100644
--- a/drivers/gpu/drm/xe/xe_sriov.c
+++ b/drivers/gpu/drm/xe/xe_sriov.c
@@ -3,6 +3,8 @@
  * Copyright © 2023 Intel Corporation
  */
 
+#include <linux/fault-inject.h>
+
 #include <drm/drm_managed.h>
 
 #include "regs/xe_regs.h"
@@ -91,6 +93,12 @@ static void fini_sriov(struct drm_device *drm, void *arg)
 	xe->sriov.wq = NULL;
 }
 
+static noinline int fault_inject_sriov_init(void)
+{
+	return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_sriov_init, ERRNO);
+
 /**
  * xe_sriov_init - Initialize SR-IOV specific data.
  * @xe: the &xe_device to initialize
@@ -102,11 +110,16 @@ static void fini_sriov(struct drm_device *drm, void *arg)
  */
 int xe_sriov_init(struct xe_device *xe)
 {
+	int err = fault_inject_sriov_init();
+
+	if (err)
+		return err;
+
 	if (!IS_SRIOV(xe))
 		return 0;
 
 	if (IS_SRIOV_PF(xe)) {
-		int err = xe_sriov_pf_init_early(xe);
+		err = xe_sriov_pf_init_early(xe);
 
 		if (err)
 			return err;
diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
index dda5268507d8..c82b4278c03e 100644
--- a/drivers/gpu/drm/xe/xe_tile.c
+++ b/drivers/gpu/drm/xe/xe_tile.c
@@ -3,6 +3,8 @@
  * Copyright © 2023 Intel Corporation
  */
 
+#include <linux/fault-inject.h>
+
 #include <drm/drm_managed.h>
 
 #include "xe_device.h"
@@ -99,6 +101,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
 	return 0;
 }
 
+static noinline int fault_inject_tile_init_early(void)
+{
+	return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_tile_init_early, ERRNO);
+
 /**
  * xe_tile_init_early - Initialize the tile and primary GT
  * @tile: Tile to initialize
@@ -114,6 +122,10 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
 {
 	int err;
 
+	err = fault_inject_tile_init_early();
+	if (err)
+		return err;
+
 	tile->xe = xe;
 	tile->id = id;
 
diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
index eab9456e051f..8fff0fd7c675 100644
--- a/drivers/gpu/drm/xe/xe_uc_fw.c
+++ b/drivers/gpu/drm/xe/xe_uc_fw.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/bitfield.h>
+#include <linux/fault-inject.h>
 #include <linux/firmware.h>
 
 #include <drm/drm_managed.h>
@@ -776,11 +777,21 @@ static int uc_fw_copy(struct xe_uc_fw *uc_fw, const void *data, size_t size, u32
 	return err;
 }
 
+static noinline int fault_inject_uc_fw_init(void)
+{
+	return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_uc_fw_init, ERRNO);
+
 int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
 {
 	const struct firmware *fw = NULL;
 	int err;
 
+	err = fault_inject_uc_fw_init();
+	if (err)
+		return err;
+
 	err = uc_fw_request(uc_fw, &fw);
 	if (err)
 		return err;
diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
index 22c148b1e996..121443b790bf 100644
--- a/drivers/gpu/drm/xe/xe_wa.c
+++ b/drivers/gpu/drm/xe/xe_wa.c
@@ -8,6 +8,7 @@
 #include <drm/drm_managed.h>
 #include <kunit/visibility.h>
 #include <linux/compiler_types.h>
+#include <linux/fault-inject.h>
 
 #include <generated/xe_wa_oob.h>
 
@@ -818,6 +819,12 @@ void xe_wa_process_lrc(struct xe_hw_engine *hwe)
 	xe_rtp_process_to_sr(&ctx, lrc_was, &hwe->reg_lrc);
 }
 
+static noinline int fault_inject_wa_init(void)
+{
+	return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_wa_init, ERRNO);
+
 /**
  * xe_wa_init - initialize gt with workaround bookkeeping
  * @gt: GT instance to initialize
@@ -829,6 +836,11 @@ int xe_wa_init(struct xe_gt *gt)
 	struct xe_device *xe = gt_to_xe(gt);
 	size_t n_oob, n_lrc, n_engine, n_gt, total;
 	unsigned long *p;
+	int err;
+
+	err = fault_inject_wa_init();
+	if (err)
+		return err;
 
 	n_gt = BITS_TO_LONGS(ARRAY_SIZE(gt_was));
 	n_engine = BITS_TO_LONGS(ARRAY_SIZE(engine_was));
diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
index 93c82825d896..88a201122a22 100644
--- a/drivers/gpu/drm/xe/xe_wopcm.c
+++ b/drivers/gpu/drm/xe/xe_wopcm.c
@@ -5,6 +5,8 @@
 
 #include "xe_wopcm.h"
 
+#include <linux/fault-inject.h>
+
 #include "regs/xe_guc_regs.h"
 #include "xe_device.h"
 #include "xe_force_wake.h"
@@ -268,3 +270,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
 
 	return ret;
 }
+ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
-- 
2.43.0



More information about the Intel-xe mailing list