[PATCH v3] drm/xe: Use fault injection infrastructure to find issues at probe time
Francois Dugast
francois.dugast at intel.com
Wed Sep 25 15:55:46 UTC 2024
The kernel fault injection infrastructure is used to test proper error
handling during probe. The return code of the functions using
ALLOW_ERROR_INJECTION() can be conditionnally modified at runtime by
tuning some debugfs entries. This requires CONFIG_FUNCTION_ERROR_INJECTION
(among others).
One way to use fault injection at probe time by making each of those
functions fail one at a time is:
FAILTYPE=fail_function
DEVICE="0000:00:08.0" # depends on the system
ERRNO=-12 # -ENOMEM, can depend on the function
echo N > /sys/kernel/debug/$FAILTYPE/task-filter
echo 100 > /sys/kernel/debug/$FAILTYPE/probability
echo 0 > /sys/kernel/debug/$FAILTYPE/interval
echo -1 > /sys/kernel/debug/$FAILTYPE/times
echo 0 > /sys/kernel/debug/$FAILTYPE/space
echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
modprobe xe
echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
cut -d ' ' -f 1 | while read -r FUNCTION ; do
echo "Injecting fault in $FUNCTION"
echo "" > /sys/kernel/debug/$FAILTYPE/inject
echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
echo $DEVICE > /sys/bus/pci/drivers/xe/bind
done
rmmod xe
It will also be integrated into IGT for systematic execution by CI.
v2: Wrappers are not needed in the cases covered by this patch, so
remove them and use ALLOW_ERROR_INJECTION() directly.
Signed-off-by: Francois Dugast <francois.dugast at intel.com>
Cc: Lucas De Marchi <lucas.demarchi at intel.com>
Cc: Matthew Brost <matthew.brost at intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
---
drivers/gpu/drm/xe/xe_device.c | 17 +++++++++++++++++
drivers/gpu/drm/xe/xe_ggtt.c | 8 ++++++++
drivers/gpu/drm/xe/xe_guc_ads.c | 10 ++++++++++
drivers/gpu/drm/xe/xe_guc_ct.c | 9 +++++++++
drivers/gpu/drm/xe/xe_guc_log.c | 10 ++++++++++
drivers/gpu/drm/xe/xe_guc_relay.c | 8 ++++++++
drivers/gpu/drm/xe/xe_pm.c | 9 +++++++++
drivers/gpu/drm/xe/xe_sriov.c | 10 ++++++++++
drivers/gpu/drm/xe/xe_tile.c | 9 +++++++++
drivers/gpu/drm/xe/xe_uc_fw.c | 9 +++++++++
drivers/gpu/drm/xe/xe_wa.c | 8 ++++++++
drivers/gpu/drm/xe/xe_wopcm.c | 9 +++++++++
12 files changed, 116 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 53dcece40fc5..f84327f8d2bf 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -6,6 +6,7 @@
#include "xe_device.h"
#include <linux/delay.h>
+#include <linux/fault-inject.h>
#include <linux/units.h>
#include <drm/drm_aperture.h>
@@ -298,6 +299,13 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
ttm_device_fini(&xe->ttm);
}
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
struct xe_device *xe_device_create(struct pci_dev *pdev,
const struct pci_device_id *ent)
{
@@ -378,6 +386,7 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
err:
return ERR_PTR(err);
}
+ALLOW_ERROR_INJECTION(xe_device_create, ERRNO);
static bool xe_driver_flr_disabled(struct xe_device *xe)
{
@@ -500,6 +509,13 @@ static bool verify_lmem_ready(struct xe_device *xe)
return !!val;
}
+/*
+ * This function fullfills the requirements for the error injectable functions
+ * fault-injection.html#requirements-for-the-error-injectable-functions so the
+ * ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in
+ * the callers.
+ */
static int wait_for_lmem_ready(struct xe_device *xe)
{
unsigned long timeout, start;
@@ -546,6 +562,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
return 0;
}
+ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);
static void update_device_info(struct xe_device *xe)
{
diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
index f68af56c3f86..a55f4afff4d6 100644
--- a/drivers/gpu/drm/xe/xe_ggtt.c
+++ b/drivers/gpu/drm/xe/xe_ggtt.c
@@ -5,6 +5,7 @@
#include "xe_ggtt.h"
+#include <linux/fault-inject.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sizes.h>
@@ -209,6 +210,12 @@ static const struct xe_ggtt_pt_ops xelpg_pt_wa_ops = {
* initial clear done to it yet. That will happen in the regular, non-early
* GGTT initialization.
*
+ * This function fullfills the requirements for the error injectable functions
+ * fault-injection.html#requirements-for-the-error-injectable-functions so the
+ * ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in
+ * the callers.
+ *
* Return: 0 on success or a negative error code on failure.
*/
int xe_ggtt_init_early(struct xe_ggtt *ggtt)
@@ -264,6 +271,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt)
return 0;
}
+ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO);
static void xe_ggtt_invalidate(struct xe_ggtt *ggtt);
diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
index 66d4e5e95abd..74a2d600e811 100644
--- a/drivers/gpu/drm/xe/xe_guc_ads.c
+++ b/drivers/gpu/drm/xe/xe_guc_ads.c
@@ -5,6 +5,8 @@
#include "xe_guc_ads.h"
+#include <linux/fault-inject.h>
+
#include <drm/drm_managed.h>
#include <generated/xe_wa_oob.h>
@@ -396,6 +398,13 @@ static int calculate_waklv_size(struct xe_guc_ads *ads)
#define MAX_GOLDEN_LRC_SIZE (SZ_4K * 64)
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
int xe_guc_ads_init(struct xe_guc_ads *ads)
{
struct xe_device *xe = ads_to_xe(ads);
@@ -418,6 +427,7 @@ int xe_guc_ads_init(struct xe_guc_ads *ads)
return 0;
}
+ALLOW_ERROR_INJECTION(xe_guc_ads_init, ERRNO);
/**
* xe_guc_ads_init_post_hwconfig - initialize ADS post hwconfig load
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 4b95f75b1546..41900d1da5e8 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -8,6 +8,7 @@
#include <linux/bitfield.h>
#include <linux/circ_buf.h>
#include <linux/delay.h>
+#include <linux/fault-inject.h>
#include <kunit/static_stub.h>
@@ -165,6 +166,13 @@ static void primelockdep(struct xe_guc_ct *ct)
fs_reclaim_release(GFP_KERNEL);
}
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
int xe_guc_ct_init(struct xe_guc_ct *ct)
{
struct xe_device *xe = ct_to_xe(ct);
@@ -209,6 +217,7 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
ct->state = XE_GUC_CT_STATE_DISABLED;
return 0;
}
+ALLOW_ERROR_INJECTION(xe_guc_ct_init, ERRNO);
#define desc_read(xe_, guc_ctb__, field_) \
xe_map_rd_field(xe_, &guc_ctb__->desc, 0, \
diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
index a37ee3419428..d448c7f09c01 100644
--- a/drivers/gpu/drm/xe/xe_guc_log.c
+++ b/drivers/gpu/drm/xe/xe_guc_log.c
@@ -5,6 +5,8 @@
#include "xe_guc_log.h"
+#include <linux/fault-inject.h>
+
#include <drm/drm_managed.h>
#include "xe_bo.h"
@@ -77,6 +79,13 @@ void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p)
}
}
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
int xe_guc_log_init(struct xe_guc_log *log)
{
struct xe_device *xe = log_to_xe(log);
@@ -96,3 +105,4 @@ int xe_guc_log_init(struct xe_guc_log *log)
return 0;
}
+ALLOW_ERROR_INJECTION(xe_guc_log_init, ERRNO);
diff --git a/drivers/gpu/drm/xe/xe_guc_relay.c b/drivers/gpu/drm/xe/xe_guc_relay.c
index ade6162dc259..d077e2ddb531 100644
--- a/drivers/gpu/drm/xe/xe_guc_relay.c
+++ b/drivers/gpu/drm/xe/xe_guc_relay.c
@@ -5,6 +5,7 @@
#include <linux/bitfield.h>
#include <linux/delay.h>
+#include <linux/fault-inject.h>
#include <drm/drm_managed.h>
@@ -327,6 +328,12 @@ static void __fini_relay(struct drm_device *drm, void *arg)
* Initialize remaining members of &xe_guc_relay that may depend
* on the SR-IOV mode.
*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ *
* Return: 0 on success or a negative error code on failure.
*/
int xe_guc_relay_init(struct xe_guc_relay *relay)
@@ -355,6 +362,7 @@ int xe_guc_relay_init(struct xe_guc_relay *relay)
return drmm_add_action_or_reset(&xe->drm, __fini_relay, relay);
}
+ALLOW_ERROR_INJECTION(xe_guc_relay_init, ERRNO);
static u32 to_relay_error(int err)
{
diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
index 33eb039053e4..b2f0cd1adc88 100644
--- a/drivers/gpu/drm/xe/xe_pm.c
+++ b/drivers/gpu/drm/xe/xe_pm.c
@@ -5,6 +5,7 @@
#include "xe_pm.h"
+#include <linux/fault-inject.h>
#include <linux/pm_runtime.h>
#include <drm/drm_managed.h>
@@ -247,6 +248,13 @@ static void xe_pm_runtime_init(struct xe_device *xe)
pm_runtime_put(dev);
}
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
int xe_pm_init_early(struct xe_device *xe)
{
int err;
@@ -263,6 +271,7 @@ int xe_pm_init_early(struct xe_device *xe)
return 0;
}
+ALLOW_ERROR_INJECTION(xe_pm_init_early, ERRNO);
/**
* xe_pm_init - Initialize Xe Power Management
diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
index 69a066ef20c0..b714825c209c 100644
--- a/drivers/gpu/drm/xe/xe_sriov.c
+++ b/drivers/gpu/drm/xe/xe_sriov.c
@@ -3,6 +3,8 @@
* Copyright © 2023 Intel Corporation
*/
+#include <linux/fault-inject.h>
+
#include <drm/drm_managed.h>
#include "regs/xe_regs.h"
@@ -98,6 +100,13 @@ static void fini_sriov(struct drm_device *drm, void *arg)
* In this function we create dedicated workqueue that will be used
* by the SR-IOV specific workers.
*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip
+ * execution at runtime and use a provided return value, in order to
+ * test errors paths in the callers. The requirements for the error
+ * injectable functions are not strictly fullfilled but this is
+ * acceptable because the caller only propagates the error up the
+ * stack without cleanup of resources potentially allocated here.
+ *
* Return: 0 on success or a negative error code on failure.
*/
int xe_sriov_init(struct xe_device *xe)
@@ -119,6 +128,7 @@ int xe_sriov_init(struct xe_device *xe)
return drmm_add_action_or_reset(&xe->drm, fini_sriov, xe);
}
+ALLOW_ERROR_INJECTION(xe_sriov_init, ERRNO);
/**
* xe_sriov_print_info - Print basic SR-IOV information.
diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
index dda5268507d8..fe56fb6327a9 100644
--- a/drivers/gpu/drm/xe/xe_tile.c
+++ b/drivers/gpu/drm/xe/xe_tile.c
@@ -3,6 +3,8 @@
* Copyright © 2023 Intel Corporation
*/
+#include <linux/fault-inject.h>
+
#include <drm/drm_managed.h>
#include "xe_device.h"
@@ -108,6 +110,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
* Initializes per-tile resources that don't require any interactions with the
* hardware or any knowledge about the Graphics/Media IP version.
*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ *
* Returns: 0 on success, negative error code on error.
*/
int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
@@ -129,6 +137,7 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
return 0;
}
+ALLOW_ERROR_INJECTION(xe_tile_init_early, ERRNO);
static int tile_ttm_mgr_init(struct xe_tile *tile)
{
diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
index eab9456e051f..8f43b0990947 100644
--- a/drivers/gpu/drm/xe/xe_uc_fw.c
+++ b/drivers/gpu/drm/xe/xe_uc_fw.c
@@ -4,6 +4,7 @@
*/
#include <linux/bitfield.h>
+#include <linux/fault-inject.h>
#include <linux/firmware.h>
#include <drm/drm_managed.h>
@@ -776,6 +777,13 @@ static int uc_fw_copy(struct xe_uc_fw *uc_fw, const void *data, size_t size, u32
return err;
}
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
{
const struct firmware *fw = NULL;
@@ -797,6 +805,7 @@ int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
return err;
}
+ALLOW_ERROR_INJECTION(xe_uc_fw_init, ERRNO);
static u32 uc_fw_ggtt_offset(struct xe_uc_fw *uc_fw)
{
diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
index 22c148b1e996..3c9d36fd1a2c 100644
--- a/drivers/gpu/drm/xe/xe_wa.c
+++ b/drivers/gpu/drm/xe/xe_wa.c
@@ -8,6 +8,7 @@
#include <drm/drm_managed.h>
#include <kunit/visibility.h>
#include <linux/compiler_types.h>
+#include <linux/fault-inject.h>
#include <generated/xe_wa_oob.h>
@@ -822,6 +823,12 @@ void xe_wa_process_lrc(struct xe_hw_engine *hwe)
* xe_wa_init - initialize gt with workaround bookkeeping
* @gt: GT instance to initialize
*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ *
* Returns 0 for success, negative error code otherwise.
*/
int xe_wa_init(struct xe_gt *gt)
@@ -850,6 +857,7 @@ int xe_wa_init(struct xe_gt *gt)
return 0;
}
+ALLOW_ERROR_INJECTION(xe_wa_init, ERRNO);
void xe_wa_dump(struct xe_gt *gt, struct drm_printer *p)
{
diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
index 93c82825d896..1ea9ddf32c2a 100644
--- a/drivers/gpu/drm/xe/xe_wopcm.c
+++ b/drivers/gpu/drm/xe/xe_wopcm.c
@@ -5,6 +5,8 @@
#include "xe_wopcm.h"
+#include <linux/fault-inject.h>
+
#include "regs/xe_guc_regs.h"
#include "xe_device.h"
#include "xe_force_wake.h"
@@ -193,6 +195,12 @@ u32 xe_wopcm_size(struct xe_device *xe)
* enforce platform dependent hardware restrictions on GuC WOPCM offset and
* size. It will fail the WOPCM init if any of these checks fail, so that the
* following WOPCM registers setup and GuC firmware uploading would be aborted.
+ *
+ * This function fullfills the requirements for the error injectable functions
+ * fault-injection.html#requirements-for-the-error-injectable-functions so the
+ * ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in
+ * the callers.
*/
int xe_wopcm_init(struct xe_wopcm *wopcm)
{
@@ -268,3 +276,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
return ret;
}
+ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
--
2.43.0
More information about the Intel-xe
mailing list