[PATCH v6 9/9] drm/xe/xe_hw_error: Add fault injection to trigger csc error handler
Riana Tauro
riana.tauro at intel.com
Thu Jul 24 14:34:38 UTC 2025
Add a debugfs fault handler to trigger csc error handler that
wedges the device and enables runtime survivability mode
v2: add debugfs only for bmg (Umesh)
v3: do not use csc_fault attribute if debugfs is not enabled
Cc: Lucas De Marchi <lucas.demarchi at intel.com>
Signed-off-by: Riana Tauro <riana.tauro at intel.com>
---
drivers/gpu/drm/xe/xe_debugfs.c | 3 +++
drivers/gpu/drm/xe/xe_hw_error.c | 11 +++++++++++
2 files changed, 14 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
index 0b4a532f7c45..ac5b9e5889c5 100644
--- a/drivers/gpu/drm/xe/xe_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_debugfs.c
@@ -34,6 +34,7 @@
#endif
DECLARE_FAULT_ATTR(gt_reset_failure);
+DECLARE_FAULT_ATTR(inject_csc_hw_error);
static void read_residency_counter(struct xe_device *xe, struct xe_mmio *mmio,
u32 offset, char *name, struct drm_printer *p)
@@ -401,6 +402,8 @@ void xe_debugfs_register(struct xe_device *xe)
xe_pxp_debugfs_register(xe->pxp);
fault_create_debugfs_attr("fail_gt_reset", root, >_reset_failure);
+ if (xe->info.platform == XE_BATTLEMAGE)
+ fault_create_debugfs_attr("inject_csc_hw_error", root, &inject_csc_hw_error);
if (IS_SRIOV_PF(xe))
xe_sriov_pf_debugfs_register(xe, root);
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 4099ed1262e1..8c65291f36fc 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -3,6 +3,8 @@
* Copyright © 2025 Intel Corporation
*/
+#include <linux/fault-inject.h>
+
#include "regs/xe_gsc_regs.h"
#include "regs/xe_hw_error_regs.h"
#include "regs/xe_irq_regs.h"
@@ -13,6 +15,7 @@
#include "xe_survivability_mode.h"
#define HEC_UNCORR_FW_ERR_BITS 4
+extern struct fault_attr inject_csc_hw_error;
/* Error categories reported by hardware */
enum hardware_error {
@@ -43,6 +46,11 @@ static const char *hw_error_to_str(const enum hardware_error hw_err)
}
}
+static bool fault_inject_csc_hw_error(void)
+{
+ return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
+}
+
static void csc_hw_error_work(struct work_struct *work)
{
struct xe_tile *tile = container_of(work, typeof(*tile), csc_hw_error_work);
@@ -130,6 +138,9 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
{
enum hardware_error hw_err;
+ if (fault_inject_csc_hw_error())
+ schedule_work(&tile->csc_hw_error_work);
+
for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
if (master_ctl & ERROR_IRQ(hw_err))
hw_error_source_handler(tile, hw_err);
--
2.47.1
More information about the Intel-xe
mailing list