[PATCH v4 9/9] drm/xe/xe_hw_error: Add fault injection to trigger csc error handler

Fri Jul 11 17:41:40 UTC 2025

On Wed, Jul 09, 2025 at 04:50:21PM +0530, Riana Tauro wrote:
>Add a debugfs fault handler to trigger csc error handler that
>wedges the device and sends drm uevent
>
>Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>---
> drivers/gpu/drm/xe/xe_debugfs.c  |  2 ++
> drivers/gpu/drm/xe/xe_hw_error.c | 11 +++++++++++
> 2 files changed, 13 insertions(+)
>
>diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
>index d83cd6ed3fa8..134610437aea 100644
>--- a/drivers/gpu/drm/xe/xe_debugfs.c
>+++ b/drivers/gpu/drm/xe/xe_debugfs.c
>@@ -29,6 +29,7 @@
> #endif
>
> DECLARE_FAULT_ATTR(gt_reset_failure);
>+DECLARE_FAULT_ATTR(inject_csc_hw_error);
>
> static struct xe_device *node_to_xe(struct drm_info_node *node)
> {
>@@ -273,4 +274,5 @@ void xe_debugfs_register(struct xe_device *xe)
> 	xe_pxp_debugfs_register(xe->pxp);
>
> 	fault_create_debugfs_attr("fail_gt_reset", root, &gt_reset_failure);
>+	fault_create_debugfs_attr("inject_csc_hw_error", root, &inject_csc_hw_error);

Maybe create this attribute only for BMG, since on other platforms the worker
will bail out with an error anyway when it runs? Or are you expecting to see
the log message that says "runtime survivability not supported"?

The absence of this attribute in debugfs can also be sufficient to 
indicate that it's not supported.

Thanks,
Umesh

> }
>diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
>index 7cc9b8a7fa1a..2d56a93b3a71 100644
>--- a/drivers/gpu/drm/xe/xe_hw_error.c
>+++ b/drivers/gpu/drm/xe/xe_hw_error.c
>@@ -3,6 +3,8 @@
>  * Copyright © 2025 Intel Corporation
>  */
>
>+#include <linux/fault-inject.h>
>+
> #include "regs/xe_gsc_regs.h"
> #include "regs/xe_hw_error_regs.h"
> #include "regs/xe_irq_regs.h"
>@@ -13,6 +15,7 @@
> #include "xe_survivability_mode.h"
>
> #define  HEC_UNCORR_FW_ERR_BITS 4
>+extern struct fault_attr inject_csc_hw_error;
>
> /* Error categories reported by hardware */
> enum hardware_error {
>@@ -43,6 +46,11 @@ static const char *hw_error_to_str(const enum hardware_error hw_err)
> 	}
> }
>
>+static bool fault_inject_csc_hw_error(void)
>+{
>+	return should_fail(&inject_csc_hw_error, 1);
>+}
>+
> static void csc_hw_error_work(struct work_struct *work)
> {
> 	struct xe_tile *tile = container_of(work, typeof(*tile), csc_hw_error_work);
>@@ -134,6 +142,9 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
> {
> 	enum hardware_error hw_err;
>
>+	if (fault_inject_csc_hw_error())
>+		schedule_work(&tile->csc_hw_error_work);
>+
> 	for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
> 		if (master_ctl & ERROR_IRQ(hw_err))
> 			hw_error_source_handler(tile, hw_err);
>-- 
>2.47.1
>