[Intel-xe] [PATCH v5 2/2] drm/xe: Introduce fault injection for gt reset
Rodrigo Vivi
rodrigo.vivi at intel.com
Thu Jul 20 16:28:51 UTC 2023
On Tue, Jul 18, 2023 at 07:02:16PM +0530, Himal Prasad Ghimiray wrote:
> To trigger gt reset failure:
> echo 100 > /sys/kernel/debug/dri/<cardX>/fail_gt_reset/probability
> echo 2 > /sys/kernel/debug/dri/<cardX>/fail_gt_reset/times
why 2 and not 1?
anyway, neat solution!
Reviewed-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> Cc: Lucas De Marchi <lucas.demarchi at intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
> drivers/gpu/drm/xe/xe_debugfs.c | 10 ++++++++++
> drivers/gpu/drm/xe/xe_gt.c | 8 +++++++-
> drivers/gpu/drm/xe/xe_gt.h | 14 ++++++++++++++
> 3 files changed, 31 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
> index 7827a785b020..08d5bdf4cf61 100644
> --- a/drivers/gpu/drm/xe/xe_debugfs.c
> +++ b/drivers/gpu/drm/xe/xe_debugfs.c
> @@ -5,6 +5,7 @@
>
> #include "xe_debugfs.h"
>
> +#include <linux/fault-inject.h>
> #include <linux/string_helpers.h>
>
> #include <drm/drm_debugfs.h>
> @@ -20,6 +21,10 @@
> #include "xe_vm.h"
> #endif
>
> +#ifdef CONFIG_FAULT_INJECTION
> +DECLARE_FAULT_ATTR(gt_reset_failure);
> +#endif
> +
> static struct xe_device *node_to_xe(struct drm_info_node *node)
> {
> return to_xe_device(node->minor->dev);
> @@ -131,4 +136,9 @@ void xe_debugfs_register(struct xe_device *xe)
>
> for_each_gt(gt, xe, id)
> xe_gt_debugfs_register(gt);
> +
> +#ifdef CONFIG_FAULT_INJECTION
> + fault_create_debugfs_attr("fail_gt_reset", root, &gt_reset_failure);
> +#endif
> +
> }
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index 1db4d610f2fd..370d4b96e616 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -525,6 +525,11 @@ static int gt_reset(struct xe_gt *gt)
>
> xe_gt_info(gt, "reset started\n");
>
> + if (xe_fault_inject_gt_reset()) {
> + err = -ECANCELED;
> + goto err_fail;
> + }
> +
> xe_gt_sanitize(gt);
>
> xe_device_mem_access_get(gt_to_xe(gt));
> @@ -562,6 +567,7 @@ static int gt_reset(struct xe_gt *gt)
> err_msg:
> XE_WARN_ON(xe_uc_start(&gt->uc));
> xe_device_mem_access_put(gt_to_xe(gt));
> +err_fail:
> xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
>
> /* Notify userspace about gt reset failure */
> @@ -582,7 +588,7 @@ void xe_gt_reset_async(struct xe_gt *gt)
> xe_gt_info(gt, "trying reset\n");
>
> /* Don't do a reset while one is already in flight */
> - if (xe_uc_reset_prepare(&gt->uc))
> + if (!xe_fault_inject_gt_reset() && xe_uc_reset_prepare(&gt->uc))
> return;
>
> xe_gt_info(gt, "reset queued\n");
> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
> index 7298653a73de..caded203a8a0 100644
> --- a/drivers/gpu/drm/xe/xe_gt.h
> +++ b/drivers/gpu/drm/xe/xe_gt.h
> @@ -7,6 +7,7 @@
> #define _XE_GT_H_
>
> #include <drm/drm_util.h>
> +#include <linux/fault-inject.h>
>
> #include "xe_device_types.h"
> #include "xe_hw_engine.h"
> @@ -16,6 +17,19 @@
> for_each_if(((hwe__) = (gt__)->hw_engines + (id__)) && \
> xe_hw_engine_is_valid((hwe__)))
>
> +#ifdef CONFIG_FAULT_INJECTION
> +extern struct fault_attr gt_reset_failure;
> +static inline bool xe_fault_inject_gt_reset(void)
> +{
> + return should_fail(&gt_reset_failure, 1);
> +}
> +#else
> +static inline bool xe_fault_inject_gt_reset(void)
> +{
> + return false;
> +}
> +#endif
> +
> struct xe_gt *xe_gt_alloc(struct xe_tile *tile);
> int xe_gt_init_early(struct xe_gt *gt);
> int xe_gt_init(struct xe_gt *gt);
> --
> 2.25.1
>
More information about the Intel-xe
mailing list