[PATCH v6 2/4] drm/xe: Don't update wedged mode in case of an error

Wed May 28 09:17:22 UTC 2025

On 21.05.2025 14:25, Lukasz Laguna wrote:
> Update driver's internal wedged.mode state only in case of a success or
> set it to "misconfigured" in case of inconsistent reset policy state
> between GTs.
> 
> Fixes: 6b8ef44cc0a9 ("drm/xe: Introduce the wedged_mode debugfs")
> Signed-off-by: Lukasz Laguna <lukasz.laguna at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_debugfs.c      | 34 ++++++++++++++++++++++++----
>  drivers/gpu/drm/xe/xe_device.c       |  2 ++
>  drivers/gpu/drm/xe/xe_device_types.h |  1 +
>  drivers/gpu/drm/xe/xe_guc_ads.c      | 12 +++++-----
>  drivers/gpu/drm/xe/xe_guc_ads.h      |  4 +++-
>  5 files changed, 41 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
> index 5dd0ad3f9bae..5bc4fe5aa9a2 100644
> --- a/drivers/gpu/drm/xe/xe_debugfs.c
> +++ b/drivers/gpu/drm/xe/xe_debugfs.c
> @@ -150,6 +150,25 @@ static ssize_t wedged_mode_show(struct file *f, char __user *ubuf,
>  	return simple_read_from_buffer(ubuf, size, pos, buf, len);
>  }
>  
> +static int wedged_mode_set_reset_policy(struct xe_gt *gt, enum xe_wedged_mode mode)
> +{
> +	struct xe_device *xe = gt_to_xe(gt);
> +	int ret;
> +
> +	if ((xe->wedged.mode == XE_WEDGED_MODE_NEVER &&
> +	     mode == XE_WEDGED_MODE_UPON_CRITICAL_ERROR) ||
> +	    (xe->wedged.mode == XE_WEDGED_MODE_UPON_CRITICAL_ERROR &&
> +	     mode == XE_WEDGED_MODE_NEVER))
> +		return 0;

this could be a separate helper:

	bool need_policy_change(old_mode, new_mode)

and might be called before we enter the for_each_gt loop

> +
> +	ret = xe_guc_ads_scheduler_policy_toggle_reset(&gt->uc.guc.ads,
> +						       !(mode == XE_WEDGED_MODE_UPON_ANY_HANG));
> +	if (ret)
> +		xe_gt_err(gt, "Failed to update GuC ADS scheduler policy (%pe)\n", ERR_PTR(ret));
> +
> +	return ret;
> +}
> +
>  static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
>  			       size_t size, loff_t *pos)
>  {
> @@ -170,19 +189,24 @@ static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
>  	if (xe->wedged.mode == wedged_mode)
>  		return size;
>  
> -	xe->wedged.mode = wedged_mode;
> -
>  	xe_pm_runtime_get(xe);
>  	for_each_gt(gt, xe, id) {
> -		ret = xe_guc_ads_scheduler_policy_toggle_reset(&gt->uc.guc.ads);
> +		ret = wedged_mode_set_reset_policy(gt, wedged_mode);
>  		if (ret) {
> -			xe_gt_err(gt, "Failed to update GuC ADS scheduler policy. GuC may still cause engine reset even with wedged_mode=2\n");
> +			if (id > 0) {
> +				xe->wedged.mode = XE_WEDGED_MODE_MISCONFIGURED;

hmm, I'm not sure that the rest of the driver code will properly handle
this new unexpected state - do we really need this? can't we just try to
revert recent change where it was accepted? and if it fails there it
still not the end of the world

alternatively we can try to track GuC reset policy settings per-GuC, so
we will know where it was left inconsistent with global 'wedge' mode

or just wait until next mode update or GT reset where likely we will be
able to restore the consistency

IMO it should be fine just to report inconsistency, and then let the
user decide what to do next

> +				drm_err(&xe->drm, "Inconsistent reset policy state between GTs, setting wedged mode: (%u) %s\n",
> +					xe->wedged.mode,
> +					xe_device_wedged_mode_to_string(xe->wedged.mode));

this new state is not the one exposed in modparam description, user
might be confused what it means

> +			}
>  			xe_pm_runtime_put(xe);
> -			return -EIO;
> +			return ret;
>  		}
>  	}
>  	xe_pm_runtime_put(xe);
>  
> +	xe->wedged.mode = wedged_mode;
> +
>  	return size;
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index fa4f1a3ac1f2..0aea87994541 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -1206,6 +1206,8 @@ const char *xe_device_wedged_mode_to_string(enum xe_wedged_mode mode)
>  		return "upon-critical-error";
>  	case XE_WEDGED_MODE_UPON_ANY_HANG:
>  		return "upon-any-hang";
> +	case XE_WEDGED_MODE_MISCONFIGURED:
> +		return "misconfigured";
>  	default:
>  		return "<invalid>";
>  	}
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index a8dfacb951c1..8ec1d891a2ab 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -562,6 +562,7 @@ struct xe_device {
>  			XE_WEDGED_MODE_NEVER = 0,
>  			XE_WEDGED_MODE_UPON_CRITICAL_ERROR = 1,
>  			XE_WEDGED_MODE_UPON_ANY_HANG = 2,
> +			XE_WEDGED_MODE_MISCONFIGURED = 3,
>  			XE_WEDGED_MODE_DEFAULT = XE_WEDGED_MODE_UPON_CRITICAL_ERROR,
>  		} mode;
>  	} wedged;
> diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> index ca8bdb0ec08d..f2c4c4e292f6 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> @@ -1008,14 +1008,14 @@ static int guc_ads_update_policies(struct xe_guc_ads *ads, const struct guc_poli
>  /**
>   * xe_guc_ads_scheduler_policy_toggle_reset - Toggle reset policy
>   * @ads: Additional data structures object
> + * @enable: true to enable engine resets, false otherwise
>   *
> - * This function update the GuC's engine reset policy based on wedged.mode.
> + * This function update the GuC's engine reset policy.
>   *
>   * Return: 0 on success, and negative error code otherwise.
>   */
> -int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads)
> +int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads, bool enable)
>  {
> -	struct xe_device *xe = ads_to_xe(ads);
>  	struct guc_policies *policies;
>  	int ret;
>  
> @@ -1026,10 +1026,10 @@ int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads)
>  	policies->dpc_promote_time = ads_blob_read(ads, policies.dpc_promote_time);
>  	policies->max_num_work_items = ads_blob_read(ads, policies.max_num_work_items);
>  	policies->is_valid = 1;
> -	if (xe->wedged.mode == XE_WEDGED_MODE_UPON_ANY_HANG)
> -		policies->global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
> -	else
> +	if (enable)
>  		policies->global_flags &= ~GLOBAL_POLICY_DISABLE_ENGINE_RESET;
> +	else
> +		policies->global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;

btw, do we correctly update policies stored in ADS that likely will be
used again by the GuC on the subsequent reset ?

adding @John

>  
>  	ret = guc_ads_update_policies(ads, policies);
>  	kfree(policies);
> diff --git a/drivers/gpu/drm/xe/xe_guc_ads.h b/drivers/gpu/drm/xe/xe_guc_ads.h
> index 2e6674c760ff..9879aadd22d6 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ads.h
> +++ b/drivers/gpu/drm/xe/xe_guc_ads.h
> @@ -6,6 +6,8 @@
>  #ifndef _XE_GUC_ADS_H_
>  #define _XE_GUC_ADS_H_
>  
> +#include <linux/types.h>
> +
>  struct xe_guc_ads;
>  
>  int xe_guc_ads_init(struct xe_guc_ads *ads);
> @@ -13,6 +15,6 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads);
>  void xe_guc_ads_populate(struct xe_guc_ads *ads);
>  void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads);
>  void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads);
> -int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads);
> +int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads, bool enable);
>  
>  #endif