[PATCH 6/7] drm/xe: Stop checking for power_lost on D3Cold

Tue May 7 10:35:55 UTC 2024


> -----Original Message-----
> From: Vivi, Rodrigo <rodrigo.vivi at intel.com>
> Sent: Saturday, May 4, 2024 12:43 AM
> To: intel-xe at lists.freedesktop.org
> Cc: De Marchi, Lucas <lucas.demarchi at intel.com>; Brost, Matthew
> <matthew.brost at intel.com>; Dugast, Francois <francois.dugast at intel.com>;
> thomas.hellstrom at linux.intel.com; Auld, Matthew
> <matthew.auld at intel.com>; Gupta, Anshuman
> <anshuman.gupta at intel.com>; Vivi, Rodrigo <rodrigo.vivi at intel.com>
> Subject: [PATCH 6/7] drm/xe: Stop checking for power_lost on D3Cold
> 
> GuC reset status is not reliable for this purpose: once in a while we end up
> in D3Cold with power_lost reported as false, and without the proper memory
> restoration the GuC reload and Display will fail to come back from D3Cold.
> 
> So, let's do a full restoration of everything if we have a risk of losing power,
> without further optimizations.
> 
> v2: also remove the xe_guc_in_reset function (Anshuman)
> 
> Cc: Anshuman Gupta <anshuman.gupta at intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
Reviewed-by: Anshuman Gupta <anshuman.gupta at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device_types.h |  3 ---
>  drivers/gpu/drm/xe/xe_guc.c          | 27 ---------------------------
>  drivers/gpu/drm/xe/xe_guc.h          |  1 -
>  drivers/gpu/drm/xe/xe_pm.c           | 12 ++----------
>  4 files changed, 2 insertions(+), 41 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h
> b/drivers/gpu/drm/xe/xe_device_types.h
> index 0f68c55ea405..863ba49fedea 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -427,9 +427,6 @@ struct xe_device {
>  		/** @d3cold.allowed: Indicates if d3cold is a valid device
> state */
>  		bool allowed;
> 
> -		/** @d3cold.power_lost: Indicates if card has really lost
> power. */
> -		bool power_lost;
> -
>  		/**
>  		 * @d3cold.vram_threshold:
>  		 *
> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c index
> 0c9938e0ab8c..7860b720a99c 100644
> --- a/drivers/gpu/drm/xe/xe_guc.c
> +++ b/drivers/gpu/drm/xe/xe_guc.c
> @@ -938,30 +938,3 @@ void xe_guc_print_info(struct xe_guc *guc, struct
> drm_printer *p)
>  	xe_guc_ct_print(&guc->ct, p, false);
>  	xe_guc_submit_print(guc, p);
>  }
> -
> -/**
> - * xe_guc_in_reset() - Detect if GuC MIA is in reset.
> - * @guc: The GuC object
> - *
> - * This function detects runtime resume from d3cold by leveraging
> - * GUC_STATUS, GUC doesn't get reset during d3hot,
> - * it strictly to be called from RPM resume handler.
> - *
> - * Return: true if failed to get forcewake or GuC MIA is in Reset,
> - * otherwise false.
> - */
> -bool xe_guc_in_reset(struct xe_guc *guc) -{
> -	struct xe_gt *gt = guc_to_gt(guc);
> -	u32 status;
> -	int err;
> -
> -	err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
> -	if (err)
> -		return true;
> -
> -	status = xe_mmio_read32(gt, GUC_STATUS);
> -	xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
> -
> -	return  status & GS_MIA_IN_RESET;
> -}
> diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
> index a3c92b74a3d5..af59c9545753 100644
> --- a/drivers/gpu/drm/xe/xe_guc.h
> +++ b/drivers/gpu/drm/xe/xe_guc.h
> @@ -37,7 +37,6 @@ void xe_guc_reset_wait(struct xe_guc *guc);  void
> xe_guc_stop_prepare(struct xe_guc *guc);  void xe_guc_stop(struct xe_guc
> *guc);  int xe_guc_start(struct xe_guc *guc); -bool xe_guc_in_reset(struct
> xe_guc *guc);
> 
>  static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class)  {
> diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index
> 8500dc93f695..fa099a8981e7 100644
> --- a/drivers/gpu/drm/xe/xe_pm.c
> +++ b/drivers/gpu/drm/xe/xe_pm.c
> @@ -404,15 +404,7 @@ int xe_pm_runtime_resume(struct xe_device *xe)
> 
>  	lock_map_acquire(&xe_pm_runtime_lockdep_map);
> 
> -	/*
> -	 * It can be possible that xe has allowed d3cold but other pcie
> devices
> -	 * in gfx card soc would have blocked d3cold, therefore card has not
> -	 * really lost power. Detecting primary Gt power is sufficient.
> -	 */
> -	gt = xe_device_get_gt(xe, 0);
> -	xe->d3cold.power_lost = xe_guc_in_reset(&gt->uc.guc);
> -
> -	if (xe->d3cold.allowed && xe->d3cold.power_lost) {
> +	if (xe->d3cold.allowed) {
>  		err = xe_pcode_ready(xe, true);
>  		if (err)
>  			goto out;
> @@ -433,7 +425,7 @@ int xe_pm_runtime_resume(struct xe_device *xe)
>  	for_each_gt(gt, xe, id)
>  		xe_gt_resume(gt);
> 
> -	if (xe->d3cold.allowed && xe->d3cold.power_lost) {
> +	if (xe->d3cold.allowed) {
>  		xe_display_pm_resume(xe, true);
>  		err = xe_bo_restore_user(xe);
>  		if (err)
> --
> 2.44.0