[PATCH] drm/xe: Always check force_wake_get return code

Matt Roper matthew.d.roper at intel.com
Tue Mar 12 23:07:15 UTC 2024


On Tue, Mar 12, 2024 at 12:42:56PM -0700, Daniele Ceraolo Spurio wrote:
> A force_wake_get failure means that the HW might not be awake for the
> access we're doing; this can lead to an immediate error or it can be a
> more subtle problem (e.g. a register read might return an incorrect
> value that is still valid, leading the driver to make a wrong choice
> instead of flagging an error).
> We avoid an error from the force_wake function because callers might
> handle or tolerate the error, but this only works if all callers
> are checking the error code. The majority already do, but a few are not.
> These are mainly falling into 3 categories, which are each handled
> differently:
> 
> 1) error capture: in this case we want to continue the capture, but we
>    log an info message in dmesg to notify the user that the capture
>    might have incorrect data.
> 
> 2) ioctl: in this case we return a -EIO error to userspace
> 
> 3) unabortable actions: these are scenarios where we can't simply abort
>    and retry and so it's better to just try it anyway because there is a
>    chance the HW is awake even with the failure. In this case we throw a
>    warning so we know there was a forcewake problem if something fails
>    down the line.
> 
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: Tejas Upadhyay <tejas.upadhyay at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_devcoredump.c         | 9 +++++++--
>  drivers/gpu/drm/xe/xe_gsc.c                 | 2 +-
>  drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 2 +-
>  drivers/gpu/drm/xe/xe_guc.c                 | 2 +-
>  drivers/gpu/drm/xe/xe_guc_pc.c              | 2 +-
>  drivers/gpu/drm/xe/xe_guc_submit.c          | 4 +++-
>  drivers/gpu/drm/xe/xe_query.c               | 3 ++-
>  7 files changed, 16 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 0fcd30680323..7d3aa6bd3524 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -13,6 +13,7 @@
>  #include "xe_exec_queue.h"
>  #include "xe_force_wake.h"
>  #include "xe_gt.h"
> +#include "xe_gt_printk.h"
>  #include "xe_guc_ct.h"
>  #include "xe_guc_submit.h"
>  #include "xe_hw_engine.h"
> @@ -64,7 +65,9 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
>  {
>  	struct xe_devcoredump_snapshot *ss = container_of(work, typeof(*ss), work);
>  
> -	xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL);
> +	/* keep going if fw fails as we still want to save the memory and SW data */
> +	if (xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL))
> +		xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n");
>  	xe_vm_snapshot_capture_delayed(ss->vm);
>  	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
>  	xe_force_wake_put(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL);
> @@ -180,7 +183,9 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>  		}
>  	}
>  
> -	xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL);
> +	/* keep going if fw fails as we still want to save the memory and SW data */
> +	if (xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL))
> +		xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n");
>  
>  	coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
>  	coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(job);
> diff --git a/drivers/gpu/drm/xe/xe_gsc.c b/drivers/gpu/drm/xe/xe_gsc.c
> index d9aa815a5bc2..902c52d95a8a 100644
> --- a/drivers/gpu/drm/xe/xe_gsc.c
> +++ b/drivers/gpu/drm/xe/xe_gsc.c
> @@ -287,7 +287,7 @@ static void gsc_work(struct work_struct *work)
>  	spin_unlock_irq(&gsc->lock);
>  
>  	xe_pm_runtime_get(xe);
> -	xe_force_wake_get(gt_to_fw(gt), XE_FW_GSC);
> +	XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FW_GSC));

Should this use xe_gt_WARN_ON() instead to make it GT-specific?  Same
for the other cases later where it's even more important to know which
specific GT is misbehaving.

Aside from that,

        Reviewed-by: Matt Roper <matthew.d.roper at intel.com>

>  
>  	if (actions & GSC_ACTION_FW_LOAD) {
>  		ret = gsc_upload_and_init(gsc);
> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> index a3c4ffba679d..7f1ac02b96c6 100644
> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> @@ -247,7 +247,7 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt)
>  
>  		xe_gt_tlb_invalidation_wait(gt, seqno);
>  	} else if (xe_device_uc_enabled(xe)) {
> -		xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
> +		XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FW_GT));
>  		if (xe->info.platform == XE_PVC || GRAPHICS_VER(xe) >= 20) {
>  			xe_mmio_write32(gt, PVC_GUC_TLB_INV_DESC1,
>  					PVC_GUC_TLB_INV_DESC1_INVALIDATE);
> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
> index caa86ccbe9e7..32b6da891f72 100644
> --- a/drivers/gpu/drm/xe/xe_guc.c
> +++ b/drivers/gpu/drm/xe/xe_guc.c
> @@ -251,7 +251,7 @@ static void guc_fini(struct drm_device *drm, void *arg)
>  {
>  	struct xe_guc *guc = arg;
>  
> -	xe_force_wake_get(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL);
> +	XE_WARN_ON(xe_force_wake_get(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL));
>  	xe_uc_fini_hw(&guc_to_gt(guc)->uc);
>  	xe_force_wake_put(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL);
>  }
> diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c
> index f4b031b8d9de..b4b39bfcb5d5 100644
> --- a/drivers/gpu/drm/xe/xe_guc_pc.c
> +++ b/drivers/gpu/drm/xe/xe_guc_pc.c
> @@ -927,7 +927,7 @@ static void xe_guc_pc_fini(struct drm_device *drm, void *arg)
>  		return;
>  	}
>  
> -	xe_force_wake_get(gt_to_fw(pc_to_gt(pc)), XE_FORCEWAKE_ALL);
> +	XE_WARN_ON(xe_force_wake_get(gt_to_fw(pc_to_gt(pc)), XE_FORCEWAKE_ALL));
>  	XE_WARN_ON(xe_guc_pc_gucrc_disable(pc));
>  	XE_WARN_ON(xe_guc_pc_stop(pc));
>  	xe_force_wake_put(gt_to_fw(pc_to_gt(pc)), XE_FORCEWAKE_ALL);
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 19efdb2f881f..d7462969af1e 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -833,7 +833,9 @@ static void simple_error_capture(struct xe_exec_queue *q)
>  			}
>  		}
>  
> -		xe_force_wake_get(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL);
> +		if (xe_force_wake_get(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL))
> +			xe_gt_info(guc_to_gt(guc),
> +				   "failed to get forcewake for error capture");
>  		xe_guc_ct_print(&guc->ct, &p, true);
>  		guc_exec_queue_print(q, &p);
>  		for_each_hw_engine(hwe, guc_to_gt(guc), id) {
> diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
> index a6a20a6dd360..3975f99fdca5 100644
> --- a/drivers/gpu/drm/xe/xe_query.c
> +++ b/drivers/gpu/drm/xe/xe_query.c
> @@ -147,7 +147,8 @@ query_engine_cycles(struct xe_device *xe,
>  	if (!hwe)
>  		return -EINVAL;
>  
> -	xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
> +	if (xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL))
> +		return -EIO;
>  
>  	__read_timestamps(gt,
>  			  RING_TIMESTAMP(hwe->mmio_base),
> -- 
> 2.43.0
> 

-- 
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation


More information about the Intel-xe mailing list