[PATCH 2/4] drm/xe: declare wedged upon GuC load failure

Rodrigo Vivi rodrigo.vivi at intel.com
Tue Apr 16 19:05:46 UTC 2024


On Tue, Apr 09, 2024 at 06:15:05PM -0400, Rodrigo Vivi wrote:
> Let's block the device upon any GuC load failure.
> But let's continue with the probe so guc logs can be read
> from the debugfs.
> 
> v2: - s/wedged/busted
>     - do not block probe or we lose guc_logs in debugfs (Matt)
> 
> v3: - s/busted/wedged
> 
> Cc: Matthew Brost <matthew.brost at intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_guc.c | 42 ++++++++++++++++---------------------
>  1 file changed, 18 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
> index 240e7a4bbff1..f1c3e338301d 100644
> --- a/drivers/gpu/drm/xe/xe_guc.c
> +++ b/drivers/gpu/drm/xe/xe_guc.c
> @@ -451,7 +451,7 @@ static int guc_xfer_rsa(struct xe_guc *guc)
>  	return 0;
>  }
>  
> -static int guc_wait_ucode(struct xe_guc *guc)
> +static void guc_wait_ucode(struct xe_guc *guc)
>  {
>  	struct xe_gt *gt = guc_to_gt(guc);
>  	u32 status;
> @@ -479,30 +479,26 @@ static int guc_wait_ucode(struct xe_guc *guc)
>  			     200000, &status, false);
>  
>  	if (ret) {
> -		xe_gt_info(gt, "GuC load failed: status = 0x%08X\n", status);
> -		xe_gt_info(gt, "GuC status: Reset = %u, BootROM = %#X, UKernel = %#X, MIA = %#X, Auth = %#X\n",
> -			   REG_FIELD_GET(GS_MIA_IN_RESET, status),
> -			   REG_FIELD_GET(GS_BOOTROM_MASK, status),
> -			   REG_FIELD_GET(GS_UKERNEL_MASK, status),
> -			   REG_FIELD_GET(GS_MIA_MASK, status),
> -			   REG_FIELD_GET(GS_AUTH_STATUS_MASK, status));
> -
> -		if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) {
> -			xe_gt_info(gt, "GuC firmware signature verification failed\n");
> -			ret = -ENOEXEC;
> -		}
> +		xe_gt_err(gt, "GuC load failed: status = 0x%08X\n", status);
> +		xe_gt_err(gt, "GuC status: Reset = %u, BootROM = %#X, UKernel = %#X, MIA = %#X, Auth = %#X\n",
> +			  REG_FIELD_GET(GS_MIA_IN_RESET, status),
> +			  REG_FIELD_GET(GS_BOOTROM_MASK, status),
> +			  REG_FIELD_GET(GS_UKERNEL_MASK, status),
> +			  REG_FIELD_GET(GS_MIA_MASK, status),
> +			  REG_FIELD_GET(GS_AUTH_STATUS_MASK, status));
> +
> +		if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED)
> +			xe_gt_err(gt, "GuC firmware signature verification failed\n");
>  
>  		if (REG_FIELD_GET(GS_UKERNEL_MASK, status) ==
> -		    XE_GUC_LOAD_STATUS_EXCEPTION) {
> -			xe_gt_info(gt, "GuC firmware exception. EIP: %#x\n",
> -				   xe_mmio_read32(gt, SOFT_SCRATCH(13)));
> -			ret = -ENXIO;
> -		}
> +		    XE_GUC_LOAD_STATUS_EXCEPTION)
> +			xe_gt_err(gt, "GuC firmware exception. EIP: %#x\n",
> +				  xe_mmio_read32(gt, SOFT_SCRATCH(13)));
> +
> +		xe_device_declare_wedged(gt_to_xe(gt));
>  	} else {
>  		xe_gt_dbg(gt, "GuC successfully loaded\n");
>  	}
> -
> -	return ret;
>  }
>  
>  static int __xe_guc_upload(struct xe_guc *guc)
> @@ -532,16 +528,14 @@ static int __xe_guc_upload(struct xe_guc *guc)
>  		goto out;
>  
>  	/* Wait for authentication */
> -	ret = guc_wait_ucode(guc);
> -	if (ret)
> -		goto out;
> +	guc_wait_ucode(guc);
>  
>  	xe_uc_fw_change_status(&guc->fw, XE_UC_FIRMWARE_RUNNING);
>  	return 0;
>  
>  out:
>  	xe_uc_fw_change_status(&guc->fw, XE_UC_FIRMWARE_LOAD_FAIL);
> -	return 0	/* FIXME: ret, don't want to stop load currently */;
> +	return ret;

Lucas, thanks for the review. Just to let you know that I'm removing
this chunk from this patch. Himal had noticed and warned me that
this would change the behavior of other cases that are not touched
or covered by this patch. i.e. if the guc_load fails on guc_xfer_rsa
or xe_uc_fw_upload, we were not aboarting the probe, but now we are.

So, let's remove this change from this patch for now so we can go
ahead with this and then on top we see if we do the wedged on top
of the rest and make this function a void case.

Agree?

>  }
>  
>  /**
> -- 
> 2.44.0
> 


More information about the Intel-xe mailing list