[PATCH 2/4] drm/xe: declare wedged upon GuC load failure

Lucas De Marchi lucas.demarchi at intel.com
Tue Apr 16 17:40:32 UTC 2024


On Wed, Apr 10, 2024 at 12:20:18PM GMT, Ghimiray, Himal Prasad wrote:
>
>On 10-04-2024 03:45, Rodrigo Vivi wrote:
>>Let's block the device upon any GuC load failure.
>>But let's continue with the probe so guc logs can be read
>>from the debugfs.
>>
>>v2: - s/wedged/busted
>>     - do not block probe or we lose guc_logs in debugfs (Matt)
>>
>>v3: - s/busted/wedged
>>
>>Cc: Matthew Brost <matthew.brost at intel.com>
>>Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
>>---
>>  drivers/gpu/drm/xe/xe_guc.c | 42 ++++++++++++++++---------------------
>>  1 file changed, 18 insertions(+), 24 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>>index 240e7a4bbff1..f1c3e338301d 100644
>>--- a/drivers/gpu/drm/xe/xe_guc.c
>>+++ b/drivers/gpu/drm/xe/xe_guc.c
>>@@ -451,7 +451,7 @@ static int guc_xfer_rsa(struct xe_guc *guc)
>>  	return 0;
>>  }
>>-static int guc_wait_ucode(struct xe_guc *guc)
>>+static void guc_wait_ucode(struct xe_guc *guc)
>>  {
>>  	struct xe_gt *gt = guc_to_gt(guc);
>>  	u32 status;
>>@@ -479,30 +479,26 @@ static int guc_wait_ucode(struct xe_guc *guc)
>>  			     200000, &status, false);
>>  	if (ret) {
>>-		xe_gt_info(gt, "GuC load failed: status = 0x%08X\n", status);
>>-		xe_gt_info(gt, "GuC status: Reset = %u, BootROM = %#X, UKernel = %#X, MIA = %#X, Auth = %#X\n",
>>-			   REG_FIELD_GET(GS_MIA_IN_RESET, status),
>>-			   REG_FIELD_GET(GS_BOOTROM_MASK, status),
>>-			   REG_FIELD_GET(GS_UKERNEL_MASK, status),
>>-			   REG_FIELD_GET(GS_MIA_MASK, status),
>>-			   REG_FIELD_GET(GS_AUTH_STATUS_MASK, status));
>>-
>>-		if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) {
>>-			xe_gt_info(gt, "GuC firmware signature verification failed\n");
>>-			ret = -ENOEXEC;
>>-		}
>>+		xe_gt_err(gt, "GuC load failed: status = 0x%08X\n", status);
>>+		xe_gt_err(gt, "GuC status: Reset = %u, BootROM = %#X, UKernel = %#X, MIA = %#X, Auth = %#X\n",
>>+			  REG_FIELD_GET(GS_MIA_IN_RESET, status),
>>+			  REG_FIELD_GET(GS_BOOTROM_MASK, status),
>>+			  REG_FIELD_GET(GS_UKERNEL_MASK, status),
>>+			  REG_FIELD_GET(GS_MIA_MASK, status),
>>+			  REG_FIELD_GET(GS_AUTH_STATUS_MASK, status));
>>+
>>+		if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED)
>>+			xe_gt_err(gt, "GuC firmware signature verification failed\n");
>>  		if (REG_FIELD_GET(GS_UKERNEL_MASK, status) ==
>>-		    XE_GUC_LOAD_STATUS_EXCEPTION) {
>>-			xe_gt_info(gt, "GuC firmware exception. EIP: %#x\n",
>>-				   xe_mmio_read32(gt, SOFT_SCRATCH(13)));
>>-			ret = -ENXIO;
>>-		}
>>+		    XE_GUC_LOAD_STATUS_EXCEPTION)
>>+			xe_gt_err(gt, "GuC firmware exception. EIP: %#x\n",
>>+				  xe_mmio_read32(gt, SOFT_SCRATCH(13)));
>>+
>>+		xe_device_declare_wedged(gt_to_xe(gt));
>>  	} else {
>>  		xe_gt_dbg(gt, "GuC successfully loaded\n");
>>  	}
>>-
>>-	return ret;
>>  }
>>  static int __xe_guc_upload(struct xe_guc *guc)
>>@@ -532,16 +528,14 @@ static int __xe_guc_upload(struct xe_guc *guc)
>>  		goto out;
>>  	/* Wait for authentication */
>>-	ret = guc_wait_ucode(guc);
>>-	if (ret)
>>-		goto out;
>>+	guc_wait_ucode(guc);
>>  	xe_uc_fw_change_status(&guc->fw, XE_UC_FIRMWARE_RUNNING);
>>  	return 0;
>>  out:
>>  	xe_uc_fw_change_status(&guc->fw, XE_UC_FIRMWARE_LOAD_FAIL);
>>-	return 0	/* FIXME: ret, don't want to stop load currently */;
>>+	return ret;
>
>Hi Rodrigo,
>
>Can you please confirm whether the concern raised on 
>https://patchwork.freedesktop.org/patch/582990/ is valid or not ?
>
>With current changes error return from guc_xfer_rsa(guc)  or 
>xe_uc_fw_upload will lead to driver probe failure. Probably we don't 
>want that.

yeah... I think this return code change isn't really needed for what
this patch is doing. We could very well keep the return to maintain
the behavior. Or do we also want the wedged state in any failure in
__xe_guc_upload()?

Lucas De Marchi


More information about the Intel-xe mailing list