[Intel-gfx] [PATCH v2 2/2] drm/i915/guc: Allow for very slow GuC loading

Ceraolo Spurio, Daniele daniele.ceraolospurio at intel.com
Wed Mar 22 21:14:55 UTC 2023



On 3/16/2023 3:06 PM, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> A failure to load the GuC is occasionally observed where the GuC log
> actually showed that the GuC had loaded just fine. The implication
> being that the load just took ever so slightly longer than the 200ms
> timeout. Given that the actual time should be tens of milliseconds at
> the slowest, this should never happen. So far the issue has generally
> been caused by a bad IFWI resulting in low frequencies during boot
> (depsite the KMD requesting max frequency). However, the issue seems
> to happen more often than one would like.
>
> So a) increase the timeout so that the user still gets a working
> system even in the case of slow load. And b) report the frequency
> during the load to see if that is the case of the slow down.
>
> v2: Reduce timeout in non-debug builds, add references (Daniele)
>
> References: https://gitlab.freedesktop.org/drm/intel/-/issues/7931
> References: https://gitlab.freedesktop.org/drm/intel/-/issues/8060
> References: https://gitlab.freedesktop.org/drm/intel/-/issues/8083
> References: https://gitlab.freedesktop.org/drm/intel/-/issues/8136
> References: https://gitlab.freedesktop.org/drm/intel/-/issues/8137
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>

Daniele

> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c | 50 +++++++++++++++++++++--
>   1 file changed, 47 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
> index 0b49d84a8a9c2..6fda3aec5c66a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
> @@ -12,6 +12,7 @@
>   #include "gt/intel_gt.h"
>   #include "gt/intel_gt_mcr.h"
>   #include "gt/intel_gt_regs.h"
> +#include "gt/intel_rps.h"
>   #include "intel_guc_fw.h"
>   #include "intel_guc_print.h"
>   #include "i915_drv.h"
> @@ -135,13 +136,29 @@ static inline bool guc_load_done(struct intel_uncore *uncore, u32 *status, bool
>   	return false;
>   }
>   
> +/*
> + * Use a longer timeout for debug builds so that problems can be detected
> + * and analysed. But a shorter timeout for releases so that user's don't
> + * wait forever to find out there is a problem. Note that the only reason
> + * an end user should hit the timeout is in case of extreme thermal throttling.
> + * And a system that is that hot during boot is probably dead anyway!
> + */
> +#if defined(CONFIG_DRM_I915_DEBUG_GEM)
> +#define GUC_LOAD_RETRY_LIMIT	20
> +#else
> +#define GUC_LOAD_RETRY_LIMIT	3
> +#endif
> +
>   static int guc_wait_ucode(struct intel_guc *guc)
>   {
>   	struct intel_gt *gt = guc_to_gt(guc);
>   	struct intel_uncore *uncore = gt->uncore;
> +	ktime_t before, after, delta;
>   	bool success;
>   	u32 status;
> -	int ret;
> +	int ret, count;
> +	u64 delta_ms;
> +	u32 before_freq;
>   
>   	/*
>   	 * Wait for the GuC to start up.
> @@ -159,13 +176,32 @@ static int guc_wait_ucode(struct intel_guc *guc)
>   	 * issues to be resolved. In the meantime bump the timeout to
>   	 * 200ms. Even at slowest clock, this should be sufficient. And
>   	 * in the working case, a larger timeout makes no difference.
> +	 *
> +	 * IFWI updates have also been seen to cause sporadic failures due to
> +	 * the requested frequency not being granted and thus the firmware
> +	 * load is attempted at minimum frequency. That can lead to load times
> +	 * in the seconds range. However, there is a limit on how long an
> +	 * individual wait_for() can wait. So wrap it in a loop.
>   	 */
> -	ret = wait_for(guc_load_done(uncore, &status, &success), 200);
> +	before_freq = intel_rps_read_actual_frequency(&uncore->gt->rps);
> +	before = ktime_get();
> +	for (count = 0; count < GUC_LOAD_RETRY_LIMIT; count++) {
> +		ret = wait_for(guc_load_done(uncore, &status, &success), 1000);
> +		if (!ret || !success)
> +			break;
> +
> +		guc_dbg(guc, "load still in progress, count = %d, freq = %dMHz\n",
> +			count, intel_rps_read_actual_frequency(&uncore->gt->rps));
> +	}
> +	after = ktime_get();
> +	delta = ktime_sub(after, before);
> +	delta_ms = ktime_to_ms(delta);
>   	if (ret || !success) {
>   		u32 ukernel = REG_FIELD_GET(GS_UKERNEL_MASK, status);
>   		u32 bootrom = REG_FIELD_GET(GS_BOOTROM_MASK, status);
>   
> -		guc_info(guc, "load failed: status = 0x%08X, ret = %d\n", status, ret);
> +		guc_info(guc, "load failed: status = 0x%08X, time = %lldms, freq = %dMHz, ret = %d\n",
> +			 status, delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps), ret);
>   		guc_info(guc, "load failed: status: Reset = %d, BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
>   			 REG_FIELD_GET(GS_MIA_IN_RESET, status),
>   			 bootrom, ukernel,
> @@ -206,6 +242,14 @@ static int guc_wait_ucode(struct intel_guc *guc)
>   		/* Uncommon/unexpected error, see earlier status code print for details */
>   		if (ret == 0)
>   			ret = -ENXIO;
> +	} else if (delta_ms > 200) {
> +		guc_warn(guc, "excessive init time: %lldms! [freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d]\n",
> +			 delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps),
> +			 before_freq, status, count, ret);
> +	} else {
> +		guc_dbg(guc, "init took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
> +			delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps),
> +			before_freq, status, count, ret);
>   	}
>   
>   	return ret;



More information about the Intel-gfx mailing list