[PATCH] drm/i915/huc: Allow for very slow HuC loading
Daniele Ceraolo Spurio
daniele.ceraolospurio at intel.com
Thu Jan 4 20:34:57 UTC 2024
On 1/2/2024 2:22 PM, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> A failure to load the HuC is occasionally observed where the cause is
> believed to be a low GT frequency leading to very long load times.
>
> So a) increase the timeout so that the user still gets a working
> system even in the case of slow load. And b) report the frequency
> during the load to see if that is the cause of the slow down.
>
> Also update the similar code on the GuC load to not use uncore->gt
> when there is a local gt available. The two should match, but no need
> for unnecessary de-referencing.
Since the code is almost identical in both places, I'm wondering if it is
worth using a common waiter function and passing in a function pointer
with the waiting logic. The con of that is that we'd have to move to
gt-level logging and pass in a tag, so I'm not sure it is worth it overall
given that it isn't a lot of code. Maybe we should consider it when we
implement this on the Xe driver side?
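Just to illustrate what I mean, something along these lines (rough sketch
only; the helper name, the step-callback type and the exact messages are
made up, and the GuC-specific success/bootrom decoding would still need
handling on top):

/*
 * Hypothetical shared waiter: the callback does one bounded wait attempt
 * (e.g. wait_for(guc_load_done(...), 1000) for GuC, or
 * __intel_wait_for_register(..., 2, 1000, NULL) for HuC), returning 0 on
 * success and filling in *status for the log messages.
 */
typedef int (*uc_wait_step_fn)(void *data, u32 *status);

static int uc_wait_ucode(struct intel_gt *gt, const char *tag,
			 uc_wait_step_fn wait_step, void *data,
			 int retry_limit)
{
	u32 before_freq = intel_rps_read_actual_frequency(&gt->rps);
	ktime_t before = ktime_get();
	u32 status = 0;
	u64 delta_ms;
	int ret, count;

	for (count = 0; count < retry_limit; count++) {
		ret = wait_step(data, &status);
		if (!ret)
			break;

		gt_dbg(gt, "%s load still in progress, count = %d, freq = %dMHz, status = 0x%08X\n",
		       tag, count, intel_rps_read_actual_frequency(&gt->rps), status);
	}

	delta_ms = ktime_to_ms(ktime_sub(ktime_get(), before));

	if (delta_ms > 50) {
		gt_warn(gt, "%s: excessive wait time: %lldms! [status = 0x%08X, count = %d, ret = %d]\n",
			tag, delta_ms, status, count, ret);
		gt_warn(gt, "%s: excessive wait time: [freq = %dMHz, before = %dMHz, perf_limit_reasons = 0x%08X]\n",
			tag, intel_rps_read_actual_frequency(&gt->rps), before_freq,
			intel_uncore_read(gt->uncore, intel_gt_perf_limit_reasons_reg(gt)));
	} else {
		gt_dbg(gt, "%s wait took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
		       tag, delta_ms, intel_rps_read_actual_frequency(&gt->rps),
		       before_freq, status, count, ret);
	}

	return ret;
}

Anyway, just food for thought, not a blocker for this patch.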
>
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> ---
> drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c | 10 ++--
> drivers/gpu/drm/i915/gt/uc/intel_huc.c | 64 ++++++++++++++++++++---
> 2 files changed, 63 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
> index 0f79cb6585182..52332bb143395 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
> @@ -184,7 +184,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
> * in the seconds range. However, there is a limit on how long an
> * individual wait_for() can wait. So wrap it in a loop.
> */
> - before_freq = intel_rps_read_actual_frequency(&uncore->gt->rps);
> + before_freq = intel_rps_read_actual_frequency(&gt->rps);
> before = ktime_get();
> for (count = 0; count < GUC_LOAD_RETRY_LIMIT; count++) {
> ret = wait_for(guc_load_done(uncore, &status, &success), 1000);
> @@ -192,7 +192,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
> break;
>
> guc_dbg(guc, "load still in progress, count = %d, freq = %dMHz, status = 0x%08X [0x%02X/%02X]\n",
> - count, intel_rps_read_actual_frequency(&uncore->gt->rps), status,
> + count, intel_rps_read_actual_frequency(&gt->rps), status,
> REG_FIELD_GET(GS_BOOTROM_MASK, status),
> REG_FIELD_GET(GS_UKERNEL_MASK, status));
> }
> @@ -204,7 +204,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
> u32 bootrom = REG_FIELD_GET(GS_BOOTROM_MASK, status);
>
> guc_info(guc, "load failed: status = 0x%08X, time = %lldms, freq = %dMHz, ret = %d\n",
> - status, delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps), ret);
> + status, delta_ms, intel_rps_read_actual_frequency(&gt->rps), ret);
> guc_info(guc, "load failed: status: Reset = %d, BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
> REG_FIELD_GET(GS_MIA_IN_RESET, status),
> bootrom, ukernel,
> @@ -254,11 +254,11 @@ static int guc_wait_ucode(struct intel_guc *guc)
> guc_warn(guc, "excessive init time: %lldms! [status = 0x%08X, count = %d, ret = %d]\n",
> delta_ms, status, count, ret);
> guc_warn(guc, "excessive init time: [freq = %dMHz, before = %dMHz, perf_limit_reasons = 0x%08X]\n",
> - intel_rps_read_actual_frequency(&uncore->gt->rps), before_freq,
> + intel_rps_read_actual_frequency(&gt->rps), before_freq,
> intel_uncore_read(uncore, intel_gt_perf_limit_reasons_reg(gt)));
> } else {
> guc_dbg(guc, "init took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
> - delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps),
> + delta_ms, intel_rps_read_actual_frequency(&gt->rps),
> before_freq, status, count, ret);
> }
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
> index ba9e07fc2b577..9ccec7de9628a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_huc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
> @@ -6,6 +6,7 @@
> #include <linux/types.h>
>
> #include "gt/intel_gt.h"
> +#include "gt/intel_rps.h"
> #include "intel_guc_reg.h"
> #include "intel_huc.h"
> #include "intel_huc_print.h"
> @@ -447,17 +448,68 @@ static const char *auth_mode_string(struct intel_huc *huc,
> return partial ? "clear media" : "all workloads";
> }
>
> +/*
> + * Use a longer timeout for debug builds so that problems can be detected
> + * and analysed. But a shorter timeout for releases so that users don't
> + * wait forever to find out there is a problem. Note that the only reason
> + * an end user should hit the timeout is in case of extreme thermal throttling.
> + * And a system that is that hot during boot is probably dead anyway!
> + */
> +#if defined(CONFIG_DRM_I915_DEBUG_GEM)
> +#define HUC_LOAD_RETRY_LIMIT 20
> +#else
> +#define HUC_LOAD_RETRY_LIMIT 3
> +#endif
> +
> int intel_huc_wait_for_auth_complete(struct intel_huc *huc,
> enum intel_huc_authentication_type type)
> {
> struct intel_gt *gt = huc_to_gt(huc);
> - int ret;
> + struct intel_uncore *uncore = gt->uncore;
> + ktime_t before, after, delta;
> + int ret, count;
> + u64 delta_ms;
> + u32 before_freq;
>
> - ret = __intel_wait_for_register(gt->uncore,
> - huc->status[type].reg,
> - huc->status[type].mask,
> - huc->status[type].value,
> - 2, 50, NULL);
> + /*
> + * The KMD requests maximum frequency during driver load, however thermal
> + * throttling can force the frequency down to minimum (although the board
> + * really should never get that hot in real life!). IFWI issues have been
> + * seen to cause sporadic failures to grant the higher frequency. And at
> + * minimum frequency, the load time can be in the seconds range. Note that
This is not load time but auth time. With that fixed (fixing it when
applying is fine):
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Daniele
> + * there is a limit on how long an individual wait_for() can wait. So wrap
> + * it in a loop.
> + */
> + before_freq = intel_rps_read_actual_frequency(&gt->rps);
> + before = ktime_get();
> + for (count = 0; count < HUC_LOAD_RETRY_LIMIT; count++) {
> + ret = __intel_wait_for_register(gt->uncore,
> + huc->status[type].reg,
> + huc->status[type].mask,
> + huc->status[type].value,
> + 2, 1000, NULL);
> + if (!ret)
> + break;
> +
> + huc_dbg(huc, "auth still in progress, count = %d, freq = %dMHz, status = 0x%08X\n",
> + count, intel_rps_read_actual_frequency(&gt->rps),
> + huc->status[type].reg.reg);
> + }
> + after = ktime_get();
> + delta = ktime_sub(after, before);
> + delta_ms = ktime_to_ms(delta);
> +
> + if (delta_ms > 50) {
> + huc_warn(huc, "excessive auth time: %lldms! [status = 0x%08X, count = %d, ret = %d]\n",
> + delta_ms, huc->status[type].reg.reg, count, ret);
> + huc_warn(huc, "excessive auth time: [freq = %dMHz, before = %dMHz, perf_limit_reasons = 0x%08X]\n",
> + intel_rps_read_actual_frequency(&gt->rps), before_freq,
> + intel_uncore_read(uncore, intel_gt_perf_limit_reasons_reg(gt)));
> + } else {
> + huc_dbg(huc, "auth took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
> + delta_ms, intel_rps_read_actual_frequency(&gt->rps),
> + before_freq, huc->status[type].reg.reg, count, ret);
> + }
>
> /* mark the load process as complete even if the wait failed */
> delayed_huc_load_complete(huc);