[Intel-gfx] [PATCH 1/1] drm/i915/guc: Fix GuC error capture sizing estimation and reporting

John Harrison john.c.harrison at intel.com
Fri Sep 30 17:48:33 UTC 2022


On 9/29/2022 18:05, Alan Previn wrote:
> During GuC error capture initialization, we estimate the amount of size
> we need for the error-capture-region of the shared GuC-log-buffer.
> This calculation was incorrect so fix that. Additionally, if the size
> was insufficient, don't drm_warn or drm_notice, just drm_debug since
> actually running out based on that estimation is a corner case. It
> can only occur if all engine instances get reset all at once and i915
> isn't able extract the capture data fast enough within G2H handler
> worker.
>
> Fixes d7c15d76a5547: drm/i915/guc: Check sizing of guc_capture output
>
> Signed-off-by: Alan Previn <alan.previn.teres.alexis at intel.com>
> ---
>   .../gpu/drm/i915/gt/uc/intel_guc_capture.c    | 31 ++++++++++++-------
>   1 file changed, 19 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> index 8f1165146013..cb62507c91ce 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> @@ -502,8 +502,9 @@ intel_guc_capture_getlistsize(struct intel_guc *guc, u32 owner, u32 type, u32 cl
>   	if (!num_regs)
>   		return -ENODATA;
>   
> -	*size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
> -			   (num_regs * sizeof(struct guc_mmio_reg)));
> +	if (size)
> +		*size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
> +				   (num_regs * sizeof(struct guc_mmio_reg)));
>   
>   	return 0;
>   }
> @@ -606,7 +607,7 @@ guc_capture_output_min_size_est(struct intel_guc *guc)
>   	struct intel_gt *gt = guc_to_gt(guc);
>   	struct intel_engine_cs *engine;
>   	enum intel_engine_id id;
> -	int worst_min_size = 0, num_regs = 0;
> +	int worst_min_size = 0;
>   	size_t tmp = 0;
>   
>   	if (!guc->capture)
> @@ -628,20 +629,18 @@ guc_capture_output_min_size_est(struct intel_guc *guc)
>   					 (3 * sizeof(struct guc_state_capture_header_t));
>   
>   		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_GLOBAL, 0, &tmp))
> -			num_regs += tmp;
> +			worst_min_size += tmp;
>   
>   		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
>   						   engine->class, &tmp)) {
> -			num_regs += tmp;
> +			worst_min_size += tmp;
>   		}
>   		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
>   						   engine->class, &tmp)) {
> -			num_regs += tmp;
> +			worst_min_size += tmp;
>   		}
>   	}
>   
> -	worst_min_size += (num_regs * sizeof(struct guc_mmio_reg));
> -
>   	return worst_min_size;
>   }
>   
> @@ -658,15 +657,23 @@ static void check_guc_capture_size(struct intel_guc *guc)
>   	int spare_size = min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
>   	u32 buffer_size = intel_guc_log_section_size_capture(&guc->log);
>   
> +	/*
> +	 * Don't drm_warn or drm_error here on "possible" insufficient size because we only run out
> +	 * of space if all engines were to suffer resets all at once AND the driver is not able to
> +	 * extract that data fast enough in the interrupt handler worker (moving them to the
> +	 * larger pool of pre-allocated capture nodes. If GuC does run out of space, we will
> +	 * print an error when processing the G2H event capture-notification (search for
> +	 * "INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE").
> +	 */
>   	if (min_size < 0)
>   		drm_warn(&i915->drm, "Failed to calculate GuC error state capture buffer minimum size: %d!\n",
>   			 min_size);
>   	else if (min_size > buffer_size)
> -		drm_warn(&i915->drm, "GuC error state capture buffer is too small: %d < %d\n",
> -			 buffer_size, min_size);
> +		drm_dbg(&i915->drm, "GuC error state capture buffer maybe small: %d < %d\n",
> +			buffer_size, min_size);
Isn't min_size the bare minimum to get a valid capture? Surely this 
still needs to be a warning not a debug. If we can't manage a basic 
working error capture then there is a problem. This needs to be caught 
by CI and logged as a bug if it is ever hit. And that means an end user 
should never see it fire because we won't let a driver out the door 
unless the default buffer size is sufficient.

John.

>   	else if (spare_size > buffer_size)
> -		drm_notice(&i915->drm, "GuC error state capture buffer maybe too small: %d < %d (min = %d)\n",
> -			   buffer_size, spare_size, min_size);
> +		drm_dbg(&i915->drm, "GuC error state capture buffer lacks spare size: %d < %d (min = %d)\n",
> +			buffer_size, spare_size, min_size);
>   }
>   
>   /*



More information about the Intel-gfx mailing list