[Intel-gfx] [PATCH v4 1/1] drm/i915/guc: Fix GuC error capture sizing estimation and reporting
John Harrison
john.c.harrison at intel.com
Wed Oct 26 18:27:05 UTC 2022
On 10/25/2022 23:05, Alan Previn wrote:
> During GuC error capture initialization, we estimate the amount of size
> we need for the error-capture-region of the shared GuC-log-buffer.
> This calculation was incorrect so fix that. With the fixed calculation
> we can reduce the allocation of error-capture region from 4MB to 1MB
> (see note2 below for reasoning). Additionally, switch from drm_notice to
> drm_debug for the 3X spare size check since that would be impossible to
> hit without redesigning gpu_coredump framework to hold multiple captures.
>
> NOTE1: Even for 1x the min size estimation case, actually running out
> of space is a corner case because it can only occur if all engine
> instances get reset all at once and i915 isn't able extract the capture
> data fast enough within G2H handler worker.
>
> NOTE2: With the corrected calculation, a DG2 part required ~77K and a PVC
> required ~115K (1X min-est-size that is calculated as one-shot all-engine-
> reset scenario).
>
> Fixes d7c15d76a5547: drm/i915/guc: Check sizing of guc_capture output
>
> Signed-off-by: Alan Previn <alan.previn.teres.alexis at intel.com>
The fixes tag is not correct. You should use 'dim fixes <sha>' to create
it automatically (it also adds a whole bunch of CC tags). I can fix that
when merging, though.
Reviewed-by: John Harrison <John.C.Harrison at Intel.com>
> ---
> .../gpu/drm/i915/gt/uc/intel_guc_capture.c | 29 ++++++++++++-------
> drivers/gpu/drm/i915/gt/uc/intel_guc_log.c | 6 ++--
> 2 files changed, 21 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> index c4bee3bc15a9..4e6dca707d94 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> @@ -559,8 +559,9 @@ guc_capture_getlistsize(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
> if (!num_regs)
> return -ENODATA;
>
> - *size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
> - (num_regs * sizeof(struct guc_mmio_reg)));
> + if (size)
> + *size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
> + (num_regs * sizeof(struct guc_mmio_reg)));
>
> return 0;
> }
> @@ -670,7 +671,7 @@ guc_capture_output_min_size_est(struct intel_guc *guc)
> struct intel_gt *gt = guc_to_gt(guc);
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
> - int worst_min_size = 0, num_regs = 0;
> + int worst_min_size = 0;
> size_t tmp = 0;
>
> if (!guc->capture)
> @@ -692,20 +693,18 @@ guc_capture_output_min_size_est(struct intel_guc *guc)
> (3 * sizeof(struct guc_state_capture_header_t));
>
> if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_GLOBAL, 0, &tmp, true))
> - num_regs += tmp;
> + worst_min_size += tmp;
>
> if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
> engine->class, &tmp, true)) {
> - num_regs += tmp;
> + worst_min_size += tmp;
> }
> if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
> engine->class, &tmp, true)) {
> - num_regs += tmp;
> + worst_min_size += tmp;
> }
> }
>
> - worst_min_size += (num_regs * sizeof(struct guc_mmio_reg));
> -
> return worst_min_size;
> }
>
> @@ -722,15 +721,23 @@ static void check_guc_capture_size(struct intel_guc *guc)
> int spare_size = min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
> u32 buffer_size = intel_guc_log_section_size_capture(&guc->log);
>
> + /*
> + * NOTE: min_size is much smaller than the capture region allocation (DG2: <80K vs 1MB)
> + * Additionally, its based on space needed to fit all engines getting reset at once
> + * within the same G2H handler task slot. This is very unlikely. However, if GuC really
> + * does run out of space for whatever reason, we will see an separate warning message
> + * when processing the G2H event capture-notification, search for:
> + * INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE.
> + */
> if (min_size < 0)
> drm_warn(&i915->drm, "Failed to calculate GuC error state capture buffer minimum size: %d!\n",
> min_size);
> else if (min_size > buffer_size)
> - drm_warn(&i915->drm, "GuC error state capture buffer is too small: %d < %d\n",
> + drm_warn(&i915->drm, "GuC error state capture buffer maybe small: %d < %d\n",
> buffer_size, min_size);
> else if (spare_size > buffer_size)
> - drm_notice(&i915->drm, "GuC error state capture buffer maybe too small: %d < %d (min = %d)\n",
> - buffer_size, spare_size, min_size);
> + drm_dbg(&i915->drm, "GuC error state capture buffer lacks spare size: %d < %d (min = %d)\n",
> + buffer_size, spare_size, min_size);
> }
>
> /*
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
> index 55d3ef93e86f..68331c538b0a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
> @@ -16,15 +16,15 @@
> #if defined(CONFIG_DRM_I915_DEBUG_GUC)
> #define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE SZ_2M
> #define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE SZ_16M
> -#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE SZ_4M
> +#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE SZ_1M
> #elif defined(CONFIG_DRM_I915_DEBUG_GEM)
> #define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE SZ_1M
> #define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE SZ_2M
> -#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE SZ_4M
> +#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE SZ_1M
> #else
> #define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE SZ_8K
> #define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE SZ_64K
> -#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE SZ_2M
> +#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE SZ_1M
> #endif
>
> static void guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log);
More information about the Intel-gfx
mailing list