[PATCH v16 3/7] drm/xe/guc: Add capture size check in GuC log buffer
Dong, Zhanjun
zhanjun.dong at intel.com
Tue Aug 27 22:27:04 UTC 2024
On 2024-08-27 6:02 p.m., Teres Alexis, Alan Previn wrote:
> On Mon, 2024-08-19 at 19:11 -0700, Zhanjun Dong wrote:
>> Capture-nodes generated by GuC are placed in the GuC capture ring
>> buffer which is a sub-region of the larger Guc-Log-buffer.
>> Add capture output size check before allocating the shared buffer.
>>
>> Signed-off-by: Zhanjun Dong <zhanjun.dong at intel.com>
>>
> alan:snip
>
>> +static int guc_capture_output_size_est(struct xe_guc *guc)
>> +{
>> + struct xe_gt *gt = guc_to_gt(guc);
>> + struct xe_hw_engine *hwe;
>> + enum xe_hw_engine_id id;
>> +
>> + int capture_size = 0;
>> + size_t tmp = 0;
>> +
>> + if (!guc->capture)
>> + return -ENODEV;
>> +
>> + /*
>> + * If every single engine-instance suffered a failure in quick succession but
>> + * were all unrelated, then a burst of multiple error-capture events would dump
>> + * registers for every one engine instance, one at a time. In this case, GuC
>> + * would even dump the global-registers repeatedly.
>> + *
>> + * For each engine instance, there would be 1 x guc_state_capture_group_t output
>> + * followed by 3 x guc_state_capture_t lists. The latter is how the register
>> + * dumps are split across different register types (where the '3' are global vs class
>> + * vs instance).
>> + */
>> + for_each_hw_engine(hwe, gt, id) {
>> + enum guc_capture_list_class_type capture_class;
>> +
>> + capture_class = xe_engine_class_to_guc_capture_class(hwe->class);
>> + capture_size += sizeof(struct guc_state_capture_group_header_t) +
>> + (3 * sizeof(struct guc_state_capture_header_t));
>> +
>> + if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_GLOBAL,
>> + 0, &tmp, true))
>> + capture_size += tmp;
>> + if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_ENGINE_CLASS,
>> + capture_class, &tmp, true))
>> + capture_size += tmp;
>> + /* Estimate steering register size for rcs/ccs */
>> + if (capture_class == GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE) {
>> + int total = guc_capture_get_steer_reg_num(guc_to_xe(guc)) *
>> + XE_MAX_DSS_FUSE_BITS;
>> +
>> + capture_size += PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
>> + (total * sizeof(struct guc_mmio_reg)));
>> + }
> alan: as per the review-conf-call last week, lets remove any traces of "steering register
> size rules" from outside the guc_capture_getlistsize. As we know there are only 2 conditions
> where we guc_capture_getlistsize can cause different results - one is when its called early
> before pre-hwconfig and again if its called later after post-hwconfig. In both cases,
> it doesn't matter who is making the call or the callstack, it will either need to
> return (at pre-hwconfig) the worst-case scenario where real-dss-masks were not yet available
> and we use max-dss ... or (at post-hwconfig), where we can caclculate using real-dss-masks
> and then get the valid value. That way, we don't need to care about "educating" other functions
> about the differentiation between real-dss-masks being available or not, lets just always
> ensure that the guc_capture_getlistsize always does this implicit decision for us so we dont
> have to replicate such code in multiple place such as xe_guc_capture_ads_input_worst_size.
>
> alan: actually just before hitting send button, i see you've addressed this in v17 (cleaning
> both this code and the xe_guc_capture_ads_input_worst_size to remove this. I will drop further
> review of v16 now and move to v17 or v18 if u wanna address those v16-nits i posted just now
> quickly.
Yes, it was addressed in v17 as we discussed.
There is no v18 planned to be send out today, please start review v17.
>> + if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_ENGINE_INSTANCE,
>> + capture_class, &tmp, true))
>> + capture_size += tmp;
>> + }
>> +
>> + return capture_size;
>> +}
>> +
>> +/*
>> + * Add on a 3x multiplier to allow for multiple back-to-back captures occurring
>> + * before the Xe can read the data out and process it
>> + */
>> +#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
>> +
>> +static void check_guc_capture_size(struct xe_guc *guc)
>> +{
>> + int capture_size = guc_capture_output_size_est(guc);
>> + int spare_size = capture_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
>> + u32 buffer_size = xe_guc_log_section_size_capture(&guc->log);
>> +
>> + /*
>> + * NOTE: capture_size is much smaller than the capture region
>> + * allocation (DG2: <80K vs 1MB).
>> + * Additionally, its based on space needed to fit all engines getting
>> + * reset at once within the same G2H handler task slot. This is very
>> + * unlikely. However, if GuC really does run out of space for whatever
>> + * reason, we will see an separate warning message when processing the
>> + * G2H event capture-notification, search for:
>> + * xe_guc_STATE_CAPTURE_EVENT_STATUS_NOSPACE.
>> + */
>> + if (capture_size < 0)
>> + xe_gt_dbg(guc_to_gt(guc),
>> + "Failed to calculate error state capture buffer minimum size: %d!\n",
>> + capture_size);
>> + if (capture_size > buffer_size)
>> + xe_gt_dbg(guc_to_gt(guc), "Error state capture buffer maybe small: %d < %d\n",
>> + buffer_size, capture_size);
>> + else if (spare_size > buffer_size)
>> + xe_gt_dbg(guc_to_gt(guc),
>> + "Error state capture buffer lacks spare size: %d < %d (min = %d)\n",
>> + buffer_size, spare_size, capture_size);
>> +}
>> +
>> /*
>> * xe_guc_capture_steered_list_init - Init steering register list
>> * @guc: The GuC object
>> @@ -657,11 +745,12 @@ int xe_guc_capture_steered_list_init(struct xe_guc *guc)
>> * the end of the pre-populated render list.
>> */
>> guc_capture_alloc_steered_lists(guc);
>> + check_guc_capture_size(guc);
>>
>> return 0;
>> }
>>
>> -/**
>> +/*
>> * xe_guc_capture_init - Init for GuC register capture
>> * @guc: The GuC object
>> *
>> diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
>> index a37ee3419428..d6b5ac522b6c 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_log.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_log.c
>> @@ -96,3 +96,68 @@ int xe_guc_log_init(struct xe_guc_log *log)
>>
>> return 0;
>> }
>> +
>> +static u32 xe_guc_log_section_size_crash(struct xe_guc_log *log)
>> +{
>> + return CRASH_BUFFER_SIZE;
>> +}
>> +
>> +static u32 xe_guc_log_section_size_debug(struct xe_guc_log *log)
>> +{
>> + return DEBUG_BUFFER_SIZE;
>> +}
>> +
>> +/**
>> + * xe_guc_log_section_size_capture - Get capture buffer size within log sections.
>> + * @log: The log object.
>> + *
>> + * This function will return the capture buffer size within log sections.
>> + *
>> + * Return: capture buffer size.
>> + */
>> +u32 xe_guc_log_section_size_capture(struct xe_guc_log *log)
>> +{
>> + return CAPTURE_BUFFER_SIZE;
>> +}
>> +
>> +/**
>> + * xe_guc_get_log_buffer_size - Get log buffer size for a type.
>> + * @log: The log object.
>> + * @type: The log buffer type
>> + *
>> + * Return: buffer size.
>> + */
>> +u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type)
>> +{
>> + switch (type) {
>> + case GUC_LOG_BUFFER_CRASH_DUMP:
>> + return xe_guc_log_section_size_crash(log);
>> + case GUC_LOG_BUFFER_DEBUG:
>> + return xe_guc_log_section_size_debug(log);
>> + case GUC_LOG_BUFFER_CAPTURE:
>> + return xe_guc_log_section_size_capture(log);
>> + }
>> + return 0;
>> +}
>> +
>> +/**
>> + * xe_guc_get_log_buffer_offset - Get offset in log buffer for a type.
>> + * @log: The log object.
>> + * @type: The log buffer type
>> + *
>> + * This function will return the offset in the log buffer for a type.
>> + * Return: buffer offset.
>> + */
>> +u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type)
>> +{
>> + enum guc_log_buffer_type i;
>> + u32 offset = PAGE_SIZE;/* for the log_buffer_states */
>> +
>> + for (i = GUC_LOG_BUFFER_CRASH_DUMP; i < GUC_LOG_BUFFER_TYPE_MAX; ++i) {
>> + if (i == type)
>> + break;
>> + offset += xe_guc_get_log_buffer_size(log, i);
>> + }
>> +
>> + return offset;
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_guc_log.h b/drivers/gpu/drm/xe/xe_guc_log.h
>> index 2d25ab28b4b3..87ecd1814854 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_log.h
>> +++ b/drivers/gpu/drm/xe/xe_guc_log.h
>> @@ -7,6 +7,7 @@
>> #define _XE_GUC_LOG_H_
>>
>> #include "xe_guc_log_types.h"
>> +#include "abi/guc_log_abi.h"
>>
>> struct drm_printer;
>>
>> @@ -17,7 +18,7 @@ struct drm_printer;
>> #else
>> #define CRASH_BUFFER_SIZE SZ_8K
>> #define DEBUG_BUFFER_SIZE SZ_64K
>> -#define CAPTURE_BUFFER_SIZE SZ_16K
>> +#define CAPTURE_BUFFER_SIZE SZ_1M
>> #endif
>> /*
>> * While we're using plain log level in i915, GuC controls are much more...
>> @@ -45,4 +46,8 @@ xe_guc_log_get_level(struct xe_guc_log *log)
>> return log->level;
>> }
>>
>> +u32 xe_guc_log_section_size_capture(struct xe_guc_log *log);
>> +u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type);
>> +u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type);
>> +
>> #endif
>
More information about the Intel-xe
mailing list