[PATCH v16 3/7] drm/xe/guc: Add capture size check in GuC log buffer

Tue Aug 27 22:27:04 UTC 2024

On 2024-08-27 6:02 p.m., Teres Alexis, Alan Previn wrote:
> On Mon, 2024-08-19 at 19:11 -0700, Zhanjun Dong wrote:
>> Capture-nodes generated by GuC are placed in the GuC capture ring
>> buffer which is a sub-region of the larger Guc-Log-buffer.
>> Add capture output size check before allocating the shared buffer.
>>
>> Signed-off-by: Zhanjun Dong <zhanjun.dong at intel.com>
>>
> alan:snip
> 
>> +static int guc_capture_output_size_est(struct xe_guc *guc)
>> +{
>> +       struct xe_gt *gt = guc_to_gt(guc);
>> +       struct xe_hw_engine *hwe;
>> +       enum xe_hw_engine_id id;
>> +
>> +       int capture_size = 0;
>> +       size_t tmp = 0;
>> +
>> +       if (!guc->capture)
>> +               return -ENODEV;
>> +
>> +       /*
>> +        * If every single engine-instance suffered a failure in quick succession but
>> +        * were all unrelated, then a burst of multiple error-capture events would dump
>> +        * registers for every one engine instance, one at a time. In this case, GuC
>> +        * would even dump the global-registers repeatedly.
>> +        *
>> +        * For each engine instance, there would be 1 x guc_state_capture_group_t output
>> +        * followed by 3 x guc_state_capture_t lists. The latter is how the register
>> +        * dumps are split across different register types (where the '3' are global vs class
>> +        * vs instance).
>> +        */
>> +       for_each_hw_engine(hwe, gt, id) {
>> +               enum guc_capture_list_class_type capture_class;
>> +
>> +               capture_class = xe_engine_class_to_guc_capture_class(hwe->class);
>> +               capture_size += sizeof(struct guc_state_capture_group_header_t) +
>> +                                        (3 * sizeof(struct guc_state_capture_header_t));
>> +
>> +               if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_GLOBAL,
>> +                                            0, &tmp, true))
>> +                       capture_size += tmp;
>> +               if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_ENGINE_CLASS,
>> +                                            capture_class, &tmp, true))
>> +                       capture_size += tmp;
>> +               /* Estimate steering register size for rcs/ccs */
>> +               if (capture_class == GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE) {
>> +                       int total = guc_capture_get_steer_reg_num(guc_to_xe(guc)) *
>> +                                   XE_MAX_DSS_FUSE_BITS;
>> +
>> +                       capture_size += PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
>> +                                                  (total * sizeof(struct guc_mmio_reg)));
>> +               }
> alan: as per the review-conf-call last week, lets remove any traces of "steering register
> size rules" from outside the guc_capture_getlistsize. As we know there are only 2 conditions
> where we guc_capture_getlistsize can cause different results - one is when its called early
> before pre-hwconfig and again if its called later after post-hwconfig. In both cases,
> it doesn't matter who is making the call or the callstack, it will either need to
> return (at pre-hwconfig) the worst-case scenario where real-dss-masks were not yet available
> and we use max-dss ... or (at post-hwconfig), where we can caclculate using real-dss-masks
> and then get the valid value. That way, we don't need to care about "educating" other functions
> about the differentiation between real-dss-masks being available or not, lets just always
> ensure that the guc_capture_getlistsize always does this implicit decision for us so we dont
> have to replicate such code in multiple place such as xe_guc_capture_ads_input_worst_size.
> 
> alan: actually just before hitting send button, i see you've addressed this in v17 (cleaning
> both this code and the xe_guc_capture_ads_input_worst_size to remove this. I will drop further
> review of v16 now and move to v17 or v18 if u wanna address those v16-nits i posted just now
> quickly.

Yes, it was addressed in v17 as we discussed.
There is no v18 planned to be send out today, please start review v17.

>> +               if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_ENGINE_INSTANCE,
>> +                                            capture_class, &tmp, true))
>> +                       capture_size += tmp;
>> +       }
>> +
>> +       return capture_size;
>> +}
>> +
>> +/*
>> + * Add on a 3x multiplier to allow for multiple back-to-back captures occurring
>> + * before the Xe can read the data out and process it
>> + */
>> +#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
>> +
>> +static void check_guc_capture_size(struct xe_guc *guc)
>> +{
>> +       int capture_size = guc_capture_output_size_est(guc);
>> +       int spare_size = capture_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
>> +       u32 buffer_size = xe_guc_log_section_size_capture(&guc->log);
>> +
>> +       /*
>> +        * NOTE: capture_size is much smaller than the capture region
>> +        * allocation (DG2: <80K vs 1MB).
>> +        * Additionally, its based on space needed to fit all engines getting
>> +        * reset at once within the same G2H handler task slot. This is very
>> +        * unlikely. However, if GuC really does run out of space for whatever
>> +        * reason, we will see an separate warning message when processing the
>> +        * G2H event capture-notification, search for:
>> +        * xe_guc_STATE_CAPTURE_EVENT_STATUS_NOSPACE.
>> +        */
>> +       if (capture_size < 0)
>> +               xe_gt_dbg(guc_to_gt(guc),
>> +                         "Failed to calculate error state capture buffer minimum size: %d!\n",
>> +                         capture_size);
>> +       if (capture_size > buffer_size)
>> +               xe_gt_dbg(guc_to_gt(guc), "Error state capture buffer maybe small: %d < %d\n",
>> +                         buffer_size, capture_size);
>> +       else if (spare_size > buffer_size)
>> +               xe_gt_dbg(guc_to_gt(guc),
>> +                         "Error state capture buffer lacks spare size: %d < %d (min = %d)\n",
>> +                         buffer_size, spare_size, capture_size);
>> +}
>> +
>>   /*
>>    * xe_guc_capture_steered_list_init - Init steering register list
>>    * @guc: The GuC object
>> @@ -657,11 +745,12 @@ int xe_guc_capture_steered_list_init(struct xe_guc *guc)
>>           * the end of the pre-populated render list.
>>           */
>>          guc_capture_alloc_steered_lists(guc);
>> +       check_guc_capture_size(guc);
>>   
>>          return 0;
>>   }
>>   
>> -/**
>> +/*
>>    * xe_guc_capture_init - Init for GuC register capture
>>    * @guc: The GuC object
>>    *
>> diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
>> index a37ee3419428..d6b5ac522b6c 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_log.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_log.c
>> @@ -96,3 +96,68 @@ int xe_guc_log_init(struct xe_guc_log *log)
>>   
>>          return 0;
>>   }
>> +
>> +static u32 xe_guc_log_section_size_crash(struct xe_guc_log *log)
>> +{
>> +       return CRASH_BUFFER_SIZE;
>> +}
>> +
>> +static u32 xe_guc_log_section_size_debug(struct xe_guc_log *log)
>> +{
>> +       return DEBUG_BUFFER_SIZE;
>> +}
>> +
>> +/**
>> + * xe_guc_log_section_size_capture - Get capture buffer size within log sections.
>> + * @log: The log object.
>> + *
>> + * This function will return the capture buffer size within log sections.
>> + *
>> + * Return: capture buffer size.
>> + */
>> +u32 xe_guc_log_section_size_capture(struct xe_guc_log *log)
>> +{
>> +       return CAPTURE_BUFFER_SIZE;
>> +}
>> +
>> +/**
>> + * xe_guc_get_log_buffer_size - Get log buffer size for a type.
>> + * @log: The log object.
>> + * @type: The log buffer type
>> + *
>> + * Return: buffer size.
>> + */
>> +u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type)
>> +{
>> +       switch (type) {
>> +       case GUC_LOG_BUFFER_CRASH_DUMP:
>> +               return xe_guc_log_section_size_crash(log);
>> +       case GUC_LOG_BUFFER_DEBUG:
>> +               return xe_guc_log_section_size_debug(log);
>> +       case GUC_LOG_BUFFER_CAPTURE:
>> +               return xe_guc_log_section_size_capture(log);
>> +       }
>> +       return 0;
>> +}
>> +
>> +/**
>> + * xe_guc_get_log_buffer_offset - Get offset in log buffer for a type.
>> + * @log: The log object.
>> + * @type: The log buffer type
>> + *
>> + * This function will return the offset in the log buffer for a type.
>> + * Return: buffer offset.
>> + */
>> +u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type)
>> +{
>> +       enum guc_log_buffer_type i;
>> +       u32 offset = PAGE_SIZE;/* for the log_buffer_states */
>> +
>> +       for (i = GUC_LOG_BUFFER_CRASH_DUMP; i < GUC_LOG_BUFFER_TYPE_MAX; ++i) {
>> +               if (i == type)
>> +                       break;
>> +               offset += xe_guc_get_log_buffer_size(log, i);
>> +       }
>> +
>> +       return offset;
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_guc_log.h b/drivers/gpu/drm/xe/xe_guc_log.h
>> index 2d25ab28b4b3..87ecd1814854 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_log.h
>> +++ b/drivers/gpu/drm/xe/xe_guc_log.h
>> @@ -7,6 +7,7 @@
>>   #define _XE_GUC_LOG_H_
>>   
>>   #include "xe_guc_log_types.h"
>> +#include "abi/guc_log_abi.h"
>>   
>>   struct drm_printer;
>>   
>> @@ -17,7 +18,7 @@ struct drm_printer;
>>   #else
>>   #define CRASH_BUFFER_SIZE      SZ_8K
>>   #define DEBUG_BUFFER_SIZE      SZ_64K
>> -#define CAPTURE_BUFFER_SIZE    SZ_16K
>> +#define CAPTURE_BUFFER_SIZE    SZ_1M
>>   #endif
>>   /*
>>    * While we're using plain log level in i915, GuC controls are much more...
>> @@ -45,4 +46,8 @@ xe_guc_log_get_level(struct xe_guc_log *log)
>>          return log->level;
>>   }
>>   
>> +u32 xe_guc_log_section_size_capture(struct xe_guc_log *log);
>> +u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type);
>> +u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type);
>> +
>>   #endif
>