[PATCH v11 3/5] drm/xe/guc: Add capture size check in GuC log buffer

Dong, Zhanjun zhanjun.dong at intel.com
Thu Jul 4 20:27:48 UTC 2024


See my comments inline below.

Regards,
Zhanjun

On 2024-07-02 11:10 p.m., Teres Alexis, Alan Previn wrote:
> On Mon, 2024-06-24 at 14:54 -0700, Zhanjun Dong wrote:
>> The capture-nodes is included in GuC log buffer, add the size check
>> for capture region in the whole GuC log buffer.
>> Add capture output size check before allocating the shared buffer.
>>
>> Signed-off-by: Zhanjun Dong <zhanjun.dong at intel.com>
> alan:snip
>> +/* GuC logging buffer types */
>> +enum guc_log_buffer_type {
>> +       GUC_LOG_BUFFER_CRASH_DUMP,
>> +       GUC_LOG_BUFFER_DEBUG,
>> +       GUC_LOG_BUFFER_CAPTURE,
>> +};
>> +
>> +#define GUC_LOG_BUFFER_TYPE_MAX                3
> alan: nit: not sure why we decided on a separate macro, just putting
> at the end of the enum is better for future code readability /
> maintainability. Just a nit, so u can ignore.
> alna:snip
> 
>> +struct guc_log_buffer_state {
> alan: Btw, you are not even using this structure in this patch
> this structure is part of the extraction patch, right? should
> move this there.
Right, to be moved to extraction patch.

>> +       u32 marker[2];
>> +       u32 read_ptr;
>> +       u32 write_ptr;
>> +       u32 size;
>> +       u32 sampled_write_ptr;
>> +       u32 wrap_offset;
>> +       union {
>> +               struct {
>> +                       u32 flush_to_file:1;
>> +                       u32 buffer_full_cnt:4;
>> +                       u32 reserved:27;
>> +               };
> alan: agree with Michal, please change to GENMASK
Will do
> 
>> +               u32 flags;
>> +       };
>> +       u32 version;
>> +} __packed;
>> +
>> +#endif

...
>> diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
>> index a37ee3419428..0188bc0a2b84 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_log.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_log.c
>> @@ -9,9 +9,22 @@
>>   
>>   #include "xe_bo.h"
>>   #include "xe_gt.h"
>> +#include "xe_gt_printk.h"
>> +#include "xe_guc.h"
>>   #include "xe_map.h"
>>   #include "xe_module.h"
>>   
>> +#define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE      CRASH_BUFFER_SIZE
>> +#define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE      DEBUG_BUFFER_SIZE
>> +#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE    CAPTURE_BUFFER_SIZE
>> +
>> +struct guc_log_section {
>> +       u32 max;
>> +       u32 flag;
>> +       u32 default_val;
>> +       const char *name;
>> +};
>> +
>>   static struct xe_gt *
>>   log_to_gt(struct xe_guc_log *log)
>>   {
>> @@ -96,3 +109,195 @@ int xe_guc_log_init(struct xe_guc_log *log)
>>   
>>          return 0;
>>   }
>> +
>> +static void _guc_log_init_sizes(struct xe_guc_log *log)
>> +{
>> +       struct xe_guc *guc = log_to_guc(log);
>> +       static const struct guc_log_section sections[GUC_LOG_BUFFER_TYPE_MAX] = {
>> +               {
>> +                       GUC_LOG_CRASH_MASK >> GUC_LOG_CRASH_SHIFT,
>> +                       GUC_LOG_LOG_ALLOC_UNITS,
>> +                       GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE,
>> +                       "crash dump"
>> +               },
>> +               {
>> +                       GUC_LOG_DEBUG_MASK >> GUC_LOG_DEBUG_SHIFT,
>> +                       GUC_LOG_LOG_ALLOC_UNITS,
>> +                       GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE,
>> +                       "debug",
>> +               },
>> +               {
>> +                       GUC_LOG_CAPTURE_MASK >> GUC_LOG_CAPTURE_SHIFT,
>> +                       GUC_LOG_CAPTURE_ALLOC_UNITS,
>> +                       GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE,
>> +                       "capture",
>> +               }
>> +       };
>> +       int i;
>> +
>> +       for (i = 0; i < GUC_LOG_BUFFER_TYPE_MAX; i++)
>> +               log->sizes[i].bytes = sections[i].default_val;
>> +
>> +       /* If debug size > 1MB then bump default crash size to keep the same units */
>> +       if (log->sizes[GUC_LOG_BUFFER_DEBUG].bytes >= SZ_1M &&
>> +           GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE < SZ_1M)
>> +               log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].bytes = SZ_1M;
>> +
>> +       /* Prepare the GuC API structure fields: */
>> +       for (i = 0; i < GUC_LOG_BUFFER_TYPE_MAX; i++) {
>> +               /* Convert to correct units */
>> +               if ((log->sizes[i].bytes % SZ_1M) == 0) {
>> +                       log->sizes[i].units = SZ_1M;
>> +                       log->sizes[i].flag = sections[i].flag;
>> +               } else {
>> +                       log->sizes[i].units = SZ_4K;
>> +                       log->sizes[i].flag = 0;
>> +               }
>> +
>> +               xe_gt_assert_msg(log_to_gt(log),
>> +                                IS_ALIGNED(log->sizes[i].bytes, log->sizes[i].units),
>> +                                "Mis-aligned log %s size: 0x%X vs 0x%X!\n",
>> +                                sections[i].name, log->sizes[i].bytes, log->sizes[i].units);
>> +
>> +               log->sizes[i].count = log->sizes[i].bytes / log->sizes[i].units;
>> +
>> +               if (!log->sizes[i].count) {
>> +                       xe_gt_err(guc_to_gt(guc), "Zero log %s size!\n", sections[i].name);
>> +               } else {
>> +                       /* Size is +1 unit */
>> +                       log->sizes[i].count--;
>> +               }
>> +
>> +               /* Clip to field size */
>> +               if (log->sizes[i].count > sections[i].max) {
>> +                       xe_gt_err(guc_to_gt(guc), "log %s size too large: %d vs %d!\n",
>> +                                 sections[i].name, log->sizes[i].count + 1, sections[i].max + 1);
>> +                       log->sizes[i].count = sections[i].max;
>> +               }
>> +       }
>> +
>> +       if (log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].units != log->sizes[GUC_LOG_BUFFER_DEBUG].units)
>> {
>> +               xe_gt_err(guc_to_gt(guc), "Unit mismatch for crash and debug sections: %d vs
>> %d!\n",
>> +                         log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].units,
>> +                         log->sizes[GUC_LOG_BUFFER_DEBUG].units);
>> +               log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].units =
>> +                       log->sizes[GUC_LOG_BUFFER_DEBUG].units;
>> +               log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].count = 0;
>> +       }
>> +
>> +       log->sizes_initialised = true;
>> +}
> alan: something isn't quite right about how you have inserted the guc's
> log->sizes infrastructure into this patch. It took me a little time to
> trace through all the codes in both i915 and xe (since some of these
> concepts came from i915).
> 
> So the issue is this: At the big picture, (system level),
> we init multiple guc-subfunctions/subcontext: guc-log, guc-ads,
> guc-ct, guc-params, etc. After that, we setup other things
> and then we load the guc firmware. NOTE: guc-params handles
> information the offset of guc-log buffer and sizing of the various
> guc-log-buffer-types. This info gets programmed into mmio registers
> before we load the guc so that guc firmware init flows can start
> using the debug or crash buffer for logging.
> 
> However, i notice you have not wired up guc-params init to
> use the above sizes infrastructure. Additionally, this
> wiring also needs happen even earlier during the guc-log
> xe_bo allocation since the allocation size must match
> what is setup in guc-params and must match with what guc-log-debug
> and guc-error-capture expects the output dump ranges to be.
>     note: guc-params init ==> "guc_init_params" and
>                               "guc_ctl_log_params_flags"
>     note: guc log bo allocation ==> "xe_guc_log_init"
> 	
> 
> I'll pause for a moment here and state that both of the xe_guc_log_init
> and guc_init_params relies on the #defines of DEBUG_BUFFER_SIZE,
> CRASH_BUFFER_SIZE and CAPTURE_BUFFER_SIZE) when allocating the bo and
> programming guc-params. That said, since you have defined
> GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE to be DEBUG_BUFFER_SIZE (same for
> CRASH/CAPTURE), you are most probably not facing any kind of mismatch
> between guc bo allocation and guc-params vs guc error capture sizing
> expectation.
> 
> That said, if you look at guc_ctl_log_params_flags, you can
> see all the code there is using build-time checks to ensure the
> various guc sizes settings (size, units, counts) are all good just
> like this sizes framework you have added.
> 
> So now the question becomes, why did i915 even do all this in the first
> place? Why didn't it stick with just re-using CAPTURE_BUFFER_SIZE?
>  From git-blame you'll see that i915 added support for overriding
> the size of the guc log buffer. And thus the updated sizes framework
> will do all of the correction (clipping or rounding up) to get all
> that sorted out - and it would occur for the first time during
> guc log bo allocation.
> 
> So moving forward i see two options:
> 
> 1. drop all of the sizes infrastructure and when check_guc_capture_size
>     needs the size of its region, it can just directly use
>     CAPTURE_BUFFER_SIZE (of course we still do the size increase).
>     Then double check the code in guc_ctl_log_params_flags and ensure
>     everything still makes sense / builds alright for the larger capture
>     size. Then we can focus on driving the param for dynamic calculation/
>     correctoin of the sizes values as a separate JIRA later.
> 
> 2. Plumb the xe_guc_log_init(..), guc_log_size(), guc_init_params(..)
>     and guc_ctl_log_params_flags(..) to ensure its using the same guc
>     log sizes framework above. However, i don't see the motivation for
>     this without pushing for startup-time (module param) configuration
>     of the guc log buffer size and region dimensions.
> 
> 
> I would propose to just go with option #1. I do sincerely apologize
> for not catching this sooner, i notice folks spent time reviewing the
> coding details of this patch in prior revs but i hadn't stepped back
> for the big picture like i did today.

Thanks for highlevel review.
Option 1 is the way I'm doing. The log feature is not a small feature, 
so what I do right now is to take the minimal code base for the register 
capture feature, and that's why the patch defined 
GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE to be DEBUG_BUFFER_SIZE, ...
About the guc_params and other parts, that seems not a small topic and I 
would discuss with you offline.

> 
> 
>> +
>> +static void guc_log_init_sizes(struct xe_guc_log *log)
>> +{
>> +       if (log->sizes_initialised)
>> +               return;
>> +
>> +       _guc_log_init_sizes(log);
>> +}
>> +
>> +static u32 xe_guc_log_section_size_crash(struct xe_guc_log *log)
>> +{
>> +       guc_log_init_sizes(log);
>> +
>> +       return log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].bytes;
>> +}
>> +
>> +static u32 xe_guc_log_section_size_debug(struct xe_guc_log *log)
>> +{
>> +       guc_log_init_sizes(log);
>> +
>> +       return log->sizes[GUC_LOG_BUFFER_DEBUG].bytes;
>> +}
>> +
>> +/**
>> + * xe_guc_log_section_size_capture - Get capture buffer size in log sections.
>> + * @log: The log object.
>> + *
>> + * This function will return the capture buffer size in log sections.
>> + *
>> + * Return: capture buffer size.
>> + */
>> +u32 xe_guc_log_section_size_capture(struct xe_guc_log *log)
>> +{
>> +       guc_log_init_sizes(log);
>> +
>> +       return log->sizes[GUC_LOG_BUFFER_CAPTURE].bytes;
>> +}
>> +
>> +/**
>> + * xe_guc_check_log_buf_overflow - Check if log buffer overflowed
>> + * @log: The log object.
>> + * @type: The log buffer type
>> + * @full_cnt: The count of buffer full
>> + *
>> + * This function will check count of buffer full against previous, mismatch
>> + * indicate overflowed.
>> + * Update the sampled_overflow counter, if the 4 bit counter overflowed, add
>> + * up 16 to correct the value.
>> + *
>> + * Return: True if overflowed.
>> + */
>> +bool xe_guc_check_log_buf_overflow(struct xe_guc_log *log, enum guc_log_buffer_type type,
>> +                                  unsigned int full_cnt)
>> +{
>> +       unsigned int prev_full_cnt = log->stats[type].sampled_overflow;
>> +       bool overflow = false;
>> +
>> +       if (full_cnt != prev_full_cnt) {
>> +               overflow = true;
>> +
>> +               log->stats[type].overflow = full_cnt;
>> +               log->stats[type].sampled_overflow += full_cnt - prev_full_cnt;
>> +
>> +               if (full_cnt < prev_full_cnt) {
>> +                       /* buffer_full_cnt is a 4 bit counter */
>> +                       log->stats[type].sampled_overflow += 16;
>> +               }
>> +               xe_gt_notice(log_to_gt(log), "log buffer overflow\n");
>> +       }
>> +
>> +       return overflow;
>> +}
>> +
>> +/**
>> + * xe_guc_get_log_buffer_size - Get log buffer size for a type.
>> + * @log: The log object.
>> + * @type: The log buffer type
>> + *
>> + * Return: buffer size.
>> + */
>> +u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type)
>> +{
>> +       switch (type) {
>> +       case GUC_LOG_BUFFER_CRASH_DUMP:
>> +               return xe_guc_log_section_size_crash(log);
>> +       case GUC_LOG_BUFFER_DEBUG:
>> +               return xe_guc_log_section_size_debug(log);
>> +       case GUC_LOG_BUFFER_CAPTURE:
>> +               return xe_guc_log_section_size_capture(log);
>> +       }
>> +       return 0;
>> +}
>> +
>> +/**
>> + * xe_guc_get_log_buffer_offset - Get offset in log buffer for a type.
>> + * @log: The log object.
>> + * @type: The log buffer type
>> + *
>> + * This function will return the offset in the log buffer for a type.
>> + * Return: buffer offset.
>> + */
>> +u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type)
>> +{
>> +       enum guc_log_buffer_type i;
>> +       u32 offset = PAGE_SIZE;/* for the log_buffer_states */
>> +
>> +       for (i = GUC_LOG_BUFFER_CRASH_DUMP; i < GUC_LOG_BUFFER_TYPE_MAX; ++i) {
>> +               if (i == type)
>> +                       break;
>> +               offset += xe_guc_get_log_buffer_size(log, i);
>> +       }
>> +
>> +       return offset;
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_guc_log.h b/drivers/gpu/drm/xe/xe_guc_log.h
>> index 2d25ab28b4b3..ea9e79ccd314 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_log.h
>> +++ b/drivers/gpu/drm/xe/xe_guc_log.h
>> @@ -7,6 +7,7 @@
>>   #define _XE_GUC_LOG_H_
>>   
>>   #include "xe_guc_log_types.h"
>> +#include "xe_guc_types.h"
>>   
>>   struct drm_printer;
>>   
>> @@ -17,7 +18,7 @@ struct drm_printer;
>>   #else
>>   #define CRASH_BUFFER_SIZE      SZ_8K
>>   #define DEBUG_BUFFER_SIZE      SZ_64K
>> -#define CAPTURE_BUFFER_SIZE    SZ_16K
>> +#define CAPTURE_BUFFER_SIZE    SZ_1M
>>   #endif
>>   /*
>>    * While we're using plain log level in i915, GuC controls are much more...
>> @@ -36,6 +37,11 @@ struct drm_printer;
>>   #define GUC_VERBOSITY_TO_LOG_LEVEL(x)  ((x) + 2)
>>   #define GUC_LOG_LEVEL_MAX GUC_VERBOSITY_TO_LOG_LEVEL(GUC_LOG_VERBOSITY_MAX)
>>   
>> +static inline struct xe_guc *log_to_guc(struct xe_guc_log *log)
>> +{
>> +       return container_of(log, struct xe_guc, log);
>> +}
>> +
>>   int xe_guc_log_init(struct xe_guc_log *log);
>>   void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p);
>>   
>> @@ -45,4 +51,13 @@ xe_guc_log_get_level(struct xe_guc_log *log)
>>          return log->level;
>>   }
>>   
>> +u32 xe_guc_log_section_size_capture(struct xe_guc_log *log);
>> +
>> +bool xe_guc_check_log_buf_overflow(struct xe_guc_log *log,
>> +                                  enum guc_log_buffer_type type,
>> +                                  unsigned int full_cnt);
>> +
>> +u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type);
>> +u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type);
>> +
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_guc_log_types.h b/drivers/gpu/drm/xe/xe_guc_log_types.h
>> index 125080d138a7..67a9c58e7ed7 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_log_types.h
>> +++ b/drivers/gpu/drm/xe/xe_guc_log_types.h
>> @@ -7,6 +7,7 @@
>>   #define _XE_GUC_LOG_TYPES_H_
>>   
>>   #include <linux/types.h>
>> +#include "abi/guc_log_abi.h"
>>   
>>   struct xe_bo;
>>   
>> @@ -18,6 +19,23 @@ struct xe_guc_log {
>>          u32 level;
>>          /** @bo: XE BO for GuC log */
>>          struct xe_bo *bo;
>> +
>> +       /** @sizes: Allocation settings */
>> +       struct {
>> +               u32 bytes;      /* Size in bytes */
>> +               u32 units;      /* GuC API units - 1MB or 4KB */
>> +               u32 count;      /* Number of API units */
>> +               u32 flag;       /* GuC API units flag */
>> +       } sizes[GUC_LOG_BUFFER_TYPE_MAX];
>> +       /** @sizes_initialised: sizes initialised */
>> +       bool sizes_initialised;
>> +
>> +       /** @stats: logging related stats */
>> +       struct {
>> +               u32 sampled_overflow;
>> +               u32 overflow;
>> +               u32 flush;
>> +       } stats[GUC_LOG_BUFFER_TYPE_MAX];
> alan: nit: my unsolicted $0.02, i believe GUC_LOG_BUFFER_TYPE_MAX should
> be kept in an ABI header because it is part of the GuC interface spec
> that determined things like the number of GuC log regions for both state
> and data.
Agree
> 
>>   };
>>   
>>   #endif
> 


More information about the Intel-xe mailing list