[PATCH v11 3/5] drm/xe/guc: Add capture size check in GuC log buffer

Wed Jul 3 03:10:05 UTC 2024

On Mon, 2024-06-24 at 14:54 -0700, Zhanjun Dong wrote:
> The capture-nodes is included in GuC log buffer, add the size check
> for capture region in the whole GuC log buffer.
> Add capture output size check before allocating the shared buffer.
> 
> Signed-off-by: Zhanjun Dong <zhanjun.dong at intel.com>
alan:snip
> +/* GuC logging buffer types */
> +enum guc_log_buffer_type {
> +       GUC_LOG_BUFFER_CRASH_DUMP,
> +       GUC_LOG_BUFFER_DEBUG,
> +       GUC_LOG_BUFFER_CAPTURE,
> +};
> +
> +#define GUC_LOG_BUFFER_TYPE_MAX                3
alan: nit: not sure why we decided on a separate macro, just putting
at the end of the enum is better for future code readability /
maintainability. Just a nit, so u can ignore.
alna:snip

> +struct guc_log_buffer_state {
alan: Btw, you are not even using this structure in this patch
this structure is part of the extraction patch, right? should
move this there.
> +       u32 marker[2];
> +       u32 read_ptr;
> +       u32 write_ptr;
> +       u32 size;
> +       u32 sampled_write_ptr;
> +       u32 wrap_offset;
> +       union {
> +               struct {
> +                       u32 flush_to_file:1;
> +                       u32 buffer_full_cnt:4;
> +                       u32 reserved:27;
> +               };
alan: agree with Michal, please change to GENMASK

> +               u32 flags;
> +       };
> +       u32 version;
> +} __packed;
> +
> +#endif
> diff --git a/drivers/gpu/drm/xe/xe_guc_capture.c b/drivers/gpu/drm/xe/xe_guc_capture.c
> index 9c473aceb402..c2a08d4a6751 100644
> --- a/drivers/gpu/drm/xe/xe_guc_capture.c
> +++ b/drivers/gpu/drm/xe/xe_guc_capture.c
> @@ -22,6 +22,7 @@
>  #include "xe_gt_mcr.h"
>  #include "xe_gt_printk.h"
>  #include "xe_guc.h"
> +#include "xe_guc_ads.h"
>  #include "xe_guc_capture.h"
>  #include "xe_guc_capture_types.h"
>  #include "xe_guc_ct.h"
> @@ -558,6 +559,85 @@ size_t xe_guc_capture_ads_input_worst_size(struct xe_guc *guc)
>         return PAGE_ALIGN(total_size);
>  }
>  
> +static int
> +guc_capture_output_size_est(struct xe_guc *guc)
> +{
> +       struct xe_gt *gt = guc_to_gt(guc);
> +       struct xe_hw_engine *hwe;
> +       enum xe_hw_engine_id id;
> +
> +       int capture_size = 0;
> +       size_t tmp = 0;
> +
> +       if (!guc->capture)
> +               return -ENODEV;
> +
> +       /*
> +        * If every single engine-instance suffered a failure in quick succession but
> +        * were all unrelated, then a burst of multiple error-capture events would dump
> +        * registers for every one engine instance, one at a time. In this case, GuC
> +        * would even dump the global-registers repeatedly.
> +        *
> +        * For each engine instance, there would be 1 x guc_state_capture_group_t output
> +        * followed by 3 x guc_state_capture_t lists. The latter is how the register
> +        * dumps are split across different register types (where the '3' are global vs class
> +        * vs instance).
> +        */
> +       for_each_hw_engine(hwe, gt, id) {
> +               capture_size += sizeof(struct guc_state_capture_group_header_t) +
> +                                        (3 * sizeof(struct guc_state_capture_header_t));
> +
> +               if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_GLOBAL, 0, &tmp, true))
> +                       capture_size += tmp;
> +
> +               if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
> +                                            hwe->class, &tmp, true)) {
> +                       capture_size += tmp;
> +               }
> +               if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
> +                                            hwe->class, &tmp, true)) {
> +                       capture_size += tmp;
> +               }
> +       }
> +
> +       return capture_size;
> +}
> +
> +/*
> + * Add on a 3x multiplier to allow for multiple back-to-back captures occurring
> + * before the Xe can read the data out and process it
> + */
> +#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
> +
> +static void check_guc_capture_size(struct xe_guc *guc)
> +{
> +       int capture_size = guc_capture_output_size_est(guc);
> +       int spare_size = capture_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
> +       u32 buffer_size = xe_guc_log_section_size_capture(&guc->log);
> +
> +       /*
> +        * NOTE: capture_size is much smaller than the capture region
> +        * allocation (DG2: <80K vs 1MB).
> +        * Additionally, its based on space needed to fit all engines getting
> +        * reset at once within the same G2H handler task slot. This is very
> +        * unlikely. However, if GuC really does run out of space for whatever
> +        * reason, we will see an separate warning message when processing the
> +        * G2H event capture-notification, search for:
> +        * xe_guc_STATE_CAPTURE_EVENT_STATUS_NOSPACE.
> +        */
> +       if (capture_size < 0)
> +               xe_gt_dbg(guc_to_gt(guc),
> +                         "Failed to calculate error state capture buffer minimum size: %d!\n",
> +                         capture_size);
> +       if (capture_size > buffer_size)
> +               xe_gt_dbg(guc_to_gt(guc), "Error state capture buffer maybe small: %d < %d\n",
> +                         buffer_size, capture_size);
> +       else if (spare_size > buffer_size)
> +               xe_gt_dbg(guc_to_gt(guc),
> +                         "Error state capture buffer lacks spare size: %d < %d (min = %d)\n",
> +                         buffer_size, spare_size, capture_size);
> +}
> +
>  /*
>   * xe_guc_capture_init - Init for GuC register capture
>   * @guc: The GuC object
> @@ -575,5 +655,6 @@ int xe_guc_capture_init(struct xe_guc *guc)
>  
>         guc->capture->reglists = guc_capture_get_device_reglist(guc);
>  
> +       check_guc_capture_size(guc);
>         return 0;
>  }
> diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
> index a37ee3419428..0188bc0a2b84 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log.c
> +++ b/drivers/gpu/drm/xe/xe_guc_log.c
> @@ -9,9 +9,22 @@
>  
>  #include "xe_bo.h"
>  #include "xe_gt.h"
> +#include "xe_gt_printk.h"
> +#include "xe_guc.h"
>  #include "xe_map.h"
>  #include "xe_module.h"
>  
> +#define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE      CRASH_BUFFER_SIZE
> +#define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE      DEBUG_BUFFER_SIZE
> +#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE    CAPTURE_BUFFER_SIZE
> +
> +struct guc_log_section {
> +       u32 max;
> +       u32 flag;
> +       u32 default_val;
> +       const char *name;
> +};
> +
>  static struct xe_gt *
>  log_to_gt(struct xe_guc_log *log)
>  {
> @@ -96,3 +109,195 @@ int xe_guc_log_init(struct xe_guc_log *log)
>  
>         return 0;
>  }
> +
> +static void _guc_log_init_sizes(struct xe_guc_log *log)
> +{
> +       struct xe_guc *guc = log_to_guc(log);
> +       static const struct guc_log_section sections[GUC_LOG_BUFFER_TYPE_MAX] = {
> +               {
> +                       GUC_LOG_CRASH_MASK >> GUC_LOG_CRASH_SHIFT,
> +                       GUC_LOG_LOG_ALLOC_UNITS,
> +                       GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE,
> +                       "crash dump"
> +               },
> +               {
> +                       GUC_LOG_DEBUG_MASK >> GUC_LOG_DEBUG_SHIFT,
> +                       GUC_LOG_LOG_ALLOC_UNITS,
> +                       GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE,
> +                       "debug",
> +               },
> +               {
> +                       GUC_LOG_CAPTURE_MASK >> GUC_LOG_CAPTURE_SHIFT,
> +                       GUC_LOG_CAPTURE_ALLOC_UNITS,
> +                       GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE,
> +                       "capture",
> +               }
> +       };
> +       int i;
> +
> +       for (i = 0; i < GUC_LOG_BUFFER_TYPE_MAX; i++)
> +               log->sizes[i].bytes = sections[i].default_val;
> +
> +       /* If debug size > 1MB then bump default crash size to keep the same units */
> +       if (log->sizes[GUC_LOG_BUFFER_DEBUG].bytes >= SZ_1M &&
> +           GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE < SZ_1M)
> +               log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].bytes = SZ_1M;
> +
> +       /* Prepare the GuC API structure fields: */
> +       for (i = 0; i < GUC_LOG_BUFFER_TYPE_MAX; i++) {
> +               /* Convert to correct units */
> +               if ((log->sizes[i].bytes % SZ_1M) == 0) {
> +                       log->sizes[i].units = SZ_1M;
> +                       log->sizes[i].flag = sections[i].flag;
> +               } else {
> +                       log->sizes[i].units = SZ_4K;
> +                       log->sizes[i].flag = 0;
> +               }
> +
> +               xe_gt_assert_msg(log_to_gt(log),
> +                                IS_ALIGNED(log->sizes[i].bytes, log->sizes[i].units),
> +                                "Mis-aligned log %s size: 0x%X vs 0x%X!\n",
> +                                sections[i].name, log->sizes[i].bytes, log->sizes[i].units);
> +
> +               log->sizes[i].count = log->sizes[i].bytes / log->sizes[i].units;
> +
> +               if (!log->sizes[i].count) {
> +                       xe_gt_err(guc_to_gt(guc), "Zero log %s size!\n", sections[i].name);
> +               } else {
> +                       /* Size is +1 unit */
> +                       log->sizes[i].count--;
> +               }
> +
> +               /* Clip to field size */
> +               if (log->sizes[i].count > sections[i].max) {
> +                       xe_gt_err(guc_to_gt(guc), "log %s size too large: %d vs %d!\n",
> +                                 sections[i].name, log->sizes[i].count + 1, sections[i].max + 1);
> +                       log->sizes[i].count = sections[i].max;
> +               }
> +       }
> +
> +       if (log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].units != log->sizes[GUC_LOG_BUFFER_DEBUG].units)
> {
> +               xe_gt_err(guc_to_gt(guc), "Unit mismatch for crash and debug sections: %d vs
> %d!\n",
> +                         log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].units,
> +                         log->sizes[GUC_LOG_BUFFER_DEBUG].units);
> +               log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].units =
> +                       log->sizes[GUC_LOG_BUFFER_DEBUG].units;
> +               log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].count = 0;
> +       }
> +
> +       log->sizes_initialised = true;
> +}
alan: something isn't quite right about how you have inserted the guc's
log->sizes infrastructure into this patch. It took me a little time to
trace through all the codes in both i915 and xe (since some of these
concepts came from i915).

So the issue is this: At the big picture, (system level),
we init multiple guc-subfunctions/subcontext: guc-log, guc-ads,
guc-ct, guc-params, etc. After that, we setup other things
and then we load the guc firmware. NOTE: guc-params handles
information the offset of guc-log buffer and sizing of the various
guc-log-buffer-types. This info gets programmed into mmio registers
before we load the guc so that guc firmware init flows can start
using the debug or crash buffer for logging.

However, i notice you have not wired up guc-params init to
use the above sizes infrastructure. Additionally, this
wiring also needs happen even earlier during the guc-log
xe_bo allocation since the allocation size must match
what is setup in guc-params and must match with what guc-log-debug
and guc-error-capture expects the output dump ranges to be.
   note: guc-params init ==> "guc_init_params" and
                             "guc_ctl_log_params_flags"
   note: guc log bo allocation ==> "xe_guc_log_init"

I'll pause for a moment here and state that both of the xe_guc_log_init
and guc_init_params relies on the #defines of DEBUG_BUFFER_SIZE,
CRASH_BUFFER_SIZE and CAPTURE_BUFFER_SIZE) when allocating the bo and
programming guc-params. That said, since you have defined
GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE to be DEBUG_BUFFER_SIZE (same for
CRASH/CAPTURE), you are most probably not facing any kind of mismatch
between guc bo allocation and guc-params vs guc error capture sizing
expectation.

That said, if you look at guc_ctl_log_params_flags, you can
see all the code there is using build-time checks to ensure the
various guc sizes settings (size, units, counts) are all good just
like this sizes framework you have added.

So now the question becomes, why did i915 even do all this in the first
place? Why didn't it stick with just re-using CAPTURE_BUFFER_SIZE?
From git-blame you'll see that i915 added support for overriding
the size of the guc log buffer. And thus the updated sizes framework
will do all of the correction (clipping or rounding up) to get all
that sorted out - and it would occur for the first time during
guc log bo allocation.

So moving forward i see two options:

1. drop all of the sizes infrastructure and when check_guc_capture_size
   needs the size of its region, it can just directly use
   CAPTURE_BUFFER_SIZE (of course we still do the size increase).
   Then double check the code in guc_ctl_log_params_flags and ensure
   everything still makes sense / builds alright for the larger capture
   size. Then we can focus on driving the param for dynamic calculation/
   correctoin of the sizes values as a separate JIRA later.

2. Plumb the xe_guc_log_init(..), guc_log_size(), guc_init_params(..)
   and guc_ctl_log_params_flags(..) to ensure its using the same guc
   log sizes framework above. However, i don't see the motivation for
   this without pushing for startup-time (module param) configuration
   of the guc log buffer size and region dimensions.

I would propose to just go with option #1. I do sincerely apologize
for not catching this sooner, i notice folks spent time reviewing the
coding details of this patch in prior revs but i hadn't stepped back
for the big picture like i did today.

> +
> +static void guc_log_init_sizes(struct xe_guc_log *log)
> +{
> +       if (log->sizes_initialised)
> +               return;
> +
> +       _guc_log_init_sizes(log);
> +}
> +
> +static u32 xe_guc_log_section_size_crash(struct xe_guc_log *log)
> +{
> +       guc_log_init_sizes(log);
> +
> +       return log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].bytes;
> +}
> +
> +static u32 xe_guc_log_section_size_debug(struct xe_guc_log *log)
> +{
> +       guc_log_init_sizes(log);
> +
> +       return log->sizes[GUC_LOG_BUFFER_DEBUG].bytes;
> +}
> +
> +/**
> + * xe_guc_log_section_size_capture - Get capture buffer size in log sections.
> + * @log: The log object.
> + *
> + * This function will return the capture buffer size in log sections.
> + *
> + * Return: capture buffer size.
> + */
> +u32 xe_guc_log_section_size_capture(struct xe_guc_log *log)
> +{
> +       guc_log_init_sizes(log);
> +
> +       return log->sizes[GUC_LOG_BUFFER_CAPTURE].bytes;
> +}
> +
> +/**
> + * xe_guc_check_log_buf_overflow - Check if log buffer overflowed
> + * @log: The log object.
> + * @type: The log buffer type
> + * @full_cnt: The count of buffer full
> + *
> + * This function will check count of buffer full against previous, mismatch
> + * indicate overflowed.
> + * Update the sampled_overflow counter, if the 4 bit counter overflowed, add
> + * up 16 to correct the value.
> + *
> + * Return: True if overflowed.
> + */
> +bool xe_guc_check_log_buf_overflow(struct xe_guc_log *log, enum guc_log_buffer_type type,
> +                                  unsigned int full_cnt)
> +{
> +       unsigned int prev_full_cnt = log->stats[type].sampled_overflow;
> +       bool overflow = false;
> +
> +       if (full_cnt != prev_full_cnt) {
> +               overflow = true;
> +
> +               log->stats[type].overflow = full_cnt;
> +               log->stats[type].sampled_overflow += full_cnt - prev_full_cnt;
> +
> +               if (full_cnt < prev_full_cnt) {
> +                       /* buffer_full_cnt is a 4 bit counter */
> +                       log->stats[type].sampled_overflow += 16;
> +               }
> +               xe_gt_notice(log_to_gt(log), "log buffer overflow\n");
> +       }
> +
> +       return overflow;
> +}
> +
> +/**
> + * xe_guc_get_log_buffer_size - Get log buffer size for a type.
> + * @log: The log object.
> + * @type: The log buffer type
> + *
> + * Return: buffer size.
> + */
> +u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type)
> +{
> +       switch (type) {
> +       case GUC_LOG_BUFFER_CRASH_DUMP:
> +               return xe_guc_log_section_size_crash(log);
> +       case GUC_LOG_BUFFER_DEBUG:
> +               return xe_guc_log_section_size_debug(log);
> +       case GUC_LOG_BUFFER_CAPTURE:
> +               return xe_guc_log_section_size_capture(log);
> +       }
> +       return 0;
> +}
> +
> +/**
> + * xe_guc_get_log_buffer_offset - Get offset in log buffer for a type.
> + * @log: The log object.
> + * @type: The log buffer type
> + *
> + * This function will return the offset in the log buffer for a type.
> + * Return: buffer offset.
> + */
> +u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type)
> +{
> +       enum guc_log_buffer_type i;
> +       u32 offset = PAGE_SIZE;/* for the log_buffer_states */
> +
> +       for (i = GUC_LOG_BUFFER_CRASH_DUMP; i < GUC_LOG_BUFFER_TYPE_MAX; ++i) {
> +               if (i == type)
> +                       break;
> +               offset += xe_guc_get_log_buffer_size(log, i);
> +       }
> +
> +       return offset;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_guc_log.h b/drivers/gpu/drm/xe/xe_guc_log.h
> index 2d25ab28b4b3..ea9e79ccd314 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log.h
> +++ b/drivers/gpu/drm/xe/xe_guc_log.h
> @@ -7,6 +7,7 @@
>  #define _XE_GUC_LOG_H_
>  
>  #include "xe_guc_log_types.h"
> +#include "xe_guc_types.h"
>  
>  struct drm_printer;
>  
> @@ -17,7 +18,7 @@ struct drm_printer;
>  #else
>  #define CRASH_BUFFER_SIZE      SZ_8K
>  #define DEBUG_BUFFER_SIZE      SZ_64K
> -#define CAPTURE_BUFFER_SIZE    SZ_16K
> +#define CAPTURE_BUFFER_SIZE    SZ_1M
>  #endif
>  /*
>   * While we're using plain log level in i915, GuC controls are much more...
> @@ -36,6 +37,11 @@ struct drm_printer;
>  #define GUC_VERBOSITY_TO_LOG_LEVEL(x)  ((x) + 2)
>  #define GUC_LOG_LEVEL_MAX GUC_VERBOSITY_TO_LOG_LEVEL(GUC_LOG_VERBOSITY_MAX)
>  
> +static inline struct xe_guc *log_to_guc(struct xe_guc_log *log)
> +{
> +       return container_of(log, struct xe_guc, log);
> +}
> +
>  int xe_guc_log_init(struct xe_guc_log *log);
>  void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p);
>  
> @@ -45,4 +51,13 @@ xe_guc_log_get_level(struct xe_guc_log *log)
>         return log->level;
>  }
>  
> +u32 xe_guc_log_section_size_capture(struct xe_guc_log *log);
> +
> +bool xe_guc_check_log_buf_overflow(struct xe_guc_log *log,
> +                                  enum guc_log_buffer_type type,
> +                                  unsigned int full_cnt);
> +
> +u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type);
> +u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type);
> +
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_guc_log_types.h b/drivers/gpu/drm/xe/xe_guc_log_types.h
> index 125080d138a7..67a9c58e7ed7 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log_types.h
> +++ b/drivers/gpu/drm/xe/xe_guc_log_types.h
> @@ -7,6 +7,7 @@
>  #define _XE_GUC_LOG_TYPES_H_
>  
>  #include <linux/types.h>
> +#include "abi/guc_log_abi.h"
>  
>  struct xe_bo;
>  
> @@ -18,6 +19,23 @@ struct xe_guc_log {
>         u32 level;
>         /** @bo: XE BO for GuC log */
>         struct xe_bo *bo;
> +
> +       /** @sizes: Allocation settings */
> +       struct {
> +               u32 bytes;      /* Size in bytes */
> +               u32 units;      /* GuC API units - 1MB or 4KB */
> +               u32 count;      /* Number of API units */
> +               u32 flag;       /* GuC API units flag */
> +       } sizes[GUC_LOG_BUFFER_TYPE_MAX];
> +       /** @sizes_initialised: sizes initialised */
> +       bool sizes_initialised;
> +
> +       /** @stats: logging related stats */
> +       struct {
> +               u32 sampled_overflow;
> +               u32 overflow;
> +               u32 flush;
> +       } stats[GUC_LOG_BUFFER_TYPE_MAX];
alan: nit: my unsolicted $0.02, i believe GUC_LOG_BUFFER_TYPE_MAX should
be kept in an ABI header because it is part of the GuC interface spec
that determined things like the number of GuC log regions for both state
and data.

>  };
>  
>  #endif