[PATCH v11 3/5] drm/xe/guc: Add capture size check in GuC log buffer
Dong, Zhanjun
zhanjun.dong at intel.com
Thu Jul 4 20:27:48 UTC 2024
See my comments inline below.
Regards,
Zhanjun
On 2024-07-02 11:10 p.m., Teres Alexis, Alan Previn wrote:
> On Mon, 2024-06-24 at 14:54 -0700, Zhanjun Dong wrote:
>> The capture-nodes is included in GuC log buffer, add the size check
>> for capture region in the whole GuC log buffer.
>> Add capture output size check before allocating the shared buffer.
>>
>> Signed-off-by: Zhanjun Dong <zhanjun.dong at intel.com>
> alan:snip
>> +/* GuC logging buffer types */
>> +enum guc_log_buffer_type {
>> + GUC_LOG_BUFFER_CRASH_DUMP,
>> + GUC_LOG_BUFFER_DEBUG,
>> + GUC_LOG_BUFFER_CAPTURE,
>> +};
>> +
>> +#define GUC_LOG_BUFFER_TYPE_MAX 3
> alan: nit: not sure why we decided on a separate macro, just putting
> at the end of the enum is better for future code readability /
> maintainability. Just a nit, so u can ignore.
> alna:snip
>
>> +struct guc_log_buffer_state {
> alan: Btw, you are not even using this structure in this patch
> this structure is part of the extraction patch, right? should
> move this there.
Right, to be moved to extraction patch.
>> + u32 marker[2];
>> + u32 read_ptr;
>> + u32 write_ptr;
>> + u32 size;
>> + u32 sampled_write_ptr;
>> + u32 wrap_offset;
>> + union {
>> + struct {
>> + u32 flush_to_file:1;
>> + u32 buffer_full_cnt:4;
>> + u32 reserved:27;
>> + };
> alan: agree with Michal, please change to GENMASK
Will do
>
>> + u32 flags;
>> + };
>> + u32 version;
>> +} __packed;
>> +
>> +#endif
...
>> diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
>> index a37ee3419428..0188bc0a2b84 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_log.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_log.c
>> @@ -9,9 +9,22 @@
>>
>> #include "xe_bo.h"
>> #include "xe_gt.h"
>> +#include "xe_gt_printk.h"
>> +#include "xe_guc.h"
>> #include "xe_map.h"
>> #include "xe_module.h"
>>
>> +#define GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE CRASH_BUFFER_SIZE
>> +#define GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE DEBUG_BUFFER_SIZE
>> +#define GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE CAPTURE_BUFFER_SIZE
>> +
>> +struct guc_log_section {
>> + u32 max;
>> + u32 flag;
>> + u32 default_val;
>> + const char *name;
>> +};
>> +
>> static struct xe_gt *
>> log_to_gt(struct xe_guc_log *log)
>> {
>> @@ -96,3 +109,195 @@ int xe_guc_log_init(struct xe_guc_log *log)
>>
>> return 0;
>> }
>> +
>> +static void _guc_log_init_sizes(struct xe_guc_log *log)
>> +{
>> + struct xe_guc *guc = log_to_guc(log);
>> + static const struct guc_log_section sections[GUC_LOG_BUFFER_TYPE_MAX] = {
>> + {
>> + GUC_LOG_CRASH_MASK >> GUC_LOG_CRASH_SHIFT,
>> + GUC_LOG_LOG_ALLOC_UNITS,
>> + GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE,
>> + "crash dump"
>> + },
>> + {
>> + GUC_LOG_DEBUG_MASK >> GUC_LOG_DEBUG_SHIFT,
>> + GUC_LOG_LOG_ALLOC_UNITS,
>> + GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE,
>> + "debug",
>> + },
>> + {
>> + GUC_LOG_CAPTURE_MASK >> GUC_LOG_CAPTURE_SHIFT,
>> + GUC_LOG_CAPTURE_ALLOC_UNITS,
>> + GUC_LOG_DEFAULT_CAPTURE_BUFFER_SIZE,
>> + "capture",
>> + }
>> + };
>> + int i;
>> +
>> + for (i = 0; i < GUC_LOG_BUFFER_TYPE_MAX; i++)
>> + log->sizes[i].bytes = sections[i].default_val;
>> +
>> + /* If debug size > 1MB then bump default crash size to keep the same units */
>> + if (log->sizes[GUC_LOG_BUFFER_DEBUG].bytes >= SZ_1M &&
>> + GUC_LOG_DEFAULT_CRASH_BUFFER_SIZE < SZ_1M)
>> + log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].bytes = SZ_1M;
>> +
>> + /* Prepare the GuC API structure fields: */
>> + for (i = 0; i < GUC_LOG_BUFFER_TYPE_MAX; i++) {
>> + /* Convert to correct units */
>> + if ((log->sizes[i].bytes % SZ_1M) == 0) {
>> + log->sizes[i].units = SZ_1M;
>> + log->sizes[i].flag = sections[i].flag;
>> + } else {
>> + log->sizes[i].units = SZ_4K;
>> + log->sizes[i].flag = 0;
>> + }
>> +
>> + xe_gt_assert_msg(log_to_gt(log),
>> + IS_ALIGNED(log->sizes[i].bytes, log->sizes[i].units),
>> + "Mis-aligned log %s size: 0x%X vs 0x%X!\n",
>> + sections[i].name, log->sizes[i].bytes, log->sizes[i].units);
>> +
>> + log->sizes[i].count = log->sizes[i].bytes / log->sizes[i].units;
>> +
>> + if (!log->sizes[i].count) {
>> + xe_gt_err(guc_to_gt(guc), "Zero log %s size!\n", sections[i].name);
>> + } else {
>> + /* Size is +1 unit */
>> + log->sizes[i].count--;
>> + }
>> +
>> + /* Clip to field size */
>> + if (log->sizes[i].count > sections[i].max) {
>> + xe_gt_err(guc_to_gt(guc), "log %s size too large: %d vs %d!\n",
>> + sections[i].name, log->sizes[i].count + 1, sections[i].max + 1);
>> + log->sizes[i].count = sections[i].max;
>> + }
>> + }
>> +
>> + if (log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].units != log->sizes[GUC_LOG_BUFFER_DEBUG].units)
>> {
>> + xe_gt_err(guc_to_gt(guc), "Unit mismatch for crash and debug sections: %d vs
>> %d!\n",
>> + log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].units,
>> + log->sizes[GUC_LOG_BUFFER_DEBUG].units);
>> + log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].units =
>> + log->sizes[GUC_LOG_BUFFER_DEBUG].units;
>> + log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].count = 0;
>> + }
>> +
>> + log->sizes_initialised = true;
>> +}
> alan: something isn't quite right about how you have inserted the guc's
> log->sizes infrastructure into this patch. It took me a little time to
> trace through all the codes in both i915 and xe (since some of these
> concepts came from i915).
>
> So the issue is this: At the big picture, (system level),
> we init multiple guc-subfunctions/subcontext: guc-log, guc-ads,
> guc-ct, guc-params, etc. After that, we setup other things
> and then we load the guc firmware. NOTE: guc-params handles
> information the offset of guc-log buffer and sizing of the various
> guc-log-buffer-types. This info gets programmed into mmio registers
> before we load the guc so that guc firmware init flows can start
> using the debug or crash buffer for logging.
>
> However, i notice you have not wired up guc-params init to
> use the above sizes infrastructure. Additionally, this
> wiring also needs happen even earlier during the guc-log
> xe_bo allocation since the allocation size must match
> what is setup in guc-params and must match with what guc-log-debug
> and guc-error-capture expects the output dump ranges to be.
> note: guc-params init ==> "guc_init_params" and
> "guc_ctl_log_params_flags"
> note: guc log bo allocation ==> "xe_guc_log_init"
>
>
> I'll pause for a moment here and state that both of the xe_guc_log_init
> and guc_init_params relies on the #defines of DEBUG_BUFFER_SIZE,
> CRASH_BUFFER_SIZE and CAPTURE_BUFFER_SIZE) when allocating the bo and
> programming guc-params. That said, since you have defined
> GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE to be DEBUG_BUFFER_SIZE (same for
> CRASH/CAPTURE), you are most probably not facing any kind of mismatch
> between guc bo allocation and guc-params vs guc error capture sizing
> expectation.
>
> That said, if you look at guc_ctl_log_params_flags, you can
> see all the code there is using build-time checks to ensure the
> various guc sizes settings (size, units, counts) are all good just
> like this sizes framework you have added.
>
> So now the question becomes, why did i915 even do all this in the first
> place? Why didn't it stick with just re-using CAPTURE_BUFFER_SIZE?
> From git-blame you'll see that i915 added support for overriding
> the size of the guc log buffer. And thus the updated sizes framework
> will do all of the correction (clipping or rounding up) to get all
> that sorted out - and it would occur for the first time during
> guc log bo allocation.
>
> So moving forward i see two options:
>
> 1. drop all of the sizes infrastructure and when check_guc_capture_size
> needs the size of its region, it can just directly use
> CAPTURE_BUFFER_SIZE (of course we still do the size increase).
> Then double check the code in guc_ctl_log_params_flags and ensure
> everything still makes sense / builds alright for the larger capture
> size. Then we can focus on driving the param for dynamic calculation/
> correctoin of the sizes values as a separate JIRA later.
>
> 2. Plumb the xe_guc_log_init(..), guc_log_size(), guc_init_params(..)
> and guc_ctl_log_params_flags(..) to ensure its using the same guc
> log sizes framework above. However, i don't see the motivation for
> this without pushing for startup-time (module param) configuration
> of the guc log buffer size and region dimensions.
>
>
> I would propose to just go with option #1. I do sincerely apologize
> for not catching this sooner, i notice folks spent time reviewing the
> coding details of this patch in prior revs but i hadn't stepped back
> for the big picture like i did today.
Thanks for highlevel review.
Option 1 is the way I'm doing. The log feature is not a small feature,
so what I do right now is to take the minimal code base for the register
capture feature, and that's why the patch defined
GUC_LOG_DEFAULT_DEBUG_BUFFER_SIZE to be DEBUG_BUFFER_SIZE, ...
About the guc_params and other parts, that seems not a small topic and I
would discuss with you offline.
>
>
>> +
>> +static void guc_log_init_sizes(struct xe_guc_log *log)
>> +{
>> + if (log->sizes_initialised)
>> + return;
>> +
>> + _guc_log_init_sizes(log);
>> +}
>> +
>> +static u32 xe_guc_log_section_size_crash(struct xe_guc_log *log)
>> +{
>> + guc_log_init_sizes(log);
>> +
>> + return log->sizes[GUC_LOG_BUFFER_CRASH_DUMP].bytes;
>> +}
>> +
>> +static u32 xe_guc_log_section_size_debug(struct xe_guc_log *log)
>> +{
>> + guc_log_init_sizes(log);
>> +
>> + return log->sizes[GUC_LOG_BUFFER_DEBUG].bytes;
>> +}
>> +
>> +/**
>> + * xe_guc_log_section_size_capture - Get capture buffer size in log sections.
>> + * @log: The log object.
>> + *
>> + * This function will return the capture buffer size in log sections.
>> + *
>> + * Return: capture buffer size.
>> + */
>> +u32 xe_guc_log_section_size_capture(struct xe_guc_log *log)
>> +{
>> + guc_log_init_sizes(log);
>> +
>> + return log->sizes[GUC_LOG_BUFFER_CAPTURE].bytes;
>> +}
>> +
>> +/**
>> + * xe_guc_check_log_buf_overflow - Check if log buffer overflowed
>> + * @log: The log object.
>> + * @type: The log buffer type
>> + * @full_cnt: The count of buffer full
>> + *
>> + * This function will check count of buffer full against previous, mismatch
>> + * indicate overflowed.
>> + * Update the sampled_overflow counter, if the 4 bit counter overflowed, add
>> + * up 16 to correct the value.
>> + *
>> + * Return: True if overflowed.
>> + */
>> +bool xe_guc_check_log_buf_overflow(struct xe_guc_log *log, enum guc_log_buffer_type type,
>> + unsigned int full_cnt)
>> +{
>> + unsigned int prev_full_cnt = log->stats[type].sampled_overflow;
>> + bool overflow = false;
>> +
>> + if (full_cnt != prev_full_cnt) {
>> + overflow = true;
>> +
>> + log->stats[type].overflow = full_cnt;
>> + log->stats[type].sampled_overflow += full_cnt - prev_full_cnt;
>> +
>> + if (full_cnt < prev_full_cnt) {
>> + /* buffer_full_cnt is a 4 bit counter */
>> + log->stats[type].sampled_overflow += 16;
>> + }
>> + xe_gt_notice(log_to_gt(log), "log buffer overflow\n");
>> + }
>> +
>> + return overflow;
>> +}
>> +
>> +/**
>> + * xe_guc_get_log_buffer_size - Get log buffer size for a type.
>> + * @log: The log object.
>> + * @type: The log buffer type
>> + *
>> + * Return: buffer size.
>> + */
>> +u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type)
>> +{
>> + switch (type) {
>> + case GUC_LOG_BUFFER_CRASH_DUMP:
>> + return xe_guc_log_section_size_crash(log);
>> + case GUC_LOG_BUFFER_DEBUG:
>> + return xe_guc_log_section_size_debug(log);
>> + case GUC_LOG_BUFFER_CAPTURE:
>> + return xe_guc_log_section_size_capture(log);
>> + }
>> + return 0;
>> +}
>> +
>> +/**
>> + * xe_guc_get_log_buffer_offset - Get offset in log buffer for a type.
>> + * @log: The log object.
>> + * @type: The log buffer type
>> + *
>> + * This function will return the offset in the log buffer for a type.
>> + * Return: buffer offset.
>> + */
>> +u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type)
>> +{
>> + enum guc_log_buffer_type i;
>> + u32 offset = PAGE_SIZE;/* for the log_buffer_states */
>> +
>> + for (i = GUC_LOG_BUFFER_CRASH_DUMP; i < GUC_LOG_BUFFER_TYPE_MAX; ++i) {
>> + if (i == type)
>> + break;
>> + offset += xe_guc_get_log_buffer_size(log, i);
>> + }
>> +
>> + return offset;
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_guc_log.h b/drivers/gpu/drm/xe/xe_guc_log.h
>> index 2d25ab28b4b3..ea9e79ccd314 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_log.h
>> +++ b/drivers/gpu/drm/xe/xe_guc_log.h
>> @@ -7,6 +7,7 @@
>> #define _XE_GUC_LOG_H_
>>
>> #include "xe_guc_log_types.h"
>> +#include "xe_guc_types.h"
>>
>> struct drm_printer;
>>
>> @@ -17,7 +18,7 @@ struct drm_printer;
>> #else
>> #define CRASH_BUFFER_SIZE SZ_8K
>> #define DEBUG_BUFFER_SIZE SZ_64K
>> -#define CAPTURE_BUFFER_SIZE SZ_16K
>> +#define CAPTURE_BUFFER_SIZE SZ_1M
>> #endif
>> /*
>> * While we're using plain log level in i915, GuC controls are much more...
>> @@ -36,6 +37,11 @@ struct drm_printer;
>> #define GUC_VERBOSITY_TO_LOG_LEVEL(x) ((x) + 2)
>> #define GUC_LOG_LEVEL_MAX GUC_VERBOSITY_TO_LOG_LEVEL(GUC_LOG_VERBOSITY_MAX)
>>
>> +static inline struct xe_guc *log_to_guc(struct xe_guc_log *log)
>> +{
>> + return container_of(log, struct xe_guc, log);
>> +}
>> +
>> int xe_guc_log_init(struct xe_guc_log *log);
>> void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p);
>>
>> @@ -45,4 +51,13 @@ xe_guc_log_get_level(struct xe_guc_log *log)
>> return log->level;
>> }
>>
>> +u32 xe_guc_log_section_size_capture(struct xe_guc_log *log);
>> +
>> +bool xe_guc_check_log_buf_overflow(struct xe_guc_log *log,
>> + enum guc_log_buffer_type type,
>> + unsigned int full_cnt);
>> +
>> +u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type);
>> +u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type);
>> +
>> #endif
>> diff --git a/drivers/gpu/drm/xe/xe_guc_log_types.h b/drivers/gpu/drm/xe/xe_guc_log_types.h
>> index 125080d138a7..67a9c58e7ed7 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_log_types.h
>> +++ b/drivers/gpu/drm/xe/xe_guc_log_types.h
>> @@ -7,6 +7,7 @@
>> #define _XE_GUC_LOG_TYPES_H_
>>
>> #include <linux/types.h>
>> +#include "abi/guc_log_abi.h"
>>
>> struct xe_bo;
>>
>> @@ -18,6 +19,23 @@ struct xe_guc_log {
>> u32 level;
>> /** @bo: XE BO for GuC log */
>> struct xe_bo *bo;
>> +
>> + /** @sizes: Allocation settings */
>> + struct {
>> + u32 bytes; /* Size in bytes */
>> + u32 units; /* GuC API units - 1MB or 4KB */
>> + u32 count; /* Number of API units */
>> + u32 flag; /* GuC API units flag */
>> + } sizes[GUC_LOG_BUFFER_TYPE_MAX];
>> + /** @sizes_initialised: sizes initialised */
>> + bool sizes_initialised;
>> +
>> + /** @stats: logging related stats */
>> + struct {
>> + u32 sampled_overflow;
>> + u32 overflow;
>> + u32 flush;
>> + } stats[GUC_LOG_BUFFER_TYPE_MAX];
> alan: nit: my unsolicted $0.02, i believe GUC_LOG_BUFFER_TYPE_MAX should
> be kept in an ABI header because it is part of the GuC interface spec
> that determined things like the number of GuC log regions for both state
> and data.
Agree
>
>> };
>>
>> #endif
>
More information about the Intel-xe
mailing list