[PATCH v14 4/5] drm/xe/guc: Extract GuC error capture lists
Teres Alexis, Alan Previn
alan.previn.teres.alexis at intel.com
Wed Jul 31 01:09:52 UTC 2024
There are two minor issues, i believe they are trivial.
Other than that everything else looks good to me.
Thus i'm providing a conditional-rb on those two
non-nit's below:
Reviewed-by: Alan Previn <alan.previn.teres.alexis at intel.com>
P.S.: appreciate the work you did diving deep
into these legacy codes and optimizing that wrap-around
extraction.
On Wed, 2024-07-24 at 07:51 -0700, Zhanjun Dong wrote:
> Upon the G2H Notify-Err-Capture event, parse through the
> GuC Log Buffer (error-capture-subregion) and generate one or
> more capture-nodes. A single node represents a single "engine-
> instance-capture-dump" and contains at least 3 register lists:
> global, engine-class and engine-instance. An internal link
> list is maintained to store one or more nodes.
>
> Because the link-list node generation happen before the call
> to devcoredump, duplicate global and engine-class register
> lists for each engine-instance register dump if we find
> dependent-engine resets in a engine-capture-group.
>
>
alan:snip
> +/**
> + * struct guc_log_buffer_state - GuC log buffer state
> + *
> + * Below state structure is used for coordination of retrieval of GuC firmware
> + * logs. Separate state is maintained for each log buffer type.
> + * read_ptr points to the location where Xe read last in log buffer and
> + * is read only for GuC firmware. write_ptr is incremented by GuC with number
> + * of bytes written for each log entry and is read only for Xe.
> + * When any type of log buffer becomes half full, GuC sends a flush interrupt.
> + * GuC firmware expects that while it is writing to 2nd half of the buffer,
> + * first half would get consumed by Host and then get a flush completed
> + * acknowledgment from Host, so that it does not end up doing any overwrite
> + * causing loss of logs. So when buffer gets half filled & Xe has requested
> + * for interrupt, GuC will set flush_to_file field, set the sampled_write_ptr
> + * to the value of write_ptr and raise the interrupt.
> + * On receiving the interrupt Xe should read the buffer, clear flush_to_file
> + * field and also update read_ptr with the value of sample_write_ptr, before
> + * sending an acknowledgment to GuC. marker & version fields are for internal
> + * usage of GuC and opaque to Xe. buffer_full_cnt field is incremented every
> + * time GuC detects the log buffer overflow.
> + */
> +struct guc_log_buffer_state {
> + /** @marker: buffer state start marker */
> + u32 marker[2];
> + /** @read_ptr: the Last Byte Offset Location */
alan: nit: added clarity per spec --> "the last byte offset that was read by KMD previously"
> + u32 read_ptr;
> + /** @write_ptr: the Byte Offset Location */
alan: nit: added clarity per spec --> "the next byte offset location that will be written by GuC".
> + u32 write_ptr;
> + /** @size: Log buffer size */
> + u32 size;
> + /**
> + * @sampled_write_ptr: Log buffer write pointer
> + * This is written by GuC to the byte offset of the next free entry in
> + * the buffer on log buffer half full or state capture notification
> + */
> + u32 sampled_write_ptr;
> + /**
> + * @wrap_offset: wraparound offset
> + * This is the byte offset of location 1 byte after last valid guc log
> + * event entry written by Guc firmware before there was a wraparound.
> + * This field is updated by guc firmware and should be used by Host
> + * when copying buffer contents to file.
> + */
> + u32 wrap_offset;
> + /** @flags: Flush to file flag and buffer full count */
> + u32 flags;
> +#define GUC_LOG_BUFFER_STATE_FLUSH_TO_FILE GENMASK(0, 0)
> +#define GUC_LOG_BUFFER_STATE_BUFFER_FULL_CNT GENMASK(4, 1)
> + /** @version: The Guc-Log-Entry format version */
> + u32 version;
> +} __packed;
> +
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_guc_capture.c b/drivers/gpu/drm/xe/xe_guc_capture.c
> index 72abba8f7bf8..bc400764e80d 100644
> --- a/drivers/gpu/drm/xe/xe_guc_capture.c
> +++ b/drivers/gpu/drm/xe/xe_guc_capture.c
> @@ -10,6 +10,7 @@
>
> #include "abi/guc_actions_abi.h"
> #include "abi/guc_capture_abi.h"
> +#include "abi/guc_log_abi.h"
> #include "regs/xe_engine_regs.h"
> #include "regs/xe_gt_regs.h"
> #include "regs/xe_guc_regs.h"
> @@ -32,6 +33,51 @@
> #include "xe_macros.h"
> #include "xe_map.h"
>
> +/*
> + * struct __guc_capture_bufstate
> + *
> + * Book-keeping structure used to track read and write pointers
> + * as we extract error capture data from the GuC-log-buffer's
> + * error-capture region as a stream of dwords.
> + */
> +struct __guc_capture_bufstate {
> + u32 size;
> + u32 data_offset;
> + u32 rd;
> + u32 wr;
> +};
> +
> +/*
> + * struct __guc_capture_parsed_output - extracted error capture node
> + *
> + * A single unit of extracted error-capture output data grouped together
> + * at an engine-instance level. We keep these nodes in a linked list.
> + * See cachelist and outlist below.
> + */
> +struct __guc_capture_parsed_output {
> + /*
> + * A single set of 3 capture lists: a global-list
> + * an engine-class-list and an engine-instance list.
> + * outlist in __guc_capture_parsed_output will keep
> + * a linked list of these nodes that will eventually
> + * be detached from outlist and attached into to
> + * xe_codedump in response to a context reset
> + */
> + struct list_head link;
> + bool is_partial;
> + u32 eng_class;
> + u32 eng_inst;
> + u32 guc_id;
> + u32 lrca;
> + struct gcap_reg_list_info {
> + u32 vfid;
> + u32 num_regs;
> + struct guc_mmio_reg *regs;
> + } reginfo[GUC_STATE_CAPTURE_TYPE_MAX];
> +#define GCAP_PARSED_REGLIST_INDEX_GLOBAL BIT(GUC_STATE_CAPTURE_TYPE_GLOBAL)
> +#define GCAP_PARSED_REGLIST_INDEX_ENGCLASS BIT(GUC_STATE_CAPTURE_TYPE_ENGINE_CLASS)
> +};
> +
alan: snip
>
> +/*
> + * GuC's error-capture output is a ring buffer populated in a byte-stream fashion:
> + *
> + * The GuC Log buffer region for error-capture is managed like a ring buffer.
> + * The GuC firmware dumps error capture logs into this ring in a byte-stream flow.
> + * Additionally, as per the current and foreseeable future, all packed error-
> + * capture output structures are dword aligned.
> + *
> + * That said, if the GuC firmware is in the midst of writing a structure that is larger
> + * than one dword but the tail end of the err-capture buffer-region has lesser space left,
> + * we would need to extract that structure one dword at a time straddled across the end,
> + * onto the start of the ring.
> + *
> + * Below function, guc_capture_log_remove_bytes is a helper for that. All callers of this
> + * function would typically do a straight-up memcpy from the ring contents and will only
> + * call this helper if their structure-extraction is straddling across the end of the
> + * ring. GuC firmware does not add any padding. The reason for the no-padding is to ease
> + * scalability for future expansion of output data types without requiring a redesign
> + * of the flow controls.
> + */
> +static int
> +guc_capture_log_remove_bytes(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
> + void *out, int bytes_needed)
> +{
> + #define GUC_CAPTURE_LOG_BUF_COPY_RETRY_MAX 3
alan: (conditional-rb) please double check if you are allowed to have a #define indented like this.
> + int fill_size = 0, tries = GUC_CAPTURE_LOG_BUF_COPY_RETRY_MAX;
> + int copy_size, avail;
> +
> + xe_assert(guc_to_xe(guc), bytes_needed % sizeof(u32) == 0);
> +
> + if (bytes_needed > guc_capture_buf_cnt(buf))
> + return -1;
> +
> + while (bytes_needed > 0 && tries--) {
> + int misaligned;
> +
> + avail = guc_capture_buf_cnt_to_end(buf);
> + misaligned = avail % sizeof(u32);
> + /* wrap if at end */
> + if (!avail) {
> + /* output stream clipped */
> + if (!buf->rd)
> + return fill_size;
> + buf->rd = 0;
> + continue;
> + }
> +
> + /* Only copy to u32 aligned data */
> + copy_size = avail < bytes_needed ? avail - misaligned : bytes_needed;
> + xe_map_memcpy_from(guc_to_xe(guc), out + fill_size, &guc->log.bo->vmap,
> + buf->data_offset + buf->rd, copy_size);
> + buf->rd += copy_size;
> + fill_size += copy_size;
> + bytes_needed -= copy_size;
> +
> + if (misaligned)
> + xe_gt_dbg(guc_to_gt(guc),
> + "Bytes extraction not dword aligned, skipping.\n");
alan:nit, based on code flow, debug message should say "clipping", not "skipping".
To me either option (skipping or clipping) are okay as long as the message matches
the code. Also, i personally prefer tihs to be a xe_gt_warn - because this shouldn't
happen and basically means one of the componnets are breaking the rules.
> + }
> +
> + return fill_size;
> +}
> +
> +static int
> +guc_capture_log_get_group_hdr(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
> + struct guc_state_capture_group_header_t *ghdr)
> +{
> + int fullsize = sizeof(struct guc_state_capture_group_header_t);
> +
> + if (guc_capture_log_remove_bytes(guc, buf, ghdr, fullsize) != fullsize)
> + return -1;
> + return 0;
> +}
> +
> +static int
> +guc_capture_log_get_data_hdr(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
> + struct guc_state_capture_header_t *hdr)
> +{
> + int fullsize = sizeof(struct guc_state_capture_header_t);
> +
> + if (guc_capture_log_remove_bytes(guc, buf, hdr, fullsize) != fullsize)
> + return -1;
> + return 0;
> +}
> +
> +static int
> +guc_capture_log_get_register(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
> + struct guc_mmio_reg *reg)
> +{
> + int fullsize = sizeof(struct guc_mmio_reg);
> +
> + if (guc_capture_log_remove_bytes(guc, buf, reg, fullsize) != fullsize)
> + return -1;
> + return 0;
> +}
> +
alan: snip
>
> @@ -1594,6 +1594,9 @@ static int guc_exec_queue_suspend(struct xe_exec_queue *q)
> static int guc_exec_queue_suspend_wait(struct xe_exec_queue *q)
> {
> struct xe_guc *guc = exec_queue_to_guc(q);
> +
> + wait_event(q->guc->suspend_wait, !q->guc->suspend_pending ||
> + xe_guc_read_stopped(guc));
alan: (conditional-rb) this looks weird, i thought the above wait_event code
is already there in baseline and we only changed it from guc_read_stopped
to xe_guc_read_stopped, not adding the entire new wait function. Also, a local
variable declaration below this? ... did checkpatch not warn? Can you double check
if this is an error and fix if necessary? (I'm wondering if you introduced a
trivial rebase line-number-alignment error.
> int ret;
>
alan:snip
More information about the Intel-xe
mailing list