[PATCH v14 4/5] drm/xe/guc: Extract GuC error capture lists

Wed Jul 31 15:01:42 UTC 2024

See my comments inline below.

Regards,
Zhanjun Dong

On 2024-07-30 9:09 p.m., Teres Alexis, Alan Previn wrote:
> There are two minor issues, i believe they are trivial.
> Other than that everything else looks good to me.
> Thus i'm providing a conditional-rb on those two
> non-nit's below:
> 
> 
> Reviewed-by: Alan Previn <alan.previn.teres.alexis at intel.com>
> 
> P.S.: appreciate the work you did diving deep
> into these legacy codes and optimizing that wrap-around
> extraction.

Thanks Alan.
> 
> 
> On Wed, 2024-07-24 at 07:51 -0700, Zhanjun Dong wrote:
>> Upon the G2H Notify-Err-Capture event, parse through the
>> GuC Log Buffer (error-capture-subregion) and generate one or
>> more capture-nodes. A single node represents a single "engine-
>> instance-capture-dump" and contains at least 3 register lists:
>> global, engine-class and engine-instance. An internal link
>> list is maintained to store one or more nodes.
>>
>> Because the link-list node generation happen before the call
>> to devcoredump, duplicate global and engine-class register
>> lists for each engine-instance register dump if we find
>> dependent-engine resets in a engine-capture-group.
>>
>>
> alan:snip
> 
> 
>> +/**
>> + * struct guc_log_buffer_state - GuC log buffer state
>> + *
>> + * Below state structure is used for coordination of retrieval of GuC firmware
>> + * logs. Separate state is maintained for each log buffer type.
>> + * read_ptr points to the location where Xe read last in log buffer and
>> + * is read only for GuC firmware. write_ptr is incremented by GuC with number
>> + * of bytes written for each log entry and is read only for Xe.
>> + * When any type of log buffer becomes half full, GuC sends a flush interrupt.
>> + * GuC firmware expects that while it is writing to 2nd half of the buffer,
>> + * first half would get consumed by Host and then get a flush completed
>> + * acknowledgment from Host, so that it does not end up doing any overwrite
>> + * causing loss of logs. So when buffer gets half filled & Xe has requested
>> + * for interrupt, GuC will set flush_to_file field, set the sampled_write_ptr
>> + * to the value of write_ptr and raise the interrupt.
>> + * On receiving the interrupt Xe should read the buffer, clear flush_to_file
>> + * field and also update read_ptr with the value of sample_write_ptr, before
>> + * sending an acknowledgment to GuC. marker & version fields are for internal
>> + * usage of GuC and opaque to Xe. buffer_full_cnt field is incremented every
>> + * time GuC detects the log buffer overflow.
>> + */
>> +struct guc_log_buffer_state {
>> +       /** @marker: buffer state start marker */
>> +       u32 marker[2];
>> +       /** @read_ptr: the Last Byte Offset Location */
> alan: nit: added clarity per spec --> "the last byte offset that was read by KMD previously"
>> +       u32 read_ptr;
>> +       /** @write_ptr: the Byte Offset Location */
> alan: nit: added clarity per spec --> "the next byte offset location that will be written by GuC".
Thanks, I like this, it point out access it.
>> +       u32 write_ptr;
>> +       /** @size: Log buffer size */
>> +       u32 size;
...
>> diff --git a/drivers/gpu/drm/xe/xe_guc_capture.c b/drivers/gpu/drm/xe/xe_guc_capture.c
>> index 72abba8f7bf8..bc400764e80d 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_capture.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_capture.c
>> @@ -10,6 +10,7 @@
>>   
>>   #include "abi/guc_actions_abi.h"
>>   #include "abi/guc_capture_abi.h"
>> +#include "abi/guc_log_abi.h"
>>   #include "regs/xe_engine_regs.h"
>>   #include "regs/xe_gt_regs.h"
>>   #include "regs/xe_guc_regs.h"
>> @@ -32,6 +33,51 @@
>>   #include "xe_macros.h"
>>   #include "xe_map.h"
>>   
>> +/*
>> + * struct __guc_capture_bufstate
>> + *
>> + * Book-keeping structure used to track read and write pointers
>> + * as we extract error capture data from the GuC-log-buffer's
>> + * error-capture region as a stream of dwords.
>> + */
>> +struct __guc_capture_bufstate {
>> +       u32 size;
>> +       u32 data_offset;
>> +       u32 rd;
>> +       u32 wr;
>> +};
>> +
>> +/*
>> + * struct __guc_capture_parsed_output - extracted error capture node
>> + *
>> + * A single unit of extracted error-capture output data grouped together
>> + * at an engine-instance level. We keep these nodes in a linked list.
>> + * See cachelist and outlist below.
>> + */
>> +struct __guc_capture_parsed_output {
>> +       /*
>> +        * A single set of 3 capture lists: a global-list
>> +        * an engine-class-list and an engine-instance list.
>> +        * outlist in __guc_capture_parsed_output will keep
>> +        * a linked list of these nodes that will eventually
>> +        * be detached from outlist and attached into to
>> +        * xe_codedump in response to a context reset
>> +        */
>> +       struct list_head link;
>> +       bool is_partial;
>> +       u32 eng_class;
>> +       u32 eng_inst;
>> +       u32 guc_id;
>> +       u32 lrca;
>> +       struct gcap_reg_list_info {
>> +               u32 vfid;
>> +               u32 num_regs;
>> +               struct guc_mmio_reg *regs;
>> +       } reginfo[GUC_STATE_CAPTURE_TYPE_MAX];
>> +#define GCAP_PARSED_REGLIST_INDEX_GLOBAL   BIT(GUC_STATE_CAPTURE_TYPE_GLOBAL)
>> +#define GCAP_PARSED_REGLIST_INDEX_ENGCLASS BIT(GUC_STATE_CAPTURE_TYPE_ENGINE_CLASS)
>> +};
>> +
> alan: snip
>>
> 
> 
>> +/*
>> + * GuC's error-capture output is a ring buffer populated in a byte-stream fashion:
>> + *
>> + * The GuC Log buffer region for error-capture is managed like a ring buffer.
>> + * The GuC firmware dumps error capture logs into this ring in a byte-stream flow.
>> + * Additionally, as per the current and foreseeable future, all packed error-
>> + * capture output structures are dword aligned.
>> + *
>> + * That said, if the GuC firmware is in the midst of writing a structure that is larger
>> + * than one dword but the tail end of the err-capture buffer-region has lesser space left,
>> + * we would need to extract that structure one dword at a time straddled across the end,
>> + * onto the start of the ring.
>> + *
>> + * Below function, guc_capture_log_remove_bytes is a helper for that. All callers of this
>> + * function would typically do a straight-up memcpy from the ring contents and will only
>> + * call this helper if their structure-extraction is straddling across the end of the
>> + * ring. GuC firmware does not add any padding. The reason for the no-padding is to ease
>> + * scalability for future expansion of output data types without requiring a redesign
>> + * of the flow controls.
>> + */
>> +static int
>> +guc_capture_log_remove_bytes(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
>> +                            void *out, int bytes_needed)
>> +{
>> +       #define GUC_CAPTURE_LOG_BUF_COPY_RETRY_MAX      3
> alan: (conditional-rb) please double check if you are allowed to have a #define indented like this.
Oops, should be start at 0
>> +       int fill_size = 0, tries = GUC_CAPTURE_LOG_BUF_COPY_RETRY_MAX;
>> +       int copy_size, avail;
>> +
>> +       xe_assert(guc_to_xe(guc), bytes_needed % sizeof(u32) == 0);
>> +
>> +       if (bytes_needed > guc_capture_buf_cnt(buf))
>> +               return -1;
>> +
>> +       while (bytes_needed > 0 && tries--) {
>> +               int misaligned;
>> +
>> +               avail = guc_capture_buf_cnt_to_end(buf);
>> +               misaligned = avail % sizeof(u32);
>> +               /* wrap if at end */
>> +               if (!avail) {
>> +                       /* output stream clipped */
>> +                       if (!buf->rd)
>> +                               return fill_size;
>> +                       buf->rd = 0;
>> +                       continue;
>> +               }
>> +
>> +               /* Only copy to u32 aligned data */
>> +               copy_size = avail < bytes_needed ? avail - misaligned : bytes_needed;
>> +               xe_map_memcpy_from(guc_to_xe(guc), out + fill_size, &guc->log.bo->vmap,
>> +                                  buf->data_offset + buf->rd, copy_size);
>> +               buf->rd += copy_size;
>> +               fill_size += copy_size;
>> +               bytes_needed -= copy_size;
>> +
>> +               if (misaligned)
>> +                       xe_gt_dbg(guc_to_gt(guc),
>> +                                 "Bytes extraction not dword aligned, skipping.\n");
> alan:nit, based on code flow, debug message should say "clipping", not "skipping".
> To me either option (skipping or clipping) are okay as long as the message matches
> the code. Also, i personally prefer tihs to be a xe_gt_warn - because this shouldn't
> happen and basically means one of the componnets are breaking the rules.
Sure, will do.
>> +       }
>> +
>> +       return fill_size;
>> +}
>> +
>> +static int
>> +guc_capture_log_get_group_hdr(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
>> +                             struct guc_state_capture_group_header_t *ghdr)
>> +{
>> +       int fullsize = sizeof(struct guc_state_capture_group_header_t);
>> +
>> +       if (guc_capture_log_remove_bytes(guc, buf, ghdr, fullsize) != fullsize)
>> +               return -1;
>> +       return 0;
>> +}
>> +
>> +static int
>> +guc_capture_log_get_data_hdr(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
>> +                            struct guc_state_capture_header_t *hdr)
>> +{
>> +       int fullsize = sizeof(struct guc_state_capture_header_t);
>> +
>> +       if (guc_capture_log_remove_bytes(guc, buf, hdr, fullsize) != fullsize)
>> +               return -1;
>> +       return 0;
>> +}
>> +
>> +static int
>> +guc_capture_log_get_register(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
>> +                            struct guc_mmio_reg *reg)
>> +{
>> +       int fullsize = sizeof(struct guc_mmio_reg);
>> +
>> +       if (guc_capture_log_remove_bytes(guc, buf, reg, fullsize) != fullsize)
>> +               return -1;
>> +       return 0;
>> +}
>> +
> 
> alan: snip
> 
>>
>> @@ -1594,6 +1594,9 @@ static int guc_exec_queue_suspend(struct xe_exec_queue *q)
>>   static int guc_exec_queue_suspend_wait(struct xe_exec_queue *q)
>>   {
>>          struct xe_guc *guc = exec_queue_to_guc(q);
>> +
>> +       wait_event(q->guc->suspend_wait, !q->guc->suspend_pending ||
>> +                  xe_guc_read_stopped(guc));
> alan: (conditional-rb) this looks weird, i thought the above wait_event code
> is already there in baseline and we only changed it from guc_read_stopped
> to xe_guc_read_stopped, not adding the entire new wait function. Also, a local
> variable declaration below this? ... did checkpatch not warn? Can you double check
> if this is an error and fix if necessary? (I'm wondering if you introduced a
> trivial rebase line-number-alignment error.
Thanks Alan, you are right, that is the rebase line-number-alignment 
error. Will fix it in next rev.

>>          int ret;
>>   
> 
> 
> alan:snip