[PATCH v27 6/6] drm/xe/guc: Save manual engine capture into capture list

Teres Alexis, Alan Previn alan.previn.teres.alexis at intel.com
Fri Oct 4 17:38:42 UTC 2024


As per offline conversation just now, since our last thorough review in rev18,
we notice many trivial changes that are not required at all that got accidentally
included as a result of ongoing offline debugging work that you forgot to remove.
I'll highlight them below as part of this review. Looking forward for next rev.



On Thu, 2024-10-03 at 20:24 -0700, Zhanjun Dong wrote:
> Save manual engine capture into capture list.
> This removes duplicate register definitions across manual-capture vs
> guc-err-capture.
> 
> 
alan:snip

> diff --git a/drivers/gpu/drm/xe/xe_guc_capture.c b/drivers/gpu/drm/xe/xe_guc_capture.c
> index f32c086c5968..aafd31f2e477 100644
> --- a/drivers/gpu/drm/xe/xe_guc_capture.c
> +++ b/drivers/gpu/drm/xe/xe_guc_capture.c
> @@ -312,7 +312,6 @@ guc_capture_get_one_list(const struct __guc_mmio_reg_descr_group *reglists,
>                      reglists[i].type == GUC_STATE_CAPTURE_TYPE_GLOBAL))
>                         return &reglists[i];
>         }
> -
alan: nit: redundant change.
>         return NULL;
>  }
>  
> @@ -881,7 +880,7 @@ guc_capture_remove_stale_matches_from_list(struct xe_guc_state_capture *gc,
>         int guc_id = node->guc_id;
>  
>         list_for_each_entry_safe(n, ntmp, &gc->outlist, link) {
> -               if (n != node && !n->locked && n->guc_id == guc_id)
> +               if (n && n != node && !n->locked && n->guc_id == guc_id)
alan: dont check if "n" is valid, coz if its not, something else is busted.
>                         guc_capture_free_outlist_node(gc, n);
>         }
>  }
> @@ -935,7 +934,7 @@ guc_capture_init_node(struct xe_guc *guc, struct __guc_capture_parsed_output *no
>   *                   guc->capture->cachelist and populated with the error-capture
>   *                   data from GuC and then it's added into guc->capture->outlist linked
>   *                   list. This list is used for matchup and printout by xe_devcoredump_read
> - *                   and xe_hw_engine_snapshot_print, (when user invokes the devcoredump sysfs).
> + *                   and xe_engine_snapshot_print, (when user invokes the devcoredump sysfs).
>   *
>   * GUC --> notify context reset:
>   * -----------------------------
> @@ -943,12 +942,13 @@ guc_capture_init_node(struct xe_guc *guc, struct __guc_capture_parsed_output *no
>   *                   L--> xe_devcoredump
>   *                          L--> devcoredump_snapshot
>   *                               --> xe_hw_engine_snapshot_capture
> + *                               --> xe_engine_manual_capture(For manual capture)
>   *
>   * User Sysfs / Debugfs
>   * --------------------
>   *      --> xe_devcoredump_read->
>   *             L--> xxx_snapshot_print
> - *                    L--> xe_hw_engine_snapshot_print
> + *                    L--> xe_engine_snapshot_print
>   *                         Print register lists values saved at
>   *                         guc->capture->outlist
>   *
> @@ -1070,6 +1070,9 @@ guc_capture_get_prealloc_node(struct xe_guc *guc)
>  {
>         struct __guc_capture_parsed_output *found = NULL;
>  
> +       if (!guc->capture)
> +               return NULL;
> +
alan: guc_capture_get_prealloc_node is expected to be called after guc subsystem is
valid so we should never need above check. From the quick test we ran just now,
this was proven to be true.
>         if (!list_empty(&guc->capture->cachelist)) {
>                 struct __guc_capture_parsed_output *n, *ntmp;
>  
> 
alan:snip
> @@ -1619,14 +1744,23 @@ void xe_engine_guc_capture_print(struct xe_hw_engine_snapshot *snapshot, struct
>         const struct __guc_mmio_reg_descr_group *list;
>         enum guc_capture_list_class_type capture_class;
>  
> -       struct xe_gt *gt = snapshot->hwe->gt;
> -       struct xe_device *xe = gt_to_xe(gt);
> -       struct xe_devcoredump *devcoredump = &xe->devcoredump;
> -       struct xe_devcoredump_snapshot *devcore_snapshot = &devcoredump->snapshot;
> +       struct xe_gt *gt;
> +       struct xe_guc *guc;
> +       struct xe_device *xe;
> +       struct xe_devcoredump *devcoredump;
> +       struct xe_devcoredump_snapshot *devcore_snapshot;
>  
alan: the changes above and below are basically a "fixes" for Patch#5 when
we are checking snapshot after dereferencing it. So lets just fix this in
Patch #5. I'm giving you the okay to maintain the rb in Patch #5 with this.
>         if (!snapshot)
>                 return;
> -       XE_WARN_ON(!devcore_snapshot->matched_node);
> +
> +       gt = snapshot->hwe->gt;
> +       guc = &gt->uc.guc;
> +       xe = gt_to_xe(gt);
> +       devcoredump = &xe->devcoredump;
> +       devcore_snapshot = &devcoredump->snapshot;
> +
> +       if (!guc->capture || !devcore_snapshot->matched_node)
> +               return;
>  
>         xe_gt_assert(gt, snapshot->source <= XE_ENGINE_CAPTURE_SOURCE_GUC);
>         xe_gt_assert(gt, snapshot->hwe);
> @@ -1641,6 +1775,8 @@ void xe_engine_guc_capture_print(struct xe_hw_engine_snapshot *snapshot, struct
>         drm_printf(p, "\tCoverage: %s\n", grptype[devcore_snapshot->matched_node->is_partial]);
>         drm_printf(p, "\tForcewake: domain 0x%x, ref %d\n",
>                    snapshot->forcewake.domain, snapshot->forcewake.ref);
> +       drm_printf(p, "\tReserved: %s\n",
> +                  str_yes_no(snapshot->kernel_reserved));
>  
>         for (type = GUC_STATE_CAPTURE_TYPE_GLOBAL; type < GUC_STATE_CAPTURE_TYPE_MAX; type++) {
>                 list = xe_guc_capture_get_reg_desc_list(gt, GUC_CAPTURE_LIST_INDEX_PF, type,
> @@ -1714,7 +1850,7 @@ xe_guc_capture_get_matching_and_lock(struct xe_sched_job *job)
>                  * lrca info.
>                  */
>                 list_for_each_entry_safe(n, ntmp, &guc->capture->outlist, link) {
> -                       if (n->eng_class == guc_class && n->eng_inst == hwe->instance &&
> +                       if (n && n->eng_class == guc_class && n->eng_inst == hwe->instance &&
alan: same as before - no need to check n here.
>                             n->guc_id == guc_id && n->lrca == lrca &&
>                             n->source == XE_ENGINE_CAPTURE_SOURCE_GUC) {
>                                 n->locked = 1;
> @@ -1750,31 +1886,51 @@ xe_engine_snapshot_capture_for_job(struct xe_sched_job *job)
>                         continue;
>                 }
>  
> -               if (!coredump->snapshot.hwe[id])
> +               if (!coredump->snapshot.hwe[id]) {
>                         coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe, job);
> +               } else {
> +                       struct __guc_capture_parsed_output *new;
> +
> +                       new = xe_guc_capture_get_matching_and_lock(job);
> +                       if (new) {
> +                               struct xe_guc *guc =  &q->gt->uc.guc;
> +
> +                               /*
> +                                * If we are in here, it means we found a fresh
> +                                * GuC-err-capture node for this engine after
> +                                * previously failing to find a match in the
> +                                * early part of guc_exec_queue_timedout_job.
> +                                * Thus we must free the manually captured node
> +                                */
> +                               guc_capture_free_outlist_node(guc->capture,
> +                                                             coredump->snapshot.matched_node);
> +                               coredump->snapshot.matched_node = new;
> +                       }
> +               }
>  
>                 break;
>         }
>  }
>  
>  /*
> - * xe_guc_capture_free - Cleanup GuC captured register list
> + * xe_guc_capture_put_matched_nodes - Release macthed nodes
>   * @guc: The GuC object
>   *
> - * Free matched_node and all nodes with the equal guc_id from GuC captured
> - * register list
> + * Free matched node and all nodes with the equal guc_id from
> + * GuC captured outlist
>   */
> -void xe_guc_capture_free(struct xe_guc *guc)
> +void xe_guc_capture_put_matched_nodes(struct xe_guc *guc)
>  {
>         struct xe_device *xe = guc_to_xe(guc);
>         struct xe_devcoredump *devcoredump = &xe->devcoredump;
> -       struct __guc_capture_parsed_output *n = devcoredump->snapshot.matched_node;
alan: all the changes in this function doesnt change the implementation, usage or behavior
since Patch #5 - so we can just move these changes directly into patch #5. The name change is
the one thing i did want to see since "free" is the opposite of "alloc" but we are not
freeing content, we are returning a node we had "get" previously.
>  
> -       if (n) {
> -               guc_capture_remove_stale_matches_from_list(guc->capture, n);
> -               guc_capture_free_outlist_node(guc->capture, n);
> +       if (devcoredump->snapshot.matched_node) {
> +               guc_capture_remove_stale_matches_from_list(guc->capture,
> +                                                          devcoredump->snapshot.matched_node);
> +               guc_capture_free_outlist_node(guc->capture, devcoredump->snapshot.matched_node);
> +
> +               devcoredump->snapshot.matched_node = NULL;
>         }
> -       devcoredump->snapshot.matched_node = NULL;
>  }
>  
>  /*
alan:snip


More information about the Intel-xe mailing list