[PATCH v7 07/10] drm/xe/guc: Dead CT helper
Julia Filipchuk
julia.filipchuk at intel.com
Wed Sep 11 19:55:36 UTC 2024
On 9/5/2024 1:51 PM, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> Add a worker function helper for asynchronously dumping state when an
> internal/fatal error is detected in CT processing. Being asynchronous
> is required to avoid deadlocks and scheduling-while-atomic or
> process-stalled-for-too-long issues. Also check for a bunch more error
> conditions and improve the handling of some existing checks.
>
> v2: Use compile time CONFIG check for new (but not directly CT_DEAD
> related) checks and use unsigned int for a bitmask, rename
> CT_DEAD_RESET to CT_DEAD_REARM and add some explaining comments,
> rename 'hxg' macro parameter to 'ctb' - review feedback from Michal W.
> Drop CT_DEAD_ALIVE as no need for a bitfield define to just set the
> entire mask to zero.
> v3: Fix kerneldoc
> v4: Nullify some floating pointers after free.
> v5: Add section headings and device info to make the state dump look
> more like a devcoredump to allow parsing by the same tools (eventual
> aim is to just call the devcoredump code itself, but that currently
> requires an xe_sched_job, which is not available in the CT code).
>
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> ---
> .../drm/xe/abi/guc_communication_ctb_abi.h | 1 +
> drivers/gpu/drm/xe/xe_guc.c | 2 +-
> drivers/gpu/drm/xe/xe_guc_ct.c | 280 ++++++++++++++++--
> drivers/gpu/drm/xe/xe_guc_ct.h | 2 +-
> drivers/gpu/drm/xe/xe_guc_ct_types.h | 23 ++
> 5 files changed, 280 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h b/drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h
> index 8f86a16dc577..f58198cf2cf6 100644
> --- a/drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h
> +++ b/drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h
> @@ -52,6 +52,7 @@ struct guc_ct_buffer_desc {
> #define GUC_CTB_STATUS_OVERFLOW (1 << 0)
> #define GUC_CTB_STATUS_UNDERFLOW (1 << 1)
> #define GUC_CTB_STATUS_MISMATCH (1 << 2)
> +#define GUC_CTB_STATUS_DISABLED (1 << 3)
> u32 reserved[13];
> } __packed;
> static_assert(sizeof(struct guc_ct_buffer_desc) == 64);
> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
> index 34cdb08b6e27..3fef24c965c4 100644
> --- a/drivers/gpu/drm/xe/xe_guc.c
> +++ b/drivers/gpu/drm/xe/xe_guc.c
> @@ -1176,7 +1176,7 @@ void xe_guc_print_info(struct xe_guc *guc, struct drm_printer *p)
>
> xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
>
> - xe_guc_ct_print(&guc->ct, p, false);
> + xe_guc_ct_print(&guc->ct, p);
> xe_guc_submit_print(guc, p);
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index a63fe0a9077a..e31b1f0b855f 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -25,12 +25,57 @@
> #include "xe_gt_sriov_pf_monitor.h"
> #include "xe_gt_tlb_invalidation.h"
> #include "xe_guc.h"
> +#include "xe_guc_log.h"
> #include "xe_guc_relay.h"
> #include "xe_guc_submit.h"
> #include "xe_map.h"
> #include "xe_pm.h"
> #include "xe_trace_guc.h"
>
> +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
> +enum {
> + CT_DEAD_REARM, /* 0x0001 - not an error condition */
> + CT_DEAD_SETUP, /* 0x0002 */
> + CT_DEAD_H2G_WRITE, /* 0x0004 */
> + CT_DEAD_H2G_HAS_ROOM, /* 0x0008 */
> + CT_DEAD_G2H_READ, /* 0x0010 */
> + CT_DEAD_G2H_RECV, /* 0x0020 */
> + CT_DEAD_G2H_RELEASE, /* 0x0040 */
> + CT_DEAD_DEADLOCK, /* 0x0080 */
> + CT_DEAD_PROCESS_FAILED, /* 0x0100 */
> + CT_DEAD_FAST_G2H, /* 0x0200 */
> + CT_DEAD_PARSE_G2H_RESPONSE, /* 0x0400 */
> + CT_DEAD_PARSE_G2H_UNKNOWN, /* 0x0800 */
> + CT_DEAD_PARSE_G2H_ORIGIN, /* 0x1000 */
> + CT_DEAD_PARSE_G2H_TYPE, /* 0x2000 */
> +};
> +
> +static void ct_dead_worker_func(struct work_struct *w);
> +
> +#define CT_DEAD(ct, ctb, reason_code) \
> + do { \
> + struct guc_ctb *_ctb = (ctb); \
> + if (_ctb) \
> + _ctb->info.broken = true; \
> + if (!(ct)->dead.reported) { \
Do we need to worry about a second dead-CT event conflicting with quick
back-to-back CT_DEAD calls? I suggest setting 'reported' here and clearing it
only from the worker thread. The snapshot, rather than 'reported', could then
be used in the worker function to determine whether the state has been printed.
Also, should the snapshot be taken in the CT_DEAD_REARM case?
> + struct xe_guc *guc = ct_to_guc(ct); \
> + spin_lock_irq(&ct->dead.lock); \
> + (ct)->dead.reason |= 1 << CT_DEAD_##reason_code; \
Does this field need to be cleared, or does it accumulate the reasons the CT died?
> + (ct)->dead.snapshot_log = xe_guc_log_snapshot_capture(&guc->log, true); \
> + (ct)->dead.snapshot_ct = xe_guc_ct_snapshot_capture((ct), true); \
> + spin_unlock_irq(&ct->dead.lock); \
> + queue_work(system_unbound_wq, &(ct)->dead.worker); \
> + } \
> + } while (0)
> +#else
> +#define CT_DEAD(ct, ctb, reason) \
> + do { \
> + struct guc_ctb *_ctb = (ctb); \
> + if (_ctb) \
> + _ctb->info.broken = true; \
> + } while (0)
> +#endif
> +
> /* Used when a CT send wants to block and / or receive data */
> struct g2h_fence {
> u32 *response_buffer;
> @@ -183,6 +228,10 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
> xa_init(&ct->fence_lookup);
> INIT_WORK(&ct->g2h_worker, g2h_worker_func);
> INIT_DELAYED_WORK(&ct->safe_mode_worker, safe_mode_worker_func);
> +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
> + spin_lock_init(&ct->dead.lock);
> + INIT_WORK(&ct->dead.worker, ct_dead_worker_func);
> +#endif
> init_waitqueue_head(&ct->wq);
> init_waitqueue_head(&ct->g2h_fence_wq);
>
> @@ -419,10 +468,22 @@ int xe_guc_ct_enable(struct xe_guc_ct *ct)
> if (ct_needs_safe_mode(ct))
> ct_enter_safe_mode(ct);
>
> +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
> + /*
> + * The CT has now been reset so the dumper can be re-armed
> + * after any existing dead state has been dumped.
> + */
> + spin_lock_irq(&ct->dead.lock);
> + if (ct->dead.reason)
> + ct->dead.reason |= CT_DEAD_REARM;
> + spin_unlock_irq(&ct->dead.lock);
> +#endif
> +
> return 0;
>
> err_out:
> xe_gt_err(gt, "Failed to enable GuC CT (%pe)\n", ERR_PTR(err));
> + CT_DEAD(ct, NULL, SETUP);
>
> return err;
> }
> @@ -773,8 +895,13 @@ static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
> goto broken;
> #undef g2h_avail
>
> - if (dequeue_one_g2h(ct) < 0)
> + ret = dequeue_one_g2h(ct);
> + if (ret < 0) {
> + if (ret != -ECANCELED)
> + xe_gt_err(ct_to_gt(ct), "CTB receive failed (%pe)",
> + ERR_PTR(ret));
Is it correct that there is no success condition here? Should the -ECANCELED
case route back to try_again?
> goto broken;
> + }
>
> goto try_again;
> }
> +
> +static void ct_dead_worker_func(struct work_struct *w)
> +{
> + struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, dead.worker);
> +
> + if (!ct->dead.reported) {
> + ct->dead.reported = true;
> + ct_dead_print(&ct->dead);
> + }
Would this guard protect against quick back-to-back CT_DEAD calls? Perhaps that
cannot happen in practice? I suggest moving 'ct->dead.reported = true;' into the
CT_DEAD macro itself. This relates to my comment on the CT_DEAD macro above.
Reviewed-by: Julia Filipchuk <julia.filipchuk at intel.com>
More information about the Intel-xe
mailing list