[PATCH] drm/xe/guc: Track FAST_REQ H2Gs to report where errors came from
Daniele Ceraolo Spurio
daniele.ceraolospurio at intel.com
Mon Mar 24 16:48:45 UTC 2025
<snip>
>>>> @@ -1141,6 +1165,47 @@ static int guc_crash_process_msg(struct
>>>> xe_guc_ct *ct, u32 action)
>>>> return 0;
>>>> }
>>>> +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>>>> +static void fast_req_report(struct xe_guc_ct *ct, u16 fence)
>>>> +{
>>>> + unsigned int n;
>>>> + bool found = false;
>>>> +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
>>>> + char *buf;
>>>> +#endif
>>>> +
>>>> + lockdep_assert_held(&ct->lock);
>>>> +
>>>> + for (n = 0; n < ARRAY_SIZE(ct->fast_req); n++) {
>>>> + if (ct->fast_req[n].fence != fence)
>>>> + continue;
>>>> + found = true;
>>>> +
>>>> +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
>>>> + buf = kmalloc(SZ_4K, GFP_NOWAIT);
>>>> + if (buf && stack_depot_snprint(ct->fast_req[n].stack, buf,
>>>> SZ_4K, 0))
>>>> + xe_gt_err(ct_to_gt(ct), "Fence 0x%x was used by action
>>>> %#04x sent at\n%s",
>>>> + fence, ct->fast_req[n].action, buf);
>>>> + else
>>>> + xe_gt_err(ct_to_gt(ct), "Fence 0x%x was used by action
>>>> %#04x [failed to retrieve stack]\n",
>>>> + fence, ct->fast_req[n].action);
>>>> + kfree(buf);
>>>> +#else
>>>> + xe_gt_err(ct_to_gt(ct), "Fence 0x%x was used by action
>>>> %#04x\n",
>>>> + fence, ct->fast_req[n].action);
>>>> +#endif
>>>> + break;
>>>> + }
>>>> +
>>>> + if (!found)
>>>> + xe_gt_warn(ct_to_gt(ct), "FAST_REQ G2H fence 0x%x not found!
>>>> \n", fence);
>>> Not convinced about this error message. the fast_req array is only 32
>>> entries deep, so it wouldn't be weird for entries to be overwritten
>>> in a
>>> busy system, but the read I get from this message is that something is
>>> wrong with the fact that we didn't find the fence. Maybe go for
>>> something like: "FAST_REQ G2H fence 0x%x action unknown". Not a
>>> blocker.
> How about:
> xe_gt_warn(gt, "Fence 0x%x not found - tracking buffer
> wrapped?\n", fence);
>
>
Sounds good to me
<snip>
>
>>
>>>> + /** @action: H2G action code */
>>>> + u16 action;
>>>> +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
>>>> + /** @stack: call stack from when the H2G was sent */
>>>> + depot_stack_handle_t stack;
>>>> +#endif
>>>> +};
>>> nit: should this whole struct be wrapped in CONFIG_DRM_XE_DEBUG? Not
>>> sure if any code analyzer would be smart enough to mark it as unused if
>>> CONFIG_DRM_XE_DEBUG is not set.
> Um, it is. Or are you saying that it should not be?
>
> That '#endif' just below matches a CONFIG_DRM_XE_DEBUG that was
> previously just wrapping the DEAD_CT structure. Now it wraps both that
> and the FAST_REQ fence structure.
D'oh! I missed that the ifdef was already there.
Daniele
>
> John.
>
>
>>>
>>> Apart from the nits this looks good to me:
>>>
>>> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>>>
>>> Daniele
>>>
>>>> #endif
>>>> /**
>>>> @@ -152,6 +165,8 @@ struct xe_guc_ct {
>>>> #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
>>>> /** @dead: information for debugging dead CTs */
>>>> struct xe_dead_ct dead;
>>>> + /** @fast_req: history of FAST_REQ messages for matching with G2H
>>>> error responses*/
>>>> + struct xe_fast_req_fence fast_req[SZ_32];
>>>> #endif
>>>> };
>>>> diff --git a/drivers/gpu/drm/xe/xe_guc_log.h b/drivers/gpu/drm/xe/
>>>> xe_guc_log.h
>>>> index 5b896f5fafaf..f1e2b0be90a9 100644
>>>> --- a/drivers/gpu/drm/xe/xe_guc_log.h
>>>> +++ b/drivers/gpu/drm/xe/xe_guc_log.h
>>>> @@ -12,7 +12,7 @@
>>>> struct drm_printer;
>>>> struct xe_device;
>>>> -#if IS_ENABLED(CONFIG_DRM_XE_LARGE_GUC_BUFFER)
>>>> +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
>>>> #define CRASH_BUFFER_SIZE SZ_1M
>>>> #define DEBUG_BUFFER_SIZE SZ_8M
>>>> #define CAPTURE_BUFFER_SIZE SZ_2M
>
More information about the Intel-xe
mailing list