[PATCH v2] drm/xe/uapi: Bring back reset uevent
Aravind Iddamsetty
aravind.iddamsetty at linux.intel.com
Wed Aug 14 06:46:41 UTC 2024
On 13/08/24 22:24, Rodrigo Vivi wrote:
> On Tue, Aug 13, 2024 at 04:28:32PM +0300, Raag Jadav wrote:
>> On Mon, Aug 12, 2024 at 03:08:14PM +0530, Aravind Iddamsetty wrote:
>>> On 12/08/24 13:18, Raag Jadav wrote:
>>>> From: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>>>>
>>>> This was dropped in commit 77a0d4d1cea2 ("drm/xe/uapi: Remove reset
>>>> uevent for now") as part of refactoring.
>>>>
>>>> Now that we have better uapi semantics and naming for the uevent,
>>>> bring it back. With this in place, userspace will be notified of
>>>> wedged device along with its reason.
>>>>
>>>> $ udevadm monitor --property --kernel
>>>> monitor will print the received events for:
>>>> KERNEL - the kernel uevent
>>>>
>>>> KERNEL[871.188570] change /devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0 (pci)
>>>> ACTION=change
>>>> DEVPATH=/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0
>>>> SUBSYSTEM=pci
>>>> DEVICE_STATUS=NEEDS_RESET
>>>> REASON=GT_RESET_FAILED
>>>> TILE_ID=0
>>>> GT_ID=0
>>>> DRIVER=xe
>>>> PCI_CLASS=30000
>>>> PCI_ID=8086:56B1
>>>> PCI_SUBSYS_ID=8086:1210
>>>> PCI_SLOT_NAME=0000:03:00.0
>>>> MODALIAS=pci:v00008086d000056B1sv00008086sd00001210bc03sc00i00
>>>> SEQNUM=6104
>>>>
>>>> v2: Change authorship to Himal (Aravind)
>>>> Add uevent for all device wedged cases (Aravind)
>>>>
>>>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>>>> Signed-off-by: Raag Jadav <raag.jadav at intel.com>
>>>> ---
>>>> drivers/gpu/drm/xe/xe_device.c | 10 +++++++++-
>>>> drivers/gpu/drm/xe/xe_device.h | 2 +-
>>>> drivers/gpu/drm/xe/xe_gt.c | 23 +++++++++++++++++++----
>>>> drivers/gpu/drm/xe/xe_guc.c | 13 ++++++++++++-
>>>> drivers/gpu/drm/xe/xe_guc_submit.c | 13 ++++++++++++-
>>>> include/uapi/drm/xe_drm.h | 29 +++++++++++++++++++++++++++++
>>>> 6 files changed, 82 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>>>> index 1aba6f9eaa19..d975bdce4a7d 100644
>>>> --- a/drivers/gpu/drm/xe/xe_device.c
>>>> +++ b/drivers/gpu/drm/xe/xe_device.c
>>>> @@ -955,6 +955,7 @@ static void xe_device_wedged_fini(struct drm_device *drm, void *arg)
>>>> /**
>>>> * xe_device_declare_wedged - Declare device wedged
>>>> * @xe: xe device instance
>>>> + * @event_params: parameters to be sent along with uevent
>>>> *
>>>> * This is a final state that can only be cleared with a mudule
>>>> * re-probe (unbind + bind).
>>>> @@ -965,8 +966,10 @@ static void xe_device_wedged_fini(struct drm_device *drm, void *arg)
>>>> * on every single execution timeout (a.k.a. GPU hang) right after devcoredump
>>>> * snapshot capture. In this mode, GT reset won't be attempted so the state of
>>>> * the issue is preserved for further debugging.
>>>> + * Caller is expected to pass respective parameters to be sent along with
>>>> + * uevent. Pass NULL in case of no params.
>>>> */
>>>> -void xe_device_declare_wedged(struct xe_device *xe)
>>>> +void xe_device_declare_wedged(struct xe_device *xe, char **event_params)
>>>> {
>>>> struct xe_gt *gt;
>>>> u8 id;
>>>> @@ -984,12 +987,17 @@ void xe_device_declare_wedged(struct xe_device *xe)
>>>> xe_pm_runtime_get_noresume(xe);
>>>>
>>>> if (!atomic_xchg(&xe->wedged.flag, 1)) {
>>>> + struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>>>> +
>>>> xe->needs_flr_on_fini = true;
>>>> drm_err(&xe->drm,
>>>> "CRITICAL: Xe has declared device %s as wedged.\n"
>>>> "IOCTLs and executions are blocked. Only a rebind may clear the failure\n"
>>>> "Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
>>>> dev_name(xe->drm.dev));
>>>> +
>>>> + /* Notify userspace about reset required */
>>>> + kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, event_params);
>>>> }
>>>>
>>>> for_each_gt(gt, xe, id)
>>>> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
>>>> index db6cc8d0d6b8..5d40fc6f0904 100644
>>>> --- a/drivers/gpu/drm/xe/xe_device.h
>>>> +++ b/drivers/gpu/drm/xe/xe_device.h
>>>> @@ -174,7 +174,7 @@ static inline bool xe_device_wedged(struct xe_device *xe)
>>>> return atomic_read(&xe->wedged.flag);
>>>> }
>>>>
>>>> -void xe_device_declare_wedged(struct xe_device *xe);
>>>> +void xe_device_declare_wedged(struct xe_device *xe, char **reset_event);
>>>>
>>>> struct xe_file *xe_file_get(struct xe_file *xef);
>>>> void xe_file_put(struct xe_file *xef);
>>>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>>>> index 58895ed22f6e..519f3c2cf9e2 100644
>>>> --- a/drivers/gpu/drm/xe/xe_gt.c
>>>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>>>> @@ -741,6 +741,24 @@ static int do_gt_restart(struct xe_gt *gt)
>>>> return 0;
>>>> }
>>>>
>>>> +static void xe_gt_reset_failed(struct xe_gt *gt, int err)
>>>> +{
>>>> + char *event_params[5];
>>>> +
>>>> + xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
>>>> +
>>>> + event_params[0] = DRM_XE_RESET_REQUIRED_UEVENT;
>>>> + event_params[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT;
>>>> + event_params[2] = kasprintf(GFP_KERNEL, "TILE_ID=%d", gt_to_tile(gt)->id);
>>>> + event_params[3] = kasprintf(GFP_KERNEL, "GT_ID=%d", gt->info.id);
>>> the TILE_ID and GT_ID can be passed for other events as well, with that you can
>>> have a common function to send uevent which would take reason as an input.
>> But is that required for all cases? There could be potential cases atleast
>> in the future where it is not needed.
At least in these cases it makes sense as they (other reasons)
can be associated to a GT and a Tile. If in future they arises a
reason where these details are not needed i guess we can handle that.
>>
>>>> + event_params[4] = NULL;
>>>> +
>>>> + xe_device_declare_wedged(gt_to_xe(gt), event_params);
>>>> +
>>>> + kfree(event_params[2]);
>>>> + kfree(event_params[3]);
>>>> +}
>>>> +
>>>> static int gt_reset(struct xe_gt *gt)
>>>> {
>>>> int err;
>>>> @@ -796,10 +814,7 @@ static int gt_reset(struct xe_gt *gt)
>>>> XE_WARN_ON(xe_uc_start(>->uc));
>>>> xe_pm_runtime_put(gt_to_xe(gt));
>>>> err_fail:
>>>> - xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
>>>> -
>>>> - xe_device_declare_wedged(gt_to_xe(gt));
>>>> -
>>>> + xe_gt_reset_failed(gt, err);
>>>> return err;
>>>> }
>>>>
>>>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>>>> index de0fe9e65746..b544012f5b11 100644
>>>> --- a/drivers/gpu/drm/xe/xe_guc.c
>>>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>>>> @@ -560,6 +560,17 @@ static s32 guc_pc_get_cur_freq(struct xe_guc_pc *guc_pc)
>>>> return ret ? ret : freq;
>>>> }
>>>>
>>>> +static void xe_guc_load_failed(struct xe_gt *gt)
>>>> +{
>>>> + char *event_params[3];
>>>> +
>>>> + event_params[0] = DRM_XE_RESET_REQUIRED_UEVENT;
>>>> + event_params[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_GUC;
>>>> + event_params[2] = NULL;
>>>> +
>>>> + xe_device_declare_wedged(gt_to_xe(gt), event_params);
>>>> +}
>>>> +
>>>> /*
>>>> * Wait for the GuC to start up.
>>>> *
>>>> @@ -684,7 +695,7 @@ static void guc_wait_ucode(struct xe_guc *guc)
>>>> break;
>>>> }
>>>>
>>>> - xe_device_declare_wedged(gt_to_xe(gt));
>>>> + xe_guc_load_failed(gt);
>>>> } else if (delta_ms > GUC_LOAD_TIME_WARN_MS) {
>>>> xe_gt_warn(gt, "excessive init time: %lldms! [status = 0x%08X, timeouts = %d]\n",
>>>> delta_ms, status, count);
>>>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
>>>> index 460808507947..33ed6221f465 100644
>>>> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
>>>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>>>> @@ -891,6 +891,17 @@ void xe_guc_submit_wedge(struct xe_guc *guc)
>>>> mutex_unlock(&guc->submission_state.lock);
>>>> }
>>>>
>>>> +static void xe_exec_queue_timedout(struct xe_device *xe)
>>>> +{
>>>> + char *event_params[3];
>>>> +
>>>> + event_params[0] = DRM_XE_RESET_REQUIRED_UEVENT;
>>>> + event_params[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_TOUT;
>>>> + event_params[2] = NULL;
>>>> +
>>>> + xe_device_declare_wedged(xe, event_params);
>>>> +}
>>>> +
>>>> static bool guc_submit_hint_wedged(struct xe_guc *guc)
>>>> {
>>>> struct xe_device *xe = guc_to_xe(guc);
>>>> @@ -901,7 +912,7 @@ static bool guc_submit_hint_wedged(struct xe_guc *guc)
>>>> if (xe_device_wedged(xe))
>>>> return true;
>>>>
>>> i see that this is even called from xe_guc_exec_queue_lr_cleanup which is for long running queue
>>> from various call paths need to check in what scenarios those happen.
>> Should we add a reason for long running queue?
> Both of your questions would probably be answered if this was getting developed
> at the same time of the user space usage of these uevents.
Can't we consider the generic Linux user as a consumer, via udevadm.
Thanks,
Aravind.
>
>> Raag
More information about the Intel-xe
mailing list