[PATCH v1 3/3] drm/amdgpu: update the error logging for more information
Khatri, Sunil
sukhatri at amd.com
Fri Apr 11 16:01:04 UTC 2025
On 4/11/2025 7:54 PM, Alex Deucher wrote:
> On Fri, Apr 11, 2025 at 9:05 AM Sunil Khatri <sunil.khatri at amd.com> wrote:
>> Add process name and pid information to the userqueue error
>> logging so that the logs are more useful when resolving
>> the error.
>>
>> Sample log:
>> [ 42.444297] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=000000001c74d978 for comm:Xwayland pid:3427
>> [ 42.444669] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:Xwayland pid:3427
>> [ 42.824729] [drm:amdgpu_userqueue_wait_for_signal [amdgpu]] *ERROR* Timed out waiting for fence f=0000000074407d3e for comm:systemd-logind pid:1058
>> [ 42.825082] [drm:amdgpu_userqueue_suspend [amdgpu]] *ERROR* Not suspending userqueue, timeout waiting for comm:systemd-logind pid:1058
>>
>> Signed-off-by: Sunil Khatri <sunil.khatri at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 45 +++++++++++++++----
>> 1 file changed, 37 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
>> index ecd49cf15b2a..5b58c41618ee 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
>> @@ -62,12 +62,17 @@ amdgpu_userqueue_cleanup(struct amdgpu_userq_mgr *uq_mgr,
>> struct amdgpu_device *adev = uq_mgr->adev;
>> const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type];
>> struct dma_fence *f = queue->last_fence;
>> + struct drm_file *file;
>> + char proc_log[50];
>> int ret;
>>
>> if (f && !dma_fence_is_signaled(f)) {
>> ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
>> if (ret <= 0) {
>> - DRM_ERROR("Timed out waiting for fence f=%p\n", f);
>> + file = uq_mgr->file;
>> + drm_process_info(file, proc_log, sizeof(proc_log));
>> + DRM_ERROR("Timed out waiting for fence f=%p for %s\n",
>> + f, proc_log);
> use drm_err() here and below so we get proper handling of multiple devices.
>
> Alex
Sure, Alex. Once I have the main drm patch reviewed, I will update these too.
Sunil
>
>> return;
>> }
>> }
>> @@ -427,6 +432,8 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
>> const struct amdgpu_userq_funcs *userq_funcs;
>> struct amdgpu_usermode_queue *queue;
>> int queue_id;
>> + struct drm_file *file;
>> + char proc_log[50];
>> int ret = 0;
>>
>> /* Resume all the queues for this process */
>> @@ -435,8 +442,12 @@ amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
>> ret = userq_funcs->resume(uq_mgr, queue);
>> }
>>
>> - if (ret)
>> - DRM_ERROR("Failed to resume all the queue\n");
>> + if (ret) {
>> + file = uq_mgr->file;
>> + drm_process_info(file, proc_log, sizeof(proc_log));
>> + DRM_ERROR("Failed to resume all the queue for %s\n",
>> + proc_log);
>> + }
>> return ret;
>> }
>>
>> @@ -585,6 +596,8 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
>> const struct amdgpu_userq_funcs *userq_funcs;
>> struct amdgpu_usermode_queue *queue;
>> int queue_id;
>> + struct drm_file *file;
>> + char proc_log[50];
>> int ret = 0;
>>
>> /* Try to suspend all the queues in this process ctx */
>> @@ -593,8 +606,12 @@ amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
>> ret += userq_funcs->suspend(uq_mgr, queue);
>> }
>>
>> - if (ret)
>> - DRM_ERROR("Couldn't suspend all the queues\n");
>> + if (ret) {
>> + file = uq_mgr->file;
>> + drm_process_info(file, proc_log, sizeof(proc_log));
>> + DRM_ERROR("Couldn't suspend all the queues for %s\n",
>> + proc_log);
>> + }
>> return ret;
>> }
>>
>> @@ -602,6 +619,8 @@ static int
>> amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>> {
>> struct amdgpu_usermode_queue *queue;
>> + struct drm_file *file;
>> + char proc_log[50];
>> int queue_id, ret;
>>
>> idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
>> @@ -611,7 +630,10 @@ amdgpu_userqueue_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>> continue;
>> ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
>> if (ret <= 0) {
>> - DRM_ERROR("Timed out waiting for fence f=%p\n", f);
>> + file = uq_mgr->file;
>> + drm_process_info(file, proc_log, sizeof(proc_log));
>> + DRM_ERROR("Timed out waiting for fence f=%p for %s\n",
>> + f, proc_log);
>> return -ETIMEDOUT;
>> }
>> }
>> @@ -624,19 +646,26 @@ amdgpu_userqueue_suspend(struct amdgpu_userq_mgr *uq_mgr,
>> struct amdgpu_eviction_fence *ev_fence)
>> {
>> int ret;
>> + struct drm_file *file;
>> + char proc_log[50];
>> struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>> struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
>>
>> /* Wait for any pending userqueue fence work to finish */
>> ret = amdgpu_userqueue_wait_for_signal(uq_mgr);
>> if (ret) {
>> - DRM_ERROR("Not suspending userqueue, timeout waiting for work\n");
>> + file = uq_mgr->file;
>> + drm_process_info(file, proc_log, sizeof(proc_log));
>> + DRM_ERROR("Not suspending userqueue, timeout waiting for %s\n",
>> + proc_log);
>> return;
>> }
>>
>> ret = amdgpu_userqueue_suspend_all(uq_mgr);
>> if (ret) {
>> - DRM_ERROR("Failed to evict userqueue\n");
>> + file = uq_mgr->file;
>> + drm_process_info(file, proc_log, sizeof(proc_log));
>> + DRM_ERROR("Failed to evict userqueue for %s\n", proc_log);
>> return;
>> }
>>
>> --
>> 2.34.1
>>
More information about the amd-gfx
mailing list