[PATCH] drm/sched: fix the duplicated TMO message for one IB
Liu, Monk
Monk.Liu at amd.com
Thu May 9 10:49:42 UTC 2019
Hah ... I see, but my requirement cannot be satisfied with the current design:
What I need to do is raise a signal from job_timeout() to notify a user-space daemon app, which then leverages "UMR" to dump/retrieve as much SW/HW information related to the TMO/hang as possible. To keep it straightforward I would send USR1 to the registered app (the daemon) every time a TMO happens, and that would introduce a lot of unnecessary dumps if this "bug" is not resolved.
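Just to illustrate the daemon side, a rough sketch only (how the daemon registers its pid with the driver is still open, and the actual UMR command line is left out):

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t tmo_pending;

/* SIGUSR1 handler: only set a flag, do the heavy lifting in the main loop */
static void tmo_handler(int sig)
{
	(void)sig;
	tmo_pending = 1;
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = tmo_handler;
	sigaction(SIGUSR1, &sa, NULL);

	/* TODO: register this pid with the driver (mechanism still open) */

	for (;;) {
		pause();	/* wait for the driver to signal a TMO */
		if (tmo_pending) {
			tmo_pending = 0;
			printf("TMO reported, dumping state\n");
			/* run UMR here to collect SW/HW state for the hang */
		}
	}
}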
Keeping on reporting the TMO message for one IB looks like a "bug" to me anyway, even apart from the feature above...
To address your concern, what about this one:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 1397942..31d99e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -28,7 +28,7 @@
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 
-static void amdgpu_job_timedout(struct drm_sched_job *s_job)
+static int amdgpu_job_timedout(struct drm_sched_job *s_job)
 {
 	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
 	struct amdgpu_job *job = to_amdgpu_job(s_job);
@@ -39,7 +39,7 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
 	if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
 		DRM_ERROR("ring %s timeout, but soft recovered\n",
 			  s_job->sched->name);
-		return;
+		return -ENODEV;
 	}
 
 	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
@@ -51,6 +51,8 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
 
 	if (amdgpu_device_should_recover_gpu(ring->adev))
 		amdgpu_device_gpu_recover(ring->adev, job);
+
+	return 0;
 }
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index c1aaf85..b93deb1 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -308,18 +308,23 @@ static void drm_sched_job_timedout(struct work_struct *work)
 {
 	struct drm_gpu_scheduler *sched;
 	struct drm_sched_job *job;
-	unsigned long flags;
+	int ret;
 
 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
 	job = list_first_entry_or_null(&sched->ring_mirror_list,
 				       struct drm_sched_job, node);
 
-	if (job)
-		job->sched->ops->timedout_job(job);
+	if (job) {
+		ret = job->sched->ops->timedout_job(job);
+		if (ret) {
+			unsigned long flags;
 
-	spin_lock_irqsave(&sched->job_list_lock, flags);
-	drm_sched_start_timeout(sched);
-	spin_unlock_irqrestore(&sched->job_list_lock, flags);
+			/* means recovered, restart timer now */
+			spin_lock_irqsave(&sched->job_list_lock, flags);
+			drm_sched_start_timeout(sched);
+			spin_unlock_irqrestore(&sched->job_list_lock, flags);
+		}
+	}
 }
 
 /**
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 9c2a957..c3884c3 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -229,7 +229,7 @@ struct drm_sched_backend_ops {
 	 * @timedout_job: Called when a job has taken too long to execute,
 	 * to trigger GPU recovery.
 	 */
-	void (*timedout_job)(struct drm_sched_job *sched_job);
+	int (*timedout_job)(struct drm_sched_job *sched_job);
 
 	/**
 	 * @free_job: Called once the job's finished fence has been signaled
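To make the convention clear for other drivers, a timedout_job implementation under this scheme would look roughly like this (the foo_* helpers are just placeholders, not real functions):

static int foo_job_timedout(struct drm_sched_job *s_job)
{
	/* job recovered in place: tell the scheduler to re-arm the timeout */
	if (foo_try_soft_recovery(s_job))
		return -ENODEV;

	/* full reset path: drm_sched_start() will re-arm the timeout itself */
	foo_full_gpu_reset(s_job);
	return 0;
}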
Thanks
/Monk
-----Original Message-----
From: Christian König <ckoenig.leichtzumerken at gmail.com>
Sent: Thursday, May 9, 2019 6:30 PM
To: Liu, Monk <Monk.Liu at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>; amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH] drm/sched: fix the duplicated TMO message for one IB
drm_sched_start() is not necessarily called from the timeout handler.
If a soft recovery is sufficient, we just continue without a complete reset.
Christian.
On 09.05.19 at 12:25, Liu, Monk wrote:
> Christian
>
> Check "drm_sched_start", which is invoked from gpu_recover(); there is
> a "drm_sched_start_timeout()" call at its tail
>
> /Monk
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken at gmail.com>
> Sent: Thursday, May 9, 2019 3:18 PM
> To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
> Subject: Re: [PATCH] drm/sched: fix the duplicated TMO message for one
> IB
>
> On 09.05.19 at 06:31, Monk Liu wrote:
>> we don't need one IB's timeout error message reported endlessly;
>> one report per timed-out IB is enough
> Well, NAK. We don't need multiple timeout reports, but we really need to restart the timeout counter after handling it.
>
> Otherwise we will never run into a timeout again after handling one, and it isn't unlikely that multiple IBs in a row don't work correctly.
>
> Christian.
>
>> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
>> ---
>> drivers/gpu/drm/scheduler/sched_main.c | 5 -----
>> 1 file changed, 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>> b/drivers/gpu/drm/scheduler/sched_main.c
>> index c1aaf85..d6c17f1 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -308,7 +308,6 @@ static void drm_sched_job_timedout(struct work_struct *work)
>> {
>> struct drm_gpu_scheduler *sched;
>> struct drm_sched_job *job;
>> - unsigned long flags;
>>
>> sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
>> job = list_first_entry_or_null(&sched->ring_mirror_list,
>> @@ -316,10 +315,6 @@ static void drm_sched_job_timedout(struct
>> work_struct *work)
>>
>> if (job)
>> job->sched->ops->timedout_job(job);
>> -
>> - spin_lock_irqsave(&sched->job_list_lock, flags);
>> - drm_sched_start_timeout(sched);
>> - spin_unlock_irqrestore(&sched->job_list_lock, flags);
>> }
>>
>> /**