[PATCH 3/6] drm/scheduler: Job timeout handler returns status

Thu Nov 26 15:06:46 UTC 2020

On 11/24/20 10:17 PM, Luben Tuikov wrote:
> The job timeout handler now returns status
> indicating back to the DRM layer whether the job
> was successfully cancelled or whether more time
> should be given to the job to complete.
>
> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 ++++--
>   include/drm/gpu_scheduler.h             | 13 ++++++++++---
>   2 files changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index ff48101bab55..81b73790ecc6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -28,7 +28,7 @@
>   #include "amdgpu.h"
>   #include "amdgpu_trace.h"
>   
> -static void amdgpu_job_timedout(struct drm_sched_job *s_job)
> +static int amdgpu_job_timedout(struct drm_sched_job *s_job)
>   {
>   	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>   	struct amdgpu_job *job = to_amdgpu_job(s_job);
> @@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
>   		DRM_ERROR("ring %s timeout, but soft recovered\n",
>   			  s_job->sched->name);
> -		return;
> +		return 0;
>   	}
>   
>   	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
> @@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>   
>   	if (amdgpu_device_should_recover_gpu(ring->adev)) {
>   		amdgpu_device_gpu_recover(ring->adev, job);
> +		return 0;

For amdgpu specifically - not that amdgpu_device_gpu_recover returns a value 
which is 0 for successful GPU reset
meaning we reset the GPU and resubmitted to HW the job that triggered the 
timeout to HW (guilty).
It means the job is still should be considered part of pending list and so a non 
zero value
should be returned. I think only if we reset the GPU and don't submit back the 
guilty job then
it can be considered 'aborted' - but I don't think we even do this.

Andrey

>   	} else {
>   		drm_sched_suspend_timeout(&ring->sched);
>   		if (amdgpu_sriov_vf(adev))
>   			adev->virt.tdr_debug = true;
> +		return 1;
>   	}
>   }
>   
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 2e0c368e19f6..61f7121e1c19 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -230,10 +230,17 @@ struct drm_sched_backend_ops {
>   	struct dma_fence *(*run_job)(struct drm_sched_job *sched_job);
>   
>   	/**
> -         * @timedout_job: Called when a job has taken too long to execute,
> -         * to trigger GPU recovery.
> +	 * @timedout_job: Called when a job has taken too long to execute,
> +	 * to trigger GPU recovery.
> +	 *
> +	 * Return 0, if the job has been aborted successfully and will
> +	 * never be heard of from the device. Return non-zero if the
> +	 * job wasn't able to be aborted, i.e. if more time should be
> +	 * given to this job. The result is not "bool" as this
> +	 * function is not a predicate, although its result may seem
> +	 * as one.
>   	 */
> -	void (*timedout_job)(struct drm_sched_job *sched_job);
> +	int (*timedout_job)(struct drm_sched_job *sched_job);
>   
>   	/**
>            * @free_job: Called once the job's finished fence has been signaled