[PATCH 1/1] drm/scheduler: Job timeout handler returns status (v2)

Lucas Stach l.stach at pengutronix.de
Thu Dec 10 09:31:19 UTC 2020


Hi Luben,

Am Mittwoch, den 09.12.2020, 21:14 -0500 schrieb Luben Tuikov:
> This patch does not change current behaviour.
> 
> The driver's job timeout handler now returns
> status indicating back to the DRM layer whether
> the task (job) was successfully aborted or whether
> more time should be given to the task to complete.
> 
> Default behaviour as of this patch, is preserved,
> except in obvious-by-comment case in the Panfrost
> driver, as documented below.
> 
> All drivers which make use of the
> drm_sched_backend_ops' .timedout_job() callback
> have been accordingly renamed and return the
> would've-been default value of
> DRM_TASK_STATUS_ALIVE to restart the task's
> timeout timer--this is the old behaviour, and
> is preserved by this patch.
> 
> In the case of the Panfrost driver, its timedout
> callback correctly first checks if the job had
> completed in due time and if so, it now returns
> DRM_TASK_STATUS_COMPLETE to notify the DRM layer
> that the task can be moved to the done list, to be
> freed later. In the other two subsequent checks,
> the value of DRM_TASK_STATUS_ALIVE is returned, as
> per the default behaviour.
> 
> A more involved driver's solutions can be had
> in subsequent patches.
> 
> v2: Use enum as the status of a driver's job
>     timeout callback method.
> 
> Cc: Alexander Deucher <Alexander.Deucher at amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky at amd.com>
> Cc: Christian König <christian.koenig at amd.com>
> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
> Cc: Lucas Stach <l.stach at pengutronix.de>
> Cc: Russell King <linux+etnaviv at armlinux.org.uk>
> Cc: Christian Gmeiner <christian.gmeiner at gmail.com>
> Cc: Qiang Yu <yuq825 at gmail.com>
> Cc: Rob Herring <robh at kernel.org>
> Cc: Tomeu Vizoso <tomeu.vizoso at collabora.com>
> Cc: Steven Price <steven.price at arm.com>
> Cc: Alyssa Rosenzweig <alyssa.rosenzweig at collabora.com>
> Cc: Eric Anholt <eric at anholt.net>
> Reported-by: kernel test robot <lkp at intel.com>
> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
>  drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++++++-
>  drivers/gpu/drm/lima/lima_sched.c       |  4 +++-
>  drivers/gpu/drm/panfrost/panfrost_job.c |  9 ++++---
>  drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
>  drivers/gpu/drm/v3d/v3d_sched.c         | 32 +++++++++++++------------
>  include/drm/gpu_scheduler.h             | 20 +++++++++++++---
>  7 files changed, 57 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index ff48101bab55..a111326cbdde 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -28,7 +28,7 @@
>  #include "amdgpu.h"
>  #include "amdgpu_trace.h"
>  
> 
> 
> 
> -static void amdgpu_job_timedout(struct drm_sched_job *s_job)
> +static enum drm_task_status amdgpu_job_timedout(struct drm_sched_job *s_job)
>  {
>  	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>  	struct amdgpu_job *job = to_amdgpu_job(s_job);
> @@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>  	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
>  		DRM_ERROR("ring %s timeout, but soft recovered\n",
>  			  s_job->sched->name);
> -		return;
> +		return DRM_TASK_STATUS_ALIVE;
>  	}
>  
> 
> 
> 
>  	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
> @@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>  
> 
> 
> 
>  	if (amdgpu_device_should_recover_gpu(ring->adev)) {
>  		amdgpu_device_gpu_recover(ring->adev, job);
> +		return DRM_TASK_STATUS_ALIVE;
>  	} else {
>  		drm_sched_suspend_timeout(&ring->sched);
>  		if (amdgpu_sriov_vf(adev))
>  			adev->virt.tdr_debug = true;
> +		return DRM_TASK_STATUS_ALIVE;
>  	}
>  }
>  
> 
> 
> 
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> index cd46c882269c..c49516942328 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> @@ -82,7 +82,8 @@ static struct dma_fence *etnaviv_sched_run_job(struct drm_sched_job *sched_job)
>  	return fence;
>  }
>  
> 
> 
> 
> -static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
> +static enum drm_task_status etnaviv_sched_timedout_job(struct drm_sched_job
> +						       *sched_job)
>  {
>  	struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
>  	struct etnaviv_gpu *gpu = submit->gpu;
> @@ -120,9 +121,16 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
>  
>  	drm_sched_resubmit_jobs(&gpu->sched);
>  
> +	/* Tell the DRM scheduler that this task needs
> +	 * more time.
> +	 */

This comment doesn't match the kernel coding style, but it's also moot
as the whole added code block isn't needed. The code just below is
identical, so letting execution continue here instead of returning
would be the right thing to do — or did you mean to return
DRM_TASK_STATUS_COMPLETE here? It's a bit confusing that "job aborted"
and "job successfully finished" are signaled with the same return code.

> +	drm_sched_start(&gpu->sched, true);
> +	return DRM_TASK_STATUS_ALIVE;
> +
>  out_no_timeout:
>  	/* restart scheduler after GPU is usable again */
>  	drm_sched_start(&gpu->sched, true);
> +	return DRM_TASK_STATUS_ALIVE;
>  }

Regards,
Lucas



More information about the dri-devel mailing list