[PATCH] Revert "drm/scheduler: improve job distribution with multiple queues"
Christian König
ckoenig.leichtzumerken at gmail.com
Fri Oct 9 07:47:25 UTC 2020
The patch itself is correct, but it was reported numerous times that
this surfaces problems elsewhere.
So just reverting it is probably not the right approach.
Christian.
Am 09.10.20 um 08:16 schrieb Changfeng:
> From: changzhu <Changfeng.Zhu at amd.com>
>
> From: Changfeng <Changfeng.Zhu at amd.com>
>
> It needs to revert this patch to avoid amdgpu_test compute hang problem
> on picasso/raven1
>
> Change-Id: I5c298bb0c6cd64c67de712db551d15974c41493e
> Signed-off-by: Changfeng <Changfeng.Zhu at amd.com>
> ---
> drivers/gpu/drm/scheduler/sched_entity.c | 2 +-
> drivers/gpu/drm/scheduler/sched_main.c | 14 ++++++--------
> include/drm/gpu_scheduler.h | 6 +++---
> 3 files changed, 10 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 146380118962..c803e14eed91 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -486,7 +486,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
> bool first;
>
> trace_drm_sched_job(sched_job, entity);
> - atomic_inc(&entity->rq->sched->score);
> + atomic_inc(&entity->rq->sched->num_jobs);
> WRITE_ONCE(entity->last_user, current->group_leader);
> first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 9a0d77a68018..851443a19ee0 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -92,7 +92,6 @@ void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
> if (!list_empty(&entity->list))
> return;
> spin_lock(&rq->lock);
> - atomic_inc(&rq->sched->score);
> list_add_tail(&entity->list, &rq->entities);
> spin_unlock(&rq->lock);
> }
> @@ -111,7 +110,6 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
> if (list_empty(&entity->list))
> return;
> spin_lock(&rq->lock);
> - atomic_dec(&rq->sched->score);
> list_del_init(&entity->list);
> if (rq->current_entity == entity)
> rq->current_entity = NULL;
> @@ -649,7 +647,7 @@ static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb)
> struct drm_gpu_scheduler *sched = s_fence->sched;
>
> atomic_dec(&sched->hw_rq_count);
> - atomic_dec(&sched->score);
> + atomic_dec(&sched->num_jobs);
>
> trace_drm_sched_process_job(s_fence);
>
> @@ -714,7 +712,7 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> {
> struct drm_gpu_scheduler *sched, *picked_sched = NULL;
> int i;
> - unsigned int min_score = UINT_MAX, num_score;
> + unsigned int min_jobs = UINT_MAX, num_jobs;
>
> for (i = 0; i < num_sched_list; ++i) {
> sched = sched_list[i];
> @@ -725,9 +723,9 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
> continue;
> }
>
> - num_score = atomic_read(&sched->score);
> - if (num_score < min_score) {
> - min_score = num_score;
> + num_jobs = atomic_read(&sched->num_jobs);
> + if (num_jobs < min_jobs) {
> + min_jobs = num_jobs;
> picked_sched = sched;
> }
> }
> @@ -861,7 +859,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> spin_lock_init(&sched->job_list_lock);
> atomic_set(&sched->hw_rq_count, 0);
> INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> - atomic_set(&sched->score, 0);
> + atomic_set(&sched->num_jobs, 0);
> atomic64_set(&sched->job_id_count, 0);
>
> /* Each scheduler will run on a seperate kernel thread */
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 92436553fd6a..a33590e62108 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -264,7 +264,7 @@ struct drm_sched_backend_ops {
> * @job_list_lock: lock to protect the ring_mirror_list.
> * @hang_limit: once the hangs by a job crosses this limit then it is marked
> * guilty and it will be considered for scheduling further.
> - * @score: score to help loadbalancer pick a idle sched
> + * @num_jobs: the number of jobs in queue in the scheduler
> * @ready: marks if the underlying HW is ready to work
> * @free_guilty: A hit to time out handler to free the guilty job.
> *
> @@ -285,8 +285,8 @@ struct drm_gpu_scheduler {
> struct list_head ring_mirror_list;
> spinlock_t job_list_lock;
> int hang_limit;
> - atomic_t score;
> - bool ready;
> + atomic_t num_jobs;
> + bool ready;
> bool free_guilty;
> };
>
More information about the amd-gfx mailing list