[PATCH v3 11/13] drm/sched: Waiting for pending jobs to complete in scheduler kill

Danilo Krummrich dakr at redhat.com
Sat Sep 16 17:52:55 UTC 2023


On 9/12/23 16:47, Matthew Brost wrote:
> On Tue, Sep 12, 2023 at 11:57:30AM +0200, Christian König wrote:
>> Am 12.09.23 um 04:16 schrieb Matthew Brost:
>>> Wait for pending jobs to complete before signaling queued jobs. This
>>> ensures the dma-fence signaling order is correct and also ensures the
>>> entity is not running on the hardware after drm_sched_entity_flush or
>>> drm_sched_entity_fini returns.
>>
>> Entities are *not* supposed to outlive the submissions they carry, and we
>> absolutely *can't* wait for submissions to finish while killing the entity.
>>
>> In other words, it is perfectly expected that an entity no longer exists
>> while the submissions it carried are still running on the hardware.
>>
>> I really need to better document how this works and especially why it
>> works like that.
>>
>> This approach has come up like four or five times now, and we have already
>> applied and reverted patches doing this.
>>
>> For now let's take a look at the source code of drm_sched_entity_kill():
>>
>>         /* The entity is guaranteed to not be used by the scheduler */
>>         prev = rcu_dereference_check(entity->last_scheduled, true);
>>         dma_fence_get(prev);
>>
>>         while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
>>                 struct drm_sched_fence *s_fence = job->s_fence;
>>
>>                 dma_fence_get(&s_fence->finished);
>>                 if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
>>                                                     drm_sched_entity_kill_jobs_cb))
>>                         drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>>
>>                 prev = &s_fence->finished;
>>         }
>>         dma_fence_put(prev);
>>
>> This ensures the dma-fence signaling order by delegating the signaling of
>> the scheduler fences to callbacks: each queued job's finished fence is only
>> signaled once the previous job's fence has signaled.
>>
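
For context, the callback each job hangs onto the previous fence
conceptually does something like the simplified sketch below. This is
illustration only; the actual drm_sched_entity_kill_jobs_cb also waits
for the job's remaining dependencies and defers the cleanup to a work
item.

	static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
						  struct dma_fence_cb *cb)
	{
		struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
							 finish_cb);

		/* Signal this job's fences only now that the previous job's
		 * finished fence (@f) has signaled, which preserves the
		 * dma-fence signaling order.
		 */
		dma_fence_set_error(&job->s_fence->finished, -ESRCH);
		dma_fence_signal(&job->s_fence->finished);
		dma_fence_put(&job->s_fence->finished);
		job->sched->ops->free_job(job);
	}
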
> 
> Thanks for the explanation, this code makes more sense now. Agree this
> patch is not correct.
> 
> This patch really is an RFC for something Nouveau needs; I can drop this
> patch in the next rev and let Nouveau carry a slightly different version
> if needed.

Maybe there was a misunderstanding; I do not see any need for this in Nouveau.

Instead, what I think we need is a way to wait for the pending_list to be empty
(meaning all jobs on the pending_list have been freed) before we call
drm_sched_fini().

Currently, if we call drm_sched_fini() there might still be pending jobs on the
pending_list (unless the driver implements something driver-specific).
drm_sched_fini() stops the scheduler work though, hence those pending jobs will
never be freed, leaking their memory.

This might also be true for existing drivers, but since they call drm_sched_fini()
from their driver remove callback, this race is extremely unlikely. However, it
definitely is an issue for drivers using the single entity policy, which call
drm_sched_fini() from a context where it is much more likely that pending jobs
still exist.
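
What I have in mind is roughly the sketch below. Purely illustrative:
drm_sched_flush_pending() and the pending_list_waitq field do not exist
today, and the free_job path would additionally need to wake the
waitqueue after removing a job from the pending_list.

	/* Hypothetical helper (not an existing API): block until the
	 * pending_list is empty, i.e. until all pending jobs have been
	 * freed, so that drm_sched_fini() can be called without leaking
	 * them. Assumes the free_job path does
	 * wake_up_all(&sched->pending_list_waitq) after a job has been
	 * removed from the pending_list.
	 */
	static void drm_sched_flush_pending(struct drm_gpu_scheduler *sched)
	{
		wait_event(sched->pending_list_waitq,
			   list_empty_careful(&sched->pending_list));
	}

A driver could then call drm_sched_flush_pending() right before
drm_sched_fini().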

> 
> Matt
> 
>> Regards,
>> Christian.
>>
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
>>>    drivers/gpu/drm/scheduler/sched_entity.c    |  7 ++-
>>>    drivers/gpu/drm/scheduler/sched_main.c      | 50 ++++++++++++++++++---
>>>    include/drm/gpu_scheduler.h                 | 18 ++++++++
>>>    4 files changed, 70 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index fb5dad687168..7835c0da65c5 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1873,7 +1873,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
>>>    	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
>>>    		if (dma_fence_is_signaled(&s_job->s_fence->finished)) {
>>>    			/* remove job from ring_mirror_list */
>>> -			list_del_init(&s_job->list);
>>> +			drm_sched_remove_pending_job(s_job);
>>>    			sched->ops->free_job(s_job);
>>>    			continue;
>>>    		}
>>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
>>> index 1dec97caaba3..37557fbb96d0 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>>> @@ -104,9 +104,11 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>>>    	}
>>>    	init_completion(&entity->entity_idle);
>>> +	init_completion(&entity->jobs_done);
>>> -	/* We start in an idle state. */
>>> +	/* We start in an idle and jobs done state. */
>>>    	complete_all(&entity->entity_idle);
>>> +	complete_all(&entity->jobs_done);
>>>    	spin_lock_init(&entity->rq_lock);
>>>    	spsc_queue_init(&entity->job_queue);
>>> @@ -256,6 +258,9 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity)
>>>    	/* Make sure this entity is not used by the scheduler at the moment */
>>>    	wait_for_completion(&entity->entity_idle);
>>> +	/* Make sure all pending jobs are done */
>>> +	wait_for_completion(&entity->jobs_done);
>>> +
>>>    	/* The entity is guaranteed to not be used by the scheduler */
>>>    	prev = rcu_dereference_check(entity->last_scheduled, true);
>>>    	dma_fence_get(prev);
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>> index 689fb6686e01..ed6f5680793a 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -510,12 +510,52 @@ void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
>>>    }
>>>    EXPORT_SYMBOL(drm_sched_resume_timeout);
>>> +/**
>>> + * drm_sched_add_pending_job - Add pending job to scheduler
>>> + *
>>> + * @job: scheduler job to add
>>> + * @tail: add to tail of pending list
>>> + */
>>> +void drm_sched_add_pending_job(struct drm_sched_job *job, bool tail)
>>> +{
>>> +	struct drm_gpu_scheduler *sched = job->sched;
>>> +	struct drm_sched_entity *entity = job->entity;
>>> +
>>> +	lockdep_assert_held(&sched->job_list_lock);
>>> +
>>> +	if (tail)
>>> +		list_add_tail(&job->list, &sched->pending_list);
>>> +	else
>>> +		list_add(&job->list, &sched->pending_list);
>>> +	if (!entity->pending_job_count++)
>>> +		reinit_completion(&entity->jobs_done);
>>> +}
>>> +EXPORT_SYMBOL(drm_sched_add_pending_job);
>>> +
>>> +/**
>>> + * drm_sched_remove_pending_job - Remove pending job from scheduler
>>> + *
>>> + * @job: scheduler job to remove
>>> + */
>>> +void drm_sched_remove_pending_job(struct drm_sched_job *job)
>>> +{
>>> +	struct drm_gpu_scheduler *sched = job->sched;
>>> +	struct drm_sched_entity *entity = job->entity;
>>> +
>>> +	lockdep_assert_held(&sched->job_list_lock);
>>> +
>>> +	list_del_init(&job->list);
>>> +	if (!--entity->pending_job_count)
>>> +		complete_all(&entity->jobs_done);
>>> +}
>>> +EXPORT_SYMBOL(drm_sched_remove_pending_job);
>>> +
>>>    static void drm_sched_job_begin(struct drm_sched_job *s_job)
>>>    {
>>>    	struct drm_gpu_scheduler *sched = s_job->sched;
>>>    	spin_lock(&sched->job_list_lock);
>>> -	list_add_tail(&s_job->list, &sched->pending_list);
>>> +	drm_sched_add_pending_job(s_job, true);
>>>    	spin_unlock(&sched->job_list_lock);
>>>    }
>>> @@ -538,7 +578,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
>>>    		 * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
>>>    		 * is parked at which point it's safe.
>>>    		 */
>>> -		list_del_init(&job->list);
>>> +		drm_sched_remove_pending_job(job);
>>>    		spin_unlock(&sched->job_list_lock);
>>>    		status = job->sched->ops->timedout_job(job);
>>> @@ -589,7 +629,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>    		 * Add at the head of the queue to reflect it was the earliest
>>>    		 * job extracted.
>>>    		 */
>>> -		list_add(&bad->list, &sched->pending_list);
>>> +		drm_sched_add_pending_job(bad, false);
>>>    	/*
>>>    	 * Iterate the job list from later to  earlier one and either deactive
>>> @@ -611,7 +651,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>>    			 * Locking here is for concurrent resume timeout
>>>    			 */
>>>    			spin_lock(&sched->job_list_lock);
>>> -			list_del_init(&s_job->list);
>>> +			drm_sched_remove_pending_job(s_job);
>>>    			spin_unlock(&sched->job_list_lock);
>>>    			/*
>>> @@ -1066,7 +1106,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>>>    	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>>>    		/* remove job from pending_list */
>>> -		list_del_init(&job->list);
>>> +		drm_sched_remove_pending_job(job);
>>>    		/* cancel this job's TO timer */
>>>    		cancel_delayed_work(&sched->work_tdr);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index b7b818cd81b6..7c628f36fe78 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -233,6 +233,21 @@ struct drm_sched_entity {
>>>    	 */
>>>    	struct completion		entity_idle;
>>> +	/**
>>> +	 * @pending_job_count:
>>> +	 *
>>> +	 * Number of pending jobs.
>>> +	 */
>>> +	unsigned int                    pending_job_count;
>>> +
>>> +	/**
>>> +	 * @jobs_done:
>>> +	 *
>>> +	 * Signals when entity has no pending jobs, used to sequence entity
>>> +	 * cleanup in drm_sched_entity_fini().
>>> +	 */
>>> +	struct completion		jobs_done;
>>> +
>>>    	/**
>>>    	 * @oldest_job_waiting:
>>>    	 *
>>> @@ -656,4 +671,7 @@ struct drm_gpu_scheduler *
>>>    drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>>    		     unsigned int num_sched_list);
>>> +void drm_sched_add_pending_job(struct drm_sched_job *job, bool tail);
>>> +void drm_sched_remove_pending_job(struct drm_sched_job *job);
>>> +
>>>    #endif
>>


