[PATCH v3 11/13] drm/sched: Waiting for pending jobs to complete in scheduler kill
Danilo Krummrich
dakr at redhat.com
Sat Sep 16 17:52:55 UTC 2023
On 9/12/23 16:47, Matthew Brost wrote:
> On Tue, Sep 12, 2023 at 11:57:30AM +0200, Christian König wrote:
>> Am 12.09.23 um 04:16 schrieb Matthew Brost:
>>> Wait for pending jobs to be complete before signaling queued jobs. This
ensures dma-fence signaling order is correct and also ensures the entity is
>>> not running on the hardware after drm_sched_entity_flush or
>>> drm_sched_entity_fini returns.
>>
>> Entities are *not* supposed to outlive the submissions they carry and we
>> absolutely *can't* wait for submissions to finish while killing the entity.
>>
>> In other words it is perfectly expected that entities don't exist any
>> more while the submissions they carried are still running on the hardware.
>>
>> I somehow better need to document how this is working and especially why it is
>> working like that.
>>
>> This approach came up like four or five times now and we already applied and
>> reverted patches doing this.
>>
>> For now let's take a look at the source code of drm_sched_entity_kill():
>>
>> /* The entity is guaranteed to not be used by the scheduler */
>> prev = rcu_dereference_check(entity->last_scheduled, true);
>> dma_fence_get(prev);
>>
>> while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue))))
>> {
>> struct drm_sched_fence *s_fence = job->s_fence;
>>
>> dma_fence_get(&s_fence->finished);
>> if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
>> drm_sched_entity_kill_jobs_cb))
>> drm_sched_entity_kill_jobs_cb(NULL,
>> &job->finish_cb);
>>
>> prev = &s_fence->finished;
>> }
>> dma_fence_put(prev);
>>
>> This ensures the dma-fence signaling order by delegating signaling of the
>> scheduler fences into callbacks.
>>
>
> Thanks for the explanation, this code makes more sense now. Agree this
> patch is not correct.
>
> This patch really is an RFC for something Nouveau needs, I can delete
> this patch in the next rev and let Nouveau run with a slightly different
> version if needed.
Maybe there was a misunderstanding, I do not see any need for this in Nouveau.
Instead, what I think we need is a way to wait for the pending_list being empty
(meaning all jobs on the pending_list are freed) before we call drm_sched_fini().
Currently, if we call drm_sched_fini() there might still be pending jobs on the
pending_list (unless the driver implements something driver specific).
drm_sched_fini() stops the scheduler work though, hence pending jobs will never be
freed up leaking their memory.
This might also be true for existing drivers, but since they call drm_sched_fini()
from their driver remove callback, this race is extremely unlikely. However, it
definitely is an issue for drivers using the single entity policy calling
drm_sched_fini() from a context where it is much more likely pending jobs still
exist.
>
> Matt
>
>> Regards,
>> Christian.
>>
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +-
>>> drivers/gpu/drm/scheduler/sched_entity.c | 7 ++-
>>> drivers/gpu/drm/scheduler/sched_main.c | 50 ++++++++++++++++++---
>>> include/drm/gpu_scheduler.h | 18 ++++++++
>>> 4 files changed, 70 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index fb5dad687168..7835c0da65c5 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1873,7 +1873,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
>>> list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
>>> if (dma_fence_is_signaled(&s_job->s_fence->finished)) {
>>> /* remove job from ring_mirror_list */
>>> - list_del_init(&s_job->list);
>>> + drm_sched_remove_pending_job(s_job);
>>> sched->ops->free_job(s_job);
>>> continue;
>>> }
>>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
>>> index 1dec97caaba3..37557fbb96d0 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>>> @@ -104,9 +104,11 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>>> }
>>> init_completion(&entity->entity_idle);
>>> + init_completion(&entity->jobs_done);
>>> - /* We start in an idle state. */
>>> + /* We start in an idle and jobs done state. */
>>> complete_all(&entity->entity_idle);
>>> + complete_all(&entity->jobs_done);
>>> spin_lock_init(&entity->rq_lock);
>>> spsc_queue_init(&entity->job_queue);
>>> @@ -256,6 +258,9 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity)
>>> /* Make sure this entity is not used by the scheduler at the moment */
>>> wait_for_completion(&entity->entity_idle);
>>> + /* Make sure all pending jobs are done */
>>> + wait_for_completion(&entity->jobs_done);
>>> +
>>> /* The entity is guaranteed to not be used by the scheduler */
>>> prev = rcu_dereference_check(entity->last_scheduled, true);
>>> dma_fence_get(prev);
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>> index 689fb6686e01..ed6f5680793a 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -510,12 +510,52 @@ void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
>>> }
>>> EXPORT_SYMBOL(drm_sched_resume_timeout);
>>> +/**
>>> + * drm_sched_add_pending_job - Add pending job to scheduler
>>> + *
>>> + * @job: scheduler job to add
>>> + * @tail: add to tail of pending list
>>> + */
>>> +void drm_sched_add_pending_job(struct drm_sched_job *job, bool tail)
>>> +{
>>> + struct drm_gpu_scheduler *sched = job->sched;
>>> + struct drm_sched_entity *entity = job->entity;
>>> +
>>> + lockdep_assert_held(&sched->job_list_lock);
>>> +
>>> + if (tail)
>>> + list_add_tail(&job->list, &sched->pending_list);
>>> + else
>>> + list_add(&job->list, &sched->pending_list);
>>> + if (!entity->pending_job_count++)
>>> + reinit_completion(&entity->jobs_done);
>>> +}
>>> +EXPORT_SYMBOL(drm_sched_add_pending_job);
>>> +
>>> +/**
>>> + * drm_sched_remove_pending_job - Remove pending job from scheduler
>>> + *
>>> + * @job: scheduler job to remove
>>> + */
>>> +void drm_sched_remove_pending_job(struct drm_sched_job *job)
>>> +{
>>> + struct drm_gpu_scheduler *sched = job->sched;
>>> + struct drm_sched_entity *entity = job->entity;
>>> +
>>> + lockdep_assert_held(&sched->job_list_lock);
>>> +
>>> + list_del_init(&job->list);
>>> + if (!--entity->pending_job_count)
>>> + complete_all(&entity->jobs_done);
>>> +}
>>> +EXPORT_SYMBOL(drm_sched_remove_pending_job);
>>> +
>>> static void drm_sched_job_begin(struct drm_sched_job *s_job)
>>> {
>>> struct drm_gpu_scheduler *sched = s_job->sched;
>>> spin_lock(&sched->job_list_lock);
>>> - list_add_tail(&s_job->list, &sched->pending_list);
>>> + drm_sched_add_pending_job(s_job, true);
>>> spin_unlock(&sched->job_list_lock);
>>> }
>>> @@ -538,7 +578,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
>>> * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
>>> * is parked at which point it's safe.
>>> */
>>> - list_del_init(&job->list);
>>> + drm_sched_remove_pending_job(job);
>>> spin_unlock(&sched->job_list_lock);
>>> status = job->sched->ops->timedout_job(job);
>>> @@ -589,7 +629,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>> * Add at the head of the queue to reflect it was the earliest
>>> * job extracted.
>>> */
>>> - list_add(&bad->list, &sched->pending_list);
>>> + drm_sched_add_pending_job(bad, false);
>>> /*
>>> * Iterate the job list from later to earlier one and either deactive
>>> @@ -611,7 +651,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>>> * Locking here is for concurrent resume timeout
>>> */
>>> spin_lock(&sched->job_list_lock);
>>> - list_del_init(&s_job->list);
>>> + drm_sched_remove_pending_job(s_job);
>>> spin_unlock(&sched->job_list_lock);
>>> /*
>>> @@ -1066,7 +1106,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>>> if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>>> /* remove job from pending_list */
>>> - list_del_init(&job->list);
>>> + drm_sched_remove_pending_job(job);
>>> /* cancel this job's TO timer */
>>> cancel_delayed_work(&sched->work_tdr);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index b7b818cd81b6..7c628f36fe78 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -233,6 +233,21 @@ struct drm_sched_entity {
>>> */
>>> struct completion entity_idle;
>>> + /**
>>> + * @pending_job_count:
>>> + *
>>> + * Number of pending jobs.
>>> + */
>>> + unsigned int pending_job_count;
>>> +
>>> + /**
>>> + * @jobs_done:
>>> + *
>>> + * Signals when entity has no pending jobs, used to sequence entity
>>> + * cleanup in drm_sched_entity_fini().
>>> + */
>>> + struct completion jobs_done;
>>> +
>>> /**
>>> * @oldest_job_waiting:
>>> *
>>> @@ -656,4 +671,7 @@ struct drm_gpu_scheduler *
>>> drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
>>> unsigned int num_sched_list);
>>> +void drm_sched_add_pending_job(struct drm_sched_job *job, bool tail);
>>> +void drm_sched_remove_pending_job(struct drm_sched_job *job);
>>> +
>>> #endif
>>
More information about the dri-devel
mailing list