[PATCH 12/13] drm/scheduler: rework entity flush, kill and fini
Andrey Grodzovsky
andrey.grodzovsky at amd.com
Fri Sep 30 13:46:02 UTC 2022
On 2022-09-30 07:51, Christian König wrote:
> On 29.09.22 at 21:20, Andrey Grodzovsky wrote:
>>
>> On 2022-09-29 09:21, Christian König wrote:
>>> This was buggy because, when we had to wait for entities which were
>>> killed as well, we would just deadlock.
>>>
>>> Instead move all the dependency handling into the callbacks so that
>>> it will all happen asynchronously.
>>>
>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>> ---
>>> drivers/gpu/drm/scheduler/sched_entity.c | 197 +++++++++++------------
>>> 1 file changed, 92 insertions(+), 105 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
>>> index 1bb1437a8fed..1d448e376811 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>>> @@ -139,6 +139,74 @@ bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
>>> return true;
>>> }
>>> +static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
>>> +{
>>> + struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
>>> +
>>> + drm_sched_fence_scheduled(job->s_fence);
>>> + drm_sched_fence_finished(job->s_fence);
>>> + WARN_ON(job->s_fence->parent);
>>> + job->sched->ops->free_job(job);
>>> +}
>>> +
>>> +/* Signal the scheduler finished fence when the entity in question is killed. */
>>> +static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
>>> + struct dma_fence_cb *cb)
>>> +{
>>> + struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
>>> + finish_cb);
>>> + int r;
>>> +
>>> + dma_fence_put(f);
>>> +
>>> + /* Wait for all dependencies to avoid data corruptions */
>>> + while (!xa_empty(&job->dependencies)) {
>>> + f = xa_erase(&job->dependencies, job->last_dependency++);
>>> + r = dma_fence_add_callback(f, &job->finish_cb,
>>> + drm_sched_entity_kill_jobs_cb);
>>> + if (!r)
>>> + return;
>>> +
>>> + dma_fence_put(f);
>>> + }
>>> +
>>> + init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
>>> + irq_work_queue(&job->work);
>>> +}
>>> +
>>> +/* Remove the entity from the scheduler and kill all pending jobs */
>>> +static void drm_sched_entity_kill(struct drm_sched_entity *entity)
>>> +{
>>> + struct drm_sched_job *job;
>>> + struct dma_fence *prev;
>>> +
>>> + if (!entity->rq)
>>> + return;
>>> +
>>> + spin_lock(&entity->rq_lock);
>>> + entity->stopped = true;
>>> + drm_sched_rq_remove_entity(entity->rq, entity);
>>> + spin_unlock(&entity->rq_lock);
>>> +
>>> + /* Make sure this entity is not used by the scheduler at the moment */
>>> + wait_for_completion(&entity->entity_idle);
>>
>>
>> Does it really stop processing if more jobs are already pending in
>> the entity queue? It probably makes sense to integrate
>> drm_sched_entity_is_idle into drm_sched_entity_is_ready to prevent
>> the rq from selecting this entity in that case.
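
(For the record, what I had in mind is roughly the following - a
hypothetical sketch only, not part of this patch; the first two checks
are from my reading of the current drm_sched_entity_is_ready():)

bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
{
        if (spsc_queue_peek(&entity->job_queue) == NULL)
                return false;

        if (READ_ONCE(entity->dependency))
                return false;

        /* proposed addition: never hand out an entity that is being
         * torn down, even if jobs are still queued on it
         */
        if (entity->stopped)
                return false;

        return true;
}
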
>
> We make sure that the entity is not used for scheduling any more by
> calling drm_sched_rq_remove_entity() right above this wait.
>
> The wait is only there to prevent us from racing with the scheduler
> thread submitting one more job from the entity.
Ok, I missed the entity removal in the code.
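
For context, the handshake this relies on is then roughly the
following, if I read drm_sched_rq_select_entity() and drm_sched_main()
correctly (sketch from my reading of the current code, not from this
patch):

        /* in drm_sched_rq_select_entity(): entity becomes busy on selection */
        if (drm_sched_entity_is_ready(entity)) {
                rq->current_entity = entity;
                reinit_completion(&entity->entity_idle);
                return entity;
        }

        /* in drm_sched_main(): idle is signaled again once the pop is done */
        sched_job = drm_sched_entity_pop_job(entity);
        if (!sched_job) {
                complete(&entity->entity_idle);
                continue;
        }
        /* ... run the job ... */
        complete(&entity->entity_idle);

So once drm_sched_rq_remove_entity() has run, the entity can't be
selected again, and wait_for_completion(&entity->entity_idle) waits for
at most one in-flight pop.
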
>
>>
>>> +
>>> + prev = dma_fence_get(entity->last_scheduled);
>>> + while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
>>> + struct drm_sched_fence *s_fence = job->s_fence;
>>> +
>>> + dma_fence_set_error(&s_fence->finished, -ESRCH);
>>> +
>>> + dma_fence_get(&s_fence->finished);
>>> + if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
>>> + drm_sched_entity_kill_jobs_cb))
>>> + drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>>> +
>>> + prev = &s_fence->finished;
>>> + }
>>> + dma_fence_put(prev);
>>> +}
>>> +
>>> /**
>>> * drm_sched_entity_flush - Flush a context entity
>>> *
>>> @@ -179,91 +247,13 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
>>> /* For killed process disable any more IBs enqueue right now */
>>> last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
>>> if ((!last_user || last_user == current->group_leader) &&
>>> - (current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) {
>>> - spin_lock(&entity->rq_lock);
>>> - entity->stopped = true;
>>> - drm_sched_rq_remove_entity(entity->rq, entity);
>>> - spin_unlock(&entity->rq_lock);
>>> - }
>>> + (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
>>> + drm_sched_entity_kill(entity);
>>
>>
>> This reverts 'drm/scheduler: only kill entity if last user is killed
>> v2' and so might break that use case - when the entity is passed
>> through an FD into a child process. Why does this need to be removed?
>
> This patch isn't reverted. I keep the "last_user ==
> current->group_leader" check in the line above.
>
> Christian.
OK, second time I didn't look carefully... My bad.
Another question - you now call drm_sched_entity_kill both in
drm_sched_entity_flush and in drm_sched_entity_fini, so for
drm_sched_entity_destroy it will be called twice, no? Why do we need
to call drm_sched_entity_kill from flush at all? With your new
asynchronous job-killing scheme we could even leave the flush code
unchanged and just call drm_sched_entity_kill from
drm_sched_entity_fini, and no deadlock would happen. What am I missing
here?
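
For reference, the double call I mean - a sketch of the teardown path
as I read it, not taken from this patch:

void drm_sched_entity_destroy(struct drm_sched_entity *entity)
{
        /* may call drm_sched_entity_kill() when the process was killed */
        drm_sched_entity_flush(entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY);

        /* calls drm_sched_entity_kill() again, unconditionally */
        drm_sched_entity_fini(entity);
}
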
Andrey
>
>>
>> Andrey
>>
>>
>>> return ret;
>>> }
>>> EXPORT_SYMBOL(drm_sched_entity_flush);
>>> -static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
>>> -{
>>> - struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
>>> -
>>> - drm_sched_fence_finished(job->s_fence);
>>> - WARN_ON(job->s_fence->parent);
>>> - job->sched->ops->free_job(job);
>>> -}
>>> -
>>> -
>>> -/* Signal the scheduler finished fence when the entity in question is killed. */
>>> -static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
>>> - struct dma_fence_cb *cb)
>>> -{
>>> - struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
>>> - finish_cb);
>>> -
>>> - dma_fence_put(f);
>>> - init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
>>> - irq_work_queue(&job->work);
>>> -}
>>> -
>>> -static struct dma_fence *
>>> -drm_sched_job_dependency(struct drm_sched_job *job,
>>> - struct drm_sched_entity *entity)
>>> -{
>>> - if (!xa_empty(&job->dependencies))
>>> - return xa_erase(&job->dependencies, job->last_dependency++);
>>> -
>>> - if (job->sched->ops->dependency)
>>> - return job->sched->ops->dependency(job, entity);
>>> -
>>> - return NULL;
>>> -}
>>> -
>>> -static void drm_sched_entity_kill_jobs(struct drm_sched_entity *entity)
>>> -{
>>> - struct drm_sched_job *job;
>>> - struct dma_fence *f;
>>> - int r;
>>> -
>>> - while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
>>> - struct drm_sched_fence *s_fence = job->s_fence;
>>> -
>>> - /* Wait for all dependencies to avoid data corruptions */
>>> - while ((f = drm_sched_job_dependency(job, entity))) {
>>> - dma_fence_wait(f, false);
>>> - dma_fence_put(f);
>>> - }
>>> -
>>> - drm_sched_fence_scheduled(s_fence);
>>> - dma_fence_set_error(&s_fence->finished, -ESRCH);
>>> -
>>> - /*
>>> - * When pipe is hanged by older entity, new entity might
>>> - * not even have chance to submit it's first job to HW
>>> - * and so entity->last_scheduled will remain NULL
>>> - */
>>> - if (!entity->last_scheduled) {
>>> - drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>>> - continue;
>>> - }
>>> -
>>> - dma_fence_get(entity->last_scheduled);
>>> - r = dma_fence_add_callback(entity->last_scheduled,
>>> - &job->finish_cb,
>>> - drm_sched_entity_kill_jobs_cb);
>>> - if (r == -ENOENT)
>>> - drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>>> - else if (r)
>>> - DRM_ERROR("fence add callback failed (%d)\n", r);
>>> - }
>>> -}
>>> -
>>> /**
>>> * drm_sched_entity_fini - Destroy a context entity
>>> *
>>> @@ -277,33 +267,17 @@ static void drm_sched_entity_kill_jobs(struct drm_sched_entity *entity)
>>> */
>>> void drm_sched_entity_fini(struct drm_sched_entity *entity)
>>> {
>>> - struct drm_gpu_scheduler *sched = NULL;
>>> -
>>> - if (entity->rq) {
>>> - sched = entity->rq->sched;
>>> - drm_sched_rq_remove_entity(entity->rq, entity);
>>> - }
>>> -
>>> - /* Consumption of existing IBs wasn't completed. Forcefully
>>> - * remove them here.
>>> + /*
>>> + * If consumption of existing IBs wasn't completed. Forcefully remove
>>> + * them here. Also makes sure that the scheduler won't touch this entity
>>> + * any more.
>>> */
>>> - if (spsc_queue_count(&entity->job_queue)) {
>>> - if (sched) {
>>> - /*
>>> - * Wait for thread to idle to make sure it isn't processing
>>> - * this entity.
>>> - */
>>> - wait_for_completion(&entity->entity_idle);
>>> -
>>> - }
>>> - if (entity->dependency) {
>>> - dma_fence_remove_callback(entity->dependency,
>>> - &entity->cb);
>>> - dma_fence_put(entity->dependency);
>>> - entity->dependency = NULL;
>>> - }
>>> + drm_sched_entity_kill(entity);
>>> - drm_sched_entity_kill_jobs(entity);
>>> + if (entity->dependency) {
>>> + dma_fence_remove_callback(entity->dependency, &entity->cb);
>>> + dma_fence_put(entity->dependency);
>>> + entity->dependency = NULL;
>>> }
>>> dma_fence_put(entity->last_scheduled);
>>> @@ -415,6 +389,19 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
>>> return false;
>>> }
>>> +static struct dma_fence *
>>> +drm_sched_job_dependency(struct drm_sched_job *job,
>>> + struct drm_sched_entity *entity)
>>> +{
>>> + if (!xa_empty(&job->dependencies))
>>> + return xa_erase(&job->dependencies, job->last_dependency++);
>>> +
>>> + if (job->sched->ops->dependency)
>>> + return job->sched->ops->dependency(job, entity);
>>> +
>>> + return NULL;
>>> +}
>>> +
>>> struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>>> {
>>> struct drm_sched_job *sched_job;
>