[PATCH 12/13] drm/scheduler: rework entity flush, kill and fini
Christian König
christian.koenig at amd.com
Fri Sep 30 11:51:00 UTC 2022
On 29.09.22 at 21:20, Andrey Grodzovsky wrote:
>
> On 2022-09-29 09:21, Christian König wrote:
>> This was buggy because we would simply deadlock when we also had to
>> wait for entities which were killed.
>>
>> Instead move all the dependency handling into the callbacks so that
>> it all happens asynchronously.
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> ---
>> drivers/gpu/drm/scheduler/sched_entity.c | 197 +++++++++++------------
>> 1 file changed, 92 insertions(+), 105 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
>> index 1bb1437a8fed..1d448e376811 100644
>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>> @@ -139,6 +139,74 @@ bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
>> return true;
>> }
>> +static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
>> +{
>> + struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
>> +
>> + drm_sched_fence_scheduled(job->s_fence);
>> + drm_sched_fence_finished(job->s_fence);
>> + WARN_ON(job->s_fence->parent);
>> + job->sched->ops->free_job(job);
>> +}
>> +
>> +/* Signal the scheduler finished fence when the entity in question is killed. */
>> +static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
>> + struct dma_fence_cb *cb)
>> +{
>> + struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
>> + finish_cb);
>> + int r;
>> +
>> + dma_fence_put(f);
>> +
>> + /* Wait for all dependencies to avoid data corruptions */
>> + while (!xa_empty(&job->dependencies)) {
>> + f = xa_erase(&job->dependencies, job->last_dependency++);
>> + r = dma_fence_add_callback(f, &job->finish_cb,
>> + drm_sched_entity_kill_jobs_cb);
>> + if (!r)
>> + return;
>> +
>> + dma_fence_put(f);
>> + }
>> +
>> + init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
>> + irq_work_queue(&job->work);
>> +}
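A side note on the irq_work bounce above (my understanding, not
spelled out in the patch): drm_sched_entity_kill_jobs_cb() runs from
dma_fence signaling context, potentially in IRQ context with a fence
spinlock held, so signaling s_fence->finished directly from there
could nest fence locks. Hence the final signaling and free_job() are
deferred:

    /* defer the final signaling and free_job() out of fence-callback
     * context; irq_work_queue() is safe to call from IRQ context */
    init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
    irq_work_queue(&job->work);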
>> +
>> +/* Remove the entity from the scheduler and kill all pending jobs */
>> +static void drm_sched_entity_kill(struct drm_sched_entity *entity)
>> +{
>> + struct drm_sched_job *job;
>> + struct dma_fence *prev;
>> +
>> + if (!entity->rq)
>> + return;
>> +
>> + spin_lock(&entity->rq_lock);
>> + entity->stopped = true;
>> + drm_sched_rq_remove_entity(entity->rq, entity);
>> + spin_unlock(&entity->rq_lock);
>> +
>> + /* Make sure this entity is not used by the scheduler at the moment */
>> + wait_for_completion(&entity->entity_idle);
>
>
> Does it really stop processing in case more jobs are already pending
> in the entity queue?
> It probably makes sense to integrate drm_sched_entity_is_idle into
> drm_sched_entity_is_ready
> to prevent the rq from selecting this entity in such a case.
We make sure that the entity is not used for scheduling any more by
calling drm_sched_rq_remove_entity() right above this wait.
The wait itself only prevents us from racing with the scheduler thread
submitting one more job from the entity.
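For reference, the other side of this handshake in drm_sched_main()
looks roughly like this (paraphrased and simplified, not part of this
patch). entity_idle is only completed once the thread is done touching
the entity, and since the entity was already removed from the rq it
cannot be selected again afterwards:

    entity = drm_sched_select_entity(sched);
    if (!entity)
            continue;

    sched_job = drm_sched_entity_pop_job(entity);
    if (!sched_job) {
            complete(&entity->entity_idle);
            continue;
    }

    /* ... push the job to the hardware ring ... */
    complete(&entity->entity_idle);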
>
>> +
>> + prev = dma_fence_get(entity->last_scheduled);
>> + while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
>> + struct drm_sched_fence *s_fence = job->s_fence;
>> +
>> + dma_fence_set_error(&s_fence->finished, -ESRCH);
>> +
>> + dma_fence_get(&s_fence->finished);
>> + if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
>> + drm_sched_entity_kill_jobs_cb))
>> + drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>> +
>> + prev = &s_fence->finished;
>> + }
>> + dma_fence_put(prev);
>> +}
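What the loop above builds (my summary): the jobs still sitting in the
queue are daisy-chained through their finished fences, starting from
entity->last_scheduled, so each job is only torn down after its
predecessor, without this function ever blocking:

    /*
     * Assumed example with three queued jobs J1..J3:
     *
     *   last_scheduled --cb--> J1.finished --cb--> J2.finished --cb--> J3.finished
     *
     * Every callback is drm_sched_entity_kill_jobs_cb(), which first
     * drains the job's own dependency xarray asynchronously and then
     * signals the job's fences from irq_work.
     */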
>> +
>> /**
>> * drm_sched_entity_flush - Flush a context entity
>> *
>> @@ -179,91 +247,13 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
>> /* For killed process disable any more IBs enqueue right now */
>> last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
>> if ((!last_user || last_user == current->group_leader) &&
>> - (current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) {
>> - spin_lock(&entity->rq_lock);
>> - entity->stopped = true;
>> - drm_sched_rq_remove_entity(entity->rq, entity);
>> - spin_unlock(&entity->rq_lock);
>> - }
>> + (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
>> + drm_sched_entity_kill(entity);
>
>
> This reverts 'drm/scheduler: only kill entity if last user is killed
> v2' and so might break that use case - when the entity is passed
> through an FD into a child process. Why does this need to be removed?
That patch isn't reverted; I keep the "last_user == current->group_leader"
check in the line above.
Christian.
>
> Andrey
>
>
>> return ret;
>> }
>> EXPORT_SYMBOL(drm_sched_entity_flush);
>> -static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
>> -{
>> - struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
>> -
>> - drm_sched_fence_finished(job->s_fence);
>> - WARN_ON(job->s_fence->parent);
>> - job->sched->ops->free_job(job);
>> -}
>> -
>> -
>> -/* Signal the scheduler finished fence when the entity in question is killed. */
>> -static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
>> - struct dma_fence_cb *cb)
>> -{
>> - struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
>> - finish_cb);
>> -
>> - dma_fence_put(f);
>> - init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
>> - irq_work_queue(&job->work);
>> -}
>> -
>> -static struct dma_fence *
>> -drm_sched_job_dependency(struct drm_sched_job *job,
>> - struct drm_sched_entity *entity)
>> -{
>> - if (!xa_empty(&job->dependencies))
>> - return xa_erase(&job->dependencies, job->last_dependency++);
>> -
>> - if (job->sched->ops->dependency)
>> - return job->sched->ops->dependency(job, entity);
>> -
>> - return NULL;
>> -}
>> -
>> -static void drm_sched_entity_kill_jobs(struct drm_sched_entity *entity)
>> -{
>> - struct drm_sched_job *job;
>> - struct dma_fence *f;
>> - int r;
>> -
>> - while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
>> - struct drm_sched_fence *s_fence = job->s_fence;
>> -
>> - /* Wait for all dependencies to avoid data corruptions */
>> - while ((f = drm_sched_job_dependency(job, entity))) {
>> - dma_fence_wait(f, false);
>> - dma_fence_put(f);
>> - }
>> -
>> - drm_sched_fence_scheduled(s_fence);
>> - dma_fence_set_error(&s_fence->finished, -ESRCH);
>> -
>> - /*
>> - * When pipe is hanged by older entity, new entity might
>> - * not even have chance to submit it's first job to HW
>> - * and so entity->last_scheduled will remain NULL
>> - */
>> - if (!entity->last_scheduled) {
>> - drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>> - continue;
>> - }
>> -
>> - dma_fence_get(entity->last_scheduled);
>> - r = dma_fence_add_callback(entity->last_scheduled,
>> - &job->finish_cb,
>> - drm_sched_entity_kill_jobs_cb);
>> - if (r == -ENOENT)
>> - drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>> - else if (r)
>> - DRM_ERROR("fence add callback failed (%d)\n", r);
>> - }
>> -}
>> -
>> /**
>> * drm_sched_entity_fini - Destroy a context entity
>> *
>> @@ -277,33 +267,17 @@ static void drm_sched_entity_kill_jobs(struct drm_sched_entity *entity)
>> */
>> void drm_sched_entity_fini(struct drm_sched_entity *entity)
>> {
>> - struct drm_gpu_scheduler *sched = NULL;
>> -
>> - if (entity->rq) {
>> - sched = entity->rq->sched;
>> - drm_sched_rq_remove_entity(entity->rq, entity);
>> - }
>> -
>> - /* Consumption of existing IBs wasn't completed. Forcefully
>> - * remove them here.
>> + /*
>> + * If consumption of existing IBs wasn't completed. Forcefully remove
>> + * them here. Also makes sure that the scheduler won't touch this entity
>> + * any more.
>> */
>> - if (spsc_queue_count(&entity->job_queue)) {
>> - if (sched) {
>> - /*
>> - * Wait for thread to idle to make sure it isn't processing
>> - * this entity.
>> - */
>> - wait_for_completion(&entity->entity_idle);
>> -
>> - }
>> - if (entity->dependency) {
>> - dma_fence_remove_callback(entity->dependency,
>> - &entity->cb);
>> - dma_fence_put(entity->dependency);
>> - entity->dependency = NULL;
>> - }
>> + drm_sched_entity_kill(entity);
>> - drm_sched_entity_kill_jobs(entity);
>> + if (entity->dependency) {
>> + dma_fence_remove_callback(entity->dependency, &entity->cb);
>> + dma_fence_put(entity->dependency);
>> + entity->dependency = NULL;
>> }
>> dma_fence_put(entity->last_scheduled);
>> @@ -415,6 +389,19 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
>> return false;
>> }
>> +static struct dma_fence *
>> +drm_sched_job_dependency(struct drm_sched_job *job,
>> + struct drm_sched_entity *entity)
>> +{
>> + if (!xa_empty(&job->dependencies))
>> + return xa_erase(&job->dependencies, job->last_dependency++);
>> +
>> + if (job->sched->ops->dependency)
>> + return job->sched->ops->dependency(job, entity);
>> +
>> + return NULL;
>> +}
>> +
>> struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>> {
>> struct drm_sched_job *sched_job;