[PATCH 12/13] drm/scheduler: rework entity flush, kill and fini
Andrey Grodzovsky
andrey.grodzovsky at amd.com
Thu Sep 29 19:20:17 UTC 2022
On 2022-09-29 09:21, Christian König wrote:
> This was buggy because when we had to wait for entities which were
> killed as well we would just deadlock.
>
> Instead move all the dependency handling into the callbacks so that
> will all happen asynchronously.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/scheduler/sched_entity.c | 197 +++++++++++------------
> 1 file changed, 92 insertions(+), 105 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 1bb1437a8fed..1d448e376811 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -139,6 +139,74 @@ bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
> return true;
> }
>
> +static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
> +{
> + struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
> +
> + drm_sched_fence_scheduled(job->s_fence);
> + drm_sched_fence_finished(job->s_fence);
> + WARN_ON(job->s_fence->parent);
> + job->sched->ops->free_job(job);
> +}
> +
> +/* Signal the scheduler finished fence when the entity in question is killed. */
> +static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
> + struct dma_fence_cb *cb)
> +{
> + struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
> + finish_cb);
> + int r;
> +
> + dma_fence_put(f);
> +
> + /* Wait for all dependencies to avoid data corruptions */
> + while (!xa_empty(&job->dependencies)) {
> + f = xa_erase(&job->dependencies, job->last_dependency++);
> + r = dma_fence_add_callback(f, &job->finish_cb,
> + drm_sched_entity_kill_jobs_cb);
> + if (!r)
> + return;
> +
> + dma_fence_put(f);
> + }
> +
> + init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
> + irq_work_queue(&job->work);
> +}
> +
> +/* Remove the entity from the scheduler and kill all pending jobs */
> +static void drm_sched_entity_kill(struct drm_sched_entity *entity)
> +{
> + struct drm_sched_job *job;
> + struct dma_fence *prev;
> +
> + if (!entity->rq)
> + return;
> +
> + spin_lock(&entity->rq_lock);
> + entity->stopped = true;
> + drm_sched_rq_remove_entity(entity->rq, entity);
> + spin_unlock(&entity->rq_lock);
> +
> + /* Make sure this entity is not used by the scheduler at the moment */
> + wait_for_completion(&entity->entity_idle);
Does it really stop processing in case more jobs are pending in entity
queue already ?
It probably makes sense to integrate drm_sched_entity_is_idle into
drm_sched_entity_is_ready
to prevent rq selecting this entity in such case
> +
> + prev = dma_fence_get(entity->last_scheduled);
> + while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
> + struct drm_sched_fence *s_fence = job->s_fence;
> +
> + dma_fence_set_error(&s_fence->finished, -ESRCH);
> +
> + dma_fence_get(&s_fence->finished);
> + if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
> + drm_sched_entity_kill_jobs_cb))
> + drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
> +
> + prev = &s_fence->finished;
> + }
> + dma_fence_put(prev);
> +}
> +
> /**
> * drm_sched_entity_flush - Flush a context entity
> *
> @@ -179,91 +247,13 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
> /* For killed process disable any more IBs enqueue right now */
> last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
> if ((!last_user || last_user == current->group_leader) &&
> - (current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) {
> - spin_lock(&entity->rq_lock);
> - entity->stopped = true;
> - drm_sched_rq_remove_entity(entity->rq, entity);
> - spin_unlock(&entity->rq_lock);
> - }
> + (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
> + drm_sched_entity_kill(entity);
This reverts 'drm/scheduler: only kill entity if last user is killed v2'
and so might break this use case - when entity
gets through FD into child process. Why this needs to be removed ?
Andrey
>
> return ret;
> }
> EXPORT_SYMBOL(drm_sched_entity_flush);
>
> -static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
> -{
> - struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
> -
> - drm_sched_fence_finished(job->s_fence);
> - WARN_ON(job->s_fence->parent);
> - job->sched->ops->free_job(job);
> -}
> -
> -
> -/* Signal the scheduler finished fence when the entity in question is killed. */
> -static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
> - struct dma_fence_cb *cb)
> -{
> - struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
> - finish_cb);
> -
> - dma_fence_put(f);
> - init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
> - irq_work_queue(&job->work);
> -}
> -
> -static struct dma_fence *
> -drm_sched_job_dependency(struct drm_sched_job *job,
> - struct drm_sched_entity *entity)
> -{
> - if (!xa_empty(&job->dependencies))
> - return xa_erase(&job->dependencies, job->last_dependency++);
> -
> - if (job->sched->ops->dependency)
> - return job->sched->ops->dependency(job, entity);
> -
> - return NULL;
> -}
> -
> -static void drm_sched_entity_kill_jobs(struct drm_sched_entity *entity)
> -{
> - struct drm_sched_job *job;
> - struct dma_fence *f;
> - int r;
> -
> - while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
> - struct drm_sched_fence *s_fence = job->s_fence;
> -
> - /* Wait for all dependencies to avoid data corruptions */
> - while ((f = drm_sched_job_dependency(job, entity))) {
> - dma_fence_wait(f, false);
> - dma_fence_put(f);
> - }
> -
> - drm_sched_fence_scheduled(s_fence);
> - dma_fence_set_error(&s_fence->finished, -ESRCH);
> -
> - /*
> - * When pipe is hanged by older entity, new entity might
> - * not even have chance to submit it's first job to HW
> - * and so entity->last_scheduled will remain NULL
> - */
> - if (!entity->last_scheduled) {
> - drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
> - continue;
> - }
> -
> - dma_fence_get(entity->last_scheduled);
> - r = dma_fence_add_callback(entity->last_scheduled,
> - &job->finish_cb,
> - drm_sched_entity_kill_jobs_cb);
> - if (r == -ENOENT)
> - drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
> - else if (r)
> - DRM_ERROR("fence add callback failed (%d)\n", r);
> - }
> -}
> -
> /**
> * drm_sched_entity_fini - Destroy a context entity
> *
> @@ -277,33 +267,17 @@ static void drm_sched_entity_kill_jobs(struct drm_sched_entity *entity)
> */
> void drm_sched_entity_fini(struct drm_sched_entity *entity)
> {
> - struct drm_gpu_scheduler *sched = NULL;
> -
> - if (entity->rq) {
> - sched = entity->rq->sched;
> - drm_sched_rq_remove_entity(entity->rq, entity);
> - }
> -
> - /* Consumption of existing IBs wasn't completed. Forcefully
> - * remove them here.
> + /*
> + * If consumption of existing IBs wasn't completed. Forcefully remove
> + * them here. Also makes sure that the scheduler won't touch this entity
> + * any more.
> */
> - if (spsc_queue_count(&entity->job_queue)) {
> - if (sched) {
> - /*
> - * Wait for thread to idle to make sure it isn't processing
> - * this entity.
> - */
> - wait_for_completion(&entity->entity_idle);
> -
> - }
> - if (entity->dependency) {
> - dma_fence_remove_callback(entity->dependency,
> - &entity->cb);
> - dma_fence_put(entity->dependency);
> - entity->dependency = NULL;
> - }
> + drm_sched_entity_kill(entity);
>
> - drm_sched_entity_kill_jobs(entity);
> + if (entity->dependency) {
> + dma_fence_remove_callback(entity->dependency, &entity->cb);
> + dma_fence_put(entity->dependency);
> + entity->dependency = NULL;
> }
>
> dma_fence_put(entity->last_scheduled);
> @@ -415,6 +389,19 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
> return false;
> }
>
> +static struct dma_fence *
> +drm_sched_job_dependency(struct drm_sched_job *job,
> + struct drm_sched_entity *entity)
> +{
> + if (!xa_empty(&job->dependencies))
> + return xa_erase(&job->dependencies, job->last_dependency++);
> +
> + if (job->sched->ops->dependency)
> + return job->sched->ops->dependency(job, entity);
> +
> + return NULL;
> +}
> +
> struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
> {
> struct drm_sched_job *sched_job;
More information about the dri-devel
mailing list