[PATCH 2/2] drm/scheduler: stop setting rq to NULL

Andrey Grodzovsky Andrey.Grodzovsky at amd.com
Thu Aug 2 14:38:50 UTC 2018



On 08/02/2018 02:43 AM, Nayan Deshmukh wrote:
> Hi Andrey,
>
> Good catch. I guess since both Christian and I were working on it 
> in parallel we didn't catch this problem.
>
> If it is only causing a problem in push_job then we can handle it with 
> something like this:
>
> ----------------------------------------------------------------------------------------------------------------------------
> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> index 05dc6ecd4003..f344ce32f128 100644
> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> @@ -563,6 +563,11 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
>         first = spsc_queue_count(&entity->job_queue) == 0;
>         reschedule = idle && first && (entity->num_rq_list > 1);
>
> +       if (first && list_empty(&entity->list)) {

first might be false if the other process was interrupted by SIGKILL in 
the middle of wait_event_killable in drm_sched_entity_flush while there 
were still items in the queue.
I don't see a good way besides adding a 'stopped' flag to drm_sched_entity.
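
Roughly what I have in mind (just a sketch of the idea, untested -- the 
field name, placement and locking details are assumptions, not a final 
patch):

----------------------------------------------------------------------------------------------------------------------------
/* New field in drm_sched_entity, protected by rq_lock: */
struct drm_sched_entity {
	/* ... existing members ... */
	bool			stopped;
};

/* In drm_sched_entity_flush(), on SIGKILL, instead of setting rq to
 * NULL: mark the entity stopped and take it off its run queue. */
	spin_lock(&entity->rq_lock);
	entity->stopped = true;
	drm_sched_rq_remove_entity(entity->rq, entity);
	spin_unlock(&entity->rq_lock);

/* In drm_sched_entity_push_job(), bail out before (re-)adding the
 * entity to any run queue: */
	spin_lock(&entity->rq_lock);
	if (entity->stopped) {
		DRM_ERROR("Trying to push to a killed entity\n");
		spin_unlock(&entity->rq_lock);
		return;
	}
	drm_sched_rq_add_entity(entity->rq, entity);
	spin_unlock(&entity->rq_lock);
----------------------------------------------------------------------------------------------------------------------------

Since the flag is never cleared, a concurrent push_job can't re-acquire a 
run queue through drm_sched_entity_get_free_sched after flush removed the 
entity, which is exactly the hole the list_empty() check leaves open.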

Andrey

> +               DRM_ERROR("Trying to push to a killed entity\n");
> +               return;
> +       }
> +
>         if (reschedule) {
>                 rq = drm_sched_entity_get_free_sched(entity);
>                 spin_lock(&entity->rq_lock);
> @@ -580,11 +585,6 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
>         if (first) {
>                 /* Add the entity to the run queue */
>                 spin_lock(&entity->rq_lock);
> -               if (!entity->rq) {
> -                       DRM_ERROR("Trying to push to a killed entity\n");
> -                       spin_unlock(&entity->rq_lock);
> -                       return;
> -               }
>                 drm_sched_rq_add_entity(entity->rq, entity);
>                 spin_unlock(&entity->rq_lock);
>                 drm_sched_wakeup(entity->rq->sched);
> -----------------------------------------------------------------------------------------------------------------------------
>
>
> On Thu, Aug 2, 2018 at 3:55 AM Andrey Grodzovsky 
> <Andrey.Grodzovsky at amd.com> wrote:
>
>     Thinking again about this change and about 53d5f1e ("drm/scheduler:
>     move idle entities to scheduler with less load v2"), it looks to me
>     like the use case for which fc9a539 ("drm/scheduler: add NULL pointer
>     check for run queue (v2)") was done will not work anymore.
>
>     First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will
>     never be true any more since we stopped setting entity->rq to NULL in
>     drm_sched_entity_flush.
>
>     Second, even after we remove the entity from the rq in
>     drm_sched_entity_flush to terminate any subsequent submissions to the
>     queue, the other thread doing push_job can just acquire a run queue
>     again from drm_sched_entity_get_free_sched and continue submission, so
>     you can't substitute 'if (!entity->rq)' with
>     'if (list_empty(&entity->list))'.
>
>     What about adding a 'stopped' flag to drm_sched_entity to be set in
>     drm_sched_entity_flush and in
>
> But it might be worth adding a stopped flag field if it is required in 
> other places as well.
>
> Thanks,
> Nayan
>
>     drm_sched_entity_push_job check for 'if (entity->stopped)' instead
>     of 'if (!entity->rq)'?
>
>     Andrey
>
>
>     On 07/30/2018 07:03 AM, Christian König wrote:
>     > We removed the redundancy of having an extra scheduler field, so we
>     > can't set the rq to NULL any more; otherwise we won't know which
>     > scheduler to use for the cleanup.
>     >
>     > Just remove the entity from the scheduling list instead.
>     >
>     > Signed-off-by: Christian König <christian.koenig at amd.com>
>     > ---
>     >   drivers/gpu/drm/scheduler/gpu_scheduler.c | 35 +++++++------------------------
>     >   1 file changed, 8 insertions(+), 27 deletions(-)
>     >
>     > diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>     > index f563e4fbb4b6..1b733229201e 100644
>     > --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
>     > +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>     > @@ -198,21 +198,6 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>     >   }
>     >   EXPORT_SYMBOL(drm_sched_entity_init);
>     >
>     > -/**
>     > - * drm_sched_entity_is_initialized - Query if entity is initialized
>     > - *
>     > - * @sched: Pointer to scheduler instance
>     > - * @entity: The pointer to a valid scheduler entity
>     > - *
>     > - * return true if entity is initialized, false otherwise
>     > -*/
>     > -static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched,
>     > -                                         struct drm_sched_entity *entity)
>     > -{
>     > -     return entity->rq != NULL &&
>     > -             entity->rq->sched == sched;
>     > -}
>     > -
>     >   /**
>     >    * drm_sched_entity_is_idle - Check if entity is idle
>     >    *
>     > @@ -224,7 +209,8 @@ static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
>     >   {
>     >       rmb();
>     >
>     > -     if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL)
>     > +     if (list_empty(&entity->list) ||
>     > +         spsc_queue_peek(&entity->job_queue) == NULL)
>     >               return true;
>     >
>     >       return false;
>     > @@ -279,8 +265,6 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
>     >       long ret = timeout;
>     >
>     >       sched = entity->rq->sched;
>     > -     if (!drm_sched_entity_is_initialized(sched, entity))
>     > -             return ret;
>     >       /**
>     >        * The client will not queue more IBs during this fini, consume existing
>     >        * queued IBs or discard them on SIGKILL
>     > @@ -299,7 +283,7 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
>     >       last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
>     >       if ((!last_user || last_user == current->group_leader) &&
>     >           (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
>     > -             drm_sched_entity_set_rq(entity, NULL);
>     > +             drm_sched_rq_remove_entity(entity->rq, entity);
>     >
>     >       return ret;
>     >   }
>     > @@ -320,7 +304,7 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
>     >       struct drm_gpu_scheduler *sched;
>     >
>     >       sched = entity->rq->sched;
>     > -     drm_sched_entity_set_rq(entity, NULL);
>     > +     drm_sched_rq_remove_entity(entity->rq, entity);
>     >
>     >       /* Consumption of existing IBs wasn't completed. Forcefully
>     >        * remove them here.
>     > @@ -416,15 +400,12 @@ void drm_sched_entity_set_rq(struct drm_sched_entity *entity,
>     >       if (entity->rq == rq)
>     >               return;
>     >
>     > -     spin_lock(&entity->rq_lock);
>     > -
>     > -     if (entity->rq)
>     > -             drm_sched_rq_remove_entity(entity->rq, entity);
>     > +     BUG_ON(!rq);
>     >
>     > +     spin_lock(&entity->rq_lock);
>     > +     drm_sched_rq_remove_entity(entity->rq, entity);
>     >       entity->rq = rq;
>     > -     if (rq)
>     > -             drm_sched_rq_add_entity(rq, entity);
>     > -
>     > +     drm_sched_rq_add_entity(rq, entity);
>     >       spin_unlock(&entity->rq_lock);
>     >   }
>     >   EXPORT_SYMBOL(drm_sched_entity_set_rq);
>
