<div dir="ltr"><div><div><div>Hi Andrey,<br><br></div>Good catch. I guess since both Christian and I were working on it parallelly we didn't catch this problem.<br><br></div>If it is only causing a problem in push_job then we can handle it something like this:<br><br>----------------------------------------------------------------------------------------------------------------------------<br>diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c<br>index 05dc6ecd4003..f344ce32f128 100644<br>--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c<br>+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c<br>@@ -563,6 +563,11 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,<br>        first = spsc_queue_count(&entity->job_queue) == 0;<br>        reschedule = idle && first && (entity->num_rq_list > 1);<br> <br>+       if (first && list_empty(&entity->list)) {<br>+               DRM_ERROR("Trying to push to a killed entity\n");<br>+               return;<br>+       }<br>+<br>        if (reschedule) {<br>                rq = drm_sched_entity_get_free_sched(entity);<br>                spin_lock(&entity->rq_lock);<br>@@ -580,11 +585,6 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,<br>        if (first) {<br>                /* Add the entity to the run queue */<br>                spin_lock(&entity->rq_lock);<br>-               if (!entity->rq) {<br>-                       DRM_ERROR("Trying to push to a killed entity\n");<br>-                       spin_unlock(&entity->rq_lock);<br>-                       return;<br>-               }<br>                drm_sched_rq_add_entity(entity->rq, entity);<br>                spin_unlock(&entity->rq_lock);<br>                drm_sched_wakeup(entity->rq->sched);<br>-----------------------------------------------------------------------------------------------------------------------------<br></div><br><br><div class="gmail_quote"><div dir="ltr">On Thu, Aug 2, 2018 at 3:55 AM Andrey Grodzovsky <<a href="mailto:Andrey.Grodzovsky@amd.com" target="_blank">Andrey.Grodzovsky@amd.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Thinking again about this change and 53d5f1e drm/scheduler: move idle <br>
entities to scheduler with less load v2"), it looks to me like the use
case for which fc9a539 ("drm/scheduler: add NULL pointer check for run
queue (v2)") was done will not work anymore.

First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will never
be true any more since we stopped setting entity->rq to NULL in
drm_sched_entity_flush.

Second of all, even after we removed the entity from the rq in
drm_sched_entity_flush to terminate any subsequent submissions to the
queue, the other thread doing push_job can just acquire a run queue
again from drm_sched_entity_get_free_sched and continue submission, so
you can't substitute 'if (!entity->rq)' with 'if (list_empty(&entity->list))'.
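To make the race concrete, here is a rough sketch of how the two paths can
interleave, condensed from the drm_sched_entity_push_job() context visible in
the diff above (an illustration of the concern, not exact kernel code):

/* Thread A: drm_sched_entity_flush() on SIGKILL */
drm_sched_rq_remove_entity(entity->rq, entity);
/* entity->list is now empty, but entity->rq still points at a run queue */

/* Thread B: drm_sched_entity_push_job(), racing with thread A */
reschedule = idle && first && (entity->num_rq_list > 1);

if (reschedule) {
        /* the entity is handed a run queue again */
        rq = drm_sched_entity_get_free_sched(entity);
        spin_lock(&entity->rq_lock);
        /* ... entity->rq is switched over to the newly chosen rq ... */
        spin_unlock(&entity->rq_lock);
}

if (first) {
        /* the "killed" entity ends up back on a run queue */
        spin_lock(&entity->rq_lock);
        drm_sched_rq_add_entity(entity->rq, entity);
        spin_unlock(&entity->rq_lock);
        drm_sched_wakeup(entity->rq->sched);
}

In other words, a plain substitution of '!entity->rq' with
'list_empty(&entity->list)' at the old check site does not close this window.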

What about adding a 'stopped' flag to drm_sched_entity to be set in
drm_sched_entity_flush, and in drm_sched_entity_push_job check for
'if (!entity->stopped)' instead of 'if (!entity->rq)' ?

But it might be worth adding a stopped flag field if it is required at
other places as well.

Thanks,
Nayan
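For illustration, a minimal sketch of what such a flag could look like; the
field name, locking and exact placement are assumptions, not code from this
series:

/* Hypothetical sketch of the proposed flag -- not part of this series. */

struct drm_sched_entity {
        /* ... existing members ... */
        bool    stopped;        /* no further submission allowed once set */
};

/* drm_sched_entity_flush(): mark the entity dead when it is taken off
 * its run queue on SIGKILL.
 */
spin_lock(&entity->rq_lock);
entity->stopped = true;
drm_sched_rq_remove_entity(entity->rq, entity);
spin_unlock(&entity->rq_lock);

/* drm_sched_entity_push_job(): bail out before any run queue is
 * (re)acquired, so a racing reschedule cannot resurrect the entity.
 */
if (entity->stopped) {
        DRM_ERROR("Trying to push to a killed entity\n");
        return;
}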

Andrey


On 07/30/2018 07:03 AM, Christian König wrote:
> We removed the redundancy of having an extra scheduler field, so we
> can't set the rq to NULL any more or otherwise won't know which
> scheduler to use for the cleanup.
>
> Just remove the entity from the scheduling list instead.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/scheduler/gpu_scheduler.c | 35 +++++++------------------------
>   1 file changed, 8 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> index f563e4fbb4b6..1b733229201e 100644
> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> @@ -198,21 +198,6 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>   }
>   EXPORT_SYMBOL(drm_sched_entity_init);
>
> -/**
> - * drm_sched_entity_is_initialized - Query if entity is initialized
> - *
> - * @sched: Pointer to scheduler instance
> - * @entity: The pointer to a valid scheduler entity
> - *
> - * return true if entity is initialized, false otherwise
> -*/
> -static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched,
> -                                         struct drm_sched_entity *entity)
> -{
> -     return entity->rq != NULL &&
> -             entity->rq->sched == sched;
> -}
> -
>   /**
>    * drm_sched_entity_is_idle - Check if entity is idle
>    *
> @@ -224,7 +209,8 @@ static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
>   {
>       rmb();
>
> -     if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL)
> +     if (list_empty(&entity->list) ||
> +         spsc_queue_peek(&entity->job_queue) == NULL)
>               return true;
>
>       return false;
> @@ -279,8 +265,6 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
>       long ret = timeout;
>
>       sched = entity->rq->sched;
> -     if (!drm_sched_entity_is_initialized(sched, entity))
> -             return ret;
>       /**
>        * The client will not queue more IBs during this fini, consume existing
>        * queued IBs or discard them on SIGKILL
> @@ -299,7 +283,7 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
>       last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
>       if ((!last_user || last_user == current->group_leader) &&
>           (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
> -             drm_sched_entity_set_rq(entity, NULL);
> +             drm_sched_rq_remove_entity(entity->rq, entity);
>
>       return ret;
>   }
> @@ -320,7 +304,7 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
>       struct drm_gpu_scheduler *sched;
>
>       sched = entity->rq->sched;
> -     drm_sched_entity_set_rq(entity, NULL);
> +     drm_sched_rq_remove_entity(entity->rq, entity);
>
>       /* Consumption of existing IBs wasn't completed. Forcefully
>        * remove them here.
> @@ -416,15 +400,12 @@ void drm_sched_entity_set_rq(struct drm_sched_entity *entity,
>       if (entity->rq == rq)
>               return;
>
> -     spin_lock(&entity->rq_lock);
> -
> -     if (entity->rq)
> -             drm_sched_rq_remove_entity(entity->rq, entity);
> +     BUG_ON(!rq);
>
> +     spin_lock(&entity->rq_lock);
> +     drm_sched_rq_remove_entity(entity->rq, entity);
>       entity->rq = rq;
> -     if (rq)
> -             drm_sched_rq_add_entity(rq, entity);
> -
> +     drm_sched_rq_add_entity(rq, entity);
>       spin_unlock(&entity->rq_lock);
>   }
>   EXPORT_SYMBOL(drm_sched_entity_set_rq);
