<div dir="ltr"><br><br><div class="gmail_quote"><div dir="ltr">On Fri, Aug 3, 2018 at 7:17 PM Andrey Grodzovsky <<a href="mailto:Andrey.Grodzovsky@amd.com" target="_blank">Andrey.Grodzovsky@amd.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<div text="#000000" bgcolor="#FFFFFF">
<p><br>
</p>
<br>
<div class="m_-6223222008664542963m_3550633764703809527moz-cite-prefix">On 08/03/2018 08:12 AM, Nayan Deshmukh
wrote:<br>
</div>
<blockquote type="cite">
<div dir="auto">
<div><br>
<br>
<div class="gmail_quote">
<div dir="ltr">On Thu, Aug 2, 2018, 8:09 PM Andrey
Grodzovsky <<a href="mailto:Andrey.Grodzovsky@amd.com" target="_blank">Andrey.Grodzovsky@amd.com</a>>
wrote:<br>
</div>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<div text="#000000" bgcolor="#FFFFFF">
<p><br>
</p>
<br>
<div class="m_-6223222008664542963m_3550633764703809527m_-2527259700652160276moz-cite-prefix">On
08/02/2018 02:43 AM, Nayan Deshmukh wrote:<br>
</div>
<blockquote type="cite">
<div dir="ltr">
<div>
<div>
<div>Hi Andrey,<br>
<br>
</div>
Good catch. I guess since both Christian and I
were working on it in parallel we didn't catch
this problem.<br>
<br>
</div>
If it is only causing a problem in push_job then
we can handle it something like this:<br>
<br>
----------------------------------------------------------------------------------------------------------------------------<br>
diff --git
a/drivers/gpu/drm/scheduler/gpu_scheduler.c
b/drivers/gpu/drm/scheduler/gpu_scheduler.c<br>
index 05dc6ecd4003..f344ce32f128 100644<br>
--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c<br>
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c<br>
@@ -563,6 +563,11 @@ void
drm_sched_entity_push_job(struct drm_sched_job
*sched_job,<br>
first =
spsc_queue_count(&entity->job_queue) == 0;<br>
reschedule = idle && first
&& (entity->num_rq_list > 1);<br>
<br>
+ if (first &&
list_empty(&entity->list)) {<br>
</div>
</div>
</blockquote>
<br>
first might be false if the other process was interrupted by
SIGKILL in the middle of wait_event_killable in
drm_sched_entity_flush when there were still items in the
queue.<br>
I don't see a good way besides adding a 'stopped' flag
to drm_sched_entity.<br>
</div>
</blockquote>
</div>
</div>
<div dir="auto">Sorry I missed this mail. This case might happen
but this was also not being handled previously. </div>
<div dir="auto"><br>
</div>
<div dir="auto">Nayan <br>
</div>
</div>
</blockquote>
<br>
The original code before 'drm/scheduler: stop setting rq to NULL'
was fine, because you didn't use the queue's emptiness as a criterion for
aborting your next push to queue but rather<br>
checking for entity->rq != NULL.<br></div></blockquote><div>But (entity->rq != NULL) check was inside the body of if (first) so it was also dependent on the queue's emptiness earlier. Well in any case it doesn't matter now as with Christian's patch we can remove the if condition altogether. <br><br></div><div>Nayan<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div text="#000000" bgcolor="#FFFFFF">
<br>
Andrey<br>
<br>
<blockquote type="cite">
<div dir="auto">
<div dir="auto">
<div class="gmail_quote">
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<div text="#000000" bgcolor="#FFFFFF"> <br>
Andrey<br>
<br>
<blockquote type="cite">
<div dir="ltr">
<div>+ DRM_ERROR("Trying to push to a
killed entity\n");<br>
+ return;<br>
+ }<br>
+<br>
if (reschedule) {<br>
rq =
drm_sched_entity_get_free_sched(entity);<br>
spin_lock(&entity->rq_lock);<br>
@@ -580,11 +585,6 @@ void
drm_sched_entity_push_job(struct drm_sched_job
*sched_job,<br>
if (first) {<br>
/* Add the entity to the run queue
*/<br>
spin_lock(&entity->rq_lock);<br>
- if (!entity->rq) {<br>
- DRM_ERROR("Trying to push
to a killed entity\n");<br>
-
spin_unlock(&entity->rq_lock);<br>
- return;<br>
- }<br>
drm_sched_rq_add_entity(entity->rq, entity);<br>
spin_unlock(&entity->rq_lock);<br>
drm_sched_wakeup(entity->rq->sched);<br>
-----------------------------------------------------------------------------------------------------------------------------<br>
</div>
<br>
<br>
<div class="gmail_quote">
<div dir="ltr">On Thu, Aug 2, 2018 at 3:55 AM
Andrey Grodzovsky <<a href="mailto:Andrey.Grodzovsky@amd.com" rel="noreferrer" target="_blank">Andrey.Grodzovsky@amd.com</a>>
wrote:<br>
</div>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Thinking again about
this change and 53d5f1e drm/scheduler: move idle
<br>
entities to scheduler with less load v2<br>
<br>
Looks to me like use case for which fc9a539
drm/scheduler: add NULL <br>
pointer check for run queue (v2) was done<br>
<br>
will not work anymore.<br>
<br>
First of all in drm_sched_entity_push_job, 'if
(!entity->rq)' will never <br>
be true any more since we stopped setting entity->rq
to NULL in<br>
<br>
drm_sched_entity_flush.<br>
<br>
Second of all, even after we removed the entity
from rq in <br>
drm_sched_entity_flush to terminate any
subsequent submissions<br>
<br>
to the queue the other thread doing push job can
just acquire again a <br>
run queue<br>
<br>
from drm_sched_entity_get_free_sched and
continue submission so you <br>
can't substitute ' if (!entity->rq)' to 'if
(list_empty(&entity->list))'.<br>
<br>
What about adding a 'stopped' flag to
drm_sched_entity to be set in <br>
drm_sched_entity_flush and in<br>
<br>
</blockquote>
<div>But it might be worth adding a stopped flag
field if it is required at other places as
well. <br>
<br>
</div>
<div>Thanks,<br>
</div>
<div>Nayan<br>
</div>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
drm_sched_entity_push_job check for 'if
(!entity->stopped)' instead of <br>
' if (!entity->rq)' ?<br>
<br>
Andrey<br>
<br>
<br>
On 07/30/2018 07:03 AM, Christian König wrote:<br>
> We removed the redundancy of having an
extra scheduler field, so we<br>
> can't set the rq to NULL any more or
otherwise won't know which<br>
> scheduler to use for the cleanup.<br>
><br>
> Just remove the entity from the scheduling
list instead.<br>
><br>
> Signed-off-by: Christian König <<a href="mailto:christian.koenig@amd.com" rel="noreferrer" target="_blank">christian.koenig@amd.com</a>><br>
> ---<br>
> drivers/gpu/drm/scheduler/gpu_scheduler.c
| 35 +++++++------------------------<br>
> 1 file changed, 8 insertions(+), 27
deletions(-)<br>
><br>
> diff --git
a/drivers/gpu/drm/scheduler/gpu_scheduler.c
b/drivers/gpu/drm/scheduler/gpu_scheduler.c<br>
> index f563e4fbb4b6..1b733229201e 100644<br>
> ---
a/drivers/gpu/drm/scheduler/gpu_scheduler.c<br>
> +++
b/drivers/gpu/drm/scheduler/gpu_scheduler.c<br>
> @@ -198,21 +198,6 @@ int
drm_sched_entity_init(struct drm_sched_entity
*entity,<br>
> }<br>
> EXPORT_SYMBOL(drm_sched_entity_init);<br>
> <br>
> -/**<br>
> - * drm_sched_entity_is_initialized - Query
if entity is initialized<br>
> - *<br>
> - * @sched: Pointer to scheduler instance<br>
> - * @entity: The pointer to a valid
scheduler entity<br>
> - *<br>
> - * return true if entity is initialized,
false otherwise<br>
> -*/<br>
> -static bool
drm_sched_entity_is_initialized(struct
drm_gpu_scheduler *sched,<br>
> -
struct drm_sched_entity *entity)<br>
> -{<br>
> - return entity->rq != NULL
&&<br>
> - entity->rq->sched ==
sched;<br>
> -}<br>
> -<br>
> /**<br>
> * drm_sched_entity_is_idle - Check if
entity is idle<br>
> *<br>
> @@ -224,7 +209,8 @@ static bool
drm_sched_entity_is_idle(struct drm_sched_entity
*entity)<br>
> {<br>
> rmb();<br>
> <br>
> - if (!entity->rq ||
spsc_queue_peek(&entity->job_queue) ==
NULL)<br>
> + if (list_empty(&entity->list)
||<br>
> +
spsc_queue_peek(&entity->job_queue) ==
NULL)<br>
> return true;<br>
> <br>
> return false;<br>
> @@ -279,8 +265,6 @@ long
drm_sched_entity_flush(struct drm_sched_entity
*entity, long timeout)<br>
> long ret = timeout;<br>
> <br>
> sched = entity->rq->sched;<br>
> - if
(!drm_sched_entity_is_initialized(sched,
entity))<br>
> - return ret;<br>
> /**<br>
> * The client will not queue more IBs
during this fini, consume existing<br>
> * queued IBs or discard them on
SIGKILL<br>
> @@ -299,7 +283,7 @@ long
drm_sched_entity_flush(struct drm_sched_entity
*entity, long timeout)<br>
> last_user =
cmpxchg(&entity->last_user,
current->group_leader, NULL);<br>
> if ((!last_user || last_user ==
current->group_leader) &&<br>
> (current->flags &
PF_EXITING) && (current->exit_code ==
SIGKILL))<br>
> -
drm_sched_entity_set_rq(entity, NULL);<br>
> +
drm_sched_rq_remove_entity(entity->rq,
entity);<br>
> <br>
> return ret;<br>
> }<br>
> @@ -320,7 +304,7 @@ void
drm_sched_entity_fini(struct drm_sched_entity
*entity)<br>
> struct drm_gpu_scheduler *sched;<br>
> <br>
> sched = entity->rq->sched;<br>
> - drm_sched_entity_set_rq(entity,
NULL);<br>
> +
drm_sched_rq_remove_entity(entity->rq,
entity);<br>
> <br>
> /* Consumption of existing IBs wasn't
completed. Forcefully<br>
> * remove them here.<br>
> @@ -416,15 +400,12 @@ void
drm_sched_entity_set_rq(struct drm_sched_entity
*entity,<br>
> if (entity->rq == rq)<br>
> return;<br>
> <br>
> - spin_lock(&entity->rq_lock);<br>
> -<br>
> - if (entity->rq)<br>
> -
drm_sched_rq_remove_entity(entity->rq,
entity);<br>
> + BUG_ON(!rq);<br>
> <br>
> + spin_lock(&entity->rq_lock);<br>
> +
drm_sched_rq_remove_entity(entity->rq,
entity);<br>
> entity->rq = rq;<br>
> - if (rq)<br>
> - drm_sched_rq_add_entity(rq,
entity);<br>
> -<br>
> + drm_sched_rq_add_entity(rq, entity);<br>
> spin_unlock(&entity->rq_lock);<br>
> }<br>
> EXPORT_SYMBOL(drm_sched_entity_set_rq);<br>
<br>
</blockquote>
</div>
</div>
</blockquote>
<br>
</div>
</blockquote>
</div>
</div>
</div>
</blockquote>
<br>
</div>
</blockquote></div></div>