[PATCH v2 1/2] drm/scheduler: Avoid using wait_event_killable for dying process.
Andrey Grodzovsky
Andrey.Grodzovsky at amd.com
Fri Jun 1 19:56:54 UTC 2018
On 06/01/2018 01:22 PM, Christian König wrote:
> Am 01.06.2018 um 19:11 schrieb Andrey Grodzovsky:
>> Dying process might be blocked from receiving any more signals
>> so avoid using it.
>>
>> Also retire enity->fini_status and just check the SW queue,
>> if it's not empty do the fallback cleanup.
>>
>> Also handle entity->last_scheduled == NULL use case which
>> happens when HW ring is already hanged when a new entity
>> tried to enqueue jobs.
>>
>> v2:
>> Return the remaining timeout and use that as parameter for the next
>> call.
>> This way when we need to cleanup multiple queues we don't wait for the
>> entire TO period for each queue but rather in total.
>> Styling comments.
>> Rebase.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>> ---
>> drivers/gpu/drm/scheduler/gpu_scheduler.c | 74
>> ++++++++++++++++++++++++-------
>> include/drm/gpu_scheduler.h | 7 +--
>> 2 files changed, 61 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c
>> b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>> index 8c1e80c..c594d17 100644
>> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
>> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>> @@ -181,7 +181,6 @@ int drm_sched_entity_init(struct
>> drm_gpu_scheduler *sched,
>> entity->rq = rq;
>> entity->sched = sched;
>> entity->guilty = guilty;
>> - entity->fini_status = 0;
>> entity->last_scheduled = NULL;
>> spin_lock_init(&entity->rq_lock);
>> @@ -219,7 +218,8 @@ static bool
>> drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched,
>> static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
>> {
>> rmb();
>> - if (spsc_queue_peek(&entity->job_queue) == NULL)
>> +
>> + if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL)
>> return true;
>> return false;
>> @@ -260,25 +260,48 @@ static void
>> drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
>> *
>> * @sched: scheduler instance
>> * @entity: scheduler entity
>> + * @timeout: time to wait in ms for Q to become empty.
>> *
>> * Splitting drm_sched_entity_fini() into two functions, The first
>> one does the waiting,
>> * removes the entity from the runqueue and returns an error when
>> the process was killed.
>> + *
>> + * Returns amount of time spent in waiting for TO.
>> + * 0 if wait wasn't with time out.
>> + * MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS if wait timed out with condition
>> false
>> + * Number of MS spent in waiting before condition became true
>> + *
>> */
>> -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>> - struct drm_sched_entity *entity)
>> +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>> + struct drm_sched_entity *entity, unsigned timeout)
>
> Better use long for return type and timeout.
>
>> {
>> + unsigned ret = 0;
>
> Also use a long here and initialize it with timeout.
Please see below
>
>> +
>> if (!drm_sched_entity_is_initialized(sched, entity))
>> return;
>> /**
>> * The client will not queue more IBs during this fini, consume
>> existing
>> * queued IBs or discard them on SIGKILL
>> */
>> - if ((current->flags & PF_SIGNALED) && current->exit_code ==
>> SIGKILL)
>> - entity->fini_status = -ERESTARTSYS;
>> - else
>> - entity->fini_status = wait_event_killable(sched->job_scheduled,
>> - drm_sched_entity_is_idle(entity));
>> - drm_sched_entity_set_rq(entity, NULL);
>> + if (current->flags & PF_EXITING) {
>> + if (timeout) {
>> + ret = jiffies_to_msecs(
>> + wait_event_timeout(
>> + sched->job_scheduled,
>> + drm_sched_entity_is_idle(entity),
>> + msecs_to_jiffies(timeout)));
>
> Oh please don't use msecs as timeout, just use jiffies and let the
> caller do the conversion.
>
>> +
>> + if (!ret)
>> + ret = MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS;
>
> Why that? It is common coding style to return 0 when a timeout occurs.
>
> Christian.
What should I return when I do wait_event_killable? Its return values
are the opposite of wait_event_timeout's...
This way returning 0 has no impact on the remaining waiting time; doing it
the other way will force the caller
to do some cumbersome logic instead of just
max_wait = max_wait >= ret ? max_wait - ret : 0;
like in amdgpu_ctx_mgr_entity_fini
Andrey
>
>> + }
>> + } else
>> + wait_event_killable(sched->job_scheduled,
>> drm_sched_entity_is_idle(entity));
>> +
>> +
>> + /* For killed process disable any more IBs enqueue right now */
>> + if ((current->flags & PF_EXITING) && (current->exit_code ==
>> SIGKILL))
>> + drm_sched_entity_set_rq(entity, NULL);
>> +
>> + return ret;
>> }
>> EXPORT_SYMBOL(drm_sched_entity_do_release);
>> @@ -290,11 +313,18 @@ EXPORT_SYMBOL(drm_sched_entity_do_release);
>> *
>> * This should be called after @drm_sched_entity_do_release. It
>> goes over the
>> * entity and signals all jobs with an error code if the process
>> was killed.
>> + *
>> */
>> void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>> struct drm_sched_entity *entity)
>> {
>> - if (entity->fini_status) {
>> +
>> + drm_sched_entity_set_rq(entity, NULL);
>> +
>> + /* Consumption of existing IBs wasn't completed. Forcefully
>> + * remove them here.
>> + */
>> + if (spsc_queue_peek(&entity->job_queue)) {
>> struct drm_sched_job *job;
>> int r;
>> @@ -314,12 +344,22 @@ void drm_sched_entity_cleanup(struct
>> drm_gpu_scheduler *sched,
>> struct drm_sched_fence *s_fence = job->s_fence;
>> drm_sched_fence_scheduled(s_fence);
>> dma_fence_set_error(&s_fence->finished, -ESRCH);
>> - r = dma_fence_add_callback(entity->last_scheduled,
>> &job->finish_cb,
>> - drm_sched_entity_kill_jobs_cb);
>> - if (r == -ENOENT)
>> +
>> + /*
>> + * When pipe is hanged by older entity, new entity might
>> + * not even have chance to submit it's first job to HW
>> + * and so entity->last_scheduled will remain NULL
>> + */
>> + if (!entity->last_scheduled) {
>> drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>> - else if (r)
>> - DRM_ERROR("fence add callback failed (%d)\n", r);
>> + } else {
>> + r = dma_fence_add_callback(entity->last_scheduled,
>> &job->finish_cb,
>> + drm_sched_entity_kill_jobs_cb);
>> + if (r == -ENOENT)
>> + drm_sched_entity_kill_jobs_cb(NULL,
>> &job->finish_cb);
>> + else if (r)
>> + DRM_ERROR("fence add callback failed (%d)\n", r);
>> + }
>> }
>> }
>> @@ -339,7 +379,7 @@ EXPORT_SYMBOL(drm_sched_entity_cleanup);
>> void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>> struct drm_sched_entity *entity)
>> {
>> - drm_sched_entity_do_release(sched, entity);
>> + drm_sched_entity_do_release(sched, entity,
>> MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS);
>> drm_sched_entity_cleanup(sched, entity);
>> }
>> EXPORT_SYMBOL(drm_sched_entity_fini);
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 496442f..af07875 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -27,6 +27,8 @@
>> #include <drm/spsc_queue.h>
>> #include <linux/dma-fence.h>
>> +#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS 1000
>
> I suggest to use msecs_to_jiffies(1000) here and drop the _MS postfix.
>
> Christian.
>
>> +
>> struct drm_gpu_scheduler;
>> struct drm_sched_rq;
>> @@ -84,7 +86,6 @@ struct drm_sched_entity {
>> struct dma_fence *dependency;
>> struct dma_fence_cb cb;
>> atomic_t *guilty;
>> - int fini_status;
>> struct dma_fence *last_scheduled;
>> };
>> @@ -283,8 +284,8 @@ int drm_sched_entity_init(struct
>> drm_gpu_scheduler *sched,
>> struct drm_sched_entity *entity,
>> struct drm_sched_rq *rq,
>> atomic_t *guilty);
>> -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>> - struct drm_sched_entity *entity);
>> +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>> + struct drm_sched_entity *entity, unsigned timeout);
>> void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>> struct drm_sched_entity *entity);
>> void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>
More information about the amd-gfx
mailing list