[PATCH v2 1/2] drm/scheduler: Avoid using wait_event_killable for dying process.
Christian König
christian.koenig at amd.com
Fri Jun 1 17:22:46 UTC 2018
Am 01.06.2018 um 19:11 schrieb Andrey Grodzovsky:
> Dying process might be blocked from receiving any more signals
> so avoid using it.
>
> Also retire entity->fini_status and just check the SW queue,
> if it's not empty do the fallback cleanup.
>
> Also handle entity->last_scheduled == NULL use case which
> happens when HW ring is already hung when a new entity
> tried to enqueue jobs.
>
> v2:
> Return the remaining timeout and use that as parameter for the next call.
> This way when we need to cleanup multiple queues we don't wait for the
> entire TO period for each queue but rather in total.
> Styling comments.
> Rebase.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
> drivers/gpu/drm/scheduler/gpu_scheduler.c | 74 ++++++++++++++++++++++++-------
> include/drm/gpu_scheduler.h | 7 +--
> 2 files changed, 61 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> index 8c1e80c..c594d17 100644
> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> @@ -181,7 +181,6 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
> entity->rq = rq;
> entity->sched = sched;
> entity->guilty = guilty;
> - entity->fini_status = 0;
> entity->last_scheduled = NULL;
>
> spin_lock_init(&entity->rq_lock);
> @@ -219,7 +218,8 @@ static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched,
> static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
> {
> rmb();
> - if (spsc_queue_peek(&entity->job_queue) == NULL)
> +
> + if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL)
> return true;
>
> return false;
> @@ -260,25 +260,48 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
> *
> * @sched: scheduler instance
> * @entity: scheduler entity
> + * @timeout: time to wait in ms for Q to become empty.
> *
> * Splitting drm_sched_entity_fini() into two functions, The first one does the waiting,
> * removes the entity from the runqueue and returns an error when the process was killed.
> + *
> + * Returns amount of time spent in waiting for TO.
> + * 0 if wait wasn't with time out.
> + * MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS if wait timed out with condition false
> + * Number of MS spent in waiting before condition became true
> + *
> */
> -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
> - struct drm_sched_entity *entity)
> +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
> + struct drm_sched_entity *entity, unsigned timeout)
Better use long for return type and timeout.
> {
> + unsigned ret = 0;
Also use a long here and initialize it with timeout.
> +
> if (!drm_sched_entity_is_initialized(sched, entity))
> return;
> /**
> * The client will not queue more IBs during this fini, consume existing
> * queued IBs or discard them on SIGKILL
> */
> - if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
> - entity->fini_status = -ERESTARTSYS;
> - else
> - entity->fini_status = wait_event_killable(sched->job_scheduled,
> - drm_sched_entity_is_idle(entity));
> - drm_sched_entity_set_rq(entity, NULL);
> + if (current->flags & PF_EXITING) {
> + if (timeout) {
> + ret = jiffies_to_msecs(
> + wait_event_timeout(
> + sched->job_scheduled,
> + drm_sched_entity_is_idle(entity),
> + msecs_to_jiffies(timeout)));
Oh please don't use msecs as timeout, just use jiffies and let the
caller do the conversion.
> +
> + if (!ret)
> + ret = MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS;
Why that? It is common coding style to return 0 when a timeout occurs.
Christian.
> + }
> + } else
> + wait_event_killable(sched->job_scheduled, drm_sched_entity_is_idle(entity));
> +
> +
> + /* For killed process disable any more IBs enqueue right now */
> + if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
> + drm_sched_entity_set_rq(entity, NULL);
> +
> + return ret;
> }
> EXPORT_SYMBOL(drm_sched_entity_do_release);
>
> @@ -290,11 +313,18 @@ EXPORT_SYMBOL(drm_sched_entity_do_release);
> *
> * This should be called after @drm_sched_entity_do_release. It goes over the
> * entity and signals all jobs with an error code if the process was killed.
> + *
> */
> void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
> struct drm_sched_entity *entity)
> {
> - if (entity->fini_status) {
> +
> + drm_sched_entity_set_rq(entity, NULL);
> +
> + /* Consumption of existing IBs wasn't completed. Forcefully
> + * remove them here.
> + */
> + if (spsc_queue_peek(&entity->job_queue)) {
> struct drm_sched_job *job;
> int r;
>
> @@ -314,12 +344,22 @@ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
> struct drm_sched_fence *s_fence = job->s_fence;
> drm_sched_fence_scheduled(s_fence);
> dma_fence_set_error(&s_fence->finished, -ESRCH);
> - r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
> - drm_sched_entity_kill_jobs_cb);
> - if (r == -ENOENT)
> +
> +			/*
> +			 * When the pipe is hung by an older entity, a new entity might
> +			 * not even have a chance to submit its first job to HW
> +			 * and so entity->last_scheduled will remain NULL
> +			 */
> + if (!entity->last_scheduled) {
> drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
> - else if (r)
> - DRM_ERROR("fence add callback failed (%d)\n", r);
> + } else {
> + r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
> + drm_sched_entity_kill_jobs_cb);
> + if (r == -ENOENT)
> + drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
> + else if (r)
> + DRM_ERROR("fence add callback failed (%d)\n", r);
> + }
> }
> }
>
> @@ -339,7 +379,7 @@ EXPORT_SYMBOL(drm_sched_entity_cleanup);
> void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
> struct drm_sched_entity *entity)
> {
> - drm_sched_entity_do_release(sched, entity);
> + drm_sched_entity_do_release(sched, entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS);
> drm_sched_entity_cleanup(sched, entity);
> }
> EXPORT_SYMBOL(drm_sched_entity_fini);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 496442f..af07875 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -27,6 +27,8 @@
> #include <drm/spsc_queue.h>
> #include <linux/dma-fence.h>
>
> +#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS 1000
I suggest to use msecs_to_jiffies(1000) here and drop the _MS postfix.
Christian.
> +
> struct drm_gpu_scheduler;
> struct drm_sched_rq;
>
> @@ -84,7 +86,6 @@ struct drm_sched_entity {
> struct dma_fence *dependency;
> struct dma_fence_cb cb;
> atomic_t *guilty;
> - int fini_status;
> struct dma_fence *last_scheduled;
> };
>
> @@ -283,8 +284,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
> struct drm_sched_entity *entity,
> struct drm_sched_rq *rq,
> atomic_t *guilty);
> -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
> - struct drm_sched_entity *entity);
> +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
> + struct drm_sched_entity *entity, unsigned timeout);
> void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
> struct drm_sched_entity *entity);
> void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
More information about the amd-gfx
mailing list