[PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
Christian König
ckoenig.leichtzumerken at gmail.com
Mon Oct 23 12:42:36 UTC 2017
Am 20.10.2017 um 15:32 schrieb Andrey Grodzovsky:
> Switch from kfifo to SPSC queue.
>
> Bug:
> Kfifo is limited at size, during GPU reset it would fill up to limit
> and the pushing thread (producer) would wait for the scheduler worker to
> consume the items in the fifo while holding reservation lock
> on a BO. The gpu reset thread on the other hand blocks the scheduler
> during reset. Before it unblocks the sceduler it might want
> to recover VRAM and so will try to reserve the same BO the producer
> thread is already holding creating a deadlock.
>
> Fix:
> Switch from kfifo to SPSC queue which is unlimited in size.
>
> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky at amd.com>
> ---
> drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h | 4 +-
> drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 51 ++++++++++---------------
> drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 4 +-
> 3 files changed, 26 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> index 8bd3810..86838a8 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> @@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job,
> __entry->id = sched_job->id;
> __entry->fence = &sched_job->s_fence->finished;
> __entry->name = sched_job->sched->name;
> - __entry->job_count = kfifo_len(
> - &sched_job->s_entity->job_queue) / sizeof(sched_job);
> + __entry->job_count = spsc_queue_count(
> + &sched_job->s_entity->job_queue);
> __entry->hw_job_count = atomic_read(
> &sched_job->sched->hw_rq_count);
> ),
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 1bbbce2..0c9cdc0 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -28,9 +28,14 @@
> #include <drm/drmP.h>
> #include "gpu_scheduler.h"
>
> +#include "spsc_queue.h"
> +
> #define CREATE_TRACE_POINTS
> #include "gpu_sched_trace.h"
>
> +#define to_amd_sched_job(sched_job) \
> + container_of((sched_job), struct amd_sched_job, queue_node)
> +
> static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);
> static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);
> static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
> @@ -123,8 +128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
> struct amd_sched_rq *rq,
> uint32_t jobs)
> {
> - int r;
> -
> if (!(sched && entity && rq))
> return -EINVAL;
>
> @@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
>
> spin_lock_init(&entity->rq_lock);
> spin_lock_init(&entity->queue_lock);
> - r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
> - if (r)
> - return r;
> + spsc_queue_init(&entity->job_queue);
>
> atomic_set(&entity->fence_seq, 0);
> entity->fence_context = dma_fence_context_alloc(2);
> @@ -170,7 +171,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,
> static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
> {
> rmb();
> - if (kfifo_is_empty(&entity->job_queue))
> + if (spsc_queue_peek(&entity->job_queue) == NULL)
> return true;
>
> return false;
> @@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
> */
> static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)
> {
> - if (kfifo_is_empty(&entity->job_queue))
> + if (spsc_queue_peek(&entity->job_queue) == NULL)
> return false;
>
> if (ACCESS_ONCE(entity->dependency))
> @@ -227,7 +228,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
> */
> kthread_park(sched->thread);
> kthread_unpark(sched->thread);
> - while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
> + while ((job = to_amd_sched_job(spsc_queue_pop(&entity->job_queue)))) {
> struct amd_sched_fence *s_fence = job->s_fence;
> amd_sched_fence_scheduled(s_fence);
> dma_fence_set_error(&s_fence->finished, -ESRCH);
> @@ -235,9 +236,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
> dma_fence_put(&s_fence->finished);
> sched->ops->free_job(job);
> }
> -
> }
> - kfifo_free(&entity->job_queue);
> }
>
> static void amd_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
> @@ -332,18 +331,21 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)
> }
>
> static struct amd_sched_job *
> -amd_sched_entity_peek_job(struct amd_sched_entity *entity)
> +amd_sched_entity_pop_job(struct amd_sched_entity *entity)
> {
> struct amd_gpu_scheduler *sched = entity->sched;
> - struct amd_sched_job *sched_job;
> + struct amd_sched_job *sched_job = to_amd_sched_job(
> + spsc_queue_peek(&entity->job_queue));
>
> - if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
> + if (!sched_job)
> return NULL;
>
> while ((entity->dependency = sched->ops->dependency(sched_job)))
> if (amd_sched_entity_add_dependency_cb(entity))
> return NULL;
>
> + sched_job->s_entity = NULL;
> + spsc_queue_pop(&entity->job_queue);
> return sched_job;
> }
>
> @@ -354,18 +356,14 @@ amd_sched_entity_peek_job(struct amd_sched_entity *entity)
> *
> * Returns true if we could submit the job.
> */
> -static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
> +static void amd_sched_entity_in(struct amd_sched_job *sched_job)
> {
> struct amd_gpu_scheduler *sched = sched_job->sched;
> struct amd_sched_entity *entity = sched_job->s_entity;
> - bool added, first = false;
> + bool first = false;
>
> spin_lock(&entity->queue_lock);
> - added = kfifo_in(&entity->job_queue, &sched_job,
> - sizeof(sched_job)) == sizeof(sched_job);
> -
> - if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
> - first = true;
> + first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
>
> spin_unlock(&entity->queue_lock);
>
> @@ -377,7 +375,6 @@ static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
> spin_unlock(&entity->rq_lock);
> amd_sched_wakeup(sched);
> }
> - return added;
> }
>
> /* job_finish is called after hw fence signaled
> @@ -514,11 +511,8 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
> */
> void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
> {
> - struct amd_sched_entity *entity = sched_job->s_entity;
> -
> trace_amd_sched_job(sched_job);
> - wait_event(entity->sched->job_scheduled,
> - amd_sched_entity_in(sched_job));
> + amd_sched_entity_in(sched_job);
To save a few loc amd_sched_entity_in() can now be completely merged
into this function.
Apart from that the patch looks really good to me and is Reviewed-by:
Christian König <christian.koenig at amd.com>.
Ping me internally for new work if you now have time again.
Thanks,
Christian.
> }
>
> /* init a sched_job with basic field */
> @@ -611,7 +605,7 @@ static int amd_sched_main(void *param)
> {
> struct sched_param sparam = {.sched_priority = 1};
> struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
> - int r, count;
> + int r;
>
> sched_setscheduler(current, SCHED_FIFO, &sparam);
>
> @@ -629,7 +623,7 @@ static int amd_sched_main(void *param)
> if (!entity)
> continue;
>
> - sched_job = amd_sched_entity_peek_job(entity);
> + sched_job = amd_sched_entity_pop_job(entity);
> if (!sched_job)
> continue;
>
> @@ -656,9 +650,6 @@ static int amd_sched_main(void *param)
> amd_sched_process_job(NULL, &s_fence->cb);
> }
>
> - count = kfifo_out(&entity->job_queue, &sched_job,
> - sizeof(sched_job));
> - WARN_ON(count != sizeof(sched_job));
> wake_up(&sched->job_scheduled);
> }
> return 0;
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 3f75b45..8844ce7 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -26,6 +26,7 @@
>
> #include <linux/kfifo.h>
> #include <linux/dma-fence.h>
> +#include "spsc_queue.h"
>
> struct amd_gpu_scheduler;
> struct amd_sched_rq;
> @@ -56,7 +57,7 @@ struct amd_sched_entity {
> struct amd_gpu_scheduler *sched;
>
> spinlock_t queue_lock;
> - struct kfifo job_queue;
> + struct spsc_queue job_queue;
>
> atomic_t fence_seq;
> uint64_t fence_context;
> @@ -87,6 +88,7 @@ struct amd_sched_fence {
> };
>
> struct amd_sched_job {
> + struct spsc_node queue_node;
> struct amd_gpu_scheduler *sched;
> struct amd_sched_entity *s_entity;
> struct amd_sched_fence *s_fence;
More information about the amd-gfx
mailing list