[PATCH v3 11/13] drm/sched: Waiting for pending jobs to complete in scheduler kill
Matthew Brost
matthew.brost at intel.com
Tue Sep 12 02:16:13 UTC 2023
Wait for pending jobs to be complete before signaling queued jobs. This
ensures dma-fence signaling order correct and also ensures the entity is
not running on the hardware after drm_sched_entity_flush or
drm_sched_entity_fini returns.
Signed-off-by: Matthew Brost <matthew.brost at intel.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +-
drivers/gpu/drm/scheduler/sched_entity.c | 7 ++-
drivers/gpu/drm/scheduler/sched_main.c | 50 ++++++++++++++++++---
include/drm/gpu_scheduler.h | 18 ++++++++
4 files changed, 70 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index fb5dad687168..7835c0da65c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1873,7 +1873,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
if (dma_fence_is_signaled(&s_job->s_fence->finished)) {
/* remove job from ring_mirror_list */
- list_del_init(&s_job->list);
+ drm_sched_remove_pending_job(s_job);
sched->ops->free_job(s_job);
continue;
}
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 1dec97caaba3..37557fbb96d0 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -104,9 +104,11 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
}
init_completion(&entity->entity_idle);
+ init_completion(&entity->jobs_done);
- /* We start in an idle state. */
+ /* We start in an idle and jobs done state. */
complete_all(&entity->entity_idle);
+ complete_all(&entity->jobs_done);
spin_lock_init(&entity->rq_lock);
spsc_queue_init(&entity->job_queue);
@@ -256,6 +258,9 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity)
/* Make sure this entity is not used by the scheduler at the moment */
wait_for_completion(&entity->entity_idle);
+ /* Make sure all pending jobs are done */
+ wait_for_completion(&entity->jobs_done);
+
/* The entity is guaranteed to not be used by the scheduler */
prev = rcu_dereference_check(entity->last_scheduled, true);
dma_fence_get(prev);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 689fb6686e01..ed6f5680793a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -510,12 +510,52 @@ void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
}
EXPORT_SYMBOL(drm_sched_resume_timeout);
+/**
+ * drm_sched_add_pending_job - Add pending job to scheduler
+ *
+ * @job: scheduler job to add
+ * @tail: add to tail of pending list
+ */
+void drm_sched_add_pending_job(struct drm_sched_job *job, bool tail)
+{
+ struct drm_gpu_scheduler *sched = job->sched;
+ struct drm_sched_entity *entity = job->entity;
+
+ lockdep_assert_held(&sched->job_list_lock);
+
+ if (tail)
+ list_add_tail(&job->list, &sched->pending_list);
+ else
+ list_add(&job->list, &sched->pending_list);
+ if (!entity->pending_job_count++)
+ reinit_completion(&entity->jobs_done);
+}
+EXPORT_SYMBOL(drm_sched_add_pending_job);
+
+/**
+ * drm_sched_remove_pending_job - Remove pending job from` scheduler
+ *
+ * @job: scheduler job to remove
+ */
+void drm_sched_remove_pending_job(struct drm_sched_job *job)
+{
+ struct drm_gpu_scheduler *sched = job->sched;
+ struct drm_sched_entity *entity = job->entity;
+
+ lockdep_assert_held(&sched->job_list_lock);
+
+ list_del_init(&job->list);
+ if (!--entity->pending_job_count)
+ complete_all(&entity->jobs_done);
+}
+EXPORT_SYMBOL(drm_sched_remove_pending_job);
+
static void drm_sched_job_begin(struct drm_sched_job *s_job)
{
struct drm_gpu_scheduler *sched = s_job->sched;
spin_lock(&sched->job_list_lock);
- list_add_tail(&s_job->list, &sched->pending_list);
+ drm_sched_add_pending_job(s_job, true);
spin_unlock(&sched->job_list_lock);
}
@@ -538,7 +578,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
* drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
* is parked at which point it's safe.
*/
- list_del_init(&job->list);
+ drm_sched_remove_pending_job(job);
spin_unlock(&sched->job_list_lock);
status = job->sched->ops->timedout_job(job);
@@ -589,7 +629,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
* Add at the head of the queue to reflect it was the earliest
* job extracted.
*/
- list_add(&bad->list, &sched->pending_list);
+ drm_sched_add_pending_job(bad, false);
/*
* Iterate the job list from later to earlier one and either deactive
@@ -611,7 +651,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
* Locking here is for concurrent resume timeout
*/
spin_lock(&sched->job_list_lock);
- list_del_init(&s_job->list);
+ drm_sched_remove_pending_job(s_job);
spin_unlock(&sched->job_list_lock);
/*
@@ -1066,7 +1106,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
/* remove job from pending_list */
- list_del_init(&job->list);
+ drm_sched_remove_pending_job(job);
/* cancel this job's TO timer */
cancel_delayed_work(&sched->work_tdr);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index b7b818cd81b6..7c628f36fe78 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -233,6 +233,21 @@ struct drm_sched_entity {
*/
struct completion entity_idle;
+ /**
+ * @pending_job_count:
+ *
+ * Number of pending jobs.
+ */
+ unsigned int pending_job_count;
+
+ /**
+ * @jobs_done:
+ *
+ * Signals when entity has no pending jobs, used to sequence entity
+ * cleanup in drm_sched_entity_fini().
+ */
+ struct completion jobs_done;
+
/**
* @oldest_job_waiting:
*
@@ -656,4 +671,7 @@ struct drm_gpu_scheduler *
drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
unsigned int num_sched_list);
+void drm_sched_add_pending_job(struct drm_sched_job *job, bool tail);
+void drm_sched_remove_pending_job(struct drm_sched_job *job);
+
#endif
--
2.34.1
More information about the dri-devel
mailing list