[RFC 03/14] drm/sched: Implement RR via FIFO

Tvrtko Ursulin tvrtko.ursulin at igalia.com
Wed Jan 8 15:04:22 UTC 2025


On 08/01/2025 09:42, Danilo Krummrich wrote:
> On Mon, Dec 30, 2024 at 04:52:48PM +0000, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
>>
>> Round-robin being the non-default policy, and it being unclear how much it is used,
> 
> I don't know either, but would be nice if we could instead remove it.
> 
>> we can notice that it can be implemented using the FIFO data structures if
>> we only invent a fake submit timestamp which is monotonically increasing
>> inside drm_sched_rq instances.
>>
>> So instead of remembering which was the last entity the scheduler worker
>> picked, we can bump the picked one to the bottom of the tree, achieving
>> the same round-robin behaviour.
>>
>> Advantage is that we can consolidate to a single code path and remove a
>> bunch of code. Downside is that round-robin mode now needs to take the
>> locks on the job pop path, but that should not be visible.
> 
> I guess you did test both scheduler strategies with this change?

Insofar as it doesn't crash and burn, yes.
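
For anyone wanting to convince themselves of the ordering argument, below
is a minimal userspace toy (not the kernel code, just the same idea): a
min-ordered queue keyed by a monotonically increasing fake deadline
degenerates into round-robin once the picked entity is re-keyed with the
next counter value. The struct and entity names are made up for the
example; only rr_deadline mirrors the new drm_sched_rq member.

#include <stdio.h>

#define NUM_ENTITIES 3
#define NUM_PICKS 9

struct toy_entity {
	const char *name;
	unsigned long long deadline;	/* stands in for the fake submit timestamp */
};

int main(void)
{
	struct toy_entity entities[NUM_ENTITIES] = {
		{ "A", 0 }, { "B", 0 }, { "C", 0 },
	};
	unsigned long long rr_deadline = 0;
	int pick, i;

	/* Initial push: every entity gets the next fake deadline. */
	for (i = 0; i < NUM_ENTITIES; i++)
		entities[i].deadline = ++rr_deadline;

	for (pick = 0; pick < NUM_PICKS; pick++) {
		int best = 0;

		/* Stand-in for rb_first_cached(): smallest deadline wins. */
		for (i = 1; i < NUM_ENTITIES; i++)
			if (entities[i].deadline < entities[best].deadline)
				best = i;

		printf("pick %d: %s\n", pick, entities[best].name);

		/* Re-key the picked entity with the next fake deadline,
		 * i.e. bump it to the bottom of the tree.
		 */
		entities[best].deadline = ++rr_deadline;
	}

	return 0;
}

It prints A B C A B C A B C, i.e. the same ordering the removed
drm_sched_rq_select_entity_rr() aimed for, while the scheduler only ever
has to look at the first element of the tree.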

>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
>> Cc: Christian König <christian.koenig at amd.com>
>> Cc: Danilo Krummrich <dakr at redhat.com>
>> Cc: Matthew Brost <matthew.brost at intel.com>
>> Cc: Philipp Stanner <pstanner at redhat.com>
>> ---
>>   drivers/gpu/drm/scheduler/sched_entity.c | 53 ++++++++-----
>>   drivers/gpu/drm/scheduler/sched_main.c   | 99 +++---------------------
>>   include/drm/gpu_scheduler.h              |  5 +-
>>   3 files changed, 48 insertions(+), 109 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
>> index 8e910586979e..cb5f596b48b7 100644
>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>> @@ -473,9 +473,20 @@ drm_sched_job_dependency(struct drm_sched_job *job,
>>   	return NULL;
>>   }
>>   
>> +static ktime_t
>> +drm_sched_rq_get_rr_deadline(struct drm_sched_rq *rq)
>> +{
>> +	lockdep_assert_held(&rq->lock);
>> +
>> +	rq->rr_deadline = ktime_add_ns(rq->rr_deadline, 1);
>> +
>> +	return rq->rr_deadline;
>> +}
>> +
>>   struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>>   {
>> -	struct drm_sched_job *sched_job;
>> +	struct drm_sched_job *sched_job, *next_job;
>> +	struct drm_sched_rq *rq;
>>   
>>   	sched_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
>>   	if (!sched_job)
>> @@ -510,24 +521,28 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>>   	 * Update the entity's location in the min heap according to
>>   	 * the timestamp of the next job, if any.
>>   	 */
>> -	if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) {
>> -		struct drm_sched_job *next;
>> -		struct drm_sched_rq *rq;
>> +	next_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
>>   
>> -		spin_lock(&entity->lock);
>> -		rq = entity->rq;
>> -		spin_lock(&rq->lock);
>> -		next = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
>> -		if (next) {
>> -			drm_sched_rq_update_fifo_locked(entity, rq,
>> -							next->submit_ts);
>> -		} else {
>> -			drm_sched_rq_remove_fifo_locked(entity, rq);
>> -		}
>> -		spin_unlock(&rq->lock);
>> -		spin_unlock(&entity->lock);
>> +	spin_lock(&entity->lock);
>> +	rq = entity->rq;
>> +	spin_lock(&rq->lock);
>> +
>> +	if (next_job) {
>> +		ktime_t ts;
>> +
>> +		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
>> +			ts = next_job->submit_ts;
>> +		else
>> +			ts = drm_sched_rq_get_rr_deadline(rq);
>> +
>> +		drm_sched_rq_update_fifo_locked(entity, rq, ts);
>> +	} else {
>> +		drm_sched_rq_remove_fifo_locked(entity, rq);
>>   	}
>>   
>> +	spin_unlock(&rq->lock);
>> +	spin_unlock(&entity->lock);
>> +
>>   	/* Jobs and entities might have different lifecycles. Since we're
>>   	 * removing the job from the entities queue, set the jobs entity pointer
>>   	 * to NULL to prevent any future access of the entity through this job.
>> @@ -624,9 +639,9 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>>   
>>   		spin_lock(&rq->lock);
>>   		drm_sched_rq_add_entity(rq, entity);
>> -
>> -		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
>> -			drm_sched_rq_update_fifo_locked(entity, rq, submit_ts);
>> +		if (drm_sched_policy == DRM_SCHED_POLICY_RR)
>> +			submit_ts = drm_sched_rq_get_rr_deadline(rq);
>> +		drm_sched_rq_update_fifo_locked(entity, rq, submit_ts);
>>   
>>   		spin_unlock(&rq->lock);
>>   		spin_unlock(&entity->lock);
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>> index 9beb4c611988..eb22b1b7de36 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -189,7 +189,6 @@ static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
>>   	spin_lock_init(&rq->lock);
>>   	INIT_LIST_HEAD(&rq->entities);
>>   	rq->rb_tree_root = RB_ROOT_CACHED;
>> -	rq->current_entity = NULL;
>>   	rq->sched = sched;
>>   }
>>   
>> @@ -235,82 +234,13 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>   	atomic_dec(rq->sched->score);
>>   	list_del_init(&entity->list);
>>   
>> -	if (rq->current_entity == entity)
>> -		rq->current_entity = NULL;
>> -
>> -	if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
>> -		drm_sched_rq_remove_fifo_locked(entity, rq);
>> +	drm_sched_rq_remove_fifo_locked(entity, rq);
>>   
>>   	spin_unlock(&rq->lock);
>>   }
>>   
>>   /**
>> - * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>> - *
>> - * @sched: the gpu scheduler
>> - * @rq: scheduler run queue to check.
>> - *
>> - * Try to find the next ready entity.
>> - *
>> - * Return an entity if one is found; return an error-pointer (!NULL) if an
>> - * entity was ready, but the scheduler had insufficient credits to accommodate
>> - * its job; return NULL, if no ready entity was found.
>> - */
>> -static struct drm_sched_entity *
>> -drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
>> -			      struct drm_sched_rq *rq)
>> -{
>> -	struct drm_sched_entity *entity;
>> -
>> -	spin_lock(&rq->lock);
>> -
>> -	entity = rq->current_entity;
>> -	if (entity) {
>> -		list_for_each_entry_continue(entity, &rq->entities, list) {
>> -			if (drm_sched_entity_is_ready(entity)) {
>> -				/* If we can't queue yet, preserve the current
>> -				 * entity in terms of fairness.
>> -				 */
>> -				if (!drm_sched_can_queue(sched, entity)) {
>> -					spin_unlock(&rq->lock);
>> -					return ERR_PTR(-ENOSPC);
>> -				}
>> -
>> -				rq->current_entity = entity;
>> -				reinit_completion(&entity->entity_idle);
>> -				spin_unlock(&rq->lock);
>> -				return entity;
>> -			}
>> -		}
>> -	}
>> -
>> -	list_for_each_entry(entity, &rq->entities, list) {
>> -		if (drm_sched_entity_is_ready(entity)) {
>> -			/* If we can't queue yet, preserve the current entity in
>> -			 * terms of fairness.
>> -			 */
>> -			if (!drm_sched_can_queue(sched, entity)) {
>> -				spin_unlock(&rq->lock);
>> -				return ERR_PTR(-ENOSPC);
>> -			}
>> -
>> -			rq->current_entity = entity;
>> -			reinit_completion(&entity->entity_idle);
>> -			spin_unlock(&rq->lock);
>> -			return entity;
>> -		}
>> -
>> -		if (entity == rq->current_entity)
>> -			break;
>> -	}
>> -
>> -	spin_unlock(&rq->lock);
>> -
>> -	return NULL;
>> -}
>> -
>> -/**
>> - * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>> + * drm_sched_rq_select_entity - Select an entity which provides a job to run
>>    *
>>    * @sched: the gpu scheduler
>>    * @rq: scheduler run queue to check.
>> @@ -322,32 +252,29 @@ drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
>>    * its job; return NULL, if no ready entity was found.
>>    */
>>   static struct drm_sched_entity *
>> -drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched,
>> -				struct drm_sched_rq *rq)
>> +drm_sched_rq_select_entity(struct drm_gpu_scheduler *sched,
>> +			   struct drm_sched_rq *rq)
>>   {
>> +	struct drm_sched_entity *entity = NULL;
>>   	struct rb_node *rb;
>>   
>>   	spin_lock(&rq->lock);
>>   	for (rb = rb_first_cached(&rq->rb_tree_root); rb; rb = rb_next(rb)) {
>> -		struct drm_sched_entity *entity;
>> -
>>   		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>>   		if (drm_sched_entity_is_ready(entity)) {
>> -			/* If we can't queue yet, preserve the current entity in
>> -			 * terms of fairness.
>> -			 */
>>   			if (!drm_sched_can_queue(sched, entity)) {
>> -				spin_unlock(&rq->lock);
>> -				return ERR_PTR(-ENOSPC);
>> +				entity = ERR_PTR(-ENOSPC);
>> +				break;
>>   			}
>>   
>>   			reinit_completion(&entity->entity_idle);
>>   			break;
>>   		}
>> +		entity = NULL;
>>   	}
>>   	spin_unlock(&rq->lock);
>>   
>> -	return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
>> +	return entity;
>>   }
> 
> The whole diff of drm_sched_rq_select_entity_fifo() (except for the name) has
> nothing to do with the patch, does it?
> 
> If you want refactor things, please do it in a separate patch.

Well spotted, thanks. There was some re-ordering and cherry-picking of
patches from some development branches and some mistakes crept in.

Regards,

Tvrtko

>>   
>>   /**
>> @@ -1045,20 +972,18 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
>>   static struct drm_sched_entity *
>>   drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>   {
>> -	struct drm_sched_entity *entity;
>> +	struct drm_sched_entity *entity = NULL;
>>   	int i;
>>   
>>   	/* Start with the highest priority.
>>   	 */
>>   	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
>> -		entity = drm_sched_policy == DRM_SCHED_POLICY_FIFO ?
>> -			drm_sched_rq_select_entity_fifo(sched, sched->sched_rq[i]) :
>> -			drm_sched_rq_select_entity_rr(sched, sched->sched_rq[i]);
>> +		entity = drm_sched_rq_select_entity(sched, sched->sched_rq[i]);
>>   		if (entity)
>>   			break;
>>   	}
>>   
>> -	return IS_ERR(entity) ? NULL : entity;
>> +	return IS_ERR(entity) ? NULL : entity;
>>   }
>>   
>>   /**
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 978ca621cc13..db65600732b9 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -245,8 +245,7 @@ struct drm_sched_entity {
>>    * struct drm_sched_rq - queue of entities to be scheduled.
>>    *
>>    * @sched: the scheduler to which this rq belongs to.
>> - * @lock: protects @entities, @rb_tree_root and @current_entity.
>> - * @current_entity: the entity which is to be scheduled.
>> + * @lock: protects @entities, @rb_tree_root and @rr_deadline.
>>    * @entities: list of the entities to be scheduled.
>>    * @rb_tree_root: root of time based priority queue of entities for FIFO scheduling
>>    *
>> @@ -259,7 +258,7 @@ struct drm_sched_rq {
>>   
>>   	spinlock_t			lock;
>>   	/* Following members are protected by the @lock: */
>> -	struct drm_sched_entity		*current_entity;
>> +	ktime_t				rr_deadline;
>>   	struct list_head		entities;
>>   	struct rb_root_cached		rb_tree_root;
>>   };
>> -- 
>> 2.47.1
>>
> 

