[PATCH 1/3] drm/scheduler: implement hw time accounting

Luben Tuikov luben.tuikov at amd.com
Thu Jun 15 18:45:32 UTC 2023


On 2023-06-15 07:56, Christian König wrote:
> Multiple drivers came up with the requirement to measure how
> much time each submission spend on the hw.

"spends"

> 
> A previous attempt of accounting this had to be reverted because
> hw submissions can live longer than the entity originally
> issuing them.
> 
> Amdgpu on the other hand solves this task by keeping track of
> all the submissions and calculating how much time they have used
> on demand.
> 
> Move this approach over into the scheduler to provide an easy to
> use interface for all drivers.

Yeah, that'd be helpful.

> 
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>  drivers/gpu/drm/scheduler/sched_entity.c | 89 ++++++++++++++++++++++--
>  drivers/gpu/drm/scheduler/sched_fence.c  | 19 +++++
>  include/drm/gpu_scheduler.h              | 29 ++++++++
>  3 files changed, 133 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 68e807ae136a..67307022a505 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -62,7 +62,9 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>  			  unsigned int num_sched_list,
>  			  atomic_t *guilty)
>  {
> -	if (!(entity && sched_list && (num_sched_list == 0 || sched_list[0])))
> +	unsigned int i, num_submissions;
> +
> +	if (!entity || !sched_list)
>  		return -EINVAL;
>  
>  	memset(entity, 0, sizeof(struct drm_sched_entity));
> @@ -75,9 +77,6 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>  	RCU_INIT_POINTER(entity->last_scheduled, NULL);
>  	RB_CLEAR_NODE(&entity->rb_tree_node);
>  
> -	if(num_sched_list)
> -		entity->rq = &sched_list[0]->sched_rq[entity->priority];
> -
>  	init_completion(&entity->entity_idle);
>  
>  	/* We start in an idle state. */
> @@ -88,11 +87,68 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>  
>  	atomic_set(&entity->fence_seq, 0);
>  	entity->fence_context = dma_fence_context_alloc(2);
> +	spin_lock_init(&entity->accounting_lock);
> +
> +	/* We need to be able to init even unused entities */
> +	if (!num_sched_list)
> +		return 0;
> +
> +	entity->rq = &sched_list[0]->sched_rq[entity->priority];
> +
> +	num_submissions = 0;
> +	for (i = 0; i < num_sched_list; ++i) {
> +		if (!sched_list[i])
> +			return -EINVAL;
> +
> +		num_submissions = max(num_submissions,
> +				      sched_list[i]->hw_submission_limit);
> +	}
> +
> +	/* FIXME: Entity initialized before the scheduler. */
> +	if (!num_submissions)
> +		return 0;
> +
> +	entity->num_hw_submissions = num_submissions;
> +	entity->hw_submissions = kcalloc(num_submissions, sizeof(void *),
> +					 GFP_KERNEL);
> +	if (!entity->hw_submissions)
> +		return -ENOMEM;
>  
>  	return 0;
>  }
>  EXPORT_SYMBOL(drm_sched_entity_init);
>  
> +/**
> + * drm_sched_entity_time_spend - Accumulated hw time used by this entity

Use the adjective form, "time spent":

drm_sched_entity_time_spent

> + * @entity: scheduler entity to check
> + *
> + * Get the current accumulated hw time used by all submissions made through this
> + * entity.
> + */
> +ktime_t drm_sched_entity_time_spend(struct drm_sched_entity *entity)

"time_spend" --> "time_spent"

drm_sched_entity_time_spent

> +{
> +	ktime_t result;
> +	unsigned int i;
> +
> +	if (!entity->num_hw_submissions)
> +		return ns_to_ktime(0);
> +
> +	spin_lock(&entity->accounting_lock);
> +	result = entity->hw_time_used;
> +	for (i = 0; i < entity->num_hw_submissions; ++i) {
> +		struct drm_sched_fence *fence = entity->hw_submissions[i];
> +
> +		if (!fence)
> +			continue;
> +
> +		result = ktime_add(result, drm_sched_fence_time_spend(fence));
> +	}
> +	spin_unlock(&entity->accounting_lock);
> +
> +	return result;
> +}
> +EXPORT_SYMBOL(drm_sched_entity_time_spend);

This is a good show-and-tell. Ideally we'd also add into entity->hw_time_used,
so that that quantity itself gets updated. As it stands, the function returns
result = "new time spent" + entity->hw_time_used, so we'd want to make sure
"result" is accounted for somewhere more permanent. I'd imagine we'd develop
this function further once it lands in the tree; a rough sketch of what I
mean is below.
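
Untested and purely illustrative (assuming the _spent renames), but something
along these lines, where finished fences are folded into hw_time_used and
their slots retired, so only still-running fences contribute a transient
delta:

ktime_t drm_sched_entity_time_spent(struct drm_sched_entity *entity)
{
	ktime_t result;
	unsigned int i;

	if (!entity->num_hw_submissions)
		return ns_to_ktime(0);

	spin_lock(&entity->accounting_lock);
	for (i = 0; i < entity->num_hw_submissions; ++i) {
		struct drm_sched_fence *fence = entity->hw_submissions[i];

		if (!fence)
			continue;

		if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
			     &fence->finished.flags)) {
			/* Finished: account permanently and retire the slot */
			entity->hw_time_used =
				ktime_add(entity->hw_time_used,
					  drm_sched_fence_time_spent(fence));
			entity->hw_submissions[i] = NULL;
			dma_fence_put(&fence->scheduled);
		}
	}

	result = entity->hw_time_used;
	for (i = 0; i < entity->num_hw_submissions; ++i) {
		struct drm_sched_fence *fence = entity->hw_submissions[i];

		/* Still running: only add to the snapshot we return */
		if (fence)
			result = ktime_add(result,
					   drm_sched_fence_time_spent(fence));
	}
	spin_unlock(&entity->accounting_lock);

	return result;
}

(The retired NULL slots would of course have to stay tolerated by pop_job
and entity_fini.)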

> +
>  /**
>   * drm_sched_entity_modify_sched - Modify sched of an entity
>   * @entity: scheduler entity to init
> @@ -288,6 +344,8 @@ EXPORT_SYMBOL(drm_sched_entity_flush);
>   */
>  void drm_sched_entity_fini(struct drm_sched_entity *entity)
>  {
> +	unsigned int i;
> +
>  	/*
>  	 * If consumption of existing IBs wasn't completed. Forcefully remove
>  	 * them here. Also makes sure that the scheduler won't touch this entity
> @@ -303,6 +361,9 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
>  
>  	dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
>  	RCU_INIT_POINTER(entity->last_scheduled, NULL);
> +	for (i = 0; i < entity->num_hw_submissions; ++i)
> +		dma_fence_put(&entity->hw_submissions[i]->scheduled);
> +	kfree(entity->hw_submissions);
>  }
>  EXPORT_SYMBOL(drm_sched_entity_fini);
>  
> @@ -427,6 +488,8 @@ drm_sched_job_dependency(struct drm_sched_job *job,
>  struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>  {
>  	struct drm_sched_job *sched_job;
> +	struct drm_sched_fence *fence;
> +	unsigned int idx;
>  
>  	sched_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
>  	if (!sched_job)
> @@ -475,6 +538,24 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>  	 */
>  	sched_job->entity = NULL;
>  
> +	if (entity->num_hw_submissions) {
> +		fence = sched_job->s_fence;
> +		dma_fence_get(&fence->scheduled);

Okay, you're doing a get-op on the "scheduled"-fence of
a job which is about to be started. [1]
Presumably, temporally, the "scheduled"-fence
gets signalled, to mark that the job has started, sometime after this function.
> +		idx = fence->scheduled.seqno;
> +		idx &= entity->num_hw_submissions - 1;

Since num_hw_submissions isn't guaranteed to be a power of 2, the mask can
skip slots or map distinct seqnos onto the same slot. We can mitigate this
here by indexing with the remainder of integer division,
		idx = fence->scheduled.seqno % entity->num_hw_submissions;

This gives you a uniform walk over hw_submission[], provided the "seqno"
values are dense/continuous and don't jump by num_hw_submissions or more.
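
Alternatively, if the mask is there for speed, the init path could round the
slot count up to a power of two so the mask stays correct -- just a sketch,
using roundup_pow_of_two() from <linux/log2.h>:

	num_submissions = 0;
	for (i = 0; i < num_sched_list; ++i) {
		if (!sched_list[i])
			return -EINVAL;

		num_submissions = max(num_submissions,
				      sched_list[i]->hw_submission_limit);
	}

	/* Keep "seqno & (num_hw_submissions - 1)" valid for any limit */
	if (num_submissions)
		num_submissions = roundup_pow_of_two(num_submissions);

The cost is a somewhat larger array, but the indexing stays a single AND.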

> +
> +		spin_lock(&entity->accounting_lock);
> +		swap(fence, entity->hw_submissions[idx]);
> +		if (fence) {

This here needs a comment and an explanation of the intended purpose.
You're swapping fence with hw_submission[idx], thus assigning the fence
whose job is to be started into hw_submission[idx] and assigning
to fence the old value of hw_submission[idx].

Now "fence" is either NULL (if that's the first time around) or
the old "seqno" which mapped to that index. (Which would be
seqno - num_hw_submissions, as you should use modulus above.)

> +			ktime_t spend = drm_sched_fence_time_spend(fence);

Change to the adjective:
			ktime_t spent = drm_sched_fence_time_spent(fence);

> +
> +			entity->hw_time_used = ktime_add(entity->hw_time_used,
> +							 spend);
> +			dma_fence_put(&fence->scheduled);

Are you temporally relying on the fact that you did a get-op in [1] above?
And are you doing that get-op to keep the fence around until you get
another seqno which maps to the same slot (thus you'll need seqno to increase
by at least num_hw_submissions _and_ pop_job to be called for it), so
that you can account for its time spent here?

So, unless seqno advances by num_hw_submissions, _and_ pop_job is called
for it, we'd never put-op the "scheduled"-fence (and account for time spent)?
(Which implies that we need a consistent feed of jobs in order to put-op
the last num_hw_submissions - 1 "scheduled"-fences.)

If you're kref-holding the fence, then wouldn't we only want to account for
it once it has completed, in drm_sched_fence_time_spent()?
And we'd somehow want to guarantee that it reaches completion/done
status by the time we wrap around num_hw_submissions and the newly mapped
entry maps to a done fence... Although, come to think of it, if we're
submitting the num_hw_submissions-th job then we're not interested
in the 0-th job since the hw cannot hold it anymore. So, I guess
that seems fine, but low-level driver writers should be aware of this.

It would appear that the snippet of code here relies on the fact that
by the time we reach seqno + num_hw_submissions, seqno is done, or,
more correctly put, no longer "interesting" to us.
Which is an okay assumption, since the hardware cannot hold more
than num_hw_submissions jobs... or can it? :-)

So, whether it's done or not, we don't care; we just care to account
for it when we "push it out of the hardware" by virtue of now
having num_hw_submissions newer jobs in.

So, this snippet of code here needs very lucid comments in [1],
also before the swap(), and also after the if (fence).
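
Roughly something along these lines, as a sketch only (with the modulus and
the "_spent" rename applied):

	if (entity->num_hw_submissions) {
		fence = sched_job->s_fence;

		/* [1] Take a reference so the scheduled fence stays around
		 * until a later submission reuses this slot and its hw time
		 * is accounted for below.
		 */
		dma_fence_get(&fence->scheduled);

		/* Treat hw_submissions[] as a ring buffer indexed by seqno */
		idx = fence->scheduled.seqno % entity->num_hw_submissions;

		spin_lock(&entity->accounting_lock);
		/* Park the new submission in the slot and take out whatever
		 * occupied it before; that older submission has been followed
		 * by at least num_hw_submissions newer jobs, so the hw no
		 * longer holds it and it is no longer interesting to us.
		 */
		swap(fence, entity->hw_submissions[idx]);
		if (fence) {
			/* Fold the evicted submission's hw time into the
			 * entity's running total and drop the reference
			 * taken in [1].
			 */
			entity->hw_time_used =
				ktime_add(entity->hw_time_used,
					  drm_sched_fence_time_spent(fence));
			dma_fence_put(&fence->scheduled);
		}
		spin_unlock(&entity->accounting_lock);
	}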

I'm concerned that if we don't get new jobs, the last num_hw_submissions - 1
"scheduled"-fences would lag behind, since no put-op would be called
for them until new jobs arrive, which could be an indeterminate time later.

> +		}
> +		spin_unlock(&entity->accounting_lock);
> +	}
> +
>  	return sched_job;
>  }
>  
> diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
> index ef120475e7c6..9abd076923f5 100644
> --- a/drivers/gpu/drm/scheduler/sched_fence.c
> +++ b/drivers/gpu/drm/scheduler/sched_fence.c
> @@ -60,6 +60,25 @@ void drm_sched_fence_finished(struct drm_sched_fence *fence, int result)
>  	dma_fence_signal(&fence->finished);
>  }
>  
> +/**
> + * drm_sched_fence_time_spend - calculate time spend on the hw
> + * @fence: fence to use
> + *
> + * Calculate how much time this fence has spend using the hw.
> + */
> +ktime_t drm_sched_fence_time_spend(struct drm_sched_fence *fence)

Use the adjective form:

ktime_t drm_sched_fence_time_spent(...

> +{
> +	/* When the fence is not even scheduled, it can't have spend time */

"it can't have time spent (in the hardware)"

> +	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->scheduled.flags))
> +		return ns_to_ktime(0);
> +
> +	/* When it is still running account how much already spend */

/* If it is still running, then account for time spent. */

> +	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->finished.flags))
> +		return ktime_sub(ktime_get(), fence->scheduled.timestamp);
> +

If someone calls this more than once while the same fence is alive
and its task is still being worked on by the GPU, and they then add the
result of this function to some counter for the fence or its container
entity, the time spent on this fence would be over-accounted.

So while this function itself is fine, the way its result is used can
lead to over-accounting, so we should be careful how we use it.

I can imagine that if the fence is still alive we don't accumulate;
or that if it is still alive we just assign the new value; or that we
subtract the time of the last measurement from the current one (a bit more
advanced, and this is what I used for my own accounting when we had "elapsed_ns",
but I digress...). A sketch of that last idea is below.
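
Purely to illustrate the "delta since the last measurement" idea --
"last_sample" is a hypothetical ktime_t field added to drm_sched_fence just
for this sketch, and I'm assuming the callers already serialize on
entity->accounting_lock:

ktime_t drm_sched_fence_time_spent(struct drm_sched_fence *fence)
{
	ktime_t start, end;

	/* Not even scheduled yet, so no time spent in the hardware */
	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->scheduled.flags))
		return ns_to_ktime(0);

	/* Never account anything before the point we last sampled */
	start = ktime_after(fence->last_sample, fence->scheduled.timestamp) ?
		fence->last_sample : fence->scheduled.timestamp;

	/* Still running: report only the delta since the last sample */
	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->finished.flags)) {
		end = ktime_get();
		fence->last_sample = end;
		return ktime_sub(end, start);
	}

	/* Finished: account the remaining slice up to the finish time */
	end = fence->finished.timestamp;
	fence->last_sample = end;
	return ktime_sub(end, start);
}

That way repeated calls never hand out the same interval twice.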

> +	return ktime_sub(fence->finished.timestamp, fence->scheduled.timestamp);
> +}
> +
>  static const char *drm_sched_fence_get_driver_name(struct dma_fence *fence)
>  {
>  	return "drm_sched";
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index e95b4837e5a3..7f9296202841 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -239,6 +239,33 @@ struct drm_sched_entity {
>  	 */
>  	struct rb_node			rb_tree_node;
>  
> +	/**
> +	 * @accounting_lock:
> +	 *
> +	 * Protecting the array of hw submissions and the time spend
> +	 */
> +	spinlock_t			accounting_lock;
> +
> +	/**
> +	 * @hw_time_used:
> +	 *
> +	 * how much hw time previous submissions have already used
> +	 */
> +	ktime_t				hw_time_used;

This needs to age, and we can add aging code once this lands
in the tree, so no need to worry about it now.
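
For later reference, the kind of thing I have in mind (a sketch; "last_decay"
and the half-life value are made up for illustration):

	/* Periodically decay the accumulated time so that old activity
	 * stops dominating the statistic.
	 */
	ktime_t now = ktime_get();

	spin_lock(&entity->accounting_lock);
	if (ktime_ms_delta(now, entity->last_decay) > 500) {
		entity->hw_time_used =
			ns_to_ktime(ktime_to_ns(entity->hw_time_used) / 2);
		entity->last_decay = now;
	}
	spin_unlock(&entity->accounting_lock);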

> +
> +	/**
> +	 * @max_hw_submissions:
> +	 *
> +	 * Maximum number of submissions currently in worked on bye the hw
> +	 */
> +	unsigned int			num_hw_submissions;
> +
> +	/**
> +	 * @hw_submissions:
> +	 *
> +	 * Scheduler fences of the hw submissions in flight
> +	 */
> +	struct drm_sched_fence		**hw_submissions;

Perhaps just use singular, to mean:

	hw_submission[i] --> pointer to the i-th hw submission fence.

>  };
>  
>  /**
> @@ -572,6 +599,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>  			  struct drm_gpu_scheduler **sched_list,
>  			  unsigned int num_sched_list,
>  			  atomic_t *guilty);
> +ktime_t drm_sched_entity_time_spend(struct drm_sched_entity *entity);

drm_sched_entity_time_spent

>  long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout);
>  void drm_sched_entity_fini(struct drm_sched_entity *entity);
>  void drm_sched_entity_destroy(struct drm_sched_entity *entity);
> @@ -593,6 +621,7 @@ void drm_sched_fence_free(struct drm_sched_fence *fence);
>  
>  void drm_sched_fence_scheduled(struct drm_sched_fence *fence);
>  void drm_sched_fence_finished(struct drm_sched_fence *fence, int result);
> +ktime_t drm_sched_fence_time_spend(struct drm_sched_fence *fence);

drm_sched_fence_time_spent

>  
>  unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
>  void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,

Regards,
Luben


