[PATCH 1/4] drm/scheduler: implement hardware time accounting
Christian König
christian.koenig at amd.com
Tue Jul 2 08:56:03 UTC 2024
On 02.07.24 at 10:42, Tvrtko Ursulin wrote:
>
> Hi,
>
> A few questions below.
>
> On 01/07/2024 18:14, Lucas Stach wrote:
>> From: Christian König <ckoenig.leichtzumerken at gmail.com>
>>
>> Multiple drivers came up with the requirement to measure how
>> much runtime each entity accumulated on the HW.
>>
>> A previous attempt at accounting this had to be reverted because
>> HW submissions can have a lifetime exceeding that of the entity
>> originally issuing them.
>>
>> Amdgpu, on the other hand, solves this task by keeping track of
>> all the submissions and calculating on demand how much time they
>> have used.
>>
>> Move this approach over into the scheduler to provide an
>> easy-to-use interface for all drivers.
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> Signed-off-by: Lucas Stach <l.stach at pengutronix.de>
>> ---
>> v2:
>> - rebase to v6.10-rc1
>> - fix for non-power-of-two number of HW submissions
>> - add comment explaining the logic behind the fence tracking array
>> - rename some functions and fix documentation
>> ---
>> drivers/gpu/drm/scheduler/sched_entity.c | 82 +++++++++++++++++++++++-
>> drivers/gpu/drm/scheduler/sched_fence.c | 19 ++++++
>> include/drm/gpu_scheduler.h | 31 +++++++++
>> 3 files changed, 131 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
>> index 58c8161289fe..d678d0b9b29e 100644
>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>> @@ -62,7 +62,9 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>> unsigned int num_sched_list,
>> atomic_t *guilty)
>> {
>> - if (!(entity && sched_list && (num_sched_list == 0 || sched_list[0])))
>> + unsigned int i, num_submissions = 0;
>> +
>> + if (!entity || !sched_list)
>> return -EINVAL;
>> memset(entity, 0, sizeof(struct drm_sched_entity));
>> @@ -98,6 +100,11 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>> (s32) DRM_SCHED_PRIORITY_KERNEL);
>> }
>> entity->rq = sched_list[0]->sched_rq[entity->priority];
>> +
>> + for (i = 0; i < num_sched_list; ++i) {
>> + num_submissions = max(num_submissions,
>> + sched_list[i]->credit_limit);
>> + }
>
> Does this work (in concept and naming) for all drivers, now that the
> introduction of credits has broken the 1:1 mapping between jobs and hw
> "ring" capacity?
>
> How big is the array for different drivers?
Oh, good point. I don't really know the answer.
>
>> }
>> init_completion(&entity->entity_idle);
>> @@ -110,11 +117,52 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>> atomic_set(&entity->fence_seq, 0);
>> entity->fence_context = dma_fence_context_alloc(2);
>> + spin_lock_init(&entity->accounting_lock);
>> +
>> + if (!num_submissions)
>> + return 0;
>> +
>> + entity->max_hw_submissions = num_submissions;
>> + entity->hw_submissions = kcalloc(num_submissions, sizeof(void *),
>> + GFP_KERNEL);
>> + if (!entity->hw_submissions)
>> + return -ENOMEM;
>> return 0;
>> }
>> EXPORT_SYMBOL(drm_sched_entity_init);
>> +/**
>> + * drm_sched_entity_time_spent - Accumulated HW runtime used by this entity
>> + * @entity: scheduler entity to check
>> + *
>> + * Get the current accumulated HW runtime used by all submissions made through
>> + * this entity.
>> + */
>> +ktime_t drm_sched_entity_time_spent(struct drm_sched_entity *entity)
>> +{
>> + ktime_t result;
>> + unsigned int i;
>> +
>> + if (!entity->max_hw_submissions)
>> + return ns_to_ktime(0);
>> +
>> + spin_lock(&entity->accounting_lock);
>> + result = entity->hw_time_used;
>> + for (i = 0; i < entity->max_hw_submissions; ++i) {
>> + struct drm_sched_fence *fence = entity->hw_submissions[i];
>> +
>> + if (!fence)
>> + continue;
>> +
>> + result = ktime_add(result, drm_sched_fence_get_runtime(fence));
>
> Does this end up counting from when jobs have been submitted to the hw
> backend, even though they may not actually be executing yet?
>
> Say a driver configures a backend N deep and it is filled with N jobs
> which in actuality execute sequentially, one at a time. Wouldn't the
> time reported here then over-account by a series such as
> (job[0].finish - job[0].submit) + ... + (job[N-1].finish - job[N-1].submit)?
>
> Or in other words, if one submits N jobs to a ring serving a 1-wide hw
> backend, will we see "N*100%" utilisation instead of "100%" when
> sampling while the first job is still executing and the rest are queued
> behind it?
Oh, good question. Yeah, I think that can happen.
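
To make the failure mode concrete, here is a tiny userspace sketch of the
scenario Tvrtko describes (illustration only, nothing below is part of the
patch): with a 1-wide backend filled four deep and sampled while the first
job is still running, summing the per-fence times reports 20 ms of "runtime"
although the HW has only been busy for 5 ms.

/* Illustration only: naive per-fence summing on a 1-wide backend. */
#include <stdio.h>

int main(void)
{
	const int queued_jobs = 4;		/* backend filled N deep */
	const long long scheduled_ms = 0;	/* all fences scheduled at t = 0 */
	const long long sample_ms = 5;		/* sampled while job 0 still runs */
	long long reported_ms = 0;
	int i;

	for (i = 0; i < queued_jobs; i++)
		reported_ms += sample_ms - scheduled_ms; /* per-fence "runtime" */

	/* Only job 0 has actually occupied the HW for 5 ms. */
	printf("reported %lld ms, actual %lld ms\n",
	       reported_ms, sample_ms - scheduled_ms);
	return 0;
}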
Regards,
Christian.
>
> Regards,
>
> Tvrtko
>
>> + }
>> + spin_unlock(&entity->accounting_lock);
>> +
>> + return result;
>> +}
>> +EXPORT_SYMBOL(drm_sched_entity_time_spent);
>> +
>> /**
>> * drm_sched_entity_modify_sched - Modify sched of an entity
>> * @entity: scheduler entity to init
>> @@ -326,6 +374,8 @@ EXPORT_SYMBOL(drm_sched_entity_flush);
>> */
>> void drm_sched_entity_fini(struct drm_sched_entity *entity)
>> {
>> + unsigned int i;
>> +
>> /*
>> * If consumption of existing IBs wasn't completed. Forcefully remove
>> * them here. Also makes sure that the scheduler won't touch this entity
>> @@ -341,6 +391,9 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
>> dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
>> RCU_INIT_POINTER(entity->last_scheduled, NULL);
>> + for (i = 0; i < entity->max_hw_submissions; ++i)
>> + dma_fence_put(&entity->hw_submissions[i]->scheduled);
>> + kfree(entity->hw_submissions);
>> }
>> EXPORT_SYMBOL(drm_sched_entity_fini);
>> @@ -522,6 +575,33 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>> */
>> sched_job->entity = NULL;
>> + if (entity->max_hw_submissions) {
>> + struct drm_sched_fence *fence = sched_job->s_fence;
>> + unsigned int idx = fence->scheduled.seqno;
>> +
>> + dma_fence_get(&fence->scheduled);
>> + idx %= entity->max_hw_submissions;
>> +
>> + spin_lock(&entity->accounting_lock);
>> + /*
>> + * The fence seqno is dense and monotonically increasing. By
>> + * cycling through an array sized to match the maximum number of
>> + * submissions queued in the HW we can be sure that once we need
>> + * to reuse a slot the fence stored in this slot refers to a
>> + * retired submission and we can safely sum up the accumulated
>> + * runtime and dispose of the fence.
>> + */
>> + swap(fence, entity->hw_submissions[idx]);
>> + if (fence) {
>> + ktime_t runtime = drm_sched_fence_get_runtime(fence);
>> +
>> + entity->hw_time_used = ktime_add(entity->hw_time_used,
>> + runtime);
>> + dma_fence_put(&fence->scheduled);
>> + }
>> + spin_unlock(&entity->accounting_lock);
>> + }
>> +
>> return sched_job;
>> }
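
As a side note on the slot reuse comment above, the index math can be
sanity-checked in isolation (illustration only, with a made-up limit of four
submissions): a slot is only revisited after max_hw_submissions newer seqnos
have been handed out, and since no more than that many submissions can be in
flight, the fence found in a reused slot has necessarily retired.

/* Illustration only: seqno -> slot mapping with max_hw_submissions = 4. */
#include <stdio.h>

int main(void)
{
	const unsigned int max_hw_submissions = 4;
	unsigned int seqno;

	/* Slot 1 is first reused at seqno 5, i.e. four submissions later. */
	for (seqno = 1; seqno <= 9; seqno++)
		printf("seqno %u -> slot %u\n", seqno, seqno % max_hw_submissions);
	return 0;
}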
>> diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
>> index 0f35f009b9d3..55981ada1829 100644
>> --- a/drivers/gpu/drm/scheduler/sched_fence.c
>> +++ b/drivers/gpu/drm/scheduler/sched_fence.c
>> @@ -82,6 +82,25 @@ void drm_sched_fence_finished(struct drm_sched_fence *fence, int result)
>> dma_fence_signal(&fence->finished);
>> }
>> +/**
>> + * drm_sched_fence_get_runtime - accumulated runtime on HW
>> + * @fence: fence
>> + *
>> + * Calculate how much runtime this fence has accumulated on the HW.
>> + */
>> +ktime_t drm_sched_fence_get_runtime(struct drm_sched_fence *fence)
>> +{
>> + /* When the fence is not scheduled, it can't have accumulated runtime */
>> + if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->scheduled.flags))
>> + return ns_to_ktime(0);
>> +
>> + /* When it is still running, calculate runtime until now */
>> + if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->finished.flags))
>> + return ktime_sub(ktime_get(), fence->scheduled.timestamp);
>> +
>> + return ktime_sub(fence->finished.timestamp, fence->scheduled.timestamp);
>> +}
>> +
>> static const char *drm_sched_fence_get_driver_name(struct dma_fence *fence)
>> {
>> return "drm_sched";
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 5acc64954a88..52bcff324a92 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -238,6 +238,35 @@ struct drm_sched_entity {
>> */
>> struct rb_node rb_tree_node;
>> + /**
>> + * @accounting_lock:
>> + *
>> + * Protects the array of fences tracking the in-flight HW submissions
>> + * and the accumulator counter.
>> + */
>> + spinlock_t accounting_lock;
>> +
>> + /**
>> + * @hw_time_used:
>> + *
>> + * How much HW runtime has been accumulated by retired submissions
>> + * from this entity.
>> + */
>> + ktime_t hw_time_used;
>> +
>> + /**
>> + * @max_hw_submissions:
>> + *
>> + * Maximum number of submissions queued in the HW.
>> + */
>> + unsigned int max_hw_submissions;
>> +
>> + /**
>> + * @hw_submissions:
>> + *
>> + * Scheduler fences of the HW submissions in flight.
>> + */
>> + struct drm_sched_fence **hw_submissions;
>> };
>> /**
>> @@ -600,6 +629,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>> struct drm_gpu_scheduler **sched_list,
>> unsigned int num_sched_list,
>> atomic_t *guilty);
>> +ktime_t drm_sched_entity_time_spent(struct drm_sched_entity *entity);
>> long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout);
>> void drm_sched_entity_fini(struct drm_sched_entity *entity);
>> void drm_sched_entity_destroy(struct drm_sched_entity *entity);
>> @@ -620,6 +650,7 @@ void drm_sched_fence_free(struct drm_sched_fence *fence);
>> void drm_sched_fence_scheduled(struct drm_sched_fence *fence,
>> struct dma_fence *parent);
>> void drm_sched_fence_finished(struct drm_sched_fence *fence, int result);
>> +ktime_t drm_sched_fence_get_runtime(struct drm_sched_fence *fence);
>> unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
>> void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
>>
>> base-commit: 1613e604df0cd359cf2a7fbd9be7a0bcfacfabd0
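
For context, a minimal sketch of how a driver might consume the new helper,
e.g. to report per-client engine time through fdinfo. The foo_* names and the
file-private layout are hypothetical and not part of this patch; only
drm_sched_entity_time_spent() is:

/* Hypothetical driver fdinfo callback, sketch only. */
static void foo_show_fdinfo(struct drm_printer *p, struct drm_file *file)
{
	struct foo_file_priv *fpriv = file->driver_priv;	/* hypothetical */
	ktime_t spent = drm_sched_entity_time_spent(&fpriv->sched_entity);

	drm_printf(p, "drm-engine-gfx:\t%llu ns\n",
		   (unsigned long long)ktime_to_ns(spent));
}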