[PATCH 1/2] drm/amdgpu: Convert to common fdinfo format v2

Wed May 11 09:04:26 UTC 2022

Am 11.05.22 um 10:04 schrieb Tvrtko Ursulin:
>
> On 10/05/2022 17:55, Christian König wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> Feels like you should take over owenrship since you wrote a lot of 
> new, amdgpu specific code here.

Yeah, probably.

>
>> Convert fdinfo format to one documented in drm-usage-stats.rst.
>>
>> It turned out that the existing implementation was actually completely
>> nonsense. The calculated percentages indeed represented the usage of the
>> engine, but with varying time slices.
>>
>> So 10% usage for application A could mean something completely different
>> than 10% usage for application B.
>>
>> Completely nuke that and just use the now standardized nanosecond
>> interface.
>>
>> v2: drop the documentation change for now, nuke percentage calculation
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> Cc: David M Nieto <David.Nieto at amd.com>
>> Cc: Daniel Vetter <daniel at ffwll.ch>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c    | 136 +++++++++------------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h    |   8 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c |  45 ++++---
>>   3 files changed, 88 insertions(+), 101 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index 8f0e6d93bb9c..988c5a76cad2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -183,6 +183,7 @@ static int amdgpu_ctx_init_entity(struct 
>> amdgpu_ctx *ctx, u32 hw_ip,
>>       ctx_prio = (ctx->override_priority == AMDGPU_CTX_PRIORITY_UNSET) ?
>>               ctx->init_priority : ctx->override_priority;
>>       entity->sequence = 1;
>> +    entity->time_spend = 0;
>>       hw_prio = amdgpu_ctx_get_hw_prio(ctx, hw_ip);
>>       drm_prio = amdgpu_ctx_to_drm_sched_prio(ctx_prio);
>>   @@ -666,6 +667,26 @@ int amdgpu_ctx_put(struct amdgpu_ctx *ctx)
>>       return 0;
>>   }
>>   +/* Calculate the time spend on the hw */
>> +static ktime_t amdgpu_ctx_fence_time(struct dma_fence *fence)
>> +{
>> +    struct drm_sched_fence *s_fence;
>> +
>> +    if (!fence)
>> +        return ns_to_ktime(0);
>> +
>> +    /* When the fence is not even scheduled it can't have spend time */
>> +    s_fence = to_drm_sched_fence(fence);
>> +    if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, 
>> &s_fence->scheduled.flags))
>> +        return ns_to_ktime(0);
>> +
>> +    if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, 
>> &s_fence->finished.flags))
>> +        return ktime_sub(ktime_get(), s_fence->scheduled.timestamp);
>> +
>> +    return ktime_sub(s_fence->finished.timestamp,
>> +             s_fence->scheduled.timestamp);
>> +}
>> +
>>   void amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,
>>                 struct drm_sched_entity *entity,
>>                 struct dma_fence *fence, uint64_t *handle)
>> @@ -685,6 +706,7 @@ void amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,
>>       spin_lock(&ctx->ring_lock);
>>       centity->fences[idx] = fence;
>>       centity->sequence++;
>> +    centity->time_spend += amdgpu_ctx_fence_time(other);
>>       spin_unlock(&ctx->ring_lock);
>>         dma_fence_put(other);
>> @@ -692,6 +714,42 @@ void amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,
>>           *handle = seq;
>>   }
>>   +static ktime_t amdgpu_ctx_entity_time(struct amdgpu_ctx *ctx,
>> +                      struct amdgpu_ctx_entity *centity)
>> +{
>> +    ktime_t res;
>> +    uint32_t i;
>> +
>> +    spin_lock(&ctx->ring_lock);
>> +    res = centity->time_spend;
>> +    for (i = 0; i < amdgpu_sched_jobs; i++) {
>> +        res = ktime_add(res, 
>> amdgpu_ctx_fence_time(centity->fences[i]));
>> +    }
>> +    spin_unlock(&ctx->ring_lock);
>> +    return res;
>> +}
>> +
>> +ktime_t amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr, uint32_t hwip,
>> +                 uint32_t idx)
>> +{
>> +    struct amdgpu_ctx_entity *centity;
>> +    ktime_t res = ns_to_ktime(0);
>> +    struct amdgpu_ctx *ctx;
>> +    uint32_t id;
>> +
>> +    mutex_lock(&mgr->lock);
>> +    idr_for_each_entry(&mgr->ctx_handles, ctx, id) {
>> +
>> +        centity = ctx->entities[hwip][idx];
>> +        if (!centity)
>> +            continue;
>> +
>> +        res = ktime_add(res, amdgpu_ctx_entity_time(ctx, centity));
>> +    }
>> +    mutex_unlock(&mgr->lock);
>> +    return res;
>> +}
>
> The amdgpu specific code I cannot really review - but I can ask a 
> question - do you have a concept of contexts which were used with a 
> client and then closed/abandoned, in which case would the time they 
> spent on the GPU be forgotten or preserved in this accounting flow?

Oh, good point. So I should probably accumulate the time spend by 
garbage collected fences on the context manager instead of the context 
entity.

Additional to that I think separating the output by scheduler instance 
doesn't make any sense at all because those are not related to the 
hardware engines. In other word the SDMA0 queue of process A can go to a 
different hardware engine than the SDMA0 queue of process B and that 
makes the values not comparable.

Thanks,
Christian.

>
> In i915 we have this so I am accumulating the times spent by contexts 
> as they are closed into a data structure associated with the owning 
> drm_file.
>
>> +
>>   struct dma_fence *amdgpu_ctx_get_fence(struct amdgpu_ctx *ctx,
>>                          struct drm_sched_entity *entity,
>>                          uint64_t seq)
>> @@ -869,81 +927,3 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr 
>> *mgr)
>>       idr_destroy(&mgr->ctx_handles);
>>       mutex_destroy(&mgr->lock);
>>   }
>> -
>> -static void amdgpu_ctx_fence_time(struct amdgpu_ctx *ctx,
>> -        struct amdgpu_ctx_entity *centity, ktime_t *total, ktime_t 
>> *max)
>> -{
>> -    ktime_t now, t1;
>> -    uint32_t i;
>> -
>> -    *total = *max = 0;
>> -
>> -    now = ktime_get();
>> -    for (i = 0; i < amdgpu_sched_jobs; i++) {
>> -        struct dma_fence *fence;
>> -        struct drm_sched_fence *s_fence;
>> -
>> -        spin_lock(&ctx->ring_lock);
>> -        fence = dma_fence_get(centity->fences[i]);
>> -        spin_unlock(&ctx->ring_lock);
>> -        if (!fence)
>> -            continue;
>> -        s_fence = to_drm_sched_fence(fence);
>> -        if (!dma_fence_is_signaled(&s_fence->scheduled)) {
>> -            dma_fence_put(fence);
>> -            continue;
>> -        }
>> -        t1 = s_fence->scheduled.timestamp;
>> -        if (!ktime_before(t1, now)) {
>> -            dma_fence_put(fence);
>> -            continue;
>> -        }
>> -        if (dma_fence_is_signaled(&s_fence->finished) &&
>> -            s_fence->finished.timestamp < now)
>> -            *total += ktime_sub(s_fence->finished.timestamp, t1);
>> -        else
>> -            *total += ktime_sub(now, t1);
>> -        t1 = ktime_sub(now, t1);
>> -        dma_fence_put(fence);
>> -        *max = max(t1, *max);
>> -    }
>> -}
>> -
>> -ktime_t amdgpu_ctx_mgr_fence_usage(struct amdgpu_ctx_mgr *mgr, 
>> uint32_t hwip,
>> -        uint32_t idx, uint64_t *elapsed)
>> -{
>> -    struct idr *idp;
>> -    struct amdgpu_ctx *ctx;
>> -    uint32_t id;
>> -    struct amdgpu_ctx_entity *centity;
>> -    ktime_t total = 0, max = 0;
>> -
>> -    if (idx >= AMDGPU_MAX_ENTITY_NUM)
>> -        return 0;
>> -    idp = &mgr->ctx_handles;
>> -    mutex_lock(&mgr->lock);
>> -    idr_for_each_entry(idp, ctx, id) {
>> -        ktime_t ttotal, tmax;
>> -
>> -        if (!ctx->entities[hwip][idx])
>> -            continue;
>> -
>> -        centity = ctx->entities[hwip][idx];
>> -        amdgpu_ctx_fence_time(ctx, centity, &ttotal, &tmax);
>> -
>> -        /* Harmonic mean approximation diverges for very small
>> -         * values. If ratio < 0.01% ignore
>> -         */
>> -        if (AMDGPU_CTX_FENCE_USAGE_MIN_RATIO(tmax, ttotal))
>> -            continue;
>> -
>> -        total = ktime_add(total, ttotal);
>> -        max = ktime_after(tmax, max) ? tmax : max;
>> -    }
>> -
>> -    mutex_unlock(&mgr->lock);
>> -    if (elapsed)
>> -        *elapsed = max;
>> -
>> -    return total;
>> -}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> index 142f2f87d44c..b5bb4425b0ac 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> @@ -23,6 +23,8 @@
>>   #ifndef __AMDGPU_CTX_H__
>>   #define __AMDGPU_CTX_H__
>>   +#include <linux/ktime.h>
>> +
>>   #include "amdgpu_ring.h"
>>     struct drm_device;
>> @@ -30,10 +32,10 @@ struct drm_file;
>>   struct amdgpu_fpriv;
>>     #define AMDGPU_MAX_ENTITY_NUM 4
>> -#define AMDGPU_CTX_FENCE_USAGE_MIN_RATIO(max, total) ((max) > 
>> 16384ULL*(total))
>>     struct amdgpu_ctx_entity {
>>       uint64_t        sequence;
>> +    ktime_t            time_spend;
>>       struct drm_sched_entity    entity;
>>       struct dma_fence    *fences[];
>>   };
>> @@ -89,6 +91,6 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
>>   void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
>>   long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long 
>> timeout);
>>   void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
>> -ktime_t amdgpu_ctx_mgr_fence_usage(struct amdgpu_ctx_mgr *mgr, 
>> uint32_t hwip,
>> -        uint32_t idx, uint64_t *elapsed);
>> +ktime_t amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr, uint32_t hwip,
>> +                 uint32_t idx);
>>   #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
>> index 5a6857c44bb6..38cb5eb105ad 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
>> @@ -32,6 +32,7 @@
>>     #include <drm/amdgpu_drm.h>
>>   #include <drm/drm_debugfs.h>
>> +#include <drm/drm_drv.h>
>>     #include "amdgpu.h"
>>   #include "amdgpu_vm.h"
>> @@ -55,10 +56,10 @@ static const char 
>> *amdgpu_ip_name[AMDGPU_HW_IP_NUM] = {
>>   void amdgpu_show_fdinfo(struct seq_file *m, struct file *f)
>>   {
>>       struct amdgpu_fpriv *fpriv;
>> -    uint32_t bus, dev, fn, i, domain;
>>       uint64_t vram_mem = 0, gtt_mem = 0, cpu_mem = 0;
>>       struct drm_file *file = f->private_data;
>>       struct amdgpu_device *adev = drm_to_adev(file->minor->dev);
>> +    uint32_t bus, dev, fn, domain, hw_ip;
>>       struct amdgpu_bo *root;
>>       int ret;
>>   @@ -83,29 +84,33 @@ void amdgpu_show_fdinfo(struct seq_file *m, 
>> struct file *f)
>>       amdgpu_bo_unreserve(root);
>>       amdgpu_bo_unref(&root);
>>   -    seq_printf(m, "pdev:\t%04x:%02x:%02x.%d\npasid:\t%u\n", 
>> domain, bus,
>> +    /*
>> +     * 
>> ******************************************************************
>> +     * For text output format description please see 
>> drm-usage-stats.rst!
>> +     * 
>> ******************************************************************
>> +     */
>> +
>> +    seq_printf(m, "drm-driver:\t%s\n", file->minor->dev->driver->name);
>> +    seq_printf(m, "drm-pdev:\t%04x:%02x:%02x.%d\npasid:\t%u\n", 
>> domain, bus,
>>               dev, fn, fpriv->vm.pasid);
>> -    seq_printf(m, "vram mem:\t%llu kB\n", vram_mem/1024UL);
>> -    seq_printf(m, "gtt mem:\t%llu kB\n", gtt_mem/1024UL);
>> -    seq_printf(m, "cpu mem:\t%llu kB\n", cpu_mem/1024UL);
>> -    for (i = 0; i < AMDGPU_HW_IP_NUM; i++) {
>> -        uint32_t count = amdgpu_ctx_num_entities[i];
>> -        int idx = 0;
>> -        uint64_t total = 0, min = 0;
>> -        uint32_t perc, frac;
>> +    seq_printf(m, "drm-memory-vram:\t%llu KiB\n", vram_mem/1024UL);
>> +    seq_printf(m, "drm-memory-gtt:\t%llu KiB\n", gtt_mem/1024UL);
>> +    seq_printf(m, "drm-memory-cpu:\t%llu KiB\n", cpu_mem/1024UL);
>> +    for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) {
>> +        uint32_t count = amdgpu_ctx_num_entities[hw_ip];
>> +        int idx;
>>   -        for (idx = 0; idx < count; idx++) {
>> -            total = amdgpu_ctx_mgr_fence_usage(&fpriv->ctx_mgr,
>> -                i, idx, &min);
>> -            if ((total == 0) || (min == 0))
>> -                continue;
>> +        for (idx = 0; idx < count; ++idx) {
>> +            ktime_t usage;
>>   -            perc = div64_u64(10000 * total, min);
>> -            frac = perc % 100;
>> +            usage = amdgpu_ctx_mgr_usage(&fpriv->ctx_mgr, hw_ip,
>> +                             idx);
>> +            if (!usage)
>> +                continue;
>
> Hmm I did not envisage fields dynamically appearing but it looks my 
> userspace happens to cope. Worst that would happen is order of engines 
> are displayed on screen could change between runs. And as my spec does 
> not define whether all possible engines should be always present or 
> not I guess it is fine like this.
>
>>   -            seq_printf(m, "%s%d:\t%d.%d%%\n",
>> -                    amdgpu_ip_name[i],
>> -                    idx, perc/100, frac);
>> +            seq_printf(m, "drm-engine-%s%d:\t%Ld ns\n",
>> +                   amdgpu_ip_name[hw_ip], idx,
>> +                   ktime_to_ns(usage));
>>           }
>>       }
>>   }
>
> Regards,
>
> Tvrtko