[PATCH 1/2] drm/amdgpu: Convert to common fdinfo format v2

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Wed May 11 08:04:36 UTC 2022


On 10/05/2022 17:55, Christian König wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Feels like you should take over ownership since you wrote a lot of new, 
amdgpu-specific code here.

> Convert fdinfo format to one documented in drm-usage-stats.rst.
> 
> It turned out that the existing implementation was actually completely
> nonsense. The calculated percentages indeed represented the usage of the
> engine, but with varying time slices.
> 
> So 10% usage for application A could mean something completely different
> than 10% usage for application B.
> 
> Completely nuke that and just use the now standardized nanosecond
> interface.
> 
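Right, and just for reference - with a cumulative nanosecond counter any 
consumer can derive a percentage itself by sampling the value twice over 
a known interval, roughly like this (variable names made up):

	/* Two reads of the same drm-engine-* value, wall_delta_ns apart. */
	busy_pct = 100.0 * (double)(engine_ns_now - engine_ns_prev) /
		   (double)wall_delta_ns;

That way every client is measured against the same wall clock interval, 
which the old per-client time slices could not guarantee.
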
> v2: drop the documentation change for now, nuke percentage calculation
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> Cc: David M Nieto <David.Nieto at amd.com>
> Cc: Daniel Vetter <daniel at ffwll.ch>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c    | 136 +++++++++------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h    |   8 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c |  45 ++++---
>   3 files changed, 88 insertions(+), 101 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 8f0e6d93bb9c..988c5a76cad2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -183,6 +183,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip,
>   	ctx_prio = (ctx->override_priority == AMDGPU_CTX_PRIORITY_UNSET) ?
>   			ctx->init_priority : ctx->override_priority;
>   	entity->sequence = 1;
> +	entity->time_spend = 0;
>   	hw_prio = amdgpu_ctx_get_hw_prio(ctx, hw_ip);
>   	drm_prio = amdgpu_ctx_to_drm_sched_prio(ctx_prio);
>   
> @@ -666,6 +667,26 @@ int amdgpu_ctx_put(struct amdgpu_ctx *ctx)
>   	return 0;
>   }
>   
> +/* Calculate the time spend on the hw */
> +static ktime_t amdgpu_ctx_fence_time(struct dma_fence *fence)
> +{
> +	struct drm_sched_fence *s_fence;
> +
> +	if (!fence)
> +		return ns_to_ktime(0);
> +
> +	/* When the fence is not even scheduled it can't have spend time */
> +	s_fence = to_drm_sched_fence(fence);
> +	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &s_fence->scheduled.flags))
> +		return ns_to_ktime(0);
> +
> +	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &s_fence->finished.flags))
> +		return ktime_sub(ktime_get(), s_fence->scheduled.timestamp);
> +
> +	return ktime_sub(s_fence->finished.timestamp,
> +			 s_fence->scheduled.timestamp);
> +}
> +
>   void amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,
>   			  struct drm_sched_entity *entity,
>   			  struct dma_fence *fence, uint64_t *handle)
> @@ -685,6 +706,7 @@ void amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,
>   	spin_lock(&ctx->ring_lock);
>   	centity->fences[idx] = fence;
>   	centity->sequence++;
> +	centity->time_spend += amdgpu_ctx_fence_time(other);
>   	spin_unlock(&ctx->ring_lock);
>   
>   	dma_fence_put(other);
> @@ -692,6 +714,42 @@ void amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,
>   		*handle = seq;
>   }
>   
> +static ktime_t amdgpu_ctx_entity_time(struct amdgpu_ctx *ctx,
> +				      struct amdgpu_ctx_entity *centity)
> +{
> +	ktime_t res;
> +	uint32_t i;
> +
> +	spin_lock(&ctx->ring_lock);
> +	res = centity->time_spend;
> +	for (i = 0; i < amdgpu_sched_jobs; i++) {
> +		res = ktime_add(res, amdgpu_ctx_fence_time(centity->fences[i]));
> +	}
> +	spin_unlock(&ctx->ring_lock);
> +	return res;
> +}
> +
> +ktime_t amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr, uint32_t hwip,
> +			     uint32_t idx)
> +{
> +	struct amdgpu_ctx_entity *centity;
> +	ktime_t res = ns_to_ktime(0);
> +	struct amdgpu_ctx *ctx;
> +	uint32_t id;
> +
> +	mutex_lock(&mgr->lock);
> +	idr_for_each_entry(&mgr->ctx_handles, ctx, id) {
> +
> +		centity = ctx->entities[hwip][idx];
> +		if (!centity)
> +			continue;
> +
> +		res = ktime_add(res, amdgpu_ctx_entity_time(ctx, centity));
> +	}
> +	mutex_unlock(&mgr->lock);
> +	return res;
> +}

The amdgpu-specific code I cannot really review, but I can ask a 
question: do you have a concept of contexts which were used by a client 
and then closed/abandoned? If so, would the time they spent on the GPU 
be forgotten or preserved in this accounting flow?

In i915 we have this, so as contexts are closed I accumulate the time 
they spent on the GPU into a data structure associated with the owning 
drm_file.
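
If it helps, below is roughly what I mean as a completely untested 
sketch. The mgr back pointer and the time_spend[][] accumulator in 
amdgpu_ctx_mgr do not exist in this patch - they are made up for 
illustration, and locking of the accumulator is elided:

/* Hypothetical: fold an entity's accumulated time into a per-manager
 * total before the entity is torn down, so that closed contexts keep
 * contributing to the fdinfo sums.
 */
static void amdgpu_ctx_entity_time_retire(struct amdgpu_ctx *ctx,
					  uint32_t hwip, uint32_t idx)
{
	struct amdgpu_ctx_entity *centity = ctx->entities[hwip][idx];
	struct amdgpu_ctx_mgr *mgr = ctx->mgr;	/* assumed back pointer */

	if (!centity)
		return;

	mgr->time_spend[hwip][idx] =
		ktime_add(mgr->time_spend[hwip][idx],
			  amdgpu_ctx_entity_time(ctx, centity));
}

amdgpu_ctx_mgr_usage() would then start from mgr->time_spend[hwip][idx] 
instead of zero.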

> +
>   struct dma_fence *amdgpu_ctx_get_fence(struct amdgpu_ctx *ctx,
>   				       struct drm_sched_entity *entity,
>   				       uint64_t seq)
> @@ -869,81 +927,3 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
>   	idr_destroy(&mgr->ctx_handles);
>   	mutex_destroy(&mgr->lock);
>   }
> -
> -static void amdgpu_ctx_fence_time(struct amdgpu_ctx *ctx,
> -		struct amdgpu_ctx_entity *centity, ktime_t *total, ktime_t *max)
> -{
> -	ktime_t now, t1;
> -	uint32_t i;
> -
> -	*total = *max = 0;
> -
> -	now = ktime_get();
> -	for (i = 0; i < amdgpu_sched_jobs; i++) {
> -		struct dma_fence *fence;
> -		struct drm_sched_fence *s_fence;
> -
> -		spin_lock(&ctx->ring_lock);
> -		fence = dma_fence_get(centity->fences[i]);
> -		spin_unlock(&ctx->ring_lock);
> -		if (!fence)
> -			continue;
> -		s_fence = to_drm_sched_fence(fence);
> -		if (!dma_fence_is_signaled(&s_fence->scheduled)) {
> -			dma_fence_put(fence);
> -			continue;
> -		}
> -		t1 = s_fence->scheduled.timestamp;
> -		if (!ktime_before(t1, now)) {
> -			dma_fence_put(fence);
> -			continue;
> -		}
> -		if (dma_fence_is_signaled(&s_fence->finished) &&
> -			s_fence->finished.timestamp < now)
> -			*total += ktime_sub(s_fence->finished.timestamp, t1);
> -		else
> -			*total += ktime_sub(now, t1);
> -		t1 = ktime_sub(now, t1);
> -		dma_fence_put(fence);
> -		*max = max(t1, *max);
> -	}
> -}
> -
> -ktime_t amdgpu_ctx_mgr_fence_usage(struct amdgpu_ctx_mgr *mgr, uint32_t hwip,
> -		uint32_t idx, uint64_t *elapsed)
> -{
> -	struct idr *idp;
> -	struct amdgpu_ctx *ctx;
> -	uint32_t id;
> -	struct amdgpu_ctx_entity *centity;
> -	ktime_t total = 0, max = 0;
> -
> -	if (idx >= AMDGPU_MAX_ENTITY_NUM)
> -		return 0;
> -	idp = &mgr->ctx_handles;
> -	mutex_lock(&mgr->lock);
> -	idr_for_each_entry(idp, ctx, id) {
> -		ktime_t ttotal, tmax;
> -
> -		if (!ctx->entities[hwip][idx])
> -			continue;
> -
> -		centity = ctx->entities[hwip][idx];
> -		amdgpu_ctx_fence_time(ctx, centity, &ttotal, &tmax);
> -
> -		/* Harmonic mean approximation diverges for very small
> -		 * values. If ratio < 0.01% ignore
> -		 */
> -		if (AMDGPU_CTX_FENCE_USAGE_MIN_RATIO(tmax, ttotal))
> -			continue;
> -
> -		total = ktime_add(total, ttotal);
> -		max = ktime_after(tmax, max) ? tmax : max;
> -	}
> -
> -	mutex_unlock(&mgr->lock);
> -	if (elapsed)
> -		*elapsed = max;
> -
> -	return total;
> -}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index 142f2f87d44c..b5bb4425b0ac 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -23,6 +23,8 @@
>   #ifndef __AMDGPU_CTX_H__
>   #define __AMDGPU_CTX_H__
>   
> +#include <linux/ktime.h>
> +
>   #include "amdgpu_ring.h"
>   
>   struct drm_device;
> @@ -30,10 +32,10 @@ struct drm_file;
>   struct amdgpu_fpriv;
>   
>   #define AMDGPU_MAX_ENTITY_NUM 4
> -#define AMDGPU_CTX_FENCE_USAGE_MIN_RATIO(max, total) ((max) > 16384ULL*(total))
>   
>   struct amdgpu_ctx_entity {
>   	uint64_t		sequence;
> +	ktime_t			time_spend;
>   	struct drm_sched_entity	entity;
>   	struct dma_fence	*fences[];
>   };
> @@ -89,6 +91,6 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
>   void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
>   long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout);
>   void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
> -ktime_t amdgpu_ctx_mgr_fence_usage(struct amdgpu_ctx_mgr *mgr, uint32_t hwip,
> -		uint32_t idx, uint64_t *elapsed);
> +ktime_t amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr, uint32_t hwip,
> +			     uint32_t idx);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
> index 5a6857c44bb6..38cb5eb105ad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
> @@ -32,6 +32,7 @@
>   
>   #include <drm/amdgpu_drm.h>
>   #include <drm/drm_debugfs.h>
> +#include <drm/drm_drv.h>
>   
>   #include "amdgpu.h"
>   #include "amdgpu_vm.h"
> @@ -55,10 +56,10 @@ static const char *amdgpu_ip_name[AMDGPU_HW_IP_NUM] = {
>   void amdgpu_show_fdinfo(struct seq_file *m, struct file *f)
>   {
>   	struct amdgpu_fpriv *fpriv;
> -	uint32_t bus, dev, fn, i, domain;
>   	uint64_t vram_mem = 0, gtt_mem = 0, cpu_mem = 0;
>   	struct drm_file *file = f->private_data;
>   	struct amdgpu_device *adev = drm_to_adev(file->minor->dev);
> +	uint32_t bus, dev, fn, domain, hw_ip;
>   	struct amdgpu_bo *root;
>   	int ret;
>   
> @@ -83,29 +84,33 @@ void amdgpu_show_fdinfo(struct seq_file *m, struct file *f)
>   	amdgpu_bo_unreserve(root);
>   	amdgpu_bo_unref(&root);
>   
> -	seq_printf(m, "pdev:\t%04x:%02x:%02x.%d\npasid:\t%u\n", domain, bus,
> +	/*
> +	 * ******************************************************************
> +	 * For text output format description please see drm-usage-stats.rst!
> +	 * ******************************************************************
> +	 */
> +
> +	seq_printf(m, "drm-driver:\t%s\n", file->minor->dev->driver->name);
> +	seq_printf(m, "drm-pdev:\t%04x:%02x:%02x.%d\npasid:\t%u\n", domain, bus,
>   			dev, fn, fpriv->vm.pasid);
> -	seq_printf(m, "vram mem:\t%llu kB\n", vram_mem/1024UL);
> -	seq_printf(m, "gtt mem:\t%llu kB\n", gtt_mem/1024UL);
> -	seq_printf(m, "cpu mem:\t%llu kB\n", cpu_mem/1024UL);
> -	for (i = 0; i < AMDGPU_HW_IP_NUM; i++) {
> -		uint32_t count = amdgpu_ctx_num_entities[i];
> -		int idx = 0;
> -		uint64_t total = 0, min = 0;
> -		uint32_t perc, frac;
> +	seq_printf(m, "drm-memory-vram:\t%llu KiB\n", vram_mem/1024UL);
> +	seq_printf(m, "drm-memory-gtt:\t%llu KiB\n", gtt_mem/1024UL);
> +	seq_printf(m, "drm-memory-cpu:\t%llu KiB\n", cpu_mem/1024UL);
> +	for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) {
> +		uint32_t count = amdgpu_ctx_num_entities[hw_ip];
> +		int idx;
>   
> -		for (idx = 0; idx < count; idx++) {
> -			total = amdgpu_ctx_mgr_fence_usage(&fpriv->ctx_mgr,
> -				i, idx, &min);
> -			if ((total == 0) || (min == 0))
> -				continue;
> +		for (idx = 0; idx < count; ++idx) {
> +			ktime_t usage;
>   
> -			perc = div64_u64(10000 * total, min);
> -			frac = perc % 100;
> +			usage = amdgpu_ctx_mgr_usage(&fpriv->ctx_mgr, hw_ip,
> +						     idx);
> +			if (!usage)
> +				continue;

Hmm, I did not envisage fields dynamically appearing, but it looks like 
my userspace happens to cope. The worst that could happen is that the 
order in which engines are displayed on screen changes between runs. And 
as my spec does not define whether all possible engines should always be 
present or not, I guess it is fine like this.
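
For illustration, the engine part of the output with this patch would 
then look something like this (values made up):

	drm-driver:	amdgpu
	drm-pdev:	0000:0b:00.0
	drm-engine-gfx0:	1234567890 ns
	drm-engine-compute0:	987654 ns

with engines which have not yet accumulated any time simply absent.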

>   
> -			seq_printf(m, "%s%d:\t%d.%d%%\n",
> -					amdgpu_ip_name[i],
> -					idx, perc/100, frac);
> +			seq_printf(m, "drm-engine-%s%d:\t%Ld ns\n",
> +				   amdgpu_ip_name[hw_ip], idx,
> +				   ktime_to_ns(usage));
>   		}
>   	}
>   }

Regards,

Tvrtko

