[RFC] drm/amdgpu: More efficient ring padding

Fri Jul 12 07:33:07 UTC 2024

Am 11.07.24 um 20:17 schrieb Tvrtko Ursulin:
> From: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
>
>  From the department of questionable optimisations today we have a minor
> improvement to how padding / filling the rings with nops is done.
>
> Having noticed that typically 200+ nops per submission are filled into the
> ring, using a rather verbose one-nop-at-a-time-plus-ring-buffer-arithmetic
> as done in amdgpu_ring_write(), while right next to it there is
> amdgpu_ring_write_multiple(), I thought why not pre-cache a block of nops
> and write them out more efficiently.
>
> The patch is rather quick and dirty, does not deal with all insert_nops
> vfuncs, and the cache should probably go one level up so it is not
> replicated per amdgpu_ring instance.

Why should that be more effective? We essentially use more cache lines 
than before.

> And performance gains are not that amazing for normal workloads. For
> instance a game which results in two submissions per frame, each pads
> with 222 nops, submission worker thread profile changes from:

Mhm, why the heck are we using so many nops in the first place?

Regards,
Christian.

>
> +   90.78%     0.67%  kworker/u32:3-e  [kernel.kallsyms]  [k] process_one_work
> +   48.92%     0.12%  kworker/u32:3-e  [kernel.kallsyms]  [k] commit_tail
> +   41.18%     1.73%  kworker/u32:3-e  [kernel.kallsyms]  [k] amdgpu_dm_atomic_commit_tail
> -   30.31%     0.67%  kworker/u32:3-e  [kernel.kallsyms]  [k] drm_sched_run_job_work
>     - 29.63% drm_sched_run_job_work
>        + 8.55% dma_fence_add_callback
>        - 7.50% amdgpu_job_run
>           - 7.43% amdgpu_ib_schedule
>              - 2.46% amdgpu_ring_commit
>                   1.44% amdgpu_ring_insert_nop
>
> To:
>
> +   89.83%     0.51%  kworker/u32:6-g  [kernel.kallsyms]  [k] process_one_work
> +   47.65%     0.30%  kworker/u32:6-g  [kernel.kallsyms]  [k] commit_tail
> +   39.42%     1.97%  kworker/u32:6-g  [kernel.kallsyms]  [k] amdgpu_dm_atomic_commit_tail
> -   29.57%     1.10%  kworker/u32:6-g  [kernel.kallsyms]  [k] drm_sched_run_job_work
>     - 28.47% drm_sched_run_job_work
>        + 8.52% dma_fence_add_callback
>        - 6.33% amdgpu_job_run
>           - 6.19% amdgpu_ib_schedule
>              - 1.85% amdgpu_ring_commit
>                   0.53% amdgpu_ring_insert_nop
>
> Or if we run a more "spammy" workload, which does several orders of
> magnitude more submissions second we go from:
>
> +   79.38%     1.66%  kworker/u32:1+e  [kernel.kallsyms]  [k] process_one_work
> -   63.13%     6.66%  kworker/u32:1+e  [kernel.kallsyms]  [k] drm_sched_run_job_work
>     - 56.47% drm_sched_run_job_work
>        - 25.67% amdgpu_job_run
>           - 24.40% amdgpu_ib_schedule
>              - 15.29% amdgpu_ring_commit
>                   12.06% amdgpu_ring_insert_nop
>
> To:
>
> +   77.76%     1.97%  kworker/u32:6-f  [kernel.kallsyms]  [k] process_one_work
> -   60.15%     7.04%  kworker/u32:6-f  [kernel.kallsyms]  [k] drm_sched_run_job_work
>     - 53.11% drm_sched_run_job_work
>        - 19.35% amdgpu_job_run
>           - 17.85% amdgpu_ib_schedule
>              - 7.75% amdgpu_ring_commit
>                   3.27% amdgpu_ring_insert_nop
>
> Not bad and "every little helps", or flame-throwers at ready?
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 20 +++++++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  1 +
>   2 files changed, 16 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index ad49cecb20b8..22ec9de62b06 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -108,10 +108,14 @@ int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned int ndw)
>    */
>   void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
>   {
> -	int i;
> +	if (count > 1 && count <= ARRAY_SIZE(ring->nop_cache)) {
> +		amdgpu_ring_write_multiple(ring, ring->nop_cache, count);
> +	} else {
> +		int i;
>   
> -	for (i = 0; i < count; i++)
> -		amdgpu_ring_write(ring, ring->funcs->nop);
> +		for (i = 0; i < count; i++)
> +			amdgpu_ring_write(ring, ring->funcs->nop);
> +	}
>   }
>   
>   /**
> @@ -124,8 +128,11 @@ void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
>    */
>   void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>   {
> -	while (ib->length_dw & ring->funcs->align_mask)
> -		ib->ptr[ib->length_dw++] = ring->funcs->nop;
> +	u32 count = ib->length_dw & ring->funcs->align_mask;
> +
> +	memcpy(&ib->ptr[ib->length_dw], ring->nop_cache, count * sizeof(u32));
> +
> +	ib->length_dw += count;
>   }
>   
>   /**
> @@ -359,6 +366,9 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
>   			&ring->sched;
>   	}
>   
> +	for (r = 0; r < ARRAY_SIZE(ring->nop_cache); r++)
> +		ring->nop_cache[r] = ring->funcs->nop;
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 582053f1cd56..74ce95b4666a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -246,6 +246,7 @@ struct amdgpu_ring {
>   	struct amdgpu_bo	*ring_obj;
>   	volatile uint32_t	*ring;
>   	unsigned		rptr_offs;
> +	u32			nop_cache[256];
>   	u64			rptr_gpu_addr;
>   	volatile u32		*rptr_cpu_addr;
>   	u64			wptr;