[PATCH 09/12] drm/amdgpu: Optimise amdgpu_ring_write()

Christian König ckoenig.leichtzumerken at gmail.com
Thu Jan 2 13:55:47 UTC 2025


Am 27.12.24 um 12:19 schrieb Tvrtko Ursulin:
> From: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
>
> There are more than 2000 calls to amdgpu_ring_write() in the driver and
> the majority is multiple sequential calls which the compiler cannot
> optimise much.
>
> Lets make this helper variadic via some pre-processor magic which allows
> merging those sequential calls and in turn enables the compiler to emit
> much less code.

I've played around with the same idea before abandoning it for this 
patch here: 
https://lore.kernel.org/all/20241008181134.1350-2-christian.koenig@amd.com/

Basically we don't need to update count_dw nor mask the WPTR which has 
the same effect as this optimization here and far less code change.

The problem is that some code for Polaris HW generations seem to use the 
WPTR or count_dw directly for some calculation. I didn't had time to 
clean that up and push the resulting change.

Regards,
Christian.

>
> If we then would convert some call sites to use this macro, lets see on
> the example of amdgpu_vce_ring_emit_ib(), what results that would bring.
> Before (but after the wptr local caching, before it is even worse):
>
>    173c39:       48 89 f8                mov    %rdi,%rax
>    173c3c:       48 89 d1                mov    %rdx,%rcx
>    173c3f:       48 8b 97 c8 01 00 00    mov    0x1c8(%rdi),%rdx
>    173c46:       48 8b b0 a8 01 00 00    mov    0x1a8(%rax),%rsi
>    173c4d:       89 d7                   mov    %edx,%edi
>    173c4f:       23 b8 f8 01 00 00       and    0x1f8(%rax),%edi
>    173c55:       48 83 c2 01             add    $0x1,%rdx
>    173c59:       c7 04 be 02 00 00 00    movl   $0x2,(%rsi,%rdi,4)
>    173c60:       48 23 90 f0 01 00 00    and    0x1f0(%rax),%rdx
>    173c67:       48 89 90 c8 01 00 00    mov    %rdx,0x1c8(%rax)
>    173c6e:       48 8b b0 a8 01 00 00    mov    0x1a8(%rax),%rsi
>    173c75:       89 d7                   mov    %edx,%edi
>    173c77:       48 83 c2 01             add    $0x1,%rdx
>    173c7b:       83 a8 e0 01 00 00 01    subl   $0x1,0x1e0(%rax)
>    173c82:       4c 8b 41 10             mov    0x10(%rcx),%r8
>    173c86:       23 b8 f8 01 00 00       and    0x1f8(%rax),%edi
>    173c8c:       44 89 04 be             mov    %r8d,(%rsi,%rdi,4)
>    173c90:       48 23 90 f0 01 00 00    and    0x1f0(%rax),%rdx
>    173c97:       48 89 90 c8 01 00 00    mov    %rdx,0x1c8(%rax)
>    173c9e:       48 8b b0 a8 01 00 00    mov    0x1a8(%rax),%rsi
>    173ca5:       89 d7                   mov    %edx,%edi
>    173ca7:       48 83 c2 01             add    $0x1,%rdx
>    173cab:       83 a8 e0 01 00 00 01    subl   $0x1,0x1e0(%rax)
>    173cb2:       44 8b 41 14             mov    0x14(%rcx),%r8d
>    173cb6:       23 b8 f8 01 00 00       and    0x1f8(%rax),%edi
>    173cbc:       44 89 04 be             mov    %r8d,(%rsi,%rdi,4)
>    173cc0:       48 23 90 f0 01 00 00    and    0x1f0(%rax),%rdx
>    173cc7:       48 89 90 c8 01 00 00    mov    %rdx,0x1c8(%rax)
>    173cce:       89 d6                   mov    %edx,%esi
>    173cd0:       23 b0 f8 01 00 00       and    0x1f8(%rax),%esi
>    173cd6:       48 83 c2 01             add    $0x1,%rdx
>    173cda:       83 a8 e0 01 00 00 01    subl   $0x1,0x1e0(%rax)
>    173ce1:       8b 79 08                mov    0x8(%rcx),%edi
>    173ce4:       48 8b 88 a8 01 00 00    mov    0x1a8(%rax),%rcx
>    173ceb:       89 3c b1                mov    %edi,(%rcx,%rsi,4)
>    173cee:       48 23 90 f0 01 00 00    and    0x1f0(%rax),%rdx
>    173cf5:       83 a8 e0 01 00 00 01    subl   $0x1,0x1e0(%rax)
>    173cfc:       48 89 90 c8 01 00 00    mov    %rdx,0x1c8(%rax)
>    173d03:       c3                      ret
>
> And after:
>
>      1579:       48 89 f8                mov    %rdi,%rax
>      157c:       44 8b 4a 08             mov    0x8(%rdx),%r9d
>      1580:       48 8b 7a 10             mov    0x10(%rdx),%rdi
>      1584:       48 8b 90 c8 01 00 00    mov    0x1c8(%rax),%rdx
>      158b:       8b b0 f8 01 00 00       mov    0x1f8(%rax),%esi
>      1591:       48 8b 88 a8 01 00 00    mov    0x1a8(%rax),%rcx
>      1598:       49 89 d0                mov    %rdx,%r8
>      159b:       49 21 f0                and    %rsi,%r8
>      159e:       42 c7 04 81 02 00 00    movl   $0x2,(%rcx,%r8,4)
>      15a5:       00
>      15a6:       4c 8d 42 01             lea    0x1(%rdx),%r8
>      15aa:       49 21 f0                and    %rsi,%r8
>      15ad:       42 89 3c 81             mov    %edi,(%rcx,%r8,4)
>      15b1:       4c 8d 42 02             lea    0x2(%rdx),%r8
>      15b5:       48 c1 ef 20             shr    $0x20,%rdi
>      15b9:       49 21 f0                and    %rsi,%r8
>      15bc:       42 89 3c 81             mov    %edi,(%rcx,%r8,4)
>      15c0:       48 8d 7a 03             lea    0x3(%rdx),%rdi
>      15c4:       48 83 c2 04             add    $0x4,%rdx
>      15c8:       48 21 fe                and    %rdi,%rsi
>      15cb:       44 89 0c b1             mov    %r9d,(%rcx,%rsi,4)
>      15cf:       48 23 90 f0 01 00 00    and    0x1f0(%rax),%rdx
>      15d6:       83 a8 e0 01 00 00 04    subl   $0x4,0x1e0(%rax)
>      15dd:       48 89 90 c8 01 00 00    mov    %rdx,0x1c8(%rax)
>      15e4:       c3                      ret
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
> Cc: Christian König <christian.koenig at amd.com>
> Cc: Sunil Khatri <sunil.khatri at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 222 ++++++++++++++++++++++-
>   1 file changed, 220 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index b57951d8c997..4f467864ed09 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -383,15 +383,233 @@ static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
>   	memset32(ring->ring, ring->funcs->nop, ring->buf_mask + 1);
>   }
>   
> -static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)
> +static inline void
> +amdgpu_ring_write1(struct amdgpu_ring *ring,
> +		   u32 v1)
>   {
> +	const u32 buf_mask = ring->buf_mask;
>   	u64 wptr = ring->wptr;
>   
> -	ring->ring[wptr++ & ring->buf_mask] = v;
> +	ring->ring[wptr++ & buf_mask] = v1;
>   	ring->wptr = wptr & ring->ptr_mask;
>   	ring->count_dw--;
>   }
>   
> +static inline void
> +amdgpu_ring_write2(struct amdgpu_ring *ring,
> +		   u32 v1, u32 v2)
> +{
> +	const u32 buf_mask = ring->buf_mask;
> +	u64 wptr = ring->wptr;
> +	u32 *r = ring->ring;
> +
> +	r[wptr++ & buf_mask] = v1;
> +	r[wptr++ & buf_mask] = v2;
> +
> +	ring->wptr = wptr & ring->ptr_mask;
> +	ring->count_dw -= 2;
> +}
> +
> +static inline void
> +amdgpu_ring_write3(struct amdgpu_ring *ring,
> +		   u32 v1, u32 v2, u32 v3)
> +{
> +	const u32 buf_mask = ring->buf_mask;
> +	u64 wptr = ring->wptr;
> +	u32 *r = ring->ring;
> +
> +	r[wptr++ & buf_mask] = v1;
> +	r[wptr++ & buf_mask] = v2;
> +	r[wptr++ & buf_mask] = v3;
> +
> +	ring->wptr = wptr & ring->ptr_mask;
> +	ring->count_dw -= 3;
> +}
> +
> +static inline void
> +amdgpu_ring_write4(struct amdgpu_ring *ring,
> +		   u32 v1, u32 v2, u32 v3, u32 v4)
> +{
> +	const u32 buf_mask = ring->buf_mask;
> +	u64 wptr = ring->wptr;
> +	u32 *r = ring->ring;
> +
> +	r[wptr++ & buf_mask] = v1;
> +	r[wptr++ & buf_mask] = v2;
> +	r[wptr++ & buf_mask] = v3;
> +	r[wptr++ & buf_mask] = v4;
> +
> +	ring->wptr = wptr & ring->ptr_mask;
> +	ring->count_dw -= 4;
> +}
> +
> +static inline void
> +amdgpu_ring_write5(struct amdgpu_ring *ring,
> +		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5)
> +{
> +	const u32 buf_mask = ring->buf_mask;
> +	u64 wptr = ring->wptr;
> +	u32 *r = ring->ring;
> +
> +	r[wptr++ & buf_mask] = v1;
> +	r[wptr++ & buf_mask] = v2;
> +	r[wptr++ & buf_mask] = v3;
> +	r[wptr++ & buf_mask] = v4;
> +	r[wptr++ & buf_mask] = v5;
> +
> +	ring->wptr = wptr & ring->ptr_mask;
> +	ring->count_dw -= 5;
> +}
> +
> +static inline void
> +amdgpu_ring_write6(struct amdgpu_ring *ring,
> +		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6)
> +{
> +	const u32 buf_mask = ring->buf_mask;
> +	u64 wptr = ring->wptr;
> +	u32 *r = ring->ring;
> +
> +	r[wptr++ & buf_mask] = v1;
> +	r[wptr++ & buf_mask] = v2;
> +	r[wptr++ & buf_mask] = v3;
> +	r[wptr++ & buf_mask] = v4;
> +	r[wptr++ & buf_mask] = v5;
> +	r[wptr++ & buf_mask] = v6;
> +
> +	ring->wptr = wptr & ring->ptr_mask;
> +	ring->count_dw -= 6;
> +}
> +
> +static inline void
> +amdgpu_ring_write7(struct amdgpu_ring *ring,
> +		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7)
> +{
> +	const u32 buf_mask = ring->buf_mask;
> +	u64 wptr = ring->wptr;
> +	u32 *r = ring->ring;
> +
> +	r[wptr++ & buf_mask] = v1;
> +	r[wptr++ & buf_mask] = v2;
> +	r[wptr++ & buf_mask] = v3;
> +	r[wptr++ & buf_mask] = v4;
> +	r[wptr++ & buf_mask] = v5;
> +	r[wptr++ & buf_mask] = v6;
> +	r[wptr++ & buf_mask] = v7;
> +
> +	ring->wptr = wptr & ring->ptr_mask;
> +	ring->count_dw -= 7;
> +}
> +
> +static inline void
> +amdgpu_ring_write8(struct amdgpu_ring *ring,
> +		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
> +		   u32 v8)
> +{
> +	const u32 buf_mask = ring->buf_mask;
> +	u64 wptr = ring->wptr;
> +	u32 *r = ring->ring;
> +
> +	r[wptr++ & buf_mask] = v1;
> +	r[wptr++ & buf_mask] = v2;
> +	r[wptr++ & buf_mask] = v3;
> +	r[wptr++ & buf_mask] = v4;
> +	r[wptr++ & buf_mask] = v5;
> +	r[wptr++ & buf_mask] = v6;
> +	r[wptr++ & buf_mask] = v7;
> +	r[wptr++ & buf_mask] = v8;
> +
> +	ring->wptr = wptr & ring->ptr_mask;
> +	ring->count_dw -= 8;
> +}
> +
> +static inline void
> +amdgpu_ring_write9(struct amdgpu_ring *ring,
> +		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
> +		   u32 v8, u32 v9)
> +{
> +	const u32 buf_mask = ring->buf_mask;
> +	u64 wptr = ring->wptr;
> +	u32 *r = ring->ring;
> +
> +	r[wptr++ & buf_mask] = v1;
> +	r[wptr++ & buf_mask] = v2;
> +	r[wptr++ & buf_mask] = v3;
> +	r[wptr++ & buf_mask] = v4;
> +	r[wptr++ & buf_mask] = v5;
> +	r[wptr++ & buf_mask] = v6;
> +	r[wptr++ & buf_mask] = v7;
> +	r[wptr++ & buf_mask] = v8;
> +	r[wptr++ & buf_mask] = v9;
> +
> +	ring->wptr = wptr & ring->ptr_mask;
> +	ring->count_dw -= 9;
> +}
> +
> +static inline void
> +amdgpu_ring_write10(struct amdgpu_ring *ring,
> +		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
> +		   u32 v8, u32 v9, u32 v10)
> +{
> +	const u32 buf_mask = ring->buf_mask;
> +	u64 wptr = ring->wptr;
> +	u32 *r = ring->ring;
> +
> +	r[wptr++ & buf_mask] = v1;
> +	r[wptr++ & buf_mask] = v2;
> +	r[wptr++ & buf_mask] = v3;
> +	r[wptr++ & buf_mask] = v4;
> +	r[wptr++ & buf_mask] = v5;
> +	r[wptr++ & buf_mask] = v6;
> +	r[wptr++ & buf_mask] = v7;
> +	r[wptr++ & buf_mask] = v8;
> +	r[wptr++ & buf_mask] = v9;
> +	r[wptr++ & buf_mask] = v10;
> +
> +	ring->wptr = wptr & ring->ptr_mask;
> +	ring->count_dw -= 10;
> +}
> +
> +static inline void
> +amdgpu_ring_write11(struct amdgpu_ring *ring,
> +		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
> +		   u32 v8, u32 v9, u32 v10, u32 v11)
> +{
> +	const u32 buf_mask = ring->buf_mask;
> +	u64 wptr = ring->wptr;
> +	u32 *r = ring->ring;
> +
> +	r[wptr++ & buf_mask] = v1;
> +	r[wptr++ & buf_mask] = v2;
> +	r[wptr++ & buf_mask] = v3;
> +	r[wptr++ & buf_mask] = v4;
> +	r[wptr++ & buf_mask] = v5;
> +	r[wptr++ & buf_mask] = v6;
> +	r[wptr++ & buf_mask] = v7;
> +	r[wptr++ & buf_mask] = v8;
> +	r[wptr++ & buf_mask] = v9;
> +	r[wptr++ & buf_mask] = v10;
> +	r[wptr++ & buf_mask] = v11;
> +
> +	ring->wptr = wptr & ring->ptr_mask;
> +	ring->count_dw -= 11;
> +}
> +
> +#define AMDGPU_RING_WRITE(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, NAME, ...) NAME
> +#define amdgpu_ring_write(...) \
> +	AMDGPU_RING_WRITE(__VA_ARGS__, \
> +			  amdgpu_ring_write11, \
> +			  amdgpu_ring_write10, \
> +			  amdgpu_ring_write9, \
> +			  amdgpu_ring_write8, \
> +			  amdgpu_ring_write7, \
> +			  amdgpu_ring_write6, \
> +			  amdgpu_ring_write5, \
> +			  amdgpu_ring_write4, \
> +			  amdgpu_ring_write3, \
> +			  amdgpu_ring_write2, \
> +			  amdgpu_ring_write1, \
> +			  NULL)(__VA_ARGS__)
> +
>   static inline void amdgpu_ring_write_multiple(struct amdgpu_ring *ring,
>   					      void *src, int count_dw)
>   {



More information about the amd-gfx mailing list