[PATCH 09/12] drm/amdgpu: Optimise amdgpu_ring_write()
Tvrtko Ursulin
tursulin at igalia.com
Fri Dec 27 11:19:35 UTC 2024
From: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
There are more than 2000 calls to amdgpu_ring_write() in the driver, and
the majority are runs of sequential calls which the compiler cannot
optimise much.
Let's make this helper variadic via some pre-processor magic, which
allows merging those sequential calls into one and in turn enables the
compiler to emit much less code.
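For reference, the pre-processor trick is plain argument counting: a
selector macro returns a fixed argument position, and the wrapper appends
a descending list of helper names after __VA_ARGS__ so the argument count
picks the matching helper. A minimal standalone sketch of the pattern
(made-up names, capped at three arguments for brevity):

  /*
   * PICK() returns its fourth argument; the names appended after
   * __VA_ARGS__ shift left as more values are passed, so the
   * argument count selects the helper.
   */
  #define PICK(_1, _2, _3, NAME, ...) NAME
  #define write(...) \
          PICK(__VA_ARGS__, write3, write2, write1, NULL)(__VA_ARGS__)

  write(a);       /* expands to write1(a) */
  write(a, b);    /* expands to write2(a, b) */
  write(a, b, c); /* expands to write3(a, b, c) */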
If we then convert some call sites to use this macro, we can see what
that brings using amdgpu_vce_ring_emit_ib() as an example.
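Going by the disassembly below, the four sequential dword writes in that
function collapse into a single call; a sketch of what the converted body
could look like (assuming the usual VCE_CMD_IB sequence of command, IB
address low/high and length):

  amdgpu_ring_write(ring, VCE_CMD_IB,
                    lower_32_bits(ib->gpu_addr),
                    upper_32_bits(ib->gpu_addr),
                    ib->length_dw);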
Before (but after the wptr local caching patch; without that it is even worse):
173c39: 48 89 f8 mov %rdi,%rax
173c3c: 48 89 d1 mov %rdx,%rcx
173c3f: 48 8b 97 c8 01 00 00 mov 0x1c8(%rdi),%rdx
173c46: 48 8b b0 a8 01 00 00 mov 0x1a8(%rax),%rsi
173c4d: 89 d7 mov %edx,%edi
173c4f: 23 b8 f8 01 00 00 and 0x1f8(%rax),%edi
173c55: 48 83 c2 01 add $0x1,%rdx
173c59: c7 04 be 02 00 00 00 movl $0x2,(%rsi,%rdi,4)
173c60: 48 23 90 f0 01 00 00 and 0x1f0(%rax),%rdx
173c67: 48 89 90 c8 01 00 00 mov %rdx,0x1c8(%rax)
173c6e: 48 8b b0 a8 01 00 00 mov 0x1a8(%rax),%rsi
173c75: 89 d7 mov %edx,%edi
173c77: 48 83 c2 01 add $0x1,%rdx
173c7b: 83 a8 e0 01 00 00 01 subl $0x1,0x1e0(%rax)
173c82: 4c 8b 41 10 mov 0x10(%rcx),%r8
173c86: 23 b8 f8 01 00 00 and 0x1f8(%rax),%edi
173c8c: 44 89 04 be mov %r8d,(%rsi,%rdi,4)
173c90: 48 23 90 f0 01 00 00 and 0x1f0(%rax),%rdx
173c97: 48 89 90 c8 01 00 00 mov %rdx,0x1c8(%rax)
173c9e: 48 8b b0 a8 01 00 00 mov 0x1a8(%rax),%rsi
173ca5: 89 d7 mov %edx,%edi
173ca7: 48 83 c2 01 add $0x1,%rdx
173cab: 83 a8 e0 01 00 00 01 subl $0x1,0x1e0(%rax)
173cb2: 44 8b 41 14 mov 0x14(%rcx),%r8d
173cb6: 23 b8 f8 01 00 00 and 0x1f8(%rax),%edi
173cbc: 44 89 04 be mov %r8d,(%rsi,%rdi,4)
173cc0: 48 23 90 f0 01 00 00 and 0x1f0(%rax),%rdx
173cc7: 48 89 90 c8 01 00 00 mov %rdx,0x1c8(%rax)
173cce: 89 d6 mov %edx,%esi
173cd0: 23 b0 f8 01 00 00 and 0x1f8(%rax),%esi
173cd6: 48 83 c2 01 add $0x1,%rdx
173cda: 83 a8 e0 01 00 00 01 subl $0x1,0x1e0(%rax)
173ce1: 8b 79 08 mov 0x8(%rcx),%edi
173ce4: 48 8b 88 a8 01 00 00 mov 0x1a8(%rax),%rcx
173ceb: 89 3c b1 mov %edi,(%rcx,%rsi,4)
173cee: 48 23 90 f0 01 00 00 and 0x1f0(%rax),%rdx
173cf5: 83 a8 e0 01 00 00 01 subl $0x1,0x1e0(%rax)
173cfc: 48 89 90 c8 01 00 00 mov %rdx,0x1c8(%rax)
173d03: c3 ret
And after, at roughly half the code size (~108 vs ~203 bytes):
1579: 48 89 f8 mov %rdi,%rax
157c: 44 8b 4a 08 mov 0x8(%rdx),%r9d
1580: 48 8b 7a 10 mov 0x10(%rdx),%rdi
1584: 48 8b 90 c8 01 00 00 mov 0x1c8(%rax),%rdx
158b: 8b b0 f8 01 00 00 mov 0x1f8(%rax),%esi
1591: 48 8b 88 a8 01 00 00 mov 0x1a8(%rax),%rcx
1598: 49 89 d0 mov %rdx,%r8
159b: 49 21 f0 and %rsi,%r8
159e: 42 c7 04 81 02 00 00 movl $0x2,(%rcx,%r8,4)
15a5: 00
15a6: 4c 8d 42 01 lea 0x1(%rdx),%r8
15aa: 49 21 f0 and %rsi,%r8
15ad: 42 89 3c 81 mov %edi,(%rcx,%r8,4)
15b1: 4c 8d 42 02 lea 0x2(%rdx),%r8
15b5: 48 c1 ef 20 shr $0x20,%rdi
15b9: 49 21 f0 and %rsi,%r8
15bc: 42 89 3c 81 mov %edi,(%rcx,%r8,4)
15c0: 48 8d 7a 03 lea 0x3(%rdx),%rdi
15c4: 48 83 c2 04 add $0x4,%rdx
15c8: 48 21 fe and %rdi,%rsi
15cb: 44 89 0c b1 mov %r9d,(%rcx,%rsi,4)
15cf: 48 23 90 f0 01 00 00 and 0x1f0(%rax),%rdx
15d6: 83 a8 e0 01 00 00 04 subl $0x4,0x1e0(%rax)
15dd: 48 89 90 c8 01 00 00 mov %rdx,0x1c8(%rax)
15e4: c3 ret
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
Cc: Christian König <christian.koenig at amd.com>
Cc: Sunil Khatri <sunil.khatri at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 222 ++++++++++++++++++++++-
1 file changed, 220 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index b57951d8c997..4f467864ed09 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -383,15 +383,233 @@ static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
memset32(ring->ring, ring->funcs->nop, ring->buf_mask + 1);
}
-static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)
+static inline void
+amdgpu_ring_write1(struct amdgpu_ring *ring,
+ u32 v1)
{
+ const u32 buf_mask = ring->buf_mask;
u64 wptr = ring->wptr;
- ring->ring[wptr++ & ring->buf_mask] = v;
+ ring->ring[wptr++ & buf_mask] = v1;
ring->wptr = wptr & ring->ptr_mask;
ring->count_dw--;
}
+static inline void
+amdgpu_ring_write2(struct amdgpu_ring *ring,
+ u32 v1, u32 v2)
+{
+ const u32 buf_mask = ring->buf_mask;
+ u64 wptr = ring->wptr;
+ u32 *r = ring->ring;
+
+ r[wptr++ & buf_mask] = v1;
+ r[wptr++ & buf_mask] = v2;
+
+ ring->wptr = wptr & ring->ptr_mask;
+ ring->count_dw -= 2;
+}
+
+static inline void
+amdgpu_ring_write3(struct amdgpu_ring *ring,
+ u32 v1, u32 v2, u32 v3)
+{
+ const u32 buf_mask = ring->buf_mask;
+ u64 wptr = ring->wptr;
+ u32 *r = ring->ring;
+
+ r[wptr++ & buf_mask] = v1;
+ r[wptr++ & buf_mask] = v2;
+ r[wptr++ & buf_mask] = v3;
+
+ ring->wptr = wptr & ring->ptr_mask;
+ ring->count_dw -= 3;
+}
+
+static inline void
+amdgpu_ring_write4(struct amdgpu_ring *ring,
+ u32 v1, u32 v2, u32 v3, u32 v4)
+{
+ const u32 buf_mask = ring->buf_mask;
+ u64 wptr = ring->wptr;
+ u32 *r = ring->ring;
+
+ r[wptr++ & buf_mask] = v1;
+ r[wptr++ & buf_mask] = v2;
+ r[wptr++ & buf_mask] = v3;
+ r[wptr++ & buf_mask] = v4;
+
+ ring->wptr = wptr & ring->ptr_mask;
+ ring->count_dw -= 4;
+}
+
+static inline void
+amdgpu_ring_write5(struct amdgpu_ring *ring,
+ u32 v1, u32 v2, u32 v3, u32 v4, u32 v5)
+{
+ const u32 buf_mask = ring->buf_mask;
+ u64 wptr = ring->wptr;
+ u32 *r = ring->ring;
+
+ r[wptr++ & buf_mask] = v1;
+ r[wptr++ & buf_mask] = v2;
+ r[wptr++ & buf_mask] = v3;
+ r[wptr++ & buf_mask] = v4;
+ r[wptr++ & buf_mask] = v5;
+
+ ring->wptr = wptr & ring->ptr_mask;
+ ring->count_dw -= 5;
+}
+
+static inline void
+amdgpu_ring_write6(struct amdgpu_ring *ring,
+ u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6)
+{
+ const u32 buf_mask = ring->buf_mask;
+ u64 wptr = ring->wptr;
+ u32 *r = ring->ring;
+
+ r[wptr++ & buf_mask] = v1;
+ r[wptr++ & buf_mask] = v2;
+ r[wptr++ & buf_mask] = v3;
+ r[wptr++ & buf_mask] = v4;
+ r[wptr++ & buf_mask] = v5;
+ r[wptr++ & buf_mask] = v6;
+
+ ring->wptr = wptr & ring->ptr_mask;
+ ring->count_dw -= 6;
+}
+
+static inline void
+amdgpu_ring_write7(struct amdgpu_ring *ring,
+ u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7)
+{
+ const u32 buf_mask = ring->buf_mask;
+ u64 wptr = ring->wptr;
+ u32 *r = ring->ring;
+
+ r[wptr++ & buf_mask] = v1;
+ r[wptr++ & buf_mask] = v2;
+ r[wptr++ & buf_mask] = v3;
+ r[wptr++ & buf_mask] = v4;
+ r[wptr++ & buf_mask] = v5;
+ r[wptr++ & buf_mask] = v6;
+ r[wptr++ & buf_mask] = v7;
+
+ ring->wptr = wptr & ring->ptr_mask;
+ ring->count_dw -= 7;
+}
+
+static inline void
+amdgpu_ring_write8(struct amdgpu_ring *ring,
+ u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
+ u32 v8)
+{
+ const u32 buf_mask = ring->buf_mask;
+ u64 wptr = ring->wptr;
+ u32 *r = ring->ring;
+
+ r[wptr++ & buf_mask] = v1;
+ r[wptr++ & buf_mask] = v2;
+ r[wptr++ & buf_mask] = v3;
+ r[wptr++ & buf_mask] = v4;
+ r[wptr++ & buf_mask] = v5;
+ r[wptr++ & buf_mask] = v6;
+ r[wptr++ & buf_mask] = v7;
+ r[wptr++ & buf_mask] = v8;
+
+ ring->wptr = wptr & ring->ptr_mask;
+ ring->count_dw -= 8;
+}
+
+static inline void
+amdgpu_ring_write9(struct amdgpu_ring *ring,
+ u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
+ u32 v8, u32 v9)
+{
+ const u32 buf_mask = ring->buf_mask;
+ u64 wptr = ring->wptr;
+ u32 *r = ring->ring;
+
+ r[wptr++ & buf_mask] = v1;
+ r[wptr++ & buf_mask] = v2;
+ r[wptr++ & buf_mask] = v3;
+ r[wptr++ & buf_mask] = v4;
+ r[wptr++ & buf_mask] = v5;
+ r[wptr++ & buf_mask] = v6;
+ r[wptr++ & buf_mask] = v7;
+ r[wptr++ & buf_mask] = v8;
+ r[wptr++ & buf_mask] = v9;
+
+ ring->wptr = wptr & ring->ptr_mask;
+ ring->count_dw -= 9;
+}
+
+static inline void
+amdgpu_ring_write10(struct amdgpu_ring *ring,
+ u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
+ u32 v8, u32 v9, u32 v10)
+{
+ const u32 buf_mask = ring->buf_mask;
+ u64 wptr = ring->wptr;
+ u32 *r = ring->ring;
+
+ r[wptr++ & buf_mask] = v1;
+ r[wptr++ & buf_mask] = v2;
+ r[wptr++ & buf_mask] = v3;
+ r[wptr++ & buf_mask] = v4;
+ r[wptr++ & buf_mask] = v5;
+ r[wptr++ & buf_mask] = v6;
+ r[wptr++ & buf_mask] = v7;
+ r[wptr++ & buf_mask] = v8;
+ r[wptr++ & buf_mask] = v9;
+ r[wptr++ & buf_mask] = v10;
+
+ ring->wptr = wptr & ring->ptr_mask;
+ ring->count_dw -= 10;
+}
+
+static inline void
+amdgpu_ring_write11(struct amdgpu_ring *ring,
+ u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
+ u32 v8, u32 v9, u32 v10, u32 v11)
+{
+ const u32 buf_mask = ring->buf_mask;
+ u64 wptr = ring->wptr;
+ u32 *r = ring->ring;
+
+ r[wptr++ & buf_mask] = v1;
+ r[wptr++ & buf_mask] = v2;
+ r[wptr++ & buf_mask] = v3;
+ r[wptr++ & buf_mask] = v4;
+ r[wptr++ & buf_mask] = v5;
+ r[wptr++ & buf_mask] = v6;
+ r[wptr++ & buf_mask] = v7;
+ r[wptr++ & buf_mask] = v8;
+ r[wptr++ & buf_mask] = v9;
+ r[wptr++ & buf_mask] = v10;
+ r[wptr++ & buf_mask] = v11;
+
+ ring->wptr = wptr & ring->ptr_mask;
+ ring->count_dw -= 11;
+}
+
+#define AMDGPU_RING_WRITE(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, NAME, ...) NAME
+#define amdgpu_ring_write(...) \
+ AMDGPU_RING_WRITE(__VA_ARGS__, \
+ amdgpu_ring_write11, \
+ amdgpu_ring_write10, \
+ amdgpu_ring_write9, \
+ amdgpu_ring_write8, \
+ amdgpu_ring_write7, \
+ amdgpu_ring_write6, \
+ amdgpu_ring_write5, \
+ amdgpu_ring_write4, \
+ amdgpu_ring_write3, \
+ amdgpu_ring_write2, \
+ amdgpu_ring_write1, \
+ NULL)(__VA_ARGS__)
+
static inline void amdgpu_ring_write_multiple(struct amdgpu_ring *ring,
void *src, int count_dw)
{
--
2.47.1