[PATCH 07/12] drm/amdgpu: Convert JPEG, VCE and UVD to more efficient ring padding

Tvrtko Ursulin tursulin at igalia.com
Fri Dec 27 11:19:33 UTC 2024


From: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>

Similarly as in the previous patch, but now we add a new
amdgpu_ring_fill2x32() helper which can write out the nops more
efficiently using memset64().

This should have a lesser effect than the previous patch, given how the
affected rings have at most 64 dword alignment restriction (in contrast to
many with 256 with GFX). Nevertheless, it sounds plausible to utilise the
identical approach and still achieve that little bit more efficiency.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
Cc: Christian König <ckoenig.leichtzumerken at gmail.com>
Cc: Sunil Khatri <sunil.khatri at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 41 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c   |  8 ++---
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c   |  8 ++---
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c |  8 ++---
 drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c    |  7 +---
 drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c    |  7 +---
 drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c    |  7 +---
 drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c    |  7 +---
 drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c    |  9 +++---
 drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c    |  8 ++---
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c    |  7 ++--
 11 files changed, 60 insertions(+), 57 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 0a59738fa1d4..158238f8c06a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -441,6 +441,47 @@ static inline void amdgpu_ring_fill(struct amdgpu_ring *ring,
 	ring->count_dw -= count;
 }
 
+static inline void amdgpu_ring_fill2x32(struct amdgpu_ring *ring,
+					u32 v1, u32 v2, u32 count)
+{
+	const u32 buf_mask = ring->buf_mask;
+	u32 occupied, chunk1, chunk2;
+	u64 wptr = ring->wptr;
+	u64 v;
+
+	if (count == 0)
+		return;
+
+	occupied = wptr & buf_mask;
+	chunk1 = buf_mask + 1 - occupied;
+	count *= 2; /* To dwords */
+	chunk1 = (chunk1 >= count) ? count : chunk1;
+	chunk2 = count - chunk1;
+
+	if (WARN_ON_ONCE((chunk1 | chunk2) & 1))
+		return;
+
+	/* To qwords */
+	chunk1 >>= 1;
+	chunk2 >>= 1;
+
+#ifdef __BIG_ENDIAN
+	v = (u64)v1 << 32 | v2;
+#else
+	v = (u64)v2 << 32 | v1;
+#endif
+
+	if (chunk1)
+		memset64((u64 *)&ring->ring[occupied], v, chunk1);
+
+	if (chunk2)
+		memset64((u64 *)&ring->ring[0], v, chunk2);
+
+	wptr += count;
+	ring->wptr = wptr & ring->ptr_mask;
+	ring->count_dw -= count;
+}
+
 /**
  * amdgpu_ring_patch_cond_exec - patch dw count of conditional execute
  * @ring: amdgpu_ring structure
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c
index 03b8b7cd5229..1c4ec0330391 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c
@@ -418,14 +418,10 @@ static void jpeg_v1_0_decode_ring_emit_wreg(struct amdgpu_ring *ring,
 
 static void jpeg_v1_0_decode_ring_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill2x32(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6), 0,
+			     count / 2);
 }
 
 static int jpeg_v1_0_set_interrupt_state(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c
index 7c9251c03815..4b39898aacc9 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c
@@ -645,14 +645,10 @@ void jpeg_v2_0_dec_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint32
 
 void jpeg_v2_0_dec_ring_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill2x32(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6), 0,
+			     count / 2);
 }
 
 static bool jpeg_v2_0_is_idle(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 88f9771c1686..5e503a5c1926 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -897,14 +897,10 @@ void jpeg_v4_0_3_dec_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint
 
 void jpeg_v4_0_3_dec_ring_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill2x32(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6), 0,
+			     count / 2);
 }
 
 static bool jpeg_v4_0_3_is_idle(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
index 5830e799c0a3..4faf8ca2773d 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
@@ -166,14 +166,9 @@ static int uvd_v3_1_ring_test_ring(struct amdgpu_ring *ring)
 
 static void uvd_v3_1_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(mmUVD_NO_OP, 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill2x32(ring, PACKET0(mmUVD_NO_OP, 0), 0, count / 2);
 }
 
 static const struct amdgpu_ring_funcs uvd_v3_1_ring_funcs = {
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
index f93079e09215..05ac019e00b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
@@ -553,14 +553,9 @@ static void uvd_v4_2_ring_emit_ib(struct amdgpu_ring *ring,
 
 static void uvd_v4_2_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(mmUVD_NO_OP, 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill2x32(ring, PACKET0(mmUVD_NO_OP, 0), 0, count / 2);
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
index 050a0f309390..dde0d7d508ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
@@ -570,14 +570,9 @@ static void uvd_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
 
 static void uvd_v5_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(mmUVD_NO_OP, 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill2x32(ring, PACKET0(mmUVD_NO_OP, 0), 0, count / 2);
 }
 
 static bool uvd_v5_0_is_idle(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
index d9d036ee51fb..095e71d509dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
@@ -1106,14 +1106,9 @@ static void uvd_v6_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
 
 static void uvd_v6_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(mmUVD_NO_OP, 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill2x32(ring, PACKET0(mmUVD_NO_OP, 0), 0, count / 2);
 }
 
 static void uvd_v6_0_enc_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
index 9d237b5937fb..31005dbde8a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
@@ -1416,14 +1416,13 @@ static void uvd_v7_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 static void uvd_v7_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
 	struct amdgpu_device *adev = ring->adev;
-	int i;
 
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(SOC15_REG_OFFSET(UVD, ring->me, mmUVD_NO_OP), 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill2x32(ring,
+			   PACKET0(SOC15_REG_OFFSET(UVD, ring->me, mmUVD_NO_OP), 0),
+			   0,
+			   count / 2);
 }
 
 static void uvd_v7_0_enc_ring_insert_end(struct amdgpu_ring *ring)
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
index 5ea96c983517..2f848f302069 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
@@ -1790,14 +1790,12 @@ static int vcn_v1_0_process_interrupt(struct amdgpu_device *adev,
 static void vcn_v1_0_dec_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
 	struct amdgpu_device *adev = ring->adev;
-	int i;
 
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(SOC15_REG_OFFSET(UVD, 0, mmUVD_NO_OP), 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill2x32(ring,
+			   PACKET0(SOC15_REG_OFFSET(UVD, 0, mmUVD_NO_OP), 0), 0,
+			   count / 2);
 }
 
 static int vcn_v1_0_set_powergating_state(struct amdgpu_ip_block *ip_block,
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
index e42cfc731ad8..9d24ba173f09 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -1453,14 +1453,11 @@ void vcn_v2_0_dec_ring_insert_end(struct amdgpu_ring *ring)
 void vcn_v2_0_dec_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
 	struct amdgpu_device *adev = ring->adev;
-	int i;
 
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(adev->vcn.internal.nop, 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill2x32(ring, PACKET0(adev->vcn.internal.nop, 0), 0,
+			     count / 2);
 }
 
 /**
-- 
2.47.1



More information about the amd-gfx mailing list