# [PATCH] drm/amdgpu: simplify padding calculations (v3)

Luben Tuikov luben.tuikov at amd.com
Wed Dec 18 23:12:17 UTC 2019

```Simplify padding calculations.

1. Taking the remainder of a binary value when
the divisor is a power of two, such as,
a % 2^n is identical to, a & (2^n - 1) in base-2
arithmetic, and so expressions like this:

(12 - (lower_32_bits(ring->wptr) & 7)) % 8

are superflous. And can be reduced to a single
remainder-taking operation.

2. Subtracting the remainder modulo 8 out of 12
and then again taking the remainder modulo 8,
yields the same result as simply subtracting
the value out of 4 and then taking the remainder.
This is true because 4 belongs to the congruence
(residue) class {4 + 8*k}, k in Z. (So using,
{..., -12, -4, 4, 12, 20, ...}, would yield
the same final result.

So the above expression, becomes,

(4 - lower_32_bits(ring->wptr)) & 7

3. Now since 8 is part of the conguence class
{0 + 8*k}, k in Z, and from 1) and 2) above, then,

(8 - (ib->length_dw & 0x7)) % 8

becomes, simply,

(-ib->length_dw) & 7.

This patch implements all of the above in this code.

v2: Comment update and spacing.
v3: Add 1, 2, 3, above, in this commit message.

Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
---
drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 17 ++++++++++++-----
5 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index 1f22a8d0f7f3..4274ccf765de 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
u32 extra_bits = vmid & 0xf;

/* IB packet must end on a 8 DW boundary */
-	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	cik_sdma_ring_insert_nop(ring, (4 - lower_32_bits(ring->wptr)) & 7);

amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
@@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
int i;

-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
for (i = 0; i < pad_count; i++)
if (sdma && sdma->burst_nop && (i == 0))
ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index 606b621145a1..fd7fa6082563 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
unsigned vmid = AMDGPU_JOB_GET_VMID(job);

/* IB packet must end on a 8 DW boundary */
-	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v2_4_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);

@@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
int i;

-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
for (i = 0; i < pad_count; i++)
if (sdma && sdma->burst_nop && (i == 0))
ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index a559573ec8fd..4a8a7f0f3a9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
unsigned vmid = AMDGPU_JOB_GET_VMID(job);

/* IB packet must end on a 8 DW boundary */
-	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v3_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);

@@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
int i;

-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
for (i = 0; i < pad_count; i++)
if (sdma && sdma->burst_nop && (i == 0))
ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index bd9ed33bab43..c69df0cb21ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
unsigned vmid = AMDGPU_JOB_GET_VMID(job);

/* IB packet must end on a 8 DW boundary */
-	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	sdma_v4_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);

@@ -1579,7 +1579,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
int i;

-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 7;
for (i = 0; i < pad_count; i++)
if (sdma && sdma->burst_nop && (i == 0))
ib->ptr[ib->length_dw++] =
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 119364293cec..3912937f878f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -382,8 +382,15 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
unsigned vmid = AMDGPU_JOB_GET_VMID(job);

-	/* IB packet must end on a 8 DW boundary */
-	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
+	/* An IB packet must end on a 8 DW boundary--the next dword
+	 * must be on a 8-dword boundary. Our IB packet below is 6
+	 * dwords long, thus add x number of NOPs, such that, in
+	 * modular arithmetic,
+	 * wptr + 6 + x = 8k, k >= 0, which in C is,
+	 * (wptr + 6 + x) % 8 = 0.
+	 * The expression below, is a solution of x.
+	 */
+	sdma_v5_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & 7);

@@ -1076,10 +1083,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
}

/**
- * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
- *
* @ib: indirect buffer to fill with padding
*
+ * Pad the IB with NOPs to a boundary multiple of 8.
*/
static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
{
@@ -1087,7 +1094,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
int i;

-	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
+	pad_count = (-ib->length_dw) & 0x7;
for (i = 0; i < pad_count; i++)
if (sdma && sdma->burst_nop && (i == 0))
ib->ptr[ib->length_dw++] =
--
2.24.0.390.g083378cc35

```