# [PATCH] drm/amdgpu: simplify padding calculations (v3)

Luben Tuikov luben.tuikov at amd.com
Thu Dec 19 03:13:15 UTC 2019

On 2019-12-18 9:59 p.m., Deucher, Alexander wrote:
>> -----Original Message-----
>> From: Tuikov, Luben <Luben.Tuikov at amd.com>
>> Sent: Wednesday, December 18, 2019 6:17 PM
>> To: amd-gfx at lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian
>> <Christian.Koenig at amd.com>
>> Subject: Re: [PATCH] drm/amdgpu: simplify padding calculations (v3)
>>
>> This patch seems to have been dropped after posting and reposting. I'm not
>> sure why it keeps being dropped.
>
> Dropped from what?  Once it's been reviewed, add the RBs and go ahead and push it to amd-staging-drm-next.

Oh, thanks!

I'll write it on a post-it note so I can remember.

Okay, so then that's what I need to do.

Regards,
Luben

>
> Alex
>
>>
>> In v3, I added an explanation in the commit text below the original commit
>> text of the single sentence of "Simplify padding calculations." Identical
>> diffstat as v2.
>>
>> Regards,
>> Luben
>>
>> On 2019-12-18 6:12 p.m., Luben Tuikov wrote:
>>> Simplify padding calculations.
>>>
>>> 1. Taking the remainder of a binary value when the divisor is a power
>>> of two, such as, a % 2^n is identical to, a & (2^n - 1) in base-2
>>> arithmetic, and so expressions like this:
>>>
>>> (12 - (lower_32_bits(ring->wptr) & 7)) % 8
>>>
>>> are superflous. And can be reduced to a single remainder-taking
>>> operation.
>>>
>>> 2. Subtracting the remainder modulo 8 out of 12 and then again taking
>>> the remainder modulo 8, yields the same result as simply subtracting
>>> the value out of 4 and then taking the remainder.
>>> This is true because 4 belongs to the congruence
>>> (residue) class {4 + 8*k}, k in Z. (So using,  {..., -12, -4, 4, 12,
>>> 20, ...}, would yield  the same final result.
>>>
>>> So the above expression, becomes,
>>>
>>> (4 - lower_32_bits(ring->wptr)) & 7
>>>
>>> 3. Now since 8 is part of the conguence class
>>> {0 + 8*k}, k in Z, and from 1) and 2) above, then,
>>>
>>> (8 - (ib->length_dw & 0x7)) % 8
>>>
>>> becomes, simply,
>>>
>>> (-ib->length_dw) & 7.
>>>
>>> This patch implements all of the above in this code.
>>>
>>> v2: Comment update and spacing.
>>> v3: Add 1, 2, 3, above, in this commit message.
>>>
>>> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>>> drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>>> drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>>> drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>>> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 17 ++++++++++++-----
>>>  5 files changed, 20 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> index 1f22a8d0f7f3..4274ccf765de 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct
>> amdgpu_ring *ring,
>>>  	u32 extra_bits = vmid & 0xf;
>>>
>>>  	/* IB packet must end on a 8 DW boundary */
>>> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) &
>> 7)) % 8);
>>> +	cik_sdma_ring_insert_nop(ring, (4 - lower_32_bits(ring->wptr)) & 7);
>>>
>>>  	amdgpu_ring_write(ring,
>> SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>>>  	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must
>> be
>>> 32 byte aligned */ @@ -811,7 +811,7 @@ static void
>> cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>  	int i;
>>>
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>  	for (i = 0; i < pad_count; i++)
>>>  		if (sdma && sdma->burst_nop && (i == 0))
>>>  			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> index 606b621145a1..fd7fa6082563 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct
>> amdgpu_ring *ring,
>>>  	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>
>>>  	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) &
>> 7)) % 8);
>>> +	sdma_v2_4_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) &
>>> +7);
>>>
>>>  	amdgpu_ring_write(ring,
>>>  			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -750,7 +750,7 @@
>>> static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct
>> amdgpu_ib *ib
>>>  	int i;
>>>
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>  	for (i = 0; i < pad_count; i++)
>>>  		if (sdma && sdma->burst_nop && (i == 0))
>>>  			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> index a559573ec8fd..4a8a7f0f3a9c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct
>> amdgpu_ring *ring,
>>>  	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>
>>>  	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) &
>> 7)) % 8);
>>> +	sdma_v3_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) &
>>> +7);
>>>
>>>  	amdgpu_ring_write(ring,
>>>  			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -1021,7 +1021,7
>>> @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct
>> amdgpu_ib *ib
>>>  	int i;
>>>
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>  	for (i = 0; i < pad_count; i++)
>>>  		if (sdma && sdma->burst_nop && (i == 0))
>>>  			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> index bd9ed33bab43..c69df0cb21ec 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct
>> amdgpu_ring *ring,
>>>  	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>
>>>  	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) &
>> 7)) % 8);
>>> +	sdma_v4_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) &
>>> +7);
>>>
>>>  	amdgpu_ring_write(ring,
>>>  			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -1579,7 +1579,7
>>> @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct
>> amdgpu_ib *ib
>>>  	int i;
>>>
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>  	for (i = 0; i < pad_count; i++)
>>>  		if (sdma && sdma->burst_nop && (i == 0))
>>>  			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> index 119364293cec..3912937f878f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> @@ -382,8 +382,15 @@ static void sdma_v5_0_ring_emit_ib(struct
>> amdgpu_ring *ring,
>>>  	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>> vmid);
>>>
>>> -	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) &
>> 7)) % 8);
>>> +	/* An IB packet must end on a 8 DW boundary--the next dword
>>> +	 * must be on a 8-dword boundary. Our IB packet below is 6
>>> +	 * dwords long, thus add x number of NOPs, such that, in
>>> +	 * modular arithmetic,
>>> +	 * wptr + 6 + x = 8k, k >= 0, which in C is,
>>> +	 * (wptr + 6 + x) % 8 = 0.
>>> +	 * The expression below, is a solution of x.
>>> +	 */
>>> +	sdma_v5_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) &
>>> +7);
>>>
>>>  	amdgpu_ring_write(ring,
>>>  			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>> @@ -1076,10 +1083,10
>>> @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,  }
>>>
>>>  /**
>>> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
>>> - *
>>> + * sdma_v5_0_ring_pad_ib - pad the IB
>>>   * @ib: indirect buffer to fill with padding
>>>   *
>>> + * Pad the IB with NOPs to a boundary multiple of 8.
>>>   */
>>>  static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct
>>> amdgpu_ib *ib)  { @@ -1087,7 +1094,7 @@ static void
>>> sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib