[PATCH] drm/amdgpu: simplify padding calculations

Tuikov, Luben Luben.Tuikov at amd.com
Sun Oct 27 21:46:00 UTC 2019


On 2019-10-26 08:05, Koenig, Christian wrote:
> Am 25.10.19 um 23:51 schrieb Tuikov, Luben:
>> On 2019-10-25 2:54 a.m., Koenig, Christian wrote:
>>> Am 25.10.19 um 01:44 schrieb Tuikov, Luben:
>>>> Simplify padding calculations.
>>>>
>>>> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
>>>>    5 files changed, 17 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>>> index c45304f1047c..1ea9e18d6f08 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>>> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	u32 extra_bits = vmid & 0xf;
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>>>>    	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
>>>> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>>> index a10175838013..d340f179401a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>>> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>>> index 5f4e2c616241..5c3c310188b6 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>>> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> index 45bd538ba97e..7c71c88e38a4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    
>>>>    	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> index 0c41b4fdc58b..67ede9e4df01 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>>> @@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>>    	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>>>>    
>>>> -	/* IB packet must end on a 8 DW boundary */
>>>> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>>> +	/* An IB packet must end on a 8 DW boundary--the next dword must
>>>> +	 * be on a 8-dword boundary. Our IB below is 6 dwords long,
>>> That should probably read "our IB command" or "our IB packet".
>> Done.
>>
>>>> +	 * thus add x number of NOPs, such that
>>>> +	 * wptr + 6 + x = 8k, k >= 0.
>>> Maybe write (wrptr + 6 + x) % 8 = 0 to make that more obvious.
>> That's less obvious. In my mind I always translate it to
>> a congruence expression. It's what I used to derive the algebraic
>> simplifications of this patch.
>>
>> I'll add the highly cryptic "(wrptr + 6 + x) % 8 = 0" to make you happy,
>> below the algebraic expression already there.
> 
> The problem I see here is that people who are used to GPUs will think 
> that k is a float and not immediately get what's meant here.
> 
>>>> +	 */
>>>> +	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>> Spaces between operators please.
>> Yeah, I wanted to put an emphasis on the AND-op, but I'll add them.
> 
> What would be really nice to have is to keep the 10 as number of DW in 
> the IB packet around.

Why? Why 10?

As far as I can see, THE IB packet is 6 dwords, we want to pad with some
number of dwords, in order to make sure that the wptr + 6, ends
up on a boundary multiple of 8. I.e. we want to solve exactly this,
for x:

wptr + 6 + x = 8k, k >= 0.

x = -(wtpr + 6) = -6 - wptr = 2 - wptr (mod 8)

Also note that, 10 = 2 = -6 (mod 8).

I believe the patch is good as is.

Did you see v2 of the patch?

Regards,
Luben


> 
> Maybe add an "static inline amdgpu_ring_unalign(ring, 10, 8)" helper or 
> something like that, the insert_nop is a ring callback anyway IIRC.
> 
> Regards,
> Christian.
> 
>>
>> Regards,
>> Luben
>>
>>
>>> Apart form that looks good to me,
>>> Christian.
>>>
>>>>    
>>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>>> @@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>>>>    }
>>>>    
>>>>    /**
>>>> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
>>>> - *
>>>> + * sdma_v5_0_ring_pad_ib - pad the IB
>>>>     * @ib: indirect buffer to fill with padding
>>>>     *
>>>> + * Pad the IB with NOPs to a boundary multiple of 8.
>>>>     */
>>>>    static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>>    {
>>>> @@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>>    	u32 pad_count;
>>>>    	int i;
>>>>    
>>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>>> +	pad_count = (-ib->length_dw) & 0x7;
>>>>    	for (i = 0; i < pad_count; i++)
>>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>>    			ib->ptr[ib->length_dw++] =
> 



More information about the amd-gfx mailing list