[PATCH] drm/amdgpu: simplify padding calculations

Koenig, Christian Christian.Koenig at amd.com
Sat Oct 26 12:05:43 UTC 2019


Am 25.10.19 um 23:51 schrieb Tuikov, Luben:
> On 2019-10-25 2:54 a.m., Koenig, Christian wrote:
>> Am 25.10.19 um 01:44 schrieb Tuikov, Luben:
>>> Simplify padding calculations.
>>>
>>> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/cik_sdma.c  |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++-----
>>>    5 files changed, 17 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> index c45304f1047c..1ea9e18d6f08 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
>>> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	u32 extra_bits = vmid & 0xf;
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits));
>>>    	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */
>>> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> index a10175838013..d340f179401a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
>>> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> index 5f4e2c616241..5c3c310188b6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
>>> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> index 45bd538ba97e..7c71c88e38a4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    
>>>    	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> index 0c41b4fdc58b..67ede9e4df01 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> @@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
>>>    	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>>>    	uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid);
>>>    
>>> -	/* IB packet must end on a 8 DW boundary */
>>> -	sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8);
>>> +	/* An IB packet must end on a 8 DW boundary--the next dword must
>>> +	 * be on a 8-dword boundary. Our IB below is 6 dwords long,
>> That should probably read "our IB command" or "our IB packet".
> Done.
>
>>> +	 * thus add x number of NOPs, such that
>>> +	 * wptr + 6 + x = 8k, k >= 0.
>> Maybe write (wrptr + 6 + x) % 8 = 0 to make that more obvious.
> That's less obvious. In my mind I always translate it to
> a congruence expression. It's what I used to derive the algebraic
> simplifications of this patch.
>
> I'll add the highly cryptic "(wrptr + 6 + x) % 8 = 0" to make you happy,
> below the algebraic expression already there.

The problem I see here is that people who are used to GPUs will think 
that k is a float and not immediately get what's meant here.

>>> +	 */
>>> +	sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7);
>> Spaces between operators please.
> Yeah, I wanted to put an emphasis on the AND-op, but I'll add them.

What would be really nice to have is to keep the 10 as number of DW in 
the IB packet around.

Maybe add an "static inline amdgpu_ring_unalign(ring, 10, 8)" helper or 
something like that, the insert_nop is a ring callback anyway IIRC.

Regards,
Christian.

>
> Regards,
> Luben
>
>
>> Apart form that looks good to me,
>> Christian.
>>
>>>    
>>>    	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) |
>>>    			  SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf));
>>> @@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>>>    }
>>>    
>>>    /**
>>> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw
>>> - *
>>> + * sdma_v5_0_ring_pad_ib - pad the IB
>>>     * @ib: indirect buffer to fill with padding
>>>     *
>>> + * Pad the IB with NOPs to a boundary multiple of 8.
>>>     */
>>>    static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
>>>    {
>>> @@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib
>>>    	u32 pad_count;
>>>    	int i;
>>>    
>>> -	pad_count = (8 - (ib->length_dw & 0x7)) % 8;
>>> +	pad_count = (-ib->length_dw) & 0x7;
>>>    	for (i = 0; i < pad_count; i++)
>>>    		if (sdma && sdma->burst_nop && (i == 0))
>>>    			ib->ptr[ib->length_dw++] =



More information about the amd-gfx mailing list