[PATCH 3/3] drm/amdgpu/sdma_v4_4_2: update VM flush implementation for SDMA

Fri Feb 28 09:36:47 UTC 2025

On 2/27/2025 9:26 PM, Christian König wrote:
> Am 27.02.25 um 12:47 schrieb Jesse.zhang at amd.com:
>> This commit updates the VM flush implementation for the SDMA engine.
>>
>> - Added a new function `sdma_v4_4_2_get_invalidate_req` to construct the VM_INVALIDATE_ENG0_REQ
>>   register value for the specified VMID and flush type. This function ensures that all relevant
>>   page table cache levels (L1 PTEs, L2 PTEs, and L2 PDEs) are invalidated.
>>
>> - Modified the `sdma_v4_4_2_ring_emit_vm_flush` function to use the new `sdma_v4_4_2_get_invalidate_req`
>>   function. The updated function emits the necessary register writes and waits to perform a VM flush
>>   for the specified VMID. It updates the PTB address registers and issues a VM invalidation request
>>   using the specified VM invalidation engine.
>>
>> - Included the necessary header file `gc/gc_9_0_sh_mask.h` to provide access to the required register
>>   definitions.
>>
>> v2: vm flush by the vm inalidation packet (Lijo)
>>
>> Suggested-by: Lijo Lazar <lijo.lazar at amd.com>
>> Signed-off-by: Jesse Zhang <jesse.zhang at amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 81 ++++++++++++++++++++----
>>  1 file changed, 67 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>> index ba43c8f46f45..a9e46a4ed7a8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>> @@ -31,6 +31,7 @@
>>  #include "amdgpu_ucode.h"
>>  #include "amdgpu_trace.h"
>>  #include "amdgpu_reset.h"
>> +#include "gc/gc_9_0_sh_mask.h"
>>  
>>  #include "sdma/sdma_4_4_2_offset.h"
>>  #include "sdma/sdma_4_4_2_sh_mask.h"
>> @@ -1292,21 +1293,75 @@ static void sdma_v4_4_2_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
>>  			       seq, 0xffffffff, 4);
>>  }
>>  
>> -
>> -/**
>> - * sdma_v4_4_2_ring_emit_vm_flush - vm flush using sDMA
>> +/*
>> + * sdma_v4_4_2_get_invalidate_req - Construct the VM_INVALIDATE_ENG0_REQ register value
>> + * @vmid: The VMID to invalidate
>> + * @flush_type: The type of flush (0 = legacy, 1 = lightweight, 2 = heavyweight)
>>   *
>> - * @ring: amdgpu_ring pointer
>> - * @vmid: vmid number to use
>> - * @pd_addr: address
>> + * This function constructs the VM_INVALIDATE_ENG0_REQ register value for the specified VMID
>> + * and flush type. It ensures that all relevant page table cache levels (L1 PTEs, L2 PTEs, and
>> + * L2 PDEs) are invalidated.
>> + */
>> +static uint32_t sdma_v4_4_2_get_invalidate_req(unsigned int vmid,
>> +					uint32_t flush_type)
>> +{
>> +	u32 req = 0;
>> +
>> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
>> +			    PER_VMID_INVALIDATE_REQ, 1 << vmid);
>> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, FLUSH_TYPE, flush_type);
>> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PTES, 1);
>> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE0, 1);
>> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE1, 1);
>> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE2, 1);
>> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L1_PTES, 1);
>> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
>> +			    CLEAR_PROTECTION_FAULT_STATUS_ADDR,	0);
>> +
>> +	return req;
>> +}
>> +/* The vm validate packet is only available for GC9.4.3/GC9.4.4/GC9.5.0 */
>> +#define SDMA_OP_VM_INVALIDATE 0x8
>> +#define SDMA_SUBOP_VM_INVALIDATE 0x4
> 
> That needs to be in a header.
> 
>> +
>> +/*
>> + * sdma_v4_4_2_ring_emit_vm_flush - Emit VM flush commands for SDMA
>> + * @ring: The SDMA ring
>> + * @vmid: The VMID to flush
>> + * @pd_addr: The page directory address
>>   *
>> - * Update the page table base and flush the VM TLB
>> - * using sDMA.
>> + * This function emits the necessary register writes and waits to perform a VM flush for the
>> + * specified VMID. It updates the PTB address registers and issues a VM invalidation request
>> + * using the specified VM invalidation engine.
>>   */
>>  static void sdma_v4_4_2_ring_emit_vm_flush(struct amdgpu_ring *ring,
>> -					 unsigned vmid, uint64_t pd_addr)
>> +					    unsigned int vmid, uint64_t pd_addr)
>>  {
>> -	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
>> +	struct amdgpu_device *adev = ring->adev;
>> +	uint32_t req = sdma_v4_4_2_get_invalidate_req(vmid, 0);
>> +	unsigned int eng = ring->vm_inv_eng;
>> +	struct amdgpu_vmhub *hub = &adev->vmhub[ring->vm_hub];
>> +
> 
>> +	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 +
>> +                              (hub->ctx_addr_distance * vmid),
>> +                              lower_32_bits(pd_addr));
>> +
>> +        amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 +
>> +                              (hub->ctx_addr_distance * vmid),
>> +                              upper_32_bits(pd_addr));
> 
> That is unecessary.
> 

Probably you got confused by the description below. The description
below is wrong.

>> +	/*
>> +	 * Construct and emit the VM invalidation packet:
>> +	 * DW0: OP, Sub OP, Engine IDs (XCC0, XCC1, MMHUB)
>> +	 * DW1: Invalidation request
>> +	 * DW2: Lower 32 bits of page directory address
>> +	 * DW3: Upper 32 bits of page directory address and INVALIDATEACK
> 

This description is not correct. DW2 and DW3 are for the logical address
ranges as filled in inv_eng_addr_lo/hi. They are not meant for PD address.

> How are upper bits and invalidateack mixed together here?
> 
>> +	 */
>> +	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_VM_INVALIDATE) |
>> +			       SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_VM_INVALIDATE) |
>> +				(0x1f << 16) | (0x1f << 21) | (eng << 26));
> 
> What does those magic numbers mean?

Yes, this means the packet definition needs to be there somewhere in the
header too.

Thanks,
Lijo

> 
>> +	amdgpu_ring_write(ring, req);
>> +	amdgpu_ring_write(ring, 0x0);
>> +	amdgpu_ring_write(ring, (0x1 << vmid));
> 
> Either drop the () and the 0x or even better use the bit macro.
> 
> And it looks like you completely missed the upper and lower bits of the page directory.
> 
> Christian.
> 
>>  }
>>  
>>  static void sdma_v4_4_2_ring_emit_wreg(struct amdgpu_ring *ring,
>> @@ -2112,8 +2167,7 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_ring_funcs = {
>>  		3 + /* hdp invalidate */
>>  		6 + /* sdma_v4_4_2_ring_emit_pipeline_sync */
>>  		/* sdma_v4_4_2_ring_emit_vm_flush */
>> -		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>> +		4 + 2 * 3 +
>>  		10 + 10 + 10, /* sdma_v4_4_2_ring_emit_fence x3 for user fence, vm fence */
>>  	.emit_ib_size = 7 + 6, /* sdma_v4_4_2_ring_emit_ib */
>>  	.emit_ib = sdma_v4_4_2_ring_emit_ib,
>> @@ -2145,8 +2199,7 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = {
>>  		3 + /* hdp invalidate */
>>  		6 + /* sdma_v4_4_2_ring_emit_pipeline_sync */
>>  		/* sdma_v4_4_2_ring_emit_vm_flush */
>> -		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>> +		4 + 2 * 3 +
>>  		10 + 10 + 10, /* sdma_v4_4_2_ring_emit_fence x3 for user fence, vm fence */
>>  	.emit_ib_size = 7 + 6, /* sdma_v4_4_2_ring_emit_ib */
>> + 	.emit_ib = sdma_v4_4_2_ring_emit_ib,
>