[PATCH 06/11] drm/amdgpu: fix and cleanup gmc_v9_0_flush_gpu_tlb_pasid

Fri Sep 8 21:13:18 UTC 2023

On 2023-09-05 02:04, Christian König wrote:
> Testing for reset is pointless since the reset can start right after the
> test.
>
> The same PASID can be used by more than one VMID, reset each of them.
>
> Move the KIQ and all the workaround handling into common GMC code.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
In the commit message, "reset each of them" should read "invalidate each of them" (reset -> invalidate).

With that fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  60 +++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  10 ++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 109 ++++++++----------------
>   3 files changed, 102 insertions(+), 77 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 857051093900..b5f1a1218725 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -32,6 +32,7 @@
>   #include "amdgpu.h"
>   #include "amdgpu_gmc.h"
>   #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>   #include "amdgpu_xgmi.h"
>   
>   #include <drm/drm_drv.h>
> @@ -623,6 +624,65 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	DRM_ERROR("Error flushing GPU TLB using the SDMA (%d)!\n", r);
>   }
>   
> +int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
> +				   uint32_t flush_type, bool all_hub,
> +				   uint32_t inst)
> +{
> +	u32 usec_timeout = amdgpu_sriov_vf(adev) ? SRIOV_USEC_TIMEOUT :
> +		adev->usec_timeout;
> +	struct amdgpu_ring *ring = &adev->gfx.kiq[inst].ring;
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq[inst];
> +	unsigned int ndw;
> +	signed long r;
> +	uint32_t seq;
> +
> +	if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready ||
> +	    !down_read_trylock(&adev->reset_domain->sem)) {
> +		return adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid,
> +								flush_type,
> +								all_hub, inst);
> +	}
> +
> +	/* 2 dwords flush + 8 dwords fence */
> +	ndw = kiq->pmf->invalidate_tlbs_size + 8;
> +
> +	if (adev->gmc.flush_tlb_needs_extra_type_2)
> +		ndw += kiq->pmf->invalidate_tlbs_size;
> +
> +	if (adev->gmc.flush_tlb_needs_extra_type_0)
> +		ndw += kiq->pmf->invalidate_tlbs_size;
> +
> +	spin_lock(&adev->gfx.kiq[inst].ring_lock);
> +	amdgpu_ring_alloc(ring, ndw);
> +	if (adev->gmc.flush_tlb_needs_extra_type_2)
> +		kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub);
> +
> +	if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0)
> +		kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub);
> +
> +	kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub);
> +	r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
> +	if (r) {
> +		amdgpu_ring_undo(ring);
> +		spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> +		goto error_unlock_reset;
> +	}
> +
> +	amdgpu_ring_commit(ring);
> +	spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> +	r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);
> +	if (r < 1) {
> +		dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> +		r = -ETIME;
> +		goto error_unlock_reset;
> +	}
> +	r = 0;
> +
> +error_unlock_reset:
> +	up_read(&adev->reset_domain->sem);
> +	return r;
> +}
> +
>   /**
>    * amdgpu_gmc_tmz_set -- check and set if a device supports TMZ
>    * @adev: amdgpu_device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 9e7df2f69123..7732d4ef845e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -335,11 +335,12 @@ struct amdgpu_gmc {
>   	u64 MC_VM_MX_L1_TLB_CNTL;
>   
>   	u64 noretry_flags;
> +
> +	bool flush_tlb_needs_extra_type_0;
> +	bool flush_tlb_needs_extra_type_2;
> +	bool flush_pasid_uses_kiq;
>   };
>   
> -#define amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, type, allhub, inst) \
> -	((adev)->gmc.gmc_funcs->flush_gpu_tlb_pasid \
> -	((adev), (pasid), (type), (allhub), (inst)))
>   #define amdgpu_gmc_emit_flush_gpu_tlb(r, vmid, addr) (r)->adev->gmc.gmc_funcs->emit_flush_gpu_tlb((r), (vmid), (addr))
>   #define amdgpu_gmc_emit_pasid_mapping(r, vmid, pasid) (r)->adev->gmc.gmc_funcs->emit_pasid_mapping((r), (vmid), (pasid))
>   #define amdgpu_gmc_map_mtype(adev, flags) (adev)->gmc.gmc_funcs->map_mtype((adev),(flags))
> @@ -404,6 +405,9 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
>   void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   			      uint32_t vmhub, uint32_t flush_type);
> +int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
> +				   uint32_t flush_type, bool all_hub,
> +				   uint32_t inst);
>   
>   extern void amdgpu_gmc_tmz_set(struct amdgpu_device *adev);
>   extern void amdgpu_gmc_noretry_set(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 4f6990ba71cb..39016b6900d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -954,87 +954,30 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   					uint16_t pasid, uint32_t flush_type,
>   					bool all_hub, uint32_t inst)
>   {
> -	int vmid, i;
> -	signed long r;
> -	uint32_t seq;
> -	uint16_t queried_pasid;
> -	bool ret;
> -	u32 usec_timeout = amdgpu_sriov_vf(adev) ? SRIOV_USEC_TIMEOUT : adev->usec_timeout;
> -	struct amdgpu_ring *ring = &adev->gfx.kiq[inst].ring;
> -	struct amdgpu_kiq *kiq = &adev->gfx.kiq[inst];
> -
> -	if (amdgpu_in_reset(adev))
> -		return -EIO;
> -
> -	if (ring->sched.ready && down_read_trylock(&adev->reset_domain->sem)) {
> -		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
> -		 * heavy-weight TLB flush (type 2), which flushes
> -		 * both. Due to a race condition with concurrent
> -		 * memory accesses using the same TLB cache line, we
> -		 * still need a second TLB flush after this.
> -		 */
> -		bool vega20_xgmi_wa = (adev->gmc.xgmi.num_physical_nodes &&
> -				       adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 0));
> -		/* 2 dwords flush + 8 dwords fence */
> -		unsigned int ndw = kiq->pmf->invalidate_tlbs_size + 8;
> -
> -		if (vega20_xgmi_wa)
> -			ndw += kiq->pmf->invalidate_tlbs_size;
> -
> -		spin_lock(&adev->gfx.kiq[inst].ring_lock);
> -		/* 2 dwords flush + 8 dwords fence */
> -		amdgpu_ring_alloc(ring, ndw);
> -		if (vega20_xgmi_wa)
> -			kiq->pmf->kiq_invalidate_tlbs(ring,
> -						      pasid, 2, all_hub);
> -
> -		if (flush_type == 2 &&
> -		    adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) &&
> -		    adev->rev_id == 0)
> -			kiq->pmf->kiq_invalidate_tlbs(ring,
> -						pasid, 0, all_hub);
> -
> -		kiq->pmf->kiq_invalidate_tlbs(ring,
> -					pasid, flush_type, all_hub);
> -		r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
> -		if (r) {
> -			amdgpu_ring_undo(ring);
> -			spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> -			up_read(&adev->reset_domain->sem);
> -			return -ETIME;
> -		}
> -
> -		amdgpu_ring_commit(ring);
> -		spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> -		r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);
> -		if (r < 1) {
> -			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> -			up_read(&adev->reset_domain->sem);
> -			return -ETIME;
> -		}
> -		up_read(&adev->reset_domain->sem);
> -		return 0;
> -	}
> +	uint16_t queried;
> +	int i, vmid;
>   
>   	for (vmid = 1; vmid < 16; vmid++) {
> +		bool valid;
>   
> -		ret = gmc_v9_0_get_atc_vmid_pasid_mapping_info(adev, vmid,
> -				&queried_pasid);
> -		if (ret && queried_pasid == pasid) {
> -			if (all_hub) {
> -				for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS)
> -					gmc_v9_0_flush_gpu_tlb(adev, vmid,
> -							i, flush_type);
> -			} else {
> -				gmc_v9_0_flush_gpu_tlb(adev, vmid,
> -						AMDGPU_GFXHUB(0), flush_type);
> -			}
> -			break;
> +		valid = gmc_v9_0_get_atc_vmid_pasid_mapping_info(adev, vmid,
> +								 &queried);
> +		if (!valid || queried != pasid)
> +			continue;
> +
> +		if (all_hub) {
> +			for_each_set_bit(i, adev->vmhubs_mask,
> +					 AMDGPU_MAX_VMHUBS)
> +				gmc_v9_0_flush_gpu_tlb(adev, vmid, i,
> +						       flush_type);
> +		} else {
> +			gmc_v9_0_flush_gpu_tlb(adev, vmid,
> +					       AMDGPU_GFXHUB(0),
> +					       flush_type);
>   		}
>   	}
>   
>   	return 0;
> -
>   }
>   
>   static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> @@ -2335,6 +2278,24 @@ static int gmc_v9_0_hw_init(void *handle)
>   	bool value;
>   	int i, r;
>   
> +	adev->gmc.flush_pasid_uses_kiq = true;
> +
> +	/* Vega20+XGMI caches PTEs in TC and TLB. Add a heavy-weight TLB flush
> +	 * (type 2), which flushes both. Due to a race condition with
> +	 * concurrent memory accesses using the same TLB cache line, we still
> +	 * need a second TLB flush after this.
> +	 */
> +	adev->gmc.flush_tlb_needs_extra_type_2 =
> +		adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 0) &&
> +		adev->gmc.xgmi.num_physical_nodes;
> +	/*
> +	 * TODO: This workaround is badly documented and had a buggy
> +	 * implementation. We should probably verify what we do here.
> +	 */
> +	adev->gmc.flush_tlb_needs_extra_type_0 =
> +		adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) &&
> +		adev->rev_id == 0;
> +
>   	/* The sequence of these two function calls matters.*/
>   	gmc_v9_0_init_golden_registers(adev);
>