[PATCH v2] drm/amdgpu: block ring buffer access during GPU recovery

Christian König <christian.koenig at amd.com>
Tue Sep 1 08:51:25 UTC 2020


On 01.09.20 at 09:50, Dennis Li wrote:
> When the GPU is in reset, its status isn't stable and the ring buffer
> also needs to be reset when resuming. The driver should therefore
> protect the GPU recovery thread from ring buffer accesses by other
> threads; otherwise the GPU will randomly hang during recovery.
>
> v2: correct indent
>
> Signed-off-by: Dennis Li <Dennis.Li at amd.com>

Reviewed-by: Christian König <christian.koenig at amd.com>
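
[Editorial note: the core of the change is a read-side try-lock on the
new reset_sem. Paths that would submit to the KIQ ring only do so if
they can take reset_sem for read, and otherwise fall back to direct
MMIO access or bail out. The writer side is not in this patch; the
sketch below is a minimal illustration of both halves, with
amdgpu_do_recovery() as a hypothetical stand-in for the actual
recovery entry point.]

        /* recovery thread (writer): owns the GPU exclusively for the
         * whole reset; down_write() waits for in-flight readers to
         * drain before the rings are torn down and re-initialized */
        static int amdgpu_do_recovery(struct amdgpu_device *adev)
        {
                down_write(&adev->reset_sem);
                /* ... reset the ASIC, re-init the ring buffers ... */
                up_write(&adev->reset_sem);
                return 0;
        }

        /* ring user (reader): trylock so it never blocks a recovery in
         * progress, and cannot deadlock if recovery itself re-enters
         * one of these register paths */
        static void example_ring_access(struct amdgpu_device *adev)
        {
                if (down_read_trylock(&adev->reset_sem)) {
                        /* safe to touch the KIQ ring here */
                        up_read(&adev->reset_sem);
                } else {
                        /* reset in flight: use direct MMIO or give up */
                }
        }

[This is also why the GMC hunks replace the plain !amdgpu_in_reset()
check with the trylock: the flag test is racy, since a reset can begin
right after it passes, while holding reset_sem for read excludes the
reset for the duration of the ring access.]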

>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 172dc47b7f39..9b586bc80c38 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -319,8 +319,12 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>   {
>   	uint32_t ret;
>   
> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -		return amdgpu_kiq_rreg(adev, reg);
> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
> +	    down_read_trylock(&adev->reset_sem)) {
> +		ret = amdgpu_kiq_rreg(adev, reg);
> +		up_read(&adev->reset_sem);
> +		return ret;
> +	}
>   
>   	if ((reg * 4) < adev->rmmio_size)
>   		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
> @@ -332,6 +336,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>   		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
>   		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
>   	}
> +
>   	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
>   	return ret;
>   }
> @@ -407,8 +412,12 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>   void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>   		    uint32_t acc_flags)
>   {
> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -		return amdgpu_kiq_wreg(adev, reg, v);
> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
> +	    down_read_trylock(&adev->reset_sem)) {
> +		amdgpu_kiq_wreg(adev, reg, v);
> +		up_read(&adev->reset_sem);
> +		return;
> +	}
>   
>   	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index ad9ad622ccce..31359e519d69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -287,8 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	 */
>   	if (adev->gfx.kiq.ring.sched.ready &&
>   	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> -	    !amdgpu_in_reset(adev)) {
> -
> +	    down_read_trylock(&adev->reset_sem)) {
>   		struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
>   		const unsigned eng = 17;
>   		u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type);
> @@ -297,6 +296,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>   				1 << vmid);
> +
> +		up_read(&adev->reset_sem);
>   		return;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index e1a0ae327cf5..c602ddc68384 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -500,13 +500,14 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	 * as GFXOFF under bare metal
>   	 */
>   	if (adev->gfx.kiq.ring.sched.ready &&
> -			(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> -			!amdgpu_in_reset(adev)) {
> +	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> +	    down_read_trylock(&adev->reset_sem)) {
>   		uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
>   		uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
>   
>   		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>   						   1 << vmid);
> +		up_read(&adev->reset_sem);
>   		return;
>   	}
>   
> @@ -599,7 +600,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   	if (amdgpu_in_reset(adev))
>   		return -EIO;
>   
> -	if (ring->sched.ready) {
> +	if (ring->sched.ready && down_read_trylock(&adev->reset_sem)) {
>   		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
>   		 * heavy-weight TLB flush (type 2), which flushes
>   		 * both. Due to a race condition with concurrent
> @@ -626,6 +627,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		if (r) {
>   			amdgpu_ring_undo(ring);
>   			spin_unlock(&adev->gfx.kiq.ring_lock);
> +			up_read(&adev->reset_sem);
>   			return -ETIME;
>   		}
>   
> @@ -634,9 +636,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>   		if (r < 1) {
>   			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> +			up_read(&adev->reset_sem);
>   			return -ETIME;
>   		}
> -
> +		up_read(&adev->reset_sem);
>   		return 0;
>   	}
>   
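
[Editorial note: one discipline this pattern imposes is that every
return inside the protected region must be paired with an up_read(),
which is why the gmc_v9_0_flush_gpu_tlb_pasid() hunks above add three
separate up_read() calls. A goto-based unwind would keep that pairing
in a single place; below is a hypothetical reshuffle of the same
logic, not part of the patch, with the KIQ submission elided.]

        static int example_flush_tlb_pasid(struct amdgpu_device *adev,
                                           struct amdgpu_ring *ring,
                                           uint32_t seq)
        {
                long r;

                /* the real code falls through to the MMIO flush path
                 * instead of erroring out when the trylock fails */
                if (!down_read_trylock(&adev->reset_sem))
                        return -EIO;

                /* ... take kiq.ring_lock, emit the heavy-weight TLB
                 * flush and the polling fence, drop the lock; on
                 * submission failure undo the ring and goto out_unlock
                 * with r = -ETIME ... */

                r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
                if (r < 1) {
                        dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
                        r = -ETIME;
                        goto out_unlock;
                }
                r = 0;

        out_unlock:
                up_read(&adev->reset_sem);      /* single unlock site */
                return r;
        }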