[PATCH] drm/amdgpu: protect kiq overrun

Christian König christian.koenig at amd.com
Wed Apr 22 18:42:48 UTC 2020


Am 22.04.20 um 16:50 schrieb Yintian Tao:
> Wait for the oldest sequence on the kiq ring
> to be signaled in order to make sure there
> will be no kiq overrun.
>
> v2: remove unused the variable and correct
>      kiq max_sub_num value

First of all this should probably be added to the fence handling code 
and not the kiq code.

Then you are kind of duplicating some of the functionality we have in 
the ring handling here. Probably better to avoid this, see 
amdgpu_fence_driver_init_ring() as well. That's also why I suggested to 
use the num_fences_mask value.

Regards,
Christian.

>
> Signed-off-by: Yintian Tao <yttao at amd.com>
> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  6 ++++
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  6 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 30 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |  3 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  6 ++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  6 ++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  7 +++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  7 +++++
>   8 files changed, 71 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 691c89705bcd..fac8b9713dfc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -325,6 +325,12 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   		 mec, pipe, queue_id);
>   
>   	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		pr_err("critical bug! too many kiq submission\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index df841c2ac5e7..fd42c126510f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -323,6 +323,12 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   		 mec, pipe, queue_id);
>   
>   	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		pr_err("critical bug! too many kiq submissions\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a721b0e0ff69..84e66c45df37 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -321,6 +321,9 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   			     AMDGPU_RING_PRIO_DEFAULT);
>   	if (r)
>   		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
> +	else
> +		kiq->max_sub_num = (ring->ring_size / 4) /
> +				(ring->funcs->align_mask + 1);
>   
>   	return r;
>   }
> @@ -663,6 +666,21 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int amdgpu_gfx_kiq_is_avail(struct amdgpu_kiq *kiq)
> +{
> +	uint32_t seq = 0;
> +	signed long r = 0;
> +
> +	seq = abs(kiq->ring.fence_drv.sync_seq - kiq->max_sub_num);
> +	if (seq > kiq->max_sub_num) {
> +		r = amdgpu_fence_wait_polling(&kiq->ring, seq,
> +					      MAX_KIQ_REG_WAIT);
> +		return r < 1 ? -ETIME : 0;
> +	}
> +
> +	return 0;
> +}
> +
>   uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   {
>   	signed long r, cnt = 0;
> @@ -674,6 +692,12 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +		goto failed_kiq_read;
> +	}
> +
>   	if (amdgpu_device_wb_get(adev, &reg_val_offs)) {
>   		spin_unlock_irqrestore(&kiq->ring_lock, flags);
>   		pr_err("critical bug! too many kiq readers\n");
> @@ -728,6 +752,12 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   	BUG_ON(!ring->funcs->emit_wreg);
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +		goto failed_kiq_write;
> +	}
> +
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_wreg(ring, reg, v);
>   	amdgpu_fence_emit_polling(ring, &seq);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index ee698f0246d8..1ee59a927bd9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -103,6 +103,7 @@ struct amdgpu_kiq {
>   	struct amdgpu_ring	ring;
>   	struct amdgpu_irq_src	irq;
>   	const struct kiq_pm4_funcs *pmf;
> +	uint32_t		max_sub_num;
>   };
>   
>   /*
> @@ -387,4 +388,6 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
>   				  struct amdgpu_iv_entry *entry);
>   uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
>   void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
> +
> +int amdgpu_gfx_kiq_is_avail(struct amdgpu_kiq *kiq);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 8c10084f44ef..4b027006d072 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -57,6 +57,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +		goto failed_kiq;
> +	}
> +
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 5b1549f167b0..a136e2229f7a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4051,6 +4051,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +		goto failed_kiq_read;
> +	}
> +
>   	if (amdgpu_device_wb_get(adev, &reg_val_offs)) {
>   		spin_unlock_irqrestore(&kiq->ring_lock, flags);
>   		pr_err("critical bug! too many kiq readers\n");
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 30b75d79efdb..77d8bc9c0111 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -423,6 +423,13 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   
>   	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
>   		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +		if (r) {
> +			spin_unlock(&kiq->ring_lock);
> +			DRM_ERROR("too many kiq submissions\n");
> +			return -ETIME;
> +		}
> +
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
>   		kiq->pmf->kiq_invalidate_tlbs(ring,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index fecdbc471983..c429a2a5fe3d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -614,6 +614,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   			ndw += kiq->pmf->invalidate_tlbs_size;
>   
>   		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +		if (r) {
> +			spin_unlock(&kiq->ring_lock);
> +			DRM_ERROR("critical bug! too many kiq submissions\n");
> +			return -ETIME;
> +		}
> +
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, ndw);
>   		if (vega20_xgmi_wa)



More information about the amd-gfx mailing list