[PATCH] drm/amdgpu: protect kiq overrun

Liu, Shaoyun Shaoyun.Liu at amd.com
Wed Apr 22 15:43:51 UTC 2020


[AMD Official Use Only - Internal Distribution Only]

I always has an impression for each submission,  once the ring be allocated , before the fence be signed , this ring space will always be reserved . If this can not be guaranteed , it sound a  big issue  to me .   Can't  we  check the rptr write back  to determine the available room in the ring_alloc ? 

Regards
Shaoyun.liu
-----Original Message-----
From: Koenig, Christian <Christian.Koenig at amd.com> 
Sent: Wednesday, April 22, 2020 10:57 AM
To: Liu, Shaoyun <Shaoyun.Liu at amd.com>; Tao, Yintian <Yintian.Tao at amd.com>; Liu, Monk <Monk.Liu at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>
Cc: amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: protect kiq overrun

The amdgpu_ring_alloc() function checks if the requested number of DW don't exceed the maximum submission size.

It does NOT check if there is enough room on the ring. That would require MMIO access and that is what we want to avoid.

Regards,
Christian.

Am 22.04.20 um 16:54 schrieb Liu, Shaoyun:
> [AMD Official Use Only - Internal Distribution Only]
>
> I think each  kiq operation will call ring_alloc  for the package space  , why  not just check whether this allocation is succeed or not ?
>
> Shaoyun.liu
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of 
> Yintian Tao
> Sent: Wednesday, April 22, 2020 10:50 AM
> To: Koenig, Christian <Christian.Koenig at amd.com>; Liu, Monk 
> <Monk.Liu at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>
> Cc: amd-gfx at lists.freedesktop.org; Tao, Yintian <Yintian.Tao at amd.com>
> Subject: [PATCH] drm/amdgpu: protect kiq overrun
>
> Wait for the oldest sequence on the kiq ring to be signaled in order to make sure there will be no kiq overrun.
>
> v2: remove unused the variable and correct
>      kiq max_sub_num value
>
> Signed-off-by: Yintian Tao <yttao at amd.com>
> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |  6 ++++
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  6 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 30 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |  3 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  6 ++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  6 ++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  7 +++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |  7 +++++
>   8 files changed, 71 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 691c89705bcd..fac8b9713dfc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -325,6 +325,12 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   		 mec, pipe, queue_id);
>   
>   	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		pr_err("critical bug! too many kiq submission\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index df841c2ac5e7..fd42c126510f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -323,6 +323,12 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
>   		 mec, pipe, queue_id);
>   
>   	spin_lock(&adev->gfx.kiq.ring_lock);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		pr_err("critical bug! too many kiq submissions\n");
> +		goto out_unlock;
> +	}
> +
>   	r = amdgpu_ring_alloc(kiq_ring, 7);
>   	if (r) {
>   		pr_err("Failed to alloc KIQ (%d).\n", r); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a721b0e0ff69..84e66c45df37 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -321,6 +321,9 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>   			     AMDGPU_RING_PRIO_DEFAULT);
>   	if (r)
>   		dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
> +	else
> +		kiq->max_sub_num = (ring->ring_size / 4) /
> +				(ring->funcs->align_mask + 1);
>   
>   	return r;
>   }
> @@ -663,6 +666,21 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int amdgpu_gfx_kiq_is_avail(struct amdgpu_kiq *kiq) {
> +	uint32_t seq = 0;
> +	signed long r = 0;
> +
> +	seq = abs(kiq->ring.fence_drv.sync_seq - kiq->max_sub_num);
> +	if (seq > kiq->max_sub_num) {
> +		r = amdgpu_fence_wait_polling(&kiq->ring, seq,
> +					      MAX_KIQ_REG_WAIT);
> +		return r < 1 ? -ETIME : 0;
> +	}
> +
> +	return 0;
> +}
> +
>   uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)  {
>   	signed long r, cnt = 0;
> @@ -674,6 +692,12 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +		goto failed_kiq_read;
> +	}
> +
>   	if (amdgpu_device_wb_get(adev, &reg_val_offs)) {
>   		spin_unlock_irqrestore(&kiq->ring_lock, flags);
>   		pr_err("critical bug! too many kiq readers\n"); @@ -728,6 +752,12 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   	BUG_ON(!ring->funcs->emit_wreg);
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +		goto failed_kiq_write;
> +	}
> +
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_wreg(ring, reg, v);
>   	amdgpu_fence_emit_polling(ring, &seq); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index ee698f0246d8..1ee59a927bd9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -103,6 +103,7 @@ struct amdgpu_kiq {
>   	struct amdgpu_ring	ring;
>   	struct amdgpu_irq_src	irq;
>   	const struct kiq_pm4_funcs *pmf;
> +	uint32_t		max_sub_num;
>   };
>   
>   /*
> @@ -387,4 +388,6 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
>   				  struct amdgpu_iv_entry *entry);
>   uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);  
> void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, 
> uint32_t v);
> +
> +int amdgpu_gfx_kiq_is_avail(struct amdgpu_kiq *kiq);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 8c10084f44ef..4b027006d072 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -57,6 +57,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>   	uint32_t seq;
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +		goto failed_kiq;
> +	}
> +
>   	amdgpu_ring_alloc(ring, 32);
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
>   					    ref, mask);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 5b1549f167b0..a136e2229f7a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4051,6 +4051,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>   	BUG_ON(!ring->funcs->emit_rreg);
>   
>   	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +	if (r) {
> +		spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +		goto failed_kiq_read;
> +	}
> +
>   	if (amdgpu_device_wb_get(adev, &reg_val_offs)) {
>   		spin_unlock_irqrestore(&kiq->ring_lock, flags);
>   		pr_err("critical bug! too many kiq readers\n"); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 30b75d79efdb..77d8bc9c0111 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -423,6 +423,13 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct 
> amdgpu_device *adev,
>   
>   	if (amdgpu_emu_mode == 0 && ring->sched.ready) {
>   		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +		if (r) {
> +			spin_unlock(&kiq->ring_lock);
> +			DRM_ERROR("too many kiq submissions\n");
> +			return -ETIME;
> +		}
> +
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
>   		kiq->pmf->kiq_invalidate_tlbs(ring,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index fecdbc471983..c429a2a5fe3d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -614,6 +614,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>   			ndw += kiq->pmf->invalidate_tlbs_size;
>   
>   		spin_lock(&adev->gfx.kiq.ring_lock);
> +		r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
> +		if (r) {
> +			spin_unlock(&kiq->ring_lock);
> +			DRM_ERROR("critical bug! too many kiq submissions\n");
> +			return -ETIME;
> +		}
> +
>   		/* 2 dwords flush + 8 dwords fence */
>   		amdgpu_ring_alloc(ring, ndw);
>   		if (vega20_xgmi_wa)
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7CSh
> aoyun.Liu%40amd.com%7C0cd26307c0a149ebe0bd08d7e6cc84b1%7C3dd8961fe4884
> e608e11a82d994e183d%7C0%7C0%7C637231638419034830&sdata=rxkhyMutFwb
> e5Nw%2BeBGiESW9wTdflDUo%2F4xEvbCbR6U%3D&reserved=0


More information about the amd-gfx mailing list