[PATCH v2] drm/amdgpu: Increase soft IH ring size

Felix Kuehling felix.kuehling at amd.com
Fri Jul 7 17:15:38 UTC 2023


On 2023-07-07 11:49, Philip Yang wrote:
> Retry faults are delegated to soft IH ring and then processed by
> deferred worker. Current soft IH ring size PAGE_SIZE can store 128
> entries, which may overflow and drop retry faults, causes HW stucks
> because the retry fault is not recovered.
>
> Increase soft IH ring size to 8KB, enough to store 256 CAM entries
> because we clear the CAM entry after handling the retry fault from soft
> ring.
>
> Define macro IH_RING_SIZE and IH_SW_RING_SIZE to remove duplicate
> constant.
>
> Show warning message if soft IH ring overflows because this should not
> happen.

It would indicate a problem with the CAM or it could happen on older 
GPUs that don't have a CAM. See below.


>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c  | 8 ++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  | 7 +++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 2 +-
>   drivers/gpu/drm/amd/amdgpu/ih_v6_0.c    | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/navi10_ih.c  | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/vega10_ih.c  | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/vega20_ih.c  | 4 ++--
>   7 files changed, 20 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> index fceb3b384955..51a0dbd2358a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> @@ -138,6 +138,7 @@ void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih)
>   /**
>    * amdgpu_ih_ring_write - write IV to the ring buffer
>    *
> + * @adev: amdgpu_device pointer
>    * @ih: ih ring to write to
>    * @iv: the iv to write
>    * @num_dw: size of the iv in dw
> @@ -145,8 +146,8 @@ void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih)
>    * Writes an IV to the ring buffer using the CPU and increment the wptr.
>    * Used for testing and delegating IVs to a software ring.
>    */
> -void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
> -			  unsigned int num_dw)
> +void amdgpu_ih_ring_write(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
> +			  const uint32_t *iv, unsigned int num_dw)
>   {
>   	uint32_t wptr = le32_to_cpu(*ih->wptr_cpu) >> 2;
>   	unsigned int i;
> @@ -161,6 +162,9 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
>   	if (wptr != READ_ONCE(ih->rptr)) {
>   		wmb();
>   		WRITE_ONCE(*ih->wptr_cpu, cpu_to_le32(wptr));
> +	} else {
> +		dev_warn(adev->dev, "IH soft ring buffer overflow 0x%X, 0x%X\n",
> +			 wptr, ih->rptr);

If this happens, it's probably going to flood the log. It would be a 
good idea to apply a rate-limit, or use dev_warn_once. With that fixed, 
the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> index dd1c2eded6b9..6c6184f0dbc1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> @@ -27,6 +27,9 @@
>   /* Maximum number of IVs processed at once */
>   #define AMDGPU_IH_MAX_NUM_IVS	32
>   
> +#define IH_RING_SIZE	(256 * 1024)
> +#define IH_SW_RING_SIZE	(8 * 1024)	/* enough for 256 CAM entries */
> +
>   struct amdgpu_device;
>   struct amdgpu_iv_entry;
>   
> @@ -97,8 +100,8 @@ struct amdgpu_ih_funcs {
>   int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
>   			unsigned ring_size, bool use_bus_addr);
>   void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
> -void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
> -			  unsigned int num_dw);
> +void amdgpu_ih_ring_write(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
> +			  const uint32_t *iv, unsigned int num_dw);
>   int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
>   					    struct amdgpu_ih_ring *ih);
>   int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 5273decc5753..fa6d0adcec20 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -493,7 +493,7 @@ void amdgpu_irq_delegate(struct amdgpu_device *adev,
>   			 struct amdgpu_iv_entry *entry,
>   			 unsigned int num_dw)
>   {
> -	amdgpu_ih_ring_write(&adev->irq.ih_soft, entry->iv_entry, num_dw);
> +	amdgpu_ih_ring_write(adev, &adev->irq.ih_soft, entry->iv_entry, num_dw);
>   	schedule_work(&adev->irq.ih_soft_work);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
> index b02e1cef78a7..980b24120080 100644
> --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
> @@ -535,7 +535,7 @@ static int ih_v6_0_sw_init(void *handle)
>   	 * use bus address for ih ring by psp bl */
>   	use_bus_addr =
>   		(adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) ? false : true;
> -	r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 * 1024, use_bus_addr);
> +	r = amdgpu_ih_ring_init(adev, &adev->irq.ih, IH_RING_SIZE, use_bus_addr);
>   	if (r)
>   		return r;
>   
> @@ -548,7 +548,7 @@ static int ih_v6_0_sw_init(void *handle)
>   	/* initialize ih control register offset */
>   	ih_v6_0_init_register_offset(adev);
>   
> -	r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, PAGE_SIZE, true);
> +	r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_SW_RING_SIZE, true);
>   	if (r)
>   		return r;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> index eec13cb5bf75..b6a8478dabf4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> @@ -565,7 +565,7 @@ static int navi10_ih_sw_init(void *handle)
>   		use_bus_addr = false;
>   	else
>   		use_bus_addr = true;
> -	r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 * 1024, use_bus_addr);
> +	r = amdgpu_ih_ring_init(adev, &adev->irq.ih, IH_RING_SIZE, use_bus_addr);
>   	if (r)
>   		return r;
>   
> @@ -578,7 +578,7 @@ static int navi10_ih_sw_init(void *handle)
>   	/* initialize ih control registers offset */
>   	navi10_ih_init_register_offset(adev);
>   
> -	r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, PAGE_SIZE, true);
> +	r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_SW_RING_SIZE, true);
>   	if (r)
>   		return r;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> index 1e83db0c5438..d364c6dd152c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> @@ -485,7 +485,7 @@ static int vega10_ih_sw_init(void *handle)
>   	if (r)
>   		return r;
>   
> -	r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 * 1024, true);
> +	r = amdgpu_ih_ring_init(adev, &adev->irq.ih, IH_RING_SIZE, true);
>   	if (r)
>   		return r;
>   
> @@ -510,7 +510,7 @@ static int vega10_ih_sw_init(void *handle)
>   	/* initialize ih control registers offset */
>   	vega10_ih_init_register_offset(adev);
>   
> -	r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, PAGE_SIZE, true);
> +	r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_SW_RING_SIZE, true);
>   	if (r)
>   		return r;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> index 4d719df376a7..544ee55a22da 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> @@ -539,7 +539,7 @@ static int vega20_ih_sw_init(void *handle)
>   	    (adev->ip_versions[OSSSYS_HWIP][0] == IP_VERSION(4, 4, 2)))
>   		use_bus_addr = false;
>   
> -	r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 * 1024, use_bus_addr);
> +	r = amdgpu_ih_ring_init(adev, &adev->irq.ih, IH_RING_SIZE, use_bus_addr);
>   	if (r)
>   		return r;
>   
> @@ -565,7 +565,7 @@ static int vega20_ih_sw_init(void *handle)
>   	/* initialize ih control registers offset */
>   	vega20_ih_init_register_offset(adev);
>   
> -	r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, PAGE_SIZE, use_bus_addr);
> +	r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_SW_RING_SIZE, use_bus_addr);
>   	if (r)
>   		return r;
>   


More information about the amd-gfx mailing list