[PATCH] drm/amdgpu: fix the ib test hang when gfx is in "idle" state

Fri Apr 20 09:59:16 UTC 2018

Am 20.04.2018 um 11:40 schrieb Huang Rui:
> "aaabaf4   drm/amdgpu: defer test IBs on the rings at boot (V3)"
> The above patch defers the execution of the gfx/compute ib tests. However, at that time,
> the gfx may already have gone into idle state. If "idle" gfx receives a command
> submission, it will hang the system. So we must add an is_gfx_on check at the
> start of the ib tests.

Do I see that right that you just skip the IB test when the GFX block is 
already turned off? In this case that would be a clear NAK.

BTW: How do you detect that we need to turn GFX on again?

Regards,
Christian.

>
> Signed-off-by: Huang Rui <ray.huang at amd.com>
> Cc: Shirish S <shirish.s at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h               |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c             | 19 ++++++++++++++++++-
>   drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c | 16 ++--------------
>   3 files changed, 22 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 59df4b7..a0263b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -905,6 +905,7 @@ struct amdgpu_gfx_funcs {
>   	void (*read_wave_vgprs)(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t thread, uint32_t start, uint32_t size, uint32_t *dst);
>   	void (*read_wave_sgprs)(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t start, uint32_t size, uint32_t *dst);
>   	void (*select_me_pipe_q)(struct amdgpu_device *adev, u32 me, u32 pipe, u32 queue);
> +	bool (*is_gfx_on)(struct amdgpu_device *adev);
>   };
>   
>   struct amdgpu_ngg_buf {
> @@ -1855,6 +1856,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_gds_switch(adev, r, v, d, w, a) (adev)->gds.funcs->patch_gds_switch((r), (v), (d), (w), (a))
>   #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>   #define amdgpu_gfx_select_me_pipe_q(adev, me, pipe, q) (adev)->gfx.funcs->select_me_pipe_q((adev), (me), (pipe), (q))
> +#define amdgpu_gfx_is_gfx_on(adev) (adev)->gfx.funcs->is_gfx_on((adev))
>   
>   /* Common functions */
>   int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 6c2d278..a71d711 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -342,6 +342,18 @@ static int gfx_v9_0_ring_test_ring(struct amdgpu_ring *ring)
>   	return r;
>   }
>   
> +static bool gfx_v9_0_is_gfx_on(struct amdgpu_device *adev)
> +{
> +	uint32_t reg;
> +
> +	reg = RREG32_SOC15(PWR, 0, mmPWR_MISC_CNTL_STATUS);
> +	if ((reg & PWR_MISC_CNTL_STATUS__PWR_GFXOFF_STATUS_MASK) ==
> +	    (0x2 << PWR_MISC_CNTL_STATUS__PWR_GFXOFF_STATUS__SHIFT))
> +		return true;
> +
> +	return false;
> +}
> +
>   static int gfx_v9_0_ring_test_ib(struct amdgpu_ring *ring, long timeout)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> @@ -353,6 +365,10 @@ static int gfx_v9_0_ring_test_ib(struct amdgpu_ring *ring, long timeout)
>   	uint32_t tmp;
>   	long r;
>   
> +	/* confirm gfx is not in "idle" state */
> +	if (!amdgpu_gfx_is_gfx_on(adev))
> +		return 0;
> +
>   	r = amdgpu_device_wb_get(adev, &index);
>   	if (r) {
>   		dev_err(adev->dev, "(%ld) failed to allocate wb slot\n", r);
> @@ -1085,7 +1101,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
>   	.read_wave_data = &gfx_v9_0_read_wave_data,
>   	.read_wave_sgprs = &gfx_v9_0_read_wave_sgprs,
>   	.read_wave_vgprs = &gfx_v9_0_read_wave_vgprs,
> -	.select_me_pipe_q = &gfx_v9_0_select_me_pipe_q
> +	.select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,
> +	.is_gfx_on = &gfx_v9_0_is_gfx_on
>   };
>   
>   static void gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c
> index 7712eb6..3553fba 100644
> --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c
> +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c
> @@ -254,28 +254,16 @@ static int smu10_power_off_asic(struct pp_hwmgr *hwmgr)
>   	return smu10_reset_cc6_data(hwmgr);
>   }
>   
> -static bool smu10_is_gfx_on(struct pp_hwmgr *hwmgr)
> -{
> -	uint32_t reg;
> -	struct amdgpu_device *adev = hwmgr->adev;
> -
> -	reg = RREG32_SOC15(PWR, 0, mmPWR_MISC_CNTL_STATUS);
> -	if ((reg & PWR_MISC_CNTL_STATUS__PWR_GFXOFF_STATUS_MASK) ==
> -	    (0x2 << PWR_MISC_CNTL_STATUS__PWR_GFXOFF_STATUS__SHIFT))
> -		return true;
> -
> -	return false;
> -}
> -
>   static int smu10_disable_gfx_off(struct pp_hwmgr *hwmgr)
>   {
>   	struct smu10_hwmgr *smu10_data = (struct smu10_hwmgr *)(hwmgr->backend);
> +	struct amdgpu_device *adev = hwmgr->adev;
>   
>   	if (smu10_data->gfx_off_controled_by_driver) {
>   		smum_send_msg_to_smc(hwmgr, PPSMC_MSG_DisableGfxOff);
>   
>   		/* confirm gfx is back to "on" state */
> -		while (!smu10_is_gfx_on(hwmgr))
> +		while (!amdgpu_gfx_is_gfx_on(adev))
>   			msleep(1);
>   	}
>