[PATCH v2] drm/amdgpu: fix the ib test hang when gfx is in "idle" state

Huang Rui ray.huang at amd.com
Tue Apr 24 06:02:05 UTC 2018


On Tue, Apr 24, 2018 at 11:44:27AM +0800, S, Shirish wrote:
> 
> 
> On 4/24/2018 8:19 AM, Huang Rui wrote:
> > "aaabaf4   drm/amdgpu: defer test IBs on the rings at boot (V3)"
> > Above patch defers the execution of gfx/compute ib tests. However, at that time,
> > the gfx may already go into idle state. If "idle" gfx receives command
> > submission, then will get hang in the system. And it still has issue to
> > dynamically enable/disable gfxoff at runtime. So we have to use a workaround to
> > skip the gfx/compute ib tests when gfx is already in "idle" state.
> >
> > Signed-off-by: Huang Rui <ray.huang at amd.com>
> > Cc: Shirish S <shirish.s at amd.com>
> > ---
> >
> > Changes from V1 -> V2:
> > - Remove unused definitions of smu10_hwmgr.
> > - Add WA descriptions into commit log and comments.
> >
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h               |  2 ++
> >   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c             | 23 ++++++++++++++++++++++-
> Is it not required for older asics of cz/st?
> If not then please update the commit message accordingly.

No, it isn't. gfx8 asics don't support gfxoff. OK, I will add one note in
my commit log.

Thanks,
Ray

> Regards,
> Shirish S
> >   drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c | 22 ++--------------------
> >   3 files changed, 26 insertions(+), 21 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index 59df4b7..a0263b9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -905,6 +905,7 @@ struct amdgpu_gfx_funcs {
> >   	void (*read_wave_vgprs)(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t thread, uint32_t start, uint32_t size, uint32_t *dst);
> >   	void (*read_wave_sgprs)(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t start, uint32_t size, uint32_t *dst);
> >   	void (*select_me_pipe_q)(struct amdgpu_device *adev, u32 me, u32 pipe, u32 queue);
> > +	bool (*is_gfx_on)(struct amdgpu_device *adev);
> >   };
> >   
> >   struct amdgpu_ngg_buf {
> > @@ -1855,6 +1856,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
> >   #define amdgpu_gds_switch(adev, r, v, d, w, a) (adev)->gds.funcs->patch_gds_switch((r), (v), (d), (w), (a))
> >   #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
> >   #define amdgpu_gfx_select_me_pipe_q(adev, me, pipe, q) (adev)->gfx.funcs->select_me_pipe_q((adev), (me), (pipe), (q))
> > +#define amdgpu_gfx_is_gfx_on(adev) (adev)->gfx.funcs->is_gfx_on((adev))
> >   
> >   /* Common functions */
> >   int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index 2c5e2a4..b8bd194 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -342,6 +342,18 @@ static int gfx_v9_0_ring_test_ring(struct amdgpu_ring *ring)
> >   	return r;
> >   }
> >   
> > +static bool gfx_v9_0_is_gfx_on(struct amdgpu_device *adev)
> > +{
> > +	uint32_t reg;
> > +
> > +	reg = RREG32_SOC15(PWR, 0, mmPWR_MISC_CNTL_STATUS);
> > +	if ((reg & PWR_MISC_CNTL_STATUS__PWR_GFXOFF_STATUS_MASK) ==
> > +	    (0x2 << PWR_MISC_CNTL_STATUS__PWR_GFXOFF_STATUS__SHIFT))
> > +		return true;
> > +
> > +	return false;
> > +}
> > +
> >   static int gfx_v9_0_ring_test_ib(struct amdgpu_ring *ring, long timeout)
> >   {
> >   	struct amdgpu_device *adev = ring->adev;
> > @@ -353,6 +365,14 @@ static int gfx_v9_0_ring_test_ib(struct amdgpu_ring *ring, long timeout)
> >   	uint32_t tmp;
> >   	long r;
> >   
> > +	/*
> > +	 * FIXME: It still has issue to dynamically enable/disable gfxoff at
> > +	 * runtime. So it has to skip the gfx/compute ib test when gfx is
> > +	 * already in "idle" state.
> > +	 */
> > +	if (!amdgpu_gfx_is_gfx_on(adev))
> > +		return 0;
> > +
> >   	r = amdgpu_device_wb_get(adev, &index);
> >   	if (r) {
> >   		dev_err(adev->dev, "(%ld) failed to allocate wb slot\n", r);
> > @@ -1085,7 +1105,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
> >   	.read_wave_data = &gfx_v9_0_read_wave_data,
> >   	.read_wave_sgprs = &gfx_v9_0_read_wave_sgprs,
> >   	.read_wave_vgprs = &gfx_v9_0_read_wave_vgprs,
> > -	.select_me_pipe_q = &gfx_v9_0_select_me_pipe_q
> > +	.select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,
> > +	.is_gfx_on = &gfx_v9_0_is_gfx_on
> >   };
> >   
> >   static void gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
> > diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c
> > index 7712eb6..48c17fb 100644
> > --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c
> > +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c
> > @@ -42,12 +42,6 @@
> >   #define SMU10_DISPCLK_BYPASS_THRESHOLD     10000 /* 100Mhz */
> >   #define SMC_RAM_END                     0x40000
> >   
> > -#define mmPWR_MISC_CNTL_STATUS					0x0183
> > -#define mmPWR_MISC_CNTL_STATUS_BASE_IDX				0
> > -#define PWR_MISC_CNTL_STATUS__PWR_GFX_RLC_CGPG_EN__SHIFT	0x0
> > -#define PWR_MISC_CNTL_STATUS__PWR_GFXOFF_STATUS__SHIFT		0x1
> > -#define PWR_MISC_CNTL_STATUS__PWR_GFX_RLC_CGPG_EN_MASK		0x00000001L
> > -#define PWR_MISC_CNTL_STATUS__PWR_GFXOFF_STATUS_MASK		0x00000006L
> >   
> >   static const unsigned long SMU10_Magic = (unsigned long) PHM_Rv_Magic;
> >   
> > @@ -254,28 +248,16 @@ static int smu10_power_off_asic(struct pp_hwmgr *hwmgr)
> >   	return smu10_reset_cc6_data(hwmgr);
> >   }
> >   
> > -static bool smu10_is_gfx_on(struct pp_hwmgr *hwmgr)
> > -{
> > -	uint32_t reg;
> > -	struct amdgpu_device *adev = hwmgr->adev;
> > -
> > -	reg = RREG32_SOC15(PWR, 0, mmPWR_MISC_CNTL_STATUS);
> > -	if ((reg & PWR_MISC_CNTL_STATUS__PWR_GFXOFF_STATUS_MASK) ==
> > -	    (0x2 << PWR_MISC_CNTL_STATUS__PWR_GFXOFF_STATUS__SHIFT))
> > -		return true;
> > -
> > -	return false;
> > -}
> > -
> >   static int smu10_disable_gfx_off(struct pp_hwmgr *hwmgr)
> >   {
> >   	struct smu10_hwmgr *smu10_data = (struct smu10_hwmgr *)(hwmgr->backend);
> > +	struct amdgpu_device *adev = hwmgr->adev;
> >   
> >   	if (smu10_data->gfx_off_controled_by_driver) {
> >   		smum_send_msg_to_smc(hwmgr, PPSMC_MSG_DisableGfxOff);
> >   
> >   		/* confirm gfx is back to "on" state */
> > -		while (!smu10_is_gfx_on(hwmgr))
> > +		while (!amdgpu_gfx_is_gfx_on(adev))
> >   			msleep(1);
> >   	}
> >   
> 


More information about the amd-gfx mailing list