[PATCH] drm/amdgpu: fix gfx hang during suspend with video playback

Liang, Prike Prike.Liang at amd.com
Fri Apr 3 09:22:28 UTC 2020


> -----Original Message-----
> From: Huang, Ray <Ray.Huang at amd.com>
> Sent: Friday, April 3, 2020 2:27 PM
> To: Liang, Prike <Prike.Liang at amd.com>
> Cc: amd-gfx at lists.freedesktop.org; Quan, Evan <Evan.Quan at amd.com>;
> Deucher, Alexander <Alexander.Deucher at amd.com>; Kuehling, Felix
> <Felix.Kuehling at amd.com>
> Subject: Re: [PATCH] drm/amdgpu: fix gfx hang during suspend with video
> playback
> 
> (+ Felix)
> 
> On Fri, Apr 03, 2020 at 12:07:53PM +0800, Liang, Prike wrote:
> > The system will be hang up during S3 as SMU is pending at GC not
> > respose the register CP_HQD_ACTIVE access request and this issue can
> > be fixed by adding RLC safe mode guard before each HQD map/unmap
> > retrive opt.
> 
> We need more information for the issue, does the map/unmap is required
> for MAP_QUEUES/UNMAP_QUEUES packets or writing with MMIO or both?
> 
[Prike]  The issue hang up at MP1 was trying to read register RSMU_RESIDENCY_COUNTER_GC 
but did not get response from GFX, since GFX was busy at reading register CP_HQD_ACTIVE.
Moreover, when disabled GFXOFF this issue also can't see so there is likely to perform 
register accessed at GFXOFF CGPG/CGCG enter stage.  As for only  this issue, that seems just 
MMIO  access failed case which occurred under QUEUE map/unmap status check. 

> From your patch, you just protect the kernel kiq and user queue. What about
> other kernel compute queues? HIQ?
> 
[Prike] So far just find the KIQ/CPQ/DIQ map/unmap will inquire the CP_HQD_ACTIVE status by MMIO accessing,
therefore just guard the KIQ  and some type user queue now. Regarding HIQ map and ummap which used the method of submitting configuration packet.  

> Thanks,
> Ray
> 
> >
> > Signed-off-by: Prike Liang <Prike.Liang at amd.com>
> > Tested-by: Mengbing Wang <Mengbing.Wang at amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 6 ++++++
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c             | 4 ++++
> >  2 files changed, 10 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > index df841c2..e265063 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > @@ -232,6 +232,7 @@ int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void
> *mqd, uint32_t pipe_id,
> >  	uint32_t *mqd_hqd;
> >  	uint32_t reg, hqd_base, data;
> >
> > +	amdgpu_gfx_rlc_enter_safe_mode(adev);
> >  	m = get_mqd(mqd);
> >
> >  	acquire_queue(kgd, pipe_id, queue_id); @@ -299,6 +300,7 @@ int
> > kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
> >
> >  	release_queue(kgd);
> >
> > +	amdgpu_gfx_rlc_exit_safe_mode(adev);
> >  	return 0;
> >  }
> >
> > @@ -497,6 +499,7 @@ bool kgd_gfx_v9_hqd_is_occupied(struct kgd_dev
> *kgd, uint64_t queue_address,
> >  	bool retval = false;
> >  	uint32_t low, high;
> >
> > +	amdgpu_gfx_rlc_enter_safe_mode(adev);
> >  	acquire_queue(kgd, pipe_id, queue_id);
> >  	act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
> >  	if (act) {
> > @@ -508,6 +511,7 @@ bool kgd_gfx_v9_hqd_is_occupied(struct kgd_dev
> *kgd, uint64_t queue_address,
> >  			retval = true;
> >  	}
> >  	release_queue(kgd);
> > +	amdgpu_gfx_rlc_exit_safe_mode(adev);
> >  	return retval;
> >  }
> >
> > @@ -541,6 +545,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd,
> void *mqd,
> >  	uint32_t temp;
> >  	struct v9_mqd *m = get_mqd(mqd);
> >
> > +	amdgpu_gfx_rlc_enter_safe_mode(adev);
> >  	if (adev->in_gpu_reset)
> >  		return -EIO;
> >
> > @@ -577,6 +582,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd,
> void *mqd,
> >  	}
> >
> >  	release_queue(kgd);
> > +	amdgpu_gfx_rlc_exit_safe_mode(adev);
> >  	return 0;
> >  }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > index 1fea077..ee107d9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> > @@ -3533,6 +3533,7 @@ static int gfx_v9_0_kiq_init_register(struct
> amdgpu_ring *ring)
> >  	struct v9_mqd *mqd = ring->mqd_ptr;
> >  	int j;
> >
> > +	amdgpu_gfx_rlc_enter_safe_mode(adev);
> >  	/* disable wptr polling */
> >  	WREG32_FIELD15(GC, 0, CP_PQ_WPTR_POLL_CNTL, EN, 0);
> >
> > @@ -3629,6 +3630,7 @@ static int gfx_v9_0_kiq_init_register(struct
> amdgpu_ring *ring)
> >  	if (ring->use_doorbell)
> >  		WREG32_FIELD15(GC, 0, CP_PQ_STATUS, DOORBELL_ENABLE,
> 1);
> >
> > +	amdgpu_gfx_rlc_exit_safe_mode(adev);
> >  	return 0;
> >  }
> >
> > @@ -3637,6 +3639,7 @@ static int gfx_v9_0_kiq_fini_register(struct
> amdgpu_ring *ring)
> >  	struct amdgpu_device *adev = ring->adev;
> >  	int j;
> >
> > +	amdgpu_gfx_rlc_enter_safe_mode(adev);
> >  	/* disable the queue if it's active */
> >  	if (RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE) & 1) {
> >
> > @@ -3668,6 +3671,7 @@ static int gfx_v9_0_kiq_fini_register(struct
> amdgpu_ring *ring)
> >  	WREG32_SOC15_RLC(GC, 0, mmCP_HQD_PQ_WPTR_HI, 0);
> >  	WREG32_SOC15_RLC(GC, 0, mmCP_HQD_PQ_WPTR_LO, 0);
> >
> > +	amdgpu_gfx_rlc_exit_safe_mode(adev);
> >  	return 0;
> >  }
> >
> > --
> > 2.7.4
> >


More information about the amd-gfx mailing list