[PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV

Deng, Emily Emily.Deng at amd.com
Tue Mar 27 08:31:03 UTC 2018


Hi Christian,
     I only found the issue on SRIOV environment, does it also occurs on other cases, for example, thanks.

Best Wishes,
Emily Deng

> -----Original Message-----
> From: Christian König [mailto:ckoenig.leichtzumerken at gmail.com]
> Sent: Tuesday, March 27, 2018 4:27 PM
> To: Deng, Emily <Emily.Deng at amd.com>; Koenig, Christian
> <Christian.Koenig at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Liu, Monk <Monk.Liu at amd.com>
> Subject: Re: [PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV
> 
> Hi Emily,
> 
> Am 27.03.2018 um 10:23 schrieb Deng, Emily:
> > Hi Christian,
> >      I think the root cause has been found out. It is the hardware
> > restriction that couldn't do world switch between invalidate request,
> > and ack. We couldn't control world switch in guest driver, but could replace
> the origin two command with one command to avoid world switch.
> 
> Unfortunately the problem was reproduced completely without world switch.
> 
> So the world switch is not the root cause, but only another trigger to the
> issue.
> 
> We need to find the root cause or otherwise will not fix the real problem but
> only mitigate the effects.
> 
> Regards,
> Christian.
> 
> >
> >      For the multiple times writing, it is also the same root cause,
> > when the world switch happened between the  invalidate request, and
> > ack, it will have problem. So we could give more chance for SRIOV to
> > do the invalidate request to workaround the case that world switch
> > happens between invalidate request, and ack
> >
> > Best Wishes,
> > Emily Deng
> >
> >> -----Original Message-----
> >> From: Christian König [mailto:ckoenig.leichtzumerken at gmail.com]
> >> Sent: Tuesday, March 27, 2018 3:48 PM
> >> To: Deng, Emily <Emily.Deng at amd.com>; amd-gfx at lists.freedesktop.org
> >> Cc: Liu, Monk <Monk.Liu at amd.com>
> >> Subject: Re: [PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV
> >>
> >> Am 27.03.2018 um 07:58 schrieb Emily Deng:
> >>> issue:
> >>> the vmflush in KCQ could be preempted (not like GFX ring which
> >>> doesn't allow preemption in ring buffer) and this lead to vm flush
> >>> fail when there is a world switch during the vm flush procedure
> >>> (between write invalidate request and query invalidate ack)
> >>>
> >>> fix:
> >>> separate vm flush for gfx and compute ring, and use the new format
> >>> command in compute's vm flush which use only one package so no
> >>> preemption could allowed
> >> NAK, as already discussed multiple times now that only circumvents
> >> the problem, but not really fixes it.
> >>
> >> Just executing the "amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req
> >> + eng, req);" multiple times has the same effect and we need to figure out
> why.
> >>
> >> Regards,
> >> Christian.
> >>
> >>> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> >>> Signed-off-by: Emily Deng <Emily.Deng at amd.com>
> >>> ---
> >>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
> >>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
> >>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 10 +++++++++-
> >>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    | 18 +++++++++++++-----
> >>>    4 files changed, 25 insertions(+), 6 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >>> index a7e2229..986659f 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >>> @@ -1790,6 +1790,7 @@ amdgpu_get_sdma_instance(struct
> >> amdgpu_ring *ring)
> >>>    #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
> >>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d),
> (v))
> >>>    #define amdgpu_ring_emit_reg_wait(r, d, v, m)
> >>> (r)->funcs->emit_reg_wait((r), (d), (v), (m))
> >>> +#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m)
> >>> +(r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m))
> >>>    #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
> >>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
> >>>    #define amdgpu_ring_init_cond_exec(r)
> >>> (r)->funcs->init_cond_exec((r)) diff --git
> >>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> index 1d0d250..d85df5d 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> >>> @@ -152,6 +152,8 @@ struct amdgpu_ring_funcs {
> >>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg,
> >>> uint32_t
> >> val);
> >>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
> >>>    			      uint32_t val, uint32_t mask);
> >>> +	void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0,
> >>> +				uint32_t reg1, uint32_t val, uint32_t mask);
> >>>    	void (*emit_tmz)(struct amdgpu_ring *ring, bool start);
> >>>    	/* priority functions */
> >>>    	void (*set_priority) (struct amdgpu_ring *ring, diff --git
> >>> a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> >>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> >>> index 1ae3de1..509c9d2 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> >>> @@ -4078,6 +4078,13 @@ static void
> >>> gfx_v9_0_ring_emit_reg_wait(struct
> >> amdgpu_ring *ring, uint32_t reg,
> >>>    	gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
> >>>    }
> >>>
> >>> +static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring
> >> *ring,
> >>> +					uint32_t reg0, uint32_t reg1,
> >>> +					uint32_t val, uint32_t mask)
> >>> +{
> >>> +	gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask, 0x20);
> >>> +}
> >>> +
> >>>    static void gfx_v9_0_set_gfx_eop_interrupt_state(struct
> >>> amdgpu_device
> >> *adev,
> >>>    						 enum
> >> amdgpu_interrupt_state state)
> >>>    {
> >>> @@ -4415,7 +4422,7 @@ static const struct amdgpu_ring_funcs
> >> gfx_v9_0_ring_funcs_compute = {
> >>>    		7 + /* gfx_v9_0_ring_emit_hdp_flush */
> >>>    		5 + /* hdp invalidate */
> >>>    		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> >>> -		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> >>> +		(SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 +
> >>>    		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> >>>    		2 + /* gfx_v9_0_ring_emit_vm_flush */
> >>>    		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm
> >> fence
> >>> */ @@ -4433,6 +4440,7 @@ static const struct amdgpu_ring_funcs
> >> gfx_v9_0_ring_funcs_compute = {
> >>>    	.set_priority = gfx_v9_0_ring_set_priority_compute,
> >>>    	.emit_wreg = gfx_v9_0_ring_emit_wreg,
> >>>    	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> >>> +	.emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute,
> >>>    };
> >>>
> >>>    static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> index e687363..968447d 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> >>> @@ -385,11 +385,19 @@ static uint64_t
> >> gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >>>    	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
> >>>    			      upper_32_bits(pd_addr));
> >>>
> >>> -	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
> >>> -
> >>> -	/* wait for the invalidate to complete */
> >>> -	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> >>> -				  1 << vmid, 1 << vmid);
> >>> +    /* The world switch cannot be allowed to occur while
> >>> +       some invalidation controller code is waiting for an ack.
> >>> +       To workaround the hardware restriction, replace the original
> >>> +       two command with one command for compute ring */
> >>> +	if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&
> >> amdgpu_sriov_vf(adev)) {
> >>> +		amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req
> >> + eng,
> >>> +				   hub->vm_inv_eng0_ack + eng, req, 1 <<
> >> vmid);
> >>> +	} else {
> >>> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng,
> >> req);
> >>> +		/* wait for the invalidate to complete */
> >>> +		amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack +
> >> eng,
> >>> +			   1 << vmid, 1 << vmid);
> >>> +	}
> >>>
> >>>    	return pd_addr;
> >>>    }
> > _______________________________________________
> > amd-gfx mailing list
> > amd-gfx at lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx



More information about the amd-gfx mailing list