[PATCH 2/2] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10

Zhou1, Tao Tao.Zhou1 at amd.com
Thu Nov 21 07:16:30 UTC 2019


Yes, we could delay submitting patch 2, or apply the patch only to MMHUB for now and extend it to GFXHUB later if we fix the gfx hang issue.

For patch 1, as Monk confirms SRIOV won't enable PG at all, I agree that patch 1 could be dropped.

Tao

> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken at gmail.com>
> Sent: 2019年11月20日 19:27
> To: Zhu, Changfeng <Changfeng.Zhu at amd.com>; Koenig, Christian
> <Christian.Koenig at amd.com>; Xiao, Jack <Jack.Xiao at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>; Huang, Ray <Ray.Huang at amd.com>; Huang,
> Shimmer <Xinmei.Huang at amd.com>; amd-gfx at lists.freedesktop.org
> Subject: Re: [PATCH 2/2] drm/amdgpu: invalidate mmhub semphore
> workaround in gmc9/gmc10
> 
> Am 20.11.19 um 10:44 schrieb Changfeng.Zhu:
> > From: changzhu <Changfeng.Zhu at amd.com>
> >
> > The gpuvm invalidate acknowledge state may be lost across a power-gating
> > off cycle. To avoid this issue in gmc9/gmc10 invalidation, add a semaphore
> > acquire before invalidation and a semaphore release after invalidation.
> >
> > After adding the semaphore acquire before invalidation, the semaphore
> > register becomes read-only if another process tries to acquire the
> > semaphore. The semaphore can then no longer be released, which may
> > cause a deadlock. If this deadlock happens, it needs a semaphore
> > firmware fix.
> 
> Please remove the DRM_WARN_ONCE, that looks like overkill to me.
> 
> And I'm not sure how urgent this issue is. We could also wait a few
> more days and see if the hw guys figure out why this locks up on the GFX ring.
> 
> Regards,
> Christian.
> 
> >
> > Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
> > Signed-off-by: changzhu <Changfeng.Zhu at amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 49
> ++++++++++++++++++++++++++
> >   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 49
> ++++++++++++++++++++++++++
> >   drivers/gpu/drm/amd/amdgpu/soc15.h     |  4 +--
> >   3 files changed, 100 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > index af2615ba52aa..685d0d5ef31e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> > @@ -234,6 +234,24 @@ static void gmc_v10_0_flush_vm_hub(struct
> amdgpu_device *adev, uint32_t vmid,
> >   	const unsigned eng = 17;
> >   	unsigned int i;
> >
> > +	spin_lock(&adev->gmc.invalidate_lock);
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +	for (i = 0; i < adev->usec_timeout; i++) {
> > +		/* a read return value of 1 means semaphore acuqire */
> > +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> > +		if (tmp & 0x1)
> > +			break;
> > +		udelay(1);
> > +	}
> > +
> > +	if (i >= adev->usec_timeout)
> > +		DRM_ERROR("Timeout waiting for sem acquire in VM
> flush!\n");
> > +
> >   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> >
> >   	/*
> > @@ -253,6 +271,14 @@ static void gmc_v10_0_flush_vm_hub(struct
> amdgpu_device *adev, uint32_t vmid,
> >   		udelay(1);
> >   	}
> >
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> > +
> > +	spin_unlock(&adev->gmc.invalidate_lock);
> > +
> >   	if (i < adev->usec_timeout)
> >   		return;
> >
> > @@ -338,6 +364,21 @@ static uint64_t
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   	uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
> >   	unsigned eng = ring->vm_inv_eng;
> >
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +
> > +	/* a read return value of 1 means semaphore acuqire */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> > +		amdgpu_ring_emit_reg_wait(ring,
> > +					  hub->vm_inv_eng0_sem + eng, 0x1,
> 0x1);
> > +		DRM_WARN_ONCE("Adding semaphore may cause deadlock
> and it needs firmware fix\n");
> > +	}
> > +
> >   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
> >   			      lower_32_bits(pd_addr));
> >
> > @@ -348,6 +389,14 @@ static uint64_t
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   					    hub->vm_inv_eng0_ack + eng,
> >   					    req, 1 << vmid);
> >
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> > +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng,
> 0);
> > +
> >   	return pd_addr;
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > index 1ae59af7836a..c4118cbb0fbe 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> > @@ -456,6 +456,24 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> amdgpu_device *adev, uint32_t vmid,
> >   	}
> >
> >   	spin_lock(&adev->gmc.invalidate_lock);
> > +
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +	for (j = 0; j < adev->usec_timeout; j++) {
> > +		/* a read return value of 1 means semaphore acuqire */
> > +		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> > +		if (tmp & 0x1)
> > +			break;
> > +		udelay(1);
> > +	}
> > +
> > +	if (j >= adev->usec_timeout)
> > +		DRM_ERROR("Timeout waiting for sem acquire in VM
> flush!\n");
> > +
> >   	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> >
> >   	/*
> > @@ -471,7 +489,15 @@ static void gmc_v9_0_flush_gpu_tlb(struct
> amdgpu_device *adev, uint32_t vmid,
> >   			break;
> >   		udelay(1);
> >   	}
> > +
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> > +
> >   	spin_unlock(&adev->gmc.invalidate_lock);
> > +
> >   	if (j < adev->usec_timeout)
> >   		return;
> >
> > @@ -486,6 +512,21 @@ static uint64_t
> gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   	uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
> >   	unsigned eng = ring->vm_inv_eng;
> >
> > +	/*
> > +	 * It may lose gpuvm invalidate acknowldege state across power-
> gating
> > +	 * off cycle, add semaphore acquire before invalidation and
> semaphore
> > +	 * release after invalidation to avoid entering power gated state
> > +	 * to WA the Issue
> > +	 */
> > +
> > +	/* a read return value of 1 means semaphore acuqire */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1) {
> > +		amdgpu_ring_emit_reg_wait(ring,
> > +					  hub->vm_inv_eng0_sem + eng, 0x1,
> 0x1);
> > +		DRM_WARN_ONCE("Adding semaphore may cause deadlock
> and it needs firmware fix\n");
> > +	}
> > +
> >   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
> >   			      lower_32_bits(pd_addr));
> >
> > @@ -496,6 +537,14 @@ static uint64_t
> gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> >   					    hub->vm_inv_eng0_ack + eng,
> >   					    req, 1 << vmid);
> >
> > +	/*
> > +	 * add semaphore release after invalidation,
> > +	 * write with 0 means semaphore release
> > +	 */
> > +	if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> > +	    ring->funcs->vmhub == AMDGPU_MMHUB_1)
> > +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng,
> 0);
> > +
> >   	return pd_addr;
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h
> > b/drivers/gpu/drm/amd/amdgpu/soc15.h
> > index 9af6c6ffbfa2..57af489a5de3 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/soc15.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
> > @@ -28,8 +28,8 @@
> >   #include "nbio_v7_0.h"
> >   #include "nbio_v7_4.h"
> >
> > -#define SOC15_FLUSH_GPU_TLB_NUM_WREG		4
> > -#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	1
> > +#define SOC15_FLUSH_GPU_TLB_NUM_WREG		6
> > +#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT	3
> >
> >   extern const struct amd_ip_funcs soc15_common_ip_funcs;
> >



More information about the amd-gfx mailing list