[PATCH] drm/amdgpu/sriov:fix driver unloading bug

Liu, Monk Monk.Liu at amd.com
Thu Sep 21 08:19:58 UTC 2017


I have decided to drop this fix: although it avoids the SAVE_VF failure, the next round of driver loading will then fail on KIQ unless a VF FLR is inserted before loading. So I have another workaround for it instead, which allocates the MQD of the KIQ in the VRAM domain.

That one has already been sent out for review, thanks!
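
For reference, a rough sketch of that second workaround (illustrative only, not the patch I actually sent; the wrapper name is made up and the size/alignment just mirror the usual MQD allocation — the point is simply passing AMDGPU_GEM_DOMAIN_VRAM instead of AMDGPU_GEM_DOMAIN_GTT to amdgpu_bo_create_kernel()):

    /* Sketch only: keep the KIQ MQD in VRAM, the idea being that the MQD
     * backing memory RLCV/CPC touch stays valid even after the guest
     * driver has been unloaded.
     */
    static int kiq_mqd_create_in_vram(struct amdgpu_device *adev,
                                      struct amdgpu_ring *kiq_ring)
    {
            return amdgpu_bo_create_kernel(adev, sizeof(struct vi_mqd), PAGE_SIZE,
                                           AMDGPU_GEM_DOMAIN_VRAM, /* instead of GTT */
                                           &kiq_ring->mqd_obj,
                                           &kiq_ring->mqd_gpu_addr,
                                           &kiq_ring->mqd_ptr);
    }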

-----Original Message-----
From: Deucher, Alexander 
Sent: September 21, 2017 14:58
To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Chen, Horace <Horace.Chen at amd.com>; Liu, Monk <Monk.Liu at amd.com>
Subject: RE: [PATCH] drm/amdgpu/sriov:fix driver unloading bug

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces at lists.freedesktop.org] On Behalf 
> Of Monk Liu
> Sent: Wednesday, September 20, 2017 5:28 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Chen, Horace; Liu, Monk
> Subject: [PATCH] drm/amdgpu/sriov:fix driver unloading bug
> 
> Fix the hypervisor SAVE_VF failure that is hit after the guest driver
> is unloaded.
> 
> The reason SAVE_VF fails is that the KIQ and KCQ are still active after
> the driver is unloaded, so RLCV will command CPC to run the MQD (to
> save the current status) on every queue that is still active.
> 
> The fix is to unmap the KCQs and disable the KIQ/HIQ in gfx fini: we
> implement new routines to unmap the KCQs for gfx8/9, and we disable the
> KIQ/HIQ in the RLC registers so RLCV won't make CPC run the MQD
> commands.
> 
> Change-Id: I95eb650f4bd16b639ca6e773efce80abb5e04641
> Signed-off-by: Horace Chen <horace.chen at amd.com>
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 63 ++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 64 +++++++++++++++++++++++++++++++++++
>  2 files changed, 127 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 0c4a3b8..14be0bd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -5034,6 +5034,57 @@ static int gfx_v8_0_hw_init(void *handle)
>  	return r;
>  }
> 
> +static int gfx_v8_0_kcq_disable(struct amdgpu_ring *kiq_ring, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = kiq_ring->adev;
> +	uint32_t scratch, tmp = 0;
> +	int r, i;
> +
> +	r = amdgpu_gfx_scratch_get(adev, &scratch);
> +	if (r) {
> +		DRM_ERROR("Failed to get scratch reg (%d).\n", r);
> +		return r;
> +	}
> +	WREG32(scratch, 0xCAFEDEAD);
> +
> +	r = amdgpu_ring_alloc(kiq_ring, 10);
> +	if (r) {
> +		DRM_ERROR("Failed to lock KIQ (%d).\n", r);
> +		amdgpu_gfx_scratch_free(adev, scratch);
> +		return r;
> +	}
> +
> +	/* unmap queues */
> +	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
> +	amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
> +			  PACKET3_UNMAP_QUEUES_ACTION(1) | /* RESET_QUEUES */
> +			  PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) |
> +			  PACKET3_UNMAP_QUEUES_ENGINE_SEL(0) |
> +			  PACKET3_UNMAP_QUEUES_NUM_QUEUES(1));
> +	amdgpu_ring_write(kiq_ring, PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
> +	amdgpu_ring_write(kiq_ring, 0);
> +	amdgpu_ring_write(kiq_ring, 0);
> +	amdgpu_ring_write(kiq_ring, 0);
> +	/* write to scratch for completion */
> +	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
> +	amdgpu_ring_write(kiq_ring, (scratch - PACKET3_SET_UCONFIG_REG_START));
> +	amdgpu_ring_write(kiq_ring, 0xDEADBEEF);
> +	amdgpu_ring_commit(kiq_ring);
> +
> +	for (i = 0; i < adev->usec_timeout; i++) {
> +		tmp = RREG32(scratch);
> +		if (tmp == 0xDEADBEEF)
> +			break;
> +		DRM_UDELAY(1);
> +	}
> +	if (i >= adev->usec_timeout) {
> +		DRM_ERROR("KCQ disable failed (scratch(0x%04X)=0x%08X)\n", scratch, tmp);
> +		r = -EINVAL;
> +	}
> +	amdgpu_gfx_scratch_free(adev, scratch);
> +	return r;
> +}
> +
>  static int gfx_v8_0_hw_fini(void *handle)
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -5041,6 +5092,18 @@ static int gfx_v8_0_hw_fini(void *handle)
>  	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
>  	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
>  	if (amdgpu_sriov_vf(adev)) {
> +		uint32_t value;
> +		int i;
> +
> +		/* disable the KCQs so CPC stops touching memory that is no longer valid */
> +		for (i = 0; i < adev->gfx.num_compute_rings; i++)
> +			gfx_v8_0_kcq_disable(&adev->gfx.kiq.ring, &adev->gfx.compute_ring[i]);
> +
> +		/* disable KIQ & HIQ */
> +		value = RREG32(mmRLC_CP_SCHEDULERS);
> +		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler0, 0);
> +		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, 0);
> +		WREG32(mmRLC_CP_SCHEDULERS, value);

Please make sure this won't break the KFD. IIRC, they already shut down the HIQ in the KFD teardown path. Same for gfx9.

Alex

>  		pr_debug("For SRIOV client, shouldn't do anything.\n");
>  		return 0;
>  	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index e2ae00d..f1f34a8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2895,6 +2895,58 @@ static int gfx_v9_0_hw_init(void *handle)
>  	return r;
>  }
> 
> +static int gfx_v9_0_kcq_disable(struct amdgpu_ring *kiq_ring, struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = kiq_ring->adev;
> +	uint32_t scratch, tmp = 0;
> +	int r, i;
> +
> +	r = amdgpu_gfx_scratch_get(adev, &scratch);
> +	if (r) {
> +		DRM_ERROR("Failed to get scratch reg (%d).\n", r);
> +		return r;
> +	}
> +	WREG32(scratch, 0xCAFEDEAD);
> +
> +	r = amdgpu_ring_alloc(kiq_ring, 10);
> +	if (r) {
> +		DRM_ERROR("Failed to lock KIQ (%d).\n", r);
> +		amdgpu_gfx_scratch_free(adev, scratch);
> +		return r;
> +	}
> +
> +	/* unmap queues */
> +	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
> +	amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
> +			  PACKET3_UNMAP_QUEUES_ACTION(1) | /* RESET_QUEUES */
> +			  PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) |
> +			  PACKET3_UNMAP_QUEUES_ENGINE_SEL(0) |
> +			  PACKET3_UNMAP_QUEUES_NUM_QUEUES(1));
> +	amdgpu_ring_write(kiq_ring, PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
> +	amdgpu_ring_write(kiq_ring, 0);
> +	amdgpu_ring_write(kiq_ring, 0);
> +	amdgpu_ring_write(kiq_ring, 0);
> +	/* write to scratch for completion */
> +	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
> +	amdgpu_ring_write(kiq_ring, (scratch - PACKET3_SET_UCONFIG_REG_START));
> +	amdgpu_ring_write(kiq_ring, 0xDEADBEEF);
> +	amdgpu_ring_commit(kiq_ring);
> +
> +	for (i = 0; i < adev->usec_timeout; i++) {
> +		tmp = RREG32(scratch);
> +		if (tmp == 0xDEADBEEF)
> +			break;
> +		DRM_UDELAY(1);
> +	}
> +	if (i >= adev->usec_timeout) {
> +		DRM_ERROR("KCQ disable failed (scratch(0x%04X)=0x%08X)\n", scratch, tmp);
> +		r = -EINVAL;
> +	}
> +	amdgpu_gfx_scratch_free(adev, scratch);
> +	return r;
> +}
> +
> +
>  static int gfx_v9_0_hw_fini(void *handle)
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -2902,6 +2954,18 @@ static int gfx_v9_0_hw_fini(void *handle)
>  	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
>  	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
>  	if (amdgpu_sriov_vf(adev)) {
> +		uint32_t value;
> +		int i;
> +
> +		/* disable the KCQs so CPC stops touching memory that is no longer valid */
> +		for (i = 0; i < adev->gfx.num_compute_rings; i++)
> +			gfx_v9_0_kcq_disable(&adev->gfx.kiq.ring, &adev->gfx.compute_ring[i]);
> +
> +		/* disable KIQ & HIQ */
> +		value = RREG32(mmRLC_CP_SCHEDULERS);
> +		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler0, 0);
> +		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, 0);
> +		WREG32(mmRLC_CP_SCHEDULERS, value);
>  		pr_debug("For SRIOV client, shouldn't do anything.\n");
>  		return 0;
>  	}
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

