[PATCH 13/18] drm/amdgpu:fix driver unloading bug

Mon Sep 18 09:27:42 UTC 2017

Am 18.09.2017 um 08:11 schrieb Monk Liu:
> [SWDEV-126631] - fix hypervisor save_vf failure that occurred
> after the driver was removed:
> 1. Because the KIQ and KCQ were not unmapped, save_vf will fail if the driver has freed the mqd of the KIQ and KCQ.
> 2. KIQ can't be unmapped since RLCV always needs it, so the bo_free on KIQ should be skipped.
> 3. KCQ can be unmapped, and should be unmapped during hw_fini.
> 4. RLCV still needs to access other mc addresses from some hw even after the driver is unloaded,
>     so we should not unbind gart for VF.
>
> Change-Id: I320487a9a848f41484c5f8cc11be34aca807b424
> Signed-off-by: Horace Chen <horace.chen at amd.com>
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>

I absolutely can't judge if this is correct or not, but keeping the GART 
and KIQ alive after the driver is unloaded sounds really fishy to me.

Isn't there any other clean way of handling this?

Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c |  3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  5 +++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 60 +++++++++++++++++++++++++++++++-
>   3 files changed, 66 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> index f437008..2fee071 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> @@ -394,7 +394,8 @@ int amdgpu_gart_init(struct amdgpu_device *adev)
>    */
>   void amdgpu_gart_fini(struct amdgpu_device *adev)
>   {
> -	if (adev->gart.ready) {
> +	/* gart is still used by other hw under SRIOV, don't unbind it */
> +	if (adev->gart.ready && !amdgpu_sriov_vf(adev)) {
>   		/* unbind pages */
>   		amdgpu_gart_unbind(adev, 0, adev->gart.num_cpu_pages);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 4f6c68f..bf6656f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -309,6 +309,11 @@ void amdgpu_gfx_compute_mqd_sw_fini(struct amdgpu_device *adev)
>   				      &ring->mqd_ptr);
>   	}
>   
> +	/* don't deallocate KIQ mqd because the bo is still used by RLCV even
> +	after the guest VM is shut down */
> +	if (amdgpu_sriov_vf(adev))
> +		return;
> +
>   	ring = &adev->gfx.kiq.ring;
>   	kfree(adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS]);
>   	amdgpu_bo_free_kernel(&ring->mqd_obj,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 44960b3..a577bbc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2892,14 +2892,72 @@ static int gfx_v9_0_hw_init(void *handle)
>   	return r;
>   }
>   
> +static int gfx_v9_0_kcq_disable(struct amdgpu_ring *kiq_ring,struct amdgpu_ring *ring)
> +{
> +	struct amdgpu_device *adev = kiq_ring->adev;
> +	uint32_t scratch, tmp = 0;
> +	int r, i;
> +
> +	r = amdgpu_gfx_scratch_get(adev, &scratch);
> +	if (r) {
> +		DRM_ERROR("Failed to get scratch reg (%d).\n", r);
> +		return r;
> +	}
> +	WREG32(scratch, 0xCAFEDEAD);
> +
> +	r = amdgpu_ring_alloc(kiq_ring, 10);
> +	if (r) {
> +		DRM_ERROR("Failed to lock KIQ (%d).\n", r);
> +		amdgpu_gfx_scratch_free(adev, scratch);
> +		return r;
> +	}
> +
> +	/* unmap queues */
> +	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
> +	amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
> +						PACKET3_UNMAP_QUEUES_ACTION(1) | /* RESET_QUEUES */
> +						PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) |
> +						PACKET3_UNMAP_QUEUES_ENGINE_SEL(0) |
> +						PACKET3_UNMAP_QUEUES_NUM_QUEUES(1));
> +	amdgpu_ring_write(kiq_ring, PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
> +	amdgpu_ring_write(kiq_ring, 0);
> +	amdgpu_ring_write(kiq_ring, 0);
> +	amdgpu_ring_write(kiq_ring, 0);
> +	/* write to scratch for completion */
> +	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
> +	amdgpu_ring_write(kiq_ring, (scratch - PACKET3_SET_UCONFIG_REG_START));
> +	amdgpu_ring_write(kiq_ring, 0xDEADBEEF);
> +	amdgpu_ring_commit(kiq_ring);
> +
> +	for (i = 0; i < adev->usec_timeout; i++) {
> +		tmp = RREG32(scratch);
> +		if (tmp == 0xDEADBEEF)
> +			break;
> +		DRM_UDELAY(1);
> +	}
> +	if (i >= adev->usec_timeout) {
> +		DRM_ERROR("KCQ disabled failed (scratch(0x%04X)=0x%08X)\n", scratch, tmp);
> +		r = -EINVAL;
> +	}
> +	amdgpu_gfx_scratch_free(adev, scratch);
> +	return r;
> +}
> +
> +
>   static int gfx_v9_0_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +	int i, r;
>   
>   	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
>   	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
>   	if (amdgpu_sriov_vf(adev)) {
> -		pr_debug("For SRIOV client, shouldn't do anything.\n");
> +		/* disable KCQ so the CPC doesn't touch memory that is no longer valid */
> +		for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> +			r = gfx_v9_0_kcq_disable(&adev->gfx.kiq.ring, &adev->gfx.compute_ring[i]);
> +			if (r)
> +				return r;
> +		}
>   		return 0;
>   	}
>   	gfx_v9_0_cp_enable(adev, false);