[PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test (v2)

Andrey Grodzovsky andrey.grodzovsky at amd.com
Wed Aug 18 05:26:52 UTC 2021


On 2021-08-02 1:16 a.m., Guchun Chen wrote:
> In amdgpu_fence_driver_hw_fini, no need to call drm_sched_fini to stop
> scheduler in s3 test, otherwise, fence related failure will arrive
> after resume. To fix this and for a better clean up, move drm_sched_fini
> from fence_hw_fini to fence_sw_fini, as it's part of driver shutdown, and
> should never be called in hw_fini.
>
> v2: rename amdgpu_fence_driver_init to amdgpu_fence_driver_sw_init,
> to keep sw_init and sw_fini paired.
>
> Fixes: cd87a6dcf6af drm/amdgpu: adjust fence driver enable sequence
> Suggested-by: Christian König <christian.koenig at amd.com>
> Signed-off-by: Guchun Chen <guchun.chen at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 ++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 12 +++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  4 ++--
>   3 files changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b1d2dc39e8be..9e53ff851496 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3646,9 +3646,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   
>   fence_driver_init:
>   	/* Fence driver */
> -	r = amdgpu_fence_driver_init(adev);
> +	r = amdgpu_fence_driver_sw_init(adev);
>   	if (r) {
> -		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
> +		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
>   		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
>   		goto failed;
>   	}
> @@ -3988,7 +3988,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
>   	}
>   	amdgpu_fence_driver_hw_init(adev);
>   
> -
>   	r = amdgpu_device_ip_late_init(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 49c5c7331c53..7495911516c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -498,7 +498,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   }
>   
>   /**
> - * amdgpu_fence_driver_init - init the fence driver
> + * amdgpu_fence_driver_sw_init - init the fence driver
>    * for all possible rings.
>    *
>    * @adev: amdgpu device pointer
> @@ -509,13 +509,13 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>    * amdgpu_fence_driver_start_ring().
>    * Returns 0 for success.
>    */
> -int amdgpu_fence_driver_init(struct amdgpu_device *adev)
> +int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev)
>   {
>   	return 0;
>   }
>   
>   /**
> - * amdgpu_fence_driver_fini - tear down the fence driver
> + * amdgpu_fence_driver_hw_fini - tear down the fence driver
>    * for all possible rings.
>    *
>    * @adev: amdgpu device pointer
> @@ -531,8 +531,7 @@ void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
>   
>   		if (!ring || !ring->fence_drv.initialized)
>   			continue;
> -		if (!ring->no_scheduler)
> -			drm_sched_fini(&ring->sched);
> +
>   		/* You can't wait for HW to signal if it's gone */
>   		if (!drm_dev_is_unplugged(&adev->ddev))
>   			r = amdgpu_fence_wait_empty(ring);


Sorry for late notice, missed this patch. By moving drm_sched_fini
past amdgpu_fence_wait_empty a race is created as even after you waited
for all fences on the ring to signal the sw scheduler will keep submitting
new jobs on the ring and so the ring won't stay empty.

For hot device removal also we want to prevent any access to HW past PCI 
removal
in order to not do any MMIO accesses inside the physical MMIO range that 
no longer
belongs to this device after it's removal by the PCI core. Stopping all 
the schedulers prevents any MMIO
accesses done during job submissions and that why drm_sched_fini was 
done as part of amdgpu_fence_driver_hw_fini
and not amdgpu_fence_driver_sw_fini

Andrey

> @@ -560,6 +559,9 @@ void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev)
>   		if (!ring || !ring->fence_drv.initialized)
>   			continue;
>   
> +		if (!ring->no_scheduler)
> +			drm_sched_fini(&ring->sched);
> +
>   		for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j)
>   			dma_fence_put(ring->fence_drv.fences[j]);
>   		kfree(ring->fence_drv.fences);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 27adffa7658d..9c11ced4312c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -106,7 +106,6 @@ struct amdgpu_fence_driver {
>   	struct dma_fence		**fences;
>   };
>   
> -int amdgpu_fence_driver_init(struct amdgpu_device *adev);
>   void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
>   
>   int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
> @@ -115,9 +114,10 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
>   				   struct amdgpu_irq_src *irq_src,
>   				   unsigned irq_type);
> +void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
>   void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev);
> +int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev);
>   void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev);
> -void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev);
>   int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
>   		      unsigned flags);
>   int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,


More information about the amd-gfx mailing list