[PATCH] drm/amdkfd: fix svm_bo release invalid wait context warning

Felix Kuehling felix.kuehling at amd.com
Wed Dec 1 00:42:01 UTC 2021


Am 2021-11-30 um 3:13 p.m. schrieb Philip Yang:
> svm_range_bo_release can be called from the atomic put_page context in
> __do_munmap if the process release work has already finished freeing
> the pranges of the svm_bo. Schedule release_work to wait for the
> svm_bo eviction work to finish and then free the svm_bo.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 36 +++++++++++++++++++---------
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  1 +
>  2 files changed, 26 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index f2db49c7a8fd..8af87a662a0d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -327,11 +327,33 @@ static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
>  	return true;
>  }
>  
> +static void svm_range_bo_wq_release(struct work_struct *work)
> +{
> +	struct svm_range_bo *svm_bo;
> +
> +	svm_bo = container_of(work, struct svm_range_bo, release_work);
> +	pr_debug("svm_bo 0x%p\n", svm_bo);
> +
> +	if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) {
> +		/* We're not in the eviction worker.
> +		 * Signal the fence and synchronize with any
> +		 * pending eviction work.
> +		 */

Now that this is its own worker, it's definitely never in the eviction
worker. So this doesn't need to be conditional here.
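
Just to illustrate (untested, only restating the quoted code with the
check dropped):

static void svm_range_bo_wq_release(struct work_struct *work)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(work, struct svm_range_bo, release_work);
	pr_debug("svm_bo 0x%p\n", svm_bo);

	/* This is our own worker, never the eviction worker, so we can
	 * signal the fence and flush any pending eviction work
	 * unconditionally. Both are harmless no-ops if the eviction
	 * already ran.
	 */
	dma_fence_signal(&svm_bo->eviction_fence->base);
	cancel_work_sync(&svm_bo->eviction_work);

	dma_fence_put(&svm_bo->eviction_fence->base);
	amdgpu_bo_unref(&svm_bo->bo);
	kfree(svm_bo);
}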

I'm thinking, in the eviction case, the extra scheduling is not actually
needed and only adds latency and overhead. The eviction already runs in
a worker thread and the svm_range_bo_unref there is definitely not in an
atomic context.

So I wonder if we should have two versions of
svm_range_bo_unref_sync/async and svm_range_bo_release_sync/async. The
synchronous version doesn't schedule a worker and is used when we're
sure we're not in atomic context. The asynchronous version can be used
in places that may be in atomic context.
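
Something along these lines, maybe (just a rough sketch to illustrate
the idea; the exact factoring and names are up for discussion):

static void svm_range_bo_release(struct kref *kref)
{
	/* ... the existing synchronous teardown, unchanged: detach the
	 * pranges, signal the eviction fence if needed, flush pending
	 * eviction work, then free the BO. Must be able to sleep.
	 */
}

static void svm_range_bo_wq_release(struct work_struct *work)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(work, struct svm_range_bo, release_work);
	svm_range_bo_release(&svm_bo->kref);
}

static void svm_range_bo_release_async(struct kref *kref)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(kref, struct svm_range_bo, kref);
	INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release);
	schedule_work(&svm_bo->release_work);
}

/* For callers that may be in atomic context, e.g. the munmap path */
void svm_range_bo_unref_async(struct svm_range_bo *svm_bo)
{
	if (svm_bo)
		kref_put(&svm_bo->kref, svm_range_bo_release_async);
}

/* For callers that can certainly sleep, e.g. the eviction worker */
void svm_range_bo_unref(struct svm_range_bo *svm_bo)
{
	if (svm_bo)
		kref_put(&svm_bo->kref, svm_range_bo_release);
}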

Regards,
  Felix


> +		dma_fence_signal(&svm_bo->eviction_fence->base);
> +		cancel_work_sync(&svm_bo->eviction_work);
> +	}
> +	dma_fence_put(&svm_bo->eviction_fence->base);
> +	amdgpu_bo_unref(&svm_bo->bo);
> +	kfree(svm_bo);
> +}
> +
>  static void svm_range_bo_release(struct kref *kref)
>  {
>  	struct svm_range_bo *svm_bo;
>  
>  	svm_bo = container_of(kref, struct svm_range_bo, kref);
> +	pr_debug("svm_bo 0x%p\n", svm_bo);
> +
>  	spin_lock(&svm_bo->list_lock);
>  	while (!list_empty(&svm_bo->range_list)) {
>  		struct svm_range *prange =
> @@ -352,17 +374,9 @@ static void svm_range_bo_release(struct kref *kref)
>  		spin_lock(&svm_bo->list_lock);
>  	}
>  	spin_unlock(&svm_bo->list_lock);
> -	if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) {
> -		/* We're not in the eviction worker.
> -		 * Signal the fence and synchronize with any
> -		 * pending eviction work.
> -		 */
> -		dma_fence_signal(&svm_bo->eviction_fence->base);
> -		cancel_work_sync(&svm_bo->eviction_work);
> -	}
> -	dma_fence_put(&svm_bo->eviction_fence->base);
> -	amdgpu_bo_unref(&svm_bo->bo);
> -	kfree(svm_bo);
> +
> +	INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release);
> +	schedule_work(&svm_bo->release_work);
>  }
>  
>  void svm_range_bo_unref(struct svm_range_bo *svm_bo)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 6dc91c33e80f..23478ae7a7d9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -48,6 +48,7 @@ struct svm_range_bo {
>  	struct work_struct		eviction_work;
>  	struct svm_range_list		*svms;
>  	uint32_t			evicting;
> +	struct work_struct		release_work;
>  };
>  
>  enum svm_work_list_ops {
