[PATCH 2/2] drm/amdkfd: Fix double release compute pasid
Felix Kuehling
felix.kuehling at amd.com
Wed Dec 14 22:42:41 UTC 2022
On 2022-12-14 10:42, Philip Yang wrote:
> If kfd_process_device_init_vm returns failure after the vm is converted to
> a compute vm and vm->pasid is set to the compute pasid, KFD will not take
> the pdd->drm_file reference. As a result, the drm close file handler may be
> called to release the compute pasid before the KFD process destroy worker
> releases the same pasid and sets vm->pasid to zero. This generates the
> WARNING backtrace and NULL pointer access shown below.
>
> Add helper amdgpu_amdkfd_gpuvm_set_vm_pasid and call it as the last step
> of kfd_process_device_init_vm. This ensures the vm pasid is either the
> original pasid if acquiring the vm failed, or the compute pasid with the
> pdd->drm_file reference taken, which avoids a double release of the same
> pasid.
>
> amdgpu: Failed to create process VM object
> ida_free called for id=32770 which is not allocated.
> WARNING: CPU: 57 PID: 72542 at ../lib/idr.c:522 ida_free+0x96/0x140
> RIP: 0010:ida_free+0x96/0x140
> Call Trace:
> amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu]
> amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu]
> drm_file_free.part.13+0x216/0x270 [drm]
> drm_close_helper.isra.14+0x60/0x70 [drm]
> drm_release+0x6e/0xf0 [drm]
> __fput+0xcc/0x280
> ____fput+0xe/0x20
> task_work_run+0x96/0xc0
> do_exit+0x3d0/0xc10
>
> BUG: kernel NULL pointer dereference, address: 0000000000000000
> RIP: 0010:ida_free+0x76/0x140
> Call Trace:
> amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu]
> amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu]
> drm_file_free.part.13+0x216/0x270 [drm]
> drm_close_helper.isra.14+0x60/0x70 [drm]
> drm_release+0x6e/0xf0 [drm]
> __fput+0xcc/0x280
> ____fput+0xe/0x20
> task_work_run+0x96/0xc0
> do_exit+0x3d0/0xc10
>
> Suggested-by: Felix Kuehling <Felix.Kuehling at amd.com>
I don't think I suggested this fix. I didn't realize that the problem
only affected the case where kfd_process_device_init_vm fails. Anyway,
the series is
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 4 +-
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 39 +++++++++++++------
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 12 ++++--
> 3 files changed, 40 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 589939631ed4..0040deaf8a83 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -270,8 +270,10 @@ int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_
> (&((struct amdgpu_fpriv *) \
> ((struct drm_file *)(drm_priv))->driver_priv)->vm)
>
> +int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
> + struct file *filp, u32 pasid);
> int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
> - struct file *filp, u32 pasid,
> + struct file *filp,
> void **process_info,
> struct dma_fence **ef);
> void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 0a854bb8b47e..b15091d8310d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1429,10 +1429,9 @@ static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
> amdgpu_bo_unreserve(bo);
> }
>
> -int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
> - struct file *filp, u32 pasid,
> - void **process_info,
> - struct dma_fence **ef)
> +int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
> + struct file *filp, u32 pasid)
> +
> {
> struct amdgpu_fpriv *drv_priv;
> struct amdgpu_vm *avm;
> @@ -1443,10 +1442,6 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
> return ret;
> avm = &drv_priv->vm;
>
> - /* Already a compute VM? */
> - if (avm->process_info)
> - return -EINVAL;
> -
> /* Free the original amdgpu allocated pasid,
> * will be replaced with kfd allocated pasid.
> */
> @@ -1455,14 +1450,36 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
> amdgpu_vm_set_pasid(adev, avm, 0);
> }
>
> - /* Convert VM into a compute VM */
> - ret = amdgpu_vm_make_compute(adev, avm);
> + ret = amdgpu_vm_set_pasid(adev, avm, pasid);
> if (ret)
> return ret;
>
> - ret = amdgpu_vm_set_pasid(adev, avm, pasid);
> + return 0;
> +}
> +
> +int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
> + struct file *filp,
> + void **process_info,
> + struct dma_fence **ef)
> +{
> + struct amdgpu_fpriv *drv_priv;
> + struct amdgpu_vm *avm;
> + int ret;
> +
> + ret = amdgpu_file_to_fpriv(filp, &drv_priv);
> if (ret)
> return ret;
> + avm = &drv_priv->vm;
> +
> + /* Already a compute VM? */
> + if (avm->process_info)
> + return -EINVAL;
> +
> + /* Convert VM into a compute VM */
> + ret = amdgpu_vm_make_compute(adev, avm);
> + if (ret)
> + return ret;
> +
> /* Initialize KFD part of the VM and process info */
> ret = init_kfd_vm(avm, process_info, ef);
> if (ret)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 6caa9dd57ff1..51b1683ac5c1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1576,9 +1576,9 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
> p = pdd->process;
> dev = pdd->dev;
>
> - ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(
> - dev->adev, drm_file, p->pasid,
> - &p->kgd_process_info, &p->ef);
> + ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, drm_file,
> + &p->kgd_process_info,
> + &p->ef);
> if (ret) {
> pr_err("Failed to create process VM object\n");
> return ret;
> @@ -1593,10 +1593,16 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
> if (ret)
> goto err_init_cwsr;
>
> + ret = amdgpu_amdkfd_gpuvm_set_vm_pasid(dev->adev, drm_file, p->pasid);
> + if (ret)
> + goto err_set_pasid;
> +
> pdd->drm_file = drm_file;
>
> return 0;
>
> +err_set_pasid:
> + kfd_process_device_destroy_cwsr_dgpu(pdd);
> err_init_cwsr:
> kfd_process_device_destroy_ib_mem(pdd);
> err_reserve_ib_mem:
More information about the amd-gfx
mailing list