[PATCH v2] drm/amdkfd: reserve the BO before validating it

Felix Kuehling felix.kuehling at amd.com
Wed Jan 24 21:40:44 UTC 2024


On 2024-01-22 4:08, Lang Yu wrote:
> Fixes: 410f08516e0f ("drm/amdkfd: Move dma unmapping after TLB flush")
>
> v2:
> Avoid unmapping attachment twice when ERESTARTSYS.
>
> [   41.708711] WARNING: CPU: 0 PID: 1463 at drivers/gpu/drm/ttm/ttm_bo.c:846 ttm_bo_validate+0x146/0x1b0 [ttm]
> [   41.708989] Call Trace:
> [   41.708992]  <TASK>
> [   41.708996]  ? show_regs+0x6c/0x80
> [   41.709000]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
> [   41.709008]  ? __warn+0x93/0x190
> [   41.709014]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
> [   41.709024]  ? report_bug+0x1f9/0x210
> [   41.709035]  ? handle_bug+0x46/0x80
> [   41.709041]  ? exc_invalid_op+0x1d/0x80
> [   41.709048]  ? asm_exc_invalid_op+0x1f/0x30
> [   41.709057]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80 [amdgpu]
> [   41.709185]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
> [   41.709197]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80 [amdgpu]
> [   41.709337]  ? srso_alias_return_thunk+0x5/0x7f
> [   41.709346]  kfd_mem_dmaunmap_attachment+0x9e/0x1e0 [amdgpu]
> [   41.709467]  amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x56/0x80 [amdgpu]
> [   41.709586]  kfd_ioctl_unmap_memory_from_gpu+0x1b7/0x300 [amdgpu]
> [   41.709710]  kfd_ioctl+0x1ec/0x650 [amdgpu]
> [   41.709822]  ? __pfx_kfd_ioctl_unmap_memory_from_gpu+0x10/0x10 [amdgpu]
> [   41.709945]  ? srso_alias_return_thunk+0x5/0x7f
> [   41.709949]  ? tomoyo_file_ioctl+0x20/0x30
> [   41.709959]  __x64_sys_ioctl+0x9c/0xd0
> [   41.709967]  do_syscall_64+0x3f/0x90
> [   41.709973]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>
> Signed-off-by: Lang Yu <Lang.Yu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  2 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 28 +++++++++++++++++--
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  4 ++-
>   3 files changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 584a0cea5572..41854417e487 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -311,7 +311,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device *adev,
>   					  struct kgd_mem *mem, void *drm_priv);
>   int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
>   		struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv);
> -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
> +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
>   int amdgpu_amdkfd_gpuvm_sync_memory(
>   		struct amdgpu_device *adev, struct kgd_mem *mem, bool intr);
>   int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 6f3a4cb2a9ef..7a050d46fa4d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -2088,21 +2088,43 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
>   	return ret;
>   }
>   
> -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
> +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
>   {
>   	struct kfd_mem_attachment *entry;
>   	struct amdgpu_vm *vm;
> +	bool reserved = false;
> +	int ret = 0;
>   
>   	vm = drm_priv_to_vm(drm_priv);
>   
>   	mutex_lock(&mem->lock);
>   
>   	list_for_each_entry(entry, &mem->attachments, list) {
> -		if (entry->bo_va->base.vm == vm)
> -			kfd_mem_dmaunmap_attachment(mem, entry);
> +		if (entry->bo_va->base.vm != vm)
> +			continue;
> +		if (entry->type == KFD_MEM_ATT_SHARED ||
> +		    entry->type == KFD_MEM_ATT_DMABUF)
> +			continue;
> +		if (!entry->bo_va->base.bo->tbo.ttm->sg)
> +			continue;

You're going to great lengths to avoid the reservation when it's not 
needed by kfd_mem_dmaunmap_attachment. However, this feels a bit 
fragile. Any changes in the kfd_mem_dmaunmap_* functions could break this.


> +
> +		if (!reserved) {
> +			ret = amdgpu_bo_reserve(mem->bo, true);
> +			if (ret)
> +				goto out;
> +			reserved = true;
> +		}
> +
> +		kfd_mem_dmaunmap_attachment(mem, entry);
>   	}
>   
> +	if (reserved)
> +		amdgpu_bo_unreserve(mem->bo);
> +
> +out:
>   	mutex_unlock(&mem->lock);
> +
> +	return ret;
>   }
>   
>   int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index ce4c52ec34d8..80e90fdef291 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1442,7 +1442,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>   			kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
>   
>   		/* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */
> -		amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
> +		err = amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
> +		if (err)
> +			goto sync_memory_failed;

This handles the case that the system call got interrupted. But you're 
not handling the restart correctly. When the ioctl is restarted, you 
don't know how many dmaunmaps are already done. So you'd need to make 
sure that repeating the dmaunmap is safe in all cases. Or what David 
tried earlier, find a way to track the unmapping so you only try to 
dmaunmap on GPUs where it's actually dmamapped.

Regards,
   Felix


>   	}
>   
>   	mutex_unlock(&p->mutex);


More information about the amd-gfx mailing list