[PATCH V2 1/2] drm/amdgpu: Acquire ttm locks for dmaunmap
Felix Kuehling
felix.kuehling at amd.com
Mon Oct 16 22:19:27 UTC 2023
On 2023-10-16 10:49, David Francis wrote:
> dmaunmap can call ttm_bo_validate, which expects the
> ttm dma_resv to be held.
>
> Acquire the locks in amdgpu_amdkfd_gpuvm_dmaunmap_mem.
>
> Because the dmaunmap step can now fail, the unmap ioctl UAPI
> needs two new arguments. n_dmaunmap_success tracks the number
> of devices that have completed dmaunmap. If a device fails
> to dmaunmap due to a signal interrupt, n_dmaunmap_bos tracks
> the number of bos on that device that were successfully
> dmaunmapped.
>
> This failure can also cause the sync_memory step of the ioctl
> to be repeated; it is idempotent, so this should not cause any issues.
>
> Signed-off-by: David Francis <David.Francis at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +-
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 23 +++++++++++++++----
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 14 +++++++++--
> include/uapi/linux/kfd_ioctl.h | 2 ++
> 4 files changed, 34 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 3ad8dc523b42..781642871900 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -302,7 +302,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device *adev,
> struct kgd_mem *mem, void *drm_priv);
> int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
> struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv);
> -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
> +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv, uint32_t *num_bos);
> int amdgpu_amdkfd_gpuvm_sync_memory(
> struct amdgpu_device *adev, struct kgd_mem *mem, bool intr);
> int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index a15e59abe70a..cbd6032f3d39 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -2094,21 +2094,36 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
> return ret;
> }
>
> -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
> +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv, uint32_t *num_bos)
> {
> struct kfd_mem_attachment *entry;
> struct amdgpu_vm *vm;
> + int ret;
> + int i = 0;
>
> vm = drm_priv_to_vm(drm_priv);
>
> mutex_lock(&mem->lock);
>
> list_for_each_entry(entry, &mem->attachments, list) {
> - if (entry->bo_va->base.vm == vm)
> - kfd_mem_dmaunmap_attachment(mem, entry);
> - }
> + if (i >= *num_bos) {
> + ret = amdgpu_bo_reserve(entry->bo_va->base.bo, false);
> + if (ret) {
> + *num_bos = i;
> + goto out;
> + }
> +
> + if (entry->bo_va->base.vm == vm)
> + kfd_mem_dmaunmap_attachment(mem, entry);
>
> + amdgpu_bo_unreserve(entry->bo_va->base.bo);
> + }
> + i++;
> + }
> + *num_bos = 0;
> +out:
> mutex_unlock(&mem->lock);
> + return ret;
> }
>
> int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 06988cf1db51..a944e255de4a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1379,6 +1379,10 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
> pr_debug("n_success exceeds n_devices\n");
> return -EINVAL;
> }
> + if (args->n_dmaunmap_success > args->n_devices) {
> + pr_debug("n_dmaunmap_success exceeds n_devices\n");
> + return -EINVAL;
> + }
>
> devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr),
> GFP_KERNEL);
> @@ -1434,7 +1438,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
> }
>
> /* Flush TLBs after waiting for the page table updates to complete */
> - for (i = 0; i < args->n_devices; i++) {
> + for (i = args->n_dmaunmap_success; i < args->n_devices; i++) {
> peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
> if (WARN_ON_ONCE(!peer_pdd))
> continue;
> @@ -1442,7 +1446,12 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
> kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
>
> /* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */
> - amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
> + err = amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv, &args->n_dmaunmap_bos);
> + if (err) {
> + pr_debug("DMA unmapping failed, acquire interrupted by user signal\n");
> + goto dmaunmap_failed;
> + }
> + args->n_dmaunmap_success = i+1;
> }
>
> mutex_unlock(&p->mutex);
> @@ -1455,6 +1464,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
> get_mem_obj_from_handle_failed:
> unmap_memory_from_gpu_failed:
> sync_memory_failed:
> +dmaunmap_failed:
> mutex_unlock(&p->mutex);
> copy_from_user_failed:
> kfree(devices_arr);
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index f0ed68974c54..edeeac370201 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -467,6 +467,8 @@ struct kfd_ioctl_unmap_memory_from_gpu_args {
> __u64 device_ids_array_ptr; /* to KFD */
> __u32 n_devices; /* to KFD */
> __u32 n_success; /* to/from KFD */
> + __u32 n_dmaunmap_success; /* to/from KFD */
> + __u32 n_dmaunmap_bos; /* to/from KFD */
This breaks the ABI with existing user mode, because this is the
argument structure being passed back and forth with user mode. You can't
change this without a very good reason and a strategy for maintaining
backwards compatibility with existing user mode.
Regards,
Felix
> };
>
> /* Allocate GWS for specific queue
More information about the amd-gfx
mailing list