[PATCH v3 1/2] drm/amdgpu: Acquire ttm locks for dmaunmap

Wed Oct 25 06:12:00 UTC 2023

Am 24.10.23 um 21:20 schrieb David Francis:
> dmaunmap can call ttm_bo_validate, which expects the
> ttm dma_resv to be held.

Well first of all the dma_resv object isn't related to TTM.

>
> Acquire the locks in amdgpu_amdkfd_gpuvm_dmaunmap_mem.
>
> Because the dmaunmap step can now fail, two new numbers
> need to be tracked. n_dmaunmap_success tracks the number
> of devices that have completed dmaunmap. If a device fails
> to dmaunmap due to a signal interrupt, n_dmaunmap_bos tracks
> the number of bos on that device that were successfully
> dmaunmapped.
>
> Track those values in struct kgd_mem.
>
> This failure can also cause the sync_memory step of the ioctl
> to be repeated; it is idempotent, so this should not cause any issues.
>
> Signed-off-by: David Francis <David.Francis at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  6 ++++-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 23 +++++++++++++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 19 ++++++++++-----
>   3 files changed, 37 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 3ad8dc523b42..c60564ec4312 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -86,6 +86,10 @@ struct kgd_mem {
>   
>   	bool aql_queue;
>   	bool is_imported;
> +
> +	/* Used to track successful dmaunmap across retries in unmap ioctl */
> +	uint32_t n_dmaunmap_success;
> +	uint32_t n_dmaunmap_bos;
>   };
>   
>   /* KFD Memory Eviction */
> @@ -302,7 +306,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device *adev,
>   					  struct kgd_mem *mem, void *drm_priv);
>   int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
>   		struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv);
> -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
> +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
>   int amdgpu_amdkfd_gpuvm_sync_memory(
>   		struct amdgpu_device *adev, struct kgd_mem *mem, bool intr);
>   int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 54f31a420229..c431132d7cc1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -2102,21 +2102,36 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
>   	return ret;
>   }
>   
> -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
> +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
>   {
>   	struct kfd_mem_attachment *entry;
>   	struct amdgpu_vm *vm;
> +	int ret;
> +	int i = 0;
>   
>   	vm = drm_priv_to_vm(drm_priv);
>   
>   	mutex_lock(&mem->lock);
>   
>   	list_for_each_entry(entry, &mem->attachments, list) {
> -		if (entry->bo_va->base.vm == vm)
> -			kfd_mem_dmaunmap_attachment(mem, entry);
> -	}
> +		if (i >= mem->n_dmaunmap_bos) {
> +			ret = amdgpu_bo_reserve(entry->bo_va->base.bo, false);
> +			if (ret) {
> +				mem->n_dmaunmap_bos = i;
> +				goto out;
> +			}
> +
> +			if (entry->bo_va->base.vm == vm)
> +				kfd_mem_dmaunmap_attachment(mem, entry);
>   
> +			amdgpu_bo_unreserve(entry->bo_va->base.bo);
> +		}
> +		i++;
> +	}

WOW, big NAK to that!

You are essentially re-inventing the locking multiple BOs at the same 
time, please don't do that. Instead use dma_exec or ttm_exec_buf for that.

This also avoids all the fail handling.

Regards,
Christian.

> +	mem->n_dmaunmap_bos = 0;
> +out:
>   	mutex_unlock(&mem->lock);
> +	return ret;
>   }
>   
>   int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 06988cf1db51..66dee67ad859 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1366,7 +1366,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>   {
>   	struct kfd_ioctl_unmap_memory_from_gpu_args *args = data;
>   	struct kfd_process_device *pdd, *peer_pdd;
> -	void *mem;
> +	struct kgd_mem *mem;
>   	long err = 0;
>   	uint32_t *devices_arr = NULL, i;
>   	bool flush_tlb;
> @@ -1400,7 +1400,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>   		goto bind_process_to_device_failed;
>   	}
>   
> -	mem = kfd_process_device_translate_handle(pdd,
> +	mem = (struct kgd_mem *)kfd_process_device_translate_handle(pdd,
>   						GET_IDR_HANDLE(args->handle));
>   	if (!mem) {
>   		err = -ENOMEM;
> @@ -1414,7 +1414,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>   			goto get_mem_obj_from_handle_failed;
>   		}
>   		err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
> -			peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv);
> +			peer_pdd->dev->adev, mem, peer_pdd->drm_priv);
>   		if (err) {
>   			pr_err("Failed to unmap from gpu %d/%d\n",
>   			       i, args->n_devices);
> @@ -1426,7 +1426,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>   	flush_tlb = kfd_flush_tlb_after_unmap(pdd->dev->kfd);
>   	if (flush_tlb) {
>   		err = amdgpu_amdkfd_gpuvm_sync_memory(pdd->dev->adev,
> -				(struct kgd_mem *) mem, true);
> +				mem, true);
>   		if (err) {
>   			pr_debug("Sync memory failed, wait interrupted by user signal\n");
>   			goto sync_memory_failed;
> @@ -1434,7 +1434,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>   	}
>   
>   	/* Flush TLBs after waiting for the page table updates to complete */
> -	for (i = 0; i < args->n_devices; i++) {
> +	for (i = mem->n_dmaunmap_success; i < args->n_devices; i++) {
>   		peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
>   		if (WARN_ON_ONCE(!peer_pdd))
>   			continue;
> @@ -1442,8 +1442,14 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>   			kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
>   
>   		/* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */
> -		amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
> +		err = amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
> +		if (err) {
> +			pr_debug("DMA unmapping failed, acquire interrupted by user signal\n");
> +			goto dmaunmap_failed;
> +		}
> +		mem->n_dmaunmap_success = i+1;
>   	}
> +	mem->n_dmaunmap_success = 0;
>   
>   	mutex_unlock(&p->mutex);
>   
> @@ -1455,6 +1461,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>   get_mem_obj_from_handle_failed:
>   unmap_memory_from_gpu_failed:
>   sync_memory_failed:
> +dmaunmap_failed:
>   	mutex_unlock(&p->mutex);
>   copy_from_user_failed:
>   	kfree(devices_arr);