[PATCH 7/9] drm/amdkfd: Fix and simplify sync object handling for KFD

Kuehling, Felix Felix.Kuehling at amd.com
Fri Nov 16 20:19:55 UTC 2018


Hi Christian,

Would you review this patch? Just looking at the code, calling
amdgpu_sync_fence with adev=NULL should be OK for us. It's just a bit
unusual compared to amdgpu's usage of this function. We've had this
patch in kfd-staging for a while without problems. If you're OK with
this I'll go ahead and push this upstream as well.

Thanks,
  Felix

On 2018-11-05 8:40 p.m., Kuehling, Felix wrote:
> The adev parameter in amdgpu_sync_fence and amdgpu_sync_resv is only
> needed for updating sync->last_vm_update. This breaks if different
> adevs are passed to calls for the same sync object.
>
> Always pass NULL for calls from KFD because sync objects used for
> KFD don't belong to any particular device, and KFD doesn't need the
> sync->last_vm_update fence.
>
> This fixes kernel log warnings on multi-GPU systems after recent
> changes in amdgpu_amdkfd_gpuvm_restore_process_bos.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 28 +++++-------------------
>  1 file changed, 5 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index d005371..572ac5f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -395,23 +395,6 @@ static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
>  	return 0;
>  }
>  
> -static int sync_vm_fence(struct amdgpu_device *adev, struct amdgpu_sync *sync,
> -			 struct dma_fence *f)
> -{
> -	int ret = amdgpu_sync_fence(adev, sync, f, false);
> -
> -	/* Sync objects can't handle multiple GPUs (contexts) updating
> -	 * sync->last_vm_update. Fortunately we don't need it for
> -	 * KFD's purposes, so we can just drop that fence.
> -	 */
> -	if (sync->last_vm_update) {
> -		dma_fence_put(sync->last_vm_update);
> -		sync->last_vm_update = NULL;
> -	}
> -
> -	return ret;
> -}
> -
>  static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
>  {
>  	struct amdgpu_bo *pd = vm->root.base.bo;
> @@ -422,7 +405,7 @@ static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
>  	if (ret)
>  		return ret;
>  
> -	return sync_vm_fence(adev, sync, vm->last_update);
> +	return amdgpu_sync_fence(NULL, sync, vm->last_update, false);
>  }
>  
>  /* add_bo_to_vm - Add a BO to a VM
> @@ -826,7 +809,7 @@ static int unmap_bo_from_gpuvm(struct amdgpu_device *adev,
>  	/* Add the eviction fence back */
>  	amdgpu_bo_fence(pd, &vm->process_info->eviction_fence->base, true);
>  
> -	sync_vm_fence(adev, sync, bo_va->last_pt_update);
> +	amdgpu_sync_fence(NULL, sync, bo_va->last_pt_update, false);
>  
>  	return 0;
>  }
> @@ -851,7 +834,7 @@ static int update_gpuvm_pte(struct amdgpu_device *adev,
>  		return ret;
>  	}
>  
> -	return sync_vm_fence(adev, sync, bo_va->last_pt_update);
> +	return amdgpu_sync_fence(NULL, sync, bo_va->last_pt_update, false);
>  }
>  
>  static int map_bo_to_gpuvm(struct amdgpu_device *adev,
> @@ -911,7 +894,7 @@ static int process_sync_pds_resv(struct amdkfd_process_info *process_info,
>  			    vm_list_node) {
>  		struct amdgpu_bo *pd = peer_vm->root.base.bo;
>  
> -		ret = amdgpu_sync_resv(amdgpu_ttm_adev(pd->tbo.bdev),
> +		ret = amdgpu_sync_resv(NULL,
>  					sync, pd->tbo.resv,
>  					AMDGPU_FENCE_OWNER_UNDEFINED, false);
>  		if (ret)
> @@ -2084,8 +2067,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
>  			pr_debug("Memory eviction: Validate BOs failed. Try again\n");
>  			goto validate_map_fail;
>  		}
> -		ret = amdgpu_sync_fence(amdgpu_ttm_adev(bo->tbo.bdev),
> -					&sync_obj, bo->tbo.moving, false);
> +		ret = amdgpu_sync_fence(NULL, &sync_obj, bo->tbo.moving, false);
>  		if (ret) {
>  			pr_debug("Memory eviction: Sync BO fence failed. Try again\n");
>  			goto validate_map_fail;


More information about the amd-gfx mailing list