[PATCH 1/2] drm/amdgpu: Update BO memory accounting to rely on allocation flag

Felix Kuehling felix.kuehling at amd.com
Thu Nov 11 13:43:41 UTC 2021


Am 2021-11-09 um 1:13 a.m. schrieb Ramesh Errabolu:
> Accounting system to track amount of available memory (system, TTM
> and VRAM of a device) relies on BO's domain. The change is to rely
> instead on allocation flag indicating BO type - VRAM, GTT, USERPTR,
> MMIO or DOORBELL
>
> Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu at amd.com>

The code changes look good. Comments about comments inline ...


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  16 +++
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 101 +++++++++++-------
>  2 files changed, 79 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 5f658823a637..8d31a742cd80 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -307,7 +307,23 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
>  void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
>  void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
>  				struct amdgpu_vm *vm);
> +
> +/**
> + * @amdgpu_amdkfd_release_notify() - Invoked when GEM object reference count
> + * reaches ZERO. Increases available memory by size of buffer including any
> + * reserved for control structures

"Increases available memory size ..." is an implementation detail that
doesn't matter to the callers of this function. It should not be part of
the interface definition. The interface description should be more
general, maybe:

* amdgpu_amdkfd_release_notify() - Notify KFD when GEM object is released
*
* Allows KFD to release its resources associated with the GEM object.
* ...


> + *
> + * @note: This api must be invoked on BOs that have been allocated via
> + * KFD interface amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu()
> + */
>  void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo);
> +
> +/**
> + * @amdgpu_amdkfd_reserve_system_mem - Decrease system memory that is
> + * available by an amount specified by input parameter

This is misleading. This function doesn't change availability of system
memory in general because it doesn't allocate any memory. You'll need to
be more specific:

* amdgpu_amdkfd_reserve_system_mem() - Decrease system memory limit for KFD applications


> + *
> + * @size: Size of buffer in bytes

What buffer?


> + */
>  void amdgpu_amdkfd_reserve_system_mem(uint64_t size);
>  #else
>  static inline
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 94fccf0b47ad..08675f89bfb2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -120,8 +120,22 @@ static size_t amdgpu_amdkfd_acc_size(uint64_t size)
>  		PAGE_ALIGN(size);
>  }
>  
> +/**
> + * @amdgpu_amdkfd_reserve_mem_limit() - Decrease available memory by size
> + * of buffer including any reserved for control structures
> + *
> + * @adev: Device to which allocated BO belongs to
> + * @size: Size of buffer, in bytes, encapsulated by B0. This should be
> + * equivalent to amdgpu_bo_size(BO)
> + * @alloc_flag: Flag used in allocating a BO as noted above
> + *
> + * @note: This api must be invoked on BOs that have been allocated via
> + * KFD interface amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu()

Who needs to call it? Your statement sounds like callers of
amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu must call this function as well.
This is very misleading because this function is already called from
amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu.


> + *
> + * Return: returns -ENOMEM in case of error, ZERO otherwise
> + */
>  static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
> -		uint64_t size, u32 domain, bool sg)
> +		uint64_t size, u32 alloc_flag)
>  {
>  	uint64_t reserved_for_pt =
>  		ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
> @@ -131,20 +145,24 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
>  	acc_size = amdgpu_amdkfd_acc_size(size);
>  
>  	vram_needed = 0;
> -	if (domain == AMDGPU_GEM_DOMAIN_GTT) {
> -		/* TTM GTT memory */
> +	if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
>  		system_mem_needed = acc_size + size;
>  		ttm_mem_needed = acc_size + size;
> -	} else if (domain == AMDGPU_GEM_DOMAIN_CPU && !sg) {
> -		/* Userptr */
> +	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
> +		system_mem_needed = acc_size;
> +		ttm_mem_needed = acc_size;
> +		vram_needed = size;
> +	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
>  		system_mem_needed = acc_size + size;
>  		ttm_mem_needed = acc_size;
> -	} else {
> -		/* VRAM and SG */
> +	} else if (alloc_flag &
> +		   (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
> +		    KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
>  		system_mem_needed = acc_size;
>  		ttm_mem_needed = acc_size;
> -		if (domain == AMDGPU_GEM_DOMAIN_VRAM)
> -			vram_needed = size;
> +	} else {
> +		pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
> +		return -ENOMEM;
>  	}
>  
>  	spin_lock(&kfd_mem_limit.mem_limit_lock);
> @@ -160,64 +178,71 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
>  	    (adev->kfd.vram_used + vram_needed >
>  	     adev->gmc.real_vram_size - reserved_for_pt)) {
>  		ret = -ENOMEM;
> -	} else {
> -		kfd_mem_limit.system_mem_used += system_mem_needed;
> -		kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
> -		adev->kfd.vram_used += vram_needed;
> +		goto release;
>  	}
>  
> +	/* Update memory accounting by decreasing available system
> +	 * memory, TTM memory and GPU memory as computed above
> +	 */
> +	adev->kfd.vram_used += vram_needed;
> +	kfd_mem_limit.system_mem_used += system_mem_needed;
> +	kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
> +
> +release:
>  	spin_unlock(&kfd_mem_limit.mem_limit_lock);
>  	return ret;
>  }
>  
>  static void unreserve_mem_limit(struct amdgpu_device *adev,
> -		uint64_t size, u32 domain, bool sg)
> +		uint64_t size, u32 alloc_flag)
>  {
>  	size_t acc_size;
>  
>  	acc_size = amdgpu_amdkfd_acc_size(size);
>  
>  	spin_lock(&kfd_mem_limit.mem_limit_lock);
> -	if (domain == AMDGPU_GEM_DOMAIN_GTT) {
> +
> +	if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
>  		kfd_mem_limit.system_mem_used -= (acc_size + size);
>  		kfd_mem_limit.ttm_mem_used -= (acc_size + size);
> -	} else if (domain == AMDGPU_GEM_DOMAIN_CPU && !sg) {
> +	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
> +		kfd_mem_limit.system_mem_used -= acc_size;
> +		kfd_mem_limit.ttm_mem_used -= acc_size;
> +		adev->kfd.vram_used -= size;
> +	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
>  		kfd_mem_limit.system_mem_used -= (acc_size + size);
>  		kfd_mem_limit.ttm_mem_used -= acc_size;
> -	} else {
> +	} else if (alloc_flag &
> +		   (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
> +		    KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
>  		kfd_mem_limit.system_mem_used -= acc_size;
>  		kfd_mem_limit.ttm_mem_used -= acc_size;
> -		if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
> -			adev->kfd.vram_used -= size;
> -			WARN_ONCE(adev->kfd.vram_used < 0,
> -				  "kfd VRAM memory accounting unbalanced");
> -		}
> +	} else {
> +		pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
> +		goto release;
>  	}
> -	WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
> -		  "kfd system memory accounting unbalanced");
> +
> +	/* Alert user if memory accounting is not per expectation */

This comment is obvious and unnecessary, and also not even correct.
These WARN messages are not for the user because the user did not cause
them and can do nothing to avoid them. These messages point out bugs
elsewhere in the driver. So they are for engineers.

Regards,
  Felix


> +	WARN_ONCE(adev->kfd.vram_used < 0,
> +		  "KFD VRAM memory accounting unbalanced");
>  	WARN_ONCE(kfd_mem_limit.ttm_mem_used < 0,
> -		  "kfd TTM memory accounting unbalanced");
> +		  "KFD TTM memory accounting unbalanced");
> +	WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
> +		  "KFD system memory accounting unbalanced");
>  
> +release:
>  	spin_unlock(&kfd_mem_limit.mem_limit_lock);
>  }
>  
>  void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
>  {
>  	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> -	u32 domain = bo->preferred_domains;
> -	bool sg = (bo->preferred_domains == AMDGPU_GEM_DOMAIN_CPU);
> -
> -	if (bo->flags & AMDGPU_AMDKFD_CREATE_USERPTR_BO) {
> -		domain = AMDGPU_GEM_DOMAIN_CPU;
> -		sg = false;
> -	}
> -
> -	unreserve_mem_limit(adev, amdgpu_bo_size(bo), domain, sg);
> +	u32 alloc_flags = bo->kfd_bo->alloc_flags;
> +	u64 size = amdgpu_bo_size(bo);
>  
> -	kfree(bo->kfd_bo);
> +	unreserve_mem_limit(adev, size, alloc_flags);
>  }
>  
> -
>  /* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence from BO's
>   *  reservation object.
>   *
> @@ -1452,7 +1477,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>  
>  	amdgpu_sync_create(&(*mem)->sync);
>  
> -	ret = amdgpu_amdkfd_reserve_mem_limit(adev, size, alloc_domain, !!sg);
> +	ret = amdgpu_amdkfd_reserve_mem_limit(adev, size, flags);
>  	if (ret) {
>  		pr_debug("Insufficient memory\n");
>  		goto err_reserve_limit;
> @@ -1508,7 +1533,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>  	/* Don't unreserve system mem limit twice */
>  	goto err_reserve_limit;
>  err_bo_create:
> -	unreserve_mem_limit(adev, size, alloc_domain, !!sg);
> +	unreserve_mem_limit(adev, size, flags);
>  err_reserve_limit:
>  	mutex_destroy(&(*mem)->lock);
>  	kfree(*mem);


More information about the amd-gfx mailing list