[PATCH 2/2] drm/amdkfd: Allow memory oversubscription on small APUs

Felix Kuehling felix.kuehling at amd.com
Fri Apr 26 22:44:45 UTC 2024


On 2024-04-26 04:37, Lang Yu wrote:
> The default ttm_tt_pages_limit is 1/2 of system memory.
> Such a configuration is prone to running out of memory.
Indiscriminately allowing the violation of all memory limits is not a 
good solution. It will lead to poor performance once you actually reach 
ttm_pages_limit and TTM starts swapping out BOs.

Regards,
   Felix


>
> Signed-off-by: Lang Yu <Lang.Yu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c       |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h       |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +++++++++---
>   3 files changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 3295838e9a1d..c01c6f3ab562 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -167,7 +167,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
>   	int i;
>   	int last_valid_bit;
>   
> -	amdgpu_amdkfd_gpuvm_init_mem_limits();
> +	amdgpu_amdkfd_gpuvm_init_mem_limits(adev);
>   
>   	if (adev->kfd.dev) {
>   		struct kgd2kfd_shared_resources gpu_resources = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 1de021ebdd46..13284dbd8c58 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -363,7 +363,7 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id);
>   
>   
>   #if IS_ENABLED(CONFIG_HSA_AMD)
> -void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
> +void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev);
>   void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
>   				struct amdgpu_vm *vm);
>   
> @@ -376,7 +376,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo);
>   void amdgpu_amdkfd_reserve_system_mem(uint64_t size);
>   #else
>   static inline
> -void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
> +void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev)
>   {
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 7eb5afcc4895..a3e623a320b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -60,6 +60,7 @@ static struct {
>   	int64_t system_mem_used;
>   	int64_t ttm_mem_used;
>   	spinlock_t mem_limit_lock;
> +	bool alow_oversubscribe;
>   } kfd_mem_limit;
>   
>   static const char * const domain_bit_to_string[] = {
> @@ -110,7 +111,7 @@ static bool reuse_dmamap(struct amdgpu_device *adev, struct amdgpu_device *bo_ad
>    *  System (TTM + userptr) memory - 15/16th System RAM
>    *  TTM memory - 3/8th System RAM
>    */
> -void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
> +void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev)
>   {
>   	struct sysinfo si;
>   	uint64_t mem;
> @@ -130,6 +131,7 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
>   		kfd_mem_limit.max_system_mem_limit -= AMDGPU_RESERVE_MEM_LIMIT;
>   
>   	kfd_mem_limit.max_ttm_mem_limit = ttm_tt_pages_limit() << PAGE_SHIFT;
> +	kfd_mem_limit.alow_oversubscribe = !!(adev->flags & AMD_IS_APU);
>   	pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n",
>   		(kfd_mem_limit.max_system_mem_limit >> 20),
>   		(kfd_mem_limit.max_ttm_mem_limit >> 20));
> @@ -221,8 +223,12 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
>   	     kfd_mem_limit.max_ttm_mem_limit) ||
>   	    (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
>   	     vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size))) {
> -		ret = -ENOMEM;
> -		goto release;
> +		if (kfd_mem_limit.alow_oversubscribe) {
> +			pr_warn_ratelimited("Memory is getting oversubscried.\n");
> +		} else {
> +			ret = -ENOMEM;
> +			goto release;
> +		}
>   	}
>   
>   	/* Update memory accounting by decreasing available system


More information about the amd-gfx mailing list