[PATCH] drm/amdgpu: Rework memory limits to allow big allocations

Mon Aug 21 20:32:09 UTC 2023

On 2023-08-21 15:20, Rajneesh Bhardwaj wrote:
> Rework the KFD max system memory and ttm limit to allow bigger
> system memory allocations upto 63/64 of the available memory which is
> controlled by ttm module params pages_limit and page_pool_size. Also for
> NPS1 mode, report the max ttm limit as the available VRAM size. For max
> system memory limit, leave 1GB exclusively outside ROCm allocations i.e.
> on 16GB system, >14 GB can be used by ROCm still leaving some memory for
> other system applications and on 128GB systems (e.g. GFXIP 9.4.3 APU in
> NPS1 mode) nearly >120GB can be used by ROCm.
>
> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  5 ++--
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         | 25 +++++++++++++------
>   2 files changed, 21 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 9e18fe5eb190..3387dcdf1bc9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -44,6 +44,7 @@
>    * changes to accumulate
>    */
>   #define AMDGPU_USERPTR_RESTORE_DELAY_MS 1
> +#define ONE_GB			(1UL << 30)
>   
>   /*
>    * Align VRAM availability to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
> @@ -117,11 +118,11 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
>   		return;
>   
>   	si_meminfo(&si);
> -	mem = si.freeram - si.freehigh;
> +	mem = si.totalram - si.totalhigh;
>   	mem *= si.mem_unit;
>   
>   	spin_lock_init(&kfd_mem_limit.mem_limit_lock);
> -	kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4);
> +	kfd_mem_limit.max_system_mem_limit = mem - (mem >> 6) - (ONE_GB);

I believe this is an OK heuristic for large systems and medium-sized 
systems. But it produces a negative number or an underflow for systems 
with very small system memory (about 1.1GB).  It's not practical to run 
ROCm on such a small system, but the code at least needs to be robust 
here and produce something meaningful. E.g.

	kfd_mem_limit.max_system_mem_limit = mem - (mem >> 6);
	if (kfd_mem_limit.max_system_mem_limit < 2 * ONE_GB)
		kfd_mem_limit.max_system_mem_limit <<= 1;
	else
		kfd_mem_limit.max_system_mem_limit -= ONE_GB;

Since this change affects all GPUs and the change below is specific to 
GFXv9.4.3 APUs, I'd separate this into two patches.

>   	kfd_mem_limit.max_ttm_mem_limit = ttm_tt_pages_limit() << PAGE_SHIFT;
>   	pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n",
>   		(kfd_mem_limit.max_system_mem_limit >> 20),
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 8447fcada8bb..4962e35df617 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -25,6 +25,7 @@
>   #include <linux/pci.h>
>   
>   #include <drm/drm_cache.h>
> +#include <drm/ttm/ttm_tt.h>
>   
>   #include "amdgpu.h"
>   #include "gmc_v9_0.h"
> @@ -1877,6 +1878,7 @@ static void
>   gmc_v9_0_init_acpi_mem_ranges(struct amdgpu_device *adev,
>   			      struct amdgpu_mem_partition_info *mem_ranges)
>   {
> +	uint64_t max_ttm_size = ttm_tt_pages_limit() << PAGE_SHIFT;
>   	int num_ranges = 0, ret, mem_groups;
>   	struct amdgpu_numa_info numa_info;
>   	int node_ids[MAX_MEM_RANGES];
> @@ -1913,8 +1915,17 @@ gmc_v9_0_init_acpi_mem_ranges(struct amdgpu_device *adev,
>   
>   	/* If there is only partition, don't use entire size */
>   	if (adev->gmc.num_mem_partitions == 1) {
> -		mem_ranges[0].size = mem_ranges[0].size * (mem_groups - 1);
> -		do_div(mem_ranges[0].size, mem_groups);
> +		if (max_ttm_size > mem_ranges[0].size || max_ttm_size <= 0) {

This gives some weird dis-continuous behaviour. For max_ttm_size > 
mem_ranges[0].size it gives you 3/4. For max_ttm_size == 
mem_ranges[0].size it gives you all the memory.

Also, why is this only applied for num_mem_partitions == 1? The TTM 
limit also applies when there are more memory partitions. Would it make 
more sense to always evenly divide the ttm_tt_pages_limit between all 
the memory partitions? And cap the size at the NUMA node size. I think 
that would eliminate special cases for different memory-partition 
configs and give you sensible behaviour in all cases.

Regards,
   Felix

> +			/* Report VRAM as 3/4th of available numa memory */
> +			mem_ranges[0].size = mem_ranges[0].size * (mem_groups - 1);
> +			do_div(mem_ranges[0].size, mem_groups);
> +		} else {
> +			/* Report VRAM as set by ttm.pages_limit or default ttm
> +			 * limit which is 1/2 of system memory
> +			 */
> +			mem_ranges[0].size = max_ttm_size;
> +		}
> +		pr_debug("NPS1 mode, setting VRAM size = %llu\n", mem_ranges[0].size);
>   	}
>   }
>   
> @@ -2159,6 +2170,11 @@ static int gmc_v9_0_sw_init(void *handle)
>   
>   	amdgpu_gmc_get_vbios_allocations(adev);
>   
> +	/* Memory manager */
> +	r = amdgpu_bo_init(adev);
> +	if (r)
> +		return r;
> +
>   #ifdef HAVE_ACPI_DEV_GET_FIRST_MATCH_DEV
>   	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3)) {
>   		r = gmc_v9_0_init_mem_ranges(adev);
> @@ -2167,11 +2183,6 @@ static int gmc_v9_0_sw_init(void *handle)
>   	}
>   #endif
>   
> -	/* Memory manager */
> -	r = amdgpu_bo_init(adev);
> -	if (r)
> -		return r;
> -
>   	r = gmc_v9_0_gart_init(adev);
>   	if (r)
>   		return r;