[PATCH] drm/amdkfd: option to disable system mem limit

philip yang yangp at amd.com
Tue Aug 4 16:57:32 UTC 2020


Ping.

On 2020-07-27 9:24 a.m., Philip Yang wrote:
> If multiple processes share system memory through /dev/shm, KFD memory
> allocation should not fail when it reaches the system memory limit, because
> a single copy of the physical system memory is shared by multiple processes.
>
> Add a module parameter that gives the user the option to disable the system
> memory limit check, so that applications which share memory across multiple
> processes can run. By default the system memory limit is on.
>
> Print a debug message to warn the user if a KFD memory allocation fails
> because of the system memory limit.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h              | 2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 ++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 9 +++++++++
>   3 files changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e97c088d03b3..3c0d5ecfe0d5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -187,9 +187,11 @@ extern int amdgpu_force_asic_type;
>   #ifdef CONFIG_HSA_AMD
>   extern int sched_policy;
>   extern bool debug_evictions;
> +extern bool no_system_mem_limit;
>   #else
>   static const int sched_policy = KFD_SCHED_POLICY_HWS;
>   static const bool debug_evictions; /* = false */
> +static const bool no_system_mem_limit;
>   #endif
>   
>   extern int amdgpu_tmz;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 8703aa1fe4a5..502e8204c012 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -99,7 +99,10 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
>   	mem *= si.mem_unit;
>   
>   	spin_lock_init(&kfd_mem_limit.mem_limit_lock);
> -	kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4);
> +	if (no_system_mem_limit)
> +		kfd_mem_limit.max_system_mem_limit = U64_MAX;
> +	else
> +		kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4);
>   	kfd_mem_limit.max_ttm_mem_limit = (mem >> 1) - (mem >> 3);
>   	pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n",
>   		(kfd_mem_limit.max_system_mem_limit >> 20),
> @@ -148,6 +151,10 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
>   
>   	spin_lock(&kfd_mem_limit.mem_limit_lock);
>   
> +	if (kfd_mem_limit.system_mem_used + system_mem_needed >
> +	    kfd_mem_limit.max_system_mem_limit)
> +		pr_debug("Set no_system_mem_limit if using shared memory\n");
> +
>   	if ((kfd_mem_limit.system_mem_used + system_mem_needed >
>   	     kfd_mem_limit.max_system_mem_limit) ||
>   	    (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 6291f5f0d223..e9acd0a9f327 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -715,6 +715,15 @@ MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption timeout in ms (1
>   bool debug_evictions;
>   module_param(debug_evictions, bool, 0644);
>   MODULE_PARM_DESC(debug_evictions, "enable eviction debug messages (false = default)");
> +
> +/**
> + * DOC: no_system_mem_limit(bool)
> + * Disable system memory limit, to support multiple process shared memory
> + */
> +bool no_system_mem_limit;
> +module_param(no_system_mem_limit, bool, 0644);
> +MODULE_PARM_DESC(no_system_mem_limit, "disable system memory limit (false = default)");
> +
>   #endif
>   
>   /**


More information about the amd-gfx mailing list