[PATCH v2] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs
Felix Kuehling
felix.kuehling at amd.com
Tue Apr 30 19:56:16 UTC 2024
On 2024-04-30 6:08, Lang Yu wrote:
> Small APUs (i.e., consumer and embedded products) usually have a small
> carveout device memory which can't satisfy the memory allocation
> requirements of most compute workloads.
>
> We can't even run the basic MNIST example with a default 512MB carveout:
> https://github.com/pytorch/examples/tree/main/mnist
> Error log when running MNIST:
> "torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate
> 84.00 MiB. GPU 0 has a total capacity of 512.00 MiB of which 0 bytes
> is free. Of the allocated memory 103.83 MiB is allocated by PyTorch,
> and 22.17 MiB is reserved by PyTorch but unallocated"
>
> Though we can change BIOS settings to enlarge the carveout size,
> that is inflexible and may bring complaints. In addition, with a carveout
> the memory resource can't be effectively shared between host and device.
>
> The solution is the MI300A approach, i.e., let VRAM allocations go to GTT.
> Then device and host can effectively share system memory.
>
> v2: Report local_mem_size_private as 0. (Felix)
>
> Signed-off-by: Lang Yu <Lang.Yu at amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 +++++
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 20 ++++++++++---------
> drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 ++++--
> drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 3 ++-
> 5 files changed, 23 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 7ba05f030dd1..e3738d417245 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -455,6 +455,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device *adev,
> else
> mem_info->local_mem_size_private =
> KFD_XCP_MEMORY_SIZE(adev, xcp->id);
> + } else if (adev->flags & AMD_IS_APU) {
> + mem_info->local_mem_size_public = (ttm_tt_pages_limit() << PAGE_SHIFT);
> + mem_info->local_mem_size_private = 0;
> } else {
> mem_info->local_mem_size_public = adev->gmc.visible_vram_size;
> mem_info->local_mem_size_private = adev->gmc.real_vram_size -
> @@ -824,6 +827,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id)
> }
> do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
> return ALIGN_DOWN(tmp, PAGE_SIZE);
> + } else if (adev->flags & AMD_IS_APU) {
> + return (ttm_tt_pages_limit() << PAGE_SHIFT);
> } else {
> return adev->gmc.real_vram_size;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 4bdf59213384..5843c3d35cb9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
> return -EINVAL;
>
> vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
> - if (adev->gmc.is_app_apu) {
> + if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
> system_mem_needed = size;
> ttm_mem_needed = size;
> }
> @@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
> "adev reference can't be null when vram is used");
> if (adev && xcp_id >= 0) {
> adev->kfd.vram_used[xcp_id] += vram_needed;
> - adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ?
> + adev->kfd.vram_used_aligned[xcp_id] +=
> + (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ?
> vram_needed :
> ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
> }
> @@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
>
> if (adev) {
> adev->kfd.vram_used[xcp_id] -= size;
> - if (adev->gmc.is_app_apu) {
> + if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
> adev->kfd.vram_used_aligned[xcp_id] -= size;
> kfd_mem_limit.system_mem_used -= size;
> kfd_mem_limit.ttm_mem_used -= size;
> @@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,
> * if peer device has large BAR. In contrast, access over xGMI is
> * allowed for both small and large BAR configurations of peer device
> */
> - if ((adev != bo_adev && !adev->gmc.is_app_apu) &&
> + if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)) &&
> ((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
> (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
> (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
> @@ -1674,7 +1675,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
> - atomic64_read(&adev->vram_pin_size)
> - reserved_for_pt;
>
> - if (adev->gmc.is_app_apu) {
> + if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
> system_mem_available = no_system_mem_limit ?
> kfd_mem_limit.max_system_mem_limit :
> kfd_mem_limit.max_system_mem_limit -
> @@ -1722,7 +1723,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
> if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
> domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM;
>
> - if (adev->gmc.is_app_apu) {
> + if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
> domain = AMDGPU_GEM_DOMAIN_GTT;
> alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
> alloc_flags = 0;
> @@ -1973,7 +1974,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
> if (size) {
> if (!is_imported &&
> (mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM ||
> - (adev->gmc.is_app_apu &&
> + ((adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) &&
> mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT)))
> *size = bo_size;
> else
> @@ -2395,8 +2396,9 @@ static int import_obj_create(struct amdgpu_device *adev,
> (*mem)->dmabuf = dma_buf;
> (*mem)->bo = bo;
> (*mem)->va = va;
> - (*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) && !adev->gmc.is_app_apu ?
> - AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
> + (*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) &&
> + !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ?
> + AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
>
> (*mem)->mapped_to_gpu_memory = 0;
> (*mem)->process_info = avm->process_info;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> index 4bcfbeac48fb..4816fcb9803a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> @@ -1023,7 +1023,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
> if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1))
> return -EINVAL;
>
> - if (adev->gmc.is_app_apu)
> + if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)
> return 0;
>
> pgmap = &kfddev->pgmap;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 386875e6eb96..069b81eeea03 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -2619,7 +2619,8 @@ svm_range_best_restore_location(struct svm_range *prange,
> return -1;
> }
>
> - if (node->adev->gmc.is_app_apu)
> + if (node->adev->gmc.is_app_apu ||
> + node->adev->flags & AMD_IS_APU)
> return 0;
>
> if (prange->preferred_loc == gpuid ||
> @@ -3337,7 +3338,8 @@ svm_range_best_prefetch_location(struct svm_range *prange)
> goto out;
> }
>
> - if (bo_node->adev->gmc.is_app_apu) {
> + if (bo_node->adev->gmc.is_app_apu ||
> + bo_node->adev->flags & AMD_IS_APU) {
> best_loc = 0;
> goto out;
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 026863a0abcd..9c37bd0567ef 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -201,7 +201,8 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s
> * is initialized to not 0 when page migration register device memory.
> */
> #define KFD_IS_SVM_API_SUPPORTED(adev) ((adev)->kfd.pgmap.type != 0 ||\
> - (adev)->gmc.is_app_apu)
> + (adev)->gmc.is_app_apu ||\
> + ((adev)->flags & AMD_IS_APU))
>
> void svm_range_bo_unref_async(struct svm_range_bo *svm_bo);
>
More information about the amd-gfx
mailing list