[PATCH v2 08/12] drm/amdgpu: use doorbell manager for kfd kernel doorbells
Shashank Sharma
shashank.sharma at amd.com
Sat Apr 22 06:39:23 UTC 2023
Hey Felix,
Thanks for your comments, mine inline.
On 21/04/2023 21:58, Felix Kuehling wrote:
>
> On 2023-04-12 12:25, Shashank Sharma wrote:
>> This patch:
>> - adds a doorbell bo in kfd device structure.
>> - creates doorbell page for kfd kernel usages.
>> - updates the get_kernel_doorbell and free_kernel_doorbell functions
>> accordingly
>>
>> V2: Do not use wrapper API, use direct amdgpu_create_kernel(Alex)
>>
>> Cc: Alex Deucher <alexander.deucher at amd.com>
>> Cc: Christian Koenig <christian.koenig at amd.com>
>> Cc: Felix Kuehling <Felix.Kuehling at amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 -
>> drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 110 ++++++----------------
>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 ++
>> 3 files changed, 36 insertions(+), 82 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index b8936340742b..49e3c4e021af 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -435,8 +435,6 @@ struct kfd_dev *kgd2kfd_probe(struct
>> amdgpu_device *adev, bool vf)
>> atomic_set(&kfd->compute_profile, 0);
>> mutex_init(&kfd->doorbell_mutex);
>> - memset(&kfd->doorbell_available_index, 0,
>> - sizeof(kfd->doorbell_available_index));
>> atomic_set(&kfd->sram_ecc_flag, 0);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
>> index cd4e61bf0493..82b4a56b0afc 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
>> @@ -61,81 +61,39 @@ size_t kfd_doorbell_process_slice(struct kfd_dev
>> *kfd)
>> /* Doorbell calculations for device init. */
>> int kfd_doorbell_init(struct kfd_dev *kfd)
>> {
>> - size_t doorbell_start_offset;
>> - size_t doorbell_aperture_size;
>> - size_t doorbell_process_limit;
>> + int r;
>> + int size = PAGE_SIZE;
>
> On GPUs with 64-bit doorbells, ROCm uses two pages of doorbells per
> process. In hindsight that's probably overkill and we could live with
> a single doorbell page per process in terms of the number of doorbells
> needed. But this is not easy to change due to the way that doorbells
> are routed to SDMA engines, at least on Arcturus and later MI GPUs
> that have lots of SDMA engines and queues. We need enough doorbells in
> each process's doorbell slice to reach all SDMA queues. On Arcturus
> that's up to 64 queues. The way the routing is set up, only 32
> doorbells per page are routed to various SDMA engines, so we need two
> pages.
>
> Changing the doorbell routing is not trivial due to SRIOV support,
> where the routing is programmed by the hypervisor driver. The
> hypervisor driver and all guest drivers (Windows and Linux) have to
> agree on the routing. Changing it breaks the ABI between hypervisor
> and guest drivers.
If I understand correctly, you are suspecting that we are allocating
less doorbells per KFD process, which is the first impression it
creates. But if you observe closely, we have split the total KFD
doorbells into two parts,
- KFD kernel level doorbells: doorbells which are being used for kernel
rings tests, written by kernel doorbell_write calls, saved in struct
kfd->doorbells
size = PAGE_SIZE.
- KFD process level doorbells: doorbell pages which are allocated by
kernel but mapped and written by userspace processes, saved in struct
pdd->qpd->doorbells
size = kfd_doorbell_process_slice.
We realized that we only need 1-2 doorbells for KFD kernel level stuff
(so kept it one page), but need 2-page of doorbells for KFD process, so
they are sized accordingly.
We have also run kfd_test_suit and verified the changes for any
regression. Hope this helps in explaining the design.
- Shashank
>
> Regards,
> Felix
>
>
>> - /*
>> - * With MES enabled, just set the doorbell base as it is needed
>> - * to calculate doorbell physical address.
>> - */
>> - if (kfd->shared_resources.enable_mes) {
>> - kfd->doorbell_base =
>> - kfd->shared_resources.doorbell_physical_address;
>> - return 0;
>> - }
>> -
>> - /*
>> - * We start with calculations in bytes because the input data might
>> - * only be byte-aligned.
>> - * Only after we have done the rounding can we assume any
>> alignment.
>> - */
>> -
>> - doorbell_start_offset =
>> - roundup(kfd->shared_resources.doorbell_start_offset,
>> - kfd_doorbell_process_slice(kfd));
>> -
>> - doorbell_aperture_size =
>> - rounddown(kfd->shared_resources.doorbell_aperture_size,
>> - kfd_doorbell_process_slice(kfd));
>> -
>> - if (doorbell_aperture_size > doorbell_start_offset)
>> - doorbell_process_limit =
>> - (doorbell_aperture_size - doorbell_start_offset) /
>> - kfd_doorbell_process_slice(kfd);
>> - else
>> - return -ENOSPC;
>> -
>> - if (!kfd->max_doorbell_slices ||
>> - doorbell_process_limit < kfd->max_doorbell_slices)
>> - kfd->max_doorbell_slices = doorbell_process_limit;
>> -
>> - kfd->doorbell_base =
>> kfd->shared_resources.doorbell_physical_address +
>> - doorbell_start_offset;
>> -
>> - kfd->doorbell_base_dw_offset = doorbell_start_offset / sizeof(u32);
>> -
>> - kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base,
>> - kfd_doorbell_process_slice(kfd));
>> -
>> - if (!kfd->doorbell_kernel_ptr)
>> + /* Bitmap to dynamically allocate doorbells from kernel page */
>> + kfd->doorbell_bitmap = bitmap_zalloc(size / sizeof(u32),
>> GFP_KERNEL);
>> + if (!kfd->doorbell_bitmap) {
>> + DRM_ERROR("Failed to allocate kernel doorbell bitmap\n");
>> return -ENOMEM;
>> + }
>> - pr_debug("Doorbell initialization:\n");
>> - pr_debug("doorbell base == 0x%08lX\n",
>> - (uintptr_t)kfd->doorbell_base);
>> -
>> - pr_debug("doorbell_base_dw_offset == 0x%08lX\n",
>> - kfd->doorbell_base_dw_offset);
>> -
>> - pr_debug("doorbell_process_limit == 0x%08lX\n",
>> - doorbell_process_limit);
>> -
>> - pr_debug("doorbell_kernel_offset == 0x%08lX\n",
>> - (uintptr_t)kfd->doorbell_base);
>> -
>> - pr_debug("doorbell aperture size == 0x%08lX\n",
>> - kfd->shared_resources.doorbell_aperture_size);
>> -
>> - pr_debug("doorbell kernel address == %p\n",
>> kfd->doorbell_kernel_ptr);
>> + /* Alloc a doorbell page for KFD kernel usages */
>> + r = amdgpu_bo_create_kernel(kfd->adev,
>> + size,
>> + PAGE_SIZE,
>> + AMDGPU_GEM_DOMAIN_DOORBELL,
>> + &kfd->doorbells,
>> + NULL,
>> + (void **)&kfd->doorbell_kernel_ptr);
>> + if (r) {
>> + pr_err("failed to allocate kernel doorbells\n");
>> + bitmap_free(kfd->doorbell_bitmap);
>> + return r;
>> + }
>> + pr_debug("Doorbell kernel address == %p\n",
>> kfd->doorbell_kernel_ptr);
>> return 0;
>> }
>> void kfd_doorbell_fini(struct kfd_dev *kfd)
>> {
>> - if (kfd->doorbell_kernel_ptr)
>> - iounmap(kfd->doorbell_kernel_ptr);
>> + bitmap_free(kfd->doorbell_bitmap);
>> + amdgpu_bo_free_kernel(&kfd->doorbells, NULL,
>> + (void **)&kfd->doorbell_kernel_ptr);
>> }
>> int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process
>> *process,
>> @@ -188,22 +146,15 @@ void __iomem *kfd_get_kernel_doorbell(struct
>> kfd_dev *kfd,
>> u32 inx;
>> mutex_lock(&kfd->doorbell_mutex);
>> - inx = find_first_zero_bit(kfd->doorbell_available_index,
>> - KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
>> + inx = find_first_zero_bit(kfd->doorbell_bitmap, PAGE_SIZE /
>> sizeof(u32));
>> - __set_bit(inx, kfd->doorbell_available_index);
>> + __set_bit(inx, kfd->doorbell_bitmap);
>> mutex_unlock(&kfd->doorbell_mutex);
>> if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
>> return NULL;
>> - inx *= kfd->device_info.doorbell_size / sizeof(u32);
>> -
>> - /*
>> - * Calculating the kernel doorbell offset using the first
>> - * doorbell page.
>> - */
>> - *doorbell_off = kfd->doorbell_base_dw_offset + inx;
>> + *doorbell_off = amdgpu_doorbell_index_on_bar(kfd->adev,
>> kfd->doorbells, inx);
>> pr_debug("Get kernel queue doorbell\n"
>> " doorbell offset == 0x%08X\n"
>> @@ -217,11 +168,10 @@ void kfd_release_kernel_doorbell(struct kfd_dev
>> *kfd, u32 __iomem *db_addr)
>> {
>> unsigned int inx;
>> - inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr)
>> - * sizeof(u32) / kfd->device_info.doorbell_size;
>> + inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr);
>> mutex_lock(&kfd->doorbell_mutex);
>> - __clear_bit(inx, kfd->doorbell_available_index);
>> + __clear_bit(inx, kfd->doorbell_bitmap);
>> mutex_unlock(&kfd->doorbell_mutex);
>> }
>> @@ -280,7 +230,7 @@ phys_addr_t kfd_get_process_doorbells(struct
>> kfd_process_device *pdd)
>> if (!pdd->doorbell_index) {
>> int r = kfd_alloc_process_doorbells(pdd->dev,
>> &pdd->doorbell_index);
>> - if (r)
>> + if (r < 0)
>> return 0;
>> }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 552c3ac85a13..0b199eb6dc88 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -346,6 +346,12 @@ struct kfd_dev {
>> /* HMM page migration MEMORY_DEVICE_PRIVATE mapping */
>> struct dev_pagemap pgmap;
>> +
>> + /* Kernel doorbells for KFD device */
>> + struct amdgpu_bo *doorbells;
>> +
>> + /* bitmap for dynamic doorbell allocation from this object */
>> + unsigned long *doorbell_bitmap;
>> };
>> enum kfd_mempool {
More information about the amd-gfx
mailing list