[PATCH 13/16] drm/amdgpu: use doorbell manager for kfd process doorbells

Shashank Sharma shashank.sharma at amd.com
Fri Mar 31 08:28:14 UTC 2023


On 30/03/2023 22:54, Alex Deucher wrote:
> On Wed, Mar 29, 2023 at 11:48 AM Shashank Sharma
> <shashank.sharma at amd.com> wrote:
>> This patch:
>> - adds a new doorbell manager object in kfd pdd structure.
>> - allocates doorbells for a process while creating its pdd.
>> - frees the doorbells with pdd destroy.
>> - uses direct doorbell manager API for doorbell indexing.
>> - removes previous calls to allocate process doorbells, as
>>    they are not required anymore.
>>
>> PS: This patch ensures that we don't break the existing KFD
>>      functionality, but the KFD userspace library must now also
>>      move to creating doorbell pages as AMDGPU GEM objects
>>      using libdrm functions in userspace. The reference code
>>      for this is available in the AMDGPU usermode queue
>>      libdrm MR. Once this is done, we will not need this
>>      patch.
>>
>> Cc: Alex Deucher <alexander.deucher at amd.com>
>> Cc: Christian Koenig <christian.koenig at amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 13 ----
>>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 16 ++---
>>   drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c     | 59 +++++++++----------
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  8 +--
>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 26 ++++----
>>   .../amd/amdkfd/kfd_process_queue_manager.c    | 16 ++---
>>   6 files changed, 58 insertions(+), 80 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index 6d291aa6386b..0e40756417e5 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -327,12 +327,6 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>>                  goto err_bind_process;
>>          }
>>
>> -       if (!pdd->doorbell_index &&
>> -           kfd_alloc_process_doorbells(dev, &pdd->doorbell_index) < 0) {
>> -               err = -ENOMEM;
>> -               goto err_alloc_doorbells;
>> -       }
>> -
>>          /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work
>>           * on unmapped queues for usermode queue oversubscription (no aggregated doorbell)
>>           */
>> @@ -410,7 +404,6 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>>          if (wptr_bo)
>>                  amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
>>   err_wptr_map_gart:
>> -err_alloc_doorbells:
>>   err_bind_process:
>>   err_pdd:
>>          mutex_unlock(&p->mutex);
>> @@ -2163,12 +2156,6 @@ static int criu_restore_devices(struct kfd_process *p,
>>                          ret = PTR_ERR(pdd);
>>                          goto exit;
>>                  }
>> -
>> -               if (!pdd->doorbell_index &&
>> -                   kfd_alloc_process_doorbells(pdd->dev, &pdd->doorbell_index) < 0) {
>> -                       ret = -ENOMEM;
>> -                       goto exit;
>> -               }
>>          }
>>
>>          /*
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index ecb4c3abc629..5827db9b18a8 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -362,7 +362,7 @@ static int allocate_doorbell(struct qcm_process_device *qpd,
>>                  /* For CP queues on SOC15 */
>>                  if (restore_id) {
>>                          /* make sure that ID is free  */
>> -                       if (__test_and_set_bit(*restore_id, qpd->doorbell_bitmap))
>> +                       if (__test_and_set_bit(*restore_id, qpd->proc_doorbells.doorbell_bitmap))
>>                                  return -EINVAL;
>>
>>                          q->doorbell_id = *restore_id;
>> @@ -370,20 +370,20 @@ static int allocate_doorbell(struct qcm_process_device *qpd,
>>                          /* or reserve a free doorbell ID */
>>                          unsigned int found;
>>
>> -                       found = find_first_zero_bit(qpd->doorbell_bitmap,
>> -                                               KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
>> +                       found = find_first_zero_bit(qpd->proc_doorbells.doorbell_bitmap,
>> +                                                   KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
>>                          if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) {
>>                                  pr_debug("No doorbells available");
>>                                  return -EBUSY;
>>                          }
>> -                       set_bit(found, qpd->doorbell_bitmap);
>> +                       set_bit(found, qpd->proc_doorbells.doorbell_bitmap);
>>                          q->doorbell_id = found;
>>                  }
>>          }
>>
>> -       q->properties.doorbell_off =
>> -               kfd_get_doorbell_dw_offset_in_bar(dev, qpd_to_pdd(qpd),
>> -                                         q->doorbell_id);
>> +       q->properties.doorbell_off = amdgpu_doorbell_index_on_bar(dev->adev,
>> +                                                                 qpd->proc_doorbells.bo,
>> +                                                                 q->doorbell_id);
>>          return 0;
>>   }
>>
>> @@ -398,7 +398,7 @@ static void deallocate_doorbell(struct qcm_process_device *qpd,
>>              q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>                  return;
>>
>> -       old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap);
>> +       old = test_and_clear_bit(q->doorbell_id, qpd->proc_doorbells.doorbell_bitmap);
>>          WARN_ON(!old);
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
>> index df259f2cc58a..7d29653bff81 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
>> @@ -228,46 +228,41 @@ uint64_t kfd_get_number_elems(struct kfd_dev *kfd)
>>
>>   phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd)
>>   {
>> -       if (!pdd->doorbell_index) {
>> -               int r = kfd_alloc_process_doorbells(pdd->dev,
>> -                                                   &pdd->doorbell_index);
>> -               if (r < 0)
>> -                       return 0;
>> -       }
>> +       struct amdgpu_device *adev = pdd->dev->adev;
>>
>> -       return pdd->dev->doorbell_base +
>> -               pdd->doorbell_index * kfd_doorbell_process_slice(pdd->dev);
>> +       /* Return base of the first doorbell of this process */
>> +       return adev->doorbell.base + pdd->qpd.proc_doorbells.start * sizeof(uint32_t);
>>   }
>>
>> -int kfd_alloc_process_doorbells(struct kfd_dev *kfd, unsigned int *doorbell_index)
>> +int kfd_alloc_process_doorbells(struct kfd_dev *kfd, struct kfd_process_device *pdd)
>>   {
>> -       int r = 0;
>> -
>> -       if (!kfd->shared_resources.enable_mes)
>> -               r = ida_simple_get(&kfd->doorbell_ida, 1,
>> -                                  kfd->max_doorbell_slices, GFP_KERNEL);
>> -       else
>> -               r = amdgpu_mes_alloc_process_doorbells(
>> -                               (struct amdgpu_device *)kfd->adev,
>> -                               doorbell_index);
>> -
>> -       if (r > 0)
>> -               *doorbell_index = r;
>> +       int r;
>> +       struct qcm_process_device *qpd = &pdd->qpd;
>> +       struct amdgpu_doorbell_obj *proc_doorbells = &qpd->proc_doorbells;
>> +
>> +       /* Allocate bitmap for dynamic doorbell allocation */
>> +       proc_doorbells->doorbell_bitmap = bitmap_zalloc(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
>> +                                                       GFP_KERNEL);
>> +       if (!proc_doorbells->doorbell_bitmap) {
>> +               DRM_ERROR("Failed to allocate process doorbell bitmap\n");
>> +               return -ENOMEM;
>> +       }
>>
>> -       if (r < 0)
>> -               pr_err("Failed to allocate process doorbells\n");
>> +       /* Allocate doorbells for this process from the PCI BAR */
>> +       proc_doorbells->size = kfd_doorbell_process_slice(kfd);
>> +       r = amdgpu_doorbell_alloc_page(kfd->adev, proc_doorbells);
> Same thing here as the previous patch.  Just call
> amdgpu_bo_create_kernel(..DOORBELL..) and store the bo in the process
> structure.
>
> Alex

got it,

- Shashank

>
>> +       if (r) {
>> +               DRM_ERROR("Failed to allocate process doorbells\n");
>> +               return r;
>> +       }
>>
>>          return r;
>>   }
>>
>> -void kfd_free_process_doorbells(struct kfd_dev *kfd, unsigned int doorbell_index)
>> +void kfd_free_process_doorbells(struct kfd_dev *kfd, struct kfd_process_device *pdd)
>>   {
>> -       if (doorbell_index) {
>> -               if (!kfd->shared_resources.enable_mes)
>> -                       ida_simple_remove(&kfd->doorbell_ida, doorbell_index);
>> -               else
>> -                       amdgpu_mes_free_process_doorbells(
>> -                                       (struct amdgpu_device *)kfd->adev,
>> -                                       doorbell_index);
>> -       }
>> +       struct amdgpu_doorbell_obj *proc_doorbells = &pdd->qpd.proc_doorbells;
>> +
>> +       bitmap_free(proc_doorbells->doorbell_bitmap);
>> +       amdgpu_doorbell_free_page(kfd->adev, proc_doorbells);
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 0ed33416c35f..c97ed8e7e02d 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -658,8 +658,8 @@ struct qcm_process_device {
>>          uint64_t ib_base;
>>          void *ib_kaddr;
>>
>> -       /* doorbell resources per process per device */
>> -       unsigned long *doorbell_bitmap;
>> +       /* physical doorbell pages */
>> +       struct amdgpu_doorbell_obj proc_doorbells;
>>   };
>>
>>   /* KFD Memory Eviction */
>> @@ -1006,9 +1006,9 @@ unsigned int kfd_get_doorbell_dw_offset_in_bar(struct kfd_dev *kfd,
>>                                          unsigned int doorbell_id);
>>   phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd);
>>   int kfd_alloc_process_doorbells(struct kfd_dev *kfd,
>> -                               unsigned int *doorbell_index);
>> +                                struct kfd_process_device *pdd);
>>   void kfd_free_process_doorbells(struct kfd_dev *kfd,
>> -                               unsigned int doorbell_index);
>> +                                struct kfd_process_device *pdd);
>>   /* GTT Sub-Allocator */
>>
>>   int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 51b1683ac5c1..68d0310c2d53 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -1037,10 +1037,9 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
>>                          free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
>>                                  get_order(KFD_CWSR_TBA_TMA_SIZE));
>>
>> -               bitmap_free(pdd->qpd.doorbell_bitmap);
>>                  idr_destroy(&pdd->alloc_idr);
>>
>> -               kfd_free_process_doorbells(pdd->dev, pdd->doorbell_index);
>> +               kfd_free_process_doorbells(pdd->dev, pdd);
>>
>>                  if (pdd->dev->shared_resources.enable_mes)
>>                          amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev,
>> @@ -1449,15 +1448,11 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd,
>>          unsigned int i;
>>          int range_start = dev->shared_resources.non_cp_doorbells_start;
>>          int range_end = dev->shared_resources.non_cp_doorbells_end;
>> +       struct amdgpu_doorbell_obj *proc_doorbells = &qpd->proc_doorbells;
>>
>>          if (!KFD_IS_SOC15(dev))
>>                  return 0;
>>
>> -       qpd->doorbell_bitmap = bitmap_zalloc(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
>> -                                            GFP_KERNEL);
>> -       if (!qpd->doorbell_bitmap)
>> -               return -ENOMEM;
>> -
>>          /* Mask out doorbells reserved for SDMA, IH, and VCN on SOC15. */
>>          pr_debug("reserved doorbell 0x%03x - 0x%03x\n", range_start, range_end);
>>          pr_debug("reserved doorbell 0x%03x - 0x%03x\n",
>> @@ -1466,9 +1461,9 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd,
>>
>>          for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS / 2; i++) {
>>                  if (i >= range_start && i <= range_end) {
>> -                       __set_bit(i, qpd->doorbell_bitmap);
>> +                       __set_bit(i, proc_doorbells->doorbell_bitmap);
>>                          __set_bit(i + KFD_QUEUE_DOORBELL_MIRROR_OFFSET,
>> -                                 qpd->doorbell_bitmap);
>> +                                 proc_doorbells->doorbell_bitmap);
>>                  }
>>          }
>>
>> @@ -1499,9 +1494,15 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>>          if (!pdd)
>>                  return NULL;
>>
>> +       retval = kfd_alloc_process_doorbells(dev, pdd);
>> +       if (retval) {
>> +               pr_err("failed to allocate process doorbells\n");
>> +               goto err_free_pdd;
>> +       }
>> +
>>          if (init_doorbell_bitmap(&pdd->qpd, dev)) {
>>                  pr_err("Failed to init doorbell for process\n");
>> -               goto err_free_pdd;
>> +               goto err_free_db;
>>          }
>>
>>          pdd->dev = dev;
>> @@ -1529,7 +1530,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>>                                                  false);
>>                  if (retval) {
>>                          pr_err("failed to allocate process context bo\n");
>> -                       goto err_free_pdd;
>> +                       goto err_free_db;
>>                  }
>>                  memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
>>          }
>> @@ -1541,6 +1542,9 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>>
>>          return pdd;
>>
>> +err_free_db:
>> +       kfd_free_process_doorbells(pdd->dev, pdd);
>> +
>>   err_free_pdd:
>>          kfree(pdd);
>>          return NULL;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index 5137476ec18e..693688d789d3 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -348,13 +348,11 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>>                  /* Return the doorbell offset within the doorbell page
>>                   * to the caller so it can be passed up to user mode
>>                   * (in bytes).
>> -                * There are always 1024 doorbells per process, so in case
>> -                * of 8-byte doorbells, there are two doorbell pages per
>> -                * process.
>> +                * relative doorbell index = Absolute doorbell index -
>> +                * absolute index of first doorbell in the page.
>>                   */
>> -               *p_doorbell_offset_in_process =
>> -                       (q->properties.doorbell_off * sizeof(uint32_t)) &
>> -                       (kfd_doorbell_process_slice(dev) - 1);
>> +               *p_doorbell_offset_in_process = (q->properties.doorbell_off
>> +                                               - pdd->qpd.proc_doorbells.start) * sizeof(uint32_t);
>>
>>          pr_debug("PQM After DQM create queue\n");
>>
>> @@ -858,12 +856,6 @@ int kfd_criu_restore_queue(struct kfd_process *p,
>>                  goto exit;
>>          }
>>
>> -       if (!pdd->doorbell_index &&
>> -           kfd_alloc_process_doorbells(pdd->dev, &pdd->doorbell_index) < 0) {
>> -               ret = -ENOMEM;
>> -               goto exit;
>> -       }
>> -
>>          /* data stored in this order: mqd, ctl_stack */
>>          mqd = q_extra_data;
>>          ctl_stack = mqd + q_data->mqd_size;
>> --
>> 2.40.0
>>


More information about the amd-gfx mailing list