[PATCH v3 09/12] drm/amdgpu: use doorbell manager for kfd process doorbells

Felix Kuehling felix.kuehling at amd.com
Thu Jul 13 19:05:07 UTC 2023


On 2023-06-20 13:16, Shashank Sharma wrote:
> This patch:
> - adds a doorbell object in kfd pdd structure.
> - allocates doorbells for a process while creating its queue.
> - frees the doorbells with pdd destroy.
> - moves doorbell bitmap init function to kfd_doorbell.c
>
> PS: This patch ensures that we don't break the existing KFD
>      functionality, but now KFD userspace library should also
>      create doorbell pages as AMDGPU GEM objects using libdrm
>      functions in userspace. The reference code for the same
>      is available with AMDGPU Usermode queue libdrm MR. Once
>      this is done, we will not need to create process doorbells
>      in kernel.
>
> V2: - Do not use doorbell wrapper API, use amdgpu_bo_create_kernel
>        instead (Alex).
>      - Do not use custom doorbell structure, instead use separate
>        variables for bo and doorbell_bitmap (Alex)
> V3:
>     - Do not allocate doorbell page with PDD, delay doorbell process
>       page allocation until really needed (Felix)
>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Cc: Christian Koenig <christian.koenig at amd.com>
> Cc: Felix Kuehling <Felix.Kuehling at amd.com>
> Acked-by: Christian König <christian.koenig at amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  20 ++--
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c |   8 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c     | 103 +++++++++++++-----
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   9 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |  40 +------
>   .../amd/amdkfd/kfd_process_queue_manager.c    |  23 ++--
>   6 files changed, 108 insertions(+), 95 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 1b54a9aaae70..5d4f4fca793a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -327,10 +327,12 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>   		goto err_bind_process;
>   	}
>   
> -	if (!pdd->doorbell_index &&
> -	    kfd_alloc_process_doorbells(dev, &pdd->doorbell_index) < 0) {
> -		err = -ENOMEM;
> -		goto err_alloc_doorbells;
> +	if (!pdd->qpd.proc_doorbells) {
> +		err = kfd_alloc_process_doorbells(dev, pdd);
> +		if (err) {
> +			pr_debug("failed to allocate process doorbells\n");
> +			goto err_bind_process;
> +		}
>   	}
>   
>   	/* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work
> @@ -410,7 +412,6 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
>   	if (wptr_bo)
>   		amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
>   err_wptr_map_gart:
> -err_alloc_doorbells:
>   err_bind_process:
>   err_pdd:
>   	mutex_unlock(&p->mutex);
> @@ -2239,11 +2240,12 @@ static int criu_restore_devices(struct kfd_process *p,
>   			goto exit;
>   		}
>   
> -		if (!pdd->doorbell_index &&
> -		    kfd_alloc_process_doorbells(pdd->dev, &pdd->doorbell_index) < 0) {
> -			ret = -ENOMEM;
> -			goto exit;
> +		if (!pdd->qpd.proc_doorbells) {
> +			ret = kfd_alloc_process_doorbells(dev, pdd);
> +			if (ret)
> +				goto exit;
>   		}
> +
>   	}
>   
>   	/*
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 7a95698d83f7..834f640cf807 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -371,7 +371,7 @@ static int allocate_doorbell(struct qcm_process_device *qpd,
>   			unsigned int found;
>   
>   			found = find_first_zero_bit(qpd->doorbell_bitmap,
> -						KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
> +						    KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
>   			if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) {
>   				pr_debug("No doorbells available");
>   				return -EBUSY;
> @@ -381,9 +381,9 @@ static int allocate_doorbell(struct qcm_process_device *qpd,
>   		}
>   	}
>   
> -	q->properties.doorbell_off =
> -		kfd_get_doorbell_dw_offset_in_bar(dev, qpd_to_pdd(qpd),
> -					  q->doorbell_id);
> +	q->properties.doorbell_off = amdgpu_doorbell_index_on_bar(dev->adev,
> +								  qpd->proc_doorbells,
> +								  q->doorbell_id);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
> index f7d45057ed32..c9ca21e1a99a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
> @@ -232,48 +232,97 @@ uint64_t kfd_get_number_elems(struct kfd_dev *kfd)
>   
>   }
>   
> +static int init_doorbell_bitmap(struct qcm_process_device *qpd,
> +				struct kfd_dev *dev)
> +{
> +	unsigned int i;
> +	int range_start = dev->shared_resources.non_cp_doorbells_start;
> +	int range_end = dev->shared_resources.non_cp_doorbells_end;
> +
> +	if (!KFD_IS_SOC15(dev))
> +		return 0;
> +
> +	/* Mask out doorbells reserved for SDMA, IH, and VCN on SOC15. */
> +	pr_debug("reserved doorbell 0x%03x - 0x%03x\n", range_start, range_end);
> +	pr_debug("reserved doorbell 0x%03x - 0x%03x\n",
> +			range_start + KFD_QUEUE_DOORBELL_MIRROR_OFFSET,
> +			range_end + KFD_QUEUE_DOORBELL_MIRROR_OFFSET);
> +
> +	for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS / 2; i++) {
> +		if (i >= range_start && i <= range_end) {
> +			__set_bit(i, qpd->doorbell_bitmap);
> +			__set_bit(i + KFD_QUEUE_DOORBELL_MIRROR_OFFSET,
> +				  qpd->doorbell_bitmap);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>   phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd)
>   {
> -	if (!pdd->doorbell_index) {
> -		int r = kfd_alloc_process_doorbells(pdd->dev,
> -						    &pdd->doorbell_index);
> -		if (r < 0)
> +	struct amdgpu_device *adev = pdd->dev->adev;
> +	uint32_t first_db_index;
> +
> +	if (!pdd->qpd.proc_doorbells) {
> +		if (kfd_alloc_process_doorbells(pdd->dev, pdd))
> +			/* phys_addr_t 0 is error */
>   			return 0;
>   	}
>   
> -	return pdd->dev->doorbell_base +
> -		pdd->doorbell_index * kfd_doorbell_process_slice(pdd->dev);
> +	first_db_index = amdgpu_doorbell_index_on_bar(adev, pdd->qpd.proc_doorbells, 0);
> +	return adev->doorbell.base + first_db_index * sizeof(uint32_t);
>   }
>   
> -int kfd_alloc_process_doorbells(struct kfd_dev *kfd, unsigned int *doorbell_index)
> +int kfd_alloc_process_doorbells(struct kfd_dev *kfd, struct kfd_process_device *pdd)
>   {
> -	int r = 0;
> +	int r;
> +	struct qcm_process_device *qpd = &pdd->qpd;
>   
> -	if (!kfd->shared_resources.enable_mes)
> -		r = ida_simple_get(&kfd->doorbell_ida, 1,
> -				   kfd->max_doorbell_slices, GFP_KERNEL);
> -	else
> -		r = amdgpu_mes_alloc_process_doorbells(
> -				(struct amdgpu_device *)kfd->adev,
> -				doorbell_index);
> +	/* Allocate bitmap for dynamic doorbell allocation */
> +	qpd->doorbell_bitmap = bitmap_zalloc(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
> +					     GFP_KERNEL);
> +	if (!qpd->doorbell_bitmap) {
> +		DRM_ERROR("Failed to allocate process doorbell bitmap\n");
> +		return -ENOMEM;
> +	}
>   
> -	if (r > 0)
> -		*doorbell_index = r;
> +	r = init_doorbell_bitmap(&pdd->qpd, kfd);
> +	if (r) {
> +		DRM_ERROR("Failed to initialize process doorbells\n");
> +		r = -ENOMEM;
> +		goto err;
> +	}
> +
> +	/* Allocate doorbells for this process */
> +	r = amdgpu_bo_create_kernel(kfd->adev,
> +				    kfd_doorbell_process_slice(kfd),
> +				    PAGE_SIZE,
> +				    AMDGPU_GEM_DOMAIN_DOORBELL,
> +				    &qpd->proc_doorbells,
> +				    NULL,
> +				    NULL);
> +	if (r) {
> +		DRM_ERROR("Failed to allocate process doorbells\n");
> +		goto err;
> +	}
>   
> -	if (r < 0)
> -		pr_err("Failed to allocate process doorbells\n");
> +	return 0;
>   
> +err:
> +	bitmap_free(qpd->doorbell_bitmap);
> +	qpd->doorbell_bitmap = NULL;
>   	return r;
>   }
>   
> -void kfd_free_process_doorbells(struct kfd_dev *kfd, unsigned int doorbell_index)
> +void kfd_free_process_doorbells(struct kfd_dev *kfd, struct kfd_process_device *pdd)
>   {
> -	if (doorbell_index) {
> -		if (!kfd->shared_resources.enable_mes)
> -			ida_simple_remove(&kfd->doorbell_ida, doorbell_index);
> -		else
> -			amdgpu_mes_free_process_doorbells(
> -					(struct amdgpu_device *)kfd->adev,
> -					doorbell_index);
> +	struct qcm_process_device *qpd = &pdd->qpd;
> +
> +	if (qpd->doorbell_bitmap) {
> +		bitmap_free(qpd->doorbell_bitmap);
> +		qpd->doorbell_bitmap = NULL;
>   	}
> +
> +	amdgpu_bo_free_kernel(&qpd->proc_doorbells, NULL, NULL);
>   }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 42c215782d72..d0abfaccf8c1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -664,7 +664,10 @@ struct qcm_process_device {
>   	uint64_t ib_base;
>   	void *ib_kaddr;
>   
> -	/* doorbell resources per process per device */
> +	/* doorbells for kfd process */
> +	struct amdgpu_bo *proc_doorbells;
> +
> +	/* bitmap for dynamic doorbell allocation from the bo */
>   	unsigned long *doorbell_bitmap;
>   };
>   
> @@ -1013,9 +1016,9 @@ unsigned int kfd_get_doorbell_dw_offset_in_bar(struct kfd_dev *kfd,
>   					unsigned int doorbell_id);
>   phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd);
>   int kfd_alloc_process_doorbells(struct kfd_dev *kfd,
> -				unsigned int *doorbell_index);
> +				 struct kfd_process_device *pdd);
>   void kfd_free_process_doorbells(struct kfd_dev *kfd,
> -				unsigned int doorbell_index);
> +				 struct kfd_process_device *pdd);
>   /* GTT Sub-Allocator */
>   
>   int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 07a9eaf9b7d8..bb1281a4feef 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1037,10 +1037,9 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
>   			free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
>   				get_order(KFD_CWSR_TBA_TMA_SIZE));
>   
> -		bitmap_free(pdd->qpd.doorbell_bitmap);
>   		idr_destroy(&pdd->alloc_idr);
>   
> -		kfd_free_process_doorbells(pdd->dev, pdd->doorbell_index);
> +		kfd_free_process_doorbells(pdd->dev, pdd);
>   
>   		if (pdd->dev->shared_resources.enable_mes)
>   			amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev,
> @@ -1496,38 +1495,6 @@ static struct kfd_process *create_process(const struct task_struct *thread)
>   	return ERR_PTR(err);
>   }
>   
> -static int init_doorbell_bitmap(struct qcm_process_device *qpd,
> -			struct kfd_dev *dev)
> -{
> -	unsigned int i;
> -	int range_start = dev->shared_resources.non_cp_doorbells_start;
> -	int range_end = dev->shared_resources.non_cp_doorbells_end;
> -
> -	if (!KFD_IS_SOC15(dev))
> -		return 0;
> -
> -	qpd->doorbell_bitmap = bitmap_zalloc(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
> -					     GFP_KERNEL);
> -	if (!qpd->doorbell_bitmap)
> -		return -ENOMEM;
> -
> -	/* Mask out doorbells reserved for SDMA, IH, and VCN on SOC15. */
> -	pr_debug("reserved doorbell 0x%03x - 0x%03x\n", range_start, range_end);
> -	pr_debug("reserved doorbell 0x%03x - 0x%03x\n",
> -			range_start + KFD_QUEUE_DOORBELL_MIRROR_OFFSET,
> -			range_end + KFD_QUEUE_DOORBELL_MIRROR_OFFSET);
> -
> -	for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS / 2; i++) {
> -		if (i >= range_start && i <= range_end) {
> -			__set_bit(i, qpd->doorbell_bitmap);
> -			__set_bit(i + KFD_QUEUE_DOORBELL_MIRROR_OFFSET,
> -				  qpd->doorbell_bitmap);
> -		}
> -	}
> -
> -	return 0;
> -}
> -
>   struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
>   							struct kfd_process *p)
>   {
> @@ -1552,11 +1519,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>   	if (!pdd)
>   		return NULL;
>   
> -	if (init_doorbell_bitmap(&pdd->qpd, dev)) {
> -		pr_err("Failed to init doorbell for process\n");
> -		goto err_free_pdd;
> -	}
> -
>   	pdd->dev = dev;
>   	INIT_LIST_HEAD(&pdd->qpd.queues_list);
>   	INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 4236539d9f93..5eff74015d51 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -344,17 +344,20 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>   		goto err_create_queue;
>   	}
>   
> -	if (q && p_doorbell_offset_in_process)
> +	if (q && p_doorbell_offset_in_process) {
>   		/* Return the doorbell offset within the doorbell page
>   		 * to the caller so it can be passed up to user mode
>   		 * (in bytes).
> -		 * There are always 1024 doorbells per process, so in case
> -		 * of 8-byte doorbells, there are two doorbell pages per
> -		 * process.
> +		 * relative doorbell index = Absolute doorbell index -
> +		 * absolute index of first doorbell in the page.
>   		 */
> -		*p_doorbell_offset_in_process =
> -			(q->properties.doorbell_off * sizeof(uint32_t)) &
> -			(kfd_doorbell_process_slice(dev) - 1);
> +		uint32_t first_db_index = amdgpu_doorbell_index_on_bar(pdd->dev->adev,
> +								       pdd->qpd.proc_doorbells,
> +								       0);
> +
> +		*p_doorbell_offset_in_process = (q->properties.doorbell_off
> +						- first_db_index) * sizeof(uint32_t);
> +	}
>   
>   	pr_debug("PQM After DQM create queue\n");
>   
> @@ -858,12 +861,6 @@ int kfd_criu_restore_queue(struct kfd_process *p,
>   		goto exit;
>   	}
>   
> -	if (!pdd->doorbell_index &&
> -	    kfd_alloc_process_doorbells(pdd->dev, &pdd->doorbell_index) < 0) {
> -		ret = -ENOMEM;
> -		goto exit;
> -	}
> -
>   	/* data stored in this order: mqd, ctl_stack */
>   	mqd = q_extra_data;
>   	ctl_stack = mqd + q_data->mqd_size;


More information about the amd-gfx mailing list