[PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly

Lazar, Lijo lijo.lazar at amd.com
Mon Aug 23 07:08:32 UTC 2021



On 8/20/2021 11:02 AM, Joseph Greathouse wrote:
> Give every process at most one queue from each SDMA engine.
> Previously, we allocated all SDMA engines and queues on a first-
> come-first-serve basis. This meant that it was possible for two
> processes racing on their allocation requests to each end up with
> two queues on the same SDMA engine. That could serialize the
> performance of commands to different queues.
> 
> This new mechanism allows each process at most a single queue
> on each SDMA engine. Processes will check for queue availability on
> SDMA engine 0, then 1, then 2, etc. and only allocate on that
> engine if there is an available queue slot. If there are no
> queue slots available (or if this process has already allocated
> a queue on this engine), it moves on and tries the next engine.
> 
> The Aldebaran chip has a small quirk that SDMA0 should only be
> used by the very first allocation from each process. It is OK to
> use any other SDMA engine at any time, but after the first SDMA
> allocation request from a process, the resulting engine must
> be from SDMA1 or above. This patch handles this case as well.
> 
> Signed-off-by: Joseph Greathouse <Joseph.Greathouse at amd.com>
> ---
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
>   3 files changed, 107 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index f8fce9d05f50..86bdb765f350 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   static int map_queues_cpsch(struct device_queue_manager *dqm);
>   
>   static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>   				struct queue *q);
>   
>   static inline void deallocate_hqd(struct device_queue_manager *dqm,
>   				struct queue *q);
>   static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
>   static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>   				struct queue *q);
>   static void kfd_process_hw_exception(struct work_struct *work);
>   
> @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   			q->pipe, q->queue);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -		retval = allocate_sdma_queue(dqm, q);
> +		retval = allocate_sdma_queue(dqm, qpd, q);
>   		if (retval)
>   			goto deallocate_vmid;
>   		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   		deallocate_hqd(dqm, q);
>   	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>   deallocate_vmid:
>   	if (list_empty(&qpd->queues_list))
>   		deallocate_vmid(dqm, qpd, q);
> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>   	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
>   		deallocate_hqd(dqm, q);
>   	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>   	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>   	else {
>   		pr_debug("q->properties.type %d is invalid\n",
>   				q->properties.type);
> @@ -1039,42 +1041,93 @@ static void pre_reset(struct device_queue_manager *dqm)
>   	dqm_unlock(dqm);
>   }
>   
> +static uint64_t sdma_engine_mask(int engine, int num_engines)

This looks more like the queue mask for an engine; the name doesn't
make that clear.

> +{
> +	uint64_t mask = 0;
> +
> +	engine %= num_engines;
> +	while (engine < (sizeof(uint64_t)*8)) {
> +		mask |= 1ULL << engine;
> +		engine += num_engines;
> +	}
> +	return mask;
> +}
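
For instance, with num_engines = 2 this gives

    sdma_engine_mask(0, 2) == 0x5555555555555555ULL  /* even queue slots */
    sdma_engine_mask(1, 2) == 0xAAAAAAAAAAAAAAAAULL  /* odd queue slots  */

i.e. the set of queue slots in dqm->sdma_bitmap that belong to one
engine. A name along the lines of sdma_engine_queue_mask() (just a
suggestion) would say that directly.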
> +
>   static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>   				struct queue *q)
>   {
> -	int bit;
> +	uint64_t available_queue_bitmap;
> +	unsigned int bit, engine, num_engines;
> +	bool found_sdma = false;
>   
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> -		if (dqm->sdma_bitmap == 0) {

This is still a valid optimization; there is no need to walk the
engines if nothing is available anyway. The same applies to the XGMI
loop below.
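
Something like keeping the early-out before the per-engine walk
(sketch):

    if (!dqm->sdma_bitmap) {
        /* No free queue slot on any engine, skip the loop */
        pr_err("No more SDMA queue to allocate\n");
        return -ENOMEM;
    }

and the equivalent check on dqm->xgmi_sdma_bitmap in the XGMI branch.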

> +		num_engines = get_num_sdma_engines(dqm);
> +		for_each_set_bit(engine, &(qpd->sdma_engine_bitmap), num_engines) {

Probably a naive question -

Theoretically there are 8 queues per engine as per the mask code. The
code below assumes that a process will use at most as many queues
simultaneously as there are SDMA engines (or XGMI engines). Is that
always true? For example, with 2 SDMA engines and 4 queues per engine,
can't multiple threads of a process initiate more simultaneous SDMA
transactions than there are engines available? This code limits that,
but I don't know if that is a possible case.
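
If I read the patch right, with 2 PCIe SDMA engines a process starts
out with

    pdd->qpd.sdma_engine_bitmap = BIT_ULL(2) - 1;  /* 0x3, engines 0-1 */

so once both engine bits are cleared the loop finds nothing and
allocate_sdma_queue() returns -ENOMEM, even while dqm->sdma_bitmap
still has free queue slots on those engines.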

Thanks,
Lijo

> +			/* Do not reuse SDMA0 for any subsequent SDMA queue
> +			 * requests on Aldebaran. If SDMA0's queues are all
> +			 * full, then this process should never use SDMA0
> +			 * for any further requests
> +			 */
> +			if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> +					engine == 0)
> +				qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> +
> +			available_queue_bitmap = sdma_engine_mask(engine, num_engines);
> +			available_queue_bitmap &= dqm->sdma_bitmap;
> +
> +			if (!available_queue_bitmap)
> +				continue;
> +			/* Take the selected engine off the list so we will not
> +			 * allocate two queues onto the same engine
> +			 */
> +			qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> +			found_sdma = true;
> +
> +			bit = __ffs64(available_queue_bitmap);
> +			dqm->sdma_bitmap &= ~(1ULL << bit);
> +			q->sdma_id = bit;
> +			q->properties.sdma_engine_id = q->sdma_id % num_engines;
> +			q->properties.sdma_queue_id = q->sdma_id / num_engines;
> +			break;
> +		}
> +		if (!found_sdma) {
>   			pr_err("No more SDMA queue to allocate\n");
>   			return -ENOMEM;
>   		}
> -
> -		bit = __ffs64(dqm->sdma_bitmap);
> -		dqm->sdma_bitmap &= ~(1ULL << bit);
> -		q->sdma_id = bit;
> -		q->properties.sdma_engine_id = q->sdma_id %
> -				get_num_sdma_engines(dqm);
> -		q->properties.sdma_queue_id = q->sdma_id /
> -				get_num_sdma_engines(dqm);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -		if (dqm->xgmi_sdma_bitmap == 0) {
> +		num_engines = get_num_xgmi_sdma_engines(dqm);
> +		for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap), num_engines) {
> +			available_queue_bitmap = sdma_engine_mask(engine, num_engines);
> +			available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
> +
> +			if (!available_queue_bitmap)
> +				continue;
> +			/* Take the selected engine off the list so we will not
> +			 * allocate two queues onto the same engine
> +			 */
> +			qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
> +			found_sdma = true;
> +
> +			bit = __ffs64(available_queue_bitmap);
> +			dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> +			q->sdma_id = bit;
> +			/* sdma_engine_id is sdma id including
> +			 * both PCIe-optimized SDMAs and XGMI-
> +			 * optimized SDMAs. The calculation below
> +			 * assumes the first N engines are always
> +			 * PCIe-optimized ones
> +			 */
> +			q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> +				q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> +			q->properties.sdma_queue_id = q->sdma_id /
> +				get_num_xgmi_sdma_engines(dqm);
> +			break;
> +		}
> +		if (!found_sdma) {
>   			pr_err("No more XGMI SDMA queue to allocate\n");
>   			return -ENOMEM;
>   		}
> -		bit = __ffs64(dqm->xgmi_sdma_bitmap);
> -		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> -		q->sdma_id = bit;
> -		/* sdma_engine_id is sdma id including
> -		 * both PCIe-optimized SDMAs and XGMI-
> -		 * optimized SDMAs. The calculation below
> -		 * assumes the first N engines are always
> -		 * PCIe-optimized ones
> -		 */
> -		q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> -				q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> -		q->properties.sdma_queue_id = q->sdma_id /
> -				get_num_xgmi_sdma_engines(dqm);
>   	}
>   
>   	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   }
>   
>   static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct qcm_process_device *qpd,
>   				struct queue *q)
>   {
> +	uint32_t engine = q->properties.sdma_engine_id;
> +
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   		if (q->sdma_id >= get_num_sdma_queues(dqm))
>   			return;
>   		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> +		/* Don't give SDMA0 back to be reallocated on Aldebaran.
> +		 * It is only OK to use this engine from the first allocation
> +		 * within a process.
> +		 */
> +		if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> +					engine == 0))
> +			qpd->sdma_engine_bitmap |= (1ULL << engine);
> +
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>   			return;
>   		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> +		/* engine ID in the queue properties is the global engine ID.
> +		 * The XGMI engine bitmap ignores the PCIe-optimized engines.
> +		 */
> +		engine -= get_num_sdma_engines(dqm);
> +		qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
>   	}
>   }
>   
> @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		dqm_lock(dqm);
> -		retval = allocate_sdma_queue(dqm, q);
> +		retval = allocate_sdma_queue(dqm, qpd, q);
>   		dqm_unlock(dqm);
>   		if (retval)
>   			goto out;
> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		dqm_lock(dqm);
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>   		dqm_unlock(dqm);
>   	}
>   out:
> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   
>   	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>   	    (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue(dqm, qpd, q);
>   		pdd->sdma_past_activity_counter += sdma_val;
>   	}
>   
> @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   	/* Clear all user mode queues */
>   	list_for_each_entry(q, &qpd->queues_list, list) {
>   		if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue(dqm, qpd, q);
>   		else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue(dqm, qpd, q);
>   
>   		if (q->properties.is_active) {
>   			decrement_queue_count(dqm, q->properties.type);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ab83b0de6b22..c38eebc9db4d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -576,6 +576,8 @@ struct qcm_process_device {
>   	struct list_head priv_queue_list;
>   
>   	unsigned int queue_count;
> +	unsigned long sdma_engine_bitmap;
> +	unsigned long xgmi_sdma_engine_bitmap;
>   	unsigned int vmid;
>   	bool is_debug;
>   	unsigned int evicted; /* eviction counter, 0=active */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 21ec8a18cad2..13c85624bf7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1422,6 +1422,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>   							struct kfd_process *p)
>   {
>   	struct kfd_process_device *pdd = NULL;
> +	const struct kfd_device_info *dev_info = dev->dqm->dev->device_info;
>   
>   	if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
>   		return NULL;
> @@ -1446,6 +1447,8 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>   	pdd->qpd.pqm = &p->pqm;
>   	pdd->qpd.evicted = 0;
>   	pdd->qpd.mapped_gws_queue = false;
> +	pdd->qpd.sdma_engine_bitmap = BIT_ULL(dev_info->num_sdma_engines) - 1;
> +	pdd->qpd.xgmi_sdma_engine_bitmap = BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
>   	pdd->process = p;
>   	pdd->bound = PDD_UNBOUND;
>   	pdd->already_dequeued = false;
> 

