[PATCH 1/3] drm/amdkfd: Allocate SDMA engines more fairly
Lazar, Lijo
lijo.lazar at amd.com
Mon Aug 23 07:08:32 UTC 2021
On 8/20/2021 11:02 AM, Joseph Greathouse wrote:
> Give every process at most one queue from each SDMA engine.
> Previously, we allocated all SDMA engines and queues on a first-
> come-first-serve basis. This meant that it was possible for two
> processes racing on their allocation requests to each end up with
> two queues on the same SDMA engine. That could serialize the
> performance of commands to different queues.
>
> This new mechanism allows each process at most a single queue
> on each SDMA engine. Processes will check for queue availability on
> SDMA engine 0, then 1, then 2, etc. and only allocate on that
> engine if there is an available queue slot. If there are no
> queue slots available (or if this process has already allocated
> a queue on this engine), it moves on and tries the next engine.
>
> The Aldebaran chip has a small quirk that SDMA0 should only be
> used by the very first allocation from each process. It is OK to
> use any other SDMA engine at any time, but after the first SDMA
> allocation request from a process, the resulting engine must
> be from SDMA1 or above. This patch handles this case as well.
>
> Signed-off-by: Joseph Greathouse <Joseph.Greathouse at amd.com>
> ---
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 3 +
> 3 files changed, 107 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index f8fce9d05f50..86bdb765f350 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> static int map_queues_cpsch(struct device_queue_manager *dqm);
>
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> + struct qcm_process_device *qpd,
> struct queue *q);
>
> static inline void deallocate_hqd(struct device_queue_manager *dqm,
> struct queue *q);
> static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
> static int allocate_sdma_queue(struct device_queue_manager *dqm,
> + struct qcm_process_device *qpd,
> struct queue *q);
> static void kfd_process_hw_exception(struct work_struct *work);
>
> @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> q->pipe, q->queue);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> - retval = allocate_sdma_queue(dqm, q);
> + retval = allocate_sdma_queue(dqm, qpd, q);
> if (retval)
> goto deallocate_vmid;
> dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> deallocate_hqd(dqm, q);
> else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue(dqm, qpd, q);
> deallocate_vmid:
> if (list_empty(&qpd->queues_list))
> deallocate_vmid(dqm, qpd, q);
> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
> if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> deallocate_hqd(dqm, q);
> else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue(dqm, qpd, q);
> else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue(dqm, qpd, q);
> else {
> pr_debug("q->properties.type %d is invalid\n",
> q->properties.type);
> @@ -1039,42 +1041,93 @@ static void pre_reset(struct device_queue_manager *dqm)
> dqm_unlock(dqm);
> }
>
> +static uint64_t sdma_engine_mask(int engine, int num_engines)
Looks more like the queue mask for an engine, the name doesn't make it
clear.
> +{
> + uint64_t mask = 0;
> +
> + engine %= num_engines;
> + while (engine < (sizeof(uint64_t)*8)) {
> + mask |= 1ULL << engine;
> + engine += num_engines;
> + }
> + return mask;
> +}
> +
> static int allocate_sdma_queue(struct device_queue_manager *dqm,
> + struct qcm_process_device *qpd,
> struct queue *q)
> {
> - int bit;
> + uint64_t available_queue_bitmap;
> + unsigned int bit, engine, num_engines;
> + bool found_sdma = false;
>
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> - if (dqm->sdma_bitmap == 0) {
This is still a valid optimization and no need to loop through if
nothing is available anyway. Valid for XGMI loop also.
> + num_engines = get_num_sdma_engines(dqm);
> + for_each_set_bit(engine, &(qpd->sdma_engine_bitmap), num_engines) {
Probably a naive question -
Theoretically there are 8 queues per engine as per the mask code. In the
below code, there is an assumption that a process will use at best
number of queues=max num of sdma engines or xgmi engines simultaneously.
Is that true always? For ex: there are 2 sdma engines and 4 queues per
engine. Can't multiple threads of a process initiate multiple sdma
transactions > number of sdma engines available? This code limits that,
but I don't know if that is a possible case.
Thanks,
Lijo
> + /* Do not reuse SDMA0 for any subsequent SDMA queue
> + * requests on Aldebaran. If SDMA0's queues are all
> + * full, then this process should never use SDMA0
> + * for any further requests
> + */
> + if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> + engine == 0)
> + qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> +
> + available_queue_bitmap = sdma_engine_mask(engine, num_engines);
> + available_queue_bitmap &= dqm->sdma_bitmap;
> +
> + if (!available_queue_bitmap)
> + continue;
> + /* Take the selected engine off the list so we will not
> + * allocate two queues onto the same engine
> + */
> + qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> + found_sdma = true;
> +
> + bit = __ffs64(available_queue_bitmap);
> + dqm->sdma_bitmap &= ~(1ULL << bit);
> + q->sdma_id = bit;
> + q->properties.sdma_engine_id = q->sdma_id % num_engines;
> + q->properties.sdma_queue_id = q->sdma_id / num_engines;
> + break;
> + }
> + if (!found_sdma) {
> pr_err("No more SDMA queue to allocate\n");
> return -ENOMEM;
> }
> -
> - bit = __ffs64(dqm->sdma_bitmap);
> - dqm->sdma_bitmap &= ~(1ULL << bit);
> - q->sdma_id = bit;
> - q->properties.sdma_engine_id = q->sdma_id %
> - get_num_sdma_engines(dqm);
> - q->properties.sdma_queue_id = q->sdma_id /
> - get_num_sdma_engines(dqm);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> - if (dqm->xgmi_sdma_bitmap == 0) {
> + num_engines = get_num_xgmi_sdma_engines(dqm);
> + for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap), num_engines) {
> + available_queue_bitmap = sdma_engine_mask(engine, num_engines);
> + available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
> +
> + if (!available_queue_bitmap)
> + continue;
> + /* Take the selected engine off the list so we will not
> + * allocate two queues onto the same engine
> + */
> + qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
> + found_sdma = true;
> +
> + bit = __ffs64(available_queue_bitmap);
> + dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> + q->sdma_id = bit;
> + /* sdma_engine_id is sdma id including
> + * both PCIe-optimized SDMAs and XGMI-
> + * optimized SDMAs. The calculation below
> + * assumes the first N engines are always
> + * PCIe-optimized ones
> + */
> + q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> + q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> + q->properties.sdma_queue_id = q->sdma_id /
> + get_num_xgmi_sdma_engines(dqm);
> + break;
> + }
> + if (!found_sdma) {
> pr_err("No more XGMI SDMA queue to allocate\n");
> return -ENOMEM;
> }
> - bit = __ffs64(dqm->xgmi_sdma_bitmap);
> - dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> - q->sdma_id = bit;
> - /* sdma_engine_id is sdma id including
> - * both PCIe-optimized SDMAs and XGMI-
> - * optimized SDMAs. The calculation below
> - * assumes the first N engines are always
> - * PCIe-optimized ones
> - */
> - q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> - q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> - q->properties.sdma_queue_id = q->sdma_id /
> - get_num_xgmi_sdma_engines(dqm);
> }
>
> pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
> }
>
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> + struct qcm_process_device *qpd,
> struct queue *q)
> {
> + uint32_t engine = q->properties.sdma_engine_id;
> +
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> if (q->sdma_id >= get_num_sdma_queues(dqm))
> return;
> dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> + /* Don't give SDMA0 back to be reallocated on Aldebaran.
> + * It is only OK to use this engine from the first allocation
> + * within a process.
> + */
> + if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> + engine == 0))
> + qpd->sdma_engine_bitmap |= (1ULL << engine);
> +
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
> return;
> dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> + /* engine ID in the queue properties is the global engine ID.
> + * The XGMI engine bitmap ignores the PCIe-optimized engines.
> + */
> + engine -= get_num_sdma_engines(dqm);
> + qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
> }
> }
>
> @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> dqm_lock(dqm);
> - retval = allocate_sdma_queue(dqm, q);
> + retval = allocate_sdma_queue(dqm, qpd, q);
> dqm_unlock(dqm);
> if (retval)
> goto out;
> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> dqm_lock(dqm);
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue(dqm, qpd, q);
> dqm_unlock(dqm);
> }
> out:
> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>
> if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
> (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue(dqm, qpd, q);
> pdd->sdma_past_activity_counter += sdma_val;
> }
>
> @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> /* Clear all user mode queues */
> list_for_each_entry(q, &qpd->queues_list, list) {
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue(dqm, qpd, q);
> else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue(dqm, qpd, q);
>
> if (q->properties.is_active) {
> decrement_queue_count(dqm, q->properties.type);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ab83b0de6b22..c38eebc9db4d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -576,6 +576,8 @@ struct qcm_process_device {
> struct list_head priv_queue_list;
>
> unsigned int queue_count;
> + unsigned long sdma_engine_bitmap;
> + unsigned long xgmi_sdma_engine_bitmap;
> unsigned int vmid;
> bool is_debug;
> unsigned int evicted; /* eviction counter, 0=active */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 21ec8a18cad2..13c85624bf7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1422,6 +1422,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
> struct kfd_process *p)
> {
> struct kfd_process_device *pdd = NULL;
> + const struct kfd_device_info *dev_info = dev->dqm->dev->device_info;
>
> if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
> return NULL;
> @@ -1446,6 +1447,8 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
> pdd->qpd.pqm = &p->pqm;
> pdd->qpd.evicted = 0;
> pdd->qpd.mapped_gws_queue = false;
> + pdd->qpd.sdma_engine_bitmap = BIT_ULL(dev_info->num_sdma_engines) - 1;
> + pdd->qpd.xgmi_sdma_engine_bitmap = BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
> pdd->process = p;
> pdd->bound = PDD_UNBOUND;
> pdd->already_dequeued = false;
>
More information about the amd-gfx
mailing list