[PATCH] drm/amdkfd: support per-queue reset on gfx9
Felix Kuehling
felix.kuehling at amd.com
Wed Jul 31 15:44:35 UTC 2024
On 2024-07-31 09:37, Jonathan Kim wrote:
> Support per-queue reset for GFX9. The recommendation is for the driver
> to target reset the HW queue via a SPI MMIO register write.
>
> Since this requires pipe and HW queue info and MEC FW is limited to
> doorbell reports of hung queues after an unmap failure, scan the HW
> queue slots defined by SET_RESOURCES first to identify the user queue
> candidates to reset.
>
> Only signal reset events to processes that have had a queue reset.
>
> If queue reset fails, fall back to GPU reset.
>
> v3: address nitpicks
> - handle hang detect buffer ENOMEM
> - warn on multiple detect hang misuse
> - reset hang detect buffer to NULL on free
> - update DRM_ERR on reset to drm_err app warning message
I meant dev_err here to make sure we print the device identifier. That's
what we mostly use in KFD. If drm_err does the same, that's fine, too.
Looking at the definitions in drm_print.h, the only thing that drm_err
adds is a "[drm]" tag in the message.
See one more comment inline.
>
> v2: move reset queue flag for house keeping to process device.
> split detect and reset into separate functions.
> make reset call safe during power saving modes.
> clean up some other nitpicks.
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
[snip]
> @@ -1929,6 +1966,135 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
> return retval;
> }
>
> +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
> + struct qcm_process_device *qpd)
> +{
> + struct kfd_process_device *pdd = qpd_to_pdd(qpd);
> +
> + pr_err("queue id 0x%0x at pasid 0x%0x is reset\n",
> + q->properties.queue_id, q->process->pasid);
This could also be a dev_err(dqm->dev->adev->dev, ...) or
drm_err(dqm->dev->adev->ddev, ...). With that fixed, the patch is
Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>
> +
> + pdd->has_reset_queue = true;
> + if (q->properties.is_active) {
> + q->properties.is_active = false;
> + decrement_queue_count(dqm, qpd, q);
> + }
> +}
> +
> +static int detect_queue_hang(struct device_queue_manager *dqm)
> +{
> + int i;
> +
> + /* detect should be used only in dqm locked queue reset */
> + if (WARN_ON(dqm->detect_hang_count > 0))
> + return 0;
> +
> + memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size);
> +
> + for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
> + uint32_t mec, pipe, queue;
> + int xcc_id;
> +
> + mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe)
> + / dqm->dev->kfd->shared_resources.num_pipe_per_mec;
> +
> + if (mec || !test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap))
> + continue;
> +
> + amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, i, &mec, &pipe, &queue);
> +
> + for_each_inst(xcc_id, dqm->dev->xcc_mask) {
> + uint64_t queue_addr = dqm->dev->kfd2kgd->hqd_get_pq_addr(
> + dqm->dev->adev, pipe, queue, xcc_id);
> + struct dqm_detect_hang_info hang_info;
> +
> + if (!queue_addr)
> + continue;
> +
> + hang_info.pipe_id = pipe;
> + hang_info.queue_id = queue;
> + hang_info.xcc_id = xcc_id;
> + hang_info.queue_address = queue_addr;
> +
> + dqm->detect_hang_info[dqm->detect_hang_count] = hang_info;
> + dqm->detect_hang_count++;
> + }
> + }
> +
> + return dqm->detect_hang_count;
> +}
> +
> +static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uint64_t queue_address)
> +{
> + struct device_process_node *cur;
> + struct qcm_process_device *qpd;
> + struct queue *q;
> +
> + list_for_each_entry(cur, &dqm->queues, list) {
> + qpd = cur->qpd;
> + list_for_each_entry(q, &qpd->queues_list, list) {
> + if (queue_address == q->properties.queue_address)
> + return q;
> + }
> + }
> +
> + return NULL;
> +}
> +
> +/* only for compute queue */
> +static int reset_queues_on_hws_hang(struct device_queue_manager *dqm)
> +{
> + int r = 0, reset_count = 0, i;
> +
> + if (!dqm->detect_hang_info || dqm->is_hws_hang)
> + return -EIO;
> +
> + /* assume dqm locked. */
> + if (!detect_queue_hang(dqm))
> + return -ENOTRECOVERABLE;
> +
> + for (i = 0; i < dqm->detect_hang_count; i++) {
> + struct dqm_detect_hang_info hang_info = dqm->detect_hang_info[i];
> + struct queue *q = find_queue_by_address(dqm, hang_info.queue_address);
> + struct kfd_process_device *pdd;
> + uint64_t queue_addr = 0;
> +
> + if (!q) {
> + r = -ENOTRECOVERABLE;
> + goto reset_fail;
> + }
> +
> + pdd = kfd_get_process_device_data(dqm->dev, q->process);
> + if (!pdd) {
> + r = -ENOTRECOVERABLE;
> + goto reset_fail;
> + }
> +
> + queue_addr = dqm->dev->kfd2kgd->hqd_reset(dqm->dev->adev,
> + hang_info.pipe_id, hang_info.queue_id, hang_info.xcc_id,
> + KFD_UNMAP_LATENCY_MS);
> +
> + /* either reset failed or we reset an unexpected queue. */
> + if (queue_addr != q->properties.queue_address) {
> + r = -ENOTRECOVERABLE;
> + goto reset_fail;
> + }
> +
> + set_queue_as_reset(dqm, q, &pdd->qpd);
> + reset_count++;
> + }
> +
> + if (reset_count == dqm->detect_hang_count)
> + kfd_signal_reset_event(dqm->dev);
> + else
> + r = -ENOTRECOVERABLE;
> +
> +reset_fail:
> + dqm->detect_hang_count = 0;
> +
> + return r;
> +}
> +
> /* dqm->lock mutex has to be locked before calling this function */
> static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> enum kfd_unmap_queues_filter filter,
> @@ -1979,11 +2145,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> */
> mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
> if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {
> - while (halt_if_hws_hang)
> - schedule();
> - kfd_hws_hang(dqm);
> - retval = -ETIME;
> - goto out;
> + if (reset_queues_on_hws_hang(dqm)) {
> + while (halt_if_hws_hang)
> + schedule();
> + dqm->is_hws_hang = true;
> + kfd_hws_hang(dqm);
> + retval = -ETIME;
> + goto out;
> + }
> }
>
> /* We need to reset the grace period value for this device */
> @@ -2002,8 +2171,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> }
>
> /* only for compute queue */
> -static int reset_queues_cpsch(struct device_queue_manager *dqm,
> - uint16_t pasid)
> +static int reset_queues_cpsch(struct device_queue_manager *dqm, uint16_t pasid)
> {
> int retval;
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 3b9b8eabaacc..dfb36a246637 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -210,6 +210,13 @@ struct device_queue_manager_asic_ops {
> struct kfd_node *dev);
> };
>
> +struct dqm_detect_hang_info {
> + int pipe_id;
> + int queue_id;
> + int xcc_id;
> + uint64_t queue_address;
> +};
> +
> /**
> * struct device_queue_manager
> *
> @@ -264,6 +271,11 @@ struct device_queue_manager {
> uint32_t wait_times;
>
> wait_queue_head_t destroy_wait;
> +
> + /* for per-queue reset support */
> + struct dqm_detect_hang_info *detect_hang_info;
> + size_t detect_hang_info_size;
> + int detect_hang_count;
> };
>
> void device_queue_manager_init_cik(
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index 9b33d9d2c9ad..9393ddc2e828 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -31,6 +31,7 @@
> #include <linux/memory.h>
> #include "kfd_priv.h"
> #include "kfd_events.h"
> +#include "kfd_device_queue_manager.h"
> #include <linux/device.h>
>
> /*
> @@ -1244,12 +1245,33 @@ void kfd_signal_reset_event(struct kfd_node *dev)
> idx = srcu_read_lock(&kfd_processes_srcu);
> hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
> int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
> + struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p);
>
> if (unlikely(user_gpu_id == -EINVAL)) {
> WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
> continue;
> }
>
> + if (unlikely(!pdd)) {
> + WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid);
> + continue;
> + }
> +
> + if (dev->dqm->detect_hang_count && !pdd->has_reset_queue)
> + continue;
> +
> + if (dev->dqm->detect_hang_count) {
> + struct amdgpu_task_info *ti;
> +
> + ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
> + if (ti) {
> + drm_err(&dev->adev->ddev,
> + "Queues reset on process %s tid %d thread %s pid %d\n",
> + ti->process_name, ti->tgid, ti->task_name, ti->pid);
> + amdgpu_vm_put_task_info(ti);
> + }
> + }
> +
> rcu_read_lock();
>
> id = KFD_FIRST_NONSIGNAL_EVENT_ID;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> index 66c73825c0a0..84e8ea3a8a0c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> @@ -321,8 +321,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
> static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
> {
> struct v9_mqd *m = (struct v9_mqd *)mqd;
> + uint32_t doorbell_id = m->queue_doorbell_id0;
>
> - return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
> + m->queue_doorbell_id0 = 0;
> +
> + return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0);
> }
>
> static int get_wave_state(struct mqd_manager *mm, void *mqd,
> @@ -624,6 +627,7 @@ static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd)
> m = get_mqd(mqd + hiq_mqd_size * inst);
> ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev,
> m->queue_doorbell_id0, inst);
> + m->queue_doorbell_id0 = 0;
> ++inst;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index b5cae48dff66..892a85408c09 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -843,6 +843,9 @@ struct kfd_process_device {
> void *proc_ctx_bo;
> uint64_t proc_ctx_gpu_addr;
> void *proc_ctx_cpu_ptr;
> +
> + /* Tracks queue reset status */
> + bool has_reset_queue;
> };
>
> #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 9e29b92eb523..a902950cc060 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1851,6 +1851,8 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
> goto fail;
> }
> n_evicted++;
> +
> + pdd->dev->dqm->is_hws_hang = false;
> }
>
> return r;
> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> index 6d094cf3587d..7744ca3ef4b1 100644
> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> @@ -318,6 +318,12 @@ struct kfd2kgd_calls {
> void (*program_trap_handler_settings)(struct amdgpu_device *adev,
> uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
> uint32_t inst);
> + uint64_t (*hqd_get_pq_addr)(struct amdgpu_device *adev,
> + uint32_t pipe_id, uint32_t queue_id,
> + uint32_t inst);
> + uint64_t (*hqd_reset)(struct amdgpu_device *adev,
> + uint32_t pipe_id, uint32_t queue_id,
> + uint32_t inst, unsigned int utimeout);
> };
>
> #endif /* KGD_KFD_INTERFACE_H_INCLUDED */
More information about the amd-gfx
mailing list