[PATCH 25/29] drm/amdkfd: add debug query event operation

Felix Kuehling felix.kuehling at amd.com
Wed Nov 30 00:44:10 UTC 2022


On 2022-10-31 12:23, Jonathan Kim wrote:
> Allow the debugger to query a single queue, device and process exception
> in a FIFO manner.

The implementation is not really FIFO because the order in which events 
are returned is independent of the order in which they were raised. Just 
remove the FIFO statement.

Other than that, this patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


> The KFD should also return the GPU or Queue id of the exception.
> The debugger also has the option of clearing exceptions after
> being queried.
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  6 +++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 64 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
>   3 files changed, 75 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 200e11f02382..b918213a0087 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2946,6 +2946,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		r = kfd_dbg_trap_set_flags(target, &args->set_flags.flags);
>   		break;
>   	case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
> +		r = kfd_dbg_ev_query_debug_event(target,
> +				&args->query_debug_event.queue_id,
> +				&args->query_debug_event.gpu_id,
> +				args->query_debug_event.exception_mask,
> +				&args->query_debug_event.exception_mask);
> +		break;
>   	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
>   	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 1f4d3fa0278e..6985a53b83e9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -33,6 +33,70 @@
>   #define MAX_WATCH_ADDRESSES	4
>   static DEFINE_SPINLOCK(watch_points_lock);
>   
> +int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
> +		      unsigned int *queue_id,
> +		      unsigned int *gpu_id,
> +		      uint64_t exception_clear_mask,
> +		      uint64_t *event_status)
> +{
> +	struct process_queue_manager *pqm;
> +	struct process_queue_node *pqn;
> +	int i;
> +
> +	if (!(process && process->debug_trap_enabled))
> +		return -ENODATA;
> +
> +	mutex_lock(&process->event_mutex);
> +	*event_status = 0;
> +	*queue_id = 0;
> +	*gpu_id = 0;
> +
> +	/* find and report queue events */
> +	pqm = &process->pqm;
> +	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
> +		uint64_t tmp = process->exception_enable_mask;
> +
> +		if (!pqn->q)
> +			continue;
> +
> +		tmp &= pqn->q->properties.exception_status;
> +
> +		if (!tmp)
> +			continue;
> +
> +		*event_status = pqn->q->properties.exception_status;
> +		*queue_id = pqn->q->properties.queue_id;
> +		*gpu_id = pqn->q->device->id;
> +		pqn->q->properties.exception_status &= ~exception_clear_mask;
> +		goto out;
> +	}
> +
> +	/* find and report device events */
> +	for (i = 0; i < process->n_pdds; i++) {
> +		struct kfd_process_device *pdd = process->pdds[i];
> +		uint64_t tmp = process->exception_enable_mask
> +						& pdd->exception_status;
> +
> +		if (!tmp)
> +			continue;
> +
> +		*event_status = pdd->exception_status;
> +		*gpu_id = pdd->dev->id;
> +		pdd->exception_status &= ~exception_clear_mask;
> +		goto out;
> +	}
> +
> +	/* report process events */
> +	if (process->exception_enable_mask & process->exception_status) {
> +		*event_status = process->exception_status;
> +		process->exception_status &= ~exception_clear_mask;
> +	}
> +
> +out:
> +	mutex_unlock(&process->event_mutex);
> +	return *event_status ? 0 : -EAGAIN;
> +}
> +
>   void debug_event_write_work_handler(struct work_struct *work)
>   {
>   	struct kfd_process *process;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 12b80b6c96d0..c64ffd3efc46 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -27,6 +27,11 @@
>   
>   void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
>   int kfd_dbg_trap_activate(struct kfd_process *target);
> +int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
> +			unsigned int *queue_id,
> +			unsigned int *gpu_id,
> +			uint64_t exception_clear_mask,
> +			uint64_t *event_status);
>   bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
>   				   unsigned int pasid,
>   				   uint32_t doorbell_id,


More information about the amd-gfx mailing list