[PATCH 26/29] drm/amdkfd: add debug query exception info operation

Felix Kuehling felix.kuehling at amd.com
Wed Nov 30 00:50:24 UTC 2022


On 2022-10-31 12:23, Jonathan Kim wrote:
> Allow the debugger to query additional info based on an exception code.
> For device exceptions, it's currently only memory violation information.
> For process exceptions, it's currently only runtime information.
> Queue exceptions only report the queue exception status.
>
> The debugger has the option of clearing the target exception on query.
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   7 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 120 +++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   6 ++
>   3 files changed, 133 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index b918213a0087..2c8f107237ee 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2953,6 +2953,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				&args->query_debug_event.exception_mask);
>   		break;
>   	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
> +		r = kfd_dbg_trap_query_exception_info(target,
> +				args->query_exception_info.source_id,
> +				args->query_exception_info.exception_code,
> +				args->query_exception_info.clear_exception,
> +				(void __user *)args->query_exception_info.info_ptr,
> +				&args->query_exception_info.info_size);
> +		break;
>   	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
>   		pr_warn("Debug op %i not supported yet\n", args->op);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 6985a53b83e9..a05fe32eac0e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -768,6 +768,126 @@ int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
>   	return r;
>   }
>   
> +int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
> +		uint32_t source_id,
> +		uint32_t exception_code,
> +		bool clear_exception,
> +		void __user *info,
> +		uint32_t *info_size)
> +{
> +	bool found = false;
> +	int r = 0;
> +	uint32_t copy_size, actual_info_size = 0;
> +	uint64_t *exception_status_ptr = NULL;
> +	/* Query one exception's info; target, info and info_size must all be non-NULL. */
> +	if (!target)
> +		return -EINVAL;
> +
> +	if (!info || !info_size)
> +		return -EINVAL;
> +	/* All lookups and status updates below run under target->event_mutex. */
> +	mutex_lock(&target->event_mutex);
> +
> +	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
> +		/* Per queue exceptions */
> +		struct queue *queue = NULL;
> +		int i;
> +		/* Search each device's queue list for queue_id == source_id. */
> +		for (i = 0; i < target->n_pdds; i++) {
> +			struct kfd_process_device *pdd = target->pdds[i];
> +			struct qcm_process_device *qpd = &pdd->qpd;
> +
> +			list_for_each_entry(queue, &qpd->queues_list, list) {
> +				if (!found && queue->properties.queue_id == source_id) {
> +					found = true;
> +					break;
> +				}
> +			}
> +			if (found)
> +				break;
> +		}
> +		/* Unknown queue id is -EINVAL; exception not raised is -ENODATA. */
> +		if (!found) {
> +			r = -EINVAL;
> +			goto out;
> +		}
> +
> +		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
> +			r = -ENODATA;
> +			goto out;
> +		}
> +		exception_status_ptr = &queue->properties.exception_status;
> +	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
> +		/* Per device exceptions */
> +		struct kfd_process_device *pdd = NULL;
> +		int i;
> +		/* Find the process device whose dev->id matches source_id. */
> +		for (i = 0; i < target->n_pdds; i++) {
> +			pdd = target->pdds[i];
> +			if (pdd->dev->id == source_id) {
> +				found = true;
> +				break;
> +			}
> +		}
> +
> +		if (!found) {
> +			r = -EINVAL;
> +			goto out;
> +		}
> +
> +		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
> +			r = -ENODATA;
> +			goto out;
> +		}
> +		/* Copy out at most *info_size bytes of pdd->vm_fault_exc_data. */
> +		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
> +			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);
> +
> +			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
> +				r = -EFAULT;
> +				goto out;
> +			}
> +			actual_info_size = pdd->vm_fault_exc_data_size; /* full size, even if the copy was truncated */
> +			if (clear_exception) {
> +				kfree(pdd->vm_fault_exc_data);
> +				pdd->vm_fault_exc_data = NULL;
> +				pdd->vm_fault_exc_data_size = 0;
> +			}
> +		}
> +		exception_status_ptr = &pdd->exception_status;
> +	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
> +		/* Per process exceptions */
> +		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
> +			r = -ENODATA;
> +			goto out;
> +		}
> +		/* Copy out at most *info_size bytes of target->runtime_info. */
> +		if (exception_code == EC_PROCESS_RUNTIME) {
> +			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));
> +
> +			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
> +				r = -EFAULT;
> +				goto out;
> +			}
> +
> +			actual_info_size = sizeof(target->runtime_info); /* full size, even if the copy was truncated */
> +		}
> +
> +		exception_status_ptr = &target->exception_status;
> +	} else {
> +		pr_debug("Bad exception type [%i]\n", exception_code);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	/* Success only: report the full info size and optionally clear the exception bit. */
> +	*info_size = actual_info_size;
> +	if (clear_exception)
> +		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
> +out:
> +	mutex_unlock(&target->event_mutex);
> +	return r;
> +}
> +
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index c64ffd3efc46..58a5f14d1258 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -63,6 +63,12 @@ int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
>   					uint32_t *watch_id,
>   					uint32_t watch_mode);
>   int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags);
> +int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
> +		uint32_t source_id,
> +		uint32_t exception_code,
> +		bool clear_exception,
> +		void __user *info,
> +		uint32_t *info_size);
>   int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   					unsigned int dev_id,
>   					unsigned int queue_id,


More information about the amd-gfx mailing list