[PATCH 16/32] drm/amdkfd: add per process hw trap enable and disable functions

Felix Kuehling felix.kuehling at amd.com
Mon Mar 20 23:06:16 UTC 2023


On 2023-01-25 14:53, Jonathan Kim wrote:
> To enable HW debug mode per process, all devices must be debug enabled
> successfully.  If a failure occures, rewind the enablement of debug mode
> on the enabled devices.
>
> A power management scenario that needs to be considered is HW
> debug mode setting during GFXOFF.  During GFXOFF, these registers
> will be unreachable so we have to transiently disable GFXOFF when
> setting.  Also, some devices don't support the RLC save restore
> function for these debug registers so we have to disable GFXOFF
> completely during a debug session.
>
> Cooperative launch also has debugging restriction based on HW/FW bugs.
> If such bugs exists, the debugger cannot attach to a process that uses GWS
> resources nor can GWS resources be requested if a process is being
> debugged.
>
> Multi-process debug devices can only enable trap temporaries based
> on certain runtime scenerios, which will be explained when the
> runtime enable functions are implemented in a follow up patch.
>
> v2: add gfx11 support. fix fw checks. remove asic family name comments.
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |   5 +
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 148 +++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  29 ++++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c |   1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   9 ++
>   5 files changed, 190 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f5f639de28f0..628178126d3b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1453,6 +1453,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
>   		goto out_unlock;
>   	}
>   
> +	if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
> +		retval = -EBUSY;
> +		goto out_unlock;
> +	}
> +
>   	retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
>   	mutex_unlock(&p->mutex);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 6e99a0160275..659dfc7411fe 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -21,6 +21,7 @@
>    */
>   
>   #include "kfd_debug.h"
> +#include "kfd_device_queue_manager.h"
>   #include <linux/file.h>
>   
>   void debug_event_write_work_handler(struct work_struct *work)
> @@ -101,11 +102,68 @@ static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
>   						pdd->watch_points, flags);
>   }
>   
> +/* kfd_dbg_trap_deactivate:
> + *	target: target process
> + *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
> + *	unwind_count:
> + *		If unwind == true, how far down the pdd list we need
> + *				to unwind
> + *		else: ignored
> + */
> +static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
> +{
> +	int i, count = 0;
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		/* If this is an unwind, and we have unwound the required
> +		 * enable calls on the pdd list, we need to stop now
> +		 * otherwise we may mess up another debugger session.
> +		 */
> +		if (unwind && count == unwind_count)
> +			break;
> +
> +		/* GFX off is already disabled by debug activate if not RLC restore supported. */
> +		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> +			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +		pdd->spi_dbg_override =
> +				pdd->dev->kfd2kgd->disable_debug_trap(
> +				pdd->dev->adev,
> +				target->runtime_info.ttmp_setup,
> +				pdd->dev->vm_info.last_vmid_kfd);
> +		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> +			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

Shouldn't this reenable GFXOFF unconditionally? It should not stay 
disabled on devices without RLC restore support, because we're ending 
the debug session here.


> +
> +		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
> +				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
> +			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
> +
> +		if (!pdd->dev->shared_resources.enable_mes)
> +			debug_refresh_runlist(pdd->dev->dqm);
> +		else
> +			kfd_dbg_set_mes_debug_mode(pdd);
> +
> +		count++;

Isn't count the same as i? Why do we need a separate variable here?

Regards,
   Felix


> +	}
> +
> +	kfd_dbg_set_workaround(target, false);
> +}
> +
>   int kfd_dbg_trap_disable(struct kfd_process *target)
>   {
>   	if (!target->debug_trap_enabled)
>   		return 0;
>   
> +	/*
> +	 * Defer deactivation to runtime if runtime not enabled otherwise reset
> +	 * attached running target runtime state to enable for re-attach.
> +	 */
> +	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
> +		kfd_dbg_trap_deactivate(target, false, 0);
> +	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
> +		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
> +
>   	fput(target->dbg_ev_file);
>   	target->dbg_ev_file = NULL;
>   
> @@ -120,16 +178,96 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
>   	return 0;
>   }
>   
> +static int kfd_dbg_trap_activate(struct kfd_process *target)
> +{
> +	int i, r = 0, unwind_count = 0;
> +
> +	r = kfd_dbg_set_workaround(target, true);
> +	if (r)
> +		return r;
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
> +			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
> +
> +			if (r) {
> +				target->runtime_info.runtime_state = (r == -EBUSY) ?
> +							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
> +							DEBUG_RUNTIME_STATE_ENABLED_ERROR;
> +
> +				goto unwind_err;
> +			}
> +		}
> +
> +		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
> +		 * If RLC restore of debug registers is not supported and runtime enable
> +		 * hasn't done so already on ttmp setup request, restore the trap config registers.
> +		 *
> +		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
> +		 * the debug session.
> +		 */
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
> +						target->runtime_info.ttmp_setup))
> +			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
> +								pdd->dev->vm_info.last_vmid_kfd);
> +
> +		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
> +					pdd->dev->adev,
> +					false,
> +					pdd->dev->vm_info.last_vmid_kfd);
> +
> +		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> +			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> +		if (!pdd->dev->shared_resources.enable_mes)
> +			r = debug_refresh_runlist(pdd->dev->dqm);
> +		else
> +			r = kfd_dbg_set_mes_debug_mode(pdd);
> +
> +		if (r) {
> +			target->runtime_info.runtime_state =
> +					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
> +			goto unwind_err;
> +		}
> +
> +		/* Increment unwind_count as the last step */
> +		unwind_count++;

Similar to above. I think unwind_count is redundant. It'll have the same 
value as "i" in the next loop iteration.


> +	}
> +
> +	return 0;
> +
> +unwind_err:
> +	/* Enabling debug failed, we need to disable on
> +	 * all GPUs so the enable is all or nothing.
> +	 */
> +	kfd_dbg_trap_deactivate(target, true, unwind_count);
> +	return r;
> +}
> +
>   int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   			void __user *runtime_info, uint32_t *runtime_size)
>   {
>   	struct file *f;
>   	uint32_t copy_size;
> -	int r = 0;
> +	int i, r = 0;
>   
>   	if (target->debug_trap_enabled)
>   		return -EALREADY;
>   
> +	/* Enable pre-checks */
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		if (!KFD_IS_SOC15(pdd->dev))
> +			return -ENODEV;
> +
> +		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
> +			return -EBUSY;
> +	}
> +
>   	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
>   
>   	f = fget(fd);
> @@ -140,6 +278,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   
>   	target->dbg_ev_file = f;
>   
> +	/* defer activation to runtime if not runtime enabled */
> +	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
> +		kfd_dbg_trap_activate(target);
> +
>   	/* We already hold the process reference but hold another one for the
>   	 * debug session.
>   	 */
> @@ -149,8 +291,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>   	if (target->debugger_process)
>   		atomic_inc(&target->debugger_process->debugged_process_count);
>   
> -	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
> +	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
> +		kfd_dbg_trap_deactivate(target, false, 0);
>   		r = -EFAULT;
> +	}
>   
>   	*runtime_size = sizeof(target->runtime_info);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index 0c09f1729325..f199698d8d60 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -41,4 +41,33 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
>   
>   void debug_event_write_work_handler(struct work_struct *work);
>   
> +/*
> + * If GFX off is enabled, chips that do not support RLC restore for the debug
> + * registers will disable GFX off temporarily for the entire debug session.
> + * See disable_on_trap_action_entry and enable_on_trap_action_exit for details.
> + */
> +static inline bool kfd_dbg_is_rlc_restore_supported(struct kfd_dev *dev)
> +{
> +	return !(KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 10) ||
> +		 KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1));
> +}
> +
> +static inline bool kfd_dbg_has_gws_support(struct kfd_dev *dev)
> +{
> +	if ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1)
> +			&& dev->mec2_fw_version < 0x81b6) ||
> +		(KFD_GC_VERSION(dev) >= IP_VERSION(9, 1, 0)
> +			&& KFD_GC_VERSION(dev) <= IP_VERSION(9, 2, 2)
> +			&& dev->mec2_fw_version < 0x1b6) ||
> +		(KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0)
> +			&& dev->mec2_fw_version < 0x1b6) ||
> +		(KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1)
> +			&& dev->mec2_fw_version < 0x30) ||
> +		(KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) &&
> +			KFD_GC_VERSION(dev) < IP_VERSION(12, 0, 0)))
> +		return false;
> +
> +	/* Assume debugging and cooperative launch supported otherwise. */
> +	return true;
> +}
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index be1985b87ea7..3b747e51684e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -36,6 +36,7 @@
>   #include "kfd_kernel_queue.h"
>   #include "amdgpu_amdkfd.h"
>   #include "mes_api_def.h"
> +#include "kfd_debug.h"
>   
>   /* Size of the per-pipe EOP queue */
>   #define CIK_HPD_EOP_BYTES_LOG2 11
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 94c6545a58b4..0ef2d00af8b1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1181,6 +1181,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
>   					struct mm_struct *mm)
>   {
>   	struct kfd_process *p;
> +	int i;
>   
>   	/*
>   	 * The kfd_process structure can not be free because the
> @@ -1198,6 +1199,14 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
>   	cancel_delayed_work_sync(&p->eviction_work);
>   	cancel_delayed_work_sync(&p->restore_work);
>   
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +
> +		/* re-enable GFX OFF since runtime enable with ttmp setup disabled it. */
> +		if (!kfd_dbg_is_rlc_restore_supported(pdd->dev) && p->runtime_info.ttmp_setup)
> +			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +	}
> +
>   	/* Indicate to other users that MM is no longer valid */
>   	p->mm = NULL;
>   


More information about the amd-gfx mailing list