[PATCH 16/33] drm/amdkfd: add per process hw trap enable and disable functions
Felix Kuehling
felix.kuehling at amd.com
Tue May 30 20:04:56 UTC 2023
Am 2023-05-25 um 13:27 schrieb Jonathan Kim:
> To enable HW debug mode per process, all devices must be debug enabled
> successfully. If a failure occures, rewind the enablement of debug mode
> on the enabled devices.
>
> A power management scenario that needs to be considered is HW
> debug mode setting during GFXOFF. During GFXOFF, these registers
> will be unreachable so we have to transiently disable GFXOFF when
> setting. Also, some devices don't support the RLC save restore
> function for these debug registers so we have to disable GFXOFF
> completely during a debug session.
>
> Cooperative launch also has debugging restriction based on HW/FW bugs.
> If such bugs exists, the debugger cannot attach to a process that uses GWS
> resources nor can GWS resources be requested if a process is being
> debugged.
>
> Multi-process debug devices can only enable trap temporaries based
> on certain runtime scenerios, which will be explained when the
> runtime enable functions are implemented in a follow up patch.
>
> v2: spot fix with new kfd_node references
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 +
> drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 148 ++++++++++++++++++++++-
> drivers/gpu/drm/amd/amdkfd/kfd_debug.h | 29 +++++
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 10 ++
> 4 files changed, 190 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 7082d5d0f0e9..9d0c247f80fe 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1488,6 +1488,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
> goto out_unlock;
> }
>
> + if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
> + retval = -EBUSY;
> + goto out_unlock;
> + }
> +
> retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
> mutex_unlock(&p->mutex);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 898cc1fe3d13..73b07b5f17f1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -21,13 +21,78 @@
> */
>
> #include "kfd_debug.h"
> +#include "kfd_device_queue_manager.h"
> #include <linux/file.h>
>
> +static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
> +{
> + uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
> + uint32_t flags = pdd->process->dbg_flags;
> +
> + if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
> + return 0;
> +
> + return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
> + pdd->watch_points, flags);
> +}
> +
> +/* kfd_dbg_trap_deactivate:
> + * target: target process
> + * unwind: If this is unwinding a failed kfd_dbg_trap_enable()
> + * unwind_count:
> + * If unwind == true, how far down the pdd list we need
> + * to unwind
> + * else: ignored
> + */
> +static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
> +{
> + int i;
> +
> + for (i = 0; i < target->n_pdds; i++) {
> + struct kfd_process_device *pdd = target->pdds[i];
> +
> + /* If this is an unwind, and we have unwound the required
> + * enable calls on the pdd list, we need to stop now
> + * otherwise we may mess up another debugger session.
> + */
> + if (unwind && i == unwind_count)
> + break;
> +
> + /* GFX off is already disabled by debug activate if not RLC restore supported. */
> + if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> + amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> + pdd->spi_dbg_override =
> + pdd->dev->kfd2kgd->disable_debug_trap(
> + pdd->dev->adev,
> + target->runtime_info.ttmp_setup,
> + pdd->dev->vm_info.last_vmid_kfd);
> + amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> + if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
> + release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
> + pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
> +
> + if (!pdd->dev->kfd->shared_resources.enable_mes)
> + debug_refresh_runlist(pdd->dev->dqm);
> + else
> + kfd_dbg_set_mes_debug_mode(pdd);
> + }
> +}
> +
> int kfd_dbg_trap_disable(struct kfd_process *target)
> {
> if (!target->debug_trap_enabled)
> return 0;
>
> + /*
> + * Defer deactivation to runtime if runtime not enabled otherwise reset
> + * attached running target runtime state to enable for re-attach.
> + */
> + if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
> + kfd_dbg_trap_deactivate(target, false, 0);
> + else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
> + target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
> +
> fput(target->dbg_ev_file);
> target->dbg_ev_file = NULL;
>
> @@ -42,16 +107,89 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
> return 0;
> }
>
> +static int kfd_dbg_trap_activate(struct kfd_process *target)
> +{
> + int i, r = 0;
> +
> + for (i = 0; i < target->n_pdds; i++) {
> + struct kfd_process_device *pdd = target->pdds[i];
> +
> + if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
> + r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
> +
> + if (r) {
> + target->runtime_info.runtime_state = (r == -EBUSY) ?
> + DEBUG_RUNTIME_STATE_ENABLED_BUSY :
> + DEBUG_RUNTIME_STATE_ENABLED_ERROR;
> +
> + goto unwind_err;
> + }
> + }
> +
> + /* Disable GFX OFF to prevent garbage read/writes to debug registers.
> + * If RLC restore of debug registers is not supported and runtime enable
> + * hasn't done so already on ttmp setup request, restore the trap config registers.
> + *
> + * If RLC restore of debug registers is not supported, keep gfx off disabled for
> + * the debug session.
> + */
> + amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> + if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
> + target->runtime_info.ttmp_setup))
> + pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
> + pdd->dev->vm_info.last_vmid_kfd);
> +
> + pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
> + pdd->dev->adev,
> + false,
> + pdd->dev->vm_info.last_vmid_kfd);
> +
> + if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
> + amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> + if (!pdd->dev->kfd->shared_resources.enable_mes)
> + r = debug_refresh_runlist(pdd->dev->dqm);
> + else
> + r = kfd_dbg_set_mes_debug_mode(pdd);
> +
> + if (r) {
> + target->runtime_info.runtime_state =
> + DEBUG_RUNTIME_STATE_ENABLED_ERROR;
> + goto unwind_err;
> + }
> + }
> +
> + return 0;
> +
> +unwind_err:
> + /* Enabling debug failed, we need to disable on
> + * all GPUs so the enable is all or nothing.
> + */
> + kfd_dbg_trap_deactivate(target, true, i);
> + return r;
> +}
> +
> int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> void __user *runtime_info, uint32_t *runtime_size)
> {
> struct file *f;
> uint32_t copy_size;
> - int r = 0;
> + int i, r = 0;
>
> if (target->debug_trap_enabled)
> return -EALREADY;
>
> + /* Enable pre-checks */
> + for (i = 0; i < target->n_pdds; i++) {
> + struct kfd_process_device *pdd = target->pdds[i];
> +
> + if (!KFD_IS_SOC15(pdd->dev))
> + return -ENODEV;
> +
> + if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
> + return -EBUSY;
> + }
> +
> copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
>
> f = fget(fd);
> @@ -62,6 +200,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
>
> target->dbg_ev_file = f;
>
> + /* defer activation to runtime if not runtime enabled */
> + if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
> + kfd_dbg_trap_activate(target);
> +
> /* We already hold the process reference but hold another one for the
> * debug session.
> */
> @@ -71,8 +213,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> if (target->debugger_process)
> atomic_inc(&target->debugger_process->debugged_process_count);
>
> - if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
> + if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
> + kfd_dbg_trap_deactivate(target, false, 0);
> r = -EFAULT;
> + }
>
> *runtime_size = sizeof(target->runtime_info);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index db6d72e7930f..17481f824647 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -34,4 +34,33 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_node *dev)
> return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
> }
>
> +/*
> + * If GFX off is enabled, chips that do not support RLC restore for the debug
> + * registers will disable GFX off temporarily for the entire debug session.
> + * See disable_on_trap_action_entry and enable_on_trap_action_exit for details.
> + */
> +static inline bool kfd_dbg_is_rlc_restore_supported(struct kfd_node *dev)
> +{
> + return !(KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 10) ||
> + KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1));
> +}
> +
> +static inline bool kfd_dbg_has_gws_support(struct kfd_node *dev)
> +{
> + if ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1)
> + && dev->kfd->mec2_fw_version < 0x81b6) ||
> + (KFD_GC_VERSION(dev) >= IP_VERSION(9, 1, 0)
> + && KFD_GC_VERSION(dev) <= IP_VERSION(9, 2, 2)
> + && dev->kfd->mec2_fw_version < 0x1b6) ||
> + (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0)
> + && dev->kfd->mec2_fw_version < 0x1b6) ||
> + (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1)
> + && dev->kfd->mec2_fw_version < 0x30) ||
> + (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) &&
> + KFD_GC_VERSION(dev) < IP_VERSION(12, 0, 0)))
> + return false;
> +
> + /* Assume debugging and cooperative launch supported otherwise. */
> + return true;
> +}
> #endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 0281f79fcd7d..a3846bb87510 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1165,9 +1165,19 @@ static void kfd_process_free_notifier(struct mmu_notifier *mn)
>
> static void kfd_process_notifier_release_internal(struct kfd_process *p)
> {
> + int i;
> +
> cancel_delayed_work_sync(&p->eviction_work);
> cancel_delayed_work_sync(&p->restore_work);
>
> + for (i = 0; i < p->n_pdds; i++) {
> + struct kfd_process_device *pdd = p->pdds[i];
> +
> + /* re-enable GFX OFF since runtime enable with ttmp setup disabled it. */
> + if (!kfd_dbg_is_rlc_restore_supported(pdd->dev) && p->runtime_info.ttmp_setup)
> + amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> + }
> +
> /* Indicate to other users that MM is no longer valid */
> p->mm = NULL;
> kfd_dbg_trap_disable(p);
More information about the amd-gfx
mailing list