[PATCH 16/34] drm/amdkfd: add per process hw trap enable and disable functions
Jonathan Kim
jonathan.kim at amd.com
Mon Mar 27 18:43:21 UTC 2023
To enable HW debug mode per process, all devices must be debug enabled
successfully. If a failure occures, rewind the enablement of debug mode
on the enabled devices.
A power management scenario that needs to be considered is HW
debug mode setting during GFXOFF. During GFXOFF, these registers
will be unreachable so we have to transiently disable GFXOFF when
setting. Also, some devices don't support the RLC save restore
function for these debug registers so we have to disable GFXOFF
completely during a debug session.
Cooperative launch also has debugging restriction based on HW/FW bugs.
If such bugs exists, the debugger cannot attach to a process that uses GWS
resources nor can GWS resources be requested if a process is being
debugged.
Multi-process debug devices can only enable trap temporaries based
on certain runtime scenerios, which will be explained when the
runtime enable functions are implemented in a follow up patch.
v3: make gfx off restore unconditional on detach and get rid of unneeded
count variable for unwind.
also move gfx11 workaround to later so we're not defining unused
functions and better illustrate the use of the workaround.
v2: add gfx11 support. fix fw checks. remove asic family name comments.
Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 +
drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 148 ++++++++++++++++++++++-
drivers/gpu/drm/amd/amdkfd/kfd_debug.h | 29 +++++
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 10 ++
4 files changed, 190 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 58c08d8a191f..c0c54206b1c2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1473,6 +1473,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
goto out_unlock;
}
+ if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
+ retval = -EBUSY;
+ goto out_unlock;
+ }
+
retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
mutex_unlock(&p->mutex);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 2bf2aa21eccc..fe5b965ac8f6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -21,13 +21,78 @@
*/
#include "kfd_debug.h"
+#include "kfd_device_queue_manager.h"
#include <linux/file.h>
+static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
+{
+ uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
+ uint32_t flags = pdd->process->dbg_flags;
+
+ if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+ return 0;
+
+ return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
+ pdd->watch_points, flags);
+}
+
+/* kfd_dbg_trap_deactivate:
+ * target: target process
+ * unwind: If this is unwinding a failed kfd_dbg_trap_enable()
+ * unwind_count:
+ * If unwind == true, how far down the pdd list we need
+ * to unwind
+ * else: ignored
+ */
+static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
+{
+ int i;
+
+ for (i = 0; i < target->n_pdds; i++) {
+ struct kfd_process_device *pdd = target->pdds[i];
+
+ /* If this is an unwind, and we have unwound the required
+ * enable calls on the pdd list, we need to stop now
+ * otherwise we may mess up another debugger session.
+ */
+ if (unwind && i == unwind_count)
+ break;
+
+ /* GFX off is already disabled by debug activate if not RLC restore supported. */
+ if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+ amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+ pdd->spi_dbg_override =
+ pdd->dev->kfd2kgd->disable_debug_trap(
+ pdd->dev->adev,
+ target->runtime_info.ttmp_setup,
+ pdd->dev->vm_info.last_vmid_kfd);
+ amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+ if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
+ release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
+ pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
+
+ if (!pdd->dev->shared_resources.enable_mes)
+ debug_refresh_runlist(pdd->dev->dqm);
+ else
+ kfd_dbg_set_mes_debug_mode(pdd);
+ }
+}
+
int kfd_dbg_trap_disable(struct kfd_process *target)
{
if (!target->debug_trap_enabled)
return 0;
+ /*
+ * Defer deactivation to runtime if runtime not enabled otherwise reset
+ * attached running target runtime state to enable for re-attach.
+ */
+ if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
+ kfd_dbg_trap_deactivate(target, false, 0);
+ else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
+ target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
+
fput(target->dbg_ev_file);
target->dbg_ev_file = NULL;
@@ -42,16 +107,89 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
return 0;
}
+static int kfd_dbg_trap_activate(struct kfd_process *target)
+{
+ int i, r = 0;
+
+ for (i = 0; i < target->n_pdds; i++) {
+ struct kfd_process_device *pdd = target->pdds[i];
+
+ if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
+ r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
+
+ if (r) {
+ target->runtime_info.runtime_state = (r == -EBUSY) ?
+ DEBUG_RUNTIME_STATE_ENABLED_BUSY :
+ DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+
+ goto unwind_err;
+ }
+ }
+
+ /* Disable GFX OFF to prevent garbage read/writes to debug registers.
+ * If RLC restore of debug registers is not supported and runtime enable
+ * hasn't done so already on ttmp setup request, restore the trap config registers.
+ *
+ * If RLC restore of debug registers is not supported, keep gfx off disabled for
+ * the debug session.
+ */
+ amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+ if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
+ target->runtime_info.ttmp_setup))
+ pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
+ pdd->dev->vm_info.last_vmid_kfd);
+
+ pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
+ pdd->dev->adev,
+ false,
+ pdd->dev->vm_info.last_vmid_kfd);
+
+ if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+ amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+ if (!pdd->dev->shared_resources.enable_mes)
+ r = debug_refresh_runlist(pdd->dev->dqm);
+ else
+ r = kfd_dbg_set_mes_debug_mode(pdd);
+
+ if (r) {
+ target->runtime_info.runtime_state =
+ DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+ goto unwind_err;
+ }
+ }
+
+ return 0;
+
+unwind_err:
+ /* Enabling debug failed, we need to disable on
+ * all GPUs so the enable is all or nothing.
+ */
+ kfd_dbg_trap_deactivate(target, true, i);
+ return r;
+}
+
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
void __user *runtime_info, uint32_t *runtime_size)
{
struct file *f;
uint32_t copy_size;
- int r = 0;
+ int i, r = 0;
if (target->debug_trap_enabled)
return -EALREADY;
+ /* Enable pre-checks */
+ for (i = 0; i < target->n_pdds; i++) {
+ struct kfd_process_device *pdd = target->pdds[i];
+
+ if (!KFD_IS_SOC15(pdd->dev))
+ return -ENODEV;
+
+ if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
+ return -EBUSY;
+ }
+
copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
f = fget(fd);
@@ -62,6 +200,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
target->dbg_ev_file = f;
+ /* defer activation to runtime if not runtime enabled */
+ if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
+ kfd_dbg_trap_activate(target);
+
/* We already hold the process reference but hold another one for the
* debug session.
*/
@@ -71,8 +213,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
if (target->debugger_process)
atomic_inc(&target->debugger_process->debugged_process_count);
- if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
+ if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
+ kfd_dbg_trap_deactivate(target, false, 0);
r = -EFAULT;
+ }
*runtime_size = sizeof(target->runtime_info);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 20b8a3e97d8e..68a95c8105e5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -34,4 +34,33 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
}
+/*
+ * If GFX off is enabled, chips that do not support RLC restore for the debug
+ * registers will disable GFX off temporarily for the entire debug session.
+ * See disable_on_trap_action_entry and enable_on_trap_action_exit for details.
+ */
+static inline bool kfd_dbg_is_rlc_restore_supported(struct kfd_dev *dev)
+{
+ return !(KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 10) ||
+ KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1));
+}
+
+static inline bool kfd_dbg_has_gws_support(struct kfd_dev *dev)
+{
+ if ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1)
+ && dev->mec2_fw_version < 0x81b6) ||
+ (KFD_GC_VERSION(dev) >= IP_VERSION(9, 1, 0)
+ && KFD_GC_VERSION(dev) <= IP_VERSION(9, 2, 2)
+ && dev->mec2_fw_version < 0x1b6) ||
+ (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0)
+ && dev->mec2_fw_version < 0x1b6) ||
+ (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1)
+ && dev->mec2_fw_version < 0x30) ||
+ (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) &&
+ KFD_GC_VERSION(dev) < IP_VERSION(12, 0, 0)))
+ return false;
+
+ /* Assume debugging and cooperative launch supported otherwise. */
+ return true;
+}
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index ac3b7c245807..51dde18c4eef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1159,9 +1159,19 @@ static void kfd_process_free_notifier(struct mmu_notifier *mn)
static void kfd_process_notifier_release_internal(struct kfd_process *p)
{
+ int i;
+
cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
+ for (i = 0; i < p->n_pdds; i++) {
+ struct kfd_process_device *pdd = p->pdds[i];
+
+ /* re-enable GFX OFF since runtime enable with ttmp setup disabled it. */
+ if (!kfd_dbg_is_rlc_restore_supported(pdd->dev) && p->runtime_info.ttmp_setup)
+ amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+ }
+
/* Indicate to other users that MM is no longer valid */
p->mm = NULL;
kfd_dbg_trap_disable(p);
--
2.25.1
More information about the amd-gfx
mailing list