[PATCH 13/29] drm/amdkfd: add per process hw trap enable and disable functions
Jonathan Kim
jonathan.kim at amd.com
Mon Oct 31 16:23:43 UTC 2022
To enable HW debug mode per process, all devices must be debug enabled
successfully. If a failure occures, rewind the enablement of debug mode
on the enabled devices.
A power management scenario that needs to be considered is HW
debug mode setting during GFXOFF. During GFXOFF, these registers
will be unreachable so we have to transiently disable GFXOFF when
setting. Also, some devices don't support the RLC save restore
function for these debug registers so we have to disable GFXOFF
completely during a debug session.
Cooperative launch also has debugging restriction based on FW bugs.
If such bugs exists, the debugger cannot attach to a process that uses GWS
resources nor can GWS resources be requested if a process is being
debugged.
Also multi-process debug devices can only enable trap temporaries based
on certain runtime scenerios, which will be explained when the
runtime enable functions are implemented in a follow up patch.
Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 +
drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 134 +++++++++++++++++-
drivers/gpu/drm/amd/amdkfd/kfd_debug.h | 23 +++
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 1 +
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 9 ++
5 files changed, 170 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d550dbe570fb..aeaedd0efa54 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1450,6 +1450,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
goto out_unlock;
}
+ if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
+ retval = -EBUSY;
+ goto out_unlock;
+ }
+
retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
mutex_unlock(&p->mutex);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index f967f89903f7..f5a5d17cde14 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -21,6 +21,7 @@
*/
#include "kfd_debug.h"
+#include "kfd_device_queue_manager.h"
#include <linux/file.h>
void debug_event_write_work_handler(struct work_struct *work)
@@ -37,8 +38,59 @@ void debug_event_write_work_handler(struct work_struct *work)
kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}
+/* kfd_dbg_trap_deactivate:
+ * target: target process
+ * unwind: If this is unwinding a failed kfd_dbg_trap_enable()
+ * unwind_count:
+ * If unwind == true, how far down the pdd list we need
+ * to unwind
+ * else: ignored
+ */
+static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
+{
+ int i, count = 0;
+
+ for (i = 0; i < target->n_pdds; i++) {
+ struct kfd_process_device *pdd = target->pdds[i];
+
+ /* If this is an unwind, and we have unwound the required
+ * enable calls on the pdd list, we need to stop now
+ * otherwise we may mess up another debugger session.
+ */
+ if (unwind && count == unwind_count)
+ break;
+
+ /* GFX off is already disabled by debug activate if not RLC restore supported. */
+ if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+ amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+ pdd->spi_dbg_override =
+ pdd->dev->kfd2kgd->disable_debug_trap(
+ pdd->dev->adev,
+ target->runtime_info.ttmp_setup,
+ pdd->dev->vm_info.last_vmid_kfd);
+ if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+ amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+ if (release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
+ pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
+
+ debug_refresh_runlist(pdd->dev->dqm);
+
+ count++;
+ }
+}
+
int kfd_dbg_trap_disable(struct kfd_process *target)
{
+ /*
+ * Defer deactivation to runtime if runtime not enabled otherwise reset
+ * attached running target runtime state to enable for re-attach.
+ */
+ if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
+ kfd_dbg_trap_deactivate(target, false, 0);
+ else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
+ target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
+
fput(target->dbg_ev_file);
target->dbg_ev_file = NULL;
@@ -53,16 +105,88 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
return 0;
}
+static int kfd_dbg_trap_activate(struct kfd_process *target)
+{
+ int i, r = 0, unwind_count = 0;
+
+ for (i = 0; i < target->n_pdds; i++) {
+ struct kfd_process_device *pdd = target->pdds[i];
+
+ if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
+ r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
+
+ if (r) {
+ target->runtime_info.runtime_state = (r == -EBUSY) ?
+ DEBUG_RUNTIME_STATE_ENABLED_BUSY :
+ DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+
+ goto unwind_err;
+ }
+ }
+
+ /* Disable GFX OFF to prevent garbage read/writes to debug registers.
+ * If RLC restore of debug registers is not supported and runtime enable
+ * hasn't done so already on ttmp setup request, restore the trap config registers.
+ *
+ * If RLC restore of debug registers is not supported, keep gfx off disabled for
+ * the debug session.
+ */
+ amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+ if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
+ target->runtime_info.ttmp_setup))
+ pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
+ pdd->dev->vm_info.last_vmid_kfd);
+
+ pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
+ pdd->dev->adev,
+ false,
+ pdd->dev->vm_info.last_vmid_kfd);
+
+ if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+ amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+ r = debug_refresh_runlist(pdd->dev->dqm);
+ if (r) {
+ target->runtime_info.runtime_state =
+ DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+ goto unwind_err;
+ }
+
+ /* Increment unwind_count as the last step */
+ unwind_count++;
+ }
+
+ return 0;
+
+unwind_err:
+ /* Enabling debug failed, we need to disable on
+ * all GPUs so the enable is all or nothing.
+ */
+ kfd_dbg_trap_deactivate(target, true, unwind_count);
+ return r;
+}
+
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
void __user *runtime_info, uint32_t *runtime_size)
{
struct file *f;
uint32_t copy_size;
- int r = 0;
+ int i, r = 0;
if (target->debug_trap_enabled)
return -EINVAL;
+ /* Enable pre-checks */
+ for (i = 0; i < target->n_pdds; i++) {
+ struct kfd_process_device *pdd = target->pdds[i];
+
+ if (!KFD_IS_SOC15(pdd->dev))
+ return -ENODEV;
+
+ if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
+ return -EBUSY;
+ }
+
copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
f = fget(fd);
@@ -73,6 +197,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
target->dbg_ev_file = f;
+ /* defer activation to runtime if not runtime enabled */
+ if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
+ kfd_dbg_trap_activate(target);
+
/* We already hold the process reference but hold another one for the
* debug session.
*/
@@ -82,8 +210,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
if (target->debugger_process)
atomic_inc(&target->debugger_process->debugged_process_count);
- if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
+ if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
+ kfd_dbg_trap_deactivate(target, false, 0);
r = -EFAULT;
+ }
*runtime_size = sizeof(target->runtime_info);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index b7ecd603f277..1053b7ca24c5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -40,4 +40,27 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
void debug_event_write_work_handler(struct work_struct *work);
+/*
+ * If GFX off is enabled, chips that do not support RLC restore for the debug
+ * registers will disable GFX off temporarily for the entire debug session.
+ * See disable_on_trap_action_entry and enable_on_trap_action_exit for details.
+ */
+static inline bool kfd_dbg_is_rlc_restore_supported(struct kfd_dev *dev)
+{
+ return !(KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 10) || /* Navi10 */
+ KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1)); /* Navi14 */
+}
+
+static inline bool kfd_dbg_has_gws_support(struct kfd_dev *dev)
+{
+ return ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1)
+ && dev->mec2_fw_version >= 0x81b6) ||
+ (KFD_GC_VERSION(dev) >= IP_VERSION(9, 1, 0)
+ && KFD_GC_VERSION(dev) <= IP_VERSION(9, 2, 2)
+ && dev->mec2_fw_version >= 0x1b6) ||
+ (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0)
+ && dev->mec2_fw_version >= 0x1b6) ||
+ (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1)
+ && dev->mec2_fw_version >= 0x30));
+}
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 1634cc2ee202..bf4787b4dc6c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -36,6 +36,7 @@
#include "kfd_kernel_queue.h"
#include "amdgpu_amdkfd.h"
#include "mes_api_def.h"
+#include "kfd_debug.h"
/* Size of the per-pipe EOP queue */
#define CIK_HPD_EOP_BYTES_LOG2 11
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a57a55f6ccee..928fe5c42c1e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1161,6 +1161,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
struct mm_struct *mm)
{
struct kfd_process *p;
+ int i;
/*
* The kfd_process structure can not be free because the
@@ -1178,6 +1179,14 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
+ for (i = 0; i < p->n_pdds; i++) {
+ struct kfd_process_device *pdd = p->pdds[i];
+
+ /* re-enable GFX OFF since runtime enable with ttmp setup disabled it. */
+ if (!kfd_dbg_is_rlc_restore_supported(pdd->dev) && p->runtime_info.ttmp_setup)
+ amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+ }
+
if (p->debug_trap_enabled)
kfd_dbg_trap_disable(p);
--
2.25.1
More information about the amd-gfx
mailing list