[PATCH 03/29] drm/amdkfd: prepare per-process debug enable and disable
Felix Kuehling
felix.kuehling at amd.com
Tue Nov 22 23:31:39 UTC 2022
On 2022-10-31 12:23, Jonathan Kim wrote:
> The ROCm debugger will attach to a process to debug by PTRACE and will
> expect the KFD to prepare a process for the target PID, whether the
> target PID has opened the KFD device or not.
>
> This patch is to explicitly handle this requirement. Further HW mode
> setting and runtime coordination requirements will be handled in
> following patches.
>
> In the case where the target process has not opened the KFD device,
> a new KFD process must be created for the target PID.
> The debugger as well as the target process for this case will not have
> acquired any VMs, so handle process restoration to correctly account for
> this.
>
> To coordinate with HSA runtime, the debugger must be aware of the target
> process' runtime enablement status and will copy the runtime status
> information into the debugged KFD process for later query.
>
> On enablement, the debugger will subscribe to a set of exceptions where
> each exception event will notify the debugger through a pollable FIFO
> file descriptor that the debugger provides to the KFD to manage.
> Some events will be synchronously raised while others are scheduled,
> which is why a debug_event_workarea worker is initialized.
>
> Finally on process termination of either the debugger or the target,
> debugging must be disabled if that has not already been done.
>
> v2: relax debug trap disable and PTRACE ATTACH requirement.
One potential bug and one nit-pick inline ...
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/Makefile | 3 +-
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 88 +++++++++++++++++-
> drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 91 +++++++++++++++++++
> drivers/gpu/drm/amd/amdkfd/kfd_debug.h | 33 +++++++
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 24 ++++-
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 34 ++++++-
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 65 +++++++++----
> 7 files changed, 309 insertions(+), 29 deletions(-)
> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index e758c2a24cd0..747754428073 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -55,7 +55,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
> $(AMDKFD_PATH)/kfd_int_process_v9.o \
> $(AMDKFD_PATH)/kfd_int_process_v11.o \
> $(AMDKFD_PATH)/kfd_smi_events.o \
> - $(AMDKFD_PATH)/kfd_crat.o
> + $(AMDKFD_PATH)/kfd_crat.o \
> + $(AMDKFD_PATH)/kfd_debug.o
>
> ifneq ($(CONFIG_AMD_IOMMU_V2),)
> AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 11a960c83fb2..d550dbe570fb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -44,6 +44,7 @@
> #include "amdgpu_amdkfd.h"
> #include "kfd_smi_events.h"
> #include "amdgpu_dma_buf.h"
> +#include "kfd_debug.h"
>
> static long kfd_ioctl(struct file *, unsigned int, unsigned long);
> static int kfd_open(struct inode *, struct file *);
> @@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct file *filep)
> return -EPERM;
> }
>
> - process = kfd_create_process(filep);
> + process = kfd_create_process(current);
> if (IS_ERR(process))
> return PTR_ERR(process);
>
> + if (kfd_process_init_cwsr_apu(process, filep)) {
> + kfd_unref_process(process);
> + return -EFAULT;
> + }
> +
> if (kfd_is_locked()) {
> dev_dbg(kfd_device, "kfd is locked!\n"
> "process %d unreferenced", process->pasid);
> @@ -2652,6 +2658,9 @@ static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, v
> static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
> {
> struct kfd_ioctl_dbg_trap_args *args = data;
> + struct task_struct *thread = NULL;
> + struct pid *pid = NULL;
> + struct kfd_process *target = NULL;
> int r = 0;
>
> if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> @@ -2659,9 +2668,71 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
> return -EINVAL;
> }
>
> + pid = find_get_pid(args->pid);
> + if (!pid) {
> + pr_debug("Cannot find pid info for %i\n", args->pid);
> + r = -ESRCH;
> + goto out;
> + }
> +
> + thread = get_pid_task(pid, PIDTYPE_PID);
> +
> + if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
> + bool create_process;
> +
> + rcu_read_lock();
> + create_process = thread && thread != current && ptrace_parent(thread) == current;
> + rcu_read_unlock();
> +
> + target = create_process ? kfd_create_process(thread) :
> + kfd_lookup_process_by_pid(pid);
> + } else {
> + target = kfd_lookup_process_by_pid(pid);
> + }
> +
> + if (!target) {
> + pr_debug("Cannot find process PID %i to debug\n", args->pid);
> + r = -ESRCH;
> + goto out;
> + }
> +
> + /* Check if target is still PTRACED. */
> + rcu_read_lock();
> + if (target != p && args->op == KFD_IOC_DBG_TRAP_DISABLE
> + && ptrace_parent(target->lead_thread) != current) {
Should this say args->op != KFD_IOC_DBG_TRAP_DISABLE? I think that's the
only op that would be allowed when the process is not ptrace attached.
> + pr_err("PID %i is not PTRACED and cannot be debugged\n", args->pid);
> + r = -EPERM;
> + }
> + rcu_read_unlock();
> +
> + if (r)
> + goto out;
> +
> + mutex_lock(&target->mutex);
> +
> + if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target->debug_trap_enabled) {
> + pr_err("PID %i not debug enabled for op %i\n", args->pid, args->op);
> + r = -EINVAL;
> + goto unlock_out;
> + }
> +
> switch (args->op) {
> case KFD_IOC_DBG_TRAP_ENABLE:
> + if (target != p)
> + target->debugger_process = p;
> +
> + r = kfd_dbg_trap_enable(target,
> + args->enable.dbg_fd,
> + (void __user *)args->enable.rinfo_ptr,
> + &args->enable.rinfo_size);
> + if (!r)
> + target->exception_enable_mask = args->enable.exception_mask;
> +
> + pr_warn("Debug functions limited\n");
> + break;
> case KFD_IOC_DBG_TRAP_DISABLE:
> + r = kfd_dbg_trap_disable(target);
> + break;
> case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
> case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
> case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> @@ -2675,7 +2746,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
> case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
> case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> - pr_warn("Debugging not supported yet\n");
> + pr_warn("Debug op %i not supported yet\n", args->op);
> r = -EACCES;
> break;
> default:
> @@ -2683,6 +2754,19 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
> r = -EINVAL;
> }
>
> +unlock_out:
> + mutex_unlock(&target->mutex);
> +
> +out:
> + if (thread)
> + put_task_struct(thread);
> +
> + if (pid)
> + put_pid(pid);
> +
> + if (target)
> + kfd_unref_process(target);
> +
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> new file mode 100644
> index 000000000000..f967f89903f7
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -0,0 +1,91 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include "kfd_debug.h"
> +#include <linux/file.h>
> +
> +void debug_event_write_work_handler(struct work_struct *work)
> +{
> + struct kfd_process *process;
> +
> + static const char write_data = '.';
> + loff_t pos = 0;
> +
> + process = container_of(work,
> + struct kfd_process,
> + debug_event_workarea);
> +
> + kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
> +}
> +
> +int kfd_dbg_trap_disable(struct kfd_process *target)
> +{
> + fput(target->dbg_ev_file);
> + target->dbg_ev_file = NULL;
> +
> + if (target->debugger_process) {
> + atomic_dec(&target->debugger_process->debugged_process_count);
> + target->debugger_process = NULL;
> + }
> +
> + target->debug_trap_enabled = false;
> + kfd_unref_process(target);
> +
> + return 0;
> +}
> +
> +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> + void __user *runtime_info, uint32_t *runtime_size)
> +{
> + struct file *f;
> + uint32_t copy_size;
> + int r = 0;
> +
> + if (target->debug_trap_enabled)
> + return -EINVAL;
> +
> + copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
> +
> + f = fget(fd);
> + if (!f) {
> + pr_err("Failed to get file for (%i)\n", fd);
> + return -EBADF;
> + }
> +
> + target->dbg_ev_file = f;
> +
> + /* We already hold the process reference but hold another one for the
> + * debug session.
> + */
> + kref_get(&target->ref);
> + target->debug_trap_enabled = true;
> +
> + if (target->debugger_process)
> + atomic_inc(&target->debugger_process->debugged_process_count);
> +
> + if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
> + r = -EFAULT;
> +
> + *runtime_size = sizeof(target->runtime_info);
> +
> + return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> new file mode 100644
> index 000000000000..b2217eb1399c
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -0,0 +1,33 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
> +#define KFD_DEBUG_EVENTS_H_INCLUDED
> +
> +#include "kfd_priv.h"
> +
> +int kfd_dbg_trap_disable(struct kfd_process *target);
> +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> + void __user *runtime_info,
> + uint32_t *runtime_info_size);
> +void debug_event_write_work_handler(struct work_struct *work);
> +#endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index ecb4c3abc629..faa5d8c666ee 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -979,6 +979,14 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
> goto out;
>
> pdd = qpd_to_pdd(qpd);
> +
> + /* The debugger creates processes that temporarily have not acquired
> + * all VMs for all devices and has no VMs itself.
> + * Skip queue eviction on process eviction.
> + */
> + if (!pdd->drm_priv)
> + goto out;
> +
> pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
> pdd->process->pasid);
>
> @@ -1100,13 +1108,10 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
> {
> struct queue *q;
> struct kfd_process_device *pdd;
> - uint64_t pd_base;
> uint64_t eviction_duration;
> int retval = 0;
>
> pdd = qpd_to_pdd(qpd);
> - /* Retrieve PD base */
> - pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
>
> dqm_lock(dqm);
> if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
> @@ -1116,12 +1121,19 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
> goto out;
> }
>
> + /* The debugger creates processes that temporarily have not acquired
> + * all VMs for all devices and has no VMs itself.
> + * Skip queue restore on process restore.
> + */
> + if (!pdd->drm_priv)
> + goto vm_not_acquired;
> +
> pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
> pdd->process->pasid);
>
> /* Update PD Base in QPD */
> - qpd->page_table_base = pd_base;
> - pr_debug("Updated PD address to 0x%llx\n", pd_base);
> + qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
> + pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);
>
> /* activate all active queues on the qpd */
> list_for_each_entry(q, &qpd->queues_list, list) {
> @@ -1147,6 +1159,8 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
> qpd->evicted = 0;
> eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
> atomic64_add(eviction_duration, &pdd->evict_duration_counter);
> +vm_not_acquired:
> + qpd->evicted = 0;
qpd->evicted = 0 is duplicated here. It's only needed in one place.
Regards,
Felix
> out:
> dqm_unlock(dqm);
> return retval;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index bf610e3b683b..3ea61fa1db52 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -886,19 +886,48 @@ struct kfd_process {
> */
> unsigned long last_restore_timestamp;
>
> + /* Indicates device process is debug attached with reserved vmid. */
> + bool debug_trap_enabled;
> +
> + /* per-process-per device debug event fd file */
> + struct file *dbg_ev_file;
> +
> + /* If the process is a kfd debugger, we need to know so we can clean
> + * up at exit time. If a process enables debugging on itself, it does
> + * its own clean-up, so we don't set the flag here. We track this by
> + * counting the number of processes this process is debugging.
> + */
> + atomic_t debugged_process_count;
> +
> + /* If the process is being debugged, this is the debugger process */
> + struct kfd_process *debugger_process;
> +
> /* Kobj for our procfs */
> struct kobject *kobj;
> struct kobject *kobj_queues;
> struct attribute attr_pasid;
>
> + /* Keep track cwsr init */
> + bool has_cwsr;
> +
> + /* Exception code enable mask and status */
> + uint64_t exception_enable_mask;
> +
> /* shared virtual memory registered by this process */
> struct svm_range_list svms;
>
> bool xnack_enabled;
>
> + /* Work area for debugger event writer worker. */
> + struct work_struct debug_event_workarea;
> +
> atomic_t poison;
> /* Queues are in paused stated because we are in the process of doing a CRIU checkpoint */
> bool queues_paused;
> +
> + /* Tracks runtime enable status */
> + struct kfd_runtime_info runtime_info;
> +
> };
>
> #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
> @@ -928,7 +957,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
>
> int kfd_process_create_wq(void);
> void kfd_process_destroy_wq(void);
> -struct kfd_process *kfd_create_process(struct file *filep);
> +struct kfd_process *kfd_create_process(struct task_struct *thread);
> struct kfd_process *kfd_get_process(const struct task_struct *task);
> struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
> struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
> @@ -1055,6 +1084,9 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
> uint64_t tba_addr,
> uint64_t tma_addr);
>
> +/* CWSR initialization */
> +int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
> +
> /* CRIU */
> /*
> * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 951b63677248..56ad38fcd26e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -44,6 +44,7 @@ struct mm_struct;
> #include "kfd_iommu.h"
> #include "kfd_svm.h"
> #include "kfd_smi_events.h"
> +#include "kfd_debug.h"
>
> /*
> * List of struct kfd_process (field kfd_process).
> @@ -69,7 +70,6 @@ static struct kfd_process *find_process(const struct task_struct *thread,
> bool ref);
> static void kfd_process_ref_release(struct kref *ref);
> static struct kfd_process *create_process(const struct task_struct *thread);
> -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
>
> static void evict_process_worker(struct work_struct *work);
> static void restore_process_worker(struct work_struct *work);
> @@ -798,18 +798,19 @@ static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
> kfd_process_free_gpuvm(qpd->ib_mem, pdd, qpd->ib_kaddr);
> }
>
> -struct kfd_process *kfd_create_process(struct file *filep)
> +struct kfd_process *kfd_create_process(struct task_struct *thread)
> {
> struct kfd_process *process;
> - struct task_struct *thread = current;
> int ret;
>
> - if (!thread->mm)
> + if (!(thread->mm && mmget_not_zero(thread->mm)))
> return ERR_PTR(-EINVAL);
>
> /* Only the pthreads threading model is supported. */
> - if (thread->group_leader->mm != thread->mm)
> + if (thread->group_leader->mm != thread->mm) {
> + mmput(thread->mm);
> return ERR_PTR(-EINVAL);
> + }
>
> /*
> * take kfd processes mutex before starting of process creation
> @@ -827,10 +828,6 @@ struct kfd_process *kfd_create_process(struct file *filep)
> if (IS_ERR(process))
> goto out;
>
> - ret = kfd_process_init_cwsr_apu(process, filep);
> - if (ret)
> - goto out_destroy;
> -
> if (!procfs.kobj)
> goto out;
>
> @@ -864,16 +861,9 @@ struct kfd_process *kfd_create_process(struct file *filep)
> if (!IS_ERR(process))
> kref_get(&process->ref);
> mutex_unlock(&kfd_processes_mutex);
> + mmput(thread->mm);
>
> return process;
> -
> -out_destroy:
> - hash_del_rcu(&process->kfd_processes);
> - mutex_unlock(&kfd_processes_mutex);
> - synchronize_srcu(&kfd_processes_srcu);
> - /* kfd_process_free_notifier will trigger the cleanup */
> - mmu_notifier_put(&process->mmu_notifier);
> - return ERR_PTR(ret);
> }
>
> struct kfd_process *kfd_get_process(const struct task_struct *thread)
> @@ -1188,6 +1178,28 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
> cancel_delayed_work_sync(&p->eviction_work);
> cancel_delayed_work_sync(&p->restore_work);
>
> + if (p->debug_trap_enabled)
> + kfd_dbg_trap_disable(p);
> +
> + if (atomic_read(&p->debugged_process_count) > 0) {
> + struct kfd_process *target;
> + unsigned int temp;
> + int idx = srcu_read_lock(&kfd_processes_srcu);
> +
> + hash_for_each_rcu(kfd_processes_table, temp, target, kfd_processes) {
> + if (target->debugger_process && target->debugger_process == p) {
> + mutex_lock_nested(&target->mutex, 1);
> + if (target->debug_trap_enabled)
> + kfd_dbg_trap_disable(target);
> + mutex_unlock(&target->mutex);
> + if (atomic_read(&p->debugged_process_count) == 0)
> + break;
> + }
> + }
> +
> + srcu_read_unlock(&kfd_processes_srcu, idx);
> + }
> +
> /* Indicate to other users that MM is no longer valid */
> p->mm = NULL;
>
> @@ -1200,11 +1212,14 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
> .free_notifier = kfd_process_free_notifier,
> };
>
> -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
> +int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
> {
> unsigned long offset;
> int i;
>
> + if (p->has_cwsr)
> + return 0;
> +
> for (i = 0; i < p->n_pdds; i++) {
> struct kfd_dev *dev = p->pdds[i]->dev;
> struct qcm_process_device *qpd = &p->pdds[i]->qpd;
> @@ -1233,6 +1248,8 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
> qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> }
>
> + p->has_cwsr = true;
> +
> return 0;
> }
>
> @@ -1375,6 +1392,10 @@ static struct kfd_process *create_process(const struct task_struct *thread)
> if (err)
> goto err_event_init;
> process->is_32bit_user_mode = in_compat_syscall();
> + process->debug_trap_enabled = false;
> + process->debugger_process = NULL;
> + process->exception_enable_mask = 0;
> + atomic_set(&process->debugged_process_count, 0);
>
> process->pasid = kfd_pasid_alloc();
> if (process->pasid == 0) {
> @@ -1422,6 +1443,8 @@ static struct kfd_process *create_process(const struct task_struct *thread)
> kfd_unref_process(process);
> get_task_struct(process->lead_thread);
>
> + INIT_WORK(&process->debug_event_workarea, debug_event_write_work_handler);
> +
> return process;
>
> err_register_notifier:
> @@ -1894,8 +1917,10 @@ static void restore_process_worker(struct work_struct *work)
> */
>
> p->last_restore_timestamp = get_jiffies_64();
> - ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
> - &p->ef);
> + /* VMs may not have been acquired yet during debugging. */
> + if (p->kgd_process_info)
> + ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
> + &p->ef);
> if (ret) {
> pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
> p->pasid, PROCESS_BACK_OFF_TIME_MS);
More information about the amd-gfx
mailing list