[PATCH 03/32] drm/amdkfd: prepare per-process debug enable and disable
Kim, Jonathan
Jonathan.Kim at amd.com
Thu Mar 23 19:12:48 UTC 2023
[Public]
> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling at amd.com>
> Sent: Thursday, February 16, 2023 6:44 PM
> To: Kim, Jonathan <Jonathan.Kim at amd.com>; amd-
> gfx at lists.freedesktop.org; dri-devel at lists.freedesktop.org
> Subject: Re: [PATCH 03/32] drm/amdkfd: prepare per-process debug enable
> and disable
>
>
> On 2023-01-25 14:53, Jonathan Kim wrote:
> > The ROCm debugger will attach to a process to debug by PTRACE and will
> > expect the KFD to prepare a process for the target PID, whether the
> > target PID has opened the KFD device or not.
> >
> > This patch is to explicitly handle this requirement. Further HW mode
> > setting and runtime coordination requirements will be handled in
> > following patches.
> >
> > In the case where the target process has not opened the KFD device,
> > a new KFD process must be created for the target PID.
> > In this case, neither the debugger nor the target process will have
> > acquired any VMs, so handle process restoration to correctly account for
> > this.
> >
> > To coordinate with HSA runtime, the debugger must be aware of the target
> > process' runtime enablement status and will copy the runtime status
> > information into the debugged KFD process for later query.
> >
> > On enablement, the debugger will subscribe to a set of exceptions where
> > each exception event will notify the debugger through a pollable FIFO
> > file descriptor that the debugger provides to the KFD to manage.
> > Some events will be synchronously raised while others are scheduled,
> > which is why a debug_event_workarea worker is initialized.
> >
> > Finally, on process termination of either the debugger or the target,
> > debugging must be disabled if that has not already been done.
> >
> > v3: fix typo on debug trap disable and PTRACE ATTACH relax check.
> > remove unnecessary queue eviction counter reset when there's nothing
> > to evict.
> > change err code to EALREADY if attaching to an already attached process.
> > move debug disable to release worker to avoid race with disable from
> > ioctl call.
> >
> > v2: relax debug trap disable and PTRACE ATTACH requirement.
> >
> > Signed-off-by: Jonathan Kim<jonathan.kim at amd.com>
> > ---
> > drivers/gpu/drm/amd/amdkfd/Makefile | 3 +-
> > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 88 ++++++++++++++++-
> > drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 94
> +++++++++++++++++++
> > drivers/gpu/drm/amd/amdkfd/kfd_debug.h | 33 +++++++
> > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 22 ++++-
> > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 34 ++++++-
> > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++----
> > 7 files changed, 308 insertions(+), 29 deletions(-)
> > create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile
> b/drivers/gpu/drm/amd/amdkfd/Makefile
> > index e758c2a24cd0..747754428073 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> > +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> > @@ -55,7 +55,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
> > $(AMDKFD_PATH)/kfd_int_process_v9.o \
> > $(AMDKFD_PATH)/kfd_int_process_v11.o \
> > $(AMDKFD_PATH)/kfd_smi_events.o \
> > - $(AMDKFD_PATH)/kfd_crat.o
> > + $(AMDKFD_PATH)/kfd_crat.o \
> > + $(AMDKFD_PATH)/kfd_debug.o
> >
> > ifneq ($(CONFIG_AMD_IOMMU_V2),)
> > AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > index d3b019e64093..ee05c2e54ef6 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > @@ -44,6 +44,7 @@
> > #include "amdgpu_amdkfd.h"
> > #include "kfd_smi_events.h"
> > #include "amdgpu_dma_buf.h"
> > +#include "kfd_debug.h"
> >
> > static long kfd_ioctl(struct file *, unsigned int, unsigned long);
> > static int kfd_open(struct inode *, struct file *);
> > @@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct
> file *filep)
> > return -EPERM;
> > }
> >
> > - process = kfd_create_process(filep);
> > + process = kfd_create_process(current);
> > if (IS_ERR(process))
> > return PTR_ERR(process);
> >
> > + if (kfd_process_init_cwsr_apu(process, filep)) {
> > + kfd_unref_process(process);
> > + return -EFAULT;
> > + }
> > +
> > if (kfd_is_locked()) {
> > dev_dbg(kfd_device, "kfd is locked!\n"
> > "process %d unreferenced", process->pasid);
> > @@ -2653,6 +2659,9 @@ static int kfd_ioctl_runtime_enable(struct file
> *filep, struct kfd_process *p, v
> > static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process
> *p, void *data)
> > {
> > struct kfd_ioctl_dbg_trap_args *args = data;
> > + struct task_struct *thread = NULL;
> > + struct pid *pid = NULL;
> > + struct kfd_process *target = NULL;
> > int r = 0;
> >
> > if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> > @@ -2660,9 +2669,71 @@ static int kfd_ioctl_set_debug_trap(struct file
> *filep, struct kfd_process *p, v
> > return -EINVAL;
> > }
> >
> > + pid = find_get_pid(args->pid);
> > + if (!pid) {
> > + pr_debug("Cannot find pid info for %i\n", args->pid);
> > + r = -ESRCH;
> > + goto out;
> > + }
> > +
> > + thread = get_pid_task(pid, PIDTYPE_PID);
> > +
> > + if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
> > + bool create_process;
> > +
> > + rcu_read_lock();
> > + create_process = thread && thread != current &&
> ptrace_parent(thread) == current;
> > + rcu_read_unlock();
> > +
> > + target = create_process ? kfd_create_process(thread) :
> > + kfd_lookup_process_by_pid(pid);
> > + } else {
> > + target = kfd_lookup_process_by_pid(pid);
> > + }
> > +
> > + if (!target) {
> > + pr_debug("Cannot find process PID %i to debug\n", args-
> >pid);
> > + r = -ESRCH;
> > + goto out;
> > + }
> > +
> > + /* Check if target is still PTRACED. */
> > + rcu_read_lock();
> > + if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
> > + && ptrace_parent(target->lead_thread) !=
> current) {
> > + pr_err("PID %i is not PTRACED and cannot be debugged\n",
> args->pid);
> > + r = -EPERM;
> > + }
> > + rcu_read_unlock();
> > +
> > + if (r)
> > + goto out;
> > +
> > + mutex_lock(&target->mutex);
> > +
> > + if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target-
> >debug_trap_enabled) {
> > + pr_err("PID %i not debug enabled for op %i\n", args->pid,
> args->op);
> > + r = -EINVAL;
> > + goto unlock_out;
> > + }
> > +
> > switch (args->op) {
> > case KFD_IOC_DBG_TRAP_ENABLE:
> > + if (target != p)
> > + target->debugger_process = p;
> > +
> > + r = kfd_dbg_trap_enable(target,
> > + args->enable.dbg_fd,
> > + (void __user *)args->enable.rinfo_ptr,
> > + &args->enable.rinfo_size);
> > + if (!r)
> > + target->exception_enable_mask = args-
> >enable.exception_mask;
> > +
> > + pr_warn("Debug functions limited\n");
> > + break;
> > case KFD_IOC_DBG_TRAP_DISABLE:
> > + r = kfd_dbg_trap_disable(target);
> > + break;
> > case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
> > case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
> > case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> > @@ -2676,7 +2747,7 @@ static int kfd_ioctl_set_debug_trap(struct file
> *filep, struct kfd_process *p, v
> > case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
> > case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> > case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> > - pr_warn("Debugging not supported yet\n");
> > + pr_warn("Debug op %i not supported yet\n", args->op);
> > r = -EACCES;
> > break;
> > default:
> > @@ -2684,6 +2755,19 @@ static int kfd_ioctl_set_debug_trap(struct file
> *filep, struct kfd_process *p, v
> > r = -EINVAL;
> > }
> >
> > +unlock_out:
> > + mutex_unlock(&target->mutex);
> > +
> > +out:
> > + if (thread)
> > + put_task_struct(thread);
> > +
> > + if (pid)
> > + put_pid(pid);
> > +
> > + if (target)
> > + kfd_unref_process(target);
> > +
> > return r;
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > new file mode 100644
> > index 000000000000..f6ea6db266b4
> > --- /dev/null
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > @@ -0,0 +1,94 @@
> > +/*
> > + * Copyright 2022 Advanced Micro Devices, Inc.
> > + *
> > + * Permission is hereby granted, free of charge, to any person obtaining a
> > + * copy of this software and associated documentation files (the
> "Software"),
> > + * to deal in the Software without restriction, including without limitation
> > + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > + * and/or sell copies of the Software, and to permit persons to whom the
> > + * Software is furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice shall be included
> in
> > + * all copies or substantial portions of the Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
> KIND, EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
> EVENT SHALL
> > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> OTHERWISE,
> > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> THE USE OR
> > + * OTHER DEALINGS IN THE SOFTWARE.
> > + */
> > +
> > +#include "kfd_debug.h"
> > +#include <linux/file.h>
> > +
> > +void debug_event_write_work_handler(struct work_struct *work)
> > +{
> > + struct kfd_process *process;
> > +
> > + static const char write_data = '.';
> > + loff_t pos = 0;
> > +
> > + process = container_of(work,
> > + struct kfd_process,
> > + debug_event_workarea);
> > +
> > + kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
> > +}
> > +
> > +int kfd_dbg_trap_disable(struct kfd_process *target)
> > +{
> > + if (!target->debug_trap_enabled)
> > + return 0;
> > +
> > + fput(target->dbg_ev_file);
> > + target->dbg_ev_file = NULL;
> > +
> > + if (target->debugger_process) {
> > + atomic_dec(&target->debugger_process-
> >debugged_process_count);
> > + target->debugger_process = NULL;
> > + }
> > +
> > + target->debug_trap_enabled = false;
> > + kfd_unref_process(target);
> > +
> > + return 0;
> > +}
> > +
> > +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> > + void __user *runtime_info, uint32_t *runtime_size)
> > +{
> > + struct file *f;
> > + uint32_t copy_size;
> > + int r = 0;
> > +
> > + if (target->debug_trap_enabled)
> > + return -EALREADY;
> > +
> > + copy_size = min((size_t)(*runtime_size), sizeof(target-
> >runtime_info));
> > +
> > + f = fget(fd);
> > + if (!f) {
> > + pr_err("Failed to get file for (%i)\n", fd);
> > + return -EBADF;
> > + }
> > +
> > + target->dbg_ev_file = f;
> > +
> > + /* We already hold the process reference but hold another one for
> the
> > + * debug session.
> > + */
> > + kref_get(&target->ref);
> > + target->debug_trap_enabled = true;
> > +
> > + if (target->debugger_process)
> > + atomic_inc(&target->debugger_process-
> >debugged_process_count);
> > +
> > + if (copy_to_user(runtime_info, (void *)&target->runtime_info,
> copy_size))
> > + r = -EFAULT;
> > +
> > + *runtime_size = sizeof(target->runtime_info);
> > +
> > + return r;
> > +}
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > new file mode 100644
> > index 000000000000..b2217eb1399c
> > --- /dev/null
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > @@ -0,0 +1,33 @@
> > +/*
> > + * Copyright 2022 Advanced Micro Devices, Inc.
> > + *
> > + * Permission is hereby granted, free of charge, to any person obtaining a
> > + * copy of this software and associated documentation files (the
> "Software"),
> > + * to deal in the Software without restriction, including without limitation
> > + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > + * and/or sell copies of the Software, and to permit persons to whom the
> > + * Software is furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice shall be included
> in
> > + * all copies or substantial portions of the Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
> KIND, EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
> EVENT SHALL
> > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> OTHERWISE,
> > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> THE USE OR
> > + * OTHER DEALINGS IN THE SOFTWARE.
> > + */
> > +
> > +#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
> > +#define KFD_DEBUG_EVENTS_H_INCLUDED
> > +
> > +#include "kfd_priv.h"
> > +
> > +int kfd_dbg_trap_disable(struct kfd_process *target);
> > +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> > + void __user *runtime_info,
> > + uint32_t *runtime_info_size);
> > +void debug_event_write_work_handler(struct work_struct *work);
> > +#endif
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index c06ada0844ba..a2ac98d06e71 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -979,6 +979,14 @@ static int evict_process_queues_cpsch(struct
> device_queue_manager *dqm,
> > goto out;
> >
> > pdd = qpd_to_pdd(qpd);
> > +
> > + /* The debugger creates processes that temporarily have not
> acquired
> > + * all VMs for all devices and has no VMs itself.
> > + * Skip queue eviction on process eviction.
> > + */
> > + if (!pdd->drm_priv)
> > + goto out;
> > +
> This should be before qpd->
Sorry, I didn't quite catch what you were saying here (did your comment get cut off?).
Did you mean the pdd->drm_priv check needs to go before the if (qpd->evicted++ > 0) /* already evicted, do nothing */ check?
Thanks,
Jon
> > pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
> > pdd->process->pasid);
> >
> > @@ -1100,13 +1108,10 @@ static int restore_process_queues_cpsch(struct
> device_queue_manager *dqm,
> > {
> > struct queue *q;
> > struct kfd_process_device *pdd;
> > - uint64_t pd_base;
> > uint64_t eviction_duration;
> > int retval = 0;
> >
> > pdd = qpd_to_pdd(qpd);
> > - /* Retrieve PD base */
> > - pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd-
> >drm_priv);
> >
> > dqm_lock(dqm);
> > if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing
> */
> > @@ -1116,12 +1121,19 @@ static int restore_process_queues_cpsch(struct
> device_queue_manager *dqm,
> > goto out;
> > }
> >
> > + /* The debugger creates processes that temporarily have not
> acquired
> > + * all VMs for all devices and has no VMs itself.
> > + * Skip queue restore on process restore.
> > + */
> > + if (!pdd->drm_priv)
> > + goto out;
> > +
>
> I had a comment here that "qpd->evicted = 0;" was duplicated. It is
> still needed in this case. Otherwise the process will end up being
> created with all queues in an evicted state and no way to execute
> anything on the GPU.
>
> You only need one instance of "qpd->evicted = 0;", but it needs to be in
> the right place (after the vm_not_acquired label you had in v1 of this
> patch).
>
> Regards,
> Felix
>
>
> > pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
> > pdd->process->pasid);
> >
> > /* Update PD Base in QPD */
> > - qpd->page_table_base = pd_base;
> > - pr_debug("Updated PD address to 0x%llx\n", pd_base);
> > + qpd->page_table_base =
> amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
> > + pr_debug("Updated PD address to 0x%llx\n", qpd-
> >page_table_base);
> >
> > /* activate all active queues on the qpd */
> > list_for_each_entry(q, &qpd->queues_list, list) {
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index bfa30d12406b..62b75ba28425 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -886,19 +886,48 @@ struct kfd_process {
> > */
> > unsigned long last_restore_timestamp;
> >
> > + /* Indicates device process is debug attached with reserved vmid. */
> > + bool debug_trap_enabled;
> > +
> > + /* per-process-per device debug event fd file */
> > + struct file *dbg_ev_file;
> > +
> > + /* If the process is a kfd debugger, we need to know so we can clean
> > + * up at exit time. If a process enables debugging on itself, it does
> > + * its own clean-up, so we don't set the flag here. We track this by
> > + * counting the number of processes this process is debugging.
> > + */
> > + atomic_t debugged_process_count;
> > +
> > + /* If the process is a debugged, this is the debugger process */
> > + struct kfd_process *debugger_process;
> > +
> > /* Kobj for our procfs */
> > struct kobject *kobj;
> > struct kobject *kobj_queues;
> > struct attribute attr_pasid;
> >
> > + /* Keep track cwsr init */
> > + bool has_cwsr;
> > +
> > + /* Exception code enable mask and status */
> > + uint64_t exception_enable_mask;
> > +
> > /* shared virtual memory registered by this process */
> > struct svm_range_list svms;
> >
> > bool xnack_enabled;
> >
> > + /* Work area for debugger event writer worker. */
> > + struct work_struct debug_event_workarea;
> > +
> > atomic_t poison;
> > /* Queues are in paused stated because we are in the process of
> doing a CRIU checkpoint */
> > bool queues_paused;
> > +
> > + /* Tracks runtime enable status */
> > + struct kfd_runtime_info runtime_info;
> > +
> > };
> >
> > #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
> > @@ -928,7 +957,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
> >
> > int kfd_process_create_wq(void);
> > void kfd_process_destroy_wq(void);
> > -struct kfd_process *kfd_create_process(struct file *filep);
> > +struct kfd_process *kfd_create_process(struct task_struct *thread);
> > struct kfd_process *kfd_get_process(const struct task_struct *task);
> > struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
> > struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct
> *mm);
> > @@ -1055,6 +1084,9 @@ void kfd_process_set_trap_handler(struct
> qcm_process_device *qpd,
> > uint64_t tba_addr,
> > uint64_t tma_addr);
> >
> > +/* CWSR initialization */
> > +int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file
> *filep);
> > +
> > /* CRIU */
> > /*
> > * Need to increment KFD_CRIU_PRIV_VERSION each time a change is
> made to any of the CRIU private
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > index 72df6286e240..e935158ab311 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > @@ -44,6 +44,7 @@ struct mm_struct;
> > #include "kfd_iommu.h"
> > #include "kfd_svm.h"
> > #include "kfd_smi_events.h"
> > +#include "kfd_debug.h"
> >
> > /*
> > * List of struct kfd_process (field kfd_process).
> > @@ -69,7 +70,6 @@ static struct kfd_process *find_process(const struct
> task_struct *thread,
> > bool ref);
> > static void kfd_process_ref_release(struct kref *ref);
> > static struct kfd_process *create_process(const struct task_struct
> *thread);
> > -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file
> *filep);
> >
> > static void evict_process_worker(struct work_struct *work);
> > static void restore_process_worker(struct work_struct *work);
> > @@ -798,18 +798,19 @@ static void
> kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
> > kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr);
> > }
> >
> > -struct kfd_process *kfd_create_process(struct file *filep)
> > +struct kfd_process *kfd_create_process(struct task_struct *thread)
> > {
> > struct kfd_process *process;
> > - struct task_struct *thread = current;
> > int ret;
> >
> > - if (!thread->mm)
> > + if (!(thread->mm && mmget_not_zero(thread->mm)))
> > return ERR_PTR(-EINVAL);
> >
> > /* Only the pthreads threading model is supported. */
> > - if (thread->group_leader->mm != thread->mm)
> > + if (thread->group_leader->mm != thread->mm) {
> > + mmput(thread->mm);
> > return ERR_PTR(-EINVAL);
> > + }
> >
> > /*
> > * take kfd processes mutex before starting of process creation
> > @@ -827,10 +828,6 @@ struct kfd_process *kfd_create_process(struct file
> *filep)
> > if (IS_ERR(process))
> > goto out;
> >
> > - ret = kfd_process_init_cwsr_apu(process, filep);
> > - if (ret)
> > - goto out_destroy;
> > -
> > if (!procfs.kobj)
> > goto out;
> >
> > @@ -864,16 +861,9 @@ struct kfd_process *kfd_create_process(struct file
> *filep)
> > if (!IS_ERR(process))
> > kref_get(&process->ref);
> > mutex_unlock(&kfd_processes_mutex);
> > + mmput(thread->mm);
> >
> > return process;
> > -
> > -out_destroy:
> > - hash_del_rcu(&process->kfd_processes);
> > - mutex_unlock(&kfd_processes_mutex);
> > - synchronize_srcu(&kfd_processes_srcu);
> > - /* kfd_process_free_notifier will trigger the cleanup */
> > - mmu_notifier_put(&process->mmu_notifier);
> > - return ERR_PTR(ret);
> > }
> >
> > struct kfd_process *kfd_get_process(const struct task_struct *thread)
> > @@ -1115,6 +1105,26 @@ static void kfd_process_wq_release(struct
> work_struct *work)
> > struct kfd_process *p = container_of(work, struct kfd_process,
> > release_work);
> >
> > + kfd_dbg_trap_disable(p);
> > +
> > + if (atomic_read(&p->debugged_process_count) > 0) {
> > + struct kfd_process *target;
> > + unsigned int temp;
> > + int idx = srcu_read_lock(&kfd_processes_srcu);
> > +
> > + hash_for_each_rcu(kfd_processes_table, temp, target,
> kfd_processes) {
> > + if (target->debugger_process && target-
> >debugger_process == p) {
> > + mutex_lock(&target->mutex);
> > + kfd_dbg_trap_disable(target);
> > + mutex_unlock(&target->mutex);
> > + if (atomic_read(&p-
> >debugged_process_count) == 0)
> > + break;
> > + }
> > + }
> > +
> > + srcu_read_unlock(&kfd_processes_srcu, idx);
> > + }
> > +
> > kfd_process_dequeue_from_all_devices(p);
> > pqm_uninit(&p->pqm);
> >
> > @@ -1200,11 +1210,14 @@ static const struct mmu_notifier_ops
> kfd_process_mmu_notifier_ops = {
> > .free_notifier = kfd_process_free_notifier,
> > };
> >
> > -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file
> *filep)
> > +int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
> > {
> > unsigned long offset;
> > int i;
> >
> > + if (p->has_cwsr)
> > + return 0;
> > +
> > for (i = 0; i < p->n_pdds; i++) {
> > struct kfd_dev *dev = p->pdds[i]->dev;
> > struct qcm_process_device *qpd = &p->pdds[i]->qpd;
> > @@ -1233,6 +1246,8 @@ static int kfd_process_init_cwsr_apu(struct
> kfd_process *p, struct file *filep)
> > qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> > }
> >
> > + p->has_cwsr = true;
> > +
> > return 0;
> > }
> >
> > @@ -1375,6 +1390,10 @@ static struct kfd_process *create_process(const
> struct task_struct *thread)
> > if (err)
> > goto err_event_init;
> > process->is_32bit_user_mode = in_compat_syscall();
> > + process->debug_trap_enabled = false;
> > + process->debugger_process = NULL;
> > + process->exception_enable_mask = 0;
> > + atomic_set(&process->debugged_process_count, 0);
> >
> > process->pasid = kfd_pasid_alloc();
> > if (process->pasid == 0) {
> > @@ -1422,6 +1441,8 @@ static struct kfd_process *create_process(const
> struct task_struct *thread)
> > kfd_unref_process(process);
> > get_task_struct(process->lead_thread);
> >
> > + INIT_WORK(&process->debug_event_workarea,
> debug_event_write_work_handler);
> > +
> > return process;
> >
> > err_register_notifier:
> > @@ -1908,8 +1929,10 @@ static void restore_process_worker(struct
> work_struct *work)
> > */
> >
> > p->last_restore_timestamp = get_jiffies_64();
> > - ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p-
> >kgd_process_info,
> > - &p->ef);
> > + /* VMs may not have been acquired yet during debugging. */
> > + if (p->kgd_process_info)
> > + ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p-
> >kgd_process_info,
> > + &p->ef);
> > if (ret) {
> > pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d
> ms\n",
> > p->pasid, PROCESS_BACK_OFF_TIME_MS);
More information about the amd-gfx
mailing list