<!DOCTYPE html><html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<pre>Thanks, Felix. All issues have been fixed, and I will send out V2 soon.
Thanks
Lingshan</pre>
<div class="moz-cite-prefix">On 7/29/2025 3:48 AM, Felix Kuehling
wrote:<br>
</div>
<blockquote type="cite" cite="mid:51264616-aca9-4b82-822e-7c8856b47629@amd.com">On
2025-07-24 22:43, Zhu Lingshan wrote:
<br>
<blockquote type="cite">This commit introduces a new id field for
<br>
struct kfd process, which helps identify
<br>
a kfd process among multiple contexts that
<br>
all belong to a single user space program.
<br>
<br>
The sysfs entry of a secondary kfd process
<br>
is placed under the sysfs entry folder of
<br>
its primary kfd process.
<br>
<br>
The naming format of the sysfs entry of a secondary
<br>
kfd process is "context_%u" where %u is the process id.
<br>
<br>
Signed-off-by: Zhu Lingshan <a class="moz-txt-link-rfc2396E" href="mailto:lingshan.zhu@amd.com"><lingshan.zhu@amd.com></a>
<br>
---
<br>
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 ++
<br>
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 82
+++++++++++++++++++++++-
<br>
2 files changed, 85 insertions(+), 3 deletions(-)
<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
<br>
index de701d72aa5c..a6e12c705734 100644
<br>
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
<br>
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
<br>
@@ -995,6 +995,9 @@ struct kfd_process {
<br>
/* Tracks debug per-vmid request for debug flags */
<br>
u32 dbg_flags;
<br>
+ /* kfd process id */
<br>
+ u16 id;
<br>
+
<br>
atomic_t poison;
<br>
/* Queues are in paused stated because we are in the
process of doing a CRIU checkpoint */
<br>
bool queues_paused;
<br>
@@ -1009,6 +1012,9 @@ struct kfd_process {
<br>
/* indicating whether this is a primary kfd_process */
<br>
bool primary;
<br>
+
<br>
+ /* The primary kfd_process allocating IDs for its secondary
kfd_process, 0 for primary kfd_process */
<br>
+ struct ida id_table;
<br>
};
<br>
#define KFD_PROCESS_TABLE_SIZE 8 /* bits: 256 entries */
<br>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
<br>
index 440fde75d1e4..e1ba9015bb83 100644
<br>
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
<br>
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
<br>
@@ -54,6 +54,9 @@ DEFINE_MUTEX(kfd_processes_mutex);
<br>
DEFINE_SRCU(kfd_processes_srcu);
<br>
+#define KFD_PROCESS_ID_MIN 1
<br>
+#define KFD_PROCESS_ID_WIDTH 16
<br>
+
<br>
/* For process termination handling */
<br>
static struct workqueue_struct *kfd_process_wq;
<br>
@@ -827,6 +830,7 @@ static void
kfd_process_device_destroy_ib_mem(struct kfd_process_device
*pdd)
<br>
int kfd_create_process_sysfs(struct kfd_process *process)
<br>
{
<br>
+ struct kfd_process *primary_process;
<br>
int ret;
<br>
if (process->kobj) {
<br>
@@ -839,9 +843,22 @@ int kfd_create_process_sysfs(struct
kfd_process *process)
<br>
pr_warn("Creating procfs kobject failed");
<br>
return -ENOMEM;
<br>
}
<br>
- ret = kobject_init_and_add(process->kobj,
&procfs_type,
<br>
- procfs.kobj, "%d",
<br>
- (int)process->lead_thread->pid);
<br>
+
<br>
+ if (process->primary)
<br>
+ ret = kobject_init_and_add(process->kobj,
&procfs_type,
<br>
+ procfs.kobj, "%d",
<br>
+ (int)process->lead_thread->pid);
<br>
+ else {
<br>
+ primary_process =
kfd_lookup_process_by_mm(process->lead_thread->mm);
<br>
+ if (!primary_process)
<br>
+ return -EFAULT;
<br>
</blockquote>
<br>
EFAULT means "Bad address". A better error code would be ESRCH "No
such process".
<br>
<br>
<br>
<blockquote type="cite">+
<br>
+ ret = kobject_init_and_add(process->kobj,
&procfs_type,
<br>
+ primary_process->kobj, "context_%u",
<br>
+ process->id);
<br>
+ kfd_unref_process(primary_process);
<br>
+ }
<br>
+
<br>
if (ret) {
<br>
pr_warn("Creating procfs pid directory failed");
<br>
kobject_put(process->kobj);
<br>
@@ -863,6 +880,51 @@ int kfd_create_process_sysfs(struct
kfd_process *process)
<br>
return 0;
<br>
}
<br>
+static int kfd_process_alloc_id(struct kfd_process *process)
<br>
+{
<br>
+ u16 ret;
<br>
+ struct kfd_process *primary_process;
<br>
+
<br>
+ if (process->primary) {
<br>
+ process->id = 0;
<br>
+
<br>
+ return 0;
<br>
+ }
<br>
+
<br>
+ primary_process =
kfd_lookup_process_by_mm(process->lead_thread->mm);
<br>
+ if (!primary_process)
<br>
+ return -EFAULT;
<br>
</blockquote>
<br>
ESRCH
<br>
<br>
<br>
<blockquote type="cite">+
<br>
+ ret = ida_alloc_range(&primary_process->id_table,
KFD_PROCESS_ID_MIN,
<br>
+ 1 << (KFD_PROCESS_ID_WIDTH - 1), GFP_KERNEL);
<br>
</blockquote>
<br>
Did you mean (1 << KFD_PROCESS_ID_WIDTH) - 1? That would
give you the range from 1 to 0xffff, which is what I'd expect with
16-bit wide ID.
<br>
<br>
<br>
<blockquote type="cite">+ if (ret < 0)
<br>
+ return ret;
<br>
</blockquote>
<br>
You're leaking a process reference here.
<br>
<br>
<br>
<blockquote type="cite">+
<br>
+ process->id = ret;
<br>
+
<br>
+ kfd_unref_process(primary_process);
<br>
+
<br>
+ return 0;
<br>
+}
<br>
+
<br>
+static void kfd_process_free_id(struct kfd_process *process)
<br>
+{
<br>
+ struct kfd_process *primary_process;
<br>
+
<br>
+ if (process->primary)
<br>
+ return;
<br>
+
<br>
+ primary_process =
kfd_lookup_process_by_mm(process->lead_thread->mm);
<br>
+ if (!primary_process)
<br>
+ return;
<br>
+
<br>
+ ida_free(&primary_process->id_table,
process->id);
<br>
+
<br>
+ kfd_unref_process(primary_process);
<br>
+
<br>
+ return;
<br>
</blockquote>
<br>
This return statement is unnecessary.
<br>
<br>
<br>
<blockquote type="cite">+}
<br>
+
<br>
struct kfd_process *kfd_create_process(struct task_struct
*thread)
<br>
{
<br>
struct kfd_process *process;
<br>
@@ -1193,6 +1255,11 @@ static void kfd_process_wq_release(struct
work_struct *work)
<br>
if (ef)
<br>
dma_fence_signal(ef);
<br>
+ if (!p->primary)
<br>
+ kfd_process_free_id(p);
<br>
+ else
<br>
+ ida_destroy(&p->id_table);
<br>
+
<br>
kfd_process_remove_sysfs(p);
<br>
kfd_debugfs_remove_process(p);
<br>
@@ -1549,6 +1616,12 @@ static struct kfd_process
*create_process(const struct task_struct *thread, bool
<br>
process->queues_paused = false;
<br>
process->primary = primary;
<br>
+ err = kfd_process_alloc_id(process);
<br>
+ if (err) {
<br>
+ pr_err("Creating kfd process: failed to alloc an
id\n");
<br>
+ goto err_alloc_process;
<br>
</blockquote>
<br>
That's the wrong label for cleanup. You'd end up leaking the
process structure. You need to create a new label. See below.
<br>
<br>
<br>
<blockquote type="cite">+ }
<br>
+
<br>
INIT_DELAYED_WORK(&process->eviction_work,
evict_process_worker);
<br>
INIT_DELAYED_WORK(&process->restore_work,
restore_process_worker);
<br>
process->last_restore_timestamp = get_jiffies_64();
<br>
@@ -1599,6 +1672,8 @@ static struct kfd_process
*create_process(const struct task_struct *thread, bool
<br>
goto err_register_notifier;
<br>
}
<br>
BUG_ON(mn != &process->mmu_notifier);
<br>
+
<br>
+ ida_init(&process->id_table);
<br>
}
<br>
kfd_unref_process(process);
<br>
@@ -1619,6 +1694,7 @@ static struct kfd_process
*create_process(const struct task_struct *thread, bool
<br>
err_process_pqm_init:
<br>
kfd_event_free_process(process);
<br>
err_event_init:
<br>
+ kfd_process_free_id(process);
<br>
</blockquote>
<br>
You should add a new label here
<br>
<br>
err_alloc_id:
<br>
<br>
Regards,
<br>
Felix
<br>
<br>
<br>
<blockquote type="cite">
mutex_destroy(&process->mutex);
<br>
kfree(process);
<br>
err_alloc_process:
<br>
</blockquote>
</blockquote>
</body>
</html>