[PATCH 8/9] amdkfd: identify a secondary kfd process by its id

Felix Kuehling felix.kuehling at amd.com
Mon Jul 28 19:48:08 UTC 2025


On 2025-07-24 22:43, Zhu Lingshan wrote:
> This commit introduces a new id field for
> struct kfd process, which helps identify
> a kfd process among multiple contexts that
> all belong to a single user space program.
>
> The sysfs entry of a secondary kfd process
> is placed under the sysfs entry folder of
> its primary kfd process.
>
> The naming format of the sysfs entry of a secondary
> kfd process is "context_%u" where %u is the process id.
>
> Signed-off-by: Zhu Lingshan <lingshan.zhu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  6 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 82 +++++++++++++++++++++++-
>   2 files changed, 85 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index de701d72aa5c..a6e12c705734 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -995,6 +995,9 @@ struct kfd_process {
>   	/* Tracks debug per-vmid request for debug flags */
>   	u32 dbg_flags;
>   
> +	/* kfd process id */
> +	u16 id;
> +
>   	atomic_t poison;
>   	/* Queues are in paused stated because we are in the process of doing a CRIU checkpoint */
>   	bool queues_paused;
> @@ -1009,6 +1012,9 @@ struct kfd_process {
>   
>   	/* indicating whether this is a primary kfd_process */
>   	bool primary;
> +
> +	/* The primary kfd_process allocating IDs for its secondary kfd_process, 0 for primary kfd_process */
> +	struct ida id_table;
>   };
>   
>   #define KFD_PROCESS_TABLE_SIZE 8 /* bits: 256 entries */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 440fde75d1e4..e1ba9015bb83 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -54,6 +54,9 @@ DEFINE_MUTEX(kfd_processes_mutex);
>   
>   DEFINE_SRCU(kfd_processes_srcu);
>   
> +#define KFD_PROCESS_ID_MIN 1
> +#define KFD_PROCESS_ID_WIDTH 16
> +
>   /* For process termination handling */
>   static struct workqueue_struct *kfd_process_wq;
>   
> @@ -827,6 +830,7 @@ static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
>   
>   int kfd_create_process_sysfs(struct kfd_process *process)
>   {
> +	struct kfd_process *primary_process;
>   	int ret;
>   
>   	if (process->kobj) {
> @@ -839,9 +843,22 @@ int kfd_create_process_sysfs(struct kfd_process *process)
>   		pr_warn("Creating procfs kobject failed");
>   		return -ENOMEM;
>   	}
> -	ret = kobject_init_and_add(process->kobj, &procfs_type,
> -				   procfs.kobj, "%d",
> -				   (int)process->lead_thread->pid);
> +
> +	if (process->primary)
> +		ret = kobject_init_and_add(process->kobj, &procfs_type,
> +					   procfs.kobj, "%d",
> +					   (int)process->lead_thread->pid);
> +	else {
> +		primary_process = kfd_lookup_process_by_mm(process->lead_thread->mm);
> +		if (!primary_process)
> +			return -EFAULT;

EFAULT means "Bad address". A better error code would be ESRCH "No such 
process".


> +
> +		ret = kobject_init_and_add(process->kobj, &procfs_type,
> +					   primary_process->kobj, "context_%u",
> +					   process->id);
> +		kfd_unref_process(primary_process);
> +	}
> +
>   	if (ret) {
>   		pr_warn("Creating procfs pid directory failed");
>   		kobject_put(process->kobj);
> @@ -863,6 +880,51 @@ int kfd_create_process_sysfs(struct kfd_process *process)
>   	return 0;
>   }
>   
> +static int kfd_process_alloc_id(struct kfd_process *process)
> +{
> +	u16 ret;
> +	struct kfd_process *primary_process;
> +
> +	if (process->primary) {
> +		process->id = 0;
> +
> +		return 0;
> +	}
> +
> +	primary_process = kfd_lookup_process_by_mm(process->lead_thread->mm);
> +	if (!primary_process)
> +		return -EFAULT;

ESRCH


> +
> +	ret = ida_alloc_range(&primary_process->id_table, KFD_PROCESS_ID_MIN,
> +	     1 << (KFD_PROCESS_ID_WIDTH - 1), GFP_KERNEL);

Did you mean (1 << KFD_PROCESS_ID_WIDTH) - 1? That would give you the 
range from 1 to 0xffff, which is what I'd expect with 16-bit wide ID.


> +	if (ret < 0)
> +		return ret;

You're leaking a process reference here.


> +
> +	process->id = ret;
> +
> +	kfd_unref_process(primary_process);
> +
> +	return 0;
> +}
> +
> +static void kfd_process_free_id(struct kfd_process *process)
> +{
> +	struct kfd_process *primary_process;
> +
> +	if (process->primary)
> +		return;
> +
> +	primary_process = kfd_lookup_process_by_mm(process->lead_thread->mm);
> +	if (!primary_process)
> +		return;
> +
> +	ida_free(&primary_process->id_table, process->id);
> +
> +	kfd_unref_process(primary_process);
> +
> +	return;

This return statement is unnecessary.


> +}
> +
>   struct kfd_process *kfd_create_process(struct task_struct *thread)
>   {
>   	struct kfd_process *process;
> @@ -1193,6 +1255,11 @@ static void kfd_process_wq_release(struct work_struct *work)
>   	if (ef)
>   		dma_fence_signal(ef);
>   
> +	if (!p->primary)
> +		kfd_process_free_id(p);
> +	else
> +		ida_destroy(&p->id_table);
> +
>   	kfd_process_remove_sysfs(p);
>   	kfd_debugfs_remove_process(p);
>   
> @@ -1549,6 +1616,12 @@ static struct kfd_process *create_process(const struct task_struct *thread, bool
>   	process->queues_paused = false;
>   	process->primary = primary;
>   
> +	err = kfd_process_alloc_id(process);
> +	if (err) {
> +		pr_err("Creating kfd process: failed to alloc an id\n");
> +		goto err_alloc_process;

That's the wrong label for cleanup. You'd end up leaking the process 
structure. You need to create a new label. See below.


> +	}
> +
>   	INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
>   	INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
>   	process->last_restore_timestamp = get_jiffies_64();
> @@ -1599,6 +1672,8 @@ static struct kfd_process *create_process(const struct task_struct *thread, bool
>   			goto err_register_notifier;
>   		}
>   		BUG_ON(mn != &process->mmu_notifier);
> +
> +		ida_init(&process->id_table);
>   	}
>   
>   	kfd_unref_process(process);
> @@ -1619,6 +1694,7 @@ static struct kfd_process *create_process(const struct task_struct *thread, bool
>   err_process_pqm_init:
>   	kfd_event_free_process(process);
>   err_event_init:
> +	kfd_process_free_id(process);

You should add a new label here

err_alloc_id:

Regards,
   Felix


>   	mutex_destroy(&process->mutex);
>   	kfree(process);
>   err_alloc_process:


More information about the amd-gfx mailing list