[Patch v4 04/24] drm/amdkfd: CRIU Implement KFD process_info ioctl
Felix Kuehling
felix.kuehling at amd.com
Mon Jan 10 22:47:56 UTC 2022
On 2021-12-22 7:36 p.m., Rajneesh Bhardwaj wrote:
> This IOCTL is expected to be called as a precursor to the actual
> Checkpoint operation. This does the basic discovery into the target
> process seized by CRIU and relays the information to the userspace that
> utilizes it to start the Checkpoint operation via another dedicated
> IOCTL.
>
> The process_info IOCTL determines the number of GPUs and buffer objects
> that are associated with the target process, as well as its process id
> in the caller's namespace. The pid is needed because the /proc/pid/mem
> interface may be used to drain the contents of the discovered buffer
> objects in userspace, and getpid returns the pid of the CRIU dumper
> process. Also, the pid of a process inside a container might be
> different from its global pid, so return the ns pid.
>
> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> Signed-off-by: David Yat Sin <david.yatsin at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 55 +++++++++++++++++++++++-
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 14 ++++++
> 3 files changed, 70 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 1b863bd84c96..53d7a20e3c06 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1857,6 +1857,41 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
> }
> #endif
>
> +uint64_t get_process_num_bos(struct kfd_process *p)
> +{
> + uint64_t num_of_bos = 0, i;
> +
> + /* Run over all PDDs of the process */
> + for (i = 0; i < p->n_pdds; i++) {
> + struct kfd_process_device *pdd = p->pdds[i];
> + void *mem;
> + int id;
> +
> + idr_for_each_entry(&pdd->alloc_idr, mem, id) {
> + struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
> +
> + if ((uint64_t)kgd_mem->va > pdd->gpuvm_base)
> + num_of_bos++;
> + }
> + }
> + return num_of_bos;
> +}
> +
> +static void criu_get_process_object_info(struct kfd_process *p,
> + uint32_t *num_bos,
> + uint64_t *objs_priv_size)
> +{
> + uint64_t priv_size;
> +
> + *num_bos = get_process_num_bos(p);
> +
> + if (objs_priv_size) {
> + priv_size = sizeof(struct kfd_criu_process_priv_data);
> + priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
> + *objs_priv_size = priv_size;
> + }
> +}
> +
> static int criu_checkpoint(struct file *filep,
> struct kfd_process *p,
> struct kfd_ioctl_criu_args *args)
> @@ -1889,7 +1924,25 @@ static int criu_process_info(struct file *filep,
> struct kfd_process *p,
> struct kfd_ioctl_criu_args *args)
> {
> - return 0;
> + int ret = 0;
> +
> + mutex_lock(&p->mutex);
> +
> + if (!kfd_has_process_device_data(p)) {
> + pr_err("No pdd for given process\n");
> + ret = -ENODEV;
> + goto err_unlock;
> + }
> +
> + args->pid = task_pid_nr_ns(p->lead_thread,
> + task_active_pid_ns(p->lead_thread));
> +
> + criu_get_process_object_info(p, &args->num_bos, &args->priv_data_size);
> +
> + dev_dbg(kfd_device, "Num of bos:%u\n", args->num_bos);
> +err_unlock:
> + mutex_unlock(&p->mutex);
> + return ret;
> }
>
> static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index e68f692362bb..4d9bc7af03af 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -950,6 +950,8 @@ void *kfd_process_device_translate_handle(struct kfd_process_device *p,
> void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
> int handle);
>
> +bool kfd_has_process_device_data(struct kfd_process *p);
> +
> /* PASIDs */
> int kfd_pasid_init(void);
> void kfd_pasid_exit(void);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index d4c8a6948a9f..f77d556ca0fc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1456,6 +1456,20 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd,
> return 0;
> }
>
> +bool kfd_has_process_device_data(struct kfd_process *p)
> +{
> + int i;
> +
> + for (i = 0; i < p->n_pdds; i++) {
> + struct kfd_process_device *pdd = p->pdds[i];
I think checking p->n_pdds is sufficient. All the pdds with i < n_pdds
should be non-NULL.
Regards,
Felix
> +
> + if (pdd)
> + return true;
> + }
> +
> + return false;
> +}
> +
> struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
> struct kfd_process *p)
> {
More information about the dri-devel mailing list