[Patch v4 04/24] drm/amdkfd: CRIU Implement KFD process_info ioctl

Felix Kuehling felix.kuehling at amd.com
Mon Jan 10 22:47:56 UTC 2022


On 2021-12-22 7:36 p.m., Rajneesh Bhardwaj wrote:
> This IOCTL is expected to be called as a precursor to the actual
> Checkpoint operation. It performs basic discovery of the target
> process seized by CRIU and relays the information to userspace, which
> uses it to start the Checkpoint operation via another dedicated
> IOCTL.
>
> The process_info IOCTL determines the number of GPUs and buffer objects
> that are associated with the target process, and its process id in the
> caller's namespace. The pid is needed since the /proc/pid/mem interface
> may be used to drain the contents of the discovered buffer objects in
> userspace, and getpid returns the pid of the CRIU dumper process. Also,
> the pid of a process inside a container might be different from its
> global pid, so return the ns pid.
>
> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> Signed-off-by: David Yat Sin <david.yatsin at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 55 +++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  2 +
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 14 ++++++
>   3 files changed, 70 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 1b863bd84c96..53d7a20e3c06 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1857,6 +1857,41 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
>   }
>   #endif
>   
> +uint64_t get_process_num_bos(struct kfd_process *p)
> +{
> +	uint64_t num_of_bos = 0, i;
> +
> +	/* Run over all PDDs of the process */
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +		void *mem;
> +		int id;
> +
> +		idr_for_each_entry(&pdd->alloc_idr, mem, id) {
> +			struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
> +
> +			if ((uint64_t)kgd_mem->va > pdd->gpuvm_base)
> +				num_of_bos++;
> +		}
> +	}
> +	return num_of_bos;
> +}
> +
> +static void criu_get_process_object_info(struct kfd_process *p,
> +					 uint32_t *num_bos,
> +					 uint64_t *objs_priv_size)
> +{
> +	uint64_t priv_size;
> +
> +	*num_bos = get_process_num_bos(p);
> +
> +	if (objs_priv_size) {
> +		priv_size = sizeof(struct kfd_criu_process_priv_data);
> +		priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
> +		*objs_priv_size = priv_size;
> +	}
> +}
> +
>   static int criu_checkpoint(struct file *filep,
>   			   struct kfd_process *p,
>   			   struct kfd_ioctl_criu_args *args)
> @@ -1889,7 +1924,25 @@ static int criu_process_info(struct file *filep,
>   				struct kfd_process *p,
>   				struct kfd_ioctl_criu_args *args)
>   {
> -	return 0;
> +	int ret = 0;
> +
> +	mutex_lock(&p->mutex);
> +
> +	if (!kfd_has_process_device_data(p)) {
> +		pr_err("No pdd for given process\n");
> +		ret = -ENODEV;
> +		goto err_unlock;
> +	}
> +
> +	args->pid = task_pid_nr_ns(p->lead_thread,
> +					task_active_pid_ns(p->lead_thread));
> +
> +	criu_get_process_object_info(p, &args->num_bos, &args->priv_data_size);
> +
> +	dev_dbg(kfd_device, "Num of bos:%u\n", args->num_bos);
> +err_unlock:
> +	mutex_unlock(&p->mutex);
> +	return ret;
>   }
>   
>   static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index e68f692362bb..4d9bc7af03af 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -950,6 +950,8 @@ void *kfd_process_device_translate_handle(struct kfd_process_device *p,
>   void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
>   					int handle);
>   
> +bool kfd_has_process_device_data(struct kfd_process *p);
> +
>   /* PASIDs */
>   int kfd_pasid_init(void);
>   void kfd_pasid_exit(void);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index d4c8a6948a9f..f77d556ca0fc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1456,6 +1456,20 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd,
>   	return 0;
>   }
>   
> +bool kfd_has_process_device_data(struct kfd_process *p)
> +{
> +	int i;
> +
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];

I think checking p->n_pdds is sufficient. All the pdds with i < n_pdds 
should be non-NULL.

Regards,
   Felix


> +
> +		if (pdd)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
>   struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
>   							struct kfd_process *p)
>   {


More information about the dri-devel mailing list