[Patch v4 03/24] drm/amdkfd: CRIU Introduce Checkpoint-Restore APIs

Felix Kuehling felix.kuehling at amd.com
Mon Jan 10 22:08:25 UTC 2022


On 2021-12-22 7:36 p.m., Rajneesh Bhardwaj wrote:
> Checkpoint-Restore in userspace (CRIU) is a powerful tool that can
> snapshot a running process and later restore it on same or a remote
> machine but expects the processes that have a device file (e.g. GPU)
> associated with them, provide necessary driver support to assist CRIU
> and its extensible plugin interface. Thus, In order to support the
> Checkpoint-Restore of any ROCm process, the AMD Radeon Open Compute
> Kernel driver, needs to provide a set of new APIs that provide
> necessary VRAM metadata and its contents to a userspace component
> (CRIU plugin) that can store it in form of image files.
>
> This introduces some new ioctls which will be used to checkpoint-Restore
> any KFD bound user process. KFD doesn't allow any arbitrary ioctl call
> unless it is called by the group leader process. Since these ioctls are
> expected to be called from a KFD criu plugin which has elevated ptrace
> attached privileges and CAP_CHECKPOINT_RESTORE capabilities attached with
> the file descriptors so modify KFD to allow such calls.
>
> (API redesigned by David Yat Sin)
> Suggested-by: Felix Kuehling <felix.kuehling at amd.com>
> Signed-off-by: David Yat Sin <david.yatsin at amd.com>
> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 94 +++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    | 65 +++++++++++++++-
>   include/uapi/linux/kfd_ioctl.h           | 79 +++++++++++++++++++-
>   3 files changed, 235 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 4bfc0c8ab764..1b863bd84c96 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -33,6 +33,7 @@
>   #include <linux/time.h>
>   #include <linux/mm.h>
>   #include <linux/mman.h>
> +#include <linux/ptrace.h>
>   #include <linux/dma-buf.h>
>   #include <asm/processor.h>
>   #include "kfd_priv.h"
> @@ -1856,6 +1857,75 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
>   }
>   #endif
>   
> +static int criu_checkpoint(struct file *filep,
> +			   struct kfd_process *p,
> +			   struct kfd_ioctl_criu_args *args)
> +{
> +	return 0;
> +}
> +
> +static int criu_restore(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args)
> +{
> +	return 0;
> +}
> +
> +static int criu_unpause(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args)
> +{
> +	return 0;
> +}
> +
> +static int criu_resume(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args)
> +{
> +	return 0;
> +}
> +
> +static int criu_process_info(struct file *filep,
> +				struct kfd_process *p,
> +				struct kfd_ioctl_criu_args *args)
> +{
> +	return 0;
> +}
> +
> +static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
> +{
> +	struct kfd_ioctl_criu_args *args = data;
> +	int ret;
> +
> +	dev_dbg(kfd_device, "CRIU operation: %d\n", args->op);
> +	switch (args->op) {
> +	case KFD_CRIU_OP_PROCESS_INFO:
> +		ret = criu_process_info(filep, p, args);
> +		break;
> +	case KFD_CRIU_OP_CHECKPOINT:
> +		ret = criu_checkpoint(filep, p, args);
> +		break;
> +	case KFD_CRIU_OP_UNPAUSE:
> +		ret = criu_unpause(filep, p, args);
> +		break;
> +	case KFD_CRIU_OP_RESTORE:
> +		ret = criu_restore(filep, p, args);
> +		break;
> +	case KFD_CRIU_OP_RESUME:
> +		ret = criu_resume(filep, p, args);
> +		break;
> +	default:
> +		dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", args->op);
> +		ret = -EINVAL;
> +		break;
> +	}
> +
> +	if (ret)
> +		dev_dbg(kfd_device, "CRIU operation:%d err:%d\n", args->op, ret);
> +
> +	return ret;
> +}
> +
>   #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
>   	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
>   			    .cmd_drv = 0, .name = #ioctl}
> @@ -1959,6 +2029,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>   
>   	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE,
>   			kfd_ioctl_set_xnack_mode, 0),
> +
> +	AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_OP,
> +			kfd_ioctl_criu, KFD_IOC_FLAG_CHECKPOINT_RESTORE),
>   };
>   
>   #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
> @@ -1973,6 +2046,7 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
>   	char *kdata = NULL;
>   	unsigned int usize, asize;
>   	int retcode = -EINVAL;
> +	bool ptrace_attached = false;
>   
>   	if (nr >= AMDKFD_CORE_IOCTL_COUNT)
>   		goto err_i1;
> @@ -1998,7 +2072,15 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
>   	 * processes need to create their own KFD device context.
>   	 */
>   	process = filep->private_data;
> -	if (process->lead_thread != current->group_leader) {
> +
> +	rcu_read_lock();
> +	if ((ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE) &&
> +	    ptrace_parent(process->lead_thread) == current)
> +		ptrace_attached = true;
> +	rcu_read_unlock();
> +
> +	if (process->lead_thread != current->group_leader
> +	    && !ptrace_attached) {
>   		dev_dbg(kfd_device, "Using KFD FD in wrong process\n");
>   		retcode = -EBADF;
>   		goto err_i1;
> @@ -2013,6 +2095,16 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
>   		goto err_i1;
>   	}
>   
> +	/*
> +	 * Versions of docker shipped in Ubuntu 18.xx and 20.xx do not support
> +	 * CAP_CHECKPOINT_RESTORE, so we also allow access if CAP_SYS_ADMIN as CAP_SYS_ADMIN is a
> +	 * more priviledged access.
> +	 */
> +	if (unlikely(ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE)) {
> +		if (!capable(CAP_CHECKPOINT_RESTORE) && !capable(CAP_SYS_ADMIN))
> +			return -EACCES;
> +	}
> +
>   	if (cmd & (IOC_IN | IOC_OUT)) {
>   		if (asize <= sizeof(stack_kdata)) {
>   			kdata = stack_kdata;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 1d3f012bcd2a..e68f692362bb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -121,7 +121,26 @@
>    */
>   #define KFD_QUEUE_DOORBELL_MIRROR_OFFSET 512
>   
> -
> +/**
> + * enum kfd_ioctl_flags - KFD ioctl flags
> + * Various flags that can be set in &amdkfd_ioctl_desc.flags to control how
> + * userspace can use a given ioctl.
> + */
> +enum kfd_ioctl_flags {
> +	/*
> +	 * @KFD_IOC_FLAG_CHECKPOINT_RESTORE:
> +	 * Certain KFD ioctls such as AMDKFD_IOC_CRIU_OP can potentially
> +	 * perform privileged operations and load arbitrary data into MQDs and
> +	 * eventually HQD registers when the queue is mapped by HWS. In order to
> +	 * prevent this we should perform additional security checks.
> +	 *
> +	 * This is equivalent to callers with the CHECKPOINT_RESTORE capability.
> +	 *
> +	 * Note: Since earlier versions of docker do not support CHECKPOINT_RESTORE,
> +	 * we also allow ioctls with SYS_ADMIN capability.
> +	 */
> +	KFD_IOC_FLAG_CHECKPOINT_RESTORE = BIT(0),
> +};
>   /*
>    * Kernel module parameter to specify maximum number of supported queues per
>    * device
> @@ -1004,6 +1023,50 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>   				  uint64_t tba_addr,
>   				  uint64_t tma_addr);
>   
> +/* CRIU */
> +/*
> + * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
> + * structures:
> + * kfd_criu_process_priv_data
> + * kfd_criu_device_priv_data
> + * kfd_criu_bo_priv_data
> + * kfd_criu_queue_priv_data
> + * kfd_criu_event_priv_data
> + * kfd_criu_svm_range_priv_data
> + */
> +
> +#define KFD_CRIU_PRIV_VERSION 1
> +
> +struct kfd_criu_process_priv_data {
> +	uint32_t version;
> +};
> +
> +struct kfd_criu_device_priv_data {
> +	/* For future use */
> +	uint64_t reserved;
> +};
> +
> +struct kfd_criu_bo_priv_data {
> +	uint64_t reserved;
> +};
> +
> +struct kfd_criu_svm_range_priv_data {
> +	uint32_t object_type;
> +	uint64_t reserved;

The compiler adds 32-bit padding on x86_64. I think you want to make 
"reserved" 32-bit here. Same in the two structures below.


> +};
> +
> +struct kfd_criu_queue_priv_data {
> +	uint32_t object_type;
> +	uint64_t reserved;
> +};
> +
> +struct kfd_criu_event_priv_data {
> +	uint32_t object_type;
> +	uint64_t reserved;
> +};
> +
> +/* CRIU - End */
> +
>   /* Queue Context Management */
>   int init_queue(struct queue **q, const struct queue_properties *properties);
>   void uninit_queue(struct queue *q);
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index af96af174dc4..b5c297be081b 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -468,6 +468,80 @@ struct kfd_ioctl_smi_events_args {
>   	__u32 anon_fd;	/* from KFD */
>   };
>   
> +/**************************************************************************************************
> + * CRIU IOCTLs (Checkpoint Restore In Userspace)
> + *
> + * When checkpointing a process, the userspace application will perform:
> + * 1. PROCESS_INFO op to determine current process information

Maybe mention that this also pauses execution by evicting all the 
queues. That makes the UNPAUSE op below less mysterious.


> + * 2. CHECKPOINT op to checkpoint process contents (BOs, queues, events, svm-ranges)
> + * 3. UNPAUSE op to un-evict all the queues
> + *
> + * When restoring a process, the CRIU userspace application will perform:
> + *
> + * 1. RESTORE op to restore process contents
> + * 2. RESUME op to start the process
> + *
> + * Note: Queues are forced into an evicted state after a successful PROCESS_INFO. If user
> + * application need to perform an UNPAUSE operation to complete or abort a checkpoint.
> + */
> +
> +enum kfd_criu_op {
> +	KFD_CRIU_OP_PROCESS_INFO,
> +	KFD_CRIU_OP_CHECKPOINT,
> +	KFD_CRIU_OP_UNPAUSE,
> +	KFD_CRIU_OP_RESTORE,
> +	KFD_CRIU_OP_RESUME,
> +};
> +
> +/**
> + * kfd_ioctl_criu_args - Arguments perform CRIU operation
> + * @devices:		[in/out] User pointer to memory location for devices information
> + * @bos:		[in/out] User pointer to memory location for BOs information

It would help to reference the _bucket structures to make it clear that 
the "buckets" are the public information pointed to by these pointers. 
Also that these are pointers to arrays of buckets with num_devices and 
num_bos giving the array size.


> + * @priv_data:		[in/out] User pointer to memory location for private data
> + * @priv_data_size:	[in/out] Size of priv_data in bytes
> + * @num_devices:	[in/out] Number of GPUs used by process
> + * @num_bos		[in/out] Number of BOs used by process
> + * @num_objects:	[in/out] Number of objects used by process. Objects are opaque to
> + *				 user application
> + * @pid:		[in/out] PID of the process being checkpointed/restored

Do you need the pid during restore? Restore runs in the context of the 
restoring process itself.


> + * @op			[in] Type of operation (kfd_criu_op)
> + *
> + * Takes and releases process lock.

The process lock is a KFD implementation detail. I don't think this 
comment belongs in the UAPI definition.


> + * Return: 0 on success, -errno on failure
> + */
> +struct kfd_ioctl_criu_args {
> +	__u64 devices;		/* Used during ops: CHECKPOINT, RESTORE */
> +	__u64 bos;		/* Used during ops: CHECKPOINT, RESTORE */
> +	__u64 priv_data;	/* Used during ops: CHECKPOINT, RESTORE */
> +	__u64 priv_data_size;	/* Used during ops: PROCESS_INFO, RESTORE */
> +	__u32 num_devices;	/* Used during ops: PROCESS_INFO, RESTORE */
> +	__u32 num_bos;		/* Used during ops: PROCESS_INFO, RESTORE */
> +	__u32 num_objects;	/* Used during ops: PROCESS_INFO, RESTORE */
> +	__u32 pid;		/* Used during ops: PROCESS_INFO, RESTORE */

Do you need the pid during restore? Restore runs in the context of the 
restoring process itself.


> +	__u32 op;
> +};
> +
> +struct kfd_criu_device_bucket {
> +	__u32 user_gpu_id;
> +	__u32 actual_gpu_id;
> +	__u32 drm_fd;
> +	__u32 pad;
> +};
> +
> +struct kfd_criu_bo_bucket {
> +	__u64 addr;
> +	__u64 size;
> +	__u64 offset;
> +	__u64 restored_offset;    /* During restore, updated offset for BO */
> +	__u32 gpu_id;

Maybe add a comment that this is the user_gpu_id (I think it is ...).

Regards,
   Felix


> +	__u32 alloc_flags;
> +	__u32 dmabuf_fd;
> +	__u32 pad;
> +};
> +
> +/* CRIU IOCTLs - END */
> +/**************************************************************************************************/
> +
>   /* Register offset inside the remapped mmio page
>    */
>   enum kfd_mmio_remap {
> @@ -742,7 +816,10 @@ struct kfd_ioctl_set_xnack_mode_args {
>   #define AMDKFD_IOC_SET_XNACK_MODE		\
>   		AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args)
>   
> +#define AMDKFD_IOC_CRIU_OP			\
> +		AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_args)
> +
>   #define AMDKFD_COMMAND_START		0x01
> -#define AMDKFD_COMMAND_END		0x22
> +#define AMDKFD_COMMAND_END		0x23
>   
>   #endif


More information about the amd-gfx mailing list