[PATCH v2 03/18] drm/amdkfd: CRIU Introduce Checkpoint-Restore APIs

David Yat Sin david.yatsin at amd.com
Tue Aug 24 19:06:24 UTC 2021


From: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>

Checkpoint-Restore in userspace (CRIU) is a powerful tool that can
snapshot a running process and later restore it on same or a remote
machine but expects the processes that have a device file (e.g. GPU)
associated with them, provide necessary driver support to assist CRIU
and its extensible plugin interface. Thus, In order to support the
Checkpoint-Restore of any ROCm process, the AMD Radeon Open Compute
Kernel driver, needs to provide a set of new APIs that provide
necessary VRAM metadata and its contents to a userspace component
(CRIU plugin) that can store it in form of image files.

This introduces some new ioctls which will be used to checkpoint-Restore
any KFD bound user process. KFD doesn't allow any arbitrary ioctl call
unless it is called by the group leader process. Since these ioctls are
expected to be called from a KFD criu plugin which has elevated ptrace
attached privileges and CAP_SYS_ADMIN capabilities attached with the file
descriptors so modify KFD to allow such calls.

(API redesign suggested by Felix Kuehling and implemented by David Yat
Sin)

Signed-off-by: David Yat Sin <david.yatsin at amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
(cherry picked from commit 72f4907135aed9c037b9f442a6055b51733b518a)
(cherry picked from commit 33ff4953c5352f51d57a77ba8ae6614b7993e70d)
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  70 ++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  69 +++++++
 include/uapi/linux/kfd_ioctl.h           | 221 ++++++++++++++++++++++-
 3 files changed, 358 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 059c3f1ca27d..a1b60d29aae1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -33,6 +33,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
+#include <linux/ptrace.h>
 #include <linux/dma-buf.h>
 #include <asm/processor.h>
 #include "kfd_priv.h"
@@ -1802,6 +1803,44 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
 	return -EPERM;
 }
 #endif
+static int kfd_ioctl_criu_dumper(struct file *filep,
+				struct kfd_process *p, void *data)
+{
+	pr_debug("Inside %s\n", __func__);
+
+	return 0;
+}
+
+static int kfd_ioctl_criu_restorer(struct file *filep,
+				struct kfd_process *p, void *data)
+{
+	pr_debug("Inside %s\n", __func__);
+
+	return 0;
+}
+
+static int kfd_ioctl_criu_pause(struct file *filep, struct kfd_process *p, void *data)
+{
+	pr_debug("Inside %s\n", __func__);
+
+	return 0;
+}
+
+static int kfd_ioctl_criu_resume(struct file *filep,
+				struct kfd_process *p, void *data)
+{
+	pr_debug("Inside %s\n", __func__);
+
+	return 0;
+}
+
+static int kfd_ioctl_criu_process_info(struct file *filep,
+				struct kfd_process *p, void *data)
+{
+	pr_debug("Inside %s\n", __func__);
+
+	return 0;
+}
 
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
@@ -1906,6 +1945,21 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE,
 			kfd_ioctl_set_xnack_mode, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_DUMPER,
+			 kfd_ioctl_criu_dumper, KFD_IOC_FLAG_PTRACE_ATTACHED),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_RESTORER,
+			 kfd_ioctl_criu_restorer, KFD_IOC_FLAG_ROOT_ONLY),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_PROCESS_INFO,
+			 kfd_ioctl_criu_process_info, KFD_IOC_FLAG_PTRACE_ATTACHED),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_RESUME,
+			 kfd_ioctl_criu_resume, KFD_IOC_FLAG_ROOT_ONLY),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_PAUSE,
+			 kfd_ioctl_criu_pause, KFD_IOC_FLAG_PTRACE_ATTACHED),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
@@ -1920,6 +1974,7 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 	char *kdata = NULL;
 	unsigned int usize, asize;
 	int retcode = -EINVAL;
+	bool ptrace_attached = false;
 
 	if (nr >= AMDKFD_CORE_IOCTL_COUNT)
 		goto err_i1;
@@ -1945,7 +2000,15 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 	 * processes need to create their own KFD device context.
 	 */
 	process = filep->private_data;
-	if (process->lead_thread != current->group_leader) {
+
+	rcu_read_lock();
+	if ((ioctl->flags & KFD_IOC_FLAG_PTRACE_ATTACHED) &&
+	    ptrace_parent(process->lead_thread) == current)
+		ptrace_attached = true;
+	rcu_read_unlock();
+
+	if (process->lead_thread != current->group_leader
+	    && !ptrace_attached) {
 		dev_dbg(kfd_device, "Using KFD FD in wrong process\n");
 		retcode = -EBADF;
 		goto err_i1;
@@ -1960,6 +2023,11 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 		goto err_i1;
 	}
 
+	/* KFD_IOC_FLAG_ROOT_ONLY is only for CAP_SYS_ADMIN */
+	if (unlikely((ioctl->flags & KFD_IOC_FLAG_ROOT_ONLY) &&
+		     !capable(CAP_SYS_ADMIN)))
+		return -EACCES;
+
 	if (cmd & (IOC_IN | IOC_OUT)) {
 		if (asize <= sizeof(stack_kdata)) {
 			kdata = stack_kdata;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 64552f6b8ba4..768cc3fe95d2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -121,7 +121,35 @@
  */
 #define KFD_QUEUE_DOORBELL_MIRROR_OFFSET 512
 
+/**
+ * enum kfd_ioctl_flags - KFD ioctl flags
+ * Various flags that can be set in &amdkfd_ioctl_desc.flags to control how
+ * userspace can use a given ioctl.
+ */
+enum kfd_ioctl_flags {
+	/**
+	 * @KFD_IOC_FLAG_ROOT_ONLY:
+	 * Certain KFD ioctls such as AMDKFD_IOC_CRIU_RESTORER can potentially
+	 * perform privileged operations and load arbitrary data into MQDs and
+	 * eventually HQD registers when the queue is mapped by HWS. In order to
+	 * prevent this we should perform additional security checks. In other
+	 * cases, certain ioctls such as AMDKFD_IOC_CRIU_RESUME might be called
+	 * by an external process e.g. CRIU restore process, for each resuming
+	 * tasks and thus require elevated privileges.
+	 *
+	 * This is equivalent to callers with the SYSADMIN capability.
+	 */
+	KFD_IOC_FLAG_ROOT_ONLY = BIT(0),
+	/**
+	 * @KFD_IOC_FLAG_PTRACE_ATTACHED:
+	 * Certain KFD ioctls such as AMDKFD_IOC_CRIU_HELPER and
+	 * AMDKFD_IOC_CRIU_DUMPER are expected to be called during a Checkpoint
+	 * operation triggered by CRIU. Since, these are expected to be called
+	 * from a PTRACE attached context, we must authenticate these.
+	 */
+	KFD_IOC_FLAG_PTRACE_ATTACHED = BIT(1),
 
+};
 /*
  * Kernel module parameter to specify maximum number of supported queues per
  * device
@@ -977,6 +1005,47 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
 				  uint64_t tba_addr,
 				  uint64_t tma_addr);
 
+/* CRIU */
+/*
+ * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
+ * structures:
+ * kfd_criu_process_priv_data
+ * kfd_criu_device_priv_data
+ * kfd_criu_bo_priv_data
+ * kfd_criu_queue_priv_data
+ * kfd_criu_event_priv_data
+ * kfd_criu_svm_range_priv_data
+ */
+
+#define KFD_CRIU_PRIV_VERSION 1
+
+struct kfd_criu_process_priv_data {
+	uint32_t version;
+};
+
+struct kfd_criu_device_priv_data {
+	/* For future use */
+	uint64_t reserved;
+};
+
+struct kfd_criu_bo_priv_data {
+	uint64_t reserved;
+};
+
+struct kfd_criu_svm_range_priv_data {
+	uint64_t reserved;
+};
+
+struct kfd_criu_queue_priv_data {
+	uint64_t reserved;
+};
+
+struct kfd_criu_event_priv_data {
+	uint64_t reserved;
+};
+
+/* CRIU - End */
+
 /* Queue Context Management */
 int init_queue(struct queue **q, const struct queue_properties *properties);
 void uninit_queue(struct queue *q);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 3cb5b5dd9f77..206cad72329f 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -467,6 +467,210 @@ struct kfd_ioctl_smi_events_args {
 	__u32 anon_fd;	/* from KFD */
 };
 
+/**************************************************************************************************
+ * CRIU IOCTLs (Checkpoint Restore In Userspace)
+ *
+ * When checkpointing a process, the userspace application will perform:
+ * 1. CRIU_PAUSE (pause = 1) ioctl to make sure all queues are evicted
+ * 2. CRIU_PROCESS_INFO ioctl to determine current process information
+ * 3. CRIU_DUMPER ioctl for each type of objects
+ * 4. CRIU_PAUSE (pause = 0) ioctl to un-evict all the queues
+ *
+ * When restoring a process, the CRIU userspace application will perform:
+ * 1. CRIU_RESTORE ioctl for each type of objects
+ * 2. CRIU_RESUME ioctl to start the process
+ *
+ * Objects can be of type:
+ * * PROCESS: Contains process specific information
+ * * DEVICE: Contains per-GPU information for each GPU used by the process
+ * * BO: Buffer Object, BO metadata and memory contents
+ * * QUEUE: Queues information for all queues used by current process (e.g CP and SDMA)
+ * * EVENT: Events information (e.g interrupts or HW exceptions)
+ * * SVM_RANGE: HMM memory mappings information
+ *
+ * Before performing a CRIU_DUMPER ioctl for each type of object, the caller needs to allocate
+ * sufficient memory for:
+ * * size of bucket structure * Number of objects
+ *    The public structures for each type of object.
+ * * private data
+ *    The opaque private structures for each type of object.
+ *
+ * These values are obtained from the CRIU_PROCESS_INFO ioctl.
+ */
+
+/**
+ * kfd_ioctl_criu_process_info_args - Arguments to get process information
+ *
+ * @process_priv_data_size:       [out] Total size of private data for object type PROCESS
+ * @bos_priv_data_size:           [out] Total size of private data for object type BO
+ * @devices_priv_data_size:       [out] Total size of private data for object type DEVICE
+ * @queues_priv_data_size:        [out] Total size of private data for object type QUEUE
+ * @events_priv_data_size:        [out] Total size of private data for object type EVENTS
+ * @svm_ranges_priv_data_size:    [out] Total size of private data for object type SVM_RANGE (HMM)
+ * @total_bos:                    [out] Number of BOs used by the process
+ * @total_svm_ranges:             [out] Number of SVM_RANGEs used by the process (HMM)
+ * @total_devices:                [out] Number of GPUs used by the process
+ * @total_queues:                 [out] Number of queues used by the process
+ * @total_events:                 [out] Number of events used by the process
+ * @task_pid:                     [out] PID of the process being checkpointed
+ *
+ * Context: Expects process to be in paused state, see CRIU_PAUSE. Takes and releases process lock
+ *
+ * Return: 0 on success, -errno on failure
+ */
+struct kfd_ioctl_criu_process_info_args {
+	__u64 process_priv_data_size;
+	__u64 bos_priv_data_size;
+	__u64 devices_priv_data_size;
+	__u64 queues_priv_data_size;
+	__u64 events_priv_data_size;
+	__u64 svm_ranges_priv_data_size;
+	__u64 total_bos;
+	__u64 total_svm_ranges;
+	__u32 total_devices;
+	__u32 total_queues;
+	__u32 total_events;
+	__u32 task_pid;
+};
+
+struct kfd_criu_process_bucket {
+	__u64 priv_data_offset;
+	__u64 priv_data_size;
+};
+
+struct kfd_criu_device_bucket {
+	__u64 priv_data_offset;
+	__u64 priv_data_size;
+	__u32 user_gpu_id;
+	__u32 actual_gpu_id;
+	__u32 drm_fd;
+	__u32 pad;
+};
+
+struct kfd_criu_bo_bucket {
+	__u64 priv_data_offset;
+	__u64 priv_data_size;
+	__u64 addr;
+	__u64 size;
+	__u64 offset;
+	__u64 restored_offset;    /* During restore, updated offset for BO */
+	__u32 gpu_id;
+	__u32 alloc_flags;
+	__u32 dmabuf_fd;
+	__u32 pad;
+};
+
+struct kfd_criu_queue_bucket {
+	__u64 priv_data_offset;
+	__u64 priv_data_size;
+	__u32 gpu_id;
+	__u32 pad;
+};
+
+struct kfd_criu_event_bucket {
+	__u64 priv_data_offset;
+	__u64 priv_data_size;
+	__u32 gpu_id;
+	__u32 pad;
+};
+
+enum kfd_criu_object_type {
+	KFD_CRIU_OBJECT_TYPE_PROCESS	= 0,
+	KFD_CRIU_OBJECT_TYPE_DEVICE	= 1,
+	KFD_CRIU_OBJECT_TYPE_BO		= 2,
+	KFD_CRIU_OBJECT_TYPE_QUEUE	= 3,
+	KFD_CRIU_OBJECT_TYPE_EVENT	= 4,
+	KFD_CRIU_OBJECT_TYPE_SVM_RANGE	= 5,
+};
+
+
+/**
+ * kfd_ioctl_criu_dumper_args - Arguments for dumping objects
+ *
+ * @num_objects:         [in] Number of objects to be dumped
+ * @objects:             [out] User pointer to memory location to store object information
+ * @objects_size:        [in/out] Size of memory allocated in @objects
+ * @objects_index_start: [in] When doing partial dumps, this is the index of the first object to
+ *                            be dumped. [Not currently supported]
+ * @type:		 [in] Type of object to be dumped
+ *
+ * Before performing dump ioctl, user application needs to allocate at least:
+ * (num_objects * sizeof(kfd_criu_[type]_bucket)) + [type]_priv_data_size
+ *
+ * When ioctl returns, kernel drivers will fill @objects pointer with information that needs to be
+ * stored.
+ *
+ * Context: Expects process to be in paused state, see CRIU_PAUSE. Takes and releases process lock.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+struct kfd_ioctl_criu_dumper_args {
+	__u64 num_objects;
+	__u64 objects;
+	__u64 objects_size;
+	__u64 objects_index_start;
+	__u32 type; /* enum kfd_criu_object_type */
+	__u32 pad;
+};
+
+/**
+ * kfd_ioctl_criu_restorer_args - Arguments for restoring objects
+ *
+ * @num_objects:           [in] Number of objects to be restored
+ * @objects:               [in/out] User pointer to memory location to store object information.
+ * @objects_size:          [in] Size of memory allocated in @objects
+ * @objects_index_start:   [in] When doing partial restores, this is the index of the first object
+ *                              to be dumped. [Not currently supported]
+ * @type:                  [in] Type of object to be restored
+ *
+ * Before performing restore ioctl, user application needs to fill @objects pointer with data
+ * returned during dump ioctl.
+ *
+ * Context: Takes and releases process lock.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+struct kfd_ioctl_criu_restorer_args {
+	__u64 num_objects;
+	__u64 objects;
+	__u64 objects_size;
+	__u64 objects_index_start;
+	__u32 type; /* enum kfd_criu_object_type */
+	__u32 pad;
+};
+
+/**
+ * kfd_ioctl_criu_pause_args - Pause process
+ * @pause:              [in] Set to non-zero to pause all queues currently used by this process
+ *
+ *
+ * Process needs to be paused before process is dumped and un-paused once all objects have been
+ * dumped.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+struct kfd_ioctl_criu_pause_args {
+	__u32 pause;
+	__u32 pad;
+};
+
+/**
+ * kfd_ioctl_criu_resume_args - Resume process
+ * @pid:                [in] Process ID of the task to be resumed
+ *
+ * Used during stage-4 of CRIU restore. This ioctl is called once all memory mappings, queues and
+ * events have been fully restored and the process is ready to run.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+struct kfd_ioctl_criu_resume_args {
+	__u32 pid;
+	__u32 pad;
+};
+
+/* CRIU IOCTLs - END */
+/**************************************************************************************************/
+
 /* Register offset inside the remapped mmio page
  */
 enum kfd_mmio_remap {
@@ -740,7 +944,22 @@ struct kfd_ioctl_set_xnack_mode_args {
 #define AMDKFD_IOC_SET_XNACK_MODE		\
 		AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args)
 
+#define AMDKFD_IOC_CRIU_DUMPER			\
+		AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_dumper_args)
+
+#define AMDKFD_IOC_CRIU_RESTORER		\
+		AMDKFD_IOWR(0x23, struct kfd_ioctl_criu_restorer_args)
+
+#define AMDKFD_IOC_CRIU_PROCESS_INFO		\
+		AMDKFD_IOWR(0x24, struct kfd_ioctl_criu_process_info_args)
+
+#define AMDKFD_IOC_CRIU_RESUME			\
+		AMDKFD_IOWR(0x25, struct kfd_ioctl_criu_resume_args)
+
+#define AMDKFD_IOC_CRIU_PAUSE			\
+		AMDKFD_IOWR(0x26, struct kfd_ioctl_criu_pause_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x22
+#define AMDKFD_COMMAND_END		0x27
 
 #endif
-- 
2.17.1



More information about the amd-gfx mailing list