[PATCH] drm/amdkfd: Refactor kfd CRIU into its own file

Felix Kuehling felix.kuehling at amd.com
Mon May 6 20:10:52 UTC 2024


On 2024-05-06 15:20, David Francis wrote:
> The kfd CRIU code takes up about a thousand lines
> in the kfd_chardev file; move it to its own file.
>
> No functional change intended.
>
> Signed-off-by: David Francis <David.Francis at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/Makefile      |   1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 972 +---------------------
>   drivers/gpu/drm/amd/amdkfd/kfd_criu.c    | 989 +++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_criu.h    |  50 ++
>   4 files changed, 1046 insertions(+), 966 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.c
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 0d3d8972240d..e06af4073ac5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -32,6 +32,7 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
>   		$(AMDKFD_PATH)/kfd_flat_memory.o \
>   		$(AMDKFD_PATH)/kfd_process.o \
>   		$(AMDKFD_PATH)/kfd_queue.o \
> +		$(AMDKFD_PATH)/kfd_criu.o \

Any particular reason for adding this in the middle of the list rather than at the end?
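
Just to illustrate what I mean, appending it after the current last entry would
look roughly like this (sketch only -- the surrounding lines are placeholders,
not the actual tail of AMDKFD_FILES, and the trailing backslashes would need to
be adjusted accordingly):

 		$(AMDKFD_PATH)/kfd_queue.o \
 		...
 		$(AMDKFD_PATH)/<last existing entry>.o \
+		$(AMDKFD_PATH)/kfd_criu.o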


>   		$(AMDKFD_PATH)/kfd_mqd_manager.o \
>   		$(AMDKFD_PATH)/kfd_mqd_manager_cik.o \
>   		$(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 6b713fb0b818..e6e44a199a93 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -45,6 +45,7 @@

Can you remove #include <linux/fdtable.h> and "amdgpu_dma_buf.h" here?
Or are they still needed by something else left in kfd_chardev.c?
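
If they do turn out to be unused now, I'd expect the include block to shrink to
roughly this (untested sketch, assuming the CRIU error paths that moved out were
the only remaining users of close_fd() and the amdgpu dma-buf header in this file):

-#include <linux/fdtable.h>
 ...
 #include "kfd_smi_events.h"
-#include "amdgpu_dma_buf.h"
 #include "kfd_debug.h"
 #include "kfd_criu.h"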

Other than that, this patch is

Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>


>   #include "kfd_smi_events.h"
>   #include "amdgpu_dma_buf.h"
>   #include "kfd_debug.h"
> +#include "kfd_criu.h"
>   
>   static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>   static int kfd_open(struct inode *, struct file *);
> @@ -1751,967 +1752,6 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
>   }
>   #endif
>   
> -static int criu_checkpoint_process(struct kfd_process *p,
> -			     uint8_t __user *user_priv_data,
> -			     uint64_t *priv_offset)
> -{
> -	struct kfd_criu_process_priv_data process_priv;
> -	int ret;
> -
> -	memset(&process_priv, 0, sizeof(process_priv));
> -
> -	process_priv.version = KFD_CRIU_PRIV_VERSION;
> -	/* For CR, we don't consider negative xnack mode which is used for
> -	 * querying without changing it, here 0 simply means disabled and 1
> -	 * means enabled so retry for finding a valid PTE.
> -	 */
> -	process_priv.xnack_mode = p->xnack_enabled ? 1 : 0;
> -
> -	ret = copy_to_user(user_priv_data + *priv_offset,
> -				&process_priv, sizeof(process_priv));
> -
> -	if (ret) {
> -		pr_err("Failed to copy process information to user\n");
> -		ret = -EFAULT;
> -	}
> -
> -	*priv_offset += sizeof(process_priv);
> -	return ret;
> -}
> -
> -static int criu_checkpoint_devices(struct kfd_process *p,
> -			     uint32_t num_devices,
> -			     uint8_t __user *user_addr,
> -			     uint8_t __user *user_priv_data,
> -			     uint64_t *priv_offset)
> -{
> -	struct kfd_criu_device_priv_data *device_priv = NULL;
> -	struct kfd_criu_device_bucket *device_buckets = NULL;
> -	int ret = 0, i;
> -
> -	device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), GFP_KERNEL);
> -	if (!device_buckets) {
> -		ret = -ENOMEM;
> -		goto exit;
> -	}
> -
> -	device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL);
> -	if (!device_priv) {
> -		ret = -ENOMEM;
> -		goto exit;
> -	}
> -
> -	for (i = 0; i < num_devices; i++) {
> -		struct kfd_process_device *pdd = p->pdds[i];
> -
> -		device_buckets[i].user_gpu_id = pdd->user_gpu_id;
> -		device_buckets[i].actual_gpu_id = pdd->dev->id;
> -
> -		/*
> -		 * priv_data does not contain useful information for now and is reserved for
> -		 * future use, so we do not set its contents.
> -		 */
> -	}
> -
> -	ret = copy_to_user(user_addr, device_buckets, num_devices * sizeof(*device_buckets));
> -	if (ret) {
> -		pr_err("Failed to copy device information to user\n");
> -		ret = -EFAULT;
> -		goto exit;
> -	}
> -
> -	ret = copy_to_user(user_priv_data + *priv_offset,
> -			   device_priv,
> -			   num_devices * sizeof(*device_priv));
> -	if (ret) {
> -		pr_err("Failed to copy device information to user\n");
> -		ret = -EFAULT;
> -	}
> -	*priv_offset += num_devices * sizeof(*device_priv);
> -
> -exit:
> -	kvfree(device_buckets);
> -	kvfree(device_priv);
> -	return ret;
> -}
> -
> -static uint32_t get_process_num_bos(struct kfd_process *p)
> -{
> -	uint32_t num_of_bos = 0;
> -	int i;
> -
> -	/* Run over all PDDs of the process */
> -	for (i = 0; i < p->n_pdds; i++) {
> -		struct kfd_process_device *pdd = p->pdds[i];
> -		void *mem;
> -		int id;
> -
> -		idr_for_each_entry(&pdd->alloc_idr, mem, id) {
> -			struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
> -
> -			if (!kgd_mem->va || kgd_mem->va > pdd->gpuvm_base)
> -				num_of_bos++;
> -		}
> -	}
> -	return num_of_bos;
> -}
> -
> -static int criu_get_prime_handle(struct kgd_mem *mem,
> -				 int flags, u32 *shared_fd)
> -{
> -	struct dma_buf *dmabuf;
> -	int ret;
> -
> -	ret = amdgpu_amdkfd_gpuvm_export_dmabuf(mem, &dmabuf);
> -	if (ret) {
> -		pr_err("dmabuf export failed for the BO\n");
> -		return ret;
> -	}
> -
> -	ret = dma_buf_fd(dmabuf, flags);
> -	if (ret < 0) {
> -		pr_err("dmabuf create fd failed, ret:%d\n", ret);
> -		goto out_free_dmabuf;
> -	}
> -
> -	*shared_fd = ret;
> -	return 0;
> -
> -out_free_dmabuf:
> -	dma_buf_put(dmabuf);
> -	return ret;
> -}
> -
> -static int criu_checkpoint_bos(struct kfd_process *p,
> -			       uint32_t num_bos,
> -			       uint8_t __user *user_bos,
> -			       uint8_t __user *user_priv_data,
> -			       uint64_t *priv_offset)
> -{
> -	struct kfd_criu_bo_bucket *bo_buckets;
> -	struct kfd_criu_bo_priv_data *bo_privs;
> -	int ret = 0, pdd_index, bo_index = 0, id;
> -	void *mem;
> -
> -	bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL);
> -	if (!bo_buckets)
> -		return -ENOMEM;
> -
> -	bo_privs = kvzalloc(num_bos * sizeof(*bo_privs), GFP_KERNEL);
> -	if (!bo_privs) {
> -		ret = -ENOMEM;
> -		goto exit;
> -	}
> -
> -	for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) {
> -		struct kfd_process_device *pdd = p->pdds[pdd_index];
> -		struct amdgpu_bo *dumper_bo;
> -		struct kgd_mem *kgd_mem;
> -
> -		idr_for_each_entry(&pdd->alloc_idr, mem, id) {
> -			struct kfd_criu_bo_bucket *bo_bucket;
> -			struct kfd_criu_bo_priv_data *bo_priv;
> -			int i, dev_idx = 0;
> -
> -			if (!mem) {
> -				ret = -ENOMEM;
> -				goto exit;
> -			}
> -
> -			kgd_mem = (struct kgd_mem *)mem;
> -			dumper_bo = kgd_mem->bo;
> -
> -			/* Skip checkpointing BOs that are used for Trap handler
> -			 * code and state. Currently, these BOs have a VA that
> -			 * is less GPUVM Base
> -			 */
> -			if (kgd_mem->va && kgd_mem->va <= pdd->gpuvm_base)
> -				continue;
> -
> -			bo_bucket = &bo_buckets[bo_index];
> -			bo_priv = &bo_privs[bo_index];
> -
> -			bo_bucket->gpu_id = pdd->user_gpu_id;
> -			bo_bucket->addr = (uint64_t)kgd_mem->va;
> -			bo_bucket->size = amdgpu_bo_size(dumper_bo);
> -			bo_bucket->alloc_flags = (uint32_t)kgd_mem->alloc_flags;
> -			bo_priv->idr_handle = id;
> -
> -			if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> -				ret = amdgpu_ttm_tt_get_userptr(&dumper_bo->tbo,
> -								&bo_priv->user_addr);
> -				if (ret) {
> -					pr_err("Failed to obtain user address for user-pointer bo\n");
> -					goto exit;
> -				}
> -			}
> -			if (bo_bucket->alloc_flags
> -			    & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
> -				ret = criu_get_prime_handle(kgd_mem,
> -						bo_bucket->alloc_flags &
> -						KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0,
> -						&bo_bucket->dmabuf_fd);
> -				if (ret)
> -					goto exit;
> -			} else {
> -				bo_bucket->dmabuf_fd = KFD_INVALID_FD;
> -			}
> -
> -			if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
> -				bo_bucket->offset = KFD_MMAP_TYPE_DOORBELL |
> -					KFD_MMAP_GPU_ID(pdd->dev->id);
> -			else if (bo_bucket->alloc_flags &
> -				KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)
> -				bo_bucket->offset = KFD_MMAP_TYPE_MMIO |
> -					KFD_MMAP_GPU_ID(pdd->dev->id);
> -			else
> -				bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo);
> -
> -			for (i = 0; i < p->n_pdds; i++) {
> -				if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem))
> -					bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id;
> -			}
> -
> -			pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
> -					"gpu_id = 0x%x alloc_flags = 0x%x idr_handle = 0x%x",
> -					bo_bucket->size,
> -					bo_bucket->addr,
> -					bo_bucket->offset,
> -					bo_bucket->gpu_id,
> -					bo_bucket->alloc_flags,
> -					bo_priv->idr_handle);
> -			bo_index++;
> -		}
> -	}
> -
> -	ret = copy_to_user(user_bos, bo_buckets, num_bos * sizeof(*bo_buckets));
> -	if (ret) {
> -		pr_err("Failed to copy BO information to user\n");
> -		ret = -EFAULT;
> -		goto exit;
> -	}
> -
> -	ret = copy_to_user(user_priv_data + *priv_offset, bo_privs, num_bos * sizeof(*bo_privs));
> -	if (ret) {
> -		pr_err("Failed to copy BO priv information to user\n");
> -		ret = -EFAULT;
> -		goto exit;
> -	}
> -
> -	*priv_offset += num_bos * sizeof(*bo_privs);
> -
> -exit:
> -	while (ret && bo_index--) {
> -		if (bo_buckets[bo_index].alloc_flags
> -		    & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
> -			close_fd(bo_buckets[bo_index].dmabuf_fd);
> -	}
> -
> -	kvfree(bo_buckets);
> -	kvfree(bo_privs);
> -	return ret;
> -}
> -
> -static int criu_get_process_object_info(struct kfd_process *p,
> -					uint32_t *num_devices,
> -					uint32_t *num_bos,
> -					uint32_t *num_objects,
> -					uint64_t *objs_priv_size)
> -{
> -	uint64_t queues_priv_data_size, svm_priv_data_size, priv_size;
> -	uint32_t num_queues, num_events, num_svm_ranges;
> -	int ret;
> -
> -	*num_devices = p->n_pdds;
> -	*num_bos = get_process_num_bos(p);
> -
> -	ret = kfd_process_get_queue_info(p, &num_queues, &queues_priv_data_size);
> -	if (ret)
> -		return ret;
> -
> -	num_events = kfd_get_num_events(p);
> -
> -	ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size);
> -	if (ret)
> -		return ret;
> -
> -	*num_objects = num_queues + num_events + num_svm_ranges;
> -
> -	if (objs_priv_size) {
> -		priv_size = sizeof(struct kfd_criu_process_priv_data);
> -		priv_size += *num_devices * sizeof(struct kfd_criu_device_priv_data);
> -		priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
> -		priv_size += queues_priv_data_size;
> -		priv_size += num_events * sizeof(struct kfd_criu_event_priv_data);
> -		priv_size += svm_priv_data_size;
> -		*objs_priv_size = priv_size;
> -	}
> -	return 0;
> -}
> -
> -static int criu_checkpoint(struct file *filep,
> -			   struct kfd_process *p,
> -			   struct kfd_ioctl_criu_args *args)
> -{
> -	int ret;
> -	uint32_t num_devices, num_bos, num_objects;
> -	uint64_t priv_size, priv_offset = 0, bo_priv_offset;
> -
> -	if (!args->devices || !args->bos || !args->priv_data)
> -		return -EINVAL;
> -
> -	mutex_lock(&p->mutex);
> -
> -	if (!p->n_pdds) {
> -		pr_err("No pdd for given process\n");
> -		ret = -ENODEV;
> -		goto exit_unlock;
> -	}
> -
> -	/* Confirm all process queues are evicted */
> -	if (!p->queues_paused) {
> -		pr_err("Cannot dump process when queues are not in evicted state\n");
> -		/* CRIU plugin did not call op PROCESS_INFO before checkpointing */
> -		ret = -EINVAL;
> -		goto exit_unlock;
> -	}
> -
> -	ret = criu_get_process_object_info(p, &num_devices, &num_bos, &num_objects, &priv_size);
> -	if (ret)
> -		goto exit_unlock;
> -
> -	if (num_devices != args->num_devices ||
> -	    num_bos != args->num_bos ||
> -	    num_objects != args->num_objects ||
> -	    priv_size != args->priv_data_size) {
> -
> -		ret = -EINVAL;
> -		goto exit_unlock;
> -	}
> -
> -	/* each function will store private data inside priv_data and adjust priv_offset */
> -	ret = criu_checkpoint_process(p, (uint8_t __user *)args->priv_data, &priv_offset);
> -	if (ret)
> -		goto exit_unlock;
> -
> -	ret = criu_checkpoint_devices(p, num_devices, (uint8_t __user *)args->devices,
> -				(uint8_t __user *)args->priv_data, &priv_offset);
> -	if (ret)
> -		goto exit_unlock;
> -
> -	/* Leave room for BOs in the private data. They need to be restored
> -	 * before events, but we checkpoint them last to simplify the error
> -	 * handling.
> -	 */
> -	bo_priv_offset = priv_offset;
> -	priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data);
> -
> -	if (num_objects) {
> -		ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
> -						 &priv_offset);
> -		if (ret)
> -			goto exit_unlock;
> -
> -		ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data,
> -						 &priv_offset);
> -		if (ret)
> -			goto exit_unlock;
> -
> -		ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset);
> -		if (ret)
> -			goto exit_unlock;
> -	}
> -
> -	/* This must be the last thing in this function that can fail.
> -	 * Otherwise we leak dmabuf file descriptors.
> -	 */
> -	ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
> -			   (uint8_t __user *)args->priv_data, &bo_priv_offset);
> -
> -exit_unlock:
> -	mutex_unlock(&p->mutex);
> -	if (ret)
> -		pr_err("Failed to dump CRIU ret:%d\n", ret);
> -	else
> -		pr_debug("CRIU dump ret:%d\n", ret);
> -
> -	return ret;
> -}
> -
> -static int criu_restore_process(struct kfd_process *p,
> -				struct kfd_ioctl_criu_args *args,
> -				uint64_t *priv_offset,
> -				uint64_t max_priv_data_size)
> -{
> -	int ret = 0;
> -	struct kfd_criu_process_priv_data process_priv;
> -
> -	if (*priv_offset + sizeof(process_priv) > max_priv_data_size)
> -		return -EINVAL;
> -
> -	ret = copy_from_user(&process_priv,
> -				(void __user *)(args->priv_data + *priv_offset),
> -				sizeof(process_priv));
> -	if (ret) {
> -		pr_err("Failed to copy process private information from user\n");
> -		ret = -EFAULT;
> -		goto exit;
> -	}
> -	*priv_offset += sizeof(process_priv);
> -
> -	if (process_priv.version != KFD_CRIU_PRIV_VERSION) {
> -		pr_err("Invalid CRIU API version (checkpointed:%d current:%d)\n",
> -			process_priv.version, KFD_CRIU_PRIV_VERSION);
> -		return -EINVAL;
> -	}
> -
> -	pr_debug("Setting XNACK mode\n");
> -	if (process_priv.xnack_mode && !kfd_process_xnack_mode(p, true)) {
> -		pr_err("xnack mode cannot be set\n");
> -		ret = -EPERM;
> -		goto exit;
> -	} else {
> -		pr_debug("set xnack mode: %d\n", process_priv.xnack_mode);
> -		p->xnack_enabled = process_priv.xnack_mode;
> -	}
> -
> -exit:
> -	return ret;
> -}
> -
> -static int criu_restore_devices(struct kfd_process *p,
> -				struct kfd_ioctl_criu_args *args,
> -				uint64_t *priv_offset,
> -				uint64_t max_priv_data_size)
> -{
> -	struct kfd_criu_device_bucket *device_buckets;
> -	struct kfd_criu_device_priv_data *device_privs;
> -	int ret = 0;
> -	uint32_t i;
> -
> -	if (args->num_devices != p->n_pdds)
> -		return -EINVAL;
> -
> -	if (*priv_offset + (args->num_devices * sizeof(*device_privs)) > max_priv_data_size)
> -		return -EINVAL;
> -
> -	device_buckets = kmalloc_array(args->num_devices, sizeof(*device_buckets), GFP_KERNEL);
> -	if (!device_buckets)
> -		return -ENOMEM;
> -
> -	ret = copy_from_user(device_buckets, (void __user *)args->devices,
> -				args->num_devices * sizeof(*device_buckets));
> -	if (ret) {
> -		pr_err("Failed to copy devices buckets from user\n");
> -		ret = -EFAULT;
> -		goto exit;
> -	}
> -
> -	for (i = 0; i < args->num_devices; i++) {
> -		struct kfd_node *dev;
> -		struct kfd_process_device *pdd;
> -		struct file *drm_file;
> -
> -		/* device private data is not currently used */
> -
> -		if (!device_buckets[i].user_gpu_id) {
> -			pr_err("Invalid user gpu_id\n");
> -			ret = -EINVAL;
> -			goto exit;
> -		}
> -
> -		dev = kfd_device_by_id(device_buckets[i].actual_gpu_id);
> -		if (!dev) {
> -			pr_err("Failed to find device with gpu_id = %x\n",
> -				device_buckets[i].actual_gpu_id);
> -			ret = -EINVAL;
> -			goto exit;
> -		}
> -
> -		pdd = kfd_get_process_device_data(dev, p);
> -		if (!pdd) {
> -			pr_err("Failed to get pdd for gpu_id = %x\n",
> -					device_buckets[i].actual_gpu_id);
> -			ret = -EINVAL;
> -			goto exit;
> -		}
> -		pdd->user_gpu_id = device_buckets[i].user_gpu_id;
> -
> -		drm_file = fget(device_buckets[i].drm_fd);
> -		if (!drm_file) {
> -			pr_err("Invalid render node file descriptor sent from plugin (%d)\n",
> -				device_buckets[i].drm_fd);
> -			ret = -EINVAL;
> -			goto exit;
> -		}
> -
> -		if (pdd->drm_file) {
> -			ret = -EINVAL;
> -			goto exit;
> -		}
> -
> -		/* create the vm using render nodes for kfd pdd */
> -		if (kfd_process_device_init_vm(pdd, drm_file)) {
> -			pr_err("could not init vm for given pdd\n");
> -			/* On success, the PDD keeps the drm_file reference */
> -			fput(drm_file);
> -			ret = -EINVAL;
> -			goto exit;
> -		}
> -		/*
> -		 * pdd now already has the vm bound to render node so below api won't create a new
> -		 * exclusive kfd mapping but use existing one with renderDXXX but is still needed
> -		 * for iommu v2 binding  and runtime pm.
> -		 */
> -		pdd = kfd_bind_process_to_device(dev, p);
> -		if (IS_ERR(pdd)) {
> -			ret = PTR_ERR(pdd);
> -			goto exit;
> -		}
> -
> -		if (!pdd->qpd.proc_doorbells) {
> -			ret = kfd_alloc_process_doorbells(dev->kfd, pdd);
> -			if (ret)
> -				goto exit;
> -		}
> -	}
> -
> -	/*
> -	 * We are not copying device private data from user as we are not using the data for now,
> -	 * but we still adjust for its private data.
> -	 */
> -	*priv_offset += args->num_devices * sizeof(*device_privs);
> -
> -exit:
> -	kfree(device_buckets);
> -	return ret;
> -}
> -
> -static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd,
> -				      struct kfd_criu_bo_bucket *bo_bucket,
> -				      struct kfd_criu_bo_priv_data *bo_priv,
> -				      struct kgd_mem **kgd_mem)
> -{
> -	int idr_handle;
> -	int ret;
> -	const bool criu_resume = true;
> -	u64 offset;
> -
> -	if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
> -		if (bo_bucket->size !=
> -				kfd_doorbell_process_slice(pdd->dev->kfd))
> -			return -EINVAL;
> -
> -		offset = kfd_get_process_doorbells(pdd);
> -		if (!offset)
> -			return -ENOMEM;
> -	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> -		/* MMIO BOs need remapped bus address */
> -		if (bo_bucket->size != PAGE_SIZE) {
> -			pr_err("Invalid page size\n");
> -			return -EINVAL;
> -		}
> -		offset = pdd->dev->adev->rmmio_remap.bus_addr;
> -		if (!offset) {
> -			pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n");
> -			return -ENOMEM;
> -		}
> -	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> -		offset = bo_priv->user_addr;
> -	}
> -	/* Create the BO */
> -	ret = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(pdd->dev->adev, bo_bucket->addr,
> -						      bo_bucket->size, pdd->drm_priv, kgd_mem,
> -						      &offset, bo_bucket->alloc_flags, criu_resume);
> -	if (ret) {
> -		pr_err("Could not create the BO\n");
> -		return ret;
> -	}
> -	pr_debug("New BO created: size:0x%llx addr:0x%llx offset:0x%llx\n",
> -		 bo_bucket->size, bo_bucket->addr, offset);
> -
> -	/* Restore previous IDR handle */
> -	pr_debug("Restoring old IDR handle for the BO");
> -	idr_handle = idr_alloc(&pdd->alloc_idr, *kgd_mem, bo_priv->idr_handle,
> -			       bo_priv->idr_handle + 1, GFP_KERNEL);
> -
> -	if (idr_handle < 0) {
> -		pr_err("Could not allocate idr\n");
> -		amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, *kgd_mem, pdd->drm_priv,
> -						       NULL);
> -		return -ENOMEM;
> -	}
> -
> -	if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
> -		bo_bucket->restored_offset = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(pdd->dev->id);
> -	if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> -		bo_bucket->restored_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(pdd->dev->id);
> -	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
> -		bo_bucket->restored_offset = offset;
> -	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
> -		bo_bucket->restored_offset = offset;
> -		/* Update the VRAM usage count */
> -		WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
> -	}
> -	return 0;
> -}
> -
> -static int criu_restore_bo(struct kfd_process *p,
> -			   struct kfd_criu_bo_bucket *bo_bucket,
> -			   struct kfd_criu_bo_priv_data *bo_priv)
> -{
> -	struct kfd_process_device *pdd;
> -	struct kgd_mem *kgd_mem;
> -	int ret;
> -	int j;
> -
> -	pr_debug("Restoring BO size:0x%llx addr:0x%llx gpu_id:0x%x flags:0x%x idr_handle:0x%x\n",
> -		 bo_bucket->size, bo_bucket->addr, bo_bucket->gpu_id, bo_bucket->alloc_flags,
> -		 bo_priv->idr_handle);
> -
> -	pdd = kfd_process_device_data_by_id(p, bo_bucket->gpu_id);
> -	if (!pdd) {
> -		pr_err("Failed to get pdd\n");
> -		return -ENODEV;
> -	}
> -
> -	ret = criu_restore_memory_of_gpu(pdd, bo_bucket, bo_priv, &kgd_mem);
> -	if (ret)
> -		return ret;
> -
> -	/* now map these BOs to GPU/s */
> -	for (j = 0; j < p->n_pdds; j++) {
> -		struct kfd_node *peer;
> -		struct kfd_process_device *peer_pdd;
> -
> -		if (!bo_priv->mapped_gpuids[j])
> -			break;
> -
> -		peer_pdd = kfd_process_device_data_by_id(p, bo_priv->mapped_gpuids[j]);
> -		if (!peer_pdd)
> -			return -EINVAL;
> -
> -		peer = peer_pdd->dev;
> -
> -		peer_pdd = kfd_bind_process_to_device(peer, p);
> -		if (IS_ERR(peer_pdd))
> -			return PTR_ERR(peer_pdd);
> -
> -		ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev, kgd_mem,
> -							    peer_pdd->drm_priv);
> -		if (ret) {
> -			pr_err("Failed to map to gpu %d/%d\n", j, p->n_pdds);
> -			return ret;
> -		}
> -	}
> -
> -	pr_debug("map memory was successful for the BO\n");
> -	/* create the dmabuf object and export the bo */
> -	if (bo_bucket->alloc_flags
> -	    & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
> -		ret = criu_get_prime_handle(kgd_mem, DRM_RDWR,
> -					    &bo_bucket->dmabuf_fd);
> -		if (ret)
> -			return ret;
> -	} else {
> -		bo_bucket->dmabuf_fd = KFD_INVALID_FD;
> -	}
> -
> -	return 0;
> -}
> -
> -static int criu_restore_bos(struct kfd_process *p,
> -			    struct kfd_ioctl_criu_args *args,
> -			    uint64_t *priv_offset,
> -			    uint64_t max_priv_data_size)
> -{
> -	struct kfd_criu_bo_bucket *bo_buckets = NULL;
> -	struct kfd_criu_bo_priv_data *bo_privs = NULL;
> -	int ret = 0;
> -	uint32_t i = 0;
> -
> -	if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size)
> -		return -EINVAL;
> -
> -	/* Prevent MMU notifications until stage-4 IOCTL (CRIU_RESUME) is received */
> -	amdgpu_amdkfd_block_mmu_notifications(p->kgd_process_info);
> -
> -	bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL);
> -	if (!bo_buckets)
> -		return -ENOMEM;
> -
> -	ret = copy_from_user(bo_buckets, (void __user *)args->bos,
> -			     args->num_bos * sizeof(*bo_buckets));
> -	if (ret) {
> -		pr_err("Failed to copy BOs information from user\n");
> -		ret = -EFAULT;
> -		goto exit;
> -	}
> -
> -	bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL);
> -	if (!bo_privs) {
> -		ret = -ENOMEM;
> -		goto exit;
> -	}
> -
> -	ret = copy_from_user(bo_privs, (void __user *)args->priv_data + *priv_offset,
> -			     args->num_bos * sizeof(*bo_privs));
> -	if (ret) {
> -		pr_err("Failed to copy BOs information from user\n");
> -		ret = -EFAULT;
> -		goto exit;
> -	}
> -	*priv_offset += args->num_bos * sizeof(*bo_privs);
> -
> -	/* Create and map new BOs */
> -	for (; i < args->num_bos; i++) {
> -		ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i]);
> -		if (ret) {
> -			pr_debug("Failed to restore BO[%d] ret%d\n", i, ret);
> -			goto exit;
> -		}
> -	} /* done */
> -
> -	/* Copy only the buckets back so user can read bo_buckets[N].restored_offset */
> -	ret = copy_to_user((void __user *)args->bos,
> -				bo_buckets,
> -				(args->num_bos * sizeof(*bo_buckets)));
> -	if (ret)
> -		ret = -EFAULT;
> -
> -exit:
> -	while (ret && i--) {
> -		if (bo_buckets[i].alloc_flags
> -		   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
> -			close_fd(bo_buckets[i].dmabuf_fd);
> -	}
> -	kvfree(bo_buckets);
> -	kvfree(bo_privs);
> -	return ret;
> -}
> -
> -static int criu_restore_objects(struct file *filep,
> -				struct kfd_process *p,
> -				struct kfd_ioctl_criu_args *args,
> -				uint64_t *priv_offset,
> -				uint64_t max_priv_data_size)
> -{
> -	int ret = 0;
> -	uint32_t i;
> -
> -	BUILD_BUG_ON(offsetof(struct kfd_criu_queue_priv_data, object_type));
> -	BUILD_BUG_ON(offsetof(struct kfd_criu_event_priv_data, object_type));
> -	BUILD_BUG_ON(offsetof(struct kfd_criu_svm_range_priv_data, object_type));
> -
> -	for (i = 0; i < args->num_objects; i++) {
> -		uint32_t object_type;
> -
> -		if (*priv_offset + sizeof(object_type) > max_priv_data_size) {
> -			pr_err("Invalid private data size\n");
> -			return -EINVAL;
> -		}
> -
> -		ret = get_user(object_type, (uint32_t __user *)(args->priv_data + *priv_offset));
> -		if (ret) {
> -			pr_err("Failed to copy private information from user\n");
> -			goto exit;
> -		}
> -
> -		switch (object_type) {
> -		case KFD_CRIU_OBJECT_TYPE_QUEUE:
> -			ret = kfd_criu_restore_queue(p, (uint8_t __user *)args->priv_data,
> -						     priv_offset, max_priv_data_size);
> -			if (ret)
> -				goto exit;
> -			break;
> -		case KFD_CRIU_OBJECT_TYPE_EVENT:
> -			ret = kfd_criu_restore_event(filep, p, (uint8_t __user *)args->priv_data,
> -						     priv_offset, max_priv_data_size);
> -			if (ret)
> -				goto exit;
> -			break;
> -		case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
> -			ret = kfd_criu_restore_svm(p, (uint8_t __user *)args->priv_data,
> -						     priv_offset, max_priv_data_size);
> -			if (ret)
> -				goto exit;
> -			break;
> -		default:
> -			pr_err("Invalid object type:%u at index:%d\n", object_type, i);
> -			ret = -EINVAL;
> -			goto exit;
> -		}
> -	}
> -exit:
> -	return ret;
> -}
> -
> -static int criu_restore(struct file *filep,
> -			struct kfd_process *p,
> -			struct kfd_ioctl_criu_args *args)
> -{
> -	uint64_t priv_offset = 0;
> -	int ret = 0;
> -
> -	pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n",
> -		 args->num_devices, args->num_bos, args->num_objects, args->priv_data_size);
> -
> -	if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size ||
> -	    !args->num_devices || !args->num_bos)
> -		return -EINVAL;
> -
> -	mutex_lock(&p->mutex);
> -
> -	/*
> -	 * Set the process to evicted state to avoid running any new queues before all the memory
> -	 * mappings are ready.
> -	 */
> -	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
> -	if (ret)
> -		goto exit_unlock;
> -
> -	/* Each function will adjust priv_offset based on how many bytes they consumed */
> -	ret = criu_restore_process(p, args, &priv_offset, args->priv_data_size);
> -	if (ret)
> -		goto exit_unlock;
> -
> -	ret = criu_restore_devices(p, args, &priv_offset, args->priv_data_size);
> -	if (ret)
> -		goto exit_unlock;
> -
> -	ret = criu_restore_bos(p, args, &priv_offset, args->priv_data_size);
> -	if (ret)
> -		goto exit_unlock;
> -
> -	ret = criu_restore_objects(filep, p, args, &priv_offset, args->priv_data_size);
> -	if (ret)
> -		goto exit_unlock;
> -
> -	if (priv_offset != args->priv_data_size) {
> -		pr_err("Invalid private data size\n");
> -		ret = -EINVAL;
> -	}
> -
> -exit_unlock:
> -	mutex_unlock(&p->mutex);
> -	if (ret)
> -		pr_err("Failed to restore CRIU ret:%d\n", ret);
> -	else
> -		pr_debug("CRIU restore successful\n");
> -
> -	return ret;
> -}
> -
> -static int criu_unpause(struct file *filep,
> -			struct kfd_process *p,
> -			struct kfd_ioctl_criu_args *args)
> -{
> -	int ret;
> -
> -	mutex_lock(&p->mutex);
> -
> -	if (!p->queues_paused) {
> -		mutex_unlock(&p->mutex);
> -		return -EINVAL;
> -	}
> -
> -	ret = kfd_process_restore_queues(p);
> -	if (ret)
> -		pr_err("Failed to unpause queues ret:%d\n", ret);
> -	else
> -		p->queues_paused = false;
> -
> -	mutex_unlock(&p->mutex);
> -
> -	return ret;
> -}
> -
> -static int criu_resume(struct file *filep,
> -			struct kfd_process *p,
> -			struct kfd_ioctl_criu_args *args)
> -{
> -	struct kfd_process *target = NULL;
> -	struct pid *pid = NULL;
> -	int ret = 0;
> -
> -	pr_debug("Inside %s, target pid for criu restore: %d\n", __func__,
> -		 args->pid);
> -
> -	pid = find_get_pid(args->pid);
> -	if (!pid) {
> -		pr_err("Cannot find pid info for %i\n", args->pid);
> -		return -ESRCH;
> -	}
> -
> -	pr_debug("calling kfd_lookup_process_by_pid\n");
> -	target = kfd_lookup_process_by_pid(pid);
> -
> -	put_pid(pid);
> -
> -	if (!target) {
> -		pr_debug("Cannot find process info for %i\n", args->pid);
> -		return -ESRCH;
> -	}
> -
> -	mutex_lock(&target->mutex);
> -	ret = kfd_criu_resume_svm(target);
> -	if (ret) {
> -		pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
> -		goto exit;
> -	}
> -
> -	ret =  amdgpu_amdkfd_criu_resume(target->kgd_process_info);
> -	if (ret)
> -		pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
> -
> -exit:
> -	mutex_unlock(&target->mutex);
> -
> -	kfd_unref_process(target);
> -	return ret;
> -}
> -
> -static int criu_process_info(struct file *filep,
> -				struct kfd_process *p,
> -				struct kfd_ioctl_criu_args *args)
> -{
> -	int ret = 0;
> -
> -	mutex_lock(&p->mutex);
> -
> -	if (!p->n_pdds) {
> -		pr_err("No pdd for given process\n");
> -		ret = -ENODEV;
> -		goto err_unlock;
> -	}
> -
> -	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
> -	if (ret)
> -		goto err_unlock;
> -
> -	p->queues_paused = true;
> -
> -	args->pid = task_pid_nr_ns(p->lead_thread,
> -					task_active_pid_ns(p->lead_thread));
> -
> -	ret = criu_get_process_object_info(p, &args->num_devices, &args->num_bos,
> -					   &args->num_objects, &args->priv_data_size);
> -	if (ret)
> -		goto err_unlock;
> -
> -	dev_dbg(kfd_device, "Num of devices:%u bos:%u objects:%u priv_data_size:%lld\n",
> -				args->num_devices, args->num_bos, args->num_objects,
> -				args->priv_data_size);
> -
> -err_unlock:
> -	if (ret) {
> -		kfd_process_restore_queues(p);
> -		p->queues_paused = false;
> -	}
> -	mutex_unlock(&p->mutex);
> -	return ret;
> -}
> -
>   static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
>   {
>   	struct kfd_ioctl_criu_args *args = data;
> @@ -2720,19 +1760,19 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
>   	dev_dbg(kfd_device, "CRIU operation: %d\n", args->op);
>   	switch (args->op) {
>   	case KFD_CRIU_OP_PROCESS_INFO:
> -		ret = criu_process_info(filep, p, args);
> +		ret = kfd_criu_process_info(filep, p, args);
>   		break;
>   	case KFD_CRIU_OP_CHECKPOINT:
> -		ret = criu_checkpoint(filep, p, args);
> +		ret = kfd_criu_checkpoint(filep, p, args);
>   		break;
>   	case KFD_CRIU_OP_UNPAUSE:
> -		ret = criu_unpause(filep, p, args);
> +		ret = kfd_criu_unpause(filep, p, args);
>   		break;
>   	case KFD_CRIU_OP_RESTORE:
> -		ret = criu_restore(filep, p, args);
> +		ret = kfd_criu_restore(filep, p, args);
>   		break;
>   	case KFD_CRIU_OP_RESUME:
> -		ret = criu_resume(filep, p, args);
> +		ret = kfd_criu_resume(filep, p, args);
>   		break;
>   	default:
>   		dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", args->op);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_criu.c b/drivers/gpu/drm/amd/amdkfd/kfd_criu.c
> new file mode 100644
> index 000000000000..72a9b358a642
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_criu.c
> @@ -0,0 +1,989 @@
> +// SPDX-License-Identifier: GPL-2.0 OR MIT
> +/*
> + * Copyright 2024 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/dma-buf.h>
> +#include <linux/fdtable.h>
> +
> +#include "kfd_criu.h"
> +#include "kfd_svm.h"
> +
> +static int criu_checkpoint_process(struct kfd_process *p,
> +			     uint8_t __user *user_priv_data,
> +			     uint64_t *priv_offset)
> +{
> +	struct kfd_criu_process_priv_data process_priv;
> +	int ret;
> +
> +	memset(&process_priv, 0, sizeof(process_priv));
> +
> +	process_priv.version = KFD_CRIU_PRIV_VERSION;
> +	/* For CR, we don't consider negative xnack mode which is used for
> +	 * querying without changing it, here 0 simply means disabled and 1
> +	 * means enabled so retry for finding a valid PTE.
> +	 */
> +	process_priv.xnack_mode = p->xnack_enabled ? 1 : 0;
> +
> +	ret = copy_to_user(user_priv_data + *priv_offset,
> +				&process_priv, sizeof(process_priv));
> +
> +	if (ret) {
> +		pr_err("Failed to copy process information to user\n");
> +		ret = -EFAULT;
> +	}
> +
> +	*priv_offset += sizeof(process_priv);
> +	return ret;
> +}
> +
> +static int criu_checkpoint_devices(struct kfd_process *p,
> +			     uint32_t num_devices,
> +			     uint8_t __user *user_addr,
> +			     uint8_t __user *user_priv_data,
> +			     uint64_t *priv_offset)
> +{
> +	struct kfd_criu_device_priv_data *device_priv = NULL;
> +	struct kfd_criu_device_bucket *device_buckets = NULL;
> +	int ret = 0, i;
> +
> +	device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), GFP_KERNEL);
> +	if (!device_buckets) {
> +		ret = -ENOMEM;
> +		goto exit;
> +	}
> +
> +	device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL);
> +	if (!device_priv) {
> +		ret = -ENOMEM;
> +		goto exit;
> +	}
> +
> +	for (i = 0; i < num_devices; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +
> +		device_buckets[i].user_gpu_id = pdd->user_gpu_id;
> +		device_buckets[i].actual_gpu_id = pdd->dev->id;
> +
> +		/*
> +		 * priv_data does not contain useful information for now and is reserved for
> +		 * future use, so we do not set its contents.
> +		 */
> +	}
> +
> +	ret = copy_to_user(user_addr, device_buckets, num_devices * sizeof(*device_buckets));
> +	if (ret) {
> +		pr_err("Failed to copy device information to user\n");
> +		ret = -EFAULT;
> +		goto exit;
> +	}
> +
> +	ret = copy_to_user(user_priv_data + *priv_offset,
> +			   device_priv,
> +			   num_devices * sizeof(*device_priv));
> +	if (ret) {
> +		pr_err("Failed to copy device information to user\n");
> +		ret = -EFAULT;
> +	}
> +	*priv_offset += num_devices * sizeof(*device_priv);
> +
> +exit:
> +	kvfree(device_buckets);
> +	kvfree(device_priv);
> +	return ret;
> +}
> +
> +static uint32_t get_process_num_bos(struct kfd_process *p)
> +{
> +	uint32_t num_of_bos = 0;
> +	int i;
> +
> +	/* Run over all PDDs of the process */
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +		void *mem;
> +		int id;
> +
> +		idr_for_each_entry(&pdd->alloc_idr, mem, id) {
> +			struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
> +
> +			if (!kgd_mem->va || kgd_mem->va > pdd->gpuvm_base)
> +				num_of_bos++;
> +		}
> +	}
> +	return num_of_bos;
> +}
> +
> +static int criu_get_prime_handle(struct kgd_mem *mem,
> +				 int flags, u32 *shared_fd)
> +{
> +	struct dma_buf *dmabuf;
> +	int ret;
> +
> +	ret = amdgpu_amdkfd_gpuvm_export_dmabuf(mem, &dmabuf);
> +	if (ret) {
> +		pr_err("dmabuf export failed for the BO\n");
> +		return ret;
> +	}
> +
> +	ret = dma_buf_fd(dmabuf, flags);
> +	if (ret < 0) {
> +		pr_err("dmabuf create fd failed, ret:%d\n", ret);
> +		goto out_free_dmabuf;
> +	}
> +
> +	*shared_fd = ret;
> +	return 0;
> +
> +out_free_dmabuf:
> +	dma_buf_put(dmabuf);
> +	return ret;
> +}
> +
> +static int criu_checkpoint_bos(struct kfd_process *p,
> +			       uint32_t num_bos,
> +			       uint8_t __user *user_bos,
> +			       uint8_t __user *user_priv_data,
> +			       uint64_t *priv_offset)
> +{
> +	struct kfd_criu_bo_bucket *bo_buckets;
> +	struct kfd_criu_bo_priv_data *bo_privs;
> +	int ret = 0, pdd_index, bo_index = 0, id;
> +	void *mem;
> +
> +	bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL);
> +	if (!bo_buckets)
> +		return -ENOMEM;
> +
> +	bo_privs = kvzalloc(num_bos * sizeof(*bo_privs), GFP_KERNEL);
> +	if (!bo_privs) {
> +		ret = -ENOMEM;
> +		goto exit;
> +	}
> +
> +	for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) {
> +		struct kfd_process_device *pdd = p->pdds[pdd_index];
> +		struct amdgpu_bo *dumper_bo;
> +		struct kgd_mem *kgd_mem;
> +
> +		idr_for_each_entry(&pdd->alloc_idr, mem, id) {
> +			struct kfd_criu_bo_bucket *bo_bucket;
> +			struct kfd_criu_bo_priv_data *bo_priv;
> +			int i, dev_idx = 0;
> +
> +			if (!mem) {
> +				ret = -ENOMEM;
> +				goto exit;
> +			}
> +
> +			kgd_mem = (struct kgd_mem *)mem;
> +			dumper_bo = kgd_mem->bo;
> +
> +			/* Skip checkpointing BOs that are used for Trap handler
> +			 * code and state. Currently, these BOs have a VA that
> +			 * is less GPUVM Base
> +			 */
> +			if (kgd_mem->va && kgd_mem->va <= pdd->gpuvm_base)
> +				continue;
> +
> +			bo_bucket = &bo_buckets[bo_index];
> +			bo_priv = &bo_privs[bo_index];
> +
> +			bo_bucket->gpu_id = pdd->user_gpu_id;
> +			bo_bucket->addr = (uint64_t)kgd_mem->va;
> +			bo_bucket->size = amdgpu_bo_size(dumper_bo);
> +			bo_bucket->alloc_flags = (uint32_t)kgd_mem->alloc_flags;
> +			bo_priv->idr_handle = id;
> +
> +			if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> +				ret = amdgpu_ttm_tt_get_userptr(&dumper_bo->tbo,
> +								&bo_priv->user_addr);
> +				if (ret) {
> +					pr_err("Failed to obtain user address for user-pointer bo\n");
> +					goto exit;
> +				}
> +			}
> +			if (bo_bucket->alloc_flags
> +			    & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
> +				ret = criu_get_prime_handle(kgd_mem,
> +						bo_bucket->alloc_flags &
> +						KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0,
> +						&bo_bucket->dmabuf_fd);
> +				if (ret)
> +					goto exit;
> +			} else {
> +				bo_bucket->dmabuf_fd = KFD_INVALID_FD;
> +			}
> +
> +			if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
> +				bo_bucket->offset = KFD_MMAP_TYPE_DOORBELL |
> +					KFD_MMAP_GPU_ID(pdd->dev->id);
> +			else if (bo_bucket->alloc_flags &
> +				KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)
> +				bo_bucket->offset = KFD_MMAP_TYPE_MMIO |
> +					KFD_MMAP_GPU_ID(pdd->dev->id);
> +			else
> +				bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo);
> +
> +			for (i = 0; i < p->n_pdds; i++) {
> +				if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem))
> +					bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id;
> +			}
> +
> +			pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
> +					"gpu_id = 0x%x alloc_flags = 0x%x idr_handle = 0x%x",
> +					bo_bucket->size,
> +					bo_bucket->addr,
> +					bo_bucket->offset,
> +					bo_bucket->gpu_id,
> +					bo_bucket->alloc_flags,
> +					bo_priv->idr_handle);
> +			bo_index++;
> +		}
> +	}
> +
> +	ret = copy_to_user(user_bos, bo_buckets, num_bos * sizeof(*bo_buckets));
> +	if (ret) {
> +		pr_err("Failed to copy BO information to user\n");
> +		ret = -EFAULT;
> +		goto exit;
> +	}
> +
> +	ret = copy_to_user(user_priv_data + *priv_offset, bo_privs, num_bos * sizeof(*bo_privs));
> +	if (ret) {
> +		pr_err("Failed to copy BO priv information to user\n");
> +		ret = -EFAULT;
> +		goto exit;
> +	}
> +
> +	*priv_offset += num_bos * sizeof(*bo_privs);
> +
> +exit:
> +	while (ret && bo_index--) {
> +		if (bo_buckets[bo_index].alloc_flags
> +		    & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
> +			close_fd(bo_buckets[bo_index].dmabuf_fd);
> +	}
> +
> +	kvfree(bo_buckets);
> +	kvfree(bo_privs);
> +	return ret;
> +}
> +
> +static int criu_get_process_object_info(struct kfd_process *p,
> +					uint32_t *num_devices,
> +					uint32_t *num_bos,
> +					uint32_t *num_objects,
> +					uint64_t *objs_priv_size)
> +{
> +	uint64_t queues_priv_data_size, svm_priv_data_size, priv_size;
> +	uint32_t num_queues, num_events, num_svm_ranges;
> +	int ret;
> +
> +	*num_devices = p->n_pdds;
> +	*num_bos = get_process_num_bos(p);
> +
> +	ret = kfd_process_get_queue_info(p, &num_queues, &queues_priv_data_size);
> +	if (ret)
> +		return ret;
> +
> +	num_events = kfd_get_num_events(p);
> +
> +	ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size);
> +	if (ret)
> +		return ret;
> +
> +	*num_objects = num_queues + num_events + num_svm_ranges;
> +
> +	if (objs_priv_size) {
> +		priv_size = sizeof(struct kfd_criu_process_priv_data);
> +		priv_size += *num_devices * sizeof(struct kfd_criu_device_priv_data);
> +		priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
> +		priv_size += queues_priv_data_size;
> +		priv_size += num_events * sizeof(struct kfd_criu_event_priv_data);
> +		priv_size += svm_priv_data_size;
> +		*objs_priv_size = priv_size;
> +	}
> +	return 0;
> +}
> +
> +int kfd_criu_checkpoint(struct file *filep,
> +			   struct kfd_process *p,
> +			   struct kfd_ioctl_criu_args *args)
> +{
> +	int ret;
> +	uint32_t num_devices, num_bos, num_objects;
> +	uint64_t priv_size, priv_offset = 0, bo_priv_offset;
> +
> +	if (!args->devices || !args->bos || !args->priv_data)
> +		return -EINVAL;
> +
> +	mutex_lock(&p->mutex);
> +
> +	if (!p->n_pdds) {
> +		pr_err("No pdd for given process\n");
> +		ret = -ENODEV;
> +		goto exit_unlock;
> +	}
> +
> +	/* Confirm all process queues are evicted */
> +	if (!p->queues_paused) {
> +		pr_err("Cannot dump process when queues are not in evicted state\n");
> +		/* CRIU plugin did not call op PROCESS_INFO before checkpointing */
> +		ret = -EINVAL;
> +		goto exit_unlock;
> +	}
> +
> +	ret = criu_get_process_object_info(p, &num_devices, &num_bos, &num_objects, &priv_size);
> +	if (ret)
> +		goto exit_unlock;
> +
> +	if (num_devices != args->num_devices ||
> +	    num_bos != args->num_bos ||
> +	    num_objects != args->num_objects ||
> +	    priv_size != args->priv_data_size) {
> +
> +		ret = -EINVAL;
> +		goto exit_unlock;
> +	}
> +
> +	/* each function will store private data inside priv_data and adjust priv_offset */
> +	ret = criu_checkpoint_process(p, (uint8_t __user *)args->priv_data, &priv_offset);
> +	if (ret)
> +		goto exit_unlock;
> +
> +	ret = criu_checkpoint_devices(p, num_devices, (uint8_t __user *)args->devices,
> +				(uint8_t __user *)args->priv_data, &priv_offset);
> +	if (ret)
> +		goto exit_unlock;
> +
> +	/* Leave room for BOs in the private data. They need to be restored
> +	 * before events, but we checkpoint them last to simplify the error
> +	 * handling.
> +	 */
> +	bo_priv_offset = priv_offset;
> +	priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data);
> +
> +	if (num_objects) {
> +		ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
> +						 &priv_offset);
> +		if (ret)
> +			goto exit_unlock;
> +
> +		ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data,
> +						 &priv_offset);
> +		if (ret)
> +			goto exit_unlock;
> +
> +		ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset);
> +		if (ret)
> +			goto exit_unlock;
> +	}
> +
> +	/* This must be the last thing in this function that can fail.
> +	 * Otherwise we leak dmabuf file descriptors.
> +	 */
> +	ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
> +			   (uint8_t __user *)args->priv_data, &bo_priv_offset);
> +
> +exit_unlock:
> +	mutex_unlock(&p->mutex);
> +	if (ret)
> +		pr_err("Failed to dump CRIU ret:%d\n", ret);
> +	else
> +		pr_debug("CRIU dump ret:%d\n", ret);
> +
> +	return ret;
> +}
> +
> +static int criu_restore_process(struct kfd_process *p,
> +				struct kfd_ioctl_criu_args *args,
> +				uint64_t *priv_offset,
> +				uint64_t max_priv_data_size)
> +{
> +	int ret = 0;
> +	struct kfd_criu_process_priv_data process_priv;
> +
> +	if (*priv_offset + sizeof(process_priv) > max_priv_data_size)
> +		return -EINVAL;
> +
> +	ret = copy_from_user(&process_priv,
> +				(void __user *)(args->priv_data + *priv_offset),
> +				sizeof(process_priv));
> +	if (ret) {
> +		pr_err("Failed to copy process private information from user\n");
> +		ret = -EFAULT;
> +		goto exit;
> +	}
> +	*priv_offset += sizeof(process_priv);
> +
> +	if (process_priv.version != KFD_CRIU_PRIV_VERSION) {
> +		pr_err("Invalid CRIU API version (checkpointed:%d current:%d)\n",
> +			process_priv.version, KFD_CRIU_PRIV_VERSION);
> +		return -EINVAL;
> +	}
> +
> +	pr_debug("Setting XNACK mode\n");
> +	if (process_priv.xnack_mode && !kfd_process_xnack_mode(p, true)) {
> +		pr_err("xnack mode cannot be set\n");
> +		ret = -EPERM;
> +		goto exit;
> +	} else {
> +		pr_debug("set xnack mode: %d\n", process_priv.xnack_mode);
> +		p->xnack_enabled = process_priv.xnack_mode;
> +	}
> +
> +exit:
> +	return ret;
> +}
> +
> +static int criu_restore_devices(struct kfd_process *p,
> +				struct kfd_ioctl_criu_args *args,
> +				uint64_t *priv_offset,
> +				uint64_t max_priv_data_size)
> +{
> +	struct kfd_criu_device_bucket *device_buckets;
> +	struct kfd_criu_device_priv_data *device_privs;
> +	int ret = 0;
> +	uint32_t i;
> +
> +	if (args->num_devices != p->n_pdds)
> +		return -EINVAL;
> +
> +	if (*priv_offset + (args->num_devices * sizeof(*device_privs)) > max_priv_data_size)
> +		return -EINVAL;
> +
> +	device_buckets = kmalloc_array(args->num_devices, sizeof(*device_buckets), GFP_KERNEL);
> +	if (!device_buckets)
> +		return -ENOMEM;
> +
> +	ret = copy_from_user(device_buckets, (void __user *)args->devices,
> +				args->num_devices * sizeof(*device_buckets));
> +	if (ret) {
> +		pr_err("Failed to copy devices buckets from user\n");
> +		ret = -EFAULT;
> +		goto exit;
> +	}
> +
> +	for (i = 0; i < args->num_devices; i++) {
> +		struct kfd_node *dev;
> +		struct kfd_process_device *pdd;
> +		struct file *drm_file;
> +
> +		/* device private data is not currently used */
> +
> +		if (!device_buckets[i].user_gpu_id) {
> +			pr_err("Invalid user gpu_id\n");
> +			ret = -EINVAL;
> +			goto exit;
> +		}
> +
> +		dev = kfd_device_by_id(device_buckets[i].actual_gpu_id);
> +		if (!dev) {
> +			pr_err("Failed to find device with gpu_id = %x\n",
> +				device_buckets[i].actual_gpu_id);
> +			ret = -EINVAL;
> +			goto exit;
> +		}
> +
> +		pdd = kfd_get_process_device_data(dev, p);
> +		if (!pdd) {
> +			pr_err("Failed to get pdd for gpu_id = %x\n",
> +					device_buckets[i].actual_gpu_id);
> +			ret = -EINVAL;
> +			goto exit;
> +		}
> +		pdd->user_gpu_id = device_buckets[i].user_gpu_id;
> +
> +		drm_file = fget(device_buckets[i].drm_fd);
> +		if (!drm_file) {
> +			pr_err("Invalid render node file descriptor sent from plugin (%d)\n",
> +				device_buckets[i].drm_fd);
> +			ret = -EINVAL;
> +			goto exit;
> +		}
> +
> +		if (pdd->drm_file) {
> +			ret = -EINVAL;
> +			goto exit;
> +		}
> +
> +		/* create the vm using render nodes for kfd pdd */
> +		if (kfd_process_device_init_vm(pdd, drm_file)) {
> +			pr_err("could not init vm for given pdd\n");
> +			/* On success, the PDD keeps the drm_file reference */
> +			fput(drm_file);
> +			ret = -EINVAL;
> +			goto exit;
> +		}
> +		/*
> +		 * pdd now already has the vm bound to render node so below api won't create a new
> +		 * exclusive kfd mapping but use existing one with renderDXXX but is still needed
> +		 * for iommu v2 binding  and runtime pm.
> +		 */
> +		pdd = kfd_bind_process_to_device(dev, p);
> +		if (IS_ERR(pdd)) {
> +			ret = PTR_ERR(pdd);
> +			goto exit;
> +		}
> +
> +		if (!pdd->qpd.proc_doorbells) {
> +			ret = kfd_alloc_process_doorbells(dev->kfd, pdd);
> +			if (ret)
> +				goto exit;
> +		}
> +	}
> +
> +	/*
> +	 * We are not copying device private data from user as we are not using the data for now,
> +	 * but we still adjust for its private data.
> +	 */
> +	*priv_offset += args->num_devices * sizeof(*device_privs);
> +
> +exit:
> +	kfree(device_buckets);
> +	return ret;
> +}
> +
> +static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd,
> +				      struct kfd_criu_bo_bucket *bo_bucket,
> +				      struct kfd_criu_bo_priv_data *bo_priv,
> +				      struct kgd_mem **kgd_mem)
> +{
> +	int idr_handle;
> +	int ret;
> +	const bool criu_resume = true;
> +	u64 offset;
> +
> +	if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
> +		if (bo_bucket->size !=
> +				kfd_doorbell_process_slice(pdd->dev->kfd))
> +			return -EINVAL;
> +
> +		offset = kfd_get_process_doorbells(pdd);
> +		if (!offset)
> +			return -ENOMEM;
> +	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> +		/* MMIO BOs need remapped bus address */
> +		if (bo_bucket->size != PAGE_SIZE) {
> +			pr_err("Invalid page size\n");
> +			return -EINVAL;
> +		}
> +		offset = pdd->dev->adev->rmmio_remap.bus_addr;
> +		if (!offset) {
> +			pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n");
> +			return -ENOMEM;
> +		}
> +	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> +		offset = bo_priv->user_addr;
> +	}
> +	/* Create the BO */
> +	ret = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(pdd->dev->adev, bo_bucket->addr,
> +						      bo_bucket->size, pdd->drm_priv, kgd_mem,
> +						      &offset, bo_bucket->alloc_flags, criu_resume);
> +	if (ret) {
> +		pr_err("Could not create the BO\n");
> +		return ret;
> +	}
> +	pr_debug("New BO created: size:0x%llx addr:0x%llx offset:0x%llx\n",
> +		 bo_bucket->size, bo_bucket->addr, offset);
> +
> +	/* Restore previous IDR handle */
> +	pr_debug("Restoring old IDR handle for the BO");
> +	idr_handle = idr_alloc(&pdd->alloc_idr, *kgd_mem, bo_priv->idr_handle,
> +			       bo_priv->idr_handle + 1, GFP_KERNEL);
> +
> +	if (idr_handle < 0) {
> +		pr_err("Could not allocate idr\n");
> +		amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, *kgd_mem, pdd->drm_priv,
> +						       NULL);
> +		return -ENOMEM;
> +	}
> +
> +	if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
> +		bo_bucket->restored_offset = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(pdd->dev->id);
> +	if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> +		bo_bucket->restored_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(pdd->dev->id);
> +	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
> +		bo_bucket->restored_offset = offset;
> +	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
> +		bo_bucket->restored_offset = offset;
> +		/* Update the VRAM usage count */
> +		WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
> +	}
> +	return 0;
> +}
> +
> +static int criu_restore_bo(struct kfd_process *p,
> +			   struct kfd_criu_bo_bucket *bo_bucket,
> +			   struct kfd_criu_bo_priv_data *bo_priv)
> +{
> +	struct kfd_process_device *pdd;
> +	struct kgd_mem *kgd_mem;
> +	int ret;
> +	int j;
> +
> +	pr_debug("Restoring BO size:0x%llx addr:0x%llx gpu_id:0x%x flags:0x%x idr_handle:0x%x\n",
> +		 bo_bucket->size, bo_bucket->addr, bo_bucket->gpu_id, bo_bucket->alloc_flags,
> +		 bo_priv->idr_handle);
> +
> +	pdd = kfd_process_device_data_by_id(p, bo_bucket->gpu_id);
> +	if (!pdd) {
> +		pr_err("Failed to get pdd\n");
> +		return -ENODEV;
> +	}
> +
> +	ret = criu_restore_memory_of_gpu(pdd, bo_bucket, bo_priv, &kgd_mem);
> +	if (ret)
> +		return ret;
> +
> +	/* now map these BOs to GPU/s */
> +	for (j = 0; j < p->n_pdds; j++) {
> +		struct kfd_node *peer;
> +		struct kfd_process_device *peer_pdd;
> +
> +		if (!bo_priv->mapped_gpuids[j])
> +			break;
> +
> +		peer_pdd = kfd_process_device_data_by_id(p, bo_priv->mapped_gpuids[j]);
> +		if (!peer_pdd)
> +			return -EINVAL;
> +
> +		peer = peer_pdd->dev;
> +
> +		peer_pdd = kfd_bind_process_to_device(peer, p);
> +		if (IS_ERR(peer_pdd))
> +			return PTR_ERR(peer_pdd);
> +
> +		ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev, kgd_mem,
> +							    peer_pdd->drm_priv);
> +		if (ret) {
> +			pr_err("Failed to map to gpu %d/%d\n", j, p->n_pdds);
> +			return ret;
> +		}
> +	}
> +
> +	pr_debug("map memory was successful for the BO\n");
> +	/* create the dmabuf object and export the bo */
> +	if (bo_bucket->alloc_flags
> +	    & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
> +		ret = criu_get_prime_handle(kgd_mem, DRM_RDWR,
> +					    &bo_bucket->dmabuf_fd);
> +		if (ret)
> +			return ret;
> +	} else {
> +		bo_bucket->dmabuf_fd = KFD_INVALID_FD;
> +	}
> +
> +	return 0;
> +}
> +
> +static int criu_restore_bos(struct kfd_process *p,
> +			    struct kfd_ioctl_criu_args *args,
> +			    uint64_t *priv_offset,
> +			    uint64_t max_priv_data_size)
> +{
> +	struct kfd_criu_bo_bucket *bo_buckets = NULL;
> +	struct kfd_criu_bo_priv_data *bo_privs = NULL;
> +	int ret = 0;
> +	uint32_t i = 0;
> +
> +	if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size)
> +		return -EINVAL;
> +
> +	/* Prevent MMU notifications until stage-4 IOCTL (CRIU_RESUME) is received */
> +	amdgpu_amdkfd_block_mmu_notifications(p->kgd_process_info);
> +
> +	bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL);
> +	if (!bo_buckets)
> +		return -ENOMEM;
> +
> +	ret = copy_from_user(bo_buckets, (void __user *)args->bos,
> +			     args->num_bos * sizeof(*bo_buckets));
> +	if (ret) {
> +		pr_err("Failed to copy BOs information from user\n");
> +		ret = -EFAULT;
> +		goto exit;
> +	}
> +
> +	bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL);
> +	if (!bo_privs) {
> +		ret = -ENOMEM;
> +		goto exit;
> +	}
> +
> +	ret = copy_from_user(bo_privs, (void __user *)args->priv_data + *priv_offset,
> +			     args->num_bos * sizeof(*bo_privs));
> +	if (ret) {
> +		pr_err("Failed to copy BOs information from user\n");
> +		ret = -EFAULT;
> +		goto exit;
> +	}
> +	*priv_offset += args->num_bos * sizeof(*bo_privs);
> +
> +	/* Create and map new BOs */
> +	for (; i < args->num_bos; i++) {
> +		ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i]);
> +		if (ret) {
> +			pr_debug("Failed to restore BO[%d] ret%d\n", i, ret);
> +			goto exit;
> +		}
> +	} /* done */
> +
> +	/* Copy only the buckets back so user can read bo_buckets[N].restored_offset */
> +	ret = copy_to_user((void __user *)args->bos,
> +				bo_buckets,
> +				(args->num_bos * sizeof(*bo_buckets)));
> +	if (ret)
> +		ret = -EFAULT;
> +
> +exit:
> +	while (ret && i--) {
> +		if (bo_buckets[i].alloc_flags
> +		   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
> +			close_fd(bo_buckets[i].dmabuf_fd);
> +	}
> +	kvfree(bo_buckets);
> +	kvfree(bo_privs);
> +	return ret;
> +}
> +
> +static int criu_restore_objects(struct file *filep,
> +				struct kfd_process *p,
> +				struct kfd_ioctl_criu_args *args,
> +				uint64_t *priv_offset,
> +				uint64_t max_priv_data_size)
> +{
> +	int ret = 0;
> +	uint32_t i;
> +
> +	BUILD_BUG_ON(offsetof(struct kfd_criu_queue_priv_data, object_type));
> +	BUILD_BUG_ON(offsetof(struct kfd_criu_event_priv_data, object_type));
> +	BUILD_BUG_ON(offsetof(struct kfd_criu_svm_range_priv_data, object_type));
> +
> +	for (i = 0; i < args->num_objects; i++) {
> +		uint32_t object_type;
> +
> +		if (*priv_offset + sizeof(object_type) > max_priv_data_size) {
> +			pr_err("Invalid private data size\n");
> +			return -EINVAL;
> +		}
> +
> +		ret = get_user(object_type, (uint32_t __user *)(args->priv_data + *priv_offset));
> +		if (ret) {
> +			pr_err("Failed to copy private information from user\n");
> +			goto exit;
> +		}
> +
> +		switch (object_type) {
> +		case KFD_CRIU_OBJECT_TYPE_QUEUE:
> +			ret = kfd_criu_restore_queue(p, (uint8_t __user *)args->priv_data,
> +						     priv_offset, max_priv_data_size);
> +			if (ret)
> +				goto exit;
> +			break;
> +		case KFD_CRIU_OBJECT_TYPE_EVENT:
> +			ret = kfd_criu_restore_event(filep, p, (uint8_t __user *)args->priv_data,
> +						     priv_offset, max_priv_data_size);
> +			if (ret)
> +				goto exit;
> +			break;
> +		case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
> +			ret = kfd_criu_restore_svm(p, (uint8_t __user *)args->priv_data,
> +						     priv_offset, max_priv_data_size);
> +			if (ret)
> +				goto exit;
> +			break;
> +		default:
> +			pr_err("Invalid object type:%u at index:%d\n", object_type, i);
> +			ret = -EINVAL;
> +			goto exit;
> +		}
> +	}
> +exit:
> +	return ret;
> +}
> +
> +int kfd_criu_restore(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args)
> +{
> +	uint64_t priv_offset = 0;
> +	int ret = 0;
> +
> +	pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n",
> +		 args->num_devices, args->num_bos, args->num_objects, args->priv_data_size);
> +
> +	if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size ||
> +	    !args->num_devices || !args->num_bos)
> +		return -EINVAL;
> +
> +	mutex_lock(&p->mutex);
> +
> +	/*
> +	 * Set the process to evicted state to avoid running any new queues before all the memory
> +	 * mappings are ready.
> +	 */
> +	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
> +	if (ret)
> +		goto exit_unlock;
> +
> +	/* Each function will adjust priv_offset based on how many bytes they consumed */
> +	ret = criu_restore_process(p, args, &priv_offset, args->priv_data_size);
> +	if (ret)
> +		goto exit_unlock;
> +
> +	ret = criu_restore_devices(p, args, &priv_offset, args->priv_data_size);
> +	if (ret)
> +		goto exit_unlock;
> +
> +	ret = criu_restore_bos(p, args, &priv_offset, args->priv_data_size);
> +	if (ret)
> +		goto exit_unlock;
> +
> +	ret = criu_restore_objects(filep, p, args, &priv_offset, args->priv_data_size);
> +	if (ret)
> +		goto exit_unlock;
> +
> +	if (priv_offset != args->priv_data_size) {
> +		pr_err("Invalid private data size\n");
> +		ret = -EINVAL;
> +	}
> +
> +exit_unlock:
> +	mutex_unlock(&p->mutex);
> +	if (ret)
> +		pr_err("Failed to restore CRIU ret:%d\n", ret);
> +	else
> +		pr_debug("CRIU restore successful\n");
> +
> +	return ret;
> +}
> +
> +int kfd_criu_unpause(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args)
> +{
> +	int ret;
> +
> +	mutex_lock(&p->mutex);
> +
> +	if (!p->queues_paused) {
> +		mutex_unlock(&p->mutex);
> +		return -EINVAL;
> +	}
> +
> +	ret = kfd_process_restore_queues(p);
> +	if (ret)
> +		pr_err("Failed to unpause queues ret:%d\n", ret);
> +	else
> +		p->queues_paused = false;
> +
> +	mutex_unlock(&p->mutex);
> +
> +	return ret;
> +}
> +
> +int kfd_criu_resume(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args)
> +{
> +	struct kfd_process *target = NULL;
> +	struct pid *pid = NULL;
> +	int ret = 0;
> +
> +	pr_debug("Inside %s, target pid for criu restore: %d\n", __func__,
> +		 args->pid);
> +
> +	pid = find_get_pid(args->pid);
> +	if (!pid) {
> +		pr_err("Cannot find pid info for %i\n", args->pid);
> +		return -ESRCH;
> +	}
> +
> +	pr_debug("calling kfd_lookup_process_by_pid\n");
> +	target = kfd_lookup_process_by_pid(pid);
> +
> +	put_pid(pid);
> +
> +	if (!target) {
> +		pr_debug("Cannot find process info for %i\n", args->pid);
> +		return -ESRCH;
> +	}
> +
> +	mutex_lock(&target->mutex);
> +	ret = kfd_criu_resume_svm(target);
> +	if (ret) {
> +		pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
> +		goto exit;
> +	}
> +
> +	ret =  amdgpu_amdkfd_criu_resume(target->kgd_process_info);
> +	if (ret)
> +		pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
> +
> +exit:
> +	mutex_unlock(&target->mutex);
> +
> +	kfd_unref_process(target);
> +	return ret;
> +}
> +
> +int kfd_criu_process_info(struct file *filep,
> +				struct kfd_process *p,
> +				struct kfd_ioctl_criu_args *args)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&p->mutex);
> +
> +	if (!p->n_pdds) {
> +		pr_err("No pdd for given process\n");
> +		ret = -ENODEV;
> +		goto err_unlock;
> +	}
> +
> +	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
> +	if (ret)
> +		goto err_unlock;
> +
> +	p->queues_paused = true;
> +
> +	args->pid = task_pid_nr_ns(p->lead_thread,
> +					task_active_pid_ns(p->lead_thread));
> +
> +	ret = criu_get_process_object_info(p, &args->num_devices, &args->num_bos,
> +					   &args->num_objects, &args->priv_data_size);
> +	if (ret)
> +		goto err_unlock;
> +
> +	dev_dbg(kfd_device, "Num of devices:%u bos:%u objects:%u priv_data_size:%lld\n",
> +				args->num_devices, args->num_bos, args->num_objects,
> +				args->priv_data_size);
> +
> +err_unlock:
> +	if (ret) {
> +		kfd_process_restore_queues(p);
> +		p->queues_paused = false;
> +	}
> +	mutex_unlock(&p->mutex);
> +	return ret;
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_criu.h b/drivers/gpu/drm/amd/amdkfd/kfd_criu.h
> new file mode 100644
> index 000000000000..1a3d418a9505
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_criu.h
> @@ -0,0 +1,50 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/*
> + * Copyright 2024 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef __KFD_CRIU_H__
> +#define __KFD_CRIU_H__
> +
> +#include <uapi/linux/kfd_ioctl.h>
> +#include "kfd_priv.h"
> +
> +int kfd_criu_process_info(struct file *filep,
> +				struct kfd_process *p,
> +				struct kfd_ioctl_criu_args *args);
> +
> +int kfd_criu_checkpoint(struct file *filep,
> +			   struct kfd_process *p,
> +			   struct kfd_ioctl_criu_args *args);
> +
> +int kfd_criu_unpause(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args);
> +
> +int kfd_criu_restore(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args);
> +
> +int kfd_criu_resume(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args);
> +
> +#endif

