[PATCH] drm/amdkfd: Refactor kfd CRIU into its own file
Felix Kuehling
felix.kuehling at amd.com
Mon May 6 20:10:52 UTC 2024
On 2024-05-06 15:20, David Francis wrote:
> The kfd CRIU code takes up about a thousand lines
> in the kfd_chardev file; move it to its own file.
>
> No functional change intended.
>
> Signed-off-by: David Francis <David.Francis at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/Makefile | 1 +
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 972 +---------------------
> drivers/gpu/drm/amd/amdkfd/kfd_criu.c | 989 +++++++++++++++++++++++
> drivers/gpu/drm/amd/amdkfd/kfd_criu.h | 50 ++
> 4 files changed, 1046 insertions(+), 966 deletions(-)
> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.c
> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 0d3d8972240d..e06af4073ac5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -32,6 +32,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
> $(AMDKFD_PATH)/kfd_flat_memory.o \
> $(AMDKFD_PATH)/kfd_process.o \
> $(AMDKFD_PATH)/kfd_queue.o \
> + $(AMDKFD_PATH)/kfd_criu.o \
Any particular reason for adding this in the middle and not at the end?
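I.e. something like this, with the new entry appended after the existing
ones (sketch only; the "..." stands in for the entries that are not
visible in this hunk):

	$(AMDKFD_PATH)/kfd_queue.o \
	$(AMDKFD_PATH)/kfd_mqd_manager.o \
	...
	$(AMDKFD_PATH)/kfd_criu.o \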
> $(AMDKFD_PATH)/kfd_mqd_manager.o \
> $(AMDKFD_PATH)/kfd_mqd_manager_cik.o \
> $(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 6b713fb0b818..e6e44a199a93 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -45,6 +45,7 @@
Can you remove #include <linux/fdtable.h> and "amdgpu_dma_buf.h" here?
Or are they still needed by something else left in kfd_chardev.c?
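Assuming nothing else left in kfd_chardev.c still uses them, the cleanup
would just be dropping the two #include lines, either in this patch or as
a follow-up. E.g. for the one visible in this hunk (untested sketch; the
linux/fdtable.h hunk would look the same, its surrounding context just
isn't quoted here):

 #include "kfd_smi_events.h"
-#include "amdgpu_dma_buf.h"
 #include "kfd_debug.h"
 #include "kfd_criu.h"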
Other than that, this patch is
Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>
> #include "kfd_smi_events.h"
> #include "amdgpu_dma_buf.h"
> #include "kfd_debug.h"
> +#include "kfd_criu.h"
>
> static long kfd_ioctl(struct file *, unsigned int, unsigned long);
> static int kfd_open(struct inode *, struct file *);
> @@ -1751,967 +1752,6 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
> }
> #endif
>
> -static int criu_checkpoint_process(struct kfd_process *p,
> - uint8_t __user *user_priv_data,
> - uint64_t *priv_offset)
> -{
> - struct kfd_criu_process_priv_data process_priv;
> - int ret;
> -
> - memset(&process_priv, 0, sizeof(process_priv));
> -
> - process_priv.version = KFD_CRIU_PRIV_VERSION;
> - /* For CR, we don't consider negative xnack mode which is used for
> - * querying without changing it, here 0 simply means disabled and 1
> - * means enabled so retry for finding a valid PTE.
> - */
> - process_priv.xnack_mode = p->xnack_enabled ? 1 : 0;
> -
> - ret = copy_to_user(user_priv_data + *priv_offset,
> - &process_priv, sizeof(process_priv));
> -
> - if (ret) {
> - pr_err("Failed to copy process information to user\n");
> - ret = -EFAULT;
> - }
> -
> - *priv_offset += sizeof(process_priv);
> - return ret;
> -}
> -
> -static int criu_checkpoint_devices(struct kfd_process *p,
> - uint32_t num_devices,
> - uint8_t __user *user_addr,
> - uint8_t __user *user_priv_data,
> - uint64_t *priv_offset)
> -{
> - struct kfd_criu_device_priv_data *device_priv = NULL;
> - struct kfd_criu_device_bucket *device_buckets = NULL;
> - int ret = 0, i;
> -
> - device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), GFP_KERNEL);
> - if (!device_buckets) {
> - ret = -ENOMEM;
> - goto exit;
> - }
> -
> - device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL);
> - if (!device_priv) {
> - ret = -ENOMEM;
> - goto exit;
> - }
> -
> - for (i = 0; i < num_devices; i++) {
> - struct kfd_process_device *pdd = p->pdds[i];
> -
> - device_buckets[i].user_gpu_id = pdd->user_gpu_id;
> - device_buckets[i].actual_gpu_id = pdd->dev->id;
> -
> - /*
> - * priv_data does not contain useful information for now and is reserved for
> - * future use, so we do not set its contents.
> - */
> - }
> -
> - ret = copy_to_user(user_addr, device_buckets, num_devices * sizeof(*device_buckets));
> - if (ret) {
> - pr_err("Failed to copy device information to user\n");
> - ret = -EFAULT;
> - goto exit;
> - }
> -
> - ret = copy_to_user(user_priv_data + *priv_offset,
> - device_priv,
> - num_devices * sizeof(*device_priv));
> - if (ret) {
> - pr_err("Failed to copy device information to user\n");
> - ret = -EFAULT;
> - }
> - *priv_offset += num_devices * sizeof(*device_priv);
> -
> -exit:
> - kvfree(device_buckets);
> - kvfree(device_priv);
> - return ret;
> -}
> -
> -static uint32_t get_process_num_bos(struct kfd_process *p)
> -{
> - uint32_t num_of_bos = 0;
> - int i;
> -
> - /* Run over all PDDs of the process */
> - for (i = 0; i < p->n_pdds; i++) {
> - struct kfd_process_device *pdd = p->pdds[i];
> - void *mem;
> - int id;
> -
> - idr_for_each_entry(&pdd->alloc_idr, mem, id) {
> - struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
> -
> - if (!kgd_mem->va || kgd_mem->va > pdd->gpuvm_base)
> - num_of_bos++;
> - }
> - }
> - return num_of_bos;
> -}
> -
> -static int criu_get_prime_handle(struct kgd_mem *mem,
> - int flags, u32 *shared_fd)
> -{
> - struct dma_buf *dmabuf;
> - int ret;
> -
> - ret = amdgpu_amdkfd_gpuvm_export_dmabuf(mem, &dmabuf);
> - if (ret) {
> - pr_err("dmabuf export failed for the BO\n");
> - return ret;
> - }
> -
> - ret = dma_buf_fd(dmabuf, flags);
> - if (ret < 0) {
> - pr_err("dmabuf create fd failed, ret:%d\n", ret);
> - goto out_free_dmabuf;
> - }
> -
> - *shared_fd = ret;
> - return 0;
> -
> -out_free_dmabuf:
> - dma_buf_put(dmabuf);
> - return ret;
> -}
> -
> -static int criu_checkpoint_bos(struct kfd_process *p,
> - uint32_t num_bos,
> - uint8_t __user *user_bos,
> - uint8_t __user *user_priv_data,
> - uint64_t *priv_offset)
> -{
> - struct kfd_criu_bo_bucket *bo_buckets;
> - struct kfd_criu_bo_priv_data *bo_privs;
> - int ret = 0, pdd_index, bo_index = 0, id;
> - void *mem;
> -
> - bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL);
> - if (!bo_buckets)
> - return -ENOMEM;
> -
> - bo_privs = kvzalloc(num_bos * sizeof(*bo_privs), GFP_KERNEL);
> - if (!bo_privs) {
> - ret = -ENOMEM;
> - goto exit;
> - }
> -
> - for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) {
> - struct kfd_process_device *pdd = p->pdds[pdd_index];
> - struct amdgpu_bo *dumper_bo;
> - struct kgd_mem *kgd_mem;
> -
> - idr_for_each_entry(&pdd->alloc_idr, mem, id) {
> - struct kfd_criu_bo_bucket *bo_bucket;
> - struct kfd_criu_bo_priv_data *bo_priv;
> - int i, dev_idx = 0;
> -
> - if (!mem) {
> - ret = -ENOMEM;
> - goto exit;
> - }
> -
> - kgd_mem = (struct kgd_mem *)mem;
> - dumper_bo = kgd_mem->bo;
> -
> - /* Skip checkpointing BOs that are used for Trap handler
> - * code and state. Currently, these BOs have a VA that
> - * is less GPUVM Base
> - */
> - if (kgd_mem->va && kgd_mem->va <= pdd->gpuvm_base)
> - continue;
> -
> - bo_bucket = &bo_buckets[bo_index];
> - bo_priv = &bo_privs[bo_index];
> -
> - bo_bucket->gpu_id = pdd->user_gpu_id;
> - bo_bucket->addr = (uint64_t)kgd_mem->va;
> - bo_bucket->size = amdgpu_bo_size(dumper_bo);
> - bo_bucket->alloc_flags = (uint32_t)kgd_mem->alloc_flags;
> - bo_priv->idr_handle = id;
> -
> - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> - ret = amdgpu_ttm_tt_get_userptr(&dumper_bo->tbo,
> - &bo_priv->user_addr);
> - if (ret) {
> - pr_err("Failed to obtain user address for user-pointer bo\n");
> - goto exit;
> - }
> - }
> - if (bo_bucket->alloc_flags
> - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
> - ret = criu_get_prime_handle(kgd_mem,
> - bo_bucket->alloc_flags &
> - KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0,
> - &bo_bucket->dmabuf_fd);
> - if (ret)
> - goto exit;
> - } else {
> - bo_bucket->dmabuf_fd = KFD_INVALID_FD;
> - }
> -
> - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
> - bo_bucket->offset = KFD_MMAP_TYPE_DOORBELL |
> - KFD_MMAP_GPU_ID(pdd->dev->id);
> - else if (bo_bucket->alloc_flags &
> - KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)
> - bo_bucket->offset = KFD_MMAP_TYPE_MMIO |
> - KFD_MMAP_GPU_ID(pdd->dev->id);
> - else
> - bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo);
> -
> - for (i = 0; i < p->n_pdds; i++) {
> - if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem))
> - bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id;
> - }
> -
> - pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
> - "gpu_id = 0x%x alloc_flags = 0x%x idr_handle = 0x%x",
> - bo_bucket->size,
> - bo_bucket->addr,
> - bo_bucket->offset,
> - bo_bucket->gpu_id,
> - bo_bucket->alloc_flags,
> - bo_priv->idr_handle);
> - bo_index++;
> - }
> - }
> -
> - ret = copy_to_user(user_bos, bo_buckets, num_bos * sizeof(*bo_buckets));
> - if (ret) {
> - pr_err("Failed to copy BO information to user\n");
> - ret = -EFAULT;
> - goto exit;
> - }
> -
> - ret = copy_to_user(user_priv_data + *priv_offset, bo_privs, num_bos * sizeof(*bo_privs));
> - if (ret) {
> - pr_err("Failed to copy BO priv information to user\n");
> - ret = -EFAULT;
> - goto exit;
> - }
> -
> - *priv_offset += num_bos * sizeof(*bo_privs);
> -
> -exit:
> - while (ret && bo_index--) {
> - if (bo_buckets[bo_index].alloc_flags
> - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
> - close_fd(bo_buckets[bo_index].dmabuf_fd);
> - }
> -
> - kvfree(bo_buckets);
> - kvfree(bo_privs);
> - return ret;
> -}
> -
> -static int criu_get_process_object_info(struct kfd_process *p,
> - uint32_t *num_devices,
> - uint32_t *num_bos,
> - uint32_t *num_objects,
> - uint64_t *objs_priv_size)
> -{
> - uint64_t queues_priv_data_size, svm_priv_data_size, priv_size;
> - uint32_t num_queues, num_events, num_svm_ranges;
> - int ret;
> -
> - *num_devices = p->n_pdds;
> - *num_bos = get_process_num_bos(p);
> -
> - ret = kfd_process_get_queue_info(p, &num_queues, &queues_priv_data_size);
> - if (ret)
> - return ret;
> -
> - num_events = kfd_get_num_events(p);
> -
> - ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size);
> - if (ret)
> - return ret;
> -
> - *num_objects = num_queues + num_events + num_svm_ranges;
> -
> - if (objs_priv_size) {
> - priv_size = sizeof(struct kfd_criu_process_priv_data);
> - priv_size += *num_devices * sizeof(struct kfd_criu_device_priv_data);
> - priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
> - priv_size += queues_priv_data_size;
> - priv_size += num_events * sizeof(struct kfd_criu_event_priv_data);
> - priv_size += svm_priv_data_size;
> - *objs_priv_size = priv_size;
> - }
> - return 0;
> -}
> -
> -static int criu_checkpoint(struct file *filep,
> - struct kfd_process *p,
> - struct kfd_ioctl_criu_args *args)
> -{
> - int ret;
> - uint32_t num_devices, num_bos, num_objects;
> - uint64_t priv_size, priv_offset = 0, bo_priv_offset;
> -
> - if (!args->devices || !args->bos || !args->priv_data)
> - return -EINVAL;
> -
> - mutex_lock(&p->mutex);
> -
> - if (!p->n_pdds) {
> - pr_err("No pdd for given process\n");
> - ret = -ENODEV;
> - goto exit_unlock;
> - }
> -
> - /* Confirm all process queues are evicted */
> - if (!p->queues_paused) {
> - pr_err("Cannot dump process when queues are not in evicted state\n");
> - /* CRIU plugin did not call op PROCESS_INFO before checkpointing */
> - ret = -EINVAL;
> - goto exit_unlock;
> - }
> -
> - ret = criu_get_process_object_info(p, &num_devices, &num_bos, &num_objects, &priv_size);
> - if (ret)
> - goto exit_unlock;
> -
> - if (num_devices != args->num_devices ||
> - num_bos != args->num_bos ||
> - num_objects != args->num_objects ||
> - priv_size != args->priv_data_size) {
> -
> - ret = -EINVAL;
> - goto exit_unlock;
> - }
> -
> - /* each function will store private data inside priv_data and adjust priv_offset */
> - ret = criu_checkpoint_process(p, (uint8_t __user *)args->priv_data, &priv_offset);
> - if (ret)
> - goto exit_unlock;
> -
> - ret = criu_checkpoint_devices(p, num_devices, (uint8_t __user *)args->devices,
> - (uint8_t __user *)args->priv_data, &priv_offset);
> - if (ret)
> - goto exit_unlock;
> -
> - /* Leave room for BOs in the private data. They need to be restored
> - * before events, but we checkpoint them last to simplify the error
> - * handling.
> - */
> - bo_priv_offset = priv_offset;
> - priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data);
> -
> - if (num_objects) {
> - ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
> - &priv_offset);
> - if (ret)
> - goto exit_unlock;
> -
> - ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data,
> - &priv_offset);
> - if (ret)
> - goto exit_unlock;
> -
> - ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset);
> - if (ret)
> - goto exit_unlock;
> - }
> -
> - /* This must be the last thing in this function that can fail.
> - * Otherwise we leak dmabuf file descriptors.
> - */
> - ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
> - (uint8_t __user *)args->priv_data, &bo_priv_offset);
> -
> -exit_unlock:
> - mutex_unlock(&p->mutex);
> - if (ret)
> - pr_err("Failed to dump CRIU ret:%d\n", ret);
> - else
> - pr_debug("CRIU dump ret:%d\n", ret);
> -
> - return ret;
> -}
> -
> -static int criu_restore_process(struct kfd_process *p,
> - struct kfd_ioctl_criu_args *args,
> - uint64_t *priv_offset,
> - uint64_t max_priv_data_size)
> -{
> - int ret = 0;
> - struct kfd_criu_process_priv_data process_priv;
> -
> - if (*priv_offset + sizeof(process_priv) > max_priv_data_size)
> - return -EINVAL;
> -
> - ret = copy_from_user(&process_priv,
> - (void __user *)(args->priv_data + *priv_offset),
> - sizeof(process_priv));
> - if (ret) {
> - pr_err("Failed to copy process private information from user\n");
> - ret = -EFAULT;
> - goto exit;
> - }
> - *priv_offset += sizeof(process_priv);
> -
> - if (process_priv.version != KFD_CRIU_PRIV_VERSION) {
> - pr_err("Invalid CRIU API version (checkpointed:%d current:%d)\n",
> - process_priv.version, KFD_CRIU_PRIV_VERSION);
> - return -EINVAL;
> - }
> -
> - pr_debug("Setting XNACK mode\n");
> - if (process_priv.xnack_mode && !kfd_process_xnack_mode(p, true)) {
> - pr_err("xnack mode cannot be set\n");
> - ret = -EPERM;
> - goto exit;
> - } else {
> - pr_debug("set xnack mode: %d\n", process_priv.xnack_mode);
> - p->xnack_enabled = process_priv.xnack_mode;
> - }
> -
> -exit:
> - return ret;
> -}
> -
> -static int criu_restore_devices(struct kfd_process *p,
> - struct kfd_ioctl_criu_args *args,
> - uint64_t *priv_offset,
> - uint64_t max_priv_data_size)
> -{
> - struct kfd_criu_device_bucket *device_buckets;
> - struct kfd_criu_device_priv_data *device_privs;
> - int ret = 0;
> - uint32_t i;
> -
> - if (args->num_devices != p->n_pdds)
> - return -EINVAL;
> -
> - if (*priv_offset + (args->num_devices * sizeof(*device_privs)) > max_priv_data_size)
> - return -EINVAL;
> -
> - device_buckets = kmalloc_array(args->num_devices, sizeof(*device_buckets), GFP_KERNEL);
> - if (!device_buckets)
> - return -ENOMEM;
> -
> - ret = copy_from_user(device_buckets, (void __user *)args->devices,
> - args->num_devices * sizeof(*device_buckets));
> - if (ret) {
> - pr_err("Failed to copy devices buckets from user\n");
> - ret = -EFAULT;
> - goto exit;
> - }
> -
> - for (i = 0; i < args->num_devices; i++) {
> - struct kfd_node *dev;
> - struct kfd_process_device *pdd;
> - struct file *drm_file;
> -
> - /* device private data is not currently used */
> -
> - if (!device_buckets[i].user_gpu_id) {
> - pr_err("Invalid user gpu_id\n");
> - ret = -EINVAL;
> - goto exit;
> - }
> -
> - dev = kfd_device_by_id(device_buckets[i].actual_gpu_id);
> - if (!dev) {
> - pr_err("Failed to find device with gpu_id = %x\n",
> - device_buckets[i].actual_gpu_id);
> - ret = -EINVAL;
> - goto exit;
> - }
> -
> - pdd = kfd_get_process_device_data(dev, p);
> - if (!pdd) {
> - pr_err("Failed to get pdd for gpu_id = %x\n",
> - device_buckets[i].actual_gpu_id);
> - ret = -EINVAL;
> - goto exit;
> - }
> - pdd->user_gpu_id = device_buckets[i].user_gpu_id;
> -
> - drm_file = fget(device_buckets[i].drm_fd);
> - if (!drm_file) {
> - pr_err("Invalid render node file descriptor sent from plugin (%d)\n",
> - device_buckets[i].drm_fd);
> - ret = -EINVAL;
> - goto exit;
> - }
> -
> - if (pdd->drm_file) {
> - ret = -EINVAL;
> - goto exit;
> - }
> -
> - /* create the vm using render nodes for kfd pdd */
> - if (kfd_process_device_init_vm(pdd, drm_file)) {
> - pr_err("could not init vm for given pdd\n");
> - /* On success, the PDD keeps the drm_file reference */
> - fput(drm_file);
> - ret = -EINVAL;
> - goto exit;
> - }
> - /*
> - * pdd now already has the vm bound to render node so below api won't create a new
> - * exclusive kfd mapping but use existing one with renderDXXX but is still needed
> - * for iommu v2 binding and runtime pm.
> - */
> - pdd = kfd_bind_process_to_device(dev, p);
> - if (IS_ERR(pdd)) {
> - ret = PTR_ERR(pdd);
> - goto exit;
> - }
> -
> - if (!pdd->qpd.proc_doorbells) {
> - ret = kfd_alloc_process_doorbells(dev->kfd, pdd);
> - if (ret)
> - goto exit;
> - }
> - }
> -
> - /*
> - * We are not copying device private data from user as we are not using the data for now,
> - * but we still adjust for its private data.
> - */
> - *priv_offset += args->num_devices * sizeof(*device_privs);
> -
> -exit:
> - kfree(device_buckets);
> - return ret;
> -}
> -
> -static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd,
> - struct kfd_criu_bo_bucket *bo_bucket,
> - struct kfd_criu_bo_priv_data *bo_priv,
> - struct kgd_mem **kgd_mem)
> -{
> - int idr_handle;
> - int ret;
> - const bool criu_resume = true;
> - u64 offset;
> -
> - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
> - if (bo_bucket->size !=
> - kfd_doorbell_process_slice(pdd->dev->kfd))
> - return -EINVAL;
> -
> - offset = kfd_get_process_doorbells(pdd);
> - if (!offset)
> - return -ENOMEM;
> - } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> - /* MMIO BOs need remapped bus address */
> - if (bo_bucket->size != PAGE_SIZE) {
> - pr_err("Invalid page size\n");
> - return -EINVAL;
> - }
> - offset = pdd->dev->adev->rmmio_remap.bus_addr;
> - if (!offset) {
> - pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n");
> - return -ENOMEM;
> - }
> - } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> - offset = bo_priv->user_addr;
> - }
> - /* Create the BO */
> - ret = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(pdd->dev->adev, bo_bucket->addr,
> - bo_bucket->size, pdd->drm_priv, kgd_mem,
> - &offset, bo_bucket->alloc_flags, criu_resume);
> - if (ret) {
> - pr_err("Could not create the BO\n");
> - return ret;
> - }
> - pr_debug("New BO created: size:0x%llx addr:0x%llx offset:0x%llx\n",
> - bo_bucket->size, bo_bucket->addr, offset);
> -
> - /* Restore previous IDR handle */
> - pr_debug("Restoring old IDR handle for the BO");
> - idr_handle = idr_alloc(&pdd->alloc_idr, *kgd_mem, bo_priv->idr_handle,
> - bo_priv->idr_handle + 1, GFP_KERNEL);
> -
> - if (idr_handle < 0) {
> - pr_err("Could not allocate idr\n");
> - amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, *kgd_mem, pdd->drm_priv,
> - NULL);
> - return -ENOMEM;
> - }
> -
> - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
> - bo_bucket->restored_offset = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(pdd->dev->id);
> - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> - bo_bucket->restored_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(pdd->dev->id);
> - } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
> - bo_bucket->restored_offset = offset;
> - } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
> - bo_bucket->restored_offset = offset;
> - /* Update the VRAM usage count */
> - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
> - }
> - return 0;
> -}
> -
> -static int criu_restore_bo(struct kfd_process *p,
> - struct kfd_criu_bo_bucket *bo_bucket,
> - struct kfd_criu_bo_priv_data *bo_priv)
> -{
> - struct kfd_process_device *pdd;
> - struct kgd_mem *kgd_mem;
> - int ret;
> - int j;
> -
> - pr_debug("Restoring BO size:0x%llx addr:0x%llx gpu_id:0x%x flags:0x%x idr_handle:0x%x\n",
> - bo_bucket->size, bo_bucket->addr, bo_bucket->gpu_id, bo_bucket->alloc_flags,
> - bo_priv->idr_handle);
> -
> - pdd = kfd_process_device_data_by_id(p, bo_bucket->gpu_id);
> - if (!pdd) {
> - pr_err("Failed to get pdd\n");
> - return -ENODEV;
> - }
> -
> - ret = criu_restore_memory_of_gpu(pdd, bo_bucket, bo_priv, &kgd_mem);
> - if (ret)
> - return ret;
> -
> - /* now map these BOs to GPU/s */
> - for (j = 0; j < p->n_pdds; j++) {
> - struct kfd_node *peer;
> - struct kfd_process_device *peer_pdd;
> -
> - if (!bo_priv->mapped_gpuids[j])
> - break;
> -
> - peer_pdd = kfd_process_device_data_by_id(p, bo_priv->mapped_gpuids[j]);
> - if (!peer_pdd)
> - return -EINVAL;
> -
> - peer = peer_pdd->dev;
> -
> - peer_pdd = kfd_bind_process_to_device(peer, p);
> - if (IS_ERR(peer_pdd))
> - return PTR_ERR(peer_pdd);
> -
> - ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev, kgd_mem,
> - peer_pdd->drm_priv);
> - if (ret) {
> - pr_err("Failed to map to gpu %d/%d\n", j, p->n_pdds);
> - return ret;
> - }
> - }
> -
> - pr_debug("map memory was successful for the BO\n");
> - /* create the dmabuf object and export the bo */
> - if (bo_bucket->alloc_flags
> - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
> - ret = criu_get_prime_handle(kgd_mem, DRM_RDWR,
> - &bo_bucket->dmabuf_fd);
> - if (ret)
> - return ret;
> - } else {
> - bo_bucket->dmabuf_fd = KFD_INVALID_FD;
> - }
> -
> - return 0;
> -}
> -
> -static int criu_restore_bos(struct kfd_process *p,
> - struct kfd_ioctl_criu_args *args,
> - uint64_t *priv_offset,
> - uint64_t max_priv_data_size)
> -{
> - struct kfd_criu_bo_bucket *bo_buckets = NULL;
> - struct kfd_criu_bo_priv_data *bo_privs = NULL;
> - int ret = 0;
> - uint32_t i = 0;
> -
> - if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size)
> - return -EINVAL;
> -
> - /* Prevent MMU notifications until stage-4 IOCTL (CRIU_RESUME) is received */
> - amdgpu_amdkfd_block_mmu_notifications(p->kgd_process_info);
> -
> - bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL);
> - if (!bo_buckets)
> - return -ENOMEM;
> -
> - ret = copy_from_user(bo_buckets, (void __user *)args->bos,
> - args->num_bos * sizeof(*bo_buckets));
> - if (ret) {
> - pr_err("Failed to copy BOs information from user\n");
> - ret = -EFAULT;
> - goto exit;
> - }
> -
> - bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL);
> - if (!bo_privs) {
> - ret = -ENOMEM;
> - goto exit;
> - }
> -
> - ret = copy_from_user(bo_privs, (void __user *)args->priv_data + *priv_offset,
> - args->num_bos * sizeof(*bo_privs));
> - if (ret) {
> - pr_err("Failed to copy BOs information from user\n");
> - ret = -EFAULT;
> - goto exit;
> - }
> - *priv_offset += args->num_bos * sizeof(*bo_privs);
> -
> - /* Create and map new BOs */
> - for (; i < args->num_bos; i++) {
> - ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i]);
> - if (ret) {
> - pr_debug("Failed to restore BO[%d] ret%d\n", i, ret);
> - goto exit;
> - }
> - } /* done */
> -
> - /* Copy only the buckets back so user can read bo_buckets[N].restored_offset */
> - ret = copy_to_user((void __user *)args->bos,
> - bo_buckets,
> - (args->num_bos * sizeof(*bo_buckets)));
> - if (ret)
> - ret = -EFAULT;
> -
> -exit:
> - while (ret && i--) {
> - if (bo_buckets[i].alloc_flags
> - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
> - close_fd(bo_buckets[i].dmabuf_fd);
> - }
> - kvfree(bo_buckets);
> - kvfree(bo_privs);
> - return ret;
> -}
> -
> -static int criu_restore_objects(struct file *filep,
> - struct kfd_process *p,
> - struct kfd_ioctl_criu_args *args,
> - uint64_t *priv_offset,
> - uint64_t max_priv_data_size)
> -{
> - int ret = 0;
> - uint32_t i;
> -
> - BUILD_BUG_ON(offsetof(struct kfd_criu_queue_priv_data, object_type));
> - BUILD_BUG_ON(offsetof(struct kfd_criu_event_priv_data, object_type));
> - BUILD_BUG_ON(offsetof(struct kfd_criu_svm_range_priv_data, object_type));
> -
> - for (i = 0; i < args->num_objects; i++) {
> - uint32_t object_type;
> -
> - if (*priv_offset + sizeof(object_type) > max_priv_data_size) {
> - pr_err("Invalid private data size\n");
> - return -EINVAL;
> - }
> -
> - ret = get_user(object_type, (uint32_t __user *)(args->priv_data + *priv_offset));
> - if (ret) {
> - pr_err("Failed to copy private information from user\n");
> - goto exit;
> - }
> -
> - switch (object_type) {
> - case KFD_CRIU_OBJECT_TYPE_QUEUE:
> - ret = kfd_criu_restore_queue(p, (uint8_t __user *)args->priv_data,
> - priv_offset, max_priv_data_size);
> - if (ret)
> - goto exit;
> - break;
> - case KFD_CRIU_OBJECT_TYPE_EVENT:
> - ret = kfd_criu_restore_event(filep, p, (uint8_t __user *)args->priv_data,
> - priv_offset, max_priv_data_size);
> - if (ret)
> - goto exit;
> - break;
> - case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
> - ret = kfd_criu_restore_svm(p, (uint8_t __user *)args->priv_data,
> - priv_offset, max_priv_data_size);
> - if (ret)
> - goto exit;
> - break;
> - default:
> - pr_err("Invalid object type:%u at index:%d\n", object_type, i);
> - ret = -EINVAL;
> - goto exit;
> - }
> - }
> -exit:
> - return ret;
> -}
> -
> -static int criu_restore(struct file *filep,
> - struct kfd_process *p,
> - struct kfd_ioctl_criu_args *args)
> -{
> - uint64_t priv_offset = 0;
> - int ret = 0;
> -
> - pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n",
> - args->num_devices, args->num_bos, args->num_objects, args->priv_data_size);
> -
> - if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size ||
> - !args->num_devices || !args->num_bos)
> - return -EINVAL;
> -
> - mutex_lock(&p->mutex);
> -
> - /*
> - * Set the process to evicted state to avoid running any new queues before all the memory
> - * mappings are ready.
> - */
> - ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
> - if (ret)
> - goto exit_unlock;
> -
> - /* Each function will adjust priv_offset based on how many bytes they consumed */
> - ret = criu_restore_process(p, args, &priv_offset, args->priv_data_size);
> - if (ret)
> - goto exit_unlock;
> -
> - ret = criu_restore_devices(p, args, &priv_offset, args->priv_data_size);
> - if (ret)
> - goto exit_unlock;
> -
> - ret = criu_restore_bos(p, args, &priv_offset, args->priv_data_size);
> - if (ret)
> - goto exit_unlock;
> -
> - ret = criu_restore_objects(filep, p, args, &priv_offset, args->priv_data_size);
> - if (ret)
> - goto exit_unlock;
> -
> - if (priv_offset != args->priv_data_size) {
> - pr_err("Invalid private data size\n");
> - ret = -EINVAL;
> - }
> -
> -exit_unlock:
> - mutex_unlock(&p->mutex);
> - if (ret)
> - pr_err("Failed to restore CRIU ret:%d\n", ret);
> - else
> - pr_debug("CRIU restore successful\n");
> -
> - return ret;
> -}
> -
> -static int criu_unpause(struct file *filep,
> - struct kfd_process *p,
> - struct kfd_ioctl_criu_args *args)
> -{
> - int ret;
> -
> - mutex_lock(&p->mutex);
> -
> - if (!p->queues_paused) {
> - mutex_unlock(&p->mutex);
> - return -EINVAL;
> - }
> -
> - ret = kfd_process_restore_queues(p);
> - if (ret)
> - pr_err("Failed to unpause queues ret:%d\n", ret);
> - else
> - p->queues_paused = false;
> -
> - mutex_unlock(&p->mutex);
> -
> - return ret;
> -}
> -
> -static int criu_resume(struct file *filep,
> - struct kfd_process *p,
> - struct kfd_ioctl_criu_args *args)
> -{
> - struct kfd_process *target = NULL;
> - struct pid *pid = NULL;
> - int ret = 0;
> -
> - pr_debug("Inside %s, target pid for criu restore: %d\n", __func__,
> - args->pid);
> -
> - pid = find_get_pid(args->pid);
> - if (!pid) {
> - pr_err("Cannot find pid info for %i\n", args->pid);
> - return -ESRCH;
> - }
> -
> - pr_debug("calling kfd_lookup_process_by_pid\n");
> - target = kfd_lookup_process_by_pid(pid);
> -
> - put_pid(pid);
> -
> - if (!target) {
> - pr_debug("Cannot find process info for %i\n", args->pid);
> - return -ESRCH;
> - }
> -
> - mutex_lock(&target->mutex);
> - ret = kfd_criu_resume_svm(target);
> - if (ret) {
> - pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
> - goto exit;
> - }
> -
> - ret = amdgpu_amdkfd_criu_resume(target->kgd_process_info);
> - if (ret)
> - pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
> -
> -exit:
> - mutex_unlock(&target->mutex);
> -
> - kfd_unref_process(target);
> - return ret;
> -}
> -
> -static int criu_process_info(struct file *filep,
> - struct kfd_process *p,
> - struct kfd_ioctl_criu_args *args)
> -{
> - int ret = 0;
> -
> - mutex_lock(&p->mutex);
> -
> - if (!p->n_pdds) {
> - pr_err("No pdd for given process\n");
> - ret = -ENODEV;
> - goto err_unlock;
> - }
> -
> - ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
> - if (ret)
> - goto err_unlock;
> -
> - p->queues_paused = true;
> -
> - args->pid = task_pid_nr_ns(p->lead_thread,
> - task_active_pid_ns(p->lead_thread));
> -
> - ret = criu_get_process_object_info(p, &args->num_devices, &args->num_bos,
> - &args->num_objects, &args->priv_data_size);
> - if (ret)
> - goto err_unlock;
> -
> - dev_dbg(kfd_device, "Num of devices:%u bos:%u objects:%u priv_data_size:%lld\n",
> - args->num_devices, args->num_bos, args->num_objects,
> - args->priv_data_size);
> -
> -err_unlock:
> - if (ret) {
> - kfd_process_restore_queues(p);
> - p->queues_paused = false;
> - }
> - mutex_unlock(&p->mutex);
> - return ret;
> -}
> -
> static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
> {
> struct kfd_ioctl_criu_args *args = data;
> @@ -2720,19 +1760,19 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
> dev_dbg(kfd_device, "CRIU operation: %d\n", args->op);
> switch (args->op) {
> case KFD_CRIU_OP_PROCESS_INFO:
> - ret = criu_process_info(filep, p, args);
> + ret = kfd_criu_process_info(filep, p, args);
> break;
> case KFD_CRIU_OP_CHECKPOINT:
> - ret = criu_checkpoint(filep, p, args);
> + ret = kfd_criu_checkpoint(filep, p, args);
> break;
> case KFD_CRIU_OP_UNPAUSE:
> - ret = criu_unpause(filep, p, args);
> + ret = kfd_criu_unpause(filep, p, args);
> break;
> case KFD_CRIU_OP_RESTORE:
> - ret = criu_restore(filep, p, args);
> + ret = kfd_criu_restore(filep, p, args);
> break;
> case KFD_CRIU_OP_RESUME:
> - ret = criu_resume(filep, p, args);
> + ret = kfd_criu_resume(filep, p, args);
> break;
> default:
> dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", args->op);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_criu.c b/drivers/gpu/drm/amd/amdkfd/kfd_criu.c
> new file mode 100644
> index 000000000000..72a9b358a642
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_criu.c
> @@ -0,0 +1,989 @@
> +// SPDX-License-Identifier: GPL-2.0 OR MIT
> +/*
> + * Copyright 2024 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/dma-buf.h>
> +#include <linux/fdtable.h>
> +
> +#include "kfd_criu.h"
> +#include "kfd_svm.h"
> +
> +static int criu_checkpoint_process(struct kfd_process *p,
> + uint8_t __user *user_priv_data,
> + uint64_t *priv_offset)
> +{
> + struct kfd_criu_process_priv_data process_priv;
> + int ret;
> +
> + memset(&process_priv, 0, sizeof(process_priv));
> +
> + process_priv.version = KFD_CRIU_PRIV_VERSION;
> + /* For CR, we don't consider negative xnack mode which is used for
> + * querying without changing it, here 0 simply means disabled and 1
> + * means enabled so retry for finding a valid PTE.
> + */
> + process_priv.xnack_mode = p->xnack_enabled ? 1 : 0;
> +
> + ret = copy_to_user(user_priv_data + *priv_offset,
> + &process_priv, sizeof(process_priv));
> +
> + if (ret) {
> + pr_err("Failed to copy process information to user\n");
> + ret = -EFAULT;
> + }
> +
> + *priv_offset += sizeof(process_priv);
> + return ret;
> +}
> +
> +static int criu_checkpoint_devices(struct kfd_process *p,
> + uint32_t num_devices,
> + uint8_t __user *user_addr,
> + uint8_t __user *user_priv_data,
> + uint64_t *priv_offset)
> +{
> + struct kfd_criu_device_priv_data *device_priv = NULL;
> + struct kfd_criu_device_bucket *device_buckets = NULL;
> + int ret = 0, i;
> +
> + device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), GFP_KERNEL);
> + if (!device_buckets) {
> + ret = -ENOMEM;
> + goto exit;
> + }
> +
> + device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL);
> + if (!device_priv) {
> + ret = -ENOMEM;
> + goto exit;
> + }
> +
> + for (i = 0; i < num_devices; i++) {
> + struct kfd_process_device *pdd = p->pdds[i];
> +
> + device_buckets[i].user_gpu_id = pdd->user_gpu_id;
> + device_buckets[i].actual_gpu_id = pdd->dev->id;
> +
> + /*
> + * priv_data does not contain useful information for now and is reserved for
> + * future use, so we do not set its contents.
> + */
> + }
> +
> + ret = copy_to_user(user_addr, device_buckets, num_devices * sizeof(*device_buckets));
> + if (ret) {
> + pr_err("Failed to copy device information to user\n");
> + ret = -EFAULT;
> + goto exit;
> + }
> +
> + ret = copy_to_user(user_priv_data + *priv_offset,
> + device_priv,
> + num_devices * sizeof(*device_priv));
> + if (ret) {
> + pr_err("Failed to copy device information to user\n");
> + ret = -EFAULT;
> + }
> + *priv_offset += num_devices * sizeof(*device_priv);
> +
> +exit:
> + kvfree(device_buckets);
> + kvfree(device_priv);
> + return ret;
> +}
> +
> +static uint32_t get_process_num_bos(struct kfd_process *p)
> +{
> + uint32_t num_of_bos = 0;
> + int i;
> +
> + /* Run over all PDDs of the process */
> + for (i = 0; i < p->n_pdds; i++) {
> + struct kfd_process_device *pdd = p->pdds[i];
> + void *mem;
> + int id;
> +
> + idr_for_each_entry(&pdd->alloc_idr, mem, id) {
> + struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
> +
> + if (!kgd_mem->va || kgd_mem->va > pdd->gpuvm_base)
> + num_of_bos++;
> + }
> + }
> + return num_of_bos;
> +}
> +
> +static int criu_get_prime_handle(struct kgd_mem *mem,
> + int flags, u32 *shared_fd)
> +{
> + struct dma_buf *dmabuf;
> + int ret;
> +
> + ret = amdgpu_amdkfd_gpuvm_export_dmabuf(mem, &dmabuf);
> + if (ret) {
> + pr_err("dmabuf export failed for the BO\n");
> + return ret;
> + }
> +
> + ret = dma_buf_fd(dmabuf, flags);
> + if (ret < 0) {
> + pr_err("dmabuf create fd failed, ret:%d\n", ret);
> + goto out_free_dmabuf;
> + }
> +
> + *shared_fd = ret;
> + return 0;
> +
> +out_free_dmabuf:
> + dma_buf_put(dmabuf);
> + return ret;
> +}
> +
> +static int criu_checkpoint_bos(struct kfd_process *p,
> + uint32_t num_bos,
> + uint8_t __user *user_bos,
> + uint8_t __user *user_priv_data,
> + uint64_t *priv_offset)
> +{
> + struct kfd_criu_bo_bucket *bo_buckets;
> + struct kfd_criu_bo_priv_data *bo_privs;
> + int ret = 0, pdd_index, bo_index = 0, id;
> + void *mem;
> +
> + bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL);
> + if (!bo_buckets)
> + return -ENOMEM;
> +
> + bo_privs = kvzalloc(num_bos * sizeof(*bo_privs), GFP_KERNEL);
> + if (!bo_privs) {
> + ret = -ENOMEM;
> + goto exit;
> + }
> +
> + for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) {
> + struct kfd_process_device *pdd = p->pdds[pdd_index];
> + struct amdgpu_bo *dumper_bo;
> + struct kgd_mem *kgd_mem;
> +
> + idr_for_each_entry(&pdd->alloc_idr, mem, id) {
> + struct kfd_criu_bo_bucket *bo_bucket;
> + struct kfd_criu_bo_priv_data *bo_priv;
> + int i, dev_idx = 0;
> +
> + if (!mem) {
> + ret = -ENOMEM;
> + goto exit;
> + }
> +
> + kgd_mem = (struct kgd_mem *)mem;
> + dumper_bo = kgd_mem->bo;
> +
> + /* Skip checkpointing BOs that are used for Trap handler
> + * code and state. Currently, these BOs have a VA that
> + * is less GPUVM Base
> + */
> + if (kgd_mem->va && kgd_mem->va <= pdd->gpuvm_base)
> + continue;
> +
> + bo_bucket = &bo_buckets[bo_index];
> + bo_priv = &bo_privs[bo_index];
> +
> + bo_bucket->gpu_id = pdd->user_gpu_id;
> + bo_bucket->addr = (uint64_t)kgd_mem->va;
> + bo_bucket->size = amdgpu_bo_size(dumper_bo);
> + bo_bucket->alloc_flags = (uint32_t)kgd_mem->alloc_flags;
> + bo_priv->idr_handle = id;
> +
> + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> + ret = amdgpu_ttm_tt_get_userptr(&dumper_bo->tbo,
> + &bo_priv->user_addr);
> + if (ret) {
> + pr_err("Failed to obtain user address for user-pointer bo\n");
> + goto exit;
> + }
> + }
> + if (bo_bucket->alloc_flags
> + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
> + ret = criu_get_prime_handle(kgd_mem,
> + bo_bucket->alloc_flags &
> + KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0,
> + &bo_bucket->dmabuf_fd);
> + if (ret)
> + goto exit;
> + } else {
> + bo_bucket->dmabuf_fd = KFD_INVALID_FD;
> + }
> +
> + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
> + bo_bucket->offset = KFD_MMAP_TYPE_DOORBELL |
> + KFD_MMAP_GPU_ID(pdd->dev->id);
> + else if (bo_bucket->alloc_flags &
> + KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)
> + bo_bucket->offset = KFD_MMAP_TYPE_MMIO |
> + KFD_MMAP_GPU_ID(pdd->dev->id);
> + else
> + bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo);
> +
> + for (i = 0; i < p->n_pdds; i++) {
> + if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem))
> + bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id;
> + }
> +
> + pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
> + "gpu_id = 0x%x alloc_flags = 0x%x idr_handle = 0x%x",
> + bo_bucket->size,
> + bo_bucket->addr,
> + bo_bucket->offset,
> + bo_bucket->gpu_id,
> + bo_bucket->alloc_flags,
> + bo_priv->idr_handle);
> + bo_index++;
> + }
> + }
> +
> + ret = copy_to_user(user_bos, bo_buckets, num_bos * sizeof(*bo_buckets));
> + if (ret) {
> + pr_err("Failed to copy BO information to user\n");
> + ret = -EFAULT;
> + goto exit;
> + }
> +
> + ret = copy_to_user(user_priv_data + *priv_offset, bo_privs, num_bos * sizeof(*bo_privs));
> + if (ret) {
> + pr_err("Failed to copy BO priv information to user\n");
> + ret = -EFAULT;
> + goto exit;
> + }
> +
> + *priv_offset += num_bos * sizeof(*bo_privs);
> +
> +exit:
> + while (ret && bo_index--) {
> + if (bo_buckets[bo_index].alloc_flags
> + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
> + close_fd(bo_buckets[bo_index].dmabuf_fd);
> + }
> +
> + kvfree(bo_buckets);
> + kvfree(bo_privs);
> + return ret;
> +}
> +
> +static int criu_get_process_object_info(struct kfd_process *p,
> + uint32_t *num_devices,
> + uint32_t *num_bos,
> + uint32_t *num_objects,
> + uint64_t *objs_priv_size)
> +{
> + uint64_t queues_priv_data_size, svm_priv_data_size, priv_size;
> + uint32_t num_queues, num_events, num_svm_ranges;
> + int ret;
> +
> + *num_devices = p->n_pdds;
> + *num_bos = get_process_num_bos(p);
> +
> + ret = kfd_process_get_queue_info(p, &num_queues, &queues_priv_data_size);
> + if (ret)
> + return ret;
> +
> + num_events = kfd_get_num_events(p);
> +
> + ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size);
> + if (ret)
> + return ret;
> +
> + *num_objects = num_queues + num_events + num_svm_ranges;
> +
> + if (objs_priv_size) {
> + priv_size = sizeof(struct kfd_criu_process_priv_data);
> + priv_size += *num_devices * sizeof(struct kfd_criu_device_priv_data);
> + priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
> + priv_size += queues_priv_data_size;
> + priv_size += num_events * sizeof(struct kfd_criu_event_priv_data);
> + priv_size += svm_priv_data_size;
> + *objs_priv_size = priv_size;
> + }
> + return 0;
> +}
> +
> +int kfd_criu_checkpoint(struct file *filep,
> + struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args)
> +{
> + int ret;
> + uint32_t num_devices, num_bos, num_objects;
> + uint64_t priv_size, priv_offset = 0, bo_priv_offset;
> +
> + if (!args->devices || !args->bos || !args->priv_data)
> + return -EINVAL;
> +
> + mutex_lock(&p->mutex);
> +
> + if (!p->n_pdds) {
> + pr_err("No pdd for given process\n");
> + ret = -ENODEV;
> + goto exit_unlock;
> + }
> +
> + /* Confirm all process queues are evicted */
> + if (!p->queues_paused) {
> + pr_err("Cannot dump process when queues are not in evicted state\n");
> + /* CRIU plugin did not call op PROCESS_INFO before checkpointing */
> + ret = -EINVAL;
> + goto exit_unlock;
> + }
> +
> + ret = criu_get_process_object_info(p, &num_devices, &num_bos, &num_objects, &priv_size);
> + if (ret)
> + goto exit_unlock;
> +
> + if (num_devices != args->num_devices ||
> + num_bos != args->num_bos ||
> + num_objects != args->num_objects ||
> + priv_size != args->priv_data_size) {
> +
> + ret = -EINVAL;
> + goto exit_unlock;
> + }
> +
> + /* each function will store private data inside priv_data and adjust priv_offset */
> + ret = criu_checkpoint_process(p, (uint8_t __user *)args->priv_data, &priv_offset);
> + if (ret)
> + goto exit_unlock;
> +
> + ret = criu_checkpoint_devices(p, num_devices, (uint8_t __user *)args->devices,
> + (uint8_t __user *)args->priv_data, &priv_offset);
> + if (ret)
> + goto exit_unlock;
> +
> + /* Leave room for BOs in the private data. They need to be restored
> + * before events, but we checkpoint them last to simplify the error
> + * handling.
> + */
> + bo_priv_offset = priv_offset;
> + priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data);
> +
> + if (num_objects) {
> + ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
> + &priv_offset);
> + if (ret)
> + goto exit_unlock;
> +
> + ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data,
> + &priv_offset);
> + if (ret)
> + goto exit_unlock;
> +
> + ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset);
> + if (ret)
> + goto exit_unlock;
> + }
> +
> + /* This must be the last thing in this function that can fail.
> + * Otherwise we leak dmabuf file descriptors.
> + */
> + ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
> + (uint8_t __user *)args->priv_data, &bo_priv_offset);
> +
> +exit_unlock:
> + mutex_unlock(&p->mutex);
> + if (ret)
> + pr_err("Failed to dump CRIU ret:%d\n", ret);
> + else
> + pr_debug("CRIU dump ret:%d\n", ret);
> +
> + return ret;
> +}
> +
> +static int criu_restore_process(struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args,
> + uint64_t *priv_offset,
> + uint64_t max_priv_data_size)
> +{
> + int ret = 0;
> + struct kfd_criu_process_priv_data process_priv;
> +
> + if (*priv_offset + sizeof(process_priv) > max_priv_data_size)
> + return -EINVAL;
> +
> + ret = copy_from_user(&process_priv,
> + (void __user *)(args->priv_data + *priv_offset),
> + sizeof(process_priv));
> + if (ret) {
> + pr_err("Failed to copy process private information from user\n");
> + ret = -EFAULT;
> + goto exit;
> + }
> + *priv_offset += sizeof(process_priv);
> +
> + if (process_priv.version != KFD_CRIU_PRIV_VERSION) {
> + pr_err("Invalid CRIU API version (checkpointed:%d current:%d)\n",
> + process_priv.version, KFD_CRIU_PRIV_VERSION);
> + return -EINVAL;
> + }
> +
> + pr_debug("Setting XNACK mode\n");
> + if (process_priv.xnack_mode && !kfd_process_xnack_mode(p, true)) {
> + pr_err("xnack mode cannot be set\n");
> + ret = -EPERM;
> + goto exit;
> + } else {
> + pr_debug("set xnack mode: %d\n", process_priv.xnack_mode);
> + p->xnack_enabled = process_priv.xnack_mode;
> + }
> +
> +exit:
> + return ret;
> +}
> +
> +static int criu_restore_devices(struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args,
> + uint64_t *priv_offset,
> + uint64_t max_priv_data_size)
> +{
> + struct kfd_criu_device_bucket *device_buckets;
> + struct kfd_criu_device_priv_data *device_privs;
> + int ret = 0;
> + uint32_t i;
> +
> + if (args->num_devices != p->n_pdds)
> + return -EINVAL;
> +
> + if (*priv_offset + (args->num_devices * sizeof(*device_privs)) > max_priv_data_size)
> + return -EINVAL;
> +
> + device_buckets = kmalloc_array(args->num_devices, sizeof(*device_buckets), GFP_KERNEL);
> + if (!device_buckets)
> + return -ENOMEM;
> +
> + ret = copy_from_user(device_buckets, (void __user *)args->devices,
> + args->num_devices * sizeof(*device_buckets));
> + if (ret) {
> + pr_err("Failed to copy devices buckets from user\n");
> + ret = -EFAULT;
> + goto exit;
> + }
> +
> + for (i = 0; i < args->num_devices; i++) {
> + struct kfd_node *dev;
> + struct kfd_process_device *pdd;
> + struct file *drm_file;
> +
> + /* device private data is not currently used */
> +
> + if (!device_buckets[i].user_gpu_id) {
> + pr_err("Invalid user gpu_id\n");
> + ret = -EINVAL;
> + goto exit;
> + }
> +
> + dev = kfd_device_by_id(device_buckets[i].actual_gpu_id);
> + if (!dev) {
> + pr_err("Failed to find device with gpu_id = %x\n",
> + device_buckets[i].actual_gpu_id);
> + ret = -EINVAL;
> + goto exit;
> + }
> +
> + pdd = kfd_get_process_device_data(dev, p);
> + if (!pdd) {
> + pr_err("Failed to get pdd for gpu_id = %x\n",
> + device_buckets[i].actual_gpu_id);
> + ret = -EINVAL;
> + goto exit;
> + }
> + pdd->user_gpu_id = device_buckets[i].user_gpu_id;
> +
> + drm_file = fget(device_buckets[i].drm_fd);
> + if (!drm_file) {
> + pr_err("Invalid render node file descriptor sent from plugin (%d)\n",
> + device_buckets[i].drm_fd);
> + ret = -EINVAL;
> + goto exit;
> + }
> +
> + if (pdd->drm_file) {
> + ret = -EINVAL;
> + goto exit;
> + }
> +
> + /* create the vm using render nodes for kfd pdd */
> + if (kfd_process_device_init_vm(pdd, drm_file)) {
> + pr_err("could not init vm for given pdd\n");
> + /* On success, the PDD keeps the drm_file reference */
> + fput(drm_file);
> + ret = -EINVAL;
> + goto exit;
> + }
> + /*
> + * pdd now already has the vm bound to render node so below api won't create a new
> + * exclusive kfd mapping but use existing one with renderDXXX but is still needed
> + * for iommu v2 binding and runtime pm.
> + */
> + pdd = kfd_bind_process_to_device(dev, p);
> + if (IS_ERR(pdd)) {
> + ret = PTR_ERR(pdd);
> + goto exit;
> + }
> +
> + if (!pdd->qpd.proc_doorbells) {
> + ret = kfd_alloc_process_doorbells(dev->kfd, pdd);
> + if (ret)
> + goto exit;
> + }
> + }
> +
> + /*
> + * We are not copying device private data from user as we are not using the data for now,
> + * but we still adjust for its private data.
> + */
> + *priv_offset += args->num_devices * sizeof(*device_privs);
> +
> +exit:
> + kfree(device_buckets);
> + return ret;
> +}
> +
> +static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd,
> + struct kfd_criu_bo_bucket *bo_bucket,
> + struct kfd_criu_bo_priv_data *bo_priv,
> + struct kgd_mem **kgd_mem)
> +{
> + int idr_handle;
> + int ret;
> + const bool criu_resume = true;
> + u64 offset;
> +
> + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
> + if (bo_bucket->size !=
> + kfd_doorbell_process_slice(pdd->dev->kfd))
> + return -EINVAL;
> +
> + offset = kfd_get_process_doorbells(pdd);
> + if (!offset)
> + return -ENOMEM;
> + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> + /* MMIO BOs need remapped bus address */
> + if (bo_bucket->size != PAGE_SIZE) {
> + pr_err("Invalid page size\n");
> + return -EINVAL;
> + }
> + offset = pdd->dev->adev->rmmio_remap.bus_addr;
> + if (!offset) {
> + pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n");
> + return -ENOMEM;
> + }
> + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> + offset = bo_priv->user_addr;
> + }
> + /* Create the BO */
> + ret = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(pdd->dev->adev, bo_bucket->addr,
> + bo_bucket->size, pdd->drm_priv, kgd_mem,
> + &offset, bo_bucket->alloc_flags, criu_resume);
> + if (ret) {
> + pr_err("Could not create the BO\n");
> + return ret;
> + }
> + pr_debug("New BO created: size:0x%llx addr:0x%llx offset:0x%llx\n",
> + bo_bucket->size, bo_bucket->addr, offset);
> +
> + /* Restore previous IDR handle */
> + pr_debug("Restoring old IDR handle for the BO");
> + idr_handle = idr_alloc(&pdd->alloc_idr, *kgd_mem, bo_priv->idr_handle,
> + bo_priv->idr_handle + 1, GFP_KERNEL);
> +
> + if (idr_handle < 0) {
> + pr_err("Could not allocate idr\n");
> + amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, *kgd_mem, pdd->drm_priv,
> + NULL);
> + return -ENOMEM;
> + }
> +
> + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
> + bo_bucket->restored_offset = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(pdd->dev->id);
> + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> + bo_bucket->restored_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(pdd->dev->id);
> + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
> + bo_bucket->restored_offset = offset;
> + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
> + bo_bucket->restored_offset = offset;
> + /* Update the VRAM usage count */
> + WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
> + }
> + return 0;
> +}
> +
> +static int criu_restore_bo(struct kfd_process *p,
> + struct kfd_criu_bo_bucket *bo_bucket,
> + struct kfd_criu_bo_priv_data *bo_priv)
> +{
> + struct kfd_process_device *pdd;
> + struct kgd_mem *kgd_mem;
> + int ret;
> + int j;
> +
> + pr_debug("Restoring BO size:0x%llx addr:0x%llx gpu_id:0x%x flags:0x%x idr_handle:0x%x\n",
> + bo_bucket->size, bo_bucket->addr, bo_bucket->gpu_id, bo_bucket->alloc_flags,
> + bo_priv->idr_handle);
> +
> + pdd = kfd_process_device_data_by_id(p, bo_bucket->gpu_id);
> + if (!pdd) {
> + pr_err("Failed to get pdd\n");
> + return -ENODEV;
> + }
> +
> + ret = criu_restore_memory_of_gpu(pdd, bo_bucket, bo_priv, &kgd_mem);
> + if (ret)
> + return ret;
> +
> + /* now map these BOs to GPU/s */
> + for (j = 0; j < p->n_pdds; j++) {
> + struct kfd_node *peer;
> + struct kfd_process_device *peer_pdd;
> +
> + if (!bo_priv->mapped_gpuids[j])
> + break;
> +
> + peer_pdd = kfd_process_device_data_by_id(p, bo_priv->mapped_gpuids[j]);
> + if (!peer_pdd)
> + return -EINVAL;
> +
> + peer = peer_pdd->dev;
> +
> + peer_pdd = kfd_bind_process_to_device(peer, p);
> + if (IS_ERR(peer_pdd))
> + return PTR_ERR(peer_pdd);
> +
> + ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev, kgd_mem,
> + peer_pdd->drm_priv);
> + if (ret) {
> + pr_err("Failed to map to gpu %d/%d\n", j, p->n_pdds);
> + return ret;
> + }
> + }
> +
> + pr_debug("map memory was successful for the BO\n");
> + /* create the dmabuf object and export the bo */
> + if (bo_bucket->alloc_flags
> + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
> + ret = criu_get_prime_handle(kgd_mem, DRM_RDWR,
> + &bo_bucket->dmabuf_fd);
> + if (ret)
> + return ret;
> + } else {
> + bo_bucket->dmabuf_fd = KFD_INVALID_FD;
> + }
> +
> + return 0;
> +}
> +
> +static int criu_restore_bos(struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args,
> + uint64_t *priv_offset,
> + uint64_t max_priv_data_size)
> +{
> + struct kfd_criu_bo_bucket *bo_buckets = NULL;
> + struct kfd_criu_bo_priv_data *bo_privs = NULL;
> + int ret = 0;
> + uint32_t i = 0;
> +
> + if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size)
> + return -EINVAL;
> +
> + /* Prevent MMU notifications until stage-4 IOCTL (CRIU_RESUME) is received */
> + amdgpu_amdkfd_block_mmu_notifications(p->kgd_process_info);
> +
> + bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL);
> + if (!bo_buckets)
> + return -ENOMEM;
> +
> + ret = copy_from_user(bo_buckets, (void __user *)args->bos,
> + args->num_bos * sizeof(*bo_buckets));
> + if (ret) {
> + pr_err("Failed to copy BOs information from user\n");
> + ret = -EFAULT;
> + goto exit;
> + }
> +
> + bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL);
> + if (!bo_privs) {
> + ret = -ENOMEM;
> + goto exit;
> + }
> +
> + ret = copy_from_user(bo_privs, (void __user *)args->priv_data + *priv_offset,
> + args->num_bos * sizeof(*bo_privs));
> + if (ret) {
> + pr_err("Failed to copy BOs information from user\n");
> + ret = -EFAULT;
> + goto exit;
> + }
> + *priv_offset += args->num_bos * sizeof(*bo_privs);
> +
> + /* Create and map new BOs */
> + for (; i < args->num_bos; i++) {
> + ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i]);
> + if (ret) {
> + pr_debug("Failed to restore BO[%d] ret%d\n", i, ret);
> + goto exit;
> + }
> + } /* done */
> +
> + /* Copy only the buckets back so user can read bo_buckets[N].restored_offset */
> + ret = copy_to_user((void __user *)args->bos,
> + bo_buckets,
> + (args->num_bos * sizeof(*bo_buckets)));
> + if (ret)
> + ret = -EFAULT;
> +
> +exit:
> + while (ret && i--) {
> + if (bo_buckets[i].alloc_flags
> + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
> + close_fd(bo_buckets[i].dmabuf_fd);
> + }
> + kvfree(bo_buckets);
> + kvfree(bo_privs);
> + return ret;
> +}
> +
> +static int criu_restore_objects(struct file *filep,
> + struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args,
> + uint64_t *priv_offset,
> + uint64_t max_priv_data_size)
> +{
> + int ret = 0;
> + uint32_t i;
> +
> + BUILD_BUG_ON(offsetof(struct kfd_criu_queue_priv_data, object_type));
> + BUILD_BUG_ON(offsetof(struct kfd_criu_event_priv_data, object_type));
> + BUILD_BUG_ON(offsetof(struct kfd_criu_svm_range_priv_data, object_type));
> +
> + for (i = 0; i < args->num_objects; i++) {
> + uint32_t object_type;
> +
> + if (*priv_offset + sizeof(object_type) > max_priv_data_size) {
> + pr_err("Invalid private data size\n");
> + return -EINVAL;
> + }
> +
> + ret = get_user(object_type, (uint32_t __user *)(args->priv_data + *priv_offset));
> + if (ret) {
> + pr_err("Failed to copy private information from user\n");
> + goto exit;
> + }
> +
> + switch (object_type) {
> + case KFD_CRIU_OBJECT_TYPE_QUEUE:
> + ret = kfd_criu_restore_queue(p, (uint8_t __user *)args->priv_data,
> + priv_offset, max_priv_data_size);
> + if (ret)
> + goto exit;
> + break;
> + case KFD_CRIU_OBJECT_TYPE_EVENT:
> + ret = kfd_criu_restore_event(filep, p, (uint8_t __user *)args->priv_data,
> + priv_offset, max_priv_data_size);
> + if (ret)
> + goto exit;
> + break;
> + case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
> + ret = kfd_criu_restore_svm(p, (uint8_t __user *)args->priv_data,
> + priv_offset, max_priv_data_size);
> + if (ret)
> + goto exit;
> + break;
> + default:
> + pr_err("Invalid object type:%u at index:%d\n", object_type, i);
> + ret = -EINVAL;
> + goto exit;
> + }
> + }
> +exit:
> + return ret;
> +}
> +
> +int kfd_criu_restore(struct file *filep,
> + struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args)
> +{
> + uint64_t priv_offset = 0;
> + int ret = 0;
> +
> + pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n",
> + args->num_devices, args->num_bos, args->num_objects, args->priv_data_size);
> +
> + if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size ||
> + !args->num_devices || !args->num_bos)
> + return -EINVAL;
> +
> + mutex_lock(&p->mutex);
> +
> + /*
> + * Set the process to evicted state to avoid running any new queues before all the memory
> + * mappings are ready.
> + */
> + ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
> + if (ret)
> + goto exit_unlock;
> +
> + /* Each function will adjust priv_offset based on how many bytes they consumed */
> + ret = criu_restore_process(p, args, &priv_offset, args->priv_data_size);
> + if (ret)
> + goto exit_unlock;
> +
> + ret = criu_restore_devices(p, args, &priv_offset, args->priv_data_size);
> + if (ret)
> + goto exit_unlock;
> +
> + ret = criu_restore_bos(p, args, &priv_offset, args->priv_data_size);
> + if (ret)
> + goto exit_unlock;
> +
> + ret = criu_restore_objects(filep, p, args, &priv_offset, args->priv_data_size);
> + if (ret)
> + goto exit_unlock;
> +
> + if (priv_offset != args->priv_data_size) {
> + pr_err("Invalid private data size\n");
> + ret = -EINVAL;
> + }
> +
> +exit_unlock:
> + mutex_unlock(&p->mutex);
> + if (ret)
> + pr_err("Failed to restore CRIU ret:%d\n", ret);
> + else
> + pr_debug("CRIU restore successful\n");
> +
> + return ret;
> +}
> +
> +int kfd_criu_unpause(struct file *filep,
> + struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args)
> +{
> + int ret;
> +
> + mutex_lock(&p->mutex);
> +
> + if (!p->queues_paused) {
> + mutex_unlock(&p->mutex);
> + return -EINVAL;
> + }
> +
> + ret = kfd_process_restore_queues(p);
> + if (ret)
> + pr_err("Failed to unpause queues ret:%d\n", ret);
> + else
> + p->queues_paused = false;
> +
> + mutex_unlock(&p->mutex);
> +
> + return ret;
> +}
> +
> +int kfd_criu_resume(struct file *filep,
> + struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args)
> +{
> + struct kfd_process *target = NULL;
> + struct pid *pid = NULL;
> + int ret = 0;
> +
> + pr_debug("Inside %s, target pid for criu restore: %d\n", __func__,
> + args->pid);
> +
> + pid = find_get_pid(args->pid);
> + if (!pid) {
> + pr_err("Cannot find pid info for %i\n", args->pid);
> + return -ESRCH;
> + }
> +
> + pr_debug("calling kfd_lookup_process_by_pid\n");
> + target = kfd_lookup_process_by_pid(pid);
> +
> + put_pid(pid);
> +
> + if (!target) {
> + pr_debug("Cannot find process info for %i\n", args->pid);
> + return -ESRCH;
> + }
> +
> + mutex_lock(&target->mutex);
> + ret = kfd_criu_resume_svm(target);
> + if (ret) {
> + pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
> + goto exit;
> + }
> +
> + ret = amdgpu_amdkfd_criu_resume(target->kgd_process_info);
> + if (ret)
> + pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
> +
> +exit:
> + mutex_unlock(&target->mutex);
> +
> + kfd_unref_process(target);
> + return ret;
> +}
> +
> +int kfd_criu_process_info(struct file *filep,
> + struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args)
> +{
> + int ret = 0;
> +
> + mutex_lock(&p->mutex);
> +
> + if (!p->n_pdds) {
> + pr_err("No pdd for given process\n");
> + ret = -ENODEV;
> + goto err_unlock;
> + }
> +
> + ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
> + if (ret)
> + goto err_unlock;
> +
> + p->queues_paused = true;
> +
> + args->pid = task_pid_nr_ns(p->lead_thread,
> + task_active_pid_ns(p->lead_thread));
> +
> + ret = criu_get_process_object_info(p, &args->num_devices, &args->num_bos,
> + &args->num_objects, &args->priv_data_size);
> + if (ret)
> + goto err_unlock;
> +
> + dev_dbg(kfd_device, "Num of devices:%u bos:%u objects:%u priv_data_size:%lld\n",
> + args->num_devices, args->num_bos, args->num_objects,
> + args->priv_data_size);
> +
> +err_unlock:
> + if (ret) {
> + kfd_process_restore_queues(p);
> + p->queues_paused = false;
> + }
> + mutex_unlock(&p->mutex);
> + return ret;
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_criu.h b/drivers/gpu/drm/amd/amdkfd/kfd_criu.h
> new file mode 100644
> index 000000000000..1a3d418a9505
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_criu.h
> @@ -0,0 +1,50 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/*
> + * Copyright 2024 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef __KFD_CRIU_H__
> +#define __KFD_CRIU_H__
> +
> +#include <uapi/linux/kfd_ioctl.h>
> +#include "kfd_priv.h"
> +
> +int kfd_criu_process_info(struct file *filep,
> + struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args);
> +
> +int kfd_criu_checkpoint(struct file *filep,
> + struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args);
> +
> +int kfd_criu_unpause(struct file *filep,
> + struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args);
> +
> +int kfd_criu_restore(struct file *filep,
> + struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args);
> +
> +int kfd_criu_resume(struct file *filep,
> + struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args);
> +
> +#endif