[Patch v4 06/24] drm/amdkfd: CRIU Implement KFD restore ioctl
Felix Kuehling
felix.kuehling at amd.com
Mon Jan 10 23:01:11 UTC 2022
On 2021-12-22 7:36 p.m., Rajneesh Bhardwaj wrote:
> This implements the KFD CRIU Restore ioctl that lays the basic
> foundation for the CRIU restore operation. It provides support to
> create the buffer objects corresponding to non-paged system memory
> mapped for GPU and/or CPU access, and lays the basic foundation for
> the userptr buffer objects which will be added in a separate patch.
> This ioctl creates various types of buffer objects such as VRAM,
> MMIO, Doorbell and GTT based on the data sent from the userspace
> plugin. The data mostly contains the previously checkpointed KFD
> images from some KFD process.
>
> While restoring a CRIU process, attach the old IDR values to the
> newly created BOs. This also adds minimal GPU mapping support for a
> single-GPU checkpoint/restore use case.
>
> Signed-off-by: David Yat Sin <david.yatsin at amd.com>
> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 298 ++++++++++++++++++++++-
> 1 file changed, 297 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index cdbb92972338..c93f74ad073f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2069,11 +2069,307 @@ static int criu_checkpoint(struct file *filep,
> return ret;
> }
>
> +static int criu_restore_process(struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args,
> + uint64_t *priv_offset,
> + uint64_t max_priv_data_size)
> +{
> + int ret = 0;
> + struct kfd_criu_process_priv_data process_priv;
> +
> + if (*priv_offset + sizeof(process_priv) > max_priv_data_size)
> + return -EINVAL;
> +
> + ret = copy_from_user(&process_priv,
> + (void __user *)(args->priv_data + *priv_offset),
> + sizeof(process_priv));
> + if (ret) {
> + pr_err("Failed to copy process private information from user\n");
> + ret = -EFAULT;
> + goto exit;
> + }
> + *priv_offset += sizeof(process_priv);
> +
> + if (process_priv.version != KFD_CRIU_PRIV_VERSION) {
> + pr_err("Invalid CRIU API version (checkpointed:%d current:%d)\n",
> + process_priv.version, KFD_CRIU_PRIV_VERSION);
> + return -EINVAL;
> + }
> +
> +exit:
> + return ret;
> +}
> +
> +static int criu_restore_bos(struct kfd_process *p,
> + struct kfd_ioctl_criu_args *args,
> + uint64_t *priv_offset,
> + uint64_t max_priv_data_size)
> +{
> + struct kfd_criu_bo_bucket *bo_buckets;
> + struct kfd_criu_bo_priv_data *bo_privs;
> + bool flush_tlbs = false;
> + int ret = 0, j = 0;
> + uint32_t i;
> +
> + if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size)
> + return -EINVAL;
> +
> + bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL);
> + if (!bo_buckets)
> + return -ENOMEM;
> +
> + ret = copy_from_user(bo_buckets, (void __user *)args->bos,
> + args->num_bos * sizeof(*bo_buckets));
> + if (ret) {
> + pr_err("Failed to copy BOs information from user\n");
> + ret = -EFAULT;
> + goto exit;
> + }
> +
> + bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL);
> + if (!bo_privs) {
> + ret = -ENOMEM;
> + goto exit;
> + }
> +
> + ret = copy_from_user(bo_privs, (void __user *)args->priv_data + *priv_offset,
> + args->num_bos * sizeof(*bo_privs));
> + if (ret) {
> + pr_err("Failed to copy BOs information from user\n");
> + ret = -EFAULT;
> + goto exit;
> + }
> + *priv_offset += args->num_bos * sizeof(*bo_privs);
> +
> + /* Create and map new BOs */
> + for (i = 0; i < args->num_bos; i++) {
> + struct kfd_criu_bo_bucket *bo_bucket;
> + struct kfd_criu_bo_priv_data *bo_priv;
> + struct kfd_dev *dev;
> + struct kfd_process_device *pdd;
> + void *mem;
> + u64 offset;
> + int idr_handle;
> +
> + bo_bucket = &bo_buckets[i];
> + bo_priv = &bo_privs[i];
> +
> + dev = kfd_device_by_id(bo_bucket->gpu_id);
> + if (!dev) {
> + ret = -EINVAL;
> + pr_err("Failed to get pdd\n");
> + goto exit;
> + }
> + pdd = kfd_get_process_device_data(dev, p);
> + if (!pdd) {
> + ret = -EINVAL;
> + pr_err("Failed to get pdd\n");
> + goto exit;
> + }
> +
> + pr_debug("kfd restore ioctl - bo_bucket[%d]:\n", i);
> + pr_debug("size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
> + "gpu_id = 0x%x alloc_flags = 0x%x\n"
> + "idr_handle = 0x%x\n",
> + bo_bucket->size,
> + bo_bucket->addr,
> + bo_bucket->offset,
> + bo_bucket->gpu_id,
> + bo_bucket->alloc_flags,
> + bo_priv->idr_handle);
> +
> + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
> + pr_debug("restore ioctl: KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL\n");
> + if (bo_bucket->size != kfd_doorbell_process_slice(dev)) {
> + ret = -EINVAL;
> + goto exit;
> + }
> + offset = kfd_get_process_doorbells(pdd);
> + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> + /* MMIO BOs need remapped bus address */
> + pr_debug("restore ioctl :KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP\n");
> + if (bo_bucket->size != PAGE_SIZE) {
> + pr_err("Invalid page size\n");
> + ret = -EINVAL;
> + goto exit;
> + }
> + offset = dev->adev->rmmio_remap.bus_addr;
> + if (!offset) {
> + pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n");
> + ret = -ENOMEM;
> + goto exit;
> + }
> + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> + offset = bo_priv->user_addr;
> + }
> +
> + /* Create the BO */
> + ret = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(dev->adev,
> + bo_bucket->addr,
> + bo_bucket->size,
> + pdd->drm_priv,
> + (struct kgd_mem **) &mem,
> + &offset,
> + bo_bucket->alloc_flags);
> + if (ret) {
> + pr_err("Could not create the BO\n");
> + ret = -ENOMEM;
> + goto exit;
> + }
> + pr_debug("New BO created: size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n",
> + bo_bucket->size, bo_bucket->addr, offset);
> +
> + /* Restore previous IDR handle */
> + pr_debug("Restoring old IDR handle for the BO");
> + idr_handle = idr_alloc(&pdd->alloc_idr, mem,
> + bo_priv->idr_handle,
> + bo_priv->idr_handle + 1, GFP_KERNEL);
> + if (idr_handle < 0) {
> + pr_err("Could not allocate idr\n");
> + amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev,
> + (struct kgd_mem *)mem,
> + pdd->drm_priv, NULL);
> +
> + ret = -ENOMEM;
> + goto exit;
> + }
> +
> + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
> + bo_bucket->restored_offset = KFD_MMAP_TYPE_DOORBELL |
> + KFD_MMAP_GPU_ID(pdd->dev->id);
> + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> + bo_bucket->restored_offset = KFD_MMAP_TYPE_MMIO |
> + KFD_MMAP_GPU_ID(pdd->dev->id);
> + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
> + bo_bucket->restored_offset = offset;
> + pr_debug("updating offset for GTT\n");
> + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
> + bo_bucket->restored_offset = offset;
> + /* Update the VRAM usage count */
> + WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
> + pr_debug("updating offset for VRAM\n");
> + }
> +
> + /* now map these BOs to GPU/s */
> + for (j = 0; j < p->n_pdds; j++) {
> + struct kfd_process_device *pdd = p->pdds[j];
> + struct kfd_dev *peer;
> + struct kfd_process_device *peer_pdd;
> + bool table_freed = false;
> +
> + peer = kfd_device_by_id(pdd->dev->id);
> +
> + pr_debug("Inside mapping loop with desired gpu_id = 0x%x\n",
> + pdd->dev->id);
> + if (!peer) {
> + pr_debug("Getting device by id failed for 0x%x\n",
> + pdd->dev->id);
> + ret = -EINVAL;
> + goto exit;
> + }
> +
> + peer_pdd = kfd_bind_process_to_device(peer, p);
> + if (IS_ERR(peer_pdd)) {
> + ret = PTR_ERR(peer_pdd);
> + goto exit;
> + }
> + pr_debug("map mem in restore ioctl -> 0x%llx\n",
> + ((struct kgd_mem *)mem)->va);
> + ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev,
> + (struct kgd_mem *)mem, peer_pdd->drm_priv, &table_freed);
Are we mapping the BOs on all GPUs? That's incorrect. Not all BOs are
mapped on all GPUs. Checkpoint/restore needs to remember the set of GPUs
where a BO was mapped and restore the mapping only on those GPUs.
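
Something along the lines of the sketch below is what I have in mind. This
is only a sketch, not tested code: the num_mapped_gpus and mapped_gpuids[]
fields in kfd_criu_bo_priv_data are hypothetical, and the checkpoint side
would have to record them when it dumps each BO.

	/* Sketch: map the restored BO only on the GPUs it was mapped on
	 * at checkpoint time. bo_priv->num_mapped_gpus and
	 * bo_priv->mapped_gpuids[] are hypothetical fields filled in by
	 * the checkpoint code.
	 */
	for (j = 0; j < bo_priv->num_mapped_gpus; j++) {
		struct kfd_dev *peer;
		struct kfd_process_device *peer_pdd;
		bool table_freed = false;

		peer = kfd_device_by_id(bo_priv->mapped_gpuids[j]);
		if (!peer) {
			ret = -EINVAL;
			goto exit;
		}

		peer_pdd = kfd_bind_process_to_device(peer, p);
		if (IS_ERR(peer_pdd)) {
			ret = PTR_ERR(peer_pdd);
			goto exit;
		}

		ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev,
				(struct kgd_mem *)mem, peer_pdd->drm_priv,
				&table_freed);
		if (ret) {
			pr_err("Failed to map BO on gpu_id 0x%x\n",
			       bo_priv->mapped_gpuids[j]);
			goto exit;
		}
		if (table_freed)
			flush_tlbs = true;
	}

How the checkpoint code collects those gpu_ids is a separate question; the
point is that the restore path should only map a BO on the GPUs it was
actually mapped on when it was checkpointed.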
Regards,
Felix
> + if (ret) {
> + pr_err("Failed to map to gpu %d/%d\n",
> + j, p->n_pdds);
> + goto exit;
> + }
> + if (table_freed)
> + flush_tlbs = true;
> + }
> +
> + ret = amdgpu_amdkfd_gpuvm_sync_memory(dev->adev,
> + (struct kgd_mem *) mem, true);
> + if (ret) {
> + pr_debug("Sync memory failed, wait interrupted by user signal\n");
> + goto exit;
> + }
> +
> + pr_debug("map memory was successful for the BO\n");
> + } /* done */
> +
> + if (flush_tlbs) {
> + /* Flush TLBs after waiting for the page table updates to complete */
> + for (j = 0; j < p->n_pdds; j++) {
> + struct kfd_dev *peer;
> + struct kfd_process_device *pdd = p->pdds[j];
> + struct kfd_process_device *peer_pdd;
> +
> + peer = kfd_device_by_id(pdd->dev->id);
> + if (WARN_ON_ONCE(!peer))
> + continue;
> + peer_pdd = kfd_get_process_device_data(peer, p);
> + if (WARN_ON_ONCE(!peer_pdd))
> + continue;
> + kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY);
> + }
> + }
> +
> + /* Copy only the buckets back so user can read bo_buckets[N].restored_offset */
> + ret = copy_to_user((void __user *)args->bos,
> + bo_buckets,
> + (args->num_bos * sizeof(*bo_buckets)));
> + if (ret)
> + ret = -EFAULT;
> +
> +exit:
> + kvfree(bo_buckets);
> + kvfree(bo_privs);
> + return ret;
> +}
> +
> static int criu_restore(struct file *filep,
> struct kfd_process *p,
> struct kfd_ioctl_criu_args *args)
> {
> - return 0;
> + uint64_t priv_offset = 0;
> + int ret = 0;
> +
> + pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n",
> + args->num_devices, args->num_bos, args->num_objects, args->priv_data_size);
> +
> + if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size ||
> + !args->num_devices || !args->num_bos)
> + return -EINVAL;
> +
> + mutex_lock(&p->mutex);
> +
> + /*
> + * Set the process to evicted state to avoid running any new queues before all the memory
> + * mappings are ready.
> + */
> + kfd_process_evict_queues(p);
> +
> + /* Each function will adjust priv_offset based on how many bytes they consumed */
> + ret = criu_restore_process(p, args, &priv_offset, args->priv_data_size);
> + if (ret)
> + goto exit_unlock;
> +
> + ret = criu_restore_bos(p, args, &priv_offset, args->priv_data_size);
> + if (ret)
> + goto exit_unlock;
> +
> + if (priv_offset != args->priv_data_size) {
> + pr_err("Invalid private data size\n");
> + ret = -EINVAL;
> + }
> +
> +exit_unlock:
> + mutex_unlock(&p->mutex);
> + if (ret)
> + pr_err("Failed to restore CRIU ret:%d\n", ret);
> + else
> + pr_debug("CRIU restore successful\n");
> +
> + return ret;
> }
>
> static int criu_unpause(struct file *filep,