[Patch v4 06/24] drm/amdkfd: CRIU Implement KFD restore ioctl

Felix Kuehling felix.kuehling at amd.com
Mon Jan 10 23:01:11 UTC 2022


On 2021-12-22 7:36 p.m., Rajneesh Bhardwaj wrote:
> This implements the KFD CRIU restore ioctl that lays the basic
> foundation for the CRIU restore operation. It provides support for
> creating the buffer objects corresponding to non-paged system memory
> mapped for GPU and/or CPU access, and lays the groundwork for the
> userptr buffer objects which will be added in a separate patch.
> This ioctl creates various types of buffer objects such as VRAM,
> MMIO, doorbell and GTT based on the data sent from the userspace
> plugin. The data mostly contains the previously checkpointed KFD
> images from some KFD process.
>
> While restoring a CRIU process, attach the old IDR values to the
> newly created BOs. This also adds minimal GPU mapping support for the
> single-GPU checkpoint/restore use case.
>
> Signed-off-by: David Yat Sin <david.yatsin at amd.com>
> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 298 ++++++++++++++++++++++-
>   1 file changed, 297 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index cdbb92972338..c93f74ad073f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2069,11 +2069,307 @@ static int criu_checkpoint(struct file *filep,
>   	return ret;
>   }
>   
> +static int criu_restore_process(struct kfd_process *p,
> +				struct kfd_ioctl_criu_args *args,
> +				uint64_t *priv_offset,
> +				uint64_t max_priv_data_size)
> +{
> +	int ret = 0;
> +	struct kfd_criu_process_priv_data process_priv;
> +
> +	if (*priv_offset + sizeof(process_priv) > max_priv_data_size)
> +		return -EINVAL;
> +
> +	ret = copy_from_user(&process_priv,
> +				(void __user *)(args->priv_data + *priv_offset),
> +				sizeof(process_priv));
> +	if (ret) {
> +		pr_err("Failed to copy process private information from user\n");
> +		ret = -EFAULT;
> +		goto exit;
> +	}
> +	*priv_offset += sizeof(process_priv);
> +
> +	if (process_priv.version != KFD_CRIU_PRIV_VERSION) {
> +		pr_err("Invalid CRIU API version (checkpointed:%d current:%d)\n",
> +			process_priv.version, KFD_CRIU_PRIV_VERSION);
> +		return -EINVAL;
> +	}
> +
> +exit:
> +	return ret;
> +}
> +
> +static int criu_restore_bos(struct kfd_process *p,
> +			    struct kfd_ioctl_criu_args *args,
> +			    uint64_t *priv_offset,
> +			    uint64_t max_priv_data_size)
> +{
> +	struct kfd_criu_bo_bucket *bo_buckets;
> +	struct kfd_criu_bo_priv_data *bo_privs;
> +	bool flush_tlbs = false;
> +	int ret = 0, j = 0;
> +	uint32_t i;
> +
> +	if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size)
> +		return -EINVAL;
> +
> +	bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL);
> +	if (!bo_buckets)
> +		return -ENOMEM;
> +
> +	ret = copy_from_user(bo_buckets, (void __user *)args->bos,
> +			     args->num_bos * sizeof(*bo_buckets));
> +	if (ret) {
> +		pr_err("Failed to copy BOs information from user\n");
> +		ret = -EFAULT;
> +		goto exit;
> +	}
> +
> +	bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL);
> +	if (!bo_privs) {
> +		ret = -ENOMEM;
> +		goto exit;
> +	}
> +
> +	ret = copy_from_user(bo_privs, (void __user *)args->priv_data + *priv_offset,
> +			     args->num_bos * sizeof(*bo_privs));
> +	if (ret) {
> +		pr_err("Failed to copy BOs information from user\n");
> +		ret = -EFAULT;
> +		goto exit;
> +	}
> +	*priv_offset += args->num_bos * sizeof(*bo_privs);
> +
> +	/* Create and map new BOs */
> +	for (i = 0; i < args->num_bos; i++) {
> +		struct kfd_criu_bo_bucket *bo_bucket;
> +		struct kfd_criu_bo_priv_data *bo_priv;
> +		struct kfd_dev *dev;
> +		struct kfd_process_device *pdd;
> +		void *mem;
> +		u64 offset;
> +		int idr_handle;
> +
> +		bo_bucket = &bo_buckets[i];
> +		bo_priv = &bo_privs[i];
> +
> +		dev = kfd_device_by_id(bo_bucket->gpu_id);
> +		if (!dev) {
> +			ret = -EINVAL;
> +			pr_err("Failed to get device\n");
> +			goto exit;
> +		}
> +		pdd = kfd_get_process_device_data(dev, p);
> +		if (!pdd) {
> +			ret = -EINVAL;
> +			pr_err("Failed to get pdd\n");
> +			goto exit;
> +		}
> +
> +		pr_debug("kfd restore ioctl - bo_bucket[%d]:\n", i);
> +		pr_debug("size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
> +			"gpu_id = 0x%x alloc_flags = 0x%x\n"
> +			"idr_handle = 0x%x\n",
> +			bo_bucket->size,
> +			bo_bucket->addr,
> +			bo_bucket->offset,
> +			bo_bucket->gpu_id,
> +			bo_bucket->alloc_flags,
> +			bo_priv->idr_handle);
> +
> +		if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
> +			pr_debug("restore ioctl: KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL\n");
> +			if (bo_bucket->size != kfd_doorbell_process_slice(dev)) {
> +				ret = -EINVAL;
> +				goto exit;
> +			}
> +			offset = kfd_get_process_doorbells(pdd);
> +		} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> +			/* MMIO BOs need remapped bus address */
> +			pr_debug("restore ioctl: KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP\n");
> +			if (bo_bucket->size != PAGE_SIZE) {
> +				pr_err("Invalid page size\n");
> +				ret = -EINVAL;
> +				goto exit;
> +			}
> +			offset = dev->adev->rmmio_remap.bus_addr;
> +			if (!offset) {
> +				pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n");
> +				ret = -ENOMEM;
> +				goto exit;
> +			}
> +		} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> +			offset = bo_priv->user_addr;
> +		}
> +
> +		/* Create the BO */
> +		ret = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(dev->adev,
> +						bo_bucket->addr,
> +						bo_bucket->size,
> +						pdd->drm_priv,
> +						(struct kgd_mem **) &mem,
> +						&offset,
> +						bo_bucket->alloc_flags);
> +		if (ret) {
> +			pr_err("Could not create the BO\n");
> +			ret = -ENOMEM;
> +			goto exit;
> +		}
> +		pr_debug("New BO created: size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n",
> +			bo_bucket->size, bo_bucket->addr, offset);
> +
> +		/* Restore previous IDR handle */
> +		pr_debug("Restoring old IDR handle for the BO\n");
> +		idr_handle = idr_alloc(&pdd->alloc_idr, mem,
> +				       bo_priv->idr_handle,
> +				       bo_priv->idr_handle + 1, GFP_KERNEL);
> +		if (idr_handle < 0) {
> +			pr_err("Could not allocate idr\n");
> +			amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev,
> +						(struct kgd_mem *)mem,
> +						pdd->drm_priv, NULL);
> +
> +			ret = -ENOMEM;
> +			goto exit;
> +		}
> +
> +		if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
> +			bo_bucket->restored_offset = KFD_MMAP_TYPE_DOORBELL |
> +				KFD_MMAP_GPU_ID(pdd->dev->id);
> +		if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> +			bo_bucket->restored_offset = KFD_MMAP_TYPE_MMIO |
> +				KFD_MMAP_GPU_ID(pdd->dev->id);
> +		} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
> +			bo_bucket->restored_offset = offset;
> +			pr_debug("updating offset for GTT\n");
> +		} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
> +			bo_bucket->restored_offset = offset;
> +			/* Update the VRAM usage count */
> +			WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
> +			pr_debug("updating offset for VRAM\n");
> +		}
> +
> +		/* now map these BOs to GPU/s */
> +		for (j = 0; j < p->n_pdds; j++) {
> +			struct kfd_process_device *pdd = p->pdds[j];
> +			struct kfd_dev *peer;
> +			struct kfd_process_device *peer_pdd;
> +			bool table_freed = false;
> +
> +			peer = kfd_device_by_id(pdd->dev->id);
> +
> +			pr_debug("Inside mapping loop with desired gpu_id = 0x%x\n",
> +							pdd->dev->id);
> +			if (!peer) {
> +				pr_debug("Getting device by id failed for 0x%x\n",
> +						pdd->dev->id);
> +				ret = -EINVAL;
> +				goto exit;
> +			}
> +
> +			peer_pdd = kfd_bind_process_to_device(peer, p);
> +			if (IS_ERR(peer_pdd)) {
> +				ret = PTR_ERR(peer_pdd);
> +				goto exit;
> +			}
> +			pr_debug("map mem in restore ioctl -> 0x%llx\n",
> +				 ((struct kgd_mem *)mem)->va);
> +			ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev,
> +				(struct kgd_mem *)mem, peer_pdd->drm_priv, &table_freed);

Are we mapping the BOs on all GPUs? That's incorrect. Not all BOs are 
mapped on all GPUs. Checkpoint/restore needs to remember the set of GPUs 
where a BO was mapped and restore the mapping only on those GPUs.
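
Roughly something like this (only a sketch, assuming
kfd_criu_bo_priv_data gained a mapped_gpuids[] array and a
num_mapped_gpuids count recorded at checkpoint time; those fields
and names are just illustrative, they don't exist in this patch):

	/* Map the BO only on the GPUs it was mapped on when checkpointed */
	for (j = 0; j < bo_priv->num_mapped_gpuids; j++) {
		struct kfd_dev *peer;
		struct kfd_process_device *peer_pdd;
		bool table_freed = false;

		peer = kfd_device_by_id(bo_priv->mapped_gpuids[j]);
		if (!peer) {
			ret = -EINVAL;
			goto exit;
		}

		peer_pdd = kfd_bind_process_to_device(peer, p);
		if (IS_ERR(peer_pdd)) {
			ret = PTR_ERR(peer_pdd);
			goto exit;
		}

		ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev,
				(struct kgd_mem *)mem, peer_pdd->drm_priv,
				&table_freed);
		if (ret)
			goto exit;
		if (table_freed)
			flush_tlbs = true;
	}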

Regards,
   Felix


> +			if (ret) {
> +				pr_err("Failed to map to gpu %d/%d\n",
> +				       j, p->n_pdds);
> +				goto exit;
> +			}
> +			if (table_freed)
> +				flush_tlbs = true;
> +		}
> +
> +		ret = amdgpu_amdkfd_gpuvm_sync_memory(dev->adev,
> +						      (struct kgd_mem *) mem, true);
> +		if (ret) {
> +			pr_debug("Sync memory failed, wait interrupted by user signal\n");
> +			goto exit;
> +		}
> +
> +		pr_debug("map memory was successful for the BO\n");
> +	} /* done */
> +
> +	if (flush_tlbs) {
> +		/* Flush TLBs after waiting for the page table updates to complete */
> +		for (j = 0; j < p->n_pdds; j++) {
> +			struct kfd_dev *peer;
> +			struct kfd_process_device *pdd = p->pdds[j];
> +			struct kfd_process_device *peer_pdd;
> +
> +			peer = kfd_device_by_id(pdd->dev->id);
> +			if (WARN_ON_ONCE(!peer))
> +				continue;
> +			peer_pdd = kfd_get_process_device_data(peer, p);
> +			if (WARN_ON_ONCE(!peer_pdd))
> +				continue;
> +			kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY);
> +		}
> +	}
> +
> +	/* Copy only the buckets back so user can read bo_buckets[N].restored_offset */
> +	ret = copy_to_user((void __user *)args->bos,
> +				bo_buckets,
> +				(args->num_bos * sizeof(*bo_buckets)));
> +	if (ret)
> +		ret = -EFAULT;
> +
> +exit:
> +	kvfree(bo_buckets);
> +	kvfree(bo_privs);
> +	return ret;
> +}
> +
>   static int criu_restore(struct file *filep,
>   			struct kfd_process *p,
>   			struct kfd_ioctl_criu_args *args)
>   {
> -	return 0;
> +	uint64_t priv_offset = 0;
> +	int ret = 0;
> +
> +	pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n",
> +		 args->num_devices, args->num_bos, args->num_objects, args->priv_data_size);
> +
> +	if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size ||
> +	    !args->num_devices || !args->num_bos)
> +		return -EINVAL;
> +
> +	mutex_lock(&p->mutex);
> +
> +	/*
> +	 * Set the process to evicted state to avoid running any new queues before all the memory
> +	 * mappings are ready.
> +	 */
> +	kfd_process_evict_queues(p);
> +
> +	/* Each function will adjust priv_offset based on how many bytes they consumed */
> +	ret = criu_restore_process(p, args, &priv_offset, args->priv_data_size);
> +	if (ret)
> +		goto exit_unlock;
> +
> +	ret = criu_restore_bos(p, args, &priv_offset, args->priv_data_size);
> +	if (ret)
> +		goto exit_unlock;
> +
> +	if (priv_offset != args->priv_data_size) {
> +		pr_err("Invalid private data size\n");
> +		ret = -EINVAL;
> +	}
> +
> +exit_unlock:
> +	mutex_unlock(&p->mutex);
> +	if (ret)
> +		pr_err("Failed to restore CRIU ret:%d\n", ret);
> +	else
> +		pr_debug("CRIU restore successful\n");
> +
> +	return ret;
>   }
>   
>   static int criu_unpause(struct file *filep,

