[PATCH 15/18] drm/amdkfd: CRIU dump and restore events

Felix Kuehling felix.kuehling at amd.com
Mon Aug 23 18:39:40 UTC 2021


Am 2021-08-19 um 9:37 a.m. schrieb David Yat Sin:
> Add support to the existing CRIU ioctls to save and restore events during
> CRIU checkpoint and restore.
>
> Signed-off-by: David Yat Sin <david.yatsin at amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 130 +++++++-----
>  drivers/gpu/drm/amd/amdkfd/kfd_events.c  | 253 ++++++++++++++++++++---
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  25 ++-
>  3 files changed, 329 insertions(+), 79 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 19f16e3dd769..c8f523d8ab81 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1008,51 +1008,11 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p,
>  	 * through the event_page_offset field.
>  	 */
>  	if (args->event_page_offset) {
> -		struct kfd_dev *kfd;
> -		struct kfd_process_device *pdd;
> -		void *mem, *kern_addr;
> -		uint64_t size;
> -
> -		if (p->signal_page) {
> -			pr_err("Event page is already set\n");
> -			return -EINVAL;
> -		}
> -
> -		kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset));
> -		if (!kfd) {
> -			pr_err("Getting device by id failed in %s\n", __func__);
> -			return -EINVAL;
> -		}
> -
>  		mutex_lock(&p->mutex);
> -		pdd = kfd_bind_process_to_device(kfd, p);
> -		if (IS_ERR(pdd)) {
> -			err = PTR_ERR(pdd);
> -			goto out_unlock;
> -		}
> -
> -		mem = kfd_process_device_translate_handle(pdd,
> -				GET_IDR_HANDLE(args->event_page_offset));
> -		if (!mem) {
> -			pr_err("Can't find BO, offset is 0x%llx\n",
> -			       args->event_page_offset);
> -			err = -EINVAL;
> -			goto out_unlock;
> -		}
> +		err = kfd_kmap_event_page(p, args->event_page_offset);
>  		mutex_unlock(&p->mutex);
> -
> -		err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(kfd->kgd,
> -						mem, &kern_addr, &size);
> -		if (err) {
> -			pr_err("Failed to map event page to kernel\n");
> -			return err;
> -		}
> -
> -		err = kfd_event_page_set(p, kern_addr, size);
> -		if (err) {
> -			pr_err("Failed to set event page\n");
> +		if (err)
>  			return err;
> -		}
>  	}
>  
>  	err = kfd_event_create(filp, p, args->event_type,
> @@ -1061,10 +1021,7 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p,
>  				&args->event_page_offset,
>  				&args->event_slot_index);
>  
> -	return err;
> -
> -out_unlock:
> -	mutex_unlock(&p->mutex);
> +	pr_debug("Created event (id:0x%08x) (%s)\n", args->event_id, __func__);
>  	return err;
>  }
>  
> @@ -2208,6 +2165,41 @@ static int criu_dump_queues(struct kfd_process *p, struct kfd_ioctl_criu_dumper_
>  	return ret;
>  }
>  
> +/*
> + * Dump all events of process @p into the user buffer described by @args.
> + *
> + * The user buffer (args->objects, args->objects_size bytes) must hold
> + * args->num_objects event buckets followed by the same number of
> + * struct kfd_criu_event_priv_data records. args->num_objects must match
> + * the number of events currently registered for the process.
> + *
> + * Returns 0 on success, -EINVAL if the object count or buffer size does
> + * not match, -ENOMEM if the staging buffer cannot be allocated, -EFAULT
> + * if the copy to user space fails, or the error from kfd_event_dump().
> + */
> +static int criu_dump_events(struct kfd_process *p, struct kfd_ioctl_criu_dumper_args *args)
> +{
> +	struct kfd_criu_event_bucket *ev_buckets;
> +	uint32_t num_events;
> +	int ret = 0;
> +
> +	num_events = kfd_get_num_events(p);
> +	if (args->num_objects != num_events) {
> +		pr_err("Mismatch with number of events (current:%d user:%lld)\n",
> +							num_events, args->num_objects);
> +		/* Refuse to dump a snapshot that does not match user expectations */
> +		return -EINVAL;
> +	}
> +
> +	if (args->objects_size != args->num_objects *
> +				  (sizeof(*ev_buckets) + sizeof(struct kfd_criu_event_priv_data))) {
> +		pr_err("Invalid objects size for events\n");
> +		return -EINVAL;
> +	}
> +
> +	/* Zeroed so that fields kfd_event_dump() does not set are not leaked */
> +	ev_buckets = kvzalloc(args->objects_size, GFP_KERNEL);
> +	if (!ev_buckets)
> +		return -ENOMEM;
> +
> +	ret = kfd_event_dump(p, ev_buckets, args->num_objects);
> +	if (!ret) {
> +		ret = copy_to_user((void __user *)args->objects, ev_buckets, args->objects_size);
> +		if (ret) {
> +			pr_err("Failed to copy events information to user\n");
> +			ret = -EFAULT;
> +		}
> +	}
> +	kvfree(ev_buckets);
> +	return ret;
> +}
> +
>  static int kfd_ioctl_criu_dumper(struct file *filep,
>  				struct kfd_process *p, void *data)
>  {
> @@ -2246,6 +2238,8 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
>  		ret = criu_dump_queues(p, args);
>  		break;
>  	case KFD_CRIU_OBJECT_TYPE_EVENT:
> +		ret = criu_dump_events(p, args);
> +		break;
>  	case KFD_CRIU_OBJECT_TYPE_DEVICE:
>  	case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
>  	default:
> @@ -2676,6 +2670,40 @@ static int criu_restore_queues(struct kfd_process *p,
>  	return ret;
>  }
>  
> +/*
> + * Restore events for process @p from the user buffer described by @args.
> + *
> + * The user buffer (args->objects, args->objects_size bytes) is expected to
> + * hold args->num_objects event buckets followed by the same number of
> + * struct kfd_criu_event_priv_data records. Each bucket refers to its
> + * private data through priv_data_offset, relative to the first record.
> + *
> + * Returns 0 on success, -EINVAL if the buffer size or a private data
> + * offset is inconsistent, -ENOMEM if the staging buffer cannot be
> + * allocated, -EFAULT if the copy from user space fails, or the error
> + * from kfd_event_restore().
> + */
> +static int criu_restore_events(struct file *filp, struct kfd_process *p,
> +			struct kfd_ioctl_criu_restorer_args *args)
> +{
> +	const size_t priv_size = sizeof(struct kfd_criu_event_priv_data);
> +	const size_t object_size = sizeof(struct kfd_criu_event_bucket) + priv_size;
> +	int ret = 0, i;
> +	uint8_t *objects, *private_data;
> +	struct kfd_criu_event_bucket *ev_buckets;
> +
> +	/*
> +	 * Both values come from user space. Checking with a division rather
> +	 * than a multiplication keeps the test itself free of overflow.
> +	 */
> +	if (args->objects_size % object_size ||
> +	    args->objects_size / object_size != args->num_objects) {
> +		pr_err("Invalid objects size for events\n");
> +		return -EINVAL;
> +	}
> +
> +	objects = kvzalloc(args->objects_size, GFP_KERNEL);
> +	if (!objects)
> +		return -ENOMEM;
> +
> +	ret = copy_from_user(objects, (void __user *)args->objects, args->objects_size);
> +	if (ret) {
> +		pr_err("Failed to copy event information from user\n");
> +		ret = -EFAULT;
> +		goto exit;
> +	}
> +
> +	ev_buckets = (struct kfd_criu_event_bucket *) objects;
> +	private_data = (void *)(ev_buckets + args->num_objects);
> +
> +	for (i = 0; i < args->num_objects; i++) {
> +		/* The private data record must lie entirely inside the buffer */
> +		if (ev_buckets[i].priv_data_offset > (args->num_objects - 1) * priv_size) {
> +			pr_err("Invalid private data offset for event\n");
> +			ret = -EINVAL;
> +			goto exit;
> +		}
> +
> +		ret = kfd_event_restore(filp, p, &ev_buckets[i], private_data);
> +		if (ret) {
> +			pr_err("Failed to restore event (%d)\n", ret);
> +			goto exit;
> +		}
> +	}
> +
> +exit:
> +	/* Free the allocation itself; ev_buckets is not set on every path */
> +	kvfree(objects);
> +	return ret;
> +}
> +
>  static int kfd_ioctl_criu_restorer(struct file *filep,
>  				struct kfd_process *p, void *data)
>  {
> @@ -2698,6 +2726,8 @@ static int kfd_ioctl_criu_restorer(struct file *filep,
>  		ret = criu_restore_queues(p, args);
>  		break;
>  	case KFD_CRIU_OBJECT_TYPE_EVENT:
> +		ret = criu_restore_events(filep, p, args);
> +		break;
>  	case KFD_CRIU_OBJECT_TYPE_DEVICE:
>  	case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
>  	default:
> @@ -2799,9 +2829,13 @@ static int kfd_ioctl_criu_process_info(struct file *filep,
>  	args->queues_priv_data_size = queues_extra_data_size +
>  				(args->total_queues * sizeof(struct kfd_criu_queue_priv_data));
>  
> -	dev_dbg(kfd_device, "Num of bos:%llu queues:%u\n",
> +	args->total_events = kfd_get_num_events(p);
> +	args->events_priv_data_size = args->total_events * sizeof(struct kfd_criu_event_priv_data);
> +
> +	dev_dbg(kfd_device, "Num of bos:%llu queues:%u events:%u\n",
>  				args->total_bos,
> -				args->total_queues);
> +				args->total_queues,
> +				args->total_events);
>  err_unlock:
>  	mutex_unlock(&p->mutex);
>  	return ret;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index ba2c2ce0c55a..18362478e351 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -53,9 +53,9 @@ struct kfd_signal_page {
>  	uint64_t *kernel_address;
>  	uint64_t __user *user_address;
>  	bool need_to_free_pages;
> +	uint64_t user_handle; /* Needed for CRIU dump and restore */
>  };
>  
> -
>  static uint64_t *page_slots(struct kfd_signal_page *page)
>  {
>  	return page->kernel_address;
> @@ -92,7 +92,8 @@ static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p)
>  }
>  
>  static int allocate_event_notification_slot(struct kfd_process *p,
> -					    struct kfd_event *ev)
> +					    struct kfd_event *ev,
> +					    const int *restore_id)
>  {
>  	int id;
>  
> @@ -104,14 +105,19 @@ static int allocate_event_notification_slot(struct kfd_process *p,
>  		p->signal_mapped_size = 256*8;
>  	}
>  
> -	/*
> -	 * Compatibility with old user mode: Only use signal slots
> -	 * user mode has mapped, may be less than
> -	 * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase
> -	 * of the event limit without breaking user mode.
> -	 */
> -	id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8,
> -		       GFP_KERNEL);
> +	if (restore_id) {
> +		id = idr_alloc(&p->event_idr, ev, *restore_id, *restore_id + 1,
> +				GFP_KERNEL);
> +	} else {
> +		/*
> +		 * Compatibility with old user mode: Only use signal slots
> +		 * user mode has mapped, may be less than
> +		 * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase
> +		 * of the event limit without breaking user mode.
> +		 */
> +		id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8,
> +				GFP_KERNEL);
> +	}
>  	if (id < 0)
>  		return id;
>  
> @@ -178,9 +184,8 @@ static struct kfd_event *lookup_signaled_event_by_partial_id(
>  	return ev;
>  }
>  
> -static int create_signal_event(struct file *devkfd,
> -				struct kfd_process *p,
> -				struct kfd_event *ev)
> +static int create_signal_event(struct file *devkfd, struct kfd_process *p,
> +				struct kfd_event *ev, const int *restore_id)
>  {
>  	int ret;
>  
> @@ -193,7 +198,7 @@ static int create_signal_event(struct file *devkfd,
>  		return -ENOSPC;
>  	}
>  
> -	ret = allocate_event_notification_slot(p, ev);
> +	ret = allocate_event_notification_slot(p, ev, restore_id);
>  	if (ret) {
>  		pr_warn("Signal event wasn't created because out of kernel memory\n");
>  		return ret;
> @@ -209,16 +214,22 @@ static int create_signal_event(struct file *devkfd,
>  	return 0;
>  }
>  
> -static int create_other_event(struct kfd_process *p, struct kfd_event *ev)
> +static int create_other_event(struct kfd_process *p, struct kfd_event *ev, const int *restore_id)
>  {
> -	/* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an
> -	 * intentional integer overflow to -1 without a compiler
> -	 * warning. idr_alloc treats a negative value as "maximum
> -	 * signed integer".
> -	 */
> -	int id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID,
> -			   (uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1,
> -			   GFP_KERNEL);
> +	int id;
> +
> +	if (restore_id)
> +		id = idr_alloc(&p->event_idr, ev, *restore_id, *restore_id + 1,
> +			GFP_KERNEL);
> +	else
> +		/* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an
> +		 * intentional integer overflow to -1 without a compiler
> +		 * warning. idr_alloc treats a negative value as "maximum
> +		 * signed integer".
> +		 */
> +		id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID,
> +				(uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1,
> +				GFP_KERNEL);
>  
>  	if (id < 0)
>  		return id;
> @@ -295,8 +306,8 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev)
>  	return ev->type == KFD_EVENT_TYPE_SIGNAL;
>  }
>  
> -int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
> -		       uint64_t size)
> +static int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
> +		       uint64_t size, uint64_t user_handle)
>  {
>  	struct kfd_signal_page *page;
>  
> @@ -315,10 +326,55 @@ int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
>  
>  	p->signal_page = page;
>  	p->signal_mapped_size = size;
> -
> +	p->signal_page->user_handle = user_handle;
>  	return 0;
>  }
>  
> +/*
> + * Map the buffer object named by @event_page_offset into the kernel and
> + * install it as the signal page of process @p.
> + *
> + * @event_page_offset encodes both a GPU id (GET_GPU_ID) and a buffer
> + * handle (GET_IDR_HANDLE). The same value is passed on to
> + * kfd_event_page_set() as the user handle.
> + *
> + * Returns 0 on success; -EINVAL if the process already has a signal page,
> + * the device cannot be found or the handle does not translate to a buffer;
> + * otherwise the error from binding the process to the device, from mapping
> + * the buffer to the kernel, or from kfd_event_page_set().
> + *
> + * NOTE(review): kfd_ioctl_create_event() calls this with p->mutex held;
> + * whether the CRIU restore path holds the same lock is not visible here -
> + * confirm.
> + */
> +int kfd_kmap_event_page(struct kfd_process *p, uint64_t event_page_offset)

This function should be static. I also think that this function and
criu_dump/restore_events could be moved into kfd_events.c.

Regards,
  Felix


> +{
> +	struct kfd_dev *kfd;
> +	struct kfd_process_device *pdd;
> +	void *mem, *kern_addr;
> +	uint64_t size;
> +	int err = 0;
> +
> +	if (p->signal_page) {
> +		pr_err("Event page is already set\n");
> +		return -EINVAL;
> +	}
> +
> +	kfd = kfd_device_by_id(GET_GPU_ID(event_page_offset));
> +	if (!kfd) {
> +		pr_err("Getting device by id failed in %s\n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	pdd = kfd_bind_process_to_device(kfd, p);
> +	if (IS_ERR(pdd))
> +		return PTR_ERR(pdd);
> +
> +	mem = kfd_process_device_translate_handle(pdd,
> +			GET_IDR_HANDLE(event_page_offset));
> +	if (!mem) {
> +		pr_err("Can't find BO, offset is 0x%llx\n", event_page_offset);
> +		return -EINVAL;
> +	}
> +
> +	err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(kfd->kgd,
> +					mem, &kern_addr, &size);
> +	if (err) {
> +		pr_err("Failed to map event page to kernel\n");
> +		return err;
> +	}
> +
> +	err = kfd_event_page_set(p, kern_addr, size, event_page_offset);
> +	if (err) {
> +		pr_err("Failed to set event page\n");
> +		return err;
> +	}
> +	return err;
> +}
> +
>  int kfd_event_create(struct file *devkfd, struct kfd_process *p,
>  		     uint32_t event_type, bool auto_reset, uint32_t node_id,
>  		     uint32_t *event_id, uint32_t *event_trigger_data,
> @@ -343,14 +399,14 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
>  	switch (event_type) {
>  	case KFD_EVENT_TYPE_SIGNAL:
>  	case KFD_EVENT_TYPE_DEBUG:
> -		ret = create_signal_event(devkfd, p, ev);
> +		ret = create_signal_event(devkfd, p, ev, NULL);
>  		if (!ret) {
>  			*event_page_offset = KFD_MMAP_TYPE_EVENTS;
>  			*event_slot_index = ev->event_id;
>  		}
>  		break;
>  	default:
> -		ret = create_other_event(p, ev);
> +		ret = create_other_event(p, ev, NULL);
>  		break;
>  	}
>  
> @@ -366,6 +422,147 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
>  	return ret;
>  }
>  
> +/*
> + * Re-create one event of process @p from CRIU data.
> + *
> + * @ev_bucket describes the event; its private data is located at
> + * @priv_datas + ev_bucket->priv_data_offset. If the private data carries a
> + * non-zero user_handle, the signal page is mapped first through
> + * kfd_kmap_event_page(). The event is then created with the same event id
> + * it had when it was dumped.
> + *
> + * Returns 0 on success, -ENOMEM if the event cannot be allocated, -EINVAL
> + * for an event type that cannot be restored, or the error from mapping the
> + * signal page or from creating the event.
> + */
> +int kfd_event_restore(struct file *devkfd, struct kfd_process *p,
> +		      struct kfd_criu_event_bucket *ev_bucket,
> +		      uint8_t *priv_datas)
> +{
> +	int ret = 0;
> +	int restore_id;
> +	struct kfd_criu_event_priv_data *ev_priv;
> +	struct kfd_event *ev;
> +
> +	ev_priv = (struct kfd_criu_event_priv_data *)(priv_datas + ev_bucket->priv_data_offset);
> +
> +	if (ev_priv->user_handle) {
> +		ret = kfd_kmap_event_page(p, ev_priv->user_handle);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
> +	if (!ev)
> +		return -ENOMEM;
> +
> +	ev->type = ev_priv->type;
> +	ev->auto_reset = ev_priv->auto_reset;
> +	ev->signaled = ev_priv->signaled;
> +
> +	/* The create helpers take a pointer to int; event_id is stored as uint32_t */
> +	restore_id = ev_priv->event_id;
> +
> +	init_waitqueue_head(&ev->wq);
> +
> +	mutex_lock(&p->event_mutex);
> +	switch (ev->type) {
> +	case KFD_EVENT_TYPE_SIGNAL:
> +	case KFD_EVENT_TYPE_DEBUG:
> +		ret = create_signal_event(devkfd, p, ev, &restore_id);
> +		break;
> +	case KFD_EVENT_TYPE_MEMORY:
> +		memcpy(&ev->memory_exception_data,
> +			&ev_priv->memory_exception_data,
> +			sizeof(struct kfd_hsa_memory_exception_data));
> +
> +		/* The gpu_id comes from the bucket, not from the private data */
> +		ev->memory_exception_data.gpu_id = ev_bucket->gpu_id;
> +		ret = create_other_event(p, ev, &restore_id);
> +		break;
> +	case KFD_EVENT_TYPE_HW_EXCEPTION:
> +		memcpy(&ev->hw_exception_data,
> +			&ev_priv->hw_exception_data,
> +			sizeof(struct kfd_hsa_hw_exception_data));
> +
> +		ev->hw_exception_data.gpu_id = ev_bucket->gpu_id;
> +		ret = create_other_event(p, ev, &restore_id);
> +		break;
> +	default:
> +		/*
> +		 * Without this case an unknown type would leak ev and report
> +		 * success although no event was created.
> +		 */
> +		pr_err("Invalid event type (%u) in CRIU data\n", ev_priv->type);
> +		ret = -EINVAL;
> +		break;
> +	}
> +
> +	if (ret)
> +		kfree(ev);
> +
> +	mutex_unlock(&p->event_mutex);
> +
> +	return ret;
> +}
> +
> +/*
> + * Fill @ev_buckets with a description of every event of process @p.
> + *
> + * @ev_buckets must have room for @num_events buckets immediately followed
> + * by @num_events struct kfd_criu_event_priv_data records; record i belongs
> + * to bucket i.
> + *
> + * Returns 0 on success or -ENOMEM if the process has more than @num_events
> + * events.
> + */
> +int kfd_event_dump(struct kfd_process *p,
> +		   struct kfd_criu_event_bucket *ev_buckets,
> +		   uint32_t num_events)
> +{
> +	/* The private data records start right behind the last bucket */
> +	struct kfd_criu_event_priv_data *priv_base =
> +		(struct kfd_criu_event_priv_data *)(ev_buckets + num_events);
> +	struct kfd_event *ev;
> +	uint32_t ev_id;
> +	int idx = 0;
> +
> +	idr_for_each_entry(&p->event_idr, ev, ev_id) {
> +		struct kfd_criu_event_bucket *bucket;
> +		struct kfd_criu_event_priv_data *priv;
> +
> +		if (idx >= num_events) {
> +			pr_err("Number of events exceeds number allocated\n");
> +			return -ENOMEM;
> +		}
> +
> +		bucket = &ev_buckets[idx];
> +		priv = &priv_base[idx];
> +
> +		/* Every record has the same size today, but the bucket format
> +		 * allows per-event sizes, so offset and size are stored explicitly
> +		 */
> +		bucket->priv_data_size = sizeof(*priv);
> +		bucket->priv_data_offset = idx * sizeof(*priv);
> +
> +		/* The signal page handle travels with the first event only */
> +		if (idx == 0 && p->signal_page)
> +			priv->user_handle = p->signal_page->user_handle;
> +
> +		priv->type = ev->type;
> +		priv->event_id = ev->event_id;
> +		priv->auto_reset = ev->auto_reset;
> +		priv->signaled = ev->signaled;
> +
> +		/* The gpu_id is duplicated into the bucket so that the userspace
> +		 * CRIU plugin can modify it if needed.
> +		 */
> +		switch (priv->type) {
> +		case KFD_EVENT_TYPE_MEMORY:
> +			memcpy(&priv->memory_exception_data,
> +				&ev->memory_exception_data,
> +				sizeof(struct kfd_hsa_memory_exception_data));
> +
> +			bucket->gpu_id = priv->memory_exception_data.gpu_id;
> +			break;
> +		case KFD_EVENT_TYPE_HW_EXCEPTION:
> +			memcpy(&priv->hw_exception_data,
> +				&ev->hw_exception_data,
> +				sizeof(struct kfd_hsa_hw_exception_data));
> +
> +			bucket->gpu_id = priv->hw_exception_data.gpu_id;
> +			break;
> +		default:
> +			bucket->gpu_id = 0;
> +			break;
> +		}
> +
> +		pr_debug("Dumped event[%d] id = 0x%08x auto_reset = %x type = %x signaled = %x\n",
> +			  idx,
> +			  priv->event_id,
> +			  priv->auto_reset,
> +			  priv->type,
> +			  priv->signaled);
> +		idx++;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * Count the events currently registered in the event IDR of process @p.
> + *
> + * Returns the number of events.
> + */
> +int kfd_get_num_events(struct kfd_process *p)
> +{
> +	struct kfd_event *ev;
> +	uint32_t id;
> +	u32 num_events = 0;
> +
> +	idr_for_each_entry(&p->event_idr, ev, id)
> +		num_events++;
> +
> +	/* Plain return: a post-increment here would only be a dead store */
> +	return num_events;
> +}
> +
>  /* Assumes that p is current. */
>  int kfd_event_destroy(struct kfd_process *p, uint32_t event_id)
>  {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 7ed6f831109d..bf10a5305ef7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1069,9 +1069,26 @@ struct kfd_criu_queue_priv_data {
>  };
>  
>  struct kfd_criu_event_priv_data {
> -	uint64_t reserved;
> +	uint64_t user_handle;
> +	uint32_t event_id;
> +	uint32_t auto_reset;
> +	uint32_t type;
> +	uint32_t signaled;
> +
> +	union {
> +		struct kfd_hsa_memory_exception_data memory_exception_data;
> +		struct kfd_hsa_hw_exception_data hw_exception_data;
> +	};
>  };
>  
> +int kfd_event_restore(struct file *devkfd, struct kfd_process *p,
> +		      struct kfd_criu_event_bucket *ev_bucket,
> +		      uint8_t *priv_datas);
> +
> +int kfd_event_dump(struct kfd_process *p,
> +		   struct kfd_criu_event_bucket *ev_buckets,
> +		   uint32_t num_events);
> +
>  /* CRIU - End */
>  
>  /* Queue Context Management */
> @@ -1238,12 +1255,14 @@ void kfd_signal_iommu_event(struct kfd_dev *dev,
>  void kfd_signal_hw_exception_event(u32 pasid);
>  int kfd_set_event(struct kfd_process *p, uint32_t event_id);
>  int kfd_reset_event(struct kfd_process *p, uint32_t event_id);
> -int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
> -		       uint64_t size);
> +int kfd_kmap_event_page(struct kfd_process *p, uint64_t event_page_offset);
> +
>  int kfd_event_create(struct file *devkfd, struct kfd_process *p,
>  		     uint32_t event_type, bool auto_reset, uint32_t node_id,
>  		     uint32_t *event_id, uint32_t *event_trigger_data,
>  		     uint64_t *event_page_offset, uint32_t *event_slot_index);
> +
> +int kfd_get_num_events(struct kfd_process *p);
>  int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
>  
>  void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,


More information about the amd-gfx mailing list