[PATCH 08/18] drm/amdkfd: CRIU Implement KFD pause ioctl

David Yat Sin david.yatsin at amd.com
Thu Aug 19 13:37:03 UTC 2021


Introducing pause IOCTL. The CRIU amdgpu plugin is needs
to call AMDKFD_IOC_CRIU_PAUSE(pause = 1) before starting dump and
AMDKFD_IOC_CRIU_PAUSE(pause = 0) when dump is complete. This ensures
that the queues are not modified between each CRIU dump ioctl.

Signed-off-by: David Yat Sin <david.yatsin at amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 23 +++++++++++++++++++++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  1 +
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f0c278e7d7e0..24e5c53261f5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1984,6 +1984,14 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
 		goto err_unlock;
 	}
 
+	/* Confirm all process queues are evicted */
+	if (!p->queues_paused) {
+		pr_err("Cannot dump process when queues are not in evicted state\n");
+		/* CRIU plugin did not call AMDKFD_IOC_CRIU_PAUSE before dumping */
+		ret = -EINVAL;
+		goto err_unlock;
+	}
+
 	switch (args->type) {
 	case KFD_CRIU_OBJECT_TYPE_PROCESS:
 		ret = criu_dump_process(p, args);
@@ -2306,9 +2314,20 @@ static int kfd_ioctl_criu_restorer(struct file *filep,
 
 static int kfd_ioctl_criu_pause(struct file *filep, struct kfd_process *p, void *data)
 {
-	pr_debug("Inside %s\n", __func__);
+	int ret;
+	struct kfd_ioctl_criu_pause_args *args = data;
 
-	return 0;
+	if (args->pause)
+		ret = kfd_process_evict_queues(p);
+	else
+		ret = kfd_process_restore_queues(p);
+
+	if (ret)
+		pr_err("Failed to %s queues ret:%d\n", args->pause ? "evict" : "restore", ret);
+	else
+		p->queues_paused = !!(args->pause);
+
+	return ret;
 }
 
 static int kfd_ioctl_criu_resume(struct file *filep,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 719982605587..0b8165729cde 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -857,6 +857,9 @@ struct kfd_process {
 	bool svm_disabled;
 
 	bool xnack_enabled;
+
+	/* Queues are in paused stated because we are in the process of doing a CRIU checkpoint */
+	bool queues_paused;
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index bbf21395fb06..e4cb2f778590 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1268,6 +1268,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	process->lead_thread = thread->group_leader;
 	process->n_pdds = 0;
 	process->svm_disabled = false;
+	process->queues_paused = false;
 	INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
 	INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
 	process->last_restore_timestamp = get_jiffies_64();
-- 
2.17.1



More information about the amd-gfx mailing list