[PATCH v3 07/16] drm/amdkfd: CRIU Implement KFD pause ioctl
David Yat Sin
david.yatsin at amd.com
Wed Sep 29 12:00:29 UTC 2021
Introducing pause IOCTL. The CRIU amdgpu plugin needs
to call AMDKFD_IOC_CRIU_PAUSE(pause = 1) before starting dump and
AMDKFD_IOC_CRIU_PAUSE(pause = 0) when dump is complete. This ensures
that the queues are not modified between each CRIU dump ioctl.
Signed-off-by: David Yat Sin <david.yatsin at amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 23 ++++++++++++++++++++++-
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +++
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 1 +
3 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 668772a67f7a..791cb1555413 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2027,6 +2027,14 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
goto err_unlock;
}
+ /* Confirm all process queues are evicted */
+ if (!p->queues_paused) {
+ pr_err("Cannot dump process when queues are not in evicted state\n");
+ /* CRIU plugin did not call AMDKFD_IOC_CRIU_PAUSE before dumping */
+ ret = -EINVAL;
+ goto err_unlock;
+ }
+
switch (args->type) {
case KFD_CRIU_OBJECT_TYPE_PROCESS:
ret = criu_dump_process(p, args);
@@ -2363,7 +2371,20 @@ static int kfd_ioctl_criu_restorer(struct file *filep,
static int kfd_ioctl_criu_pause(struct file *filep, struct kfd_process *p, void *data)
{
- return 0;
+ int ret;
+ struct kfd_ioctl_criu_pause_args *args = data;
+
+ if (args->pause)
+ ret = kfd_process_evict_queues(p);
+ else
+ ret = kfd_process_restore_queues(p);
+
+ if (ret)
+ pr_err("Failed to %s queues ret:%d\n", args->pause ? "evict" : "restore", ret);
+ else
+ p->queues_paused = !!(args->pause);
+
+ return ret;
}
static int kfd_ioctl_criu_resume(struct file *filep,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 881af8e1b06c..e0601bfbcbf2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -868,6 +868,9 @@ struct kfd_process {
struct svm_range_list svms;
bool xnack_enabled;
+
+ /* Queues are in paused state because we are in the process of doing a CRIU checkpoint */
+ bool queues_paused;
};
#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 65a389fb97ce..0f7c4c63ee99 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1314,6 +1314,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
process->mm = thread->mm;
process->lead_thread = thread->group_leader;
process->n_pdds = 0;
+ process->queues_paused = false;
INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
process->last_restore_timestamp = get_jiffies_64();
--
2.17.1
More information about the amd-gfx
mailing list