[RFC PATCH 12/17] drm/amdkfd: CRIU restore CU mask for queues
Felix Kuehling
Felix.Kuehling at amd.com
Sat May 1 01:57:47 UTC 2021
From: David Yat Sin <david.yatsin at amd.com>
When re-creating queues during CRIU restore, restore the queue
with the same CU mask value used during CRIU dump.
Signed-off-by: David Yat Sin <david.yatsin at amd.com>
Change-Id: I1822fa2488f90365dfe7a3c5971a2ed6c0046b4a
---
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 160 +++++++++++++++++++++--
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +
2 files changed, 154 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 130ab100abb2..1d2c2d986a44 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1890,9 +1890,21 @@ static int kfd_devinfo_restore(struct kfd_process *p, struct kfd_criu_devinfo_bu
}
return 0;
}
-static void criu_dump_queue(struct kfd_process_device *pdd,
+
+static int get_queue_data_sizes(struct kfd_process_device *pdd,
+ struct queue *q,
+ uint32_t *cu_mask_size)
+{
+ int ret = 0;
+
+ *cu_mask_size = sizeof(uint32_t) * (q->properties.cu_mask_count / 32);
+ return ret;
+}
+
+static int criu_dump_queue(struct kfd_process_device *pdd,
struct queue *q,
- struct kfd_criu_q_bucket *q_bucket)
+ struct kfd_criu_q_bucket *q_bucket,
+ struct queue_restore_data *qrd)
{
q_bucket->gpu_id = pdd->dev->id;
q_bucket->type = q->properties.type;
@@ -1920,18 +1932,30 @@ static void criu_dump_queue(struct kfd_process_device *pdd,
q->properties.ctx_save_restore_area_size;
q_bucket->ctl_stack_size = q->properties.ctl_stack_size;
+ if (qrd->cu_mask_size)
+ memcpy(qrd->cu_mask, q->properties.cu_mask, qrd->cu_mask_size);
+
+ q_bucket->cu_mask_size = qrd->cu_mask_size;
+ return 0;
}
static int criu_dump_queues_device(struct kfd_process_device *pdd,
unsigned *q_index,
unsigned int max_num_queues,
- struct kfd_criu_q_bucket *user_buckets)
+ struct kfd_criu_q_bucket *user_buckets,
+ uint8_t *queues_data,
+ uint32_t *queues_data_offset,
+ uint32_t queues_data_size)
{
struct queue *q;
+ struct queue_restore_data qrd;
struct kfd_criu_q_bucket q_bucket;
+ uint8_t *data_ptr = NULL;
+ unsigned int data_ptr_size = 0;
int ret = 0;
list_for_each_entry(q, &pdd->qpd.queues_list, list) {
+ unsigned int q_data_size;
if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE &&
q->properties.type != KFD_QUEUE_TYPE_SDMA &&
q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI) {
@@ -1949,7 +1973,49 @@ static int criu_dump_queues_device(struct kfd_process_device *pdd,
}
memset(&q_bucket, 0, sizeof(q_bucket));
- criu_dump_queue(pdd, q, &q_bucket);
+ memset(&qrd, 0, sizeof(qrd));
+
+ ret = get_queue_data_sizes(pdd, q, &qrd.cu_mask_size);
+ if (ret) {
+ pr_err("Failed to get queue dump info (%d)\n", ret);
+ ret = -EFAULT;
+ break;
+ }
+
+ q_data_size = qrd.cu_mask_size;
+
+ /* Increase local buffer space if needed */
+ if (data_ptr_size < q_data_size) {
+ if (data_ptr)
+ kfree(data_ptr);
+
+ data_ptr = (uint8_t*)kzalloc(q_data_size, GFP_KERNEL);
+ if (!data_ptr) {
+ ret = -ENOMEM;
+ break;
+ }
+ data_ptr_size = q_data_size;
+ }
+
+ qrd.cu_mask = data_ptr;
+ ret = criu_dump_queue(pdd, q, &q_bucket, &qrd);
+ if (ret)
+ break;
+
+ if (*queues_data_offset + q_data_size > queues_data_size) {
+ pr_err("Required memory exceeds user provided\n");
+ ret = -ENOSPC;
+ break;
+ }
+ ret = copy_to_user((void __user *) (queues_data + *queues_data_offset),
+ data_ptr, q_data_size);
+ if (ret) {
+ ret = -EFAULT;
+ break;
+ }
+ q_bucket.queues_data_offset = *queues_data_offset;
+ *queues_data_offset += q_data_size;
+
ret = copy_to_user((void __user *)&user_buckets[*q_index],
&q_bucket, sizeof(q_bucket));
if (ret) {
@@ -1959,6 +2025,10 @@ static int criu_dump_queues_device(struct kfd_process_device *pdd,
}
*q_index = *q_index + 1;
}
+
+ if (data_ptr)
+ kfree(data_ptr);
+
return ret;
}
@@ -1976,6 +2046,8 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
struct kfd_criu_q_bucket *user_buckets =
(struct kfd_criu_q_bucket*) args->kfd_criu_q_buckets_ptr;
+ uint8_t *queues_data = (uint8_t*)args->queues_data_ptr;
+ uint32_t queues_data_offset = 0;
pr_info("Inside %s\n",__func__);
@@ -2076,7 +2148,10 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
}
ret = criu_dump_queues_device(pdd, &q_index,
- args->num_of_queues, user_buckets);
+ args->num_of_queues, user_buckets,
+ queues_data, &queues_data_offset,
+ args->queues_data_size);
+
if (ret)
goto err_unlock;
}
@@ -2098,8 +2173,9 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
return ret;
}
-static void set_queue_properties_from_criu(struct queue_properties *qp,
- struct kfd_criu_q_bucket *q_bucket)
+static int set_queue_properties_from_criu(struct queue_properties *qp,
+ struct kfd_criu_q_bucket *q_bucket,
+ struct queue_restore_data *qrd)
{
qp->is_interop = false;
qp->is_gws = q_bucket->is_gws;
@@ -2116,6 +2192,19 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
qp->ctl_stack_size = q_bucket->ctl_stack_size;
qp->type = q_bucket->type;
qp->format = q_bucket->format;
+
+ if (qrd->cu_mask_size) {
+ qp->cu_mask = kzalloc(qrd->cu_mask_size, GFP_KERNEL);
+ if (!qp->cu_mask) {
+ pr_err("Failed to allocate memory for CU mask\n");
+ return -ENOMEM;
+ }
+
+ memcpy(qp->cu_mask, qrd->cu_mask, qrd->cu_mask_size);
+ qp->cu_mask_count = (qrd->cu_mask_size / sizeof(uint32_t)) * 32;
+ }
+
+ return 0;
}
/* criu_restore_queue runs with the process mutex locked */
@@ -2148,7 +2237,10 @@ int criu_restore_queue(struct kfd_process *p,
q_bucket->q_address);
memset(&qp, 0, sizeof(qp));
- set_queue_properties_from_criu(&qp, q_bucket);
+ ret = set_queue_properties_from_criu(&qp, q_bucket, qrd);
+ if (ret)
+ goto err_create_queue;
+
print_queue_properties(&qp);
qrd->qid = q_bucket->q_id;
@@ -2158,12 +2250,18 @@ int criu_restore_queue(struct kfd_process *p,
ret = pqm_create_queue(&p->pqm, dev, NULL, &qp, &queue_id, qrd, NULL);
if (ret) {
pr_err("Failed to create new queue err:%d\n", ret);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err_create_queue;
}
pr_debug("Queue id %d was restored successfully\n", queue_id);
return 0;
+err_create_queue:
+ if (qp.cu_mask)
+ kfree(qp.cu_mask);
+
+ return ret;
}
/* criu_restore_queues runs with the process mutex locked */
@@ -2176,6 +2274,11 @@ static int criu_restore_queues(struct kfd_process *p,
int ret;
struct kfd_criu_q_bucket *user_buckets =
(struct kfd_criu_q_bucket*) args->kfd_criu_q_buckets_ptr;
+
+ uint8_t *queues_data = (uint8_t*)args->queues_data_ptr;
+ uint8_t *data_ptr = NULL;
+ uint32_t data_ptr_size = 0;
+
/*
* This process will not have any queues at this point, but we are
* setting all the dqm's for this process to evicted state.
@@ -2185,6 +2288,7 @@ static int criu_restore_queues(struct kfd_process *p,
for (i = 0; i < args->num_of_queues; i++) {
struct kfd_criu_q_bucket q_bucket;
struct queue_restore_data qrd;
+ uint32_t q_data_size;
memset(&qrd, 0, sizeof(qrd));
@@ -2212,12 +2316,42 @@ static int criu_restore_queues(struct kfd_process *p,
ret = -EFAULT;
return ret;
}
+
+ q_data_size = q_bucket.cu_mask_size;
+
+ /* Increase local buffer space if needed */
+ if (q_data_size > data_ptr_size) {
+ if (data_ptr)
+ kfree(data_ptr);
+
+ data_ptr = (uint8_t*)kzalloc(q_data_size, GFP_KERNEL);
+ if (!data_ptr) {
+ ret = -ENOMEM;
+ break;
+ }
+ data_ptr_size = q_data_size;
+ }
+
+ ret = copy_from_user(data_ptr,
+ (void __user *) queues_data + q_bucket.queues_data_offset,
+ q_data_size);
+ if (ret) {
+ ret = -EFAULT;
+ break;
+ }
+
+ qrd.cu_mask_size = q_bucket.cu_mask_size;
+ qrd.cu_mask = data_ptr;
+
ret = criu_restore_queue(p, dev, pdd, &q_bucket, &qrd);
if (ret) {
pr_err("Failed to restore queue (%d)\n", ret);
break;
}
}
+
+ if (data_ptr)
+ kfree(data_ptr);
return ret;
}
@@ -2507,6 +2641,7 @@ static int kfd_ioctl_criu_helper(struct file *filep,
struct kfd_process *p, void *data)
{
struct kfd_ioctl_criu_helper_args *args = data;
+ u32 queues_data_size = 0;
struct kgd_mem *kgd_mem;
struct queue *q;
u64 num_of_bos = 0;
@@ -2544,6 +2679,12 @@ static int kfd_ioctl_criu_helper(struct file *filep,
q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+ u32 cu_mask_size = 0;
+ ret = get_queue_data_sizes(pdd, q, &cu_mask_size);
+ if (ret)
+ goto err_unlock;
+
+ queues_data_size += cu_mask_size;
q_index++;
} else {
pr_err("Unsupported queue type (%d)\n", q->properties.type);
@@ -2558,6 +2699,7 @@ static int kfd_ioctl_criu_helper(struct file *filep,
args->num_of_devices = p->n_pdds;
args->num_of_bos = num_of_bos;
args->num_of_queues = q_index;
+ args->queues_data_size = queues_data_size;
dev_dbg(kfd_device, "Num of bos = %llu\n", num_of_bos);
err_unlock:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a4a0713ca9a3..55180a6fe102 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -473,6 +473,9 @@ struct queue_restore_data {
uint32_t qid;
uint32_t sdma_id;
uint32_t doorbell_id;
+
+ void *cu_mask;
+ uint32_t cu_mask_size;
};
struct queue_properties {
--
2.17.1
More information about the amd-gfx
mailing list