[PATCH] drm/amd: Enable checkpoint and restore of VRAM Bos with no VA
Christian König
ckoenig.leichtzumerken at gmail.com
Thu Nov 16 11:11:19 UTC 2023
Am 16.11.23 um 03:47 schrieb Ramesh Errabolu:
> Tag VRAM BOs that do not have a VA with a unique Id, a 128-bit
> UUID. This unique Id is used to distinguish BOs that might
> otherwise be of same size. Checkpoint and restore assumes
> that these BOs are not imported into a DRM device that is
> accessible either from current process or its parent or
> child process
>
> Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/Makefile | 3 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 3 +-
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 29 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c | 190 ++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h | 103 ++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 17 ++
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 ++-
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +-
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +-
> 9 files changed, 370 insertions(+), 10 deletions(-)
> create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c
> create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 260e32ef7bae..851e2c4db372 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -270,7 +270,8 @@ amdgpu-y += \
> amdgpu_amdkfd_gc_9_4_3.o \
> amdgpu_amdkfd_gfx_v10.o \
> amdgpu_amdkfd_gfx_v10_3.o \
> - amdgpu_amdkfd_gfx_v11.o
> + amdgpu_amdkfd_gfx_v11.o \
> + amdgpu_criu.o
>
> ifneq ($(CONFIG_DRM_AMDGPU_CIK),)
> amdgpu-y += amdgpu_amdkfd_gfx_v7.o
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index fcf8a98ad15e..6c0d7e6a66cd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -289,7 +289,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
> int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
> struct amdgpu_device *adev, uint64_t va, uint64_t size,
> void *drm_priv, struct kgd_mem **mem,
> - uint64_t *offset, uint32_t flags, bool criu_resume);
> + uint64_t *offset, uint32_t flags,
> + bool criu_resume, uuid_t *uuid);
> int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
> struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv,
> uint64_t *size);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 2e302956a279..b139ffd519e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -25,6 +25,7 @@
> #include <linux/pagemap.h>
> #include <linux/sched/mm.h>
> #include <linux/sched/task.h>
> +#include <linux/uuid.h>
> #include <drm/ttm/ttm_tt.h>
>
> #include "amdgpu_object.h"
> @@ -35,6 +36,7 @@
> #include "amdgpu_dma_buf.h"
> #include <uapi/linux/kfd_ioctl.h>
> #include "amdgpu_xgmi.h"
> +#include "amdgpu_criu.h"
> #include "kfd_priv.h"
> #include "kfd_smi_events.h"
>
> @@ -1718,7 +1720,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
> int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
> struct amdgpu_device *adev, uint64_t va, uint64_t size,
> void *drm_priv, struct kgd_mem **mem,
> - uint64_t *offset, uint32_t flags, bool criu_resume)
> + uint64_t *offset, uint32_t flags,
> + bool criu_resume, uuid_t *uuid)
> {
> struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
> struct amdgpu_fpriv *fpriv = container_of(avm, struct amdgpu_fpriv, vm);
> @@ -1814,13 +1817,23 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
> va, (*mem)->aql_queue ? size << 1 : size,
> domain_string(alloc_domain), xcp_id);
>
> - ret = amdgpu_gem_object_create(adev, aligned_size, 1, alloc_domain, alloc_flags,
> - bo_type, NULL, &gobj, xcp_id + 1);
> + /* Construction of VRAM BO one with no VA, during CRIU Restore
> + * should consult BO table. Will return either a previously
> + * constructed BO or will construct a BO anew
> + */
> + if (criu_resume && (va == 0) && (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM))
> + ret = restore_vram_bo(adev, aligned_size, 1, alloc_flags, uuid,
> + &gobj, xcp_id + 1);
> + else
> + ret = amdgpu_gem_object_create(adev, aligned_size, 1, alloc_domain,
> + alloc_flags, bo_type, NULL, &gobj, xcp_id + 1);
> +
> if (ret) {
> pr_debug("Failed to create BO on domain %s. ret %d\n",
> domain_string(alloc_domain), ret);
> goto err_bo_create;
> }
> +
> ret = drm_vma_node_allow(&gobj->vma_node, drm_priv);
> if (ret) {
> pr_debug("Failed to allow vma node access. ret %d\n", ret);
> @@ -1843,6 +1856,16 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>
> add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr);
>
> + /* Initialize the UUID field of a BO that:
> + * - Represents a VRAM BO
> + * - Does not have a VA bound
> + * - Is allocated outside CRIU Resume procedure
> + */
> + if (!criu_resume && (va == 0) && (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)) {
> + if (uuid_is_null(&bo->uuid))
> + uuid_gen(&bo->uuid);
> + }
> +
> if (user_addr) {
> pr_debug("creating userptr BO for user_addr = %llx\n", user_addr);
> ret = init_user_pages(*mem, user_addr, criu_resume);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c
> new file mode 100644
> index 000000000000..4b43a3df6913
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c
> @@ -0,0 +1,190 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include "amdgpu_criu.h"
> +
> +/*
> + * Hash table to host BOs that have their unique IDs initialized
> + * The table comes into play during CRIU Restore procedure
> + *
> + * @note: Currently these BOs encapsulate device memory i.e. are
> + * VRAM BOs
> + */
> +DECLARE_HASHTABLE(criu_bo_table, CRIU_BO_TABLE_SIZE);
> +static DEFINE_MUTEX(criu_mutex);
> +
> +/* Global counter to track life of Hash table */
> +atomic_t criu_bo_counter = ATOMIC_INIT(0);
> +
> +void print_uuid(uuid_t *uuid)
All non-static functions should have an amdgpu_ prefix.
> +{
> + pr_err("\n");
> + for (int idx = 0; idx < 16; idx++)
> + pr_err("Idx[%d] %d\n", idx, uuid->b[idx]);
> + pr_err("\n");
> +}
> +
> +void print_uuid_compare(uuid_t *uuid1, uuid_t *uuid2)
> +{
> + pr_err("\n");
> + for (int idx = 0; idx < 16; idx++)
> + pr_err("Idx[%d] %d, %d\n", idx, uuid1->b[idx], uuid2->b[idx]);
> + pr_err("\n");
> +}
> +
> +void inc_table_counter(uint32_t cntr)
> +{
> + int init;
> +
> + mutex_lock(&criu_mutex);
> + init = atomic_read(&criu_bo_counter);
> + if (init == 0x00) {
> + pr_debug("%s(), Invoking hash_init api\n", __func__);
> + hash_init(criu_bo_table);
> + }
> +
> + atomic_add(cntr, &criu_bo_counter);
> + init = atomic_read(&criu_bo_counter);
> + mutex_unlock(&criu_mutex);
> +}
> +
> +static void free_bo_table(void)
> +{
> + struct criu_bo_uuid *bo_uuid = NULL;
> + uint32_t bkt;
> +
> + hash_for_each_rcu(criu_bo_table, bkt, bo_uuid, node)
> + hash_del_rcu(&bo_uuid->node);
> +}
> +
> +void dec_table_counter(uint32_t cntr)
> +{
> + uint32_t deinit;
> +
> + mutex_lock(&criu_mutex);
> + atomic_sub(cntr, &criu_bo_counter);
> + deinit = atomic_read(&criu_bo_counter);
> + if (deinit == 0x00) {
> + pr_debug("%s(), Invoking free_bo_table api\n", __func__);
> + free_bo_table();
> + }
> +
> + if (deinit < 0)
> + pr_err("%s(), BO Table counter is inconsistent: %d\n", __func__, deinit);
> +
> + mutex_unlock(&criu_mutex);
> +}
> +
> +uint32_t query_table_counter(void)
> +{
> + uint32_t cntr;
> +
> + mutex_lock(&criu_mutex);
> + cntr = atomic_read(&criu_bo_counter);
> + mutex_unlock(&criu_mutex);
> + return cntr;
> +}
> +
> +/* Determine if BO is present in Hash table */
> +static void add_bo_uuid(struct criu_bo_uuid *bo_uuid)
> +{
> + mutex_lock(&criu_mutex);
> + hash_add_rcu(criu_bo_table, &bo_uuid->node, (uintptr_t)bo_uuid->uuid);
> + mutex_unlock(&criu_mutex);
> +}
> +
> +/* Determine if BO is present in Hash table
> + *
> + * @note: Does the look up object based on value of key
> + * and not just its integer value
> + */
> +static struct criu_bo_uuid *get_bo_uuid(uuid_t *uuid)
> +{
> + struct criu_bo_uuid *bo_uuid = NULL;
> + uint32_t bkt;
> +
> + mutex_lock(&criu_mutex);
> + hash_for_each_rcu(criu_bo_table, bkt, bo_uuid, node)
> + if (uuid_equal(uuid, bo_uuid->uuid))
> + goto ret_abo;
> +
> +ret_abo:
> + mutex_unlock(&criu_mutex);
> + return bo_uuid;
> +}
> +
> +int restore_vram_bo(struct amdgpu_device *adev,
> + unsigned long size, int align, u64 flags,
> + uuid_t *uuid, struct drm_gem_object **gobj, int8_t xcp_id_plus1)
> +{
> + enum ttm_bo_type bo_type = ttm_bo_type_device;
> + u32 domain = AMDGPU_GEM_DOMAIN_VRAM;
> + struct criu_bo_uuid *bo_uuid;
> + struct amdgpu_bo *abo;
> + int ret;
> +
> + /* Determine if VRAM was built originally for exporting it
> + * to peers. Currently the only VRAM BOs that are exportable
> + * are those that do not have a VA attached
> + */
> + if (unlikely(uuid == NULL)) {
> + pr_err("A NULL UUID is Illegal for VRAM BOs without a VA\n");
> + return -EINVAL;
> + }
> +
> + /* Determine if BO is already present in hash table */
> + bo_uuid = get_bo_uuid(uuid);
> +
> + /* Return the BO present in table */
> + if (bo_uuid != NULL) {
> + abo = bo_uuid->abo;
> + *gobj = &(abo->tbo.base);
> + return 0;
> + }
> +
> + /* Build the BO and add it to table before returning it */
> + ret = amdgpu_gem_object_create(adev, size, align,
> + domain, flags, bo_type, NULL, gobj, xcp_id_plus1);
> + if (ret) {
> + pr_err("Failed to Restore VRAM BO, Retval: %d\n", ret);
> + return ret;
> + }
> +
> + /* Re-init uuid of BO that identifies it uniquely and
> + * add the BO into the table
> + */
> + abo = gem_to_amdgpu_bo(*gobj);
> + uuid_copy(&(abo->uuid), uuid);
> + bo_uuid = kzalloc(sizeof(*bo_uuid), GFP_KERNEL);
> + if (bo_uuid == NULL)
> + return -EINVAL;
> + bo_uuid->abo = abo;
> + bo_uuid->uuid = uuid;
> + add_bo_uuid(bo_uuid);
> +
> + /* Return the BO that was built */
> + *gobj = &(abo->tbo.base);
> + return 0;
> +}
> +
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h
> new file mode 100644
> index 000000000000..b895c698a2e0
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h
> @@ -0,0 +1,103 @@
> +/* SPDX-License-Identifier: MIT
> + *
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +#ifndef AMDGPU_CRIU_H_INCLUDED
> +#define AMDGPU_CRIU_H_INCLUDED
> +
> +#include <linux/list.h>
> +#include <linux/atomic.h>
> +#include <linux/pagemap.h>
> +#include <linux/dma-buf.h>
> +#include <linux/sched/mm.h>
> +#include <linux/hashtable.h>
> +#include <linux/sched/task.h>
> +#include <uapi/linux/kfd_ioctl.h>
> +
> +#include "amdgpu_object.h"
> +#include "amdgpu_gem.h"
> +#include "amdgpu_vm.h"
> +#include "amdgpu_amdkfd.h"
> +#include "amdgpu_dma_buf.h"
> +
> +/* Specify Hash table and its size to host VRAM BOs that have their
> + * unique IDs iniialized. These BOs which can be exported as Dmabuf
> + * allows user space to bind different virtual addresses on different
> + * DRM devices.
> + *
> + * @note: Currently these BOs encapsulate device memory i.e. are VRAM BOs
> + */
> +#define CRIU_BO_TABLE_SIZE 8
> +extern DECLARE_HASHTABLE(criu_bo_table, CRIU_BO_TABLE_SIZE);
> +
> +struct criu_bo_uuid {
> +
> + /* Unique ID of BO, serves the role of KEY */
> + uuid_t *uuid;
> +
> + /* Handle of BO, serves the role of VALUE */
> + struct amdgpu_bo *abo;
> +
> + /* Allows chaining of BO being managed by table */
> + struct hlist_node node;
> +};
> +
> +
> +/* Global counter to track life of Hash table */
> +extern atomic_t criu_bo_counter;
> +
> +/**
> + * restore_vram_bo() - Returns handle of a GEM object either by look up
> + * or by construction. Look up a Global BO table to determine if the BO
> + * of concern has already been constructed. By construction if the look
> + * up fails to find the BO in the global BO table
> + *
> + * NOTE: Following two conditions must be held TRUE when invoking this method
> + * - This method deals with VRAM BOs only. Invoking this method to handle
> + * BOs of other kinds is invalid.
> + * - This method is invoked during a CRIU Restore procedure. An Invocation
> + * outside of this scneario is invalid
> + *
> + * @adev: Handle of device to use in construction
> + * @size: BO's memory size in bytes
> + * @align: Alignment requirements, if any, in allocating memory
> + * @flags: Flags to apply in allocating memory
> + * @uuid: Handle of UUID object to be restored
> + * @gobj: Output parameter updated with handle of GEM object
> + * @xcp_id_plus1: ID of the XCD on which BO is to be created
> + *
> + * Return: ZERO if successful, a negative value in case of error
> + */
> +int restore_vram_bo(struct amdgpu_device *adev,
> + unsigned long size, int align, u64 flags,
> + uuid_t *uuid, struct drm_gem_object **gobj,
> + int8_t xcp_id_plus1);
> +
> +void print_uuid(uuid_t *uuid);
> +void print_uuid_compare(uuid_t *uuid1, uuid_t *uuid2);
> +
> +uint32_t query_table_counter(void);
> +void inc_table_counter(uint32_t cntr);
> +void dec_table_counter(uint32_t cntr);
> +
> +#endif /* AMDGPU_CRIU_H_INCLUDED */
> +
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index d28e21baef16..dc61b252fe49 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -117,6 +117,23 @@ struct amdgpu_bo {
> * for memory accounting.
> */
> int8_t xcp_id;
> +
> + /*
> + * @uuid: Unique ID of a BO that is being exported. The 128-bit ID is
> + * considered to be unique across processes and time. One use of this
> + * ID is to support CRIU operations of Checkpointing & Restore.
> + *
> + * ID is a byte array of length UUID_SIZE. This is to accommodate UUID,
> + * a 128-bit number defined by RFC 4122. Hex string form of UUID is
> + * defined as a sequence of 32 hexadecimal digits, divided into five
> + * groups that are delimited by hyphens "-". The sequence of groups
> + * from length perspective is: 8-4-4-4-12.
> + *
> + * The default value of this field is set ZEROS. It is initialized to a
> + * NON-ZERO value when a BO is exported using GEM Prime Apis. Currently
> + * the only BOs that can be exported are GTT and VRAM BOs.
> + */
> + uuid_t uuid;
Clear NAK to adding this to the amdgpu_bo structure.
This is not related to general BO handling but to a specialized use case.
What exactly are you trying to do here in the first place?
Regards,
Christian.
> };
>
> struct amdgpu_bo_user {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 06988cf1db51..310a48b627ef 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -38,6 +38,7 @@
> #include <linux/dma-buf.h>
> #include <linux/fdtable.h>
> #include <linux/processor.h>
> +#include <linux/uuid.h>
> #include "kfd_priv.h"
> #include "kfd_device_queue_manager.h"
> #include "kfd_svm.h"
> @@ -45,6 +46,7 @@
> #include "kfd_smi_events.h"
> #include "amdgpu_dma_buf.h"
> #include "kfd_debug.h"
> +#include "amdgpu_criu.h"
>
> static long kfd_ioctl(struct file *, unsigned int, unsigned long);
> static int kfd_open(struct inode *, struct file *);
> @@ -1147,7 +1149,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
> err = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
> dev->adev, args->va_addr, args->size,
> pdd->drm_priv, (struct kgd_mem **) &mem, &offset,
> - flags, false);
> + flags, false, NULL);
>
> if (err)
> goto err_unlock;
> @@ -1848,6 +1850,12 @@ static uint32_t get_process_num_bos(struct kfd_process *p)
> idr_for_each_entry(&pdd->alloc_idr, mem, id) {
> struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
>
> + /* Count BOs whose VA is either zero or is equal
> + * to or exceed GPUVMs base address
> + *
> + * @note: BOs whose VA is below GPUVM base are
> + * used internally, e.g. Trap handler buffer
> + */
> if (!kgd_mem->va || kgd_mem->va > pdd->gpuvm_base)
> num_of_bos++;
> }
> @@ -1936,6 +1944,12 @@ static int criu_checkpoint_bos(struct kfd_process *p,
> bo_bucket->alloc_flags = (uint32_t)kgd_mem->alloc_flags;
> bo_priv->idr_handle = id;
>
> + /* Copy uuid of BO that identifies it uniquely
> + * Currently this is true for only VRAM BOs that
> + * have been exported
> + */
> + uuid_copy((uuid_t *)bo_priv->uuid, &kgd_mem->bo->uuid);
> +
> if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> ret = amdgpu_ttm_tt_get_userptr(&dumper_bo->tbo,
> &bo_priv->user_addr);
> @@ -2295,6 +2309,7 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd,
> int ret;
> const bool criu_resume = true;
> u64 offset;
> + uuid_t *uuid;
>
> if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
> if (bo_bucket->size !=
> @@ -2318,10 +2333,17 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd,
> } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
> offset = bo_priv->user_addr;
> }
> - /* Create the BO */
> +
> + /* Acquire handle of UUID of BO if need be */
> + uuid = NULL;
> + if ((bo_bucket->addr == 0) &&
> + (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM))
> + uuid = (uuid_t *)bo_priv->uuid;
> +
> ret = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(pdd->dev->adev, bo_bucket->addr,
> bo_bucket->size, pdd->drm_priv, kgd_mem,
> - &offset, bo_bucket->alloc_flags, criu_resume);
> + &offset, bo_bucket->alloc_flags,
> + criu_resume, uuid);
> if (ret) {
> pr_err("Could not create the BO\n");
> return ret;
> @@ -2728,10 +2750,12 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
> ret = criu_unpause(filep, p, args);
> break;
> case KFD_CRIU_OP_RESTORE:
> + inc_table_counter(p->n_pdds);
> ret = criu_restore(filep, p, args);
> break;
> case KFD_CRIU_OP_RESUME:
> ret = criu_resume(filep, p, args);
> + dec_table_counter(p->n_pdds);
> break;
> default:
> dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", args->op);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index a40f8cfc6aa5..320408239896 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1178,7 +1178,7 @@ int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
> * kfd_criu_svm_range_priv_data
> */
>
> -#define KFD_CRIU_PRIV_VERSION 1
> +#define KFD_CRIU_PRIV_VERSION 2
>
> struct kfd_criu_process_priv_data {
> uint32_t version;
> @@ -1193,6 +1193,7 @@ struct kfd_criu_device_priv_data {
> struct kfd_criu_bo_priv_data {
> uint64_t user_addr;
> uint32_t idr_handle;
> + uint8_t uuid[16]; /* Unique Id of BO whose size is UUID_SIZE */
> uint32_t mapped_gpuids[MAX_GPU_INSTANCE];
> };
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index c10d050e1a61..1969eb9375c2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -716,7 +716,7 @@ static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd,
>
> err = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(kdev->adev, gpu_va, size,
> pdd->drm_priv, mem, NULL,
> - flags, false);
> + flags, false, NULL);
> if (err)
> goto err_alloc_mem;
>
More information about the amd-gfx
mailing list