[PATCH 5/5] libhsakmt: allocate unified memory for ctx save restore area
Felix Kuehling
felix.kuehling at amd.com
Thu Jun 30 21:39:08 UTC 2022
On 2022-06-30 15:03, Eric Huang wrote:
> To improve queue preemption performance, allocate the ctx s/r
> area in VRAM instead of system memory, and migrate it back
> to system memory when VRAM is full.
>
> Signed-off-by: Eric Huang <jinhuieric.huang at amd.com>
> Change-Id: If775782027188dbe84b6868260e429373675434c
> ---
> include/hsakmttypes.h | 1 +
> src/queues.c | 103 ++++++++++++++++++++++++++++++++++++------
> 2 files changed, 90 insertions(+), 14 deletions(-)
>
> diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h
> index 9063f85..2c1c7cc 100644
> --- a/include/hsakmttypes.h
> +++ b/include/hsakmttypes.h
> @@ -1329,6 +1329,7 @@ typedef enum _HSA_SVM_FLAGS {
> HSA_SVM_FLAG_GPU_RO = 0x00000008, // GPUs only read, allows replication
> HSA_SVM_FLAG_GPU_EXEC = 0x00000010, // Allow execution on GPU
> HSA_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020, // GPUs mostly read, may allow similar optimizations as RO, but writes fault
> + HSA_SVM_FLAG_GPU_ALWAYS_MAPPED = 0x00000040, // Keep GPU memory mapping always valid as if XNACK is disabled
> } HSA_SVM_FLAGS;
>
> typedef enum _HSA_SVM_ATTR_TYPE {
> diff --git a/src/queues.c b/src/queues.c
> index c83dd93..d5109f9 100644
> --- a/src/queues.c
> +++ b/src/queues.c
> @@ -68,6 +68,7 @@ struct queue {
> uint32_t eop_buffer_size;
> uint32_t gfxv;
> bool use_ats;
> + bool unified_ctx_save_restore;
> /* This queue structure is allocated from GPU with page aligned size
> * but only small bytes are used. We use the extra space in the end for
> * cu_mask bits array.
> @@ -383,13 +384,47 @@ static void free_exec_aligned_memory(void *addr, uint32_t size, uint32_t align,
> munmap(addr, size);
> }
>
> +static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size,
> + uint32_t gpuNode, uint32_t prefetchNode,
> + uint32_t preferredNode, bool alwaysMapped)
> +{
> + HSA_SVM_ATTRIBUTE *attrs;
> + HSAuint64 s_attr;
> + HSAuint32 nattr;
> + HSAuint32 flags;
> +
> + flags = HSA_SVM_FLAG_HOST_ACCESS;
> +
> + if (alwaysMapped)
> + flags |= HSA_SVM_FLAG_GPU_ALWAYS_MAPPED;
> +
> + nattr = 5;
> + s_attr = sizeof(*attrs) * nattr;
> + attrs = (HSA_SVM_ATTRIBUTE *)alloca(s_attr);
> +
> + attrs[0].type = HSA_SVM_ATTR_PREFETCH_LOC;
> + attrs[0].value = prefetchNode;
> + attrs[1].type = HSA_SVM_ATTR_PREFERRED_LOC;
> + attrs[1].value = preferredNode;
> + attrs[2].type = HSA_SVM_ATTR_CLR_FLAGS;
> + attrs[2].value = ~flags;
> + attrs[3].type = HSA_SVM_ATTR_SET_FLAGS;
> + attrs[3].value = flags;
> + attrs[4].type = HSA_SVM_ATTR_ACCESS;
> + attrs[4].value = gpuNode;
> +
> + return hsaKmtSVMSetAttr(mem, size, nattr, attrs);
> +}
> +
> static void free_queue(struct queue *q)
> {
> if (q->eop_buffer)
> free_exec_aligned_memory(q->eop_buffer,
> q->eop_buffer_size,
> PAGE_SIZE, q->use_ats);
> - if (q->ctx_save_restore)
> + if (q->unified_ctx_save_restore)
> + free(q->ctx_save_restore);
> + else if (q->ctx_save_restore)
> free_exec_aligned_memory(q->ctx_save_restore,
> q->ctx_save_restore_size,
> PAGE_SIZE, q->use_ats);
> @@ -425,6 +460,8 @@ static int handle_concrete_asic(struct queue *q,
> if (ret) {
> uint32_t total_mem_alloc_size = 0;
> HsaUserContextSaveAreaHeader *header;
> + HsaNodeProperties node;
> + bool svm_api;
>
> args->ctx_save_restore_size = q->ctx_save_restore_size;
> args->ctl_stack_size = q->ctl_stack_size;
> @@ -434,22 +471,60 @@ static int handle_concrete_asic(struct queue *q,
> */
> total_mem_alloc_size = q->ctx_save_restore_size +
> q->debug_memory_size;
> - q->ctx_save_restore =
> - allocate_exec_aligned_memory(total_mem_alloc_size,
> - q->use_ats, NodeId, false, false);
>
> - if (!q->ctx_save_restore)
> - return HSAKMT_STATUS_NO_MEMORY;
> + if (hsaKmtGetNodeProperties(NodeId, &node))
> + svm_api = false;
> + else
> + svm_api = node.Capability.ui32.SVMAPISupported;
>
> - args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore;
> + /* Allocate unified memory for context save restore
> + * area on dGPU.
> + */
> + if (!q->use_ats && svm_api) {
> + uint32_t size = PAGE_ALIGN_UP(total_mem_alloc_size);
> + void *addr;
> + HSAKMT_STATUS r = HSAKMT_STATUS_ERROR;
> +
> + if (posix_memalign(&addr, GPU_HUGE_PAGE_SIZE, size))
> + pr_err("[%s] posix_memalign failed:\n", __func__);
> + else {
> + header = (HsaUserContextSaveAreaHeader *)addr;
> + header->ErrorEventId = 0;
> + if (Event)
> + header->ErrorEventId = Event->EventId;
> + header->ErrorReason = ErrPayload;
> + header->DebugOffset = q->ctx_save_restore_size;
> + header->DebugSize = q->debug_memory_size;
> +
> + r = register_svm_range(addr, size,
> + NodeId, NodeId, 0, true);
> +
> + if (r == HSAKMT_STATUS_SUCCESS) {
> + q->ctx_save_restore = addr;
> + q->unified_ctx_save_restore = true;
> + } else
> + free(addr);
> + }
> + }
>
> - header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore;
> - header->ErrorEventId = 0;
> - if (Event)
> - header->ErrorEventId = Event->EventId;
> - header->ErrorReason = ErrPayload;
> - header->DebugOffset = q->ctx_save_restore_size;
> - header->DebugSize = q->debug_memory_size;
> + if (!q->unified_ctx_save_restore) {
> + q->ctx_save_restore = allocate_exec_aligned_memory(
> + total_mem_alloc_size,
> + q->use_ats, NodeId, false, false);
> +
> + if (!q->ctx_save_restore)
> + return HSAKMT_STATUS_NO_MEMORY;
> +
> + header = (HsaUserContextSaveAreaHeader *)q->ctx_save_restore;
> + header->ErrorEventId = 0;
> + if (Event)
> + header->ErrorEventId = Event->EventId;
> + header->ErrorReason = ErrPayload;
> + header->DebugOffset = q->ctx_save_restore_size;
> + header->DebugSize = q->debug_memory_size;
> + }
Is there a way to refactor the code to avoid duplicating the header
initialization? Maybe move it into a helper function, something like
the untested sketch below.
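This is just a rough idea; the helper name is made up and the Event/ErrPayload
parameter types are assumed to match what handle_concrete_asic() already takes:

static void init_ctx_save_restore_header(struct queue *q, void *addr,
					 HsaEvent *Event,
					 volatile HSAint64 *ErrPayload)
{
	/* addr is either the unified allocation, before it is committed to
	 * q->ctx_save_restore, or q->ctx_save_restore itself on the legacy
	 * allocate_exec_aligned_memory() path.
	 */
	HsaUserContextSaveAreaHeader *header =
		(HsaUserContextSaveAreaHeader *)addr;

	header->ErrorEventId = Event ? Event->EventId : 0;
	header->ErrorReason = ErrPayload;
	header->DebugOffset = q->ctx_save_restore_size;
	header->DebugSize = q->debug_memory_size;
}

Then the unified-memory path could call it on the posix_memalign'd address
before register_svm_range(), and the fallback path on q->ctx_save_restore.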
Other than that, the series is
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> +
> + args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore;
> }
>
> return HSAKMT_STATUS_SUCCESS;