[PATCH i-g-t v3 5/5] lib/amdgpu: Consolidate user queue implementation into amd_ip_blocks.h
Vitaly Prosyak
vprosyak at amd.com
Thu Jul 3 01:32:45 UTC 2025
The series of 5 patches looks good to me, as we discussed.
The warning below was addressed:
../lib/amdgpu/amd_command_submission.c:38:9: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement]
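For context: ISO C90 requires all declarations in a block to come before the
first statement. The fix, visible in the hunks below, declares ip_block at the
top of the function and assigns it afterwards. A minimal sketch of the pattern,
using a hypothetical function name:

    void example(amdgpu_device_handle dev, unsigned int ip_type)
    {
        const struct amdgpu_ip_block_version *ip_block = NULL; /* declaration, fine in C90 */

        ip_block = get_ip_block(dev, ip_type); /* assignment is a statement, so it follows */
    }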
Reviewed-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
On 2025-07-02 01:47, Jesse.Zhang wrote:
> This commit refactors the user queue (UMQ) implementation by:
>
> 1. Moving all UMQ-related definitions and functions from amd_userq.[ch]
> into amd_ip_blocks.c and amd_ip_blocks.h for better code organization
> and to reduce unnecessary file separation.
>
> 2. Adding user queue operations (create/submit/destroy) directly to the
> amdgpu_ip_funcs structure, making them IP-block specific.
>
> 3. Removing the now-obsolete amd_userq.[ch] files and updating all
> references to use the new IP block interface.
>
> Key changes include:
> - Defined UMQ constants (PAGE_SIZE, queue sizes) in header
> - Added packet building macros (amdgpu_pkt_begin/add_dw/end)
> - Integrated UMQ functions into IP block ops structure
> - Updated all tests to use IP block funcs instead of direct UMQ calls
> - Removed standalone UMQ implementation files
>
> This consolidation improves code maintainability and makes the UMQ
> implementation more consistent with the rest of the driver architecture.
>
> Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
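
To make the interface change concrete, here is a minimal sketch of how a test
now drives a user queue through the ops table (names are from this patch;
ib_mc_address stands in for the IB's GPU address and error handling is
elided):

    const struct amdgpu_ip_block_version *ip_block;
    struct amdgpu_ring_context *ring_context;

    ip_block = get_ip_block(device, AMD_IP_GFX);
    ring_context = calloc(1, sizeof(*ring_context));

    ip_block->funcs->userq_create(device, ring_context, ip_block->type);
    /* ... write PM4 into the IB, set ring_context->pm4_dw ... */
    ip_block->funcs->userq_submit(device, ring_context, ip_block->type, ib_mc_address);
    ip_block->funcs->userq_destroy(device, ring_context, ip_block->type);
    free(ring_context);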
> ---
> lib/amdgpu/amd_command_submission.c | 17 +-
> lib/amdgpu/amd_compute.c | 9 +-
> lib/amdgpu/amd_deadlock_helpers.c | 7 +-
> lib/amdgpu/amd_ip_blocks.c | 496 ++++++++++++++++++++++++++++
> lib/amdgpu/amd_ip_blocks.h | 40 +++
> lib/amdgpu/amd_memory.c | 3 +-
> lib/amdgpu/amd_userq.c | 494 ---------------------------
> lib/amdgpu/amd_userq.h | 55 ---
> lib/meson.build | 3 +-
> tests/amdgpu/amd_basic.c | 9 +-
> tests/amdgpu/amd_cs_nop.c | 9 +-
> tests/amdgpu/amd_deadlock.c | 1 -
> 12 files changed, 564 insertions(+), 579 deletions(-)
> delete mode 100644 lib/amdgpu/amd_userq.c
> delete mode 100644 lib/amdgpu/amd_userq.h
>
> diff --git a/lib/amdgpu/amd_command_submission.c b/lib/amdgpu/amd_command_submission.c
> index fcc356f8f..2b13e7a9b 100644
> --- a/lib/amdgpu/amd_command_submission.c
> +++ b/lib/amdgpu/amd_command_submission.c
> @@ -10,7 +10,6 @@
> #include "lib/amdgpu/amd_sdma.h"
> #include "lib/amdgpu/amd_PM4.h"
> #include "lib/amdgpu/amd_command_submission.h"
> -#include "lib/amdgpu/amd_userq.h"
> #include "ioctl_wrappers.h"
>
>
> @@ -33,7 +32,9 @@ int amdgpu_test_exec_cs_helper(amdgpu_device_handle device, unsigned int ip_type
> struct amdgpu_cs_fence fence_status = {0};
> amdgpu_va_handle va_handle;
> bool user_queue = ring_context->user_queue;
> + const struct amdgpu_ip_block_version *ip_block = NULL;
>
> + ip_block = get_ip_block(device, ip_type);
> amdgpu_bo_handle *all_res = alloca(sizeof(ring_context->resources[0]) * (ring_context->res_cnt + 1));
>
> if (expect_failure) {
> @@ -68,7 +69,7 @@ int amdgpu_test_exec_cs_helper(amdgpu_device_handle device, unsigned int ip_type
> memcpy(ring_ptr, ring_context->pm4, ring_context->pm4_dw * sizeof(*ring_context->pm4));
>
> if (user_queue)
> - amdgpu_user_queue_submit(device, ring_context, ip_type, ib_result_mc_address);
> + ip_block->funcs->userq_submit(device, ring_context, ip_type, ib_result_mc_address);
> else {
> ring_context->ib_info.ib_mc_address = ib_result_mc_address;
> ring_context->ib_info.size = ring_context->pm4_dw;
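
One small observation on this hunk: callers now dereference ip_block->funcs
unconditionally. get_ip_block() should succeed for every IP these tests
exercise, but an explicit assert would make an unexpected NULL fail loudly
instead of crashing; something like:

    ip_block = get_ip_block(device, ip_type);
    igt_assert(ip_block && ip_block->funcs->userq_submit);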
> @@ -163,7 +164,7 @@ void amdgpu_command_submission_write_linear_helper(amdgpu_device_handle device,
> gtt_flags[i] |= AMDGPU_GEM_CREATE_ENCRYPTED;
>
> if (user_queue) {
> - amdgpu_user_queue_create(device, ring_context, ip_block->type);
> + ip_block->funcs->userq_create(device, ring_context, ip_block->type);
> } else {
> r = amdgpu_cs_ctx_create(device, &ring_context->context_handle);
> igt_assert_eq(r, 0);
> @@ -246,7 +247,7 @@ void amdgpu_command_submission_write_linear_helper(amdgpu_device_handle device,
> free(ring_context->pm4);
>
> if (user_queue) {
> - amdgpu_user_queue_destroy(device, ring_context, ip_block->type);
> + ip_block->funcs->userq_destroy(device, ring_context, ip_block->type);
> } else {
> r = amdgpu_cs_ctx_free(ring_context->context_handle);
> igt_assert_eq(r, 0);
> @@ -287,7 +288,7 @@ void amdgpu_command_submission_const_fill_helper(amdgpu_device_handle device,
> igt_assert_eq(r, 0);
>
> if (user_queue) {
> - amdgpu_user_queue_create(device, ring_context, ip_block->type);
> + ip_block->funcs->userq_create(device, ring_context, ip_block->type);
> } else {
> r = amdgpu_cs_ctx_create(device, &ring_context->context_handle);
> igt_assert_eq(r, 0);
> @@ -341,7 +342,7 @@ void amdgpu_command_submission_const_fill_helper(amdgpu_device_handle device,
> free(ring_context->pm4);
>
> if (user_queue) {
> - amdgpu_user_queue_destroy(device, ring_context, ip_block->type);
> + ip_block->funcs->userq_destroy(device, ring_context, ip_block->type);
> } else {
> r = amdgpu_cs_ctx_free(ring_context->context_handle);
> igt_assert_eq(r, 0);
> @@ -383,7 +384,7 @@ void amdgpu_command_submission_copy_linear_helper(amdgpu_device_handle device,
>
>
> if (user_queue) {
> - amdgpu_user_queue_create(device, ring_context, ip_block->type);
> + ip_block->funcs->userq_create(device, ring_context, ip_block->type);
> } else {
> r = amdgpu_cs_ctx_create(device, &ring_context->context_handle);
> igt_assert_eq(r, 0);
> @@ -467,7 +468,7 @@ void amdgpu_command_submission_copy_linear_helper(amdgpu_device_handle device,
> free(ring_context->pm4);
>
> if (user_queue) {
> - amdgpu_user_queue_destroy(device, ring_context, ip_block->type);
> + ip_block->funcs->userq_destroy(device, ring_context, ip_block->type);
> } else {
> r = amdgpu_cs_ctx_free(ring_context->context_handle);
> igt_assert_eq(r, 0);
> diff --git a/lib/amdgpu/amd_compute.c b/lib/amdgpu/amd_compute.c
> index d53df241c..d92b99a76 100644
> --- a/lib/amdgpu/amd_compute.c
> +++ b/lib/amdgpu/amd_compute.c
> @@ -26,7 +26,6 @@
> #include "amd_memory.h"
> #include "amd_compute.h"
> #include "amd_sdma.h"
> -#include "amd_userq.h"
>
> /**
> *
> @@ -42,6 +41,7 @@ void amdgpu_command_submission_nop(amdgpu_device_handle device, enum amd_ip_bloc
> struct amdgpu_cs_request ibs_request;
> struct amdgpu_cs_ib_info ib_info;
> struct amdgpu_cs_fence fence_status;
> + const struct amdgpu_ip_block_version *ip_block = NULL;
> uint32_t *ptr;
> uint32_t expired;
> int r, instance;
> @@ -50,6 +50,7 @@ void amdgpu_command_submission_nop(amdgpu_device_handle device, enum amd_ip_bloc
>
> struct amdgpu_ring_context *ring_context;
>
> + ip_block = get_ip_block(device, type);
> ring_context = calloc(1, sizeof(*ring_context));
> igt_assert(ring_context);
>
> @@ -57,7 +58,7 @@ void amdgpu_command_submission_nop(amdgpu_device_handle device, enum amd_ip_bloc
> igt_assert_eq(r, 0);
>
> if (user_queue) {
> - amdgpu_user_queue_create(device, ring_context, type);
> + ip_block->funcs->userq_create(device, ring_context, type);
> } else {
> r = amdgpu_cs_ctx_create(device, &context_handle);
> igt_assert_eq(r, 0);
> @@ -93,7 +94,7 @@ void amdgpu_command_submission_nop(amdgpu_device_handle device, enum amd_ip_bloc
> ring_context->pm4_dw = 16;
>
> if (user_queue) {
> - amdgpu_user_queue_submit(device, ring_context, type,
> + ip_block->funcs->userq_submit(device, ring_context, type,
> ib_result_mc_address);
> } else {
> memset(&ib_info, 0, sizeof(struct amdgpu_cs_ib_info));
> @@ -131,7 +132,7 @@ void amdgpu_command_submission_nop(amdgpu_device_handle device, enum amd_ip_bloc
> }
>
> if (user_queue) {
> - amdgpu_user_queue_destroy(device, ring_context, type);
> + ip_block->funcs->userq_destroy(device, ring_context, type);
> } else {
> r = amdgpu_cs_ctx_free(context_handle);
> igt_assert_eq(r, 0);
> diff --git a/lib/amdgpu/amd_deadlock_helpers.c b/lib/amdgpu/amd_deadlock_helpers.c
> index f7845edb0..1ed407332 100644
> --- a/lib/amdgpu/amd_deadlock_helpers.c
> +++ b/lib/amdgpu/amd_deadlock_helpers.c
> @@ -13,7 +13,6 @@
> #include <signal.h>
> #include "amd_memory.h"
> #include "amd_deadlock_helpers.h"
> -#include "lib/amdgpu/amd_userq.h"
> #include "lib/amdgpu/amd_command_submission.h"
>
> #define MAX_JOB_COUNT 200
> @@ -292,11 +291,12 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
> struct amdgpu_ring_context *ring_context;
> int r = 0;
>
> + ip_block = get_ip_block(device_handle, ip_type);
> ring_context = calloc(1, sizeof(*ring_context));
> igt_assert(ring_context);
>
> if (user_queue) {
> - amdgpu_user_queue_create(device_handle, ring_context, ip_type);
> + ip_block->funcs->userq_create(device_handle, ring_context, ip_type);
> } else {
> if (priority == AMDGPU_CTX_PRIORITY_HIGH)
> r = amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &ring_context->context_handle);
> @@ -314,7 +314,6 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
> ring_context->user_queue = user_queue;
> ring_context->time_out = 0x7ffff;
> igt_assert(ring_context->pm4);
> - ip_block = get_ip_block(device_handle, ip_type);
> r = amdgpu_bo_alloc_and_map_sync(device_handle,
> ring_context->write_length * sizeof(uint32_t),
> 4096, AMDGPU_GEM_DOMAIN_GTT,
> @@ -345,7 +344,7 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
> amdgpu_bo_unmap_and_free(ring_context->bo, ring_context->va_handle, ring_context->bo_mc,
> ring_context->write_length * sizeof(uint32_t));
> if (user_queue) {
> - amdgpu_user_queue_destroy(device_handle, ring_context, ip_block->type);
> + ip_block->funcs->userq_destroy(device_handle, ring_context, ip_block->type);
> } else {
> free(ring_context->pm4);
> free(ring_context);
> diff --git a/lib/amdgpu/amd_ip_blocks.c b/lib/amdgpu/amd_ip_blocks.c
> index e4913c77d..718fcc0c4 100644
> --- a/lib/amdgpu/amd_ip_blocks.c
> +++ b/lib/amdgpu/amd_ip_blocks.c
> @@ -464,6 +464,496 @@ x_compare_pattern(const struct amdgpu_ip_funcs *func,
> return ret;
> }
>
> +#ifdef AMDGPU_USERQ_ENABLED
> +static void amdgpu_alloc_doorbell(amdgpu_device_handle device_handle,
> + struct amdgpu_userq_bo *doorbell_bo,
> + unsigned int size, unsigned int domain)
> +{
> + struct amdgpu_bo_alloc_request req = {0};
> + amdgpu_bo_handle buf_handle;
> + int r;
> +
> + req.alloc_size = ALIGN(size, PAGE_SIZE);
> + req.preferred_heap = domain;
> + r = amdgpu_bo_alloc(device_handle, &req, &buf_handle);
> + igt_assert_eq(r, 0);
> +
> + doorbell_bo->handle = buf_handle;
> + doorbell_bo->size = req.alloc_size;
> +
> + r = amdgpu_bo_cpu_map(doorbell_bo->handle,
> + (void **)&doorbell_bo->ptr);
> + igt_assert_eq(r, 0);
> +}
> +
> +int
> +amdgpu_bo_alloc_and_map_uq(amdgpu_device_handle device_handle, unsigned int size,
> + unsigned int alignment, unsigned int heap, uint64_t alloc_flags,
> + uint64_t mapping_flags, amdgpu_bo_handle *bo, void **cpu,
> + uint64_t *mc_address, amdgpu_va_handle *va_handle,
> + uint32_t timeline_syncobj_handle, uint64_t point)
> +{
> + struct amdgpu_bo_alloc_request request = {};
> + amdgpu_bo_handle buf_handle;
> + uint64_t vmc_addr;
> + int r;
> +
> + request.alloc_size = size;
> + request.phys_alignment = alignment;
> + request.preferred_heap = heap;
> + request.flags = alloc_flags;
> +
> + r = amdgpu_bo_alloc(device_handle, &request, &buf_handle);
> + if (r)
> + return r;
> +
> + r = amdgpu_va_range_alloc(device_handle,
> + amdgpu_gpu_va_range_general,
> + size, alignment, 0, &vmc_addr,
> + va_handle, 0);
> + if (r)
> + goto error_va_alloc;
> +
> + r = amdgpu_bo_va_op_raw2(device_handle, buf_handle, 0,
> + ALIGN(size, getpagesize()), vmc_addr,
> + AMDGPU_VM_PAGE_READABLE |
> + AMDGPU_VM_PAGE_WRITEABLE |
> + AMDGPU_VM_PAGE_EXECUTABLE |
> + mapping_flags,
> + AMDGPU_VA_OP_MAP,
> + timeline_syncobj_handle,
> + point, 0, 0);
> + if (r)
> + goto error_va_map;
> +
> + if (cpu) {
> + r = amdgpu_bo_cpu_map(buf_handle, cpu);
> + if (r)
> + goto error_cpu_map;
> + }
> +
> + *bo = buf_handle;
> + *mc_address = vmc_addr;
> +
> + return 0;
> +
> +error_cpu_map:
> + amdgpu_bo_va_op(buf_handle, 0, size, vmc_addr, 0, AMDGPU_VA_OP_UNMAP);
> +error_va_map:
> + amdgpu_va_range_free(*va_handle);
> +error_va_alloc:
> + amdgpu_bo_free(buf_handle);
> + return r;
> +}
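
The error labels above unwind in reverse order of acquisition (CPU map, VA
map, VA range, BO), the usual goto-ladder idiom. For reference, a typical
call as used later in this file, with dev/bo/cpu_ptr/mc_addr/va_handle/point
as placeholder variables; each mapping signals a fresh timeline point via
++point:

    r = amdgpu_bo_alloc_and_map_uq(dev, 8, ALIGNMENT,
                                   AMDGPU_GEM_DOMAIN_GTT, 0 /* alloc_flags */,
                                   AMDGPU_VM_MTYPE_UC,
                                   &bo, &cpu_ptr, &mc_addr, &va_handle,
                                   timeline_syncobj_handle, ++point);
    igt_assert_eq(r, 0);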
> +
> +static void amdgpu_bo_unmap_and_free_uq(amdgpu_device_handle device_handle,
> + amdgpu_bo_handle bo, amdgpu_va_handle va_handle,
> + uint64_t mc_addr, uint64_t size,
> + uint32_t timeline_syncobj_handle,
> + uint64_t point, uint64_t syncobj_handles_array,
> + uint32_t num_syncobj_handles)
> +{
> + amdgpu_bo_cpu_unmap(bo);
> + amdgpu_bo_va_op_raw2(device_handle, bo, 0, size, mc_addr, 0, AMDGPU_VA_OP_UNMAP,
> + timeline_syncobj_handle, point,
> + syncobj_handles_array, num_syncobj_handles);
> + amdgpu_va_range_free(va_handle);
> + amdgpu_bo_free(bo);
> +}
> +
> +int amdgpu_timeline_syncobj_wait(amdgpu_device_handle device_handle,
> + uint32_t timeline_syncobj_handle, uint64_t point)
> +{
> + uint32_t flags = DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED;
> + int r;
> +
> + r = amdgpu_cs_syncobj_query2(device_handle, &timeline_syncobj_handle,
> + &point, 1, flags);
> + if (r)
> + return r;
> +
> + r = amdgpu_cs_syncobj_timeline_wait(device_handle, &timeline_syncobj_handle,
> + &point, 1, INT64_MAX,
> + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL |
> + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
> + NULL);
> + if (r)
> + igt_warn("Timeline timed out\n");
> + return r;
> +}
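
Note the two-step wait: the query with DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED
overwrites the caller's point with the newest submitted point on the timeline,
and the wait then targets that value with WAIT_FOR_SUBMIT, so not-yet-submitted
points do not error out. In practice one wait on the latest point covers every
earlier mapping:

    /* after several amdgpu_bo_alloc_and_map_uq() calls, each passing ++ctxt->point */
    r = amdgpu_timeline_syncobj_wait(device_handle, ctxt->timeline_syncobj_handle,
                                     ctxt->point);
    igt_assert_eq(r, 0);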
> +
> +static void
> +amdgpu_user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context,
> + unsigned int ip_type, uint64_t mc_address)
> +{
> + int r;
> + uint32_t control = ring_context->pm4_dw;
> + uint32_t syncarray[1];
> + struct drm_amdgpu_userq_signal signal_data;
> + uint64_t timeout = ring_context->time_out ? ring_context->time_out : INT64_MAX;
> +
> + amdgpu_pkt_begin();
> +
> + if (ip_type == AMD_IP_DMA) {
> + /* For SDMA, we need to align the IB to 8 DW boundary */
> + unsigned int nop_count = (2 - lower_32_bits(*ring_context->wptr_cpu)) & 7;
> + for (unsigned int i = 0; i < nop_count; i++)
> + amdgpu_pkt_add_dw(SDMA_PKT_HEADER_OP(SDMA_NOP));
> + amdgpu_pkt_add_dw(SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT));
> + amdgpu_pkt_add_dw(lower_32_bits(mc_address) & 0xffffffe0); // 32-byte aligned
> + amdgpu_pkt_add_dw(upper_32_bits(mc_address));
> + amdgpu_pkt_add_dw(control); // IB length in DWORDS
> + amdgpu_pkt_add_dw(lower_32_bits(ring_context->csa.mc_addr)); // CSA MC address low
> + amdgpu_pkt_add_dw(upper_32_bits(ring_context->csa.mc_addr)); // CSA MC address high
> + if (ring_context->hw_ip_info.hw_ip_version_major <= 6)
> + amdgpu_pkt_add_dw(SDMA_PACKET(SDMA_OP_PROTECTED_FENCE, SDMA6_SUB_OP_PROTECTED_FENCE, 0));
> + else
> + amdgpu_pkt_add_dw(SDMA_PACKET(SDMA_OP_PROTECTED_FENCE, SDMA7_SUB_OP_PROTECTED_FENCE, 0));
> + } else {
> + /* Prepare the Indirect IB to submit the IB to user queue */
> + amdgpu_pkt_add_dw(PACKET3(PACKET3_INDIRECT_BUFFER, 2));
> + amdgpu_pkt_add_dw(lower_32_bits(mc_address));
> + amdgpu_pkt_add_dw(upper_32_bits(mc_address));
> +
> + if (ip_type == AMD_IP_GFX)
> + amdgpu_pkt_add_dw(control | S_3F3_INHERIT_VMID_MQD_GFX(1));
> + else
> + amdgpu_pkt_add_dw(control | S_3F3_VALID_COMPUTE(1)
> + | S_3F3_INHERIT_VMID_MQD_COMPUTE(1));
> +
> + amdgpu_pkt_add_dw(PACKET3(PACKET3_PROTECTED_FENCE_SIGNAL, 0));
> +
> + /* empty dword is needed for fence signal pm4 */
> + amdgpu_pkt_add_dw(0);
> + }
> +#if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
> + asm volatile ("mfence" : : : "memory");
> +#endif
> +
> + /* The call below updates the wptr address, so it will wait until all writes complete */
> + amdgpu_pkt_end();
> +
> +#if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
> + asm volatile ("mfence" : : : "memory");
> +#endif
> +
> + if (ip_type == AMD_IP_DMA)
> + *ring_context->wptr_cpu = *ring_context->wptr_cpu << 2;
> + /* Update the doorbell */
> + ring_context->doorbell_cpu[DOORBELL_INDEX] = *ring_context->wptr_cpu;
> +
> + /* Add a fence packet for signal */
> + syncarray[0] = ring_context->timeline_syncobj_handle;
> + signal_data.queue_id = ring_context->queue_id;
> + signal_data.syncobj_handles = (uintptr_t)syncarray;
> + signal_data.num_syncobj_handles = 1;
> + signal_data.bo_read_handles = 0;
> + signal_data.bo_write_handles = 0;
> + signal_data.num_bo_read_handles = 0;
> + signal_data.num_bo_write_handles = 0;
> +
> + r = amdgpu_userq_signal(device, &signal_data);
> + igt_assert_eq(r, 0);
> +
> + r = amdgpu_cs_syncobj_wait(device, &ring_context->timeline_syncobj_handle, 1, timeout,
> + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
> + igt_assert_eq(r, 0);
> +}
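
The two mfence barriers bracket amdgpu_pkt_end() so the ring writes are
globally visible before the wptr update, and the wptr before the doorbell
write. Just a thought for a possible follow-up, not needed for this series:
C11 atomics would drop the GCC/x86-only guard, e.g.

    #include <stdatomic.h>

    atomic_thread_fence(memory_order_seq_cst); /* order ring writes vs. wptr/doorbell */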
> +
> +static void
> +amdgpu_user_queue_destroy(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt,
> + unsigned int type)
> +{
> + int r;
> +
> + if (type > AMD_IP_DMA) {
> + igt_info("Invalid IP not supported for UMQ Submission\n");
> + return;
> + }
> +
> + /* Free the Usermode Queue */
> + r = amdgpu_free_userqueue(device_handle, ctxt->queue_id);
> + igt_assert_eq(r, 0);
> +
> + switch (type) {
> + case AMD_IP_GFX:
> + amdgpu_bo_unmap_and_free_uq(device_handle, ctxt->csa.handle,
> + ctxt->csa.va_handle,
> + ctxt->csa.mc_addr, ctxt->info.gfx.csa_size,
> + ctxt->timeline_syncobj_handle, ++ctxt->point,
> + 0, 0);
> +
> + amdgpu_bo_unmap_and_free_uq(device_handle, ctxt->shadow.handle,
> + ctxt->shadow.va_handle,
> + ctxt->shadow.mc_addr, ctxt->info.gfx.shadow_size,
> + ctxt->timeline_syncobj_handle, ++ctxt->point,
> + 0, 0);
> +
> + r = amdgpu_timeline_syncobj_wait(device_handle, ctxt->timeline_syncobj_handle,
> + ctxt->point);
> + igt_assert_eq(r, 0);
> + break;
> +
> + case AMD_IP_COMPUTE:
> + amdgpu_bo_unmap_and_free_uq(device_handle, ctxt->eop.handle,
> + ctxt->eop.va_handle,
> + ctxt->eop.mc_addr, 256,
> + ctxt->timeline_syncobj_handle, ++ctxt->point,
> + 0, 0);
> +
> + r = amdgpu_timeline_syncobj_wait(device_handle, ctxt->timeline_syncobj_handle,
> + ctxt->point);
> + igt_assert_eq(r, 0);
> + break;
> +
> + case AMD_IP_DMA:
> + amdgpu_bo_unmap_and_free_uq(device_handle, ctxt->csa.handle,
> + ctxt->csa.va_handle,
> + ctxt->csa.mc_addr, ctxt->info.gfx.csa_size,
> + ctxt->timeline_syncobj_handle, ++ctxt->point,
> + 0, 0);
> +
> + r = amdgpu_timeline_syncobj_wait(device_handle, ctxt->timeline_syncobj_handle,
> + ctxt->point);
> + igt_assert_eq(r, 0);
> + break;
> +
> + default:
> + igt_info("IP invalid for cleanup\n");
> + }
> +
> + r = amdgpu_cs_destroy_syncobj(device_handle, ctxt->timeline_syncobj_handle);
> + igt_assert_eq(r, 0);
> +
> + /* Clean up doorbell */
> + r = amdgpu_bo_cpu_unmap(ctxt->doorbell.handle);
> + igt_assert_eq(r, 0);
> +
> + r = amdgpu_bo_free(ctxt->doorbell.handle);
> + igt_assert_eq(r, 0);
> +
> + /* Clean up the rptr, wptr and queue BOs */
> + amdgpu_bo_unmap_and_free(ctxt->rptr.handle, ctxt->rptr.va_handle,
> + ctxt->rptr.mc_addr, 8);
> +
> + amdgpu_bo_unmap_and_free(ctxt->wptr.handle, ctxt->wptr.va_handle,
> + ctxt->wptr.mc_addr, 8);
> +
> + amdgpu_bo_unmap_and_free(ctxt->queue.handle, ctxt->queue.va_handle,
> + ctxt->queue.mc_addr, USERMODE_QUEUE_SIZE);
> +}
> +
> +static void
> +amdgpu_user_queue_create(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt,
> + unsigned int type)
> +{
> + int r;
> + uint64_t gtt_flags = 0, queue_flags = 0;
> + struct drm_amdgpu_userq_mqd_gfx11 gfx_mqd;
> + struct drm_amdgpu_userq_mqd_sdma_gfx11 sdma_mqd;
> + struct drm_amdgpu_userq_mqd_compute_gfx11 compute_mqd;
> + void *mqd;
> +
> + if (type > AMD_IP_DMA) {
> + igt_info("Invalid IP not supported for UMQ Submission\n");
> + return;
> + }
> +
> + if (ctxt->secure) {
> + gtt_flags |= AMDGPU_GEM_CREATE_ENCRYPTED;
> + queue_flags |= AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE;
> + }
> +
> + if (ctxt->priority)
> + queue_flags |= ctxt->priority & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK;
> +
> + r = amdgpu_query_uq_fw_area_info(device_handle, AMD_IP_GFX, 0, &ctxt->info);
> + igt_assert_eq(r, 0);
> +
> + r = amdgpu_cs_create_syncobj2(device_handle, 0, &ctxt->timeline_syncobj_handle);
> + igt_assert_eq(r, 0);
> +
> + r = amdgpu_bo_alloc_and_map_uq(device_handle, USERMODE_QUEUE_SIZE,
> + ALIGNMENT,
> + AMDGPU_GEM_DOMAIN_GTT,
> + gtt_flags,
> + AMDGPU_VM_MTYPE_UC,
> + &ctxt->queue.handle, &ctxt->queue.ptr,
> + &ctxt->queue.mc_addr, &ctxt->queue.va_handle,
> + ctxt->timeline_syncobj_handle, ++ctxt->point);
> + igt_assert_eq(r, 0);
> +
> + r = amdgpu_bo_alloc_and_map_uq(device_handle, 8,
> + ALIGNMENT,
> + AMDGPU_GEM_DOMAIN_GTT,
> + gtt_flags,
> + AMDGPU_VM_MTYPE_UC,
> + &ctxt->wptr.handle, &ctxt->wptr.ptr,
> + &ctxt->wptr.mc_addr, &ctxt->wptr.va_handle,
> + ctxt->timeline_syncobj_handle, ++ctxt->point);
> + igt_assert_eq(r, 0);
> +
> + r = amdgpu_bo_alloc_and_map_uq(device_handle, 8,
> + ALIGNMENT,
> + AMDGPU_GEM_DOMAIN_GTT,
> + gtt_flags,
> + AMDGPU_VM_MTYPE_UC,
> + &ctxt->rptr.handle, &ctxt->rptr.ptr,
> + &ctxt->rptr.mc_addr, &ctxt->rptr.va_handle,
> + ctxt->timeline_syncobj_handle, ++ctxt->point);
> + igt_assert_eq(r, 0);
> +
> + switch (type) {
> + case AMD_IP_GFX:
> + r = amdgpu_bo_alloc_and_map_uq(device_handle, ctxt->info.gfx.shadow_size,
> + ctxt->info.gfx.shadow_alignment,
> + AMDGPU_GEM_DOMAIN_GTT,
> + gtt_flags,
> + AMDGPU_VM_MTYPE_UC,
> + &ctxt->shadow.handle, NULL,
> + &ctxt->shadow.mc_addr, &ctxt->shadow.va_handle,
> + ctxt->timeline_syncobj_handle, ++ctxt->point);
> + igt_assert_eq(r, 0);
> +
> + r = amdgpu_bo_alloc_and_map_uq(device_handle, ctxt->info.gfx.csa_size,
> + ctxt->info.gfx.csa_alignment,
> + AMDGPU_GEM_DOMAIN_GTT,
> + gtt_flags,
> + AMDGPU_VM_MTYPE_UC,
> + &ctxt->csa.handle, NULL,
> + &ctxt->csa.mc_addr, &ctxt->csa.va_handle,
> + ctxt->timeline_syncobj_handle, ++ctxt->point);
> + igt_assert_eq(r, 0);
> +
> + gfx_mqd.shadow_va = ctxt->shadow.mc_addr;
> + gfx_mqd.csa_va = ctxt->csa.mc_addr;
> + mqd = &gfx_mqd;
> + break;
> +
> + case AMD_IP_COMPUTE:
> + r = amdgpu_bo_alloc_and_map_uq(device_handle, 256,
> + ALIGNMENT,
> + AMDGPU_GEM_DOMAIN_GTT,
> + gtt_flags,
> + AMDGPU_VM_MTYPE_UC,
> + &ctxt->eop.handle, NULL,
> + &ctxt->eop.mc_addr, &ctxt->eop.va_handle,
> + ctxt->timeline_syncobj_handle, ++ctxt->point);
> + igt_assert_eq(r, 0);
> + compute_mqd.eop_va = ctxt->eop.mc_addr;
> + mqd = &compute_mqd;
> + break;
> +
> + case AMD_IP_DMA:
> + r = amdgpu_bo_alloc_and_map_uq(device_handle, ctxt->info.gfx.csa_size,
> + ctxt->info.gfx.csa_alignment,
> + AMDGPU_GEM_DOMAIN_GTT,
> + gtt_flags,
> + AMDGPU_VM_MTYPE_UC,
> + &ctxt->csa.handle, NULL,
> + &ctxt->csa.mc_addr, &ctxt->csa.va_handle,
> + ctxt->timeline_syncobj_handle, ++ctxt->point);
> + igt_assert_eq(r, 0);
> + sdma_mqd.csa_va = ctxt->csa.mc_addr;
> + mqd = &sdma_mqd;
> + break;
> +
> + default:
> + igt_info("Unsupported IP for UMQ submission\n");
> + return;
> +
> + }
> +
> + r = amdgpu_timeline_syncobj_wait(device_handle, ctxt->timeline_syncobj_handle,
> + ctxt->point);
> + igt_assert_eq(r, 0);
> +
> + amdgpu_alloc_doorbell(device_handle, &ctxt->doorbell, PAGE_SIZE,
> + AMDGPU_GEM_DOMAIN_DOORBELL);
> +
> + ctxt->doorbell_cpu = (uint64_t *)ctxt->doorbell.ptr;
> +
> + ctxt->wptr_cpu = (uint64_t *)ctxt->wptr.ptr;
> + ctxt->rptr_cpu = (uint64_t *)ctxt->rptr.ptr;
> +
> + ctxt->queue_cpu = (uint32_t *)ctxt->queue.ptr;
> + memset(ctxt->queue_cpu, 0, USERMODE_QUEUE_SIZE);
> +
> + /* get db bo handle */
> + amdgpu_bo_export(ctxt->doorbell.handle, amdgpu_bo_handle_type_kms, &ctxt->db_handle);
> +
> + /* Create the Usermode Queue */
> + switch (type) {
> + case AMD_IP_GFX:
> + r = amdgpu_create_userqueue(device_handle, AMDGPU_HW_IP_GFX,
> + ctxt->db_handle, DOORBELL_INDEX,
> + ctxt->queue.mc_addr, USERMODE_QUEUE_SIZE,
> + ctxt->wptr.mc_addr, ctxt->rptr.mc_addr,
> + mqd, queue_flags, &ctxt->queue_id);
> + igt_assert_eq(r, 0);
> + break;
> +
> + case AMD_IP_COMPUTE:
> + r = amdgpu_create_userqueue(device_handle, AMDGPU_HW_IP_COMPUTE,
> + ctxt->db_handle, DOORBELL_INDEX,
> + ctxt->queue.mc_addr, USERMODE_QUEUE_SIZE,
> + ctxt->wptr.mc_addr, ctxt->rptr.mc_addr,
> + mqd, queue_flags, &ctxt->queue_id);
> + igt_assert_eq(r, 0);
> + break;
> +
> + case AMD_IP_DMA:
> + r = amdgpu_create_userqueue(device_handle, AMDGPU_HW_IP_DMA,
> + ctxt->db_handle, DOORBELL_INDEX,
> + ctxt->queue.mc_addr, USERMODE_QUEUE_SIZE,
> + ctxt->wptr.mc_addr, ctxt->rptr.mc_addr,
> + mqd, queue_flags, &ctxt->queue_id);
> + igt_assert_eq(r, 0);
> + break;
> +
> + default:
> + igt_info("Unsupported IP, failed to create user queue\n");
> + return;
> +
> + }
> +}
> +#else
> +int
> +amdgpu_bo_alloc_and_map_uq(amdgpu_device_handle device_handle, unsigned int size,
> + unsigned int alignment, unsigned int heap, uint64_t alloc_flags,
> + uint64_t mapping_flags, amdgpu_bo_handle *bo, void **cpu,
> + uint64_t *mc_address, amdgpu_va_handle *va_handle,
> + uint32_t timeline_syncobj_handle, uint64_t point)
> +{
> + return 0;
> +}
> +
> +int
> +amdgpu_timeline_syncobj_wait(amdgpu_device_handle device_handle,
> + uint32_t timeline_syncobj_handle, uint64_t point)
> +{
> + return 0;
> +}
> +
> +static void
> +amdgpu_user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context,
> + unsigned int ip_type, uint64_t mc_address)
> +{
> +}
> +
> +static void
> +amdgpu_user_queue_destroy(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt,
> + unsigned int type)
> +{
> +}
> +
> +static void
> +amdgpu_user_queue_create(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt,
> + unsigned int type)
> +{
> +}
> +#endif
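
A note on the #else stubs: when AMDGPU_USERQ_ENABLED is not defined,
userq_create/submit/destroy become empty functions and the two helpers return
0, so user-queue tests would run as silent no-ops rather than fail. If that
ever becomes a concern, a build-time skip in the tests would make it explicit;
a rough sketch:

    #ifndef AMDGPU_USERQ_ENABLED
        igt_skip("user queues not supported in this build\n");
    #endif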
> +
> static struct amdgpu_ip_funcs gfx_v8_x_ip_funcs = {
> .family_id = FAMILY_VI,
> .align_mask = 0xff,
> @@ -479,6 +969,9 @@ static struct amdgpu_ip_funcs gfx_v8_x_ip_funcs = {
> .compare_pattern = x_compare_pattern,
> .get_reg_offset = gfx_v8_0_get_reg_offset,
> .wait_reg_mem = gfx_ring_wait_reg_mem,
> + .userq_create = amdgpu_user_queue_create,
> + .userq_submit = amdgpu_user_queue_submit,
> + .userq_destroy = amdgpu_user_queue_destroy,
> };
>
> static struct amdgpu_ip_funcs sdma_v3_x_ip_funcs = {
> @@ -496,6 +989,9 @@ static struct amdgpu_ip_funcs sdma_v3_x_ip_funcs = {
> .compare_pattern = x_compare_pattern,
> .get_reg_offset = gfx_v8_0_get_reg_offset,
> .wait_reg_mem = sdma_ring_wait_reg_mem,
> + .userq_create = amdgpu_user_queue_create,
> + .userq_submit = amdgpu_user_queue_submit,
> + .userq_destroy = amdgpu_user_queue_destroy,
> };
>
> struct amdgpu_ip_block_version gfx_v8_x_ip_block = {
> diff --git a/lib/amdgpu/amd_ip_blocks.h b/lib/amdgpu/amd_ip_blocks.h
> index 2a4fa1742..b5731e13b 100644
> --- a/lib/amdgpu/amd_ip_blocks.h
> +++ b/lib/amdgpu/amd_ip_blocks.h
> @@ -8,6 +8,8 @@
> #define AMD_IP_BLOCKS_H
>
> #include <amdgpu_drm.h>
> +#include <amdgpu.h>
> +#include <time.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> @@ -32,6 +34,29 @@
> #define S_3F3_VALID_COMPUTE(x) (((unsigned int)(x)&0x1) << 23)/* userqueue only */
> #define S_3F3_INHERIT_VMID_MQD_COMPUTE(x) (((unsigned int)(x)&0x1) << 30)/* userqueue only */
>
> +#ifndef PAGE_SIZE
> +#define PAGE_SIZE 4096
> +#endif
> +
> +#define USERMODE_QUEUE_SIZE (PAGE_SIZE * 256) // in bytes; total size is 1 MiB
> +#define ALIGNMENT 4096
> +#define DOORBELL_INDEX 4
> +#define USERMODE_QUEUE_SIZE_DW (USERMODE_QUEUE_SIZE >> 2)
> +#define USERMODE_QUEUE_SIZE_DW_MASK (USERMODE_QUEUE_SIZE_DW - 1)
> +
> +#define amdgpu_pkt_begin() uint32_t __num_dw_written = 0; \
> + uint32_t __ring_start = *ring_context->wptr_cpu & USERMODE_QUEUE_SIZE_DW_MASK;
> +
> +#define amdgpu_pkt_add_dw(value) do { \
> + *(ring_context->queue_cpu + \
> + ((__ring_start + __num_dw_written) & USERMODE_QUEUE_SIZE_DW_MASK)) \
> + = value; \
> + __num_dw_written++;\
> +} while (0)
> +
> +#define amdgpu_pkt_end() \
> + *ring_context->wptr_cpu += __num_dw_written
> +
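
These macros implement a small ring-buffer writer: amdgpu_pkt_begin()
snapshots the write position from *wptr_cpu (masked to dwords within the
queue), amdgpu_pkt_add_dw() stores each dword with wraparound, and
amdgpu_pkt_end() publishes the new position by advancing the shared wptr.
They expand in the caller's scope and assume a local named ring_context.
Illustrative expansion of a two-dword write, with dw0/dw1 as the values
written:

    uint32_t __num_dw_written = 0;
    uint32_t __ring_start = *ring_context->wptr_cpu & USERMODE_QUEUE_SIZE_DW_MASK;

    ring_context->queue_cpu[(__ring_start + __num_dw_written) & USERMODE_QUEUE_SIZE_DW_MASK] = dw0;
    __num_dw_written++;
    ring_context->queue_cpu[(__ring_start + __num_dw_written) & USERMODE_QUEUE_SIZE_DW_MASK] = dw1;
    __num_dw_written++;

    *ring_context->wptr_cpu += __num_dw_written; /* amdgpu_pkt_end() */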
> enum amd_ip_block_type {
> AMD_IP_GFX = 0,
> AMD_IP_COMPUTE,
> @@ -202,6 +227,10 @@ struct amdgpu_ip_funcs {
> int (*get_reg_offset)(enum general_reg reg);
> int (*wait_reg_mem)(const struct amdgpu_ip_funcs *func, const struct amdgpu_ring_context *context, uint32_t *pm4_dw);
>
> + /* userq functions */
> + void (*userq_create)(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt, unsigned int type);
> + void (*userq_submit)(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context, unsigned int ip_type, uint64_t mc_address);
> + void (*userq_destroy)(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt, unsigned int type);
> };
>
> extern const struct amdgpu_ip_block_version gfx_v6_0_ip_block;
> @@ -280,4 +309,15 @@ get_pci_addr_from_fd(int fd, struct pci_addr *pci);
>
> bool
> is_support_page_queue(enum amd_ip_block_type ip_type, const struct pci_addr *pci);
> +
> +int
> +amdgpu_bo_alloc_and_map_uq(amdgpu_device_handle device_handle, unsigned int size,
> + unsigned int alignment, unsigned int heap, uint64_t alloc_flags,
> + uint64_t mapping_flags, amdgpu_bo_handle *bo, void **cpu,
> + uint64_t *mc_address, amdgpu_va_handle *va_handle,
> + uint32_t timeline_syncobj_handle, uint64_t point);
> +
> +int
> +amdgpu_timeline_syncobj_wait(amdgpu_device_handle device_handle,
> + uint32_t timeline_syncobj_handle, uint64_t point);
> #endif
> diff --git a/lib/amdgpu/amd_memory.c b/lib/amdgpu/amd_memory.c
> index 882c5c18f..72ea3db6c 100644
> --- a/lib/amdgpu/amd_memory.c
> +++ b/lib/amdgpu/amd_memory.c
> @@ -25,8 +25,7 @@
>
> #include "amd_memory.h"
> #include "amd_PM4.h"
> -#include "amd_userq.h"
> -
> +#include "amd_ip_blocks.h"
> /**
> *
> * @param device_handle
> diff --git a/lib/amdgpu/amd_userq.c b/lib/amdgpu/amd_userq.c
> deleted file mode 100644
> index 43a2d10c5..000000000
> --- a/lib/amdgpu/amd_userq.c
> +++ /dev/null
> @@ -1,494 +0,0 @@
> -// SPDX-License-Identifier: MIT
> -/*
> - * Copyright 2025 Advanced Micro Devices, Inc.
> - */
> -
> -#include "amd_userq.h"
> -#include "amd_memory.h"
> -#include "amd_PM4.h"
> -#include "amd_sdma.h"
> -#include "ioctl_wrappers.h"
> -
> -#ifdef AMDGPU_USERQ_ENABLED
> -static void amdgpu_alloc_doorbell(amdgpu_device_handle device_handle,
> - struct amdgpu_userq_bo *doorbell_bo,
> - unsigned int size, unsigned int domain)
> -{
> - struct amdgpu_bo_alloc_request req = {0};
> - amdgpu_bo_handle buf_handle;
> - int r;
> -
> - req.alloc_size = ALIGN(size, PAGE_SIZE);
> - req.preferred_heap = domain;
> - r = amdgpu_bo_alloc(device_handle, &req, &buf_handle);
> - igt_assert_eq(r, 0);
> -
> - doorbell_bo->handle = buf_handle;
> - doorbell_bo->size = req.alloc_size;
> -
> - r = amdgpu_bo_cpu_map(doorbell_bo->handle,
> - (void **)&doorbell_bo->ptr);
> - igt_assert_eq(r, 0);
> -}
> -
> -int
> -amdgpu_bo_alloc_and_map_uq(amdgpu_device_handle device_handle, unsigned int size,
> - unsigned int alignment, unsigned int heap, uint64_t alloc_flags,
> - uint64_t mapping_flags, amdgpu_bo_handle *bo, void **cpu,
> - uint64_t *mc_address, amdgpu_va_handle *va_handle,
> - uint32_t timeline_syncobj_handle, uint64_t point)
> -{
> - struct amdgpu_bo_alloc_request request = {};
> - amdgpu_bo_handle buf_handle;
> - uint64_t vmc_addr;
> - int r;
> -
> - request.alloc_size = size;
> - request.phys_alignment = alignment;
> - request.preferred_heap = heap;
> - request.flags = alloc_flags;
> -
> - r = amdgpu_bo_alloc(device_handle, &request, &buf_handle);
> - if (r)
> - return r;
> -
> - r = amdgpu_va_range_alloc(device_handle,
> - amdgpu_gpu_va_range_general,
> - size, alignment, 0, &vmc_addr,
> - va_handle, 0);
> - if (r)
> - goto error_va_alloc;
> -
> - r = amdgpu_bo_va_op_raw2(device_handle, buf_handle, 0,
> - ALIGN(size, getpagesize()), vmc_addr,
> - AMDGPU_VM_PAGE_READABLE |
> - AMDGPU_VM_PAGE_WRITEABLE |
> - AMDGPU_VM_PAGE_EXECUTABLE |
> - mapping_flags,
> - AMDGPU_VA_OP_MAP,
> - timeline_syncobj_handle,
> - point, 0, 0);
> - if (r)
> - goto error_va_map;
> -
> - if (cpu) {
> - r = amdgpu_bo_cpu_map(buf_handle, cpu);
> - if (r)
> - goto error_cpu_map;
> - }
> -
> - *bo = buf_handle;
> - *mc_address = vmc_addr;
> -
> - return 0;
> -
> -error_cpu_map:
> - amdgpu_bo_va_op(buf_handle, 0, size, vmc_addr, 0, AMDGPU_VA_OP_UNMAP);
> -error_va_map:
> - amdgpu_va_range_free(*va_handle);
> -error_va_alloc:
> - amdgpu_bo_free(buf_handle);
> - return r;
> -}
> -
> -static void amdgpu_bo_unmap_and_free_uq(amdgpu_device_handle device_handle,
> - amdgpu_bo_handle bo, amdgpu_va_handle va_handle,
> - uint64_t mc_addr, uint64_t size,
> - uint32_t timeline_syncobj_handle,
> - uint64_t point, uint64_t syncobj_handles_array,
> - uint32_t num_syncobj_handles)
> -{
> - amdgpu_bo_cpu_unmap(bo);
> - amdgpu_bo_va_op_raw2(device_handle, bo, 0, size, mc_addr, 0, AMDGPU_VA_OP_UNMAP,
> - timeline_syncobj_handle, point,
> - syncobj_handles_array, num_syncobj_handles);
> - amdgpu_va_range_free(va_handle);
> - amdgpu_bo_free(bo);
> -}
> -
> -int amdgpu_timeline_syncobj_wait(amdgpu_device_handle device_handle,
> - uint32_t timeline_syncobj_handle, uint64_t point)
> -{
> - uint32_t flags = DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED;
> - int r;
> -
> - r = amdgpu_cs_syncobj_query2(device_handle, &timeline_syncobj_handle,
> - &point, 1, flags);
> - if (r)
> - return r;
> -
> - r = amdgpu_cs_syncobj_timeline_wait(device_handle, &timeline_syncobj_handle,
> - &point, 1, INT64_MAX,
> - DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL |
> - DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
> - NULL);
> - if (r)
> - igt_warn("Timeline timed out\n");
> - return r;
> -}
> -
> -void amdgpu_user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context,
> - unsigned int ip_type, uint64_t mc_address)
> -{
> - int r;
> - uint32_t control = ring_context->pm4_dw;
> - uint32_t syncarray[1];
> - struct drm_amdgpu_userq_signal signal_data;
> - uint64_t timeout = ring_context->time_out ? ring_context->time_out : INT64_MAX;
> -
> - amdgpu_pkt_begin();
> -
> - if (ip_type == AMD_IP_DMA) {
> - /* For SDMA, we need to align the IB to 8 DW boundary */
> - unsigned int nop_count = (2 - lower_32_bits(*ring_context->wptr_cpu)) & 7;
> - for (unsigned int i = 0; i < nop_count; i++)
> - amdgpu_pkt_add_dw(SDMA_PKT_HEADER_OP(SDMA_NOP));
> - amdgpu_pkt_add_dw(SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT));
> - amdgpu_pkt_add_dw(lower_32_bits(mc_address) & 0xffffffe0); // 32-byte aligned
> - amdgpu_pkt_add_dw(upper_32_bits(mc_address));
> - amdgpu_pkt_add_dw(control); // IB length in DWORDS
> - amdgpu_pkt_add_dw(lower_32_bits(ring_context->csa.mc_addr)); // CSA MC address low
> - amdgpu_pkt_add_dw(upper_32_bits(ring_context->csa.mc_addr)); // CSA MC address high
> - if (ring_context->hw_ip_info.hw_ip_version_major <= 6)
> - amdgpu_pkt_add_dw(SDMA_PACKET(SDMA_OP_PROTECTED_FENCE, SDMA6_SUB_OP_PROTECTED_FENCE, 0));
> - else
> - amdgpu_pkt_add_dw(SDMA_PACKET(SDMA_OP_PROTECTED_FENCE, SDMA7_SUB_OP_PROTECTED_FENCE, 0));
> - } else {
> - /* Prepare the Indirect IB to submit the IB to user queue */
> - amdgpu_pkt_add_dw(PACKET3(PACKET3_INDIRECT_BUFFER, 2));
> - amdgpu_pkt_add_dw(lower_32_bits(mc_address));
> - amdgpu_pkt_add_dw(upper_32_bits(mc_address));
> -
> - if (ip_type == AMD_IP_GFX)
> - amdgpu_pkt_add_dw(control | S_3F3_INHERIT_VMID_MQD_GFX(1));
> - else
> - amdgpu_pkt_add_dw(control | S_3F3_VALID_COMPUTE(1)
> - | S_3F3_INHERIT_VMID_MQD_COMPUTE(1));
> -
> - amdgpu_pkt_add_dw(PACKET3(PACKET3_PROTECTED_FENCE_SIGNAL, 0));
> -
> - /* empty dword is needed for fence signal pm4 */
> - amdgpu_pkt_add_dw(0);
> - }
> -#if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
> - asm volatile ("mfence" : : : "memory");
> -#endif
> -
> - /* Below call update the wptr address so will wait till all writes are completed */
> - amdgpu_pkt_end();
> -
> -#if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
> - asm volatile ("mfence" : : : "memory");
> -#endif
> -
> - if (ip_type == AMD_IP_DMA)
> - *ring_context->wptr_cpu = *ring_context->wptr_cpu <<2;
> - /* Update the door bell */
> - ring_context->doorbell_cpu[DOORBELL_INDEX] = *ring_context->wptr_cpu;
> -
> - /* Add a fence packet for signal */
> - syncarray[0] = ring_context->timeline_syncobj_handle;
> - signal_data.queue_id = ring_context->queue_id;
> - signal_data.syncobj_handles = (uintptr_t)syncarray;
> - signal_data.num_syncobj_handles = 1;
> - signal_data.bo_read_handles = 0;
> - signal_data.bo_write_handles = 0;
> - signal_data.num_bo_read_handles = 0;
> - signal_data.num_bo_write_handles = 0;
> -
> - r = amdgpu_userq_signal(device, &signal_data);
> - igt_assert_eq(r, 0);
> -
> - r = amdgpu_cs_syncobj_wait(device, &ring_context->timeline_syncobj_handle, 1, timeout,
> - DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
> - igt_assert_eq(r, 0);
> -}
> -
> -void amdgpu_user_queue_destroy(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt,
> - unsigned int type)
> -{
> - int r;
> -
> - if (type > AMD_IP_DMA) {
> - igt_info("Invalid IP not supported for UMQ Submission\n");
> - return;
> - }
> -
> - /* Free the Usermode Queue */
> - r = amdgpu_free_userqueue(device_handle, ctxt->queue_id);
> - igt_assert_eq(r, 0);
> -
> - switch (type) {
> - case AMD_IP_GFX:
> - amdgpu_bo_unmap_and_free_uq(device_handle, ctxt->csa.handle,
> - ctxt->csa.va_handle,
> - ctxt->csa.mc_addr, ctxt->info.gfx.csa_size,
> - ctxt->timeline_syncobj_handle, ++ctxt->point,
> - 0, 0);
> -
> - amdgpu_bo_unmap_and_free_uq(device_handle, ctxt->shadow.handle,
> - ctxt->shadow.va_handle,
> - ctxt->shadow.mc_addr, ctxt->info.gfx.shadow_size,
> - ctxt->timeline_syncobj_handle, ++ctxt->point,
> - 0, 0);
> -
> - r = amdgpu_timeline_syncobj_wait(device_handle, ctxt->timeline_syncobj_handle,
> - ctxt->point);
> - igt_assert_eq(r, 0);
> - break;
> -
> - case AMD_IP_COMPUTE:
> - amdgpu_bo_unmap_and_free_uq(device_handle, ctxt->eop.handle,
> - ctxt->eop.va_handle,
> - ctxt->eop.mc_addr, 256,
> - ctxt->timeline_syncobj_handle, ++ctxt->point,
> - 0, 0);
> -
> - r = amdgpu_timeline_syncobj_wait(device_handle, ctxt->timeline_syncobj_handle,
> - ctxt->point);
> - igt_assert_eq(r, 0);
> - break;
> -
> - case AMD_IP_DMA:
> - amdgpu_bo_unmap_and_free_uq(device_handle, ctxt->csa.handle,
> - ctxt->csa.va_handle,
> - ctxt->csa.mc_addr, ctxt->info.gfx.csa_size,
> - ctxt->timeline_syncobj_handle, ++ctxt->point,
> - 0, 0);
> -
> - r = amdgpu_timeline_syncobj_wait(device_handle, ctxt->timeline_syncobj_handle,
> - ctxt->point);
> - igt_assert_eq(r, 0);
> - break;
> -
> - default:
> - igt_info("IP invalid for cleanup\n");
> - }
> -
> - r = amdgpu_cs_destroy_syncobj(device_handle, ctxt->timeline_syncobj_handle);
> - igt_assert_eq(r, 0);
> -
> - /* Clean up doorbell*/
> - r = amdgpu_bo_cpu_unmap(ctxt->doorbell.handle);
> - igt_assert_eq(r, 0);
> -
> - r = amdgpu_bo_free(ctxt->doorbell.handle);
> - igt_assert_eq(r, 0);
> -
> - /* Clean up rptr wptr queue */
> - amdgpu_bo_unmap_and_free(ctxt->rptr.handle, ctxt->rptr.va_handle,
> - ctxt->rptr.mc_addr, 8);
> -
> - amdgpu_bo_unmap_and_free(ctxt->wptr.handle, ctxt->wptr.va_handle,
> - ctxt->wptr.mc_addr, 8);
> -
> - amdgpu_bo_unmap_and_free(ctxt->queue.handle, ctxt->queue.va_handle,
> - ctxt->queue.mc_addr, USERMODE_QUEUE_SIZE);
> -}
> -
> -void amdgpu_user_queue_create(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt,
> - unsigned int type)
> -{
> - int r;
> - uint64_t gtt_flags = 0, queue_flags = 0;
> - struct drm_amdgpu_userq_mqd_gfx11 gfx_mqd;
> - struct drm_amdgpu_userq_mqd_sdma_gfx11 sdma_mqd;
> - struct drm_amdgpu_userq_mqd_compute_gfx11 compute_mqd;
> - void *mqd;
> -
> - if (type > AMD_IP_DMA) {
> - igt_info("Invalid IP not supported for UMQ Submission\n");
> - return;
> - }
> -
> - if (ctxt->secure) {
> - gtt_flags |= AMDGPU_GEM_CREATE_ENCRYPTED;
> - queue_flags |= AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE;
> - }
> -
> - if (ctxt->priority)
> - queue_flags |= ctxt->priority & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK;
> -
> - r = amdgpu_query_uq_fw_area_info(device_handle, AMD_IP_GFX, 0, &ctxt->info);
> - igt_assert_eq(r, 0);
> -
> - r = amdgpu_cs_create_syncobj2(device_handle, 0, &ctxt->timeline_syncobj_handle);
> - igt_assert_eq(r, 0);
> -
> - r = amdgpu_bo_alloc_and_map_uq(device_handle, USERMODE_QUEUE_SIZE,
> - ALIGNMENT,
> - AMDGPU_GEM_DOMAIN_GTT,
> - gtt_flags,
> - AMDGPU_VM_MTYPE_UC,
> - &ctxt->queue.handle, &ctxt->queue.ptr,
> - &ctxt->queue.mc_addr, &ctxt->queue.va_handle,
> - ctxt->timeline_syncobj_handle, ++ctxt->point);
> - igt_assert_eq(r, 0);
> -
> - r = amdgpu_bo_alloc_and_map_uq(device_handle, 8,
> - ALIGNMENT,
> - AMDGPU_GEM_DOMAIN_GTT,
> - gtt_flags,
> - AMDGPU_VM_MTYPE_UC,
> - &ctxt->wptr.handle, &ctxt->wptr.ptr,
> - &ctxt->wptr.mc_addr, &ctxt->wptr.va_handle,
> - ctxt->timeline_syncobj_handle, ++ctxt->point);
> - igt_assert_eq(r, 0);
> -
> - r = amdgpu_bo_alloc_and_map_uq(device_handle, 8,
> - ALIGNMENT,
> - AMDGPU_GEM_DOMAIN_GTT,
> - gtt_flags,
> - AMDGPU_VM_MTYPE_UC,
> - &ctxt->rptr.handle, &ctxt->rptr.ptr,
> - &ctxt->rptr.mc_addr, &ctxt->rptr.va_handle,
> - ctxt->timeline_syncobj_handle, ++ctxt->point);
> - igt_assert_eq(r, 0);
> -
> - switch (type) {
> - case AMD_IP_GFX:
> - r = amdgpu_bo_alloc_and_map_uq(device_handle, ctxt->info.gfx.shadow_size,
> - ctxt->info.gfx.shadow_alignment,
> - AMDGPU_GEM_DOMAIN_GTT,
> - gtt_flags,
> - AMDGPU_VM_MTYPE_UC,
> - &ctxt->shadow.handle, NULL,
> - &ctxt->shadow.mc_addr, &ctxt->shadow.va_handle,
> - ctxt->timeline_syncobj_handle, ++ctxt->point);
> - igt_assert_eq(r, 0);
> -
> - r = amdgpu_bo_alloc_and_map_uq(device_handle, ctxt->info.gfx.csa_size,
> - ctxt->info.gfx.csa_alignment,
> - AMDGPU_GEM_DOMAIN_GTT,
> - gtt_flags,
> - AMDGPU_VM_MTYPE_UC,
> - &ctxt->csa.handle, NULL,
> - &ctxt->csa.mc_addr, &ctxt->csa.va_handle,
> - ctxt->timeline_syncobj_handle, ++ctxt->point);
> - igt_assert_eq(r, 0);
> -
> - gfx_mqd.shadow_va = ctxt->shadow.mc_addr;
> - gfx_mqd.csa_va = ctxt->csa.mc_addr;
> - mqd = &gfx_mqd;
> - break;
> -
> - case AMD_IP_COMPUTE:
> - r = amdgpu_bo_alloc_and_map_uq(device_handle, 256,
> - ALIGNMENT,
> - AMDGPU_GEM_DOMAIN_GTT,
> - gtt_flags,
> - AMDGPU_VM_MTYPE_UC,
> - &ctxt->eop.handle, NULL,
> - &ctxt->eop.mc_addr, &ctxt->eop.va_handle,
> - ctxt->timeline_syncobj_handle, ++ctxt->point);
> - igt_assert_eq(r, 0);
> - compute_mqd.eop_va = ctxt->eop.mc_addr;
> - mqd = &compute_mqd;
> - break;
> -
> - case AMD_IP_DMA:
> - r = amdgpu_bo_alloc_and_map_uq(device_handle, ctxt->info.gfx.csa_size,
> - ctxt->info.gfx.csa_alignment,
> - AMDGPU_GEM_DOMAIN_GTT,
> - gtt_flags,
> - AMDGPU_VM_MTYPE_UC,
> - &ctxt->csa.handle, NULL,
> - &ctxt->csa.mc_addr, &ctxt->csa.va_handle,
> - ctxt->timeline_syncobj_handle, ++ctxt->point);
> - igt_assert_eq(r, 0);
> - sdma_mqd.csa_va = ctxt->csa.mc_addr;
> - mqd = &sdma_mqd;
> - break;
> -
> - default:
> - igt_info("Unsupported IP for UMQ submission\n");
> - return;
> -
> - }
> -
> - r = amdgpu_timeline_syncobj_wait(device_handle, ctxt->timeline_syncobj_handle,
> - ctxt->point);
> - igt_assert_eq(r, 0);
> -
> - amdgpu_alloc_doorbell(device_handle, &ctxt->doorbell, PAGE_SIZE,
> - AMDGPU_GEM_DOMAIN_DOORBELL);
> -
> - ctxt->doorbell_cpu = (uint64_t *)ctxt->doorbell.ptr;
> -
> - ctxt->wptr_cpu = (uint64_t *)ctxt->wptr.ptr;
> - ctxt->rptr_cpu = (uint64_t *)ctxt->rptr.ptr;
> -
> - ctxt->queue_cpu = (uint32_t *)ctxt->queue.ptr;
> - memset(ctxt->queue_cpu, 0, USERMODE_QUEUE_SIZE);
> -
> - /* get db bo handle */
> - amdgpu_bo_export(ctxt->doorbell.handle, amdgpu_bo_handle_type_kms, &ctxt->db_handle);
> -
> - /* Create the Usermode Queue */
> - switch (type) {
> - case AMD_IP_GFX:
> - r = amdgpu_create_userqueue(device_handle, AMDGPU_HW_IP_GFX,
> - ctxt->db_handle, DOORBELL_INDEX,
> - ctxt->queue.mc_addr, USERMODE_QUEUE_SIZE,
> - ctxt->wptr.mc_addr, ctxt->rptr.mc_addr,
> - mqd, queue_flags, &ctxt->queue_id);
> - igt_assert_eq(r, 0);
> - break;
> -
> - case AMD_IP_COMPUTE:
> - r = amdgpu_create_userqueue(device_handle, AMDGPU_HW_IP_COMPUTE,
> - ctxt->db_handle, DOORBELL_INDEX,
> - ctxt->queue.mc_addr, USERMODE_QUEUE_SIZE,
> - ctxt->wptr.mc_addr, ctxt->rptr.mc_addr,
> - mqd, queue_flags, &ctxt->queue_id);
> - igt_assert_eq(r, 0);
> - break;
> -
> - case AMD_IP_DMA:
> - r = amdgpu_create_userqueue(device_handle, AMDGPU_HW_IP_DMA,
> - ctxt->db_handle, DOORBELL_INDEX,
> - ctxt->queue.mc_addr, USERMODE_QUEUE_SIZE,
> - ctxt->wptr.mc_addr, ctxt->rptr.mc_addr,
> - mqd, queue_flags, &ctxt->queue_id);
> - igt_assert_eq(r, 0);
> - break;
> -
> - default:
> - igt_info("Unsupported IP, failed to create user queue\n");
> - return;
> -
> - }
> -}
> -#else
> -int
> -amdgpu_bo_alloc_and_map_uq(amdgpu_device_handle device_handle, unsigned int size,
> - unsigned int alignment, unsigned int heap, uint64_t alloc_flags,
> - uint64_t mapping_flags, amdgpu_bo_handle *bo, void **cpu,
> - uint64_t *mc_address, amdgpu_va_handle *va_handle,
> - uint32_t timeline_syncobj_handle, uint64_t point)
> -{
> - return 0;
> -}
> -
> -int amdgpu_timeline_syncobj_wait(amdgpu_device_handle device_handle,
> - uint32_t timeline_syncobj_handle, uint64_t point)
> -{
> - return 0;
> -}
> -
> -void amdgpu_user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context,
> - unsigned int ip_type, uint64_t mc_address)
> -{
> -}
> -
> -void amdgpu_user_queue_destroy(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt,
> - unsigned int type)
> -{
> -}
> -
> -void amdgpu_user_queue_create(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt,
> - unsigned int type)
> -{
> -}
> -
> -#endif
> diff --git a/lib/amdgpu/amd_userq.h b/lib/amdgpu/amd_userq.h
> deleted file mode 100644
> index b29e97ccf..000000000
> --- a/lib/amdgpu/amd_userq.h
> +++ /dev/null
> @@ -1,55 +0,0 @@
> -/* SPDX-License-Identifier: MIT
> - * Copyright 2025 Advanced Micro Devices, Inc.
> - */
> -
> -#ifndef _AMD_USER_QUEUE_
> -#define _AMD_USER_QUEUE_
> -
> -#include <amdgpu_drm.h>
> -#include <amdgpu.h>
> -#include <time.h>
> -#include "amd_ip_blocks.h"
> -
> -
> -#ifndef PAGE_SIZE
> -#define PAGE_SIZE 4096
> -#endif
> -
> -#define USERMODE_QUEUE_SIZE (PAGE_SIZE * 256) //In bytes with total size as 1 Mbyte
> -#define ALIGNMENT 4096
> -#define DOORBELL_INDEX 4
> -#define USERMODE_QUEUE_SIZE_DW (USERMODE_QUEUE_SIZE >> 2)
> -#define USERMODE_QUEUE_SIZE_DW_MASK (USERMODE_QUEUE_SIZE_DW - 1)
> -
> -#define amdgpu_pkt_begin() uint32_t __num_dw_written = 0; \
> - uint32_t __ring_start = *ring_context->wptr_cpu & USERMODE_QUEUE_SIZE_DW_MASK;
> -
> -#define amdgpu_pkt_add_dw(value) do { \
> - *(ring_context->queue_cpu + \
> - ((__ring_start + __num_dw_written) & USERMODE_QUEUE_SIZE_DW_MASK)) \
> - = value; \
> - __num_dw_written++;\
> -} while (0)
> -
> -#define amdgpu_pkt_end() \
> - *ring_context->wptr_cpu += __num_dw_written
> -
> -int amdgpu_bo_alloc_and_map_uq(amdgpu_device_handle device_handle, unsigned int size,
> - unsigned int alignment, unsigned int heap, uint64_t alloc_flags,
> - uint64_t mapping_flags, amdgpu_bo_handle *bo, void **cpu,
> - uint64_t *mc_address, amdgpu_va_handle *va_handle,
> - uint32_t timeline_syncobj_handle, uint64_t point);
> -
> -int amdgpu_timeline_syncobj_wait(amdgpu_device_handle device_handle,
> - uint32_t timeline_syncobj_handle, uint64_t point);
> -
> -void amdgpu_user_queue_create(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt,
> - unsigned int ip_type);
> -
> -void amdgpu_user_queue_destroy(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt,
> - unsigned int ip_type);
> -
> -void amdgpu_user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context,
> - unsigned int ip_type, uint64_t mc_address);
> -
> -#endif
> diff --git a/lib/meson.build b/lib/meson.build
> index 5e4247aad..b0505190b 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -170,8 +170,7 @@ if libdrm_amdgpu.found()
> 'amdgpu/amd_mem_leak.c',
> 'amdgpu/amd_mmd_shared.c',
> 'amdgpu/amd_jpeg_shared.c',
> - 'amdgpu/amd_vcn_shared.c',
> - 'amdgpu/amd_userq.c'
> + 'amdgpu/amd_vcn_shared.c'
> ]
> if libdrm_amdgpu.version().version_compare('> 2.4.99')
> lib_sources +=[ 'amdgpu/amd_dispatch.c',]
> diff --git a/tests/amdgpu/amd_basic.c b/tests/amdgpu/amd_basic.c
> index 04e8d62e5..8c6b466ce 100644
> --- a/tests/amdgpu/amd_basic.c
> +++ b/tests/amdgpu/amd_basic.c
> @@ -13,7 +13,6 @@
> #include "lib/amdgpu/amd_gfx.h"
> #include "lib/amdgpu/amd_shaders.h"
> #include "lib/amdgpu/amd_dispatch.h"
> -#include "lib/amdgpu/amd_userq.h"
>
> #define BUFFER_SIZE (8 * 1024)
>
> @@ -509,7 +508,7 @@ amdgpu_sync_dependency_test(amdgpu_device_handle device_handle, bool user_queue)
> igt_assert(ring_context);
>
> if (user_queue) {
> - amdgpu_user_queue_create(device_handle, ring_context, ip_block->type);
> + ip_block->funcs->userq_create(device_handle, ring_context, ip_block->type);
> } else {
> r = amdgpu_cs_ctx_create(device_handle, &context_handle[0]);
> igt_assert_eq(r, 0);
> @@ -608,7 +607,7 @@ amdgpu_sync_dependency_test(amdgpu_device_handle device_handle, bool user_queue)
>
> if (user_queue) {
> ring_context->pm4_dw = ib_info.size;
> - amdgpu_user_queue_submit(device_handle, ring_context, ip_block->type,
> + ip_block->funcs->userq_submit(device_handle, ring_context, ip_block->type,
> ib_result_mc_address);
> } else {
> r = amdgpu_cs_submit(context_handle[1], 0, &ibs_request, 1);
> @@ -648,7 +647,7 @@ amdgpu_sync_dependency_test(amdgpu_device_handle device_handle, bool user_queue)
>
> if (user_queue) {
> ring_context->pm4_dw = ib_info.size;
> - amdgpu_user_queue_submit(device_handle, ring_context, ip_block->type,
> + ip_block->funcs->userq_submit(device_handle, ring_context, ip_block->type,
> ib_info.ib_mc_address);
> } else {
> r = amdgpu_cs_submit(context_handle[0], 0, &ibs_request, 1);
> @@ -680,7 +679,7 @@ amdgpu_sync_dependency_test(amdgpu_device_handle device_handle, bool user_queue)
> ib_result_mc_address, const_alignment);
>
> if (user_queue) {
> - amdgpu_user_queue_destroy(device_handle, ring_context, ip_block->type);
> + ip_block->funcs->userq_destroy(device_handle, ring_context, ip_block->type);
> } else {
> amdgpu_cs_ctx_free(context_handle[0]);
> amdgpu_cs_ctx_free(context_handle[1]);
> diff --git a/tests/amdgpu/amd_cs_nop.c b/tests/amdgpu/amd_cs_nop.c
> index 96a385413..0f2c33168 100644
> --- a/tests/amdgpu/amd_cs_nop.c
> +++ b/tests/amdgpu/amd_cs_nop.c
> @@ -12,7 +12,6 @@
> #include "lib/amdgpu/amd_PM4.h"
> #include "lib/amdgpu/amd_ip_blocks.h"
> #include "lib/amdgpu/amd_memory.h"
> -#include "lib/amdgpu/amd_userq.h"
>
> static void amdgpu_cs_sync(amdgpu_context_handle context,
> unsigned int ip_type,
> @@ -54,12 +53,14 @@ static void nop_cs(amdgpu_device_handle device,
> amdgpu_bo_list_handle bo_list;
> amdgpu_va_handle va_handle;
> struct amdgpu_ring_context *ring_context;
> + const struct amdgpu_ip_block_version *ip_block = NULL;
>
> + ip_block = get_ip_block(device, ip_type);
> ring_context = calloc(1, sizeof(*ring_context));
> igt_assert(ring_context);
>
> if (user_queue)
> - amdgpu_user_queue_create(device, ring_context, ip_type);
> + ip_block->funcs->userq_create(device, ring_context, ip_type);
>
> r = amdgpu_bo_alloc_and_map_sync(device, 4096, 4096,
> AMDGPU_GEM_DOMAIN_GTT, 0, AMDGPU_VM_MTYPE_UC,
> @@ -107,7 +108,7 @@ static void nop_cs(amdgpu_device_handle device,
> igt_until_timeout(timeout) {
> if (user_queue) {
> ring_context->pm4_dw = ib_info.size;
> - amdgpu_user_queue_submit(device, ring_context, ip_type,
> + ip_block->funcs->userq_submit(device, ring_context, ip_type,
> ib_info.ib_mc_address);
> igt_assert_eq(r, 0);
> } else {
> @@ -140,7 +141,7 @@ static void nop_cs(amdgpu_device_handle device,
> amdgpu_bo_unmap_and_free(ib_result_handle, va_handle,
> ib_result_mc_address, 4096);
> if (user_queue)
> - amdgpu_user_queue_destroy(device, ring_context, ip_type);
> + ip_block->funcs->userq_destroy(device, ring_context, ip_type);
>
> free(ring_context);
> }
> diff --git a/tests/amdgpu/amd_deadlock.c b/tests/amdgpu/amd_deadlock.c
> index 19dff1dc0..3456f42c8 100644
> --- a/tests/amdgpu/amd_deadlock.c
> +++ b/tests/amdgpu/amd_deadlock.c
> @@ -9,7 +9,6 @@
> #include "lib/amdgpu/amd_command_submission.h"
> #include "lib/amdgpu/amd_deadlock_helpers.h"
> #include "lib/amdgpu/amdgpu_asic_addr.h"
> -#include "lib/amdgpu/amd_userq.h"
>
> #define AMDGPU_FAMILY_SI 110 /* Hainan, Oland, Verde, Pitcairn, Tahiti */
> #define AMDGPU_FAMILY_CI 120 /* Bonaire, Hawaii */