[PATCH i-g-t 2/3] lib/amdgpu: Implement AQL packet submission support
Jesse.Zhang
Jesse.Zhang at amd.com
Fri Apr 25 03:31:13 UTC 2025
This patch adds:
1. A new `amd_hsa.h` header defining AQL packet types and structures
2. Extended `amdgpu_ring_context` to support AQL queues
3. New `amdgpu_aql_queue_submit()` for AQL packet submission
4. Memory barriers (`mfence`) for proper CPU-GPU synchronization
5. Signal handling for AQL packet completion
The implementation initially supports vendor-specific AQL packets, with
room for future expansion to other HSA packet types.
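For example, a test can exercise the new path through the extended
copy-linear helper; a minimal sketch (an AQL submission still runs on
a user-mode queue, so both flags are set):

    /* Illustrative call, assuming `device` and `ip_block` are already
     * set up by the test fixture: user_queue = true selects the
     * user-mode queue path, and aql_queue = true makes the helper
     * create the queue with the AQL flag and submit through
     * amdgpu_aql_queue_submit().
     */
    amdgpu_command_submission_copy_linear_helper(device, ip_block,
                                                 true, true);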
Signed-off-by: Jesse.Zhang <Jesse.Zhang at amd.com>
---
lib/amdgpu/amd_command_submission.c | 13 ++-
lib/amdgpu/amd_command_submission.h | 2 +-
lib/amdgpu/amd_hsa.h | 157 ++++++++++++++++++++++++++++
lib/amdgpu/amd_ip_blocks.h | 11 +-
lib/amdgpu/amd_user_queue.c | 95 ++++++++++++++++-
lib/amdgpu/amd_user_queue.h | 2 +
6 files changed, 274 insertions(+), 6 deletions(-)
create mode 100644 lib/amdgpu/amd_hsa.h
diff --git a/lib/amdgpu/amd_command_submission.c b/lib/amdgpu/amd_command_submission.c
index 7550fa8bc..54eb2b0f8 100644
--- a/lib/amdgpu/amd_command_submission.c
+++ b/lib/amdgpu/amd_command_submission.c
@@ -67,7 +67,9 @@ int amdgpu_test_exec_cs_helper(amdgpu_device_handle device, unsigned int ip_type
ring_ptr = ib_result_cpu;
memcpy(ring_ptr, ring_context->pm4, ring_context->pm4_dw * sizeof(*ring_context->pm4));
- if (user_queue)
+ if (ring_context->aql_queue)
+ amdgpu_aql_queue_submit(device, ring_context, ip_type, ib_result_mc_address);
+ else if (user_queue)
amdgpu_user_queue_submit(device, ring_context, ip_type, ib_result_mc_address);
else {
ring_context->ib_info.ib_mc_address = ib_result_mc_address;
@@ -356,7 +358,7 @@ void amdgpu_command_submission_const_fill_helper(amdgpu_device_handle device,
*/
void amdgpu_command_submission_copy_linear_helper(amdgpu_device_handle device,
const struct amdgpu_ip_block_version *ip_block,
- bool user_queue)
+ bool user_queue, bool aql_queue)
{
const int sdma_write_length = 1024;
const int pm4_dw = 256;
@@ -372,13 +374,18 @@ void amdgpu_command_submission_copy_linear_helper(amdgpu_device_handle device,
ring_context->pm4 = calloc(pm4_dw, sizeof(*ring_context->pm4));
ring_context->secure = false;
ring_context->pm4_size = pm4_dw;
+ ring_context->aql_size = pm4_dw;
ring_context->res_cnt = 2;
ring_context->user_queue = user_queue;
+ if (aql_queue) {
+ ring_context->aql_queue = true;
+ ring_context->aql_type = HSA_PACKET_TYPE_VENDOR_SPECIFIC;
+ }
igt_assert(ring_context->pm4);
+
r = amdgpu_query_hw_ip_info(device, ip_block->type, 0, &ring_context->hw_ip_info);
igt_assert_eq(r, 0);
-
if (user_queue) {
amdgpu_user_queue_create(device, ring_context, ip_block->type);
} else {
diff --git a/lib/amdgpu/amd_command_submission.h b/lib/amdgpu/amd_command_submission.h
index d0139b364..f51b777e1 100644
--- a/lib/amdgpu/amd_command_submission.h
+++ b/lib/amdgpu/amd_command_submission.h
@@ -42,5 +42,5 @@ void amdgpu_command_submission_const_fill_helper(amdgpu_device_handle device,
void amdgpu_command_submission_copy_linear_helper(amdgpu_device_handle device,
const struct amdgpu_ip_block_version *ip_block,
- bool user_queue);
+ bool user_queue, bool aql_queue);
#endif
diff --git a/lib/amdgpu/amd_hsa.h b/lib/amdgpu/amd_hsa.h
new file mode 100644
index 000000000..3a5332806
--- /dev/null
+++ b/lib/amdgpu/amd_hsa.h
@@ -0,0 +1,157 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_H_
+#define AMD_HSA_H_
+
+
+/**
+ * @brief AQL Packet type.
+ */
+typedef enum {
+ /**
+ * Vendor-specific packet.
+ */
+ HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0,
+ /**
+ * The packet has been processed in the past, but has not been reassigned to
+ * the packet processor. A packet processor must not process a packet of this
+ * type. All queues support this packet type.
+ */
+ HSA_PACKET_TYPE_INVALID = 1,
+ /**
+ * Packet used by agents for dispatching jobs to kernel agents. Not all
+ * queues support packets of this type (see ::hsa_queue_feature_t).
+ */
+ HSA_PACKET_TYPE_KERNEL_DISPATCH = 2,
+ /**
+ * Packet used by agents to delay processing of subsequent packets, and to
+ * express complex dependencies between multiple packets. All queues support
+ * this packet type.
+ */
+ HSA_PACKET_TYPE_BARRIER_AND = 3,
+ /**
+ * Packet used by agents for dispatching jobs to agents. Not all
+ * queues support packets of this type (see ::hsa_queue_feature_t).
+ */
+ HSA_PACKET_TYPE_AGENT_DISPATCH = 4,
+ /**
+ * Packet used by agents to delay processing of subsequent packets, and to
+ * express complex dependencies between multiple packets. All queues support
+ * this packet type.
+ */
+ HSA_PACKET_TYPE_BARRIER_OR = 5
+} hsa_packet_type_t;
+
+/**
+ * @brief Sub-fields of the @a header field that is present in any AQL
+ * packet. The offset (with respect to the address of @a header) of a sub-field
+ * is identical to its enumeration constant. The width of each sub-field is
+ * determined by the corresponding value in ::hsa_packet_header_width_t. The
+ * offset and the width are expressed in bits.
+ */
+ typedef enum {
+ /**
+ * Packet type. The value of this sub-field must be one of
+ * ::hsa_packet_type_t. If the type is ::HSA_PACKET_TYPE_VENDOR_SPECIFIC, the
+ * packet layout is vendor-specific.
+ */
+ HSA_PACKET_HEADER_TYPE = 0,
+ /**
+ * Barrier bit. If the barrier bit is set, the processing of the current
+ * packet only launches when all preceding packets (within the same queue) are
+ * complete.
+ */
+ HSA_PACKET_HEADER_BARRIER = 8,
+ /**
+ * Acquire fence scope. The value of this sub-field determines the scope and
+ * type of the memory fence operation applied before the packet enters the
+ * active phase. An acquire fence ensures that any subsequent global segment
+ * or image loads by any unit of execution that belongs to a dispatch that has
+ * not yet entered the active phase on any queue of the same kernel agent,
+ * sees any data previously released at the scopes specified by the acquire
+ * fence. The value of this sub-field must be one of ::hsa_fence_scope_t.
+ */
+ HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9,
+ /**
+ * @deprecated Renamed as ::HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE.
+ */
+ HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9,
+ /**
+ * Release fence scope. The value of this sub-field determines the scope and
+ * type of the memory fence operation applied after kernel completion but
+ * before the packet is completed. A release fence makes any global segment or
+ * image data that was stored by any unit of execution that belonged to a
+ * dispatch that has completed the active phase on any queue of the same
+ * kernel agent visible in all the scopes specified by the release fence. The
+ * value of this sub-field must be one of ::hsa_fence_scope_t.
+ */
+ HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11,
+ /**
+ * @deprecated Renamed as ::HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE.
+ */
+ HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11
+ } hsa_packet_header_t;
+
+/**
+ * @brief Scope of the memory fence operation associated with a packet.
+ */
+typedef enum {
+ /**
+ * No scope (no fence is applied). The packet relies on external fences to
+ * ensure visibility of memory updates.
+ */
+ HSA_FENCE_SCOPE_NONE = 0,
+ /**
+ * The fence is applied with agent scope for the global segment.
+ */
+ HSA_FENCE_SCOPE_AGENT = 1,
+ /**
+ * The fence is applied across both agent and system scope for the global
+ * segment.
+ */
+ HSA_FENCE_SCOPE_SYSTEM = 2
+} hsa_fence_scope_t;
+
+#define AMD_AQL_FORMAT_PM4_IB 0x1
+
+#endif /* AMD_HSA_H_ */
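For reference, the packing implied by the sub-field offsets above is
what amdgpu_aql_queue_submit() does further down; a minimal sketch
(the helper name here is illustrative, not part of the patch):

    /* Pack the 16-bit AQL header plus the AMD vendor format field into
     * the packet's first dword. Offsets come from hsa_packet_header_t;
     * values come from hsa_packet_type_t and hsa_fence_scope_t.
     */
    static uint32_t aql_pm4_ib_header(uint32_t acquire_fence,
                                      uint32_t release_fence)
    {
        return HSA_PACKET_TYPE_VENDOR_SPECIFIC |                            /* bits 7:0 */
               (acquire_fence << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | /* bits 10:9 */
               (release_fence << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE) | /* bits 12:11 */
               (AMD_AQL_FORMAT_PM4_IB << 16);                               /* vendor format */
    }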
diff --git a/lib/amdgpu/amd_ip_blocks.h b/lib/amdgpu/amd_ip_blocks.h
index 7d48f9107..32032402d 100644
--- a/lib/amdgpu/amd_ip_blocks.h
+++ b/lib/amdgpu/amd_ip_blocks.h
@@ -18,6 +18,7 @@
#include "amd_registers.h"
#include "amd_family.h"
+#include "amd_hsa.h"
#define MAX_CARDS_SUPPORTED 4
@@ -164,10 +165,13 @@ struct amdgpu_ring_context {
struct amdgpu_userq_bo wptr;
struct amdgpu_userq_bo csa;
struct amdgpu_userq_bo eop;
+ struct amdgpu_userq_bo signal;
uint32_t *queue_cpu;
- uint64_t *wptr_cpu;
+ volatile uint64_t *wptr_cpu;
+ volatile uint64_t *rptr_cpu;
uint64_t *doorbell_cpu;
+ uint64_t *signal_cpu;
uint32_t db_handle;
uint32_t queue_id;
@@ -176,6 +180,11 @@ struct amdgpu_ring_context {
uint64_t point;
bool user_queue;
+ uint32_t *aql; /* packet data */
+ uint32_t aql_size; /* max allocated packet size */
+ uint32_t aql_type; /* AQL packet type */
+ bool aql_queue; /* whether this is an AQL queue */
+
struct drm_amdgpu_info_uq_fw_areas info;
};
diff --git a/lib/amdgpu/amd_user_queue.c b/lib/amdgpu/amd_user_queue.c
index 444f9c022..6e32c41fd 100644
--- a/lib/amdgpu/amd_user_queue.c
+++ b/lib/amdgpu/amd_user_queue.c
@@ -7,6 +7,8 @@
#include "amd_memory.h"
#include "amd_PM4.h"
#include "ioctl_wrappers.h"
+#include <stdatomic.h>
+#include <stdio.h>
#ifdef AMDGPU_USERQ_ENABLED
static void amdgpu_alloc_doorbell(amdgpu_device_handle device_handle,
@@ -126,6 +128,79 @@ int amdgpu_timeline_syncobj_wait(amdgpu_device_handle device_handle,
return r;
}
+void amdgpu_aql_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context,
+ unsigned int ip_type, uint64_t mc_address)
+{
+ uint32_t acquire_fence = HSA_FENCE_SCOPE_NONE, release_fence = HSA_FENCE_SCOPE_SYSTEM;
+ uint32_t control = ring_context->pm4_dw;
+ uint32_t header;
+
+ amdgpu_pkt_begin();
+ switch (ring_context->aql_type) {
+ case HSA_PACKET_TYPE_VENDOR_SPECIFIC:
+ header = HSA_PACKET_TYPE_VENDOR_SPECIFIC |
+ (acquire_fence << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
+ (release_fence << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE) |
+ (AMD_AQL_FORMAT_PM4_IB << 16);
+ amdgpu_pkt_add_dw(header);
+ amdgpu_pkt_add_dw(PACKET3(PACKET3_INDIRECT_BUFFER, 2));
+ amdgpu_pkt_add_dw(lower_32_bits(mc_address));
+ amdgpu_pkt_add_dw(upper_32_bits(mc_address));
+
+ if (ip_type == AMD_IP_GFX)
+ amdgpu_pkt_add_dw(control | S_3F3_INHERIT_VMID_MQD_GFX(1));
+ else
+ amdgpu_pkt_add_dw(control | S_3F3_VALID_COMPUTE(1)
+ | S_3F3_INHERIT_VMID_MQD_COMPUTE(1));
+ /* remaining dws */
+ amdgpu_pkt_add_dw(0xa);
+ /* reserved */
+ amdgpu_pkt_add_dw(0x0);
+ amdgpu_pkt_add_dw(0x0);
+ amdgpu_pkt_add_dw(0x0);
+ amdgpu_pkt_add_dw(0x0);
+ amdgpu_pkt_add_dw(0x0);
+ amdgpu_pkt_add_dw(0x0);
+ amdgpu_pkt_add_dw(0x0);
+ amdgpu_pkt_add_dw(0x0);
+ /* COMPLETION_SIGNAL */
+ amdgpu_pkt_add_dw(lower_32_bits(ring_context->signal.mc_addr));
+ amdgpu_pkt_add_dw(upper_32_bits(ring_context->signal.mc_addr));
+ break;
+ case HSA_PACKET_TYPE_KERNEL_DISPATCH:
+ case HSA_PACKET_TYPE_AGENT_DISPATCH:
+ case HSA_PACKET_TYPE_BARRIER_AND:
+ case HSA_PACKET_TYPE_BARRIER_OR:
+ default:
+ break;
+ }
+
+ /* amdgpu_pkt_end() below updates the wptr, so make sure all packet writes have completed first */
+ #if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
+ asm volatile ("mfence" : : : "memory");
+ #endif
+ amdgpu_pkt_end();
+ #if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
+ asm volatile ("mfence" : : : "memory");
+ #endif
+
+ /* Ring the doorbell */
+ ring_context->doorbell_cpu[DOORBELL_INDEX] = *ring_context->wptr_cpu;
+
+ #if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
+ asm volatile ("mfence" : : : "memory");
+ #endif
+
+ /* Wait for the packet to be consumed.
+  * TODO: check the completion signal instead.
+  */
+ while ((*ring_context->wptr_cpu) != (*ring_context->rptr_cpu))
+ 	;
+
+ #if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
+ asm volatile ("mfence" : : : "memory");
+ #endif
+}
+
void amdgpu_user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context,
unsigned int ip_type, uint64_t mc_address)
{
@@ -134,7 +209,6 @@ void amdgpu_user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_co
uint32_t syncarray[1];
struct drm_amdgpu_userq_signal signal_data;
-
amdgpu_pkt_begin();
/* Prepare the Indirect IB to submit the IB to user queue */
amdgpu_pkt_add_dw(PACKET3(PACKET3_INDIRECT_BUFFER, 2));
@@ -264,6 +338,9 @@ void amdgpu_user_queue_destroy(amdgpu_device_handle device_handle, struct amdgpu
amdgpu_bo_unmap_and_free(ctxt->queue.handle, ctxt->queue.va_handle,
ctxt->queue.mc_addr, USERMODE_QUEUE_SIZE);
+
+ amdgpu_bo_unmap_and_free(ctxt->signal.handle, ctxt->signal.va_handle,
+ ctxt->signal.mc_addr, USERMODE_QUEUE_SIZE);
}
void amdgpu_user_queue_create(amdgpu_device_handle device_handle, struct amdgpu_ring_context *ctxt,
@@ -289,6 +366,9 @@ void amdgpu_user_queue_create(amdgpu_device_handle device_handle, struct amdgpu_
if (ctxt->priority)
queue_flags |= ctxt->priority & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK;
+ if (ctxt->aql_queue)
+ queue_flags |= AMDGPU_USERQ_CREATE_FLAGS_QUEUE_AQL_COMPUTE;
+
r = amdgpu_query_uq_fw_area_info(device_handle, AMD_IP_GFX, 0, &ctxt->info);
igt_assert_eq(r, 0);
@@ -325,6 +405,16 @@ void amdgpu_user_queue_create(amdgpu_device_handle device_handle, struct amdgpu_
ctxt->timeline_syncobj_handle, ++ctxt->point);
igt_assert_eq(r, 0);
+ r = amdgpu_bo_alloc_and_map_uq(device_handle, 8,
+ ALIGNMENT,
+ AMDGPU_GEM_DOMAIN_GTT,
+ gtt_flags,
+ AMDGPU_VM_MTYPE_UC,
+ &ctxt->signal.handle, &ctxt->signal.ptr,
+ &ctxt->signal.mc_addr, &ctxt->signal.va_handle,
+ ctxt->timeline_syncobj_handle, ++ctxt->point);
+ igt_assert_eq(r, 0);
+
switch (type) {
case AMD_IP_GFX:
r = amdgpu_bo_alloc_and_map_uq(device_handle, ctxt->info.gfx.shadow_size,
@@ -396,6 +486,9 @@ void amdgpu_user_queue_create(amdgpu_device_handle device_handle, struct amdgpu_
ctxt->doorbell_cpu = (uint64_t *)ctxt->doorbell.ptr;
ctxt->wptr_cpu = (uint64_t *)ctxt->wptr.ptr;
+ ctxt->rptr_cpu = (uint64_t *)ctxt->rptr.ptr;
+
+ ctxt->signal_cpu = (uint64_t *)ctxt->signal.ptr;
ctxt->queue_cpu = (uint32_t *)ctxt->queue.ptr;
memset(ctxt->queue_cpu, 0, USERMODE_QUEUE_SIZE);
diff --git a/lib/amdgpu/amd_user_queue.h b/lib/amdgpu/amd_user_queue.h
index b29e97ccf..6bd8fb3b2 100644
--- a/lib/amdgpu/amd_user_queue.h
+++ b/lib/amdgpu/amd_user_queue.h
@@ -52,4 +52,6 @@ void amdgpu_user_queue_destroy(amdgpu_device_handle device_handle, struct amdgpu
void amdgpu_user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context,
unsigned int ip_type, uint64_t mc_address);
+void amdgpu_aql_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_context *ring_context,
+ unsigned int ip_type, uint64_t mc_address);
#endif
--
2.49.0