[PATCH i-g-t 1/2] lib/amdgpu: Add SDMA support for user queues

Jesse.Zhang Jesse.Zhang at amd.com
Mon Jun 30 09:23:41 UTC 2025


    This commit adds support for submitting SDMA work to user queues.
    The changes include:

    1. Added SDMA-specific packet definitions in amd_sdma.h for indirect buffer
       operations and protected fence signaling.

    2. Modified amdgpu_user_queue_submit() to handle SDMA-specific requirements:
       - Added NOP-packet padding to satisfy the 8-DWORD alignment requirement
       - Implemented SDMA-specific indirect buffer submission format
       - Added proper handling of CSA (Context Save Area) addresses
       - Included SDMA-specific fence signaling
       - Adjusted wptr handling for SDMA queues

    The changes ensure that the synchronization and alignment requirements of
    SDMA are met while maintaining backward compatibility with the existing
    queue types. A short sketch of the padding arithmetic is included after
    the --- separator below.

    Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
---
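Note for reviewers (illustration only, not part of the commit): the sketch
below restates the SDMA padding rule from amdgpu_user_queue_submit() as a
stand-alone helper, so the intent of the "(2 - wptr) & 7" expression is
easier to see. sdma_userq_nop_padding() is a hypothetical name and does not
exist in the library.

#include <stdint.h>

/*
 * Dwords of SDMA_NOP queued before the INDIRECT packet, computed exactly
 * as in the patch.  The padding brings the write pointer to wptr % 8 == 2,
 * so the dword written right after the six-dword INDIRECT packet (header,
 * IB address low/high, IB length, CSA address low/high) falls back on an
 * 8-dword boundary; that dword is the protected-fence packet.
 */
static unsigned int sdma_userq_nop_padding(uint32_t wptr_dw)
{
	return (2u - wptr_dw) & 7u;
}

/* Example: wptr_dw == 5 -> 5 NOPs; (5 + 5) % 8 == 2, and 2 + 6 == 8. */

The wptr << 2 applied before the doorbell write in the SDMA path presumably
converts the dword index into the byte offset that SDMA hardware expects;
the PM4 paths keep the dword value unchanged.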
 lib/amdgpu/amd_sdma.h  |  4 ++++
 lib/amdgpu/amd_userq.c | 50 +++++++++++++++++++++++++++++-------------
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/lib/amdgpu/amd_sdma.h b/lib/amdgpu/amd_sdma.h
index 3e568e47a..2a9339cfb 100644
--- a/lib/amdgpu/amd_sdma.h
+++ b/lib/amdgpu/amd_sdma.h
@@ -103,5 +103,9 @@
 #              define PACKET3_DMA_DATA_SI_CP_SYNC     (1 << 31)
 
 #define SDMA_NOP  0x0
+#define SDMA_OP_INDIRECT                     0x4
+#define SDMA_OP_PROTECTED_FENCE              0x5
+#define SDMA6_SUB_OP_PROTECTED_FENCE         0x1
+#define SDMA7_SUB_OP_PROTECTED_FENCE         0x3
 
 #endif
diff --git a/lib/amdgpu/amd_userq.c b/lib/amdgpu/amd_userq.c
index f8e1a4b45..10e82a5a4 100644
--- a/lib/amdgpu/amd_userq.c
+++ b/lib/amdgpu/amd_userq.c
@@ -6,6 +6,7 @@
 #include "amd_userq.h"
 #include "amd_memory.h"
 #include "amd_PM4.h"
+#include "amd_sdma.h"
 #include "ioctl_wrappers.h"
 
 #ifdef AMDGPU_USERQ_ENABLED
@@ -136,22 +137,39 @@ void amdgpu_user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_co
 	uint64_t timeout = ring_context->time_out ? ring_context->time_out : INT64_MAX;
 
 	amdgpu_pkt_begin();
-	/* Prepare the Indirect IB to submit the IB to user queue */
-	amdgpu_pkt_add_dw(PACKET3(PACKET3_INDIRECT_BUFFER, 2));
-	amdgpu_pkt_add_dw(lower_32_bits(mc_address));
-	amdgpu_pkt_add_dw(upper_32_bits(mc_address));
-
-	if (ip_type == AMD_IP_GFX)
-		amdgpu_pkt_add_dw(control | S_3F3_INHERIT_VMID_MQD_GFX(1));
-	else
-		amdgpu_pkt_add_dw(control | S_3F3_VALID_COMPUTE(1)
-					       | S_3F3_INHERIT_VMID_MQD_COMPUTE(1));
-
-	amdgpu_pkt_add_dw(PACKET3(PACKET3_PROTECTED_FENCE_SIGNAL, 0));
-
-	/* empty dword is needed for fence signal pm4 */
-	amdgpu_pkt_add_dw(0);
 
+	if (ip_type == AMD_IP_DMA) {
+		/* SDMA: pad the ring with NOPs so the packets below meet the 8-dword alignment requirement */
+		unsigned int nop_count = (2 - lower_32_bits(*ring_context->wptr_cpu)) & 7;
+		for (unsigned int i = 0; i < nop_count; i++)
+			amdgpu_pkt_add_dw(SDMA_PKT_HEADER_OP(SDMA_NOP));
+		amdgpu_pkt_add_dw(SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT));
+		amdgpu_pkt_add_dw(lower_32_bits(mc_address) & 0xffffffe0); /* IB address, 32-byte aligned */
+		amdgpu_pkt_add_dw(upper_32_bits(mc_address));
+		amdgpu_pkt_add_dw(control); /* IB length in dwords */
+		amdgpu_pkt_add_dw(lower_32_bits(ring_context->csa.mc_addr)); /* CSA address low */
+		amdgpu_pkt_add_dw(upper_32_bits(ring_context->csa.mc_addr)); /* CSA address high */
+		if (ring_context->hw_ip_info.hw_ip_version_major <= 6)
+			amdgpu_pkt_add_dw(SDMA_PACKET(SDMA_OP_PROTECTED_FENCE, SDMA6_SUB_OP_PROTECTED_FENCE, 0));
+		else
+			amdgpu_pkt_add_dw(SDMA_PACKET(SDMA_OP_PROTECTED_FENCE, SDMA7_SUB_OP_PROTECTED_FENCE, 0));
+	} else {
+		/* Prepare the Indirect IB to submit the IB to user queue */
+		amdgpu_pkt_add_dw(PACKET3(PACKET3_INDIRECT_BUFFER, 2));
+		amdgpu_pkt_add_dw(lower_32_bits(mc_address));
+		amdgpu_pkt_add_dw(upper_32_bits(mc_address));
+
+		if (ip_type == AMD_IP_GFX)
+			amdgpu_pkt_add_dw(control | S_3F3_INHERIT_VMID_MQD_GFX(1));
+		else
+			amdgpu_pkt_add_dw(control | S_3F3_VALID_COMPUTE(1)
+						| S_3F3_INHERIT_VMID_MQD_COMPUTE(1));
+
+		amdgpu_pkt_add_dw(PACKET3(PACKET3_PROTECTED_FENCE_SIGNAL, 0));
+
+		/* empty dword is needed for fence signal pm4 */
+		amdgpu_pkt_add_dw(0);
+	}
 #if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
 	asm volatile ("mfence" : : : "memory");
 #endif
@@ -163,6 +181,8 @@ void amdgpu_user_queue_submit(amdgpu_device_handle device, struct amdgpu_ring_co
 	asm volatile ("mfence" : : : "memory");
 #endif
 
+	if (ip_type == AMD_IP_DMA)
+		*ring_context->wptr_cpu = *ring_context->wptr_cpu << 2;
 	/* Update the door bell */
 	ring_context->doorbell_cpu[DOORBELL_INDEX] = *ring_context->wptr_cpu;
 
-- 
2.49.0