[PATCH i-g-t 2/3] lib/amdgpu: enhance command submission and error handling with user queue support
vitaly prosyak
vprosyak at amd.com
Fri May 9 02:41:13 UTC 2025
The change looks good to me
Reviewed-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
On 2025-05-08 02:07, Jesse.Zhang wrote:
> This patch introduces several improvements to AMDGPU command submission
> and error handling infrastructure:
>
> 1. Fixed typos in time_out field initialization (rint_context -> ring_context)
> across multiple helper functions.
>
> 2. Enhanced bad_access_helper() to support user queues:
> - Added user_queue parameter to control submission path
> - Implemented separate handling for user queue vs legacy path
> - Added proper timeout initialization
> - Used sync object based buffer allocation (amdgpu_bo_alloc_and_map_sync)
> - Added timeline syncobj wait after user queue operations
>
> 3. Updated bad_access_ring_helper() to:
> - Accept user_queue parameter
> - Skip ring iteration for user queues (hardware scheduled)
> - Add version check for user queue support
> - Pass user_queue flag to bad_access_helper()
>
> Cc: Prosyak, Vitaly <Vitaly.Prosyak at amd.com>
> Cc: Sunil Khatri <sunil.khatri at amd.com>
> Cc: Christian Koenig <christian.koenig at amd.com>
> Cc: Alexander Deucher <alexander.deucher at amd.com>
>
> Signed-off-by: Jesse Zhang <jesse.zhang at amd.com>
> ---
> lib/amdgpu/amd_deadlock_helpers.c | 54 ++++++++++++++++++++++++-------
> lib/amdgpu/amd_deadlock_helpers.h | 2 +-
> 2 files changed, 43 insertions(+), 13 deletions(-)
>
> diff --git a/lib/amdgpu/amd_deadlock_helpers.c b/lib/amdgpu/amd_deadlock_helpers.c
> index c8a48930e..f52bf70e5 100644
> --- a/lib/amdgpu/amd_deadlock_helpers.c
> +++ b/lib/amdgpu/amd_deadlock_helpers.c
> @@ -12,6 +12,7 @@
> #include <signal.h>
> #include "amd_memory.h"
> #include "amd_deadlock_helpers.h"
> +#include "lib/amdgpu/amd_userq.h"
> #include "lib/amdgpu/amd_command_submission.h"
>
> #define MAX_JOB_COUNT 200
> @@ -274,7 +275,7 @@ void amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int
>
> static void
> bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
> - unsigned int ip_type, uint32_t priority)
> + unsigned int ip_type, uint32_t priority, bool user_queue)
> {
>
> const struct amdgpu_ip_block_version *ip_block = NULL;
> @@ -287,10 +288,14 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
> ring_context = calloc(1, sizeof(*ring_context));
> igt_assert(ring_context);
>
> - if (priority == AMDGPU_CTX_PRIORITY_HIGH)
> - r = amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &ring_context->context_handle);
> - else
> - r = amdgpu_cs_ctx_create(device_handle, &ring_context->context_handle);
> + if (user_queue) {
> + amdgpu_user_queue_create(device_handle, ring_context, ip_type);
> + } else {
> + if (priority == AMDGPU_CTX_PRIORITY_HIGH)
> + r = amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &ring_context->context_handle);
> + else
> + r = amdgpu_cs_ctx_create(device_handle, &ring_context->context_handle);
> + }
> igt_assert_eq(r, 0);
>
> /* setup parameters */
> @@ -299,16 +304,28 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
> ring_context->pm4_size = pm4_dw;
> ring_context->res_cnt = 1;
> ring_context->ring_id = 0;
> + ring_context->user_queue = user_queue;
> + ring_context->time_out = 0x7ffff;
> igt_assert(ring_context->pm4);
> ip_block = get_ip_block(device_handle, ip_type);
> - r = amdgpu_bo_alloc_and_map(device_handle,
> + r = amdgpu_bo_alloc_and_map_sync(device_handle,
> ring_context->write_length * sizeof(uint32_t),
> 4096, AMDGPU_GEM_DOMAIN_GTT,
> - AMDGPU_GEM_CREATE_CPU_GTT_USWC, &ring_context->bo,
> + AMDGPU_GEM_CREATE_CPU_GTT_USWC,
> + AMDGPU_VM_MTYPE_UC,
> + &ring_context->bo,
> (void **)&ring_context->bo_cpu,
> &ring_context->bo_mc,
> - &ring_context->va_handle);
> + &ring_context->va_handle,
> + ring_context->timeline_syncobj_handle,
> + ++ring_context->point, user_queue);
> igt_assert_eq(r, 0);
> + if (user_queue) {
> + r = amdgpu_timeline_syncobj_wait(device_handle,
> + ring_context->timeline_syncobj_handle,
> + ring_context->point);
> + igt_assert_eq(r, 0);
> + }
>
> memset((void *)ring_context->bo_cpu, 0, ring_context->write_length * sizeof(uint32_t));
> ring_context->resources[0] = ring_context->bo;
> @@ -320,8 +337,12 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
>
> amdgpu_bo_unmap_and_free(ring_context->bo, ring_context->va_handle, ring_context->bo_mc,
> ring_context->write_length * sizeof(uint32_t));
> - free(ring_context->pm4);
> - free(ring_context);
> + if (user_queue) {
> + amdgpu_user_queue_destroy(device_handle, ring_context, ip_block->type);
> + } else {
> + free(ring_context->pm4);
> + free(ring_context);
> + }
> }
>
> #define MAX_DMABUF_COUNT 0x20000
> @@ -419,7 +440,7 @@ amdgpu_hang_sdma_helper(amdgpu_device_handle device_handle, uint8_t hang_type)
> free_cmd_base(base_cmd);
> }
>
> -void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci)
> +void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci, bool user_queue)
> {
> int r;
> FILE *fp;
> @@ -436,6 +457,15 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
> if (!info.available_rings)
> igt_info("SKIP ... as there's no ring for ip %d\n", ip_type);
>
> + if (user_queue) {
> + if (info.hw_ip_version_major < 11) {
> +			igt_info("SKIP ... as user queue is not supported for ip %d\n", ip_type);
> + return;
> + }
> + /* No need to iterate each ring, user queues are scheduled by hardware */
> + bad_access_helper(device_handle, cmd_error, ip_type, prio, user_queue);
> + return;
> + }
> support_page = is_support_page_queue(ip_type, pci);
> if (ip_type == AMD_IP_GFX)
> snprintf(sysfs, sizeof(sysfs) - 1, "/sys/kernel/debug/dri/%04x:%02x:%02x.%01x/amdgpu_gfx_sched_mask",
> @@ -505,7 +535,7 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
> igt_assert_eq(r, 0);
> }
>
> - bad_access_helper(device_handle, cmd_error, ip_type, prio);
> + bad_access_helper(device_handle, cmd_error, ip_type, prio, user_queue);
> }
>
> /* recover the sched mask */
> diff --git a/lib/amdgpu/amd_deadlock_helpers.h b/lib/amdgpu/amd_deadlock_helpers.h
> index 1d654c490..fdbeb409c 100644
> --- a/lib/amdgpu/amd_deadlock_helpers.h
> +++ b/lib/amdgpu/amd_deadlock_helpers.h
> @@ -29,7 +29,7 @@
> void
> amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int ip_type, struct pci_addr *pci);
> void
> -bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci);
> +bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci, bool user_queue);
>
> void
> amdgpu_hang_sdma_ring_helper(amdgpu_device_handle device_handle, uint8_t hang_type, struct pci_addr *pci);
More information about the igt-dev
mailing list