[PATCH i-g-t 2/3] lib/amdgpu: enhance command submission and error handling with user queue support

vitaly prosyak vprosyak at amd.com
Fri May 9 02:41:13 UTC 2025


The change looks good to me

Reviewed-by: Vitaly Prosyak <vitaly.prosyak at amd.com>

On 2025-05-08 02:07, Jesse.Zhang wrote:
> This patch introduces several improvements to AMDGPU command submission
> and error handling infrastructure:
>
> 1. Fixed typos in time_out field initialization (rint_context -> ring_context)
>    across multiple helper functions.
>
> 2. Enhanced bad_access_helper() to support user queues:
>    - Added user_queue parameter to control submission path
>    - Implemented separate handling for user queue vs legacy path
>    - Added proper timeout initialization
>    - Used sync object based buffer allocation (amdgpu_bo_alloc_and_map_sync)
>    - Added timeline syncobj wait after user queue operations
>
> 3. Updated bad_access_ring_helper() to:
>    - Accept user_queue parameter
>    - Skip ring iteration for user queues (hardware scheduled)
>    - Add version check for user queue support
>    - Pass user_queue flag to bad_access_helper()
>
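Re points 2 and 3, for anyone updating call sites: a minimal sketch of
driving both submission paths through the new signature. The fd/device
setup and the cmd_error value below are placeholders on my side, not
something this patch defines:

	struct pci_addr pci;

	/* resolve the PCI address the helpers use to build debugfs paths;
	 * get_pci_addr_from_fd() is the existing helper in the amdgpu tests */
	igt_assert_eq(get_pci_addr_from_fd(fd, &pci), 0);

	/* legacy kernel queues: the helper iterates rings via the sched mask */
	bad_access_ring_helper(device, cmd_error, AMD_IP_GFX, &pci, false);

	/* user queues: one hardware-scheduled submission, no ring iteration */
	bad_access_ring_helper(device, cmd_error, AMD_IP_GFX, &pci, true);
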
> Cc: Prosyak, Vitaly <Vitaly.Prosyak at amd.com>
> Cc: Sunil Khatri <sunil.khatri at amd.com>
> Cc: Christian Koenig <christian.koenig at amd.com>
> Cc: Alexander Deucher <alexander.deucher at amd.com>
>
> Signed-off-by: Jesse Zhang <jesse.zhang at amd.com>
> ---
>  lib/amdgpu/amd_deadlock_helpers.c | 54 ++++++++++++++++++++++++-------
>  lib/amdgpu/amd_deadlock_helpers.h |  2 +-
>  2 files changed, 43 insertions(+), 13 deletions(-)
>
> diff --git a/lib/amdgpu/amd_deadlock_helpers.c b/lib/amdgpu/amd_deadlock_helpers.c
> index c8a48930e..f52bf70e5 100644
> --- a/lib/amdgpu/amd_deadlock_helpers.c
> +++ b/lib/amdgpu/amd_deadlock_helpers.c
> @@ -12,6 +12,7 @@
>  #include <signal.h>
>  #include "amd_memory.h"
>  #include "amd_deadlock_helpers.h"
> +#include "lib/amdgpu/amd_userq.h"
>  #include "lib/amdgpu/amd_command_submission.h"
>  
>  #define MAX_JOB_COUNT 200
> @@ -274,7 +275,7 @@ void amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int
>  
>  static void
>  bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
> -			unsigned int ip_type, uint32_t priority)
> +			unsigned int ip_type, uint32_t priority, bool user_queue)
>  {
>  
>  	const struct amdgpu_ip_block_version *ip_block = NULL;
> @@ -287,10 +288,14 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
>  	ring_context = calloc(1, sizeof(*ring_context));
>  	igt_assert(ring_context);
>  
> -	if (priority == AMDGPU_CTX_PRIORITY_HIGH)
> -		r = amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &ring_context->context_handle);
> -	else
> -		r = amdgpu_cs_ctx_create(device_handle, &ring_context->context_handle);
> -	igt_assert_eq(r, 0);
> +	if (user_queue) {
> +		amdgpu_user_queue_create(device_handle, ring_context, ip_type);
> +	} else {
> +		if (priority == AMDGPU_CTX_PRIORITY_HIGH)
> +			r = amdgpu_cs_ctx_create2(device_handle, AMDGPU_CTX_PRIORITY_HIGH, &ring_context->context_handle);
> +		else
> +			r = amdgpu_cs_ctx_create(device_handle, &ring_context->context_handle);
> +		/* r is only set on the legacy path, so check it here */
> +		igt_assert_eq(r, 0);
> +	}
>  
>  	/* setup parameters */
> @@ -299,16 +304,28 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
>  	ring_context->pm4_size = pm4_dw;
>  	ring_context->res_cnt = 1;
>  	ring_context->ring_id = 0;
> +	ring_context->user_queue = user_queue;
> +	ring_context->time_out = 0x7ffff;
>  	igt_assert(ring_context->pm4);
>  	ip_block = get_ip_block(device_handle, ip_type);
> -	r = amdgpu_bo_alloc_and_map(device_handle,
> +	r = amdgpu_bo_alloc_and_map_sync(device_handle,
>  				    ring_context->write_length * sizeof(uint32_t),
>  				    4096, AMDGPU_GEM_DOMAIN_GTT,
> -					AMDGPU_GEM_CREATE_CPU_GTT_USWC, &ring_context->bo,
> +				    AMDGPU_GEM_CREATE_CPU_GTT_USWC,
> +				    AMDGPU_VM_MTYPE_UC,
> +				    &ring_context->bo,
>  				    (void **)&ring_context->bo_cpu,
>  				    &ring_context->bo_mc,
> -				    &ring_context->va_handle);
> +				    &ring_context->va_handle,
> +				    ring_context->timeline_syncobj_handle,
> +				    ++ring_context->point, user_queue);
>  	igt_assert_eq(r, 0);
> +	if (user_queue) {
> +		r = amdgpu_timeline_syncobj_wait(device_handle,
> +			ring_context->timeline_syncobj_handle,
> +			ring_context->point);
> +		igt_assert_eq(r, 0);
> +	}
>  
>  	memset((void *)ring_context->bo_cpu, 0, ring_context->write_length * sizeof(uint32_t));
>  	ring_context->resources[0] = ring_context->bo;
> @@ -320,8 +337,12 @@ bad_access_helper(amdgpu_device_handle device_handle, unsigned int cmd_error,
>  
>  	amdgpu_bo_unmap_and_free(ring_context->bo, ring_context->va_handle, ring_context->bo_mc,
>  				 ring_context->write_length * sizeof(uint32_t));
> +	if (user_queue)
> +		amdgpu_user_queue_destroy(device_handle, ring_context, ip_block->type);
> +
> +	/* pm4 and ring_context are allocated in this helper, free them on both paths */
>  	free(ring_context->pm4);
>  	free(ring_context);
>  }
>  
>  #define MAX_DMABUF_COUNT 0x20000
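
A note on the alloc/wait pairing above: with user queues,
amdgpu_bo_alloc_and_map_sync() attaches the VM map operation to timeline
point ++ring_context->point, and amdgpu_timeline_syncobj_wait() then blocks
until that point signals, so the mapping is in place before the memset()
touches bo_cpu. My understanding is that the wrapper boils down to the
libdrm timeline wait, roughly like this (the actual IGT helper may differ
in flags and timeout handling; needs <amdgpu.h> and <stdint.h>):

	/* wait for one point on a timeline syncobj to signal */
	static int wait_timeline_point(amdgpu_device_handle dev,
				       uint32_t syncobj, uint64_t point)
	{
		return amdgpu_cs_syncobj_timeline_wait(dev, &syncobj, &point, 1,
						       INT64_MAX, /* no timeout */
						       DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
						       NULL);
	}
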
> @@ -419,7 +440,7 @@ amdgpu_hang_sdma_helper(amdgpu_device_handle device_handle, uint8_t hang_type)
>  	free_cmd_base(base_cmd);
>  }
>  
> -void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci)
> +void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci, bool user_queue)
>  {
>  	int r;
>  	FILE *fp;
> @@ -436,6 +457,15 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
>  	if (!info.available_rings)
>  		igt_info("SKIP ... as there's no ring for ip %d\n", ip_type);
>  
> +	if (user_queue) {
> +		if (info.hw_ip_version_major < 11) {
> +			igt_info("SKIP ... as user queues are not supported for ip %d\n", ip_type);
> +			return;
> +		}
> +		/* No need to iterate each ring, user queues are scheduled by hardware */
> +		bad_access_helper(device_handle, cmd_error, ip_type, prio, user_queue);
> +		return;
> +	}
>  	support_page = is_support_page_queue(ip_type, pci);
>  	if (ip_type == AMD_IP_GFX)
>  		snprintf(sysfs, sizeof(sysfs) - 1, "/sys/kernel/debug/dri/%04x:%02x:%02x.%01x/amdgpu_gfx_sched_mask",
> @@ -505,7 +535,7 @@ void bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd
>  			igt_assert_eq(r, 0);
>  		}
>  
> -		bad_access_helper(device_handle, cmd_error, ip_type, prio);
> +		bad_access_helper(device_handle, cmd_error, ip_type, prio, user_queue);
>  	}
>  
>  	/* recover the sched mask */
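
For completeness, the legacy loop above isolates one ring at a time by
writing a single-bit value to the amdgpu_*_sched_mask debugfs file before
each submission and restoring the saved mask afterwards. In rough outline
(names illustrative, error handling trimmed):

	static void set_sched_mask(const char *path, uint64_t mask)
	{
		FILE *fp = fopen(path, "w");

		igt_assert(fp);
		fprintf(fp, "0x%llx", (unsigned long long)mask);
		fclose(fp);
	}

	/* run the bad access on each enabled ring in isolation */
	for (ring_id = 0; (1ULL << ring_id) <= saved_mask; ring_id++) {
		if (!(saved_mask & (1ULL << ring_id)))
			continue;
		set_sched_mask(sysfs, 1ULL << ring_id);
		bad_access_helper(device_handle, cmd_error, ip_type, prio, false);
	}
	set_sched_mask(sysfs, saved_mask);
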
> diff --git a/lib/amdgpu/amd_deadlock_helpers.h b/lib/amdgpu/amd_deadlock_helpers.h
> index 1d654c490..fdbeb409c 100644
> --- a/lib/amdgpu/amd_deadlock_helpers.h
> +++ b/lib/amdgpu/amd_deadlock_helpers.h
> @@ -29,7 +29,7 @@
>  void
>  amdgpu_wait_memory_helper(amdgpu_device_handle device_handle, unsigned int ip_type, struct pci_addr *pci);
>  void
> -bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci);
> +bad_access_ring_helper(amdgpu_device_handle device_handle, unsigned int cmd_error, unsigned int ip_type, struct pci_addr *pci, bool user_queue);
>  
>  void
>  amdgpu_hang_sdma_ring_helper(amdgpu_device_handle device_handle, uint8_t hang_type, struct pci_addr *pci);

