[PATCH i-g-t v3] tests/amdgpu: add KGQ test in queue reset

Sun Aug 11 02:18:23 UTC 2024

The change looks good to me.

Reviewed-by: Vitaly Prosyak <vitaly.prosyak at amd.com>

On 2024-08-09 00:14, Jesse.zhang at amd.com wrote:
> Enhance the queue reset, add KGQ test.
>
> V2:
>   Some improvements regarding the selection of testing ring (Vitaly)
> V3:
>  Fix calculation of number of constant tests.
>
> Cc: Kamil Konieczny <kamil.konieczny at linux.intel.com>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Cc: Christian Koenig <christian.koenig at amd.com>
> Cc: Vitaly Prosyak <vitaly.prosyak at amd.com>
>
> Signed-off-by: Jesse Zhang <jesse.zhang at amd.com>
> ---
>  lib/amdgpu/amd_ip_blocks.h     |   7 ++
>  tests/amdgpu/amd_queue_reset.c | 145 ++++++++++++++-------------------
>  2 files changed, 69 insertions(+), 83 deletions(-)
>
> diff --git a/lib/amdgpu/amd_ip_blocks.h b/lib/amdgpu/amd_ip_blocks.h
> index 7fd883608..1b39d9945 100644
> --- a/lib/amdgpu/amd_ip_blocks.h
> +++ b/lib/amdgpu/amd_ip_blocks.h
> @@ -45,6 +45,13 @@ enum  cmd_error_type {
>  	BACKEND_SE_GC_SHADER_INVALID_USER_DATA /* COMPUTE_USER_DATA */
>  };
>  
> +struct dynamic_test{
> +	enum cmd_error_type test;
> +	const char *name;
> +	const char *describe;
> +};
> +#define for_each_test(t, T) for(typeof(*T) *t = T; t->name; t++)
> +
>  /* aux struct to hold misc parameters for convenience to maintain */
>  struct amdgpu_ring_context {
>  
> diff --git a/tests/amdgpu/amd_queue_reset.c b/tests/amdgpu/amd_queue_reset.c
> index 9ea0a1f24..bc8414c23 100644
> --- a/tests/amdgpu/amd_queue_reset.c
> +++ b/tests/amdgpu/amd_queue_reset.c
> @@ -846,25 +846,36 @@ free_contexts(amdgpu_device_handle device, amdgpu_context_handle *p_contexts,
>  	}
>  }
>  
> -/* TODO add logic to iterate for all */
>  static bool
> -get_next_rings(unsigned int ring_begin, unsigned int available_rings,
> -		unsigned int *next_ring, unsigned int *next_next_ring)
> +get_next_rings(unsigned int ring_begin, struct drm_amdgpu_info_hw_ip info[],
> +		unsigned int *good_job_ring, unsigned int *bad_job_ring,  unsigned int order)
>  {
> -	bool ret = false;
>  	unsigned int ring_id;
>  
> -	for (ring_id = ring_begin; (1 << ring_id) & available_rings; ring_id++) {
> -		*next_ring = ring_id;
> -		*next_next_ring = ring_id + 1;
> +	/* Check good job ring is available. By default good job run on compute ring */
> +	for (ring_id = ring_begin; (1 << ring_id) & info[0].available_rings; ring_id++) {
> +		if ((1 << *good_job_ring) & info[0].available_rings) {
> +			*good_job_ring = ring_id;
> +			/* check bad job ring is available */
> +			for (ring_id = ring_begin; (1 << ring_id) & info[order].available_rings; ring_id++) {
> +				/* if order is 0, bad job run on compute ring,
> +				 * It should skip good ring and find next ring to run bad job.
> +				 */
> +				if (!order)
> +					*bad_job_ring = *good_job_ring + 1;
> +				else
> +					*bad_job_ring = ring_id;
> +				if ((1 << *bad_job_ring) & info[order].available_rings) {
> +					return true;
> +				}
> +			}
>  
> -		if ((*next_ring  & available_rings) && (*next_next_ring & available_rings)) {
> -			ret = true;
> -			break;
>  		}
>  	}
> -	return ret;
> +
> +	return false;
>  }
> +
>  igt_main
>  {
>  	char cmdline[2048];
> @@ -878,7 +889,7 @@ igt_main
>  	posix_spawn_file_actions_t action;
>  	amdgpu_device_handle device;
>  	struct amdgpu_gpu_info gpu_info = {0};
> -	struct drm_amdgpu_info_hw_ip info = {0};
> +	struct drm_amdgpu_info_hw_ip info[2] = {0};
>  	int fd = -1;
>  	int fd_shm = -1;
>  	struct shmbuf *sh_mem = NULL;
> @@ -888,7 +899,7 @@ igt_main
>  	unsigned int ring_id_good = 0;
>  	unsigned int ring_id_bad = 1;
>  
> -	enum amd_ip_block_type ip_test = AMD_IP_COMPUTE;
> +	enum amd_ip_block_type ip_tests[2] = {AMD_IP_COMPUTE/*keep first*/, AMD_IP_GFX};
>  	enum amd_ip_block_type ip_background = AMD_IP_COMPUTE;
>  
>  	amdgpu_context_handle *arr_context_handle = NULL;
> @@ -897,14 +908,27 @@ igt_main
>  	 * which are shared between child processes ( test/monitor/main and
>  	 *  separate for background
>  	 */
> -	unsigned int arr_err[] = {
> -			CMD_STREAM_EXEC_INVALID_PACKET_LENGTH,
> -			CMD_STREAM_EXEC_INVALID_OPCODE,
> -			//CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC,TODO  not job timeout, debug why for n31
> -			//CMD_STREAM_TRANS_BAD_REG_ADDRESS, TODO  amdgpu: device lost from bus! for n31
> -			BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR,
> -			BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING,
> -			BACKEND_SE_GC_SHADER_INVALID_USER_DATA
> +	struct dynamic_test arr_err[] = {
> +			{CMD_STREAM_EXEC_INVALID_PACKET_LENGTH, "CMD_STREAM_EXEC_INVALID_PACKET_LENGTH",
> +				"Stressful-and-multiple-cs-of-bad and good length-operations-using-multiple-processes"},
> +			{CMD_STREAM_EXEC_INVALID_OPCODE, "CMD_STREAM_EXEC_INVALID_OPCODE",
> +				"Stressful-and-multiple-cs-of-bad and good opcode-operations-using-multiple-processes"},
> +			//TODO  not job timeout, debug why for n31.
> +			//{CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC,"CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC",
> +			//	"Stressful-and-multiple-cs-of-bad and good mem-sync-operations-using-multiple-processes"},
> +			//TODO amdgpu: device lost from bus! for n31
> +			//{CMD_STREAM_TRANS_BAD_REG_ADDRESS,"CMD_STREAM_TRANS_BAD_REG_ADDRESS",
> +			//	"Stressful-and-multiple-cs-of-bad and good reg-operations-using-multiple-processes"},
> +			{BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR, "BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR",
> +				"Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
> +			//TODO  KGQ cannot recover by queue reset, it may need a fw bugfix on navi31
> +			//{BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING,"BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING",
> +			//	"Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
> +			{BACKEND_SE_GC_SHADER_INVALID_USER_DATA, "BACKEND_SE_GC_SHADER_INVALID_USER_DATA",
> +				"Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
> +			{BACKEND_SE_GC_SHADER_INVALID_SHADER, "BACKEND_SE_GC_SHADER_INVALID_SHADER",
> +				"Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
> +			{}
>  	};
>  
>  	int const_num_of_tests;
> @@ -921,7 +945,7 @@ igt_main
>  		if (is_run_subtest_parameter_found(argc, argv))
>  			const_num_of_tests = 1;
>  		else
> -			const_num_of_tests = ARRAY_SIZE(arr_err);
> +			const_num_of_tests = (sizeof(arr_err)/sizeof(struct dynamic_test) - 1) * ARRAY_SIZE(ip_tests);
>  
>  		if (!is_background_parameter_found(argc, argv)) {
>  			add_background_parameter(&argc, argv);
> @@ -943,8 +967,11 @@ igt_main
>  
>  		r = amdgpu_query_gpu_info(device, &gpu_info);
>  		igt_assert_eq(r, 0);
> -		 r = amdgpu_query_hw_ip_info(device, ip_test, 0, &info);
> -		igt_assert_eq(r, 0);
> +		for (int i = 0; i < ARRAY_SIZE(ip_tests); i++) {
> +			r = amdgpu_query_hw_ip_info(device, ip_tests[i], 0, &info[i]);
> +			igt_assert_eq(r, 0);
> +		}
> +
>  		r = setup_amdgpu_ip_blocks(major, minor, &gpu_info, device);
>  		igt_assert_eq(r, 0);
>  
> @@ -959,68 +986,20 @@ igt_main
>  		igt_require(sh_mem != NULL);
>  
>  		run_all(device, arr_context_handle,
> -			process, sh_mem, const_num_of_tests, info.hw_ip_version_major,
> +			process, sh_mem, const_num_of_tests, info[0].hw_ip_version_major,
>  			&monitor_child, &test_child);
>  	}
>  
> -	igt_describe("Stressful-and-multiple-cs-of-bad and good length-operations-using-multiple-processes");
> -	igt_subtest("amdgpu-compute-CMD_STREAM_EXEC_INVALID_PACKET_LENGTH") {
> -		if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> -			set_next_test_to_run(sh_mem, CMD_STREAM_EXEC_INVALID_PACKET_LENGTH,
> -						ip_background, ip_test, ring_id_good, ring_id_bad);
> -		}
> -	}
> -
> -	igt_describe("Stressful-and-multiple-cs-of-bad and good opcode-operations-using-multiple-processes");
> -	igt_subtest("amdgpu-compute-CMD_STREAM_EXEC_INVALID_OPCODE") {
> -		if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> -			set_next_test_to_run(sh_mem, CMD_STREAM_EXEC_INVALID_OPCODE,
> -						ip_background, ip_test, ring_id_good, ring_id_bad);
> -		}
> -	}
> -
> -	/* TODO  not job timeout, debug why for nv32
> -	 *igt_describe("Stressful-and-multiple-cs-of-bad and good mem-sync-operations-using-multiple-processes");
> -	 *igt_subtest_with_dynamic("amdgpu-compute-CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC") {
> -	 *	if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> -	 *		igt_dynamic_f("amdgpu-compute-CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC")
> -	 *			set_next_test_to_run(sh_mem, CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC,
> -	 *					ip_background, ip_test, ring_id_good, ring_id_bad);
> -	 *	}
> -	 */
> -
> -	/* TODO  amdgpu: device lost from bus! for nv32
> -	 *igt_describe("Stressful-and-multiple-cs-of-bad and good reg-operations-using-multiple-processes");
> -	 *igt_subtest_with_dynamic("amdgpu-compute-CMD_STREAM_TRANS_BAD_REG_ADDRESS") {
> -	 *	if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> -	 *		igt_dynamic_f("amdgpu-compute-CMD_STREAM_TRANS_BAD_MEM_ADDRESS")
> -	 *			set_next_test_to_run(sh_mem, CMD_STREAM_TRANS_BAD_REG_ADDRESS,
> -	 *					ip_background, ip_test, ring_id_good, ring_id_bad);
> -	 *	}
> -	 */
> -
> -	//amdgpu_ring_soft_recovery
> -	igt_describe("Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes");
> -	igt_subtest("Handful-by-soft-recovery-amdgpu-compute-BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR") {
> -		if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> -			set_next_test_to_run(sh_mem, BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR,
> -						ip_background, ip_test, ring_id_good, ring_id_bad);
> -		}
> -	}
> -
> -	igt_describe("Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes");
> -	igt_subtest("amdgpu-compute-BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING") {
> -		if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> -			set_next_test_to_run(sh_mem, BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING,
> -						ip_background, ip_test, ring_id_good, ring_id_bad);
> -		}
> -	}
> -
> -	igt_describe("Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes");
> -	igt_subtest("amdgpu-compute-BACKEND_SE_GC_SHADER_INVALID_USER_DATA") {
> -		if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> -			set_next_test_to_run(sh_mem, BACKEND_SE_GC_SHADER_INVALID_USER_DATA,
> -						ip_background, ip_test, ring_id_good, ring_id_bad);
> +	for (int i = 0; i < ARRAY_SIZE(ip_tests); i++) {
> +		for (struct dynamic_test *it = &arr_err[0]; it->name; it++) {
> +			igt_describe("Stressful-and-multiple-cs-of-bad and good length-operations-using-multiple-processes");
> +			igt_subtest_with_dynamic_f("amdgpu-%s-%s", ip_tests[i] == AMD_IP_COMPUTE ? "compute":"gfx", it->name) {
> +				if (arr_cap[ip_tests[i]] && get_next_rings(ring_id_good, info, &ring_id_good, &ring_id_bad, i)) {
> +					igt_dynamic_f("amdgpu-%s", it->name);
> +					set_next_test_to_run(sh_mem, it->test,
> +							ip_background, ip_tests[i], ring_id_good, ring_id_bad);
> +				}
> +			}
>  		}
>  	}
>