[PATCH i-g-t v2] tests/amdgpu: add KGQ test in queue reset

Jesse.zhang@amd.com jesse.zhang at amd.com
Fri Aug 9 02:42:51 UTC 2024


Enhance the queue reset test: add a KGQ (kernel graphics queue) test in
addition to the existing compute queue test.

V2:
Improve the selection of the rings used for testing (Vitaly)

Cc: Kamil Konieczny <kamil.konieczny at linux.intel.com>
Cc: Alex Deucher <alexander.deucher at amd.com>
Cc: Christian Koenig <christian.koenig at amd.com>
Cc: Vitaly Prosyak <vitaly.prosyak at amd.com>

Signed-off-by: Jesse Zhang <jesse.zhang at amd.com>
---
 lib/amdgpu/amd_ip_blocks.h     |   7 ++
 tests/amdgpu/amd_queue_reset.c | 145 ++++++++++++++-------------------
 2 files changed, 69 insertions(+), 83 deletions(-)

diff --git a/lib/amdgpu/amd_ip_blocks.h b/lib/amdgpu/amd_ip_blocks.h
index 7fd883608..1b39d9945 100644
--- a/lib/amdgpu/amd_ip_blocks.h
+++ b/lib/amdgpu/amd_ip_blocks.h
@@ -45,6 +45,13 @@ enum  cmd_error_type {
 	BACKEND_SE_GC_SHADER_INVALID_USER_DATA /* COMPUTE_USER_DATA */
 };
 
+struct dynamic_test{
+	enum cmd_error_type test;
+	const char *name;
+	const char *describe;
+};
+#define for_each_test(t, T) for(typeof(*T) *t = T; t->name; t++)
+
 /* aux struct to hold misc parameters for convenience to maintain */
 struct amdgpu_ring_context {
 
diff --git a/tests/amdgpu/amd_queue_reset.c b/tests/amdgpu/amd_queue_reset.c
index 9ea0a1f24..4ae69d6c2 100644
--- a/tests/amdgpu/amd_queue_reset.c
+++ b/tests/amdgpu/amd_queue_reset.c
@@ -846,25 +846,36 @@ free_contexts(amdgpu_device_handle device, amdgpu_context_handle *p_contexts,
 	}
 }
 
-/* TODO add logic to iterate for all */
 static bool
-get_next_rings(unsigned int ring_begin, unsigned int available_rings,
-		unsigned int *next_ring, unsigned int *next_next_ring)
+get_next_rings(unsigned int ring_begin, struct drm_amdgpu_info_hw_ip info[],
+		unsigned int *good_job_ring, unsigned int *bad_job_ring,  unsigned int order)
 {
-	bool ret = false;
 	unsigned int ring_id;
 
-	for (ring_id = ring_begin; (1 << ring_id) & available_rings; ring_id++) {
-		*next_ring = ring_id;
-		*next_next_ring = ring_id + 1;
+	/* Check that the good job ring is available. By default the good job runs on the compute ring */
+	for (ring_id = ring_begin; (1 << ring_id) & info[0].available_rings; ring_id++) {
+		if ((1 << *good_job_ring) & info[0].available_rings) {
+			*good_job_ring = ring_id;
+			/* check bad job ring is available */
+			for (ring_id = ring_begin; (1 << ring_id) & info[order].available_rings; ring_id++) {
+				/* If order is 0, the bad job also runs on the compute ring,
+				 * so skip the good ring and use the next ring for the bad job.
+				 */
+				if (!order)
+					*bad_job_ring = *good_job_ring + 1;
+				else
+					*bad_job_ring = ring_id;
+				if ((1 << *bad_job_ring) & info[order].available_rings) {
+					return true;
+				}
+			}
 
-		if ((*next_ring  & available_rings) && (*next_next_ring & available_rings)) {
-			ret = true;
-			break;
 		}
 	}
-	return ret;
+
+	return false;
 }
+
 igt_main
 {
 	char cmdline[2048];
@@ -878,7 +889,7 @@ igt_main
 	posix_spawn_file_actions_t action;
 	amdgpu_device_handle device;
 	struct amdgpu_gpu_info gpu_info = {0};
-	struct drm_amdgpu_info_hw_ip info = {0};
+	struct drm_amdgpu_info_hw_ip info[2] = {0};
 	int fd = -1;
 	int fd_shm = -1;
 	struct shmbuf *sh_mem = NULL;
@@ -888,7 +899,7 @@ igt_main
 	unsigned int ring_id_good = 0;
 	unsigned int ring_id_bad = 1;
 
-	enum amd_ip_block_type ip_test = AMD_IP_COMPUTE;
+	enum amd_ip_block_type ip_tests[2] = {AMD_IP_COMPUTE/*keep first*/, AMD_IP_GFX};
 	enum amd_ip_block_type ip_background = AMD_IP_COMPUTE;
 
 	amdgpu_context_handle *arr_context_handle = NULL;
@@ -897,14 +908,27 @@ igt_main
 	 * which are shared between child processes ( test/monitor/main and
 	 *  separate for background
 	 */
-	unsigned int arr_err[] = {
-			CMD_STREAM_EXEC_INVALID_PACKET_LENGTH,
-			CMD_STREAM_EXEC_INVALID_OPCODE,
-			//CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC,TODO  not job timeout, debug why for n31
-			//CMD_STREAM_TRANS_BAD_REG_ADDRESS, TODO  amdgpu: device lost from bus! for n31
-			BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR,
-			BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING,
-			BACKEND_SE_GC_SHADER_INVALID_USER_DATA
+	struct dynamic_test arr_err[] = {
+			{CMD_STREAM_EXEC_INVALID_PACKET_LENGTH, "CMD_STREAM_EXEC_INVALID_PACKET_LENGTH",
+				"Stressful-and-multiple-cs-of-bad and good length-operations-using-multiple-processes"},
+			{CMD_STREAM_EXEC_INVALID_OPCODE, "CMD_STREAM_EXEC_INVALID_OPCODE",
+				"Stressful-and-multiple-cs-of-bad and good opcode-operations-using-multiple-processes"},
+			//TODO  not job timeout, debug why for n31.
+			//{CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC,"CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC",
+			//	"Stressful-and-multiple-cs-of-bad and good mem-sync-operations-using-multiple-processes"},
+			//TODO amdgpu: device lost from bus! for n31
+			//{CMD_STREAM_TRANS_BAD_REG_ADDRESS,"CMD_STREAM_TRANS_BAD_REG_ADDRESS",
+			//	"Stressful-and-multiple-cs-of-bad and good reg-operations-using-multiple-processes"},
+			{BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR, "BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR",
+				"Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
+			//TODO  KGQ cannot recover by queue reset; it may need a fw bugfix on navi31
+			//{BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING,"BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING",
+			//	"Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
+			{BACKEND_SE_GC_SHADER_INVALID_USER_DATA, "BACKEND_SE_GC_SHADER_INVALID_USER_DATA",
+				"Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
+			{BACKEND_SE_GC_SHADER_INVALID_SHADER, "BACKEND_SE_GC_SHADER_INVALID_SHADER",
+				"Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
+			{}
 	};
 
 	int const_num_of_tests;
@@ -921,7 +945,7 @@ igt_main
 		if (is_run_subtest_parameter_found(argc, argv))
 			const_num_of_tests = 1;
 		else
-			const_num_of_tests = ARRAY_SIZE(arr_err);
+			const_num_of_tests = sizeof(arr_err)/sizeof(struct dynamic_test) * ARRAY_SIZE(ip_tests);
 
 		if (!is_background_parameter_found(argc, argv)) {
 			add_background_parameter(&argc, argv);
@@ -943,8 +967,11 @@ igt_main
 
 		r = amdgpu_query_gpu_info(device, &gpu_info);
 		igt_assert_eq(r, 0);
-		 r = amdgpu_query_hw_ip_info(device, ip_test, 0, &info);
-		igt_assert_eq(r, 0);
+		for (int i = 0; i < ARRAY_SIZE(ip_tests); i++) {
+			r = amdgpu_query_hw_ip_info(device, ip_tests[i], 0, &info[i]);
+			igt_assert_eq(r, 0);
+		}
+
 		r = setup_amdgpu_ip_blocks(major, minor, &gpu_info, device);
 		igt_assert_eq(r, 0);
 
@@ -959,68 +986,20 @@ igt_main
 		igt_require(sh_mem != NULL);
 
 		run_all(device, arr_context_handle,
-			process, sh_mem, const_num_of_tests, info.hw_ip_version_major,
+			process, sh_mem, const_num_of_tests, info[0].hw_ip_version_major,
 			&monitor_child, &test_child);
 	}
 
-	igt_describe("Stressful-and-multiple-cs-of-bad and good length-operations-using-multiple-processes");
-	igt_subtest("amdgpu-compute-CMD_STREAM_EXEC_INVALID_PACKET_LENGTH") {
-		if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
-			set_next_test_to_run(sh_mem, CMD_STREAM_EXEC_INVALID_PACKET_LENGTH,
-						ip_background, ip_test, ring_id_good, ring_id_bad);
-		}
-	}
-
-	igt_describe("Stressful-and-multiple-cs-of-bad and good opcode-operations-using-multiple-processes");
-	igt_subtest("amdgpu-compute-CMD_STREAM_EXEC_INVALID_OPCODE") {
-		if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
-			set_next_test_to_run(sh_mem, CMD_STREAM_EXEC_INVALID_OPCODE,
-						ip_background, ip_test, ring_id_good, ring_id_bad);
-		}
-	}
-
-	/* TODO  not job timeout, debug why for nv32
-	 *igt_describe("Stressful-and-multiple-cs-of-bad and good mem-sync-operations-using-multiple-processes");
-	 *igt_subtest_with_dynamic("amdgpu-compute-CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC") {
-	 *	if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
-	 *		igt_dynamic_f("amdgpu-compute-CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC")
-	 *			set_next_test_to_run(sh_mem, CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC,
-	 *					ip_background, ip_test, ring_id_good, ring_id_bad);
-	 *	}
-	 */
-
-	/* TODO  amdgpu: device lost from bus! for nv32
-	 *igt_describe("Stressful-and-multiple-cs-of-bad and good reg-operations-using-multiple-processes");
-	 *igt_subtest_with_dynamic("amdgpu-compute-CMD_STREAM_TRANS_BAD_REG_ADDRESS") {
-	 *	if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
-	 *		igt_dynamic_f("amdgpu-compute-CMD_STREAM_TRANS_BAD_MEM_ADDRESS")
-	 *			set_next_test_to_run(sh_mem, CMD_STREAM_TRANS_BAD_REG_ADDRESS,
-	 *					ip_background, ip_test, ring_id_good, ring_id_bad);
-	 *	}
-	 */
-
-	//amdgpu_ring_soft_recovery
-	igt_describe("Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes");
-	igt_subtest("Handful-by-soft-recovery-amdgpu-compute-BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR") {
-		if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
-			set_next_test_to_run(sh_mem, BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR,
-						ip_background, ip_test, ring_id_good, ring_id_bad);
-		}
-	}
-
-	igt_describe("Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes");
-	igt_subtest("amdgpu-compute-BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING") {
-		if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
-			set_next_test_to_run(sh_mem, BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING,
-						ip_background, ip_test, ring_id_good, ring_id_bad);
-		}
-	}
-
-	igt_describe("Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes");
-	igt_subtest("amdgpu-compute-BACKEND_SE_GC_SHADER_INVALID_USER_DATA") {
-		if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
-			set_next_test_to_run(sh_mem, BACKEND_SE_GC_SHADER_INVALID_USER_DATA,
-						ip_background, ip_test, ring_id_good, ring_id_bad);
+	for (int i = 0; i < ARRAY_SIZE(ip_tests); i++) {
+		for (struct dynamic_test *it = &arr_err[0]; it->name; it++) {
+			igt_describe("Stressful-and-multiple-cs-of-bad and good length-operations-using-multiple-processes");
+			igt_subtest_with_dynamic_f("amdgpu-%s-%s", ip_tests[i] == AMD_IP_COMPUTE ? "compute":"gfx", it->name) {
+				if (arr_cap[ip_tests[i]] && get_next_rings(ring_id_good, info, &ring_id_good, &ring_id_bad, i)) {
+					igt_dynamic_f("amdgpu-%s", it->name);
+					set_next_test_to_run(sh_mem, it->test,
+							ip_background, ip_tests[i], ring_id_good, ring_id_bad);
+				}
+			}
 		}
 	}
 
-- 
2.25.1



More information about the igt-dev mailing list