[PATCH] tests/amd_queue_reset: add ability to skip subtest
Zhang, Jesse(Jie)
Jesse.Zhang at amd.com
Fri Aug 23 01:54:15 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
This patch looks good to me.
Reviewed-by: Jesse Zhang <jesse.zhang at amd.com>
-----Original Message-----
From: vitaly.prosyak at amd.com <vitaly.prosyak at amd.com>
Sent: Friday, August 23, 2024 8:04 AM
To: igt-dev at lists.freedesktop.org
Cc: Prosyak, Vitaly <Vitaly.Prosyak at amd.com>; Zhang, Jesse(Jie) <Jesse.Zhang at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
Subject: [PATCH] tests/amd_queue_reset: add ability to skip subtest
From: Vitaly Prosyak <vitaly.prosyak at amd.com>
Some ASICs may lack the functionality needed for specific error types, such as CMD_STREAM_EXEC_INVALID_PACKET_LENGTH. To still be able to test the rest of the queue/pipe reset error table on those ASICs, we’ve added a per-subtest exclude filter that allows us to skip certain subtests based on family and chip ID.
The complexity lies in keeping the three other processes (test, background, and monitoring) synchronized with the main process when a subtest is skipped, so that each subtest can be skipped when necessary while the next subtest is still properly executed.
Cc: Jesse Zhang <jesse.zhang at amd.com>
Cc: Alex Deucher <alexander.deucher at amd.com>
Cc: Christian Koenig <christian.koenig at amd.com>
Signed-off-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
---
lib/amdgpu/amd_ip_blocks.h | 11 ++++
tests/amdgpu/amd_queue_reset.c | 115 +++++++++++++++++++++++++++------
2 files changed, 108 insertions(+), 18 deletions(-)
diff --git a/lib/amdgpu/amd_ip_blocks.h b/lib/amdgpu/amd_ip_blocks.h index 1b39d9945..86b82c40c 100644
--- a/lib/amdgpu/amd_ip_blocks.h
+++ b/lib/amdgpu/amd_ip_blocks.h
@@ -45,11 +45,22 @@ enum cmd_error_type {
BACKEND_SE_GC_SHADER_INVALID_USER_DATA /* COMPUTE_USER_DATA */ };
+#define _MAX_NUM_ASIC_ID_EXCLUDE_FILTER 3
+
+struct asic_id_filter
+{
+ int family_id;
+ int chip_id_begin;
+ int chip_id_end;
+};
+
struct dynamic_test{
enum cmd_error_type test;
const char *name;
const char *describe;
+ struct asic_id_filter exclude_filter[_MAX_NUM_ASIC_ID_EXCLUDE_FILTER];
};
+
#define for_each_test(t, T) for(typeof(*T) *t = T; t->name; t++)
/* aux struct to hold misc parameters for convenience to maintain */ diff --git a/tests/amdgpu/amd_queue_reset.c b/tests/amdgpu/amd_queue_reset.c index 6a094d812..a3bc37018 100644
--- a/tests/amdgpu/amd_queue_reset.c
+++ b/tests/amdgpu/amd_queue_reset.c
@@ -25,6 +25,7 @@
#include "lib/amdgpu/amd_command_submission.h"
#include "lib/amdgpu/amd_deadlock_helpers.h"
#include "lib/amdgpu/amd_dispatch.h"
+#include "lib/amdgpu/amdgpu_asic_addr.h"
#define NUM_CHILD_PROCESSES 4
@@ -67,7 +68,8 @@ struct shmbuf {
sem_t sync_sem_enter;
sem_t sync_sem_exit;
int count;
- bool test_completed;
+ bool sub_test_completed;
+ bool sub_test_is_skipped;
unsigned int test_flags;
int test_error_code;
bool reset_completed;
@@ -123,18 +125,38 @@ get_reset_state(struct shmbuf *sh_mem, unsigned int *flags)
bool reset_state;
sem_wait(&sh_mem->sem_state_mutex);
- reset_state = sh_mem->reset_completed;
+ reset_state = sh_mem->reset_completed || sh_mem->sub_test_is_skipped;
*flags = sh_mem->reset_flags;
sem_post(&sh_mem->sem_state_mutex);
return reset_state;
}
+static bool
+is_subtest_skipped(struct shmbuf *sh_mem) {
+ bool skipped;
+
+ sem_wait(&sh_mem->sem_state_mutex);
+ skipped = sh_mem->sub_test_is_skipped;
+ sem_post(&sh_mem->sem_state_mutex);
+
+ return skipped;
+}
+
+static void
+skip_sub_test(struct shmbuf *sh_mem)
+{
+ sem_wait(&sh_mem->sem_state_mutex);
+ sh_mem->sub_test_is_skipped = true;
+ sem_post(&sh_mem->sem_state_mutex);
+}
+
static void
set_test_state(struct shmbuf *sh_mem, bool test_state,
int error_code, enum error_code_bits bit) {
sem_wait(&sh_mem->sem_state_mutex);
- sh_mem->test_completed = test_state;
+ sh_mem->sub_test_completed = test_state;
sh_mem->test_error_code = error_code;
if (test_state)
set_bit(bit, &sh_mem->test_flags);
@@ -143,15 +165,13 @@ set_test_state(struct shmbuf *sh_mem, bool test_state,
sem_post(&sh_mem->sem_state_mutex);
}
-
-
static bool
get_test_state(struct shmbuf *sh_mem, int *error_code, unsigned int *flags) {
bool test_state;
sem_wait(&sh_mem->sem_state_mutex);
- test_state = sh_mem->test_completed;
+ test_state = sh_mem->sub_test_completed ||
+sh_mem->sub_test_is_skipped;
*error_code = sh_mem->test_error_code;
*flags = sh_mem->test_flags;
sem_post(&sh_mem->sem_state_mutex);
@@ -306,6 +326,7 @@ static void set_next_test_to_run(struct shmbuf *sh_mem, unsigned int error,
sh_mem->good_job.error = CMD_STREAM_EXEC_SUCCESS;
sh_mem->good_job.ip = ip_good;
sh_mem->good_job.ring_id = ring_id_good;
+ sh_mem->sub_test_is_skipped = false;
sem_post(&sh_mem->sem_state_mutex);
//sync and wait for complete
@@ -316,6 +337,14 @@ static void set_next_test_to_run(struct shmbuf *sh_mem, unsigned int error,
"Testing does not trigger reset \n"); }
+static void set_next_test_to_skip(struct shmbuf *sh_mem) {
+ skip_sub_test(sh_mem);
+ sync_point_enter(sh_mem);
+ wait_for_complete_iteration(sh_mem);
+ sync_point_exit(sh_mem);
+}
+
static int
shared_mem_destroy(struct shmbuf *shmp, int shm_fd, bool unmap) { @@ -373,8 +402,9 @@ shared_mem_create(struct shmbuf **ppbuf)
goto error;
shmp->count = 0;
- shmp->test_completed = false;
+ shmp->sub_test_completed = false;
shmp->reset_completed = false;
+ shmp->sub_test_is_skipped = false;
*ppbuf = shmp;
return shm_fd;
@@ -415,6 +445,37 @@ is_queue_reset_tests_enable(const struct amdgpu_gpu_info *gpu_info, uint32_t ver
return enable;
}
+static bool
+is_sub_test_queue_reset_enable(const struct amdgpu_gpu_info *gpu_info,
+ struct asic_id_filter exclude_filter[_MAX_NUM_ASIC_ID_EXCLUDE_FILTER],
+ const struct dynamic_test *it)
+{
+ int i;
+ bool enable = true;
+ int chip_id;
+ char error_str[128];
+ bool is_dispatch;
+
+ for (i = 0; i < _MAX_NUM_ASIC_ID_EXCLUDE_FILTER; i++) {
+ if (gpu_info->family_id == exclude_filter[i].family_id) {
+ chip_id = gpu_info->chip_external_rev - gpu_info->chip_rev;
+ if (chip_id >= exclude_filter[i].chip_id_begin &&
+ chip_id < exclude_filter[i].chip_id_end) {
+ enable = false;
+ is_dispatch_shader_test(it->test, error_str, &is_dispatch);
+ igt_info("PID %d SKIP subtest %s CHIP family (%s) %d chip %d, begin end [%d %d] excluded\n",
+ getpid(), error_str, g_pChip->name,
+ gpu_info->family_id, chip_id,
+ exclude_filter[i].chip_id_begin,
+ exclude_filter[i].chip_id_end);
+ break;
+ }
+ }
+ }
+
+ return enable;
+}
+
static int
amdgpu_write_linear(amdgpu_device_handle device, amdgpu_context_handle context_handle,
const struct amdgpu_ip_block_version *ip_block, @@ -501,6 +562,9 @@ run_monitor_child(amdgpu_device_handle device, amdgpu_context_handle *arr_contex
set_reset_state(sh_mem, false, ALL_RESET_BITS);
time(&start);
while (1) {
+ if (is_subtest_skipped(sh_mem))
+ break;
+
if (state_machine == 0) {
amdgpu_cs_query_reset_state2(arr_context[test_counter], &init_flags);
@@ -550,7 +614,7 @@ run_monitor_child(amdgpu_device_handle device, amdgpu_context_handle *arr_contex
if (cnt % 1000000 == 0) {
time(&end);
elapsed = difftime(end, start);
- if ( elapsed >= TEST_TIMEOUT) {
+ if (elapsed >= TEST_TIMEOUT) {
set_reset_state(sh_mem, true, NO_RESET_SET_BIT);
break;
}
@@ -582,6 +646,12 @@ run_test_child(amdgpu_device_handle device, amdgpu_context_handle *arr_context,
while (num_of_tests > 0) {
sync_point_enter(sh_mem);
+ if (is_subtest_skipped(sh_mem)) {
+ sync_point_exit(sh_mem);
+ num_of_tests--;
+ test_counter++;
+ continue;
+ }
set_test_state(sh_mem, false, 0, ERROR_CODE_SET_BIT);
read_next_job(sh_mem, &job, false);
bool_ret = is_dispatch_shader_test(job.error, error_str, &is_dispatch); @@ -631,6 +701,11 @@ run_background(amdgpu_device_handle device, struct shmbuf *sh_mem,
while (num_of_tests > 0) {
sync_point_enter(sh_mem);
+ if (is_subtest_skipped(sh_mem)) {
+ sync_point_exit(sh_mem);
+ num_of_tests--;
+ continue;
+ }
read_next_job(sh_mem, &job, true);
ip_block_test = get_ip_block(device, job.ip);
is_dispatch_shader_test(job.error, error_str, &is_dispatch); @@ -979,9 +1054,11 @@ igt_main
*/
struct dynamic_test arr_err[] = {
{CMD_STREAM_EXEC_INVALID_PACKET_LENGTH, "CMD_STREAM_EXEC_INVALID_PACKET_LENGTH",
- "Stressful-and-multiple-cs-of-bad and good length-operations-using-multiple-processes"},
+ "Stressful-and-multiple-cs-of-bad and good length-operations-using-multiple-processes",
+ { {FAMILY_UNKNOWN, 0x1, 0x10 }, {FAMILY_AI, 0x32, 0x3C },
+{FAMILY_AI, 0x3C, 0xFF } } },
{CMD_STREAM_EXEC_INVALID_OPCODE, "CMD_STREAM_EXEC_INVALID_OPCODE",
- "Stressful-and-multiple-cs-of-bad and good opcode-operations-using-multiple-processes"},
+ "Stressful-and-multiple-cs-of-bad and good opcode-operations-using-multiple-processes",
+ { {FAMILY_UNKNOWN, -1, -1 }, {FAMILY_UNKNOWN, -1, -1 },
+{FAMILY_UNKNOWN, -1, -1 } } },
//TODO not job timeout, debug why for n31.
//{CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC,"CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC",
// "Stressful-and-multiple-cs-of-bad and good mem-sync-operations-using-multiple-processes"},
@@ -989,14 +1066,17 @@ igt_main
//{CMD_STREAM_TRANS_BAD_REG_ADDRESS,"CMD_STREAM_TRANS_BAD_REG_ADDRESS",
// "Stressful-and-multiple-cs-of-bad and good reg-operations-using-multiple-processes"},
{BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR, "BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR",
- "Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
+ "Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes",
+ { {FAMILY_UNKNOWN, 0x1, 0x10 }, {FAMILY_AI, 0x32, 0x3C },
+{FAMILY_AI, 0x3C, 0xFF } } },
//TODO KGQ cannot recover by queue reset, it maybe need a fw bugfix on naiv31
//{BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING,"BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING",
// "Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
{BACKEND_SE_GC_SHADER_INVALID_USER_DATA, "BACKEND_SE_GC_SHADER_INVALID_USER_DATA",
- "Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
+ "Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes",
+ { {FAMILY_UNKNOWN, -1, -1 }, {FAMILY_UNKNOWN, -1, -1 },
+{FAMILY_UNKNOWN, -1, -1 } } },
{BACKEND_SE_GC_SHADER_INVALID_SHADER, "BACKEND_SE_GC_SHADER_INVALID_SHADER",
- "Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
+ "Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes",
+ { {FAMILY_UNKNOWN, 0x1, 0x10 }, {FAMILY_AI, 0x32, 0x3C },
+{FAMILY_AI, 0x3C, 0xFF } } },
{}
};
@@ -1021,9 +1101,6 @@ igt_main
err = amdgpu_device_initialize(fd, &major, &minor, &device);
igt_require(err == 0);
- igt_info("Initialized amdgpu, driver version %d.%d\n",
- major, minor);
-
r = amdgpu_query_gpu_info(device, &gpu_info);
igt_assert_eq(r, 0);
@@ -1065,16 +1142,18 @@ igt_main
for (struct dynamic_test *it = &arr_err[0]; it->name; it++) {
igt_describe("Stressful-and-multiple-cs-of-bad-and-good-length-operations-using-multiple-processes");
igt_subtest_with_dynamic_f("amdgpu-%s-%s", ip_tests[i] == AMD_IP_COMPUTE ? "COMPUTE":"GFX", it->name) {
- if (arr_cap[ip_tests[i]] && get_next_rings(&ring_id_good, &ring_id_bad, info[0].available_rings,
+ if (arr_cap[ip_tests[i]] && is_sub_test_queue_reset_enable(&gpu_info, it->exclude_filter, it) &&
+ get_next_rings(&ring_id_good, &ring_id_bad,
+info[0].available_rings,
info[i].available_rings, ip_background != ip_tests[i], &ring_id_job_good, &ring_id_job_bad)) {
igt_dynamic_f("amdgpu-%s-ring-good-%d-bad-%d-%s", it->name, ring_id_job_good, ring_id_job_bad,
ip_tests[i] == AMD_IP_COMPUTE ? "COMPUTE":"GFX")
set_next_test_to_run(sh_mem, it->test, ip_background, ip_tests[i], ring_id_job_good, ring_id_job_bad);
+ } else {
+ set_next_test_to_skip(sh_mem);
}
}
}
}
-
igt_fixture {
if (process == PROCESS_TEST) {
waitpid(monitor_child, &monitorExitMethod, 0);
--
2.25.1
More information about the igt-dev mailing list