[PATCH] tests/amd_queue_reset: refactor monitoring child process into a worker thread
Zhang, Jesse(Jie)
Jesse.Zhang at amd.com
Wed Sep 18 01:25:05 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
This patch look good to me,
Reviewed-by : Jesse.zhang <Jesse.zhang at amd.com>
-----Original Message-----
From: vitaly.prosyak at amd.com <vitaly.prosyak at amd.com>
Sent: Monday, September 16, 2024 9:33 AM
To: igt-dev at lists.freedesktop.org
Cc: Prosyak, Vitaly <Vitaly.Prosyak at amd.com>; Zhang, Jesse(Jie) <Jesse.Zhang at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
Subject: [PATCH] tests/amd_queue_reset: refactor monitoring child process into a worker thread
From: Vitaly Prosyak <vitaly.prosyak at amd.com>
This refactor was necessary because AMDGPU contexts cannot be exported or imported between processes. Previously, we used fork to share the context between child processes (test and monitoring), which was essential for querying the context status of failed jobs to validate GPU or queue reset progression using flags.
However, not all failures result in a queue reset. In cases of a GPU reset, the entire VRAM is lost, and previously created contexts could not be used in subsequent tests due to VM generation token mismatches. As a result, we had two options: eliminate the monitoring process entirely or refactor its functionality into a worker thread within the test process. We chose the latter. Now, the worker thread of the test process handles the monitoring tasks, ensuring smooth context management and satisfying all requirements.
Cc: Jesse Zhang <jesse.zhang at amd.com>
Cc: Alex Deucher <alexander.deucher at amd.com>
Cc: Christian Koenig <christian.koenig at amd.com>
Signed-off-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
---
tests/amdgpu/amd_queue_reset.c | 224 +++++++++++++++++++--------------
1 file changed, 133 insertions(+), 91 deletions(-)
diff --git a/tests/amdgpu/amd_queue_reset.c b/tests/amdgpu/amd_queue_reset.c index 8162e2186..d130c16a6 100644
--- a/tests/amdgpu/amd_queue_reset.c
+++ b/tests/amdgpu/amd_queue_reset.c
@@ -32,7 +32,7 @@
#define SHARED_CHILD_DESCRIPTOR 3
-#define TEST_TIMEOUT 100 //100 seconds
+#define TEST_TIMEOUT 20 //100 seconds
enum process_type {
PROCESS_UNKNOWN,
@@ -80,6 +80,56 @@ struct shmbuf {
};
+// Local memory structure for communication between child process and
+worker thread struct localbuf {
+ pthread_mutex_t mutex; // Mutex for condition variable
+ pthread_cond_t cond_var; // Condition variable to signal context availability
+ int context_ready; // Flag to indicate if the context is ready
+ amdgpu_context_handle amdgpu_context;// Pointer to amdgpu context };
+
+struct thread_params {
+ amdgpu_device_handle device;
+ amdgpu_context_handle *arr_context;
+ struct shmbuf *sh_mem;
+ int num_of_tests;
+ struct localbuf local_mem;
+
+};
+
+static struct thread_params*
+allocate_thread_param(amdgpu_device_handle device, struct shmbuf *sh_mem,
+ int num_of_tests)
+{
+ struct thread_params *param = NULL;
+
+ param = malloc(sizeof(struct thread_params));
+ memset(param, 0, sizeof(struct thread_params));
+
+ param->device = device;
+ param->sh_mem = sh_mem;
+ param->num_of_tests = num_of_tests;
+ pthread_mutex_init(¶m->local_mem.mutex, NULL);
+ pthread_cond_init(¶m->local_mem.cond_var, NULL);
+ param->local_mem.amdgpu_context = NULL;
+ param->local_mem.context_ready = 0;
+
+ return param;
+}
+
+static void
+free_thread_param(struct thread_params *param) {
+ if (param) {
+ if (param->arr_context)
+ free(param->arr_context);
+
+ pthread_cond_destroy(¶m->local_mem.cond_var);
+ pthread_mutex_destroy(¶m->local_mem.mutex);
+ free(param);
+ }
+}
+
static inline
void set_bit(int nr, uint32_t *addr)
{
@@ -548,8 +598,8 @@ amdgpu_write_linear(amdgpu_device_handle device, amdgpu_context_handle context_h }
static int
-run_monitor_child(amdgpu_device_handle device, amdgpu_context_handle *arr_context,
- struct shmbuf *sh_mem, int num_of_tests)
+run_monitor_child(amdgpu_device_handle device, struct shmbuf *sh_mem,
+ int num_of_tests, struct thread_params *param)
{
int ret;
int test_counter = 0;
@@ -561,27 +611,32 @@ run_monitor_child(amdgpu_device_handle device, amdgpu_context_handle *arr_contex
int64_t cnt = 0;
time_t start, end;
double elapsed = 0;
+ amdgpu_context_handle local_context = NULL;
after_reset_state = after_reset_hangs = 0;
init_flags = in_process_flags = 0;
- ret = amdgpu_cs_query_reset_state2(arr_context[0], &init_flags);
- if (init_flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS)
- igt_assert_eq(init_flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS, 0);
-
while (num_of_tests > 0) {
sync_point_enter(sh_mem);
state_machine = 0;
error_code = 0;
flags = 0;
set_reset_state(sh_mem, false, ALL_RESET_BITS);
+
+ pthread_mutex_lock(¶m->local_mem.mutex);
+ while (!param->local_mem.context_ready)
+ pthread_cond_wait(¶m->local_mem.cond_var,
+¶m->local_mem.mutex);
+
+ local_context = param->local_mem.amdgpu_context;
+ pthread_mutex_unlock(¶m->local_mem.mutex);
+
time(&start);
while (1) {
if (is_subtest_skipped(sh_mem))
break;
if (state_machine == 0) {
- amdgpu_cs_query_reset_state2(arr_context[test_counter], &init_flags);
+ amdgpu_cs_query_reset_state2(local_context, &init_flags);
if (init_flags & AMDGPU_CTX_QUERY2_FLAGS_RESET)
state_machine = 1;
@@ -590,9 +645,9 @@ run_monitor_child(amdgpu_device_handle device, amdgpu_context_handle *arr_contex
state_machine = 2;
} else if (state_machine == 1) {
- amdgpu_cs_query_reset_state(arr_context[test_counter],
+ amdgpu_cs_query_reset_state(local_context,
&after_reset_state, &after_reset_hangs);
- amdgpu_cs_query_reset_state2(arr_context[test_counter],
+ amdgpu_cs_query_reset_state2(local_context,
&in_process_flags);
//TODO refactor this block !
@@ -613,9 +668,9 @@ run_monitor_child(amdgpu_device_handle device, amdgpu_context_handle *arr_contex
}
}
} else if (state_machine == 2) {
- amdgpu_cs_query_reset_state(arr_context[test_counter],
+ amdgpu_cs_query_reset_state(local_context,
&after_reset_state, &after_reset_hangs);
- amdgpu_cs_query_reset_state2(arr_context[test_counter],
+ amdgpu_cs_query_reset_state2(local_context,
&in_process_flags);
/* here we should start timer and wait for some time until
* the flag AMDGPU_CTX_QUERY2_FLAGS_RESET disappear @@ -643,13 +698,22 @@ run_monitor_child(amdgpu_device_handle device, amdgpu_context_handle *arr_contex
return ret;
}
+static void *
+worker_thread(void *arg)
+{
+ struct thread_params *param = (struct thread_params *)arg;
+
+ run_monitor_child(param->device, param->sh_mem, param->num_of_tests,
+param);
+
+ return NULL;
+}
static int
-run_test_child(amdgpu_device_handle device, amdgpu_context_handle *arr_context,
- struct shmbuf *sh_mem, int num_of_tests, uint32_t version)
+run_test_child(amdgpu_device_handle device, struct shmbuf *sh_mem,
+ int num_of_tests, uint32_t version, struct thread_params *param)
{
- int ret;
+ int ret, r;
bool bool_ret;
int test_counter = 0;
char error_str[128];
@@ -659,6 +723,7 @@ run_test_child(amdgpu_device_handle device, amdgpu_context_handle *arr_context,
struct job_struct job;
const struct amdgpu_ip_block_version *ip_block_test = NULL;
struct amdgpu_cs_err_codes err_codes;
+ amdgpu_context_handle local_context = NULL;
while (num_of_tests > 0) {
sync_point_enter(sh_mem);
@@ -676,11 +741,19 @@ run_test_child(amdgpu_device_handle device, amdgpu_context_handle *arr_context,
err_codes.err_code_cs_submit = 0;
err_codes.err_code_wait_for_fence = 0;
+ r = amdgpu_cs_ctx_create(device, &local_context);
+ igt_assert_eq(r, 0);
+ pthread_mutex_lock(¶m->local_mem.mutex);
+ param->local_mem.amdgpu_context = local_context;
+ param->local_mem.context_ready = 1; // Mark the context as ready
+ pthread_cond_signal(¶m->local_mem.cond_var); // Notify the worker thread
+ pthread_mutex_unlock(¶m->local_mem.mutex);
+
if (is_dispatch) {
- ret = amdgpu_memcpy_dispatch_test(device, arr_context[test_counter], job.ip, job.ring_id, version,
+ ret = amdgpu_memcpy_dispatch_test(device, local_context, job.ip,
+job.ring_id, version,
job.error, &err_codes);
} else {
- ret = amdgpu_write_linear(device, arr_context[test_counter],
+ ret = amdgpu_write_linear(device, local_context,
ip_block_test, &job, &err_codes);
}
@@ -692,11 +765,21 @@ run_test_child(amdgpu_device_handle device, amdgpu_context_handle *arr_context,
break;
sleep(1);
}
- /* validate expected result */
- //igt_assert_eq(err_codes.err_code_wait_for_fence, job.reset_err_result);
+
sync_point_exit(sh_mem);
+ pthread_mutex_lock(¶m->local_mem.mutex);
+ param->local_mem.context_ready = 0;
+ param->local_mem.amdgpu_context = NULL;
+ amdgpu_cs_ctx_free(local_context);
+ pthread_mutex_unlock(¶m->local_mem.mutex);
+
test_counter++;
+ igt_assert_eq(err_codes.err_code_cs_submit, 0);
+ igt_assert_eq(err_codes.err_code_wait_for_fence, job.reset_err_result);
+ //igt_info("====Test %s ip %s wait_for_fence %d==============\n", error_str,
+ // job.ip == AMD_IP_GFX? "AMD_IP_GFX" : "AMD_IP_COMPUTE",
+ // err_codes.err_code_wait_for_fence);
}
return ret;
}
@@ -718,10 +801,6 @@ run_background(amdgpu_device_handle device, struct shmbuf *sh_mem,
unsigned int flags;
struct amdgpu_cs_err_codes err_codes;
- r = amdgpu_cs_ctx_create(device, &context_handle);
- igt_assert_eq(r, 0);
-
-
while (num_of_tests > 0) {
sync_point_enter(sh_mem);
if (is_subtest_skipped(sh_mem)) {
@@ -732,6 +811,8 @@ run_background(amdgpu_device_handle device, struct shmbuf *sh_mem,
read_next_job(sh_mem, &job, true);
ip_block_test = get_ip_block(device, job.ip);
is_dispatch_shader_test(job.error, error_str, &is_dispatch);
+ r = amdgpu_cs_ctx_create(device, &context_handle);
+ igt_assert_eq(r, 0);
while (1) {
r = amdgpu_write_linear(device, context_handle, ip_block_test, &job, &err_codes);
@@ -756,37 +837,36 @@ run_background(amdgpu_device_handle device, struct shmbuf *sh_mem,
counter++;
}
+ r = amdgpu_cs_ctx_free(context_handle);
sync_point_exit(sh_mem);
num_of_tests--;
}
- r = amdgpu_cs_ctx_free(context_handle);
return r;
}
-
-
static int
-run_all(amdgpu_device_handle device, amdgpu_context_handle *arr_context_handle,
- enum process_type process, struct shmbuf *sh_mem, int num_of_tests,
- uint32_t version, pid_t *monitor_child, pid_t *test_child)
+run_all(amdgpu_device_handle device, enum process_type process,
+ struct shmbuf *sh_mem, int num_of_tests, uint32_t version,
+ pid_t *monitor_child, pid_t *test_child)
{
+ struct thread_params *param;
+ pthread_t worker;
+
if (process == PROCESS_TEST) {
- *monitor_child = fork();
- if (*monitor_child == -1) {
- igt_fail(IGT_EXIT_FAILURE);
- } else if (*monitor_child == 0) {
- *monitor_child = getppid();
- run_monitor_child(device, arr_context_handle, sh_mem, num_of_tests);
- igt_success();
- igt_exit();
- }
*test_child = fork();
if (*test_child == -1) {
igt_fail(IGT_EXIT_FAILURE);
} else if (*test_child == 0) {
*test_child = getppid();
- run_test_child(device, arr_context_handle, sh_mem, num_of_tests, version);
+ param = allocate_thread_param(device, sh_mem, num_of_tests);
+ if (pthread_create(&worker, NULL, worker_thread, param) != 0) {
+ igt_info("********Failed to create worker thread");
+ igt_fail(IGT_EXIT_FAILURE);
+ }
+ run_test_child(device, sh_mem, num_of_tests, version, param);
+ pthread_join(worker, NULL);
+ free_thread_param(param);
igt_success();
igt_exit();
@@ -909,9 +989,9 @@ is_run_device_parameter_found(int argc, char **argv)
if (strcmp(ONDEVICE, argv[i]) == 0) {
/* Get the sum for a specific device as a unique identifier */
p = argv[i+1];
- while(*p){
- res += *p;
- p++;
+ while (*p) {
+ res += *p;
+ p++;
}
break;
}
@@ -967,35 +1047,6 @@ launch_background_process(int argc, char **argv, char *path, pid_t *ppid, int sh
return status;
}
-static void
-create_contexts(amdgpu_device_handle device, amdgpu_context_handle **pp_contexts,
- int num_of_contexts)
-{
- amdgpu_context_handle *p_contexts = NULL;
- int i, r;
-
- p_contexts = (amdgpu_context_handle *)malloc(sizeof(amdgpu_context_handle)
- *num_of_contexts);
-
- for (i = 0; i < num_of_contexts; i++) {
- r = amdgpu_cs_ctx_create(device, &p_contexts[i]);
- igt_assert_eq(r, 0);
- }
- *pp_contexts = p_contexts;
-
-}
-static void
-free_contexts(amdgpu_device_handle device, amdgpu_context_handle *p_contexts,
- int num_of_contexts)
-{
- int i;
-
- if (p_contexts) {
- for (i = 0; i < num_of_contexts; i++)
- amdgpu_cs_ctx_free(p_contexts[i]);
- }
-}
-
static bool
adjust_begin_index(unsigned int *ring_begin, unsigned int available_good_rings) { @@ -1095,19 +1146,13 @@ igt_main
enum amd_ip_block_type ip_tests[2] = {AMD_IP_COMPUTE/*keep first*/, AMD_IP_GFX};
enum amd_ip_block_type ip_background = AMD_IP_COMPUTE;
- amdgpu_context_handle *arr_context_handle = NULL;
-
- /* TODO remove this , it is used only to create array of contexts
- * which are shared between child processes ( test/monitor/main and
- * separate for background
- */
struct dynamic_test arr_err[] = {
{CMD_STREAM_EXEC_INVALID_PACKET_LENGTH, "CMD_STREAM_EXEC_INVALID_PACKET_LENGTH",
"Stressful-and-multiple-cs-of-bad and good length-operations-using-multiple-processes",
- { { FAMILY_GFX1100, 0x1, 0xFF }, {FAMILY_AI, 0x32, 0xFF }, {FAMILY_AI, 0x3C, 0xFF } } , {-ECANCELED, -ECANCELED, 0 } },
+ { {FAMILY_GFX1100, 0x1, 0xFF }, {FAMILY_AI, 0x32, 0xFF },
+{FAMILY_AI, 0x3C, 0xFF } }, {-ECANCELED, -ECANCELED, -ECANCELED } },
{CMD_STREAM_EXEC_INVALID_OPCODE, "CMD_STREAM_EXEC_INVALID_OPCODE",
"Stressful-and-multiple-cs-of-bad and good opcode-operations-using-multiple-processes",
- { {FAMILY_UNKNOWN, -1, -1 }, {FAMILY_UNKNOWN, -1, -1 }, {FAMILY_UNKNOWN, -1, -1 } } , { 0, -ECANCELED, -ECANCELED } },
+ { {FAMILY_UNKNOWN, -1, -1 }, {FAMILY_UNKNOWN, -1, -1 },
+{FAMILY_UNKNOWN, -1, -1 } }, {-ECANCELED, -ECANCELED, -ECANCELED } },
//TODO not job timeout, debug why for n31.
//{CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC,"CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC",
// "Stressful-and-multiple-cs-of-bad and good mem-sync-operations-using-multiple-processes"},
@@ -1116,16 +1161,16 @@ igt_main
// "Stressful-and-multiple-cs-of-bad and good reg-operations-using-multiple-processes"},
{BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR, "BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR",
"Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes",
- { {FAMILY_UNKNOWN, 0x1, 0x10 }, {FAMILY_AI, 0x32, 0x3C }, {FAMILY_AI, 0x3C, 0xFF } } , { -ECANCELED, -ECANCELED, -ECANCELED } },
+ { {FAMILY_UNKNOWN, 0x1, 0x10 }, {FAMILY_AI, 0x32, 0x3C },
+{FAMILY_AI, 0x3C, 0xFF } }, {-ENODATA, -ENODATA, -ENODATA } },
//TODO KGQ cannot recover by queue reset, it maybe need a fw bugfix on naiv31
//{BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING,"BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING",
// "Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes"},
{BACKEND_SE_GC_SHADER_INVALID_USER_DATA, "BACKEND_SE_GC_SHADER_INVALID_USER_DATA",
"Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes",
- { {FAMILY_UNKNOWN, -1, -1 }, {FAMILY_AI, 0x32, 0x3C }, {FAMILY_AI, 0x3C, 0xFF } } , { -ECANCELED, -ECANCELED, -ECANCELED } },
+ { {FAMILY_UNKNOWN, -1, -1 }, {FAMILY_AI, 0x32, 0x3C }, {FAMILY_AI,
+0x3C, 0xFF } }, {-ENODATA, -ENODATA, -ENODATA } },
{BACKEND_SE_GC_SHADER_INVALID_SHADER, "BACKEND_SE_GC_SHADER_INVALID_SHADER",
"Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes",
- { {FAMILY_UNKNOWN, 0x1, 0x10 }, {FAMILY_AI, 0x32, 0x3C }, {FAMILY_AI, 0x3C, 0xFF } } , { -ECANCELED, -ECANCELED, -ECANCELED } },
+ { {FAMILY_UNKNOWN, 0x1, 0x10 }, {FAMILY_AI, 0x32, 0x3C },
+{FAMILY_AI, 0x3C, 0xFF } }, {-ENODATA, -ENODATA, -ENODATA } },
{}
};
@@ -1146,7 +1191,7 @@ igt_main
const_num_of_tests = (sizeof(arr_err)/sizeof(struct dynamic_test) - 1) * ARRAY_SIZE(ip_tests);
r = is_run_device_parameter_found(argc, argv);
- snprintf(shm_name,sizeof(shm_name),"/queue_reset_shm_%d",r);
+ snprintf(shm_name, sizeof(shm_name), "/queue_reset_shm_%d", r);
fd = drm_open_driver(DRIVER_AMDGPU);
err = amdgpu_device_initialize(fd, &major, &minor, &device); @@ -1175,19 +1220,17 @@ igt_main
} else {
process = PROCESS_BACKGROUND;
}
- if (process == PROCESS_TEST)
- create_contexts(device, &arr_context_handle, const_num_of_tests);
- else if (process == PROCESS_BACKGROUND)
+ if (process == PROCESS_BACKGROUND)
fd_shm = shared_mem_open(&sh_mem);
+
igt_require(fd_shm != -1);
igt_require(sh_mem != NULL);
- run_all(device, arr_context_handle,
- process, sh_mem, const_num_of_tests, info[0].hw_ip_version_major,
- &monitor_child, &test_child);
+ run_all(device, process, sh_mem, const_num_of_tests,
+ info[0].hw_ip_version_major, &monitor_child, &test_child);
}
//print expect error table for each test
- if(!argc) {// --list-subtests arg is 0
+ if (!argc) {// --list-subtests arg is 0
for (int i = 0; i < ARRAY_SIZE(ip_tests); i++)
for (struct dynamic_test *it = &arr_err[0]; it->name; it++) {
expect_error = ip_tests[i] == AMD_IP_COMPUTE ?
@@ -1224,7 +1267,6 @@ igt_main
waitpid(test_child, &testExitMethod, 0);
}
waitpid(pid_background, &backgrounExitMethod, 0);
- free_contexts(device, arr_context_handle, const_num_of_tests);
amdgpu_device_deinitialize(device);
drm_close_driver(fd);
shared_mem_destroy(sh_mem, fd_shm, true, shm_name);
--
2.25.1
More information about the igt-dev
mailing list