[PATCH i-g-t v3] tests/amdgpu: Add queue reset test
Kamil Konieczny
kamil.konieczny at linux.intel.com
Mon Aug 5 18:56:01 UTC 2024
Hi Jesse,
On 2024-08-02 at 15:07:44 +0800, Jesse Zhang wrote:
> Overview of Queue Reset Test Process:
> - Launch Child Test Process:
> Executes various tests, such as BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR,
> BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING, etc., to evaluate queue reset
> functionality.
> If the amdgpu driver encounters a job timeout, it attempts recovery in the following sequence:
> - Soft reset: Returns an error of -ENODATA for the given bad job. If unsuccessful, a queue reset is attempted.
> - Queue reset: Returns an error of -ENODATA for the given bad job. If unsuccessful, a full GPU reset is attempted.
> - Entire GPU reset: Returns an error of -ECANCELED or -ETIME for the given bad job.
> After each test, the test waits for the selected recovery process to complete using a monitoring process.
>
> - Launch Child Monitoring Process:
> During each test, this process calls amdgpu_cs_query_reset_state2 and communicates with the test process via
> shared memory to obtain the return code once a job is completed. It uses flags AMDGPU_CTX_QUERY2_FLAGS_RESET and
> AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS, along with the return code, to ensure the correct recovery procedure
> (queue reset or entire GPU reset) is executed as required.
>
> - Launch Background Process:
> Utilizes posix_spawn to submit successful jobs to other rings. Communicates with the test and monitoring
> processes via shared memory to determine when background jobs should be interrupted and the next test should be run.
>
> - Main Test Process:
> Manages the above processes and pushes jobs to shared memory for the test process, sending appropriate signals as needed.
>
> - Synchronization:
> Sync points are established between the four processes at the beginning and end of each test. Synchronization is
> implemented using shared memory and unnamed semaphores.
>
> This approach ensures thorough testing and validation of the queue reset functionality by actively monitoring and
> responding to different stages of the reset process.
>
> v2 : Enable queue reset test for libdrm versions > 2.4.104.
> v3 : Fix subtest name duplication issue.
>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Cc: Christian Koenig <christian.koenig at amd.com>
> Signed-off-by: Jesse Zhang <jesse.zhang at amd.com>
> Signed-off-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
> Reviewed-by: Jesse Zhang <jesse.zhang at amd.com>
A Reviewed-by tag should not come from an author of the patch.
> ---
> tests/amdgpu/amd_queue_reset.c | 1051 ++++++++++++++++++++++++++++++++
> tests/amdgpu/meson.build | 5 +
> 2 files changed, 1056 insertions(+)
> create mode 100644 tests/amdgpu/amd_queue_reset.c
>
> diff --git a/tests/amdgpu/amd_queue_reset.c b/tests/amdgpu/amd_queue_reset.c
> new file mode 100644
> index 000000000..c0a83544c
> --- /dev/null
> +++ b/tests/amdgpu/amd_queue_reset.c
> @@ -0,0 +1,1051 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright 2024 Advanced Micro Devices, Inc.
> + */
> +#include <fcntl.h>
> +#include <sys/stat.h>
> +#include <sys/sysmacros.h>
> +#include <signal.h>
> +#include <sys/wait.h>
> +#include <unistd.h>
Move unistd.h to the first position and sort the remaining system headers alphabetically.
> +#include <spawn.h>
> +#include <semaphore.h>
> +#include <errno.h>
> +#include <assert.h>
> +
> +#include <amdgpu.h>
> +#include <amdgpu_drm.h>
> +
> +#include "igt.h"
> +#include "drmtest.h"
Please include igt.h after drmtest.h.
More nits about test names below.
> +
> +#include "lib/amdgpu/amd_PM4.h"
> +#include "lib/amdgpu/amd_ip_blocks.h"
> +#include "lib/amdgpu/amd_memory.h"
> +#include "lib/amdgpu/amd_command_submission.h"
> +#include "lib/amdgpu/amd_deadlock_helpers.h"
> +#include "lib/amdgpu/amd_dispatch.h"
> +
> +#define NUM_CHILD_PROCESSES 4
> +#define SHARED_CHILD_DESCRIPTOR 3
> +
> +#define SHARED_MEM_NAME "/queue_reset_shm"
> +
> +/*
> + * Role of the running process. igt_main selects PROCESS_BACKGROUND when the
> + * "background" parameter is present on the command line and PROCESS_TEST
> + * otherwise.
> + */
> +enum process_type {
> + PROCESS_UNKNOWN,
> + PROCESS_TEST,
> + PROCESS_BACKGROUND,
> +};
> +
> +/* Description of one job that is passed between processes via struct shmbuf. */
> +struct job_struct {
> + unsigned int error; /* error kind to inject; looked up against enum cmd_error_type values */
> + enum amd_ip_block_type ip; /* IP block the job is submitted to */
> + unsigned int ring_id; /* ring the job is submitted on */
> + /* additional data if necessary */
> +};
> +
> +/* Bit numbers used with set_bit()/clear_bit()/test_bit() on shmbuf.test_flags. */
> +enum error_code_bits {
> + ERROR_CODE_SET_BIT,
> +};
> +
> +/*
> + * Bit numbers recorded in shmbuf.reset_flags by the monitoring process.
> + * ALL_RESET_BITS is different in kind: it is the mask value covering the
> + * four bit numbers above (bits 0-3), not a bit number itself.
> + */
> +enum reset_code_bits {
> + QUEUE_RESET_SET_BIT,
> + GPU_RESET_BEGIN_SET_BIT,
> + GPU_RESET_END_SUCCESS_SET_BIT,
> + GPU_RESET_END_FAILURE_SET_BIT,
> +
> + ALL_RESET_BITS = 0xf,
> +};
> +
> +/* Layout of the shared memory object used by all cooperating processes. */
> +struct shmbuf {
> + sem_t sem_mutex; /* taken around updates of count */
> + sem_t sem_state_mutex; /* taken around the test/reset state and the job descriptors */
> + sem_t sync_sem_enter; /* processes block here in sync_point_enter() */
> + sem_t sync_sem_exit; /* processes block here in sync_point_exit() */
> + int count; /* incremented in sync_point_enter(), decremented in sync_point_exit() */
> + bool test_completed; /* written by set_test_state() */
> + unsigned int test_flags; /* bits from enum error_code_bits */
> + int test_error_code; /* return code published by set_test_state() */
> + bool reset_completed; /* written by set_reset_state() */
> + unsigned int reset_flags; /* bits from enum reset_code_bits */
> + struct job_struct bad_job; /* job read with read_next_job(..., false) */
> + struct job_struct good_job; /* job read with read_next_job(..., true) */
> +
> +};
> +
> +/* Turn on bit number @nr in the 32-bit word that @addr points to. */
> +static inline
> +void set_bit(int nr, uint32_t *addr)
> +{
> +	const uint32_t mask = 1U << nr;
> +
> +	*addr = *addr | mask;
> +}
> +
> +/* Turn off bit number @nr in the 32-bit word that @addr points to. */
> +static inline
> +void clear_bit(int nr, uint32_t *addr)
> +{
> +	const uint32_t mask = 1U << nr;
> +
> +	*addr = *addr & ~mask;
> +}
> +
> +/* Return 1 when bit number @nr is set in the word at @addr, 0 otherwise. */
> +static inline
> +int test_bit(int nr, const uint32_t *addr)
> +{
> +	const uint32_t mask = 1U << nr;
> +
> +	return (*addr & mask) ? 1 : 0;
> +}
> +
> +/* Post the semaphore @psem @num_signals times (nothing is posted for values <= 0). */
> +static void
> +sync_point_signal(sem_t *psem, int num_signals)
> +{
> +	int remaining = num_signals;
> +
> +	while (remaining > 0) {
> +		sem_post(psem);
> +		remaining--;
> +	}
> +}
> +
> +/*
> + * Publish the reset state under sem_state_mutex.
> + *
> + * @sh_mem:      shared memory block
> + * @reset_state: new value of reset_completed; true sets the selected bit(s)
> + *               in reset_flags, false clears them
> + * @bit:         bit number from enum reset_code_bits, or ALL_RESET_BITS to
> + *               address every reset bit at once
> + *
> + * ALL_RESET_BITS is a mask (0xf), not a bit number. Shifting by it would
> + * touch bit 15 only and leave the real reset bits untouched, so it is
> + * translated to the mask here.
> + */
> +static void
> +set_reset_state(struct shmbuf *sh_mem, bool reset_state, enum reset_code_bits bit)
> +{
> +	const unsigned int mask = (bit == ALL_RESET_BITS) ?
> +				  (unsigned int)ALL_RESET_BITS : (1U << bit);
> +
> +	sem_wait(&sh_mem->sem_state_mutex);
> +	sh_mem->reset_completed = reset_state;
> +	if (reset_state)
> +		sh_mem->reset_flags |= mask;
> +	else
> +		sh_mem->reset_flags &= ~mask;
> +
> +	sem_post(&sh_mem->sem_state_mutex);
> +}
> +
> +/*
> + * Read the reset state under sem_state_mutex.
> + * Stores the current reset_flags in @flags and returns reset_completed.
> + */
> +static bool
> +get_reset_state(struct shmbuf *sh_mem, unsigned int *flags)
> +{
> +	bool completed;
> +
> +	sem_wait(&sh_mem->sem_state_mutex);
> +	*flags = sh_mem->reset_flags;
> +	completed = sh_mem->reset_completed;
> +	sem_post(&sh_mem->sem_state_mutex);
> +
> +	return completed;
> +}
> +
> +/*
> + * Publish the test state under sem_state_mutex: stores @test_state and
> + * @error_code, then sets (@test_state true) or clears (@test_state false)
> + * bit @bit in test_flags.
> + */
> +static void
> +set_test_state(struct shmbuf *sh_mem, bool test_state,
> +		int error_code, enum error_code_bits bit)
> +{
> +	sem_wait(&sh_mem->sem_state_mutex);
> +
> +	sh_mem->test_error_code = error_code;
> +	sh_mem->test_completed = test_state;
> +
> +	if (!test_state)
> +		clear_bit(bit, &sh_mem->test_flags);
> +	else
> +		set_bit(bit, &sh_mem->test_flags);
> +
> +	sem_post(&sh_mem->sem_state_mutex);
> +}
> +
> +
> +
> +/*
> + * Read the test state under sem_state_mutex.
> + * Stores the published return code in @error_code and test_flags in @flags,
> + * and returns test_completed.
> + */
> +static bool
> +get_test_state(struct shmbuf *sh_mem, int *error_code, unsigned int *flags)
> +{
> +	bool completed;
> +
> +	sem_wait(&sh_mem->sem_state_mutex);
> +	*flags = sh_mem->test_flags;
> +	*error_code = sh_mem->test_error_code;
> +	completed = sh_mem->test_completed;
> +	sem_post(&sh_mem->sem_state_mutex);
> +
> +	return completed;
> +}
> +
> +/*
> + * Barrier entry: block until NUM_CHILD_PROCESSES processes have arrived.
> + *
> + * The counter value is captured while sem_mutex is held. Re-reading
> + * sh_mem->count after releasing the mutex would let several processes
> + * observe the final value and each of them would post the semaphore
> + * NUM_CHILD_PROCESSES times, leaving stale posts for the next sync point.
> + */
> +static void
> +sync_point_enter(struct shmbuf *sh_mem)
> +{
> +	int arrived;
> +
> +	sem_wait(&sh_mem->sem_mutex);
> +	arrived = ++sh_mem->count;
> +	sem_post(&sh_mem->sem_mutex);
> +
> +	/* only the last process to arrive releases everybody, itself included */
> +	if (arrived == NUM_CHILD_PROCESSES)
> +		sync_point_signal(&sh_mem->sync_sem_enter, NUM_CHILD_PROCESSES);
> +
> +	sem_wait(&sh_mem->sync_sem_enter);
> +}
> +
> +/*
> + * Barrier exit: block until every process that entered the sync point has
> + * reached the exit.
> + *
> + * The counter value is captured while sem_mutex is held, so exactly one
> + * process (the one that brings the counter to zero) posts sync_sem_exit.
> + * Re-reading sh_mem->count outside the mutex could make several processes
> + * see zero and over-post the semaphore.
> + */
> +static void
> +sync_point_exit(struct shmbuf *sh_mem)
> +{
> +	int remaining;
> +
> +	sem_wait(&sh_mem->sem_mutex);
> +	remaining = --sh_mem->count;
> +	sem_post(&sh_mem->sem_mutex);
> +
> +	/* only the last process to leave releases everybody, itself included */
> +	if (remaining == 0)
> +		sync_point_signal(&sh_mem->sync_sem_exit, NUM_CHILD_PROCESSES);
> +
> +	sem_wait(&sh_mem->sync_sem_exit);
> +}
> +
> +/*
> + * Look up @err in the table of known error kinds.
> + *
> + * On a match the symbolic name is copied into @error_str, @is_dispatch is
> + * set to the table's shader flag and true is returned. When @err is not in
> + * the table, false is returned and both outputs are left untouched.
> + */
> +static bool
> +is_dispatch_shader_test(unsigned int err, char error_str[128], bool *is_dispatch)
> +{
> +	static const struct error_struct {
> +		enum cmd_error_type err;
> +		bool is_shader_err;
> +		const char *err_str;
> +	} known_errors[] = {
> +		{ CMD_STREAM_EXEC_SUCCESS, false, "CMD_STREAM_EXEC_SUCCESS" },
> +		{ CMD_STREAM_EXEC_INVALID_OPCODE, false, "CMD_STREAM_EXEC_INVALID_OPCODE" },
> +		{ CMD_STREAM_EXEC_INVALID_PACKET_LENGTH, false, "CMD_STREAM_EXEC_INVALID_PACKET_LENGTH" },
> +		{ CMD_STREAM_EXEC_INVALID_PACKET_EOP_QUEUE, false, "CMD_STREAM_EXEC_INVALID_PACKET_EOP_QUEUE" },
> +		{ CMD_STREAM_TRANS_BAD_REG_ADDRESS, false, "CMD_STREAM_TRANS_BAD_REG_ADDRESS" },
> +		{ CMD_STREAM_TRANS_BAD_MEM_ADDRESS, false, "CMD_STREAM_TRANS_BAD_MEM_ADDRESS" },
> +		{ CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC, false, "CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC" },
> +		{ BACKEND_SE_GC_SHADER_EXEC_SUCCESS, true, "BACKEND_SE_GC_SHADER_EXEC_SUCCESS" },
> +		{ BACKEND_SE_GC_SHADER_INVALID_SHADER, true, "BACKEND_SE_GC_SHADER_INVALID_SHADER" },
> +		{ BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR, true, "BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR" },
> +		{ BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING, true, "BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING" },
> +		{ BACKEND_SE_GC_SHADER_INVALID_USER_DATA, true, "BACKEND_SE_GC_SHADER_INVALID_USER_DATA" }
> +	};
> +	unsigned int idx;
> +
> +	for (idx = 0; idx < ARRAY_SIZE(known_errors); idx++) {
> +		if (known_errors[idx].err != err)
> +			continue;
> +
> +		*is_dispatch = known_errors[idx].is_shader_err;
> +		strcpy(error_str, known_errors[idx].err_str);
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +
> +/*
> + * Translate the IP block id @ip into its symbolic name.
> + *
> + * On a match the name is copied into @ip_str and true is returned; for an
> + * id that is not in the table false is returned and @ip_str is untouched.
> + */
> +static bool
> +get_ip_type(unsigned int ip, char ip_str[64])
> +{
> +	static const struct ip_struct {
> +		enum amd_ip_block_type ip;
> +		const char *ip_str;
> +	} known_ips[] = {
> +		{ AMD_IP_GFX, "AMD_IP_GFX" },
> +		{ AMD_IP_COMPUTE, "AMD_IP_COMPUTE" },
> +		{ AMD_IP_DMA, "AMD_IP_DMA" },
> +		{ AMD_IP_UVD, "AMD_IP_UVD" },
> +		{ AMD_IP_VCE, "AMD_IP_VCE" },
> +		{ AMD_IP_UVD_ENC, "AMD_IP_UVD_ENC" },
> +		{ AMD_IP_VCN_DEC, "AMD_IP_VCN_DEC" },
> +		{ AMD_IP_VCN_ENC, "AMD_IP_VCN_ENC" },
> +		{ AMD_IP_VCN_JPEG, "AMD_IP_VCN_JPEG" },
> +		{ AMD_IP_VPE, "AMD_IP_VPE" }
> +	};
> +	unsigned int idx;
> +
> +	for (idx = 0; idx < ARRAY_SIZE(known_ips); idx++) {
> +		if (known_ips[idx].ip != ip)
> +			continue;
> +
> +		strcpy(ip_str, known_ips[idx].ip_str);
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +/*
> + * Copy the next job out of shared memory under sem_state_mutex:
> + * the good job when @is_good is true, the bad job otherwise.
> + * Always returns 0.
> + */
> +static int
> +read_next_job(struct shmbuf *sh_mem, struct job_struct *job, bool is_good)
> +{
> +	const struct job_struct *wanted;
> +
> +	sem_wait(&sh_mem->sem_state_mutex);
> +	wanted = is_good ? &sh_mem->good_job : &sh_mem->bad_job;
> +	*job = *wanted;
> +	sem_post(&sh_mem->sem_state_mutex);
> +
> +	return 0;
> +}
> +
> +/*
> + * Poll once per second until both the test state and the reset state are
> + * reported as completed. The reset state is only queried once the test
> + * state is already completed.
> + */
> +static void wait_for_complete_iteration(struct shmbuf *sh_mem)
> +{
> +	unsigned int test_flags;
> +	unsigned int reset_flags;
> +	int test_rc;
> +
> +	while (!(get_test_state(sh_mem, &test_rc, &test_flags) &&
> +		 get_reset_state(sh_mem, &reset_flags)))
> +		sleep(1);
> +}
> +
> +/*
> + * Publish the next pair of jobs in shared memory and run one iteration.
> + *
> + * The bad job gets @error, @ip_bad and @ring_id_bad; the good job always
> + * gets CMD_STREAM_EXEC_SUCCESS together with @ip_good and @ring_id_good.
> + * Afterwards the caller joins the enter sync point, waits until test and
> + * reset are both reported completed and joins the exit sync point.
> + */
> +static void set_next_test_to_run(struct shmbuf *sh_mem, unsigned int error,
> + enum amd_ip_block_type ip_good, enum amd_ip_block_type ip_bad,
> + unsigned int ring_id_good, unsigned int ring_id_bad)
> +{
> + char error_str[128];
> + char ip_good_str[64];
> + char ip_bad_str[64];
> +
> + bool is_dispatch;
> +
> + /* NOTE(review): the looked-up strings and is_dispatch are not used below, and the return values are ignored */
> + is_dispatch_shader_test(error, error_str, &is_dispatch);
> + get_ip_type(ip_good, ip_good_str);
> + get_ip_type(ip_bad, ip_bad_str);
> +
> + //set jobs
> + sem_wait(&sh_mem->sem_state_mutex);
> + sh_mem->bad_job.error = error;
> + sh_mem->bad_job.ip = ip_bad;
> + sh_mem->bad_job.ring_id = ring_id_bad;
> + sh_mem->good_job.error = CMD_STREAM_EXEC_SUCCESS;
> + sh_mem->good_job.ip = ip_good;
> + sh_mem->good_job.ring_id = ring_id_good;
> + sem_post(&sh_mem->sem_state_mutex);
> +
> + //sync and wait for complete
> + sync_point_enter(sh_mem);
> + wait_for_complete_iteration(sh_mem);
> + sync_point_exit(sh_mem);
> +}
> +
> +/*
> + * Tear down the shared memory object.
> + *
> + * @shmp:   mapping to release, may be NULL
> + * @shm_fd: descriptor of the shared memory object, negative when not open
> + * @unmap:  true when @shmp is a valid mapping that must be released
> + *
> + * The semaphores live inside the mapping, so they are destroyed before
> + * munmap(); touching them afterwards would access unmapped memory.
> + * The object name is always unlinked. Always returns 0.
> + */
> +static int
> +shared_mem_destroy(struct shmbuf *shmp, int shm_fd, bool unmap)
> +{
> +	int ret = 0;
> +
> +	if (shmp && unmap) {
> +		sem_destroy(&shmp->sem_mutex);
> +		sem_destroy(&shmp->sem_state_mutex);
> +		sem_destroy(&shmp->sync_sem_enter);
> +		sem_destroy(&shmp->sync_sem_exit);
> +		munmap(shmp, sizeof(struct shmbuf));
> +	}
> +	/* 0 is a valid descriptor, only negative values mean "not open" */
> +	if (shm_fd >= 0)
> +		close(shm_fd);
> +
> +	shm_unlink(SHARED_MEM_NAME);
> +
> +	return ret;
> +}
> +
> +/*
> + * Create, size, map and initialize the shared memory object.
> + *
> + * @ppbuf: receives the mapping on success; untouched on failure
> + *
> + * Returns the descriptor of the shared memory object, or -1 on failure.
> + * On failure everything acquired so far is released, so the descriptor
> + * must not be handed back to the caller: it is already closed.
> + */
> +static int
> +shared_mem_create(struct shmbuf **ppbuf)
> +{
> +	int shm_fd;
> +	struct shmbuf *shmp = NULL;
> +	bool unmap = false;
> +
> +	// Create a shared memory object
> +	shm_fd = shm_open(SHARED_MEM_NAME, O_CREAT | O_RDWR, 0666);
> +	if (shm_fd == -1)
> +		goto error;
> +
> +	// Configure the size of the shared memory object
> +	if (ftruncate(shm_fd, sizeof(struct shmbuf)) == -1)
> +		goto error;
> +
> +	// Map the shared memory object; the state is read as well as written
> +	shmp = mmap(NULL, sizeof(struct shmbuf), PROT_READ | PROT_WRITE,
> +		    MAP_SHARED, shm_fd, 0);
> +	if (shmp == MAP_FAILED) {
> +		shmp = NULL;
> +		goto error;
> +	}
> +	unmap = true;
> +
> +	/*
> +	 * Without O_EXCL an object left behind by an earlier run may be
> +	 * reopened, so start from a known all-zero state (count, flags,
> +	 * error code, completion markers and job descriptors).
> +	 */
> +	memset(shmp, 0, sizeof(*shmp));
> +
> +	if (sem_init(&shmp->sem_mutex, 1, 1) == -1)
> +		goto error;
> +
> +	if (sem_init(&shmp->sem_state_mutex, 1, 1) == -1)
> +		goto error;
> +
> +	if (sem_init(&shmp->sync_sem_enter, 1, 0) == -1)
> +		goto error;
> +
> +	if (sem_init(&shmp->sync_sem_exit, 1, 0) == -1)
> +		goto error;
> +
> +	*ppbuf = shmp;
> +	return shm_fd;
> +
> +error:
> +	shared_mem_destroy(shmp, shm_fd, unmap);
> +	return -1;
> +}
> +
> +/*
> + * Map the shared memory object through the descriptor number
> + * SHARED_CHILD_DESCRIPTOR.
> + *
> + * On success the mapping is stored in @ppbuf and SHARED_CHILD_DESCRIPTOR is
> + * returned; when mmap() fails -1 is returned and @ppbuf is left untouched.
> + */
> +static int
> +shared_mem_open(struct shmbuf **ppbuf)
> +{
> +	struct shmbuf *mapping;
> +
> +	mapping = mmap(NULL, sizeof(*mapping), PROT_READ | PROT_WRITE, MAP_SHARED,
> +		       SHARED_CHILD_DESCRIPTOR, 0);
> +	if (mapping == MAP_FAILED)
> +		return -1;
> +
> +	*ppbuf = mapping;
> +	return SHARED_CHILD_DESCRIPTOR;
> +}
> +
> +/*
> + * Report whether the queue reset tests should run on this GPU.
> + * Placeholder: @gpu_info is not inspected yet and true is always returned.
> + */
> +static bool
> +is_queue_reset_tests_enable(const struct amdgpu_gpu_info *gpu_info)
> +{
> +	// TO DO
> +	return true;
> +}
> +
> +/*
> + * Build and submit one linear-write command stream described by @job.
> + *
> + * @device:         device the buffer object is allocated on
> + * @context_handle: context used for the submission
> + * @ip_block:       IP block whose bad_write_linear() callback builds the packet
> + * @job:            error kind, IP type and ring id for this submission
> + *
> + * Returns the value reported by amdgpu_test_exec_cs_helper(). Allocation
> + * failures and a failing buffer allocation abort via igt_assert.
> + */
> +static int
> +amdgpu_write_linear(amdgpu_device_handle device, amdgpu_context_handle context_handle,
> + const struct amdgpu_ip_block_version *ip_block,
> + struct job_struct *job)
> +{
> + const int pm4_dw = 256;
> + struct amdgpu_ring_context *ring_context;
> + int write_length, expect_failure;
> + int r;
> +
> + ring_context = calloc(1, sizeof(*ring_context));
> + igt_assert(ring_context);
> +
> + /* The firmware triggers a badop interrupt to prevent CP/ME from hanging.
> + * And it needs to be VIMID reset when receiving the interrupt.
> + * But for a long badop packet, fw still hangs, which is a fw bug.
> + * So please use a smaller size packet for temporary testing.
> + */
> + if ((job->ip == AMD_IP_GFX) && (job->error == CMD_STREAM_EXEC_INVALID_OPCODE)) {
> + write_length = 10;
> + expect_failure = 0;
> + } else {
> + write_length = 128;
> + /* every error kind other than success is expected to fail */
> + expect_failure = job->error == CMD_STREAM_EXEC_SUCCESS ? 0 : 1;
> + }
> + /* setup parameters */
> + ring_context->write_length = write_length;
> + ring_context->pm4 = calloc(pm4_dw, sizeof(*ring_context->pm4));
> + ring_context->pm4_size = pm4_dw;
> + ring_context->res_cnt = 1;
> + ring_context->ring_id = job->ring_id;
> + igt_assert(ring_context->pm4);
> + ring_context->context_handle = context_handle;
> + r = amdgpu_bo_alloc_and_map(device,
> + ring_context->write_length * sizeof(uint32_t),
> + 4096, AMDGPU_GEM_DOMAIN_GTT,
> + AMDGPU_GEM_CREATE_CPU_GTT_USWC, &ring_context->bo,
> + (void **)&ring_context->bo_cpu,
> + &ring_context->bo_mc,
> + &ring_context->va_handle);
> + igt_assert_eq(r, 0);
> + memset((void *)ring_context->bo_cpu, 0, ring_context->write_length * sizeof(uint32_t));
> + ring_context->resources[0] = ring_context->bo;
> + ip_block->funcs->bad_write_linear(ip_block->funcs, ring_context,
> + &ring_context->pm4_dw, job->error);
> +
> + r = amdgpu_test_exec_cs_helper(device, ip_block->type, ring_context,
> + expect_failure);
> +
> + /* release the buffer and the packet storage before reporting the result */
> + amdgpu_bo_unmap_and_free(ring_context->bo, ring_context->va_handle,
> + ring_context->bo_mc, ring_context->write_length * sizeof(uint32_t));
> + free(ring_context->pm4);
> + free(ring_context);
> + return r;
> +}
> +
> +/*
> + * Monitoring loop: for each of @num_of_tests iterations, poll the reset
> + * state of the context used by that iteration and publish the outcome
> + * through set_reset_state().
> + *
> + * state_machine values used below:
> + *   0 - waiting for a reset flag to show up on the context
> + *   1 - AMDGPU_CTX_QUERY2_FLAGS_RESET seen, waiting for the test's return code
> + *   2 - waiting for AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS to go away
> + *
> + * Returns the result of the initial amdgpu_cs_query_reset_state2() call;
> + * the results of the later queries are not checked.
> + */
> +static int
> +run_monitor_child(amdgpu_device_handle device, amdgpu_context_handle *arr_context,
> + struct shmbuf *sh_mem, int num_of_tests)
> +{
> + int ret;
> + int test_counter = 0;
> + uint64_t init_flags, in_process_flags;
> + uint32_t after_reset_state, after_reset_hangs;
> + int state_machine = 0;
> + int error_code;
> + unsigned int flags;
> +
> + after_reset_state = after_reset_hangs = 0;
> + init_flags = in_process_flags = 0;
> +
> + /* no reset may be in progress before the first test starts */
> + ret = amdgpu_cs_query_reset_state2(arr_context[0], &init_flags);
> + if (init_flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS)
> + igt_assert_eq(init_flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS, 0);
> +
> + while (num_of_tests > 0) {
> + sync_point_enter(sh_mem);
> + state_machine = 0;
> + error_code = 0;
> + flags = 0;
> + set_reset_state(sh_mem, false, ALL_RESET_BITS);
> + /* NOTE(review): this inner loop polls without sleeping */
> + while (1) {
> + if (state_machine == 0) {
> + amdgpu_cs_query_reset_state2(arr_context[test_counter], &init_flags);
> +
> + if (init_flags & AMDGPU_CTX_QUERY2_FLAGS_RESET)
> + state_machine = 1;
> +
> + /* an in-progress reset takes precedence over the plain reset flag */
> + if (init_flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS)
> + state_machine = 2;
> +
> + } else if (state_machine == 1) {
> + amdgpu_cs_query_reset_state(arr_context[test_counter],
> + &after_reset_state, &after_reset_hangs);
> + amdgpu_cs_query_reset_state2(arr_context[test_counter],
> + &in_process_flags);
> +
> + //TODO refactor this block !
> + igt_assert_eq(in_process_flags & AMDGPU_CTX_QUERY2_FLAGS_RESET, 1);
> + if (get_test_state(sh_mem, &error_code, &flags) &&
> + test_bit(ERROR_CODE_SET_BIT, &flags)) {
> + if (error_code == -ENODATA) {
> + set_reset_state(sh_mem, true, QUEUE_RESET_SET_BIT);
> + break;
> + } else {
> + /*
> + * NOTE(review): the first comparison is redundant, the condition
> + * is equivalent to error_code == -ETIME. Confirm whether
> + * "neither -ECANCELED nor -ETIME" was intended.
> + */
> + if (error_code != -ECANCELED && error_code == -ETIME) {
> + set_reset_state(sh_mem, true, GPU_RESET_END_FAILURE_SET_BIT);
> + break;
> + } else {
> + set_reset_state(sh_mem, true, GPU_RESET_BEGIN_SET_BIT);
> + state_machine = 2; //gpu reset stage
> + }
> + }
> + }
> + } else if (state_machine == 2) {
> + amdgpu_cs_query_reset_state(arr_context[test_counter],
> + &after_reset_state, &after_reset_hangs);
> + amdgpu_cs_query_reset_state2(arr_context[test_counter],
> + &in_process_flags);
> + /* here we should start timer and wait for some time until
> + * the flag AMDGPU_CTX_QUERY2_FLAGS_RESET disappear
> + */
> + if (!(in_process_flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS)) {
> + set_reset_state(sh_mem, true, GPU_RESET_END_SUCCESS_SET_BIT);
> + break;
> + }
> + }
> + }
> + sync_point_exit(sh_mem);
> + num_of_tests--;
> + test_counter++;
> + }
> + return ret;
> +}
> +
> +
> +
> +/*
> + * Test loop: for each of @num_of_tests iterations read the bad job from
> + * shared memory, submit it (dispatch test for shader errors, linear write
> + * otherwise), publish the return code and wait until the monitoring
> + * process reports the reset as completed.
> + *
> + * @arr_context: one context per iteration, indexed by the iteration number
> + * @version:     forwarded to amdgpu_memcpy_dispatch_test()
> + *
> + * Returns the return code of the last submitted job, or 0 when
> + * @num_of_tests is not positive (the value used to be indeterminate in
> + * that case).
> + */
> +static int
> +run_test_child(amdgpu_device_handle device, amdgpu_context_handle *arr_context,
> +		struct shmbuf *sh_mem, int num_of_tests, uint32_t version)
> +{
> +	int ret = 0;
> +	bool bool_ret;
> +	int test_counter = 0;
> +	char error_str[128];
> +	bool is_dispatch = false;
> +	unsigned int reset_flags;
> +
> +	struct job_struct job;
> +	const struct amdgpu_ip_block_version *ip_block_test = NULL;
> +
> +	while (num_of_tests > 0) {
> +		sync_point_enter(sh_mem);
> +		set_test_state(sh_mem, false, 0, ERROR_CODE_SET_BIT);
> +		read_next_job(sh_mem, &job, false);
> +		bool_ret = is_dispatch_shader_test(job.error, error_str, &is_dispatch);
> +		igt_assert_eq(bool_ret, 1);
> +		ip_block_test = get_ip_block(device, job.ip);
> +		if (is_dispatch) {
> +			ret = amdgpu_memcpy_dispatch_test(device, job.ip, job.ring_id, version,
> +					job.error);
> +		} else {
> +			ret = amdgpu_write_linear(device, arr_context[test_counter],
> +					ip_block_test, &job);
> +		}
> +
> +		num_of_tests--;
> +		/* hand the return code over to the monitoring process */
> +		set_test_state(sh_mem, true, ret, ERROR_CODE_SET_BIT);
> +		while (1) {
> +			/*we may have GPU reset vs queue reset */
> +			if (get_reset_state(sh_mem, &reset_flags))
> +				break;
> +			sleep(1);
> +		}
> +		sync_point_exit(sh_mem);
> +		test_counter++;
> +	}
> +	return ret;
> +}
> +
> +/*
> + * Background loop: for each of @num_of_tests iterations read the good job
> + * from shared memory and keep submitting it on a private context until
> + * both the test state and the reset state are reported completed.
> + *
> + * Returns the result of amdgpu_cs_ctx_free() for the private context.
> + */
> +static int
> +run_background(amdgpu_device_handle device, struct shmbuf *sh_mem,
> + int num_of_tests)
> +{
> +#define NUM_ITERATION 10000
> + char error_str[128];
> + bool is_dispatch = false;
> + unsigned int reset_flags;
> +
> + int r, counter = 0;
> + amdgpu_context_handle context_handle = NULL;
> + struct job_struct job;
> + const struct amdgpu_ip_block_version *ip_block_test = NULL;
> + int error_code;
> + unsigned int flags;
> +
> + r = amdgpu_cs_ctx_create(device, &context_handle);
> + igt_assert_eq(r, 0);
> +
> +
> + while (num_of_tests > 0) {
> + sync_point_enter(sh_mem);
> + read_next_job(sh_mem, &job, true);
> + ip_block_test = get_ip_block(device, job.ip);
> + /* NOTE(review): error_str and is_dispatch are not used afterwards */
> + is_dispatch_shader_test(job.error, error_str, &is_dispatch);
> + while (1) {
> + r = amdgpu_write_linear(device, context_handle, ip_block_test, &job);
> + if (get_test_state(sh_mem, &error_code, &flags) &&
> + get_reset_state(sh_mem, &reset_flags)) {
> + //if entire gpu reset then stop back ground jobs
> + break;
> + }
> + /* return codes that can be caused by a reset are tolerated, anything else must be 0 */
> + if (r != -ECANCELED && r != -ETIME && r != -ENODATA)
> + igt_assert_eq(r, 0);
> + /*
> + * TODO: there is an issue during GPU reset. The return code assert is placed
> + * after the check that the test is completed, otherwise the job fails due to
> + * amdgpu_job_run Skip job if VRAM is lost
> + * if (job->generation != amdgpu_vm_generation(adev, job->vm)
> + */
> + counter++;
> +
> + }
> + sync_point_exit(sh_mem);
> + num_of_tests--;
> + }
> + r = amdgpu_cs_ctx_free(context_handle);
> + return r;
> +}
> +
> +
> +
> +
> +/*
> + * Start the work that belongs to the role of the current process.
> + *
> + * PROCESS_TEST: forks the monitoring child and the test child; each child
> + * runs its loop and leaves through igt_exit(), the parent returns 0 with
> + * the child pids stored in @monitor_child and @test_child.
> + * PROCESS_BACKGROUND: runs the background loop in this process and leaves
> + * through igt_exit().
> + * Any other role: nothing is started and 0 is returned.
> + */
> +static int
> +run_all(amdgpu_device_handle device, amdgpu_context_handle *arr_context_handle,
> + enum process_type process, struct shmbuf *sh_mem, int num_of_tests,
> + uint32_t version, pid_t *monitor_child, pid_t *test_child)
> +{
> + if (process == PROCESS_TEST) {
> + *monitor_child = fork();
> + if (*monitor_child == -1) {
> + igt_fail(IGT_EXIT_FAILURE);
> + } else if (*monitor_child == 0) {
> + /* in the child: overwrite the 0 returned by fork() with the parent's pid */
> + *monitor_child = getppid();
> + run_monitor_child(device, arr_context_handle, sh_mem, num_of_tests);
> + igt_success();
> + igt_exit();
> + }
> + *test_child = fork();
> + if (*test_child == -1) {
> + igt_fail(IGT_EXIT_FAILURE);
> + } else if (*test_child == 0) {
> + /* in the child: overwrite the 0 returned by fork() with the parent's pid */
> + *test_child = getppid();
> + run_test_child(device, arr_context_handle, sh_mem, num_of_tests, version);
> + igt_success();
> + igt_exit();
> +
> + }
> + } else if (process == PROCESS_BACKGROUND) {
> + run_background(device, sh_mem, num_of_tests);
> + igt_success();
> + igt_exit();
> + }
> + return 0;
> +}
> +
> +/*
> + * Rebuild argc/argv of the current process from /proc/self/cmdline.
> + *
> + * @cmdline: caller-provided 2048 byte scratch buffer receiving the raw data
> + * @pargc:   receives the number of arguments found
> + * @pppargv: receives a heap-allocated argument vector with two spare,
> + *           zeroed slots at the end (one for an extra parameter, one for
> + *           the terminating NULL)
> + * @ppath:   receives a heap-allocated copy of argv[0]
> + *
> + * Returns true on success. Returns false when the file cannot be read,
> + * when no argument or more than 20 arguments are found, or when an
> + * allocation fails; @pppargv and @ppath are untouched in that case.
> + * The results are released with free_command_line().
> + *
> + * The arguments are walked as NUL-terminated strings up to the number of
> + * bytes actually read, so the walk never leaves the data returned by
> + * read() no matter how many arguments there are.
> + */
> +static bool
> +get_command_line(char cmdline[2048], int *pargc, char ***pppargv, char **ppath)
> +{
> +	enum { CMDLINE_SIZE = 2048, MAX_ARGS = 20 };
> +	const char *cursor, *end;
> +	char **argv = NULL;
> +	char *path = NULL;
> +	ssize_t num_read;
> +	int i, argc = 0;
> +	int fd;
> +
> +	fd = open("/proc/self/cmdline", O_RDONLY);
> +	if (fd == -1) {
> +		igt_info("**** Error opening /proc/self/cmdline");
> +		return false;
> +	}
> +
> +	num_read = read(fd, cmdline, CMDLINE_SIZE - 1);
> +	close(fd);
> +
> +	if (num_read == -1) {
> +		igt_info("Error reading /proc/self/cmdline");
> +		return false;
> +	}
> +	/* guarantees that the last argument is terminated even if the read was cut short */
> +	cmdline[num_read] = '\0';
> +	end = cmdline + num_read;
> +
> +	/* first pass: count the NUL-separated arguments */
> +	for (cursor = cmdline; cursor < end; cursor += strlen(cursor) + 1)
> +		argc++;
> +
> +	*pargc = argc;
> +	if (argc == 0 || argc > MAX_ARGS) {
> +		/* not support yet fancy things */
> +		return false;
> +	}
> +
> +	/* always do 2 extra for additional parameter */
> +	argv = calloc(argc + 2, sizeof(*argv));
> +	if (!argv)
> +		return false;
> +
> +	/* second pass: duplicate every argument */
> +	cursor = cmdline;
> +	for (i = 0; i < argc; i++) {
> +		argv[i] = strdup(cursor);
> +		if (!argv[i])
> +			goto fail;
> +		cursor += strlen(cursor) + 1;
> +	}
> +
> +	path = strdup(argv[0]);
> +	if (!path)
> +		goto fail;
> +
> +	*pppargv = argv;
> +	*ppath = path;
> +
> +	return true;
> +
> +fail:
> +	for (i = 0; i < argc; i++)
> +		free(argv[i]);
> +	free(argv);
> +	return false;
> +}
> +
> +#define BACKGROUND "background"
> +
> +/* Return true when one of argv[1..argc-1] is exactly the BACKGROUND string. */
> +static bool
> +is_background_parameter_found(int argc, char **argv)
> +{
> +	int idx = 1;
> +
> +	while (idx < argc) {
> +		if (!strcmp(BACKGROUND, argv[idx]))
> +			return true;
> +		idx++;
> +	}
> +
> +	return false;
> +}
> +
> +#define RUNSUBTEST "--run-subtest"
> +/* Return true when one of argv[1..argc-1] is exactly the RUNSUBTEST string. */
> +static bool
> +is_run_subtest_parameter_found(int argc, char **argv)
> +{
> +	int idx = 1;
> +
> +	while (idx < argc) {
> +		if (!strcmp(RUNSUBTEST, argv[idx]))
> +			return true;
> +		idx++;
> +	}
> +
> +	return false;
> +}
> +
> +/*
> + * Append a heap-allocated copy of the BACKGROUND string at argv[*pargc]
> + * and increment *pargc. The caller must have room for the extra slot.
> + * Always returns true.
> + */
> +static bool
> +add_background_parameter(int *pargc, char **argv)
> +{
> +	const int slot = *pargc;
> +	const int name_len = strlen(BACKGROUND);
> +	char *copy;
> +
> +	copy = (char *)malloc(sizeof(char) * name_len + 1);
> +	memcpy(copy, BACKGROUND, name_len);
> +	copy[name_len] = 0;
> +
> +	argv[slot] = copy;
> +	*pargc = slot + 1;
> +	return true;
> +}
> +
> +/*
> + * Release what get_command_line() allocated: the entries argv[0..argc]
> + * (the entry at index argc is included), the vector itself and @path.
> + */
> +static void
> +free_command_line(int argc, char **argv, char *path)
> +{
> +	int idx;
> +
> +	for (idx = argc; idx >= 0; idx--)
> +		free(argv[idx]);
> +
> +	free(path);
> +	free(argv);
> +}
> +
> +/*
> + * Spawn @path with @argv as the background process, with @shm_fd duplicated
> + * onto descriptor SHARED_CHILD_DESCRIPTOR in the child.
> + *
> + * Nothing is spawned (and 0 is returned, @ppid untouched) when any argument
> + * contains "list-subtests". Otherwise the posix_spawn() status is returned;
> + * a non-zero status fails the test via igt_fail().
> + */
> +static int
> +launch_background_process(int argc, char **argv, char *path, pid_t *ppid, int shm_fd)
> +{
> +	posix_spawn_file_actions_t file_actions;
> +	int spawn_rc;
> +	int idx;
> +
> +	/* The background process only runs when a queue reset is actually triggered. */
> +	for (idx = 0; idx < argc; idx++) {
> +		if (strstr(argv[idx], "list-subtests"))
> +			return 0;
> +	}
> +
> +	posix_spawn_file_actions_init(&file_actions);
> +	posix_spawn_file_actions_adddup2(&file_actions, shm_fd, SHARED_CHILD_DESCRIPTOR);
> +	spawn_rc = posix_spawn(ppid, path, &file_actions, NULL, argv, NULL);
> +	posix_spawn_file_actions_destroy(&file_actions);
> +
> +	if (spawn_rc != 0)
> +		igt_fail(IGT_EXIT_FAILURE);
> +
> +	return spawn_rc;
> +}
> +
> +/*
> + * Allocate an array of @num_of_contexts context handles and create one
> + * context on @device for each entry.
> + *
> + * @pp_contexts: receives the array; released with free_contexts()
> + *
> + * A failed allocation or context creation aborts via igt_assert instead of
> + * dereferencing a NULL array.
> + */
> +static void
> +create_contexts(amdgpu_device_handle device, amdgpu_context_handle **pp_contexts,
> +		int num_of_contexts)
> +{
> +	amdgpu_context_handle *p_contexts;
> +	int i, r;
> +
> +	p_contexts = calloc(num_of_contexts, sizeof(*p_contexts));
> +	igt_assert(p_contexts);
> +
> +	for (i = 0; i < num_of_contexts; i++) {
> +		r = amdgpu_cs_ctx_create(device, &p_contexts[i]);
> +		igt_assert_eq(r, 0);
> +	}
> +	*pp_contexts = p_contexts;
> +}
> +/*
> + * Free the @num_of_contexts contexts stored in @p_contexts and the array
> + * itself (it is heap-allocated by create_contexts() and was leaked before).
> + * A NULL @p_contexts is accepted. The caller must not use the array
> + * afterwards.
> + */
> +static void
> +free_contexts(amdgpu_device_handle device, amdgpu_context_handle *p_contexts,
> +		int num_of_contexts)
> +{
> +	int i;
> +
> +	if (!p_contexts)
> +		return;
> +
> +	for (i = 0; i < num_of_contexts; i++)
> +		amdgpu_cs_ctx_free(p_contexts[i]);
> +
> +	free(p_contexts);
> +}
> +
> +/* TODO add logic to iterate for all */
> +/*
> + * Find two adjacent available rings, starting the search at @ring_begin.
> + *
> + * @available_rings: bit mask, bit N set means ring N is available
> + * @next_ring:       receives the first ring of the pair
> + * @next_next_ring:  receives the ring following @next_ring
> + *
> + * Returns true when a pair was found. The search stops at the first
> + * unavailable ring; the outputs are only written on success.
> + *
> + * Availability is tested with (1 << ring). ANDing the ring index itself
> + * with the mask would check unrelated bits.
> + */
> +static bool
> +get_next_rings(unsigned int ring_begin, unsigned int available_rings,
> +		unsigned int *next_ring, unsigned int *next_next_ring)
> +{
> +	const unsigned int mask_bits = sizeof(available_rings) * 8;
> +	unsigned int ring_id;
> +
> +	/* ring_id + 1 must stay a valid shift count for the mask */
> +	for (ring_id = ring_begin;
> +	     ring_id + 1 < mask_bits && ((1U << ring_id) & available_rings);
> +	     ring_id++) {
> +		if ((1U << (ring_id + 1)) & available_rings) {
> +			*next_ring = ring_id;
> +			*next_next_ring = ring_id + 1;
> +			return true;
> +		}
> +	}
> +	return false;
> +}
> +igt_main
> +{
> + char cmdline[2048];
> + int argc = 0;
> + char **argv = NULL;
> + char *path = NULL;
> + enum process_type process = PROCESS_UNKNOWN;
> + pid_t pid_background;
> + pid_t monitor_child, test_child;
> + int testExitMethod, monitorExitMethod, backgrounExitMethod;
> + posix_spawn_file_actions_t action;
> + amdgpu_device_handle device;
> + struct amdgpu_gpu_info gpu_info = {0};
> + struct drm_amdgpu_info_hw_ip info = {0};
> + int fd = -1;
> + int fd_shm = -1;
> + struct shmbuf *sh_mem = NULL;
> +
> + int r;
> + bool arr_cap[AMD_IP_MAX] = {0};
> + unsigned int ring_id_good = 0;
> + unsigned int ring_id_bad = 1;
> +
> + enum amd_ip_block_type ip_test = AMD_IP_COMPUTE;
> + enum amd_ip_block_type ip_background = AMD_IP_COMPUTE;
> +
> + amdgpu_context_handle *arr_context_handle = NULL;
> +
> + /* TODO remove this , it is used only to create array of contexts
> + * which are shared between child processes ( test/monitor/main and
> + * separate for background
> + */
> + unsigned int arr_err[] = {
> + CMD_STREAM_EXEC_INVALID_PACKET_LENGTH,
> + CMD_STREAM_EXEC_INVALID_OPCODE,
> + CMD_STREAM_TRANS_BAD_MEM_ADDRESS,
> + //CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC,TODO not job timeout, debug why for n31
> + //CMD_STREAM_TRANS_BAD_REG_ADDRESS, TODO amdgpu: device lost from bus! for n31
> + BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR,
> + BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING,
> + BACKEND_SE_GC_SHADER_INVALID_USER_DATA
> + };
> +
> + int const_num_of_tests;
> +
> + posix_spawn_file_actions_init(&action);
> +
> + if (!get_command_line(cmdline, &argc, &argv, &path))
> + igt_fail(IGT_EXIT_FAILURE);
> +
> + if (is_run_subtest_parameter_found(argc, argv))
> + const_num_of_tests = 1;
> + else
> + const_num_of_tests = ARRAY_SIZE(arr_err);
> +
> + if (!is_background_parameter_found(argc, argv)) {
> + add_background_parameter(&argc, argv);
> + fd_shm = shared_mem_create(&sh_mem);
> + igt_require(fd_shm != -1);
> + launch_background_process(argc, argv, path, &pid_background, fd_shm);
> + process = PROCESS_TEST;
> + } else {
> + process = PROCESS_BACKGROUND;
> + }
> +
> + igt_fixture {
> + uint32_t major, minor;
> + int err;
> +
> + fd = drm_open_driver(DRIVER_AMDGPU);
> +
> + err = amdgpu_device_initialize(fd, &major, &minor, &device);
> + igt_require(err == 0);
> +
> + igt_info("Initialized amdgpu, driver version %d.%d\n",
> + major, minor);
> +
> + r = amdgpu_query_gpu_info(device, &gpu_info);
> + igt_assert_eq(r, 0);
> + r = amdgpu_query_hw_ip_info(device, ip_test, 0, &info);
> + igt_assert_eq(r, 0);
> + r = setup_amdgpu_ip_blocks(major, minor, &gpu_info, device);
> + igt_assert_eq(r, 0);
> +
> + asic_rings_readness(device, 1, arr_cap);
> + igt_skip_on(!is_queue_reset_tests_enable(&gpu_info));
> + if (process == PROCESS_TEST)
> + create_contexts(device, &arr_context_handle, const_num_of_tests);
> + else if (process == PROCESS_BACKGROUND)
> + fd_shm = shared_mem_open(&sh_mem);
> +
> + igt_require(fd_shm != -1);
> + igt_require(sh_mem != NULL);
> +
> + run_all(device, arr_context_handle,
> + process, sh_mem, const_num_of_tests, info.hw_ip_version_major,
> + &monitor_child, &test_child);
> + }
> +
> + igt_describe("Stressful-and-multiple-cs-of-bad and good length-operations-using-multiple-processes");
> + igt_subtest_with_dynamic("amdgpu-compute-CMD_STREAM_EXEC_INVALID_PACKET_LENGTH") {
> + if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> + igt_dynamic_f("amdgpu-compute-CMD_STREAM_EXEC_INVALID_PACKET_LENGTH")
Why are the subtest name and the dynamic sub-subtest name identical?
igt_subtest_with_dynamic("amdgpu-compute-CMD_STREAM_EXEC_INVALID_PACKET_LENGTH") {
igt_dynamic_f-----------("amdgpu-compute-CMD_STREAM_EXEC_INVALID_PACKET_LENGTH")
Dynamic names should be created on the fly. For example,
you could name it A-B, where
A = name of the good ring,
B = name of the bad ring,
or generate it in some other dynamic way; in any case it should be _different_ from the base name.
Regards,
Kamil
> + set_next_test_to_run(sh_mem, CMD_STREAM_EXEC_INVALID_PACKET_LENGTH,
> + ip_background, ip_test, ring_id_good, ring_id_bad);
> + }
> + }
> +
> + igt_describe("Stressful-and-multiple-cs-of-bad and good opcode-operations-using-multiple-processes");
> + igt_subtest_with_dynamic("amdgpu-compute-CMD_STREAM_EXEC_INVALID_OPCODE") {
> + if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> + igt_dynamic_f("amdgpu-compute-CMD_STREAM_EXEC_INVALID_OPCODE")
> + set_next_test_to_run(sh_mem, CMD_STREAM_EXEC_INVALID_OPCODE,
> + ip_background, ip_test, ring_id_good, ring_id_bad);
> + }
> + }
> +
> + igt_describe("Stressful-and-multiple-cs-of-bad and good mem-operations-using-multiple-processes");
> + igt_subtest_with_dynamic("amdgpu-compute-CMD_STREAM_TRANS_BAD_MEM_ADDRESS") {
> + if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> + igt_dynamic_f("amdgpu-compute-CMD_STREAM_TRANS_BAD_MEM_ADDRESS")
> + set_next_test_to_run(sh_mem, CMD_STREAM_TRANS_BAD_MEM_ADDRESS,
> + ip_background, ip_test, ring_id_good, ring_id_bad);
> + }
> + }
> + /* TODO not job timeout, debug why for nv32
> + *igt_describe("Stressful-and-multiple-cs-of-bad and good mem-sync-operations-using-multiple-processes");
> + *igt_subtest_with_dynamic("amdgpu-compute-CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC") {
> + * if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> + * igt_dynamic_f("amdgpu-compute-CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC")
> + * set_next_test_to_run(sh_mem, CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC,
> + * ip_background, ip_test, ring_id_good, ring_id_bad);
> + * }
> + */
> +
> + /* TODO amdgpu: device lost from bus! for nv32
> + *igt_describe("Stressful-and-multiple-cs-of-bad and good reg-operations-using-multiple-processes");
> + *igt_subtest_with_dynamic("amdgpu-compute-CMD_STREAM_TRANS_BAD_REG_ADDRESS") {
> + * if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> + * igt_dynamic_f("amdgpu-compute-CMD_STREAM_TRANS_BAD_MEM_ADDRESS_BY_SYNC")
> + * set_next_test_to_run(sh_mem, CMD_STREAM_TRANS_BAD_REG_ADDRESS,
> + * ip_background, ip_test, ring_id_good, ring_id_bad);
> + * }
> + */
> +
> + igt_describe("Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes");
> + igt_subtest_with_dynamic("Handful-by-soft-recovery-amdgpu-compute-BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR") {
> + if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> + igt_dynamic_f("amdgpu-BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR")//amdgpu_ring_soft_recovery
> + set_next_test_to_run(sh_mem, BACKEND_SE_GC_SHADER_INVALID_PROGRAM_ADDR,
> + ip_background, ip_test, ring_id_good, ring_id_bad);
> + }
> + }
> +
> + igt_describe("Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes");
> + igt_subtest_with_dynamic("amdgpu-compute-BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING") {
> + if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> + igt_dynamic_f("amdgpu-compute-BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING")
> + set_next_test_to_run(sh_mem, BACKEND_SE_GC_SHADER_INVALID_PROGRAM_SETTING,
> + ip_background, ip_test, ring_id_good, ring_id_bad);
> + }
> + }
> +
> + igt_describe("Stressful-and-multiple-cs-of-bad and good shader-operations-using-multiple-processes");
> + igt_subtest_with_dynamic("amdgpu-compute-BACKEND_SE_GC_SHADER_INVALID_USER_DATA") {
> + if (arr_cap[ip_test] && get_next_rings(ring_id_good, info.available_rings, &ring_id_good, &ring_id_bad)) {
> + igt_dynamic_f("amdgpu-compute-BACKEND_SE_GC_SHADER_INVALID_USER_DATA")
> + set_next_test_to_run(sh_mem, BACKEND_SE_GC_SHADER_INVALID_USER_DATA,
> + ip_background, ip_test, ring_id_good, ring_id_bad);
> + }
> + }
> +
> + igt_fixture {
> + if (process == PROCESS_TEST) {
> + waitpid(monitor_child, &monitorExitMethod, 0);
> + waitpid(test_child, &testExitMethod, 0);
> + }
> + waitpid(pid_background, &backgrounExitMethod, 0);
> + free_contexts(device, arr_context_handle, const_num_of_tests);
> + amdgpu_device_deinitialize(device);
> + drm_close_driver(fd);
> + shared_mem_destroy(sh_mem, fd_shm, true);
> + posix_spawn_file_actions_destroy(&action);
> + }
> + free_command_line(argc, argv, path);
> +}
> diff --git a/tests/amdgpu/meson.build b/tests/amdgpu/meson.build
> index 3982a665f..36d65f44b 100644
> --- a/tests/amdgpu/meson.build
> +++ b/tests/amdgpu/meson.build
> @@ -57,6 +57,11 @@ if libdrm_amdgpu.found()
> else
> warning('libdrm <= 2.4.109 found, amd_pstate test not applicable')
> endif
> + if libdrm_amdgpu.version().version_compare('> 2.4.104')
> + amdgpu_progs +=[ 'amd_queue_reset',]
> + else
> + warning('libdrm <= 2.4.104 found, amd_queue_reset test not applicable')
> + endif
> amdgpu_deps += libdrm_amdgpu
> endif
>
> --
> 2.25.1
>
More information about the igt-dev
mailing list