[PATCH] tests/intel: Add xe_coredump test, v3.

Matthew Brost matthew.brost at intel.com
Tue Jun 11 22:17:17 UTC 2024


On Tue, Jun 11, 2024 at 12:13:22PM +0200, Maarten Lankhorst wrote:
> Add a simple test that forces a GPU hang and then reads the resulting
> devcoredump file. Map a single userptr and BO, and dump the contents of
> those.
> 
> Changes since v1:
> - Almost completely rewrite test for readability, based on feedback.
> Changes since v2:
> - Remove retrying opening fd, 1s wait was missing and test passed without.
> 
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
> ---
>  tests/intel/xe_coredump.c | 251 ++++++++++++++++++++++++++++++++++++++
>  tests/meson.build         |   1 +
>  2 files changed, 252 insertions(+)
>  create mode 100644 tests/intel/xe_coredump.c
> 
> diff --git a/tests/intel/xe_coredump.c b/tests/intel/xe_coredump.c
> new file mode 100644
> index 000000000..938b718db
> --- /dev/null
> +++ b/tests/intel/xe_coredump.c
> @@ -0,0 +1,251 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2023 Intel Corporation
> + */
> +
> +/**
> + * TEST: Check devcoredump functionality
> + * Category: Software building block
> + * Sub-category: devcoredump
> + * Run type: BAT
> + * Functionality: Error dumping and readout.
> + */
> +
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <glob.h>
> +#include <string.h>
> +#include <wchar.h>
> +#include <sys/stat.h>
> +
> +#include "igt.h"
> +#include "igt_device.h"
> +#include "igt_io.h"
> +#include "igt_syncobj.h"
> +#include "igt_sysfs.h"
> +
> +#include "intel_pat.h"
> +
> +#include "xe_drm.h"
> +#include "xe/xe_ioctl.h"
> +#include "xe/xe_query.h"
> +
> +#ifndef DRM_XE_VM_BIND_FLAG_DUMPABLE
> +#define DRM_XE_VM_BIND_FLAG_DUMPABLE (1<<3)
> +#endif
> +
> +static struct xe_device *xe;
> +static uint32_t batch_bo;
> +static uint32_t *batch;
> +static void *userptr;
> +static uint32_t vm;
> +static int sysfd;
> +
> +#define MAX_N_ENGINES 32
> +
> +static void tryclear_hang(void)
> +{
> +	int fd = openat(sysfd, "devcoredump/data", O_RDWR);
> +	char buf[256];
> +
> +	if (fd < 0)
> +		return;
> +
> +	while (read(fd, buf, sizeof(buf)) > 0)
> +		{ }
> +	write(fd, "1", 1);
> +	close(fd);
> +}
> +
> +/*
> + * Helper to read and clear devcore.  We want to read it completely to ensure
> + * we catch any kernel side regressions like:
> + * https://gitlab.freedesktop.org/drm/msm/-/issues/20
> + */
> +static void
> +read_and_clear_hang(void)
> +{
> +	char buf[0x1000];
> +	int fd = openat(sysfd, "devcoredump/data", O_RDWR);
> +	igt_assert(fd >= 0);
> +
> +	/*
> +	 * We want to read the entire file but we can throw away the
> +	 * contents.. we just want to make sure that we exercise the
> +	 * kernel side codepaths hit when reading the devcore from
> +	 * sysfs
> +	 */
> +	igt_debug("---- begin coredump ----\n");
> +	while (1) {
> +		ssize_t ret;
> +
> +		ret = igt_readn(fd, buf, sizeof(buf) - 1);
> +		igt_assert(ret >= 0);
> +		if (ret == 0)
> +			break;
> +		buf[ret] = '\0';
> +		igt_debug("%s", buf);
> +	}
> +
> +	igt_debug("---- end coredump ----\n");
> +
> +	/* Clear the devcore: */
> +	igt_writen(fd, "1", 1);
> +
> +	close(fd);
> +}
> +
> +static void free_execqueue(void)
> +{
> +	int fd = xe->fd;
> +	xe_vm_destroy(fd, vm);
> +	vm = 0;
> +	gem_close(fd, batch_bo);
> +	munmap(batch, xe->default_alignment);
> +	munmap(userptr, xe->default_alignment);
> +	batch = userptr = NULL;
> +}
> +
> +static void recreate_execqueue(bool dumpable)
> +{
> +	struct drm_xe_sync sync = {
> +		.type = DRM_XE_SYNC_TYPE_SYNCOBJ,
> +		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +	};
> +	struct drm_xe_vm_bind_op bind_ops[2] = { };
> +	int fd = xe->fd;
> +	uint32_t *ptr;
> +	uint64_t offset = xe->default_alignment - 4;
> +
> +	tryclear_hang();
> +
> +	if (vm)
> +		free_execqueue();
> +
> +	vm = xe_vm_create(fd, 0, 0);
> +	batch_bo = xe_bo_create(fd, vm, xe->default_alignment, system_memory(fd), 0);
> +	ptr = batch = xe_bo_map(xe->fd, batch_bo, xe->default_alignment);
> +
> +	memset(batch, 0, xe->default_alignment);
> +	*(ptr++) = MI_SEMAPHORE_WAIT | MI_SEMAPHORE_POLL | MI_SEMAPHORE_SAD_GTE_SDD;
> +	*(ptr++) = 1;
> +	*(ptr++) = offset >> 32;
> +	*(ptr++) = offset;
> +	*(ptr++) = MI_BATCH_BUFFER_END;

So if I'm reading this code correctly the hangs depend on job time
mechanism (i.e. by default after 5 seconds the job gets timed out and an
error capture occurs). Have you considered using spin library rather
than open coding this? xe_exec_reset uses this.

Another option to consider is build in the error capture testing into
that existing test too. This would personally be my preference to keep
the IGT suite tiddy in terms of number of tests. Not a blocker but I
think it is worth considering.

Matt 

> +
> +	userptr = mmap(0, xe->default_alignment, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> +	wmemset(userptr, 0xf1234567, xe->default_alignment / sizeof(wchar_t));
> +
> +	bind_ops[0].op = DRM_XE_VM_BIND_OP_MAP;
> +	bind_ops[0].obj = batch_bo;
> +	bind_ops[0].addr = 0;
> +
> +	bind_ops[1].op = DRM_XE_VM_BIND_OP_MAP_USERPTR;
> +	bind_ops[1].userptr = (size_t)userptr;
> +	bind_ops[1].addr = 1ULL << 40ULL;
> +
> +	if (dumpable)
> +		bind_ops[0].flags = bind_ops[1].flags = DRM_XE_VM_BIND_FLAG_DUMPABLE;
> +	bind_ops[0].range = bind_ops[1].range = xe->default_alignment;
> +	bind_ops[0].pat_index = bind_ops[1].pat_index = intel_get_pat_idx_wb(fd);
> +
> +	sync.handle = syncobj_create(fd, 0);
> +	xe_vm_bind_array(fd, vm, 0, bind_ops, ARRAY_SIZE(bind_ops), &sync, 1);
> +	syncobj_wait(fd, &sync.handle, 1, INT64_MAX, 0, NULL);
> +	syncobj_destroy(fd, sync.handle);
> +}
> +
> +static uint32_t hang_engine(struct drm_xe_engine_class_instance *hwe)
> +{
> +	uint32_t engine;
> +	struct drm_xe_sync sync = {
> +		.type = DRM_XE_SYNC_TYPE_SYNCOBJ,
> +		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +		.handle = syncobj_create(xe->fd, 0),
> +	};
> +
> +	engine = xe_exec_queue_create(xe->fd, vm, hwe, 0);
> +	xe_exec_sync(xe->fd, engine, 0, &sync, 1);
> +
> +	return sync.handle;
> +}
> +
> +static void test_hang_one(void)
> +{
> +	uint32_t syncobj = hang_engine(&xe_engine(xe->fd, 0)->instance);
> +
> +	syncobj_wait(xe->fd, &syncobj, 1, INT64_MAX, 0, NULL);
> +	syncobj_destroy(xe->fd, syncobj);
> +
> +	read_and_clear_hang();
> +}
> +
> +/**
> + * SUBTEST: basic
> + * Description: Read out a full dumped VM.
> + * Test category: functionality test
> + */
> +static void basic(void)
> +{
> +	recreate_execqueue(true);
> +	test_hang_one();
> +}
> +
> +/**
> + * SUBTEST: empty-vm
> + * Description: Create an error dump without anything in VM to dump.
> + * Test category: functionality test
> + */
> +static void empty_vm(void)
> +{
> +	recreate_execqueue(false);
> +	test_hang_one();
> +}
> +
> +/**
> + * SUBTEST: all-simultaneously
> + * Description: Hang all engines at the same time, read out the dump.
> + * Test category: robustness test
> + */
> +static void all_simultaneously(void)
> +{
> +	uint32_t syncobj[MAX_N_ENGINES], i = 0;
> +	struct drm_xe_engine_class_instance *hwe;
> +
> +	recreate_execqueue(true);
> +	xe_for_each_engine(xe->fd, hwe)
> +		syncobj[i++] = hang_engine(hwe);
> +
> +	syncobj_wait(xe->fd, syncobj, i, INT64_MAX, 0, NULL);
> +	while (i--)
> +		syncobj_destroy(xe->fd, syncobj[i]);
> +
> +	read_and_clear_hang();
> +}
> +
> +igt_main
> +{
> +	igt_fixture {
> +		struct stat stat;
> +		int fd = drm_open_driver_render(DRIVER_XE);
> +		char str[256];
> +		xe = xe_device_get(fd);
> +
> +		igt_assert_eq(fstat(fd, &stat), 0);
> +		sprintf(str, "/sys/dev/char/%ld:%ld/device", stat.st_rdev >> 8, stat.st_rdev & 0xff);
> +		sysfd = open(str, O_DIRECTORY);
> +		igt_assert(sysfd >= 0);
> +	}
> +
> +	igt_describe("Test that hw fault coredump readout works");
> +	igt_subtest("basic")
> +		basic();
> +
> +	igt_describe("Hang all engines simultaneously");
> +	igt_subtest("all-simultaneously")
> +		all_simultaneously();
> +
> +	igt_describe("Ensure that snapshot works without anything to capture");
> +	igt_subtest("empty-vm")
> +		empty_vm();
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 758ae090c..f611e2e2c 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -280,6 +280,7 @@ intel_xe_progs = [
>  	'xe_compute',
>  	'xe_compute_preempt',
>  	'xe_copy_basic',
> +	'xe_coredump',
>  	'xe_dma_buf_sync',
>  	'xe_debugfs',
>  	'xe_drm_fdinfo',
> -- 
> 2.43.0
> 


More information about the igt-dev mailing list