[PATCH] tests/intel: Add xe_coredump test, v3.

John Harrison john.c.harrison at intel.com
Thu Jun 13 23:27:00 UTC 2024


On 6/11/2024 03:13, Maarten Lankhorst wrote:
> Add a simple test that forces a GPU hang and then reads the resulting
> devcoredump file. Map a single userptr and BO, and dump the contents of
> those.
It would be useful to add a more memory intensive test as well. I 
believe the i915 version has a sub-test which allocates the entire LMEM 
of a discrete card or something like half the SMEM of an integrated and 
verifies that it all comes out in the error capture.

While that extreme memory usage may not be especially common, it is a 
quite common debug tool to enable capture of all an OpenGL app's buffers 
to allow replay of the batch later. And that can be significant memory 
usage - 100MBs+. So it would be good to have a test which verifies such 
large buffers and large quantities of buffers can be captured and returned.

John.

>
> Changes since v1:
> - Almost completely rewrite test for readability, based on feedback.
> Changes since v2:
> - Remove retrying opening fd, 1s wait was missing and test passed without.
>
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
> ---
>   tests/intel/xe_coredump.c | 251 ++++++++++++++++++++++++++++++++++++++
>   tests/meson.build         |   1 +
>   2 files changed, 252 insertions(+)
>   create mode 100644 tests/intel/xe_coredump.c
>
> diff --git a/tests/intel/xe_coredump.c b/tests/intel/xe_coredump.c
> new file mode 100644
> index 000000000..938b718db
> --- /dev/null
> +++ b/tests/intel/xe_coredump.c
> @@ -0,0 +1,251 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2023 Intel Corporation
> + */
> +
> +/**
> + * TEST: Check devcoredump functionality
> + * Category: Software building block
> + * Sub-category: devcoredump
> + * Run type: BAT
> + * Functionality: Error dumping and readout.
> + */
> +
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <glob.h>
> +#include <string.h>
> +#include <wchar.h>
> +#include <sys/stat.h>
> +
> +#include "igt.h"
> +#include "igt_device.h"
> +#include "igt_io.h"
> +#include "igt_syncobj.h"
> +#include "igt_sysfs.h"
> +
> +#include "intel_pat.h"
> +
> +#include "xe_drm.h"
> +#include "xe/xe_ioctl.h"
> +#include "xe/xe_query.h"
> +
> +#ifndef DRM_XE_VM_BIND_FLAG_DUMPABLE
> +#define DRM_XE_VM_BIND_FLAG_DUMPABLE (1<<3)
> +#endif
> +
> +static struct xe_device *xe;
> +static uint32_t batch_bo;
> +static uint32_t *batch;
> +static void *userptr;
> +static uint32_t vm;
> +static int sysfd;
> +
> +#define MAX_N_ENGINES 32
> +
> +static void tryclear_hang(void)
> +{
> +	int fd = openat(sysfd, "devcoredump/data", O_RDWR);
> +	char buf[256];
> +
> +	if (fd < 0)
> +		return;
> +
> +	while (read(fd, buf, sizeof(buf)) > 0)
> +		{ }
> +	write(fd, "1", 1);
> +	close(fd);
> +}
> +
> +/*
> + * Helper to read and clear devcore.  We want to read it completely to ensure
> + * we catch any kernel side regressions like:
> + * https://gitlab.freedesktop.org/drm/msm/-/issues/20
> + */
> +static void
> +read_and_clear_hang(void)
> +{
> +	char buf[0x1000];
> +	int fd = openat(sysfd, "devcoredump/data", O_RDWR);
> +	igt_assert(fd >= 0);
> +
> +	/*
> +	 * We want to read the entire file but we can throw away the
> +	 * contents.. we just want to make sure that we exercise the
> +	 * kernel side codepaths hit when reading the devcore from
> +	 * sysfs
> +	 */
> +	igt_debug("---- begin coredump ----\n");
> +	while (1) {
> +		ssize_t ret;
> +
> +		ret = igt_readn(fd, buf, sizeof(buf) - 1);
> +		igt_assert(ret >= 0);
> +		if (ret == 0)
> +			break;
> +		buf[ret] = '\0';
> +		igt_debug("%s", buf);
> +	}
> +
> +	igt_debug("---- end coredump ----\n");
> +
> +	/* Clear the devcore: */
> +	igt_writen(fd, "1", 1);
> +
> +	close(fd);
> +}
> +
> +static void free_execqueue(void)
> +{
> +	int fd = xe->fd;
> +	xe_vm_destroy(fd, vm);
> +	vm = 0;
> +	gem_close(fd, batch_bo);
> +	munmap(batch, xe->default_alignment);
> +	munmap(userptr, xe->default_alignment);
> +	batch = userptr = NULL;
> +}
> +
> +static void recreate_execqueue(bool dumpable)
> +{
> +	struct drm_xe_sync sync = {
> +		.type = DRM_XE_SYNC_TYPE_SYNCOBJ,
> +		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +	};
> +	struct drm_xe_vm_bind_op bind_ops[2] = { };
> +	int fd = xe->fd;
> +	uint32_t *ptr;
> +	uint64_t offset = xe->default_alignment - 4;
> +
> +	tryclear_hang();
> +
> +	if (vm)
> +		free_execqueue();
> +
> +	vm = xe_vm_create(fd, 0, 0);
> +	batch_bo = xe_bo_create(fd, vm, xe->default_alignment, system_memory(fd), 0);
> +	ptr = batch = xe_bo_map(xe->fd, batch_bo, xe->default_alignment);
> +
> +	memset(batch, 0, xe->default_alignment);
> +	*(ptr++) = MI_SEMAPHORE_WAIT | MI_SEMAPHORE_POLL | MI_SEMAPHORE_SAD_GTE_SDD;
> +	*(ptr++) = 1;
> +	*(ptr++) = offset >> 32;
> +	*(ptr++) = offset;
> +	*(ptr++) = MI_BATCH_BUFFER_END;
> +
> +	userptr = mmap(0, xe->default_alignment, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> +	wmemset(userptr, 0xf1234567, xe->default_alignment / sizeof(wchar_t));
> +
> +	bind_ops[0].op = DRM_XE_VM_BIND_OP_MAP;
> +	bind_ops[0].obj = batch_bo;
> +	bind_ops[0].addr = 0;
> +
> +	bind_ops[1].op = DRM_XE_VM_BIND_OP_MAP_USERPTR;
> +	bind_ops[1].userptr = (size_t)userptr;
> +	bind_ops[1].addr = 1ULL << 40ULL;
> +
> +	if (dumpable)
> +		bind_ops[0].flags = bind_ops[1].flags = DRM_XE_VM_BIND_FLAG_DUMPABLE;
> +	bind_ops[0].range = bind_ops[1].range = xe->default_alignment;
> +	bind_ops[0].pat_index = bind_ops[1].pat_index = intel_get_pat_idx_wb(fd);
> +
> +	sync.handle = syncobj_create(fd, 0);
> +	xe_vm_bind_array(fd, vm, 0, bind_ops, ARRAY_SIZE(bind_ops), &sync, 1);
> +	syncobj_wait(fd, &sync.handle, 1, INT64_MAX, 0, NULL);
> +	syncobj_destroy(fd, sync.handle);
> +}
> +
> +static uint32_t hang_engine(struct drm_xe_engine_class_instance *hwe)
> +{
> +	uint32_t engine;
> +	struct drm_xe_sync sync = {
> +		.type = DRM_XE_SYNC_TYPE_SYNCOBJ,
> +		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +		.handle = syncobj_create(xe->fd, 0),
> +	};
> +
> +	engine = xe_exec_queue_create(xe->fd, vm, hwe, 0);
> +	xe_exec_sync(xe->fd, engine, 0, &sync, 1);
> +
> +	return sync.handle;
> +}
> +
> +static void test_hang_one(void)
> +{
> +	uint32_t syncobj = hang_engine(&xe_engine(xe->fd, 0)->instance);
> +
> +	syncobj_wait(xe->fd, &syncobj, 1, INT64_MAX, 0, NULL);
> +	syncobj_destroy(xe->fd, syncobj);
> +
> +	read_and_clear_hang();
> +}
> +
> +/**
> + * SUBTEST: basic
> + * Description: Read out a full dumped VM.
> + * Test category: functionality test
> + */
> +static void basic(void)
> +{
> +	recreate_execqueue(true);
> +	test_hang_one();
> +}
> +
> +/**
> + * SUBTEST: empty-vm
> + * Description: Create an error dump without anything in VM to dump.
> + * Test category: functionality test
> + */
> +static void empty_vm(void)
> +{
> +	recreate_execqueue(false);
> +	test_hang_one();
> +}
> +
> +/**
> + * SUBTEST: all-simultaneously
> + * Description: Hang all engines at the same time, read out the dump.
> + * Test category: robustness test
> + */
> +static void all_simultaneously(void)
> +{
> +	uint32_t syncobj[MAX_N_ENGINES], i = 0;
> +	struct drm_xe_engine_class_instance *hwe;
> +
> +	recreate_execqueue(true);
> +	xe_for_each_engine(xe->fd, hwe)
> +		syncobj[i++] = hang_engine(hwe);
> +
> +	syncobj_wait(xe->fd, syncobj, i, INT64_MAX, 0, NULL);
> +	while (i--)
> +		syncobj_destroy(xe->fd, syncobj[i]);
> +
> +	read_and_clear_hang();
> +}
> +
> +igt_main
> +{
> +	igt_fixture {
> +		struct stat stat;
> +		int fd = drm_open_driver_render(DRIVER_XE);
> +		char str[256];
> +		xe = xe_device_get(fd);
> +
> +		igt_assert_eq(fstat(fd, &stat), 0);
> +		sprintf(str, "/sys/dev/char/%ld:%ld/device", stat.st_rdev >> 8, stat.st_rdev & 0xff);
> +		sysfd = open(str, O_DIRECTORY);
> +		igt_assert(sysfd >= 0);
> +	}
> +
> +	igt_describe("Test that hw fault coredump readout works");
> +	igt_subtest("basic")
> +		basic();
> +
> +	igt_describe("Hang all engines simultaneously");
> +	igt_subtest("all-simultaneously")
> +		all_simultaneously();
> +
> +	igt_describe("Ensure that snapshot works without anything to capture");
> +	igt_subtest("empty-vm")
> +		empty_vm();
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 758ae090c..f611e2e2c 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -280,6 +280,7 @@ intel_xe_progs = [
>   	'xe_compute',
>   	'xe_compute_preempt',
>   	'xe_copy_basic',
> +	'xe_coredump',
>   	'xe_dma_buf_sync',
>   	'xe_debugfs',
>   	'xe_drm_fdinfo',



More information about the igt-dev mailing list