[PATCH i-g-t, v3 5/5] tests/intel/xe_render_copy: Render under copy stress

Zbigniew Kempczyński zbigniew.kempczynski at intel.com
Thu Mar 27 13:19:11 UTC 2025


On Wed, Mar 26, 2025 at 01:03:00PM +0100, Francois Dugast wrote:
> These new tests are meant to observe the impact of stressing the copy
> engines with multiple copy jobs on a rendering job running in parallel.
> 
> Add the following tests:
> * "render-stress-0-copies"
> * "render-stress-1-copies"
> * "render-stress-2-copies"
> * "render-stress-4-copies"
> * "render-stress-16-copies"
> 
> It is expected to fail when many copies are running in parallel.
> 
> Signed-off-by: Francois Dugast <francois.dugast at intel.com>
> ---
>  tests/intel/xe_render_copy.c | 207 +++++++++++++++++++++++++++++++++++
>  1 file changed, 207 insertions(+)
> 
> diff --git a/tests/intel/xe_render_copy.c b/tests/intel/xe_render_copy.c
> index e2fbbc0f8..7739ba013 100644
> --- a/tests/intel/xe_render_copy.c
> +++ b/tests/intel/xe_render_copy.c
> @@ -14,6 +14,7 @@
>  #include "intel_bufops.h"
>  #include "xe/xe_ioctl.h"
>  #include "xe/xe_query.h"
> +#include "xe/xe_util.h"
>  
>  /**
>   * TEST: Copy memory using 3d engine
> @@ -437,6 +438,193 @@ static int render(struct buf_ops *bops, uint32_t tiling,
>  	return fails;
>  }
>  
> +/**
> + * TEST: Render while stressing copy functions
> + * Category: Core
> + * Mega feature: Render
> + * Sub-category: 3d
> + * Functionality: copy
> + * Test category: stress test
> + *
> + * SUBTEST: render-stress-%s-copies
> + * Description: Render while running %arg[1] parallel copies per supported engine
> + *
> + * arg[1]:
> + * @0: 0 parallel copies
> + * @1: 1 parallel copies
> + * @2: 2 parallel copies
> + * @4: 4 parallel copies
> + * @16: 16 parallel copies
> + */
> +
> +/*
> + * Copy parameters
> + */
> +#define COPY_SIZE		SZ_16M
> +#define COPY_N_SEQ_BLT_MEM	200
> +#define COPY_MAX_THREADS	64
> +
> +/*
> + * Render parameters
> + */
> +#define RENDER_TEST_TYPE	COPY_FULL
> +#define RENDER_TILING		T_LINEAR
> +#define RENDER_ITERATIONS	50
> +
> +static void stress_copy(int fd, uint32_t size, uint32_t region,
> +			struct drm_xe_engine_class_instance *hwe, int ncopies)
> +{
> +	uint32_t src_handle, dst_handle, vm, exec_queue, src_size;
> +	uint32_t bo_size = ALIGN(size, xe_get_default_alignment(fd));
> +	intel_ctx_t *ctx;
> +
> +	src_handle = xe_bo_create(fd, 0, bo_size, region, 0);
> +	dst_handle = xe_bo_create(fd, 0, bo_size, region, 0);
> +	vm = xe_vm_create(fd, 0, 0);
> +	exec_queue = xe_exec_queue_create(fd, vm, hwe, 0);
> +	ctx = intel_ctx_xe(fd, vm, exec_queue, 0, 0, 0);
> +
> +	src_size = bo_size;
> +
> +	xe_blt_mem_copy(fd, src_handle, dst_handle, ctx, src_size, size, 1, region, ncopies);

What I don't like about this approach is the arbitrary number of copies
(ncopies), which may not be a good fit for a render-copy stress test.

Imo a more reasonable approach would be:
1. Start a mem-copy loop with a conditional batch buffer end (cond BBE)
   inside. The cond BBE should stop the copy loop execution once some
   value is written to the memory address it compares.
2. Execute the render-copy stress loop within igt_until_timeout().
3. Count the number of render copies performed within step 2.
4. Write to the memory address from step 1 to stop the copy loop
   execution (rough sketch below).
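
Something along these lines - just a sketch, assuming hypothetical helpers
submit_copy_loop_with_cond_bbe() and stop_copy_loop() (the first would submit
a self-looping mem-copy batch which re-checks a semaphore dword via cond BBE,
the second writes that dword so the batch terminates); render(), WIDTH,
HEIGHT, T_LINEAR and COPY_FULL are the existing ones from this file:

static void render_under_copy_stress(int fd, struct buf_ops *bops)
{
	uint32_t *stop_sem;	/* dword the cond BBE compares against */
	uint64_t render_loops = 0, duration;

	/* 1. Kick off the self-resubmitting copy loop on the copy engine. */
	stop_sem = submit_copy_loop_with_cond_bbe(fd);

	/* 2. Render for a fixed wall-clock budget instead of a fixed ncopies. */
	igt_until_timeout(5) {
		render(bops, T_LINEAR, WIDTH, HEIGHT, COPY_FULL, &duration);
		render_loops++;	/* 3. count render copies done under stress */
	}

	/* 4. Write the compared dword so the cond BBE ends the copy loop. */
	stop_copy_loop(fd, stop_sem);

	igt_info("Render copies completed under copy stress: %" PRIu64 "\n",
		 render_loops);
}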

I still have some doubts whether this shouldn't rather be a benchmark.
For each platform/configuration you may get different results. How
would we maintain them inside igt? Which durations are expected and
which are not? What about executions on system memory / vram? Which
durations are expected on BMG2, and which on LNL, etc.?
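
To make that concern concrete: the hard-coded expectation at the end of
render_stress_copy() bakes a single duration limit into a functional test.
Purely as an illustration (RENDER_MAX_DURATION_NS is a made-up knob, not
something in the patch), I'd rather see the limit made explicit and
per-platform tunable, or the subtest turned into a report-only benchmark:

/* Hypothetical knob instead of the magic 1 s constant. */
#define RENDER_MAX_DURATION_NS	(1ULL * 1000 * 1000 * 1000)

	igt_assert_lt_u64(data_render.duration_max, RENDER_MAX_DURATION_NS);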

--
Zbigniew

> +
> +	gem_close(fd, src_handle);
> +	gem_close(fd, dst_handle);
> +	xe_exec_queue_destroy(fd, exec_queue);
> +	xe_vm_destroy(fd, vm);
> +	free(ctx);
> +}
> +
> +typedef struct {
> +	int fd;
> +	uint32_t size;
> +	uint32_t region;
> +	struct drm_xe_engine_class_instance *hwe;
> +	uint32_t ncopies;
> +} data_thread_stress_copy;
> +
> +static void *run_thread_stress_copy(void *arg)
> +{
> +	data_thread_stress_copy *data = (data_thread_stress_copy *)arg;
> +
> +	stress_copy(data->fd, data->size, data->region, data->hwe, data->ncopies);
> +	pthread_exit(NULL);
> +}
> +
> +static void data_thread_stress_copy_init(data_thread_stress_copy *data, int fd)
> +{
> +	data->fd = fd;
> +	data->size = COPY_SIZE;
> +	data->ncopies = COPY_N_SEQ_BLT_MEM;
> +}
> +
> +typedef struct {
> +	int fd;
> +	uint32_t render_width;
> +	uint32_t render_height;
> +	uint32_t render_tiling;
> +	enum render_copy_testtype render_testtype;
> +	uint32_t iterations;
> +	uint64_t duration_total;
> +	uint64_t duration_min;
> +	uint64_t duration_max;
> +} data_thread_render;
> +
> +static void *run_thread_render(void *arg)
> +{
> +	data_thread_render *data = (data_thread_render *)arg;
> +	struct buf_ops *bops;
> +
> +	bops = buf_ops_create(data->fd);
> +
> +	for (int i = 0; i < data->iterations; i++) {
> +		uint64_t duration;
> +
> +		render(bops, data->render_tiling, data->render_width, data->render_height,
> +		       data->render_testtype, &duration);
> +		data->duration_total += duration;
> +		if (duration < data->duration_min)
> +			data->duration_min = duration;
> +		if (duration > data->duration_max)
> +			data->duration_max = duration;
> +	}
> +
> +	pthread_exit(NULL);
> +}
> +
> +static void data_thread_render_init(data_thread_render *data, int fd)
> +{
> +	data->fd = fd;
> +	data->duration_total = 0;
> +	data->duration_min = -1;
> +	data->duration_max = 0;
> +	data->render_width = WIDTH;
> +	data->render_height = HEIGHT;
> +	data->render_tiling = RENDER_TILING;
> +	data->render_testtype = RENDER_TEST_TYPE;
> +	data->iterations = RENDER_ITERATIONS;
> +}
> +
> +static bool has_copy_function(struct drm_xe_engine_class_instance *hwe)
> +{
> +	return hwe->engine_class == DRM_XE_ENGINE_CLASS_COPY;
> +}
> +
> +static void render_stress_copy(int fd, struct igt_collection *set,
> +			       uint32_t nparallel_copies_per_engine)
> +{
> +	struct igt_collection *regions;
> +	struct drm_xe_engine_class_instance *hwe;
> +	data_thread_stress_copy data_stress_copy[COPY_MAX_THREADS];
> +	pthread_t threads_stress_copy[COPY_MAX_THREADS];
> +	int count_threads_stress_copy = 0;
> +	data_thread_render data_render;
> +	pthread_t thread_render;
> +
> +	data_thread_render_init(&data_render, fd);
> +	igt_assert(pthread_create(&thread_render,
> +				  NULL,
> +				  run_thread_render,
> +				  &data_render) == 0);
> +
> +	for_each_variation_r(regions, 1, set) {
> +		xe_for_each_engine(fd, hwe) {
> +			if (!has_copy_function(hwe))
> +				continue;
> +
> +			for (int i = 0; i < nparallel_copies_per_engine; i++) {
> +				data_thread_stress_copy_init(
> +					&data_stress_copy[count_threads_stress_copy], fd);
> +				data_stress_copy[count_threads_stress_copy].region =
> +					igt_collection_get_value(regions, 0);
> +				data_stress_copy[count_threads_stress_copy].hwe = hwe;
> +				igt_assert(pthread_create(
> +						   &threads_stress_copy[count_threads_stress_copy],
> +						   NULL,
> +						   run_thread_stress_copy,
> +						   &data_stress_copy[count_threads_stress_copy])
> +					   == 0);
> +				count_threads_stress_copy++;
> +				igt_assert_lt(count_threads_stress_copy, COPY_MAX_THREADS);
> +			}
> +		}
> +	}
> +
> +	for (int i = 0; i < count_threads_stress_copy; i++)
> +		pthread_join(threads_stress_copy[i], NULL);
> +	pthread_join(thread_render, NULL);
> +
> +	igt_info("Render duration: avg = %ld ns, min = %ld ns, max = %ld ns\n",
> +		 data_render.duration_total / data_render.iterations,
> +		 data_render.duration_min, data_render.duration_max);
> +
> +	/*
> +	 * Even with the loops, this simple render should not take longer
> +	 * than one second to complete.
> +	 */
> +	igt_assert_lt_u64(data_render.duration_max, 1000000000);
> +}
> +
>  static int opt_handler(int opt, int opt_index, void *data)
>  {
>  	switch (opt) {
> @@ -477,11 +665,25 @@ igt_main_args("dpiW:H:", NULL, help_str, opt_handler, NULL)
>  	struct buf_ops *bops;
>  	const char *tiling_name;
>  	int tiling;
> +	struct igt_collection *set;
> +	const struct section {
> +		const char *name;
> +		unsigned int nparallel_copies_per_engine;
> +	} sections[] = {
> +		{ "0", 0 },
> +		{ "1", 1 },
> +		{ "2", 2 },
> +		{ "4", 4 },
> +		{ "16", 16 },
> +		{ NULL },
> +	};
>  
>  	igt_fixture {
>  		xe = drm_open_driver(DRIVER_XE);
>  		bops = buf_ops_create(xe);
>  		srand(time(NULL));
> +		set = xe_get_memory_region_set(xe,
> +					       DRM_XE_MEM_REGION_CLASS_SYSMEM);
>  	}
>  
>  	for (int id = 0; id <= COPY_FULL_COMPRESSED; id++) {
> @@ -501,6 +703,11 @@ igt_main_args("dpiW:H:", NULL, help_str, opt_handler, NULL)
>  		}
>  	}
>  
> +	for (const struct section *s = sections; s->name; s++)
> +		igt_subtest_f("render-stress-%s-copies", s->name) {
> +			render_stress_copy(xe, set, s->nparallel_copies_per_engine);
> +		}
> +
>  	igt_fixture {
>  		buf_ops_destroy(bops);
>  		drm_close_driver(xe);
> -- 
> 2.43.0
> 

