[PATCH 3/3] tests/amdgpu: add gang cs test

vitaly.prosyak at amd.com
Thu Jan 25 03:44:13 UTC 2024


From: Vitaly Prosyak <vitaly.prosyak at amd.com>

Add a gang command submission test.
IBs for different HW IPs are submitted together as a single command
submission. The test submits a copy command to the GFX ring, which
then waits for the completion of another copy command on the COMPUTE
ring; the compute copy takes longer because its copy size is much
bigger. The copy commands are therefore executed on the COMPUTE and
GFX rings as a single command from the user.
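
For reference, the core of the gang mechanism is a single CS request
carrying one IB per HW IP. A minimal sketch using the
amdgpu_cs_request_radv helpers used below (compute_ib_addr,
gfx_ib_addr and the *_num_dw sizes are placeholders for the real IB
GPU addresses and sizes in dwords):

  struct amdgpu_cs_request_radv request = {0};

  request.number_of_ibs = 2;
  request.ibs[0].ib_mc_address = compute_ib_addr;
  request.ibs[0].size = compute_ib_num_dw;
  request.ibs[0].ip_type = AMDGPU_HW_IP_COMPUTE;
  request.ibs[1].ib_mc_address = gfx_ib_addr;
  request.ibs[1].size = gfx_ib_num_dw;
  request.ibs[1].ip_type = AMDGPU_HW_IP_GFX;
  r = amdgpu_cs_submit_radv(device, ring_context, &request);

Once built, the subtest can be run directly, e.g.:

  ./amd_gang_cs --run-subtest amdgpu-cs-gang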

Cc: Jesse Zhang <jesse.zhang at amd.com>
Cc: Alex Deucher <alexander.deucher at amd.com>
Cc: Christian Koenig <christian.koenig at amd.com>
Signed-off-by: Yogesh Mohan Marimuthu <yogesh.mohanmarimuthu at amd.com>
Signed-off-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
Acked-by: Christian Koenig <christian.koenig at amd.com>
---
 tests/amdgpu/amd_gang_cs.c | 241 ++++++++++++++++++++++++++++++++++++++
 tests/amdgpu/meson.build   |   2 +-
 2 files changed, 242 insertions(+), 1 deletion(-)
 create mode 100644 tests/amdgpu/amd_gang_cs.c

diff --git a/tests/amdgpu/amd_gang_cs.c b/tests/amdgpu/amd_gang_cs.c
new file mode 100644
index 000000000..f01074e30
--- /dev/null
+++ b/tests/amdgpu/amd_gang_cs.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: MIT
+// Copyright 2023 Advanced Micro Devices, Inc.
+
+#include "igt.h"
+#include "drmtest.h"
+#include <amdgpu.h>
+#include <amdgpu_drm.h>
+#include "lib/amdgpu/amd_PM4.h"
+#include "lib/amdgpu/amd_ip_blocks.h"
+#include "lib/amdgpu/amd_memory.h"
+#include "lib/amdgpu/amd_cs_radv.h"
+
+#define IB_SIZE	4096
+
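+/*
+ * Build the compute IB: a write-data packet that fills the compute data
+ * BO with a test pattern; a second BO holds the IB with the packet itself.
+ */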
+static void
+prepare_compute_cp_packet(amdgpu_device_handle device,
+		struct amdgpu_ring_context *ring_context,
+		const struct amdgpu_ip_block_version *ip_block)
+{
+	int r;
+
+	/* allocate the data buffer for the compute ring */
+	r = amdgpu_bo_alloc_and_map(device,
+					ring_context->write_length * sizeof(uint32_t),
+					IB_SIZE, AMDGPU_GEM_DOMAIN_GTT, 0,
+					&ring_context->bo, (void **)&ring_context->bo_cpu,
+					&ring_context->bo_mc, &ring_context->va_handle);
+	igt_assert_eq(r, 0);
+	memset((void *)ring_context->bo_cpu, 0,
+			ring_context->write_length * sizeof(uint32_t));
+
+	/* allocate the buffer holding the pm4 packet for the compute ring */
+	r = amdgpu_bo_alloc_and_map(device, IB_SIZE + ring_context->write_length *
+					sizeof(uint32_t),
+					IB_SIZE, AMDGPU_GEM_DOMAIN_GTT, 0,
+					&ring_context->bo2, (void **)&ring_context->bo2_cpu,
+					&ring_context->bo_mc2, &ring_context->va_handle2);
+	igt_assert_eq(r, 0);
+
+	memset((void *)ring_context->bo2_cpu, 0,
+				ring_context->write_length * sizeof(uint32_t));
+	/* assign fields used by the ASIC-dependent function */
+	ring_context->pm4 = (uint32_t *)ring_context->bo2_cpu;
+	ip_block->funcs->write_linear(ip_block->funcs, ring_context,
+				&ring_context->pm4_dw);
+}
+
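+/*
+ * Build the gfx IB: a small write-data packet followed by a wait packet
+ * that polls the last dword written by the compute IB.
+ */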
+static void
+prepare_gfx_cp_mem_packet(amdgpu_device_handle device,
+		struct amdgpu_ring_context *ring_context,
+		const struct amdgpu_ip_block_version *ip_block)
+{
+	int r;
+	uint32_t write_length;
+	uint64_t bo_mc;
+
+	/* allocate the data buffer for the gfx ring */
+	r = amdgpu_bo_alloc_and_map(device,
+					ring_context->write_length2 * sizeof(uint32_t),
+					IB_SIZE, AMDGPU_GEM_DOMAIN_GTT, 0,
+					&ring_context->bo3, (void **)&ring_context->bo3_cpu,
+					&ring_context->bo_mc3, &ring_context->va_handle3);
+	igt_assert_eq(r, 0);
+	memset((void *)ring_context->bo3_cpu, 0,
+			ring_context->write_length2 * sizeof(uint32_t));
+
+	/* allocate the buffer holding the pm4 packet for the gfx ring */
+	r = amdgpu_bo_alloc_and_map(device, IB_SIZE + ring_context->write_length2 *
+					sizeof(uint32_t),
+					IB_SIZE, AMDGPU_GEM_DOMAIN_GTT, 0,
+					&ring_context->bo4, (void **)&ring_context->bo4_cpu,
+					&ring_context->bo_mc4, &ring_context->va_handle4);
+	igt_assert_eq(r, 0);
+	memset((void *)ring_context->bo4_cpu, 0,
+			ring_context->write_length2 * sizeof(uint32_t));
+	/* assign fields used by the ASIC-dependent functions */
+	ring_context->pm4 = (uint32_t *)ring_context->bo4_cpu;
+	bo_mc = ring_context->bo_mc;
+	ring_context->bo_mc = ring_context->bo_mc3;
+	write_length = ring_context->write_length;
+	ring_context->write_length = ring_context->write_length2;
+
+	ip_block->funcs->write_linear(ip_block->funcs, ring_context,
+				&ring_context->pm4_dw2);
+	/* wait on the last dword of the compute data buffer */
+	ring_context->bo_mc = bo_mc + (write_length - 1) * 4;
+	ip_block->funcs->wait_reg_mem(ip_block->funcs, ring_context,
+				&ring_context->pm4_dw2);
+	ring_context->bo_mc = bo_mc;
+}
+
+static void
+amdgpu_cs_gang(amdgpu_device_handle device)
+{
+	/* keep the compute write-data packet as big as the IB can hold so that,
+	 * even on a powerful GPU, the wait packet on the gfx queue has to wait.
+	 */
+	const int sdma_write_length_compute = IB_SIZE * 3;
+	/* keep the gfx write-data packet small so that gfx has to wait for compute */
+	const int sdma_write_length_gfx = 4;
+
+	struct amdgpu_cs_request_radv request;
+	struct drm_amdgpu_bo_list_entry bo_handles[2] = {0};
+	struct amdgpu_ring_context *ring_context = NULL;
+
+	int r;
+
+	const struct amdgpu_ip_block_version *gfx_ip_block =
+			get_ip_block(device, AMD_IP_GFX);
+	const struct amdgpu_ip_block_version *compute_ip_block =
+			get_ip_block(device, AMD_IP_COMPUTE);
+
+	struct amdgpu_cs_fence fence_status = {0};
+	uint32_t expired;
+
+	memset(&request, 0, sizeof(request));
+	ring_context = malloc(sizeof(*ring_context));
+	memset(ring_context, 0, sizeof(*ring_context));
+	ring_context->write_length = sdma_write_length_compute;
+	ring_context->write_length2 = sdma_write_length_gfx;
+
+	r = amdgpu_cs_ctx_create(device, &ring_context->context_handle);
+	igt_assert_eq(r, 0);
+
+	prepare_compute_cp_packet(device, ring_context, compute_ip_block);
+	prepare_gfx_cp_mem_packet(device, ring_context, gfx_ip_block);
+
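+	/*
+	 * Both IBs go into one request; the different ip_type values are
+	 * what makes the kernel treat them as a gang submission.
+	 */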
+	request.number_of_ibs = 2;
+
+	request.ibs[0].ib_mc_address = ring_context->bo_mc2; /* compute pm4 packet addr */
+	request.ibs[0].size = ring_context->pm4_dw; /* compute pm4 size in dwords */
+	request.ibs[0].ip_type = AMDGPU_HW_IP_COMPUTE;
+
+	request.ibs[1].ib_mc_address = ring_context->bo_mc4; /* gfx pm4 packet addr */
+	request.ibs[1].size = ring_context->pm4_dw2; /* gfx pm4 size in dwords */
+	request.ibs[1].ip_type = AMDGPU_HW_IP_GFX;
+
+	bo_handles[0].bo_handle = amdgpu_get_bo_handle(ring_context->bo4);
+	bo_handles[0].bo_priority = 0;
+	bo_handles[1].bo_handle = amdgpu_get_bo_handle(ring_context->bo2);
+	bo_handles[1].bo_priority = 0;
+	request.handles = bo_handles;
+	request.num_handles = 2;
+
+	/* submit pm4 packets for gfx and compute as gang */
+	r = amdgpu_cs_submit_radv(device, ring_context, &request);
+
+	igt_assert_eq(r, 0);
+
+	/* wait for fence */
+	fence_status.context = ring_context->context_handle;
+	fence_status.ip_type = AMDGPU_HW_IP_GFX;
+	fence_status.fence = request.seq_no;
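+	/*
+	 * The gang submission returns a single sequence number; because the
+	 * gfx IB waits on the compute write, this one fence covers both IBs.
+	 */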
+	r = amdgpu_cs_wait_fences(&fence_status, 1, 1,
+				  AMDGPU_TIMEOUT_INFINITE,
+				  &expired, NULL);
+	igt_assert_eq(r, 0);
+
+	/* verify the compute copy result matches the expected pattern */
+	ring_context->write_length = sdma_write_length_compute;
+	compute_ip_block->funcs->compare(compute_ip_block->funcs, ring_context, 1);
+
+	/* verify the gfx copy result matches the expected pattern */
+	ring_context->bo_cpu = ring_context->bo3_cpu;
+	ring_context->write_length = sdma_write_length_gfx;
+	gfx_ip_block->funcs->compare(gfx_ip_block->funcs, ring_context, 1);
+
+	amdgpu_bo_unmap_and_free(ring_context->bo, ring_context->va_handle,
+			ring_context->bo_mc, sdma_write_length_compute * sizeof(uint32_t));
+
+	amdgpu_bo_unmap_and_free(ring_context->bo2, ring_context->va_handle2,
+			ring_context->bo_mc2, IB_SIZE + sdma_write_length_compute * sizeof(uint32_t));
+
+	amdgpu_bo_unmap_and_free(ring_context->bo3, ring_context->va_handle3,
+			ring_context->bo_mc3, sdma_write_length_gfx * sizeof(uint32_t));
+
+	amdgpu_bo_unmap_and_free(ring_context->bo4, ring_context->va_handle4,
+			ring_context->bo_mc4, IB_SIZE + sdma_write_length_gfx * sizeof(uint32_t));
+
+	r = amdgpu_cs_ctx_free(ring_context->context_handle);
+	igt_assert_eq(r, 0);
+	free(ring_context);
+}
+
+igt_main
+{
+	amdgpu_device_handle device;
+	struct amdgpu_gpu_info gpu_info = {0};
+	int fd = -1;
+	int r;
+	bool arr_cap[AMD_IP_MAX] = {0};
+
+	igt_fixture {
+		uint32_t major, minor;
+		int err;
+
+		fd = drm_open_driver(DRIVER_AMDGPU);
+
+		err = amdgpu_device_initialize(fd, &major, &minor, &device);
+		igt_require(err == 0);
+
+		igt_info("Initialized amdgpu, driver version %d.%d\n",
+			 major, minor);
+
+		r = amdgpu_query_gpu_info(device, &gpu_info);
+		igt_assert_eq(r, 0);
+		r = setup_amdgpu_ip_blocks(major, minor, &gpu_info, device);
+		igt_assert_eq(r, 0);
+		asic_rings_readness(device, 1, arr_cap);
+
+	}
+
+	igt_describe("Test GPU gang cs for gfx and compute rings");
+	igt_subtest_with_dynamic("amdgpu-cs-gang") {
+		if (arr_cap[AMD_IP_GFX] && arr_cap[AMD_IP_COMPUTE]) {
+			igt_dynamic_f("amdgpu-cs-gang-AMD_IP_GFX-AMD_IP_COMPUTE")
+				amdgpu_cs_gang(device);
+		}
+	}
+
+	igt_fixture {
+		amdgpu_device_deinitialize(device);
+		drm_close_driver(fd);
+	}
+}
diff --git a/tests/amdgpu/meson.build b/tests/amdgpu/meson.build
index bbb8edc93..5bd502496 100644
--- a/tests/amdgpu/meson.build
+++ b/tests/amdgpu/meson.build
@@ -14,6 +14,7 @@ if libdrm_amdgpu.found()
 			  'amd_dp_dsc',
 			  'amd_freesync_video_mode',
+			  'amd_gang_cs',
 			  'amd_hotplug',
 			  'amd_ilr',
 			  'amd_info',
 			  'amd_jpeg_dec',
@@ -54,7 +55,6 @@ if libdrm_amdgpu.found()
 	else
 		warning('libdrm <= 2.4.109 found, amd_pstate test not applicable')
 	endif
-
 	amdgpu_deps += libdrm_amdgpu
 endif
 
-- 
2.25.1


