[PATCH 1/3] lib/amdgpu: add support for gang cs
vitaly.prosyak at amd.com
Thu Jan 25 03:44:11 UTC 2024
From: Vitaly Prosyak <vitaly.prosyak at amd.com>
When gang command submission is used, we need to add fields
for the second buffer and the second pm4 packet.
Add an ASIC-dependent implementation of WAIT_REG_MEM, which is used to
poll a location in register or memory space until a condition against a
reference value is satisfied.
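
As an illustration, here is a minimal, self-contained sketch of the dword
sequence the new gfx_ring_wait_reg_mem() helper appends. The PACKET3 and
WAIT_REG_MEM macros below are local stand-ins written for this example
rather than the definitions from lib/amdgpu, and the buffer address and
reference value are made-up numbers:

#include <stdint.h>
#include <stdio.h>

#define PACKET3(op, n)            ((3u << 30) | (((op) & 0xff) << 8) | \
                                   (((n) & 0x3fff) << 16))
#define PACKET3_WAIT_REG_MEM      0x3c
#define WAIT_REG_MEM_FUNCTION(x)  ((x) << 0)  /* 3 = equal to reference    */
#define WAIT_REG_MEM_MEM_SPACE(x) ((x) << 4)  /* 1 = poll memory, 0 = reg  */
#define WAIT_REG_MEM_ENGINE(x)    ((x) << 8)  /* 0 = micro engine (me)     */

int main(void)
{
	uint32_t pm4[16];
	uint32_t i = 0;
	uint64_t bo_mc = 0x100000000ull;  /* example GPU VA of the buffer */
	uint32_t reference = 0xdeadbeaf;  /* value the CP waits for       */

	pm4[i++] = PACKET3(PACKET3_WAIT_REG_MEM, 5);
	pm4[i++] = WAIT_REG_MEM_MEM_SPACE(1) |  /* memory */
		   WAIT_REG_MEM_FUNCTION(3) |   /* == */
		   WAIT_REG_MEM_ENGINE(0);      /* me */
	pm4[i++] = (uint32_t)bo_mc;             /* poll address, low 32 bits  */
	pm4[i++] = (uint32_t)(bo_mc >> 32);     /* poll address, high 32 bits */
	pm4[i++] = reference;                   /* reference value            */
	pm4[i++] = 0xffffffff;                  /* AND mask before compare    */
	pm4[i++] = 0x00000004;                  /* poll interval              */

	for (uint32_t j = 0; j < i; j++)
		printf("pm4[%u] = 0x%08x\n", j, pm4[j]);
	return 0;
}

In the test library the same seven dwords are written into
ring_context->pm4 starting at *pm4_dw, and *pm4_dw is advanced past them.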
Cc: Jesse Zhang <jesse.zhang at amd.com>
Cc: Alex Deucher <alexander.deucher at amd.com>
Cc: Christian Koenig <christian.koenig at amd.com>
Signed-off-by: Yogesh Mohan Marimuthu <yogesh.mohanmarimuthu at amd.com>
Signed-off-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
Acked-by: Christian Koenig <christian.koenig at amd.com>
---
lib/amdgpu/amd_ip_blocks.c | 35 +++++++++++++++++++++++++++++++++++
lib/amdgpu/amd_ip_blocks.h | 20 ++++++++++++++++----
2 files changed, 51 insertions(+), 4 deletions(-)
diff --git a/lib/amdgpu/amd_ip_blocks.c b/lib/amdgpu/amd_ip_blocks.c
index a7ccfa38b..79ce7b5a8 100644
--- a/lib/amdgpu/amd_ip_blocks.c
+++ b/lib/amdgpu/amd_ip_blocks.c
@@ -288,6 +288,39 @@ gfx_ring_copy_linear(const struct amdgpu_ip_funcs *func,
return 0;
}
+static int
+gfx_ring_wait_reg_mem(const struct amdgpu_ip_funcs *func,
+ const struct amdgpu_ring_context *ring_context,
+ uint32_t *pm4_dw)
+{
+ uint32_t i;
+
+ i = *pm4_dw;
+ ring_context->pm4[i++] = PACKET3(PACKET3_WAIT_REG_MEM, 5);
+ ring_context->pm4[i++] = (WAIT_REG_MEM_MEM_SPACE(1) | /* memory */
+ WAIT_REG_MEM_FUNCTION(3) | /* == */
+ WAIT_REG_MEM_ENGINE(0)); /* me */
+ ring_context->pm4[i++] = lower_32_bits(ring_context->bo_mc);
+ ring_context->pm4[i++] = upper_32_bits(ring_context->bo_mc);
+ ring_context->pm4[i++] = func->deadbeaf; /* reference value */
+ ring_context->pm4[i++] = 0xffffffff; /* and mask */
+ ring_context->pm4[i++] = 0x00000004; /* poll interval */
+ *pm4_dw = i;
+
+ return 0;
+}
+
+static int
+sdma_ring_wait_reg_mem(const struct amdgpu_ip_funcs *func,
+ const struct amdgpu_ring_context *ring_context,
+ uint32_t *pm4_dw)
+{
+ int r;
+
+ r = gfx_ring_wait_reg_mem(func, ring_context, pm4_dw);
+ return r;
+}
+
/* we may combine these two functions later */
static int
x_compare(const struct amdgpu_ip_funcs *func,
@@ -336,6 +369,7 @@ static struct amdgpu_ip_funcs gfx_v8_x_ip_funcs = {
.compare = x_compare,
.compare_pattern = x_compare_pattern,
.get_reg_offset = gfx_v8_0_get_reg_offset,
+ .wait_reg_mem = gfx_ring_wait_reg_mem,
};
static struct amdgpu_ip_funcs sdma_v3_x_ip_funcs = {
@@ -351,6 +385,7 @@ static struct amdgpu_ip_funcs sdma_v3_x_ip_funcs = {
.compare = x_compare,
.compare_pattern = x_compare_pattern,
.get_reg_offset = gfx_v8_0_get_reg_offset,
+ .wait_reg_mem = sdma_ring_wait_reg_mem,
};
struct amdgpu_ip_block_version gfx_v8_x_ip_block = {
diff --git a/lib/amdgpu/amd_ip_blocks.h b/lib/amdgpu/amd_ip_blocks.h
index aef433e7f..4cad30d1e 100644
--- a/lib/amdgpu/amd_ip_blocks.h
+++ b/lib/amdgpu/amd_ip_blocks.h
@@ -31,22 +31,31 @@ struct amdgpu_ring_context {
int res_cnt; /* num of bo in amdgpu_bo_handle resources[2] */
uint32_t write_length; /* length of data */
+ uint32_t write_length2; /* length of data for second packet */
uint32_t *pm4; /* data of the packet */
uint32_t pm4_size; /* max allocated packet size */
bool secure; /* secure or not */
- uint64_t bo_mc; /* result from amdgpu_bo_alloc_and_map */
- uint64_t bo_mc2; /* result from amdgpu_bo_alloc_and_map */
+ uint64_t bo_mc; /* GPU address of first buffer */
+ uint64_t bo_mc2; /* GPU address of pm4 packet */
+ uint64_t bo_mc3; /* GPU address of second buffer */
+ uint64_t bo_mc4; /* GPU address of second pm4 packet */
uint32_t pm4_dw; /* actual size of pm4 */
+ uint32_t pm4_dw2; /* actual size of second pm4 */
- volatile uint32_t *bo_cpu;
- volatile uint32_t *bo2_cpu;
+ volatile uint32_t *bo_cpu; /* cpu address of mapped GPU buf */
+ volatile uint32_t *bo2_cpu; /* cpu address of mapped pm4 */
+ volatile uint32_t *bo3_cpu; /* cpu address of mapped GPU second buf */
+ volatile uint32_t *bo4_cpu; /* cpu address of mapped second pm4 */
uint32_t bo_cpu_origin;
amdgpu_bo_handle bo;
amdgpu_bo_handle bo2;
+ amdgpu_bo_handle bo3;
+ amdgpu_bo_handle bo4;
+
amdgpu_bo_handle boa_vram[2];
amdgpu_bo_handle boa_gtt[2];
@@ -56,6 +65,8 @@ struct amdgpu_ring_context {
amdgpu_bo_handle resources[4]; /* amdgpu_bo_alloc_and_map */
amdgpu_va_handle va_handle; /* amdgpu_bo_alloc_and_map */
amdgpu_va_handle va_handle2; /* amdgpu_bo_alloc_and_map */
+ amdgpu_va_handle va_handle3; /* amdgpu_bo_alloc_and_map */
+ amdgpu_va_handle va_handle4; /* amdgpu_bo_alloc_and_map */
struct amdgpu_cs_ib_info ib_info; /* amdgpu_bo_list_create */
struct amdgpu_cs_request ibs_request; /* amdgpu_cs_query_fence_status */
@@ -76,6 +87,7 @@ struct amdgpu_ip_funcs {
int (*compare)(const struct amdgpu_ip_funcs *func, const struct amdgpu_ring_context *context, int div);
int (*compare_pattern)(const struct amdgpu_ip_funcs *func, const struct amdgpu_ring_context *context, int div);
int (*get_reg_offset)(enum general_reg reg);
+ int (*wait_reg_mem)(const struct amdgpu_ip_funcs *func, const struct amdgpu_ring_context *context, uint32_t *pm4_dw);
};
--
2.25.1