[PATCH] amdgpu/tests: make amdgpu_sync_dependency_test work on gfx12

vitaly.prosyak at amd.com vitaly.prosyak at amd.com
Mon Aug 25 18:10:47 UTC 2025


From: Vitaly Prosyak <vitaly.prosyak at amd.com>

drop CLEAR_STATE and fix malformed preamble

On GC 12.0.0 (RX 9070 / gfx12) the amdgpu_sync_dependency_test reliably
timed out and was canceled (amdgpu_cs_query_fence_status() -> -ECANCELED).
Root cause was a bad PM4 preamble:

  1) We emitted PKT3_CLEAR_STATE (0-dword packet) and then incorrectly
     wrote an extra dword 0x80000000. That stray word is a TYPE2 packet
     and wedges the parser on gfx12.

  2) Even when well-formed, CLEAR_STATE is a graphics packet and is not
     accepted in the paths we use on gfx12. The ucode/queue is stricter
     and the job stalls.

This change removes CLEAR_STATE from the GFX12 path and fixes
the malformed emission. We keep CONTEXT_CONTROL (harmless across gens).

The test logic is unchanged: IB#1 launches a tiny kernel that spins and
stores 42 to data_addr; IB#2 depends on IB#1’s fence and then WRITE_DATA
(WR_CONFIRM) writes 99 to the same location. Seeing 99 proves the kernel
completed and the dependency was honored.

With these fixes the test completes on gfx9/gfx10/gfx11 as before and
now passes on gfx12 without GPU reset.

Fixes: spurious -ECANCELED due to hangcheck after malformed/illegal PM4 preamble on gfx12

Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Cc: Christian König <christian.koenig at amd.com>
Cc: Alex Deucher <alexander.deucher at amd.com>
Cc: Jesse Zhang <jesse.zhang at amd.com>

Signed-off-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
---
 lib/amdgpu/amd_family.h       |  1 +
 lib/amdgpu/amd_ip_blocks.c    | 74 +++++++++++++++++++++++------
 lib/amdgpu/amd_ip_blocks_ex.c | 41 +++++++++++++++-
 lib/amdgpu/amd_shaders.c      | 88 +++++++++++++++++++++++++++--------
 4 files changed, 169 insertions(+), 35 deletions(-)

diff --git a/lib/amdgpu/amd_family.h b/lib/amdgpu/amd_family.h
index 60190c639..4514fff2a 100644
--- a/lib/amdgpu/amd_family.h
+++ b/lib/amdgpu/amd_family.h
@@ -128,6 +128,7 @@ enum chip_class {
 	GFX10,
 	GFX10_3,
 	GFX11,
+	GFX12,
 	NUM_GFX_VERSIONS
 };
 
diff --git a/lib/amdgpu/amd_ip_blocks.c b/lib/amdgpu/amd_ip_blocks.c
index d8d7efbef..6955ba7dc 100644
--- a/lib/amdgpu/amd_ip_blocks.c
+++ b/lib/amdgpu/amd_ip_blocks.c
@@ -1031,13 +1031,21 @@ amdgpu_device_ip_block_add(struct amdgpu_ip_block_version *ip_block_version)
 
 	amdgpu_ips.ip_blocks[amdgpu_ips.num_ip_blocks++] = ip_block_version;
 
+	return 0;
+}
+
+static int
+amdgpu_device_ip_block_ex_setup(struct amdgpu_ip_block_version *ip_block_version)
+{
+	if (amdgpu_ips.num_ip_blocks >= AMD_IP_MAX)
+		return -1;
+
 	if (ip_block_version->funcs &&
 		(!ip_block_version->funcs->gfx_program_compute ||
 		 !ip_block_version->funcs->gfx_dispatch_direct ||
 		 !ip_block_version->funcs->gfx_write_confirm )) {
 		amd_ip_blocks_ex_init(ip_block_version->funcs);
 	}
-
 	return 0;
 }
 
@@ -1221,9 +1229,11 @@ int setup_amdgpu_ip_blocks(uint32_t major, uint32_t minor, struct amdgpu_gpu_inf
 		{"GFX10",				GFX10},
 		{"GFX10_3",				GFX10_3},
 		{"GFX11",				GFX11},
+		{"GFX12",				GFX12},
 		{},
 	};
 	struct chip_info *info = &g_chip;
+	int i;
 
 	g_pChip = &g_chip;
 
@@ -1318,7 +1328,9 @@ int setup_amdgpu_ip_blocks(uint32_t major, uint32_t minor, struct amdgpu_gpu_inf
 	igt_info("amdgpu: %s (family_id, chip_external_rev): (%u, %u)\n",
 				info->name, amdinfo->family_id, amdinfo->chip_external_rev);
 
-	if (info->family >= CHIP_GFX1100) {
+	if (info->family >= CHIP_GFX1200) {
+		info->chip_class = GFX12;
+	} else if (info->family >= CHIP_GFX1100) {
 		info->chip_class = GFX11;
 	} else if (info->family >= CHIP_SIENNA_CICHLID) {
 		info->chip_class = GFX10_3;
@@ -1348,30 +1360,62 @@ int setup_amdgpu_ip_blocks(uint32_t major, uint32_t minor, struct amdgpu_gpu_inf
 	case GFX10:/* tested */
 	case GFX10_3: /* tested */
 	case GFX11: /* tested */
+	case GFX12: /* tested */
+		/* extra precaution if re-factor again */
+		igt_assert_eq(gfx_v8_x_ip_block.major, 8);
+		igt_assert_eq(compute_v8_x_ip_block.major, 8);
+		igt_assert_eq(sdma_v3_x_ip_block.major, 3);
+
+		/* Add three default IP blocks: GFX, Compute, and SDMA.
+		 *
+		 * These represent the base hardware engines available on the ASIC.
+		 * - gfx_v8_x_ip_block: graphics + compute pipeline control
+		 * - compute_v8_x_ip_block: dedicated compute support (user queues, shaders)
+		 * - sdma_v3_x_ip_block: asynchronous DMA engine for buffer transfers
+		 */
 		amdgpu_device_ip_block_add(&gfx_v8_x_ip_block);
 		amdgpu_device_ip_block_add(&compute_v8_x_ip_block);
 		amdgpu_device_ip_block_add(&sdma_v3_x_ip_block);
-		/*
-		 * The handling of a particular family _id is done into
-		 * each function and as a result the field family_id would be overwritten
-		 * during initialization which matches to actual family_id.
-		 * The initial design assumed that for different GPU families, we may
-		 * have different implementations, but it is not necessary.
-		 * TO DO: move family id as a parameter into IP functions and
-		 * remove it as a field
+
+		/* Assign the actual hardware identifiers to all registered IP blocks.
+		 *
+		 * These fields are taken from amdinfo (queried from the kernel):
+		 * - family_id: ASIC family (gfx9/gfx10/gfx11/gfx12, etc.)
+		 * - chip_external_rev: revision exposed to firmware
+		 * - chip_rev: internal revision used by HW-specific hooks
 		 */
-		for (int i = 0; i <  amdgpu_ips.num_ip_blocks; i++) {
+		for (i = 0; i < amdgpu_ips.num_ip_blocks; i++) {
 			amdgpu_ips.ip_blocks[i]->funcs->family_id = amdinfo->family_id;
 			amdgpu_ips.ip_blocks[i]->funcs->chip_external_rev = amdinfo->chip_external_rev;
 			amdgpu_ips.ip_blocks[i]->funcs->chip_rev = amdinfo->chip_rev;
 		}
-		/* extra precaution if re-factor again */
-		igt_assert_eq(gfx_v8_x_ip_block.major, 8);
-		igt_assert_eq(compute_v8_x_ip_block.major, 8);
-		igt_assert_eq(sdma_v3_x_ip_block.major, 3);
 
+		/* Configure ASIC-specific function hooks.
+		 *
+		 * Depending on family_id, each IP block installs the correct
+		 * callback implementations for:
+		 * - PM4 packet encoding (e.g., gfx10/gfx11/gfx12 differences)
+		 * - Register offsets (PGM_LO/PGM_RSRC, etc.)
+		 * - Compute and dispatch programming
+		 *
+		 * This ensures that test code emits commands in the correct
+		 * hardware-specific format for the active GPU.
+		 */
+		for (i = 0; i < amdgpu_ips.num_ip_blocks; i++)
+			amdgpu_device_ip_block_ex_setup(amdgpu_ips.ip_blocks[i]);
+
+		/* Sanity-check the initialized IP blocks to ensure the family_id matches
+		 * the ASIC info from the kernel. If these assertions fail, it usually
+		 * means that:
+		 * - IP block registration order changed, or
+		 * - amdinfo data mismatches the active hooks, or
+		 * - future refactoring broke expected initialization paths.
+		 */
 		igt_assert_eq(gfx_v8_x_ip_block.funcs->family_id, amdinfo->family_id);
 		igt_assert_eq(sdma_v3_x_ip_block.funcs->family_id, amdinfo->family_id);
+
+
+
 		break;
 	default:
 		igt_info("amdgpu: GFX11 or old.\n");
diff --git a/lib/amdgpu/amd_ip_blocks_ex.c b/lib/amdgpu/amd_ip_blocks_ex.c
index 21c4d952c..aab9a2a4a 100644
--- a/lib/amdgpu/amd_ip_blocks_ex.c
+++ b/lib/amdgpu/amd_ip_blocks_ex.c
@@ -139,6 +139,44 @@ static void gfx_program_compute_gfx11(
 	base->emit(base, tz);
 }
 
+static void gfx_program_compute_gfx12(
+	const struct amdgpu_ip_funcs *f,
+	struct amdgpu_cmd_base *base,
+	uint64_t code, uint64_t udata0,
+	uint32_t rsrc1, uint32_t rsrc2,
+	uint32_t tx, uint32_t ty, uint32_t tz)
+{
+	base->emit(base, PACKET3(PKT3_CONTEXT_CONTROL, 1));
+	base->emit(base, 0x80000000);
+	base->emit(base, 0x80000000);
+
+	base->emit(base, PACKET3(PKT3_SET_SH_REG, 2));
+	base->emit(base, f->get_reg_offset(COMPUTE_PGM_LO));
+	base->emit(base, (uint32_t)(code >> 8));
+	base->emit(base, (uint32_t)(code >> 40));
+
+	base->emit(base, PACKET3(PKT3_SET_SH_REG, 2));
+	base->emit(base, f->get_reg_offset(COMPUTE_PGM_RSRC1));
+	base->emit(base, rsrc1);
+	base->emit(base, rsrc2);
+
+	base->emit(base, PACKET3(PKT3_SET_SH_REG, 2));
+	base->emit(base, f->get_reg_offset(COMPUTE_USER_DATA_0));
+	base->emit(base, (uint32_t)udata0);
+	base->emit(base, (uint32_t)(udata0 >> 32));
+
+	base->emit(base, PACKET3(PKT3_SET_SH_REG, 1));
+	base->emit(base, f->get_reg_offset(COMPUTE_RESOURCE_LIMITS));
+	base->emit(base, 0);
+
+	base->emit(base, PACKET3(PKT3_SET_SH_REG, 3));
+	base->emit(base, f->get_reg_offset(COMPUTE_NUM_THREAD_X));
+	base->emit(base, tx);
+	base->emit(base, ty);
+	base->emit(base, tz);
+}
+
+
 static void gfx_dispatch_direct_gfx11(
 	const struct amdgpu_ip_funcs *f,
 	struct amdgpu_cmd_base *base,
@@ -172,7 +210,8 @@ void amd_ip_blocks_ex_init(struct amdgpu_ip_funcs *funcs)
 		funcs->gfx_dispatch_direct = gfx_dispatch_direct_gfx11;
 		break;
 	case AMDGPU_FAMILY_GC_12_0_0:
-		/*TODO*/
+		funcs->gfx_program_compute = gfx_program_compute_gfx12;
+		funcs->gfx_dispatch_direct = gfx_dispatch_direct_gfx11;
 		break;
 	default:
 		break;
diff --git a/lib/amdgpu/amd_shaders.c b/lib/amdgpu/amd_shaders.c
index e4931961c..adc4fc051 100644
--- a/lib/amdgpu/amd_shaders.c
+++ b/lib/amdgpu/amd_shaders.c
@@ -30,27 +30,77 @@
 #define DATA_OFFSET 1024
 
 #define SWAP_32(num) (((num & 0xff000000) >> 24) | \
-		      ((num & 0x0000ff00) << 8) | \
-		      ((num & 0x00ff0000) >> 8) | \
-		      ((num & 0x000000ff) << 24))
-/**
+				((num & 0x0000ff00) << 8) | \
+				((num & 0x00ff0000) >> 8) | \
+				((num & 0x000000ff) << 24))
+/*
+ * C equivalent of the shader_bin below. This tiny compute kernel performs a
+ * busy-wait loop, then writes the constant value 42 into a GPU virtual
+ * address passed via COMPUTE_USER_DATA_0..1. This shader is intentionally
+ * simple and designed to work across all GFX generations.
  *
- *s_mov_b32 s2, 0
- * s_cmp_gt_u32 s2, 0x98967f
- * ;;
- * s_cbranch_scc1 4
- * s_add_i32 s2, s2, 1
- * s_cmp_gt_u32 s2, 0x98967f
- * ;;
- * s_cbranch_scc0 65532
- * s_mov_b32 s3, 0xf000
- * ;;
- * s_mov_b32 s2, -1
- * v_mov_b32_e32 v0, 42
- * buffer_store_dword v0, off, s[0:3], 0
- * ;;
- * s_endpgm
+ * Host-side logic equivalent in C:
+ *
+ *	static void gpu_shader_standalone(volatile u32 *dst, u64 iters)
+ *	{
+ *		Busy loop comparable to the ISA spin, prevents compiler optimization.
+ *		for (u64 i = 0; i <= iters; i++)
+ *			asm volatile("" ::: "memory");
+ *
+ *		After spin completes, write sentinel value 42.
+ *		*dst = 42;
+ *	}
+ *
+ * The binary shader blob generated below executes equivalent logic on the GPU.
  */
+
+/*
+ * Shader ISA: spin_store_flat.s
+ *
+ * This kernel expects a 64-bit flat/global address provided in s[0:1]
+ * (passed via COMPUTE_USER_DATA_0..1) and behaves as follows:
+ *
+ *	1) Initialize a counter register (s2 = 0).
+ *	2) Increment s2 until it exceeds 0x0098967f (~10 million).
+ *	3) When the loop completes, store the literal 42 at *(u32 *)(s1:s0).
+ *	4) Terminate with s_endpgm.
+ *
+ * Notes:
+ *	- Uses flat/global addressing — no buffer SRD required.
+ *	- Works with 1x1x1 dispatches; single work-item.
+ *	- Safe for all GFX generations; no HW-specific instructions.
+ *
+ * Annotated assembly:
+ *
+ *	.text
+ *	.p2align 8
+ *	.globl spin_store_flat
+ *	.type	spin_store_flat, at function
+ *
+ * spin_store_flat:
+ *	s_mov_b32 s2, 0				Initialize loop counter.
+ *
+ *	s_cmp_gt_u32 s2, 0x0098967f		Early exit if counter exceeds limit.
+ *	s_cbranch_scc1 .Lexit
+ *
+ * .Lloop:
+ *	s_add_i32 s2, s2, 1			Increment counter.
+ *	s_cmp_gt_u32 s2, 0x0098967f		Continue looping if below threshold.
+ *	s_cbranch_scc0 .Lloop
+ *
+ *	v_mov_b32_e32 v0, 42			Load value 42 into v0.
+ *	v_mov_b32_e32 v1, s0			Load address low bits into v1.
+ *	v_mov_b32_e32 v2, s1			Load address high bits into v2.
+ *
+ *	flat_store_dword v[1:2], v0		Store 42 to *(u32 *)(s1:s0).
+ *
+ * .Lexit:
+ *	s_endpgm				End program.
+ *
+ *	.size spin_store_flat, .-spin_store_flat
+ */
+
+
 static  const
 uint32_t shader_bin[] = {
 	SWAP_32(0x800082be), SWAP_32(0x02ff08bf), SWAP_32(0x7f969800), SWAP_32(0x040085bf),
-- 
2.34.1



More information about the igt-dev mailing list