[PATCH] amdgpu/tests: make amdgpu_sync_dependency_test work on gfx12
vitaly.prosyak at amd.com
vitaly.prosyak at amd.com
Mon Aug 25 18:10:47 UTC 2025
From: Vitaly Prosyak <vitaly.prosyak at amd.com>
drop CLEAR_STATE and fix malformed preamble
On GC 12.0.0 (RX 9070/ gfx12) the amdgpu_sync_dependency_test reliably
timed out and was canceled (amdgpu_cs_query_fence_status() -> -ECANCELED).
Root cause was a bad PM4 preamble:
1) We emitted PKT3_CLEAR_STATE (0-dword packet) and then incorrectly
wrote an extra dword 0x80000000. That stray word is a TYPE2 packet
and wedges the parser on gfx12.
2) Even when well-formed, CLEAR_STATE is a graphics packet and is not
accepted in the paths we use on gfx12. The ucode/queue is stricter
and the job stalls.
This change removes CLEAR_STATE from the GFX12 preamble and fixes
the malformed emission. CONTEXT_CONTROL is kept, since it is harmless across generations.
The test logic is unchanged: IB#1 launches a tiny kernel that spins and
stores 42 to data_addr; IB#2 depends on IB#1’s fence and then WRITE_DATA
(WR_CONFIRM) writes 99 to the same location. Seeing 99 proves the kernel
completed and the dependency was honored.
With these fixes the test completes on gfx9/gfx10/gfx11 as before and
now passes on gfx12 without GPU reset.
Fixes: spurious -ECANCELED due to hangcheck after malformed/illegal PM4 preamble on gfx12
Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Cc: Christian König <christian.koenig at amd.com>
Cc: Alex Deucher <alexander.deucher at amd.com>
Cc: Jesse Zhang <jesse.zhang at amd.com>
Signed-off-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
---
lib/amdgpu/amd_family.h | 1 +
lib/amdgpu/amd_ip_blocks.c | 74 +++++++++++++++++++++++------
lib/amdgpu/amd_ip_blocks_ex.c | 41 +++++++++++++++-
lib/amdgpu/amd_shaders.c | 88 +++++++++++++++++++++++++++--------
4 files changed, 169 insertions(+), 35 deletions(-)
diff --git a/lib/amdgpu/amd_family.h b/lib/amdgpu/amd_family.h
index 60190c639..4514fff2a 100644
--- a/lib/amdgpu/amd_family.h
+++ b/lib/amdgpu/amd_family.h
@@ -128,6 +128,7 @@ enum chip_class {
GFX10,
GFX10_3,
GFX11,
+ GFX12,
NUM_GFX_VERSIONS
};
diff --git a/lib/amdgpu/amd_ip_blocks.c b/lib/amdgpu/amd_ip_blocks.c
index d8d7efbef..6955ba7dc 100644
--- a/lib/amdgpu/amd_ip_blocks.c
+++ b/lib/amdgpu/amd_ip_blocks.c
@@ -1031,13 +1031,21 @@ amdgpu_device_ip_block_add(struct amdgpu_ip_block_version *ip_block_version)
amdgpu_ips.ip_blocks[amdgpu_ips.num_ip_blocks++] = ip_block_version;
+ return 0;
+}
+
+static int
+amdgpu_device_ip_block_ex_setup(struct amdgpu_ip_block_version *ip_block_version)
+{
+ if (amdgpu_ips.num_ip_blocks >= AMD_IP_MAX)
+ return -1;
+
if (ip_block_version->funcs &&
(!ip_block_version->funcs->gfx_program_compute ||
!ip_block_version->funcs->gfx_dispatch_direct ||
!ip_block_version->funcs->gfx_write_confirm )) {
amd_ip_blocks_ex_init(ip_block_version->funcs);
}
-
return 0;
}
@@ -1221,9 +1229,11 @@ int setup_amdgpu_ip_blocks(uint32_t major, uint32_t minor, struct amdgpu_gpu_inf
{"GFX10", GFX10},
{"GFX10_3", GFX10_3},
{"GFX11", GFX11},
+ {"GFX12", GFX12},
{},
};
struct chip_info *info = &g_chip;
+ int i;
g_pChip = &g_chip;
@@ -1318,7 +1328,9 @@ int setup_amdgpu_ip_blocks(uint32_t major, uint32_t minor, struct amdgpu_gpu_inf
igt_info("amdgpu: %s (family_id, chip_external_rev): (%u, %u)\n",
info->name, amdinfo->family_id, amdinfo->chip_external_rev);
- if (info->family >= CHIP_GFX1100) {
+ if (info->family >= CHIP_GFX1200) {
+ info->chip_class = GFX12;
+ } else if (info->family >= CHIP_GFX1100) {
info->chip_class = GFX11;
} else if (info->family >= CHIP_SIENNA_CICHLID) {
info->chip_class = GFX10_3;
@@ -1348,30 +1360,62 @@ int setup_amdgpu_ip_blocks(uint32_t major, uint32_t minor, struct amdgpu_gpu_inf
case GFX10:/* tested */
case GFX10_3: /* tested */
case GFX11: /* tested */
+ case GFX12: /* tested */
+ /* extra precaution if re-factor again */
+ igt_assert_eq(gfx_v8_x_ip_block.major, 8);
+ igt_assert_eq(compute_v8_x_ip_block.major, 8);
+ igt_assert_eq(sdma_v3_x_ip_block.major, 3);
+
+ /* Add three default IP blocks: GFX, Compute, and SDMA.
+ *
+ * These represent the base hardware engines available on the ASIC.
+ * - gfx_v8_x_ip_block: graphics + compute pipeline control
+ * - compute_v8_x_ip_block: dedicated compute support (user queues, shaders)
+ * - sdma_v3_x_ip_block: asynchronous DMA engine for buffer transfers
+ */
amdgpu_device_ip_block_add(&gfx_v8_x_ip_block);
amdgpu_device_ip_block_add(&compute_v8_x_ip_block);
amdgpu_device_ip_block_add(&sdma_v3_x_ip_block);
- /*
- * The handling of a particular family _id is done into
- * each function and as a result the field family_id would be overwritten
- * during initialization which matches to actual family_id.
- * The initial design assumed that for different GPU families, we may
- * have different implementations, but it is not necessary.
- * TO DO: move family id as a parameter into IP functions and
- * remove it as a field
+
+ /* Assign the actual hardware identifiers to all registered IP blocks.
+ *
+ * These fields are taken from amdinfo (queried from the kernel):
+ * - family_id: ASIC family (gfx9/gfx10/gfx11/gfx12, etc.)
+ * - chip_external_rev: revision exposed to firmware
+ * - chip_rev: internal revision used by HW-specific hooks
*/
- for (int i = 0; i < amdgpu_ips.num_ip_blocks; i++) {
+ for (i = 0; i < amdgpu_ips.num_ip_blocks; i++) {
amdgpu_ips.ip_blocks[i]->funcs->family_id = amdinfo->family_id;
amdgpu_ips.ip_blocks[i]->funcs->chip_external_rev = amdinfo->chip_external_rev;
amdgpu_ips.ip_blocks[i]->funcs->chip_rev = amdinfo->chip_rev;
}
- /* extra precaution if re-factor again */
- igt_assert_eq(gfx_v8_x_ip_block.major, 8);
- igt_assert_eq(compute_v8_x_ip_block.major, 8);
- igt_assert_eq(sdma_v3_x_ip_block.major, 3);
+ /* Configure ASIC-specific function hooks.
+ *
+ * Depending on family_id, each IP block installs the correct
+ * callback implementations for:
+ * - PM4 packet encoding (e.g., gfx10/gfx11/gfx12 differences)
+ * - Register offsets (PGM_LO/PGM_RSRC, etc.)
+ * - Compute and dispatch programming
+ *
+ * This ensures that test code emits commands in the correct
+ * hardware-specific format for the active GPU.
+ */
+ for (i = 0; i < amdgpu_ips.num_ip_blocks; i++)
+ amdgpu_device_ip_block_ex_setup(amdgpu_ips.ip_blocks[i]);
+
+ /* Sanity-check the initialized IP blocks to ensure the family_id matches
+ * the ASIC info from the kernel. If these assertions fail, it usually
+ * means that:
+ * - IP block registration order changed, or
+ * - amdinfo data mismatches the active hooks, or
+ * - future refactoring broke expected initialization paths.
+ */
igt_assert_eq(gfx_v8_x_ip_block.funcs->family_id, amdinfo->family_id);
igt_assert_eq(sdma_v3_x_ip_block.funcs->family_id, amdinfo->family_id);
+
+
+
break;
default:
igt_info("amdgpu: GFX11 or old.\n");
diff --git a/lib/amdgpu/amd_ip_blocks_ex.c b/lib/amdgpu/amd_ip_blocks_ex.c
index 21c4d952c..aab9a2a4a 100644
--- a/lib/amdgpu/amd_ip_blocks_ex.c
+++ b/lib/amdgpu/amd_ip_blocks_ex.c
@@ -139,6 +139,44 @@ static void gfx_program_compute_gfx11(
base->emit(base, tz);
}
+static void gfx_program_compute_gfx12(
+ const struct amdgpu_ip_funcs *f,
+ struct amdgpu_cmd_base *base,
+ uint64_t code, uint64_t udata0,
+ uint32_t rsrc1, uint32_t rsrc2,
+ uint32_t tx, uint32_t ty, uint32_t tz)
+{
+ base->emit(base, PACKET3(PKT3_CONTEXT_CONTROL, 1));
+ base->emit(base, 0x80000000);
+ base->emit(base, 0x80000000);
+
+ base->emit(base, PACKET3(PKT3_SET_SH_REG, 2));
+ base->emit(base, f->get_reg_offset(COMPUTE_PGM_LO));
+ base->emit(base, (uint32_t)(code >> 8));
+ base->emit(base, (uint32_t)(code >> 40));
+
+ base->emit(base, PACKET3(PKT3_SET_SH_REG, 2));
+ base->emit(base, f->get_reg_offset(COMPUTE_PGM_RSRC1));
+ base->emit(base, rsrc1);
+ base->emit(base, rsrc2);
+
+ base->emit(base, PACKET3(PKT3_SET_SH_REG, 2));
+ base->emit(base, f->get_reg_offset(COMPUTE_USER_DATA_0));
+ base->emit(base, (uint32_t)udata0);
+ base->emit(base, (uint32_t)(udata0 >> 32));
+
+ base->emit(base, PACKET3(PKT3_SET_SH_REG, 1));
+ base->emit(base, f->get_reg_offset(COMPUTE_RESOURCE_LIMITS));
+ base->emit(base, 0);
+
+ base->emit(base, PACKET3(PKT3_SET_SH_REG, 3));
+ base->emit(base, f->get_reg_offset(COMPUTE_NUM_THREAD_X));
+ base->emit(base, tx);
+ base->emit(base, ty);
+ base->emit(base, tz);
+}
+
+
static void gfx_dispatch_direct_gfx11(
const struct amdgpu_ip_funcs *f,
struct amdgpu_cmd_base *base,
@@ -172,7 +210,8 @@ void amd_ip_blocks_ex_init(struct amdgpu_ip_funcs *funcs)
funcs->gfx_dispatch_direct = gfx_dispatch_direct_gfx11;
break;
case AMDGPU_FAMILY_GC_12_0_0:
- /*TODO*/
+ funcs->gfx_program_compute =gfx_program_compute_gfx12;
+ funcs->gfx_dispatch_direct = gfx_dispatch_direct_gfx11;
break;
default:
break;
diff --git a/lib/amdgpu/amd_shaders.c b/lib/amdgpu/amd_shaders.c
index e4931961c..adc4fc051 100644
--- a/lib/amdgpu/amd_shaders.c
+++ b/lib/amdgpu/amd_shaders.c
@@ -30,27 +30,77 @@
#define DATA_OFFSET 1024
#define SWAP_32(num) (((num & 0xff000000) >> 24) | \
- ((num & 0x0000ff00) << 8) | \
- ((num & 0x00ff0000) >> 8) | \
- ((num & 0x000000ff) << 24))
-/**
+ ((num & 0x0000ff00) << 8) | \
+ ((num & 0x00ff0000) >> 8) | \
+ ((num & 0x000000ff) << 24))
+/*
+ * C equivalent of the shader_bin below. This tiny compute kernel performs a
+ * busy-wait loop, then writes the constant value 42 into a GPU virtual
+ * address passed via COMPUTE_USER_DATA_0..1. This shader is intentionally
+ * simple and designed to work across all GFX generations.
*
- *s_mov_b32 s2, 0
- * s_cmp_gt_u32 s2, 0x98967f
- * ;;
- * s_cbranch_scc1 4
- * s_add_i32 s2, s2, 1
- * s_cmp_gt_u32 s2, 0x98967f
- * ;;
- * s_cbranch_scc0 65532
- * s_mov_b32 s3, 0xf000
- * ;;
- * s_mov_b32 s2, -1
- * v_mov_b32_e32 v0, 42
- * buffer_store_dword v0, off, s[0:3], 0
- * ;;
- * s_endpgm
+ * Host-side logic equivalent in C:
+ *
+ * static void gpu_shader_standalone(volatile u32 *dst, u64 iters)
+ * {
+ *     // Busy loop comparable to the ISA spin; prevents compiler optimization.
+ *     for (u64 i = 0; i <= iters; i++)
+ *         asm volatile("" ::: "memory");
+ *
+ *     // After the spin completes, write the sentinel value 42.
+ *     *dst = 42;
+ * }
+ *
+ * The binary shader blob generated below executes equivalent logic on the GPU.
*/
+
+/*
+ * Shader ISA: spin_store_flat.s
+ *
+ * This kernel expects a 64-bit flat/global address provided in s[0:1]
+ * (passed via COMPUTE_USER_DATA_0..1) and behaves as follows:
+ *
+ * 1) Initialize a counter register (s2 = 0).
+ * 2) Increment s2 until it exceeds 0x0098967f (~10 million).
+ * 3) When the loop completes, store the literal 42 at *(u32 *)(s1:s0).
+ * 4) Terminate with s_endpgm.
+ *
+ * Notes:
+ * - Uses flat/global addressing — no buffer SRD required.
+ * - Works with 1x1x1 dispatches; single work-item.
+ * - Safe for all GFX generations; no HW-specific instructions.
+ *
+ * Annotated assembly:
+ *
+ * .text
+ * .p2align 8
+ * .globl spin_store_flat
+ * .type spin_store_flat, @function
+ *
+ * spin_store_flat:
+ * s_mov_b32 s2, 0 Initialize loop counter.
+ *
+ * s_cmp_gt_u32 s2, 0x0098967f Early exit if counter exceeds limit.
+ * s_cbranch_scc1 .Lexit
+ *
+ * .Lloop:
+ * s_add_i32 s2, s2, 1 Increment counter.
+ * s_cmp_gt_u32 s2, 0x0098967f Continue looping if below threshold.
+ * s_cbranch_scc0 .Lloop
+ *
+ * v_mov_b32_e32 v0, 42 Load value 42 into v0.
+ * v_mov_b32_e32 v1, s0 Load address low bits into v1.
+ * v_mov_b32_e32 v2, s1 Load address high bits into v2.
+ *
+ * flat_store_dword v[1:2], v0 Store 42 to *(u32 *)(s1:s0).
+ *
+ * .Lexit:
+ * s_endpgm End program.
+ *
+ * .size spin_store_flat, .-spin_store_flat
+ */
+
+
static const
uint32_t shader_bin[] = {
SWAP_32(0x800082be), SWAP_32(0x02ff08bf), SWAP_32(0x7f969800), SWAP_32(0x040085bf),
--
2.34.1
More information about the igt-dev
mailing list