[PATCH] amdgpu/tests: make amdgpu_sync_dependency_test work on gfx12

Tue Aug 26 00:37:09 UTC 2025

[AMD Official Use Only - AMD Internal Distribution Only]

This patch is good for me,
Reviewed-by: "Jesse.zhang at amd.com"

-----Original Message-----
From: vitaly.prosyak at amd.com <vitaly.prosyak at amd.com>
Sent: Tuesday, August 26, 2025 2:11 AM
To: igt-dev at lists.freedesktop.org
Cc: Prosyak, Vitaly <Vitaly.Prosyak at amd.com>; Pelloux-Prayer, Pierre-Eric <Pierre-eric.Pelloux-prayer at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>; Zhang, Jesse(Jie) <Jesse.Zhang at amd.com>
Subject: [PATCH] amdgpu/tests: make amdgpu_sync_dependency_test work on gfx12

From: Vitaly Prosyak <vitaly.prosyak at amd.com>

drop CLEAR_STATE and fix malformed preamble

On GC 12.0.0 (RX 9070/ gfx12) the amdgpu_sync_dependency_test reliably timed out and was canceled (amdgpu_cs_query_fence_status() -> -ECANCELED).
Root cause was a bad PM4 preamble:

  1) We emitted PKT3_CLEAR_STATE (0-dword packet) and then incorrectly
     wrote an extra dword 0x80000000. That stray word is a TYPE2 packet
     and wedges the parser on gfx12.

  2) Even when well-formed, CLEAR_STATE is a graphics packet and is not
     accepted in the paths we use on gfx12. The ucode/queue is stricter
     and the job stalls.

This change removes CLEAR_STATE from the GFX12 and fixes the malformed emission. We keep CONTEXT_CONTROL (harmless across gens).

The test logic is unchanged: IB#1 launches a tiny kernel that spins and stores 42 to data_addr; IB#2 depends on IB#1’s fence and then WRITE_DATA
(WR_CONFIRM) writes 99 to the same location. Seeing 99 proves the kernel completed and the dependency was honored.

With these fixes the test completes on gfx9/gfx10/gfx11 as before and now passes on gfx12 without GPU reset.

Fixes: spurious -ECANCELED due to hangcheck after malformed/illegal PM4 preamble on gfx12

Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Cc: Christian König <christian.koenig at amd.com>
Cc: Alex Deucher <alexander.deucher at amd.com>
Cc: Jesse Zhang <jesse.zhang at amd.com>

Signed-off-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
---
 lib/amdgpu/amd_family.h       |  1 +
 lib/amdgpu/amd_ip_blocks.c    | 74 +++++++++++++++++++++++------
 lib/amdgpu/amd_ip_blocks_ex.c | 41 +++++++++++++++-
 lib/amdgpu/amd_shaders.c      | 88 +++++++++++++++++++++++++++--------
 4 files changed, 169 insertions(+), 35 deletions(-)

diff --git a/lib/amdgpu/amd_family.h b/lib/amdgpu/amd_family.h index 60190c639..4514fff2a 100644
--- a/lib/amdgpu/amd_family.h
+++ b/lib/amdgpu/amd_family.h
@@ -128,6 +128,7 @@ enum chip_class {
        GFX10,
        GFX10_3,
        GFX11,
+       GFX12,
        NUM_GFX_VERSIONS
 };

diff --git a/lib/amdgpu/amd_ip_blocks.c b/lib/amdgpu/amd_ip_blocks.c index d8d7efbef..6955ba7dc 100644
--- a/lib/amdgpu/amd_ip_blocks.c
+++ b/lib/amdgpu/amd_ip_blocks.c
@@ -1031,13 +1031,21 @@ amdgpu_device_ip_block_add(struct amdgpu_ip_block_version *ip_block_version)

        amdgpu_ips.ip_blocks[amdgpu_ips.num_ip_blocks++] = ip_block_version;

+       return 0;
+}
+
+static int
+amdgpu_device_ip_block_ex_setup(struct amdgpu_ip_block_version
+*ip_block_version) {
+       if (amdgpu_ips.num_ip_blocks >= AMD_IP_MAX)
+               return -1;
+
        if (ip_block_version->funcs &&
                (!ip_block_version->funcs->gfx_program_compute ||
                 !ip_block_version->funcs->gfx_dispatch_direct ||
                 !ip_block_version->funcs->gfx_write_confirm )) {
                amd_ip_blocks_ex_init(ip_block_version->funcs);
        }
-
        return 0;
 }

@@ -1221,9 +1229,11 @@ int setup_amdgpu_ip_blocks(uint32_t major, uint32_t minor, struct amdgpu_gpu_inf
                {"GFX10",                               GFX10},
                {"GFX10_3",                             GFX10_3},
                {"GFX11",                               GFX11},
+               {"GFX12",                               GFX12},
                {},
        };
        struct chip_info *info = &g_chip;
+       int i;

        g_pChip = &g_chip;

@@ -1318,7 +1328,9 @@ int setup_amdgpu_ip_blocks(uint32_t major, uint32_t minor, struct amdgpu_gpu_inf
        igt_info("amdgpu: %s (family_id, chip_external_rev): (%u, %u)\n",
                                info->name, amdinfo->family_id, amdinfo->chip_external_rev);

-       if (info->family >= CHIP_GFX1100) {
+       if (info->family >= CHIP_GFX1200) {
+               info->chip_class = GFX12;
+       } else if (info->family >= CHIP_GFX1100) {
                info->chip_class = GFX11;
        } else if (info->family >= CHIP_SIENNA_CICHLID) {
                info->chip_class = GFX10_3;
@@ -1348,30 +1360,62 @@ int setup_amdgpu_ip_blocks(uint32_t major, uint32_t minor, struct amdgpu_gpu_inf
        case GFX10:/* tested */
        case GFX10_3: /* tested */
        case GFX11: /* tested */
+       case GFX12: /* tested */
+               /* extra precaution if re-factor again */
+               igt_assert_eq(gfx_v8_x_ip_block.major, 8);
+               igt_assert_eq(compute_v8_x_ip_block.major, 8);
+               igt_assert_eq(sdma_v3_x_ip_block.major, 3);
+
+               /* Add three default IP blocks: GFX, Compute, and SDMA.
+                *
+                * These represent the base hardware engines available on the ASIC.
+                * - gfx_v8_x_ip_block: graphics + compute pipeline control
+                * - compute_v8_x_ip_block: dedicated compute support (user queues, shaders)
+                * - sdma_v3_x_ip_block: asynchronous DMA engine for buffer transfers
+                */
                amdgpu_device_ip_block_add(&gfx_v8_x_ip_block);
                amdgpu_device_ip_block_add(&compute_v8_x_ip_block);
                amdgpu_device_ip_block_add(&sdma_v3_x_ip_block);
-               /*
-                * The handling of a particular family _id is done into
-                * each function and as a result the field family_id would be overwritten
-                * during initialization which matches to actual family_id.
-                * The initial design assumed that for different GPU families, we may
-                * have different implementations, but it is not necessary.
-                * TO DO: move family id as a parameter into IP functions and
-                * remove it as a field
+
+               /* Assign the actual hardware identifiers to all registered IP blocks.
+                *
+                * These fields are taken from amdinfo (queried from the kernel):
+                * - family_id: ASIC family (gfx9/gfx10/gfx11/gfx12, etc.)
+                * - chip_external_rev: revision exposed to firmware
+                * - chip_rev: internal revision used by HW-specific hooks
                 */
-               for (int i = 0; i <  amdgpu_ips.num_ip_blocks; i++) {
+               for (i = 0; i < amdgpu_ips.num_ip_blocks; i++) {
                        amdgpu_ips.ip_blocks[i]->funcs->family_id = amdinfo->family_id;
                        amdgpu_ips.ip_blocks[i]->funcs->chip_external_rev = amdinfo->chip_external_rev;
                        amdgpu_ips.ip_blocks[i]->funcs->chip_rev = amdinfo->chip_rev;
                }
-               /* extra precaution if re-factor again */
-               igt_assert_eq(gfx_v8_x_ip_block.major, 8);
-               igt_assert_eq(compute_v8_x_ip_block.major, 8);
-               igt_assert_eq(sdma_v3_x_ip_block.major, 3);

+               /* Configure ASIC-specific function hooks.
+                *
+                * Depending on family_id, each IP block installs the correct
+                * callback implementations for:
+                * - PM4 packet encoding (e.g., gfx10/gfx11/gfx12 differences)
+                * - Register offsets (PGM_LO/PGM_RSRC, etc.)
+                * - Compute and dispatch programming
+                *
+                * This ensures that test code emits commands in the correct
+                * hardware-specific format for the active GPU.
+                */
+               for (i = 0; i < amdgpu_ips.num_ip_blocks; i++)
+                       amdgpu_device_ip_block_ex_setup(amdgpu_ips.ip_blocks[i]);
+
+               /* Sanity-check the initialized IP blocks to ensure the family_id matches
+                * the ASIC info from the kernel. If these assertions fail, it usually
+                * means that:
+                * - IP block registration order changed, or
+                * - amdinfo data mismatches the active hooks, or
+                * - future refactoring broke expected initialization paths.
+                */
                igt_assert_eq(gfx_v8_x_ip_block.funcs->family_id, amdinfo->family_id);
                igt_assert_eq(sdma_v3_x_ip_block.funcs->family_id, amdinfo->family_id);
+
+
+
                break;
        default:
                igt_info("amdgpu: GFX11 or old.\n");
diff --git a/lib/amdgpu/amd_ip_blocks_ex.c b/lib/amdgpu/amd_ip_blocks_ex.c index 21c4d952c..aab9a2a4a 100644
--- a/lib/amdgpu/amd_ip_blocks_ex.c
+++ b/lib/amdgpu/amd_ip_blocks_ex.c
@@ -139,6 +139,44 @@ static void gfx_program_compute_gfx11(
        base->emit(base, tz);
 }

+static void gfx_program_compute_gfx12(
+       const struct amdgpu_ip_funcs *f,
+       struct amdgpu_cmd_base *base,
+       uint64_t code, uint64_t udata0,
+       uint32_t rsrc1, uint32_t rsrc2,
+       uint32_t tx, uint32_t ty, uint32_t tz) {
+       base->emit(base, PACKET3(PKT3_CONTEXT_CONTROL, 1));
+       base->emit(base, 0x80000000);
+       base->emit(base, 0x80000000);
+
+       base->emit(base, PACKET3(PKT3_SET_SH_REG, 2));
+       base->emit(base, f->get_reg_offset(COMPUTE_PGM_LO));
+       base->emit(base, (uint32_t)(code >> 8));
+       base->emit(base, (uint32_t)(code >> 40));
+
+       base->emit(base, PACKET3(PKT3_SET_SH_REG, 2));
+       base->emit(base, f->get_reg_offset(COMPUTE_PGM_RSRC1));
+       base->emit(base, rsrc1);
+       base->emit(base, rsrc2);
+
+       base->emit(base, PACKET3(PKT3_SET_SH_REG, 2));
+       base->emit(base, f->get_reg_offset(COMPUTE_USER_DATA_0));
+       base->emit(base, (uint32_t)udata0);
+       base->emit(base, (uint32_t)(udata0 >> 32));
+
+       base->emit(base, PACKET3(PKT3_SET_SH_REG, 1));
+       base->emit(base, f->get_reg_offset(COMPUTE_RESOURCE_LIMITS));
+       base->emit(base, 0);
+
+       base->emit(base, PACKET3(PKT3_SET_SH_REG, 3));
+       base->emit(base, f->get_reg_offset(COMPUTE_NUM_THREAD_X));
+       base->emit(base, tx);
+       base->emit(base, ty);
+       base->emit(base, tz);
+}
+
+
 static void gfx_dispatch_direct_gfx11(
        const struct amdgpu_ip_funcs *f,
        struct amdgpu_cmd_base *base,
@@ -172,7 +210,8 @@ void amd_ip_blocks_ex_init(struct amdgpu_ip_funcs *funcs)
                funcs->gfx_dispatch_direct = gfx_dispatch_direct_gfx11;
                break;
        case AMDGPU_FAMILY_GC_12_0_0:
-               /*TODO*/
+               funcs->gfx_program_compute =gfx_program_compute_gfx12;
+               funcs->gfx_dispatch_direct = gfx_dispatch_direct_gfx11;
                break;
        default:
                break;
diff --git a/lib/amdgpu/amd_shaders.c b/lib/amdgpu/amd_shaders.c index e4931961c..adc4fc051 100644
--- a/lib/amdgpu/amd_shaders.c
+++ b/lib/amdgpu/amd_shaders.c
@@ -30,27 +30,77 @@
 #define DATA_OFFSET 1024

 #define SWAP_32(num) (((num & 0xff000000) >> 24) | \
-                     ((num & 0x0000ff00) << 8) | \
-                     ((num & 0x00ff0000) >> 8) | \
-                     ((num & 0x000000ff) << 24))
-/**
+                               ((num & 0x0000ff00) << 8) | \
+                               ((num & 0x00ff0000) >> 8) | \
+                               ((num & 0x000000ff) << 24))
+/*
+ * C equivalent of the shader_bin below. This tiny compute kernel
+performs a
+ * busy-wait loop, then writes the constant value 42 into a GPU virtual
+ * address passed via COMPUTE_USER_DATA_0..1. This shader is
+intentionally
+ * simple and designed to work across all GFX generations.
  *
- *s_mov_b32 s2, 0
- * s_cmp_gt_u32 s2, 0x98967f
- * ;;
- * s_cbranch_scc1 4
- * s_add_i32 s2, s2, 1
- * s_cmp_gt_u32 s2, 0x98967f
- * ;;
- * s_cbranch_scc0 65532
- * s_mov_b32 s3, 0xf000
- * ;;
- * s_mov_b32 s2, -1
- * v_mov_b32_e32 v0, 42
- * buffer_store_dword v0, off, s[0:3], 0
- * ;;
- * s_endpgm
+ * Host-side logic equivalent in C:
+ *
+ *     static void gpu_shader_standalone(volatile u32 *dst, u64 iters)
+ *     {
+ *             Busy loop comparable to the ISA spin, prevents compiler optimization.
+ *             for (u64 i = 0; i <= iters; i++)
+ *                     asm volatile("" ::: "memory");
+ *
+ *             After spin completes, write sentinel value 42.
+ *             *dst = 42;
+ *     }
+ *
+ * The binary shader blob generated below executes equivalent logic on the GPU.
  */
+
+/*
+ * Shader ISA: spin_store_flat.s
+ *
+ * This kernel expects a 64-bit flat/global address provided in s[0:1]
+ * (passed via COMPUTE_USER_DATA_0..1) and behaves as follows:
+ *
+ *     1) Initialize a counter register (s2 = 0).
+ *     2) Increment s2 until it exceeds 0x0098967f (~10 million).
+ *     3) When the loop completes, store the literal 42 at *(u32 *)(s1:s0).
+ *     4) Terminate with s_endpgm.
+ *
+ * Notes:
+ *     - Uses flat/global addressing — no buffer SRD required.
+ *     - Works with 1x1x1 dispatches; single work-item.
+ *     - Safe for all GFX generations; no HW-specific instructions.
+ *
+ * Annotated assembly:
+ *
+ *     .text
+ *     .p2align 8
+ *     .globl spin_store_flat
+ *     .type   spin_store_flat, at function
+ *
+ * spin_store_flat:
+ *     s_mov_b32 s2, 0                         Initialize loop counter.
+ *
+ *     s_cmp_gt_u32 s2, 0x0098967f             Early exit if counter exceeds limit.
+ *     s_cbranch_scc1 .Lexit
+ *
+ * .Lloop:
+ *     s_add_i32 s2, s2, 1                     Increment counter.
+ *     s_cmp_gt_u32 s2, 0x0098967f             Continue looping if below threshold.
+ *     s_cbranch_scc0 .Lloop
+ *
+ *     v_mov_b32_e32 v0, 42                    Load value 42 into v0.
+ *     v_mov_b32_e32 v1, s0                    Load address low bits into v1.
+ *     v_mov_b32_e32 v2, s1                    Load address high bits into v2.
+ *
+ *     flat_store_dword v[1:2], v0             Store 42 to *(u32 *)(s1:s0).
+ *
+ * .Lexit:
+ *     s_endpgm                                End program.
+ *
+ *     .size spin_store_flat, .-spin_store_flat
+ */
+
+
 static  const
 uint32_t shader_bin[] = {
        SWAP_32(0x800082be), SWAP_32(0x02ff08bf), SWAP_32(0x7f969800), SWAP_32(0x040085bf),
--
2.34.1