[PATCH i-g-t v2 2/2] tests/amdgpu: add fwm preemption test case for userq

Mon May 19 04:53:39 UTC 2025

[Public]

The FWM (fence wait multi) preemption is tested using the algorithm below:
1) create 2 queues
2) submit a dma write job gated by an unsatisfied fence on both queues
3) if the dma write lands before the fence is satisfied, the test has failed

v2: reduce timeout to 2ms

Cc: Prosyak, Vitaly <Vitaly.Prosyak at amd.com>
Cc: Jesse.Zhang <Jesse.zhang at amd.com>
Cc: Sunil Khatri <sunil.khatri at amd.com>
Signed-off-by: Yogesh Mohan Marimuthu <yogesh.mohanmarimuthu at amd.com>
Reviewed-by: Sunil Khatri <sunil.khatri at amd.com>
---
 lib/amdgpu/amd_PM4.h     |  17 ++++
 tests/amdgpu/amd_basic.c | 178 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+)

diff --git a/lib/amdgpu/amd_PM4.h b/lib/amdgpu/amd_PM4.h index 8f59b4223..d6a8446f4 100644
--- a/lib/amdgpu/amd_PM4.h
+++ b/lib/amdgpu/amd_PM4.h
@@ -195,6 +195,23 @@
 #define PACKET3_INDIRECT_BUFFER                         0x3F
 #define PACKET3_PROTECTED_FENCE_SIGNAL                  0xd0

+#define PACKET3_FENCE_WAIT_MULTI                        0xd1
+#define                FENCE_WAIT_MULTI_PREEMPTABLE(x)         ((x) << 1)
+               /* 0 - no
+                * 1 - yes
+                */
+#define                FENCE_WAIT_MULTI_CACHE_POLICY(x)        ((x) << 2)
+               /* 0 - LRU
+                * 1 - Stream
+                * 2 - noa
+                * 3 - Bypass
+                */
+#define                FENCE_WAIT_MULTI_ENGINE(x)              ((x) << 8)
+               /* 0 - me
+                * 1 - pfp
+                */
+#define                FENCE_WAIT_MULTI_POLL_INTERVAL(x)       ((x) << 16)
+
 #define        PACKET3_WRITE_DATA                              0x37
 #define                WRITE_DATA_DST_SEL(x)                   ((x) << 8)
                /* 0 - register
diff --git a/tests/amdgpu/amd_basic.c b/tests/amdgpu/amd_basic.c index 914d27909..075057960 100644
--- a/tests/amdgpu/amd_basic.c
+++ b/tests/amdgpu/amd_basic.c
@@ -689,6 +689,176 @@ amdgpu_sync_dependency_test(amdgpu_device_handle device_handle, bool user_queue)
        free(ring_context);
 }

+static void
+amdgpu_fwm_preempt_test(amdgpu_device_handle device_handle) {
+       struct amdgpu_ring_context *ring_context_1;
+       struct amdgpu_ring_context *ring_context_2;
+       struct amdgpu_cmd_base *cmd_base_1 = get_cmd_base();
+       struct amdgpu_cmd_base *cmd_base_2 = get_cmd_base();
+       const struct amdgpu_ip_block_version *ip_block = get_ip_block(device_handle, AMD_IP_GFX);
+       int r;
+
+       ring_context_1 = calloc(1, sizeof(struct amdgpu_ring_context));
+       igt_assert(ring_context_1);
+       amdgpu_user_queue_create(device_handle, ring_context_1,
+ ip_block->type);
+
+       ring_context_2 = calloc(1, sizeof(struct amdgpu_ring_context));
+       igt_assert(ring_context_2);
+       amdgpu_user_queue_create(device_handle, ring_context_2,
+ ip_block->type);
+
+       /* allocate bo1 for dma, bo2 for fence and bo3 for ib for render context 1 */
+       ring_context_1->write_length = 1024;
+       r = amdgpu_bo_alloc_and_map_sync(device_handle , ring_context_1->write_length, 4096,
+                                        AMDGPU_GEM_DOMAIN_GTT, AMDGPU_GEM_CREATE_CPU_GTT_USWC,
+                                        AMDGPU_VM_MTYPE_UC, &ring_context_1->bo,
+                                        (void **)&ring_context_1->bo_cpu, &ring_context_1->bo_mc,
+                                        &ring_context_1->va_handle,
+                                        ring_context_1->timeline_syncobj_handle,
+                                        ++ring_context_1->point, true);
+       igt_assert_eq(r, 0);
+       memset((void *)ring_context_1->bo_cpu, 0,
+ ring_context_1->write_length);
+
+       r = amdgpu_bo_alloc_and_map_sync(device_handle, 4096, 4096, AMDGPU_GEM_DOMAIN_GTT,
+                                        AMDGPU_GEM_CREATE_CPU_GTT_USWC, AMDGPU_VM_MTYPE_UC,
+                                        &ring_context_1->bo2, (void **)&ring_context_1->bo2_cpu,
+                                        &ring_context_1->bo_mc2, &ring_context_1->va_handle2,
+                                        ring_context_1->timeline_syncobj_handle,
+                                        ++ring_context_1->point, true);
+       igt_assert_eq(r, 0);
+       memset((void *)ring_context_1->bo2_cpu, 0, 4096);
+
+       r = amdgpu_bo_alloc_and_map_sync(device_handle, 8192, 4096, AMDGPU_GEM_DOMAIN_GTT,
+                                        AMDGPU_GEM_CREATE_CPU_GTT_USWC, AMDGPU_VM_MTYPE_UC,
+                                        &ring_context_1->bo3, (void **)&ring_context_1->bo3_cpu,
+                                        &ring_context_1->bo_mc3, &ring_context_1->va_handle3,
+                                        ring_context_1->timeline_syncobj_handle,
+                                        ++ring_context_1->point, true);
+       igt_assert_eq(r, 0);
+       memset((void *)ring_context_1->bo3_cpu, 0, 4096);
+
+       /* allocate bo1 for dma, bo2 for fence and bo3 for ib for render context 2 */
+       ring_context_2->write_length = 1024;
+       r = amdgpu_bo_alloc_and_map_sync(device_handle , ring_context_2->write_length, 4096,
+                                        AMDGPU_GEM_DOMAIN_GTT, AMDGPU_GEM_CREATE_CPU_GTT_USWC,
+                                        AMDGPU_VM_MTYPE_UC, &ring_context_2->bo,
+                                        (void **)&ring_context_2->bo_cpu, &ring_context_2->bo_mc,
+                                        &ring_context_2->va_handle,
+                                        ring_context_2->timeline_syncobj_handle,
+                                        ++ring_context_2->point, true);
+       igt_assert_eq(r, 0);
+       memset((void *)ring_context_2->bo_cpu, 0,
+ ring_context_2->write_length);
+
+       r = amdgpu_bo_alloc_and_map_sync(device_handle, 4096, 4096, AMDGPU_GEM_DOMAIN_GTT,
+                                        AMDGPU_GEM_CREATE_CPU_GTT_USWC, AMDGPU_VM_MTYPE_UC,
+                                        &ring_context_2->bo2, (void **)&ring_context_2->bo2_cpu,
+                                        &ring_context_2->bo_mc2, &ring_context_2->va_handle2,
+                                        ring_context_2->timeline_syncobj_handle,
+                                        ++ring_context_2->point, true);
+       igt_assert_eq(r, 0);
+       memset((void *)ring_context_2->bo2_cpu, 0, 4096);
+
+       r = amdgpu_bo_alloc_and_map_sync(device_handle, 8192, 4096, AMDGPU_GEM_DOMAIN_GTT,
+                                        AMDGPU_GEM_CREATE_CPU_GTT_USWC, AMDGPU_VM_MTYPE_UC,
+                                        &ring_context_2->bo3, (void **)&ring_context_2->bo3_cpu,
+                                        &ring_context_2->bo_mc3, &ring_context_2->va_handle3,
+                                        ring_context_2->timeline_syncobj_handle,
+                                        ++ring_context_2->point, true);
+       igt_assert_eq(r, 0);
+       memset((void *)ring_context_2->bo3_cpu, 0, 4096);
+
+
+       /* wait for gtt mapping to complete */
+       r = amdgpu_timeline_syncobj_wait(device_handle, ring_context_1->timeline_syncobj_handle,
+                                        ring_context_1->point);
+       igt_assert_eq(r, 0);
+       r = amdgpu_timeline_syncobj_wait(device_handle, ring_context_2->timeline_syncobj_handle,
+                                        ring_context_2->point);
+       igt_assert_eq(r, 0);
+
+       /* assign cmd buffer for ring context 1 */
+       cmd_base_1->attach_buf(cmd_base_1, (void
+ *)ring_context_1->bo3_cpu, 8192);
+
+       /* create the ib for ring context 1 */
+       cmd_base_1->emit(cmd_base_1, PACKET3(PACKET3_FENCE_WAIT_MULTI, 4 * 1));
+       cmd_base_1->emit(cmd_base_1, FENCE_WAIT_MULTI_ENGINE(1) | FENCE_WAIT_MULTI_PREEMPTABLE(1) |
+                           FENCE_WAIT_MULTI_CACHE_POLICY(3) | FENCE_WAIT_MULTI_POLL_INTERVAL(4));
+       cmd_base_1->emit(cmd_base_1, ring_context_1->bo_mc2);
+       cmd_base_1->emit(cmd_base_1, ring_context_1->bo_mc2 >> 32);
+       cmd_base_1->emit(cmd_base_1, 10); // random incorrect fence value
+       cmd_base_1->emit(cmd_base_1, 0);
+
+       cmd_base_1->emit(cmd_base_1, PACKET3(PACKET3_WRITE_DATA, 3));
+       cmd_base_1->emit(cmd_base_1, WRITE_DATA_DST_SEL(5) | WR_CONFIRM |
+                           WRITE_DATA_CACHE_POLICY(3));
+       cmd_base_1->emit(cmd_base_1, 0xfffffffc & ring_context_1->bo_mc);
+       cmd_base_1->emit(cmd_base_1, (0xffffffff00000000 & ring_context_1->bo_mc) >> 32);
+       cmd_base_1->emit(cmd_base_1, 0xdead0000);
        Can we use the existing function write_linear to replace this?

      Regards
      Jesse

        The gfx_ring_write_linear() function does not add packets to an existing cmd_base; it writes to ring_context->pm4 and sets *pm4_dw = i.
        To use gfx_ring_write_linear(), I would have to do the following, which does not look good IMHO.

        ring_context_1->pm4 = ring_context_1->bo3_cpu + cmd_base_1->cdw;
        uint32_t ret_pm4_dw;
        gfx_ring_write_linear(...);
        cmd_base_1->cdw += ret_pm4_dw;

        Thank you,
        Yogesh

+
+       ring_context_1->pm4_dw = cmd_base_1->cdw;
+       amdgpu_user_queue_submit(device_handle, ring_context_1, ip_block->type,
+                                ring_context_1->bo_mc3, true);
+
+       /* if fwm packet got skipped by firmware then 0xdead0000 will be written */
+       usleep(1000 * 2);
+       igt_assert_eq_u32(*ring_context_1->bo_cpu, 0);
+
+       /* assign cmd buffer for ring context 2 */
+       cmd_base_2->attach_buf(cmd_base_2, (void
+ *)ring_context_2->bo3_cpu, 8192);
+
+       /* create the ib for ring context 2 */
+       cmd_base_2->emit(cmd_base_2, PACKET3(PACKET3_FENCE_WAIT_MULTI, 4 * 1));
+       cmd_base_2->emit(cmd_base_2, FENCE_WAIT_MULTI_ENGINE(1) | FENCE_WAIT_MULTI_PREEMPTABLE(1) |
+                           FENCE_WAIT_MULTI_CACHE_POLICY(3) | FENCE_WAIT_MULTI_POLL_INTERVAL(4));
+       cmd_base_2->emit(cmd_base_2, ring_context_2->bo_mc2);
+       cmd_base_2->emit(cmd_base_2, ring_context_2->bo_mc2 >> 32);
+       cmd_base_2->emit(cmd_base_2, 10); // random incorrect fence value
+       cmd_base_2->emit(cmd_base_2, 0);
+
+       cmd_base_2->emit(cmd_base_2, PACKET3(PACKET3_WRITE_DATA, 3));
+       cmd_base_2->emit(cmd_base_2, WRITE_DATA_DST_SEL(5) | WR_CONFIRM |
+                           WRITE_DATA_CACHE_POLICY(3));
+       cmd_base_2->emit(cmd_base_2, 0xfffffffc & ring_context_2->bo_mc);
+       cmd_base_2->emit(cmd_base_2, (0xffffffff00000000 & ring_context_2->bo_mc) >> 32);
+       cmd_base_2->emit(cmd_base_2, 0xdead0000);
+
+       ring_context_2->pm4_dw = cmd_base_2->cdw;
+       amdgpu_user_queue_submit(device_handle, ring_context_2, ip_block->type,
+                                ring_context_2->bo_mc3, true);
+
+       /* if fwm packet got skipped by firmware then 0xdead0000 will be written */
+       usleep(1000 * 2);
+       igt_assert_eq_u32(*ring_context_1->bo_cpu, 0);
+       igt_assert_eq_u32(*ring_context_2->bo_cpu, 0);
+
+       /* set the correct fence value to finish executing the ib */
+       *ring_context_1->bo2_cpu = 10;
+       *ring_context_2->bo2_cpu = 10;
+       usleep(1000 * 2);
+       igt_assert_eq_u32(*ring_context_1->bo_cpu, 0xdead0000);
+       igt_assert_eq_u32(*ring_context_2->bo_cpu, 0xdead0000);
+
+       amdgpu_user_queue_destroy(device_handle, ring_context_1, ip_block->type);
+       amdgpu_user_queue_destroy(device_handle, ring_context_2,
+ ip_block->type);
+
+       amdgpu_bo_unmap_and_free(ring_context_1->bo, ring_context_1->va_handle,
+                                ring_context_1->bo_mc, ring_context_1->write_length);
+       amdgpu_bo_unmap_and_free(ring_context_1->bo2, ring_context_1->va_handle2,
+                                ring_context_1->bo_mc2, 4096);
+       amdgpu_bo_unmap_and_free(ring_context_1->bo3, ring_context_1->va_handle3,
+                                ring_context_1->bo_mc3, 8192);
+       amdgpu_bo_unmap_and_free(ring_context_2->bo, ring_context_2->va_handle,
+                                ring_context_2->bo_mc, ring_context_2->write_length);
+       amdgpu_bo_unmap_and_free(ring_context_2->bo2, ring_context_2->va_handle2,
+                                ring_context_2->bo_mc2, 4096);
+       amdgpu_bo_unmap_and_free(ring_context_2->bo3, ring_context_2->va_handle3,
+                                ring_context_2->bo_mc3, 8192);
+       free_cmd_base(cmd_base_1);
+       free_cmd_base(cmd_base_2);
+       free(ring_context_1);
+       free(ring_context_2);
+}
+
 igt_main
 {
        amdgpu_device_handle device;
@@ -815,6 +985,14 @@ igt_main
                        amdgpu_sync_dependency_test(device, true);
                }
        }
+
+       igt_describe("Check-FWM-preempt-using-GFX-UMQ");
+       igt_subtest_with_dynamic("fwm-prempt-test-with-IP-GFX-UMQ") {
+               if (userq_arr_cap[AMD_IP_GFX]) {
+                       igt_dynamic_f("fwm-preempt-test-with-gfx-umq")
+                       amdgpu_fwm_preempt_test(device);
+               }
+       }
 #endif

        igt_fixture {
--
2.43.0
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/igt-dev/attachments/20250519/57ebc629/attachment-0001.htm>