On 3/14/2024 2:00 PM, janga.rahul.kumar@intel.com wrote:

From: Janga Rahul Kumar <janga.rahul.kumar@intel.com>

The test submits a long kernel with a higher threadgroup count and a
lower iteration count, plus a short OpenCL kernel, to exercise the
threadgroup preemption scenario with WMTP disabled.

v2: Use macros instead of const values. Add documentation for
    validation check. (Nirmoy)

Cc: Nirmoy Das <nirmoy.das@intel.com>
Signed-off-by: Janga Rahul Kumar <janga.rahul.kumar@intel.com>
Reviewed-by: Nirmoy Das <nirmoy.das@intel.com>
---
 lib/intel_compute.c              | 74 +++++++++++++++++++++++++-------
 lib/intel_compute.h              |  3 +-
 tests/intel/xe_compute_preempt.c | 24 +++++++++--
 3 files changed, 80 insertions(+), 21 deletions(-)

diff --git a/lib/intel_compute.c b/lib/intel_compute.c
index c5d253ebc..9d3b97efe 100644
--- a/lib/intel_compute.c
+++ b/lib/intel_compute.c
@@ -43,6 +43,13 @@
 #define XE2_ADDR_STATE_CONTEXT_DATA_BASE       0x900000UL
 #define OFFSET_STATE_SIP                       0xFFFF0000
 
+/*
+ * TGP  - ThreadGroup Preemption
+ * WMTP - Walker Mid Thread Preemption
+ */
+#define TGP_long_kernel_loop_count             10
+#define WMTP_long_kernel_loop_count            1000000
+
 struct bo_dict_entry {
        uint64_t addr;
        uint32_t size;
@@ -1162,7 +1169,8 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
                                        uint64_t addr_state_contect_data_base,
                                        uint64_t offset_indirect_data_start,
                                        uint64_t kernel_start_pointer,
-                                       uint64_t sip_start_pointer)
+                                       uint64_t sip_start_pointer,
+                                       bool     threadgroup_preemption)
 {
        int b = 0;
 
@@ -1236,7 +1244,12 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
        addr_bo_buffer_batch[b++] = 0xbe040000;
        addr_bo_buffer_batch[b++] = 0xffffffff;
        addr_bo_buffer_batch[b++] = 0x000003ff;
-       addr_bo_buffer_batch[b++] = 0x00000002;
+
+       if (threadgroup_preemption)
+               addr_bo_buffer_batch[b++] = 0x00200000; // Global workgroup size
+       else
+               addr_bo_buffer_batch[b++] = 0x00000002;
+
        addr_bo_buffer_batch[b++] = 0x00000001;
        addr_bo_buffer_batch[b++] = 0x00000001;
        addr_bo_buffer_batch[b++] = 0x00000000;
@@ -1251,7 +1264,12 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
 
        addr_bo_buffer_batch[b++] = kernel_start_pointer;
        addr_bo_buffer_batch[b++] = 0x00000000;
-       addr_bo_buffer_batch[b++] = 0x00100000; // Enable Thread Preemption BitField:20
+
+       if (threadgroup_preemption)
+               addr_bo_buffer_batch[b++] = 0x00000000;
+       else
+               addr_bo_buffer_batch[b++] = 0x00100000; // Enable Mid Thread Preemption BitField:20
+
        addr_bo_buffer_batch[b++] = 0x00000000;
        addr_bo_buffer_batch[b++] = 0x00000000;
        addr_bo_buffer_batch[b++] = 0x0c000020;
@@ -1369,7 +1387,7 @@ static void xe2lpg_compute_exec(int fd, const unsigned char *kernel,
                                  ADDR_INSTRUCTION_STATE_BASE,
                                  XE2_ADDR_STATE_CONTEXT_DATA_BASE,
                                  OFFSET_INDIRECT_DATA_START,
-                                 OFFSET_KERNEL, 0);
+                                 OFFSET_KERNEL, 0, false);
 
        bo_execenv_exec(&execenv, ADDR_BATCH);
 
@@ -1527,7 +1545,8 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
                                        unsigned int short_kernel_size,
                                        const unsigned char *sip_kernel,
                                        unsigned int sip_kernel_size,
-                                       struct drm_xe_engine_class_instance *eci)
+                                       struct drm_xe_engine_class_instance *eci,
+                                       bool threadgroup_preemption)
 {
 #define XE2_BO_PREEMPT_DICT_ENTRIES 11
        struct bo_dict_entry bo_dict_long[XE2_BO_PREEMPT_DICT_ENTRIES] = {
@@ -1564,6 +1583,7 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
        struct bo_dict_entry bo_dict_short[XE2_BO_PREEMPT_DICT_ENTRIES];
        struct bo_execenv execenv_short, execenv_long;
        float *dinput;
+       unsigned int long_kernel_loop_count;
        struct drm_xe_sync sync_long = {
                .type = DRM_XE_SYNC_TYPE_SYNCOBJ,
                .flags = DRM_XE_SYNC_FLAG_SIGNAL,
@@ -1574,7 +1594,11 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
                .flags = DRM_XE_SYNC_FLAG_SIGNAL,
                .handle = syncobj_create(fd, 0),
        };
-       unsigned int long_kernel_loop_count = 1000000;
+
+       if (threadgroup_preemption)
+               long_kernel_loop_count = TGP_long_kernel_loop_count;
+       else
+               long_kernel_loop_count = WMTP_long_kernel_loop_count;
 
        for (int i = 0; i < XE2_BO_PREEMPT_DICT_ENTRIES; ++i)
                bo_dict_short[i] = bo_dict_long[i];
@@ -1622,12 +1646,12 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
        xe2lpg_compute_exec_compute(bo_dict_long[8].data, ADDR_GENERAL_STATE_BASE,
                                    ADDR_SURFACE_STATE_BASE, ADDR_DYNAMIC_STATE_BASE,
                                    ADDR_INSTRUCTION_STATE_BASE, XE2_ADDR_STATE_CONTEXT_DATA_BASE,
-                                   OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP);
+                                   OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP, threadgroup_preemption);
 
        xe2lpg_compute_exec_compute(bo_dict_short[8].data, ADDR_GENERAL_STATE_BASE,
                                    ADDR_SURFACE_STATE_BASE, ADDR_DYNAMIC_STATE_BASE,
                                    ADDR_INSTRUCTION_STATE_BASE, XE2_ADDR_STATE_CONTEXT_DATA_BASE,
-                                   OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP);
+                                   OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP, false);
 
        xe_exec_sync(fd, execenv_long.exec_queue, ADDR_BATCH, &sync_long, 1);
 
@@ -1655,9 +1679,21 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
 
                f1 = ((float *) bo_dict_long[5].data)[i];
 
-               if (f1 != long_kernel_loop_count)
-                       igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
-               igt_assert(f1 == long_kernel_loop_count);
+               if (threadgroup_preemption) {
+                       if (f1 < long_kernel_loop_count)
+                               igt_debug("[%4d] f1: %f < %u\n", i, f1, long_kernel_loop_count);
+
+                       /* The final incremented value should be greater than the
+                        * loop count, as the kernel is run by multiple threads and
+                        * the output variable is shared among all of them. This
+                        * ensures multiple threadgroups executed the workload.
+                        */
+                       igt_assert(f1 > long_kernel_loop_count);
+               } else {
+                       if (f1 != long_kernel_loop_count)
+                               igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
+                       igt_assert(f1 == long_kernel_loop_count);
+               }
        }
 
        bo_execenv_unbind(&execenv_short, bo_dict_short, XE2_BO_PREEMPT_DICT_ENTRIES);
@@ -1675,7 +1711,8 @@ static const struct {
                             unsigned int short_kernel_size,
                             const unsigned char *sip_kernel,
                             unsigned int sip_kernel_size,
-                            struct drm_xe_engine_class_instance *eci);
+                            struct drm_xe_engine_class_instance *eci,
+                            bool threadgroup_preemption);
        uint32_t compat;
 } intel_compute_preempt_batches[] = {
        {
@@ -1686,7 +1723,8 @@ static const struct {
 };
 
 static bool __run_intel_compute_kernel_preempt(int fd,
-               struct drm_xe_engine_class_instance *eci)
+               struct drm_xe_engine_class_instance *eci,
+               bool threadgroup_preemption)
 {
        unsigned int ip_ver = intel_graphics_ver(intel_get_drm_devid(fd));
        unsigned int batch;
@@ -1724,7 +1762,8 @@ static bool __run_intel_compute_kernel_preempt(int fd,
                                                          kernels->kernel, kernels->size,
                                                          kernels->sip_kernel,
                                                          kernels->sip_kernel_size,
-                                                         eci);
+                                                         eci,
+                                                         threadgroup_preemption);
 
        return true;
 }
@@ -1733,11 +1772,14 @@ static bool __run_intel_compute_kernel_preempt(int fd,
  * exercise preemption scenario.
  *
  * @fd: file descriptor of the opened DRM Xe device
+ * @eci: engine class instance
+ * @threadgroup_preemption: true to exercise threadgroup preemption instead of WMTP
  *
  * Returns true on success, false otherwise.
  */
 bool run_intel_compute_kernel_preempt(int fd,
-               struct drm_xe_engine_class_instance *eci)
+               struct drm_xe_engine_class_instance *eci,
+               bool threadgroup_preemption)
 {
-       return __run_intel_compute_kernel_preempt(fd, eci);
+       return __run_intel_compute_kernel_preempt(fd, eci, threadgroup_preemption);
 }
diff --git a/lib/intel_compute.h b/lib/intel_compute.h
index fe9637b91..3c2cd010c 100644
--- a/lib/intel_compute.h
+++ b/lib/intel_compute.h
@@ -37,5 +37,6 @@ extern const struct intel_compute_kernels intel_compute_square_kernels[];
 
 bool run_intel_compute_kernel(int fd);
 bool xe_run_intel_compute_kernel_on_engine(int fd, struct drm_xe_engine_class_instance *eci);
-bool run_intel_compute_kernel_preempt(int fd, struct drm_xe_engine_class_instance *eci);
+bool run_intel_compute_kernel_preempt(int fd, struct drm_xe_engine_class_instance *eci,
+                                     bool threadgroup_preemption);
 #endif /* INTEL_COMPUTE_H */
diff --git a/tests/intel/xe_compute_preempt.c b/tests/intel/xe_compute_preempt.c
index 0aeb10547..2bc27eff1 100644
--- a/tests/intel/xe_compute_preempt.c
+++ b/tests/intel/xe_compute_preempt.c
@@ -27,11 +27,16 @@
  * Description:
  *      Exercise multiple walker mid thread preemption scenario
  * Functionality: compute openCL kernel
+ * SUBTEST: compute-threadgroup-preempt
+ * GPU requirement: LNL
+ * Description:
+ *      Exercise compute walker threadgroup preemption scenario
+ * Functionality: compute openCL kernel
  */
 static void
-test_compute_preempt(int fd, struct drm_xe_engine_class_instance *hwe)
+test_compute_preempt(int fd, struct drm_xe_engine_class_instance *hwe, bool threadgroup_preemption)
 {
-       igt_require_f(run_intel_compute_kernel_preempt(fd, hwe), "GPU not supported\n");
+       igt_require_f(run_intel_compute_kernel_preempt(fd, hwe, threadgroup_preemption), "GPU not supported\n");
 }
 
 igt_main
@@ -49,7 +54,7 @@ igt_main
                                continue;
 
                        igt_dynamic_f("engine-%s", xe_engine_class_string(hwe->engine_class))
-                               test_compute_preempt(xe, hwe);
+                               test_compute_preempt(xe, hwe, false);
                }
        }
 
@@ -61,12 +66,23 @@ igt_main
 
                        igt_dynamic_f("engine-%s", xe_engine_class_string(hwe->engine_class)) {
                                igt_fork(child, 100)
-                                       test_compute_preempt(xe, hwe);
+                                       test_compute_preempt(xe, hwe, false);
                                igt_waitchildren();
                        }
                }
        }
 
+       igt_subtest_with_dynamic("compute-threadgroup-preempt") {
+               xe_for_each_engine(xe, hwe) {
+                       if (hwe->engine_class != DRM_XE_ENGINE_CLASS_COMPUTE &&
+                           hwe->engine_class != DRM_XE_ENGINE_CLASS_RENDER)
+                               continue;
+
+                       igt_dynamic_f("engine-%s", xe_engine_class_string(hwe->engine_class))
+                               test_compute_preempt(xe, hwe, true);
+               }
+       }
+
        igt_fixture
                drm_close_driver(xe);
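One small usage note for whoever picks this up: with the patch applied, the new
subtest can be selected through the standard IGT subtest filter. A minimal
sketch, assuming the usual meson build layout (the binary path is an
assumption, adjust for your tree):

  # list the available subtests, then run only the new one
  $ ./build/tests/xe_compute_preempt --list-subtests
  $ ./build/tests/xe_compute_preempt --run-subtest compute-threadgroup-preempt

The dynamic engine-* entries are then generated per compute/render engine,
matching the xe_for_each_engine() filter added in the test.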
Reviewed-by: Jagmeet Randhawa <jagmeet.randhawa@intel.com>