Mesa (main): ac: Add task shader ring information.

Wed Jun 8 09:12:36 UTC 2022

Module: Mesa
Branch: main
Commit: ac5ab8d227e48e7572a4fa3311ac7c900aae3082
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=ac5ab8d227e48e7572a4fa3311ac7c900aae3082

Author: Timur Kristóf <timur.kristof at gmail.com>
Date:   Tue May 31 13:20:23 2022 +0200

ac: Add task shader ring information.

Similarly to tessellation rings information, move the task
rings info to ac_gpu_info.

Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16737>

---

 src/amd/common/ac_gpu_info.c    | 39 ++++++++++++++++++++++++++++++++++++
 src/amd/common/ac_gpu_info.h    | 44 +++++++++++++++++++++++++++++++++++++++++
 src/amd/vulkan/radv_constants.h |  5 -----
 src/amd/vulkan/radv_device.c    | 19 +-----------------
 src/amd/vulkan/radv_private.h   |  4 +---
 src/amd/vulkan/radv_shader.c    |  8 ++++----
 6 files changed, 89 insertions(+), 30 deletions(-)

diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index e867228dec0..212dc7ef359 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -1858,3 +1858,42 @@ void ac_get_hs_info(struct radeon_info *info,
    hs->tess_offchip_ring_offset = align(hs->tess_factor_ring_size, 64 * 1024);
    hs->tess_offchip_ring_size = hs->max_offchip_buffers * hs->tess_offchip_block_dw_size * 4;
 }
+
+static uint16_t get_task_num_entries(enum radeon_family fam)
+{
+   /* Number of task shader ring entries. Needs to be a power of two.
+    * Use a low number on smaller chips so we don't waste space,
+    * but keep it high on bigger chips so it doesn't inhibit parallelism.
+    *
+    * This number is compiled into task/mesh shaders as a constant.
+    * In order to ensure this works fine with the shader cache, we must
+    * base this decision on the chip family, not the number of CUs in
+    * the current GPU. (So, the cache remains consistent for all
+    * chips in the same family.)
+    */
+   switch (fam) {
+   case CHIP_VANGOGH:
+   case CHIP_NAVI24:
+   case CHIP_REMBRANDT:
+      return 256;
+   case CHIP_NAVI21:
+   case CHIP_NAVI22:
+   case CHIP_NAVI23:
+   default:
+      return 1024;
+   }
+}
+
+void ac_get_task_info(struct radeon_info *info,
+                      struct ac_task_info *task_info)
+{
+   const uint16_t num_entries = get_task_num_entries(info->family);
+   const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES;
+   const uint32_t payload_ring_bytes = num_entries * AC_TASK_PAYLOAD_ENTRY_BYTES;
+
+   /* Ensure that the address of each ring is 256 byte aligned. */
+   task_info->num_entries = num_entries;
+   task_info->draw_ring_offset = ALIGN(AC_TASK_CTRLBUF_BYTES, 256);
+   task_info->payload_ring_offset = ALIGN(task_info->draw_ring_offset + draw_ring_bytes, 256);
+   task_info->bo_size_bytes = task_info->payload_ring_offset + payload_ring_bytes;
+}
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 07f1cbb556f..9bcaf74d3a0 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -274,6 +274,50 @@ struct ac_hs_info {
 void ac_get_hs_info(struct radeon_info *info,
                     struct ac_hs_info *hs);
 
+/* Task rings BO layout information.
+ * This BO is shared between GFX and ACE queues so that the ACE and GFX
+ * firmware can cooperate on task->mesh dispatches and is also used to
+ * store the task payload which is passed to mesh shaders.
+ *
+ * The driver only needs to create this BO once,
+ * and it will always be able to accommodate the maximum needed
+ * task payload size.
+ *
+ * The following memory layout is used:
+ * 1. Control buffer: 9 DWORDs, 256 byte aligned
+ *    Used by the firmware to maintain the current state.
+ * (padding)
+ * 2. Draw ring: 4 DWORDs per entry, 256 byte aligned
+ *    Task shaders store the mesh dispatch size here.
+ * (padding)
+ * 3. Payload ring: 16K bytes per entry, 256 byte aligned.
+ *    This is where task payload is stored by task shaders and
+ *    read by mesh shaders.
+ *
+ */
+struct ac_task_info {
+   uint32_t draw_ring_offset;
+   uint32_t payload_ring_offset;
+   uint32_t bo_size_bytes;
+   uint16_t num_entries;
+};
+
+/* Size of each payload entry in the task payload ring.
+ * Spec requires minimum 16K bytes.
+ */
+#define AC_TASK_PAYLOAD_ENTRY_BYTES 16384
+
+/* Size of each draw entry in the task draw ring.
+ * 4 DWORDs per entry.
+ */
+#define AC_TASK_DRAW_ENTRY_BYTES 16
+
+/* Size of the task control buffer. 9 DWORDs. */
+#define AC_TASK_CTRLBUF_BYTES 36
+
+void ac_get_task_info(struct radeon_info *info,
+                      struct ac_task_info *task_info);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/vulkan/radv_constants.h b/src/amd/vulkan/radv_constants.h
index c40330f3bc3..4b6d3b9667b 100644
--- a/src/amd/vulkan/radv_constants.h
+++ b/src/amd/vulkan/radv_constants.h
@@ -91,11 +91,6 @@
  */
 #define RADV_MAX_MEMORY_ALLOCATION_SIZE 0xFFFFFFFCull
 
-/* Size of each payload entry in the task payload ring.
- * Spec requires minimum 16K bytes.
- */
-#define RADV_TASK_PAYLOAD_ENTRY_BYTES 16384
-
 /* Number of invocations in each subgroup. */
 #define RADV_SUBGROUP_SIZE 64
 
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index d19a41ae345..4f3d7d730cc 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -834,24 +834,7 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm
       ac_get_gs_table_depth(device->rad_info.gfx_level, device->rad_info.family);
 
    ac_get_hs_info(&device->rad_info, &device->hs);
-
-   /* Number of task shader ring entries. Needs to be a power of two.
-    * Use a low number on smaller chips so we don't waste space,
-    * but keep it high on bigger chips so it doesn't inhibit parallelism.
-    */
-   switch (device->rad_info.family) {
-   case CHIP_VANGOGH:
-   case CHIP_NAVI24:
-   case CHIP_REMBRANDT:
-      device->task_num_entries = 256;
-      break;
-   case CHIP_NAVI21:
-   case CHIP_NAVI22:
-   case CHIP_NAVI23:
-   default:
-      device->task_num_entries = 1024;
-      break;
-   }
+   ac_get_task_info(&device->rad_info, &device->task_info);
 
    *device_out = device;
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index d2295187b20..2f01c6e573f 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -332,9 +332,7 @@ struct radv_physical_device {
    uint32_t gs_table_depth;
 
    struct ac_hs_info hs;
-
-   /* Number of entries in the task shader ring buffers. */
-   uint32_t task_num_entries;
+   struct ac_task_info task_info;
 };
 
 struct radv_instance {
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 21174c2ed10..20dd59c3c83 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -1094,12 +1094,12 @@ radv_lower_io_to_mem(struct radv_device *device, struct radv_pipeline_stage *sta
       return true;
    } else if (nir->info.stage == MESA_SHADER_TASK) {
       ac_nir_apply_first_task_to_task_shader(nir);
-      ac_nir_lower_task_outputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES,
-                                       device->physical_device->task_num_entries);
+      ac_nir_lower_task_outputs_to_mem(nir, AC_TASK_PAYLOAD_ENTRY_BYTES,
+                                       device->physical_device->task_info.num_entries);
       return true;
    } else if (nir->info.stage == MESA_SHADER_MESH) {
-      ac_nir_lower_mesh_inputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES,
-                                      device->physical_device->task_num_entries);
+      ac_nir_lower_mesh_inputs_to_mem(nir, AC_TASK_PAYLOAD_ENTRY_BYTES,
+                                      device->physical_device->task_info.num_entries);
       return true;
    }