Mesa (main): pvr: Add compute update kernel in vkCmdDispatch().

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue May 3 13:47:23 UTC 2022


Module: Mesa
Branch: main
Commit: 58d9afb80ba729cc9f9d43e635c0386b59815bfa
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=58d9afb80ba729cc9f9d43e635c0386b59815bfa

Author: Karmjit Mahil <Karmjit.Mahil at imgtec.com>
Date:   Mon Feb 14 14:27:33 2022 +0000

pvr: Add compute update kernel in vkCmdDispatch().

We upload a new data section whenever a patched variant is
needed. These uploads are linked to the command buffer, so they
are freed when the command buffer is destroyed.

Co-authored-by: Rajnesh Kanwal <rajnesh.kanwal at imgtec.com>
Signed-off-by: Karmjit Mahil <Karmjit.Mahil at imgtec.com>
Signed-off-by: Rajnesh Kanwal <rajnesh.kanwal at imgtec.com>
Reviewed-by: Frank Binns <frank.binns at imgtec.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16040>

---

 src/imagination/vulkan/pvr_cmd_buffer.c | 107 +++++++++++++++++-
 src/imagination/vulkan/pvr_pipeline.c   | 190 ++++++++++++++++++++++++--------
 src/imagination/vulkan/pvr_private.h    |  20 +++-
 3 files changed, 265 insertions(+), 52 deletions(-)
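
The data sections mentioned above are tied to the lifetime of the command
buffer: each patched upload is linked into the command buffer and released
with it. Below is a minimal standalone sketch of that ownership pattern; the
struct and helper names are illustrative only and are not the driver's actual
API.

/* Standalone illustration (not driver code): uploads owned by a command
 * buffer are kept on a linked list and released when the buffer is
 * destroyed.
 */
#include <stdlib.h>

struct upload {
   void *data;              /* stands in for an uploaded PDS data section */
   struct upload *next;
};

struct cmd_buffer {
   struct upload *uploads;  /* head of the list of owned uploads */
};

/* Hypothetical helper: take ownership of a freshly uploaded data section. */
static void cmd_buffer_link_upload(struct cmd_buffer *cmd, void *data)
{
   struct upload *u = malloc(sizeof(*u));

   if (!u)
      return;

   u->data = data;
   u->next = cmd->uploads;
   cmd->uploads = u;
}

/* At command buffer destruction every linked upload is freed with it. */
static void cmd_buffer_destroy(struct cmd_buffer *cmd)
{
   for (struct upload *u = cmd->uploads; u;) {
      struct upload *next = u->next;

      free(u->data);
      free(u);
      u = next;
   }

   cmd->uploads = NULL;
}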

diff --git a/src/imagination/vulkan/pvr_cmd_buffer.c b/src/imagination/vulkan/pvr_cmd_buffer.c
index 73cd910eb32..f043054bcb0 100644
--- a/src/imagination/vulkan/pvr_cmd_buffer.c
+++ b/src/imagination/vulkan/pvr_cmd_buffer.c
@@ -2880,6 +2880,111 @@ static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer)
    pvr_compute_generate_control_stream(csb, &info);
 }
 
+static uint32_t
+pvr_compute_flat_pad_workgroup_size(const struct pvr_device_info *dev_info,
+                                    uint32_t workgroup_size,
+                                    uint32_t coeff_regs_count)
+{
+   uint32_t max_avail_coeff_regs =
+      rogue_get_cdm_max_local_mem_size_regs(dev_info);
+   uint32_t coeff_regs_count_aligned =
+      ALIGN_POT(coeff_regs_count,
+                PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) >> 2U);
+
+   /* If the work group size is > ROGUE_MAX_INSTANCES_PER_TASK we *always* pad
+    * it to the next multiple of ROGUE_MAX_INSTANCES_PER_TASK.
+    *
+    * If we use more than 1/8th of the max coefficient registers then we also
+    * round the work group size up to the next multiple of
+    * ROGUE_MAX_INSTANCES_PER_TASK.
+    */
+   /* TODO: See if this can be optimized. */
+   if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
+       coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
+      assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));
+
+      return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
+   }
+
+   return workgroup_size;
+}
+
+/* TODO: Wire up the base_workgroup variant program when implementing
+ * VK_KHR_device_group. The values will also need patching into the program.
+ */
+static void pvr_compute_update_kernel(
+   struct pvr_cmd_buffer *cmd_buffer,
+   const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
+{
+   const struct pvr_device_info *dev_info =
+      &cmd_buffer->device->pdevice->dev_info;
+   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
+   struct pvr_csb *csb = &state->current_sub_cmd->compute.control_stream;
+   const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
+   const struct pvr_pds_info *program_info =
+      &pipeline->state.primary_program_info;
+
+   struct pvr_compute_kernel_info info = {
+      .indirect_buffer_addr.addr = 0ULL,
+      .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
+      .pds_temp_size =
+         DIV_ROUND_UP(program_info->temps_required << 2U,
+                      PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
+
+      .pds_data_size =
+         DIV_ROUND_UP(program_info->data_size_in_dwords << 2U,
+                      PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
+      .pds_data_offset = pipeline->state.primary_program.data_offset,
+      .pds_code_offset = pipeline->state.primary_program.code_offset,
+
+      .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
+
+      .usc_unified_size =
+         DIV_ROUND_UP(pipeline->state.shader.input_register_count << 2U,
+                      PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
+
+      /* clang-format off */
+      .global_size = {
+         global_workgroup_size[0],
+         global_workgroup_size[1],
+         global_workgroup_size[2]
+      },
+      /* clang-format on */
+   };
+
+   uint32_t work_size = pipeline->state.shader.work_size;
+   uint32_t coeff_regs;
+
+   if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
+      /* Enforce a single workgroup per cluster through allocation starvation.
+       */
+      coeff_regs = rogue_get_cdm_max_local_mem_size_regs(dev_info);
+   } else {
+      coeff_regs = pipeline->state.shader.coefficient_register_count;
+   }
+
+   info.usc_common_size =
+      DIV_ROUND_UP(coeff_regs << 2U,
+                   PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
+
+   /* Use a whole slot per workgroup. */
+   work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
+
+   coeff_regs += pipeline->state.shader.const_shared_reg_count;
+
+   work_size =
+      pvr_compute_flat_pad_workgroup_size(dev_info, work_size, coeff_regs);
+
+   info.local_size[0] = work_size;
+   info.local_size[1] = 1U;
+   info.local_size[2] = 1U;
+
+   info.max_instances =
+      pvr_compute_flat_slot_size(dev_info, coeff_regs, false, work_size);
+
+   pvr_compute_generate_control_stream(csb, &info);
+}
+
 void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
                      uint32_t groupCountX,
                      uint32_t groupCountY,
@@ -2953,7 +3058,7 @@ void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
 
    pvr_compute_update_shared(cmd_buffer);
 
-   /* FIXME: Create update kernel end emit control stream. */
+   pvr_compute_update_kernel(cmd_buffer, workgroup_size);
 }
 
 void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
diff --git a/src/imagination/vulkan/pvr_pipeline.c b/src/imagination/vulkan/pvr_pipeline.c
index a8beb81be96..09995357df3 100644
--- a/src/imagination/vulkan/pvr_pipeline.c
+++ b/src/imagination/vulkan/pvr_pipeline.c
@@ -45,6 +45,7 @@
 #include "util/log.h"
 #include "util/macros.h"
 #include "util/ralloc.h"
+#include "util/u_math.h"
 #include "vk_alloc.h"
 #include "vk_log.h"
 #include "vk_object.h"
@@ -777,22 +778,17 @@ static void pvr_pds_uniform_program_destroy(
    vk_free2(&device->vk.alloc, allocator, pds_info->entries);
 }
 
-/* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
- */
-static VkResult pvr_pds_compute_program_create_and_upload(
-   struct pvr_device *const device,
-   const VkAllocationCallbacks *const allocator,
+static void pvr_pds_compute_program_setup(
+   const struct pvr_device_info *dev_info,
    const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
    const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
    uint32_t barrier_coefficient,
    bool add_base_workgroup,
    uint32_t usc_temps,
    pvr_dev_addr_t usc_shader_dev_addr,
-   struct pvr_pds_upload *const pds_upload_out,
-   struct pvr_pds_info *const pds_info_out,
-   uint32_t *const base_workgroup_data_patching_offset_out)
+   struct pvr_pds_compute_shader_program *const program)
 {
-   struct pvr_pds_compute_shader_program program = {
+   *program = (struct pvr_pds_compute_shader_program){
       /* clang-format off */
       .local_input_regs = {
          local_input_regs[0],
@@ -815,27 +811,50 @@ static VkResult pvr_pds_compute_program_create_and_upload(
       .add_base_workgroup = add_base_workgroup,
       .kick_usc = true,
    };
-   struct pvr_device_info *dev_info = &device->pdevice->dev_info;
-   uint32_t staging_buffer_size;
-   uint32_t *staging_buffer;
-   VkResult result;
 
-   STATIC_ASSERT(ARRAY_SIZE(program.local_input_regs) ==
+   STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
                  PVR_WORKGROUP_DIMENSIONS);
-   STATIC_ASSERT(ARRAY_SIZE(program.work_group_input_regs) ==
+   STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
                  PVR_WORKGROUP_DIMENSIONS);
-   STATIC_ASSERT(ARRAY_SIZE(program.global_input_regs) ==
+   STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
                  PVR_WORKGROUP_DIMENSIONS);
 
-   assert(!add_base_workgroup || base_workgroup_data_patching_offset_out);
-
-   pvr_pds_setup_doutu(&program.usc_task_control,
+   pvr_pds_setup_doutu(&program->usc_task_control,
                        usc_shader_dev_addr.addr,
                        usc_temps,
                        PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                        false);
 
-   pvr_pds_compute_shader(&program, NULL, PDS_GENERATE_SIZES, dev_info);
+   pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
+}
+
+/* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
+ */
+static VkResult pvr_pds_compute_program_create_and_upload(
+   struct pvr_device *const device,
+   const VkAllocationCallbacks *const allocator,
+   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
+   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
+   uint32_t barrier_coefficient,
+   uint32_t usc_temps,
+   pvr_dev_addr_t usc_shader_dev_addr,
+   struct pvr_pds_upload *const pds_upload_out,
+   struct pvr_pds_info *const pds_info_out)
+{
+   struct pvr_device_info *dev_info = &device->pdevice->dev_info;
+   struct pvr_pds_compute_shader_program program;
+   uint32_t staging_buffer_size;
+   uint32_t *staging_buffer;
+   VkResult result;
+
+   pvr_pds_compute_program_setup(dev_info,
+                                 local_input_regs,
+                                 work_group_input_regs,
+                                 barrier_coefficient,
+                                 false,
+                                 usc_temps,
+                                 usc_shader_dev_addr,
+                                 &program);
 
    /* FIXME: According to pvr_device_init_compute_pds_program() the code size
     * is in bytes. Investigate this.
@@ -864,16 +883,6 @@ static VkResult pvr_pds_compute_program_create_and_upload(
                           PDS_GENERATE_DATA_SEGMENT,
                           dev_info);
 
-   /* We'll need to patch the base workgroup in the PDS data section before
-    * dispatch so we give back the offsets at which to patch. We only need to
-    * save the offset for the first workgroup id since the workgroup ids are
-    * stored contiguously in the data segment.
-    */
-   if (add_base_workgroup) {
-      *base_workgroup_data_patching_offset_out =
-         program.base_workgroup_constant_offset_in_dwords[0];
-   }
-
    /* FIXME: Figure out the define for alignment of 16. */
    result = pvr_gpu_upload_pds(device,
                                &staging_buffer[program.code_size],
@@ -910,6 +919,97 @@ static void pvr_pds_compute_program_destroy(
    pvr_bo_free(device, pds_program->pvr_bo);
 }
 
+/* This only uploads the code segment. The data segment will need to be patched
+ * with the base workgroup before uploading.
+ */
+static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
+   struct pvr_device *const device,
+   const VkAllocationCallbacks *const allocator,
+   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
+   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
+   uint32_t barrier_coefficient,
+   uint32_t usc_temps,
+   pvr_dev_addr_t usc_shader_dev_addr,
+   struct pvr_pds_base_workgroup_program *program_out)
+{
+   struct pvr_device_info *dev_info = &device->pdevice->dev_info;
+   struct pvr_pds_compute_shader_program program;
+   uint32_t buffer_size;
+   uint32_t *buffer;
+   VkResult result;
+
+   pvr_pds_compute_program_setup(dev_info,
+                                 local_input_regs,
+                                 work_group_input_regs,
+                                 barrier_coefficient,
+                                 true,
+                                 usc_temps,
+                                 usc_shader_dev_addr,
+                                 &program);
+
+   /* FIXME: According to pvr_device_init_compute_pds_program() the code size
+    * is in bytes. Investigate this.
+    */
+   buffer_size = MAX2(program.code_size, program.data_size) * sizeof(*buffer);
+
+   buffer = vk_alloc2(&device->vk.alloc,
+                      allocator,
+                      buffer_size,
+                      8,
+                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!buffer)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   pvr_pds_compute_shader(&program,
+                          &buffer[0],
+                          PDS_GENERATE_CODE_SEGMENT,
+                          dev_info);
+
+   /* FIXME: Figure out the define for alignment of 16. */
+   result = pvr_gpu_upload_pds(device,
+                               NULL,
+                               0,
+                               0,
+                               buffer,
+                               program.code_size,
+                               16,
+                               16,
+                               &program_out->code_upload);
+   if (result != VK_SUCCESS) {
+      vk_free2(&device->vk.alloc, allocator, buffer);
+      return result;
+   }
+
+   pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);
+
+   program_out->data_section = buffer;
+
+   /* We'll need to patch the base workgroup in the PDS data section before
+    * dispatch so we save the offsets at which to patch. We only need to save
+    * the offset for the first workgroup id since the workgroup ids are stored
+    * contiguously in the data segment.
+    */
+   program_out->base_workgroup_data_patching_offset =
+      program.base_workgroup_constant_offset_in_dwords[0];
+
+   program_out->info = (struct pvr_pds_info){
+      .temps_required = program.highest_temp,
+      .code_size_in_dwords = program.code_size,
+      .data_size_in_dwords = program.data_size,
+   };
+
+   return VK_SUCCESS;
+}
+
+static void pvr_pds_compute_base_workgroup_variant_program_finish(
+   struct pvr_device *device,
+   const VkAllocationCallbacks *const allocator,
+   struct pvr_pds_base_workgroup_program *const state)
+{
+   pvr_bo_free(device, state->code_upload.pvr_bo);
+   vk_free2(&device->vk.alloc, allocator, state->data_section);
+}
+
 /******************************************************************************
    Generic pipeline functions
  ******************************************************************************/
@@ -962,6 +1062,9 @@ static VkResult pvr_compute_pipeline_compile(
    compute_pipeline->state.shader.uses_barrier = false;
    compute_pipeline->state.shader.uses_num_workgroups = false;
    compute_pipeline->state.shader.const_shared_reg_count = 4;
+   compute_pipeline->state.shader.input_register_count = 8;
+   compute_pipeline->state.shader.work_size = 1 * 1 * 1;
+   compute_pipeline->state.shader.coefficient_register_count = 4;
 
    result = pvr_gpu_upload_usc(device,
                                pvr_usc_compute_shader,
@@ -1011,12 +1114,10 @@ static VkResult pvr_compute_pipeline_compile(
       local_input_regs,
       work_group_input_regs,
       barrier_coefficient,
-      false,
       pvr_pds_compute_program_params.usc_temps,
       compute_pipeline->state.shader.bo->vma->dev_addr,
       &compute_pipeline->state.primary_program,
-      &compute_pipeline->state.primary_program_info,
-      NULL);
+      &compute_pipeline->state.primary_program_info);
    if (result != VK_SUCCESS)
       goto err_free_uniform_program;
 
@@ -1029,27 +1130,27 @@ static VkResult pvr_compute_pipeline_compile(
       work_group_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED;
 
    if (compute_pipeline->state.flags.base_workgroup) {
-      result = pvr_pds_compute_program_create_and_upload(
+      result = pvr_pds_compute_base_workgroup_variant_program_init(
          device,
          allocator,
          local_input_regs,
          work_group_input_regs,
          barrier_coefficient,
-         true,
          pvr_pds_compute_program_params.usc_temps,
          compute_pipeline->state.shader.bo->vma->dev_addr,
-         &compute_pipeline->state.primary_program_base_workgroup_variant,
-         &compute_pipeline->state.primary_program_base_workgroup_variant_info,
-         &compute_pipeline->state.base_workgroup_ids_dword_offset);
+         &compute_pipeline->state.primary_base_workgroup_variant_program);
       if (result != VK_SUCCESS)
-         goto err_free_compute_program;
+         goto err_destroy_compute_program;
    }
 
    return VK_SUCCESS;
 
-err_free_compute_program:
-   if (compute_pipeline->state.flags.base_workgroup)
-      pvr_bo_free(device, compute_pipeline->state.primary_program.pvr_bo);
+err_destroy_compute_program:
+   pvr_pds_compute_program_destroy(
+      device,
+      allocator,
+      &compute_pipeline->state.primary_program,
+      &compute_pipeline->state.primary_program_info);
 
 err_free_uniform_program:
    pvr_bo_free(device, compute_pipeline->state.uniform.pds_code.pvr_bo);
@@ -1129,11 +1230,10 @@ static void pvr_compute_pipeline_destroy(
    struct pvr_compute_pipeline *const compute_pipeline)
 {
    if (compute_pipeline->state.flags.base_workgroup) {
-      pvr_pds_compute_program_destroy(
+      pvr_pds_compute_base_workgroup_variant_program_finish(
          device,
          allocator,
-         &compute_pipeline->state.primary_program_base_workgroup_variant,
-         &compute_pipeline->state.primary_program_base_workgroup_variant_info);
+         &compute_pipeline->state.primary_base_workgroup_variant_program);
    }
 
    pvr_pds_compute_program_destroy(
diff --git a/src/imagination/vulkan/pvr_private.h b/src/imagination/vulkan/pvr_private.h
index 895dabd276a..c75fea9a1ad 100644
--- a/src/imagination/vulkan/pvr_private.h
+++ b/src/imagination/vulkan/pvr_private.h
@@ -1036,6 +1036,9 @@ struct pvr_compute_pipeline {
          bool uses_num_workgroups;
 
          uint32_t const_shared_reg_count;
+         uint32_t input_register_count;
+         uint32_t work_size;
+         uint32_t coefficient_register_count;
       } shader;
 
       struct {
@@ -1047,12 +1050,17 @@ struct pvr_compute_pipeline {
       struct pvr_pds_upload primary_program;
       struct pvr_pds_info primary_program_info;
 
-      struct pvr_pds_upload primary_program_base_workgroup_variant;
-      struct pvr_pds_info primary_program_base_workgroup_variant_info;
-      /* Offset within the PDS data section at which the base workgroup id
-       * resides.
-       */
-      uint32_t base_workgroup_ids_dword_offset;
+      struct pvr_pds_base_workgroup_program {
+         struct pvr_pds_upload code_upload;
+
+         uint32_t *data_section;
+         /* Offset within the PDS data section at which the base workgroup id
+          * resides.
+          */
+         uint32_t base_workgroup_data_patching_offset;
+
+         struct pvr_pds_info info;
+      } primary_base_workgroup_variant_program;
    } state;
 };
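
As a companion to the comments in pvr_pipeline.c above: the base workgroup
variant keeps its PDS data section host-side and records the dword offset of
the first base workgroup id, and since the ids are stored contiguously a
dispatch-time patch only needs that single offset. The sketch below is an
assumption for illustration only; the actual wiring is deferred to the
VK_KHR_device_group TODO in the commit.

/* Standalone illustration: the base workgroup ids live contiguously in the
 * PDS data section, so the three ids can be written back in one go at the
 * recorded dword offset before uploading the patched copy.
 */
#include <stdint.h>
#include <string.h>

static void
patch_base_workgroup(uint32_t *data_section,
                     uint32_t first_id_dword_offset,
                     const uint32_t base_workgroup[3])
{
   memcpy(&data_section[first_id_dword_offset],
          base_workgroup,
          3 * sizeof(uint32_t));
}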
 


