Mesa (main): radv: upload shader binaries of a pipeline contiguously in memory

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Dec 2 07:39:17 UTC 2021


Module: Mesa
Branch: main
Commit: 3fa2220838ad8b039bc46900ee57b4c107bd2a77
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=3fa2220838ad8b039bc46900ee57b4c107bd2a77

Author: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Date:   Fri Nov  5 13:58:12 2021 +0100

radv: upload shader binaries of a pipeline contiguously in memory

RGP expects the shaders of a pipeline to be contiguous in memory;
otherwise capture sizes explode because we have to generate huge
captures full of holes.

This reduces the capture size of Cyberpunk 2077 from ~3.5 GiB to
~180 MiB, roughly a 20x reduction.

This should also help with future pipeline libraries.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13690>

---

 src/amd/vulkan/radv_cmd_buffer.c     |  13 +----
 src/amd/vulkan/radv_debug.c          |   5 --
 src/amd/vulkan/radv_pipeline.c       | 108 +++++++++++++++++++++++++++++------
 src/amd/vulkan/radv_pipeline_cache.c |  55 +++++++++++++++---
 src/amd/vulkan/radv_private.h        |  14 +++++
 src/amd/vulkan/radv_shader.c         |  36 +++---------
 src/amd/vulkan/radv_shader.h         |  10 +++-
 7 files changed, 169 insertions(+), 72 deletions(-)
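
The mechanism of this commit boils down to a two-pass pack: sum the
aligned code sizes of all shaders in the pipeline, allocate one slab,
then copy each binary at its running offset. The self-contained C
sketch below distills that idea; the names align_u32, pack_binaries
and ALLOC_ALIGN are illustrative stand-ins, not RADV's own code (the
real implementation is radv_upload_shaders in the diff below).

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    /* Round 'x' up to 'a', which must be a power of two, in the
     * style of Mesa's align() macro. */
    static uint32_t align_u32(uint32_t x, uint32_t a)
    {
       return (x + a - 1) & ~(a - 1);
    }

    #define ALLOC_ALIGN 256 /* stand-in for RADV_SHADER_ALLOC_ALIGNMENT */

    /* Pass 1: compute the total aligned size. Pass 2: copy each
     * binary into one contiguous allocation and record its offset;
     * the GPU VA of shader i is then base VA + offsets[i]. */
    static char *pack_binaries(const void **code, const uint32_t *size,
                               unsigned count, uint32_t *offsets)
    {
       uint32_t total = 0;
       for (unsigned i = 0; i < count; i++)
          total += align_u32(size[i], ALLOC_ALIGN);

       char *slab = malloc(total); /* one allocation, no holes */
       if (!slab)
          return NULL;

       uint32_t offset = 0;
       for (unsigned i = 0; i < count; i++) {
          memcpy(slab + offset, code[i], size[i]);
          offsets[i] = offset;
          offset += align_u32(size[i], ALLOC_ALIGN);
       }
       return slab;
    }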

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index ead18219fb0..f3fcce1712e 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1391,15 +1391,7 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
 
    radv_emit_batch_break_on_new_ps(cmd_buffer);
 
-   for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
-      if (!pipeline->shaders[i])
-         continue;
-
-      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->shaders[i]->bo);
-   }
-
-   if (radv_pipeline_has_gs_copy_shader(pipeline))
-      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->gs_copy_shader->bo);
+   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->slab->alloc->arena->bo);
 
    if (unlikely(cmd_buffer->device->trace_bo))
       radv_save_pipeline(cmd_buffer, pipeline);
@@ -4848,8 +4840,7 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipel
    cmd_buffer->compute_scratch_waves_wanted =
       MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->max_waves);
 
-   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
-                      pipeline->shaders[MESA_SHADER_COMPUTE]->bo);
+   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->slab->alloc->arena->bo);
 
    if (unlikely(cmd_buffer->device->trace_bo))
       radv_save_pipeline(cmd_buffer, pipeline);
diff --git a/src/amd/vulkan/radv_debug.c b/src/amd/vulkan/radv_debug.c
index f0dd9abd750..29943ac2ee0 100644
--- a/src/amd/vulkan/radv_debug.c
+++ b/src/amd/vulkan/radv_debug.c
@@ -832,10 +832,6 @@ radv_trap_handler_init(struct radv_device *device)
       return false;
    }
 
-   result = ws->buffer_make_resident(ws, device->trap_handler_shader->bo, true);
-   if (result != VK_SUCCESS)
-      return false;
-
    result = ws->buffer_create(ws, TMA_BO_SIZE, 256, RADEON_DOMAIN_VRAM,
                               RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
                                  RADEON_FLAG_ZERO_VRAM | RADEON_FLAG_32BIT,
@@ -873,7 +869,6 @@ radv_trap_handler_finish(struct radv_device *device)
    struct radeon_winsys *ws = device->ws;
 
    if (unlikely(device->trap_handler_shader)) {
-      ws->buffer_make_resident(ws, device->trap_handler_shader->bo, false);
       radv_shader_destroy(device, device->trap_handler_shader);
    }
 
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 17679d51450..1c806d60abb 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -171,6 +171,37 @@ radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline)
    return !!pipeline->gs_copy_shader;
 }
 
+static struct radv_pipeline_slab *
+radv_pipeline_slab_create(struct radv_device *device, struct radv_pipeline *pipeline,
+                          uint32_t code_size)
+{
+   struct radv_pipeline_slab *slab;
+
+   slab = calloc(1, sizeof(*slab));
+   if (!slab)
+      return NULL;
+
+   slab->ref_count = 1;
+
+   slab->alloc = radv_alloc_shader_memory(device, code_size, pipeline);
+   if (!slab->alloc) {
+      free(slab);
+      return NULL;
+   }
+
+   return slab;
+}
+
+void
+radv_pipeline_slab_destroy(struct radv_device *device, struct radv_pipeline_slab *slab)
+{
+   if (!p_atomic_dec_zero(&slab->ref_count))
+      return;
+
+   radv_free_shader_memory(device, slab->alloc);
+   free(slab);
+}
+
 void
 radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline,
                       const VkAllocationCallbacks *allocator)
@@ -183,6 +214,9 @@ radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline
       free(pipeline->library.stages);
    }
 
+   if (pipeline->slab)
+      radv_pipeline_slab_destroy(device, pipeline->slab);
+
    for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i)
       if (pipeline->shaders[i])
          radv_shader_destroy(device, pipeline->shaders[i]);
@@ -3354,6 +3388,61 @@ non_uniform_access_callback(const nir_src *src, void *_)
    return nir_chase_binding(*src).success ? 0x2 : 0x3;
 }
 
+
+VkResult
+radv_upload_shaders(struct radv_device *device, struct radv_pipeline *pipeline,
+                    struct radv_shader_binary **binaries, struct radv_shader_binary *gs_copy_binary)
+{
+   uint32_t code_size = 0;
+
+   /* Compute the total code size. */
+   for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
+      struct radv_shader *shader = pipeline->shaders[i];
+      if (!shader)
+         continue;
+
+      code_size += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
+   }
+
+   if (pipeline->gs_copy_shader) {
+      code_size += align(pipeline->gs_copy_shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
+   }
+
+   /* Allocate memory for all shader binaries. */
+   pipeline->slab = radv_pipeline_slab_create(device, pipeline, code_size);
+   if (!pipeline->slab)
+      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+
+   /* Upload shader binaries. */
+   uint64_t slab_va = radv_buffer_get_va(pipeline->slab->alloc->arena->bo);
+   uint32_t slab_offset = pipeline->slab->alloc->offset;
+   char *slab_ptr = pipeline->slab->alloc->arena->ptr;
+
+   for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
+      struct radv_shader *shader = pipeline->shaders[i];
+      if (!shader)
+         continue;
+
+      shader->va = slab_va + slab_offset;
+
+      void *dest_ptr = slab_ptr + slab_offset;
+      if (!radv_shader_binary_upload(device, binaries[i], shader, dest_ptr))
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+      slab_offset += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
+   }
+
+   if (pipeline->gs_copy_shader) {
+      pipeline->gs_copy_shader->va = slab_va + slab_offset;
+
+      void *dest_ptr = slab_ptr + slab_offset;
+      if (!radv_shader_binary_upload(device, gs_copy_binary, pipeline->gs_copy_shader, dest_ptr))
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   return VK_SUCCESS;
+}
+
 VkResult
 radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout *pipeline_layout,
                     struct radv_device *device, struct radv_pipeline_cache *cache,
@@ -3411,11 +3500,6 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
        radv_create_shaders_from_pipeline_cache(device, cache, hash, pipeline,
                                                stack_sizes, num_stack_sizes,
                                                &found_in_application_cache)) {
-      if (modules[MESA_SHADER_GEOMETRY] && !pipeline->shaders[MESA_SHADER_GEOMETRY]->info.is_ngg) {
-         /* We use the CS slot because graphics pipelines might use all the other ones. */
-         pipeline->gs_copy_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
-         pipeline->shaders[MESA_SHADER_COMPUTE] = NULL;
-      }
       radv_stop_feedback(pipeline_feedback, found_in_application_cache);
       return VK_SUCCESS;
    }
@@ -3692,19 +3776,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
    }
 
    /* Upload shader binaries. */
-   for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
-      struct radv_shader *shader = pipeline->shaders[i];
-      if (!shader)
-         continue;
-
-      if (!radv_shader_binary_upload(device, binaries[i], shader))
-         return VK_ERROR_OUT_OF_DEVICE_MEMORY;
-
-      if (i == MESA_SHADER_GEOMETRY && pipeline->gs_copy_shader) {
-         if (!radv_shader_binary_upload(device, gs_copy_binary, pipeline->gs_copy_shader))
-            return VK_ERROR_OUT_OF_DEVICE_MEMORY;
-      }
-   }
+   radv_upload_shaders(device, pipeline, binaries, gs_copy_binary);
 
    if (!keep_executable_info) {
       if (pipeline->gs_copy_shader) {
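
The slab introduced above is shared by reference counting:
radv_pipeline_slab_create starts the count at 1, each additional
holder (a cache entry, or a pipeline reusing that entry) increments
it, and radv_pipeline_slab_destroy frees the backing memory only when
the count drops to zero. A generic sketch of that contract, using C11
atomics in place of Mesa's p_atomic_* helpers; all names here are
illustrative:

    #include <stdatomic.h>
    #include <stdlib.h>

    struct slab {
       atomic_uint ref_count;
       void *mem; /* stand-in for the radv_shader_arena_block */
    };

    static struct slab *slab_create(size_t size)
    {
       struct slab *s = calloc(1, sizeof(*s));
       if (!s)
          return NULL;
       atomic_init(&s->ref_count, 1); /* creator holds one reference */
       s->mem = malloc(size);
       if (!s->mem) {
          free(s);
          return NULL;
       }
       return s;
    }

    static struct slab *slab_ref(struct slab *s)
    {
       atomic_fetch_add(&s->ref_count, 1); /* like p_atomic_inc */
       return s;
    }

    static void slab_unref(struct slab *s)
    {
       /* Like p_atomic_dec_zero: free only when the last holder lets go. */
       if (atomic_fetch_sub(&s->ref_count, 1) == 1) {
          free(s->mem);
          free(s);
       }
    }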
diff --git a/src/amd/vulkan/radv_pipeline_cache.c b/src/amd/vulkan/radv_pipeline_cache.c
index b75b9bc73d5..b7fb11614dc 100644
--- a/src/amd/vulkan/radv_pipeline_cache.c
+++ b/src/amd/vulkan/radv_pipeline_cache.c
@@ -39,6 +39,7 @@ struct cache_entry {
    uint32_t binary_sizes[MESA_VULKAN_SHADER_STAGES];
    uint32_t num_stack_sizes;
    struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES];
+   struct radv_pipeline_slab *slab;
    char code[0];
 };
 
@@ -94,6 +95,8 @@ radv_pipeline_cache_finish(struct radv_pipeline_cache *cache)
             if (cache->hash_table[i]->shaders[j])
                radv_shader_destroy(cache->device, cache->hash_table[i]->shaders[j]);
          }
+         if (cache->hash_table[i]->slab)
+            radv_pipeline_slab_destroy(cache->device, cache->hash_table[i]->slab);
          vk_free(&cache->alloc, cache->hash_table[i]);
       }
    mtx_destroy(&cache->mutex);
@@ -298,6 +301,7 @@ radv_create_shaders_from_pipeline_cache(
    uint32_t *num_stack_sizes, bool *found_in_application_cache)
 {
    struct cache_entry *entry;
+   VkResult result;
 
    if (!cache) {
       cache = device->mem_cache;
@@ -347,6 +351,9 @@ radv_create_shaders_from_pipeline_cache(
       }
    }
 
+   struct radv_shader_binary *binaries[MESA_VULKAN_SHADER_STAGES] = {NULL};
+   struct radv_shader_binary *gs_copy_binary = NULL;
+   bool needs_upload = false;
    char *p = entry->code;
    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
       if (!entry->shaders[i] && entry->binary_sizes[i]) {
@@ -356,13 +363,8 @@ radv_create_shaders_from_pipeline_cache(
 
          entry->shaders[i] = radv_shader_create(device, binary, false, true, NULL);
 
-         if (!radv_shader_binary_upload(device, binary, entry->shaders[i])) {
-            free(binary);
-            radv_pipeline_cache_unlock(cache);
-            return false;
-         }
-
-         free(binary);
+         needs_upload = true;
+         binaries[i] = binary;
       } else if (entry->binary_sizes[i]) {
          p += entry->binary_sizes[i];
       }
@@ -370,6 +372,33 @@ radv_create_shaders_from_pipeline_cache(
 
    memcpy(pipeline->shaders, entry->shaders, sizeof(entry->shaders));
 
+   if (pipeline->shaders[MESA_SHADER_GEOMETRY] &&
+       !pipeline->shaders[MESA_SHADER_GEOMETRY]->info.is_ngg) {
+      /* For the GS copy shader, RADV uses the compute shader slot to avoid a new cache entry. */
+      pipeline->gs_copy_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
+      pipeline->shaders[MESA_SHADER_COMPUTE] = NULL;
+      gs_copy_binary = binaries[MESA_SHADER_COMPUTE];
+   }
+
+   if (needs_upload) {
+      result = radv_upload_shaders(device, pipeline, binaries, gs_copy_binary);
+
+      for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
+         if (pipeline->shaders[i])
+            free(binaries[i]);
+      }
+      free(gs_copy_binary);
+
+      if (result != VK_SUCCESS) {
+         radv_pipeline_cache_unlock(cache);
+         return false;
+      }
+
+      entry->slab = pipeline->slab;
+   } else {
+      pipeline->slab = entry->slab;
+   }
+
    if (num_stack_sizes) {
       *num_stack_sizes = entry->num_stack_sizes;
       if (entry->num_stack_sizes) {
@@ -388,6 +417,7 @@ radv_create_shaders_from_pipeline_cache(
       for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i)
          if (entry->shaders[i])
             p_atomic_inc(&entry->shaders[i]->ref_count);
+      p_atomic_inc(&entry->slab->ref_count);
    }
 
    assert((uintptr_t)p <= (uintptr_t)entry + entry_size(entry));
@@ -417,6 +447,12 @@ radv_pipeline_cache_insert_shaders(struct radv_device *device, struct radv_pipel
          pipeline->shaders[i] = entry->shaders[i];
          p_atomic_inc(&pipeline->shaders[i]->ref_count);
       }
+
+      radv_pipeline_slab_destroy(cache->device, pipeline->slab);
+
+      pipeline->slab = entry->slab;
+      p_atomic_inc(&pipeline->slab->ref_count);
+
       radv_pipeline_cache_unlock(cache);
       return;
    }
@@ -499,6 +535,9 @@ radv_pipeline_cache_insert_shaders(struct radv_device *device, struct radv_pipel
       p_atomic_inc(&pipeline->shaders[i]->ref_count);
    }
 
+   entry->slab = pipeline->slab;
+   p_atomic_inc(&pipeline->slab->ref_count);
+
    radv_pipeline_cache_add_entry(cache, entry);
 
    cache->modified = true;
@@ -541,6 +580,7 @@ radv_pipeline_cache_load(struct radv_pipeline_cache *cache, const void *data, si
          memcpy(dest_entry, entry, size_of_entry);
          for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i)
             dest_entry->shaders[i] = NULL;
+         dest_entry->slab = NULL;
          radv_pipeline_cache_add_entry(cache, dest_entry);
       }
       p += size_of_entry;
@@ -638,6 +678,7 @@ radv_GetPipelineCacheData(VkDevice _device, VkPipelineCache _cache, size_t *pDat
       memcpy(p, entry, size_of_entry);
       for (int j = 0; j < MESA_VULKAN_SHADER_STAGES; ++j)
          ((struct cache_entry *)p)->shaders[j] = NULL;
+      ((struct cache_entry *)p)->slab = NULL;
       p = (char *)p + size_of_entry;
    }
    *pDataSize = (char *)p - (char *)pData;
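
Note how both radv_pipeline_cache_load and radv_GetPipelineCacheData
clear the slab pointer, exactly as they already do for the shader
pointers: a raw pointer is only meaningful inside the producing
process, so the serialized entry keeps nothing but the binaries, and
the slab is recreated on the next cache hit. A minimal sketch of that
serialization rule; the struct layout and names are illustrative:

    #include <stddef.h>
    #include <string.h>

    struct entry {
       void *slab;      /* process-local; meaningless once on disk */
       size_t code_size;
       char code[];     /* the portable payload */
    };

    /* 'dst' must have room for 'entry_size' bytes. */
    static void serialize_entry(void *dst, const struct entry *e,
                                size_t entry_size)
    {
       struct entry *out = dst;
       memcpy(out, e, entry_size);
       out->slab = NULL; /* same rule as dest_entry->slab = NULL above */
    }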
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index e52e9cb529f..4d4d10b2492 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -373,6 +373,10 @@ void radv_pipeline_cache_insert_shaders(
    struct radv_pipeline *pipeline, struct radv_shader_binary *const *binaries,
    const struct radv_pipeline_shader_stack_size *stack_sizes, uint32_t num_stack_sizes);
 
+VkResult radv_upload_shaders(struct radv_device *device, struct radv_pipeline *pipeline,
+                             struct radv_shader_binary **binaries,
+                             struct radv_shader_binary *gs_copy_binary);
+
 enum radv_blit_ds_layout {
    RADV_BLIT_DS_LAYOUT_TILE_ENABLE,
    RADV_BLIT_DS_LAYOUT_TILE_DISABLE,
@@ -1783,6 +1787,14 @@ struct radv_pipeline_shader_stack_size {
    uint32_t non_recursive_size;
 };
 
+struct radv_pipeline_slab {
+   uint32_t ref_count;
+
+   union radv_shader_arena_block *alloc;
+};
+
+void radv_pipeline_slab_destroy(struct radv_device *device, struct radv_pipeline_slab *slab);
+
 struct radv_pipeline {
    struct vk_object_base base;
    enum radv_pipeline_type type;
@@ -1790,6 +1802,8 @@ struct radv_pipeline {
    struct radv_device *device;
    struct radv_dynamic_state dynamic_state;
 
+   struct radv_pipeline_slab *slab;
+
    bool need_indirect_descriptor_sets;
    struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES];
    struct radv_shader *gs_copy_shader;
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 61ed8cfd8c4..2a401cf06d3 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -1089,8 +1089,8 @@ free_block_obj(struct radv_device *device, union radv_shader_arena_block *block)
  * this should allocate blocks for shaders fast and with no fragmentation, while still allowing
  * free'd memory to be re-used.
  */
-static union radv_shader_arena_block *
-alloc_shader_memory(struct radv_device *device, uint32_t size, void *ptr)
+union radv_shader_arena_block *
+radv_alloc_shader_memory(struct radv_device *device, uint32_t size, void *ptr)
 {
    size = align(size, RADV_SHADER_ALLOC_ALIGNMENT);
 
@@ -1209,8 +1209,8 @@ get_hole(struct radv_shader_arena *arena, struct list_head *head)
    return hole->freelist.prev ? hole : NULL;
 }
 
-static void
-free_shader_memory(struct radv_device *device, union radv_shader_arena_block *alloc)
+void
+radv_free_shader_memory(struct radv_device *device, union radv_shader_arena_block *alloc)
 {
    mtx_lock(&device->shader_arena_mutex);
 
@@ -1256,16 +1256,6 @@ free_shader_memory(struct radv_device *device, union radv_shader_arena_block *al
    mtx_unlock(&device->shader_arena_mutex);
 }
 
-static void *
-radv_alloc_shader_memory(struct radv_device *device, struct radv_shader *shader)
-{
-   shader->alloc = alloc_shader_memory(device, shader->code_size, shader);
-   if (!shader->alloc)
-      return NULL;
-   shader->bo = shader->alloc->arena->bo;
-   return shader->alloc->arena->ptr + shader->alloc->offset;
-}
-
 void
 radv_init_shader_arenas(struct radv_device *device)
 {
@@ -1608,16 +1598,8 @@ radv_open_rtld_binary(struct radv_device *device, const struct radv_shader *shad
 
 bool
 radv_shader_binary_upload(struct radv_device *device, const struct radv_shader_binary *binary,
-                          struct radv_shader *shader)
+                          struct radv_shader *shader, void *dest_ptr)
 {
-   void *dest_ptr;
-
-   dest_ptr = radv_alloc_shader_memory(device, shader);
-   if (!dest_ptr) {
-      free(shader);
-      return false;
-   }
-
    if (binary->type == RADV_BINARY_TYPE_RTLD) {
       struct ac_rtld_binary rtld_binary = {0};
 
@@ -1959,7 +1941,7 @@ upload_vs_prolog(struct radv_device *device, struct radv_prolog_binary *bin, uns
    if (!prolog)
       return NULL;
 
-   prolog->alloc = alloc_shader_memory(device, bin->code_size, NULL);
+   prolog->alloc = radv_alloc_shader_memory(device, bin->code_size, NULL);
    if (!prolog->alloc) {
       free(prolog);
       return NULL;
@@ -2027,8 +2009,6 @@ radv_shader_destroy(struct radv_device *device, struct radv_shader *shader)
    if (!p_atomic_dec_zero(&shader->ref_count))
       return;
 
-   free_shader_memory(device, shader->alloc);
-
    free(shader->spirv);
    free(shader->nir_string);
    free(shader->disasm_string);
@@ -2043,14 +2023,14 @@ radv_prolog_destroy(struct radv_device *device, struct radv_shader_prolog *prolo
    if (!prolog)
       return;
 
-   free_shader_memory(device, prolog->alloc);
+   radv_free_shader_memory(device, prolog->alloc);
    free(prolog);
 }
 
 uint64_t
 radv_shader_get_va(const struct radv_shader *shader)
 {
-   return radv_buffer_get_va(shader->bo) + shader->alloc->offset;
+   return shader->va;
 }
 
 struct radv_shader *
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index a09afe900c2..408f47115b8 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -454,8 +454,8 @@ union radv_shader_arena_block {
 struct radv_shader {
    uint32_t ref_count;
 
-   struct radeon_winsys_bo *bo;
-   union radv_shader_arena_block *alloc;
+   uint64_t va;
+
    struct ac_shader_config config;
    uint8_t *code_ptr;
    uint32_t code_size;
@@ -515,7 +515,11 @@ struct radv_shader *radv_shader_compile(
    struct radv_shader_binary **binary_out);
 
 bool radv_shader_binary_upload(struct radv_device *device, const struct radv_shader_binary *binary,
-                               struct radv_shader *shader);
+                               struct radv_shader *shader, void *dest_ptr);
+
+union radv_shader_arena_block *radv_alloc_shader_memory(struct radv_device *device, uint32_t size,
+                                                        void *ptr);
+void radv_free_shader_memory(struct radv_device *device, union radv_shader_arena_block *alloc);
 
 struct radv_shader *
 radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *nir,


