Mesa (master): v3dv: add a buffer to image copy path using a texel buffer

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Nov 17 12:17:46 UTC 2020


Module: Mesa
Branch: master
Commit: ba69c36ada5609ef2a43bb34ce9b48786c083aa9
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=ba69c36ada5609ef2a43bb34ce9b48786c083aa9

Author: Iago Toral Quiroga <itoral at igalia.com>
Date:   Thu Nov 12 10:43:54 2020 +0100

v3dv: add a buffer to image copy path using a texel buffer

This is much faster than the blit fallback (which requires uploading
the linear buffer to a tiled image) and the CPU path.

A simple stress test involving 100 buffer to image copies of a
single layer image with 10 mipmap levels provides the following
results:

Path           | Recording Time | Execution Time |
-------------------------------------------------|
Texel Buffer   |     2.954s      |     0.137s    |
-------------------------------------------------|
Blit           |    10.732s      |     0.148s    |
-------------------------------------------------|
CPU            |     0.002s      |     1.453s    |
-------------------------------------------------|

So generally speaking, this texel buffer copy path is the fastest
of the paths that can do partial copies; however, the CPU path might
provide better results in cases where command buffer recording time is
important to overall performance. This is probably the reason why
the CPU path seems to provide slightly better results for vkQuake2.

Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7651>

---

 src/broadcom/vulkan/v3dv_device.c    |    2 +
 src/broadcom/vulkan/v3dv_formats.c   |   10 +
 src/broadcom/vulkan/v3dv_meta_copy.c | 1095 +++++++++++++++++++++++++++++-----
 src/broadcom/vulkan/v3dv_private.h   |   24 +-
 4 files changed, 967 insertions(+), 164 deletions(-)

diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index ceba9bca193..50694e4374e 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -1327,6 +1327,7 @@ init_device_meta(struct v3dv_device *device)
    mtx_init(&device->meta.mtx, mtx_plain);
    v3dv_meta_clear_init(device);
    v3dv_meta_blit_init(device);
+   v3dv_meta_texel_buffer_copy_init(device);
 }
 
 static void
@@ -1335,6 +1336,7 @@ destroy_device_meta(struct v3dv_device *device)
    mtx_destroy(&device->meta.mtx);
    v3dv_meta_clear_finish(device);
    v3dv_meta_blit_finish(device);
+   v3dv_meta_texel_buffer_copy_finish(device);
 }
 
 VkResult
diff --git a/src/broadcom/vulkan/v3dv_formats.c b/src/broadcom/vulkan/v3dv_formats.c
index ac8b8e58f54..e316ab142bd 100644
--- a/src/broadcom/vulkan/v3dv_formats.c
+++ b/src/broadcom/vulkan/v3dv_formats.c
@@ -505,6 +505,16 @@ buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format
    return flags;
 }
 
+bool
+v3dv_buffer_format_supports_features(VkFormat vk_format,
+                                     VkFormatFeatureFlags features)
+{
+   const struct v3dv_format *v3dv_format = v3dv_get_format(vk_format);
+   const VkFormatFeatureFlags supported =
+      buffer_format_features(vk_format, v3dv_format);
+   return (supported & features) == features;
+}
+
 void
 v3dv_GetPhysicalDeviceFormatProperties(VkPhysicalDevice physicalDevice,
                                        VkFormat format,
diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c
index 98429081846..da91af994e1 100644
--- a/src/broadcom/vulkan/v3dv_meta_copy.c
+++ b/src/broadcom/vulkan/v3dv_meta_copy.c
@@ -78,6 +78,56 @@ v3dv_meta_blit_finish(struct v3dv_device *device)
    }
 }
 
+static uint32_t
+meta_texel_buffer_copy_key_hash(const void *key)
+{
+   return _mesa_hash_data(key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
+}
+
+static bool
+meta_texel_buffer_copy_key_compare(const void *key1, const void *key2)
+{
+   return memcmp(key1, key2, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE) == 0;
+}
+
+void
+v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device)
+{
+   for (uint32_t i = 0; i < 3; i++) {
+      device->meta.texel_buffer_copy.cache[i] =
+         _mesa_hash_table_create(NULL,
+                                 meta_texel_buffer_copy_key_hash,
+                                 meta_texel_buffer_copy_key_compare);
+   }
+}
+
+void
+v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device)
+{
+   VkDevice _device = v3dv_device_to_handle(device);
+
+   for (uint32_t i = 0; i < 3; i++) {
+      hash_table_foreach(device->meta.texel_buffer_copy.cache[i], entry) {
+         struct v3dv_meta_texel_buffer_copy_pipeline *item = entry->data;
+         v3dv_DestroyPipeline(_device, item->pipeline, &device->alloc);
+         v3dv_DestroyRenderPass(_device, item->pass, &device->alloc);
+         v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->alloc);
+         vk_free(&device->alloc, item);
+      }
+      _mesa_hash_table_destroy(device->meta.texel_buffer_copy.cache[i], NULL);
+   }
+
+   if (device->meta.texel_buffer_copy.playout) {
+      v3dv_DestroyPipelineLayout(_device, device->meta.texel_buffer_copy.playout,
+                                 &device->alloc);
+   }
+
+   if (device->meta.texel_buffer_copy.dslayout) {
+      v3dv_DestroyDescriptorSetLayout(_device, device->meta.texel_buffer_copy.dslayout,
+                                      &device->alloc);
+   }
+}
+
 static inline bool
 can_use_tlb(struct v3dv_image *image,
             const VkOffset3D *offset,
@@ -2521,181 +2571,767 @@ create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
       return true;
    return false;
 }
-/**
- * Returns true if the implementation supports the requested operation (even if
- * it failed to process it, for example, due to an out-of-memory error).
- */
+
 static bool
-copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
-                          struct v3dv_image *image,
-                          struct v3dv_buffer *buffer,
-                          const VkBufferImageCopy *region)
+create_texel_buffer_copy_pipeline_layout(struct v3dv_device *device,
+                                         VkDescriptorSetLayout *ds_layout,
+                                         VkPipelineLayout *p_layout)
 {
-   bool handled = false;
+   VkResult result;
 
-   /* Generally, the bpp of the data in the buffer matches that of the
-    * destination image. The exception is the case where we are uploading
-    * stencil (8bpp) to a combined d24s8 image (32bpp).
-    */
-   uint32_t buffer_bpp = image->cpp;
+   if (*ds_layout == 0) {
+      VkDescriptorSetLayoutBinding ds_layout_binding = {
+         .binding = 0,
+         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
+      };
+      VkDescriptorSetLayoutCreateInfo ds_layout_info = {
+         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+         .bindingCount = 1,
+         .pBindings = &ds_layout_binding,
+      };
+      result =
+         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
+                                        &ds_layout_info,
+                                        &device->alloc,
+                                        ds_layout);
+      if (result != VK_SUCCESS)
+         return false;
+   }
 
-   VkImageAspectFlags aspect = region->imageSubresource.aspectMask;
+   assert(*p_layout == 0);
+   VkPipelineLayoutCreateInfo p_layout_info = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+      .setLayoutCount = 1,
+      .pSetLayouts = ds_layout,
+      .pushConstantRangeCount = 1,
+      .pPushConstantRanges =
+         &(VkPushConstantRange) { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 20 },
+   };
 
-   /* We are about to upload the buffer data to an image so we can then
-    * blit that to our destination region. Because we are going to implement
-    * the copy as a blit, we want our blit source and destination formats to be
-    * the same (to avoid any format conversions), so we choose a canonical
-    * format that matches the destination image bpp.
+   result =
+      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
+                                &p_layout_info,
+                                &device->alloc,
+                                p_layout);
+   return result == VK_SUCCESS;
+}
+
+static VkResult
+create_texel_buffer_copy_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   /* If this is not the first pool we create for this command buffer
+    * size it based on the size of the currently exhausted pool.
     */
-   VkColorComponentFlags cmask = 0; /* Write all components */
-   VkFormat src_format;
-   VkFormat dst_format;
-   switch (buffer_bpp) {
-   case 16:
-      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
-      src_format = VK_FORMAT_R32G32B32A32_UINT;
-      dst_format = src_format;
-      break;
-   case 8:
-      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
-      src_format = VK_FORMAT_R16G16B16A16_UINT;
-      dst_format = src_format;
-      break;
-   case 4:
-      switch (aspect) {
-      case VK_IMAGE_ASPECT_COLOR_BIT:
-         src_format = VK_FORMAT_R8G8B8A8_UINT;
-         dst_format = src_format;
-         break;
-      case VK_IMAGE_ASPECT_DEPTH_BIT:
-         assert(image->vk_format == VK_FORMAT_D32_SFLOAT ||
-                image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
-                image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32);
-         if (image->tiling != VK_IMAGE_TILING_LINEAR) {
-            src_format = image->vk_format;
-         } else {
-            src_format = VK_FORMAT_R8G8B8A8_UINT;
-            aspect = VK_IMAGE_ASPECT_COLOR_BIT;
-            if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
-               cmask = VK_COLOR_COMPONENT_R_BIT |
-                       VK_COLOR_COMPONENT_G_BIT |
-                       VK_COLOR_COMPONENT_B_BIT;
-            }
-         }
-         dst_format = src_format;
-         break;
-      case VK_IMAGE_ASPECT_STENCIL_BIT:
-         /* Since we don't support separate stencil this is always a stencil
-          * copy to a combined depth/stencil image. Becasue we don't support
-          * separate stencil images, we upload the buffer data to a compatible
-          * color R8UI image, and implement the blit as a compatible color
-          * blit to an RGBA8UI destination masking out writes to components
-          * GBA (which map to the D24 component of a S8D24 image).
-          */
-         assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT);
-         buffer_bpp = 1;
-         src_format = VK_FORMAT_R8_UINT;
-         dst_format = VK_FORMAT_R8G8B8A8_UINT;
-         cmask = VK_COLOR_COMPONENT_R_BIT;
-         aspect = VK_IMAGE_ASPECT_COLOR_BIT;
-         break;
-      default:
-         unreachable("unsupported aspect");
-         return handled;
-      };
-      break;
-   case 2:
-      aspect = VK_IMAGE_ASPECT_COLOR_BIT;
-      src_format = VK_FORMAT_R16_UINT;
-      dst_format = src_format;
-      break;
-   case 1:
-      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
-      src_format = VK_FORMAT_R8_UINT;
-      dst_format = src_format;
-      break;
-   default:
-      unreachable("unsupported bit-size");
-      return handled;
+   uint32_t descriptor_count = 64;
+   if (cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE) {
+      struct v3dv_descriptor_pool *exhausted_pool =
+         v3dv_descriptor_pool_from_handle(cmd_buffer->meta.texel_buffer_copy.dspool);
+      descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
    }
 
-   /* We should be able to handle the blit if we reached here */
-   handled = true;
-
-   /* Obtain the 2D buffer region spec */
-   uint32_t buf_width, buf_height;
-   if (region->bufferRowLength == 0)
-      buf_width = region->imageExtent.width;
-   else
-      buf_width = region->bufferRowLength;
+   /* Create the descriptor pool */
+   cmd_buffer->meta.texel_buffer_copy.dspool = VK_NULL_HANDLE;
+   VkDescriptorPoolSize pool_size = {
+      .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+      .descriptorCount = descriptor_count,
+   };
+   VkDescriptorPoolCreateInfo info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+      .maxSets = descriptor_count,
+      .poolSizeCount = 1,
+      .pPoolSizes = &pool_size,
+      .flags = 0,
+   };
+   VkResult result =
+      v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
+                                &info,
+                                &cmd_buffer->device->alloc,
+                                &cmd_buffer->meta.texel_buffer_copy.dspool);
 
-   if (region->bufferImageHeight == 0)
-      buf_height = region->imageExtent.height;
-   else
-      buf_height = region->bufferImageHeight;
+   if (result == VK_SUCCESS) {
+      assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
+      v3dv_cmd_buffer_add_private_obj(
+         cmd_buffer, (uintptr_t)cmd_buffer->meta.texel_buffer_copy.dspool,
+         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
+   }
 
-   /* If the image is compressed, the bpp refers to blocks, not pixels */
-   uint32_t block_width = vk_format_get_blockwidth(image->vk_format);
-   uint32_t block_height = vk_format_get_blockheight(image->vk_format);
-   buf_width = buf_width / block_width;
-   buf_height = buf_height / block_height;
+   return result;
+}
 
-   /* Compute layers to copy */
-   uint32_t num_layers;
-   if (image->type != VK_IMAGE_TYPE_3D)
-      num_layers = region->imageSubresource.layerCount;
-   else
-      num_layers = region->imageExtent.depth;
-   assert(num_layers > 0);
+static VkResult
+allocate_texel_buffer_copy_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
+                                          VkDescriptorSet *set)
+{
+   /* Make sure we have a descriptor pool */
+   VkResult result;
+   if (cmd_buffer->meta.texel_buffer_copy.dspool == VK_NULL_HANDLE) {
+      result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+   assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
 
+   /* Allocate descriptor set */
    struct v3dv_device *device = cmd_buffer->device;
    VkDevice _device = v3dv_device_to_handle(device);
-   for (uint32_t i = 0; i < num_layers; i++) {
-      /* Create the source blit image from the source buffer.
-       *
-       * We can't texture from a linear image, so we can't just setup a blit
-       * straight from the buffer contents. Instead, we need to upload the
-       * buffer to a tiled image, and then copy that image to the selected
-       * region of the destination.
-       *
-       * FIXME: we could do better than this is we use a blit shader that has
-       * a UBO (for the buffer) as input instead of a texture. Then we would
-       * have to do some arithmetics in the shader to identify the offset into
-       * the UBO that we need to load for each pixel in the destination image
-       * (we would need to support all the possible copy formats we have above).
-       */
-      VkImageCreateInfo image_info = {
-         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
-         .imageType = VK_IMAGE_TYPE_2D,
-         .format = src_format,
-         .extent = { buf_width, buf_height, 1 },
-         .mipLevels = 1,
-         .arrayLayers = 1,
-         .samples = VK_SAMPLE_COUNT_1_BIT,
-         .tiling = VK_IMAGE_TILING_OPTIMAL,
-         .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
-                  VK_IMAGE_USAGE_TRANSFER_DST_BIT,
-         .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
-         .queueFamilyIndexCount = 0,
-         .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
-      };
+   VkDescriptorSetAllocateInfo info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+      .descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool,
+      .descriptorSetCount = 1,
+      .pSetLayouts = &device->meta.texel_buffer_copy.dslayout,
+   };
+   result = v3dv_AllocateDescriptorSets(_device, &info, set);
 
-      VkImage buffer_image;
-      VkResult result =
-         v3dv_CreateImage(_device, &image_info, &device->alloc, &buffer_image);
-      if (result != VK_SUCCESS)
-         return handled;
+   /* If we ran out of pool space, grow the pool and try again */
+   if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
+      result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
+      if (result == VK_SUCCESS) {
+         info.descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool;
+         result = v3dv_AllocateDescriptorSets(_device, &info, set);
+      }
+   }
 
-      v3dv_cmd_buffer_add_private_obj(
-         cmd_buffer, (uintptr_t)buffer_image,
-         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
+   return result;
+}
 
-      /* Allocate and bind memory for the image */
-      VkDeviceMemory mem;
-      VkMemoryRequirements reqs;
-      v3dv_GetImageMemoryRequirements(_device, buffer_image, &reqs);
-      VkMemoryAllocateInfo alloc_info = {
-         .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+static void
+get_texel_buffer_copy_pipeline_cache_key(VkFormat format,
+                                         uint8_t *key)
+{
+   memset(key, 0, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
+
+   uint32_t *p = (uint32_t *) key;
+
+   *p = format;
+   p++;
+
+   assert(((uint8_t*)p - key) == V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
+}
+
+static bool
+create_blit_render_pass(struct v3dv_device *device,
+                        VkFormat dst_format,
+                        VkFormat src_format,
+                        VkRenderPass *pass_load,
+                        VkRenderPass *pass_no_load);
+
+static nir_ssa_def *gen_rect_vertices(nir_builder *b);
+
+static bool
+create_pipeline(struct v3dv_device *device,
+                struct v3dv_render_pass *pass,
+                struct nir_shader *vs_nir,
+                struct nir_shader *fs_nir,
+                const VkPipelineVertexInputStateCreateInfo *vi_state,
+                const VkPipelineDepthStencilStateCreateInfo *ds_state,
+                const VkPipelineColorBlendStateCreateInfo *cb_state,
+                const VkPipelineMultisampleStateCreateInfo *ms_state,
+                const VkPipelineLayout layout,
+                VkPipeline *pipeline);
+
+static nir_shader *
+get_texel_buffer_copy_vs()
+{
+   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
+                                                  "meta texel buffer copy vs");
+   nir_variable *vs_out_pos =
+      nir_variable_create(b.shader, nir_var_shader_out,
+                          glsl_vec4_type(), "gl_Position");
+   vs_out_pos->data.location = VARYING_SLOT_POS;
+
+   nir_ssa_def *pos = gen_rect_vertices(&b);
+   nir_store_var(&b, vs_out_pos, pos, 0xf);
+
+   return b.shader;
+}
+
+static nir_ssa_def *
+load_frag_coord(nir_builder *b)
+{
+   nir_foreach_shader_in_variable(var, b->shader) {
+      if (var->data.location == VARYING_SLOT_POS)
+         return nir_load_var(b, var);
+   }
+   nir_variable *pos = nir_variable_create(b->shader, nir_var_shader_in,
+                                           glsl_vec4_type(), NULL);
+   pos->data.location = VARYING_SLOT_POS;
+   return nir_load_var(b, pos);
+}
+
+static nir_shader *
+get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format)
+{
+   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
+                                                  "meta texel buffer copy fs");
+
+   /* We only use the copy from texel buffer shader to implement
+    * copy_buffer_to_image_shader, which always selects a compatible integer
+    * format for the copy.
+    */
+   assert(vk_format_is_int(format));
+
+   /* Fragment shader output color */
+   nir_variable *fs_out_color =
+      nir_variable_create(b.shader, nir_var_shader_out,
+                          glsl_uvec4_type(), "out_color");
+   fs_out_color->data.location = FRAG_RESULT_DATA0;
+
+   /* Texel buffer input */
+   const struct glsl_type *sampler_type =
+      glsl_sampler_type(GLSL_SAMPLER_DIM_BUF, false, false, GLSL_TYPE_UINT);
+   nir_variable *sampler =
+      nir_variable_create(b.shader, nir_var_uniform, sampler_type, "texel_buf");
+   sampler->data.descriptor_set = 0;
+   sampler->data.binding = 0;
+
+   /* Load the box describing the pixel region we want to copy from the
+    * texel buffer.
+    */
+   nir_intrinsic_instr *box =
+      nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+   box->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+   nir_intrinsic_set_base(box, 0);
+   nir_intrinsic_set_range(box, 16);
+   box->num_components = 4;
+   nir_ssa_dest_init(&box->instr, &box->dest, 4, 32, "box");
+   nir_builder_instr_insert(&b, &box->instr);
+
+   /* Load the buffer stride (this comes in texel units) */
+   nir_intrinsic_instr *stride =
+      nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+   stride->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+   nir_intrinsic_set_base(stride, 16);
+   nir_intrinsic_set_range(stride, 4);
+   stride->num_components = 1;
+   nir_ssa_dest_init(&stride->instr, &stride->dest, 1, 32, "buffer stride");
+   nir_builder_instr_insert(&b, &stride->instr);
+
+   /* Load the buffer offset (this comes in texel units) */
+   nir_intrinsic_instr *offset =
+      nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+   offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+   nir_intrinsic_set_base(offset, 20);
+   nir_intrinsic_set_range(offset, 4);
+   offset->num_components = 1;
+   nir_ssa_dest_init(&offset->instr, &offset->dest, 1, 32, "buffer offset");
+   nir_builder_instr_insert(&b, &offset->instr);
+
+   /* Pixel coordinate must be within the box, otherwise our buffer offsets
+    * could be out of bounds.
+    */
+   nir_ssa_def *coord = nir_f2i32(&b, load_frag_coord(&b));
+   nir_ssa_def *cond =
+      nir_iand(&b,
+         nir_iand(&b, nir_ige(&b, nir_channel(&b, coord, 0),
+                                  nir_channel(&b, &box->dest.ssa, 0)),
+                      nir_ige(&b, nir_channel(&b, coord, 1),
+                                  nir_channel(&b, &box->dest.ssa, 1))),
+         nir_iand(&b, nir_ige(&b, nir_channel(&b, &box->dest.ssa, 2),
+                                  nir_channel(&b, coord, 0)),
+                      nir_ige(&b, nir_channel(&b, &box->dest.ssa, 3),
+                                  nir_channel(&b, coord, 1))));
+
+   nir_if *if_stmt = nir_push_if(&b, cond);
+      /* Load pixel data from texel buffer based on the x,y offset of the pixel
+       * within the box. Texel buffers are 1D arrays of texels.
+       */
+      nir_ssa_def *x_offset =
+         nir_isub(&b, nir_channel(&b, coord, 0),
+                      nir_channel(&b, &box->dest.ssa, 0));
+      nir_ssa_def *y_offset =
+         nir_isub(&b, nir_channel(&b, coord, 1),
+                      nir_channel(&b, &box->dest.ssa, 1));
+      nir_ssa_def *texel_offset =
+         nir_iadd(&b, nir_iadd(&b, &offset->dest.ssa, x_offset),
+                      nir_imul(&b, y_offset, &stride->dest.ssa));
+
+      nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa;
+      nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
+      tex->sampler_dim = GLSL_SAMPLER_DIM_BUF;
+      tex->op = nir_texop_txf;
+      tex->src[0].src_type = nir_tex_src_coord;
+      tex->src[0].src = nir_src_for_ssa(texel_offset);
+      tex->src[1].src_type = nir_tex_src_texture_deref;
+      tex->src[1].src = nir_src_for_ssa(tex_deref);
+      tex->dest_type = nir_type_uint;
+      tex->is_array = false;
+      tex->coord_components = 1;
+      nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "texel buffer result");
+      nir_builder_instr_insert(&b, &tex->instr);
+
+      nir_store_var(&b, fs_out_color, &tex->dest.ssa, 0xf);
+   nir_pop_if(&b, if_stmt);
+
+   return b.shader;
+}
+
+static bool
+create_texel_buffer_copy_pipeline(struct v3dv_device *device,
+                                  VkFormat format,
+                                  VkRenderPass _pass,
+                                  VkPipelineLayout pipeline_layout,
+                                  VkPipeline *pipeline)
+{
+   struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
+
+   assert(vk_format_is_color(format));
+
+   nir_shader *vs_nir = get_texel_buffer_copy_vs();
+   nir_shader *fs_nir = get_texel_buffer_copy_fs(device, format);
+
+   const VkPipelineVertexInputStateCreateInfo vi_state = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+      .vertexBindingDescriptionCount = 0,
+      .vertexAttributeDescriptionCount = 0,
+   };
+
+   VkPipelineDepthStencilStateCreateInfo ds_state = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+   };
+
+   VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
+   blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
+      .blendEnable = false,
+      .colorWriteMask = VK_COLOR_COMPONENT_R_BIT |
+                        VK_COLOR_COMPONENT_G_BIT |
+                        VK_COLOR_COMPONENT_B_BIT |
+                        VK_COLOR_COMPONENT_A_BIT,
+   };
+
+   const VkPipelineColorBlendStateCreateInfo cb_state = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+      .logicOpEnable = false,
+      .attachmentCount = 1,
+      .pAttachments = blend_att_state
+   };
+
+   const VkPipelineMultisampleStateCreateInfo ms_state = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+      .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
+      .sampleShadingEnable = false,
+      .pSampleMask = NULL,
+      .alphaToCoverageEnable = false,
+      .alphaToOneEnable = false,
+   };
+
+   return create_pipeline(device,
+                          pass,
+                          vs_nir, fs_nir,
+                          &vi_state,
+                          &ds_state,
+                          &cb_state,
+                          &ms_state,
+                          pipeline_layout,
+                          pipeline);
+}
+
+static bool
+get_copy_texel_buffer_pipeline(
+   struct v3dv_device *device,
+   VkFormat format,
+   VkImageType image_type,
+   struct v3dv_meta_texel_buffer_copy_pipeline **pipeline)
+{
+   bool ok = true;
+
+   mtx_lock(&device->meta.mtx);
+   if (!device->meta.texel_buffer_copy.playout) {
+      ok = create_texel_buffer_copy_pipeline_layout(
+               device,
+               &device->meta.texel_buffer_copy.dslayout,
+               &device->meta.texel_buffer_copy.playout);
+   }
+   mtx_unlock(&device->meta.mtx);
+   if (!ok)
+      return false;
+
+   uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
+   get_texel_buffer_copy_pipeline_cache_key(format, key);
+   mtx_lock(&device->meta.mtx);
+   struct hash_entry *entry =
+      _mesa_hash_table_search(device->meta.texel_buffer_copy.cache[image_type],
+                              &key);
+   if (entry) {
+      mtx_unlock(&device->meta.mtx);
+      *pipeline = entry->data;
+      return true;
+   }
+
+   *pipeline = vk_zalloc2(&device->alloc, NULL, sizeof(**pipeline), 8,
+                          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+   if (*pipeline == NULL)
+      goto fail;
+
+   /* The blit render pass is compatible */
+   ok = create_blit_render_pass(device, format, format,
+                                &(*pipeline)->pass,
+                                &(*pipeline)->pass_no_load);
+   if (!ok)
+      goto fail;
+
+   ok =
+      create_texel_buffer_copy_pipeline(device,
+                                        format,
+                                        (*pipeline)->pass,
+                                        device->meta.texel_buffer_copy.playout,
+                                        &(*pipeline)->pipeline);
+   if (!ok)
+      goto fail;
+
+   _mesa_hash_table_insert(device->meta.texel_buffer_copy.cache[image_type],
+                           &key, *pipeline);
+
+   mtx_unlock(&device->meta.mtx);
+   return true;
+
+fail:
+   mtx_unlock(&device->meta.mtx);
+
+   VkDevice _device = v3dv_device_to_handle(device);
+   if (*pipeline) {
+      if ((*pipeline)->pass)
+         v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->alloc);
+      if ((*pipeline)->pipeline)
+         v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->alloc);
+      vk_free(&device->alloc, *pipeline);
+      *pipeline = NULL;
+   }
+
+   return false;
+}
+
+static bool
+texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
+                         VkImageAspectFlags aspect,
+                         struct v3dv_image *image,
+                         uint32_t num_layers,
+                         VkFormat dst_format,
+                         VkFormat src_format,
+                         struct v3dv_buffer *buffer,
+                         uint32_t buf_width,
+                         uint32_t buf_height,
+                         uint32_t buffer_bpp,
+                         VkColorComponentFlags cmask,
+                         const VkBufferImageCopy *region)
+{
+   VkResult result;
+   bool handled = false;
+
+   /* FIXME: we only only handle exact copies for now. */
+   if (src_format != dst_format)
+      return handled;
+
+   VkFormat format = dst_format;
+
+   /* FIXME: we only handle color copies for now. */
+   if (aspect != VK_IMAGE_ASPECT_COLOR_BIT)
+      return handled;
+
+   /* FIXME: we only handle uncompressed images for now. */
+   if (vk_format_is_compressed(image->vk_format))
+      return handled;
+
+   /* FIXME: support partial color masks */
+   const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
+                                            VK_COLOR_COMPONENT_G_BIT |
+                                            VK_COLOR_COMPONENT_B_BIT |
+                                            VK_COLOR_COMPONENT_A_BIT;
+   if (cmask == 0)
+      cmask = full_cmask;
+
+   if (cmask != full_cmask)
+      return handled;
+
+   /* The buffer needs to have VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT
+    * so we can bind it as a texel buffer. Otherwise, the buffer view
+    * we create below won't setup the texture state that we need for this.
+    */
+   if (!(buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) {
+      if (v3dv_buffer_format_supports_features(
+            format, VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT)) {
+         buffer->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
+      } else {
+         return handled;
+      }
+   }
+
+   /* At this point we should be able to handle the copy unless an unexpected
+    * error occurs, such as an OOM.
+    */
+   handled = true;
+
+   /* Get the texel buffer copy pipeline */
+   struct v3dv_meta_texel_buffer_copy_pipeline *pipeline = NULL;
+   bool ok = get_copy_texel_buffer_pipeline(cmd_buffer->device,
+                                            format, image->type, &pipeline);
+   if (!ok)
+      return handled;
+   assert(pipeline && pipeline->pipeline && pipeline->pass);
+
+   /* Setup descriptor set for the source texel buffer. We don't have to
+    * register the descriptor as a private command buffer object since
+    * all descriptors will be freed automatically with the descriptor
+    * pool.
+    */
+   VkDescriptorSet set;
+   result = allocate_texel_buffer_copy_descriptor_set(cmd_buffer, &set);
+   if (result != VK_SUCCESS)
+      return handled;
+
+   /* FIXME: for some reason passing region->bufferOffset here for the
+    * offset field doesn't work, making the following CTS tests fail:
+    *
+    * dEQP-VK.api.copy_and_blit.core.buffer_to_image.*buffer_offset*
+    *
+    * So instead we pass 0 here and we pass the offset in texels as a push
+    * constant to the shader, which seems to work correctly.
+    */
+   VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
+   VkBufferViewCreateInfo buffer_view_info = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
+      .buffer = v3dv_buffer_to_handle(buffer),
+      .format = format,
+      .offset = 0,
+      .range = VK_WHOLE_SIZE,
+   };
+
+   VkBufferView texel_buffer_view;
+   result = v3dv_CreateBufferView(_device, &buffer_view_info,
+                                  &cmd_buffer->device->alloc,
+                                  &texel_buffer_view);
+   if (result != VK_SUCCESS)
+      return handled;
+
+   v3dv_cmd_buffer_add_private_obj(
+      cmd_buffer, (uintptr_t)texel_buffer_view,
+      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyBufferView);
+
+   VkWriteDescriptorSet write = {
+      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+      .dstSet = set,
+      .dstBinding = 0,
+      .dstArrayElement = 0,
+      .descriptorCount = 1,
+      .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+      .pTexelBufferView = &texel_buffer_view,
+   };
+   v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
+
+   /* Push command buffer state before starting meta operation */
+   v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
+
+   /* Bind common state for all layers */
+   VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
+   v3dv_CmdBindPipeline(_cmd_buffer,
+                        VK_PIPELINE_BIND_POINT_GRAPHICS,
+                        pipeline->pipeline);
+
+   v3dv_CmdBindDescriptorSets(_cmd_buffer,
+                              VK_PIPELINE_BIND_POINT_GRAPHICS,
+                              cmd_buffer->device->meta.texel_buffer_copy.playout,
+                              0, 1, &set,
+                              0, NULL);
+
+   const VkViewport viewport = {
+      .x = region->imageOffset.x,
+      .y = region->imageOffset.y,
+      .width = region->imageExtent.width,
+      .height = region->imageExtent.height,
+      .minDepth = 0.0f,
+      .maxDepth = 1.0f
+   };
+   v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
+   const VkRect2D scissor = {
+      .offset = { region->imageOffset.x, region->imageOffset.y },
+      .extent = { region->imageExtent.width, region->imageExtent.height }
+   };
+   v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
+
+   uint32_t dirty_dynamic_state =
+      V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
+
+   bool can_skip_tlb_load = false;
+   const VkRect2D render_area = {
+     .offset = { region->imageOffset.x, region->imageOffset.y },
+     .extent = { region->imageExtent.width, region->imageExtent.height },
+   };
+
+   /* Record per-layer commands */
+   for (uint32_t i = 0; i < num_layers; i++) {
+      /* Setup framebuffer for this layer.
+       *
+       * FIXME: once we support geometry shaders, we should be able to have
+       *        a single layered framebuffer and emit just one draw call for
+       *        all layers using layered rendering.
+       */
+      VkImageViewCreateInfo image_view_info = {
+         .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+         .image = v3dv_image_to_handle(image),
+         .viewType = v3dv_image_type_to_view_type(image->type),
+         .format = format,
+         .subresourceRange = {
+            .aspectMask = aspect,
+            .baseMipLevel = region->imageSubresource.mipLevel,
+            .levelCount = 1,
+            .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
+            .layerCount = 1
+         },
+      };
+      VkImageView image_view;
+      result = v3dv_CreateImageView(_device, &image_view_info,
+                                    &cmd_buffer->device->alloc, &image_view);
+      if (result != VK_SUCCESS)
+         goto fail;
+
+      VkFramebufferCreateInfo fb_info = {
+         .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
+         .renderPass = pipeline->pass,
+         .attachmentCount = 1,
+         .pAttachments = &image_view,
+         .width = u_minify(image->extent.width,
+                           region->imageSubresource.mipLevel),
+         .height = u_minify(image->extent.height,
+                            region->imageSubresource.mipLevel),
+         .layers = 1,
+      };
+
+      VkFramebuffer fb;
+      result = v3dv_CreateFramebuffer(_device, &fb_info,
+                                      &cmd_buffer->device->alloc, &fb);
+      if (result != VK_SUCCESS)
+         goto fail;
+
+      v3dv_cmd_buffer_add_private_obj(
+         cmd_buffer, (uintptr_t)fb,
+         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
+
+      /* If the region we are about to blit is tile-aligned, then we can
+       * use the render pass version that won't pre-load the tile buffer
+       * with the dst image contents before the copy.
+       *
+       * The region is always the same for all layers, so we only need to
+       * compute this once.
+       */
+      if (i == 0) {
+         struct v3dv_render_pass *pipeline_pass =
+            v3dv_render_pass_from_handle(pipeline->pass);
+         can_skip_tlb_load =
+            v3dv_subpass_area_is_tile_aligned(&render_area,
+                                              v3dv_framebuffer_from_handle(fb),
+                                              pipeline_pass, 0);
+      }
+
+      VkRenderPassBeginInfo rp_info = {
+         .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+         .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
+                                           pipeline->pass,
+         .framebuffer = fb,
+         .renderArea = render_area,
+         .clearValueCount = 0,
+      };
+
+      /* Record draw */
+      v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE);
+      struct v3dv_job *job = cmd_buffer->state.job;
+      if (!job)
+         goto fail;
+
+      const VkDeviceSize buf_offset =
+         region->bufferOffset / buffer_bpp  + i * buf_height * buf_width;
+      uint32_t push_data[6] = {
+         region->imageOffset.x,
+         region->imageOffset.y,
+         region->imageOffset.x + region->imageExtent.width - 1,
+         region->imageOffset.y + region->imageExtent.height - 1,
+         buf_width,
+         buf_offset,
+      };
+
+      v3dv_CmdPushConstants(_cmd_buffer,
+                            cmd_buffer->device->meta.texel_buffer_copy.playout,
+                            VK_SHADER_STAGE_FRAGMENT_BIT,
+                            0, sizeof(push_data), &push_data);
+
+      v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
+
+      v3dv_CmdEndRenderPass(_cmd_buffer);
+   }
+
+fail:
+   v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);
+
+   return handled;
+}
+
+static bool
+copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
+                          VkImageAspectFlags aspect,
+                          struct v3dv_image *image,
+                          uint32_t num_layers,
+                          VkFormat dst_format,
+                          VkFormat src_format,
+                          struct v3dv_buffer *buffer,
+                          uint32_t buf_width,
+                          uint32_t buf_height,
+                          uint32_t buffer_bpp,
+                          VkColorComponentFlags cmask,
+                          const VkBufferImageCopy *region)
+{
+   perf_debug("Falling back to blit path for buffer to image copy.\n");
+
+   /* If the image is compressed, the bpp refers to blocks, not pixels */
+   uint32_t block_width = vk_format_get_blockwidth(image->vk_format);
+   uint32_t block_height = vk_format_get_blockheight(image->vk_format);
+   buf_width = buf_width / block_width;
+   buf_height = buf_height / block_height;
+
+   /* We should have configured the blit to use a supported format  */
+   bool handled = true;
+
+   struct v3dv_device *device = cmd_buffer->device;
+   VkDevice _device = v3dv_device_to_handle(device);
+   for (uint32_t i = 0; i < num_layers; i++) {
+      /* Otherwise, since we can't sample linear images we need to upload the
+       * linear buffer to a tiled image that we can use as a blit source, which
+       * is slow.
+       */
+      VkImageCreateInfo image_info = {
+         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+         .imageType = VK_IMAGE_TYPE_2D,
+         .format = src_format,
+         .extent = { buf_width, buf_height, 1 },
+         .mipLevels = 1,
+         .arrayLayers = 1,
+         .samples = VK_SAMPLE_COUNT_1_BIT,
+         .tiling = VK_IMAGE_TILING_OPTIMAL,
+         .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+                  VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+         .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+         .queueFamilyIndexCount = 0,
+         .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
+      };
+
+      VkImage buffer_image;
+      VkResult result =
+         v3dv_CreateImage(_device, &image_info, &device->alloc, &buffer_image);
+      if (result != VK_SUCCESS)
+         return handled;
+
+      v3dv_cmd_buffer_add_private_obj(
+         cmd_buffer, (uintptr_t)buffer_image,
+         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
+
+      /* Allocate and bind memory for the image */
+      VkDeviceMemory mem;
+      VkMemoryRequirements reqs;
+      v3dv_GetImageMemoryRequirements(_device, buffer_image, &reqs);
+      VkMemoryAllocateInfo alloc_info = {
+         .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
          .allocationSize = reqs.size,
          .memoryTypeIndex = 0,
       };
@@ -2712,10 +3348,10 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
          return handled;
 
       /* Upload buffer contents for the selected layer */
-      VkDeviceSize buffer_offset =
+      const VkDeviceSize buf_offset_bytes =
          region->bufferOffset + i * buf_height * buf_width * buffer_bpp;
       const VkBufferImageCopy buffer_image_copy = {
-         .bufferOffset = buffer_offset,
+         .bufferOffset = buf_offset_bytes,
          .bufferRowLength = region->bufferRowLength / block_width,
          .bufferImageHeight = region->bufferImageHeight / block_height,
          .imageSubresource = {
@@ -2793,8 +3429,139 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
       }
    }
 
-   assert(handled);
-   return true;
+   return handled;
+}
+
+/**
+ * Returns true if the implementation supports the requested operation (even if
+ * it failed to process it, for example, due to an out-of-memory error).
+ */
+static bool
+copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
+                            struct v3dv_image *image,
+                            struct v3dv_buffer *buffer,
+                            const VkBufferImageCopy *region,
+                            bool use_texel_buffer)
+{
+   /* Generally, the bpp of the data in the buffer matches that of the
+    * destination image. The exception is the case where we are uploading
+    * stencil (8bpp) to a combined d24s8 image (32bpp).
+    */
+   uint32_t buf_bpp = image->cpp;
+
+   VkImageAspectFlags aspect = region->imageSubresource.aspectMask;
+
+   /* We are about to upload the buffer data to an image so we can then
+    * blit that to our destination region. Because we are going to implement
+    * the copy as a blit, we want our blit source and destination formats to be
+    * the same (to avoid any format conversions), so we choose a canonical
+    * format that matches the destination image bpp.
+    */
+   VkColorComponentFlags cmask = 0; /* Write all components */
+   VkFormat src_format;
+   VkFormat dst_format;
+   switch (buf_bpp) {
+   case 16:
+      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+      src_format = VK_FORMAT_R32G32B32A32_UINT;
+      dst_format = src_format;
+      break;
+   case 8:
+      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+      src_format = VK_FORMAT_R16G16B16A16_UINT;
+      dst_format = src_format;
+      break;
+   case 4:
+      switch (aspect) {
+      case VK_IMAGE_ASPECT_COLOR_BIT:
+         src_format = VK_FORMAT_R8G8B8A8_UINT;
+         dst_format = src_format;
+         break;
+      case VK_IMAGE_ASPECT_DEPTH_BIT:
+         assert(image->vk_format == VK_FORMAT_D32_SFLOAT ||
+                image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
+                image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32);
+         if (image->tiling != VK_IMAGE_TILING_LINEAR) {
+            src_format = image->vk_format;
+         } else {
+            src_format = VK_FORMAT_R8G8B8A8_UINT;
+            aspect = VK_IMAGE_ASPECT_COLOR_BIT;
+            if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
+               cmask = VK_COLOR_COMPONENT_R_BIT |
+                       VK_COLOR_COMPONENT_G_BIT |
+                       VK_COLOR_COMPONENT_B_BIT;
+            }
+         }
+         dst_format = src_format;
+         break;
+      case VK_IMAGE_ASPECT_STENCIL_BIT:
+         /* Since we don't support separate stencil this is always a stencil
+          * copy to a combined depth/stencil image. Because we don't support
+          * separate stencil images, we upload the buffer data to a compatible
+          * color R8UI image, and implement the blit as a compatible color
+          * blit to an RGBA8UI destination masking out writes to components
+          * GBA (which map to the D24 component of a S8D24 image).
+          */
+         assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT);
+         buf_bpp = 1;
+         src_format = VK_FORMAT_R8_UINT;
+         dst_format = VK_FORMAT_R8G8B8A8_UINT;
+         cmask = VK_COLOR_COMPONENT_R_BIT;
+         aspect = VK_IMAGE_ASPECT_COLOR_BIT;
+         break;
+      default:
+         unreachable("unsupported aspect");
+         return false;
+      };
+      break;
+   case 2:
+      aspect = VK_IMAGE_ASPECT_COLOR_BIT;
+      src_format = VK_FORMAT_R16_UINT;
+      dst_format = src_format;
+      break;
+   case 1:
+      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+      src_format = VK_FORMAT_R8_UINT;
+      dst_format = src_format;
+      break;
+   default:
+      unreachable("unsupported bit-size");
+      return false;
+   }
+
+   /* Obtain the 2D buffer region spec */
+   uint32_t buf_width, buf_height;
+   if (region->bufferRowLength == 0)
+      buf_width = region->imageExtent.width;
+   else
+      buf_width = region->bufferRowLength;
+
+   if (region->bufferImageHeight == 0)
+      buf_height = region->imageExtent.height;
+   else
+      buf_height = region->bufferImageHeight;
+
+   /* Compute layers to copy */
+   uint32_t num_layers;
+   if (image->type != VK_IMAGE_TYPE_3D)
+      num_layers = region->imageSubresource.layerCount;
+   else
+      num_layers = region->imageExtent.depth;
+   assert(num_layers > 0);
+
+   if (use_texel_buffer) {
+      return texel_buffer_shader_copy(cmd_buffer, aspect,
+                                      image, num_layers,
+                                      dst_format, src_format,
+                                      buffer, buf_width, buf_height, buf_bpp,
+                                      cmask, region);
+   } else {
+      return copy_buffer_to_image_blit(cmd_buffer, aspect,
+                                       image, num_layers,
+                                       dst_format, src_format,
+                                       buffer, buf_width, buf_height, buf_bpp,
+                                       cmask, region);
+   }
 }
 
 /**
@@ -2882,9 +3649,11 @@ v3dv_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
          continue;
       if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &pRegions[i]))
          continue;
+      if (copy_buffer_to_image_shader(cmd_buffer, image, buffer, &pRegions[i], true))
+         continue;
       if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer, &pRegions[i]))
          continue;
-      if (copy_buffer_to_image_blit(cmd_buffer, image, buffer, &pRegions[i]))
+      if (copy_buffer_to_image_shader(cmd_buffer, image, buffer, &pRegions[i], false))
          continue;
       unreachable("Unsupported buffer to image copy.");
    }
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 2c22b8da6d0..0b474e8b69e 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -174,6 +174,9 @@ void v3dv_meta_clear_finish(struct v3dv_device *device);
 void v3dv_meta_blit_init(struct v3dv_device *device);
 void v3dv_meta_blit_finish(struct v3dv_device *device);
 
+void v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device);
+void v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device);
+
 struct v3dv_app_info {
    const char *app_name;
    uint32_t app_version;
@@ -246,7 +249,8 @@ struct v3dv_queue {
    struct v3dv_job *noop_job;
 };
 
-#define V3DV_META_BLIT_CACHE_KEY_SIZE (4 * sizeof(uint32_t))
+#define V3DV_META_BLIT_CACHE_KEY_SIZE              (4 * sizeof(uint32_t))
+#define V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE (1 * sizeof(uint32_t))
 
 struct v3dv_meta_color_clear_pipeline {
    VkPipeline pipeline;
@@ -267,6 +271,13 @@ struct v3dv_meta_blit_pipeline {
    uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
 };
 
+struct v3dv_meta_texel_buffer_copy_pipeline {
+   VkPipeline pipeline;
+   VkRenderPass pass;
+   VkRenderPass pass_no_load;
+   uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
+};
+
 struct v3dv_pipeline_cache_stats {
    uint32_t miss;
    uint32_t hit;
@@ -322,6 +333,11 @@ struct v3dv_device {
          VkPipelineLayout playout;
          struct hash_table *cache[3]; /* v3dv_meta_blit_pipeline for 1d, 2d, 3d */
       } blit;
+      struct {
+         VkDescriptorSetLayout dslayout;
+         VkPipelineLayout playout;
+         struct hash_table *cache[3]; /* v3dv_meta_texel_buffer_copy_pipeline for 1d, 2d, 3d */
+      } texel_buffer_copy;
    } meta;
 
    struct v3dv_bo_cache {
@@ -1166,6 +1182,10 @@ struct v3dv_cmd_buffer {
          /* The current descriptor pool for blit sources */
          VkDescriptorPool dspool;
       } blit;
+      struct {
+         /* The current descriptor pool for texel buffer copy sources */
+         VkDescriptorPool dspool;
+      } texel_buffer_copy;
    } meta;
 
    /* List of jobs in the command buffer. For primary command buffers it
@@ -1784,6 +1804,8 @@ void v3dv_get_internal_type_bpp_for_output_format(uint32_t format, uint32_t *typ
 uint8_t v3dv_get_tex_return_size(const struct v3dv_format *vf, bool compare_enable);
 bool v3dv_tfu_supports_tex_format(const struct v3d_device_info *devinfo,
                                   uint32_t tex_format);
+bool v3dv_buffer_format_supports_features(VkFormat vk_format,
+                                          VkFormatFeatureFlags features);
 bool v3dv_format_supports_tlb_resolve(const struct v3dv_format *format);
 
 uint32_t v3d_utile_width(int cpp);



More information about the mesa-commit mailing list