Mesa (master): gallium/u_threaded: split draws that don't fit in a batch

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed Apr 7 08:26:52 UTC 2021


Module: Mesa
Branch: master
Commit: 363c1ef0c0e10781799abe87ace5cc87621eb3bf
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=363c1ef0c0e10781799abe87ace5cc87621eb3bf

Author: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Date:   Tue Mar 30 15:51:50 2021 +0200

gallium/u_threaded: split draws that don't fit in a batch

Multidraws with a large draw count will try to call tc_add_slot_based_call
with a payload larger than TC_CALLS_PER_BATCH.

This commit splits these draws into multiple commands to avoid this issue.

Cc: mesa-stable
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4534
Reviewed-by: Marek Olšák <marek.olsak at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9920>

---

 src/gallium/auxiliary/util/u_threaded_context.c | 100 ++++++++++++++++--------
 1 file changed, 69 insertions(+), 31 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index d2259dd7cb7..058509f8f37 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -264,7 +264,7 @@ tc_add_sized_call(struct threaded_context *tc, enum tc_call_id id,
                   unsigned num_call_slots)
 {
    struct tc_batch *next = &tc->batch_slots[tc->next];
-
+   assert(num_call_slots <= TC_CALLS_PER_BATCH);
    tc_debug_check(tc);
 
    if (unlikely(next->num_total_call_slots + num_call_slots > TC_CALLS_PER_BATCH)) {
@@ -2527,6 +2527,10 @@ tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info,
       return;
    }
 
+   const int draw_overhead_bytes = offsetof(struct tc_call, payload) + sizeof(struct tc_draw_multi);
+   const int one_draw_payload_bytes = sizeof(((struct tc_draw_multi*)NULL)->slot[0]);
+   const int slots_for_one_draw = DIV_ROUND_UP(draw_overhead_bytes + one_draw_payload_bytes,
+                                               sizeof(struct tc_call));
    /* Multi draw. */
    if (index_size && has_user_indices) {
       struct pipe_resource *buffer = NULL;
@@ -2553,43 +2557,77 @@ tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info,
       if (unlikely(!buffer))
          return;
 
-      struct tc_draw_multi *p =
-         tc_add_slot_based_call(tc, TC_CALL_draw_multi, tc_draw_multi,
-                                num_draws);
-      memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX);
-      p->info.index.resource = buffer;
-      p->num_draws = num_draws;
+      int total_offset = 0;
+      while (num_draws) {
+         struct tc_batch *next = &tc->batch_slots[tc->next];
+
+         int nb_slots_left = TC_CALLS_PER_BATCH - next->num_total_call_slots;
+         /* If there isn't enough place for one draw, try to fill the next one */
+         if (nb_slots_left < slots_for_one_draw)
+            nb_slots_left = TC_CALLS_PER_BATCH;
+         const int size_left_bytes = nb_slots_left * sizeof(struct tc_call);
 
-      /* Upload index buffers. */
-      for (unsigned i = 0, offset = 0; i < num_draws; i++) {
-         unsigned count = draws[i].count;
+         /* How many draws can we fit in the current batch */
+         const int dr = MIN2(num_draws, (size_left_bytes - draw_overhead_bytes) / one_draw_payload_bytes);
 
-         if (!count) {
-            p->slot[i].start = 0;
-            p->slot[i].count = 0;
-            continue;
+         struct tc_draw_multi *p =
+            tc_add_slot_based_call(tc, TC_CALL_draw_multi, tc_draw_multi,
+                                   dr);
+         memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX);
+         p->info.index.resource = buffer;
+         p->num_draws = dr;
+
+         /* Upload index buffers. */
+         for (unsigned i = 0, offset = 0; i < dr; i++) {
+            unsigned count = draws[i + total_offset].count;
+
+            if (!count) {
+               p->slot[i].start = 0;
+               p->slot[i].count = 0;
+               continue;
+            }
+
+            unsigned size = count << index_size_shift;
+            memcpy(ptr + offset,
+                   (uint8_t*)info->index.user +
+                   (draws[i + total_offset].start << index_size_shift), size);
+            p->slot[i].start = (buffer_offset + offset) >> index_size_shift;
+            p->slot[i].count = count;
+            offset += size;
          }
 
-         unsigned size = count << index_size_shift;
-         memcpy(ptr + offset,
-                (uint8_t*)info->index.user +
-                (draws[i].start << index_size_shift), size);
-         p->slot[i].start = (buffer_offset + offset) >> index_size_shift;
-         p->slot[i].count = count;
-         offset += size;
+         total_offset += dr;
+         num_draws -= dr;
       }
    } else {
-      /* Non-indexed call or indexed with a real index buffer. */
-      struct tc_draw_multi *p =
-         tc_add_slot_based_call(tc, TC_CALL_draw_multi, tc_draw_multi,
-                                num_draws);
-      if (index_size && !info->take_index_buffer_ownership) {
-         tc_set_resource_reference(&p->info.index.resource,
-                                   info->index.resource);
+      int total_offset = 0;
+      while (num_draws) {
+         struct tc_batch *next = &tc->batch_slots[tc->next];
+
+         int nb_slots_left = TC_CALLS_PER_BATCH - next->num_total_call_slots;
+         /* If there isn't enough place for one draw, try to fill the next one */
+         if (nb_slots_left < slots_for_one_draw)
+            nb_slots_left = TC_CALLS_PER_BATCH;
+         const int size_left_bytes = nb_slots_left * sizeof(struct tc_call);
+
+         /* How many draws can we fit in the current batch */
+         const int dr = MIN2(num_draws, (size_left_bytes - draw_overhead_bytes) / one_draw_payload_bytes);
+
+         /* Non-indexed call or indexed with a real index buffer. */
+         struct tc_draw_multi *p =
+            tc_add_slot_based_call(tc, TC_CALL_draw_multi, tc_draw_multi,
+                                   dr);
+         if (index_size && !info->take_index_buffer_ownership) {
+            tc_set_resource_reference(&p->info.index.resource,
+                                      info->index.resource);
+         }
+         memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX);
+         p->num_draws = dr;
+         memcpy(p->slot, &draws[total_offset], sizeof(draws[0]) * dr);
+         num_draws -= dr;
+
+         total_offset += dr;
       }
-      memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX);
-      p->num_draws = num_draws;
-      memcpy(p->slot, draws, sizeof(draws[0]) * num_draws);
    }
 }
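
A minimal, self-contained sketch of the splitting arithmetic described above. The
constants and the loop bookkeeping here are hypothetical stand-ins (SLOT_SIZE,
CALLS_PER_BATCH, DRAW_OVERHEAD_BYTES, ONE_DRAW_BYTES and the printf in place of a
real tc_add_slot_based_call), not Mesa's actual tc_call/tc_draw_multi definitions;
only the chunking idea mirrors the commit.

#include <stdio.h>

#define SLOT_SIZE            16   /* hypothetical size of one call slot, in bytes */
#define CALLS_PER_BATCH      64   /* hypothetical stand-in for TC_CALLS_PER_BATCH */
#define DRAW_OVERHEAD_BYTES  24   /* hypothetical per-call header before the per-draw payload */
#define ONE_DRAW_BYTES       8    /* hypothetical size of one per-draw entry */

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define MIN2(a, b)         ((a) < (b) ? (a) : (b))

int main(void)
{
   int num_draws = 1000;           /* a multidraw too large for a single batch */
   int used_slots_in_batch = 50;   /* slots already consumed in the current batch */

   /* Smallest call that still carries one draw, rounded up to whole slots. */
   const int slots_for_one_draw =
      DIV_ROUND_UP(DRAW_OVERHEAD_BYTES + ONE_DRAW_BYTES, SLOT_SIZE);

   int total_offset = 0;
   while (num_draws) {
      int slots_left = CALLS_PER_BATCH - used_slots_in_batch;

      /* If even a single draw doesn't fit, assume the batch gets flushed and
       * start filling a fresh one (the real code relies on tc_add_sized_call
       * to flush when the batch is full). */
      if (slots_left < slots_for_one_draw) {
         slots_left = CALLS_PER_BATCH;
         used_slots_in_batch = 0;
      }

      const int bytes_left = slots_left * SLOT_SIZE;

      /* How many draws fit in the space left in this batch. */
      const int dr = MIN2(num_draws,
                          (bytes_left - DRAW_OVERHEAD_BYTES) / ONE_DRAW_BYTES);

      /* One tc_draw_multi-style call would be emitted here covering
       * draws[total_offset .. total_offset + dr - 1]. */
      printf("emit draw_multi: draws %d..%d (%d draws)\n",
             total_offset, total_offset + dr - 1, dr);

      used_slots_in_batch +=
         DIV_ROUND_UP(DRAW_OVERHEAD_BYTES + dr * ONE_DRAW_BYTES, SLOT_SIZE);

      total_offset += dr;
      num_draws -= dr;
   }
   return 0;
}

Each iteration never emits more draws than fit in the remaining slots, so no single
call payload can exceed the batch size, which is the assertion the commit adds to
tc_add_sized_call.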
 


