[Mesa-dev] [PATCH 2/2] gallium/u_vbuf: handle indirect multidraws correctly and efficiently (v2)
Marek Olšák
maraeo at gmail.com
Fri Jul 27 21:46:10 UTC 2018
On Fri, Jul 27, 2018 at 5:08 PM, Eric Anholt <eric at anholt.net> wrote:
> Marek Olšák <maraeo at gmail.com> writes:
>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> v2: need to do MAX{start+count} instead of MAX{count}
>> added piglit tests
>> ---
>> src/gallium/auxiliary/util/u_vbuf.c | 199 ++++++++++++++++++++++++----
>> 1 file changed, 175 insertions(+), 24 deletions(-)
>>
>> diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
>> index 746ff1085ce..ca53e6218fd 100644
>> --- a/src/gallium/auxiliary/util/u_vbuf.c
>> +++ b/src/gallium/auxiliary/util/u_vbuf.c
>> @@ -1124,20 +1124,45 @@ static void u_vbuf_set_driver_vertex_buffers(struct u_vbuf *mgr)
>> unsigned start_slot, count;
>>
>> start_slot = ffs(mgr->dirty_real_vb_mask) - 1;
>> count = util_last_bit(mgr->dirty_real_vb_mask >> start_slot);
>>
>> pipe->set_vertex_buffers(pipe, start_slot, count,
>> mgr->real_vertex_buffer + start_slot);
>> mgr->dirty_real_vb_mask = 0;
>> }
>>
>> +static void
>> +u_vbuf_split_indexed_multidraw(struct u_vbuf *mgr, struct pipe_draw_info *info,
>> + unsigned *indirect_data, unsigned stride,
>> + unsigned draw_count)
>> +{
>> + assert(info->index_size);
>> + info->indirect = NULL;
>> +
>> + for (unsigned i = 0; i < draw_count; i++) {
>> + unsigned offset = i * stride / 4;
>> +
>> + info->count = indirect_data[offset + 0];
>> + info->instance_count = indirect_data[offset + 1];
>> +
>> + if (!info->count || !info->instance_count)
>> + continue;
>> +
>> + info->start = indirect_data[offset + 2];
>> + info->index_bias = indirect_data[offset + 3];
>> + info->start_instance = indirect_data[offset + 4];
>> +
>> + u_vbuf_draw_vbo(mgr, info);
>> + }
>> +}
>> +
>> void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
>> {
>> struct pipe_context *pipe = mgr->pipe;
>> int start_vertex;
>> unsigned min_index;
>> unsigned num_vertices;
>> boolean unroll_indices = FALSE;
>> const uint32_t used_vb_mask = mgr->ve->used_vb_mask;
>> uint32_t user_vb_mask = mgr->user_vb_mask & used_vb_mask;
>> const uint32_t incompatible_vb_mask =
>> @@ -1153,47 +1178,172 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
>> if (mgr->dirty_real_vb_mask & used_vb_mask) {
>> u_vbuf_set_driver_vertex_buffers(mgr);
>> }
>>
>> pipe->draw_vbo(pipe, info);
>> return;
>> }
>>
>> new_info = *info;
>>
>> - /* Fallback. We need to know all the parameters. */
>> + /* Handle indirect (multi)draws. */
>> if (new_info.indirect) {
>> - struct pipe_transfer *transfer = NULL;
>> - int *data;
>> -
>> - if (new_info.index_size) {
>> - data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
>> - new_info.indirect->offset, 20,
>> - PIPE_TRANSFER_READ, &transfer);
>> - new_info.index_bias = data[3];
>> - new_info.start_instance = data[4];
>> - }
>> - else {
>> - data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
>> - new_info.indirect->offset, 16,
>> - PIPE_TRANSFER_READ, &transfer);
>> - new_info.start_instance = data[3];
>> + const struct pipe_draw_indirect_info *indirect = new_info.indirect;
>> + unsigned draw_count = 0;
>> +
>> + /* Get the number of draws. */
>> + if (indirect->indirect_draw_count) {
>> + pipe_buffer_read(pipe, indirect->indirect_draw_count,
>> + indirect->indirect_draw_count_offset,
>> + 4, &draw_count);
>> + } else {
>> + draw_count = indirect->draw_count;
>> }
>>
>> - new_info.count = data[0];
>> - new_info.instance_count = data[1];
>> - new_info.start = data[2];
>> - pipe_buffer_unmap(pipe, transfer);
>> - new_info.indirect = NULL;
>> -
>> - if (!new_info.count)
>> + if (!draw_count)
>> return;
>> +
>> + unsigned data_size = (draw_count - 1) * indirect->stride +
>> + (new_info.index_size ? 20 : 16);
>> + unsigned *data = alloca(data_size);
>
> I continue to believe that alloca isn't something we should be using on
> unbounded data_size like this.  We should be returning GL_OUT_OF_MEMORY
> when we fail, not segfaulting.
>
> We're already reading back the BOs, it's not like the allocation is a
> performance concern at this point.
radeonsi has optimizations that make reading back BOs cost nothing
beyond the read from uncached memory, i.e. there is no sync and no
mmap overhead. In that case, malloc can make a difference. I agree
that it may be a little harder to justify here, considering the
other things that u_vbuf does.
Marek
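
One way to reconcile the two concerns, shown here only as a sketch
(the 512-byte stack bound is an arbitrary assumption): keep a small
fixed buffer on the stack for the common case of one or a few draws,
and pay for malloc only when the indirect data is larger.

   /* Sketch: small stack buffer with a heap fallback for large
    * multidraws. */
   unsigned stack_data[128]; /* 512 bytes; covers typical draw counts */
   unsigned *data = stack_data;
   boolean heap_alloc = FALSE;

   if (data_size > sizeof(stack_data)) {
      data = malloc(data_size);
      if (!data)
         return; /* skip the draw rather than crash */
      heap_alloc = TRUE;
   }

   pipe_buffer_read(pipe, indirect->buffer, indirect->offset,
                    data_size, data);

   /* ... process the draws ... */

   if (heap_alloc)
      free(data);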