[Mesa-stable] [PATCH] radv/winsys: allow to submit up to 4 IBs for chips without chaining

Juan A. Suarez Romero jasuarez at igalia.com
Wed Apr 25 11:06:14 UTC 2018


Thanks!

I'm enqueing it for next 18.0 stable.

	J.A.

On Tue, 2018-04-24 at 10:18 -0700, Bas Nieuwenhuizen wrote:
> Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
> 
> On Tue, Apr 24, 2018 at 1:33 AM, Samuel Pitoiset
> <samuel.pitoiset at gmail.com> wrote:
> > The SI family doesn't support chaining which means the maximum
> > size in dwords per CS is limited. When that limit was reached
> > we failed to submit the CS and the application crashed.
> > 
> > This patch allows to submit up to 4 IBs which is currently the
> > limit, but recent amdgpu supports more than that.
> > 
> > Please note that we can reach the limit of 4 IBs per submit
> > but currently we can't improve that. The only solution is to
> > upgrade libdrm. That will be improved later but for now this
> > should fix crashes on SI or when using RADV_DEBUG=noibs.
> > 
> > Fixes: 36cb5508e89 ("radv/winsys: Fail early on overgrown cs.")
> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105775
> > Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
> > ---
> >  src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 219 ++++++++++++++----
> >  1 file changed, 169 insertions(+), 50 deletions(-)
> > 
> > diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
> > index 5632b1d4ee2..4195865a85d 100644
> > --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
> > +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
> > @@ -66,6 +66,10 @@ struct radv_amdgpu_cs {
> >         struct radeon_winsys_bo     **virtual_buffers;
> >         uint8_t                     *virtual_buffer_priorities;
> >         int                         *virtual_buffer_hash_table;
> > +
> > +       /* For chips that don't support chaining. */
> > +       struct radeon_winsys_cs     *old_cs_buffers;
> > +       unsigned                    num_old_cs_buffers;
> >  };
> > 
> >  static inline struct radv_amdgpu_cs *
> > @@ -166,6 +170,12 @@ static void radv_amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
> >         for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
> >                 cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]);
> > 
> > +       for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) {
> > +               struct radeon_winsys_cs *rcs = &cs->old_cs_buffers[i];
> > +               free(rcs->buf);
> > +       }
> > +
> > +       free(cs->old_cs_buffers);
> >         free(cs->old_ib_buffers);
> >         free(cs->virtual_buffers);
> >         free(cs->virtual_buffer_priorities);
> > @@ -251,9 +261,46 @@ static void radv_amdgpu_cs_grow(struct radeon_winsys_cs *_cs, size_t min_size)
> >                 /* The total ib size cannot exceed limit_dws dwords. */
> >                 if (ib_dws > limit_dws)
> >                 {
> > -                       cs->failed = true;
> > +                       /* The maximum size in dwords has been reached,
> > +                        * try to allocate a new one.
> > +                        */
> > +                       if (cs->num_old_cs_buffers + 1 >= AMDGPU_CS_MAX_IBS_PER_SUBMIT) {
> > +                               /* TODO: Allow to submit more than 4 IBs. */
> > +                               fprintf(stderr, "amdgpu: Maximum number of IBs "
> > +                                               "per submit reached.\n");
> > +                               cs->failed = true;
> > +                               cs->base.cdw = 0;
> > +                               return;
> > +                       }
> > +
> > +                       cs->old_cs_buffers =
> > +                               realloc(cs->old_cs_buffers,
> > +                                       (cs->num_old_cs_buffers + 1) * sizeof(*cs->old_cs_buffers));
> > +                       if (!cs->old_cs_buffers) {
> > +                               cs->failed = true;
> > +                               cs->base.cdw = 0;
> > +                               return;
> > +                       }
> > +
> > +                       /* Store the current one for submitting it later. */
> > +                       cs->old_cs_buffers[cs->num_old_cs_buffers].cdw = cs->base.cdw;
> > +                       cs->old_cs_buffers[cs->num_old_cs_buffers].max_dw = cs->base.max_dw;
> > +                       cs->old_cs_buffers[cs->num_old_cs_buffers].buf = cs->base.buf;
> > +                       cs->num_old_cs_buffers++;
> > +
> > +                       /* Reset the cs, it will be re-allocated below. */
> >                         cs->base.cdw = 0;
> > -                       return;
> > +                       cs->base.buf = NULL;
> > +
> > +                       /* Re-compute the number of dwords to allocate. */
> > +                       ib_dws = MAX2(cs->base.cdw + min_size,
> > +                                     MIN2(cs->base.max_dw * 2, limit_dws));
> > +                       if (ib_dws > limit_dws) {
> > +                               fprintf(stderr, "amdgpu: Too high number of "
> > +                                               "dwords to allocate\n");
> > +                               cs->failed = true;
> > +                               return;
> > +                       }
> >                 }
> > 
> >                 uint32_t *new_buf = realloc(cs->base.buf, ib_dws * 4);
> > @@ -365,6 +412,15 @@ static void radv_amdgpu_cs_reset(struct radeon_winsys_cs *_cs)
> >                 cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
> >                 cs->ib_size_ptr = &cs->ib.size;
> >                 cs->ib.size = 0;
> > +       } else {
> > +               for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) {
> > +                       struct radeon_winsys_cs *rcs = &cs->old_cs_buffers[i];
> > +                       free(rcs->buf);
> > +               }
> > +
> > +               free(cs->old_cs_buffers);
> > +               cs->old_cs_buffers = NULL;
> > +               cs->num_old_cs_buffers = 0;
> >         }
> >  }
> > 
> > @@ -515,7 +571,8 @@ static void radv_amdgpu_cs_execute_secondary(struct radeon_winsys_cs *_parent,
> >  static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
> >                                       struct radeon_winsys_cs **cs_array,
> >                                       unsigned count,
> > -                                     struct radv_amdgpu_winsys_bo *extra_bo,
> > +                                     struct radv_amdgpu_winsys_bo **extra_bo_array,
> > +                                     unsigned num_extra_bo,
> >                                       struct radeon_winsys_cs *extra_cs,
> >                                       amdgpu_bo_list_handle *bo_list)
> >  {
> > @@ -544,7 +601,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
> >                                           bo_list);
> >                 free(handles);
> >                 pthread_mutex_unlock(&ws->global_bo_list_lock);
> > -       } else if (count == 1 && !extra_bo && !extra_cs &&
> > +       } else if (count == 1 && !num_extra_bo && !extra_cs &&
> >                    !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) {
> >                 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0];
> >                 if (cs->num_buffers == 0) {
> > @@ -554,8 +611,8 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
> >                 r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, cs->handles,
> >                                           cs->priorities, bo_list);
> >         } else {
> > -               unsigned total_buffer_count = !!extra_bo;
> > -               unsigned unique_bo_count = !!extra_bo;
> > +               unsigned total_buffer_count = num_extra_bo;
> > +               unsigned unique_bo_count = num_extra_bo;
> >                 for (unsigned i = 0; i < count; ++i) {
> >                         struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[i];
> >                         total_buffer_count += cs->num_buffers;
> > @@ -578,9 +635,9 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
> >                         return -ENOMEM;
> >                 }
> > 
> > -               if (extra_bo) {
> > -                       handles[0] = extra_bo->bo;
> > -                       priorities[0] = 8;
> > +               for (unsigned i = 0; i < num_extra_bo; i++) {
> > +                       handles[i] = extra_bo_array[i]->bo;
> > +                       priorities[i] = 8;
> >                 }
> > 
> >                 for (unsigned i = 0; i < count + !!extra_cs; ++i) {
> > @@ -710,7 +767,8 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
> >                 }
> >         }
> > 
> > -       r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, initial_preamble_cs, &bo_list);
> > +       r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, 0, initial_preamble_cs,
> > +                                      &bo_list);
> >         if (r) {
> >                 fprintf(stderr, "amdgpu: buffer list creation failed for the "
> >                                 "chained submission(%d)\n", r);
> > @@ -777,7 +835,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
> > 
> >                 memset(&request, 0, sizeof(request));
> > 
> > -               r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL,
> > +               r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL, 0,
> >                                                preamble_cs, &bo_list);
> >                 if (r) {
> >                         fprintf(stderr, "amdgpu: buffer list creation failed "
> > @@ -857,68 +915,127 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
> >         assert(cs_count);
> > 
> >         for (unsigned i = 0; i < cs_count;) {
> > -               struct amdgpu_cs_ib_info ib = {0};
> > -               struct radeon_winsys_bo *bo = NULL;
> > +               struct amdgpu_cs_ib_info ibs[AMDGPU_CS_MAX_IBS_PER_SUBMIT] = {0};
> > +               unsigned number_of_ibs = 1;
> > +               struct radeon_winsys_bo *bos[AMDGPU_CS_MAX_IBS_PER_SUBMIT] = {0};
> >                 struct radeon_winsys_cs *preamble_cs = i ? continue_preamble_cs : initial_preamble_cs;
> > +               struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
> >                 uint32_t *ptr;
> >                 unsigned cnt = 0;
> >                 unsigned size = 0;
> >                 unsigned pad_words = 0;
> > -               if (preamble_cs)
> > -                       size += preamble_cs->cdw;
> > 
> > -               while (i + cnt < cs_count && 0xffff8 - size >= radv_amdgpu_cs(cs_array[i + cnt])->base.cdw) {
> > -                       size += radv_amdgpu_cs(cs_array[i + cnt])->base.cdw;
> > -                       ++cnt;
> > -               }
> > +               if (cs->num_old_cs_buffers > 0) {
> > +                       /* Special path when the maximum size in dwords has
> > +                        * been reached because we need to handle more than one
> > +                        * IB per submit.
> > +                        */
> > +                       unsigned new_cs_count = cs->num_old_cs_buffers + 1;
> > +                       struct radeon_winsys_cs *new_cs_array[AMDGPU_CS_MAX_IBS_PER_SUBMIT];
> > +                       unsigned idx = 0;
> > +
> > +                       for (unsigned j = 0; j < cs->num_old_cs_buffers; j++)
> > +                               new_cs_array[idx++] = &cs->old_cs_buffers[j];
> > +                       new_cs_array[idx++] = cs_array[i];
> > +
> > +                       for (unsigned j = 0; j < new_cs_count; j++) {
> > +                               struct radeon_winsys_cs *rcs = new_cs_array[j];
> > +                               bool needs_preamble = preamble_cs && j == 0;
> > +                               unsigned size = 0;
> > +
> > +                               if (needs_preamble)
> > +                                       size += preamble_cs->cdw;
> > +                               size += rcs->cdw;
> > +
> > +                               assert(size < 0xffff8);
> > +
> > +                               while (!size || (size & 7)) {
> > +                                       size++;
> > +                                       pad_words++;
> > +                               }
> > 
> > -               while(!size || (size & 7)) {
> > -                       size++;
> > -                       pad_words++;
> > -               }
> > -               assert(cnt);
> > +                               bos[j] = ws->buffer_create(ws, 4 * size, 4096,
> > +                                                          RADEON_DOMAIN_GTT,
> > +                                                          RADEON_FLAG_CPU_ACCESS |
> > +                                                          RADEON_FLAG_NO_INTERPROCESS_SHARING |
> > +                                                          RADEON_FLAG_READ_ONLY);
> > +                               ptr = ws->buffer_map(bos[j]);
> > 
> > -               bo = ws->buffer_create(ws, 4 * size, 4096, RADEON_DOMAIN_GTT,
> > -                                      RADEON_FLAG_CPU_ACCESS |
> > -                                      RADEON_FLAG_NO_INTERPROCESS_SHARING |
> > -                                      RADEON_FLAG_READ_ONLY);
> > -               ptr = ws->buffer_map(bo);
> > +                               if (needs_preamble) {
> > +                                       memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4);
> > +                                       ptr += preamble_cs->cdw;
> > +                               }
> > 
> > -               if (preamble_cs) {
> > -                       memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4);
> > -                       ptr += preamble_cs->cdw;
> > -               }
> > +                               memcpy(ptr, rcs->buf, 4 * rcs->cdw);
> > +                               ptr += rcs->cdw;
> > 
> > -               for (unsigned j = 0; j < cnt; ++j) {
> > -                       struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]);
> > -                       memcpy(ptr, cs->base.buf, 4 * cs->base.cdw);
> > -                       ptr += cs->base.cdw;
> > +                               for (unsigned k = 0; k < pad_words; ++k)
> > +                                       *ptr++ = pad_word;
> > 
> > -               }
> > +                               ibs[j].size = size;
> > +                               ibs[j].ib_mc_address = radv_buffer_get_va(bos[j]);
> > +                       }
> > 
> > -               for (unsigned j = 0; j < pad_words; ++j)
> > -                       *ptr++ = pad_word;
> > +                       number_of_ibs = new_cs_count;
> > +                       cnt++;
> > +               } else {
> > +                       if (preamble_cs)
> > +                               size += preamble_cs->cdw;
> > 
> > -               memset(&request, 0, sizeof(request));
> > +                       while (i + cnt < cs_count && 0xffff8 - size >= radv_amdgpu_cs(cs_array[i + cnt])->base.cdw) {
> > +                               size += radv_amdgpu_cs(cs_array[i + cnt])->base.cdw;
> > +                               ++cnt;
> > +                       }
> > +
> > +                       while (!size || (size & 7)) {
> > +                               size++;
> > +                               pad_words++;
> > +                       }
> > +                       assert(cnt);
> > 
> > +                       bos[0] = ws->buffer_create(ws, 4 * size, 4096,
> > +                                                  RADEON_DOMAIN_GTT,
> > +                                                  RADEON_FLAG_CPU_ACCESS |
> > +                                                  RADEON_FLAG_NO_INTERPROCESS_SHARING |
> > +                                                  RADEON_FLAG_READ_ONLY);
> > +                       ptr = ws->buffer_map(bos[0]);
> > +
> > +                       if (preamble_cs) {
> > +                               memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4);
> > +                               ptr += preamble_cs->cdw;
> > +                       }
> > +
> > +                       for (unsigned j = 0; j < cnt; ++j) {
> > +                               struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]);
> > +                               memcpy(ptr, cs->base.buf, 4 * cs->base.cdw);
> > +                               ptr += cs->base.cdw;
> > +
> > +                       }
> > +
> > +                       for (unsigned j = 0; j < pad_words; ++j)
> > +                               *ptr++ = pad_word;
> > +
> > +                       ibs[0].size = size;
> > +                       ibs[0].ib_mc_address = radv_buffer_get_va(bos[0]);
> > +               }
> > 
> >                 r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt,
> > -                                              (struct radv_amdgpu_winsys_bo*)bo,
> > -                                              preamble_cs, &bo_list);
> > +                                              (struct radv_amdgpu_winsys_bo **)bos,
> > +                                              number_of_ibs, preamble_cs,
> > +                                              &bo_list);
> >                 if (r) {
> >                         fprintf(stderr, "amdgpu: buffer list creation failed "
> >                                         "for the sysmem submission (%d)\n", r);
> >                         return r;
> >                 }
> > 
> > -               ib.size = size;
> > -               ib.ib_mc_address = radv_buffer_get_va(bo);
> > +               memset(&request, 0, sizeof(request));
> > 
> >                 request.ip_type = cs0->hw_ip;
> >                 request.ring = queue_idx;
> >                 request.resources = bo_list;
> > -               request.number_of_ibs = 1;
> > -               request.ibs = &ib;
> > +               request.number_of_ibs = number_of_ibs;
> > +               request.ibs = ibs;
> >                 request.fence_info = radv_set_cs_fence(ctx, cs0->hw_ip, queue_idx);
> > 
> >                 sem_info->cs_emit_signal = (i == cs_count - cnt) ? emit_signal_sem : false;
> > @@ -934,9 +1051,11 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
> >                 if (bo_list)
> >                         amdgpu_bo_list_destroy(bo_list);
> > 
> > -               ws->buffer_destroy(bo);
> > -               if (r)
> > -                       return r;
> > +               for (unsigned j = 0; j < number_of_ibs; j++) {
> > +                       ws->buffer_destroy(bos[j]);
> > +                       if (r)
> > +                               return r;
> > +               }
> > 
> >                 i += cnt;
> >         }
> > --
> > 2.17.0
> > 
> 
> _______________________________________________
> mesa-stable mailing list
> mesa-stable at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-stable


More information about the mesa-stable mailing list