[Mesa-dev] [PATCH 08/13] winsys/amdgpu: start with smaller IBs, growing as necessary

Marek Olšák maraeo at gmail.com
Wed May 18 09:57:03 UTC 2016


Reviewed-by: Marek Olšák <marek.olsak at amd.com>

Marek

On Tue, May 17, 2016 at 10:42 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> This avoids allocating giant IBs from the outset, especially for CE and DMA.
>
> Since we now limit max_dw only by the size that the buffer happens to be
> (which, due to the buffer cache, can be even larger than the rounded-up size
> we request), the new function amdgpu_ib_max_submit_dwords controls when we
> submit an IB.
>
> With this change, we effectively never flush prematurely due to the CE IB,
> after an initial warm-up phase.
>
> v2:
> - clean up buffer_size calculation
> ---
>  src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 80 +++++++++++++++++++++++++++----
>  src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |  1 +
>  2 files changed, 71 insertions(+), 10 deletions(-)
>
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> index 1b2e89e..a09b4fb 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
> @@ -336,11 +336,33 @@ static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
>     return index;
>  }
>
> -static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib,
> -                                 unsigned buffer_size)
> +static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
>  {
>     struct pb_buffer *pb;
>     uint8_t *mapped;
> +   unsigned buffer_size;
> +
> +   /* Always create a buffer that is 4 times larger than the maximum seen IB
> +    * size, aligned to a power of two. Limit to 512k dwords, which is the
> +    * largest power of two that fits into the size field of the INDIRECT_BUFFER
> +    * packet.
> +    */
> +   buffer_size = 4 * MIN2(util_next_power_of_two(4 * ib->max_ib_size),
> +                          512 * 1024);
> +
> +   switch (ib->ib_type) {
> +   case IB_CONST_PREAMBLE:
> +      buffer_size = MAX2(buffer_size, 4 * 1024);
> +      break;
> +   case IB_CONST:
> +      buffer_size = MAX2(buffer_size, 16 * 1024 * 4);
> +      break;
> +   case IB_MAIN:
> +      buffer_size = MAX2(buffer_size, 8 * 1024 * 4);
> +      break;
> +   default:
> +      unreachable("unhandled IB type");
> +   }
>
>     pb = ws->base.buffer_create(&ws->base, buffer_size,
>                                 ws->info.gart_page_size,
> @@ -364,6 +386,27 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib,
>     return true;
>  }
>
> +static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type)
> +{
> +   switch (ib_type) {
> +   case IB_MAIN:
> +      /* Smaller submits mean the GPU gets busy sooner and there is less
> +       * waiting for buffers and fences. Proof:
> +       *   http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
> +       */
> +      return 20 * 1024;
> +   case IB_CONST_PREAMBLE:
> +   case IB_CONST:
> +      /* There isn't really any reason to limit CE IB size beyond the natural
> +       * limit implied by the main IB, except perhaps GTT size. Just return
> +       * an extremely large value that we never get anywhere close to.
> +       */
> +      return 16 * 1024 * 1024;
> +   default:
> +      unreachable("bad ib_type");
> +   }
> +}
> +
>  static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
>                                enum ib_type ib_type)
>  {
> @@ -374,35 +417,36 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
>      */
>     struct amdgpu_ib *ib = NULL;
>     struct amdgpu_cs_ib_info *info = &cs->csc->ib[ib_type];
> -   unsigned buffer_size, ib_size;
> +   unsigned ib_size = 0;
>
>     switch (ib_type) {
>     case IB_CONST_PREAMBLE:
>        ib = &cs->const_preamble_ib;
> -      buffer_size = 4 * 1024 * 4;
> -      ib_size = 1024 * 4;
> +      ib_size = 256 * 4;
>        break;
>     case IB_CONST:
>        ib = &cs->const_ib;
> -      buffer_size = 512 * 1024 * 4;
> -      ib_size = 128 * 1024 * 4;
> +      ib_size = 8 * 1024 * 4;
>        break;
>     case IB_MAIN:
>        ib = &cs->main;
> -      buffer_size = 128 * 1024 * 4;
> -      ib_size = 20 * 1024 * 4;
> +      ib_size = 4 * 1024 * 4;
>        break;
>     default:
>        unreachable("unhandled IB type");
>     }
>
> +   ib_size = MAX2(ib_size,
> +                  4 * MIN2(util_next_power_of_two(ib->max_ib_size),
> +                           amdgpu_ib_max_submit_dwords(ib_type)));
> +
>     ib->base.cdw = 0;
>     ib->base.buf = NULL;
>
>     /* Allocate a new buffer for IBs if the current buffer is all used. */
>     if (!ib->big_ib_buffer ||
>         ib->used_ib_space + ib_size > ib->big_ib_buffer->size) {
> -      if (!amdgpu_ib_new_buffer(aws, ib, buffer_size))
> +      if (!amdgpu_ib_new_buffer(aws, ib))
>           return false;
>     }
>
> @@ -412,6 +456,8 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
>                          RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
>
>     ib->base.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
> +
> +   ib_size = ib->big_ib_buffer->size - ib->used_ib_space;
>     ib->base.max_dw = ib_size / 4;
>     return true;
>  }
> @@ -624,7 +670,17 @@ static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
>
>  static bool amdgpu_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
>  {
> +   struct amdgpu_ib *ib = amdgpu_ib(rcs);
> +   struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib);
> +   unsigned requested_size = rcs->cdw + dw;
> +
>     assert(rcs->cdw <= rcs->max_dw);
> +
> +   if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type))
> +      return false;
> +
> +   ib->max_ib_size = MAX2(ib->max_ib_size, requested_size);
> +
>     return rcs->max_dw - rcs->cdw >= dw;
>  }
>
> @@ -861,15 +917,19 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
>        /* Set IB sizes. */
>        cur->ib[IB_MAIN].size = cs->main.base.cdw;
>        cs->main.used_ib_space += cs->main.base.cdw * 4;
> +      cs->main.max_ib_size = MAX2(cs->main.max_ib_size, cs->main.base.cdw);
>
>        if (cs->const_ib.ib_mapped) {
>           cur->ib[IB_CONST].size = cs->const_ib.base.cdw;
>           cs->const_ib.used_ib_space += cs->const_ib.base.cdw * 4;
> +         cs->const_ib.max_ib_size = MAX2(cs->const_ib.max_ib_size, cs->const_ib.base.cdw);
>        }
>
>        if (cs->const_preamble_ib.ib_mapped) {
>           cur->ib[IB_CONST_PREAMBLE].size = cs->const_preamble_ib.base.cdw;
>           cs->const_preamble_ib.used_ib_space += cs->const_preamble_ib.base.cdw * 4;
> +         cs->const_preamble_ib.max_ib_size =
> +            MAX2(cs->const_preamble_ib.max_ib_size, cs->const_preamble_ib.base.cdw);
>        }
>
>        /* Create a fence. */
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> index 25bad07..62811e9 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
> @@ -64,6 +64,7 @@ struct amdgpu_ib {
>     struct pb_buffer        *big_ib_buffer;
>     uint8_t                 *ib_mapped;
>     unsigned                used_ib_space;
> +   unsigned                max_ib_size;
>     enum ib_type            ib_type;
>  };
>
> --
> 2.7.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list