[Mesa-dev] [PATCH 8/8] gallium/radeon: derive buffer placement and flags only once per buffer

Fri Aug 26 10:42:06 UTC 2016

I would prefer it if the function were split into two functions
instead of using the initialized flag. I think we know, at each call
site, whether it is an initialization or a reinitialization.

- Bas

On Thu, Aug 18, 2016 at 9:46 PM, Marek Olšák <maraeo at gmail.com> wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> Invalidated buffers don't have to do this.
> ---
>  src/gallium/drivers/radeon/r600_buffer_common.c | 147 +++++++++++++-----------
>  src/gallium/drivers/radeon/r600_pipe_common.h   |   2 +
>  2 files changed, 80 insertions(+), 69 deletions(-)
>
> diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
> index 4480293..113a7dc 100644
> --- a/src/gallium/drivers/radeon/r600_buffer_common.c
> +++ b/src/gallium/drivers/radeon/r600_buffer_common.c
> @@ -98,91 +98,108 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
>         /* Setting the CS to NULL will prevent doing checks we have done already. */
>         return ctx->ws->buffer_map(resource->buf, NULL, usage);
>  }
>
>  bool r600_init_resource(struct r600_common_screen *rscreen,
>                         struct r600_resource *res,
>                         uint64_t size, unsigned alignment)
>  {
>         struct r600_texture *rtex = (struct r600_texture*)res;
>         struct pb_buffer *old_buf, *new_buf;
> -       enum radeon_bo_flag flags = 0;
> -
> -       switch (res->b.b.usage) {
> -       case PIPE_USAGE_STREAM:
> -               flags = RADEON_FLAG_GTT_WC;
> -               /* fall through */
> -       case PIPE_USAGE_STAGING:
> -               /* Transfers are likely to occur more often with these resources. */
> -               res->domains = RADEON_DOMAIN_GTT;
> -               break;
> -       case PIPE_USAGE_DYNAMIC:
> -               /* Older kernels didn't always flush the HDP cache before
> -                * CS execution
> -                */
> -               if (rscreen->info.drm_major == 2 &&
> -                   rscreen->info.drm_minor < 40) {
> +
> +       if (!res->initialized) {
> +               res->flags = 0;
> +
> +               switch (res->b.b.usage) {
> +               case PIPE_USAGE_STREAM:
> +                       res->flags = RADEON_FLAG_GTT_WC;
> +                       /* fall through */
> +               case PIPE_USAGE_STAGING:
> +                       /* Transfers are likely to occur more often with these
> +                        * resources. */
>                         res->domains = RADEON_DOMAIN_GTT;
> -                       flags |= RADEON_FLAG_GTT_WC;
> +                       break;
> +               case PIPE_USAGE_DYNAMIC:
> +                       /* Older kernels didn't always flush the HDP cache before
> +                        * CS execution
> +                        */
> +                       if (rscreen->info.drm_major == 2 &&
> +                           rscreen->info.drm_minor < 40) {
> +                               res->domains = RADEON_DOMAIN_GTT;
> +                               res->flags |= RADEON_FLAG_GTT_WC;
> +                               break;
> +                       }
> +                       res->flags |= RADEON_FLAG_CPU_ACCESS;
> +                       /* fall through */
> +               case PIPE_USAGE_DEFAULT:
> +               case PIPE_USAGE_IMMUTABLE:
> +               default:
> +                       /* Not listing GTT here improves performance in some
> +                        * apps. */
> +                       res->domains = RADEON_DOMAIN_VRAM;
> +                       res->flags |= RADEON_FLAG_GTT_WC;
>                         break;
>                 }
> -               flags |= RADEON_FLAG_CPU_ACCESS;
> -               /* fall through */
> -       case PIPE_USAGE_DEFAULT:
> -       case PIPE_USAGE_IMMUTABLE:
> -       default:
> -               /* Not listing GTT here improves performance in some apps. */
> -               res->domains = RADEON_DOMAIN_VRAM;
> -               flags |= RADEON_FLAG_GTT_WC;
> -               break;
> -       }
>
> -       if (res->b.b.target == PIPE_BUFFER &&
> -           res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
> -                             PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
> -               /* Use GTT for all persistent mappings with older kernels,
> -                * because they didn't always flush the HDP cache before CS
> -                * execution.
> -                *
> -                * Write-combined CPU mappings are fine, the kernel ensures all CPU
> -                * writes finish before the GPU executes a command stream.
> +               if (res->b.b.target == PIPE_BUFFER &&
> +                   res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
> +                                     PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
> +                       /* Use GTT for all persistent mappings with older
> +                        * kernels, because they didn't always flush the HDP
> +                        * cache before CS execution.
> +                        *
> +                        * Write-combined CPU mappings are fine, the kernel
> +                        * ensures all CPU writes finish before the GPU
> +                        * executes a command stream.
> +                        */
> +                       if (rscreen->info.drm_major == 2 &&
> +                           rscreen->info.drm_minor < 40)
> +                               res->domains = RADEON_DOMAIN_GTT;
> +                       else if (res->domains & RADEON_DOMAIN_VRAM)
> +                               res->flags |= RADEON_FLAG_CPU_ACCESS;
> +               }
> +
> +               /* Tiled textures are unmappable. Always put them in VRAM. */
> +               if (res->b.b.target != PIPE_BUFFER &&
> +                   rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
> +                       res->domains = RADEON_DOMAIN_VRAM;
> +                       res->flags &= ~RADEON_FLAG_CPU_ACCESS;
> +                       res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
> +                                RADEON_FLAG_GTT_WC;
> +               }
> +
> +               /* If VRAM is just stolen system memory, allow both VRAM and
> +                * GTT, whichever has free space. If a buffer is evicted from
> +                * VRAM to GTT, it will stay there.
>                  */
> -               if (rscreen->info.drm_major == 2 &&
> -                   rscreen->info.drm_minor < 40)
> -                       res->domains = RADEON_DOMAIN_GTT;
> -               else if (res->domains & RADEON_DOMAIN_VRAM)
> -                       flags |= RADEON_FLAG_CPU_ACCESS;
> -       }
> +               if (!rscreen->info.has_dedicated_vram &&
> +                   res->domains == RADEON_DOMAIN_VRAM)
> +                       res->domains = RADEON_DOMAIN_VRAM_GTT;
>
> -       /* Tiled textures are unmappable. Always put them in VRAM. */
> -       if (res->b.b.target != PIPE_BUFFER &&
> -           rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
> -               res->domains = RADEON_DOMAIN_VRAM;
> -               flags &= ~RADEON_FLAG_CPU_ACCESS;
> -               flags |= RADEON_FLAG_NO_CPU_ACCESS |
> -                        RADEON_FLAG_GTT_WC;
> -       }
> +               if (rscreen->debug_flags & DBG_NO_WC)
> +                       res->flags &= ~RADEON_FLAG_GTT_WC;
>
> -       /* If VRAM is just stolen system memory, allow both VRAM and GTT,
> -        * whichever has free space. If a buffer is evicted from VRAM to GTT,
> -        * it will stay there.
> -        */
> -       if (!rscreen->info.has_dedicated_vram &&
> -           res->domains == RADEON_DOMAIN_VRAM)
> -               res->domains = RADEON_DOMAIN_VRAM_GTT;
> +               /* Set expected VRAM and GART usage for the buffer. */
> +               res->vram_usage = 0;
> +               res->gart_usage = 0;
>
> -       if (rscreen->debug_flags & DBG_NO_WC)
> -               flags &= ~RADEON_FLAG_GTT_WC;
> +               if (res->domains & RADEON_DOMAIN_VRAM)
> +                       res->vram_usage = size;
> +               else if (res->domains & RADEON_DOMAIN_GTT)
> +                       res->gart_usage = size;
> +
> +               res->initialized = true;
> +       }
>
>         /* Allocate a new resource. */
>         new_buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment,
> -                                            res->domains, flags);
> +                                            res->domains, res->flags);
>         if (!new_buf) {
>                 return false;
>         }
>
>         /* Replace the pointer such that if res->buf wasn't NULL, it won't be
>          * NULL. This should prevent crashes with multiple contexts using
>          * the same buffer where one of the contexts invalidates it while
>          * the others are using it. */
>         old_buf = res->buf;
>         res->buf = new_buf; /* should be atomic */
> @@ -190,29 +207,20 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
>         if (rscreen->info.has_virtual_memory)
>                 res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->buf);
>         else
>                 res->gpu_address = 0;
>
>         pb_reference(&old_buf, NULL);
>
>         util_range_set_empty(&res->valid_buffer_range);
>         res->TC_L2_dirty = false;
>
> -       /* Set expected VRAM and GART usage for the buffer. */
> -       res->vram_usage = 0;
> -       res->gart_usage = 0;
> -
> -       if (res->domains & RADEON_DOMAIN_VRAM)
> -               res->vram_usage = size;
> -       else if (res->domains & RADEON_DOMAIN_GTT)
> -               res->gart_usage = size;
> -
>         /* Print debug information. */
>         if (rscreen->debug_flags & DBG_VM && res->b.b.target == PIPE_BUFFER) {
>                 fprintf(stderr, "VM start=0x%"PRIX64"  end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n",
>                         res->gpu_address, res->gpu_address + res->buf->size,
>                         res->buf->size);
>         }
>         return true;
>  }
>
>  static void r600_buffer_destroy(struct pipe_screen *screen,
> @@ -496,20 +504,21 @@ r600_alloc_buffer_struct(struct pipe_screen *screen,
>  {
>         struct r600_resource *rbuffer;
>
>         rbuffer = MALLOC_STRUCT(r600_resource);
>
>         rbuffer->b.b = *templ;
>         pipe_reference_init(&rbuffer->b.b.reference, 1);
>         rbuffer->b.b.screen = screen;
>         rbuffer->b.vtbl = &r600_buffer_vtbl;
>         rbuffer->buf = NULL;
> +       rbuffer->initialized = false;
>         rbuffer->TC_L2_dirty = false;
>         rbuffer->is_shared = false;
>         util_range_init(&rbuffer->valid_buffer_range);
>         return rbuffer;
>  }
>
>  struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
>                                          const struct pipe_resource *templ,
>                                          unsigned alignment)
>  {
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
> index 5375044..358d5f4 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.h
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h
> @@ -169,20 +169,22 @@ struct r600_resource {
>
>         /* Winsys objects. */
>         struct pb_buffer                *buf;
>         uint64_t                        gpu_address;
>         /* Memory usage if the buffer placement is optimal. */
>         uint64_t                        vram_usage;
>         uint64_t                        gart_usage;
>
>         /* Resource state. */
>         enum radeon_bo_domain           domains;
> +       enum radeon_bo_flag             flags;
> +       bool                            initialized;
>
>         /* The buffer range which is initialized (with a write transfer,
>          * streamout, DMA, or as a random access target). The rest of
>          * the buffer is considered invalid and can be mapped unsynchronized.
>          *
>          * This allows unsychronized mapping of a buffer range which hasn't
>          * been used yet. It's for applications which forget to use
>          * the unsynchronized map flag and expect the driver to figure it out.
>           */
>         struct util_range               valid_buffer_range;
> --
> 2.7.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev