[Mesa-dev] [PATCH 2/2] radeonsi: Implement DMA blit

Sun Mar 16 18:33:35 PDT 2014

Thanks for doing this! I have some comments...

1) As of SI, the maximum supported size for dword-aligned L2L, L2T,
and T2L copies is 0xffff8. The maximum supported size for byte-aligned
L2L copies is 0xfffe0. I'd like to have proper definitions for this,
e.g. SI_DMA_COPY_MAX_SIZE and SI_DMA_COPY_MAX_SIZE_DW. All occurrences
of 0x000fffff should be replaced appropriately.

Now the cosmetic stuff.

2) This is quite a lot of code, so I'd like all of this to be in a
separate file, e.g. si_dma.c.

3) r600/si_need_cs_space could be moved to drivers/radeon.

4) All calls to r600_context_bo_reloc could be moved out of the loops,
because SI supports virtual memory and therefore it's not required to
call the function before every packet. See also my explanation in
patch "winsys/radeon: only add duplicate relocations for DMA if VM
isn't supported".

5) Flushing the gfx CS is not required, because r600_context_bo_reloc
flushes it for you.

Please see also my latest DMA patches for r600g.

Thanks.

Marek

On Thu, Mar 13, 2014 at 8:45 AM, Niels Ole Salscheider
<niels_ole at salscheider-online.de> wrote:
> This code is a slightly modified version of evergreen_dma_blit (and
> evergreen_dma_copy as well as evergreen_dma_copy_tile).
> It would be nice to share some of the code in the long term.
>
> I have reused some "cik"-prefixed functions that also return the right
> value for SI. I am not sure if they should be renamed.
>
> Signed-off-by: Niels Ole Salscheider <niels_ole at salscheider-online.de>
> ---
>  src/gallium/drivers/radeonsi/si_hw_context.c |  65 +++++++
>  src/gallium/drivers/radeonsi/si_pipe.h       |   7 +
>  src/gallium/drivers/radeonsi/si_state.c      | 265 ++++++++++++++++++++++++++-
>  src/gallium/drivers/radeonsi/sid.h           |  15 ++
>  4 files changed, 346 insertions(+), 6 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
> index d9fba01..76583a3 100644
> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
> @@ -25,6 +25,8 @@
>   */
>
>  #include "si_pipe.h"
> +#include "sid.h"
> +#include "../radeon/r600_cs.h"
>
>  /* initialize */
>  void si_need_cs_space(struct si_context *ctx, unsigned num_dw,
> @@ -186,6 +188,69 @@ void si_begin_new_cs(struct si_context *ctx)
>         ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw;
>  }
>
> +void si_need_dma_space(struct si_context *ctx, unsigned num_dw)
> +{
> +       /* The number of dwords we already used in the DMA so far. */
> +       num_dw += ctx->b.rings.dma.cs->cdw;
> +       /* Flush if there's not enough space. */
> +       if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
> +               ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
> +       }
> +}
> +
> +void si_dma_copy(struct si_context *ctx,
> +                struct pipe_resource *dst,
> +                struct pipe_resource *src,
> +                uint64_t dst_offset,
> +                uint64_t src_offset,
> +                uint64_t size)
> +{
> +       struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
> +       unsigned i, ncopy, csize, sub_cmd, shift;
> +       struct r600_resource *rdst = (struct r600_resource*)dst;
> +       struct r600_resource *rsrc = (struct r600_resource*)src;
> +
> +       /* Mark the buffer range of destination as valid (initialized),
> +        * so that transfer_map knows it should wait for the GPU when mapping
> +        * that range. */
> +       util_range_add(&rdst->valid_buffer_range, dst_offset,
> +                      dst_offset + size);
> +
> +       /* make sure that the dma ring is the only one active */
> +       ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC);
> +       dst_offset += r600_resource_va(&ctx->screen->b.b, dst);
> +       src_offset += r600_resource_va(&ctx->screen->b.b, src);
> +
> +       /* see if we use dword or byte copy */
> +       if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) {
> +               size >>= 2;
> +               sub_cmd = 0x00;
> +               shift = 2;
> +       } else {
> +               sub_cmd = 0x40;
> +               shift = 0;
> +       }
> +       ncopy = (size / 0x000fffff) + !!(size % 0x000fffff);
> +
> +       si_need_dma_space(ctx, ncopy * 5);
> +       for (i = 0; i < ncopy; i++) {
> +               csize = size < 0x000fffff ? size : 0x000fffff;
> +               /* emit reloc before writing cs so that cs is always in a consistent state */
> +               r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
> +                                     RADEON_PRIO_MIN);
> +               r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
> +                                     RADEON_PRIO_MIN);
> +               cs->buf[cs->cdw++] = SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize);
> +               cs->buf[cs->cdw++] = dst_offset & 0xffffffff;
> +               cs->buf[cs->cdw++] = src_offset & 0xffffffff;
> +               cs->buf[cs->cdw++] = (dst_offset >> 32UL) & 0xff;
> +               cs->buf[cs->cdw++] = (src_offset >> 32UL) & 0xff;
> +               dst_offset += csize << shift;
> +               src_offset += csize << shift;
> +               size -= csize;
> +       }
> +}
> +
>  #if SI_TRACE_CS
>  void si_trace_emit(struct si_context *sctx)
>  {
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 47dc8e7..45def1e 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -171,6 +171,13 @@ void si_decompress_color_textures(struct si_context *sctx,
>  void si_context_flush(struct si_context *ctx, unsigned flags);
>  void si_begin_new_cs(struct si_context *ctx);
>  void si_need_cs_space(struct si_context *ctx, unsigned num_dw, boolean count_draw_in);
> +void si_need_dma_space(struct si_context *ctx, unsigned num_dw);
> +void si_dma_copy(struct si_context *ctx,
> +                struct pipe_resource *dst,
> +                struct pipe_resource *src,
> +                uint64_t dst_offset,
> +                uint64_t src_offset,
> +                uint64_t size);
>
>  /* si_pipe.c */
>  void si_flush(struct pipe_context *ctx, struct pipe_fence_handle **fence,
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index 3843330..1be8398 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -47,6 +47,48 @@ static void si_init_atom(struct r600_atom *atom, struct r600_atom **list_elem,
>         *list_elem = atom;
>  }
>
> +static unsigned si_array_mode(unsigned mode)
> +{
> +       switch (mode) {
> +       case RADEON_SURF_MODE_LINEAR_ALIGNED:
> +               return V_009910_ARRAY_LINEAR_ALIGNED;
> +       case RADEON_SURF_MODE_1D:
> +               return V_009910_ARRAY_1D_TILED_THIN1;
> +       case RADEON_SURF_MODE_2D:
> +               return V_009910_ARRAY_2D_TILED_THIN1;
> +       default:
> +       case RADEON_SURF_MODE_LINEAR:
> +               return V_009910_ARRAY_LINEAR_GENERAL;
> +       }
> +}
> +
> +static uint32_t si_num_banks(uint32_t nbanks)
> +{
> +       switch (nbanks) {
> +       case 2:
> +               return V_009910_ADDR_SURF_2_BANK;
> +       case 4:
> +               return V_009910_ADDR_SURF_4_BANK;
> +       case 8:
> +       default:
> +               return V_009910_ADDR_SURF_8_BANK;
> +       case 16:
> +               return V_009910_ADDR_SURF_16_BANK;
> +       }
> +}
> +
> +static uint32_t si_micro_tile_mode(struct si_screen *sscreen, unsigned tile_mode)
> +{
> +       if (sscreen->b.info.si_tile_mode_array_valid) {
> +               uint32_t gb_tile_mode = sscreen->b.info.si_tile_mode_array[tile_mode];
> +
> +               return G_009910_MICRO_TILE_MODE(gb_tile_mode);
> +       }
> +
> +       /* The kernel cannot return the tile mode array. Guess? */
> +       return V_009910_ADDR_SURF_THIN_MICRO_TILING;
> +}
> +
>  static uint32_t cik_num_banks(struct si_screen *sscreen, unsigned bpe, unsigned tile_split)
>  {
>         if (sscreen->b.info.cik_macrotile_mode_array_valid) {
> @@ -2926,18 +2968,229 @@ static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
>         return si_create_blend_state_mode(&sctx->b.b, &blend, mode);
>  }
>
> -static void si_dma_copy(struct pipe_context *ctx,
> +static void si_dma_copy_tile(struct si_context *ctx,
> +                            struct pipe_resource *dst,
> +                            unsigned dst_level,
> +                            unsigned dst_x,
> +                            unsigned dst_y,
> +                            unsigned dst_z,
> +                            struct pipe_resource *src,
> +                            unsigned src_level,
> +                            unsigned src_x,
> +                            unsigned src_y,
> +                            unsigned src_z,
> +                            unsigned copy_height,
> +                            unsigned pitch,
> +                            unsigned bpp)
> +{
> +       struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
> +       struct si_screen *sscreen = ctx->screen;
> +       struct r600_texture *rsrc = (struct r600_texture*)src;
> +       struct r600_texture *rdst = (struct r600_texture*)dst;
> +       unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
> +       unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode;
> +       unsigned sub_cmd, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt;
> +       uint64_t base, addr;
> +       unsigned pipe_config, tile_mode_index;
> +
> +       /* make sure that the dma ring is the only one active */
> +       ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC);
> +
> +       dst_mode = rdst->surface.level[dst_level].mode;
> +       src_mode = rsrc->surface.level[src_level].mode;
> +       /* downcast linear aligned to linear to simplify test */
> +       src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
> +       dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
> +       assert(dst_mode != src_mode);
> +
> +       y = 0;
> +       sub_cmd = 0x8;
> +       lbpp = util_logbase2(bpp);
> +       pitch_tile_max = ((pitch / bpp) >> 3) - 1;
> +       nbanks = si_num_banks(ctx->screen->b.tiling_info.num_banks);
> +
> +       if (dst_mode == RADEON_SURF_MODE_LINEAR) {
> +               /* T2L */
> +               array_mode = si_array_mode(src_mode);
> +               slice_tile_max = (rsrc->surface.level[src_level].nblk_x * rsrc->surface.level[src_level].nblk_y) >> 6;
> +               slice_tile_max = slice_tile_max ? slice_tile_max - 1 : 0;
> +               /* linear height must be the same as the slice tile max height; it's ok even
> +                * if the linear destination/source has a smaller height, as the size of the
> +                * dma packet will be using the copy_height, which is always smaller than or
> +                * equal to the linear height
> +                */
> +               height = rsrc->surface.level[src_level].npix_y;
> +               detile = 1;
> +               x = src_x;
> +               y = src_y;
> +               z = src_z;
> +               base = rsrc->surface.level[src_level].offset;
> +               addr = rdst->surface.level[dst_level].offset;
> +               addr += rdst->surface.level[dst_level].slice_size * dst_z;
> +               addr += dst_y * pitch + dst_x * bpp;
> +               bank_h = cik_bank_wh(rsrc->surface.bankh);
> +               bank_w = cik_bank_wh(rsrc->surface.bankw);
> +               mt_aspect = cik_macro_tile_aspect(rsrc->surface.mtilea);
> +               tile_split = cik_tile_split(rsrc->surface.tile_split);
> +               tile_mode_index = si_tile_mode_index(rsrc, src_level,
> +                                                    util_format_has_stencil(util_format_description(src->format)));
> +               base += r600_resource_va(&ctx->screen->b.b, src);
> +               addr += r600_resource_va(&ctx->screen->b.b, dst);
> +       } else {
> +               /* L2T */
> +               array_mode = si_array_mode(dst_mode);
> +               slice_tile_max = (rdst->surface.level[dst_level].nblk_x * rdst->surface.level[dst_level].nblk_y) >> 6;
> +               slice_tile_max = slice_tile_max ? slice_tile_max - 1 : 0;
> +               /* linear height must be the same as the slice tile max height; it's ok even
> +                * if the linear destination/source has a smaller height, as the size of the
> +                * dma packet will be using the copy_height, which is always smaller than or
> +                * equal to the linear height
> +                */
> +               height = rdst->surface.level[dst_level].npix_y;
> +               detile = 0;
> +               x = dst_x;
> +               y = dst_y;
> +               z = dst_z;
> +               base = rdst->surface.level[dst_level].offset;
> +               addr = rsrc->surface.level[src_level].offset;
> +               addr += rsrc->surface.level[src_level].slice_size * src_z;
> +               addr += src_y * pitch + src_x * bpp;
> +               bank_h = cik_bank_wh(rdst->surface.bankh);
> +               bank_w = cik_bank_wh(rdst->surface.bankw);
> +               mt_aspect = cik_macro_tile_aspect(rdst->surface.mtilea);
> +               tile_split = cik_tile_split(rdst->surface.tile_split);
> +               tile_mode_index = si_tile_mode_index(rdst, dst_level,
> +                                                    util_format_has_stencil(util_format_description(dst->format)));
> +               base += r600_resource_va(&ctx->screen->b.b, dst);
> +               addr += r600_resource_va(&ctx->screen->b.b, src);
> +       }
> +
> +       pipe_config = cik_db_pipe_config(sscreen, tile_mode_index);
> +       mt = si_micro_tile_mode(sscreen, tile_mode_index);
> +       size = (copy_height * pitch) >> 2;
> +       ncopy = (size / 0x000fffff) + !!(size % 0x000fffff);
> +       si_need_dma_space(ctx, ncopy * 9);
> +
> +       for (i = 0; i < ncopy; i++) {
> +               cheight = copy_height;
> +               if (((cheight * pitch) >> 2) > 0x000fffff) {
> +                       cheight = (0x000fffff << 2) / pitch;
> +               }
> +               size = (cheight * pitch) >> 2;
> +               /* emit reloc before writing cs so that cs is always in a consistent state */
> +               r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
> +                                     RADEON_USAGE_READ, RADEON_PRIO_MIN);
> +               r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
> +                                     RADEON_USAGE_WRITE, RADEON_PRIO_MIN);
> +               cs->buf[cs->cdw++] = SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, size);
> +               cs->buf[cs->cdw++] = base >> 8;
> +               cs->buf[cs->cdw++] = (detile << 31) | (array_mode << 27) |
> +                                       (lbpp << 24) | (bank_h << 21) |
> +                                       (bank_w << 18) | (mt_aspect << 16);
> +               cs->buf[cs->cdw++] = (pitch_tile_max << 0) | ((height - 1) << 16);
> +               cs->buf[cs->cdw++] = (slice_tile_max << 0) | (pipe_config << 26);
> +               cs->buf[cs->cdw++] = (x << 0) | (z << 18);
> +               cs->buf[cs->cdw++] = (y << 0) | (tile_split << 21) | (nbanks << 25) | (mt << 27);
> +               cs->buf[cs->cdw++] = addr & 0xfffffffc;
> +               cs->buf[cs->cdw++] = (addr >> 32UL) & 0xff;
> +               copy_height -= cheight;
> +               addr += cheight * pitch;
> +               y += cheight;
> +       }
> +}
> +
> +static void si_dma_blit(struct pipe_context *ctx,
>                         struct pipe_resource *dst,
>                         unsigned dst_level,
> -                       unsigned dst_x, unsigned dst_y, unsigned dst_z,
> +                       unsigned dstx, unsigned dsty, unsigned dstz,
>                         struct pipe_resource *src,
>                         unsigned src_level,
>                         const struct pipe_box *src_box)
>  {
> -       /* XXX implement this or share evergreen_dma_blit with r600g */
> +       struct si_context *sctx = (struct si_context *)ctx;
> +       struct r600_texture *rsrc = (struct r600_texture*)src;
> +       struct r600_texture *rdst = (struct r600_texture*)dst;
> +       unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode, copy_height;
> +       unsigned src_w, dst_w;
> +       unsigned src_x, src_y;
> +       unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
> +
> +       if (sctx->b.rings.dma.cs == NULL) {
> +               goto fallback;
> +       }
> +
> +       /* TODO: Implement DMA copy for CIK */
> +       if (sctx->b.chip_class >= CIK) {
> +               goto fallback;
> +       }
> +
> +       if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
> +               si_dma_copy(sctx, dst, src, dst_x, src_box->x, src_box->width);
> +               return;
> +       }
> +
> +       if (src->format != dst->format || src_box->depth > 1 ||
> +           rdst->dirty_level_mask != 0) {
> +               goto fallback;
> +       }
> +
> +       if (rsrc->dirty_level_mask) {
> +               ctx->flush_resource(ctx, src);
> +       }
> +
> +       src_x = util_format_get_nblocksx(src->format, src_box->x);
> +       dst_x = util_format_get_nblocksx(src->format, dst_x);
> +       src_y = util_format_get_nblocksy(src->format, src_box->y);
> +       dst_y = util_format_get_nblocksy(src->format, dst_y);
> +
> +       bpp = rdst->surface.bpe;
> +       dst_pitch = rdst->surface.level[dst_level].pitch_bytes;
> +       src_pitch = rsrc->surface.level[src_level].pitch_bytes;
> +       src_w = rsrc->surface.level[src_level].npix_x;
> +       dst_w = rdst->surface.level[dst_level].npix_x;
> +       copy_height = src_box->height / rsrc->surface.blk_h;
> +
> +       dst_mode = rdst->surface.level[dst_level].mode;
> +       src_mode = rsrc->surface.level[src_level].mode;
> +       /* downcast linear aligned to linear to simplify test */
> +       src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
> +       dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
> +
> +       if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w) {
> +               /* FIXME si can do partial blit */
> +               goto fallback;
> +       }
> +       /* the x tests here are currently useless (because we don't support partial blits)
> +        * but keep them around so we don't forget about them
> +        */
> +       if ((src_pitch & 0x7) || (src_box->x & 0x7) || (dst_x & 0x7) || (src_box->y & 0x7) || (dst_y & 0x7)) {
> +               goto fallback;
> +       }
> +
> +       if (src_mode == dst_mode) {
> +               uint64_t dst_offset, src_offset;
> +               /* a simple dma blit will do. NOTE: the code here assumes:
> +                *   src_box.x/y == 0
> +                *   dst_x/y == 0
> +                *   dst_pitch == src_pitch
> +                */
> +               src_offset= rsrc->surface.level[src_level].offset;
> +               src_offset += rsrc->surface.level[src_level].slice_size * src_box->z;
> +               src_offset += src_y * src_pitch + src_x * bpp;
> +               dst_offset = rdst->surface.level[dst_level].offset;
> +               dst_offset += rdst->surface.level[dst_level].slice_size * dst_z;
> +               dst_offset += dst_y * dst_pitch + dst_x * bpp;
> +               si_dma_copy(sctx, dst, src, dst_offset, src_offset,
> +                           src_box->height * src_pitch);
> +       } else {
> +               si_dma_copy_tile(sctx, dst, dst_level, dst_x, dst_y, dst_z,
> +                                src, src_level, src_x, src_y, src_box->z,
> +                                copy_height, dst_pitch, bpp);
> +       }
> +       return;
>
> -       /* Fallback: */
> -       ctx->resource_copy_region(ctx, dst, dst_level, dst_x, dst_y, dst_z,
> +fallback:
> +       ctx->resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
>                                   src, src_level, src_box);
>  }
>
> @@ -3021,7 +3274,7 @@ void si_init_state_functions(struct si_context *sctx)
>
>         sctx->b.b.texture_barrier = si_texture_barrier;
>         sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
> -       sctx->b.dma_copy = si_dma_copy;
> +       sctx->b.dma_copy = si_dma_blit;
>         sctx->b.set_occlusion_query_state = si_set_occlusion_query_state;
>         sctx->b.need_gfx_cs_space = si_need_gfx_cs_space;
>
> diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
> index 876d0a3..0a5ef2f 100644
> --- a/src/gallium/drivers/radeonsi/sid.h
> +++ b/src/gallium/drivers/radeonsi/sid.h
> @@ -8630,5 +8630,20 @@
>  #define R_028E30_CB_COLOR7_CLEAR_WORD0                                  0x028E30
>  #define R_028E34_CB_COLOR7_CLEAR_WORD1                                  0x028E34
>
> +/* SI async DMA packets */
> +#define SI_DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) |    \
> +                                       (((sub_cmd) & 0xFF) << 20) |\
> +                                       (((n) & 0xFFFFF) << 0))
> +/* SI async DMA Packet types */
> +#define    SI_DMA_PACKET_WRITE                     0x2
> +#define    SI_DMA_PACKET_COPY                      0x3
> +#define    SI_DMA_PACKET_INDIRECT_BUFFER           0x4
> +#define    SI_DMA_PACKET_SEMAPHORE                 0x5
> +#define    SI_DMA_PACKET_FENCE                     0x6
> +#define    SI_DMA_PACKET_TRAP                      0x7
> +#define    SI_DMA_PACKET_SRBM_WRITE                0x9
> +#define    SI_DMA_PACKET_CONSTANT_FILL             0xd
> +#define    SI_DMA_PACKET_NOP                       0xf
> +
>  #endif /* _SID_H */
>
> --
> 1.9.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev