[Mesa-dev] [PATCH 3/3] radeon/winsys: add async dma infrastructure

Sun Jan 6 14:58:40 PST 2013

On Sun, Jan 6, 2013 at 4:00 PM, Marek Olšák <maraeo at gmail.com> wrote:
> I agree with Christian. You can use a separate instance of
> radeon_winsys_cs for the DMA CS. The winsys exposes all the functions
> you need (except one) for you to coordinate work between 2 command
> streams in the pipe driver. You may only need to expose one additional
> winsys function to the driver for synchronization, it's called
> "radeon_drm_cs_sync_flush". I'm confident that this can be implemented
> and layered on top of the winsys, presumably with fewer lines of code
> and cleaner.

The add-relocation function needs to access both the dma ring and the
cs ring, no matter which ring the relocation is added to. Doing the
sync in the pipe driver would increase the amount of code: each call
site of add_reloc would need to check whether the bo is referenced by
the other ring and flush the other ring if so. It also means that
there is a higher likelihood that someone adding an add_reloc call
forgets about the flushing.

> Some other comments:
> - DRM version checks are missing everywhere.

The winsys tests the drm version and sets has_dma accordingly (this
version of the patch is missing a couple of checks on has_dma).

> - The duplication of resource_copy_region doesn't change anything in
> functionality, it only adds new useless lines of code.

It is preparatory work for the differences between evergreen and r6xx,
i.e. the patch that implements the blit uses it, but I can move that
diff to this patch.

>
> Some ideas:
> - There may be cases where we have to flush & wait for the gfx CS
> before we can start using the async DMA, and then the GFX CS must wait
> until the async DMA is done. One of such cases is resource_copy_region
> which is done as part of rendering, not as part of texture/buffer
> transfers. In such cases, it would be better to use the CP DMA and
> avoid the flushing and synchronization hell entirely. The way I see
> it, the async DMA should be used for buffer/texture transfers when
> appropriate. In all the other cases, the CP DMA or the 3D engine
> should be used instead.
>
> Marek
>

Cheers,
Jerome

> On Fri, Jan 4, 2013 at 11:19 PM,  <j.glisse at gmail.com> wrote:
>> From: Jerome Glisse <jglisse at redhat.com>
>>
>> The design is to take advantage of the fact that the kernel will emit
>> a semaphore when a buffer is referenced by different rings. So the only
>> thing we need to do to enforce synchronization between the dma and
>> gfx/compute rings is to make sure that we never reference the same bo
>> at the same time on the dma and gfx rings.
>>
>> This is achieved by tracking relocations: when we add a relocation
>> to the dma ring for a bo, we first check whether the bo has an active
>> relocation on the gfx ring. If it does, we flush the gfx ring.
>> We do the same when adding a bo to the gfx ring: we check that it does
>> not have a relocation on the dma ring, and if it has one we flush the
>> dma ring.
>>
>> This patch also simplifies the helper query functions used to know
>> whether a bo has pending write/read commands.
>>
>> Signed-off-by: Jerome Glisse <jglisse at redhat.com>
>> ---
>>  src/gallium/drivers/r300/r300_emit.c               |  21 +-
>>  src/gallium/drivers/r300/r300_flush.c              |   7 +-
>>  src/gallium/drivers/r600/evergreen_hw_context.c    |  39 +++
>>  src/gallium/drivers/r600/evergreend.h              |  16 ++
>>  src/gallium/drivers/r600/r600.h                    |  13 +
>>  src/gallium/drivers/r600/r600_blit.c               |  94 +++++--
>>  src/gallium/drivers/r600/r600_hw_context.c         |  44 +++-
>>  src/gallium/drivers/r600/r600_pipe.c               |  13 +-
>>  src/gallium/drivers/r600/r600_pipe.h               |   2 +-
>>  src/gallium/drivers/r600/r600_texture.c            |   2 +-
>>  src/gallium/drivers/r600/r600d.h                   |  16 ++
>>  src/gallium/drivers/radeonsi/r600_hw_context.c     |   2 +-
>>  .../drivers/radeonsi/r600_hw_context_priv.h        |   2 +-
>>  src/gallium/drivers/radeonsi/r600_texture.c        |   2 +-
>>  src/gallium/drivers/radeonsi/radeonsi_pipe.c       |  13 +-
>>  src/gallium/winsys/radeon/drm/radeon_drm_bo.c      |  10 +-
>>  src/gallium/winsys/radeon/drm/radeon_drm_bo.h      |   2 +
>>  src/gallium/winsys/radeon/drm/radeon_drm_cs.c      | 270 +++++++++++++++++----
>>  src/gallium/winsys/radeon/drm/radeon_drm_cs.h      |  40 ++-
>>  src/gallium/winsys/radeon/drm/radeon_drm_winsys.c  |   6 +
>>  src/gallium/winsys/radeon/drm/radeon_winsys.h      |  28 ++-
>>  21 files changed, 509 insertions(+), 133 deletions(-)
>>
>> diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
>> index d1ed4b3..c824821 100644
>> --- a/src/gallium/drivers/r300/r300_emit.c
>> +++ b/src/gallium/drivers/r300/r300_emit.c
>> @@ -1184,7 +1184,8 @@ validate:
>>              assert(tex && tex->buf && "cbuf is marked, but NULL!");
>>              r300->rws->cs_add_reloc(r300->cs, tex->cs_buf,
>>                                      RADEON_USAGE_READWRITE,
>> -                                    r300_surface(fb->cbufs[i])->domain);
>> +                                    r300_surface(fb->cbufs[i])->domain,
>> +                                    RADEON_RING_DMA);
>>          }
>>          /* ...depth buffer... */
>>          if (fb->zsbuf) {
>> @@ -1192,7 +1193,8 @@ validate:
>>              assert(tex && tex->buf && "zsbuf is marked, but NULL!");
>>              r300->rws->cs_add_reloc(r300->cs, tex->cs_buf,
>>                                      RADEON_USAGE_READWRITE,
>> -                                    r300_surface(fb->zsbuf)->domain);
>> +                                    r300_surface(fb->zsbuf)->domain,
>> +                                    RADEON_RING_DMA);
>>          }
>>      }
>>      if (r300->textures_state.dirty) {
>> @@ -1204,18 +1206,21 @@ validate:
>>
>>              tex = r300_resource(texstate->sampler_views[i]->base.texture);
>>              r300->rws->cs_add_reloc(r300->cs, tex->cs_buf, RADEON_USAGE_READ,
>> -                                    tex->domain);
>> +                                    tex->domain,
>> +                                    RADEON_RING_DMA);
>>          }
>>      }
>>      /* ...occlusion query buffer... */
>>      if (r300->query_current)
>>          r300->rws->cs_add_reloc(r300->cs, r300->query_current->cs_buf,
>> -                                RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
>> +                                RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT,
>> +                                RADEON_RING_DMA);
>>      /* ...vertex buffer for SWTCL path... */
>>      if (r300->vbo)
>>          r300->rws->cs_add_reloc(r300->cs, r300_resource(r300->vbo)->cs_buf,
>>                                  RADEON_USAGE_READ,
>> -                                r300_resource(r300->vbo)->domain);
>> +                                r300_resource(r300->vbo)->domain,
>> +                                RADEON_RING_DMA);
>>      /* ...vertex buffers for HWTCL path... */
>>      if (do_validate_vertex_buffers && r300->vertex_arrays_dirty) {
>>          struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
>> @@ -1230,14 +1235,16 @@ validate:
>>
>>              r300->rws->cs_add_reloc(r300->cs, r300_resource(buf)->cs_buf,
>>                                      RADEON_USAGE_READ,
>> -                                    r300_resource(buf)->domain);
>> +                                    r300_resource(buf)->domain,
>> +                                    RADEON_RING_DMA);
>>          }
>>      }
>>      /* ...and index buffer for HWTCL path. */
>>      if (index_buffer)
>>          r300->rws->cs_add_reloc(r300->cs, r300_resource(index_buffer)->cs_buf,
>>                                  RADEON_USAGE_READ,
>> -                                r300_resource(index_buffer)->domain);
>> +                                r300_resource(index_buffer)->domain,
>> +                                RADEON_RING_DMA);
>>
>>      /* Now do the validation (flush is called inside cs_validate on failure). */
>>      if (!r300->rws->cs_validate(r300->cs)) {
>> diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
>> index 2170c59..ff3dbe9 100644
>> --- a/src/gallium/drivers/r300/r300_flush.c
>> +++ b/src/gallium/drivers/r300/r300_flush.c
>> @@ -70,8 +70,10 @@ void r300_flush(struct pipe_context *pipe,
>>      struct r300_context *r300 = r300_context(pipe);
>>      struct pb_buffer **rfence = (struct pb_buffer**)fence;
>>
>> +    /* r3xx-r5xx only have gfx ring */
>> +    flags |= RADEON_FLUSH_GFX;
>>      if (r300->draw && !r300->draw_vbo_locked)
>> -       r300_draw_flush_vbuf(r300);
>> +        r300_draw_flush_vbuf(r300);
>>
>>      if (r300->screen->info.drm_minor >= 12) {
>>          flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
>> @@ -84,7 +86,8 @@ void r300_flush(struct pipe_context *pipe,
>>          /* Add the fence as a dummy relocation. */
>>          r300->rws->cs_add_reloc(r300->cs,
>>                                  r300->rws->buffer_get_cs_handle(*rfence),
>> -                                RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT);
>> +                                RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
>> +                                RADEON_RING_DMA);
>>      }
>>
>>      if (r300->dirty_hw) {
>> diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
>> index 0ca7f9e..302f652 100644
>> --- a/src/gallium/drivers/r600/evergreen_hw_context.c
>> +++ b/src/gallium/drivers/r600/evergreen_hw_context.c
>> @@ -238,3 +238,42 @@ void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_en
>>                 r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, S_028B94_STREAMOUT_0_EN(0));
>>         }
>>  }
>> +
>> +void evergreen_dma_copy(struct r600_context *rctx,
>> +               struct pipe_resource *dst,
>> +               struct pipe_resource *src,
>> +               unsigned long dst_offset,
>> +               unsigned long src_offset,
>> +               unsigned long size)
>> +{
>> +       struct radeon_winsys_cs *cs = rctx->cs;
>> +       unsigned i, ncopy, csize, command, shift;
>> +       struct r600_resource *rdst = (struct r600_resource*)dst;
>> +       struct r600_resource *rsrc = (struct r600_resource*)src;
>> +
>> +       /* see if we use dword or byte copy */
>> +       if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) {
>> +               size >>= 2;
>> +               command = 0x00 << 20;
>> +               shift = 2;
>> +       } else {
>> +               command = 0x40 << 20;
>> +               shift = 0;
>> +       }
>> +       ncopy = (size / 0xfffff) + !!(size % 0xfffff);
>> +
>> +       r600_need_dma_space(rctx, ncopy * 5);
>> +       for (i = 0; i < ncopy; i++) {
>> +               csize = size < 0xfffff ? size : 0xfffff;
>> +               cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize) | command;
>> +               cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xffffffff;
>> +               cs->dma_buf[cs->dma_cdw++] = src_offset & 0xffffffff;
>> +               cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff;
>> +               cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff;
>> +               rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA);
>> +               rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA);
>> +               dst_offset += csize << shift;
>> +               src_offset += csize << shift;
>> +               size -= csize;
>> +       }
>> +}
>> diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
>> index d9dba95..d8c50b6 100644
>> --- a/src/gallium/drivers/r600/evergreend.h
>> +++ b/src/gallium/drivers/r600/evergreend.h
>> @@ -2317,4 +2317,20 @@
>>  #define   G_028AA8_SWITCH_ON_EOP(x)                    (((x) >> 17) & 0x1)
>>  #define   C_028AA8_SWITCH_ON_EOP                       0xFFFDFFFF
>>
>> +/* async DMA packets */
>> +#define DMA_PACKET(cmd, t, s, n)    ((((cmd) & 0xF) << 28) |    \
>> +                                    (((t) & 0x1) << 23) |       \
>> +                                    (((s) & 0x1) << 22) |       \
>> +                                    (((n) & 0xFFFFF) << 0))
>> +/* async DMA Packet types */
>> +#define    DMA_PACKET_WRITE                     0x2
>> +#define    DMA_PACKET_COPY                      0x3
>> +#define    DMA_PACKET_INDIRECT_BUFFER           0x4
>> +#define    DMA_PACKET_SEMAPHORE                 0x5
>> +#define    DMA_PACKET_FENCE                     0x6
>> +#define    DMA_PACKET_TRAP                      0x7
>> +#define    DMA_PACKET_SRBM_WRITE                0x9
>> +#define    DMA_PACKET_CONSTANT_FILL             0xd
>> +#define    DMA_PACKET_NOP                       0xf
>> +
>>  #endif
>> diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
>> index 260536e..90c756a 100644
>> --- a/src/gallium/drivers/r600/r600.h
>> +++ b/src/gallium/drivers/r600/r600.h
>> @@ -169,6 +169,19 @@ void r600_flush_emit(struct r600_context *ctx);
>>  void r600_context_streamout_begin(struct r600_context *ctx);
>>  void r600_context_streamout_end(struct r600_context *ctx);
>>  void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in);
>> +void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw);
>> +void r600_dma_copy(struct r600_context *rctx,
>> +               struct pipe_resource *dst,
>> +               struct pipe_resource *src,
>> +               unsigned long dst_offset,
>> +               unsigned long src_offset,
>> +               unsigned long size);
>> +void evergreen_dma_copy(struct r600_context *rctx,
>> +               struct pipe_resource *dst,
>> +               struct pipe_resource *src,
>> +               unsigned long dst_offset,
>> +               unsigned long src_offset,
>> +               unsigned long size);
>>  void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block, unsigned pkt_flags);
>>
>>  int evergreen_context_init(struct r600_context *ctx);
>> diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
>> index 87b90ac..dcffda2 100644
>> --- a/src/gallium/drivers/r600/r600_blit.c
>> +++ b/src/gallium/drivers/r600/r600_blit.c
>> @@ -503,18 +503,26 @@ static void r600_clear_depth_stencil(struct pipe_context *ctx,
>>         r600_blitter_end(ctx);
>>  }
>>
>> -void r600_copy_buffer(struct pipe_context *ctx, struct
>> -                     pipe_resource *dst, unsigned dstx,
>> -                     struct pipe_resource *src, const struct pipe_box *src_box)
>> +void r600_copy_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
>> +                     unsigned dstx, struct pipe_resource *src, const struct pipe_box *src_box)
>>  {
>>         struct r600_context *rctx = (struct r600_context*)ctx;
>>
>> -       if (rctx->screen->has_streamout &&
>> -           /* Require dword alignment. */
>> -           dstx % 4 == 0 && src_box->x % 4 == 0 && src_box->width % 4 == 0) {
>> -               r600_blitter_begin(ctx, R600_COPY_BUFFER);
>> -               util_blitter_copy_buffer(rctx->blitter, dst, dstx, src, src_box->x, src_box->width);
>> -               r600_blitter_end(ctx);
>> +       if (rctx->chip_class >= EVERGREEN) {
>> +               evergreen_dma_copy(rctx, dst, src, dstx, src_box->x, src_box->width);
>> +               return;
>> +       }
>> +
>> +       /* Require dword alignment. */
>> +       if (rctx->screen->has_streamout && dstx % 4 == 0 && src_box->x % 4 == 0 && src_box->width % 4 == 0) {
>> +               /* r6xx seems to be lockup prone with async dma just keep using gfx for it */
>> +               if (rctx->chip_class >= R700) {
>> +                       r600_dma_copy(rctx, dst, src, dstx, src_box->x, src_box->width);
>> +               } else {
>> +                       r600_blitter_begin(ctx, R600_COPY_BUFFER);
>> +                       util_blitter_copy_buffer(rctx->blitter, dst, dstx, src, src_box->x, src_box->width);
>> +                       r600_blitter_end(ctx);
>> +               }
>>         } else {
>>                 util_resource_copy_region(ctx, dst, 0, dstx, 0, 0, src, 0, src_box);
>>         }
>> @@ -530,13 +538,13 @@ static bool util_format_is_subsampled_2x1_32bpp(enum pipe_format format)
>>                desc->block.bits == 32;
>>  }
>>
>> -static void r600_resource_copy_region(struct pipe_context *ctx,
>> -                                     struct pipe_resource *dst,
>> -                                     unsigned dst_level,
>> -                                     unsigned dstx, unsigned dsty, unsigned dstz,
>> -                                     struct pipe_resource *src,
>> -                                     unsigned src_level,
>> -                                     const struct pipe_box *src_box)
>> +static void r600_resource_copy_region_gfx_blit(struct pipe_context *ctx,
>> +                                               struct pipe_resource *dst,
>> +                                               unsigned dst_level,
>> +                                               unsigned dstx, unsigned dsty, unsigned dstz,
>> +                                               struct pipe_resource *src,
>> +                                               unsigned src_level,
>> +                                               const struct pipe_box *src_box)
>>  {
>>         struct r600_context *rctx = (struct r600_context *)ctx;
>>         struct r600_texture *rsrc = (struct r600_texture*)src;
>> @@ -547,14 +555,6 @@ static void r600_resource_copy_region(struct pipe_context *ctx,
>>         struct pipe_box sbox;
>>         bool copy_all_samples;
>>
>> -       /* Handle buffers first. */
>> -       if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
>> -               r600_copy_buffer(ctx, dst, dstx, src, src_box);
>> -               return;
>> -       }
>> -
>> -       assert(u_max_sample(dst) == u_max_sample(src));
>> -
>>         /* The driver doesn't decompress resources automatically while
>>          * u_blitter is rendering. */
>>         if (!r600_decompress_subresource(ctx, src, src_level,
>> @@ -660,6 +660,46 @@ static void r600_resource_copy_region(struct pipe_context *ctx,
>>         pipe_sampler_view_reference(&src_view, NULL);
>>  }
>>
>> +static void r600_resource_copy_region(struct pipe_context *ctx,
>> +                                       struct pipe_resource *dst,
>> +                                       unsigned dst_level,
>> +                                       unsigned dstx, unsigned dsty, unsigned dstz,
>> +                                       struct pipe_resource *src,
>> +                                       unsigned src_level,
>> +                                       const struct pipe_box *src_box)
>> +{
>> +       /* Handle buffers first. */
>> +       if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
>> +               r600_copy_buffer(ctx, dst, dstx, src, src_box);
>> +               return;
>> +       }
>> +
>> +       assert(u_max_sample(dst) == u_max_sample(src));
>> +
>> +       r600_resource_copy_region_gfx_blit(ctx, dst, dst_level, dstx, dsty, dstz,
>> +                                               src, src_level, src_box);
>> +}
>> +
>> +static void evergreen_resource_copy_region(struct pipe_context *ctx,
>> +                                       struct pipe_resource *dst,
>> +                                       unsigned dst_level,
>> +                                       unsigned dstx, unsigned dsty, unsigned dstz,
>> +                                       struct pipe_resource *src,
>> +                                       unsigned src_level,
>> +                                       const struct pipe_box *src_box)
>> +{
>> +       /* Handle buffers first. */
>> +       if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
>> +               r600_copy_buffer(ctx, dst, dstx, src, src_box);
>> +               return;
>> +       }
>> +
>> +       assert(u_max_sample(dst) == u_max_sample(src));
>> +
>> +       r600_resource_copy_region_gfx_blit(ctx, dst, dst_level, dstx, dsty, dstz,
>> +                                               src, src_level, src_box);
>> +}
>> +
>>  /* For MSAA integer resolving to work, we change the format to NORM using this function. */
>>  static enum pipe_format int_to_norm_format(enum pipe_format format)
>>  {
>> @@ -792,6 +832,10 @@ void r600_init_blit_functions(struct r600_context *rctx)
>>         rctx->context.clear = r600_clear;
>>         rctx->context.clear_render_target = r600_clear_render_target;
>>         rctx->context.clear_depth_stencil = r600_clear_depth_stencil;
>> -       rctx->context.resource_copy_region = r600_resource_copy_region;
>> +       if (rctx->chip_class >= EVERGREEN) {
>> +               rctx->context.resource_copy_region = evergreen_resource_copy_region;
>> +       } else {
>> +               rctx->context.resource_copy_region = r600_resource_copy_region;
>> +       }
>>         rctx->context.blit = r600_blit;
>>  }
>> diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
>> index 163917e..7b8c499 100644
>> --- a/src/gallium/drivers/r600/r600_hw_context.c
>> +++ b/src/gallium/drivers/r600/r600_hw_context.c
>> @@ -413,7 +413,17 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
>>
>>         /* Flush if there's not enough space. */
>>         if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
>> -               r600_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
>> +               r600_flush(&ctx->context, NULL, RADEON_FLUSH_GFX | RADEON_FLUSH_ASYNC);
>> +       }
>> +}
>> +
>> +void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw)
>> +{
>> +       /* The number of dwords we already used in the DMA so far. */
>> +       num_dw += ctx->cs->dma_cdw;
>> +       /* Flush if there's not enough space. */
>> +       if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
>> +               r600_flush(&ctx->context, NULL, RADEON_FLUSH_DMA | RADEON_FLUSH_ASYNC);
>>         }
>>  }
>>
>> @@ -1050,3 +1060,35 @@ void r600_context_streamout_end(struct r600_context *ctx)
>>         ctx->flags |= R600_CONTEXT_WAIT_IDLE | R600_CONTEXT_FLUSH_AND_INV;
>>         ctx->num_cs_dw_streamout_end = 0;
>>  }
>> +
>> +void r600_dma_copy(struct r600_context *rctx,
>> +               struct pipe_resource *dst,
>> +               struct pipe_resource *src,
>> +               unsigned long dst_offset,
>> +               unsigned long src_offset,
>> +               unsigned long size)
>> +{
>> +       struct radeon_winsys_cs *cs = rctx->cs;
>> +       unsigned i, ncopy, csize, shift;
>> +       struct r600_resource *rdst = (struct r600_resource*)dst;
>> +       struct r600_resource *rsrc = (struct r600_resource*)src;
>> +
>> +       size >>= 2;
>> +       shift = 2;
>> +       ncopy = (size / 0xffff) + !!(size % 0xffff);
>> +
>> +       r600_need_dma_space(rctx, ncopy * 5);
>> +       for (i = 0; i < ncopy; i++) {
>> +               csize = size < 0xffff ? size : 0xffff;
>> +               cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize);
>> +               cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xfffffffc;
>> +               cs->dma_buf[cs->dma_cdw++] = src_offset & 0xfffffffc;
>> +               cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff;
>> +               cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff;
>> +               rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA);
>> +               rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA);
>> +               dst_offset += csize << shift;
>> +               src_offset += csize << shift;
>> +               size -= csize;
>> +       }
>> +}
>> diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
>> index 4ea669c..acf0b2a 100644
>> --- a/src/gallium/drivers/r600/r600_pipe.c
>> +++ b/src/gallium/drivers/r600/r600_pipe.c
>> @@ -145,12 +145,21 @@ void r600_flush(struct pipe_context *ctx, struct pipe_fence_handle **fence,
>>  static void r600_flush_from_st(struct pipe_context *ctx,
>>                                struct pipe_fence_handle **fence)
>>  {
>> -       r600_flush(ctx, fence, 0);
>> +       r600_flush(ctx, fence, RADEON_FLUSH_GFX | RADEON_FLUSH_DMA);
>>  }
>>
>>  static void r600_flush_from_winsys(void *ctx, unsigned flags)
>>  {
>> -       r600_flush((struct pipe_context*)ctx, NULL, flags);
>> +       struct r600_context *rctx = ctx;
>> +
>> +       if (flags & RADEON_FLUSH_DMA) {
>> +               rctx->ws->cs_flush(rctx->cs, RADEON_FLUSH_DMA | (flags & RADEON_FLUSH_ASYNC));
>> +               flags &= ~RADEON_FLUSH_DMA;
>> +       }
>> +
>> +       if (flags & (RADEON_FLUSH_GFX | RADEON_FLUSH_COMPUTE)) {
>> +               r600_flush((struct pipe_context*)ctx, NULL, flags);
>> +       }
>>  }
>>
>>  static void r600_destroy_context(struct pipe_context *context)
>> diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
>> index 934a6f5..0be4f34 100644
>> --- a/src/gallium/drivers/r600/r600_pipe.h
>> +++ b/src/gallium/drivers/r600/r600_pipe.h
>> @@ -833,7 +833,7 @@ static INLINE unsigned r600_context_bo_reloc(struct r600_context *ctx, struct r6
>>                                              enum radeon_bo_usage usage)
>>  {
>>         assert(usage);
>> -       return ctx->ws->cs_add_reloc(ctx->cs, rbo->cs_buf, usage, rbo->domains) * 4;
>> +       return ctx->ws->cs_add_reloc(ctx->cs, rbo->cs_buf, usage, rbo->domains, RADEON_RING_PM4) * 4;
>>  }
>>
>>  static INLINE void r600_write_value(struct radeon_winsys_cs *cs, unsigned value)
>> diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c
>> index 0f67739..4ff520e 100644
>> --- a/src/gallium/drivers/r600/r600_texture.c
>> +++ b/src/gallium/drivers/r600/r600_texture.c
>> @@ -830,7 +830,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
>>                 if (usage & PIPE_TRANSFER_READ) {
>>                         r600_copy_to_staging_texture(ctx, trans);
>>                         /* Always referenced in the blit. */
>> -                       r600_flush(ctx, NULL, 0);
>> +                       r600_flush(ctx, NULL, RADEON_FLUSH_GFX | RADEON_FLUSH_DMA);
>>                 }
>>         } else {
>>                 trans->transfer.stride = rtex->surface.level[level].pitch_bytes;
>> diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
>> index 69bfd7a..2b17664 100644
>> --- a/src/gallium/drivers/r600/r600d.h
>> +++ b/src/gallium/drivers/r600/r600d.h
>> @@ -3647,4 +3647,20 @@
>>  #define SQ_TEX_INST_SAMPLE_C_G_LB      0x1E
>>  #define SQ_TEX_INST_SAMPLE_C_G_LZ      0x1F
>>
>> +/* async DMA packets */
>> +#define DMA_PACKET(cmd, t, s, n)       ((((cmd) & 0xF) << 28) |        \
>> +                                       (((t) & 0x1) << 23) |           \
>> +                                       (((s) & 0x1) << 22) |           \
>> +                                       (((n) & 0xFFFF) << 0))
>> +/* async DMA Packet types */
>> +#define DMA_PACKET_WRITE               0x2
>> +#define DMA_PACKET_COPY                        0x3
>> +#define DMA_PACKET_INDIRECT_BUFFER     0x4
>> +#define DMA_PACKET_SEMAPHORE           0x5
>> +#define DMA_PACKET_FENCE               0x6
>> +#define DMA_PACKET_TRAP                        0x7
>> +#define DMA_PACKET_CONSTANT_FILL       0xd /* 7xx only */
>> +#define DMA_PACKET_NOP                 0xf
>> +
>> +
>>  #endif
>> diff --git a/src/gallium/drivers/radeonsi/r600_hw_context.c b/src/gallium/drivers/radeonsi/r600_hw_context.c
>> index bd348f9..ecddd44 100644
>> --- a/src/gallium/drivers/radeonsi/r600_hw_context.c
>> +++ b/src/gallium/drivers/radeonsi/r600_hw_context.c
>> @@ -144,7 +144,7 @@ void si_need_cs_space(struct r600_context *ctx, unsigned num_dw,
>>
>>         /* Flush if there's not enough space. */
>>         if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
>> -               radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
>> +               radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_GFX | RADEON_FLUSH_ASYNC);
>>         }
>>  }
>>
>> diff --git a/src/gallium/drivers/radeonsi/r600_hw_context_priv.h b/src/gallium/drivers/radeonsi/r600_hw_context_priv.h
>> index c2a15eb..12e3a1e 100644
>> --- a/src/gallium/drivers/radeonsi/r600_hw_context_priv.h
>> +++ b/src/gallium/drivers/radeonsi/r600_hw_context_priv.h
>> @@ -39,7 +39,7 @@ static INLINE unsigned r600_context_bo_reloc(struct r600_context *ctx, struct si
>>                                              enum radeon_bo_usage usage)
>>  {
>>         assert(usage);
>> -       return ctx->ws->cs_add_reloc(ctx->cs, rbo->cs_buf, usage, rbo->domains) * 4;
>> +       return ctx->ws->cs_add_reloc(ctx->cs, rbo->cs_buf, usage, rbo->domains, RADEON_RING_PM4) * 4;
>>  }
>>
>>  #endif
>> diff --git a/src/gallium/drivers/radeonsi/r600_texture.c b/src/gallium/drivers/radeonsi/r600_texture.c
>> index de46640..f0e0bd7 100644
>> --- a/src/gallium/drivers/radeonsi/r600_texture.c
>> +++ b/src/gallium/drivers/radeonsi/r600_texture.c
>> @@ -374,7 +374,7 @@ static void *si_texture_transfer_map(struct pipe_context *ctx,
>>                 if (usage & PIPE_TRANSFER_READ) {
>>                         r600_copy_to_staging_texture(ctx, trans);
>>                         /* Always referenced in the blit. */
>> -                       radeonsi_flush(ctx, NULL, 0);
>> +                       radeonsi_flush(ctx, NULL, RADEON_FLUSH_GFX);
>>                 }
>>         } else {
>>                 trans->transfer.stride = rtex->surface.level[level].pitch_bytes;
>> diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.c b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
>> index 4bab97a..782c695 100644
>> --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.c
>> +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
>> @@ -160,12 +160,21 @@ void radeonsi_flush(struct pipe_context *ctx, struct pipe_fence_handle **fence,
>>  static void r600_flush_from_st(struct pipe_context *ctx,
>>                                struct pipe_fence_handle **fence)
>>  {
>> -       radeonsi_flush(ctx, fence, 0);
>> +       radeonsi_flush(ctx, fence, RADEON_FLUSH_GFX | RADEON_FLUSH_DMA);
>>  }
>>
>>  static void r600_flush_from_winsys(void *ctx, unsigned flags)
>>  {
>> -       radeonsi_flush((struct pipe_context*)ctx, NULL, flags);
>> +       struct r600_context *rctx = ctx;
>> +
>> +       if (flags & RADEON_FLUSH_DMA) {
>> +               rctx->ws->cs_flush(rctx->cs, RADEON_FLUSH_DMA | (flags & RADEON_FLUSH_ASYNC));
>> +               flags &= ~RADEON_FLUSH_DMA;
>> +       }
>> +
>> +       if (flags & (RADEON_FLUSH_GFX | RADEON_FLUSH_COMPUTE)) {
>> +               radeonsi_flush((struct pipe_context*)ctx, NULL, flags);
>> +       }
>>  }
>>
>>  static void r600_destroy_context(struct pipe_context *context)
>> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>> index 897e962..5818b6a 100644
>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>> @@ -413,7 +413,7 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
>>                   *
>>                   * Only check whether the buffer is being used for write. */
>>                  if (radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
>> -                    cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
>> +                    cs->flush_cs(cs->flush_data, RADEON_FLUSH_GFX | RADEON_FLUSH_DMA | RADEON_FLUSH_ASYNC);
>>                      return NULL;
>>                  }
>>
>> @@ -423,7 +423,7 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
>>                  }
>>              } else {
>>                  if (radeon_bo_is_referenced_by_cs(cs, bo)) {
>> -                    cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
>> +                    cs->flush_cs(cs->flush_data, RADEON_FLUSH_GFX | RADEON_FLUSH_DMA | RADEON_FLUSH_ASYNC);
>>                      return NULL;
>>                  }
>>
>> @@ -442,14 +442,14 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
>>                   *
>>                   * Only check whether the buffer is being used for write. */
>>                  if (radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
>> -                    cs->flush_cs(cs->flush_data, 0);
>> +                    cs->flush_cs(cs->flush_data, RADEON_FLUSH_GFX | RADEON_FLUSH_DMA);
>>                  }
>>                  radeon_bo_wait((struct pb_buffer*)bo,
>>                                 RADEON_USAGE_WRITE);
>>              } else {
>>                  /* Mapping for write. */
>>                  if (radeon_bo_is_referenced_by_cs(cs, bo)) {
>> -                    cs->flush_cs(cs->flush_data, 0);
>> +                    cs->flush_cs(cs->flush_data, RADEON_FLUSH_GFX | RADEON_FLUSH_DMA);
>>                  } else {
>>                      /* Try to avoid busy-waiting in radeon_bo_wait. */
>>                      if (p_atomic_read(&bo->num_active_ioctls))
>> @@ -763,7 +763,7 @@ static void radeon_bo_set_tiling(struct pb_buffer *_buf,
>>      /* Tiling determines how DRM treats the buffer data.
>>       * We must flush CS when changing it if the buffer is referenced. */
>>      if (cs && radeon_bo_is_referenced_by_cs(cs, bo)) {
>> -        cs->flush_cs(cs->flush_data, 0);
>> +        cs->flush_cs(cs->flush_data, RADEON_FLUSH_GFX | RADEON_FLUSH_DMA);
>>      }
>>
>>      while (p_atomic_read(&bo->num_active_ioctls)) {
>> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
>> index 82ea141..431e648 100644
>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
>> @@ -60,6 +60,8 @@ struct radeon_bo {
>>
>>      /* how many command streams is this bo referenced in? */
>>      int num_cs_references;
>> +    int write_pending;
>> +    int read_pending;
>>
>>      /* how many command streams, which are being emitted in a separate
>>       * thread, is this bo referenced in? */
>> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>> index f9be961..685f098 100644
>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>> @@ -92,9 +92,15 @@
>>  #define RADEON_CS_RING_COMPUTE      1
>>  #endif
>>
>> +#ifndef RADEON_CS_RING_DMA
>> +#define RADEON_CS_RING_DMA          2
>> +#endif
>> +
>>
>>  #define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
>>
>> +static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags);
>> +
>>  static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
>>                                        struct radeon_drm_winsys *ws)
>>  {
>> @@ -137,6 +143,12 @@ static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
>>
>>      for (i = 0; i < csc->crelocs; i++) {
>>          p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
>> +        if (csc->relocs[i].read_domains) {
>> +            p_atomic_dec(&csc->relocs_bo[i]->read_pending);
>> +        }
>> +        if (csc->relocs[i].write_domain) {
>> +            p_atomic_dec(&csc->relocs_bo[i]->write_pending);
>> +        }
>>          radeon_bo_reference(&csc->relocs_bo[i], NULL);
>>      }
>>
>> @@ -182,11 +194,30 @@ static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
>>          FREE(cs);
>>          return NULL;
>>      }
>> +    if (cs->ws->info.has_dma) {
>> +        if (!radeon_init_cs_context(&cs->dma1, cs->ws)) {
>> +            FREE(cs);
>> +            return NULL;
>> +        }
>> +        if (!radeon_init_cs_context(&cs->dma2, cs->ws)) {
>> +            FREE(cs);
>> +            return NULL;
>> +        }
>> +    }
>>
>>      /* Set the first command buffer as current. */
>>      cs->csc = &cs->csc1;
>>      cs->cst = &cs->csc2;
>>      cs->base.buf = cs->csc->buf;
>> +    if (cs->ws->info.has_dma) {
>> +        cs->dmac = &cs->dma1;
>> +        cs->dmat = &cs->dma2;
>> +        cs->base.dma_buf = cs->dmac->buf;
>> +    } else {
>> +        cs->dmac = NULL;
>> +        cs->dmat = NULL;
>> +        cs->base.dma_buf = NULL;
>> +    }
>>
>>      p_atomic_inc(&ws->num_cs);
>>      if (cs->ws->num_cpus > 1 && debug_get_option_thread())
>> @@ -196,13 +227,20 @@ static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
>>
>>  #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
>>
>> -static INLINE void update_reloc_domains(struct drm_radeon_cs_reloc *reloc,
>> +static INLINE void update_reloc_domains(struct radeon_bo *bo,
>> +                                        struct drm_radeon_cs_reloc *reloc,
>>                                          enum radeon_bo_domain rd,
>>                                          enum radeon_bo_domain wd,
>>                                          enum radeon_bo_domain *added_domains)
>>  {
>>      *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
>>
>> +    if (!reloc->read_domains && rd) {
>> +        p_atomic_inc(&bo->read_pending);
>> +    }
>> +    if (!reloc->write_domain && wd) {
>> +        p_atomic_inc(&bo->write_pending);
>> +    }
>>      reloc->read_domains |= rd;
>>      reloc->write_domain |= wd;
>>  }
>> @@ -244,38 +282,44 @@ int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
>>      return -1;
>>  }
>>
>> -static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
>> +static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
>>                                   struct radeon_bo *bo,
>>                                   enum radeon_bo_usage usage,
>>                                   enum radeon_bo_domain domains,
>> -                                 enum radeon_bo_domain *added_domains)
>> +                                 enum radeon_bo_domain *added_domains,
>> +                                 enum radeon_ring_type type)
>>  {
>> +    struct radeon_cs_context *csc = cs->csc;
>>      struct drm_radeon_cs_reloc *reloc;
>> -    unsigned i;
>> +    unsigned index;
>>      unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
>>      enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
>>      enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
>>
>> -    if (csc->is_handle_added[hash]) {
>> -        i = csc->reloc_indices_hashlist[hash];
>> -        reloc = &csc->relocs[i];
>> -        if (reloc->handle == bo->handle) {
>> -            update_reloc_domains(reloc, rd, wd, added_domains);
>> -            return i;
>> +    index = radeon_get_reloc(cs->csc, bo);
>> +    if (index != -1) {
>> +        if (type == RADEON_RING_DMA) {
>> +            /* We need to flush the GFX CS so that we can add the bo to the
>> +             * DMA ring; inter-ring synchronization is then handled by the kernel.
>> +             */
>> +            cs->flush_cs(cs->flush_data, RADEON_FLUSH_GFX | RADEON_FLUSH_ASYNC);
>> +            /* Fall through; do not return!
>> +             * For DMA we don't need to check whether we already added the
>> +             * relocation, as we need to add relocations in order and duplicate them.
>> +             */
>> +        } else {
>> +            reloc = &csc->relocs[index];
>> +            update_reloc_domains(bo, reloc, rd, wd, added_domains);
>> +            return index;
>>          }
>> +    }
>>
>> -        /* Hash collision, look for the BO in the list of relocs linearly. */
>> -        for (i = csc->crelocs; i != 0;) {
>> -            --i;
>> -            reloc = &csc->relocs[i];
>> -            if (reloc->handle == bo->handle) {
>> -                update_reloc_domains(reloc, rd, wd, added_domains);
>> -
>> -                csc->reloc_indices_hashlist[hash] = i;
>> -                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
>> -                return i;
>> -            }
>> -        }
>> +    /* See the comment above: for DMA we don't need to check for duplicates. */
>> +    if (type == RADEON_RING_DMA) {
>> +        csc = cs->dmac;
>> +    } else if (cs->ws->info.has_dma && cs->dmac->is_handle_added[hash]) {
>> +        /* need to flush dma */
>> +        cs->flush_cs(cs->flush_data, RADEON_FLUSH_DMA | RADEON_FLUSH_ASYNC);
>>      }
>>
>>      /* New relocation, check if the backing array is large enough. */
>> @@ -296,6 +340,12 @@ static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
>>      csc->relocs_bo[csc->crelocs] = NULL;
>>      radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
>>      p_atomic_inc(&bo->num_cs_references);
>> +    if (rd) {
>> +        p_atomic_inc(&bo->read_pending);
>> +    }
>> +    if (wd) {
>> +        p_atomic_inc(&bo->write_pending);
>> +    }
>>      reloc = &csc->relocs[csc->crelocs];
>>      reloc->handle = bo->handle;
>>      reloc->read_domains = rd;
>> @@ -314,13 +364,13 @@ static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
>>  static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
>>                                          struct radeon_winsys_cs_handle *buf,
>>                                          enum radeon_bo_usage usage,
>> -                                        enum radeon_bo_domain domains)
>> +                                        enum radeon_bo_domain domains,
>> +                                        enum radeon_ring_type type)
>>  {
>>      struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
>>      struct radeon_bo *bo = (struct radeon_bo*)buf;
>>      enum radeon_bo_domain added_domains;
>> -
>> -    unsigned index = radeon_add_reloc(cs->csc, bo, usage, domains, &added_domains);
>> +    unsigned index = radeon_add_reloc(cs, bo, usage, domains, &added_domains, type);
>>
>>      if (added_domains & RADEON_DOMAIN_GTT)
>>          cs->csc->used_gart += bo->base.size;
>> @@ -330,7 +380,7 @@ static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
>>      return index;
>>  }
>>
>> -static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
>> +static boolean radeon_drm_pm4_validate(struct radeon_winsys_cs *rcs)
>>  {
>>      struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
>>      boolean status =
>> @@ -347,13 +397,19 @@ static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
>>
>>          for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
>>              p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
>> +            if (cs->csc->relocs[i].read_domains) {
>> +                p_atomic_dec(&cs->csc->relocs_bo[i]->read_pending);
>> +            }
>> +            if (cs->csc->relocs[i].write_domain) {
>> +                p_atomic_dec(&cs->csc->relocs_bo[i]->write_pending);
>> +            }
>>              radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
>>          }
>>          cs->csc->crelocs = cs->csc->validated_crelocs;
>>
>>          /* Flush if there are any relocs. Clean up otherwise. */
>>          if (cs->csc->crelocs) {
>> -            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
>> +            cs->flush_cs(cs->flush_data, RADEON_FLUSH_GFX | RADEON_FLUSH_ASYNC);
>>          } else {
>>              radeon_cs_context_cleanup(cs->csc);
>>
>> @@ -366,12 +422,65 @@ static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
>>      return status;
>>  }
>>
>> +static boolean radeon_drm_dma_validate(struct radeon_winsys_cs *rcs)
>> +{
>> +    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
>> +    boolean status =
>> +        cs->dmac->used_gart < cs->ws->info.gart_size * 0.8 &&
>> +        cs->dmac->used_vram < cs->ws->info.vram_size * 0.8;
>> +
>> +    if (status) {
>> +        cs->dmac->validated_crelocs = cs->dmac->crelocs;
>> +    } else {
>> +        /* Remove lately-added relocations. The validation failed with them
>> +         * and the CS is about to be flushed because of that. Keep only
>> +         * the already-validated relocations. */
>> +        unsigned i;
>> +
>> +        for (i = cs->dmac->validated_crelocs; i < cs->dmac->crelocs; i++) {
>> +            p_atomic_dec(&cs->dmac->relocs_bo[i]->num_cs_references);
>> +            if (cs->dmac->relocs[i].read_domains) {
>> +                p_atomic_dec(&cs->dmac->relocs_bo[i]->read_pending);
>> +            }
>> +            if (cs->dmac->relocs[i].write_domain) {
>> +                p_atomic_dec(&cs->dmac->relocs_bo[i]->write_pending);
>> +            }
>> +            radeon_bo_reference(&cs->dmac->relocs_bo[i], NULL);
>> +        }
>> +        cs->csc->crelocs = cs->dmac->validated_crelocs;
>> +
>> +        /* Flush if there are any relocs. Clean up otherwise. */
>> +        if (cs->dmac->crelocs) {
>> +            cs->flush_cs(cs->flush_data, RADEON_FLUSH_DMA | RADEON_FLUSH_ASYNC);
>> +        } else {
>> +            radeon_cs_context_cleanup(cs->dmac);
>> +
>> +            assert(cs->base.dma_cdw == 0);
>> +            if (cs->base.dma_cdw != 0) {
>> +                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
>> +            }
>> +        }
>> +    }
>> +    return status;
>> +}
>> +
>> +static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
>> +{
>> +    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
>> +    boolean r;
>> +
>> +    r = radeon_drm_pm4_validate(rcs);
>> +    if (cs->ws->info.has_dma) {
>> +        r |= radeon_drm_dma_validate(rcs);
>> +    }
>> +    return r;
>> +}
>> +
>>  static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
>>                                        struct radeon_winsys_cs_handle *buf)
>>  {
>>      struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
>>      struct radeon_bo *bo = (struct radeon_bo*)buf;
>> -
>>      unsigned index = radeon_get_reloc(cs->csc, bo);
>>
>>      if (index == -1) {
>> @@ -416,7 +525,11 @@ static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
>>          pipe_semaphore_wait(&cs->flush_queued);
>>          if (cs->kill_thread)
>>              break;
>> -        radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
>> +        if (cs->flush_started & RADEON_FLUSH_GFX) {
>> +            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
>> +        } else {
>> +            radeon_drm_cs_emit_ioctl_oneshot(cs->dmat);
>> +        }
>>          pipe_semaphore_signal(&cs->flush_completed);
>>      }
>>      pipe_semaphore_signal(&cs->flush_completed);
>> @@ -434,25 +547,48 @@ void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
>>
>>  DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)
>>
>> -static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
>> +static void radeon_drm_dma_flush(struct radeon_drm_cs *cs, unsigned flags)
>>  {
>> -    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
>> -    struct radeon_cs_context *tmp;
>> +    /* If the CS is not empty or overflowed, emit it in a separate thread. */
>> +    if (cs->base.dma_cdw && cs->base.dma_cdw <= RADEON_MAX_CMDBUF_DWORDS && !debug_get_option_noop()) {
>> +        unsigned i, crelocs = cs->dmat->crelocs;
>>
>> -    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
>> -       fprintf(stderr, "radeon: command stream overflowed\n");
>> -    }
>> +        /* The DMA IB must be 256-byte aligned; pad it with NOPs. */
>> +        for (i = cs->base.dma_cdw; i < align(cs->base.dma_cdw, 64); i++) {
>> +            cs->dmat->buf[i] = 0xf0000000;
>> +        }
>> +        cs->dmat->chunks[0].length_dw = align(cs->base.dma_cdw, 64);
>>
>> -    radeon_drm_cs_sync_flush(cs);
>> +        for (i = 0; i < crelocs; i++) {
>> +            /* Update the number of active asynchronous CS ioctls for the buffer. */
>> +            p_atomic_inc(&cs->dmat->relocs_bo[i]->num_active_ioctls);
>> +        }
>>
>> -    /* Flip command streams. */
>> -    tmp = cs->csc;
>> -    cs->csc = cs->cst;
>> -    cs->cst = tmp;
>> +        cs->dmat->flags[0] = 0;
>> +        cs->dmat->flags[1] = RADEON_CS_RING_DMA;
>> +        cs->dmat->cs.num_chunks = 3;
>> +        if (cs->ws->info.r600_virtual_address) {
>> +            cs->dmat->flags[0] |= RADEON_CS_USE_VM;
>> +        }
>> +        if (cs->thread && (flags & RADEON_FLUSH_ASYNC)) {
>> +            cs->flush_started = RADEON_FLUSH_DMA;
>> +            pipe_semaphore_signal(&cs->flush_queued);
>> +        } else {
>> +            radeon_drm_cs_emit_ioctl_oneshot(cs->dmat);
>> +        }
>> +    } else {
>> +        radeon_cs_context_cleanup(cs->dmat);
>> +    }
>> +
>> +    /* Prepare a new CS. */
>> +    cs->base.dma_buf = cs->dmac->buf;
>> +    cs->base.dma_cdw = 0;
>> +}
>>
>> +static void radeon_drm_pm4_flush(struct radeon_drm_cs *cs, unsigned flags)
>> +{
>>      /* If the CS is not empty or overflowed, emit it in a separate thread. */
>> -    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS &&
>> -       !debug_get_option_noop()) {
>> +    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS && !debug_get_option_noop()) {
>>          unsigned i, crelocs = cs->cst->crelocs;
>>
>>          cs->cst->chunks[0].length_dw = cs->base.cdw;
>> @@ -478,9 +614,8 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
>>              cs->cst->cs.num_chunks = 3;
>>          }
>>
>> -        if (cs->thread &&
>> -            (flags & RADEON_FLUSH_ASYNC)) {
>> -            cs->flush_started = 1;
>> +        if (cs->thread && (flags & RADEON_FLUSH_ASYNC)) {
>> +            cs->flush_started = RADEON_FLUSH_GFX;
>>              pipe_semaphore_signal(&cs->flush_queued);
>>          } else {
>>              radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
>> @@ -494,6 +629,33 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
>>      cs->base.cdw = 0;
>>  }
>>
>> +static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
>> +{
>> +    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
>> +    struct radeon_cs_context *tmp;
>> +
>> +    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
>> +       fprintf(stderr, "radeon: command stream overflowed\n");
>> +    }
>> +
>> +    radeon_drm_cs_sync_flush(cs);
>> +
>> +    /* Flip command streams. */
>> +    if (cs->ws->info.has_dma && flags & RADEON_FLUSH_DMA) {
>> +        tmp = cs->dmac;
>> +        cs->dmac = cs->dmat;
>> +        cs->dmat = tmp;
>> +        radeon_drm_dma_flush(cs, flags);
>> +    }
>> +    if (flags & (RADEON_FLUSH_GFX | RADEON_FLUSH_COMPUTE)) {
>> +        tmp = cs->csc;
>> +        cs->csc = cs->cst;
>> +        cs->cst = tmp;
>> +        radeon_drm_pm4_flush(cs, flags);
>> +    }
>> +
>> +}
>> +
>>  static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
>>  {
>>      struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
>> @@ -508,9 +670,17 @@ static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
>>      pipe_semaphore_destroy(&cs->flush_completed);
>>      radeon_cs_context_cleanup(&cs->csc1);
>>      radeon_cs_context_cleanup(&cs->csc2);
>> +    if (cs->ws->info.has_dma) {
>> +        radeon_cs_context_cleanup(&cs->dma1);
>> +        radeon_cs_context_cleanup(&cs->dma2);
>> +    }
>>      p_atomic_dec(&cs->ws->num_cs);
>>      radeon_destroy_cs_context(&cs->csc1);
>>      radeon_destroy_cs_context(&cs->csc2);
>> +    if (cs->ws->info.has_dma) {
>> +        radeon_destroy_cs_context(&cs->dma1);
>> +        radeon_destroy_cs_context(&cs->dma2);
>> +    }
>>      FREE(cs);
>>  }
>>
>> @@ -527,20 +697,14 @@ static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
>>                                         struct radeon_winsys_cs_handle *_buf,
>>                                         enum radeon_bo_usage usage)
>>  {
>> -    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
>>      struct radeon_bo *bo = (struct radeon_bo*)_buf;
>> -    int index;
>>
>>      if (!bo->num_cs_references)
>>          return FALSE;
>>
>> -    index = radeon_get_reloc(cs->csc, bo);
>> -    if (index == -1)
>> -        return FALSE;
>> -
>> -    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
>> +    if ((usage & RADEON_USAGE_WRITE) && bo->write_pending)
>>          return TRUE;
>> -    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
>> +    if ((usage & RADEON_USAGE_READ) && bo->read_pending)
>>          return TRUE;
>>
>>      return FALSE;
>> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
>> index 286eb6a..e632ad6 100644
>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
>> @@ -55,28 +55,33 @@ struct radeon_cs_context {
>>  };
>>
>>  struct radeon_drm_cs {
>> -    struct radeon_winsys_cs base;
>> +    struct radeon_winsys_cs     base;
>>
>>      /* We flip between these two CS. While one is being consumed
>>       * by the kernel in another thread, the other one is being filled
>>       * by the pipe driver. */
>> -    struct radeon_cs_context csc1;
>> -    struct radeon_cs_context csc2;
>> +    struct radeon_cs_context    csc1;
>> +    struct radeon_cs_context    csc2;
>> +    struct radeon_cs_context    dma1;
>> +    struct radeon_cs_context    dma2;
>>      /* The currently-used CS. */
>> -    struct radeon_cs_context *csc;
>> +    struct radeon_cs_context    *csc;
>> +    /* The currently-used DMA. */
>> +    struct radeon_cs_context    *dmac;
>>      /* The CS being currently-owned by the other thread. */
>> -    struct radeon_cs_context *cst;
>> +    struct radeon_cs_context    *cst;
>> +    struct radeon_cs_context    *dmat;
>>
>>      /* The winsys. */
>> -    struct radeon_drm_winsys *ws;
>> +    struct radeon_drm_winsys    *ws;
>>
>>      /* Flush CS. */
>>      void (*flush_cs)(void *ctx, unsigned flags);
>>      void *flush_data;
>>
>> -    pipe_thread thread;
>> -    int flush_started, kill_thread;
>> -    pipe_semaphore flush_queued, flush_completed;
>> +    pipe_thread                 thread;
>> +    int                         flush_started, kill_thread;
>> +    pipe_semaphore              flush_queued, flush_completed;
>>  };
>>
>>  int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo);
>> @@ -91,25 +96,16 @@ static INLINE boolean
>>  radeon_bo_is_referenced_by_cs(struct radeon_drm_cs *cs,
>>                                struct radeon_bo *bo)
>>  {
>> -    int num_refs = bo->num_cs_references;
>> -    return num_refs == bo->rws->num_cs ||
>> -           (num_refs && radeon_get_reloc(cs->csc, bo) != -1);
>> +    return !!bo->num_cs_references;
>>  }
>>
>>  static INLINE boolean
>>  radeon_bo_is_referenced_by_cs_for_write(struct radeon_drm_cs *cs,
>>                                          struct radeon_bo *bo)
>>  {
>> -    int index;
>> -
>> -    if (!bo->num_cs_references)
>> -        return FALSE;
>> -
>> -    index = radeon_get_reloc(cs->csc, bo);
>> -    if (index == -1)
>> -        return FALSE;
>> -
>> -    return cs->csc->relocs[index].write_domain != 0;
>> +    if (bo->num_cs_references && bo->write_pending)
>> +        return TRUE;
>> +    return FALSE;
>>  }
>>
>>  static INLINE boolean
>> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
>> index bcfb448..a08eddd 100644
>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
>> @@ -319,6 +319,12 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
>>          break;
>>      }
>>
>> +    /* Check for dma */
>> +    ws->info.has_dma = FALSE;
>> +    if (ws->info.chip_class >= R700 && ws->info.drm_minor >= 27) {
>> +        ws->info.has_dma = TRUE;
>> +    }
>> +
>>      /* Get GEM info. */
>>      retval = drmCommandWriteRead(ws->fd, DRM_RADEON_GEM_INFO,
>>              &gem_info, sizeof(gem_info));
>> diff --git a/src/gallium/winsys/radeon/drm/radeon_winsys.h b/src/gallium/winsys/radeon/drm/radeon_winsys.h
>> index 16536dc..5ff463e 100644
>> --- a/src/gallium/winsys/radeon/drm/radeon_winsys.h
>> +++ b/src/gallium/winsys/radeon/drm/radeon_winsys.h
>> @@ -43,11 +43,13 @@
>>  #include "pipebuffer/pb_buffer.h"
>>  #include "libdrm/radeon_surface.h"
>>
>> -#define RADEON_MAX_CMDBUF_DWORDS (16 * 1024)
>> +#define RADEON_MAX_CMDBUF_DWORDS        (16 * 1024)
>>
>> -#define RADEON_FLUSH_ASYNC             (1 << 0)
>> -#define RADEON_FLUSH_KEEP_TILING_FLAGS (1 << 1) /* needs DRM 2.12.0 */
>> -#define RADEON_FLUSH_COMPUTE           (1 << 2)
>> +#define RADEON_FLUSH_ASYNC              (1 << 0)
>> +#define RADEON_FLUSH_KEEP_TILING_FLAGS  (1 << 1) /* needs DRM 2.12.0 */
>> +#define RADEON_FLUSH_COMPUTE            (1 << 2)
>> +#define RADEON_FLUSH_DMA                (1 << 3)
>> +#define RADEON_FLUSH_GFX                (1 << 4)
>>
>>  /* Tiling flags. */
>>  enum radeon_bo_layout {
>> @@ -137,12 +139,19 @@ enum chip_class {
>>      TAHITI,
>>  };
>>
>> +enum radeon_ring_type {
>> +    RADEON_RING_PM4 = 0,
>> +    RADEON_RING_DMA = 1,
>> +};
>> +
>>  struct winsys_handle;
>>  struct radeon_winsys_cs_handle;
>>
>>  struct radeon_winsys_cs {
>> -    unsigned cdw;  /* Number of used dwords. */
>> -    uint32_t *buf; /* The command buffer. */
>> +    unsigned                    cdw;      /* Number of used dwords. */
>> +    uint32_t                    *buf;     /* The command buffer. */
>> +    unsigned                    dma_cdw;  /* Number of used dwords. */
>> +    uint32_t                    *dma_buf; /* The command buffer. */
>>  };
>>
>>  struct radeon_info {
>> @@ -169,6 +178,7 @@ struct radeon_info {
>>      uint32_t                    r600_max_pipes;
>>      boolean                     r600_backend_map_valid;
>>      boolean                     r600_virtual_address;
>> +    boolean                     has_dma;
>>  };
>>
>>  enum radeon_feature_id {
>> @@ -222,8 +232,7 @@ struct radeon_winsys {
>>                                         boolean use_reusable_pool,
>>                                         enum radeon_bo_domain domain);
>>
>> -    struct radeon_winsys_cs_handle *(*buffer_get_cs_handle)(
>> -            struct pb_buffer *buf);
>> +    struct radeon_winsys_cs_handle *(*buffer_get_cs_handle)(struct pb_buffer *buf);
>>
>>      /**
>>       * Map the entire data store of a buffer object into the client's address
>> @@ -371,7 +380,8 @@ struct radeon_winsys {
>>      unsigned (*cs_add_reloc)(struct radeon_winsys_cs *cs,
>>                               struct radeon_winsys_cs_handle *buf,
>>                               enum radeon_bo_usage usage,
>> -                             enum radeon_bo_domain domain);
>> +                             enum radeon_bo_domain domain,
>> +                             enum radeon_ring_type type);
>>
>>      /**
>>       * Return TRUE if there is enough memory in VRAM and GTT for the relocs
>> --
>> 1.7.11.7
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev