[Mesa-dev] [PATCH 3/3] radeon/winsys: add async dma infrastructure

Jerome Glisse j.glisse at gmail.com
Fri Jan 4 16:38:27 PST 2013


On Fri, Jan 4, 2013 at 6:33 PM, Alex Deucher <alexdeucher at gmail.com> wrote:
> On Fri, Jan 4, 2013 at 5:19 PM,  <j.glisse at gmail.com> wrote:
>> From: Jerome Glisse <jglisse at redhat.com>
>>
>> The design is to take advantage of the fact that kernel will emit
>> semaphore when buffer is referenced by different ring. So the only
>> thing we need to enforce synchronization btw dma and gfx/compute
>> ring is to make sure that we never reference same bo at the same
>> time on the dma and gfx ring.
>>
>> This is achieved by tracking relocation, when we add a relocation
>> to the dma ring for a bo we check first if the bo has an active
>> relocation on the gfx ring. If it's the case we flush the gfx ring.
>> We do the same when adding a bo to the gfx ring we check it does
>> not have a relocation on the dma ring if it has one we flush the
>> dma ring.
>>
>> This patch also simplify the helper query function to know if a bo
>> has pending write/read command.
>
> Looks good.  A couple of minor comments below.  BTW, any performance gains?
>

No, there isn't much benchmark that will trigger a lot of buffer copy
AFAICT. Here is a WIP patch for texture copy :
http://people.freedesktop.org/~glisse/0001-r600g-r7xx-use-async-dma-for-resource-copy.patch

Kernel mostly reject the command stream so far i need to check what's going on.

Cheers,
Jerome

> Alex
>
>>
>> Signed-off-by: Jerome Glisse <jglisse at redhat.com>
>> ---
>>  src/gallium/drivers/r300/r300_emit.c               |  21 +-
>>  src/gallium/drivers/r300/r300_flush.c              |   7 +-
>>  src/gallium/drivers/r600/evergreen_hw_context.c    |  39 +++
>>  src/gallium/drivers/r600/evergreend.h              |  16 ++
>>  src/gallium/drivers/r600/r600.h                    |  13 +
>>  src/gallium/drivers/r600/r600_blit.c               |  94 +++++--
>>  src/gallium/drivers/r600/r600_hw_context.c         |  44 +++-
>>  src/gallium/drivers/r600/r600_pipe.c               |  13 +-
>>  src/gallium/drivers/r600/r600_pipe.h               |   2 +-
>>  src/gallium/drivers/r600/r600_texture.c            |   2 +-
>>  src/gallium/drivers/r600/r600d.h                   |  16 ++
>>  src/gallium/drivers/radeonsi/r600_hw_context.c     |   2 +-
>>  .../drivers/radeonsi/r600_hw_context_priv.h        |   2 +-
>>  src/gallium/drivers/radeonsi/r600_texture.c        |   2 +-
>>  src/gallium/drivers/radeonsi/radeonsi_pipe.c       |  13 +-
>>  src/gallium/winsys/radeon/drm/radeon_drm_bo.c      |  10 +-
>>  src/gallium/winsys/radeon/drm/radeon_drm_bo.h      |   2 +
>>  src/gallium/winsys/radeon/drm/radeon_drm_cs.c      | 270 +++++++++++++++++----
>>  src/gallium/winsys/radeon/drm/radeon_drm_cs.h      |  40 ++-
>>  src/gallium/winsys/radeon/drm/radeon_drm_winsys.c  |   6 +
>>  src/gallium/winsys/radeon/drm/radeon_winsys.h      |  28 ++-
>>  21 files changed, 509 insertions(+), 133 deletions(-)
>>
>> diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
>> index d1ed4b3..c824821 100644
>> --- a/src/gallium/drivers/r300/r300_emit.c
>> +++ b/src/gallium/drivers/r300/r300_emit.c
>> @@ -1184,7 +1184,8 @@ validate:
>>              assert(tex && tex->buf && "cbuf is marked, but NULL!");
>>              r300->rws->cs_add_reloc(r300->cs, tex->cs_buf,
>>                                      RADEON_USAGE_READWRITE,
>> -                                    r300_surface(fb->cbufs[i])->domain);
>> +                                    r300_surface(fb->cbufs[i])->domain,
>> +                                    RADEON_RING_DMA);
>>          }
>>          /* ...depth buffer... */
>>          if (fb->zsbuf) {
>> @@ -1192,7 +1193,8 @@ validate:
>>              assert(tex && tex->buf && "zsbuf is marked, but NULL!");
>>              r300->rws->cs_add_reloc(r300->cs, tex->cs_buf,
>>                                      RADEON_USAGE_READWRITE,
>> -                                    r300_surface(fb->zsbuf)->domain);
>> +                                    r300_surface(fb->zsbuf)->domain,
>> +                                    RADEON_RING_DMA);
>>          }
>>      }
>>      if (r300->textures_state.dirty) {
>> @@ -1204,18 +1206,21 @@ validate:
>>
>>              tex = r300_resource(texstate->sampler_views[i]->base.texture);
>>              r300->rws->cs_add_reloc(r300->cs, tex->cs_buf, RADEON_USAGE_READ,
>> -                                    tex->domain);
>> +                                    tex->domain,
>> +                                    RADEON_RING_DMA);
>>          }
>>      }
>>      /* ...occlusion query buffer... */
>>      if (r300->query_current)
>>          r300->rws->cs_add_reloc(r300->cs, r300->query_current->cs_buf,
>> -                                RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
>> +                                RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT,
>> +                                RADEON_RING_DMA);
>>      /* ...vertex buffer for SWTCL path... */
>>      if (r300->vbo)
>>          r300->rws->cs_add_reloc(r300->cs, r300_resource(r300->vbo)->cs_buf,
>>                                  RADEON_USAGE_READ,
>> -                                r300_resource(r300->vbo)->domain);
>> +                                r300_resource(r300->vbo)->domain,
>> +                                RADEON_RING_DMA);
>>      /* ...vertex buffers for HWTCL path... */
>>      if (do_validate_vertex_buffers && r300->vertex_arrays_dirty) {
>>          struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
>> @@ -1230,14 +1235,16 @@ validate:
>>
>>              r300->rws->cs_add_reloc(r300->cs, r300_resource(buf)->cs_buf,
>>                                      RADEON_USAGE_READ,
>> -                                    r300_resource(buf)->domain);
>> +                                    r300_resource(buf)->domain,
>> +                                    RADEON_RING_DMA);
>>          }
>>      }
>>      /* ...and index buffer for HWTCL path. */
>>      if (index_buffer)
>>          r300->rws->cs_add_reloc(r300->cs, r300_resource(index_buffer)->cs_buf,
>>                                  RADEON_USAGE_READ,
>> -                                r300_resource(index_buffer)->domain);
>> +                                r300_resource(index_buffer)->domain,
>> +                                RADEON_RING_DMA);
>>
>>      /* Now do the validation (flush is called inside cs_validate on failure). */
>>      if (!r300->rws->cs_validate(r300->cs)) {
>> diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
>> index 2170c59..ff3dbe9 100644
>> --- a/src/gallium/drivers/r300/r300_flush.c
>> +++ b/src/gallium/drivers/r300/r300_flush.c
>> @@ -70,8 +70,10 @@ void r300_flush(struct pipe_context *pipe,
>>      struct r300_context *r300 = r300_context(pipe);
>>      struct pb_buffer **rfence = (struct pb_buffer**)fence;
>>
>> +    /* r3xx-r5xx only have gfx ring */
>> +    flags |= RADEON_FLUSH_GFX;
>>      if (r300->draw && !r300->draw_vbo_locked)
>> -       r300_draw_flush_vbuf(r300);
>> +        r300_draw_flush_vbuf(r300);
>>
>>      if (r300->screen->info.drm_minor >= 12) {
>>          flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
>> @@ -84,7 +86,8 @@ void r300_flush(struct pipe_context *pipe,
>>          /* Add the fence as a dummy relocation. */
>>          r300->rws->cs_add_reloc(r300->cs,
>>                                  r300->rws->buffer_get_cs_handle(*rfence),
>> -                                RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT);
>> +                                RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
>> +                                RADEON_RING_DMA);
>>      }
>>
>>      if (r300->dirty_hw) {
>> diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
>> index 0ca7f9e..302f652 100644
>> --- a/src/gallium/drivers/r600/evergreen_hw_context.c
>> +++ b/src/gallium/drivers/r600/evergreen_hw_context.c
>> @@ -238,3 +238,42 @@ void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_en
>>                 r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, S_028B94_STREAMOUT_0_EN(0));
>>         }
>>  }
>> +
>> +void evergreen_dma_copy(struct r600_context *rctx,
>> +               struct pipe_resource *dst,
>> +               struct pipe_resource *src,
>> +               unsigned long dst_offset,
>> +               unsigned long src_offset,
>> +               unsigned long size)
>> +{
>> +       struct radeon_winsys_cs *cs = rctx->cs;
>> +       unsigned i, ncopy, csize, command, shift;
>> +       struct r600_resource *rdst = (struct r600_resource*)dst;
>> +       struct r600_resource *rsrc = (struct r600_resource*)src;
>> +
>> +       /* see if we use dword or byte copy */
>> +       if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) {
>> +               size >>= 2;
>> +               command = 0x00 << 20;
>> +               shift = 2;
>> +       } else {
>> +               command = 0x40 << 20;
>> +               shift = 0;
>> +       }
>> +       ncopy = (size / 0xfffff) + !!(size % 0xfffff);
>> +
>> +       r600_need_dma_space(rctx, ncopy * 5);
>> +       for (i = 0; i < ncopy; i++) {
>> +               csize = size < 0xfffff ? size : 0xfffff;
>> +               cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize) | command;
>
>
> I'd perfer to clean up the DMA_PACKET() macro on evergreen since the t
> and s bits don't really make sense on evergreen.  Just make the macro
> take 3 args and treat bits 27:20 as a sub-command field.  something
> like:

Ok with that

>> +#define DMA_PACKET(cmd, sub_cmd, n)    ((((cmd) & 0xF) << 28) |    \
>> +                                    (((sub_cmd) & 0xFF) << 20) |       \
>> +                                    (((n) & 0xFFFFF) << 0))
>
> Looks a little cleaner in my opinion.

Yes

>> +               cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xffffffff;
>> +               cs->dma_buf[cs->dma_cdw++] = src_offset & 0xffffffff;
>> +               cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff;
>> +               cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff;
>> +               rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA);
>> +               rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA);
>> +               dst_offset += csize << shift;
>> +               src_offset += csize << shift;
>> +               size -= csize;
>> +       }
>> +}
>> diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
>> index d9dba95..d8c50b6 100644
>> --- a/src/gallium/drivers/r600/evergreend.h
>> +++ b/src/gallium/drivers/r600/evergreend.h
>> @@ -2317,4 +2317,20 @@
>>  #define   G_028AA8_SWITCH_ON_EOP(x)                    (((x) >> 17) & 0x1)
>>  #define   C_028AA8_SWITCH_ON_EOP                       0xFFFDFFFF
>>
>> +/* async DMA packets */
>> +#define DMA_PACKET(cmd, t, s, n)    ((((cmd) & 0xF) << 28) |    \
>> +                                    (((t) & 0x1) << 23) |       \
>> +                                    (((s) & 0x1) << 22) |       \
>> +                                    (((n) & 0xFFFFF) << 0))
>> +/* async DMA Packet types */
>> +#define    DMA_PACKET_WRITE                     0x2
>> +#define    DMA_PACKET_COPY                      0x3
>> +#define    DMA_PACKET_INDIRECT_BUFFER           0x4
>> +#define    DMA_PACKET_SEMAPHORE                 0x5
>> +#define    DMA_PACKET_FENCE                     0x6
>> +#define    DMA_PACKET_TRAP                      0x7
>> +#define    DMA_PACKET_SRBM_WRITE                0x9
>> +#define    DMA_PACKET_CONSTANT_FILL             0xd
>> +#define    DMA_PACKET_NOP                       0xf
>> +
>>  #endif
>
> <snip>
>
>> +
>> +void r600_dma_copy(struct r600_context *rctx,
>> +               struct pipe_resource *dst,
>> +               struct pipe_resource *src,
>> +               unsigned long dst_offset,
>> +               unsigned long src_offset,
>> +               unsigned long size)
>> +{
>
> Maybe rename this r700_dma_copy() since r600 is flakey and seems to
> actually take a slightly different copy packet format?
>

Will do

>> +       struct radeon_winsys_cs *cs = rctx->cs;
>> +       unsigned i, ncopy, csize, shift;
>> +       struct r600_resource *rdst = (struct r600_resource*)dst;
>> +       struct r600_resource *rsrc = (struct r600_resource*)src;
>> +
>> +       size >>= 2;
>> +       shift = 2;
>> +       ncopy = (size / 0xffff) + !!(size % 0xffff);
>> +
>> +       r600_need_dma_space(rctx, ncopy * 5);
>> +       for (i = 0; i < ncopy; i++) {
>> +               csize = size < 0xffff ? size : 0xffff;
>> +               cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize);
>> +               cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xfffffffc;
>> +               cs->dma_buf[cs->dma_cdw++] = src_offset & 0xfffffffc;
>> +               cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff;
>> +               cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff;
>> +               rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA);
>> +               rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA);
>> +               dst_offset += csize << shift;
>> +               src_offset += csize << shift;
>> +               size -= csize;
>> +       }
>> +}


More information about the mesa-dev mailing list