[Mesa-dev] [PATCH 3/3] radeon/winsys: add async dma infrastructure
Alex Deucher
alexdeucher at gmail.com
Fri Jan 4 15:33:29 PST 2013
On Fri, Jan 4, 2013 at 5:19 PM, <j.glisse at gmail.com> wrote:
> From: Jerome Glisse <jglisse at redhat.com>
>
> The design is to take advantage of the fact that kernel will emit
> semaphore when buffer is referenced by different ring. So the only
> thing we need to enforce synchronization btw dma and gfx/compute
> ring is to make sure that we never reference same bo at the same
> time on the dma and gfx ring.
>
> This is achieved by tracking relocation, when we add a relocation
> to the dma ring for a bo we check first if the bo has an active
> relocation on the gfx ring. If it's the case we flush the gfx ring.
> We do the same when adding a bo to the gfx ring we check it does
> not have a relocation on the dma ring if it has one we flush the
> dma ring.
>
> This patch also simplify the helper query function to know if a bo
> has pending write/read command.
Looks good. A couple of minor comments below. BTW, any performance gains?
Alex
>
> Signed-off-by: Jerome Glisse <jglisse at redhat.com>
> ---
> src/gallium/drivers/r300/r300_emit.c | 21 +-
> src/gallium/drivers/r300/r300_flush.c | 7 +-
> src/gallium/drivers/r600/evergreen_hw_context.c | 39 +++
> src/gallium/drivers/r600/evergreend.h | 16 ++
> src/gallium/drivers/r600/r600.h | 13 +
> src/gallium/drivers/r600/r600_blit.c | 94 +++++--
> src/gallium/drivers/r600/r600_hw_context.c | 44 +++-
> src/gallium/drivers/r600/r600_pipe.c | 13 +-
> src/gallium/drivers/r600/r600_pipe.h | 2 +-
> src/gallium/drivers/r600/r600_texture.c | 2 +-
> src/gallium/drivers/r600/r600d.h | 16 ++
> src/gallium/drivers/radeonsi/r600_hw_context.c | 2 +-
> .../drivers/radeonsi/r600_hw_context_priv.h | 2 +-
> src/gallium/drivers/radeonsi/r600_texture.c | 2 +-
> src/gallium/drivers/radeonsi/radeonsi_pipe.c | 13 +-
> src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 10 +-
> src/gallium/winsys/radeon/drm/radeon_drm_bo.h | 2 +
> src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 270 +++++++++++++++++----
> src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 40 ++-
> src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 6 +
> src/gallium/winsys/radeon/drm/radeon_winsys.h | 28 ++-
> 21 files changed, 509 insertions(+), 133 deletions(-)
>
> diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
> index d1ed4b3..c824821 100644
> --- a/src/gallium/drivers/r300/r300_emit.c
> +++ b/src/gallium/drivers/r300/r300_emit.c
> @@ -1184,7 +1184,8 @@ validate:
> assert(tex && tex->buf && "cbuf is marked, but NULL!");
> r300->rws->cs_add_reloc(r300->cs, tex->cs_buf,
> RADEON_USAGE_READWRITE,
> - r300_surface(fb->cbufs[i])->domain);
> + r300_surface(fb->cbufs[i])->domain,
> + RADEON_RING_DMA);
> }
> /* ...depth buffer... */
> if (fb->zsbuf) {
> @@ -1192,7 +1193,8 @@ validate:
> assert(tex && tex->buf && "zsbuf is marked, but NULL!");
> r300->rws->cs_add_reloc(r300->cs, tex->cs_buf,
> RADEON_USAGE_READWRITE,
> - r300_surface(fb->zsbuf)->domain);
> + r300_surface(fb->zsbuf)->domain,
> + RADEON_RING_DMA);
> }
> }
> if (r300->textures_state.dirty) {
> @@ -1204,18 +1206,21 @@ validate:
>
> tex = r300_resource(texstate->sampler_views[i]->base.texture);
> r300->rws->cs_add_reloc(r300->cs, tex->cs_buf, RADEON_USAGE_READ,
> - tex->domain);
> + tex->domain,
> + RADEON_RING_DMA);
> }
> }
> /* ...occlusion query buffer... */
> if (r300->query_current)
> r300->rws->cs_add_reloc(r300->cs, r300->query_current->cs_buf,
> - RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
> + RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT,
> + RADEON_RING_DMA);
> /* ...vertex buffer for SWTCL path... */
> if (r300->vbo)
> r300->rws->cs_add_reloc(r300->cs, r300_resource(r300->vbo)->cs_buf,
> RADEON_USAGE_READ,
> - r300_resource(r300->vbo)->domain);
> + r300_resource(r300->vbo)->domain,
> + RADEON_RING_DMA);
> /* ...vertex buffers for HWTCL path... */
> if (do_validate_vertex_buffers && r300->vertex_arrays_dirty) {
> struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
> @@ -1230,14 +1235,16 @@ validate:
>
> r300->rws->cs_add_reloc(r300->cs, r300_resource(buf)->cs_buf,
> RADEON_USAGE_READ,
> - r300_resource(buf)->domain);
> + r300_resource(buf)->domain,
> + RADEON_RING_DMA);
> }
> }
> /* ...and index buffer for HWTCL path. */
> if (index_buffer)
> r300->rws->cs_add_reloc(r300->cs, r300_resource(index_buffer)->cs_buf,
> RADEON_USAGE_READ,
> - r300_resource(index_buffer)->domain);
> + r300_resource(index_buffer)->domain,
> + RADEON_RING_DMA);
>
> /* Now do the validation (flush is called inside cs_validate on failure). */
> if (!r300->rws->cs_validate(r300->cs)) {
> diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
> index 2170c59..ff3dbe9 100644
> --- a/src/gallium/drivers/r300/r300_flush.c
> +++ b/src/gallium/drivers/r300/r300_flush.c
> @@ -70,8 +70,10 @@ void r300_flush(struct pipe_context *pipe,
> struct r300_context *r300 = r300_context(pipe);
> struct pb_buffer **rfence = (struct pb_buffer**)fence;
>
> + /* r3xx-r5xx only have gfx ring */
> + flags |= RADEON_FLUSH_GFX;
> if (r300->draw && !r300->draw_vbo_locked)
> - r300_draw_flush_vbuf(r300);
> + r300_draw_flush_vbuf(r300);
>
> if (r300->screen->info.drm_minor >= 12) {
> flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
> @@ -84,7 +86,8 @@ void r300_flush(struct pipe_context *pipe,
> /* Add the fence as a dummy relocation. */
> r300->rws->cs_add_reloc(r300->cs,
> r300->rws->buffer_get_cs_handle(*rfence),
> - RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT);
> + RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
> + RADEON_RING_DMA);
> }
>
> if (r300->dirty_hw) {
> diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
> index 0ca7f9e..302f652 100644
> --- a/src/gallium/drivers/r600/evergreen_hw_context.c
> +++ b/src/gallium/drivers/r600/evergreen_hw_context.c
> @@ -238,3 +238,42 @@ void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_en
> r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, S_028B94_STREAMOUT_0_EN(0));
> }
> }
> +
> +void evergreen_dma_copy(struct r600_context *rctx,
> + struct pipe_resource *dst,
> + struct pipe_resource *src,
> + unsigned long dst_offset,
> + unsigned long src_offset,
> + unsigned long size)
> +{
> + struct radeon_winsys_cs *cs = rctx->cs;
> + unsigned i, ncopy, csize, command, shift;
> + struct r600_resource *rdst = (struct r600_resource*)dst;
> + struct r600_resource *rsrc = (struct r600_resource*)src;
> +
> + /* see if we use dword or byte copy */
> + if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) {
> + size >>= 2;
> + command = 0x00 << 20;
> + shift = 2;
> + } else {
> + command = 0x40 << 20;
> + shift = 0;
> + }
> + ncopy = (size / 0xfffff) + !!(size % 0xfffff);
> +
> + r600_need_dma_space(rctx, ncopy * 5);
> + for (i = 0; i < ncopy; i++) {
> + csize = size < 0xfffff ? size : 0xfffff;
> + cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize) | command;
I'd perfer to clean up the DMA_PACKET() macro on evergreen since the t
and s bits don't really make sense on evergreen. Just make the macro
take 3 args and treat bits 27:20 as a sub-command field. something
like:
> +#define DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) | \
> + (((sub_cmd) & 0xFF) << 20) | \
> + (((n) & 0xFFFFF) << 0))
Looks a little cleaner in my opinion.
> + cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xffffffff;
> + cs->dma_buf[cs->dma_cdw++] = src_offset & 0xffffffff;
> + cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff;
> + cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff;
> + rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA);
> + rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA);
> + dst_offset += csize << shift;
> + src_offset += csize << shift;
> + size -= csize;
> + }
> +}
> diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
> index d9dba95..d8c50b6 100644
> --- a/src/gallium/drivers/r600/evergreend.h
> +++ b/src/gallium/drivers/r600/evergreend.h
> @@ -2317,4 +2317,20 @@
> #define G_028AA8_SWITCH_ON_EOP(x) (((x) >> 17) & 0x1)
> #define C_028AA8_SWITCH_ON_EOP 0xFFFDFFFF
>
> +/* async DMA packets */
> +#define DMA_PACKET(cmd, t, s, n) ((((cmd) & 0xF) << 28) | \
> + (((t) & 0x1) << 23) | \
> + (((s) & 0x1) << 22) | \
> + (((n) & 0xFFFFF) << 0))
> +/* async DMA Packet types */
> +#define DMA_PACKET_WRITE 0x2
> +#define DMA_PACKET_COPY 0x3
> +#define DMA_PACKET_INDIRECT_BUFFER 0x4
> +#define DMA_PACKET_SEMAPHORE 0x5
> +#define DMA_PACKET_FENCE 0x6
> +#define DMA_PACKET_TRAP 0x7
> +#define DMA_PACKET_SRBM_WRITE 0x9
> +#define DMA_PACKET_CONSTANT_FILL 0xd
> +#define DMA_PACKET_NOP 0xf
> +
> #endif
<snip>
> +
> +void r600_dma_copy(struct r600_context *rctx,
> + struct pipe_resource *dst,
> + struct pipe_resource *src,
> + unsigned long dst_offset,
> + unsigned long src_offset,
> + unsigned long size)
> +{
Maybe rename this r700_dma_copy() since r600 is flakey and seems to
actually take a slightly different copy packet format?
> + struct radeon_winsys_cs *cs = rctx->cs;
> + unsigned i, ncopy, csize, shift;
> + struct r600_resource *rdst = (struct r600_resource*)dst;
> + struct r600_resource *rsrc = (struct r600_resource*)src;
> +
> + size >>= 2;
> + shift = 2;
> + ncopy = (size / 0xffff) + !!(size % 0xffff);
> +
> + r600_need_dma_space(rctx, ncopy * 5);
> + for (i = 0; i < ncopy; i++) {
> + csize = size < 0xffff ? size : 0xffff;
> + cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize);
> + cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xfffffffc;
> + cs->dma_buf[cs->dma_cdw++] = src_offset & 0xfffffffc;
> + cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff;
> + cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff;
> + rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA);
> + rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA);
> + dst_offset += csize << shift;
> + src_offset += csize << shift;
> + size -= csize;
> + }
> +}
More information about the mesa-dev
mailing list