[Mesa-dev] [PATCH 3/3] radeon/winsys: add async dma infrastructure
Jerome Glisse
j.glisse at gmail.com
Fri Jan 4 16:38:27 PST 2013
On Fri, Jan 4, 2013 at 6:33 PM, Alex Deucher <alexdeucher at gmail.com> wrote:
> On Fri, Jan 4, 2013 at 5:19 PM, <j.glisse at gmail.com> wrote:
>> From: Jerome Glisse <jglisse at redhat.com>
>>
>> The design is to take advantage of the fact that kernel will emit
>> semaphore when buffer is referenced by different ring. So the only
>> thing we need to enforce synchronization btw dma and gfx/compute
>> ring is to make sure that we never reference same bo at the same
>> time on the dma and gfx ring.
>>
>> This is achieved by tracking relocation, when we add a relocation
>> to the dma ring for a bo we check first if the bo has an active
>> relocation on the gfx ring. If it's the case we flush the gfx ring.
>> We do the same when adding a bo to the gfx ring we check it does
>> not have a relocation on the dma ring if it has one we flush the
>> dma ring.
>>
>> This patch also simplify the helper query function to know if a bo
>> has pending write/read command.
>
> Looks good. A couple of minor comments below. BTW, any performance gains?
>
No, there isn't much benchmark that will trigger a lot of buffer copy
AFAICT. Here is a WIP patch for texture copy :
http://people.freedesktop.org/~glisse/0001-r600g-r7xx-use-async-dma-for-resource-copy.patch
Kernel mostly reject the command stream so far i need to check what's going on.
Cheers,
Jerome
> Alex
>
>>
>> Signed-off-by: Jerome Glisse <jglisse at redhat.com>
>> ---
>> src/gallium/drivers/r300/r300_emit.c | 21 +-
>> src/gallium/drivers/r300/r300_flush.c | 7 +-
>> src/gallium/drivers/r600/evergreen_hw_context.c | 39 +++
>> src/gallium/drivers/r600/evergreend.h | 16 ++
>> src/gallium/drivers/r600/r600.h | 13 +
>> src/gallium/drivers/r600/r600_blit.c | 94 +++++--
>> src/gallium/drivers/r600/r600_hw_context.c | 44 +++-
>> src/gallium/drivers/r600/r600_pipe.c | 13 +-
>> src/gallium/drivers/r600/r600_pipe.h | 2 +-
>> src/gallium/drivers/r600/r600_texture.c | 2 +-
>> src/gallium/drivers/r600/r600d.h | 16 ++
>> src/gallium/drivers/radeonsi/r600_hw_context.c | 2 +-
>> .../drivers/radeonsi/r600_hw_context_priv.h | 2 +-
>> src/gallium/drivers/radeonsi/r600_texture.c | 2 +-
>> src/gallium/drivers/radeonsi/radeonsi_pipe.c | 13 +-
>> src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 10 +-
>> src/gallium/winsys/radeon/drm/radeon_drm_bo.h | 2 +
>> src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 270 +++++++++++++++++----
>> src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 40 ++-
>> src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 6 +
>> src/gallium/winsys/radeon/drm/radeon_winsys.h | 28 ++-
>> 21 files changed, 509 insertions(+), 133 deletions(-)
>>
>> diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
>> index d1ed4b3..c824821 100644
>> --- a/src/gallium/drivers/r300/r300_emit.c
>> +++ b/src/gallium/drivers/r300/r300_emit.c
>> @@ -1184,7 +1184,8 @@ validate:
>> assert(tex && tex->buf && "cbuf is marked, but NULL!");
>> r300->rws->cs_add_reloc(r300->cs, tex->cs_buf,
>> RADEON_USAGE_READWRITE,
>> - r300_surface(fb->cbufs[i])->domain);
>> + r300_surface(fb->cbufs[i])->domain,
>> + RADEON_RING_DMA);
>> }
>> /* ...depth buffer... */
>> if (fb->zsbuf) {
>> @@ -1192,7 +1193,8 @@ validate:
>> assert(tex && tex->buf && "zsbuf is marked, but NULL!");
>> r300->rws->cs_add_reloc(r300->cs, tex->cs_buf,
>> RADEON_USAGE_READWRITE,
>> - r300_surface(fb->zsbuf)->domain);
>> + r300_surface(fb->zsbuf)->domain,
>> + RADEON_RING_DMA);
>> }
>> }
>> if (r300->textures_state.dirty) {
>> @@ -1204,18 +1206,21 @@ validate:
>>
>> tex = r300_resource(texstate->sampler_views[i]->base.texture);
>> r300->rws->cs_add_reloc(r300->cs, tex->cs_buf, RADEON_USAGE_READ,
>> - tex->domain);
>> + tex->domain,
>> + RADEON_RING_DMA);
>> }
>> }
>> /* ...occlusion query buffer... */
>> if (r300->query_current)
>> r300->rws->cs_add_reloc(r300->cs, r300->query_current->cs_buf,
>> - RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
>> + RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT,
>> + RADEON_RING_DMA);
>> /* ...vertex buffer for SWTCL path... */
>> if (r300->vbo)
>> r300->rws->cs_add_reloc(r300->cs, r300_resource(r300->vbo)->cs_buf,
>> RADEON_USAGE_READ,
>> - r300_resource(r300->vbo)->domain);
>> + r300_resource(r300->vbo)->domain,
>> + RADEON_RING_DMA);
>> /* ...vertex buffers for HWTCL path... */
>> if (do_validate_vertex_buffers && r300->vertex_arrays_dirty) {
>> struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
>> @@ -1230,14 +1235,16 @@ validate:
>>
>> r300->rws->cs_add_reloc(r300->cs, r300_resource(buf)->cs_buf,
>> RADEON_USAGE_READ,
>> - r300_resource(buf)->domain);
>> + r300_resource(buf)->domain,
>> + RADEON_RING_DMA);
>> }
>> }
>> /* ...and index buffer for HWTCL path. */
>> if (index_buffer)
>> r300->rws->cs_add_reloc(r300->cs, r300_resource(index_buffer)->cs_buf,
>> RADEON_USAGE_READ,
>> - r300_resource(index_buffer)->domain);
>> + r300_resource(index_buffer)->domain,
>> + RADEON_RING_DMA);
>>
>> /* Now do the validation (flush is called inside cs_validate on failure). */
>> if (!r300->rws->cs_validate(r300->cs)) {
>> diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
>> index 2170c59..ff3dbe9 100644
>> --- a/src/gallium/drivers/r300/r300_flush.c
>> +++ b/src/gallium/drivers/r300/r300_flush.c
>> @@ -70,8 +70,10 @@ void r300_flush(struct pipe_context *pipe,
>> struct r300_context *r300 = r300_context(pipe);
>> struct pb_buffer **rfence = (struct pb_buffer**)fence;
>>
>> + /* r3xx-r5xx only have gfx ring */
>> + flags |= RADEON_FLUSH_GFX;
>> if (r300->draw && !r300->draw_vbo_locked)
>> - r300_draw_flush_vbuf(r300);
>> + r300_draw_flush_vbuf(r300);
>>
>> if (r300->screen->info.drm_minor >= 12) {
>> flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
>> @@ -84,7 +86,8 @@ void r300_flush(struct pipe_context *pipe,
>> /* Add the fence as a dummy relocation. */
>> r300->rws->cs_add_reloc(r300->cs,
>> r300->rws->buffer_get_cs_handle(*rfence),
>> - RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT);
>> + RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
>> + RADEON_RING_DMA);
>> }
>>
>> if (r300->dirty_hw) {
>> diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
>> index 0ca7f9e..302f652 100644
>> --- a/src/gallium/drivers/r600/evergreen_hw_context.c
>> +++ b/src/gallium/drivers/r600/evergreen_hw_context.c
>> @@ -238,3 +238,42 @@ void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_en
>> r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, S_028B94_STREAMOUT_0_EN(0));
>> }
>> }
>> +
>> +void evergreen_dma_copy(struct r600_context *rctx,
>> + struct pipe_resource *dst,
>> + struct pipe_resource *src,
>> + unsigned long dst_offset,
>> + unsigned long src_offset,
>> + unsigned long size)
>> +{
>> + struct radeon_winsys_cs *cs = rctx->cs;
>> + unsigned i, ncopy, csize, command, shift;
>> + struct r600_resource *rdst = (struct r600_resource*)dst;
>> + struct r600_resource *rsrc = (struct r600_resource*)src;
>> +
>> + /* see if we use dword or byte copy */
>> + if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) {
>> + size >>= 2;
>> + command = 0x00 << 20;
>> + shift = 2;
>> + } else {
>> + command = 0x40 << 20;
>> + shift = 0;
>> + }
>> + ncopy = (size / 0xfffff) + !!(size % 0xfffff);
>> +
>> + r600_need_dma_space(rctx, ncopy * 5);
>> + for (i = 0; i < ncopy; i++) {
>> + csize = size < 0xfffff ? size : 0xfffff;
>> + cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize) | command;
>
>
> I'd perfer to clean up the DMA_PACKET() macro on evergreen since the t
> and s bits don't really make sense on evergreen. Just make the macro
> take 3 args and treat bits 27:20 as a sub-command field. something
> like:
Ok with that
>> +#define DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) | \
>> + (((sub_cmd) & 0xFF) << 20) | \
>> + (((n) & 0xFFFFF) << 0))
>
> Looks a little cleaner in my opinion.
Yes
>> + cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xffffffff;
>> + cs->dma_buf[cs->dma_cdw++] = src_offset & 0xffffffff;
>> + cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff;
>> + cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff;
>> + rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA);
>> + rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA);
>> + dst_offset += csize << shift;
>> + src_offset += csize << shift;
>> + size -= csize;
>> + }
>> +}
>> diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
>> index d9dba95..d8c50b6 100644
>> --- a/src/gallium/drivers/r600/evergreend.h
>> +++ b/src/gallium/drivers/r600/evergreend.h
>> @@ -2317,4 +2317,20 @@
>> #define G_028AA8_SWITCH_ON_EOP(x) (((x) >> 17) & 0x1)
>> #define C_028AA8_SWITCH_ON_EOP 0xFFFDFFFF
>>
>> +/* async DMA packets */
>> +#define DMA_PACKET(cmd, t, s, n) ((((cmd) & 0xF) << 28) | \
>> + (((t) & 0x1) << 23) | \
>> + (((s) & 0x1) << 22) | \
>> + (((n) & 0xFFFFF) << 0))
>> +/* async DMA Packet types */
>> +#define DMA_PACKET_WRITE 0x2
>> +#define DMA_PACKET_COPY 0x3
>> +#define DMA_PACKET_INDIRECT_BUFFER 0x4
>> +#define DMA_PACKET_SEMAPHORE 0x5
>> +#define DMA_PACKET_FENCE 0x6
>> +#define DMA_PACKET_TRAP 0x7
>> +#define DMA_PACKET_SRBM_WRITE 0x9
>> +#define DMA_PACKET_CONSTANT_FILL 0xd
>> +#define DMA_PACKET_NOP 0xf
>> +
>> #endif
>
> <snip>
>
>> +
>> +void r600_dma_copy(struct r600_context *rctx,
>> + struct pipe_resource *dst,
>> + struct pipe_resource *src,
>> + unsigned long dst_offset,
>> + unsigned long src_offset,
>> + unsigned long size)
>> +{
>
> Maybe rename this r700_dma_copy() since r600 is flakey and seems to
> actually take a slightly different copy packet format?
>
Will do
>> + struct radeon_winsys_cs *cs = rctx->cs;
>> + unsigned i, ncopy, csize, shift;
>> + struct r600_resource *rdst = (struct r600_resource*)dst;
>> + struct r600_resource *rsrc = (struct r600_resource*)src;
>> +
>> + size >>= 2;
>> + shift = 2;
>> + ncopy = (size / 0xffff) + !!(size % 0xffff);
>> +
>> + r600_need_dma_space(rctx, ncopy * 5);
>> + for (i = 0; i < ncopy; i++) {
>> + csize = size < 0xffff ? size : 0xffff;
>> + cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize);
>> + cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xfffffffc;
>> + cs->dma_buf[cs->dma_cdw++] = src_offset & 0xfffffffc;
>> + cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff;
>> + cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff;
>> + rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA);
>> + rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA);
>> + dst_offset += csize << shift;
>> + src_offset += csize << shift;
>> + size -= csize;
>> + }
>> +}
More information about the mesa-dev
mailing list