[Mesa-dev] [PATCH 3/4] radeon: accelerate transfer_inline_write

Wed Aug 6 06:24:13 PDT 2014

On Wed, Aug 6, 2014 at 2:44 PM, Christian König <deathsimple at vodafone.de> wrote:
> Am 06.08.2014 um 13:45 schrieb Marek Olšák:
>
>> On Tue, Aug 5, 2014 at 7:31 PM, Christian König <deathsimple at vodafone.de>
>> wrote:
>>>
>>> From: Christian König <christian.koenig at amd.com>
>>>
>>> Not completely implemented, cause we need DMA copy support for every hw
>>> generation.
>>>
>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>> ---
>>>   src/gallium/drivers/radeon/r600_buffer_common.c |   2 +-
>>>   src/gallium/drivers/radeon/r600_pipe_common.c   |   2 +-
>>>   src/gallium/drivers/radeon/r600_texture.c       | 104
>>> ++++++++++++++++++++++--
>>>   3 files changed, 100 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c
>>> b/src/gallium/drivers/radeon/r600_buffer_common.c
>>> index d747cbc..28ab30c 100644
>>> --- a/src/gallium/drivers/radeon/r600_buffer_common.c
>>> +++ b/src/gallium/drivers/radeon/r600_buffer_common.c
>>> @@ -372,7 +372,7 @@ static const struct u_resource_vtbl r600_buffer_vtbl
>>> =
>>>          r600_buffer_transfer_map,       /* transfer_map */
>>>          NULL,                           /* transfer_flush_region */
>>>          r600_buffer_transfer_unmap,     /* transfer_unmap */
>>> -       NULL                            /* transfer_inline_write */
>>> +       u_default_transfer_inline_write /* transfer_inline_write */
>>>   };
>>>
>>>   struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
>>> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c
>>> b/src/gallium/drivers/radeon/r600_pipe_common.c
>>> index 3476021..69d344e 100644
>>> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
>>> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
>>> @@ -153,7 +153,7 @@ bool r600_common_context_init(struct
>>> r600_common_context *rctx,
>>>          rctx->b.transfer_map = u_transfer_map_vtbl;
>>>          rctx->b.transfer_flush_region = u_default_transfer_flush_region;
>>>          rctx->b.transfer_unmap = u_transfer_unmap_vtbl;
>>> -       rctx->b.transfer_inline_write = u_default_transfer_inline_write;
>>> +       rctx->b.transfer_inline_write = u_transfer_inline_write_vtbl;
>>>           rctx->b.memory_barrier = r600_memory_barrier;
>>>          rctx->b.flush = r600_flush_from_st;
>>>
>>> diff --git a/src/gallium/drivers/radeon/r600_texture.c
>>> b/src/gallium/drivers/radeon/r600_texture.c
>>> index 482bbff..89b3b55 100644
>>> --- a/src/gallium/drivers/radeon/r600_texture.c
>>> +++ b/src/gallium/drivers/radeon/r600_texture.c
>>> @@ -849,6 +849,47 @@ static struct pipe_resource
>>> *r600_texture_from_handle(struct pipe_screen *screen
>>>
>>> stride, buf, &surface);
>>>   }
>>>
>>> +static struct r600_texture *r600_texture_from_ptr(struct pipe_screen
>>> *screen,
>>> +                                                 const struct
>>> pipe_resource *templ,
>>> +                                                 void *pointer, unsigned
>>> stride)
>>> +{
>>> +       struct r600_common_screen *rscreen = (struct
>>> r600_common_screen*)screen;
>>> +       struct radeon_surface surface = {};
>>> +       struct r600_texture *tex;
>>> +       unsigned offset, size;
>>> +       struct pb_buffer *buf;
>>> +       int r;
>>> +
>>> +       /* Support only 2D textures without mipmaps */
>>> +       if ((templ->target != PIPE_TEXTURE_2D && templ->target !=
>>> PIPE_TEXTURE_RECT) ||
>>> +             templ->depth0 != 1 || templ->last_level != 0)
>>> +               return NULL;
>>> +
>>> +       /* stride needs to be at least dw aligned */
>>> +       if (stride % 4)
>>> +               return NULL;
>>> +
>>> +       offset = ((uintptr_t)pointer) & 0xfff;
>>> +       pointer = (void *)(((uintptr_t)pointer) - offset);
>>> +       size = align(stride * templ->height0 + offset, 0x1000);
>>> +
>>> +       /* avoid the overhead for small copies */
>>> +       if (size < 64*1024)
>>> +               return NULL;
>>> +
>>> +       buf = rscreen->ws->buffer_from_ptr(rscreen->ws, pointer, size);
>>> +       if (!buf)
>>> +               return NULL;
>>> +
>>> +       r = r600_init_surface(rscreen, &surface, templ,
>>> RADEON_SURF_MODE_LINEAR_ALIGNED, false);
>>
>> I know you change it the next patch, but I think the alignment for
>> LINEAR (not ALIGNED) is 8 pixels, right? Of course, libdrm_radeon
>> should be reviewed if it doesn't over-align the stride. The safest
>> thing would be to check if stride == surface[0].pitch_in_bytes.
>
>
> Yeah, correct.
>
> The problem here is that even RADEON_SURF_MODE_LINEAR couldn't even handle
> all different alignments the application could come up with for the base
> pointer and stride. The only thing that can handle dword aligned or even
> byte aligned subwindow copies is the async DMA partial copy command and that
> is only available on NI+.
>
> Apart from that testing if libdrm_radeon really comes up with the correct
> stride is indeed a good idea.
>
>
>>
>>
>>> +       if (r)
>>> +               return NULL;
>>> +
>>> +       tex = r600_texture_create_object(screen, templ, stride, buf,
>>> &surface);
>>> +       tex->surface.level[0].offset += offset;
>>> +       return tex;
>>> +}
>>> +
>>>   bool r600_init_flushed_depth_texture(struct pipe_context *ctx,
>>>                                       struct pipe_resource *texture,
>>>                                       struct r600_texture **staging)
>>> @@ -1112,14 +1153,65 @@ static void r600_texture_transfer_unmap(struct
>>> pipe_context *ctx,
>>>          FREE(transfer);
>>>   }
>>>
>>> +static void r600_texture_transfer_inline_write(struct pipe_context *ctx,
>>> +                                              struct pipe_resource *dst,
>>> +                                              unsigned level, unsigned
>>> usage,
>>> +                                              const struct pipe_box
>>> *box,
>>> +                                              const void *data,
>>> +                                              unsigned stride,
>>> +                                              unsigned layer_stride)
>>> +{
>>> +       struct r600_common_context *rctx = (struct
>>> r600_common_context*)ctx;
>>> +       struct r600_texture *rsrc;
>>> +       struct pipe_resource *src, templ = {};
>>> +       struct pipe_box src_box = {};
>>> +
>>> +       templ.target = PIPE_TEXTURE_2D;
>>> +       templ.format = dst->format;
>>> +
>>> +       templ.width0 = box->width;
>>> +       templ.height0 = box->height;
>>> +       templ.depth0 = 1;
>>> +       templ.array_size = 1;
>>> +
>>> +       templ.usage = PIPE_USAGE_STAGING;
>>> +       templ.bind = PIPE_BIND_TRANSFER_READ;
>>> +
>>> +       rsrc = r600_texture_from_ptr(ctx->screen, &templ, (void *)data,
>>> stride);
>>> +       src = (struct pipe_resource *)rsrc;
>>> +       if (!src) {
>>
>> As an optimization, you can test if the dst texture is referenced or
>> busy and if yes, use u_default_transfer_inline_write. If there is
>> non-trivial rendering going on, the buffer_wait call below will cost a
>> lot.
>
>
> Wouldn't u_default_transfer_inline_write block for the destination buffer to
> be idle as well? I would rather say that I need to only flush the context if
> there are draws queued up for the destination buffer.
>
> And to reduce the cost of the wait for the DMA from source to destination
> submitting it using a separate context sounds like the best approach to me.

transfer_inline_write never waits. Never. transfer_inline_write always
implies flags WRITE|DISCARD_RANGE even if you don't supply them.

The r600_texture.c upload code is even stricter and assumes that if
you don't supply PIPE_TRANSFER_READ, it automatically treats the
operation as having DISCARD_RANGE. That means there is no
synchronization for any texture upload.

DISCARD_RANGE means a staging texture is created, the data is uploaded
to it, and then it's blitted to the destination. The destination
doesn't need be synchronized with the CPU, because it only receives
blits from the GPU.

Marek