[Mesa-dev] [PATCH] radeonsi: use compute for resource_copy_region when possible

Tue Jan 15 22:42:08 UTC 2019

On Tue, Jan 15, 2019 at 3:54 PM Axel Davy <davyaxel0 at gmail.com> wrote:

> On 15/01/2019 18:50, Marek Olšák wrote:
> >
> > +void si_compute_copy_image(struct si_context *sctx,
> > +                        struct pipe_resource *dst,
> > +                        unsigned dst_level,
> > +                        struct pipe_resource *src,
> > +                        unsigned src_level,
> > +                        unsigned dstx, unsigned dsty, unsigned dstz,
> > +                        const struct pipe_box *src_box)
> > +{
> > +     struct pipe_context *ctx = &sctx->b;
> > +     unsigned width = src_box->width;
> > +     unsigned height = src_box->height;
> > +     unsigned depth = src_box->depth;
> > +
> > +     unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx,
> dsty, dstz, 0};
> > +
> > +     if (width == 0 || height == 0)
> > +             return;
> > +
> > +     sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> > +                    si_get_flush_flags(sctx, SI_COHERENCY_SHADER,
> L2_STREAM);
> > +     si_make_CB_shader_coherent(sctx, dst->nr_samples, true);
> > +
> > +     struct pipe_constant_buffer saved_cb = {};
> > +     si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0,
> &saved_cb);
> > +
> > +     struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
> > +     struct pipe_image_view saved_image[2] = {0};
> > +     util_copy_image_view(&saved_image[0], &images->views[0]);
> > +     util_copy_image_view(&saved_image[1], &images->views[1]);
> > +
> > +     void *saved_cs = sctx->cs_shader_state.program;
> > +
> > +     struct pipe_constant_buffer cb = {};
> > +     cb.buffer_size = sizeof(data);
> > +     cb.user_buffer = data;
> > +     ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
> > +
> > +     struct pipe_image_view image[2] = {0};
> > +     image[0].resource = src;
> > +     image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
> > +     image[0].format = util_format_linear(src->format);
> > +     image[0].u.tex.level = src_level;
> > +     image[0].u.tex.first_layer = 0;
> > +     image[0].u.tex.last_layer =
> > +             src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0,
> src_level) - 1
> > +                                             :
> (unsigned)(src->array_size - 1);
> > +     image[1].resource = dst;
> > +     image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
> > +     image[1].format = util_format_linear(dst->format);
> > +     image[1].u.tex.level = dst_level;
> > +     image[1].u.tex.first_layer = 0;
> > +     image[1].u.tex.last_layer =
> > +             dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0,
> dst_level) - 1
> > +                                             :
> (unsigned)(dst->array_size - 1);
> > +
> > +     if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
> > +             image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;
> > +
> > +     /* SNORM8 blitting has precision issues on some chips. Use the SINT
> > +      * equivalent instead, which doesn't force DCC decompression.
> > +      * Note that some chips avoid this issue by using SDMA.
> > +      */
> > +     if (util_format_is_snorm8(dst->format)) {
> > +             image[0].format = image[1].format =
> > +                     util_format_snorm8_to_sint8(dst->format);
> > +     }
> > +
> > +     ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
> > +
> > +     struct pipe_grid_info info = {0};
> > +
> > +     if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target ==
> PIPE_TEXTURE_1D_ARRAY) {
> > +             if (!sctx->cs_copy_image_1d_array)
> > +                     sctx->cs_copy_image_1d_array =
> > +
>  si_create_copy_image_compute_shader_1d_array(ctx);
> > +             ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
> > +             info.block[0] = 64;
> > +             info.last_block[0] = width % 64;
> > +             info.block[1] = 1;
> > +             info.block[2] = 1;
> > +             info.grid[0] = DIV_ROUND_UP(width, 64);
> > +             info.grid[1] = depth;
> > +             info.grid[2] = 1;
> > +     } else {
> > +             if (!sctx->cs_copy_image)
> > +                     sctx->cs_copy_image =
> si_create_copy_image_compute_shader(ctx);
> > +             ctx->bind_compute_state(ctx, sctx->cs_copy_image);
> > +             info.block[0] = 8;
> > +             info.last_block[0] = width % 8;
> > +             info.block[1] = 8;
> > +             info.last_block[1] = height % 8;
> > +             info.block[2] = 1;
> > +             info.grid[0] = DIV_ROUND_UP(width, 8);
> > +             info.grid[1] = DIV_ROUND_UP(height, 8);
> > +             info.grid[2] = depth;
> > +     }
> > +
> > +     ctx->launch_grid(ctx, &info);
> > +
> > +     sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> > +                    (sctx->chip_class <= VI ?
> SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
> > +                    si_get_flush_flags(sctx, SI_COHERENCY_SHADER,
> L2_STREAM);
> > +     ctx->bind_compute_state(ctx, saved_cs);
> > +     ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2,
> saved_image);
> > +     ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
> > +}
> > +
>
> > +void *si_create_copy_image_compute_shader(struct pipe_context *ctx)
> > +{
> > +     static const char text[] =
> > +             "COMP\n"
> > +             "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
> > +             "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
> > +             "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
> > +             "DCL SV[0], THREAD_ID\n"
> > +             "DCL SV[1], BLOCK_ID\n"
> > +             "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT,
> WR\n"
> > +             "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT,
> WR\n"
> > +             "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
> > +             "DCL TEMP[0..4], LOCAL\n"
> > +             "IMM[0] UINT32 {8, 1, 0, 0}\n"
> > +             "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
> > +             "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
> > +             "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
> > +             "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY,
> PIPE_FORMAT_R32G32B32A32_FLOAT\n"
> > +             "MOV TEMP[4].xyz, CONST[0][1].xyzw\n"
> > +             "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n"
> > +             "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY,
> PIPE_FORMAT_R32G32B32A32_FLOAT\n"
> > +             "END\n";
> > +
> > +     struct tgsi_token tokens[1024];
> > +     struct pipe_compute_state state = {0};
> > +
> > +     if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
> > +             assert(false);
> > +             return NULL;
> > +     }
> > +
> > +     state.ir_type = PIPE_SHADER_IR_TGSI;
> > +     state.prog = tokens;
> > +
> > +     return ctx->create_compute_state(ctx, &state);
> > +}
> > +
> >
> Hi,
>
> Here is my summary of my understanding of the proposal implementation
> for the copy implementation:
>
> . Store input and output (x, y, z) offsets into a constant buffer
> . (8, 8) workgroups
> . Each workitem copies pixel (x+get_group_id(0)*8+get_local_id(0),
> y+get_group_id(1)*8+get_local_id(1),
> z+get_group_id(2)*8+get_local_id(2)). The pixel is RGBA.
>

The block size in Z is 1.

>
> Some questions:
> . What happens when the textures do not have some components ? R32F for
> example
>

Components that don't exist are not stored.

> . I'm not familiar with using images in compute shaders, but is it ok to
> declare as ARGB32F even if the input/output data is not float ?
>

The driver ignores the format specified by shaders.

>
> Some comments:
>
> . If src_x, dstx, etcs are not multiple of (8, 8), the workgroups won't
> be aligned well with the tiling pattern. Fortunately cache should
> mitigate the loss, but if that's an important case to handle, one could
> write the shader differently to have all workgroups (except at border)
> aligned. I guess one can benchmark see if that tiling alignment matters
> much here.
>

That would complicate the shaders too much.

. Overhead can be reduced by copying several pixels per work-item.
>

Patches welcome. :)

. If the src and dst region are perfectly aligned with the tiling
> pattern, the copy can be reduced to just moving a rectangle of memory
> (no tiling) and could be implemented with dma_copy if no conversion is
> needed or with a shader using buffers (no images), which would avoid
> using the image sampling hw which I believe can be more limiting than
> sampling a buffer when there is a lot of wavefronts. The data conversion
> can be done for no cost in the shader as it should be memory bound.
>

Too complicated.

> . (8, 8) is not optimal for linear tiled images (but I guess we don't
> often get to use them with resource_copy_region).
>

Yes, linear -> linear copies are slower. tiled <-> linear copies wouldn't
be improved.

Marek
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20190115/f6d53afc/attachment-0001.html>