[Mesa-dev] [PATCH] radeonsi: use compute for resource_copy_region when possible

Axel Davy davyaxel0 at gmail.com
Tue Jan 15 20:54:24 UTC 2019


On 15/01/2019 18:50, Marek Olšák wrote:
>   
> +void si_compute_copy_image(struct si_context *sctx,
> +			   struct pipe_resource *dst,
> +			   unsigned dst_level,
> +			   struct pipe_resource *src,
> +			   unsigned src_level,
> +			   unsigned dstx, unsigned dsty, unsigned dstz,
> +			   const struct pipe_box *src_box)
> +{
> +	struct pipe_context *ctx = &sctx->b;
> +	unsigned width = src_box->width;
> +	unsigned height = src_box->height;
> +	unsigned depth = src_box->depth;
> +
> +	unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};
> +
> +	if (width == 0 || height == 0)
> +		return;
> +
> +	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> +		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
> +	si_make_CB_shader_coherent(sctx, dst->nr_samples, true);
> +
> +	struct pipe_constant_buffer saved_cb = {};
> +	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
> +
> +	struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
> +	struct pipe_image_view saved_image[2] = {0};
> +	util_copy_image_view(&saved_image[0], &images->views[0]);
> +	util_copy_image_view(&saved_image[1], &images->views[1]);
> +
> +	void *saved_cs = sctx->cs_shader_state.program;
> +
> +	struct pipe_constant_buffer cb = {};
> +	cb.buffer_size = sizeof(data);
> +	cb.user_buffer = data;
> +	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
> +
> +	struct pipe_image_view image[2] = {0};
> +	image[0].resource = src;
> +	image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
> +	image[0].format = util_format_linear(src->format);
> +	image[0].u.tex.level = src_level;
> +	image[0].u.tex.first_layer = 0;
> +	image[0].u.tex.last_layer =
> +		src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
> +						: (unsigned)(src->array_size - 1);
> +	image[1].resource = dst;
> +	image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
> +	image[1].format = util_format_linear(dst->format);
> +	image[1].u.tex.level = dst_level;
> +	image[1].u.tex.first_layer = 0;
> +	image[1].u.tex.last_layer =
> +		dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
> +						: (unsigned)(dst->array_size - 1);
> +
> +	if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
> +		image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;
> +
> +	/* SNORM8 blitting has precision issues on some chips. Use the SINT
> +	 * equivalent instead, which doesn't force DCC decompression.
> +	 * Note that some chips avoid this issue by using SDMA.
> +	 */
> +	if (util_format_is_snorm8(dst->format)) {
> +		image[0].format = image[1].format =
> +			util_format_snorm8_to_sint8(dst->format);
> +	}
> +
> +	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
> +
> +	struct pipe_grid_info info = {0};
> +
> +	if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
> +		if (!sctx->cs_copy_image_1d_array)
> +			sctx->cs_copy_image_1d_array =
> +				si_create_copy_image_compute_shader_1d_array(ctx);
> +		ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
> +		info.block[0] = 64;
> +		info.last_block[0] = width % 64;
> +		info.block[1] = 1;
> +		info.block[2] = 1;
> +		info.grid[0] = DIV_ROUND_UP(width, 64);
> +		info.grid[1] = depth;
> +		info.grid[2] = 1;
> +	} else {
> +		if (!sctx->cs_copy_image)
> +			sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
> +		ctx->bind_compute_state(ctx, sctx->cs_copy_image);
> +		info.block[0] = 8;
> +		info.last_block[0] = width % 8;
> +		info.block[1] = 8;
> +		info.last_block[1] = height % 8;
> +		info.block[2] = 1;
> +		info.grid[0] = DIV_ROUND_UP(width, 8);
> +		info.grid[1] = DIV_ROUND_UP(height, 8);
> +		info.grid[2] = depth;
> +	}
> +
> +	ctx->launch_grid(ctx, &info);
> +
> +	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> +		       (sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
> +		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
> +	ctx->bind_compute_state(ctx, saved_cs);
> +	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
> +	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
> +}
> +

> +void *si_create_copy_image_compute_shader(struct pipe_context *ctx)
> +{
> +	static const char text[] =
> +		"COMP\n"
> +		"PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
> +		"PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
> +		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
> +		"DCL SV[0], THREAD_ID\n"
> +		"DCL SV[1], BLOCK_ID\n"
> +		"DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
> +		"DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
> +		"DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
> +		"DCL TEMP[0..4], LOCAL\n"
> +		"IMM[0] UINT32 {8, 1, 0, 0}\n"
> +		"MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
> +		"UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
> +		"UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
> +		"LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
> +		"MOV TEMP[4].xyz, CONST[0][1].xyzw\n"
> +		"UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n"
> +		"STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
> +		"END\n";
> +
> +	struct tgsi_token tokens[1024];
> +	struct pipe_compute_state state = {0};
> +
> +	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
> +		assert(false);
> +		return NULL;
> +	}
> +
> +	state.ir_type = PIPE_SHADER_IR_TGSI;
> +	state.prog = tokens;
> +
> +	return ctx->create_compute_state(ctx, &state);
> +}
> +
>
Hi,

Here is my summary of my understanding of the proposed copy 
implementation:

. Store input and output (x, y, z) offsets into a constant buffer
. (8, 8) workgroups
. Each work-item copies pixel (x+get_group_id(0)*8+get_local_id(0), 
y+get_group_id(1)*8+get_local_id(1), 
z+get_group_id(2)). The pixel is RGBA.

Some questions:
. What happens when the textures do not have all components? R32F, for 
example.
. I'm not familiar with using images in compute shaders, but is it OK to 
declare them as RGBA32F even if the input/output data is not float?

Some comments:

. If src_x, dstx, etc. are not multiples of (8, 8), the workgroups won't 
be aligned well with the tiling pattern. Fortunately cache should 
mitigate the loss, but if that's an important case to handle, one could 
write the shader differently to have all workgroups (except at border) 
aligned. I guess one can benchmark to see whether that tiling alignment matters 
much here.
. Overhead can be reduced by copying several pixels per work-item.
. If the src and dst region are perfectly aligned with the tiling 
pattern, the copy can be reduced to just moving a rectangle of memory 
(no tiling) and could be implemented with dma_copy if no conversion is 
needed or with a shader using buffers (no images), which would avoid 
using the image sampling hw which I believe can be more limiting than 
sampling a buffer when there are a lot of wavefronts. The data conversion 
can be done for no cost in the shader as it should be memory bound.
. (8, 8) is not optimal for linear tiled images (but I guess we don't 
often get to use them with resource_copy_region).


But most likely you already know all that and consider that this is not worth 
complicating the code to speed up corner cases.

Yours,


Axel Davy



More information about the mesa-dev mailing list