[Mesa-dev] [PATCH 1/2] radeonsi: use compute for clear_render_target when possible

Dieter Nützel Dieter at nuetzel-hh.de
Tue Feb 12 04:41:36 UTC 2019


Maybe this patch series needs a rebase?

Dieter

Am 24.01.2019 00:28, schrieb Marek Olšák:
> From: Sonny Jiang <sonny.jiang at amd.com>
> 
> Signed-off-by: Sonny Jiang <sonny.jiang at amd.com>
> Signed-off-by: Marek Olšák <marek.olsak at amd.com>
> ---
>  src/gallium/drivers/radeonsi/si_clear.c       |  6 ++
>  .../drivers/radeonsi/si_compute_blit.c        | 96 +++++++++++++++++++
>  src/gallium/drivers/radeonsi/si_pipe.c        |  4 +
>  src/gallium/drivers/radeonsi/si_pipe.h        |  9 ++
>  .../drivers/radeonsi/si_shaderlib_tgsi.c      | 69 +++++++++++++
>  5 files changed, 184 insertions(+)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
> index b3910a4651c..8afc01f2ccc 100644
> --- a/src/gallium/drivers/radeonsi/si_clear.c
> +++ b/src/gallium/drivers/radeonsi/si_clear.c
> @@ -664,20 +664,26 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
>  }
> 
>  static void si_clear_render_target(struct pipe_context *ctx,
>  				   struct pipe_surface *dst,
>  				   const union pipe_color_union *color,
>  				   unsigned dstx, unsigned dsty,
>  				   unsigned width, unsigned height,
>  				   bool render_condition_enabled)
>  {
>  	struct si_context *sctx = (struct si_context *)ctx;
> +	struct si_texture *sdst = (struct si_texture*)dst->texture;
> +
> +	if (dst->texture->nr_samples <= 1 && !sdst->dcc_offset) {
> +		si_compute_clear_render_target(ctx, dst, color, dstx, dsty, width, 
> height);
> +		return;
> +	}
> 
>  	si_blitter_begin(sctx, SI_CLEAR_SURFACE |
>  			 (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
>  	util_blitter_clear_render_target(sctx->blitter, dst, color,
>  					 dstx, dsty, width, height);
>  	si_blitter_end(sctx);
>  }
> 
>  static void si_clear_depth_stencil(struct pipe_context *ctx,
>  				   struct pipe_surface *dst,
> diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
> index 38c48c30be9..f06497f4dac 100644
> --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
> @@ -18,20 +18,21 @@
>   * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
>   * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
>   * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>   * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
>   * USE OR OTHER DEALINGS IN THE SOFTWARE.
>   *
>   */
> 
>  #include "si_pipe.h"
>  #include "util/u_format.h"
> +#include "util/format_srgb.h"
> 
>  /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
>   * and L2_STREAM for src.
>   */
>  static enum si_cache_policy get_cache_policy(struct si_context *sctx,
>  					     enum si_coherency coher,
>  					     uint64_t size)
>  {
>  	if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
>  					  coher == SI_COHERENCY_CP)) ||
> @@ -418,10 +419,105 @@ void si_compute_copy_image(struct si_context *sctx,
>  	ctx->bind_compute_state(ctx, saved_cs);
>  	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
>  	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
>  	si_compute_internal_end(sctx);
>  }
> 
>  void si_init_compute_blit_functions(struct si_context *sctx)
>  {
>  	sctx->b.clear_buffer = si_pipe_clear_buffer;
>  }
> +
> +/* Clear a region of a color surface to a constant value. */
> +void si_compute_clear_render_target(struct pipe_context *ctx,
> +				    struct pipe_surface *dstsurf,
> +				    const union pipe_color_union *color,
> +				    unsigned dstx, unsigned dsty,
> +				    unsigned width, unsigned height)
> +{
> +	struct si_context *sctx = (struct si_context *)ctx;
> +	unsigned num_layers = dstsurf->u.tex.last_layer -
> dstsurf->u.tex.first_layer + 1;
> +	unsigned data[4 + sizeof(color->ui)] = {dstx, dsty,
> dstsurf->u.tex.first_layer, 0};
> +
> +	if (width == 0 || height == 0)
> +		return;
> +
> +	if (util_format_is_srgb(dstsurf->format)) {
> +		union pipe_color_union color_srgb;
> +		for (int i = 0; i < 3; i++)
> +			color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
> +		color_srgb.f[3] = color->f[3];
> +		memcpy(data + 4, color_srgb.ui, sizeof(color->ui));
> +	} else {
> +		memcpy(data + 4, color->ui, sizeof(color->ui));
> +	}
> +
> +	si_compute_internal_begin(sctx);
> +	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> +		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
> +	si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true);
> +
> +	struct pipe_constant_buffer saved_cb = {};
> +	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
> +
> +	struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
> +	struct pipe_image_view saved_image = {0};
> +	util_copy_image_view(&saved_image, &images->views[0]);
> +
> +	void *saved_cs = sctx->cs_shader_state.program;
> +
> +	struct pipe_constant_buffer cb = {};
> +	cb.buffer_size = sizeof(data);
> +	cb.user_buffer = data;
> +	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
> +
> +	struct pipe_image_view image = {0};
> +	image.resource = dstsurf->texture;
> +	image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
> +	image.format = util_format_linear(dstsurf->format);
> +	image.u.tex.level = dstsurf->u.tex.level;
> +	image.u.tex.first_layer = 0; /* 3D images ignore first_layer 
> (BASE_ARRAY) */
> +	image.u.tex.last_layer = dstsurf->u.tex.last_layer;
> +
> +	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);
> +
> +	struct pipe_grid_info info = {0};
> +
> +	if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
> +		if (!sctx->cs_clear_render_target)
> +			sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
> +		ctx->bind_compute_state(ctx, sctx->cs_clear_render_target);
> +		info.block[0] = 8;
> +		sctx->compute_last_block[0] = width % 8;
> +		info.block[1] = 8;
> +		sctx->compute_last_block[1] = height % 8;
> +		info.block[2] = 1;
> +		info.grid[0] = DIV_ROUND_UP(width, 8);
> +		info.grid[1] = DIV_ROUND_UP(height, 8);
> +		info.grid[2] = num_layers;
> +	} else {
> +		if (!sctx->cs_clear_render_target_1d_array)
> +			sctx->cs_clear_render_target_1d_array =
> +				si_clear_render_target_shader_1d_array(ctx);
> +		ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array);
> +		info.block[0] = 64;
> +		sctx->compute_last_block[0] = width % 64;
> +		info.block[1] = 1;
> +		info.block[2] = 1;
> +		info.grid[0] = DIV_ROUND_UP(width, 64);
> +		info.grid[1] = num_layers;
> +		info.grid[2] = 1;
> +	}
> +
> +	ctx->launch_grid(ctx, &info);
> +
> +	sctx->compute_last_block[0] = 0;
> +	sctx->compute_last_block[1] = 0;
> +
> +	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> +		       (sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) 
> |
> +		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
> +	ctx->bind_compute_state(ctx, saved_cs);
> +	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
> +	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
> +	si_compute_internal_end(sctx);
> +}
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index 41d395d7d3f..439b550c4cf 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -198,20 +198,24 @@ static void si_destroy_context(struct pipe_context *context)
>  	if (sctx->vs_blit_texcoord)
>  		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
>  	if (sctx->cs_clear_buffer)
>  		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
>  	if (sctx->cs_copy_buffer)
>  		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
>  	if (sctx->cs_copy_image)
>  		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
>  	if (sctx->cs_copy_image_1d_array)
>  		sctx->b.delete_compute_state(&sctx->b, 
> sctx->cs_copy_image_1d_array);
> +	if (sctx->cs_clear_render_target)
> +		sctx->b.delete_compute_state(&sctx->b, 
> sctx->cs_clear_render_target);
> +	if (sctx->cs_clear_render_target_1d_array)
> +		sctx->b.delete_compute_state(&sctx->b,
> sctx->cs_clear_render_target_1d_array);
> 
>  	if (sctx->blitter)
>  		util_blitter_destroy(sctx->blitter);
> 
>  	/* Release DCC stats. */
>  	for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
>  		assert(!sctx->dcc_stats[i].query_active);
> 
>  		for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
>  			if (sctx->dcc_stats[i].ps_stats[j])
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 3a5d9d2fbd2..437144316d0 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -805,20 +805,22 @@ struct si_context {
>  	void				*custom_blend_dcc_decompress;
>  	void				*vs_blit_pos;
>  	void				*vs_blit_pos_layered;
>  	void				*vs_blit_color;
>  	void				*vs_blit_color_layered;
>  	void				*vs_blit_texcoord;
>  	void				*cs_clear_buffer;
>  	void				*cs_copy_buffer;
>  	void				*cs_copy_image;
>  	void				*cs_copy_image_1d_array;
> +	void				*cs_clear_render_target;
> +	void				*cs_clear_render_target_1d_array;
>  	struct si_screen		*screen;
>  	struct pipe_debug_callback	debug;
>  	struct ac_llvm_compiler		compiler; /* only non-threaded compilation 
> */
>  	struct si_shader_ctx_state	fixed_func_tcs_shader;
>  	struct si_resource		*wait_mem_scratch;
>  	unsigned			wait_mem_number;
>  	uint16_t			prefetch_L2_mask;
> 
>  	bool				gfx_flush_in_progress:1;
>  	bool				gfx_last_ib_is_busy:1;
> @@ -1182,20 +1184,25 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
>  void si_copy_buffer(struct si_context *sctx,
>  		    struct pipe_resource *dst, struct pipe_resource *src,
>  		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
>  void si_compute_copy_image(struct si_context *sctx,
>  			   struct pipe_resource *dst,
>  			   unsigned dst_level,
>  			   struct pipe_resource *src,
>  			   unsigned src_level,
>  			   unsigned dstx, unsigned dsty, unsigned dstz,
>  			   const struct pipe_box *src_box);
> +void si_compute_clear_render_target(struct pipe_context *ctx,
> +                                    struct pipe_surface *dstsurf,
> +                                    const union pipe_color_union 
> *color,
> +                                    unsigned dstx, unsigned dsty,
> +                                    unsigned width, unsigned height);
>  void si_init_compute_blit_functions(struct si_context *sctx);
> 
>  /* si_cp_dma.c */
>  #define SI_CPDMA_SKIP_CHECK_CS_SPACE	(1 << 0) /* don't call 
> need_cs_space */
>  #define SI_CPDMA_SKIP_SYNC_AFTER	(1 << 1) /* don't wait for DMA after
> the copy */
>  #define SI_CPDMA_SKIP_SYNC_BEFORE	(1 << 2) /* don't wait for DMA
> before the copy (RAW hazards) */
>  #define SI_CPDMA_SKIP_GFX_SYNC		(1 << 3) /* don't flush caches and
> don't wait for PS/CS */
>  #define SI_CPDMA_SKIP_BO_LIST_UPDATE	(1 << 4) /* don't update the BO 
> list */
>  #define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \
>  			   SI_CPDMA_SKIP_SYNC_AFTER | \
> @@ -1297,20 +1304,22 @@ void si_resume_queries(struct si_context *sctx);
> 
>  /* si_shaderlib_tgsi.c */
>  void *si_get_blitter_vs(struct si_context *sctx, enum 
> blitter_attrib_type type,
>  			unsigned num_layers);
>  void *si_create_fixed_func_tcs(struct si_context *sctx);
>  void *si_create_dma_compute_shader(struct pipe_context *ctx,
>  				   unsigned num_dwords_per_thread,
>  				   bool dst_stream_cache_policy, bool is_copy);
>  void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
>  void *si_create_copy_image_compute_shader_1d_array(struct pipe_context 
> *ctx);
> +void *si_clear_render_target_shader(struct pipe_context *ctx);
> +void *si_clear_render_target_shader_1d_array(struct pipe_context 
> *ctx);
>  void *si_create_query_result_cs(struct si_context *sctx);
> 
>  /* si_test_dma.c */
>  void si_test_dma(struct si_screen *sscreen);
> 
>  /* si_test_clearbuffer.c */
>  void si_test_dma_perf(struct si_screen *sscreen);
> 
>  /* si_uvd.c */
>  struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context 
> *context,
> diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
> index 55f96b3a25e..91a23b1d7ed 100644
> --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
> +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
> @@ -509,10 +509,79 @@ void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx)
>  	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
>  		assert(false);
>  		return NULL;
>  	}
> 
>  	state.ir_type = PIPE_SHADER_IR_TGSI;
>  	state.prog = tokens;
> 
>  	return ctx->create_compute_state(ctx, &state);
>  }
> +
> +void *si_clear_render_target_shader(struct pipe_context *ctx)
> +{
> +	static const char text[] =
> +		"COMP\n"
> +		"PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
> +		"PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
> +		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
> +		"DCL SV[0], THREAD_ID\n"
> +		"DCL SV[1], BLOCK_ID\n"
> +		"DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
> +		"DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
> +		"DCL TEMP[0..3], LOCAL\n"
> +		"IMM[0] UINT32 {8, 1, 0, 0}\n"
> +		"MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
> +		"UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
> +		"UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
> +		"MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
> +		"STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY,
> PIPE_FORMAT_R32G32B32A32_FLOAT\n"
> +		"END\n";
> +
> +	struct tgsi_token tokens[1024];
> +	struct pipe_compute_state state = {0};
> +
> +	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
> +		assert(false);
> +		return NULL;
> +	}
> +
> +	state.ir_type = PIPE_SHADER_IR_TGSI;
> +	state.prog = tokens;
> +
> +	return ctx->create_compute_state(ctx, &state);
> +}
> +
> +/* TODO: The 1D_ARRAY variant has not been properly tested yet. */
> +void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx)
> +{
> +	static const char text[] =
> +		"COMP\n"
> +		"PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
> +		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
> +		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
> +		"DCL SV[0], THREAD_ID\n"
> +		"DCL SV[1], BLOCK_ID\n"
> +		"DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
> +		"DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
> +		"DCL TEMP[0..3], LOCAL\n"
> +		"IMM[0] UINT32 {64, 1, 0, 0}\n"
> +		"MOV TEMP[0].xy, CONST[0][0].xzzw\n"
> +		"UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
> +		"UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
> +		"MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
> +		"STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY,
> PIPE_FORMAT_R32G32B32A32_FLOAT\n"
> +		"END\n";
> +
> +	struct tgsi_token tokens[1024];
> +	struct pipe_compute_state state = {0};
> +
> +	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
> +		assert(false);
> +		return NULL;
> +	}
> +
> +	state.ir_type = PIPE_SHADER_IR_TGSI;
> +	state.prog = tokens;
> +
> +	return ctx->create_compute_state(ctx, &state);
> +}


More information about the mesa-dev mailing list