[Mesa-dev] [PATCH 19/19] radeonsi: don't declare LDS in PS when ds_bpermute is used

Tue Oct 4 09:46:13 UTC 2016

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

On 02.10.2016 23:09, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> I guess this patch is not strictly needed, because dead code elimination
> already removes the unused LDS declaration.
> ---
>  src/gallium/drivers/radeonsi/si_pipe.c   | 3 +++
>  src/gallium/drivers/radeonsi/si_pipe.h   | 1 +
>  src/gallium/drivers/radeonsi/si_shader.c | 7 +++----
>  3 files changed, 7 insertions(+), 4 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index 43d6377..2aa679c 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -810,20 +810,23 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
>  		(sscreen->b.chip_class == VI &&
>  		 sscreen->b.info.pfp_fw_version >= 121 &&
>  		 sscreen->b.info.me_fw_version >= 87) ||
>  		(sscreen->b.chip_class == CIK &&
>  		 sscreen->b.info.pfp_fw_version >= 211 &&
>  		 sscreen->b.info.me_fw_version >= 173) ||
>  		(sscreen->b.chip_class == SI &&
>  		 sscreen->b.info.pfp_fw_version >= 121 &&
>  		 sscreen->b.info.me_fw_version >= 87);
>
> +	sscreen->has_ds_bpermute = HAVE_LLVM >= 0x0309 &&
> +				   sscreen->b.chip_class >= VI;
> +
>  	sscreen->b.has_cp_dma = true;
>  	sscreen->b.has_streamout = true;
>  	pipe_mutex_init(sscreen->shader_parts_mutex);
>  	sscreen->use_monolithic_shaders =
>  		HAVE_LLVM < 0x0308 ||
>  		(sscreen->b.debug_flags & DBG_MONOLITHIC_SHADERS) != 0;
>
>  	sscreen->b.barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 |
>  					    SI_CONTEXT_INV_VMEM_L1 |
>  					    SI_CONTEXT_INV_GLOBAL_L2;
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 558e185..3cefee7 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -76,20 +76,21 @@
>  struct si_compute;
>  struct hash_table;
>  struct u_suballocator;
>
>  struct si_screen {
>  	struct r600_common_screen	b;
>  	unsigned			gs_table_depth;
>  	unsigned			tess_offchip_block_dw_size;
>  	bool				has_distributed_tess;
>  	bool				has_draw_indirect_multi;
> +	bool				has_ds_bpermute;
>
>  	/* Whether shaders are monolithic (1-part) or separate (3-part). */
>  	bool				use_monolithic_shaders;
>  	bool				record_llvm_ir;
>
>  	pipe_mutex			shader_parts_mutex;
>  	struct si_shader_part		*vs_prologs;
>  	struct si_shader_part		*vs_epilogs;
>  	struct si_shader_part		*tcs_epilogs;
>  	struct si_shader_part		*ps_prologs;
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 7844ebd..30bf093 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -5002,43 +5002,41 @@ static void si_llvm_emit_ddxy(
>  	const struct lp_build_tgsi_action *action,
>  	struct lp_build_tgsi_context *bld_base,
>  	struct lp_build_emit_data *emit_data)
>  {
>  	struct si_shader_context *ctx = si_shader_context(bld_base);
>  	struct gallivm_state *gallivm = bld_base->base.gallivm;
>  	unsigned opcode = emit_data->info->opcode;
>  	LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, val, args[2];
>  	int idx;
>  	unsigned mask;
> -	bool has_ds_bpermute = HAVE_LLVM >= 0x0309 &&
> -			       ctx->screen->b.chip_class >= VI;
>
>  	thread_id = get_thread_id(ctx);
>
>  	if (opcode == TGSI_OPCODE_DDX_FINE)
>  		mask = TID_MASK_LEFT;
>  	else if (opcode == TGSI_OPCODE_DDY_FINE)
>  		mask = TID_MASK_TOP;
>  	else
>  		mask = TID_MASK_TOP_LEFT;
>
>  	tl_tid = LLVMBuildAnd(gallivm->builder, thread_id,
>  				lp_build_const_int32(gallivm, mask), "");
>
>  	/* for DDX we want to next X pixel, DDY next Y pixel. */
>  	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
>  	trbl_tid = LLVMBuildAdd(gallivm->builder, tl_tid,
>  				  lp_build_const_int32(gallivm, idx), "");
>
>  	val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
>
> -	if (has_ds_bpermute) {
> +	if (ctx->screen->has_ds_bpermute) {
>  		args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
>  				       lp_build_const_int32(gallivm, 4), "");
>  		args[1] = val;
>  		tl = lp_build_intrinsic(gallivm->builder,
>  					"llvm.amdgcn.ds.bpermute", ctx->i32,
>  					args, 2, LLVMReadNoneAttribute);
>
>  		args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
>  				       lp_build_const_int32(gallivm, 4), "");
>  		trbl = lp_build_intrinsic(gallivm->builder,
> @@ -5731,21 +5729,22 @@ static void create_function(struct si_shader_context *ctx)
>  	for (i = 0; i <= last_sgpr; ++i)
>  		shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
>
>  	/* Unused fragment shader inputs are eliminated by the compiler,
>  	 * so we don't know yet how many there will be.
>  	 */
>  	if (ctx->type != PIPE_SHADER_FRAGMENT)
>  		for (; i < num_params; ++i)
>  			shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
>
> -	if (bld_base->info &&
> +	if (!ctx->screen->has_ds_bpermute &&
> +	    bld_base->info &&
>  	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
>  	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
>  	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
>  	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
>  	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
>  	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
>  		ctx->lds =
>  			LLVMAddGlobalInAddressSpace(gallivm->module,
>  						    LLVMArrayType(ctx->i32, 64),
>  						    "ddxy_lds",
>