[Mesa-dev] [PATCH 1/3] radeonsi: move building llvm.SI.load.const into ac_build_buffer_load

Mon May 22 06:44:21 UTC 2017

On 19.05.2017 17:54, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> ---
>  src/amd/common/ac_llvm_build.c           | 50 ++++++++++++++++++++++++--------
>  src/amd/common/ac_llvm_build.h           |  3 +-
>  src/amd/common/ac_nir_to_llvm.c          |  2 +-
>  src/gallium/drivers/radeonsi/si_shader.c | 23 +++++++--------
>  4 files changed, 51 insertions(+), 27 deletions(-)
>
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index 87a1fb7..4b048ba 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -626,47 +626,73 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
>  LLVMValueRef
>  ac_build_buffer_load(struct ac_llvm_context *ctx,
>  		     LLVMValueRef rsrc,
>  		     int num_channels,
>  		     LLVMValueRef vindex,
>  		     LLVMValueRef voffset,
>  		     LLVMValueRef soffset,
>  		     unsigned inst_offset,
>  		     unsigned glc,
>  		     unsigned slc,
> -		     bool readonly_memory)
> +		     bool readonly_memory,
> +		     bool coherent)

I think the series is basically fine, but the use of "coherent" here is 
potentially confusing because "coherent" is already used as the memory 
qualifier in shaders, and it sets glc.

Are there any buffer loads with readonly_memory && !glc && !vindex && 
!voffset where we aren't allowed to use SMEM? If possible, we could just 
use that to control whether llvm.SI.load.const is used.

I seem to recall that there were some subtle differences in bounds 
checking, and perhaps those prevent the use of SMEM sometimes. But then 
I'd rather use a more explicit parameter name like smem_forbidden or 
smem_allowed.

Cheers,
Nicolai

>  {
> +	LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
> +	if (voffset)
> +		offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
> +	if (soffset)
> +		offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
> +
> +	/* Loads from a shader buffer that has no stores in the same shader
> +	 * and is non-coherent with other shader invocations can use SMEM.
> +	 */
> +	if (readonly_memory && !coherent) {
> +		assert(vindex == NULL);
> +		assert(glc == 0);
> +		assert(slc == 0);
> +
> +		LLVMValueRef result[4];
> +
> +		for (int i = 0; i < num_channels; i++) {
> +			if (i) {
> +				offset = LLVMBuildAdd(ctx->builder, offset,
> +						      LLVMConstInt(ctx->i32, 4, 0), "");
> +			}
> +			LLVMValueRef args[2] = {rsrc, offset};
> +			result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32",
> +						       ctx->f32, args, 2,
> +						       AC_FUNC_ATTR_READNONE |
> +						       AC_FUNC_ATTR_LEGACY);
> +		}
> +		if (num_channels == 1)
> +			return result[0];
> +
> +		if (num_channels == 3)
> +			result[num_channels++] = LLVMGetUndef(ctx->f32);
> +		return ac_build_gather_values(ctx, result, num_channels);
> +	}
> +
>  	unsigned func = CLAMP(num_channels, 1, 3) - 1;
>
>  	LLVMValueRef args[] = {
>  		LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
>  		vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
> -		LLVMConstInt(ctx->i32, inst_offset, 0),
> +		offset,
>  		LLVMConstInt(ctx->i1, glc, 0),
>  		LLVMConstInt(ctx->i1, slc, 0)
>  	};
>
>  	LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
>  			       ctx->v4f32};
>  	const char *type_names[] = {"f32", "v2f32", "v4f32"};
>  	char name[256];
>
> -	if (voffset) {
> -		args[2] = LLVMBuildAdd(ctx->builder, args[2], voffset,
> -				"");
> -	}
> -
> -	if (soffset) {
> -		args[2] = LLVMBuildAdd(ctx->builder, args[2], soffset,
> -				"");
> -	}
> -
>  	snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
>  		 type_names[func]);
>
>  	return ac_build_intrinsic(ctx, name, types[func], args,
>  				  ARRAY_SIZE(args),
>  				  /* READNONE means writes can't affect it, while
>  				   * READONLY means that writes can affect it. */
>  				  readonly_memory && HAVE_LLVM >= 0x0400 ?
>  					  AC_FUNC_ATTR_READNONE :
>  					  AC_FUNC_ATTR_READONLY);
> diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
> index 0ecbc4a..754461b 100644
> --- a/src/amd/common/ac_llvm_build.h
> +++ b/src/amd/common/ac_llvm_build.h
> @@ -136,21 +136,22 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
>  LLVMValueRef
>  ac_build_buffer_load(struct ac_llvm_context *ctx,
>  		     LLVMValueRef rsrc,
>  		     int num_channels,
>  		     LLVMValueRef vindex,
>  		     LLVMValueRef voffset,
>  		     LLVMValueRef soffset,
>  		     unsigned inst_offset,
>  		     unsigned glc,
>  		     unsigned slc,
> -		     bool readonly_memory);
> +		     bool readonly_memory,
> +		     bool coherent);
>
>  LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
>  					 LLVMValueRef rsrc,
>  					 LLVMValueRef vindex,
>  					 LLVMValueRef voffset,
>  					 bool readonly_memory);
>
>  LLVMValueRef
>  ac_get_thread_id(struct ac_llvm_context *ctx);
>
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 8ae0a75..9794c70 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -2816,21 +2816,21 @@ load_tes_input(struct nir_to_llvm_context *ctx,
>  	param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>  	if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 &&
>  	    is_compact && const_index > 3) {
>  		const_index -= 3;
>  		param++;
>  	}
>  	buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index,
>  						     is_compact, vertex_index, indir_index);
>
>  	result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, instr->num_components, NULL,
> -				      buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, 1, 0, true);
> +				      buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, 1, 0, true, true);
>  	result = trim_vector(ctx, result, instr->num_components);
>  	result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx, &instr->dest.ssa), "");
>  	return result;
>  }
>
>  static LLVMValueRef
>  load_gs_input(struct nir_to_llvm_context *ctx,
>  	      nir_intrinsic_instr *instr)
>  {
>  	LLVMValueRef indir_index, vtx_offset;
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 61f1384..7dd121b 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -826,39 +826,39 @@ static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
>                                  LLVMValueRef base, bool readonly_memory)
>  {
>  	struct si_shader_context *ctx = si_shader_context(bld_base);
>  	struct gallivm_state *gallivm = &ctx->gallivm;
>  	LLVMValueRef value, value2;
>  	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
>  	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
>
>  	if (swizzle == ~0) {
>  		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
> -					     0, 1, 0, readonly_memory);
> +					     0, 1, 0, readonly_memory, true);
>
>  		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
>  	}
>
>  	if (!tgsi_type_is_64bit(type)) {
>  		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
> -					     0, 1, 0, readonly_memory);
> +					     0, 1, 0, readonly_memory, true);
>
>  		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
>  		return LLVMBuildExtractElement(gallivm->builder, value,
>  		                    LLVMConstInt(ctx->i32, swizzle, 0), "");
>  	}
>
>  	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
> -	                          swizzle * 4, 1, 0, readonly_memory);
> +	                          swizzle * 4, 1, 0, readonly_memory, true);
>
>  	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
> -	                           swizzle * 4 + 4, 1, 0, readonly_memory);
> +	                           swizzle * 4 + 4, 1, 0, readonly_memory, true);
>
>  	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
>  }
>
>  /**
>   * Load from LDS.
>   *
>   * \param type		output value type
>   * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
>   * \param dw_addr	address in dwords
> @@ -1147,28 +1147,28 @@ static LLVMValueRef fetch_input_gs(
>  		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
>  	}
>  	vtx_offset = lp_build_mul_imm(uint,
>  				      LLVMGetParam(ctx->main_fn,
>  						   vtx_offset_param),
>  				      4);
>
>  	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
>
>  	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
> -				     vtx_offset, soffset, 0, 1, 0, true);
> +				     vtx_offset, soffset, 0, 1, 0, true, true);
>  	if (tgsi_type_is_64bit(type)) {
>  		LLVMValueRef value2;
>  		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
>
>  		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
>  					      ctx->i32_0, vtx_offset, soffset,
> -					      0, 1, 0, true);
> +					      0, 1, 0, true, true);
>  		return si_llvm_emit_fetch_64bit(bld_base, type,
>  						value, value2);
>  	}
>  	return LLVMBuildBitCast(gallivm->builder,
>  				value,
>  				tgsi2llvmtype(bld_base, type), "");
>  }
>
>  static int lookup_interp_param_index(unsigned interpolate, unsigned location)
>  {
> @@ -1382,26 +1382,22 @@ static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
>  }
>
>
>  /**
>   * Load a dword from a constant buffer.
>   */
>  static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
>  				      LLVMValueRef resource,
>  				      LLVMValueRef offset)
>  {
> -	LLVMBuilderRef builder = ctx->gallivm.builder;
> -	LLVMValueRef args[2] = {resource, offset};
> -
> -	return lp_build_intrinsic(builder, "llvm.SI.load.const.v4i32", ctx->f32, args, 2,
> -				  LP_FUNC_ATTR_READNONE |
> -				  LP_FUNC_ATTR_LEGACY);
> +	return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
> +				    0, 0, 0, true, false);
>  }
>
>  static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
>  {
>  	struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
>  	struct gallivm_state *gallivm = &ctx->gallivm;
>  	LLVMBuilderRef builder = gallivm->builder;
>  	LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
>  	LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
>  	LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
> @@ -5192,21 +5188,22 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
>  				}
>
>  				LLVMValueRef soffset = LLVMConstInt(ctx.i32,
>  					offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
>  				offset++;
>
>  				outputs[i].values[chan] =
>  					ac_build_buffer_load(&ctx.ac,
>  							     ctx.gsvs_ring[0], 1,
>  							     ctx.i32_0, voffset,
> -							     soffset, 0, 1, 1, true);
> +							     soffset, 0, 1, 1,
> +							     true, true);
>  			}
>  		}
>
>  		/* Streamout and exports. */
>  		if (gs_selector->so.num_outputs) {
>  			si_llvm_emit_streamout(&ctx, outputs,
>  					       gsinfo->num_outputs,
>  					       stream);
>  		}
>
>

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.