[Mesa-dev] [PATCH 07/14] radeonsi: Add offchip buffer address calculation.

Tue May 10 16:25:35 UTC 2016

On 10.05.2016 05:52, Bas Nieuwenhuizen wrote:
> Instead of creating a memory area per patch and per vertex, we put
> the same attribute of every vertex & patch together. Most loads
> and stores access the same attribute across all lanes, only for
> different patches and vertices.
>
> For the TCS this results in tightly packed data for 4-component
> stores.
>
> For the TES this is not the case as within a patch the loads
> often also access the same vertex. However if there are < 4
> vertices/patch, this still results in a reduction of the number
> of cache lines. In the LDS situation we only do better than worst
> case if the data per patch < 64 bytes, which due to the
> tessellation factors is pretty much never.
>
> We do not use hardware swizzling for this. It would slightly reduce
> the number of executed VALU instructions, but I had issues with
> increased wait times that I haven't been able to solve yet.
> Furthermore, the tbuffer_store intrinsic does not support both
> VGPR offset and an index, so we have a problem storing
> indirectly indexed outputs. This can be solved by temporarily
> storing arrays in LDS and then copying them, but I don't think
> that is worth the effort. The difference in VALU cycles
> hardware swizzling gives is about 0.2% of total busy cycles.
> That is without handling the array case.
>
> I chose for attributes instead of components as they are often
> accessed together, and the software swizzling takes VALU cycles
> for calculating offsets.
>
> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
> ---
>   src/gallium/drivers/radeonsi/si_shader.c | 143 +++++++++++++++++++++++++++++++
>   1 file changed, 143 insertions(+)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index adbee73..90830ee 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -664,6 +664,149 @@ static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
>   			    lp_build_const_int32(gallivm, param * 4), "");
>   }
>
> +/* The offchip buffer layout for TCS->TES is
> + *
> + * - attribute 0 of patch 0 vertex 0
> + * - attribute 0 of patch 0 vertex 1
> + * - attribute 0 of patch 0 vertex 2
> + *   ...
> + * - attribute 0 of patch 1 vertex 0
> + * - attribute 0 of patch 1 vertex 1
> + *   ...
> + * - attribute 1 of patch 0 vertex 0
> + * - attribute 1 of patch 0 vertex 1
> + *   ...
> + * - per patch attribute 0 of patch 0 vertex 0
> + * - per patch attribute 0 of patch 0 vertex 1

Should be "... of patch 0" and "... of patch 1", right?

> + *   ...
> + *
> + * Note that every attribute has 4 components.
> + */
> +static LLVMValueRef get_buffer_address(struct si_shader_context *ctx,
> +                                       LLVMValueRef vertex_index,
> +                                       LLVMValueRef param_index)
> +{
> +	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
> +	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
> +	LLVMValueRef param_stride, constant16;
> +
> +	vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
> +	num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
> +	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
> +	                              num_patches, "");
> +
> +	constant16 = lp_build_const_int32(gallivm, 16);
> +	if (vertex_index) {
> +		base_addr = LLVMBuildMul(gallivm->builder,
> +                                         constant16,
> +		                         vertices_per_patch, "");
> +
> +		base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
> +		                         base_addr, "");
> +
> +		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
> +		                         LLVMBuildMul(gallivm->builder, vertex_index,
> +		                                      constant16, ""), "");

Nitpick, but... apply the distributive law and calculate (rel_patch_id * 
vertices_per_patch + vertex_idx) * 16.

> +
> +		param_stride = LLVMBuildMul(gallivm->builder, total_vertices,
> +		                            constant16, "");
> +	} else {
> +		LLVMValueRef patch_data_offset =
> +		           unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);
> +
> +		base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
> +		                         constant16, "");
> +
> +		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
> +		                         patch_data_offset, "");
> +
> +		param_stride = LLVMBuildMul(gallivm->builder, num_patches,
> +		                            constant16, "");
> +	}
> +
> +	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
> +	                         LLVMBuildMul(gallivm->builder, param_index,
> +	                                      param_stride, ""), "");
> +
> +	return base_addr;
> +}
> +
> +static LLVMValueRef get_buffer_address_from_reg(struct si_shader_context *ctx,
> +                                       const struct tgsi_full_dst_register *dst,
> +                                       const struct tgsi_full_src_register *src)
> +{
> +	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
> +	struct tgsi_shader_info *info = &ctx->shader->selector->info;
> +	ubyte *name, *index, *array_first;
> +	struct tgsi_full_dst_register reg;
> +	LLVMValueRef vertex_index = NULL;
> +	LLVMValueRef param_index = NULL;
> +	unsigned param_index_base, param_base;
> +
> +	/* Set the register description. The address computation is the same
> +	 * for sources and destinations. */
> +	if (src) {
> +		reg.Register.File = src->Register.File;
> +		reg.Register.Index = src->Register.Index;
> +		reg.Register.Indirect = src->Register.Indirect;
> +		reg.Register.Dimension = src->Register.Dimension;
> +		reg.Indirect = src->Indirect;
> +		reg.Dimension = src->Dimension;
> +		reg.DimIndirect = src->DimIndirect;
> +	} else
> +		reg = *dst;

You could reverse the roles of src and dst and use 
tgsi_full_src_register_from_dst, but I don't feel strongly either way.

Apart from the comments above, patches 1-3 and 5-7 are

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

> +
> +	/* If the register is 2-dimensional (e.g. an array of vertices
> +	 * in a primitive), calculate the base address of the vertex. */
> +	if (reg.Register.Dimension) {
> +
> +		if (reg.Dimension.Indirect)
> +			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
> +			                                  reg.Dimension.Index);
> +		else
> +			vertex_index = lp_build_const_int32(gallivm,
> +			                                    reg.Dimension.Index);
> +	}
> +
> +	/* Get information about the register. */
> +	if (reg.Register.File == TGSI_FILE_INPUT) {
> +		name = info->input_semantic_name;
> +		index = info->input_semantic_index;
> +		array_first = info->input_array_first;
> +	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
> +		name = info->output_semantic_name;
> +		index = info->output_semantic_index;
> +		array_first = info->output_array_first;
> +	} else {
> +		assert(0);
> +		return NULL;
> +	}
> +
> +	if (reg.Register.Indirect) {
> +		if (reg.Indirect.ArrayID)
> +			param_base = array_first[reg.Indirect.ArrayID];
> +		else
> +			param_base = reg.Register.Index;
> +
> +		param_index = get_indirect_index(ctx, &reg.Indirect,
> +		                                 reg.Register.Index - param_base);
> +
> +	} else {
> +		param_base = reg.Register.Index;
> +		param_index = lp_build_const_int32(gallivm, 0);
> +	}
> +
> +	param_index_base = si_shader_io_get_unique_index(name[param_base],
> +	                                                 index[param_base]);
> +
> +	param_index = LLVMBuildAdd(gallivm->builder, param_index,
> +	                           lp_build_const_int32(gallivm, param_index_base),
> +	                           "");
> +
> +	/* Add the base address of the element. */
> +	return get_buffer_address(ctx, vertex_index, param_index);
> +}
> +
>   /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
>    * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
>    * or v4i32 (num_channels=3,4). */
>