[Mesa-dev] [PATCH 07/14] radeonsi: Add offchip buffer address calculation.

Tue May 10 16:38:48 UTC 2016

On 10.05.2016 11:25, Nicolai Hähnle wrote:
> On 10.05.2016 05:52, Bas Nieuwenhuizen wrote:
>> Instead of creating a memory area per patch and per vertex, we put
>> the same attribute of every vertex & patch together. Most loads
>> and stores access the same attribute across all lanes, only for
>> different patches and vertices.
>>
>> For the TCS this results in tightly packed data for 4-component
>> stores.
>>
>> For the TES this is not the case as within a patch the loads
>> often also access the same vertex. However if there are < 4
>> vertices/patch, this still results in a reduction of the number
>> of cache lines. In the LDS situation we only do better than worst
>> case if the data per patch < 64 bytes, which due to the
>> tessellation factors is pretty much never.
>>
>> We do not use hardware swizzling for this. It would slightly reduce
>> the number of executed VALU instructions, but I had issues with
>> increased wait times that I haven't been able to solve yet.
>> Furthermore, the tbuffer_store intrinsic does not support both
>> VGPR offset and an index, so we have a problem storing
>> indirectly indexed outputs. This can be solved by temporarily
>> storing arrays in LDS and then copying them, but I don't think
>> that is worth the effort. The difference in VALU cycles
>> hardware swizzling gives is about 0.2% of total busy cycles.
>> That is without handling the array case.
>>
>> I chose for attributes instead of components as they are often
>> accessed together, and the software swizzling takes VALU cycles
>> for calculating offsets.
>>
>> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
>> ---
>>   src/gallium/drivers/radeonsi/si_shader.c | 143
>> +++++++++++++++++++++++++++++++
>>   1 file changed, 143 insertions(+)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>> b/src/gallium/drivers/radeonsi/si_shader.c
>> index adbee73..90830ee 100644
>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>> @@ -664,6 +664,149 @@ static LLVMValueRef get_dw_address(struct
>> si_shader_context *ctx,
>>                   lp_build_const_int32(gallivm, param * 4), "");
>>   }
>>
>> +/* The offchip buffer layout for TCS->TES is
>> + *
>> + * - attribute 0 of patch 0 vertex 0
>> + * - attribute 0 of patch 0 vertex 1
>> + * - attribute 0 of patch 0 vertex 2
>> + *   ...
>> + * - attribute 0 of patch 1 vertex 0
>> + * - attribute 0 of patch 1 vertex 1
>> + *   ...
>> + * - attribute 1 of patch 0 vertex 0
>> + * - attribute 1 of patch 0 vertex 1
>> + *   ...
>> + * - per patch attribute 0 of patch 0 vertex 0
>> + * - per patch attribute 0 of patch 0 vertex 1
>
> Should be "... of patch 0" and "... of patch 1", right?
>
>> + *   ...
>> + *
>> + * Note that every attribute has 4 components.
>> + */
>> +static LLVMValueRef get_buffer_address(struct si_shader_context *ctx,
>> +                                       LLVMValueRef vertex_index,
>> +                                       LLVMValueRef param_index)

Actually, could you change those function names to something like 
get_tcs_tes_buffer_address?

Thanks,
Nicolai

>> +{
>> +    struct gallivm_state *gallivm =
>> ctx->radeon_bld.soa.bld_base.base.gallivm;
>> +    LLVMValueRef base_addr, vertices_per_patch, num_patches,
>> total_vertices;
>> +    LLVMValueRef param_stride, constant16;
>> +
>> +    vertices_per_patch = unpack_param(ctx,
>> SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
>> +    num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
>> +    total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
>> +                                  num_patches, "");
>> +
>> +    constant16 = lp_build_const_int32(gallivm, 16);
>> +    if (vertex_index) {
>> +        base_addr = LLVMBuildMul(gallivm->builder,
>> +                                         constant16,
>> +                                 vertices_per_patch, "");
>> +
>> +        base_addr = LLVMBuildMul(gallivm->builder,
>> get_rel_patch_id(ctx),
>> +                                 base_addr, "");
>> +
>> +        base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
>> +                                 LLVMBuildMul(gallivm->builder,
>> vertex_index,
>> +                                              constant16, ""), "");
>
> Nitpick, but... apply the distributive law and calculate (rel_patch_id *
> vertices_per_patch + vertex_idx) * 16.
>
>> +
>> +        param_stride = LLVMBuildMul(gallivm->builder, total_vertices,
>> +                                    constant16, "");
>> +    } else {
>> +        LLVMValueRef patch_data_offset =
>> +                   unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16,
>> 16);
>> +
>> +        base_addr = LLVMBuildMul(gallivm->builder,
>> get_rel_patch_id(ctx),
>> +                                 constant16, "");
>> +
>> +        base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
>> +                                 patch_data_offset, "");
>> +
>> +        param_stride = LLVMBuildMul(gallivm->builder, num_patches,
>> +                                    constant16, "");
>> +    }
>> +
>> +    base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
>> +                             LLVMBuildMul(gallivm->builder, param_index,
>> +                                          param_stride, ""), "");
>> +
>> +    return base_addr;
>> +}
>> +
>> +static LLVMValueRef get_buffer_address_from_reg(struct
>> si_shader_context *ctx,
>> +                                       const struct
>> tgsi_full_dst_register *dst,
>> +                                       const struct
>> tgsi_full_src_register *src)
>> +{
>> +    struct gallivm_state *gallivm =
>> ctx->radeon_bld.soa.bld_base.base.gallivm;
>> +    struct tgsi_shader_info *info = &ctx->shader->selector->info;
>> +    ubyte *name, *index, *array_first;
>> +    struct tgsi_full_dst_register reg;
>> +    LLVMValueRef vertex_index = NULL;
>> +    LLVMValueRef param_index = NULL;
>> +    unsigned param_index_base, param_base;
>> +
>> +    /* Set the register description. The address computation is the same
>> +     * for sources and destinations. */
>> +    if (src) {
>> +        reg.Register.File = src->Register.File;
>> +        reg.Register.Index = src->Register.Index;
>> +        reg.Register.Indirect = src->Register.Indirect;
>> +        reg.Register.Dimension = src->Register.Dimension;
>> +        reg.Indirect = src->Indirect;
>> +        reg.Dimension = src->Dimension;
>> +        reg.DimIndirect = src->DimIndirect;
>> +    } else
>> +        reg = *dst;
>
> You could reverse the roles of src and dst and use
> tgsi_full_src_register_from_dst, but I don't feel strongly either way.
>
> Apart from the comments above, patches 1-3 and 5-7 are
>
> Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
>> +
>> +    /* If the register is 2-dimensional (e.g. an array of vertices
>> +     * in a primitive), calculate the base address of the vertex. */
>> +    if (reg.Register.Dimension) {
>> +
>> +        if (reg.Dimension.Indirect)
>> +            vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
>> +                                              reg.Dimension.Index);
>> +        else
>> +            vertex_index = lp_build_const_int32(gallivm,
>> +                                                reg.Dimension.Index);
>> +    }
>> +
>> +    /* Get information about the register. */
>> +    if (reg.Register.File == TGSI_FILE_INPUT) {
>> +        name = info->input_semantic_name;
>> +        index = info->input_semantic_index;
>> +        array_first = info->input_array_first;
>> +    } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
>> +        name = info->output_semantic_name;
>> +        index = info->output_semantic_index;
>> +        array_first = info->output_array_first;
>> +    } else {
>> +        assert(0);
>> +        return NULL;
>> +    }
>> +
>> +    if (reg.Register.Indirect) {
>> +        if (reg.Indirect.ArrayID)
>> +            param_base = array_first[reg.Indirect.ArrayID];
>> +        else
>> +            param_base = reg.Register.Index;
>> +
>> +        param_index = get_indirect_index(ctx, &reg.Indirect,
>> +                                         reg.Register.Index -
>> param_base);
>> +
>> +    } else {
>> +        param_base = reg.Register.Index;
>> +        param_index = lp_build_const_int32(gallivm, 0);
>> +    }
>> +
>> +    param_index_base = si_shader_io_get_unique_index(name[param_base],
>> +                                                     index[param_base]);
>> +
>> +    param_index = LLVMBuildAdd(gallivm->builder, param_index,
>> +                               lp_build_const_int32(gallivm,
>> param_index_base),
>> +                               "");
>> +
>> +    /* Add the base address of the element. */
>> +    return get_buffer_address(ctx, vertex_index, param_index);
>> +}
>> +
>>   /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by
>> num_channels=1..4.
>>    * The type of vdata must be one of i32 (num_channels=1), v2i32
>> (num_channels=2),
>>    * or v4i32 (num_channels=3,4). */
>>