[Mesa-dev] [PATCH 04/14] radeonsi: Add buffer load functions.

Tue May 10 17:18:47 UTC 2016

On 10.05.2016 11:36, Bas Nieuwenhuizen wrote:
> On Tue, May 10, 2016 at 6:28 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> On 10.05.2016 11:25, Bas Nieuwenhuizen wrote:
>>>
>>> On Tue, May 10, 2016 at 6:13 PM, Nicolai Hähnle <nhaehnle at gmail.com>
>>> wrote:
>>>>
>>>> On 10.05.2016 05:52, Bas Nieuwenhuizen wrote:
>>>>>
>>>>>
>>>>> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
>>>>> ---
>>>>>     src/gallium/drivers/radeonsi/si_shader.c | 81
>>>>> ++++++++++++++++++++++++++++++++
>>>>>     1 file changed, 81 insertions(+)
>>>>>
>>>>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>>>>> b/src/gallium/drivers/radeonsi/si_shader.c
>>>>> index 5897149..d3df4d6 100644
>>>>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>>>>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>>>>> @@ -733,6 +733,87 @@ static void build_tbuffer_store_dwords(struct
>>>>> si_shader_context *ctx,
>>>>>                               V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1,
>>>>> 0);
>>>>>     }
>>>>>
>>>>> +static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
>>>>> +                               LLVMValueRef rsrc,
>>>>> +                               int num_channels,
>>>>> +                               LLVMValueRef vindex,
>>>>> +                               LLVMValueRef voffset,
>>>>> +                               LLVMValueRef soffset,
>>>>> +                               unsigned inst_offset,
>>>>> +                               unsigned glc,
>>>>> +                               unsigned slc)
>>>>> +{
>>>>> +       struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
>>>>> +       LLVMValueRef args[] = {
>>>>> +               LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8,
>>>>> ""),
>>>>> +               voffset ? voffset : vindex,
>>>>> +               soffset,
>>>>> +               LLVMConstInt(ctx->i32, inst_offset, 0),
>>>>> +               LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
>>>>> +               LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
>>>>> +               LLVMConstInt(ctx->i32, glc, 0),
>>>>> +               LLVMConstInt(ctx->i32, slc, 0),
>>>>> +               LLVMConstInt(ctx->i32, 0, 0), // TFE
>>>>> +       };
>>>>> +
>>>>> +       unsigned func = CLAMP(num_channels, 1, 3) - 1;
>>>>> +       LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
>>>>> ctx->v4i32};
>>>>> +       const char *type_names[] = {"i32", "v2i32", "v4i32"};
>>>>> +       const char *arg_type = "i32";
>>>>> +
>>>>> +       if (voffset && vindex) {
>>>>> +               LLVMValueRef vaddr[] = {vindex, voffset};
>>>>> +
>>>>> +               arg_type = "v2i32";
>>>>> +               args[1] = lp_build_gather_values(gallivm, vaddr, 2);
>>>>> +       }
>>>>> +
>>>>> +       char name[256];
>>>>> +       snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
>>>>> +                type_names[func], arg_type);
>>>>
>>>>
>>>>
>>>> We're generally trying to get away from the llvm.SI.* intrinsics and use
>>>> the
>>>> llvm.amdgcn.* intrinsics instead - in this case llvm.amdgcn.buffer.load.
>>>
>>>
>>> The llvm.amdgcn.buffer.load doesn't allow specifying the VGPR + SGPR +
>>> immediate offsets separately, though, as far as I can see. Furthermore, I
>>> was trying to avoid an LLVM 3.9 dependency, although I can solve that
>>> with an if based on the LLVM version.
>>
>>
>> Fair enough on the LLVM version dependency.
>>
>> I also think you're right about llvm.amdgcn.buffer.load, but that's
>> something that should be fixed on the LLVM side eventually without
>> introducing a new intrinsic.
>
> I am not sure LLVM will ever be able to. I don't know whether the
> vgpr + sgpr + immediate offset sum wraps around at 32 bits. If it does
> not, a v_add and the offsets have different behavior and LLVM should not
> sink the add into the load instruction.

That's a good point. I don't know the answer unfortunately...

Nicolai

>
> - Bas
>
>>
>> I think we should go for the if (HAVE_LLVM) approach even if it produces
>> slightly worse code for now (it really should be only one additional v_add
>> at most).
>>
>> Nicolai
>>
>>
>>> - Bas
>>>
>>>>
>>>> Nicolai
>>>>
>>>>
>>>>> +
>>>>> +       return lp_build_intrinsic(gallivm->builder, name, types[func],
>>>>> args,
>>>>> +                                 ARRAY_SIZE(args),
>>>>> LLVMReadOnlyAttribute
>>>>> |
>>>>> +
>>>>> LLVMNoUnwindAttribute);
>>>>> +}
>>>>> +
>>>>> +static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
>>>>> +                                enum tgsi_opcode_type type, unsigned
>>>>> swizzle,
>>>>> +                                LLVMValueRef buffer, LLVMValueRef
>>>>> offset,
>>>>> +                                LLVMValueRef base)
>>>>> +{
>>>>> +       struct si_shader_context *ctx = si_shader_context(bld_base);
>>>>> +       struct gallivm_state *gallivm = bld_base->base.gallivm;
>>>>> +       LLVMValueRef value, value2;
>>>>> +       LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
>>>>> +       LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
>>>>> +
>>>>> +       if (swizzle == ~0) {
>>>>> +
>>>>> +               value = build_buffer_load(ctx, buffer, 4, NULL, base,
>>>>> offset,
>>>>> +                                         0, 1, 0);
>>>>> +
>>>>> +               return LLVMBuildBitCast(gallivm->builder, value,
>>>>> vec_type,
>>>>> "");
>>>>> +       }
>>>>> +
>>>>> +       if (type != TGSI_TYPE_DOUBLE) {
>>>>> +               value = build_buffer_load(ctx, buffer, 4, NULL, base,
>>>>> offset,
>>>>> +                                         0, 1, 0);
>>>>> +
>>>>> +               value = LLVMBuildBitCast(gallivm->builder, value,
>>>>> vec_type, "");
>>>>> +               return LLVMBuildExtractElement(gallivm->builder, value,
>>>>> +                                   lp_build_const_int32(gallivm,
>>>>> swizzle), "");
>>>>> +       }
>>>>> +
>>>>> +       value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
>>>>> +                                 swizzle * 4, 1, 0);
>>>>> +
>>>>> +       value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
>>>>> +                                  swizzle * 4 + 4, 1, 0);
>>>>> +
>>>>> +       return radeon_llvm_emit_fetch_double(bld_base, value, value2);
>>>>> +}
>>>>> +
>>>>>     /**
>>>>>      * Load from LDS.
>>>>>      *
>>>>>
>>>>
>>