[Mesa-dev] [RFC PATCH 2/2] ac/nir: Don't treat each element as a vec4 in compute shared memory
Alex Smith
asmith at feralinteractive.com
Tue Jul 4 09:56:34 UTC 2017
Hi Connor,
Can confirm this does work here, thanks!
Makes this patch redundant too, since that problem's also fixed by
your changes: https://lists.freedesktop.org/archives/mesa-dev/2017-June/161559.html
Alex
On 4 July 2017 at 04:32, Connor Abbott <cwabbott0 at gmail.com> wrote:
> I'm going to do a full CTS run later this week before I send the
> patches to the list, but can you confirm that this series fixes your
> problems: https://cgit.freedesktop.org/~cwabbott0/mesa/log/?h=radv-rewrite-vars
> (it passes all the CTS dEQP.compute.* tests, but that's not saying
> much...)
>
> On Mon, Jul 3, 2017 at 6:33 PM, Connor Abbott <cwabbott0 at gmail.com> wrote:
>> I think a better approach would be to translate the NIR variables
>> directly to LLVM variables instead of creating one giant LLVM
>> variable. I have a series coming up that does a similar thing for
>> local variables, I can try and convert shared variables too.
>>
>> On Tue, Jun 27, 2017 at 8:53 AM, Alex Smith <asmith at feralinteractive.com> wrote:
>>> Currently every element in shared memory (including individual elements
>>> of an array) are treated as a vec4. For example, the following:
>>>
>>> shared uint array[1024];
>>>
>>> gets treated as an array of 1024 vec4s with only the first component
>>> ever used.
>>>
>>> This can be highly wasteful of LDS space. We have a shader which only
>>> really requires ~24KB of shared memory, but since scalars are being
>>> padded to vec4, it actually ends up needing 96KB. This is more than
>>> the hardware supports, and leads to an LLVM compilation failure.
>>>
>>> Therefore, instead pack data in LDS according to the std430 layout.
>>>
>>> Signed-off-by: Alex Smith <asmith at feralinteractive.com>
>>> ---
>>> RFC since I'm not sure whether this is the best approach to handle this,
>>> and there may be something I've missed.
>>>
>>> No regressions seen in Mad Max or Dawn of War 3, which both have a few
>>> shaders which use shared memory. Also no regressions in the CTS
>>> 'dEQP-VK.compute.*' tests.
>>>
>>> I've also checked over various test cases using arrays/structs/etc. in
>>> shared memory, and as far as I can tell the generated LLVM/GCN code is
>>> correct.
>>> ---
>>> src/amd/common/ac_nir_to_llvm.c | 55 ++++++++++++++++++++++-------------------
>>> 1 file changed, 29 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
>>> index 468ce4d..a48bb3b 100644
>>> --- a/src/amd/common/ac_nir_to_llvm.c
>>> +++ b/src/amd/common/ac_nir_to_llvm.c
>>> @@ -397,7 +397,7 @@ static LLVMValueRef get_shared_memory_ptr(struct nir_to_llvm_context *ctx,
>>> LLVMValueRef ptr;
>>> int addr_space;
>>>
>>> - offset = LLVMConstInt(ctx->i32, idx * 16, false);
>>> + offset = LLVMConstInt(ctx->i32, idx, false);
>>>
>>> ptr = ctx->shared_memory;
>>> ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, "");
>>> @@ -2404,7 +2404,7 @@ static LLVMValueRef visit_load_ubo_buffer(struct nir_to_llvm_context *ctx,
>>>
>>> static void
>>> radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>>> - bool vs_in, unsigned *vertex_index_out,
>>> + bool vs_in, bool compute_shared, unsigned *vertex_index_out,
>>> LLVMValueRef *vertex_index_ref,
>>> unsigned *const_out, LLVMValueRef *indir_out)
>>> {
>>> @@ -2445,7 +2445,14 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>>> if (tail->deref_type == nir_deref_type_array) {
>>> nir_deref_array *deref_array = nir_deref_as_array(tail);
>>> LLVMValueRef index, stride, local_offset;
>>> - unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
>>> +
>>> + /* For compute LDS, data is packed rather than treating
>>> + * each individual element as a vec4. The returned
>>> + * index is used to index into an i32 array so divide
>>> + * by 4.
>>> + */
>>> + unsigned size = compute_shared ? glsl_get_std430_array_stride(tail->type, false) / 4
>>> + : glsl_count_attribute_slots(tail->type, vs_in);
>>>
>>> const_offset += size * deref_array->base_offset;
>>> if (deref_array->deref_array_type == nir_deref_array_type_direct)
>>> @@ -2465,7 +2472,11 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>>>
>>> for (unsigned i = 0; i < deref_struct->index; i++) {
>>> const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
>>> - const_offset += glsl_count_attribute_slots(ft, vs_in);
>>> +
>>> + /* Same as above. */
>>> + unsigned size = compute_shared ? glsl_get_std430_array_stride(ft, false) / 4
>>> + : glsl_count_attribute_slots(tail->type, vs_in);
>>> + const_offset += size;
>>> }
>>> } else
>>> unreachable("unsupported deref type");
>>> @@ -2641,7 +2652,7 @@ load_tcs_input(struct nir_to_llvm_context *ctx,
>>> const bool is_compact = instr->variables[0]->var->data.compact;
>>> param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>> radv_get_deref_offset(ctx, instr->variables[0],
>>> - false, NULL, per_vertex ? &vertex_index : NULL,
>>> + false, false, NULL, per_vertex ? &vertex_index : NULL,
>>> &const_index, &indir_index);
>>>
>>> stride = unpack_param(ctx, ctx->tcs_in_layout, 13, 8);
>>> @@ -2673,7 +2684,7 @@ load_tcs_output(struct nir_to_llvm_context *ctx,
>>> const bool is_compact = instr->variables[0]->var->data.compact;
>>> param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>> radv_get_deref_offset(ctx, instr->variables[0],
>>> - false, NULL, per_vertex ? &vertex_index : NULL,
>>> + false, false, NULL, per_vertex ? &vertex_index : NULL,
>>> &const_index, &indir_index);
>>>
>>> if (!instr->variables[0]->var->data.patch) {
>>> @@ -2712,7 +2723,7 @@ store_tcs_output(struct nir_to_llvm_context *ctx,
>>> const bool is_compact = instr->variables[0]->var->data.compact;
>>>
>>> radv_get_deref_offset(ctx, instr->variables[0],
>>> - false, NULL, per_vertex ? &vertex_index : NULL,
>>> + false, false, NULL, per_vertex ? &vertex_index : NULL,
>>> &const_index, &indir_index);
>>>
>>> param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>> @@ -2779,7 +2790,7 @@ load_tes_input(struct nir_to_llvm_context *ctx,
>>> const bool is_compact = instr->variables[0]->var->data.compact;
>>>
>>> radv_get_deref_offset(ctx, instr->variables[0],
>>> - false, NULL, per_vertex ? &vertex_index : NULL,
>>> + false, false, NULL, per_vertex ? &vertex_index : NULL,
>>> &const_index, &indir_index);
>>> param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>> if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 &&
>>> @@ -2808,7 +2819,7 @@ load_gs_input(struct nir_to_llvm_context *ctx,
>>> LLVMValueRef value[4], result;
>>> unsigned vertex_index;
>>> radv_get_deref_offset(ctx, instr->variables[0],
>>> - false, &vertex_index, NULL,
>>> + false, false, &vertex_index, NULL,
>>> &const_index, &indir_index);
>>> vtx_offset_param = vertex_index;
>>> assert(vtx_offset_param < 6);
>>> @@ -2849,8 +2860,9 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
>>> unsigned const_index;
>>> bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
>>> instr->variables[0]->var->data.mode == nir_var_shader_in;
>>> - radv_get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL,
>>> - &const_index, &indir_index);
>>> + bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
>>> + radv_get_deref_offset(ctx, instr->variables[0], vs_in, compute_shared,
>>> + NULL, NULL, &const_index, &indir_index);
>>>
>>> if (instr->dest.ssa.bit_size == 64)
>>> ve *= 2;
>>> @@ -2925,9 +2937,6 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
>>> LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>>> LLVMValueRef derived_ptr;
>>>
>>> - if (indir_index)
>>> - indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
>>> -
>>> for (unsigned chan = 0; chan < ve; chan++) {
>>> LLVMValueRef index = LLVMConstInt(ctx->i32, chan, false);
>>> if (indir_index)
>>> @@ -2955,7 +2964,8 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>>> int writemask = instr->const_index[0];
>>> LLVMValueRef indir_index;
>>> unsigned const_index;
>>> - radv_get_deref_offset(ctx, instr->variables[0], false,
>>> + bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
>>> + radv_get_deref_offset(ctx, instr->variables[0], false, compute_shared,
>>> NULL, NULL, &const_index, &indir_index);
>>>
>>> if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) {
>>> @@ -3040,9 +3050,6 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>>> case nir_var_shared: {
>>> LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>>>
>>> - if (indir_index)
>>> - indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
>>> -
>>> for (unsigned chan = 0; chan < 8; chan++) {
>>> if (!(writemask & (1 << chan)))
>>> continue;
>>> @@ -5756,9 +5763,9 @@ handle_shader_outputs_post(struct nir_to_llvm_context *ctx)
>>>
>>> static void
>>> handle_shared_compute_var(struct nir_to_llvm_context *ctx,
>>> - struct nir_variable *variable, uint32_t *offset, int idx)
>>> + struct nir_variable *variable, uint32_t *offset)
>>> {
>>> - unsigned size = glsl_count_attribute_slots(variable->type, false);
>>> + unsigned size = glsl_get_std430_array_stride(variable->type, false);
>>> variable->data.driver_location = *offset;
>>> *offset += size;
>>> }
>>> @@ -5926,16 +5933,12 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>>> nir_foreach_variable(variable, &nir->shared)
>>> num_shared++;
>>> if (num_shared) {
>>> - int idx = 0;
>>> uint32_t shared_size = 0;
>>> LLVMValueRef var;
>>> LLVMTypeRef i8p = LLVMPointerType(ctx.i8, LOCAL_ADDR_SPACE);
>>> - nir_foreach_variable(variable, &nir->shared) {
>>> - handle_shared_compute_var(&ctx, variable, &shared_size, idx);
>>> - idx++;
>>> - }
>>> + nir_foreach_variable(variable, &nir->shared)
>>> + handle_shared_compute_var(&ctx, variable, &shared_size);
>>>
>>> - shared_size *= 16;
>>> var = LLVMAddGlobalInAddressSpace(ctx.module,
>>> LLVMArrayType(ctx.i8, shared_size),
>>> "compute_lds",
>>> --
>>> 2.9.4
>>>
>>> _______________________________________________
>>> mesa-dev mailing list
>>> mesa-dev at lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list