[Mesa-dev] [PATCH 4/4] radv: reduce the number of loaded channels for vertex input fetches

Wed Feb 13 22:09:18 UTC 2019

On 2/13/19 10:59 PM, Bas Nieuwenhuizen wrote:
> On Tue, Feb 12, 2019 at 3:07 PM Samuel Pitoiset
> <samuel.pitoiset at gmail.com> wrote:
>> It's unnecessary to load more channels than the vertex attribute
>> format. The remaining channels are filled with 0 for y and z,
>> and 1 for w.
>>
>> 29077 shaders in 15096 tests
>> Totals:
>> SGPRS: 1321605 -> 1318869 (-0.21 %)
>> VGPRS: 935236 -> 932252 (-0.32 %)
>> Spilled SGPRs: 24860 -> 24776 (-0.34 %)
>> Code Size: 49832348 -> 49819464 (-0.03 %) bytes
>> Max Waves: 242101 -> 242611 (0.21 %)
>>
>> Totals from affected shaders:
>> SGPRS: 93675 -> 90939 (-2.92 %)
>> VGPRS: 58016 -> 55032 (-5.14 %)
>> Spilled SGPRs: 172 -> 88 (-48.84 %)
>> Code Size: 2862740 -> 2849856 (-0.45 %) bytes
>> Max Waves: 15474 -> 15984 (3.30 %)
>>
>> This mostly helps Croteam games (Talos/Sam2017).
>>
>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
>> ---
>>   src/amd/vulkan/radv_nir_to_llvm.c | 83 ++++++++++++++++++++++++++++++-
>>   1 file changed, 81 insertions(+), 2 deletions(-)
>>
>> diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
>> index 7f74678d5f1..b1e0c64e4e1 100644
>> --- a/src/amd/vulkan/radv_nir_to_llvm.c
>> +++ b/src/amd/vulkan/radv_nir_to_llvm.c
>> @@ -1967,6 +1967,72 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
>>          return alpha;
>>   }
>>
>> +static unsigned
>> +get_num_channels_from_data_format(unsigned data_format)
>> +{
>> +       switch (data_format) {
>> +       case V_008F0C_BUF_DATA_FORMAT_8:
>> +       case V_008F0C_BUF_DATA_FORMAT_16:
>> +       case V_008F0C_BUF_DATA_FORMAT_32:
>> +               return 1;
>> +       case V_008F0C_BUF_DATA_FORMAT_8_8:
>> +       case V_008F0C_BUF_DATA_FORMAT_16_16:
>> +       case V_008F0C_BUF_DATA_FORMAT_32_32:
>> +               return 2;
>> +       case V_008F0C_BUF_DATA_FORMAT_10_11_11:
>> +       case V_008F0C_BUF_DATA_FORMAT_11_11_10:
>> +       case V_008F0C_BUF_DATA_FORMAT_32_32_32:
>> +               return 3;
>> +       case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
>> +       case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
>> +       case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
>> +       case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
>> +       case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
>> +               return 4;
>> +       default:
>> +               break;
>> +       }
>> +
>> +       return 4;
>> +}
>> +
>> +static LLVMValueRef
>> +radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
>> +                               LLVMValueRef value,
>> +                               unsigned num_channels,
>> +                               bool is_float)
>> +{
>> +       LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
>> +       LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
>> +       LLVMTypeRef elemtype;
>> +       LLVMValueRef chan[4];
>> +
>> +       if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
>> +               unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
>> +
>> +               if (num_channels == 4 && vec_size == 4)
>> +                       return value;
> Just num_channels == vec_size ?
Ok.
>
>> +
>> +               num_channels = MIN2(num_channels, vec_size);
>> +
>> +               for (unsigned i = 0; i < num_channels; i++)
>> +                       chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
>> +
>> +               elemtype = LLVMGetElementType(LLVMTypeOf(value));
>> +       } else {
>> +               if (num_channels) {
>> +                       assert(num_channels == 1);
>> +                       chan[0] = value;
>> +               }
>> +               elemtype = LLVMTypeOf(value);
>> +       }
>> +
>> +       for (unsigned i = num_channels; i < 4; i++)
>> +               chan[i] = i == 3 ? one : zero;
>> +
>> +       return ac_build_gather_values(&ctx->ac, chan, 4);
>> +}
>> +
>>   static void
>>   handle_vs_input_decl(struct radv_shader_context *ctx,
>>                       struct nir_variable *variable)
>> @@ -1979,7 +2045,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
>>          unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
>>          uint8_t input_usage_mask =
>>                  ctx->shader_info->info.vs.input_usage_mask[variable->data.location];
>> -       unsigned num_channels = util_last_bit(input_usage_mask);
>> +       unsigned num_input_channels = util_last_bit(input_usage_mask);
>>
>>          variable->data.driver_location = variable->data.location * 4;
>>
>> @@ -1987,6 +2053,10 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
>>          for (unsigned i = 0; i < attrib_count; ++i) {
>>                  LLVMValueRef output[4];
>>                  unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0;
>> +               unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[attrib_index];
>> +               unsigned data_format = attrib_format & 0x0f;
>> +               unsigned num_format = (attrib_format >> 4) & 0x07;
>> +               bool is_float = num_format == V_008F0C_BUF_NUM_FORMAT_FLOAT;
>>
>>                  if (ctx->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
>>                          uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index];
>> @@ -2018,12 +2088,21 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
>>
>>                  t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
>>
>> +               /* Adjust the number of channels to load based on the vertex
>> +                * attribute format.
>> +                */
>> +               unsigned num_format_channels = get_num_channels_from_data_format(data_format);
>> +               unsigned num_channels = MIN2(num_input_channels, num_format_channels);
>> +
>> +               assert(num_channels <= num_input_channels);
> Seems kinda redundant given the MIN2 above.
I forgot to remove that assertion before sending the series.
>
> Does this work with 3 components?

Yes, I think it works. CTS has plenty of tests that load 3 components 
and I didn't spot any regressions.

Also note that ac_build_buffer_load_format() always loads 4 components 
if num_channels is 3.

>
> Otherwise r-b for the series.
>> +
>>                  input = ac_build_buffer_load_format(&ctx->ac, t_list,
>>                                                      buffer_index,
>>                                                      ctx->ac.i32_0,
>>                                                      num_channels, false, true);
>>
>> -               input = ac_build_expand_to_vec4(&ctx->ac, input, num_channels);
>> +               input = radv_fixup_vertex_input_fetches(ctx, input, num_channels,
>> +                                                       is_float);
>>
>>                  for (unsigned chan = 0; chan < 4; chan++) {
>>                          LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
>> --
>> 2.20.1
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev