<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Thu, Mar 8, 2018 at 9:08 AM, Samuel Pitoiset <span dir="ltr">&lt;<a href="mailto:samuel.pitoiset@gmail.com" target="_blank">samuel.pitoiset@gmail.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">RadeonSI does something similar, the VGPRs decrease is a win<br>
but not sure if we really want to implement that.<br>
<br>
Polaris10:<br>
Totals from affected shaders:<br>
SGPRS: 116376 -> 116768 (0.34 %)<br>
VGPRS: 76556 -> 74868 (-2.20 %)<br>
Spilled SGPRs: 10347 -> 10466 (1.15 %)<br>
Code Size: 5555072 -> 5569024 (0.25 %) bytes<br>
Max Waves: 9854 -> 9951 (0.98 %)<br>
<br>
Signed-off-by: Samuel Pitoiset &lt;<a href="mailto:samuel.pitoiset@gmail.com">samuel.pitoiset@gmail.com</a>&gt;<br>
---<br>
 src/amd/common/ac_nir_to_llvm.<wbr>c | 118 ++++++++++++++++++++++++++++++<wbr>+---------<br>
 src/amd/common/ac_shader_abi.h  |   7 +++<br>
 2 files changed, 98 insertions(+), 27 deletions(-)<br>
<br>
diff --git a/src/amd/common/ac_nir_to_<wbr>llvm.c b/src/amd/common/ac_nir_to_<wbr>llvm.c<br>
index 644c85e2eb..eb0935972d 100644<br>
--- a/src/amd/common/ac_nir_to_<wbr>llvm.c<br>
+++ b/src/amd/common/ac_nir_to_<wbr>llvm.c<br>
@@ -3131,6 +3131,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,<br>
                                   nir_intrinsic_instr *instr)<br>
 {<br>
        LLVMValueRef values[8];<br>
+       int location = instr->variables[0]->var-><wbr>data.location;<br>
        int idx = instr->variables[0]->var-><wbr>data.driver_location;<br>
        int ve = instr->dest.ssa.num_<wbr>components;<br>
        unsigned comp = instr->variables[0]->var-><wbr>data.location_frac;<br>
@@ -3167,6 +3168,19 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,<br>
                                                     instr->num_components, vertex_index, const_index, type);<br>
                }<br>
<br>
+               LLVMValueRef inputs[4];<br>
+<br>
+               if (ctx->stage == MESA_SHADER_FRAGMENT) {<br>
+                       ctx->abi->load_fs_inputs(ctx-><wbr>abi, location,<br>
+                                                indir_index, const_index,<br>
+                                                stride, inputs);<br></blockquote><div><br></div><div>load_fs_inputs is NULL for radeonsi. Are you sure that radeonsi doesn't get here?<br><br></div><div>Marek<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+               } else {<br>
+                       unsigned index = idx +<br>
+                               (indir_index ? 0 : const_index * stride);<br>
+<br>
+                       memcpy(inputs, &ctx->abi->inputs[index], sizeof(inputs));<br>
+               }<br>
+<br>
                for (unsigned chan = comp; chan < ve + comp; chan++) {<br>
                        if (indir_index) {<br>
                                unsigned count = glsl_count_attribute_slots(<br>
@@ -3174,14 +3188,15 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,<br>
                                                ctx->stage == MESA_SHADER_VERTEX);<br>
                                count -= chan / 4;<br>
                                LLVMValueRef tmp_vec = ac_build_gather_values_<wbr>extended(<br>
-                                               &ctx->ac, ctx->abi->inputs + idx + chan, count,<br>
+                                               &ctx->ac, inputs + chan, count,<br>
                                                stride, false, true);<br>
<br>
                                values[chan] = LLVMBuildExtractElement(ctx-><wbr>ac.builder,<br>
                                                                       tmp_vec,<br>
                                                                       indir_index, "");<br>
-                       } else<br>
-                               values[chan] = ctx->abi->inputs[idx + chan + const_index * stride];<br>
+                       } else {<br>
+                               values[chan] = inputs[chan];<br>
+                       }<br>
                }<br>
                break;<br>
        case nir_var_local:<br>
@@ -5556,45 +5571,93 @@ prepare_interp_optimize(struct radv_shader_context *ctx,<br>
        }<br>
 }<br>
<br>
+static unsigned<br>
+get_input_hw_index(struct radv_shader_context *ctx, unsigned idx)<br>
+{<br>
+       struct ac_shader_info *info = &ctx->shader_info->info;<br>
+       uint64_t mask = info->input_mask & ((1ull << idx) - 1);<br>
+<br>
+       mask &= ~(1ull << VARYING_SLOT_POS);<br>
+<br>
+       return util_bitcount64(mask);<br>
+}<br>
+<br>
+/* If this is true, preload FS inputs at the beginning of shaders. Otherwise,<br>
+ * reload them at each use. This must be true if the shader is using<br>
+ * derivatives and KILL, because KILL can leave the WQM and then a lazy<br>
+ * input load isn't in the WQM anymore.<br>
+ */<br>
+static bool<br>
+radv_preload_fs_inputs(struct radv_shader_context *ctx)<br>
+{<br>
+       return ctx->shader_info->info.ps.<wbr>uses_derivatives &&<br>
+              ctx->shader_info->info.ps.<wbr>uses_kill;<br>
+}<br>
+<br>
 static void<br>
-handle_fs_inputs(struct radv_shader_context *ctx,<br>
-                 struct nir_shader *nir)<br>
+radv_load_fs_inputs(struct radv_shader_context *ctx, unsigned idx,<br>
+                   LLVMValueRef out[4])<br>
 {<br>
        struct ac_shader_info *info = &ctx->shader_info->info;<br>
<br>
+       if (idx >= VARYING_SLOT_VAR0 ||<br>
+           idx == VARYING_SLOT_PNTC ||<br>
+           idx == VARYING_SLOT_PRIMITIVE_ID ||<br>
+           idx == VARYING_SLOT_LAYER) {<br>
+               unsigned interp_mode = info->ps.input_interp_mode[<wbr>idx];<br>
+               unsigned interp_loc = info->ps.input_interp_loc[idx]<wbr>;<br>
+               unsigned hw_index = get_input_hw_index(ctx, idx);<br>
+               LLVMValueRef interp_param =<br>
+                       lookup_interp_param(&ctx->abi, interp_mode, interp_loc);<br>
+<br>
+               interp_fs_input(ctx, hw_index, interp_param, ctx->abi.prim_mask,<br>
+                               &out[0]);<br>
+       } else if (idx == VARYING_SLOT_POS) {<br>
+               for (int i = 0; i < 3; ++i)<br>
+                       out[i] = ctx->abi.frag_pos[i];<br>
+<br>
+               out[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,<br>
+                                      ctx->abi.frag_pos[3]);<br>
+       }<br>
+}<br>
+<br>
+static void<br>
+load_fs_inputs(struct ac_shader_abi *abi,<br>
+              unsigned location,<br>
+              LLVMValueRef indir_index,<br>
+              unsigned const_index,<br>
+              unsigned stride,<br>
+              LLVMValueRef out[4])<br>
+{<br>
+       struct radv_shader_context *ctx = radv_shader_context_from_abi(<wbr>abi);<br>
+<br>
+       if (!radv_preload_fs_inputs(ctx)) {<br>
+               radv_load_fs_inputs(ctx, location, out);<br>
+       } else {<br>
+               unsigned index = radeon_llvm_reg_index_soa(<wbr>location, 0);<br>
+<br>
+               index += (indir_index ? 0 : const_index * stride);<br>
+<br>
+               memcpy(out, &abi->inputs[index], sizeof(out[0]) * 4);<br>
+       }<br>
+}<br>
+<br>
+static void<br>
+handle_fs_inputs(struct radv_shader_context *ctx,<br>
+                 struct nir_shader *nir)<br>
+{<br>
        prepare_interp_optimize(ctx, nir);<br>
<br>
        nir_foreach_variable(variable, &nir->inputs)<br>
                handle_fs_input_decl(ctx, variable);<br>
<br>
-       unsigned index = 0;<br>
-<br>
        for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {<br>
-               LLVMValueRef interp_param;<br>
                LLVMValueRef *inputs = ctx->inputs +radeon_llvm_reg_index_soa(i, 0);<br>
<br>
                if (!(ctx->shader_info->info.<wbr>input_mask & (1ull << i)))<br>
                        continue;<br>
<br>
-               if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||<br>
-                   i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {<br>
-                       unsigned interp_mode = info->ps.input_interp_mode[i];<br>
-                       unsigned interp_loc = info->ps.input_interp_loc[i];<br>
-<br>
-                       interp_param = lookup_interp_param(&ctx->abi, interp_mode,<br>
-                                                          interp_loc);<br>
-<br>
-                       interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask,<br>
-                                       inputs);<br>
-<br>
-                       ++index;<br>
-               } else if (i == VARYING_SLOT_POS) {<br>
-                       for(int i = 0; i < 3; ++i)<br>
-                               inputs[i] = ctx->abi.frag_pos[i];<br>
-<br>
-                       inputs[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,<br>
-                                                 ctx->abi.frag_pos[3]);<br>
-               }<br>
+               radv_load_fs_inputs(ctx, i, inputs);<br>
        }<br>
<br>
        if (ctx->shader_info->info.needs_<wbr>multiview_view_index)<br>
@@ -6924,6 +6987,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(<wbr>LLVMTargetMachineRef tm,<br>
                        ctx.abi.load_base_vertex = radv_load_base_vertex;<br>
                } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {<br>
                        shader_info->fs.can_discard = shaders[i]->info.fs.uses_<wbr>discard;<br>
+                       ctx.abi.load_fs_inputs = load_fs_inputs;<br>
                        ctx.abi.lookup_interp_param = lookup_interp_param;<br>
                        ctx.abi.load_sample_position = load_sample_position;<br>
                        ctx.abi.load_sample_mask_in = load_sample_mask_in;<br>
diff --git a/src/amd/common/ac_shader_<wbr>abi.h b/src/amd/common/ac_shader_<wbr>abi.h<br>
index 901e49b1f9..8e51ce9fdd 100644<br>
--- a/src/amd/common/ac_shader_<wbr>abi.h<br>
+++ b/src/amd/common/ac_shader_<wbr>abi.h<br>
@@ -97,6 +97,13 @@ struct ac_shader_abi {<br>
                                    unsigned const_index,<br>
                                    LLVMTypeRef type);<br>
<br>
+       void (*load_fs_inputs)(struct ac_shader_abi *abi,<br>
+                              unsigned location,<br>
+                              LLVMValueRef indir_index,<br>
+                              unsigned const_index,<br>
+                              unsigned stride,<br>
+                              LLVMValueRef out[4]);<br>
+<br>
        LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi,<br>
                                           LLVMTypeRef type,<br>
                                           LLVMValueRef vertex_index,<br>
<span class="HOEnZb"><font color="#888888">--<br>
2.16.2<br>
<br>
______________________________<wbr>_________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div></div>