[Mesa-dev] [PATCH 1/2] st/mesa: reduce time spent in calculating temp read/writes

Marek Olšák maraeo at gmail.com
Thu Aug 27 13:26:17 PDT 2015


Yes, I think it's useful too. I'm not familiar with this code, so you
can add

Acked-by: Marek Olšák <marek.olsak at amd.com>

and commit if there is no reviewer.

Marek

On Thu, Aug 27, 2015 at 9:48 PM, Dave Airlie <airlied at gmail.com> wrote:
> On 27 August 2015 at 21:57, Marek Olšák <maraeo at gmail.com> wrote:
>> We could just skip this for radeonsi or any driver that does regalloc,
>> because it's useless there.
>
> I did expect this comment from Ilia, though adding a CAP and
> piping it through is an option I'd look into later (or someone who
> cares can do it sooner).
>
> It doesn't change what this patch does, however; it looks useful for r600
> as is.
>
> Dave.
>
>>
>> Marek
>>
>> On Thu, Aug 27, 2015 at 5:30 AM, Dave Airlie <airlied at gmail.com> wrote:
>>> From: Dave Airlie <airlied at redhat.com>
>>>
>>> The glsl->tgsi converter does some temporary register reduction;
>>> however, in profiling shader-db this shows up quite highly,
>>>
>>> so optimise things to reduce the number of loops we do through
>>> all the instructions. This drops merge_registers
>>> from 4-5% on the profile to 1%. I think this can be reduced
>>> further by possibly optimising the renumber pass.
>>>
>>> Signed-off-by: Dave Airlie <airlied at redhat.com>
>>> ---
>>>  src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 153 +++++++++++++++--------------
>>>  1 file changed, 79 insertions(+), 74 deletions(-)
>>>
>>> diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
>>> index 65aae40..e07db11 100644
>>> --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
>>> +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
>>> @@ -480,10 +480,9 @@ public:
>>>     void simplify_cmp(void);
>>>
>>>     void rename_temp_register(int index, int new_index);
>>> -   int get_first_temp_read(int index);
>>> -   int get_first_temp_write(int index);
>>> -   int get_last_temp_read(int index);
>>> -   int get_last_temp_write(int index);
>>> +   void get_first_temp_read(int *first_reads);
>>> +   void get_last_temp_read_first_temp_write(int *last_reads, int *first_writes);
>>> +   void get_last_temp_write(int *last_writes);
>>>
>>>     void copy_propagate(void);
>>>     int eliminate_dead_code(void);
>>> @@ -3688,8 +3687,8 @@ glsl_to_tgsi_visitor::rename_temp_register(int index, int new_index)
>>>     }
>>>  }
>>>
>>> -int
>>> -glsl_to_tgsi_visitor::get_first_temp_read(int index)
>>> +void
>>> +glsl_to_tgsi_visitor::get_first_temp_read(int *first_reads)
>>>  {
>>>     int depth = 0; /* loop depth */
>>>     int loop_start = -1; /* index of the first active BGNLOOP (if any) */
>>> @@ -3697,15 +3696,15 @@ glsl_to_tgsi_visitor::get_first_temp_read(int index)
>>>
>>>     foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
>>>        for (j = 0; j < num_inst_src_regs(inst); j++) {
>>> -         if (inst->src[j].file == PROGRAM_TEMPORARY &&
>>> -             inst->src[j].index == index) {
>>> -            return (depth == 0) ? i : loop_start;
>>> +         if (inst->src[j].file == PROGRAM_TEMPORARY) {
>>> +            if (first_reads[inst->src[j].index] == -1)
>>> +                first_reads[inst->src[j].index] = (depth == 0) ? i : loop_start;
>>>           }
>>>        }
>>>        for (j = 0; j < inst->tex_offset_num_offset; j++) {
>>> -         if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY &&
>>> -             inst->tex_offsets[j].index == index) {
>>> -            return (depth == 0) ? i : loop_start;
>>> +         if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY) {
>>> +            if (first_reads[inst->tex_offsets[j].index] == -1)
>>> +               first_reads[inst->tex_offsets[j].index] = (depth == 0) ? i : loop_start;
>>>           }
>>>        }
>>>        if (inst->op == TGSI_OPCODE_BGNLOOP) {
>>> @@ -3718,91 +3717,73 @@ glsl_to_tgsi_visitor::get_first_temp_read(int index)
>>>        assert(depth >= 0);
>>>        i++;
>>>     }
>>> -   return -1;
>>>  }
>>>
>>> -int
>>> -glsl_to_tgsi_visitor::get_first_temp_write(int index)
>>> +void
>>> +glsl_to_tgsi_visitor::get_last_temp_read_first_temp_write(int *last_reads, int *first_writes)
>>>  {
>>>     int depth = 0; /* loop depth */
>>>     int loop_start = -1; /* index of the first active BGNLOOP (if any) */
>>> -   int i = 0;
>>> -   unsigned j;
>>> -
>>> +   unsigned i = 0, j;
>>> +   int k;
>>>     foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
>>> +      for (j = 0; j < num_inst_src_regs(inst); j++) {
>>> +         if (inst->src[j].file == PROGRAM_TEMPORARY)
>>> +            last_reads[inst->src[j].index] = (depth == 0) ? i : -2;
>>> +      }
>>>        for (j = 0; j < num_inst_dst_regs(inst); j++) {
>>> -         if (inst->dst[j].file == PROGRAM_TEMPORARY && inst->dst[j].index == index) {
>>> -            return (depth == 0) ? i : loop_start;
>>> -         }
>>> +         if (inst->dst[j].file == PROGRAM_TEMPORARY)
>>> +            if (first_writes[inst->dst[j].index] == -1)
>>> +               first_writes[inst->dst[j].index] = (depth == 0) ? i : loop_start;
>>> +      }
>>> +      for (j = 0; j < inst->tex_offset_num_offset; j++) {
>>> +         if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY)
>>> +            last_reads[inst->tex_offsets[j].index] = (depth == 0) ? i : -2;
>>>        }
>>>        if (inst->op == TGSI_OPCODE_BGNLOOP) {
>>>           if(depth++ == 0)
>>>              loop_start = i;
>>>        } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
>>> -         if (--depth == 0)
>>> +         if (--depth == 0) {
>>>              loop_start = -1;
>>> -      }
>>> -      assert(depth >= 0);
>>> -      i++;
>>> -   }
>>> -   return -1;
>>> -}
>>> -
>>> -int
>>> -glsl_to_tgsi_visitor::get_last_temp_read(int index)
>>> -{
>>> -   int depth = 0; /* loop depth */
>>> -   int last = -1; /* index of last instruction that reads the temporary */
>>> -   unsigned i = 0, j;
>>> -
>>> -   foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
>>> -      for (j = 0; j < num_inst_src_regs(inst); j++) {
>>> -         if (inst->src[j].file == PROGRAM_TEMPORARY &&
>>> -             inst->src[j].index == index) {
>>> -            last = (depth == 0) ? i : -2;
>>> +            for (k = 0; k < this->next_temp; k++) {
>>> +               if (last_reads[k] == -2) {
>>> +                  last_reads[k] = i;
>>> +               }
>>> +            }
>>>           }
>>>        }
>>> -      for (j = 0; j < inst->tex_offset_num_offset; j++) {
>>> -          if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY &&
>>> -              inst->tex_offsets[j].index == index)
>>> -              last = (depth == 0) ? i : -2;
>>> -      }
>>> -      if (inst->op == TGSI_OPCODE_BGNLOOP)
>>> -         depth++;
>>> -      else if (inst->op == TGSI_OPCODE_ENDLOOP)
>>> -         if (--depth == 0 && last == -2)
>>> -            last = i;
>>>        assert(depth >= 0);
>>>        i++;
>>>     }
>>> -   assert(last >= -1);
>>> -   return last;
>>>  }
>>>
>>> -int
>>> -glsl_to_tgsi_visitor::get_last_temp_write(int index)
>>> +void
>>> +glsl_to_tgsi_visitor::get_last_temp_write(int *last_writes)
>>>  {
>>>     int depth = 0; /* loop depth */
>>> -   int last = -1; /* index of last instruction that writes to the temporary */
>>> -   int i = 0;
>>> +   int i = 0, k;
>>>     unsigned j;
>>>
>>>     foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
>>>        for (j = 0; j < num_inst_dst_regs(inst); j++) {
>>> -         if (inst->dst[j].file == PROGRAM_TEMPORARY && inst->dst[j].index == index)
>>> -            last = (depth == 0) ? i : -2;
>>> +         if (inst->dst[j].file == PROGRAM_TEMPORARY)
>>> +            last_writes[inst->dst[j].index] = (depth == 0) ? i : -2;
>>>        }
>>>
>>>        if (inst->op == TGSI_OPCODE_BGNLOOP)
>>>           depth++;
>>>        else if (inst->op == TGSI_OPCODE_ENDLOOP)
>>> -         if (--depth == 0 && last == -2)
>>> -            last = i;
>>> +         if (--depth == 0) {
>>> +            for (k = 0; k < this->next_temp; k++) {
>>> +               if (last_writes[k] == -2) {
>>> +                  last_writes[k] = i;
>>> +               }
>>> +            }
>>> +         }
>>>        assert(depth >= 0);
>>>        i++;
>>>     }
>>> -   assert(last >= -1);
>>> -   return last;
>>>  }
>>>
>>>  /*
>>> @@ -4238,9 +4219,10 @@ glsl_to_tgsi_visitor::merge_registers(void)
>>>      * into an array so that we don't have to traverse the instruction list as
>>>      * much. */
>>>     for (i = 0; i < this->next_temp; i++) {
>>> -      last_reads[i] = get_last_temp_read(i);
>>> -      first_writes[i] = get_first_temp_write(i);
>>> +      last_reads[i] = -1;
>>> +      first_writes[i] = -1;
>>>     }
>>> +   get_last_temp_read_first_temp_write(last_reads, first_writes);
>>>
>>>     /* Start looking for registers with non-overlapping usages that can be
>>>      * merged together. */
>>> @@ -4281,15 +4263,21 @@ glsl_to_tgsi_visitor::renumber_registers(void)
>>>  {
>>>     int i = 0;
>>>     int new_index = 0;
>>> +   int *first_reads = rzalloc_array(mem_ctx, int, this->next_temp);
>>> +
>>> +   for (i = 0; i < this->next_temp; i++)
>>> +      first_reads[i] = -1;
>>> +   get_first_temp_read(first_reads);
>>>
>>>     for (i = 0; i < this->next_temp; i++) {
>>> -      if (get_first_temp_read(i) < 0) continue;
>>> +      if (first_reads[i] < 0) continue;
>>>        if (i != new_index)
>>>           rename_temp_register(i, new_index);
>>>        new_index++;
>>>     }
>>>
>>>     this->next_temp = new_index;
>>> +   ralloc_free(first_reads);
>>>  }
>>>
>>>  /**
>>> @@ -5764,14 +5752,31 @@ get_mesa_program(struct gl_context *ctx,
>>>  #if 0
>>>     /* Print out some information (for debugging purposes) used by the
>>>      * optimization passes. */
>>> -   for (i = 0; i < v->next_temp; i++) {
>>> -      int fr = v->get_first_temp_read(i);
>>> -      int fw = v->get_first_temp_write(i);
>>> -      int lr = v->get_last_temp_read(i);
>>> -      int lw = v->get_last_temp_write(i);
>>> -
>>> -      printf("Temp %d: FR=%3d FW=%3d LR=%3d LW=%3d\n", i, fr, fw, lr, lw);
>>> -      assert(fw <= fr);
>>> +   {
>>> +      int i;
>>> +      int *first_writes = rzalloc_array(v->mem_ctx, int, v->next_temp);
>>> +      int *first_reads = rzalloc_array(v->mem_ctx, int, v->next_temp);
>>> +      int *last_writes = rzalloc_array(v->mem_ctx, int, v->next_temp);
>>> +      int *last_reads = rzalloc_array(v->mem_ctx, int, v->next_temp);
>>> +
>>> +      for (i = 0; i < v->next_temp; i++) {
>>> +         first_writes[i] = -1;
>>> +         first_reads[i] = -1;
>>> +         last_writes[i] = -1;
>>> +         last_reads[i] = -1;
>>> +      }
>>> +      v->get_first_temp_read(first_reads);
>>> +      v->get_last_temp_read_first_temp_write(last_reads, first_writes);
>>> +      v->get_last_temp_write(last_writes);
>>> +      for (i = 0; i < v->next_temp; i++)
>>> +         printf("Temp %d: FR=%3d FW=%3d LR=%3d LW=%3d\n", i, first_reads[i],
>>> +                first_writes[i],
>>> +                last_reads[i],
>>> +                last_writes[i]);
>>> +      ralloc_free(first_writes);
>>> +      ralloc_free(first_reads);
>>> +      ralloc_free(last_writes);
>>> +      ralloc_free(last_reads);
>>>     }
>>>  #endif
>>>
>>> --
>>> 2.4.3
>>>
>>> _______________________________________________
>>> mesa-dev mailing list
>>> mesa-dev at lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list