[Mesa-dev] [RFC 7/9] i965/fs: Add pass to combine immediates.

Sat Nov 1 21:25:58 PDT 2014

On Sat, Nov 1, 2014 at 5:36 PM, Matt Turner <mattst88 at gmail.com> wrote:
> On Fri, Oct 31, 2014 at 7:35 PM, Connor Abbott <cwabbott0 at gmail.com> wrote:
>> On Fri, Oct 31, 2014 at 9:27 PM, Matt Turner <mattst88 at gmail.com> wrote:
>>> ---
>>>  src/mesa/drivers/dri/i965/Makefile.sources         |   1 +
>>>  src/mesa/drivers/dri/i965/brw_fs.cpp               |   2 +
>>>  src/mesa/drivers/dri/i965/brw_fs.h                 |   1 +
>>>  .../drivers/dri/i965/brw_fs_combine_constants.cpp  | 222 +++++++++++++++++++++
>>>  4 files changed, 226 insertions(+)
>>>  create mode 100644 src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
>>> index 26bf458..8dc89f4 100644
>>> --- a/src/mesa/drivers/dri/i965/Makefile.sources
>>> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
>>> @@ -54,6 +54,7 @@ i965_FILES = \
>>>         brw_ff_gs_emit.c \
>>>         brw_fs.cpp \
>>>         brw_fs_channel_expressions.cpp \
>>> +       brw_fs_combine_constants.cpp \
>>>         brw_fs_copy_propagation.cpp \
>>>         brw_fs_cse.cpp \
>>>         brw_fs_dead_code_eliminate.cpp \
>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> index b594331..d763edf 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> @@ -3606,6 +3606,8 @@ fs_visitor::run()
>>>           dead_code_eliminate();
>>>        }
>>>
>>> +      opt_combine_constants();
>>> +
>>>        lower_uniform_pull_constant_loads();
>>>
>>>        assign_curb_setup();
>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
>>> index 2fa0949..f9cb99f 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_fs.h
>>> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
>>> @@ -454,6 +454,7 @@ public:
>>>     void no16(const char *msg, ...);
>>>     void lower_uniform_pull_constant_loads();
>>>     bool lower_load_payload();
>>> +   void opt_combine_constants();
>>>
>>>     void push_force_uncompressed();
>>>     void pop_force_uncompressed();
>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
>>> new file mode 100644
>>> index 0000000..eef6bb8
>>> --- /dev/null
>>> +++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
>>> @@ -0,0 +1,222 @@
>>> +/*
>>> + * Copyright © 2014 Intel Corporation
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person obtaining a
>>> + * copy of this software and associated documentation files (the "Software"),
>>> + * to deal in the Software without restriction, including without limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to whom the
>>> + * Software is furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice (including the next
>>> + * paragraph) shall be included in all copies or substantial portions of the
>>> + * Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>>> + * IN THE SOFTWARE.
>>> + */
>>> +
>>> +/** @file brw_fs_combine_constants.cpp
>>> + *
>>> + * This file contains the opt_combine_constants() pass that runs after the
>>> + * regular optimization loop. It passes over the instruction list and
>>> + * selectively promotes immediate values to registers by emitting a mov(1)
>>> + * instruction.
>>> + *
>>> + * This is useful on Gen 7 particularly, because a few instructions can be
>>> + * coissued (i.e., issued in the same cycle as another thread on the same EU
>>> + * issues an instruction) under some circumstances, one of which is that they
>>> + * cannot use immediate values.
>>> + */
>>> +
>>> +#include "brw_fs.h"
>>> +#include "brw_fs_live_variables.h"
>>> +#include "brw_cfg.h"
>>> +
>>> +static bool
>>> +could_coissue(const fs_inst *inst)
>>> +{
>>> +   /* MAD can coissue, but while the PRM lists various restrictions for Align1
>>> +    * instructions related to data alignment and regioning, it doesn't list
>>> +    * similar restrictions for Align16 instructions. I don't expect that there
>>> +    * are any restrictions, since Align16 doesn't allow the kinds of operations
>>> +    * that are restricted in Align1 mode.
>>> +    *
>>> +    * Since MAD is an Align16 instruction we assume it can always coissue.
>>> +    */
>>> +   switch (inst->opcode) {
>>> +   case BRW_OPCODE_MOV:
>>> +   case BRW_OPCODE_CMP:
>>> +   case BRW_OPCODE_ADD:
>>> +   case BRW_OPCODE_MUL:
>>> +      return true;
>>> +   default:
>>> +      return false;
>>> +   }
>>> +}
>>> +
>>> +struct imm {
>>> +   bblock_t *block;
>>> +   float val;
>>> +
>>> +   uint8_t subreg_offset;
>>> +   uint16_t reg;
>>> +
>>> +   uint16_t uses_by_coissue;
>>> +};
>>> +
>>> +struct table {
>>> +   struct imm *imm;
>>> +   int size;
>>> +   int len;
>>> +};
>>> +
>>> +static struct imm *
>>> +find_imm(struct table *table, float val)
>>> +{
>>> +   assert(signbit(val) == 0);
>>> +
>>> +   for (int i = 0; i < table->len; i++) {
>>> +      if (table->imm[i].val == val) {
>>> +         return &table->imm[i];
>>> +      }
>>> +   }
>>> +   return NULL;
>>> +}
>>> +
>>> +static struct imm *
>>> +new_imm(struct table *table, void *mem_ctx)
>>> +{
>>> +   if (table->len == table->size) {
>>> +      table->size *= 2;
>>> +      table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size);
>>> +   }
>>> +   return &table->imm[table->len++];
>>> +}
>>> +
>>> +static int
>>> +compare(const void *a, const void *b)
>>> +{
>>> +   return ((struct imm *)a)->block->num - ((struct imm *)b)->block->num;
>>> +}
>>> +
>>> +void
>>> +fs_visitor::opt_combine_constants()
>>> +{
>>> +   void *const_ctx = ralloc_context(NULL);
>>> +
>>> +   struct table table;
>>> +   table.size = 8;
>>> +   table.len = 0;
>>> +   table.imm = ralloc_array(const_ctx, struct imm, table.size);
>>> +
>>> +   cfg->calculate_idom();
>>> +
>>> +   /* Make a pass through all instructions and count the number of times each
>>> +    * constant is used by coissueable instructions.
>>> +    */
>>> +   foreach_block_and_inst(block, fs_inst, inst, cfg) {
>>> +      if (!could_coissue(inst))
>>> +         continue;
>>> +
>>> +      for (int i = 0; i < inst->sources; i++) {
>>> +         if (inst->src[i].file != IMM)
>>> +            continue;
>>> +
>>> +         float val = fabsf(inst->src[i].fixed_hw_reg.dw1.f);
>>> +         struct imm *imm = find_imm(&table, val);
>>> +
>>> +         if (imm) {
>>> +            imm->block = cfg_t::intersect(block, imm->block);
>>
>> Like I mentioned in the other reply, we could be smarter about keeping
>> track of where to insert the MOV. We can keep track of what
>> instruction to insert the MOV before as well as which basic block to
>> put it in. If you see another instruction within the current basic
>> block, don't update anything. If you see another instruction within a
>> different basic block, then if the instruction's basic block is
>> dominated by the current basic block (i.e. the intersection returns
>> the current basic block), don't do anything, otherwise put the MOV at
>> the end of the basic block returned by the intersection.
>
> Yeah, I think this is a good idea. I'll give this a try.
>
>>> +            imm->uses_by_coissue += could_coissue(inst);
>>> +         } else {
>>> +            imm = new_imm(&table, const_ctx);
>>> +            imm->block = block;
>>> +            imm->val = val;
>>> +            imm->uses_by_coissue = could_coissue(inst);
>>> +         }
>>> +      }
>>> +   }
>>> +
>>> +   /* Remove constants from the table that don't have enough uses to make them
>>> +    * profitable to store in a register.
>>> +    */
>>> +   for (int i = 0; i < table.len;) {
>>> +      struct imm *imm = &table.imm[i];
>>> +
>>> +      if (imm->uses_by_coissue < 4) {
>>> +         table.imm[i] = table.imm[table.len - 1];
>>> +         table.len--;
>>> +         continue;
>>> +      }
>>> +      i++;
>>> +   }
>>> +   if (table.len == 0) {
>>> +      ralloc_free(const_ctx);
>>> +      return;
>>> +   }
>>> +   if (cfg->num_blocks != 1)
>>> +      qsort(table.imm, table.len, sizeof(struct imm), compare);
>>> +
>>> +   fs_reg reg(this, glsl_type::float_type);
>>> +   reg.stride = 0;
>>> +   int i = 0;
>>> +
>>> +   foreach_block(block, cfg) {
>>> +      backend_instruction *inst = block->first_non_control_flow_inst();
>>> +
>>> +      for (; i < table.len; i++) {
>>> +         struct imm *imm = &table.imm[i];
>>> +
>>> +         if (imm->block->num != block->num)
>>> +            break;
>>> +
>>> +         fs_inst *mov = MOV(reg, fs_reg(imm->val));
>>> +         mov->exec_size = 1;
>>> +         inst->insert_before(block, mov);
>>> +         imm->reg = reg.reg;
>>> +         imm->subreg_offset = reg.subreg_offset;
>>> +
>>> +         reg.subreg_offset += sizeof(float);
>>> +         if (reg.subreg_offset == 32) {
>>
>> Instead of a magic "32," can we put a 8 * sizeof(float) here to make
>> the intent more obvious? And shouldn't this be exec_size *
>> sizeof(float) since we can hold 16 floats in 16-wide mode?
>
> Sure, sounds good.
>
> With the ability to allocate single SIMD8 registers in SIMD16 programs
> these days, I think this code is okay but I'll give exec_size *
> sizeof(float) a try.

Right, but if you want to always use 8 components then you should
allocate a SIMD8 register. IIUC (I haven't looked at what Jason did
too much) this isn't being done here, so in SIMD16 we're wasting half
the register. Either one should work, but SIMD8 might give a little
more flexibility to the register allocator - really, you want to be
able to allocate registers of size 1, but I guess that's for another
day...

>
>>> +            reg.reg = virtual_grf_alloc(1);
>>> +            reg.subreg_offset = 0;
>>> +         }
>>> +      }
>>> +
>>> +      foreach_inst_in_block(fs_inst, inst, block) {
>>> +         if (inst->opcode == BRW_OPCODE_MOV &&
>>> +             inst->src[0].file == IMM &&
>>> +             inst->dst.stride == 0)
>>> +            continue;
>>> +
>>> +         /* Tell Matt privately if you actually read this far. */
>>
>> Don't worry Matt, there are good things left in the world still :)
>>
>>> +         for (int i = 0; i < inst->sources; i++) {
>>> +            if (inst->src[i].file != IMM ||
>>> +                inst->src[i].type != BRW_REGISTER_TYPE_F)
>>> +               continue;
>>> +
>>> +            float val = fabsf(inst->src[i].fixed_hw_reg.dw1.f);
>>> +            struct imm *imm = find_imm(&table, val);
>>> +            if (!imm)
>>> +               continue;
>>> +
>>> +            inst->src[i].file = GRF;
>>> +            inst->src[i].reg = imm->reg;
>>> +            inst->src[i].subreg_offset = imm->subreg_offset;
>>> +            inst->src[i].stride = 0;
>>> +            inst->src[i].negate = signbit(inst->src[i].fixed_hw_reg.dw1.f) !=
>>> +                                  signbit(imm->val);
>>> +            assert(fabsf(inst->src[i].fixed_hw_reg.dw1.f) == imm->val);
>>> +         }
>>> +      }
>>> +   }
>>
>> Instead of sorting the immediates and then walking over all the
>> instructions and looking up the immediates again, wouldn't it be more
>> efficient (and perhaps less code) to store all the instructions where
>> each immediate is used in a vector in the table entry, and then walk
>> over that here?
>
> I suspect so. I'll make that change.
>
> Thanks Connor!