[Mesa-dev] [PATCH 6/8] i965: Add support for instruction compaction.

Paul Berry stereotype441 at gmail.com
Tue Sep 4 16:04:53 PDT 2012


On 31 August 2012 11:32, Eric Anholt <eric at anholt.net> wrote:

> This reduces program size by using some smaller encodings for common bit
> patterns in the Gen ISA, with the hope of making programs fit in the
> instruction cache better.  Unfortunately, we don't have any evidence for
> any
> particular program being measurably helped.
> ---
>  src/mesa/drivers/dri/i965/.gitignore        |    1 +
>  src/mesa/drivers/dri/i965/Makefile.am       |   12 +
>  src/mesa/drivers/dri/i965/Makefile.sources  |    1 +
>  src/mesa/drivers/dri/i965/brw_eu.c          |   38 +-
>  src/mesa/drivers/dri/i965/brw_eu.h          |   13 +
>  src/mesa/drivers/dri/i965/brw_eu_compact.c  |  594
> +++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_eu_debug.c    |    1 +
>  src/mesa/drivers/dri/i965/brw_fs.cpp        |    6 +
>  src/mesa/drivers/dri/i965/brw_structs.h     |   24 ++
>  src/mesa/drivers/dri/i965/test_eu_compact.c |  265 ++++++++++++
>  10 files changed, 947 insertions(+), 8 deletions(-)
>  create mode 100644 src/mesa/drivers/dri/i965/brw_eu_compact.c
>  create mode 100644 src/mesa/drivers/dri/i965/test_eu_compact.c
>
> diff --git a/src/mesa/drivers/dri/i965/.gitignore
> b/src/mesa/drivers/dri/i965/.gitignore
> index fe4578e..c6ea403 100644
> --- a/src/mesa/drivers/dri/i965/.gitignore
> +++ b/src/mesa/drivers/dri/i965/.gitignore
> @@ -1,3 +1,4 @@
>  Makefile
>  i965_symbols_test
>  libi965_dri.la
> +test_eu_compact
> diff --git a/src/mesa/drivers/dri/i965/Makefile.am
> b/src/mesa/drivers/dri/i965/Makefile.am
> index 0ac3de7..be1523d 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.am
> +++ b/src/mesa/drivers/dri/i965/Makefile.am
> @@ -51,16 +51,28 @@ libi965_dri_la_SOURCES = \
>         $(i965_C_FILES) \
>         $(i965_CXX_FILES)
>
> +# list of libs to be linked against by i965_dri.so and i965 test programs.
>  COMMON_LIBS = \
>         libi965_dri.la \
>         $(DRI_LIB_DEPS) \
>         $(INTEL_LIBS) \
>         ../common/libdricommon.la
>
> +TEST_LIBS = \
> +       $(COMMON_LIBS) \
> +       ../common/libdri_test_stubs.la
> +
>  i965_dri_la_SOURCES =
>  i965_dri_la_LIBADD = $(COMMON_LIBS)
>  i965_dri_la_LDFLAGS = -module -avoid-version -shared
>
> +TESTS = test_eu_compact
> +check_PROGRAMS = test_eu_compact
> +
> +test_eu_compact_SOURCES = \
> +       test_eu_compact.c
> +test_eu_compact_LDADD = $(TEST_LIBS)
> +
>  # Provide compatibility with scripts for the old Mesa build system for
>  # a while by putting a link to the driver into /lib of the build tree.
>  all-local: i965_dri.la
> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources
> b/src/mesa/drivers/dri/i965/Makefile.sources
> index d6d189a..3715b0f 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -44,6 +44,7 @@ i965_C_FILES = \
>         brw_draw.c \
>         brw_draw_upload.c \
>         brw_eu.c \
> +       brw_eu_compact.c \
>         brw_eu_debug.c \
>         brw_eu_emit.c \
>         brw_eu_util.c \
> diff --git a/src/mesa/drivers/dri/i965/brw_eu.c
> b/src/mesa/drivers/dri/i965/brw_eu.c
> index d5080c3..c2515eb 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu.c
> +++ b/src/mesa/drivers/dri/i965/brw_eu.c
> @@ -214,6 +214,11 @@ const GLuint *brw_get_program( struct brw_compile *p,
>  {
>     GLuint i;
>
> +   brw_compact_instructions(p);
> +
> +   /* We emit a cacheline (8 instructions) of NOPs at the end of the
> program to
> +    * make sure that instruction prefetch doesn't wander off into some
> other BO.
> +    */
>     for (i = 0; i < 8; i++)
>        brw_NOP(p);
>
> @@ -224,19 +229,36 @@ const GLuint *brw_get_program( struct brw_compile *p,
>  void
>  brw_dump_compile(struct brw_compile *p, FILE *out, int start, int end)
>  {
> +   struct brw_context *brw = p->brw;
> +   struct intel_context *intel = &brw->intel;
>     void *store = p->store;
> +   bool dump_hex = false;
>
> -   for (int offset = start; offset < end; offset += 16) {
> +   for (int offset = start; offset < end;) {
>        struct brw_instruction *insn = store + offset;
> -
> +      struct brw_instruction uncompacted;
>        printf("0x%08x: ", offset);
>
> -      if (0) {
> -        printf("0x%08x 0x%08x 0x%08x 0x%08x ",
> -               ((uint32_t *)insn)[3],
> -               ((uint32_t *)insn)[2],
> -               ((uint32_t *)insn)[1],
> -               ((uint32_t *)insn)[0]);
> +      if (insn->header.cmpt_control) {
> +        struct brw_compact_instruction *compacted = (void *)insn;
> +        if (dump_hex) {
> +           printf("0x%08x 0x%08x                       ",
> +                  ((uint32_t *)insn)[1],
> +                  ((uint32_t *)insn)[0]);
> +        }
> +
> +        brw_uncompact_instruction(intel, &uncompacted, compacted);
> +        insn = &uncompacted;
> +        offset += 8;
> +      } else {
> +        if (dump_hex) {
> +           printf("0x%08x 0x%08x 0x%08x 0x%08x ",
> +                  ((uint32_t *)insn)[3],
> +                  ((uint32_t *)insn)[2],
> +                  ((uint32_t *)insn)[1],
> +                  ((uint32_t *)insn)[0]);
> +        }
> +        offset += 16;
>        }
>
>        brw_disasm(stdout, insn, p->brw->intel.gen);
> diff --git a/src/mesa/drivers/dri/i965/brw_eu.h
> b/src/mesa/drivers/dri/i965/brw_eu.h
> index 2fa84df..01b8d08 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu.h
> +++ b/src/mesa/drivers/dri/i965/brw_eu.h
> @@ -1107,6 +1107,19 @@ void brw_set_uip_jip(struct brw_compile *p);
>
>  uint32_t brw_swap_cmod(uint32_t cmod);
>
> +/* brw_eu_compact.c */
> +void brw_compact_instructions(struct brw_compile *c);
> +void brw_uncompact_instruction(struct intel_context *intel,
> +                              struct brw_instruction *dst,
> +                              struct brw_compact_instruction *src);
> +bool brw_try_compact_instruction(struct brw_compile *c,
> +                                 struct brw_compact_instruction *dst,
> +                                 struct brw_instruction *src);
> +
> +void brw_debug_compact_uncompact(struct intel_context *intel,
> +                                struct brw_instruction *orig,
> +                                struct brw_instruction *uncompacted);
> +
>  /* brw_optimize.c */
>  void brw_optimize(struct brw_compile *p);
>  void brw_remove_duplicate_mrf_moves(struct brw_compile *p);
> diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c
> b/src/mesa/drivers/dri/i965/brw_eu_compact.c
> new file mode 100644
> index 0000000..2c15c85
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_eu_compact.c
> @@ -0,0 +1,594 @@
> +/*
> + * Copyright © 2012 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without
> limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the
> next
> + * paragraph) shall be included in all copies or substantial portions of
> the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +/** @file brw_eu_compact.c
> + *
> + * Instruction compaction is a feature of gm45 and newer hardware that
> allows
> + * for a smaller instruction encoding.
> + *
> + * The instruction cache is on the order of 32KB, and many programs
> generate far
> + * more instructions than that.  The instruction cache is built to barely
> keep
> + * up with instruction dispatch abaility in cache hit cases -- L1
> instruction
> + * cache misses that still hit in the next level could limit throughput by
> + * around 50%.
> + *
> + * The idea of instruction compaction is that most instructions use a tiny
> + * subset of the GPU functionality, so we can encode what would be a 16
> byte
> + * instruction in 8 bytes using some lookup tables for various fields.
> + */
> +
> +#include "brw_context.h"
> +#include "brw_eu.h"
> +
> +static const uint32_t gen6_control_index_table[32] = {
> +   0b00000000000000000,
> +   0b01000000000000000,
> +   0b00110000000000000,
> +   0b00000000100000000,
> +   0b00010000000000000,
> +   0b00001000100000000,
> +   0b00000000100000010,
> +   0b00000000000000010,
> +   0b01000000100000000,
> +   0b01010000000000000,
> +   0b10110000000000000,
> +   0b00100000000000000,
> +   0b11010000000000000,
> +   0b11000000000000000,
> +   0b01001000100000000,
> +   0b01000000000001000,
> +   0b01000000000000100,
> +   0b00000000000001000,
> +   0b00000000000000100,
> +   0b00111000100000000,
> +   0b00001000100000010,
> +   0b00110000100000000,
> +   0b00110000000000001,
> +   0b00100000000000001,
> +   0b00110000000000010,
> +   0b00110000000000101,
> +   0b00110000000001001,
> +   0b00110000000010000,
> +   0b00110000000000011,
> +   0b00110000000000100,
> +   0b00110000100001000,
> +   0b00100000000001001
> +};
> +
> +static const uint32_t gen6_datatype_table[32] = {
> +   0b001001110000000000,
> +   0b001000110000100000,
> +   0b001001110000000001,
> +   0b001000000001100000,
> +   0b001010110100101001,
> +   0b001000000110101101,
> +   0b001100011000101100,
> +   0b001011110110101101,
> +   0b001000000111101100,
> +   0b001000000001100001,
> +   0b001000110010100101,
> +   0b001000000001000001,
> +   0b001000001000110001,
> +   0b001000001000101001,
> +   0b001000000000100000,
> +   0b001000001000110010,
> +   0b001010010100101001,
> +   0b001011010010100101,
> +   0b001000000110100101,
> +   0b001100011000101001,
> +   0b001011011000101100,
> +   0b001011010110100101,
> +   0b001011110110100101,
> +   0b001111011110111101,
> +   0b001111011110111100,
> +   0b001111011110111101,
> +   0b001111011110011101,
> +   0b001111011110111110,
> +   0b001000000000100001,
> +   0b001000000000100010,
> +   0b001001111111011101,
> +   0b001000001110111110,
> +};
> +
> +static const uint32_t gen6_subreg_table[32] = {
> +   0b000000000000000,
> +   0b000000000000100,
> +   0b000000110000000,
> +   0b111000000000000,
> +   0b011110000001000,
> +   0b000010000000000,
> +   0b000000000010000,
> +   0b000110000001100,
> +   0b001000000000000,
> +   0b000001000000000,
> +   0b000001010010100,
> +   0b000000001010110,
> +   0b010000000000000,
> +   0b110000000000000,
> +   0b000100000000000,
> +   0b000000010000000,
> +   0b000000000001000,
> +   0b100000000000000,
> +   0b000001010000000,
> +   0b001010000000000,
> +   0b001100000000000,
> +   0b000000001010100,
> +   0b101101010010100,
> +   0b010100000000000,
> +   0b000000010001111,
> +   0b011000000000000,
> +   0b111110000000000,
> +   0b101000000000000,
> +   0b000000000001111,
> +   0b000100010001111,
> +   0b001000010001111,
> +   0b000110000000000,
> +};
> +
> +static const uint32_t gen6_src_index_table[32] = {
> +   0b000000000000,
> +   0b010110001000,
> +   0b010001101000,
> +   0b001000101000,
> +   0b011010010000,
> +   0b000100100000,
> +   0b010001101100,
> +   0b010101110000,
> +   0b011001111000,
> +   0b001100101000,
> +   0b010110001100,
> +   0b001000100000,
> +   0b010110001010,
> +   0b000000000010,
> +   0b010101010000,
> +   0b010101101000,
> +   0b111101001100,
> +   0b111100101100,
> +   0b011001110000,
> +   0b010110001001,
> +   0b010101011000,
> +   0b001101001000,
> +   0b010000101100,
> +   0b010000000000,
> +   0b001101110000,
> +   0b001100010000,
> +   0b001100000000,
> +   0b010001101010,
> +   0b001101111000,
> +   0b000001110000,
> +   0b001100100000,
> +   0b001101010000,
> +};
> +
> +static bool
> +set_control_index(struct brw_compact_instruction *dst,
> +                 struct brw_instruction *src)
> +{
> +
> +   uint32_t uncompressed = 0;
> +
> +   uncompressed |= src->header.access_mode << 0;
> +   uncompressed |= src->header.mask_control << 1;
> +   uncompressed |= src->header.dependency_control << 2;
> +   uncompressed |= src->header.compression_control << 4;
> +   uncompressed |= src->header.thread_control << 6;
> +   uncompressed |= src->header.predicate_control << 8;
> +   uncompressed |= src->header.predicate_inverse << 12;
> +   uncompressed |= src->header.execution_size << 13;
>

All of the above fields are packed into a contiguous pair of bytes in
brw_instruction, so effectively this is a really complex way of copying two
bytes from src to uncompressed.  Does GCC optimize this down to efficient
code?  If not, we should probably consider changing this to something like

uncompressed |= *(uint16_t *) (((char *) src) + 1);

Or, alternatively, make src->header a union (like the other dwords in
brw_instruction) and then this could be:

uncompressed |= (src->header.ud & 0xffff00) >> 8;


> +   uncompressed |= src->header.saturate << 16;
> +
> +   for (int i = 0; i < ARRAY_SIZE(gen6_control_index_table); i++) {
> +      if (gen6_control_index_table[i] == uncompressed) {
> +        dst->dw0.control_index = i;
> +        return true;
> +      }
> +   }
> +
> +   return false;
> +}
> +
> +static bool
> +set_datatype_index(struct brw_compact_instruction *dst,
> +                  struct brw_instruction *src)
> +{
> +   uint32_t uncompressed = 0;
> +
> +   uncompressed |= src->bits1.ud & 0x7fff;
> +   uncompressed |= (src->bits1.ud >> 29) << 15;
> +
> +   for (int i = 0; i < ARRAY_SIZE(gen6_datatype_table); i++) {
> +      if (gen6_datatype_table[i] == uncompressed) {
> +        dst->dw0.data_type_index = i;
> +        return true;
> +      }
> +   }
> +
> +   return false;
> +}
> +
> +static bool
> +set_subreg_index(struct brw_compact_instruction *dst,
> +                struct brw_instruction *src)
> +{
> +   uint32_t uncompressed = 0;
> +
> +   uncompressed |= src->bits1.da1.dest_subreg_nr << 0;
> +   uncompressed |= src->bits2.da1.src0_subreg_nr << 5;
> +   uncompressed |= src->bits3.da1.src1_subreg_nr << 10;
> +
> +   for (int i = 0; i < ARRAY_SIZE(gen6_subreg_table); i++) {
> +      if (gen6_subreg_table[i] == uncompressed) {
> +        dst->dw0.sub_reg_index = i;
> +        return true;
> +      }
> +   }
> +
> +   return false;
> +}
> +
> +static bool
> +get_src_index(uint32_t uncompressed,
> +             uint32_t *compressed)
> +{
> +   for (int i = 0; i < ARRAY_SIZE(gen6_src_index_table); i++) {
> +      if (gen6_src_index_table[i] == uncompressed) {
> +        *compressed = i;
> +        return true;
> +      }
> +   }
> +
> +   return false;
> +}
> +
> +static bool
> +set_src0_index(struct brw_compact_instruction *dst,
> +              struct brw_instruction *src)
> +{
> +   uint32_t compressed, uncompressed = 0;
> +
> +   uncompressed |= src->bits2.da1.src0_abs << 0;
> +   uncompressed |= src->bits2.da1.src0_negate << 1;
> +   uncompressed |= src->bits2.da1.src0_address_mode << 2;
> +   uncompressed |= src->bits2.da1.src0_horiz_stride << 3;
> +   uncompressed |= src->bits2.da1.src0_width << 5;
> +   uncompressed |= src->bits2.da1.src0_vert_stride << 8;
>

These values are also contiguous, and could be more efficiently written by
adding a "ud" field to the "bits2" union, and then doing:

uncompressed |= (src->bits2.ud & 0x1ffe000) >> 13;


> +
> +   if (!get_src_index(uncompressed, &compressed))
> +      return false;
> +
> +   dst->dw0.src0_index = compressed & 0x3;
> +   dst->dw1.src0_index = compressed >> 2;
> +
> +   return true;
> +}
> +
> +static bool
> +set_src1_index(struct brw_compact_instruction *dst,
> +              struct brw_instruction *src)
> +{
> +   uint32_t compressed, uncompressed = 0;
> +
> +   uncompressed |= src->bits3.da1.src1_abs << 0;
> +   uncompressed |= src->bits3.da1.src1_negate << 1;
> +   uncompressed |= src->bits3.da1.src1_address_mode << 2;
> +   uncompressed |= src->bits3.da1.src1_horiz_stride << 3;
> +   uncompressed |= src->bits3.da1.src1_width << 5;
> +   uncompressed |= src->bits3.da1.src1_vert_stride << 8;
>

Similar to the above, this could be written more efficiently as:

uncompressed |= (src->bits3.ud & 0x1ffe000) >> 13;


> +
> +   if (!get_src_index(uncompressed, &compressed))
> +      return false;
> +
> +   dst->dw1.src1_index = compressed;
> +
> +   return true;
> +}
> +
> +/**
> + * Tries to compact instruction src into dst.
> + *
> + * It doesn't modify dst unless src is compactable, which is relied on by
> + * brw_compact_instructions().
> + */
> +bool
> +brw_try_compact_instruction(struct brw_compile *c,
> +                           struct brw_compact_instruction *dst,
> +                           struct brw_instruction *src)
> +{
> +   struct brw_compact_instruction temp;
> +
> +   if (src->header.opcode == BRW_OPCODE_SEND ||
> +       src->header.opcode == BRW_OPCODE_SENDC) {
> +      if (src->bits3.generic.end_of_thread)
> +        return false;
> +
> +      return false; /* FINISHME: What else needs handling? */
> +   }
>

Could we have a comment explaining why we don't apply compaction to send
instructions?  I don't see anything in the bspec indicating that we can't.
Also, why is src->bits3.generic.end_of_thread handled as a separate case?


> +
> +   /* FINISHME: immediates */
> +   if (src->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE ||
> +       src->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
> +      return false;
> +
> +   memset(&temp, 0, sizeof(temp));
> +
> +   temp.dw0.opcode = src->header.opcode;
> +   temp.dw0.debug_control = src->header.debug_control;
> +   if (!set_control_index(&temp, src))
> +      return false;
> +   if (!set_datatype_index(&temp, src))
> +      return false;
> +   if (!set_subreg_index(&temp, src))
> +      return false;
> +   temp.dw0.acc_wr_control = src->header.acc_wr_control;
> +   temp.dw0.conditionalmod = src->header.destreg__conditionalmod;
> +   temp.dw0.flag_reg_nr = src->bits2.da1.flag_reg_nr;
> +   temp.dw0.cmpt_ctrl = 1;
> +   if (!set_src0_index(&temp, src))
> +      return false;
> +   if (!set_src1_index(&temp, src))
> +      return false;
> +   temp.dw1.dst_reg_nr = src->bits1.da1.dest_reg_nr;
> +   temp.dw1.src0_reg_nr = src->bits2.da1.src0_reg_nr;
> +   temp.dw1.src1_reg_nr = src->bits3.da1.src1_reg_nr;
> +
> +   *dst = temp;
> +
> +   return true;
> +}
> +
> +static void
> +set_uncompressed_control(struct brw_instruction *dst,
> +                        struct brw_compact_instruction *src)
> +{
> +   uint32_t compressed = gen6_control_index_table[src->dw0.control_index];
>

There's some inconsistent nomenclature here.  In the code above,
"compressed" represents the index into the table, and "uncompressed"
represents the value that is stored in the table.  Here, "compressed" is
the value that is stored in the table.  Maybe it would be clearer to use
the terms "index" and "value" or somesuch?  Another disadvantage of the
"compressed/uncompressed" nomenclature is that it might make people think
of instruction compression, which is a completely different feature from
instruction compaction :)

Note: I would also be ok with just calling it "uncompressed" as in the code
below.


> +
> +   dst->header.access_mode = compressed >> 0;
> +   dst->header.mask_control = compressed >> 1;
> +   dst->header.dependency_control = compressed >> 2;
> +   dst->header.compression_control = compressed >> 4;
> +   dst->header.thread_control = compressed >> 6;
> +   dst->header.predicate_control = compressed >> 8;
> +   dst->header.predicate_inverse = compressed >> 12;
> +   dst->header.execution_size = compressed >> 13;
> +   dst->header.saturate = compressed >> 16;
>

As in set_control_index(), this could be expressed a lot more compactly
because all of the fields but one are contiguous in the uncompressed
instruction.

+}
> +
> +
> +static void
> +set_uncompressed_datatype(struct brw_instruction *dst,
> +                         struct brw_compact_instruction *src)
> +{
> +   uint32_t uncompressed = gen6_datatype_table[src->dw0.data_type_index];
> +
> +   dst->bits1.ud &= ~(0x7 << 29);
> +   dst->bits1.ud |= ((uncompressed >> 15) & 0x7) << 29;
> +   dst->bits1.ud &= ~0x7fff;
> +   dst->bits1.ud |= uncompressed & 0x7fff;
> +}
> +
> +static void
> +set_uncompressed_subreg(struct brw_instruction *dst,
> +                       struct brw_compact_instruction *src)
> +{
> +   uint32_t uncompressed = gen6_subreg_table[src->dw0.sub_reg_index];
> +
> +   dst->bits1.da1.dest_subreg_nr = (uncompressed >> 0)  & 0x1f;
> +   dst->bits2.da1.src0_subreg_nr = (uncompressed >> 5)  & 0x1f;
> +   dst->bits3.da1.src1_subreg_nr = (uncompressed >> 10) & 0x1f;
> +}
> +
> +static void
> +set_uncompressed_src0(struct brw_instruction *dst,
> +                     struct brw_compact_instruction *src)
> +{
> +   uint32_t compressed = src->dw0.src0_index | src->dw1.src0_index << 2;
> +   uint32_t uncompressed = gen6_src_index_table[compressed];
> +
> +   dst->bits2.da1.src0_abs = uncompressed >> 0;
> +   dst->bits2.da1.src0_negate = uncompressed >> 1;
> +   dst->bits2.da1.src0_address_mode = uncompressed >> 2;
> +   dst->bits2.da1.src0_horiz_stride = uncompressed >> 3;
> +   dst->bits2.da1.src0_width = uncompressed >> 5;
> +   dst->bits2.da1.src0_vert_stride = uncompressed >> 8;
>

As with set_src0_index, this could be done more compactly since the fields
are contiguous.


> +}
> +
> +static void
> +set_uncompressed_src1(struct brw_instruction *dst,
> +                     struct brw_compact_instruction *src)
> +{
> +   uint32_t uncompressed = gen6_src_index_table[src->dw1.src1_index];
> +
> +   dst->bits3.da1.src1_abs = uncompressed >> 0;
> +   dst->bits3.da1.src1_negate = uncompressed >> 1;
> +   dst->bits3.da1.src1_address_mode = uncompressed >> 2;
> +   dst->bits3.da1.src1_horiz_stride = uncompressed >> 3;
> +   dst->bits3.da1.src1_width = uncompressed >> 5;
> +   dst->bits3.da1.src1_vert_stride = uncompressed >> 8;
>

Same as above.


> +}
> +
> +void
> +brw_uncompact_instruction(struct intel_context *intel,
> +                         struct brw_instruction *dst,
> +                         struct brw_compact_instruction *src)
> +{
> +   memset(dst, 0, sizeof(*dst));
> +
> +   dst->header.opcode = src->dw0.opcode;
> +   dst->header.debug_control = src->dw0.debug_control;
> +
> +   set_uncompressed_control(dst, src);
> +   set_uncompressed_datatype(dst, src);
> +   set_uncompressed_subreg(dst, src);
> +   dst->header.acc_wr_control = src->dw0.acc_wr_control;
> +   dst->header.destreg__conditionalmod = src->dw0.conditionalmod;
> +   dst->bits2.da1.flag_reg_nr = src->dw0.flag_reg_nr;
> +   set_uncompressed_src0(dst, src);
> +   set_uncompressed_src1(dst, src);
> +   dst->bits1.da1.dest_reg_nr = src->dw1.dst_reg_nr;
> +   dst->bits2.da1.src0_reg_nr = src->dw1.src0_reg_nr;
> +   dst->bits3.da1.src1_reg_nr = src->dw1.src1_reg_nr;
> +}
> +
> +void brw_debug_compact_uncompact(struct intel_context *intel,
> +                                struct brw_instruction *orig,
> +                                struct brw_instruction *uncompacted)
> +{
> +   fprintf(stderr, "Instruction compact/uncompact changed:\n");
> +
> +   fprintf(stderr, "  before: ");
> +   brw_disasm(stderr, orig, intel->gen);
> +
> +   fprintf(stderr, "  after:  ");
> +   brw_disasm(stderr, uncompacted, intel->gen);
> +
> +   uint32_t *before_bits = (uint32_t *)orig;
> +   uint32_t *after_bits = (uint32_t *)uncompacted;
> +   printf("  changed bits:\n");
> +   for (int i = 0; i < 128; i++) {
> +      uint32_t before = before_bits[i / 32] & (1 << (i & 31));
> +      uint32_t after = after_bits[i / 32] & (1 << (i & 31));
> +
> +      if (before != after) {
> +        printf("  bit %d, %s to %s\n", i,
> +               before ? "set" : "unset",
> +               after ? "set" : "unset");
> +      }
> +   }
> +}
> +
> +void
> +brw_compact_instructions(struct brw_compile *p)
> +{
> +   struct brw_context *brw = p->brw;
> +   struct intel_context *intel = &brw->intel;
> +   void *store = p->store;
> +
> +   assert(gen6_control_index_table[ARRAY_SIZE(gen6_control_index_table) -
> 1] != 0);
> +   assert(gen6_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
> +   assert(gen6_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
> +   assert(gen6_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] !=
> 0);
> +
> +   if (intel->gen != 6)
> +      return;
> +
> +   /* FINISHME: If we are going to compress instructions between flow
> control,
> +    * we have to do fixups to flow control offsets to represent the new
> +    * distances, since flow control uses (virtual address distance)/2,
> not a
> +    * logical instruction count.
> +    */
> +   bool continue_compressing = true;
> +   for (int i = 0; i < p->nr_insn; i++) {
> +      if (p->store[i].header.opcode == BRW_OPCODE_WHILE)
> +        return;
> +   }
>

I realize this is going to change in a later patch, but it would be nice to
have a comment explaining why the presence of a WHILE instruction means
that we avoid compacting the entire program, whereas for other flow control
instructions we can compact up until the point where we find the offending
instruction.  (I'm guessing this is because WHILE is the only instruction
we emit which can cause a backwards jump?)


> +
> +   int src_offset;
> +   int offset = 0;
> +   for (src_offset = 0; src_offset < p->nr_insn * 16;) {
> +      struct brw_instruction *src = store + src_offset;
> +      void *dst = store + offset;
> +
> +      switch (src->header.opcode) {
> +      case BRW_OPCODE_IF:
> +      case BRW_OPCODE_HALT:
> +      case BRW_OPCODE_JMPI:
> +        continue_compressing = false;
> +        break;
> +      }
> +
> +      struct brw_instruction saved = *src;
> +
> +      if (continue_compressing &&
> +         !src->header.cmpt_control &&
> +         brw_try_compact_instruction(p, dst, src)) {
> +
> +        /* debug */
> +        struct brw_instruction uncompacted;
> +        brw_uncompact_instruction(intel, &uncompacted, dst);
> +        if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) {
> +           brw_debug_compact_uncompact(intel, &saved, &uncompacted);
> +        }
>

Do you intend to keep this debug block in the final production code, or was
it temporary debugging code that you were planning to remove?  It seems
like the kind of thing that we would normally enclose in "if (0)".


> +
> +        offset += 8;
> +        src_offset += 16;
> +      } else {
> +        int size = src->header.cmpt_control ? 8 : 16;
> +
> +        /* It appears that the end of thread SEND instruction needs to be
> aligned. */
>

Based on bspec or experiment?  If bspec, it would be nice to cite a
reference.  If experiment, it would be nice to document what goes wrong if
it's not aligned.


> +        if ((src->header.opcode == BRW_OPCODE_SEND ||
> +             src->header.opcode == BRW_OPCODE_SENDC) &&
> +            src->bits3.generic.end_of_thread &&
> +            (offset & 8) != 0) {
> +           struct brw_compact_instruction *align = store + offset;
> +           memset(align, 0, sizeof(*align));
> +           align->dw0.opcode = BRW_OPCODE_NOP;
> +           align->dw0.cmpt_ctrl = 1;
> +           offset += 8;
> +           dst = store + offset;
> +        }
> +
> +        /* If we didn't compact this intruction, we need to move it down
> into
>

Instruction.


> +         * place.
> +         */
> +        if (offset != src_offset) {
> +           memmove(dst, src, size);
> +        }
> +        offset += size;
> +        src_offset += size;
> +      }
> +   }
> +
> +   /* p->nr_insn is counting the number of uncompacted instructions
> still, so
> +    * divide.  We do want to be sure there's a valid instruction in any
> +    * alignment padding, so that the next compression pass (for the FS
> 8/16
> +    * compile passes) parses correctly.
> +    */
>

Can I infer from this that when we're compiling a fragment shader, the
8-wide shader gets compiled and then compacted, then the 16-wide shader
gets compiled, then the whole fragment shader (both 8-wide and 16-wide
portions) gets compacted?  If so, that seems like a bit of a waste.  It
would be nice if we could ensure that the second compaction pass is only
applied to the 16-wide shader.  In addition to saving a little bit of
execution time, it would let us simplify the code above (which contains
special logic to ensure that already-compressed instructions don't get
re-compressed).


> +   if (offset & 8) {
> +      struct brw_compact_instruction *align = store + offset;
> +      memset(align, 0, sizeof(*align));
> +      align->dw0.opcode = BRW_OPCODE_NOP;
> +      align->dw0.cmpt_ctrl = 1;
> +      offset += 8;
> +   }
> +   p->next_insn_offset = offset;
> +   p->nr_insn = offset / 16;
> +
> +   if (0) {
> +      fprintf(stdout, "dumping compacted program\n");
> +      brw_dump_compile(p, stdout, 0, p->next_insn_offset);
> +
> +      int cmp = 0;
> +      for (offset = 0; offset < p->next_insn_offset;) {
> +        struct brw_instruction *insn = store + offset;
> +
> +        if (insn->header.cmpt_control) {
> +           offset += 8;
> +           cmp++;
> +        } else {
> +           offset += 16;
> +        }
> +      }
> +      fprintf(stderr, "%db/%db saved (%d%%)\n", cmp * 8, offset + cmp * 8,
> +             cmp * 8 * 100 / (offset + cmp * 8));
> +   }
> +}
> diff --git a/src/mesa/drivers/dri/i965/brw_eu_debug.c
> b/src/mesa/drivers/dri/i965/brw_eu_debug.c
> index 99453af..a8e10a9 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu_debug.c
> +++ b/src/mesa/drivers/dri/i965/brw_eu_debug.c
> @@ -32,6 +32,7 @@
>
>  #include "main/mtypes.h"
>  #include "main/imports.h"
> +#include "brw_context.h"
>  #include "brw_eu.h"
>
>  void brw_print_reg( struct brw_reg hwreg )
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index a8d55ff..4b82e53 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -1957,6 +1957,12 @@ fs_visitor::run()
>     brw_wm_payload_setup(brw, c);
>
>     if (c->dispatch_width == 16) {
> +      /* We have to do a compaction pass now, or the one at the end of
> +       * execution will squash down where our prog_offset start needs
> +       * to be.
> +       */
> +      brw_compact_instructions(p);
> +
>        /* align to 64 byte boundary. */
>        while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
>          brw_NOP(p);
> diff --git a/src/mesa/drivers/dri/i965/brw_structs.h
> b/src/mesa/drivers/dri/i965/brw_structs.h
> index 465d2a2..906d352 100644
> --- a/src/mesa/drivers/dri/i965/brw_structs.h
> +++ b/src/mesa/drivers/dri/i965/brw_structs.h
> @@ -1048,6 +1048,8 @@ struct brw_instruction
>          GLuint dest_subreg_nr:3;
>          GLuint dest_reg_nr:8;
>        } da3src;
> +
> +      uint32_t ud;
>     } bits1;
>
>
> @@ -1534,5 +1536,27 @@ struct brw_instruction
>     } bits3;
>  };
>
> +struct brw_compact_instruction {
> +   struct {
> +      unsigned opcode:7;          /*  0- 6 */
> +      unsigned debug_control:1;   /*  7- 7 */
> +      unsigned control_index:5;   /*  8-12 */
> +      unsigned data_type_index:5; /* 13-17 */
> +      unsigned sub_reg_index:5;   /* 18-22 */
> +      unsigned acc_wr_control:1;  /* 23-23 */
> +      unsigned conditionalmod:4;  /* 24-27 */
> +      unsigned flag_reg_nr:1;     /* 28-28 */
> +      unsigned cmpt_ctrl:1;       /* 29-29 */
> +      unsigned src0_index:2;      /* 30-31 */
> +   } dw0;
> +
> +   struct {
> +      unsigned src0_index:3;  /* 32-24 */
> +      unsigned src1_index:5;  /* 35-39 */
> +      unsigned dst_reg_nr:8;  /* 40-47 */
> +      unsigned src0_reg_nr:8; /* 48-55 */
> +      unsigned src1_reg_nr:8; /* 56-63 */
> +   } dw1;
> +};
>
>  #endif
> diff --git a/src/mesa/drivers/dri/i965/test_eu_compact.c
> b/src/mesa/drivers/dri/i965/test_eu_compact.c
> new file mode 100644
> index 0000000..cedfc42
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/test_eu_compact.c
> @@ -0,0 +1,265 @@
> +/*
> + * Copyright © 2012 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without
> limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the
> next
> + * paragraph) shall be included in all copies or substantial portions of
> the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#include <stdlib.h>
> +#include <stdio.h>
> +#include <stdbool.h>
> +#include "glsl/ralloc.h"
> +#include "brw_context.h"
> +#include "brw_eu.h"
> +
> +static bool
> +test_compact_instruction(struct brw_compile *c, struct brw_instruction
> src)
> +{
> +   struct brw_context *brw = c->brw;
> +   struct intel_context *intel = &brw->intel;
> +
> +   struct brw_compact_instruction dst;
> +   memset(&dst, 0xd0, sizeof(dst));
> +
> +   if (brw_try_compact_instruction(c, &dst, &src)) {
> +      struct brw_instruction uncompacted;
> +
> +      brw_uncompact_instruction(intel, &uncompacted, &dst);
> +      if (memcmp(&uncompacted, &src, sizeof(src))) {
> +        brw_debug_compact_uncompact(intel, &src, &uncompacted);
> +        return false;
> +      }
> +   } else {
> +      struct brw_compact_instruction unchanged;
> +      memset(&unchanged, 0xd0, sizeof(unchanged));
> +      /* It's not supposed to change dst unless it compacted. */
> +      if (memcmp(&unchanged, &dst, sizeof(dst))) {
> +        fprintf(stderr, "Failed to compact, but dst changed\n");
> +        fprintf(stderr, "  Instruction: ");
> +        brw_disasm(stderr, &src, intel->gen);
> +        return false;
> +      }
> +   }
> +
> +   return true;
> +}
> +
> +/**
> + * When doing fuzz testing, pad bits won't round-trip.
> + *
> + * This sort of a superset of skip_bit, which is testing for changing
> bits that
> + * aren't worth testing for fuzzing.  We also just want to clear bits that
> + * become meaningless once fuzzing twiddles a related bit.
> + */
> +static void
> +clear_pad_bits(struct brw_instruction *inst)
> +{
> +   if (inst->header.opcode != BRW_OPCODE_SEND &&
> +       inst->header.opcode != BRW_OPCODE_SENDC &&
> +       inst->header.opcode != BRW_OPCODE_BREAK &&
> +       inst->header.opcode != BRW_OPCODE_CONTINUE &&
> +       inst->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE &&
> +       inst->bits1.da1.src1_reg_file != BRW_IMMEDIATE_VALUE) {
> +      if (inst->bits3.da1.src1_address_mode)
> +        inst->bits3.ia1.pad1 = 0;
> +      else
> +        inst->bits3.da1.pad0 = 0;
> +   }
> +}
> +
> +static bool
> +skip_bit(struct brw_instruction *src, int bit)
> +{
> +   /* pad bit */
> +   if (bit == 7)
> +      return true;
> +
> +   /* The compact bit -- uncompacted can't have it set. */
> +   if (bit == 29)
> +      return true;
> +
> +   /* pad bit */
> +   if (bit == 47)
> +      return true;
> +
> +   /* pad bits */
> +   if (bit >= 90 && bit <= 95)
> +      return true;
> +
> +   /* sometimes these are pad bits. */
> +   if (src->header.opcode != BRW_OPCODE_SEND &&
> +       src->header.opcode != BRW_OPCODE_SENDC &&
> +       src->header.opcode != BRW_OPCODE_BREAK &&
> +       src->header.opcode != BRW_OPCODE_CONTINUE &&
> +       src->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE &&
> +       src->bits1.da1.src1_reg_file != BRW_IMMEDIATE_VALUE &&
> +       bit >= 121) {
> +      return true;
> +   }
> +
> +   return false;
> +}
> +
> +static bool
> +test_fuzz_compact_instruction(struct brw_compile *c,
> +                             struct brw_instruction src)
> +{
> +   for (int bit0 = 0; bit0 < 128; bit0++) {
> +      if (skip_bit(&src, bit0))
> +        continue;
> +
> +      for (int bit1 = 0; bit1 < 128; bit1++) {
> +        struct brw_instruction instr = src;
> +        uint32_t *bits = (uint32_t *)&instr;
> +
> +        if (skip_bit(&src, bit1))
> +           continue;
> +
> +        bits[bit0 / 32] ^= (1 << (bit0 & 31));
> +        bits[bit1 / 32] ^= (1 << (bit1 & 31));
> +
> +        clear_pad_bits(&instr);
> +
> +        if (!test_compact_instruction(c, instr)) {
> +           printf("  twiddled bits for fuzzing %d, %d\n", bit0, bit1);
> +           return false;
> +        }
> +      }
> +   }
> +
> +   return true;
> +}
> +
> +static void
> +gen_ADD_GRF_GRF_GRF(struct brw_compile *c)
> +{
> +   struct brw_reg g0 = brw_vec8_grf(0, 0);
> +   struct brw_reg g2 = brw_vec8_grf(2, 0);
> +   struct brw_reg g4 = brw_vec8_grf(4, 0);
> +
> +   brw_ADD(c, g0, g2, g4);
> +}
> +
> +static void
> +gen_ADD_GRF_GRF_IMM(struct brw_compile *c)
> +{
> +   struct brw_reg g0 = brw_vec8_grf(0, 0);
> +   struct brw_reg g2 = brw_vec8_grf(2, 0);
> +
> +   brw_ADD(c, g0, g2, brw_imm_f(1.0));
> +}
> +
> +static void
> +gen_ADD_GRF_GRF_IMM_d(struct brw_compile *c)
> +{
> +   struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_D);
> +   struct brw_reg g2 = retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_D);
> +
> +   brw_ADD(c, g0, g2, brw_imm_d(1));
> +}
> +
> +static void
> +gen_MOV_GRF_GRF(struct brw_compile *c)
> +{
> +   struct brw_reg g0 = brw_vec8_grf(0, 0);
> +   struct brw_reg g2 = brw_vec8_grf(2, 0);
> +
> +   brw_MOV(c, g0, g2);
> +}
> +
> +static void
> +gen_ADD_MRF_GRF_GRF(struct brw_compile *c)
> +{
> +   struct brw_reg m6 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 6, 0);
> +   struct brw_reg g2 = brw_vec8_grf(2, 0);
> +   struct brw_reg g4 = brw_vec8_grf(4, 0);
> +
> +   brw_ADD(c, m6, g2, g4);
> +}
> +
> +static void
> +gen_ADD_vec1_GRF_GRF_GRF(struct brw_compile *c)
> +{
> +   struct brw_reg g0 = brw_vec1_grf(0, 0);
> +   struct brw_reg g2 = brw_vec1_grf(2, 0);
> +   struct brw_reg g4 = brw_vec1_grf(4, 0);
> +
> +   brw_ADD(c, g0, g2, g4);
> +}
> +
> +static void
> +gen_PLN_MRF_GRF_GRF(struct brw_compile *c)
> +{
> +   struct brw_reg m6 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 6, 0);
> +   struct brw_reg interp = brw_vec1_grf(2, 0);
> +   struct brw_reg g4 = brw_vec8_grf(4, 0);
> +
> +   brw_PLN(c, m6, interp, g4);
> +}
> +
> +struct {
> +   void (*func)(struct brw_compile *c);
> +} tests[] = {
> +   { gen_MOV_GRF_GRF },
> +   { gen_ADD_GRF_GRF_GRF },
> +   { gen_ADD_GRF_GRF_IMM },
> +   { gen_ADD_GRF_GRF_IMM_d },
> +   { gen_ADD_MRF_GRF_GRF },
> +   { gen_ADD_vec1_GRF_GRF_GRF },
> +   { gen_PLN_MRF_GRF_GRF },
> +};
> +
> +int
> +main(int argc, char **argv)
> +{
> +   struct brw_context *brw = calloc(1, sizeof(*brw));
> +   struct intel_context *intel = &brw->intel;
> +   intel->gen = 6;
> +   int ret = 0;
> +
> +   for (int i = 0; i < ARRAY_SIZE(tests); i++) {
> +      for (int align_16 = 0; align_16 <= 1; align_16++) {
> +        struct brw_compile *c = rzalloc(NULL, struct brw_compile);
> +        brw_init_compile(brw, c, c);
> +
> +        brw_set_predicate_control(c, BRW_PREDICATE_NONE);
> +        if (align_16)
> +           brw_set_access_mode(c, BRW_ALIGN_16);
> +        else
> +           brw_set_access_mode(c, BRW_ALIGN_1);
> +
> +        tests[i].func(c);
> +        assert(c->nr_insn == 1);
> +
> +        if (!test_compact_instruction(c, c->store[0])) {
> +           ret = 1;
> +           continue;
> +        }
> +
> +        if (!test_fuzz_compact_instruction(c, c->store[0])) {
> +           ret = 1;
> +           continue;
> +        }
> +
> +        ralloc_free(c);
> +      }
> +   }
> +
> +   return ret;
> +}
> --
> 1.7.10.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20120904/3a833a82/attachment-0001.html>


More information about the mesa-dev mailing list