[Mesa-dev] [PATCH 2/6] i965/vec4/generator: use 1-Oword Block Read/Write messages for DF scratch writes/reads

Thu Jun 22 23:25:35 UTC 2017

Samuel Iglesias Gonsálvez <siglesias at igalia.com> writes:

> Signed-off-by: Samuel Iglesias Gonsálvez <siglesias at igalia.com>
> ---
>  src/intel/compiler/brw_eu_defines.h          |   2 +
>  src/intel/compiler/brw_shader.cpp            |   5 +
>  src/intel/compiler/brw_vec4.cpp              |   7 ++
>  src/intel/compiler/brw_vec4.h                |   8 ++
>  src/intel/compiler/brw_vec4_generator.cpp    | 136 +++++++++++++++++++++++++++
>  src/intel/compiler/brw_vec4_reg_allocate.cpp |   6 +-
>  src/intel/compiler/brw_vec4_visitor.cpp      |  49 ++++++++++
>  7 files changed, 212 insertions(+), 1 deletion(-)
>
> diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
> index 1af835d47e..3c148de0fa 100644
> --- a/src/intel/compiler/brw_eu_defines.h
> +++ b/src/intel/compiler/brw_eu_defines.h
> @@ -436,6 +436,8 @@ enum opcode {
>     VEC4_OPCODE_PICK_HIGH_32BIT,
>     VEC4_OPCODE_SET_LOW_32BIT,
>     VEC4_OPCODE_SET_HIGH_32BIT,
> +   VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW,
> +   VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH,
>  

What's the point of introducing two different opcodes with essentially
the same semantics (read 32B worth of data) as the current
SHADER_OPCODE_GEN4_SCRATCH_READ?  Is there any downside from using the
current opcode with force_writemask_all?  If anything it would give you
better performance because you'd only have to set up one header (which
stalls the EU pipeline twice), send down one message to the dataport,
and avoid stalling to shuffle the data around in the return payload
(which prevents your two 1OWORD messages from being pipelined at all).

>     FS_OPCODE_DDX_COARSE,
>     FS_OPCODE_DDX_FINE,
> diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
> index 53d0742d2e..248feacbd2 100644
> --- a/src/intel/compiler/brw_shader.cpp
> +++ b/src/intel/compiler/brw_shader.cpp
> @@ -296,6 +296,11 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
>     case FS_OPCODE_PACK:
>        return "pack";
>  
> +
> +   case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
> +      return "gen4_scratch_read_1word_low";
> +   case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
> +      return "gen4_scratch_read_1word_high";
>     case SHADER_OPCODE_GEN4_SCRATCH_READ:
>        return "gen4_scratch_read";
>     case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
> diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
> index b443effca9..b6d409eea2 100644
> --- a/src/intel/compiler/brw_vec4.cpp
> +++ b/src/intel/compiler/brw_vec4.cpp
> @@ -259,6 +259,8 @@ bool
>  vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo)
>  {
>     switch (opcode) {
> +   case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
> +   case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
>     case SHADER_OPCODE_GEN4_SCRATCH_READ:
>     case VEC4_OPCODE_DOUBLE_TO_F32:
>     case VEC4_OPCODE_DOUBLE_TO_D32:
> @@ -335,6 +337,9 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
>        return 1;
>     case VS_OPCODE_PULL_CONSTANT_LOAD:
>        return 2;
> +   case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
> +   case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
> +      return 1;
>     case SHADER_OPCODE_GEN4_SCRATCH_READ:
>        return 2;
>     case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
> @@ -2091,6 +2096,8 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
>  {
>     /* Do not split some instructions that require special handling */
>     switch (inst->opcode) {
> +   case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
> +   case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
>     case SHADER_OPCODE_GEN4_SCRATCH_READ:
>     case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
>        return inst->exec_size;
> diff --git a/src/intel/compiler/brw_vec4.h b/src/intel/compiler/brw_vec4.h
> index d828da02ea..a5b45aca21 100644
> --- a/src/intel/compiler/brw_vec4.h
> +++ b/src/intel/compiler/brw_vec4.h
> @@ -214,6 +214,9 @@ public:
>                          enum brw_conditional_mod condition);
>     vec4_instruction *IF(enum brw_predicate predicate);
>     EMIT1(SCRATCH_READ)
> +   vec4_instruction *DF_IVB_SCRATCH_READ(const dst_reg &dst, const src_reg &src0,
> +                                         bool low);
> +
>     EMIT2(SCRATCH_WRITE)
>     EMIT3(LRP)
>     EMIT1(BFREV)
> @@ -294,6 +297,11 @@ public:
>  			  dst_reg dst,
>  			  src_reg orig_src,
>  			  int base_offset);
> +   void emit_1grf_df_ivb_scratch_read(bblock_t *block,
> +                                      vec4_instruction *inst,
> +                                      dst_reg temp, src_reg orig_src,
> +                                      int base_offset, bool first_grf);
> +
>     void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
>  			   int base_offset);
>     void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
> diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
> index 334933d15a..3bb931385a 100644
> --- a/src/intel/compiler/brw_vec4_generator.cpp
> +++ b/src/intel/compiler/brw_vec4_generator.cpp
> @@ -1133,6 +1133,73 @@ generate_unpack_flags(struct brw_codegen *p,
>  }
>  
>  static void
> +generate_scratch_read_1oword(struct brw_codegen *p,
> +                             vec4_instruction *inst,
> +                             struct brw_reg dst,
> +                             struct brw_reg index,
> +                             bool low)
> +{
> +   const struct gen_device_info *devinfo = p->devinfo;
> +
> +   assert(devinfo->gen >= 7 && inst->exec_size == 4 &&
> +          type_sz(dst.type) == 8);
> +   brw_set_default_access_mode(p, BRW_ALIGN_1);
> +   brw_set_default_exec_size(p, BRW_EXECUTE_8);
> +
> +   if (!low) {
> +      /* Read second GRF (offset in OWORDs) */
> +      for (int i = 0; i < 2; i++) {
> +         brw_oword_block_read_scratch(p,
> +                                      dst,
> +                                      brw_message_reg(inst->base_mrf),
> +                                      1, 32*inst->offset + 16*i + 32, false, true);
> +         if (i == 0) {
> +            /* The scratch read message writes the 128 MSB (OWORD1 HIGH) of
> +             * the destination. We need to move them to dst.0 so we can
> +             * read the pending 128 bits without using a temporary register.
> +             */
> +            brw_set_default_exec_size(p, BRW_EXECUTE_4);
> +            struct brw_reg tmp =
> +               stride(suboffset(dst, 16 / type_sz(dst.type)),
> +                      4, 4, 1);
> +
> +            brw_set_default_mask_control(p, true);
> +            brw_MOV(p, dst, tmp);
> +            brw_set_default_mask_control(p, inst->force_writemask_all);
> +            brw_set_default_exec_size(p, BRW_EXECUTE_8);
> +         }
> +      }
> +   } else {
> +      /* Read first GRF (offset in OWORDs) */
> +      for (int i = 1; i >= 0; i--) {
> +         brw_oword_block_read_scratch(p,
> +                                      dst,
> +                                      brw_message_reg(inst->base_mrf),
> +                                      1, 32*inst->offset + 16*i, true, false);
> +
> +         if (i == 1) {
> +            /* The scratch read message writes the 128 LSB (OWORD1 LOW) of
> +             * the destination. We need to move them to dst.4 so we can
> +             * read the pending 128 bits without using a temporary register.
> +             */
> +            struct brw_reg tmp = stride(dst, 4, 4, 1);
> +            brw_set_default_exec_size(p, BRW_EXECUTE_4);
> +            brw_set_default_mask_control(p, true);
> +            brw_MOV(p,
> +                    suboffset(dst, 16 / type_sz(dst.type)),
> +                    tmp);
> +            brw_set_default_mask_control(p, inst->force_writemask_all);
> +            brw_set_default_exec_size(p, BRW_EXECUTE_8);
> +         }
> +      }
> +   }
> +
> +   brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
> +   brw_set_default_access_mode(p, BRW_ALIGN_16);
> +   return;
> +}
> +
> +static void
>  generate_scratch_read(struct brw_codegen *p,
>                        vec4_instruction *inst,
>                        struct brw_reg dst,
> @@ -1143,6 +1210,16 @@ generate_scratch_read(struct brw_codegen *p,
>  
>     gen6_resolve_implied_move(p, &header, inst->base_mrf);
>  
> +   if (devinfo->gen >= 7 && inst->exec_size == 4 &&
> +       type_sz(dst.type) == 8) {
> +      /* First read second GRF (offset in OWORDs) */
> +      struct brw_reg dst_high = suboffset(dst, 32 / type_sz(dst.type));
> +      generate_scratch_read_1oword(p, inst, dst_high, index, false);
> +      /* Now read first GRF (data from first vertex) */
> +      generate_scratch_read_1oword(p, inst, dst, index, true);
> +      return;
> +   }
> +
>     generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
>  				     index);
>  
> @@ -1192,6 +1269,57 @@ generate_scratch_write(struct brw_codegen *p,
>     struct brw_reg header = brw_vec8_grf(0, 0);
>     bool write_commit;
>  
> +   if (devinfo->gen >= 7 && inst->exec_size == 4 &&
> +       type_sz(src.type) == 8) {
> +      brw_set_default_access_mode(p, BRW_ALIGN_1);
> +
> +      /* The messages only works with group == 0, we use the group to know which
> +       * message emit (1-OWORD LOW or 1-OWORD HIGH).
> +       */
> +      brw_set_default_group(p, 0);
> +
> +      if (inst->group == 0) {
> +         for (int i = 0; i < 2; i++) {
> +            brw_set_default_exec_size(p, BRW_EXECUTE_4);
> +            brw_set_default_mask_control(p, true);
> +            struct brw_reg temp =
> +               retype(suboffset(src, i * 16 / type_sz(src.type)), BRW_REGISTER_TYPE_UD);
> +            temp = stride(temp, 4, 4, 1);
> +
> +            brw_MOV(p, brw_uvec_mrf(4, inst->base_mrf + 1, 0),
> +                    temp);
> +            brw_set_default_mask_control(p, inst->force_writemask_all);
> +            brw_set_default_exec_size(p, BRW_EXECUTE_8);
> +
> +            /* Offset in OWORDs */
> +            brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
> +                                          1, 32*inst->offset + 16*i, true, false);
> +         }
> +      } else {
> +         for (int i = 0; i < 2; i++) {
> +            brw_set_default_exec_size(p, BRW_EXECUTE_4);
> +
> +            brw_set_default_mask_control(p, true);
> +            struct brw_reg temp =
> +               retype(suboffset(src, i * 16 / type_sz(src.type)), BRW_REGISTER_TYPE_UD);
> +            temp = stride(temp, 4, 4, 1);
> +
> +            brw_MOV(p, brw_uvec_mrf(4, inst->base_mrf + 1, 4),
> +                    temp);
> +
> +            brw_set_default_mask_control(p, inst->force_writemask_all);
> +            brw_set_default_exec_size(p, BRW_EXECUTE_8);
> +
> +            /* Offset in OWORDs */
> +            brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
> +                                          1, 32*inst->offset + 16*i + 32, false, true);
> +         }
> +      }
> +      brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
> +      brw_set_default_access_mode(p, BRW_ALIGN_16);
> +      return;
> +   }
> +
>     /* If the instruction is predicated, we'll predicate the send, not
>      * the header setup.
>      */
> @@ -1780,6 +1908,14 @@ generate_code(struct brw_codegen *p,
>           generate_vs_urb_write(p, inst);
>           break;
>  
> +      case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
> +         generate_scratch_read_1oword(p, inst, dst, src[0], true);
> +         fill_count++;
> +         break;
> +      case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
> +         generate_scratch_read_1oword(p, inst, dst, src[0], false);
> +         fill_count++;
> +         break;
>        case SHADER_OPCODE_GEN4_SCRATCH_READ:
>           generate_scratch_read(p, inst, dst, src[0]);
>           fill_count++;
> diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp
> index a0ba77b867..ec5ba10e86 100644
> --- a/src/intel/compiler/brw_vec4_reg_allocate.cpp
> +++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp
> @@ -332,7 +332,9 @@ can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
>         * reusing scratch_reg for this instruction.
>         */
>        if (prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE ||
> -          prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ)
> +          prev_inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ ||
> +          prev_inst->opcode == VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW ||
> +          prev_inst->opcode == VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH)
>           continue;
>  
>        /* If the previous instruction does not write to scratch_reg, then check
> @@ -467,6 +469,8 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
>           loop_scale /= 10;
>           break;
>  
> +      case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW:
> +      case VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH:
>        case SHADER_OPCODE_GEN4_SCRATCH_READ:
>        case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
>           for (int i = 0; i < 3; i++) {
> diff --git a/src/intel/compiler/brw_vec4_visitor.cpp b/src/intel/compiler/brw_vec4_visitor.cpp
> index 22ee4dd1c4..37ae31c0d5 100644
> --- a/src/intel/compiler/brw_vec4_visitor.cpp
> +++ b/src/intel/compiler/brw_vec4_visitor.cpp
> @@ -264,6 +264,24 @@ vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
>  }
>  
>  vec4_instruction *
> +vec4_visitor::DF_IVB_SCRATCH_READ(const dst_reg &dst,
> +                                  const src_reg &index,
> +                                  bool first_grf)
> +{
> +   vec4_instruction *inst;
> +   enum opcode op = first_grf ?
> +      VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_LOW :
> +      VEC4_OPCODE_GEN4_SCRATCH_READ_1OWORD_HIGH;
> +
> +   inst = new(mem_ctx) vec4_instruction(op,
> +                                        dst, index);
> +   inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
> +   inst->mlen = 1;
> +
> +   return inst;
> +}
> +
> +vec4_instruction *
>  vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
>                              const src_reg &index)
>  {
> @@ -1472,6 +1490,37 @@ vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
>  
>  /**
>   * Emits an instruction before @inst to load the value named by @orig_src
> + * from scratch space at @base_offset to @temp. This instruction only reads
> + * DF value on IVB, one GRF each time.
> + *
> + * @base_offset is measured in 32-byte units (the size of a register).
> + * @first_grf indicates if we want to read first vertex data (true) or
> + * the second (false).
> + */
> +void
> +vec4_visitor::emit_1grf_df_ivb_scratch_read(bblock_t *block,
> +                                            vec4_instruction *inst,
> +                                            dst_reg temp, src_reg orig_src,
> +                                            int base_offset, bool first_grf)
> +{
> +   assert(orig_src.offset % REG_SIZE == 0);
> +   src_reg index = get_scratch_offset(block, inst, 0, base_offset);
> +
> +   assert(devinfo->gen == 7 && !devinfo->is_haswell && type_sz(temp.type) == 8);
> +   temp.offset = 0;
> +   vec4_instruction *read = DF_IVB_SCRATCH_READ(temp, index, first_grf);
> +   read->exec_size = 4;
> +   /* The instruction will use group 0 but a different message depending of the
> +    * vertex data to load.
> +    */
> +   read->group = 0;
> +   read->offset = base_offset;
> +   read->size_written = 1;
> +   emit_before(block, inst, read);
> +}
> +
> +/**
> + * Emits an instruction before @inst to load the value named by @orig_src
>   * from scratch space at @base_offset to @temp.
>   *
>   * @base_offset is measured in 32-byte units (the size of a register).
> -- 
> 2.11.0
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 212 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170622/b5ea5d02/attachment-0001.sig>