[Mesa-dev] [PATCH 08/21] i965/vec4: implement imul_high for gen8

Tue Dec 23 10:38:36 PST 2014

On Mon, Dec 22, 2014 at 7:29 PM, Ben Widawsky
<benjamin.widawsky at intel.com> wrote:
> The fancy DW * DW = QW that was enabled earlier in the series for the fs does
> not work for the vec4 paths. vec4 paths use ALIGN16 mode, which is restricted
> from the extended precision granted by gen8.
>
> This is based on a similar patch from Ken for the FS backend which was no longer
> needed after I wrote equivalent code using the QW type.
>
> Cc: Kenneth Graunke <kenneth at whitecape.org>
> Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
> ---
>  src/mesa/drivers/dri/i965/brw_defines.h          |  3 ++-
>  src/mesa/drivers/dri/i965/brw_eu.h               | 21 +++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp   | 21 ---------------------
>  src/mesa/drivers/dri/i965/brw_shader.cpp         |  2 ++
>  src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 12 ++++++++++++
>  src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp   | 10 ++++++++--
>  6 files changed, 45 insertions(+), 24 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 102ba4a..77e1008 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -1101,7 +1101,8 @@ enum opcode {
>      */
>     GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
>
> -   SHADER_OPCODE_MOV64,
> +   SHADER_OPCODE_MOV64, /* Used by the fs backend*/

Space before */

> +   SHADER_OPCODE_MUL64, /* Used by vec4 backend */
>  };
>
>  enum brw_urb_write_flags {
> diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
> index 22d5a0a..1ca19bc 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu.h
> +++ b/src/mesa/drivers/dri/i965/brw_eu.h
> @@ -326,6 +326,27 @@ void brw_shader_time_add(struct brw_compile *p,
>                           uint32_t surf_index);
>
>  /**
> + * Change the register's data type from UD to W, doubling the strides in order
> + * to compensate for halving the data type width.
> + */
> +static inline struct brw_reg
> +ud_reg_to_w(struct brw_reg r)
> +{
> +   assert(r.type == BRW_REGISTER_TYPE_UD);
> +   r.type = BRW_REGISTER_TYPE_W;
> +
> +   /* The BRW_*_STRIDE enums are defined so that incrementing the field
> +    * doubles the real stride.
> +    */
> +   if (r.hstride != 0)
> +      ++r.hstride;
> +   if (r.vstride != 0)
> +      ++r.vstride;
> +
> +   return r;
> +}
> +
> +/**
>   * Return the generation-specific jump distance scaling factor.
>   *
>   * Given the number of instructions to jump, we need to scale by
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> index 3a15837..15854b0 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> @@ -1375,27 +1375,6 @@ fs_generator::generate_set_sample_id(fs_inst *inst,
>     brw_pop_insn_state(p);
>  }
>
> -/**
> - * Change the register's data type from UD to W, doubling the strides in order
> - * to compensate for halving the data type width.
> - */
> -static struct brw_reg
> -ud_reg_to_w(struct brw_reg r)
> -{
> -   assert(r.type == BRW_REGISTER_TYPE_UD);
> -   r.type = BRW_REGISTER_TYPE_W;
> -
> -   /* The BRW_*_STRIDE enums are defined so that incrementing the field
> -    * doubles the real stride.
> -    */
> -   if (r.hstride != 0)
> -      ++r.hstride;
> -   if (r.vstride != 0)
> -      ++r.vstride;
> -
> -   return r;
> -}
> -
>  void
>  fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
>                                              struct brw_reg dst,
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index 5373b75..3891f43 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -404,6 +404,8 @@ brw_instruction_name(enum opcode op)
>     case FS_OPCODE_REP_FB_WRITE:
>        return "rep_fb_write";
>
> +   case SHADER_OPCODE_MUL64:
> +      return "mov64";

"mul64"

>     case SHADER_OPCODE_MOV64:
>        return "mov64";
>     case SHADER_OPCODE_RCP:
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> index b88a579..8994a02 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> @@ -1197,6 +1197,18 @@ vec4_generator::generate_code(const cfg_t *cfg)
>        case BRW_OPCODE_MUL:
>           brw_MUL(p, dst, src[0], src[1]);
>           break;
> +
> +      case SHADER_OPCODE_MUL64: {
> +         assert(brw->gen >= 8);
> +         struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
> +         struct brw_reg src1_w =
> +            ud_reg_to_w(retype(src[1], BRW_REGISTER_TYPE_UD));
> +         if (src[1].type == BRW_REGISTER_TYPE_UD)
> +            src1_w.type = BRW_REGISTER_TYPE_UW;
> +
> +         brw_MUL(p, acc, src[0], src1_w);
> +         break;
> +      }
>        case BRW_OPCODE_MACH:
>           brw_MACH(p, dst, src[0], src[1]);
>           break;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> index 09d79c8..8d52a89 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> @@ -1492,13 +1492,19 @@ vec4_visitor::visit(ir_expression *ir)
>              emit(MOV(result_dst, src_reg(acc)));
>           }
>        } else {
> +         /* Gen8+ can natively multiply a DW * DW chopping off the upper bits of
> +          * the operation. No MACH is needed
> +          */

This hunk is supposed to be in some other patch.

>          emit(MUL(result_dst, op[0], op[1]));
>        }
>        break;
>     case ir_binop_imul_high: {
>        struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
> -
> -      emit(MUL(acc, op[0], op[1]));
> +      if (brw->gen >= 8) {
> +         emit(SHADER_OPCODE_MUL64, acc, op[0], op[1]);

I don't think we still want to be using the accumulator?

Or, we don't want to be using MUL64 here? The MUL+MACH macro only
works when the MUL does a 16x32 multiply, and we're doing a 32x32
multiply here.

> +      } else {
> +         emit(MUL(acc, op[0], op[1]));
> +      }
>        emit(MACH(result_dst, op[0], op[1]));
>        break;
>     }
> --
> 2.2.1