[Mesa-dev] [PATCH 08/21] i965/vec4: implement imul_high for gen8

Tue Dec 23 10:46:28 PST 2014

On Tue, Dec 23, 2014 at 10:38:36AM -0800, Matt Turner wrote:
> On Mon, Dec 22, 2014 at 7:29 PM, Ben Widawsky
> <benjamin.widawsky at intel.com> wrote:
> > The fancy DW * DW = QW that was enabled earlier in the series for the fs does
> > not work for the vec4 paths. vec4 paths use ALIGN16 mode, which is restricted
> > from the extended precision granted by gen8.
> >
> > This is based on a similar patch from Ken for the FS backend which was no longer
> > needed after I wrote equivalent code using the QW type.
> >
> > Cc: Kenneth Graunke <kenneth at whitecape.org>
> > Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
> > ---
> >  src/mesa/drivers/dri/i965/brw_defines.h          |  3 ++-
> >  src/mesa/drivers/dri/i965/brw_eu.h               | 21 +++++++++++++++++++++
> >  src/mesa/drivers/dri/i965/brw_fs_generator.cpp   | 21 ---------------------
> >  src/mesa/drivers/dri/i965/brw_shader.cpp         |  2 ++
> >  src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 12 ++++++++++++
> >  src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp   | 10 ++++++++--
> >  6 files changed, 45 insertions(+), 24 deletions(-)
> >
> > diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> > index 102ba4a..77e1008 100644
> > --- a/src/mesa/drivers/dri/i965/brw_defines.h
> > +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> > @@ -1101,7 +1101,8 @@ enum opcode {
> >      */
> >     GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
> >
> > -   SHADER_OPCODE_MOV64,
> > +   SHADER_OPCODE_MOV64, /* Used by the fs backend*/
> 
> Space before */
> 
> > +   SHADER_OPCODE_MUL64, /* Used by vec4 backend */
> >  };
> >
> >  enum brw_urb_write_flags {
> > diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
> > index 22d5a0a..1ca19bc 100644
> > --- a/src/mesa/drivers/dri/i965/brw_eu.h
> > +++ b/src/mesa/drivers/dri/i965/brw_eu.h
> > @@ -326,6 +326,27 @@ void brw_shader_time_add(struct brw_compile *p,
> >                           uint32_t surf_index);
> >
> >  /**
> > + * Change the register's data type from UD to W, doubling the strides in order
> > + * to compensate for halving the data type width.
> > + */
> > +static inline struct brw_reg
> > +ud_reg_to_w(struct brw_reg r)
> > +{
> > +   assert(r.type == BRW_REGISTER_TYPE_UD);
> > +   r.type = BRW_REGISTER_TYPE_W;
> > +
> > +   /* The BRW_*_STRIDE enums are defined so that incrementing the field
> > +    * doubles the real stride.
> > +    */
> > +   if (r.hstride != 0)
> > +      ++r.hstride;
> > +   if (r.vstride != 0)
> > +      ++r.vstride;
> > +
> > +   return r;
> > +}
> > +
> > +/**
> >   * Return the generation-specific jump distance scaling factor.
> >   *
> >   * Given the number of instructions to jump, we need to scale by
> > diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> > index 3a15837..15854b0 100644
> > --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> > @@ -1375,27 +1375,6 @@ fs_generator::generate_set_sample_id(fs_inst *inst,
> >     brw_pop_insn_state(p);
> >  }
> >
> > -/**
> > - * Change the register's data type from UD to W, doubling the strides in order
> > - * to compensate for halving the data type width.
> > - */
> > -static struct brw_reg
> > -ud_reg_to_w(struct brw_reg r)
> > -{
> > -   assert(r.type == BRW_REGISTER_TYPE_UD);
> > -   r.type = BRW_REGISTER_TYPE_W;
> > -
> > -   /* The BRW_*_STRIDE enums are defined so that incrementing the field
> > -    * doubles the real stride.
> > -    */
> > -   if (r.hstride != 0)
> > -      ++r.hstride;
> > -   if (r.vstride != 0)
> > -      ++r.vstride;
> > -
> > -   return r;
> > -}
> > -
> >  void
> >  fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
> >                                              struct brw_reg dst,
> > diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
> > index 5373b75..3891f43 100644
> > --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> > @@ -404,6 +404,8 @@ brw_instruction_name(enum opcode op)
> >     case FS_OPCODE_REP_FB_WRITE:
> >        return "rep_fb_write";
> >
> > +   case SHADER_OPCODE_MUL64:
> > +      return "mov64";
> 
> "mul64"
> 
> >     case SHADER_OPCODE_MOV64:
> >        return "mov64";
> >     case SHADER_OPCODE_RCP:
> > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> > index b88a579..8994a02 100644
> > --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> > @@ -1197,6 +1197,18 @@ vec4_generator::generate_code(const cfg_t *cfg)
> >        case BRW_OPCODE_MUL:
> >           brw_MUL(p, dst, src[0], src[1]);
> >           break;
> > +
> > +      case SHADER_OPCODE_MUL64: {
> > +         assert(brw->gen >= 8);
> > +         struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
> > +         struct brw_reg src1_w =
> > +            ud_reg_to_w(retype(src[1], BRW_REGISTER_TYPE_UD));
> > +         if (src[1].type == BRW_REGISTER_TYPE_UD)
> > +            src1_w.type = BRW_REGISTER_TYPE_UW;
> > +
> > +         brw_MUL(p, acc, src[0], src1_w);
> > +         break;
> > +      }
> >        case BRW_OPCODE_MACH:
> >           brw_MACH(p, dst, src[0], src[1]);
> >           break;
> > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> > index 09d79c8..8d52a89 100644
> > --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> > @@ -1492,13 +1492,19 @@ vec4_visitor::visit(ir_expression *ir)
> >              emit(MOV(result_dst, src_reg(acc)));
> >           }
> >        } else {
> > +         /* Gen8+ can natively multiply a DW * DW chopping off the upper bits of
> > +          * the operation. No MACH is needed
> > +          */
> 
> This hunk is supposed to be in some other patch.
> 
> >          emit(MUL(result_dst, op[0], op[1]));
> >        }
> >        break;
> >     case ir_binop_imul_high: {
> >        struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
> > -
> > -      emit(MUL(acc, op[0], op[1]));
> > +      if (brw->gen >= 8) {
> > +         emit(SHADER_OPCODE_MUL64, acc, op[0], op[1]);
> 
> I don't think we still want to be using the accumulator?
> 
> Or, we don't want to be using MUL64 here? The MUL+MACH macro only
> works when the MUL does a 16x32 multiply, and we're doing a 32x32
> multiply here.

It shouldn't be a 32 x 32 since that is prohibited with align16. I'm not really
sure where the issue is here. The MUL64 is the special hack instruction to do
the correct thing for gen8 when we can't do 32x32 (stolen from Ken).

I'll fix the other two hunks, thanks.

> 
> > +      } else {
> > +         emit(MUL(acc, op[0], op[1]));
> > +      }
> >        emit(MACH(result_dst, op[0], op[1]));
> >        break;
> >     }
> > --
> > 2.2.1