[Mesa-dev] [PATCH 08/21] i965/vec4: implement imul_high for gen8
Matt Turner
mattst88 at gmail.com
Tue Dec 23 10:38:36 PST 2014
On Mon, Dec 22, 2014 at 7:29 PM, Ben Widawsky
<benjamin.widawsky at intel.com> wrote:
> The fancy DW * DW = QW that was enabled earlier in the series for the fs does
> not work for the vec4 paths. vec4 paths use ALIGN16 mode, which is restricted
> from the extended precision granted by gen8.
>
> This is based on a similar patch from Ken for the FS backend which was no longer
> needed after I wrote equivalent code using the QW type.
>
> Cc: Kenneth Graunke <kenneth at whitecape.org>
> Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
> ---
> src/mesa/drivers/dri/i965/brw_defines.h | 3 ++-
> src/mesa/drivers/dri/i965/brw_eu.h | 21 +++++++++++++++++++++
> src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 21 ---------------------
> src/mesa/drivers/dri/i965/brw_shader.cpp | 2 ++
> src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 12 ++++++++++++
> src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 10 ++++++++--
> 6 files changed, 45 insertions(+), 24 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 102ba4a..77e1008 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -1101,7 +1101,8 @@ enum opcode {
> */
> GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
>
> - SHADER_OPCODE_MOV64,
> + SHADER_OPCODE_MOV64, /* Used by the fs backend*/
Space before */
> + SHADER_OPCODE_MUL64, /* Used by vec4 backend */
> };
>
> enum brw_urb_write_flags {
> diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
> index 22d5a0a..1ca19bc 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu.h
> +++ b/src/mesa/drivers/dri/i965/brw_eu.h
> @@ -326,6 +326,27 @@ void brw_shader_time_add(struct brw_compile *p,
> uint32_t surf_index);
>
> /**
> + * Change the register's data type from UD to W, doubling the strides in order
> + * to compensate for halving the data type width.
> + */
> +static inline struct brw_reg
> +ud_reg_to_w(struct brw_reg r)
> +{
> + assert(r.type == BRW_REGISTER_TYPE_UD);
> + r.type = BRW_REGISTER_TYPE_W;
> +
> + /* The BRW_*_STRIDE enums are defined so that incrementing the field
> + * doubles the real stride.
> + */
> + if (r.hstride != 0)
> + ++r.hstride;
> + if (r.vstride != 0)
> + ++r.vstride;
> +
> + return r;
> +}
> +
> +/**
> * Return the generation-specific jump distance scaling factor.
> *
> * Given the number of instructions to jump, we need to scale by
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> index 3a15837..15854b0 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> @@ -1375,27 +1375,6 @@ fs_generator::generate_set_sample_id(fs_inst *inst,
> brw_pop_insn_state(p);
> }
>
> -/**
> - * Change the register's data type from UD to W, doubling the strides in order
> - * to compensate for halving the data type width.
> - */
> -static struct brw_reg
> -ud_reg_to_w(struct brw_reg r)
> -{
> - assert(r.type == BRW_REGISTER_TYPE_UD);
> - r.type = BRW_REGISTER_TYPE_W;
> -
> - /* The BRW_*_STRIDE enums are defined so that incrementing the field
> - * doubles the real stride.
> - */
> - if (r.hstride != 0)
> - ++r.hstride;
> - if (r.vstride != 0)
> - ++r.vstride;
> -
> - return r;
> -}
> -
> void
> fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
> struct brw_reg dst,
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index 5373b75..3891f43 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -404,6 +404,8 @@ brw_instruction_name(enum opcode op)
> case FS_OPCODE_REP_FB_WRITE:
> return "rep_fb_write";
>
> + case SHADER_OPCODE_MUL64:
> + return "mov64";
"mul64"
> case SHADER_OPCODE_MOV64:
> return "mov64";
> case SHADER_OPCODE_RCP:
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> index b88a579..8994a02 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> @@ -1197,6 +1197,18 @@ vec4_generator::generate_code(const cfg_t *cfg)
> case BRW_OPCODE_MUL:
> brw_MUL(p, dst, src[0], src[1]);
> break;
> +
> + case SHADER_OPCODE_MUL64: {
> + assert(brw->gen >= 8);
> + struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
> + struct brw_reg src1_w =
> + ud_reg_to_w(retype(src[1], BRW_REGISTER_TYPE_UD));
> + if (src[1].type == BRW_REGISTER_TYPE_UD)
> + src1_w.type = BRW_REGISTER_TYPE_UW;
> +
> + brw_MUL(p, acc, src[0], src1_w);
> + break;
> + }
> case BRW_OPCODE_MACH:
> brw_MACH(p, dst, src[0], src[1]);
> break;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> index 09d79c8..8d52a89 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> @@ -1492,13 +1492,19 @@ vec4_visitor::visit(ir_expression *ir)
> emit(MOV(result_dst, src_reg(acc)));
> }
> } else {
> + /* Gen8+ can natively multiply a DW * DW chopping off the upper bits of
> + * the operation. No MACH is needed
> + */
This hunk is supposed to be in some other patch.
> emit(MUL(result_dst, op[0], op[1]));
> }
> break;
> case ir_binop_imul_high: {
> struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
> -
> - emit(MUL(acc, op[0], op[1]));
> + if (brw->gen >= 8) {
> + emit(SHADER_OPCODE_MUL64, acc, op[0], op[1]);
I don't think we still want to be using the accumulator?
Or, we don't want to be using MUL64 here? The MUL+MACH macro only
works when the MUL does a 16x32 multiply, and we're doing a 32x32
multiply here.
> + } else {
> + emit(MUL(acc, op[0], op[1]));
> + }
> emit(MACH(result_dst, op[0], op[1]));
> break;
> }
> --
> 2.2.1
More information about the mesa-dev
mailing list