[Mesa-dev] [PATCH 5/8] gallium: remove TGSI opcode DPH

Sun Aug 20 00:49:11 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

use DP4 or DP3 + ADD.
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 17 ----------
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c    |  3 --
 src/gallium/auxiliary/nir/tgsi_to_nir.c            | 12 -------
 src/gallium/auxiliary/tgsi/tgsi_exec.c             | 33 -------------------
 src/gallium/auxiliary/tgsi/tgsi_info.c             |  2 +-
 src/gallium/auxiliary/tgsi/tgsi_lowering.c         | 38 +++-------------------
 src/gallium/auxiliary/tgsi/tgsi_lowering.h         |  1 -
 src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h       |  1 -
 src/gallium/auxiliary/tgsi/tgsi_util.c             |  4 ---
 src/gallium/docs/source/tgsi.rst                   | 11 +------
 src/gallium/drivers/etnaviv/etnaviv_compiler.c     | 28 ----------------
 src/gallium/drivers/i915/i915_fpc_optimize.c       |  1 -
 src/gallium/drivers/i915/i915_fpc_translate.c      | 11 -------
 .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  |  8 -----
 src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c   |  5 ---
 src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c   |  3 --
 src/gallium/drivers/r300/r300_tgsi_to_rc.c         |  1 -
 src/gallium/drivers/r600/r600_shader.c             | 13 ++------
 src/gallium/drivers/svga/svga_tgsi_insn.c          | 36 --------------------
 src/gallium/drivers/svga/svga_tgsi_vgpu10.c        | 36 --------------------
 src/gallium/include/pipe/p_shader_tokens.h         |  2 +-
 src/mesa/state_tracker/st_mesa_to_tgsi.c           | 13 ++++++--
 22 files changed, 22 insertions(+), 257 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 0319b88..f6baca0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -227,36 +227,20 @@ dp4_emit(
                                     emit_data->args[7] /* src1.w */);
    emit_data->output[emit_data->chan] = lp_build_emit_llvm_binary(bld_base,
                                                     TGSI_OPCODE_ADD, tmp0, tmp1);
 }
 
 static struct lp_build_tgsi_action dp4_action = {
    dp4_fetch_args,	 /* fetch_args */
    dp4_emit	 /* emit */
 };
 
-/* TGSI_OPCODE_DPH */
-static void
-dph_fetch_args(
-   struct lp_build_tgsi_context * bld_base,
-   struct lp_build_emit_data * emit_data)
-{
-   dp_fetch_args(bld_base, emit_data, 4);
-   /* src0.w */
-   emit_data->args[3] = bld_base->base.one;
-}
-
-const struct lp_build_tgsi_action dph_action = {
-   dph_fetch_args,	 /* fetch_args */
-   dp4_emit	 /* emit */
-};
-
 /* TGSI_OPCODE_DST */
 static void
 dst_fetch_args(
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
    /* src0.y */
    emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
                                             0, TGSI_CHAN_Y);
    /* src0.z */
@@ -1251,21 +1235,20 @@ u642d_emit(
                       emit_data->args[0],
                       bld_base->dbl_bld.vec_type, "");
 }
 
 void
 lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
 {
    bld_base->op_actions[TGSI_OPCODE_DP2] = dp2_action;
    bld_base->op_actions[TGSI_OPCODE_DP3] = dp3_action;
    bld_base->op_actions[TGSI_OPCODE_DP4] = dp4_action;
-   bld_base->op_actions[TGSI_OPCODE_DPH] = dph_action;
    bld_base->op_actions[TGSI_OPCODE_DST] = dst_action;
    bld_base->op_actions[TGSI_OPCODE_EXP] = exp_action;
    bld_base->op_actions[TGSI_OPCODE_LIT] = lit_action;
    bld_base->op_actions[TGSI_OPCODE_LOG] = log_action;
    bld_base->op_actions[TGSI_OPCODE_PK2H] = pk2h_action;
    bld_base->op_actions[TGSI_OPCODE_RSQ] = rsq_action;
    bld_base->op_actions[TGSI_OPCODE_SQRT] = sqrt_action;
    bld_base->op_actions[TGSI_OPCODE_POW] = pow_action;
    bld_base->op_actions[TGSI_OPCODE_SCS] = scs_action;
    bld_base->op_actions[TGSI_OPCODE_UP2H] = up2h_action;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index 3cc5079..46abda0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -586,23 +586,20 @@ lp_emit_instruction_aos(
       src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
       src0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X);
       src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
       src1 = swizzle_scalar_aos(bld, src1, TGSI_SWIZZLE_X);
       dst0 = lp_build_pow(&bld->bld_base.base, src0, src1);
       break;
 
    case TGSI_OPCODE_XPD:
       return FALSE;
 
-   case TGSI_OPCODE_DPH:
-      return FALSE;
-
    case TGSI_OPCODE_COS:
       src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
       tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X);
       dst0 = lp_build_cos(&bld->bld_base.base, tmp0);
       break;
 
    case TGSI_OPCODE_DDX:
       return FALSE;
 
    case TGSI_OPCODE_DDY:
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 3daa896..ea1f064 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -1012,27 +1012,20 @@ ttn_dp3(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
    ttn_move_dest(b, dest, nir_fdot3(b, src[0], src[1]));
 }
 
 static void
 ttn_dp4(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
 {
    ttn_move_dest(b, dest, nir_fdot4(b, src[0], src[1]));
 }
 
 static void
-ttn_dph(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest(b, dest, nir_fadd(b, nir_fdot3(b, src[0], src[1]),
-                                   ttn_channel(b, src[1], W)));
-}
-
-static void
 ttn_umad(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
 {
    ttn_move_dest(b, dest, nir_iadd(b, nir_imul(b, src[0], src[1]), src[2]));
 }
 
 static void
 ttn_arr(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
 {
    ttn_move_dest(b, dest, nir_ffloor(b, nir_fadd(b, src[0], nir_imm_float(b, 0.5))));
 }
@@ -1527,21 +1520,20 @@ static const nir_op op_trans[TGSI_OPCODE_LAST] = {
    [TGSI_OPCODE_MAD] = nir_op_ffma,
    [TGSI_OPCODE_LRP] = 0,
    [TGSI_OPCODE_SQRT] = nir_op_fsqrt,
    [TGSI_OPCODE_FRC] = nir_op_ffract,
    [TGSI_OPCODE_FLR] = nir_op_ffloor,
    [TGSI_OPCODE_ROUND] = nir_op_fround_even,
    [TGSI_OPCODE_EX2] = nir_op_fexp2,
    [TGSI_OPCODE_LG2] = nir_op_flog2,
    [TGSI_OPCODE_POW] = nir_op_fpow,
    [TGSI_OPCODE_XPD] = 0,
-   [TGSI_OPCODE_DPH] = 0,
    [TGSI_OPCODE_COS] = nir_op_fcos,
    [TGSI_OPCODE_DDX] = nir_op_fddx,
    [TGSI_OPCODE_DDY] = nir_op_fddy,
    [TGSI_OPCODE_KILL] = 0,
    [TGSI_OPCODE_PK2H] = 0, /* XXX */
    [TGSI_OPCODE_PK2US] = 0, /* XXX */
    [TGSI_OPCODE_PK4B] = 0, /* XXX */
    [TGSI_OPCODE_PK4UB] = 0, /* XXX */
    [TGSI_OPCODE_SEQ] = nir_op_seq,
    [TGSI_OPCODE_SGT] = 0,
@@ -1756,24 +1748,20 @@ ttn_emit_instruction(struct ttn_compile *c)
       break;
 
    case TGSI_OPCODE_DP3:
       ttn_dp3(b, op_trans[tgsi_op], dest, src);
       break;
 
    case TGSI_OPCODE_DP4:
       ttn_dp4(b, op_trans[tgsi_op], dest, src);
       break;
 
-   case TGSI_OPCODE_DPH:
-      ttn_dph(b, op_trans[tgsi_op], dest, src);
-      break;
-
    case TGSI_OPCODE_UMAD:
       ttn_umad(b, op_trans[tgsi_op], dest, src);
       break;
 
    case TGSI_OPCODE_LRP:
       ttn_move_dest(b, dest, nir_flrp(b, src[2], src[1], src[0]));
       break;
 
    case TGSI_OPCODE_KILL:
       ttn_kill(b, op_trans[tgsi_op], dest, src);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 34a4af6..c9265fb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -3178,49 +3178,20 @@ exec_dp4(struct tgsi_exec_machine *mach,
    }
 
    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
       }
    }
 }
 
 static void
-exec_dph(struct tgsi_exec_machine *mach,
-         const struct tgsi_full_instruction *inst)
-{
-   unsigned int chan;
-   union tgsi_exec_channel arg[3];
-
-   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
-   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
-   micro_mul(&arg[2], &arg[0], &arg[1]);
-
-   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
-   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
-   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
-
-   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
-   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
-   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
-
-   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
-   micro_add(&arg[0], &arg[0], &arg[1]);
-
-   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
-         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
-      }
-   }
-}
-
-static void
 exec_dp2(struct tgsi_exec_machine *mach,
          const struct tgsi_full_instruction *inst)
 {
    unsigned int chan;
    union tgsi_exec_channel arg[3];
 
    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
    micro_mul(&arg[2], &arg[0], &arg[1]);
 
@@ -5179,24 +5150,20 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_POW:
       exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_XPD:
       exec_xpd(mach, inst);
       break;
 
-   case TGSI_OPCODE_DPH:
-      exec_dph(mach, inst);
-      break;
-
    case TGSI_OPCODE_COS:
       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_DDX:
       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_DDY:
       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index c31705a..b247794 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -65,21 +65,21 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 1, 2, 1, 0, 0, 0, 0, OTHR, "TXF_LZ", TGSI_OPCODE_TXF_LZ },
    { 1, 1, 0, 0, 0, 0, 0, COMP, "FLR", TGSI_OPCODE_FLR },
    { 1, 1, 0, 0, 0, 0, 0, COMP, "ROUND", TGSI_OPCODE_ROUND },
    { 1, 1, 0, 0, 0, 0, 0, REPL, "EX2", TGSI_OPCODE_EX2 },
    { 1, 1, 0, 0, 0, 0, 0, REPL, "LG2", TGSI_OPCODE_LG2 },
    { 1, 2, 0, 0, 0, 0, 0, REPL, "POW", TGSI_OPCODE_POW },
    { 1, 2, 0, 0, 0, 0, 0, COMP, "XPD", TGSI_OPCODE_XPD },
    { 1, 1, 0, 0, 0, 0, 0, COMP, "U2I64", TGSI_OPCODE_U2I64 },
    { 1, 0, 0, 0, 0, 0, 0, OTHR, "CLOCK", TGSI_OPCODE_CLOCK },
    { 1, 1, 0, 0, 0, 0, 0, COMP, "I2I64", TGSI_OPCODE_I2I64 },
-   { 1, 2, 0, 0, 0, 0, 0, REPL, "DPH", TGSI_OPCODE_DPH },
+   { 1, 2, 0, 0, 0, 0, 0, REPL, "", 35 }, /* removed */
    { 1, 1, 0, 0, 0, 0, 0, REPL, "COS", TGSI_OPCODE_COS },
    { 1, 1, 0, 0, 0, 0, 0, COMP, "DDX", TGSI_OPCODE_DDX },
    { 1, 1, 0, 0, 0, 0, 0, COMP, "DDY", TGSI_OPCODE_DDY },
    { 0, 0, 0, 0, 0, 0, 0, NONE, "KILL", TGSI_OPCODE_KILL },
    { 1, 1, 0, 0, 0, 0, 0, REPL, "PK2H", TGSI_OPCODE_PK2H },
    { 1, 1, 0, 0, 0, 0, 0, REPL, "PK2US", TGSI_OPCODE_PK2US },
    { 1, 1, 0, 0, 0, 0, 0, REPL, "PK4B", TGSI_OPCODE_PK4B },
    { 1, 1, 0, 0, 0, 0, 0, REPL, "PK4UB", TGSI_OPCODE_PK4UB },
    { 1, 1, 0, 0, 0, 0, 1, COMP, "D2U64", TGSI_OPCODE_D2U64 },
    { 1, 2, 0, 0, 0, 0, 0, COMP, "SEQ", TGSI_OPCODE_SEQ },
diff --git a/src/gallium/auxiliary/tgsi/tgsi_lowering.c b/src/gallium/auxiliary/tgsi/tgsi_lowering.c
index 3013a41..f3b5ade 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_lowering.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_lowering.c
@@ -907,48 +907,42 @@ transform_log(struct tgsi_transform_context *tctx,
       tctx->emit_instruction(tctx, &new_inst);
    }
 }
 
 /* DP4 - 4-component Dot Product
  *   dst = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src0.w \times src1.w
  *
  * DP3 - 3-component Dot Product
  *   dst = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z
  *
- * DPH - Homogeneous Dot Product
- *   dst = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
- *
  * DP2 - 2-component Dot Product
  *   dst = src0.x \times src1.x + src0.y \times src1.y
  *
  * NOTE: these are translated into sequence of MUL/MAD(/ADD) scalar
  * operations, which is what you'd prefer for a ISA that is natively
  * scalar.  Probably a native vector ISA would at least already have
  * DP4/DP3 instructions, but perhaps there is room for an alternative
- * translation for DPH/DP2 using vector instructions.
+ * translation for DP2 using vector instructions.
  *
  * ; needs: 1 tmp
  * MUL tmpA.x, src0.x, src1.x
  * MAD tmpA.x, src0.y, src1.y, tmpA.x
- * if (DPH || DP3 || DP4) {
+ * if (DP3 || DP4) {
  *   MAD tmpA.x, src0.z, src1.z, tmpA.x
- *   if (DPH) {
- *     ADD tmpA.x, src1.w, tmpA.x
- *   } else if (DP4) {
+ *   if (DP4) {
  *     MAD tmpA.x, src0.w, src1.w, tmpA.x
  *   }
  * }
  * ; fixup last instruction to replicate into dst
  */
 #define DP4_GROW  (NINST(2) + NINST(3) + NINST(3) + NINST(3) - OINST(2))
 #define DP3_GROW  (NINST(2) + NINST(3) + NINST(3) - OINST(2))
-#define DPH_GROW  (NINST(2) + NINST(3) + NINST(3) + NINST(2) - OINST(2))
 #define DP2_GROW  (NINST(2) + NINST(3) - OINST(2))
 #define DOTP_TMP  1
 static void
 transform_dotp(struct tgsi_transform_context *tctx,
                struct tgsi_full_instruction *inst)
 {
    struct tgsi_lowering_context *ctx = tgsi_lowering_context(tctx);
    struct tgsi_full_dst_register *dst  = &inst->Dst[0];
    struct tgsi_full_src_register *src0 = &inst->Src[0];
    struct tgsi_full_src_register *src1 = &inst->Src[1];
@@ -973,47 +967,35 @@ transform_dotp(struct tgsi_transform_context *tctx,
       /* MAD tmpA.x, src0.y, src1.y, tmpA.x */
       new_inst = tgsi_default_full_instruction();
       new_inst.Instruction.Opcode = TGSI_OPCODE_MAD;
       new_inst.Instruction.NumDstRegs = 1;
       reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_X);
       new_inst.Instruction.NumSrcRegs = 3;
       reg_src(&new_inst.Src[0], src0, SWIZ(Y, Y, Y, Y));
       reg_src(&new_inst.Src[1], src1, SWIZ(Y, Y, Y, Y));
       reg_src(&new_inst.Src[2], &ctx->tmp[A].src, SWIZ(X, X, X, X));
 
-      if ((opcode == TGSI_OPCODE_DPH) ||
-          (opcode == TGSI_OPCODE_DP3) ||
+      if ((opcode == TGSI_OPCODE_DP3) ||
           (opcode == TGSI_OPCODE_DP4)) {
          tctx->emit_instruction(tctx, &new_inst);
 
          /* MAD tmpA.x, src0.z, src1.z, tmpA.x */
          new_inst = tgsi_default_full_instruction();
          new_inst.Instruction.Opcode = TGSI_OPCODE_MAD;
          new_inst.Instruction.NumDstRegs = 1;
          reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_X);
          new_inst.Instruction.NumSrcRegs = 3;
          reg_src(&new_inst.Src[0], src0, SWIZ(Z, Z, Z, Z));
          reg_src(&new_inst.Src[1], src1, SWIZ(Z, Z, Z, Z));
          reg_src(&new_inst.Src[2], &ctx->tmp[A].src, SWIZ(X, X, X, X));
 
-         if (opcode == TGSI_OPCODE_DPH) {
-            tctx->emit_instruction(tctx, &new_inst);
-
-            /* ADD tmpA.x, src1.w, tmpA.x */
-            new_inst = tgsi_default_full_instruction();
-            new_inst.Instruction.Opcode = TGSI_OPCODE_ADD;
-            new_inst.Instruction.NumDstRegs = 1;
-            reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_X);
-            new_inst.Instruction.NumSrcRegs = 2;
-            reg_src(&new_inst.Src[0], src1, SWIZ(W, W, W, W));
-            reg_src(&new_inst.Src[1], &ctx->tmp[A].src, SWIZ(X, X, X, X));
-         } else if (opcode == TGSI_OPCODE_DP4) {
+         if (opcode == TGSI_OPCODE_DP4) {
             tctx->emit_instruction(tctx, &new_inst);
 
             /* MAD tmpA.x, src0.w, src1.w, tmpA.x */
             new_inst = tgsi_default_full_instruction();
             new_inst.Instruction.Opcode = TGSI_OPCODE_MAD;
             new_inst.Instruction.NumDstRegs = 1;
             reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_X);
             new_inst.Instruction.NumSrcRegs = 3;
             reg_src(&new_inst.Src[0], src0, SWIZ(W, W, W, W));
             reg_src(&new_inst.Src[1], src1, SWIZ(W, W, W, W));
@@ -1527,25 +1509,20 @@ transform_instr(struct tgsi_transform_context *tctx,
    case TGSI_OPCODE_DP4:
       if (!ctx->config->lower_DP4)
          goto skip;
       transform_dotp(tctx, inst);
       break;
    case TGSI_OPCODE_DP3:
       if (!ctx->config->lower_DP3)
          goto skip;
       transform_dotp(tctx, inst);
       break;
-   case TGSI_OPCODE_DPH:
-      if (!ctx->config->lower_DPH)
-         goto skip;
-      transform_dotp(tctx, inst);
-      break;
    case TGSI_OPCODE_DP2:
       if (!ctx->config->lower_DP2)
          goto skip;
       transform_dotp(tctx, inst);
       break;
    case TGSI_OPCODE_FLR:
       if (!ctx->config->lower_FLR)
          goto skip;
       transform_flr_ceil(tctx, inst);
       break;
@@ -1625,21 +1602,20 @@ tgsi_transform_lowering(const struct tgsi_lowering_config *config,
          OPCS(XPD) ||
          OPCS(SCS) ||
          OPCS(LRP) ||
          OPCS(FRC) ||
          OPCS(POW) ||
          OPCS(LIT) ||
          OPCS(EXP) ||
          OPCS(LOG) ||
          OPCS(DP4) ||
          OPCS(DP3) ||
-         OPCS(DPH) ||
          OPCS(DP2) ||
          OPCS(FLR) ||
          OPCS(CEIL) ||
          OPCS(TRUNC) ||
          OPCS(TXP) ||
          ctx.two_side_colors ||
          ctx.saturate))
       return NULL;
 
 #if 0  /* debug */
@@ -1686,24 +1662,20 @@ tgsi_transform_lowering(const struct tgsi_lowering_config *config,
       numtmp = MAX2(numtmp, LOG_TMP);
    }
    if (OPCS(DP4)) {
       newlen += DP4_GROW * OPCS(DP4);
       numtmp = MAX2(numtmp, DOTP_TMP);
    }
    if (OPCS(DP3)) {
       newlen += DP3_GROW * OPCS(DP3);
       numtmp = MAX2(numtmp, DOTP_TMP);
    }
-   if (OPCS(DPH)) {
-      newlen += DPH_GROW * OPCS(DPH);
-      numtmp = MAX2(numtmp, DOTP_TMP);
-   }
    if (OPCS(DP2)) {
       newlen += DP2_GROW * OPCS(DP2);
       numtmp = MAX2(numtmp, DOTP_TMP);
    }
    if (OPCS(FLR)) {
       newlen += FLR_GROW * OPCS(FLR);
       numtmp = MAX2(numtmp, FLR_TMP);
    }
    if (OPCS(CEIL)) {
       newlen += CEIL_GROW * OPCS(CEIL);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_lowering.h b/src/gallium/auxiliary/tgsi/tgsi_lowering.h
index 85e4b8e..f65d915 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_lowering.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_lowering.h
@@ -58,21 +58,20 @@ struct tgsi_lowering_config
    unsigned lower_XPD:1;
    unsigned lower_SCS:1;
    unsigned lower_LRP:1;
    unsigned lower_FRC:1;
    unsigned lower_POW:1;
    unsigned lower_LIT:1;
    unsigned lower_EXP:1;
    unsigned lower_LOG:1;
    unsigned lower_DP4:1;
    unsigned lower_DP3:1;
-   unsigned lower_DPH:1;
    unsigned lower_DP2:1;
    unsigned lower_FLR:1;
    unsigned lower_CEIL:1;
    unsigned lower_TRUNC:1;
 
    /* bitmask of (1 << TGSI_TEXTURE_type): */
    unsigned lower_TXP;
 
    /* To emulate certain texture wrap modes, this can be used
     * to saturate the specified tex coord to [0.0, 1.0].  The
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index e4cbdae..68b4af8 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -57,21 +57,20 @@ OP12_TEX(TEX_LZ)
 OP13(LRP)
 OP11(SQRT)
 OP11(FRC)
 OP12_TEX(TXF_LZ)
 OP11(FLR)
 OP11(ROUND)
 OP11(EX2)
 OP11(LG2)
 OP12(POW)
 OP12(XPD)
-OP12(DPH)
 OP11(COS)
 OP11(DDX)
 OP11(DDY)
 OP11(DDX_FINE)
 OP11(DDY_FINE)
 OP00(KILL)
 OP11(PK2H)
 OP11(PK2US)
 OP11(PK4B)
 OP11(PK4UB)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index fc61351..6756bca 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -267,24 +267,20 @@ tgsi_util_get_inst_usage_mask(const struct tgsi_full_instruction *inst,
       break;
 
    case TGSI_OPCODE_DP3:
       read_mask = TGSI_WRITEMASK_XYZ;
       break;
 
    case TGSI_OPCODE_DP4:
       read_mask = TGSI_WRITEMASK_XYZW;
       break;
 
-   case TGSI_OPCODE_DPH:
-      read_mask = src_idx == 0 ? TGSI_WRITEMASK_XYZ : TGSI_WRITEMASK_XYZW;
-      break;
-
    case TGSI_OPCODE_TEX:
    case TGSI_OPCODE_TXD:
    case TGSI_OPCODE_TXB:
    case TGSI_OPCODE_TXL:
    case TGSI_OPCODE_TXP:
       if (src_idx == 0) {
          /* Note that the SHADOW variants use the Z component too */
          switch (inst->Texture.Texture) {
          case TGSI_TEXTURE_1D:
             read_mask = TGSI_WRITEMASK_X;
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 6602162..54767a7 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -356,29 +356,20 @@ This instruction replicates its result.
 
   dst.x = src0.y \times src1.z - src1.y \times src0.z
 
   dst.y = src0.z \times src1.x - src1.z \times src0.x
 
   dst.z = src0.x \times src1.y - src1.x \times src0.y
 
   dst.w = 1
 
 
-.. opcode:: DPH - Homogeneous Dot Product
-
-This instruction replicates its result.
-
-.. math::
-
-  dst = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
-
-
 .. opcode:: COS - Cosine
 
 This instruction replicates its result.
 
 .. math::
 
   dst = \cos{src.x}
 
 
 .. opcode:: DDX, DDX_FINE - Derivative Relative To X
@@ -3665,21 +3656,21 @@ is known all three should be at least 1. If it is unknown they should all be set
 to 0 or not set.
 
 MUL_ZERO_WINS
 """""""""""""
 
 The MUL TGSI operation (FP32 multiplication) will return 0 if either
 of the operands are equal to 0. That means that 0 * Inf = 0. This
 should be set the same way for an entire pipeline. Note that this
 applies not only to the literal MUL TGSI opcode, but all FP32
 multiplications implied by other operations, such as MAD, FMA, DP2,
-DP3, DP4, DPH, DST, LOG, LRP, XPD, and possibly others. If there is a
+DP3, DP4, DST, LOG, LRP, XPD, and possibly others. If there is a
 mismatch between shaders, then it is unspecified whether this behavior
 will be enabled.
 
 FS_POST_DEPTH_COVERAGE
 """"""""""""""""""""""
 
 When enabled, the input for TGSI_SEMANTIC_SAMPLEMASK will exclude samples
 that have failed the depth/stencil tests. This is only valid when
 FS_EARLY_DEPTH_STENCIL is also specified.
 
diff --git a/src/gallium/drivers/etnaviv/etnaviv_compiler.c b/src/gallium/drivers/etnaviv/etnaviv_compiler.c
index 88f204c..3ccb737 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_compiler.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_compiler.c
@@ -1657,47 +1657,20 @@ trans_lg2(const struct instr_translater *t, struct etna_compile *c,
       emit_inst(c, &(struct etna_inst) {
          .opcode = INST_OPCODE_LOG,
          .sat = inst->Instruction.Saturate,
          .dst = convert_dst(c, &inst->Dst[0]),
          .src[2] = src[0],
       });
    }
 }
 
 static void
-trans_dph(const struct instr_translater *t, struct etna_compile *c,
-          const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
-{
-   /*
-   DP3 tmp.xyzw, src0.xyzw, src1,xyzw, void
-   ADD dst.xyzw, tmp.xyzw, void, src1.wwww
-   */
-   struct etna_native_reg temp = etna_compile_get_inner_temp(c);
-   struct etna_inst ins[2] = { };
-
-   ins[0].opcode = INST_OPCODE_DP3;
-   ins[0].dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
-                                         INST_COMPS_Z | INST_COMPS_W);
-   ins[0].src[0] = src[0];
-   ins[0].src[1] = src[1];
-
-   ins[1].opcode = INST_OPCODE_ADD;
-   ins[1].sat = inst->Instruction.Saturate;
-   ins[1].dst = convert_dst(c, &inst->Dst[0]);
-   ins[1].src[0] = etna_native_to_src(temp, INST_SWIZ_IDENTITY);
-   ins[1].src[2] = swizzle(src[1], SWIZZLE(W, W, W, W));
-
-   emit_inst(c, &ins[0]);
-   emit_inst(c, &ins[1]);
-}
-
-static void
 trans_sampler(const struct instr_translater *t, struct etna_compile *c,
               const struct tgsi_full_instruction *inst,
               struct etna_inst_src *src)
 {
    /* There is no native support for GL texture rectangle coordinates, so
     * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, 1]). */
    if (inst->Texture.Texture == TGSI_TEXTURE_RECT) {
       uint32_t unit = inst->Src[1].Register.Index;
       struct etna_inst ins[2] = { };
       struct etna_native_reg temp = etna_compile_get_inner_temp(c);
@@ -1826,21 +1799,20 @@ static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
    INSTR(BRK, trans_brk),
    INSTR(CONT, trans_cont),
 
    INSTR(MIN, trans_min_max, .opc = INST_OPCODE_SELECT, .cond = INST_CONDITION_GT),
    INSTR(MAX, trans_min_max, .opc = INST_OPCODE_SELECT, .cond = INST_CONDITION_LT),
 
    INSTR(ARL, trans_arl),
    INSTR(LRP, trans_lrp),
    INSTR(LIT, trans_lit),
    INSTR(SSG, trans_ssg),
-   INSTR(DPH, trans_dph),
 
    INSTR(SIN, trans_trig),
    INSTR(COS, trans_trig),
    INSTR(SCS, trans_trig),
 
    INSTR(SLT, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_LT),
    INSTR(SGE, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_GE),
    INSTR(SEQ, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_EQ),
    INSTR(SGT, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_GT),
    INSTR(SLE, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_LE),
diff --git a/src/gallium/drivers/i915/i915_fpc_optimize.c b/src/gallium/drivers/i915/i915_fpc_optimize.c
index 5f2a876..da06e16 100644
--- a/src/gallium/drivers/i915/i915_fpc_optimize.c
+++ b/src/gallium/drivers/i915/i915_fpc_optimize.c
@@ -78,21 +78,20 @@ static const struct {
 } op_table [TGSI_OPCODE_LAST] = {
    [ TGSI_OPCODE_ADD     ] = { false,   true,  TGSI_SWIZZLE_ZERO,  1,  2 },
    [ TGSI_OPCODE_CEIL    ] = { false,  false,                  0,  1,  1 },
    [ TGSI_OPCODE_CMP     ] = { false,  false,                  0,  1,  2 },
    [ TGSI_OPCODE_COS     ] = { false,  false,                  0,  1,  1 },
    [ TGSI_OPCODE_DDX     ] = { false,  false,                  0,  1,  0 },
    [ TGSI_OPCODE_DDY     ] = { false,  false,                  0,  1,  0 },
    [ TGSI_OPCODE_DP2     ] = { false,   true,   TGSI_SWIZZLE_ONE,  1,  2 },
    [ TGSI_OPCODE_DP3     ] = { false,   true,   TGSI_SWIZZLE_ONE,  1,  2 },
    [ TGSI_OPCODE_DP4     ] = { false,   true,   TGSI_SWIZZLE_ONE,  1,  2 },
-   [ TGSI_OPCODE_DPH     ] = { false,  false,                  0,  1,  2 },
    [ TGSI_OPCODE_DST     ] = { false,  false,                  0,  1,  2 },
    [ TGSI_OPCODE_END     ] = { false,  false,                  0,  0,  0 },
    [ TGSI_OPCODE_EX2     ] = { false,  false,                  0,  1,  1 },
    [ TGSI_OPCODE_FLR     ] = { false,  false,                  0,  1,  1 },
    [ TGSI_OPCODE_FRC     ] = { false,  false,                  0,  1,  1 },
    [ TGSI_OPCODE_KILL_IF ] = { false,  false,                  0,  0,  1 },
    [ TGSI_OPCODE_KILL    ] = { false,  false,                  0,  0,  0 },
    [ TGSI_OPCODE_LG2     ] = { false,  false,                  0,  1,  1 },
    [ TGSI_OPCODE_LIT     ] = { false,  false,                  0,  1,  1 },
    [ TGSI_OPCODE_LRP     ] = { false,  false,                  0,  1,  3 },
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index 241c92d..22a42ee 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -597,31 +597,20 @@ i915_translate_instruction(struct i915_fp_compile *p,
       break;
 
    case TGSI_OPCODE_DP3:
       emit_simple_arith(p, inst, A0_DP3, 2, fs);
       break;
 
    case TGSI_OPCODE_DP4:
       emit_simple_arith(p, inst, A0_DP4, 2, fs);
       break;
 
-   case TGSI_OPCODE_DPH:
-      src0 = src_vector(p, &inst->Src[0], fs);
-      src1 = src_vector(p, &inst->Src[1], fs);
-
-      i915_emit_arith(p,
-                      A0_DP4,
-                      get_result_vector(p, &inst->Dst[0]),
-                      get_result_flags(inst), 0,
-                      swizzle(src0, X, Y, Z, ONE), src1, 0);
-      break;
-
    case TGSI_OPCODE_DST:
       src0 = src_vector(p, &inst->Src[0], fs);
       src1 = src_vector(p, &inst->Src[1], fs);
 
       /* result[0] = 1    * 1;
        * result[1] = a[1] * b[1];
        * result[2] = a[2] * 1;
        * result[3] = 1    * b[3];
        */
       i915_emit_arith(p,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 23c7d10..c06e74e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -270,21 +270,20 @@ unsigned int Instruction::srcMask(unsigned int s) const
 
    switch (insn->Instruction.Opcode) {
    case TGSI_OPCODE_COS:
    case TGSI_OPCODE_SIN:
       return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
    case TGSI_OPCODE_DP2:
       return 0x3;
    case TGSI_OPCODE_DP3:
       return 0x7;
    case TGSI_OPCODE_DP4:
-   case TGSI_OPCODE_DPH:
    case TGSI_OPCODE_KILL_IF: /* WriteMask ignored */
       return 0xf;
    case TGSI_OPCODE_DST:
       return mask & (s ? 0xa : 0x6);
    case TGSI_OPCODE_EX2:
    case TGSI_OPCODE_EXP:
    case TGSI_OPCODE_LG2:
    case TGSI_OPCODE_LOG:
    case TGSI_OPCODE_POW:
    case TGSI_OPCODE_RCP:
@@ -3314,27 +3313,20 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
    case TGSI_OPCODE_DP3:
       val0 = buildDot(3);
       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
          mkMov(dst0[c], val0);
       break;
    case TGSI_OPCODE_DP4:
       val0 = buildDot(4);
       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
          mkMov(dst0[c], val0);
       break;
-   case TGSI_OPCODE_DPH:
-      val0 = buildDot(3);
-      src1 = fetchSrc(1, 3);
-      mkOp2(OP_ADD, TYPE_F32, val0, val0, src1);
-      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
-         mkMov(dst0[c], val0);
-      break;
    case TGSI_OPCODE_DST:
       if (dst0[0])
          loadImm(dst0[0], 1.0f);
       if (dst0[1]) {
          src0 = fetchSrc(0, 1);
          src1 = fetchSrc(1, 1);
          mkOp2(OP_MUL, TYPE_F32, dst0[1], src0, src1)
             ->dnz = info->io.mul_zero_wins;
       }
       if (dst0[2])
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
index 278a8a4..a3ed5c6 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
@@ -584,25 +584,20 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
       tmp = nvfx_src(temp(fpc));
       nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], src[1], none));
       nvfx_fp_emit(fpc, arith(0, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none));
       break;
    case TGSI_OPCODE_DP3:
       nvfx_fp_emit(fpc, arith(sat, DP3, dst, mask, src[0], src[1], none));
       break;
    case TGSI_OPCODE_DP4:
       nvfx_fp_emit(fpc, arith(sat, DP4, dst, mask, src[0], src[1], none));
       break;
-   case TGSI_OPCODE_DPH:
-      tmp = nvfx_src(temp(fpc));
-      nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[1], none));
-      nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, swz(tmp, X, X, X, X), swz(src[1], W, W, W, W), none));
-      break;
    case TGSI_OPCODE_DST:
       nvfx_fp_emit(fpc, arith(sat, DST, dst, mask, src[0], src[1], none));
       break;
    case TGSI_OPCODE_EX2:
       nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, src[0], none, none));
       break;
    case TGSI_OPCODE_FLR:
       nvfx_fp_emit(fpc, arith(sat, FLR, dst, mask, src[0], none, none));
       break;
    case TGSI_OPCODE_FRC:
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
index bec9975..8ba3d5a 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
@@ -581,23 +581,20 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
       tmp = nvfx_src(temp(vpc));
       nvfx_vp_emit(vpc, arith(0, VEC, MUL, tmp.reg, NVFX_VP_MASK_X | NVFX_VP_MASK_Y, src[0], src[1], none));
       nvfx_vp_emit(vpc, arith(sat, VEC, ADD, dst, mask, swz(tmp, X, X, X, X), none, swz(tmp, Y, Y, Y, Y)));
       break;
    case TGSI_OPCODE_DP3:
       nvfx_vp_emit(vpc, arith(sat, VEC, DP3, dst, mask, src[0], src[1], none));
       break;
    case TGSI_OPCODE_DP4:
       nvfx_vp_emit(vpc, arith(sat, VEC, DP4, dst, mask, src[0], src[1], none));
       break;
-   case TGSI_OPCODE_DPH:
-      nvfx_vp_emit(vpc, arith(sat, VEC, DPH, dst, mask, src[0], src[1], none));
-      break;
    case TGSI_OPCODE_DST:
       nvfx_vp_emit(vpc, arith(sat, VEC, DST, dst, mask, src[0], src[1], none));
       break;
    case TGSI_OPCODE_EX2:
       nvfx_vp_emit(vpc, arith(sat, SCA, EX2, dst, mask, none, none, src[0]));
       break;
    case TGSI_OPCODE_EXP:
       nvfx_vp_emit(vpc, arith(sat, SCA, EXP, dst, mask, none, none, src[0]));
       break;
    case TGSI_OPCODE_FLR:
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index cc0ac48..78af124 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -51,21 +51,20 @@ static unsigned translate_opcode(unsigned opcode)
         case TGSI_OPCODE_SGE: return RC_OPCODE_SGE;
         case TGSI_OPCODE_MAD: return RC_OPCODE_MAD;
         case TGSI_OPCODE_LRP: return RC_OPCODE_LRP;
         case TGSI_OPCODE_FRC: return RC_OPCODE_FRC;
         case TGSI_OPCODE_FLR: return RC_OPCODE_FLR;
         case TGSI_OPCODE_ROUND: return RC_OPCODE_ROUND;
         case TGSI_OPCODE_EX2: return RC_OPCODE_EX2;
         case TGSI_OPCODE_LG2: return RC_OPCODE_LG2;
         case TGSI_OPCODE_POW: return RC_OPCODE_POW;
         case TGSI_OPCODE_XPD: return RC_OPCODE_XPD;
-        case TGSI_OPCODE_DPH: return RC_OPCODE_DPH;
         case TGSI_OPCODE_COS: return RC_OPCODE_COS;
         case TGSI_OPCODE_DDX: return RC_OPCODE_DDX;
         case TGSI_OPCODE_DDY: return RC_OPCODE_DDY;
         case TGSI_OPCODE_KILL: return RC_OPCODE_KILP;
      /* case TGSI_OPCODE_PK2H: return RC_OPCODE_PK2H; */
      /* case TGSI_OPCODE_PK2US: return RC_OPCODE_PK2US; */
      /* case TGSI_OPCODE_PK4B: return RC_OPCODE_PK4B; */
      /* case TGSI_OPCODE_PK4UB: return RC_OPCODE_PK4UB; */
         case TGSI_OPCODE_SEQ: return RC_OPCODE_SEQ;
         case TGSI_OPCODE_SGT: return RC_OPCODE_SGT;
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index d3728fb..fd76c93 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -6610,27 +6610,20 @@ static int tgsi_dp(struct r600_shader_ctx *ctx)
 				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
 				alu.src[0].chan = alu.src[1].chan = 0;
 			}
 			break;
 		case TGSI_OPCODE_DP3:
 			if (i > 2) {
 				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
 				alu.src[0].chan = alu.src[1].chan = 0;
 			}
 			break;
-		case TGSI_OPCODE_DPH:
-			if (i == 3) {
-				alu.src[0].sel = V_SQ_ALU_SRC_1;
-				alu.src[0].chan = 0;
-				alu.src[0].neg = 0;
-			}
-			break;
 		default:
 			break;
 		}
 		if (i == 3) {
 			alu.last = 1;
 		}
 		r = r600_bytecode_add_alu(ctx->bc, &alu);
 		if (r)
 			return r;
 	}
@@ -9096,21 +9089,21 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[]
 	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
 	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
 	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
-	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
+	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
 	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
 	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
 	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
 	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
@@ -9294,21 +9287,21 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
 	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
 	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
 	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
-	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
+	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
 	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
 	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
 	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
 	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
 	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
@@ -9517,21 +9510,21 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
 	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
 	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
 	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
-	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
+	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
 	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
 	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
 	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
 	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
 	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
 	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
 	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index dd29f74..928330c 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -1230,53 +1230,20 @@ emit_dp2(struct svga_shader_emitter *emit,
    /* ADD  DST, TMP.xxxx, TMP.yyyy */
    if (!submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst,
                     temp_src0, temp_src1 ))
       return FALSE;
 
    return TRUE;
 }
 
 
 /**
- * Translate the following TGSI DPH instruction.
- *    DPH  DST, SRC1, SRC2
- * To the following SVGA3D instruction sequence.
- *    DP3  TMP, SRC1, SRC2
- *    ADD  DST, TMP, SRC2.wwww
- */
-static boolean
-emit_dph(struct svga_shader_emitter *emit,
-         const struct tgsi_full_instruction *insn )
-{
-   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
-   const struct src_register src0 = translate_src_register(
-      emit, &insn->Src[0] );
-   struct src_register src1 =
-      translate_src_register(emit, &insn->Src[1]);
-   SVGA3dShaderDestToken temp = get_temp( emit );
-
-   /* DP3  TMP, SRC1, SRC2 */
-   if (!submit_op2( emit, inst_token( SVGA3DOP_DP3 ), temp, src0, src1 ))
-      return FALSE;
-
-   src1 = scalar(src1, TGSI_SWIZZLE_W);
-
-   /* ADD  DST, TMP, SRC2.wwww */
-   if (!submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst,
-                    src( temp ), src1 ))
-      return FALSE;
-
-   return TRUE;
-}
-
-
-/**
  * Sine / Cosine helper function.
  */
 static boolean
 do_emit_sincos(struct svga_shader_emitter *emit,
                SVGA3dShaderDestToken dst,
                struct src_register src0)
 {
    src0 = scalar(src0, TGSI_SWIZZLE_X);
    return submit_op1(emit, inst_token(SVGA3DOP_SINCOS), dst, src0);
 }
@@ -2917,23 +2884,20 @@ svga_emit_instruction(struct svga_shader_emitter *emit,
 
    case TGSI_OPCODE_CMP:
       return emit_cmp( emit, insn );
 
    case TGSI_OPCODE_DIV:
       return emit_div( emit, insn );
 
    case TGSI_OPCODE_DP2:
       return emit_dp2( emit, insn );
 
-   case TGSI_OPCODE_DPH:
-      return emit_dph( emit, insn );
-
    case TGSI_OPCODE_COS:
       return emit_cos( emit, insn );
 
    case TGSI_OPCODE_SIN:
       return emit_sin( emit, insn );
 
    case TGSI_OPCODE_SCS:
       return emit_sincos( emit, insn );
 
    case TGSI_OPCODE_END:
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index 56afd49..e57e78d 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -3571,54 +3571,20 @@ emit_cmp(struct svga_shader_emitter_v10 *emit,
                         &tmp_src, &inst->Src[1], &inst->Src[2],
                         inst->Instruction.Saturate);
 
    free_temp_indexes(emit);
 
    return TRUE;
 }
 
 
 /**
- * Emit code for TGSI_OPCODE_DPH instruction.
- */
-static boolean
-emit_dph(struct svga_shader_emitter_v10 *emit,
-         const struct tgsi_full_instruction *inst)
-{
-   /*
-    * DP3 tmp, s0, s1
-    * ADD dst, tmp, s1.wwww
-    */
-
-   struct tgsi_full_src_register s1_wwww =
-      swizzle_src(&inst->Src[1], TGSI_SWIZZLE_W, TGSI_SWIZZLE_W,
-                  TGSI_SWIZZLE_W, TGSI_SWIZZLE_W);
-
-   unsigned tmp = get_temp_index(emit);
-   struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp);
-   struct tgsi_full_dst_register tmp_dst = make_dst_temp_reg(tmp);
-
-   /* DP3 tmp, s0, s1 */
-   emit_instruction_op2(emit, VGPU10_OPCODE_DP3, &tmp_dst, &inst->Src[0],
-                        &inst->Src[1], FALSE);
-
-   /* ADD dst, tmp, s1.wwww */
-   emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &inst->Dst[0], &tmp_src,
-                        &s1_wwww, inst->Instruction.Saturate);
-
-   free_temp_indexes(emit);
-
-   return TRUE;
-}
-
-
-/**
  * Emit code for TGSI_OPCODE_DST instruction.
  */
 static boolean
 emit_dst(struct svga_shader_emitter_v10 *emit,
          const struct tgsi_full_instruction *inst)
 {
    /*
     * dst.x = 1
     * dst.y = src0.y * src1.y
     * dst.z = src0.z
@@ -5705,22 +5671,20 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
       return emit_arl_uarl(emit, inst);
    case TGSI_OPCODE_BGNSUB:
       /* no-op */
       return TRUE;
    case TGSI_OPCODE_CAL:
       return emit_cal(emit, inst);
    case TGSI_OPCODE_CMP:
       return emit_cmp(emit, inst);
    case TGSI_OPCODE_COS:
       return emit_sincos(emit, inst);
-   case TGSI_OPCODE_DPH:
-      return emit_dph(emit, inst);
    case TGSI_OPCODE_DST:
       return emit_dst(emit, inst);
    case TGSI_OPCODE_EX2:
       return emit_ex2(emit, inst);
    case TGSI_OPCODE_EXP:
       return emit_exp(emit, inst);
    case TGSI_OPCODE_IF:
       return emit_if(emit, inst);
    case TGSI_OPCODE_KILL:
       return emit_kill(emit, inst);
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 5c4af89..af0d025 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -365,21 +365,21 @@ struct tgsi_property_data {
 #define TGSI_OPCODE_TXF_LZ              25
 #define TGSI_OPCODE_FLR                 26
 #define TGSI_OPCODE_ROUND               27
 #define TGSI_OPCODE_EX2                 28
 #define TGSI_OPCODE_LG2                 29
 #define TGSI_OPCODE_POW                 30
 #define TGSI_OPCODE_XPD                 31
 #define TGSI_OPCODE_U2I64               32
 #define TGSI_OPCODE_CLOCK               33
 #define TGSI_OPCODE_I2I64               34
-#define TGSI_OPCODE_DPH                 35
+/* gap */
 #define TGSI_OPCODE_COS                 36
 #define TGSI_OPCODE_DDX                 37
 #define TGSI_OPCODE_DDY                 38
 #define TGSI_OPCODE_KILL                39 /* unconditional */
 #define TGSI_OPCODE_PK2H                40
 #define TGSI_OPCODE_PK2US               41
 #define TGSI_OPCODE_PK4B                42
 #define TGSI_OPCODE_PK4UB               43
 #define TGSI_OPCODE_D2U64               44
 #define TGSI_OPCODE_SEQ                 45
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index f6eb5ef..5f78026 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -450,22 +450,20 @@ translate_opcode( unsigned op )
    case OPCODE_ADD:
       return TGSI_OPCODE_ADD;
    case OPCODE_CMP:
       return TGSI_OPCODE_CMP;
    case OPCODE_COS:
       return TGSI_OPCODE_COS;
    case OPCODE_DP3:
       return TGSI_OPCODE_DP3;
    case OPCODE_DP4:
       return TGSI_OPCODE_DP4;
-   case OPCODE_DPH:
-      return TGSI_OPCODE_DPH;
    case OPCODE_DST:
       return TGSI_OPCODE_DST;
    case OPCODE_EX2:
       return TGSI_OPCODE_EX2;
    case OPCODE_EXP:
       return TGSI_OPCODE_EXP;
    case OPCODE_FLR:
       return TGSI_OPCODE_FLR;
    case OPCODE_FRC:
       return TGSI_OPCODE_FRC;
@@ -582,20 +580,31 @@ compile_instruction(
       break;
 
    case OPCODE_ABS:
       ureg_MOV(ureg, dst[0], ureg_abs(src[0]));
       break;
 
    case OPCODE_SUB:
       ureg_ADD(ureg, dst[0], src[0], ureg_negate(src[1]));
       break;
 
+   case OPCODE_DPH: {
+      struct ureg_dst temp = ureg_DECL_temporary(ureg);
+
+      /* DPH = DP4(src0, src1) where src0.w = 1. */
+      ureg_MOV(ureg, ureg_writemask(temp, TGSI_WRITEMASK_XYZ), src[0]);
+      ureg_MOV(ureg, ureg_writemask(temp, TGSI_WRITEMASK_W),
+               ureg_imm1f(ureg, 1));
+      ureg_DP4(ureg, dst[0], ureg_src(temp), src[1]);
+      break;
+   }
+
    default:
       ureg_insn( ureg, 
                  translate_opcode( inst->Opcode ), 
                  dst, num_dst, 
                  src, num_src, 0);
       break;
    }
 }
 
 
-- 
2.7.4