[Mesa-dev] [PATCH 5/5] gallium: remove TGSI_OPCODE_SUB

Thu Jan 5 16:15:07 UTC 2017

On 01.01.2017 01:04, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> It's redundant with the source modifier.

This could have been split up, but oh well. Aside from Ilia's comment,
patches 4 and 5 are

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

> ---
>  src/gallium/auxiliary/draw/draw_pipe_aaline.c      |  2 +-
>  src/gallium/auxiliary/draw/draw_pipe_aapoint.c     | 20 ++++++------
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 38 +++-------------------
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c    |  6 ----
>  src/gallium/auxiliary/nir/tgsi_to_nir.c            |  1 -
>  src/gallium/auxiliary/tgsi/tgsi_aa_point.c         | 20 ++++++------
>  src/gallium/auxiliary/tgsi/tgsi_exec.c             |  4 ---
>  src/gallium/auxiliary/tgsi/tgsi_info.c             |  2 +-
>  src/gallium/auxiliary/tgsi/tgsi_lowering.c         | 22 ++++++++-----
>  src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h       |  1 -
>  src/gallium/auxiliary/tgsi/tgsi_point_sprite.c     | 12 +++----
>  src/gallium/auxiliary/tgsi/tgsi_transform.h        |  8 +++--
>  src/gallium/auxiliary/tgsi/tgsi_util.c             |  1 -
>  src/gallium/auxiliary/util/u_pstipple.c            |  2 +-
>  src/gallium/auxiliary/vl/vl_bicubic_filter.c       |  4 +--
>  src/gallium/auxiliary/vl/vl_compositor.c           |  4 +--
>  src/gallium/auxiliary/vl/vl_deint_filter.c         |  8 ++---
>  src/gallium/drivers/i915/i915_fpc_optimize.c       |  1 -
>  src/gallium/drivers/i915/i915_fpc_translate.c      | 11 -------
>  src/gallium/drivers/ilo/shader/toy_tgsi.c          |  6 ----
>  .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  |  2 --
>  src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c   |  3 --
>  src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c   |  3 --
>  src/gallium/drivers/r300/r300_tgsi_to_rc.c         |  1 -
>  src/gallium/drivers/r600/r600_shader.c             | 14 --------
>  src/gallium/drivers/svga/svga_tgsi_insn.c          | 27 ---------------
>  src/gallium/drivers/svga/svga_tgsi_vgpu10.c        | 25 --------------
>  src/gallium/include/pipe/p_shader_tokens.h         |  2 +-
>  src/gallium/state_trackers/xa/xa_tgsi.c            |  4 +--
>  src/mesa/state_tracker/st_atifs_to_tgsi.c          | 18 +++++-----
>  src/mesa/state_tracker/st_glsl_to_tgsi.cpp         |  3 +-
>  src/mesa/state_tracker/st_mesa_to_tgsi.c           |  6 ++--
>  src/mesa/state_tracker/st_tgsi_lower_yuv.c         |  3 +-
>  33 files changed, 82 insertions(+), 202 deletions(-)
>
> diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
> index c236caa..57ca12e 100644
> --- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
> +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
> @@ -278,21 +278,21 @@ aa_transform_epilog(struct tgsi_transform_context *ctx)
>        tgsi_transform_op1_inst(ctx, TGSI_OPCODE_MOV,
>                                TGSI_FILE_OUTPUT, aactx->colorOutput,
>                                TGSI_WRITEMASK_XYZ,
>                                TGSI_FILE_TEMPORARY, aactx->colorTemp);
>
>        /* MUL alpha */
>        tgsi_transform_op2_inst(ctx, TGSI_OPCODE_MUL,
>                                TGSI_FILE_OUTPUT, aactx->colorOutput,
>                                TGSI_WRITEMASK_W,
>                                TGSI_FILE_TEMPORARY, aactx->colorTemp,
> -                              TGSI_FILE_TEMPORARY, aactx->texTemp);
> +                              TGSI_FILE_TEMPORARY, aactx->texTemp, false);
>     }
>  }
>
>
>  /**
>   * TGSI instruction transform callback.
>   * Replace writes to result.color w/ a temp reg.
>   */
>  static void
>  aa_transform_inst(struct tgsi_transform_context *ctx,
> diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
> index 33ef8ec..2b96b8a 100644
> --- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
> +++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
> @@ -206,88 +206,88 @@ aa_transform_prolog(struct tgsi_transform_context *ctx)
>      *  t0.x = distance of fragment from center point
>      *  t0.y = boolean, is t0.x > 1.0, also misc temp usage
>      *  t0.z = temporary for computing 1/(1-k) value
>      *  t0.w = final coverage value
>      */
>
>     /* MUL t0.xy, tex, tex;  # compute x^2, y^2 */
>     tgsi_transform_op2_inst(ctx, TGSI_OPCODE_MUL,
>                             TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_XY,
>                             TGSI_FILE_INPUT, texInput,
> -                           TGSI_FILE_INPUT, texInput);
> +                           TGSI_FILE_INPUT, texInput, false);
>
>     /* ADD t0.x, t0.x, t0.y;  # x^2 + y^2 */
>     tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_ADD,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_X,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_X,
> -                               TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_Y);
> +                               TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_Y, false);
>
>  #if NORMALIZE  /* OPTIONAL normalization of length */
>     /* RSQ t0.x, t0.x; */
>     tgsi_transform_op1_inst(ctx, TGSI_OPCODE_RSQ,
>                             TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_X,
>                             TGSI_FILE_TEMPORARY, tmp0);
>
>     /* RCP t0.x, t0.x; */
>     tgsi_transform_op1_inst(ctx, TGSI_OPCODE_RCP,
>                             TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_X,
>                             TGSI_FILE_TEMPORARY, tmp0);
>  #endif
>
>     /* SGT t0.y, t0.xxxx, tex.wwww;  # bool b = d > 1 (NOTE tex.w == 1) */
>     tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_SGT,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_Y,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_X,
> -                               TGSI_FILE_INPUT, texInput, TGSI_SWIZZLE_W);
> +                               TGSI_FILE_INPUT, texInput, TGSI_SWIZZLE_W, false);
>
>     /* KILL_IF -tmp0.yyyy;   # if -tmp0.y < 0, KILL */
>     tgsi_transform_kill_inst(ctx, TGSI_FILE_TEMPORARY, tmp0,
>                              TGSI_SWIZZLE_Y, TRUE);
>
>     /* compute coverage factor = (1-d)/(1-k) */
>
>     /* SUB t0.z, tex.w, tex.z;  # m = 1 - k */
> -   tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_SUB,
> +   tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_ADD,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_Z,
>                                 TGSI_FILE_INPUT, texInput, TGSI_SWIZZLE_W,
> -                               TGSI_FILE_INPUT, texInput, TGSI_SWIZZLE_Z);
> +                               TGSI_FILE_INPUT, texInput, TGSI_SWIZZLE_Z, true);
>
>     /* RCP t0.z, t0.z;  # t0.z = 1 / m */
>     newInst = tgsi_default_full_instruction();
>     newInst.Instruction.Opcode = TGSI_OPCODE_RCP;
>     newInst.Instruction.NumDstRegs = 1;
>     newInst.Dst[0].Register.File = TGSI_FILE_TEMPORARY;
>     newInst.Dst[0].Register.Index = tmp0;
>     newInst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_Z;
>     newInst.Instruction.NumSrcRegs = 1;
>     newInst.Src[0].Register.File = TGSI_FILE_TEMPORARY;
>     newInst.Src[0].Register.Index = tmp0;
>     newInst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_Z;
>     ctx->emit_instruction(ctx, &newInst);
>
>     /* SUB t0.y, 1, t0.x;  # d = 1 - d */
> -   tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_SUB,
> +   tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_ADD,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_Y,
>                                 TGSI_FILE_INPUT, texInput, TGSI_SWIZZLE_W,
> -                               TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_X);
> +                               TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_X, true);
>
>     /* MUL t0.w, t0.y, t0.z;   # coverage = d * m */
>     tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_MUL,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_W,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_Y,
> -                               TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_Z);
> +                               TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_Z, false);
>
>     /* SLE t0.y, t0.x, tex.z;  # bool b = distance <= k */
>     tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_SLE,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_Y,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_X,
> -                               TGSI_FILE_INPUT, texInput, TGSI_SWIZZLE_Z);
> +                               TGSI_FILE_INPUT, texInput, TGSI_SWIZZLE_Z, false);
>
>     /* CMP t0.w, -t0.y, tex.w, t0.w;
>      *  # if -t0.y < 0 then
>      *       t0.w = 1
>      *    else
>      *       t0.w = t0.w
>      */
>     tgsi_transform_op3_swz_inst(ctx, TGSI_OPCODE_CMP,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_W,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_Y, 1,
> @@ -311,21 +311,21 @@ aa_transform_epilog(struct tgsi_transform_context *ctx)
>     tgsi_transform_op1_inst(ctx, TGSI_OPCODE_MOV,
>                             TGSI_FILE_OUTPUT, aactx->colorOutput,
>                             TGSI_WRITEMASK_XYZ,
>                             TGSI_FILE_TEMPORARY, aactx->colorTemp);
>
>     /* MUL result.color.w, colorTemp, tmp0.w; */
>     tgsi_transform_op2_inst(ctx, TGSI_OPCODE_MUL,
>                             TGSI_FILE_OUTPUT, aactx->colorOutput,
>                             TGSI_WRITEMASK_W,
>                             TGSI_FILE_TEMPORARY, aactx->colorTemp,
> -                           TGSI_FILE_TEMPORARY, aactx->tmp0);
> +                           TGSI_FILE_TEMPORARY, aactx->tmp0, false);
>  }
>
>
>  /**
>   * TGSI transform callback.
>   * Called per instruction.
>   * Replace writes to result.color w/ a temp reg.
>   */
>  static void
>  aa_transform_inst(struct tgsi_transform_context *ctx,
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> index 7d939e8..91e959f 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> @@ -361,22 +361,22 @@ exp_emit(
>
>     /* floor( src0.x ) */
>     floor_x = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_FLR,
>                                        emit_data->args[0]);
>
>     /* 2 ^ floor( src0.x ) */
>     emit_data->output[TGSI_CHAN_X] = lp_build_emit_llvm_unary(bld_base,
>                                         TGSI_OPCODE_EX2, floor_x);
>
>     /* src0.x - floor( src0.x ) */
> -   emit_data->output[TGSI_CHAN_Y] = lp_build_emit_llvm_binary(bld_base,
> -                   TGSI_OPCODE_SUB,  emit_data->args[0] /* src0.x */, floor_x);
> +   emit_data->output[TGSI_CHAN_Y] =
> +      lp_build_sub(&bld_base->base, emit_data->args[0] /* src0.x */, floor_x);
>
>     /* 2 ^ src0.x */
>     emit_data->output[TGSI_CHAN_Z] = lp_build_emit_llvm_unary(bld_base,
>                               TGSI_OPCODE_EX2, emit_data->args[0] /* src0.x */);
>
>     emit_data->output[TGSI_CHAN_W] = bld_base->base.one;
>  }
>
>  const struct lp_build_tgsi_action exp_action = {
>     scalar_unary_fetch_args,	 /* fetch_args */
> @@ -387,22 +387,22 @@ const struct lp_build_tgsi_action exp_action = {
>
>  static void
>  frc_emit(
>     const struct lp_build_tgsi_action * action,
>     struct lp_build_tgsi_context * bld_base,
>     struct lp_build_emit_data * emit_data)
>  {
>     LLVMValueRef tmp;
>     tmp = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_FLR,
>                                    emit_data->args[0]);
> -   emit_data->output[emit_data->chan] = lp_build_emit_llvm_binary(bld_base,
> -                                       TGSI_OPCODE_SUB, emit_data->args[0], tmp);
> +   emit_data->output[emit_data->chan] =
> +      lp_build_sub(&bld_base->base, emit_data->args[0], tmp);
>  }
>
>  /* TGSI_OPCODE_KILL_IF */
>
>  static void
>  kil_fetch_args(
>     struct lp_build_tgsi_context * bld_base,
>     struct lp_build_emit_data * emit_data)
>  {
>     /* src0.x */
> @@ -763,33 +763,20 @@ scs_emit(
>
>     /* dst.w */
>     emit_data->output[TGSI_CHAN_W] = bld_base->base.one;
>  }
>
>  const struct lp_build_tgsi_action scs_action = {
>     scalar_unary_fetch_args,	 /* fetch_args */
>     scs_emit	 /* emit */
>  };
>
> -/* TGSI_OPCODE_SUB */
> -static void
> -sub_emit(
> -   const struct lp_build_tgsi_action * action,
> -   struct lp_build_tgsi_context * bld_base,
> -   struct lp_build_emit_data * emit_data)
> -{
> -   emit_data->output[emit_data->chan] =
> -      LLVMBuildFSub(bld_base->base.gallivm->builder,
> -                    emit_data->args[0],
> -                    emit_data->args[1], "");
> -}
> -
>  /* TGSI_OPCODE_F2U */
>  static void
>  f2u_emit(
>     const struct lp_build_tgsi_action * action,
>     struct lp_build_tgsi_context * bld_base,
>     struct lp_build_emit_data * emit_data)
>  {
>     emit_data->output[emit_data->chan] =
>        LLVMBuildFPToUI(bld_base->base.gallivm->builder,
>                        emit_data->args[0],
> @@ -949,21 +936,21 @@ xpd_helper(
>    LLVMValueRef a,
>    LLVMValueRef b,
>    LLVMValueRef c,
>    LLVMValueRef d)
>  {
>     LLVMValueRef tmp0, tmp1;
>
>     tmp0 = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MUL, a, b);
>     tmp1 = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MUL, c, d);
>
> -   return lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_SUB, tmp0, tmp1);
> +   return lp_build_sub(&bld_base->base, tmp0, tmp1);
>  }
>
>  static void
>  xpd_emit(
>     const struct lp_build_tgsi_action * action,
>     struct lp_build_tgsi_context * bld_base,
>     struct lp_build_emit_data * emit_data)
>  {
>     emit_data->output[TGSI_CHAN_X] = xpd_helper(bld_base,
>                emit_data->args[1] /* src0.y */, emit_data->args[5] /* src1.z */,
> @@ -1345,21 +1332,20 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
>     bld_base->op_actions[TGSI_OPCODE_ARR].emit = arr_emit;
>     bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = clamp_emit;
>     bld_base->op_actions[TGSI_OPCODE_END].emit = end_emit;
>     bld_base->op_actions[TGSI_OPCODE_FRC].emit = frc_emit;
>     bld_base->op_actions[TGSI_OPCODE_LRP].emit = lrp_emit;
>     bld_base->op_actions[TGSI_OPCODE_MAD].emit = mad_emit;
>     bld_base->op_actions[TGSI_OPCODE_MOV].emit = mov_emit;
>     bld_base->op_actions[TGSI_OPCODE_MUL].emit = mul_emit;
>     bld_base->op_actions[TGSI_OPCODE_DIV].emit = fdiv_emit;
>     bld_base->op_actions[TGSI_OPCODE_RCP].emit = rcp_emit;
> -   bld_base->op_actions[TGSI_OPCODE_SUB].emit = sub_emit;
>
>     bld_base->op_actions[TGSI_OPCODE_UARL].emit = mov_emit;
>     bld_base->op_actions[TGSI_OPCODE_F2U].emit = f2u_emit;
>     bld_base->op_actions[TGSI_OPCODE_U2F].emit = u2f_emit;
>     bld_base->op_actions[TGSI_OPCODE_UMAD].emit = umad_emit;
>     bld_base->op_actions[TGSI_OPCODE_UMUL].emit = umul_emit;
>     bld_base->op_actions[TGSI_OPCODE_IMUL_HI].emit = imul_hi_emit;
>     bld_base->op_actions[TGSI_OPCODE_UMUL_HI].emit = umul_hi_emit;
>
>     bld_base->op_actions[TGSI_OPCODE_MAX].emit = fmax_emit;
> @@ -2064,33 +2050,20 @@ sne_emit_cpu(
>  static void
>  ssg_emit_cpu(
>     const struct lp_build_tgsi_action * action,
>     struct lp_build_tgsi_context * bld_base,
>     struct lp_build_emit_data * emit_data)
>  {
>     emit_data->output[emit_data->chan] = lp_build_sgn(&bld_base->base,
>                                                         emit_data->args[0]);
>  }
>
> -/* TGSI_OPCODE_SUB (CPU Only) */
> -
> -static void
> -sub_emit_cpu(
> -   const struct lp_build_tgsi_action * action,
> -   struct lp_build_tgsi_context * bld_base,
> -   struct lp_build_emit_data * emit_data)
> -{
> -   emit_data->output[emit_data->chan] = lp_build_sub(&bld_base->base,
> -                                                        emit_data->args[0],
> -                                                        emit_data->args[1]);
> -}
> -
>  /* TGSI_OPCODE_TRUNC (CPU Only) */
>
>  static void
>  trunc_emit_cpu(
>     const struct lp_build_tgsi_action * action,
>     struct lp_build_tgsi_context * bld_base,
>     struct lp_build_emit_data * emit_data)
>  {
>     emit_data->output[emit_data->chan] = lp_build_trunc(&bld_base->base,
>                                                           emit_data->args[0]);
> @@ -2617,21 +2590,20 @@ lp_set_default_actions_cpu(
>     bld_base->op_actions[TGSI_OPCODE_ROUND].emit = round_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_SEQ].emit = seq_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_SGE].emit = sge_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_SGT].emit = sgt_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_SIN].emit = sin_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_SHL].emit = shl_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_SLE].emit = sle_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_SLT].emit = slt_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_SNE].emit = sne_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_SSG].emit = ssg_emit_cpu;
> -   bld_base->op_actions[TGSI_OPCODE_SUB].emit = sub_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = trunc_emit_cpu;
>
>     bld_base->rsq_action.emit = recip_sqrt_emit_cpu;
>     bld_base->sqrt_action.emit = sqrt_emit_cpu;
>
>     bld_base->op_actions[TGSI_OPCODE_UADD].emit = uadd_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_UCMP].emit = ucmp_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_UDIV].emit = udiv_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_UMAX].emit = umax_emit_cpu;
>     bld_base->op_actions[TGSI_OPCODE_UMIN].emit = umin_emit_cpu;
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
> index a5e439f..6c177b0 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
> @@ -584,26 +584,20 @@ lp_emit_instruction_aos(
>
>     case TGSI_OPCODE_MAD:
>     /* TGSI_OPCODE_MADD */
>        src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
>        src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
>        src2 = lp_build_emit_fetch(&bld->bld_base, inst, 2, LP_CHAN_ALL);
>        tmp0 = lp_build_mul(&bld->bld_base.base, src0, src1);
>        dst0 = lp_build_add(&bld->bld_base.base, tmp0, src2);
>        break;
>
> -   case TGSI_OPCODE_SUB:
> -      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
> -      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
> -      dst0 = lp_build_sub(&bld->bld_base.base, src0, src1);
> -      break;
> -
>     case TGSI_OPCODE_LRP:
>        src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
>        src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
>        src2 = lp_build_emit_fetch(&bld->bld_base, inst, 2, LP_CHAN_ALL);
>        tmp0 = lp_build_sub(&bld->bld_base.base, src1, src2);
>        tmp0 = lp_build_mul(&bld->bld_base.base, src0, tmp0);
>        dst0 = lp_build_add(&bld->bld_base.base, tmp0, src2);
>        break;
>
>     case TGSI_OPCODE_DP2A:
> diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
> index af4a6e0..f3e8700 100644
> --- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
> +++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
> @@ -1528,21 +1528,20 @@ static const nir_op op_trans[TGSI_OPCODE_LAST] = {
>     [TGSI_OPCODE_MUL] = nir_op_fmul,
>     [TGSI_OPCODE_ADD] = nir_op_fadd,
>     [TGSI_OPCODE_DP3] = 0,
>     [TGSI_OPCODE_DP4] = 0,
>     [TGSI_OPCODE_DST] = 0,
>     [TGSI_OPCODE_MIN] = nir_op_fmin,
>     [TGSI_OPCODE_MAX] = nir_op_fmax,
>     [TGSI_OPCODE_SLT] = nir_op_slt,
>     [TGSI_OPCODE_SGE] = nir_op_sge,
>     [TGSI_OPCODE_MAD] = nir_op_ffma,
> -   [TGSI_OPCODE_SUB] = nir_op_fsub,
>     [TGSI_OPCODE_LRP] = 0,
>     [TGSI_OPCODE_SQRT] = nir_op_fsqrt,
>     [TGSI_OPCODE_DP2A] = 0,
>     [TGSI_OPCODE_FRC] = nir_op_ffract,
>     [TGSI_OPCODE_CLAMP] = 0,
>     [TGSI_OPCODE_FLR] = nir_op_ffloor,
>     [TGSI_OPCODE_ROUND] = nir_op_fround_even,
>     [TGSI_OPCODE_EX2] = nir_op_fexp2,
>     [TGSI_OPCODE_LG2] = nir_op_flog2,
>     [TGSI_OPCODE_POW] = nir_op_fpow,
> diff --git a/src/gallium/auxiliary/tgsi/tgsi_aa_point.c b/src/gallium/auxiliary/tgsi/tgsi_aa_point.c
> index 9016eff..4b14a2f 100644
> --- a/src/gallium/auxiliary/tgsi/tgsi_aa_point.c
> +++ b/src/gallium/auxiliary/tgsi/tgsi_aa_point.c
> @@ -141,71 +141,71 @@ aa_prolog(struct tgsi_transform_context *ctx)
>      * Temp reg (t0) usage:
>      *  t0.x = distance of fragment from center point
>      *  t0.y = boolean, is t0.x > 0.5, also misc temp usage
>      *  t0.z = temporary for computing 1/(0.5-k) value
>      *  t0.w = final coverage value
>      */
>
>     tmp0 = ts->tmp;
>
>     /* SUB t0.xy, texIn, (0.5, 0,5) */
> -   tgsi_transform_op2_inst(ctx, TGSI_OPCODE_SUB,
> +   tgsi_transform_op2_inst(ctx, TGSI_OPCODE_ADD,
>                             TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_XY,
>                             TGSI_FILE_INPUT, texIn,
> -                           TGSI_FILE_IMMEDIATE, imm);
> +                           TGSI_FILE_IMMEDIATE, imm, true);
>
>     /* DP2 t0.x, t0.xy, t0.xy;  # t0.x = x^2 + y^2 */
>     tgsi_transform_op2_inst(ctx, TGSI_OPCODE_DP2,
>                             TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_X,
>                             TGSI_FILE_TEMPORARY, tmp0,
> -                           TGSI_FILE_TEMPORARY, tmp0);
> +                           TGSI_FILE_TEMPORARY, tmp0, false);
>
>     /* SQRT t0.x, t0.x */
>     tgsi_transform_op1_inst(ctx, TGSI_OPCODE_SQRT,
>                             TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_X,
>                             TGSI_FILE_TEMPORARY, tmp0);
>
>     /* compute coverage factor = (0.5-d)/(0.5-k) */
>
>     /* SUB t0.w, 0.5, texIn.z;  # t0.w = 0.5-k */
> -   tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_SUB,
> +   tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_ADD,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_W,
>                                 TGSI_FILE_IMMEDIATE, imm, TGSI_SWIZZLE_X,
> -                               TGSI_FILE_INPUT, texIn, TGSI_SWIZZLE_Z);
> +                               TGSI_FILE_INPUT, texIn, TGSI_SWIZZLE_Z, true);
>
>     /* SUB t0.y, 0.5, t0.x;  # t0.y = 0.5-d */
> -   tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_SUB,
> +   tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_ADD,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_Y,
>                                 TGSI_FILE_IMMEDIATE, imm, TGSI_SWIZZLE_X,
> -                               TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_X);
> +                               TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_X, true);
>
>     /* DIV t0.w, t0.y, t0.w;  # coverage = (0.5-d)/(0.5-k) */
>     tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_DIV,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_W,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_Y,
> -                               TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_W);
> +                               TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_W, false);
>
>     /* If the coverage value is negative, it means the fragment is outside
>      * the point's circular boundary.  Kill it.
>      */
>     /* KILL_IF tmp0.w;  # if tmp0.w < 0 KILL */
>     tgsi_transform_kill_inst(ctx, TGSI_FILE_TEMPORARY, tmp0,
>                              TGSI_SWIZZLE_W, FALSE);
>
>     /* If the distance is less than the threshold, the coverage/alpha value
>      * will be greater than one.  Clamp to one here.
>      */
>     /* MIN tmp0.w, tmp0.w, 1.0 */
>     tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_MIN,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_WRITEMASK_W,
>                                 TGSI_FILE_TEMPORARY, tmp0, TGSI_SWIZZLE_W,
> -                               TGSI_FILE_IMMEDIATE, imm, TGSI_SWIZZLE_W);
> +                               TGSI_FILE_IMMEDIATE, imm, TGSI_SWIZZLE_W, false);
>  }
>
>  /**
>   * TGSI instruction transform callback.
>   */
>  static void
>  aa_inst(struct tgsi_transform_context *ctx,
>          struct tgsi_full_instruction *inst)
>  {
>     struct aa_transform_context *ts = aa_transform_context(ctx);
> @@ -242,21 +242,21 @@ aa_epilog(struct tgsi_transform_context *ctx)
>     tgsi_transform_op1_inst(ctx, TGSI_OPCODE_MOV,
>                             TGSI_FILE_OUTPUT, ts->color_out,
>                             TGSI_WRITEMASK_XYZ,
>                             TGSI_FILE_TEMPORARY, ts->color_tmp);
>
>     /* MUL output.color.w colorTmp.w tmp0.w */
>     tgsi_transform_op2_inst(ctx, TGSI_OPCODE_MUL,
>                             TGSI_FILE_OUTPUT, ts->color_out,
>                             TGSI_WRITEMASK_W,
>                             TGSI_FILE_TEMPORARY, ts->color_tmp,
> -                           TGSI_FILE_TEMPORARY, ts->tmp);
> +                           TGSI_FILE_TEMPORARY, ts->tmp, false);
>  }
>
>  /**
>   * TGSI utility to transform a fragment shader to support antialiasing point.
>   *
>   * This utility accepts two inputs:
>   *\param tokens_in  -- the original token string of the shader
>   *\param aa_point_coord_index -- the semantic index of the generic register
>   *                            that contains the point sprite texture coord
>   *
> diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
> index 2f89de6..915cd10 100644
> --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
> +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
> @@ -5201,24 +5201,20 @@ exec_instruction(
>        break;
>
>     case TGSI_OPCODE_SGE:
>        exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
>        break;
>
>     case TGSI_OPCODE_MAD:
>        exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
>        break;
>
> -   case TGSI_OPCODE_SUB:
> -      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
> -      break;
> -
>     case TGSI_OPCODE_LRP:
>        exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
>        break;
>
>     case TGSI_OPCODE_SQRT:
>        exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
>        break;
>
>     case TGSI_OPCODE_DP2A:
>        exec_dp2a(mach, inst);
> diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
> index 9b2431f..a339ec2 100644
> --- a/src/gallium/auxiliary/tgsi/tgsi_info.c
> +++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
> @@ -47,21 +47,21 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
>     { 1, 2, 0, 0, 0, 0, 0, COMP, "MUL", TGSI_OPCODE_MUL },
>     { 1, 2, 0, 0, 0, 0, 0, COMP, "ADD", TGSI_OPCODE_ADD },
>     { 1, 2, 0, 0, 0, 0, 0, REPL, "DP3", TGSI_OPCODE_DP3 },
>     { 1, 2, 0, 0, 0, 0, 0, REPL, "DP4", TGSI_OPCODE_DP4 },
>     { 1, 2, 0, 0, 0, 0, 0, CHAN, "DST", TGSI_OPCODE_DST },
>     { 1, 2, 0, 0, 0, 0, 0, COMP, "MIN", TGSI_OPCODE_MIN },
>     { 1, 2, 0, 0, 0, 0, 0, COMP, "MAX", TGSI_OPCODE_MAX },
>     { 1, 2, 0, 0, 0, 0, 0, COMP, "SLT", TGSI_OPCODE_SLT },
>     { 1, 2, 0, 0, 0, 0, 0, COMP, "SGE", TGSI_OPCODE_SGE },
>     { 1, 3, 0, 0, 0, 0, 0, COMP, "MAD", TGSI_OPCODE_MAD },
> -   { 1, 2, 0, 0, 0, 0, 0, COMP, "SUB", TGSI_OPCODE_SUB },
> +   { 1, 2, 0, 0, 0, 0, 0, COMP, "", 17 }, /* removed */
>     { 1, 3, 0, 0, 0, 0, 0, COMP, "LRP", TGSI_OPCODE_LRP },
>     { 1, 3, 0, 0, 0, 0, 0, COMP, "FMA", TGSI_OPCODE_FMA },
>     { 1, 1, 0, 0, 0, 0, 0, REPL, "SQRT", TGSI_OPCODE_SQRT },
>     { 1, 3, 0, 0, 0, 0, 0, REPL, "DP2A", TGSI_OPCODE_DP2A },
>     { 1, 1, 0, 0, 0, 0, 0, COMP, "F2U64", TGSI_OPCODE_F2U64 },
>     { 1, 1, 0, 0, 0, 0, 0, COMP, "F2I64", TGSI_OPCODE_F2I64 },
>     { 1, 1, 0, 0, 0, 0, 0, COMP, "FRC", TGSI_OPCODE_FRC },
>     { 1, 3, 0, 0, 0, 0, 0, COMP, "CLAMP", TGSI_OPCODE_CLAMP },
>     { 1, 1, 0, 0, 0, 0, 0, COMP, "FLR", TGSI_OPCODE_FLR },
>     { 1, 1, 0, 0, 0, 0, 0, COMP, "ROUND", TGSI_OPCODE_ROUND },
> diff --git a/src/gallium/auxiliary/tgsi/tgsi_lowering.c b/src/gallium/auxiliary/tgsi/tgsi_lowering.c
> index b0a28f2..bf6cbb3 100644
> --- a/src/gallium/auxiliary/tgsi/tgsi_lowering.c
> +++ b/src/gallium/auxiliary/tgsi/tgsi_lowering.c
> @@ -461,26 +461,27 @@ transform_frc(struct tgsi_transform_context *tctx,
>        new_inst = tgsi_default_full_instruction();
>        new_inst.Instruction.Opcode = TGSI_OPCODE_FLR;
>        new_inst.Instruction.NumDstRegs = 1;
>        reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_XYZW);
>        new_inst.Instruction.NumSrcRegs = 1;
>        reg_src(&new_inst.Src[0], src, SWIZ(X, Y, Z, W));
>        tctx->emit_instruction(tctx, &new_inst);
>
>        /* SUB dst, src, tmpA */
>        new_inst = tgsi_default_full_instruction();
> -      new_inst.Instruction.Opcode = TGSI_OPCODE_SUB;
> +      new_inst.Instruction.Opcode = TGSI_OPCODE_ADD;
>        new_inst.Instruction.NumDstRegs = 1;
>        reg_dst(&new_inst.Dst[0], dst, TGSI_WRITEMASK_XYZW);
>        new_inst.Instruction.NumSrcRegs = 2;
>        reg_src(&new_inst.Src[0], src, SWIZ(X, Y, Z, W));
>        reg_src(&new_inst.Src[1], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
> +      new_inst.Src[1].Register.Negate = 1;
>        tctx->emit_instruction(tctx, &new_inst);
>     }
>  }
>
>  /* POW - Power
>   *  dst.x = src0.x^{src1.x}
>   *  dst.y = src0.x^{src1.x}
>   *  dst.z = src0.x^{src1.x}
>   *  dst.w = src0.x^{src1.x}
>   *
> @@ -682,26 +683,27 @@ transform_exp(struct tgsi_transform_context *tctx,
>           new_inst = tgsi_default_full_instruction();
>           new_inst.Instruction.Opcode = TGSI_OPCODE_FRC;
>           new_inst.Instruction.NumDstRegs = 1;
>           reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_X);
>           new_inst.Instruction.NumSrcRegs = 1;
>           reg_src(&new_inst.Src[0], src, SWIZ(X, _, _, _));
>           tctx->emit_instruction(tctx, &new_inst);
>
>           /* SUB tmpA.x, src.x, tmpA.x */
>           new_inst = tgsi_default_full_instruction();
> -         new_inst.Instruction.Opcode = TGSI_OPCODE_SUB;
> +         new_inst.Instruction.Opcode = TGSI_OPCODE_ADD;
>           new_inst.Instruction.NumDstRegs = 1;
>           reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_X);
>           new_inst.Instruction.NumSrcRegs = 2;
>           reg_src(&new_inst.Src[0], src, SWIZ(X, _, _, _));
>           reg_src(&new_inst.Src[1], &ctx->tmp[A].src, SWIZ(X, _, _, _));
> +         new_inst.Src[1].Register.Negate = 1;
>           tctx->emit_instruction(tctx, &new_inst);
>       } else {
>           /* FLR tmpA.x, src.x */
>           new_inst = tgsi_default_full_instruction();
>           new_inst.Instruction.Opcode = TGSI_OPCODE_FLR;
>           new_inst.Instruction.NumDstRegs = 1;
>           reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_X);
>           new_inst.Instruction.NumSrcRegs = 1;
>           reg_src(&new_inst.Src[0], src, SWIZ(X, _, _, _));
>           tctx->emit_instruction(tctx, &new_inst);
> @@ -715,26 +717,27 @@ transform_exp(struct tgsi_transform_context *tctx,
>        new_inst.Instruction.NumDstRegs = 1;
>        reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_Y);
>        new_inst.Instruction.NumSrcRegs = 1;
>        reg_src(&new_inst.Src[0], src, SWIZ(X, _, _, _));
>        tctx->emit_instruction(tctx, &new_inst);
>     }
>
>     if (dst->Register.WriteMask & TGSI_WRITEMASK_Y) {
>        /* SUB dst.y, src.x, tmpA.x */
>        new_inst = tgsi_default_full_instruction();
> -      new_inst.Instruction.Opcode = TGSI_OPCODE_SUB;
> +      new_inst.Instruction.Opcode = TGSI_OPCODE_ADD;
>        new_inst.Instruction.NumDstRegs = 1;
>        reg_dst(&new_inst.Dst[0], dst, TGSI_WRITEMASK_Y);
>        new_inst.Instruction.NumSrcRegs = 2;
>        reg_src(&new_inst.Src[0], src, SWIZ(_, X, _, _));
>        reg_src(&new_inst.Src[1], &ctx->tmp[A].src, SWIZ(_, X, _, _));
> +      new_inst.Src[1].Register.Negate = 1;
>        tctx->emit_instruction(tctx, &new_inst);
>     }
>
>     if (dst->Register.WriteMask & TGSI_WRITEMASK_X) {
>        /* EX2 dst.x, tmpA.x */
>        new_inst = tgsi_default_full_instruction();
>        new_inst.Instruction.Opcode = TGSI_OPCODE_EX2;
>        new_inst.Instruction.NumDstRegs = 1;
>        reg_dst(&new_inst.Dst[0], dst, TGSI_WRITEMASK_X);
>        new_inst.Instruction.NumSrcRegs = 1;
> @@ -815,26 +818,27 @@ transform_log(struct tgsi_transform_context *tctx,
>           new_inst = tgsi_default_full_instruction();
>           new_inst.Instruction.Opcode = TGSI_OPCODE_FRC;
>           new_inst.Instruction.NumDstRegs = 1;
>           reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_Y);
>           new_inst.Instruction.NumSrcRegs = 1;
>           reg_src(&new_inst.Src[0], &ctx->tmp[A].src, SWIZ(_, X, _, _));
>           tctx->emit_instruction(tctx, &new_inst);
>
>           /* SUB tmpA.y, tmpA.x, tmpA.y */
>           new_inst = tgsi_default_full_instruction();
> -         new_inst.Instruction.Opcode = TGSI_OPCODE_SUB;
> +         new_inst.Instruction.Opcode = TGSI_OPCODE_ADD;
>           new_inst.Instruction.NumDstRegs = 1;
>           reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_Y);
>           new_inst.Instruction.NumSrcRegs = 2;
>           reg_src(&new_inst.Src[0], &ctx->tmp[A].src, SWIZ(_, X, _, _));
>           reg_src(&new_inst.Src[1], &ctx->tmp[A].src, SWIZ(_, Y, _, _));
> +         new_inst.Src[1].Register.Negate = 1;
>           tctx->emit_instruction(tctx, &new_inst);
>        } else {
>           /* FLR tmpA.y, tmpA.x */
>           new_inst = tgsi_default_full_instruction();
>           new_inst.Instruction.Opcode = TGSI_OPCODE_FLR;
>           new_inst.Instruction.NumDstRegs = 1;
>           reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_Y);
>           new_inst.Instruction.NumSrcRegs = 1;
>           reg_src(&new_inst.Src[0], &ctx->tmp[A].src, SWIZ(_, X, _, _));
>           tctx->emit_instruction(tctx, &new_inst);
> @@ -1065,29 +1069,28 @@ transform_flr_ceil(struct tgsi_transform_context *tctx,
>        reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_XYZW);
>        new_inst.Instruction.NumSrcRegs = 1;
>        reg_src(&new_inst.Src[0], src0, SWIZ(X, Y, Z, W));
>
>        if (opcode == TGSI_OPCODE_CEIL)
>           new_inst.Src[0].Register.Negate = !new_inst.Src[0].Register.Negate;
>        tctx->emit_instruction(tctx, &new_inst);
>
>        /* FLR: SUB dst, src, tmpA  CEIL: ADD dst, src, tmpA */
>        new_inst = tgsi_default_full_instruction();
> -      if (opcode == TGSI_OPCODE_CEIL)
> -         new_inst.Instruction.Opcode = TGSI_OPCODE_ADD;
> -      else
> -         new_inst.Instruction.Opcode = TGSI_OPCODE_SUB;
> +      new_inst.Instruction.Opcode = TGSI_OPCODE_ADD;
>        new_inst.Instruction.NumDstRegs = 1;
>        reg_dst(&new_inst.Dst[0], dst, TGSI_WRITEMASK_XYZW);
>        new_inst.Instruction.NumSrcRegs = 2;
>        reg_src(&new_inst.Src[0], src0, SWIZ(X, Y, Z, W));
>        reg_src(&new_inst.Src[1], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
> +      if (opcode == TGSI_OPCODE_FLR)
> +         new_inst.Src[1].Register.Negate = 1;
>        tctx->emit_instruction(tctx, &new_inst);
>     }
>  }
>
>  /* TRUNC - truncate off fractional part
>   *  dst.x = trunc(src.x)
>   *  dst.y = trunc(src.y)
>   *  dst.z = trunc(src.z)
>   *  dst.w = trunc(src.w)
>   *
> @@ -1117,28 +1120,29 @@ transform_trunc(struct tgsi_transform_context *tctx,
>           new_inst.Instruction.Opcode = TGSI_OPCODE_FRC;
>           new_inst.Instruction.NumDstRegs = 1;
>           reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_XYZW);
>           new_inst.Instruction.NumSrcRegs = 1;
>           reg_src(&new_inst.Src[0], src0, SWIZ(X, Y, Z, W));
>           new_inst.Src[0].Register.Absolute = true;
>           new_inst.Src[0].Register.Negate = false;
>           tctx->emit_instruction(tctx, &new_inst);
>
>           new_inst = tgsi_default_full_instruction();
> -         new_inst.Instruction.Opcode = TGSI_OPCODE_SUB;
> +         new_inst.Instruction.Opcode = TGSI_OPCODE_ADD;
>           new_inst.Instruction.NumDstRegs = 1;
>           reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_XYZW);
>           new_inst.Instruction.NumSrcRegs = 2;
>           reg_src(&new_inst.Src[0], src0, SWIZ(X, Y, Z, W));
>           new_inst.Src[0].Register.Absolute = true;
>           new_inst.Src[0].Register.Negate = false;
>           reg_src(&new_inst.Src[1], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
> +         new_inst.Src[1].Register.Negate = 1;
>           tctx->emit_instruction(tctx, &new_inst);
>        } else {
>           new_inst = tgsi_default_full_instruction();
>           new_inst.Instruction.Opcode = TGSI_OPCODE_FLR;
>           new_inst.Instruction.NumDstRegs = 1;
>           reg_dst(&new_inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_XYZW);
>           new_inst.Instruction.NumSrcRegs = 1;
>           reg_src(&new_inst.Src[0], src0, SWIZ(X, Y, Z, W));
>           new_inst.Src[0].Register.Absolute = true;
>           new_inst.Src[0].Register.Negate = false;
> diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
> index d78dd66..13c443f 100644
> --- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
> +++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
> @@ -66,21 +66,20 @@ OP11(LOG)
>  OP12(MUL)
>  OP12(ADD)
>  OP12(DP3)
>  OP12(DP4)
>  OP12(DST)
>  OP12(MIN)
>  OP12(MAX)
>  OP12(SLT)
>  OP12(SGE)
>  OP13(MAD)
> -OP12(SUB)
>  OP13(LRP)
>  OP11(SQRT)
>  OP13(DP2A)
>  OP11(FRC)
>  OP13(CLAMP)
>  OP11(FLR)
>  OP11(ROUND)
>  OP11(EX2)
>  OP11(LG2)
>  OP12(POW)
> diff --git a/src/gallium/auxiliary/tgsi/tgsi_point_sprite.c b/src/gallium/auxiliary/tgsi/tgsi_point_sprite.c
> index 713bd60..f60a17c 100644
> --- a/src/gallium/auxiliary/tgsi/tgsi_point_sprite.c
> +++ b/src/gallium/auxiliary/tgsi/tgsi_point_sprite.c
> @@ -288,21 +288,21 @@ psprite_emit_vertex_inst(struct tgsi_transform_context *ctx,
>
>     /**
>      * Set up the point scale vector
>      * scale = pointSize * pos.w * inverseViewport
>      */
>
>     /* MUL point_scale.x, point_size.x, point_pos.w */
>     tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_MUL,
>                    TGSI_FILE_TEMPORARY, ts->point_scale_tmp, TGSI_WRITEMASK_X,
>                    TGSI_FILE_TEMPORARY, ts->point_size_tmp, TGSI_SWIZZLE_X,
> -                  TGSI_FILE_TEMPORARY, ts->point_pos_tmp, TGSI_SWIZZLE_W);
> +                  TGSI_FILE_TEMPORARY, ts->point_pos_tmp, TGSI_SWIZZLE_W, false);
>
>     /* MUL point_scale.xy, point_scale.xx, inverseViewport.xy */
>     inst = tgsi_default_full_instruction();
>     inst.Instruction.Opcode = TGSI_OPCODE_MUL;
>     inst.Instruction.NumDstRegs = 1;
>     tgsi_transform_dst_reg(&inst.Dst[0], TGSI_FILE_TEMPORARY,
>                            ts->point_scale_tmp, TGSI_WRITEMASK_XY);
>     inst.Instruction.NumSrcRegs = 2;
>     tgsi_transform_src_reg(&inst.Src[0], TGSI_FILE_TEMPORARY,
>                            ts->point_scale_tmp, TGSI_SWIZZLE_X,
> @@ -316,29 +316,29 @@ psprite_emit_vertex_inst(struct tgsi_transform_context *ctx,
>      * Set up the point coord threshold distance
>      * k = 0.5 - 1 / pointsize
>      */
>     if (ts->aa_point) {
>        tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_DIV,
>                                    TGSI_FILE_TEMPORARY, ts->point_coord_k,
>                                    TGSI_WRITEMASK_X,
>                                    TGSI_FILE_IMMEDIATE, ts->point_imm,
>                                    TGSI_SWIZZLE_Y,
>                                    TGSI_FILE_TEMPORARY, ts->point_size_tmp,
> -                                  TGSI_SWIZZLE_X);
> +                                  TGSI_SWIZZLE_X, false);
>
> -      tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_SUB,
> +      tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_ADD,
>                                    TGSI_FILE_TEMPORARY, ts->point_coord_k,
>                                    TGSI_WRITEMASK_X,
>                                    TGSI_FILE_IMMEDIATE, ts->point_imm,
>                                    TGSI_SWIZZLE_Z,
>                                    TGSI_FILE_TEMPORARY, ts->point_coord_k,
> -                                  TGSI_SWIZZLE_X);
> +                                  TGSI_SWIZZLE_X, true);
>     }
>
>
>     for (i = 0; i < 4; i++) {
>        unsigned point_dir_swz = ts->point_dir_swz[i];
>        unsigned point_coord_swz = ts->point_coord_swz[i];
>
>        /* All outputs need to be emitted for each vertex */
>        for (j = 0; j < ts->num_orig_out; j++) {
>           if (ts->out_tmp_index[j] != INVALID_INDEX) {
> @@ -435,27 +435,27 @@ psprite_inst(struct tgsi_transform_context *ctx,
>         */
>        inst->Dst[0].Register.File = TGSI_FILE_TEMPORARY;
>        inst->Dst[0].Register.Index = ts->point_size_tmp;
>        ctx->emit_instruction(ctx, inst);
>
>        /* Clamp the point size */
>        /* MAX point_size_tmp.x, point_size_tmp.x, point_imm.y */
>        tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_MAX,
>                   TGSI_FILE_TEMPORARY, ts->point_size_tmp, TGSI_WRITEMASK_X,
>                   TGSI_FILE_TEMPORARY, ts->point_size_tmp, TGSI_SWIZZLE_X,
> -                 TGSI_FILE_IMMEDIATE, ts->point_imm, TGSI_SWIZZLE_Y);
> +                 TGSI_FILE_IMMEDIATE, ts->point_imm, TGSI_SWIZZLE_Y, false);
>
>        /* MIN point_size_tmp.x, point_size_tmp.x, point_ivp.w */
>        tgsi_transform_op2_swz_inst(ctx, TGSI_OPCODE_MIN,
>                   TGSI_FILE_TEMPORARY, ts->point_size_tmp, TGSI_WRITEMASK_X,
>                   TGSI_FILE_TEMPORARY, ts->point_size_tmp, TGSI_SWIZZLE_X,
> -                 TGSI_FILE_CONSTANT, ts->point_ivp, TGSI_SWIZZLE_W);
> +                 TGSI_FILE_CONSTANT, ts->point_ivp, TGSI_SWIZZLE_W, false);
>     }
>     else if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT &&
>              inst->Dst[0].Register.Index == ts->point_pos_out) {
>        /**
>         * Replace point pos output reg with tmp reg.
>         */
>        inst->Dst[0].Register.File = TGSI_FILE_TEMPORARY;
>        inst->Dst[0].Register.Index = ts->point_pos_tmp;
>        ctx->emit_instruction(ctx, inst);
>     }
> diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
> index c21ff95..7ea8206 100644
> --- a/src/gallium/auxiliary/tgsi/tgsi_transform.h
> +++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
> @@ -274,35 +274,37 @@ tgsi_transform_op1_inst(struct tgsi_transform_context *ctx,
>
>  static inline void
>  tgsi_transform_op2_inst(struct tgsi_transform_context *ctx,
>                          unsigned opcode,
>                          unsigned dst_file,
>                          unsigned dst_index,
>                          unsigned dst_writemask,
>                          unsigned src0_file,
>                          unsigned src0_index,
>                          unsigned src1_file,
> -                        unsigned src1_index)
> +                        unsigned src1_index,
> +                        bool src1_negate)
>  {
>     struct tgsi_full_instruction inst;
>
>     inst = tgsi_default_full_instruction();
>     inst.Instruction.Opcode = opcode;
>     inst.Instruction.NumDstRegs = 1;
>     inst.Dst[0].Register.File = dst_file,
>     inst.Dst[0].Register.Index = dst_index;
>     inst.Dst[0].Register.WriteMask = dst_writemask;
>     inst.Instruction.NumSrcRegs = 2;
>     inst.Src[0].Register.File = src0_file;
>     inst.Src[0].Register.Index = src0_index;
>     inst.Src[1].Register.File = src1_file;
>     inst.Src[1].Register.Index = src1_index;
> +   inst.Src[1].Register.Negate = src1_negate;
>
>     ctx->emit_instruction(ctx, &inst);
>  }
>
>
>  static inline void
>  tgsi_transform_op3_inst(struct tgsi_transform_context *ctx,
>                          unsigned opcode,
>                          unsigned dst_file,
>                          unsigned dst_index,
> @@ -381,35 +383,37 @@ static inline void
>  tgsi_transform_op2_swz_inst(struct tgsi_transform_context *ctx,
>                              unsigned opcode,
>                              unsigned dst_file,
>                              unsigned dst_index,
>                              unsigned dst_writemask,
>                              unsigned src0_file,
>                              unsigned src0_index,
>                              unsigned src0_swizzle,
>                              unsigned src1_file,
>                              unsigned src1_index,
> -                            unsigned src1_swizzle)
> +                            unsigned src1_swizzle,
> +                            bool src1_negate)
>  {
>     struct tgsi_full_instruction inst;
>
>     inst = tgsi_default_full_instruction();
>     inst.Instruction.Opcode = opcode;
>     inst.Instruction.NumDstRegs = 1;
>     inst.Dst[0].Register.File = dst_file,
>     inst.Dst[0].Register.Index = dst_index;
>     inst.Dst[0].Register.WriteMask = dst_writemask;
>     inst.Instruction.NumSrcRegs = 2;
>     inst.Src[0].Register.File = src0_file;
>     inst.Src[0].Register.Index = src0_index;
>     inst.Src[1].Register.File = src1_file;
>     inst.Src[1].Register.Index = src1_index;
> +   inst.Src[1].Register.Negate = src1_negate;
>     switch (dst_writemask) {
>     case TGSI_WRITEMASK_X:
>        inst.Src[0].Register.SwizzleX = src0_swizzle;
>        inst.Src[1].Register.SwizzleX = src1_swizzle;
>        break;
>     case TGSI_WRITEMASK_Y:
>        inst.Src[0].Register.SwizzleY = src0_swizzle;
>        inst.Src[1].Register.SwizzleY = src1_swizzle;
>        break;
>     case TGSI_WRITEMASK_Z:
> diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
> index 4f02829..4a6a2ae 100644
> --- a/src/gallium/auxiliary/tgsi/tgsi_util.c
> +++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
> @@ -185,21 +185,20 @@ tgsi_util_get_inst_usage_mask(const struct tgsi_full_instruction *inst,
>     case TGSI_OPCODE_ARR:
>     case TGSI_OPCODE_RCP:
>     case TGSI_OPCODE_MUL:
>     case TGSI_OPCODE_DIV:
>     case TGSI_OPCODE_ADD:
>     case TGSI_OPCODE_MIN:
>     case TGSI_OPCODE_MAX:
>     case TGSI_OPCODE_SLT:
>     case TGSI_OPCODE_SGE:
>     case TGSI_OPCODE_MAD:
> -   case TGSI_OPCODE_SUB:
>     case TGSI_OPCODE_LRP:
>     case TGSI_OPCODE_FMA:
>     case TGSI_OPCODE_FRC:
>     case TGSI_OPCODE_CEIL:
>     case TGSI_OPCODE_CLAMP:
>     case TGSI_OPCODE_FLR:
>     case TGSI_OPCODE_ROUND:
>     case TGSI_OPCODE_POW:
>     case TGSI_OPCODE_COS:
>     case TGSI_OPCODE_SIN:
> diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c
> index f6ea535..ae4cfa1 100644
> --- a/src/gallium/auxiliary/util/u_pstipple.c
> +++ b/src/gallium/auxiliary/util/u_pstipple.c
> @@ -337,21 +337,21 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
>      * texcoords.  Darn.
>      */
>
>     /* XXX invert wincoord if origin isn't lower-left... */
>
>     /* MUL texTemp, INPUT[wincoord], 1/32; */
>     tgsi_transform_op2_inst(ctx, TGSI_OPCODE_MUL,
>                             TGSI_FILE_TEMPORARY, texTemp,
>                             TGSI_WRITEMASK_XYZW,
>                             pctx->wincoordFile, wincoordInput,
> -                           TGSI_FILE_IMMEDIATE, pctx->numImmed);
> +                           TGSI_FILE_IMMEDIATE, pctx->numImmed, false);
>
>     /* TEX texTemp, texTemp, sampler, 2D; */
>     tgsi_transform_tex_inst(ctx,
>                             TGSI_FILE_TEMPORARY, texTemp,
>                             TGSI_FILE_TEMPORARY, texTemp,
>                             TGSI_TEXTURE_2D, sampIdx);
>
>     /* KILL_IF -texTemp;   # if -texTemp < 0, kill fragment */
>     tgsi_transform_kill_inst(ctx,
>                              TGSI_FILE_TEMPORARY, texTemp,
> diff --git a/src/gallium/auxiliary/vl/vl_bicubic_filter.c b/src/gallium/auxiliary/vl/vl_bicubic_filter.c
> index 0364d43..774702c 100644
> --- a/src/gallium/auxiliary/vl/vl_bicubic_filter.c
> +++ b/src/gallium/auxiliary/vl/vl_bicubic_filter.c
> @@ -179,22 +179,22 @@ create_frag_shader(struct vl_bicubic_filter *filter, unsigned video_width,
>     t = ureg_DECL_temporary(shader);
>
>     half_pixel = ureg_DECL_constant(shader, 0);
>     o_fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);
>
>     /*
>      * temp = (i_vtex - (0.5/dst_size)) * i_size)
>      * t = frac(temp)
>      * vtex = floor(i_vtex)/i_size
>      */
> -   ureg_SUB(shader, ureg_writemask(t_array[21], TGSI_WRITEMASK_XY),
> -            i_vtex, half_pixel);
> +   ureg_ADD(shader, ureg_writemask(t_array[21], TGSI_WRITEMASK_XY),
> +            i_vtex, ureg_negate(half_pixel));
>     ureg_MUL(shader, ureg_writemask(t_array[22], TGSI_WRITEMASK_XY),
>              ureg_src(t_array[21]), ureg_imm2f(shader, video_width, video_height));
>     ureg_FRC(shader, ureg_writemask(t, TGSI_WRITEMASK_XY),
>              ureg_src(t_array[22]));
>
>     ureg_FLR(shader, ureg_writemask(t_array[22], TGSI_WRITEMASK_XY),
>              ureg_src(t_array[22]));
>     ureg_DIV(shader, ureg_writemask(t_array[22], TGSI_WRITEMASK_XY),
>              ureg_src(t_array[22]), ureg_imm2f(shader, video_width, video_height));
>     ureg_ADD(shader, ureg_writemask(t_array[22], TGSI_WRITEMASK_XY),
> diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
> index 03a0a64..e22e389 100644
> --- a/src/gallium/auxiliary/vl/vl_compositor.c
> +++ b/src/gallium/auxiliary/vl/vl_compositor.c
> @@ -144,22 +144,22 @@ create_frag_shader_weave(struct ureg_program *shader, struct ureg_dst fragment)
>        t_tc[i] = ureg_DECL_temporary(shader);
>        t_texel[i] = ureg_DECL_temporary(shader);
>     }
>
>     /* calculate the texture offsets
>      * t_tc.x = i_tc.x
>      * t_tc.y = (round(i_tc.y - 0.5) + 0.5) / height * 2
>      */
>     for (i = 0; i < 2; ++i) {
>        ureg_MOV(shader, ureg_writemask(t_tc[i], TGSI_WRITEMASK_X), i_tc[i]);
> -      ureg_SUB(shader, ureg_writemask(t_tc[i], TGSI_WRITEMASK_YZ),
> -               i_tc[i], ureg_imm1f(shader, 0.5f));
> +      ureg_ADD(shader, ureg_writemask(t_tc[i], TGSI_WRITEMASK_YZ),
> +               i_tc[i], ureg_imm1f(shader, -0.5f));
>        ureg_ROUND(shader, ureg_writemask(t_tc[i], TGSI_WRITEMASK_YZ), ureg_src(t_tc[i]));
>        ureg_MOV(shader, ureg_writemask(t_tc[i], TGSI_WRITEMASK_W),
>                 ureg_imm1f(shader, i ? 1.0f : 0.0f));
>        ureg_ADD(shader, ureg_writemask(t_tc[i], TGSI_WRITEMASK_YZ),
>                 ureg_src(t_tc[i]), ureg_imm1f(shader, 0.5f));
>        ureg_MUL(shader, ureg_writemask(t_tc[i], TGSI_WRITEMASK_Y),
>                 ureg_src(t_tc[i]), ureg_scalar(i_tc[0], TGSI_SWIZZLE_W));
>        ureg_MUL(shader, ureg_writemask(t_tc[i], TGSI_WRITEMASK_Z),
>                 ureg_src(t_tc[i]), ureg_scalar(i_tc[1], TGSI_SWIZZLE_W));
>     }
> diff --git a/src/gallium/auxiliary/vl/vl_deint_filter.c b/src/gallium/auxiliary/vl/vl_deint_filter.c
> index 3ca3b49..2eec5cb 100644
> --- a/src/gallium/auxiliary/vl/vl_deint_filter.c
> +++ b/src/gallium/auxiliary/vl/vl_deint_filter.c
> @@ -166,35 +166,35 @@ create_deint_frag_shader(struct vl_deint_filter *filter, unsigned field,
>     ureg_ADD(shader, t_comp_top, ureg_src(t_tex),
>              ureg_imm4f(shader, sizes->x * 0.5f, sizes->y * -0.5f, 0, 0));
>     ureg_ADD(shader, t_comp_bot, ureg_src(t_tex),
>              ureg_imm4f(shader, sizes->x * -0.5f, sizes->y * 0.5f, 1.0f, 0));
>
>     if (field == 0) {
>        /* interpolating top field -> current field is a bottom field */
>        // cur vs prev2
>        ureg_TEX(shader, t_a, TGSI_TEXTURE_2D_ARRAY, ureg_src(t_comp_bot), sampler_cur);
>        ureg_TEX(shader, t_b, TGSI_TEXTURE_2D_ARRAY, ureg_src(t_comp_bot), sampler_prevprev);
> -      ureg_SUB(shader, ureg_writemask(t_diff, TGSI_WRITEMASK_X), ureg_src(t_a), ureg_src(t_b));
> +      ureg_ADD(shader, ureg_writemask(t_diff, TGSI_WRITEMASK_X), ureg_src(t_a), ureg_negate(ureg_src(t_b)));
>        // prev vs next
>        ureg_TEX(shader, t_a, TGSI_TEXTURE_2D_ARRAY, ureg_src(t_comp_top), sampler_prev);
>        ureg_TEX(shader, t_b, TGSI_TEXTURE_2D_ARRAY, ureg_src(t_comp_top), sampler_next);
> -      ureg_SUB(shader, ureg_writemask(t_diff, TGSI_WRITEMASK_Y), ureg_src(t_a), ureg_src(t_b));
> +      ureg_ADD(shader, ureg_writemask(t_diff, TGSI_WRITEMASK_Y), ureg_src(t_a), ureg_negate(ureg_src(t_b)));
>     } else {
>        /* interpolating bottom field -> current field is a top field */
>        // cur vs prev2
>        ureg_TEX(shader, t_a, TGSI_TEXTURE_2D_ARRAY, ureg_src(t_comp_top), sampler_cur);
>        ureg_TEX(shader, t_b, TGSI_TEXTURE_2D_ARRAY, ureg_src(t_comp_top), sampler_prevprev);
> -      ureg_SUB(shader, ureg_writemask(t_diff, TGSI_WRITEMASK_X), ureg_src(t_a), ureg_src(t_b));
> +      ureg_ADD(shader, ureg_writemask(t_diff, TGSI_WRITEMASK_X), ureg_src(t_a), ureg_negate(ureg_src(t_b)));
>        // prev vs next
>        ureg_TEX(shader, t_a, TGSI_TEXTURE_2D_ARRAY, ureg_src(t_comp_bot), sampler_prev);
>        ureg_TEX(shader, t_b, TGSI_TEXTURE_2D_ARRAY, ureg_src(t_comp_bot), sampler_next);
> -      ureg_SUB(shader, ureg_writemask(t_diff, TGSI_WRITEMASK_Y), ureg_src(t_a), ureg_src(t_b));
> +      ureg_ADD(shader, ureg_writemask(t_diff, TGSI_WRITEMASK_Y), ureg_src(t_a), ureg_negate(ureg_src(t_b)));
>     }
>
>     // absolute maximum of differences
>     ureg_MAX(shader, ureg_writemask(t_diff, TGSI_WRITEMASK_X), ureg_abs(ureg_src(t_diff)),
>              ureg_scalar(ureg_abs(ureg_src(t_diff)), TGSI_SWIZZLE_Y));
>
>     if (field == 0) {
>        /* weave with prev top field */
>        ureg_TEX(shader, t_weave, TGSI_TEXTURE_2D_ARRAY, ureg_src(t_tex), sampler_prev);
>        /* get linear interpolation from current bottom field */
> diff --git a/src/gallium/drivers/i915/i915_fpc_optimize.c b/src/gallium/drivers/i915/i915_fpc_optimize.c
> index 7c3b9a9..5f2a876 100644
> --- a/src/gallium/drivers/i915/i915_fpc_optimize.c
> +++ b/src/gallium/drivers/i915/i915_fpc_optimize.c
> @@ -108,21 +108,20 @@ static const struct {
>     [ TGSI_OPCODE_RSQ     ] = { false,  false,                  0,  1,  1 },
>     [ TGSI_OPCODE_SCS     ] = { false,  false,                  0,  1,  1 },
>     [ TGSI_OPCODE_SEQ     ] = { false,  false,                  0,  1,  2 },
>     [ TGSI_OPCODE_SGE     ] = { false,  false,                  0,  1,  2 },
>     [ TGSI_OPCODE_SGT     ] = { false,  false,                  0,  1,  2 },
>     [ TGSI_OPCODE_SIN     ] = { false,  false,                  0,  1,  1 },
>     [ TGSI_OPCODE_SLE     ] = { false,  false,                  0,  1,  2 },
>     [ TGSI_OPCODE_SLT     ] = { false,  false,                  0,  1,  2 },
>     [ TGSI_OPCODE_SNE     ] = { false,  false,                  0,  1,  2 },
>     [ TGSI_OPCODE_SSG     ] = { false,  false,                  0,  1,  1 },
> -   [ TGSI_OPCODE_SUB     ] = { false,  false,                  0,  1,  2 },
>     [ TGSI_OPCODE_TEX     ] = {  true,  false,                  0,  1,  2 },
>     [ TGSI_OPCODE_TRUNC   ] = { false,  false,                  0,  1,  1 },
>     [ TGSI_OPCODE_TXB     ] = {  true,  false,                  0,  1,  2 },
>     [ TGSI_OPCODE_TXP     ] = {  true,  false,                  0,  1,  2 },
>     [ TGSI_OPCODE_XPD     ] = { false,  false,                  0,  1,  2 },
>  };
>
>  static boolean op_has_dst(unsigned opcode)
>  {
>     return (op_table[opcode].num_dst > 0);
> diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
> index 80caf31..241c92d 100644
> --- a/src/gallium/drivers/i915/i915_fpc_translate.c
> +++ b/src/gallium/drivers/i915/i915_fpc_translate.c
> @@ -1015,31 +1015,20 @@ i915_translate_instruction(struct i915_fp_compile *p,
>                        src0, 0);
>
>        i915_emit_arith(p,
>                        A0_ADD,
>                        get_result_vector(p, &inst->Dst[0]),
>                        A0_DEST_CHANNEL_ALL, 0,
>                        get_result_vector(p, &inst->Dst[0]),
>                        negate(tmp, 1, 1, 1, 1), 0);
>        break;
>
> -   case TGSI_OPCODE_SUB:
> -      src0 = src_vector(p, &inst->Src[0], fs);
> -      src1 = src_vector(p, &inst->Src[1], fs);
> -
> -      i915_emit_arith(p,
> -                      A0_ADD,
> -                      get_result_vector(p, &inst->Dst[0]),
> -                      get_result_flags(inst), 0,
> -                      src0, negate(src1, 1, 1, 1, 1), 0);
> -      break;
> -
>     case TGSI_OPCODE_TEX:
>        emit_tex(p, inst, T0_TEXLD, fs);
>        break;
>
>     case TGSI_OPCODE_TRUNC:
>        emit_simple_arith(p, inst, A0_TRC, 1, fs);
>        break;
>
>     case TGSI_OPCODE_TXB:
>        emit_tex(p, inst, T0_TEXLDB, fs);
> diff --git a/src/gallium/drivers/ilo/shader/toy_tgsi.c b/src/gallium/drivers/ilo/shader/toy_tgsi.c
> index a88f189..4d813f0 100644
> --- a/src/gallium/drivers/ilo/shader/toy_tgsi.c
> +++ b/src/gallium/drivers/ilo/shader/toy_tgsi.c
> @@ -43,21 +43,20 @@ static const struct {
>     [TGSI_OPCODE_RCP]          = { TOY_OPCODE_INV,                 1, 1 },
>     [TGSI_OPCODE_RSQ]          = { TOY_OPCODE_RSQ,                 1, 1 },
>     [TGSI_OPCODE_MUL]          = { GEN6_OPCODE_MUL,                 1, 2 },
>     [TGSI_OPCODE_ADD]          = { GEN6_OPCODE_ADD,                 1, 2 },
>     [TGSI_OPCODE_DP3]          = { GEN6_OPCODE_DP3,                 1, 2 },
>     [TGSI_OPCODE_DP4]          = { GEN6_OPCODE_DP4,                 1, 2 },
>     [TGSI_OPCODE_MIN]          = { GEN6_OPCODE_SEL,                 1, 2 },
>     [TGSI_OPCODE_MAX]          = { GEN6_OPCODE_SEL,                 1, 2 },
>     /* a later pass will move src[2] to accumulator */
>     [TGSI_OPCODE_MAD]          = { GEN6_OPCODE_MAC,                 1, 3 },
> -   [TGSI_OPCODE_SUB]          = { GEN6_OPCODE_ADD,                 1, 2 },
>     [TGSI_OPCODE_SQRT]         = { TOY_OPCODE_SQRT,                1, 1 },
>     [TGSI_OPCODE_FRC]          = { GEN6_OPCODE_FRC,                 1, 1 },
>     [TGSI_OPCODE_FLR]          = { GEN6_OPCODE_RNDD,                1, 1 },
>     [TGSI_OPCODE_ROUND]        = { GEN6_OPCODE_RNDE,                1, 1 },
>     [TGSI_OPCODE_EX2]          = { TOY_OPCODE_EXP,                 1, 1 },
>     [TGSI_OPCODE_LG2]          = { TOY_OPCODE_LOG,                 1, 1 },
>     [TGSI_OPCODE_POW]          = { TOY_OPCODE_POW,                 1, 2 },
>     [TGSI_OPCODE_DPH]          = { GEN6_OPCODE_DPH,                 1, 2 },
>     [TGSI_OPCODE_COS]          = { TOY_OPCODE_COS,                 1, 1 },
>     [TGSI_OPCODE_KILL]         = { TOY_OPCODE_KIL,                 0, 0 },
> @@ -137,23 +136,20 @@ aos_simple(struct toy_compiler *tc,
>     case TGSI_OPCODE_MIN:
>     case TGSI_OPCODE_IMIN:
>     case TGSI_OPCODE_UMIN:
>        cond_modifier = GEN6_COND_L;
>        break;
>     case TGSI_OPCODE_MAX:
>     case TGSI_OPCODE_IMAX:
>     case TGSI_OPCODE_UMAX:
>        cond_modifier = GEN6_COND_GE;
>        break;
> -   case TGSI_OPCODE_SUB:
> -      src[1] = tsrc_negate(src[1]);
> -      break;
>     case TGSI_OPCODE_IABS:
>        src[0] = tsrc_absolute(src[0]);
>        break;
>     case TGSI_OPCODE_IF:
>        cond_modifier = GEN6_COND_NZ;
>        num_src = 2;
>        assert(src[0].type == TOY_TYPE_F);
>        src[0] = tsrc_swizzle1(src[0], TOY_SWIZZLE_X);
>        src[1] = tsrc_imm_f(0.0f);
>        break;
> @@ -769,21 +765,20 @@ static const toy_tgsi_translate aos_translate_table[TGSI_OPCODE_LAST] = {
>     [TGSI_OPCODE_MUL]          = aos_simple,
>     [TGSI_OPCODE_ADD]          = aos_simple,
>     [TGSI_OPCODE_DP3]          = aos_simple,
>     [TGSI_OPCODE_DP4]          = aos_simple,
>     [TGSI_OPCODE_DST]          = aos_DST,
>     [TGSI_OPCODE_MIN]          = aos_simple,
>     [TGSI_OPCODE_MAX]          = aos_simple,
>     [TGSI_OPCODE_SLT]          = aos_set_on_cond,
>     [TGSI_OPCODE_SGE]          = aos_set_on_cond,
>     [TGSI_OPCODE_MAD]          = aos_simple,
> -   [TGSI_OPCODE_SUB]          = aos_simple,
>     [TGSI_OPCODE_LRP]          = aos_LRP,
>     [TGSI_OPCODE_SQRT]         = aos_simple,
>     [TGSI_OPCODE_DP2A]         = aos_DP2A,
>     [TGSI_OPCODE_FRC]          = aos_simple,
>     [TGSI_OPCODE_CLAMP]        = aos_CLAMP,
>     [TGSI_OPCODE_FLR]          = aos_simple,
>     [TGSI_OPCODE_ROUND]        = aos_simple,
>     [TGSI_OPCODE_EX2]          = aos_simple,
>     [TGSI_OPCODE_LG2]          = aos_simple,
>     [TGSI_OPCODE_POW]          = aos_simple,
> @@ -1311,21 +1306,20 @@ static const toy_tgsi_translate soa_translate_table[TGSI_OPCODE_LAST] = {
>     [TGSI_OPCODE_MUL]          = soa_per_channel,
>     [TGSI_OPCODE_ADD]          = soa_per_channel,
>     [TGSI_OPCODE_DP3]          = soa_dot_product,
>     [TGSI_OPCODE_DP4]          = soa_dot_product,
>     [TGSI_OPCODE_DST]          = soa_DST,
>     [TGSI_OPCODE_MIN]          = soa_per_channel,
>     [TGSI_OPCODE_MAX]          = soa_per_channel,
>     [TGSI_OPCODE_SLT]          = soa_per_channel,
>     [TGSI_OPCODE_SGE]          = soa_per_channel,
>     [TGSI_OPCODE_MAD]          = soa_per_channel,
> -   [TGSI_OPCODE_SUB]          = soa_per_channel,
>     [TGSI_OPCODE_LRP]          = soa_per_channel,
>     [TGSI_OPCODE_SQRT]         = soa_scalar_replicate,
>     [TGSI_OPCODE_DP2A]         = soa_dot_product,
>     [TGSI_OPCODE_FRC]          = soa_per_channel,
>     [TGSI_OPCODE_CLAMP]        = soa_per_channel,
>     [TGSI_OPCODE_FLR]          = soa_per_channel,
>     [TGSI_OPCODE_ROUND]        = soa_per_channel,
>     [TGSI_OPCODE_EX2]          = soa_scalar_replicate,
>     [TGSI_OPCODE_LG2]          = soa_scalar_replicate,
>     [TGSI_OPCODE_POW]          = soa_scalar_replicate,
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
> index b919098..86348e7 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
> @@ -716,21 +716,20 @@ static nv50_ir::operation translateOpcode(uint opcode)
>
>     NV50_IR_OPCODE_CASE(MUL, MUL);
>     NV50_IR_OPCODE_CASE(ADD, ADD);
>
>     NV50_IR_OPCODE_CASE(MIN, MIN);
>     NV50_IR_OPCODE_CASE(MAX, MAX);
>     NV50_IR_OPCODE_CASE(SLT, SET);
>     NV50_IR_OPCODE_CASE(SGE, SET);
>     NV50_IR_OPCODE_CASE(MAD, MAD);
>     NV50_IR_OPCODE_CASE(FMA, FMA);
> -   NV50_IR_OPCODE_CASE(SUB, SUB);
>
>     NV50_IR_OPCODE_CASE(FLR, FLOOR);
>     NV50_IR_OPCODE_CASE(ROUND, CVT);
>     NV50_IR_OPCODE_CASE(EX2, EX2);
>     NV50_IR_OPCODE_CASE(LG2, LG2);
>     NV50_IR_OPCODE_CASE(POW, POW);
>
>     NV50_IR_OPCODE_CASE(COS, COS);
>     NV50_IR_OPCODE_CASE(DDX, DFDX);
>     NV50_IR_OPCODE_CASE(DDX_FINE, DFDX);
> @@ -2981,21 +2980,20 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
>     case TGSI_OPCODE_MOD:
>     case TGSI_OPCODE_UMOD:
>     case TGSI_OPCODE_MUL:
>     case TGSI_OPCODE_UMUL:
>     case TGSI_OPCODE_IMUL_HI:
>     case TGSI_OPCODE_UMUL_HI:
>     case TGSI_OPCODE_OR:
>     case TGSI_OPCODE_SHL:
>     case TGSI_OPCODE_ISHR:
>     case TGSI_OPCODE_USHR:
> -   case TGSI_OPCODE_SUB:
>     case TGSI_OPCODE_XOR:
>        FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
>           src0 = fetchSrc(0, c);
>           src1 = fetchSrc(1, c);
>           geni = mkOp2(op, dstTy, dst0[c], src0, src1);
>           geni->subOp = tgsi::opcodeToSubOp(tgsi.getOpcode());
>        }
>        break;
>     case TGSI_OPCODE_MAD:
>     case TGSI_OPCODE_UMAD:
> diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
> index d031c68..4924d21 100644
> --- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
> +++ b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
> @@ -743,23 +743,20 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
>        insn.cc_test = NVFX_COND_GT;
>        nvfx_fp_emit(fpc, insn);
>
>        if(!sat) {
>           insn = arith(0, MOV, dst, mask, minones, none, none);
>           insn.cc_test = NVFX_COND_LT;
>           nvfx_fp_emit(fpc, insn);
>        }
>        break;
>     }
> -   case TGSI_OPCODE_SUB:
> -      nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], neg(src[1]), none));
> -      break;
>     case TGSI_OPCODE_TEX:
>        nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
>        break;
>          case TGSI_OPCODE_TRUNC:
>                  tmp = nvfx_src(temp(fpc));
>                  insn = arith(0, MOV, none.reg, mask, src[0], none, none);
>                  insn.cc_update = 1;
>                  nvfx_fp_emit(fpc, insn);
>
>                  nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, abs(src[0]), none, none));
> diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
> index a802c43..baea701 100644
> --- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
> +++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
> @@ -665,23 +665,20 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
>        break;
>     case TGSI_OPCODE_SLT:
>        nvfx_vp_emit(vpc, arith(sat, VEC, SLT, dst, mask, src[0], src[1], none));
>        break;
>     case TGSI_OPCODE_SNE:
>        nvfx_vp_emit(vpc, arith(sat, VEC, SNE, dst, mask, src[0], src[1], none));
>        break;
>     case TGSI_OPCODE_SSG:
>        nvfx_vp_emit(vpc, arith(sat, VEC, SSG, dst, mask, src[0], none, none));
>        break;
> -   case TGSI_OPCODE_SUB:
> -      nvfx_vp_emit(vpc, arith(sat, VEC, ADD, dst, mask, src[0], none, neg(src[1])));
> -      break;
>     case TGSI_OPCODE_TRUNC:
>        tmp = nvfx_src(temp(vpc));
>        insn = arith(0, VEC, MOV, none.reg, mask, src[0], none, none);
>        insn.cc_update = 1;
>        nvfx_vp_emit(vpc, insn);
>
>        nvfx_vp_emit(vpc, arith(0, VEC, FLR, tmp.reg, mask, abs(src[0]), none, none));
>        nvfx_vp_emit(vpc, arith(sat, VEC, MOV, dst, mask, tmp, none, none));
>
>        insn = arith(sat, VEC, MOV, dst, mask, neg(tmp), none, none);
> diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
> index 9d1e59f..59dfa05 100644
> --- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
> +++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
> @@ -43,21 +43,20 @@ static unsigned translate_opcode(unsigned opcode)
>          case TGSI_OPCODE_MUL: return RC_OPCODE_MUL;
>          case TGSI_OPCODE_ADD: return RC_OPCODE_ADD;
>          case TGSI_OPCODE_DP3: return RC_OPCODE_DP3;
>          case TGSI_OPCODE_DP4: return RC_OPCODE_DP4;
>          case TGSI_OPCODE_DST: return RC_OPCODE_DST;
>          case TGSI_OPCODE_MIN: return RC_OPCODE_MIN;
>          case TGSI_OPCODE_MAX: return RC_OPCODE_MAX;
>          case TGSI_OPCODE_SLT: return RC_OPCODE_SLT;
>          case TGSI_OPCODE_SGE: return RC_OPCODE_SGE;
>          case TGSI_OPCODE_MAD: return RC_OPCODE_MAD;
> -        case TGSI_OPCODE_SUB: return RC_OPCODE_SUB;
>          case TGSI_OPCODE_LRP: return RC_OPCODE_LRP;
>       /* case TGSI_OPCODE_DP2A: return RC_OPCODE_DP2A; */
>          case TGSI_OPCODE_FRC: return RC_OPCODE_FRC;
>          case TGSI_OPCODE_CLAMP: return RC_OPCODE_CLAMP;
>          case TGSI_OPCODE_FLR: return RC_OPCODE_FLR;
>          case TGSI_OPCODE_ROUND: return RC_OPCODE_ROUND;
>          case TGSI_OPCODE_EX2: return RC_OPCODE_EX2;
>          case TGSI_OPCODE_LG2: return RC_OPCODE_LG2;
>          case TGSI_OPCODE_POW: return RC_OPCODE_POW;
>          case TGSI_OPCODE_XPD: return RC_OPCODE_XPD;
> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
> index c2996aa..ebe2744 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -3797,23 +3797,20 @@ static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool
>  				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
>  			}
>  		} else {
>  			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
>  			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
>  		}
>
>  		/* handle some special cases */
>  		if (i == 1 || i == 3) {
>  			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
> -			case TGSI_OPCODE_SUB:
> -				r600_bytecode_src_toggle_neg(&alu.src[1]);
> -				break;
>  			case TGSI_OPCODE_DABS:
>  				r600_bytecode_src_set_abs(&alu.src[0]);
>  				break;
>  			default:
>  				break;
>  			}
>  		}
>  		if (i == lasti) {
>  			alu.last = 1;
>  		}
> @@ -3924,28 +3921,20 @@ static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
>
>  		alu.op = ctx->inst_info->op;
>  		if (!swap) {
>  			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
>  				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
>  			}
>  		} else {
>  			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
>  			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
>  		}
> -		/* handle some special cases */
> -		switch (inst->Instruction.Opcode) {
> -		case TGSI_OPCODE_SUB:
> -			r600_bytecode_src_toggle_neg(&alu.src[1]);
> -			break;
> -		default:
> -			break;
> -		}
>  		if (i == lasti || trans_only) {
>  			alu.last = 1;
>  		}
>  		r = r600_bytecode_add_alu(ctx->bc, &alu);
>  		if (r)
>  			return r;
>  	}
>
>  	if (use_tmp) {
>  		/* move result from temp to dst */
> @@ -8999,21 +8988,20 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[]
>  	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
>  	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
>  	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
>  	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
>  	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
>  	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
>  	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
>  	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
>  	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
>  	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
> -	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
>  	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
>  	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
>  	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
>  	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
>  	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
>  	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
>  	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
>  	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
>  	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
>  	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
> @@ -9198,21 +9186,20 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
>  	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
>  	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
>  	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
>  	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
>  	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
>  	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
>  	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
>  	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
>  	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
>  	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
> -	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
>  	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
>  	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
>  	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
>  	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
>  	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
>  	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
>  	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
>  	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
>  	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
>  	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
> @@ -9421,21 +9408,20 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
>  	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
>  	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
>  	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
>  	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
>  	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
>  	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
>  	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
>  	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
>  	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
>  	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
> -	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
>  	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
>  	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
>  	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
>  	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
>  	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
>  	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
>  	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
>  	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
>  	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
>  	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
> diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
> index 47a0afc..0efd72d 100644
> --- a/src/gallium/drivers/svga/svga_tgsi_insn.c
> +++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
> @@ -1396,44 +1396,20 @@ emit_ssg(struct svga_shader_emitter *emit,
>                      zero ))
>        return FALSE;
>
>     /* ADD  DST, TMP0, TMP1 */
>     return submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst, src( temp0 ),
>                        src( temp1 ) );
>  }
>
>
>  /**
> - * Translate/emit TGSI SUB instruction as:
> - * ADD DST, SRC0, negate(SRC1)
> - */
> -static boolean
> -emit_sub(struct svga_shader_emitter *emit,
> -         const struct tgsi_full_instruction *insn)
> -{
> -   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
> -   struct src_register src0 = translate_src_register(
> -      emit, &insn->Src[0] );
> -   struct src_register src1 = translate_src_register(
> -      emit, &insn->Src[1] );
> -
> -   src1 = negate(src1);
> -
> -   if (!submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst,
> -                    src0, src1 ))
> -      return FALSE;
> -
> -   return TRUE;
> -}
> -
> -
> -/**
>   * Translate/emit KILL_IF instruction (kill if any of X,Y,Z,W are negative).
>   */
>  static boolean
>  emit_kill_if(struct svga_shader_emitter *emit,
>               const struct tgsi_full_instruction *insn)
>  {
>     const struct tgsi_full_src_register *reg = &insn->Src[0];
>     struct src_register src0, srcIn;
>     const boolean special = (reg->Register.Absolute ||
>                              reg->Register.Negate ||
> @@ -2982,23 +2958,20 @@ svga_emit_instruction(struct svga_shader_emitter *emit,
>
>     case TGSI_OPCODE_SGE:
>        return emit_select_op( emit, PIPE_FUNC_GEQUAL, insn );
>
>     case TGSI_OPCODE_SLT:
>        return emit_select_op( emit, PIPE_FUNC_LESS, insn );
>
>     case TGSI_OPCODE_SLE:
>        return emit_select_op( emit, PIPE_FUNC_LEQUAL, insn );
>
> -   case TGSI_OPCODE_SUB:
> -      return emit_sub( emit, insn );
> -
>     case TGSI_OPCODE_POW:
>        return emit_pow( emit, insn );
>
>     case TGSI_OPCODE_EX2:
>        return emit_ex2( emit, insn );
>
>     case TGSI_OPCODE_EXP:
>        return emit_exp( emit, insn );
>
>     case TGSI_OPCODE_LOG:
> diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
> index e7cfb40..3131444 100644
> --- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
> +++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
> @@ -4708,43 +4708,20 @@ emit_issg(struct svga_shader_emitter_v10 *emit,
>     emit_instruction_op2(emit, VGPU10_OPCODE_IADD, &inst->Dst[0],
>                          &tmp1_src, &neg_tmp2, FALSE);
>
>     free_temp_indexes(emit);
>
>     return TRUE;
>  }
>
>
>  /**
> - * Emit code for TGSI_OPCODE_SUB instruction.
> - */
> -static boolean
> -emit_sub(struct svga_shader_emitter_v10 *emit,
> -         const struct tgsi_full_instruction *inst)
> -{
> -   /* dst = SUB(s0, s1):
> -    *   dst = s0 - s1
> -    * Translates into:
> -    *   ADD dst, s0, neg(s1)
> -    */
> -   struct tgsi_full_src_register neg_src1 = negate_src(&inst->Src[1]);
> -
> -   /* ADD dst, s0, neg(s1) */
> -   emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &inst->Dst[0],
> -                        &inst->Src[0], &neg_src1,
> -                        inst->Instruction.Saturate);
> -
> -   return TRUE;
> -}
> -
> -
> -/**
>   * Emit a comparison instruction.  The dest register will get
>   * 0 or ~0 values depending on the outcome of comparing src0 to src1.
>   */
>  static void
>  emit_comparison(struct svga_shader_emitter_v10 *emit,
>                  SVGA3dCmpFunc func,
>                  const struct tgsi_full_dst_register *dst,
>                  const struct tgsi_full_src_register *src0,
>                  const struct tgsi_full_src_register *src1)
>  {
> @@ -5794,22 +5771,20 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
>     case TGSI_OPCODE_SLE:
>        return emit_sle(emit, inst);
>     case TGSI_OPCODE_SLT:
>        return emit_slt(emit, inst);
>     case TGSI_OPCODE_SNE:
>        return emit_sne(emit, inst);
>     case TGSI_OPCODE_SSG:
>        return emit_ssg(emit, inst);
>     case TGSI_OPCODE_ISSG:
>        return emit_issg(emit, inst);
> -   case TGSI_OPCODE_SUB:
> -      return emit_sub(emit, inst);
>     case TGSI_OPCODE_TEX:
>        return emit_tex(emit, inst);
>     case TGSI_OPCODE_TXP:
>        return emit_txp(emit, inst);
>     case TGSI_OPCODE_TXB:
>     case TGSI_OPCODE_TXB2:
>     case TGSI_OPCODE_TXL:
>        return emit_txl_txb(emit, inst);
>     case TGSI_OPCODE_TXD:
>        return emit_txd(emit, inst);
> diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
> index 3538090..3384035 100644
> --- a/src/gallium/include/pipe/p_shader_tokens.h
> +++ b/src/gallium/include/pipe/p_shader_tokens.h
> @@ -338,21 +338,21 @@ struct tgsi_property_data {
>  #define TGSI_OPCODE_MUL                 7
>  #define TGSI_OPCODE_ADD                 8
>  #define TGSI_OPCODE_DP3                 9
>  #define TGSI_OPCODE_DP4                 10
>  #define TGSI_OPCODE_DST                 11
>  #define TGSI_OPCODE_MIN                 12
>  #define TGSI_OPCODE_MAX                 13
>  #define TGSI_OPCODE_SLT                 14
>  #define TGSI_OPCODE_SGE                 15
>  #define TGSI_OPCODE_MAD                 16
> -#define TGSI_OPCODE_SUB                 17
> +/* gap */
>  #define TGSI_OPCODE_LRP                 18
>  #define TGSI_OPCODE_FMA                 19
>  #define TGSI_OPCODE_SQRT                20
>  #define TGSI_OPCODE_DP2A                21
>  #define TGSI_OPCODE_F2U64               22
>  #define TGSI_OPCODE_F2I64               23
>  #define TGSI_OPCODE_FRC                 24
>  #define TGSI_OPCODE_CLAMP               25
>  #define TGSI_OPCODE_FLR                 26
>  #define TGSI_OPCODE_ROUND               27
> diff --git a/src/gallium/state_trackers/xa/xa_tgsi.c b/src/gallium/state_trackers/xa/xa_tgsi.c
> index f3f665d..344a576 100644
> --- a/src/gallium/state_trackers/xa/xa_tgsi.c
> +++ b/src/gallium/state_trackers/xa/xa_tgsi.c
> @@ -232,24 +232,24 @@ radial_gradient(struct ureg_program *ureg,
>  	     ureg_scalar(ureg_src(temp5), TGSI_SWIZZLE_Y),
>  	     ureg_scalar(ureg_src(temp5), TGSI_SWIZZLE_Y));
>      ureg_MAD(ureg, temp4,
>  	     ureg_scalar(ureg_src(temp5), TGSI_SWIZZLE_X),
>  	     ureg_scalar(ureg_src(temp5), TGSI_SWIZZLE_X), ureg_src(temp3));
>      ureg_MOV(ureg, temp4, ureg_negate(ureg_src(temp4)));
>      ureg_MUL(ureg, temp2, ureg_scalar(coords, TGSI_SWIZZLE_Z), ureg_src(temp4));
>      ureg_MUL(ureg, temp0,
>  	     ureg_scalar(const0124, TGSI_SWIZZLE_W), ureg_src(temp2));
>      ureg_MUL(ureg, temp3, ureg_src(temp1), ureg_src(temp1));
> -    ureg_SUB(ureg, temp2, ureg_src(temp3), ureg_src(temp0));
> +    ureg_ADD(ureg, temp2, ureg_src(temp3), ureg_negate(ureg_src(temp0)));
>      ureg_RSQ(ureg, temp2, ureg_abs(ureg_src(temp2)));
>      ureg_RCP(ureg, temp2, ureg_src(temp2));
> -    ureg_SUB(ureg, temp1, ureg_src(temp2), ureg_src(temp1));
> +    ureg_ADD(ureg, temp1, ureg_src(temp2), ureg_negate(ureg_src(temp1)));
>      ureg_ADD(ureg, temp0,
>  	     ureg_scalar(coords, TGSI_SWIZZLE_Z),
>  	     ureg_scalar(coords, TGSI_SWIZZLE_Z));
>      ureg_RCP(ureg, temp0, ureg_src(temp0));
>      ureg_MUL(ureg, temp2, ureg_src(temp1), ureg_src(temp0));
>      ureg_TEX(ureg, out, TGSI_TEXTURE_1D, ureg_src(temp2), sampler);
>
>      ureg_release_temporary(ureg, temp0);
>      ureg_release_temporary(ureg, temp1);
>      ureg_release_temporary(ureg, temp2);
> diff --git a/src/mesa/state_tracker/st_atifs_to_tgsi.c b/src/mesa/state_tracker/st_atifs_to_tgsi.c
> index 3aa7f84..b28c55c 100644
> --- a/src/mesa/state_tracker/st_atifs_to_tgsi.c
> +++ b/src/mesa/state_tracker/st_atifs_to_tgsi.c
> @@ -59,21 +59,21 @@ struct instruction_desc {
>     unsigned TGSI_opcode;
>     const char *name;
>     unsigned char arg_count;
>  };
>
>  static const struct instruction_desc inst_desc[] = {
>     {TGSI_OPCODE_MOV, "MOV", 1},
>     {TGSI_OPCODE_NOP, "UND", 0}, /* unused */
>     {TGSI_OPCODE_ADD, "ADD", 2},
>     {TGSI_OPCODE_MUL, "MUL", 2},
> -   {TGSI_OPCODE_SUB, "SUB", 2},
> +   {TGSI_OPCODE_NOP, "SUB", 2},
>     {TGSI_OPCODE_DP3, "DOT3", 2},
>     {TGSI_OPCODE_DP4, "DOT4", 2},
>     {TGSI_OPCODE_MAD, "MAD", 3},
>     {TGSI_OPCODE_LRP, "LERP", 3},
>     {TGSI_OPCODE_NOP, "CND", 3},
>     {TGSI_OPCODE_NOP, "CND0", 3},
>     {TGSI_OPCODE_NOP, "DOT2_ADD", 3}
>  };
>
>  static struct ureg_dst
> @@ -168,30 +168,30 @@ prepare_argument(struct st_translate *t, const unsigned argId,
>        break;
>     case GL_ALPHA:
>        src = ureg_scalar(src, TGSI_SWIZZLE_W);
>        break;
>     }
>     ureg_insn(t->ureg, TGSI_OPCODE_MOV, &arg, 1, &src, 1);
>
>     if (srcReg->argMod & GL_COMP_BIT_ATI) {
>        struct ureg_src modsrc[2];
>        modsrc[0] = ureg_imm1f(t->ureg, 1.0f);
> -      modsrc[1] = ureg_src(arg);
> +      modsrc[1] = ureg_negate(ureg_src(arg));
>
> -      ureg_insn(t->ureg, TGSI_OPCODE_SUB, &arg, 1, modsrc, 2);
> +      ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, modsrc, 2);
>     }
>     if (srcReg->argMod & GL_BIAS_BIT_ATI) {
>        struct ureg_src modsrc[2];
>        modsrc[0] = ureg_src(arg);
> -      modsrc[1] = ureg_imm1f(t->ureg, 0.5f);
> +      modsrc[1] = ureg_imm1f(t->ureg, -0.5f);
>
> -      ureg_insn(t->ureg, TGSI_OPCODE_SUB, &arg, 1, modsrc, 2);
> +      ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, modsrc, 2);
>     }
>     if (srcReg->argMod & GL_2X_BIT_ATI) {
>        struct ureg_src modsrc[2];
>        modsrc[0] = ureg_src(arg);
>        modsrc[1] = ureg_src(arg);
>
>        ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, modsrc, 2);
>     }
>     if (srcReg->argMod & GL_NEGATE_BIT_ATI) {
>        struct ureg_src modsrc[2];
> @@ -204,25 +204,27 @@ prepare_argument(struct st_translate *t, const unsigned argId,
>  }
>
>  /* These instructions need special treatment */
>  static void
>  emit_special_inst(struct st_translate *t, const struct instruction_desc *desc,
>                    struct ureg_dst *dst, struct ureg_src *args, unsigned argcount)
>  {
>     struct ureg_dst tmp[1];
>     struct ureg_src src[3];
>
> -   if (!strcmp(desc->name, "CND")) {
> +   if (!strcmp(desc->name, "SUB")) {
> +      ureg_ADD(t->ureg, *dst, args[0], ureg_negate(args[1]));
> +   } else if (!strcmp(desc->name, "CND")) {
>        tmp[0] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + 2); /* re-purpose a3 */
>        src[0] = ureg_imm1f(t->ureg, 0.5f);
> -      src[1] = args[2];
> -      ureg_insn(t->ureg, TGSI_OPCODE_SUB, tmp, 1, src, 2);
> +      src[1] = ureg_negate(args[2]);
> +      ureg_insn(t->ureg, TGSI_OPCODE_ADD, tmp, 1, src, 2);
>        src[0] = ureg_src(tmp[0]);
>        src[1] = args[0];
>        src[2] = args[1];
>        ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3);
>     } else if (!strcmp(desc->name, "CND0")) {
>        src[0] = args[2];
>        src[1] = args[1];
>        src[2] = args[0];
>        ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3);
>     } else if (!strcmp(desc->name, "DOT2_ADD")) {
> diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> index 1be1f6c..f738084 100644
> --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> @@ -1695,21 +1695,22 @@ glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
>         * driver.
>         */
>        emit_asm(ir, TGSI_OPCODE_MOV, result_dst, st_src_reg_for_float(0.5));
>        break;
>     }
>
>     case ir_binop_add:
>        emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
>        break;
>     case ir_binop_sub:
> -      emit_asm(ir, TGSI_OPCODE_SUB, result_dst, op[0], op[1]);
> +      op[1].negate = 1;
> +      emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
>        break;
>
>     case ir_binop_mul:
>        emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
>        break;
>     case ir_binop_div:
>        if (result_dst.type == GLSL_TYPE_FLOAT || result_dst.type == GLSL_TYPE_DOUBLE)
>           assert(!"not reached: should be handled by ir_div_to_mul_rcp");
>        else
>           emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
> diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
> index 1768356..f906fed 100644
> --- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
> +++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
> @@ -474,22 +474,20 @@ translate_opcode( unsigned op )
>     case OPCODE_RCP:
>        return TGSI_OPCODE_RCP;
>     case OPCODE_SCS:
>        return TGSI_OPCODE_SCS;
>     case OPCODE_SGE:
>        return TGSI_OPCODE_SGE;
>     case OPCODE_SIN:
>        return TGSI_OPCODE_SIN;
>     case OPCODE_SLT:
>        return TGSI_OPCODE_SLT;
> -   case OPCODE_SUB:
> -      return TGSI_OPCODE_SUB;
>     case OPCODE_TEX:
>        return TGSI_OPCODE_TEX;
>     case OPCODE_TXB:
>        return TGSI_OPCODE_TXB;
>     case OPCODE_TXP:
>        return TGSI_OPCODE_TXP;
>     case OPCODE_XPD:
>        return TGSI_OPCODE_XPD;
>     case OPCODE_END:
>        return TGSI_OPCODE_END;
> @@ -559,20 +557,24 @@ compile_instruction(
>        break;
>
>     case OPCODE_RSQ:
>        ureg_RSQ( ureg, dst[0], ureg_abs(src[0]) );
>        break;
>
>     case OPCODE_ABS:
>        ureg_MOV(ureg, dst[0], ureg_abs(src[0]));
>        break;
>
> +   case OPCODE_SUB:
> +      ureg_ADD(ureg, dst[0], src[0], ureg_negate(src[1]));
> +      break;
> +
>     default:
>        ureg_insn( ureg,
>                   translate_opcode( inst->Opcode ),
>                   dst, num_dst,
>                   src, num_src );
>        break;
>     }
>  }
>
>
> diff --git a/src/mesa/state_tracker/st_tgsi_lower_yuv.c b/src/mesa/state_tracker/st_tgsi_lower_yuv.c
> index e346b97..6acd173 100644
> --- a/src/mesa/state_tracker/st_tgsi_lower_yuv.c
> +++ b/src/mesa/state_tracker/st_tgsi_lower_yuv.c
> @@ -251,27 +251,28 @@ yuv_to_rgb(struct tgsi_transform_context *tctx,
>
>     /*
>      * IMM[0] FLT32 { 1.164,  0.000,  1.596,  0.0 }
>      * IMM[1] FLT32 { 1.164, -0.392, -0.813,  0.0 }
>      * IMM[2] FLT32 { 1.164,  2.017,  0.000,  0.0 }
>      * IMM[3] FLT32 { 0.0625, 0.500,  0.500,  1.0 }
>      */
>
>     /* SUB tmpA.xyz, tmpA, imm[3] */
>     inst = tgsi_default_full_instruction();
> -   inst.Instruction.Opcode = TGSI_OPCODE_SUB;
> +   inst.Instruction.Opcode = TGSI_OPCODE_ADD;
>     inst.Instruction.Saturate = 0;
>     inst.Instruction.NumDstRegs = 1;
>     inst.Instruction.NumSrcRegs = 2;
>     reg_dst(&inst.Dst[0], &ctx->tmp[A].dst, TGSI_WRITEMASK_XYZ);
>     reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, _));
>     reg_src(&inst.Src[1], &ctx->imm[3], SWIZ(X, Y, Z, _));
> +   inst.Src[1].Register.Negate = 1;
>     tctx->emit_instruction(tctx, &inst);
>
>     /* DP3 dst.x, tmpA, imm[0] */
>     inst = dp3_instruction();
>     reg_dst(&inst.Dst[0], dst, TGSI_WRITEMASK_X);
>     reg_src(&inst.Src[0], &ctx->tmp[A].src, SWIZ(X, Y, Z, W));
>     reg_src(&inst.Src[1], &ctx->imm[0], SWIZ(X, Y, Z, W));
>     tctx->emit_instruction(tctx, &inst);
>
>     /* DP3 dst.y, tmpA, imm[1] */
>