[Mesa-dev] [PATCH] gallivm: add fp64 support. (v2)

Tue Jun 30 08:37:07 PDT 2015

Am 30.06.2015 um 03:41 schrieb Dave Airlie:
> This adds support for ARB_gpu_shader_fp64 and ARB_vertex_attrib_64bit to
> llvmpipe.
> 
> Two things that don't mix well are SoA and doubles, see
> emit_fetch_double, and emit_store_double_chan in this.
> 
> I've also had to split emit_data.chan, to add src_chan,
> which can be different for doubles.
> 
> It handles indirect double fetches from temps, inputs, constants
> and immediates. It doesn't handle double stores to indirects,
> however it appears the mesa/st doesn't currently emit these,
> it always does UARL/MOV combos, which will work fine.
> 
> tested with piglit, no regressions, all the fp64 tests seem to pass.
> 
> v2:
> switch to using shuffles for fetch/store (Roland)
> assert on indirect double stores - mesa/st never emits these (it uses MOV)
> fix indirect temp/input/constant/immediates (Roland)
> typos/formatting fixes (Roland)
> 
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
>  src/gallium/auxiliary/gallivm/lp_bld_arit.c        |  12 +
>  src/gallium/auxiliary/gallivm/lp_bld_limits.h      |   1 +
>  src/gallium/auxiliary/gallivm/lp_bld_logic.c       |   2 +-
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi.c        |  47 +++-
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi.h        |   4 +
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 246 ++++++++++++++++++++
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h |   5 +
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c    | 256 ++++++++++++++++++---
>  8 files changed, 541 insertions(+), 32 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index 9daa93e..8fba43f 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -1997,6 +1997,12 @@ lp_build_floor(struct lp_build_context *bld,
>        LLVMTypeRef int_vec_type = bld->int_vec_type;
>        LLVMTypeRef vec_type = bld->vec_type;
>  
> +      if (type.width != 32) {
> +         char intrinsic[32];
> +         util_snprintf(intrinsic, sizeof intrinsic, "llvm.floor.v%uf%u", type.length, type.width);
> +         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
> +      }
> +
>        assert(type.width == 32); /* might want to handle doubles at some point */
>  
>        inttype = type;
> @@ -2066,6 +2072,12 @@ lp_build_ceil(struct lp_build_context *bld,
>        LLVMTypeRef int_vec_type = bld->int_vec_type;
>        LLVMTypeRef vec_type = bld->vec_type;
>  
> +      if (type.width != 32) {
> +         char intrinsic[32];
> +         util_snprintf(intrinsic, sizeof intrinsic, "llvm.ceil.v%uf%u", type.length, type.width);
> +         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
> +      }
> +
>        assert(type.width == 32); /* might want to handle doubles at some point */
>  
>        inttype = type;
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
> index 2851fd1..3db7261 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
> @@ -132,6 +132,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
>     case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
>        return 1;
>     case PIPE_SHADER_CAP_DOUBLES:
> +      return 1;
>     case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
>     case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
>     case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
> index 80b53e5..f724cfa 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
> @@ -81,7 +81,7 @@ lp_build_compare_ext(struct gallivm_state *gallivm,
>                       boolean ordered)
>  {
>     LLVMBuilderRef builder = gallivm->builder;
> -   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
> +   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, lp_type_int_vec(32, 32 * type.length));
>     LLVMValueRef zeros = LLVMConstNull(int_vec_type);
>     LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
>     LLVMValueRef cond;
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> index e391d8a..1887956 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> @@ -175,13 +175,52 @@ void lp_build_fetch_args(
>     unsigned src;
>     for (src = 0; src < emit_data->info->num_src; src++) {
>        emit_data->args[src] = lp_build_emit_fetch(bld_base, emit_data->inst, src,
> -                                               emit_data->chan);
> +                                                 emit_data->src_chan);
>     }
>     emit_data->arg_count = emit_data->info->num_src;
>     lp_build_action_set_dst_type(emit_data, bld_base,
>  		emit_data->inst->Instruction.Opcode);
>  }
>  
> +/**
> + * with doubles src and dst channels aren't 1:1.
> + * check the src/dst types for the opcode,
> + * 1. if neither is double then src == dst;
> + * 2. if dest is double
> + *     - don't store to y or w
> + *     - if src is double then src == dst.
> + *     - else for f2d, d.xy = s.x
> + *     - else for f2d, d.zw = s.y
> + * 3. if dst is single, src is double
> + *    - map dst x,z to src xy;
> + *    - map dst y,w to src zw;
> + */
> +static int get_src_chan_idx(unsigned opcode,
> +                            int dst_chan_index)
> +{
> +   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(opcode);
> +   enum tgsi_opcode_type stype = tgsi_opcode_infer_src_type(opcode);
> +
> +   if (dtype != TGSI_TYPE_DOUBLE && stype != TGSI_TYPE_DOUBLE)
> +      return dst_chan_index;
> +   if (dtype == TGSI_TYPE_DOUBLE) {
> +      if (dst_chan_index == 1 || dst_chan_index == 3)
> +         return -1;
> +      if (stype == TGSI_TYPE_DOUBLE)
> +         return dst_chan_index;
> +      if (dst_chan_index == 0)
> +         return 0;
> +      if (dst_chan_index == 2)
> +         return 1;
> +   } else {
> +      if (dst_chan_index == 0 || dst_chan_index == 2)
> +         return 0;
> +      if (dst_chan_index == 1 || dst_chan_index == 3)
> +         return 2;
> +   }
> +   return -1;
> +}
> +
>  /* XXX: COMMENT
>   * It should be assumed that this function ignores writemasks
>   */
> @@ -197,7 +236,6 @@ lp_build_tgsi_inst_llvm(
>     struct lp_build_emit_data emit_data;
>     unsigned chan_index;
>     LLVMValueRef val;
> -
>     bld_base->pc++;
>  
>     if (bld_base->emit_debug) {
> @@ -240,7 +278,12 @@ lp_build_tgsi_inst_llvm(
>     /* Emit the instructions */
>     if (info->output_mode == TGSI_OUTPUT_COMPONENTWISE && bld_base->soa) {
>        TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
> +         int src_index = get_src_chan_idx(inst->Instruction.Opcode, chan_index);
> +         /* ignore channels 1/3 in double dst */
> +         if (src_index == -1)
> +            continue;
>           emit_data.chan = chan_index;
> +         emit_data.src_chan = src_index;
>           if (!action->fetch_args) {
>              lp_build_fetch_args(bld_base, &emit_data);
>           } else {
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
> index 967373c..5809c5a 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
> @@ -338,6 +338,7 @@ struct lp_build_tgsi_context
>     struct lp_build_context uint_bld;
>     struct lp_build_context int_bld;
>  
> +   struct lp_build_context dbl_bld;
>     /** This array stores functions that are used to transform TGSI opcodes to
>       * LLVM instructions.
>       */
> @@ -349,6 +350,9 @@ struct lp_build_tgsi_context
>  
>     struct lp_build_tgsi_action sqrt_action;
>  
> +   struct lp_build_tgsi_action drsq_action;
> +
> +   struct lp_build_tgsi_action dsqrt_action;
>     const struct tgsi_shader_info *info;
>  
>     lp_build_emit_fetch_fn emit_fetch_funcs[TGSI_FILE_COUNT];
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> index 9cb42b2..1f2af85 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> @@ -894,6 +894,125 @@ const struct lp_build_tgsi_action xpd_action = {
>     xpd_emit	 /* emit */
>  };
>  
> +/* TGSI_OPCODE_D2F */
> +static void
> +d2f_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildFPTrunc(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                       bld_base->base.vec_type, "");
> +}
> +
> +/* TGSI_OPCODE_D2I */
> +static void
> +d2i_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildFPToSI(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                      bld_base->base.int_vec_type, "");
> +}
> +
> +/* TGSI_OPCODE_D2U */
> +static void
> +d2u_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildFPToUI(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                      bld_base->base.int_vec_type, "");
> +}
> +
> +/* TGSI_OPCODE_F2D */
> +static void
> +f2d_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildFPExt(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                      bld_base->dbl_bld.vec_type, "");
> +}
> +
> +/* TGSI_OPCODE_U2D */
> +static void
> +u2d_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildUIToFP(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                      bld_base->dbl_bld.vec_type, "");
> +}
> +
> +/* TGSI_OPCODE_I2D */
> +static void
> +i2d_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildSIToFP(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                      bld_base->dbl_bld.vec_type, "");
> +}
> +
> +/* TGSI_OPCODE_DMAD */
> +static void
> +dmad_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   LLVMValueRef tmp;
> +   tmp = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_DMUL,
> +                                   emit_data->args[0],
> +                                   emit_data->args[1]);
> +   emit_data->output[emit_data->chan] = lp_build_emit_llvm_binary(bld_base,
> +                                       TGSI_OPCODE_DADD, tmp, emit_data->args[2]);
> +}
> +
> +/*.TGSI_OPCODE_DRCP.*/
> +static void drcp_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   LLVMValueRef one;
> +   one = lp_build_const_vec(bld_base->dbl_bld.gallivm, bld_base->dbl_bld.type, 1.0f);
> +   emit_data->output[emit_data->chan] = LLVMBuildFDiv(
> +      bld_base->base.gallivm->builder,
> +      one, emit_data->args[0], "");
> +}
> +
> +/* TGSI_OPCODE_DFRAC */
> +static void dfrac_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   LLVMValueRef tmp;
> +   tmp = lp_build_floor(&bld_base->dbl_bld,
> +			emit_data->args[0]);
> +   emit_data->output[emit_data->chan] =  LLVMBuildFSub(bld_base->base.gallivm->builder,
> +                                                       emit_data->args[0], tmp, "");
> +}
> +
>  void
>  lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
>  {
> @@ -948,6 +1067,25 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
>  
>     bld_base->op_actions[TGSI_OPCODE_MAX].emit = fmax_emit;
>     bld_base->op_actions[TGSI_OPCODE_MIN].emit = fmin_emit;
> +
> +   bld_base->op_actions[TGSI_OPCODE_DADD].emit = add_emit;
> +   bld_base->op_actions[TGSI_OPCODE_DMAX].emit = fmax_emit;
> +   bld_base->op_actions[TGSI_OPCODE_DMIN].emit = fmin_emit;
> +   bld_base->op_actions[TGSI_OPCODE_DMUL].emit = mul_emit;
> +
> +   bld_base->op_actions[TGSI_OPCODE_D2F].emit = d2f_emit;
> +   bld_base->op_actions[TGSI_OPCODE_D2I].emit = d2i_emit;
> +   bld_base->op_actions[TGSI_OPCODE_D2U].emit = d2u_emit;
> +
> +   bld_base->op_actions[TGSI_OPCODE_F2D].emit = f2d_emit;
> +   bld_base->op_actions[TGSI_OPCODE_I2D].emit = i2d_emit;
> +   bld_base->op_actions[TGSI_OPCODE_U2D].emit = u2d_emit;
> +
> +   bld_base->op_actions[TGSI_OPCODE_DMAD].emit = dmad_emit;
> +
> +   bld_base->op_actions[TGSI_OPCODE_DRCP].emit = drcp_emit;
> +   bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = dfrac_emit;
> +
>  }
>  
>  /* CPU Only default actions */
> @@ -1792,6 +1930,104 @@ xor_emit_cpu(
>                                                       emit_data->args[1]);
>  }
>  
> +/* TGSI_OPCODE_DABS (CPU Only) */
> +static void
> +dabs_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] = lp_build_abs(&bld_base->dbl_bld,
> +                                                       emit_data->args[0]);
> +}
> +
> +/* TGSI_OPCODE_DNEG (CPU Only) */
> +static void
> +dneg_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] = lp_build_sub(&bld_base->dbl_bld,
> +                                                     bld_base->dbl_bld.zero,
> +                                                     emit_data->args[0]);
> +}
> +
> +/* TGSI_OPCODE_DSET Helper (CPU Only) */
> +static void
> +dset_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data,
> +   unsigned pipe_func)
> +{
> +   LLVMValueRef cond = lp_build_cmp(&bld_base->dbl_bld, pipe_func,
> +                                    emit_data->args[0], emit_data->args[1]);
> +   emit_data->output[emit_data->chan] = cond;
> +}
> +
> +/* TGSI_OPCODE_DSEQ (CPU Only) */
> +static void
> +dseq_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_EQUAL);
> +}
> +
> +/* TGSI_OPCODE_DSGE (CPU Only) */
> +static void
> +dsge_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_GEQUAL);
> +}
> +
> +/* TGSI_OPCODE_DSLT (CPU Only) */
> +static void
> +dslt_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_LESS);
> +}
> +
> +/* TGSI_OPCODE_DSNE (CPU Only) */
> +static void
> +dsne_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_NOTEQUAL);
> +}
> +
> +/* Double Reciprocal squareroot (CPU Only) */
> +static void
> +drecip_sqrt_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] = lp_build_rsqrt(&bld_base->dbl_bld,
> +                                                         emit_data->args[0]);
> +}
> +
> +/* Double Squareroot (CPU Only) */
> +static void
> +dsqrt_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] = lp_build_sqrt(&bld_base->dbl_bld,
> +                                                      emit_data->args[0]);
> +}
> +
>  void
>  lp_set_default_actions_cpu(
>     struct lp_build_tgsi_context * bld_base)
> @@ -1864,4 +2100,14 @@ lp_set_default_actions_cpu(
>  
>     bld_base->op_actions[TGSI_OPCODE_XOR].emit = xor_emit_cpu;
>  
> +   bld_base->op_actions[TGSI_OPCODE_DABS].emit = dabs_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DNEG].emit = dneg_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = dseq_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DSGE].emit = dsge_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DSLT].emit = dslt_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DSNE].emit = dsne_emit_cpu;
> +
> +   bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = drecip_sqrt_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = dsqrt_emit_cpu;
> +
>  }
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
> index fc7fdbd..463d44e 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
> @@ -71,6 +71,11 @@ struct lp_build_emit_data {
>      */
>     unsigned chan;
>  
> +   /**
> +    * This is used to specify the src channel to read from for doubles.
> +    */
> +   unsigned src_chan;
> +
>     /** The lp_build_tgsi_action::emit 'executes' the opcode and writes the
>      * results to this array.
>      */
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> index 268379e..95d6786 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> @@ -947,15 +947,19 @@ static LLVMValueRef
>  build_gather(struct lp_build_tgsi_context *bld_base,
>               LLVMValueRef base_ptr,
>               LLVMValueRef indexes,
> -             LLVMValueRef overflow_mask)
> +             LLVMValueRef overflow_mask, LLVMValueRef indexes2)
>  {
>     struct gallivm_state *gallivm = bld_base->base.gallivm;
>     LLVMBuilderRef builder = gallivm->builder;
>     struct lp_build_context *uint_bld = &bld_base->uint_bld;
>     struct lp_build_context *bld = &bld_base->base;
> -   LLVMValueRef res = bld->undef;
> +   LLVMValueRef res;
>     unsigned i;
>  
> +   if (indexes2)
> +      res = LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2));
> +   else
> +      res = bld->undef;
>     /*
>      * overflow_mask is a vector telling us which channels
>      * in the vector overflowed. We use the overflow behavior for
> @@ -976,26 +980,47 @@ build_gather(struct lp_build_tgsi_context *bld_base,
>         * control flow.
>         */
>        indexes = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes);
> +      if (indexes2)
> +         indexes2 = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes2);
>     }
>  
>     /*
>      * Loop over elements of index_vec, load scalar value, insert it into 'res'.
>      */
> -   for (i = 0; i < bld->type.length; i++) {
> -      LLVMValueRef ii = lp_build_const_int32(bld->gallivm, i);
> -      LLVMValueRef index = LLVMBuildExtractElement(builder,
> -                                                   indexes, ii, "");
> +   for (i = 0; i < bld->type.length * (indexes2 ? 2 : 1); i++) {
> +      LLVMValueRef si, di;
> +      LLVMValueRef index;
>        LLVMValueRef scalar_ptr, scalar;
>  
> +      if (indexes2) {
> +         si = lp_build_const_int32(bld->gallivm, i >> 1);
> +         di = lp_build_const_int32(bld->gallivm, i);
> +      } else {
> +         si = lp_build_const_int32(bld->gallivm, i);
> +         di = si;
> +      }
> +
> +      if (indexes2 && (i & 1)) {
> +         index = LLVMBuildExtractElement(builder,
> +                                         indexes2, si, "");
> +      } else {
> +         index = LLVMBuildExtractElement(builder,
> +                                         indexes, si, "");
> +      }
>        scalar_ptr = LLVMBuildGEP(builder, base_ptr,
>                                  &index, 1, "gather_ptr");
>        scalar = LLVMBuildLoad(builder, scalar_ptr, "");
>  
> -      res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
> +      res = LLVMBuildInsertElement(builder, res, scalar, di, "");
>     }
>  
>     if (overflow_mask) {
> -      res = lp_build_select(bld, overflow_mask, bld->zero, res);
> +      if (indexes2) {
> +         res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
> +         overflow_mask = LLVMBuildSExt(builder, overflow_mask, bld_base->dbl_bld.int_vec_type, "");
> +         res = lp_build_select(&bld_base->dbl_bld, overflow_mask, bld_base->dbl_bld.zero, res);
> +      } else
> +         res = lp_build_select(bld, overflow_mask, bld->zero, res);
>     }
This function looks pretty complex to me.
I wonder if it wouldn't make more sense to use the gather as it was and
just call it twice, with some shuffle for the fetched values afterwards.
(There is actually a good reason why build_gather should be a
"simple" function that extracts elements, does the loads, and inserts the
loaded values in a straightforward manner: it is supposed to be able to
turn into an AVX2 gather at some point, and anything doing something
different would need to be thrown out.)

>  
>     return res;
> @@ -1139,8 +1164,10 @@ stype_to_fetch(struct lp_build_tgsi_context * bld_base,
>     case TGSI_TYPE_SIGNED:
>        bld_fetch = &bld_base->int_bld;
>        break;
> -   case TGSI_TYPE_VOID:
>     case TGSI_TYPE_DOUBLE:
> +      bld_fetch = &bld_base->dbl_bld;
> +      break;
> +   case TGSI_TYPE_VOID:
>     default:
>        assert(0);
>        bld_fetch = NULL;
> @@ -1216,6 +1243,7 @@ emit_fetch_constant(
>           lp_build_const_int_vec(gallivm, uint_bld->type, swizzle);
>        LLVMValueRef index_vec;  /* index into the const buffer */
>        LLVMValueRef overflow_mask;
> +      LLVMValueRef index_vec2 = NULL;
>  
>        indirect_index = get_indirect_index(bld,
>                                            reg->Register.File,
> @@ -1235,22 +1263,33 @@ emit_fetch_constant(
>        index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
>        index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
>  
> +      if (stype == TGSI_TYPE_DOUBLE) {
> +         LLVMValueRef swizzle_vec2;
> +         swizzle_vec2 = lp_build_const_int_vec(gallivm, uint_bld->type, swizzle + 1);
> +         index_vec2 = lp_build_shl_imm(uint_bld, indirect_index, 2);
> +         index_vec2 = lp_build_add(uint_bld, index_vec2, swizzle_vec2);
> +      }
>        /* Gather values from the constant buffer */
> -      res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask);
> +      res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask, index_vec2);
>     }
>     else {
>        LLVMValueRef index;  /* index into the const buffer */
>        LLVMValueRef scalar, scalar_ptr;
> -
> +      struct lp_build_context *bld_broad = &bld_base->base;
>        index = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle);
>  
>        scalar_ptr = LLVMBuildGEP(builder, consts_ptr,
>                                  &index, 1, "");
> +      if (stype == TGSI_TYPE_DOUBLE) {
> +         LLVMTypeRef dptr_type = LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0);
> +         scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, dptr_type, "");
> +         bld_broad = &bld_base->dbl_bld;
> +      }
>        scalar = LLVMBuildLoad(builder, scalar_ptr, "");
> -      res = lp_build_broadcast_scalar(&bld_base->base, scalar);
> +      res = lp_build_broadcast_scalar(bld_broad, scalar);
>     }
>  
> -   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
> +   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE) {
>        struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
>        res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
>     }
> @@ -1258,6 +1297,39 @@ emit_fetch_constant(
>     return res;
>  }
>  
> +/**
> + * Fetch double values from two separate channels.
> + * Doubles are stored split across two channels, like xy and zw.
> + * This function creates a set of 16 floats,
> + * extracts the values from the two channels,
> + * puts them in the correct place, then casts to 8 doubles.
> + */
> +static LLVMValueRef
> +emit_fetch_double(
> +   struct lp_build_tgsi_context * bld_base,
> +   enum tgsi_opcode_type stype,
> +   LLVMValueRef input,
> +   LLVMValueRef input2)
> +{
> +   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
> +   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
> +   LLVMBuilderRef builder = gallivm->builder;
> +   LLVMValueRef res;
> +   struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
> +   int i;
> +   LLVMValueRef shuffles[16];
> +   int len = bld_base->base.type.length * 2;
> +   assert(len <= 16);
> +
> +   for (i = 0; i < bld_base->base.type.length * 2; i+=2) {
> +      shuffles[i] = lp_build_const_int32(gallivm, i / 2);
> +      shuffles[i + 1] = lp_build_const_int32(gallivm, i / 2 + bld_base->base.type.length);
> +   }
> +   res = LLVMBuildShuffleVector(builder, input, input2, LLVMConstVector(shuffles, len), "");
> +
> +   return LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
> +}
> +
>  static LLVMValueRef
>  emit_fetch_immediate(
>     struct lp_build_tgsi_context * bld_base,
> @@ -1281,7 +1353,7 @@ emit_fetch_immediate(
>        if (reg->Register.Indirect) {
>           LLVMValueRef indirect_index;
>           LLVMValueRef index_vec;  /* index into the immediate register array */
> -
> +         LLVMValueRef index_vec2 = NULL;
>           indirect_index = get_indirect_index(bld,
>                                               reg->Register.File,
>                                               reg->Register.Index,
> @@ -1296,25 +1368,46 @@ emit_fetch_immediate(
>                                             indirect_index,
>                                             swizzle,
>                                             FALSE);
> -
> +         if (stype == TGSI_TYPE_DOUBLE)
> +            index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
> +                                              indirect_index,
> +                                              swizzle + 1,
> +                                              FALSE);
>           /* Gather values from the immediate register array */
> -         res = build_gather(bld_base, imms_array, index_vec, NULL);
> +         res = build_gather(bld_base, imms_array, index_vec, NULL, index_vec2);
>        } else {
>           LLVMValueRef lindex = lp_build_const_int32(gallivm,
>                                          reg->Register.Index * 4 + swizzle);
>           LLVMValueRef imms_ptr =  LLVMBuildGEP(builder,
>                                                  bld->imms_array, &lindex, 1, "");
>           res = LLVMBuildLoad(builder, imms_ptr, "");
> +
> +         if (stype == TGSI_TYPE_DOUBLE) {
> +            LLVMValueRef lindex1;
> +            LLVMValueRef imms_ptr2;
> +            LLVMValueRef res2;
> +
> +            lindex1 = lp_build_const_int32(gallivm,
> +                                           reg->Register.Index * 4 + swizzle + 1);
> +            imms_ptr2 = LLVMBuildGEP(builder,
> +                                      bld->imms_array, &lindex1, 1, "");
> +            res2 = LLVMBuildLoad(builder, imms_ptr2, "");
> +            res = emit_fetch_double(bld_base, stype, res, res2);
> +         }
>        }
>     }
>     else {
>        res = bld->immediates[reg->Register.Index][swizzle];
> +      if (stype == TGSI_TYPE_DOUBLE)
> +         res = emit_fetch_double(bld_base, stype, res, bld->immediates[reg->Register.Index][swizzle + 1]);
>     }
>  
>     if (stype == TGSI_TYPE_UNSIGNED) {
>        res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
>     } else if (stype == TGSI_TYPE_SIGNED) {
>        res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
> +   } else if (stype == TGSI_TYPE_DOUBLE) {
> +      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
>     }
>     return res;
>  }
> @@ -1334,6 +1427,7 @@ emit_fetch_input(
>     if (reg->Register.Indirect) {
>        LLVMValueRef indirect_index;
>        LLVMValueRef index_vec;  /* index into the input reg array */
> +      LLVMValueRef index_vec2 = NULL;
>        LLVMValueRef inputs_array;
>        LLVMTypeRef fptr_type;
>  
> @@ -1346,23 +1440,43 @@ emit_fetch_input(
>                                          indirect_index,
>                                          swizzle,
>                                          TRUE);
> -
> +      if (stype == TGSI_TYPE_DOUBLE) {
> +         index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
> +                                           indirect_index,
> +                                           swizzle + 1,
> +                                           TRUE);
> +      }
>        /* cast inputs_array pointer to float* */
>        fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
>        inputs_array = LLVMBuildBitCast(builder, bld->inputs_array, fptr_type, "");
>  
>        /* Gather values from the input register array */
> -      res = build_gather(bld_base, inputs_array, index_vec, NULL);
> +      res = build_gather(bld_base, inputs_array, index_vec, NULL, index_vec2);
>     } else {
>        if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
>           LLVMValueRef lindex = lp_build_const_int32(gallivm,
>                                          reg->Register.Index * 4 + swizzle);
> -         LLVMValueRef input_ptr =  LLVMBuildGEP(builder,
> -                                                bld->inputs_array, &lindex, 1, "");
> +         LLVMValueRef input_ptr = LLVMBuildGEP(builder,
> +                                               bld->inputs_array, &lindex, 1, "");
> +
>           res = LLVMBuildLoad(builder, input_ptr, "");
> +         if (stype == TGSI_TYPE_DOUBLE) {
> +            LLVMValueRef lindex1;
> +            LLVMValueRef input_ptr2;
> +            LLVMValueRef res2;
> +
> +            lindex1 = lp_build_const_int32(gallivm,
> +                                           reg->Register.Index * 4 + swizzle + 1);
> +            input_ptr2 = LLVMBuildGEP(builder,
> +                                      bld->inputs_array, &lindex1, 1, "");
> +            res2 = LLVMBuildLoad(builder, input_ptr2, "");
> +            res = emit_fetch_double(bld_base, stype, res, res2);
> +         }
>        }
>        else {
>           res = bld->inputs[reg->Register.Index][swizzle];
> +         if (stype == TGSI_TYPE_DOUBLE)
> +            res = emit_fetch_double(bld_base, stype, res, bld->inputs[reg->Register.Index][swizzle + 1]);
>        }
>     }
>  
> @@ -1372,6 +1486,8 @@ emit_fetch_input(
>        res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
>     } else if (stype == TGSI_TYPE_SIGNED) {
>        res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
> +   } else if (stype == TGSI_TYPE_DOUBLE) {
> +      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
>     }
>  
>     return res;
> @@ -1413,7 +1529,7 @@ emit_fetch_gs_input(
>     } else {
>        attrib_index = lp_build_const_int32(gallivm, reg->Register.Index);
>     }
> -   
> +
>     if (reg->Dimension.Indirect) {
>        vertex_index = get_indirect_index(bld,
>                                          reg->Register.File,
> @@ -1436,6 +1552,8 @@ emit_fetch_gs_input(
>        res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
>     } else if (stype == TGSI_TYPE_SIGNED) {
>        res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
> +   } else if (stype == TGSI_TYPE_DOUBLE) {
> +      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
>     }
>  
>     return res;
> @@ -1455,7 +1573,7 @@ emit_fetch_temporary(
>  
>     if (reg->Register.Indirect) {
>        LLVMValueRef indirect_index;
> -      LLVMValueRef index_vec;  /* index into the temp reg array */
> +      LLVMValueRef index_vec, index_vec2 = NULL;  /* index into the temp reg array */
>        LLVMValueRef temps_array;
>        LLVMTypeRef fptr_type;
>  
> @@ -1468,21 +1586,35 @@ emit_fetch_temporary(
>                                          indirect_index,
>                                          swizzle,
>                                          TRUE);
> +      if (stype == TGSI_TYPE_DOUBLE) {
> +               index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
> +                                                  indirect_index,
> +                                                  swizzle + 1,
> +                                                  TRUE);
> +      }
>  
>        /* cast temps_array pointer to float* */
>        fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
>        temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, "");
>  
>        /* Gather values from the temporary register array */
> -      res = build_gather(bld_base, temps_array, index_vec, NULL);
> +      res = build_gather(bld_base, temps_array, index_vec, NULL, index_vec2);
>     }
>     else {
>        LLVMValueRef temp_ptr;
>        temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
>        res = LLVMBuildLoad(builder, temp_ptr, "");
> +
> +      if (stype == TGSI_TYPE_DOUBLE) {
> +         LLVMValueRef temp_ptr2, res2;
> +
> +         temp_ptr2 = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle + 1);
> +         res2 = LLVMBuildLoad(builder, temp_ptr2, "");
> +         res = emit_fetch_double(bld_base, stype, res, res2);
> +      }
>     }
>  
> -   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
> +   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE) {
>        struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
>        res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
>     }
> @@ -1648,6 +1780,46 @@ emit_fetch_predicate(
>     }
>  }
>  
> +/**
> + * store an array of 8 doubles into two arrays of 8 floats
> + * i.e.
> + * value is d0, d1, d2, d3 etc.
> + * each double has high and low pieces x, y
> + * so gets stored into the separate channels as:
> + * chan_ptr = d0.x, d1.x, d2.x, d3.x
> + * chan_ptr2 = d0.y, d1.y, d2.y, d3.y
> + */
> +static void
> +emit_store_double_chan(struct lp_build_tgsi_context *bld_base,
> +                       int dtype,
> +                       LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2,
> +                       LLVMValueRef pred,
> +                       LLVMValueRef value)
> +{
> +   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
> +   struct gallivm_state *gallivm = bld_base->base.gallivm;
> +   LLVMBuilderRef builder = gallivm->builder;
> +   struct lp_build_context *float_bld = &bld_base->base;
> +   int i;
> +   if (dtype == TGSI_TYPE_DOUBLE) {
> +      LLVMValueRef temp, temp2;
> +      LLVMValueRef shuffles[8];
> +      LLVMValueRef shuffles2[8];
> +
> +      for (i = 0; i < bld_base->base.type.length; i++) {
> +         shuffles[i] = lp_build_const_int32(gallivm, i * 2);
> +         shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1);
> +      }
> +
> +      temp = LLVMBuildShuffleVector(builder, value, LLVMGetUndef(LLVMTypeOf(value)), LLVMConstVector(shuffles, bld_base->base.type.length), "");
> +      temp2 = LLVMBuildShuffleVector(builder, value, LLVMGetUndef(LLVMTypeOf(value)), LLVMConstVector(shuffles2, bld_base->base.type.length), "");
These lines are a bit long...

> +      lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp, chan_ptr);
> +      lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp2, chan_ptr2);
> +   } else {
> +      lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, chan_ptr);
Actually I think it would be nicer to keep the ordinary float path out
of this function (its name also implies that it only stores doubles) and
handle that one-liner in the caller(s).
It might also make sense to actually get the chan_ptrs in this function
itself; it sort of feels unnatural to pass two of them around (as they
are obviously closely related), but no biggie.

> +   }
> +}
>  
>  /**
>   * Register store.
> @@ -1683,6 +1855,11 @@ emit_store_chan(
>     }
>  
>     if (reg->Register.Indirect) {
> +      /*
> +       * Currently the mesa/st doesn't generate indirect stores
> +       * to doubles, it normally uses MOV to do indirect stores.
> +       */
> +      assert(dtype != TGSI_TYPE_DOUBLE);
>        indirect_index = get_indirect_index(bld,
>                                            reg->Register.File,
>                                            reg->Register.Index,
> @@ -1721,13 +1898,21 @@ emit_store_chan(
>        else {
>           LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index,
>                                                    chan_index);
> -         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
> +         LLVMValueRef out_ptr2 = NULL;
> +         if (dtype == TGSI_TYPE_DOUBLE)
> +            out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index,
> +                                                  chan_index + 1);
> +
> +         emit_store_double_chan(bld_base, dtype, out_ptr, out_ptr2, pred, value);
>        }
>        break;
>  
>     case TGSI_FILE_TEMPORARY:
>        /* Temporaries are always stored as floats */
> -      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
> +      if (dtype != TGSI_TYPE_DOUBLE)
> +         value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
> +      else
> +         value = LLVMBuildBitCast(builder, value,  LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2), "");
>  
>        if (reg->Register.Indirect) {
>           LLVMValueRef index_vec;  /* indexes into the temp registers */
> @@ -1747,9 +1932,12 @@ emit_store_chan(
>                             &bld->exec_mask, pred);
>        }
>        else {
> -         LLVMValueRef temp_ptr;
> +         LLVMValueRef temp_ptr, temp_ptr2 = NULL;
>           temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index);
> -         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
> +         if (dtype == TGSI_TYPE_DOUBLE)
> +            temp_ptr2 = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index + 1);
> +
> +         emit_store_double_chan(bld_base, dtype, temp_ptr, temp_ptr2, pred, value);
>        }
>        break;
>  
> @@ -1818,13 +2006,16 @@ emit_store(
>  {
>     unsigned chan_index;
>     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
> -
> +   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
>     if(info->num_dst) {
>        LLVMValueRef pred[TGSI_NUM_CHANNELS];
>  
>        emit_fetch_predicate( bld, inst, pred );
>  
>        TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
> +
> +         if (dtype == TGSI_TYPE_DOUBLE && (chan_index == 1 || chan_index == 3))
> +             continue;
>           emit_store_chan(bld_base, inst, 0, chan_index, pred[chan_index], dst[chan_index]);
>        }
>     }
> @@ -2823,6 +3014,7 @@ void lp_emit_immediate_soa(
>                 lp_build_const_vec(gallivm, bld_base->base.type, imm->u[i].Float);
>  
>        break;
> +   case TGSI_IMM_FLOAT64:
>     case TGSI_IMM_UINT32:
>        for( i = 0; i < size; ++i ) {
>           LLVMValueRef tmp = lp_build_const_vec(gallivm, bld_base->uint_bld.type, imm->u[i].Uint);
> @@ -3674,6 +3866,12 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
>     lp_build_context_init(&bld.bld_base.uint_bld, gallivm, lp_uint_type(type));
>     lp_build_context_init(&bld.bld_base.int_bld, gallivm, lp_int_type(type));
>     lp_build_context_init(&bld.elem_bld, gallivm, lp_elem_type(type));
> +   {
> +      struct lp_type dbl_type;
> +      dbl_type = type;
> +      dbl_type.width *= 2;
> +      lp_build_context_init(&bld.bld_base.dbl_bld, gallivm, dbl_type);
> +   }
>     bld.mask = mask;
>     bld.inputs = inputs;
>     bld.outputs = outputs;
> 

Looks good to me. I'm not entirely happy with the build_gather change,
but that is fixable later; your choice.

Reviewed-by: Roland Scheidegger <sroland at vmware.com>