[Mesa-dev] [PATCH 2/3] gallivm: add fp64 support.

Mon Jun 29 07:58:47 PDT 2015

Don't worry about the AoS stuff. Only meant to do simple things.

Looks good overall. I guess it makes sense not to split execution either
(so you'd have the native hw vector size there); llvm should handle that
pretty well these days. The sse intrinsics probably won't get used that
way (there's a helper which makes that possible too, though it might not
be hooked up), but I guess there's not really much need for them.

Some comments inline.

Am 29.06.2015 um 09:21 schrieb Dave Airlie:
> This adds support for ARB_gpu_shader_fp64 and ARB_vertex_attrib_64bit to
> llvmpipe.
> 
> Two things that don't mix well are SoA and doubles, see
> emit_fetch_double, and emit_store_double_chan in this.
> 
> I've also had to split emit_data.chan, to add src_chan,
> which can be different for doubles.
> 
> Open issues:
> are intrinsics okay for floor/ceil?
The question is whether they actually work, rather than just crash, if
you don't have sse4.1 (at least I assume that with sse4.1 it turns into a
round instruction), or on non-x86 cpus if there is no direct hw support.
If they don't, you'd have to provide your own implementation (at least as
a fallback) or make support for the extension conditional. Otherwise llvm
intrinsics are just fine (traditionally we didn't really use them much,
as most of the things we do with sse intrinsics were missing, and even
if some intrinsic existed it often didn't work, but that was a long time
ago - ideally we'd switch to llvm intrinsics where possible).

> should any of these functions have CPU versions?
> 
> tested with piglit, no regressions, all the fp64 tests seem to pass.
> 
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
>  src/gallium/auxiliary/gallivm/lp_bld_arit.c        |  12 ++
>  src/gallium/auxiliary/gallivm/lp_bld_limits.h      |   1 +
>  src/gallium/auxiliary/gallivm/lp_bld_logic.c       |   2 +-
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi.c        |  47 +++-
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi.h        |   4 +
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 240 +++++++++++++++++++++
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h |   3 +
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c    | 163 +++++++++++++-
>  8 files changed, 458 insertions(+), 14 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index 9daa93e..8fba43f 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -1997,6 +1997,12 @@ lp_build_floor(struct lp_build_context *bld,
>        LLVMTypeRef int_vec_type = bld->int_vec_type;
>        LLVMTypeRef vec_type = bld->vec_type;
>  
> +      if (type.width != 32) {
> +         char intrinsic[32];
> +         util_snprintf(intrinsic, sizeof intrinsic, "llvm.floor.v%uf%u", type.length, type.width);
> +         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
> +      }
> +
>        assert(type.width == 32); /* might want to handle doubles at some point */
>  
>        inttype = type;
> @@ -2066,6 +2072,12 @@ lp_build_ceil(struct lp_build_context *bld,
>        LLVMTypeRef int_vec_type = bld->int_vec_type;
>        LLVMTypeRef vec_type = bld->vec_type;
>  
> +      if (type.width != 32) {
> +         char intrinsic[32];
> +         util_snprintf(intrinsic, sizeof intrinsic, "llvm.ceil.v%uf%u", type.length, type.width);
> +         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
> +      }
> +
>        assert(type.width == 32); /* might want to handle doubles at some point */
>  
>        inttype = type;
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
> index 2851fd1..3db7261 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
> @@ -132,6 +132,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
>     case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
>        return 1;
>     case PIPE_SHADER_CAP_DOUBLES:
> +      return 1;
>     case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
>     case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
>     case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
> index 80b53e5..f724cfa 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
> @@ -81,7 +81,7 @@ lp_build_compare_ext(struct gallivm_state *gallivm,
>                       boolean ordered)
>  {
>     LLVMBuilderRef builder = gallivm->builder;
> -   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
> +   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, lp_type_int_vec(32, 32 * type.length));
>     LLVMValueRef zeros = LLVMConstNull(int_vec_type);
>     LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
>     LLVMValueRef cond;
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> index e391d8a..1887956 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> @@ -175,13 +175,52 @@ void lp_build_fetch_args(
>     unsigned src;
>     for (src = 0; src < emit_data->info->num_src; src++) {
>        emit_data->args[src] = lp_build_emit_fetch(bld_base, emit_data->inst, src,
> -                                               emit_data->chan);
> +                                                 emit_data->src_chan);
>     }
>     emit_data->arg_count = emit_data->info->num_src;
>     lp_build_action_set_dst_type(emit_data, bld_base,
>  		emit_data->inst->Instruction.Opcode);
>  }
>  
> +/**
> + * with doubles src and dst channels aren't 1:1.
> + * check the src/dst types for the opcode,
> + * 1. if neither is double then src == dst;
> + * 2. if dest is double
> + *     - don't store to y or w
> + *     - if src is double then src == dst.
> + *     - else for f2d, d.xy = s.x
> + *     - else for f2d, d.zw = s.y
> + * 3. if dst is single, src is double
> + *    - map dst x,z to src xy;
> + *    - map dst y,w to src zw;
> + */
> +static int get_src_chan_idx(unsigned opcode,
> +                            int dst_chan_index)
> +{
> +   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(opcode);
> +   enum tgsi_opcode_type stype = tgsi_opcode_infer_src_type(opcode);
> +
> +   if (dtype != TGSI_TYPE_DOUBLE && stype != TGSI_TYPE_DOUBLE)
> +      return dst_chan_index;
> +   if (dtype == TGSI_TYPE_DOUBLE) {
> +      if (dst_chan_index == 1 || dst_chan_index == 3)
> +         return -1;
> +      if (stype == TGSI_TYPE_DOUBLE)
> +         return dst_chan_index;
> +      if (dst_chan_index == 0)
> +         return 0;
> +      if (dst_chan_index == 2)
> +         return 1;
> +   } else {
> +      if (dst_chan_index == 0 || dst_chan_index == 2)
> +         return 0;
> +      if (dst_chan_index == 1 || dst_chan_index == 3)
> +         return 2;
> +   }
> +   return -1;
> +}
> +
>  /* XXX: COMMENT
>   * It should be assumed that this function ignores writemasks
>   */
> @@ -197,7 +236,6 @@ lp_build_tgsi_inst_llvm(
>     struct lp_build_emit_data emit_data;
>     unsigned chan_index;
>     LLVMValueRef val;
> -
>     bld_base->pc++;
>  
>     if (bld_base->emit_debug) {
> @@ -240,7 +278,12 @@ lp_build_tgsi_inst_llvm(
>     /* Emit the instructions */
>     if (info->output_mode == TGSI_OUTPUT_COMPONENTWISE && bld_base->soa) {
>        TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
> +         int src_index = get_src_chan_idx(inst->Instruction.Opcode, chan_index);
> +         /* ignore channels 1/3 in double dst */
> +         if (src_index == -1)
> +            continue;
>           emit_data.chan = chan_index;
> +         emit_data.src_chan = src_index;
>           if (!action->fetch_args) {
>              lp_build_fetch_args(bld_base, &emit_data);
>           } else {
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
> index 967373c..5809c5a 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
> @@ -338,6 +338,7 @@ struct lp_build_tgsi_context
>     struct lp_build_context uint_bld;
>     struct lp_build_context int_bld;
>  
> +   struct lp_build_context dbl_bld;
>     /** This array stores functions that are used to transform TGSI opcodes to
>       * LLVM instructions.
>       */
> @@ -349,6 +350,9 @@ struct lp_build_tgsi_context
>  
>     struct lp_build_tgsi_action sqrt_action;
>  
> +   struct lp_build_tgsi_action drsq_action;
> +
> +   struct lp_build_tgsi_action dsqrt_action;
>     const struct tgsi_shader_info *info;
>  
>     lp_build_emit_fetch_fn emit_fetch_funcs[TGSI_FILE_COUNT];
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> index 9cb42b2..ca57e0e 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> @@ -894,6 +894,121 @@ const struct lp_build_tgsi_action xpd_action = {
>     xpd_emit	 /* emit */
>  };
>  
> +/* TGSI_OPCODE_D2F */
> +static void
> +d2f_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildFPTrunc(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                       bld_base->base.vec_type, "");
> +}
> +
> +/* TGSI_OPCODE_D2F */
That should be D2I

> +static void
> +d2i_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildFPToSI(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                      bld_base->base.int_vec_type, "");
> +}
> +
> +/* TGSI_OPCODE_D2U */
> +static void
> +d2u_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildFPToUI(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                      bld_base->base.int_vec_type, "");
> +}
> +
> +static void
> +f2d_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildFPExt(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                      bld_base->dbl_bld.vec_type, "");
> +}
> +static void
> +u2d_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildUIToFP(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                      bld_base->dbl_bld.vec_type, "");
> +}
> +
> +static void
> +i2d_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] =
> +      LLVMBuildSIToFP(bld_base->base.gallivm->builder,
> +                      emit_data->args[0],
> +                      bld_base->dbl_bld.vec_type, "");
> +}
The empty line should be here not below.

> +/* TGSI_OPCODE_DMAD */
> +
> +static void
> +dmad_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   LLVMValueRef tmp;
> +   tmp = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_DMUL,
> +                                   emit_data->args[0],
> +                                   emit_data->args[1]);
> +   emit_data->output[emit_data->chan] = lp_build_emit_llvm_binary(bld_base,
> +                                       TGSI_OPCODE_DADD, tmp, emit_data->args[2]);
> +}
> +
> +/*.TGSI_OPCODE_DRCP.*/
> +static void drcp_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   LLVMValueRef one;
> +   one = lp_build_const_vec(bld_base->dbl_bld.gallivm, bld_base->dbl_bld.type, 1.0f);
> +   emit_data->output[emit_data->chan] = LLVMBuildFDiv(
> +      bld_base->base.gallivm->builder,
> +      one, emit_data->args[0], "");
> +}
> +
> +/* TGSI_OPCODE_DFRAC */
> +static void dfrac_emit(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   LLVMValueRef tmp;
> +   tmp = lp_build_floor(&bld_base->dbl_bld,
> +			emit_data->args[0]);
> +   emit_data->output[emit_data->chan] =  LLVMBuildFSub(bld_base->base.gallivm->builder,
> +                                                       emit_data->args[0], tmp, "");
> +}
> +
>  void
>  lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
>  {
> @@ -948,6 +1063,25 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
>  
>     bld_base->op_actions[TGSI_OPCODE_MAX].emit = fmax_emit;
>     bld_base->op_actions[TGSI_OPCODE_MIN].emit = fmin_emit;
> +
> +   bld_base->op_actions[TGSI_OPCODE_DADD].emit = add_emit;
> +   bld_base->op_actions[TGSI_OPCODE_DMAX].emit = fmax_emit;
> +   bld_base->op_actions[TGSI_OPCODE_DMIN].emit = fmin_emit;
> +   bld_base->op_actions[TGSI_OPCODE_DMUL].emit = mul_emit;
> +
> +   bld_base->op_actions[TGSI_OPCODE_D2F].emit = d2f_emit;
> +   bld_base->op_actions[TGSI_OPCODE_D2I].emit = d2i_emit;
> +   bld_base->op_actions[TGSI_OPCODE_D2U].emit = d2u_emit;
> +
> +   bld_base->op_actions[TGSI_OPCODE_F2D].emit = f2d_emit;
> +   bld_base->op_actions[TGSI_OPCODE_I2D].emit = i2d_emit;
> +   bld_base->op_actions[TGSI_OPCODE_U2D].emit = u2d_emit;
> +
> +   bld_base->op_actions[TGSI_OPCODE_DMAD].emit = dmad_emit;
> +
> +   bld_base->op_actions[TGSI_OPCODE_DRCP].emit = drcp_emit;
> +   bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = dfrac_emit;
> +
>  }
>  
>  /* CPU Only default actions */
> @@ -1792,6 +1926,102 @@ xor_emit_cpu(
>                                                       emit_data->args[1]);
>  }
>  
> +static void
> +dabs_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] = lp_build_abs(&bld_base->dbl_bld,
> +                                                       emit_data->args[0]);
> +}
> +
> +/* TGSI_OPCODE_DNEG (CPU Only) */
> +static void
> +dneg_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] = lp_build_sub(&bld_base->dbl_bld,
> +                                                     bld_base->dbl_bld.zero,
> +                                                     emit_data->args[0]);
> +}
> +
> +/* TGSI_OPCODE_DSET Helper (CPU Only) */
> +static void
> +dset_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data,
> +   unsigned pipe_func)
> +{
> +   LLVMValueRef cond = lp_build_cmp(&bld_base->dbl_bld, pipe_func,
> +                                    emit_data->args[0], emit_data->args[1]);
> +   emit_data->output[emit_data->chan] = cond;
> +}
> +
> +/* TGSI_OPCODE_DSEQ (CPU Only) */
> +static void
> +dseq_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_EQUAL);
> +}
> +
> +/* TGSI_OPCODE_DSGE (CPU Only) */
> +static void
> +dsge_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_GEQUAL);
> +}
> +
> +/* TGSI_OPCODE_DSLT (CPU Only) */
> +static void
> +dslt_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_LESS);
> +}
> +
> +/* TGSI_OPCODE_DSNE (CPU Only) */
> +static void
> +dsne_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_NOTEQUAL);
> +}
> +
> +/* Reciprical squareroot (CPU Only) */
reciprocal

> +static void
> +drecip_sqrt_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] = lp_build_rsqrt(&bld_base->dbl_bld,
> +                                                         emit_data->args[0]);
> +}
> +
> +static void
> +dsqrt_emit_cpu(
> +   const struct lp_build_tgsi_action * action,
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   emit_data->output[emit_data->chan] = lp_build_sqrt(&bld_base->dbl_bld,
> +                                                      emit_data->args[0]);
> +}
> +
>  void
>  lp_set_default_actions_cpu(
>     struct lp_build_tgsi_context * bld_base)
> @@ -1864,4 +2094,14 @@ lp_set_default_actions_cpu(
>  
>     bld_base->op_actions[TGSI_OPCODE_XOR].emit = xor_emit_cpu;
>  
> +   bld_base->op_actions[TGSI_OPCODE_DABS].emit = dabs_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DNEG].emit = dneg_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = dseq_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DSGE].emit = dsge_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DSLT].emit = dslt_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DSNE].emit = dsne_emit_cpu;
> +
> +   bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = drecip_sqrt_emit_cpu;
> +   bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = dsqrt_emit_cpu;
> +
>  }
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
> index fc7fdbd..1b3b01c 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
> @@ -71,6 +71,9 @@ struct lp_build_emit_data {
>      */
>     unsigned chan;
>  
> +   /**
> +    * This is used to specifed the src channel to read from for doubles */
specify

> +   unsigned src_chan;
>     /** The lp_build_tgsi_action::emit 'executes' the opcode and writes the
>      * results to this array.
>      */
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> index 268379e..10372fe 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> @@ -1139,8 +1139,10 @@ stype_to_fetch(struct lp_build_tgsi_context * bld_base,
>     case TGSI_TYPE_SIGNED:
>        bld_fetch = &bld_base->int_bld;
>        break;
> -   case TGSI_TYPE_VOID:
>     case TGSI_TYPE_DOUBLE:
> +      bld_fetch = &bld_base->dbl_bld;
> +      break;
> +   case TGSI_TYPE_VOID:
>     default:
>        assert(0);
>        bld_fetch = NULL;
> @@ -1241,13 +1243,18 @@ emit_fetch_constant(
>     else {
>        LLVMValueRef index;  /* index into the const buffer */
>        LLVMValueRef scalar, scalar_ptr;
> -
> +      struct lp_build_context *bld_broad = &bld_base->base;
>        index = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle);
>  
>        scalar_ptr = LLVMBuildGEP(builder, consts_ptr,
>                                  &index, 1, "");
> +      if (stype == TGSI_TYPE_DOUBLE) {
> +         LLVMTypeRef dptr_type = LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0);
> +         scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, dptr_type, "");
> +         bld_broad = &bld_base->dbl_bld;
> +      }
>        scalar = LLVMBuildLoad(builder, scalar_ptr, "");
> -      res = lp_build_broadcast_scalar(&bld_base->base, scalar);
> +      res = lp_build_broadcast_scalar(bld_broad, scalar);
>     }
>  
>     if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
> @@ -1258,6 +1265,45 @@ emit_fetch_constant(
>     return res;
>  }
>  
> +/**
> + * Fetch double values from two separate channels.
> + * Doubles are stored split across two channels, like xy and zw.
> + * this functions creates a set of 16 floats,
This function

> + * extracts the values from the two channels,
> + * puts them in the correct place, then casts to 8 doubles.
> + */
> +static LLVMValueRef
> +emit_fetch_double(
> +   struct lp_build_tgsi_context * bld_base,
> +   enum tgsi_opcode_type stype,
> +   LLVMValueRef input,
> +   LLVMValueRef input2)
> +{
> +   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
> +   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
> +   LLVMBuilderRef builder = gallivm->builder;
> +   LLVMValueRef res;
> +   struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
> +   int i;
> +
> +   /* need to create an array of floats interleaved,
> +      then cast that to that to a double array */
grammar

> +   res = LLVMBuildBitCast(builder, bld_base->dbl_bld.undef, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2), "");
> +
> +   for (i = 0; i < bld_base->base.type.length * 2; i+=2) {
> +      LLVMValueRef scalar, scalar2;
> +      LLVMValueRef ii = lp_build_const_int32(gallivm, i);
> +      LLVMValueRef ii1 = lp_build_const_int32(gallivm, i + 1);
> +      LLVMValueRef si = lp_build_const_int32(gallivm, i >> 1);
> +
> +      scalar = LLVMBuildExtractElement(builder, input, si, "");
> +      res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
> +      scalar2 = LLVMBuildExtractElement(builder, input2, si, "");
> +      res = LLVMBuildInsertElement(builder, res, scalar2, ii1, "");
> +   }
Did you check what code this generated? Traditionally, we tried to avoid
the extract/insert stuff where possible and use shuffles instead,
because llvm would actually emit inserts/extracts (i.e. move from the simd
domain to the integer domain and back, which is pretty horrendous, and
doubly so on some non-intel cpus which have something like 15+ cycles of
latency for this). It is possible, though, that this is no longer a
problem: llvm 3.6 or 3.7 got a majorly improved shuffle optimizer which
might also catch this.

> +   return LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
> +}
> +
>  static LLVMValueRef
>  emit_fetch_immediate(
>     struct lp_build_tgsi_context * bld_base,
> @@ -1309,12 +1355,16 @@ emit_fetch_immediate(
>     }
>     else {
>        res = bld->immediates[reg->Register.Index][swizzle];
> +      if (stype == TGSI_TYPE_DOUBLE)
> +         res = emit_fetch_double(bld_base, stype, res, bld->immediates[reg->Register.Index][swizzle + 1]);
>     }
Does the indirect case above actually work?

>  
>     if (stype == TGSI_TYPE_UNSIGNED) {
>        res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
>     } else if (stype == TGSI_TYPE_SIGNED) {
>        res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
> +   } else if (stype == TGSI_TYPE_DOUBLE) {
> +      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
>     }
>     return res;
>  }
> @@ -1357,12 +1407,27 @@ emit_fetch_input(
>        if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
>           LLVMValueRef lindex = lp_build_const_int32(gallivm,
>                                          reg->Register.Index * 4 + swizzle);
> -         LLVMValueRef input_ptr =  LLVMBuildGEP(builder,
> -                                                bld->inputs_array, &lindex, 1, "");
> +         LLVMValueRef input_ptr = LLVMBuildGEP(builder,
> +                                               bld->inputs_array, &lindex, 1, "");
> +
>           res = LLVMBuildLoad(builder, input_ptr, "");
> +         if (stype == TGSI_TYPE_DOUBLE) {
> +            LLVMValueRef lindex1;
> +            LLVMValueRef input_ptr2;
> +            LLVMValueRef res2;
> +
> +            lindex1 = lp_build_const_int32(gallivm,
> +                                           reg->Register.Index * 4 + swizzle + 1);
> +            input_ptr2 = LLVMBuildGEP(builder,
> +                                      bld->inputs_array, &lindex1, 1, "");
> +            res2 = LLVMBuildLoad(builder, input_ptr2, "");
> +            res = emit_fetch_double(bld_base, stype, res, res2);
> +         }
>        }
>        else {
>           res = bld->inputs[reg->Register.Index][swizzle];
> +         if (stype == TGSI_TYPE_DOUBLE)
> +            res = emit_fetch_double(bld_base, stype, res, bld->inputs[reg->Register.Index][swizzle + 1]);
>        }
>     }
>  
> @@ -1372,6 +1437,8 @@ emit_fetch_input(
>        res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
>     } else if (stype == TGSI_TYPE_SIGNED) {
>        res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
> +   } else if (stype == TGSI_TYPE_DOUBLE) {
> +      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
>     }
>  
>     return res;
> @@ -1413,7 +1480,7 @@ emit_fetch_gs_input(
>     } else {
>        attrib_index = lp_build_const_int32(gallivm, reg->Register.Index);
>     }
> -   
> +
>     if (reg->Dimension.Indirect) {
>        vertex_index = get_indirect_index(bld,
>                                          reg->Register.File,
> @@ -1436,6 +1503,8 @@ emit_fetch_gs_input(
>        res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
>     } else if (stype == TGSI_TYPE_SIGNED) {
>        res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
> +   } else if (stype == TGSI_TYPE_DOUBLE) {
> +      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
>     }
>  
>     return res;
> @@ -1480,6 +1549,14 @@ emit_fetch_temporary(
>        LLVMValueRef temp_ptr;
>        temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
>        res = LLVMBuildLoad(builder, temp_ptr, "");
> +
> +      if (stype == TGSI_TYPE_DOUBLE) {
> +         LLVMValueRef temp_ptr2, res2;
> +
> +         temp_ptr2 = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle + 1);
> +         res2 = LLVMBuildLoad(builder, temp_ptr2, "");
> +         res = emit_fetch_double(bld_base, stype, res, res2);
> +      }
Same as above.

>     }
>  
>     if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
> @@ -1648,6 +1725,49 @@ emit_fetch_predicate(
>     }
>  }
>  
> +/**
> + * store an array of 8 doubles into two arrays of 8 floats
> + * i.e.
> + * value is d0, d1, d2, d3 etc.
> + * each double has high and low pieces x, y
> + * so gets stored into the separate channels as:
> + * chan_ptr = d0.x, d1.x, d2.x, d3.x
> + * chan_ptr2 = d0.y, d1.y, d2.y, d3.y
> + */
> +static void
> +emit_store_double_chan(struct lp_build_tgsi_context *bld_base,
> +                       int dtype,
> +                       LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2,
> +                       LLVMValueRef pred,
> +                       LLVMValueRef value)
> +{
> +   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
> +   struct gallivm_state *gallivm = bld_base->base.gallivm;
> +   LLVMBuilderRef builder = gallivm->builder;
> +   struct lp_build_context *float_bld = &bld_base->base;
> +   int i;
> +   if (dtype == TGSI_TYPE_DOUBLE) {
> +      LLVMValueRef temp, temp2;
> +
> +      temp = float_bld->undef;
> +      temp2 = float_bld->undef;
> +      for (i = 0; i < bld_base->base.type.length * 2; i += 2) {
> +         LLVMValueRef scalar, scalar2;
> +         LLVMValueRef ii = lp_build_const_int32(gallivm, i);
> +         LLVMValueRef ii1 = lp_build_const_int32(gallivm, i + 1);
> +         LLVMValueRef si = lp_build_const_int32(gallivm, i >> 1);
> +         scalar = LLVMBuildExtractElement(builder, value, ii, "");
> +         temp = LLVMBuildInsertElement(builder, temp, scalar, si, "");
> +         scalar2 = LLVMBuildExtractElement(builder, value, ii1, "");
> +         temp2 = LLVMBuildInsertElement(builder, temp2, scalar2, si, "");
> +      }
Same reservations about insert/extract as previously.

> +
> +      lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp, chan_ptr);
> +      lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp2, chan_ptr2);
> +   } else {
> +      lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, chan_ptr);
> +   }
> +}
>  
>  /**
>   * Register store.
> @@ -1721,13 +1841,21 @@ emit_store_chan(
>        else {
>           LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index,
>                                                    chan_index);
> -         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
> +         LLVMValueRef out_ptr2 = NULL;
> +         if (dtype == TGSI_TYPE_DOUBLE)
> +            out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index,
> +                                                  chan_index + 1);
> +
> +         emit_store_double_chan(bld_base, dtype, out_ptr, out_ptr2, pred, value);
>        }
>        break;
>  
>     case TGSI_FILE_TEMPORARY:
>        /* Temporaries are always stored as floats */
> -      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
> +      if (dtype != TGSI_TYPE_DOUBLE)
> +         value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
> +      else
> +         value = LLVMBuildBitCast(builder, value,  LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2), "");
>  
>        if (reg->Register.Indirect) {
>           LLVMValueRef index_vec;  /* indexes into the temp registers */
> @@ -1747,9 +1875,12 @@ emit_store_chan(
>                             &bld->exec_mask, pred);
>        }
>        else {
> -         LLVMValueRef temp_ptr;
> +         LLVMValueRef temp_ptr, temp_ptr2 = NULL;
>           temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index);
> -         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
> +         if (dtype == TGSI_TYPE_DOUBLE)
> +            temp_ptr2 = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index + 1);
> +
> +         emit_store_double_chan(bld_base, dtype, temp_ptr, temp_ptr2, pred, value);
Wondering about the indirect case here as well...

>        }
>        break;
>  
> @@ -1818,13 +1949,16 @@ emit_store(
>  {
>     unsigned chan_index;
>     struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
> -
> +   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
>     if(info->num_dst) {
>        LLVMValueRef pred[TGSI_NUM_CHANNELS];
>  
>        emit_fetch_predicate( bld, inst, pred );
>  
>        TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
> +
> +         if (dtype == TGSI_TYPE_DOUBLE && (chan_index == 1 || chan_index == 3))
> +             continue;
>           emit_store_chan(bld_base, inst, 0, chan_index, pred[chan_index], dst[chan_index]);
>        }
>     }
> @@ -2823,6 +2957,7 @@ void lp_emit_immediate_soa(
>                 lp_build_const_vec(gallivm, bld_base->base.type, imm->u[i].Float);
>  
>        break;
> +   case TGSI_IMM_FLOAT64:
>     case TGSI_IMM_UINT32:
>        for( i = 0; i < size; ++i ) {
>           LLVMValueRef tmp = lp_build_const_vec(gallivm, bld_base->uint_bld.type, imm->u[i].Uint);
> @@ -3674,6 +3809,12 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
>     lp_build_context_init(&bld.bld_base.uint_bld, gallivm, lp_uint_type(type));
>     lp_build_context_init(&bld.bld_base.int_bld, gallivm, lp_int_type(type));
>     lp_build_context_init(&bld.elem_bld, gallivm, lp_elem_type(type));
> +   {
> +      struct lp_type dbl_type;
> +      dbl_type = type;
> +      dbl_type.width *= 2;
> +      lp_build_context_init(&bld.bld_base.dbl_bld, gallivm, dbl_type);
> +   }
>     bld.mask = mask;
>     bld.inputs = inputs;
>     bld.outputs = outputs;
> 

Roland