[Mesa-dev] [PATCH] gallivm: optimize lp_build_minify for sse
Jose Fonseca
jfonseca at vmware.com
Tue Nov 5 12:04:29 PST 2013
Looks great to me
Jose
----- Original Message -----
> From: Roland Scheidegger <sroland at vmware.com>
>
> SSE can't handle true vector shifts (i.e. with a variable shift count),
> so llvm turns them into a mess of extracts, scalar shifts and inserts.
> It is, however, possible to emulate them in lp_build_minify with float
> muls, which should be way faster (saves over 20 instructions per 8-wide
> lp_build_minify). This wouldn't work for "generic" 32bit shifts though,
> since we've only got 24 bits of mantissa (for left shifts it would
> actually work by using a sse41 int mul instead of a float mul, but not
> for right shifts). Note that this has very limited scope for now, since
> it is only used with per-pixel lod (otherwise we avoid the non-constant
> shift count by doing per-quad shifts manually), and even then only for
> 1d textures (though the latter should change).
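
Nice trick. To spell it out for the archives, here's a minimal scalar
sketch of what the emulation does (my own illustration, not part of the
patch; minify_by_mul is a made-up name, and it assumes IEEE-754 binary32
floats and 0 <= level <= 126):

   #include <stdint.h>
   #include <string.h>

   /* Scalar equivalent of u_minify() via the float-mul trick. */
   static unsigned
   minify_by_mul(unsigned base_size, unsigned level)
   {
      /* build 2^(-level) directly from exponent bits: (127 - level) << 23 */
      uint32_t bits = (127u - level) << 23;
      float scale;
      memcpy(&scale, &bits, sizeof scale);   /* bitcast int -> float */
      /* exact while base_size fits in the 24-bit mantissa, which is the
       * "generic" 32bit shift limitation mentioned above */
      float size = (float)base_size * scale;
      if (size < 1.0f)
         size = 1.0f;                        /* max(1, base_size >> level) */
      return (unsigned)size;                 /* truncate back to int */
   }
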
> ---
> src/gallium/auxiliary/gallivm/lp_bld_sample.c | 62 +++++++++++++++++----
> src/gallium/auxiliary/gallivm/lp_bld_sample.h | 3 +-
> src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 2 +-
> 3 files changed, 54 insertions(+), 13 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
> index a032d9d..e60a035 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
> @@ -36,6 +36,7 @@
> #include "pipe/p_state.h"
> #include "util/u_format.h"
> #include "util/u_math.h"
> +#include "util/u_cpu_detect.h"
> #include "lp_bld_arit.h"
> #include "lp_bld_const.h"
> #include "lp_bld_debug.h"
> @@ -248,7 +249,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
> first_level = bld->dynamic_state->first_level(bld->dynamic_state,
> bld->gallivm, texture_unit);
> first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
> - int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
> + int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec, TRUE);
> float_size = lp_build_int_to_float(float_size_bld, int_size);
>
> if (cube_rho) {
> @@ -1089,12 +1090,14 @@ lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
>
> /**
> * Codegen equivalent for u_minify().
> + * @param lod_scalar if lod is a (broadcasted) scalar
> * Return max(1, base_size >> level);
> */
> LLVMValueRef
> lp_build_minify(struct lp_build_context *bld,
> LLVMValueRef base_size,
> - LLVMValueRef level)
> + LLVMValueRef level,
> + boolean lod_scalar)
> {
> LLVMBuilderRef builder = bld->gallivm->builder;
> assert(lp_check_value(bld->type, base_size));
> @@ -1105,10 +1108,49 @@ lp_build_minify(struct lp_build_context *bld,
> return base_size;
> }
> else {
> - LLVMValueRef size =
> - LLVMBuildLShr(builder, base_size, level, "minify");
> + LLVMValueRef size;
> assert(bld->type.sign);
> - size = lp_build_max(bld, size, bld->one);
> + if (lod_scalar ||
> + (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) {
> + size = LLVMBuildLShr(builder, base_size, level, "minify");
> + size = lp_build_max(bld, size, bld->one);
> + }
> + else {
> + /*
> + * emulate shift with float mul, since intel "forgot" shifts with
> + * per-element shift count until avx2, which results in terrible
> + * scalar extraction (both count and value), scalar shift,
> + * vector reinsertion. Should not be an issue on any non-x86 cpu
> + * with a vector instruction set.
> + * On cpus with AMD's XOP this should also be unnecessary but I'm
> + * not sure if llvm would emit this with current flags.
> + */
> + LLVMValueRef const127, const23, lf;
> + struct lp_type ftype;
> + struct lp_build_context fbld;
> + ftype = lp_type_float_vec(32, bld->type.length * bld->type.width);
> + lp_build_context_init(&fbld, bld->gallivm, ftype);
> + const127 = lp_build_const_int_vec(bld->gallivm, bld->type, 127);
> + const23 = lp_build_const_int_vec(bld->gallivm, bld->type, 23);
> +
> + /* calculate 2^(-level) float */
> + lf = lp_build_sub(bld, const127, level);
> + lf = lp_build_shl(bld, lf, const23);
> + lf = LLVMBuildBitCast(builder, lf, fbld.vec_type, "");
> +
> + /* finish shift operation by doing float mul */
> + base_size = lp_build_int_to_float(&fbld, base_size);
> + size = lp_build_mul(&fbld, base_size, lf);
> + /*
> + * do the max also with floats because
> + * a) non-emulated int max requires sse41
> + * (this is actually a lie as we could cast to 16bit values
> + * as 16bit is sufficient and 16bit int max is sse2)
> + * b) with avx we can do int max 4-wide but float max 8-wide
> + */
> + size = lp_build_max(&fbld, size, fbld.one);
> + size = lp_build_itrunc(&fbld, size);
> + }
> return size;
> }
> }
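
FWIW, per four lanes this should boil down to something like the
following SSE2 intrinsics sequence (again just my own sketch of the
idea, not what llvm literally emits). Note the one remaining shift, the
<<23, uses a shift count that is uniform across lanes, which plain sse2
handles fine:

   #include <emmintrin.h>   /* SSE2 */

   /* 4-wide u_minify() without per-element shifts: float-mul emulation */
   static __m128i
   minify4(__m128i base_size, __m128i level)
   {
      /* 2^(-level) from exponent bits: (127 - level) << 23, as float */
      __m128i ebits = _mm_slli_epi32(_mm_sub_epi32(_mm_set1_epi32(127),
                                                   level), 23);
      __m128 scale = _mm_castsi128_ps(ebits);
      __m128 fsize = _mm_cvtepi32_ps(base_size);     /* int -> float */
      fsize = _mm_mul_ps(fsize, scale);      /* base_size >> level, exact
                                                while base_size < 2^24 */
      fsize = _mm_max_ps(fsize, _mm_set1_ps(1.0f));  /* float max, no sse41 */
      return _mm_cvttps_epi32(fsize);                /* truncate back */
   }
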
> @@ -1185,7 +1227,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
> */
> if (bld->num_mips == 1) {
> ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
> - *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
> + *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec, TRUE);
> }
> else {
> LLVMValueRef int_size_vec;
> @@ -1229,7 +1271,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
> bld4.type,
> ilevel,
> indexi);
> - tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
> + tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli, TRUE);
> }
> /*
> * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
> @@ -1248,7 +1290,6 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
> * with 4-wide vector pack all elements into a 8xi16 vector
> * (on which we can still do useful math) instead of using a 16xi32
> * vector.
> - * FIXME: some callers can't handle this yet.
> * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
> * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
> */
> @@ -1257,8 +1298,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
> assert(bld->int_size_in_bld.type.length == 1);
> int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
> bld->int_size);
> - /* vector shift with variable shift count alert... */
> - *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
> + *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel, FALSE);
> }
> else {
> LLVMValueRef ilevel1;
> @@ -1267,7 +1307,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
> ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
> bld->int_size_in_bld.type,
> ilevel, indexi);
> tmp[i] = bld->int_size;
> - tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
> + tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1, TRUE);
> }
> *out_size = lp_build_concat(bld->gallivm, tmp, bld->int_size_in_bld.type,
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
> index 5039128..fd4e053 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
> @@ -547,7 +547,8 @@ lp_build_sample_nop(struct gallivm_state *gallivm,
> LLVMValueRef
> lp_build_minify(struct lp_build_context *bld,
> LLVMValueRef base_size,
> - LLVMValueRef level);
> + LLVMValueRef level,
> + boolean lod_scalar);
>
>
> #endif /* LP_BLD_SAMPLE_H */
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
> index 2d83331..e8c04d1 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
> @@ -2940,7 +2940,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
> lp_build_const_int32(gallivm, 2), "");
> }
>
> - size = lp_build_minify(&bld_int_vec4, size, lod);
> + size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
>
> if (has_array)
> size = LLVMBuildInsertElement(gallivm->builder, size,
> --
> 1.7.9.5
>