[Mesa-dev] [PATCH] gallivm: use llvm jit code for decoding s3tc
Jose Fonseca
jfonseca at vmware.com
Wed Dec 19 07:35:16 UTC 2018
On 19/12/2018 03:51, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> This is (much) faster than using the util fallback.
> (Note that there are two methods here: one would use a cache, similar to
> the existing code (although the cache was disabled), except the block
> decode is done with jit code, the other directly decodes the required
> pixels. For now don't use the cache (being direct-mapped is suboptimal,
> but it's difficult to come up with something better that doesn't have
> too much overhead).)
> ---
> src/gallium/auxiliary/Makefile.sources | 2 +-
> src/gallium/auxiliary/gallivm/lp_bld_format.h | 6 +-
> .../auxiliary/gallivm/lp_bld_format_aos.c | 5 +-
> .../auxiliary/gallivm/lp_bld_format_cached.c | 374 ---
> .../auxiliary/gallivm/lp_bld_format_s3tc.c | 2229 +++++++++++++++++
> .../auxiliary/gallivm/lp_bld_sample_soa.c | 4 -
> src/gallium/auxiliary/meson.build | 2 +-
> 7 files changed, 2239 insertions(+), 383 deletions(-)
> delete mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
> create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
>
> diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
> index 87a490e555d..50e88088ff8 100644
> --- a/src/gallium/auxiliary/Makefile.sources
> +++ b/src/gallium/auxiliary/Makefile.sources
> @@ -418,11 +418,11 @@ GALLIVM_SOURCES := \
> gallivm/lp_bld_flow.h \
> gallivm/lp_bld_format_aos_array.c \
> gallivm/lp_bld_format_aos.c \
> - gallivm/lp_bld_format_cached.c \
> gallivm/lp_bld_format_float.c \
> gallivm/lp_bld_format.c \
> gallivm/lp_bld_format.h \
> gallivm/lp_bld_format_soa.c \
> + gallivm/lp_bld_format_s3tc.c \
> gallivm/lp_bld_format_srgb.c \
> gallivm/lp_bld_format_yuv.c \
> gallivm/lp_bld_gather.c \
I suppose we need to update src/gallium/auxiliary/meson.build too. It's
a pity that meson doesn't understand Makefile.sources
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
> index 6540caaa293..b1e95c4e6db 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
> @@ -165,8 +165,12 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
> LLVMValueRef j);
>
>
> +/*
> + * S3TC
> + */
> +
> LLVMValueRef
> -lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
> +lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
> const struct util_format_description *format_desc,
> unsigned n,
> LLVMValueRef base_ptr,
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
> index b52acca1b3e..21680dba74a 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
> @@ -464,6 +464,7 @@ lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
> * \param ptr address of the pixel block (or the texel if uncompressed)
> * \param i, j the sub-block pixel coordinates. For non-compressed formats
> * these will always be (0, 0).
> + * \param cache optional value pointing to a lp_build_format_cache structure
> * \return a 4 element vector with the pixel's RGBA values.
> */
> LLVMValueRef
> @@ -728,7 +729,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
> * s3tc rgb formats
> */
>
> - if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) {
> + if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
> struct lp_type tmp_type;
> LLVMValueRef tmp;
>
> @@ -737,7 +738,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
> tmp_type.length = num_pixels * 4;
> tmp_type.norm = TRUE;
>
> - tmp = lp_build_fetch_cached_texels(gallivm,
> + tmp = lp_build_fetch_s3tc_rgba_aos(gallivm,
> format_desc,
> num_pixels,
> base_ptr,
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
> deleted file mode 100644
> index e08062dcacd..00000000000
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
> +++ /dev/null
> @@ -1,374 +0,0 @@
> -/**************************************************************************
> - *
> - * Copyright 2015 VMware, Inc.
> - * All Rights Reserved.
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the
> - * "Software"), to deal in the Software without restriction, including
> - * without limitation the rights to use, copy, modify, merge, publish,
> - * distribute, sub license, and/or sell copies of the Software, and to
> - * permit persons to whom the Software is furnished to do so, subject to
> - * the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the
> - * next paragraph) shall be included in all copies or substantial portions
> - * of the Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
> - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
> - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
> - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
> - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
> - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
> - *
> - **************************************************************************/
> -
> -#include "lp_bld_format.h"
> -#include "lp_bld_type.h"
> -#include "lp_bld_struct.h"
> -#include "lp_bld_const.h"
> -#include "lp_bld_flow.h"
> -#include "lp_bld_swizzle.h"
> -
> -#include "util/u_math.h"
> -
> -
> -/**
> - * @file
> - * Complex block-compression based formats are handled here by using a cache,
> - * so re-decoding of every pixel is not required.
> - * Especially for bilinear filtering, texel reuse is very high hence even
> - * a small cache helps.
> - * The elements in the cache are the decoded blocks - currently things
> - * are restricted to formats which are 4x4 block based, and the decoded
> - * texels must fit into 4x8 bits.
> - * The cache is direct mapped so hitrates aren't all that great and cache
> - * thrashing could happen.
> - *
> - * @author Roland Scheidegger <sroland at vmware.com>
> - */
> -
> -
> -#if LP_BUILD_FORMAT_CACHE_DEBUG
> -static void
> -update_cache_access(struct gallivm_state *gallivm,
> - LLVMValueRef ptr,
> - unsigned count,
> - unsigned index)
> -{
> - LLVMBuilderRef builder = gallivm->builder;
> - LLVMValueRef member_ptr, cache_access;
> -
> - assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
> - index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
> -
> - member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
> - cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
> - cache_access = LLVMBuildAdd(builder, cache_access,
> - LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
> - count, 0), "");
> - LLVMBuildStore(builder, cache_access, member_ptr);
> -}
> -#endif
> -
> -
> -static void
> -store_cached_block(struct gallivm_state *gallivm,
> - LLVMValueRef *col,
> - LLVMValueRef tag_value,
> - LLVMValueRef hash_index,
> - LLVMValueRef cache)
> -{
> - LLVMBuilderRef builder = gallivm->builder;
> - LLVMValueRef ptr, indices[3];
> - LLVMTypeRef type_ptr4x32;
> - unsigned count;
> -
> - type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
> - indices[0] = lp_build_const_int32(gallivm, 0);
> - indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
> - indices[2] = hash_index;
> - ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
> - LLVMBuildStore(builder, tag_value, ptr);
> -
> - indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
> - hash_index = LLVMBuildMul(builder, hash_index,
> - lp_build_const_int32(gallivm, 16), "");
> - for (count = 0; count < 4; count++) {
> - indices[2] = hash_index;
> - ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
> - ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
> - LLVMBuildStore(builder, col[count], ptr);
> - hash_index = LLVMBuildAdd(builder, hash_index,
> - lp_build_const_int32(gallivm, 4), "");
> - }
> -}
> -
> -
> -static LLVMValueRef
> -lookup_cached_pixel(struct gallivm_state *gallivm,
> - LLVMValueRef ptr,
> - LLVMValueRef index)
> -{
> - LLVMBuilderRef builder = gallivm->builder;
> - LLVMValueRef member_ptr, indices[3];
> -
> - indices[0] = lp_build_const_int32(gallivm, 0);
> - indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
> - indices[2] = index;
> - member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
> - return LLVMBuildLoad(builder, member_ptr, "cache_data");
> -}
> -
> -
> -static LLVMValueRef
> -lookup_tag_data(struct gallivm_state *gallivm,
> - LLVMValueRef ptr,
> - LLVMValueRef index)
> -{
> - LLVMBuilderRef builder = gallivm->builder;
> - LLVMValueRef member_ptr, indices[3];
> -
> - indices[0] = lp_build_const_int32(gallivm, 0);
> - indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
> - indices[2] = index;
> - member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
> - return LLVMBuildLoad(builder, member_ptr, "tag_data");
> -}
> -
> -
> -static void
> -update_cached_block(struct gallivm_state *gallivm,
> - const struct util_format_description *format_desc,
> - LLVMValueRef ptr_addr,
> - LLVMValueRef hash_index,
> - LLVMValueRef cache)
> -
> -{
> - LLVMBuilderRef builder = gallivm->builder;
> - LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
> - LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
> - LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
> - LLVMTypeRef i32x4 = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
> - LLVMValueRef function;
> - LLVMValueRef tag_value, tmp_ptr;
> - LLVMValueRef col[4];
> - unsigned i, j;
> -
> - /*
> - * Use format_desc->fetch_rgba_8unorm() for each pixel in the block.
> - * This doesn't actually make any sense whatsoever, someone would need
> - * to write a function doing this for all pixels in a block (either as
> - * an external c function or with generated code). Don't ask.
> - */
> -
> - {
> - /*
> - * Function to call looks like:
> - * fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
> - */
> - LLVMTypeRef ret_type;
> - LLVMTypeRef arg_types[4];
> - LLVMTypeRef function_type;
> -
> - assert(format_desc->fetch_rgba_8unorm);
> -
> - ret_type = LLVMVoidTypeInContext(gallivm->context);
> - arg_types[0] = pi8t;
> - arg_types[1] = pi8t;
> - arg_types[2] = i32t;
> - arg_types[3] = i32t;
> - function_type = LLVMFunctionType(ret_type, arg_types,
> - ARRAY_SIZE(arg_types), 0);
> -
> - /* make const pointer for the C fetch_rgba_8unorm function */
> - function = lp_build_const_int_pointer(gallivm,
> - func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
> -
> - /* cast the callee pointer to the function's type */
> - function = LLVMBuildBitCast(builder, function,
> - LLVMPointerType(function_type, 0),
> - "cast callee");
> - }
> -
> - tmp_ptr = lp_build_array_alloca(gallivm, i32x4,
> - lp_build_const_int32(gallivm, 16),
> - "tmp_decode_store");
> - tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
> -
> - /*
> - * Invoke format_desc->fetch_rgba_8unorm() for each pixel.
> - * This is going to be really really slow.
> - * Note: the block store format is actually
> - * x0y0x0y1x0y2x0y3 x1y0x1y1x1y2x1y3 ...
> - */
> - for (i = 0; i < 4; ++i) {
> - for (j = 0; j < 4; ++j) {
> - LLVMValueRef args[4];
> - LLVMValueRef dst_offset = lp_build_const_int32(gallivm, (i * 4 + j) * 4);
> -
> - /*
> - * Note we actually supply a pointer to the start of the block,
> - * not the start of the texture.
> - */
> - args[0] = LLVMBuildGEP(gallivm->builder, tmp_ptr, &dst_offset, 1, "");
> - args[1] = ptr_addr;
> - args[2] = LLVMConstInt(i32t, i, 0);
> - args[3] = LLVMConstInt(i32t, j, 0);
> - LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
> - }
> - }
> -
> - /* Finally store the block - pointless mem copy + update tag. */
> - tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, LLVMPointerType(i32x4, 0), "");
> - for (i = 0; i < 4; ++i) {
> - LLVMValueRef tmp_offset = lp_build_const_int32(gallivm, i);
> - LLVMValueRef ptr = LLVMBuildGEP(gallivm->builder, tmp_ptr, &tmp_offset, 1, "");
> - col[i] = LLVMBuildLoad(builder, ptr, "");
> - }
> -
> - tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
> - LLVMInt64TypeInContext(gallivm->context), "");
> - store_cached_block(gallivm, col, tag_value, hash_index, cache);
> -}
> -
> -
> -/*
> - * Do a cached lookup.
> - *
> - * Returns (vectors of) 4x8 rgba aos value
> - */
> -LLVMValueRef
> -lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
> - const struct util_format_description *format_desc,
> - unsigned n,
> - LLVMValueRef base_ptr,
> - LLVMValueRef offset,
> - LLVMValueRef i,
> - LLVMValueRef j,
> - LLVMValueRef cache)
> -
> -{
> - LLVMBuilderRef builder = gallivm->builder;
> - unsigned count, low_bit, log2size;
> - LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
> - LLVMValueRef ij_index, hash_index, hash_mask, block_index;
> - LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
> - LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
> - LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
> - struct lp_type type;
> - struct lp_build_context bld32;
> - memset(&type, 0, sizeof type);
> - type.width = 32;
> - type.length = n;
> -
> - assert(format_desc->block.width == 4);
> - assert(format_desc->block.height == 4);
> -
> - lp_build_context_init(&bld32, gallivm, type);
> -
> - /*
> - * compute hash - we use direct mapped cache, the hash function could
> - * be better but it needs to be simple
> - * per-element:
> - * compare offset with offset stored at tag (hash)
> - * if not equal decode/store block, update tag
> - * extract color from cache
> - * assemble result vector
> - */
> -
> - /* TODO: not ideal with 32bit pointers... */
> -
> - low_bit = util_logbase2(format_desc->block.bits / 8);
> - log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
> - addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
> - ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
> - ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
> - /* For the hash function, first mask off the unused lowest bits. Then just
> - do some xor with address bits - only use lower 32bits */
> - ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
> - ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
> - lp_build_const_int_vec(gallivm, type, low_bit), "");
> - /* This only really makes sense for size 64,128,256 */
> - hash_index = ptr_addrtrunc;
> - ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
> - lp_build_const_int_vec(gallivm, type, 2*log2size), "");
> - hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
> - tmp = LLVMBuildLShr(builder, hash_index,
> - lp_build_const_int_vec(gallivm, type, log2size), "");
> - hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
> -
> - hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
> - hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
> - ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
> - ij_index = LLVMBuildAdd(builder, ij_index, j, "");
> - block_index = LLVMBuildShl(builder, hash_index,
> - lp_build_const_int_vec(gallivm, type, 4), "");
> - block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
> -
> - if (n > 1) {
> - color = LLVMGetUndef(LLVMVectorType(i32t, n));
> - for (count = 0; count < n; count++) {
> - LLVMValueRef index, cond, colorx;
> - LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
> - struct lp_build_if_state if_ctx;
> -
> - index = lp_build_const_int32(gallivm, count);
> - offsetx = LLVMBuildExtractElement(builder, offset, index, "");
> - addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
> - addrx = LLVMBuildAdd(builder, addrx, addr, "");
> - block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
> - hash_indexx = LLVMBuildLShr(builder, block_indexx,
> - lp_build_const_int32(gallivm, 4), "");
> - offset_stored = lookup_tag_data(gallivm, cache, hash_indexx);
> - cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
> -
> - lp_build_if(&if_ctx, gallivm, cond);
> - {
> - ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
> - LLVMPointerType(i8t, 0), "");
> - update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
> -#if LP_BUILD_FORMAT_CACHE_DEBUG
> - update_cache_access(gallivm, cache, 1,
> - LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
> -#endif
> - }
> - lp_build_endif(&if_ctx);
> -
> - colorx = lookup_cached_pixel(gallivm, cache, block_indexx);
> -
> - color = LLVMBuildInsertElement(builder, color, colorx,
> - lp_build_const_int32(gallivm, count), "");
> - }
> - }
> - else {
> - LLVMValueRef cond;
> - struct lp_build_if_state if_ctx;
> -
> - tmp = LLVMBuildZExt(builder, offset, i64t, "");
> - addr = LLVMBuildAdd(builder, tmp, addr, "");
> - offset_stored = lookup_tag_data(gallivm, cache, hash_index);
> - cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
> -
> - lp_build_if(&if_ctx, gallivm, cond);
> - {
> - tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
> - update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
> -#if LP_BUILD_FORMAT_CACHE_DEBUG
> - update_cache_access(gallivm, cache, 1,
> - LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
> -#endif
> - }
> - lp_build_endif(&if_ctx);
> -
> - color = lookup_cached_pixel(gallivm, cache, block_index);
> - }
> -#if LP_BUILD_FORMAT_CACHE_DEBUG
> - update_cache_access(gallivm, cache, n,
> - LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
> -#endif
> - return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
> -}
> -
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
> new file mode 100644
> index 00000000000..2b143566f24
> --- /dev/null
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
> @@ -0,0 +1,2229 @@
> +/**************************************************************************
> + *
> + * Copyright 2010-2018 VMware, Inc.
> + * All Rights Reserved.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the
> + * "Software"), to deal in the Software without restriction, including
> + * without limitation the rights to use, copy, modify, merge, publish,
> + * distribute, sub license, and/or sell copies of the Software, and to
> + * permit persons to whom the Software is furnished to do so, subject to
> + * the following conditions:
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
> + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
> + * USE OR OTHER DEALINGS IN THE SOFTWARE.
> + *
> + * The above copyright notice and this permission notice (including the
> + * next paragraph) shall be included in all copies or substantial portions
> + * of the Software.
> + *
> + **************************************************************************/
> +
> +
> +/**
> + * @file
> + * s3tc pixel format manipulation.
> + *
> + * @author Roland Scheidegger <sroland at vmware.com>
> + */
> +
> +
> +#include "util/u_format.h"
> +#include "util/u_math.h"
> +#include "util/u_string.h"
> +#include "util/u_cpu_detect.h"
> +#include "util/u_debug.h"
> +
> +#include "lp_bld_arit.h"
> +#include "lp_bld_type.h"
> +#include "lp_bld_const.h"
> +#include "lp_bld_conv.h"
> +#include "lp_bld_gather.h"
> +#include "lp_bld_format.h"
> +#include "lp_bld_logic.h"
> +#include "lp_bld_pack.h"
> +#include "lp_bld_flow.h"
> +#include "lp_bld_printf.h"
> +#include "lp_bld_struct.h"
> +#include "lp_bld_swizzle.h"
> +#include "lp_bld_init.h"
> +#include "lp_bld_debug.h"
> +#include "lp_bld_intr.h"
> +
> +
> +/**
> + * Reverse an interleave2_half
> + * (i.e. pick every second element, independent lower/upper halves)
> + * sse2 can only do that with 32bit (shufps) or larger elements
> + * natively. (Otherwise, and/pack (even) or shift/pack (odd)
> + * could be used, ideally llvm would do that for us.)
> + * XXX: Unfortunately, this does NOT translate to a shufps if those
> + * are int vectors (and casting will not help, llvm needs to recognize it
> + * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
> + * sequence which I'm pretty sure is a lot worse despite domain transition
> + * penalties with shufps (except maybe on Nehalem).
> + */
> +static LLVMValueRef
> +lp_build_uninterleave2_half(struct gallivm_state *gallivm,
> + struct lp_type type,
> + LLVMValueRef a,
> + LLVMValueRef b,
> + unsigned lo_hi)
> +{
> + LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
> + unsigned i, j;
> +
> + assert(type.length <= LP_MAX_VECTOR_LENGTH);
> + assert(lo_hi < 2);
> +
> + if (type.length * type.width == 256) {
> + assert(type.length >= 4);
> + for (i = 0, j = 0; i < type.length; ++i) {
> + if (i == type.length / 4) {
> + j = type.length;
> + } else if (i == type.length / 2) {
> + j = type.length / 2;
> + } else if (i == 3 * type.length / 4) {
> + j = 3 * type.length / 4;
> + } else {
> + j += 2;
> + }
> + elems[i] = lp_build_const_int32(gallivm, j + lo_hi);
> + }
> + } else {
> + for (i = 0; i < type.length; ++i) {
> + elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
> + }
> + }
> +
> + shuffle = LLVMConstVector(elems, type.length);
> +
> + return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
> +
> +}
> +
> +
> +/**
> + * Build shuffle for extending vectors.
> + */
> +static LLVMValueRef
> +lp_build_const_extend_shuffle(struct gallivm_state *gallivm,
> + unsigned n, unsigned length)
> +{
> + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
> + unsigned i;
> +
> + assert(n <= length);
> + assert(length <= LP_MAX_VECTOR_LENGTH);
> +
> + /* TODO: cache results in a static table */
> +
> + for(i = 0; i < n; i++) {
> + elems[i] = lp_build_const_int32(gallivm, i);
> + }
> + for (i = n; i < length; i++) {
> + elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
> + }
> +
> + return LLVMConstVector(elems, length);
> +}
> +
> +static LLVMValueRef
> +lp_build_const_unpackx2_shuffle(struct gallivm_state *gallivm, unsigned n)
> +{
> + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
> + unsigned i, j;
> +
> + assert(n <= LP_MAX_VECTOR_LENGTH);
> +
> + /* TODO: cache results in a static table */
> +
> + for(i = 0, j = 0; i < n; i += 2, ++j) {
> + elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
> + elems[i + 1] = lp_build_const_int32(gallivm, n + j);
> + elems[n + i + 0] = lp_build_const_int32(gallivm, 0 + n/2 + j);
> + elems[n + i + 1] = lp_build_const_int32(gallivm, n + n/2 + j);
> + }
> +
> + return LLVMConstVector(elems, n * 2);
> +}
> +
> +/*
> + * broadcast 1 element to all elements
> + */
> +static LLVMValueRef
> +lp_build_const_shuffle1(struct gallivm_state *gallivm,
> + unsigned index, unsigned n)
> +{
> + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
> + unsigned i;
> +
> + assert(n <= LP_MAX_VECTOR_LENGTH);
> +
> + /* TODO: cache results in a static table */
> +
> + for (i = 0; i < n; i++) {
> + elems[i] = lp_build_const_int32(gallivm, index);
> + }
> +
> + return LLVMConstVector(elems, n);
> +}
> +
> +/*
> + * move 1 element to pos 0, rest undef
> + */
> +static LLVMValueRef
> +lp_build_shuffle1undef(struct gallivm_state *gallivm,
> + LLVMValueRef a, unsigned index, unsigned n)
> +{
> + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH], shuf;
> + unsigned i;
> +
> + assert(n <= LP_MAX_VECTOR_LENGTH);
> +
> + elems[0] = lp_build_const_int32(gallivm, index);
> +
> + for (i = 1; i < n; i++) {
> + elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
> + }
> + shuf = LLVMConstVector(elems, n);
> +
> + return LLVMBuildShuffleVector(gallivm->builder, a, a, shuf, "");
> +}
> +
> +static boolean
> +format_dxt1_variant(enum pipe_format format)
> +{
> + return format == PIPE_FORMAT_DXT1_RGB ||
> + format == PIPE_FORMAT_DXT1_RGBA ||
> + format == PIPE_FORMAT_DXT1_SRGB ||
> + format == PIPE_FORMAT_DXT1_SRGBA;
> +
> +}
> +
> +/**
> + * Gather elements from scatter positions in memory into vectors.
> + * This is customised for fetching texels from s3tc textures.
> + * For SSE, typical value is length=4.
> + *
> + * @param length length of the offsets
> + * @param colors the stored colors of the blocks will be extracted into this.
> + * @param codewords the codewords of the blocks will be extracted into this.
> + * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
> + * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
> + * @param base_ptr base pointer, should be a i8 pointer type.
> + * @param offsets vector with offsets
> + */
> +static void
> +lp_build_gather_s3tc(struct gallivm_state *gallivm,
> + unsigned length,
> + const struct util_format_description *format_desc,
> + LLVMValueRef *colors,
> + LLVMValueRef *codewords,
> + LLVMValueRef *alpha_lo,
> + LLVMValueRef *alpha_hi,
> + LLVMValueRef base_ptr,
> + LLVMValueRef offsets)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + unsigned block_bits = format_desc->block.bits;
> + unsigned i;
> + LLVMValueRef elems[8];
> + LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
> + LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
> + LLVMTypeRef type32dxt;
> + struct lp_type lp_type32dxt;
> +
> + memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
> + lp_type32dxt.width = 32;
> + lp_type32dxt.length = block_bits / 32;
> + type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
> +
> + assert(block_bits == 64 || block_bits == 128);
> + assert(length == 1 || length == 4 || length == 8);
> +
> + for (i = 0; i < length; ++i) {
> + elems[i] = lp_build_gather_elem(gallivm, length,
> + block_bits, block_bits, TRUE,
> + base_ptr, offsets, i, FALSE);
> + elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
> + }
> + if (length == 1) {
> + LLVMValueRef elem = elems[0];
> + if (block_bits == 128) {
> + *alpha_lo = LLVMBuildExtractElement(builder, elem,
> + lp_build_const_int32(gallivm, 0), "");
> + *alpha_hi = LLVMBuildExtractElement(builder, elem,
> + lp_build_const_int32(gallivm, 1), "");
> + *colors = LLVMBuildExtractElement(builder, elem,
> + lp_build_const_int32(gallivm, 2), "");
> + *codewords = LLVMBuildExtractElement(builder, elem,
> + lp_build_const_int32(gallivm, 3), "");
> + }
> + else {
> + *alpha_lo = LLVMGetUndef(type32);
> + *alpha_hi = LLVMGetUndef(type32);
> + *colors = LLVMBuildExtractElement(builder, elem,
> + lp_build_const_int32(gallivm, 0), "");
> + *codewords = LLVMBuildExtractElement(builder, elem,
> + lp_build_const_int32(gallivm, 1), "");
> + }
> + }
> + else {
> + LLVMValueRef tmp[4], cc01, cc23;
> + struct lp_type lp_type32, lp_type64, lp_type32dxt;
> + memset(&lp_type32, 0, sizeof lp_type32);
> + lp_type32.width = 32;
> + lp_type32.length = length;
> + memset(&lp_type64, 0, sizeof lp_type64);
> + lp_type64.width = 64;
> + lp_type64.length = length/2;
> +
> + if (block_bits == 128) {
> + if (length == 8) {
> + for (i = 0; i < 4; ++i) {
> + tmp[0] = elems[i];
> + tmp[1] = elems[i+4];
> + elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
> + }
> + }
> + lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
> + *colors = tmp[2];
> + *codewords = tmp[3];
> + *alpha_lo = tmp[0];
> + *alpha_hi = tmp[1];
> + } else {
> + LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
> + LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
> +
> + for (i = 0; i < length; ++i) {
> + /* no-op shuffle */
> + elems[i] = LLVMBuildShuffleVector(builder, elems[i],
> + LLVMGetUndef(type32dxt),
> + lp_build_const_extend_shuffle(gallivm, 2, 4), "");
> + }
> + if (length == 8) {
> + for (i = 0; i < 4; ++i) {
> + tmp[0] = elems[i];
> + tmp[1] = elems[i+4];
> + elems[i] = lp_build_concat(gallivm, tmp, lp_type32, 2);
> + }
> + }
> + cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
> + cc23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
> + cc01 = LLVMBuildBitCast(builder, cc01, type64_vec, "");
> + cc23 = LLVMBuildBitCast(builder, cc23, type64_vec, "");
> + *colors = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 0);
> + *codewords = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 1);
> + *colors = LLVMBuildBitCast(builder, *colors, type32_vec, "");
> + *codewords = LLVMBuildBitCast(builder, *codewords, type32_vec, "");
> + }
> + }
> +}
> +
> +/** Convert from <n x i32> containing 2 x n rgb565 colors
> + * to 2 <n x i32> rgba8888 colors
> + * This is the most optimized version I can think of;
> + * should be nearly as fast as decoding only one color
> + * NOTE: alpha channel will be set to 0
> + * @param colors is a <n x i32> vector containing the rgb565 colors
> + */
> +static void
> +color_expand2_565_to_8888(struct gallivm_state *gallivm,
> + unsigned n,
> + LLVMValueRef colors,
> + LLVMValueRef *color0,
> + LLVMValueRef *color1)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef r, g, b, rblo, glo;
> + LLVMValueRef rgblomask, rb, rgb0, rgb1;
> + struct lp_type type, type16, type8;
> +
> + assert(n > 1);
> +
> + memset(&type, 0, sizeof type);
> + type.width = 32;
> + type.length = n;
> +
> + memset(&type16, 0, sizeof type16);
> + type16.width = 16;
> + type16.length = 2 * n;
> +
> + memset(&type8, 0, sizeof type8);
> + type8.width = 8;
> + type8.length = 4 * n;
> +
> + rgblomask = lp_build_const_int_vec(gallivm, type16, 0x0707);
> + colors = LLVMBuildBitCast(builder, colors,
> + lp_build_vec_type(gallivm, type16), "");
> + /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits)
> + * make sure low bits of r are zero - could use AND but requires constant */
> + r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
> + r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type16, 3), "");
> + b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
> + rb = LLVMBuildOr(builder, r, b, "");
> + rblo = LLVMBuildLShr(builder, rb, lp_build_const_int_vec(gallivm, type16, 5), "");
> + /* don't have byte shift hence need mask */
> + rblo = LLVMBuildAnd(builder, rblo, rgblomask, "");
> + rb = LLVMBuildOr(builder, rb, rblo, "");
> +
> + /* make sure low bits of g are zero */
> + g = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type16, 0x07e0), "");
> + g = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 3), "");
> + glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 6), "");
> + g = LLVMBuildOr(builder, g, glo, "");
> +
> + rb = LLVMBuildBitCast(builder, rb, lp_build_vec_type(gallivm, type8), "");
> + g = LLVMBuildBitCast(builder, g, lp_build_vec_type(gallivm, type8), "");
> + rgb0 = lp_build_interleave2_half(gallivm, type8, rb, g, 0);
> + rgb1 = lp_build_interleave2_half(gallivm, type8, rb, g, 1);
> +
> + rgb0 = LLVMBuildBitCast(builder, rgb0, lp_build_vec_type(gallivm, type), "");
> + rgb1 = LLVMBuildBitCast(builder, rgb1, lp_build_vec_type(gallivm, type), "");
> +
> + /* rgb0 is rgb00, rgb01, rgb10, rgb11
> + * instead of rgb00, rgb10, rgb20, rgb30 hence need reshuffle
> + * on x86 this _should_ just generate one shufps...
> + */
> + *color0 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 0);
> + *color1 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 1);
> +}
> +
> +
> +/** Convert from <n x i32> containing rgb565 colors
> + * (in first 16 bits) to <n x i32> rgba8888 colors
> + * bits 16-31 MBZ
> + * NOTE: alpha channel will be set to 0
> + * @param colors is a <n x i32> vector containing the rgb565 colors
> + */
> +static LLVMValueRef
> +color_expand_565_to_8888(struct gallivm_state *gallivm,
> + unsigned n,
> + LLVMValueRef colors)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef rgba, r, g, b, rgblo, glo;
> + LLVMValueRef rbhimask, g6mask, rgblomask;
> + struct lp_type type;
> + memset(&type, 0, sizeof type);
> + type.width = 32;
> + type.length = n;
> +
> + /* color expansion:
> + * first extract and shift colors into their final locations
> + * (high bits - low bits zero at this point)
> + * then replicate highest bits to the lowest bits
> + * note rb replication can be done in parallel but not g
> + * (different shift)
> + * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f
> + * rhigh = 8, ghigh = 5, bhigh = 19
> + * rblow = 5, glow = 6
> + * rgblomask = 0x00070307
> + * r = colors >> rhigh
> + * b = colors << bhigh
> + * g = (colors & g6mask) << ghigh
> + * rb = (r | b) & rbhimask
> + * rbtmp = rb >> rblow
> + * gtmp = g >> glow
> + * rbtmp = rbtmp | gtmp
> + * rbtmp = rbtmp & rgblomask
> + * rgb = rb | g | rbtmp
> + */
> + g6mask = lp_build_const_int_vec(gallivm, type, 0x07e0);
> + rbhimask = lp_build_const_int_vec(gallivm, type, 0x00f800f8);
> + rgblomask = lp_build_const_int_vec(gallivm, type, 0x00070307);
> +
> + r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 8), "");
> + b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type, 19), "");
> + g = LLVMBuildAnd(builder, colors, g6mask, "");
> + g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 5), "");
> + rgba = LLVMBuildOr(builder, r, b, "");
> + rgba = LLVMBuildAnd(builder, rgba, rbhimask, "");
> + rgblo = LLVMBuildLShr(builder, rgba, lp_build_const_int_vec(gallivm, type, 5), "");
> + glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type, 6), "");
> + rgblo = LLVMBuildOr(builder, rgblo, glo, "");
> + rgblo = LLVMBuildAnd(builder, rgblo, rgblomask, "");
> + rgba = LLVMBuildOr(builder, rgba, g, "");
> + rgba = LLVMBuildOr(builder, rgba, rgblo, "");
> +
> + return rgba;
> +}
> +
> +
> +/**
> + * Calculate 1/3(v1-v0) + v0
> + * and 2*1/3(v1-v0) + v0
> + */
> +static void
> +lp_build_lerp23(struct lp_build_context *bld,
> + LLVMValueRef v0,
> + LLVMValueRef v1,
> + LLVMValueRef *res0,
> + LLVMValueRef *res1)
> +{
> + struct gallivm_state *gallivm = bld->gallivm;
> + LLVMValueRef x, x_lo, x_hi, delta_lo, delta_hi;
> + LLVMValueRef mul_lo, mul_hi, v0_lo, v0_hi, v1_lo, v1_hi, tmp;
> + const struct lp_type type = bld->type;
> + LLVMBuilderRef builder = bld->gallivm->builder;
> + struct lp_type i16_type = lp_wider_type(type);
> + struct lp_build_context bld2;
> +
> + assert(lp_check_value(type, v0));
> + assert(lp_check_value(type, v1));
> + assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
> +
> + lp_build_context_init(&bld2, gallivm, i16_type);
> + bld2.type.sign = TRUE;
> + x = lp_build_const_int_vec(gallivm, bld->type, 255*1/3);
> +
> + /* FIXME: use native avx256 unpack/pack */
> + lp_build_unpack2(gallivm, type, i16_type, x, &x_lo, &x_hi);
> + lp_build_unpack2(gallivm, type, i16_type, v0, &v0_lo, &v0_hi);
> + lp_build_unpack2(gallivm, type, i16_type, v1, &v1_lo, &v1_hi);
> + delta_lo = lp_build_sub(&bld2, v1_lo, v0_lo);
> + delta_hi = lp_build_sub(&bld2, v1_hi, v0_hi);
> +
> + mul_lo = LLVMBuildMul(builder, x_lo, delta_lo, "");
> + mul_hi = LLVMBuildMul(builder, x_hi, delta_hi, "");
> +
> + x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 8), "");
> + x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 8), "");
> + /* lerp optimization: pack now, do add afterwards */
> + tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
> + *res0 = lp_build_add(bld, tmp, v0);
> +
> + x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 7), "");
> + x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 7), "");
> + /* unlike above still need mask (but add still afterwards). */
> + x_lo = LLVMBuildAnd(builder, x_lo, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
> + x_hi = LLVMBuildAnd(builder, x_hi, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
> + tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
> + *res1 = lp_build_add(bld, tmp, v0);
> +}
> +
> +/**
> + * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
> + * @param colors is a <n x i32> vector with n x 2x16bit colors
> + * @param codewords is a <n x i32> vector containing the codewords
> + * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
> + * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
> + */
> +static LLVMValueRef
> +s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
> + unsigned n,
> + enum pipe_format format,
> + LLVMValueRef colors,
> + LLVMValueRef codewords,
> + LLVMValueRef i,
> + LLVMValueRef j)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef color0, color1, color2, color3, color2_2, color3_2;
> + LLVMValueRef rgba, a, colors0, colors1, col0, col1, const2;
> + LLVMValueRef bit_pos, sel_mask, sel_lo, sel_hi, indices;
> + struct lp_type type, type8;
> + struct lp_build_context bld8, bld32;
> + boolean is_dxt1_variant = format_dxt1_variant(format);
> +
> + memset(&type, 0, sizeof type);
> + type.width = 32;
> + type.length = n;
> +
> + memset(&type8, 0, sizeof type8);
> + type8.width = 8;
> + type8.length = 4*n;
> +
> + assert(lp_check_value(type, i));
> + assert(lp_check_value(type, j));
> +
> + a = lp_build_const_int_vec(gallivm, type, 0xff000000);
> +
> + lp_build_context_init(&bld32, gallivm, type);
> + lp_build_context_init(&bld8, gallivm, type8);
> +
> + /*
> + * works as follows:
> + * - expand color0/color1 to rgba8888
> + * - calculate color2/3 (interpolation) according to color0 < color1 rules
> + * - calculate color2/3 according to color0 >= color1 rules
> + * - do selection of color2/3 according to comparison of color0/1
> + * - extract indices (vector shift).
> + * - use compare/select to select the correct color. Since we have 2bit
> + * indices (and 4 colors), needs at least three compare/selects.
> + */
> + /*
> + * expand the two colors
> + */
> + col0 = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type, 0x0000ffff), "");
> + col1 = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 16), "");
> + if (n > 1) {
> + color_expand2_565_to_8888(gallivm, n, colors, &color0, &color1);
> + }
> + else {
> + color0 = color_expand_565_to_8888(gallivm, n, col0);
> + color1 = color_expand_565_to_8888(gallivm, n, col1);
> + }
> +
> + /*
> + * interpolate colors
> + * color2_1 is 2/3 color0 + 1/3 color1
> + * color3_1 is 1/3 color0 + 2/3 color1
> + * color2_2 is 1/2 color0 + 1/2 color1
> + * color3_2 is 0
> + */
> +
> + colors0 = LLVMBuildBitCast(builder, color0, bld8.vec_type, "");
> + colors1 = LLVMBuildBitCast(builder, color1, bld8.vec_type, "");
> + /* can combine 2 lerps into one mostly - still looks expensive enough. */
> + lp_build_lerp23(&bld8, colors0, colors1, &color2, &color3);
> + color2 = LLVMBuildBitCast(builder, color2, bld32.vec_type, "");
> + color3 = LLVMBuildBitCast(builder, color3, bld32.vec_type, "");
> +
> + /* dxt3/5 always use 4-color encoding */
> + if (is_dxt1_variant) {
> + /* fix up alpha */
> + if (format == PIPE_FORMAT_DXT1_RGBA ||
> + format == PIPE_FORMAT_DXT1_SRGBA) {
> + color0 = LLVMBuildOr(builder, color0, a, "");
> + color1 = LLVMBuildOr(builder, color1, a, "");
> + color3 = LLVMBuildOr(builder, color3, a, "");
> + }
> + /*
> + * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
> + * Much cheaper (but we don't care that much if n == 1).
> + */
> + if ((util_cpu_caps.has_sse2 && n == 4) ||
> + (util_cpu_caps.has_avx2 && n == 8)) {
> + LLVMValueRef intrargs[2];
> + char *intr_name = n == 8 ? "llvm.x86.avx2.pavg.b" :
> + "llvm.x86.sse2.pavg.b";
> + intrargs[0] = colors0;
> + intrargs[1] = colors1;
> + color2_2 = lp_build_intrinsic(builder, intr_name,
> + bld8.vec_type, intrargs, 2, 0);
> + color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
> + }
> + else {
> + struct lp_type i16_type = lp_wider_type(type8);
> + struct lp_build_context bld2;
> + LLVMValueRef v0_lo, v0_hi, v1_lo, v1_hi, addlo, addhi;
> +
> + lp_build_context_init(&bld2, gallivm, i16_type);
> + bld2.type.sign = TRUE;
> +
> + /*
> + * This isn't as expensive as it looks (the unpack is the same as
> + * for lerp23), with correct rounding.
> + * (Note that while rounding is correct, this will always round down,
> + * whereas pavgb will always round up.)
> + */
> + /* FIXME: use native avx256 unpack/pack */
> + lp_build_unpack2(gallivm, type8, i16_type, colors0, &v0_lo, &v0_hi);
> + lp_build_unpack2(gallivm, type8, i16_type, colors1, &v1_lo, &v1_hi);
> +
> + addlo = lp_build_add(&bld2, v0_lo, v1_lo);
> + addhi = lp_build_add(&bld2, v0_hi, v1_hi);
> + addlo = LLVMBuildLShr(builder, addlo,
> + lp_build_const_int_vec(gallivm, i16_type, 1), "");
> + addhi = LLVMBuildLShr(builder, addhi,
> + lp_build_const_int_vec(gallivm, i16_type, 1), "");
> + color2_2 = lp_build_pack2(gallivm, i16_type, type8, addlo, addhi);
> + color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
> + }
> + color3_2 = lp_build_const_int_vec(gallivm, type, 0);
> +
> + /* select between colors2/3 */
> + /* signed compare is faster saves some xors */
> + type.sign = TRUE;
> + sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, col0, col1);
> + color2 = lp_build_select(&bld32, sel_mask, color2, color2_2);
> + color3 = lp_build_select(&bld32, sel_mask, color3, color3_2);
> + type.sign = FALSE;
> +
> + if (format == PIPE_FORMAT_DXT1_RGBA ||
> + format == PIPE_FORMAT_DXT1_SRGBA) {
> + color2 = LLVMBuildOr(builder, color2, a, "");
> + }
> + }
> +
> + const2 = lp_build_const_int_vec(gallivm, type, 2);
> + /* extract 2-bit index values */
> + bit_pos = LLVMBuildShl(builder, j, const2, "");
> + bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
> + bit_pos = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
> + /*
> + * NOTE: This innocent looking shift is very expensive with x86/ssex.
> + * Shifts with per-element shift count get roughly translated to
> + * extract (count), extract (value), shift, move (back to xmm), unpack
> + * per element!
> + * So about 20 instructions here for 4xi32.
> + * Newer llvm versions (3.7+) will not do extract/insert but use a
> + * couple constant count vector shifts plus shuffles. About same
> + * amount of instructions unfortunately...
> + * Would get much worse with 8xi16 even...
> + * We could actually do better here:
> + * - subtract bit_pos from 128+30, shl 23, convert float to int...
> + * - now do mul with codewords followed by shr 30...
> + * But requires 32bit->32bit mul, sse41 only (well that's emulatable
> + * with 2 32bit->64bit muls...) and not exactly cheap
> + * AVX2, of course, fixes this nonsense.
> + */
> + indices = LLVMBuildLShr(builder, codewords, bit_pos, "");
> +
> + /* finally select the colors */
> + sel_lo = LLVMBuildAnd(builder, indices, bld32.one, "");
> + sel_lo = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_lo, bld32.one);
> + color0 = lp_build_select(&bld32, sel_lo, color1, color0);
> + color2 = lp_build_select(&bld32, sel_lo, color3, color2);
> + sel_hi = LLVMBuildAnd(builder, indices, const2, "");
> + sel_hi = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_hi, const2);
> + rgba = lp_build_select(&bld32, sel_hi, color2, color0);
> +
> + /* fix up alpha */
> + if (format == PIPE_FORMAT_DXT1_RGB ||
> + format == PIPE_FORMAT_DXT1_SRGB) {
> + rgba = LLVMBuildOr(builder, rgba, a, "");
> + }
> + return LLVMBuildBitCast(builder, rgba, bld8.vec_type, "");
> +}
> +
> +
> +static LLVMValueRef
> +s3tc_dxt1_to_rgba_aos(struct gallivm_state *gallivm,
> + unsigned n,
> + enum pipe_format format,
> + LLVMValueRef colors,
> + LLVMValueRef codewords,
> + LLVMValueRef i,
> + LLVMValueRef j)
> +{
> + return s3tc_dxt1_full_to_rgba_aos(gallivm, n, format,
> + colors, codewords, i, j);
> +}
> +
> +
> +/**
> + * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
> + * @param colors is a <n x i32> vector with n x 2x16bit colors
> + * @param codewords is a <n x i32> vector containing the codewords
> + * @param alpha_low, alpha_hi are <n x i32> vectors containing the alpha values
> + * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
> + * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
> + */
> +static LLVMValueRef
> +s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
> + unsigned n,
> + enum pipe_format format,
> + LLVMValueRef colors,
> + LLVMValueRef codewords,
> + LLVMValueRef alpha_low,
> + LLVMValueRef alpha_hi,
> + LLVMValueRef i,
> + LLVMValueRef j)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef rgba, tmp, tmp2;
> + LLVMValueRef bit_pos, sel_mask;
> + struct lp_type type, type8;
> + struct lp_build_context bld;
> +
> + memset(&type, 0, sizeof type);
> + type.width = 32;
> + type.length = n;
> +
> + memset(&type8, 0, sizeof type8);
> + type8.width = 8;
> + type8.length = n*4;
> +
> + assert(lp_check_value(type, i));
> + assert(lp_check_value(type, j));
> +
> + lp_build_context_init(&bld, gallivm, type);
> +
> + rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
> + colors, codewords, i, j);
> +
> + rgba = LLVMBuildBitCast(builder, rgba, bld.vec_type, "");
> +
> + /*
> + * Extract alpha values. Since we now need to select from
> + * which 32bit vector values are fetched, construct selection
> + * mask from highest bit of bit_pos, and use select, then shift
> + * according to the bit_pos (without the highest bit).
> + * Note this is pointless for n == 1 case. Could just
> + * directly use 64bit arithmetic if we'd extract 64bit
> + * alpha value instead of 2x32...
> + */
> + /* pos = 4*(4j+i) */
> + bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
> + bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
> + bit_pos = LLVMBuildShl(builder, bit_pos,
> + lp_build_const_int_vec(gallivm, type, 2), "");
> + sel_mask = LLVMBuildLShr(builder, bit_pos,
> + lp_build_const_int_vec(gallivm, type, 5), "");
> + sel_mask = LLVMBuildSub(builder, sel_mask, bld.one, "");
> + tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
> + bit_pos = LLVMBuildAnd(builder, bit_pos,
> + lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
> + /* Warning: slow shift with per element count */
> + /*
> + * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
> + * to select the right byte with pshufb. Then for the remaining one bit
> + * just do shift/select.
> + */
> + tmp = LLVMBuildLShr(builder, tmp, bit_pos, "");
> +
> + /* combined expand from a4 to a8 and shift into position */
> + tmp = LLVMBuildShl(builder, tmp, lp_build_const_int_vec(gallivm, type, 28), "");
> + tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 4), "");
> + tmp = LLVMBuildOr(builder, tmp, tmp2, "");
> +
> + rgba = LLVMBuildOr(builder, tmp, rgba, "");
> +
> + return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
> +}
> +
> +static LLVMValueRef
> +lp_build_lerpdxta(struct gallivm_state *gallivm,
> + LLVMValueRef alpha0,
> + LLVMValueRef alpha1,
> + LLVMValueRef code,
> + LLVMValueRef sel_mask,
> + unsigned n)
> +{
> + /*
> + * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41
> + * (plus pmullw is actually faster...)
> + * we just pretend our 32bit values (which are really only 8bit) are 16bits.
> + * Note that this is obviously a disaster for the scalar case.
> + */
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef delta, ainterp;
> + LLVMValueRef weight5, weight7, weight;
> + struct lp_type type32, type16, type8;
> + struct lp_build_context bld16;
> +
> + memset(&type32, 0, sizeof type32);
> + type32.width = 32;
> + type32.length = n;
> + memset(&type16, 0, sizeof type16);
> + type16.width = 16;
> + type16.length = 2*n;
> + type16.sign = TRUE;
> + memset(&type8, 0, sizeof type8);
> + type8.width = 8;
> + type8.length = 4*n;
> +
> + lp_build_context_init(&bld16, gallivm, type16);
> + /* 255/7 is a bit off - increase accuracy at the expense of shift later */
> + sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
> + weight5 = lp_build_const_int_vec(gallivm, type16, 255*64/5);
> + weight7 = lp_build_const_int_vec(gallivm, type16, 255*64/7);
> + weight = lp_build_select(&bld16, sel_mask, weight7, weight5);
> +
> + alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
> + alpha1 = LLVMBuildBitCast(builder, alpha1, bld16.vec_type, "");
> + code = LLVMBuildBitCast(builder, code, bld16.vec_type, "");
> + /* we'll get garbage in the elements which had code 0 (or larger than 5 or 7)
> + but we don't care */
> + code = LLVMBuildSub(builder, code, bld16.one, "");
> +
> + weight = LLVMBuildMul(builder, weight, code, "");
> + weight = LLVMBuildLShr(builder, weight,
> + lp_build_const_int_vec(gallivm, type16, 6), "");
> +
> + delta = LLVMBuildSub(builder, alpha1, alpha0, "");
> +
> + ainterp = LLVMBuildMul(builder, delta, weight, "");
> + ainterp = LLVMBuildLShr(builder, ainterp,
> + lp_build_const_int_vec(gallivm, type16, 8), "");
> +
> + ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type8), "");
> + alpha0 = LLVMBuildBitCast(builder, alpha0, lp_build_vec_type(gallivm, type8), "");
> + ainterp = LLVMBuildAdd(builder, alpha0, ainterp, "");
> + ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type32), "");
> +
> + return ainterp;
> +}
> +
> +/**
> + * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
> + * @param colors is a <n x i32> vector with n x 2x16bit colors
> + * @param codewords is a <n x i32> vector containing the codewords
> + * @param alpha_lo, alpha_hi are <n x i32> vectors containing the alpha values
> + * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
> + * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
> + */
> +static LLVMValueRef
> +s3tc_dxt5_full_to_rgba_aos(struct gallivm_state *gallivm,
> + unsigned n,
> + enum pipe_format format,
> + LLVMValueRef colors,
> + LLVMValueRef codewords,
> + LLVMValueRef alpha_lo,
> + LLVMValueRef alpha_hi,
> + LLVMValueRef i,
> + LLVMValueRef j)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef rgba, tmp, alpha0, alpha1, alphac, alphac0, bit_pos, shift;
> + LLVMValueRef sel_mask, tmp_mask, alpha, alpha64, code_s;
> + LLVMValueRef mask6, mask7, ainterp;
> + LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
> + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
> + struct lp_type type, type8;
> + struct lp_build_context bld32;
> +
> + memset(&type, 0, sizeof type);
> + type.width = 32;
> + type.length = n;
> +
> + memset(&type8, 0, sizeof type8);
> + type8.width = 8;
> + type8.length = n*4;
> +
> + assert(lp_check_value(type, i));
> + assert(lp_check_value(type, j));
> +
> + lp_build_context_init(&bld32, gallivm, type);
> +
> + assert(lp_check_value(type, i));
> + assert(lp_check_value(type, j));
> +
> + rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
> + colors, codewords, i, j);
> +
> + rgba = LLVMBuildBitCast(builder, rgba, bld32.vec_type, "");
> +
> + /* this looks pretty complex for vectorization:
> + * extract a0/a1 values
> + * extract code
> + * select weights for interpolation depending on a0 > a1
> + * mul weights by code - 1
> + * lerp a0/a1/weights
> + * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
> + */
> +
> + alpha0 = LLVMBuildAnd(builder, alpha_lo,
> + lp_build_const_int_vec(gallivm, type, 0xff), "");
> + alpha1 = LLVMBuildLShr(builder, alpha_lo,
> + lp_build_const_int_vec(gallivm, type, 8), "");
> + alpha1 = LLVMBuildAnd(builder, alpha1,
> + lp_build_const_int_vec(gallivm, type, 0xff), "");
> +
> + /* pos = 3*(4j+i) */
> + bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
> + bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
> + tmp = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
> + bit_pos = LLVMBuildAdd(builder, bit_pos, tmp, "");
> + /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
> + bit_pos = LLVMBuildAdd(builder, bit_pos,
> + lp_build_const_int_vec(gallivm, type, 16), "");
> +
> + if (n == 1) {
> + struct lp_type type64;
> + memset(&type64, 0, sizeof type64);
> + type64.width = 64;
> + type64.length = 1;
> + /* This is pretty pointless could avoid by just directly extracting
> + 64bit in the first place but makes it more complicated elsewhere */
> + alpha_lo = LLVMBuildZExt(builder, alpha_lo, i64t, "");
> + alpha_hi = LLVMBuildZExt(builder, alpha_hi, i64t, "");
> + alphac0 = LLVMBuildShl(builder, alpha_hi,
> + lp_build_const_int_vec(gallivm, type64, 32), "");
> + alphac0 = LLVMBuildOr(builder, alpha_lo, alphac0, "");
> +
> + shift = LLVMBuildZExt(builder, bit_pos, i64t, "");
> + alphac0 = LLVMBuildLShr(builder, alphac0, shift, "");
> + alphac0 = LLVMBuildTrunc(builder, alphac0, i32t, "");
> + alphac = LLVMBuildAnd(builder, alphac0,
> + lp_build_const_int_vec(gallivm, type, 0x7), "");
> + }
> + else {
> + /*
> + * Using non-native vector length here (actually, with avx2 and
> + * n == 4 llvm will indeed expand to ymm regs...)
> + * At least newer llvm versions handle that ok.
> + * llvm 3.7+ will even handle the emulated 64bit shift with variable
> + * shift count without extraction (and it's actually easier to
> + * emulate than the 32bit one).
> + */
> + alpha64 = LLVMBuildShuffleVector(builder, alpha_lo, alpha_hi,
> + lp_build_const_unpackx2_shuffle(gallivm, n), "");
> +
> + alpha64 = LLVMBuildBitCast(builder, alpha64, LLVMVectorType(i64t, n), "");
> + shift = LLVMBuildZExt(builder, bit_pos, LLVMVectorType(i64t, n), "");
> + alphac = LLVMBuildLShr(builder, alpha64, shift, "");
> + alphac = LLVMBuildTrunc(builder, alphac, bld32.vec_type, "");
> +
> + alphac = LLVMBuildAnd(builder, alphac,
> + lp_build_const_int_vec(gallivm, type, 0x7), "");
> + }
> +
> + /* signed compare is faster saves some xors */
> + type.sign = TRUE;
> + /* alpha0 > alpha1 selection */
> + sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
> + alpha0, alpha1);
> + ainterp = lp_build_lerpdxta(gallivm, alpha0, alpha1, alphac, sel_mask, n);
> +
> + /*
> + * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
> + * else we select a0 for case 0, a1 for case 1,
> + * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
> + * a = (c == 0) ? a0 : a1
> + * a = (c > 1) ? ainterp : a
> + * Finally handle case 6/7 for !(a0 > a1)
> + * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
> + * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
> + */
> + tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
> + alphac, bld32.zero);
> + alpha = lp_build_select(&bld32, tmp_mask, alpha0, alpha1);
> + tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
> + alphac, bld32.one);
> + alpha = lp_build_select(&bld32, tmp_mask, ainterp, alpha);
> +
> + code_s = LLVMBuildAnd(builder, alphac,
> + LLVMBuildNot(builder, sel_mask, ""), "");
> + mask6 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
> + code_s, lp_build_const_int_vec(gallivm, type, 6));
> + mask7 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
> + code_s, lp_build_const_int_vec(gallivm, type, 7));
> + alpha = LLVMBuildAnd(builder, alpha, LLVMBuildNot(builder, mask6, ""), "");
> + alpha = LLVMBuildOr(builder, alpha, mask7, "");
> +
> + alpha = LLVMBuildShl(builder, alpha, lp_build_const_int_vec(gallivm, type, 24), "");
> + rgba = LLVMBuildOr(builder, alpha, rgba, "");
> +
> + return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
> +}
> +
> +
> +static void
> +lp_build_gather_s3tc_simple_scalar(struct gallivm_state *gallivm,
> + const struct util_format_description *format_desc,
> + LLVMValueRef *dxt_block,
> + LLVMValueRef ptr)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + unsigned block_bits = format_desc->block.bits;
> + LLVMValueRef elem, shuf;
> + LLVMTypeRef type32 = LLVMIntTypeInContext(gallivm->context, 32);
> + LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, block_bits);
> + LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
> + LLVMTypeRef type32_4 = LLVMVectorType(type32, 4);
> +
> + assert(block_bits == 64 || block_bits == 128);
> +
> + ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, "");
> + elem = LLVMBuildLoad(builder, ptr, "");
> +
> + if (block_bits == 128) {
> + /* just return block as is */
> + *dxt_block = LLVMBuildBitCast(builder, elem, type32_4, "");
> + }
> + else {
> + LLVMTypeRef type32_2 = LLVMVectorType(type32, 2);
> + shuf = lp_build_const_extend_shuffle(gallivm, 2, 4);
> + elem = LLVMBuildBitCast(builder, elem, type32_2, "");
> + *dxt_block = LLVMBuildShuffleVector(builder, elem,
> + LLVMGetUndef(type32_2), shuf, "");
> + }
> +}
> +
> +
> +static void
> +s3tc_store_cached_block(struct gallivm_state *gallivm,
> + LLVMValueRef *col,
> + LLVMValueRef tag_value,
> + LLVMValueRef hash_index,
> + LLVMValueRef cache)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef ptr, indices[3];
> + LLVMTypeRef type_ptr4x32;
> + unsigned count;
> +
> + type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
> + indices[0] = lp_build_const_int32(gallivm, 0);
> + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
> + indices[2] = hash_index;
> + ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
> + LLVMBuildStore(builder, tag_value, ptr);
> +
> + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
> + hash_index = LLVMBuildMul(builder, hash_index,
> + lp_build_const_int32(gallivm, 16), "");
> + for (count = 0; count < 4; count++) {
> + indices[2] = hash_index;
> + ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
> + ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
> + LLVMBuildStore(builder, col[count], ptr);
> + hash_index = LLVMBuildAdd(builder, hash_index,
> + lp_build_const_int32(gallivm, 4), "");
> + }
> +}
> +
> +static LLVMValueRef
> +s3tc_lookup_cached_pixel(struct gallivm_state *gallivm,
> + LLVMValueRef ptr,
> + LLVMValueRef index)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef member_ptr, indices[3];
> +
> + indices[0] = lp_build_const_int32(gallivm, 0);
> + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
> + indices[2] = index;
> + member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
> + return LLVMBuildLoad(builder, member_ptr, "cache_data");
> +}
> +
> +static LLVMValueRef
> +s3tc_lookup_tag_data(struct gallivm_state *gallivm,
> + LLVMValueRef ptr,
> + LLVMValueRef index)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef member_ptr, indices[3];
> +
> + indices[0] = lp_build_const_int32(gallivm, 0);
> + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
> + indices[2] = index;
> + member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
> + return LLVMBuildLoad(builder, member_ptr, "tag_data");
> +}
> +
> +#if LP_BUILD_FORMAT_CACHE_DEBUG
> +static void
> +s3tc_update_cache_access(struct gallivm_state *gallivm,
> + LLVMValueRef ptr,
> + unsigned count,
> + unsigned index)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef member_ptr, cache_access;
> +
> + assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
> + index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
> +
> + member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
> + cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
> + cache_access = LLVMBuildAdd(builder, cache_access,
> + LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
> + count, 0), "");
> + LLVMBuildStore(builder, cache_access, member_ptr);
> +}
> +#endif
> +
> +/**
> + * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0.
> + * The lerp is performed between the first 2 32bit colors
> + * in the source vector, both results are returned packed in result vector.
> + */
> +static LLVMValueRef
> +lp_build_lerp23_single(struct lp_build_context *bld,
> + LLVMValueRef v01)
> +{
> + struct gallivm_state *gallivm = bld->gallivm;
> + LLVMValueRef x, mul, delta, res, v0, v1, elems[8];
> + const struct lp_type type = bld->type;
> + LLVMBuilderRef builder = bld->gallivm->builder;
> + struct lp_type i16_type = lp_wider_type(type);
> + struct lp_type i32_type = lp_wider_type(i16_type);
> + struct lp_build_context bld2;
> +
> + assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
> +
> + lp_build_context_init(&bld2, gallivm, i16_type);
> + bld2.type.sign = TRUE;
> +
> + /* weights 256/3, 256*2/3, with correct rounding */
> + elems[0] = elems[1] = elems[2] = elems[3] =
> + lp_build_const_elem(gallivm, i16_type, 255*1/3);
> + elems[4] = elems[5] = elems[6] = elems[7] =
> + lp_build_const_elem(gallivm, i16_type, 171);
> + x = LLVMConstVector(elems, 8);
> +
> + /*
> + * v01 has col0 in 32bit elem 0, col1 in elem 1.
> + * Interleave/unpack will give us separate v0/v1 vectors.
> + */
> + v01 = lp_build_interleave2(gallivm, i32_type, v01, v01, 0);
> + v01 = LLVMBuildBitCast(builder, v01, bld->vec_type, "");
> +
> + lp_build_unpack2(gallivm, type, i16_type, v01, &v0, &v1);
> + delta = lp_build_sub(&bld2, v1, v0);
> +
> + mul = LLVMBuildMul(builder, x, delta, "");
> +
> + mul = LLVMBuildLShr(builder, mul, lp_build_const_int_vec(gallivm, i16_type, 8), "");
> + /* lerp optimization: pack now, do add afterwards */
> + res = lp_build_pack2(gallivm, i16_type, type, mul, bld2.undef);
> + /* only lower 2 elems are valid - for these v0 is really v0 */
> + return lp_build_add(bld, res, v01);
> +}
> +
> +/*
> + * decode one dxt1 block.
> + */
> +static void
> +s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
> + enum pipe_format format,
> + LLVMValueRef dxt_block,
> + LLVMValueRef *col)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef color01, color23, color01_16, color0123;
> + LLVMValueRef rgba, tmp, a, sel_mask, indices, code, const2;
> + struct lp_type type8, type32, type16, type64;
> + struct lp_build_context bld8, bld32, bld16, bld64;
> + unsigned i;
> + boolean is_dxt1_variant = format_dxt1_variant(format);
> +
> + memset(&type32, 0, sizeof type32);
> + type32.width = 32;
> + type32.length = 4;
> + type32.sign = TRUE;
> +
> + memset(&type8, 0, sizeof type8);
> + type8.width = 8;
> + type8.length = 16;
> +
> + memset(&type16, 0, sizeof type16);
> + type16.width = 16;
> + type16.length = 8;
> +
> + memset(&type64, 0, sizeof type64);
> + type64.width = 64;
> + type64.length = 2;
> +
> + a = lp_build_const_int_vec(gallivm, type32, 0xff000000);
> + const2 = lp_build_const_int_vec(gallivm, type32, 2);
> +
> + lp_build_context_init(&bld32, gallivm, type32);
> + lp_build_context_init(&bld16, gallivm, type16);
> + lp_build_context_init(&bld8, gallivm, type8);
> + lp_build_context_init(&bld64, gallivm, type64);
> +
> + if (is_dxt1_variant) {
> + color01 = lp_build_shuffle1undef(gallivm, dxt_block, 0, 4);
> + code = lp_build_shuffle1undef(gallivm, dxt_block, 1, 4);
> + } else {
> + color01 = lp_build_shuffle1undef(gallivm, dxt_block, 2, 4);
> + code = lp_build_shuffle1undef(gallivm, dxt_block, 3, 4);
> + }
> + code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
> + /* expand bytes to dwords */
> + code = lp_build_interleave2(gallivm, type8, code, code, 0);
> + code = lp_build_interleave2(gallivm, type8, code, code, 0);
> +
> +
> + /*
> + * works as follows:
> + * - expand color0/color1 to rgba8888
> + * - calculate color2/3 (interpolation) according to color0 < color1 rules
> + * - calculate color2/3 according to color0 >= color1 rules
> + * - do selection of color2/3 according to comparison of color0/1
> + * - extract indices.
> + * - use compare/select to select the correct color. Since we have 2bit
> + * indices (and 4 colors), needs at least three compare/selects.
> + */
> +
> + /*
> + * expand the two colors
> + */
> + color01 = LLVMBuildBitCast(builder, color01, bld16.vec_type, "");
> + color01 = lp_build_interleave2(gallivm, type16, color01,
> + bld16.zero, 0);
> + color01_16 = LLVMBuildBitCast(builder, color01, bld32.vec_type, "");
> + color01 = color_expand_565_to_8888(gallivm, 4, color01_16);
> +
> + /*
> + * interpolate colors
> + * color2_1 is 2/3 color0 + 1/3 color1
> + * color3_1 is 1/3 color0 + 2/3 color1
> + * color2_2 is 1/2 color0 + 1/2 color1
> + * color3_2 is 0
> + */
> +
> + /* TODO: since this is now always scalar, should
> + * probably just use control flow here instead of calculating
> + * both cases and then selection
> + */
> + if (format == PIPE_FORMAT_DXT1_RGBA ||
> + format == PIPE_FORMAT_DXT1_SRGBA) {
> + color01 = LLVMBuildOr(builder, color01, a, "");
> + }
> + /* can combine 2 lerps into one mostly */
> + color23 = lp_build_lerp23_single(&bld8, color01);
> + color23 = LLVMBuildBitCast(builder, color23, bld32.vec_type, "");
> +
> + /* dxt3/5 always use 4-color encoding */
> + if (is_dxt1_variant) {
> + LLVMValueRef color23_2, color2_2;
> +
> + if (util_cpu_caps.has_sse2) {
> + LLVMValueRef intrargs[2];
> + intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
> + /* same interleave as for lerp23 - correct result in 2nd element */
> + intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0);
> + intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, "");
> + color2_2 = lp_build_intrinsic(builder, "llvm.x86.sse2.pavg.b",
> + bld8.vec_type, intrargs, 2, 0);
> + }
> + else {
> + LLVMValueRef v01, v0, v1, vhalf;
> + /*
> + * This isn't as expensive as it looks (the unpack is the same as
> + * for lerp23, which is the reason why we do the pointless
> + * interleave2 too), with correct rounding (the two lower elements
> + * will be the same).
> + */
> + v01 = lp_build_interleave2(gallivm, type32, color01, color01, 0);
> + v01 = LLVMBuildBitCast(builder, v01, bld8.vec_type, "");
> + lp_build_unpack2(gallivm, type8, type16, v01, &v0, &v1);
> + vhalf = lp_build_add(&bld16, v0, v1);
> + vhalf = LLVMBuildLShr(builder, vhalf, bld16.one, "");
> + color2_2 = lp_build_pack2(gallivm, type16, type8, vhalf, bld16.undef);
> + }
> + /* shuffle in color 3 as elem 2 zero, color 2 elem 1 */
> + color23_2 = LLVMBuildBitCast(builder, color2_2, bld64.vec_type, "");
> + color23_2 = LLVMBuildLShr(builder, color23_2,
> + lp_build_const_int_vec(gallivm, type64, 32), "");
> + color23_2 = LLVMBuildBitCast(builder, color23_2, bld32.vec_type, "");
> +
> + tmp = LLVMBuildBitCast(builder, color01_16, bld64.vec_type, "");
> + tmp = LLVMBuildLShr(builder, tmp,
> + lp_build_const_int_vec(gallivm, type64, 32), "");
> + tmp = LLVMBuildBitCast(builder, tmp, bld32.vec_type, "");
> + sel_mask = lp_build_compare(gallivm, type32, PIPE_FUNC_GREATER,
> + color01_16, tmp);
> + sel_mask = lp_build_interleave2(gallivm, type32, sel_mask, sel_mask, 0);
> + color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
> + }
> +
> + if (util_cpu_caps.has_ssse3) {
> + /*
> + * Use pshufb as mini-lut. (Only doable with intrinsics as the
> + * final shuffles are non-constant. pshufb is awesome!)
> + */
> + LLVMValueRef shuf[16], low2mask;
> + LLVMValueRef intrargs[2], lut_ind, lut_adj;
> +
> + color01 = LLVMBuildBitCast(builder, color01, bld64.vec_type, "");
> + color23 = LLVMBuildBitCast(builder, color23, bld64.vec_type, "");
> + color0123 = lp_build_interleave2(gallivm, type64, color01, color23, 0);
> + color0123 = LLVMBuildBitCast(builder, color0123, bld32.vec_type, "");
> +
> + if (format == PIPE_FORMAT_DXT1_RGB ||
> + format == PIPE_FORMAT_DXT1_SRGB) {
> + color0123 = LLVMBuildOr(builder, color0123, a, "");
> + }
> +
> + /* shuffle as r0r1r2r3g0g1... */
> + for (i = 0; i < 4; i++) {
> + shuf[4*i] = lp_build_const_int32(gallivm, 0 + i);
> + shuf[4*i+1] = lp_build_const_int32(gallivm, 4 + i);
> + shuf[4*i+2] = lp_build_const_int32(gallivm, 8 + i);
> + shuf[4*i+3] = lp_build_const_int32(gallivm, 12 + i);
> + }
> + color0123 = LLVMBuildBitCast(builder, color0123, bld8.vec_type, "");
> + color0123 = LLVMBuildShuffleVector(builder, color0123, bld8.undef,
> + LLVMConstVector(shuf, 16), "");
> +
> + /* lowest 2 bits of each 8 bit value contain index into "LUT" */
> + low2mask = lp_build_const_int_vec(gallivm, type8, 3);
> + /* add 0/4/8/12 for r/g/b/a */
> + lut_adj = lp_build_const_int_vec(gallivm, type32, 0x0c080400);
> + lut_adj = LLVMBuildBitCast(builder, lut_adj, bld8.vec_type, "");
> + intrargs[0] = color0123;
> + for (i = 0; i < 4; i++) {
> + lut_ind = LLVMBuildAnd(builder, code, low2mask, "");
> + lut_ind = LLVMBuildOr(builder, lut_ind, lut_adj, "");
> + intrargs[1] = lut_ind;
> + col[i] = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
> + bld8.vec_type, intrargs, 2, 0);
> + col[i] = LLVMBuildBitCast(builder, col[i], bld32.vec_type, "");
> + code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
> + code = LLVMBuildLShr(builder, code, const2, "");
> + code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
> + }
> + }
> + else {
> + /* Thanks to vectorization can do 4 texels in parallel */
> + LLVMValueRef color0, color1, color2, color3;
> + if (format == PIPE_FORMAT_DXT1_RGB ||
> + format == PIPE_FORMAT_DXT1_SRGB) {
> + color01 = LLVMBuildOr(builder, color01, a, "");
> + color23 = LLVMBuildOr(builder, color23, a, "");
> + }
> + color0 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
> + lp_build_const_shuffle1(gallivm, 0, 4), "");
> + color1 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
> + lp_build_const_shuffle1(gallivm, 1, 4), "");
> + color2 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
> + lp_build_const_shuffle1(gallivm, 0, 4), "");
> + color3 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
> + lp_build_const_shuffle1(gallivm, 1, 4), "");
> + code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
> +
> + for (i = 0; i < 4; i++) {
> + /* select the colors */
> + LLVMValueRef selmasklo, rgba01, rgba23, bitlo;
> + bitlo = bld32.one;
> + indices = LLVMBuildAnd(builder, code, bitlo, "");
> + selmasklo = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
> + indices, bitlo);
> + rgba01 = lp_build_select(&bld32, selmasklo, color1, color0);
> +
> + LLVMValueRef selmaskhi;
> + indices = LLVMBuildAnd(builder, code, const2, "");
> + selmaskhi = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
> + indices, const2);
> + rgba23 = lp_build_select(&bld32, selmasklo, color3, color2);
> + rgba = lp_build_select(&bld32, selmaskhi, rgba23, rgba01);
> +
> + /*
> + * Note that this will give "wrong" order.
> + * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
> + * This would be easily fixable by using different shuffle, bitlo/hi
> + * vectors above (and different shift), but seems slightly easier to
> + * deal with for dxt3/dxt5 alpha too. So instead change lookup.
> + */
> + col[i] = rgba;
> + code = LLVMBuildLShr(builder, code, const2, "");
> + }
> + }
> +}
> +
> +/*
> + * decode one dxt3 block.
> + */
> +static void
> +s3tc_decode_block_dxt3(struct gallivm_state *gallivm,
> + enum pipe_format format,
> + LLVMValueRef dxt_block,
> + LLVMValueRef *col)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef alpha, alphas0, alphas1, shift4_16, a[4], mask8hi;
> + struct lp_type type32, type8, type16;
> + unsigned i;
> +
> + memset(&type32, 0, sizeof type32);
> + type32.width = 32;
> + type32.length = 4;
> +
> + memset(&type8, 0, sizeof type8);
> + type8.width = 8;
> + type8.length = 16;
> +
> + memset(&type16, 0, sizeof type16);
> + type16.width = 16;
> + type16.length = 8;
> +
> + s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
> +
> + shift4_16 = lp_build_const_int_vec(gallivm, type16, 4);
> + mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
> +
> + alpha = LLVMBuildBitCast(builder, dxt_block,
> + lp_build_vec_type(gallivm, type8), "");
> + alpha = lp_build_interleave2(gallivm, type8, alpha, alpha, 0);
> + alpha = LLVMBuildBitCast(builder, alpha,
> + lp_build_vec_type(gallivm, type16), "");
> + alpha = LLVMBuildAnd(builder, alpha,
> + lp_build_const_int_vec(gallivm, type16, 0xf00f), "");
> + alphas0 = LLVMBuildLShr(builder, alpha, shift4_16, "");
> + alphas1 = LLVMBuildShl(builder, alpha, shift4_16, "");
> + alpha = LLVMBuildOr(builder, alphas0, alpha, "");
> + alpha = LLVMBuildOr(builder, alphas1, alpha, "");
> + alpha = LLVMBuildBitCast(builder, alpha,
> + lp_build_vec_type(gallivm, type32), "");
> + /*
> + * alpha now contains elems 0,1,2,3,... (ubytes)
> + * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which
> + * is just as easy as "natural" order - 3 shift/and instead of 6 unpack).
> + */
> + a[0] = LLVMBuildShl(builder, alpha,
> + lp_build_const_int_vec(gallivm, type32, 24), "");
> + a[1] = LLVMBuildShl(builder, alpha,
> + lp_build_const_int_vec(gallivm, type32, 16), "");
> + a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
> + a[2] = LLVMBuildShl(builder, alpha,
> + lp_build_const_int_vec(gallivm, type32, 8), "");
> + a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
> + a[3] = LLVMBuildAnd(builder, alpha, mask8hi, "");
> +
> + for (i = 0; i < 4; i++) {
> + col[i] = LLVMBuildOr(builder, col[i], a[i], "");
> + }
> +}
> +
> +
> +static LLVMValueRef
> +lp_build_lerpdxta_block(struct gallivm_state *gallivm,
> + LLVMValueRef alpha0,
> + LLVMValueRef alpha1,
> + LLVMValueRef code,
> + LLVMValueRef sel_mask)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef delta, ainterp;
> + LLVMValueRef weight5, weight7, weight;
> + struct lp_type type16;
> + struct lp_build_context bld;
> +
> + memset(&type16, 0, sizeof type16);
> + type16.width = 16;
> + type16.length = 8;
> + type16.sign = TRUE;
> +
> + lp_build_context_init(&bld, gallivm, type16);
> + /*
> + * 256/7 is only 36.57 so we'd lose quite some precision. Since it would
> + * actually be desirable to do this here with even higher accuracy than
> + * even 8 bit (more or less required for rgtc, albeit that's not handled
> + * here right now), shift the weights after multiplication by code.
> + */
> + weight5 = lp_build_const_int_vec(gallivm, type16, 256*64/5);
> + weight7 = lp_build_const_int_vec(gallivm, type16, 256*64/7);
> + weight = lp_build_select(&bld, sel_mask, weight7, weight5);
> +
> + /*
> + * we'll get garbage in the elements which had code 0 (or larger than
> + * 5 or 7) but we don't care (or rather, need to fix up anyway).
> + */
> + code = LLVMBuildSub(builder, code, bld.one, "");
> +
> + weight = LLVMBuildMul(builder, weight, code, "");
> + weight = LLVMBuildLShr(builder, weight,
> + lp_build_const_int_vec(gallivm, type16, 6), "");
> +
> + delta = LLVMBuildSub(builder, alpha1, alpha0, "");
> +
> + ainterp = LLVMBuildMul(builder, delta, weight, "");
> + ainterp = LLVMBuildLShr(builder, ainterp,
> + lp_build_const_int_vec(gallivm, type16, 8), "");
> +
> + /* lerp is done later (with packed values) */
> +
> + return ainterp;
> +}
> +
> +
> +/*
> + * decode one dxt5 block.
> + */
> +static void
> +s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
> + enum pipe_format format,
> + LLVMValueRef dxt_block,
> + LLVMValueRef *col)
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMValueRef alpha, alpha0, alpha1, ares;
> + LLVMValueRef ainterp, ainterp0, ainterp1, shuffle1, sel_mask, sel_mask2;
> + LLVMValueRef a[4], acode, tmp0, tmp1;
> + LLVMTypeRef i64t, i32t;
> + struct lp_type type32, type64, type8, type16;
> + struct lp_build_context bld16, bld8;
> + unsigned i;
> +
> + memset(&type32, 0, sizeof type32);
> + type32.width = 32;
> + type32.length = 4;
> +
> + memset(&type64, 0, sizeof type64);
> + type64.width = 64;
> + type64.length = 2;
> +
> + memset(&type8, 0, sizeof type8);
> + type8.width = 8;
> + type8.length = 16;
> +
> + memset(&type16, 0, sizeof type16);
> + type16.width = 16;
> + type16.length = 8;
> +
> + lp_build_context_init(&bld16, gallivm, type16);
> + lp_build_context_init(&bld8, gallivm, type8);
> +
> + i64t = lp_build_vec_type(gallivm, type64);
> + i32t = lp_build_vec_type(gallivm, type32);
> +
> + s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
> +
> + /*
> + * three possible strategies for vectorizing alpha:
> + * 1) compute all 8 values then use scalar extraction
> + * (i.e. have all 8 alpha values packed in one 64bit scalar
> + * and do something like ax = vals >> (codex * 8) followed
> + * by inserting these values back into color)
> + * 2) same as 1) but just use pshufb as a mini-LUT for selection.
> + * (without pshufb would need boatloads of cmp/selects trying to
> + * keep things vectorized for essentially scalar selection).
> + * 3) do something similar to the uncached case
> + * needs more calculations (need to calc 16 values instead of 8 though
> + * that's only an issue for the lerp which we need to do twice otherwise
> + * everything still fits into 128bit) but keeps things vectorized mostly.
> + * Trying 3) here though not sure it's really faster...
> + * With pshufb, we try 2) (cheaper and more accurate)
> + */
> +
> + /*
> + * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
> + * help since code crosses 8bit boundaries). But variable shifts are
> + * AVX2 only, and even then only dword/quadword (intel _really_ hates
> + * shifts!). Instead, emulate by 16bit muls.
> + * Also, the required byte shuffles are essentially non-emulatable, so
> + * require ssse3 (albeit other archs might do them fine).
> + * This is not directly tied to ssse3 - just need sane byte shuffles.
> + * But ordering is going to be different below so use same condition.
> + */
> +
> +
> + /* vectorize alpha */
> + alpha = LLVMBuildBitCast(builder, dxt_block, i64t, "");
> + alpha0 = LLVMBuildAnd(builder, alpha,
> + lp_build_const_int_vec(gallivm, type64, 0xff), "");
> + alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
> + alpha = LLVMBuildBitCast(builder, alpha, bld16.vec_type, "");
> + alpha1 = LLVMBuildLShr(builder, alpha,
> + lp_build_const_int_vec(gallivm, type16, 8), "");
> + alpha = LLVMBuildBitCast(builder, alpha, i64t, "");
> + shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
> + /* XXX this shuffle broken with LLVM 2.8 */
> + alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
> + alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
> +
> + type16.sign = TRUE;
> + sel_mask = lp_build_compare(gallivm, type16, PIPE_FUNC_GREATER,
> + alpha0, alpha1);
> + type16.sign = FALSE;
> + sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
> +
> + if (!util_cpu_caps.has_ssse3) {
> + LLVMValueRef acodeg, mask1, acode0, acode1;
> +
> + /* extraction of the 3 bit values into something more useful is HARD */
> + /* first steps are actually scalar */
> + acode = LLVMBuildLShr(builder, alpha,
> + lp_build_const_int_vec(gallivm, type64, 16), "");
> + tmp0 = LLVMBuildAnd(builder, acode,
> + lp_build_const_int_vec(gallivm, type64, 0xffffff), "");
> + tmp1 = LLVMBuildLShr(builder, acode,
> + lp_build_const_int_vec(gallivm, type64, 24), "");
> + tmp0 = LLVMBuildBitCast(builder, tmp0, i32t, "");
> + tmp1 = LLVMBuildBitCast(builder, tmp1, i32t, "");
> + acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
> + /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
> + tmp0 = LLVMBuildAnd(builder, acode,
> + lp_build_const_int_vec(gallivm, type32, 0xfff), "");
> + tmp1 = LLVMBuildLShr(builder, acode,
> + lp_build_const_int_vec(gallivm, type32, 12), "");
> + acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
> + /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
> + tmp0 = LLVMBuildAnd(builder, acode,
> + lp_build_const_int_vec(gallivm, type32, 0x3f), "");
> + tmp1 = LLVMBuildLShr(builder, acode,
> + lp_build_const_int_vec(gallivm, type32, 6), "");
> + /* use signed pack - sign doesn't matter here, and unsigned pack would need sse41 */
> + type32.sign = type16.sign = TRUE;
> + acode = lp_build_pack2(gallivm, type32, type16, tmp0, tmp1);
> + type32.sign = type16.sign = FALSE;
> + /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
> + acode0 = LLVMBuildAnd(builder, acode,
> + lp_build_const_int_vec(gallivm, type16, 0x7), "");
> + acode1 = LLVMBuildLShr(builder, acode,
> + lp_build_const_int_vec(gallivm, type16, 3), "");
> + acode = lp_build_pack2(gallivm, type16, type8, acode0, acode1);
> + /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
> +
> + acodeg = LLVMBuildAnd(builder, acode,
> + LLVMBuildNot(builder, sel_mask, ""), "");
> + mask1 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
> + acode, bld8.one);
> +
> + sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
> + ainterp0 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode0, sel_mask);
> + ainterp1 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode1, sel_mask);
> + sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
> + ainterp = lp_build_pack2(gallivm, type16, type8, ainterp0, ainterp1);
> + alpha0 = lp_build_pack2(gallivm, type16, type8, alpha0, alpha0);
> + alpha1 = lp_build_pack2(gallivm, type16, type8, alpha1, alpha1);
> + ainterp = LLVMBuildAdd(builder, ainterp, alpha0, "");
> + /* Fix up val01 */
> + sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
> + acode, bld8.zero);
> + ainterp = lp_build_select(&bld8, sel_mask2, alpha0, ainterp);
> + ainterp = lp_build_select(&bld8, mask1, alpha1, ainterp);
> +
> + /* fix up val67 if a0 <= a1 */
> + sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
> + acodeg, lp_build_const_int_vec(gallivm, type8, 6));
> + ares = LLVMBuildAnd(builder, ainterp, LLVMBuildNot(builder, sel_mask2, ""), "");
> + sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
> + acodeg, lp_build_const_int_vec(gallivm, type8, 7));
> + ares = LLVMBuildOr(builder, ares, sel_mask2, "");
> +
> + /* unpack in right order (0,4,8,12,1,5,..) */
> + /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
> + tmp0 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 0);
> + tmp1 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 1);
> + tmp0 = LLVMBuildBitCast(builder, tmp0, bld16.vec_type, "");
> + tmp1 = LLVMBuildBitCast(builder, tmp1, bld16.vec_type, "");
> +
> + a[0] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 0);
> + a[1] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 0);
> + a[2] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 1);
> + a[3] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 1);
> + }
> + else {
> + LLVMValueRef elems[16], intrargs[2], shufa, mulclo, mulchi, mask8hi;
> + LLVMTypeRef type16s = LLVMInt16TypeInContext(gallivm->context);
> + LLVMTypeRef type8s = LLVMInt8TypeInContext(gallivm->context);
> + unsigned i, j;
> + /*
> + * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
> + * help since code crosses 8bit boundaries). But variable shifts are
> + * AVX2 only, and even then only dword/quadword (intel _really_ hates
> + * shifts!). Instead, emulate by 16bit muls.
> + * Also, the required byte shuffles are essentially non-emulatable, so
> + * require ssse3 (albeit other archs might do them fine, but the
> + * complete path is ssse3 only for now).
> + */
> + for (i = 0, j = 0; i < 16; i += 8, j += 3) {
> + elems[i+0] = elems[i+1] = elems[i+2] = lp_build_const_int32(gallivm, j+2);
> + elems[i+3] = elems[i+4] = lp_build_const_int32(gallivm, j+3);
> + elems[i+5] = elems[i+6] = elems[i+7] = lp_build_const_int32(gallivm, j+4);
> + }
> + shufa = LLVMConstVector(elems, 16);
> + alpha = LLVMBuildBitCast(builder, alpha, bld8.vec_type, "");
> + acode = LLVMBuildShuffleVector(builder, alpha, bld8.undef, shufa, "");
> + acode = LLVMBuildBitCast(builder, acode, bld16.vec_type, "");
> + /*
> + * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
> + * Do the same for 1/3/5/7 (albeit still need mask there - ideally
> + * we'd place them into bits 4-7 so could save shift but impossible.)
> + */
> + for (i = 0; i < 8; i += 4) {
> + elems[i+0] = LLVMConstInt(type16s, 1 << (13-0), 0);
> + elems[i+1] = LLVMConstInt(type16s, 1 << (13-6), 0);
> + elems[i+2] = LLVMConstInt(type16s, 1 << (13-4), 0);
> + elems[i+3] = LLVMConstInt(type16s, 1 << (13-2), 0);
> + }
> + mulclo = LLVMConstVector(elems, 8);
> + for (i = 0; i < 8; i += 4) {
> + elems[i+0] = LLVMConstInt(type16s, 1 << (13-3), 0);
> + elems[i+1] = LLVMConstInt(type16s, 1 << (13-9), 0);
> + elems[i+2] = LLVMConstInt(type16s, 1 << (13-7), 0);
> + elems[i+3] = LLVMConstInt(type16s, 1 << (13-5), 0);
> + }
> + mulchi = LLVMConstVector(elems, 8);
> +
> + tmp0 = LLVMBuildMul(builder, acode, mulclo, "");
> + tmp1 = LLVMBuildMul(builder, acode, mulchi, "");
> + tmp0 = LLVMBuildLShr(builder, tmp0,
> + lp_build_const_int_vec(gallivm, type16, 13), "");
> + tmp1 = LLVMBuildLShr(builder, tmp1,
> + lp_build_const_int_vec(gallivm, type16, 5), "");
> + tmp1 = LLVMBuildAnd(builder, tmp1,
> + lp_build_const_int_vec(gallivm, type16, 0x700), "");
> + acode = LLVMBuildOr(builder, tmp0, tmp1, "");
> + acode = LLVMBuildBitCast(builder, acode, bld8.vec_type, "");
> +
> + /*
> + * Note that ordering is different here to non-ssse3 path:
> + * 0/1/2/3/4/5...
> + */
> +
> + LLVMValueRef weight0, weight1, weight, delta;
> + LLVMValueRef constff_elem7, const0_elem6;
> + /* weights, correctly rounded (round(256*x/7)) */
> + elems[0] = LLVMConstInt(type16s, 256, 0);
> + elems[1] = LLVMConstInt(type16s, 0, 0);
> + elems[2] = LLVMConstInt(type16s, 219, 0);
> + elems[3] = LLVMConstInt(type16s, 183, 0);
> + elems[4] = LLVMConstInt(type16s, 146, 0);
> + elems[5] = LLVMConstInt(type16s, 110, 0);
> + elems[6] = LLVMConstInt(type16s, 73, 0);
> + elems[7] = LLVMConstInt(type16s, 37, 0);
> + weight0 = LLVMConstVector(elems, 8);
> +
> + elems[0] = LLVMConstInt(type16s, 256, 0);
> + elems[1] = LLVMConstInt(type16s, 0, 0);
> + elems[2] = LLVMConstInt(type16s, 205, 0);
> + elems[3] = LLVMConstInt(type16s, 154, 0);
> + elems[4] = LLVMConstInt(type16s, 102, 0);
> + elems[5] = LLVMConstInt(type16s, 51, 0);
> + elems[6] = LLVMConstInt(type16s, 0, 0);
> + elems[7] = LLVMConstInt(type16s, 0, 0);
> + weight1 = LLVMConstVector(elems, 8);
> +
> + weight0 = LLVMBuildBitCast(builder, weight0, bld8.vec_type, "");
> + weight1 = LLVMBuildBitCast(builder, weight1, bld8.vec_type, "");
> + weight = lp_build_select(&bld8, sel_mask, weight0, weight1);
> + weight = LLVMBuildBitCast(builder, weight, bld16.vec_type, "");
> +
> + for (i = 0; i < 16; i++) {
> + elems[i] = LLVMConstNull(type8s);
> + }
> + elems[7] = LLVMConstInt(type8s, 255, 0);
> + constff_elem7 = LLVMConstVector(elems, 16);
> +
> + for (i = 0; i < 16; i++) {
> + elems[i] = LLVMConstInt(type8s, 255, 0);
> + }
> + elems[6] = LLVMConstInt(type8s, 0, 0);
> + const0_elem6 = LLVMConstVector(elems, 16);
> +
> + /* standard simple lerp - but the version we need isn't available */
> + delta = LLVMBuildSub(builder, alpha0, alpha1, "");
> + ainterp = LLVMBuildMul(builder, delta, weight, "");
> + ainterp = LLVMBuildLShr(builder, ainterp,
> + lp_build_const_int_vec(gallivm, type16, 8), "");
> + ainterp = LLVMBuildBitCast(builder, ainterp, bld8.vec_type, "");
> + alpha1 = LLVMBuildBitCast(builder, alpha1, bld8.vec_type, "");
> + ainterp = LLVMBuildAdd(builder, ainterp, alpha1, "");
> + ainterp = LLVMBuildBitCast(builder, ainterp, bld16.vec_type, "");
> + ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef);
> +
> + /* fixing 0/0xff case is slightly more complex */
> + constff_elem7 = LLVMBuildAnd(builder, constff_elem7,
> + LLVMBuildNot(builder, sel_mask, ""), "");
> + const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, "");
> + ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, "");
> + ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, "");
> +
> + /* now pick all 16 elements at once! */
> + intrargs[0] = ainterp;
> + intrargs[1] = acode;
> + ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
> + bld8.vec_type, intrargs, 2, 0);
> +
> + ares = LLVMBuildBitCast(builder, ares, i32t, "");
> + mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
> + a[0] = LLVMBuildShl(builder, ares,
> + lp_build_const_int_vec(gallivm, type32, 24), "");
> + a[1] = LLVMBuildShl(builder, ares,
> + lp_build_const_int_vec(gallivm, type32, 16), "");
> + a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
> + a[2] = LLVMBuildShl(builder, ares,
> + lp_build_const_int_vec(gallivm, type32, 8), "");
> + a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
> + a[3] = LLVMBuildAnd(builder, ares, mask8hi, "");
> + }
> +
> + for (i = 0; i < 4; i++) {
> + a[i] = LLVMBuildBitCast(builder, a[i], i32t, "");
> + col[i] = LLVMBuildOr(builder, col[i], a[i], "");
> + }
> +}
> +
> +
> +static void
> +generate_update_cache_one_block(struct gallivm_state *gallivm,
> + LLVMValueRef function,
> + const struct util_format_description *format_desc)
> +{
> + LLVMBasicBlockRef block;
> + LLVMBuilderRef old_builder;
> + LLVMValueRef ptr_addr;
> + LLVMValueRef hash_index;
> + LLVMValueRef cache;
> + LLVMValueRef dxt_block, tag_value;
> + LLVMValueRef col[LP_MAX_VECTOR_LENGTH];
> +
> + ptr_addr = LLVMGetParam(function, 0);
> + hash_index = LLVMGetParam(function, 1);
> + cache = LLVMGetParam(function, 2);
> +
> + lp_build_name(ptr_addr, "ptr_addr" );
> + lp_build_name(hash_index, "hash_index");
> + lp_build_name(cache, "cache_addr");
> +
> + /*
> + * Function body
> + */
> +
> + old_builder = gallivm->builder;
> + block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
> + gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
> + LLVMPositionBuilderAtEnd(gallivm->builder, block);
> +
> + lp_build_gather_s3tc_simple_scalar(gallivm, format_desc, &dxt_block,
> + ptr_addr);
> +
> + switch (format_desc->format) {
> + case PIPE_FORMAT_DXT1_RGB:
> + case PIPE_FORMAT_DXT1_RGBA:
> + case PIPE_FORMAT_DXT1_SRGB:
> + case PIPE_FORMAT_DXT1_SRGBA:
> + s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
> + break;
> + case PIPE_FORMAT_DXT3_RGBA:
> + case PIPE_FORMAT_DXT3_SRGBA:
> + s3tc_decode_block_dxt3(gallivm, format_desc->format, dxt_block, col);
> + break;
> + case PIPE_FORMAT_DXT5_RGBA:
> + case PIPE_FORMAT_DXT5_SRGBA:
> + s3tc_decode_block_dxt5(gallivm, format_desc->format, dxt_block, col);
> + break;
> + default:
> + assert(0);
> + s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
> + break;
> + }
> +
> + tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
> + LLVMInt64TypeInContext(gallivm->context), "");
> + s3tc_store_cached_block(gallivm, col, tag_value, hash_index, cache);
> +
> + LLVMBuildRetVoid(gallivm->builder);
> +
> + LLVMDisposeBuilder(gallivm->builder);
> + gallivm->builder = old_builder;
> +
> + gallivm_verify_function(gallivm, function);
> +}
> +
> +
> +static void
> +update_cached_block(struct gallivm_state *gallivm,
> + const struct util_format_description *format_desc,
> + LLVMValueRef ptr_addr,
> + LLVMValueRef hash_index,
> + LLVMValueRef cache)
> +
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMModuleRef module = gallivm->module;
> + char name[256];
> + LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
> + LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
> + LLVMValueRef function, inst;
> + LLVMBasicBlockRef bb;
> + LLVMValueRef args[3];
> +
> + util_snprintf(name, sizeof name, "%s_update_cache_one_block",
> + format_desc->short_name);
> + function = LLVMGetNamedFunction(module, name);
> +
> + if (!function) {
> + LLVMTypeRef ret_type;
> + LLVMTypeRef arg_types[3];
> + LLVMTypeRef function_type;
> + unsigned arg;
> +
> + /*
> + * Generate the function prototype.
> + */
> +
> + ret_type = LLVMVoidTypeInContext(gallivm->context);
> + arg_types[0] = pi8t;
> + arg_types[1] = LLVMInt32TypeInContext(gallivm->context);
> + arg_types[2] = LLVMTypeOf(cache); // XXX: put right type here
> + function_type = LLVMFunctionType(ret_type, arg_types, ARRAY_SIZE(arg_types), 0);
> + function = LLVMAddFunction(module, name, function_type);
> +
> + for (arg = 0; arg < ARRAY_SIZE(arg_types); ++arg)
> + if (LLVMGetTypeKind(arg_types[arg]) == LLVMPointerTypeKind)
> + lp_add_function_attr(function, arg + 1, LP_FUNC_ATTR_NOALIAS);
> +
> + LLVMSetFunctionCallConv(function, LLVMFastCallConv);
> + LLVMSetVisibility(function, LLVMHiddenVisibility);
> + generate_update_cache_one_block(gallivm, function, format_desc);
> + }
> +
> + args[0] = ptr_addr;
> + args[1] = hash_index;
> + args[2] = cache;
> +
> + LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
> + bb = LLVMGetInsertBlock(builder);
> + inst = LLVMGetLastInstruction(bb);
> + LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
> +}
> +
> +/*
> + * cached lookup
> + */
> +static LLVMValueRef
> +compressed_fetch_cached(struct gallivm_state *gallivm,
> + const struct util_format_description *format_desc,
> + unsigned n,
> + LLVMValueRef base_ptr,
> + LLVMValueRef offset,
> + LLVMValueRef i,
> + LLVMValueRef j,
> + LLVMValueRef cache)
> +
> +{
> + LLVMBuilderRef builder = gallivm->builder;
> + unsigned count, low_bit, log2size;
> + LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
> + LLVMValueRef ij_index, hash_index, hash_mask, block_index;
> + LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
> + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
> + LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
> + struct lp_type type;
> + struct lp_build_context bld32;
> + memset(&type, 0, sizeof type);
> + type.width = 32;
> + type.length = n;
> +
> + lp_build_context_init(&bld32, gallivm, type);
> +
> + /*
> + * compute hash - we use direct mapped cache, the hash function could
> + * be better but it needs to be simple
> + * per-element:
> + * compare offset with offset stored at tag (hash)
> + * if not equal extract block, store block, update tag
> + * extract color from cache
> + * assemble colors
> + */
> +
> + low_bit = util_logbase2(format_desc->block.bits / 8);
> + log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
> + addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
> + ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
> + ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
> + /* For the hash function, first mask off the unused lowest bits. Then just
> + do some xor with address bits - only use lower 32bits */
> + ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
> + ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
> + lp_build_const_int_vec(gallivm, type, low_bit), "");
> + /* This only really makes sense for size 64,128,256 */
> + hash_index = ptr_addrtrunc;
> + ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
> + lp_build_const_int_vec(gallivm, type, 2*log2size), "");
> + hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
> + tmp = LLVMBuildLShr(builder, hash_index,
> + lp_build_const_int_vec(gallivm, type, log2size), "");
> + hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
> +
> + hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
> + hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
> + ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
> + ij_index = LLVMBuildAdd(builder, ij_index, j, "");
> + block_index = LLVMBuildShl(builder, hash_index,
> + lp_build_const_int_vec(gallivm, type, 4), "");
> + block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
> +
> + if (n > 1) {
> + color = bld32.undef;
> + for (count = 0; count < n; count++) {
> + LLVMValueRef index, cond, colorx;
> + LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
> + struct lp_build_if_state if_ctx;
> +
> + index = lp_build_const_int32(gallivm, count);
> + offsetx = LLVMBuildExtractElement(builder, offset, index, "");
> + addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
> + addrx = LLVMBuildAdd(builder, addrx, addr, "");
> + block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
> + hash_indexx = LLVMBuildLShr(builder, block_indexx,
> + lp_build_const_int32(gallivm, 4), "");
> + offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_indexx);
> + cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
> +
> + lp_build_if(&if_ctx, gallivm, cond);
> + {
> + ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
> + LLVMPointerType(i8t, 0), "");
> + update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
> +#if LP_BUILD_FORMAT_CACHE_DEBUG
> + s3tc_update_cache_access(gallivm, cache, 1,
> + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
> +#endif
> + }
> + lp_build_endif(&if_ctx);
> +
> + colorx = s3tc_lookup_cached_pixel(gallivm, cache, block_indexx);
> +
> + color = LLVMBuildInsertElement(builder, color, colorx,
> + lp_build_const_int32(gallivm, count), "");
> + }
> + }
> + else {
> + LLVMValueRef cond;
> + struct lp_build_if_state if_ctx;
> +
> + tmp = LLVMBuildZExt(builder, offset, i64t, "");
> + addr = LLVMBuildAdd(builder, tmp, addr, "");
> + offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_index);
> + cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
> +
> + lp_build_if(&if_ctx, gallivm, cond);
> + {
> + tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
> + update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
> +#if LP_BUILD_FORMAT_CACHE_DEBUG
> + s3tc_update_cache_access(gallivm, cache, 1,
> + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
> +#endif
> + }
> + lp_build_endif(&if_ctx);
> +
> + color = s3tc_lookup_cached_pixel(gallivm, cache, block_index);
> + }
> +#if LP_BUILD_FORMAT_CACHE_DEBUG
> + s3tc_update_cache_access(gallivm, cache, n,
> + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
> +#endif
> + return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
> +}
> +
> +
> +static LLVMValueRef
> +s3tc_dxt5_to_rgba_aos(struct gallivm_state *gallivm,
> + unsigned n,
> + enum pipe_format format,
> + LLVMValueRef colors,
> + LLVMValueRef codewords,
> + LLVMValueRef alpha_lo,
> + LLVMValueRef alpha_hi,
> + LLVMValueRef i,
> + LLVMValueRef j)
> +{
> + return s3tc_dxt5_full_to_rgba_aos(gallivm, n, format, colors,
> + codewords, alpha_lo, alpha_hi, i, j);
> +}
> +
> +
> +/**
> + * @param n number of pixels processed (usually n=4, but it should also work with n=1
> + * and multiples of 4)
> + * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
> + * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
> + * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
> + * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
> + * @return a <4*n x i8> vector with the pixel RGBA values in AoS
> + */
> +LLVMValueRef
> +lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
> + const struct util_format_description *format_desc,
> + unsigned n,
> + LLVMValueRef base_ptr,
> + LLVMValueRef offset,
> + LLVMValueRef i,
> + LLVMValueRef j,
> + LLVMValueRef cache)
> +{
> + LLVMValueRef rgba;
> + LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
> + LLVMBuilderRef builder = gallivm->builder;
> +
> + assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
> + assert(format_desc->block.width == 4);
> + assert(format_desc->block.height == 4);
> +
> + assert((n == 1) || (n % 4 == 0));
> +
> +/* debug_printf("format = %d\n", format_desc->format);*/
> + if (cache) {
> + rgba = compressed_fetch_cached(gallivm, format_desc, n,
> + base_ptr, offset, i, j, cache);
> + return rgba;
> + }
> +
> + if (n > 4) {
> + unsigned count;
> + LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
> + LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
> + LLVMTypeRef i128_vectype = LLVMVectorType(i128_type, n / 4);
> + LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
> + gallivm->context), 4);
> + LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
> + struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
> +
> + assert(n / 4 <= ARRAY_SIZE(rgba4));
> +
> + rgba = LLVMGetUndef(i128_vectype);
> +
> + for (count = 0; count < n / 4; count++) {
> + LLVMValueRef colors, codewords, alpha_lo, alpha_hi;
> +
> + i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
> + j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
> + offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
> +
> + lp_build_gather_s3tc(gallivm, 4, format_desc, &colors, &codewords,
> + &alpha_lo, &alpha_hi, base_ptr, offset4);
> +
> + switch (format_desc->format) {
> + case PIPE_FORMAT_DXT1_RGB:
> + case PIPE_FORMAT_DXT1_RGBA:
> + case PIPE_FORMAT_DXT1_SRGB:
> + case PIPE_FORMAT_DXT1_SRGBA:
> + rgba4[count] = s3tc_dxt1_to_rgba_aos(gallivm, 4, format_desc->format,
> + colors, codewords, i4, j4);
> + break;
> + case PIPE_FORMAT_DXT3_RGBA:
> + case PIPE_FORMAT_DXT3_SRGBA:
> + rgba4[count] = s3tc_dxt3_to_rgba_aos(gallivm, 4, format_desc->format, colors,
> + codewords, alpha_lo, alpha_hi, i4, j4);
> + break;
> + case PIPE_FORMAT_DXT5_RGBA:
> + case PIPE_FORMAT_DXT5_SRGBA:
> + rgba4[count] = s3tc_dxt5_to_rgba_aos(gallivm, 4, format_desc->format, colors,
> + codewords, alpha_lo, alpha_hi, i4, j4);
> + break;
> + default:
> + assert(0);
> + rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
> + break;
> + }
> + /* shuffles typically give best results with dword elements...*/
> + rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
> + }
> + rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
> + rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
> + }
> + else {
> + LLVMValueRef colors, codewords, alpha_lo, alpha_hi;
> +
> + lp_build_gather_s3tc(gallivm, n, format_desc, &colors, &codewords,
> + &alpha_lo, &alpha_hi, base_ptr, offset);
> +
> + switch (format_desc->format) {
> + case PIPE_FORMAT_DXT1_RGB:
> + case PIPE_FORMAT_DXT1_RGBA:
> + case PIPE_FORMAT_DXT1_SRGB:
> + case PIPE_FORMAT_DXT1_SRGBA:
> + rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format_desc->format,
> + colors, codewords, i, j);
> + break;
> + case PIPE_FORMAT_DXT3_RGBA:
> + case PIPE_FORMAT_DXT3_SRGBA:
> + rgba = s3tc_dxt3_to_rgba_aos(gallivm, n, format_desc->format, colors,
> + codewords, alpha_lo, alpha_hi, i, j);
> + break;
> + case PIPE_FORMAT_DXT5_RGBA:
> + case PIPE_FORMAT_DXT5_SRGBA:
> + rgba = s3tc_dxt5_to_rgba_aos(gallivm, n, format_desc->format, colors,
> + codewords, alpha_lo, alpha_hi, i, j);
> + break;
> + default:
> + assert(0);
> + rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
> + break;
> + }
> + }
> +
> + /* always return just decompressed values - srgb conversion is done later */
> +
> + return rgba;
> +}
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
> index 018cca8f9df..a6662c5e01b 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
> @@ -3549,10 +3549,6 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
> const struct util_format_description *format_desc;
> format_desc = util_format_description(static_texture_state->format);
> if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
> - /*
> - * This is not 100% correct, if we have cache but the
> - * util_format_s3tc_prefer is true the cache won't get used
> - * regardless (could hook up the block decode there...) */
> need_cache = TRUE;
I'm a bit confused. Based on your comment description, shouldn't this be
FALSE? Or is this dead code?
> }
> }
> diff --git a/src/gallium/auxiliary/meson.build b/src/gallium/auxiliary/meson.build
> index a4dbcf7b4ca..57f7e69050f 100644
> --- a/src/gallium/auxiliary/meson.build
> +++ b/src/gallium/auxiliary/meson.build
> @@ -389,8 +389,8 @@ if with_llvm
> 'gallivm/lp_bld_flow.h',
> 'gallivm/lp_bld_format_aos_array.c',
> 'gallivm/lp_bld_format_aos.c',
> - 'gallivm/lp_bld_format_cached.c',
> 'gallivm/lp_bld_format_float.c',
> + 'gallivm/lp_bld_format_s3tc.c',
> 'gallivm/lp_bld_format.c',
> 'gallivm/lp_bld_format.h',
> 'gallivm/lp_bld_format_soa.c',
>
Otherwise looks great. Thanks!
Reviewed-by: Jose Fonseca <jfonseca at vmware.com>
More information about the mesa-dev
mailing list