[Mesa-dev] [PATCH] gallivm: fix broken 8-wide s3tc decoding

Tue May 7 01:15:42 UTC 2019

LGTM.  Just two little nits below.

Reviewed-by: Brian Paul <brianp at vmware.com>

Perhaps you could review the 5-patch series of clean-ups I posted on 
Saturday?

On 05/06/2019 06:12 PM, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
> 
> Brian noticed there was an uninitialized var for the 8-wide case and 128
> bit blocks, which made it always crash. Likewise, the 64bit block case
> had another crash bug due to type mismatch.
> Color decode (used for all s3tc formats) also had a bogus shuffle for
> this case, leading to decode artifacts.
> Fix these all up, which makes the code actually work 8-wide. Note that
> it's still not used - I've verified it works, and the generated assembly
> does look quite a bit simpler actually (20-30% less instructions for the
> s3tc decode part with avx2), however in practice it still seems to be
> sligthly slower for some unknown reason (tested with openarena) on my
> haswell box, so for now continue to split things into 4-wide vectors
> before decoding.
> ---
>   .../auxiliary/gallivm/lp_bld_format_s3tc.c    | 33 +++++++++----------
>   1 file changed, 16 insertions(+), 17 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
> index 9561c349dad..8f6e9bec18a 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
> @@ -77,24 +77,17 @@ lp_build_uninterleave2_half(struct gallivm_state *gallivm,
>                               unsigned lo_hi)
>   {
>      LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
> -   unsigned i, j;
> +   unsigned i;
>   
>      assert(type.length <= LP_MAX_VECTOR_LENGTH);
>      assert(lo_hi < 2);
>   
>      if (type.length * type.width == 256) {
> -      assert(type.length >= 4);
> -      for (i = 0, j = 0; i < type.length; ++i) {
> -         if (i == type.length / 4) {
> -            j = type.length;
> -         } else if (i == type.length / 2) {
> -            j = type.length / 2;
> -         } else if (i == 3 * type.length / 4) {
> -            j = 3 * type.length / 4;
> -         } else {
> -            j += 2;
> -         }
> -         elems[i] = lp_build_const_int32(gallivm, j + lo_hi);
> +      assert(type.length == 8);
> +      assert(type.width == 32);
> +      const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14};

could be static

> +      for (i = 0; i < type.length; ++i) {
> +         elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi);
>         }
>      } else {
>         for (i = 0; i < type.length; ++i) {
> @@ -277,7 +270,7 @@ lp_build_gather_s3tc(struct gallivm_state *gallivm,
>      }
>      else {
>         LLVMValueRef tmp[4], cc01, cc23;
> -      struct lp_type lp_type32, lp_type64, lp_type32dxt;
> +      struct lp_type lp_type32, lp_type64;
>         memset(&lp_type32, 0, sizeof lp_type32);
>         lp_type32.width = 32;
>         lp_type32.length = length;
> @@ -309,10 +302,14 @@ lp_build_gather_s3tc(struct gallivm_state *gallivm,
>                                                 lp_build_const_extend_shuffle(gallivm, 2, 4), "");
>            }
>            if (length == 8) {
> +            struct lp_type lp_type32_4;
> +            memset(&lp_type32_4, 0, sizeof lp_type32_4);

Maybe we could move toward using initializers in these case?  struct 
lp_type lp_type32_4 = {0}; ?

> +            lp_type32_4.width = 32;
> +            lp_type32_4.length = 4;
>               for (i = 0; i < 4; ++i) {
>                  tmp[0] = elems[i];
>                  tmp[1] = elems[i+4];
> -               elems[i] = lp_build_concat(gallivm, tmp, lp_type32, 2);
> +               elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
>               }
>            }
>            cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
> @@ -811,7 +808,7 @@ s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
>      tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
>      bit_pos = LLVMBuildAnd(builder, bit_pos,
>                             lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
> -   /* Warning: slow shift with per element count */
> +   /* Warning: slow shift with per element count (without avx2) */
>      /*
>       * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
>       * to select the right byte with pshufb. Then for the remaining one bit
> @@ -1640,7 +1637,6 @@ s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
>                             lp_build_const_int_vec(gallivm, type16, 8), "");
>      alpha = LLVMBuildBitCast(builder, alpha,  i64t, "");
>      shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
> -   /* XXX this shuffle broken with LLVM 2.8 */
>      alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
>      alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
>   
> @@ -2176,6 +2172,9 @@ lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
>         return rgba;
>      }
>   
> +   /*
> +    * Could use n > 8 here with avx2, but doesn't seem faster.
> +    */
>      if (n > 4) {
>         unsigned count;
>         LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
>