[Mesa-dev] [PATCH 1/4] llvmpipe: (trivial) minimally simplify mask construction

Wed Jan 4 16:15:22 UTC 2017

On 21/12/16 04:01, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> simd instruction sets usually have comparisons for equal, not unequal.
> So use a different comparison against the mask itself - which also means
> we don't need a all-zero as well as a all-one (for the pxor) reg.
>
> Also add code to avoid scalar expansion of i1 values which we definitely
> shouldn't do. There's problems with this though with llvm select
> interaction, so it's disabled (basically using llvm select instead of
> intrinsics may still produce atrocious code, even in cases where we
> figured it should not, albeit I think this could probably be fixed
> with some better selection of optimization passes, but I have zero
> idea there really).
> ---
>  src/gallium/auxiliary/gallivm/lp_bld_logic.c |  2 ++
>  src/gallium/drivers/llvmpipe/lp_bld_depth.c  | 52 ++++++++++++++++++++++------
>  src/gallium/drivers/llvmpipe/lp_state_fs.c   | 16 +++++----
>  3 files changed, 53 insertions(+), 17 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
> index 1a50e82..524917a 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
> @@ -327,6 +327,8 @@ lp_build_select(struct lp_build_context *bld,
>         * supported yet for a long time, and LLVM will generate poor code when
>         * the mask is not the result of a comparison.
>         * Also, llvm 3.7 may miscompile them (bug 94972).
> +       * XXX: Even if the instruction was an SExt, this may still produce
> +       * terrible code. Try piglit stencil-twoside.
>         */
>
>        /* Convert the mask to a vector of booleans.
> diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
> index 0c27c2f..d5d5c5a 100644
> --- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
> +++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
> @@ -963,16 +963,48 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
>     if (stencil[0].enabled) {
>
>        if (face) {
> -         LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
> -
> -         /* front_facing = face != 0 ? ~0 : 0 */
> -         front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
> -         front_facing = LLVMBuildSExt(builder, front_facing,
> -                                      LLVMIntTypeInContext(gallivm->context,
> -                                             s_bld.type.length*s_bld.type.width),
> -                                      "");
> -         front_facing = LLVMBuildBitCast(builder, front_facing,
> -                                         s_bld.int_vec_type, "");
> +         if (0) {
> +            /*
> +             * XXX: the scalar expansion below produces atrocious code
> +             * (basically producing a 64bit scalar value, then moving the 2
> +             * 32bit pieces separately to simd, plus 4 shuffles, which is
> +             * seriously lame). But the scalar-simd transitions are always
> +             * tricky, so no big surprise there.
> +             * This here would be way better, however llvm has some serious
> +             * trouble later using it in the select, probably because it will
> +             * recognize the expression as constant and move the simd value
> +             * away (out of the loop) - and then it will suddenly try
> +             * constructing i1 high-bit masks out of it later...
> +             * (Try piglit stencil-twoside.)
> +             * Note this is NOT due to using SExt/Trunc, it fails exactly the
> +             * same even when using native compare/select.
> +             * I cannot reproduce this problem when using stand-alone compiler
> +             * though, suggesting some problem with optimization passes...
> +             * (With stand-alone compilation, the construction of this mask
> +             * value, no matter if the easy 3 instruction here or the complex
> +             * 16+ one below, never gets separated from where it's used.)
> +             * The scalar code still has the same problem, but the generated
> +             * code looks a bit better at least for some reason, even if
> +             * mostly by luck (the fundamental issue clearly is the same).
> +             */

> +            front_facing = lp_build_broadcast(gallivm, s_bld.vec_type, face);
> +            /* front_facing = face != 0 ? ~0 : 0 */
> +            front_facing = lp_build_compare(gallivm, s_bld.type,
> +                                            PIPE_FUNC_NOTEQUAL,
> +                                            front_facing, s_bld.zero);
> +         } else {

Let's leave the command about the ICmp/SExt innefficiencies, but remove 
this dead path.

> +            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
> +
> +            /* front_facing = face != 0 ? ~0 : 0 */
> +            front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
> +            front_facing = LLVMBuildSExt(builder, front_facing,
> +                                         LLVMIntTypeInContext(gallivm->context,
> +                                                s_bld.type.length*s_bld.type.width),
> +                                         "");
> +            front_facing = LLVMBuildBitCast(builder, front_facing,
> +                                            s_bld.int_vec_type, "");
> +
> +         }
>        }
>
>        s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
> diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> index 0910815..a36389c 100644
> --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
> +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
> @@ -127,7 +127,7 @@ generate_quad_mask(struct gallivm_state *gallivm,
>     struct lp_type mask_type;
>     LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
>     LLVMValueRef bits[16];
> -   LLVMValueRef mask;
> +   LLVMValueRef mask, bits_vec;
>     int shift, i;
>
>     /*
> @@ -179,15 +179,15 @@ generate_quad_mask(struct gallivm_state *gallivm,
>        bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
>        bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
>     }
> -   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, fs_type.length), "");
> +   bits_vec = LLVMConstVector(bits, fs_type.length);
> +   mask = LLVMBuildAnd(builder, mask, bits_vec, "");
>
>     /*
> -    * mask = mask != 0 ? ~0 : 0
> +    * mask = mask == bits ? ~0 : 0
>      */
>     mask = lp_build_compare(gallivm,
> -                           mask_type, PIPE_FUNC_NOTEQUAL,
> -                           mask,
> -                           lp_build_const_int_vec(gallivm, mask_type, 0));
> +                           mask_type, PIPE_FUNC_EQUAL,
> +                           mask, bits_vec);
>
>     return mask;
>  }
> @@ -2476,8 +2476,10 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
>     for (i = 0; i < key->nr_cbufs; ++i) {
>        debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
>     }
> -   if (key->depth.enabled) {
> +   if (key->depth.enabled || key->stencil[0].enabled) {
>        debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
> +   }
> +   if (key->depth.enabled) {
>        debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE));
>        debug_printf("depth.writemask = %u\n", key->depth.writemask);
>     }
>

Otherwise looks good.

Reviewed-by: Jose Fonseca <jfonseca at vmware.com>