[Mesa-dev] [PATCH] gallivm: don't use saturated unsigned add/sub intrinsics for llvm 8.0
Jose Fonseca
jfonseca at vmware.com
Thu Aug 23 20:13:31 UTC 2018
On 23/08/18 18:53, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> These have been removed. Unfortunately auto-upgrade doesn't work for
> jit. (Worse, it seems we don't get a compilation error anymore when
> compiling the shader, rather llvm will just do a call to a null
> function in the jitted shaders making it difficult to detect when
> intrinsics vanish.)
>
> Luckily the signed ones are still there, I helped convincing llvm
> removing them is a bad idea for now, since while the unsigned ones have
> sort of agreed-upon simplest patterns to replace them with, this is not
> the case for the signed ones, and they require _significantly_ more
> complex patterns - to the point that the recognition is IMHO probably
> unlikely to ever work reliably in practice (due to other optimizations
> interfering). (Even for the relatively trivial unsigned patterns, llvm
> already added test cases where recognition doesn't work, unsaturated
> add followed by saturated add may produce atrocious code.)
> Nevertheless, it seems there's a serious quest to squash all
> cpu-specific intrinsics going on, so I'd expect patches to nuke them as
> well to resurface.
>
> Adapt the existing fallback code to match the simple patterns llvm uses
> and hope for the best. I've verified with lp_test_blend that it does
> produce the expected saturated assembly instructions. Though our
> cmp/select build helpers don't use boolean masks, but it doesn't seem
> to interfere with llvm's ability to recognize the pattern.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=106231
> ---
> src/gallium/auxiliary/gallivm/lp_bld_arit.c | 87 ++++++++++++++-------
> 1 file changed, 60 insertions(+), 27 deletions(-)
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> index e922474ef61..f348833206b 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
> @@ -557,23 +557,27 @@ lp_build_add(struct lp_build_context *bld,
> if (!type.floating && !type.fixed) {
> if (type.width * type.length == 128) {
> if (util_cpu_caps.has_sse2) {
> - if (type.width == 8)
> - intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
> - if (type.width == 16)
> - intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
> + if (type.width == 8)
> + intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
> + HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
> + if (type.width == 16)
> + intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
> + HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
> } else if (util_cpu_caps.has_altivec) {
> - if (type.width == 8)
> - intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
> - if (type.width == 16)
> - intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
> + if (type.width == 8)
> + intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
> + if (type.width == 16)
> + intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
> }
> }
> if (type.width * type.length == 256) {
> if (util_cpu_caps.has_avx2) {
> - if (type.width == 8)
> - intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
> - if (type.width == 16)
> - intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
> + if (type.width == 8)
> + intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
> + HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
> + if (type.width == 16)
> + intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
> + HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
> }
> }
> }
> @@ -592,8 +596,6 @@ lp_build_add(struct lp_build_context *bld,
> LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
> LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
> a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
> - } else {
> - a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
Why you removed this?
> }
> }
>
> @@ -612,6 +614,24 @@ lp_build_add(struct lp_build_context *bld,
> if(bld->type.norm && (bld->type.floating || bld->type.fixed))
> res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
>
> + if (type.norm && !type.floating && !type.fixed) {
> + if (!type.sign) {
> + /*
> + * newer llvm versions no longer support the intrinsics, but recognize
> + * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
> + * code, it is important we match the pattern llvm uses (and pray llvm
> + * doesn't change it - and hope they decide on the same pattern for
> + * all backends supporting it...).
> + * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
> + * interfere with llvm's ability to recognize the pattern but seems
> + * a bit brittle.
> + */
> + LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
> + res = lp_build_select(bld, overflowed,
> + LLVMConstAllOnes(bld->int_vec_type), res);
> + }
> + }
> +
> /* XXX clamp to floor of -1 or 0??? */
>
> return res;
> @@ -858,23 +878,27 @@ lp_build_sub(struct lp_build_context *bld,
> if (!type.floating && !type.fixed) {
> if (type.width * type.length == 128) {
> if (util_cpu_caps.has_sse2) {
> - if (type.width == 8)
> - intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
> - if (type.width == 16)
> - intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
> + if (type.width == 8)
> + intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
> + HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
> + if (type.width == 16)
> + intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
> + HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
> } else if (util_cpu_caps.has_altivec) {
> - if (type.width == 8)
> - intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
> - if (type.width == 16)
> - intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
> + if (type.width == 8)
> + intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
> + if (type.width == 16)
> + intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
> }
> }
> if (type.width * type.length == 256) {
> if (util_cpu_caps.has_avx2) {
> - if (type.width == 8)
> - intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
> - if (type.width == 16)
> - intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
> + if (type.width == 8)
> + intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
> + HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
> + if (type.width == 16)
> + intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
> + HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
> }
> }
> }
> @@ -894,7 +918,16 @@ lp_build_sub(struct lp_build_context *bld,
> LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
> a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
> } else {
> - a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
> + /*
> + * This must match llvm pattern for saturated unsigned sub.
> + * (lp_build_max_simple actually does the job with its current
> + * definition but do it explicitly here.)
> + * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
> + * interfere with llvm's ability to recognize the pattern but seems
> + * a bit brittle.
> + */
> + LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
> + a = lp_build_select(bld, no_ov, a, b);
> }
> }
>
>
Otherwise looks great AFAICT. Thanks
Reviewed-by: Jose Fonseca <jfonseca at vmware.com>
More information about the mesa-dev
mailing list