[Mesa-dev] [PATCH 2/5] llvmpipe: add POWER8 portability file - u_pwr8.h

Roland Scheidegger sroland at vmware.com
Tue Dec 29 17:18:53 PST 2015


On 29.12.2015 at 17:12, Oded Gabbay wrote:
> This file provides a portability layer that will make it easier to convert
> SSE-based functions to VMX/VSX-based functions.
> 
> All the functions implemented in this file are prefixed using "vec_".
> Therefore, when converting an SSE-based function, one simply needs to
> replace the "_mm_" prefix of the SSE function being called with "vec_".
> 
> Having said that, not all functions could be converted that way, due to
> differences between the architectures. Where such a conversion would have
> hurt performance, I preferred a more ad-hoc solution. For example,
> _mm_shuffle_epi32 had to be converted using ad-hoc permute masks instead
> of a generic function.
> 
> All the functions in this file support both little-endian and big-endian.
> 
> All of the functions are implemented using Altivec/VMX intrinsics, except
> for one, where I needed to use inline assembly (due to a missing
> intrinsic).
> 
> Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
> ---
>  src/gallium/auxiliary/util/u_pwr8.h | 298 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 298 insertions(+)
>  create mode 100644 src/gallium/auxiliary/util/u_pwr8.h
> 
> diff --git a/src/gallium/auxiliary/util/u_pwr8.h b/src/gallium/auxiliary/util/u_pwr8.h
> new file mode 100644
> index 0000000..c6e5fac
> --- /dev/null
> +++ b/src/gallium/auxiliary/util/u_pwr8.h
> @@ -0,0 +1,298 @@
> +/*
> + * Copyright 2015 Red Hat Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * on the rights to use, copy, modify, merge, publish, distribute, sub
> + * license, and/or sell copies of the Software, and to permit persons to whom
> + * the Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
> + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
> + * USE OR OTHER DEALINGS IN THE SOFTWARE.
> + *
> + * Author: Oded Gabbay <oded.gabbay at redhat.com>
> + */
> +
> +/**
> + * @file
> + * POWER8 intrinsics portability header.
> + *
> + */
> +
> +#ifndef U_PWR8_H_
> +#define U_PWR8_H_
> +
> +#ifdef _ARCH_PWR8
> +
> +#define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16)))
> +
> +typedef VECTOR_ALIGN_16 vector unsigned char __m128i;
> +
> +typedef VECTOR_ALIGN_16 union m128i {
> +   __m128i m128i;
> +   vector signed int m128si;
> +   vector unsigned int m128ui;
> +   ubyte ub[16];
> +   ushort us[8];
> +   int i[4];
> +   uint ui[4];
> +} __m128i_union;
> +
> +static inline __m128i
> +vec_set_epi32 (int i3, int i2, int i1, int i0)
> +{
> +   __m128i_union vdst;
> +
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> +   vdst.i[0] = i0;
> +   vdst.i[1] = i1;
> +   vdst.i[2] = i2;
> +   vdst.i[3] = i3;
> +#else
> +   vdst.i[3] = i0;
> +   vdst.i[2] = i1;
> +   vdst.i[1] = i2;
> +   vdst.i[0] = i3;
> +#endif
> +
> +   return (__m128i) vdst.m128si;
> +}
> +
> +static inline __m128i
> +vec_setr_epi32 (int i0, int i1, int i2, int i3)
> +{
> +  return vec_set_epi32 (i3, i2, i1, i0);
> +}
> +
> +static inline __m128i
> +vec_unpacklo_epi32 (__m128i even, __m128i odd)
> +{
> +   static const __m128i perm_mask =
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> +      { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
> +#else
> +      {24, 25, 26, 27,  8,  9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15};
> +#endif
> +
> +   return vec_perm (even, odd, perm_mask);
> +}
> +
> +static inline __m128i
> +vec_unpackhi_epi32 (__m128i even, __m128i odd)
> +{
> +   static const __m128i perm_mask =
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> +      { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
> +#else
> +      {16, 17, 18, 19,  0,  1,  2,  3, 20, 21, 22, 23,  4,  5,  6,  7};
> +#endif
> +
> +   return vec_perm (even, odd, perm_mask);
> +}
> +
> +static inline __m128i
> +vec_unpacklo_epi64 (__m128i even, __m128i odd)
> +{
> +   static const __m128i perm_mask =
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> +      { 0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23};
> +#else
> +      {24, 25, 26, 27, 28, 29, 30, 31,  8,  9, 10, 11, 12, 13, 14, 15};
> +#endif
> +
> +   return vec_perm (even, odd, perm_mask);
> +}
> +
> +static inline __m128i
> +vec_unpackhi_epi64 (__m128i even, __m128i odd)
> +{
> +   static const __m128i perm_mask =
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> +      { 8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
> +#else
> +      {16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7};
> +#endif
> +
> +   return vec_perm (even, odd, perm_mask);
> +}
> +
> +static inline __m128i
> +vec_add_epi32 (__m128i a, __m128i b)
> +{
> +   return (__m128i) vec_add ((vector signed int) a, (vector signed int) b);
> +}
> +
> +static inline __m128i
> +vec_sub_epi32 (__m128i a, __m128i b)
> +{
> +   return (__m128i) vec_sub ((vector signed int) a, (vector signed int) b);
> +}
> +
> +static inline __m128i
> +vec_mullo_epi32 (__m128i a, __m128i b)
> +{
> +   __m128i v;
> +
> +   __asm__(
> +           "vmuluwm %0, %1, %2   \n"
> +           : "=v" (v)
> +           : "v" (a), "v" (b)
> +           );
> +
> +   return v;
> +}
> +
> +static inline void
> +transpose4_epi32(const __m128i * restrict a,
> +                 const __m128i * restrict b,
> +                 const __m128i * restrict c,
> +                 const __m128i * restrict d,
> +                 __m128i * restrict o,
> +                 __m128i * restrict p,
> +                 __m128i * restrict q,
> +                 __m128i * restrict r)
> +{
> +   __m128i t0 = vec_unpacklo_epi32(*a, *b);
> +   __m128i t1 = vec_unpacklo_epi32(*c, *d);
> +   __m128i t2 = vec_unpackhi_epi32(*a, *b);
> +   __m128i t3 = vec_unpackhi_epi32(*c, *d);
> +
> +   *o = vec_unpacklo_epi64(t0, t1);
> +   *p = vec_unpackhi_epi64(t0, t1);
> +   *q = vec_unpacklo_epi64(t2, t3);
> +   *r = vec_unpackhi_epi64(t2, t3);
> +}
> +
> +static inline __m128i
> +vec_slli_epi32 (__m128i vsrc, unsigned int count)
> +{
> +   __m128i_union vec_count;
> +
> +   if (count >= 32)
> +      return (__m128i) vec_splats (0);
> +   else if (count == 0)
> +      return vsrc;
> +
> +   /* In VMX, all shift count fields must contain the same value */
> +   vec_count.m128si = (vector signed int) vec_splats (count);
> +   return (__m128i) vec_sl ((vector signed int) vsrc, vec_count.m128ui);
> +}
> +
> +static inline __m128i
> +vec_srli_epi32 (__m128i vsrc, unsigned int count)
> +{
> +   __m128i_union vec_count;
> +
> +   if (count >= 32)
> +      return (__m128i) vec_splats (0);
> +   else if (count == 0)
> +      return vsrc;
> +
> +   /* In VMX, all shift count fields must contain the same value */
> +   vec_count.m128si = (vector signed int) vec_splats (count);
> +   return (__m128i) vec_sr ((vector signed int) vsrc, vec_count.m128ui);
> +}
> +
> +static inline __m128i
> +vec_srai_epi32 (__m128i vsrc, unsigned int count)
> +{
> +   __m128i_union vec_count;
> +
> +   if (count >= 32)
> +      return (__m128i) vec_splats (0);
> +   else if (count == 0)
> +      return vsrc;
> +
> +   /* In VMX, all shift count fields must contain the same value */
> +   vec_count.m128si = (vector signed int) vec_splats (count);
> +   return (__m128i) vec_sra ((vector signed int) vsrc, vec_count.m128ui);
> +}
> +
> +static inline __m128i
> +vec_cmpeq_epi32 (__m128i a, __m128i b)
> +{
> +   return (__m128i) vec_cmpeq ((vector signed int) a, (vector signed int) b);
> +}
> +
> +static inline __m128i
> +vec_loadu_si128 (const uint32_t* src)
> +{
> +   __m128i_union vsrc;
> +
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> +
> +   vsrc.m128ui = *((vector unsigned int *) src);
> +
> +#else
> +
> +   __m128i vmask, tmp1, tmp2;
> +
> +   vmask = vec_lvsl(0, src);
> +
> +   tmp1 = (typeof(tmp1))vec_ld (0, src);
> +   tmp2 = (typeof(tmp2))vec_ld (15, src);
> +   vsrc.m128ui = (typeof(vsrc.m128ui)) vec_perm (tmp1, tmp2, vmask);
> +
> +#endif
> +
> +   return vsrc.m128i;
> +}
> +
> +static inline void
> +vec_store_si128 (uint32_t* dest, __m128i vdata)
> +{
> +   vec_st ((vector unsigned int) vdata, 0, dest);
> +}
> +
> +static inline int
> +vec_movemask_epi8 (__m128i vsrc)
> +{
> +   __m128i_union vtemp;
> +   int result;
> +
> +   vtemp.m128i = __builtin_vec_vgbbd(vsrc);
> +
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> +   result = vtemp.ub[15] << 8 | vtemp.ub[7];
> +#else
> +   result = vtemp.ub[0] << 8 | vtemp.ub[8];
> +#endif
> +
> +   return result;
> +}
> +
> +static inline __m128i
> +vec_packs_epi16 (__m128i a, __m128i b)
> +{
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> +   return (__m128i) vec_packs ((vector signed short) a,
> +                               (vector signed short) b);
> +#else
> +   return (__m128i) vec_packs ((vector signed short) b,
> +                               (vector signed short) a);
> +#endif
> +}
> +
> +static inline __m128i
> +vec_packs_epi32 (__m128i a, __m128i b)
> +{
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> +   return (__m128i) vec_packs ((vector signed int) a, (vector signed int) b);
> +#else
> +   return (__m128i) vec_packs ((vector signed int) b, (vector signed int) a);
> +#endif
> +}
> +
> +#endif /* _ARCH_PWR8 */
> +
> +#endif /* U_PWR8_H_ */
> 
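
(As an aside on the conversion scheme described in the commit message
above: the idea is that an SSE2 sequence such as

   __m128i t = _mm_add_epi32 (a, b);
   t = _mm_slli_epi32 (t, 2);

becomes, with this header,

   __m128i t = vec_add_epi32 (a, b);
   t = vec_slli_epi32 (t, 2);

with the header's vector-unsigned-char __m128i standing in for the SSE
type. This snippet is only an illustration, not code from the series.)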


For which functions do you actually need POWER8?
I believe __builtin_vec_vgbbd, used for vec_movemask_epi8, is one?
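
(For reference, the result vec_movemask_epi8 is meant to produce follows
the SSE2 _mm_movemask_epi8 semantics: bit i of the return value is the
most significant bit of byte element i. Roughly this scalar sketch, for
illustration only:

   static inline int
   movemask_epi8_ref (const unsigned char bytes[16])
   {
      int mask = 0, i;
      /* collect the top (sign) bit of each of the 16 bytes */
      for (i = 0; i < 16; i++)
         mask |= (bytes[i] >> 7) << i;
      return mask;
   }

The vgbbd instruction does that gathering in hardware, collecting the top
bit of each byte of a doubleword into a single byte, which is why the
implementation above only reads back two bytes of the permuted result.)
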
I was actually thinking about how we deal with these masks some time
ago. It looks suboptimal, since for the most part we're packing the
masks and then just have to unpack them again in the jit code, so I
always believed there's some overhead due to that (we could at least
pass one byte per mask bit). The problem with that, though, is that
those masks also interact with the C code, albeit typically just by
checking for a specific value, so as long as you can do that
efficiently with vector instructions it should be possible to work
around it. (sse41 in fact has the very efficient ptest instruction,
but unfortunately it can't really be integrated well into surrounding
C code, due to the fact that it just sets the flags register and has
no return value, plus it would need runtime feature detection.)
Plus there's another problem: the "recursive" evaluation of blocks
(for sizes 64/16/4) mostly uses the same functions, and the outer
levels really only need some scalar value, which can be dealt with
efficiently in C code.

FWIW, it looks like the unaligned load is somewhat complex too. I am
not entirely sure why we use unaligned values there in the first place;
as far as I can tell it should be possible to fix this with a couple of
PIPE_ALIGN_VAR(16) annotations on the affected structs (surely we'd
prefer aligned variables if we can get them). Unless there's some
reason that doesn't work.
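
Roughly what that would look like (hypothetical struct and field names,
for illustration only):

   /* in whichever header declares the affected data */
   struct foo_block {
      PIPE_ALIGN_VAR(16) int32_t c[4];   /* now 16-byte aligned */
   };

With the storage aligned like that, the vec_lvsl/vec_perm dance in the
big-endian path of vec_loadu_si128 wouldn't be needed at all; a plain
aligned vector load would do.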

Roland



