[Mesa-dev] [PATCH 3/5] llvmpipe: Optimize do_triangle_ccw for POWER8

Tue Dec 29 15:17:43 PST 2015

The idea looks right to me.
Though frankly I don't like our current setup code too much - in
particular the mix of C, assembly, and JIT code, with some
duplication (plus the many transposes everywhere). There's likely
optimization potential to be found there.

Roland

Am 29.12.2015 um 17:12 schrieb Oded Gabbay:
> This patch converts the SSE optimization done in do_triangle_ccw to
> VMX/VSX.
> 
> I measured the results on a POWER8 machine with 32 cores at 3.4GHz and
> 16GB of RAM.
> 
>                       FPS/Score
>   Name            Before     After    Delta
> ------------------------------------------------
> glmark2 (score)   136.6      139.8    2.34%
> openarena         16.14      16.35    1.30%
> xonotic           4.655      4.707    1.11%
> 
> Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
> ---
>  src/gallium/drivers/llvmpipe/lp_setup_tri.c | 96 +++++++++++++++++++++++++++++
>  1 file changed, 96 insertions(+)
> 
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> index b1671dd..cfa9874 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> @@ -46,6 +46,9 @@
>  
>  #if defined(PIPE_ARCH_SSE)
>  #include <emmintrin.h>
> +#elif defined(_ARCH_PWR8)
> +#include <altivec.h>
> +#include "util/u_pwr8.h"
>  #endif
>  
>  static inline int
> @@ -462,6 +465,99 @@ do_triangle_ccw(struct lp_setup_context *setup,
>        STORE_PLANE(plane[2], p2);
>  #undef STORE_PLANE
>     } else
> +#elif defined(_ARCH_PWR8)
> +   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
> +       setup->fb.height <= MAX_FIXED_LENGTH32 &&
> +       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
> +       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
> +      unsigned int bottom_edge;
> +      __m128i vertx, verty;
> +      __m128i shufx, shufy;
> +      __m128i dcdx, dcdy, c;
> +      __m128i unused;
> +      __m128i dcdx_neg_mask;
> +      __m128i dcdy_neg_mask;
> +      __m128i dcdx_zero_mask;
> +      __m128i top_left_flag;
> +      __m128i c_inc_mask, c_inc;
> +      __m128i eo, p0, p1, p2;
> +      __m128i_union vshuf_mask;
> +      __m128i zero = vec_splats((unsigned char) 0);
> +      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
> +
> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
> +      vshuf_mask.i[0] = 0x07060504;
> +      vshuf_mask.i[1] = 0x0B0A0908;
> +      vshuf_mask.i[2] = 0x03020100;
> +      vshuf_mask.i[3] = 0x0F0E0D0C;
> +#else
> +      vshuf_mask.i[0] = 0x00010203;
> +      vshuf_mask.i[1] = 0x0C0D0E0F;
> +      vshuf_mask.i[2] = 0x04050607;
> +      vshuf_mask.i[3] = 0x08090A0B;
> +#endif
> +
> +      /* vertex x coords */
> +      vertx = vec_loadu_si128((const uint32_t *) position->x);
> +      /* vertex y coords */
> +      verty = vec_loadu_si128((const uint32_t *) position->y);
> +
> +      shufx = vec_perm (vertx, vertx, vshuf_mask.m128i);
> +      shufy = vec_perm (verty, verty, vshuf_mask.m128i);
> +
> +      dcdx = vec_sub_epi32(verty, shufy);
> +      dcdy = vec_sub_epi32(vertx, shufx);
> +
> +      dcdx_neg_mask = vec_srai_epi32(dcdx, 31);
> +      dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero);
> +      dcdy_neg_mask = vec_srai_epi32(dcdy, 31);
> +
> +      bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0;
> +      top_left_flag = (__m128i) vec_splats(bottom_edge);
> +
> +      c_inc_mask = vec_or(dcdx_neg_mask,
> +                                vec_and(dcdx_zero_mask,
> +                                              vec_xor(dcdy_neg_mask,
> +                                                            top_left_flag)));
> +
> +      c_inc = vec_srli_epi32(c_inc_mask, 31);
> +
> +      c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx),
> +                        vec_mullo_epi32(dcdy, verty));
> +
> +      c = vec_add_epi32(c, c_inc);
> +
> +      /* Scale up to match c:
> +       */
> +      dcdx = vec_slli_epi32(dcdx, FIXED_ORDER);
> +      dcdy = vec_slli_epi32(dcdy, FIXED_ORDER);
> +
> +      /* Calculate trivial reject values:
> +       */
> +      eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
> +                         vec_and(dcdx_neg_mask, dcdx));
> +
> +      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
> +
> +      /* Pointless transpose which gets undone immediately in
> +       * rasterization:
> +       */
> +      transpose4_epi32(&c, &dcdx, &dcdy, &eo,
> +                       &p0, &p1, &p2, &unused);
> +
> +#define STORE_PLANE(plane, vec) do {                  \
> +         vec_store_si128((uint32_t *)&temp_vec, vec); \
> +         plane.c    = (int64_t)temp_vec[0];           \
> +         plane.dcdx = temp_vec[1];                    \
> +         plane.dcdy = temp_vec[2];                    \
> +         plane.eo   = temp_vec[3];                    \
> +      } while(0)
> +
> +      STORE_PLANE(plane[0], p0);
> +      STORE_PLANE(plane[1], p1);
> +      STORE_PLANE(plane[2], p2);
> +#undef STORE_PLANE
> +   } else
>  #endif
>     {
>        int i;
>