[Mesa-dev] [PATCH 3/5] llvmpipe: Optimize do_triangle_ccw for POWER8

Roland Scheidegger sroland at vmware.com
Wed Dec 30 18:13:45 PST 2015


Am 30.12.2015 um 10:59 schrieb Oded Gabbay:
> On Wed, Dec 30, 2015 at 1:17 AM, Roland Scheidegger <sroland at vmware.com> wrote:
>> The idea looks right to me.
>> Though frankly I don't like our current setup code too much - in
>> particular the mix between c, assembly, and jit code, with some
>> duplication (plus the lots of transpose everywhere). There's likely
>> optimization potential to be found there.
>>
>> Roland
> 
> Agreed, if we can remove the transpose, that would be some 3-5% boost for sure.
> 

I actually forgot about this, but I'm really surprised you could measure
a difference as this code (but not the one in rasterization) is
effectively disabled in practice ever since the switch to 8 bit subpixel
precision (as the max fb size allowed now is really tiny, 128x128...)
Unless you were playing with those values...
Should have probably added some XXX comment back then...

Roland


> 
>>
>> Am 29.12.2015 um 17:12 schrieb Oded Gabbay:
>>> This patch converts the SSE optimization done in do_triangle_ccw to
>>> VMX/VSX.
>>>
>>> I measured the results on POWER8 machine with 32 cores at 3.4GHz and
>>> 16GB of RAM.
>>>
>>>                       FPS/Score
>>>   Name            Before     After    Delta
>>> ------------------------------------------------
>>> glmark2 (score)   136.6      139.8    2.34%
>>> openarena         16.14      16.35    1.30%
>>> xonotic           4.655      4.707    1.11%
>>>
>>> Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
>>> ---
>>>  src/gallium/drivers/llvmpipe/lp_setup_tri.c | 96 +++++++++++++++++++++++++++++
>>>  1 file changed, 96 insertions(+)
>>>
>>> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
>>> index b1671dd..cfa9874 100644
>>> --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
>>> +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
>>> @@ -46,6 +46,9 @@
>>>
>>>  #if defined(PIPE_ARCH_SSE)
>>>  #include <emmintrin.h>
>>> +#elif defined(_ARCH_PWR8)
>>> +#include <altivec.h>
>>> +#include "util/u_pwr8.h"
>>>  #endif
>>>
>>>  static inline int
>>> @@ -462,6 +465,99 @@ do_triangle_ccw(struct lp_setup_context *setup,
>>>        STORE_PLANE(plane[2], p2);
>>>  #undef STORE_PLANE
>>>     } else
>>> +#elif defined(_ARCH_PWR8)
>>> +   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
>>> +       setup->fb.height <= MAX_FIXED_LENGTH32 &&
>>> +       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
>>> +       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
>>> +      unsigned int bottom_edge;
>>> +      __m128i vertx, verty;
>>> +      __m128i shufx, shufy;
>>> +      __m128i dcdx, dcdy, c;
>>> +      __m128i unused;
>>> +      __m128i dcdx_neg_mask;
>>> +      __m128i dcdy_neg_mask;
>>> +      __m128i dcdx_zero_mask;
>>> +      __m128i top_left_flag;
>>> +      __m128i c_inc_mask, c_inc;
>>> +      __m128i eo, p0, p1, p2;
>>> +      __m128i_union vshuf_mask;
>>> +      __m128i zero = vec_splats((unsigned char) 0);
>>> +      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
>>> +
>>> +#ifdef PIPE_ARCH_LITTLE_ENDIAN
>>> +      vshuf_mask.i[0] = 0x07060504;
>>> +      vshuf_mask.i[1] = 0x0B0A0908;
>>> +      vshuf_mask.i[2] = 0x03020100;
>>> +      vshuf_mask.i[3] = 0x0F0E0D0C;
>>> +#else
>>> +      vshuf_mask.i[0] = 0x00010203;
>>> +      vshuf_mask.i[1] = 0x0C0D0E0F;
>>> +      vshuf_mask.i[2] = 0x04050607;
>>> +      vshuf_mask.i[3] = 0x08090A0B;
>>> +#endif
>>> +
>>> +      /* vertex x coords */
>>> +      vertx = vec_loadu_si128((const uint32_t *) position->x);
>>> +      /* vertex y coords */
>>> +      verty = vec_loadu_si128((const uint32_t *) position->y);
>>> +
>>> +      shufx = vec_perm (vertx, vertx, vshuf_mask.m128i);
>>> +      shufy = vec_perm (verty, verty, vshuf_mask.m128i);
>>> +
>>> +      dcdx = vec_sub_epi32(verty, shufy);
>>> +      dcdy = vec_sub_epi32(vertx, shufx);
>>> +
>>> +      dcdx_neg_mask = vec_srai_epi32(dcdx, 31);
>>> +      dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero);
>>> +      dcdy_neg_mask = vec_srai_epi32(dcdy, 31);
>>> +
>>> +      bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0;
>>> +      top_left_flag = (__m128i) vec_splats(bottom_edge);
>>> +
>>> +      c_inc_mask = vec_or(dcdx_neg_mask,
>>> +                                vec_and(dcdx_zero_mask,
>>> +                                              vec_xor(dcdy_neg_mask,
>>> +                                                            top_left_flag)));
>>> +
>>> +      c_inc = vec_srli_epi32(c_inc_mask, 31);
>>> +
>>> +      c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx),
>>> +                        vec_mullo_epi32(dcdy, verty));
>>> +
>>> +      c = vec_add_epi32(c, c_inc);
>>> +
>>> +      /* Scale up to match c:
>>> +       */
>>> +      dcdx = vec_slli_epi32(dcdx, FIXED_ORDER);
>>> +      dcdy = vec_slli_epi32(dcdy, FIXED_ORDER);
>>> +
>>> +      /* Calculate trivial reject values:
>>> +       */
>>> +      eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
>>> +                         vec_and(dcdx_neg_mask, dcdx));
>>> +
>>> +      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
>>> +
>>> +      /* Pointless transpose which gets undone immediately in
>>> +       * rasterization:
>>> +       */
>>> +      transpose4_epi32(&c, &dcdx, &dcdy, &eo,
>>> +                       &p0, &p1, &p2, &unused);
>>> +
>>> +#define STORE_PLANE(plane, vec) do {                  \
>>> +         vec_store_si128((uint32_t *)&temp_vec, vec); \
>>> +         plane.c    = (int64_t)temp_vec[0];           \
>>> +         plane.dcdx = temp_vec[1];                    \
>>> +         plane.dcdy = temp_vec[2];                    \
>>> +         plane.eo   = temp_vec[3];                    \
>>> +      } while(0)
>>> +
>>> +      STORE_PLANE(plane[0], p0);
>>> +      STORE_PLANE(plane[1], p1);
>>> +      STORE_PLANE(plane[2], p2);
>>> +#undef STORE_PLANE
>>> +   } else
>>>  #endif
>>>     {
>>>        int i;
>>>
>>



More information about the mesa-dev mailing list