[Mesa-dev] [PATCH 1/3] llvmpipe: use vector loads for (optimized) tri raster funcs

Tue Mar 15 01:37:44 UTC 2016

On Mon, Feb 1, 2016 at 8:14 AM, Brian Paul <brianp at vmware.com> wrote:
> On 01/31/2016 06:00 PM, sroland at vmware.com wrote:
>>
>> From: Roland Scheidegger <sroland at vmware.com>
>>
>> When we switched to 64bit rasterization, we could no longer use straight
>> aligned loads for loading the plane data. However, what the code actually
>> does for loading 3 planes, is 12 scalar loads + 9 unpacks, and then
>> there's
>> another 8 unpacks for the transpose we need (!).
>> It would be possible to do the (scalar) loads of course already transposed
>> (at least saving the additional unpacks), however instead juse use
>
>
> "just"
>
>> (un)aligned vector loads, and recalculate the eo values, which is much
>> less
>> instructions (note in case of the triangle_32_3_4 case, the eo values are
>> not even used, making the scalar loads + unpacks for them all the more
>> pointless).
>> This drops execution time of the triangle_32_3_4 function considerably,
>> albeit it doesn't really make a measurable difference (for small tris
>> we're
>> essentially limited by vertex throughput in any case), for
>> triangle_32_3_16
>> it's essentially noise (the loop is more costly than the initial code
>> there).
>> (I'm thinking about just ditching storing the eo values in the plane data,
>> so could switch back to using aligned planes, however right now they are
>> still used in the other raster functions dealing with planes with scalar
>> code. Also not touching the ppc code, might not be that bad there in any
>> case.)
>
>
> Would you mind putting a blank line between paragraphs to ease readability?
> Same for 3/3.
>
> Otherwise, these look good AFAICT.
> Reviewed-by: Brian Paul <brianp at vmware.com>
>
>
>
>> ---
>>   src/gallium/drivers/llvmpipe/lp_rast.h     | 13 --------
>>   src/gallium/drivers/llvmpipe/lp_rast_tri.c | 48
>> +++++++++++++++---------------
>>   2 files changed, 24 insertions(+), 37 deletions(-)
>>
>> diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h
>> b/src/gallium/drivers/llvmpipe/lp_rast.h
>> index db45cbb..34008e1 100644
>> --- a/src/gallium/drivers/llvmpipe/lp_rast.h
>> +++ b/src/gallium/drivers/llvmpipe/lp_rast.h
>> @@ -308,17 +308,4 @@ void
>>   lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
>>
>>
>> -#ifdef PIPE_ARCH_SSE
>> -#include <emmintrin.h>
>> -#include "util/u_sse.h"
>> -
>> -static inline __m128i
>> -lp_plane_to_m128i(const struct lp_rast_plane *plane)
>> -{
>> -   return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
>> -                         (int32_t)plane->dcdy, (int32_t)plane->eo);
>> -}
>> -
>> -#endif
>> -
>>   #endif
>> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
>> b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
>> index 0ae6ec2..f4a2f02 100644
>> --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
>> +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
>> @@ -239,7 +239,7 @@ sign_bits4(const __m128i *cstep, int cdiff)
>>
>>   void
>>   lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
>> -                      const union lp_rast_cmd_arg arg)
>> +                         const union lp_rast_cmd_arg arg)
>>   {
>>      const struct lp_rast_triangle *tri = arg.triangle.tri;
>>      const struct lp_rast_plane *plane = GET_PLANES(tri);
>> @@ -250,26 +250,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task
>> *task,
>>      struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
>>      unsigned nr = 0;
>>
>> -   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
>> -   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
>> -   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
>> +   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
>> +   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx,
>> dcdy */
>> +   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
>> +   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
>>      __m128i zero = _mm_setzero_si128();
>>
>> -   __m128i c;
>> -   __m128i dcdx;
>> -   __m128i dcdy;
>> -   __m128i rej4;
>> -
>> -   __m128i dcdx2;
>> -   __m128i dcdx3;
>> +   __m128i c, dcdx, dcdy, rej4;
>> +   __m128i dcdx_neg_mask, dcdy_neg_mask;
>> +   __m128i dcdx2, dcdx3;
>>
>>      __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
>>      __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
>>      __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
>>      __m128i unused;
>> -
>> +
>>      transpose4_epi32(&p0, &p1, &p2, &zero,
>> -                    &c, &dcdx, &dcdy, &rej4);
>> +                    &c, &unused, &dcdx, &dcdy);
>> +
>> +   /* recalc eo - easier than trying to load as scalars / shuffle... */
>> +   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
>> +   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
>> +   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
>> +                        _mm_and_si128(dcdx_neg_mask, dcdx));
>>
>>      /* Adjust dcdx;
>>       */
>> @@ -349,32 +352,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task
>> *task,
>>
>>   void
>>   lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
>> -                     const union lp_rast_cmd_arg arg)
>> +                        const union lp_rast_cmd_arg arg)
>>   {
>>      const struct lp_rast_triangle *tri = arg.triangle.tri;
>>      const struct lp_rast_plane *plane = GET_PLANES(tri);
>>      unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
>>      unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
>>
>> -   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
>> -   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
>> -   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
>> +   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */

Yes p0 and p2 are aligned inside "plane", but there is no guarantee
that "plane" itself is aligned to a multiple of 8 (that's only true on
64 bit). On 32 bit systems this causes crashes.

Stéphane

>> +   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx,
>> dcdy */
>> +   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
>> +   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
>>      __m128i zero = _mm_setzero_si128();
>>
>> -   __m128i c;
>> -   __m128i dcdx;
>> -   __m128i dcdy;
>> +   __m128i c, dcdx, dcdy;
>> +   __m128i dcdx2, dcdx3;
>>
>> -   __m128i dcdx2;
>> -   __m128i dcdx3;
>> -
>>      __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
>>      __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
>>      __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
>>      __m128i unused;
>>
>>      transpose4_epi32(&p0, &p1, &p2, &zero,
>> -                    &c, &dcdx, &dcdy, &unused);
>> +                    &c, &unused, &dcdx, &dcdy);
>>
>>      /* Adjust dcdx;
>>       */
>>
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev