[Mesa-dev] [PATCH 1/3] llvmpipe: use vector loads for (optimized) tri raster funcs

Mon Feb 1 16:14:46 UTC 2016

On 01/31/2016 06:00 PM, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> When we switched to 64bit rasterization, we could no longer use straight
> aligned loads for loading the plane data. However, what the code actually
> does for loading 3 planes, is 12 scalar loads + 9 unpacks, and then there's
> another 8 unpacks for the transpose we need (!).
> It would be possible to do the (scalar) loads of course already transposed
> (at least saving the additional unpacks), however instead juse use

"just"

> (un)aligned vector loads, and recalculate the eo values, which is much less
> instructions (note in case of the triangle_32_3_4 case, the eo values are
> not even used, making the scalar loads + unpacks for them all the more
> pointless).
> This drops execution time of the triangle_32_3_4 function considerably,
> albeit it doesn't really make a measurable difference (for small tris we're
> essentially limited by vertex throughput in any case), for triangle_32_3_16
> it's essentially noise (the loop is more costly than the initial code there).
> (I'm thinking about just ditching storing the eo values in the plane data,
> so could switch back to using aligned planes, however right now they are
> still used in the other raster functions dealing with planes with scalar
> code. Also not touching the ppc code, might not be that bad there in any
> case.)

Would you mind putting a blank line between paragraphs to ease 
readability?  Same for 3/3.

Otherwise, these look good AFAICT.
Reviewed-by: Brian Paul <brianp at vmware.com>

> ---
>   src/gallium/drivers/llvmpipe/lp_rast.h     | 13 --------
>   src/gallium/drivers/llvmpipe/lp_rast_tri.c | 48 +++++++++++++++---------------
>   2 files changed, 24 insertions(+), 37 deletions(-)
>
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
> index db45cbb..34008e1 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast.h
> +++ b/src/gallium/drivers/llvmpipe/lp_rast.h
> @@ -308,17 +308,4 @@ void
>   lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
>
>
> -#ifdef PIPE_ARCH_SSE
> -#include <emmintrin.h>
> -#include "util/u_sse.h"
> -
> -static inline __m128i
> -lp_plane_to_m128i(const struct lp_rast_plane *plane)
> -{
> -   return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
> -                         (int32_t)plane->dcdy, (int32_t)plane->eo);
> -}
> -
> -#endif
> -
>   #endif
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> index 0ae6ec2..f4a2f02 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> @@ -239,7 +239,7 @@ sign_bits4(const __m128i *cstep, int cdiff)
>
>   void
>   lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
> -                      const union lp_rast_cmd_arg arg)
> +                         const union lp_rast_cmd_arg arg)
>   {
>      const struct lp_rast_triangle *tri = arg.triangle.tri;
>      const struct lp_rast_plane *plane = GET_PLANES(tri);
> @@ -250,26 +250,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
>      struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
>      unsigned nr = 0;
>
> -   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
> -   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
> -   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
> +   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
> +   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
> +   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
> +   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
>      __m128i zero = _mm_setzero_si128();
>
> -   __m128i c;
> -   __m128i dcdx;
> -   __m128i dcdy;
> -   __m128i rej4;
> -
> -   __m128i dcdx2;
> -   __m128i dcdx3;
> +   __m128i c, dcdx, dcdy, rej4;
> +   __m128i dcdx_neg_mask, dcdy_neg_mask;
> +   __m128i dcdx2, dcdx3;
>
>      __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
>      __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
>      __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
>      __m128i unused;
> -
> +
>      transpose4_epi32(&p0, &p1, &p2, &zero,
> -                    &c, &dcdx, &dcdy, &rej4);
> +                    &c, &unused, &dcdx, &dcdy);
> +
> +   /* recalc eo - easier than trying to load as scalars / shuffle... */
> +   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
> +   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
> +   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
> +                        _mm_and_si128(dcdx_neg_mask, dcdx));
>
>      /* Adjust dcdx;
>       */
> @@ -349,32 +352,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
>
>   void
>   lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
> -                     const union lp_rast_cmd_arg arg)
> +                        const union lp_rast_cmd_arg arg)
>   {
>      const struct lp_rast_triangle *tri = arg.triangle.tri;
>      const struct lp_rast_plane *plane = GET_PLANES(tri);
>      unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
>      unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
>
> -   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
> -   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
> -   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
> +   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
> +   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
> +   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
> +   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
>      __m128i zero = _mm_setzero_si128();
>
> -   __m128i c;
> -   __m128i dcdx;
> -   __m128i dcdy;
> +   __m128i c, dcdx, dcdy;
> +   __m128i dcdx2, dcdx3;
>
> -   __m128i dcdx2;
> -   __m128i dcdx3;
> -
>      __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
>      __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
>      __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
>      __m128i unused;
>
>      transpose4_epi32(&p0, &p1, &p2, &zero,
> -                    &c, &dcdx, &dcdy, &unused);
> +                    &c, &unused, &dcdx, &dcdy);
>
>      /* Adjust dcdx;
>       */
>