[Mesa-dev] [PATCH 1/3] llvmpipe: use vector loads for (optimized) tri raster funcs
Brian Paul
brianp at vmware.com
Mon Feb 1 16:14:46 UTC 2016
On 01/31/2016 06:00 PM, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> When we switched to 64bit rasterization, we could no longer use straight
> aligned loads for loading the plane data. However, what the code actually
> does for loading 3 planes, is 12 scalar loads + 9 unpacks, and then there's
> another 8 unpacks for the transpose we need (!).
> It would be possible to do the (scalar) loads of course already transposed
> (at least saving the additional unpacks), however instead juse use
"just"
> (un)aligned vector loads, and recalculate the eo values, which is much less
> instructions (note in case of the triangle_32_3_4 case, the eo values are
> not even used, making the scalar loads + unpacks for them all the more
> pointless).
> This drops execution time of the triangle_32_3_4 function considerably,
> albeit it doesn't really make a measurable difference (for small tris we're
> essentially limited by vertex throughput in any case), for triangle_32_3_16
> it's essentially noise (the loop is more costly than the initial code there).
> (I'm thinking about just ditching storing the eo values in the plane data,
> so could switch back to using aligned planes, however right now they are
> still used in the other raster functions dealing with planes with scalar
> code. Also not touching the ppc code, might not be that bad there in any
> case.)
Would you mind putting a blank line between paragraphs to ease
readability? Same for 3/3.
Otherwise, these look good AFAICT.
Reviewed-by: Brian Paul <brianp at vmware.com>
> ---
> src/gallium/drivers/llvmpipe/lp_rast.h | 13 --------
> src/gallium/drivers/llvmpipe/lp_rast_tri.c | 48 +++++++++++++++---------------
> 2 files changed, 24 insertions(+), 37 deletions(-)
>
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
> index db45cbb..34008e1 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast.h
> +++ b/src/gallium/drivers/llvmpipe/lp_rast.h
> @@ -308,17 +308,4 @@ void
> lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
>
>
> -#ifdef PIPE_ARCH_SSE
> -#include <emmintrin.h>
> -#include "util/u_sse.h"
> -
> -static inline __m128i
> -lp_plane_to_m128i(const struct lp_rast_plane *plane)
> -{
> - return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
> - (int32_t)plane->dcdy, (int32_t)plane->eo);
> -}
> -
> -#endif
> -
> #endif
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> index 0ae6ec2..f4a2f02 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> @@ -239,7 +239,7 @@ sign_bits4(const __m128i *cstep, int cdiff)
>
> void
> lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
> - const union lp_rast_cmd_arg arg)
> + const union lp_rast_cmd_arg arg)
> {
> const struct lp_rast_triangle *tri = arg.triangle.tri;
> const struct lp_rast_plane *plane = GET_PLANES(tri);
> @@ -250,26 +250,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
> struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
> unsigned nr = 0;
>
> - __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
> - __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
> - __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
> + /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
> + __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
> + __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
> + __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
> __m128i zero = _mm_setzero_si128();
>
> - __m128i c;
> - __m128i dcdx;
> - __m128i dcdy;
> - __m128i rej4;
> -
> - __m128i dcdx2;
> - __m128i dcdx3;
> + __m128i c, dcdx, dcdy, rej4;
> + __m128i dcdx_neg_mask, dcdy_neg_mask;
> + __m128i dcdx2, dcdx3;
>
> __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */
> __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */
> __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */
> __m128i unused;
> -
> +
> transpose4_epi32(&p0, &p1, &p2, &zero,
> - &c, &dcdx, &dcdy, &rej4);
> + &c, &unused, &dcdx, &dcdy);
> +
> + /* recalc eo - easier than trying to load as scalars / shuffle... */
> + dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
> + dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
> + rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
> + _mm_and_si128(dcdx_neg_mask, dcdx));
>
> /* Adjust dcdx;
> */
> @@ -349,32 +352,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
>
> void
> lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
> - const union lp_rast_cmd_arg arg)
> + const union lp_rast_cmd_arg arg)
> {
> const struct lp_rast_triangle *tri = arg.triangle.tri;
> const struct lp_rast_plane *plane = GET_PLANES(tri);
> unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
> unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
>
> - __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
> - __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
> - __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
> + /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
> + __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
> + __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
> + __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
> __m128i zero = _mm_setzero_si128();
>
> - __m128i c;
> - __m128i dcdx;
> - __m128i dcdy;
> + __m128i c, dcdx, dcdy;
> + __m128i dcdx2, dcdx3;
>
> - __m128i dcdx2;
> - __m128i dcdx3;
> -
> __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */
> __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */
> __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */
> __m128i unused;
>
> transpose4_epi32(&p0, &p1, &p2, &zero,
> - &c, &dcdx, &dcdy, &unused);
> + &c, &unused, &dcdx, &dcdy);
>
> /* Adjust dcdx;
> */
>
More information about the mesa-dev
mailing list