Mesa (llvmpipe-rast-64): llvmpipe: add support for 32bit optimized sse paths

Zack Rusin zack at kemper.freedesktop.org
Tue Nov 19 21:04:25 UTC 2013


Module: Mesa
Branch: llvmpipe-rast-64
Commit: 98ecab3785a4dd742f7a519d6eeeb3e0a53bc71f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=98ecab3785a4dd742f7a519d6eeeb3e0a53bc71f

Author: Zack Rusin <zackr at vmware.com>
Date:   Tue Nov 19 16:02:34 2013 -0500

llvmpipe: add support for 32bit optimized sse paths

If the triangle is bounded by a 128x128 box then we can
use only 32-bit arithmetic, which is nice because the SSE
paths for it can be nicely optimized.

---

 src/gallium/drivers/llvmpipe/lp_rast.c         |   11 ++
 src/gallium/drivers/llvmpipe/lp_rast.h         |   33 ++++++-
 src/gallium/drivers/llvmpipe/lp_rast_priv.h    |   27 ++++++
 src/gallium/drivers/llvmpipe/lp_rast_tri.c     |  120 ++++++++++++++++++------
 src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h |    6 +-
 src/gallium/drivers/llvmpipe/lp_setup_tri.c    |   35 ++++++-
 6 files changed, 191 insertions(+), 41 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index af661e9..0cd62c2 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -589,6 +589,17 @@ static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] =
    lp_rast_begin_query,
    lp_rast_end_query,
    lp_rast_set_state,
+   lp_rast_triangle_32_1,
+   lp_rast_triangle_32_2,
+   lp_rast_triangle_32_3,
+   lp_rast_triangle_32_4,
+   lp_rast_triangle_32_5,
+   lp_rast_triangle_32_6,
+   lp_rast_triangle_32_7,
+   lp_rast_triangle_32_8,
+   lp_rast_triangle_32_3_4,
+   lp_rast_triangle_32_3_16,
+   lp_rast_triangle_32_4_16
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index ce60665..b81d94f 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -60,6 +60,8 @@ struct cmd_bin;
  */
 #define MAX_FIXED_LENGTH (1 << (((FIXED_TYPE_WIDTH/2) - 1) - FIXED_ORDER))
 
+#define MAX_FIXED_LENGTH32 (1 << (((32/2) - 1) - FIXED_ORDER))
+
 /* Rasterizer output size going to jit fs, width/height */
 #define LP_RASTER_BLOCK_SIZE 4
 
@@ -104,9 +106,6 @@ struct lp_rast_shader_inputs {
    /* followed by a0, dadx, dady and planes[] */
 };
 
-/* Note: the order of these values is important as they are loaded by
- * sse code in rasterization:
- */
 struct lp_rast_plane {
    /* edge function values at minx,miny ?? */
    int64_t c;
@@ -279,8 +278,19 @@ lp_rast_arg_null( void )
 #define LP_RAST_OP_BEGIN_QUERY       0xf
 #define LP_RAST_OP_END_QUERY         0x10
 #define LP_RAST_OP_SET_STATE         0x11
-
-#define LP_RAST_OP_MAX               0x12
+#define LP_RAST_OP_TRIANGLE_32_1     0x12
+#define LP_RAST_OP_TRIANGLE_32_2     0x13
+#define LP_RAST_OP_TRIANGLE_32_3     0x14
+#define LP_RAST_OP_TRIANGLE_32_4     0x15
+#define LP_RAST_OP_TRIANGLE_32_5     0x16
+#define LP_RAST_OP_TRIANGLE_32_6     0x17
+#define LP_RAST_OP_TRIANGLE_32_7     0x18
+#define LP_RAST_OP_TRIANGLE_32_8     0x19
+#define LP_RAST_OP_TRIANGLE_32_3_4   0x1a
+#define LP_RAST_OP_TRIANGLE_32_3_16  0x1b
+#define LP_RAST_OP_TRIANGLE_32_4_16  0x1c
+
+#define LP_RAST_OP_MAX               0x1d
 #define LP_RAST_OP_MASK              0xff
 
 void
@@ -291,4 +301,17 @@ void
 lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
 
 
+#ifdef PIPE_ARCH_SSE
+#include <emmintrin.h>
+#include "util/u_sse.h"
+
+static INLINE __m128i
+lp_plane_to_m128i(const struct lp_rast_plane *plane)
+{
+   return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
+                         (int32_t)plane->dcdy, (int32_t)plane->eo);
+}
+
+#endif
+
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index 41fe097..77ec329 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -355,6 +355,33 @@ void lp_rast_triangle_3_16( struct lp_rasterizer_task *,
 void lp_rast_triangle_4_16( struct lp_rasterizer_task *, 
                             const union lp_rast_cmd_arg );
 
+
+void lp_rast_triangle_32_1( struct lp_rasterizer_task *, 
+                         const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_2( struct lp_rasterizer_task *, 
+                         const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_3( struct lp_rasterizer_task *, 
+                         const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_4( struct lp_rasterizer_task *, 
+                         const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_5( struct lp_rasterizer_task *, 
+                         const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_6( struct lp_rasterizer_task *, 
+                         const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_7( struct lp_rasterizer_task *, 
+                         const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_8( struct lp_rasterizer_task *, 
+                         const union lp_rast_cmd_arg );
+
+void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *,
+			  const union lp_rast_cmd_arg );
+
+void lp_rast_triangle_32_3_16( struct lp_rasterizer_task *, 
+                            const union lp_rast_cmd_arg );
+
+void lp_rast_triangle_32_4_16( struct lp_rasterizer_task *, 
+                            const union lp_rast_cmd_arg );
+
 void
 lp_rast_set_state(struct lp_rasterizer_task *task,
                   const union lp_rast_cmd_arg arg);
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 8ddda5e..41f6fbf 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -35,12 +35,6 @@
 #include "lp_perf.h"
 #include "lp_rast_priv.h"
 
-/* TODO */
-#undef PIPE_ARCH_SSE
-
-
-
-
 /**
  * Shade all pixels in a 4x4 block.
  */
@@ -69,8 +63,6 @@ block_full_16(struct lp_rasterizer_task *task,
 	 block_full_4(task, tri, x + ix, y + iy);
 }
 
-#if !defined(PIPE_ARCH_SSE)
-
 static INLINE unsigned
 build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
 {
@@ -125,6 +117,13 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
 }
 
 void
+lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   lp_rast_triangle_3_16(task, arg);
+}
+
+void
 lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
 {
@@ -134,11 +133,33 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
    lp_rast_triangle_4(task, arg2);
 }
 
+#if !defined(PIPE_ARCH_SSE)
+
 void
-lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+                         const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<3)-1;
+   lp_rast_triangle_32_3(task, arg2);
+}
+
+void
+lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
+                         const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<4)-1;
+   lp_rast_triangle_32_4(task, arg2);
+}
+
+void
+lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
 {
-   lp_rast_triangle_3_16(task, arg);
+   lp_rast_triangle_32_3_16(task, arg);
 }
 
 #else
@@ -147,12 +168,12 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
 
 
 static INLINE void
-build_masks(int c, 
-	    int cdiff,
-	    int dcdx,
-	    int dcdy,
-	    unsigned *outmask,
-	    unsigned *partmask)
+build_masks_32(int c, 
+               int cdiff,
+               int dcdx,
+               int dcdy,
+               unsigned *outmask,
+               unsigned *partmask)
 {
    __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
    __m128i xdcdy = _mm_set1_epi32(dcdy);
@@ -193,7 +214,7 @@ build_masks(int c,
 
 
 static INLINE unsigned
-build_mask_linear(int c, int dcdx, int dcdy)
+build_mask_linear_32(int c, int dcdx, int dcdy)
 {
    __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
    __m128i xdcdy = _mm_set1_epi32(dcdy);
@@ -251,7 +272,7 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 
 void
-lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
@@ -263,9 +284,9 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
    struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
    unsigned nr = 0;
 
-   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
-   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
-   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
    __m128i zero = _mm_setzero_si128();
 
    __m128i c;
@@ -365,7 +386,7 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
 
 
 void
-lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
+lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
@@ -373,9 +394,9 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
    unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
    unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
 
-   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
-   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
-   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
    __m128i zero = _mm_setzero_si128();
 
    __m128i c;
@@ -453,7 +474,8 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
 #endif
 
 
-
+#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
+#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
 
 #define TAG(x) x##_1
 #define NR_PLANES 1
@@ -471,7 +493,7 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
 
 #define TAG(x) x##_4
 #define NR_PLANES 4
-#define TRI_16 lp_rast_triangle_4_16
+/*#define TRI_16 lp_rast_triangle_4_16*/
 #include "lp_rast_tri_tmp.h"
 
 #define TAG(x) x##_5
@@ -490,3 +512,47 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
 #define NR_PLANES 8
 #include "lp_rast_tri_tmp.h"
 
+#ifdef PIPE_ARCH_SSE
+#undef BUILD_MASKS
+#undef BUILD_MASK_LINEAR
+#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
+#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_32((int)c, dcdx, dcdy)
+#endif
+
+#define TAG(x) x##_32_1
+#define NR_PLANES 1
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_2
+#define NR_PLANES 2
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_3
+#define NR_PLANES 3
+/*#define TRI_4 lp_rast_triangle_3_4*/
+/*#define TRI_16 lp_rast_triangle_3_16*/
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_4
+#define NR_PLANES 4
+#ifdef PIPE_ARCH_SSE
+#define TRI_16 lp_rast_triangle_32_4_16
+#endif
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_5
+#define NR_PLANES 5
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_6
+#define NR_PLANES 6
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_7
+#define NR_PLANES 7
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_8
+#define NR_PLANES 8
+#include "lp_rast_tri_tmp.h"
+
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index a00cbb2..52f6e99 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -50,7 +50,7 @@ TAG(do_block_4)(struct lp_rasterizer_task *task,
    int j;
 
    for (j = 0; j < NR_PLANES; j++) {
-      mask &= ~build_mask_linear(c[j] - 1, 
+      mask &= ~BUILD_MASK_LINEAR(c[j] - 1, 
 				 -plane[j].dcdx,
 				 plane[j].dcdy);
    }
@@ -85,7 +85,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
       const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
       const int64_t cio = IMUL64(ei, 4) - 1;
 
-      build_masks(c[j] + cox,
+      BUILD_MASKS(c[j] + cox,
 		  cio - cox,
 		  dcdx, dcdy, 
 		  &outmask,   /* sign bits from c[i][0..15] + cox */
@@ -185,7 +185,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
          const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
          const int64_t cio = IMUL64(ei, 16) - 1;
 
-         build_masks(c[j] + cox,
+         BUILD_MASKS(c[j] + cox,
                      cio - cox,
                      dcdx, dcdy,
                      &outmask,   /* sign bits from c[i][0..15] + cox */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 2a77987..62d2855 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -190,6 +190,19 @@ lp_rast_tri_tab[MAX_PLANES+1] = {
    LP_RAST_OP_TRIANGLE_8
 };
 
+static unsigned
+lp_rast_32_tri_tab[MAX_PLANES+1] = {
+   0,               /* should be impossible */
+   LP_RAST_OP_TRIANGLE_32_1,
+   LP_RAST_OP_TRIANGLE_32_2,
+   LP_RAST_OP_TRIANGLE_32_3,
+   LP_RAST_OP_TRIANGLE_32_4,
+   LP_RAST_OP_TRIANGLE_32_5,
+   LP_RAST_OP_TRIANGLE_32_6,
+   LP_RAST_OP_TRIANGLE_32_7,
+   LP_RAST_OP_TRIANGLE_32_8
+};
+
 
 
 /**
@@ -586,7 +599,6 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
    struct lp_scene *scene = setup->scene;
    struct u_rect trimmed_box = *bbox;   
    int i;
-
    /* What is the largest power-of-two boundary this triangle crosses:
     */
    int dx = floor_pot((bbox->x0 ^ bbox->x1) |
@@ -595,8 +607,10 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
    /* The largest dimension of the rasterized area of the triangle
     * (aligned to a 4x4 grid), rounded down to the nearest power of two:
     */
-   int sz = floor_pot((bbox->x1 - (bbox->x0 & ~3)) |
-		      (bbox->y1 - (bbox->y0 & ~3)));
+   int max_sz = ((bbox->x1 - (bbox->x0 & ~3)) |
+                 (bbox->y1 - (bbox->y0 & ~3)));
+   int sz = floor_pot(max_sz);
+   boolean use_32bits = max_sz <= MAX_FIXED_LENGTH32;
 
    /* Now apply scissor, etc to the bounding box.  Could do this
     * earlier, but it confuses the logic for tri-16 and would force
@@ -627,6 +641,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
             assert(py + 4 <= TILE_SIZE);
             return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
                                                 setup->fs.stored,
+                                                use_32bits ?
+                                                LP_RAST_OP_TRIANGLE_32_3_4 :
                                                 LP_RAST_OP_TRIANGLE_3_4,
                                                 lp_rast_arg_triangle_contained(tri, px, py) );
          }
@@ -649,6 +665,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
 
             return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
                                                 setup->fs.stored,
+                                                use_32bits ?
+                                                LP_RAST_OP_TRIANGLE_32_3_16 :
                                                 LP_RAST_OP_TRIANGLE_3_16,
                                                 lp_rast_arg_triangle_contained(tri, px, py) );
          }
@@ -663,6 +681,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
 
          return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
                                             setup->fs.stored,
+                                            use_32bits ?
+                                            LP_RAST_OP_TRIANGLE_32_4_16 :
                                             LP_RAST_OP_TRIANGLE_4_16,
                                             lp_rast_arg_triangle_contained(tri, px, py));
       }
@@ -670,9 +690,10 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
 
       /* Triangle is contained in a single tile:
        */
-      return lp_scene_bin_cmd_with_state( scene, ix0, iy0, setup->fs.stored,
-                                          lp_rast_tri_tab[nr_planes], 
-                                          lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );
+      return lp_scene_bin_cmd_with_state(
+         scene, ix0, iy0, setup->fs.stored,
+         use_32bits ? lp_rast_32_tri_tab[nr_planes] : lp_rast_tri_tab[nr_planes],
+         lp_rast_arg_triangle(tri, (1<<nr_planes)-1));
    }
    else
    {
@@ -746,6 +767,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                
                if (!lp_scene_bin_cmd_with_state( scene, x, y,
                                                  setup->fs.stored,
+                                                 use_32bits ?
+                                                 lp_rast_32_tri_tab[count] :
                                                  lp_rast_tri_tab[count],
                                                  lp_rast_arg_triangle(tri, partial) ))
                   goto fail;




More information about the mesa-commit mailing list