Mesa (llvmpipe-rast-64): llvmpipe: add support for 32bit optimized sse paths
Zack Rusin
zack at kemper.freedesktop.org
Tue Nov 19 21:04:25 UTC 2013
Module: Mesa
Branch: llvmpipe-rast-64
Commit: 98ecab3785a4dd742f7a519d6eeeb3e0a53bc71f
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=98ecab3785a4dd742f7a519d6eeeb3e0a53bc71f
Author: Zack Rusin <zackr at vmware.com>
Date: Tue Nov 19 16:02:34 2013 -0500
llvmpipe: add support for 32bit optimized sse paths
If the triangle is bounded by a 128x128 box then we can
use only 32-bit arithmetic, which is nice because the SSE
paths for it can be nicely optimized.
---
src/gallium/drivers/llvmpipe/lp_rast.c | 11 ++
src/gallium/drivers/llvmpipe/lp_rast.h | 33 ++++++-
src/gallium/drivers/llvmpipe/lp_rast_priv.h | 27 ++++++
src/gallium/drivers/llvmpipe/lp_rast_tri.c | 120 ++++++++++++++++++------
src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h | 6 +-
src/gallium/drivers/llvmpipe/lp_setup_tri.c | 35 ++++++-
6 files changed, 191 insertions(+), 41 deletions(-)
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index af661e9..0cd62c2 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -589,6 +589,17 @@ static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] =
lp_rast_begin_query,
lp_rast_end_query,
lp_rast_set_state,
+ lp_rast_triangle_32_1,
+ lp_rast_triangle_32_2,
+ lp_rast_triangle_32_3,
+ lp_rast_triangle_32_4,
+ lp_rast_triangle_32_5,
+ lp_rast_triangle_32_6,
+ lp_rast_triangle_32_7,
+ lp_rast_triangle_32_8,
+ lp_rast_triangle_32_3_4,
+ lp_rast_triangle_32_3_16,
+ lp_rast_triangle_32_4_16
};
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index ce60665..b81d94f 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -60,6 +60,8 @@ struct cmd_bin;
*/
#define MAX_FIXED_LENGTH (1 << (((FIXED_TYPE_WIDTH/2) - 1) - FIXED_ORDER))
+#define MAX_FIXED_LENGTH32 (1 << (((32/2) - 1) - FIXED_ORDER))
+
/* Rasterizer output size going to jit fs, width/height */
#define LP_RASTER_BLOCK_SIZE 4
@@ -104,9 +106,6 @@ struct lp_rast_shader_inputs {
/* followed by a0, dadx, dady and planes[] */
};
-/* Note: the order of these values is important as they are loaded by
- * sse code in rasterization:
- */
struct lp_rast_plane {
/* edge function values at minx,miny ?? */
int64_t c;
@@ -279,8 +278,19 @@ lp_rast_arg_null( void )
#define LP_RAST_OP_BEGIN_QUERY 0xf
#define LP_RAST_OP_END_QUERY 0x10
#define LP_RAST_OP_SET_STATE 0x11
-
-#define LP_RAST_OP_MAX 0x12
+#define LP_RAST_OP_TRIANGLE_32_1 0x12
+#define LP_RAST_OP_TRIANGLE_32_2 0x13
+#define LP_RAST_OP_TRIANGLE_32_3 0x14
+#define LP_RAST_OP_TRIANGLE_32_4 0x15
+#define LP_RAST_OP_TRIANGLE_32_5 0x16
+#define LP_RAST_OP_TRIANGLE_32_6 0x17
+#define LP_RAST_OP_TRIANGLE_32_7 0x18
+#define LP_RAST_OP_TRIANGLE_32_8 0x19
+#define LP_RAST_OP_TRIANGLE_32_3_4 0x1a
+#define LP_RAST_OP_TRIANGLE_32_3_16 0x1b
+#define LP_RAST_OP_TRIANGLE_32_4_16 0x1c
+
+#define LP_RAST_OP_MAX 0x1d
#define LP_RAST_OP_MASK 0xff
void
@@ -291,4 +301,17 @@ void
lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
+#ifdef PIPE_ARCH_SSE
+#include <emmintrin.h>
+#include "util/u_sse.h"
+
+static INLINE __m128i
+lp_plane_to_m128i(const struct lp_rast_plane *plane)
+{
+ return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
+ (int32_t)plane->dcdy, (int32_t)plane->eo);
+}
+
+#endif
+
#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index 41fe097..77ec329 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -355,6 +355,33 @@ void lp_rast_triangle_3_16( struct lp_rasterizer_task *,
void lp_rast_triangle_4_16( struct lp_rasterizer_task *,
const union lp_rast_cmd_arg );
+
+void lp_rast_triangle_32_1( struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_2( struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_3( struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_4( struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_5( struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_6( struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_7( struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+void lp_rast_triangle_32_8( struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+
+void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+
+void lp_rast_triangle_32_3_16( struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+
+void lp_rast_triangle_32_4_16( struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+
void
lp_rast_set_state(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg);
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 8ddda5e..41f6fbf 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -35,12 +35,6 @@
#include "lp_perf.h"
#include "lp_rast_priv.h"
-/* TODO */
-#undef PIPE_ARCH_SSE
-
-
-
-
/**
* Shade all pixels in a 4x4 block.
*/
@@ -69,8 +63,6 @@ block_full_16(struct lp_rasterizer_task *task,
block_full_4(task, tri, x + ix, y + iy);
}
-#if !defined(PIPE_ARCH_SSE)
-
static INLINE unsigned
build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
{
@@ -125,6 +117,13 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
}
void
+lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+ lp_rast_triangle_3_16(task, arg);
+}
+
+void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
@@ -134,11 +133,33 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
lp_rast_triangle_4(task, arg2);
}
+#if !defined(PIPE_ARCH_SSE)
+
void
-lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+ union lp_rast_cmd_arg arg2;
+ arg2.triangle.tri = arg.triangle.tri;
+ arg2.triangle.plane_mask = (1<<3)-1;
+ lp_rast_triangle_32_3(task, arg2);
+}
+
+void
+lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+ union lp_rast_cmd_arg arg2;
+ arg2.triangle.tri = arg.triangle.tri;
+ arg2.triangle.plane_mask = (1<<4)-1;
+ lp_rast_triangle_32_4(task, arg2);
+}
+
+void
+lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
- lp_rast_triangle_3_16(task, arg);
+ lp_rast_triangle_32_3_16(task, arg);
}
#else
@@ -147,12 +168,12 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
static INLINE void
-build_masks(int c,
- int cdiff,
- int dcdx,
- int dcdy,
- unsigned *outmask,
- unsigned *partmask)
+build_masks_32(int c,
+ int cdiff,
+ int dcdx,
+ int dcdy,
+ unsigned *outmask,
+ unsigned *partmask)
{
__m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
__m128i xdcdy = _mm_set1_epi32(dcdy);
@@ -193,7 +214,7 @@ build_masks(int c,
static INLINE unsigned
-build_mask_linear(int c, int dcdx, int dcdy)
+build_mask_linear_32(int c, int dcdx, int dcdy)
{
__m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
__m128i xdcdy = _mm_set1_epi32(dcdy);
@@ -251,7 +272,7 @@ sign_bits4(const __m128i *cstep, int cdiff)
void
-lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
const struct lp_rast_triangle *tri = arg.triangle.tri;
@@ -263,9 +284,9 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
unsigned nr = 0;
- __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
- __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
- __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+ __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
+ __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
+ __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
__m128i zero = _mm_setzero_si128();
__m128i c;
@@ -365,7 +386,7 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
void
-lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
+lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
const struct lp_rast_triangle *tri = arg.triangle.tri;
@@ -373,9 +394,9 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
- __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
- __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
- __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+ __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
+ __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
+ __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
__m128i zero = _mm_setzero_si128();
__m128i c;
@@ -453,7 +474,8 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
#endif
-
+#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
+#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
#define TAG(x) x##_1
#define NR_PLANES 1
@@ -471,7 +493,7 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
#define TAG(x) x##_4
#define NR_PLANES 4
-#define TRI_16 lp_rast_triangle_4_16
+/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"
#define TAG(x) x##_5
@@ -490,3 +512,47 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"
+#ifdef PIPE_ARCH_SSE
+#undef BUILD_MASKS
+#undef BUILD_MASK_LINEAR
+#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
+#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_32((int)c, dcdx, dcdy)
+#endif
+
+#define TAG(x) x##_32_1
+#define NR_PLANES 1
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_2
+#define NR_PLANES 2
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_3
+#define NR_PLANES 3
+/*#define TRI_4 lp_rast_triangle_3_4*/
+/*#define TRI_16 lp_rast_triangle_3_16*/
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_4
+#define NR_PLANES 4
+#ifdef PIPE_ARCH_SSE
+#define TRI_16 lp_rast_triangle_32_4_16
+#endif
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_5
+#define NR_PLANES 5
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_6
+#define NR_PLANES 6
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_7
+#define NR_PLANES 7
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_32_8
+#define NR_PLANES 8
+#include "lp_rast_tri_tmp.h"
+
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index a00cbb2..52f6e99 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -50,7 +50,7 @@ TAG(do_block_4)(struct lp_rasterizer_task *task,
int j;
for (j = 0; j < NR_PLANES; j++) {
- mask &= ~build_mask_linear(c[j] - 1,
+ mask &= ~BUILD_MASK_LINEAR(c[j] - 1,
-plane[j].dcdx,
plane[j].dcdy);
}
@@ -85,7 +85,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
const int64_t cio = IMUL64(ei, 4) - 1;
- build_masks(c[j] + cox,
+ BUILD_MASKS(c[j] + cox,
cio - cox,
dcdx, dcdy,
&outmask, /* sign bits from c[i][0..15] + cox */
@@ -185,7 +185,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
const int64_t cio = IMUL64(ei, 16) - 1;
- build_masks(c[j] + cox,
+ BUILD_MASKS(c[j] + cox,
cio - cox,
dcdx, dcdy,
&outmask, /* sign bits from c[i][0..15] + cox */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 2a77987..62d2855 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -190,6 +190,19 @@ lp_rast_tri_tab[MAX_PLANES+1] = {
LP_RAST_OP_TRIANGLE_8
};
+static unsigned
+lp_rast_32_tri_tab[MAX_PLANES+1] = {
+ 0, /* should be impossible */
+ LP_RAST_OP_TRIANGLE_32_1,
+ LP_RAST_OP_TRIANGLE_32_2,
+ LP_RAST_OP_TRIANGLE_32_3,
+ LP_RAST_OP_TRIANGLE_32_4,
+ LP_RAST_OP_TRIANGLE_32_5,
+ LP_RAST_OP_TRIANGLE_32_6,
+ LP_RAST_OP_TRIANGLE_32_7,
+ LP_RAST_OP_TRIANGLE_32_8
+};
+
/**
@@ -586,7 +599,6 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
struct lp_scene *scene = setup->scene;
struct u_rect trimmed_box = *bbox;
int i;
-
/* What is the largest power-of-two boundary this triangle crosses:
*/
int dx = floor_pot((bbox->x0 ^ bbox->x1) |
@@ -595,8 +607,10 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
/* The largest dimension of the rasterized area of the triangle
* (aligned to a 4x4 grid), rounded down to the nearest power of two:
*/
- int sz = floor_pot((bbox->x1 - (bbox->x0 & ~3)) |
- (bbox->y1 - (bbox->y0 & ~3)));
+ int max_sz = ((bbox->x1 - (bbox->x0 & ~3)) |
+ (bbox->y1 - (bbox->y0 & ~3)));
+ int sz = floor_pot(max_sz);
+ boolean use_32bits = max_sz <= MAX_FIXED_LENGTH32;
/* Now apply scissor, etc to the bounding box. Could do this
* earlier, but it confuses the logic for tri-16 and would force
@@ -627,6 +641,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
assert(py + 4 <= TILE_SIZE);
return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
setup->fs.stored,
+ use_32bits ?
+ LP_RAST_OP_TRIANGLE_32_3_4 :
LP_RAST_OP_TRIANGLE_3_4,
lp_rast_arg_triangle_contained(tri, px, py) );
}
@@ -649,6 +665,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
setup->fs.stored,
+ use_32bits ?
+ LP_RAST_OP_TRIANGLE_32_3_16 :
LP_RAST_OP_TRIANGLE_3_16,
lp_rast_arg_triangle_contained(tri, px, py) );
}
@@ -663,6 +681,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
setup->fs.stored,
+ use_32bits ?
+ LP_RAST_OP_TRIANGLE_32_4_16 :
LP_RAST_OP_TRIANGLE_4_16,
lp_rast_arg_triangle_contained(tri, px, py));
}
@@ -670,9 +690,10 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
/* Triangle is contained in a single tile:
*/
- return lp_scene_bin_cmd_with_state( scene, ix0, iy0, setup->fs.stored,
- lp_rast_tri_tab[nr_planes],
- lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );
+ return lp_scene_bin_cmd_with_state(
+ scene, ix0, iy0, setup->fs.stored,
+ use_32bits ? lp_rast_32_tri_tab[nr_planes] : lp_rast_tri_tab[nr_planes],
+ lp_rast_arg_triangle(tri, (1<<nr_planes)-1));
}
else
{
@@ -746,6 +767,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
if (!lp_scene_bin_cmd_with_state( scene, x, y,
setup->fs.stored,
+ use_32bits ?
+ lp_rast_32_tri_tab[count] :
lp_rast_tri_tab[count],
lp_rast_arg_triangle(tri, partial) ))
goto fail;
More information about the mesa-commit
mailing list