Mesa (lp-binning): llvmpipe: do the final pixel in/out triangle test in the fragment shader

Brian Paul brianp at kemper.freedesktop.org
Wed Dec 16 15:12:36 PST 2009


Module: Mesa
Branch: lp-binning
Commit: ab9438193083b7f9a3180cb9cea45e269131048a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=ab9438193083b7f9a3180cb9cea45e269131048a

Author: Brian Paul <brianp at vmware.com>
Date:   Wed Dec 16 16:02:59 2009 -0700

llvmpipe: do the final pixel in/out triangle test in the fragment shader

The test to determine which of the pixels in a 2x2 quad are inside the
triangle is now done in the fragment shader rather than in the calling
C code.  This is a little faster, but there are a few more things to do.

Note that the step[] array elements are in a different order now.  Rather
than being in row-major order for the 4x4 grid, they're in "quad-major"
order.  The setup of the step arrays is a little more complicated now.
So is the coarse/intermediate tile test code, but some lookup tables
help with that.

Next steps:
 - early-cull 2x2 quads which are totally outside the triangle.
 - skip the in/out test for fully contained quads
 - make the in/out comparison code tighter/faster.

---

 src/gallium/drivers/llvmpipe/lp_jit.h       |    9 +-
 src/gallium/drivers/llvmpipe/lp_rast.c      |   76 ++-------
 src/gallium/drivers/llvmpipe/lp_rast.h      |   11 +-
 src/gallium/drivers/llvmpipe/lp_rast_priv.h |   11 +--
 src/gallium/drivers/llvmpipe/lp_rast_tri.c  |  222 +++++++++++++++------------
 src/gallium/drivers/llvmpipe/lp_setup_tri.c |   49 +++----
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  144 ++++++++++++++++--
 7 files changed, 302 insertions(+), 220 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 7eccb5d..e8fb7d9 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -114,9 +114,14 @@ typedef void
                     const void *a0,
                     const void *dadx,
                     const void *dady,
-                    const uint32_t *mask,
                     void *color,
-                    void *depth);
+                    void *depth,
+                    const int32_t c1,
+                    const int32_t c2,
+                    const int32_t c3,
+                    const int32_t *step1,
+                    const int32_t *step2,
+                    const int32_t *step3);
 
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index ec87d90..b1bd27d 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -25,6 +25,7 @@
  *
  **************************************************************************/
 
+#include <limits.h>
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "util/u_cpu_detect.h"
@@ -279,6 +280,8 @@ void lp_rast_shade_tile( struct lp_rasterizer *rast,
                          unsigned thread_index,
                          const union lp_rast_cmd_arg arg )
 {
+   /* Set c1,c2,c3 to large values so the in/out test always passes */
+   const int32_t c1 = INT_MAX/2, c2 = INT_MAX/2, c3 = INT_MAX/2;
    const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
    const unsigned tile_x = rast->tasks[thread_index].x;
    const unsigned tile_y = rast->tasks[thread_index].y;
@@ -296,7 +299,7 @@ void lp_rast_shade_tile( struct lp_rasterizer *rast,
                               inputs,
                               tile_x + x,
                               tile_y + y,
-                              mask);
+                              c1, c2, c3);
 }
 
 
@@ -308,58 +311,25 @@ void lp_rast_shade_quads( struct lp_rasterizer *rast,
                           unsigned thread_index,
                           const struct lp_rast_shader_inputs *inputs,
                           unsigned x, unsigned y,
-                          unsigned mask)
+                          int32_t c1, int32_t c2, int32_t c3)
 {
-#if 1
    const struct lp_rast_state *state = rast->tasks[thread_index].current_state;
    struct lp_rast_tile *tile = &rast->tasks[thread_index].tile;
    void *color;
    void *depth;
-   uint32_t ALIGN16_ATTRIB masks[2][2][2][2];
    unsigned ix, iy;
    int block_offset;
 
+#ifdef DEBUG
    assert(state);
 
    /* Sanity checks */
    assert(x % TILE_VECTOR_WIDTH == 0);
    assert(y % TILE_VECTOR_HEIGHT == 0);
 
-   /* mask: the rasterizer wants to treat pixels in 4x4 blocks, but
-    * the pixel shader wants to swizzle them into 4 2x2 quads.
-    * 
-    * Additionally, the pixel shader wants masks as full dword ~0,
-    * while the rasterizer wants to pack per-pixel bits tightly.
-    */
-#if 0
-   unsigned qx, qy;
-   for (qy = 0; qy < 2; ++qy)
-      for (qx = 0; qx < 2; ++qx)
-	 for (iy = 0; iy < 2; ++iy)
-	    for (ix = 0; ix < 2; ++ix)
-	       masks[qy][qx][iy][ix] = mask & (1 << (qy*8+iy*4+qx*2+ix)) ? ~0 : 0;
-#else
-   masks[0][0][0][0] = mask & (1 << (0*8+0*4+0*2+0)) ? ~0 : 0;
-   masks[0][0][0][1] = mask & (1 << (0*8+0*4+0*2+1)) ? ~0 : 0;
-   masks[0][0][1][0] = mask & (1 << (0*8+1*4+0*2+0)) ? ~0 : 0;
-   masks[0][0][1][1] = mask & (1 << (0*8+1*4+0*2+1)) ? ~0 : 0;
-   masks[0][1][0][0] = mask & (1 << (0*8+0*4+1*2+0)) ? ~0 : 0;
-   masks[0][1][0][1] = mask & (1 << (0*8+0*4+1*2+1)) ? ~0 : 0;
-   masks[0][1][1][0] = mask & (1 << (0*8+1*4+1*2+0)) ? ~0 : 0;
-   masks[0][1][1][1] = mask & (1 << (0*8+1*4+1*2+1)) ? ~0 : 0;
-
-   masks[1][0][0][0] = mask & (1 << (1*8+0*4+0*2+0)) ? ~0 : 0;
-   masks[1][0][0][1] = mask & (1 << (1*8+0*4+0*2+1)) ? ~0 : 0;
-   masks[1][0][1][0] = mask & (1 << (1*8+1*4+0*2+0)) ? ~0 : 0;
-   masks[1][0][1][1] = mask & (1 << (1*8+1*4+0*2+1)) ? ~0 : 0;
-   masks[1][1][0][0] = mask & (1 << (1*8+0*4+1*2+0)) ? ~0 : 0;
-   masks[1][1][0][1] = mask & (1 << (1*8+0*4+1*2+1)) ? ~0 : 0;
-   masks[1][1][1][0] = mask & (1 << (1*8+1*4+1*2+0)) ? ~0 : 0;
-   masks[1][1][1][1] = mask & (1 << (1*8+1*4+1*2+1)) ? ~0 : 0;
-#endif
-
    assert((x % 4) == 0);
    assert((y % 4) == 0);
+#endif
 
    ix = x % TILE_SIZE;
    iy = y % TILE_SIZE;
@@ -373,39 +343,27 @@ void lp_rast_shade_quads( struct lp_rasterizer *rast,
    /* depth buffer */
    depth = tile->depth + block_offset;
 
-   /* XXX: This will most likely fail on 32bit x86 without -mstackrealign */
-   assert(lp_check_alignment(masks, 16));
-
+#ifdef DEBUG
    assert(lp_check_alignment(depth, 16));
    assert(lp_check_alignment(color, 16));
    assert(lp_check_alignment(state->jit_context.blend_color, 16));
 
+   assert(lp_check_alignment(inputs->step[0], 16));
+   assert(lp_check_alignment(inputs->step[1], 16));
+   assert(lp_check_alignment(inputs->step[2], 16));
+#endif
+
    /* run shader */
    state->jit_function( &state->jit_context,
                         x, y,
                         inputs->a0,
                         inputs->dadx,
                         inputs->dady,
-                        &masks[0][0][0][0],
                         color,
-                        depth);
-#else
-   struct lp_rast_tile *tile = &rast->tile;
-   unsigned chan_index;
-   unsigned q, ix, iy;
-
-   x %= TILE_SIZE;
-   y %= TILE_SIZE;
-
-   /* mask */
-   for (q = 0; q < 4; ++q)
-      for(iy = 0; iy < 2; ++iy)
-         for(ix = 0; ix < 2; ++ix)
-            if(masks[q] & (1 << (iy*2 + ix)))
-               for (chan_index = 0; chan_index < NUM_CHANNELS; ++chan_index)
-                  TILE_PIXEL(tile->color, x + q*2 + ix, y + iy, chan_index) = 0xff;
-
-#endif
+                        depth,
+                        c1, c2, c3,
+                        inputs->step[0], inputs->step[1], inputs->step[2]
+                        );
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index 2dd0193..46e22f6 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -80,6 +80,9 @@ struct lp_rast_shader_inputs {
    float (*a0)[4];
    float (*dadx)[4];
    float (*dady)[4];
+
+   /* edge/step info for 3 edges and 4x4 block of pixels */
+   int ALIGN16_ATTRIB step[3][16];
 };
 
 
@@ -117,14 +120,10 @@ struct lp_rast_triangle {
    int dx31;
 
    /* edge function values at minx,miny ?? */
-   int c1;
-   int c2;
-   int c3;
-
-   int step[3][16];
+   int c1, c2, c3;
 
    /* inputs for the shader */
-   struct lp_rast_shader_inputs inputs;
+   struct lp_rast_shader_inputs ALIGN16_ATTRIB inputs;
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index 79a90f6..cd72d7e 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -61,15 +61,6 @@ struct lp_rasterizer_task
 
    unsigned x, y;          /**< Pos of this tile in framebuffer, in pixels */
 
-   /* Pixel blocks produced during rasterization
-    */
-   unsigned nr_blocks;
-   struct {
-      unsigned x;
-      unsigned y;
-      unsigned mask;
-   } blocks[256];
-
    const struct lp_rast_state *current_state;
 
    /** "back" pointer */
@@ -133,6 +124,6 @@ void lp_rast_shade_quads( struct lp_rasterizer *rast,
                           unsigned thread_index,
                           const struct lp_rast_shader_inputs *inputs,
                           unsigned x, unsigned y,
-                          unsigned masks);
+                          int32_t c1, int32_t c2, int32_t c3);
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 6c96010..9b18612 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -29,6 +29,7 @@
  * Rasterization for binned triangles within a tile
  */
 
+#include <limits.h>
 #include "util/u_math.h"
 #include "lp_debug.h"
 #include "lp_rast_priv.h"
@@ -36,42 +37,89 @@
 
 
 /**
- * Add a 4x4 block of pixels to the block list.
- * All pixels are known to be inside the triangle's bounds.
+ * Map an index in [0,15] to an x,y position, multiplied by 4.
+ * This is used to get the position of each subtile in a 4x4
+ * grid of edge step values.
+ */
+static const int pos_table4[16][2] = {
+   { 0, 0 },
+   { 4, 0 },
+   { 0, 4 },
+   { 4, 4 },
+   { 8, 0 },
+   { 12, 0 },
+   { 8, 4 },
+   { 12, 4 },
+   { 0, 8 },
+   { 4, 8 },
+   { 0, 12 },
+   { 4, 12 },
+   { 8, 8 },
+   { 12, 8 },
+   { 8, 12 },
+   { 12, 12 }
+};
+
+
+static const int pos_table16[16][2] = {
+   { 0, 0 },
+   { 16, 0 },
+   { 0, 16 },
+   { 16, 16 },
+   { 32, 0 },
+   { 48, 0 },
+   { 32, 16 },
+   { 48, 16 },
+   { 0, 32 },
+   { 16, 32 },
+   { 0, 48 },
+   { 16, 48 },
+   { 32, 32 },
+   { 48, 32 },
+   { 32, 48 },
+   { 48, 48 }
+};
+
+
+/**
+ * Shade all pixels in a 4x4 block.
  */
 static void
-block_full_4( struct lp_rasterizer_task *rast_task, int x, int y )
+block_full_4( struct lp_rasterizer_task *rast_task,
+              const struct lp_rast_triangle *tri,
+              int x, int y )
 {
-   const unsigned i = rast_task->nr_blocks;
-   assert(x % 4 == 0);
-   assert(y % 4 == 0);
-   rast_task->blocks[i].x = x;
-   rast_task->blocks[i].y = y;
-   rast_task->blocks[i].mask = ~0;
-   rast_task->nr_blocks++;
+   /* Set c1,c2,c3 to large values so the in/out test always passes */
+   const int32_t c1 = INT_MAX/2, c2 = INT_MAX/2, c3 = INT_MAX/2;
+   lp_rast_shade_quads(rast_task->rast,
+                       rast_task->thread_index,
+                       &tri->inputs, 
+                       x, y,
+                       c1, c2, c3);
 }
 
 
 /**
- * Add a 16x16 block of pixels to the block list.
- * All pixels are known to be inside the triangle's bounds.
+ * Shade all pixels in a 16x16 block.
  */
 static void
-block_full_16( struct lp_rasterizer_task *rast_task, int x, int y )
+block_full_16( struct lp_rasterizer_task *rast_task,
+               const struct lp_rast_triangle *tri,
+               int x, int y )
 {
    unsigned ix, iy;
    assert(x % 16 == 0);
    assert(y % 16 == 0);
    for (iy = 0; iy < 16; iy += 4)
       for (ix = 0; ix < 16; ix += 4)
-	 block_full_4(rast_task, x + ix, y + iy);
+	 block_full_4(rast_task, tri, x + ix, y + iy);
 }
 
 
 /**
- * Evaluate each pixel in a 4x4 block to determine if it lies within
- * the triangle's bounds.
- * Generate a mask of in/out flags and add the block to the blocks list.
+ * Pass the 4x4 pixel block to the shader function.
+ * Determination of which of the 16 pixels lies inside the triangle
+ * will be done as part of the fragment shader.
  */
 static void
 do_block_4( struct lp_rasterizer_task *rast_task,
@@ -81,28 +129,11 @@ do_block_4( struct lp_rasterizer_task *rast_task,
 	    int c2,
 	    int c3 )
 {
-   int i;
-   unsigned mask = 0;
-
-   assert(x % 4 == 0);
-   assert(y % 4 == 0);
-
-   for (i = 0; i < 16; i++) {
-      int any_negative = ((c1 + tri->step[0][i]) | 
-                          (c2 + tri->step[1][i]) | 
-                          (c3 + tri->step[2][i])) >> 31;
-      mask |= (~any_negative) & (1 << i);
-   }
-   
-   /* As we do trivial reject already, masks should rarely be all zero:
-    */
-   if (mask) {
-      const unsigned i = rast_task->nr_blocks;
-      rast_task->blocks[i].x = x;
-      rast_task->blocks[i].y = y;
-      rast_task->blocks[i].mask = mask;
-      rast_task->nr_blocks++;
-   }
+   lp_rast_shade_quads(rast_task->rast,
+                       rast_task->thread_index,
+                       &tri->inputs, 
+                       x, y,
+                       c1, c2, c3);
 }
 
 
@@ -118,40 +149,42 @@ do_block_16( struct lp_rasterizer_task *rast_task,
              int c2,
              int c3 )
 {
-   int ix, iy, i = 0;
+   const int ei1 = tri->ei1 * 4;
+   const int ei2 = tri->ei2 * 4;
+   const int ei3 = tri->ei3 * 4;
 
-   int ei1 = tri->ei1 * 4;
-   int ei2 = tri->ei2 * 4;
-   int ei3 = tri->ei3 * 4;
+   const int eo1 = tri->eo1 * 4;
+   const int eo2 = tri->eo2 * 4;
+   const int eo3 = tri->eo3 * 4;
 
-   int eo1 = tri->eo1 * 4;
-   int eo2 = tri->eo2 * 4;
-   int eo3 = tri->eo3 * 4;
+   int i;
 
    assert(x % 16 == 0);
    assert(y % 16 == 0);
 
-   for (iy = 0; iy < 16; iy+=4) {
-      for (ix = 0; ix < 16; ix+=4, i++) {
-	 int cx1 = c1 + (tri->step[0][i] * 4);
-	 int cx2 = c2 + (tri->step[1][i] * 4);
-	 int cx3 = c3 + (tri->step[2][i] * 4);
-	 
-	 if (cx1 + eo1 < 0 ||
-	     cx2 + eo2 < 0 ||
-	     cx3 + eo3 < 0) {
-            /* the block is completely outside the triangle - nop */
-	 }
-	 else if (cx1 + ei1 > 0 &&
-		  cx2 + ei2 > 0 &&
-		  cx3 + ei3 > 0) {
+   for (i = 0; i < 16; i++) {
+      int cx1 = c1 + (tri->inputs.step[0][i] * 4);
+      int cx2 = c2 + (tri->inputs.step[1][i] * 4);
+      int cx3 = c3 + (tri->inputs.step[2][i] * 4);
+
+      if (cx1 + eo1 < 0 ||
+          cx2 + eo2 < 0 ||
+          cx3 + eo3 < 0) {
+         /* the block is completely outside the triangle - nop */
+      }
+      else {
+         int px = x + pos_table4[i][0];
+         int py = y + pos_table4[i][1];
+         if (cx1 + ei1 > 0 &&
+             cx2 + ei2 > 0 &&
+             cx3 + ei3 > 0) {
             /* the block is completely inside the triangle */
-	    block_full_4(rast_task, x+ix, y+iy);
-	 }
-	 else {
+            block_full_4(rast_task, tri, px, py);
+         }
+         else {
             /* the block is partially in/out of the triangle */
-	    do_block_4(rast_task, tri, x+ix, y+iy, cx1, cx2, cx3);
-	 }
+            do_block_4(rast_task, tri, px, py, cx1, cx2, cx3);
+         }
       }
    }
 }
@@ -171,8 +204,7 @@ lp_rast_triangle( struct lp_rasterizer *rast,
 
    int x = rast_task->x;
    int y = rast_task->y;
-   int ix, iy;
-   unsigned i = 0;
+   unsigned i;
 
    int c1 = tri->c1 + tri->dx12 * y - tri->dy12 * x;
    int c2 = tri->c2 + tri->dx23 * y - tri->dy23 * x;
@@ -186,48 +218,36 @@ lp_rast_triangle( struct lp_rasterizer *rast,
    int eo2 = tri->eo2 * 16;
    int eo3 = tri->eo3 * 16;
 
-   assert(Elements(rast_task->blocks) == (TILE_SIZE * TILE_SIZE) / (4*4));
-
    LP_DBG(DEBUG_RAST, "lp_rast_triangle\n");
 
-   rast_task->nr_blocks = 0;
-
    /* Walk over the tile to build a list of 4x4 pixel blocks which will
     * be filled/shaded.  We do this at two granularities: 16x16 blocks
     * and then 4x4 blocks.
     */
-   for (iy = 0; iy < TILE_SIZE; iy += 16) {
-      for (ix = 0; ix < TILE_SIZE; ix += 16, i++) {
-	 int cx1 = c1 + (tri->step[0][i] * 16);
-	 int cx2 = c2 + (tri->step[1][i] * 16);
-	 int cx3 = c3 + (tri->step[2][i] * 16);
-	 
-	 if (cx1 + eo1 < 0 ||
-	     cx2 + eo2 < 0 ||
-	     cx3 + eo3 < 0) {
-            /* the block is completely outside the triangle - nop */
-	 }
-	 else if (cx1 + ei1 > 0 &&
-		  cx2 + ei2 > 0 &&
-		  cx3 + ei3 > 0) {
+   for (i = 0; i < 16; i++) {
+      int cx1 = c1 + (tri->inputs.step[0][i] * 16);
+      int cx2 = c2 + (tri->inputs.step[1][i] * 16);
+      int cx3 = c3 + (tri->inputs.step[2][i] * 16);
+
+      if (cx1 + eo1 < 0 ||
+          cx2 + eo2 < 0 ||
+          cx3 + eo3 < 0) {
+         /* the block is completely outside the triangle - nop */
+      }
+      else {
+         int px = x + pos_table16[i][0];
+         int py = y + pos_table16[i][1];
+
+         if (cx1 + ei1 > 0 &&
+             cx2 + ei2 > 0 &&
+             cx3 + ei3 > 0) {
             /* the block is completely inside the triangle */
-	    block_full_16(rast_task, x+ix, y+iy);
-	 }
-	 else {
+            block_full_16(rast_task, tri, px, py);
+         }
+         else {
             /* the block is partially in/out of the triangle */
-	    do_block_16(rast_task, tri, x+ix, y+iy, cx1, cx2, cx3);
-	 }
+            do_block_16(rast_task, tri, px, py, cx1, cx2, cx3);
+         }
       }
    }
-
-   assert(rast_task->nr_blocks <= Elements(rast_task->blocks));
-
-   /* Shade the 4x4 pixel blocks */
-   for (i = 0; i < rast_task->nr_blocks; i++) 
-      lp_rast_shade_quads(rast,
-                          thread_index,
-                          &tri->inputs, 
-			  rast_task->blocks[i].x,
-			  rast_task->blocks[i].y,
-			  rast_task->blocks[i].mask);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index aeaf260..e15b987 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -265,7 +265,7 @@ do_triangle_ccw(struct setup_context *setup,
    const int y3 = subpixel_snap(v3[0][1]);
 
    struct lp_scene *scene = lp_setup_get_current_scene(setup);
-   struct lp_rast_triangle *tri = lp_scene_alloc( scene, sizeof *tri );
+   struct lp_rast_triangle *tri = lp_scene_alloc_aligned( scene, sizeof *tri, 16 );
    float area, oneoverarea;
    int minx, maxx, miny, maxy;
 
@@ -354,38 +354,29 @@ do_triangle_ccw(struct setup_context *setup,
    tri->ei3 = tri->dx31 - tri->dy31 - tri->eo3;
 
    {
-      int xstep1 = -tri->dy12;
-      int xstep2 = -tri->dy23;
-      int xstep3 = -tri->dy31;
+      const int xstep1 = -tri->dy12;
+      const int xstep2 = -tri->dy23;
+      const int xstep3 = -tri->dy31;
 
-      int ystep1 = tri->dx12;
-      int ystep2 = tri->dx23;
-      int ystep3 = tri->dx31;
+      const int ystep1 = tri->dx12;
+      const int ystep2 = tri->dx23;
+      const int ystep3 = tri->dx31;
       
-      int ix, iy;
+      int qx, qy, ix, iy;
       int i = 0;
 
-      int c1 = 0;
-      int c2 = 0;
-      int c3 = 0;
-      
-      for (iy = 0; iy < 4; iy++) {
-	 int cx1 = c1;
-	 int cx2 = c2;
-	 int cx3 = c3;
-
-	 for (ix = 0; ix < 4; ix++, i++) {
-	    tri->step[0][i] = cx1;
-	    tri->step[1][i] = cx2;
-	    tri->step[2][i] = cx3;
-	    cx1 += xstep1;
-	    cx2 += xstep2;
-	    cx3 += xstep3;
-	 }
-
-	 c1 += ystep1;
-	 c2 += ystep2;
-	 c3 += ystep3;
+      for (qy = 0; qy < 2; qy++) {
+         for (qx = 0; qx < 2; qx++) {
+            for (iy = 0; iy < 2; iy++) {
+               for (ix = 0; ix < 2; ix++, i++) {
+                  int x = qx * 2 + ix;
+                  int y = qy * 2 + iy;
+                  tri->inputs.step[0][i] = x * xstep1 + y * ystep1;
+                  tri->inputs.step[1][i] = x * xstep2 + y * ystep2;
+                  tri->inputs.step[2][i] = x * xstep3 + y * ystep3;
+               }
+            }
+         }
       }
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index c0d5a70..4af37e3 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -176,7 +176,92 @@ generate_depth(LLVMBuilderRef builder,
 
 
 /**
+ * Generate the code to do inside/outside triangle testing for the
+ * four pixels in a 2x2 quad.  This will set the four elements of the
+ * quad mask vector to 0 or ~0.
+ * \param i  which quad of the quad group to test, in [0,3]
+ */
+static void
+generate_tri_edge_mask(LLVMBuilderRef builder,
+                       unsigned i,
+                       LLVMValueRef *mask,      /* ivec4, out */
+                       LLVMValueRef c0,         /* int32 */
+                       LLVMValueRef c1,         /* int32 */
+                       LLVMValueRef c2,         /* int32 */
+                       LLVMValueRef step0_ptr,  /* ivec4 */
+                       LLVMValueRef step1_ptr,  /* ivec4 */
+                       LLVMValueRef step2_ptr)  /* ivec4 */
+{
+   /*
+     c0_vec = splat(c0)
+     c1_vec = splat(c1)
+     c2_vec = splat(c2)
+     s0_vec = c0_vec + step0_ptr[i]
+     s1_vec = c1_vec + step1_ptr[i]
+     s2_vec = c2_vec + step2_ptr[i]
+     m0_vec = s0_vec > {0,0,0,0}
+     m1_vec = s1_vec > {0,0,0,0}
+     m2_vec = s2_vec > {0,0,0,0}
+     mask = m0_vec & m1_vec & m2_vec
+    */
+   struct lp_type i32_type;
+   LLVMTypeRef i32vec4_type;
+
+   LLVMValueRef index;
+   LLVMValueRef c0_vec, c1_vec, c2_vec;
+   LLVMValueRef step0_vec, step1_vec, step2_vec;
+   LLVMValueRef m0_vec, m1_vec, m2_vec;
+   LLVMValueRef s0_vec, s1_vec, s2_vec;
+   LLVMValueRef m;
+
+   LLVMValueRef zeros;
+
+   assert(i < 4);
+   
+   /* int32 vector type */
+   memset(&i32_type, 0, sizeof i32_type);
+   i32_type.floating = FALSE; /* values are integers */
+   i32_type.sign = TRUE;      /* values are signed */
+   i32_type.norm = FALSE;     /* values are not normalized */
+   i32_type.width = 32;       /* 32-bit int values */
+   i32_type.length = 4;       /* 4 elements per vector */
+
+   i32vec4_type = lp_build_int32_vec4_type();
+
+   /* int32_vec4 zero = {0,0,0,0} */
+   zeros = LLVMConstNull(i32vec4_type);
+
+   c0_vec = lp_build_broadcast(builder, i32vec4_type, c0);
+   c1_vec = lp_build_broadcast(builder, i32vec4_type, c1);
+   c2_vec = lp_build_broadcast(builder, i32vec4_type, c2);
+
+   index = LLVMConstInt(LLVMInt32Type(), i, 0);
+   step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), "");
+   step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), "");
+   step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), "");
+
+   /** XXX with a little work, we could remove the add here and just
+    * compare c0_vec > step0_vec.
+    */
+   s0_vec = LLVMBuildAdd(builder, c0_vec, step0_vec, "");
+   s1_vec = LLVMBuildAdd(builder, c1_vec, step1_vec, "");
+   s2_vec = LLVMBuildAdd(builder, c2_vec, step2_vec, "");
+   m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, s0_vec, zeros);
+   m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, s1_vec, zeros);
+   m2_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, s2_vec, zeros);
+
+   m = LLVMBuildAnd(builder, m0_vec, m1_vec, "");
+   m = LLVMBuildAnd(builder, m, m2_vec, "");
+
+   lp_build_name(m, "m");
+
+   *mask = m;
+}
+
+
+/**
  * Generate the fragment shader, depth/stencil test, and alpha tests.
+ * \param i  which quad in the tile, in range [0,3]
  */
 static void
 generate_fs(struct llvmpipe_context *lp,
@@ -190,7 +275,13 @@ generate_fs(struct llvmpipe_context *lp,
             struct lp_build_sampler_soa *sampler,
             LLVMValueRef *pmask,
             LLVMValueRef *color,
-            LLVMValueRef depth_ptr)
+            LLVMValueRef depth_ptr,
+            LLVMValueRef c0,
+            LLVMValueRef c1,
+            LLVMValueRef c2,
+            LLVMValueRef step0_ptr,
+            LLVMValueRef step1_ptr,
+            LLVMValueRef step2_ptr)
 {
    const struct tgsi_token *tokens = shader->base.tokens;
    LLVMTypeRef elem_type;
@@ -205,6 +296,8 @@ generate_fs(struct llvmpipe_context *lp,
    unsigned attrib;
    unsigned chan;
 
+   assert(i < 4);
+
    elem_type = lp_build_elem_type(type);
    vec_type = lp_build_vec_type(type);
    int_vec_type = lp_build_int_vec_type(type);
@@ -224,8 +317,13 @@ generate_fs(struct llvmpipe_context *lp,
    }
    lp_build_flow_scope_declare(flow, &z);
 
+   /* do triangle edge testing */
+   generate_tri_edge_mask(builder, i, pmask,
+                          c0, c1, c2, step0_ptr, step1_ptr, step2_ptr);
+
    lp_build_mask_begin(&mask, flow, type, *pmask);
 
+
    early_depth_test =
       key->depth.enabled &&
       !key->alpha.enabled &&
@@ -376,17 +474,18 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMTypeRef fs_int_vec_type;
    LLVMTypeRef blend_vec_type;
    LLVMTypeRef blend_int_vec_type;
-   LLVMTypeRef arg_types[9];
+   LLVMTypeRef arg_types[14];
    LLVMTypeRef func_type;
+   LLVMTypeRef int32_vec4_type = lp_build_int32_vec4_type();
    LLVMValueRef context_ptr;
    LLVMValueRef x;
    LLVMValueRef y;
    LLVMValueRef a0_ptr;
    LLVMValueRef dadx_ptr;
    LLVMValueRef dady_ptr;
-   LLVMValueRef mask_ptr;
    LLVMValueRef color_ptr;
    LLVMValueRef depth_ptr;
+   LLVMValueRef c0, c1, c2, step0_ptr, step1_ptr, step2_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
    LLVMValueRef x0;
@@ -468,9 +567,17 @@ generate_fragment(struct llvmpipe_context *lp,
    arg_types[3] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
    arg_types[4] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dady */
-   arg_types[6] = LLVMPointerType(fs_int_vec_type, 0); /* mask */
-   arg_types[7] = LLVMPointerType(blend_vec_type, 0);  /* color */
-   arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+   arg_types[6] = LLVMPointerType(blend_vec_type, 0);  /* color */
+   arg_types[7] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+   arg_types[8] = LLVMInt32Type();                     /* c0 */
+   arg_types[9] = LLVMInt32Type();                    /* c1 */
+   arg_types[10] = LLVMInt32Type();                    /* c2 */
+   /* Note: the step arrays are built as int32[16] but we interpret
+    * them here as int32_vec4[4].
+    */
+   arg_types[11] = LLVMPointerType(int32_vec4_type, 0);/* step0 */
+   arg_types[12] = LLVMPointerType(int32_vec4_type, 0);/* step1 */
+   arg_types[13] = LLVMPointerType(int32_vec4_type, 0);/* step2 */
 
    func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
 
@@ -486,9 +593,14 @@ generate_fragment(struct llvmpipe_context *lp,
    a0_ptr       = LLVMGetParam(variant->function, 3);
    dadx_ptr     = LLVMGetParam(variant->function, 4);
    dady_ptr     = LLVMGetParam(variant->function, 5);
-   mask_ptr     = LLVMGetParam(variant->function, 6);
-   color_ptr    = LLVMGetParam(variant->function, 7);
-   depth_ptr    = LLVMGetParam(variant->function, 8);
+   color_ptr    = LLVMGetParam(variant->function, 6);
+   depth_ptr    = LLVMGetParam(variant->function, 7);
+   c0           = LLVMGetParam(variant->function, 8);
+   c1           = LLVMGetParam(variant->function, 9);
+   c2           = LLVMGetParam(variant->function, 10);
+   step0_ptr    = LLVMGetParam(variant->function, 11);
+   step1_ptr    = LLVMGetParam(variant->function, 12);
+   step2_ptr    = LLVMGetParam(variant->function, 13);
 
    lp_build_name(context_ptr, "context");
    lp_build_name(x, "x");
@@ -496,9 +608,14 @@ generate_fragment(struct llvmpipe_context *lp,
    lp_build_name(a0_ptr, "a0");
    lp_build_name(dadx_ptr, "dadx");
    lp_build_name(dady_ptr, "dady");
-   lp_build_name(mask_ptr, "mask");
    lp_build_name(color_ptr, "color");
    lp_build_name(depth_ptr, "depth");
+   lp_build_name(c0, "c0");
+   lp_build_name(c1, "c1");
+   lp_build_name(c2, "c2");
+   lp_build_name(step0_ptr, "step0");
+   lp_build_name(step1_ptr, "step1");
+   lp_build_name(step2_ptr, "step2");
 
    /*
     * Function body
@@ -526,7 +643,6 @@ generate_fragment(struct llvmpipe_context *lp,
       if(i != 0)
          lp_build_interp_soa_update(&interp, i);
 
-      fs_mask[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, mask_ptr, &index, 1, ""), "");
       depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, "");
 
       generate_fs(lp, shader, key,
@@ -536,9 +652,11 @@ generate_fragment(struct llvmpipe_context *lp,
                   i,
                   &interp,
                   sampler,
-                  &fs_mask[i],
+                  &fs_mask[i], /* output */
                   out_color,
-                  depth_ptr_i);
+                  depth_ptr_i,
+                  c0, c1, c2,
+                  step0_ptr, step1_ptr, step2_ptr);
 
       for(chan = 0; chan < NUM_CHANNELS; ++chan)
          fs_out_color[chan][i] = out_color[chan];



More information about the mesa-commit mailing list