Mesa (llvmpipe-rast-64): llvmpipe: implement 64 bit triangle rasterization

Fri Oct 25 04:24:39 UTC 2013

Module: Mesa
Branch: llvmpipe-rast-64
Commit: ba9964b5ab6ca9c1dac4e9eb82afc2d173ffb94c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=ba9964b5ab6ca9c1dac4e9eb82afc2d173ffb94c

Author: Zack Rusin <zackr at vmware.com>
Date:   Fri Oct 25 00:20:50 2013 -0400

llvmpipe: implement 64 bit triangle rasterization

Currently only the C paths work; the SSE/AVX paths haven't
been ported to 64-bit arithmetic yet. We need this to increase
our subpixel precision to 8 bits.

---

 src/gallium/auxiliary/util/u_sse.h             |   25 +++++
 src/gallium/drivers/llvmpipe/lp_rast.h         |   14 ++-
 src/gallium/drivers/llvmpipe/lp_rast_debug.c   |    6 +-
 src/gallium/drivers/llvmpipe/lp_rast_tri.c     |   56 +++++-----
 src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h |   52 +++++-----
 src/gallium/drivers/llvmpipe/lp_setup_line.c   |    2 +-
 src/gallium/drivers/llvmpipe/lp_setup_tri.c    |  137 ++++++++++++++++++------
 7 files changed, 194 insertions(+), 98 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index d100c47..4ba8b61 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -254,6 +254,31 @@ transpose4_epi32(const __m128i * restrict a,
 
 #define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
 
+#if defined(PIPE_ARCH_AVX2)
+
+#include <immintrin.h>
+/** Transpose a 4x4 matrix of 64-bit elements: rows *a..*d in, rows *o..*r out. */
+static INLINE void
+transpose4_epi64(const __m256i * restrict a,
+                 const __m256i * restrict b,
+                 const __m256i * restrict c,
+                 const __m256i * restrict d,
+                 __m256i * restrict o,
+                 __m256i * restrict p,
+                 __m256i * restrict q,
+                 __m256i * restrict r)
+{
+  __m256i t0 = _mm256_unpacklo_epi64(*a, *b);   /* a0 b0 | a2 b2 */
+  __m256i t1 = _mm256_unpacklo_epi64(*c, *d);   /* c0 d0 | c2 d2 */
+  __m256i t2 = _mm256_unpackhi_epi64(*a, *b);   /* a1 b1 | a3 b3 */
+  __m256i t3 = _mm256_unpackhi_epi64(*c, *d);   /* c1 d1 | c3 d3 */
+  /* unpack works per 128-bit lane, so finish by recombining whole lanes */
+  *o = _mm256_permute2x128_si256(t0, t1, 0x20); /* a0 b0 c0 d0 */
+  *p = _mm256_permute2x128_si256(t2, t3, 0x20); /* a1 b1 c1 d1 */
+  *q = _mm256_permute2x128_si256(t0, t1, 0x31); /* a2 b2 c2 d2 */
+  *r = _mm256_permute2x128_si256(t2, t3, 0x31); /* a3 b3 c3 d3 */
+}
+#endif /* PIPE_ARCH_AVX2 */
 
 #endif /* PIPE_ARCH_SSE */
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index 43c598d..54641b1 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -46,10 +46,11 @@ struct lp_scene;
 struct lp_fence;
 struct cmd_bin;
 
-#define FIXED_TYPE_WIDTH 32
+#define FIXED_TYPE_WIDTH 64
 /** For sub-pixel positioning */
-#define FIXED_ORDER 4
+#define FIXED_ORDER 8
 #define FIXED_ONE (1<<FIXED_ORDER)
+#define FIXED_SHIFT (FIXED_TYPE_WIDTH - 1)
 /** Maximum length of an edge in a primitive in pixels.
  *  If the framebuffer is large we have to think about fixed-point
  *  integer overflow. Coordinates need ((FIXED_TYPE_WIDTH/2) - 1) bits
@@ -64,6 +65,7 @@ struct cmd_bin;
 
 #define LP_MAX_ACTIVE_BINNED_QUERIES 16
 
+#define IMUL64(a, b) (((int64_t)(a)) * ((int64_t)(b))) /* widening 64-bit multiply */
 
 struct lp_rasterizer_task;
 
@@ -107,13 +109,13 @@ struct lp_rast_shader_inputs {
  */
 struct lp_rast_plane {
    /* edge function values at minx,miny ?? */
-   int c;
+   int64_t c;
 
-   int dcdx;
-   int dcdy;
+   int64_t dcdx;
+   int64_t dcdy;
 
    /* one-pixel sized trivial reject offsets for each plane */
-   int eo;
+   int64_t eo;
 };
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_debug.c b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
index 3bc75aa..587c793 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
@@ -195,8 +195,8 @@ debug_triangle(int tilex, int tiley,
    while (plane_mask) {
       plane[nr_planes] = tri_plane[u_bit_scan(&plane_mask)];
       plane[nr_planes].c = (plane[nr_planes].c +
-                            plane[nr_planes].dcdy * tiley -
-                            plane[nr_planes].dcdx * tilex);
+                            IMUL64(plane[nr_planes].dcdy, tiley) -
+                            IMUL64(plane[nr_planes].dcdx, tilex));
       nr_planes++;
    }
 
@@ -217,7 +217,7 @@ debug_triangle(int tilex, int tiley,
       }
 
       for (i = 0; i < nr_planes; i++) {
-         plane[i].c += plane[i].dcdx * TILE_SIZE;
+         plane[i].c += IMUL64(plane[i].dcdx, TILE_SIZE);
          plane[i].c += plane[i].dcdy;
       }
    }
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 5ef070a..02f2894 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -69,41 +69,41 @@ block_full_16(struct lp_rasterizer_task *task,
 #if !defined(PIPE_ARCH_SSE)
 
 static INLINE unsigned
-build_mask_linear(int c, int dcdx, int dcdy)
+build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
 {
-   int mask = 0;
-
-   int c0 = c;
-   int c1 = c0 + dcdy;
-   int c2 = c1 + dcdy;
-   int c3 = c2 + dcdy;
-
-   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
-   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
-   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
-   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
-   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
-   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
-   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
-   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7); 
-   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
-   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
-   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
-   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
-   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
-   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
-   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
-   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);
+   unsigned mask = 0;
+
+   int64_t c0 = c;
+   int64_t c1 = c0 + dcdy;
+   int64_t c2 = c1 + dcdy;
+   int64_t c3 = c2 + dcdy;
+
+   mask |= ((c0 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 0);
+   mask |= ((c0 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 1);
+   mask |= ((c0 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 2);
+   mask |= ((c0 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 3);
+   mask |= ((c1 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 4);
+   mask |= ((c1 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 5);
+   mask |= ((c1 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 6);
+   mask |= ((c1 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 7);
+   mask |= ((c2 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 8);
+   mask |= ((c2 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 9);
+   mask |= ((c2 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 10);
+   mask |= ((c2 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 11);
+   mask |= ((c3 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 12);
+   mask |= ((c3 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 13);
+   mask |= ((c3 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 14);
+   mask |= ((c3 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 15);
   
    return mask;
 }
 
 
 static INLINE void
-build_masks(int c, 
-	    int cdiff,
-	    int dcdx,
-	    int dcdy,
+build_masks(int64_t c,
+            int64_t cdiff,
+            int64_t dcdx,
+            int64_t dcdy,
 	    unsigned *outmask,
 	    unsigned *partmask)
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 4825d65..a00cbb2 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -44,7 +44,7 @@ TAG(do_block_4)(struct lp_rasterizer_task *task,
                 const struct lp_rast_triangle *tri,
                 const struct lp_rast_plane *plane,
                 int x, int y,
-                const int *c)
+                const int64_t *c)
 {
    unsigned mask = 0xffff;
    int j;
@@ -70,7 +70,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
                  const struct lp_rast_triangle *tri,
                  const struct lp_rast_plane *plane,
                  int x, int y,
-                 const int *c)
+                 const int64_t *c)
 {
    unsigned outmask, inmask, partmask, partial_mask;
    unsigned j;
@@ -79,11 +79,11 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
    partmask = 0;                /* outside one or more trivial accept planes */
 
    for (j = 0; j < NR_PLANES; j++) {
-      const int dcdx = -plane[j].dcdx * 4;
-      const int dcdy = plane[j].dcdy * 4;
-      const int cox = plane[j].eo * 4;
-      const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
-      const int cio = ei * 4 - 1;
+      const int64_t dcdx = -IMUL64(plane[j].dcdx, 4);
+      const int64_t dcdy = IMUL64(plane[j].dcdy, 4);
+      const int64_t cox = IMUL64(plane[j].eo, 4);
+      const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+      const int64_t cio = IMUL64(ei, 4) - 1;
 
       build_masks(c[j] + cox,
 		  cio - cox,
@@ -116,7 +116,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
       int iy = (i >> 2) * 4;
       int px = x + ix;
       int py = y + iy; 
-      int cx[NR_PLANES];
+      int64_t cx[NR_PLANES];
 
       partial_mask &= ~(1 << i);
 
@@ -124,8 +124,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
 
       for (j = 0; j < NR_PLANES; j++)
          cx[j] = (c[j] 
-		  - plane[j].dcdx * ix
-		  + plane[j].dcdy * iy);
+                  - IMUL64(plane[j].dcdx, ix)
+                  + IMUL64(plane[j].dcdy, iy));
 
       TAG(do_block_4)(task, tri, plane, px, py, cx);
    }
@@ -160,7 +160,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
    const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
    const int x = task->x, y = task->y;
    struct lp_rast_plane plane[NR_PLANES];
-   int c[NR_PLANES];
+   int64_t c[NR_PLANES];
    unsigned outmask, inmask, partmask, partial_mask;
    unsigned j = 0;
 
@@ -176,20 +176,20 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
       int i = ffs(plane_mask) - 1;
       plane[j] = tri_plane[i];
       plane_mask &= ~(1 << i);
-      c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
+      c[j] = plane[j].c + IMUL64(plane[j].dcdy, y) - IMUL64(plane[j].dcdx, x);
 
       {
-	 const int dcdx = -plane[j].dcdx * 16;
-	 const int dcdy = plane[j].dcdy * 16;
-	 const int cox = plane[j].eo * 16;
-         const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
-         const int cio = ei * 16 - 1;
-
-	 build_masks(c[j] + cox,
-		     cio - cox,
-		     dcdx, dcdy, 
-		     &outmask,   /* sign bits from c[i][0..15] + cox */
-		     &partmask); /* sign bits from c[i][0..15] + cio */
+         const int64_t dcdx = -IMUL64(plane[j].dcdx, 16);
+         const int64_t dcdy = IMUL64(plane[j].dcdy, 16);
+         const int64_t cox = IMUL64(plane[j].eo, 16);
+         const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+         const int64_t cio = IMUL64(ei, 16) - 1;
+
+         build_masks(c[j] + cox,
+                     cio - cox,
+                     dcdx, dcdy,
+                     &outmask,   /* sign bits from c[i][0..15] + cox */
+                     &partmask); /* sign bits from c[i][0..15] + cio */
       }
 
       j++;
@@ -219,12 +219,12 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
       int iy = (i >> 2) * 16;
       int px = x + ix;
       int py = y + iy;
-      int cx[NR_PLANES];
+      int64_t cx[NR_PLANES];
 
       for (j = 0; j < NR_PLANES; j++)
          cx[j] = (c[j]
-		  - plane[j].dcdx * ix
-		  + plane[j].dcdy * iy);
+                  - IMUL64(plane[j].dcdx, ix)
+                  + IMUL64(plane[j].dcdy, iy));
 
       partial_mask &= ~(1 << i);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index e1686ea..9b3321e 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -641,7 +641,7 @@ try_setup_line( struct lp_setup_context *setup,
       /* half-edge constants, will be interated over the whole render
        * target.
        */
-      plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
+      plane[i].c = IMUL64(plane[i].dcdx, x[i]) - IMUL64(plane[i].dcdy, y[i]);
 
       
       /* correct for top-left vs. bottom-left fill convention.  
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 2164f3a..aae48e7 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -42,11 +42,15 @@
 
 #define NUM_CHANNELS 4
 
+#if defined(PIPE_ARCH_AVX2)
+#include <immintrin.h>
+#endif
+
 #if defined(PIPE_ARCH_SSE)
 #include <emmintrin.h>
 #endif
-   
-static INLINE int
+
+static INLINE int64_t
 subpixel_snap(float a)
 {
    return util_iround(FIXED_ONE * a);
@@ -61,13 +65,13 @@ fixed_to_float(int a)
 
 /* Position and area in fixed point coordinates */
 struct fixed_position {
-   int x[4];
-   int y[4];
-   int area;
-   int dx01;
-   int dy01;
-   int dx20;
-   int dy20;
+   int64_t x[4];
+   int64_t y[4];
+   int64_t area;
+   int64_t dx01;
+   int64_t dy01;
+   int64_t dx20;
+   int64_t dy20;
 };
 
 
@@ -362,7 +366,72 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
    plane = GET_PLANES(tri);
 
-#if defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_AVX2)
+   {
+      __m256i vertx, verty;
+      __m256i shufx, shufy;
+      __m256i dcdx, dcdy, c;
+      __m256i unused;
+      __m256i dcdx_neg_mask;
+      __m256i dcdy_neg_mask;
+      __m256i dcdx_zero_mask;
+      __m256i top_left_flag;
+      __m256i c_inc_mask, c_inc;
+      __m256i eo, p0, p1, p2;
+      __m256i zero = _mm256_setzero_si256();
+
+      vertx = _mm256_loadu_si256((__m256i *)position->x); /* vertex x coords */
+      verty = _mm256_loadu_si256((__m256i *)position->y); /* vertex y coords */
+
+      shufx = _mm256_shuffle_epi64(vertx, _MM_SHUFFLE(3,0,2,1));
+      shufy = _mm256_shuffle_epi64(verty, _MM_SHUFFLE(3,0,2,1));
+
+      dcdx = _mm256_sub_epi64(verty, shufy);
+      dcdy = _mm256_sub_epi64(vertx, shufx);
+
+      dcdx_neg_mask = _mm256_srai_epi64(dcdx, FIXED_SHIFT);
+      dcdx_zero_mask = _mm256_cmpeq_epi64(dcdx, zero);
+      dcdy_neg_mask = _mm256_srai_epi64(dcdy, FIXED_SHIFT);
+
+      top_left_flag = _mm256_set1_epi64x(
+         (setup->bottom_edge_rule == 0) ? ~0 : 0);
+
+      c_inc_mask = _mm256_or_si256(
+         dcdx_neg_mask,
+         _mm256_and_si256(dcdx_zero_mask,
+                          _mm256_xor_si256(dcdy_neg_mask,
+                                           top_left_flag)));
+
+      c_inc = _mm256_srli_epi64(c_inc_mask, FIXED_SHIFT);
+
+      c = _mm256_sub_epi64(mm256_mullo_epi64(dcdx, vertx),
+                           mm256_mullo_epi64(dcdy, verty));
+
+      c = _mm256_add_epi64(c, c_inc);
+
+      /* Scale up to match c:
+       */
+      dcdx = _mm256_slli_epi64(dcdx, FIXED_ORDER);
+      dcdy = _mm256_slli_epi64(dcdy, FIXED_ORDER);
+
+      /* Calculate trivial reject values:
+       */
+      eo = _mm256_sub_epi64(_mm256_andnot_si256(dcdy_neg_mask, dcdy),
+                            _mm256_and_si256(dcdx_neg_mask, dcdx));
+
+      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
+
+      /* Pointless transpose which gets undone immediately in
+       * rasterization:
+       */
+      transpose4_epi64(&c, &dcdx, &dcdy, &eo,
+                       &p0, &p1, &p2, &unused);
+
+      _mm256_store_si256((__m256i *)&plane[0], p0);
+      _mm256_store_si256((__m256i *)&plane[1], p1);
+      _mm256_store_si256((__m256i *)&plane[2], p2);
+   }
+#elif defined(PIPE_ARCH_SSE)
    {
       __m128i vertx, verty;
       __m128i shufx, shufy;
@@ -439,7 +508,8 @@ do_triangle_ccw(struct lp_setup_context *setup,
          /* half-edge constants, will be interated over the whole render
           * target.
           */
-         plane[i].c = plane[i].dcdx * position->x[i] - plane[i].dcdy * position->y[i];
+         plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) -
+               IMUL64(plane[i].dcdy, position->y[i]);
 
          /* correct for top-left vs. bottom-left fill convention.
           */         
@@ -476,19 +546,19 @@ do_triangle_ccw(struct lp_setup_context *setup,
 #endif
 
    if (0) {
-      debug_printf("p0: %08x/%08x/%08x/%08x\n",
+      debug_printf("p0: %16lx/%08lx/%08lx/%08lx\n",
                    plane[0].c,
                    plane[0].dcdx,
                    plane[0].dcdy,
                    plane[0].eo);
       
-      debug_printf("p1: %08x/%08x/%08x/%08x\n",
+      debug_printf("p1: %16lx/%08lx/%08lx/%08lx\n",
                    plane[1].c,
                    plane[1].dcdx,
                    plane[1].dcdy,
                    plane[1].eo);
       
-      debug_printf("p0: %08x/%08x/%08x/%08x\n",
+      debug_printf("p2: %16lx/%08lx/%08lx/%08lx\n",
                    plane[2].c,
                    plane[2].dcdx,
                    plane[2].dcdy,
@@ -669,7 +739,7 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
    else
    {
       struct lp_rast_plane *plane = GET_PLANES(tri);
-      int c[MAX_PLANES];
+      int64_t c[MAX_PLANES];
       int ei[MAX_PLANES];
 
       int eo[MAX_PLANES];
@@ -684,8 +754,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
       
       for (i = 0; i < nr_planes; i++) {
          c[i] = (plane[i].c + 
-                 plane[i].dcdy * iy0 * TILE_SIZE - 
-                 plane[i].dcdx * ix0 * TILE_SIZE);
+                 IMUL64(plane[i].dcdy, iy0) * TILE_SIZE -
+                 IMUL64(plane[i].dcdx, ix0) * TILE_SIZE);
 
          ei[i] = (plane[i].dcdy - 
                   plane[i].dcdx - 
@@ -705,22 +775,22 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
        */
       for (y = iy0; y <= iy1; y++)
       {
-	 boolean in = FALSE;  /* are we inside the triangle? */
-	 int cx[MAX_PLANES];
+         boolean in = FALSE;  /* are we inside the triangle? */
+         int64_t cx[MAX_PLANES];
 
          for (i = 0; i < nr_planes; i++)
             cx[i] = c[i];
 
-	 for (x = ix0; x <= ix1; x++)
-	 {
+         for (x = ix0; x <= ix1; x++)
+         {
             int out = 0;
             int partial = 0;
 
             for (i = 0; i < nr_planes; i++) {
-               int planeout = cx[i] + eo[i];
-               int planepartial = cx[i] + ei[i] - 1;
-               out |= (planeout >> 31);
-               partial |= (planepartial >> 31) & (1<<i);
+               int64_t planeout = cx[i] + eo[i];
+               int64_t planepartial = cx[i] + ei[i] - 1;
+               out |= (planeout >> 63);
+               partial |= (planepartial >> 63) & (1<<i);
             }
 
             if (out) {
@@ -730,7 +800,7 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                LP_COUNT(nr_empty_64);
             }
             else if (partial) {
-               /* Not trivially accepted by at least one plane - 
+               /* Not trivially accepted by at least one plane -
                 * rasterize/shade partial tile
                 */
                int count = util_bitcount(partial);
@@ -738,7 +808,7 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                
                if (!lp_scene_bin_cmd_with_state( scene, x, y,
                                                  setup->fs.stored,
-                                                 lp_rast_tri_tab[count], 
+                                                 lp_rast_tri_tab[count],
                                                  lp_rast_arg_triangle(tri, partial) ))
                   goto fail;
 
@@ -752,14 +822,12 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                   goto fail;
             }
 
-	    /* Iterate cx values across the region:
-	     */
+            /* Iterate cx values across the region: */
             for (i = 0; i < nr_planes; i++)
                cx[i] += xstep[i];
-	 }
-      
-	 /* Iterate c values down the region:
-	  */
+         }
+
+         /* Iterate c values down the region: */
          for (i = 0; i < nr_planes; i++)
             c[i] += ystep[i];
       }
@@ -823,7 +891,8 @@ calc_fixed_position( struct lp_setup_context *setup,
    position->dx20 = position->x[2] - position->x[0];
    position->dy20 = position->y[2] - position->y[0];
 
-   position->area = position->dx01 * position->dy20 - position->dx20 * position->dy01;
+   position->area = IMUL64(position->dx01, position->dy20) -
+         IMUL64(position->dx20, position->dy01);
 }