[Mesa-dev] [PATCH] llvmpipe: support 8bit subpixel precision

Thu Nov 21 04:01:19 PST 2013

Great!

A couple of comments inline.

On 11/21/2013 12:02 AM, Zack Rusin wrote:
> 8 bit precision is required by d3d10 but unfortunately
> requires 64 bit rasterizer. This commit implements
> 64 bit rasterization with full support for 8bit subpixel
> precision. It's a combination of all individual commits
> from the llvmpipe-rast-64 branch.
>
> Signed-off-by: Zack Rusin <zackr at vmware.com>
> ---
>   src/gallium/drivers/llvmpipe/lp_rast.c         |  11 ++
>   src/gallium/drivers/llvmpipe/lp_rast.h         |  47 +++++--
>   src/gallium/drivers/llvmpipe/lp_rast_debug.c   |   6 +-
>   src/gallium/drivers/llvmpipe/lp_rast_priv.h    |  27 ++++
>   src/gallium/drivers/llvmpipe/lp_rast_tri.c     | 173 +++++++++++++++++--------
>   src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h |  56 ++++----
>   src/gallium/drivers/llvmpipe/lp_setup_line.c   |   2 +-
>   src/gallium/drivers/llvmpipe/lp_setup_tri.c    | 155 ++++++++++++++--------
>   src/gallium/tests/graw/SConscript              |   1 +
>   src/gallium/tests/graw/tri-large.c             | 173 +++++++++++++++++++++++++
>   10 files changed, 500 insertions(+), 151 deletions(-)
>   create mode 100644 src/gallium/tests/graw/tri-large.c
>
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
> index af661e9..0cd62c2 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast.c
> +++ b/src/gallium/drivers/llvmpipe/lp_rast.c
> @@ -589,6 +589,17 @@ static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] =
>      lp_rast_begin_query,
>      lp_rast_end_query,
>      lp_rast_set_state,
> +   lp_rast_triangle_32_1,
> +   lp_rast_triangle_32_2,
> +   lp_rast_triangle_32_3,
> +   lp_rast_triangle_32_4,
> +   lp_rast_triangle_32_5,
> +   lp_rast_triangle_32_6,
> +   lp_rast_triangle_32_7,
> +   lp_rast_triangle_32_8,
> +   lp_rast_triangle_32_3_4,
> +   lp_rast_triangle_32_3_16,
> +   lp_rast_triangle_32_4_16
>   };
>
>
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
> index 43c598d..b81d94f 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast.h
> +++ b/src/gallium/drivers/llvmpipe/lp_rast.h
> @@ -46,10 +46,11 @@ struct lp_scene;
>   struct lp_fence;
>   struct cmd_bin;
>
> -#define FIXED_TYPE_WIDTH 32
> +#define FIXED_TYPE_WIDTH 64
>   /** For sub-pixel positioning */
> -#define FIXED_ORDER 4
> +#define FIXED_ORDER 8
>   #define FIXED_ONE (1<<FIXED_ORDER)
> +#define FIXED_SHIFT (FIXED_TYPE_WIDTH - 1)
>   /** Maximum length of an edge in a primitive in pixels.
>    *  If the framebuffer is large we have to think about fixed-point
>    *  integer overflow. Coordinates need ((FIXED_TYPE_WIDTH/2) - 1) bits
> @@ -59,11 +60,14 @@ struct cmd_bin;
>    */
>   #define MAX_FIXED_LENGTH (1 << (((FIXED_TYPE_WIDTH/2) - 1) - FIXED_ORDER))
>
> +#define MAX_FIXED_LENGTH32 (1 << (((32/2) - 1) - FIXED_ORDER))
> +
>   /* Rasterizer output size going to jit fs, width/height */
>   #define LP_RASTER_BLOCK_SIZE 4
>
>   #define LP_MAX_ACTIVE_BINNED_QUERIES 16
>
> +#define IMUL64(a, b) (((int64_t)(a)) * ((int64_t)(b)))
>
>   struct lp_rasterizer_task;
>
> @@ -102,18 +106,15 @@ struct lp_rast_shader_inputs {
>      /* followed by a0, dadx, dady and planes[] */
>   };
>
> -/* Note: the order of these values is important as they are loaded by
> - * sse code in rasterization:
> - */
>   struct lp_rast_plane {
>      /* edge function values at minx,miny ?? */
> -   int c;
> +   int64_t c;
>
> -   int dcdx;
> -   int dcdy;
> +   int32_t dcdx;
> +   int32_t dcdy;
>
>      /* one-pixel sized trivial reject offsets for each plane */
> -   int eo;
> +   int64_t eo;
>   };
I'm still not entirely happy that we need 64 bits even for rasterization.
And I'm even more unhappy that we have to deal with pseudo-64bit values even
when we can use the 32bit versions, so we get a metric crapload of scalar
loads. But then again I'm not happy about other things in
rasterization... Both may be avoidable: the 64bit rasterization by doing
some per-tile fixup, and the need for having 64bit structs for 32bit
rasterization by using a separate plane arg for 32bit.
But we can always fix that later. I bet that if you're on a 32bit arch
it will be very slow.

>
>   /**
> @@ -277,8 +278,19 @@ lp_rast_arg_null( void )
>   #define LP_RAST_OP_BEGIN_QUERY       0xf
>   #define LP_RAST_OP_END_QUERY         0x10
>   #define LP_RAST_OP_SET_STATE         0x11
> -
> -#define LP_RAST_OP_MAX               0x12
> +#define LP_RAST_OP_TRIANGLE_32_1     0x12
> +#define LP_RAST_OP_TRIANGLE_32_2     0x13
> +#define LP_RAST_OP_TRIANGLE_32_3     0x14
> +#define LP_RAST_OP_TRIANGLE_32_4     0x15
> +#define LP_RAST_OP_TRIANGLE_32_5     0x16
> +#define LP_RAST_OP_TRIANGLE_32_6     0x17
> +#define LP_RAST_OP_TRIANGLE_32_7     0x18
> +#define LP_RAST_OP_TRIANGLE_32_8     0x19
> +#define LP_RAST_OP_TRIANGLE_32_3_4   0x1a
> +#define LP_RAST_OP_TRIANGLE_32_3_16  0x1b
> +#define LP_RAST_OP_TRIANGLE_32_4_16  0x1c
> +
> +#define LP_RAST_OP_MAX               0x1d
>   #define LP_RAST_OP_MASK              0xff
>
>   void
> @@ -289,4 +301,17 @@ void
>   lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
>
>
> +#ifdef PIPE_ARCH_SSE
> +#include <emmintrin.h>
> +#include "util/u_sse.h"
> +
> +static INLINE __m128i
> +lp_plane_to_m128i(const struct lp_rast_plane *plane)
> +{
> +   return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
> +                         (int32_t)plane->dcdy, (int32_t)plane->eo);
> +}
> +
> +#endif
> +
>   #endif
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_debug.c b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
> index 3bc75aa..587c793 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast_debug.c
> +++ b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
> @@ -195,8 +195,8 @@ debug_triangle(int tilex, int tiley,
>      while (plane_mask) {
>         plane[nr_planes] = tri_plane[u_bit_scan(&plane_mask)];
>         plane[nr_planes].c = (plane[nr_planes].c +
> -                            plane[nr_planes].dcdy * tiley -
> -                            plane[nr_planes].dcdx * tilex);
> +                            IMUL64(plane[nr_planes].dcdy, tiley) -
> +                            IMUL64(plane[nr_planes].dcdx, tilex));
>         nr_planes++;
>      }
>
> @@ -217,7 +217,7 @@ debug_triangle(int tilex, int tiley,
>         }
>
>         for (i = 0; i < nr_planes; i++) {
> -         plane[i].c += plane[i].dcdx * TILE_SIZE;
> +         plane[i].c += IMUL64(plane[i].dcdx, TILE_SIZE);
>            plane[i].c += plane[i].dcdy;
>         }
>      }
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
> index 41fe097..77ec329 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
> +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
> @@ -355,6 +355,33 @@ void lp_rast_triangle_3_16( struct lp_rasterizer_task *,
>   void lp_rast_triangle_4_16( struct lp_rasterizer_task *,
>                               const union lp_rast_cmd_arg );
>
> +
> +void lp_rast_triangle_32_1( struct lp_rasterizer_task *,
> +                         const union lp_rast_cmd_arg );
> +void lp_rast_triangle_32_2( struct lp_rasterizer_task *,
> +                         const union lp_rast_cmd_arg );
> +void lp_rast_triangle_32_3( struct lp_rasterizer_task *,
> +                         const union lp_rast_cmd_arg );
> +void lp_rast_triangle_32_4( struct lp_rasterizer_task *,
> +                         const union lp_rast_cmd_arg );
> +void lp_rast_triangle_32_5( struct lp_rasterizer_task *,
> +                         const union lp_rast_cmd_arg );
> +void lp_rast_triangle_32_6( struct lp_rasterizer_task *,
> +                         const union lp_rast_cmd_arg );
> +void lp_rast_triangle_32_7( struct lp_rasterizer_task *,
> +                         const union lp_rast_cmd_arg );
> +void lp_rast_triangle_32_8( struct lp_rasterizer_task *,
> +                         const union lp_rast_cmd_arg );
> +
> +void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *,
> +			  const union lp_rast_cmd_arg );
> +
> +void lp_rast_triangle_32_3_16( struct lp_rasterizer_task *,
> +                            const union lp_rast_cmd_arg );
> +
> +void lp_rast_triangle_32_4_16( struct lp_rasterizer_task *,
> +                            const union lp_rast_cmd_arg );
> +
>   void
>   lp_rast_set_state(struct lp_rasterizer_task *task,
>                     const union lp_rast_cmd_arg arg);
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> index 5ef070a..41f6fbf 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> @@ -35,9 +35,6 @@
>   #include "lp_perf.h"
>   #include "lp_rast_priv.h"
>
> -
> -
> -
>   /**
>    * Shade all pixels in a 4x4 block.
>    */
> @@ -66,44 +63,42 @@ block_full_16(struct lp_rasterizer_task *task,
>   	 block_full_4(task, tri, x + ix, y + iy);
>   }
>
> -#if !defined(PIPE_ARCH_SSE)
> -
>   static INLINE unsigned
> -build_mask_linear(int c, int dcdx, int dcdy)
> +build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
>   {
> -   int mask = 0;
> -
> -   int c0 = c;
> -   int c1 = c0 + dcdy;
> -   int c2 = c1 + dcdy;
> -   int c3 = c2 + dcdy;
> -
> -   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
> -   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
> -   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
> -   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
> -   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
> -   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
> -   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
> -   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
> -   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
> -   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
> -   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
> -   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
> -   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
> -   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
> -   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
> -   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);
> +   unsigned mask = 0;
> +
> +   int64_t c0 = c;
> +   int64_t c1 = c0 + dcdy;
> +   int64_t c2 = c1 + dcdy;
> +   int64_t c3 = c2 + dcdy;
> +
> +   mask |= ((c0 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 0);
> +   mask |= ((c0 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 1);
> +   mask |= ((c0 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 2);
> +   mask |= ((c0 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 3);
> +   mask |= ((c1 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 4);
> +   mask |= ((c1 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 5);
> +   mask |= ((c1 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 6);
> +   mask |= ((c1 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 7);
> +   mask |= ((c2 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 8);
> +   mask |= ((c2 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 9);
> +   mask |= ((c2 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 10);
> +   mask |= ((c2 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 11);
> +   mask |= ((c3 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 12);
> +   mask |= ((c3 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 13);
> +   mask |= ((c3 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 14);
> +   mask |= ((c3 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 15);
>
>      return mask;
>   }
>
>
>   static INLINE void
> -build_masks(int c,
> -	    int cdiff,
> -	    int dcdx,
> -	    int dcdy,
> +build_masks(int64_t c,
> +            int64_t cdiff,
> +            int64_t dcdx,
> +            int64_t dcdy,
>   	    unsigned *outmask,
>   	    unsigned *partmask)
>   {
> @@ -122,6 +117,13 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
>   }
>
>   void
> +lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
> +                      const union lp_rast_cmd_arg arg)
> +{
> +   lp_rast_triangle_3_16(task, arg);
> +}
> +
> +void
>   lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
>                         const union lp_rast_cmd_arg arg)
>   {
> @@ -131,11 +133,33 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
>      lp_rast_triangle_4(task, arg2);
>   }
>
> +#if !defined(PIPE_ARCH_SSE)
> +
>   void
> -lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
> +lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
> +                         const union lp_rast_cmd_arg arg)
> +{
> +   union lp_rast_cmd_arg arg2;
> +   arg2.triangle.tri = arg.triangle.tri;
> +   arg2.triangle.plane_mask = (1<<3)-1;
> +   lp_rast_triangle_32_3(task, arg2);
> +}
> +
> +void
> +lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
> +                         const union lp_rast_cmd_arg arg)
> +{
> +   union lp_rast_cmd_arg arg2;
> +   arg2.triangle.tri = arg.triangle.tri;
> +   arg2.triangle.plane_mask = (1<<4)-1;
> +   lp_rast_triangle_32_4(task, arg2);
> +}
> +
> +void
> +lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
>                         const union lp_rast_cmd_arg arg)
>   {
> -   lp_rast_triangle_3_16(task, arg);
> +   lp_rast_triangle_32_3_16(task, arg);
>   }
>
>   #else
> @@ -144,12 +168,12 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
>
>
>   static INLINE void
> -build_masks(int c,
> -	    int cdiff,
> -	    int dcdx,
> -	    int dcdy,
> -	    unsigned *outmask,
> -	    unsigned *partmask)
> +build_masks_32(int c,
> +               int cdiff,
> +               int dcdx,
> +               int dcdy,
> +               unsigned *outmask,
> +               unsigned *partmask)
>   {
>      __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
>      __m128i xdcdy = _mm_set1_epi32(dcdy);
> @@ -190,7 +214,7 @@ build_masks(int c,
>
>
>   static INLINE unsigned
> -build_mask_linear(int c, int dcdx, int dcdy)
> +build_mask_linear_32(int c, int dcdx, int dcdy)
>   {
>      __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
>      __m128i xdcdy = _mm_set1_epi32(dcdy);
> @@ -248,7 +272,7 @@ sign_bits4(const __m128i *cstep, int cdiff)
>
>
>   void
> -lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
> +lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
>                         const union lp_rast_cmd_arg arg)
>   {
>      const struct lp_rast_triangle *tri = arg.triangle.tri;
> @@ -260,9 +284,9 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
>      struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
>      unsigned nr = 0;
>
> -   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
> -   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
> -   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
> +   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
> +   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
> +   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
>      __m128i zero = _mm_setzero_si128();
>
>      __m128i c;
> @@ -362,7 +386,7 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
>
>
>   void
> -lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
> +lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
>                        const union lp_rast_cmd_arg arg)
>   {
>      const struct lp_rast_triangle *tri = arg.triangle.tri;
> @@ -370,9 +394,9 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
>      unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
>      unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
>
> -   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
> -   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
> -   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
> +   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
> +   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
> +   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
>      __m128i zero = _mm_setzero_si128();
>
>      __m128i c;
> @@ -450,7 +474,8 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
>   #endif
>
>
> -
> +#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
> +#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
>
>   #define TAG(x) x##_1
>   #define NR_PLANES 1
> @@ -468,7 +493,7 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
>
>   #define TAG(x) x##_4
>   #define NR_PLANES 4
> -#define TRI_16 lp_rast_triangle_4_16
> +/*#define TRI_16 lp_rast_triangle_4_16*/
>   #include "lp_rast_tri_tmp.h"
>
>   #define TAG(x) x##_5
> @@ -487,3 +512,47 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
>   #define NR_PLANES 8
>   #include "lp_rast_tri_tmp.h"
>
> +#ifdef PIPE_ARCH_SSE
> +#undef BUILD_MASKS
> +#undef BUILD_MASK_LINEAR
> +#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
> +#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_32((int)c, dcdx, dcdy)
> +#endif
> +
> +#define TAG(x) x##_32_1
> +#define NR_PLANES 1
> +#include "lp_rast_tri_tmp.h"
> +
> +#define TAG(x) x##_32_2
> +#define NR_PLANES 2
> +#include "lp_rast_tri_tmp.h"
> +
> +#define TAG(x) x##_32_3
> +#define NR_PLANES 3
> +/*#define TRI_4 lp_rast_triangle_3_4*/
> +/*#define TRI_16 lp_rast_triangle_3_16*/
> +#include "lp_rast_tri_tmp.h"
> +
> +#define TAG(x) x##_32_4
> +#define NR_PLANES 4
> +#ifdef PIPE_ARCH_SSE
> +#define TRI_16 lp_rast_triangle_32_4_16
> +#endif
> +#include "lp_rast_tri_tmp.h"
> +
> +#define TAG(x) x##_32_5
> +#define NR_PLANES 5
> +#include "lp_rast_tri_tmp.h"
> +
> +#define TAG(x) x##_32_6
> +#define NR_PLANES 6
> +#include "lp_rast_tri_tmp.h"
> +
> +#define TAG(x) x##_32_7
> +#define NR_PLANES 7
> +#include "lp_rast_tri_tmp.h"
> +
> +#define TAG(x) x##_32_8
> +#define NR_PLANES 8
> +#include "lp_rast_tri_tmp.h"
> +
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
> index 4825d65..52f6e99 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
> +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
> @@ -44,13 +44,13 @@ TAG(do_block_4)(struct lp_rasterizer_task *task,
>                   const struct lp_rast_triangle *tri,
>                   const struct lp_rast_plane *plane,
>                   int x, int y,
> -                const int *c)
> +                const int64_t *c)
>   {
>      unsigned mask = 0xffff;
>      int j;
>
>      for (j = 0; j < NR_PLANES; j++) {
> -      mask &= ~build_mask_linear(c[j] - 1,
> +      mask &= ~BUILD_MASK_LINEAR(c[j] - 1,
>   				 -plane[j].dcdx,
>   				 plane[j].dcdy);
>      }
> @@ -70,7 +70,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
>                    const struct lp_rast_triangle *tri,
>                    const struct lp_rast_plane *plane,
>                    int x, int y,
> -                 const int *c)
> +                 const int64_t *c)
>   {
>      unsigned outmask, inmask, partmask, partial_mask;
>      unsigned j;
> @@ -79,13 +79,13 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
>      partmask = 0;                /* outside one or more trivial accept planes */
>
>      for (j = 0; j < NR_PLANES; j++) {
> -      const int dcdx = -plane[j].dcdx * 4;
> -      const int dcdy = plane[j].dcdy * 4;
> -      const int cox = plane[j].eo * 4;
> -      const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
> -      const int cio = ei * 4 - 1;
> +      const int64_t dcdx = -IMUL64(plane[j].dcdx, 4);
> +      const int64_t dcdy = IMUL64(plane[j].dcdy, 4);
> +      const int64_t cox = IMUL64(plane[j].eo, 4);
> +      const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
> +      const int64_t cio = IMUL64(ei, 4) - 1;
>
> -      build_masks(c[j] + cox,
> +      BUILD_MASKS(c[j] + cox,
>   		  cio - cox,
>   		  dcdx, dcdy,
>   		  &outmask,   /* sign bits from c[i][0..15] + cox */
> @@ -116,7 +116,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
>         int iy = (i >> 2) * 4;
>         int px = x + ix;
>         int py = y + iy;
> -      int cx[NR_PLANES];
> +      int64_t cx[NR_PLANES];
>
>         partial_mask &= ~(1 << i);
>
> @@ -124,8 +124,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
>
>         for (j = 0; j < NR_PLANES; j++)
>            cx[j] = (c[j]
> -		  - plane[j].dcdx * ix
> -		  + plane[j].dcdy * iy);
> +                  - IMUL64(plane[j].dcdx, ix)
> +                  + IMUL64(plane[j].dcdy, iy));
>
>         TAG(do_block_4)(task, tri, plane, px, py, cx);
>      }
> @@ -160,7 +160,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
>      const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
>      const int x = task->x, y = task->y;
>      struct lp_rast_plane plane[NR_PLANES];
> -   int c[NR_PLANES];
> +   int64_t c[NR_PLANES];
>      unsigned outmask, inmask, partmask, partial_mask;
>      unsigned j = 0;
>
> @@ -176,20 +176,20 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
>         int i = ffs(plane_mask) - 1;
>         plane[j] = tri_plane[i];
>         plane_mask &= ~(1 << i);
> -      c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
> +      c[j] = plane[j].c + IMUL64(plane[j].dcdy, y) - IMUL64(plane[j].dcdx, x);
>
>         {
> -	 const int dcdx = -plane[j].dcdx * 16;
> -	 const int dcdy = plane[j].dcdy * 16;
> -	 const int cox = plane[j].eo * 16;
> -         const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
> -         const int cio = ei * 16 - 1;
> -
> -	 build_masks(c[j] + cox,
> -		     cio - cox,
> -		     dcdx, dcdy,
> -		     &outmask,   /* sign bits from c[i][0..15] + cox */
> -		     &partmask); /* sign bits from c[i][0..15] + cio */
> +         const int64_t dcdx = -IMUL64(plane[j].dcdx, 16);
> +         const int64_t dcdy = IMUL64(plane[j].dcdy, 16);
> +         const int64_t cox = IMUL64(plane[j].eo, 16);
> +         const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
> +         const int64_t cio = IMUL64(ei, 16) - 1;
> +
> +         BUILD_MASKS(c[j] + cox,
> +                     cio - cox,
> +                     dcdx, dcdy,
> +                     &outmask,   /* sign bits from c[i][0..15] + cox */
> +                     &partmask); /* sign bits from c[i][0..15] + cio */
>         }
>
>         j++;
> @@ -219,12 +219,12 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
>         int iy = (i >> 2) * 16;
>         int px = x + ix;
>         int py = y + iy;
> -      int cx[NR_PLANES];
> +      int64_t cx[NR_PLANES];
>
>         for (j = 0; j < NR_PLANES; j++)
>            cx[j] = (c[j]
> -		  - plane[j].dcdx * ix
> -		  + plane[j].dcdy * iy);
> +                  - IMUL64(plane[j].dcdx, ix)
> +                  + IMUL64(plane[j].dcdy, iy));
>
>         partial_mask &= ~(1 << i);
>
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
> index e1686ea..9b3321e 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
> @@ -641,7 +641,7 @@ try_setup_line( struct lp_setup_context *setup,
>         /* half-edge constants, will be interated over the whole render
>          * target.
>          */
> -      plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
> +      plane[i].c = IMUL64(plane[i].dcdx, x[i]) - IMUL64(plane[i].dcdy, y[i]);
>
>
>         /* correct for top-left vs. bottom-left fill convention.
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> index 2164f3a..1507a5c 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> @@ -40,13 +40,15 @@
>   #include "lp_state_setup.h"
>   #include "lp_context.h"
>
> +#include <inttypes.h>
> +
>   #define NUM_CHANNELS 4
>
>   #if defined(PIPE_ARCH_SSE)
>   #include <emmintrin.h>
>   #endif
> -
> -static INLINE int
> +
> +static INLINE int64_t
>   subpixel_snap(float a)
>   {
>      return util_iround(FIXED_ONE * a);
> @@ -61,13 +63,13 @@ fixed_to_float(int a)
>
>   /* Position and area in fixed point coordinates */
>   struct fixed_position {
> -   int x[4];
> -   int y[4];
> -   int area;
> -   int dx01;
> -   int dy01;
> -   int dx20;
> -   int dy20;
> +   int64_t x[4];
> +   int64_t y[4];
> +   int64_t area;
> +   int64_t dx01;
> +   int64_t dy01;
> +   int64_t dx20;
> +   int64_t dy20;
>   };
I think these really all should be int (or int32_t) except area. 
Otherwise you're just forcing unnecessary 64bit arithmetic (might not be 
that much slower on a 64bit arch, but still I can't see why).

>
>
> @@ -187,6 +189,19 @@ lp_rast_tri_tab[MAX_PLANES+1] = {
>      LP_RAST_OP_TRIANGLE_8
>   };
>
> +static unsigned
> +lp_rast_32_tri_tab[MAX_PLANES+1] = {
> +   0,               /* should be impossible */
> +   LP_RAST_OP_TRIANGLE_32_1,
> +   LP_RAST_OP_TRIANGLE_32_2,
> +   LP_RAST_OP_TRIANGLE_32_3,
> +   LP_RAST_OP_TRIANGLE_32_4,
> +   LP_RAST_OP_TRIANGLE_32_5,
> +   LP_RAST_OP_TRIANGLE_32_6,
> +   LP_RAST_OP_TRIANGLE_32_7,
> +   LP_RAST_OP_TRIANGLE_32_8
> +};
> +
>
>
>   /**
> @@ -363,7 +378,10 @@ do_triangle_ccw(struct lp_setup_context *setup,
>      plane = GET_PLANES(tri);
>
>   #if defined(PIPE_ARCH_SSE)
> -   {
> +   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
> +       setup->fb.height <= MAX_FIXED_LENGTH32 &&
> +       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
> +       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
>         __m128i vertx, verty;
>         __m128i shufx, shufy;
>         __m128i dcdx, dcdy, c;
> @@ -375,9 +393,12 @@ do_triangle_ccw(struct lp_setup_context *setup,
>         __m128i c_inc_mask, c_inc;
>         __m128i eo, p0, p1, p2;
>         __m128i zero = _mm_setzero_si128();
> +      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
>
> -      vertx = _mm_loadu_si128((__m128i *)position->x); /* vertex x coords */
> -      verty = _mm_loadu_si128((__m128i *)position->y); /* vertex y coords */
> +      vertx = _mm_setr_epi32((int32_t)position->x[0], (int32_t)position->x[1],
> +                             (int32_t)position->x[2], (int32_t)position->x[3]);
> +      verty = _mm_setr_epi32((int32_t)position->y[0], (int32_t)position->y[1],
> +                             (int32_t)position->y[2], (int32_t)position->y[3]);
If you'd left those as 32bit values, you wouldn't need this. That said,
since it apparently only works if not only the tri is small but the fb
is very small too, the code is entirely useless as is anyway.
I think though you could get rid of the small fb requirement very
easily, because only the two multiplication results can be large, but
not the subtraction of these two afterwards (otherwise 32bit
rasterization would never work either), and nothing else either. Right
now the multiplication we do there is 32bit/32bit->64bit anyway and then
shuffled back to 32bit, so you could simply do the subtraction with the
64bit values (and luckily we have 64bit sse2 subtraction) before
shuffling things back.
I could be wrong though...
(And actually, it looks a bit inefficient anyway, because we duplicate
the calc_fixed_position calculations here, though for all planes, not
just one, but we need the earlier calculation for rotation, culling etc.
I guess that could be made quite a bit more efficient, but that would be
another project. Worse, we do just about the same calculations another
time in the jitted setup code, though with floats, and draw does it a
fourth time...).

>
>         shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
>         shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));
> @@ -421,11 +442,20 @@ do_triangle_ccw(struct lp_setup_context *setup,
>         transpose4_epi32(&c, &dcdx, &dcdy, &eo,
>                          &p0, &p1, &p2, &unused);
>
> -      _mm_store_si128((__m128i *)&plane[0], p0);
> -      _mm_store_si128((__m128i *)&plane[1], p1);
> -      _mm_store_si128((__m128i *)&plane[2], p2);
> -   }
> -#else
> +#define STORE_PLANE(plane, vec) do {                 \
> +         _mm_store_si128((__m128i *)&temp_vec, vec); \
> +         plane.c    = (int64_t)temp_vec[0];          \
> +         plane.dcdx = temp_vec[1];                   \
> +         plane.dcdy = temp_vec[2];                   \
> +         plane.eo   = temp_vec[3];                   \
> +      } while(0)
> +
> +      STORE_PLANE(plane[0], p0);
> +      STORE_PLANE(plane[1], p1);
> +      STORE_PLANE(plane[2], p2);
> +#undef STORE_PLANE
> +   } else
> +#endif
>      {
>         int i;
>         plane[0].dcdy = position->dx01;
> @@ -439,7 +469,8 @@ do_triangle_ccw(struct lp_setup_context *setup,
>            /* half-edge constants, will be interated over the whole render
>             * target.
>             */
> -         plane[i].c = plane[i].dcdx * position->x[i] - plane[i].dcdy * position->y[i];
> +         plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) -
> +               IMUL64(plane[i].dcdy, position->y[i]);
>
>            /* correct for top-left vs. bottom-left fill convention.
>             */
> @@ -460,8 +491,12 @@ do_triangle_ccw(struct lp_setup_context *setup,
>               }
>            }
>
> -         plane[i].dcdx *= FIXED_ONE;
> -         plane[i].dcdy *= FIXED_ONE;
> +         /* Scale up to match c:
> +          */
> +         assert((plane[i].dcdx << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdx);
> +         assert((plane[i].dcdy << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdy);
> +         plane[i].dcdx <<= FIXED_ORDER;
> +         plane[i].dcdy <<= FIXED_ORDER;
>
>            /* find trivial reject offsets for each edge for a single-pixel
>             * sized block.  These will be scaled up at each recursive level to
> @@ -473,22 +508,21 @@ do_triangle_ccw(struct lp_setup_context *setup,
>            if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
>         }
>      }
> -#endif
>
>      if (0) {
> -      debug_printf("p0: %08x/%08x/%08x/%08x\n",
> +      debug_printf("p0: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
Every time I see such printfs, my eyes really hurt. That is godawful
ugly. Definitely not your fault though :-).

>                      plane[0].c,
>                      plane[0].dcdx,
>                      plane[0].dcdy,
>                      plane[0].eo);
>
> -      debug_printf("p1: %08x/%08x/%08x/%08x\n",
> +      debug_printf("p1: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
>                      plane[1].c,
>                      plane[1].dcdx,
>                      plane[1].dcdy,
>                      plane[1].eo);
>
> -      debug_printf("p0: %08x/%08x/%08x/%08x\n",
> +      debug_printf("p2: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
>                      plane[2].c,
>                      plane[2].dcdx,
>                      plane[2].dcdy,
> @@ -578,7 +612,6 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
>      struct lp_scene *scene = setup->scene;
>      struct u_rect trimmed_box = *bbox;
>      int i;
> -
>      /* What is the largest power-of-two boundary this triangle crosses:
>       */
>      int dx = floor_pot((bbox->x0 ^ bbox->x1) |
> @@ -587,8 +620,10 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
>      /* The largest dimension of the rasterized area of the triangle
>       * (aligned to a 4x4 grid), rounded down to the nearest power of two:
>       */
> -   int sz = floor_pot((bbox->x1 - (bbox->x0 & ~3)) |
> -		      (bbox->y1 - (bbox->y0 & ~3)));
> +   int max_sz = ((bbox->x1 - (bbox->x0 & ~3)) |
> +                 (bbox->y1 - (bbox->y0 & ~3)));
> +   int sz = floor_pot(max_sz);
> +   boolean use_32bits = max_sz <= MAX_FIXED_LENGTH32;
I am wondering whether "less than or equal" is actually correct here. Might well be though.

>
>      /* Now apply scissor, etc to the bounding box.  Could do this
>       * earlier, but it confuses the logic for tri-16 and would force
> @@ -619,6 +654,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
>               assert(py + 4 <= TILE_SIZE);
>               return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
>                                                   setup->fs.stored,
> +                                                use_32bits ?
> +                                                LP_RAST_OP_TRIANGLE_32_3_4 :
>                                                   LP_RAST_OP_TRIANGLE_3_4,
>                                                   lp_rast_arg_triangle_contained(tri, px, py) );
>            }
> @@ -641,6 +678,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
>
>               return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
>                                                   setup->fs.stored,
> +                                                use_32bits ?
> +                                                LP_RAST_OP_TRIANGLE_32_3_16 :
>                                                   LP_RAST_OP_TRIANGLE_3_16,
>                                                   lp_rast_arg_triangle_contained(tri, px, py) );
>            }
> @@ -655,6 +694,8 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
>
>            return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
>                                               setup->fs.stored,
> +                                            use_32bits ?
> +                                            LP_RAST_OP_TRIANGLE_32_4_16 :
>                                               LP_RAST_OP_TRIANGLE_4_16,
>                                               lp_rast_arg_triangle_contained(tri, px, py));
>         }
> @@ -662,19 +703,20 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
>
>         /* Triangle is contained in a single tile:
>          */
> -      return lp_scene_bin_cmd_with_state( scene, ix0, iy0, setup->fs.stored,
> -                                          lp_rast_tri_tab[nr_planes],
> -                                          lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );
> +      return lp_scene_bin_cmd_with_state(
> +         scene, ix0, iy0, setup->fs.stored,
> +         use_32bits ? lp_rast_32_tri_tab[nr_planes] : lp_rast_tri_tab[nr_planes],
> +         lp_rast_arg_triangle(tri, (1<<nr_planes)-1));
>      }
>      else
>      {
>         struct lp_rast_plane *plane = GET_PLANES(tri);
> -      int c[MAX_PLANES];
> -      int ei[MAX_PLANES];
> +      int64_t c[MAX_PLANES];
> +      int64_t ei[MAX_PLANES];
>
> -      int eo[MAX_PLANES];
> -      int xstep[MAX_PLANES];
> -      int ystep[MAX_PLANES];
> +      int64_t eo[MAX_PLANES];
> +      int64_t xstep[MAX_PLANES];
> +      int64_t ystep[MAX_PLANES];
>         int x, y;
>
>         int ix0 = trimmed_box.x0 / TILE_SIZE;
> @@ -684,16 +726,16 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
>
>         for (i = 0; i < nr_planes; i++) {
>            c[i] = (plane[i].c +
> -                 plane[i].dcdy * iy0 * TILE_SIZE -
> -                 plane[i].dcdx * ix0 * TILE_SIZE);
> +                 IMUL64(plane[i].dcdy, iy0) * TILE_SIZE -
> +                 IMUL64(plane[i].dcdx, ix0) * TILE_SIZE);
>
>            ei[i] = (plane[i].dcdy -
>                     plane[i].dcdx -
>                     plane[i].eo) << TILE_ORDER;
>
>            eo[i] = plane[i].eo << TILE_ORDER;
> -         xstep[i] = -(plane[i].dcdx << TILE_ORDER);
> -         ystep[i] = plane[i].dcdy << TILE_ORDER;
> +         xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER);
> +         ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER;
>         }
>
>
> @@ -705,22 +747,22 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
>          */
>         for (y = iy0; y <= iy1; y++)
>         {
> -	 boolean in = FALSE;  /* are we inside the triangle? */
> -	 int cx[MAX_PLANES];
> +         boolean in = FALSE;  /* are we inside the triangle? */
> +         int64_t cx[MAX_PLANES];
>
>            for (i = 0; i < nr_planes; i++)
>               cx[i] = c[i];
>
> -	 for (x = ix0; x <= ix1; x++)
> -	 {
> +         for (x = ix0; x <= ix1; x++)
> +         {
>               int out = 0;
>               int partial = 0;
>
>               for (i = 0; i < nr_planes; i++) {
> -               int planeout = cx[i] + eo[i];
> -               int planepartial = cx[i] + ei[i] - 1;
> -               out |= (planeout >> 31);
> -               partial |= (planepartial >> 31) & (1<<i);
> +               int64_t planeout = cx[i] + eo[i];
> +               int64_t planepartial = cx[i] + ei[i] - 1;
> +               out |= (planeout >> 63);
> +               partial |= (planepartial >> 63) & (1<<i);
>               }
>
>               if (out) {
> @@ -730,7 +772,7 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
>                  LP_COUNT(nr_empty_64);
>               }
>               else if (partial) {
> -               /* Not trivially accepted by at least one plane -
> +               /* Not trivially accepted by at least one plane -
>                   * rasterize/shade partial tile
>                   */
>                  int count = util_bitcount(partial);
> @@ -738,7 +780,9 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
>
>                  if (!lp_scene_bin_cmd_with_state( scene, x, y,
>                                                    setup->fs.stored,
> -                                                 lp_rast_tri_tab[count],
> +                                                 use_32bits ?
> +                                                 lp_rast_32_tri_tab[count] :
> +                                                 lp_rast_tri_tab[count],
>                                                    lp_rast_arg_triangle(tri, partial) ))
>                     goto fail;
>
> @@ -752,14 +796,12 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
>                     goto fail;
>               }
>
> -	    /* Iterate cx values across the region:
> -	     */
> +            /* Iterate cx values across the region: */
>               for (i = 0; i < nr_planes; i++)
>                  cx[i] += xstep[i];
> -	 }
> -
> -	 /* Iterate c values down the region:
> -	  */
> +         }
> +
> +         /* Iterate c values down the region: */
>            for (i = 0; i < nr_planes; i++)
>               c[i] += ystep[i];
>         }
> @@ -823,7 +865,8 @@ calc_fixed_position( struct lp_setup_context *setup,
>      position->dx20 = position->x[2] - position->x[0];
>      position->dy20 = position->y[2] - position->y[0];
>
> -   position->area = position->dx01 * position->dy20 - position->dx20 * position->dy01;
> +   position->area = IMUL64(position->dx01, position->dy20) -
> +         IMUL64(position->dx20, position->dy01);
>   }
>
>
> diff --git a/src/gallium/tests/graw/SConscript b/src/gallium/tests/graw/SConscript
> index 8740ff3..8723807 100644
> --- a/src/gallium/tests/graw/SConscript
> +++ b/src/gallium/tests/graw/SConscript
> @@ -29,6 +29,7 @@ progs = [
>       'tex-srgb',
>       'tex-swizzle',
>       'tri',
> +    'tri-large',
>       'tri-gs',
>       'tri-instanced',
>       'vs-test',
> diff --git a/src/gallium/tests/graw/tri-large.c b/src/gallium/tests/graw/tri-large.c
> new file mode 100644
> index 0000000..3fbbfb3
> --- /dev/null
> +++ b/src/gallium/tests/graw/tri-large.c
> @@ -0,0 +1,173 @@
> +/* Display a cleared blue window.  This demo has no dependencies on
> + * any utility code, just the graw interface and gallium.
> + */
> +
> +#include "graw_util.h"
> +#include "util/u_debug.h"
> +
> +#include <stdio.h>
> +
> +static struct graw_info info;
> +
> +static const int WIDTH = 4*2048;
> +static const int HEIGHT = 4*2048;
> +
> +
> +struct vertex {
> +   float position[4];
> +   float color[4];
> +};
> +
> +static boolean FlatShade = FALSE;
> +
> +
> +static struct vertex vertices[3] =
> +{
> +   {
> +      { -1.0f, -1.0f, 0.0f, 1.0f },
> +      { 1.0f, 0.0f, 0.0f, 1.0f }
> +   },
> +   {
> +      { -1.0f, 1.0f, 0.0f, 1.0f },
> +      { 0.0f, 1.0f, 0.0f, 1.0f }
> +   },
> +   {
> +      { 1.0f, 1.0f, 0.0f, 1.0f },
> +      { 0.0f, 0.0f, 1.0f, 1.0f }
> +   }
> +};
> +
> +
> +static void set_vertices( void )
> +{
> +   struct pipe_vertex_element ve[2];
> +   struct pipe_vertex_buffer vbuf;
> +   void *handle;
> +
> +   memset(ve, 0, sizeof ve);
> +
> +   ve[0].src_offset = Offset(struct vertex, position);
> +   ve[0].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
> +   ve[1].src_offset = Offset(struct vertex, color);
> +   ve[1].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
> +
> +   handle = info.ctx->create_vertex_elements_state(info.ctx, 2, ve);
> +   info.ctx->bind_vertex_elements_state(info.ctx, handle);
> +
> +   memset(&vbuf, 0, sizeof vbuf);
> +
> +   vbuf.stride = sizeof( struct vertex );
> +   vbuf.buffer_offset = 0;
> +   vbuf.buffer = pipe_buffer_create_with_data(info.ctx,
> +                                              PIPE_BIND_VERTEX_BUFFER,
> +                                              PIPE_USAGE_STATIC,
> +                                              sizeof(vertices),
> +                                              vertices);
> +
> +   info.ctx->set_vertex_buffers(info.ctx, 0, 1, &vbuf);
> +}
> +
> +
> +static void set_vertex_shader( void )
> +{
> +   void *handle;
> +   const char *text =
> +      "VERT\n"
> +      "DCL IN[0]\n"
> +      "DCL IN[1]\n"
> +      "DCL OUT[0], POSITION\n"
> +      "DCL OUT[1], COLOR\n"
> +      "  0: MOV OUT[1], IN[1]\n"
> +      "  1: MOV OUT[0], IN[0]\n"
> +      "  2: END\n";
> +
> +   handle = graw_parse_vertex_shader(info.ctx, text);
> +   info.ctx->bind_vs_state(info.ctx, handle);
> +}
> +
> +
> +static void set_fragment_shader( void )
> +{
> +   void *handle;
> +   const char *text =
> +      "FRAG\n"
> +      "DCL IN[0], COLOR, LINEAR\n"
> +      "DCL OUT[0], COLOR\n"
> +      "  0: MOV OUT[0], IN[0]\n"
> +      "  1: END\n";
> +
> +   handle = graw_parse_fragment_shader(info.ctx, text);
> +   info.ctx->bind_fs_state(info.ctx, handle);
> +}
> +
> +
> +static void draw( void )
> +{
> +   union pipe_color_union clear_color = { {1,0,1,1} };
> +
> +   info.ctx->clear(info.ctx, PIPE_CLEAR_COLOR, &clear_color, 0, 0);
> +   util_draw_arrays(info.ctx, PIPE_PRIM_TRIANGLES, 0, 3);
> +   info.ctx->flush(info.ctx, NULL, 0);
> +
> +   graw_save_surface_to_file(info.ctx, info.color_surf[0], NULL);
> +
> +   graw_util_flush_front(&info);
> +}
> +
> +
> +static void init( void )
> +{
> +   if (!graw_util_create_window(&info, WIDTH, HEIGHT, 1, FALSE))
> +      exit(1);
> +
> +   graw_util_default_state(&info, FALSE);
> +
> +   {
> +      struct pipe_rasterizer_state rasterizer;
> +      void *handle;
> +      memset(&rasterizer, 0, sizeof rasterizer);
> +      rasterizer.cull_face = PIPE_FACE_NONE;
> +      rasterizer.half_pixel_center = 1;
> +      rasterizer.bottom_edge_rule = 1;
> +      rasterizer.flatshade = FlatShade;
> +      rasterizer.depth_clip = 1;
> +      handle = info.ctx->create_rasterizer_state(info.ctx, &rasterizer);
> +      info.ctx->bind_rasterizer_state(info.ctx, handle);
> +   }
> +
> +
> +   graw_util_viewport(&info, 0, 0, WIDTH, HEIGHT, 30, 1000);
> +
> +   set_vertices();
> +   set_vertex_shader();
> +   set_fragment_shader();
> +}
> +
> +static void args(int argc, char *argv[])
> +{
> +   int i;
> +
> +   for (i = 1; i < argc; ) {
> +      if (graw_parse_args(&i, argc, argv)) {
> +         /* ok */
> +      }
> +      else if (strcmp(argv[i], "-f") == 0) {
> +         FlatShade = TRUE;
> +         i++;
> +      }
> +      else {
> +         printf("Invalid arg %s\n", argv[i]);
> +         exit(1);
> +      }
> +   }
> +}
> +
> +int main( int argc, char *argv[] )
> +{
> +   args(argc, argv);
> +   init();
> +
> +   graw_set_display_func( draw );
> +   graw_main_loop();
> +   return 0;
> +}
>

Otherwise this looks good to me, though maybe the tri subdivision
code could be removed in a follow-up commit?

Roland