xf86-video-intel: Branch 'exa-i965' - 11 commits - src/exa_sf.g4a src/exa_sf_mask.g4a src/exa_sf_mask_prog.h src/exa_sf_prog.h src/exa_wm_masknoca.g4a src/exa_wm_masknoca_prog.h src/exa_wm_nomask.g4a src/exa_wm_nomask_prog.h src/i830_exa.c src/i830.h src/i830_memory.c src/i965_composite_wm_nomask.g4a src/i965_composite_wm_nomask.h src/i965_exa_render.c src/Makefile.am

Wed Nov 29 11:20:43 EET 2006

src/Makefile.am            |    4 
 src/exa_sf.g4a             |   17 +
 src/exa_sf_mask.g4a        |   53 +++++
 src/exa_sf_mask_prog.h     |   25 ++
 src/exa_sf_prog.h          |   17 +
 src/exa_wm_masknoca.g4a    |  202 +++++++++++++++++++++
 src/exa_wm_masknoca_prog.h |   95 +++++++++
 src/exa_wm_nomask.g4a      |    8 
 src/exa_wm_nomask_prog.h   |    4 
 src/i830.h                 |    2 
 src/i830_exa.c             |    7 
 src/i830_memory.c          |   36 +++
 src/i965_exa_render.c      |  430 +++++++++++++++++++++++----------------------
 13 files changed, 678 insertions(+), 222 deletions(-)

New commits:
diff-tree db391e8e4c4d87bfe3ccad0de14dd5b47b69b8fe (from 290f15cd4cda97727ebcaadacbbbf7650278934b)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 17:16:46 2006 +0800

    shut up warning

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 6f2bc84..2d1ce5f 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -1011,10 +1011,8 @@ I965EXAComposite(PixmapPtr pDst, int src
 
     srcXend = srcX + w;
     srcYend = srcY + h;
-    if (pMask) {
-        maskXend = maskX + w;
-        maskYend = maskY + h;
-    }
+    maskXend = maskX + w;
+    maskYend = maskY + h;
     if (is_transform[0]) {
         v.vector[0] = IntToxFixed(srcX);
         v.vector[1] = IntToxFixed(srcY);
diff-tree 290f15cd4cda97727ebcaadacbbbf7650278934b (from 3d4edd325f3859c749ee42df102bb4239eac5287)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 17:14:55 2006 +0800

    fix alpha blending state

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index c4a3f97..6f2bc84 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -562,21 +562,26 @@ ErrorF("i965 prepareComposite\n");
    cc_state->cc0.stencil_enable = 0;   /* disable stencil */
    cc_state->cc2.depth_test = 0;       /* disable depth test */
    cc_state->cc2.logicop_enable = 0;   /* disable logic op */
-   cc_state->cc3.ia_blend_enable = 0;  /* blend alpha just like colors */
+   cc_state->cc3.ia_blend_enable = 1;  /* use the separate ia_* alpha blend state set up below */
    cc_state->cc3.blend_enable = 1;     /* enable color blend */
    cc_state->cc3.alpha_test = 0;       /* disable alpha test */
    cc_state->cc4.cc_viewport_state_offset = (state_base_offset + cc_viewport_offset) >> 5;
    cc_state->cc5.dither_enable = 0;    /* disable dither */
-//   cc_state->cc5.logicop_func = 0xc;   /* COPY */
-//   cc_state->cc5.statistics_enable = 1;
-//   cc_state->cc5.ia_blend_function = BRW_BLENDFUNCTION_ADD;
-//   cc_state->cc5.ia_src_blend_factor = BRW_BLENDFACTOR_ONE;
-//   cc_state->cc5.ia_dest_blend_factor = BRW_BLENDFACTOR_ONE;
-   cc_state->cc6.blend_function = BRW_BLENDFUNCTION_ADD;
+   cc_state->cc5.logicop_func = 0xc;   /* COPY */
+   cc_state->cc5.statistics_enable = 1;
+   cc_state->cc5.ia_blend_function = BRW_BLENDFUNCTION_ADD;
    I965GetBlendCntl(op, pMaskPicture, pDstPicture->format, 
 		    &src_blend, &dst_blend);
+   /* XXX: alpha blend factor should be same as color, but check
+	   for CA case in future */
+   cc_state->cc5.ia_src_blend_factor = src_blend;
+   cc_state->cc5.ia_dest_blend_factor = dst_blend;
+   cc_state->cc6.blend_function = BRW_BLENDFUNCTION_ADD;
    cc_state->cc6.src_blend_factor = src_blend;
    cc_state->cc6.dest_blend_factor = dst_blend;
+   cc_state->cc6.clamp_post_alpha_blend = 1; 
+   cc_state->cc6.clamp_pre_alpha_blend = 1; 
+   cc_state->cc6.clamp_range = 0;  /* clamp range [0,1] */
 
    /* Upload system kernel */
    memcpy (sip_kernel, sip_kernel_static, sizeof (sip_kernel_static));
diff-tree 3d4edd325f3859c749ee42df102bb4239eac5287 (from a704120b15efae47344a90d972e7f3da64a202a6)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 17:05:32 2006 +0800

    Add in sf/wm program for mask picture without CA

diff --git a/src/exa_sf_mask.g4a b/src/exa_sf_mask.g4a
new file mode 100644
index 0000000..ab519ce
--- /dev/null
+++ b/src/exa_sf_mask.g4a
@@ -0,0 +1,53 @@
+
+/* FIXME how to set up the second coefficient for the mask tex coord */
+
+/* 
+   g3 (v0) { u0, v0, 1.0, 1.0 }  ==> {u0, v0, 1.0, 1.0, mu0, mv0, 1.0, 1.0}  Co[0](u0) Co[1](v0) Co[2](mu0) Co[3](mv0)
+   g4 (v1) { u1, v1, 1.0, 1.0 }  ==> {u1, v1, 1.0, 1.0, mu1, mv1, 1.0, 1.0}
+   g5 (v2) { u2, v2 }  ==> {u2, v2, mu2, mv2}
+   g6      { 1/(x1-x0), 1/(y1-y0) }
+   g7      { u1-u0, v1-v0, 0, 0}  ==>{u1-u0, v1-v0,0, 0, mu1-mu0, mv1-mv0, 0, 0}
+	   -> { (u1-u0)/(x1-x0), (v1-v0)/(y1-y0) }  ==>{(u1-u0)/(x1-x0), (v1-v0)/(y1-y0),(mu1-mu0)/(x1-x0), (mv1-mv0)/(y1-y0)
+		Cx,		 Cy 			Cx[0],		 Cy[0],		 Cx[1], 	    Cy[1]
+ */
+
+/* assign Cx[0], Cx[1] to src, same to Cy, Co 
+          Cx[2], Cx[3] to mask, same to Cy, Co */
+
+send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
+/* Cx[0] */
+mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
+/* Cy[0] */
+mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
+/* Cx[2] */
+mul (1) g7.16<1>F g7.16<0,1,0>F g6<0,1,0>F { align1 };
+/* Cy[2] */
+mul (1) g7.20<1>F g7.20<0,1,0>F g6.4<0,1,0>F { align1 };
+
+/* src Cx[0], Cx[1] */
+mov (8) m1<1>F g7<0,1,0>F { align1 };
+/* mask Cx[2], Cx[3] */
+mov (1) m1.8<1>F g7.16<0,1,0>F { align1 };
+mov (1) m1.12<1>F g7.16<0,1,0>F { align1 };
+/* src Cy[0], Cy[1] */
+mov (8) m2<1>F g7.4<0,1,0>F { align1 };
+/* mask Cy[2], Cy[3] */
+mov (1) m2.8<1>F g7.20<0,1,0>F { align1 };
+mov (1) m2.12<1>F g7.20<0,1,0>F { align1 };
+/* src Co[0], Co[1] */
+mov (8) m3<1>F g3<8,8,1>F { align1 };
+/* mask Co[2], Co[3] */
+mov (1) m3.8<1>F g3.16<0,1,0>F { align1 };
+mov (1) m3.12<1>F g3.20<0,1,0>F { align1 };
+
+send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/exa_sf_mask_prog.h b/src/exa_sf_mask_prog.h
new file mode 100644
index 0000000..cd7f460
--- /dev/null
+++ b/src/exa_sf_mask_prog.h
@@ -0,0 +1,25 @@
+   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
+   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
+   { 0x00000041, 0x20f077bd, 0x000000f0, 0x000000c0 },
+   { 0x00000041, 0x20f477bd, 0x000000f4, 0x000000c4 },
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+   { 0x00000001, 0x202803be, 0x000000f0, 0x00000000 },
+   { 0x00000001, 0x202c03be, 0x000000f0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+   { 0x00000001, 0x204803be, 0x000000f4, 0x00000000 },
+   { 0x00000001, 0x204c03be, 0x000000f4, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00000001, 0x206803be, 0x00000070, 0x00000000 },
+   { 0x00000001, 0x206c03be, 0x00000074, 0x00000000 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_masknoca.g4a b/src/exa_wm_masknoca.g4a
new file mode 100644
index 0000000..195203c
--- /dev/null
+++ b/src/exa_wm_masknoca.g4a
@@ -0,0 +1,202 @@
+/*
+ * This is for the exa composite operation with a mask picture (no component alpha).
+ * The src texture is sampled and multiplied by the mask's alpha before being written to dst.
+ * XXX: This is still experimental, and should be fixed to support multiple texture
+ * map, and conditional mul actions. 
+ */
+
+/* I think this should be same as in g4a program for texture video,
+   as we also use 16-pixel dispatch. and SF scale in g3 is useful for us. */
+
+/* The initial payload of the thread is always g0.
+ * WM_URB (incoming URB entries) is g3
+   As the mask texture coefficients need an extra setup urb starting from g4, we should
+   shift these locations.
+
+ * X0_R is g4->g6
+ * X1_R is g5->g7
+ * Y0_R is g6->g8
+ * Y1_R is g7->g9
+
+     * X0: {ss0.x, ss0.x+1, ss0.x,   ss0.x+1, ss1.x, ss1.x+1, ss1.x,   ss1.x+1}
+     * Y0: {ss0.y, ss0.y,   ss0.y+1, ss0.y+1, ss1.y, ss1.y,   ss1.y+1, ss1.y+1}
+     * X1: {ss2.x, ss2.x+1, ss2.x,   ss2.x+1, ss3.x, ss3.x+1, ss3.x,   ss3.x+1}
+     * Y1: {ss2.y, ss2.y,   ss2.y+1, ss2.y+1, ss3.y, ss3.y,   ss3.y+1, ss3.y+1}
+ */
+
+/* multitexture program with src and mask texture */
+/* - load src texture */
+/* - load mask texture */
+/* - mul src.X with mask's alpha */
+/* - write out src.X */
+
+    /* Set up ss0.x coordinates*/
+mov (1) g6<1>F g1.8<0,1,0>UW { align1 };
+add (1) g6.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+mov (1) g6.8<1>F g1.8<0,1,0>UW { align1 };
+add (1) g6.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+    /* Set up ss0.y coordinates */
+mov (1) g8<1>F g1.10<0,1,0>UW { align1 };
+mov (1) g8.4<1>F g1.10<0,1,0>UW { align1 };
+add (1) g8.8<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g8.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+    /* set up ss1.x coordinates */
+mov (1) g6.16<1>F g1.12<0,1,0>UW { align1 };
+add (1) g6.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+mov (1) g6.24<1>F g1.12<0,1,0>UW { align1 };
+add (1) g6.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+    /* set up ss1.y coordinates */
+mov (1) g8.16<1>F g1.14<0,1,0>UW { align1 };
+mov (1) g8.20<1>F g1.14<0,1,0>UW { align1 };
+add (1) g8.24<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g8.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.x coordinates (low half of X1_R, which is g7) */
+mov (1) g7<1>F g1.16<0,1,0>UW { align1 };        /* ss2.x   */
+add (1) g7.4<1>F g1.16<0,1,0>UW 1UB { align1 };  /* ss2.x+1 */
+mov (1) g7.8<1>F g1.16<0,1,0>UW { align1 };      /* ss2.x   */
+add (1) g7.12<1>F g1.16<0,1,0>UW 1UB { align1 }; /* ss2.x+1; g9 is reserved for Y1_R */
+    /* Set up ss2.y coordinates */
+mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
+mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
+add (1) g9.8<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g9.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.x coordinates */
+mov (1) g7.16<1>F g1.20<0,1,0>UW { align1 };
+add (1) g7.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+mov (1) g7.24<1>F g1.20<0,1,0>UW { align1 };
+add (1) g7.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.y coordinates */
+mov (1) g9.16<1>F g1.22<0,1,0>UW { align1 };
+mov (1) g9.20<1>F g1.22<0,1,0>UW { align1 };
+add (1) g9.24<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g9.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+
+    /* Now, map these screen space coordinates into texture coordinates. */
+/* This is for src texture */
+/* I don't want to change origin ssX coords, as it will be used later in mask */
+/* so store tex coords in g10, g11, g12, g13 */
+
+    /* subtract screen-space X origin of vertex 0. */
+add (8) g10<1>F g6<8,8,1>F -g1<0,1,0>F { align1 };
+add (8) g11<1>F g7<8,8,1>F -g1<0,1,0>F { align1 };
+    /* scale by texture X increment */
+/* Cx[0] */
+mul (8) g10<1>F g10<8,8,1>F g3<0,1,0>F { align1 };
+mul (8) g11<1>F g11<8,8,1>F g3<0,1,0>F { align1 };
+    /* add in texture X offset */
+/* Co[0] */
+add (8) g10<1>F g10<8,8,1>F g3.12<0,1,0>F { align1 };
+add (8) g11<1>F g11<8,8,1>F g3.12<0,1,0>F { align1 };
+    /* subtract screen-space Y origin of vertex 0. */
+add (8) g12<1>F g8<8,8,1>F -g1.4<0,1,0>F { align1 };
+add (8) g13<1>F g9<8,8,1>F -g1.4<0,1,0>F { align1 };
+    /* scale by texture Y increment */
+/* Cy[0] */
+mul (8) g12<1>F g12<8,8,1>F g3.4<0,1,0>F { align1 };
+mul (8) g13<1>F g13<8,8,1>F g3.4<0,1,0>F { align1 };
+    /* add in texture Y offset */
+/* Co[1] */
+add (8) g12<1>F g12<8,8,1>F g3.28<0,1,0>F { align1 };
+add (8) g13<1>F g13<8,8,1>F g3.28<0,1,0>F { align1 };
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading: the tex coords computed above are in g10-g13 */
+mov (8) m1<1>F g10<8,8,1>F { align1 };
+mov (8) m2<1>F g11<8,8,1>F { align1 }; /* param 0 u in m1, m2 */
+mov (8) m3<1>F g12<8,8,1>F { align1 };
+mov (8) m4<1>F g13<8,8,1>F { align1 }; /* param 1 v in m3, m4 */
+
+/* m0 will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+
+/* src texture readback: g14-g21 */
+send (16) 0 		/* msg reg index */
+	g14<1>UW 	/* readback */
+	g0<8,8,1>UW  	/* copy to msg start reg*/
+	sampler (1,0,F)  /* sampler message description, 
+				(binding_table,sampler_index,datatype). 
+			    here(src->dst) we should use src_sampler and 
+			    src_surface */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
+
+mov (8) g21<1>UD g21<8,8,1>UD { align1 };  /* wait sampler return */
+
+/* sampler mask texture, use g10, g11, g12, g13 */
+    /* subtract screen-space X origin of vertex 0. */
+add (8) g10<1>F g6<8,8,1>F -g1<0,1,0>F { align1 };
+add (8) g11<1>F g7<8,8,1>F -g1<0,1,0>F { align1 };
+    /* scale by texture X increment */
+/* Cx[2] */
+mul (8) g10<1>F g10<8,8,1>F g4<0,1,0>F { align1 };
+mul (8) g11<1>F g11<8,8,1>F g4<0,1,0>F { align1 };
+    /* add in texture X offset */
+/* Co[2] */
+add (8) g10<1>F g10<8,8,1>F g4.12<0,1,0>F { align1 };
+add (8) g11<1>F g11<8,8,1>F g4.12<0,1,0>F { align1 };
+    /* subtract screen-space Y origin of vertex 0. */
+add (8) g12<1>F g8<8,8,1>F -g1.4<0,1,0>F { align1 };
+add (8) g13<1>F g9<8,8,1>F -g1.4<0,1,0>F { align1 };
+    /* scale by texture Y increment */
+/* Cy[2] */
+mul (8) g12<1>F g12<8,8,1>F g4.4<0,1,0>F { align1 };
+mul (8) g13<1>F g13<8,8,1>F g4.4<0,1,0>F { align1 };
+    /* add in texture Y offset */
+/* Co[3] */
+add (8) g12<1>F g12<8,8,1>F g4.28<0,1,0>F { align1 };
+add (8) g13<1>F g13<8,8,1>F g4.28<0,1,0>F { align1 };
+
+mov (8) m1<1>F g10<8,8,1>F { align1 };
+mov (8) m2<1>F g11<8,8,1>F { align1 }; 
+mov (8) m3<1>F g12<8,8,1>F { align1 };
+mov (8) m4<1>F g13<8,8,1>F { align1 };
+
+/* mask sampler g22-g29 */
+/* binding_table (2), sampler (1) */
+send (16) 0 g22<1>UW g0<8,8,1>UW sampler (2,1,F) mlen 5 rlen 8 { align1 };
+mov (8) g29<1>UD g29<8,8,1>UD { align1 };  /* wait sampler return */
+
+/* mul mask's alpha channel g28,g29 to src (g14-g21), then write out src */
+mul (8) g14<1>F g14<8,8,1>F g28<8,8,1>F { align1 };
+mul (8) g15<1>F g15<8,8,1>F g29<8,8,1>F { align1 };
+mul (8) g16<1>F g16<8,8,1>F g28<8,8,1>F { align1 };
+mul (8) g17<1>F g17<8,8,1>F g29<8,8,1>F { align1 };
+mul (8) g18<1>F g18<8,8,1>F g28<8,8,1>F { align1 };
+mul (8) g19<1>F g19<8,8,1>F g29<8,8,1>F { align1 };
+mul (8) g20<1>F g20<8,8,1>F g28<8,8,1>F { align1 };
+mul (8) g21<1>F g21<8,8,1>F g29<8,8,1>F { align1 };
+
+/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
+mov (8) m2<1>F g14<8,8,1>F { align1 };
+mov (8) m3<1>F g16<8,8,1>F { align1 };
+mov (8) m4<1>F g18<8,8,1>F { align1 };
+mov (8) m5<1>F g20<8,8,1>F { align1 };
+mov (8) m6<1>F g15<8,8,1>F { align1 };
+mov (8) m7<1>F g17<8,8,1>F { align1 };
+mov (8) m8<1>F g19<8,8,1>F { align1 };
+mov (8) m9<1>F g21<8,8,1>F { align1 };
+
+/* m0, m1 are all direct passed by PS thread payload */
+mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
+
+/* write */
+send (16) 0 acc0<1>UW g0<8,8,1>UW write (
+	0,  /* binding_table */
+	8,  /* pixel scoreboard clear, msg type simd16 single source */
+	4,  /* render target write */
+	0   /* no write commit message */
+	) 
+	mlen 10
+	rlen 0
+	{ align1 EOT };
+
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/exa_wm_masknoca_prog.h b/src/exa_wm_masknoca_prog.h
new file mode 100644
index 0000000..66eb960
--- /dev/null
+++ b/src/exa_wm_masknoca_prog.h
@@ -0,0 +1,95 @@
+   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 }, /* mov g7<1>F    g1.16 : ss2.x   */
+   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 }, /* add g7.4<1>F  g1.16 : ss2.x+1 */
+   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 }, /* mov g7.8<1>F  g1.16 : ss2.x   */
+   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 }, /* add g7.12<1>F g1.16 : ss2.x+1 */
+   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
+   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
+   { 0x00600041, 0x21c077bd, 0x008d01c0, 0x008d0380 },
+   { 0x00600041, 0x21e077bd, 0x008d01e0, 0x008d03a0 },
+   { 0x00600041, 0x220077bd, 0x008d0200, 0x008d0380 },
+   { 0x00600041, 0x222077bd, 0x008d0220, 0x008d03a0 },
+   { 0x00600041, 0x224077bd, 0x008d0240, 0x008d0380 },
+   { 0x00600041, 0x226077bd, 0x008d0260, 0x008d03a0 },
+   { 0x00600041, 0x228077bd, 0x008d0280, 0x008d0380 },
+   { 0x00600041, 0x22a077bd, 0x008d02a0, 0x008d03a0 },
+   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 4bc90c1..c4a3f97 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -344,12 +344,16 @@ static const CARD32 sf_kernel_static[][4
 #include "exa_sf_prog.h"
 };
 
+static const CARD32 sf_kernel_static_mask[][4] = {
+#include "exa_sf_mask_prog.h"
+};
+
 /* ps kernels */
 #define PS_KERNEL_NUM_GRF   32
 #define PS_MAX_THREADS	   32
 /* 1: no mask */
 static const CARD32 ps_kernel_static_nomask [][4] = {
-	#include "exa_wm_nomask_prog.h"
+#include "exa_wm_nomask_prog.h"
 };
 
 /* 2: mask with componentAlpha, src * mask color, XXX: later */
@@ -359,7 +363,7 @@ static const CARD32 ps_kernel_static_mas
 
 /* 3: mask without componentAlpha, src * mask alpha */
 static const CARD32 ps_kernel_static_masknoca [][4] = {
-/*#include "i965_composite_wm_masknoca.h" */
+#include "exa_wm_masknoca_prog.h"
 };
 
 Bool
@@ -375,11 +379,6 @@ I965EXAPrepareComposite(int op, PictureP
  
 ErrorF("i965 prepareComposite\n");
 
-    /* FIXME: fallback in pMask for now, would be enable after finish
-	wm kernel program */
-    if (pMask)
-	I830FALLBACK("No mask support yet.\n");
-
     I965GetDestFormat(pDstPicture, &dst_format);
     src_offset = exaGetPixmapOffset(pSrc);
     src_pitch = exaGetPixmapPitch(pSrc);
@@ -436,7 +435,10 @@ ErrorF("i965 prepareComposite\n");
    /* keep current sf_kernel, which will send one setup urb entry to
 	PS kernel */
    sf_kernel_offset = ALIGN(next_offset, 64);
-   next_offset = sf_kernel_offset + sizeof (sf_kernel_static);
+   if (pMask) 
+       next_offset = sf_kernel_offset + sizeof (sf_kernel_static_mask);
+   else
+       next_offset = sf_kernel_offset + sizeof (sf_kernel_static);
 
    //XXX: ps_kernel may be seperated, fix with offset
    ps_kernel_offset = ALIGN(next_offset, 64);
@@ -746,7 +748,10 @@ ErrorF("i965 prepareComposite\n");
     * calculate dA/dx and dA/dy.  Hand these interpolation coefficients
     * back to SF which then hands pixels off to WM.
     */
-   memcpy (sf_kernel, sf_kernel_static, sizeof (sf_kernel_static));
+   if (pMask) 
+       memcpy (sf_kernel, sf_kernel_static_mask, sizeof (sf_kernel_static_mask)); /* size of the table actually copied; matches the space reserved at sf_kernel_offset */
+   else
+       memcpy (sf_kernel, sf_kernel_static, sizeof (sf_kernel_static));
 
    memset(sf_state, 0, sizeof(*sf_state));
    sf_state->thread0.kernel_start_pointer = 
@@ -780,7 +785,6 @@ ErrorF("i965 prepareComposite\n");
    /* Set up the PS kernel (dispatched by WM) 
     */
     
-    // XXX: replace to texture blend shader, and different cases 
    if (pMask) {
 	if (pMaskPicture->componentAlpha)
    	    memcpy (ps_kernel, ps_kernel_static_maskca, sizeof (ps_kernel_static_maskca));
diff-tree a704120b15efae47344a90d972e7f3da64a202a6 (from e3c70c68e39183226e498271c44e98ef1b96a681)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:52:44 2006 +0800

    misc cleanup for G965 vs/sf/wm states

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 51b2c60..4bc90c1 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -266,6 +266,7 @@ I965EXACheckComposite(int op, PicturePtr
 
 #define ALIGN(i,m)    (((i) + (m) - 1) & ~((m) - 1))
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define BRW_GRF_BLOCKS(nreg)    (((nreg) + 15) / 16 - 1)  /* nreg rounded up to blocks of 16, minus one; arg parenthesized so expressions expand safely */
 
 int urb_vs_start, urb_vs_size;
 int urb_gs_start, urb_gs_size;
@@ -336,9 +337,8 @@ static const CARD32 sip_kernel_static[][
  * with the base texture coordinate. It was extracted from the Mesa driver
  */
 
-#define SF_KERNEL_NUM_GRF  10
-#define SF_KERNEL_NUM_URB  8
-#define SF_MAX_THREADS	   4
+#define SF_KERNEL_NUM_GRF  16
+#define SF_MAX_THREADS	   1
 
 static const CARD32 sf_kernel_static[][4] = {
 #include "exa_sf_prog.h"
@@ -468,7 +468,6 @@ ErrorF("i965 prepareComposite\n");
    next_offset = vb_offset + vb_size;
 
    /* And then the general state: */
-   //XXX: fix for texture map and target surface
    dest_surf_offset = ALIGN(next_offset, 32);
    next_offset = dest_surf_offset + sizeof(*dest_surf_state);
 
@@ -534,8 +533,8 @@ ErrorF("i965 prepareComposite\n");
 #define URB_CLIP_ENTRY_SIZE   0
 #define URB_CLIP_ENTRIES      0
    
-#define URB_SF_ENTRY_SIZE     4
-#define URB_SF_ENTRIES	      8
+#define URB_SF_ENTRY_SIZE     2
+#define URB_SF_ENTRIES	      1
 
    urb_vs_start = 0;
    urb_vs_size = URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
@@ -564,7 +563,6 @@ ErrorF("i965 prepareComposite\n");
    cc_state->cc3.ia_blend_enable = 0;  /* blend alpha just like colors */
    cc_state->cc3.blend_enable = 1;     /* enable color blend */
    cc_state->cc3.alpha_test = 0;       /* disable alpha test */
-   // XXX:cc_viewport needed? 
    cc_state->cc4.cc_viewport_state_offset = (state_base_offset + cc_viewport_offset) >> 5;
    cc_state->cc5.dither_enable = 0;    /* disable dither */
 //   cc_state->cc5.logicop_func = 0xc;   /* COPY */
@@ -585,7 +583,6 @@ ErrorF("i965 prepareComposite\n");
    memset(dest_surf_state, 0, sizeof(*dest_surf_state));
    dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
    dest_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
-   // XXX: should compare with picture's cpp?...8 bit surf?
    if (pDst->drawable.bitsPerPixel == 16) {
       dest_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
    } else {
@@ -601,14 +598,12 @@ ErrorF("i965 prepareComposite\n");
    dest_surf_state->ss0.mipmap_layout_mode = 0;
    dest_surf_state->ss0.render_cache_read_mode = 0;
    
-   // XXX: fix to picture address & size
    dest_surf_state->ss1.base_addr = dst_offset;
    dest_surf_state->ss2.height = pDst->drawable.height - 1;
    dest_surf_state->ss2.width = pDst->drawable.width - 1;
    dest_surf_state->ss2.mip_count = 0;
    dest_surf_state->ss2.render_target_rotation = 0;
    dest_surf_state->ss3.pitch = dst_pitch - 1; 
-   // tiled surface?
 
    /* Set up the source surface state buffer */
    memset(src_surf_state, 0, sizeof(*src_surf_state));
@@ -741,8 +736,10 @@ ErrorF("i965 prepareComposite\n");
 
    /* Set up the vertex shader to be disabled (passthrough) */
    memset(vs_state, 0, sizeof(*vs_state));
-   // XXX: vs URB should be defined for VF vertex URB store. done already?
+   vs_state->thread4.nr_urb_entries = URB_VS_ENTRIES;
+   vs_state->thread4.urb_entry_allocation_size = URB_VS_ENTRY_SIZE - 1;
    vs_state->vs6.vs_enable = 0;
+   vs_state->vs6.vert_cache_disable = 1;
 
    // XXX: sf_kernel? keep it as now
    /* Set up the SF kernel to do coord interp: for each attribute,
@@ -754,7 +751,7 @@ ErrorF("i965 prepareComposite\n");
    memset(sf_state, 0, sizeof(*sf_state));
    sf_state->thread0.kernel_start_pointer = 
 	       (state_base_offset + sf_kernel_offset) >> 6;
-   sf_state->thread0.grf_reg_count = ((SF_KERNEL_NUM_GRF & ~15) / 16);
+   sf_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
    sf_state->sf1.single_program_flow = 1;
    sf_state->sf1.binding_table_entry_count = 0;
    sf_state->sf1.thread_priority = 0;
@@ -795,7 +792,7 @@ ErrorF("i965 prepareComposite\n");
    memset (wm_state, 0, sizeof (*wm_state));
    wm_state->thread0.kernel_start_pointer = 
 	    (state_base_offset + ps_kernel_offset) >> 6;
-   wm_state->thread0.grf_reg_count = ((PS_KERNEL_NUM_GRF & ~15) / 16);
+   wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
    wm_state->thread1.single_program_flow = 1;
    if (!pMask)
        wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
@@ -808,7 +805,10 @@ ErrorF("i965 prepareComposite\n");
    // XXX: urb allocation
    wm_state->thread3.const_urb_entry_read_length = 0;
    wm_state->thread3.const_urb_entry_read_offset = 0;
-   wm_state->thread3.urb_entry_read_length = 1;  /* one per pair of attrib */
+   if (pMask)
+       wm_state->thread3.urb_entry_read_length = 2;  /* two per pair of attrib */
+   else 
+       wm_state->thread3.urb_entry_read_length = 1;  /* one per pair of attrib */
    wm_state->thread3.urb_entry_read_offset = 0;
    // wm kernel use urb from 3, see wm_program in compiler module
    wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
diff-tree e3c70c68e39183226e498271c44e98ef1b96a681 (from aa515c54f0cfd9025fc38dc4b7938ff17a8a13fb)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:40:15 2006 +0800

    WM kernel needs scratch space

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 51c2006..51b2c60 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -297,6 +297,7 @@ int dest_surf_offset, src_surf_offset, m
 int src_sampler_offset, mask_sampler_offset,vs_offset;
 int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
 int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
+int wm_scratch_offset;
 int binding_table_offset;
 int default_color_offset; 
 int next_offset, total_state_size;
@@ -426,6 +427,9 @@ ErrorF("i965 prepareComposite\n");
    wm_offset = ALIGN(next_offset, 32);
    next_offset = wm_offset + sizeof(*wm_state);
     
+   wm_scratch_offset = ALIGN(next_offset, 1024);
+   next_offset = wm_scratch_offset + 1024 * PS_MAX_THREADS;
+
    cc_offset = ALIGN(next_offset, 32);
    next_offset = cc_offset + sizeof(*cc_state);
 
@@ -798,7 +802,8 @@ ErrorF("i965 prepareComposite\n");
    else
        wm_state->thread1.binding_table_entry_count = 3; /* 2 tex and fb */
 
-   wm_state->thread2.scratch_space_base_pointer = 0;
+   wm_state->thread2.scratch_space_base_pointer = (state_base_offset + 
+						   wm_scratch_offset)>>10;
    wm_state->thread2.per_thread_scratch_space = 0;
    // XXX: urb allocation
    wm_state->thread3.const_urb_entry_read_length = 0;
diff-tree aa515c54f0cfd9025fc38dc4b7938ff17a8a13fb (from b6eba96584bcd2c024f6443d9f3728eb65b234fb)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:37:06 2006 +0800

    Setup default border color for our samplers

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 94eabfb..51c2006 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -278,6 +278,7 @@ struct brw_surface_state *src_surf_state
 struct brw_surface_state *mask_surf_state;
 struct brw_sampler_state *src_sampler_state;
 struct brw_sampler_state *mask_sampler_state;  
+struct brw_sampler_default_color *default_color_state;
 
 struct brw_vs_unit_state *vs_state;
 struct brw_sf_unit_state *sf_state;
@@ -297,6 +298,7 @@ int src_sampler_offset, mask_sampler_off
 int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
 int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
 int binding_table_offset;
+int default_color_offset; 
 int next_offset, total_state_size;
 char *state_base;
 int state_base_offset;
@@ -478,6 +480,9 @@ ErrorF("i965 prepareComposite\n");
    binding_table_offset = ALIGN(next_offset, 32);
    next_offset = binding_table_offset + (binding_table_entries * 4);
 
+   default_color_offset = ALIGN(next_offset, 32);
+   next_offset = default_color_offset + sizeof(*default_color_state);
+
    total_state_size = next_offset;
    assert(total_state_size < EXA_LINEAR_EXTRA);
 
@@ -508,6 +513,8 @@ ErrorF("i965 prepareComposite\n");
 
    vb = (void *)(state_base + vb_offset);
 
+   default_color_state = (void*)(state_base + default_color_offset);
+
    /* Set up a default static partitioning of the URB, which is supposed to
     * allow anything we would want to do, at potentially lower performance.
     */
@@ -541,7 +548,6 @@ ErrorF("i965 prepareComposite\n");
     * here, but we should have synced the 3D engine already in I830PutImage.
     */
 
-// needed?
    memset (cc_viewport, 0, sizeof (*cc_viewport));
    cc_viewport->min_depth = -1.e35;
    cc_viewport->max_depth = 1.e35;
@@ -678,18 +684,25 @@ ErrorF("i965 prepareComposite\n");
 	I830FALLBACK("Bad filter 0x%x\n", pSrcPicture->filter);
    }
 
+   memset(default_color_state, 0, sizeof(*default_color_state));
+   default_color_state->color[0] = 1.0; /* RGBA format */
+   default_color_state->color[1] = 0.0; 
+   default_color_state->color[2] = 0.0; 
+   default_color_state->color[3] = 0.0; 
+
+   src_sampler_state->ss0.default_color_mode = 0; /* GL mode */
+
    if (!pSrcPicture->repeat) {
-	/* XXX: clamp_border and set border to 0 */
-   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP; 
-   	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
-   	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER; 
+   	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+   	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+	src_sampler_state->ss2.default_color_pointer = 
+			(state_base_offset + default_color_offset) >> 5;
    } else {
    	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP; 
    	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_WRAP;
    	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
    }
-   /* XXX: ss2 has border color pointer, which should be in general state address,
-    	   and just a single texel tex map, with R32G32B32A32_FLOAT */
    src_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
 
    if (pMask) {
@@ -709,17 +722,16 @@ ErrorF("i965 prepareComposite\n");
    	}
 
    	if (!pMaskPicture->repeat) {
-	/* XXX: clamp_border and set border to 0 */
-   	    mask_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP; 
-   	    mask_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
-   	    mask_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   	    mask_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER; 
+   	    mask_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+   	    mask_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+            mask_sampler_state->ss2.default_color_pointer = 
+				(state_base_offset + default_color_offset)>>5;
    	} else {
    	    mask_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP; 
    	    mask_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_WRAP;
    	    mask_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
     	}
-   /* XXX: ss2 has border color pointer, which should be in general state address,
-    	   and just a single texel tex map, with R32G32B32A32_FLOAT */
    	mask_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
    }
 
diff-tree b6eba96584bcd2c024f6443d9f3728eb65b234fb (from 453842c9ff733af45fa665d9db6a35164f45c60a)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:30:53 2006 +0800

    fix vertex buffer size

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 9127d65..94eabfb 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -233,16 +233,12 @@ Bool
 I965EXACheckComposite(int op, PicturePtr pSrcPicture, PicturePtr pMaskPicture,
 		      PicturePtr pDstPicture)
 {
-	/* check op*/
-	/* check op with mask's componentAlpha*/
-	/* check textures */
-	/* check dst buffer format */
     CARD32 tmp1;
     
     /* Check for unsupported compositing operations. */
     if (op >= sizeof(I965BlendOp) / sizeof(I965BlendOp[0]))
         I830FALLBACK("Unsupported Composite op 0x%x\n", op);
-                                                                                                                                                            
+
     if (pMaskPicture != NULL && pMaskPicture->componentAlpha) {
         /* Check if it's component alpha that relies on a source alpha and on
          * the source value.  We can only get one of those into the single
@@ -305,7 +301,7 @@ int next_offset, total_state_size;
 char *state_base;
 int state_base_offset;
 float *vb;
-int vb_size = 4 * 4 ; /* 4 DWORDS per vertex, 4 vertices for TRIFAN*/ 
+int vb_size = (4 * 4) * 4 ; /* 4 DWORDS per vertex*/ 
 
 CARD32 src_blend, dst_blend;
 
diff-tree 453842c9ff733af45fa665d9db6a35164f45c60a (from 18ad7d5cf04081d89a9f978ccc7794116f7c498b)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:24:24 2006 +0800

    clean up issue cmd to ring buffer
    
    Make it easier to track the different parts of the ring state, and
    use the rectlist primitive instead of a trifan.

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 2c3e43b..9127d65 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -376,11 +376,6 @@ I965EXAPrepareComposite(int op, PictureP
  
 ErrorF("i965 prepareComposite\n");
 
-//    i965_3d_pipeline_setup(pScrn);
-//    i965_surf_setup(pScrn, pSrcPicture, pMaskPicture, pDstPicture,
-//   			pSrc, pMask, pDst);
-    // then setup blend, and shader program 
-    
     /* FIXME: fallback in pMask for now, would be enable after finish
 	wm kernel program */
     if (pMask)
@@ -819,62 +814,65 @@ ErrorF("i965 prepareComposite\n");
     * rendering pipe
     */
    {
-   
-   BEGIN_LP_RING((pMask?48:46));
-   // MI_FLUSH prior to PIPELINE_SELECT
-   OUT_RING(MI_FLUSH | 
+	BEGIN_LP_RING(2);
+   	OUT_RING(MI_FLUSH | 
 	    MI_STATE_INSTRUCTION_CACHE_FLUSH |
 	    BRW_MI_GLOBAL_SNAPSHOT_RESET);
+	OUT_RING(MI_NOOP);
+	ADVANCE_LP_RING();
+   }
+   {
+        BEGIN_LP_RING(12);
    
-   /* Match Mesa driver setup */
-   OUT_RING(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
+        /* Match Mesa driver setup */
+        OUT_RING(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
    
+   	OUT_RING(BRW_CS_URB_STATE | 0);
+   	OUT_RING((0 << 4) |  /* URB Entry Allocation Size */
+            (0 << 0));  /* Number of URB Entries */
+
    /* Zero out the two base address registers so all offsets are absolute */
-   // XXX: zero out...
-   OUT_RING(BRW_STATE_BASE_ADDRESS | 4);
-   // why this's not state_base_offset? -> because later we'll always add on
-   // state_base_offset to offset params. see SIP
-   OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* Generate state base address */
-   OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* Surface state base address */
-   OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* media base addr, don't care */
-   OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* general state max addr, disabled */
-   OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* media object state max addr, disabled */
+   	OUT_RING(BRW_STATE_BASE_ADDRESS | 4);
+   	OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* General state base address */
+   	OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* Surface state base address */
+   	OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* media base addr, don't care */
+   	OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* general state max addr, disabled */
+   	OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* media object state max addr, disabled */
 
    /* Set system instruction pointer */
-   OUT_RING(BRW_STATE_SIP | 0);
-   OUT_RING(state_base_offset + sip_kernel_offset); /* system instruction pointer */
-      
+   	OUT_RING(BRW_STATE_SIP | 0);
+   	OUT_RING(state_base_offset + sip_kernel_offset); /* system instruction pointer */
+	OUT_RING(MI_NOOP);
+	ADVANCE_LP_RING();
+   }
+   {
+	BEGIN_LP_RING(26);
    /* Pipe control */
-   // XXX: pipe control write cache before enabling color blending
-   // vol2, geometry pipeline 1.8.4
-   OUT_RING(BRW_PIPE_CONTROL |
+   	OUT_RING(BRW_PIPE_CONTROL |
 	    BRW_PIPE_CONTROL_NOWRITE |
 	    BRW_PIPE_CONTROL_IS_FLUSH |
 	    2);
-   OUT_RING(0);			       /* Destination address */
-   OUT_RING(0);			       /* Immediate data low DW */
-   OUT_RING(0);			       /* Immediate data high DW */
+   	OUT_RING(0);			       /* Destination address */
+   	OUT_RING(0);			       /* Immediate data low DW */
+   	OUT_RING(0);			       /* Immediate data high DW */
 
    /* Binding table pointers */
-   OUT_RING(BRW_3DSTATE_BINDING_TABLE_POINTERS | 4);
-   OUT_RING(0); /* vs */
-   OUT_RING(0); /* gs */
-   OUT_RING(0); /* clip */
-   OUT_RING(0); /* sf */
+   	OUT_RING(BRW_3DSTATE_BINDING_TABLE_POINTERS | 4);
+   	OUT_RING(0); /* vs */
+   	OUT_RING(0); /* gs */
+   	OUT_RING(0); /* clip */
+   	OUT_RING(0); /* sf */
    /* Only the PS uses the binding table */
-   OUT_RING(state_base_offset + binding_table_offset); /* ps */
-
-   //ring 20
+   	OUT_RING(state_base_offset + binding_table_offset); /* ps */
 
    /* The drawing rectangle clipping is always on.  Set it to values that
     * shouldn't do any clipping.
     */
-    //XXX: fix for picture size
-   OUT_RING(BRW_3DSTATE_DRAWING_RECTANGLE | 2);	/* XXX 3 for BLC or CTG */
-   OUT_RING(0x00000000);	/* ymin, xmin */
-   OUT_RING((pScrn->virtualX - 1) |
-	    (pScrn->virtualY - 1) << 16); /* ymax, xmax */
-   OUT_RING(0x00000000);	/* yorigin, xorigin */
+   	OUT_RING(BRW_3DSTATE_DRAWING_RECTANGLE | 2);	/* XXX 3 for BLC or CTG */
+   	OUT_RING(0x00000000);	/* ymin, xmin */
+   	OUT_RING((pScrn->virtualX - 1) |
+ 	         (pScrn->virtualY - 1) << 16); /* ymax, xmax */
+   	OUT_RING(0x00000000);	/* yorigin, xorigin */
 
    /* skip the depth buffer */
    /* skip the polygon stipple */
@@ -882,90 +880,82 @@ ErrorF("i965 prepareComposite\n");
    /* skip the line stipple */
    
    /* Set the pointers to the 3d pipeline state */
-   OUT_RING(BRW_3DSTATE_PIPELINED_POINTERS | 5);
-   OUT_RING(state_base_offset + vs_offset);  /* 32 byte aligned */
-   OUT_RING(BRW_GS_DISABLE);		     /* disable GS, resulting in passthrough */
-   OUT_RING(BRW_CLIP_DISABLE);		     /* disable CLIP, resulting in passthrough */
-   OUT_RING(state_base_offset + sf_offset);  /* 32 byte aligned */
-   OUT_RING(state_base_offset + wm_offset);  /* 32 byte aligned */
-   OUT_RING(state_base_offset + cc_offset);  /* 64 byte aligned */
+   	OUT_RING(BRW_3DSTATE_PIPELINED_POINTERS | 5);
+   	OUT_RING(state_base_offset + vs_offset);  /* 32 byte aligned */
+   	OUT_RING(BRW_GS_DISABLE);		     /* disable GS, resulting in passthrough */
+   	OUT_RING(BRW_CLIP_DISABLE);		     /* disable CLIP, resulting in passthrough */
+   	OUT_RING(state_base_offset + sf_offset);  /* 32 byte aligned */
+   	OUT_RING(state_base_offset + wm_offset);  /* 32 byte aligned */
+   	OUT_RING(state_base_offset + cc_offset);  /* 64 byte aligned */
 
    /* URB fence */
-   // XXX: CS for const URB needed? if not, cs_fence should be equal to sf_fence
-   OUT_RING(BRW_URB_FENCE |
-	    UF0_CS_REALLOC |
-	    UF0_SF_REALLOC |
-	    UF0_CLIP_REALLOC |
-	    UF0_GS_REALLOC |
-	    UF0_VS_REALLOC |
-	    1);
-   OUT_RING(((urb_clip_start + urb_clip_size) << UF1_CLIP_FENCE_SHIFT) |
-	    ((urb_gs_start + urb_gs_size) << UF1_GS_FENCE_SHIFT) |
-	    ((urb_vs_start + urb_vs_size) << UF1_VS_FENCE_SHIFT));
-   OUT_RING(((urb_cs_start + urb_cs_size) << UF2_CS_FENCE_SHIFT) |
-	    ((urb_sf_start + urb_sf_size) << UF2_SF_FENCE_SHIFT));
+   	OUT_RING(BRW_URB_FENCE |
+        	 UF0_CS_REALLOC |
+	    	 UF0_SF_REALLOC |
+	    	 UF0_CLIP_REALLOC |
+	         UF0_GS_REALLOC |
+	         UF0_VS_REALLOC |
+	    	 1);
+   	OUT_RING(((urb_clip_start + urb_clip_size) << UF1_CLIP_FENCE_SHIFT) |
+	    	 ((urb_gs_start + urb_gs_size) << UF1_GS_FENCE_SHIFT) |
+	    	 ((urb_vs_start + urb_vs_size) << UF1_VS_FENCE_SHIFT));
+   	OUT_RING(((urb_cs_start + urb_cs_size) << UF2_CS_FENCE_SHIFT) |
+	     	 ((urb_sf_start + urb_sf_size) << UF2_SF_FENCE_SHIFT));
 
    /* Constant buffer state */
-   // XXX: needed? seems no usage, as we don't have CONSTANT_BUFFER definition
-   OUT_RING(BRW_CS_URB_STATE | 0);
-   OUT_RING(((URB_CS_ENTRY_SIZE - 1) << 4) | /* URB Entry Allocation Size */
-	    (URB_CS_ENTRIES << 0));	     /* Number of URB Entries */
-   
+   	OUT_RING(BRW_CS_URB_STATE | 0);
+   	OUT_RING(((URB_CS_ENTRY_SIZE - 1) << 4) | /* URB Entry Allocation Size */
+	    	 (URB_CS_ENTRIES << 0));	     /* Number of URB Entries */
+	ADVANCE_LP_RING();
+   }
+   {
+        int nelem = pMask ? 3: 2;
+   	BEGIN_LP_RING(pMask?12:10);
    /* Set up the pointer to our vertex buffer */
-   // XXX: double check
-  // int vb_pitch = 4 * 4;  // XXX: pitch should include mask's coords? possible
-  // all three coords on one row?
-   int nelem = pMask ? 3: 2;
-   OUT_RING(BRW_3DSTATE_VERTEX_BUFFERS | 3); //XXX: should be 4n-1 -> 3
-   OUT_RING((0 << VB0_BUFFER_INDEX_SHIFT) |
-	    VB0_VERTEXDATA |
-	    ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT)); 
-   		// pitch includes all vertex data, 4bytes for 1 dword, each
-		// element has 2 coords (x,y)(s0,t0), nelem to reflect possible
-		// mask
-   OUT_RING(state_base_offset + vb_offset);
-   OUT_RING(4 * nelem); // max index, prim has 4 coords
-   OUT_RING(0); // ignore for VERTEXDATA, but still there
+   	OUT_RING(BRW_3DSTATE_VERTEX_BUFFERS | 3); 
+   	OUT_RING((0 << VB0_BUFFER_INDEX_SHIFT) |
+	    	 VB0_VERTEXDATA |
+	    	 ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT)); 
+   	OUT_RING(state_base_offset + vb_offset);
+   	OUT_RING(2); // max index: the rectlist uses 3 vertices (indices 0..2)
+   	OUT_RING(0); // ignore for VERTEXDATA, but still there
 
    /* Set up our vertex elements, sourced from the single vertex buffer. */
-   OUT_RING(BRW_3DSTATE_VERTEX_ELEMENTS | ((2 * nelem) - 1));  // XXX: 2n-1, (x,y) + (s0,t0) +
-						//   possible (s1, t1)
+   	OUT_RING(BRW_3DSTATE_VERTEX_ELEMENTS | ((2 * nelem) - 1));  
    /* offset 0: X,Y -> {X, Y, 1.0, 1.0} */
-   OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
-	    VE0_VALID |
-	    (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-	    (0 << VE0_OFFSET_SHIFT));
-   OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
-	    (0 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+   	OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+	    	 VE0_VALID |
+	    	 (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
+	    	 (0 << VE0_OFFSET_SHIFT));
+   	OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
+	    	 (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
+	     	 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
+	    	 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
+	    	 (0 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
    /* offset 8: S0, T0 -> {S0, T0, 1.0, 1.0} */
-   OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
-	    VE0_VALID |
-	    (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-	    (8 << VE0_OFFSET_SHIFT));
-   OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
-	    (4 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
-
-   if (pMask) {
    	OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
-	    VE0_VALID |
-	    (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-	    (16 << VE0_OFFSET_SHIFT));
-	OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
-	    (8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); 
-		//XXX: is this has alignment issue? and thread access problem?
-   }
+	    	 VE0_VALID |
+	    	 (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
+	    	 (8 << VE0_OFFSET_SHIFT));
+   	OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
+	    	 (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
+	    	 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
+	     	 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
+	    	 (4 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+
+   	if (pMask) {
+   		OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+	    		 VE0_VALID |
+	    		 (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
+	    		 (16 << VE0_OFFSET_SHIFT));
+		OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
+	    		 (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
+	    		 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
+	    		 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
+	    		 (8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); 
+   	}
    
-   ADVANCE_LP_RING();
-    
+   	ADVANCE_LP_RING();
    }
 
 #ifdef I830DEBUG
@@ -983,7 +973,7 @@ I965EXAComposite(PixmapPtr pDst, int src
     I830Ptr pI830 = I830PTR(pScrn);
     int srcXend, srcYend, maskXend, maskYend;
     PictVector v;
-    int pMask = 1, i = 0;
+    int pMask = 1, i;
 
     DPRINTF(PFX, "Composite: srcX %d, srcY %d\n\t maskX %d, maskY %d\n\t"
 	    "dstX %d, dstY %d\n\twidth %d, height %d\n\t"
@@ -999,8 +989,10 @@ I965EXAComposite(PixmapPtr pDst, int src
 
     srcXend = srcX + w;
     srcYend = srcY + h;
-    maskXend = maskX + w;
-    maskYend = maskY + h;
+    if (pMask) {
+        maskXend = maskX + w;
+        maskYend = maskY + h;
+    }
     if (is_transform[0]) {
         v.vector[0] = IntToxFixed(srcX);
         v.vector[1] = IntToxFixed(srcY);
@@ -1035,51 +1027,45 @@ I965EXAComposite(PixmapPtr pDst, int src
 		"dstX %d, dstY %d\n", srcX, srcY, srcXend, srcYend,
 		maskX, maskY, maskXend, maskYend, dstX, dstY);
 
- 
-    vb[i++] = (float)dstX;
-    vb[i++] = (float)dstY;
-    vb[i++] = (float)srcX / scale_units[0][0];
-    vb[i++] = (float)srcY / scale_units[0][1];
-    if (pMask) {
-        vb[i++] = (float)maskX / scale_units[1][0];
-        vb[i++] = (float)maskY / scale_units[1][1];
-    }
-
-    vb[i++] = (float)dstX;
-    vb[i++] = (float)(dstY + h);
-    vb[i++] = (float)srcX / scale_units[0][0];
-    vb[i++] = (float)srcYend / scale_units[0][1];
+    i = 0;
+    /* rect (x2,y2) */
+    vb[i++] = (float)(srcXend) / scale_units[0][0];
+    vb[i++] = (float)(srcYend) / scale_units[0][1];
     if (pMask) {
-        vb[i++] = (float)maskX / scale_units[1][0];
+        vb[i++] = (float)maskXend / scale_units[1][0];
         vb[i++] = (float)maskYend / scale_units[1][1];
     }
-
     vb[i++] = (float)(dstX + w);
     vb[i++] = (float)(dstY + h);
-    vb[i++] = (float)srcXend / scale_units[0][0];
-    vb[i++] = (float)srcYend / scale_units[0][1];
+
+    /* rect (x1,y2) */
+    vb[i++] = (float)(srcX)/ scale_units[0][0];
+    vb[i++] = (float)(srcYend)/ scale_units[0][1];
     if (pMask) {
-        vb[i++] = (float)maskXend / scale_units[1][0];
+        vb[i++] = (float)maskX / scale_units[1][0];
         vb[i++] = (float)maskYend / scale_units[1][1];
     }
+    vb[i++] = (float)dstX;
+    vb[i++] = (float)(dstY + h);
 
-    vb[i++] = (float)(dstX + w);
-    vb[i++] = (float)dstY;
-    vb[i++] = (float)srcXend / scale_units[0][0];
-    vb[i++] = (float)srcY / scale_units[0][1];
+    /* rect (x1,y1) */
+    vb[i++] = (float)(srcX) / scale_units[0][0];
+    vb[i++] = (float)(srcY) / scale_units[0][1];
     if (pMask) {
-        vb[i++] = (float)maskXend / scale_units[1][0];
+        vb[i++] = (float)maskX / scale_units[1][0];
         vb[i++] = (float)maskY / scale_units[1][1];
     }
-
+    vb[i++] = (float)dstX;
+    vb[i++] = (float)dstY;
+   
     {
       BEGIN_LP_RING(6);
       OUT_RING(BRW_3DPRIMITIVE | 
 	       BRW_3DPRIMITIVE_VERTEX_SEQUENTIAL |
-	       (_3DPRIM_TRIFAN << BRW_3DPRIMITIVE_TOPOLOGY_SHIFT) | 
+	       (_3DPRIM_RECTLIST << BRW_3DPRIMITIVE_TOPOLOGY_SHIFT) | 
 	       (0 << 9) |  /* CTG - indirect vertex count */
 	       4);
-      OUT_RING(4);  /* vertex count per instance */
+      OUT_RING(3);  /* vertex count per instance */
       OUT_RING(0); /* start vertex offset */
       OUT_RING(1); /* single instance */
       OUT_RING(0); /* start instance location */
@@ -1090,4 +1076,19 @@ I965EXAComposite(PixmapPtr pDst, int src
     ErrorF("sync after 3dprimitive");
     I830Sync(pScrn);
 #endif
+    /* We must be sure that the pipeline is flushed before the next EXA draw,
+       because that draw will bring new state, binding state and instructions. */
+    {
+	BEGIN_LP_RING(4);
+   	OUT_RING(BRW_PIPE_CONTROL |
+	    BRW_PIPE_CONTROL_NOWRITE |
+	    BRW_PIPE_CONTROL_WC_FLUSH |
+	    BRW_PIPE_CONTROL_IS_FLUSH |
+	    (1 << 10) |  /* XXX texture cache flush for BLC/CTG */
+	    2);
+   	OUT_RING(0); /* Destination address */
+   	OUT_RING(0); /* Immediate data low DW */
+   	OUT_RING(0); /* Immediate data high DW */
+	ADVANCE_LP_RING();
+    }
 }
diff-tree 18ad7d5cf04081d89a9f978ccc7794116f7c498b (from 3d5bd0c14eea7951540f7a12eee257428f78e2d1)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:04:16 2006 +0800

    remove wrong scale_units

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 2751778..2c3e43b 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -397,8 +397,6 @@ ErrorF("i965 prepareComposite\n");
     }
     scale_units[0][0] = pSrc->drawable.width;
     scale_units[0][1] = pSrc->drawable.height;
-    scale_units[2][0] = pDst->drawable.width;
-    scale_units[2][1] = pDst->drawable.height;
 
     if (pSrcPicture->transform) {
 	is_transform[0] = TRUE;
diff-tree 3d5bd0c14eea7951540f7a12eee257428f78e2d1 (from a06beb5f80f097fac3b718e742742bb32f1c1194)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:02:16 2006 +0800

    Rename exa sf/wm program files
    
    Also fix some minor issues in the wm program.

diff --git a/src/Makefile.am b/src/Makefile.am
index 494a921..890e90f 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -78,8 +78,8 @@ sf_prog.h: packed_yuv_sf.g4a
 	intel-gen4asm -o sf_prog.h packed_yuv_sf.g4a
 wm_prog.h: packed_yuv_wm.g4a
 	intel-gen4asm -o wm_prog.h packed_yuv_wm.g4a
-i965_composite_wm_nomask.h: i965_composite_wm_nomask.g4a
-	intel-gen4asm -o i965_composite_wm_nomask.h i965_composite_wm_nomask.g4a
+exa_wm_nomask_prog.h: exa_wm_nomask.g4a
+	intel-gen4asm -o exa_wm_nomask_prog.h exa_wm_nomask.g4a
 endif
 
 if DRI
diff --git a/src/exa_sf.g4a b/src/exa_sf.g4a
new file mode 100644
index 0000000..8c1398f
--- /dev/null
+++ b/src/exa_sf.g4a
@@ -0,0 +1,17 @@
+send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
+mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
+mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
+mov (8) m1<1>F g7<0,1,0>F { align1 };
+mov (8) m2<1>F g7.4<0,1,0>F { align1 };
+mov (8) m3<1>F g3<8,8,1>F { align1 };
+send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/exa_sf_prog.h b/src/exa_sf_prog.h
new file mode 100644
index 0000000..830d176
--- /dev/null
+++ b/src/exa_sf_prog.h
@@ -0,0 +1,17 @@
+   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
+   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_nomask.g4a b/src/exa_wm_nomask.g4a
new file mode 100644
index 0000000..8e851a3
--- /dev/null
+++ b/src/exa_wm_nomask.g4a
@@ -0,0 +1,143 @@
+/*
+ * This is for the EXA composite operation when there is no mask picture.
+ * The simplest case just sends what the src picture has to the dst picture.
+ */
+
+/* I think this should be the same as the g4a program for textured video,
+   as we also use 16-pixel dispatch, and the SF scale in g3 is useful for us. */
+
+/* The initial payload of the thread is always g0.
+ * WM_URB (incoming URB entries) is g3
+ * X0_R is g4
+ * X1_R is g5
+ * Y0_R is g6
+ * Y1_R is g7
+ */
+
+    /* Set up ss0.x coordinates*/
+mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+    /* Set up ss0.y coordinates */
+mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
+mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
+add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+    /* set up ss1.x coordinates */
+mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+    /* set up ss1.y coordinates */
+mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
+mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
+add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.x coordinates */
+mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.y coordinates */
+mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
+mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
+add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.x coordinates */
+mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.y coordinates */
+mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
+mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
+add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+
+    /* Now, map these screen space coordinates into texture coordinates. */
+    /* subtract screen-space X origin of vertex 0. */
+add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
+    /* scale by texture X increment */
+mul (8) g4<1>F g4<8,8,1>F g3<0,1,0>F { align1 };
+mul (8) g5<1>F g5<8,8,1>F g3<0,1,0>F { align1 };
+    /* add in texture X offset */
+add (8) g4<1>F g4<8,8,1>F g3.12<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F g3.12<0,1,0>F { align1 };
+    /* subtract screen-space Y origin of vertex 0. */
+add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
+    /* scale by texture Y increment */
+mul (8) g6<1>F g6<8,8,1>F g3.20<0,1,0>F { align1 };
+mul (8) g7<1>F g7<8,8,1>F g3.20<0,1,0>F { align1 };
+    /* add in texture Y offset */
+add (8) g6<1>F g6<8,8,1>F g3.28<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F g3.28<0,1,0>F { align1 };
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+mov (8) m1<1>F g4<8,8,1>F { align1 };
+mov (8) m2<1>F g5<8,8,1>F { align1 };  /* param 0 u in m1, m2 */
+mov (8) m3<1>F g6<8,8,1>F { align1 };
+mov (8) m4<1>F g7<8,8,1>F { align1 };  /* param 1 v in m3, m4 */
+
+/* m0 will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) 0 		/* msg reg index */
+	g12<1>UW 	/* readback */
+	g0<8,8,1>UW  	/* copy to msg start reg*/
+	sampler (1,0,F)  /* sampler message description: (binding_table,sampler_index,datatype).
+			  * Here (src->dst) we should use src_sampler and src_surface. */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
+
+mov (8) g19<1>UD g19<8,8,1>UD { align1 };  /* wait sampler return */
+/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
+
+/* m0 and m1 are both passed directly from the PS thread payload */
+mov (8) m1<1>F g1<8,8,1>F { align1 };
+
+/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
+/* g12 -> m2
+   g13 -> m6
+   g14 -> m3
+   g15 -> m7
+   g16 -> m4
+   g17 -> m8
+   g18 -> m5
+   g19 -> m9
+*/
+mov (8) m2<1>F g12<8,8,1>F { align1 };
+mov (8) m3<1>F g14<8,8,1>F { align1 };
+mov (8) m4<1>F g16<8,8,1>F { align1 };
+mov (8) m5<1>F g18<8,8,1>F { align1 };
+mov (8) m6<1>F g13<8,8,1>F { align1 };
+mov (8) m7<1>F g15<8,8,1>F { align1 };
+mov (8) m8<1>F g17<8,8,1>F { align1 };
+mov (8) m9<1>F g19<8,8,1>F { align1 };
+
+/* m0 and m1 are both passed directly from the PS thread payload */
+mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
+
+/* write */
+send (16) 0 acc0<1>UW g0<8,8,1>UW write (
+	0,  /* binding_table */
+	8,  /* pixel scoreboard clear, msg type simd16 single source */
+	4,  /* render target write */
+	0   /* no write commit message */
+	) 
+	mlen 10
+	rlen 0
+	{ align1 EOT };
+
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/exa_wm_nomask_prog.h b/src/exa_wm_nomask_prog.h
new file mode 100644
index 0000000..7870b3b
--- /dev/null
+++ b/src/exa_wm_nomask_prog.h
@@ -0,0 +1,70 @@
+   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
+   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000060 },
+   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000060 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000006c },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000006c },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
+   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000074 },
+   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000074 },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000007c },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0080, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d00a0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
+   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22600021, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01a0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d0260, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/i965_composite_wm_nomask.g4a b/src/i965_composite_wm_nomask.g4a
deleted file mode 100644
index 927d86a..0000000
--- a/src/i965_composite_wm_nomask.g4a
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * This's for exa composite operation in no mask picture case.
- * The simplest case is just sending what src picture has to dst picture.
- * XXX: This is still experimental, and should be fixed to support multiple texture
- * map, and conditional mul actions. 
- */
-
-/* I think this should be same as in g4a program for texture video,
-   as we also use 16-pixel dispatch. and SF scale in g3 is useful for us. */
-
-/* The initial payload of the thread is always g0.
- * WM_URB (incoming URB entries) is g3
- * X0_R is g4
- * X1_R is g5
- * Y0_R is g6
- * Y1_R is g7
- */
-
-    /* Set up ss0.x coordinates*/
-mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
-mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
-    /* Set up ss0.y coordinates */
-mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
-mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
-    /* set up ss1.x coordinates */
-mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
-mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
-    /* set up ss1.y coordinates */
-mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
-mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
-    /* Set up ss2.x coordinates */
-mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
-mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
-    /* Set up ss2.y coordinates */
-mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
-mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
-    /* Set up ss3.x coordinates */
-mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
-mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
-    /* Set up ss3.y coordinates */
-mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
-mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
-
-    /* Now, map these screen space coordinates into texture coordinates. */
-    /* subtract screen-space X origin of vertex 0. */
-add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
-    /* scale by texture X increment */
-mul (8) g4<1>F g4<8,8,1>F g3<0,1,0>F { align1 };
-mul (8) g5<1>F g5<8,8,1>F g3<0,1,0>F { align1 };
-    /* add in texture X offset */
-add (8) g4<1>F g4<8,8,1>F g3.12<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F g3.12<0,1,0>F { align1 };
-    /* subtract screen-space Y origin of vertex 0. */
-add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
-    /* scale by texture Y increment */
-mul (8) g6<1>F g6<8,8,1>F g3.20<0,1,0>F { align1 };
-mul (8) g7<1>F g7<8,8,1>F g3.20<0,1,0>F { align1 };
-    /* add in texture Y offset */
-add (8) g6<1>F g6<8,8,1>F g3.28<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F g3.28<0,1,0>F { align1 };
-
-/* prepare sampler read back gX register, which would be written back to output */
-
-/* use simd16 sampler, param 0 is u, param 1 is v. */
-/* 'payload' loading, assuming tex coord start from g4 */
-mov (8) m1<1>F g4<8,8,1>F { align1 };
-mov (8) m2<1>F g5<8,8,1>F { align1 };  /* param 0 u in m1, m2 */
-mov (8) m3<1>F g6<8,8,1>F { align1 };
-mov (8) m4<1>F g7<8,8,1>F { align1 };  /* param 1 v in m3, m4 */
-
-/* m0 will be copied with g0, as it contains send desc */
-/* emit sampler 'send' cmd */
-send (16) 0 		/* msg reg index */
-	g12<1>UW 	/* readback */
-	g0<8,8,1>UW  	/* copy to msg start reg*/
-	sampler (1,0,F)  /* sampler message description, (binding_table,sampler_index,datatype)
-			 /* here(src->dst) we should use src_sampler and src_surface */
-	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
-
-/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
-
-/* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>F g1<8,8,1>F { align1 };
-
-/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
-/* g12 -> m2
-   g13 -> m6
-   g14 -> m3
-   g15 -> m7
-   g16 -> m4
-   g17 -> m8
-   g18 -> m5
-   g19 -> m9
-*/
-mov (8) m2<1>F g12<8,8,1>F { align1 };
-mov (8) m3<1>F g14<8,8,1>F { align1 };
-mov (8) m4<1>F g16<8,8,1>F { align1 };
-mov (8) m5<1>F g18<8,8,1>F { align1 };
-mov (8) m6<1>F g13<8,8,1>F { align1 };
-mov (8) m7<1>F g15<8,8,1>F { align1 };
-mov (8) m8<1>F g17<8,8,1>F { align1 };
-mov (8) m9<1>F g19<8,8,1>F { align1 };
-
-/* write */
-send (16) 0 null g0<8,8,1>UW write (
-	0,  /* binding_table */
-	8,  /* pixel scordboard clear, msg type simd16 single source */
-	4,  /* render target write */
-	0   /* no write commit message */
-	) 
-	mlen 10
-	rlen 0
-	{ align1 EOT };
-
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
diff --git a/src/i965_composite_wm_nomask.h b/src/i965_composite_wm_nomask.h
deleted file mode 100644
index bd99dd9..0000000
--- a/src/i965_composite_wm_nomask.h
+++ /dev/null
@@ -1,68 +0,0 @@
-   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
-   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000060 },
-   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000060 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000006c },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000006c },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
-   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000074 },
-   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000074 },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000007c },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000007c },
-   { 0x00600001, 0x202003be, 0x008d0080, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d00a0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
-   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01a0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d0260, 0x00000000 },
-   { 0x00800031, 0x20001d3c, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index fe3007b..2751778 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -342,7 +342,7 @@ static const CARD32 sip_kernel_static[][
 #define SF_MAX_THREADS	   4
 
 static const CARD32 sf_kernel_static[][4] = {
-#include "sf_prog.h"
+#include "exa_sf_prog.h"
 };
 
 /* ps kernels */
@@ -350,7 +350,7 @@ static const CARD32 sf_kernel_static[][4
 #define PS_MAX_THREADS	   32
 /* 1: no mask */
 static const CARD32 ps_kernel_static_nomask [][4] = {
-	#include "i965_composite_wm_nomask.h"
+	#include "exa_wm_nomask_prog.h"
 };
 
 /* 2: mask with componentAlpha, src * mask color, XXX: later */
diff-tree a06beb5f80f097fac3b718e742742bb32f1c1194 (from 4198f1216eb13b30d1e92d4395e98861f4324c38)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 15:47:19 2006 +0800

    EXA state mem for G965
    
    Not split offscreen mem for exa, but alloc a dedicated one
    for G965 states.

diff --git a/src/i830.h b/src/i830.h
index df1c171..2a68499 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -73,6 +73,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN
 #ifdef I830_USE_EXA
 #include "exa.h"
 Bool I830EXAInit(ScreenPtr pScreen);
+#define EXA_LINEAR_EXTRA	(64*1024)
 #endif
 
 #ifdef I830_USE_XAA
@@ -243,6 +244,7 @@ typedef struct _I830Rec {
    I830MemRange Scratch2;
 #ifdef I830_USE_EXA
    I830MemRange Offscreen;
+   I830MemRange EXAStateMem;  /* specific exa state for G965 */
 #endif
    /* Regions allocated either from the above pools, or from agpgart. */
    I830MemRange	*CursorMem;
diff --git a/src/i830_exa.c b/src/i830_exa.c
index 9b2e6b2..8b07ecb 100644
--- a/src/i830_exa.c
+++ b/src/i830_exa.c
@@ -424,7 +424,6 @@ IntelEXADoneComposite(PixmapPtr pDst)
 #endif
 }
 
-#define BRW_LINEAR_EXTRA (32*1024)
 /*
  * TODO:
  *   - Dual head?
@@ -447,11 +446,7 @@ I830EXAInit(ScreenPtr pScreen)
     pI830->EXADriverPtr->exa_minor = 0;
     pI830->EXADriverPtr->memoryBase = pI830->FbBase;
     pI830->EXADriverPtr->offScreenBase = pI830->Offscreen.Start;
-    if (IS_I965G(pI830))
-    	pI830->EXADriverPtr->memorySize = pI830->Offscreen.End -
-					BRW_LINEAR_EXTRA; /* BRW needs state buffer*/
-    else
-    	pI830->EXADriverPtr->memorySize = pI830->Offscreen.End;
+    pI830->EXADriverPtr->memorySize = pI830->Offscreen.End;
 	   
     DPRINTF(PFX, "EXA Mem: memoryBase 0x%x, end 0x%x, offscreen base 0x%x, memorySize 0x%x\n",
 		pI830->EXADriverPtr->memoryBase,
diff --git a/src/i830_memory.c b/src/i830_memory.c
index e3307d6..4a8d480 100644
--- a/src/i830_memory.c
+++ b/src/i830_memory.c
@@ -905,6 +905,25 @@ I830Allocate2DMemory(ScrnInfoPtr pScrn, 
 		       "offscreen memory at 0x%lx, size %ld KB\n", 
 			pI830->Offscreen.Start, pI830->Offscreen.Size/1024);
       }
+      if (IS_I965G(pI830)) {
+          memset(&(pI830->EXAStateMem), 0, sizeof(I830MemRange));
+          pI830->EXAStateMem.Key = -1;
+          size = ROUND_TO_PAGE(EXA_LINEAR_EXTRA);
+          align = GTT_PAGE_SIZE;
+          alloced = I830AllocVidMem(pScrn, &(pI830->EXAStateMem),
+				&(pI830->StolenPool), size, align,
+				flags | FROM_ANYWHERE | ALLOCATE_AT_TOP);
+          if (alloced < size) {
+             if (!dryrun) {
+         	 xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+		    "G965: Failed to allocate exa state buffer space.\n");
+             }
+             return FALSE;
+          }
+          xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, verbosity,
+ 		  "%sAllocated %ld kB for the G965 exa state buffer at 0x%lx - 0x%lx.\n", s, 
+ 		alloced / 1024, pI830->EXAStateMem.Start, pI830->EXAStateMem.End);
+      }
 #endif
    } else {
       long lineSize;
@@ -1545,6 +1564,11 @@ I830FixupOffsets(ScrnInfoPtr pScrn)
       I830FixOffset(pScrn, &(pI830->TexMem));
    }
 #endif
+#ifdef I830_USE_EXA
+   I830FixOffset(pScrn, &(pI830->Offscreen));
+   if (IS_I965G(pI830))
+       I830FixOffset(pScrn, &(pI830->EXAStateMem));
+#endif
    return TRUE;
 }
 
@@ -1945,6 +1969,12 @@ I830BindAGPMemory(ScrnInfoPtr pScrn)
 	    return FALSE;
       }
 #endif
+#ifdef I830_USE_EXA
+     if (!BindMemRange(pScrn, &(pI830->Offscreen)))
+	return FALSE;
+     if (IS_I965G(pI830) && !BindMemRange(pScrn, &(pI830->EXAStateMem)))
+	return FALSE;
+#endif
       pI830->GttBound = 1;
    }
 
@@ -2029,6 +2059,12 @@ I830UnbindAGPMemory(ScrnInfoPtr pScrn)
 	    return FALSE;
       }
 #endif
+#ifdef I830_USE_EXA
+     if (!UnbindMemRange(pScrn, &(pI830->Offscreen)))
+	return FALSE;
+     if (IS_I965G(pI830) && !UnbindMemRange(pScrn, &(pI830->EXAStateMem)))
+	return FALSE;
+#endif
       if (!xf86ReleaseGART(pScrn->scrnIndex))
 	 return FALSE;
 
diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 7fbf99c..fe3007b 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -490,21 +490,12 @@ ErrorF("i965 prepareComposite\n");
    next_offset = binding_table_offset + (binding_table_entries * 4);
 
    total_state_size = next_offset;
+   assert(total_state_size < EXA_LINEAR_EXTRA);
 
-   /*
-    * XXX: Use the extra space allocated at the end of the exa offscreen buffer?
-    */
-#define BRW_LINEAR_EXTRA	(32*1024)
-
-   state_base_offset = (pI830->Offscreen.End -
-			BRW_LINEAR_EXTRA);
-   
+   state_base_offset = pI830->EXAStateMem.Start;
    state_base_offset = ALIGN(state_base_offset, 64);
    state_base = (char *)(pI830->FbBase + state_base_offset);
-   /* Set up our pointers to state structures in framebuffer.  It would probably
-    * be a good idea to fill these structures out in system memory and then dump
-    * them there, instead.
-    */
+
    vs_state = (void *)(state_base + vs_offset);
    sf_state = (void *)(state_base + sf_offset);
    wm_state = (void *)(state_base + wm_offset);