Hi Chris,<br>Did this (and other patches that you wrote) get merged yet? Did you create an account? I&#39;m interested in your work and would like to test it.<br><br><div class="gmail_quote">On Wed, May 6, 2009 at 10:28 AM, Christoph Bumiller <span dir="ltr">&lt;<a href="mailto:e0425955@student.tuwien.ac.at" target="_blank">e0425955@student.tuwien.ac.at</a>&gt;</span> wrote:<br>

<blockquote class="gmail_quote" style="margin: 0pt 0pt 0pt 0.8ex; border-left: 1px solid rgb(204, 204, 204); padding-left: 1ex;">Hi ! I&#39;ve been trying to improve NV50 shader generation a bit the last couple of weeks, so here is<br>


what I&#39;ve produced. I don&#39;t know if it&#39;s usable for you or just a pile of horrible hacks, but at<br>
least it makes some mesa demos render more correctly, e.g. the teapot (aside from mip-mapping issues<br>
of the floor texture), arbfplight, and I think the gears also didn&#39;t appear as they should before,<br>
and I hope it doesn&#39;t break others that worked.<br>
I also tried playing neverball and neverputt, which at some point worked fine, but now it locks up<br>
the GPU again after a certain (short) amount of time. That&#39;s probably not related to my<br>
modifications, because it crashes without the patches as well (plus has some flickering and other<br>
graphics errors). It seems to work OK if I run it with valgrind, though.<br>
There also are and have been some random graphics errors that spam the kernel log with invalid<br>
method NV50TCL_VERTEX_END, so if something doesn&#39;t look right, try to restart the program, or toggle<br>
some options in the mesa demos (show help, etc.).<br>
<br>
There might, as always, be some bugs in the patches, of course, and they probably can&#39;t be<br>
committed unmodified. I&#39;ve not put them in the email text but as attachments because there are rather<br>
many changes. There&#39;s a short description (commit log) in each patch, but I hope the code speaks for<br>
itself, otherwise I&#39;ll provide more explanation / add more comments ... later.<br>
These don&#39;t represent everything I&#39;ve tried to improve, but the rest isn&#39;t in any usable shape yet.<br>
<br>
If anyone who knows their way around the gallium code has time, please have a look and tell me what<br>
you think. Thank you.<br>
<font color="#888888"><br>
Christoph<br>
</font><br>commit 7ab9fc73707be46375668e557b5a5c1a373096ad<br>
Author: chr &lt;chr@LAPTOP.(none)&gt;<br>
Date:   Sun May 3 21:03:35 2009 +0200<br>
<br>
    Remove some memory leaks: free allocated temp in all opcode cases<br>
    of tx_insn; free nv50_regs for immds in LIT and those allocated in<br>
    tgsi_src.<br>
    Make LRP use 2 instructions (SUB,MAD) instead of 3 (NEG,MAD,MAD).<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index 2d15868..1a94327 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -28,6 +28,7 @@<br>
 #include &quot;pipe/p_shader_tokens.h&quot;<br>
 #include &quot;tgsi/tgsi_parse.h&quot;<br>
 #include &quot;tgsi/tgsi_util.h&quot;<br>
+#include &quot;tgsi/tgsi_dump.h&quot;<br>
<br>
 #include &quot;nv50_context.h&quot;<br>
<br>
@@ -795,12 +796,6 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,<br>
        struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);<br>
        struct nv50_reg *tmp[4];<br>
<br>
-       if (mask &amp; (1 &lt;&lt; 0))<br>
-               emit_mov(pc, dst[0], one);<br>
-<br>
-       if (mask &amp; (1 &lt;&lt; 3))<br>
-               emit_mov(pc, dst[3], one);<br>
-<br>
        if (mask &amp; (3 &lt;&lt; 1)) {<br>
                if (mask &amp; (1 &lt;&lt; 1))<br>
                        tmp[0] = dst[1];<br>
@@ -823,6 +818,18 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,<br>
                emit_mov(pc, dst[2], zero);<br>
                set_pred(pc, 3, 0, pc-&gt;p-&gt;exec_tail);<br>
        }<br>
+<br>
+       /* do this last, in case src[i,j] == dst[0,3] */<br>
+       if (mask &amp; (1 &lt;&lt; 0))<br>
+               emit_mov(pc, dst[0], one);<br>
+<br>
+       if (mask &amp; (1 &lt;&lt; 3))<br>
+               emit_mov(pc, dst[3], one);<br>
+<br>
+       FREE(pos128);<br>
+       FREE(neg128);<br>
+       FREE(zero);<br>
+       FREE(one);<br>
 }<br>
<br>
 static void<br>
@@ -885,8 +892,9 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)<br>
 {<br>
        struct nv50_reg *r = NULL;<br>
        struct nv50_reg *temp;<br>
-       unsigned c;<br>
+       unsigned sgn, c;<br>
<br>
+       sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);<br>
        c = tgsi_util_get_full_src_register_extswizzle(src, chan);<br>
        switch (c) {<br>
        case TGSI_EXTSWIZZLE_X:<br>
@@ -915,16 +923,18 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)<br>
                break;<br>
        case TGSI_EXTSWIZZLE_ZERO:<br>
                r = alloc_immd(pc, 0.0);<br>
-               break;<br>
+               return r;<br>
        case TGSI_EXTSWIZZLE_ONE:<br>
-               r = alloc_immd(pc, 1.0);<br>
-               break;<br>
+               if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)<br>
+                       return alloc_immd(pc, -1.0);<br>
+               else<br>
+                       return alloc_immd(pc, 1.0);<br>
        default:<br>
                assert(0);<br>
                break;<br>
        }<br>
<br>
-       switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {<br>
+       switch (sgn) {<br>
        case TGSI_UTIL_SIGN_KEEP:<br>
                break;<br>
        case TGSI_UTIL_SIGN_CLEAR:<br>
@@ -955,7 +965,7 @@ static boolean<br>
 nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
 {<br>
        const struct tgsi_full_instruction *inst = &amp;tok-&gt;FullInstruction;<br>
-       struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;<br>
+       struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp = NULL;<br>
        unsigned mask, sat, unit;<br>
        int i, c;<br>
<br>
@@ -1021,7 +1031,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                                continue;<br>
                        emit_mov(pc, dst[c], temp);<br>
                }<br>
-               free_temp(pc, temp);<br>
                break;<br>
        case TGSI_OPCODE_DP4:<br>
                temp = alloc_temp(pc, NULL);<br>
@@ -1034,7 +1043,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                                continue;<br>
                        emit_mov(pc, dst[c], temp);<br>
                }<br>
-               free_temp(pc, temp);<br>
                break;<br>
        case TGSI_OPCODE_DPH:<br>
                temp = alloc_temp(pc, NULL);<br>
@@ -1047,7 +1055,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                                continue;<br>
                        emit_mov(pc, dst[c], temp);<br>
                }<br>
-               free_temp(pc, temp);<br>
                break;<br>
        case TGSI_OPCODE_DST:<br>
        {<br>
@@ -1072,7 +1079,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                                continue;<br>
                        emit_mov(pc, dst[c], temp);<br>
                }<br>
-               free_temp(pc, temp);<br>
                break;<br>
        case TGSI_OPCODE_FLR:<br>
                for (c = 0; c &lt; 4; c++) {<br>
@@ -1089,7 +1095,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                        emit_flr(pc, temp, src[0][c]);<br>
                        emit_sub(pc, dst[c], src[0][c], temp);<br>
                }<br>
-               free_temp(pc, temp);<br>
                break;<br>
        case TGSI_OPCODE_KIL:<br>
                emit_kil(pc, src[0][0]);<br>
@@ -1110,15 +1115,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                }<br>
                break;<br>
        case TGSI_OPCODE_LRP:<br>
+               temp = alloc_temp(pc, NULL);<br>
                for (c = 0; c &lt; 4; c++) {<br>
                        if (!(mask &amp; (1 &lt;&lt; c)))<br>
                                continue;<br>
-                       /*XXX: we can do better than this */<br>
-                       temp = alloc_temp(pc, NULL);<br>
-                       emit_neg(pc, temp, src[0][c]);<br>
-                       emit_mad(pc, temp, temp, src[2][c], src[2][c]);<br>
-                       emit_mad(pc, dst[c], src[0][c], src[1][c], temp);<br>
-                       free_temp(pc, temp);<br>
+                       emit_sub(pc, temp, src[1][c], src[2][c]);<br>
+                       emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);<br>
                }<br>
                break;<br>
        case TGSI_OPCODE_MAD:<br>
@@ -1164,7 +1166,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                                continue;<br>
                        emit_mov(pc, dst[c], temp);<br>
                }<br>
-               free_temp(pc, temp);<br>
                break;<br>
        case TGSI_OPCODE_RCP:<br>
                for (c = 0; c &lt; 4; c++) {<br>
@@ -1259,7 +1260,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                        emit_mul(pc, temp, src[0][1], src[1][0]);<br>
                        emit_msb(pc, dst[2], src[0][0], src[1][1], temp);<br>
                }<br>
-               free_temp(pc, temp);<br>
                break;<br>
        case TGSI_OPCODE_END:<br>
                break;<br>
@@ -1268,6 +1268,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                return FALSE;<br>
        }<br>
<br>
+       if (temp)<br>
+               free_temp(pc, temp);<br>
+<br>
        if (sat) {<br>
                for (c = 0; c &lt; 4; c++) {<br>
                        struct nv50_program_exec *e;<br>
@@ -1288,6 +1291,19 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                }<br>
        }<br>
<br>
+       for (i = 0; i &lt; inst-&gt;Instruction.NumSrcRegs; i++) {<br>
+               for (c = 0; c &lt; 4; c++) {<br>
+                       if (!src[i][c])<br>
+                               continue;<br>
+                       if (src[i][c]-&gt;index == -1 &amp;&amp; src[i][c]-&gt;type == P_IMMD)<br>
+                               FREE(src[i][c]);<br>
+<br>
+                       /* Might also release temporaries not used anymore in this loop,<br>
+                        * therefore no temp_immd and temp_immd_nr like for temp_temp.<br>
+                        */<br>
+               }<br>
+       }<br>
+<br>
        kill_temp_temp(pc);<br>
        return TRUE;<br>
 }<br>
<br>commit 93d8cfb3e13179d6ed28c4989cefc92389008f0b<br>
Author: chr &lt;chr@LAPTOP.(none)&gt;<br>
Date:   Tue May 5 20:54:43 2009 +0200<br>
<br>
    - extend nv50_pc to track insn nr, add allow half insn boolean<br>
    - extend nv50_reg to record insn of last use and FP output hw index<br>
    - add some functions for later use<br>
    - modify alloc_reg to prefer final FP output hw if set<br>
    - record interpolation mode in tx_prep<br>
    - count number of insns in tx_prep<br>
    - record depth output, and position and color input indices<br>
    - inspect instructions for register usage<br>
    - set pc-&gt;allow32 to FALSE on first and last insn<br>
<br>
    shouldn&#39;t change generated shader code yet<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index 1a94327..cb92a31 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -86,6 +86,9 @@ struct nv50_reg {<br>
<br>
        int hw;<br>
        int neg;<br>
+<br>
+       int rhw; /* result hw for FP outputs */<br>
+       int acc; /* instruction where this reg is last read (first insn == 1) */<br>
 };<br>
<br>
 struct nv50_pc {<br>
@@ -109,6 +112,12 @@ struct nv50_pc {<br>
<br>
        struct nv50_reg *temp_temp[16];<br>
        unsigned temp_temp_nr;<br>
+<br>
+       /* current instruction and total number of insns */<br>
+       unsigned insn_cur;<br>
+       unsigned insn_nr;<br>
+<br>
+       boolean allow32; /* TRUE when half insns are allowed */<br>
 };<br>
<br>
 static void<br>
@@ -132,7 +141,24 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)<br>
                return;<br>
        }<br>
<br>
-       for (i = 0; i &lt; NV50_SU_MAX_TEMP; i++) {<br>
+       i = 0;<br>
+       if (reg-&gt;rhw != -1) {<br>
+               /* try to allocate temporary with index rhw first */<br>
+               if (!(pc-&gt;r_temp[reg-&gt;rhw])) {<br>
+                       pc-&gt;r_temp[reg-&gt;rhw] = reg;<br>
+                       reg-&gt;hw = reg-&gt;rhw;<br>
+                       if (pc-&gt;p-&gt;cfg.high_temp &lt; (reg-&gt;rhw + 1))<br>
+                               pc-&gt;p-&gt;cfg.high_temp = reg-&gt;rhw + 1;<br>
+                       return;<br>
+               }<br>
+        /* If we can&#39;t allocate the final destination index of the output,<br>
+         * put it in a high temporary so we need not shuffle around later.<br>
+                * (like, $r0 needs to go in $r1 and $r1 in $r0 etc.)<br>
+         */<br>
+               i = pc-&gt;result_nr * 4;<br>
+       }<br>
+<br>
+       for (; i &lt; NV50_SU_MAX_TEMP; i++) {<br>
                if (!(pc-&gt;r_temp[i])) {<br>
                        pc-&gt;r_temp[i] = reg;<br>
                        reg-&gt;hw = i;<br>
@@ -160,6 +186,7 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)<br>
                        r-&gt;type = P_TEMP;<br>
                        r-&gt;index = -1;<br>
                        r-&gt;hw = i;<br>
+                       r-&gt;rhw = -1;<br>
                        pc-&gt;r_temp[i] = r;<br>
                        return r;<br>
                }<br>
@@ -169,6 +196,56 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)<br>
        return NULL;<br>
 }<br>
<br>
+static struct nv50_reg *<br>
+alloc_preferred_temp(struct nv50_pc *pc, int hw)<br>
+{<br>
+       struct nv50_reg *r;<br>
+<br>
+       if (hw &gt;= NV50_SU_MAX_TEMP || hw == -1 || pc-&gt;r_temp[hw])<br>
+               return alloc_temp(pc, NULL);<br>
+<br>
+       r = CALLOC_STRUCT(nv50_reg);<br>
+       r-&gt;type = P_TEMP;<br>
+       r-&gt;index = -1;<br>
+       r-&gt;hw = hw;<br>
+       r-&gt;rhw = -1;<br>
+       pc-&gt;r_temp[hw] = r;<br>
+<br>
+       return r;<br>
+}<br>
+<br>
+/* Assign the hw of the discarded temporary register src<br>
+ * to the tgsi register dst and free src.<br>
+ */<br>
+static void<br>
+assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)<br>
+{<br>
+       assert(dst-&gt;index != -1 &amp;&amp; src-&gt;index == -1 &amp;&amp; src-&gt;hw != -1);<br>
+<br>
+       if (dst-&gt;hw != -1)<br>
+               pc-&gt;r_temp[dst-&gt;hw] = NULL;<br>
+       pc-&gt;r_temp[src-&gt;hw] = dst;<br>
+       dst-&gt;hw = src-&gt;hw;<br>
+<br>
+       FREE(src);<br>
+}<br>
+<br>
+/* release the hardware resource held by r */<br>
+static void<br>
+release_hw(struct nv50_pc *pc, struct nv50_reg *r)<br>
+{<br>
+       assert(r-&gt;type == P_TEMP);<br>
+       if (r-&gt;hw == -1)<br>
+               return;<br>
+<br>
+       assert(pc-&gt;r_temp[r-&gt;hw] == r);<br>
+       pc-&gt;r_temp[r-&gt;hw] = NULL;<br>
+<br>
+       r-&gt;acc = 0;<br>
+       if (r-&gt;index == -1)<br>
+               FREE(r);<br>
+}<br>
+<br>
 static void<br>
 free_temp(struct nv50_pc *pc, struct nv50_reg *r)<br>
 {<br>
@@ -251,7 +328,14 @@ alloc_immd(struct nv50_pc *pc, float f)<br>
        struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);<br>
        unsigned hw;<br>
<br>
-       hw = ctor_immd(pc, f, 0, 0, 0) * 4;<br>
+       /* don&#39;t allocate more space if the value is already there */<br>
+       for (hw = 0; hw &lt; pc-&gt;immd_nr * 4; ++hw)<br>
+               if (pc-&gt;immd_buf[hw] == f)<br>
+                       break;<br>
+<br>
+       if (hw == pc-&gt;immd_nr * 4)<br>
+               hw = ctor_immd(pc, f, 0, 0, 0) * 4;<br>
+<br>
        r-&gt;type = P_IMMD;<br>
        r-&gt;hw = hw;<br>
        r-&gt;index = -1;<br>
@@ -355,6 +439,12 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)<br>
        e-&gt;inst[1] |= (val &gt;&gt; 6) &lt;&lt; 2;<br>
 }<br>
<br>
+<br>
+#define INTERP_LINEAR          0<br>
+#define INTERP_FLAT                    1<br>
+#define INTERP_PERSPECTIVE     2<br>
+#define INTERP_CENTROID                4<br>
+<br>
 static void<br>
 emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,<br>
            struct nv50_reg *src, struct nv50_reg *iv)<br>
@@ -535,6 +625,14 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)<br>
        e-&gt;inst[1] |= (src-&gt;hw &lt;&lt; 14);<br>
 }<br>
<br>
+static boolean<br>
+requires_long(struct nv50_program_exec *e, struct nv50_reg *src)<br>
+{<br>
+       if (is_long(e) || src-&gt;type == P_IMMD || src-&gt;type == P_CONST)<br>
+               return TRUE;<br>
+       return FALSE;<br>
+}<br>
+<br>
 static void<br>
 emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,<br>
         struct nv50_reg *src1)<br>
@@ -870,6 +968,62 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)<br>
        emit(pc, e);<br>
 }<br>
<br>
+static void<br>
+emit_nop(struct nv50_pc *pc, boolean l)<br>
+{<br>
+       struct nv50_program_exec *e = exec(pc);<br>
+<br>
+       e-&gt;inst[0] = 0xF0000000;<br>
+       if (l) {<br>
+               set_long(pc, e);<br>
+               e-&gt;inst[1] = 0xE0000000;<br>
+       }<br>
+<br>
+       emit(pc, e);<br>
+}<br>
+<br>
+/* Adjust a bitmask that indicates what components of a source are used,<br>
+ * we use this in tx_prep so we only load interpolants that are needed.<br>
+ */<br>
+static void<br>
+insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)<br>
+{<br>
+       const struct tgsi_instruction_ext_texture *tex;<br>
+<br>
+       switch (insn-&gt;Instruction.Opcode) {<br>
+       case TGSI_OPCODE_DP3:<br>
+               *mask = 0x7;<br>
+               break;<br>
+       case TGSI_OPCODE_DP4:<br>
+       case TGSI_OPCODE_DPH:<br>
+               *mask = 0xF;<br>
+               break;<br>
+       case TGSI_OPCODE_LIT:<br>
+               *mask = 0xB;<br>
+               break;<br>
+       case TGSI_OPCODE_RCP:<br>
+       case TGSI_OPCODE_RSQ:<br>
+               *mask = 0x1;<br>
+               break;<br>
+       case TGSI_OPCODE_TXP:<br>
+               *mask = 0x8;<br>
+               /* fall through to TEX */<br>
+       case TGSI_OPCODE_TEX:<br>
+               assert(insn-&gt;Instruction.Extended);<br>
+               tex = &amp;insn-&gt;InstructionExtTexture;<br>
+<br>
+               if (tex-&gt;Texture == TGSI_TEXTURE_1D)<br>
+                       *mask |= 0x1;<br>
+               else<br>
+               if (tex-&gt;Texture == TGSI_TEXTURE_2D)<br>
+                       *mask |= 0x3;<br>
+               else<br>
+                       *mask |= 0x7;<br>
+       default:<br>
+               break;<br>
+       }<br>
+}<br>
+<br>
 static struct nv50_reg *<br>
 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)<br>
 {<br>
@@ -1308,12 +1462,53 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
        return TRUE;<br>
 }<br>
<br>
+static void<br>
+set_acc_array(unsigned *p, const struct tgsi_full_src_register *src,<br>
+                         unsigned mask, unsigned n)<br>
+{<br>
+       unsigned k, c;<br>
+<br>
+       for (c = 0; c &lt; 4; c++) {<br>
+               if (!(mask &amp; (1 &lt;&lt; c)))<br>
+                       continue;<br>
+<br>
+               k = tgsi_util_get_full_src_register_extswizzle(src, c);<br>
+               switch (k) {<br>
+               case TGSI_EXTSWIZZLE_X:<br>
+               case TGSI_EXTSWIZZLE_Y:<br>
+               case TGSI_EXTSWIZZLE_Z:<br>
+               case TGSI_EXTSWIZZLE_W:<br>
+                       p[src-&gt;SrcRegister.Index * 4 + k] = n;<br>
+                       break;<br>
+               default:<br>
+                       break;<br>
+               }<br>
+       }<br>
+}<br>
+<br>
 static boolean<br>
 nv50_program_tx_prep(struct nv50_pc *pc)<br>
 {<br>
        struct tgsi_parse_context p;<br>
        boolean ret = FALSE;<br>
        unsigned i, c;<br>
+       unsigned fcol, bcol, fcrd, depr;<br>
+<br>
+       /* record interpolation mode from declaration */<br>
+       boolean centroid_load = FALSE;<br>
+       boolean perspect_load = FALSE;<br>
+       unsigned interp_mode[32];<br>
+<br>
+       /* track register usage for temps and attrs */<br>
+       unsigned *last_t_use = NULL;<br>
+       unsigned *last_a_use = NULL;<br>
+<br>
+       depr = fcol = bcol = fcrd = 0xFFFFFFFF;<br>
+<br>
+       if (pc-&gt;p-&gt;type == PIPE_SHADER_FRAGMENT) {<br>
+               pc-&gt;p-&gt;cfg.fp.regs[0] = 0x01000404;<br>
+               pc-&gt;p-&gt;cfg.fp.regs[1] = 0x00000400;<br>
+       }<br>
<br>
        tgsi_parse_init(&amp;p, pc-&gt;p-&gt;pipe.tokens);<br>
        while (!tgsi_parse_end_of_tokens(&amp;p)) {<br>
@@ -1326,6 +1521,10 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
                        const struct tgsi_full_immediate *imm =<br>
                                &amp;p.FullToken.FullImmediate;<br>
<br>
+#ifdef NV50_PROGRAM_DUMP<br>
+                       tgsi_dump_immediate(imm);<br>
+#endif<br>
+<br>
                        ctor_immd(pc, imm-&gt;u.ImmediateFloat32[0].Float,<br>
                                      imm-&gt;u.ImmediateFloat32[1].Float,<br>
                                      imm-&gt;u.ImmediateFloat32[2].Float,<br>
@@ -1335,11 +1534,16 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
                case TGSI_TOKEN_TYPE_DECLARATION:<br>
                {<br>
                        const struct tgsi_full_declaration *d;<br>
-                       unsigned last;<br>
+                       unsigned last, first, mode;<br>
<br>
                        d = &amp;p.FullToken.FullDeclaration;<br>
+                       first = d-&gt;DeclarationRange.First;<br>
                        last = d-&gt;DeclarationRange.Last;<br>
<br>
+#ifdef NV50_PROGRAM_DUMP<br>
+                       tgsi_dump_declaration(d);<br>
+#endif<br>
+<br>
                        switch (d-&gt;Declaration.File) {<br>
                        case TGSI_FILE_TEMPORARY:<br>
                                if (pc-&gt;temp_nr &lt; (last + 1))<br>
@@ -1348,10 +1552,71 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
                        case TGSI_FILE_OUTPUT:<br>
                                if (pc-&gt;result_nr &lt; (last + 1))<br>
                                        pc-&gt;result_nr = last + 1;<br>
+<br>
+                               if (!d-&gt;Declaration.Semantic)<br>
+                                       break;<br>
+<br>
+                               switch (d-&gt;Semantic.SemanticName) {<br>
+                               case TGSI_SEMANTIC_POSITION:<br>
+                                       depr = first;<br>
+                                       pc-&gt;p-&gt;cfg.fp.regs[2] |= 0x00000100;<br>
+                                       pc-&gt;p-&gt;cfg.fp.regs[3] |= 0x00000011;<br>
+                                       break;<br>
+                               default:<br>
+                                       break;<br>
+                               }<br>
                                break;<br>
                        case TGSI_FILE_INPUT:<br>
+                       {<br>
                                if (pc-&gt;attr_nr &lt; (last + 1))<br>
                                        pc-&gt;attr_nr = last + 1;<br>
+<br>
+                               if (pc-&gt;p-&gt;type != PIPE_SHADER_FRAGMENT)<br>
+                                       break;<br>
+<br>
+                           switch (d-&gt;Declaration.Interpolate) {<br>
+                               case TGSI_INTERPOLATE_CONSTANT:<br>
+                                       mode = INTERP_FLAT;<br>
+                                       break;<br>
+                               case TGSI_INTERPOLATE_PERSPECTIVE:<br>
+                                       mode = INTERP_PERSPECTIVE;<br>
+                                       perspect_load = TRUE;<br>
+                                       break;<br>
+                               default:<br>
+                                       mode = INTERP_LINEAR;<br>
+                                       break;<br>
+                               }<br>
+<br>
+                               if (d-&gt;Declaration.Semantic) {<br>
+                                       switch (d-&gt;Semantic.SemanticName) {<br>
+                                       case TGSI_SEMANTIC_POSITION:<br>
+                                               fcrd = first;<br>
+                                               break;<br>
+                                       case TGSI_SEMANTIC_COLOR:<br>
+                                               fcol = first;<br>
+                                               mode = INTERP_PERSPECTIVE;<br>
+                                               perspect_load = TRUE;<br>
+                                               break;<br>
+                                       case TGSI_SEMANTIC_BCOLOR:<br>
+                                               bcol = first;<br>
+                                               mode = INTERP_PERSPECTIVE;<br>
+                                               perspect_load = TRUE;<br>
+                                               break;<br>
+                                       default:<br>
+                                               break;<br>
+                                       }<br>
+                               }<br>
+<br>
+                               if (d-&gt;Declaration.Centroid) {<br>
+                                       mode |= INTERP_CENTROID;<br>
+                                       centroid_load = TRUE;<br>
+                                       perspect_load = FALSE;<br>
+                               }<br>
+<br>
+                               assert(last &lt; 32);<br>
+                               for (i = first; i &lt;= last; i++)<br>
+                                       interp_mode[i] = mode;<br>
+                       }<br>
                                break;<br>
                        case TGSI_FILE_CONSTANT:<br>
                                if (pc-&gt;param_nr &lt; (last + 1))<br>
@@ -1367,6 +1632,43 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
                }<br>
                        break;<br>
                case TGSI_TOKEN_TYPE_INSTRUCTION:<br>
+               {<br>
+                       const struct tgsi_full_instruction *insn;<br>
+                       const struct tgsi_full_src_register *src;<br>
+                       const struct tgsi_dst_register *dst;<br>
+                       unsigned mask;<br>
+<br>
+                       pc-&gt;insn_nr++;<br>
+<br>
+                       if (!last_t_use) {<br>
+                               last_t_use = CALLOC(pc-&gt;temp_nr * 4, sizeof(unsigned));<br>
+                               last_a_use = CALLOC(pc-&gt;attr_nr * 4, sizeof(unsigned));<br>
+                       }<br>
+<br>
+                       insn = &amp;tok-&gt;FullInstruction;<br>
+                       dst = &amp;insn-&gt;FullDstRegisters[0].DstRegister;<br>
+                       mask = dst-&gt;WriteMask;<br>
+<br>
+#ifdef NV50_PROGRAM_DUMP<br>
+                       tgsi_dump_instruction(insn, 1);<br>
+#endif<br>
+                       if (dst-&gt;File == TGSI_FILE_TEMPORARY) {<br>
+                               for (c = 0; c &lt; 4; c++)<br>
+                                       if (mask &amp; (1 &lt;&lt; c))<br>
+                                               last_t_use[dst-&gt;Index * 4 + c] = pc-&gt;insn_nr;<br>
+                       }<br>
+<br>
+                       for (i = 0; i &lt; insn-&gt;Instruction.NumSrcRegs; ++i) {<br>
+                               src = &amp;insn-&gt;FullSrcRegisters[i];<br>
+                               insn_adjust_mask(insn, &amp;mask);<br>
+<br>
+                               if (src-&gt;SrcRegister.File == TGSI_FILE_TEMPORARY)<br>
+                                       set_acc_array(last_t_use, src, mask, pc-&gt;insn_nr);<br>
+                               else<br>
+                               if (src-&gt;SrcRegister.File == TGSI_FILE_INPUT)<br>
+                                       set_acc_array(last_a_use, src, mask, pc-&gt;insn_nr);<br>
+                       }<br>
+               }<br>
                        break;<br>
                default:<br>
                        break;<br>
@@ -1487,6 +1789,11 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
                }<br>
        }<br>
<br>
+       if (last_t_use)<br>
+               FREE(last_t_use);<br>
+       if (last_a_use)<br>
+               FREE(last_a_use);<br>
+<br>
        ret = TRUE;<br>
 out_err:<br>
        tgsi_parse_free(&amp;p);<br>
@@ -1516,8 +1823,15 @@ nv50_program_tx(struct nv50_program *p)<br>
<br>
                tgsi_parse_token(&amp;parse);<br>
<br>
+               /* don&#39;t allow half insn on first and last (not END) instruction */<br>
+               if (pc-&gt;insn_cur == 0 || pc-&gt;insn_cur + 2 == pc-&gt;insn_nr)<br>
+                       pc-&gt;allow32 = FALSE;<br>
+               else<br>
+                       pc-&gt;allow32 = TRUE;<br>
+<br>
                switch (tok-&gt;Token.Type) {<br>
                case TGSI_TOKEN_TYPE_INSTRUCTION:<br>
+                       ++pc-&gt;insn_cur;<br>
                        ret = nv50_program_tx_insn(pc, tok);<br>
                        if (ret == FALSE)<br>
                                goto out_err;<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h<br>
index 78deed6..3b3b6bb 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.h<br>
+++ b/src/gallium/drivers/nv50/nv50_program.h<br>
@@ -39,6 +39,11 @@ struct nv50_program {<br>
                struct {<br>
                        unsigned attr[2];<br>
                } vp;<br>
+               struct {<br>
+                       unsigned regs[4];<br>
+                       unsigned map[4];<br>
+                       unsigned high_map;<br>
+               } fp;<br>
        } cfg;<br>
 };<br>
<br>
<br>commit ebcc4b9cf61a25d8ef2fa87eecfb5e4e75b47bca<br>
Author: chr &lt;chr@LAPTOP.(none)&gt;<br>
Date:   Tue May 5 20:56:12 2009 +0200<br>
<br>
    - more correct loading FP interpolants, also consider interpolation mode<br>
    - use tgsi resource nv50_regs to store attributes<br>
    - improve values of shader registers<br>
    - make sure FP depth output goes where it&#39;s supposed to go<br>
    - loop through all instructions and make sure there are no single half insns<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index cb92a31..9acf882 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -445,20 +445,29 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)<br>
 #define INTERP_PERSPECTIVE     2<br>
 #define INTERP_CENTROID                4<br>
<br>
+/* interpolant index has been stored in dst-&gt;rhw */<br>
 static void<br>
-emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,<br>
-           struct nv50_reg *src, struct nv50_reg *iv)<br>
+emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,<br>
+               unsigned mode)<br>
 {<br>
+       assert(dst-&gt;rhw != -1);<br>
        struct nv50_program_exec *e = exec(pc);<br>
<br>
        e-&gt;inst[0] |= 0x80000000;<br>
        set_dst(pc, dst, e);<br>
-       alloc_reg(pc, src);<br>
-       e-&gt;inst[0] |= (src-&gt;hw &lt;&lt; 16);<br>
-       if (iv) {<br>
-               e-&gt;inst[0] |= (1 &lt;&lt; 25);<br>
-               alloc_reg(pc, iv);<br>
-               e-&gt;inst[0] |= (iv-&gt;hw &lt;&lt; 9);<br>
+       e-&gt;inst[0] |= (dst-&gt;rhw &lt;&lt; 16);<br>
+<br>
+       if (mode &amp; INTERP_FLAT) {<br>
+               e-&gt;inst[0] |= (1 &lt;&lt; 8);<br>
+       } else {<br>
+               if (mode &amp; INTERP_PERSPECTIVE) {<br>
+                       e-&gt;inst[0] |= (1 &lt;&lt; 25);<br>
+                       alloc_reg(pc, iv);<br>
+                       e-&gt;inst[0] |= (iv-&gt;hw &lt;&lt; 9);<br>
+               }<br>
+<br>
+               if (mode &amp; INTERP_CENTROID)<br>
+                       e-&gt;inst[0] |= (1 &lt;&lt; 24);<br>
        }<br>
<br>
        emit(pc, e);<br>
@@ -982,6 +991,43 @@ emit_nop(struct nv50_pc *pc, boolean l)<br>
        emit(pc, e);<br>
 }<br>
<br>
+static void<br>
+convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)<br>
+{<br>
+       unsigned q = 0, m = ~0;<br>
+<br>
+       assert(!is_long(e));<br>
+<br>
+       switch (e-&gt;inst[0] &gt;&gt; 28) {<br>
+               case 0x1:<br>
+                       /* MOV */<br>
+                       q = 0x0403c000;<br>
+                       m = 0xFFFF7FFF;<br>
+                       break;<br>
+               case 0x8:<br>
+                       /* INTERP */<br>
+                       m = ~0x02000000;<br>
+                       if (e-&gt;inst[0] &amp; 0x02000000)<br>
+                               q = 0x00020000;<br>
+                       break;<br>
+               case 0xC:<br>
+                       /* MUL */<br>
+                       break;<br>
+               case 0x9:<br>
+                       /* RCP */<br>
+                       break;<br>
+               default:<br>
+                       assert(0);<br>
+                       break;<br>
+       }<br>
+<br>
+       set_long(pc, e);<br>
+       pc-&gt;p-&gt;exec_size++;<br>
+<br>
+       e-&gt;inst[0] &amp;= m;<br>
+       e-&gt;inst[1] |= q;<br>
+}<br>
+<br>
 /* Adjust a bitmask that indicates what components of a source are used,<br>
  * we use this in tx_prep so we only load interpolants that are needed.<br>
  */<br>
@@ -1005,20 +1051,21 @@ insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)<br>
        case TGSI_OPCODE_RSQ:<br>
                *mask = 0x1;<br>
                break;<br>
-       case TGSI_OPCODE_TXP:<br>
-               *mask = 0x8;<br>
-               /* fall through to TEX */<br>
        case TGSI_OPCODE_TEX:<br>
+       case TGSI_OPCODE_TXP:<br>
                assert(insn-&gt;Instruction.Extended);<br>
                tex = &amp;insn-&gt;InstructionExtTexture;<br>
<br>
+               *mask = 0x7;<br>
                if (tex-&gt;Texture == TGSI_TEXTURE_1D)<br>
-                       *mask |= 0x1;<br>
+                       *mask = 0x1;<br>
                else<br>
                if (tex-&gt;Texture == TGSI_TEXTURE_2D)<br>
-                       *mask |= 0x3;<br>
-               else<br>
-                       *mask |= 0x7;<br>
+                       *mask = 0x3;<br>
+<br>
+               if (insn-&gt;Instruction.Opcode == TGSI_OPCODE_TXP)<br>
+                       *mask |= 0x8;<br>
+               break;<br>
        default:<br>
                break;<br>
        }<br>
@@ -1255,6 +1302,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                emit_kil(pc, src[0][1]);<br>
                emit_kil(pc, src[0][2]);<br>
                emit_kil(pc, src[0][3]);<br>
+               pc-&gt;p-&gt;cfg.fp.regs[2] |= 0x00100000;<br>
                break;<br>
        case TGSI_OPCODE_LIT:<br>
                emit_lit(pc, &amp;dst[0], mask, &amp;src[0][0]);<br>
@@ -1503,7 +1551,7 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
        unsigned *last_t_use = NULL;<br>
        unsigned *last_a_use = NULL;<br>
<br>
-       depr = fcol = bcol = fcrd = 0xFFFFFFFF;<br>
+       depr = fcol = bcol = fcrd = 0xFFFF;<br>
<br>
        if (pc-&gt;p-&gt;type == PIPE_SHADER_FRAGMENT) {<br>
                pc-&gt;p-&gt;cfg.fp.regs[0] = 0x01000404;<br>
@@ -1683,37 +1731,106 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
                for (i = 0; i &lt; pc-&gt;temp_nr; i++) {<br>
                        for (c = 0; c &lt; 4; c++) {<br>
                                pc-&gt;temp[i*4+c].type = P_TEMP;<br>
-                               pc-&gt;temp[i*4+c].hw = -1;<br>
+                               pc-&gt;temp[i*4+c].hw = pc-&gt;temp[i*4+c].rhw = -1;<br>
                                pc-&gt;temp[i*4+c].index = i;<br>
+                               pc-&gt;temp[i*4+c].acc = last_t_use[i*4+c];<br>
                        }<br>
                }<br>
        }<br>
<br>
        if (pc-&gt;attr_nr) {<br>
-               struct nv50_reg *iv = NULL;<br>
-               int aid = 0;<br>
+               struct nv50_reg *iv, *iv_c = NULL, *iv_p = NULL;<br>
+               int oid, off = 4, mid = 0, aid = 0;<br>
+<br>
+               /* off = VP output id offset to i*4 (oid = i*4 + off + c)<br>
+                * aid = FP attribute/interpolant id (incremented only for used attrs)<br>
+                * mid = VP output mapping field ID (HPOS not counted)<br>
+                */<br>
<br>
                pc-&gt;attr = CALLOC(pc-&gt;attr_nr * 4, sizeof(struct nv50_reg));<br>
                if (!pc-&gt;attr)<br>
                        goto out_err;<br>
<br>
+               i = 0;<br>
                if (pc-&gt;p-&gt;type == PIPE_SHADER_FRAGMENT) {<br>
-                       iv = alloc_temp(pc, NULL);<br>
-                       emit_interp(pc, iv, iv, NULL);<br>
-                       emit_flop(pc, 0, iv, iv);<br>
-                       aid++;<br>
+<br>
+                       if (fcrd != 0xFFFF) {<br>
+                               assert(fcrd == 0); /* position input should always be 0 */<br>
+                               i = 1;<br>
+                               off = 0;<br>
+                               for (c = 0; c &lt; 4; ++c) {<br>
+                                       if (last_a_use[c] == 0)<br>
+                                               continue;<br>
+<br>
+                                       pc-&gt;attr[c].index = fcrd;<br>
+                                       pc-&gt;attr[c].type = P_TEMP;<br>
+                                       pc-&gt;attr[c].acc = last_a_use[c];<br>
+                                       pc-&gt;attr[c].hw = pc-&gt;attr[c].rhw = -1;<br>
+<br>
+                                       alloc_reg(pc, &amp;pc-&gt;attr[c]);<br>
+                                       pc-&gt;attr[c].rhw = aid++;<br>
+<br>
+                                       emit_interp(pc, &amp;pc-&gt;attr[c], NULL, INTERP_LINEAR);<br>
+                                       pc-&gt;p-&gt;cfg.fp.regs[1] |= (1 &lt;&lt; (24 + c));<br>
+<br>
+                                       switch (c) {<br>
+                                       case 0:<br>
+                                       case 1:<br>
+                                               /* should probably do viewport stuff here */<br>
+                                               break;<br>
+                                       case 3:<br>
+                                               iv_p = &amp;pc-&gt;attr[c];<br>
+                                               emit_flop(pc, 0, iv_p, iv_p);<br>
+                                               break;<br>
+                                       default:<br>
+                                               break;<br>
+                                       }<br>
+                               }<br>
+                       }<br>
+<br>
+                       if (perspect_load &amp;&amp; !iv_p) {<br>
+                               iv_p = alloc_temp(pc, NULL);<br>
+                               iv_p-&gt;rhw = aid++;<br>
+                               emit_interp(pc, iv_p, NULL, INTERP_LINEAR);<br>
+                               emit_flop(pc, 0, iv_p, iv_p);<br>
+                               pc-&gt;p-&gt;cfg.fp.regs[1] |= 0x08000000;<br>
+                       }<br>
+<br>
+                       if (centroid_load) {<br>
+                               iv_c = alloc_temp(pc, NULL);<br>
+                               iv_c-&gt;rhw = iv_p ? aid - 1 : aid++;<br>
+                               emit_interp(pc, iv_c, NULL, INTERP_CENTROID);<br>
+                               emit_flop(pc, 0, iv_c, iv_c);<br>
+                               pc-&gt;p-&gt;cfg.fp.regs[1] |= 0x08000000;<br>
+                       }<br>
                }<br>
<br>
-               for (i = 0; i &lt; pc-&gt;attr_nr; i++) {<br>
+               for (; i &lt; pc-&gt;attr_nr; i++) {<br>
                        struct nv50_reg *a = &amp;pc-&gt;attr[i*4];<br>
+                       iv = (interp_mode[i] &amp; INTERP_CENTROID) ? iv_c : iv_p;<br>
<br>
                        for (c = 0; c &lt; 4; c++) {<br>
                                if (pc-&gt;p-&gt;type == PIPE_SHADER_FRAGMENT) {<br>
-                                       struct nv50_reg *at =<br>
-                                               alloc_temp(pc, NULL);<br>
-                                       pc-&gt;attr[i*4+c].type = at-&gt;type;<br>
-                                       pc-&gt;attr[i*4+c].hw = at-&gt;hw;<br>
-                                       pc-&gt;attr[i*4+c].index = at-&gt;index;<br>
+                                       a[c].hw = a[c].rhw = -1;<br>
+                                       a[c].index = -1;<br>
+                                       if (last_a_use[i*4+c] == 0)<br>
+                                               continue;<br>
+<br>
+                                       if (i == fcol || i == bcol)<br>
+                                               pc-&gt;p-&gt;cfg.fp.regs[0] += 0x00010000;<br>
+                                       pc-&gt;p-&gt;cfg.fp.regs[1] += 0x00010000;<br>
+<br>
+                                       a[c].index = i;<br>
+                                       a[c].type = P_TEMP;<br>
+                                       a[c].acc = last_a_use[i*4+c];<br>
+<br>
+                                       alloc_reg(pc, &amp;a[c]);<br>
+                                       a[c].rhw = aid++;<br>
+                                       emit_interp(pc, &amp;a[c], iv, interp_mode[i]);<br>
+<br>
+                                       oid = off + i * 4 + c;<br>
+                                       pc-&gt;p-&gt;cfg.fp.map[mid / 4] |= oid &lt;&lt; (8 * (mid % 4));<br>
+                                       mid++;<br>
                                } else {<br>
                                        pc-&gt;p-&gt;cfg.vp.attr[aid/32] |=<br>
                                                (1 &lt;&lt; (aid % 32));<br>
@@ -1722,18 +1839,16 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
                                        pc-&gt;attr[i*4+c].index = i;<br>
                                }<br>
                        }<br>
+               }<br>
<br>
-                       if (pc-&gt;p-&gt;type != PIPE_SHADER_FRAGMENT)<br>
-                               continue;<br>
+               if (pc-&gt;p-&gt;type == PIPE_SHADER_FRAGMENT) {<br>
+                       pc-&gt;p-&gt;cfg.fp.high_map = (mid / 4) + ((mid % 4) ? 1 : 0);<br>
<br>
-                       emit_interp(pc, &amp;a[0], &amp;a[0], iv);<br>
-                       emit_interp(pc, &amp;a[1], &amp;a[1], iv);<br>
-                       emit_interp(pc, &amp;a[2], &amp;a[2], iv);<br>
-                       emit_interp(pc, &amp;a[3], &amp;a[3], iv);<br>
+                       if (iv_p &amp;&amp; iv_p-&gt;index == -1)<br>
+                               free_temp(pc, iv_p);<br>
+                       if (iv_c)<br>
+                               free_temp(pc, iv_c);<br>
                }<br>
-<br>
-               if (iv)<br>
-                       free_temp(pc, iv);<br>
        }<br>
<br>
        if (pc-&gt;result_nr) {<br>
@@ -1748,9 +1863,15 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
                                if (pc-&gt;p-&gt;type == PIPE_SHADER_FRAGMENT) {<br>
                                        pc-&gt;result[i*4+c].type = P_TEMP;<br>
                                        pc-&gt;result[i*4+c].hw = -1;<br>
+                                       if (i == depr) {<br>
+                                               pc-&gt;result[i*4+c].rhw = ((c == 2) ?<br>
+                                                       (pc-&gt;result_nr - 1) * 4 : -1);<br>
+                                       } else<br>
+                                               pc-&gt;result[i*4+c].rhw = rid++;<br>
                                } else {<br>
                                        pc-&gt;result[i*4+c].type = P_RESULT;<br>
                                        pc-&gt;result[i*4+c].hw = rid++;<br>
+                                       pc-&gt;result[i*4+c].rhw = -1;<br>
                                }<br>
                                pc-&gt;result[i*4+c].index = i;<br>
                        }<br>
@@ -1805,6 +1926,7 @@ nv50_program_tx(struct nv50_program *p)<br>
 {<br>
        struct tgsi_parse_context parse;<br>
        struct nv50_pc *pc;<br>
+       unsigned i, k;<br>
        boolean ret;<br>
<br>
        pc = CALLOC_STRUCT(nv50_pc);<br>
@@ -1843,10 +1965,42 @@ nv50_program_tx(struct nv50_program *p)<br>
<br>
        if (p-&gt;type == PIPE_SHADER_FRAGMENT) {<br>
                struct nv50_reg out;<br>
-<br>
                out.type = P_TEMP;<br>
-               for (out.hw = 0; out.hw &lt; pc-&gt;result_nr * 4; out.hw++)<br>
-                       emit_mov(pc, &amp;out, &amp;pc-&gt;result[out.hw]);<br>
+<br>
+               for (i = 0; i &lt; pc-&gt;result_nr * 4; i++) {<br>
+                       if (pc-&gt;result[i].rhw == -1)<br>
+                               continue;<br>
+                       if (pc-&gt;result[i].hw != pc-&gt;result[i].rhw) {<br>
+                               out.hw = pc-&gt;result[i].rhw;<br>
+                               emit_mov(pc, &amp;out, &amp;pc-&gt;result[i]);<br>
+                       }<br>
+                       if (pc-&gt;p-&gt;cfg.high_result &lt; pc-&gt;result[i].rhw + 1)<br>
+                               pc-&gt;p-&gt;cfg.high_result = pc-&gt;result[i].rhw + 1;<br>
+               }<br>
+       }<br>
+<br>
+       /* look for single half instructions and make them long */<br>
+       struct nv50_program_exec *e, *e_prev;<br>
+<br>
+       for (k = 0, e = pc-&gt;p-&gt;exec_head, e_prev = NULL; e; e = e-&gt;next) {<br>
+               if (!is_long(e))<br>
+                       k++;<br>
+<br>
+               if (!e-&gt;next || is_long(e-&gt;next)) {<br>
+                       if (k &amp; 1)<br>
+                               convert_to_long(pc, e);<br>
+                       k = 0;<br>
+               }<br>
+<br>
+               if (e-&gt;next)<br>
+                       e_prev = e;<br>
+       }<br>
+<br>
+       if (!is_long(pc-&gt;p-&gt;exec_tail)) {<br>
+               /* this may occur if moving FP results */<br>
+               assert(e_prev &amp;&amp; !is_long(e_prev));<br>
+               convert_to_long(pc, e_prev);<br>
+               convert_to_long(pc, pc-&gt;p-&gt;exec_tail);<br>
        }<br>
<br>
        assert(is_long(pc-&gt;p-&gt;exec_tail) &amp;&amp; !is_immd(pc-&gt;p-&gt;exec_head));<br>
@@ -1973,7 +2127,7 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)<br>
                if (is_long(e))<br>
                        NOUVEAU_ERR(&quot;0x%08x\n&quot;, e-&gt;inst[1]);<br>
        }<br>
-<br>
+       FREE(up);<br>
 #endif<br>
<br>
        up = ptr = MALLOC(p-&gt;exec_size * 4);<br>
@@ -2058,6 +2212,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)<br>
        struct nouveau_grobj *tesla = nv50-&gt;screen-&gt;tesla;<br>
        struct nv50_program *p = nv50-&gt;fragprog;<br>
        struct nouveau_stateobj *so;<br>
+       unsigned i;<br>
<br>
        if (!p-&gt;translated) {<br>
                nv50_program_validate(nv50, p);<br>
@@ -2068,24 +2223,30 @@ nv50_fragprog_validate(struct nv50_context *nv50)<br>
        nv50_program_validate_data(nv50, p);<br>
        nv50_program_validate_code(nv50, p);<br>
<br>
-       so = so_new(64, 2);<br>
+       so = so_new(32, 2);<br>
        so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);<br>
        so_reloc (so, p-&gt;buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |<br>
                  NOUVEAU_BO_HIGH, 0, 0);<br>
        so_reloc (so, p-&gt;buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |<br>
                  NOUVEAU_BO_LOW, 0, 0);<br>
        so_method(so, tesla, 0x1904, 4);<br>
-       so_data  (so, 0x00040404); /* p: 0x01000404 */<br>
+       so_data  (so, p-&gt;cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 etc. */<br>
        so_data  (so, 0x00000004);<br>
        so_data  (so, 0x00000000);<br>
        so_data  (so, 0x00000000);<br>
-       so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */<br>
+       so_method(so, tesla, 0x16bc, 1 + p-&gt;cfg.fp.high_map);<br>
        so_data  (so, 0x03020100);<br>
-       so_data  (so, 0x07060504);<br>
-       so_data  (so, 0x0b0a0908);<br>
+       for (i = 0; i &lt; p-&gt;cfg.fp.high_map; i++)<br>
+               so_data(so, p-&gt;cfg.fp.map[i]);<br>
        so_method(so, tesla, 0x1988, 2);<br>
-       so_data  (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */<br>
+       so_data  (so, p-&gt;cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 etc. */<br>
        so_data  (so, p-&gt;cfg.high_temp);<br>
+       so_method(so, tesla, 0x1298, 1);<br>
+       so_data  (so, p-&gt;cfg.high_result);<br>
+       so_method(so, tesla, 0x19a8, 1);<br>
+       so_data  (so, p-&gt;cfg.fp.regs[2]);<br>
+       so_method(so, tesla, 0x196c, 1);<br>
+       so_data  (so, p-&gt;cfg.fp.regs[3]);<br>
        so_method(so, tesla, 0x1414, 1);<br>
        so_data  (so, 0); /* program start offset */<br>
        so_ref(so, &amp;nv50-&gt;state.fragprog);<br>
<br>commit dacf2f879d63b5bf756da62eee901379336e7335<br>
Author: chr &lt;chr@LAPTOP.(none)&gt;<br>
Date:   Tue May 5 20:57:15 2009 +0200<br>
<br>
    - avoid overwriting sources before they&#39;re used in cases where dst == src<br>
    - add magical adjustment for register 1988 (I should find out how that<br>
      really works)<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index 9acf882..e4fc261 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -1162,12 +1162,40 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)<br>
        return r;<br>
 }<br>
<br>
+/* returns TRUE if instruction can overwrite sources before they&#39;re read */<br>
+static boolean<br>
+direct2dest_op(const struct tgsi_full_instruction *insn)<br>
+{<br>
+       if (insn-&gt;Instruction.Saturate)<br>
+               return FALSE;<br>
+<br>
+       switch (insn-&gt;Instruction.Opcode) {<br>
+       case TGSI_OPCODE_COS:<br>
+       case TGSI_OPCODE_DP3:<br>
+       case TGSI_OPCODE_DP4:<br>
+       case TGSI_OPCODE_DPH:<br>
+       case TGSI_OPCODE_KIL:<br>
+       case TGSI_OPCODE_LIT:<br>
+       case TGSI_OPCODE_POW:<br>
+       case TGSI_OPCODE_RCP:<br>
+       case TGSI_OPCODE_RSQ:<br>
+       case TGSI_OPCODE_SCS:<br>
+       case TGSI_OPCODE_SIN:<br>
+       case TGSI_OPCODE_TEX:<br>
+       case TGSI_OPCODE_TXP:<br>
+               return FALSE;<br>
+       default:<br>
+               return TRUE;<br>
+       }<br>
+}<br>
+<br>
 static boolean<br>
 nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
 {<br>
        const struct tgsi_full_instruction *inst = &amp;tok-&gt;FullInstruction;<br>
        struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp = NULL;<br>
        unsigned mask, sat, unit;<br>
+       boolean assimilate = FALSE;<br>
        int i, c;<br>
<br>
        mask = inst-&gt;FullDstRegisters[0].DstRegister.WriteMask;<br>
@@ -1178,6 +1206,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                        dst[c] = tgsi_dst(pc, c, &amp;inst-&gt;FullDstRegisters[0]);<br>
                else<br>
                        dst[c] = NULL;<br>
+<br>
+               rdst[c] = NULL;<br>
+<br>
+               src[0][c] = NULL;<br>
+               src[1][c] = NULL;<br>
+               src[2][c] = NULL;<br>
        }<br>
<br>
        for (i = 0; i &lt; inst-&gt;Instruction.NumSrcRegs; i++) {<br>
@@ -1195,8 +1229,35 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                        rdst[c] = dst[c];<br>
                        dst[c] = temp_temp(pc);<br>
                }<br>
+       } else if (direct2dest_op(inst)) {<br>
+               for (c = 0; c &lt; 4; c++) {<br>
+                       if (!dst[c] || dst[c]-&gt;type != P_TEMP)<br>
+                               continue;<br>
+<br>
+                       for (i = c + 1; i &lt; 4; i++) {<br>
+                               if (dst[c] == src[0][i] ||<br>
+                                       dst[c] == src[1][i] ||<br>
+                                       dst[c] == src[2][i])<br>
+                                       break;<br>
+                       }<br>
+                       if (i == 4)<br>
+                               continue;<br>
+<br>
+                       assimilate = TRUE;<br>
+                       rdst[c] = dst[c];<br>
+                       dst[c] = alloc_preferred_temp(pc, rdst[c]-&gt;rhw);<br>
+               }<br>
+       } else if (inst-&gt;Instruction.Opcode == TGSI_OPCODE_LIT) {<br>
+               /* XXX: shouldn&#39;t give LIT an extra case here */<br>
+               if (src[0][1] == dst[1] ||<br>
+                       src[0][3] == dst[1]) {<br>
+                       assimilate = TRUE;<br>
+                       rdst[1] = dst[1];<br>
+                       dst[1] = alloc_temp(pc, NULL);<br>
+               }<br>
        }<br>
<br>
+       i = -1;<br>
        switch (inst-&gt;Instruction.Opcode) {<br>
        case TGSI_OPCODE_ABS:<br>
                for (c = 0; c &lt; 4; c++) {<br>
@@ -1373,14 +1434,22 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                for (c = 0; c &lt; 4; c++) {<br>
                        if (!(mask &amp; (1 &lt;&lt; c)))<br>
                                continue;<br>
-                       emit_flop(pc, 0, dst[c], src[0][0]);<br>
+                       if (i == -1) {<br>
+                               emit_flop(pc, 0, dst[c], src[0][0]);<br>
+                               i = c;<br>
+                       } else<br>
+                               emit_mov(pc, dst[c], dst[i]);<br>
                }<br>
                break;<br>
        case TGSI_OPCODE_RSQ:<br>
                for (c = 0; c &lt; 4; c++) {<br>
                        if (!(mask &amp; (1 &lt;&lt; c)))<br>
                                continue;<br>
-                       emit_flop(pc, 2, dst[c], src[0][0]);<br>
+                       if (i == -1) {<br>
+                               emit_flop(pc, 2, dst[c], src[0][0]);<br>
+                               i = c;<br>
+                       } else<br>
+                               emit_mov(pc, dst[c], dst[i]);<br>
                }<br>
                break;<br>
        case TGSI_OPCODE_SCS:<br>
@@ -1491,6 +1560,10 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                        set_src_0(pc, dst[c], e);<br>
                        emit(pc, e);<br>
                }<br>
+       } else if (assimilate) {<br>
+               for (c = 0; c &lt; 4; c++)<br>
+                       if (rdst[c])<br>
+                               assimilate_temp(pc, rdst[c], dst[c]);<br>
        }<br>
<br>
        for (i = 0; i &lt; inst-&gt;Instruction.NumSrcRegs; i++) {<br>
@@ -1499,10 +1572,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                                continue;<br>
                        if (src[i][c]-&gt;index == -1 &amp;&amp; src[i][c]-&gt;type == P_IMMD)<br>
                                FREE(src[i][c]);<br>
-<br>
-                       /* Might also release temporaries not used anymore in this loop,<br>
-                        * therefore no temp_immd and temp_immd_nr like for temp_temp.<br>
-                        */<br>
+                       else<br>
+                       if (src[i][c]-&gt;acc == pc-&gt;insn_cur)<br>
+                               release_hw(pc, src[i][c]);<br>
                }<br>
        }<br>
<br>
@@ -1996,6 +2068,18 @@ nv50_program_tx(struct nv50_program *p)<br>
                        e_prev = e;<br>
        }<br>
<br>
+       /* adjust register 1988 &#39;heuristically&#39; */<br>
+       /* XXX: make this go away */<br>
+       for (i = 0, k = 0; k &lt; 4; ++k)<br>
+               if (pc-&gt;p-&gt;cfg.fp.regs[1] &amp; (1 &lt;&lt; (24 + k)))<br>
+                       i++;<br>
+       if (i &gt; 3 || i &lt; ((pc-&gt;p-&gt;cfg.fp.regs[1] &gt;&gt; 16) &amp; 0xFF) + 3) {<br>
+               pc-&gt;p-&gt;cfg.fp.regs[1] &amp;= 0xFFFFFF00;<br>
+               pc-&gt;p-&gt;cfg.fp.regs[1] |= ((pc-&gt;p-&gt;cfg.fp.regs[1] &gt;&gt; 16) &amp; 0xFF);<br>
+       } else {<br>
+               pc-&gt;p-&gt;cfg.fp.regs[1] |= (3 - i);<br>
+       }<br>
+<br>
        if (!is_long(pc-&gt;p-&gt;exec_tail)) {<br>
                /* this may occur if moving FP results */<br>
                assert(e_prev &amp;&amp; !is_long(e_prev));<br>
<br>commit 4411b1e3b3c11c69ec11148783327759a94165e2<br>
Author: chr &lt;chr@LAPTOP.(none)&gt;<br>
Date:   Wed May 6 11:46:17 2009 +0200<br>
<br>
    Enable half insns and immediates for MOV and ADD.<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index e4fc261..2ab7b57 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -426,7 +426,7 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)<br>
 static INLINE void<br>
 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)<br>
 {<br>
-       unsigned val = fui(pc-&gt;immd_buf[imm-&gt;hw]); /* XXX */<br>
+       unsigned val = fui(pc-&gt;immd_buf[imm-&gt;hw - pc-&gt;param_nr * 4]);<br>
<br>
        set_long(pc, e);<br>
        /*XXX: can&#39;t be predicated - bits overlap.. catch cases where both<br>
@@ -505,12 +505,11 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)<br>
<br>
        set_dst(pc, dst, e);<br>
<br>
-       if (0 &amp;&amp; dst-&gt;type != P_RESULT &amp;&amp; src-&gt;type == P_IMMD) {<br>
+       if (pc-&gt;allow32 &amp;&amp; dst-&gt;type != P_RESULT &amp;&amp; src-&gt;type == P_IMMD) {<br>
                set_immd(pc, src, e);<br>
                /*XXX: 32-bit, but steals part of &quot;half&quot; reg space - need to<br>
                 *     catch and handle this case if/when we do half-regs<br>
                 */<br>
-               e-&gt;inst[0] |= 0x00008000;<br>
        } else<br>
        if (src-&gt;type == P_IMMD || src-&gt;type == P_CONST) {<br>
                set_long(pc, e);<br>
@@ -526,13 +525,15 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)<br>
                e-&gt;inst[0] |= (src-&gt;hw &lt;&lt; 9);<br>
        }<br>
<br>
-       /* We really should support &quot;half&quot; instructions here at some point,<br>
-        * but I don&#39;t feel confident enough about them yet.<br>
-        */<br>
-       set_long(pc, e);<br>
-       if (is_long(e) &amp;&amp; !is_immd(e)) {<br>
+       if (!is_long(e) || is_immd(e))<br>
+               e-&gt;inst[0] |= 0x00008000;<br>
+       else {<br>
                e-&gt;inst[1] |= 0x04000000; /* 32-bit */<br>
-               e-&gt;inst[1] |= 0x0003c000; /* &quot;subsubop&quot; 0xf == mov */<br>
+<br>
+               /* XXX: look into this 0x3 or 0xf again */<br>
+               e-&gt;inst[1] |= 0x0000c000; /* &quot;subsubop&quot; 0x3 */<br>
+               if (!(e-&gt;inst[1] &amp; 0x20000000))<br>
+                       e-&gt;inst[1] |= 0x00030000; /* &quot;subsubop&quot; 0xf */<br>
        }<br>
<br>
        emit(pc, e);<br>
@@ -606,6 +607,7 @@ set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)<br>
        e-&gt;inst[0] |= (src-&gt;hw &lt;&lt; 16);<br>
 }<br>
<br>
+/* XXX: can source 2 really be a constant ? */<br>
 static void<br>
 set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)<br>
 {<br>
@@ -670,7 +672,10 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst,<br>
        check_swap_src_0_1(pc, &amp;src0, &amp;src1);<br>
        set_dst(pc, dst, e);<br>
        set_src_0(pc, src0, e);<br>
-       if (is_long(e))<br>
+       if (!is_long(e) &amp;&amp; src1-&gt;type == P_IMMD &amp;&amp; pc-&gt;allow32)<br>
+               set_immd(pc, src1, e);<br>
+       else<br>
+       if (requires_long(e, src1))<br>
                set_src_2(pc, src1, e);<br>
        else<br>
                set_src_1(pc, src1, e);<br>
@@ -902,6 +907,7 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,<br>
        struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);<br>
        struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);<br>
        struct nv50_reg *tmp[4];<br>
+       boolean allow32 = pc-&gt;allow32;<br>
<br>
        if (mask &amp; (3 &lt;&lt; 1)) {<br>
                if (mask &amp; (1 &lt;&lt; 1))<br>
@@ -911,6 +917,8 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,<br>
                emit_minmax(pc, 4, tmp[0], src[0], zero);<br>
        }<br>
<br>
+       pc-&gt;allow32 = FALSE;<br>
+<br>
        if (mask &amp; (1 &lt;&lt; 2)) {<br>
                set_pred_wr(pc, 1, 0, pc-&gt;p-&gt;exec_tail);<br>
<br>
@@ -926,6 +934,8 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,<br>
                set_pred(pc, 3, 0, pc-&gt;p-&gt;exec_tail);<br>
        }<br>
<br>
+       pc-&gt;allow32 = allow32;<br>
+<br>
        /* do this last, in case src[i,j] == dst[0,3] */<br>
        if (mask &amp; (1 &lt;&lt; 0))<br>
                emit_mov(pc, dst[0], one);<br>
<br>commit 88dbc993e651da91d66c4ca471d11ee5aa2b5085<br>
Author: chr &lt;chr@LAPTOP.(none)&gt;<br>
Date:   Wed May 6 11:50:17 2009 +0200<br>
<br>
    Use multiple (3 for now: PVP, PFP, PMISC) constant buffers.<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index 2ab7b57..6e279bd 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -426,7 +426,7 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)<br>
 static INLINE void<br>
 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)<br>
 {<br>
-       unsigned val = fui(pc-&gt;immd_buf[imm-&gt;hw - pc-&gt;param_nr * 4]);<br>
+       unsigned val = fui(pc-&gt;immd_buf[imm-&gt;hw]);<br>
<br>
        set_long(pc, e);<br>
        /*XXX: can&#39;t be predicated - bits overlap.. catch cases where both<br>
@@ -478,22 +478,14 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,<br>
         struct nv50_program_exec *e)<br>
 {<br>
        set_long(pc, e);<br>
-#if 1<br>
-       e-&gt;inst[1] |= (1 &lt;&lt; 22);<br>
-#else<br>
-       if (src-&gt;type == P_IMMD) {<br>
-               e-&gt;inst[1] |= (NV50_CB_PMISC &lt;&lt; 22);<br>
-       } else {<br>
-               if (pc-&gt;p-&gt;type == PIPE_SHADER_VERTEX)<br>
-                       e-&gt;inst[1] |= (NV50_CB_PVP &lt;&lt; 22);<br>
-               else<br>
-                       e-&gt;inst[1] |= (NV50_CB_PFP &lt;&lt; 22);<br>
-       }<br>
-#endif<br>
<br>
+       /* XXX: param.bs can be extracted from inst[1] */<br>
+       e-&gt;param.bs = (src-&gt;type == P_IMMD) ? 0 : 1;<br>
        e-&gt;param.index = src-&gt;hw;<br>
        e-&gt;param.shift = s;<br>
        e-&gt;param.mask = m &lt;&lt; (s % 32);<br>
+<br>
+       e-&gt;inst[1] |= (e-&gt;param.bs &lt;&lt; 22);<br>
 }<br>
<br>
 static void<br>
@@ -1502,7 +1494,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
                }<br>
                break;<br>
        case TGSI_OPCODE_TEX:<br>
-       case TGSI_OPCODE_TXP:<br>
+       case TGSI_OPCODE_TXP: /* XXX: TXP should use w-component as iv on interp */<br>
        {<br>
                struct nv50_reg *t[4];<br>
                struct nv50_program_exec *e;<br>
@@ -1977,7 +1969,7 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
        }<br>
<br>
        if (pc-&gt;immd_nr) {<br>
-               int rid = pc-&gt;param_nr * 4;<br>
+               int rid = 0;<br>
<br>
                pc-&gt;immd = CALLOC(pc-&gt;immd_nr * 4, sizeof(struct nv50_reg));<br>
                if (!pc-&gt;immd)<br>
@@ -2121,7 +2113,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)<br>
<br>
 static void<br>
 nv50_program_upload_data(struct nv50_context *nv50, float *map,<br>
-                        unsigned start, unsigned count)<br>
+                       unsigned start, unsigned count, unsigned cbuf)<br>
 {<br>
        struct nouveau_channel *chan = nv50-&gt;screen-&gt;nvws-&gt;channel;<br>
        struct nouveau_grobj *tesla = nv50-&gt;screen-&gt;tesla;<br>
@@ -2130,7 +2122,7 @@ nv50_program_upload_data(struct nv50_context *nv50, float *map,<br>
                unsigned nr = count &gt; 2047 ? 2047 : count;<br>
<br>
                BEGIN_RING(chan, tesla, 0x00000f00, 1);<br>
-               OUT_RING  (chan, (NV50_CB_PMISC &lt;&lt; 0) | (start &lt;&lt; 8));<br>
+               OUT_RING  (chan, (cbuf &lt;&lt; 0) | (start &lt;&lt; 8));<br>
                BEGIN_RING(chan, tesla, 0x40000f04, nr);<br>
                OUT_RINGp (chan, map, nr);<br>
<br>
@@ -2145,35 +2137,48 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)<br>
 {<br>
        struct nouveau_winsys *nvws = nv50-&gt;screen-&gt;nvws;<br>
        struct pipe_winsys *ws = nv50-&gt;pipe.winsys;<br>
-       unsigned nr = p-&gt;param_nr + p-&gt;immd_nr;<br>
<br>
-       if (!p-&gt;data &amp;&amp; nr) {<br>
-               struct nouveau_resource *heap = nv50-&gt;screen-&gt;vp_data_heap;<br>
+       if (!p-&gt;data[0] &amp;&amp; p-&gt;immd_nr) {<br>
+               struct nouveau_resource *heap = nv50-&gt;screen-&gt;immd_heap[0];<br>
+<br>
+               if (nvws-&gt;res_alloc(heap, p-&gt;immd_nr, p, &amp;p-&gt;data[0])) {<br>
+                       while (heap-&gt;next &amp;&amp; heap-&gt;size &lt; p-&gt;immd_nr) {<br>
+                               struct nv50_program *evict = heap-&gt;next-&gt;priv;<br>
+                               nvws-&gt;res_free(&amp;evict-&gt;data[0]);<br>
+                       }<br>
+<br>
+                       if (nvws-&gt;res_alloc(heap, p-&gt;immd_nr, p, &amp;p-&gt;data[0]))<br>
+                               assert(0);<br>
+               }<br>
+<br>
+               /* immediates only need to be uploaded again when freed */<br>
+               nv50_program_upload_data(nv50, p-&gt;immd, p-&gt;data[0]-&gt;start,<br>
+                                                                p-&gt;immd_nr, NV50_CB_PMISC);<br>
+       }<br>
+<br>
+       if (!p-&gt;data[1] &amp;&amp; p-&gt;param_nr) {<br>
+               struct nouveau_resource *heap = nv50-&gt;screen-&gt;parm_heap[p-&gt;type];<br>
<br>
-               if (nvws-&gt;res_alloc(heap, nr, p, &amp;p-&gt;data)) {<br>
-                       while (heap-&gt;next &amp;&amp; heap-&gt;size &lt; nr) {<br>
+               if (nvws-&gt;res_alloc(heap, p-&gt;param_nr, p, &amp;p-&gt;data[1])) {<br>
+                       while (heap-&gt;next &amp;&amp; heap-&gt;size &lt; p-&gt;param_nr) {<br>
                                struct nv50_program *evict = heap-&gt;next-&gt;priv;<br>
-                               nvws-&gt;res_free(&amp;evict-&gt;data);<br>
+                               nvws-&gt;res_free(&amp;evict-&gt;data[1]);<br>
                        }<br>
<br>
-                       if (nvws-&gt;res_alloc(heap, nr, p, &amp;p-&gt;data))<br>
+                       if (nvws-&gt;res_alloc(heap, p-&gt;param_nr, p, &amp;p-&gt;data[1]))<br>
                                assert(0);<br>
                }<br>
        }<br>
<br>
        if (p-&gt;param_nr) {<br>
+               unsigned cbuf;<br>
                float *map = ws-&gt;buffer_map(ws, nv50-&gt;constbuf[p-&gt;type],<br>
                                            PIPE_BUFFER_USAGE_CPU_READ);<br>
-               nv50_program_upload_data(nv50, map, p-&gt;data-&gt;start,<br>
-                                        p-&gt;param_nr);<br>
+               cbuf = (p-&gt;type == PIPE_SHADER_VERTEX) ? NV50_CB_PVP : NV50_CB_PFP;<br>
+               nv50_program_upload_data(nv50, map, p-&gt;data[1]-&gt;start,<br>
+                                                                p-&gt;param_nr, cbuf);<br>
                ws-&gt;buffer_unmap(ws, nv50-&gt;constbuf[p-&gt;type]);<br>
        }<br>
-<br>
-       if (p-&gt;immd_nr) {<br>
-               nv50_program_upload_data(nv50, p-&gt;immd,<br>
-                                        p-&gt;data-&gt;start + p-&gt;param_nr,<br>
-                                        p-&gt;immd_nr);<br>
-       }<br>
 }<br>
<br>
 static void<br>
@@ -2193,20 +2198,26 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)<br>
                upload = TRUE;<br>
        }<br>
<br>
-       if (p-&gt;data &amp;&amp; p-&gt;data-&gt;start != p-&gt;data_start) {<br>
+       if ((p-&gt;data[0] &amp;&amp; p-&gt;data[0]-&gt;start != p-&gt;data_start[0]) ||<br>
+               (p-&gt;data[1] &amp;&amp; p-&gt;data[1]-&gt;start != p-&gt;data_start[1]))<br>
+       {<br>
                for (e = p-&gt;exec_head; e; e = e-&gt;next) {<br>
                        unsigned ei, ci;<br>
<br>
                        if (e-&gt;param.index &lt; 0)<br>
                                continue;<br>
                        ei = e-&gt;param.shift &gt;&gt; 5;<br>
-                       ci = e-&gt;param.index + p-&gt;data-&gt;start;<br>
+                       ci = e-&gt;param.index + p-&gt;data[e-&gt;param.bs]-&gt;start;<br>
<br>
                        e-&gt;inst[ei] &amp;= ~e-&gt;param.mask;<br>
                        e-&gt;inst[ei] |= (ci &lt;&lt; e-&gt;param.shift);<br>
                }<br>
<br>
-               p-&gt;data_start = p-&gt;data-&gt;start;<br>
+               if (p-&gt;data[0])<br>
+                       p-&gt;data_start[0] = p-&gt;data[0]-&gt;start;<br>
+               if (p-&gt;data[1])<br>
+                       p-&gt;data_start[1] = p-&gt;data[1]-&gt;start;<br>
+<br>
                upload = TRUE;<br>
        }<br>
<br>
@@ -2364,7 +2375,8 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)<br>
        if (p-&gt;buffer)<br>
                pipe_buffer_reference(&amp;p-&gt;buffer, NULL);<br>
<br>
-       nv50-&gt;screen-&gt;nvws-&gt;res_free(&amp;p-&gt;data);<br>
+       nv50-&gt;screen-&gt;nvws-&gt;res_free(&amp;p-&gt;data[0]);<br>
+       nv50-&gt;screen-&gt;nvws-&gt;res_free(&amp;p-&gt;data[1]);<br>
<br>
        p-&gt;translated = 0;<br>
 }<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h<br>
index 3b3b6bb..9dd0f37 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.h<br>
+++ b/src/gallium/drivers/nv50/nv50_program.h<br>
@@ -10,6 +10,7 @@ struct nv50_program_exec {<br>
        unsigned inst[2];<br>
        struct {<br>
                int index;<br>
+               int bs; /* buffer selector */<br>
                unsigned mask;<br>
                unsigned shift;<br>
        } param;<br>
@@ -24,8 +25,8 @@ struct nv50_program {<br>
        struct nv50_program_exec *exec_head;<br>
        struct nv50_program_exec *exec_tail;<br>
        unsigned exec_size;<br>
-       struct nouveau_resource *data;<br>
-       unsigned data_start;<br>
+       struct nouveau_resource *data[2];<br>
+       unsigned data_start[2];<br>
<br>
        struct pipe_buffer *buffer;<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c<br>
index 2980564..268eeeb 100644<br>
--- a/src/gallium/drivers/nv50/nv50_screen.c<br>
+++ b/src/gallium/drivers/nv50/nv50_screen.c<br>
@@ -290,20 +290,61 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)<br>
        so_method(so, screen-&gt;tesla, 0x16b8, 1);<br>
        so_data  (so, 8);<br>
<br>
-       /* Shared constant buffer */<br>
-       screen-&gt;constbuf = screen-&gt;pipe.buffer_create(&amp;screen-&gt;pipe, 0, 0, 128 * 4 * 4);<br>
-       if (nvws-&gt;res_init(&amp;screen-&gt;vp_data_heap, 0, 128)) {<br>
-               NOUVEAU_ERR(&quot;Error initialising constant buffer\n&quot;);<br>
+       /* constant buffers for immediates and VP/FP parameters */<br>
+       screen-&gt;constbuf_misc[0] =<br>
+               screen-&gt;pipe.buffer_create(&amp;screen-&gt;pipe, 0, 0, 128 * 4 * 4);<br>
+<br>
+       screen-&gt;constbuf_parm[0] =<br>
+               screen-&gt;pipe.buffer_create(&amp;screen-&gt;pipe, 0, 0, 128 * 4 * 4);<br>
+<br>
+       screen-&gt;constbuf_parm[1] =<br>
+               screen-&gt;pipe.buffer_create(&amp;screen-&gt;pipe, 0, 0, 128 * 4 * 4);<br>
+<br>
+       if (nvws-&gt;res_init(&amp;screen-&gt;immd_heap[0], 0, 128) ||<br>
+               nvws-&gt;res_init(&amp;screen-&gt;parm_heap[0], 0, 128) ||<br>
+               nvws-&gt;res_init(&amp;screen-&gt;parm_heap[1], 0, 128))<br>
+       {<br>
+               NOUVEAU_ERR(&quot;Error initialising constant buffers.\n&quot;);<br>
                nv50_screen_destroy(&amp;screen-&gt;pipe);<br>
                return NULL;<br>
        }<br>
<br>
        so_method(so, screen-&gt;tesla, 0x1280, 3);<br>
-       so_reloc (so, screen-&gt;constbuf, 0, NOUVEAU_BO_VRAM |<br>
+       so_reloc (so, screen-&gt;constbuf_misc[0], 0, NOUVEAU_BO_VRAM |<br>
                  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);<br>
-       so_reloc (so, screen-&gt;constbuf, 0, NOUVEAU_BO_VRAM |<br>
+       so_reloc (so, screen-&gt;constbuf_misc[0], 0, NOUVEAU_BO_VRAM |<br>
                  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);<br>
-       so_data  (so, (NV50_CB_PMISC &lt;&lt; 16) | 0x00001000);<br>
+       so_data  (so, (NV50_CB_PMISC &lt;&lt; 16) | 0x00000800);<br>
+       so_method(so, screen-&gt;tesla, 0x1694, 1);<br>
+       so_data  (so, 0x00000001 | (NV50_CB_PMISC &lt;&lt; 12));<br>
+       so_method(so, screen-&gt;tesla, 0x1694, 1);<br>
+       so_data  (so, 0x00000031 | (NV50_CB_PMISC &lt;&lt; 12));<br>
+<br>
+       so_method(so, screen-&gt;tesla, 0x1280, 3);<br>
+       so_reloc (so, screen-&gt;constbuf_parm[0], 0, NOUVEAU_BO_VRAM |<br>
+                 NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);<br>
+       so_reloc (so, screen-&gt;constbuf_parm[0], 0, NOUVEAU_BO_VRAM |<br>
+                 NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);<br>
+       so_data  (so, (NV50_CB_PVP &lt;&lt; 16) | 0x00000800);<br>
+       so_method(so, screen-&gt;tesla, 0x1694, 1);<br>
+       so_data  (so, 0x00000101 | (NV50_CB_PVP &lt;&lt; 12));<br>
+<br>
+       so_method(so, screen-&gt;tesla, 0x1280, 3);<br>
+       so_reloc (so, screen-&gt;constbuf_parm[1], 0, NOUVEAU_BO_VRAM |<br>
+                 NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);<br>
+       so_reloc (so, screen-&gt;constbuf_parm[1], 0, NOUVEAU_BO_VRAM |<br>
+                 NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);<br>
+       so_data  (so, (NV50_CB_PMISC &lt;&lt; 16) | 0x00000800);<br>
+       so_method(so, screen-&gt;tesla, 0x1694, 1);<br>
+       so_data  (so, 0x00000131 | (NV50_CB_PFP &lt;&lt; 12));<br>
+<br>
+       /*<br>
+       // map VP and FP CB index 0 to CB_PMISC<br>
+       so_method(so, screen-&gt;tesla, 0x1694, 1);<br>
+       so_data  (so, 0x000BBNP1 = 0x00000001);<br>
+       so_method(so, screen-&gt;tesla, 0x1694, 1);<br>
+       so_data  (so, 0x000BBNP1 = 0x00000031);<br>
+       */<br>
<br>
        /* Texture sampler/image unit setup - we abuse the constant buffer<br>
         * upload mechanism for the moment to upload data to the tex config<br>
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h<br>
index db567aa..31b8ef2 100644<br>
--- a/src/gallium/drivers/nv50/nv50_screen.h<br>
+++ b/src/gallium/drivers/nv50/nv50_screen.h<br>
@@ -15,8 +15,11 @@ struct nv50_screen {<br>
        struct nouveau_grobj *m2mf;<br>
        struct nouveau_notifier *sync;<br>
<br>
-       struct pipe_buffer *constbuf;<br>
-       struct nouveau_resource *vp_data_heap;<br>
+       struct pipe_buffer *constbuf_misc[1];<br>
+       struct pipe_buffer *constbuf_parm[2];<br>
+<br>
+       struct nouveau_resource *immd_heap[1];<br>
+       struct nouveau_resource *parm_heap[2];<br>
<br>
        struct pipe_buffer *tic;<br>
        struct pipe_buffer *tsc;<br>
<br>_______________________________________________<br>
Nouveau mailing list<br>
<a href="mailto:Nouveau@lists.freedesktop.org" target="_blank">Nouveau@lists.freedesktop.org</a><br>
<a href="http://lists.freedesktop.org/mailman/listinfo/nouveau" target="_blank">http://lists.freedesktop.org/mailman/listinfo/nouveau</a><br>
<br></blockquote></div><br>